# /usr/share/pyshared/pdfrw/pdfreader.py

# A part of pdfrw (pdfrw.googlecode.com)
# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas
# MIT license -- See LICENSE.txt for details

'''
The PdfReader class reads an entire PDF file into memory and
parses the top-level container objects.  (It does not parse
into streams.)  The object subclasses PdfDict, and the
document pages are stored in a list in the pages attribute
of the object.
'''

from pdftokens import PdfTokens
from pdfobjects import PdfDict, PdfArray, PdfName
from pdfcompress import uncompress

class PdfReader(PdfDict):

    class unresolved:
        ''' Sentinel stored in place of an indirect object
            until that object has actually been read.
        '''

    def readindirect(self, objnum, gennum):
        ''' Return the indirect object identified by objnum and
            gennum.  The object is parsed from the file data the
            first time it is requested and served from the
            indirect_objects cache afterwards.
        '''
        fdata = self.fdata
        objnum = int(objnum)
        gennum = int(gennum)

        # Each record is a two-item list: [file offset, object]
        record = self.indirect_objects[fdata, objnum, gennum]
        cached = record[1]
        if cached is not self.unresolved:
            return cached

        def setobj(obj):
            # Cache the object as soon as it exists, so that any
            # lookup made while its contents are still being
            # parsed finds it instead of re-reading it.
            record[1] = obj

        def ordinary(source, setobj, obj):
            # Handler for anything that does not open an
            # array or a dictionary: the token is the object.
            setobj(obj)
            return obj

        # The object must start with "<objnum> <gennum> obj"
        source = PdfTokens(fdata, record[0])
        header = source.multiple(3)
        assert int(header[0]) == objnum, header
        assert int(header[1]) == gennum, header
        assert header[2] == 'obj', header

        # Dispatch on the first token: the special table handles
        # arrays and dictionaries, everything else is ordinary.
        first = source.next()
        reader = self.special.get(first, ordinary)
        obj = reader(source, setobj, first)

        self.readstream(obj, source)
        obj.indirect = True
        return obj

    @staticmethod
    def readstream(obj, source):
        ''' Consume the tokens that close an indirect object.
            If a stream follows the object, store its raw data
            in obj._stream.
        '''
        closing = ['endstream', 'endobj']

        tok = source.next()
        if tok == 'endobj':
            # Object has no stream attached
            return

        assert isinstance(obj, PdfDict)
        assert tok == 'stream', tok

        # Find the position just past the 'stream' keyword
        # that the tokenizer has consumed.
        fdata = source.fdata
        pos = fdata.rindex(tok, 0, source.floc) + len(tok)

        # The keyword must be followed by LF or by CR LF
        ch = fdata[pos]
        if ch == '\r':
            pos += 1
            ch = fdata[pos]
        assert ch == '\n'

        # First attempt: trust the /Length entry
        startstream = pos + 1
        endstream = startstream + int(obj.Length)
        obj._stream = fdata[startstream:endstream]
        if PdfTokens(fdata, endstream).multiple(2) == closing:
            return

        # /Length attribute is broken, so search for the
        # 'endstream' keyword instead and ignore the
        # specified value.
        # TODO: issue warning here once we have some kind of
        # logging
        endstream = fdata.index('endstream', startstream)

        # Drop the end-of-line marker preceding the keyword
        if fdata[endstream-2:endstream] == '\r\n':
            endstream -= 2
        elif fdata[endstream-1] in ('\n', '\r'):
            endstream -= 1

        trailer = PdfTokens(fdata, endstream).multiple(2)
        assert trailer == closing

        # Record the length that was actually found
        obj.Length = str(endstream - startstream)
        obj._stream = fdata[startstream:endstream]

    def readarray(self, source, setobj=lambda x:None, original=None):
        ''' Read tokens from source up to the closing ']' and
            return them as a PdfArray.  setobj is called with
            the (still empty) array before any element is read.
            The original argument is accepted but not used.
        '''
        handlers = self.special
        items = PdfArray()
        setobj(items)

        for token in source:
            if token == ']':
                break
            if token in handlers:
                # Nested array or dictionary
                item = handlers[token](source)
            elif token == 'R':
                # The two previous elements were the object and
                # generation numbers of an indirect reference;
                # replace them with the referenced object.
                gennum = items.pop()
                item = self.readindirect(items.pop(), gennum)
            else:
                item = token
            items.append(item)
        return items

    def readdict(self, source, setobj=lambda x:None, original=None):
        ''' Read key/value pairs from source up to the closing
            '>>' and return them as a PdfDict.

            source   -- token source positioned just after '<<'
            setobj   -- called with the (still empty) dictionary
                        before any entry is read
            original -- accepted but not used

            Values that open an array or dictionary are read by
            the handlers in self.special.  A value made of two
            integer tokens followed by 'R' is replaced by the
            indirect object it refers to.

            Raises AssertionError if a key does not start with
            '/' or if two integers are not followed by 'R'.
        '''
        special = self.special
        result = PdfDict()
        setobj(result)

        tok = source.next()
        while tok != '>>':
            assert tok.startswith('/'), (tok, source.multiple(10))
            key = tok
            value = source.next()
            if value in special:
                value = special[value](source)
                tok = source.next()
            else:
                # Look one token ahead: it is either the next key
                # (or '>>'), or the generation number of an
                # indirect reference.
                tok = source.next()
                if value.isdigit() and tok.isdigit():
                    # Consume the 'R' token outside the assert, so
                    # that it is still consumed when asserts are
                    # stripped by running Python with -O.
                    marker = source.next()
                    assert marker == 'R', marker
                    value = self.readindirect(value, tok)
                    tok = source.next()
            result[key] = value

        return result

    @staticmethod
    def readxref(fdata):
        ''' Find the last 'startxref' keyword in fdata and check
            what follows it.  Returns a tuple of the keyword's
            location and a token source positioned at the offset
            given after the keyword.
        '''
        startloc = fdata.rindex('startxref')
        tokens = list(PdfTokens(fdata, startloc, False))

        # Expect exactly: startxref <offset> %%EOF
        assert len(tokens) == 3, tokens
        keyword, offset, eofmark = tokens
        assert keyword == 'startxref', keyword
        assert offset.isdigit(), offset
        assert eofmark.rstrip() == '%%EOF', repr(eofmark)

        return startloc, PdfTokens(fdata, int(offset))

    def parsexref(self, source):
        ''' Read the xref table from source, up to and including
            the 'trailer' keyword, and register every entry
            flagged 'n' in self.indirect_objects.
        '''
        tok = source.next()
        assert tok == 'xref', tok

        while True:
            tok = source.next()
            if tok == 'trailer':
                return

            # Subsection header: first object number and count
            first = int(tok)
            count = int(source.next())
            for objnum in range(first, first + count):
                offset = int(source.next())
                generation = int(source.next())
                flag = source.next()
                if flag != 'n':
                    continue
                # Register the entry with a placeholder; setdefault
                # leaves a record that is already present untouched.
                key = (self.fdata, objnum, generation)
                record = [offset, self.unresolved]
                self.indirect_objects.setdefault(key, record)

    # Type names that readpages compares against node.Type to
    # tell a single page from a node that holds further Kids.
    pagename = PdfName.Page
    pagesname = PdfName.Pages

    def readpages(self, node):
        ''' Return a flat list of the page dictionaries found
            at or below node.  Pages/Page dictionaries can be
            nested to any depth, so Kids are walked recursively.
        '''
        if node.Type == self.pagename:
            # A single page: nothing below it to visit
            return [node]

        assert node.Type == self.pagesname, node.Type
        pages = []
        for kid in node.Kids:
            pages += self.readpages(kid)
        return pages

    def __init__(self, fname=None, fdata=None, decompress=True):
        ''' Read and parse a PDF.

            fname      -- a file name, or an object with a read()
                          method (allows reading preexisting
                          streams like pyPdf)
            fdata      -- the file contents, as an alternative to
                          fname; must be None when fname is given
            decompress -- when true, self.uncompress() is called
                          once the document has been parsed

            Raises AssertionError if no data is supplied or if
            the file does not have the expected structure.
        '''

        if fname is not None:
            assert fdata is None
            # Allow reading preexisting streams like pyPdf
            if hasattr(fname, 'read'):
                fdata = fname.read()
            else:
                f = open(fname, 'rb')
                try:
                    fdata = f.read()
                finally:
                    # Close the file even if the read fails
                    f.close()

        assert fdata is not None
        # Drop any NUL characters at the end of the data
        fdata = fdata.rstrip('\00')
        self.private.fdata = fdata

        self.private.indirect_objects = {}
        self.private.special = {'<<': self.readdict, '[': self.readarray}

        startloc, source = self.readxref(fdata)
        self.parsexref(source)

        # The tokens are consumed outside the asserts, so that
        # parsing still advances correctly when asserts are
        # stripped by running Python with -O.
        tok = source.next()
        assert tok == '<<', tok
        self.update(self.readdict(source))
        tok = source.next()
        assert tok == 'startxref' and source.floc > startloc, tok

        self.private.pages = self.readpages(self.Root.Pages)
        if decompress:
            self.uncompress()

        # For compatibility with pyPdf
        self.private.numPages = len(self.pages)


    # For compatibility with pyPdf
    def getPage(self, pagenum):
        ''' Return the entry of self.pages at index pagenum. '''
        return self.pages[pagenum]

    def uncompress(self):
        ''' Hand the object slot of every indirect object record
            to the uncompress function imported from pdfcompress.
        '''
        records = self.indirect_objects.itervalues()
        uncompress([record[1] for record in records])
# python-pdfrw 0+svn136-3 /usr/share/pyshared/pdfrw/pdfreader.py