/usr/share/pyshared/pdfrw/pdfreader.py is in python-pdfrw 0+svn136-3.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
# A part of pdfrw (pdfrw.googlecode.com)
# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas
# MIT license -- See LICENSE.txt for details
'''
The PdfReader class reads an entire PDF file into memory and
parses the top-level container objects. (It does not parse
into streams.) The object subclasses PdfDict, and the
document pages are stored in a list in the pages attribute
of the object.
'''
from pdftokens import PdfTokens
from pdfobjects import PdfDict, PdfArray, PdfName
from pdfcompress import uncompress
class PdfReader(PdfDict):
    ''' Read a whole PDF file into memory and parse its top-level
        container objects (streams are stored but not parsed).

        The trailer dictionary is merged into this object, and the
        document pages are collected in the pages attribute.

        Values assigned through self.private (fdata, indirect_objects,
        special, pages, numPages) are read back below as plain
        attributes of the reader.
        NOTE(review): presumably PdfDict.private keeps them out of the
        PDF dictionary itself -- confirm in pdfobjects.
    '''

    class unresolved:
        # Used as a placeholder until we have an object.
        pass

    def readindirect(self, objnum, gennum):
        ''' Read an indirect object.  If it has already
            been read, return it from the cache.

            objnum and gennum may be tokens or ints; both are
            converted with int().  Raises KeyError if the object
            was never registered by parsexref, and AssertionError
            if the "N G obj" header at the recorded offset does not
            match.
        '''

        def setobj(obj):
            # Store the new object in the dictionary
            # once we have its value
            record[1] = obj

        def ordinary(source, setobj, obj):
            # Deal with an ordinary (non-array, non-dict) object
            setobj(obj)
            return obj

        fdata, objnum, gennum = self.fdata, int(objnum), int(gennum)
        record = self.indirect_objects[fdata, objnum, gennum]
        if record[1] is not self.unresolved:
            return record[1]

        # Read the object header and validate it
        source = PdfTokens(fdata, record[0])
        objid = source.multiple(3)
        assert int(objid[0]) == objnum, objid
        assert int(objid[1]) == gennum, objid
        assert objid[2] == 'obj', objid

        # Read the object, and call special code if it starts
        # an array or dictionary.  readarray/readdict call setobj
        # with the empty container before parsing its contents, so
        # a reference back to this object made while it is still
        # being read gets the cached (partially built) container.
        obj = source.next()
        obj = self.special.get(obj, ordinary)(source, setobj, obj)
        self.readstream(obj, source)
        obj.indirect = True
        return obj

    def readstream(obj, source):
        ''' Read optional stream following a dictionary
            object.

            If the token after the object is 'endobj' nothing is
            done.  Otherwise it must be 'stream'; the raw bytes are
            stored in obj._stream.  When obj.Length does not lead to
            'endstream endobj', the stream end is located by
            searching for 'endstream' and obj.Length is rewritten.
        '''
        tok = source.next()
        if tok == 'endobj':
            return  # No stream

        assert isinstance(obj, PdfDict)
        assert tok == 'stream', tok
        fdata = source.fdata

        # Find the end of the 'stream' keyword that was just
        # tokenized, then skip the LF or CR LF that follows it.
        floc = fdata.rindex(tok, 0, source.floc) + len(tok)
        ch = fdata[floc]
        if ch == '\r':
            floc += 1
            ch = fdata[floc]
        assert ch == '\n'
        startstream = floc + 1
        endstream = startstream + int(obj.Length)
        obj._stream = fdata[startstream:endstream]
        source = PdfTokens(fdata, endstream)
        endit = source.multiple(2)
        if endit != 'endstream endobj'.split():
            # /Length attribute is broken, try to read stream
            # anyway disregarding the specified value
            # TODO: issue warning here once we have some kind of
            # logging
            endstream = fdata.index('endstream', startstream)
            # Do not count the end-of-line marker that precedes
            # 'endstream' as part of the stream data.
            if fdata[endstream-2:endstream] == '\r\n':
                endstream -= 2
            elif fdata[endstream-1] in ['\n', '\r']:
                endstream -= 1
            source = PdfTokens(fdata, endstream)
            endit = source.multiple(2)
            assert endit == 'endstream endobj'.split()
            obj.Length = str(endstream-startstream)
            obj._stream = fdata[startstream:endstream]
    readstream = staticmethod(readstream)

    def readarray(self, source, setobj=lambda x:None, original=None):
        ''' Read tokens from source up to the closing ']' and
            return them as a PdfArray.

            setobj is called with the (still empty) array before
            its contents are read.  original is accepted so that
            the method can be called like the other handlers in
            self.special; it is not used.
        '''
        special = self.special
        result = PdfArray()
        setobj(result)

        for value in source:
            if value == ']':
                break
            if value in special:
                # Nested array or dictionary
                value = special[value](source)
            elif value == 'R':
                # "objnum gennum R": the two numbers were already
                # appended, so take them back off the array.
                generation = result.pop()
                value = self.readindirect(result.pop(), generation)
            result.append(value)
        return result

    def readdict(self, source, setobj=lambda x:None, original=None):
        ''' Read key/value pairs from source up to the closing '>>'
            and return them as a PdfDict.

            setobj is called with the (still empty) dictionary
            before its contents are read.  original is accepted so
            that the method can be called like the other handlers
            in self.special; it is not used.
        '''
        special = self.special
        result = PdfDict()
        setobj(result)

        tok = source.next()
        while tok != '>>':
            assert tok.startswith('/'), (tok, source.multiple(10))
            key = tok
            value = source.next()
            if value in special:
                # Nested array or dictionary
                value = special[value](source)
                tok = source.next()
            else:
                # One token of lookahead tells a plain value from
                # an "objnum gennum R" indirect reference.
                tok = source.next()
                if value.isdigit() and tok.isdigit():
                    # Consume the token outside the assert, so that
                    # parsing stays in step when asserts are
                    # stripped by python -O.
                    marker = source.next()
                    assert marker == 'R', marker
                    value = self.readindirect(value, tok)
                    tok = source.next()
            result[key] = value

        return result

    def readxref(fdata):
        ''' Locate the last 'startxref' in fdata.

            Returns (startloc, source), where startloc is the index
            of that keyword and source is a PdfTokens positioned at
            the offset given after it.
        '''
        startloc = fdata.rindex('startxref')
        # NOTE(review): the third argument presumably stops the
        # tokenizer from dropping comments, so that '%%EOF' comes
        # back as a token -- confirm in pdftokens.
        xrefinfo = list(PdfTokens(fdata, startloc, False))
        assert len(xrefinfo) == 3, xrefinfo
        assert xrefinfo[0] == 'startxref', xrefinfo[0]
        assert xrefinfo[1].isdigit(), xrefinfo[1]
        assert xrefinfo[2].rstrip() == '%%EOF', repr(xrefinfo[2])
        return startloc, PdfTokens(fdata, int(xrefinfo[1]))
    readxref = staticmethod(readxref)

    def parsexref(self, source):
        ''' Parse a cross-reference table, from the 'xref' keyword
            up to and including the 'trailer' keyword.

            Every entry marked 'n' is registered in
            self.indirect_objects as [offset, unresolved], keyed by
            (fdata, objnum, generation).  Existing entries are left
            alone.
        '''
        tok = source.next()
        assert tok == 'xref', tok
        while 1:
            tok = source.next()
            if tok == 'trailer':
                break
            # Subsection header: first object number, entry count
            startobj = int(tok)
            for objnum in range(startobj, startobj + int(source.next())):
                offset = int(source.next())
                generation = int(source.next())
                if source.next() == 'n':
                    objid = self.fdata, objnum, generation
                    objval = [offset, self.unresolved]
                    self.indirect_objects.setdefault(objid, objval)

    pagename = PdfName.Page
    pagesname = PdfName.Pages

    def readpages(self, node):
        ''' Return a flat list of the Page dictionaries found at
            or below node.
        '''
        # PDFs can have arbitrarily nested Pages/Page
        # dictionary structures.
        if node.Type == self.pagename:
            return [node]
        assert node.Type == self.pagesname, node.Type
        result = []
        for kid in node.Kids:
            result.extend(self.readpages(kid))
        return result

    def __init__(self, fname=None, fdata=None, decompress=True):
        ''' Parse a PDF given either as fname (a path, or an
            object with a read() method) or as fdata (the file
            contents).  Exactly one of the two must be supplied.

            If decompress is true, self.uncompress() is called
            once the pages have been located.
        '''

        if fname is not None:
            assert fdata is None
            # Allow reading preexisting streams like pyPdf
            if hasattr(fname, 'read'):
                fdata = fname.read()
            else:
                f = open(fname, 'rb')
                # Close the file even if the read fails
                try:
                    fdata = f.read()
                finally:
                    f.close()

        assert fdata is not None
        fdata = fdata.rstrip('\00')
        self.private.fdata = fdata

        self.private.indirect_objects = {}
        self.private.special = {'<<': self.readdict, '[': self.readarray}

        startloc, source = self.readxref(fdata)
        self.parsexref(source)

        # Tokens are consumed outside the asserts, so that parsing
        # stays in step when asserts are stripped by python -O.
        tok = source.next()
        assert tok == '<<', tok
        self.update(self.readdict(source))
        tok = source.next()
        assert tok == 'startxref' and source.floc > startloc
        self.private.pages = self.readpages(self.Root.Pages)
        if decompress:
            self.uncompress()

        # For compatibility with pyPdf
        self.private.numPages = len(self.pages)

    # For compatibility with pyPdf
    def getPage(self, pagenum):
        ''' Return the page at index pagenum of self.pages. '''
        return self.pages[pagenum]

    def uncompress(self):
        ''' Pass the current value of every known indirect object
            to the uncompress function from pdfcompress.
        '''
        uncompress([x[1] for x in self.indirect_objects.itervalues()])
|