/usr/lib/python3/dist-packages/pdfrw/findobjs.py is in python3-pdfrw 0.2-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
# A part of pdfrw (https://github.com/pmaupin/pdfrw)
# Copyright (C) 2015 Patrick Maupin, Austin, Texas
# MIT license -- See LICENSE.txt for details
''' This module contains a function to find all the XObjects
in a document, and another function that will wrap them
in page objects.
'''
from .objects import PdfDict, PdfArray, PdfName
from .pdfwriter import user_fmt
def find_objects(source, valid_types=(PdfName.XObject, None),
                 valid_subtypes=(PdfName.Form, PdfName.Image),
                 no_follow=(PdfName.Parent,),
                 isinstance=isinstance, id=id, sorted=sorted,
                 reversed=reversed, PdfDict=PdfDict):
    '''
        Generate every matching dictionary reachable from source.

        source may be a single PdfDict or any iterable of objects
        (for example a list of pages).  A dictionary is yielded when
        its Type is in valid_types and its Subtype is in
        valid_subtypes; with the defaults that means Form and Image
        XObjects.

        Dictionary keys listed in no_follow are not descended into.

        The walk uses an explicit stack rather than recursion, so deeply
        nested documents cannot overflow the Python call stack.  Each
        container is visited once (tracked by id()).  Array members
        are handled in array order and dictionary members in sorted key
        order, which keeps the output order reproducible.

        The remaining keyword parameters (isinstance, id, sorted,
        reversed, PdfDict) only bind those names as locals.
    '''
    containers = (PdfDict, PdfArray)

    # Seed the work stack from either a lone dict or an iterable of objects.
    stack = [source] if isinstance(source, PdfDict) else list(source)
    # pop() takes from the end, so flip the list to process the
    # first item first.
    stack.reverse()
    seen = set()

    while stack:
        node = stack.pop()
        if not isinstance(node, containers):
            continue

        node_id = id(node)
        if node_id in seen:
            continue
        seen.add(node_id)

        if isinstance(node, PdfDict):
            if node.Type in valid_types and node.Subtype in valid_subtypes:
                yield node
            children = [value for (key, value) in sorted(node.iteritems())
                        if key not in no_follow]
        else:
            # Touch the first element before reversing.  According to the
            # original author's note this forces resolution of any
            # indirect objects in the array; it may be unnecessary, but
            # it is cheap.
            if node:
                node[0]
            children = node

        # Push in reverse so the first child is the next one popped.
        stack.extend(reversed(children))
def wrap_object(obj, width, margin):
    ''' Return a new indirect Page dictionary that shows only obj.

        For a Form XObject the page contents reuse the form's stream
        along with its Length, Filter and DecodeParms entries, and the
        page takes the form's Resources and BBox (as MediaBox).

        For an Image XObject a small content stream is generated that
        draws the image scaled to fit width minus margin[0] and
        margin[2], preserving the aspect ratio; margin[1] and
        margin[3] are added to the scaled height to get the page height.

        Raises TypeError for any other Subtype.
    '''
    page_contents = PdfDict(indirect=True)
    kind = obj.Subtype

    if kind == PdfName.Form:
        # Assign the private stream attribute, then copy the stream
        # dictionary entries across explicitly.
        page_contents._stream = obj.stream
        for key in ('Length', 'Filter', 'DecodeParms'):
            setattr(page_contents, key, getattr(obj, key))
        page_resources = obj.Resources
        media_box = obj.BBox
    elif kind == PdfName.Image:
        x_off = margin[0]
        y_off = margin[1]
        draw_w = width - margin[0] - margin[2]
        img_w = float(obj.Width)
        img_h = float(obj.Height)
        # Scale the height by the same factor as the width.
        draw_h = 1.0 * draw_w / img_w * img_h
        page_h = draw_h + margin[1] + margin[3]
        operands = tuple(user_fmt(value)
                         for value in (draw_w, draw_h, x_off, y_off))
        page_contents.stream = 'q %s 0 0 %s %s %s cm /MyImage Do Q' % operands
        page_resources = PdfDict(XObject=PdfDict(MyImage=obj))
        media_box = PdfArray((0, 0, width, page_h))
    else:
        raise TypeError("Expected Form or Image XObject")

    return PdfDict(
        indirect=True,
        Type=PdfName.Page,
        MediaBox=media_box,
        Resources=page_resources,
        Contents=page_contents,
    )
def trivial_xobjs(maxignore=300):
    ''' Build a predicate that spots XObjects which merely place
        other XObjects.

        The returned check(obj) gives:
          False -- obj is an Image XObject
          True  -- obj's stream is shorter than maxignore and, once
                   names (tokens starting with '/') and the operators
                   q, Q, cm and Do are dropped, every remaining token
                   is a number
          None  -- anything else
    '''
    skipped_operators = {'q', 'Q', 'cm', 'Do'}
    image_subtype = PdfName.Image

    def check(obj):
        if obj.Subtype == image_subtype:
            return False

        data = obj.stream
        if len(data) >= maxignore:
            return None

        for token in data.split():
            if token.startswith('/') or token in skipped_operators:
                continue
            # Strip decimal points and minus signs so that plain numeric
            # operands reduce to digits only.
            bare = token.replace('.', '').replace('-', '')
            if not bare.isdigit():
                return None
        return True

    return check
def page_per_xobj(xobj_iter, width=8.5 * 72, margin=0.0 * 72,
                  image_only=False, ignore=trivial_xobjs(),
                  wrap_object=wrap_object):
    ''' page_per_xobj wraps every XObj found
        in its own page object.

        xobj_iter   -- an iterable of XObjects.  If it is a list or
                       a dict it is first passed through find_objects().
        width       -- handed to wrap_object unchanged.
        margin      -- a single number or an iterable of numbers.  It is
                       copied into a list and doubled until it has at
                       least four entries, so 5 becomes [5, 5, 5, 5]
                       and (1, 2) becomes [1, 2, 1, 2].  The default
                       wrap_object uses entries 0 and 2 horizontally
                       and entries 1 and 3 vertically.
        image_only  -- when true, only Image XObjects are wrapped.
        ignore      -- predicate; objects for which it returns a true
                       value are skipped.
        wrap_object -- callable (obj, width, margin) that builds the page.

        width and margin are used to set image sizes.

        Raises ValueError if margin is an empty iterable.
    '''
    # Work on a private list: the doubling below must not modify a list
    # owned by the caller, and a scalar margin is not iterable at all.
    try:
        margin = list(margin)
    except TypeError:
        margin = [margin]
    if not margin:
        # Doubling an empty list never reaches four entries.
        raise ValueError("margin must contain at least one value")
    while len(margin) < 4:
        margin *= 2

    if isinstance(xobj_iter, (list, dict)):
        xobj_iter = find_objects(xobj_iter)

    for obj in xobj_iter:
        if ignore(obj):
            continue
        # The original compared against the misspelled PdfName.IMage,
        # so image_only=True never matched anything.
        if not image_only or obj.Subtype == PdfName.Image:
            yield wrap_object(obj, width, margin)
|