/usr/lib/ruby/vendor_ruby/pdf/reader/page.rb is in ruby-pdf-reader 1.3.3-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
# coding: utf-8
module PDF
class Reader
# high level representation of a single PDF page. Ties together the various
# low level classes in PDF::Reader and provides access to the various
# components of the page (text, images, fonts, etc) in convenient formats.
#
# If you require access to the raw PDF objects for this page, you can access
# the Page dictionary via the page_object accessor. You will need to use the
# objects accessor to help walk the page dictionary in any useful way.
#
class Page
  include ResourceMethods

  # lowlevel hash-like access to all objects in the underlying PDF
  attr_reader :objects

  # the raw PDF object that defines this page
  attr_reader :page_object

  # a Hash-like object for storing cached data. Generally this is scoped to
  # the current document and is used to avoid repeating expensive
  # operations
  attr_reader :cache

  # creates a new page wrapper.
  #
  # * objects - an ObjectHash instance that wraps a PDF file
  # * pagenum - an int specifying the page number to expose. 1 indexed.
  # * options - optional Hash. The :cache entry, when present, becomes the
  #             object returned by #cache; otherwise a new Hash is used.
  #
  # Raises ArgumentError when pagenum is less than 1, or when the referenced
  # object does not dereference to a Hash.
  #
  def initialize(objects, pagenum, options = {})
    # Page numbers are 1 indexed. Without this guard a pagenum of 0 becomes
    # index -1 in the lookup below, which selects the *last* entry when
    # page_references is an Array, silently exposing the wrong page.
    raise ArgumentError, "invalid page: #{pagenum}" if pagenum < 1

    @objects, @pagenum = objects, pagenum
    @page_object = objects.deref(objects.page_references[pagenum - 1])
    @cache = options[:cache] || {}

    unless @page_object.is_a?(::Hash)
      raise ArgumentError, "invalid page: #{pagenum}"
    end
  end

  # return the number of this page within the full document
  #
  def number
    @pagenum
  end

  # return a friendly string representation of this page
  #
  def inspect
    "<PDF::Reader::Page page: #{@pagenum}>"
  end

  # Returns the attributes that accompany this page, including
  # attributes inherited from parents. The merged Hash is built once and
  # then reused on later calls.
  #
  def attributes
    @attributes ||= {}.tap { |hash|
      # Merge starting from the most distant ancestor and finishing with the
      # page itself, so values defined closer to the page win.
      page_with_ancestors.reverse_each do |obj|
        hash.merge!(@objects.deref(obj))
      end
    }
    # This shouldn't be necessary, but some non compliant PDFs leave MediaBox
    # out. Assuming 8.5" x 11" is what Acrobat does, so we do it too.
    @attributes[:MediaBox] ||= [0, 0, 612, 792]
    @attributes
  end

  # returns the plain text content of this page encoded as UTF-8. Any
  # characters that can't be translated will be returned as a ▯
  #
  def text
    receiver = PageTextReceiver.new
    walk(receiver)
    receiver.content
  end
  alias :to_s :text

  # processes the raw content stream for this page in sequential order and
  # passes callbacks to the receiver objects.
  #
  # This is mostly low level and you can probably ignore it unless you need
  # access to something like the raw encoded text. For an example of how
  # this can be used as a basis for higher level functionality, see the
  # text() method
  #
  # If someone was motivated enough, this method is intended to provide all
  # the data required to faithfully render the entire page. If you find
  # some required data isn't available it's a bug - let me know.
  #
  # Many operators that generate callbacks will reference resources stored
  # in the page header - think images, fonts, etc. To facilitate these
  # operators, the first available callback is page=. If your receiver
  # accepts that callback it will be passed the current
  # PDF::Reader::Page object. Use the Page#resources method to grab any
  # required resources.
  #
  # It may help to think of each page as a self contained program made up of
  # a set of instructions and associated resources. Calling walk() executes
  # the program in the correct order and calls out to your implementation.
  #
  def walk(*receivers)
    callback(receivers, :page=, [self])
    content_stream(receivers, raw_content)
  end

  # returns the raw content stream for this page. This is plumbing, nothing to
  # see here unless you're a PDF nerd like me.
  #
  # The :Contents entry may be a single object, a list of objects, or absent;
  # wrapping it in an Array, flattening and compacting lets all three cases
  # share one code path. The unfiltered data of each object is joined with a
  # single space.
  #
  def raw_content
    contents = objects.deref(@page_object[:Contents])
    streams  = [contents].flatten.compact.map { |obj| objects.deref(obj) }
    streams.map { |stream| stream.unfiltered_data }.join(" ")
  end

  private

  # Returns the dereferenced :Root entry of the trailer.
  #
  # The result is cached in @root. A bare `root ||= ...` would only assign a
  # method-local variable, so the lookup would be repeated on every call.
  #
  def root
    @root ||= objects.deref(@objects.trailer[:Root])
  end

  # Returns the resources that accompany this page. Includes
  # resources inherited from parents. Falls back to an empty Hash when the
  # :Resources attribute dereferences to nil.
  #
  def resources
    @resources ||= @objects.deref(attributes[:Resources]) || {}
  end

  # Tokenises instructions and dispatches a callback to every receiver each
  # time an operator token is reached.
  #
  # * receivers    - the objects that will be sent the callbacks
  # * instructions - a String holding the content stream to process
  #
  # Raises MalformedPDFError if EOFError is raised while processing.
  #
  def content_stream(receivers, instructions)
    buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
    parser = Parser.new(buffer, @objects)
    params = []

    while (token = parser.parse_token(PagesStrategy::OPERATORS))
      if token.kind_of?(Token) && PagesStrategy::OPERATORS.key?(token)
        # An operator consumes every operand collected since the previous
        # operator. callback splats params into the call, so clearing the
        # array afterwards does not disturb what the receivers were given.
        callback(receivers, PagesStrategy::OPERATORS[token], params)
        params.clear
      else
        params << token
      end
    end
  rescue EOFError
    raise MalformedPDFError, "End Of File while processing a content stream"
  end

  # calls the name callback method on each receiver object with params as the
  # arguments. Receivers that do not respond to name are skipped.
  #
  def callback(receivers, name, params = [])
    receivers.each do |receiver|
      receiver.send(name, *params) if receiver.respond_to?(name)
    end
  end

  # Returns an Array holding this page's dictionary followed by the
  # inheritable entries of each ancestor, nearest ancestor first.
  #
  def page_with_ancestors
    [ @page_object ] + ancestors
  end

  # Follows the :Parent entries upwards starting at origin and returns the
  # inheritable entries of every node visited, nearest first. The walk stops
  # at the first node that has no :Parent.
  #
  def ancestors(origin = @page_object[:Parent])
    if origin.nil?
      []
    else
      obj = objects.deref(origin)
      [ select_inheritable(obj) ] + ancestors(obj[:Parent])
    end
  end

  # select the elements from a Pages dictionary that can be inherited by
  # child Page dictionaries.
  #
  # The selection is passed through ::Hash[] so the return value is a Hash
  # whether select yields a Hash or an Array of key/value pairs.
  #
  def select_inheritable(obj)
    ::Hash[obj.select { |key, value|
      [:Resources, :MediaBox, :CropBox, :Rotate, :Parent].include?(key)
    }]
  end
end
end
end
|