This file is indexed.

/usr/share/perl5/CAM/PDF/PageText.pm is in libcam-pdf-perl 1.60-1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
package CAM::PDF::PageText;

use 5.006;
use warnings;
use strict;

our $VERSION = '1.60';

=head1 NAME

CAM::PDF::PageText - Extract text from PDF page tree

=head1 SYNOPSIS

   my $pdf = CAM::PDF->new($filename);
   my $pageone_tree = $pdf->getPageContentTree(1);
   print CAM::PDF::PageText->render($pageone_tree);

=head1 DESCRIPTION

This module attempts to extract sequential text from a PDF page.  This
is not a robust process, as PDF text is graphically laid out in
arbitrary order.  This module uses a few heuristics to try to guess
what text goes next to what other text, but may be fooled easily by,
say, subscripts, non-horizontal text, changes in font, form fields
etc.

All those disclaimers aside, it is useful for a quick dump of text
from a simple PDF file.

=head1 LICENSE

Same as L<CAM::PDF>

=head1 FUNCTIONS

=over

=item $pkg->render($pagetree)

=item $pkg->render($pagetree, $verbose)

Turn a page content tree into a string.  The optional C<$verbose>
argument is accepted but is currently ignored.  This is a class method
that should be called like:

   CAM::PDF::PageText->render($pagetree);

=cut

# Turn a page content tree into a single string of text.
#
# Arguments:
#   $pkg      - the class name (this is called as a class method); not used
#   $pagetree - hash reference whose {blocks} entry is an array reference
#               of content nodes; each node is a hash with a {type} and
#               {name}, plus {value} (child nodes, for type 'block') or
#               {args} (operands, for operators)
#   $verbose  - accepted for interface compatibility; not used here
#
# Returns the accumulated text.  Only operators found inside a BT block
# contribute to it, and every BT block ends with a line break.
#
# Dies with 'misconception' if a node inside a BT block is neither a
# block nor an operator.  The per-operator helpers die on malformed
# operands.
sub render
{
   my $pkg      = shift;
   my $pagetree = shift;
   my $verbose  = shift;

   my $str          = q{};
   my @stack        = ([@{$pagetree->{blocks}}]);
   my $in_textblock = 0;

   ## The stack is a list of blocks.  We do depth-first on blocks, but
   ## we must be sure to traverse the children of the blocks in their
   ## original order.  Every list on the stack is a private copy, so
   ## consuming it with shift leaves the caller's tree untouched.

   while (@stack > 0)
   {
      # keep grabbing the same node until it's empty
      my $node = $stack[-1];
      if (ref $node)
      {
         if (@{$node} > 0)   # Still has children?
         {
            my $block = shift @{$node};   # grab the next child
            if ($block->{type} eq 'block')
            {
               if ($block->{name} eq 'BT')
               {
                  # Insert a flag on the stack to say when we leave the BT block
                  push @stack, 'BT';
                  $in_textblock = 1;
               }
               push @stack, [@{$block->{value}}];  # descend
            }
            elsif ($in_textblock)
            {
               if ($block->{type} ne 'op')
               {
                  die 'misconception';
               }
               my @args = @{$block->{args}};

               # The two quote operators are the one-character names
               # ' and ".  They must be written q{'} and q{"}: inside
               # q{} a backslash before a quote character is kept
               # literally, so q{\'} is a two-character string that
               # never equals the operator name.  Both operators carry
               # the string to show as their last operand, which is all
               # _Tquote reads.
               $str = $block->{name} eq 'TJ'   ? _TJ(     $str, \@args )
                    : $block->{name} eq 'Tj'   ? _Tj(     $str, \@args )
                    : $block->{name} eq q{'}   ? _Tquote( $str, \@args )
                    : $block->{name} eq q{"}   ? _Tquote( $str, \@args )
                    : $block->{name} eq 'Td'   ? _Td(     $str, \@args )
                    : $block->{name} eq 'TD'   ? _Td(     $str, \@args )
                    : $block->{name} eq 'T*'   ? _Tstar(  $str         )
                    : $str;   # any other operator leaves the text unchanged
            }
         }
         else
         {
            # Node is now empty, clear it from the stack
            pop @stack;
         }
      }
      else
      {
         # This is the 'BT' flag we pushed on the stack above
         pop @stack;
         $in_textblock = 0;

         # Add a line break to divide the text
         $str =~ s/ [ ]* \z /\n/xms;
      }
   }
   return $str;
}

# Handle the TJ operator: append every string in the operand array to
# the text gathered so far.
#
# Arguments:
#   the text accumulated so far
#   array reference of operand nodes; must hold exactly one node of
#   type 'array' whose {value} is an array reference of element nodes
#
# Returns the extended text.  Dies with 'Bad TJ' on any other operand
# shape.
sub _TJ
{
   my ($text, $operands) = @_;

   my $is_single_array = @{$operands} == 1
                      && $operands->[0]->{type} eq 'array';
   die 'Bad TJ' if !$is_single_array;

   # Keep this run apart from whatever non-blank text came before it.
   $text =~ s/ (\S) \z /$1 /xms;

   for my $element (@{$operands->[0]->{value}})
   {
      my $kind = $element->{type};

      if ($kind eq 'string' || $kind eq 'hexstring')
      {
         $text .= $element->{value};
         next;
      }

      # Heuristic: an adjustment below -250 is treated as a gap wide
      # enough to count as a space between words.
      if ($kind eq 'number' && $element->{value} < -250)
      {
         $text =~ s/ (\S) \z /$1 /xms;
      }
   }

   return $text;
}

# Handle the Tj operator: append the string operand to the text
# gathered so far.
#
# Arguments:
#   the text accumulated so far
#   array reference of operand nodes; the last one must be of type
#   'string' or 'hexstring'
#
# Returns the extended text.  Dies with 'Bad Tj' if there are no
# operands or the last one is not a string.
sub _Tj
{
   my ($text, $operands) = @_;

   die 'Bad Tj' if @{$operands} < 1;

   # Only the final operand is examined and shown.
   my $operand = $operands->[-1];
   my $kind    = $operand->{type};
   die 'Bad Tj' if $kind ne 'string' && $kind ne 'hexstring';

   # Keep this string apart from whatever non-blank text came before it.
   $text =~ s/ (\S) \z /$1 /xms;
   $text .= $operand->{value};

   return $text;
}

# Handle the quote operators: start a new line, then append the string
# operand.
#
# Arguments:
#   the text accumulated so far
#   array reference of operand nodes; the last one must be of type
#   'string' or 'hexstring' (any earlier operands are ignored)
#
# Returns the extended text.  Dies with 'Bad Tquote' if there are no
# operands or the last one is not a string.
sub _Tquote
{
   my ($text, $operands) = @_;

   die 'Bad Tquote' if @{$operands} < 1;

   # Only the final operand is examined and shown.
   my $operand = $operands->[-1];
   my $kind    = $operand->{type};
   die 'Bad Tquote' if $kind ne 'string' && $kind ne 'hexstring';

   # Trailing blanks give way to the line break.
   $text =~ s/ [ ]* \z /\n/xms;
   $text .= $operand->{value};

   return $text;
}

# Handle the Td and TD operators: decide whether the move starts a new
# line of text.
#
# Arguments:
#   the text accumulated so far
#   array reference of exactly two operand nodes of type 'number',
#   the X offset followed by the Y offset
#
# Returns the text, with trailing blanks replaced by a line break when
# the move looks like a new line and unchanged otherwise.  Dies with
# 'Bad Td/TD' on any other operand shape.
sub _Td
{
   my ($text, $operands) = @_;

   die 'Bad Td/TD' if @{$operands} != 2;

   my ($x_move, $y_move) = @{$operands};
   die 'Bad Td/TD'
      if $x_move->{type} ne 'number' || $y_move->{type} ne 'number';

   # Heuristic: a downward move whose vertical size is more than half
   # of its horizontal size is taken to be a new line.
   my $goes_down = $y_move->{value} < 0;
   if ($goes_down && 2 * abs($y_move->{value}) > abs $x_move->{value})
   {
      $text =~ s/ [ ]* \z /\n/xms;
   }

   return $text;
}

# Handle the T* operator: end the current line of text.
#
# Takes the text accumulated so far and returns it with any trailing
# blanks replaced by a single line break.
sub _Tstar
{
   my ($text) = @_;

   $text =~ s/ [ ]* \z /\n/xms;
   return $text;
}

1;
__END__

=back

=head1 AUTHOR

See L<CAM::PDF>

=cut