This file is indexed.

/usr/lib/ruby/1.8/html/htmltokenizer.rb is in libhtml-htmltokenizer-ruby 1.0-3.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
# = HTMLTokenizer
#
# Author::    Ben Giddings  (mailto:bg-rubyforge@infofiend.com)
# Copyright:: Copyright (c) 2004 Ben Giddings
# License::   Distributes under the same terms as Ruby
#
#
# This is a partial port of the functionality behind Perl's HTML::TokeParser.
# Given a page, it progressively returns tokens from that page.
#
# $Id: htmltokenizer.rb,v 1.7 2005/06/07 21:05:53 merc Exp $

#
# A class to tokenize HTML.
#
# Example:
#
#   page = "<HTML>
#   <HEAD>
#   <TITLE>This is the title</TITLE>
#   </HEAD>
#    <!-- Here comes the <a href=\"missing.link\">blah</a>
#    comment body
#     -->
#    <BODY>
#      <H1>This is the header</H1>
#      <P>
#        This is the paragraph, it contains
#        <a href=\"link.html\">links</a>,
#        <img src=\"blah.gif\" optional alt='images
#        are
#        really cool'>.  Ok, here is some more text and
#        <A href=\"http://another.link.com/\" target=\"_blank\">another link</A>.
#      </P>
#    </body>
#    </HTML>
#    "
#    toke = HTMLTokenizer.new(page)
#
#    assert("<h1>" == toke.getTag("h1", "h2", "h3").to_s.downcase)
#    assert(HTMLTag.new("<a href=\"link.html\">") == toke.getTag("IMG", "A"))
#    assert("links" == toke.getTrimmedText)
#    assert(toke.getTag("IMG", "A").attr_hash['optional'])
#    assert("_blank" == toke.getTag("IMG", "A").attr_hash['target'])
#
class HTMLTokenizer
  @@version = 1.0

  # Get version of HTMLTokenizer lib
  def self.version
    @@version
  end

  # The page being tokenized, as a string.
  attr_reader :page

  # Create a new tokenizer for +content+, which is converted to a
  # string with +to_s+.  Tokenizing starts at the beginning of the page.
  def initialize(content)
    @page = content.to_s
    @cur_pos = 0
  end

  # Reset the parser, setting the current position back at the start
  def reset
    @cur_pos = 0
  end

  # Look at the next token, but don't actually grab it.
  #
  # Returns an HTMLComment, an HTMLTag or an HTMLText, or +nil+ when the
  # whole page has been consumed.  Raises a RuntimeError when a tag or a
  # comment is opened but never closed.
  def peekNextToken
    return nil if @cur_pos >= @page.length

    # A one character slice is compared (rather than a single index) so
    # the test behaves the same whether indexing yields a character code
    # or a one character string.
    if '<' == @page[@cur_pos, 1]
      # Next token is a tag of some kind
      if '!--' == @page[(@cur_pos + 1), 3]
        # Token is a comment, it runs up to and including the next '-->'
        tag_end = @page.index('-->', (@cur_pos + 1))
        if tag_end.nil?
          raise "No end found to started comment:\n#{@page[@cur_pos,80]}"
        end
        HTMLComment.new(@page[@cur_pos .. (tag_end + 2)])
      else
        # Token is a html tag, it runs up to and including the next '>'
        tag_end = @page.index('>', (@cur_pos + 1))
        if tag_end.nil?
          raise "No end found to started tag:\n#{@page[@cur_pos,80]}"
        end
        HTMLTag.new(@page[@cur_pos .. tag_end])
      end
    else
      # Next token is text, it runs up to the next '<' or, when there is
      # none, to the end of the page (an end index of -1)
      text_end = @page.index('<', @cur_pos)
      text_end = text_end.nil? ? -1 : (text_end - 1)
      HTMLText.new(@page[@cur_pos .. text_end])
    end
  end

  # Get the next token and move past it.  Returns an instance of
  # * HTMLText
  # * HTMLComment
  # * HTMLTag
  # or +nil+ when there are no tokens left.
  def getNextToken
    token = peekNextToken
    @cur_pos += token.raw.length if token
    token
  end

  # Get a tag from the specified set of desired tags.
  # For example:
  # <tt>foo =  toke.getTag("h1", "h2", "h3")</tt>
  # Will return the next header tag encountered.
  #
  # Names are matched case-insensitively; an end tag is sought with a
  # leading slash (<tt>"/a"</tt>).  With no arguments the next tag of any
  # kind is returned.  Every token up to and including the returned tag
  # is consumed.  Returns +nil+ if the page ends before a match is found.
  def getTag(*sought_tags)
    wanted = sought_tags.collect { |name| name.to_s.downcase }

    while (token = getNextToken)
      next unless token.kind_of?(HTMLTag)
      break if wanted.empty? || wanted.include?(token.tag_name)
    end
    token
  end

  # Get all the text between the current position and the next tag
  # (if no tag is specified) or a specific later tag.
  #
  # Without +until_tag+: returns the text token at the current position
  # and consumes it, or returns "" (consuming nothing) when a tag comes
  # next or the page is exhausted.
  #
  # With +until_tag+: concatenates the text of every token before the
  # first tag named +until_tag+ (matched case-insensitively, like getTag),
  # appending a single space after each non-empty piece.  The matching
  # tag itself is left unconsumed.  If no such tag exists, everything up
  # to the end of the page is consumed.
  def getText(until_tag = nil)
    if until_tag.nil?
      # Nothing left, or a tag comes next: there is no text to return
      return "" if @cur_pos >= @page.length || '<' == @page[@cur_pos, 1]

      getNextToken.text
    else
      # Tag names are stored downcased, so normalise what we look for
      stop_at = until_tag.to_s.downcase
      ret_str = ""

      while (token = peekNextToken)
        break if token.kind_of?(HTMLTag) && token.tag_name == stop_at

        piece = token.text
        ret_str << (piece + " ") unless "" == piece
        # Step over the token just examined instead of parsing it again
        @cur_pos += token.raw.length
      end

      ret_str
    end
  end

  # Like getText, but squeeze all whitespace, getting rid of
  # leading and trailing whitespace, and squeezing multiple
  # spaces into a single space.
  def getTrimmedText(until_tag = nil)
    getText(until_tag).strip.gsub(/\s+/m, " ")
  end

end

# Common base class of the token types (text, comment and tag).
# A token wraps the exact slice of source it was created from.
class HTMLToken
  # The unmodified source text of the token
  attr_accessor :raw

  # Build a token around its raw source text
  def initialize(text)
    self.raw = text
  end

  # A token converts to a string as the source it was created from
  def to_s
    self.raw
  end

  # Text representation of the token; empty unless a subclass
  # overrides it
  def text
    ''
  end

  # The text representation with leading and trailing whitespace
  # removed and every inner run of whitespace collapsed to one space
  def trimmed_text
    stripped = text.strip
    stripped.gsub(/\s+/m, " ")
  end

  # Tokens are equal to anything whose to_s equals their raw source
  def ==(other)
    other_source = other.to_s
    raw == other_source
  end
end

# Token for a run of character data found outside of any tag
class HTMLText < HTMLToken
  # A text token is its own text representation: the raw source is
  # returned unchanged.
  def text; raw; end
end

# Class representing an HTML comment
class HTMLComment < HTMLToken
  # The comment body: everything between the opening <!-- and the final
  # -->, without the whitespace directly inside those delimiters.
  attr_accessor :contents

  # Build a comment token from its complete source +text+.
  #
  # Raises a RuntimeError unless the whole of +text+ is a comment, that
  # is, it starts with <!-- and ends with -->.
  def initialize(text)
    super(text)
    # \A and \z anchor to the whole string.  ^ and $ anchor to any line,
    # which would accept text that merely has a comment on one of its
    # lines.  The m flag lets the body span several lines.
    found = /\A<!--\s*(.*?)\s*-->\z/m.match(text)
    if found.nil?
      raise "Text passed to HTMLComment.initialize is not a comment"
    end

    @contents = found[1]
  end
end

# Class representing an HTML tag
class HTMLTag < HTMLToken
  # end_tag:: true for a closing tag such as </a>, false otherwise
  # tag_name:: the downcased tag name; closing tags keep a leading
  #            slash, e.g. "/a"
  attr_reader :end_tag, :tag_name

  # Build a tag token from its complete source +text+, which has to
  # start with '<' and end with '>'.
  #
  # Raises a RuntimeError if +text+ is not delimited that way or if no
  # tag name can be found in it.
  def initialize(text)
    super(text)
    # One character slices are compared so the check behaves the same
    # whether indexing yields a character code or a one character string
    unless '<' == text[0, 1] and '>' == text[-1, 1]
      raise "Text passed to HTMLTag.initialize is not a tag"
    end

    @attr_hash = Hash.new

    # The first run of name characters is the tag name; the slash of an
    # end tag is not a name character, so it is skipped over
    tag_name = text.scan(/[\w:-]+/)[0]
    if tag_name.nil?
      raise "Error, no tag name found in: #{text}"
    end

    if '/' == text[1, 1]
      # It's an end tag
      @end_tag = true
      @tag_name = '/' + tag_name.downcase
    else
      @end_tag = false
      @tag_name = tag_name.downcase
    end

    @hashed = false
  end

  # Retrieve a hash of all the tag's attributes.
  # Lazily done, so that if you don't look at a tag's attributes
  # things go quicker
  #
  # Keys are the downcased attribute names.  Values have their
  # enclosing quotes removed; an attribute given without a value maps
  # to its own name as written in the source.  End tags always yield
  # an empty hash.
  def attr_hash
    # Lazy initialize == don't build the hash until it's needed
    unless @hashed
      parse_attributes unless @end_tag
      @hashed = true
    end

    @attr_hash
  end

  # Get the 'alt' text for a tag, if it exists, or an empty string otherwise
  # Only img and applet start tags are looked at.
  def text
    return '' if end_tag

    case tag_name
    when 'img', 'applet'
      alt = attr_hash['alt']
      return alt unless alt.nil?
    end
    ''
  end

  private

  # Fill @attr_hash from the attribute section of the raw tag source
  def parse_attributes
    # Everything between the tag name and the closing '>'
    section = @raw.scan(/<[\w:-]+\s+(.*)>/m)[0]
    return unless section.kind_of?(Array)

    # Each match is [name, value or nil, unquoted value or nil]
    pairs = section[0].scan(/\s*([\w:-]+)(?:\s*=\s*("[^"]*"|'[^']*'|([^"'>][^\s>]*)))?/m)
    pairs.each do |pair|
      name = pair[0]
      value = pair[1]
      @attr_hash[name.downcase] =
        if value.nil?
          # Attribute without a value, it stands for itself
          name
        elsif '"' == value[0, 1] or "'" == value[0, 1]
          # Quoted value, drop the enclosing quotes
          value[1 .. -2]
        else
          value
        end
    end
  end
end

# Self-test: the unit tests below are only set up when this file is
# executed directly, not when it is loaded as a library.
if $0 == __FILE__
  require 'test/unit'

  # Test::Unit cases exercising HTMLTokenizer and the token classes.
  class TC_TestHTMLTokenizer < Test::Unit::TestCase
    # An unquoted attribute value (href=http://...) is still read in full.
    def test_bad_link
      toke = HTMLTokenizer.new("<p><a href=http://bad.com/link>foo</a></p>")
      assert("http://bad.com/link" == toke.getTag("a").attr_hash['href'])
    end

    # Tag and attribute names may contain a namespace prefix with a colon.
    def test_namespace
      toke = HTMLTokenizer.new("<f:table xmlns:f=\"http://www.com/foo\">")
      assert("http://www.com/foo" == toke.getTag("f:table").attr_hash['xmlns:f'])
    end

    # A comment is returned as an HTMLComment whose contents exclude the
    # delimiters and the whitespace next to them.
    def test_comment
      toke = HTMLTokenizer.new("<!-- comment on me -->")
      t = toke.getNextToken
      assert(HTMLComment == t.class)
      assert("comment on me" == t.contents)
    end


    # Walk through a small but complete page.  The assertions depend on
    # the tokenizer position left behind by the previous call, so their
    # order matters.
    def test_full
      page = "<HTML>
<HEAD>
<TITLE>This is the title</TITLE>
</HEAD>
<!-- Here comes the <a href=\"missing.link\">blah</a>
comment body
 -->
<BODY>
  <H1>This is the header</H1>
  <P>
    This is the paragraph, it contains
    <a href=\"link.html\">links</a>, 
    <img src=\"blah.gif\" optional alt='images
are
really cool'>.  Ok, here is some more text and
    <A href=\"http://another.link.com/\" target=\"_blank\">another link</A>.
  </P>
</body>
</HTML>
"
      toke = HTMLTokenizer.new(page)

      # Tag names are matched regardless of case
      assert("<h1>" == toke.getTag("h1", "h2", "h3").to_s.downcase)
      # The link inside the comment is skipped, the first real link is found
      assert(HTMLTag.new("<a href=\"link.html\">") == toke.getTag("IMG", "A"))
      assert("links" == toke.getTrimmedText)
      # The img tag comes next; its valueless attribute has a truthy value
      assert(toke.getTag("IMG", "A").attr_hash['optional'])
      assert("_blank" == toke.getTag("IMG", "A").attr_hash['target'])
    end
  end
end