This file is indexed.

/usr/share/doc/newsbeuter/contrib/heise.rb is in newsbeuter 2.9-7.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

#!/usr/bin/ruby
#
# get, parse and enrich heise rss feeds
#
# call with the feed you would like to retrieve. Currently supported:
#
#  news      - heise newsticker
#  telepolis - Telepolis
#  security  - heise security news
#
# Change history
#
#  26.06.2009    erb    suppressed error messages due to unresponsive servers
#  28.03.2010    erb    Added DB cache to speed things up (significantly!)
#
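# Example invocation (assuming the script is executable and feedgrabber.rb
# sits next to it):
#
#   ./heise.rb news > heise-news.xml
#
# The enriched feed is written to stdout, so a reader that can run commands
# as feed sources (e.g. newsbeuter's "exec:" URLs) can consume it directly.
#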
$:.push(File.dirname($0))
require 'feedgrabber'
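# the load-path tweak above lets Ruby find feedgrabber.rb in the same
# directory as this script; it provides the FeedGrabber class used below for
# cached/uncached fetching (getURL, getURL_uncached) and cache cleanup (cleanupDB)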

require 'rexml/document'
include REXML

# prefer the lower-case http_proxy variable; fall back to an upper-case
# HTTP_PROXY setting if only that one is present
if ENV['http_proxy'].nil? && !ENV['HTTP_PROXY'].nil?
  ENV['http_proxy'] = ENV['HTTP_PROXY']
end

#          key            feed URL
FEEDS = { "news"      => "http://www.heise.de/newsticker/heise-atom.xml",
          "telepolis" => "http://www.heise.de/tp/news-atom.xml",
	  "security"  => "http://www.heise.de/security/news/news-atom.xml",
	  "netze"     => "http://www.heise.de/netze/rss/netze-atom.xml",
	  "it-blog"   => "http://www.heise.de/developer/rss/world-of-it/blog-atom.xml"
	}

GOOGLEON="<!--googleon: index-->"
GOOGLEOFF="<!--googleoff: index-->"
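# heise.de article pages wrap the indexable article body in these
# googleon/googleoff comment markers (a Google search-appliance convention);
# shortenArticle below keeps only the text between them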

def listFeeds
  FEEDS.each_key { |k| print "  #{k}\n" }
end

if ARGV.length < 1
  print "usage: #{File::basename($0)} <feed>\n"
  print "<feed> is one of\n"
  listFeeds
  exit
end

def shortenArticle(article_text)
  article_text.gsub!(/<!\[CDATA\[/, "")
  article_text.gsub!(/\]\]>/, "")

  # now, heise speciality: get everything between GOOGLEON and GOOGLEOFF patterns :-)
  p1 = article_text.index(GOOGLEON)
  p2 = article_text.index(GOOGLEOFF)
  if (p1 && p2)
    result = ""
    pos = p1
    while(pos < article_text.length) do
      p1 = article_text.index(GOOGLEON, pos)
      break unless p1
      p2 = article_text.index(GOOGLEOFF, pos)
      p2 = article_text.length unless p2
      if p1 < p2
        result += article_text[p1+GOOGLEON.length..p2-1]
        pos = p2+GOOGLEOFF.length
      else
        pos = p1+GOOGLEON.length
      end
    end
    article_text = result
  end

  # get rid of comments and other annoying artifacts
  article_text.gsub!(/<!--LINK_ICON--><img[^>]*><!--\/LINK_ICON-->/m, " ")
  article_text.gsub!(/<!--[^>]*-->/, "")
  article_text.gsub!(/\s+/m, " ")
  article_text.gsub!(/href=\"\//m, "href=\"http://www.heise.de/")
  article_text.gsub!(/src=\"\//m, "src=\"http://www.heise.de/")

  article_text
end
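
# Rough, commented-out illustration of shortenArticle on made-up input
# (not real heise markup):
#
#   shortenArticle("nav#{GOOGLEON}article body#{GOOGLEOFF}footer")
#   # => "article body"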

feed=ARGV[0]

unless FEEDS.has_key?(feed)
  print "unknown feed '#{feed}'. Use one of these:\n"
  listFeeds
  exit
end

feedurl = FEEDS[feed]

#get feed
fg = FeedGrabber.new("heisecache-#{feed}")
feed_text = fg.getURL_uncached(feedurl)

exit 2 unless feed_text && feed_text.length > 20

xml = Document.new(feed_text)

#loop over items
xml.elements.each("//entry") do |item|
  # extract link to article
  article_url = item.elements['id'].text
  article_url.sub!(%r{from/.*$}, "")
  article_short_url = article_url.sub(%r{/[^/]*--/}, "/")
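  # (article_short_url is derived here but not used further below)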

  # get full text for article
  article_text = fg.getURL(article_url)
  next unless article_text && article_text.length > 20

  # extract article comment link
  begin
    # match() returns nil when no forum link is present, so indexing it raises;
    # the bare rescue then falls back to an empty comment link
    comments = /<a href=\"(\/[a-z\/]*\/foren\/[^\/]*\/forum-[0-9]*\/list\/)\"/m.match(article_text)[1]
  rescue
    comments = ""
  end
  article_text = shortenArticle(article_text)
  
  article_text += "<p><a href=\"http://www.heise.de#{comments}\">Kommentare</a></p>" if comments.length > 5 && comments.length < 150

  # insert full text article into feed
  description = Element.new("content")
  description.add_attribute("type", "html")
  description.text = CData.new(article_text)
  item.add_element(description)
end
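# (each <entry> now carries an extra child element roughly of the form
#  <content type="html"><![CDATA[ ...full article text plus comment link... ]]></content>)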

fg.cleanupDB
  
# write the content-enriched feed to stdout
xml.write($stdout, -1)