This file is indexed.

/usr/share/pyshared/archmod/CHMParser.py is in archmage 1:0.2.4-3.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
# -*- coding: utf-8 -*-

import re
import mimetypes
import sgmllib, urllib2

from BeautifulSoup import BeautifulSoup
from HTMLParser import HTMLParser, HTMLParseError
from urlparse import urlparse

from archmod import COMMASPACE, LF, CR

# Opening and closing delimiters of the nested list text that SitemapParser
# accumulates in ``parsed`` (SitemapFile.parse() appends the final END_TAG).
START_TAG = '['
END_TAG = ']'


class SitemapFile(object):
	"""Holds the cleaned-up markup of a sitemap file and converts it to text.

	The raw markup is normalised through BeautifulSoup when the object is
	created; parse() then runs it through SitemapParser.
	"""

	# Empty containers left in the prettified markup; they carry no entries
	_EMPTY_UL = re.compile(r'<ul>\s*</ul>', re.I | re.M)
	_EMPTY_LI = re.compile(r'<li>\s*</li>', re.I | re.M)

	def __init__(self, lines):
		"""Prettify *lines* and strip empty <ul>/<li> pairs (lists first)."""
		markup = BeautifulSoup(lines).prettify()
		markup = self._EMPTY_UL.sub('', markup)
		markup = self._EMPTY_LI.sub('', markup)
		self.lines = markup

	def parse(self):
		"""Return the SitemapParser output followed by LF and the last END_TAG."""
		parser = SitemapParser()
		parser.feed(self.lines)
		return parser.parsed + LF + END_TAG


class TagStack(list):
	"""Stack of open tag names (after David Mertz, 'Text Processing in Python').

	Behaves like a list, except that append() keeps at most one
	paragraph-level tag open and pop() closes a tag by name instead of by
	position.
	"""

	# Tags that implicitly close any paragraph-level tag opened before them
	_PARAGRAPH_TAGS = ('p', 'blockquote')

	def append(self, tag):
		"""Push *tag* on the stack.

		When *tag* is paragraph-level, every paragraph-level tag already on
		the stack is removed first.
		"""
		if tag.lower() in self._PARAGRAPH_TAGS:
			# Mutate in place: rebinding ``self`` would leave the real stack
			# untouched, and the builtin ``super`` is not iterable.
			self[:] = [t for t in self if t.lower() not in self._PARAGRAPH_TAGS]
		super(TagStack, self).append(tag)

	def pop(self, tag):
		"""Close the nearest (most recently pushed) occurrence of *tag*.

		The matching entry and everything pushed after it are removed.

		Raises HTMLParseError if *tag* is not on the stack; the stack is left
		unchanged in that case.
		"""
		# Search from the top of the stack without reversing the list, so a
		# failed lookup cannot leave the stack in reversed order.
		for pos in range(len(self) - 1, -1, -1):
			if self[pos] == tag:
				del self[pos:]
				return
		raise HTMLParseError('Tag not on stack')


class SitemapParser(sgmllib.SGMLParser):
	"""Class for parsing files in SiteMap format, such as .hhc

	While the markup is fed in, ``self.parsed`` accumulates a text made of
	START_TAG/END_TAG delimited entries; each sitemap object is written as
	"name", "local url", "image number". SitemapFile.parse() appends a final
	END_TAG to the accumulated text.
	"""
	
	def __init__(self):
		# Open tags seen since the first <ul>; empty means "not inside a list yet"
		self.tagstack = TagStack()
		# True while inside an <object type="text/sitemap">
		self.in_obj = False
		# Values collected from the <param> tags of the current object;
		# ``param`` remembers the most recent name="..." attribute value
		self.name = self.local = self.param = ""
		self.imagenumber = 1
		# Output text built up during parsing
		self.parsed = ""
		sgmllib.SGMLParser.__init__(self)

	def unknown_starttag(self, tag, attrs):
		"""Handle an opening tag: open brackets for lists/items, collect params.

		Tags seen before the first <ul> are ignored entirely.
		"""
		# first ul, start processing from here
		if tag == 'ul' and not self.tagstack:
			self.tagstack.append(tag)
			# First bracket
			self.parsed += LF + START_TAG

		# if inside ul
		elif self.tagstack:
			if tag == 'li':
				# append closing bracket if needed
				# (the top of the stack is not a <ul>, so the previous item is
				# still open and has to be closed before starting a new one)
				if self.tagstack[-1] != 'ul':
					self.parsed += END_TAG
					self.tagstack.pop('li')
				# One space of indentation per tag currently on the stack
				indent = ' ' * len(self.tagstack)

				# Separate from the previous entry unless this is the very first one
				if self.parsed != LF + START_TAG:
					self.parsed += COMMASPACE

				self.parsed += LF + indent + START_TAG

			if tag == 'object':
				for x, y in attrs:
					if x.lower() == 'type' and y.lower() == 'text/sitemap':
						self.in_obj = True

			if tag.lower() == 'param' and self.in_obj:
				# Relies on the name attribute being seen before the value
				# attribute of the same <param> tag
				for x, y in attrs:
					if x.lower() == 'name':
						self.param = y.lower()
					elif x.lower() == 'value':
						# Only the first non-empty "name" param of an object is kept
						if self.param == 'name' and not len(self.name):
							# XXX: Remove LF and/or CR signs from name
							self.name = y.replace(LF, '').replace(CR, '')
							# XXX: Escape double quotes, the name is emitted
							# inside a double-quoted string
							self.name = self.name.replace('"', '\\"')
						elif self.param == 'local':
							# XXX: Change incorrect slashes in url
							# NOTE(review): the second replace() looks like a
							# no-op - every backslash has already been turned
							# into '/' by the first one, so '..\' cannot occur
							self.local = y.lower().replace('\\', '/').replace('..\\', '')
						elif self.param == 'imagenumber':
							self.imagenumber = y
			# Every tag seen inside the list (including li/object/param) is tracked
			self.tagstack.append(tag)

	def unknown_endtag(self, tag):
		"""Handle a closing tag: close brackets and emit the finished object."""
		# if inside ul
		if self.tagstack:
			if tag == 'ul':
				self.parsed += END_TAG
			if tag == 'object' and self.in_obj:
				# "Link Name", "URL", "Icon"
				self.parsed += "\"%s\", \"%s\", \"%s\"" % (self.name, self.local, self.imagenumber)
				# Set to default values
				self.in_obj = False
				self.name = self.local = ""
				self.imagenumber = 1
			# </li> is not popped here: an open item is closed when the next
			# <li> starts (see unknown_starttag) or when an enclosing tag is popped
			if tag != 'li':
				self.tagstack.pop(tag)


class PageLister(sgmllib.SGMLParser):
	"""
	Parser of the chm.chm GetTopicsTree() method that retrieves the URL of the HTML
	page embedded in the CHM file.
	"""

	def reset(self):
		sgmllib.SGMLParser.reset(self)
		self.pages = []

	def start_param(self, attrs):
		urlparam_flag = False
		for key, value in attrs:
			if key == 'name' and value.lower() == 'local':
				urlparam_flag = True
			if urlparam_flag and key == 'value':
				# Sometime url has incorrect slashes
				value = urllib2.unquote(urlparse(value.replace('\\', '/')).geturl())
				value = '/' + re.sub("#.*$", '', value)
				# Avoid duplicates
				if not self.pages.count(value):
					self.pages.append(value)


class ImageCatcher(sgmllib.SGMLParser):
	"""
	Finds image urls in the current html page, so to take them out from the chm file.
	"""

	def reset(self):
		sgmllib.SGMLParser.reset(self)
		self.imgurls = []

	def start_img(self, attrs):
		for key, value in attrs:
			if key.lower() == 'src':
				# Avoid duplicates in the list of image URLs.
				if not self.imgurls.count('/' + value):
					self.imgurls.append('/' + value)

	def start_a(self, attrs):
		for key, value in attrs:
			if key.lower() == 'href':
				url = urlparse(value)
				value = urllib2.unquote(url.geturl())
				# Remove unwanted crap
				value = '/' + re.sub("#.*$", '', value)
				# Check file's mimetype
				type = mimetypes.guess_type(value)[0]
				# Avoid duplicates in the list of image URLs.
				if not url.scheme and not self.imgurls.count(value) and \
				        type and re.search('image/.*', type):
					self.imgurls.append(value)


class TOCCounter(HTMLParser):
	"""Count Table of Contents levels

	``count`` ends up as the largest number of 'param' entries found on the
	tag stack at the moment an </object> tag is seen.
	"""

	count = 0

	def __init__(self):
		self.tagstack = TagStack()
		HTMLParser.__init__(self)

	def handle_starttag(self, tag, attrs):
		"""Track every opening tag on the stack."""
		self.tagstack.append(tag)

	def handle_endtag(self, tag):
		"""Update ``count`` on </object> and unwind the stack (except for </li>)."""
		if not self.tagstack:
			return
		closing = tag.lower()
		if closing == 'object':
			open_params = self.tagstack.count('param')
			if open_params > self.count:
				self.count = open_params
		if closing != 'li':
			self.tagstack.pop(tag)


# XXX: Seems to be an ugly solution...
class HeadersCounter(HTMLParser):
	"""Count headers tags

	The attributes ``h1`` .. ``h6`` hold how many opening tags of each
	heading level have been seen by this parser instance.
	"""

	h1 = h2 = h3 = h4 = h5 = h6 = 0

	def handle_starttag(self, tag, attrs):
		"""Increment the counter named after *tag* when it is a heading tag."""
		level = tag.lower()
		if level in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
			setattr(self, level, getattr(self, level) + 1)