This file is indexed.

/usr/lib/python2.7/dist-packages/archmod/CHMParser.py is in archmage 1:0.3.1-3.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
# -*- coding: utf-8 -*-
#
# archmage -- CHM decompressor
# Copyright (c) 2009 Basil Shubin <bashu@users.sourceforge.net>
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation; either version 2 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
# Street, Fifth Floor, Boston, MA 02110-1301, USA.
#

import re
import mimetypes
import sgmllib, urllib2

from BeautifulSoup import BeautifulSoup
from HTMLParser import HTMLParser, HTMLParseError
from urlparse import urlparse

from archmod import COMMASPACE, LF, CR

# Delimiters written around every entry of the bracketed text that
# SitemapParser accumulates in ``parsed`` (SitemapFile.parse() appends the
# final END_TAG).
START_TAG = '['
END_TAG = ']'


class SitemapFile(object):
    """Holds the cleaned-up markup of a sitemap file and renders it as text."""

    def __init__(self, lines):
        """Normalise *lines* (sitemap markup) and keep the result in ``self.lines``.

        A falsy *lines* is stored as ``None``.  Anything else is run through
        BeautifulSoup's ``prettify()`` and then stripped of ``<ul>`` / ``<li>``
        elements that contain nothing but whitespace.
        """
        if not lines:
            self.lines = None
            return

        markup = BeautifulSoup(lines).prettify()
        # Drop empty list containers first, then empty list items.
        for empty_element in (r'<ul>\s*</ul>', r'<li>\s*</li>'):
            markup = re.compile(empty_element, re.I | re.M).sub('', markup)
        self.lines = markup

    def parse(self):
        """Feed the stored markup to a fresh SitemapParser and return its text.

        The returned string is the parser's accumulated ``parsed`` output
        followed by a line feed and the closing bracket.
        """
        parser = SitemapParser()
        if self.lines:
            parser.feed(self.lines)
        return parser.parsed + LF + END_TAG


class TagStack(list):
    """List-based stack of open tag names.

    Adapted from David Mertz's book 'Text Processing in Python'.
    """

    # Tags treated as paragraph-level: pushing one evicts the others.
    _PARAGRAPH_TAGS = ('p', 'blockquote')

    def append(self, tag):
        """Push *tag* onto the stack.

        If *tag* is paragraph-level (compared case-insensitively), every
        paragraph-level tag already on the stack is removed first.
        """
        if tag.lower() in self._PARAGRAPH_TAGS:
            # Filter in place with slice assignment: rebinding ``self`` would
            # only change the local name and leave the real stack untouched.
            self[:] = [t for t in self
                       if t.lower() not in self._PARAGRAPH_TAGS]
        super(TagStack, self).append(tag)

    def pop(self, tag):
        """Remove the most recently pushed *tag* and everything above it.

        Unlike ``list.pop`` this looks the entry up by value, searching from
        the top of the stack downwards, and returns ``None``.

        Raises:
            HTMLParseError: if *tag* is not on the stack.  The stack is left
                unchanged in that case.
        """
        # Scan from the top without mutating, so a failed lookup cannot leave
        # the stack in a half-modified (reversed) state.
        for pos in range(len(self) - 1, -1, -1):
            if self[pos] == tag:
                del self[pos:]
                return
        raise HTMLParseError('Tag not on stack')


class SitemapParser(sgmllib.SGMLParser):
    """Parser for files in SiteMap format, such as .hhc.

    While markup is fed in, a bracketed text rendering of the nested
    ``<ul>`` / ``<li>`` / ``<object>`` structure is accumulated in
    ``self.parsed``.
    """

    def __init__(self):
        # Output accumulated so far.
        self.parsed = ""
        # Fields of the sitemap object currently being read.
        self.imagenumber = 1
        self.name = ""
        self.local = ""
        # Lower-cased ``name`` attribute of the most recent <param>.
        self.param = ""
        # True while inside an <object type="text/sitemap">.
        self.in_obj = False
        self.tagstack = TagStack()
        sgmllib.SGMLParser.__init__(self)

    def _begin_item(self):
        """Emit the opening bracket for a new <li> entry."""
        stack = self.tagstack
        # A previous <li> is still open: close its bracket and unwind to it.
        if stack[-1] != 'ul':
            self.parsed += END_TAG
            stack.pop('li')

        # Separate from the preceding entry unless this is the very first one.
        if self.parsed != LF + START_TAG:
            self.parsed += COMMASPACE

        # Indentation is one space per tag still on the stack.
        self.parsed += LF + ' ' * len(stack) + START_TAG

    def _read_param(self, attrs):
        """Record the name/local/imagenumber carried by a <param> tag."""
        for key, value in attrs:
            key = key.lower()
            if key == 'name':
                self.param = value.lower()
                continue
            if key != 'value':
                continue

            if self.param == 'name':
                # Only the first name seen for an object is kept.
                if not len(self.name):
                    # Strip LF/CR from the name, then escape double quotes.
                    cleaned = value.replace(LF, '').replace(CR, '')
                    self.name = cleaned.replace('"', '\\"')
            elif self.param == 'local':
                # Turn backslashes in the url into forward slashes.
                # NOTE(review): the trailing replace can never match, since
                # no backslash is left by the time it runs.
                self.local = value.lower().replace('\\', '/').replace('..\\', '')
            elif self.param == 'imagenumber':
                self.imagenumber = value

    def unknown_starttag(self, tag, attrs):
        stack = self.tagstack

        if not stack:
            # Processing only starts at the first <ul>; anything seen before
            # it is ignored.
            if tag == 'ul':
                stack.append(tag)
                # First bracket
                self.parsed += LF + START_TAG
            return

        # Inside the outermost <ul> from here on.
        if tag == 'li':
            self._begin_item()

        if tag == 'object':
            for key, value in attrs:
                if key.lower() == 'type' and value.lower() == 'text/sitemap':
                    self.in_obj = True

        if tag.lower() == 'param' and self.in_obj:
            self._read_param(attrs)

        stack.append(tag)

    def unknown_endtag(self, tag):
        # Nothing to do until the first <ul> has been opened.
        if not self.tagstack:
            return

        if tag == 'ul':
            self.parsed += END_TAG

        if self.in_obj and tag == 'object':
            # "Link Name", "URL", "Icon"
            entry = (self.name, self.local, self.imagenumber)
            self.parsed += "\"%s\", \"%s\", \"%s\"" % entry
            # Back to the defaults for the next object.
            self.imagenumber = 1
            self.name = self.local = ""
            self.in_obj = False

        # <li> entries are unwound when the next <li> starts, not here.
        if tag == 'li':
            return
        self.tagstack.pop(tag)


class PageLister(sgmllib.SGMLParser):
    """
    Collects the URLs carried by ``<param name="Local" value="...">`` tags.

    Per the original author's note this is meant for the topics tree returned
    by chm.chm GetTopicsTree(), whose "Local" params point at the HTML pages
    embedded in the CHM file.  The URLs found are kept, without duplicates and
    in order of first appearance, in ``self.pages``.
    """

    def reset(self):
        """Reset the parser state and start with an empty page list."""
        sgmllib.SGMLParser.reset(self)
        self.pages = []

    def start_param(self, attrs):
        """Handle a <param> tag; record its value if it is a "Local" param.

        *attrs* is the list of ``(name, value)`` pairs supplied by the parser.
        """
        # Decide whether this is a "Local" param before looking at any value,
        # so the result does not depend on the order the attributes were
        # written in.
        is_local = False
        for key, value in attrs:
            if key == 'name' and value.lower() == 'local':
                is_local = True
                break
        if not is_local:
            return

        for key, value in attrs:
            if key != 'value':
                continue
            # Sometime url has incorrect slashes
            value = urllib2.unquote(urlparse(value.replace('\\', '/')).geturl())
            # Drop any "#fragment" and make the path absolute.
            value = '/' + re.sub("#.*$", '', value)
            # Avoid duplicates
            if value not in self.pages:
                self.pages.append(value)


class ImageCatcher(sgmllib.SGMLParser):
    """
    Gathers image URLs referenced by the current html page, so they can be
    taken out of the chm file.  Results accumulate in ``self.imgurls``.
    """

    def reset(self):
        """Reset the parser state and start with an empty URL list."""
        sgmllib.SGMLParser.reset(self)
        self.imgurls = []

    def start_img(self, attrs):
        """Record the ``src`` of an <img> tag, prefixed with '/'."""
        for attr_name, attr_value in attrs:
            if attr_name.lower() != 'src':
                continue
            candidate = '/' + attr_value
            # Keep each image URL only once.
            if candidate not in self.imgurls:
                self.imgurls.append(candidate)

    def start_a(self, attrs):
        """Record the ``href`` of an <a> tag when it points at an image.

        Only links without a URL scheme whose guessed mimetype contains
        ``image/`` are kept.
        """
        for attr_name, attr_value in attrs:
            if attr_name.lower() != 'href':
                continue

            link = urlparse(attr_value)
            decoded = urllib2.unquote(link.geturl())
            # Strip the "#fragment" part and make the path absolute.
            target = '/' + re.sub("#.*$", '', decoded)
            # Guess the mimetype from the file name.
            mime = mimetypes.guess_type(target)[0]

            # Skip links that carry a scheme and URLs already collected.
            if link.scheme or target in self.imgurls:
                continue
            if mime and re.search('image/.*', mime):
                self.imgurls.append(target)


class TOCCounter(HTMLParser):
    """Count Table of Contents levels.

    ``count`` ends up as the largest number of 'param' entries found on the
    tag stack at the moment an </object> tag is closed.
    """

    # Highest value seen so far; instances overwrite it as they parse.
    count = 0

    def __init__(self):
        self.tagstack = TagStack()
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Push every opening tag onto the stack."""
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        """Update ``count`` on </object> and unwind the stack to *tag*."""
        if not self.tagstack:
            return

        closing = tag.lower()
        if closing == 'object':
            open_params = self.tagstack.count('param')
            if open_params > self.count:
                self.count = open_params

        # </li> does not unwind the stack.
        if closing != 'li':
            self.tagstack.pop(tag)