/usr/share/pyshared/archmod/CHMParser.py is in archmage 1:0.2.4-3.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
# -*- coding: utf-8 -*-
import re
import mimetypes
import sgmllib, urllib2
from BeautifulSoup import BeautifulSoup
from HTMLParser import HTMLParser, HTMLParseError
from urlparse import urlparse
from archmod import COMMASPACE, LF, CR
# Bracket delimiters that SitemapParser/SitemapFile emit around each entry
# when serialising the sitemap tree to text.
START_TAG = "["
END_TAG = "]"
class SitemapFile(object):
    """Sitemap file class.

    Normalises sitemap markup on construction (BeautifulSoup prettify plus
    removal of empty ``<ul>``/``<li>`` elements) and converts it to bracketed
    text with ``parse()``.
    """

    # Compiled once; an element is "empty" when it holds only whitespace.
    _EMPTY_TAG_RES = (
        re.compile(r'<ul>\s*</ul>', re.I | re.M),
        re.compile(r'<li>\s*</li>', re.I | re.M),
    )

    def __init__(self, lines):
        """Clean up the sitemap markup *lines* and keep it in ``self.lines``."""
        # XXX: Cooking tasty beautiful soup ;-)
        soup = BeautifulSoup(lines)
        # XXX: Removing empty tags
        self.lines = self._strip_empty_tags(soup.prettify())

    @classmethod
    def _strip_empty_tags(cls, text):
        """Return *text* with all empty ``<ul>``/``<li>`` elements removed.

        Removing one empty element can leave its parent empty (for example
        ``<ul><li></li></ul>``), so the substitutions are repeated until the
        text stops changing.
        """
        while True:
            stripped = text
            for pattern in cls._EMPTY_TAG_RES:
                stripped = pattern.sub('', stripped)
            if stripped == text:
                return stripped
            text = stripped

    def parse(self):
        """Run SitemapParser over the cleaned markup and return its text.

        The parser's output is followed by LF and the final closing bracket.
        """
        parser = SitemapParser()
        parser.feed(self.lines)
        # parsed text + last bracket
        return parser.parsed + LF + END_TAG
class TagStack(list):
    """Stack of open tag names.

    From the book of David Mertz 'Text Processing in Python'.
    """

    _PARAGRAPH_TAGS = ('p', 'blockquote')

    def append(self, tag):
        """Push *tag* onto the stack.

        If *tag* is paragraph-level ('p' or 'blockquote'), every
        paragraph-level tag already on the stack is removed first.
        """
        if tag.lower() in self._PARAGRAPH_TAGS:
            # Mutate in place: rebinding ``self`` would leave the real list
            # untouched.
            self[:] = [t for t in self
                       if t.lower() not in self._PARAGRAPH_TAGS]
        super(TagStack, self).append(tag)

    def pop(self, tag):
        """Remove the occurrence of *tag* nearest the top, plus all above it.

        Unlike ``list.pop`` this takes a tag name and returns nothing.

        Raises:
            HTMLParseError: *tag* is not on the stack; the stack is left
                unchanged in that case.
        """
        # Scan from the top downwards instead of reversing the list, so a
        # failed lookup cannot leave the stack in reversed order.
        for pos in range(len(self) - 1, -1, -1):
            if self[pos] == tag:
                del self[pos:]
                return
        raise HTMLParseError('Tag not on stack')
class SitemapParser(sgmllib.SGMLParser):
    """Class for parsing files in SiteMap format, such as .hhc.

    Nothing is processed before the first ``<ul>``.  From there on every
    ``<li>`` opens a bracket and every ``<object type="text/sitemap">`` adds
    ``"name", "local", "imagenumber"`` taken from its ``<param>`` children.
    The text is accumulated in ``self.parsed``.
    """

    def __init__(self):
        self.tagstack = TagStack()
        self.in_obj = False
        self.name = self.local = self.param = ""
        self.imagenumber = 1
        self.parsed = ""
        sgmllib.SGMLParser.__init__(self)

    def unknown_starttag(self, tag, attrs):
        """Handle a start tag; *attrs* is a list of (name, value) pairs."""
        if not self.tagstack:
            # first ul, start processing from here
            if tag == 'ul':
                self.tagstack.append(tag)
                # First bracket
                self.parsed += LF + START_TAG
            return

        # inside ul
        if tag == 'li':
            self._open_item()
        elif tag == 'object':
            for key, value in attrs:
                if key.lower() == 'type' and value.lower() == 'text/sitemap':
                    self.in_obj = True
        elif tag.lower() == 'param' and self.in_obj:
            self._read_param(attrs)
        self.tagstack.append(tag)

    def _open_item(self):
        """Close the previous item's bracket if needed and open a new one."""
        # append closing bracket if needed
        if self.tagstack[-1] != 'ul':
            self.parsed += END_TAG
            self.tagstack.pop('li')
        indent = ' ' * len(self.tagstack)
        # Every item except the very first one is preceded by a separator.
        if self.parsed != LF + START_TAG:
            self.parsed += COMMASPACE
        self.parsed += LF + indent + START_TAG

    def _read_param(self, attrs):
        """Store the name/local/imagenumber value carried by one <param>."""
        # Look the attributes up by key so the result does not depend on
        # whether ``name`` is written before or after ``value``.
        params = dict((key.lower(), value) for key, value in attrs)
        if 'name' in params:
            self.param = params['name'].lower()
        if 'value' not in params:
            return
        value = params['value']
        if self.param == 'name' and not self.name:
            # XXX: Remove LF and/or CR signs from name
            self.name = value.replace(LF, '').replace(CR, '')
            # XXX: Escaping double quotes :-)
            self.name = self.name.replace('"', '\\"')
        elif self.param == 'local':
            # XXX: Change incorrect slashes in url
            # NOTE(review): a chained .replace('..\\', '') used to follow
            # here, but it could never match once every backslash had been
            # converted, so it was dropped as dead code.  If stripping
            # '..\' prefixes is wanted it has to run before the slash
            # conversion - confirm the intent.
            self.local = value.lower().replace('\\', '/')
        elif self.param == 'imagenumber':
            self.imagenumber = value

    def unknown_endtag(self, tag):
        """Handle an end tag; emits the collected entry on ``</object>``."""
        # Only when inside ul
        if not self.tagstack:
            return
        if tag == 'ul':
            self.parsed += END_TAG
        if tag == 'object' and self.in_obj:
            # "Link Name", "URL", "Icon"
            self.parsed += "\"%s\", \"%s\", \"%s\"" % (self.name, self.local, self.imagenumber)
            # Set to default values
            self.in_obj = False
            self.name = self.local = ""
            self.imagenumber = 1
        # <li> stays on the stack until the next <li> or an enclosing tag
        # closes.
        if tag != 'li':
            self.tagstack.pop(tag)
class PageLister(sgmllib.SGMLParser):
"""
Parser of the chm.chm GetTopicsTree() method that retrieves the URL of the HTML
page embedded in the CHM file.
"""
def reset(self):
sgmllib.SGMLParser.reset(self)
self.pages = []
def start_param(self, attrs):
urlparam_flag = False
for key, value in attrs:
if key == 'name' and value.lower() == 'local':
urlparam_flag = True
if urlparam_flag and key == 'value':
# Sometime url has incorrect slashes
value = urllib2.unquote(urlparse(value.replace('\\', '/')).geturl())
value = '/' + re.sub("#.*$", '', value)
# Avoid duplicates
if not self.pages.count(value):
self.pages.append(value)
class ImageCatcher(sgmllib.SGMLParser):
"""
Finds image urls in the current html page, so to take them out from the chm file.
"""
def reset(self):
sgmllib.SGMLParser.reset(self)
self.imgurls = []
def start_img(self, attrs):
for key, value in attrs:
if key.lower() == 'src':
# Avoid duplicates in the list of image URLs.
if not self.imgurls.count('/' + value):
self.imgurls.append('/' + value)
def start_a(self, attrs):
for key, value in attrs:
if key.lower() == 'href':
url = urlparse(value)
value = urllib2.unquote(url.geturl())
# Remove unwanted crap
value = '/' + re.sub("#.*$", '', value)
# Check file's mimetype
type = mimetypes.guess_type(value)[0]
# Avoid duplicates in the list of image URLs.
if not url.scheme and not self.imgurls.count(value) and \
type and re.search('image/.*', type):
self.imgurls.append(value)
class TOCCounter(HTMLParser):
    """Count Table of Contents levels.

    ``count`` ends up as the largest number of 'param' entries found on the
    tag stack at the moment an ``</object>`` end tag is seen.
    """

    count = 0

    def __init__(self):
        self.tagstack = TagStack()
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Push every opened tag onto the stack."""
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        """Update ``count`` on ``</object>`` and unwind the stack."""
        # Nothing to unwind while no tag has been opened.
        if not self.tagstack:
            return
        lowered = tag.lower()
        if lowered == 'object':
            params_open = self.tagstack.count('param')
            if params_open > self.count:
                self.count = params_open
        # 'li' end tags never pop the stack.
        if lowered != 'li':
            self.tagstack.pop(tag)
# One counter attribute per heading level, bumped by name.
class HeadersCounter(HTMLParser):
    """Count headers tags.

    After feeding markup, ``h1`` .. ``h6`` hold how many start tags of each
    heading level were seen.
    """

    h1 = 0
    h2 = 0
    h3 = 0
    h4 = 0
    h5 = 0
    h6 = 0

    def handle_starttag(self, tag, attrs):
        """Increment the counter that matches a heading start tag."""
        level = tag.lower()
        if level in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            setattr(self, level, getattr(self, level) + 1)
|