# /usr/share/pyshared/archmod/CHMParser.py

# -*- coding: utf-8 -*-

import re
import mimetypes
import sgmllib, urllib2

from BeautifulSoup import BeautifulSoup
from HTMLParser import HTMLParser, HTMLParseError
from urlparse import urlparse

from archmod import COMMASPACE, LF, CR

# Opening and closing delimiters written around every entry of the nested,
# bracketed text that SitemapParser builds in its `parsed` attribute
# (SitemapFile.parse() appends the very last END_TAG).
START_TAG = '['
END_TAG = ']'


class SitemapFile(object):
	"""Holds the text of a sitemap file and converts it with SitemapParser.

	The text handed to the constructor is run through BeautifulSoup and
	pretty-printed, empty <ul>/<li> elements are removed, and the result is
	kept in `self.lines` until parse() is called.
	"""

	def __init__(self, lines):
		# Let BeautifulSoup rebuild the markup, then take its pretty-printed form
		markup = BeautifulSoup(lines).prettify()
		# Strip list elements that contain nothing but whitespace
		for empty_element in (r'<ul>\s*</ul>', r'<li>\s*</li>'):
			markup = re.compile(empty_element, re.I | re.M).sub('', markup)
		self.lines = markup

	def parse(self):
		"""Feed the stored text to a SitemapParser and return its output.

		The returned string is the parser's `parsed` text followed by a
		line feed and the last closing bracket.
		"""
		parser = SitemapParser()
		parser.feed(self.lines)
		return parser.parsed + LF + END_TAG


class TagStack(list):
	"""Stack of open tag names, stored as a list with the newest tag last.

	Adapted from David Mertz's book 'Text Processing in Python'.
	"""

	# Paragraph-level tags: opening one removes any that are already stacked
	PARAGRAPH_TAGS = ('p', 'blockquote')

	def append(self, tag):
		"""Push `tag` onto the stack.

		If `tag` is a paragraph-level tag, every paragraph-level tag already
		on the stack is removed first.
		"""
		if tag.lower() in self.PARAGRAPH_TAGS:
			# Slice assignment mutates this very list; rebinding `self`
			# would only change a local name and leave the stack untouched.
			self[:] = [t for t in self if t not in self.PARAGRAPH_TAGS]
		super(TagStack, self).append(tag)

	def pop(self, tag):
		"""Remove the occurrence of `tag` nearest the top, plus all above it.

		Unlike list.pop() this takes a tag name and returns nothing.

		Raises:
			HTMLParseError: if `tag` is not on the stack. The stack is left
			exactly as it was in that case.
		"""
		# Scan from the top down without reordering the list, so a failed
		# lookup cannot leave the stack in a reversed state.
		for pos in range(len(self) - 1, -1, -1):
			if self[pos] == tag:
				del self[pos:]
				return
		raise HTMLParseError('Tag not on stack')


class SitemapParser(sgmllib.SGMLParser):
	"""Class for parsing files in SiteMap format, such as .hhc

	While the document is fed in, the nested <ul>/<li> structure is rewritten
	into `self.parsed` as bracketed entries of the form
	"Link Name", "URL", "Icon", taken from the <param> tags found inside
	<object type="text/sitemap"> elements.
	"""

	def __init__(self):
		self.tagstack = TagStack()
		# True while inside an <object type="text/sitemap"> element
		self.in_obj = False
		# Values collected for the current object; `param` is the lowercased
		# name of the most recently seen <param>
		self.name = self.local = self.param = ""
		self.imagenumber = 1
		# Output text accumulated so far
		self.parsed = ""
		sgmllib.SGMLParser.__init__(self)

	def unknown_starttag(self, tag, attrs):
		"""Handle every opening tag; nothing is recorded before the first <ul>."""
		# first ul, start processing from here
		if tag == 'ul' and not self.tagstack:
			self.tagstack.append(tag)
			# First bracket
			self.parsed += LF + START_TAG

		# if inside ul
		elif self.tagstack:
			if tag == 'li':
				# The previous entry is still open (top of the stack is not
				# the enclosing ul): close its bracket and unwind to its li
				if self.tagstack[-1] != 'ul':
					self.parsed += END_TAG
					self.tagstack.pop('li')
				indent = ' ' * len(self.tagstack)

				# No separator directly after the very first bracket
				if self.parsed != LF + START_TAG:
					self.parsed += COMMASPACE

				self.parsed += LF + indent + START_TAG

			if tag == 'object':
				for x, y in attrs:
					if x.lower() == 'type' and y.lower() == 'text/sitemap':
						self.in_obj = True

			if tag.lower() == 'param' and self.in_obj:
				self._handle_param(attrs)
			self.tagstack.append(tag)

	def _handle_param(self, attrs):
		"""Store the name/value pair of one <param> tag of a sitemap object."""
		# Look attributes up by key so the result does not depend on whether
		# `name` or `value` is written first in the tag.
		fields = dict((key.lower(), value) for key, value in attrs)
		if 'name' in fields:
			self.param = fields['name'].lower()
		if 'value' not in fields:
			return
		value = fields['value']

		# Only the first "Name" param of an object is kept
		if self.param == 'name' and not self.name:
			# XXX: Remove LF and/or CR signs from name
			self.name = value.replace(LF, '').replace(CR, '')
			# XXX: Escape double quotes, the name is emitted inside "..."
			self.name = self.name.replace('"', '\\"')
		elif self.param == 'local':
			# XXX: Change incorrect slashes in url.  The '..\' prefixes have
			# to be dropped before the backslashes are converted, otherwise
			# there is nothing left for that replace to match.
			self.local = value.lower().replace('..\\', '').replace('\\', '/')
		elif self.param == 'imagenumber':
			self.imagenumber = value

	def unknown_endtag(self, tag):
		"""Handle every closing tag seen while inside the outermost <ul>.

		Raises HTMLParseError (from TagStack.pop) when a closing tag other
		than </li> has no matching entry on the tag stack.
		"""
		# if inside ul
		if self.tagstack:
			if tag == 'ul':
				self.parsed += END_TAG
			if tag == 'object' and self.in_obj:
				# "Link Name", "URL", "Icon"
				self.parsed += "\"%s\", \"%s\", \"%s\"" % (self.name, self.local, self.imagenumber)
				# Set to default values
				self.in_obj = False
				self.name = self.local = ""
				self.imagenumber = 1
			# li entries are unwound when the next <li> opens or when an
			# enclosing tag is popped, so </li> itself is ignored
			if tag != 'li':
				self.tagstack.pop(tag)


class PageLister(sgmllib.SGMLParser):
	"""
	Parser of the chm.chm GetTopicsTree() method that retrieves the URL of the HTML
	page embedded in the CHM file.
	"""

	def reset(self):
		sgmllib.SGMLParser.reset(self)
		self.pages = []

	def start_param(self, attrs):
		urlparam_flag = False
		for key, value in attrs:
			if key == 'name' and value.lower() == 'local':
				urlparam_flag = True
			if urlparam_flag and key == 'value':
				# Sometime url has incorrect slashes
				value = urllib2.unquote(urlparse(value.replace('\\', '/')).geturl())
				value = '/' + re.sub("#.*$", '', value)
				# Avoid duplicates
				if not self.pages.count(value):
					self.pages.append(value)


class ImageCatcher(sgmllib.SGMLParser):
	"""
	Finds image urls in the current html page, so to take them out from the chm file.
	"""

	def reset(self):
		sgmllib.SGMLParser.reset(self)
		self.imgurls = []

	def start_img(self, attrs):
		for key, value in attrs:
			if key.lower() == 'src':
				# Avoid duplicates in the list of image URLs.
				if not self.imgurls.count('/' + value):
					self.imgurls.append('/' + value)

	def start_a(self, attrs):
		for key, value in attrs:
			if key.lower() == 'href':
				url = urlparse(value)
				value = urllib2.unquote(url.geturl())
				# Remove unwanted crap
				value = '/' + re.sub("#.*$", '', value)
				# Check file's mimetype
				type = mimetypes.guess_type(value)[0]
				# Avoid duplicates in the list of image URLs.
				if not url.scheme and not self.imgurls.count(value) and \
				        type and re.search('image/.*', type):
					self.imgurls.append(value)


class TOCCounter(HTMLParser):
	"""Count Table of Contents levels

	`count` ends up as the largest number of 'param' entries that were on
	the tag stack at the moment an </object> tag was seen.
	"""

	count = 0

	def __init__(self):
		self.tagstack = TagStack()
		HTMLParser.__init__(self)

	def handle_starttag(self, tag, attrs):
		# Every opening tag goes onto the stack
		self.tagstack.append(tag)

	def handle_endtag(self, tag):
		# Nothing to do while no tag is open
		if not self.tagstack:
			return
		closing = tag.lower()
		if closing == 'object':
			params_open = self.tagstack.count('param')
			if params_open > self.count:
				self.count = params_open
		# </li> is ignored; any other closing tag unwinds the stack
		# (TagStack.pop raises HTMLParseError if the tag is not on it)
		if closing != 'li':
			self.tagstack.pop(tag)


# XXX: Seems to be an ugly solution...
class HeadersCounter(HTMLParser):
	"""Count headers tags

	After feeding a document, the attributes h1 .. h6 hold how many opening
	tags of each heading level were seen.
	"""

	h1 = h2 = h3 = h4 = h5 = h6 = 0

	def handle_starttag(self, tag, attrs):
		heading = tag.lower()
		# The counter attribute carries the same name as the heading tag
		if heading in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
			setattr(self, heading, getattr(self, heading) + 1)
# archmage 1:0.2.4-3 / usr / share / pyshared / archmod / CHMParser.py