/usr/share/pyshared/archmod/CHM.py

# -*- coding: utf-8 -*-
#
# Copyright (c) 2003 Eugeny Korekin <aaaz@users.sourceforge.net>
# Copyright (c) 2005-2009 Basil Shubin <bashu@users.sourceforge.net>

import os
import sys
import re
import shutil
import errno
import string
import tempfile

import archmod

from archmod.CHMParser import SitemapFile, PageLister, ImageCatcher, TOCCounter, HeadersCounter
from archmod.Cached import Cached

# import PyCHM bindings
try:
	from chm import chmlib
except ImportError, msg:
	sys.exit('ImportError: %s\nPlease check README file for system requirements.' % msg)

# External file converters
from archmod.chmtotext import chmtotext
from archmod.htmldoc import htmldoc


class CHMDir(Cached):
	"""Class that represent CHM content from directory"""

	def __init__(self, name):
		# Name of source directory with CHM content
		self.sourcename = name
		# Import variables from config file into namespace
		execfile(archmod.config, self.__dict__)
		
		# build regexp from the list of auxiliary files
		self.aux_re = '|'.join([ re.escape(s) for s in self.auxes ])

		# Get and parse 'Table of Contents'
		topicstree = self.get_entry(self.topics)
		self.contents = SitemapFile(topicstree).parse()

	def _getitem(self, name):
		# Get all entries
		if name == 'entries':
			entries = []
			for fname in archmod.listdir(self.sourcename):
				name = '/' + fname
				if os.path.isdir(self.sourcename + name):
					name += '/'
				entries.append(name)
			return entries
		# retrieves the list of HTML files contained into the CHM file, **in order** (that's the important bit).
		# (actually performed by the PageLister class)
		if name == 'html_files':
			topicstree = self.get_entry(self.topics)
			lister = PageLister()
			lister.feed(topicstree)
			return lister.pages
		# retrieves the list of images urls contained into the CHM file.
		# (actually performed by the ImageCatcher class)
		if name == 'image_urls':
			image_urls = []
			image_catcher = ImageCatcher()
			for file in self.html_files:
				image_catcher.feed(CHMEntry(self, file).correct())
				for image_url in image_catcher.imgurls:
					if not image_urls.count(image_url):
						image_urls.append(image_url)
			return image_urls
		# retrieves a dictionary of actual file entries and corresponding urls into the CHM file 
		if name == 'image_files':
			image_files = {}
			for image_url in self.image_urls:
				for entry in self.entries:
					if re.search(image_url, entry.lower()) and not image_files.has_key(entry.lower()):
						image_files.update({entry : image_url})
			return image_files
		# Get topics file
		if name == 'topics':
			for e in self.entries:
				if e.lower().endswith('.hhc'):
					return e
		if name == 'deftopic':
			# use first page as deftopic. Note: without heading slash
			if self.html_files[0].startswith('/'):
				return self.html_files[0].replace('/', '', 1).lower()
			return self.html_files[0].lower()
		# Get index file
		if name == 'index':
			for e in self.entries:
				if e.lower().endswith('.hhk'):
					return e
		# Get frontpage name
		if name == 'frontpage':
			frontpage = os.path.join('/', 'index.html')
			index = 2 # index2.html and etc.
			for filename in self.entries:
				if frontpage == filename:
					frontpage = os.path.join('/', ('index%s.html' % index))
					index += 1
			return frontpage
		# Get all templates files
		if name == 'templates':
			templates = []
			for file in os.listdir(self.templates_dir):
				if os.path.isfile(os.path.join(self.templates_dir, file)):
					if os.path.join('/', file) not in self.entries:
						templates.append(os.path.join('/', file))
			return templates
		# Get ToC levels
		if name == 'toclevels':
			topicstree = self.get_entry(self.topics)
			counter = TOCCounter()
			counter.feed(topicstree)
			if counter.count > self.maxtoclvl:
				return self.maxtoclvl
			else:
				return counter.count
		# HTMLDOC doesn't working with missing <H1>...</H1> tag, 
		# so we need to fix it (for first page only)
		# XXX: Seems to be an ugly solution...
		if name == 'html_header_tags':
			html_header_tags = {'h1': 0, 'h2' : 0, 'h3' : 0, 'h4' : 0, 'h5' : 0, 'h6' :0}
			for html_file in self.html_files:
				counter = HeadersCounter()
				counter.feed(CHMEntry(self, html_file).read())
				tmp_dict = {'h1': html_header_tags['h1'] + counter.h1,
						    'h2': html_header_tags['h2'] + counter.h2,
						    'h3': html_header_tags['h3'] + counter.h3,
						    'h4': html_header_tags['h4'] + counter.h4,
						    'h5': html_header_tags['h5'] + counter.h5,
						    'h6': html_header_tags['h6'] + counter.h6}
				html_header_tags.update(tmp_dict)
			return html_header_tags
		# Number of missing H[1-6] tags
		# XXX: Find a better solution!
		if name == 'html_header_tags_missing':
			if self.html_header_tags['h6'] == 0:
				missing = 6
			if self.html_header_tags['h5'] == 0:
				missing = 5
			if self.html_header_tags['h4'] == 0:
				missing = 4
			if self.html_header_tags['h3'] == 0:
				missing = 3
			if self.html_header_tags['h2'] == 0:
				missing = 2
			if self.html_header_tags['h1'] == 0:
				missing = 1
			else:
				missing = 0
			return missing
		raise AttributeError(name)

	def get_entry(self, name):
		"""Get CHM entry by name"""
		# show index page or any other substitute
		if name == '/':
			name = self.frontpage
		if name in self.templates or name == self.frontpage:
			return self.get_template(name)
		if name.lower() in [ os.path.join('/icons', icon.lower()) for icon in os.listdir(self.icons_dir) ]:
			return open(os.path.join(self.icons_dir, os.path.basename(name))).read()
		for e in self.entries:
			if e.lower() == name.lower():
				return CHMEntry(self, e, frontpage=self.frontpage).get()
		else:
			archmod.message(archmod.ERROR, 'NameError: There is no %s' % name)

	def sub_mytag(self, re):
		"""Replacing tagname with attribute"""
		try:
			res = eval('self.' + re.group(1))
		except:
			res = eval(re.group(1))
		return res

	def get_template(self, name):
		"""Get template file by it's name"""
		if name == self.frontpage:
			tpl = open(os.path.join(self.templates_dir, os.path.basename('index.html'))).read()
		else:
			tpl = open(os.path.join(self.templates_dir, os.path.basename(name))).read()
		return re.sub('\<%(.+?)%\>', self.sub_mytag, tpl)

	def process_templates(self, destdir="."):
		"""Process templates"""
		for template in self.templates:
			open(os.path.join(destdir, os.path.basename(template)), 'w').write(self.get_template(template))
		if self.frontpage not in self.templates:
			open(os.path.join(destdir, os.path.basename(self.frontpage)), 'w').write(self.get_template('index.html'))
		if not os.path.exists(os.path.join(destdir, 'icons/')):
			shutil.copytree(os.path.join(self.icons_dir), os.path.join(destdir, 'icons/'))

	def extract_entry(self, entry, output_file, destdir=".", correct=False):
		# process output entry, remove first '/' in entry name
		fname = string.lower(output_file).replace('/', '', 1)
		# get directory name for file fname if any
		dname = os.path.dirname(os.path.join(destdir, fname))
		# if dname is a directory and it's not exist, than create it
		if dname and not os.path.exists(dname):
			os.makedirs(dname)
		# otherwise write a file from CHM entry
		if not os.path.isdir(os.path.join(destdir, fname)):
			# filename encoding conversion
			if self.fs_encoding:
				fname = fname.decode('utf-8').encode(self.fs_encoding)
			# write CHM entry content into the file, corrected or as is
			if correct:
				open(os.path.join(destdir, fname), 'w').writelines(CHMEntry(self, entry).correct())
			else:
				open(os.path.join(destdir, fname), 'w').writelines(CHMEntry(self, entry).get())
				
	def extract_entries(self, entries=[], destdir=".", correct=False):
		"""Extract raw CHM entries into the files"""
		for e in entries:
			# if entry is auxiliary file, than skip it
			if re.match(self.aux_re, e):
				continue
			self.extract_entry(e, output_file=e, destdir=destdir, correct=correct)

	def extract(self, destdir):
		"""Extract CHM file content into FS"""
		try:
			# Create destination directory
			os.mkdir(destdir)
			# make raw content extraction
			self.extract_entries(entries=self.entries, destdir=destdir)
			# process templates
			self.process_templates(destdir=destdir)
		except OSError, error:
			if error[0] == errno.EEXIST:
				sys.exit('%s is already exists' % destdir)

	def dump_html(self, output=sys.stdout):
		"""Dump HTML data from CHM file into standard output"""
		for e in self.html_files:
			# if entry is auxiliary file, than skip it
			if re.match(self.aux_re, e):
				continue
			print >> output, CHMEntry(self, e).get()

	def chm2text(self, output=sys.stdout):
		"""Convert CHM into Single Text file"""
		for e in self.html_files:
			# if entry is auxiliary file, than skip it
			if re.match(self.aux_re, e):
				continue
			# to use this function you should have 'lynx' or 'elinks' installed
			chmtotext(input=CHMEntry(self, e).get(), cmd=self.chmtotext, output=output)

	def htmldoc(self, output, format=archmod.CHM2HTML):
		"""CHM to other file formats converter using htmldoc"""
		# Extract CHM content into temporary directory
		output = output.replace(' ', '_')
		tempdir = tempfile.mkdtemp(prefix=output.rsplit('.', 1)[0])
		self.extract_entries(entries=self.html_files, destdir=tempdir, correct=True)
		# List of temporary files
		files = [ os.path.abspath(tempdir + file.lower()) for file in self.html_files ]
		if format == archmod.CHM2HTML:
			options = self.chmtohtml
			# change output from single html file to a directory with html file and images
			if self.image_files:
				dirname = archmod.file2dir(output)
				if os.path.exists(dirname):
					sys.exit('%s is already exists' % dirname)
				# Extract image files
				os.mkdir(dirname)
				# Extract all images
				for key, value in self.image_files.items():
					self.extract_entry(entry=key, output_file=value, destdir=dirname)
				# Fix output file name
				output = os.path.join(dirname, output)
		elif format == archmod.CHM2PDF:
			options = self.chmtopdf
			if self.image_files:
				# Extract all images
				for key, value in self.image_files.items():
					self.extract_entry(entry=key, output_file=key.lower(), destdir=tempdir)
		htmldoc(files, self.htmldoc_exec, options, self.toclevels, output)
		# Remove temporary files
		shutil.rmtree(path=tempdir)	


class CHMFile(CHMDir):
	"""CHM file class derived from CHMDir"""

	def _getitem(self, name):
		# Overriding CHMDir.entries attribute
		if name == 'entries':
			entries = []
			# get CHM file content and process it
			for name in self._get_names(self._handler):
				if (name == '/'):
					continue
				entries.append(name)
			return entries
		if name == '_handler':
			return chmlib.chm_open(self.sourcename)
		return super(CHMFile, self)._getitem(name)
	
	def __delattr__(self, name):
		# Closes CHM file handler on class destroying
		if name == '_handler':
			chmlib.chm_close(self._handler)
		return super(CHMFile, self).__delattr__(name)

	def _get_names(self, chmfile):
		"""Get object's names inside CHM file"""
		def get_name(chmfile, ui, content):
			content.append(ui.path)
			return chmlib.CHM_ENUMERATOR_CONTINUE

		chmdir = []
		if (chmlib.chm_enumerate(chmfile, chmlib.CHM_ENUMERATE_ALL, get_name, chmdir)) == 0:
			sys.exit('UnknownError: CHMLIB or PyCHM bug?')
		return chmdir


class CHMEntry(object):
	"""Class for CHM file entry"""

	def __init__(self, parent, name, frontpage='index.html'):
		# parent CHM file
		self.parent = parent
		# object inside CHM file
		self.name = name
		# frontpage name to substitute
		self.frontpage = os.path.basename(frontpage)

	def read(self):
		"""Read CHM entry content"""
		# Check where parent instance is CHMFile or CHMDir
		if isinstance(self.parent, CHMFile):
			result, ui = chmlib.chm_resolve_object(self.parent._handler, self.name)
			if (result != chmlib.CHM_RESOLVE_SUCCESS):
				return None

			size, content = chmlib.chm_retrieve_object(self.parent._handler, ui, 0l, ui.length)
			if (size == 0):
				return None
			return content
		else:
			return open(self.parent.sourcename + self.name).read()

	def lower_links(self, text):
		"""Links to lower case"""
		return re.sub('(?i)(href|src)\s*=\s*([^\s|>]+)', lambda m:m.group(0).lower(), text)

	def add_restoreframing_js(self, name, text):
		name = re.sub('/+', '/', name)
		depth = name.count('/')

		js = """<body><script language="javascript">
		if ((window.name != "content") && (navigator.userAgent.indexOf("Opera") <= -1) )
		document.write("<center><a href='%s%s?page=%s'>show framing</a></center>")
		</script>""" % ( '../' * depth, self.frontpage, name )
		
		return re.sub('(?i)<\s*body\s*>', js, text)

	def correct(self):
		"""Get correct CHM entry content"""
		data = self.read()
		# If entry is a html page?
		if re.search('(?i)\.html?$', self.name) and data is not None:
			# lower-casing links if needed
			if self.parent.filename_case:
				data = self.lower_links(data)

			# Delete unwanted HTML elements.
			data = re.sub('<div .*teamlib\.gif.*\/div>', '', data)
			data = re.sub('<a href.*>\[ Team LiB \]<\/a>', '', data)
			data = re.sub('<table.*larrow\.gif.*rarrow\.gif.*<\/table>', '', data)
			data = re.sub('<a href.*next\.gif[^>]*><\/a>', '' ,data)
			data = re.sub('<a href.*previous\.gif[^>]*><\/a>', '', data)
			data = re.sub('<a href.*prev\.gif[^>]*><\/a>', '', data)
			data = re.sub('"[^"]*previous\.gif"', '""', data)
			data = re.sub('"[^"]*prev\.gif"', '""', data)
			data = re.sub('"[^"]*next\.gif"', '""', data)
			# HTMLDOC doesn't working with missing <H1>...</H1> tag, 
			# so we need to fix it 
			# TODO: Seems to be an ugly solution...
			if not self.parent.html_header_tags['h1']:
				for header in xrange(self.parent.html_header_tags_missing + 1, 7):
					data =  re.sub(r'<[hH]%s' % str(header), r'<h%s' % str(header - self.parent.html_header_tags_missing), data)
					data = re.sub(r'</[hH]%s>' % str(header), r'</h%s>' % str(header - self.parent.html_header_tags_missing), data)
		if data is not None:
			return data
		else:
			return ''

	def get(self):
		"""Get CHM entry content"""
		# read entry content
		data = self.read()
		# If entry is a html page?
		if re.search('(?i)\.html?$', self.name) and data is not None:
			# lower-casing links if needed
			if self.parent.filename_case:
				data = self.lower_links(data)
			# restore framing if that option is set in config file
			if self.parent.restore_framing:
				data = self.add_restoreframing_js(self.name[1:], data)
		if data is not None:
			return data
		else:
			return ''
archmage 1:0.2.4-3 / usr / share / pyshared / archmod / CHM.py