/usr/lib/python2.7/dist-packages/webunit/IMGSucker.py is in python-webunit 1:1.3.10-2.1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 | #
# Copyright (c) 2003 Richard Jones (http://mechanicalcat.net/richard)
# Copyright (c) 2002 ekit.com Inc (http://www.ekit-inc.com/)
# Copyright (c) 2001 Bizar Software Pty Ltd (http://www.bizarsoftware.com.au/)
#
# See the README for full license details.
#
# $Id$
import htmllib, formatter, urlparse
class IMGSucker(htmllib.HTMLParser):
'''Suck in all the images and linked stylesheets for an HTML page.
The sucker uses a HTTP session object which provides:
url - the URL of the page that we're parsing
session - a HTTP session object which provides:
fetch: a method that retrieves a file from a URL
images: a mapping that holds the fetched images
Once instantiated, the sucker is fed data through its feed method and
then it must be close()'ed.
**CURRENTLY NOT IMPLEMENTED**
Once done, the output attribute of the sucker holds the HTML with URLs
rewritten for local files where appropriate.
**CURRENTLY NOT IMPLEMENTED**
'''
def __init__(self, url, session):
htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
self.base = url
self.session = session
self.output = ""
def handle_data(self, data):
self.output = self.output + data
def unknown_starttag(self, tag, attributes):
self.output = self.output + '<%s' % tag
for name, value in attributes:
self.output = self.output + ' %s="%s"' % (name, value)
self.output = self.output + '>'
def handle_starttag(self, tag, method, attributes):
if tag == 'img' or tag == 'base' or tag == 'link':
method(attributes)
else:
self.unknown_starttag(tag, attributes)
def unknown_endtag(self, tag):
self.output = self.output + '</%s>' % tag
def handle_endtag(self, tag, method):
self.unknown_endtag(tag)
def close(self):
htmllib.HTMLParser.close(self)
def do_base(self, attributes):
for name, value in attributes:
if name == 'href':
self.base = value
# Write revised base tag to file
self.unknown_starttag('base', attributes)
def do_img(self, attributes):
newattributes = []
for name, value in attributes:
if name == 'src':
url = urlparse.urljoin(self.base, value)
# TODO: figure the re-write path
# newattributes.append((name, path))
if not self.session.images.has_key(url):
self.session.images[url] = self.session.fetch(url)
else:
newattributes.append((name, value))
# Write the img tag to file (with revised paths)
self.unknown_starttag('img', newattributes)
def do_link(self, attributes):
newattributes = [('rel', 'stylesheet'), ('type', 'text/css')]
for name, value in attributes:
if name == 'href':
url = urlparse.urljoin(self.base, value)
# TODO: figure the re-write path
# newattributes.append((name, path))
self.session.fetch(url)
else:
newattributes.append((name, value))
# Write the link tag to file (with revised paths)
self.unknown_starttag('link', newattributes)
#
# $Log$
# Revision 1.2 2003/07/22 01:19:22 richard
# patches
#
# Revision 1.1.1.1 2003/07/22 01:01:44 richard
#
#
# Revision 1.4 2002/02/27 03:00:08 rjones
# more tests, bugfixes
#
# Revision 1.3 2002/02/25 03:11:00 rjones
# *** empty log message ***
#
# Revision 1.2 2002/02/13 01:16:56 rjones
# *** empty log message ***
#
#
# vim: set filetype=python ts=4 sw=4 et si
|