/usr/lib/python2.7/dist-packages/gumbo/html5lib_adapter.py is in python-gumbo 0.10.1+dfsg-2.1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
# Copyright 2012 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Adapter between Gumbo and html5lib.

This exports one class, HTMLParser, modeled on html5lib's parser interface. It
is constructed with an html5lib TreeBuilder instance; its parse method takes
the text (or a file-like object) to parse and gives back a DOM tree in that
builder's format, and parseFragment does the same for a fragment. Example:

  doc = HTMLParser(tree_builder).parse(text)
"""
__author__ = 'jdtang@google.com (Jonathan Tang)'
import gumboc
# Namespace URLs placed in the element tokens handed to the tree builder.
# These should match html5lib.constants.namespaces, and be indexed by the enum
# values of gumboc.Namespace: _convert_element looks entries up as
# _NAMESPACES[tag_namespace.value], so the order of this list (HTML, SVG,
# MathML) is significant.
_NAMESPACES = [
    'http://www.w3.org/1999/xhtml',
    'http://www.w3.org/2000/svg',
    'http://www.w3.org/1998/Math/MathML',
]
def _convert_doctype(treebuilder, source_node):
    """Forward the document's doctype, if it has one, to the tree builder.

    Args:
      treebuilder: Object providing insertDoctype(token).
      source_node: Document node exposing has_doctype plus the UTF-8 encoded
        name, public_identifier and system_identifier.

    Returns:
      None. When has_doctype is false nothing is inserted, mimicking html5lib:
      no doctype token means no doctype node.
    """
    if source_node.has_doctype:
        raw_fields = (
            ('name', source_node.name),
            ('publicId', source_node.public_identifier),
            ('systemId', source_node.system_identifier),
        )
        doctype_token = dict(
            (key, raw.decode('utf-8')) for key, raw in raw_fields)
        treebuilder.insertDoctype(doctype_token)
def _convert_attributes(source_node):
    """Build the attribute dict for a Gumbo element node.

    Args:
      source_node: Node whose `attributes` are iterated. Each attribute must
        provide `name` and `value` as UTF-8 byte strings, plus `namespace`.

    Returns:
      A dict mapping attribute keys to decoded values. The key is the decoded
      name for attributes whose namespace is AttributeNamespace.NONE, and a
      (prefix, name, namespace_url) tuple otherwise. The prefix is the
      lower-cased repr of the namespace, or None for an attribute named
      'xmlns'.
    """
    def attribute_key(attr):
        # Decode once and compare text with text. Comparing the raw name with
        # the str literal 'xmlns' only works where byte strings and str are
        # the same type; on Python 3 bytes never compare equal to str, so the
        # xmlns special case would silently never apply.
        name = attr.name.decode('utf-8')
        if attr.namespace != gumboc.AttributeNamespace.NONE:
            prefix = repr(attr.namespace).lower() if name != 'xmlns' else None
            return (prefix, name, attr.namespace.to_url())
        return name

    return dict((attribute_key(attr), attr.value.decode('utf-8'))
                for attr in source_node.attributes)
def _convert_element(source_node):
    """Describe an element or template node as a token dict.

    The result is what this module passes to the tree builder's insertRoot
    and insertElementNormal.

    Args:
      source_node: Gumbo node; must be of type ELEMENT or TEMPLATE.

    Returns:
      A dict with the decoded tag name under 'name', the namespace URL looked
      up in _NAMESPACES under 'namespace', and the converted attributes under
      'data'.

    Raises:
      AssertionError: If source_node is neither an ELEMENT nor a TEMPLATE.
    """
    element_types = (gumboc.NodeType.ELEMENT, gumboc.NodeType.TEMPLATE)
    if source_node.type in element_types:
        element = source_node.v.element
        token = {}
        token['name'] = element.tag_name.decode('utf-8')
        token['namespace'] = _NAMESPACES[element.tag_namespace.value]
        token['data'] = _convert_attributes(source_node)
        return token
    # Explicit raise rather than an assert statement so that the check still
    # runs with -O.
    message = '_convert_element only works with elements; found %r'
    raise AssertionError(message % source_node.type)
def _insert_root(treebuilder, source_node, pop_element = True):
    """Insert source_node as the root element, followed by its subtree.

    Args:
      treebuilder: Object providing insertRoot(token) and an openElements
        list, plus whatever _insert_node needs.
      source_node: Root element node; its `children` are inserted in order.
      pop_element: When true (the default), pop the last entry off
        treebuilder.openElements once the children are in. Presumably that
        entry is the root pushed by insertRoot -- confirm against the tree
        builder in use. parseFragment passes False here.
    """
    root_token = _convert_element(source_node)
    treebuilder.insertRoot(root_token)
    for child in source_node.children:
        _insert_node(treebuilder, child)
    if not pop_element:
        return
    treebuilder.openElements.pop()
def _insert_node(treebuilder, source_node):
    """Recursively replay a non-document Gumbo node into the tree builder.

    Comments go to insertComment, and text, whitespace and CDATA nodes go to
    insertText. Anything else is treated as an element: it is inserted with
    insertElementNormal, its children are inserted recursively, and the last
    entry is then popped off treebuilder.openElements.

    Args:
      treebuilder: Object providing insertComment, insertText,
        insertElementNormal and an openElements list.
      source_node: Gumbo node to insert; must not be a DOCUMENT node.

    Raises:
      AssertionError: If source_node is a DOCUMENT node, or (raised by
        _convert_element) if it is some other non-element node type.
    """
    node_type = source_node.type
    # If-statement instead of assert so the precondition is still enforced
    # with -O, matching _convert_element.
    if node_type == gumboc.NodeType.DOCUMENT:
        raise AssertionError(
            '_insert_node does not accept document nodes; found %r' %
            node_type)
    if node_type == gumboc.NodeType.COMMENT:
        treebuilder.insertComment(
            {'data': source_node.v.text.text.decode('utf-8')})
    elif node_type in (
            gumboc.NodeType.TEXT,
            gumboc.NodeType.WHITESPACE,
            gumboc.NodeType.CDATA):
        treebuilder.insertText(source_node.v.text.text.decode('utf-8'))
    else:
        treebuilder.insertElementNormal(_convert_element(source_node))
        for child_node in source_node.v.element.children:
            _insert_node(treebuilder, child_node)
        # Done with this element's subtree; drop the last open element.
        treebuilder.openElements.pop()
class HTMLParser(object):
    """Parser front end that replays Gumbo's parse output into a tree builder."""

    def __init__(self, tree):
        """Store the tree builder.

        Args:
          tree: Tree builder object. It receives the insert* calls made while
            converting, and is asked for the result through getDocument() or
            getFragment().
        """
        self.tree = tree

    @staticmethod
    def _read_text(text_or_file):
        """Return text_or_file.read(), or text_or_file itself on AttributeError."""
        try:
            return text_or_file.read()
        except AttributeError:
            # Assume a string.
            return text_or_file

    def parse(self, text_or_file, **kwargs):
        """Parse a whole document and return the tree builder's document.

        Args:
          text_or_file: The markup, or an object whose read() returns it.
          **kwargs: Passed through unchanged to gumboc.parse.

        Returns:
          Whatever self.tree.getDocument() returns.

        Raises:
          AssertionError: If the document has a top-level child that is
            neither a comment nor an element/template node.
        """
        text = self._read_text(text_or_file)
        with gumboc.parse(text, **kwargs) as output:
            document = output.contents.document.contents
            _convert_doctype(self.tree, document)
            for node in document.children:
                if node.type == gumboc.NodeType.COMMENT:
                    # Top-level comments are attached to the document itself.
                    self.tree.insertComment(
                        {'data': node.v.text.text.decode('utf-8')},
                        self.tree.document)
                elif node.type in (gumboc.NodeType.ELEMENT,
                                   gumboc.NodeType.TEMPLATE):
                    # The root is taken from output.contents.root rather than
                    # from `node`.
                    _insert_root(self.tree, output.contents.root.contents)
                else:
                    # Explicit raise: `assert '<message>'` asserts a non-empty
                    # string, which is always true, so it could never fire.
                    raise AssertionError(
                        'Only comments and <html> nodes allowed at the root')
            return self.tree.getDocument()

    def parseFragment(self, text_or_file, container, **kwargs):
        """Parse a fragment in the context of `container`.

        Args:
          text_or_file: The markup, or an object whose read() returns it.
          container: Context tag name. If it contains a space it is split
            into '<namespace> <tag>'; otherwise the namespace is "html".
          **kwargs: Passed through unchanged to gumboc.parse.

        Returns:
          Whatever self.tree.getFragment() returns.

        Raises:
          ValueError: If container contains more than one space.
          AttributeError: If gumboc.Namespace has no member named after the
            upper-cased namespace.
          AssertionError: If the parsed document has a top-level child that
            is not an element/template node.
        """
        text = self._read_text(text_or_file)
        if ' ' in container:
            container_ns, container = container.split(' ')
        else:
            container_ns = "html"

        with gumboc.parse(
                text,
                fragment_context=gumboc.Tag.from_str(container),
                fragment_namespace=getattr(gumboc.Namespace,
                                           container_ns.upper()),
                **kwargs) as output:
            for node in output.contents.document.contents.children:
                if node.type in (gumboc.NodeType.ELEMENT,
                                 gumboc.NodeType.TEMPLATE):
                    # pop_element=False: the root is left on
                    # self.tree.openElements for the fragment case.
                    _insert_root(
                        self.tree, output.contents.root.contents, False)
                else:
                    # Explicit raise: asserting a string literal never fails.
                    raise AssertionError('Malformed fragment parse (??)')
            return self.tree.getFragment()
|