/usr/lib/python3/dist-packages/html2text/utils.py is in python3-html2text 2018.1.9-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
import sys
from html2text import config
from html2text.compat import htmlentitydefs
def name2cp(k):
    """Return the code point of the character named by HTML entity ``k``.

    ``apos`` is answered directly with the code point of ``'``; any other
    name is looked up in ``htmlentitydefs.name2codepoint``.
    """
    if k != 'apos':
        return htmlentitydefs.name2codepoint[k]
    return ord("'")
# Same replacements as config.UNIFIABLE, but keyed by the entity's code
# point (via name2cp) instead of by its name.
unifiable_n = dict(
    (name2cp(entity), replacement)
    for entity, replacement in config.UNIFIABLE.items()
)
def hn(tag):
    """
    Return the heading level encoded in an HTML tag name.

    :type tag: str
    :returns: 1-9 for ``h1`` ... ``h9``; 0 for every other tag name
    :rtype: int
    """
    # The length is checked first so an empty tag name cannot raise
    # IndexError.  Testing against an explicit ASCII digit string (rather
    # than calling int() on arbitrary text) keeps 'h0' and non-ASCII digits
    # from being reported as headings.
    if len(tag) == 2 and tag[0] == 'h' and tag[1] in '123456789':
        return int(tag[1])
    # Always an int, so callers can rely on both truthiness and arithmetic.
    return 0
def dumb_property_dict(style):
    """
    Parse the body of a css declaration block (``name: value; ...``).

    Pieces without a ':' are ignored.  Names and values are stripped and
    lower-cased; when a name is repeated, the last value wins.

    :returns: A hash of css attributes
    """
    properties = {}
    for declaration in style.split(';'):
        # Split on the first ':' only, so values may contain colons
        name, colon, value = declaration.partition(':')
        if not colon:
            continue
        properties[name.strip().lower()] = value.strip().lower()
    return properties
def dumb_css_parser(data):
    """
    Parse a stylesheet into one dictionary of declarations per selector.

    :type data: str
    :returns: A hash of css selectors, each of which contains a hash of
        css attributes.
    :rtype: dict
    """
    # Drop every "@import ...;" statement.  The appended ';' makes sure an
    # @import that runs to the end of the input still has a terminator.
    data += ';'
    start = data.find('@import')
    while start != -1:
        end = data.find(';', start)
        data = data[:start] + data[end + 1:]
        start = data.find('@import')

    # Each '}'-terminated chunk that contains a '{' is "selector { body".
    # Built with a plain loop (no dict comprehension) for older pythons.
    chunks = [piece.split('{') for piece in data.split('}')
              if '{' in piece.strip()]
    parsed = {}
    try:
        for selector, body in chunks:
            parsed[selector.strip()] = dumb_property_dict(body)
    except ValueError:  # pragma: no cover
        # A chunk holding more than one '{' cannot be unpacked as a pair;
        # the whole stylesheet is given up on.
        parsed = {}  # not that important
    return parsed
def element_style(attrs, style_def, parent_style):
    """
    Compute the effective style of an element.

    Starts from a copy of ``parent_style`` (which is left untouched), then
    applies the rules of every css class listed in ``attrs['class']`` in
    the order given, and finally the element's inline ``style`` attribute.

    :type attrs: dict
    :type style_def: dict
    :type parent_style: dict
    :returns: A hash of the 'final' style attributes of the element
    :rtype: dict
    """
    style = parent_style.copy()
    # A valueless attribute (e.g. ``<p class>``) may be reported with the
    # value None; treat that like an empty attribute instead of crashing.
    class_names = attrs.get('class')
    if class_names:
        for css_class in class_names.split():
            # Rules are stored under the class selector, e.g. '.c1'
            style.update(style_def.get('.' + css_class, {}))
    inline = attrs.get('style')
    if inline:
        style.update(dumb_property_dict(inline))
    return style
def google_list_style(style):
    """
    Classify a list as ordered or unordered from its css style.

    :type style: dict
    :returns: 'ul' when ``list-style-type`` is disc, circle, square or
        none; 'ol' in every other case (including when it is not set)
    :rtype: str
    """
    unordered_markers = ('disc', 'circle', 'square', 'none')
    if style.get('list-style-type') in unordered_markers:
        return 'ul'
    return 'ol'
def google_has_height(style):
    """
    Tell whether the element's style explicitly sets 'height'.

    Only the presence of the key matters, not its value.

    :type style: dict
    :rtype: bool
    """
    return 'height' in style
def google_text_emphasis(style):
    """
    Collect the values of the emphasis-related css properties.

    The properties are inspected in a fixed order (text-decoration,
    font-style, font-weight); those that are not set are skipped.

    :type style: dict
    :returns: A list of all emphasis modifiers of the element
    :rtype: list
    """
    emphasis_properties = ('text-decoration', 'font-style', 'font-weight')
    return [style[name] for name in emphasis_properties if name in style]
def google_fixed_width_font(style):
    """
    Tell whether the element's css selects a fixed width font.

    Only a ``font-family`` of exactly 'courier new' or 'consolas' counts.

    :type style: dict
    :rtype: bool
    """
    fixed_width_families = ('courier new', 'consolas')
    return style.get('font-family', '') in fixed_width_families
def list_numbering_start(attrs):
    """
    Extract numbering from list element attributes

    :type attrs: dict
    :returns: the ``start`` attribute minus one, or 0 when ``start`` is
        absent, has no value, or is not an integer
    :rtype: int
    """
    try:
        return int(attrs['start']) - 1
    except (KeyError, TypeError, ValueError):
        # KeyError: no start attribute; TypeError: valueless attribute
        # reported as None; ValueError: text that is not an integer.
        return 0
def skipwrap(para, wrap_links):
    """
    Decide whether a paragraph must be left unwrapped.

    :type para: str
    :param wrap_links: when false, a paragraph that appears to contain a
        link is never wrapped
    :type wrap_links: bool
    :returns: True if the paragraph must not be wrapped, False if it may be
    :rtype: bool
    """
    # If the text begins with four spaces or one tab, it's a code block;
    # don't wrap.  startswith() is also safe on an empty paragraph.
    if para.startswith(('    ', '\t')):
        return True

    # If it appears to contain a link
    # don't wrap.  The flag is tested first so the regex only runs when
    # its answer can matter.
    if not wrap_links and config.RE_LINK.search(para):
        return True

    # If the text begins with only two "--", possibly preceded by
    # whitespace, that's an emdash; so wrap.
    stripped = para.lstrip()
    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
        return False

    # I'm not sure what this is for; I thought it was to detect lists,
    # but there's a <br>-inside-<span> case in one of the tests that
    # also depends upon it.
    if stripped[0:1] in ('-', '*') and stripped[0:2] != '**':
        return True

    # If the text begins with a single -, *, or +, followed by a space,
    # or an integer, followed by a ., followed by a space (in either
    # case optionally preceded by whitespace), it's a list; don't wrap.
    if config.RE_ORDERED_LIST_MATCHER.match(stripped) or \
            config.RE_UNORDERED_LIST_MATCHER.match(stripped):
        return True

    return False
def wrapwrite(text):
    """
    Write ``text`` to standard output, encoded as UTF-8.

    :type text: str
    """
    encoded = text.encode('utf-8')
    # Python3: text streams expose their underlying byte stream as .buffer
    byte_stream = getattr(sys.stdout, 'buffer', None)
    if byte_stream is not None:
        byte_stream.write(encoded)
        return
    try:
        # No .buffer: assume a stream that takes encoded bytes directly
        sys.stdout.write(encoded)
    except TypeError:
        # Text-only replacement stream (e.g. io.StringIO) rejects bytes;
        # hand it the original text instead of failing.
        sys.stdout.write(text)
def wrap_read():  # pragma: no cover
    """
    Read everything available on standard input.

    :rtype: str
    """
    stdin = sys.stdin
    try:
        content = stdin.read()
    except AttributeError:
        # Fall back to the stream's underlying buffer
        content = stdin.buffer.read()
    return content
def escape_md(text):
    """
    Backslash-escape markdown-sensitive characters that occur inside
    other markdown constructs.

    Every match of ``config.RE_MD_CHARS_MATCHER`` is replaced by a
    backslash followed by the pattern's first group.
    """
    escaped = config.RE_MD_CHARS_MATCHER.sub(r"\\\1", text)
    return escaped
def escape_md_section(text, snob=False):
    """
    Backslash-escape markdown-sensitive characters over a whole section
    of the document.

    The substitutions run in a fixed order: backslashes first, then (only
    when ``snob`` is true) the full character set, then the dot, plus and
    dash patterns, which keep their first group and escape their second.
    """
    passes = [(config.RE_MD_BACKSLASH_MATCHER, r"\\\1")]
    if snob:
        passes.append((config.RE_MD_CHARS_MATCHER_ALL, r"\\\1"))
    passes.append((config.RE_MD_DOT_MATCHER, r"\1\\\2"))
    passes.append((config.RE_MD_PLUS_MATCHER, r"\1\\\2"))
    passes.append((config.RE_MD_DASH_MATCHER, r"\1\\\2"))

    for matcher, replacement in passes:
        text = matcher.sub(replacement, text)
    return text
def reformat_table(lines, right_margin):
    """
    Given the lines of a table
    pads the cells and returns the new lines

    Cells are separated by '|'.  Each column becomes as wide as its
    longest right-stripped cell plus ``right_margin``.  A line made up
    solely of '-' and '|' is treated as a header rule and padded with '-';
    every other line is padded with spaces.

    :type lines: list
    :type right_margin: int
    :returns: one padded line per input line; ``[]`` for an empty table
    :rtype: list
    """
    # An empty table has no first row to measure
    if not lines:
        return []

    # find the maximum width of the columns; each line is split only once
    # and the cells are kept for the formatting pass below
    rows = []
    max_width = []
    for line in lines:
        cols = [x.rstrip() for x in line.split('|')]
        rows.append(cols)
        for index, cell in enumerate(cols):
            needed = len(cell) + right_margin
            if index == len(max_width):
                # don't drop any data if colspan attributes result in
                # unequal lengths: a longer row simply adds columns
                max_width.append(needed)
            elif needed > max_width[index]:
                max_width[index] = needed

    # reformat
    rule_chars = set('-|')
    new_lines = []
    for line, cols in zip(lines, rows):
        filler = '-' if set(line.strip()) == rule_chars else ' '
        # zip() stops at the row's own last cell, so short rows are not
        # extended with extra empty columns
        new_cols = [x + filler * (M - len(x))
                    for x, M in zip(cols, max_width)]
        new_lines.append('|'.join(new_cols))
    return new_lines
def pad_tables_in_text(text, right_margin=1):
    """
    Provide padding for tables in the text

    Lines containing ``config.TABLE_MARKER_FOR_PAD`` toggle "inside a
    table" on and off and are themselves removed.  The lines between an
    opening and a closing marker are passed to ``reformat_table`` and
    followed by one empty line; all other lines are copied unchanged.

    :type text: str
    :param right_margin: forwarded to ``reformat_table``
    :type right_margin: int
    :rtype: str
    """
    new_lines = []
    table_buffer = []
    table_started = False
    for line in text.split('\n'):
        # Toggle table started
        if config.TABLE_MARKER_FOR_PAD in line:
            table_started = not table_started
            if not table_started:
                # Two adjacent markers enclose no rows: nothing to pad
                if table_buffer:
                    new_lines.extend(
                        reformat_table(table_buffer, right_margin))
                table_buffer = []
                new_lines.append('')
            continue
        # Process lines
        if table_started:
            table_buffer.append(line)
        else:
            new_lines.append(line)
    # An unbalanced marker leaves rows buffered at the end of the text;
    # emit them rather than silently dropping content.
    if table_buffer:
        new_lines.extend(reformat_table(table_buffer, right_margin))
    return '\n'.join(new_lines)
|