/usr/lib/python2.7/dist-packages/docutils/utils/code_analyzer.py is in python-docutils 0.13.1+dfsg-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
#!/usr/bin/python
# coding: utf-8
"""Lexical analysis of formal languages (i.e. code) using Pygments."""
# :Author: Georg Brandl; Felix Wiemann; Günter Milde
# :Date: $Date: 2015-04-20 16:05:27 +0200 (Mo, 20. Apr 2015) $
# :Copyright: This module has been placed in the public domain.
from docutils import ApplicationError

try:
    import pygments
    from pygments.lexers import get_lexer_by_name
    from pygments.formatters.html import _get_ttype_class
    with_pygments = True
except (ImportError, SyntaxError):  # pygments 2.0.1 fails with Py 3.1 and 3.2
    with_pygments = False

# Filter the following token types from the list of class arguments:
unstyled_tokens = ['token', # Token (base token type)
                   'text',  # Token.Text
                   '']      # short name for Token and Text
# (Add, e.g., Token.Punctuation with
# ``unstyled_tokens.append('punctuation')``.)

class LexerError(ApplicationError):
    pass

class Lexer(object):
    """Parse `code` lines and yield "classified" tokens.

    Arguments

      code       -- string of source code to parse,
      language   -- formal language the code is written in,
      tokennames -- either 'long', 'short', or 'none' (see below).

    Merge subsequent tokens of the same token-type.

    Iterating over an instance yields the tokens as ``(tokentype, value)``
    tuples. The value of `tokennames` configures the naming of the tokentype:

      'long':  downcased full token type name,
      'short': short name defined by pygments.token.STANDARD_TYPES
               (= class argument used in pygments html output),
      'none':  skip lexical analysis.
    """

    def __init__(self, code, language, tokennames='short'):
        """
        Set up a lexical analyzer for `code` in `language`.
        """
        self.code = code
        self.language = language
        self.tokennames = tokennames
        self.lexer = None
        # get lexical analyzer for `language`:
        if language in ('', 'text') or tokennames == 'none':
            return
        if not with_pygments:
            raise LexerError('Cannot analyze code. '
                             'Pygments package not found.')
        try:
            self.lexer = get_lexer_by_name(self.language)
        except pygments.util.ClassNotFound:
            raise LexerError('Cannot analyze code. '
                             'No Pygments lexer found for "%s".' % language)
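        # Illustrative note (addition, not part of the upstream module):
        # ``get_lexer_by_name('python')`` returns a ``PythonLexer`` instance,
        # while an unknown name such as 'no-such-language' raises
        # ``pygments.util.ClassNotFound``, surfaced here as a LexerError.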

    # Since version 1.2. (released Jan 01, 2010) Pygments has a
    # TokenMergeFilter. However, this requires Python >= 2.4. When Docutils
    # requires the same minimal version, ``self.merge(tokens)`` in __iter__
    # can be replaced by ``self.lexer.add_filter('tokenmerge')`` in __init__.

    def merge(self, tokens):
        """Merge subsequent tokens of the same token-type.

        Also strip the final newline (added by pygments).
        """
        tokens = iter(tokens)
        (lasttype, lastval) = next(tokens)
        for ttype, value in tokens:
            if ttype is lasttype:
                lastval += value
            else:
                yield (lasttype, lastval)
                (lasttype, lastval) = (ttype, value)
        if lastval.endswith('\n'):
            lastval = lastval[:-1]
        if lastval:
            yield (lasttype, lastval)
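
    # Merging sketch (addition, not part of the upstream module): equal-typed
    # neighbours are concatenated and the final newline is dropped, e.g.
    #
    #   >>> from pygments.token import Text
    #   >>> list(Lexer('', 'text').merge([(Text, u'a'), (Text, u'b\n')]))
    #   [(Token.Text, u'ab')]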

    def __iter__(self):
        """Parse self.code and yield "classified" tokens.
        """
        if self.lexer is None:
            yield ([], self.code)
            return
        tokens = pygments.lex(self.code, self.lexer)
        for tokentype, value in self.merge(tokens):
            if self.tokennames == 'long':  # long CSS class args
                classes = str(tokentype).lower().split('.')
            else:  # short CSS class args
                classes = [_get_ttype_class(tokentype)]
            classes = [cls for cls in classes if cls not in unstyled_tokens]
            yield (classes, value)
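
    # Behaviour sketch (addition, not part of the upstream module): with
    # ``language`` '' or 'text', or with ``tokennames='none'``, no lexer is
    # set up, so iteration degrades to a single unstyled token:
    #
    #   >>> list(Lexer('any code', 'text'))
    #   [([], 'any code')]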


class NumberLines(object):
    """Insert linenumber-tokens at the start of every code line.

    Arguments

       tokens    -- iterable of ``(classes, value)`` tuples
       startline -- first line number
       endline   -- last line number

    Iterating over an instance yields the tokens with a
    ``(['ln'], '<the line number>')`` token added for every code line.
    Multi-line tokens are split."""

    def __init__(self, tokens, startline, endline):
        self.tokens = tokens
        self.startline = startline
        # pad linenumbers, e.g. endline == 100 -> fmt_str = '%3d '
        self.fmt_str = '%%%dd ' % len(str(endline))
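        # Padding sketch (addition, not part of the upstream module):
        # '%%%dd ' % 3 == '%3d ', so line 7 renders as '  7 ' -- numbers are
        # right-aligned to the width of the last line number.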

    def __iter__(self):
        lineno = self.startline
        yield (['ln'], self.fmt_str % lineno)
        for ttype, value in self.tokens:
            lines = value.split('\n')
            for line in lines[:-1]:
                yield (ttype, line + '\n')
                lineno += 1
                yield (['ln'], self.fmt_str % lineno)
            yield (ttype, lines[-1])
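

# Combined usage sketch (addition, not part of the upstream module; shown for
# Python 2, and the short class names may vary across Pygments versions):
#
#   >>> tokens = Lexer(u'1 + 2', 'python', tokennames='short')
#   >>> for classes, value in NumberLines(tokens, 1, 1):
#   ...     print classes, repr(value)
#   ['ln'] '1 '
#   ['mi'] u'1'
#   [] u' '
#   ['o'] u'+'
#   [] u' '
#   ['mi'] u'2'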