This file is indexed.

/usr/share/pyshared/reportlab/lib/textsplit.py is in python-reportlab 2.5-1.1build1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

#Copyright ReportLab Europe Ltd. 2000-2006
#see license.txt for license details
#history http://www.reportlab.co.uk/cgi-bin/viewcvs.cgi/public/reportlab/trunk/reportlab/lib/textsplit.py

"""Helpers for text wrapping, hyphenation, Asian text splitting and kinsoku shori.

How to split a 'big word' depends on the language and the writing system.  This module
works on a Unicode string.  It ought to grow by allowing more algorithms to be plugged
in, based on knowledge of the language and the desired 'niceness' of the algorithm.

"""

__version__=''' $Id: textsplit.py 3662 2010-02-09 11:23:58Z rgbecker $ '''

from types import StringType, UnicodeType
import unicodedata
from reportlab.pdfbase.pdfmetrics import stringWidth
from reportlab.rl_config import _FUZZ

CANNOT_START_LINE = [
    #strongly prohibited e.g. end brackets, stop, exclamation...
    u'!\',.:;?!")]\u3001\u3002\u300d\u300f\u3011\u3015\uff3d\u3011\uff09',
    #middle priority e.g. continuation small vowels - wrapped on two lines but one string...
    u'\u3005\u2015\u3041\u3043\u3045\u3047\u3049\u3063\u3083\u3085\u3087\u308e\u30a1\u30a3'
    u'\u30a5\u30a7\u30a9\u30c3\u30e3\u30e5\u30e7\u30ee\u30fc\u30f5\u30f6',
    #weakly prohibited - continuations, celsius symbol etc.
    u'\u309b\u309c\u30fb\u30fd\u30fe\u309d\u309e\u2015\u2010\xb0\u2032\u2033\u2103\uffe0\uff05\u2030'
    ]

ALL_CANNOT_START = u''.join(CANNOT_START_LINE)
CANNOT_END_LINE = [
    #strongly prohibited
    u'\u2018\u201c\uff08[{\uff08\u3014\uff3b\uff5b\u3008\u300a\u300c\u300e\u3010',
    #weaker - currency symbols, hash, postcode - prefixes
    u'$\u00a3@#\uffe5\uff04\uffe1\uff20\u3012\u00a7'
    ]
ALL_CANNOT_END = u''.join(CANNOT_END_LINE)
def getCharWidths(word, fontName, fontSize):
    """Returns a list of glyph widths.  Should be easy to optimize in _rl_accel

    >>> getCharWidths('Hello', 'Courier', 10)
    [6.0, 6.0, 6.0, 6.0, 6.0]
    >>> from reportlab.pdfbase.cidfonts import UnicodeCIDFont
    >>> from reportlab.pdfbase.pdfmetrics import registerFont
    >>> registerFont(UnicodeCIDFont('HeiseiMin-W3'))
    >>> getCharWidths(u'\u6771\u4EAC', 'HeiseiMin-W3', 10)   #most kanji are full width (1000 units, one em)
    [10.0, 10.0]
    """
    #character-level function call; the performance is going to SUCK

    return [stringWidth(uChar, fontName, fontSize) for uChar in word]
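
#The docstring above hints that this is the first thing to optimise.  As an
#illustration only (this is NOT the _rl_accel approach; _getCharWidthsCached
#and _charWidthCache are hypothetical names), per-character widths can be
#memoised so repeated characters pay for stringWidth only once:
_charWidthCache = {}
def _getCharWidthsCached(word, fontName, fontSize):
    cache = _charWidthCache.setdefault((fontName, fontSize), {})
    widths = []
    for uChar in word:
        w = cache.get(uChar)
        if w is None:
            w = cache[uChar] = stringWidth(uChar, fontName, fontSize)
        widths.append(w)
    return widths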

def wordSplit(word, availWidth, fontName, fontSize, encoding='utf8'):
    """Attempts to break a word which lacks spaces into two parts, the first of which
    fits in the remaining space.  It is allowed to add hyphens or whatever it wishes.

    This is intended as a wrapper for some language- and user-choice-specific splitting
    algorithms.  It should only be called after line breaking on spaces, which covers western
    languages and is highly optimised already.  It works on the 'last unsplit word'.

    Presumably with further study one could write a Unicode splitting algorithm for text
    fragments which was much faster.

    Courier characters are 6 points wide at a font size of 10.
    >>> wordSplit('HelloWorld', 30, 'Courier', 10)
    [[0.0, 'Hello'], [0.0, 'World']]
    >>> wordSplit('HelloWorld', 31, 'Courier', 10)
    [[1.0, 'Hello'], [1.0, 'World']]
    """
    if type(word) is not UnicodeType:
        uword = word.decode(encoding)
    else:
        uword = word

    charWidths = getCharWidths(uword, fontName, fontSize)
    lines = dumbSplit(uword, charWidths, availWidth)

    if type(word) is not UnicodeType:
        lines2 = []
        #convert back
        for (extraSpace, text) in lines:
            lines2.append([extraSpace, text.encode(encoding)])
        lines = lines2

    return lines

def dumbSplit(word, widths, availWidth):
    """This function attempts to fit as many characters as possible into the available
    space, cutting "like a knife" between characters.  This would do for Chinese.
    It returns a list of [extraSpace, text] items, where text is a Unicode string
    and extraSpace is the points of unused space available on the line.  This is a
    structure which is fairly easy to display, and supports 'backtracking' approaches
    after the fact.

    Test cases assume each character is ten points wide...

    >>> dumbSplit(u'Hello', [10]*5, 60)
    [[10.0, u'Hello']]
    >>> dumbSplit(u'Hello', [10]*5, 50)
    [[0.0, u'Hello']]
    >>> dumbSplit(u'Hello', [10]*5, 40)
    [[0.0, u'Hell'], [30, u'o']]
    """
    _more = """
    #>>> dumbSplit(u'Hello', [10]*5, 4)   # less than one character
    #(u'', u'Hello')
    # this says 'Nihongo wa muzukashii desu ne!' (Japanese is difficult isn't it?) in 12 characters
    >>> jtext = u'\u65e5\u672c\u8a9e\u306f\u96e3\u3057\u3044\u3067\u3059\u306d\uff01'
    >>> dumbSplit(jtext, [10]*11, 30)   #
    (u'\u65e5\u672c\u8a9e', u'\u306f\u96e3\u3057\u3044\u3067\u3059\u306d\uff01')
    """
    assert type(word) is UnicodeType
    lines = []
    widthUsed = 0.0
    lineStartPos = 0
    for (i, w) in enumerate(widths):
        widthUsed += w
        if widthUsed > availWidth + _FUZZ:
            #used more than can fit...
            #ping out with previous cut, then set up next line with one character

            extraSpace = availWidth - widthUsed + w
            #print 'ending a line; used %d, available %d' % (widthUsed, availWidth)
            selected = word[lineStartPos:i]

            #This is the most important of the Japanese typography rules.
            #if next character cannot start a line, wrap it up to this line so it hangs
            #in the right margin. We won't do two or more though - that's unlikely and
            #would result in growing ugliness.
            nextChar = word[i]
            if nextChar in ALL_CANNOT_START:
                #it's punctuation or a closing bracket of some kind.  'wrap up'
                #so it stays on the line above, slightly exceeding our target width.
                #print 'wrapping up', repr(nextChar)
                selected += nextChar
                extraSpace -= w
                lineStartPos = i + 1    #next line starts after the wrapped-up character
                widthUsed = 0.0         #nothing is consumed on the new line yet
            else:
                lineStartPos = i        #character i becomes the first on the new line
                widthUsed = w
            lines.append([extraSpace, selected])
    #any characters left?
    if widthUsed > 0:
        extraSpace = availWidth - widthUsed
        lines.append([extraSpace, word[lineStartPos:]])

    return lines
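
def _dumbSplitWrapUpDemo():
    """Illustration only: the 'wrap up' rule above in action, on hypothetical
    sample text.  The ideographic comma u'\u3001' may not start a line, so it
    is pulled onto the first line, which then overruns the available width
    (its extraSpace goes negative).
    """
    jtext = u'\u3042\u3044\u3001\u3046'     #hiragana a, i, ideographic comma, u
    for extraSpace, text in dumbSplit(jtext, [10]*4, 20):
        print extraSpace, repr(text)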

def kinsokuShoriSplit(word, widths, availWidth):
    #NOT USED OR FINISHED YET!
    """Split according to Japanese rules according to CJKV (Lunde).

    Essentially look for "nice splits" so that we don't end a line
    with an open bracket, or start one with a full stop, or stuff like
    that.  There is no attempt to try to split compound words into
    constituent kanji.  It currently uses wrap-down: it packs as much
    on a line as possible, then backtracks if needed.

    This returns a number of words each of which should just about fit
    on a line.  If you give it a whole paragraph at once, it will
    do all the splits.

    It's possible we might slightly step over the width limit
    if we do hanging punctuation marks in future (e.g. dangle a Japanese
    full stop in the right margin rather than using a whole character
    box).

    """
    lines = []
    assert len(word) == len(widths)
    curWidth = 0.0
    curLine = []
    i = 0   #character index - we backtrack at times so cannot use for loop
    while 1:
        ch = word[i]
        w = widths[i]
        if curWidth + w < availWidth:
            curLine.append(ch)
            curWidth += w
        else:
            #end of line.  check legality
            if ch in CANNOT_END_LINE[0]:
                pass
    #to be completed
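
#A minimal, self-contained sketch of the wrap-down idea described in the
#docstring above, under the simplifying assumptions that backtracking by a
#single character is always enough and that no space is redistributed.
#This is an illustration, not the finished reportlab algorithm; the name
#_kinsokuWrapDownSketch is hypothetical.  It returns plain text lines
#rather than [extraSpace, text] pairs.
def _kinsokuWrapDownSketch(word, widths, availWidth):
    lines = []
    lineStart = 0
    widthUsed = 0.0
    i = 0
    while i < len(word):
        if widthUsed + widths[i] > availWidth and i > lineStart:
            #candidate break before character i; back off one character if
            #breaking here would start a line with a prohibited character
            #or end one with a prohibited character, provided that leaves
            #the current line non-empty.
            if (word[i] in ALL_CANNOT_START or word[i-1] in ALL_CANNOT_END) \
               and i - 1 > lineStart:
                i -= 1
            lines.append(word[lineStart:i])
            lineStart = i
            widthUsed = 0.0
        else:
            #a single character wider than availWidth still gets placed,
            #guaranteeing the loop always makes progress.
            widthUsed += widths[i]
            i += 1
    if lineStart < len(word):
        lines.append(word[lineStart:])
    return lines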

# The recipe below is taken from:
#
#  http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061
import re
rx=re.compile(u"([\u2e80-\uffff])", re.UNICODE)
def cjkwrap(text, width, encoding="utf8"):
     return reduce(lambda line, word, width=width: '%s%s%s' %
                (line,
                 [' ','\n', ''][(len(line)-line.rfind('\n')-1
                       + len(word.split('\n',1)[0] ) >= width) or
                      line[-1:] == '\0' and 2],
                 word),
                rx.sub(r'\1\0 ', unicode(text,encoding)).split(' ')
            ).replace('\0', '').encode(encoding)
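
#Example use of the recipe above (the sample text is hypothetical): rx tags
#every CJK character (U+2E80 upwards) with a \0 marker, so lines may break
#between any two ideographs as well as at spaces; the markers are stripped
#from the result afterwards.
#
#   wrapped = cjkwrap('some utf8-encoded CJK text', 30)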

if __name__=='__main__':
    import doctest
    import textsplit
    doctest.testmod(textsplit)