/usr/share/pyshared/xlwt/UnicodeUtils.py is in python-xlwt 0.7.5+debian1-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 | # -*- coding: windows-1252 -*-
'''
From BIFF8 on, strings are always stored using UTF-16LE text encoding. The
character array is a sequence of 16-bit values4. Additionally it is
possible to use a compressed format, which omits the high bytes of all
characters, if they are all zero.
The following tables describe the standard format of the entire string, but
in many records the strings differ from this format. This will be mentioned
separately. It is possible (but not required) to store Rich-Text formatting
information and Asian phonetic information inside a Unicode string. This
results in four different ways to store a string. The character array
is not zero-terminated.
The string consists of the character count (as usual an 8-bit value or
a 16-bit value), option flags, the character array and optional formatting
information. If the string is empty, sometimes the option flags field will
not occur. This is mentioned at the respective place.
Offset Size Contents
0 1 or 2 Length of the string (character count, ln)
1 or 2 1 Option flags:
Bit Mask Contents
0 01H Character compression (ccompr):
0 = Compressed (8-bit characters)
1 = Uncompressed (16-bit characters)
2 04H Asian phonetic settings (phonetic):
0 = Does not contain Asian phonetic settings
1 = Contains Asian phonetic settings
3 08H Rich-Text settings (richtext):
0 = Does not contain Rich-Text settings
1 = Contains Rich-Text settings
[2 or 3] 2 (optional, only if richtext=1) Number of Rich-Text formatting runs (rt)
[var.] 4 (optional, only if phonetic=1) Size of Asian phonetic settings block (in bytes, sz)
var. ln or
2·ln Character array (8-bit characters or 16-bit characters, dependent on ccompr)
[var.] 4·rt (optional, only if richtext=1) List of rt formatting runs
[var.] sz (optional, only if phonetic=1) Asian Phonetic Settings Block
'''
from struct import pack
def upack2(s, encoding='ascii'):
# If not unicode, make it so.
if isinstance(s, unicode):
us = s
else:
us = unicode(s, encoding)
# Limit is based on number of content characters
# (not on number of bytes in packed result)
len_us = len(us)
if len_us > 32767:
raise Exception('String longer than 32767 characters')
try:
encs = us.encode('latin1')
# Success here means all chars are in U+0000 to U+00FF
# inclusive, meaning that we can use "compressed format".
flag = 0
n_items = len_us
except UnicodeEncodeError:
encs = us.encode('utf_16_le')
flag = 1
n_items = len(encs) // 2
# n_items is the number of "double byte characters" i.e. MS C wchars
# Can't use len(us).
# len(u"\U0001D400") -> 1 on a wide-unicode build
# and 2 on a narrow-unicode build.
# We need n_items == 2 in this case.
return pack('<HB', n_items, flag) + encs
def upack2rt(rt, encoding='ascii'):
us = u''
fr = ''
offset = 0
# convert rt strings to unicode if not already unicode
# also generate the formatting run for the styles added
for s, fontx in rt:
if not isinstance(s, unicode):
s = unicode(s, encoding)
us += s
if fontx is not None:
# code in Rows.py ensures that
# fontx can be None only for the first piece
fr += pack('<HH', offset, fontx)
# offset is the number of MS C wchar characters.
# That is 1 if c <= u'\uFFFF' else 2
offset += len(s.encode('utf_16_le')) // 2
num_fr = len(fr) // 4 # ensure result is int
if offset > 32767:
raise Exception('String longer than 32767 characters')
try:
encs = us.encode('latin1')
# Success here means all chars are in U+0000 to U+00FF
# inclusive, meaning that we can use "compressed format".
flag = 0 | 8
n_items = len(encs)
except UnicodeEncodeError:
encs = us.encode('utf_16_le')
flag = 1 | 8
n_items = len(encs) // 2 # see comments in upack2 function above
return pack('<HBH', n_items, flag, num_fr) + encs, fr
def upack1(s, encoding='ascii'):
# Same as upack2(), but with a one-byte length field.
if isinstance(s, unicode):
us = s
else:
us = unicode(s, encoding)
len_us = len(us)
if len_us > 255:
raise Exception('String longer than 255 characters')
try:
encs = us.encode('latin1')
flag = 0
n_items = len_us
except UnicodeEncodeError:
encs = us.encode('utf_16_le')
flag = 1
n_items = len(encs) // 2
return pack('<BB', n_items, flag) + encs
|