/usr/share/khmerconverter/modules/legacyConverter.py is in khmerconverter 1.4-1.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
#!/usr/bin/python
# -*- coding: utf8 -*-
# Khmer Unicode to Khmer Legacy fonts Conversion
# (c) 2006 The WordForge Foundation, all rights reserved.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public License
# as published by the Free Software Foundation; either version 2.1
# of the License, or (at your option) any later version.
#
# See the LICENSE file for more details.
#
# Developed by:
# Hok Kakada (hokkakada@khmeros.info)
# Keo Sophon (keosophon@khmeros.info)
# San Titvirak (titvirak@khmeros.info)
# Seth Chanratha (sethchanratha@khmeros.info)
#
# This program converts a reordered unicode string (based on legacy style) to a legacy font string
import unittest
import sys
#convert from unicode to legacy
def converter(sin, data):
    '''Convert a reordered unicode string to a legacy font string.

    sin  -- reordered unicode string based on legacy style
    data -- the font data for the conversion, a two item sequence:
            data[0] is a sequence of dictionaries (entries may be None);
                    the dictionary at index j is looked up with the next
                    j+1 characters of sin
            data[1] is a list of replacement strings indexed by
                    (code point - 0x1780)

    At every position the dictionaries are tried from the highest index
    down, so the longest matching key wins.  A character that matches no
    dictionary is replaced through data[1] when its offset is inside the
    list, kept when its code point is below 0x7f, and dropped otherwise.

    Returns the legacy string; unknown unicode code points are dropped.
    '''
    dicts = data[0]         # dictionaries not in unicode range
    replaceData = data[1]   # list with character replacement values
    listLength = len(replaceData)
    pieces = []             # output fragments, joined once at the end
    i = 0
    end = len(sin)
    while i < end:
        for j in range(len(dicts) - 1, -1, -1):
            table = dicts[j]
            if table is None:
                continue
            try:
                piece = table[sin[i:i + j + 1]]
            except KeyError:
                continue
            pieces.append(piece)
            i += j + 1
            break
        else:
            # no dictionary matched at this position: handle one character
            c = sin[i]
            n = ord(c) - 0x1780
            if 0 <= n < listLength:
                pieces.append(replaceData[n])
            elif ord(c) < 0x7f:
                # keep ascii characters; str() gives a native string on both
                # Python 2 and Python 3, whereas encode() would return bytes
                # on Python 3 and could not be joined with the other pieces
                pieces.append(str(c))
            i += 1
    return ''.join(pieces)
class TestConvert(unittest.TestCase):
    '''Exercise converter() against a small hand-made conversion table.'''

    # Extra code point that appears inside several multi-character keys below.
    MARK = unichr(0x17EA)

    # first dictionary of the table (index 0)
    condenseData1 = {
        unichr(0x200b): chr(0x20),  # ZWSP
        unichr(0x200c): "",  # ZWNJ
        unichr(0x200d): "",  # ZWJ
    }
    # second dictionary of the table (index 1)
    condenseData2 = {
        u'បា': 'BAA',
        u'្ក': 'Cok',
        u'្ស': 'Cos',
        MARK + u'ី': 'I',
    }
    # third dictionary of the table (index 2)
    condenseData3 = {
        MARK + MARK + u'៉': chr(0xFA),  # Musekatoan (U long)
        MARK + MARK + u'៊': chr(0xFA),  # Trisap (U long)
    }
    # sixth dictionary of the table (index 5)
    condenseData6 = {
        u'ខ្ញ' + MARK + u'ុំ': chr(0xB4),  # ខ្ញុំ one code point in limon
    }
    replaceData = ['k', 'x', 'K', 'X']
    # indexes 3 and 4 are intentionally empty (None)
    dicts = [condenseData1, condenseData2, condenseData3, None, None, condenseData6]
    data = [dicts, replaceData]

    def setUp(self):
        # nothing to prepare: every fixture is a class attribute
        pass

    def testConversion(self):
        mark = self.MARK
        table = self.data
        expectations = [
            (unichr(0x200b), chr(0x20)),  # in dict1
            (unichr(0x200c), ""),
            (u'បា', 'BAA'),  # in dict2
            (u'្ក', 'Cok'),
            (u'្ស' + mark + mark + u'៊' + mark + u'ី', 'Cos' + chr(0xFA) + 'I'),  # in dict3
            (u'ខ្ញ' + mark + u'ុំ', chr(0xB4)),  # in dict6
            (u'ក', 'k'),  # in list
            (u'ខ', 'x'),
            (u'ឃ', 'X'),
        ]
        for source, expected in expectations:
            self.assertEqual(converter(source, table), expected)

    def testNoConversion(self):
        table = self.data
        # keep characters we do not know
        kept = [
            (u'?', '?'),  # neither in dict nor in list
            (u'\n', '\n'),
            (u'', ''),
        ]
        for source, expected in kept:
            self.assertEqual(converter(source, table), expected)
        # remove unknown unicode character
        for source in (unichr(255), unichr(0x1980)):
            self.assertEqual(converter(source, table), '')

    def testConvertLongFirst(self):
        # convert longer match first
        # 123: A,1234: Z
        # 1234 => Z... not A4
        lookup = ({"0": "X"}, {"09": "M"}, {"123": "A"}, {"1234": "Z"})  # dictionary not in unicode range
        replacements = []  # list with character replacement values
        self.assertEqual(converter("1234", (lookup, replacements)), "Z")
# Run the unit tests of this module when it is executed as a script.
if __name__ == '__main__':
    unittest.main()
|