/usr/share/pyshared/zope/index/text/ricecode.py is in python-zope.index 3.6.4-0ubuntu1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 | ##############################################################################
#
# Copyright (c) 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Rice coding (a variation of Golomb coding)
Based on a Java implementation by Glen McCluskey described in a Usenix
;login: article at
http://www.usenix.org/publications/login/2000-4/features/java.html
McCluskey's article explains the approach as follows. The encoding
for a value x is represented as a unary part and a binary part. The
unary part is a sequence of 1 bits followed by a 0 bit. The binary
part encodes some of the lower bits of x-1.
The encoding is parameterized by a value m that describes how many
bits to store in the binary part. If most of the values are smaller
than 2**m then they can be stored in only m+1 bits.
Compute the length of the unary part, q, where
q = math.floor((x-1)/ 2 ** m)
Emit q 1 bits followed by a 0 bit.
Emit the lower m bits of x-1, treating x-1 as a binary value.
"""
import array
class BitArray(object):
def __init__(self, buf=None):
self.bytes = array.array('B')
self.nbits = 0
self.bitsleft = 0
self.tostring = self.bytes.tostring
def __getitem__(self, i):
byte, offset = divmod(i, 8)
mask = 2 ** offset
if self.bytes[byte] & mask:
return 1
else:
return 0
def __setitem__(self, i, val):
byte, offset = divmod(i, 8)
mask = 2 ** offset
if val:
self.bytes[byte] |= mask
else:
self.bytes[byte] &= ~mask
def __len__(self):
return self.nbits
def append(self, bit):
"""Append a 1 if bit is true or 1 if it is false."""
if self.bitsleft == 0:
self.bytes.append(0)
self.bitsleft = 8
self.__setitem__(self.nbits, bit)
self.nbits += 1
self.bitsleft -= 1
def __getstate__(self):
return self.nbits, self.bitsleft, self.tostring()
def __setstate__(self, (nbits, bitsleft, s)):
self.bytes = array.array('B', s)
self.nbits = nbits
self.bitsleft = bitsleft
class RiceCode(object):
def __init__(self, m):
"""Constructor a RiceCode for m-bit values."""
if not (0 <= m <= 16):
raise ValueError("m must be between 0 and 16")
self.init(m)
self.bits = BitArray()
self.len = 0
def init(self, m):
self.m = m
self.lower = (1 << m) - 1
self.mask = 1 << (m - 1)
def append(self, val):
"""Append an item to the list."""
if val < 1:
raise ValueError("value >= 1 expected, got %s" % `val`)
val -= 1
# emit the unary part of the code
q = val >> self.m
for i in range(q):
self.bits.append(1)
self.bits.append(0)
# emit the binary part
r = val & self.lower
mask = self.mask
while mask:
self.bits.append(r & mask)
mask >>= 1
self.len += 1
def __len__(self):
return self.len
def tolist(self):
"""Return the items as a list."""
l = []
i = 0 # bit offset
binary_range = range(self.m)
for j in range(self.len):
unary = 0
while self.bits[i] == 1:
unary += 1
i += 1
assert self.bits[i] == 0
i += 1
binary = 0
for k in binary_range:
binary = (binary << 1) | self.bits[i]
i += 1
l.append((unary << self.m) + (binary + 1))
return l
def tostring(self):
"""Return a binary string containing the encoded data.
The binary string may contain some extra zeros at the end.
"""
return self.bits.tostring()
def __getstate__(self):
return self.m, self.bits
def __setstate__(self, (m, bits)):
self.init(m)
self.bits = bits
def encode(m, l):
c = RiceCode(m)
for elt in l:
c.append(elt)
assert c.tolist() == l
return c
def encode_deltas(l):
if len(l) == 1:
return l[0], []
deltas = RiceCode(6)
deltas.append(l[1] - l[0])
for i in range(2, len(l)):
deltas.append(l[i] - l[i - 1])
return l[0], deltas
def decode_deltas(start, enc_deltas):
deltas = enc_deltas.tolist()
l = [start]
for i in range(1, len(deltas)):
l.append(l[i-1] + deltas[i])
l.append(l[-1] + deltas[-1])
return l
def test():
import random
for size in [10, 20, 50, 100, 200]:
l = [random.randint(1, size) for i in range(50)]
c = encode(random.randint(1, 16), l)
assert c.tolist() == l
for size in [10, 20, 50, 100, 200]:
l = range(random.randint(1, size), size + random.randint(1, size))
t = encode_deltas(l)
l2 = decode_deltas(*t)
assert l == l2
if l != l2:
print l
print l2
def pickle_efficiency():
import pickle
import random
for m in [4, 8, 12]:
for size in [10, 20, 50, 100, 200, 500, 1000, 2000, 5000]:
for elt_range in [10, 20, 50, 100, 200, 500, 1000]:
l = [random.randint(1, elt_range) for i in range(size)]
raw = pickle.dumps(l, 1)
enc = pickle.dumps(encode(m, l), 1)
print "m=%2d size=%4d range=%4d" % (m, size, elt_range),
print "%5d %5d" % (len(raw), len(enc)),
if len(raw) > len(enc):
print "win"
else:
print "lose"
if __name__ == "__main__":
test()
|