/usr/share/pyshared/Codeville/lcsmatch.py is in codeville 0.8.0-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 | # Written by Bram Cohen and Uoti Urpala
from bisect import bisect
def unique_lcs(a, b):
# set index[line in a] = position of line in a unless
# unless a is a duplicate, in which case it's set to None
index = {}
for i in xrange(len(a)):
line = a[i]
if line in index:
index[line] = None
else:
index[line]= i
# make btoa[i] = position of line i in a, unless
# that line doesn't occur exactly once in both,
# in which case it's set to None
btoa = [None] * len(b)
index2 = {}
for pos, line in enumerate(b):
next = index.get(line)
if next is not None:
if line in index2:
# unset the previous mapping, which we now know to
# be invalid because the line isn't unique
btoa[index2[line]] = None
del index[line]
else:
index2[line] = pos
btoa[pos] = next
# this is the Patience sorting algorithm
# see http://en.wikipedia.org/wiki/Patience_sorting
backpointers = [None] * len(b)
stacks = []
lasts = []
k = 0
for bpos, apos in enumerate(btoa):
if apos is None:
continue
# skip over solitary matches with no surrounding matching context
if (bpos == 0 or apos == 0 or a[apos-1] != b[bpos-1]) and \
(bpos == len(b)-1 or apos == len(a)-1 or a[apos+1] != b[bpos+1]):
continue
# as an optimization, check if the next line comes right after
# the previous line, because usually it does
if stacks and stacks[k] < apos and (k == len(stacks) - 1 or stacks[k+1] > apos):
k += 1
else:
k = bisect(stacks, apos)
if k > 0:
backpointers[bpos] = lasts[k-1]
if k < len(stacks):
stacks[k] = apos
lasts[k] = bpos
else:
stacks.append(apos)
lasts.append(bpos)
if len(lasts) == 0:
return []
result = []
k = lasts[-1]
while k is not None:
result.append((btoa[k], k))
k = backpointers[k]
result.reverse()
return result
def test_lcs():
assert unique_lcs('', '') == []
assert unique_lcs('a', 'a') == []
assert unique_lcs('a', 'b') == []
assert unique_lcs('ab', 'ab') == [(0, 0), (1, 1)]
assert unique_lcs('abcde', 'cdeab') == [(2, 0), (3, 1), (4, 2)]
assert unique_lcs('cdeab', 'abcde') == [(0, 2), (1, 3), (2, 4)]
assert unique_lcs('abXde', 'abYde') == [(0, 0), (1, 1), (3, 3), (4, 4)]
return
def recurse_matches(a, b, ahi, bhi, answer, maxrecursion, is_junk):
# this will never happen normally, this check is to prevent DOS attacks
if maxrecursion < 0:
return
oldlength = len(answer)
# check for new matches after the last confirmed match ends
if oldlength == 0:
alo = 0
blo = 0
else:
lasta, lastb, lastlen = answer[-1]
alo = lasta + lastlen
blo = lastb + lastlen
if alo == ahi or blo == bhi:
return
last_enda = alo
last_endb = blo
last_checkb = blo
# extend individual line matches into match sections
for apos, bpos in unique_lcs(a[alo:ahi], b[blo:bhi]):
apos += alo
bpos += blo
# don't overlap with an existing match or check something which
# already got thrown out as junk
if bpos < last_checkb or apos < last_enda:
continue
# extend line match as far in either direction as possible
enda = apos + 1
endb = bpos + 1
while enda < ahi and endb < bhi and a[enda] == b[endb]:
enda += 1
endb += 1
while apos > last_enda and bpos > last_endb and a[apos-1] == b[bpos-1]:
apos -= 1
bpos -= 1
# mark what's been checked now, so even if it's junked it doesn't
# have to be checked again
last_checkb = endb
# if this section is junk, skip it
numreal = 0
for k in xrange(apos, enda):
if not is_junk(a[k]):
numreal += 1
if numreal >= 2:
break
else:
# Niklaus Wirth can bite me
continue
last_enda = enda
last_endb = endb
# find matches which come before the new match section
# this can happen because there may be lines which weren't unique
# in the whole file but are unique in the subsection
recurse_matches(a, b, apos, bpos, answer, maxrecursion - 1, is_junk)
answer.append((apos, bpos, enda - apos))
if len(answer) > oldlength:
# find matches between the last match and the end
recurse_matches(a, b, ahi, bhi, answer, maxrecursion - 1, is_junk)
# else: fall back to difflib (possibly a good idea, possibly not)
def default_is_junk(x):
return len(x.strip()) <= 2
def find_matches(a, b, is_junk = default_is_junk):
# single-line identical files match, despite being too short for
# a real match if they were part of a larger file
if a == b:
return [(0, 0, len(a))]
answer = []
recurse_matches(a, b, len(a), len(b), answer, 10, is_junk)
return answer
try:
import psyco
psyco.bind(unique_lcs, 0)
except ImportError:
pass
# Stuff below here is for testing
def x(a, b):
t1 = time.time()
r = unique_lcs(a, b)
t2 = time.time()
#print r
for i, j in r:
assert a[i] == b[j]
#print a[i]
print
print 'time:', t2-t1
def return_false(x):
return False
def x2(a, b, is_junk = return_false):
t1 = time.time()
r = find_matches(a, b, is_junk)
t2 = time.time()
#print r
for i, j, l in r:
assert a[i:i+l] == b[j:j+l]
#print ''.join(a[i:i+l]),
print
print 'time:', t2-t1
def completely_random_test(x):
a = [str(i) for i in range(100000)]
b = list(a)
random.shuffle(b)
#print ' '.join(b)
#print
x(a, b)
def random_with_ranges_test(x):
d = [[str(i), str(i+1)] for i in xrange(0, 100000, 2)]
c = list(d)
random.shuffle(c)
a = []
for i in d:
a.extend(i)
b = []
for i in c:
b.extend(i)
#print ' '.join(b)
#print
x(a, b)
def is_A_test(s):
return s == 'A'
def test_lcsmatch():
global random, time
import random
import time
cur_time = time.time()
random.seed(cur_time)
print 'Seeded tests with %s' % (cur_time,)
completely_random_test(x)
random_with_ranges_test(x)
x2('0123456789abc', ' 01 89a 34 67')
x2('AB 1 CD 1 AB P XY Q AB 1 CD 1 AB', 'AB 2 CD 2 AB Q XY P AB 2 CD 2 AB')
x2('AjBjC', 'AjC')
x2('AjC', 'AjBjC')
x2('AjBjC', 'AjC', is_A_test)
x2('AjC', 'AjBjC', is_A_test)
x2('x', 'x')
x2('01 2', '01')
x2('ABPXYQAB', 'ABQXYPAB')
return
|