/usr/lib/python2.7/dist-packages/cvs2svn_rcsparse/texttools.py is in cvs2svn 2.4.0-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
# -*-python-*-
#
# Copyright (C) 1999-2008 The ViewCVS Group. All Rights Reserved.
#
# By using this file, you agree to the terms and conditions set forth in
# the LICENSE.html file which can be found at the top level of the ViewVC
# distribution or at http://viewvc.org/license-1.html.
#
# For more information, visit http://viewvc.org/
#
# -----------------------------------------------------------------------
import string
# note: this will raise an ImportError if it isn't available. the rcsparse
# package will recognize this and switch over to the default parser.
from mx import TextTools
import common
# for convenience
_tt = TextTools

# Characters that may appear in an RCS identifier/number token: all
# printable ASCII plus Latin-1 high characters, minus the RCS special
# delimiters removed just below.
_idchar_list = map(chr, range(33, 127)) + map(chr, range(160, 256))
_idchar_list.remove('$')
_idchar_list.remove(',')
#_idchar_list.remove('.') # leave as part of 'num' symbol
_idchar_list.remove(':')
_idchar_list.remove(';')
_idchar_list.remove('@')
_idchar = string.join(_idchar_list, '')

# mx.TextTools character sets used by the tag tables below
_idchar_set = _tt.set(_idchar)
_onechar_token_set = _tt.set(':;')

_not_at_set = _tt.invset('@')

# tag-object codes appended to the taglist while parsing a chunk
_T_TOKEN = 30
_T_STRING_START = 40
_T_STRING_SPAN = 60
_T_STRING_END = 70

# codes recording HOW the parse of a chunk ended (always the last taglist item)
_E_COMPLETE = 100 # ended on a complete token
_E_TOKEN = 110 # ended mid-token
_E_STRING_SPAN = 130 # ended within a string
_E_STRING_END = 140 # ended with string-end ('@') (could be mid-@@)

_SUCCESS = +100

# sentinel return values of _mxTokenStream._parse_more()
_EOF = 'EOF'
_CONTINUE = 'CONTINUE'

_UNUSED = 'UNUSED'

# continuation of a token over a chunk boundary
_c_token_table = (
  (_T_TOKEN, _tt.AllInSet, _idchar_set),
  )
class _mxTokenStream:
  """Token stream over an RCS file, built on mx.TextTools tag tables.

  The file is read in large chunks; each chunk is run through a tag
  table that appends complete tokens to self.tokens.  A token or
  @-string that is split across a chunk boundary is carried over in
  self.partial and completed when the next chunk arrives.

  self.tokens holds parsed tokens in *reverse* order so that get() can
  pop() cheaply from the end of the list.
  """

  # the algorithm is about the same speed for any CHUNK_SIZE chosen.
  # grab a good-sized chunk, but not too large to overwhelm memory.
  # note: we use a multiple of a standard block size
  CHUNK_SIZE = 192 * 512  # about 100k

#  CHUNK_SIZE = 5  # for debugging, make the function grind...

  def __init__(self, file):
    self.rcsfile = file
    self.tokens = [ ]       # parsed tokens, most-recent first (reversed)
    self.partial = None     # (token-type, [text chunks]) spanning a boundary
    self.string_end = None  # scratch slot set by the _set_end callback

  def _parse_chunk(self, buf, start=0):
    """Parse one chunk of the RCS file, prepending tokens to self.tokens.

    BUF is the chunk text; START is the offset at which to begin.  Any
    token or @-string left incomplete at the end of BUF is recorded in
    self.partial for _handle_partial() to finish on the next chunk.

    Raises common.RCSIllegalCharacter on a syntax error.
    """
    buflen = len(buf)

    assert start < buflen

    # construct a tag table which refers to the buffer we need to parse.
    table = (
      #1: ignore whitespace. with or without whitespace, move to the next rule.
      (None, _tt.AllInSet, _tt.whitespace_set, +1),

      #2
      (_E_COMPLETE, _tt.EOF + _tt.AppendTagobj, _tt.Here, +1, _SUCCESS),

      #3: accumulate token text and exit, or move to the next rule.
      (_UNUSED, _tt.AllInSet + _tt.AppendMatch, _idchar_set, +2),

      #4
      (_E_TOKEN, _tt.EOF + _tt.AppendTagobj, _tt.Here, -3, _SUCCESS),

      #5: single character tokens exit immediately, or move to the next rule
      (_UNUSED, _tt.IsInSet + _tt.AppendMatch, _onechar_token_set, +2),

      #6
      (_E_COMPLETE, _tt.EOF + _tt.AppendTagobj, _tt.Here, -5, _SUCCESS),

      #7: if this isn't an '@' symbol, then we have a syntax error (go to a
      # negative index to indicate that condition). otherwise, suck it up
      # and move to the next rule.
      (_T_STRING_START, _tt.Is + _tt.AppendTagobj, '@'),

      #8
      (None, _tt.Is, '@', +4, +1),

      #9
      (buf, _tt.Is, '@', +1, -1),

      #10
      (_T_STRING_END, _tt.Skip + _tt.AppendTagobj, 0, 0, +1),

      #11
      (_E_STRING_END, _tt.EOF + _tt.AppendTagobj, _tt.Here, -10, _SUCCESS),

      #12
      (_E_STRING_SPAN, _tt.EOF + _tt.AppendTagobj, _tt.Here, +1, _SUCCESS),

      #13: suck up everything that isn't an AT. go to next rule to look for EOF
      (buf, _tt.AllInSet, _not_at_set, 0, +1),

      #14: go back to look for double AT if we aren't at the end of the string
      (_E_STRING_SPAN, _tt.EOF + _tt.AppendTagobj, _tt.Here, -6, _SUCCESS),
      )

    # Fast, texttools may be, but it's somewhat lacking in clarity.
    # Here's an attempt to document the logic encoded in the table above:
    #
    # Flowchart:
    #                                    _____
    #                                   /     \
    # 1 -> 2 -> 3 -> 5 -> 7 -> 8 -> 9 -> 10 -> 11
    # |         \/   \/   \/  /\  \/
    # \         4    6    12  14  /
    #  \_______/____/    \    /  /
    #                     \  13 /
    #                      \__/
    #
    #  #1: Skip over any whitespace.
    #  #2: If now EOF, exit with code _E_COMPLETE.
    #  #3: If we have a series of characters in _idchar_set, then:
    #  #4:   If now EOF, exit with code _E_TOKEN, else back to #1.
    #  #5: If we have a character in _onechar_token_set, then:
    #  #6:   If now EOF, exit with code _E_COMPLETE, else back to #1.
    #  #7: If we do not have an '@', then error.
    #      If we do, then log a _T_STRING_START and continue.
    #  #8: If we have another '@', continue on to #9.  Otherwise:
    # #12:   If now EOF, exit with code _E_STRING_SPAN.
    # #13:   Record the slice up to the next '@' (or EOF).
    # #14:   If now EOF, exit with code _E_STRING_SPAN.
    #        Otherwise, go back to #8.
    #  #9: If we have another '@', then we've just seen an escaped
    #      (by doubling) '@' within an @-string.  Record a slice including
    #      just one '@' character, and jump back to #8.
    #      Otherwise, we've *either* seen the terminating '@' of an @-string,
    #      *or* we've seen one half of an escaped @@ sequence that just
    #      happened to be split over a chunk boundary - in either case,
    #      we continue on to #10.
    # #10: Log a _T_STRING_END.
    # #11: If now EOF, exit with _E_STRING_END.  Otherwise, go back to #1.

    success, taglist, idx = _tt.tag(buf, table, start)

    if not success:
      ### need a better way to report this error
      raise common.RCSIllegalCharacter()
    assert idx == buflen

    # pop off the last item: the _E_* code describing how the parse ended
    last_which = taglist.pop()

    # Collapse each _T_STRING_START .. _T_STRING_END run into one joined
    # string.  An unterminated run at the tail becomes a partial span.
    i = 0
    tlen = len(taglist)
    while i < tlen:
      if taglist[i] == _T_STRING_START:
        j = i + 1
        while j < tlen:
          if taglist[j] == _T_STRING_END:
            s = _tt.join(taglist, '', i+1, j)
            del taglist[i:j]
            tlen = len(taglist)
            taglist[i] = s
            break
          j = j + 1
        else:
          # no _T_STRING_END before the end of the chunk: the string
          # continues into the next chunk.
          assert last_which == _E_STRING_SPAN
          s = _tt.join(taglist, '', i+1)
          del taglist[i:]
          self.partial = (_T_STRING_SPAN, [ s ])
          break
      i = i + 1

    # figure out whether we have a partial last-token
    if last_which == _E_TOKEN:
      self.partial = (_T_TOKEN, [ taglist.pop() ])
    elif last_which == _E_COMPLETE:
      pass
    elif last_which == _E_STRING_SPAN:
      assert self.partial
    else:
      assert last_which == _E_STRING_END
      self.partial = (_T_STRING_END, [ taglist.pop() ])

    # newest tokens must pop off the end, so store them reversed, with
    # any previously-queued tokens behind them.
    taglist.reverse()
    taglist.extend(self.tokens)
    self.tokens = taglist

  def _set_end(self, taglist, text, l, r, subtags):
    # mx.TextTools CallTag callback: remember where the string ended.
    self.string_end = l

  def _handle_partial(self, buf):
    """Finish the partial token/string carried over from the prior chunk.

    Returns the index into BUF where normal chunk parsing should resume
    (possibly len(buf) if the whole buffer belonged to the carry-over).
    """
    which, chunks = self.partial

    if which == _T_TOKEN:
      success, taglist, idx = _tt.tag(buf, _c_token_table)
      if not success:
        # The start of this buffer was not a token. So the end of the
        # prior buffer was a complete token.
        self.tokens.insert(0, string.join(chunks, ''))
      else:
        assert len(taglist) == 1 and taglist[0][0] == _T_TOKEN \
               and taglist[0][1] == 0 and taglist[0][2] == idx
        if idx == len(buf):
          #
          # The whole buffer was one huge token, so we may have a
          # partial token again.
          #
          # Note: this modifies the list of chunks in self.partial
          #
          chunks.append(buf)

          # consumed the whole buffer
          return len(buf)

        # got the rest of the token.
        chunks.append(buf[:idx])
        self.tokens.insert(0, string.join(chunks, ''))

      # no more partial token
      self.partial = None

      return idx

    if which == _T_STRING_END:
      # prior chunk ended on an '@' inside a string; if the next char is
      # not another '@', that '@' really terminated the string.
      if buf[0] != '@':
        self.tokens.insert(0, string.join(chunks, ''))
        return 0
      # it was an escaped @@ split over the boundary: keep one '@' and
      # continue scanning the string body.
      chunks.append('@')
      start = 1
    else:
      start = 0

    self.string_end = None
    # this table mirrors rules #8..#14 of the table in _parse_chunk,
    # appending string content directly into CHUNKS.
    string_table = (
      (None, _tt.Is, '@', +3, +1),
      (_UNUSED, _tt.Is + _tt.AppendMatch, '@', +1, -1),
      (self._set_end, _tt.Skip + _tt.CallTag, 0, 0, _SUCCESS),

      (None, _tt.EOF, _tt.Here, +1, _SUCCESS),

      # suck up everything that isn't an AT. move to next rule to look
      # for EOF
      (_UNUSED, _tt.AllInSet + _tt.AppendMatch, _not_at_set, 0, +1),

      # go back to look for double AT if we aren't at the end of the string
      (None, _tt.EOF, _tt.Here, -5, _SUCCESS),
      )

    success, unused, idx = _tt.tag(buf, string_table,
                                   start, len(buf), chunks)

    # must have matched at least one item
    assert success

    if self.string_end is None:
      # the string still isn't finished at the end of this buffer
      assert idx == len(buf)
      self.partial = (_T_STRING_SPAN, chunks)
    elif self.string_end < len(buf):
      # the terminating '@' fell inside this buffer: string complete
      self.partial = None
      self.tokens.insert(0, string.join(chunks, ''))
    else:
      # buffer ended exactly on an '@' -- may be the terminator or half
      # of an escaped @@; decide when the next chunk arrives.
      self.partial = (_T_STRING_END, chunks)

    return idx

  def _parse_more(self):
    """Read and parse one more chunk.  Returns _EOF or _CONTINUE."""
    buf = self.rcsfile.read(self.CHUNK_SIZE)
    if not buf:
      return _EOF

    if self.partial:
      idx = self._handle_partial(buf)
      if idx is None:
        # defensive: _handle_partial currently always returns an int
        return _CONTINUE
      if idx < len(buf):
        self._parse_chunk(buf, idx)
    else:
      self._parse_chunk(buf)

    return _CONTINUE

  def get(self):
    """Return the next token, or None at end of file."""
    try:
      return self.tokens.pop()
    except IndexError:
      pass

    while not self.tokens:
      action = self._parse_more()
      if action == _EOF:
        return None

    return self.tokens.pop()


  # debugging wrapper for get(); normally disabled.
  # (restored to comments: these lines must not execute at class scope)
#  _get = get
#  def get(self):
#    token = self._get()
#    print 'T:', `token`
#    return token

  def match(self, match):
    """Read the next token and verify it equals MATCH, else raise."""
    if self.tokens:
      token = self.tokens.pop()
    else:
      token = self.get()

    if token != match:
      raise common.RCSExpected(token, match)

  def unget(self, token):
    """Push TOKEN back so the next get() returns it."""
    self.tokens.append(token)

  def mget(self, count):
    "Return multiple tokens. 'next' is at the end."
    while len(self.tokens) < count:
      action = self._parse_more()
      if action == _EOF:
        ### fix this
        raise RuntimeError('EOF hit while expecting tokens')
    result = self.tokens[-count:]
    del self.tokens[-count:]
    return result
class Parser(common._Parser):
  """RCS parser wired to the mx.TextTools-based token stream."""
  stream_class = _mxTokenStream
|