/usr/share/pyshared/cobe/tokenizers.py is in python-cobe 2.1.0-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
# Copyright (C) 2010 Peter Teichman
import re
import Stemmer
import types
class MegaHALTokenizer:
    """A traditional MegaHAL style tokenizer.

    Any of these is considered a token:

    * one or more consecutive alpha characters (plus apostrophe)
    * one or more consecutive numeric characters
    * one or more consecutive punctuation/space characters (not apostrophe)

    Matching is performed on the upper-cased phrase using the ASCII
    classes ``A-Z`` and ``0-9``; every other character (non-ASCII letters
    included) falls into the punctuation/space group.

    This tokenizer ignores differences in capitalization.
    """

    # Compiled once for the class instead of being rebuilt/looked up on
    # every call to split().
    _token_re = re.compile("([A-Z']+|[0-9]+|[^A-Z'0-9]+)", re.UNICODE)

    def split(self, phrase):
        """Split a Unicode phrase into a list of upper-cased tokens.

        A trailing "." is appended first when the phrase does not already
        end in one of ".", "!" or "?".

        Returns an empty list for an empty phrase.
        Raises TypeError if phrase is not a Unicode string.
        """
        # type(u"") is `unicode` on Python 2 and `str` on Python 3.
        # isinstance (rather than an exact type comparison) also accepts
        # subclasses of the text type.
        if not isinstance(phrase, type(u"")):
            raise TypeError("Input must be Unicode")

        if not phrase:
            return []

        # add ending punctuation if it is missing
        if phrase[-1] not in ".!?":
            phrase = phrase + "."

        return self._token_re.findall(phrase.upper())

    def join(self, words):
        """Concatenate tokens into a reply and normalize capitalization.

        Capitalize the first alpha character in the reply and the
        first alpha character that follows one of [.?!] and a
        space. Every other alpha character is lower-cased.
        """
        chars = list(u"".join(words))
        start = True

        for i, char in enumerate(chars):
            if char.isalpha():
                chars[i] = char.upper() if start else char.lower()
                start = False
            # The `i > 2` guard is kept exactly as written: a sentence
            # break is only recognized when the whitespace sits at index
            # 3 or later, so e.g. "a. b" does not capitalize the "b".
            elif i > 2 and chars[i - 1] in ".?!" and char.isspace():
                start = True

        return u"".join(chars)
class CobeTokenizer:
    """A tokenizer that is somewhat improved from MegaHAL.

    These are considered tokens:

    * one or more consecutive Unicode word characters (plus apostrophe
      and dash)
    * one or more consecutive Unicode non-word characters, possibly with
      internal whitespace
    * the whitespace between word or non-word tokens
    * an HTTP url, [word]: followed by any run of non-space characters.

    This tokenizer collapses a whitespace token of more than one
    character into a single space character.

    It preserves differences in case. foo, Foo, and FOO are different
    tokens.
    """

    def __init__(self):
        # Add hyphen to the list of possible word characters, so hyphenated
        # words become one token (e.g. hy-phen). But don't remove it from
        # the list of non-word characters, so if it's found entirely within
        # punctuation it's a normal non-word (e.g. :-( )
        #
        # Raw strings keep the backslash escapes intact for the regex
        # engine; the non-raw spelling relies on invalid string escapes.
        self.regex = re.compile(r"(\w+:\S+"  # urls
                                r"|[\w'-]+"  # words
                                r"|[^\w\s][^\w]*[^\w\s]"  # multiple punctuation
                                r"|[^\w\s]"  # a single punctuation character
                                r"|\s+)",  # whitespace
                                re.UNICODE)

    def split(self, phrase):
        """Split a Unicode phrase into a list of tokens.

        Leading and trailing whitespace is stripped before tokenizing.

        Returns an empty list when nothing remains after stripping.
        Raises TypeError if phrase is not a Unicode string.
        """
        # type(u"") is `unicode` on Python 2 and `str` on Python 3.
        # isinstance (rather than an exact type comparison) also accepts
        # subclasses of the text type.
        if not isinstance(phrase, type(u"")):
            raise TypeError("Input must be Unicode")

        # Strip leading and trailing whitespace. This might not be the
        # correct choice long-term, but in the brain it prevents edges
        # from the root node that have has_space set.
        phrase = phrase.strip()
        if not phrase:
            return []

        tokens = self.regex.findall(phrase)

        # Collapse runs of whitespace into a single space. Checking the
        # whole token with isspace() (instead of only comparing its first
        # character to " ") also collapses runs that begin with a tab or
        # newline. Single-character whitespace tokens are left untouched.
        space = u" "
        for i, token in enumerate(tokens):
            if len(token) > 1 and token.isspace():
                tokens[i] = space

        return tokens

    def join(self, words):
        """Concatenate the tokens back into a single Unicode string."""
        return u"".join(words)
class CobeStemmer:
    """Reduce tokens to stems.

    Tokens containing at least one word character are lower-cased and
    passed to the Snowball stemmer; tokens without any word character
    are handled by stem_nonword().
    """

    # Raw-string patterns, compiled once. The non-raw spelling relies on
    # invalid string escapes such as "\w" and "\)".
    _word_re = re.compile(r"\w", re.UNICODE)
    _smile_re = re.compile(r":-?[ \)]*\)")
    _frown_re = re.compile(r":-?[' \(]*\(")

    def __init__(self, name):
        # use the PyStemmer Snowball stemmer bindings
        self.stemmer = Stemmer.Stemmer(name)

    def stem(self, token):
        """Return the stem for token.

        Tokens with no word character are delegated to stem_nonword(),
        whose result (possibly None) is returned unchanged.
        """
        if not self._word_re.search(token):
            return self.stem_nonword(token)

        # Don't preserve case when stemming, i.e. create lowercase stems.
        # This will allow us to create replies that switch the case of
        # input words, but still generate the reply in context with the
        # generated case.
        return self.stemmer.stemWord(token.lower())

    def stem_nonword(self, token):
        """Stem common smile and frown emoticons down to :) and :(.

        Returns ":)" or ":(" when token contains a matching emoticon
        (the smile pattern is tried first), otherwise None.
        """
        if self._smile_re.search(token):
            return ":)"

        if self._frown_re.search(token):
            return ":("

        # No emoticon matched: this token has no stem.
        return None
|