/usr/lib/python2.7/dist-packages/nltk/collocations.py is in python-nltk 3.2.1-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 | # Natural Language Toolkit: Collocations and Association Measures
#
# Copyright (C) 2001-2016 NLTK Project
# Author: Joel Nothman <jnothman@student.usyd.edu.au>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT
#
"""
Tools to identify collocations --- words that often appear consecutively
--- within corpora. They may also be used to find other associations between
word occurrences.
See Manning and Schutze ch. 5 at http://nlp.stanford.edu/fsnlp/promo/colloc.pdf
and the Text::NSP Perl package at http://ngram.sourceforge.net
Finding collocations requires first calculating the frequencies of words and
their appearance in the context of other words. Often the collection of words
will then requiring filtering to only retain useful content terms. Each ngram
of words may then be scored according to some association measure, in order
to determine the relative likelihood of each ngram being a collocation.
The ``BigramCollocationFinder`` and ``TrigramCollocationFinder`` classes provide
these functionalities, dependent on being provided a function which scores a
ngram given appropriate frequency counts. A number of standard association
measures are provided in bigram_measures and trigram_measures.
"""
from __future__ import print_function
# Possible TODOs:
# - consider the distinction between f(x,_) and f(x) and whether our
# approximation is good enough for fragmented data, and mention it
# - add a n-gram collocation finder with measures which only utilise n-gram
# and unigram counts (raw_freq, pmi, student_t)
import itertools as _itertools
from nltk.compat import iteritems
from nltk.probability import FreqDist
from nltk.util import ngrams
from nltk.metrics import ContingencyMeasures, BigramAssocMeasures, TrigramAssocMeasures
from nltk.metrics.spearman import ranks_from_scores, spearman_correlation
class AbstractCollocationFinder(object):
"""
An abstract base class for collocation finders whose purpose is to
collect collocation candidate frequencies, filter and rank them.
As a minimum, collocation finders require the frequencies of each
word in a corpus, and the joint frequency of word tuples. This data
should be provided through nltk.probability.FreqDist objects or an
identical interface.
"""
def __init__(self, word_fd, ngram_fd):
self.word_fd = word_fd
self.N = word_fd.N()
self.ngram_fd = ngram_fd
@classmethod
def _build_new_documents(cls, documents, window_size, pad_left=False, pad_right=False, pad_symbol=None):
'''
Pad the document with the place holder according to the window_size
'''
padding = (pad_symbol,) * (window_size - 1)
if pad_right:
return _itertools.chain.from_iterable(_itertools.chain(doc, padding) for doc in documents)
if pad_left:
return _itertools.chain.from_iterable(_itertools.chain(padding, doc) for doc in documents)
@classmethod
def from_documents(cls, documents):
"""Constructs a collocation finder given a collection of documents,
each of which is a list (or iterable) of tokens.
"""
#return cls.from_words(_itertools.chain(*documents))
return cls.from_words(cls._build_new_documents(documents, cls.default_ws, pad_right=True))
@staticmethod
def _ngram_freqdist(words, n):
return FreqDist(tuple(words[i:i + n]) for i in range(len(words) - 1))
def _apply_filter(self, fn=lambda ngram, freq: False):
"""Generic filter removes ngrams from the frequency distribution
if the function returns True when passed an ngram tuple.
"""
tmp_ngram = FreqDist()
for ngram, freq in iteritems(self.ngram_fd):
if not fn(ngram, freq):
tmp_ngram[ngram] = freq
self.ngram_fd = tmp_ngram
def apply_freq_filter(self, min_freq):
"""Removes candidate ngrams which have frequency less than min_freq."""
self._apply_filter(lambda ng, freq: freq < min_freq)
def apply_ngram_filter(self, fn):
"""Removes candidate ngrams (w1, w2, ...) where fn(w1, w2, ...)
evaluates to True.
"""
self._apply_filter(lambda ng, f: fn(*ng))
def apply_word_filter(self, fn):
"""Removes candidate ngrams (w1, w2, ...) where any of (fn(w1), fn(w2),
...) evaluates to True.
"""
self._apply_filter(lambda ng, f: any(fn(w) for w in ng))
def _score_ngrams(self, score_fn):
"""Generates of (ngram, score) pairs as determined by the scoring
function provided.
"""
for tup in self.ngram_fd:
score = self.score_ngram(score_fn, *tup)
if score is not None:
yield tup, score
def score_ngrams(self, score_fn):
"""Returns a sequence of (ngram, score) pairs ordered from highest to
lowest score, as determined by the scoring function provided.
"""
return sorted(self._score_ngrams(score_fn), key=lambda t: (-t[1], t[0]))
def nbest(self, score_fn, n):
"""Returns the top n ngrams when scored by the given function."""
return [p for p, s in self.score_ngrams(score_fn)[:n]]
def above_score(self, score_fn, min_score):
"""Returns a sequence of ngrams, ordered by decreasing score, whose
scores each exceed the given minimum score.
"""
for ngram, score in self.score_ngrams(score_fn):
if score > min_score:
yield ngram
else:
break
class BigramCollocationFinder(AbstractCollocationFinder):
"""A tool for the finding and ranking of bigram collocations or other
association measures. It is often useful to use from_words() rather than
constructing an instance directly.
"""
default_ws = 2
def __init__(self, word_fd, bigram_fd, window_size=2):
"""Construct a BigramCollocationFinder, given FreqDists for
appearances of words and (possibly non-contiguous) bigrams.
"""
AbstractCollocationFinder.__init__(self, word_fd, bigram_fd)
self.window_size = window_size
@classmethod
def from_words(cls, words, window_size=2):
"""Construct a BigramCollocationFinder for all bigrams in the given
sequence. When window_size > 2, count non-contiguous bigrams, in the
style of Church and Hanks's (1990) association ratio.
"""
wfd = FreqDist()
bfd = FreqDist()
if window_size < 2:
raise ValueError("Specify window_size at least 2")
for window in ngrams(words, window_size, pad_right=True):
w1 = window[0]
if w1 is None:
continue
wfd[w1] += 1
for w2 in window[1:]:
if w2 is not None:
bfd[(w1, w2)] += 1
return cls(wfd, bfd, window_size=window_size)
def score_ngram(self, score_fn, w1, w2):
"""Returns the score for a given bigram using the given scoring
function. Following Church and Hanks (1990), counts are scaled by
a factor of 1/(window_size - 1).
"""
n_all = self.N
n_ii = self.ngram_fd[(w1, w2)] / (self.window_size - 1.0)
if not n_ii:
return
n_ix = self.word_fd[w1]
n_xi = self.word_fd[w2]
return score_fn(n_ii, (n_ix, n_xi), n_all)
class TrigramCollocationFinder(AbstractCollocationFinder):
"""A tool for the finding and ranking of trigram collocations or other
association measures. It is often useful to use from_words() rather than
constructing an instance directly.
"""
default_ws = 3
def __init__(self, word_fd, bigram_fd, wildcard_fd, trigram_fd):
"""Construct a TrigramCollocationFinder, given FreqDists for
appearances of words, bigrams, two words with any word between them,
and trigrams.
"""
AbstractCollocationFinder.__init__(self, word_fd, trigram_fd)
self.wildcard_fd = wildcard_fd
self.bigram_fd = bigram_fd
@classmethod
def from_words(cls, words, window_size=3):
"""Construct a TrigramCollocationFinder for all trigrams in the given
sequence.
"""
if window_size < 3:
raise ValueError("Specify window_size at least 3")
wfd = FreqDist()
wildfd = FreqDist()
bfd = FreqDist()
tfd = FreqDist()
for window in ngrams(words, window_size, pad_right=True):
w1 = window[0]
if w1 is None:
continue
for w2, w3 in _itertools.combinations(window[1:], 2):
wfd[w1] += 1
if w2 is None:
continue
bfd[(w1, w2)] += 1
if w3 is None:
continue
wildfd[(w1, w3)] += 1
tfd[(w1, w2, w3)] += 1
return cls(wfd, bfd, wildfd, tfd)
def bigram_finder(self):
"""Constructs a bigram collocation finder with the bigram and unigram
data from this finder. Note that this does not include any filtering
applied to this finder.
"""
return BigramCollocationFinder(self.word_fd, self.bigram_fd)
def score_ngram(self, score_fn, w1, w2, w3):
"""Returns the score for a given trigram using the given scoring
function.
"""
n_all = self.N
n_iii = self.ngram_fd[(w1, w2, w3)]
if not n_iii:
return
n_iix = self.bigram_fd[(w1, w2)]
n_ixi = self.wildcard_fd[(w1, w3)]
n_xii = self.bigram_fd[(w2, w3)]
n_ixx = self.word_fd[w1]
n_xix = self.word_fd[w2]
n_xxi = self.word_fd[w3]
return score_fn(n_iii,
(n_iix, n_ixi, n_xii),
(n_ixx, n_xix, n_xxi),
n_all)
class QuadgramCollocationFinder(AbstractCollocationFinder):
"""A tool for the finding and ranking of quadgram collocations or other association measures.
It is often useful to use from_words() rather than constructing an instance directly.
"""
default_ws = 4
def __init__(self, word_fd, quadgram_fd, ii, iii, ixi, ixxi, iixi, ixii):
"""Construct a QuadgramCollocationFinder, given FreqDists for appearances of words,
bigrams, trigrams, two words with one word and two words between them, three words
with a word between them in both variations.
"""
AbstractCollocationFinder.__init__(self, word_fd, quadgram_fd)
self.iii = iii
self.ii = ii
self.ixi = ixi
self.ixxi = ixxi
self.iixi = iixi
self.ixii = ixii
@classmethod
def from_words(cls, words, window_size=4):
if window_size < 4:
raise ValueError("Specify window_size at least 4")
ixxx = FreqDist()
iiii = FreqDist()
ii = FreqDist()
iii = FreqDist()
ixi = FreqDist()
ixxi = FreqDist()
iixi = FreqDist()
ixii = FreqDist()
for window in ngrams(words, window_size, pad_right=True):
w1 = window[0]
if w1 is None:
continue
for w2, w3, w4 in _itertools.combinations(window[1:], 3):
ixxx[w1] += 1
if w2 is None:
continue
ii[(w1, w2)] += 1
if w3 is None:
continue
iii[(w1, w2, w3)] += 1
ixi[(w1, w3)] += 1
if w4 is None:
continue
iiii[(w1, w2, w3, w4)] += 1
ixxi[(w1, w4)] += 1
ixii[(w1, w3, w4)] += 1
iixi[(w1, w2, w4)] += 1
return cls(ixxx, iiii, ii, iii, ixi, ixxi, iixi, ixii)
def score_ngram(self, score_fn, w1, w2, w3, w4):
n_all = self.N
n_iiii = self.ngram_fd[(w1, w2, w3, w4)]
if not n_iiii:
return
n_iiix = self.iii[(w1, w2, w3)]
n_xiii = self.iii[(w2, w3, w4)]
n_iixi = self.iixi[(w1, w2, w4)]
n_ixii = self.ixii[(w1, w3, w4)]
n_iixx = self.ii[(w1, w2)]
n_xxii = self.ii[(w3, w4)]
n_xiix = self.ii[(w2, w3)]
n_ixix = self.ixi[(w1, w3)]
n_ixxi = self.ixxi[(w1, w4)]
n_xixi = self.ixi[(w2, w4)]
n_ixxx = self.word_fd[w1]
n_xixx = self.word_fd[w2]
n_xxix = self.word_fd[w3]
n_xxxi = self.word_fd[w4]
return score_fn(n_iiii,
(n_iiix, n_iixi, n_ixii, n_xiii),
(n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
(n_ixxx, n_xixx, n_xxix, n_xxxi),
n_all)
def demo(scorer=None, compare_scorer=None):
"""Finds bigram collocations in the files of the WebText corpus."""
from nltk.metrics import BigramAssocMeasures, spearman_correlation, ranks_from_scores
if scorer is None:
scorer = BigramAssocMeasures.likelihood_ratio
if compare_scorer is None:
compare_scorer = BigramAssocMeasures.raw_freq
from nltk.corpus import stopwords, webtext
ignored_words = stopwords.words('english')
word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words
for file in webtext.fileids():
words = [word.lower()
for word in webtext.words(file)]
cf = BigramCollocationFinder.from_words(words)
cf.apply_freq_filter(3)
cf.apply_word_filter(word_filter)
corr = spearman_correlation(ranks_from_scores(cf.score_ngrams(scorer)),
ranks_from_scores(cf.score_ngrams(compare_scorer)))
print(file)
print('\t', [' '.join(tup) for tup in cf.nbest(scorer, 15)])
print('\t Correlation to %s: %0.4f' % (compare_scorer.__name__, corr))
# Slows down loading too much
# bigram_measures = BigramAssocMeasures()
# trigram_measures = TrigramAssocMeasures()
if __name__ == '__main__':
import sys
from nltk.metrics import BigramAssocMeasures
try:
scorer = eval('BigramAssocMeasures.' + sys.argv[1])
except IndexError:
scorer = None
try:
compare_scorer = eval('BigramAssocMeasures.' + sys.argv[2])
except IndexError:
compare_scorer = None
demo(scorer, compare_scorer)
__all__ = ['BigramCollocationFinder',
'TrigramCollocationFinder', 'QuadgramCollocationFinder']
|