/usr/lib/python2.7/dist-packages/rdflib/TextIndex.py is in python-rdflib 2.4.2-3build1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
try:
    from hashlib import md5
except ImportError:
    from md5 import md5
from rdflib.BNode import BNode
from rdflib.Graph import ConjunctiveGraph
from rdflib.Literal import Literal
from rdflib.Namespace import NamespaceDict as Namespace
from rdflib.URIRef import URIRef
from rdflib.store import TripleAddedEvent, TripleRemovedEvent
from rdflib.store.IOMemory import IOMemory
import logging
import re #, stopdict
_logger = logging.getLogger(__name__)
def get_stopdict():
    """Return a dictionary of stopwords."""
    return _dict
_words = [
    "a", "and", "are", "as", "at", "be", "but", "by",
    "for", "if", "in", "into", "is", "it",
    "no", "not", "of", "on", "or", "such",
    "that", "the", "their", "then", "there", "these",
    "they", "this", "to", "was", "will", "with"
]
_dict = {}
for w in _words:
    _dict[w] = None
word_pattern = re.compile(r"(?u)\w+")
has_stop = get_stopdict().has_key
def splitter(s):
    return word_pattern.findall(s)

def stopper(s):
    return [w.lower() for w in s if not has_stop(w)]
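
# Editor's note (not part of the original module): a small, self-contained
# demonstration of the tokenization helpers above. It is only defined here,
# never called on import; the expected values follow from the stopword list
# and the two functions defined earlier.
def _demo_tokenization():
    tokens = splitter(u'one and two or three')
    assert tokens == [u'one', u'and', u'two', u'or', u'three']
    # 'and' and 'or' are stopwords, so stopper() drops them and lowercases the rest
    assert stopper(tokens) == [u'one', u'two', u'three']
    # the stopword lookup happens before lowercasing, so capitalized stopwords
    # such as u'The' slip through (and come out lowercased)
    assert stopper([u'The', u'the']) == [u'the']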
class TextIndex(ConjunctiveGraph):
    """
    An rdflib graph event handler that indexes text literals as they are
    added to another graph.

    This class lets you 'search' the text literals in an RDF graph.
    Typically in RDF, to search for a substring in a graph you would
    have to 'brute force' search every literal string looking for your
    substring.

    Instead, this index stores the words in literals into another
    graph whose structure makes searching for terms much less
    expensive. It does this by chopping up the literals into words,
    removing very common words (currently only in English) and then
    adding each of those words into an RDF graph that describes the
    statements in the original graph that the word came from.

    First, let's create a graph that will transmit events and a text
    index that will receive those events, and then subscribe the text
    index to the event graph:

    >>> e = ConjunctiveGraph()
    >>> t = TextIndex()
    >>> t.subscribe_to(e)

    When triples are added to the event graph (e), events will be fired
    that trigger event handlers in subscribers. In this case our only
    subscriber is a text index, and its action is to index triples that
    contain literal RDF objects. Here are 3 such triples:

    >>> e.add((URIRef('a'), URIRef('title'), Literal('one two three')))
    >>> e.add((URIRef('b'), URIRef('title'), Literal('two three four')))
    >>> e.add((URIRef('c'), URIRef('title'), Literal('three four five')))

    Between them, the three literal objects that were added contain
    five unique terms. These terms can be queried directly from the
    text index:

    >>> t.term_strings() == set(['four', 'five', 'three', 'two', 'one'])
    True

    Now we can search for statements that contain certain terms. Let's
    search for 'one', which occurs in only one of the literals
    provided, the one for subject 'a':

    >>> t.search('one')
    set([(rdflib.URIRef('a'), rdflib.URIRef('title'), None)])

    'one' and 'five' only occur in one statement each, 'two' and
    'four' occur in two, and 'three' occurs in three statements:

    >>> len(list(t.search('one')))
    1
    >>> len(list(t.search('two')))
    2
    >>> len(list(t.search('three')))
    3
    >>> len(list(t.search('four')))
    2
    >>> len(list(t.search('five')))
    1

    Let's add some more statements with different predicates.

    >>> e.add((URIRef('a'), URIRef('creator'), Literal('michel')))
    >>> e.add((URIRef('b'), URIRef('creator'), Literal('Atilla the one Hun')))
    >>> e.add((URIRef('c'), URIRef('creator'), Literal('michel')))
    >>> e.add((URIRef('d'), URIRef('creator'), Literal('Hun Mung two')))

    Now 'one' occurs in two statements:

    >>> assert len(list(t.search('one'))) == 2

    And 'two' occurs in three statements, here they are:

    >>> t.search('two')
    set([(rdflib.URIRef('d'), rdflib.URIRef('creator'), None), (rdflib.URIRef('a'), rdflib.URIRef('title'), None), (rdflib.URIRef('b'), rdflib.URIRef('title'), None)])

    The predicates that are searched can be restricted by providing an
    argument to 'search()':

    >>> t.search('two', URIRef('creator'))
    set([(rdflib.URIRef('d'), rdflib.URIRef('creator'), None)])
    >>> t.search('two', URIRef(u'title'))
    set([(rdflib.URIRef('a'), rdflib.URIRef('title'), None), (rdflib.URIRef('b'), rdflib.URIRef('title'), None)])

    You can search for more than one term by simply including them in
    the query:

    >>> t.search('two three', URIRef(u'title'))
    set([(rdflib.URIRef('c'), rdflib.URIRef('title'), None), (rdflib.URIRef('a'), rdflib.URIRef('title'), None), (rdflib.URIRef('b'), rdflib.URIRef('title'), None)])

    The above query returns all the statements that contain 'two' OR
    'three'. For the statements that contain 'two' AND 'three', do an
    intersection of two queries:

    >>> t.search('two', URIRef(u'title')).intersection(t.search(u'three', URIRef(u'title')))
    set([(rdflib.URIRef('a'), rdflib.URIRef('title'), None), (rdflib.URIRef('b'), rdflib.URIRef('title'), None)])

    Intersecting two queries like this is probably not the most
    efficient way to do it, but for reasonable data sets this isn't a
    problem. For larger data sets you will want to query the graph with
    SPARQL or something else more efficient.

    In all the above queries, the object of each statement was always
    'None'. This is because the index graph does not store the object
    data; that would make it very large, and the data is in any case
    available in the original data graph. For convenience, a method
    is provided to 'link' an index graph to a data graph. This allows
    the index to also provide object data in query results.

    >>> t.link_to(e)
    >>> set([str(i[2]) for i in t.search('two', URIRef(u'title')).intersection(t.search(u'three', URIRef(u'title')))]) == set(['two three four', 'one two three'])
    True

    You can remove the link by assigning None:

    >>> t.link_to(None)

    Unindexing means removing statements from the index graph that
    correspond to a statement in the data graph. Note that while it is
    possible to remove the index information about the occurrences of
    terms in statements, it is not possible to remove the terms
    themselves; terms are 'absolute' and are never removed from the
    index graph. This is not a problem, since languages have a finite
    number of terms:

    >>> e.remove((URIRef('a'), URIRef('creator'), Literal('michel')))
    >>> e.remove((URIRef('b'), URIRef('creator'), Literal('Atilla the one Hun')))
    >>> e.remove((URIRef('c'), URIRef('creator'), Literal('michel')))
    >>> e.remove((URIRef('d'), URIRef('creator'), Literal('Hun Mung two')))

    Now 'one' only occurs in one statement:

    >>> assert len(list(t.search('one'))) == 1

    And 'two' only occurs in two statements, here they are:

    >>> t.search('two')
    set([(rdflib.URIRef('a'), rdflib.URIRef('title'), None), (rdflib.URIRef('b'), rdflib.URIRef('title'), None)])

    The predicates that are searched can be restricted by providing an
    argument to 'search()':

    >>> t.search('two', URIRef(u'creator'))
    set([])
    >>> t.search('two', URIRef(u'title'))
    set([(rdflib.URIRef('a'), rdflib.URIRef('title'), None), (rdflib.URIRef('b'), rdflib.URIRef('title'), None)])
    """
    linked_data = None

    text_index = Namespace('http://rdflib.net/text_index#')
    term = Namespace('http://rdflib.net/text_index#')["term"]
    termin = Namespace('http://rdflib.net/text_index#')["termin"]

    def __init__(self, store='default'):
        super(TextIndex, self).__init__(store)

    def add_handler(self, event):
        if type(event.triple[2]) is Literal:
            self.index(event.triple)

    def remove_handler(self, event):
        if type(event.triple[2]) is Literal:
            self.unindex(event.triple)
    def index(self, (s, p, o)):
        # this code is tricky so it's annotated. unindex is the reverse of this method.
        if type(o) is Literal: # first, only index statements that have a literal object
            for word in stopper(splitter(o)): # split the literal and remove any stopwords
                word = Literal(word) # create a new literal for each word in the object
                # if that word already exists as a term in the index graph,
                # loop over each term node that carries the word
                if self.value(predicate=self.term, object=word, any=True):
                    for t in set(self.triples((None, self.term, word))):
                        t = t[0]
                        # if the graph does not contain an occurrence of the term
                        # in the statement's subject, then add it
                        if not (t, self.termin, s) in self:
                            self.add((t, self.termin, s))
                        # ditto for the predicate
                        if not (p, t, s) in self:
                            self.add((p, t, s))
                else: # if the term does not exist in the graph, add it, and the references to the statement.
                    # t gets used as a predicate, create identifier accordingly (AKA can't be a BNode)
                    h = md5(word.encode('utf-8')); h.update(s.encode('utf-8')); h.update(p.encode('utf-8'))
                    t = self.text_index["term_%s" % h.hexdigest()]
                    self.add((t, self.term, word))
                    self.add((t, self.termin, s))
                    self.add((p, t, s))
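    # Editor's note (not part of the original module): for a single data
    # statement such as
    #
    #     (URIRef('a'), URIRef('title'), Literal('one two three'))
    #
    # index() adds, for each non-stopword word, a term node t (an identifier
    # of the form text_index['term_<md5>']) and three kinds of triples to the
    # index graph:
    #
    #     (t, term,   Literal('one'))        # which word the term node stands for
    #     (t, termin, URIRef('a'))           # which subject the word occurs in
    #     (URIRef('title'), t, URIRef('a'))  # which predicate carried the word
    #
    # search() walks these triples in the opposite direction:
    # word -> term node -> subjects -> predicates.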
    def unindex(self, (s, p, o)):
        if type(o) is Literal:
            for word in stopper(splitter(o)):
                word = Literal(word)
                if self.value(predicate=self.term, object=word, any=True):
                    for t in self.triples((None, self.term, word)):
                        t = t[0]
                        if (t, self.termin, s) in self:
                            self.remove((t, self.termin, s))
                        if (p, t, s) in self:
                            self.remove((p, t, s))
    def terms(self):
        """Return the set of all term literals in the index graph."""
        return set(self.objects(None, self.term))

    def term_strings(self):
        """Return the set of term strings."""
        return set([str(i) for i in self.terms()])

    def search(self, terms, predicate=None):
        """Return the set of statements in which any of the given terms occurs."""
        if predicate and not isinstance(predicate, URIRef):
            _logger.warning("predicate is not a URIRef")
            predicate = URIRef(predicate)
        results = set()
        terms = [Literal(term) for term in stopper(splitter(terms))]
        for term in terms:
            for t in self.triples((None, self.term, term)):
                for o in self.objects(t[0], self.termin):
                    for p in self.triples((predicate, t[0], o)):
                        if self.linked_data is None:
                            results.add((o, p[0], None))
                        else:
                            results.add((o, p[0], self.linked_data.value(o, p[0])))
        return results
    def index_graph(self, graph):
        """
        Index a whole graph. Must be a conjunctive graph.
        """
        for t in graph.triples((None, None, None)):
            self.index(t)

    def link_to(self, graph):
        """
        Link this index to a data graph so that search results include the
        literal objects; pass None to remove the link.
        """
        self.linked_data = graph
    def subscribe_to(self, graph):
        """
        Subscribe this index to a graph.
        """
        graph.store.dispatcher.subscribe(TripleAddedEvent, self.add_handler)
        graph.store.dispatcher.subscribe(TripleRemovedEvent, self.remove_handler)
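    # Editor's note (not part of the original module): subscribe_to() only
    # reacts to triples added or removed after the subscription is made. A
    # reasonable pattern for a graph g that already holds data would be
    # something along these lines:
    #
    #     t = TextIndex()
    #     t.index_graph(g)    # index the triples already in g
    #     t.subscribe_to(g)   # keep the index in sync from now on
    #     t.link_to(g)        # optionally include objects in search results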
def test():
    import doctest
    doctest.testmod()

if __name__ == '__main__':
    test()