/usr/share/apt-xapian-index/plugins/translated-desc.py is in apt-xapian-index 0.47ubuntu8.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
try:
import apt
HAS_APT=True
except ImportError:
HAS_APT=False
import xapian
import re
import os
import codecs
try:
from debian import deb822
except ImportError:
from debian_bundle import deb822
from six.moves.urllib_parse import unquote
APTLISTDIR="/var/lib/apt/lists"
def translationFiles(langs=None, listsdir=None):
    """
    Find apt translation files and yield (language, pathname) pairs.

    Files are recognised by a name ending in "_i18n_Translation-<code>",
    for example ftp.uk.debian.org_debian_dists_sid_main_i18n_Translation-it.
    The language code is taken from the end of the file name and URL-unquoted
    (so a code stored as "pt%5fBR" is reported as "pt_BR").

    langs, if not empty, restricts the results to the given language codes.
    A file is accepted when either its unquoted code or the raw, still quoted
    code found in the file name is in langs.

    listsdir is the directory to scan; when None, APTLISTDIR is used.
    """
    if listsdir is None:
        listsdir = APTLISTDIR
    # Look for files like: ftp.uk.debian.org_debian_dists_sid_main_i18n_Translation-it
    # And extract the language code at the end
    tfile = re.compile(r"_i18n_Translation-([^-]+)$")
    for f in os.listdir(listsdir):
        mo = tfile.search(f)
        if not mo:
            continue
        raw = mo.group(1)
        # Unquote before filtering: the yielded code is the unquoted one, so
        # that is the spelling callers are expected to pass in langs
        lang = unquote(raw)
        if langs and lang not in langs and raw not in langs:
            continue
        yield lang, os.path.join(listsdir, f)
class Indexer:
    """
    Index the translated descriptions of a single language.

    The translation file is read once at construction time; afterwards
    index() adds the description text of a package to a Xapian document.
    """

    def __init__(self, lang, file):
        """
        Set up a term generator for lang and load descriptions from file.

        lang is the language code used in the "Description-<lang>" header
        file is the pathname of the translation file, read as UTF-8
        """
        self.lang = lang
        # Only the part before the first underscore is used to pick a stemmer
        self.xlang = lang.partition("_")[0]
        self.indexer = xapian.TermGenerator()

        # Install a stemmer for this language when Xapian provides one;
        # otherwise carry on without stemming
        try:
            self.stemmer = xapian.Stem(self.xlang)
            self.indexer.set_stemmer(self.stemmer)
        except xapian.InvalidArgumentError:
            pass

        # Load the translated descriptions, keyed by package name.
        # Paragraphs without the exact header are skipped: some translation
        # files carry variants of it (a Description-de.noguide: header was
        # once seen in the -de translations instead of Description-de:)
        header = "Description-" + self.lang
        with codecs.open(file, 'r', encoding='utf-8') as fp:
            self.descs = {
                para["Package"]: para[header]
                for para in deb822.Deb822.iter_paragraphs(fp)
                if header in para
            }

    def index(self, document):
        """
        Add the translated description terms to document.

        The document data is used as the package name to look up; packages
        without a translation are indexed with an empty text.
        """
        pkgname = document.get_data()
        generator = self.indexer
        generator.set_document(document)
        generator.index_text_without_positions(self.descs.get(pkgname, ""))
class TranslatedDescriptions:
    """
    Data source that indexes translated package descriptions.

    One Indexer is created per translation file found for the configured
    languages, and every indexed document is passed to each of them.
    """

    def __init__(self, langs):
        """
        langs is the collection of language codes to look translations up for.
        """
        self.langs = langs

    def info(self):
        """
        Return general information about the plugin.

        The information returned is a dict with various keywords:

         timestamp (required)
           the last modified timestamp of this data source.  This will be used
           to see if we need to update the database or not.  A timestamp of 0
           means that this data source is either missing or always up to date.
         values (optional)
           an array of dicts { name: name, desc: description }, one for every
           numeric value indexed by this data source.

        This implementation returns the newest modification time among the
        translation files as timestamp, plus a "sources" list with one
        { path, desc } dict per translation file and a "prefixes" list
        describing the ``Z`` term prefix.  When python-apt is not available
        only a timestamp of 0 is returned.

        Note that this method can be called before init.  The idea is that, if
        the timestamp shows that this plugin is currently not needed, then the
        long initialisation can just be skipped.
        """
        if not HAS_APT: return dict(timestamp = 0)
        # translationFiles() is a generator: materialise it, because the
        # results are walked twice below (timestamp and sources) and a
        # generator would already be exhausted on the second pass
        filelist = list(translationFiles(self.langs))
        # The leading 0 keeps max() well defined when no file is found
        maxts = max([0] + [os.path.getmtime(f) for l, f in filelist])
        return dict(
            timestamp=maxts,
            sources=[dict(path=f, desc="%s translation" % l) for l, f in filelist],
            prefixes=[
                dict(idx="Z", qp=None, type=None,
                     desc="Stemmed forms of keywords",
                     ldesc="This contains the stemmed forms of keywords as generated by"
                           " TermGenerator and matched by QueryParser"),
            ],
        )

    def init(self, info, progress):
        """
        If needed, perform long initialisation tasks here.

        info is a dictionary with useful information.  Currently it contains
        the following values:

          "values": a dict mapping index mnemonics to index numbers

        The progress indicator can be used to report progress.

        Here, one Indexer is built for every translation file found, which
        reads the whole file; info is not used.
        """
        self.indexers = []
        for lang, file in translationFiles(self.langs):
            progress.begin("Reading %s translations from %s" % (lang, file))
            self.indexers.append(Indexer(lang, file))
            progress.end()

    def doc(self):
        """
        Return documentation information for this data source.

        The documentation information is a dictionary with these keys:
          name: the name for this data source
          shortDesc: a short description
          fullDoc: the full description as a chapter in ReST format
        """
        return dict(
            name = "Translated package descriptions",
            shortDesc = "terms extracted from the translated package descriptions using Xapian's TermGenerator",
            fullDoc = """
The TranslatedDescriptions data source reads translated description
files from %s, then uses Xapian's TermGenerator to tokenise and
index their content.
Currently this creates normal terms as well as stemmed terms
prefixed with ``Z``.
""" % APTLISTDIR
        )

    def index(self, document, pkg):
        """
        Update the document with the information from this data source.

        document  is the document to update
        pkg       is the python-apt Package object for this package

        pkg is not used: each Indexer looks the package up through the
        document itself.  init() must have been called first.
        """
        for i in self.indexers:
            i.index(document)

    def indexDeb822(self, document, pkg):
        """
        Update the document with the information from this data source.

        This is alternative to index, and it is used when indexing with package
        data taken from a custom Packages file.

        document  is the document to update
        pkg       is the Deb822 object for this package

        pkg is not used: each Indexer looks the package up through the
        document itself.  init() must have been called first.
        """
        for i in self.indexers:
            i.index(document)
def init(langs=None, **kw):
    """
    Create and return the plugin object.

    None is returned instead when python-apt is not available, when no
    languages are requested, or when no translation file exists for the
    requested languages.  Extra keyword arguments are accepted and ignored.
    """
    if not (HAS_APT and langs):
        return None
    available = [path for _, path in translationFiles(langs)]
    if not available:
        return None
    return TranslatedDescriptions(langs=langs)
|