This file is indexed.

/usr/share/pyshared/translate/lang/identify.py is in translate-toolkit 1.10.0-2.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2009 Zuza Software Foundation
#
# This file is part of translate.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.

"""
This module contains functions for identifying languages based on language
models.
"""

from os import extsep, path

from translate.misc.file_discovery import get_abs_data_filename
from translate.storage.base import TranslationStore
from translate.lang.ngram import NGram


class LanguageIdentifier(object):
    """Identify the language of text with the help of ngram language models.

    On construction a name -> code mapping is read from a configuration
    file and an ``NGram`` classifier is created for the model directory.
    ``identify_lang`` then maps the classifier's result to a language code
    where the mapping has an entry for it.
    """

    MODEL_DIR = get_abs_data_filename('langmodels')
    """The directory containing the ngram language model files."""
    CONF_FILE = 'fpdb.conf'
    """
    The name of the file that contains language name-code pairs
    (relative to ``MODEL_DIR``).
    """

    def __init__(self, model_dir=None, conf_file=None):
        """Load the language configuration and set up the classifier.

        :param model_dir: Directory with the language model files; defaults
            to ``MODEL_DIR``.
        :param conf_file: Name of the configuration file. It is joined onto
            ``model_dir``, so an absolute path is used unchanged. Defaults
            to ``CONF_FILE``.
        :raises ValueError: If ``model_dir`` is not an existing directory or
            the configuration file does not exist.
        """
        if model_dir is None:
            model_dir = self.MODEL_DIR
        if not path.isdir(model_dir):
            raise ValueError('Directory does not exist: %s' % (model_dir,))

        if conf_file is None:
            conf_file = self.CONF_FILE
        conf_file = path.abspath(path.join(model_dir, conf_file))
        if not path.isfile(conf_file):
            raise ValueError('File does not exist: %s' % (conf_file,))

        self._lang_codes = {}
        self._load_config(conf_file)
        self.ngram = NGram(model_dir)

    def _load_config(self, conf_file):
        """Load the mapping of language names to language codes as given in the
        configuration file.

        Each non-comment line is split on whitespace; the first field is the
        language (model) name and the second its code. Any further fields
        are ignored. The result is stored in ``self._lang_codes``.

        :param conf_file: Path of the configuration file to read.
        """
        # Close the file deterministically instead of relying on garbage
        # collection.
        with open(conf_file) as conf:
            lines = conf.read().splitlines()

        for line in lines:
            parts = line.split()
            # Skip empty lines and comments. Checking the first token (rather
            # than the raw line) also catches comments that are indented.
            if not parts or parts[0].startswith('#'):
                continue
            # A name without a code cannot be mapped; ignore the line rather
            # than failing the whole load with an IndexError.
            if len(parts) < 2:
                continue
            lname, lcode = parts[0], parts[1]

            # Make sure lname is not prefixed by directory names
            lname = path.basename(lname)
            # Drop the last extension, if there is one
            lname = lname.rsplit(extsep, 1)[0]

            # Remove a trailing '-utf8' from the code, followed by one
            # trailing '-' or '_' should that be left over
            if lcode.endswith('-utf8'):
                lcode = lcode[:-len('-utf8')]
            if lcode.endswith(('-', '_')):
                lcode = lcode[:-1]

            self._lang_codes[lname] = lcode

    def identify_lang(self, text):
        """Identify the language of the text in the given string.

        :param text: The text to classify.
        :returns: ``None`` for empty text. Otherwise the classifier's result,
            translated to a language code if the loaded configuration
            has an entry for it and returned unchanged if not.
        """
        if not text:
            return None
        result = self.ngram.classify(text)
        return self._lang_codes.get(result, result)

    def _identify_store_lang(self, instore, attr, max_units):
        """Identify the language of one side of the given units.

        Shared implementation of ``identify_source_lang`` and
        ``identify_target_lang``.

        :param instore: ``TranslationStore`` or list or tuple of units.
        :param attr: Name of the unit attribute holding the text
            (``'source'`` or ``'target'``).
        :param max_units: Only the first ``max_units`` units are sampled.
        :returns: The result of ``identify_lang`` on the sampled text, or
            ``None`` if ``instore`` has an unsupported type or no text
            was found.
        """
        if not isinstance(instore, (TranslationStore, list, tuple)):
            return None

        # NOTE(review): a TranslationStore is sliced directly here, exactly
        # as before -- confirm that the store class supports slicing.
        text = u' '.join(
            getattr(unit, attr) for unit in instore[:max_units]
            if unit.istranslatable() and getattr(unit, attr))
        if not text:
            return None
        return self.identify_lang(text)

    def identify_source_lang(self, instore):
        """Identify the source language of the given translation store or
            units.

            :type  instore: ``TranslationStore`` or list or tuple of
                ``TranslationUnit``s.
            :param instore: The translation store to extract source text from.
            :returns: The identified language's code or ``None`` if the language
                could not be identified."""
        return self._identify_store_lang(instore, 'source', 50)

    def identify_target_lang(self, instore):
        """Identify the target language of the given translation store or
            units.

            :type  instore: ``TranslationStore`` or list or tuple of
                ``TranslationUnit``s.
            :param instore: The translation store to extract target text from.
            :returns: The identified language's code or ``None`` if the language
                could not be identified."""
        return self._identify_store_lang(instore, 'target', 200)

if __name__ == "__main__":
    # Command-line entry point: detect the language of the text file named
    # by the first argument and print the result.
    import locale
    import sys

    if len(sys.argv) < 2:
        # Give a usage hint instead of failing with a bare IndexError.
        sys.stderr.write("Usage: %s <textfile>\n" % sys.argv[0])
        sys.exit(1)

    identifier = LanguageIdentifier()
    # The input is decoded with the user's preferred locale encoding.
    encoding = locale.getpreferredencoding()
    # Read raw bytes and decode explicitly; the ``with`` block makes sure the
    # file is closed again.
    with open(sys.argv[1], 'rb') as textfile:
        text = textfile.read().decode(encoding)
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print("Language detected: %s" % identifier.identify_lang(text))