/usr/share/pyshared/translate/lang/identify.py is in translate-toolkit 1.10.0-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2009 Zuza Software Foundation
#
# This file is part of translate.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.
"""
This module contains functions for identifying languages based on language
models.
"""
from os import extsep, path
from translate.misc.file_discovery import get_abs_data_filename
from translate.storage.base import TranslationStore
from translate.lang.ngram import NGram
class LanguageIdentifier(object):
    """Identify the language of text with an n-gram classifier.

    On construction the identifier reads a configuration file that maps
    language-model names to language codes and creates an ``NGram``
    instance for the model directory.  ``identify_lang`` then classifies a
    string and translates the classifier's answer through that mapping.
    """

    MODEL_DIR = get_abs_data_filename('langmodels')
    """The directory containing the ngram language model files."""
    CONF_FILE = 'fpdb.conf'
    """
    The name of the file that contains language name-code pairs
    (relative to ``MODEL_DIR``).
    """

    def __init__(self, model_dir=None, conf_file=None):
        """Set up the identifier.

        :param model_dir: Directory holding the language models; defaults to
                          ``MODEL_DIR``.
        :param conf_file: Name of the name-code configuration file, resolved
                          relative to ``model_dir``; defaults to
                          ``CONF_FILE``.
        :raises ValueError: If ``model_dir`` is not an existing directory or
                            the resolved configuration file does not exist.
        """
        if model_dir is None:
            model_dir = self.MODEL_DIR
        if not path.isdir(model_dir):
            raise ValueError('Directory does not exist: %s' % (model_dir,))

        if conf_file is None:
            conf_file = self.CONF_FILE
        conf_file = path.abspath(path.join(model_dir, conf_file))
        if not path.isfile(conf_file):
            raise ValueError('File does not exist: %s' % (conf_file,))

        self._lang_codes = {}
        self._load_config(conf_file)
        self.ngram = NGram(model_dir)

    def _load_config(self, conf_file):
        """Load the mapping of language names to language codes as given in
        the configuration file.

        Each non-comment line is split on whitespace; the first field is the
        model name and the second the language code.  The result is stored
        in ``self._lang_codes``.

        :param conf_file: Path of the configuration file to read.
        """
        # Use a context manager so the file handle is always closed.
        with open(conf_file) as conf:
            lines = conf.read().splitlines()

        for line in lines:
            parts = line.split()
            # Skip empty lines and comments.  Testing the first field rather
            # than the raw line also catches comments that are indented.
            if not parts or parts[0].startswith('#'):
                continue
            # A line without a code field cannot contribute a mapping; skip
            # it instead of failing with an IndexError.
            if len(parts) < 2:
                continue
            lname, lcode = parts[0], parts[1]

            # Make sure lname is not prefixed by directory names
            lname = path.basename(lname)
            if extsep in lname:
                # Drop everything from the last extension separator onwards.
                lname = lname.rsplit(extsep, 1)[0]

            # Strip a trailing '-utf8', then one trailing '-' or '_'
            # separator (e.g. 'af--utf8' -> 'af').  The separator is stripped
            # whether or not the '-utf8' suffix was present.
            if lcode.endswith('-utf8'):
                lcode = lcode[:-len('-utf8')]
            if lcode.endswith(('-', '_')):
                lcode = lcode[:-1]

            self._lang_codes[lname] = lcode

    def identify_lang(self, text):
        """Identify the language of the text in the given string.

        :param text: The text to classify.
        :returns: ``None`` for empty text; otherwise the language code mapped
                  to the classifier's result, or the raw classifier result
                  when it has no entry in the configuration mapping.
        """
        if not text:
            return None

        result = self.ngram.classify(text)
        return self._lang_codes.get(result, result)

    def _identify_units_lang(self, instore, attr, limit):
        """Identify the language of the ``attr`` text of the first ``limit``
        units of ``instore``.

        Shared implementation of ``identify_source_lang`` and
        ``identify_target_lang``.

        :param instore: ``TranslationStore`` or list or tuple of units.
        :param attr: Name of the unit attribute to read (``'source'`` or
                     ``'target'``).
        :param limit: Maximum number of leading units to consider.
        :returns: The identified language's code or ``None``.
        """
        if not isinstance(instore, (TranslationStore, list, tuple)):
            return None

        # NOTE(review): the slice requires ``instore`` to support slice
        # indexing; lists and tuples do -- confirm TranslationStore does too.
        text = u' '.join(
            getattr(unit, attr) for unit in instore[:limit]
            if unit.istranslatable() and getattr(unit, attr))
        if not text:
            return None
        return self.identify_lang(text)

    def identify_source_lang(self, instore):
        """Identify the source language of the given translation store or
        units.

        Only the first 50 units are examined.

        :type instore: ``TranslationStore`` or list or tuple of
                       ``TranslationUnit``s.
        :param instore: The translation store to extract source text from.
        :returns: The identified language's code or ``None`` if the language
                  could not be identified."""
        return self._identify_units_lang(instore, 'source', 50)

    def identify_target_lang(self, instore):
        """Identify the target language of the given translation store or
        units.

        Only the first 200 units are examined.

        :type instore: ``TranslationStore`` or list or tuple of
                       ``TranslationUnit``s.
        :param instore: The translation store to extract target text from.
        :returns: The identified language's code or ``None`` if the language
                  could not be identified."""
        return self._identify_units_lang(instore, 'target', 200)
if __name__ == "__main__":
    # Command-line entry point: identify the language of the text file given
    # as the first argument and print the result.
    import locale
    from sys import argv

    if len(argv) < 2:
        # Give a usage hint instead of an IndexError traceback.
        raise SystemExit("Usage: %s <text-file>" % argv[0])

    identifier = LanguageIdentifier()
    encoding = locale.getpreferredencoding()
    # Read raw bytes and decode them explicitly with the locale's preferred
    # encoding; the context manager guarantees the file is closed.
    with open(argv[1], 'rb') as text_file:
        text = text_file.read().decode(encoding)

    # Single pre-formatted argument so the call prints the same line whether
    # ``print`` is a statement or a function.
    print("Language detected: %s" % identifier.identify_lang(text))
|