/usr/share/namazu/pl/wakati.pl is in namazu2-index-tools 2.0.21-6.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
#
# -*- Perl -*-
# $Id: wakati.pl,v 1.9.8.10 2009-01-28 17:54:57 opengl2772 Exp $
# Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved.
# Copyright (C) 2000-2009 Namazu Project All rights reserved.
# This is free software with ABSOLUTELY NO WARRANTY.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
# 02111-1307, USA
#
# This file must be encoded in EUC-JP.
#
package wakati;
use strict;
# Do wakatigaki (word segmentation) processing for a Japanese text.
#
# $content is a reference to the text. The referenced scalar is replaced
# in place by the segmented text, which always ends with a newline.
# The patterns below operate on raw EUC-JP bytes.
sub wakatize_japanese ($) {
    my ($content) = @_;
    my @lines = wakatize_japanese_sub($content);

    my $strip_okurigana = $var::Opt{'okurigana'};
    my $drop_hiragana   = $var::Opt{'hiragana'};

    # Removal of words made only of Hiragana characters (-H option) was
    # originally contributed by <furukawa@tcp-ip.or.jp>. [1997-11-13]
    # Okurigana processing was added later. [1998-04-24]
    if ($drop_hiragana || $strip_okurigana) {
        foreach my $line (@lines) {
            # Put an extra space in front of every whitespace character
            # and at the start of the line; the substitutions below use
            # literal spaces as word delimiters.
            $line =~ s/(\s)/ $1/g;
            $line = ' ' . $line;

            # Okurigana: cut the Hiragana run (lead byte a4) that trails
            # a run of non-Hiragana two-byte characters.
            $line =~ s/((?:[^\xa4][\xa1-\xfe])+)(?:\xa4[\xa1-\xf3])+ /$1 /g
                if $strip_okurigana;

            # Hiragana: delete space-delimited words that consist of
            # Hiragana characters only.
            $line =~ s/ (?:\xa4[\xa1-\xf3])+ //g
                if $drop_hiragana;
        }
    }

    if ($var::Opt{'noun'}) {
        # Collect only noun words when -m option is specified: from each
        # line keep the text in front of the noun tag.
        # noun (meisi) in Japanese is the byte sequence "cc be bb ec".
        my $nouns = "";
        foreach my $line (@lines) {
            $nouns .= $1 if $line =~ /(.+ )\xcc\xbe\xbb\xec/;
        }
        $$content = $nouns;
    } else {
        $$content = join("\n", @lines);
    }

    # Trim every line on both sides, collapse runs of spaces, and make
    # sure the result ends with exactly one appended newline.
    for ($$content) {
        s/^\s+//gm;
        s/\s+$//gm;
        s/ +/ /gm;
    }
    $$content .= "\n";

    util::dprint(_("-- wakatized content --\n")."$$content\n");
}
# Run the configured segmenter over a text and return its output lines.
#
# $content is a reference to the text; it is only read here.
# When $conf::WAKATI looks like "module_NAME", the in-process module NAME
# (kakasi, chasen or mecab) does the work. Any other value is used as a
# shell command that reads the text on stdin; its output is collected
# through a temporary file.
# Returns the list of output lines with their newlines removed.
sub wakatize_japanese_sub ($) {
    my ($content) = @_;
    my @lines = ();

    if ($conf::WAKATI =~ /^module_(\w+)/) {
        my $module    = $1;
        my $segmented = "";

        if ($module eq "kakasi") {
            # Only runs of high-bit bytes are handed to the segmenter;
            # the result is padded with a space on each side.
            $segmented = $$content;
            $segmented =~ s/([\x80-\xff]+)/{my $text = Text::Kakasi::do_kakasi($1); " $text ";}/ge;
        } elsif ($module eq "chasen") {
            if ($var::Opt{'noun'}) {
                # In noun mode the whole text goes through ChaSen at once.
                $segmented = Text::ChaSen::sparse_tostr_long($$content);
            } else {
                $segmented = $$content;
                $segmented =~ s/([\x80-\xff]+)/{my $text = Text::ChaSen::sparse_tostr_long($1); " $text ";}/ge;
            }
        } elsif ($module eq "mecab") {
            # The tagger lives in a package variable and is created on
            # first use only.
            use vars qw($t);
            unless (defined $t) {
                require MeCab;
                MeCab->import();
                # Try the string-argument constructor first and fall back
                # to the array-reference form when that yields nothing.
                eval '$t = new MeCab::Tagger("-Owakati");' or
                    $t = MeCab::Tagger->new([qw(mecab -O wakati)]);
            }
            # Explicitly destroy the tagger when the interpreter exits.
            END {
                $t->DESTROY() if defined $t;
            };
            $segmented = $$content;
            $segmented =~ s/([\x80-\xff]+)/{my $s = $1; my $text = $t->parse($s); " $text ";}/ge;
        } else {
            util::cdie(_("invalid wakati module: ")."$module\n");
        }

        util::dprint(_("-- wakatized bare content --\n")."$segmented\n\n");
        @lines = split(/\n/, $segmented);
        return @lines;
    }

    # External command: its output is gathered in a temporary file.
    my $tmpfile = util::tmpnam("NMZ.wakati");
    util::dprint(_("wakati: using ")."$conf::WAKATI\n");

    # IPC::Open2 is deliberately not used because it is not efficient.
    if ($var::Opt{'noun'}) {
        # Noun mode: pipe the whole text through the command in one go.
        my $pipe = util::efopen("|$conf::WAKATI > $tmpfile");
        print {$pipe} $$content;
        util::fclose($pipe);
    } else {
        # Consume the text as alternating runs of high-bit and 7-bit
        # bytes. High-bit runs are piped through the command, 7-bit runs
        # are written to the temporary file as they are. The first write
        # uses ">", every later one ">>".
        my $rest     = $$content;
        my $redirect = ">";
        for (;;) {
            my $out;
            if ($rest =~ s/^([\x80-\xff]+)//s) {
                my $chunk = $1;
                $out = util::efopen("|$conf::WAKATI $redirect $tmpfile");
                print {$out} " $chunk\n";
            } elsif ($rest =~ s/^([\x00-\x7f]+)//s) {
                my $chunk = $1;
                $out = util::efopen("$redirect $tmpfile");
                print {$out} " $chunk ";
            } else {
                last;
            }
            util::fclose($out);
            $redirect = ">>";
        }
    }

    # Read the collected output back, one element per line.
    my $in = util::efopen($tmpfile);
    chomp(@lines = <$in>);
    util::fclose($in);

    unlink $tmpfile;
    return @lines;
}
1;
|