/usr/share/perl5/Plucene/Analysis/Standard/StandardTokenizer.pm is in libplucene-perl 1.25-3.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 | package Plucene::Analysis::Standard::StandardTokenizer;
=head1 NAME
Plucene::Analysis::Standard::StandardTokenizer - standard tokenizer
=head1 SYNOPSIS
# isa Plucene::Analysis::CharTokenizer
=head1 DESCRIPTION
This is the standard tokenizer.
This should be a good tokenizer for most European-language documents.
=head1 METHODS
=cut
use strict;
use warnings;
use base 'Plucene::Analysis::CharTokenizer';
# Don't blame me, blame the Plucene people!
# Building blocks for the token pattern. They are file-scoped lexicals,
# compiled once when the module is loaded and shared by token_re() and
# normalize() below.

# A run of one or more alphabetic characters.
my $alpha = qr/\p{IsAlpha}+/;
# Alphabetic runs joined by apostrophes, e.g. O'Reilly or John's.
my $apostrophe = qr/$alpha('$alpha)+/;
# Two or more alphabetic runs, each followed by a dot, e.g. U.S.A.
my $acronym = qr/$alpha\.($alpha\.)+/;
# Two alphabetic runs joined by a single & or @, e.g. AT&T.
my $company = qr/$alpha(&|\@)$alpha/;
# Two or more \w runs joined by dots, e.g. example.com.
my $hostname = qr/\w+(\.\w+)+/;
# A \w run, an @ sign, then something shaped like $hostname.
my $email = qr/\w+\@$hostname/;
# One separator character allowed inside a number-like token: _ / . , -
my $p = qr/[_\/.,-]/;
# A \w run that contains at least one digit.
my $hasdigit = qr/\w*\d\w*/;
# Number-like tokens such as 2004-01-15 or R2-D2: two or more \w runs joined
# by $p separators, where runs that must contain a digit alternate with
# unconstrained runs. The six alternatives spell out both possible starting
# kinds for the two-run case and for the longer odd- and even-length cases.
# The /x flag only serves to let the pattern be split across lines.
my $num = qr/\w+$p$hasdigit|$hasdigit$p\w+
|\w+($p$hasdigit$p\w+)+
|$hasdigit($p\w+$p$hasdigit)+
|\w+$p$hasdigit($p\w+$p$hasdigit)+
|$hasdigit$p\w+($p$hasdigit$p\w+)+/x;
=head2 token_re

Returns the compiled regular expression that matches a single token.

=cut

sub token_re {
    # Perl tries the alternatives left to right and keeps the first one that
    # succeeds (it does not pick the longest match), so the order below
    # decides which token type wins. The bare \w+ run is last as the
    # catch-all.
    #
    # NOTE(review): $company is tried before $email, so an address such as
    # user@example.com is first claimed as "user@example" - confirm whether
    # that ordering is intended.
    my $token_pattern = qr/
$apostrophe | $acronym | $company | $hostname | $email | $num
| \w+
/x;
    return $token_pattern;
}
=head2 normalize

    my $text = $class->normalize($token);

Tidies a single token: strips a trailing 's from a word that contains an
apostrophe, and removes the dots from an acronym. Any other token is
returned unchanged.

=cut

sub normalize {
    my ($class, $token) = @_;

    # Nothing to tidy; also avoids "uninitialized" warnings from the matches.
    return $token unless defined $token;

    # These are in the StandardFilter in Java, but Perl is not Java.
    # Thankfully.

    # Possessives: drop only a trailing 's (or 'S). An unanchored s/'s//
    # would eat the first "'s" it met, turning "o'sullivan" into "oullivan".
    $token =~ s/'s\z//i if $token =~ /$apostrophe/;

    # Acronyms: "U.S.A." becomes "USA". The match is anchored at both ends so
    # that only a token that is an acronym in its entirety loses its dots;
    # host names and e-mail addresses keep theirs.
    $token =~ tr/.//d if $token =~ /\A$acronym\z/;

    return $token;
}
1;
|