This file is indexed.

/usr/share/perl5/Plucene/Analysis/Standard/StandardTokenizer.pm is in libplucene-perl 1.25-3.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
package Plucene::Analysis::Standard::StandardTokenizer;

=head1 NAME 

Plucene::Analysis::Standard::StandardTokenizer - standard tokenizer

=head1 SYNOPSIS

	# isa Plucene::Analysis::CharTokenizer

=head1 DESCRIPTION

This is the standard tokenizer.

This should be a good tokenizer for most European-language documents.

=head1 METHODS

=cut

use strict;
use warnings;

use base 'Plucene::Analysis::CharTokenizer';

# Building blocks for the tokenising expression returned by token_re().
# All of them are file-scoped and compiled once, when the module loads.
# Note that the parenthesised groups below are capturing groups.
#
# Don't blame me, blame the Plucene people!

# One or more alphabetic characters.
my $alpha      = qr/\p{IsAlpha}+/;
# Alphabetic runs joined by apostrophes, e.g. O'Reilly's.
my $apostrophe = qr/$alpha('$alpha)+/;
# Two or more alphabetic runs, each followed by a dot, e.g. U.S.A.
my $acronym    = qr/$alpha\.($alpha\.)+/;
# Two alphabetic runs joined by a single & or @, e.g. AT&T.
my $company    = qr/$alpha(&|\@)$alpha/;
# Two or more word-character labels joined by dots.
my $hostname   = qr/\w+(\.\w+)+/;
# A word-character local part, an @, then a hostname.
my $email      = qr/\w+\@$hostname/;
# A single separator: underscore, slash, dot, comma or hyphen.
my $p          = qr/[_\/.,-]/;
# A run of word characters containing at least one digit.
my $hasdigit   = qr/\w*\d\w*/;
# Word-character parts joined by $p separators, where alternating parts
# must contain a digit. Compiled with /x, so the line breaks and leading
# whitespace inside the pattern are not significant.
my $num        = qr/\w+$p$hasdigit|$hasdigit$p\w+
                   |\w+($p$hasdigit$p\w+)+
                   |$hasdigit($p\w+$p$hasdigit)+
                   |\w+$p$hasdigit($p\w+$p$hasdigit)+
                   |$hasdigit$p\w+($p$hasdigit$p\w+)+/x;

=head2 token_re

Returns the compiled regular expression that recognises a single token.

A token is, in order of preference, an apostrophe word, an acronym, a
company name, a host name, an e-mail address or a number; anything else
falls back to a plain run of word characters.

=cut

sub token_re {

	# Perl tries alternatives left to right and keeps the first one that
	# matches, so the order below is significant.
	# NOTE(review): $company is listed before $email, so for text such as
	# foo@bar.com the company alternative can win with "foo@bar" - confirm
	# whether that is intended.
	my $token_pattern = qr/
        $apostrophe | $acronym | $company | $hostname | $email | $num
        | \w+
    /x;

	return $token_pattern;
}

=head2 normalize

	my $text = $class->normalize($token);

Tidies up a single token and returns the result:

=over 4

=item *

A trailing possessive C<'s> (or C<'S>) is removed from an apostrophe
word, so C<O'Reilly's> becomes C<O'Reilly>. An C<'s> elsewhere in the
word is left alone, so C<o'shea> is returned unchanged.

=item *

The dots are removed from an acronym, so C<I.B.M.> becomes C<IBM>.

=back

Any other token, including an undefined one, is returned as it was given.

=cut

sub normalize {
	my ($class, $token) = @_;

	# Nothing to tidy; hand undef straight back rather than warn on it.
	return $token unless defined $token;

	# These are in the StandardFilter in Java, but Perl is not Java.
	# Thankfully.

	# Both checks are anchored so that they classify the whole token and do
	# not fire on a fragment of some other kind of token (an e-mail address
	# contains something that looks like a company name, for instance).
	if ($token =~ /\A$apostrophe\z/) {

		# Only a possessive at the very end is dropped.
		$token =~ s/'s\z//i;
	}
	if ($token =~ /\A$acronym\z/) {
		$token =~ tr/.//d;
	}
	return $token;
}

1;