This file is indexed.

/usr/lib/perl5/KinoSearch1/Analysis/PolyAnalyzer.pm is in libkinosearch1-perl 1.00-1build3.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
package KinoSearch1::Analysis::PolyAnalyzer;
use strict;
use warnings;
use KinoSearch1::Util::ToolSet;
use base qw( KinoSearch1::Analysis::Analyzer );

# Register this class's constructor params / members, with their defaults,
# while the package is being compiled.
# NOTE(review): 'language', which init_instance() reads, is not declared here;
# presumably it is inherited from KinoSearch1::Analysis::Analyzer -- confirm.
BEGIN {
    __PACKAGE__->init_instance_vars(
        # constructor params / members
        analyzers => undef,    # arrayref of analyzers, applied in order by analyze()
    );
}

use KinoSearch1::Analysis::LCNormalizer;
use KinoSearch1::Analysis::Tokenizer;
use KinoSearch1::Analysis::Stemmer;

# Finish construction of a PolyAnalyzer.
#
# Normalizes the 'language' member to lowercase and, when the caller did not
# supply 'analyzers', builds the default chain for that language:
# LCNormalizer, then Tokenizer, then Stemmer.
#
# Croaks if neither 'language' nor 'analyzers' was supplied.
sub init_instance {
    my $self = shift;

    # Treat an undefined language like an empty one, so that passing
    # language => undef alongside explicit analyzers does not trigger an
    # "uninitialized value" warning from lc().
    my $language
        = defined $self->{language} ? lc( $self->{language} ) : '';
    $self->{language} = $language;

    # create a default set of analyzers if none were supplied
    if ( !defined $self->{analyzers} ) {
        croak("Must specify either 'language' or 'analyzers'")
            unless $language;
        $self->{analyzers} = [
            KinoSearch1::Analysis::LCNormalizer->new( language => $language ),
            KinoSearch1::Analysis::Tokenizer->new( language => $language ),
            KinoSearch1::Analysis::Stemmer->new( language => $language ),
        ];
    }
}

# Run a token batch through every analyzer in the chain and return the
# final result.
sub analyze {
    my $self  = shift;
    my $batch = shift;

    # Each analyzer consumes the output of the one before it, so the order
    # of $self->{analyzers} determines the order of processing.
    for my $analyzer ( @{ $self->{analyzers} } ) {
        $batch = $analyzer->analyze($batch);
    }

    return $batch;
}

1;

__END__

=head1 NAME

KinoSearch1::Analysis::PolyAnalyzer - multiple analyzers in series 

=head1 SYNOPSIS

    my $analyzer = KinoSearch1::Analysis::PolyAnalyzer->new(
        language  => 'es',
    );
    
    # or...
    my $analyzer = KinoSearch1::Analysis::PolyAnalyzer->new(
        analyzers => [
            $lc_normalizer,
            $custom_tokenizer,
            $snowball_stemmer,
        ],
    );

=head1 DESCRIPTION

A PolyAnalyzer is a series of Analyzers -- objects which inherit from
L<KinoSearch1::Analysis::Analyzer|KinoSearch1::Analysis::Analyzer> -- each of
which will be called upon to "analyze" text in turn.  You can either provide
the Analyzers yourself, or you can specify a supported language, in which case
a PolyAnalyzer consisting of an
L<LCNormalizer|KinoSearch1::Analysis::LCNormalizer>, a
L<Tokenizer|KinoSearch1::Analysis::Tokenizer>, and a
L<Stemmer|KinoSearch1::Analysis::Stemmer> will be generated for you.

Supported languages:

    en => English,
    da => Danish,
    de => German,
    es => Spanish,
    fi => Finnish,
    fr => French,
    it => Italian,
    nl => Dutch,
    no => Norwegian,
    pt => Portuguese,
    ru => Russian,
    sv => Swedish,

=head1 CONSTRUCTOR

=head2 new()

    my $analyzer = KinoSearch1::Analysis::PolyAnalyzer->new(
        language   => 'en',
    );

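Construct a PolyAnalyzer object.  If the parameter C<analyzers> is specified,
it will override C<language> and no attempt will be made to generate a default
set of Analyzers.  At least one of the two parameters must be supplied;
otherwise the constructor throws an exception.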

=over

=item

B<language> - Must be a two-letter ISO code from the list of supported
languages.  Case is ignored: the code is lowercased before use.

=item

B<analyzers> - Must be an arrayref.  Each element in the array must inherit
from KinoSearch1::Analysis::Analyzer.  The order of the analyzers matters.
Don't put a Stemmer before a Tokenizer (can't stem whole documents or
paragraphs -- just individual words), or a Stopalizer after a Stemmer (stemmed
words, e.g. "themselv", will not appear in a stoplist).  In general, the
sequence should be: normalize, tokenize, stopalize, stem.

=back

=head1 COPYRIGHT

Copyright 2005-2010 Marvin Humphrey

=head1 LICENSE, DISCLAIMER, BUGS, etc.

See L<KinoSearch1> version 1.00.

=cut