/usr/lib/perl5/KinoSearch1/Analysis/PolyAnalyzer.pm is in libkinosearch1-perl 1.00-1build3.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
package KinoSearch1::Analysis::PolyAnalyzer;
use strict;
use warnings;
use KinoSearch1::Util::ToolSet;
use base qw( KinoSearch1::Analysis::Analyzer );
# Register this class's instance variables. The call sits inside a BEGIN
# block, so it executes at compile time, as soon as the block has been parsed.
# init_instance_vars() is not defined in this file (presumably inherited or
# imported -- verify against the parent class / ToolSet).
BEGIN {
__PACKAGE__->init_instance_vars(
# constructor params / members
# analyzers: passed as undef here (presumably the default value);
# init_instance() checks it with defined() to decide whether a default
# analyzer chain has to be built.
analyzers => undef,
);
}
use KinoSearch1::Analysis::LCNormalizer;
use KinoSearch1::Analysis::Tokenizer;
use KinoSearch1::Analysis::Stemmer;
# Finish construction of a PolyAnalyzer.
#
# Normalizes the 'language' member to lower case and, when no 'analyzers'
# member was supplied, builds the default chain for that language:
# LCNormalizer, then Tokenizer, then Stemmer.
#
# Croaks if neither 'language' nor 'analyzers' was specified.
sub init_instance {
    my $self = shift;

    # 'language' can be undef here (e.g. the caller supplied only
    # 'analyzers'; any default set by the parent class is not visible in this
    # file).  Guard the lc() so that case does not raise an "uninitialized
    # value" warning.  The stored value is the empty string either way.
    my $language = $self->{language}
        = lc( defined $self->{language} ? $self->{language} : '' );

    # create a default set of analyzers if language was specified
    if ( !defined $self->{analyzers} ) {
        croak("Must specify either 'language' or 'analyzers'")
            unless $language;
        $self->{analyzers} = [
            KinoSearch1::Analysis::LCNormalizer->new( language => $language ),
            KinoSearch1::Analysis::Tokenizer->new( language => $language ),
            KinoSearch1::Analysis::Stemmer->new( language => $language ),
        ];
    }
}
# Run a token batch through every analyzer in the chain.
#
# Each element of $self->{analyzers} has its analyze() method called in array
# order; the value returned by one stage becomes the input of the next.
# Returns the output of the last stage (or the input unchanged when the chain
# is empty).
sub analyze {
    my $self  = shift;
    my $batch = shift;

    # hand the batch from one stage to the next, first to last
    foreach my $stage ( @{ $self->{analyzers} } ) {
        $batch = $stage->analyze($batch);
    }

    return $batch;
}
1;
__END__
=head1 NAME
KinoSearch1::Analysis::PolyAnalyzer - multiple analyzers in series
=head1 SYNOPSIS
my $analyzer = KinoSearch1::Analysis::PolyAnalyzer->new(
language => 'es',
);
# or...
my $analyzer = KinoSearch1::Analysis::PolyAnalyzer->new(
analyzers => [
$lc_normalizer,
$custom_tokenizer,
$snowball_stemmer,
],
);
=head1 DESCRIPTION
A PolyAnalyzer is a series of Analyzers -- objects which inherit from
L<KinoSearch1::Analysis::Analyzer|KinoSearch1::Analysis::Analyzer> -- each of
which will be called upon to "analyze" text in turn. You can either provide
the Analyzers yourself, or you can specify a supported language, in which case
a PolyAnalyzer consisting of an
L<LCNormalizer|KinoSearch1::Analysis::LCNormalizer>, a
L<Tokenizer|KinoSearch1::Analysis::Tokenizer>, and a
L<Stemmer|KinoSearch1::Analysis::Stemmer> will be generated for you.

Supported languages:

    en => English,
    da => Danish,
    de => German,
    es => Spanish,
    fi => Finnish,
    fr => French,
    it => Italian,
    nl => Dutch,
    no => Norwegian,
    pt => Portuguese,
    ru => Russian,
    sv => Swedish,

=head1 CONSTRUCTOR
=head2 new()
my $analyzer = KinoSearch1::Analysis::PolyAnalyzer->new(
language => 'en',
);
Construct a PolyAnalyzer object. If the parameter C<analyzers> is specified,
it will override C<language> and no attempt will be made to generate a default
set of Analyzers.
=over
=item
B<language> - Must be an ISO code from the list of supported languages.
=item
B<analyzers> - Must be an arrayref. Each element in the array must inherit
from KinoSearch1::Analysis::Analyzer. The order of the analyzers matters.
Don't put a Stemmer before a Tokenizer (can't stem whole documents or
paragraphs -- just individual words), or a Stopalizer after a Stemmer (stemmed
words, e.g. "themselv", will not appear in a stoplist). In general, the
sequence should be: normalize, tokenize, stopalize, stem.
=back
=head1 COPYRIGHT
Copyright 2005-2010 Marvin Humphrey
=head1 LICENSE, DISCLAIMER, BUGS, etc.
See L<KinoSearch1> version 1.00.
=cut
|