/usr/share/perl5/Text/Affixes.pm is in libtext-affixes-perl 0.09-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 | package Text::Affixes;
# Perl 5.6 is the first release providing `our` and the warnings pragma,
# both of which are used below.
use 5.006;
use strict;
use warnings;
# Exporter provides the import() method that honours the lists below.
require Exporter;
our @ISA = qw(Exporter);
# The ':all' tag is declared, but its symbol list is empty.
our %EXPORT_TAGS = ( 'all' => [ qw(
) ] );
# Symbols available on request: whatever ':all' lists (currently nothing).
our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
# Functions exported into the caller's namespace by default.
our @EXPORT = qw(
get_prefixes
get_suffixes
);
# Module version string.
our $VERSION = '0.09';
=head1 NAME
Text::Affixes - Prefixes and suffixes analysis of text
=head1 SYNOPSIS
use Text::Affixes;
my $text = "Hello, world. Hello, big world.";
my $prefixes = get_prefixes($text);
# $prefixes now holds
# {
# 3 => {
# 'Hel' => 2,
# 'wor' => 2,
# }
# }
# or
$prefixes = get_prefixes({min => 1, max => 2},$text);
# $prefixes now holds
# {
# 1 => {
# 'H' => 2,
# 'w' => 2,
# 'b' => 1,
# },
# 2 => {
# 'He' => 2,
# 'wo' => 2,
# 'bi' => 1,
# }
# }
# the use for get_suffixes is similar
=head1 DESCRIPTION
Provides methods for prefix and suffix analysis of text.
=head1 METHODS
=head2 get_prefixes
Extracts prefixes from text. You can specify the minimum and maximum
number of characters of prefixes you want.
Returns a reference to a hash, where the specified limits are mapped
in hashes; each of those hashes maps every prefix in the text into the
number of times it was found.
By default, both minimum and maximum limits are 3. If the minimum
limit is greater than the maximum one, an empty hash is returned.
A prefix is considered to be a sequence of word characters (\w) in
the beginning of a word (that is, after a word boundary) that does not
reach the end of the word ("regular expressionly", a prefix is the $1
of /\b(\w+)\w/).
# extracting prefixes of size 3
$prefixes = get_prefixes( $text );
# extracting prefixes of sizes 2 and 3
$prefixes = get_prefixes( {min => 2}, $text );
# extracting prefixes of sizes 3 and 4
$prefixes = get_prefixes( {max => 4}, $text );
# extracting prefixes of sizes 2, 3 and 4
$prefixes = get_prefixes( {min => 2, max=> 4}, $text);
=cut
sub get_prefixes {
    # A true first argument puts the shared worker in prefix mode; the
    # caller's arguments (optional options hash, then text) follow as-is.
    my $want_prefixes = 1;
    return _get_elements( $want_prefixes, @_ );
}
=head2 get_suffixes
The get_suffixes function is similar to the get_prefixes one. You
should read the documentation for that one and then come back to this
point.
A suffix is considered to be a sequence of word characters (\w) in
the end of a word (that is, before a word boundary) that does not start
at the beginning of the word ("regular expressionly" speaking, a
suffix is the $1 of /\w(\w+)\b/).
# extracting suffixes of size 3
$suffixes = get_suffixes( $text );
# extracting suffixes of sizes 2 and 3
$suffixes = get_suffixes( {min => 2}, $text );
# extracting suffixes of sizes 3 and 4
$suffixes = get_suffixes( {max => 4}, $text );
# extracting suffixes of sizes 2, 3 and 4
$suffixes = get_suffixes( {min => 2, max=> 4}, $text);
=cut
sub get_suffixes {
    # A false first argument puts the shared worker in suffix mode; the
    # caller's arguments (optional options hash, then text) follow as-is.
    my $want_prefixes = 0;
    return _get_elements( $want_prefixes, @_ );
}
# Shared worker behind get_prefixes() and get_suffixes().
#
# Arguments (positional):
#   $task  - true to extract prefixes, false to extract suffixes
#   \%opts - optional hash reference overriding the defaults declared
#            below (min, max, exclude_numbers, lowercase)
#   $text  - the text to analyse
#
# Returns a reference to a hash that maps each affix length to a hash of
# affix => number of occurrences.  Returns {} when max < min, and undef
# when no text is supplied (missing, undef or the empty string).
sub _get_elements {
    my $task = shift;

=head1 OPTIONS

Apart from deciding on a minimum and maximum size for prefixes or suffixes, you
can also decide on some configuration options.

=cut

    # configuration: defaults first, then any caller-supplied overrides
    my %conf = (
        min             => 3,
        max             => 3,
        exclude_numbers => 1,
        lowercase       => 0,
    );
    if (ref($_[0]) eq 'HASH') {
        %conf = (%conf, %{+shift});
    }

    return {} if $conf{max} < $conf{min};

    # get the elements
    my %elements;
    my $text = shift;

    # Test for presence rather than truth, so that the text "0" is analysed
    # like any other string instead of being mistaken for a missing
    # argument.  The scalar undef is returned on purpose (not a bare
    # return) so existing callers see the same value in every context.
    return undef unless defined $text && length $text;

    # An affix has at least one character.
    $conf{min} = 1 if $conf{min} < 1;

    for my $size ($conf{min} .. $conf{max}) {
        # The extra \w outside the capture ensures the affix never spans
        # the whole word.
        my $regex = $task ? qr/\b(\w{$size})\w/     # prefixes
                          : qr/\w(\w{$size})\b/;    # suffixes
        while ($text =~ /$regex/g) {
            $elements{$size}{$1}++;
        }
    }

=head2 exclude_numbers

Set to 0 if you consider numbers as part of words. Default value is 1.

  # this
  get_suffixes( {min => 1, max => 1, exclude_numbers => 0}, "Hello, but w8" );

  # returns this:
  {
    1 => {
           'o' => 1,
           't' => 1,
           '8' => 1
         }
  }

=cut

    # exclude elements containing numbers
    if ($conf{exclude_numbers}) {
        for my $count_of (values %elements) {
            for my $affix (keys %{$count_of}) {
                delete $count_of->{$affix} if $affix =~ /\d/;
            }
        }
    }

=head2 lowercase

Set to 1 to extract all prefixes (or suffixes) in lowercase mode. Default
value is 0.

ATTENTION: This does not mean that affixes with uppercased characters won't be
extracted. It means they will be extracted after being lowercased.

  # this...
  get_prefixes( {min => 2, max => 2, lowercase => 1}, "Hello, hello");

  # returns this:
  {
    2 => {
           'he' => 2
         }
  }

=cut

    # elements containing uppercased characters become lowercased ones;
    # their counts are merged into the lowercased key
    if ($conf{lowercase}) {
        for my $count_of (values %elements) {
            # keys() yields a snapshot, so changing the hash here is safe
            for my $affix (keys %{$count_of}) {
                next unless $affix =~ /[[:upper:]]/;
                $count_of->{lc $affix} += delete $count_of->{$affix};
            }
        }
    }

    return \%elements;
}
1;
__END__
=head1 TO DO
=over 6
=item * Make it more efficient (use C for that)
=back
=head1 AUTHOR
Jose Castro, C<< <cog@cpan.org> >>
=head1 COPYRIGHT & LICENSE
Copyright 2004 Jose Castro, All Rights Reserved.
This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.
=cut
|