/usr/share/perl5/Text/English.pm is in libtext-english-perl 1.606-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 | #!/usr/bin/perl
# -*- Mode: Perl -*-
# Author : Ian Phillipps
# Last Modified On: Sun May 2 15:35:33 2004
# Language : CPerl
package Text::English;
$VERSION = $VERSION = '0.01';
sub stem {
my @parms = @_;
foreach( @parms ) {
$_ = lc $_;
# Step 0 - remove punctuation
s/'s$//; s/^[^a-z]+//; s/[^a-z]+$//;
next unless /^[a-z]+$/;
# step1a_rules
if( /[^s]s$/ ) { s/sses$/ss/ || s/ies$/i/ || s/s$// }
# step1b_rules. The business with rule==106 is embedded in the
# boolean expressions here.
(/[aeiouy][^aeiouy].*eed$/ && s/eed$/ee/ ) ||
( s/([aeiou].*)ed$/$1/ || s/([aeiouy].*)ing$/$1/ ) &&
( # step1b1_rules
s/at$/ate/ || s/bl$/ble/ || s/iz$/ize/ || s/bb$/b/ ||
s/dd$/d/ || s/ff$/f/ || s/gg$/g/ || s/mm$/m/ ||
s/nn$/n/ || s/pp$/p/ || s/rr$/r/ || s/tt$/t/ ||
s/ww$/w/ || s/xx$/x/ ||
# This is wordsize==1 && CVC...addanE...
s/^[^aeiouy]+[aeiouy][^aeiouy]$/$&e/
)
#DEBUG && warn "step1b1: $_\n"
;
# step1c_rules
#DEBUG warn "step1c: $_\n" if
s/([aeiouy].*)y$/$1i/;
# step2_rules
if ( s/ational$/ate/ || s/tional$/tion/ || s/enci$/ence/ ||
s/anci$/ance/ || s/izer$/ize/ || s/iser$/ise/ ||
s/abli$/able/ || s/alli$/al/ || s/entli$/ent/ ||
s/eli$/e/ || s/ousli$/ous/ || s/ization$/ize/ ||
s/isation$/ise/ || s/ation$/ate/ || s/ator$/ate/ ||
s/alism$/al/ || s/iveness$/ive/ || s/fulnes$/ful/ ||
s/ousness$/ous/ || s/aliti$/al/ || s/iviti$/ive/ ||
s/biliti$/ble/
) {
my ($l,$m) = ($`,$&);
#DEBUG warn "step 2: l=$l m=$m\n";
$_ = $l.$m unless $l =~ /[^aeiou][aeiouy]/;
}
# step3_rules
if ( s/icate$/ic/ || s/ative$// || s/alize$/al/ ||
s/iciti$/ic/ || s/ical$/ic/ || s/ful$// ||
s/ness$//
) {
my ($l,$m) = ($`,$&);
#DEBUG warn "step 3: l=$l m=$m\n";
$_ = $l.$m unless $l =~ /[^aeiou][aeiouy]/;
}
# step4_rules
if ( s/al$// || s/ance$// || s/ence$// || s/er$// ||
s/ic$// || s/able$// || s/ible$// || s/ant$// ||
s/ement$// || s/ment$// || s/ent$// || s/sion$/s/ ||
s/tion$/t/ || s/ou$// || s/ism$// || s/ate$// ||
s/iti$// || s/ous$// || s/ive$// || s/ize$// ||
s/ise$//
) {
my ($l,$m) = ($`,$&);
# Look for two consonant/vowel transitions
# NB simplified...
#DEBUG warn "step 4: l=$l m=$m\n";
$_ = $l.$m unless $l =~ /[^aeiou][aeiouy].*[^aeiou][aeiouy]/;
}
# step5a_rules
#DEBUG warn("step 5a: $_\n") &&
s/e$// if ( /[^aeiou][aeiouy].*[^aeiou][aeiouy].*e$/ ||
( /[aeiou][^aeiouy].*e/ && ! /[^aeiou][aeiouy][^aeiouwxy]e$/) );
# step5b_rules
#DEBUG warn("step 5b: $_\n") &&
s/ll$/l/ if /[^aeiou][aeiouy].*[^aeiou][aeiouy].*ll$/;
# Cosmetic step
s/(.)i$/$1y/;
}
@parms;
}
1;
__END__
=head1 NAME
Text::English - Porter's stemming algorithm
=head1 SYNOPSIS
use Text::English;
@stems = Text::English::stem( @words );
=head1 DESCRIPTION
This routine applies the Porter Stemming Algorithm to its parameters,
returning the stemmed words.
It is derived from the C program "stemmer.c"
as found in freewais and elsewhere, which contains these notes:
Purpose: Implementation of the Porter stemming algorithm documented
in: Porter, M.F., "An Algorithm For Suffix Stripping,"
Program 14 (3), July 1980, pp. 130-137.
Provenance: Written by B. Frakes and C. Cox, 1986.
I have re-interpreted areas that use Frakes and Cox's "WordSize"
function. My version may misbehave on short words starting with "y",
but I can't think of any examples.
The step numbers correspond to Frakes and Cox, and are probably in
Porter's article (which I've not seen).
Porter's algorithm still has rough spots (e.g current/currency, -ings words),
which I've not attempted to cure, although I have added
support for the British -ise suffix.
=head1 NOTES
This is version 0.1. I would welcome feedback, especially improvements
to the punctuation-stripping step.
=head1 AUTHOR
Ian Phillipps <ian@unipalm.pipex.com>
=head1 COPYRIGHT
Copyright Public IP Exchange Ltd (PIPEX).
Available for use under the same terms as perl.
=cut
|