/usr/share/perl5/Text/Context.pm is in libtext-context-perl 3.7-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 | package Text::Context;
use strict;
use warnings;
use UNIVERSAL::require;
our $VERSION = "3.7";
=head1 NAME
Text::Context - Handle highlighting search result context snippets
=head1 SYNOPSIS
use Text::Context;
my $snippet = Text::Context->new($text, @keywords);
$snippet->keywords("foo", "bar"); # In case you change your mind
print $snippet->as_html;
print $snippet->as_text;
=head1 DESCRIPTION
Given a piece of text and some search terms, produces an object
which locates the search terms in the message, extracts a reasonable-length
string containing all the search terms, and optionally dumps the string out
as HTML text with the search terms highlighted in bold.
=head2 new
Creates a new snippet object for holding and formatting context for
search terms.
=cut
sub new {
my ($class, $text, @keywords) = @_;
my $self = bless { text => $text, keywords => [] }, $class;
$self->keywords(@keywords);
return $self;
}
=head2 keywords
Accessor method to get/set keywords. As the context search is done
case-insensitively, the keywords will be lower-cased.
=cut
sub keywords {
my ($self, @keywords) = @_;
$self->{keywords} = [ map { s/\s+/ /g; lc $_ } @keywords ] if @keywords;
return @{ $self->{keywords} };
}
=begin maintenance
=head2 prepare_text
Turns the text into a set of Paragraph objects, collapsing multiple
spaces in the text and feeding the paragraphs, in order, onto the
C<text_a> member.
=head2 para_class
The Paragraph class to use. This defaults to 'Text::Context::Para'
=end maintenance
=cut
sub para_class { "Text::Context::Para" }
sub prepare_text {
my $self = shift;
my @paras = split /\n\n/, $self->{text};
for (0 .. $#paras) {
my $x = $paras[$_];
$x =~ s/\s+/ /g;
$self->para_class->require;
push @{ $self->{text_a} }, $self->para_class->new($x, $_);
}
}
=begin maintenance
=head2 permute_keywords
This is very clever. To determine which keywords "apply" to a given
paragraph, we first produce a set of all possible keyword sets. For
instance, given "a", "b" and "c", we want to produce
a b c
a b
a c
a
b c
b
c
We do this by counting in binary, and then mapping the counts onto
keywords.
=end maintenance
=cut
sub permute_keywords {
my $self = shift;
my @permutation;
for my $bitstring (1 .. (2**@{ $self->{keywords} }) - 1) {
my @thisperm;
for my $bitmask (0 .. @{ $self->{keywords} } - 1) {
push @thisperm, $self->{keywords}[$bitmask]
if $bitstring & 2**$bitmask;
}
push @permutation, \@thisperm;
}
return reverse @permutation;
}
=begin maintenance
=head2 score_para / get_appropriate_paras
Now we want to find a "score" for this paragraph, finding the best set
of keywords which "apply" to it. We favour keyword sets which have a
large number of matches (obviously a paragraph is better if it matches
"a" and "c" than if it just matches "a") and with multi-word keywords.
(A paragraph which matches "fresh cheese sandwiches" en bloc is worth
picking out, even if it has no other matches.)
=end maintenance
=cut
sub score_para {
my ($self, $para) = @_;
my $content = $para->{content};
my %matches;
# Do all the matching of keywords in advance of the boring
# permutation bit
for my $word (@{ $self->{keywords} }) {
my $word_score = 0;
$word_score += 1 + ($content =~ tr/ / /) if $content =~ /\b\Q$word\E\b/i;
$matches{$word} = $word_score;
}
#XXX : Possible optimization: Give up if there are no matches
for my $wordset ($self->permute_keywords) {
my $this_score = 0;
$this_score += $matches{$_} for @$wordset;
$para->{scoretable}[$this_score] = $wordset if $this_score > @$wordset;
}
$para->{final_score} = $#{ $para->{scoretable} };
}
sub _set_intersection {
my %union;
my %isect;
for (@_) { $union{$_}++ && ($isect{$_} = $_) }
return values %isect;
}
sub _set_difference {
my ($a, $b) = @_;
my %seen;
@seen{@$b} = ();
return grep { !exists $seen{$_} } @$a;
}
sub get_appropriate_paras {
my $self = shift;
my @app_paras;
my @keywords = @{ $self->{keywords} };
my @paras =
sort { $b->{final_score} <=> $a->{final_score} } @{ $self->{text_a} };
for my $para (@paras) {
my @words = _set_intersection($para->best_keywords, @keywords);
if (@words) {
@keywords = _set_difference(\@keywords, \@words);
$para->{marked_words} = \@words;
push @app_paras, $para;
last if !@keywords;
}
}
$self->{app_paras} = [ sort { $a->{order} <=> $b->{order} } @app_paras ];
return @{ $self->{app_paras} };
}
=head2 paras
@paras = $self->paras($maxlen)
Return shortened paragraphs to fit together into a snippet of at most
C<$maxlen> characters.
=cut
sub paras {
my $self = shift;
my $max_len = shift || 80;
$self->prepare_text;
$self->score_para($_) for @{ $self->{text_a} };
my @paras = $self->get_appropriate_paras;
return unless @paras;
# XXX: Algorithm may get better here by considering number of marked
# up words as weight
return map { $_->slim($max_len / @paras) } $self->get_appropriate_paras;
}
=head2 as_text
Calculates a "representative" string which contains
the given search terms. If there's lots and lots of context between the
terms, it's replaced with an ellipsis.
=cut
sub as_text {
return join " ... ", map { $_->as_text } $_[0]->paras;
}
=head2 as_html([ start => "<some tag>", end => "<some end tag>" ])
Markup the snippet as a HTML string using the specified delimiters or
with a default set of delimiters (C<E<lt>span class="quoted"E<gt>>).
=cut
sub as_html {
my $self = shift;
my %args = @_;
my ($start, $end) = @args{qw(start end)};
return join " ... ", map { $_->marked_up($start, $end) } $self->paras;
}
=head1 AUTHOR
Original author: Simon Cozens
Current maintainer: Tony Bowden
=head1 BUGS and QUERIES
Please direct all correspondence regarding this module to:
bug-Text-Context@rt.cpan.org
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2002-2005 Kasei
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License; either version
2 of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
=cut
1;
|