/usr/share/httpry/plugins/tokenize.pm is in httpry-tools 0.1.7-3.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
#
# ----------------------------------------------------
# httpry - HTTP logging and information retrieval tool
# ----------------------------------------------------
#
# Copyright (c) 2005-2012 Jason Bittel <jason.bittel@gmail.com>
#
package tokenize;
# -----------------------------------------------------------------------------
# GLOBAL VARIABLES
# -----------------------------------------------------------------------------
# Term counts keyed first by source IP and then by term:
# $terms{$ip}->{$term} is the number of times $term was seen for $ip.
# Filled in by main() and written out by end().
my %terms = ();
# -----------------------------------------------------------------------------
# Plugin core
# -----------------------------------------------------------------------------
# Executed when this file is loaded. register_plugin() is defined in package
# main (the hosting script), not in this file; presumably it records this
# package as an available plugin -- confirm against the caller.
main::register_plugin();
# Constructor for the plugin object.
#
# May be called as a class method (tokenize->new), on an existing instance
# ($obj->new) or as a plain function (tokenize::new()); in the last case the
# object is blessed into this package.
#
# Returns a new, empty, blessed hash reference.
sub new {
    my $invocant = shift;

    # Use two-argument bless so that a subclass calling this constructor gets
    # an object of its own class instead of one hard-wired to this package.
    my $class = ref($invocant) || $invocant || __PACKAGE__;

    return bless {}, $class;
}
# Plugin initialisation hook.
#
#   $self    - the plugin object (not used here)
#   $cfg_dir - directory handed on to _load_config()
#
# Returns nothing; any failure is raised by _load_config().
sub init {
    my ($self, $cfg_dir) = @_;

    _load_config($cfg_dir);

    return;
}
# Returns the names of the record fields that main() reads:
# the client address, the Host header and the request URI.
sub list {
    return ('source-ip', 'host', 'request-uri');
}
# Per-record hook: split the host and the decoded request URI of one record
# into terms and count them for the record's source IP.
#
#   $self   - the plugin object (not used here)
#   $record - hash reference; the keys 'source-ip', 'host' and 'request-uri'
#             are read
#
# Records whose 'source-ip' is not in dotted-quad form are ignored.
# Counts are accumulated in %terms; terms listed in %stopwords (a package
# variable not defined in this file -- presumably supplied by the config
# file) are skipped.
#
# Returns nothing.
sub main {
    my ($self, $record) = @_;

    my $ip = $record->{'source-ip'};
    return unless $ip =~ /^(?:\d+)(?:\.\d+){3}$/;

    my $decoded_uri = $record->{"request-uri"};

    # Collapse repeated encodings of '%' ("%2520" -> "%20") first, so that
    # the single decoding pass below also handles double-encoded input.
    $decoded_uri =~ s/%(?:25)+/%/g;
    $decoded_uri =~ s/%([a-fA-F0-9][a-fA-F0-9])/chr(hex($1))/eg;

    # Keep host and URI apart with a space: an absolute-form URI such as
    # "http://example.com/" starts with a letter, and plain concatenation
    # would fuse the last host label with it into one bogus term.
    foreach my $term (split /[^A-Za-z0-9]/, "$record->{'host'} $decoded_uri") {
        next if (length($term) <= 2);       # Also drops empty split fields
        next if $term =~ /^\d+$/;           # Ignore numbers
        next if (exists $stopwords{$term});

        $terms{$ip}->{$term}++;
    }

    return;
}
# Final hook: write the collected terms to one file per source IP.
#
# For every IP in %terms the file "$output_dir/terms_<ip>.txt" is created
# (or truncated). Each term gets one line on which the term, followed by a
# space, is repeated as many times as it was counted. IPs and terms are
# written in sorted order so that repeated runs give identical files.
#
# $output_dir is a package variable prepared by _load_config().
#
# Dies if a file cannot be opened or closed. Returns nothing.
sub end {
    # TODO: This could use more control over the output style and format
    foreach my $ip (sort keys %terms) {
        my $path = "$output_dir/terms_$ip.txt";

        # Three-argument open with a lexical handle: the path is never
        # parsed for mode characters and the handle cannot leak to, or be
        # clobbered by, other code using a global OUTFILE.
        open my $out, '>', $path or die "Cannot open $path: $!\n";

        foreach my $term (sort keys %{ $terms{$ip} }) {
            print {$out} "$term " x $terms{$ip}->{$term}, "\n";
        }

        # Buffered write errors only show up here, so close is checked too.
        close $out or die "Cannot close $path: $!\n";
    }

    return;
}
# -----------------------------------------------------------------------------
# Load config file and check for required options
# -----------------------------------------------------------------------------
sub _load_config {
    # $cfg_dir - directory that holds "<package name>.cfg"; may be absolute
    #            or relative to the current working directory.
    #
    # Dies with "No config file found\n" if the file does not exist; require
    # raises its own error if the file cannot be compiled or does not return
    # a true value.
    #
    # Afterwards $output_dir (a package variable, presumably assigned by the
    # config file) defaults to "." when unset and has no trailing slash.
    my $cfg_dir = shift;

    # Resolve to an absolute path. "-e" tests a relative path against the
    # current directory, but require would look a path such as
    # "plugins/tokenize.cfg" up in @INC instead, which no longer contains "."
    # on Perl 5.26 and later.
    require File::Spec;
    my $cfg_file = File::Spec->rel2abs("$cfg_dir/" . __PACKAGE__ . ".cfg");

    # Load config file; by default in same directory as plugin
    if (-e $cfg_file) {
        require $cfg_file;
    } else {
        die "No config file found\n";
    }

    $output_dir = "." if (!$output_dir);
    $output_dir =~ s/\/$//; # Remove trailing slash

    return;
}
1;
|