This file is indexed.

/usr/share/httpry/plugins/tokenize.pm is in httpry-tools 0.1.8-1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
#
#  ----------------------------------------------------
#  httpry - HTTP logging and information retrieval tool
#  ----------------------------------------------------
#
#  Copyright (c) 2005-2014 Jason Bittel <jason.bittel@gmail.com>
#

package tokenize;

use strict;
use warnings;

# -----------------------------------------------------------------------------
# GLOBAL VARIABLES
# -----------------------------------------------------------------------------

# Per-client term counts: $terms{<source ip>}->{<term>} = occurrences.
# Filled in by main() and written out by end().
my %terms = ();

# Package globals that are read by the subs below but never assigned at file
# level here; _load_config() requires the plugin's .cfg file and then applies
# a default to $output_dir.
# NOTE(review): presumably the config file is what populates both of these -
# confirm against the shipped config.
our $output_dir; # Directory the terms_<ip>.txt files are written to
our %stopwords;  # Terms to skip; only the existence of a key is checked

# -----------------------------------------------------------------------------
# Plugin core
# -----------------------------------------------------------------------------

# Runs at load time. register_plugin() lives in package main, outside this
# file; presumably this is how the host script learns about the plugin.
main::register_plugin();

# Constructor: returns a new, empty plugin object (a blessed hash ref).
#
# May be called as a class method, on an existing instance, or as a plain
# function with no arguments. The two-argument form of bless is used so that
# a subclass inheriting new() gets an object of its own class instead of one
# hard-wired to this package.
sub new {
        my $class = shift;

        # Reduce an instance to its class name, and fall back to this package
        # when no invocant was passed at all.
        $class = ref($class) || $class || __PACKAGE__;

        return bless {}, $class;
}

# Plugin initialization hook.
#
# Receives the directory to search for this plugin's config file (second
# argument, after the invocant) and passes it to _load_config(), which dies
# when no config file exists there. Returns nothing.
sub init {
        my ($self, $cfg_dir) = @_;

        _load_config($cfg_dir);

        return;
}

# Returns the names of the record fields that main() reads from each record.
sub list {
        return ('source-ip', 'host', 'request-uri');
}

# Per-record callback: split the host and the decoded request URI of one
# record into terms and count each term against the record's source IP in
# %terms.
#
# Arguments:
#   $self   - plugin object (not used here)
#   $record - hash ref; the 'source-ip', 'host' and 'request-uri' keys are read
#
# Records whose 'source-ip' is missing or is not a dotted quad are ignored.
# Terms of one or two characters, purely numeric terms and terms present in
# %stopwords are not counted. Returns nothing.
sub main {
        my $self = shift;
        my $record = shift;

        # The address later becomes part of an output file name in end(), so
        # the match covers the whole string (\A ... \z rather than ^ ... $,
        # which would tolerate a trailing newline) and allows ASCII digits only.
        my $source_ip = $record->{'source-ip'};
        return unless (defined $source_ip);
        return unless $source_ip =~ /\A[0-9]+(?:\.[0-9]+){3}\z/;

        my $host = defined($record->{'host'}) ? $record->{'host'} : '';
        my $decoded_uri = defined($record->{'request-uri'}) ? $record->{'request-uri'} : '';

        # Collapse repeated encodings of '%' (%25, %2525, ...) down to a bare
        # '%' first, so multiply-encoded escapes such as %2520 are decoded by
        # the hex pass below as well.
        $decoded_uri =~ s/%(?:25)+/%/g;
        $decoded_uri =~ s/%([a-fA-F0-9][a-fA-F0-9])/chr(hex($1))/eg;

        # Host and URI are joined with a space. Without a separator, a URI
        # that does not begin with a delimiter (an absolute URI or a
        # "host:port" CONNECT target) would fuse the last host label with the
        # first URI token, e.g. "comhttp".
        foreach my $term (split /[^A-Za-z0-9]/, "$host $decoded_uri") {
                next if (length($term) <= 2); # Also skips empty fields
                next if $term =~ /^\d+$/; # Ignore numbers
                next if (exists $stopwords{$term});

                $terms{$source_ip}->{$term}++;
        }

        return;
}

# End-of-run callback: write the collected terms to disk, one file per
# source IP, named "$output_dir/terms_<ip>.txt".
#
# Each line of a file holds one term repeated as many times as it was
# counted, every repetition followed by a single space. Lines are written in
# sorted term order so the output is reproducible between runs.
#
# Dies if a file cannot be opened, written or closed. Returns nothing.
sub end {
        # TODO: This could use more control over the output style and format
        foreach my $ip (keys %terms) {
                my $path = "$output_dir/terms_$ip.txt";

                # Three-argument open with a lexical handle: the mode cannot
                # be altered by the path, and no global handle is left behind.
                open(my $out, '>', $path) or die "Cannot open $path: $!\n";

                foreach my $term (sort keys %{ $terms{$ip} }) {
                        print {$out} "$term " x $terms{$ip}->{$term}, "\n"
                                or die "Cannot write $path: $!\n";
                }

                # Buffered write errors only surface here, so close is checked
                close($out) or die "Cannot close $path: $!\n";
        }

        return;
}

# -----------------------------------------------------------------------------
# Load config file and check for required options
# -----------------------------------------------------------------------------
# Loads "<cfg_dir>/<package name>.cfg" with require and normalizes
# $output_dir afterwards.
#
# Arguments:
#   $cfg_dir - directory expected to contain the config file
#
# Dies with "No config file found" if the file does not exist; require itself
# dies if the file fails to compile or does not return a true value.
# Returns nothing.
sub _load_config {
        my $cfg_dir = shift;

        require File::Spec;

        # Load config file; by default in same directory as plugin.
        # The path is made absolute first: require searches @INC for any
        # path that does not start with "/", "./" or "../", so a relative
        # directory such as "plugins" could pass the -e test and still fail
        # to load (the current directory is not in @INC on Perl 5.26+).
        my $cfg_file = File::Spec->rel2abs("$cfg_dir/" . __PACKAGE__ . ".cfg");

        if (-e $cfg_file) {
                require $cfg_file;
        } else {
                die "No config file found\n";
        }

        # Default to the current directory when no output directory is set
        $output_dir = "." if (!$output_dir);
        $output_dir =~ s/\/$//; # Remove trailing slash

        return;
}

1;