This file is indexed.

/usr/share/perl5/Lire/W3CExtendedLog.pm is in lire 2:2.1.1-2.1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
package Lire::W3CExtendedLog;

use strict;

use base qw/ Exporter /;
use vars qw/ @EXPORT_OK %DEFAULTS_TYPE2REGEX %DEFAULTS_IDENTIFIER2TYPE /;

use Exporter;
use Time::Local;
use Lire::Logger;

BEGIN {
    @EXPORT_OK = qw/ w3c_time uri_decode string_decode /;

    # Maps W3C types to regex
    %DEFAULTS_TYPE2REGEX =
      (
       'integer'	=> '(\d+|-)',
       'fixed'	=> '(\d+(?:\.\d+)?|-)',
       'uri'	=> '(\S+)',
       'date'	=> '(\d\d\d\d-\d\d-\d\d)',
       'time'	=> '(\d\d:\d\d(?::\d\d(?:\.\d+)?)?)',

       # Match anything beween a starting " and another followed by a space
       # (Embedded " are doubled) -> "String with a "" in it"
       'string'	=> '"((?:[^"]|"")*)"',
       'name'	=> '([-_.0-9a-zA-Z]+)',  # This also match invalid domain names
       'address'	=> '(\d+\.\d+\.\d+\.\d+|-)',
      );

    # Maps identifier to type
    %DEFAULTS_IDENTIFIER2TYPE =
      (
       'count'      => 'integer',
       'time-from'  => 'fixed',
       'time-to'    => 'fixed',
       'interval'   => 'integer',
       'ip'	    => 'address',
       'dns'	    => 'name',
       'status'     => 'integer',
       'comment'    => 'string',
       'uri'	    => 'uri',
       'uri-stem'   => 'uri',
       'uri-query'  => 'uri',
       'method'     => 'name',
       'username'   => 'uri',
       'date'	    => 'date',
       'time'	    => 'time',
       'port'	    => 'integer',
       'time-taken' => 'fixed',
       'bytes'      => 'integer',
       'cached'     => 'integer',
      );
}

my $debug = 0;
sub debug {
    $debug and lr_debug($_[0]);
}

########################################################################
#			  UTILITY FUNCTIONS
########################################################################

sub uri_decode {
    $_[0] =~ s/\%(..)/hex( '0x' . $1)/ge;
    $_[0] =~ tr/+/ /;
}

sub string_decode {
    # Transform "" into "
    $_[0] =~ tr/"/"/s;
}

sub w3c_time {
    my ( $date, $time ) = @_;

    my ($year, $month, $mday ) = $date =~ /^(\d\d\d\d)-(\d\d)-(\d\d)$/
      or die "invalid date string: $date\n";

    my ($hours, $min, $sec, $msec ) =
      $time =~ /^(\d\d):(\d\d)(?::(\d\d)(?:\.(\d+))?)?$/
      or die "invalid time string: $time\n";

    $month--;
    $year -= 1900;

    # Yes, W3C Extended Log time are in UTC time.
    return timegm( $sec || 0, $min, $hours, $mday, $month, $year );
}

########################################################################
#			    PARSER METHODS
########################################################################

sub new {
    my $self = shift;
    my $class = ref($self) || $self;
    bless $self = 
      {
       # Copy in attributes so that they can be overidden
       'type2regex'	=> { %DEFAULTS_TYPE2REGEX },
       'identifier2type'	=> { %DEFAULTS_IDENTIFIER2TYPE },
       'log_date'		=> undef,
       'log_time'		=> undef,
       'version'		=> undef,
       'sofware'		=> undef,
       'fields'		=> undef,
       'is_iis'		=> undef,
       'in_header'	=> 1,
       'skip_to_next_header' => 0,
      }, $class;

    return $self;
}

sub field2re {
    my ( $self, $field ) = @_;

    if ( $field =~ /^\w+\(.*\)$/ ) {
	# Header
	if ( $self->{'is_iis'} ) {
	    return $self->{'type2regex'}{'uri'};
	} else {
	    return $self->{'type2regex'}{'string'};
	}
    } else {
        # csrx are prefixes used in field identifiers, cf W3C specs
	my ( $identifier ) = $field =~ /^(?:[csrx]{1,2}-)?(.*)$/;

	my $type = $self->{'identifier2type'}{$identifier};
	unless ( defined $type ) {
	    lr_warn( "identifier '$identifier' (constructed from '$field') " .
              "not found in identifier2type property. Will use uri type" );
            $debug and do {
                debug( "dumping identifier2type hash" );
                while ((my $k, my $v) = each %{ $self->{'identifier2type'} }) {
                    if (defined $v) {
                        debug( "identifier2type{'$k'} = '$v'" );
                    } else {
                        debug( "identifier2type{'$k'} undefined" );
                    }
                }
            };
	    return $self->{'type2regex'}{'uri'};
	} else {
	    if ( $type eq 'string' && $self->{'is_iis'} ) {
		return $self->{'type2regex'}{'uri'};
	    } elsif (defined $self->{'type2regex'}{$type}) {
                return $self->{'type2regex'}{$type};
            } else {
                lr_warn( "unknown type: $type.  Will return uri type" );
                return $self->{'type2regex'}{'uri'};
	    }
	}
    }
}

sub field2decoder {
    my ( $self, $field ) = @_;

    if ( $field =~ /^\w+\(.*\)$/ ) {
	# Header
	if ( $self->{'is_iis'} ) {
	    return \&uri_decode;
	} else {
	    return \&string_decode;
	}
    } else {
	my ( $identifier ) = $field =~ /^(?:[csrx]{1,2}-)?(.*)$/;

	my $type = $self->{'identifier2type'}{$identifier};
	return undef unless defined $type;

	if ( $type eq 'string' && $self->{'is_iis'} ) {
	    return \&uri_decode;
	} elsif ( $type eq 'string' ) {
	    return \&string_decode;
	}
	return undef;
    }
}

sub build_parser {
    my ( $self ) = @_;
    my @fields = split /\s+/, $self->{'fields'};

    # Quick check
    die( "we don't support aggregated records: $self->{'fields'}\n" )
      if $fields[0] eq 'count';

    my @re = ();

    my %decoders = ();
    foreach my $f ( @fields ) {
	if (defined $self->field2re( $f )) {
            push @re, $self->field2re( $f );
        } else {
            die( "internal inconsistency: field2re undefined in '$f'. " .
                "aborting.\n")
        }
	my $decoder = $self->field2decoder( $f );
	$decoders{$f} = $decoder if $decoder;
    }
    my $sep = $self->{'tab_sep'} ? '\t' : '\s+';
    my $re = "^" . join ( $sep, @re) . "\$";
    lr_info( "will use $re as lexer" );
    lr_info( "to parse fields: ", join ", ", @fields );
    my $rx = qr/$re/;

    $self->{'w3c_parser'} = sub {
	# Remove potential extra CR of DOS line ending
	$_[0] =~ s/\r?\n?$//; 

	my @rec = $_[0] =~ /$rx/
	  or die "lexer failed\n";

	my $i = 0;
	my %w3c = map { $fields[$i++] => $_ } @rec;

	# Decode fields
	foreach my $name ( keys %decoders ) {
	    $decoders{$name}->( $w3c{$name} );
	}

	# Decode timestamp
	my $date = $w3c{'date'} || $self->{'log_date'};
	if ( defined $date  && exists $w3c{'time'} ) {
	    $w3c{'lire_time'} = w3c_time( $date, $w3c{'time'} );
	}

	return \%w3c;
    }
}

sub parse_directive {
    my ( $self, $line ) = @_;
    $line =~ s/\r?\n?$//; # To remove potential extra CR of DOS line ending

    die "not a directive line: $line\n"
      unless $line =~ /^#/;

    my ( $directive, $text ) = $line =~ /^#(\w+): (.*)/
      or die( "error parsing directive: $line\n" );

  SWITCH:
    for ( $directive ) {
	/^Version$/i && do {
	    $self->{'version'} = $text + 0;
	    die( "Unsupported format version: $text != 1.0\n" )
	      unless $self->{'version'} == 1;

	    lr_info( "W3C Extended Log Format ", $text );
	    last SWITCH;
	};
	/^Fields$/i && do {
	    if ( $self->{'fields'} ) {
		if ( $self->{'fields'} ne $text ) {
		    $self->{'skip_to_next_header'} = 1;
		    lr_warn( "we don't support switching Fields directive: $text.Ignoring those log records\n");
		}

		last SWITCH;
	    }
	    $self->{'fields'} = $text;
            # this is reported to occur in MS ISA logs (the one IIS logfiles
            # we've seen use space as separator in both Fields and the log
            # itself.  Unfortunately, this logfile does _not_ escape spaces
            # in its values :(  )
	    $self->{'tab_sep'} = $text =~ /\t/;
	    lr_info( "found tabcharacter in Fields declaration; therefore " .
              "using tab as field separator and allowing non-escaped " .
              "spaces" ) if $self->{'tab_sep'};
	    $self->build_parser();
	    last SWITCH;
	};
	/^Software$/i && do {
	    lr_info( "Log generated by $text" );
	    $self->{'is_iis'} = $text =~ /Microsoft Internet/;
	    lr_info( "activating IIS specific support" )
	      if $self->{'is_iis'};
	    last SWITCH;
	};
	/^Date$/ && do {
	    my $date_re = $self->{'type2regex'}{'date'};
	    my $time_re = $self->{'type2regex'}{'time'};
	    ($self->{'log_date'}, $self->{'log_time'} ) =
	      $text =~ /$date_re $time_re/o;
	    if ( defined $self->{'log_date'} && defined $self->{'log_time'} ) {
		lr_info( "Date: ", $text );
	    } else {
		lr_warn( "Invalid date directive: $text" );
	    }
	    last SWITCH;
	};
	/^(Start-Date|End-Date|Date|Remark)$/ && do {
	    # Ignore those directive
	    lr_info( $directive, ": ", $text );
	    last SWITCH;
	};
	/^SubComponent$/ && do  {
	    lr_info( "ignoring log of SubComponent $text" );
	    $self->{'skip_to_next_header'} = 1;
	    last SWITCH;
	};

	# Defaults
	lr_warn( "unknown directive: $line" );
    }
}

sub parse {
    my ($self, $line) = @_;

    if ( $line =~ /^#/ ) {
	if ( $self->{'skip_to_next_header'} && ! $self->{'in_header'} ) {
	    $self->{'skip_to_next_header'} = 0;
	}
	$self->{'in_header'} = 1;
	$self->parse_directive( $line );
    } elsif ( $self->{'version'} && $self->{'fields'} ) {
	$self->{'in_header'} = 0;
	return undef 
	  if $self->{'skip_to_next_header'};

	$self->parse_record( $line );
    } else {
	lr_err( "invalid W3C extended log file: must start by Version and Fields directives" );
    }
}

sub parse_record {
    $_[0]->{'w3c_parser'}->( $_[1] );
}

1;

__END__

=pod

=head1 NAME

Lire::W3CExtendedLog - Base implementation of a W3C Extended Log parser

=head1 SYNOPSIS

use Lire::W3CExtendedLog;

my $parser = new Lire::W3CExtendedLog;

my $w3c_rec = $parser->parse( $line );

=head1 DESCRIPTION

This module defines objects able to parse W3C Extended Log Format.
This log format is defined at http://www.w3.org/TR/WD-logfile.html

All attributes of the created object can be overriden by e.g. modules extending
the object.  The attributes are:

=head2 type2regex

I<type2regex> is a hash containing key-value pairs like

 'name' => '([-_.0-9a-zA-Z]+)'

Keys are all data formats for log file field entries as defined in the W3C
specification: 'integer', 'fixed', 'uri', 'date', 'time' and 'string', along
with 'name' and 'address' types.

=head2 identifier2type

I<identifier2type> is a hash containing key-value pairs like

 'dns'        => 'name',
 'uri-query'  => 'uri',
 'ip'         =>

Keys are the W3C defined Field identifiers, with their prefixes stripped off.

=head2 field2re

I<field2re> is subroutine; when called as

 $self->{field2re('c-ip')}

it will return e.g.

 '(\d+\.\d+\.\d+\.\d+|-)'

Arguments are as found in the Fields directive, so, in an ideal world, should
be identifiers.  It uses I<type2regex>.

=head2 field2decoder

I<field2decoder> is a subroutine; it returns one of I<\&uri_decode> ,
I<\&string_decode> or I<undef>, depending on, a.o., I<is_iis>.  It is
used by I<build_parser>.

=head2 parse

I<parse> is the preferred interface to this module.  It expects a line as its
argument, and returns a reference to a hash (like I<&w3c_parser>), or executes
I<&parse_directive>.

=head2 parse_directive

I<parse_directive> expects a directive in its argument, it fills the object.

=head2 w3c_parser

I<w3c_parser> is a subroutine; it expects a logline as argument, and returns a
reference to a hash, mapping $self->{'fields'} entries to their decoded values.
It uses the I<&field2re> and I<&field2decoder> routines.  It is build in
build_parser.

=head2 build_parser

I<build_parser> is a subroutine, it builds and returns I<&w3c_parser>.
It is called in I<&parse_directive>.

=head2 log_date and log_time

I<log_date> and I<log_time> contain strings constructed from the Date
directive.

=head2 version and sofware

I<version> and I<software> contain strings constructed from the Version and
Software directives, respectively.

=head2 fields

I<fields> contains the entire string from the Fields directive.

=head2 is_iis

I<is_iis> is set in case the Software directive contains 'Microsoft Internet'
as a substring.  It is used to enable IIS specific support.

=head2 tab_sep

I<tab_sep> is set in case tabs are found in the Fields directive.  We assume
these will be used in the log itself too, and allow unescaped spaces in the
log.



Summarizing:

 &parse --calls--> &parse_directive
        `--calls--> &w3c_parser

 &parse_directive --calls--> &build_parser

 &build_parser --calls--> &field2decoder
              `--calls--> &field2re
              `--returns--> &w3c_parser

 &field2decoder --returns--> &uri_decode, &string_decode

 &field2re --uses--> %type2regex
           `--uses--> %identifier2type


=head1 BUILDING INHERITING MODULES

FIXME .  Needs to be written.  Steal from w3c_extended2dlf's
Lire::WWW::ExtendedLog, which ISA Lire::W3CExtendedLog.

=head1 SEE ALSO

w3c_extended2dlf(1), ms_isa2dlf(1)

=head1 AUTHOR

  Francis J. Lacoste <flacoste@logreport.org>

=head1 VERSION

$Id: W3CExtendedLog.pm,v 1.18 2006/07/23 13:16:30 vanbaal Exp $

=head1 COPYRIGHT

Copyright (C) 2001-2002 Stichting LogReport Foundation LogReport@LogReport.org

This file is part of Lire.

Lire is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program (see COPYING); if not, check with
http://www.gnu.org/copyleft/gpl.html. 

=cut