/usr/share/perl5/WebService/CIA/Parser.pm is in libwebservice-cia-perl 1.4-3.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
package WebService::CIA::Parser;
require 5.005_62;
use strict;
use warnings;
use WebService::CIA;
our $VERSION = '1.4';
# Constructor: returns a new, empty WebService::CIA::Parser object.
#
# May be called on a class name or on an existing instance; in the latter
# case the new object is blessed into the instance's class. Any additional
# arguments are ignored.
sub new {
    my ($invocant) = @_;

    # Accept both Class->new and $object->new.
    my $class = ref($invocant) || $invocant;

    return bless {}, $class;
}
# Extracts field/value pairs from the HTML of a CIA World Factbook country
# page.
#
# Arguments:
#   $cc   - the country code, interpolated into the generated URLs
#   $html - the HTML source of the page to parse
#
# Returns a hashref. It always contains the four generated entries "URL",
# "URL - Print", "URL - Flag" and "URL - Map" (built from
# $WebService::CIA::base_url and $cc), plus one entry per
# category / category_data pair of <div> elements found in $html.
#
# Field names and values have runs of whitespace collapsed to a single space
# and are trimmed. In values, <br> tags (in any case, including the
# self-closing forms <br/> and <br />) become newlines and every other tag
# is removed. If $html is undefined only the four URL entries are returned.
sub parse {
    my ($self, $cc, $html) = @_;

    my $base = $WebService::CIA::base_url;
    my $data = {
        'URL - Flag'  => $base . 'graphics/flags/large/' . $cc . '-lgflag.gif',
        'URL - Map'   => $base . 'graphics/maps/' . $cc . '-map.gif',
        'URL'         => $base . 'geos/' . $cc . '.html',
        'URL - Print' => $base . 'geos/countrytemplate_' . $cc . '.html'
    };

    # Nothing to scan: hand back just the generated URLs.
    return $data unless defined $html;

    while ($html =~ m{
            <div\s+class="category".*?>\s*   # opening tag of the field-name div
            (?:<a\s[^>]+?>)?                 # the name may be wrapped in a link
            (.+?)                            # capture 1 - field name
            (?::\s*</a>|</a>:|:)\s*          # colon inside or outside the link
            </div>.*?
            <div\sclass="category_data">
            (.*?)                            # capture 2 - raw value markup
            </div>
        }xsg) {
        my ($field, $value) = ($1, $2);

        # Normalise whitespace in both captures: collapse runs, then trim.
        for my $text ($field, $value) {
            $text =~ s/\s+/ /g;
            $text =~ s/\A\s+//;
            $text =~ s/\s+\z//;
        }

        # Line breaks must be converted before the generic tag strip below,
        # otherwise they would be removed without leaving a newline.
        $value =~ s{\s*<br\s*/?>\s*}{\n}gi;

        # Strip every remaining tag. The quantifier belongs outside the
        # character class; inside it, only one-character tag names matched.
        $value =~ s{</?[^>]+>}{}g;

        $data->{$field} = $value;
    }

    return $data;
}
1;
__END__
=head1 NAME
WebService::CIA::Parser - Parse pages from the CIA World Factbook
=head1 SYNOPSIS
use WebService::CIA::Parser;
my $parser = WebService::CIA::Parser->new;
my $data = $parser->parse($country_code, $string);
=head1 DESCRIPTION
WebService::CIA::Parser takes a string of HTML and parses it. It will only give
sensible output if the string is the HTML for a page whose URL matches
C<https://www.cia.gov/library/publications/the-world-factbook/print/[a-z]{2}\.html>
This parsing is somewhat fragile, since it assumes a certain page structure.
It'll work just as long as the CIA don't choose to alter their pages.
=head1 METHODS
=over 4
=item C<new>
Creates a new WebService::CIA::Parser object. It takes no arguments.
=item C<parse($cc, $html)>
Parses a string of HTML taken from the CIA World Factbook. It takes the
country's two-letter Factbook code and the HTML string as its arguments and
returns a hashref of fields and values.
The values are stripped of all HTML. C<E<lt>brE<gt>> tags are replaced by
newlines.
It also creates four extra fields: "URL", "URL - Print", "URL - Flag", and
"URL - Map" which are the URLs of the country's Factbook page, the
printable version of that page, a GIF flag of the country, and a GIF map
of the country respectively.
=back
=head1 EXAMPLE
use WebService::CIA::Parser;
use LWP::Simple qw(get);
$html = get(
"https://www.cia.gov/library/publications/the-world-factbook/print/uk.html"
);
$parser = WebService::CIA::Parser->new;
$data = $parser->parse("uk", $html);
print $data->{"Population"};
=head1 AUTHOR
Ian Malpass (ian-cpan@indecorous.com)
=head1 COPYRIGHT
Copyright 2003-2007, Ian Malpass
This library is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.
The CIA World Factbook's copyright information page
(L<https://www.cia.gov/library/publications/the-world-factbook/docs/contributor_copyright.html>)
states:
The Factbook is in the public domain. Accordingly, it may be copied
freely without permission of the Central Intelligence Agency (CIA).
=head1 SEE ALSO
WebService::CIA
=cut
|