/usr/share/doc/libweb-scraper-perl/examples/scraper is in libweb-scraper-perl 0.38-1.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
#!/usr/bin/perl
use strict;
use warnings;
use Config;
use Term::ReadLine;
use Data::Dumper;
use HTML::Entities;
use URI;
use Web::Scraper;
use YAML;
# WARN - build a debugging callback.
#
# Returns a code reference that reads the current node from $_ and emits it
# through warn(): text nodes are entity-encoded from their XML form, every
# other node is serialised with as_HTML.  The empty prototype lets the sub be
# written as a bare WARN with no parentheses.
sub WARN() {
    return sub {
        my $node = $_;

        # Text nodes: escape the markup-significant characters.
        if ($node->isTextNode) {
            return warn HTML::Entities::encode($node->as_XML, q("'<>&));
        }

        # Anything else: dump the element as HTML, no indentation string.
        return warn $node->as_HTML(q('"&<>), "", {});
    };
}
# Output helper: sends its arguments through the command named in
# $ENV{PAGER} when that variable is set, and prints them directly otherwise.
# If the pager cannot be started the text is still printed directly instead
# of being lost on an unopened handle.
my $print = sub {
    my @chunks = @_;

    if ($ENV{PAGER}) {
        # Explicit '|-' mode keeps the command out of the mode string; a
        # single command string still allows values such as "less -R".
        if (open my $pager, '|-', $ENV{PAGER}) {
            print {$pager} @chunks;
            # Closing a piped handle waits for the child process to finish.
            close $pager;
            return;
        }
        warn "Cannot run pager '$ENV{PAGER}': $!\n";
    }

    print @chunks;
};
# File-level state shared with the subs below:
#   @stack  - statements typed at the prompt that evaluated without error
#             (filled by run_loop, read back when generating code)
#   $source - array ref describing where the input came from
#             (set by process_args, read by generate_code)
my(@stack, $source);
# Resolve the input from piped STDIN or from the first command-line argument;
# a false result means nothing usable was supplied, so print usage and stop.
my $stuff = process_args($ARGV[0])
or die "Usage: scraper [URI-or-filename]\n";
my $term = Term::ReadLine->new("Web::Scraper");
# The scraper block only hands its first argument to the interactive loop,
# together with the terminal object created above.
my $scraper = scraper { run_loop($_[0], $term) };
# NOTE(review): presumably loads proxy settings from the environment into the
# scraper's user agent - confirm against the user-agent documentation.
$scraper->user_agent->env_proxy;
# Starts the interactive session; $result is not read again in this file.
my $result = $scraper->scrape($stuff);
# process_args - decide what to scrape and record where it came from.
#
# Takes the first command-line argument (may be undef).  Sources are tried in
# this order:
#   1. STDIN, when it is not a terminal and yields true content
#      -> returns a reference to the content, $source = ['stdin']
#   2. an argument starting with http:// or https://
#      -> returns a URI object, $source = ['URI', $uri]
#   3. an argument naming an existing path
#      -> returns the file content as a string, $source = ['file', $uri]
# Returns nothing when none of these apply.  Dies if the file cannot be opened.
sub process_args {
    my $uri = shift;

    # Piped input wins over any argument.
    unless (-t STDIN) {
        my $content = join "", <STDIN>;
        if ($content) {
            $source = [ 'stdin' ];
            return \$content;
        }
    }

    # Without an argument there is nothing left to try.
    return unless $uri;

    if ($uri =~ m!^https?://!) {
        $source = [ "URI", $uri ];
        return URI->new($uri);
    }

    if (-e $uri) {
        $source = [ 'file', $uri ];
        open my $fh, "<", $uri or die "$uri: $!";
        return join "", <$fh>;
    }

    return;
}
# run_loop - interactive prompt run from inside the scraper block.
#
# Arguments:
#   $tree - the value the scraper block received as its first argument
#   $term - Term::ReadLine object used to read each line
#
# Commands:
#   d      dump the current result with Data::Dumper (via warn)
#   y      dump the current result as YAML (via warn)
#   s      show $tree as HTML through the $print helper
#   q      leave the loop
#   c      print a standalone script containing the last accepted statement
#   c all  print a standalone script containing every accepted statement
# Any other non-blank line is evaluated as Perl; it is appended to @stack
# only when it evaluates without error.  The loop also ends when readline
# returns undef.
#
# The lexicals $tree, $term and $in keep these exact names on purpose: code
# typed at the prompt is run with string eval and can see them.
sub run_loop {
    my($tree, $term) = @_;

    while (defined(my $in = $term->readline("scraper> "))) {
        # A blank line is not a statement; evaluating it would record an
        # empty entry that later shows up as a stray ";" in generated code.
        next if $in !~ /\S/;

        if ($in eq 'd') {
            # Localised so the setting does not leak into later Dumper calls.
            local $Data::Dumper::Indent = 1;
            warn Dumper(result());
        } elsif ($in eq 'y') {
            warn Dump(result());
        } elsif ($in eq 's') {
            $print->($tree->as_HTML(q('"&<>), " ", {}));
        } elsif ($in eq 'q') {
            return;
        } elsif ($in eq 'c') {
            # With no accepted statements yet, pass none rather than undef.
            print generate_code($source, @stack ? $stack[-1] : ());
        } elsif ($in =~ /^c\s+all\s*$/) {
            print generate_code($source, @stack);
        } else {
            # Assignment keeps the typed code in scalar context.
            my $res = eval $in;
            # Copy $@ at once so nothing run afterwards can overwrite it.
            my $error = $@;
            if ($error) {
                warn $error;
            } else {
                push @stack, $in;
            }
        }
    }

    return;
}
# generate_code - render a standalone Web::Scraper script as a string.
#
# Arguments:
#   $source - array ref whose first element is 'stdin', 'URI' or 'file';
#             for 'URI' and 'file' the second element is the location
#   @stack  - statements to place inside the scraper block; undefined
#             entries are skipped, and a trailing ";" is added where missing
#
# Returns the script text.  The location is written as an escaped
# double-quoted literal so characters such as @, $, " and \ in a URI or
# filename cannot break, or be interpolated by, the generated script.  The
# generated file reader uses three-argument open.
sub generate_code {
    my($source, @stack) = @_;

    # Render a string as a double-quoted Perl literal.
    my $literal = sub {
        my ($text) = @_;
        $text =~ s/([\\"\$\@])/\\$1/g;
        return qq("$text");
    };

    my $code_stack = join "\n",
        map  { " $_" . (/;$/ ? "" : ";") }
        grep { defined } @stack;

    # Placeholders are used when the source kind is not recognised.
    my($var, $stuff) = ('...', '...');
    my $kind = $source->[0];
    if ($kind eq 'stdin') {
        ($var, $stuff) = ('$input', '\join "", <STDIN>');
    }
    elsif ($kind eq 'URI') {
        ($var, $stuff) = ('$uri', 'URI->new(' . $literal->($source->[1]) . ')');
    }
    elsif ($kind eq 'file') {
        my $name = $literal->($source->[1]);
        ($var, $stuff) = (
            '$file',
            qq(\\do { my \$file = $name; open my \$fh, "<", \$file or die "\$file: \$!"; join '', <\$fh> }),
        );
    }

    # Unescaped variables below are filled in now; escaped ones are emitted
    # literally into the generated script.
    return <<CODE;
#!$Config{perlpath}
use strict;
use Web::Scraper;
use URI;
my $var = $stuff;
my \$scraper = scraper {
$code_stack
};
my \$result = \$scraper->scrape($var);
CODE
}
|