/usr/share/perl5/Scrappy/Action/Download.pm is in libscrappy-perl 0.94112090-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 | package Scrappy::Action::Download;
# Assign the package version inside a BEGIN block so that it is set at
# compile time, before the remainder of this file is compiled.
BEGIN {
$Scrappy::Action::Download::VERSION = '0.94112090';
}
use URI;
use Moose::Role;
use Scrappy;
with 'Scrappy::Action::Help';
# Download a web page together with the stylesheets, scripts and images it
# references (command line: scrappy download page [URL]).
#
# Parameters:
#   $self    - the action object this method is invoked on
#   @options - command arguments; only $options[0], the page URL, is used
#
# Returns a status message string: a success message naming the stored file
# when the page could be loaded, otherwise a message pointing at download.log.
#
# Dies when no URL is supplied.
sub page {
    my ($self, @options) = @_;

    my $url = $options[0];
    die "Can't download a page without a proper URL"
      unless $url;

    # Normalise the URL into its canonical string form.
    $url = URI->new($url)->as_string;

    my $scraper = Scrappy->new;
    $scraper->debug(1);
    $scraper->logger->write('download.log');

    # Decide whether an asset reference belongs to the page being downloaded.
    # Accepts the raw attribute value (a string or an object that provides
    # as_string) and returns the absolute URL string to fetch, or undef when
    # the reference is empty or points at a foreign location.
    #
    # The decision is made BEFORE a relative reference is made absolute:
    # relative references always count as local, absolute ones only when
    # they start with the page URL.  The prefix test is a literal string
    # comparison so that regex metacharacters in the URL ('.', '?', '+', ...)
    # cannot distort the match.
    my $local_asset = sub {
        my ($attr) = @_;

        my $target = ref $attr ? $attr->as_string : $attr;
        return undef unless $target;

        if ($target =~ m{^https?://}) {
            return index($target, $url) == 0 ? $target : undef;
        }

        return URI->new_abs($target, $url)->as_string;
    };

    # Handler shared by the element types whose asset location lives in the
    # "src" attribute (scripts and images).
    my $fetch_src = sub {
        my ($crawler, $item) = @_;

        my $asset = $local_asset->($item->{src});
        $crawler->download($asset) if defined $asset;

        return;
    };

    # Handlers keyed by selector; each is invoked with the crawling object
    # as first argument and the matched element as second.
    my $downloader = {
        '//link[@href]' => sub {
            my ($crawler, $item) = @_;

            my $link = $local_asset->($item->{href});
            return unless defined $link;

            $crawler->download($link);

            # assuming its a css stylesheet, lets see if we find
            # any images that need downloading
            # YES, ITS A HACK ... and a bad one: the linked resource is
            # fetched a second time through the crawler so its content can
            # be inspected.
            return unless $crawler->get($link)->page_loaded;

            my $is_css = $crawler->worker->content_type =~ /css/
              || $crawler->worker->response->filename =~ /\.css(\?.*)?$/;
            return unless $is_css;
            return unless $crawler->content;

            $crawler->content->decode;

            # Collect the argument of every url(...) in the stylesheet.
            my @references = $crawler->content->as_string
              =~ /url\s{0,}?\([\'\"\s]{0,}?([^\)]+)?[\'\"\s]{0,}?\)/g;

            # download any found urls (probably images)
            foreach my $reference (@references) {

                # An empty url() leaves the optional capture undefined.
                next unless defined $reference;

                $reference =~ s/^\s+//;
                $reference =~ s/\s+$//;
                $reference =~ s/[\'\"]//g;
                next unless length $reference;

                # Relative references are resolved against the stylesheet,
                # not against the page.
                $crawler->download(
                    $reference =~ m{^https?://}
                    ? $reference
                    : URI->new_abs($reference, $link)
                );
            }

            return;
        },
        '//script[@src]' => $fetch_src,
        '//img[@src]'    => $fetch_src,
    };

    $scraper->crawl(
        $url,
        '/'  => $downloader,
        '/*' => $downloader
    );

    # Finally fetch the page itself and store it under the name reported by
    # the response, falling back to index.html.
    if ($scraper->get($url)->page_loaded) {
        my $filename = $scraper->worker->response->filename || 'index.html';
        $scraper->store($filename);
        return "\n... successfully downloaded $filename and it's assets\n";
    }

    return "\n... downloading may have had some trouble, see download.log\n";
}
1;
__DATA__
The download action is used to download html pages and/or assets from the
Internet for various reasons, e.g. backing up HTML pages, etc.
* Download a web page and all images, scripts and stylesheets
USAGE: scrappy download page [URL]
EXAMPLE: scrappy download page http://search.cpan.org/
|