/usr/share/perl5/XMLTV/Get_nice.pm is in libxmltv-perl 0.5.70-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 | # $Id: Get_nice.pm,v 1.37 2017/09/12 00:38:58 knowledgejunkie Exp $
#
# Library to wrap LWP::UserAgent to put in a random delay between
# requests and set the User-Agent string. We really should be using
# LWP::RobotUI but this is better than nothing.
#
# If you're sure your app doesn't need a random delay (because it is
# fetching from a site designed for that purpose) then set
# $XMLTV::Get_nice::Delay to zero, or a value in seconds. This is the
# maximum delay - on average the sleep will be half that.
#
#This random delay will be between 0 and 5 ($Delay) seconds. This means
# some sites will complain you're grabbing too fast (since 20% of your
# grabs will be less than 1 second apart). To introduce a minimum delay
# set $XMLTV::Get_nice::MinDelay to a value in seconds.
# This will be added to $Delay to derive the actual delay used.
# E.g. Delay = 5 and MinDelay = 3, then the actual delay will be
# between 3 and 8 seconds,
#
# get_nice() is the function to call, however
# XMLTV::Get_nice::get_nice_aux() is the one to cache with
# XMLTV::Memoize or whatever. If you want an HTML::Tree object use
# get_nice_tree().
# Alternatively, get_nice_json() will get you a JSON object,
# or get_nice_xml() will get a XML::Parser 'Tree' object
#
use strict;
package XMLTV::Get_nice;
# use version number for feature detection:
# 0.005065 : new methods get_nice_json(), get_nice_xml()
# 0.005065 : add decode option to get_nice_tree()
# 0.005065 : expose the LWP response object ($Response)
# 0.005066 : support unknown tags in HTML::TreeBuilder ($IncludeUnknownTags)
# 0.005067 : new method post_nice_json()
# 0.005070 : skip get_nice sleep for cached pages
# 0.005070 : support passing HTML::TreeBuilder options via a hashref
our $VERSION = 0.005070;
use base 'Exporter';
our @EXPORT = qw(get_nice get_nice_tree get_nice_xml get_nice_json post_nice_json error_msg);
use Encode qw(decode);
use LWP::UserAgent;
use XMLTV;
our $Delay = 5; # in seconds
our $MinDelay = 0; # in seconds
our $FailOnError = 1; # Fail on fetch error
our $Response; # LWP response object
our $IncludeUnknownTags = 0; # add support for HTML5 tags which are unknown to older versions of TreeBuilder (and therfore ignored by it)
our $ua = LWP::UserAgent->new;
$ua->agent("xmltv/$XMLTV::VERSION");
$ua->env_proxy;
our %errors = ();
sub error_msg($) {
my ($url) = @_;
$errors{$url};
}
sub get_nice( $ ) {
# This is to ensure scalar context, to work around weirdnesses
# with Memoize (I just can't figure out how SCALAR_CACHE and
# LIST_CACHE relate to each other, with or without MERGE).
#
return scalar get_nice_aux($_[0]);
}
# Fetch page and return as HTML::Tree object.
# Optional arguments:
# i) a function to put the page data through (eg, to clean up bad characters)
# before parsing.
# ii) convert incoming page to UNICODE using this codepage (use "UTF-8" for
# strict utf-8)
# iii) a hashref containing options to configure the HTML::TreeBuilder object
# before parsing
#
sub get_nice_tree( $;$$$ ) {
my ($uri, $filter, $codepage, $htb_opts) = @_;
require HTML::TreeBuilder;
my $content = get_nice $uri;
$content = $filter->($content) if $filter;
if ($codepage) {
$content = decode($codepage, $content);
}
else {
$content = decode('UTF-8', $content);
}
my $t = HTML::TreeBuilder->new();
$t->ignore_unknown(!$IncludeUnknownTags);
if (ref $htb_opts eq 'HASH') {
$t->$_($htb_opts->{$_}) foreach (keys %$htb_opts);
}
$t->parse($content) or die "cannot parse content of $uri\n";
$t->eof;
return $t;
}
# Fetch page and return as XML::Parser 'Tree' object.
# Optional arguments:
# i) a function to put the page data through (eg, to clean up bad
# characters) before parsing.
# ii) convert incoming page to UNICODE using this codepage (use "UTF-8" for strict utf-8)
#
sub get_nice_xml( $;$$ ) {
my ($uri, $filter, $codepage) = @_;
require XML::Parser;
my $content = get_nice $uri;
$content = $filter->($content) if $filter;
if ($codepage) {
$content = decode($codepage, $content);
}
else {
$content = decode('UTF-8', $content);
}
my $t = XML::Parser->new(Style => 'Tree')->parse($content) or die "cannot parse content of $uri\n";
return $t;
}
# Fetch page and return as JSON::PP object.
# Optional arguments:
# i) a function to put the page data through (eg, to clean up bad
# characters) before parsing.
# ii) convert incoming UTF-8 to UNICODE
#
sub get_nice_json( $;$$ ) {
my ($uri, $filter, $utf8) = @_;
require JSON::PP;
my $content = get_nice $uri;
$content = $filter->($content) if $filter;
$utf8 = defined $utf8 ? 1 : 0;
my $t = JSON::PP->new()->utf8($utf8)->decode($content) or die "cannot parse content of $uri\n";
return $t;
}
my $last_get_time;
my $last_get_from_cache;
sub get_nice_aux( $ ) {
my $url = shift;
if (defined $last_get_time && (defined $last_get_from_cache && !$last_get_from_cache) ) {
# A page has already been retrieved recently. See if we need
# to sleep for a while before getting the next page - being
# nice to the server.
#
my $next_get_time = $last_get_time + (rand $Delay) + $MinDelay;
my $sleep_time = $next_get_time - time();
sleep $sleep_time if $sleep_time > 0;
}
my $r = $ua->get($url);
# Then start the delay from this time on the next fetch - so we
# make the gap _between_ requests rather than from the start of
# one request to the start of the next. This punishes modem users
# whose individual requests take longer, but it also punishes
# downloads that take a long time for other reasons (large file,
# slow server) so it's about right.
#
$last_get_time = time();
# expose the response object for those grabbers which need to process the headers, status code, etc.
$Response = $r;
# set flag if last fetch was from cache
$last_get_from_cache = (defined $r->{'_headers'}{'x-cached'} && $r->{'_headers'}{'x-cached'} == 1);
if ($r->is_error) {
# At the moment download failures seem rare, so the script dies if
# any page cannot be fetched. We could later change this routine
# to return undef on failure. But dying here makes sure that a
# failed page fetch doesn't get stored in XMLTV::Memoize's cache.
#
die "could not fetch $url, error: " . $r->status_line . ", aborting\n" if $FailOnError;
$errors{$url} = $r->status_line;
return undef;
} else {
return $r->content;
}
}
# Fetch page via a JSON object in the Content and return as a JSON object.
# Arguments:
# URI to post to
# JSON object with the AJAX data to be posted e.g. "{ 'programId':'123456', 'channel':'BBC'}"
#
sub post_nice_json( $$ ) {
my $url = shift;
my $json = shift;
require JSON::PP;
if (defined $last_get_time) {
# A page has already been retrieved recently. See if we need
# to sleep for a while before getting the next page
#
my $next_get_time = $last_get_time + (rand $Delay) + $MinDelay;
my $sleep_time = $next_get_time - time();
sleep $sleep_time if $sleep_time > 0;
}
my $r = $ua->post($url, 'Content_Type' => 'application/json; charset=utf-8', 'Content' => $json);
$last_get_time = time();
# expose the response object for those grabbers which need to process the headers, status code, etc.
$Response = $r;
if ($r->is_error) {
die "could not fetch $url, error: " . $r->status_line . ", aborting\n" if $FailOnError;
$errors{$url} = $r->status_line;
return undef;
} else {
my $content = JSON::PP->new()->utf8(1)->decode($r->content) or die "cannot parse content of $url\n";
return $content;
}
}
1;
|