/usr/bin/tv_remove_some_overlapping is in xmltv-util 0.5.69-1.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 | #!/usr/bin/perl -w
=pod
=head1 NAME
tv_remove_some_overlapping - Remove some overlapping programmes from XMLTV data.
=head1 SYNOPSIS
tv_remove_some_overlapping [--help] [--output FILE] [FILE...]
=head1 DESCRIPTION
Read one or more XMLTV files and write a file to standard output
containing the same data, except that some 'magazine' programmes which
seem to contain two or more other programmes are removed.
For example, if 'Schools TV' runs from 10:00 to 12:00, and there are
two programmes 'History' from 10:00 to 11:00 and 'Geography' from
11:00 to 12:00 on the same channel, then 'Schools TV' could be
removed. A programme is removed only if there are two or more other
programmes which partition its timeslot, which implies that it and
these other programmes must have stop times specified.
To avoid throwing away any real programmes, no programme will be
discarded if it has content data other than title and URL.
Filtering this tool won't remove all overlapping programmes but it
will deal with the 'big magazine programme containing smaller
programmes' data commonly seen from listings sources.
B<--output FILE> write to FILE rather than standard output
=head1 SEE ALSO
L<xmltv(5)>.
=head1 AUTHOR
Ed Avis, ed@membled.com
=cut
use strict;
use XMLTV::Version '$Id: tv_remove_some_overlapping,v 1.7 2015/06/23 20:05:25 knowledgejunkie Exp $ ';
use XMLTV::Date;
use Getopt::Long;
use Date::Manip;
BEGIN {
if (int(Date::Manip::DateManipVersion) >= 6) {
Date::Manip::Date_Init("SetDate=now,UTC");
} else {
Date::Manip::Date_Init("TZ=UTC");
}
}
# Use Log::TraceMessages if installed.
BEGIN {
eval { require Log::TraceMessages };
if ($@) {
*t = sub {};
*d = sub { '' };
}
else {
*t = \&Log::TraceMessages::t;
*d = \&Log::TraceMessages::d;
Log::TraceMessages::check_argv();
}
}
use XMLTV;
use XMLTV::Usage <<END
$0: remove some programmes which seem to be mere containers for others
usage: $0 [--help] [--output FILE] [FILE...]
END
;
# Memoize some subroutines if possible
eval { require Memoize };
unless ($@) {
foreach (qw/Date_Cmp pd interesting/) {
Memoize::memoize($_) or die "cannot memoize $_: $!";
}
}
sub pd( $ );
sub exists_partition( $$$$$ );
sub interesting( $ );
sub should_write( $ );
# Keys of a programme hash which don't indicate any data we're
# especially concerned to preserve. Poke around inside XMLTV.pm to
# find the list of attributes.
#
my %boring_programme_key = (title => 1);
$boring_programme_key{$_} = 1
foreach map { $_->[0] } @XMLTV::Programme_Attributes;
$boring_programme_key{url} = 1; # common in some sources, and boring
my ($opt_help, $opt_output);
GetOptions('help' => \$opt_help, 'output=s' => \$opt_output) or usage(0);
usage(1) if $opt_help;
@ARGV = ('-') if not @ARGV;
my %w_args = ();
if (defined $opt_output) {
my $fh = new IO::File ">$opt_output";
die "cannot write to $opt_output\n" if not $fh;
%w_args = (OUTPUT => $fh);
}
# Unfortunately we need to load the whole file before processing. I
# don't want to require the input to be sorted, since tv_sort adds
# guessed stop times which could cause this program to remove too many
# programmes. This is another reason to eventually change tv_sort not
# to add stop times (move into tv_guess_stop_times or whatever).
#
my ($encoding, $credits, $ch, $progs) = @{XMLTV::parsefiles(@ARGV)};
my $w = new XMLTV::Writer(%w_args, encoding => $encoding);
$w->start($credits);
$w->write_channels($ch);
# Since zero-length and unknown-length programmes are always written
# unchanged, we could write them immediately and discard them. But
# it's a bit nicer to write the output in the same order as the input.
# However, we don't bother to index these programmes, they cannot /
# should not be used in looking for partitionings.
#
my %by_channel_and_start;
foreach (@$progs) {
push @{$by_channel_and_start{$_->{channel}}{pd $_->{start}}}, $_
if interesting $_;
}
$w->write_programme($_) foreach grep { should_write($_) } @$progs;
$w->end();
exit();
# Given that %by_channel_and_start and %boring_programme_key have been
# set up, should a programme (with start and stop time) be written?
#
sub should_write( $ ) {
my $p = shift;
# Always write zero length and unknown-length programmes.
return 1 if not interesting $p;
# If this programme cannot be partitioned by at least two others,
# definitely write it.
#
return 1
unless exists_partition(pd $p->{start}, pd $p->{stop},
$p->{channel},
2, { $p => 1 });
foreach (keys %$p) {
if (not $boring_programme_key{$_}) {
warn <<END
not filtering programme at $p->{start} on $p->{channel} because it has $_
END
;
return 1;
}
}
return 0;
}
# We process only programmes with stop time and nonzero length.
sub interesting( $ ) {
my $p = shift;
my $stop = $p->{stop};
return 0 if not defined $stop;
my $cmp = Date_Cmp(pd $p->{start}, pd $stop);
if ($cmp < 0) {
# start < stop, okay.
return 1;
}
elsif ($cmp == 0) {
# Zero length, won't consider.
return 0;
}
elsif ($cmp > 0) {
warn "programme on $p->{channel} "
. "with start time ($p->{start}) "
. "before stop time ($stop)\n";
return 0;
}
else { die }
}
# Does there exist a sequence of programmes hopping from $start to
# $stop, where none of the programmes is in $used and the sequence is
# at least $min_length long?
#
# $start and $stop are Date::Manip objects, $used a hash whose keys
# are used programmes and whose values are true.
#
sub exists_partition( $$$$$ ) {
my ($start, $stop, $channel, $min_length, $used) = @_;
# local $Log::TraceMessages::On = 1;
t "seeking at least $min_length $start to $stop on $channel";
t '... not including: ' . d $used;
my $cmp = Date_Cmp($start, $stop);
if ($cmp < 0) {
t 'start before stop, okay';
my @poss = grep { not $used->{$_} }
@{$by_channel_and_start{$channel}{$start}};
t 'possible first steps of path: ' . d \@poss;
--$min_length if $min_length;
foreach my $p (@poss) {
return 1
if exists_partition(pd $p->{stop}, $stop, $channel,
$min_length, { %$used, $p => 1 });
}
t 'no paths found';
return 0;
}
elsif ($cmp == 0) {
t 'zero length, so path of length zero';
return not $min_length;
}
elsif ($cmp > 0) {
t 'stop < start, so no path';
return 0;
}
else { die }
}
# Lift parse_date() to handle undef.
sub pd( $ ) {
for ($_[0]) {
return undef if not defined;
return parse_date($_);
}
}
exit 1;
|