1# $Id: Grab_XML.pm,v 1.20 2015/07/05 04:32:22 knowledgejunkie Exp $
2package XMLTV::Grab_XML;
3use strict;
4use Getopt::Long;
5use Date::Manip;
6use XMLTV;
7use XMLTV::Usage;
8use XMLTV::Memoize;
9use XMLTV::ProgressBar;
10use XMLTV::Ask;
11use XMLTV::TZ qw(parse_local_date);
12use XMLTV::Get_nice qw();
13use XMLTV::Date;
14
15# Use Log::TraceMessages if installed.
16BEGIN {
17    eval { require Log::TraceMessages };
18    if ($@) {
19	*t = sub {};
20	*d = sub { '' };
21    }
22    else {
23	*t = \&Log::TraceMessages::t;
24	*d = \&Log::TraceMessages::d;
25	Log::TraceMessages::check_argv();
26    }
27}
28
29=pod
30
31=head1 NAME
32
33XMLTV::Grab_XML - Perl extension to fetch raw XMLTV data from a site
34
35=head1 SYNOPSIS
36
37    package Grab_XML_rur;
38    use base 'XMLTV::Grab_XML';
39    sub urls_by_date( $ ) { my $pkg = shift; ... }
40    sub country( $ ) { my $pkg = shift; return 'Ruritania' }
41    # Maybe override a couple of other methods as described below...
42    Grab_XML_rur->go();
43
44=head1 DESCRIPTION
45
46This module helps to write grabbers which fetch pages in XMLTV format
47from some website and output the data.  It is not used for grabbers
48which scrape human-readable sites.
49
50It consists of several class methods (package methods).  The way to
51use it is to subclass it and override some of these.
52
53=head1 METHODS
54
55=over
56
57=item XMLTV::Grab_XML->date_init()
58
59Called at the start of the program to set up Date::Manip.  You might
60want to override this with a method that sets the timezone.
61
62=cut
63sub date_init( $ ) {
64    my $pkg = shift;
65    Date_Init();
66}
67
68=pod
69
70=item XMLTV::Grab_XML->urls_by_date()
71
72Returns a hash mapping YYYYMMDD dates to a URL where listings for that
73date can be downloaded.  This method is abstract, you must override
74it.
75
76Arguments: the command line options for --config-file and --quiet.
77
78=cut
79sub urls_by_date( $$$ ) {
80    my $pkg = shift;
81    die 'abstract class method: override in subclass';
82}
83
84=pod
85
86=item XMLTV::Grab_XML->xml_from_data(data)
87
88Given page data for a particular day, turn it into XML.  The default
89implementation just returns the data unchanged, but you might override
90it if you need to decompress the data or patch it up.
91
92=cut
93sub xml_from_data( $$ ) {
94    my $pkg = shift;
95    t 'Grab_XML::xml_from_data()';
96    return shift; # leave unchanged
97}
98
99=pod
100
101=item XMLTV::Grab_XML->configure()
102
103Configure the grabber if needed.  Arguments are --config-file option
104(or undef) and --quiet flag (or undef).
105
106This method is not provided in the base class; if you don't provide it
107then attempts to --configure will give a message that configuration is
108not necessary.
109
110=item XMLTV::Grab_XML->nextday(day)
111
112Bump a YYYYMMDD date by one.  You probably shouldnE<39>t override this.
113
114=cut
115sub nextday( $$ ) {
116    my $pkg = shift;
117    my $d = shift; $d =~ /^\d{8}$/ or die;
118    my $p = parse_date($d);
119    my $n = DateCalc($p, '+ 1 day'); die if not defined $n;
120    return UnixDate($n, '%Q');
121}
122
123=item XMLTV::Grab_XML->country()
124
125Return the name of the country youE<39>re grabbing for, used in usage
126messages.  Abstract.
127
128=cut
129sub country( $ ) {
130    my $pkg = shift;
131    die 'abstract class method: override in subclass';
132}
133
134=item XMLTV::Grab_XML->usage_msg()
135
136Return a command-line usage message.  This calls C<country()>, so you
137probably need to override only that method.
138
139=cut
140sub usage_msg( $ ) {
141    my $pkg = shift;
142    my $country = $pkg->country();
143    if ($pkg->can('configure')) {
144        return <<END
145$0: get $country television listings in XMLTV format
146usage: $0 --configure [--config-file FILE]
147       $0 [--output FILE] [--days N] [--offset N] [--quiet] [--config-file FILE]
148       $0 --help
149END
150      ;
151    }
152    else {
153        return <<END
154$0: get $country television listings in XMLTV format
155usage: $0 [--output FILE] [--days N] [--offset N] [--quiet]
156       $0 --help
157END
158      ;
159    }
160}
161
162=item XMLTV::Grab_XML->get()
163
164Given a URL, fetch the content at that URL.  The default
165implementation calls XMLTV::Get_nice::get_nice() but you might want to
166override it if you need to do wacky things with http requests, like
167cookies.
168
169Note that while this method fetches a page, C<xml_from_data()> does
170any further processing of the result to turn it into XML.
171
172=cut
173sub get( $$ ) {
174    my $pkg = shift;
175    my $url = shift;
176    return XMLTV::Get_nice::get_nice($url);
177}
178
179=item XMLTV::Grab_XML->go()
180
181The main program.  Parse command line options, fetch and write data.
182
183Most of the options are fairly self-explanatory but this routine also
184calls the XMLTV::Memoize module to look for a B<--cache> argument.
185The functions memoized are those given by the C<cachables()> method.
186
187=cut
188sub go( $ ) {
189    my $pkg = shift;
190    XMLTV::Memoize::check_argv($pkg->cachables());
191    my ($opt_days,
192	$opt_help,
193	$opt_output,
194	$opt_share,
195	$opt_gui,
196	$opt_offset,
197	$opt_quiet,
198	$opt_configure,
199	$opt_config_file,
200	$opt_list_channels,
201       );
202    $opt_offset = 0;		# default
203    $opt_quiet = 0;		# default
204    GetOptions('days=i'        => \$opt_days,
205	       'help'          => \$opt_help,
206	       'output=s'      => \$opt_output,
207	       'share=s'       => \$opt_share, # undocumented
208	       'gui:s'         => \$opt_gui,
209	       'offset=i'      => \$opt_offset,
210	       'quiet'         => \$opt_quiet,
211	       'configure'     => \$opt_configure,
212	       'config-file=s' => \$opt_config_file,
213	       'list-channels' => \$opt_list_channels,
214	      )
215      or usage(0, $pkg->usage_msg());
216    die 'number of days must not be negative'
217      if (defined $opt_days && $opt_days < 0);
218    usage(1, $pkg->usage_msg()) if $opt_help;
219    usage(0, $pkg->usage_msg()) if @ARGV;
220
221    XMLTV::Ask::init($opt_gui);
222    if ($opt_share) {
223        if ($pkg->can('set_share_dir')) {
224            $pkg->set_share_dir($opt_share);
225        }
226        else {
227            print STDERR "share directory not in use\n";
228        }
229    }
230
231    my $has_config = $pkg->can('configure');
232    if ($opt_configure) {
233        if ($has_config) {
234            $pkg->configure($opt_config_file, $opt_quiet);
235        }
236        else {
237            print STDERR "no configuration necessary\n";
238        }
239        exit;
240    }
241
242    for ($opt_config_file) {
243        warn("this grabber has no configuration, so ignoring --config-file\n"), undef $_
244          if defined and not $has_config;
245    }
246
247    # Need to call parse_local_date() before any resetting of
248    # Date::Manip's timezone.
249    #
250    my $now = DateCalc(parse_local_date('now'), "$opt_offset days");
251    die if not defined $now;
252    $pkg->date_init();
253    my $today = UnixDate($now, '%Q');
254
255    my %urls = $pkg->urls_by_date($opt_config_file, $opt_quiet);
256    t 'URLs by date: ' . d \%urls;
257
258    my @to_get;
259    if ($opt_list_channels) {
260	# We won't bother to do an exhaustive check for every option
261	# that is ignored with --list-channels.
262	#
263	die "useless to give --days or --offset with --list-channels\n"
264	    if defined $opt_days or $opt_offset != 0;
265
266	# For now, assume that the upstream site doesn't provide any
267	# way to get just the channels, so we'll have to pick a
268	# listings file and then discard most of it.
269	#
270	my @dates = sort keys %urls;
271	die 'no dates found on site' if not @dates;
272	my $latest = $dates[-1];
273	@to_get = $urls{$latest};
274    }
275    else {
276	# Getting programme listings.
277	my $days_left = $opt_days;
278	t '$days_left starts at ' . d $days_left;
279	t '$today=' . d $today;
280	for (my $day = $today; defined $urls{$day}; $day = $pkg->nextday($day)) {
281	    t "\$urls{$day}=" . d $urls{$day};
282	    if (defined $days_left and $days_left-- == 0) {
283		t 'got to last day';
284		last;
285	    }
286	    push @to_get, $urls{$day};
287	}
288	if (defined $days_left and $days_left > 0) {
289	    warn "couldn't get all of $opt_days days, only "
290		. ($opt_days - $days_left) . "\n";
291	}
292	elsif (not @to_get) {
293	    warn "couldn't get any listings from the site for today or later\n";
294	}
295    }
296
297    my $bar = new XMLTV::ProgressBar('downloading listings', scalar @to_get)
298      if not $opt_quiet;
299    my @listingses;
300    foreach my $url (@to_get) {
301	my $xml;
302
303	# Set error handlers.  Strange bugs if you call warn() or
304	# die() inside these, at least I have seen such bugs in
305	# XMLTV.pm, so I'm avoiding it here.
306	#
307	local $SIG{__WARN__} = sub {
308	    my $msg = shift;
309	    $msg = "warning: something's wrong" if not defined $msg;
310	    print STDERR "$url: $msg\n";
311	};
312 	local $SIG{__DIE__} = sub {
313 	    my $msg = shift;
314 	    $msg = 'died' if not defined $msg;
315 	    print STDERR "$url: $msg, exiting\n";
316 	    exit(1);
317 	};
318
319	my $got = $pkg->get($url);
320	if (not defined $got) {
321	    warn 'failed to download, skipping';
322	    next;
323	}
324
325	$xml = $pkg->xml_from_data($got);
326	t 'got XML: ' . d $xml;
327	if (not defined $xml) {
328	    warn 'could not get XML from page, skipping';
329	    next;
330	}
331
332	push @listingses, XMLTV::parse($xml);
333	update $bar if not $opt_quiet;
334    }
335    $bar->finish() if not $opt_quiet;
336
337    my %w_args = ();
338    if (defined $opt_output) {
339	my $fh = new IO::File ">$opt_output";
340	die "cannot write to $opt_output\n" if not $fh;
341	%w_args = (OUTPUT => $fh);
342    }
343
344    if ($opt_list_channels) {
345	die if @listingses != 1;
346	my $l = $listingses[0];
347	undef $l->[3];                  # blank out programme data
348	XMLTV::write_data($l, %w_args);
349    }
350    else {
351	XMLTV::write_data(XMLTV::cat(@listingses), %w_args);
352    }
353}
354
355=item XMLTV::Grab_XML->cachables()
356
357Returns a list of names of functions which could reasonably be
358memoized between runs.  This will normally be whatever function
359fetches the web pages - you memoize that to save on repeated
360downloads.  A subclass might want to add things to this list
361if it has its own way of fetching web pages.
362
363=cut
364sub cachables( $ ) {
365    my $pkg = shift;
366    return ('XMLTV::Get_nice::get_nice_aux');
367}
368
369=pod
370
371=item XMLTV::Grab_XML->remove_early_stop_times()
372
373Checks each stop time and removes it if it's before the start time.
374
375Argument: the XML to correct
376Returns: the corrected XML
377
378=cut
379
380my $warned_bad_stop_time = 0;
381sub remove_early_stop_times( $$ ) {
382    my $pkg = shift;
383    my @lines = split /\n/, shift;
384    foreach (@lines) {
385	if (/<programme/) {
386	    # First change to numeric timezones.
387	    s{(start|stop)="(\d+) ([A-Z]+)"}
388	    {qq'$1="$2 ' . tz_to_num($3) . '"'}eg;
389
390	    # Now remove stop times before start.  Only worry about
391	    # cases where the timezone is the same - we hope the
392	    # upstream data will be fixed by the next TZ changeover.
393	    #
394	    /start="(\d+) (\S+)"/ or next;
395	    my ($start, $tz) = ($1, $2);
396	    /stop="(\d+) \Q$tz\E"/ or next;
397	    my $stop = $1;
398
399	    if ($stop lt $start) {
400		warn "removing stop time before start time: $_"
401		  unless $warned_bad_stop_time++;
402		s/stop="[^""]+"\s*// or die;
403	    }
404	}
405    }
406    return join("\n", @lines);
407}
408
409=pod
410
411=back
412
413=head1 AUTHOR
414
415Ed Avis, ed@membled.com
416
417=head1 SEE ALSO
418
419L<perl(1)>, L<XMLTV(3)>.
420
421=cut
4221;
423
424