1# $Id: Grab_XML.pm,v 1.20 2015/07/05 04:32:22 knowledgejunkie Exp $ 2package XMLTV::Grab_XML; 3use strict; 4use Getopt::Long; 5use Date::Manip; 6use XMLTV; 7use XMLTV::Usage; 8use XMLTV::Memoize; 9use XMLTV::ProgressBar; 10use XMLTV::Ask; 11use XMLTV::TZ qw(parse_local_date); 12use XMLTV::Get_nice qw(); 13use XMLTV::Date; 14 15# Use Log::TraceMessages if installed. 16BEGIN { 17 eval { require Log::TraceMessages }; 18 if ($@) { 19 *t = sub {}; 20 *d = sub { '' }; 21 } 22 else { 23 *t = \&Log::TraceMessages::t; 24 *d = \&Log::TraceMessages::d; 25 Log::TraceMessages::check_argv(); 26 } 27} 28 29=pod 30 31=head1 NAME 32 33XMLTV::Grab_XML - Perl extension to fetch raw XMLTV data from a site 34 35=head1 SYNOPSIS 36 37 package Grab_XML_rur; 38 use base 'XMLTV::Grab_XML'; 39 sub urls_by_date( $ ) { my $pkg = shift; ... } 40 sub country( $ ) { my $pkg = shift; return 'Ruritania' } 41 # Maybe override a couple of other methods as described below... 42 Grab_XML_rur->go(); 43 44=head1 DESCRIPTION 45 46This module helps to write grabbers which fetch pages in XMLTV format 47from some website and output the data. It is not used for grabbers 48which scrape human-readable sites. 49 50It consists of several class methods (package methods). The way to 51use it is to subclass it and override some of these. 52 53=head1 METHODS 54 55=over 56 57=item XMLTV::Grab_XML->date_init() 58 59Called at the start of the program to set up Date::Manip. You might 60want to override this with a method that sets the timezone. 61 62=cut 63sub date_init( $ ) { 64 my $pkg = shift; 65 Date_Init(); 66} 67 68=pod 69 70=item XMLTV::Grab_XML->urls_by_date() 71 72Returns a hash mapping YYYYMMDD dates to a URL where listings for that 73date can be downloaded. This method is abstract, you must override 74it. 75 76Arguments: the command line options for --config-file and --quiet. 77 78=cut 79sub urls_by_date( $$$ ) { 80 my $pkg = shift; 81 die 'abstract class method: override in subclass'; 82} 83 84=pod 85 86=item XMLTV::Grab_XML->xml_from_data(data) 87 88Given page data for a particular day, turn it into XML. The default 89implementation just returns the data unchanged, but you might override 90it if you need to decompress the data or patch it up. 91 92=cut 93sub xml_from_data( $$ ) { 94 my $pkg = shift; 95 t 'Grab_XML::xml_from_data()'; 96 return shift; # leave unchanged 97} 98 99=pod 100 101=item XMLTV::Grab_XML->configure() 102 103Configure the grabber if needed. Arguments are --config-file option 104(or undef) and --quiet flag (or undef). 105 106This method is not provided in the base class; if you don't provide it 107then attempts to --configure will give a message that configuration is 108not necessary. 109 110=item XMLTV::Grab_XML->nextday(day) 111 112Bump a YYYYMMDD date by one. You probably shouldnE<39>t override this. 113 114=cut 115sub nextday( $$ ) { 116 my $pkg = shift; 117 my $d = shift; $d =~ /^\d{8}$/ or die; 118 my $p = parse_date($d); 119 my $n = DateCalc($p, '+ 1 day'); die if not defined $n; 120 return UnixDate($n, '%Q'); 121} 122 123=item XMLTV::Grab_XML->country() 124 125Return the name of the country youE<39>re grabbing for, used in usage 126messages. Abstract. 127 128=cut 129sub country( $ ) { 130 my $pkg = shift; 131 die 'abstract class method: override in subclass'; 132} 133 134=item XMLTV::Grab_XML->usage_msg() 135 136Return a command-line usage message. This calls C<country()>, so you 137probably need to override only that method. 138 139=cut 140sub usage_msg( $ ) { 141 my $pkg = shift; 142 my $country = $pkg->country(); 143 if ($pkg->can('configure')) { 144 return <<END 145$0: get $country television listings in XMLTV format 146usage: $0 --configure [--config-file FILE] 147 $0 [--output FILE] [--days N] [--offset N] [--quiet] [--config-file FILE] 148 $0 --help 149END 150 ; 151 } 152 else { 153 return <<END 154$0: get $country television listings in XMLTV format 155usage: $0 [--output FILE] [--days N] [--offset N] [--quiet] 156 $0 --help 157END 158 ; 159 } 160} 161 162=item XMLTV::Grab_XML->get() 163 164Given a URL, fetch the content at that URL. The default 165implementation calls XMLTV::Get_nice::get_nice() but you might want to 166override it if you need to do wacky things with http requests, like 167cookies. 168 169Note that while this method fetches a page, C<xml_from_data()> does 170any further processing of the result to turn it into XML. 171 172=cut 173sub get( $$ ) { 174 my $pkg = shift; 175 my $url = shift; 176 return XMLTV::Get_nice::get_nice($url); 177} 178 179=item XMLTV::Grab_XML->go() 180 181The main program. Parse command line options, fetch and write data. 182 183Most of the options are fairly self-explanatory but this routine also 184calls the XMLTV::Memoize module to look for a B<--cache> argument. 185The functions memoized are those given by the C<cachables()> method. 186 187=cut 188sub go( $ ) { 189 my $pkg = shift; 190 XMLTV::Memoize::check_argv($pkg->cachables()); 191 my ($opt_days, 192 $opt_help, 193 $opt_output, 194 $opt_share, 195 $opt_gui, 196 $opt_offset, 197 $opt_quiet, 198 $opt_configure, 199 $opt_config_file, 200 $opt_list_channels, 201 ); 202 $opt_offset = 0; # default 203 $opt_quiet = 0; # default 204 GetOptions('days=i' => \$opt_days, 205 'help' => \$opt_help, 206 'output=s' => \$opt_output, 207 'share=s' => \$opt_share, # undocumented 208 'gui:s' => \$opt_gui, 209 'offset=i' => \$opt_offset, 210 'quiet' => \$opt_quiet, 211 'configure' => \$opt_configure, 212 'config-file=s' => \$opt_config_file, 213 'list-channels' => \$opt_list_channels, 214 ) 215 or usage(0, $pkg->usage_msg()); 216 die 'number of days must not be negative' 217 if (defined $opt_days && $opt_days < 0); 218 usage(1, $pkg->usage_msg()) if $opt_help; 219 usage(0, $pkg->usage_msg()) if @ARGV; 220 221 XMLTV::Ask::init($opt_gui); 222 if ($opt_share) { 223 if ($pkg->can('set_share_dir')) { 224 $pkg->set_share_dir($opt_share); 225 } 226 else { 227 print STDERR "share directory not in use\n"; 228 } 229 } 230 231 my $has_config = $pkg->can('configure'); 232 if ($opt_configure) { 233 if ($has_config) { 234 $pkg->configure($opt_config_file, $opt_quiet); 235 } 236 else { 237 print STDERR "no configuration necessary\n"; 238 } 239 exit; 240 } 241 242 for ($opt_config_file) { 243 warn("this grabber has no configuration, so ignoring --config-file\n"), undef $_ 244 if defined and not $has_config; 245 } 246 247 # Need to call parse_local_date() before any resetting of 248 # Date::Manip's timezone. 249 # 250 my $now = DateCalc(parse_local_date('now'), "$opt_offset days"); 251 die if not defined $now; 252 $pkg->date_init(); 253 my $today = UnixDate($now, '%Q'); 254 255 my %urls = $pkg->urls_by_date($opt_config_file, $opt_quiet); 256 t 'URLs by date: ' . d \%urls; 257 258 my @to_get; 259 if ($opt_list_channels) { 260 # We won't bother to do an exhaustive check for every option 261 # that is ignored with --list-channels. 262 # 263 die "useless to give --days or --offset with --list-channels\n" 264 if defined $opt_days or $opt_offset != 0; 265 266 # For now, assume that the upstream site doesn't provide any 267 # way to get just the channels, so we'll have to pick a 268 # listings file and then discard most of it. 269 # 270 my @dates = sort keys %urls; 271 die 'no dates found on site' if not @dates; 272 my $latest = $dates[-1]; 273 @to_get = $urls{$latest}; 274 } 275 else { 276 # Getting programme listings. 277 my $days_left = $opt_days; 278 t '$days_left starts at ' . d $days_left; 279 t '$today=' . d $today; 280 for (my $day = $today; defined $urls{$day}; $day = $pkg->nextday($day)) { 281 t "\$urls{$day}=" . d $urls{$day}; 282 if (defined $days_left and $days_left-- == 0) { 283 t 'got to last day'; 284 last; 285 } 286 push @to_get, $urls{$day}; 287 } 288 if (defined $days_left and $days_left > 0) { 289 warn "couldn't get all of $opt_days days, only " 290 . ($opt_days - $days_left) . "\n"; 291 } 292 elsif (not @to_get) { 293 warn "couldn't get any listings from the site for today or later\n"; 294 } 295 } 296 297 my $bar = new XMLTV::ProgressBar('downloading listings', scalar @to_get) 298 if not $opt_quiet; 299 my @listingses; 300 foreach my $url (@to_get) { 301 my $xml; 302 303 # Set error handlers. Strange bugs if you call warn() or 304 # die() inside these, at least I have seen such bugs in 305 # XMLTV.pm, so I'm avoiding it here. 306 # 307 local $SIG{__WARN__} = sub { 308 my $msg = shift; 309 $msg = "warning: something's wrong" if not defined $msg; 310 print STDERR "$url: $msg\n"; 311 }; 312 local $SIG{__DIE__} = sub { 313 my $msg = shift; 314 $msg = 'died' if not defined $msg; 315 print STDERR "$url: $msg, exiting\n"; 316 exit(1); 317 }; 318 319 my $got = $pkg->get($url); 320 if (not defined $got) { 321 warn 'failed to download, skipping'; 322 next; 323 } 324 325 $xml = $pkg->xml_from_data($got); 326 t 'got XML: ' . d $xml; 327 if (not defined $xml) { 328 warn 'could not get XML from page, skipping'; 329 next; 330 } 331 332 push @listingses, XMLTV::parse($xml); 333 update $bar if not $opt_quiet; 334 } 335 $bar->finish() if not $opt_quiet; 336 337 my %w_args = (); 338 if (defined $opt_output) { 339 my $fh = new IO::File ">$opt_output"; 340 die "cannot write to $opt_output\n" if not $fh; 341 %w_args = (OUTPUT => $fh); 342 } 343 344 if ($opt_list_channels) { 345 die if @listingses != 1; 346 my $l = $listingses[0]; 347 undef $l->[3]; # blank out programme data 348 XMLTV::write_data($l, %w_args); 349 } 350 else { 351 XMLTV::write_data(XMLTV::cat(@listingses), %w_args); 352 } 353} 354 355=item XMLTV::Grab_XML->cachables() 356 357Returns a list of names of functions which could reasonably be 358memoized between runs. This will normally be whatever function 359fetches the web pages - you memoize that to save on repeated 360downloads. A subclass might want to add things to this list 361if it has its own way of fetching web pages. 362 363=cut 364sub cachables( $ ) { 365 my $pkg = shift; 366 return ('XMLTV::Get_nice::get_nice_aux'); 367} 368 369=pod 370 371=item XMLTV::Grab_XML->remove_early_stop_times() 372 373Checks each stop time and removes it if it's before the start time. 374 375Argument: the XML to correct 376Returns: the corrected XML 377 378=cut 379 380my $warned_bad_stop_time = 0; 381sub remove_early_stop_times( $$ ) { 382 my $pkg = shift; 383 my @lines = split /\n/, shift; 384 foreach (@lines) { 385 if (/<programme/) { 386 # First change to numeric timezones. 387 s{(start|stop)="(\d+) ([A-Z]+)"} 388 {qq'$1="$2 ' . tz_to_num($3) . '"'}eg; 389 390 # Now remove stop times before start. Only worry about 391 # cases where the timezone is the same - we hope the 392 # upstream data will be fixed by the next TZ changeover. 393 # 394 /start="(\d+) (\S+)"/ or next; 395 my ($start, $tz) = ($1, $2); 396 /stop="(\d+) \Q$tz\E"/ or next; 397 my $stop = $1; 398 399 if ($stop lt $start) { 400 warn "removing stop time before start time: $_" 401 unless $warned_bad_stop_time++; 402 s/stop="[^""]+"\s*// or die; 403 } 404 } 405 } 406 return join("\n", @lines); 407} 408 409=pod 410 411=back 412 413=head1 AUTHOR 414 415Ed Avis, ed@membled.com 416 417=head1 SEE ALSO 418 419L<perl(1)>, L<XMLTV(3)>. 420 421=cut 4221; 423 424