1#!/usr/local/bin/perl -w 2 3=pod 4 5=head1 NAME 6 7tv_grab_es - Grab TV listings for Spain. 8 9=head1 SYNOPSIS 10 11tv_grab_es --help 12 13tv_grab_es [--config-file FILE] --configure [--gui OPTION] 14 15tv_grab_es [--config-file FILE] [--output FILE] [--days N] 16 [--offset N] [--quiet] 17 18tv_grab_es --list-channels 19 20tv_grab_es --capabilities 21 22tv_grab_es --version 23 24=head1 DESCRIPTION 25 26Output TV listings for several channels available in Spain. 27Now supports the terrestrial analog tv listings, which is the most common tv 28viewed in Spain that currently has no EPG information. I have plans to add 29Satelite listings (now the Spanish platforms are in a merger process between 30providers and also cable (the listings has to be grabbed from different sites)). 31The tv listings comes from www.elpais.es 32The grabber relies on parsing HTML so it might stop working at any time. 33 34First run B<tv_grab_es --configure> to choose, which channels you want 35to download. Then running B<tv_grab_es> with no arguments will output 36listings in XML format to standard output. 37 38B<--configure> Prompt for which channels, 39and write the configuration file. 40 41B<--config-file FILE> Set the name of the configuration file, the 42default is B<~/.xmltv/tv_grab_es.conf>. This is the file written by 43B<--configure> and read when grabbing. 44 45B<--gui OPTION> Use this option to enable a graphical interface to be used. 46OPTION may be 'Tk', or left blank for the best available choice. 47Additional allowed values of OPTION are 'Term' for normal terminal output 48(default) and 'TermNoProgressBar' to disable the use of XMLTV::ProgressBar. 49 50B<--output FILE> Write to FILE rather than standard output. 51 52B<--days N> Grab N days. The default is 3. 53 54B<--offset N> Start N days in the future. The default is to start 55from today. 56 57B<--quiet> Suppress the progress messages normally written to standard 58error. 59 60B<--capabilities> Show which capabilities the grabber supports. For more 61information, see L<http://wiki.xmltv.org/index.php/XmltvCapabilities> 62 63B<--version> Show the version of the grabber. 64 65B<--help> Print a help message and exit. 66 67=head1 SEE ALSO 68 69L<xmltv(5)>. 70 71=head1 AUTHOR 72 73Ramon Roca, Ramon.Roca@XCombo.com, based on tv_grab_fi, from Matti Airas. 74 75=head1 BUGS 76 77=cut 78 79# Author's TODOs & thoughts 80# 81# this is for analog listings, for D+ satellite listings please use 82# tv_grab_es_digital 83# 84# get the icons of each grabbed channel from the website 85# 86# findout how to setup properly the language, (catalan, basque, galician, vo) 87# 88# get channel ids in RFC2838 format (I don't, actually the Id comes directly 89# web site, i don't know where to go for getting th id's for spanish 90# tv broadcasters. 91# 92# do the listings from another site, just in case this one breaks, the most 93# similar sites to this grabber are television.ya.com. 94# we should consider also getting them from www.terra.es or 95# www.tvinteligente.com, they provide also some more local tv listings 96# however the grabber gets a lot more complex and needs many more urls 97# to collect the info, although it can be a little bit more complete 98# (i.e. credits, program duration...) 99# 100# 101 102 103###################################################################### 104# initializations 105 106use strict; 107use XMLTV::Version '$Id: tv_grab_es,v 1.39 2010/09/02 05:07:40 rmeden Exp $ '; 108use XMLTV::Capabilities qw/baseline manualconfig cache/; 109use XMLTV::Description 'Spain'; 110use Getopt::Long; 111use Date::Manip; 112use HTML::TreeBuilder; 113use HTML::Entities; # parse entities 114use IO::File; 115 116use XMLTV; 117use XMLTV::Memoize; 118use XMLTV::ProgressBar; 119use XMLTV::Ask; 120use XMLTV::Config_file; 121use XMLTV::DST; 122use XMLTV::Get_nice; 123use XMLTV::Mode; 124use XMLTV::Date; 125# Todo: perhaps we should internationalize messages and docs? 126use XMLTV::Usage <<END 127$0: get Spanish television listings in XMLTV format 128To configure: $0 --configure [--config-file FILE] 129To grab listings: $0 [--config-file FILE] [--output FILE] [--days N] 130 [--offset N] [--quiet] 131To list channels: $0 --list-channels 132To show capabilities: $0 --capabilities 133To show version: $0 --version 134END 135 ; 136 137# Attributes of the root element in output. 138my $HEAD = { 'source-info-url' => 'http://www.elpais.es/parrillatv/portada.html', 139 'source-data-url' => "http://www.elpais.es/parrillatv/resultados.html", 140 'generator-info-name' => 'XMLTV', 141 'generator-info-url' => 'http://xmltv.org/', 142 }; 143 144# Whether zero-length programmes should be included in the output. 145my $WRITE_ZERO_LENGTH = 0; 146 147# default language 148my $LANG="es"; 149 150# Global channel_data 151our @ch_all; 152 153###################################################################### 154# get options 155 156# Get options, including undocumented --cache option. 157XMLTV::Memoize::check_argv('XMLTV::Get_nice::get_nice_aux'); 158my ($opt_days, $opt_offset, $opt_help, $opt_output, 159 $opt_configure, $opt_config_file, $opt_gui, 160 $opt_quiet, $opt_list_channels); 161$opt_days = 3; # default 162$opt_offset = 0; # default 163$opt_quiet = 0; # default 164GetOptions('days=i' => \$opt_days, 165 'offset=i' => \$opt_offset, 166 'help' => \$opt_help, 167 'configure' => \$opt_configure, 168 'config-file=s' => \$opt_config_file, 169 'gui:s' => \$opt_gui, 170 'output=s' => \$opt_output, 171 'quiet' => \$opt_quiet, 172 'list-channels' => \$opt_list_channels 173 ) 174 or usage(0); 175die 'number of days must not be negative' 176 if (defined $opt_days && $opt_days < 0); 177usage(1) if $opt_help; 178 179XMLTV::Ask::init($opt_gui); 180 181my $mode = XMLTV::Mode::mode('grab', # default 182 $opt_configure => 'configure', 183 $opt_list_channels => 'list-channels', 184 ); 185 186# File that stores which channels to download. 187my $config_file 188 = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_es', $opt_quiet); 189 190my @config_lines; # used only in grab mode 191if ($mode eq 'configure') { 192 XMLTV::Config_file::check_no_overwrite($config_file); 193} 194elsif ($mode eq 'grab') { 195 @config_lines = XMLTV::Config_file::read_lines($config_file); 196} 197elsif ($mode eq 'list-channels') { 198 # Config file not used. 199} 200else { die } 201 202# Whatever we are doing, we need the channels data. 203my %channels = get_channels(); # sets @ch_all 204my @channels; 205 206###################################################################### 207# write configuration 208 209if ($mode eq 'configure') { 210 open(CONF, ">$config_file") or die "cannot write to $config_file: $!"; 211 212 # Ask about each channel. 213 my @chs = sort keys %channels; 214 my @names = map { $channels{$_} } @chs; 215 my @qs = map { "add channel $_?" } @names; 216 my @want = ask_many_boolean(1, @qs); 217 foreach (@chs) { 218 my $w = shift @want; 219 warn("cannot read input, stopping channel questions"), last 220 if not defined $w; 221 # No need to print to user - XMLTV::Ask is verbose enough. 222 223 # Print a config line, but comment it out if channel not wanted. 224 print CONF '#' if not $w; 225 my $name = shift @names; 226 print CONF "channel $_ $name\n"; 227 # TODO don't store display-name in config file. 228 } 229 230 close CONF or warn "cannot close $config_file: $!"; 231 say("Finished configuration."); 232 233 exit(); 234} 235 236 237# Not configuration, we must be writing something, either full 238# listings or just channels. 239# 240die if $mode ne 'grab' and $mode ne 'list-channels'; 241 242# Options to be used for XMLTV::Writer. 243my %w_args; 244if (defined $opt_output) { 245 my $fh = new IO::File(">$opt_output"); 246 die "cannot write to $opt_output: $!" if not defined $fh; 247 $w_args{OUTPUT} = $fh; 248} 249$w_args{encoding} = 'ISO-8859-1'; 250my $writer = new XMLTV::Writer(%w_args); 251$writer->start($HEAD); 252 253if ($mode eq 'list-channels') { 254 $writer->write_channel($_) foreach @ch_all; 255 $writer->end(); 256 exit(); 257} 258 259###################################################################### 260# We are producing full listings. 261die if $mode ne 'grab'; 262 263# Read configuration 264my $line_num = 1; 265foreach (@config_lines) { 266 ++ $line_num; 267 next if not defined; 268 if (/^channel:?\s+(\S+)\s+([^\#]+)/) { 269 my $ch_did = $1; 270 my $ch_name = $2; 271 $ch_name =~ s/\s*$//; 272 push @channels, $ch_did; 273 $channels{$ch_did} = $ch_name; 274 } 275 else { 276 warn "$config_file:$line_num: bad line\n"; 277 } 278} 279 280###################################################################### 281# begin main program 282 283# Assume the listings source uses CET (see BUGS above). 284my $now = DateCalc(parse_date('now'), "$opt_offset days"); 285die "No channels specified, run me with --configure\n" 286 if not keys %channels; 287my @to_get; 288 289 290# the order in which we fetch the channels matters 291foreach my $ch_did (@channels) { 292 my $ch_name=$channels{$ch_did}; 293 my $ch_xid="$ch_did.elpais.es"; 294 my $ch_num=$ch_did + 0; 295 $writer->write_channel({ id => $ch_xid, 296 'display-name' => [ [ $ch_name ], 297 [ $ch_num ] ] }); 298 my $day=UnixDate($now,'%Q'); 299 for (my $i=0;$i<$opt_days;$i++) { 300 push @to_get, [ $day, $ch_xid, $ch_did ]; 301 #for each day 302 $day=nextday($day); die if not defined $day; 303 } 304} 305 306# This progress bar is for both downloading and parsing. Maybe 307# they could be separate. 308# 309my $bar = new XMLTV::ProgressBar('getting listings', scalar @to_get) 310 if not $opt_quiet; 311foreach (@to_get) { 312 foreach (process_table($_->[0], $_->[1], $_->[2])) { 313 $writer->write_programme($_); 314 } 315 update $bar if not $opt_quiet; 316} 317$bar->finish() if not $opt_quiet; 318$writer->end(); 319 320###################################################################### 321# subroutine definitions 322 323# Use Log::TraceMessages if installed. 324BEGIN { 325 eval { require Log::TraceMessages }; 326 if ($@) { 327 *t = sub {}; 328 *d = sub { '' }; 329 } 330 else { 331 *t = \&Log::TraceMessages::t; 332 *d = \&Log::TraceMessages::d; 333 Log::TraceMessages::check_argv(); 334 } 335} 336 337#### 338# process_table: fetch a URL and process it 339# 340# arguments: 341# Date::Manip object giving the day to grab 342# xmltv id of channel 343# elpais.es id of channel 344# 345# returns: list of the programme hashes to write 346# 347sub process_table { 348 my ($date, $ch_xmltv_id, $ch_es_id) = @_; 349 350 my $today = UnixDate($date, '%Y%m%d'); 351 my $url = "http://www.elpais.es/parrillatv/resultados.html?franja=&tipo=&canal=$ch_es_id&dia=$today"; 352 t $url; 353 local $SIG{__WARN__} = sub { 354 warn "$url: $_[0]"; 355 }; 356 357 # parse the page to a document object 358 my $tree = get_nice_tree $url; 359 my @program_data = get_program_data($tree); 360 my $bump_start_day=0; 361 362 my @r; 363 while (@program_data) { 364 my $cur = shift @program_data; 365 my $next = shift @program_data; 366 unshift @program_data,$next if $next; 367 368 my $p = make_programme_hash($date, $ch_xmltv_id, $ch_es_id, $cur, $next); 369 if (not $p) { 370 require Data::Dumper; 371 my $d = Data::Dumper::Dumper($cur); 372 warn "cannot write programme on $ch_xmltv_id on $date:\n$d\n"; 373 } 374 else { 375 push @r, $p; 376 } 377 378 if (!$bump_start_day && bump_start_day($cur,$next)) { 379 $bump_start_day=1; 380 $date = UnixDate(DateCalc($date,"+ 1 day"),'%Q'); 381 } 382 } 383 return @r; 384} 385 386sub make_programme_hash { 387 my ($date, $ch_xmltv_id, $ch_es_id, $cur, $next) = @_; 388 389 my %prog; 390 391 $prog{channel}=$ch_xmltv_id; 392 $prog{title}=[ [ $cur->{title}, $LANG ] ]; 393 $prog{"sub-title"}=[ [ $cur->{subtitle}, $LANG ] ] if defined $cur->{subtitle}; 394 $prog{category}=[ [ $cur->{category}, $LANG ] ]; 395 396 t "turning local time $cur->{time}, on date $date, into UTC"; 397 eval { $prog{start}=utc_offset("$date $cur->{time}", '+0100') }; 398 if ($@) { 399 warn "bad time string: $cur->{time}"; 400 return undef; 401 } 402 t "...got $prog{start}"; 403 # FIXME: parse description field further 404 405 $prog{desc}=[ [ $cur->{desc}, $LANG ] ] if defined $cur->{desc}; 406 407 return \%prog; 408} 409sub bump_start_day { 410 my ($cur,$next) = @_; 411 if (!defined($next)) { 412 return undef; 413 } 414 my $start = UnixDate($cur->{time},'%H:%M'); 415 my $stop = UnixDate($next->{time},'%H:%M'); 416 if (Date_Cmp($start,$stop)>0) { 417 return 1; 418 } else { 419 return 0; 420 } 421} 422 423 424# 425# program data is split as follows: 426# - as 22/4/2003 elpais.es have changed again the page, now the table that 427# that contains the listings have a single header, so now we only look 428# once for it and use the time to findout where the listings ends. 429sub get_program_data { 430 my ($tree) = @_; 431 my @data; 432 433 my @txt_elems = get_txt_elems($tree); 434 435 # Actually time and title are required, but we don't check that. 436 437 my $index = 0; 438 while ($index <= scalar (@txt_elems-4)) { 439 if ( ($txt_elems[$index] eq "Hora") 440 && ($txt_elems[$index + 1] eq "Programa") 441 && ($txt_elems[$index + 2] eq "Canal") 442 && ($txt_elems[$index + 3] eq "Tipo") ) 443 { 444 t "Program listing comes below"; 445 $index = $index + 4; 446 while ( $txt_elems[$index] =~ /^\d\d:\d\d/ ) { 447 t "Program found: Hora: $txt_elems[$index] Programa: $txt_elems[$index+1]"; 448 # Look for duplicate start time, that occurs sometimes 449 # at elpais.es when a new program is lately scheduled but seems 450 # that they forget to remove the previous listing 451 # If it happens, we just grab the last program. 452 if ( $txt_elems[$index] =~ $txt_elems[$index + 5] ) { 453 $index = $index + 5; 454 } 455 456 my $p_stime = $txt_elems[$index]; 457 my @p_str = split (/:/,$txt_elems[$index + 1]); 458 my $p_title = $p_str[0]; 459 for ($p_title) { s/^\s+//; s/\s+$// } 460 my @strsub = split (/\"/,$p_str[1]) 461 if (defined $p_str[1]); 462 my $p_subtitle; 463 if (defined $strsub[0]) { 464 if ( $strsub[0] =~ " " ) { 465 $p_subtitle = $strsub[1]; 466 } 467 else { 468 $p_subtitle = $p_str[1]; 469 } 470 undef $p_subtitle 471 if defined $p_subtitle and $p_subtitle eq ''; 472 } 473 my $p_category = $txt_elems[$index + 3]; 474 my $p_desc; 475 if (not ( $txt_elems[$index + 4] =~ /^\d\d:\d\d/ ) ) { 476 # Program has Description 477 $p_desc = $txt_elems[$index + 4]; 478 $index = $index + 5; 479 } else { 480 # Program don't have Description 481 $index = $index + 4; 482 } 483 my %h = ( time => $p_stime, 484 category=> $p_category, 485 title=> $p_title, 486 desc => $p_desc ); 487 $h{subtitle} = $p_subtitle if defined $p_subtitle; 488 push @data, \%h; 489# t "Next time?: $txt_elems[$index]"; 490 } # end while prof the program 491 } 492 t $txt_elems[$index]; 493 $index = $index + 1; 494 } 495 return @data; 496} 497sub get_txt_elems { 498 my ($tree) = @_; 499 500 my @txt_elem; 501 my @txt_cont = $tree->look_down( 502 sub { ($_[0]->descendants() eq 0 ) }, 503 sub { defined($_[0]->attr ("_content") ) } ); 504 foreach my $txt (@txt_cont) { 505 my @children=$txt->content_list; 506 if (defined($children[0])) { 507 for (my $tmp=$children[0]) { 508 s/^\s+//;s/\s+$//; 509 push @txt_elem, $_; 510 } 511 } 512 } 513 return @txt_elem; 514} 515 516# get channel listing 517sub get_channels { 518 my $bar = new XMLTV::ProgressBar('getting list of channels', 1) 519 if not $opt_quiet; 520 my %channels; 521 my $url="http://www.elpais.es/parrillatv/portada.html"; 522 t $url; 523 524 my $tree = get_nice_tree $url; 525 my @menus = $tree->find_by_tag_name("_tag"=>"select"); 526 527 foreach my $elem (@menus) { 528 my $cname = $elem->attr('name'); 529 if ($cname eq "canal") { 530 my @ocanals = $elem->find_by_tag_name("_tag"=>"option"); 531 @ocanals = sort @ocanals; 532 foreach my $opt (@ocanals) { 533 if (not $opt->attr('value') eq "") { 534 my @str = split (/-/,$opt->attr('value')); 535 my $channel_id = $str[0]; 536 my $channel_name=$str[1]; 537 my $channel_num = $channel_id; 538 if (length $channel_id eq 1) { 539 $channel_id = "0" . $channel_id 540 } 541 $channels{$channel_id}=$channel_name; 542 push @ch_all, { 'display-name' => [ [ $str[1], $LANG ], 543 [ $channel_num ] ], 544 'id'=> "$channel_id.elpais.es" }; 545 } 546 } 547 } 548 } 549 die "no channels could be found" if not keys %channels; 550 update $bar if not $opt_quiet; 551 $bar->finish() if not $opt_quiet; 552 return %channels; 553} 554 555 556# Bump a YYYYMMDD date by one. 557sub nextday { 558 my $d = shift; 559 my $p = parse_date($d); 560 my $n = DateCalc($p, '+ 1 day'); 561 return UnixDate($n, '%Q'); 562} 563 564