1#!/usr/local/bin/perl -w 2 3=pod 4 5=head1 NAME 6 7tv_grab_es_laguiatv - Alternative TV grabber for Spain. 8 9=head1 SYNOPSIS 10 11tv_grab_es_laguiatv --help 12 13tv_grab_es_laguiatv [--config-file FILE] --configure [--gui OPTION] 14 15tv_grab_es_laguiatv [--config-file FILE] [--output FILE] [--days N] 16 [--offset N] [--quiet] 17 18tv_grab_es_laguiatv --list-channels 19 20tv_grab_es_laguiatv --capabilities 21 22tv_grab_es_laguiatv --version 23 24=head1 DESCRIPTION 25 26Output TV listings for spanish channels from www.laguiatv.com. 27Supports analogue and digital (D+) channels. 28The grabber relies on parsing HTML so it might stop working at any time. 29 30First run B<tv_grab_es_laguiatv --configure> to choose, which channels you want 31to download. Then running B<tv_grab_es_laguiatv> with no arguments will output 32listings in XML format to standard output. 33 34B<--configure> Prompt for which channels, 35and write the configuration file. 36 37B<--config-file FILE> Set the name of the configuration file, the 38default is B<~/.xmltv/tv_grab_es_laguiatv.conf>. This is the file written by 39B<--configure> and read when grabbing. 40 41B<--gui OPTION> Use this option to enable a graphical interface to be used. 42OPTION may be 'Tk', or left blank for the best available choice. 43Additional allowed values of OPTION are 'Term' for normal terminal output 44(default) and 'TermNoProgressBar' to disable the use of XMLTV::ProgressBar. 45 46B<--output FILE> Write to FILE rather than standard output. 47 48B<--days N> Grab N days. The default is 3. 49 50B<--offset N> Start N days in the future. The default is to start 51from today. 52 53B<--quiet> Suppress the progress messages normally written to standard 54error. 55 56B<--capabilities> Show which capabilities the grabber supports. For more 57information, see L<http://wiki.xmltv.org/index.php/XmltvCapabilities> 58 59B<--version> Show the version of the grabber. 60 61B<--help> Print a help message and exit. 62 63=head1 SEE ALSO 64 65L<xmltv(5)>. 66 67=head1 AUTHOR 68 69CandU, candu_sf@sourceforge.net, based on tv_grab_es, from Ramon Roca. 70 71=head1 BUGS 72 73=cut 74 75# 76 77 78###################################################################### 79# initializations 80 81use strict; 82use XMLTV::Version '$Id: tv_grab_es_laguiatv,v 1.25 2015/03/24 17:56:38 bilbo_uk Exp $ '; 83use XMLTV::Capabilities qw/baseline manualconfig cache/; 84use XMLTV::Description 'Spain (laguiatv.com)'; 85use Getopt::Long; 86use Date::Manip; 87use HTML::TreeBuilder; 88use HTML::Entities; # parse entities 89use IO::File; 90use DateTime; 91 92use LWP::Simple; 93use Encode; 94 95use XMLTV; 96use XMLTV::Memoize; 97use XMLTV::ProgressBar; 98use XMLTV::Ask; 99use XMLTV::Config_file; 100use XMLTV::DST; 101use XMLTV::Get_nice 0.005065; 102use XMLTV::Mode; 103use XMLTV::Date; 104# Todo: perhaps we should internationalize messages and docs? 105use XMLTV::Usage <<END 106$0: get Spanish television listings in XMLTV format 107To configure: $0 --configure [--config-file FILE] 108To grab listings: $0 [--config-file FILE] [--output FILE] [--days N] 109 [--offset N] [--quiet] 110To list channels: $0 --list-channels 111To show capabilities: $0 --capabilities 112To show version: $0 --version 113END 114 ; 115 116# Attributes of the root element in output. 117my $HEAD = { 'source-info-url' => 'http://www.laguiatv.com/programacion/', 118 'source-data-url' => "http://www.laguiatv.com/programacion/", 119 'generator-info-name' => 'XMLTV', 120 'generator-info-url' => 'http://xmltv.org/', 121 }; 122 123my $WRITE_ZERO_LENGTH = 0; # whether zero-length programmes should be included in the output. 124my $DO_SLOWER_DESC_GET = 0; 125my $CONFIG_VERSION = 1; # default to v1 (v1 doesnt have version info) 126my $EXPECTED_CONFIG_VERSION = 3; 127my $CONFIG_USECACHE = 0; # whether to use a disc cache for web pages 128my $CONFIG_CACHEDIR; # directory to store cached web pages 129 130# default language 131my $LANG="es"; 132 133# default web page encoding 134my $WEB_ENCODING = 'iso-8859-15'; 135 136# Global channel_data 137our @ch_all; 138 139my @hide_channels = ( 140 "canal-bar.a", # currently gives 404 not found 141); 142 143 144###################################################################### 145# get options 146 147# Get options, including undocumented --cache option. 148XMLTV::Memoize::check_argv('XMLTV::Get_nice::get_nice_aux'); 149my ($opt_days, $opt_offset, $opt_help, $opt_output, 150 $opt_configure, $opt_config_file, $opt_gui, 151 $opt_quiet, $opt_list_channels, $opt_debug); 152$opt_days = 4; # default 153$opt_offset = 0; # default 154$opt_quiet = 0; # default 155$opt_debug = 0; # default 156GetOptions('days=i' => \$opt_days, 157 'offset=i' => \$opt_offset, 158 'help' => \$opt_help, 159 'configure' => \$opt_configure, 160 'config-file=s' => \$opt_config_file, 161 'gui:s' => \$opt_gui, 162 'output=s' => \$opt_output, 163 'quiet' => \$opt_quiet, 164 'list-channels' => \$opt_list_channels, 165 'debug' => \$opt_debug, 166 ) 167 or usage(0); 168 169# Force days to be 1, since we get all days at once 170# $opt_days = 1; 171die 'number of days must not be negative' 172 if (defined $opt_days && $opt_days < 0); 173usage(1) if $opt_help; 174 175# [mod Jan 2014 - max days is 4 176die 'max days available is 4 (today + 3)' 177 if ( $opt_offset + $opt_days > 4 ); 178 179XMLTV::Ask::init($opt_gui); 180 181 182# Although we use HTTP::Cache::Transparent, this undocumented --cache 183# option for debugging is still useful since it will _always_ use a 184# cached copy of a page, without contacting the server at all. 185# 186use XMLTV::Memoize; XMLTV::Memoize::check_argv('XMLTV::Get_nice::get_nice_aux'); 187 188 189# debug print function 190sub debug_print 191{ 192 print STDERR $_[0]."\n" if $opt_debug; 193} 194 195my $mode = XMLTV::Mode::mode('grab', # default 196 $opt_configure => 'configure', 197 $opt_list_channels => 'list-channels', 198 ); 199 200# File that stores which channels to download. 201my $config_file 202 = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_es_laguiatv', $opt_quiet); 203 204my @config_lines; # used only in grab mode 205if ($mode eq 'configure') { 206 XMLTV::Config_file::check_no_overwrite($config_file); 207} 208elsif ($mode eq 'grab') { 209 @config_lines = XMLTV::Config_file::read_lines($config_file); 210} 211elsif ($mode eq 'list-channels') { 212 # Config file not used. 213} 214else { die } 215 216# Whatever we are doing, we need the channels data. 217my %channels = get_channels(); # sets @ch_all 218my @channels; 219 220my %icons; 221 222my %categories = ( 223 "tag-a" => "Cine", 224 "tag-b" => "Deportes", 225 "tag-c" => "Programas", 226 "tag-d" => "Series", 227 "tag-e" => "Noticias" 228); 229 230###################################################################### 231# write configuration 232 233if ($mode eq 'configure') { 234 %channels = get_channels(); 235 236 open(CONF, ">$config_file") or die "cannot write to $config_file: $!"; 237 238 print CONF "configversion 3\n"; 239 240 # Ask about using a cache 241 my $usecache = ask_boolean("Do you want to use a cache for web pages (recommended)", 'yes'); 242 warn("cannot read input, using default") 243 if not defined $usecache; 244 245 print CONF "usecache "; 246 print CONF "yes\n" if $usecache; 247 print CONF "no\n" if not $usecache; 248 249 my $cachedir = "$ENV{HOME}/.xmltv/cache"; 250 if ($usecache) 251 { 252 my $cachedir = ask("Directory for cache (default=$cachedir)"); 253 warn("cannot read input, using default") 254 if not defined $cachedir; 255 } 256 print CONF "cachedir ".$cachedir."\n"; 257 258 # Ask about getting descs 259 my $getdescs = ask_boolean("Do you want to get descriptions (very slow)", 'yes'); 260 warn("cannot read input, using default") 261 if not defined $getdescs; 262 263 print CONF "getdescriptions "; 264 print CONF "yes\n" if $getdescs; 265 print CONF "no\n" if not $getdescs; 266 267 #my $cacheicons = ask_boolean('Do you want to get and cache icons during configure', 'yes'); 268 #warn("cannot read input, using default") 269 # if not defined $cacheicons; 270 271 # Ask about each channel. 272 my @chs = sort { $channels{$a} cmp $channels{$b} } keys %channels; 273 my @names = map { $channels{$_} } @chs; 274 my @qs = map { "Add channel $_?" } @names; 275 my @want = ask_many_boolean(1, @qs); 276 277 #my $iconbar = new XMLTV::ProgressBar({name => 'getting icon urls', count => scalar @chs}) 278 #if ((not $opt_quiet) && $cacheicons); 279 280 foreach (@chs) { 281 my $w = shift @want; 282 warn("cannot read input, stopping channel questions"), last 283 if not defined $w; 284 # No need to print to user - XMLTV::Ask is verbose enough. 285 286 # Print a config line, but comment it out if channel not wanted. 287 print CONF '#' if not $w; 288 my $name = shift @names; 289# if ($cacheicons) 290# { 291# my $icon = get_icon($_); 292# print CONF "channel $_ $name icon:$icon\n"; 293# } 294# else 295# { 296 print CONF "channel $_ ".encode($WEB_ENCODING, $name)."\n"; 297# } 298 # TODO don't store display-name in config file. 299 300# update $iconbar if ((not $opt_quiet) && $cacheicons); 301 } 302 303 close CONF or warn "cannot close $config_file: $!"; 304 say("Finished configuration."); 305 306 exit(); 307} 308 309 310# Not configuration, we must be writing something, either full 311# listings or just channels. 312# 313die if $mode ne 'grab' and $mode ne 'list-channels'; 314 315# Options to be used for XMLTV::Writer. 316my %w_args; 317if (defined $opt_output) { 318 my $fh = new IO::File(">$opt_output"); 319 die "cannot write to $opt_output: $!" if not defined $fh; 320 $w_args{OUTPUT} = $fh; 321} 322$w_args{encoding} = 'UTF-8'; 323my $writer = new XMLTV::Writer(%w_args); 324$writer->start($HEAD); 325 326if ($mode eq 'list-channels') { 327 $writer->write_channel($_) foreach @ch_all; 328 $writer->end(); 329 exit(); 330} 331 332###################################################################### 333# We are producing full listings. 334die if $mode ne 'grab'; 335 336# Read configuration 337my $line_num = 1; 338foreach (@config_lines) { 339 ++ $line_num; 340 next if not defined; 341 342 if (/configversion:?\s+(\S+)/) 343 { 344 $CONFIG_VERSION = $1; 345 } 346 elsif (/usecache:?\s+(\S+)/) 347 { 348 if($1 eq "yes") 349 { 350 $CONFIG_USECACHE = 1; 351 } 352 } 353 elsif (/cachedir:?\s+(\S+)/) 354 { 355 $CONFIG_CACHEDIR = $1; 356 } 357 elsif (/getdescriptions:?\s+(\S+)/) 358 { 359 if("$CONFIG_VERSION" ne "$EXPECTED_CONFIG_VERSION") 360 { 361 die "Config file is out of date, please rerun with --configure\n"; 362 } 363 if($1 eq "yes") 364 { 365 $DO_SLOWER_DESC_GET = 1; 366 } 367 } 368 elsif (/^channel:?\s+(\S+)\s+([^#]+)icon\:([^#]+)/) 369 { 370 my $ch_did = $1; 371 my $ch_name = $2; 372 my $ch_icon = $3; 373 374 375 #debug_print "Got channel $ch_name icon $ch_icon\n"; 376 $ch_name =~ s/\s*$//; 377 push @channels, $ch_did; 378 $channels{$ch_did} = $ch_name; 379 $icons{$ch_did} = $ch_icon; 380 } 381 elsif (/^channel:?\s+(\S+)\s+([^#]+)/) 382 { 383 my $ch_did = $1; 384 my $ch_name = $2; 385 386 debug_print "Fetching channel $ch_name"; 387 $ch_name =~ s/\s*$//; 388 push @channels, $ch_did; 389 $channels{$ch_did} = $ch_name; 390 } 391 else { 392 warn "$config_file:$line_num: bad line\n"; 393 } 394} 395 396 397 398if ($CONFIG_USECACHE) { 399use HTTP::Cache::Transparent; 400HTTP::Cache::Transparent::init( { 401 BasePath => $CONFIG_CACHEDIR, 402 NoUpdate => 60*60, # cache time in seconds 403 MaxAge => 4, # flush time in hours 404 Verbose => $opt_debug, 405} ); 406} 407 408 409 410 411###################################################################### 412# begin main program 413 414# Assume the listings source uses CET (see BUGS above). 415my $now = DateCalc(parse_date('now'), "$opt_offset days"); 416die "No channels specified, run me with --configure\n" 417 if not keys %channels; 418my @to_get; 419 420my $iconbar = new XMLTV::ProgressBar({name => 'getting channel info', count => scalar @channels}) 421 if not $opt_quiet; 422# the order in which we fetch the channels matters 423foreach my $ch_did (@channels) { 424 my $ch_name=$channels{$ch_did}; 425 my $ch_xid="$ch_did.laguiatv.com"; 426# my $ch_icon=$icons{$ch_did}; 427# if (!$ch_icon) 428# { 429# $ch_icon = get_icon($ch_did); 430# } 431# 432# if(index($ch_icon, "shim.gif") < 0) 433# { 434# $writer->write_channel({ id => $ch_xid, 435# 'display-name' => [ [ $ch_name ] ] , 436# 'icon' => [ { 'src' => $ch_icon } ] }); 437# } 438# else 439# { 440 $writer->write_channel({ id => $ch_xid, 441 'display-name' => [ [ $ch_name ] ] }); 442# } 443 444 # [Jan 2014] - current website offers a fixed 4 days of data 445 # my $day=UnixDate($now,'%Q'); 446 # for (my $i=0;$i<$opt_days;$i++) { 447 # push @to_get, [ $day, $ch_xid, $ch_did ]; 448 # #for each day 449 # $day=nextday($day); die if not defined $day; 450 # } 451 # 452 push @to_get, [ '', $ch_xid, $ch_did ]; 453 454 update $iconbar if not $opt_quiet; 455} 456 457# This progress bar is for both downloading and parsing. Maybe 458# they could be separate. 459# 460my $bar = new XMLTV::ProgressBar({name => 'getting listings', count => scalar @to_get}) 461 if not $opt_quiet; 462foreach (@to_get) { 463 debug_print "process $_->[0], $_->[1], $_->[2]\n"; 464 foreach (process_table($_->[0], $_->[1], $_->[2])) { 465 $writer->write_programme($_); 466 } 467 update $bar if not $opt_quiet; 468} 469$bar->finish() if not $opt_quiet; 470$writer->end(); 471 472###################################################################### 473# subroutine definitions 474 475# Use Log::TraceMessages if installed. 476BEGIN { 477 eval { require Log::TraceMessages }; 478 if ($@) { 479 *t = sub {}; 480 *d = sub { '' }; 481 } 482 else { 483 *t = \&Log::TraceMessages::t; 484 *d = \&Log::TraceMessages::d; 485 Log::TraceMessages::check_argv(); 486 } 487} 488 489#### 490# process_table: fetch a URL and process it 491# 492# arguments: 493# Date::Manip object giving the day to grab 494# xmltv id of channel 495# elpais.es id of channel 496# 497# returns: list of the programme hashes to write 498# 499sub process_table { 500 501 my ($date, $ch_xmltv_id, $ch_es_id) = @_; 502 503 my $ch_conv_id = convert_id_to_laguiatvid($ch_es_id); 504 my $today = UnixDate($date, '%d/%m/%Y'); 505 506 my $url = 'http://www.laguiatv.com/programacion/'.$ch_es_id.'.html'; 507 debug_print "Getting $url\n"; 508 t $url; 509 local $SIG{__WARN__} = sub 510 { 511 warn "$url: $_[0]"; 512 }; 513 514 # parse the page to a document object 515 my $tree; 516 # HTML::Parse keeps reporting "Parsing of undecoded UTF-8 will give garbage when decoding entities" yet I can see no UTF8 in the pages! 517 # Save the page and run it again and you don't get the warning! 518 # You can't even supress the warning! What a crock. 519 { 520 local $SIG{__WARN__} = sub { 521 warn @_ unless (defined $_[0] && $_[0] =~ /^Parsing of undecoded UTF-/); 522 }; 523 $tree = get_nice_tree($url,'',$WEB_ENCODING); 524 } 525 526 my @program_data = get_program_data($tree); 527 my $bump_start_day=0; 528 529 my @r; 530 while (@program_data) { 531 my $cur = shift @program_data; 532 my $next = shift @program_data; 533 unshift @program_data,$next if $next; 534 535 my $p = make_programme_hash($date, $ch_xmltv_id, $ch_es_id, $cur, $next); 536 if (not $p) { 537 require Data::Dumper; 538 my $d = Data::Dumper::Dumper($cur); 539 warn "cannot write programme on $ch_xmltv_id on $date:\n$d\n"; 540 } 541 else { 542 push @r, $p; 543 } 544 545# if (!$bump_start_day && bump_start_day($cur,$next)) { 546# #$bump_start_day=1; 547# $date = UnixDate(DateCalc($date,"+ 1 day"),'%Q'); 548# } 549 } 550 return @r; 551} 552 553sub make_programme_hash { 554 my ($date, $ch_xmltv_id, $ch_es_id, $cur, $next) = @_; 555 556 #require Data::Dumper; debug_print Data::Dumper::Dumper($cur); 557 558 my %prog; 559 560 $prog{channel}=$ch_xmltv_id; 561 $prog{title}=[ [ encode( 'UTF-8', $cur->{title} ), $LANG ] ]; 562 $prog{"sub-title"}=[ [ encode( 'UTF-8', $cur->{subtitle} ), $LANG ] ] if defined $cur->{subtitle}; 563 # $prog{category}=[ [ $cur->{category}, $LANG ] ]; 564 $prog{start}=$cur->{stime}; 565 $prog{stop} =$cur->{etime} if defined $cur->{etime}; 566 $prog{desc}=[ [ encode( 'UTF-8', $cur->{desc} ), $LANG ] ] if defined $cur->{desc}; 567 # $prog{category}=[ [ encode( 'UTF-8', $cur->{category} ), $LANG ] ] if defined $cur->{category}; 568 $prog{'date'} = $cur->{year} if defined $cur->{year}; 569 $prog{'star-rating'} = [ $cur->{rating} . '/5' ] if defined $cur->{rating}; 570 $prog{'rating'} = [[ $cur->{classification}, '' ]] if defined $cur->{classification}; 571 572 if (defined $cur->{genres}) 573 { 574 foreach ( @{ $cur->{genres} } ) 575 { 576 push @{$prog{'category'}}, [ encode('UTF-8', $_), $LANG ] if $_ ne ''; 577 } 578 } 579 if (defined $cur->{directors}) 580 { 581 foreach ( @{ $cur->{directors} } ) 582 { 583 push @{$prog{'credits'}{'director'}}, encode('UTF-8', $_) if $_ ne ''; 584 } 585 } 586 if (defined $cur->{actors}) 587 { 588 foreach ( @{ $cur->{actors} } ) 589 { 590 push @{$prog{'credits'}{'actor'}}, encode('UTF-8', $_) if $_ ne ''; 591 } 592 } 593 594 595 return \%prog; 596} 597sub bump_start_day { 598 my ($cur,$next) = @_; 599 if (!defined($next)) { 600 return undef; 601 } 602 my $start = UnixDate($cur->{stime},'%H:%M'); 603 my $stop = UnixDate($next->{stime},'%H:%M'); 604 if (Date_Cmp($start,$stop)>0) { 605 return 1; 606 } else { 607 return 0; 608 } 609} 610 611 612# 613sub get_program_data 614{ 615 my ($tree) = @_; 616 my @data; 617 618 my $today = DateTime->today->set_time_zone('Europe/Madrid'); 619 620 # - current website offers a fixed 4 days of data 621 # ignore any programmes outside requested range 622 my $startgrab = $today->clone->add('days' => $opt_offset)->epoch(); 623 my $stopgrab = $today->clone->add('days' => ($opt_offset + $opt_days))->epoch(); 624 debug_print 'Grab times: start: '.DateTime->from_epoch(epoch=>$startgrab)->strftime("%Y %m %d %H%M %S %z").' stop: '.DateTime->from_epoch(epoch=>$stopgrab)->strftime("%Y %m %d %H%M %S %z"); 625 # find schedule table 626 627 # the following could could do with some error checking but I don't have time to do that right now :-( 628 629 my @divs = $tree->look_down('_tag' => 'div', 'id' => qr/dia1|nad2|nad3|nad4/); 630 631 foreach my $div (@divs) 632 { 633 my ($i) = $div->attr('id') =~ /(?:dia|nad)(\d)/; 634 #'debugtime' debug_print "i= $i ".$div->attr('id'); 635 #'debugtime' debug_print 'today: '.$today->strftime("%Y %m %d %H%M %S %z"); 636 my $theday = $today->clone->add(days => ($i - 1)); 637 #'debugtime' debug_print 'theday: '.$theday->strftime("%Y %m %d %H%M %S %z"); 638 639 my @trs = $div->look_down('_tag' => 'tr'); 640 641 foreach my $tr (@trs) 642 { 643 644 my $stime = $tr->look_down('_tag' => 'th')->as_text; 645 trim($stime); 646 647 my $p_div = $tr->look_down('_tag' => 'div', 'class' => 'programa'); 648 next if !$p_div; 649 650 my $a = $p_div->look_down('_tag' => 'a'); 651 652 my $p_url = $a->attr('href'); 653 my $p_title = $a->as_text; 654 655 my $p_times = $p_div->look_down('_tag' => 'p')->as_text; 656 657 my ($h, $i, $h2, $i2) = $p_times =~ /(\d*):(\d*)(?: *a *(\d*):(\d*))?/; 658 659 my $showtime = $theday->clone->set(hour => $h, minute => $i, second => 0); 660 661 # - current website offers a fixed 4 days of data 662 # ignore any programmes outside requested range 663 #'debugtime' debug_print 'this: '.$showtime->strftime("%Y %m %d %H%M %S %z"); 664 next if ( $showtime->epoch() < $startgrab ) || ( $showtime->epoch() >= $stopgrab ); 665 666 my $p_stime = $theday->clone->set(hour => $h, minute => $i, second => 0)->strftime("%Y%m%d%H%M%S %z"); 667 668 my $p_etime; 669 # this will probably fail around DST times 670 if (defined $h2 && $h2 >= 0) 671 { 672 $showtime->add(days => 1) if $h2 < $h; 673 eval { # try 674 $showtime->set(hour => $h2, minute => $i2, second => 0); 675 $p_etime = $showtime->strftime("%Y%m%d%H%M%S %z"); 676 } or do { # catch 677 # no output prog 'stop' time 678 } 679 } 680 681 682 # get descriptions? Kinda compulsory now since there is no longer *any* description on the schedule page 683 # 684 my ($p_description, $p_rating, $p_classification, $p_year, @p_genres, @p_actors, @p_directors) = ('', '', '', '', (), (), ()); 685 686 # 687 { # begin code block 688 if ($DO_SLOWER_DESC_GET) # get descriptions 689 { 690 691 my $url = $p_url; 692 debug_print "Getting $url"; 693 t $url; 694 695 last if $url eq 'javascript:void(0);' ; 696 697 # handle no programme info situation (probably means "Close"?) : 698 # <tr> 699 # <th scope="col"> 04:00</th> 700 # <td class="" data-type="programas"> 701 # <div class="programa"> 702 # <h2><a href="http://laguiatv.abc.es/programas/-72840/" title="*">*</a></h2> 703 # <p> 04:00 a 06:00</p> 704 # </div> 705 # </td> 706 #</tr> 707 last if $url =~ m%programas/-\d*/$%; 708 709 # parse the page to a document object 710 # HTML::Parse keeps reporting "Parsing of undecoded UTF-8 will give garbage when decoding entities" yet I can see no UTF8 in the pages! 711 # Often on the http://hoycinema.abc.es/ pages. (Could be due to the <script> they have *before* the <meta Content-Type> ?) 712 { 713 local $SIG{__WARN__} = sub { 714 warn @_ unless (defined $_[0] && $_[0] =~ /^Parsing of undecoded UTF-/); 715 }; 716 $tree = get_nice_tree($url,'',$WEB_ENCODING); 717 } 718 719 my $div = $tree->look_down('_tag' => 'div', 'id' => 'contenedor'); # container 720 $tree->dump if !$div; 721 exit if !$div; 722 723 # see if the title has a year 724 # <h1 itemprop="name">«Fin de semana al desnudo» 725 # <span><a href="/peliculas/1974.html" title="Películas del año 1974">(1974)</a></span> 726 # </h1> 727 my $h1 = $div->look_down('_tag' => 'h1', 'itemprop' => 'name'); 728 if ($h1) 729 { 730 my $a = $h1->look_down('_tag' => 'a'); 731 ($p_year) = $a->as_text =~ /^\((19\d\d|20\d\d)\)$/ if $a; 732 } 733 734 $div = $div->look_down('_tag' => 'div', 'class' => 'modulo'); 735 last if !$div; 736 737 # get the various <dl> blocks 738 my @dls = $div->look_down('_tag' => 'dl'); 739 foreach my $dl (@dls) 740 { 741 my $dt = $dl->look_down('_tag' => 'dt'); 742 next if !$dt; 743 744 if ($dt->as_text =~ /Informaci.n/) 745 { 746 #<dl class="datos"> 747 # <dt>Información:</dt> 748 # <dd class="calificacion">SC</dd> 749 # <dd itemprop="genre">Comedia</dd> 750 # <dd itemprop="duration">93 minutos</dd> 751 # <dd>Sin especificar</dd> OR e.g. <dd>EE.UU.</dd> 752 #</dl> 753 my $t = $dl->look_down('_tag' => 'dd', 'class' => 'calificacion'); 754 $p_classification = $t->as_text if $t; 755 # 756 $t = $dl->look_down('_tag' => 'dd', 'itemprop' => 'genre'); 757 @p_genres = split(/,/, $t->as_text) if $t; 758 } 759 760 if ($dt->as_text =~ /Director/) 761 { 762 #<dl> 763 # <dt>Director:</dt> 764 # <dd itemprop="director" itemscope itemtype="http://schema.org/Person"> 765 # <a href="/perfil-cine/peter-webber-97721/" title="Peter Webber"><span itemprop="name">Peter Webber</span></a> 766 # <a href="/perfil-cine/mariano-ozores-20092/" title="Mariano Ozores"><span itemprop="name">Mariano Ozores</span></a> 767 # </dd> 768 #</dl> 769 my @t = $dl->look_down('_tag' => 'span', 'itemprop' => 'name'); 770 foreach (@t) 771 { 772 push @p_directors, $_->as_text; 773 } 774 } 775 776 if ($dt->as_text =~ /Int.rpretes/) 777 { 778 #<dl class="interpretes" itemprop="actor" itemscope itemtype="http://schema.org/Person"> 779 # <dt>Intérpretes:</dt> 780 # <dd> 781 # <a href="/perfil-cine/mandy-patinkin-4279/" title="Mandy Patinkin"><span itemprop="name">Mandy Patinkin</span></a>, 782 # <a href="/perfil-cine/alfredo-landa-15923/" title="Alfredo Landa"><span itemprop="name">Alfredo Landa</span></a>, 783 # <a href="/perfil-cine/thomas-gibson-18922/" title="Thomas Gibson"><span itemprop="name">Thomas Gibson</span></a>, 784 # </dd> 785 # <dd class="enlace"><a href="/peliculas/1974/fin-de-semana-al-desnudo-9814/reparto.html" title="Reparto completo">Reparto completo</a></dd> 786 #</dl> 787 my @t = $dl->look_down('_tag' => 'span', 'itemprop' => 'name'); 788 foreach (@t) 789 { 790 push @p_actors, $_->as_text; 791 } 792 # note: we could follow the "Reparto completo" link for the complete cast list 793 } 794 795 if ($dt->as_text =~ /Descripci.n/) 796 { 797 #<dl> 798 # <dt>Descripción:</dt> 799 # <dd>Programa que repasa todas las noticias de interés general, nacionales, internacionales, así como deportivas. Incluye El tiempo. Presentador: Albert Martínez.</dd> 800 #</dl> 801 my $t = $dl->look_down('_tag' => 'dd'); 802 $p_description = $t->as_text if $t; 803 } 804 805 if ($dt->as_text =~ /Sinopsis/) 806 { 807 #<dl class="sinopsis"> 808 # <dt>Sinopsis:</dt> 809 # <dd itemprop="description">La historia de la psicóloga Virginia Johnson (Lizzy Caplan) y el tímido ginecólogo William Masters (Michael Sheen),...<a href="/series/masters-of-sex-25058/sinopsis.html" title="Leer sinopsis completa">Leer sinopsis completa</a>.</dd> 810 #</dl> 811 # 812 # For a *really* long description we could follow the 'Leer sinopsis completa' link but we won't do that until someone asks for it! ;-) 813 # 814 if ($p_description eq '') 815 { 816 my $t = $dl->look_down('_tag' => 'dd', 'itemprop' => 'description'); 817 if ($t) 818 { 819 my $a = $t->look_down('_tag' => 'a'); 820 $a->detach if $a; 821 $p_description = $t->as_text; 822 } 823 } 824 } 825 826 827 } 828 829 # if no description then try for a synopis 830 # <aside id="sinopsis"><h2>Sinopsis</h2><p itemprop="description">... 831 if ($p_description eq '') 832 { 833 my $h2 = $tree->look_down('_tag' => 'h2', sub { $_[0]->as_text =~ /Sinopsis/ } ); 834 if ($h2) 835 { 836 my $p = $h2->right(); 837 $p_description = $p->as_text if ( $p->tag() eq 'p' && $p->attr('itemprop') eq 'description' ); 838 # 839 # website sometimes has invalid html (nested <p>) which treebuilder flattens 840 # so append <p> siblings 841 while (1) 842 { 843 $p = $p->right(); 844 last if ( $p->tag() ne 'p' ); 845 $p_description .= $p->as_text if ( $p->tag() eq 'p' ); 846 } 847 } 848 } 849 850 # rating (x/5) 851 # <meta itemprop="ratingValue" content="2.2"/> 852 my $meta = $div->look_down('_tag' => 'meta', 'itemprop' => 'ratingValue'); 853 if ($meta) 854 { 855 $p_rating = $meta->attr('content'); 856 } 857 858 859 } 860 } # end code block 861 862 #debug_print("title: $p_title start: $p_stime end: $p_etime cat: $p_category c2: " . $categories{$p_category} . "\n"); 863 debug_print("title: $p_title start: $p_stime end: ".(defined $p_etime?$p_etime:'')); 864 865 # 2014-04-02 ignore programme where title = * 866 # <h2> <a href="javascript:void(0);" title="*">*</a> </h2> 867 # 868 if($p_title && $p_title ne "" && $p_title ne "*" && $p_stime && $p_stime ne "") 869 { 870 my %h = ('stime' => $p_stime, 871 'etime' => $p_etime, 872 'title' => $p_title, 873 ); 874 $h{year} = $p_year if defined $p_year && $p_year ne ""; 875 $h{rating} = $p_rating if $p_rating ne ""; 876 $h{desc} = $p_description if $p_description ne ""; 877 $h{classification} = $p_classification if $p_classification ne ""; 878 $h{directors} = \@p_directors if scalar @p_directors > 0; 879 $h{actors} = \@p_actors if scalar @p_actors > 0; 880 $h{genres} = \@p_genres if scalar @p_genres > 0; 881 882 push @data, \%h; 883 884 } 885 886 } 887 888 } 889 return @data; 890} 891 892sub get_icon 893{ 894 my ($ch_did) = @_; 895 896 return ""; 897 898 my $url = "http://www.laguiatv.com/programacion/$ch_did"; 899 debug_print "Getting $url\n"; 900 t $url; 901 local $SIG{__WARN__} = sub 902 { 903 warn "$url: $_[0]"; 904 }; 905 906 my $content = get $url; 907 my $pos = index($content, '<table class="grid cadena">'); 908 if($pos > 0) 909 { 910 $pos = index($content, '<img src="', $pos); 911 if($pos > 0) 912 { 913 $pos += 10; 914 my $end = index($content, '"', $pos); 915 916 my $icon = 'http://www.laguiatv.com/' . substr($content, $pos, $end - $pos); 917 918 debug_print "icon $icon\n"; 919 return $icon; 920 } 921 } 922 923 return 'http://www.laguiatv.com/shim.gif'; 924} 925 926 927sub get_prog_info 928{ 929 my ($url) = @_; 930 my $desc = ""; 931 my $cat = ""; 932 933 $url = "http://www.laguiatv.com/".$url; 934 debug_print "Get proginfo $url\n"; 935 936 my $content = get $url; 937 my $pos = index($content, '<div class="intro-datasheet">'); 938 939 if($pos >= 0) 940 { 941 $pos = index($content, 'class="text">', $pos); 942 if($pos >= 0) 943 { 944 my $divend = index($content, '</div', $pos); 945 $pos = index($content, '<p', $pos); 946 947 while($pos >= 0 && $pos < $divend) 948 { 949 $pos = index($content, '>', $pos) + 1; 950 my $end = index($content, '</p>', $pos); 951 if($end >= 0) 952 { 953 $desc = $desc . substr($content, $pos, $end - $pos) . " "; 954 } 955 $pos = index($content, '<p', $pos); 956 } 957 } 958 } 959 960 decode_entities($desc); 961 $desc =~ s/<\S+\s*\/*\/*>//g; 962 $desc =~ s/\s+/ /g; 963 $desc =~ s/\s+$//g; 964 965 return ($desc, $cat); 966} 967 968sub get_txt_elems { 969 my ($tree) = @_; 970 971 my @txt_elem; 972 my @txt_cont = $tree->look_down( 973 sub { ($_[0]->descendants() eq 0 ) }, 974 sub { defined($_[0]->attr ("_content") ) } ); 975 foreach my $txt (@txt_cont) { 976 my @children=$txt->content_list; 977 if (defined($children[0])) { 978 for (my $tmp=$children[0]) { 979 s/^\s+//;s/\s+$//; 980 push @txt_elem, $_; 981 } 982 } 983 } 984 return @txt_elem; 985} 986 987# get channel listing 988sub get_channels 989{ 990 my $bar = new XMLTV::ProgressBar({name => 'finding channels', count => 1}) 991 if not $opt_quiet; 992 my %channels; 993 994 # the front page is very big and slow to parse, so we'll 995 # get channels via a dummy call to TVE 1 and then parse out the channel selector 996 my $url="http://www.laguiatv.com/programacion/tve-1-807.html"; 997 t $url; 998 999 my $channel_id; 1000 my $channel_name; 1001 my $channel_num; 1002 1003 my $tree = get_nice_tree($url,'',$WEB_ENCODING); 1004 1005 my @options = $tree->look_down('_tag' => 'select', 'id' => 'cadenas_programacion')->look_down('_tag' => 'option'); 1006 1007 foreach my $option (@options) 1008 { 1009 next if !$option->attr('value'); 1010 1011 # <option value="tve-1-807">TVE 1</option> 1012 $channel_name = $option->as_text; 1013 $channel_id = $option->attr('value'); 1014 ($channel_num) = $option->attr('value') =~ /.*?-(\d+)$/; 1015 1016 # remove channels that should not be listed 1017 my $hide = 0; 1018 foreach my $hide_id (@hide_channels) 1019 { 1020 if($channel_id =~ m/$hide_id/) 1021 { 1022 $hide = 1; 1023 } 1024 } 1025 1026 if($hide == 0) 1027 { 1028 $channels{$channel_id}=$channel_name; 1029 debug_print "Got channel $channel_name with id $channel_id" if $opt_list_channels; 1030 1031 my $coded_chan_name=encode("utf-8",$channel_name); 1032 push @ch_all, { 1033 'display-name' => [[ $coded_chan_name, $LANG ],[$channel_num]], 1034 'channel-num' => $channel_num, 1035 'id'=> "$channel_id.laguiatv" 1036 }; 1037 } 1038 1039 } 1040 1041 die "no channels could be found" if not keys %channels; 1042 update $bar if not $opt_quiet; 1043 $bar->finish() if not $opt_quiet; 1044 return %channels; 1045} 1046 1047sub convert_laguiatvid_to_id 1048{ 1049 my ($str) = @_; 1050 1051 1052 $str =~ s/([^A-Za-z0-9])/sprintf("-%02X", ord("$1"))/seg; 1053 1054 $str = "C" . $str; 1055 return $str; 1056} 1057 1058sub convert_id_to_laguiatvid 1059{ 1060 my ($str) = @_; 1061 1062 # convert -20 to + (to replace spaces) 1063 $str =~ s/-20/+/g; 1064 1065 # convert - to % for URL encoded chars 1066 $str =~ s/\-/%/g; 1067 1068 # strip the C off the front 1069 $str = substr($str, 1); 1070 1071 return $str; 1072} 1073 1074# Bump a DDMMYYYY date by one. 1075sub nextday { 1076 my $d = shift; 1077 my $p = parse_date($d); 1078 my $n = DateCalc($p, '+ 1 day'); 1079 return UnixDate($n, '%Q'); 1080} 1081 1082sub trim { 1083 # Remove leading & trailing spaces 1084 $_[0] =~ s/^\s+|\s+$//g; 1085} 1086 1087sub utf8 { 1088 # Catch the error: 1089 # "Parsing of undecoded UTF-8 will give garbage when decoding entities 1090 return decode('UTF-8', $_[0]); 1091} 1092