1#!/usr/local/bin/perl -w
2
3=pod
4
5=head1 NAME
6
7tv_grab_es - Grab TV listings for Spain.
8
9=head1 SYNOPSIS
10
11tv_grab_es --help
12
13tv_grab_es [--config-file FILE] --configure [--gui OPTION]
14
15tv_grab_es [--config-file FILE] [--output FILE] [--days N]
16           [--offset N] [--quiet]
17
18tv_grab_es --list-channels
19
20tv_grab_es --capabilities
21
22tv_grab_es --version
23
24=head1 DESCRIPTION
25
26Output TV listings for several channels available in Spain.
27Now supports the terrestrial analog tv listings, which is the most common tv
28viewed in Spain that currently has no EPG information. I have plans to add
29Satelite listings (now the Spanish platforms are in a merger process between
30providers and also cable (the listings has to be grabbed from different sites)).
31The tv listings comes from www.elpais.es
32The grabber relies on parsing HTML so it might stop working at any time.
33
34First run B<tv_grab_es --configure> to choose, which channels you want
35to download. Then running B<tv_grab_es> with no arguments will output
36listings in XML format to standard output.
37
38B<--configure> Prompt for which channels,
39and write the configuration file.
40
41B<--config-file FILE> Set the name of the configuration file, the
42default is B<~/.xmltv/tv_grab_es.conf>.  This is the file written by
43B<--configure> and read when grabbing.
44
45B<--gui OPTION> Use this option to enable a graphical interface to be used.
46OPTION may be 'Tk', or left blank for the best available choice.
47Additional allowed values of OPTION are 'Term' for normal terminal output
48(default) and 'TermNoProgressBar' to disable the use of XMLTV::ProgressBar.
49
50B<--output FILE> Write to FILE rather than standard output.
51
52B<--days N> Grab N days.  The default is 3.
53
54B<--offset N> Start N days in the future.  The default is to start
55from today.
56
57B<--quiet> Suppress the progress messages normally written to standard
58error.
59
60B<--capabilities> Show which capabilities the grabber supports. For more
61information, see L<http://wiki.xmltv.org/index.php/XmltvCapabilities>
62
63B<--version> Show the version of the grabber.
64
65B<--help> Print a help message and exit.
66
67=head1 SEE ALSO
68
69L<xmltv(5)>.
70
71=head1 AUTHOR
72
73Ramon Roca, Ramon.Roca@XCombo.com, based on tv_grab_fi, from Matti Airas.
74
75=head1 BUGS
76
77=cut
78
79# Author's TODOs & thoughts
80#
81# this is for analog listings, for D+ satellite listings please use
82#       tv_grab_es_digital
83#
84# get the icons of each grabbed channel from the website
85#
86# findout how to setup properly the language, (catalan, basque, galician, vo)
87#
88# get channel ids in RFC2838 format (I don't, actually the Id comes directly
89# 	web site, i don't know where to go for getting th id's for spanish
90#	tv broadcasters.
91#
92# do the listings from another site, just in case this one breaks, the most
93#	similar sites to this grabber are television.ya.com.
94#	we should consider also getting them from www.terra.es or
95#	www.tvinteligente.com, they provide also some more local tv listings
96#	however the grabber gets a lot more complex and needs many more urls
97#	to collect the info, although it can be a little bit more complete
98#       (i.e. credits, program duration...)
99#
100#
101
102
103######################################################################
104# initializations
105
106use strict;
107use XMLTV::Version '$Id: tv_grab_es,v 1.39 2010/09/02 05:07:40 rmeden Exp $ ';
108use XMLTV::Capabilities qw/baseline manualconfig cache/;
109use XMLTV::Description 'Spain';
110use Getopt::Long;
111use Date::Manip;
112use HTML::TreeBuilder;
113use HTML::Entities; # parse entities
114use IO::File;
115
116use XMLTV;
117use XMLTV::Memoize;
118use XMLTV::ProgressBar;
119use XMLTV::Ask;
120use XMLTV::Config_file;
121use XMLTV::DST;
122use XMLTV::Get_nice;
123use XMLTV::Mode;
124use XMLTV::Date;
125# Todo: perhaps we should internationalize messages and docs?
126use XMLTV::Usage <<END
127$0: get Spanish television listings in XMLTV format
128To configure: $0 --configure [--config-file FILE]
129To grab listings: $0 [--config-file FILE] [--output FILE] [--days N]
130        [--offset N] [--quiet]
131To list channels: $0 --list-channels
132To show capabilities: $0 --capabilities
133To show version: $0 --version
134END
135  ;
136
137# Attributes of the root element in output.
138my $HEAD = { 'source-info-url'     => 'http://www.elpais.es/parrillatv/portada.html',
139	     'source-data-url'     => "http://www.elpais.es/parrillatv/resultados.html",
140	     'generator-info-name' => 'XMLTV',
141	     'generator-info-url'  => 'http://xmltv.org/',
142	   };
143
144# Whether zero-length programmes should be included in the output.
145my $WRITE_ZERO_LENGTH = 0;
146
147# default language
148my $LANG="es";
149
150# Global channel_data
151our @ch_all;
152
153######################################################################
154# get options
155
156# Get options, including undocumented --cache option.
157XMLTV::Memoize::check_argv('XMLTV::Get_nice::get_nice_aux');
158my ($opt_days, $opt_offset, $opt_help, $opt_output,
159    $opt_configure, $opt_config_file, $opt_gui,
160    $opt_quiet, $opt_list_channels);
161$opt_days  = 3; # default
162$opt_offset = 0; # default
163$opt_quiet  = 0; # default
164GetOptions('days=i'        => \$opt_days,
165	   'offset=i'      => \$opt_offset,
166	   'help'          => \$opt_help,
167	   'configure'     => \$opt_configure,
168	   'config-file=s' => \$opt_config_file,
169       'gui:s'         => \$opt_gui,
170	   'output=s'      => \$opt_output,
171	   'quiet'         => \$opt_quiet,
172	   'list-channels' => \$opt_list_channels
173	  )
174  or usage(0);
175die 'number of days must not be negative'
176  if (defined $opt_days && $opt_days < 0);
177usage(1) if $opt_help;
178
179XMLTV::Ask::init($opt_gui);
180
181my $mode = XMLTV::Mode::mode('grab', # default
182			     $opt_configure => 'configure',
183			     $opt_list_channels => 'list-channels',
184			    );
185
186# File that stores which channels to download.
187my $config_file
188  = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_es', $opt_quiet);
189
190my @config_lines; # used only in grab mode
191if ($mode eq 'configure') {
192    XMLTV::Config_file::check_no_overwrite($config_file);
193}
194elsif ($mode eq 'grab') {
195    @config_lines = XMLTV::Config_file::read_lines($config_file);
196}
197elsif ($mode eq 'list-channels') {
198    # Config file not used.
199}
200else { die }
201
202# Whatever we are doing, we need the channels data.
203my %channels = get_channels(); # sets @ch_all
204my @channels;
205
206######################################################################
207# write configuration
208
209if ($mode eq 'configure') {
210    open(CONF, ">$config_file") or die "cannot write to $config_file: $!";
211
212    # Ask about each channel.
213    my @chs = sort keys %channels;
214    my @names = map { $channels{$_} } @chs;
215    my @qs = map { "add channel $_?" } @names;
216    my @want = ask_many_boolean(1, @qs);
217    foreach (@chs) {
218	my $w = shift @want;
219	warn("cannot read input, stopping channel questions"), last
220	  if not defined $w;
221	# No need to print to user - XMLTV::Ask is verbose enough.
222
223	# Print a config line, but comment it out if channel not wanted.
224	print CONF '#' if not $w;
225	my $name = shift @names;
226	print CONF "channel $_ $name\n";
227	# TODO don't store display-name in config file.
228    }
229
230    close CONF or warn "cannot close $config_file: $!";
231    say("Finished configuration.");
232
233    exit();
234}
235
236
237# Not configuration, we must be writing something, either full
238# listings or just channels.
239#
240die if $mode ne 'grab' and $mode ne 'list-channels';
241
242# Options to be used for XMLTV::Writer.
243my %w_args;
244if (defined $opt_output) {
245    my $fh = new IO::File(">$opt_output");
246    die "cannot write to $opt_output: $!" if not defined $fh;
247    $w_args{OUTPUT} = $fh;
248}
249$w_args{encoding} = 'ISO-8859-1';
250my $writer = new XMLTV::Writer(%w_args);
251$writer->start($HEAD);
252
253if ($mode eq 'list-channels') {
254    $writer->write_channel($_) foreach @ch_all;
255    $writer->end();
256    exit();
257}
258
259######################################################################
260# We are producing full listings.
261die if $mode ne 'grab';
262
263# Read configuration
264my $line_num = 1;
265foreach (@config_lines) {
266    ++ $line_num;
267    next if not defined;
268    if (/^channel:?\s+(\S+)\s+([^\#]+)/) {
269	my $ch_did = $1;
270	my $ch_name = $2;
271	$ch_name =~ s/\s*$//;
272	push @channels, $ch_did;
273	$channels{$ch_did} = $ch_name;
274    }
275    else {
276	warn "$config_file:$line_num: bad line\n";
277    }
278}
279
280######################################################################
281# begin main program
282
283# Assume the listings source uses CET (see BUGS above).
284my $now = DateCalc(parse_date('now'), "$opt_offset days");
285die "No channels specified, run me with --configure\n"
286  if not keys %channels;
287my @to_get;
288
289
290# the order in which we fetch the channels matters
291foreach my $ch_did (@channels) {
292    my $ch_name=$channels{$ch_did};
293    my $ch_xid="$ch_did.elpais.es";
294    my $ch_num=$ch_did + 0;
295    $writer->write_channel({ id => $ch_xid,
296			     'display-name' => [ [ $ch_name ],
297						 [ $ch_num ] ] });
298    my $day=UnixDate($now,'%Q');
299    for (my $i=0;$i<$opt_days;$i++) {
300        push @to_get, [ $day, $ch_xid, $ch_did ];
301        #for each day
302        $day=nextday($day); die if not defined $day;
303    }
304}
305
306# This progress bar is for both downloading and parsing.  Maybe
307# they could be separate.
308#
309my $bar = new XMLTV::ProgressBar('getting listings', scalar @to_get)
310  if not $opt_quiet;
311foreach (@to_get) {
312	foreach (process_table($_->[0], $_->[1], $_->[2])) {
313		$writer->write_programme($_);
314	}
315	update $bar if not $opt_quiet;
316}
317$bar->finish() if not $opt_quiet;
318$writer->end();
319
320######################################################################
321# subroutine definitions
322
323# Use Log::TraceMessages if installed.
324BEGIN {
325    eval { require Log::TraceMessages };
326    if ($@) {
327	*t = sub {};
328	*d = sub { '' };
329    }
330    else {
331	*t = \&Log::TraceMessages::t;
332	*d = \&Log::TraceMessages::d;
333	Log::TraceMessages::check_argv();
334    }
335}
336
337####
338# process_table: fetch a URL and process it
339#
340# arguments:
341#    Date::Manip object giving the day to grab
342#    xmltv id of channel
343#    elpais.es id of channel
344#
345# returns: list of the programme hashes to write
346#
347sub process_table {
348    my ($date, $ch_xmltv_id, $ch_es_id) = @_;
349
350    my $today = UnixDate($date, '%Y%m%d');
351    my $url = "http://www.elpais.es/parrillatv/resultados.html?franja=&tipo=&canal=$ch_es_id&dia=$today";
352    t $url;
353    local $SIG{__WARN__} = sub {
354	warn "$url: $_[0]";
355    };
356
357    # parse the page to a document object
358    my $tree = get_nice_tree $url;
359    my @program_data = get_program_data($tree);
360    my $bump_start_day=0;
361
362    my @r;
363    while (@program_data) {
364	my $cur = shift @program_data;
365	my $next = shift @program_data;
366	unshift @program_data,$next if $next;
367
368	my $p = make_programme_hash($date, $ch_xmltv_id, $ch_es_id, $cur, $next);
369	if (not $p) {
370	    require Data::Dumper;
371	    my $d = Data::Dumper::Dumper($cur);
372	    warn "cannot write programme on $ch_xmltv_id on $date:\n$d\n";
373	}
374	else {
375	    push @r, $p;
376	}
377
378	if (!$bump_start_day && bump_start_day($cur,$next)) {
379	    $bump_start_day=1;
380	    $date = UnixDate(DateCalc($date,"+ 1 day"),'%Q');
381	}
382    }
383    return @r;
384}
385
386sub make_programme_hash {
387    my ($date, $ch_xmltv_id, $ch_es_id, $cur, $next) = @_;
388
389    my %prog;
390
391    $prog{channel}=$ch_xmltv_id;
392    $prog{title}=[ [ $cur->{title}, $LANG ] ];
393    $prog{"sub-title"}=[ [ $cur->{subtitle}, $LANG ] ] if defined $cur->{subtitle};
394    $prog{category}=[ [ $cur->{category}, $LANG ] ];
395
396    t "turning local time $cur->{time}, on date $date, into UTC";
397    eval { $prog{start}=utc_offset("$date $cur->{time}", '+0100') };
398    if ($@) {
399	warn "bad time string: $cur->{time}";
400	return undef;
401    }
402    t "...got $prog{start}";
403    # FIXME: parse description field further
404
405    $prog{desc}=[ [ $cur->{desc}, $LANG ] ] if defined $cur->{desc};
406
407    return \%prog;
408}
409sub bump_start_day {
410    my ($cur,$next) = @_;
411    if (!defined($next)) {
412	return undef;
413    }
414    my $start = UnixDate($cur->{time},'%H:%M');
415    my $stop = UnixDate($next->{time},'%H:%M');
416    if (Date_Cmp($start,$stop)>0) {
417	return 1;
418    } else {
419	return 0;
420    }
421}
422
423
424#
425# program data is split as follows:
426# - as 22/4/2003 elpais.es have changed again the page, now the table that
427#   that contains the listings have a single header, so now we only look
428#   once for it and use the time to findout where the listings ends.
429sub get_program_data {
430    my ($tree) = @_;
431    my @data;
432
433    my @txt_elems = get_txt_elems($tree);
434
435    # Actually time and title are required, but we don't check that.
436
437    my $index = 0;
438    while ($index <= scalar (@txt_elems-4)) {
439	if (       ($txt_elems[$index] eq "Hora")
440		&& ($txt_elems[$index + 1] eq "Programa")
441		&& ($txt_elems[$index + 2] eq "Canal")
442		&& ($txt_elems[$index + 3] eq "Tipo") )
443		{
444	   t "Program listing comes below";
445	   $index = $index + 4;
446	   while ( $txt_elems[$index] =~ /^\d\d:\d\d/ ) {
447		t "Program found: Hora: $txt_elems[$index] Programa: $txt_elems[$index+1]";
448                # Look for duplicate start time, that occurs sometimes
449                # at elpais.es when a new program is lately scheduled but seems
450                # that they forget to remove the previous listing
451                # If it happens, we just grab the last program.
452                if ( $txt_elems[$index] =~ $txt_elems[$index + 5] ) {
453                        $index = $index + 5;
454                }
455
456                my $p_stime     = $txt_elems[$index];
457                my @p_str       = split (/:/,$txt_elems[$index + 1]);
458                my $p_title     = $p_str[0];
459		for ($p_title) { s/^\s+//; s/\s+$// }
460                my @strsub      = split (/\"/,$p_str[1])
461		  if (defined $p_str[1]);
462                my $p_subtitle;
463                if (defined $strsub[0]) {
464		    if ( $strsub[0] =~ " " ) {
465                        $p_subtitle = $strsub[1];
466		    }
467		    else {
468                        $p_subtitle = $p_str[1];
469		    }
470		    undef $p_subtitle
471		      if defined $p_subtitle and $p_subtitle eq '';
472		}
473                my $p_category  = $txt_elems[$index + 3];
474                my $p_desc;
475                if (not ( $txt_elems[$index + 4] =~ /^\d\d:\d\d/ ) ) {
476                # Program has Description
477                        $p_desc = $txt_elems[$index + 4];
478                        $index = $index + 5;
479                } else {
480                # Program don't have Description
481                        $index = $index + 4;
482                }
483                my %h = (       time =>         $p_stime,
484                                category=>      $p_category,
485                                title=>         $p_title,
486                                desc =>         $p_desc );
487		$h{subtitle} = $p_subtitle if defined $p_subtitle;
488                push @data, \%h;
489#               t "Next time?: $txt_elems[$index]";
490           } # end while prof the program
491        }
492        t $txt_elems[$index];
493        $index = $index + 1;
494    }
495    return @data;
496}
497sub get_txt_elems {
498    my ($tree) = @_;
499
500    my @txt_elem;
501    my @txt_cont = $tree->look_down(
502                        sub { ($_[0]->descendants() eq 0  ) },
503			sub { defined($_[0]->attr ("_content") ) } );
504	foreach my $txt (@txt_cont) {
505        	my @children=$txt->content_list;
506		if (defined($children[0])) {
507                  for (my $tmp=$children[0]) {
508			s/^\s+//;s/\s+$//;
509			push @txt_elem, $_;
510                      }
511                }
512	}
513    return @txt_elem;
514}
515
516# get channel listing
517sub get_channels {
518    my $bar = new XMLTV::ProgressBar('getting list of channels', 1)
519	if not $opt_quiet;
520    my %channels;
521    my $url="http://www.elpais.es/parrillatv/portada.html";
522    t $url;
523
524    my $tree = get_nice_tree $url;
525    my @menus = $tree->find_by_tag_name("_tag"=>"select");
526
527    foreach my $elem (@menus) {
528	my $cname = $elem->attr('name');
529	if ($cname eq "canal") {
530	    my @ocanals = $elem->find_by_tag_name("_tag"=>"option");
531	    @ocanals = sort @ocanals;
532	    foreach my $opt (@ocanals) {
533		if (not $opt->attr('value') eq "") {
534		    my @str = split (/-/,$opt->attr('value'));
535		    my $channel_id = $str[0];
536		    my $channel_name=$str[1];
537		    my $channel_num = $channel_id;
538		    if (length $channel_id eq 1) {
539			$channel_id = "0" . $channel_id
540		    }
541		    $channels{$channel_id}=$channel_name;
542		    push @ch_all, { 'display-name' => [ [ $str[1], $LANG ],
543						        [ $channel_num ] ],
544				    'id'=> "$channel_id.elpais.es" };
545		}
546	    }
547	}
548    }
549    die "no channels could be found" if not keys %channels;
550    update $bar if not $opt_quiet;
551    $bar->finish() if not $opt_quiet;
552    return %channels;
553}
554
555
556# Bump a YYYYMMDD date by one.
557sub nextday {
558    my $d = shift;
559    my $p = parse_date($d);
560    my $n = DateCalc($p, '+ 1 day');
561    return UnixDate($n, '%Q');
562}
563
564