1#!/usr/local/bin/perl -w
2
3eval 'exec /usr/local/bin/perl -w -S $0 ${1+"$@"}'
4    if 0; # not running under some shell
5
6=pod
7
8=head1 NAME
9
10tv_grab_es_miguiatv - Alternative TV grabber for Spain.
11
12=head1 SYNOPSIS
13
14tv_grab_es_miguiatv --help
15
16tv_grab_es_miguiatv [--config-file FILE] --configure [--gui OPTION]
17
18tv_grab_es_miguiatv [--config-file FILE] [--output FILE] [--days N]
19           [--offset N] [--quiet]
20
21tv_grab_es_miguiatv --list-channels
22
23tv_grab_es_miguiatv --capabilities
24
25tv_grab_es_miguiatv --version
26
27=head1 DESCRIPTION
28
29Output TV listings for spanish channels from www.miguiatv.com.
30Supports analogue and digital (D+) channels.
31
32First run B<tv_grab_es_miguiatv --configure> to choose, which channels you want
33to download. Then running B<tv_grab_es_miguiatv> with no arguments will output
34listings in XML format to standard output.
35
36B<--configure> Prompt for which channels,
37and write the configuration file.
38
39B<--config-file FILE> Set the name of the configuration file, the
40default is B<~/.xmltv/tv_grab_es_miguiatv.conf>.  This is the file written by
41B<--configure> and read when grabbing.
42
43B<--gui OPTION> Use this option to enable a graphical interface to be used.
44OPTION may be 'Tk', or left blank for the best available choice.
45Additional allowed values of OPTION are 'Term' for normal terminal output
46(default) and 'TermNoProgressBar' to disable the use of XMLTV::ProgressBar.
47
48B<--output FILE> Write to FILE rather than standard output.
49
50B<--days N> Grab N days.  The default is 3.
51
52B<--offset N> Start N days in the future.  The default is to start
53from today.
54
55B<--quiet> Suppress the progress messages normally written to standard
56error.
57
58B<--capabilities> Show which capabilities the grabber supports. For more
59information, see L<http://wiki.xmltv.org/index.php/XmltvCapabilities>
60
61B<--version> Show the version of the grabber.
62
63B<--help> Print a help message and exit.
64
65=head1 SEE ALSO
66
67L<xmltv(5)>.
68
69=head1 AUTHOR
70
71Alberto Gonz�lez (alberto@pesadilla.org) based on tv_grab_es_laguiatv from CandU and tv_grab_es from Ramon Roca.
72
73=head1 BUGS
74
75=cut
76
77#
78
79
80######################################################################
81# initializations
82
83use strict;
84use XMLTV::Version '$Id: tv_grab_es_miguiatv,v 1.5 2013/12/02 22:02:07 dekarl Exp $ ';
85use XMLTV::Capabilities qw/baseline manualconfig cache/;
86use XMLTV::Description 'Spain (miguiatv.com)';
87use Getopt::Long;
88use Date::Manip;
89use HTML::TreeBuilder;
90use HTML::Entities; # parse entities
91use IO::File;
92use Data::Dumper;
93use Encode qw(decode_utf8 encode_utf8);
94
95
96use XMLTV;
97use XMLTV::Memoize;
98use XMLTV::ProgressBar;
99use XMLTV::Ask;
100use XMLTV::Config_file;
101use XMLTV::DST;
102use XMLTV::Get_nice;
103use XMLTV::Mode;
104use XMLTV::Date;
105# Todo: perhaps we should internationalize messages and docs?
106use XMLTV::Usage <<END
107$0: get Spanish television listings in XMLTV format
108To configure: $0 --configure [--config-file FILE]
109To grab listings: $0 [--config-file FILE] [--output FILE] [--days N]
110        [--offset N] [--quiet]
111To list channels: $0 --list-channels
112To show capabilities: $0 --capabilities
113To show version: $0 --version
114END
115  ;
116
117# Attributes of the root element in output.
118my $HEAD = { 'source-info-url'     => 'http://www.miguiatv.com/todos-los-canales',
119	     'source-data-url'     => 'http://www.miguiatv.com/todos-los-canales',
120	     'generator-info-name' => 'XMLTV',
121	     'generator-info-url'  => 'http://xmltv.org/',
122	   };
123
124# Whether zero-length programmes should be included in the output.
125my $WRITE_ZERO_LENGTH = 0;
126my $DO_SLOWER_DESC_GET = 0;
127
128# default language
129my $LANG="es";
130
131# Global channel_data
132our @ch_all;
133
134# debug print function
135sub debug_print
136{
137	# my ($str) = @_;
138
139	# print $str;
140}
141
142
143######################################################################
144# get options
145
146# Get options, including undocumented --cache option.
147XMLTV::Memoize::check_argv('XMLTV::Get_nice::get_nice_aux');
148my ($opt_days, $opt_offset, $opt_help, $opt_output,
149    $opt_configure, $opt_config_file, $opt_gui,
150    $opt_quiet, $opt_list_channels);
151$opt_days  = 3; # default
152$opt_offset = 0; # default
153$opt_quiet  = 0; # default
154GetOptions('days=i'        => \$opt_days,
155           'offset=i'      => \$opt_offset,
156           'help'          => \$opt_help,
157           'configure'     => \$opt_configure,
158           'config-file=s' => \$opt_config_file,
159           'gui:s'         => \$opt_gui,
160           'output=s'      => \$opt_output,
161           'quiet'         => \$opt_quiet,
162           'list-channels' => \$opt_list_channels
163	  )
164  or usage(0);
165die 'number of days must not be negative'
166  if (defined $opt_days && $opt_days < 0);
167usage(1) if $opt_help;
168
169XMLTV::Ask::init($opt_gui);
170
171my $mode = XMLTV::Mode::mode('grab', # default
172			     $opt_configure => 'configure',
173			     $opt_list_channels => 'list-channels',
174			    );
175
176# File that stores which channels to download.
177my $config_file
178  = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_es_miguiatv', $opt_quiet);
179
180my @config_lines; # used only in grab mode
181if ($mode eq 'configure') {
182    XMLTV::Config_file::check_no_overwrite($config_file);
183}
184elsif ($mode eq 'grab') {
185    @config_lines = XMLTV::Config_file::read_lines($config_file);
186}
187elsif ($mode eq 'list-channels') {
188    # Config file not used.
189}
190else { die }
191
192# Whatever we are doing, we need the channels data.
193my %channels; # sets @ch_all
194my @channels;
195my %urls;
196######################################################################
197# write configuration
198
199if ($mode eq 'configure') {
200	%channels = get_channels();
201
202	open(CONF, ">$config_file") or die "cannot write to $config_file: $!";
203
204	# Ask about getting descs
205	my $getdescs = ask_boolean("Do you want to get descriptions (very slow)");
206	warn("cannot read input, using default")
207	  if not defined $getdescs;
208
209	print CONF "getdescriptions ";
210	print CONF "yes\n" if $getdescs;
211	print CONF "no\n" if not $getdescs;
212
213    # Ask about each channel.
214    my @chs = sort keys %channels;
215    my @names = map { $channels{$_} } @chs;
216    my @qs = map { "Add channel $_?" } @names;
217    my @want = ask_many_boolean(1, @qs);
218    foreach (@chs) {
219	my $w = shift @want;
220	warn("cannot read input, stopping channel questions"), last
221	  if not defined $w;
222	# No need to print to user - XMLTV::Ask is verbose enough.
223
224	# Print a config line, but comment it out if channel not wanted.
225	print CONF '#' if not $w;
226	my $name = shift @names;
227	print CONF "channel $_ $name\n";
228	# TODO don't store display-name in config file.
229    }
230
231    close CONF or warn "cannot close $config_file: $!";
232    say("Finished configuration.");
233
234    exit();
235}
236
237
238# Not configuration, we must be writing something, either full
239# listings or just channels.
240#
241die if $mode ne 'grab' and $mode ne 'list-channels';
242
243# Options to be used for XMLTV::Writer.
244my %w_args;
245if (defined $opt_output) {
246    my $fh = new IO::File(">$opt_output");
247    die "cannot write to $opt_output: $!" if not defined $fh;
248    $w_args{OUTPUT} = $fh;
249}
250#$w_args{encoding} = 'ISO-8859-15';
251$w_args{encoding} = 'utf-8';
252my $writer = new XMLTV::Writer(%w_args);
253$writer->start($HEAD);
254
255if ($mode eq 'list-channels') {
256    $writer->write_channel($_) foreach @ch_all;
257    $writer->end();
258    exit();
259}
260
261######################################################################
262# We are producing full listings.
263die if $mode ne 'grab';
264
265# Read configuration
266my $line_num = 1;
267foreach (@config_lines) {
268    ++ $line_num;
269    next if not defined;
270    if (/getdescriptions:?\s+(\S+)/)
271	{
272		if($1 eq "yes")
273		{
274			$DO_SLOWER_DESC_GET = 1;
275		}
276    }
277	elsif (/^channel:?\s+(\S+)\s+([^\#]+)/)
278	{
279		my $ch_did = $1;
280		my $ch_name = $2;
281		$ch_name =~ s/\s*$//;
282		push @channels, $ch_did;
283		$channels{$ch_did} = $ch_name;
284    }
285    else {
286	warn "$config_file:$line_num: bad line\n";
287    }
288}
289
290######################################################################
291# begin main program
292
293# Assume the listings source uses CET (see BUGS above).
294my $now = DateCalc(parse_date('now'), "$opt_offset days");
295die "No channels specified, run me with --configure\n"
296  if not keys %channels;
297my @to_get;
298
299
300# the order in which we fetch the channels matters
301foreach my $ch_did (@channels) {
302    my $ch_name=$channels{$ch_did};
303    my $ch_xid="$ch_did.miguiatv.com";
304    $writer->write_channel({ id => $ch_xid,
305			     'display-name' => [ [ encode_utf8($ch_name) ] ] });
306    my $day=UnixDate($now,'%Q');
307    for (my $i=0;$i<$opt_days;$i++) {
308        push @to_get, [ $day, $ch_xid, $ch_did ];
309        #for each day
310        $day=nextday($day); die if not defined $day;
311    }
312}
313
314# This progress bar is for both downloading and parsing.  Maybe
315# they could be separate.
316#
317get_urls();
318my $bar = new XMLTV::ProgressBar({name => 'getting listings', count => scalar @to_get})
319  if not $opt_quiet;
320foreach (@to_get) {
321	foreach (process_table($_->[0], $_->[1], $_->[2])) {
322		$writer->write_programme($_);
323	}
324	update $bar if not $opt_quiet;
325}
326$bar->finish() if not $opt_quiet;
327$writer->end();
328
329######################################################################
330# subroutine definitions
331
332# Use Log::TraceMessages if installed.
333BEGIN {
334    eval { require Log::TraceMessages };
335    if ($@) {
336	*t = sub {};
337	*d = sub { '' };
338    }
339    else {
340	*t = \&Log::TraceMessages::t;
341	*d = \&Log::TraceMessages::d;
342	Log::TraceMessages::check_argv();
343    }
344}
345
346# Returns a TreeBuilder instance for a given url. The url is retrieved
347# via get_nice(), decoded into a Perl string, processed to remove HTML
348# entities and then parsed into a HTML::TreeBuilder object
349#
350sub get_tree( $ ) {
351    my $url = shift;
352    my $content = get_nice($url);
353    $content = decode_utf8($content);
354    $content = tidy_html($content);
355    my $t = new HTML::TreeBuilder;
356    $t->parse($content) or die "Cannot parse content of Tree\n";
357    $t->eof;
358    return $t;
359}
360
361# Replaces specific HTML entities with text replacements, and then
362# decodes any remaining entities in the string
363#
364sub tidy_html( $ ) {
365    for (my $s = shift) {
366        # handle specific entities
367        s/&nbsp;/ /g;
368        # decode remaining entities
369        decode_entities($s);
370
371        return $s;
372    }
373}
374
375####
376# process_table: fetch a URL and process it
377#
378# arguments:
379#    Date::Manip object giving the day to grab
380#    xmltv id of channel
381#    elpais.es id of channel
382#
383# returns: list of the programme hashes to write
384#
385sub process_table {
386
387    my ($date, $ch_xmltv_id, $ch_es_id) = @_;
388    my $today = UnixDate($date, '%Y%m%d');
389
390   my $url = $urls{$ch_es_id};
391    $url =~ s/programacion/$today/;
392	debug_print "Getting $url\n";
393    t $url;
394    local $SIG{__WARN__} = sub
395	{
396		warn "$url: $_[0]";
397	};
398
399    # parse the page to a document object
400    my $tree = get_tree($url);
401    #my $tree = get_nice_tree($url);
402    my @program_data = get_program_data($tree);
403    my $bump_start_day=0;
404
405    my @r;
406    while (@program_data) {
407	my $cur = shift @program_data;
408	my $next = shift @program_data;
409	unshift @program_data,$next if $next;
410
411	my $p = make_programme_hash($date, $ch_xmltv_id, $ch_es_id, $cur, $next);
412	if (not $p) {
413	    require Data::Dumper;
414	    my $d = Data::Dumper::Dumper($cur);
415	    warn "cannot write programme on $ch_xmltv_id on $date:\n$d\n";
416	}
417	else {
418	    push @r, $p;
419	}
420
421	if (!$bump_start_day && bump_start_day($cur,$next)) {
422	    $bump_start_day=1;
423	    $date = UnixDate(DateCalc($date,"+ 1 day"),'%Q');
424	}
425    }
426    return @r;
427}
428
429
430sub make_programme_hash {
431    my ($date, $ch_xmltv_id, $ch_es_id, $cur, $next) = @_;
432
433    my %prog;
434
435    $prog{channel}=$ch_xmltv_id;
436    $prog{title}=[ [ encode_utf8($cur->{title}), $LANG ] ];
437    $prog{"sub-title"}=[ [ encode_utf8($cur->{subtitle}), $LANG ] ] if defined $cur->{subtitle};
438    #$prog{category}=[ [ $cur->{category}, $LANG ] ];
439
440    t "turning local time $cur->{time}, on date $date, into UTC";
441    eval { $prog{start}=utc_offset("$date $cur->{time}", '+0100') };
442    if ($@) {
443	warn "bad time string: $cur->{time}";
444	return undef;
445    }
446    t "...got $prog{start}";
447    # FIXME: parse description field further
448
449    $prog{desc}=[ [ encode_utf8($cur->{desc}), $LANG ] ] if defined $cur->{desc};
450
451    return \%prog;
452}
453sub bump_start_day {
454    my ($cur,$next) = @_;
455    if (!defined($next)) {
456	return undef;
457    }
458    my $start = UnixDate($cur->{time},'%H:%M');
459    my $stop = UnixDate($next->{time},'%H:%M');
460    if (Date_Cmp($start,$stop)>0) {
461	return 1;
462    } else {
463	return 0;
464    }
465}
466
467
468# get time, title, description
469sub get_program_data
470{
471    my ($tree) = @_;
472    my @data;
473    #my @inputs = $tree->find("class","show_even","class","show_odd");
474    my @inputs = $tree->find("tr");
475    for my $elem (@inputs) {
476		if($elem->attr('class') && ($elem->attr('class') eq "show_odd" || $elem->attr('class') eq "show_even")) {
477			my $time = $elem->attr('_content')->[0]->attr('_content')->[0];
478			my $td  = pop @{$elem->attr('_content')};
479				for my $table (@{$td->attr('_content')}) {
480				   if($table->attr('_content')->[0]->attr('_content')->[0]->attr('_content')->[0]->attr('_content')->[1]) {
481				       my $title = $table->attr('_content')->[0]->attr('_content')->[0]->attr('_content')->[0]->attr('_content')->[1]->attr('_content')->[0];
482
483                       if( $table->attr('_content')->[0]->attr('_content')->[1]->attr('_content')->[0]->attr('_content') ||  $table->attr('_content')->[1]->attr('_content')->[0]->attr('_content')) {
484	    			       if($table->attr('_content')->[1]->attr('_content')->[0]) {
485                              my $description=" ";
486                              if( $table->attr('_content')->[1]->attr('_content')->[0]->attr('_content')) {
487		    			        $description = $table->attr('_content')->[1]->attr('_content')->[0]->attr('_content')->[0];
488                              } else {
489                                $description = $table->attr('_content')->[1]->attr('_content')->[0]->attr('_content')->[0];
490                              }
491			    		      my %h = (
492        		    	        	    time =>         $time,
493		        	    	            category =>     $title,
494		        		                title=>         $title,
495        	           		    	    desc =>         $description
496    				          );
497	    			   	      push @data,\%h;
498		    		      }
499                      }
500				   }
501			}
502		}
503    }
504
505    return @data;
506
507
508    my $xml = XMLin($tree);
509    if(ref($xml->{channel}->{item}) eq "ARRAY") {
510	    my $elementos = $#{$xml->{channel}->{item}};
511	    for (my $i=0;$i<$elementos;$i++) {
512		    my ($title,$time) = split(/\s*-\s*/,$xml->{channel}->{item}->[$i]->{title},2);
513		    my $description = $xml->{channel}->{item}->[$i]->{description};
514                    ($time) = $time =~ /(\d+:\d+)/;
515		    my $year = (((localtime(time))[5])+1900);
516		    #$time = $year . $mes .  sprintf("%02d",$dia) . $hora . $minuto . "00 +0100";
517		    $description =~ s/[^\n]*\n//;
518		    if(length($description) > 5) {
519			    my %h = (
520        		            time =>         $time,
521	        	            title=>         $title,
522                   		    desc =>         $description
523		           );
524		   	push @data,\%h;
525                   }
526            }
527    }
528
529    return @data;
530}
531
532
533# get channel listing
534sub get_channels
535{
536    my $bar = new XMLTV::ProgressBar({name => 'finding channels', count => 1})
537	if not $opt_quiet;
538    my %channels;
539    my $url='http://www.miguiatv.com/todos-los-canales';
540    t $url;
541    my $channel_id;
542    my $channel_name;
543    my $tree = get_tree $url;
544    #my $tree = get_nice_tree $url;
545    my @inputs = $tree->find("div");
546    foreach my $elem (@inputs) {
547		if($elem->attr('class') && $elem->attr('class') eq "footer_channels") {
548
549			for my $div  ( @{$elem->attr('_content')}) {
550				for my $li  ( @{$div->attr('_content')}) {
551					pop @{$li->attr('_content')};
552					for my $ul ( @{$li->attr('_content')}) {
553						if(ref($ul) eq "HTML::Element") {
554							if($ul->attr('href')) {
555								$channel_name = pop @{$ul->attr('_content')};
556								$channel_name =~ s/^\s+//;
557								$channel_name =~ s/\s+$//;
558				 			        $channel_id = convert_name_to_id($channel_name);
559				                                $channels{$channel_id}=$channel_name;
560							}
561						}
562					}
563				}
564			}
565		}
566     }
567
568    die "no channels could be found" if not keys %channels;
569    update $bar if not $opt_quiet;
570    $bar->finish() if not $opt_quiet;
571    return %channels;
572}
573
574# get xml list for channels
575sub get_urls
576{
577    my $bar = new XMLTV::ProgressBar({name => 'getting urls', count => 1})
578	if not $opt_quiet;
579    my %channels;
580    my $url='http://www.miguiatv.com/todos-los-canales';
581    t $url;
582    my $channel_id;
583    my $channel_name;
584    my $tree = get_tree $url;
585    #my $tree = get_nice_tree $url;
586    my @inputs = $tree->find("div");
587    foreach my $elem (@inputs) {
588		if($elem->attr('class') && $elem->attr('class') eq "footer_channels") {
589
590			for my $div  ( @{$elem->attr('_content')}) {
591				for my $li  ( @{$div->attr('_content')}) {
592					pop @{$li->attr('_content')};
593					for my $ul ( @{$li->attr('_content')}) {
594						if(ref($ul) eq "HTML::Element") {
595							if($ul->attr('href')) {
596								$channel_name = pop @{$ul->attr('_content')};
597								$channel_name =~ s/^\s+//;
598								$channel_name =~ s/\s+$//;
599				 			        $channel_id = convert_name_to_id($channel_name);
600				                                $urls{$channel_id}=$ul->attr('href');
601							}
602						}
603					}
604				}
605			}
606		}
607     }
608
609    die "no channels could be found" if not keys %urls;
610
611    update $bar if not $opt_quiet;
612    $bar->finish() if not $opt_quiet;
613}
614sub convert_name_to_id
615{
616    my ($str) = @_;
617
618
619	$str =~ s/([^A-Za-z0-9])/sprintf("-%02X", ord($1))/seg;
620
621	$str = "C" . $str;
622	return $str;
623}
624
625# Bump a DDMMYYYY date by one.
626sub nextday {
627    my $d = shift;
628    my $p = parse_date($d);
629    my $n = DateCalc($p, '+ 1 day');
630    return UnixDate($n, '%Q');
631}
632