1#!/usr/local/bin/perl -w
2
3=pod
4
5=head1 NAME
6
7tv_grab_es_laguiatv - Alternative TV grabber for Spain.
8
9=head1 SYNOPSIS
10
11tv_grab_es_laguiatv --help
12
13tv_grab_es_laguiatv [--config-file FILE] --configure [--gui OPTION]
14
15tv_grab_es_laguiatv [--config-file FILE] [--output FILE] [--days N]
16           [--offset N] [--quiet]
17
18tv_grab_es_laguiatv --list-channels
19
20tv_grab_es_laguiatv --capabilities
21
22tv_grab_es_laguiatv --version
23
24=head1 DESCRIPTION
25
26Output TV listings for spanish channels from www.laguiatv.com.
27Supports analogue and digital (D+) channels.
28The grabber relies on parsing HTML so it might stop working at any time.
29
30First run B<tv_grab_es_laguiatv --configure> to choose, which channels you want
31to download. Then running B<tv_grab_es_laguiatv> with no arguments will output
32listings in XML format to standard output.
33
34B<--configure> Prompt for which channels,
35and write the configuration file.
36
37B<--config-file FILE> Set the name of the configuration file, the
38default is B<~/.xmltv/tv_grab_es_laguiatv.conf>.  This is the file written by
39B<--configure> and read when grabbing.
40
41B<--gui OPTION> Use this option to enable a graphical interface to be used.
42OPTION may be 'Tk', or left blank for the best available choice.
43Additional allowed values of OPTION are 'Term' for normal terminal output
44(default) and 'TermNoProgressBar' to disable the use of XMLTV::ProgressBar.
45
46B<--output FILE> Write to FILE rather than standard output.
47
48B<--days N> Grab N days.  The default is 3.
49
50B<--offset N> Start N days in the future.  The default is to start
51from today.
52
53B<--quiet> Suppress the progress messages normally written to standard
54error.
55
56B<--capabilities> Show which capabilities the grabber supports. For more
57information, see L<http://wiki.xmltv.org/index.php/XmltvCapabilities>
58
59B<--version> Show the version of the grabber.
60
61B<--help> Print a help message and exit.
62
63=head1 SEE ALSO
64
65L<xmltv(5)>.
66
67=head1 AUTHOR
68
69CandU, candu_sf@sourceforge.net, based on tv_grab_es, from Ramon Roca.
70
71=head1 BUGS
72
73=cut
74
75#
76
77
78######################################################################
79# initializations
80
81use strict;
82use XMLTV::Version '$Id: tv_grab_es_laguiatv,v 1.25 2015/03/24 17:56:38 bilbo_uk Exp $ ';
83use XMLTV::Capabilities qw/baseline manualconfig cache/;
84use XMLTV::Description 'Spain (laguiatv.com)';
85use Getopt::Long;
86use Date::Manip;
87use HTML::TreeBuilder;
88use HTML::Entities; # parse entities
89use IO::File;
90use DateTime;
91
92use LWP::Simple;
93use Encode;
94
95use XMLTV;
96use XMLTV::Memoize;
97use XMLTV::ProgressBar;
98use XMLTV::Ask;
99use XMLTV::Config_file;
100use XMLTV::DST;
101use XMLTV::Get_nice 0.005065;
102use XMLTV::Mode;
103use XMLTV::Date;
104# Todo: perhaps we should internationalize messages and docs?
105use XMLTV::Usage <<END
106$0: get Spanish television listings in XMLTV format
107To configure: $0 --configure [--config-file FILE]
108To grab listings: $0 [--config-file FILE] [--output FILE] [--days N]
109        [--offset N] [--quiet]
110To list channels: $0 --list-channels
111To show capabilities: $0 --capabilities
112To show version: $0 --version
113END
114  ;
115
116# Attributes of the root element in output.
117my $HEAD = { 'source-info-url'     => 'http://www.laguiatv.com/programacion/',
118	     'source-data-url'     => "http://www.laguiatv.com/programacion/",
119	     'generator-info-name' => 'XMLTV',
120	     'generator-info-url'  => 'http://xmltv.org/',
121	   };
122
123my $WRITE_ZERO_LENGTH = 0;  # whether zero-length programmes should be included in the output.
124my $DO_SLOWER_DESC_GET = 0;
125my $CONFIG_VERSION = 1; # default to v1 (v1 doesnt have version info)
126my $EXPECTED_CONFIG_VERSION = 3;
127my $CONFIG_USECACHE = 0;  # whether to use a disc cache for web pages
128my $CONFIG_CACHEDIR;	# directory to store cached web pages
129
130# default language
131my $LANG="es";
132
133# default web page encoding
134my $WEB_ENCODING = 'iso-8859-15';
135
136# Global channel_data
137our @ch_all;
138
139my @hide_channels = (
140    "canal-bar.a", # currently gives 404 not found
141);
142
143
144######################################################################
145# get options
146
147# Get options, including undocumented --cache option.
148XMLTV::Memoize::check_argv('XMLTV::Get_nice::get_nice_aux');
149my ($opt_days, $opt_offset, $opt_help, $opt_output,
150    $opt_configure, $opt_config_file, $opt_gui,
151    $opt_quiet, $opt_list_channels, $opt_debug);
152$opt_days  = 4; # default
153$opt_offset = 0; # default
154$opt_quiet  = 0; # default
155$opt_debug  = 0; # default
156GetOptions('days=i'        => \$opt_days,
157	   'offset=i'      => \$opt_offset,
158	   'help'          => \$opt_help,
159	   'configure'     => \$opt_configure,
160	   'config-file=s' => \$opt_config_file,
161       'gui:s'         => \$opt_gui,
162	   'output=s'      => \$opt_output,
163	   'quiet'         => \$opt_quiet,
164	   'list-channels' => \$opt_list_channels,
165	   'debug'         => \$opt_debug,
166	  )
167  or usage(0);
168
169# Force days to be 1, since we get all days at once
170#		$opt_days = 1;
171die 'number of days must not be negative'
172  if (defined $opt_days && $opt_days < 0);
173usage(1) if $opt_help;
174
175# [mod Jan 2014 - max days is 4
176die 'max days available is 4 (today + 3)'
177  if ( $opt_offset + $opt_days > 4 );
178
179XMLTV::Ask::init($opt_gui);
180
181
182# Although we use HTTP::Cache::Transparent, this undocumented --cache
183# option for debugging is still useful since it will _always_ use a
184# cached copy of a page, without contacting the server at all.
185#
186use XMLTV::Memoize; XMLTV::Memoize::check_argv('XMLTV::Get_nice::get_nice_aux');
187
188
189# debug print function
190sub debug_print
191{
192	print STDERR $_[0]."\n" if $opt_debug;
193}
194
195my $mode = XMLTV::Mode::mode('grab', # default
196			     $opt_configure => 'configure',
197			     $opt_list_channels => 'list-channels',
198			    );
199
200# File that stores which channels to download.
201my $config_file
202  = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_es_laguiatv', $opt_quiet);
203
204my @config_lines; # used only in grab mode
205if ($mode eq 'configure') {
206    XMLTV::Config_file::check_no_overwrite($config_file);
207}
208elsif ($mode eq 'grab') {
209    @config_lines = XMLTV::Config_file::read_lines($config_file);
210}
211elsif ($mode eq 'list-channels') {
212    # Config file not used.
213}
214else { die }
215
216# Whatever we are doing, we need the channels data.
217my %channels = get_channels(); # sets @ch_all
218my @channels;
219
220my %icons;
221
222my %categories = (
223    "tag-a" => "Cine",
224    "tag-b" => "Deportes",
225    "tag-c" => "Programas",
226    "tag-d" => "Series",
227    "tag-e" => "Noticias"
228);
229
230######################################################################
231# write configuration
232
233if ($mode eq 'configure') {
234    %channels = get_channels();
235
236    open(CONF, ">$config_file") or die "cannot write to $config_file: $!";
237
238    print CONF "configversion 3\n";
239
240    # Ask about using a cache
241    my $usecache = ask_boolean("Do you want to use a cache for web pages (recommended)", 'yes');
242    warn("cannot read input, using default")
243        if not defined $usecache;
244
245    print CONF "usecache ";
246    print CONF "yes\n" if $usecache;
247    print CONF "no\n" if not $usecache;
248
249	my $cachedir = "$ENV{HOME}/.xmltv/cache";
250	if ($usecache)
251	{
252		my $cachedir = ask("Directory for cache (default=$cachedir)");
253		warn("cannot read input, using default")
254			if not defined $cachedir;
255	}
256	print CONF "cachedir ".$cachedir."\n";
257
258    # Ask about getting descs
259    my $getdescs = ask_boolean("Do you want to get descriptions (very slow)", 'yes');
260    warn("cannot read input, using default")
261        if not defined $getdescs;
262
263    print CONF "getdescriptions ";
264    print CONF "yes\n" if $getdescs;
265    print CONF "no\n" if not $getdescs;
266
267    #my $cacheicons = ask_boolean('Do you want to get and cache icons during configure', 'yes');
268    #warn("cannot read input, using default")
269    #    if not defined $cacheicons;
270
271    # Ask about each channel.
272    my @chs = sort { $channels{$a} cmp $channels{$b} } keys %channels;
273    my @names = map { $channels{$_} } @chs;
274    my @qs = map { "Add channel $_?" } @names;
275    my @want = ask_many_boolean(1, @qs);
276
277    #my $iconbar = new XMLTV::ProgressBar({name => 'getting icon urls', count => scalar @chs})
278    #if ((not $opt_quiet) && $cacheicons);
279
280    foreach (@chs) {
281	my $w = shift @want;
282	warn("cannot read input, stopping channel questions"), last
283	  if not defined $w;
284	# No need to print to user - XMLTV::Ask is verbose enough.
285
286	# Print a config line, but comment it out if channel not wanted.
287	print CONF '#' if not $w;
288	my $name = shift @names;
289#        if ($cacheicons)
290#        {
291#            my $icon = get_icon($_);
292#	    print CONF "channel $_ $name icon:$icon\n";
293#        }
294#        else
295#        {
296            print CONF "channel $_ ".encode($WEB_ENCODING, $name)."\n";
297#        }
298	# TODO don't store display-name in config file.
299
300#        update $iconbar if ((not $opt_quiet) && $cacheicons);
301    }
302
303    close CONF or warn "cannot close $config_file: $!";
304    say("Finished configuration.");
305
306    exit();
307}
308
309
310# Not configuration, we must be writing something, either full
311# listings or just channels.
312#
313die if $mode ne 'grab' and $mode ne 'list-channels';
314
315# Options to be used for XMLTV::Writer.
316my %w_args;
317if (defined $opt_output) {
318    my $fh = new IO::File(">$opt_output");
319    die "cannot write to $opt_output: $!" if not defined $fh;
320    $w_args{OUTPUT} = $fh;
321}
322$w_args{encoding} = 'UTF-8';
323my $writer = new XMLTV::Writer(%w_args);
324$writer->start($HEAD);
325
326if ($mode eq 'list-channels') {
327    $writer->write_channel($_) foreach @ch_all;
328    $writer->end();
329    exit();
330}
331
332######################################################################
333# We are producing full listings.
334die if $mode ne 'grab';
335
336# Read configuration
337my $line_num = 1;
338foreach (@config_lines) {
339    ++ $line_num;
340    next if not defined;
341
342    if (/configversion:?\s+(\S+)/)
343    {
344        $CONFIG_VERSION = $1;
345    }
346    elsif (/usecache:?\s+(\S+)/)
347    {
348		if($1 eq "yes")
349        {
350            $CONFIG_USECACHE = 1;
351        }
352    }
353    elsif (/cachedir:?\s+(\S+)/)
354    {
355		$CONFIG_CACHEDIR = $1;
356    }
357    elsif (/getdescriptions:?\s+(\S+)/)
358    {
359        if("$CONFIG_VERSION" ne "$EXPECTED_CONFIG_VERSION")
360        {
361            die "Config file is out of date, please rerun with --configure\n";
362        }
363        if($1 eq "yes")
364        {
365            $DO_SLOWER_DESC_GET = 1;
366        }
367    }
368    elsif (/^channel:?\s+(\S+)\s+([^#]+)icon\:([^#]+)/)
369    {
370        my $ch_did = $1;
371        my $ch_name = $2;
372        my $ch_icon = $3;
373
374
375        #debug_print "Got channel $ch_name icon $ch_icon\n";
376        $ch_name =~ s/\s*$//;
377        push @channels, $ch_did;
378        $channels{$ch_did} = $ch_name;
379        $icons{$ch_did} = $ch_icon;
380    }
381    elsif (/^channel:?\s+(\S+)\s+([^#]+)/)
382    {
383        my $ch_did = $1;
384        my $ch_name = $2;
385
386        debug_print "Fetching channel $ch_name";
387        $ch_name =~ s/\s*$//;
388        push @channels, $ch_did;
389        $channels{$ch_did} = $ch_name;
390    }
391    else {
392	warn "$config_file:$line_num: bad line\n";
393    }
394}
395
396
397
398if ($CONFIG_USECACHE) {
399use HTTP::Cache::Transparent;
400HTTP::Cache::Transparent::init( {
401    BasePath => $CONFIG_CACHEDIR,
402    NoUpdate => 60*60,			# cache time in seconds
403	MaxAge => 4,				# flush time in hours
404    Verbose => $opt_debug,
405} );
406}
407
408
409
410
411######################################################################
412# begin main program
413
414# Assume the listings source uses CET (see BUGS above).
415my $now = DateCalc(parse_date('now'), "$opt_offset days");
416die "No channels specified, run me with --configure\n"
417  if not keys %channels;
418my @to_get;
419
420my $iconbar = new XMLTV::ProgressBar({name => 'getting channel info', count => scalar @channels})
421  if not $opt_quiet;
422# the order in which we fetch the channels matters
423foreach my $ch_did (@channels) {
424    my $ch_name=$channels{$ch_did};
425    my $ch_xid="$ch_did.laguiatv.com";
426#    my $ch_icon=$icons{$ch_did};
427#    if (!$ch_icon)
428#    {
429#        $ch_icon = get_icon($ch_did);
430#    }
431#
432#    if(index($ch_icon, "shim.gif") < 0)
433#    {
434#		$writer->write_channel({ id => $ch_xid,
435#					 'display-name' => [ [ $ch_name ] ] ,
436#					 'icon' => [ { 'src' => $ch_icon } ] });
437#	}
438#	else
439#	{
440		$writer->write_channel({ id => $ch_xid,
441					 'display-name' => [ [ $ch_name ] ] });
442#	}
443
444	# [Jan 2014] - current website offers a fixed 4 days of data
445	#	my $day=UnixDate($now,'%Q');
446	#	for (my $i=0;$i<$opt_days;$i++) {
447	#		push @to_get, [ $day, $ch_xid, $ch_did ];
448	#		#for each day
449	#		$day=nextday($day); die if not defined $day;
450	#	}
451	#
452	push @to_get, [ '', $ch_xid, $ch_did ];
453
454	update $iconbar if not $opt_quiet;
455}
456
457# This progress bar is for both downloading and parsing.  Maybe
458# they could be separate.
459#
460my $bar = new XMLTV::ProgressBar({name => 'getting listings', count => scalar @to_get})
461  if not $opt_quiet;
462foreach (@to_get) {
463	debug_print "process $_->[0], $_->[1], $_->[2]\n";
464	foreach (process_table($_->[0], $_->[1], $_->[2])) {
465		$writer->write_programme($_);
466	}
467	update $bar if not $opt_quiet;
468}
469$bar->finish() if not $opt_quiet;
470$writer->end();
471
472######################################################################
473# subroutine definitions
474
475# Use Log::TraceMessages if installed.
476BEGIN {
477    eval { require Log::TraceMessages };
478    if ($@) {
479	*t = sub {};
480	*d = sub { '' };
481    }
482    else {
483	*t = \&Log::TraceMessages::t;
484	*d = \&Log::TraceMessages::d;
485	Log::TraceMessages::check_argv();
486    }
487}
488
489####
490# process_table: fetch a URL and process it
491#
492# arguments:
493#    Date::Manip object giving the day to grab
494#    xmltv id of channel
495#    elpais.es id of channel
496#
497# returns: list of the programme hashes to write
498#
499sub process_table {
500
501    my ($date, $ch_xmltv_id, $ch_es_id) = @_;
502
503	my $ch_conv_id = convert_id_to_laguiatvid($ch_es_id);
504    my $today = UnixDate($date, '%d/%m/%Y');
505
506    my $url = 'http://www.laguiatv.com/programacion/'.$ch_es_id.'.html';
507	debug_print "Getting $url\n";
508    t $url;
509    local $SIG{__WARN__} = sub
510	{
511		warn "$url: $_[0]";
512	};
513
514    # parse the page to a document object
515	my $tree;
516	#  HTML::Parse keeps reporting "Parsing of undecoded UTF-8 will give garbage when decoding entities" yet I can see no UTF8 in the pages!
517	#    Save the page and run it again and you don't get the warning!
518	#    You can't even supress the warning!  What a crock.
519	{
520		local $SIG{__WARN__} = sub {
521			warn @_ unless (defined $_[0] && $_[0] =~ /^Parsing of undecoded UTF-/);
522		};
523		$tree = get_nice_tree($url,'',$WEB_ENCODING);
524	}
525
526    my @program_data = get_program_data($tree);
527    my $bump_start_day=0;
528
529    my @r;
530    while (@program_data) {
531	my $cur = shift @program_data;
532	my $next = shift @program_data;
533	unshift @program_data,$next if $next;
534
535	my $p = make_programme_hash($date, $ch_xmltv_id, $ch_es_id, $cur, $next);
536	if (not $p) {
537	    require Data::Dumper;
538	    my $d = Data::Dumper::Dumper($cur);
539	    warn "cannot write programme on $ch_xmltv_id on $date:\n$d\n";
540	}
541	else {
542	    push @r, $p;
543	}
544
545#	if (!$bump_start_day && bump_start_day($cur,$next)) {
546#	    #$bump_start_day=1;
547#	    $date = UnixDate(DateCalc($date,"+ 1 day"),'%Q');
548#	}
549    }
550    return @r;
551}
552
553sub make_programme_hash {
554    my ($date, $ch_xmltv_id, $ch_es_id, $cur, $next) = @_;
555
556	#require Data::Dumper; debug_print Data::Dumper::Dumper($cur);
557
558    my %prog;
559
560    $prog{channel}=$ch_xmltv_id;
561    $prog{title}=[ [ encode( 'UTF-8', $cur->{title} ), $LANG ] ];
562    $prog{"sub-title"}=[ [ encode( 'UTF-8', $cur->{subtitle} ), $LANG ] ] if defined $cur->{subtitle};
563    # $prog{category}=[ [ $cur->{category}, $LANG ] ];
564	$prog{start}=$cur->{stime};
565	$prog{stop} =$cur->{etime} if defined $cur->{etime};
566    $prog{desc}=[ [ encode( 'UTF-8', $cur->{desc} ), $LANG ] ] if defined $cur->{desc};
567    # $prog{category}=[ [ encode( 'UTF-8', $cur->{category} ), $LANG ] ] if defined $cur->{category};
568	$prog{'date'} = $cur->{year}  if defined $cur->{year};
569	$prog{'star-rating'} =  [ $cur->{rating} . '/5' ] if defined $cur->{rating};
570	$prog{'rating'} =  [[ $cur->{classification}, '' ]] if defined $cur->{classification};
571
572	if (defined $cur->{genres})
573	{
574		foreach ( @{ $cur->{genres} } )
575		{
576			push @{$prog{'category'}}, [ encode('UTF-8', $_), $LANG ]  if $_ ne '';
577		}
578	}
579	if (defined $cur->{directors})
580	{
581		foreach ( @{ $cur->{directors} } )
582		{
583			push @{$prog{'credits'}{'director'}}, encode('UTF-8', $_)  if $_ ne '';
584		}
585	}
586	if (defined $cur->{actors})
587	{
588		foreach ( @{ $cur->{actors} } )
589		{
590			push @{$prog{'credits'}{'actor'}}, encode('UTF-8', $_)  if $_ ne '';
591    }
592	}
593
594
595    return \%prog;
596}
597sub bump_start_day {
598    my ($cur,$next) = @_;
599    if (!defined($next)) {
600	return undef;
601    }
602    my $start = UnixDate($cur->{stime},'%H:%M');
603    my $stop = UnixDate($next->{stime},'%H:%M');
604    if (Date_Cmp($start,$stop)>0) {
605	return 1;
606    } else {
607	return 0;
608    }
609}
610
611
612#
613sub get_program_data
614{
615    my ($tree) = @_;
616    my @data;
617
618	my $today = DateTime->today->set_time_zone('Europe/Madrid');
619
620	# - current website offers a fixed 4 days of data
621	# ignore any programmes outside requested range
622	my $startgrab = $today->clone->add('days' => $opt_offset)->epoch();
623	my $stopgrab = $today->clone->add('days' => ($opt_offset + $opt_days))->epoch();
624	debug_print 'Grab times: start: '.DateTime->from_epoch(epoch=>$startgrab)->strftime("%Y %m %d %H%M %S %z").' stop: '.DateTime->from_epoch(epoch=>$stopgrab)->strftime("%Y %m %d %H%M %S %z");
625    # find schedule table
626
627	# the following could could do with some error checking but I don't have time to do that right now  :-(
628
629    my @divs = $tree->look_down('_tag' => 'div', 'id' => qr/dia1|nad2|nad3|nad4/);
630
631    foreach my $div (@divs)
632    {
633		my ($i) = $div->attr('id') =~ /(?:dia|nad)(\d)/;
634		#'debugtime'  debug_print "i= $i ".$div->attr('id');
635		#'debugtime'  debug_print 'today: '.$today->strftime("%Y %m %d %H%M %S %z");
636		my $theday = $today->clone->add(days => ($i - 1));
637		#'debugtime'  debug_print 'theday: '.$theday->strftime("%Y %m %d %H%M %S %z");
638
639		my @trs = $div->look_down('_tag' => 'tr');
640
641		foreach my $tr (@trs)
642        {
643
644			my $stime = $tr->look_down('_tag' => 'th')->as_text;
645			trim($stime);
646
647			my $p_div = $tr->look_down('_tag' => 'div', 'class' => 'programa');
648			next if !$p_div;
649
650			my $a = $p_div->look_down('_tag' => 'a');
651
652			my $p_url = $a->attr('href');
653			my $p_title = $a->as_text;
654
655			my $p_times = $p_div->look_down('_tag' => 'p')->as_text;
656
657			my ($h, $i, $h2, $i2) = $p_times =~ /(\d*):(\d*)(?: *a *(\d*):(\d*))?/;
658
659			my $showtime = $theday->clone->set(hour => $h, minute => $i, second => 0);
660
661			# - current website offers a fixed 4 days of data
662			# ignore any programmes outside requested range
663			#'debugtime'  debug_print 'this: '.$showtime->strftime("%Y %m %d %H%M %S %z");
664			next if ( $showtime->epoch() < $startgrab ) || ( $showtime->epoch() >= $stopgrab );
665
666			my $p_stime = $theday->clone->set(hour => $h, minute => $i, second => 0)->strftime("%Y%m%d%H%M%S %z");
667
668			my $p_etime;
669			# this will probably fail around DST times
670			if (defined $h2 && $h2 >= 0)
671            {
672				$showtime->add(days => 1) if $h2 < $h;
673				eval {        # try
674					$showtime->set(hour => $h2, minute => $i2, second => 0);
675					$p_etime = $showtime->strftime("%Y%m%d%H%M%S %z");
676				} or do {     # catch
677					# no output prog 'stop' time
678            }
679			}
680
681
682			# get descriptions?  Kinda compulsory now since there is no longer *any* description on the schedule page
683			#
684			my ($p_description, $p_rating, $p_classification, $p_year, @p_genres, @p_actors, @p_directors) = ('', '', '', '', (), (), ());
685
686			#
687			{ # begin code block
688			if ($DO_SLOWER_DESC_GET) 	# get descriptions
689            {
690
691				my $url = $p_url;
692				debug_print "Getting $url";
693				t $url;
694
695				last if $url eq 'javascript:void(0);' ;
696
697				# handle no programme info situation (probably means "Close"?) :
698				# <tr>
699				#  <th scope="col"> 04:00</th>
700				#  <td class="" data-type="programas">
701				#    <div class="programa">
702				#      <h2><a href="http://laguiatv.abc.es/programas/-72840/" title="*">*</a></h2>
703				#      <p> 04:00 a  06:00</p>
704				#    </div>
705				#  </td>
706			  #</tr>
707				last if $url =~ m%programas/-\d*/$%;
708
709				# parse the page to a document object
710				#  HTML::Parse keeps reporting "Parsing of undecoded UTF-8 will give garbage when decoding entities" yet I can see no UTF8 in the pages!
711				# Often on the http://hoycinema.abc.es/ pages. (Could be due to the <script> they have *before* the <meta Content-Type> ?)
712				{
713					local $SIG{__WARN__} = sub {
714						warn @_ unless (defined $_[0] && $_[0] =~ /^Parsing of undecoded UTF-/);
715					};
716					$tree = get_nice_tree($url,'',$WEB_ENCODING);
717				}
718
719				my $div = $tree->look_down('_tag' => 'div', 'id' => 'contenedor');		# container
720				$tree->dump if !$div;
721				exit if !$div;
722
723				# see if the title has a year
724				# <h1 itemprop="name">&laquo;Fin de semana al desnudo&raquo;
725				#	<span><a href="/peliculas/1974.html" title="Películas del año 1974">(1974)</a></span>
726				# </h1>
727				my $h1 = $div->look_down('_tag' => 'h1', 'itemprop' => 'name');
728				if ($h1)
729				{
730					my $a = $h1->look_down('_tag' => 'a');
731					($p_year) = $a->as_text =~ /^\((19\d\d|20\d\d)\)$/  if $a;
732				}
733
734				$div = $div->look_down('_tag' => 'div', 'class' => 'modulo');
735				last if !$div;
736
737				# get the various <dl> blocks
738				my @dls = $div->look_down('_tag' => 'dl');
739				foreach my $dl (@dls)
740				{
741					my $dt = $dl->look_down('_tag' => 'dt');
742					next if !$dt;
743
744					if ($dt->as_text =~ /Informaci.n/)
745                {
746						#<dl class="datos">
747						#	<dt>Información:</dt>
748						#	<dd class="calificacion">SC</dd>
749						#	<dd itemprop="genre">Comedia</dd>
750						#	<dd itemprop="duration">93 minutos</dd>
751						#	<dd>Sin especificar</dd>    OR e.g.   <dd>EE.UU.</dd>
752						#</dl>
753						my $t = $dl->look_down('_tag' => 'dd', 'class' => 'calificacion');
754						$p_classification = $t->as_text  if $t;
755						#
756						$t = $dl->look_down('_tag' => 'dd', 'itemprop' => 'genre');
757						@p_genres = split(/,/, $t->as_text)  if $t;
758					}
759
760					if ($dt->as_text =~ /Director/)
761					{
762						#<dl>
763						#	<dt>Director:</dt>
764						#	<dd itemprop="director" itemscope itemtype="http://schema.org/Person">
765						#		<a href="/perfil-cine/peter-webber-97721/" title="Peter Webber"><span itemprop="name">Peter Webber</span></a>
766						#		<a href="/perfil-cine/mariano-ozores-20092/" title="Mariano Ozores"><span itemprop="name">Mariano Ozores</span></a>
767						#	</dd>
768						#</dl>
769						my @t = $dl->look_down('_tag' => 'span', 'itemprop' => 'name');
770						foreach (@t)
771						{
772							push @p_directors, $_->as_text;
773						}
774					}
775
776					if ($dt->as_text =~ /Int.rpretes/)
777					{
778						#<dl class="interpretes" itemprop="actor" itemscope itemtype="http://schema.org/Person">
779						#	<dt>Intérpretes:</dt>
780						#	<dd>
781						#		<a href="/perfil-cine/mandy-patinkin-4279/" title="Mandy Patinkin"><span itemprop="name">Mandy Patinkin</span></a>,
782						#		<a href="/perfil-cine/alfredo-landa-15923/" title="Alfredo Landa"><span itemprop="name">Alfredo Landa</span></a>,
783						#		<a href="/perfil-cine/thomas-gibson-18922/" title="Thomas Gibson"><span itemprop="name">Thomas Gibson</span></a>,
784						#	</dd>
785						#	<dd class="enlace"><a href="/peliculas/1974/fin-de-semana-al-desnudo-9814/reparto.html" title="Reparto completo">Reparto completo</a></dd>
786						#</dl>
787						my @t = $dl->look_down('_tag' => 'span', 'itemprop' => 'name');
788						foreach (@t)
789						{
790							push @p_actors, $_->as_text;
791						}
792						# note: we could follow the "Reparto completo" link for the complete cast list
793					}
794
795					if ($dt->as_text =~ /Descripci.n/)
796					{
797						#<dl>
798						#	<dt>Descripción:</dt>
799						#	<dd>Programa que repasa todas las noticias de interés general, nacionales, internacionales, así como deportivas. Incluye El tiempo. Presentador: Albert Martínez.</dd>
800						#</dl>
801						my $t = $dl->look_down('_tag' => 'dd');
802						$p_description = $t->as_text  if $t;
803					}
804
805					if ($dt->as_text =~ /Sinopsis/)
806					{
807						#<dl class="sinopsis">
808						#	<dt>Sinopsis:</dt>
809						#	<dd itemprop="description">La historia de la psicóloga Virginia Johnson (Lizzy Caplan) y el tímido ginecólogo William Masters (Michael Sheen),...<a href="/series/masters-of-sex-25058/sinopsis.html" title="Leer sinopsis completa">Leer sinopsis completa</a>.</dd>
810						#</dl>
811						#
812						# For a *really* long description we could follow the 'Leer sinopsis completa' link but we won't do that until someone asks for it!  ;-)
813						#
814						if ($p_description eq '')
815						{
816							my $t = $dl->look_down('_tag' => 'dd', 'itemprop' => 'description');
817							if ($t)
818							{
819								my $a = $t->look_down('_tag' => 'a');
820								$a->detach  if $a;
821								$p_description = $t->as_text;
822							}
823						}
824					}
825
826
827				}
828
829				# if no description then try for a synopis
830				#	<aside id="sinopsis"><h2>Sinopsis</h2><p itemprop="description">...
831				if ($p_description eq '')
832				{
833					my $h2 = $tree->look_down('_tag' => 'h2', sub { $_[0]->as_text =~ /Sinopsis/ } );
834					if ($h2)
835                    {
836						my $p = $h2->right();
837						$p_description = $p->as_text  if ( $p->tag() eq 'p' && $p->attr('itemprop') eq 'description' );
838						#
839						# website sometimes has invalid html (nested <p>) which treebuilder flattens
840						# so append <p> siblings
841						while (1)
842						{
843							$p = $p->right();
844							last if ( $p->tag() ne 'p' );
845							$p_description .= $p->as_text  if ( $p->tag() eq 'p' );
846						}
847					}
848				}
849
850				# rating  (x/5)
851				#	<meta itemprop="ratingValue" content="2.2"/>
852				my $meta = $div->look_down('_tag' => 'meta', 'itemprop' => 'ratingValue');
853				if ($meta)
854                        {
855					$p_rating = $meta->attr('content');
856                        }
857
858
859                    }
860			} # end code block
861
862            #debug_print("title: $p_title start: $p_stime end: $p_etime cat: $p_category c2: " . $categories{$p_category} . "\n");
863            debug_print("title: $p_title start: $p_stime end: ".(defined $p_etime?$p_etime:''));
864
865			# 2014-04-02 ignore programme where title = *
866			#     <h2>  <a href="javascript:void(0);" title="*">*</a>  </h2>
867			#
868			if($p_title && $p_title ne "" && $p_title ne "*" && $p_stime && $p_stime ne "")
869			{
870				my %h = ('stime' =>        $p_stime,
871						 'etime' =>        $p_etime,
872						 'title' =>        $p_title,
873						 );
874				$h{year} 			= $p_year if defined $p_year && $p_year ne "";
875				$h{rating} 			= $p_rating if $p_rating ne "";
876                    $h{desc} = $p_description if $p_description ne "";
877				$h{classification} 	= $p_classification if $p_classification ne "";
878				$h{directors} 		= \@p_directors if scalar @p_directors > 0;
879				$h{actors} 			= \@p_actors if scalar @p_actors > 0;
880				$h{genres} 			= \@p_genres if scalar @p_genres > 0;
881
882                    push @data, \%h;
883
884            }
885
886        }
887
888    }
889    return @data;
890}
891
892sub get_icon
893{
894    my ($ch_did) = @_;
895
896    return "";
897
898    my $url = "http://www.laguiatv.com/programacion/$ch_did";
899	debug_print "Getting $url\n";
900    t $url;
901    local $SIG{__WARN__} = sub
902	{
903		warn "$url: $_[0]";
904	};
905
906    my $content = get $url;
907    my $pos = index($content, '<table class="grid cadena">');
908    if($pos > 0)
909    {
910        $pos = index($content, '<img src="', $pos);
911        if($pos > 0)
912        {
913            $pos += 10;
914            my $end = index($content, '"', $pos);
915
916            my $icon = 'http://www.laguiatv.com/' . substr($content, $pos, $end - $pos);
917
918            debug_print "icon $icon\n";
919            return $icon;
920        }
921    }
922
923    return 'http://www.laguiatv.com/shim.gif';
924}
925
926
927sub get_prog_info
928{
929    my ($url) = @_;
930    my $desc = "";
931    my $cat = "";
932
933    $url = "http://www.laguiatv.com/".$url;
934    debug_print "Get proginfo $url\n";
935
936    my $content = get $url;
937    my $pos = index($content, '<div class="intro-datasheet">');
938
939    if($pos >= 0)
940    {
941        $pos = index($content, 'class="text">', $pos);
942        if($pos >= 0)
943        {
944            my $divend = index($content, '</div', $pos);
945            $pos = index($content, '<p', $pos);
946
947            while($pos >= 0 && $pos < $divend)
948            {
949                $pos = index($content, '>', $pos) + 1;
950                my $end = index($content, '</p>', $pos);
951                if($end >= 0)
952                {
953                    $desc = $desc . substr($content, $pos, $end - $pos) . " ";
954                }
955                $pos = index($content, '<p', $pos);
956            }
957        }
958    }
959
960    decode_entities($desc);
961    $desc =~ s/<\S+\s*\/*\/*>//g;
962    $desc =~ s/\s+/ /g;
963    $desc =~ s/\s+$//g;
964
965    return ($desc, $cat);
966}
967
968sub get_txt_elems {
969    my ($tree) = @_;
970
971    my @txt_elem;
972    my @txt_cont = $tree->look_down(
973                        sub { ($_[0]->descendants() eq 0  ) },
974			sub { defined($_[0]->attr ("_content") ) } );
975	foreach my $txt (@txt_cont) {
976        	my @children=$txt->content_list;
977		if (defined($children[0])) {
978                  for (my $tmp=$children[0]) {
979			s/^\s+//;s/\s+$//;
980			push @txt_elem, $_;
981                      }
982                }
983	}
984    return @txt_elem;
985}
986
987# get channel listing
988sub get_channels
989{
990    my $bar = new XMLTV::ProgressBar({name => 'finding channels', count => 1})
991	if not $opt_quiet;
992    my %channels;
993
994	# the front page is very big and slow to parse, so we'll
995	# get channels via a dummy call to TVE 1 and then parse out the channel selector
996    my $url="http://www.laguiatv.com/programacion/tve-1-807.html";
997    t $url;
998
999    my $channel_id;
1000    my $channel_name;
1001    my $channel_num;
1002
1003    my $tree = get_nice_tree($url,'',$WEB_ENCODING);
1004
1005    my @options = $tree->look_down('_tag' => 'select', 'id' => 'cadenas_programacion')->look_down('_tag' => 'option');
1006
1007    foreach my $option (@options)
1008            {
1009		next if !$option->attr('value');
1010
1011		# <option value="tve-1-807">TVE 1</option>
1012		$channel_name = $option->as_text;
1013		$channel_id = $option->attr('value');
1014		($channel_num) = $option->attr('value') =~ /.*?-(\d+)$/;
1015
1016		# remove channels that should not be listed
1017                my $hide = 0;
1018                foreach my $hide_id (@hide_channels)
1019                {
1020                    if($channel_id =~ m/$hide_id/)
1021                    {
1022                        $hide = 1;
1023                    }
1024                }
1025
1026                if($hide == 0)
1027                {
1028                    $channels{$channel_id}=$channel_name;
1029				debug_print "Got channel $channel_name with id $channel_id"  if $opt_list_channels;
1030
1031				my $coded_chan_name=encode("utf-8",$channel_name);
1032				push @ch_all, {
1033					'display-name' => [[ $coded_chan_name, $LANG ],[$channel_num]],
1034					'channel-num' => $channel_num,
1035					'id'=> "$channel_id.laguiatv"
1036				};
1037		}
1038
1039    }
1040
1041    die "no channels could be found" if not keys %channels;
1042    update $bar if not $opt_quiet;
1043    $bar->finish() if not $opt_quiet;
1044    return %channels;
1045}
1046
1047sub convert_laguiatvid_to_id
1048{
1049    my ($str) = @_;
1050
1051
1052	$str =~ s/([^A-Za-z0-9])/sprintf("-%02X", ord("$1"))/seg;
1053
1054	$str = "C" . $str;
1055	return $str;
1056}
1057
1058sub convert_id_to_laguiatvid
1059{
1060    my ($str) = @_;
1061
1062	# convert -20 to + (to replace spaces)
1063	$str =~ s/-20/+/g;
1064
1065	# convert - to % for URL encoded chars
1066	$str =~ s/\-/%/g;
1067
1068	# strip the C off the front
1069	$str = substr($str, 1);
1070
1071	return $str;
1072}
1073
1074# Bump a DDMMYYYY date by one.
1075sub nextday {
1076    my $d = shift;
1077    my $p = parse_date($d);
1078    my $n = DateCalc($p, '+ 1 day');
1079    return UnixDate($n, '%Q');
1080}
1081
1082sub trim {
1083	# Remove leading & trailing spaces
1084	$_[0] =~ s/^\s+|\s+$//g;
1085}
1086
1087sub utf8 {
1088		# Catch the error:
1089		#    "Parsing of undecoded UTF-8 will give garbage when decoding entities
1090		return decode('UTF-8', $_[0]);
1091}
1092