1#!/usr/local/bin/perl -w
2
3eval 'exec /usr/local/bin/perl -w -S $0 ${1+"$@"}'
4    if 0; # not running under some shell
5
6# $Id: tv_grab_dk_tvtid,v 1.3 2010/09/02 05:07:40 rmeden Exp $
7
8=pod
9
10=head1 NAME
11
12tv_grab_dk_tvtid - Grab TV listings for Denmark.
13
14=head1 SYNOPSIS
15
16tv_grab_dk_tvtid --help
17
18tv_grab_dk_tvtid [--config-file FILE] --configure [--gui OPTION]
19
20tv_grab_dk_tvtid [--config-file FILE] [--output FILE] [--days N]
21[--offset N] [--quiet]
22
23tv_grab_dk_tvtid --capabilities
24
25tv_grab_dk_tvtid --version
26
27=head1 DESCRIPTION
28
29Output TV listings for several channels available in Denmark.  The
30data comes from tvtid.tv2.dk. The grabber relies on parsing HTML so it
31might stop working at any time.
32
33First run B<tv_grab_dk_tvtid --configure> to choose, which channels you want
34to download. Then running B<tv_grab_dk_tvtid> with no arguments will output
35listings in XML format to standard output.
36
37B<--configure> Prompt for which channels,
38and write the configuration file.
39
40B<--config-file FILE> Set the name of the configuration file, the
41default is B<~/.xmltv/tv_grab_dk_tvtid.conf>.  This is the file written by
42B<--configure> and read when grabbing.
43
44B<--gui OPTION> Use this option to enable a graphical interface to be used.
45OPTION may be 'Tk', or left blank for the best available choice.
46Additional allowed values of OPTION are 'Term' for normal terminal output
47(default) and 'TermNoProgressBar' to disable the use of Term::ProgressBar.
48
49B<--output FILE> Write to FILE rather than standard output.
50
51B<--days N> Grab N days.  The default is one week.
52
53B<--offset N> Start N days in the future.  The default is to start
54from today.
55
56B<--quiet> Suppress the progress messages normally written to standard
57error.
58
59B<--capabilities> Show which capabilities the grabber supports. For more
60information, see L<http://wiki.xmltv.org/index.php/XmltvCapabilities>
61
62B<--version> Show the version of the grabber.
63
64B<--help> Print a help message and exit.
65
66=head1 SEE ALSO
67
68L<xmltv(5)>.
69
70=head1 AUTHOR
71
72S�ren Pingel Dalsgaard (soren@dalsgaards.dk). Based on the dr grabber by
73S�ren Pingel Dalsgaard (soren@dalsgaards.dk). Based on the tv2 grabber by
74Jesper Skov (jskov@zoftcorp.dk). Originally based on tv_grab_nl by
75Guido Diepen and Ed Avis (ed@membled.com) and tv_grab_fi by Matti
76Airas.
77Additions by Jesper Toft (jesper@bzimage.dk)
78
79=head1 BUGS
80
81Things in the programme descriptions to handle:
82
83* Better categories from descriptions.
84
85* Customization of subtitles "Episode #" - perhaps even make it optional
86
87=cut
88
89######################################################################
90# initializations
91
92use strict;
93use XMLTV::Version '$Id: tv_grab_dk_tvtid,v 1.3 2010/09/02 05:07:40 rmeden Exp $ ';
94use XMLTV::Capabilities qw/baseline manualconfig cache/;
95use XMLTV::Description 'Denmark';
96use Getopt::Long;
97use HTML::TreeBuilder;
98use HTML::Entities; # parse entities
99use IO::File;
100use URI;
101use JSON;
102
103use Date::Manip;
104
105use XMLTV;
106use XMLTV::Memoize;
107use XMLTV::ProgressBar;
108use XMLTV::Ask;
109use XMLTV::Mode;
110use XMLTV::Config_file;
111use XMLTV::DST;
112use XMLTV::Date;
113# Todo: perhaps we should internationalize messages and docs?
114use XMLTV::Usage <<END
115$0: get Danish television listings in XMLTV format
116To configure: $0 --configure [--config-file FILE]
117To grab listings: $0 [--config-file FILE] [--output FILE] [--days N]
118[--offset N] [--quiet]
119To show capabilities: $0 --capabilities
120To show version: $0 --version
121END
122;
123
124# Use Log::TraceMessages if installed.
125BEGIN {
126    eval { require Log::TraceMessages };
127    if ($@) {
128        *t = sub {};
129        *d = sub { '' };
130    }
131    else {
132        *t = \&Log::TraceMessages::t;
133        *d = \&Log::TraceMessages::d;
134        Log::TraceMessages::check_argv();
135    }
136}
137
138use LWP::UserAgent;
139my $ua = LWP::UserAgent->new;
140$ua->agent("xmltv/$XMLTV::VERSION");
141
142# Initialize cookie_jar
143use HTTP::Cookies;
144my $cookies = HTTP::Cookies->new;
145$ua->cookie_jar($cookies);
146
147# Whether zero-length programmes should be included in the output.
148my $WRITE_ZERO_LENGTH = 0;
149
150# default language
151my $LANG = 'da';
152
153# Winter time in Denmark - summer time is one hour ahead of this.
154my $TZ = '+0100';
155
156sub process_summary_page( $$$ );
157sub process_listings_page( $$$$$ );
158
159######################################################################
160# get options
161
162# Known categories.
163my %tvtid_categories = ( 11854683 => 'B�rn og Unge',
164			 11848684 => 'Dokumentar',
165			 11825897 => 'Film',
166			 11830626 => 'Livsstil',
167			 11847662 => 'Musik',
168			 11838192 => 'Natur og Milj�',
169			 11840363 => 'Nyheder',
170			 11870463 => 'Regional',
171			 11831900 => 'Serier',
172			 11792069 => 'Sport',
173			 11837090 => 'Sundhed og Mad',
174			 11844770 => 'Underholdning',
175                         # The following are educated guesses
176			 11839678 => 'Kultur',
177			 11860240 => 'Shopping',
178			 11860606 => 'Lotto',
179			 11870299 => 'Religion',
180			 11839487 => 'Videnskab',
181			 11837327 => 'Alment',
182			 11840285 => 'Undervisning',
183    );
184
185my %categories = ( 11854683 => 'kids',
186		   11848684 => 'documentary',
187		   11825897 => 'movie',
188		   11830626 => 'lifestyle',
189		   11847662 => 'music',
190		   11838192 => 'nature',
191		   11840363 => 'news',
192		   11870463 => 'local',
193		   11831900 => 'series',
194		   11792069 => 'sport',
195		   11837090 => 'health',
196		   11844770 => 'entertainment',
197		   # The following are educated guesses
198		   11839678 => 'culture',
199		   11860240 => 'shopping',
200		   11860606 => 'lotto',
201		   11870299 => 'religion',
202		   11839487 => 'science',
203		   11837327 => 'misc',
204		   11840285 => 'education'
205    );
206
207my %movietypes = (      'action'        => 'action',
208                        'drama'         => 'drama',
209                        'erotisk'       => 'erotic',
210                        'eventyr'       => 'adventure',
211                        'gyser'         => 'horror',
212                        'komedie'       => 'comedy',
213                        'krimi'         => 'crime',
214                        'thriller'      => 'thriller',
215                        'romantisk'     => 'romance',
216                        'western'       => 'western'
217			);
218# Get options
219XMLTV::Memoize::check_argv('get_url');
220my ($opt_days, $opt_offset, $opt_help, $opt_output,
221    $opt_configure, $opt_config_file, $opt_gui,
222    $opt_quiet, $opt_list_channels);
223$opt_offset = 0; # default
224GetOptions('days=i'        => \$opt_days,
225           'offset=i'      => \$opt_offset,
226           'help'          => \$opt_help,
227           'configure'     => \$opt_configure,
228           'config-file=s' => \$opt_config_file,
229           'gui:s'         => \$opt_gui,
230           'output=s'      => \$opt_output,
231           'quiet'         => \$opt_quiet,
232           'list-channels' => \$opt_list_channels,
233	   )
234    or usage(0);
235
236usage(1) if $opt_help;
237
238die 'number of days must not be negative'
239    if ((defined $opt_days && $opt_days < 0) || ($opt_offset < 0));
240
241my $maxdays=7;
242
243die "tvtid.tv2.dk only provide information for today and the next $maxdays days."
244    if ($opt_offset > $maxdays);
245
246if (! defined $opt_days) {
247    # If there is no --days given. Set it to as many as possible.
248    $opt_days = $maxdays - $opt_offset;
249} else {
250    # --days option was given. Warn if its too high.
251    if (($opt_days + $opt_offset) > $maxdays) {
252	$opt_days = $maxdays - $opt_offset;
253	warn "tvtid.tv2.dk only provide information for today and the next $maxdays days (and not for all channels).";
254    }
255}
256
257XMLTV::Ask::init($opt_gui);
258
259my $mode = XMLTV::Mode::mode('grab', # default
260                             $opt_configure => 'configure',
261                             $opt_list_channels => 'list-channels',
262			     );
263
264# File that stores which channels to download.
265my $config_file
266    = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_dk_tvtid', $opt_quiet);
267
268if ($mode eq 'configure') {
269    XMLTV::Config_file::check_no_overwrite($config_file);
270      open(CONF, ">$config_file") or die "cannot write to $config_file: $!";
271      # find list of available channels
272      my $bar = new XMLTV::ProgressBar('getting list of channels', 1)
273	  if not $opt_quiet;
274      my %channels = get_channels();
275      die 'no channels could be found' if (scalar(keys(%channels)) == 0);
276      update $bar if not $opt_quiet;
277      $bar->finish() if not $opt_quiet;
278
279      # Ask about each channel.
280      my @chs = sort keys %channels;
281      my @names = map { $channels{$_} } @chs;
282      my @qs = map { "add channel $_?" } @names;
283      my @want = ask_many_boolean(1, @qs);
284      foreach (@chs) {
285	  my $w = shift @want;
286	  warn("cannot read input, stopping channel questions"), last
287	      if not defined $w;
288	  # No need to print to user - XMLTV::Ask is verbose enough.
289
290	  # Print a config line, but comment it out if channel not wanted.
291	  print CONF '#' if not $w;
292	  my $name = shift @names;
293	  print CONF "channel $_ $name\n";
294	  # TODO don't store display-name in config file.
295      }
296
297      close CONF or warn "cannot close $config_file: $!";
298      say("Finished configuration.");
299
300      exit();
301  }
302
303# Not configuring, we will need to write some output.
304die if $mode ne 'grab' and $mode ne 'list-channels';
305
306# If we are grabbing, check we can read the config file before doing
307# anything else.
308#
309my @config_lines;
310if ($mode eq 'grab') {
311    @config_lines = XMLTV::Config_file::read_lines($config_file);
312}
313
314my %w_args;
315if (defined $opt_output) {
316    my $fh = new IO::File(">$opt_output");
317    die "cannot write to $opt_output: $!" if not defined $fh;
318    $w_args{OUTPUT} = $fh;
319}
320$w_args{encoding} = 'ISO-8859-1';
321my $writer = new XMLTV::Writer(%w_args);
322# TODO: standardize these things between grabbers.
323$writer->start
324    ({ 'source-info-url'     => 'http://tvtid.tv2.dk/',
325       'source-data-url'     => 'http://tvtid.tv2.dk/',
326       'generator-info-name' => 'XMLTV',
327       'generator-info-url'  => 'http://xmltv.org/',
328   });
329
330if ($opt_list_channels) {
331    my $bar = new XMLTV::ProgressBar('getting list of channels', 1)
332	if not $opt_quiet;
333    my %channels = get_channels();
334    die 'no channels could be found' if (scalar(keys(%channels)) == 0);
335    update $bar if not $opt_quiet;
336
337    foreach my $ch_did (sort(keys %channels)) {
338	my $ch_name = $channels{$ch_did};
339	$writer->write_channel({ id => $ch_did,
340                                 'display-name' => [ [ $ch_name ] ],
341                                 'icon' => [{'src' => get_icon($ch_did)}]
342				 });
343    }
344    $bar->finish() if not $opt_quiet;
345    $writer->end();
346    exit();
347}
348
349# Not configuring or writing channels, must be grabbing listings.
350die if $mode ne 'grab';
351my (%channels, @channels, $ch_did, $ch_name);
352my (%convert, $orig_ch, $new_ch);
353my $line_num = 1;
354foreach (@config_lines) {
355    ++ $line_num;
356    next if not defined;
357
358    # FIXME channel data should be read from the site, and then the
359    # config file only gives the XMLTV ids that are interesting.
360    #
361    if (/^channel:?\s+(\S+)\s+([^\#]+)/) {
362	($ch_did, $ch_name) = ($1, $2);
363	$ch_name =~ s/\s*$//;
364	push @channels, $ch_did;
365	$channels{$ch_did} = $ch_name;
366    } elsif (/^convert:?\s+(\S+)\s+([^\#]+)/) {
367	($orig_ch, $new_ch) = ($1, $2);
368	$new_ch =~ s/\s*$//;
369	$convert{$orig_ch} = $new_ch;
370    } else {
371	warn "$config_file:$.: bad line\n";
372    }
373}
374
375
376######################################################################
377# subroutine definitions
378
379# get channel listing
380sub get_channels {
381    my %channels;
382
383    my $sec_per_day = 24*60*60;
384    my $eight_hours = 8*60*60; # Off by two for some reason
385    my $now8 = int(time()/$sec_per_day)*$sec_per_day+$eight_hours;
386
387    use JSON;
388    my $url = 'http://tvtid.tv2.dk/allekanaler/get.php/day-'.$now8.'.html';
389    my $json_t = get_url($url);
390    #print $url."\n\n".$json_t."\n\n";
391    my $json = new JSON(autoconv => 0, pretty => 1, indent => 2, utf8 => 1);
392    my $js_ref = $json->jsonToObj($json_t);
393    my @js = @$js_ref;
394
395    foreach my $elm (@js)
396    {
397	my $id = $elm->{logo};
398	$id =~ s,/img/logos/logo-,,;
399	$id =~ s,\.gif,,;
400	$channels{$id} = $elm->{name};
401    }
402    return %channels;
403}
404
405# Clean up bad characters in the site's HTML.
406my $warned_bad_chars;
407sub tidy( $ ) {
408    for (my $tmp = shift) {
409	tr/\222/''/;
410	tr/\011/ /; # tabs are allowed - turn them into spaces
411	if (tr/\012\015\040-\176\240-\377//dc) {
412	    warn 'removing bad characters' unless ($warned_bad_chars++ || $opt_quiet);
413	}
414	return $_;
415    }
416}
417
418my $fetched;
419sub get_url( $ ) {
420    sleep rand(5) if defined $fetched;
421    $fetched = 1;
422    my $c = tidy($ua->get(shift)->content);
423    return $c;
424}
425
426# Bump a YYYYMMDD date by one.
427sub correct_day {
428    my $d = shift;
429    my $h = shift;
430
431    $d = UnixDate(DateCalc($d, "+ 1 day"), '%Q') if UnixDate($h, '%H') < 6;
432
433    my ($base, $tz) = @{date_to_local(parse_local_date("$d " . $h, $TZ), $TZ)};
434
435    return UnixDate($base, '%q') . " $tz";
436}
437
438# Icon URL for a given channel.
439sub get_icon {
440    my ($url) = @_;
441    return "http://tvtid.tv2.dk/img/logos/logo-" . $url . ".gif";
442}
443
444# Split list of people into array
445sub get_people {
446    my $p = shift;
447    my $people;
448    $p =~ s/ m\.fl.+//;
449    @$people = split(/, | og |\//, $p);
450    s/.+:// foreach @$people;
451    s/^\s+// foreach @$people;
452    s/\s+$// foreach @$people;
453    s/[.]$// foreach @$people;
454    return $people;
455}
456
457######################################################################
458# begin main program
459
460my $sec_per_day = 24*60*60;
461my $eight_hours = 8*60*60; # Off by two for some reason
462my $now8 = int(time()/$sec_per_day)*$sec_per_day+$eight_hours;
463
464Date_Init('TZ=utc');
465
466foreach $ch_did (@channels) {
467    $ch_name = $channels{$ch_did};
468    $writer->write_channel({ id => $ch_did,
469                             'display-name' => [ [ $ch_name ] ],
470			     'icon' => [{'src' => get_icon($ch_did)}]
471			     }) unless $convert{$ch_did};
472}
473
474for (my $i = $opt_offset;$i<($opt_offset + $opt_days);$i++) {
475
476    my $sec_per_day = 24*60*60;
477    my $eight_hours = 8*60*60; # Off by two for some reason
478    my $day8 = int(time()/$sec_per_day+$i)*$sec_per_day+$eight_hours;
479
480
481    my $day = UnixDate(DateCalc(parse_date('now'), "+ $i days"), '%Q');
482
483    use JSON;
484    my $url = 'http://tvtid.tv2.dk/allekanaler/get.php/day-'.$day8.'.html';
485    my $json_t = get_url($url);
486
487    my $json = new JSON(autoconv => 0, pretty => 1, indent => 2, utf8 => 1);
488    my $js_ref = $json->jsonToObj($json_t);
489    my @js = @$js_ref;
490
491    foreach my $elm (@js)
492    {
493	my $id = $elm->{logo};
494	$id =~ s,/img/logos/logo-,,;
495	$id =~ s,\.gif,,;
496	if (defined $channels{$id}) {
497
498	    my $programs_ref = $elm->{programs};
499	    my @programs = @$programs_ref;
500	    for my $program (@programs) {
501		# If 'overlap=1' the program is present on the
502		# previous day as well, so skip it
503		next if defined $program->{overlap};
504
505		#print $json->objToJson($program)."\n";
506
507		my %prog = ();
508		if ($convert{$id}) {
509		    $prog{channel} = $convert{$id};
510		} else {
511		    $prog{channel} = $id;
512		}
513		$prog{start} = correct_day($day, $program->{start});
514		$prog{stop} = correct_day($day, $program->{end});
515		$prog{category} = [ [ $categories{$program->{cat}} ] ];
516
517		my $program_url="http://tvtid.tv2.dk/program/index.php/id-".$program->{'id'}.".html";
518		my $contents = get_url($program_url);
519
520		my $aspect;
521		$aspect = '4:3' if ($contents =~ /pictureFormat43 enabled/);
522		$aspect = '16:9' if ($contents =~ /pictureFormat169 enabled/);
523		my $rerun;
524		$rerun = {} if ($contents =~ /rerun enabled/);
525		my $sound = 'stereo';
526		$sound = 'surround' if ($contents =~ /surround enabled/);
527		my $teletext;
528		$teletext = {} if ($contents =~ /teletext enabled/);
529		my $subtitles;
530		$subtitles = {} if ($contents =~ /subtitles enabled/);
531		my $colour = 1;
532		$colour = 0 if ($contents =~ /blackwhite enabled/);
533		#if ($contents =~ /subtitlesHearingImpaired enabled/) { print "TTH "; }
534
535		my $descr;
536		my $with;
537		my $actors;
538		my $writers;
539		my $adapters;
540		my $presenters;
541		# Get program information. Keep "<" at the end intentionally.
542		if ($contents =~ /<div class="longinfo">(.+?<)\/div>/) {
543		    $descr = $1;
544
545		    $descr =~ s/\<p\>Sendt f�rste gang .*?\</</; # Remove
546		    $descr =~ s/\<p\>Sendes ogs� .*?\</</; # Remove
547
548		    $descr =~ s/\<p\>/ /g;	  # Remove <p>
549		    $descr =~ s/\<\/p\>/ /g;	  # Remove </p>
550
551		    if ($descr =~ /(.*)\<h2 class="programListHeader"\>Medvirkende:\<\/h2\>(.*?)(\<.*)/) {
552			($descr, $actors) = ($1 . $3, get_people($2));
553		    }
554
555		    if ($descr =~ /(.*)\<h2 class="programListHeader"\>Instrukt�r:\<\/h2\>(.*)(\<.*)/) {
556			($descr, $writers) = ($1 . $3, get_people($2));
557		    }
558
559		    if ($descr =~ /(.*)\<BR\>Tilrettel�ggelse: (.+?)(\<.*)/) {
560			($descr, $adapters) = ($1 . $3, get_people($2));
561		    }
562
563		    if ($descr =~ /(.*)V�rt: (.+?)\.(.*)/) {
564			($descr, $presenters) = ($1 . $3, get_people($2));
565		    }
566
567		    #if ($descr =~ /<BR>(.+)/) {
568			#print "***** $1 *****\n";
569		    #}
570
571		    # Clean up $descr:
572		    $descr =~ s/\<BR\>/ /g;	  # Remove <BR>
573		    $descr =~ s/<$//g;	   # Remove < at end
574		    $descr =~ s/ \s+/ /g; # Remove double spaces
575		    $descr =~ s/ +$//g;	 # rtrim
576		    $descr =~ s/^ +//g;	# ltrim
577
578		}
579
580		my $original;
581		if ($contents =~ /\<h2 class="originalTitle"\>Originaltitel: (.+?)\<\/h2\>/) {
582		    $original = $1;
583		}
584
585		my $episode;
586		if ($contents =~ /\<div class="episode"\>Episode: \((.+?)\)\<\/div\>/) {
587		    $episode = $1;
588		    $episode =~ s/:/\//;
589		}
590
591		my @titles = ([ $program->{title}, $LANG ]);
592		push @titles, [ $original ] if defined $original;
593		$prog{title} = \@titles;
594		$prog{desc} = ([ [ $descr, $LANG ] ]) if defined $descr && $descr;
595		my %v = ( present => 1,
596			  colour => $colour );
597		$v{aspect} = $aspect if defined $aspect;
598		$prog{video} = \%v;
599		$prog{audio} = { present => 1,
600				 stereo => $sound };
601		my $subtitle;
602		$subtitle = $program->{description} if $program->{description};
603		if (defined $episode) {
604		    if (defined $subtitle) {
605			$subtitle .= "." unless $subtitle =~ /\.$/;
606			$subtitle .= " Episode " . $episode . ".";
607		    } else {
608			$subtitle = "Episode " . $episode . ".";
609		    }
610		}
611		$prog{'sub-title'} = [ [ $subtitle, $LANG ] ] if defined $subtitle;
612		if (defined $subtitle && $subtitle =~ / fra (\d{4})/) {
613		    $prog{date} = $1;
614		}
615		$prog{subtitles} = [ { type => 'teletext' } ] if defined $teletext;
616		$prog{subtitles} = [ { type => 'onscreen' } ] if defined $subtitles;
617		$prog{'episode-num'} = [ [ $episode, 'onscreen' ] ] if defined $episode;
618		$prog{'previously-shown'} = $rerun if defined $rerun;
619		my %c;
620		$c{actor} = $actors if defined $actors;
621		$c{writer} = $writers if defined $writers;
622		$c{adapter} = $adapters if defined $adapters;
623		$c{presenter} = $presenters if defined $presenters;
624		$prog{credits} = \%c if %c;
625
626		#print $json->objToJson(\%prog) . "\n";
627
628		$writer->write_programme(\%prog);
629	    }
630	}
631    }
632}
633$writer->end();
634exit(0);
635