1#!/usr/local/bin/perl -w
2
3=pod
4
5=head1 NAME
6
7tv_grab_no - Grab TV listings for Norway.
8
9=head1 SYNOPSIS
10
11tv_grab_no --help
12
13tv_grab_no [--config-file FILE] --configure [--gui OPTION]
14
15tv_grab_no [--config-file FILE] [--output FILE] [--days N]
16           [--offset N] [--quiet]
17
18=head1 DESCRIPTION
19
20Output TV listings for several channels available in Norway.  The
21data comes from fredag.dagbladet.no. The grabber relies on parsing HTML so it
22might stop working at any time.
23
24First run B<tv_grab_no --configure> to choose, which channels you want
25to download. Then running B<tv_grab_no> with no arguments will output
26listings in XML format to standard output.
27
28B<--configure> Prompt for which channels,
29and write the configuration file.
30
31B<--config-file FILE> Set the name of the configuration file, the
32default is B<~/.xmltv/tv_grab_no.conf>.  This is the file written by
33B<--configure> and read when grabbing.
34
35B<--gui OPTION> Use this option to enable a graphical interface to be used.
36OPTION may be 'Tk', or left blank for the best available choice.
37Additional allowed values of OPTION are 'Term' for normal terminal output
38(default) and 'TermNoProgressBar' to disable the use of Term::ProgressBar.
39
40B<--output FILE> Write to FILE rather than standard output.
41
42B<--days N> Grab N days.  The default is as many as the source carries.
43
44B<--offset N> Start N days in the future.  The default is to start
45from today.
46
47B<--quiet> Suppress the progress messages normally written to standard
48error.
49
50B<--version> Show the version of the grabber.
51
52B<--help> Print a help message and exit.
53
54=head1 SEE ALSO
55
56L<xmltv(5)>.
57
58=head1 AUTHOR
59
60Christian Wattengaard (christian@wattengard.com). Heavily based on
61tv_grab_dk by Jesper Skov (jskov@zoftcorp.dk). tv_grab_dk
62originally based on tv_grab_nl by Guido Diepen and Ed Avis
63(ed@membled.com) and tv_grab_fi by Matti Airas.
64
65=head1 BUGS
66
67First release. Not aware of any bugs yet. But I'm sure there are some.
68
69=cut
70
71######################################################################
72# initializations
73
74use strict;
75use XMLTV::Version '$Id: tv_grab_no,v 1.23 2010/09/02 05:07:40 rmeden Exp $ ';
76use XMLTV::Capabilities qw/baseline manualconfig cache/;
77use XMLTV::Description 'Norway';
78use Getopt::Long;
79use HTML::Entities; # parse entities
80use IO::File;
81use URI;
82#use HTML::TableExtract;
83use Date::Manip;
84#use HTML::LinkExtractor;
85#use WWW::Mechanize;
86
87use XMLTV;
88use XMLTV::Memoize;
89use XMLTV::ProgressBar;
90use XMLTV::Ask;
91use XMLTV::Mode;
92use XMLTV::Config_file;
93use XMLTV::DST;
94use XMLTV::Get_nice;
95use XMLTV::Date;
96# Todo: perhaps we should internationalize messages and docs?
97use XMLTV::Usage <<END
98$0: get Norwegian television listings in XMLTV format
99To configure: $0 --configure [--config-file FILE] [--gui OPTION]
100To grab listings: $0 [--config-file FILE] [--output FILE] [--days N]
101        [--offset N] [--quiet]
102END
103  ;
104
105sub getchandisplaynames( @ );
106
107# Use Log::TraceMessages if installed.
108BEGIN {
109    eval { require Log::TraceMessages };
110    if ($@) {
111	*t = sub {};
112	*d = sub { '' };
113    }
114    else {
115	*t = \&Log::TraceMessages::t;
116	*d = \&Log::TraceMessages::d;
117	#Log::TraceMessages::check_argv();
118        #$Log::TraceMessages::Logfile = 'tv_grab_no.log';
119    }
120}
121
122# Whether zero-length programmes should be included in the output.
123my $WRITE_ZERO_LENGTH = 0;
124
125# default language
126my $LANG = 'no';
127
128# Winter time in Norway - summer time is one hour ahead of this.
129my $TZ = '+0100';
130
131my %xmltvid = (
132	1 => 'nrk1.nrk.no',
133	3 => 'tv2.no',
134	4 => 'tv3.no',
135	5 => 'tvnorge.no',
136	2 => 'nrk2.nrk.no',
137	63 => 'ztv.no',
138	37 => 'tv4.se',
139	68 => '3plus.dk',
140	12 => 'dr1.dr.dk',
141	58 => 'dr2.dr.dk',
142	6 => 'svt1.svt.se',
143	7 => 'svt2.svt.se',
144	47 => 'kanal5.se',
145	13 => 'dr2-2.dr.dk',
146	35 => 'tv3.dk',
147	36 => 'tv3.se',
148	28 => 'tv6.no',
149	30 => 'ztv.se',
150	59 => 'no.hallmarkchannel.tv',
151	31 => 'turnerclassicmovies.com',
152	39 => 'canalplus.canalplus.no',
153	69 => 'film1.canalplus.no',
154	70 => 'film2.canalplus.no',
155	42 => 'cinema.tv1000.no',
156	71 => 'cmore1.canalplus.no',
157	34 => 'tv1000.no',
158	79 => 'classic.tv1000.no',
159	76 => 'nordic.tv1000.no',
160	78 => 'action.tv1000.no',
161	80 => 'family.tv1000.no',
162	41 => 'cartoonnetworkhq.net',
163	20 => 'nickelodeon.se',
164	67 => 'disneychannel.tv',
165	16 => 'foxkids.no',
166	32 => 'tcmcn.no',
167	15 => 'eurosport.no',
168	75 => 'xtra.tv2.no',
169	72 => 'sport.canalplus.no',
170	55 => 'sportn.no',
171	73 => 'sport2.viasat.no',
172	74 => 'sport3.viasat.no',
173	49 => 'extreme.com',
174	11 => 'dsf.de',
175	17 => 'mtve.com',
176	29 => 'vh1e.com',
177	53 => 'traveladventure.discoverychannel.com',
178	33 => 'travelchannel.com',
179	66 => 'explorer.viasat.no',
180	44 => 'discoverychannel.com',
181	10 => 'animalplanet.discoverychannel.com',
182	51 => 'civilisation.discoverychannel.com',
183	52 => 'scitrek.discoverychannel.com',
184	18 => 'cnbcng.no',
185	61 => 'realitytv.co.uk',
186	46 => 'tv8.se',
187	38 => 'world.bbc.co.uk',
188	62 => 'cnbc.com',
189	25 => 'news.sky.co.uk',
190	26 => 'skynewsng.no',
191	43 => 'cnn.com',
192	14 => 'dw.de',
193	9 => '3sat.de',
194	22 => '1.rtl.de',
195	23 => '2.rtl.de',
196	27 => 'tv5.fr',
197	21 => 'pro7.de',
198	24 => 'sat1.de',
199	19 => 'prime.bbc.co.uk',
200	50 => 'club.no',
201	77 => 'star.nonstop.tv',
202	88 => 'sport2.canalplus.no',
203	87 => 'mix.canalplus.no',
204	86 => '2.eurosport.no',
205	85 => 'toon.disneychannel.tv',
206	84 => 'thevoice.no',
207	81 => 'ngcnorge.com',
208	82 => 'jetix.no',
209	83 => 'showtime.nonstop.tv',
210	89 => 'playhouse.disneychannel.tv',
211	90 => 'tv6.viasat.no',
212	91 => 'hd.canalplus.no',
213	92 => 'film3.canalplus.no',
214	93 => 'film.tv2.no'
215	);
216
217sub process_summary_page( $$$ );
218sub process_listings_page( $$$$ );
219
220######################################################################
221# get options
222
223# Get options
224XMLTV::Memoize::check_argv('XMLTV::Get_nice::get_nice_aux');
225my ($opt_days, $opt_offset, $opt_help, $opt_output,
226    $opt_configure, $opt_config_file, $opt_gui, $opt_quiet,
227    $opt_list_channels);
228$opt_days   = 4; # default
229$opt_offset = 0; # default
230GetOptions('days=i'        => \$opt_days,
231	   'offset=i'      => \$opt_offset,
232	   'help'          => \$opt_help,
233	   'configure'     => \$opt_configure,
234	   'config-file=s' => \$opt_config_file,
235       'gui:s'         => \$opt_gui,
236	   'output=s'      => \$opt_output,
237	   'quiet'         => \$opt_quiet,
238	   'list-channels' => \$opt_list_channels,
239	  )
240  or usage(0);
241die 'number of days must not be negative'
242  if (defined $opt_days && $opt_days < 0);
243usage(1) if $opt_help;
244XMLTV::Ask::init($opt_gui);
245if ($opt_days > 7) {
246    print "WARNING: This grabber can only grab 7 days ahead!\n-------: I will grab 7 days and then quit.\n";
247    $opt_days = 7;
248}
249
250my $mode = XMLTV::Mode::mode('grab', # default
251			     $opt_configure => 'configure',
252			     $opt_list_channels => 'list-channels',
253			    );
254
255# File that stores which channels to download.
256my $config_file
257  = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_no', $opt_quiet);
258
259if ($mode eq 'configure') {
260    XMLTV::Config_file::check_no_overwrite($config_file);
261    open(CONF, ">$config_file") or die "cannot write to $config_file: $!";
262    # find list of available channels
263    my $bar = new XMLTV::ProgressBar('getting list of channels', 1)
264      if not $opt_quiet;
265    my %channels = get_channels();
266    die 'no channels could be found' if (scalar(keys(%channels)) == 0);
267    update $bar if not $opt_quiet;
268    $bar->finish() if not $opt_quiet;
269
270    # Ask about each channel.
271    my @chs = sort keys %channels;
272    my @names = map { $channels{$_} } @chs;
273    my @qs = map { "add channel $_?" } @names;
274    my @want = ask_many_boolean(1, @qs);
275    foreach (@chs) {
276	my $w = shift @want;
277	warn("cannot read input, stopping channel questions"), last
278	  if not defined $w;
279	# No need to print to user - XMLTV::Ask is verbose enough.
280
281	# Print a config line, but comment it out if channel not wanted.
282	print CONF '#' if not $w;
283	my $name = shift @names;
284        print CONF "channel $_\n";
285	# TODO don't store display-name in config file.
286    }
287
288    close CONF or warn "cannot close $config_file: $!";
289    say("Finished configuration.");
290
291    exit();
292}
293
294# Not configuring, we will need to write some output.
295die if $mode ne 'grab' and $mode ne 'list-channels';
296
297# If we are grabbing, check we can read the config file before doing
298# anything else.
299#
300my @config_lines;
301if ($mode eq 'grab') {
302    @config_lines = XMLTV::Config_file::read_lines($config_file);
303}
304
305my %w_args;
306if (defined $opt_output) {
307    my $fh = new IO::File(">$opt_output");
308    die "cannot write to $opt_output: $!" if not defined $fh;
309    $w_args{OUTPUT} = $fh;
310}
311$w_args{encoding} = 'ISO-8859-1';
312my $writer = new XMLTV::Writer(%w_args);
313# TODO: standardize these things between grabbers.
314$writer->start
315  ({ 'source-info-url'     => 'http://fredag.dagbladet.no/tv/',
316     'source-data-url'     => 'http://fredag.dagbladet.no/tv/',
317     'generator-info-name' => 'XMLTV',
318     'generator-info-url'  => 'http://xmltv.org/',
319   });
320
321if ($opt_list_channels) {
322    my $bar = new XMLTV::ProgressBar('getting list of channels', 1)
323      if not $opt_quiet;
324    my %channels = get_channels();
325    die 'no channels could be found' if (scalar(keys(%channels)) == 0);
326    update $bar if not $opt_quiet;
327    $bar->finish() if not $opt_quiet;
328
329    foreach my $ch_did (sort(keys %channels)) {
330	my $ch_name = $channels{$ch_did};
331	#my $ch_xid = $ch_did;
332	my $ch_xid = $xmltvid{$ch_did};
333	$writer->write_channel({ id => $ch_xid,
334				 'display-name' => [ [ $ch_name ] ] });
335    }
336    $writer->end();
337    exit();
338}
339
340# Not configuring or writing channels, must be grabbing listings.
341die if $mode ne 'grab';
342my (%channels, @channels, $ch_did, $ch_name);
343my $line_num = 1;
344foreach (@config_lines) {
345    ++ $line_num;
346    next if not defined;
347    if (/^channel ([0-9]+)/) {
348        push @channels, $1;
349    }
350    else {
351	warn "$config_file:$.: bad line\n";
352    }
353}
354
355my $configbar = new XMLTV::ProgressBar('fetching channel names', 1)
356  if not $opt_quiet;
357%channels = getchandisplaynames(@channels);
358update $configbar if not $opt_quiet;
359$configbar->finish() if not $opt_quiet;
360
361######################################################################
362# begin main program
363
364my $now = parse_date('now');
365die if not defined $now;
366
367my @to_get;
368
369# the order in which we fetch the channels matters
370my $today = UnixDate($now, '%Y-%m-%d'); die if not defined $today;
371foreach $ch_did (@channels) {
372    $ch_name = $channels{$ch_did};
373    my $ch_xid = $xmltvid{$ch_did};
374    $writer->write_channel({ id => $ch_xid,
375			     'display-name' => [ [ $ch_name ] ] });
376}
377
378my %warned_ch_name; # suppress duplicate warnings
379my $bar = new XMLTV::ProgressBar('fetching data', @channels * $opt_days)
380  if not $opt_quiet;
381foreach my $d (0 .. $opt_days - 1) {
382    my $i = $opt_offset + $d;
383    my $day = UnixDate(DateCalc($today, "+ $i days"), '%Y-%m-%d');
384    t "turned offset $i (from $today) into date $day";
385    my $num_this_day = 0;
386    foreach $ch_did (@channels) {
387	my $ch_xid = $xmltvid{$ch_did};
388	# Request day when constructing URL since it is represented as
389	# an integere offset from today. Still pass in the computed
390	# date - may need it sometime...
391	#my $url = 'http://fredag.dagbladet.no/tv/index.html'
392	#  . "?kanal_id=$ch_did&dag=$i&fra_tid=0&til_tid=24";
393	my $url = 'http://www.dagbladet.no/mobil/tv/'
394	  . "liste.html?kanal_id=$ch_did&dag=$i&fra_tid=0&til_tid=24";
395	push @to_get, [ $url, $day, $ch_xid, $ch_did ];
396
397	my $got = process_listings_page($writer, $ch_xid, $url, $day);
398	warn "no listings for channel $ch_xid on day $d, $url\n"
399	  if $got == 0;
400	$num_this_day += $got;
401	update $bar if not $opt_quiet;
402    }
403    if ($num_this_day == 0) {
404	die "could not get any listings\n" if $d == 0;
405	warn "could get only one day of listings, not $opt_days\n" if $d == 1;
406	warn "could get only $d days of listings, not $opt_days\n" if $d > 1;
407	last;
408    }
409}
410$bar->finish() if not $opt_quiet;
411$writer->end();
412
413######################################################################
414# subroutine definitions
415
416# Remove bad characters.
417my $warned_bad_chars;
418sub tidy ( $ ) {
419    my $s = shift;
420    $s =~ tr/\t\031/ /d;
421    if ($s =~ s/([^\012\015\040-\176\240-\377]+)//g) {
422	warn "removing bad characters: '$1'"
423	  unless $warned_bad_chars++;
424    }
425    return $s;
426}
427
428# arguments:
429#   XMLTV::Writer object to write to
430#   XMLTV id of channel
431#   URL to fetch
432#   Date::Manip object giving day for programmes in page (at least
433#     until they cross midnight)
434#
435# Returns number of programmes written.
436#
437my ($warned_discarding_parts, $commented_episode_num);
438sub process_listings_page ( $$$$ ){
439    #    local $Log::TraceMessages::On = 1;
440    my ($writer, $ch_xmltv_id, $url, $day_obj) = @_;
441    my $count = 0;
442    my $next_day = 0;
443    my $day = UnixDate($day_obj, '%Q');
444    t "getting channel $ch_xmltv_id, date $day";
445    my $ts;
446    my $row;
447    my $listingspage = tidy(get_nice($url));
448    #my $tabex = new HTML::TableExtract( depth => 1, count => 1 , keep_html => 1 );
449    #$tabex->parse($listingspage);
450	#my ($newstart, $newstop, $newtitle, $newdesc, $rerun, $normalsubs, $teletextsubs, $eptotal, $ep, $epall);
451	my @elements = split(/<hr.*>/, $listingspage);
452
453	foreach my $element (@elements) {
454		#print "\n$element";
455		#print $1 if $element =~ m!</b><br>\n(.*)!;
456		my ($newstart, $newstop, $newtitle, $newdesc, $rerun, $normalsubs, $teletextsubs, $eptotal, $ep, $epall);
457		if ($element =~ m/[0-9][0-9]\.[0-9][0-9] - [0-9][0-9]\.[0-9][0-9]/) {
458			#print "---\n$element\n---";
459			($newstart, $newstop) = ($1, $2) if $element =~ m/([0-9][0-9]\.[0-9][0-9]) - ([0-9][0-9]\.[0-9][0-9])/;
460			$newtitle = $1 if $element =~ m!<b>(.*)</b>!;
461			$newdesc = $1 if $element =~ m!</b><br>\n(.*)!;
462			#$newdesc = encode_entities($newdesc) if $newdesc;
463			#print $1 if $element =~ m!</b><br>\n(.*)!;
464			#print $newdesc;
465			$rerun = {} if $newtitle =~ s/\([Rr]\)//;
466			$normalsubs = 1 if $newtitle =~ s/\(t\)//;
467			$teletextsubs = 1 if $newtitle =~ s/\(ttv\)//;
468			#for ($newtitle) { s/^\s+//; s/\s+$//; }
469
470			#my ($eptotal, $ep, $epall);
471			for ($newtitle) {
472		    	if (s/\(([0-9]+)\)//) {
473				$eptotal = $1 - 1;
474		    	}
475		    	elsif (s/\(([0-9]+):([0-9]+)\)//) {
476				($ep, $epall) = ($1 - 1, $2);
477		    	}
478		    	elsif (s/\(:([0-9]+)\)//) {
479				$eptotal = $1 - 1;
480				}
481			}
482		    	#s/\s+$//;
483
484
485		my $originaltitle = $1
486		  if $newdesc && $newdesc =~ s/^\((.*)\) //;
487		for ($newtitle) { s/^\s+//; s/\s+$//; }
488		#$newdesc = " " if $newdesc eq "";
489		#print "$newstart - $newstop - $newtitle - $newdesc - Day: $day\n";
490		$newdesc = decode_entities($newdesc) if $newdesc;
491		if ($newdesc) {
492		for ($newdesc) { s/^\s+//; s/\s+$// }
493		}
494		$originaltitle = decode_entities($originaltitle) if $originaltitle;
495		$newstart =~ m/(\d.)\.(\d.)/ or die;
496		my $xmltvstart = utc_offset("$day$1${2}00", $TZ);
497
498		$newstop =~ m/(\d.)\.(\d.)/ or die;
499		my $xmltvstop = utc_offset("$day$1${2}00", $TZ);
500
501		my $newxmltvstop;
502		if (Date_Cmp($xmltvstart, $xmltvstop) > 0 ) {
503		    $day = nextday($day);
504		    $newstop =~ m/(\d.)\.(\d.)/ or die;
505		    $xmltvstop = utc_offset("$day$1${2}00", $TZ);
506		}
507		my %prog = (	start 	=> $xmltvstart,
508				stop	=> $xmltvstop,
509				channel => $ch_xmltv_id);
510		$prog{title} = [ [ $newtitle , $LANG ] , [ $originaltitle , "en" ] ] if $originaltitle;
511		$prog{title} = [ [ $newtitle , $LANG ] ] if not $originaltitle;
512		$prog{desc} = [ [ $newdesc , $LANG ] ] if $newdesc;
513		push @{$prog{'episode-num'}}, [ " . $ep/$epall . " , "xmltv_ns" ] if $ep;
514		push @{$prog{'episode-num'}}, [ $eptotal , "all-seasons" ] if $eptotal;
515		$prog{subtitles} = [ { type => 'teletext' } ] if $teletextsubs;
516		$prog{subtitles} = [ { type => 'onscreen' } ] if $normalsubs;
517		$prog{'previously-shown'} = $rerun if defined $rerun;
518
519		$writer->write_programme(\%prog);
520		++$count;
521		}
522	    }
523
524
525    return $count;
526}
527
528# get channel listing
529sub get_channels {
530    my $page = tidy(get_nice('http://fredag.dagbladet.no/tv/select_channels.html'));
531    ##my $page = tidy(get_nice('http://www.dagbladet.no/tv/mobil/index.html'));
532	my @chanarray = split(/<input/, $page);
533	my %channels;
534	foreach my $item (@chanarray) {
535		#print "\nAdding channel from itemtext:\n$item";
536		#print "\nFound: $1 --- $2\n" if $item =~ /value=\"([0-9]+)\".*>.(.*)<br>/;
537		$channels{$1} = $2 if $item =~ /value=\"?([0-9]+)\"?.*\">(.*)<\/label>/;
538	}
539	return %channels;
540}
541
542sub getchandisplaynames( @ ) {
543    my %channels = get_channels();
544    my %r;
545    foreach my $ch_did (@_) { $r{$ch_did} = $channels{$ch_did}; }
546    return %r;
547}
548
549# Bump a YYYYMMDD date by one.
550sub nextday {
551    my $d = shift;
552    my $p = parse_date($d);
553    my $n = DateCalc($p, '+ 1 day');
554    return UnixDate($n, '%Q');
555}
556