1#!/usr/local/bin/perl -w 2 3=pod 4 5=head1 NAME 6 7tv_grab_no - Grab TV listings for Norway. 8 9=head1 SYNOPSIS 10 11tv_grab_no --help 12 13tv_grab_no [--config-file FILE] --configure [--gui OPTION] 14 15tv_grab_no [--config-file FILE] [--output FILE] [--days N] 16 [--offset N] [--quiet] 17 18=head1 DESCRIPTION 19 20Output TV listings for several channels available in Norway. The 21data comes from fredag.dagbladet.no. The grabber relies on parsing HTML so it 22might stop working at any time. 23 24First run B<tv_grab_no --configure> to choose, which channels you want 25to download. Then running B<tv_grab_no> with no arguments will output 26listings in XML format to standard output. 27 28B<--configure> Prompt for which channels, 29and write the configuration file. 30 31B<--config-file FILE> Set the name of the configuration file, the 32default is B<~/.xmltv/tv_grab_no.conf>. This is the file written by 33B<--configure> and read when grabbing. 34 35B<--gui OPTION> Use this option to enable a graphical interface to be used. 36OPTION may be 'Tk', or left blank for the best available choice. 37Additional allowed values of OPTION are 'Term' for normal terminal output 38(default) and 'TermNoProgressBar' to disable the use of Term::ProgressBar. 39 40B<--output FILE> Write to FILE rather than standard output. 41 42B<--days N> Grab N days. The default is as many as the source carries. 43 44B<--offset N> Start N days in the future. The default is to start 45from today. 46 47B<--quiet> Suppress the progress messages normally written to standard 48error. 49 50B<--version> Show the version of the grabber. 51 52B<--help> Print a help message and exit. 53 54=head1 SEE ALSO 55 56L<xmltv(5)>. 57 58=head1 AUTHOR 59 60Christian Wattengaard (christian@wattengard.com). Heavily based on 61tv_grab_dk by Jesper Skov (jskov@zoftcorp.dk). tv_grab_dk 62originally based on tv_grab_nl by Guido Diepen and Ed Avis 63(ed@membled.com) and tv_grab_fi by Matti Airas. 64 65=head1 BUGS 66 67First release. Not aware of any bugs yet. But I'm sure there are some. 68 69=cut 70 71###################################################################### 72# initializations 73 74use strict; 75use XMLTV::Version '$Id: tv_grab_no,v 1.23 2010/09/02 05:07:40 rmeden Exp $ '; 76use XMLTV::Capabilities qw/baseline manualconfig cache/; 77use XMLTV::Description 'Norway'; 78use Getopt::Long; 79use HTML::Entities; # parse entities 80use IO::File; 81use URI; 82#use HTML::TableExtract; 83use Date::Manip; 84#use HTML::LinkExtractor; 85#use WWW::Mechanize; 86 87use XMLTV; 88use XMLTV::Memoize; 89use XMLTV::ProgressBar; 90use XMLTV::Ask; 91use XMLTV::Mode; 92use XMLTV::Config_file; 93use XMLTV::DST; 94use XMLTV::Get_nice; 95use XMLTV::Date; 96# Todo: perhaps we should internationalize messages and docs? 97use XMLTV::Usage <<END 98$0: get Norwegian television listings in XMLTV format 99To configure: $0 --configure [--config-file FILE] [--gui OPTION] 100To grab listings: $0 [--config-file FILE] [--output FILE] [--days N] 101 [--offset N] [--quiet] 102END 103 ; 104 105sub getchandisplaynames( @ ); 106 107# Use Log::TraceMessages if installed. 108BEGIN { 109 eval { require Log::TraceMessages }; 110 if ($@) { 111 *t = sub {}; 112 *d = sub { '' }; 113 } 114 else { 115 *t = \&Log::TraceMessages::t; 116 *d = \&Log::TraceMessages::d; 117 #Log::TraceMessages::check_argv(); 118 #$Log::TraceMessages::Logfile = 'tv_grab_no.log'; 119 } 120} 121 122# Whether zero-length programmes should be included in the output. 123my $WRITE_ZERO_LENGTH = 0; 124 125# default language 126my $LANG = 'no'; 127 128# Winter time in Norway - summer time is one hour ahead of this. 129my $TZ = '+0100'; 130 131my %xmltvid = ( 132 1 => 'nrk1.nrk.no', 133 3 => 'tv2.no', 134 4 => 'tv3.no', 135 5 => 'tvnorge.no', 136 2 => 'nrk2.nrk.no', 137 63 => 'ztv.no', 138 37 => 'tv4.se', 139 68 => '3plus.dk', 140 12 => 'dr1.dr.dk', 141 58 => 'dr2.dr.dk', 142 6 => 'svt1.svt.se', 143 7 => 'svt2.svt.se', 144 47 => 'kanal5.se', 145 13 => 'dr2-2.dr.dk', 146 35 => 'tv3.dk', 147 36 => 'tv3.se', 148 28 => 'tv6.no', 149 30 => 'ztv.se', 150 59 => 'no.hallmarkchannel.tv', 151 31 => 'turnerclassicmovies.com', 152 39 => 'canalplus.canalplus.no', 153 69 => 'film1.canalplus.no', 154 70 => 'film2.canalplus.no', 155 42 => 'cinema.tv1000.no', 156 71 => 'cmore1.canalplus.no', 157 34 => 'tv1000.no', 158 79 => 'classic.tv1000.no', 159 76 => 'nordic.tv1000.no', 160 78 => 'action.tv1000.no', 161 80 => 'family.tv1000.no', 162 41 => 'cartoonnetworkhq.net', 163 20 => 'nickelodeon.se', 164 67 => 'disneychannel.tv', 165 16 => 'foxkids.no', 166 32 => 'tcmcn.no', 167 15 => 'eurosport.no', 168 75 => 'xtra.tv2.no', 169 72 => 'sport.canalplus.no', 170 55 => 'sportn.no', 171 73 => 'sport2.viasat.no', 172 74 => 'sport3.viasat.no', 173 49 => 'extreme.com', 174 11 => 'dsf.de', 175 17 => 'mtve.com', 176 29 => 'vh1e.com', 177 53 => 'traveladventure.discoverychannel.com', 178 33 => 'travelchannel.com', 179 66 => 'explorer.viasat.no', 180 44 => 'discoverychannel.com', 181 10 => 'animalplanet.discoverychannel.com', 182 51 => 'civilisation.discoverychannel.com', 183 52 => 'scitrek.discoverychannel.com', 184 18 => 'cnbcng.no', 185 61 => 'realitytv.co.uk', 186 46 => 'tv8.se', 187 38 => 'world.bbc.co.uk', 188 62 => 'cnbc.com', 189 25 => 'news.sky.co.uk', 190 26 => 'skynewsng.no', 191 43 => 'cnn.com', 192 14 => 'dw.de', 193 9 => '3sat.de', 194 22 => '1.rtl.de', 195 23 => '2.rtl.de', 196 27 => 'tv5.fr', 197 21 => 'pro7.de', 198 24 => 'sat1.de', 199 19 => 'prime.bbc.co.uk', 200 50 => 'club.no', 201 77 => 'star.nonstop.tv', 202 88 => 'sport2.canalplus.no', 203 87 => 'mix.canalplus.no', 204 86 => '2.eurosport.no', 205 85 => 'toon.disneychannel.tv', 206 84 => 'thevoice.no', 207 81 => 'ngcnorge.com', 208 82 => 'jetix.no', 209 83 => 'showtime.nonstop.tv', 210 89 => 'playhouse.disneychannel.tv', 211 90 => 'tv6.viasat.no', 212 91 => 'hd.canalplus.no', 213 92 => 'film3.canalplus.no', 214 93 => 'film.tv2.no' 215 ); 216 217sub process_summary_page( $$$ ); 218sub process_listings_page( $$$$ ); 219 220###################################################################### 221# get options 222 223# Get options 224XMLTV::Memoize::check_argv('XMLTV::Get_nice::get_nice_aux'); 225my ($opt_days, $opt_offset, $opt_help, $opt_output, 226 $opt_configure, $opt_config_file, $opt_gui, $opt_quiet, 227 $opt_list_channels); 228$opt_days = 4; # default 229$opt_offset = 0; # default 230GetOptions('days=i' => \$opt_days, 231 'offset=i' => \$opt_offset, 232 'help' => \$opt_help, 233 'configure' => \$opt_configure, 234 'config-file=s' => \$opt_config_file, 235 'gui:s' => \$opt_gui, 236 'output=s' => \$opt_output, 237 'quiet' => \$opt_quiet, 238 'list-channels' => \$opt_list_channels, 239 ) 240 or usage(0); 241die 'number of days must not be negative' 242 if (defined $opt_days && $opt_days < 0); 243usage(1) if $opt_help; 244XMLTV::Ask::init($opt_gui); 245if ($opt_days > 7) { 246 print "WARNING: This grabber can only grab 7 days ahead!\n-------: I will grab 7 days and then quit.\n"; 247 $opt_days = 7; 248} 249 250my $mode = XMLTV::Mode::mode('grab', # default 251 $opt_configure => 'configure', 252 $opt_list_channels => 'list-channels', 253 ); 254 255# File that stores which channels to download. 256my $config_file 257 = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_no', $opt_quiet); 258 259if ($mode eq 'configure') { 260 XMLTV::Config_file::check_no_overwrite($config_file); 261 open(CONF, ">$config_file") or die "cannot write to $config_file: $!"; 262 # find list of available channels 263 my $bar = new XMLTV::ProgressBar('getting list of channels', 1) 264 if not $opt_quiet; 265 my %channels = get_channels(); 266 die 'no channels could be found' if (scalar(keys(%channels)) == 0); 267 update $bar if not $opt_quiet; 268 $bar->finish() if not $opt_quiet; 269 270 # Ask about each channel. 271 my @chs = sort keys %channels; 272 my @names = map { $channels{$_} } @chs; 273 my @qs = map { "add channel $_?" } @names; 274 my @want = ask_many_boolean(1, @qs); 275 foreach (@chs) { 276 my $w = shift @want; 277 warn("cannot read input, stopping channel questions"), last 278 if not defined $w; 279 # No need to print to user - XMLTV::Ask is verbose enough. 280 281 # Print a config line, but comment it out if channel not wanted. 282 print CONF '#' if not $w; 283 my $name = shift @names; 284 print CONF "channel $_\n"; 285 # TODO don't store display-name in config file. 286 } 287 288 close CONF or warn "cannot close $config_file: $!"; 289 say("Finished configuration."); 290 291 exit(); 292} 293 294# Not configuring, we will need to write some output. 295die if $mode ne 'grab' and $mode ne 'list-channels'; 296 297# If we are grabbing, check we can read the config file before doing 298# anything else. 299# 300my @config_lines; 301if ($mode eq 'grab') { 302 @config_lines = XMLTV::Config_file::read_lines($config_file); 303} 304 305my %w_args; 306if (defined $opt_output) { 307 my $fh = new IO::File(">$opt_output"); 308 die "cannot write to $opt_output: $!" if not defined $fh; 309 $w_args{OUTPUT} = $fh; 310} 311$w_args{encoding} = 'ISO-8859-1'; 312my $writer = new XMLTV::Writer(%w_args); 313# TODO: standardize these things between grabbers. 314$writer->start 315 ({ 'source-info-url' => 'http://fredag.dagbladet.no/tv/', 316 'source-data-url' => 'http://fredag.dagbladet.no/tv/', 317 'generator-info-name' => 'XMLTV', 318 'generator-info-url' => 'http://xmltv.org/', 319 }); 320 321if ($opt_list_channels) { 322 my $bar = new XMLTV::ProgressBar('getting list of channels', 1) 323 if not $opt_quiet; 324 my %channels = get_channels(); 325 die 'no channels could be found' if (scalar(keys(%channels)) == 0); 326 update $bar if not $opt_quiet; 327 $bar->finish() if not $opt_quiet; 328 329 foreach my $ch_did (sort(keys %channels)) { 330 my $ch_name = $channels{$ch_did}; 331 #my $ch_xid = $ch_did; 332 my $ch_xid = $xmltvid{$ch_did}; 333 $writer->write_channel({ id => $ch_xid, 334 'display-name' => [ [ $ch_name ] ] }); 335 } 336 $writer->end(); 337 exit(); 338} 339 340# Not configuring or writing channels, must be grabbing listings. 341die if $mode ne 'grab'; 342my (%channels, @channels, $ch_did, $ch_name); 343my $line_num = 1; 344foreach (@config_lines) { 345 ++ $line_num; 346 next if not defined; 347 if (/^channel ([0-9]+)/) { 348 push @channels, $1; 349 } 350 else { 351 warn "$config_file:$.: bad line\n"; 352 } 353} 354 355my $configbar = new XMLTV::ProgressBar('fetching channel names', 1) 356 if not $opt_quiet; 357%channels = getchandisplaynames(@channels); 358update $configbar if not $opt_quiet; 359$configbar->finish() if not $opt_quiet; 360 361###################################################################### 362# begin main program 363 364my $now = parse_date('now'); 365die if not defined $now; 366 367my @to_get; 368 369# the order in which we fetch the channels matters 370my $today = UnixDate($now, '%Y-%m-%d'); die if not defined $today; 371foreach $ch_did (@channels) { 372 $ch_name = $channels{$ch_did}; 373 my $ch_xid = $xmltvid{$ch_did}; 374 $writer->write_channel({ id => $ch_xid, 375 'display-name' => [ [ $ch_name ] ] }); 376} 377 378my %warned_ch_name; # suppress duplicate warnings 379my $bar = new XMLTV::ProgressBar('fetching data', @channels * $opt_days) 380 if not $opt_quiet; 381foreach my $d (0 .. $opt_days - 1) { 382 my $i = $opt_offset + $d; 383 my $day = UnixDate(DateCalc($today, "+ $i days"), '%Y-%m-%d'); 384 t "turned offset $i (from $today) into date $day"; 385 my $num_this_day = 0; 386 foreach $ch_did (@channels) { 387 my $ch_xid = $xmltvid{$ch_did}; 388 # Request day when constructing URL since it is represented as 389 # an integere offset from today. Still pass in the computed 390 # date - may need it sometime... 391 #my $url = 'http://fredag.dagbladet.no/tv/index.html' 392 # . "?kanal_id=$ch_did&dag=$i&fra_tid=0&til_tid=24"; 393 my $url = 'http://www.dagbladet.no/mobil/tv/' 394 . "liste.html?kanal_id=$ch_did&dag=$i&fra_tid=0&til_tid=24"; 395 push @to_get, [ $url, $day, $ch_xid, $ch_did ]; 396 397 my $got = process_listings_page($writer, $ch_xid, $url, $day); 398 warn "no listings for channel $ch_xid on day $d, $url\n" 399 if $got == 0; 400 $num_this_day += $got; 401 update $bar if not $opt_quiet; 402 } 403 if ($num_this_day == 0) { 404 die "could not get any listings\n" if $d == 0; 405 warn "could get only one day of listings, not $opt_days\n" if $d == 1; 406 warn "could get only $d days of listings, not $opt_days\n" if $d > 1; 407 last; 408 } 409} 410$bar->finish() if not $opt_quiet; 411$writer->end(); 412 413###################################################################### 414# subroutine definitions 415 416# Remove bad characters. 417my $warned_bad_chars; 418sub tidy ( $ ) { 419 my $s = shift; 420 $s =~ tr/\t\031/ /d; 421 if ($s =~ s/([^\012\015\040-\176\240-\377]+)//g) { 422 warn "removing bad characters: '$1'" 423 unless $warned_bad_chars++; 424 } 425 return $s; 426} 427 428# arguments: 429# XMLTV::Writer object to write to 430# XMLTV id of channel 431# URL to fetch 432# Date::Manip object giving day for programmes in page (at least 433# until they cross midnight) 434# 435# Returns number of programmes written. 436# 437my ($warned_discarding_parts, $commented_episode_num); 438sub process_listings_page ( $$$$ ){ 439 # local $Log::TraceMessages::On = 1; 440 my ($writer, $ch_xmltv_id, $url, $day_obj) = @_; 441 my $count = 0; 442 my $next_day = 0; 443 my $day = UnixDate($day_obj, '%Q'); 444 t "getting channel $ch_xmltv_id, date $day"; 445 my $ts; 446 my $row; 447 my $listingspage = tidy(get_nice($url)); 448 #my $tabex = new HTML::TableExtract( depth => 1, count => 1 , keep_html => 1 ); 449 #$tabex->parse($listingspage); 450 #my ($newstart, $newstop, $newtitle, $newdesc, $rerun, $normalsubs, $teletextsubs, $eptotal, $ep, $epall); 451 my @elements = split(/<hr.*>/, $listingspage); 452 453 foreach my $element (@elements) { 454 #print "\n$element"; 455 #print $1 if $element =~ m!</b><br>\n(.*)!; 456 my ($newstart, $newstop, $newtitle, $newdesc, $rerun, $normalsubs, $teletextsubs, $eptotal, $ep, $epall); 457 if ($element =~ m/[0-9][0-9]\.[0-9][0-9] - [0-9][0-9]\.[0-9][0-9]/) { 458 #print "---\n$element\n---"; 459 ($newstart, $newstop) = ($1, $2) if $element =~ m/([0-9][0-9]\.[0-9][0-9]) - ([0-9][0-9]\.[0-9][0-9])/; 460 $newtitle = $1 if $element =~ m!<b>(.*)</b>!; 461 $newdesc = $1 if $element =~ m!</b><br>\n(.*)!; 462 #$newdesc = encode_entities($newdesc) if $newdesc; 463 #print $1 if $element =~ m!</b><br>\n(.*)!; 464 #print $newdesc; 465 $rerun = {} if $newtitle =~ s/\([Rr]\)//; 466 $normalsubs = 1 if $newtitle =~ s/\(t\)//; 467 $teletextsubs = 1 if $newtitle =~ s/\(ttv\)//; 468 #for ($newtitle) { s/^\s+//; s/\s+$//; } 469 470 #my ($eptotal, $ep, $epall); 471 for ($newtitle) { 472 if (s/\(([0-9]+)\)//) { 473 $eptotal = $1 - 1; 474 } 475 elsif (s/\(([0-9]+):([0-9]+)\)//) { 476 ($ep, $epall) = ($1 - 1, $2); 477 } 478 elsif (s/\(:([0-9]+)\)//) { 479 $eptotal = $1 - 1; 480 } 481 } 482 #s/\s+$//; 483 484 485 my $originaltitle = $1 486 if $newdesc && $newdesc =~ s/^\((.*)\) //; 487 for ($newtitle) { s/^\s+//; s/\s+$//; } 488 #$newdesc = " " if $newdesc eq ""; 489 #print "$newstart - $newstop - $newtitle - $newdesc - Day: $day\n"; 490 $newdesc = decode_entities($newdesc) if $newdesc; 491 if ($newdesc) { 492 for ($newdesc) { s/^\s+//; s/\s+$// } 493 } 494 $originaltitle = decode_entities($originaltitle) if $originaltitle; 495 $newstart =~ m/(\d.)\.(\d.)/ or die; 496 my $xmltvstart = utc_offset("$day$1${2}00", $TZ); 497 498 $newstop =~ m/(\d.)\.(\d.)/ or die; 499 my $xmltvstop = utc_offset("$day$1${2}00", $TZ); 500 501 my $newxmltvstop; 502 if (Date_Cmp($xmltvstart, $xmltvstop) > 0 ) { 503 $day = nextday($day); 504 $newstop =~ m/(\d.)\.(\d.)/ or die; 505 $xmltvstop = utc_offset("$day$1${2}00", $TZ); 506 } 507 my %prog = ( start => $xmltvstart, 508 stop => $xmltvstop, 509 channel => $ch_xmltv_id); 510 $prog{title} = [ [ $newtitle , $LANG ] , [ $originaltitle , "en" ] ] if $originaltitle; 511 $prog{title} = [ [ $newtitle , $LANG ] ] if not $originaltitle; 512 $prog{desc} = [ [ $newdesc , $LANG ] ] if $newdesc; 513 push @{$prog{'episode-num'}}, [ " . $ep/$epall . " , "xmltv_ns" ] if $ep; 514 push @{$prog{'episode-num'}}, [ $eptotal , "all-seasons" ] if $eptotal; 515 $prog{subtitles} = [ { type => 'teletext' } ] if $teletextsubs; 516 $prog{subtitles} = [ { type => 'onscreen' } ] if $normalsubs; 517 $prog{'previously-shown'} = $rerun if defined $rerun; 518 519 $writer->write_programme(\%prog); 520 ++$count; 521 } 522 } 523 524 525 return $count; 526} 527 528# get channel listing 529sub get_channels { 530 my $page = tidy(get_nice('http://fredag.dagbladet.no/tv/select_channels.html')); 531 ##my $page = tidy(get_nice('http://www.dagbladet.no/tv/mobil/index.html')); 532 my @chanarray = split(/<input/, $page); 533 my %channels; 534 foreach my $item (@chanarray) { 535 #print "\nAdding channel from itemtext:\n$item"; 536 #print "\nFound: $1 --- $2\n" if $item =~ /value=\"([0-9]+)\".*>.(.*)<br>/; 537 $channels{$1} = $2 if $item =~ /value=\"?([0-9]+)\"?.*\">(.*)<\/label>/; 538 } 539 return %channels; 540} 541 542sub getchandisplaynames( @ ) { 543 my %channels = get_channels(); 544 my %r; 545 foreach my $ch_did (@_) { $r{$ch_did} = $channels{$ch_did}; } 546 return %r; 547} 548 549# Bump a YYYYMMDD date by one. 550sub nextday { 551 my $d = shift; 552 my $p = parse_date($d); 553 my $n = DateCalc($p, '+ 1 day'); 554 return UnixDate($n, '%Q'); 555} 556