#!/usr/local/bin/perl -w =pod =head1 NAME tv_grab_no - Grab TV listings for Norway. =head1 SYNOPSIS tv_grab_no --help tv_grab_no [--config-file FILE] --configure [--gui OPTION] tv_grab_no [--config-file FILE] [--output FILE] [--days N] [--offset N] [--quiet] =head1 DESCRIPTION Output TV listings for several channels available in Norway. The data comes from fredag.dagbladet.no. The grabber relies on parsing HTML so it might stop working at any time. First run B to choose, which channels you want to download. Then running B with no arguments will output listings in XML format to standard output. B<--configure> Prompt for which channels, and write the configuration file. B<--config-file FILE> Set the name of the configuration file, the default is B<~/.xmltv/tv_grab_no.conf>. This is the file written by B<--configure> and read when grabbing. B<--gui OPTION> Use this option to enable a graphical interface to be used. OPTION may be 'Tk', or left blank for the best available choice. Additional allowed values of OPTION are 'Term' for normal terminal output (default) and 'TermNoProgressBar' to disable the use of Term::ProgressBar. B<--output FILE> Write to FILE rather than standard output. B<--days N> Grab N days. The default is as many as the source carries. B<--offset N> Start N days in the future. The default is to start from today. B<--quiet> Suppress the progress messages normally written to standard error. B<--version> Show the version of the grabber. B<--help> Print a help message and exit. =head1 SEE ALSO L. =head1 AUTHOR Christian Wattengaard (christian@wattengard.com). Heavily based on tv_grab_dk by Jesper Skov (jskov@zoftcorp.dk). tv_grab_dk originally based on tv_grab_nl by Guido Diepen and Ed Avis (ed@membled.com) and tv_grab_fi by Matti Airas. =head1 BUGS First release. Not aware of any bugs yet. But I'm sure there are some. =cut ###################################################################### # initializations use strict; use XMLTV::Version '$Id: tv_grab_no,v 1.23 2010/09/02 05:07:40 rmeden Exp $ '; use XMLTV::Capabilities qw/baseline manualconfig cache/; use XMLTV::Description 'Norway'; use Getopt::Long; use HTML::Entities; # parse entities use IO::File; use URI; #use HTML::TableExtract; use Date::Manip; #use HTML::LinkExtractor; #use WWW::Mechanize; use XMLTV; use XMLTV::Memoize; use XMLTV::ProgressBar; use XMLTV::Ask; use XMLTV::Mode; use XMLTV::Config_file; use XMLTV::DST; use XMLTV::Get_nice; use XMLTV::Date; # Todo: perhaps we should internationalize messages and docs? use XMLTV::Usage < 'nrk1.nrk.no', 3 => 'tv2.no', 4 => 'tv3.no', 5 => 'tvnorge.no', 2 => 'nrk2.nrk.no', 63 => 'ztv.no', 37 => 'tv4.se', 68 => '3plus.dk', 12 => 'dr1.dr.dk', 58 => 'dr2.dr.dk', 6 => 'svt1.svt.se', 7 => 'svt2.svt.se', 47 => 'kanal5.se', 13 => 'dr2-2.dr.dk', 35 => 'tv3.dk', 36 => 'tv3.se', 28 => 'tv6.no', 30 => 'ztv.se', 59 => 'no.hallmarkchannel.tv', 31 => 'turnerclassicmovies.com', 39 => 'canalplus.canalplus.no', 69 => 'film1.canalplus.no', 70 => 'film2.canalplus.no', 42 => 'cinema.tv1000.no', 71 => 'cmore1.canalplus.no', 34 => 'tv1000.no', 79 => 'classic.tv1000.no', 76 => 'nordic.tv1000.no', 78 => 'action.tv1000.no', 80 => 'family.tv1000.no', 41 => 'cartoonnetworkhq.net', 20 => 'nickelodeon.se', 67 => 'disneychannel.tv', 16 => 'foxkids.no', 32 => 'tcmcn.no', 15 => 'eurosport.no', 75 => 'xtra.tv2.no', 72 => 'sport.canalplus.no', 55 => 'sportn.no', 73 => 'sport2.viasat.no', 74 => 'sport3.viasat.no', 49 => 'extreme.com', 11 => 'dsf.de', 17 => 'mtve.com', 29 => 'vh1e.com', 53 => 'traveladventure.discoverychannel.com', 33 => 'travelchannel.com', 66 => 'explorer.viasat.no', 44 => 'discoverychannel.com', 10 => 'animalplanet.discoverychannel.com', 51 => 'civilisation.discoverychannel.com', 52 => 'scitrek.discoverychannel.com', 18 => 'cnbcng.no', 61 => 'realitytv.co.uk', 46 => 'tv8.se', 38 => 'world.bbc.co.uk', 62 => 'cnbc.com', 25 => 'news.sky.co.uk', 26 => 'skynewsng.no', 43 => 'cnn.com', 14 => 'dw.de', 9 => '3sat.de', 22 => '1.rtl.de', 23 => '2.rtl.de', 27 => 'tv5.fr', 21 => 'pro7.de', 24 => 'sat1.de', 19 => 'prime.bbc.co.uk', 50 => 'club.no', 77 => 'star.nonstop.tv', 88 => 'sport2.canalplus.no', 87 => 'mix.canalplus.no', 86 => '2.eurosport.no', 85 => 'toon.disneychannel.tv', 84 => 'thevoice.no', 81 => 'ngcnorge.com', 82 => 'jetix.no', 83 => 'showtime.nonstop.tv', 89 => 'playhouse.disneychannel.tv', 90 => 'tv6.viasat.no', 91 => 'hd.canalplus.no', 92 => 'film3.canalplus.no', 93 => 'film.tv2.no' ); sub process_summary_page( $$$ ); sub process_listings_page( $$$$ ); ###################################################################### # get options # Get options XMLTV::Memoize::check_argv('XMLTV::Get_nice::get_nice_aux'); my ($opt_days, $opt_offset, $opt_help, $opt_output, $opt_configure, $opt_config_file, $opt_gui, $opt_quiet, $opt_list_channels); $opt_days = 4; # default $opt_offset = 0; # default GetOptions('days=i' => \$opt_days, 'offset=i' => \$opt_offset, 'help' => \$opt_help, 'configure' => \$opt_configure, 'config-file=s' => \$opt_config_file, 'gui:s' => \$opt_gui, 'output=s' => \$opt_output, 'quiet' => \$opt_quiet, 'list-channels' => \$opt_list_channels, ) or usage(0); die 'number of days must not be negative' if (defined $opt_days && $opt_days < 0); usage(1) if $opt_help; XMLTV::Ask::init($opt_gui); if ($opt_days > 7) { print "WARNING: This grabber can only grab 7 days ahead!\n-------: I will grab 7 days and then quit.\n"; $opt_days = 7; } my $mode = XMLTV::Mode::mode('grab', # default $opt_configure => 'configure', $opt_list_channels => 'list-channels', ); # File that stores which channels to download. my $config_file = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_no', $opt_quiet); if ($mode eq 'configure') { XMLTV::Config_file::check_no_overwrite($config_file); open(CONF, ">$config_file") or die "cannot write to $config_file: $!"; # find list of available channels my $bar = new XMLTV::ProgressBar('getting list of channels', 1) if not $opt_quiet; my %channels = get_channels(); die 'no channels could be found' if (scalar(keys(%channels)) == 0); update $bar if not $opt_quiet; $bar->finish() if not $opt_quiet; # Ask about each channel. my @chs = sort keys %channels; my @names = map { $channels{$_} } @chs; my @qs = map { "add channel $_?" } @names; my @want = ask_many_boolean(1, @qs); foreach (@chs) { my $w = shift @want; warn("cannot read input, stopping channel questions"), last if not defined $w; # No need to print to user - XMLTV::Ask is verbose enough. # Print a config line, but comment it out if channel not wanted. print CONF '#' if not $w; my $name = shift @names; print CONF "channel $_\n"; # TODO don't store display-name in config file. } close CONF or warn "cannot close $config_file: $!"; say("Finished configuration."); exit(); } # Not configuring, we will need to write some output. die if $mode ne 'grab' and $mode ne 'list-channels'; # If we are grabbing, check we can read the config file before doing # anything else. # my @config_lines; if ($mode eq 'grab') { @config_lines = XMLTV::Config_file::read_lines($config_file); } my %w_args; if (defined $opt_output) { my $fh = new IO::File(">$opt_output"); die "cannot write to $opt_output: $!" if not defined $fh; $w_args{OUTPUT} = $fh; } $w_args{encoding} = 'ISO-8859-1'; my $writer = new XMLTV::Writer(%w_args); # TODO: standardize these things between grabbers. $writer->start ({ 'source-info-url' => 'http://fredag.dagbladet.no/tv/', 'source-data-url' => 'http://fredag.dagbladet.no/tv/', 'generator-info-name' => 'XMLTV', 'generator-info-url' => 'http://xmltv.org/', }); if ($opt_list_channels) { my $bar = new XMLTV::ProgressBar('getting list of channels', 1) if not $opt_quiet; my %channels = get_channels(); die 'no channels could be found' if (scalar(keys(%channels)) == 0); update $bar if not $opt_quiet; $bar->finish() if not $opt_quiet; foreach my $ch_did (sort(keys %channels)) { my $ch_name = $channels{$ch_did}; #my $ch_xid = $ch_did; my $ch_xid = $xmltvid{$ch_did}; $writer->write_channel({ id => $ch_xid, 'display-name' => [ [ $ch_name ] ] }); } $writer->end(); exit(); } # Not configuring or writing channels, must be grabbing listings. die if $mode ne 'grab'; my (%channels, @channels, $ch_did, $ch_name); my $line_num = 1; foreach (@config_lines) { ++ $line_num; next if not defined; if (/^channel ([0-9]+)/) { push @channels, $1; } else { warn "$config_file:$.: bad line\n"; } } my $configbar = new XMLTV::ProgressBar('fetching channel names', 1) if not $opt_quiet; %channels = getchandisplaynames(@channels); update $configbar if not $opt_quiet; $configbar->finish() if not $opt_quiet; ###################################################################### # begin main program my $now = parse_date('now'); die if not defined $now; my @to_get; # the order in which we fetch the channels matters my $today = UnixDate($now, '%Y-%m-%d'); die if not defined $today; foreach $ch_did (@channels) { $ch_name = $channels{$ch_did}; my $ch_xid = $xmltvid{$ch_did}; $writer->write_channel({ id => $ch_xid, 'display-name' => [ [ $ch_name ] ] }); } my %warned_ch_name; # suppress duplicate warnings my $bar = new XMLTV::ProgressBar('fetching data', @channels * $opt_days) if not $opt_quiet; foreach my $d (0 .. $opt_days - 1) { my $i = $opt_offset + $d; my $day = UnixDate(DateCalc($today, "+ $i days"), '%Y-%m-%d'); t "turned offset $i (from $today) into date $day"; my $num_this_day = 0; foreach $ch_did (@channels) { my $ch_xid = $xmltvid{$ch_did}; # Request day when constructing URL since it is represented as # an integere offset from today. Still pass in the computed # date - may need it sometime... #my $url = 'http://fredag.dagbladet.no/tv/index.html' # . "?kanal_id=$ch_did&dag=$i&fra_tid=0&til_tid=24"; my $url = 'http://www.dagbladet.no/mobil/tv/' . "liste.html?kanal_id=$ch_did&dag=$i&fra_tid=0&til_tid=24"; push @to_get, [ $url, $day, $ch_xid, $ch_did ]; my $got = process_listings_page($writer, $ch_xid, $url, $day); warn "no listings for channel $ch_xid on day $d, $url\n" if $got == 0; $num_this_day += $got; update $bar if not $opt_quiet; } if ($num_this_day == 0) { die "could not get any listings\n" if $d == 0; warn "could get only one day of listings, not $opt_days\n" if $d == 1; warn "could get only $d days of listings, not $opt_days\n" if $d > 1; last; } } $bar->finish() if not $opt_quiet; $writer->end(); ###################################################################### # subroutine definitions # Remove bad characters. my $warned_bad_chars; sub tidy ( $ ) { my $s = shift; $s =~ tr/\t\031/ /d; if ($s =~ s/([^\012\015\040-\176\240-\377]+)//g) { warn "removing bad characters: '$1'" unless $warned_bad_chars++; } return $s; } # arguments: # XMLTV::Writer object to write to # XMLTV id of channel # URL to fetch # Date::Manip object giving day for programmes in page (at least # until they cross midnight) # # Returns number of programmes written. # my ($warned_discarding_parts, $commented_episode_num); sub process_listings_page ( $$$$ ){ # local $Log::TraceMessages::On = 1; my ($writer, $ch_xmltv_id, $url, $day_obj) = @_; my $count = 0; my $next_day = 0; my $day = UnixDate($day_obj, '%Q'); t "getting channel $ch_xmltv_id, date $day"; my $ts; my $row; my $listingspage = tidy(get_nice($url)); #my $tabex = new HTML::TableExtract( depth => 1, count => 1 , keep_html => 1 ); #$tabex->parse($listingspage); #my ($newstart, $newstop, $newtitle, $newdesc, $rerun, $normalsubs, $teletextsubs, $eptotal, $ep, $epall); my @elements = split(//, $listingspage); foreach my $element (@elements) { #print "\n$element"; #print $1 if $element =~ m!
\n(.*)!; my ($newstart, $newstop, $newtitle, $newdesc, $rerun, $normalsubs, $teletextsubs, $eptotal, $ep, $epall); if ($element =~ m/[0-9][0-9]\.[0-9][0-9] - [0-9][0-9]\.[0-9][0-9]/) { #print "---\n$element\n---"; ($newstart, $newstop) = ($1, $2) if $element =~ m/([0-9][0-9]\.[0-9][0-9]) - ([0-9][0-9]\.[0-9][0-9])/; $newtitle = $1 if $element =~ m!(.*)!; $newdesc = $1 if $element =~ m!
\n(.*)!; #$newdesc = encode_entities($newdesc) if $newdesc; #print $1 if $element =~ m!
\n(.*)!; #print $newdesc; $rerun = {} if $newtitle =~ s/\([Rr]\)//; $normalsubs = 1 if $newtitle =~ s/\(t\)//; $teletextsubs = 1 if $newtitle =~ s/\(ttv\)//; #for ($newtitle) { s/^\s+//; s/\s+$//; } #my ($eptotal, $ep, $epall); for ($newtitle) { if (s/\(([0-9]+)\)//) { $eptotal = $1 - 1; } elsif (s/\(([0-9]+):([0-9]+)\)//) { ($ep, $epall) = ($1 - 1, $2); } elsif (s/\(:([0-9]+)\)//) { $eptotal = $1 - 1; } } #s/\s+$//; my $originaltitle = $1 if $newdesc && $newdesc =~ s/^\((.*)\) //; for ($newtitle) { s/^\s+//; s/\s+$//; } #$newdesc = " " if $newdesc eq ""; #print "$newstart - $newstop - $newtitle - $newdesc - Day: $day\n"; $newdesc = decode_entities($newdesc) if $newdesc; if ($newdesc) { for ($newdesc) { s/^\s+//; s/\s+$// } } $originaltitle = decode_entities($originaltitle) if $originaltitle; $newstart =~ m/(\d.)\.(\d.)/ or die; my $xmltvstart = utc_offset("$day$1${2}00", $TZ); $newstop =~ m/(\d.)\.(\d.)/ or die; my $xmltvstop = utc_offset("$day$1${2}00", $TZ); my $newxmltvstop; if (Date_Cmp($xmltvstart, $xmltvstop) > 0 ) { $day = nextday($day); $newstop =~ m/(\d.)\.(\d.)/ or die; $xmltvstop = utc_offset("$day$1${2}00", $TZ); } my %prog = ( start => $xmltvstart, stop => $xmltvstop, channel => $ch_xmltv_id); $prog{title} = [ [ $newtitle , $LANG ] , [ $originaltitle , "en" ] ] if $originaltitle; $prog{title} = [ [ $newtitle , $LANG ] ] if not $originaltitle; $prog{desc} = [ [ $newdesc , $LANG ] ] if $newdesc; push @{$prog{'episode-num'}}, [ " . $ep/$epall . " , "xmltv_ns" ] if $ep; push @{$prog{'episode-num'}}, [ $eptotal , "all-seasons" ] if $eptotal; $prog{subtitles} = [ { type => 'teletext' } ] if $teletextsubs; $prog{subtitles} = [ { type => 'onscreen' } ] if $normalsubs; $prog{'previously-shown'} = $rerun if defined $rerun; $writer->write_programme(\%prog); ++$count; } } return $count; } # get channel listing sub get_channels { my $page = tidy(get_nice('http://fredag.dagbladet.no/tv/select_channels.html')); ##my $page = tidy(get_nice('http://www.dagbladet.no/tv/mobil/index.html')); my @chanarray = split(/.(.*)
/; $channels{$1} = $2 if $item =~ /value=\"?([0-9]+)\"?.*\">(.*)<\/label>/; } return %channels; } sub getchandisplaynames( @ ) { my %channels = get_channels(); my %r; foreach my $ch_did (@_) { $r{$ch_did} = $channels{$ch_did}; } return %r; } # Bump a YYYYMMDD date by one. sub nextday { my $d = shift; my $p = parse_date($d); my $n = DateCalc($p, '+ 1 day'); return UnixDate($n, '%Q'); }