1#!/usr/local/bin/perl -w 2 3eval 'exec /usr/local/bin/perl -w -S $0 ${1+"$@"}' 4 if 0; # not running under some shell 5 6# $Id: tv_grab_dk_tvtid,v 1.3 2010/09/02 05:07:40 rmeden Exp $ 7 8=pod 9 10=head1 NAME 11 12tv_grab_dk_tvtid - Grab TV listings for Denmark. 13 14=head1 SYNOPSIS 15 16tv_grab_dk_tvtid --help 17 18tv_grab_dk_tvtid [--config-file FILE] --configure [--gui OPTION] 19 20tv_grab_dk_tvtid [--config-file FILE] [--output FILE] [--days N] 21[--offset N] [--quiet] 22 23tv_grab_dk_tvtid --capabilities 24 25tv_grab_dk_tvtid --version 26 27=head1 DESCRIPTION 28 29Output TV listings for several channels available in Denmark. The 30data comes from tvtid.tv2.dk. The grabber relies on parsing HTML so it 31might stop working at any time. 32 33First run B<tv_grab_dk_tvtid --configure> to choose, which channels you want 34to download. Then running B<tv_grab_dk_tvtid> with no arguments will output 35listings in XML format to standard output. 36 37B<--configure> Prompt for which channels, 38and write the configuration file. 39 40B<--config-file FILE> Set the name of the configuration file, the 41default is B<~/.xmltv/tv_grab_dk_tvtid.conf>. This is the file written by 42B<--configure> and read when grabbing. 43 44B<--gui OPTION> Use this option to enable a graphical interface to be used. 45OPTION may be 'Tk', or left blank for the best available choice. 46Additional allowed values of OPTION are 'Term' for normal terminal output 47(default) and 'TermNoProgressBar' to disable the use of Term::ProgressBar. 48 49B<--output FILE> Write to FILE rather than standard output. 50 51B<--days N> Grab N days. The default is one week. 52 53B<--offset N> Start N days in the future. The default is to start 54from today. 55 56B<--quiet> Suppress the progress messages normally written to standard 57error. 58 59B<--capabilities> Show which capabilities the grabber supports. For more 60information, see L<http://wiki.xmltv.org/index.php/XmltvCapabilities> 61 62B<--version> Show the version of the grabber. 63 64B<--help> Print a help message and exit. 65 66=head1 SEE ALSO 67 68L<xmltv(5)>. 69 70=head1 AUTHOR 71 72S�ren Pingel Dalsgaard (soren@dalsgaards.dk). Based on the dr grabber by 73S�ren Pingel Dalsgaard (soren@dalsgaards.dk). Based on the tv2 grabber by 74Jesper Skov (jskov@zoftcorp.dk). Originally based on tv_grab_nl by 75Guido Diepen and Ed Avis (ed@membled.com) and tv_grab_fi by Matti 76Airas. 77Additions by Jesper Toft (jesper@bzimage.dk) 78 79=head1 BUGS 80 81Things in the programme descriptions to handle: 82 83* Better categories from descriptions. 84 85* Customization of subtitles "Episode #" - perhaps even make it optional 86 87=cut 88 89###################################################################### 90# initializations 91 92use strict; 93use XMLTV::Version '$Id: tv_grab_dk_tvtid,v 1.3 2010/09/02 05:07:40 rmeden Exp $ '; 94use XMLTV::Capabilities qw/baseline manualconfig cache/; 95use XMLTV::Description 'Denmark'; 96use Getopt::Long; 97use HTML::TreeBuilder; 98use HTML::Entities; # parse entities 99use IO::File; 100use URI; 101use JSON; 102 103use Date::Manip; 104 105use XMLTV; 106use XMLTV::Memoize; 107use XMLTV::ProgressBar; 108use XMLTV::Ask; 109use XMLTV::Mode; 110use XMLTV::Config_file; 111use XMLTV::DST; 112use XMLTV::Date; 113# Todo: perhaps we should internationalize messages and docs? 114use XMLTV::Usage <<END 115$0: get Danish television listings in XMLTV format 116To configure: $0 --configure [--config-file FILE] 117To grab listings: $0 [--config-file FILE] [--output FILE] [--days N] 118[--offset N] [--quiet] 119To show capabilities: $0 --capabilities 120To show version: $0 --version 121END 122; 123 124# Use Log::TraceMessages if installed. 125BEGIN { 126 eval { require Log::TraceMessages }; 127 if ($@) { 128 *t = sub {}; 129 *d = sub { '' }; 130 } 131 else { 132 *t = \&Log::TraceMessages::t; 133 *d = \&Log::TraceMessages::d; 134 Log::TraceMessages::check_argv(); 135 } 136} 137 138use LWP::UserAgent; 139my $ua = LWP::UserAgent->new; 140$ua->agent("xmltv/$XMLTV::VERSION"); 141 142# Initialize cookie_jar 143use HTTP::Cookies; 144my $cookies = HTTP::Cookies->new; 145$ua->cookie_jar($cookies); 146 147# Whether zero-length programmes should be included in the output. 148my $WRITE_ZERO_LENGTH = 0; 149 150# default language 151my $LANG = 'da'; 152 153# Winter time in Denmark - summer time is one hour ahead of this. 154my $TZ = '+0100'; 155 156sub process_summary_page( $$$ ); 157sub process_listings_page( $$$$$ ); 158 159###################################################################### 160# get options 161 162# Known categories. 163my %tvtid_categories = ( 11854683 => 'B�rn og Unge', 164 11848684 => 'Dokumentar', 165 11825897 => 'Film', 166 11830626 => 'Livsstil', 167 11847662 => 'Musik', 168 11838192 => 'Natur og Milj�', 169 11840363 => 'Nyheder', 170 11870463 => 'Regional', 171 11831900 => 'Serier', 172 11792069 => 'Sport', 173 11837090 => 'Sundhed og Mad', 174 11844770 => 'Underholdning', 175 # The following are educated guesses 176 11839678 => 'Kultur', 177 11860240 => 'Shopping', 178 11860606 => 'Lotto', 179 11870299 => 'Religion', 180 11839487 => 'Videnskab', 181 11837327 => 'Alment', 182 11840285 => 'Undervisning', 183 ); 184 185my %categories = ( 11854683 => 'kids', 186 11848684 => 'documentary', 187 11825897 => 'movie', 188 11830626 => 'lifestyle', 189 11847662 => 'music', 190 11838192 => 'nature', 191 11840363 => 'news', 192 11870463 => 'local', 193 11831900 => 'series', 194 11792069 => 'sport', 195 11837090 => 'health', 196 11844770 => 'entertainment', 197 # The following are educated guesses 198 11839678 => 'culture', 199 11860240 => 'shopping', 200 11860606 => 'lotto', 201 11870299 => 'religion', 202 11839487 => 'science', 203 11837327 => 'misc', 204 11840285 => 'education' 205 ); 206 207my %movietypes = ( 'action' => 'action', 208 'drama' => 'drama', 209 'erotisk' => 'erotic', 210 'eventyr' => 'adventure', 211 'gyser' => 'horror', 212 'komedie' => 'comedy', 213 'krimi' => 'crime', 214 'thriller' => 'thriller', 215 'romantisk' => 'romance', 216 'western' => 'western' 217 ); 218# Get options 219XMLTV::Memoize::check_argv('get_url'); 220my ($opt_days, $opt_offset, $opt_help, $opt_output, 221 $opt_configure, $opt_config_file, $opt_gui, 222 $opt_quiet, $opt_list_channels); 223$opt_offset = 0; # default 224GetOptions('days=i' => \$opt_days, 225 'offset=i' => \$opt_offset, 226 'help' => \$opt_help, 227 'configure' => \$opt_configure, 228 'config-file=s' => \$opt_config_file, 229 'gui:s' => \$opt_gui, 230 'output=s' => \$opt_output, 231 'quiet' => \$opt_quiet, 232 'list-channels' => \$opt_list_channels, 233 ) 234 or usage(0); 235 236usage(1) if $opt_help; 237 238die 'number of days must not be negative' 239 if ((defined $opt_days && $opt_days < 0) || ($opt_offset < 0)); 240 241my $maxdays=7; 242 243die "tvtid.tv2.dk only provide information for today and the next $maxdays days." 244 if ($opt_offset > $maxdays); 245 246if (! defined $opt_days) { 247 # If there is no --days given. Set it to as many as possible. 248 $opt_days = $maxdays - $opt_offset; 249} else { 250 # --days option was given. Warn if its too high. 251 if (($opt_days + $opt_offset) > $maxdays) { 252 $opt_days = $maxdays - $opt_offset; 253 warn "tvtid.tv2.dk only provide information for today and the next $maxdays days (and not for all channels)."; 254 } 255} 256 257XMLTV::Ask::init($opt_gui); 258 259my $mode = XMLTV::Mode::mode('grab', # default 260 $opt_configure => 'configure', 261 $opt_list_channels => 'list-channels', 262 ); 263 264# File that stores which channels to download. 265my $config_file 266 = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_dk_tvtid', $opt_quiet); 267 268if ($mode eq 'configure') { 269 XMLTV::Config_file::check_no_overwrite($config_file); 270 open(CONF, ">$config_file") or die "cannot write to $config_file: $!"; 271 # find list of available channels 272 my $bar = new XMLTV::ProgressBar('getting list of channels', 1) 273 if not $opt_quiet; 274 my %channels = get_channels(); 275 die 'no channels could be found' if (scalar(keys(%channels)) == 0); 276 update $bar if not $opt_quiet; 277 $bar->finish() if not $opt_quiet; 278 279 # Ask about each channel. 280 my @chs = sort keys %channels; 281 my @names = map { $channels{$_} } @chs; 282 my @qs = map { "add channel $_?" } @names; 283 my @want = ask_many_boolean(1, @qs); 284 foreach (@chs) { 285 my $w = shift @want; 286 warn("cannot read input, stopping channel questions"), last 287 if not defined $w; 288 # No need to print to user - XMLTV::Ask is verbose enough. 289 290 # Print a config line, but comment it out if channel not wanted. 291 print CONF '#' if not $w; 292 my $name = shift @names; 293 print CONF "channel $_ $name\n"; 294 # TODO don't store display-name in config file. 295 } 296 297 close CONF or warn "cannot close $config_file: $!"; 298 say("Finished configuration."); 299 300 exit(); 301 } 302 303# Not configuring, we will need to write some output. 304die if $mode ne 'grab' and $mode ne 'list-channels'; 305 306# If we are grabbing, check we can read the config file before doing 307# anything else. 308# 309my @config_lines; 310if ($mode eq 'grab') { 311 @config_lines = XMLTV::Config_file::read_lines($config_file); 312} 313 314my %w_args; 315if (defined $opt_output) { 316 my $fh = new IO::File(">$opt_output"); 317 die "cannot write to $opt_output: $!" if not defined $fh; 318 $w_args{OUTPUT} = $fh; 319} 320$w_args{encoding} = 'ISO-8859-1'; 321my $writer = new XMLTV::Writer(%w_args); 322# TODO: standardize these things between grabbers. 323$writer->start 324 ({ 'source-info-url' => 'http://tvtid.tv2.dk/', 325 'source-data-url' => 'http://tvtid.tv2.dk/', 326 'generator-info-name' => 'XMLTV', 327 'generator-info-url' => 'http://xmltv.org/', 328 }); 329 330if ($opt_list_channels) { 331 my $bar = new XMLTV::ProgressBar('getting list of channels', 1) 332 if not $opt_quiet; 333 my %channels = get_channels(); 334 die 'no channels could be found' if (scalar(keys(%channels)) == 0); 335 update $bar if not $opt_quiet; 336 337 foreach my $ch_did (sort(keys %channels)) { 338 my $ch_name = $channels{$ch_did}; 339 $writer->write_channel({ id => $ch_did, 340 'display-name' => [ [ $ch_name ] ], 341 'icon' => [{'src' => get_icon($ch_did)}] 342 }); 343 } 344 $bar->finish() if not $opt_quiet; 345 $writer->end(); 346 exit(); 347} 348 349# Not configuring or writing channels, must be grabbing listings. 350die if $mode ne 'grab'; 351my (%channels, @channels, $ch_did, $ch_name); 352my (%convert, $orig_ch, $new_ch); 353my $line_num = 1; 354foreach (@config_lines) { 355 ++ $line_num; 356 next if not defined; 357 358 # FIXME channel data should be read from the site, and then the 359 # config file only gives the XMLTV ids that are interesting. 360 # 361 if (/^channel:?\s+(\S+)\s+([^\#]+)/) { 362 ($ch_did, $ch_name) = ($1, $2); 363 $ch_name =~ s/\s*$//; 364 push @channels, $ch_did; 365 $channels{$ch_did} = $ch_name; 366 } elsif (/^convert:?\s+(\S+)\s+([^\#]+)/) { 367 ($orig_ch, $new_ch) = ($1, $2); 368 $new_ch =~ s/\s*$//; 369 $convert{$orig_ch} = $new_ch; 370 } else { 371 warn "$config_file:$.: bad line\n"; 372 } 373} 374 375 376###################################################################### 377# subroutine definitions 378 379# get channel listing 380sub get_channels { 381 my %channels; 382 383 my $sec_per_day = 24*60*60; 384 my $eight_hours = 8*60*60; # Off by two for some reason 385 my $now8 = int(time()/$sec_per_day)*$sec_per_day+$eight_hours; 386 387 use JSON; 388 my $url = 'http://tvtid.tv2.dk/allekanaler/get.php/day-'.$now8.'.html'; 389 my $json_t = get_url($url); 390 #print $url."\n\n".$json_t."\n\n"; 391 my $json = new JSON(autoconv => 0, pretty => 1, indent => 2, utf8 => 1); 392 my $js_ref = $json->jsonToObj($json_t); 393 my @js = @$js_ref; 394 395 foreach my $elm (@js) 396 { 397 my $id = $elm->{logo}; 398 $id =~ s,/img/logos/logo-,,; 399 $id =~ s,\.gif,,; 400 $channels{$id} = $elm->{name}; 401 } 402 return %channels; 403} 404 405# Clean up bad characters in the site's HTML. 406my $warned_bad_chars; 407sub tidy( $ ) { 408 for (my $tmp = shift) { 409 tr/\222/''/; 410 tr/\011/ /; # tabs are allowed - turn them into spaces 411 if (tr/\012\015\040-\176\240-\377//dc) { 412 warn 'removing bad characters' unless ($warned_bad_chars++ || $opt_quiet); 413 } 414 return $_; 415 } 416} 417 418my $fetched; 419sub get_url( $ ) { 420 sleep rand(5) if defined $fetched; 421 $fetched = 1; 422 my $c = tidy($ua->get(shift)->content); 423 return $c; 424} 425 426# Bump a YYYYMMDD date by one. 427sub correct_day { 428 my $d = shift; 429 my $h = shift; 430 431 $d = UnixDate(DateCalc($d, "+ 1 day"), '%Q') if UnixDate($h, '%H') < 6; 432 433 my ($base, $tz) = @{date_to_local(parse_local_date("$d " . $h, $TZ), $TZ)}; 434 435 return UnixDate($base, '%q') . " $tz"; 436} 437 438# Icon URL for a given channel. 439sub get_icon { 440 my ($url) = @_; 441 return "http://tvtid.tv2.dk/img/logos/logo-" . $url . ".gif"; 442} 443 444# Split list of people into array 445sub get_people { 446 my $p = shift; 447 my $people; 448 $p =~ s/ m\.fl.+//; 449 @$people = split(/, | og |\//, $p); 450 s/.+:// foreach @$people; 451 s/^\s+// foreach @$people; 452 s/\s+$// foreach @$people; 453 s/[.]$// foreach @$people; 454 return $people; 455} 456 457###################################################################### 458# begin main program 459 460my $sec_per_day = 24*60*60; 461my $eight_hours = 8*60*60; # Off by two for some reason 462my $now8 = int(time()/$sec_per_day)*$sec_per_day+$eight_hours; 463 464Date_Init('TZ=utc'); 465 466foreach $ch_did (@channels) { 467 $ch_name = $channels{$ch_did}; 468 $writer->write_channel({ id => $ch_did, 469 'display-name' => [ [ $ch_name ] ], 470 'icon' => [{'src' => get_icon($ch_did)}] 471 }) unless $convert{$ch_did}; 472} 473 474for (my $i = $opt_offset;$i<($opt_offset + $opt_days);$i++) { 475 476 my $sec_per_day = 24*60*60; 477 my $eight_hours = 8*60*60; # Off by two for some reason 478 my $day8 = int(time()/$sec_per_day+$i)*$sec_per_day+$eight_hours; 479 480 481 my $day = UnixDate(DateCalc(parse_date('now'), "+ $i days"), '%Q'); 482 483 use JSON; 484 my $url = 'http://tvtid.tv2.dk/allekanaler/get.php/day-'.$day8.'.html'; 485 my $json_t = get_url($url); 486 487 my $json = new JSON(autoconv => 0, pretty => 1, indent => 2, utf8 => 1); 488 my $js_ref = $json->jsonToObj($json_t); 489 my @js = @$js_ref; 490 491 foreach my $elm (@js) 492 { 493 my $id = $elm->{logo}; 494 $id =~ s,/img/logos/logo-,,; 495 $id =~ s,\.gif,,; 496 if (defined $channels{$id}) { 497 498 my $programs_ref = $elm->{programs}; 499 my @programs = @$programs_ref; 500 for my $program (@programs) { 501 # If 'overlap=1' the program is present on the 502 # previous day as well, so skip it 503 next if defined $program->{overlap}; 504 505 #print $json->objToJson($program)."\n"; 506 507 my %prog = (); 508 if ($convert{$id}) { 509 $prog{channel} = $convert{$id}; 510 } else { 511 $prog{channel} = $id; 512 } 513 $prog{start} = correct_day($day, $program->{start}); 514 $prog{stop} = correct_day($day, $program->{end}); 515 $prog{category} = [ [ $categories{$program->{cat}} ] ]; 516 517 my $program_url="http://tvtid.tv2.dk/program/index.php/id-".$program->{'id'}.".html"; 518 my $contents = get_url($program_url); 519 520 my $aspect; 521 $aspect = '4:3' if ($contents =~ /pictureFormat43 enabled/); 522 $aspect = '16:9' if ($contents =~ /pictureFormat169 enabled/); 523 my $rerun; 524 $rerun = {} if ($contents =~ /rerun enabled/); 525 my $sound = 'stereo'; 526 $sound = 'surround' if ($contents =~ /surround enabled/); 527 my $teletext; 528 $teletext = {} if ($contents =~ /teletext enabled/); 529 my $subtitles; 530 $subtitles = {} if ($contents =~ /subtitles enabled/); 531 my $colour = 1; 532 $colour = 0 if ($contents =~ /blackwhite enabled/); 533 #if ($contents =~ /subtitlesHearingImpaired enabled/) { print "TTH "; } 534 535 my $descr; 536 my $with; 537 my $actors; 538 my $writers; 539 my $adapters; 540 my $presenters; 541 # Get program information. Keep "<" at the end intentionally. 542 if ($contents =~ /<div class="longinfo">(.+?<)\/div>/) { 543 $descr = $1; 544 545 $descr =~ s/\<p\>Sendt f�rste gang .*?\</</; # Remove 546 $descr =~ s/\<p\>Sendes ogs� .*?\</</; # Remove 547 548 $descr =~ s/\<p\>/ /g; # Remove <p> 549 $descr =~ s/\<\/p\>/ /g; # Remove </p> 550 551 if ($descr =~ /(.*)\<h2 class="programListHeader"\>Medvirkende:\<\/h2\>(.*?)(\<.*)/) { 552 ($descr, $actors) = ($1 . $3, get_people($2)); 553 } 554 555 if ($descr =~ /(.*)\<h2 class="programListHeader"\>Instrukt�r:\<\/h2\>(.*)(\<.*)/) { 556 ($descr, $writers) = ($1 . $3, get_people($2)); 557 } 558 559 if ($descr =~ /(.*)\<BR\>Tilrettel�ggelse: (.+?)(\<.*)/) { 560 ($descr, $adapters) = ($1 . $3, get_people($2)); 561 } 562 563 if ($descr =~ /(.*)V�rt: (.+?)\.(.*)/) { 564 ($descr, $presenters) = ($1 . $3, get_people($2)); 565 } 566 567 #if ($descr =~ /<BR>(.+)/) { 568 #print "***** $1 *****\n"; 569 #} 570 571 # Clean up $descr: 572 $descr =~ s/\<BR\>/ /g; # Remove <BR> 573 $descr =~ s/<$//g; # Remove < at end 574 $descr =~ s/ \s+/ /g; # Remove double spaces 575 $descr =~ s/ +$//g; # rtrim 576 $descr =~ s/^ +//g; # ltrim 577 578 } 579 580 my $original; 581 if ($contents =~ /\<h2 class="originalTitle"\>Originaltitel: (.+?)\<\/h2\>/) { 582 $original = $1; 583 } 584 585 my $episode; 586 if ($contents =~ /\<div class="episode"\>Episode: \((.+?)\)\<\/div\>/) { 587 $episode = $1; 588 $episode =~ s/:/\//; 589 } 590 591 my @titles = ([ $program->{title}, $LANG ]); 592 push @titles, [ $original ] if defined $original; 593 $prog{title} = \@titles; 594 $prog{desc} = ([ [ $descr, $LANG ] ]) if defined $descr && $descr; 595 my %v = ( present => 1, 596 colour => $colour ); 597 $v{aspect} = $aspect if defined $aspect; 598 $prog{video} = \%v; 599 $prog{audio} = { present => 1, 600 stereo => $sound }; 601 my $subtitle; 602 $subtitle = $program->{description} if $program->{description}; 603 if (defined $episode) { 604 if (defined $subtitle) { 605 $subtitle .= "." unless $subtitle =~ /\.$/; 606 $subtitle .= " Episode " . $episode . "."; 607 } else { 608 $subtitle = "Episode " . $episode . "."; 609 } 610 } 611 $prog{'sub-title'} = [ [ $subtitle, $LANG ] ] if defined $subtitle; 612 if (defined $subtitle && $subtitle =~ / fra (\d{4})/) { 613 $prog{date} = $1; 614 } 615 $prog{subtitles} = [ { type => 'teletext' } ] if defined $teletext; 616 $prog{subtitles} = [ { type => 'onscreen' } ] if defined $subtitles; 617 $prog{'episode-num'} = [ [ $episode, 'onscreen' ] ] if defined $episode; 618 $prog{'previously-shown'} = $rerun if defined $rerun; 619 my %c; 620 $c{actor} = $actors if defined $actors; 621 $c{writer} = $writers if defined $writers; 622 $c{adapter} = $adapters if defined $adapters; 623 $c{presenter} = $presenters if defined $presenters; 624 $prog{credits} = \%c if %c; 625 626 #print $json->objToJson(\%prog) . "\n"; 627 628 $writer->write_programme(\%prog); 629 } 630 } 631 } 632} 633$writer->end(); 634exit(0); 635