1#!/usr/local/bin/perl 2 3=pod 4 5=head1 NAME 6 7tv_grab_ch_search - Grab TV listings for Switzerland (from tv.search.ch webpage). 8 9=head1 SYNOPSIS 10 11tv_grab_ch_search --help 12 13tv_grab_ch_search [--config-file FILE] --configure [--gui OPTION] 14 15tv_grab_ch_search [--config-file FILE] [--output FILE] [--quiet] 16 [--days N] [--offset N] 17 18tv_grab_ch_search --list-channels 19 20tv_grab_ch_search --capabilities 21 22tv_grab_ch_search --version 23 24=head1 DESCRIPTION 25 26Output TV listings for several channels available in Switzerland and 27(partly) central Europe. 28The data comes from tv.search.ch. The grabber relies on 29parsing HTML so it might stop working at any time. 30 31First run B<tv_grab_ch_search --configure> to choose, which channels 32you want to download. Then running B<tv_grab_ch_search> with no 33arguments will output listings in XML format to standard output. 34 35B<--configure> Ask for each available channel whether to download 36and write the configuration file. 37 38B<--config-file FILE> Set the name of the configuration file, the 39default is B<~/.xmltv/tv_grab_ch_search.conf>. This is the file 40written by B<--configure> and read when grabbing. 41 42B<--gui OPTION> Use this option to enable a graphical interface to be used. 43OPTION may be 'Tk', or left blank for the best available choice. 44Additional allowed values of OPTION are 'Term' for normal terminal output 45(default) and 'TermNoProgressBar' to disable the use of Term::ProgressBar. 46 47B<--output FILE> Write to FILE rather than standard output. 48 49B<--days N> Grab N days. The default is fourteen. 50 51B<--offset N> Start N days in the future. The default is to start 52from now on (= zero). 53 54B<--quiet> Suppress the progress messages normally written to standard 55error. 56 57B<--list-channels> Write output giving <channel> elements for every 58channel available (ignoring the config file), but no programmes. 59 60B<--capabilities> Show which capabilities the grabber supports. For more 61information, see L<http://wiki.xmltv.org/index.php/XmltvCapabilities> 62 63B<--version> Show the version of the grabber. 64 65B<--help> print a help message and exit. 66 67 68=head1 SEE ALSO 69 70L<xmltv(5)>. 71 72=head1 AUTHOR 73 74Daniel Bittel <betlit@gmx.net>. Inspired by tv_grab_ch by Stefan Siegl. 75Patric Mueller <bhaak@gmx.net>. 76 77=head1 BUGS 78 79If you happen to find a bug, you're requested to send a mail to one of the 80XMLTV mailing lists, see webpages at http://sourceforge.net/projects/xmltv/. 81 82=cut 83 84use warnings; 85use strict; 86use Encode; 87use DateTime; 88use LWP::Simple; 89use HTTP::Cookies; 90use XMLTV::Version '$Id: tv_grab_ch_search.in,v 1.22 2016/07/15 10:23:24 bhaak Exp $ '; 91use XMLTV::Capabilities qw/baseline manualconfig cache/; 92use XMLTV::Description 'Switzerland (tv.search.ch)'; 93use XMLTV::Supplement qw/GetSupplement/; 94use Getopt::Long; 95use HTML::TreeBuilder; 96use HTML::Entities; 97use URI::Escape; 98use URI::URL; 99use XMLTV; 100use XMLTV::Ask; 101use XMLTV::ProgressBar; 102use XMLTV::DST; 103use XMLTV::Config_file; 104use XMLTV::Mode; 105use XMLTV::Get_nice; 106use XMLTV::Memoize; 107use XMLTV::Usage<<END 108$0: get Swiss television listings from tv.search.ch in XMLTV format 109To configure: $0 --configure [--config-file FILE] [--gui OPTION] 110To grab data: $0 [--config-file FILE] [--output FILE] [--quiet] 111 [--days N] [--offset N] 112Channel List: $0 --list-channels 113To show capabilities: $0 --capabilities 114To show version: $0 --version 115 116END 117 ; 118 119# Use Log::TraceMessages if installed. 120BEGIN { 121 eval { require Log::TraceMessages }; 122 if ($@) { 123 *t = sub {}; 124 *d = sub { '' }; 125 } 126 else { 127 *t = \&Log::TraceMessages::t; 128 *d = \&Log::TraceMessages::d; 129 } 130} 131 132 133 134## our own prototypes first ... 135sub get_channels(); 136sub channel_id($); 137sub get_page($); 138sub grab_channel($); 139 140## attributes of xmltv root element 141my $head = { 142 'source-data-url' => 'https://tv.search.ch/channels', 143 'source-info-url' => 'https://tv.search.ch/', 144 'generator-info-name' => 'XMLTV', 145 'generator-info-url' => 'http://xmltv.org/', 146}; 147 148 149 150## the timezone tv.search.ch lives in is, CET/CEST 151my constant $TZ = "+0100"; 152my constant $lang = "de"; 153 154 155 156## Parse argv now. First do undocumented --cache option. 157XMLTV::Memoize::check_argv('XMLTV::Get_nice::get_nice_aux'); 158 159 160 161my $opt_configure; 162my $opt_config_file; 163my $opt_gui; 164my $opt_output; 165my $opt_days = 14; 166my $opt_offset = 0; 167my $opt_quiet = 0; 168my $opt_slow = 0; 169my $opt_list_channels; 170my $opt_help; 171 172GetOptions( 173 'configure' => \$opt_configure, 174 'config-file=s' => \$opt_config_file, 175 'gui:s' => \$opt_gui, 176 'output=s' => \$opt_output, 177 'days=i' => \$opt_days, 178 'offset=i' => \$opt_offset, 179 'quiet' => \$opt_quiet, 180 'slow' => \$opt_slow, 181 'list-channels' => \$opt_list_channels, 182 'help' => \$opt_help, 183) or usage(0); 184 185usage(1) if $opt_help; 186 187XMLTV::Ask::init($opt_gui); 188 189## make sure offset+days arguments are within range 190die "neither offset nor days may be negative" 191 if($opt_offset < 0 || $opt_days < 0); 192 193 194## calculate global start/stop times ... 195my $grab_start = DateTime->now(time_zone => 'Europe/Zurich')->add( days => $opt_offset ); 196my $grab_stop = DateTime->now(time_zone => 'Europe/Zurich')->add ( days => $opt_offset + $opt_days ); 197 198my $mode = XMLTV::Mode::mode('grab', # default value 199 $opt_configure => 'configure', 200 $opt_list_channels => 'list-channels', 201); 202 203 204 205## initialize config file support 206my $config_file = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_ch_search', $opt_quiet); 207my @config_lines; 208 209if($mode eq 'configure') { 210 XMLTV::Config_file::check_no_overwrite($config_file); 211} 212elsif($mode eq 'grab' || $mode eq 'list-channels') { 213 @config_lines = XMLTV::Config_file::read_lines($config_file); 214} 215else { die("never heard of XMLTV mode $mode, sorry :-(") } 216 217 218 219## hey, we can't live without channel data, so let's get those now! 220my $bar = new XMLTV::ProgressBar( 'getting list of channels', 1 ) 221 if not $opt_quiet; 222 223my %channels = get_channels(); 224$bar->update() if not $opt_quiet; 225$bar->finish() if not $opt_quiet; 226 227 228my @requests; 229 230## read our configuration file now 231my $line = 1; 232foreach(@config_lines) { 233 $line ++; 234 next unless defined; 235 236 if (/^channel:?\s+(\S+)/) { 237 warn("\nConfigured channel $1 not available anymore. \nPlease reconfigure tv_grab_ch_search.\n"), 238 next unless(defined($channels{$1})); 239 push @requests, $1; 240 } else { 241 warn "$config_file:$line: bad line\n"; 242 } 243} 244 245## if we're requested to do so, write out a new config file ... 246if ($mode eq 'configure') { 247 open(CONFIG, ">$config_file") or die("cannot write to $config_file, due to: $!"); 248 249 ## now let's annoy the user, sorry, I meant ask .. 250 my @chs = sort keys %channels; 251 my @names = map { $channels{$_} } @chs; 252 my @qs = map { "add channel $_?" } @names; 253 my @want = ask_many_boolean(1, @qs); 254 255 foreach (@chs) { 256 my $w = shift @want; 257 my $chname = shift @names; 258 259 warn("cannot read input, stopping to ask questions ..."), last if not defined $w; 260 261 print CONFIG '#' if not $w; #- comment line out if user answer 'no' 262 263 # shall we store the display name in the config file? 264 # leave it in, since it probably makes it a lot easier for the 265 # user to choose which channel to comment/uncommet - when manually 266 # viing the config file -- are there people who do that? 267 print CONFIG "channel $_ #$chname\n"; 268 } 269 270 close CONFIG or warn "unable to nicely close the config file: $!"; 271 say("Finished configuration."); 272 273 exit(); 274} 275 276 277 278## well, we don't have to write a config file, so, probably it's some xml stuff :) 279## if not, let's go dying ... 280die unless($mode eq 'grab' or $mode eq 'list-channels'); 281 282my %writer_args; 283if (defined $opt_output) { 284 my $handle = new IO::File(">$opt_output"); 285 die "cannot write to output file, $opt_output: $!" unless (defined $handle); 286 $writer_args{'OUTPUT'} = $handle; 287} 288 289$writer_args{'encoding'} = 'utf-8'; 290 291 292if( defined( $opt_days )) { 293 $writer_args{offset} = $opt_offset; 294 $writer_args{days} = $opt_days; 295 $writer_args{cutoff} = "000000"; 296} 297 298## create our writer object 299my $writer = new XMLTV::Writer(%writer_args); 300$writer->start($head); 301 302 303 304if ($mode eq 'list-channels') { 305 foreach (keys %channels) { 306 my %channel = ('id' => channel_id($_), 307 'display-name' => [[$channels{$_}, $lang]]); 308 $writer->write_channel(\%channel); 309 } 310 311 $writer->end(); 312 exit(); 313} 314 315 316 317## there's only one thing, why we might exist: write out tvdata! 318die unless ($mode eq 'grab'); 319die "No channels specified, run me with --configure flag\n" unless(scalar(@requests)); 320 321 322 323## write out <channel> tags 324my $paramstr =""; 325foreach(@requests) { 326 my $id = channel_id($_); 327 my %channel = ('id' => $id, 328 'display-name' => [[$channels{$_}, $lang]]); 329 $writer->write_channel(\%channel); 330 $paramstr = $paramstr."&channels[]=".$_; 331 332} 333 334 335## the page doesn't specify the year when the programmes begin or end, thus 336## we need to guess, store current year and month globally as needed for every 337## programme ... 338my $cur_year = DateTime->now()->year(); 339my $cur_month = DateTime->now()->month(); 340 341my $url=$head->{q(source-data-url)}; 342 343 344my $ua = LWP::UserAgent->new(keep_alive => 300); 345$ua->cookie_jar(HTTP::Cookies->new()); 346$ua->agent("xmltv/$XMLTV::VERSION"); 347$ua->env_proxy; 348 349my $req = HTTP::Request->new(POST => $url); 350$req->content_type('application/x-www-form-urlencoded'); 351$req->content(substr ( $paramstr, 1)); 352 353# FIXME what is this request doing? It fills the cookie jar 354$ua->request($req); 355$ua->request($req); 356 357## write out <programme> tags 358grab_channels(); 359 360## hey, looks like we've finished ... 361$writer->end(); 362 363 364## channel_id($s) :: turn site channel id into an xmltv id 365sub channel_id($) { 366 my $s = shift; 367 $s =~ s|^tv_||; 368 return "$s.search.ch" 369} 370 371sub parse_page { 372 my ($tb, $start_parse_date) = @_; 373 foreach my $tv_channel ( $tb->look_down('class' => 'sl-card tv-index-channel') ) { 374 my $channel_id = substr($tv_channel->attr('id'), 3); # tv-sf1 -> sf1 375 if ( defined($channel_id) ) { 376 foreach my $tv_show ( $tv_channel ->look_down('class' => 'tv-tooltip') ) { 377 my %show; 378 $show{channel} = channel_id($channel_id); 379 380 my $tmp = $tv_show->look_down('_tag', 'a'); 381 next unless defined($tmp); 382 383 my %params = URI::URL->new($tmp->attr('href'))->query_form(); 384 my $start_date = $params{'start'}; 385 my $end_date = $params{'end'}; 386 next unless defined($start_date); 387 388 my $show_start = DateTime->new ( 389 year => substr($start_date, 0, 4) 390 ,month => substr($start_date, 5, 2) 391 ,day => substr($start_date, 8, 2) 392 ,hour => substr($start_date, 11, 2) 393 ,minute => substr($start_date, 14, 2) 394 ,second => substr($start_date, 17, 2) 395 ,time_zone => 'Europe/Zurich'); 396 $show{start} = $show_start->strftime( "%Y%m%d%H%M%S %z" ); 397 # skip shows starting before the start date to prevent duplicates 398 next if $show_start < $start_parse_date; 399 400 $show{stop} = DateTime->new ( 401 year => substr($end_date, 0, 4) 402 ,month => substr($end_date, 5, 2) 403 ,day => substr($end_date, 8, 2) 404 ,hour => substr($end_date, 11, 2) 405 ,minute => substr($end_date, 14, 2) 406 ,second => substr($end_date, 17, 2) 407 ,time_zone => 'Europe/Zurich' 408 )->strftime( "%Y%m%d%H%M%S %z" ); 409 410 my $title_tag = $tv_show->look_down('_tag' => 'h2'); 411 $title_tag->objectify_text(); 412 my $title = $title_tag->look_down('_tag', '~text')->attr('text'); 413 $show{'title'} = [[$title, $lang]]; 414 415 my $sub_title = $tv_show->look_down('_tag' => 'h3'); 416 $show{'sub-title'} = [[$sub_title->as_text(), $lang]] if($sub_title); 417 418 # Note: The layout is using dl lists for displaying this data 419 # and only the dt tag is marked with meaningful classes. That's 420 # why $variable->right()-as_text() is employed here to get the 421 # content of the unmarked dd tag. 422 423 # Beschreibung 424 foreach my $description ($tv_show->look_down('class' => 'tv-detail-description')) { 425 $show{desc} = [[ $description->right()->as_text(), $lang ]] 426 } 427 428 # Produktionsjahr 429 foreach my $year ($tv_show->look_down('class' => 'tv-detail-year tv-detail-short')) { 430 $show{date} = $year->right()->as_text(); 431 } 432 433 # Kategorie 434 foreach my $category ($tv_show->look_down('class' => 'tv-detail-catname tv-detail-short')) { 435 my $s = $category->right()->as_text(); 436 my @categories = split(m/\s*[\/]\s*/, $s); 437 foreach (@categories) { 438 push @{$show{category}}, [$_, $lang ] if ($_) 439 } 440 } 441 442 # Produktionsinfos 443 foreach my $category ($tv_show->look_down('class' => 'tv-detail-production tv-detail-short')) { 444 my $s = $category->right()->as_text(); 445 $s=~ s/\(.*//; 446 my @categories = split(m/\s*[\/,]\s*/, $s); 447 foreach my $category (@categories) { 448 if ($category) { 449 my $is_defined = 0; 450 foreach ( @{$show{category}} ) { 451 if ("${$_}[0]" eq "$category" ) { 452 $is_defined = 1; 453 last; 454 } 455 } 456 push @{$show{category}}, [$category, $lang ] if ($is_defined == 0); 457 } 458 } 459 } 460 461 # Produktionsland 462 foreach my $country ($tv_show->look_down('class' => 'tv-detail-country tv-detail-short')) { 463 my @countries = split(m/\s*[\/,]\s*/, $country->right()->as_text()); 464 foreach (@countries) { 465 push @{$show{country}}, [$_, $lang ]; 466 } 467 } 468 469 # Cast 470 foreach my $cast ($tv_show->look_down('class' => 'tv-detail-cast')) { 471 my $s = $cast->right()->as_text(); 472 $s=~ s/\(.*//; 473 my @actors = split(m/\s*,\s*/, $s); 474 $show{credits}{actor} = \@actors; 475 } 476 477 # Regisseur 478 foreach my $directors ($tv_show->look_down('class' => 'tv-detail-director tv-detail-short')) { 479 my @directors = split(m/\s*,\s*/, $directors->right()->as_text()); 480 $show{credits}{director} = \@directors; 481 } 482 483 # Drehbuch 484 foreach my $writers ($tv_show->look_down('class' => 'tv-detail-writer tv-detail-short')) { 485 my @writers = split(m/\s*,\s*/, $writers->right()->as_text()); 486 $show{credits}{writer} = \@writers; 487 } 488 489 # Wiederholung 490 foreach my $previously_shown ($tv_show->look_down('class' => 'tv-detail-repetition')) { 491 $show{'previously-shown'} = {} 492 } 493 494 # Episode 495 foreach my $episode ($tv_show->look_down('class' => 'tv-detail-episode tv-detail-short')) { 496 $show{'episode-num'} = [[ $episode->right()->as_text(), 'onscreen' ]] 497 } 498 499 # Untertitel f�r Geh�rlose 500 foreach my $deaf ($tv_show->look_down('_tag' => 'img', 'title' => encode("utf-8", "Untertitel f�r Geh�rlose"))) { 501 $show{subtitles} = [{ type => 'teletext' }]; 502 } 503 504 # Zweikanalton 505 foreach my $bilingual ($tv_show->look_down('_tag' => 'img', 'title' => 'Zweikanalton')) { 506 $show{audio}{stereo} = 'bilingual'; 507 } 508 509 # 16:9 510 foreach my $aspect ($tv_show->look_down('_tag' => 'img', 'title' => '16:9')) { 511 $show{video}{aspect} = '16:9'; 512 } 513 514 $writer->write_programme(\%show); 515 } 516 } 517 } 518} 519 520sub grab_channels { 521 my $grabDate = $grab_start; 522 my $url = $head->{q(source-info-url)}; 523 524 $bar = new XMLTV::ProgressBar('grabbing channels ', (6*$opt_days)) 525 if not $opt_quiet; 526 527 grab_channel_loop: 528 for (my $count = 0; $count < 6; $count++) { 529 my $tb = HTML::TreeBuilder->new(); 530 531 my $loop_date = $grabDate->year() . '-' . substr("0".$grabDate->month(),-2) . '-' . substr("0".$grabDate->day(),-2); 532 my $req = HTTP::Request->new(GET => "$url?time=$loop_date+" . 4*$count .".00"); 533 $req->header('Accept' => 'text/html'); 534 535 $tb->ignore_unknown(0); # otherwise, html5 tags like section are stripped out 536 $tb->parse(($ua->request($req))->content) 537 or die "cannot parse content of http://tv.search.ch/?time=$loop_date+" . 4*$count .".00"; 538 $tb->eof; 539 540 parse_page($tb, $grabDate->clone()->truncate("to" => "hour")->set_hour(4*$count)); 541 542 $tb->delete(); 543 update $bar if not $opt_quiet; 544 } 545 $grabDate = $grabDate->add ( days => 1 ); 546 if( DateTime->compare ( $grab_stop, $grabDate ) > 0) { 547 goto grab_channel_loop; 548 } 549 $bar->finish() 550 unless($opt_quiet); 551} 552 553 554## get channel listing 555sub get_channels() { 556 my %channels; 557 my $url=$head->{q(source-data-url)}; 558 559 my $tb=new HTML::TreeBuilder(); 560 $tb->parse(get_page($url)) 561 or die "cannot parse content of $url"; 562 $tb->eof; 563 564 ## getting the channels directly selectable 565 foreach($tb->look_down('_tag' => 'label')) { 566 my $id = ($_->look_down('_tag' => 'input'))->id; # tv-channel-sf1 567 next unless(substr($id, 0, 10) eq "tv-channel"); 568 my $channel_name = $_->as_text(); 569 570 $channels{uri_escape(substr($id, 11))} = $channel_name; 571 } 572 $tb->delete; 573 574 return %channels; 575} 576 577 578 579## get_page($url) :: try to download $url via http://, look for closing </body> tag or die 580sub get_page($) { 581 my $url = shift; 582 my $retry = 0; 583 584 local $SIG{__DIE__} = sub { die "\n$url: $_[0]" }; 585 586 while($retry < 2) { 587 my $got = eval { get_nice($url . ($retry ? "&retry=$retry" : "")); }; 588 $retry ++; 589 590 next if($@); # unable to download, doesn't look too good for us. 591 return $got; 592 } 593 594 die "cannot grab webpage $url (tried $retry times). giving up. sorry"; 595} 596