1#!/usr/local/bin/perl
2
3=pod
4
5=head1 NAME
6
7tv_grab_ch_search - Grab TV listings for Switzerland (from tv.search.ch webpage).
8
9=head1 SYNOPSIS
10
11tv_grab_ch_search --help
12
13tv_grab_ch_search [--config-file FILE] --configure [--gui OPTION]
14
15tv_grab_ch_search [--config-file FILE] [--output FILE] [--quiet]
16           [--days N] [--offset N]
17
18tv_grab_ch_search --list-channels
19
20tv_grab_ch_search --capabilities
21
22tv_grab_ch_search --version
23
24=head1 DESCRIPTION
25
26Output TV listings for several channels available in Switzerland and
27(partly) central Europe.
28The data comes from tv.search.ch. The grabber relies on
29parsing HTML so it might stop working at any time.
30
31First run B<tv_grab_ch_search --configure> to choose, which channels
32you want to download. Then running B<tv_grab_ch_search> with no
33arguments will output listings in XML format to standard output.
34
35B<--configure> Ask for each available channel whether to download
36and write the configuration file.
37
38B<--config-file FILE> Set the name of the configuration file, the
39default is B<~/.xmltv/tv_grab_ch_search.conf>.  This is the file
40written by B<--configure> and read when grabbing.
41
42B<--gui OPTION> Use this option to enable a graphical interface to be used.
43OPTION may be 'Tk', or left blank for the best available choice.
44Additional allowed values of OPTION are 'Term' for normal terminal output
45(default) and 'TermNoProgressBar' to disable the use of Term::ProgressBar.
46
47B<--output FILE> Write to FILE rather than standard output.
48
49B<--days N> Grab N days.  The default is fourteen.
50
51B<--offset N> Start N days in the future.  The default is to start
52from now on (= zero).
53
54B<--quiet> Suppress the progress messages normally written to standard
55error.
56
57B<--list-channels> Write output giving <channel> elements for every
58channel available (ignoring the config file), but no programmes.
59
60B<--capabilities> Show which capabilities the grabber supports. For more
61information, see L<http://wiki.xmltv.org/index.php/XmltvCapabilities>
62
63B<--version> Show the version of the grabber.
64
65B<--help> print a help message and exit.
66
67
68=head1 SEE ALSO
69
70L<xmltv(5)>.
71
72=head1 AUTHOR
73
74Daniel Bittel <betlit@gmx.net>. Inspired by tv_grab_ch by Stefan Siegl.
75Patric Mueller <bhaak@gmx.net>.
76
77=head1 BUGS
78
79If you happen to find a bug, you're requested to send a mail to one of the
80XMLTV mailing lists, see webpages at http://sourceforge.net/projects/xmltv/.
81
82=cut
83
84use warnings;
85use strict;
86use Encode;
87use DateTime;
88use LWP::Simple;
89use HTTP::Cookies;
90use XMLTV::Version '$Id: tv_grab_ch_search.in,v 1.22 2016/07/15 10:23:24 bhaak Exp $ ';
91use XMLTV::Capabilities qw/baseline manualconfig cache/;
92use XMLTV::Description 'Switzerland (tv.search.ch)';
93use XMLTV::Supplement qw/GetSupplement/;
94use Getopt::Long;
95use HTML::TreeBuilder;
96use HTML::Entities;
97use URI::Escape;
98use URI::URL;
99use XMLTV;
100use XMLTV::Ask;
101use XMLTV::ProgressBar;
102use XMLTV::DST;
103use XMLTV::Config_file;
104use XMLTV::Mode;
105use XMLTV::Get_nice;
106use XMLTV::Memoize;
107use XMLTV::Usage<<END
108$0: get Swiss television listings from tv.search.ch in XMLTV format
109To configure: $0 --configure [--config-file FILE] [--gui OPTION]
110To grab data: $0 [--config-file FILE] [--output FILE] [--quiet]
111                 [--days N] [--offset N]
112Channel List: $0 --list-channels
113To show capabilities: $0 --capabilities
114To show version: $0 --version
115
116END
117  ;
118
119# Use Log::TraceMessages if installed.
120BEGIN {
121    eval { require Log::TraceMessages };
122    if ($@) {
123        *t = sub {};
124        *d = sub { '' };
125    }
126    else {
127        *t = \&Log::TraceMessages::t;
128        *d = \&Log::TraceMessages::d;
129    }
130}
131
132
133
134## our own prototypes first ...
135sub get_channels();
136sub channel_id($);
137sub get_page($);
138sub grab_channel($);
139
140## attributes of xmltv root element
141my $head = {
142    'source-data-url'      => 'https://tv.search.ch/channels',
143    'source-info-url'      => 'https://tv.search.ch/',
144    'generator-info-name'  => 'XMLTV',
145    'generator-info-url'   => 'http://xmltv.org/',
146};
147
148
149
150## the timezone tv.search.ch lives in is, CET/CEST
151my constant $TZ = "+0100";
152my constant $lang = "de";
153
154
155
156## Parse argv now.  First do undocumented --cache option.
157XMLTV::Memoize::check_argv('XMLTV::Get_nice::get_nice_aux');
158
159
160
161my $opt_configure;
162my $opt_config_file;
163my $opt_gui;
164my $opt_output;
165my $opt_days = 14;
166my $opt_offset = 0;
167my $opt_quiet = 0;
168my $opt_slow = 0;
169my $opt_list_channels;
170my $opt_help;
171
172GetOptions(
173    'configure'      => \$opt_configure,
174    'config-file=s'  => \$opt_config_file,
175    'gui:s'          => \$opt_gui,
176    'output=s'       => \$opt_output,
177    'days=i'         => \$opt_days,
178    'offset=i'       => \$opt_offset,
179    'quiet'          => \$opt_quiet,
180    'slow'           => \$opt_slow,
181    'list-channels'  => \$opt_list_channels,
182    'help'           => \$opt_help,
183) or usage(0);
184
185usage(1) if $opt_help;
186
187XMLTV::Ask::init($opt_gui);
188
189## make sure offset+days arguments are within range
190die "neither offset nor days may be negative"
191  if($opt_offset < 0 || $opt_days < 0);
192
193
194## calculate global start/stop times ...
195my $grab_start = DateTime->now(time_zone => 'Europe/Zurich')->add( days => $opt_offset );
196my $grab_stop = DateTime->now(time_zone => 'Europe/Zurich')->add ( days => $opt_offset + $opt_days );
197
198my $mode = XMLTV::Mode::mode('grab', # default value
199    $opt_configure => 'configure',
200    $opt_list_channels => 'list-channels',
201);
202
203
204
205## initialize config file support
206my $config_file = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_ch_search', $opt_quiet);
207my @config_lines;
208
209if($mode eq 'configure') {
210    XMLTV::Config_file::check_no_overwrite($config_file);
211}
212elsif($mode eq 'grab' || $mode eq 'list-channels') {
213    @config_lines = XMLTV::Config_file::read_lines($config_file);
214}
215else { die("never heard of XMLTV mode $mode, sorry :-(") }
216
217
218
219## hey, we can't live without channel data, so let's get those now!
220my $bar = new XMLTV::ProgressBar( 'getting list of channels', 1 )
221    if not $opt_quiet;
222
223my %channels = get_channels();
224$bar->update() if not $opt_quiet;
225$bar->finish() if not $opt_quiet;
226
227
228my @requests;
229
230## read our configuration file now
231my $line = 1;
232foreach(@config_lines) {
233    $line ++;
234    next unless defined;
235
236    if (/^channel:?\s+(\S+)/) {
237        warn("\nConfigured channel $1 not available anymore. \nPlease reconfigure tv_grab_ch_search.\n"),
238          next unless(defined($channels{$1}));
239        push @requests, $1;
240    } else {
241        warn "$config_file:$line: bad line\n";
242    }
243}
244
245## if we're requested to do so, write out a new config file ...
246if ($mode eq 'configure') {
247    open(CONFIG, ">$config_file") or die("cannot write to $config_file, due to: $!");
248
249    ## now let's annoy the user, sorry, I meant ask ..
250    my @chs = sort keys %channels;
251    my @names = map { $channels{$_} } @chs;
252    my @qs = map { "add channel $_?" } @names;
253    my @want = ask_many_boolean(1, @qs);
254
255    foreach (@chs) {
256        my $w = shift @want;
257        my $chname = shift @names;
258
259        warn("cannot read input, stopping to ask questions ..."), last if not defined $w;
260
261        print CONFIG '#' if not $w; #- comment line out if user answer 'no'
262
263        # shall we store the display name in the config file?
264        # leave it in, since it probably makes it a lot easier for the
265        # user to choose which channel to comment/uncommet - when manually
266        # viing the config file -- are there people who do that?
267        print CONFIG "channel $_ #$chname\n";
268    }
269
270    close CONFIG or warn "unable to nicely close the config file: $!";
271    say("Finished configuration.");
272
273    exit();
274}
275
276
277
278## well, we don't have to write a config file, so, probably it's some xml stuff :)
279## if not, let's go dying ...
280die unless($mode eq 'grab' or $mode eq 'list-channels');
281
282my %writer_args;
283if (defined $opt_output) {
284    my $handle = new IO::File(">$opt_output");
285    die "cannot write to output file, $opt_output: $!" unless (defined $handle);
286    $writer_args{'OUTPUT'} = $handle;
287}
288
289$writer_args{'encoding'} = 'utf-8';
290
291
292if( defined( $opt_days )) {
293    $writer_args{offset} = $opt_offset;
294    $writer_args{days} = $opt_days;
295    $writer_args{cutoff} = "000000";
296}
297
298## create our writer object
299my $writer = new XMLTV::Writer(%writer_args);
300$writer->start($head);
301
302
303
304if ($mode eq 'list-channels') {
305    foreach (keys %channels) {
306        my %channel = ('id'           => channel_id($_),
307                       'display-name' => [[$channels{$_}, $lang]]);
308        $writer->write_channel(\%channel);
309    }
310
311    $writer->end();
312    exit();
313}
314
315
316
317## there's only one thing, why we might exist: write out tvdata!
318die unless ($mode eq 'grab');
319die "No channels specified, run me with --configure flag\n" unless(scalar(@requests));
320
321
322
323## write out <channel> tags
324my $paramstr ="";
325foreach(@requests) {
326    my $id = channel_id($_);
327    my %channel = ('id'           => $id,
328                   'display-name' => [[$channels{$_}, $lang]]);
329    $writer->write_channel(\%channel);
330    $paramstr = $paramstr."&channels[]=".$_;
331
332}
333
334
335## the page doesn't specify the year when the programmes begin or end, thus
336## we need to guess, store current year and month globally as needed for every
337## programme ...
338my $cur_year = DateTime->now()->year();
339my $cur_month = DateTime->now()->month();
340
341my $url=$head->{q(source-data-url)};
342
343
344my $ua = LWP::UserAgent->new(keep_alive => 300);
345$ua->cookie_jar(HTTP::Cookies->new());
346$ua->agent("xmltv/$XMLTV::VERSION");
347$ua->env_proxy;
348
349my $req = HTTP::Request->new(POST => $url);
350$req->content_type('application/x-www-form-urlencoded');
351$req->content(substr ( $paramstr, 1));
352
353# FIXME what is this request doing? It fills the cookie jar
354$ua->request($req);
355$ua->request($req);
356
357## write out <programme> tags
358grab_channels();
359
360## hey, looks like we've finished ...
361$writer->end();
362
363
364## channel_id($s) :: turn site channel id into an xmltv id
365sub channel_id($) {
366    my $s = shift;
367    $s =~ s|^tv_||;
368    return "$s.search.ch"
369}
370
371sub parse_page {
372    my ($tb, $start_parse_date) = @_;
373    foreach my $tv_channel ( $tb->look_down('class' => 'sl-card tv-index-channel') ) {
374        my $channel_id = substr($tv_channel->attr('id'), 3); # tv-sf1 -> sf1
375        if ( defined($channel_id) ) {
376            foreach my $tv_show ( $tv_channel ->look_down('class' => 'tv-tooltip') ) {
377                my %show;
378                $show{channel} = channel_id($channel_id);
379
380                my $tmp = $tv_show->look_down('_tag', 'a');
381                next unless defined($tmp);
382
383                my %params = URI::URL->new($tmp->attr('href'))->query_form();
384                my $start_date = $params{'start'};
385                my $end_date = $params{'end'};
386                next unless defined($start_date);
387
388                my $show_start = DateTime->new (
389                    year => substr($start_date, 0, 4)
390                    ,month => substr($start_date, 5, 2)
391                    ,day   => substr($start_date, 8, 2)
392                    ,hour  => substr($start_date, 11, 2)
393                    ,minute => substr($start_date, 14, 2)
394                    ,second => substr($start_date, 17, 2)
395                    ,time_zone => 'Europe/Zurich');
396                $show{start} = $show_start->strftime( "%Y%m%d%H%M%S %z" );
397                # skip shows starting before the start date to prevent duplicates
398                next if $show_start < $start_parse_date;
399
400                $show{stop} = DateTime->new (
401                    year => substr($end_date, 0, 4)
402                    ,month => substr($end_date, 5, 2)
403                    ,day   => substr($end_date, 8, 2)
404                    ,hour  => substr($end_date, 11, 2)
405                    ,minute => substr($end_date, 14, 2)
406                    ,second => substr($end_date, 17, 2)
407                    ,time_zone => 'Europe/Zurich'
408                )->strftime( "%Y%m%d%H%M%S %z" );
409
410                my $title_tag = $tv_show->look_down('_tag' => 'h2');
411                $title_tag->objectify_text();
412                my $title = $title_tag->look_down('_tag', '~text')->attr('text');
413                $show{'title'} = [[$title, $lang]];
414
415                my $sub_title = $tv_show->look_down('_tag' => 'h3');
416                $show{'sub-title'} = [[$sub_title->as_text(), $lang]] if($sub_title);
417
418                # Note: The layout is using dl lists for displaying this data
419                # and only the dt tag is marked with meaningful classes. That's
420                # why $variable->right()-as_text() is employed here to get the
421                # content of the  unmarked dd tag.
422
423                # Beschreibung
424                foreach my $description ($tv_show->look_down('class' => 'tv-detail-description')) {
425                    $show{desc} = [[ $description->right()->as_text(), $lang ]]
426                }
427
428                # Produktionsjahr
429                foreach my $year ($tv_show->look_down('class' => 'tv-detail-year tv-detail-short')) {
430                    $show{date} = $year->right()->as_text();
431                }
432
433                # Kategorie
434                foreach my $category ($tv_show->look_down('class' => 'tv-detail-catname tv-detail-short')) {
435                    my $s = $category->right()->as_text();
436                    my @categories = split(m/\s*[\/]\s*/, $s);
437                    foreach  (@categories) {
438                        push @{$show{category}}, [$_, $lang ] if ($_)
439                    }
440                }
441
442                # Produktionsinfos
443                foreach my $category ($tv_show->look_down('class' => 'tv-detail-production tv-detail-short')) {
444                    my $s = $category->right()->as_text();
445                    $s=~ s/\(.*//;
446                    my @categories = split(m/\s*[\/,]\s*/, $s);
447                    foreach my $category (@categories) {
448                        if ($category) {
449                            my $is_defined = 0;
450                            foreach ( @{$show{category}} ) {
451                                if ("${$_}[0]" eq "$category" ) {
452                                    $is_defined = 1;
453                                    last;
454                                }
455                            }
456                            push @{$show{category}}, [$category, $lang ] if ($is_defined == 0);
457                        }
458                    }
459                }
460
461                # Produktionsland
462                foreach my $country ($tv_show->look_down('class' => 'tv-detail-country tv-detail-short')) {
463                    my @countries = split(m/\s*[\/,]\s*/, $country->right()->as_text());
464                    foreach (@countries) {
465                        push @{$show{country}}, [$_, $lang ];
466                    }
467                }
468
469                # Cast
470                foreach my $cast ($tv_show->look_down('class' => 'tv-detail-cast')) {
471                    my $s = $cast->right()->as_text();
472                    $s=~ s/\(.*//;
473                    my @actors = split(m/\s*,\s*/, $s);
474                    $show{credits}{actor} = \@actors;
475                }
476
477                # Regisseur
478                foreach my $directors ($tv_show->look_down('class' => 'tv-detail-director tv-detail-short')) {
479                    my @directors = split(m/\s*,\s*/, $directors->right()->as_text());
480                    $show{credits}{director} = \@directors;
481                }
482
483                # Drehbuch
484                foreach my $writers ($tv_show->look_down('class' => 'tv-detail-writer tv-detail-short')) {
485                    my @writers = split(m/\s*,\s*/, $writers->right()->as_text());
486                    $show{credits}{writer} = \@writers;
487                }
488
489                # Wiederholung
490                foreach my $previously_shown ($tv_show->look_down('class' => 'tv-detail-repetition')) {
491                    $show{'previously-shown'} = {}
492                }
493
494                # Episode
495                foreach my $episode ($tv_show->look_down('class' => 'tv-detail-episode tv-detail-short')) {
496                    $show{'episode-num'} = [[ $episode->right()->as_text(), 'onscreen' ]]
497                }
498
499                # Untertitel f�r Geh�rlose
500                foreach my $deaf ($tv_show->look_down('_tag' => 'img', 'title' => encode("utf-8", "Untertitel f�r Geh�rlose"))) {
501                    $show{subtitles} = [{ type => 'teletext' }];
502                }
503
504                # Zweikanalton
505                foreach my $bilingual ($tv_show->look_down('_tag' => 'img', 'title' => 'Zweikanalton')) {
506                    $show{audio}{stereo} = 'bilingual';
507                }
508
509                # 16:9
510                foreach my $aspect ($tv_show->look_down('_tag' => 'img', 'title' => '16:9')) {
511                    $show{video}{aspect} = '16:9';
512                }
513
514                $writer->write_programme(\%show);
515           }
516       }
517   }
518}
519
520sub grab_channels {
521    my $grabDate = $grab_start;
522    my $url = $head->{q(source-info-url)};
523
524    $bar = new XMLTV::ProgressBar('grabbing channels       ', (6*$opt_days))
525    if not $opt_quiet;
526
527    grab_channel_loop:
528    for (my $count = 0; $count < 6; $count++) {
529        my $tb = HTML::TreeBuilder->new();
530
531        my $loop_date = $grabDate->year() . '-' . substr("0".$grabDate->month(),-2) . '-' . substr("0".$grabDate->day(),-2);
532        my $req = HTTP::Request->new(GET => "$url?time=$loop_date+" . 4*$count .".00");
533        $req->header('Accept' => 'text/html');
534
535        $tb->ignore_unknown(0); # otherwise, html5 tags like section are stripped out
536        $tb->parse(($ua->request($req))->content)
537            or die "cannot parse content of http://tv.search.ch/?time=$loop_date+" . 4*$count .".00";
538        $tb->eof;
539
540        parse_page($tb, $grabDate->clone()->truncate("to" => "hour")->set_hour(4*$count));
541
542        $tb->delete();
543        update $bar if not $opt_quiet;
544    }
545    $grabDate = $grabDate->add ( days => 1 );
546    if( DateTime->compare ( $grab_stop, $grabDate ) > 0) {
547        goto grab_channel_loop;
548    }
549    $bar->finish()
550    unless($opt_quiet);
551}
552
553
554## get channel listing
555sub get_channels() {
556    my %channels;
557    my $url=$head->{q(source-data-url)};
558
559    my $tb=new HTML::TreeBuilder();
560    $tb->parse(get_page($url))
561        or die "cannot parse content of $url";
562    $tb->eof;
563
564    ## getting the channels directly selectable
565    foreach($tb->look_down('_tag' => 'label')) {
566        my $id = ($_->look_down('_tag' => 'input'))->id; # tv-channel-sf1
567        next unless(substr($id, 0, 10) eq "tv-channel");
568        my $channel_name = $_->as_text();
569
570        $channels{uri_escape(substr($id, 11))} = $channel_name;
571    }
572    $tb->delete;
573
574    return %channels;
575}
576
577
578
579## get_page($url) :: try to download $url via http://, look for closing </body> tag or die
580sub get_page($) {
581    my $url = shift;
582    my $retry = 0;
583
584    local $SIG{__DIE__} = sub { die "\n$url: $_[0]" };
585
586    while($retry < 2) {
587        my $got = eval { get_nice($url . ($retry ? "&retry=$retry" : "")); };
588        $retry ++;
589
590        next if($@); # unable to download, doesn't look too good for us.
591        return $got;
592    }
593
594    die "cannot grab webpage $url (tried $retry times). giving up. sorry";
595}
596