1#!/usr/local/bin/perl -w
2=pod
3
4=head1 NAME
5
6tv_grab_be - Grab TV listings for Belgium
7
8=head1 SYNOPSIS
9
10tv_grab_be --help
11
12tv_grab_be [--config-file FILE] --configure [--slow] [--gui OPTION]
13
14tv_grab_be [--config-file FILE] [--output FILE] [--days N]
15           [--offset N] [--quiet] [--slow] [--gui OPTION]
16
17tv_grab_be [--output FILE] [--quiet] [--config-file FILE] --list-channels
18
19tv_grab_be --capabilities
20
21tv_grab_be --version
22
23=head1 DESCRIPTION
24
25Output TV and radio listings in XMLTV format for many stations
26available in Belgium.  The data comes from the Sanoma magazines'
27websites: Tele Moustique and Teve Blad.
28
29=head1 USAGE
30
31First you must run B<tv_grab_be --configure> to choose the language,
32grab mode and which stations you want to receive.
33
34Then running B<tv_grab_be> with no arguments will get about 6
35dayE<39>s of summary only listings for the channels you chose.
36
37If you want to grab detailed information (such as episode name,
38detailed descriptions, actors) then use the B<--slow> flag when both
39onfiguring and running the grabber. The configure mode will prompt
40you for selection criteria for when the grabber should retrieve
41detailed information for programmes (selected by start time, category,
42and channel). This makes grabbing slow (hence the option name!)
43
44Note that different stations ar available in French and Dutch modes
45due to the listings differences from the two sites. The data is also
46different -- the French site has more detailed info for the french
47language channels, and the Dutch site has more info for the Dutch
48language channels.
49
50Some channels (BBC World, Euronews) although listed, have such bad
51listings data that you would be better off using the tv_grab_uk_rt
52grabber, and then merging the resulting files with tv_cat!
53
54It is perhaps worthwhile to use the --config-file option to maintain
55one config file with a selection of channels for each language, then
56using tv_cat to merge the resulting XML files.
57
58B<--configure> Prompt for language, grab mode and which
59stations to download and write the configuration file (see also --slow)
60
61B<--config-file FILE> Set the name of the configuration file, the
62default is B<~/.xmltv/tv_grab_be.conf>.  This is the file written by
63B<--configure> and read when grabbing.
64
65B<--output FILE> When grabbing, write output to FILE rather than
66standard output.
67
68B<--days N> When grabbing, grab N days rather than as many as
69possible.
70
71B<--offset N> Start grabbing at today + N.  N may be negative.
72
73B<--quiet> Suppress the progress messages normally written to standard
74error.
75
76B<--slow> Slow mode: get detailed information for specified
77programmes. With B<--configure>, this enables the configuration
78routine to prompt for the criteria which programs have to match for
79detailed information downloading. Otherwise, this enables the grabbing
80of detailed for programmes matching the defined criteria.
81
82B<--trace> Show debug information (if L<Log::TraceMessages> is installed)
83
84B<--gui OPTION> Use this option to enable a graphical interface to be used.
85OPTION may be 'Tk', or left blank for the best available choice.
86Additional allowed values of OPTION are 'Term' for normal terminal output
87(default) and 'TermNoProgressBar' to disable the use of Term::ProgressBar.
88
89B<--list-channels> Dump channel information for all channels but no
90programmes.  This grabber needs a config file first before the
91channels can be dumped.
92
93B<--capabilities> Show which capabilities the grabber supports. For more
94information, see L<http://wiki.xmltv.org/index.php/XmltvCapabilities>
95
96B<--version> Show the version of the grabber.
97
98B<--help> Print a help message and exit.
99
100=head1 WARNING
101
102In B<--slow> mode, unning this grabber requires very many web page
103fetches (one per channel per day, and then one per programme selected
104for detailed information) from a very slow web site.
105
106The number of web page fetches can be limited by limiting the number
107of programs to get detailed information for (by start time range,
108category or channel). This is defined when run with B<--configure
109--slow> or in the config file.
110
111=head1 SEE ALSO
112
113L<xmltv(5)>, L<http://www.telepocket.be>, L<http://www.teveblad.be>
114
115=head1 AUTHOR
116
117Niel Markwick, nielm@bigfoot.com
118Based on B<tv_grab_uk_rt>
119
120=head1 BUGS
121
122The website parsing isnE<39>t perfect and there may be warning
123messages about bits of HTML that arenE<39>t understood.  Some of the
124details provided by the site have to be thrown away because they
125cannot be accommodated in the XMLTV format; again, warning messages
126are printed.
127
128Programmes containing defined sections are not handled very well (such
129as Sportpaleis on Canvas) because the data source lists the sections
130separately with overlapping timeslots. eg:
131
132=over
133
134=item "13.30-14.00 Hands Up!"
135
136=item "13.30-17.30 Sportpaleis"
137
138=item "14.00 14.30 Champions League Magazine"
139
140=item "14.30 17.00 Wielrennen: Kuurne - Brussel - Kuurne"
141
142=item "17.00 17.15 Autorennen: F1"
143
144=item "17.15 17.30 Daar is 'm!"
145
146=back
147
148
149
150The data on the website can also be poor. Program names gain and lose
151random punctuation from week to week. eg:
152
153=over
154
155=item "Buffy, the Vampire Slayer.",
156
157=item "Buffy the Vampire Slayer",
158
159=item "Buffy, the Vampire Slayer",
160
161=item "Buffy the Vampire Slayer."
162
163=back
164
165The grabber strips trailing punctuation to help avoid this.
166
167Sometimes the stop time is not put on last programme of the day. This
168can be worked around by piping the output through tv_sort, and the start
169time of the first program of the next day will be used.
170Alternatively, the following complex tv_grep command can add an
171implicit stop time of 06:00:
172
173tv_grep -e 'if (not ${$_}{stop}) { (${$_}{stop} = ${$_}{start}) =~ s/\d{6}\b/060000/ }; 1'
174
175Finally there are several things still to do (see TODO list in source
176code for full description).
177
178=head1 HISTORY
179
180B<2008-10-14 nielm> - remove teveblad due to user-agent blocking
181
182B<2007-01-09 nielm> - switch back to telemoustique for fr
183
184B<2004-01-08 nielm> - first version with selective detail grabbing,
185based on tv_grab_uk_rt 0.5.27
186
187B<2004-01-09 nielm> - Disable detail grabbing by default; implemented
188--slow option to enable detail grabbing and to complicate
189configuration procedure; . Removed implicit generation of stop
190time. Correct windows special characters (128-159): oe ligatures ->
191oe; fancy quotes -> normal quotes, others -> ? (with warning); Fixed
192warning about Log::Tracemessages::On.
193
194B<2004-01-13 nielm> - Replace '...' Windows character; Added parsing
195of star ratings; warn about unrecognised images in description text;
196Added parsing of movie ratings (classifications); Future-proof config
197file to cope with grabbing multiple languages simultaneously; Added
198--output option
199
200B<2004-01-15 nielm> - Put year into date tag; Put director of films
201(if found in descr) into director tag; Get year from descr if not
202found, add channel logos (from satlogo.com).
203
204B<2004-01-26 nielm> - handle VO/OV image without warning; make
205multi-line descriptions; correct date parsing; correct episode num
206in FR listings; correct stop time bug when no stop time defined;
207removed lang=xx from title and sub-title;
208
209B<2004-01-29 nielm> - add icons in rating and star-rating; handle
210episode numbers in titles better; handle extracting of director
211better.
212
213B<2004-02-23 nielm/epaepa> - tidy up of help text, remove newlines
214from desc, improve start/stop time details matching, add detaul URL to
215fast mode programme info
216
217B<2004-03-04 nielm> - Correct usage, handle Duree (length), handle
218repeats (previously-shown), handle episode numbers in description.
219More things added to TODO list (see source code)
220
221B<2004-03-09 nielm> - Remove Duree and (R.) from description
222text. Remove categories in description. Do not put episode numbers in
223sub-title
224
225B<2004-04-01 nielm> - Fixed bug with no stop time for programs
226starting at midnight, handle 'New' icon, Fixed JIMTV channel ID,
227handle 'Divers' tags (which may contain info on previously-shown),
228remove 'gastacteurs:' from actor names.
229
230B<2004-04-05 epaepa/nielm> - Make time zones consistantly +0100 or
231+0200, but never mixed. Clean up punctuation around actor
232names. Remove duplicate ChannelID from file (only put alternative
233ID's). better actor parsing
234
235B<2004-04-05 epaepa/nielm> - Handle user input better during config
236(CTRL-D, CTRL-H), use substrings not regexps for category/channel
237matching to avoid nastyness when a user enters a bad regexp. Add magic
238category *NONE* and *ALL* for category matching.
239
240B<2004-04-15 nielm> - yet more cleanup in parsing actor names, ignore
241programmes with no titles.
242
243B<2005-03-12 nielm> - no function changes: just changes to the
244configuration section to use XMLTV::ask, and updates to the
245channel_ids files
246
247B<2005-09-22 nielm> - www.telemoustique.be no longer works: quick
248and dirty fix: use www.telepocket.be instead.
249
250=cut
251
252# TODO
253#
254# Merge fr and nl configs to allow single config file.
255#
256#
257# Handle listings where a program is shown again later in the day
258# without a separate lsting for it: eg:
259#    <desc lang="fr">les moments les plus delirants de
260#    l'emission. (13.30, 16.15, 20.45, 0.30)</desc>
261#    <desc lang="fr">S�rie anim�e. La d�cision de Petit-Coeur
262#    (R. � 17.30 et 24.00)</desc>
263#    <desc lang="fr">S�rie australienne (R. � 18.00)</desc>
264# implies this program will be shown at these later times...
265#
266#
267# Handle Followed by 'Suivi' in Fremch descriptions
268#   <desc lang="fr">(R.) Suivi de Le Shopping.</desc>
269#   <desc lang="fr">Suivi, � 14.40, de Tranche de rire.</desc>
270#   <desc lang="fr">suivi � 14.10 de La Boutique - 14.40 Tranche de rire.</desc>
271#   <desc lang="fr">S�rie quotidienne fran�aise suivie, � 19.55, de la M�t�o.</desc>
272#   <desc lang="fr">suivi de L'invit� - La m�t�o</desc>
273#   <desc lang="fr">suivi de L'invit� - La m�t�o. Pascal Vrebos re�oit Laurette Onkelinx, Ministre de la Justice (PS).</desc>
274#
275# Handle grouped programmes
276#     <desc lang="fr">Le lutin Plop
277#      - 7.00 La cour de r�cr� (R.)
278#      - 7.25 Pepper Ann (R.)
279#      - 7.50 Jim Bouton.</desc>
280#
281
282
283use strict;
284use XMLTV::Version '$Id: tv_grab_be.in,v 1.17 2010/09/02 05:07:40 rmeden Exp $ ';
285use XMLTV::Capabilities qw/baseline manualconfig cache/;
286use XMLTV::Description 'Belgium';
287use XMLTV::Supplement qw/GetSupplement/;
288
289use IO::Socket;
290use LWP::Simple;
291use Date::Manip;
292use Getopt::Long;
293use HTML::Entities;
294use XMLTV;
295use XMLTV::Memoize;
296use XMLTV::Ask;
297use XMLTV::ProgressBar;
298use XMLTV::DST;
299use XMLTV::Config_file;
300use XMLTV::Get_nice;
301use XMLTV::Date qw(parse_date);
302use XMLTV::Usage <<END
303To configure:     $0 --configure [--config-file FILE] [--slow] [--gui OPTION]
304To grab listings: $0 [--config-file FILE] [--output FILE]
305                     [--days N] [--offset N] [--quiet] [--slow] [--gui OPTION]
306To list channels: $0 [--output FILE] [--quiet] [--config-file FILE] --list-channels
307To show capabilities: $0 --capabilities
308To show version: $0 --version
309END
310  ;
311
312# Use Log::TraceMessages if installed.
313BEGIN {
314    eval { require Log::TraceMessages };
315    if ($@) {
316        *t = sub {};
317        *d = sub { '' };
318    }    else {
319        *t = \&Log::TraceMessages::t;
320        *d = \&Log::TraceMessages::d;
321        Log::TraceMessages::check_argv();
322    }
323}
324
325sub get_url( $ );
326sub get_programmes( $$$$$$ );
327sub get_programme_summary( $$$$ );
328sub get_programme_detailed_info($$);
329sub parse_programme_details($$); # ref of prog hash, array of descr strings
330sub get_channels();
331sub get_categories();
332sub get_available_dates();
333sub be_to_xmltv( $ );
334sub xmltv_to_be( $ );
335sub grab( $$ );
336sub configure();
337
338# GLOBAL CONSTANTS
339my $LANG_FR = 'fr';
340my $LANG_NL = 'nl';
341
342# language-dependant constants
343my %DOMAIN = ( $LANG_FR => 'telemoustique.be',
344		 $LANG_NL => 'teveblad.be' );
345my %BASE_URL = ( $LANG_FR => "http://www.$DOMAIN{$LANG_FR}/tm/",
346		 $LANG_NL => "http://www.$DOMAIN{$LANG_NL}/ndl/");
347
348# channel to use for getting dates
349my %DATE_CH   = ( $LANG_FR => 'LA%20UNE',
350		  $LANG_NL => 'TV1' );
351
352my %SUMMARY_PATH = ( $LANG_FR => "programme_tele_chaine.html",
353                    $LANG_NL => "zender.asp" );
354
355
356my %DETAIL_PATH = (  $LANG_FR => "programme_tele_detail.html?progid=" ,
357		     $LANG_NL => "detail.asp?progid=" );
358
359# populated from config file
360my @detailgenre;
361my $detailstarttime;
362my $detailstoptime;
363my %get_channel_detail;
364my $LANG;
365
366#stats
367my $numwebgets=0;
368my $kbwebgets=0;
369my $statstarttime=time();
370
371# Check options.  First do the undocumented --cache option (to cache
372# get(), which retrieves web pages), then the normal ones.
373#
374my $using_cache
375     = XMLTV::Memoize::check_argv('XMLTV::Get_nice::get_nice_aux');
376my ($opt_days,
377    $opt_help,
378    $opt_output,
379    $opt_input,
380    $opt_slow,
381    $opt_gui,
382    $opt_configure,
383    $opt_config_file,
384    $opt_offset,
385    $opt_quiet,
386    $opt_list_channels,
387   );
388# No default for $opt_days, we determine it from the site.
389$opt_offset = 0; # default today
390$opt_quiet  = 0; # default
391GetOptions('days=i'        => \$opt_days,
392           'help'          => \$opt_help,
393           'configure'     => \$opt_configure,
394           'slow'          => \$opt_slow,
395           'gui:s'         => \$opt_gui,
396           'config-file=s' => \$opt_config_file,
397           'output=s'      => \$opt_output,
398           'offset=i'      => \$opt_offset,
399	   'list-channels' => \$opt_list_channels,
400           'quiet'         => \$opt_quiet,
401           'input=s'       => \$opt_input, # undocumented -- debug mode:
402	                                   # read data from html file,
403	                                   # no web page gets apart
404	                                   # from program details
405          )
406  or usage(0);
407die 'number of days must not be negative'
408  if (defined $opt_days && $opt_days < 0);
409if ($opt_help) {
410    usage(1);
411}
412
413# Date::Manip has a bug where 'now' will be wrong if you change the
414# timezone.  It won't be correctly converted from the system timezone
415# to the new one.  So we call parse_date('today midnight') _before_
416# Date_Init().
417#
418my $today = DateCalc(parse_date('today midnight'), "$opt_offset days");
419Date_Init('TZ=UTC');
420
421XMLTV::Ask::init($opt_gui);
422
423# Tables to convert between telemoustique / teveblad and XMLTV ids of channels.
424# The way to access these is through the routines be_to_xmltv() and
425# xmltv_to_be(), not directly.  Those will deal sensibly with a new
426# channel that isn't mentioned in the file.
427#
428my (%be_to_xmltv, %xmltv_to_be, %extra_dn, %ch_warn, %logourl);
429my $line_num = 0;
430
431foreach my $CURLANG ( $LANG_FR, $LANG_NL ) {
432    my $CHANNEL_NAMES_FILE = "channel_ids_$CURLANG";
433    my $str = GetSupplement( 'tv_grab_be', $CHANNEL_NAMES_FILE );
434
435    $line_num=0;
436    foreach (split( /\n/, $str)) {
437        ++ $line_num;
438        tr/\r//d;
439        s/#.*//;
440        next if m/^\s*$/;
441        my $where = "$CHANNEL_NAMES_FILE:$line_num";
442        my @fields = split(/:/,$_,5);
443        die "$where: wrong number of fields: " . (scalar @fields)
444            if @fields < 4 or @fields > 5;
445        my ($xmltv_id, $be_id, $extra_dn, $logourl, $ch_warn) = @fields;
446	die "$where Sanonma id not specified"
447	    if ( not defined $be_id || $be_id eq '' );
448        warn "$where: $CURLANG Sanoma id $be_id seen already\n"
449            if defined $be_to_xmltv{$CURLANG}{$be_id};
450        $be_to_xmltv{$CURLANG}{$be_id} = $xmltv_id;
451        warn "$where: $CURLANG XMLTV id $xmltv_id seen already\n"
452            if defined $xmltv_to_be{$CURLANG}{$xmltv_id};
453        $xmltv_to_be{$CURLANG}{$xmltv_id} = $be_id;
454        $extra_dn{$CURLANG}{$xmltv_id} = $extra_dn
455	    if ( defined $extra_dn && $extra_dn ne '' );
456        $logourl{$CURLANG}{$xmltv_id} = $logourl
457	    if ( defined $logourl && $logourl ne '' );
458        $ch_warn{$CURLANG}{$xmltv_id} = $ch_warn
459	    if ( defined $ch_warn && $ch_warn ne '' );
460    }
461}
462t 'xmltv_to_be: ' . d \%xmltv_to_be;
463t 'be_to_xmltv: ' . d \%be_to_xmltv;
464t 'extra_dn: ' . d \%extra_dn;
465t 'ch_warn: ' . d \%ch_warn;
466
467# Arguments for XMLTV::Writer.
468my %g_args = ();
469if (defined $opt_output) {
470    die "cannot have both --output and --configure\n" if $opt_configure;
471    my $fh = new IO::File ">$opt_output";
472    die "cannot write to $opt_output\n" if not $fh;
473    %g_args = (OUTPUT => $fh);
474}
475
476# Find the configuration file.  This grabber needs it even for listing
477# channels since the channels available depend on the language.
478#
479my $config_file
480  = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_be', $opt_quiet);
481
482if ($opt_configure) {
483    configure();
484    exit;
485}
486
487# Not configuring - need to read an existing config file.
488my @config_lines = XMLTV::Config_file::read_lines($config_file);
489
490# Read the configuration file for language option
491# language <F|D>
492foreach (@config_lines) {
493    ++ $line_num;
494    next if not defined;
495    my $where = "$config_file:$line_num";
496    if (/^language\s+(.+)/) {
497	if ( $1 eq $LANG_FR || $1 eq $LANG_NL) {
498	    $LANG=$1;
499	}
500	else {
501	    die "$where: invalid language defined in conf file\n";
502	}
503    }
504}
505die "language not defined in $config_file" if (not defined $LANG );
506
507if ( $LANG eq $LANG_NL ) {
508    die "Dutch language grabbing is no longer supported because Teveblad blocks XMLTV\n";
509}
510
511# Stuff at the top of any output XML.
512my $metadata = { 'source-info-url'     => "$BASE_URL{$LANG}",
513		 'source-info-name'    => "$DOMAIN{$LANG}",
514		 'generator-info-name' => 'XMLTV',
515		 'generator-info-url'  =>
516		 'http://xmltv.org/',
517	       };
518
519if ($opt_list_channels) {
520    # Could check usage here to see --days etc. were not specified but
521    # I can't be bothered.
522    #
523    my %channels = get_channels; # uses $LANG
524    my $writer = new XMLTV::Writer(%g_args, encoding => 'ISO-8859-1');
525    $writer->start($metadata);
526    $writer->write_channels(\%channels);
527    $writer->end;
528}
529else {
530    if ($opt_input) {
531	warn "grabbing from source HTML file: $opt_input for dummy channel ID la1.rtbf.be\n";
532    }
533    grab(\%g_args, \@config_lines);
534}
535
536printf (STDERR "Accessed %d web pages, downloaded %d Kb, duration %d secs\n",$numwebgets,$kbwebgets,time()-$statstarttime) unless $opt_quiet;
537exit();
538
539# Grab listings and write them in XML.  Parameters:
540#
541# ref to hash of arguments to be passed to XMLTV::Writer (but encoding
542#   is always ISO-8859-1),
543# ref to list of lines from config file.
544#
545sub grab( $$ ) {
546    my ($w_args, $config_lines) = @_;
547    my $writer = new XMLTV::Writer(%$w_args, encoding => 'ISO-8859-1');
548    my %write_channels; # to be written as <channel> elements
549
550    # FIXME turn into progress bar.
551    print STDERR "finding channels:\t" unless $opt_quiet;
552
553    my %channels;
554    if ($opt_input) {
555	# skip getting channels
556        my @dns = ([ "La Une", $LANG ], [ "La Une"] );
557        my $ch = { 'display-name' => \@dns,
558                   'id' => "la1.rtbf.be"};
559        t 'channel object: ' . d $ch;
560        $channels{"la1.rtbf.be"} = $ch;
561    }
562    else {
563	%channels = get_channels();
564    }
565    print STDERR "got " . (scalar keys %channels) . ", done.\n" unless $opt_quiet;
566
567    # Read the configuration file.  At present the lines must be one
568    # of the forms:
569    #
570    # channel <xmltv id> <fr:nl> [dodetail]
571    # language <fr|nl>
572    # detailgenere <regex>
573    # detailstartime <hh:mm>
574    # detailstoptime  <hh:mm>
575    # ALL
576    #
577    my $line_num = 1;
578    foreach (@$config_lines) {
579        ++ $line_num;
580        next if not defined;
581        my $where = "$config_file:$line_num";
582        if (/^channel\s+([^\s]+)\s+($LANG_FR|$LANG_NL)\s*([^\s]*)/) {
583            my $xmltv_id = $1;
584	    # $2 is grab language -- for future use when grabber
585	    # can simultaneously grab both languages
586	    die "$where: Specification of different Grab language currently not implemented"
587		unless ( $2 eq $LANG);
588
589            if (not defined $channels{$xmltv_id}) {
590                warn "$where: no channel with XMLTV id $xmltv_id, skipping\n";
591                next;
592            }
593            $write_channels{$xmltv_id} = $channels{$xmltv_id};
594	    if ( $3 eq "dodetail" ) {
595		$get_channel_detail{$xmltv_id} = 1;
596	    }
597        }
598        elsif (/^language\s+(.+)/) {
599            # already read -- ignore
600        }
601        elsif (/^detailstarttime\s+([0-9]{2}:[0-9]{2})$/) {
602            $detailstarttime=$1;
603        }
604        elsif (/^detailstoptime\s+([0-9]{2}:[0-9]{2})$/) {
605	    $detailstoptime=$1;
606	}
607        elsif (/^detailgenre\s+(.+)/) {
608	    # allow obsolete detailgeneres for ^$ and .*
609	    if ( $1 eq ".*" ) {
610		warn "obsolete detailgenre \"$1\"in config file -- replace with *ALL*";
611		push @detailgenre, "*ALL*";
612	    } elsif ( $1 eq "^\$" ) {
613		warn "obsolete detailgenre \"$1\" in config file -- replace with *NONE*";
614		push @detailgenre, "*NONE*";
615	    } else {
616		push @detailgenre, $1;
617	    }
618        }
619        else { die "$where: bad line: \"$_\"\n" }
620    }
621
622    t "channels to get detail for: " . d \%get_channel_detail;
623    t "detailstarttime=$detailstarttime" if ( $detailstarttime );
624    t "detailstoptime=$detailstoptime" if ( $detailstoptime );
625    t "genre(s) to get detail for: " . d \@detailgenre;
626
627    if ( $opt_slow )
628    {
629 die <<END
630You must reconfigure with --configure --slow to choose the programmes
631to get details for.
632END
633   if not defined $detailstarttime or not defined $detailstoptime;
634
635	# sanity check detail start time
636	if ( ( $detailstoptime ge "24:00" )
637	     || ( $detailstarttime ge "24:00" ) ) {
638	    die "Invalid detail start/stop time range in $config_file: $detailstarttime - $detailstoptime";
639	}
640    }
641    else
642    {
643	if ( $detailstoptime
644	     || $detailstoptime
645	     || @detailgenre
646	     || %get_channel_detail ) {
647	    say ( <<END
648WARNING: Config file contains settings for downloading detailed
649programme information, but --slow has not been specified on command
650line
651
652No detailed programme information will be downloaded
653END
654		  ) ;
655	}
656    }
657
658    # FIXME turn this into progress bar.
659    print STDERR "getting dates for which listings available:\t"
660      unless $opt_quiet;
661    my @available_dates;
662    if ($opt_input) {
663	# skip getting dates
664	@available_dates = ( $today );
665    }
666    else {
667	@available_dates = get_available_dates();
668    }
669    t 'available dates: ' . d \@available_dates;
670    die 'apparently, there are no days of listings on the site'
671      if not @available_dates;
672    print STDERR "got " . @available_dates .  ", done.\n" unless $opt_quiet;
673
674    my $is_available = sub( $ ) {
675        my $d = shift;
676        foreach (@available_dates) {
677            return 1 if not Date_Cmp($d, $_);
678        }
679        return 0;
680    };
681
682    my @dates_to_get;
683    for (my $d = $today; $is_available->($d); $d = DateCalc($d, '+ 1 day')) {
684        push @dates_to_get, $d;
685    }
686    die "listings for today ($today) not available" if not @dates_to_get;
687    my $last_day = $dates_to_get[-1];
688    foreach (@available_dates) {
689        if (Date_Cmp($last_day, $_) < 0) {
690            warn "strangely, day $_ is available but there are gaps before it";
691        }
692    }
693
694    if (defined $opt_days) {
695        if ($opt_days > @dates_to_get) {
696            warn 'only ' . (scalar @dates_to_get)
697              . ' days of consecutive listings available';
698        }
699        else {
700            @dates_to_get = @dates_to_get[0 .. $opt_days - 1];
701        }
702    }
703    my $days = @dates_to_get > 1 ? 'days' : 'day';
704    say('getting ' . (scalar @dates_to_get) . " $days of listings\n")
705	unless $opt_quiet;
706    say("(\"#\" indicates a program with summary info,\n" .
707        " \"@\" indicates a program with detailed info)\n")
708	unless ($opt_quiet || ! $opt_slow);
709    t 'getting dates:' . d \@dates_to_get;
710
711    $writer->start($metadata);
712
713    # get the listings for each date
714    my %categories;
715    my %prog_to_cat;
716    my @programmes;
717    foreach my $date (@dates_to_get) {
718        my @new_programmes;
719
720        foreach my $chan (sort keys %write_channels) {
721            #
722            my $dn = $write_channels{$chan}->{'display-name'};
723            my $name = XMLTV::best_name([ $LANG ], $dn)->[0];
724            $name = $chan if not defined $name;
725
726            # FIXME turn into progress bar.
727            print STDERR 'date ', UnixDate($date, '%Y%m%d'), ", channel $name:\t"
728                unless $opt_quiet;
729            push @new_programmes,
730            get_programmes($chan, $date, \%prog_to_cat,
731                           \%categories, \%channels, $opt_input);
732            print STDERR "\n" unless $opt_quiet;
733        }
734
735	# push the new channels into the completlist
736	push (@programmes, @new_programmes);
737    }
738
739    # die; # die here when debugging parser
740
741    # write out the xml
742    # write out the channels
743    $writer->write_channels(\%write_channels);
744
745    #write out the programmes
746    foreach (@programmes) {
747        foreach my $k (keys %$_) {
748            die "undef \$_->{ $k } in $_->{title}->[0]->[0]" if not defined $_->{$k};
749        }
750        $writer->write_programme($_);
751    }
752    $writer->end();
753}
754
755
756# Function to get a url.  This also seems like a sensible place to do
757# HTML-demoronizing.
758#
759sub get_url( $ ) {
760    my $url = shift;
761    t "getting URL: $url";
762    for (my $tmp = get_nice($url)) {
763        die "cannot get $url" if not defined;
764	$numwebgets++; #update stats
765	$kbwebgets+= (length $_)/1024;
766        tr/\221\222\226/''-/;
767        tr/\010//d;
768        # There could be other illegal chars
769        return $_;
770    }
771}
772
773
774# Function to find all the programmes on a channel (at a given date +
775# time).
776#
777# Parameters:
778#   XMLTV id of channel
779#   Date::Manip object giving date and time
780#   prog_to_cat hash (see elsewhere for details)
781#   categories hash
782#   channels hash
783#   filename of test HTML file (does not read from web if ! undef)
784#
785# Returns: list of programmes
786#
787sub get_programmes( $$$$$$ ) {
788    my $channel_xid = shift;
789    my $origtime = shift;
790    my $time = $origtime;
791    my $tomorrow = DateCalc($time, '+ 1 day');
792    my $prog_to_cat = shift;
793    my $categories = shift;
794    my $channels = shift;
795    my $testinput = shift;
796
797    my @p;
798
799    my $data;
800    my $url;
801    if ( $testinput )
802    {
803	$url=$testinput;
804	local(*INPUT, $/);
805	open (INPUT, $testinput) 	|| die "can't open $testinput: $!";
806	warn "using $testinput as data source";
807	$data = <INPUT>;
808    }
809    else
810    {
811	$url = "$BASE_URL{$LANG}$SUMMARY_PATH{$LANG}?move=full";
812	$url .= "&channel=" . xmltv_to_be($channel_xid);
813	$url .= '&dag=' . UnixDate($time, '%m/%d/%Y');
814
815	# FIXME commonize this
816	local $SIG{__WARN__} = sub {
817	    warn "$url: $_[0]";
818	};
819	local $SIG{__DIE__} = sub {
820	    die "$url: $_[0]";
821	};
822	eval {
823	    $data = get_url($url);
824
825	    # This check is mostly for the benefit of those using --cache.
826	    die 'strange, get_url() not supposed to return undef'
827		if not defined $data;
828	};
829	if ($@) {
830	    warn "could not get $url\n";
831	    my $from_time = UnixDate($time, '%Q');
832	    warn "not fetching any programmes for channel $channel_xid "
833		. "at $from_time\n";
834	    return ();
835	}
836    }
837    $data =~ tr/[\r\n]//d;
838    print STDERR '#' unless $opt_quiet;
839
840    my @results = ($data =~ /<tr>\s*<td[^>]*class='*tvnucontent'*.*?<td[^>]*class=programmabeschrijving[^>]*[^<]*<\/td>\s*<\/tr>/ig);
841    if (not @results) {
842        if ($data =~ /aucun programme ne correspond|geen programma\'s gevonden /) {
843            # Assume that this is because nothing is showing on that
844            # channel, not because the site is missing some data.
845            #
846        }
847        else {
848            warn "$url: no results found in HTML\n";
849        }
850        return ();
851    }
852
853    # used later in detecting when a program is in tomorrow
854    # defined here for performance
855    my $time_1400 = DateCalc($origtime, '+ 12 hours');
856    my $time_1000 = DateCalc($origtime, '+ 10 hours');
857
858    foreach (@results) {
859        t "\nresult: " . $_ . "\n";
860        my $PATH=$DETAIL_PATH{$LANG};
861        $PATH =~ s/([.?])/\\$1/g;
862	my $PAT = "'$PATH([^']+)'";
863        m/$PAT/i or die "\n$url: \n matching pattern $PAT cannot find progid in \n$_";
864        my $programmeId = $1;
865
866
867	my $progs = get_programme_summary($channel_xid,  $programmeId,$time, $_);
868	if (not $progs) {
869	    warn "$url: could not get programme $programmeId on channel $channel_xid\n";
870	}
871	elsif (not @$progs) {
872	    warn "$url: strange, $programmeId on channel $channel_xid seems to be empty";
873	}
874	else {
875
876	    # attempt to determine when we are in tomorrow
877	    #
878	    # the problem is that the listings for a 'day' actually range from
879	    # 06:00 of "today" to 06:00 of "tomorrow"
880	    #
881	    # to detect when "Today" has become "tomorrow", we use 2 checking methods:
882	    # 1) program starts before midnight and finishes after midnight...
883	    #    (this is determined in get_programme_summary: Stoptime is set to tomorrow)
884	    # 2) previous program stop time > 14:00; whereas this program's start time < 10:00
885	    #    ( this is detected below )
886	    #
887	    # this relies on programs being returned by get_programme_summary() in date order
888	    #
889	    t 'determining tomorrow for programmes: ' . d $progs;
890	    foreach ( @$progs ) {
891		my $lastexistprog=$p[$#p]; #ref to hash
892		my $latestprog=$_; # ref to hash
893		if ( $time ne $tomorrow ) {
894		    if ( ${$latestprog}{'stop'} &&
895			 Date_Cmp(${$latestprog}{'stop'},$tomorrow) >= 0 )
896		    {
897			#if stoptime in tomorrow, assume all future progs are also in tomorrow
898			t "Passing into the land of tomorrow A - stop time = ${$latestprog}{'stop'}";
899			$time=$tomorrow;
900		    }
901		    if ( Date_Cmp(${$latestprog}{'start'},${$lastexistprog}{'stop'}) < 0 ) {
902			# start/stop time overlap... possibly start time is tommorrow
903			# check this -- if start time < today 10:00 and previous stop time > today 14:00
904			# then this prog is probably in tomorrow!
905			if ( ${$lastexistprog}{'stop'} &&
906			     (Date_Cmp(${$latestprog}{'start'}, $time_1000) < 0
907			      && Date_Cmp(${$lastexistprog}{'stop'}, $time_1400) > 0 ) )
908			{
909			    # so we have a program that starts much earlier than the pervious program stops...
910			    # Methinks start time has passed into the world of tomorrow!
911			    ${$latestprog}{'start'}=utc_offset(DateCalc(${$latestprog}{'start'},' + 1 day') . " UTC", '+0100');
912			    die if not defined ${$latestprog}{'start'};
913			    t "Passing into the land of tomorrow B - start time = ${$latestprog}{'start'}";
914			    if (${$latestprog}{'stop'} &&
915				Date_Cmp(${$latestprog}{'start'}, ${$latestprog}{'stop'}) > 0) {
916				${$latestprog}{'stop'} = utc_offset(DateCalc(${$latestprog}{'stop'}, '+ 1 day') . " UTC", '+0100');
917                                die if not defined ${$latestprog}{'stop'};
918				t "Passing into the land of tomorrow C - stop time = ${$latestprog}{'stop'}";
919			    }
920
921			    $time=$tomorrow;
922			}
923		    }
924		}
925		else
926		{
927		    # $time eq $tomorrow
928
929		    # we are already in tomorrow, so start and stop
930		    # times must be in tomorrow range check this
931		    # (necessary for other progs retrieve as a clump
932		    # by get_programme_summary()
933
934		    if ( Date_Cmp(${$latestprog}{'start'}, $tomorrow) < 0 )  {
935			DateCalc(${$latestprog}{'start'}, '+ 1 day');
936		    }
937		    if ( ${$latestprog}{'stop'} &&
938			 Date_Cmp(${$latestprog}{'stop'}, $tomorrow) < 0 ) {
939			DateCalc(${$latestprog}{'stop'}, '+ 1 day');
940		    }
941		}
942		# check for simple start/stop time overlap
943		if ( not defined ${$lastexistprog}{'stop'}
944		     && ${$latestprog}{'start'} ) {
945		    ${$lastexistprog}{'stop'} = ${$latestprog}{'start'}
946		}
947
948		if ( ${$lastexistprog}{'stop'} &&
949		     Date_Cmp(${$latestprog}{'start'},${$lastexistprog}{'stop'}) < 0 ) {
950		    # start time a little before previous stop time
951		    # correct previous stop time
952		    warn "$url: correcting program overlap stop = ${$lastexistprog}{'stop'} -> ${$latestprog}{'start'};";
953		    ${$lastexistprog}{'stop'}=${$latestprog}{'start'};
954		}
955		push @p, $_;
956	    }
957	}
958    }
959    return @p;
960}
961
962# Function to parse the HTML and get all the info we need
963#
964# Parameters:
965#   XMLTV id of channel
966#   Sanoma id of programme
967#   Date::Manip object giving date and time
968#   bit of html text- section of table from website with the 2 table rows containing program information
969#
970# <tr>
971# <td class='tvnucontent' rowspan=2> </td>
972# <td class='tvnucontent' > STARTTIME </td>
973# <td class='tvnucontent' > STOPTIME </td>
974# <td class='tvnucontent' > <a href='detail.asp?progid=PROGID' class=tvnu> TITLE </a></td>
975# <td class='tvnuthema' > CATEGORY </td>
976# <td class='tvnucontent' >   </td>
977# </tr>
978# <tr>
979# <td class=programmabeschrijving> </td>
980# <td class=programmabeschrijving> </td>
981# <td colspan=2 class=programmabeschrijving> DESCRIPTION </td>
982# <td align=right valign='top' class=programmabeschrijving></td>
983# </tr>
984#
985# Returns a listref of programmes: normally with just one element,
986# note, _start, _stop will be set to the TIME when the program is set
987# the caller is responsable for converting these into date::manip objects
988# and deciding whether it is today or tomorrow
989#
990sub get_programme_summary( $$$$ ) {
991#    local $Log::TraceMessages::On = 1;
992    my $channel_xid = shift;
993    my $channelId = xmltv_to_be($channel_xid);
994    my $programmeId = shift;
995    my $date = shift;
996    my $summaryhtml = shift;
997
998
999    # @followons are small extra programmes sharing its slot.  Things
1000    # like news bulletins which come in the middle of a film are also
1001    # counted as 'after' it, for simplicity.
1002    #
1003    my @followons;
1004
1005    # %p is the main programme we will return.
1006    my %p;
1007    $p{channel} = $channel_xid;
1008    $p{_chanID} = $channelId;
1009    $p{_progID} = $programmeId;
1010
1011
1012
1013
1014    # take summaryhtml and extract starttime, stoptime, title, genre, and description
1015    my @nucontent = ($summaryhtml =~ /<td[^>]+class='*tvnucontent'*.*?<\/td>/ig);
1016
1017    # attempt to get star rating from title
1018    if ( $nucontent[3] =~ /<img[^>]*src=[\"\']*([^>]*stars\/)([0-9]{2})\.gif[\"\']*/i )
1019    {
1020	# star ratings are 00,10,15,20,25,30,35,40
1021        # 8 possible ratings: convert to 1->4
1022        my %rating = ( '00' => 0,
1023		       '10' => 1,
1024                       '15' => 1,
1025                       '20' => 2,
1026		       '25' => 2,
1027		       '30' => 3,
1028		       '35' => 3,
1029		       '40' => 4 );
1030        if ( defined $rating{$2} ) {
1031	    $p{'star-rating'}=[ $rating{$2} ."/4", [ { src => "$BASE_URL{$LANG}$1" . $rating{$2} . "0.gif" } ] ];
1032        }
1033	else
1034	{
1035	    warn "$programmeId: could not translate rating: $2 / 40"
1036	}
1037    }
1038    foreach ( @nucontent ) {
1039	$_=clean_html_text($_);
1040    }
1041    t "nucontent: " . d \@nucontent;
1042    if ( scalar(@nucontent) ge 4 ) {
1043
1044	# check for episodenum in title "Stargate SG-1 (4/13)"
1045	# or "Stargate SG-1 (4)"
1046	if ( $nucontent[3] =~ /(.+)\s+(\([0-9]+(\/[0-9]+){0,1}\))/i ) {
1047	    t "got subtitle in title - $1 -- $2 " . d \$3;
1048	    $p{'title'} = [ [ $1 ] ];
1049	    $p{'episode-num'} = [ [ $2 ] ];
1050	}
1051	else
1052	{
1053	    $p{'title'} = [ [ $nucontent[3] ] ];
1054	}
1055        if ( not defined $p{'title'}
1056	     or not defined  $p{'title'}[0]
1057	     or not defined $p{'title'}[0][0]
1058	     or $p{'title'}[0][0] eq '' ) {
1059            warn "$programmeId: No title defined... skipping programme";
1060	    return undef
1061        }
1062
1063	# strip trailing puctuation from title
1064	# making sure "E.R." does not become "E.R" in
1065	# the process!
1066	$p{'title'}[0][0] =~ s/([^.,:;]{2,})[.,:;]*$/$1/;
1067
1068        my ($start, $start_tz);
1069        my ($stop, $stop_tz);
1070        my $pair;
1071        $nucontent[1] =~ s/([0-2][0-9])\./$1:/;
1072        $nucontent[2] =~ s/([0-2][0-9])\./$1:/;
1073        $nucontent[1] =~ s/24:/00:/;
1074        $nucontent[2] =~ s/24:/00:/;
1075        if ( ! $nucontent[1] =~ /[0-2]*[0-9]:[0-5]*[0-9]/ ) {
1076            warn "$programmeId: No start time defined... skipping programme";
1077	    return undef
1078        }
1079        t "start time $nucontent[1], calling utc_offset()";
1080	$p{start} = utc_offset(UnixDate($date, '%Y-%m-%d') . " $nucontent[1]", '+0100');
1081	t "turned into $p{start}";
1082
1083	if ( $nucontent[2] =~ /[0-2]*[0-9]:[0-5]*[0-9]/ ) {
1084	     t "stop time $nucontent[2], calling utc_offset()";
1085	     $p{stop} = utc_offset(UnixDate($date, '%Y-%m-%d') . " $nucontent[2]", '+0100');
1086	     t "turned into $p{stop}";
1087
1088	     # Some programmes have thir stop time on the next day.  (This test
1089	     # may break when the timezones change.)
1090	     #
1091	     if (Date_Cmp($p{start}, $p{stop}) > 0) {
1092		 t 'put stop time a day later';
1093		 my $n = DateCalc($p{stop}, '+ 1 day');
1094		 t "DateCalc() gave: $n";
1095		 $p{stop} = utc_offset("$n +0000", '+0100');
1096		 t "stop time now $p{stop}";
1097		 die if not defined $p{stop};
1098	     }
1099	 }
1100    }
1101    else {
1102        warn "$programmeId: invalid number of columns for program, skipping: \n" . d \@nucontent;
1103	return undef;
1104    }
1105
1106    $p{url}= [ "$BASE_URL{$LANG}$DETAIL_PATH{$LANG}" . $programmeId ];
1107
1108
1109    my @thema = ($summaryhtml =~ /<td[^>]+class='*tvnuthema'*.*?<\/td>/ig);
1110    foreach ( @thema ) {
1111	$_=clean_html_text($_);
1112	push @{$p{category}}, [ $_ ] if ( $_ ne '' );
1113    }
1114    t "nuthema: " . d \@thema;
1115
1116    # match for getting detailed info
1117    my $do_get_details;
1118    if ( $opt_slow
1119	 && defined $get_channel_detail{$channel_xid} ) {
1120
1121	t "channel  selected for details";
1122	# check match for time range
1123	my $start_hhmm=$nucontent[1];
1124	if ( (
1125	       ( $detailstarttime lt $detailstoptime )
1126	       # normal time range
1127	       &&
1128	       ( $start_hhmm ge $detailstarttime
1129		 && $start_hhmm lt $detailstoptime )
1130	       )
1131	     ||
1132	     (
1133	       ( $detailstarttime ge $detailstoptime )
1134	       # inverted time range: 17:00-02:00 or similar
1135	       &&
1136	       ( ( $start_hhmm ge $detailstarttime
1137		   && $start_hhmm le "24:00" )
1138		 || ( $start_hhmm ge "00:00"
1139		      && $start_hhmm lt $detailstoptime )
1140		 )
1141	       )
1142	     ) {
1143	    t "time range selected for details";
1144	    # check for genre match
1145	  MATCHCATEG:
1146	    foreach my $testgenre ( @detailgenre ) {
1147		if ( $testgenre eq '*ALL*' ) {
1148		    # Magic value meaning always yes;
1149		    #
1150		    $do_get_details = 1;
1151		    last MATCHCATEG;
1152		}
1153		if ( $p{category} ) {
1154		    foreach my $categ ( @{@{$p{category}}} ) {
1155			t "comparing \"${$categ}[0]\" with \"$testgenre\"";
1156			if ( index(lc ${$categ}[0], lc $testgenre) != -1 ) {
1157			    $do_get_details=1;
1158			    last MATCHCATEG;
1159			}
1160		    }
1161		} elsif ( $testgenre eq '*NONE*') {
1162		    # Magic value meaning empty category;
1163		    #
1164		    $do_get_details = 1;
1165		    last MATCHCATEG;
1166		}
1167	    }
1168	}
1169    }
1170    if ( $do_get_details )
1171    {
1172	print STDERR '@' unless $opt_quiet;
1173	get_programme_detailed_info(\%p,$programmeId);
1174    }
1175    else
1176    {
1177	print STDERR '#' unless $opt_quiet;
1178    }
1179
1180    # if no description yet, get it here
1181    # -- handles the case where getting details is not defined,
1182    # or if getting details failed
1183    # or if details had no description for some reason.
1184    if ( not defined $p{'desc'} ) {
1185	my $imagedescr;
1186	my $description;
1187	my @programmabeschrijving = ($summaryhtml =~ /<td[^>]+class='*programmabeschrijving'*.*?<\/td>/ig);
1188	parse_programme_details(\%p, \@programmabeschrijving);
1189    }
1190    t ' proginfo: ' . d \%p;
1191
1192    return [ \%p, @followons ];
1193}
1194
1195my %unknownimages;
1196sub parse_programme_details($$) {
1197    my $p = shift;  # ref to %p defined in get_programme_summary
1198    my $detailstrings = shift; # ref to array of descr strings
1199
1200    my $description;
1201    my $imagedescr;
1202    foreach ( @{$detailstrings} ) {
1203	t "details " . d \$_;
1204	# handle translating images with alt-text
1205	my @images = ( /<img [^>]*>/ig );
1206	t "images: " . d \@images;
1207	foreach ( @images ) {
1208	    if ( /<img[^>]+src\s*=\s*[\'\"]([^\'\"]+)[\'\"]\s+alt\s*=\s*[\'\"]([^\'\"]+)[\'\"][^>]*>/i ) {
1209		my $imagepath=$1;
1210		if ( $imagepath =~ /\/gehoor.gif/i || $imagepath =~ /tt.gif/i) {
1211		    ${$p}{subtitles} = [ { type => 'teletext' } ];
1212		}
1213		elsif ( $imagepath =~ /\/16-9\.gif/i ) {
1214		    ${$p}{video}{aspect} = "16:9";
1215		}
1216		elsif ( $imagepath =~ /\/stereo\.gif/i ) {
1217		    ${$p}{audio}{stereo} = "stereo";
1218		}
1219		elsif ( $imagepath =~ /\/ov\.gif/i ) {
1220		    #  VO image, use language specific abbrev
1221		    my $text = $2;
1222		    if ( $LANG eq $LANG_FR ) {
1223			$text="VO";
1224		    } elsif ( $LANG eq $LANG_NL ) {
1225			$text="OV";
1226		    }
1227		    if ( defined $imagedescr ) {
1228			$imagedescr=$imagedescr . " (" . $text . ")";
1229		    }
1230		    else {
1231			$imagedescr="(" . $text . ")";
1232		    }
1233		}
1234		elsif ( $imagepath =~ /\/dolby\.gif/i ) {
1235		    ${$p}{audio}{stereo} = "surround";
1236		}
1237		elsif ( $imagepath =~ /\/black-white\.gif/i ) {
1238		    ${$p}{video}{colour} = 0;
1239		}
1240		elsif ( $imagepath =~ /\/tele-([0-9]+)\.gif/i ) {
1241		    # Age rating $1="10,12,16,18"
1242		    # Cert issuer cannot be determined as it depends on
1243		    # nationality of channel
1244		    if ( defined ${$p}{rating} )
1245		    {
1246			if ( defined ${$p}{rating}[0][0] ne $1 )
1247			{
1248			    warn "${$p}{_progID}: already seen different certificate";
1249			    push @{${$p}{rating}}, [ $1, undef, [ { src => "$BASE_URL{$LANG}$imagepath" } ] ];
1250			}
1251		    }
1252		    else {
1253			push @{${$p}{rating}}, [ $1, undef, [ { src => "$BASE_URL{$LANG}$imagepath" } ] ];
1254		    }
1255		}
1256		elsif ( $imagepath =~ /\/premiere\.gif/i ) {
1257		    ${$p}{new} = "new";
1258		}
1259		else {
1260		    warn "${$p}{_progID}: Unknown info image ($imagepath), putting alt-text into description: \"$2\"" unless $unknownimages{$imagepath}++;
1261		    # unknown image, use alt-text
1262		    if ( defined $imagedescr ) {
1263			$imagedescr=$imagedescr . " (" . $2 . ")";
1264		    }
1265		    else {
1266			$imagedescr="(" . $2 . ")";
1267		    }
1268		}
1269	    }
1270	}
1271	$_=clean_html_text($_);
1272	if ( $_ ne '' ) {
1273	    if ( defined $description ) {
1274		$description=$description . ' ' . $_;
1275	    }
1276	    else {
1277		$description=$_;
1278	    }
1279	}
1280    }
1281    if ( $imagedescr )
1282    {
1283	if ( defined $description ) {
1284	    $description=$description . ' ' . $imagedescr;
1285	}
1286	else {
1287	    $description=$imagedescr;
1288	}
1289    }
1290    if ( defined $description ) {
1291	# check for epsiode num in description
1292	# "something (4/13)"
1293	if ( $description =~ /(.+) (\([0-9]+\/[0-9]+\))/ ) {
1294	    if ( not defined ${$p}{'episode-num'} ) {
1295		${$p}{'episode-num'} = [ [ $2 ] ] ;
1296	    }
1297	}
1298	# look for date in description if not already found
1299	# "something (19xx)" or "something (20xx)"
1300	# ONLY MATCHES dates in years 19xx  and 20xx
1301	if ( not defined ${$p}{'date'} ) {
1302	    if ( $description =~ m/(.+) \(((19|20)[0-9][0-9])\)/ ) {
1303		t "got year $2 in descr $description";
1304		${$p}{'date'}=$2;
1305	    }
1306	}
1307	# if film, look for director in description
1308	if ( ( not defined ${$p}{'director'})
1309	     && ${$p}{'category'}
1310	     && ${$p}{'category'}->[0]
1311	     && ${$p}{'category'}->[0]->[0]
1312	     && ${$p}{'category'}->[0]->[0] =~ /film/i  )
1313	{
1314	    # description is something like
1315	    # FR: thriller de John Doe.
1316	    # NL: thriller van John Doe
1317	    my $whomatchregexp;
1318	    if ( $LANG eq $LANG_FR ) {
1319		$whomatchregexp="(de |d'|par )";
1320	    } elsif ( $LANG eq $LANG_NL ) {
1321		$whomatchregexp="(van )";
1322	    }
1323	    # match "John Doe."
1324	    # match "John H. Doe."
1325	    # match "John Howard Doe."
1326	    # match "Jean-Dominique de La Rochefoucauld" (!)
1327	    # so name matching regexp is (CHAR. |chars )(repeated) CHARS.
1328	    if ( $description =~
1329		 m/.*? $whomatchregexp(([A-Z]\. |[A-Za-z�-�\-]{2,} )+[A-Z�-�a-z\-]{2,})\./ ) {
1330		t "got director $2 in descr $description";
1331		push @{${p}->{credits}->{director}}, $2;
1332	    }
1333	}
1334
1335	# check for duration in description (French only so far --
1336	# can't seem to find equivalent in Flemish listings
1337	if ( $LANG eq $LANG_FR
1338	     && $description =~ m/(.*) *Dur�e: +([0-9]+)\'[. ]*(.*)/i )
1339	{
1340	    ${$p}{'length'} = $2 * 60;
1341
1342	    $description = $1;
1343	    $description .= " " . $3 if ( $3 );
1344	}
1345
1346	# check for preogramme is a repeat flag in description
1347	# match (R. something) at *end* of text
1348	# will match:
1349	#   (R. du film de la soir�e)
1350	#   (R. de samedi)
1351	#   (R. d'hier)
1352	#   (R.)
1353	# will not match:
1354	#  (R): 5 V.O., 11, 13, 20 V.O., 25
1355	#  (R. � 17.30 et 24.00)
1356        # which is Canal Plus' future showing dates and MCM's future
1357        # showing times
1358	if ( $description =~ m/(.*) *\(R\.\)[ .]*$/ )
1359	{
1360	    # Plain (R.)  with no extra info.
1361	    # Strip (R.)
1362	    ${$p}{'previously-shown'} = {};
1363	    $description = $1;
1364	}
1365	elsif ( $description =~ m/\(R\. [^�][^\)]+\)[ .]*$/ )
1366	{
1367	    # Repeat with extra info keep info
1368	    ${$p}{'previously-shown'} = {};
1369	}
1370
1371
1372	# Compare one-word descriptions to categories and strip if matched
1373	$description =~ s/ +$//;
1374	if ( $description ne ""
1375	     && $description !~ m/[ ()-]/ ) {
1376	    $description =~ s/[ -_\.,]*$//;
1377	    if ( ${$p}{category} ) {
1378	      MATCHCATEG:
1379		foreach my $categ ( @{@{${$p}{category}}} ) {
1380		    t "stripping desc -- duplicate categ \"${$categ}[0]\" -- \"$description\"";
1381		    if ( index(lc ${$categ}[0], lc $description) != -1 ) {
1382			# Desc is subsrtring of category... strip desc
1383			$description = undef;
1384			last MATCHCATEG;
1385		    }
1386		    elsif ( index(lc $description, lc ${$categ}[0]) != -1 ) {
1387			# category is sub-string of desc... move desc to categoty
1388			push @{${$p}{category}}, [ $description ];
1389			$description = undef;
1390			last MATCHCATEG;
1391		    }
1392
1393		}
1394	    }
1395	}
1396
1397
1398
1399
1400	# Short descr: prepend to desc if present
1401	if ( $description ) {
1402	    if (${$p}{'desc'}
1403		&& ${$p}{'desc'}->[0]
1404		&& ${$p}{'desc'}->[0]->[0] )
1405	    {
1406		${$p}{'desc'}->[0]->[0] = $description . ' ' . ${$p}{'desc'}->[0]->[0];
1407	    }
1408	    else
1409	    {
1410		${$p}{'desc'} = [ [ $description, $LANG ] ];
1411	    }
1412	}
1413    }
1414}
1415
1416
1417
1418
1419my $warned_discarding_parts;
1420my %warn_others;
1421sub get_programme_detailed_info($$) {
1422    my $p = shift; # ref to %p defined in get_programme_summary
1423    my $programmeId = shift;
1424    my $data;
1425
1426    # if this func fails, warn, and undef ${$p}{'desc'} -- the
1427    # get_programme_summary will get description from the summary
1428    t "getting details for ${$p}{'title'}->[0]->[0] at ${$p}{'start'}";
1429
1430    my $url = "$BASE_URL{$LANG}$DETAIL_PATH{$LANG}" . $programmeId;
1431
1432    # FIXME commonize this
1433    local $SIG{__WARN__} = sub {
1434	warn "$url: $_[0]";
1435    };
1436    local $SIG{__DIE__} = sub {
1437	die "$url: $_[0]";
1438    };
1439    eval {
1440	$data = get_url($url);
1441
1442	# This check is mostly for the benefit of those using --cache.
1443	die 'strange, get_url() not supposed to return undef'
1444	    if not defined $data;
1445    };
1446    if ($@) {
1447	warn "could not get $url\n";
1448	my $from_time = UnixDate(${$p}{'start'}, '%Q');
1449	warn "not fetching detailed info for programme ${$p}{'title'}->[0]->[0] "
1450            . "for channel ${$p}{'channel'}"
1451	    . "at $from_time\n";
1452	return ();
1453    }
1454    $data =~ tr/[\r\n]//d;
1455
1456    # details are in table with rows:
1457    # <tr>
1458    # <td class=detailtitels valign='top' nowrap> DETAILTYPE </td>
1459    # <td class=detailtitels valign='top' nowrap>:</td>
1460    # <td class=detailcontent valign='top' width='100%'> DETAIL DESCRIPTION </td>
1461    # </tr>
1462
1463
1464    my @results = ($data =~/<tr>\s*<td class=\'*detailtitels .*?<\/tr>/ig);
1465
1466    t "results" . d \@results;
1467    my @detailstringsarr;
1468
1469    foreach (@results) {
1470	my $detailtype;
1471	my $detailcontent;
1472	if ( m/<td\s+class=\'*detailtitels[^>]+>([^<]+)<\/td>/i ) {
1473	    $detailtype=clean_html_text($1);
1474	}
1475	if (m/<td\s+class=\'*detailcontent[^>]+>(.*?)<\/td>\s*<\/tr>/i) {
1476	    $detailcontent=$1;
1477	}
1478
1479	t "detailtype = $detailtype, detailcontent = " . d \$detailcontent;
1480
1481	if ( not defined $detailtype ) {
1482	    warn "$url: Could not extract details from $_";
1483	}
1484	elsif ( not defined $detailcontent ) {
1485	    warn "$url: Could not extract details from $_";
1486	}
1487	elsif ( $detailtype =~ /^(la cha�ne|zender)$/i
1488		|| $detailtype =~ /^(la date|datum)$/i
1489		|| $detailtype =~ /^(le d[e�]but|begintijd)$/i
1490		|| $detailtype =~ /^(la fin|eindtijd)$/i
1491		|| $detailtype =~ /^(le )*genre$/i ) {
1492	    # already handled
1493	}
1494	elsif ( $detailtype =~ /^(info)$/i ) { # description, including images
1495	    push @detailstringsarr, $detailcontent ;
1496	}
1497	elsif ($detailtype =~ /^(Divers)$/i ){ # dometimes contains (R.) info
1498	    push @detailstringsarr, $detailcontent ;
1499	}
1500	elsif ( $detailtype =~ /^(inhoud|contenu)$/i ) { # detailed description
1501	    $detailcontent=clean_html_text($detailcontent);
1502	    if ( ${$p}{'desc'}
1503		 && ${$p}{'desc'}->[0]
1504		 && ${$p}{'desc'}->[0]->[0] )
1505	    {
1506		${$p}{'desc'}->[0]->[0] = ${$p}{'desc'}->[0]->[0] . ' ' . $detailcontent;
1507	    }
1508	    else
1509	    {
1510		${$p}{'desc'} = [ [ $detailcontent, $LANG ] ];
1511	    }
1512	}
1513	elsif ($detailtype =~ /^(Acteurs)$/i ) {
1514	    # actor (part), actor (part) ea
1515	    $detailcontent=clean_html_text($detailcontent);
1516	    # remove gastactor:  gastactrice: de stemming van: etc
1517	    $detailcontent =~ s/[.,:; ]*[^,:]+:/, /i;
1518
1519	    # remove 'and'
1520	    $detailcontent =~ s/\s+en\s+/, /i if ($LANG eq $LANG_NL);
1521	    $detailcontent =~ s/\s+et\s+/, /i if ($LANG eq $LANG_FR);
1522
1523	    # remove 'e.a.'
1524	    $detailcontent =~ s/\s+e\.a\.\s*$//i;
1525
1526	    # add a comma at the end for easy parsing later!
1527	    $detailcontent =~ s/\s*,*\s*$/,/i;
1528
1529	    # process each "actor (part)*," block
1530	    foreach ( $detailcontent =~ /[^,;]+/g ) {
1531		if ( m/\s*([^\(,]+?)\s+\(([^\),]+)\)[\s,;]*/ ) {
1532		    t "actor $_ => $1 -- $2";
1533		    #$1 = actor, $2 = part
1534		    warn "discarding information about the parts played by each actor\n"
1535			unless $warned_discarding_parts++;
1536
1537		    push @{${p}->{credits}->{actor}}, $1;
1538		} else {
1539		    s/^\s+//; s/\s+$//;
1540		    t "actor $_ -- (no part)";;
1541		    push @{${p}->{credits}->{actor}}, $_;
1542		}
1543	    }
1544	}
1545	elsif ($detailtype =~ /^(Jaar|Ann�e)$/i ) {
1546	    $detailcontent=clean_html_text($detailcontent);
1547	    ${$p}{'date'}=$detailcontent;
1548	}
1549	elsif ($detailtype =~ /^(land|pays)$/i ){
1550	    $detailcontent=clean_html_text($detailcontent);
1551	    ${$p}{'country'}=[ [ $detailcontent,  $LANG ] ];
1552	}
1553 	elsif ($detailtype =~ /^(Aflevering|�pisode)$/i ){
1554 	    $detailcontent=clean_html_text($detailcontent);
1555 	    # epsiode number assign to subtitle if not already defined
1556	    # and to episode num
1557 	    ${$p}{'episode-num'} = [ [ $detailcontent ] ] ;
1558 	}
1559	elsif ($detailtype =~ /^(Afleveringstitel|Titre de l\'�pisode)$/i ){
1560	    $detailcontent=clean_html_text($detailcontent);
1561	    # episode name
1562	    ${$p}{'sub-title'} =  [ [ $detailcontent,  $LANG ] ];
1563	}
1564	else {
1565	    warn "found unknown details tag $detailtype" unless $warn_others{$detailtype}++;
1566	}
1567    }
1568    parse_programme_details($p, \@detailstringsarr) if ( @detailstringsarr );
1569}
1570
1571
1572my %warn_windowschars;
1573sub clean_html_text( $ ) {
1574    local $_ = shift;
1575    t "original string" . d \$_;
1576    # br to newline
1577    s/<br[^>]*>/\r\n/g;
1578    # remaining tags to spaces
1579    s/<\/*[^>]*>/ /g;
1580
1581    # decode any HTML special chars (&amp; &nbsp;)
1582    decode_entities($_);
1583    # note &nbsp; -> \240 -> space
1584
1585
1586    # get rid of known Windows encoded characters
1587    # silly windows characters to simple quotes
1588    tr/\221\222\223\224\226\227/\'\'\"\"\-\-/;
1589    tr/\010//d;
1590
1591    # replace invalid windows chars oe ligatures
1592    s/\234/oe/g;
1593    s/\214/OE/g;
1594    # replace windows' "..." character
1595    s/\205/.../g;
1596
1597    foreach ( m/[\200-\237]/g ) {
1598	warn "stripping invalid windows character (" . ord($_) . " - $_) from input: $_" unless $warn_windowschars{ord($_)}++;
1599    }
1600    s/[\200-\237]/\?/g;
1601
1602    # multiple spaces to one space
1603    s/[\240\s]+/ /g;
1604    # trim leading and trailing spaces
1605    s/^\s+//;
1606    s/\s+$//;
1607    t "cleaned string" . d \$_;
1608    return $_;
1609}
1610
1611# Function which will locate all the available channels and return a hash
1612# with channelId as the key and a channel description.
1613#
1614sub get_channels() {
1615    my $data;
1616    eval {
1617        $data = get_url("$BASE_URL{$LANG}$SUMMARY_PATH{$LANG}");
1618        die 'strange, get_url() not supposed to return undef'
1619          if not defined $data;
1620    };
1621    if ($@) {
1622        die "could not get channels page $BASE_URL{$LANG}$SUMMARY_PATH{$LANG}, aborting\n";
1623    }
1624    $data =~ tr/\n\r/\n/ds;
1625    t 'got channels page: ' . d $data;
1626    $data =~ s/\n//g;
1627    $data =~ /<select class=PersoFormSelect size='1' name='channel'[^>]*?>(.*?)<\/select>/
1628      or die "cannot find channel string in HTML $data";
1629    my $channel_string = $1;
1630    t 'got string of channels: ' . d $channel_string;
1631    $channel_string =~ s/\s+/ /g;
1632    $channel_string =~ s/<option value=''[^>]*>[^<]*//ig;
1633    t 'cleanedup string of channels: ' . d $channel_string;
1634    my @channels = ($channel_string =~ /<option value='[^\']+'[^>]*>[^<]*/ig);
1635    t 'channels in string: ' . d @channels;
1636    warn "no channels found in $channel_string" if not @channels;
1637    my %c;
1638
1639    foreach (@channels) {
1640        t 'doing channel string: ' . d $_;
1641        m/'([^\']+)'/ or die "cannot find sanoma channel id in $_";
1642        my $channelId = $1;
1643        t 'got sanoma id: ' . d $channelId;
1644        m/>(.*)/ or die "cannot find channel description in $_";
1645        my $channelDesc = $1;
1646        for ($channelDesc) {
1647            s/^\s+//; s/\s+$//;
1648        }
1649        t 'got description: ' . d $channelDesc;
1650        my $chanID_to_output = be_to_xmltv($channelId);
1651        t 'XMLTV id to use: ' . d $chanID_to_output;
1652        die if not defined $chanID_to_output;
1653        die if not defined $channelId;
1654
1655        my @dns = ([ $channelDesc, $LANG ]);
1656        my $extra_dn = $extra_dn{$LANG}{$chanID_to_output};
1657        push @dns, [ $extra_dn ] if defined $extra_dn;
1658	my $ch = { 'display-name' => \@dns,
1659		    'id' => $chanID_to_output, };
1660
1661	${$ch}{'icon'} = [ { src => "http://" . $logourl{$LANG}{$chanID_to_output} } ]
1662	    if ( defined  $logourl{$LANG}{$chanID_to_output} );
1663        t 'channel object: ' . d $ch;
1664        $c{$chanID_to_output} = $ch;
1665        t "added to channels hash under key $chanID_to_output";
1666    }
1667
1668    t 'returning hash: ' . d \%c;
1669    return %c;
1670}
1671
1672
1673# Function which will locate all the available dates and return a list
1674# of Date::Manip objects, one for each day.
1675#
1676# (I was tempted to make this a hash (so you could say $available{$d}
1677# to see if a day exists) but string equality is a bit dirty for
1678# comparing two Date::Manip objects.  There needs to be a tied hash
1679# class which can use a specified equality operation.)
1680#
1681sub get_available_dates() {
1682    my @r;
1683    my $url = "$BASE_URL{$LANG}$SUMMARY_PATH{$LANG}?channel=$DATE_CH{$LANG}";
1684    my $data;
1685    eval {
1686        $data = get_url($url);
1687        die 'strange, get_url() not supposed to return undef'
1688          if not defined $data;
1689    };
1690    if ($@) {
1691        die "could not get $url, so cannot find available dates, aborting\n";
1692    }
1693
1694    $data =~ s/\n//g;
1695    $data =~ /<select class=PersoFormSelect size='1' name='dag'[^>]*?>(.*?)<\/select>/
1696      or die "cannot find searchDate string in HTML $data";
1697    local $_ = $1;
1698    s/&nbsp;/ /g;
1699    s/\s+/ /g;
1700    s/^\s*//;
1701    t 'date string: ' . d $_;
1702    while (length) {
1703        if (not s!<option value='(\d{1,2})/(\d{1,2})/(\d{4})'[^>]*>[^<]*</option>\s*!!i) {
1704            warn "remnant junk in date string: $_";
1705            return @r;
1706        }
1707
1708        my $val = "$1/$2/$3";
1709        my $text = $4;
1710
1711        my $date_from_val = "$3-$1-$2";
1712        my $parsed_val = parse_date($date_from_val);
1713
1714        push @r, $parsed_val;
1715    }
1716    return @r;
1717}
1718
1719sub be_to_xmltv( $ ) {
1720    my $n = shift;
1721    die "undef \$LANG" if not defined $LANG;
1722    if (not defined $be_to_xmltv{$LANG}{$n}) {
1723        my $new = (lc $n) . ".$DOMAIN{$LANG}";
1724        $new =~ s/ //g;
1725        warn "$DOMAIN{$LANG} Channel id $n not found in channel_ids_${LANG} file, assuming XMLTV id $new\n";
1726        die "channel id $new already exists" if defined $xmltv_to_be{$LANG}{$new};
1727        $be_to_xmltv{$LANG}{$n} = $new;
1728        $xmltv_to_be{$LANG}{$new} = $n;
1729    }
1730    return $be_to_xmltv{$LANG}{$n};
1731}
1732sub xmltv_to_be( $ ) {
1733    my $x = shift;
1734    die "undef \$LANG" if not defined $LANG;
1735    for ($xmltv_to_be{$LANG}{$x}) {
1736        die "no $DOMAIN{$LANG} id known for $x" if not defined;
1737        return $_;
1738    }
1739}
1740
1741
1742# Ask the user which channels to download, and write $config_file.
1743#
1744# Uses global %channels hash.
1745#
1746sub configure() {
1747#    local $Log::TraceMessages::On = 1;
1748
1749    # only lots of page fetches in slow mode!
1750    if ($opt_slow && not ask_boolean( <<END
1751Warning: this grabber requires a large number of page fetches from a
1752human-readable website.
1753
1754Proceed with configuration?
1755END
1756				     , 0)) {
1757	say("Exiting.\n");
1758	exit 0;
1759    }
1760
1761    XMLTV::Config_file::check_no_overwrite($config_file);
1762
1763    # FIXME need to make directory
1764    open(CONF, ">$config_file") or die "cannot write to $config_file: $!";
1765    print CONF <<END
1766\#
1767\# tv_grab_be config file.
1768\#
1769\# Format is:
1770\# language <$LANG_FR|$LANG_NL>
1771\# detailstarttime <24hr clock time>
1772\# detailstoptime <24hr clock time>
1773\# detailgenre <genre regexp>    \#- may be repeated mutiple times
1774\# channel <xmltv_id> <fr|nl> [dodetail] \#- may be repeated mutiple times
1775\#
1776END
1777;
1778
1779    for (;;) {
1780        my $in = ask_choice('Enter the language required (Note: dutch is no longer supported)','French', ('French'));
1781
1782	die "could not read answer\n" if not defined $in;
1783	# handle backspace (^H)
1784	$in =~ s/.\x08//g;
1785        $in = uc $in;
1786        if ( $in eq 'FRENCH' ) {
1787            $LANG=$LANG_FR;
1788            last;
1789        }
1790        elsif ( $in eq 'DUTCH' ) {
1791            say("Dutch language grabbing is no longer supported, because Teveblad blocks xmltv!.");
1792            last;
1793        }
1794        else {
1795            say("'$in' is not 'French', try again!.");
1796        }
1797    }
1798    print CONF <<END
1799
1800\#
1801\# definition of language mode: $LANG_FR or $LANG_NL
1802\#
1803END
1804;
1805    print CONF "language $LANG\n";
1806
1807    if ( $opt_slow )
1808    {
1809
1810	say(<<END
1811Configuring with --slow:
1812
1813Detailed information grabbing will require 1 web page get for every
1814program. This is slow, hard work on the web-server, and may upset the
1815listings provider...
1816
1817To limit this, there are three selection critera:
1818Time range (only programs between 16:00  and 00:00)
1819category (only Series, magazines, films, telefilms)
1820Channel (only get detailed info for La Une, La Deux, KA2 and VT4)
1821
1822If all citeria match, then program detail will be obtained (program
1823will show up as \@ instead of \# on progress bar)
1824
1825Note: for time range, the early hours of the morning are assumed to be in the same 'day' as the late ours of the night, so 17:00-02:00 is a valid range.
1826
1827Note: for category, regular expressions are allowed.
1828END
1829	    );
1830
1831	# Time range loop:
1832	my $starttime;
1833	my $stoptime;
1834      TIME_RANGE_LOOP:
1835	for (;;) {
1836	  START_LOOP: for (;;) {
1837	      $starttime = ask("Enter a starting time for grabbing detail (24h format [17:00])");
1838	      die "could not read answer\n" if not defined $starttime;
1839	      if ( $starttime eq "" ) {
1840		  $starttime="17:00";
1841		  last START_LOOP;
1842	      }
1843	      else {
1844		  $starttime =~ s/^\s+//; $starttime =~ s/\s+$//;
1845		  if ( $starttime =~ /^([0-9]{2}):([0-9]{2})$/
1846		       && $1 ge 0 && $1 lt 24
1847		       && $2 ge 0 && $2 lt 60 ) {
1848		      last START_LOOP;
1849		  }
1850		  say ( "Invalid time format: $starttime");
1851	      }
1852	  }
1853
1854	  STOP_LOOP:
1855	    for (;;) {
1856		$stoptime = ask("Enter an ending time for grabbing detail (24h format: [02:00])");
1857		die "could not read answer\n" if not defined $stoptime;
1858		if ( $stoptime eq "" ) {
1859		    $stoptime="02:00";
1860		    last STOP_LOOP;
1861		}
1862		else {
1863		    $stoptime =~ s/^\s+//; $stoptime =~ s/\s+$//;
1864		    if ( $stoptime =~ /^([0-9]{2}):([0-9]{2})$/
1865			 && $1 ge 0 && $1 lt 24
1866			 && $2 ge 0 && $2 lt 60 ) {
1867			last STOP_LOOP;
1868		    }
1869		    say ( "Invalid time format: $stoptime");
1870		}
1871	    }
1872	    last TIME_RANGE_LOOP;
1873	}
1874	print CONF <<END
1875
1876\#
1877\# definition of start and stop times for retrieving detailed
1878\# information for programmes. Times must be in 24 hour clock
1879\# and may overlap a day bounday (eg 17:00 - 02:00)
1880\#
1881END
1882;
1883	print CONF "detailstarttime $starttime\n";
1884	print CONF "detailstoptime $stoptime\n";
1885
1886	my $example_categs = "";
1887	if ( $LANG eq $LANG_FR ) {
1888	    $example_categs=
1889		"actualit�, court m�trage, divertissement, documentaire, enfant,\n" .
1890		"film, football, jeu, journal, magazine, musique, sport, s�rie,\n" .
1891		"talkshow, th��tre, t�l�film.";
1892	}
1893	if ( $LANG eq $LANG_NL ) {
1894	    $example_categs=
1895		"actua, documentaire, film, kinderprogramma, miniserie, muziek,\n" .
1896		"nieuws, quiz, serie, soap, spelprogramma, sport, talkshow,\n" .
1897		"tekenfilm, tekenfilm kind, tvfilm, wielrennen.";
1898	}
1899	say(<<END
1900Enter a list of program categories (genres) These will be sub-string
1901matched against the Genre column on the daily channel listing page of
1902TeleMoustique/TeveBlad.
1903
1904Example categories are:
1905$example_categs
1906
1907eg: *ALL* -- match all categories (use with care!)
1908    *NONE* -- match completely blank categories.
1909    film -- will match "tvfilm", "telefilm", as well as "film"
1910END
1911	    );
1912
1913	print CONF <<END
1914
1915\#
1916\# definition of genres/category substrings to get detailed information for
1917\# multiple detailgenre lines can be defined
1918\# eg: *ALL* -- match all categories (use with care!)
1919\#    *NONE* -- match completely blank categories.
1920\#    FILM -- will match tvfilm, telefilm, as well as film
1921\#
1922END
1923;
1924	for (;;) {
1925	    my $in = ask(<<END
1926Enter a Genre, or "." to finish:
1927END
1928			 );
1929	    # interpret EOF as '.'
1930	    last if not defined $in;
1931	    # handle backspace (^H)
1932	    $in =~ s/.\x08//g;
1933	    $in = uc $in;
1934	    last if $in eq '.';
1935	    if ( $in eq "" || $in eq '^$' ) {
1936		say ('Ignoring empty input: Use "*NONE*" to match a blank category');
1937		$in="";
1938	    }
1939	    if ( $in =~ m/[]/ ) {
1940		say('control characters not allowed -- try again');
1941		$in="";
1942	    }
1943	    print CONF "detailgenre $in\n" if ( $in ne "" );
1944	}
1945    }
1946    else
1947    {
1948
1949	# Slow mode not specified... Write dummy config comments, and
1950	print CONF <<END
1951\#
1952\# definition of start and stop times for retrieving detailed
1953\# information for programmes. Times must be in 24 hour clock
1954\# and my overlap a day bounday (eg 17:00 - 02:00)
1955\#
1956\# Configured without --slow flag; detailstarttime and detailstoptime not specified
1957
1958\#
1959\# definition of genres/category substrings to get detailed information for
1960\# multiple detailgenre lines can be defined
1961\# eg: *ALL* -- match all categories (use with care!)
1962\#    *NONE* -- match completely blank categories.
1963\#    FILM -- will match tvfilm, telefilm, as well as film
1964\#
1965\# Configured without --slow flag; detailgenre list not specified
1966END
1967;
1968
1969	# print out a message
1970	say( <<END
1971The Default configuration for this grabber is to only grab the summary
1972information for programmes. (channel/start/title/brief description)
1973
1974If you want detailed information (episode name, detailed description,
1975actors) then you must re-configure and run this grabber with the
1976--slow option
1977END
1978	     );
1979    }
1980
1981    # FIXME turn into progress bar.
1982    print STDERR "finding channels from $DOMAIN{$LANG} :\t";
1983    my %channels = get_channels();
1984    print STDERR "got " . (scalar keys %channels) . ", done.\n";
1985
1986    my %chose_ch;
1987    t 'channels: ' . d \%channels;
1988
1989    print CONF <<END
1990
1991\#
1992\# definition of channels to grab, and whether to grab detailed info for the channel
1993\# multiple channel lines can be defined as:
1994\#   channel xmltv.channel.id language dodetail
1995\# or (if no detail required)
1996\#   channel xmltv.channel.id
1997\#
1998\# where language is fr or nl (for future use: currently ignored)
1999\#
2000END
2001;
2002    # nielm 25/4/2007 convert to ask_many_boolean
2003    my @questions;
2004    my @chan_ids=keys %channels;
2005    t 'channel ids: ' . d \@chan_ids;
2006    @chan_ids=sort {$channels{$a}->{'display-name'}->[0]->[0] cmp $channels{$b}->{'display-name'}->[0]->[0] } keys %channels;
2007    t 'sorted channel ids: ' . d \@chan_ids;
2008
2009    foreach my $k (@chan_ids) {
2010        push @questions, "Add channel ".$channels{$k}->{'display-name'}->[0]->[0]."? ";
2011    }
2012    t 'questions ' . d \@questions;
2013    my @answers = ask_many_boolean(0, @questions);
2014    t 'answers ' . d \@answers;
2015
2016    for (my $i=0; $i < $#answers; $i++) {
2017        if ($answers[$i]) {
2018	    my $xmltv_id=$chan_ids[$i];
2019	    t 'selected chanel '.$i . ' id ' .  $xmltv_id . d \$channels{$xmltv_id};
2020
2021	    if ( defined $ch_warn{$LANG}{$xmltv_id} ) {
2022		if (ask_boolean( <<END
2023Warning for $channels{$xmltv_id}->{'display-name'}->[0]->[0] :
2024$ch_warn{$LANG}{$xmltv_id}
2025
2026Confirm add channel: $channels{$xmltv_id}->{'display-name'}->[0]->[0] ?
2027END
2028				 ,0)){
2029
2030		    if ($opt_slow
2031			&& ask_boolean(
2032				       'Get detailed info for channel '
2033				       .$channels{$xmltv_id}->{'display-name'}->[0]->[0]
2034				       .'?'
2035				       ,0)) {
2036			print CONF "channel $xmltv_id $LANG dodetail\n";
2037		    }
2038		    else {
2039			print CONF "channel $xmltv_id $LANG\n";
2040		    }
2041		}
2042	    }
2043	    else
2044	    {
2045		$chose_ch{$xmltv_id}++;
2046		if ($opt_slow
2047		    && ask_boolean(
2048				   'Get detailed info for channel '.
2049				   $channels{$xmltv_id}->{'display-name'}->[0]->[0]
2050				   .'?'
2051				   ,1)) {
2052		    print CONF "channel $xmltv_id $LANG dodetail\n";
2053		}
2054		else {
2055		    print CONF "channel $xmltv_id $LANG\n";
2056		}
2057	    }
2058        }
2059    }
2060
2061    close CONF or warn "cannot close $config_file: $!";
2062    say("Finished configuration.");
2063    exit();
2064}
2065