1#!/usr/local/bin/perl -w
2=pod
3
4=head1 NAME
5
6tv_grab_re - Grab TV listings for Nouvelle Caledonie Island (France).
7
8=head1 SYNOPSIS
9
10To configure: tv_grab_nc --configure [--config-file FILE]
11To grab channels listing: tv_grab_nc --list-channels [--output FILE]
12To grab programmes listings: tv_grab_nc [--output FILE] [--offset N] [--days N] [--quiet]
13Slower, detailed grab: tv_grab_nc --slow [--output FILE] [--offset N] [--days N] [--quiet]
14Help: tv_grab_nc --help
15
16=head1 DESCRIPTION
17
18Output TV listings for Canal Satellite Caledonie channels available in
19Nouvelle Caledonie Island. The data comes from www.canalsatellite-caledonie.com.
20The default is to grab listing only for the current day. By default program descriptions are not downloaded, so if you want description and credits, you should
21activate the --slow option.
22
23B<--configure> Grab channels informations and ask for channel type and names.
24
25B<--output FILE> Write to FILE rather than standard output.
26
27B<--days N> Grab N days, rather than only for the current day.
28
29B<--offset N> Start grabbing for N days in the future, eg offset 1
30means start with tomorrow.
31
32B<--slow> Get additional information from the website, like program
33description and credits.
34
35B<--quiet> Suppress the progress messages normally written to standard
36error.
37
38B<--version> Show the version of the grabber.
39
40B<--help> Print a help message and exit.
41
42=head1 SEE ALSO
43
44L<xmltv(5)>
45
46=head1 AUTHOR
47
48Eric Castelnau, eric.castelnau@free.fr
49Inspired by hacks from Marcus Westbury <marcus.westbury@gmail.com>
50
51=cut
52
53use XMLTV::Usage <<END
54$0: get Nouvelle Caledonie Island television listings in XMLTV format
55To configure: tv_grab_nc --configure [--config-file FILE]
56To grab channels listing: tv_grab_nc --list-channels [--output FILE]
57To grab programmes listings: tv_grab_nc [--output FILE] [--days N] [-offset N] [--quiet]
58Slower, detailed grab: tv_grab_nc --slow [--output FILE] [--days N] [--offset N] [--quiet]
59END
60  ;
61
62use warnings;
63use strict;
64use XMLTV::Version '$Id: tv_grab_nc,v 1.3 2010/09/02 05:07:40 rmeden Exp $ ';
65use XMLTV::Capabilities qw/baseline manualconfig cache/;
66use XMLTV::Description 'Nouvelle Caledonie Island';
67use Getopt::Long;
68use HTML::TreeBuilder;
69use HTML::Entities; # parse entities
70use IO::File;
71use URI;
72use Date::Manip;
73use XMLTV;
74use XMLTV::Memoize;
75use XMLTV::Ask;
76use XMLTV::ProgressBar;
77use XMLTV::Mode;
78use XMLTV::Config_file;
79use XMLTV::DST;
80use XMLTV::Get_nice;
81use XMLTV::Memoize; XMLTV::Memoize::check_argv 'get_nice';
82
83###
84### Main declarations
85###
86my %BROADCASTERS = (
87	'CANALSAT' => "Canal Satellite Nouvelle Caledonie",
88);
89my $CANALSAT_BASE_URL = "http://srv3.media-overseas.com/";
90my $CANALSAT_ICON_URL = "http://www.canalsatellite-caledonie.com/lebouquet/leschaines/pageschaines/images_chaines";
91
92###
93### Options processing
94###
95my ($opt_offset, $opt_days);
96my $opt_help;
97my $opt_output;
98my $opt_quiet;
99my $opt_config_file;
100my $opt_configure;
101my $opt_list_channels;
102my $opt_slow;
103
104GetOptions(	'days=i'	=> \$opt_days,
105		'offset=i'	=> \$opt_offset,
106		'help'          => \$opt_help,
107		'output=s'      => \$opt_output,
108		'quiet'         => \$opt_quiet,
109		'configure'     => \$opt_configure,
110		'config-file=s' => \$opt_config_file,
111		'list-channels' => \$opt_list_channels,
112		'slow'		=> \$opt_slow,
113) or usage(0);
114
115# need help
116usage(1) if $opt_help;
117
118# verbose by default
119$opt_quiet = 0;
120
121# number of day to process
122die 'Number of days must not be negative' if (defined $opt_days && $opt_days < 0);
123die 'Number of days must not be more than 5' if (defined $opt_days && $opt_days > 5);
124$opt_days = 1 if not defined $opt_days;
125
126# offset - zero (default) means start from today
127die 'Offset must not be negative' if (defined $opt_offset && $opt_offset < 0);
128$opt_offset = 0 if not defined $opt_offset;
129
130# output file
131$opt_output = '-' if not defined $opt_output;
132
133# slow mode off by default
134$opt_slow = 0 if not defined $opt_slow;
135
136# Now detects if we are in configure mode
137my $mode = XMLTV::Mode::mode('grab', $opt_configure => 'configure',
138                        	$opt_list_channels => 'list-channels');
139
140# File that stores which channels to download.
141my $config_file = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_nc',
142						$opt_quiet);
143
144# Content of $config_file
145my @config_lines;
146
147###
148### Global variables
149###
150
151# channels list
152my @channels;
153
154###
155### Sub sections
156###
157sub dprint($) {
158	my $msg = shift;
159	print STDERR "debug: " . $msg;
160}
161
162sub dump_channel($) {
163	my $c = shift;
164	print "type: $c->{'type'}\n";
165	print "id  : $c->{'id'}\n";
166	print "name: $c->{'name'}\n";
167	print "icon: $c->{'icon'}\n";
168}
169
170sub dump_programme($) {
171	my $c = shift;
172	print "channel  : $c->{'channel'}\n";
173	print "title    : $c->{'title'}[0][0]\n";
174	print "start    : $c->{'start'}\n";
175	print "stop     : $c->{'stop'}\n";
176	#print "length   : $c->{'length'}sec.\n";
177	print "category : $c->{'category'}[0][0]\n" if defined $c->{'category'};
178}
179
180sub new_xmltv_writer() {
181	my %writer_args;
182	my $file = new IO::File(">$opt_output");
183	die "Cannot write to $opt_output: $!" if not defined $file;
184	$writer_args{OUTPUT} = $file;
185	$writer_args{'encoding'}  = 'ISO-8859-1';
186	return new XMLTV::Writer(%writer_args);
187}
188
189sub get_channels_list($) {
190	my $arg = shift;
191	my @channels;
192
193	if ($arg eq 'CANALSAT') {
194		my $url = "http://srv3.media-overseas.com/FMPro?-db=caledonie.fp5&-lay=M1&-format=csat_caledonie/recherchecaledo.htm&-view";
195		my $html = get_nice_tree $url;
196
197		my $chaines = $html->look_down('_tag', 'select', 'name', 'idchaine');
198		foreach my $chaine ($chaines->look_down('_tag', 'option')) {
199			my %channel;
200
201			my $id = $chaine->attr_get_i('value');
202			next if ($id eq "0...999");
203			my $title = $chaine->as_text();
204
205			$channel{'type'} = "CANALSAT";
206			$channel{'id'} = $id;
207			$channel{'name'} = $title;
208			$channel{'icon'} = "$CANALSAT_ICON_URL/${id}_grand.gif";
209
210			push @channels,\%channel;
211		}
212
213		$html->delete();
214		undef $html;
215	}
216
217	return @channels;
218}
219
220sub get_canalsat_programmes_list_slow($%) {
221	my $url = shift(@_);
222	my $p = shift(@_);
223
224	# get request and parse
225	my $html = get_nice_tree $url;
226
227	# look for the résumé
228	my $table = $html->look_down('_tag', 'table', 'width', '480', 'vspace', '0', 'cellspacing', '0', 'cellpadding', '0', 'border', '0', 'align', 'center');
229	$table->objectify_text();
230    #$table->dump();dprint("\n\n");
231
232    my @text = $table->look_down('_tag', '~text');
233
234	foreach (@text) {
235		my $t = $_->attr_get_i('text');
236
237		next if ($t =~ /^ /);
238
239		next if (length $t < 7);
240
241		$p->{'desc'} = [ [ $t, "fr" ] ];
242	}
243
244	# look for director/actors
245	$table = $html->look_down('_tag', 'table', 'width', '621', 'height', '318', 'cellspacing', '4', 'cellpadding', '0', 'border', '0', 'bgcolor', '#CCCCCC', 'align', 'center');
246    $table->objectify_text();
247    my $td = $table->look_down('_tag', 'td', 'width', '475', 'height', '107');
248    my $i = $td->look_down('_tag', 'i');
249    #$i->dump();dprint("\n\n");
250    @text = $i->look_down('_tag', '~text');
251
252    my (@directors, @actors);
253	foreach (@text) {
254		my $t = $_->attr_get_i('text');
255
256		if ($t =~ /^.*\(.*\) r.alis. en \d{4} de (.*) Avec (.*)/) {
257			push @directors, $1;
258
259			my @a = split(',', $2);
260			foreach (@a) {
261				if ($_ =~ /(.*) \(.*\)/) {
262					push @actors, $1;
263				}
264			}
265		}
266	}
267
268	$p->{credits}{director} = \@directors if @directors;
269	$p->{credits}{actor}    = \@actors if @actors;
270
271	$html->delete();
272	undef $html;
273}
274
275sub get_canalsat_programmes_list($$$) {
276	my ($idchaine, $offset, $days) = @_;
277	die if $offset < 0;
278	die if $days < 1;
279
280	# the progs list to return
281	my @progs = ();
282
283	my $today = ParseDate 'today';
284
285	for ($offset + 1 .. $offset + $days) {
286		my $n = $_ - 1;
287
288		# the start tag of programs for this day
289		my $start = DateCalc($today, "+ $n days");
290		my $url_day = UnixDate($start, "%d%%2F%m%%2F%Y");
291
292		# build the url
293		my $url = "http://srv3.media-overseas.com/FMPro?-db=caledonie.fp5&-format=csat%5fcaledonie%2frechercheresultatscaledo.htm&-error=rechercheerreurreunion.htm&-SortField=presseheuretri&-SortORder=Ascending&-max=99&";
294		$url .= "-op=eq&jourdate=".$url_day."&";
295		$url .= "-op=cn&periodejour=a...z&";
296		$url .= "-op=cn&idchaine=".$idchaine."&";
297
298		# simulate a click on the submit button
299		my $random = int(rand(42)) + 1;
300		$url .= "-Find.x=".$random."&";
301		$random = int(rand(41)) + 1;
302		$url .= "-Find.y=".$random;
303
304		# get request and parse
305		my $html = get_nice_tree $url;
306
307		# look for the table of programmes
308		my $table = $html->look_down('_tag', 'table', 'width', '815', 'cellspacing', '2', 'cellpadding', '0', 'border', '0', 'align', 'center');
309
310		return @progs if not defined $table;
311
312		$table->objectify_text();
313
314		# look for the list of rows of the table
315		my @rows = $table->look_down('_tag', 'table', 'width', '797', 'height', '53', 'cellspacing', '1', 'cellpadding', '0', 'border', '0', 'align', 'center');
316
317		# scan each row
318		foreach my $r (@rows) {
319			# the current prog being processed
320			my %prog;
321			my ($tt, $stop);
322
323			$prog{'channel'} = $idchaine.".canalsatellite-caledonie.com";
324
325			# look for every column
326			my @td = $r->look_down('_tag', 'td');
327
328			# scan each cellule of the row
329			foreach my $cell (@td) {
330				my @b = $cell->look_down('_tag', '~text');
331				foreach my $tag (@b) {
332					$tt = $tag->attr_get_i('text');
333					#$tag->dump();
334
335					# here is the start hour
336					if ( $tt =~ /(\d\d):(\d\d)/ ) {
337						$start = Date_SetTime($start, $1, $2, 0);
338						my $start_str = UnixDate($start, "%Y%m%d%H%M%S");
339						$prog{'start'} = $start_str." +0400";
340					}
341
342					# here is the title with the duration in minutes
343					if ( $tt =~ /(.*)\s\((\d+)\'\).*/ ) {
344						# sometimes title doesn't exist for the first programme
345						next if ($1 eq "");
346
347						# "Fin des programmes" is not a real tv show
348						next if ($1 eq "Fin des programmes");
349
350						my $title = $1;
351
352						$prog{'title'} = [ [ $title ] ];
353
354						$stop = DateCalc($start, "+$2 min");
355						my $stop_str = UnixDate($stop, "%Y%m%d%H%M%S");
356						$prog{'stop'} = $stop_str." +0400";
357
358						# Change the start date because last programme begins
359						# this day (at 23:00 PM) and ends the day after
360						# (at 01:00 AM)
361						my $y = UnixDate($stop, "%Y");
362						my $m = UnixDate($stop, "%m");
363						my $d = UnixDate($stop, "%d");
364
365						$start = Date_SetDateField($stop, "y", $y);
366						$start = Date_SetDateField($start, "m", $m);
367						$start = Date_SetDateField($start, "d", $d);
368
369						# length tag not necessary if start and stop
370						# are presents
371						#$prog{'length'} = $2 * 60;
372
373						# sometime there is also the category
374						my $i = $cell->look_down('_tag', 'span', 'class', 'rouge');
375						my $ii = $i->look_down('_tag', '~text');
376						my $category = $ii->attr_get_i('text');
377						utf8::encode($category) if (utf8::is_utf8($category));
378						$category =~ s/^\s+//;
379						$category =~ s/\s+$//;
380						$prog{'category'} = [ [ $category, "fr" ] ] if not $category eq "";
381					}
382
383					# Year of the prog (if present)
384					if ( $tt =~ / - (\d\d\d\d)/ ) {
385						$prog{'date'} = $1;
386					}
387				}
388
389				# get director/actors if --slow was asked
390				if ($opt_slow) {
391					@b = $cell->look_down('_tag', 'a', 'class', 'rouge bold');
392					foreach (@b) {
393						my $href = "http://srv3.media-overseas.com/".$_->attr_get_i('href');
394						get_canalsat_programmes_list_slow($href, \%prog);
395					}
396				}
397			}
398
399			# add the current prog to the list if it is valid
400			if (defined $prog{'title'}) {
401				push @progs,\%prog;
402			}
403		}
404
405		$html->delete();
406		undef $html;
407	}
408
409	return @progs;
410}
411
412###
413### Configure mode
414###
415if ($mode eq 'configure') {
416	XMLTV::Config_file::check_no_overwrite($config_file);
417
418	# ask user to select his broadcasters
419	my @id = sort keys %BROADCASTERS;
420	my @questions = map { "Would you like to download data for '$BROADCASTERS{$_}' ?" } @id;
421	my @responses = ask_many_boolean(1, @questions);
422
423	# retrieve the channels list for each broadcasters
424	foreach (0..$#id) {
425		if ($responses[$_]) {
426			my @ch = get_channels_list($id[$_]);
427			@channels = (@channels, @ch) if @ch;
428		}
429	}
430
431	# ask user to add or not each channel
432	@questions = map { "Add channel $_->{'name'} ?" } @channels;
433	@responses = ask_many_boolean(1, @questions);
434
435	# create configuration file
436	open(CONF, ">$config_file") or die "Cannot write to $config_file: $!";
437
438	foreach (@channels) {
439		my $r = shift @responses;
440
441		if ($r) {
442			print CONF "channel:";
443		}
444		else {
445			print CONF "#channel:";
446		}
447
448#		if ( $_->{'type'} eq "CANALSAT" )
449#		{
450		print CONF "$_->{'id'}.canalsatellite-caledonie.com;$_->{'name'}\n";
451#		}
452#		else
453#		{
454#			print CONF "$_->{'id'}.parabolereunion.com;$_->{'name'}\n";
455#		}
456	}
457
458	close CONF or warn "Cannot close $config_file: $!";
459	say("Finished configuration.");
460	exit();
461}
462
463###
464### List channels
465###
466if ($mode eq 'list-channels') {
467	# init the XMLTV writer
468	my $writer = new_xmltv_writer();
469
470	# ask user to select his broadcasters
471	my @id = sort keys %BROADCASTERS;
472	my @questions = map { "Select '$BROADCASTERS{$_}' ?" } @id;
473	my @responses = ask_many_boolean(1, @questions);
474
475	# retrieve the channels list for each broadcasters
476	foreach (0..$#id) {
477		if ($responses[$_]) {
478			my @ch = get_channels_list($id[$_]);
479			@channels = (@channels, @ch) if @ch;
480		}
481	}
482
483	# write the XML header
484	$writer->start({
485		'generator-info-name' => 'XMLTV',
486		'generator-info-url'  => 'http://xmltv.org/',
487	});
488
489	foreach (@channels) {
490		my $id = "id";
491		$id = $_->{'id'}.".canalsatellite-caledonie.com";
492#		$id = $_->{'id'}.".parabolereunion.com" if ($_->{'type'} eq "PARABOLE");
493
494		$writer->write_channel({
495			'id'           => $id,
496			'display-name' => [[ $_->{'name'} ]],
497#			'icon'         => [{ 'src' => $_->{'icon'} }]
498		});
499	}
500
501	$writer->end();
502	exit();
503}
504
505###
506### Grab programmes listing
507###
508die if $mode ne 'grab';
509
510# Now let's do it
511Date_Init("TZ=UTC");
512
513# Change HTTP Headers to make canalsatellite-caledonie.com happy
514$XMLTV::Get_nice::ua->default_headers->push_header('Keep-Alive'=>'300');
515$XMLTV::Get_nice::ua->default_headers->push_header('Connection'=>'keep-alive');
516$XMLTV::Get_nice::ua->default_headers->push_header('Referer'=>'http://srv3.media-overseas.com/FMPro?-db=caledonie.fp5&-lay=M1&-format=csat_caledonie/recherchecaledo.htm&-view');
517
518# read tv_grab_nc conf file...
519@config_lines = XMLTV::Config_file::read_lines($config_file);
520
521# ...and parse its content
522my $n = 0;
523foreach (@config_lines) {
524	++$n;
525	next if not defined;
526
527	if ( /^channel:(\d+)\.(.*);(.*)/ ) {
528		my %channel;
529
530		$channel{'id'} = $1;
531		$channel{'name'} = $3;
532
533		if ($2 eq 'canalsatellite-caledonie.com') {
534			$channel{'type'} = "CANALSAT";
535			$channel{'icon'} = "$CANALSAT_ICON_URL/".$channel{'id'}."_grand.gif";
536		}
537
538#		if ($2 eq 'parabolereunion.com') {
539#			$channel{'type'} = "PARABOLE";
540#			$channel{'icon'} = "$PARABOLE_ICON_URL/channel_logo_small".$channel{'id'}.".gif";
541#		}
542
543		push @channels,\%channel;
544	}
545	else {
546		die "$config_file:$n - Bad line channel";
547	}
548}
549
550die "No working channels configured, so no grabing" if not @channels;
551
552# init the XMLTV writer
553my $writer = new_xmltv_writer();
554
555# write the XML header
556$writer->start({
557	'generator-info-name' => 'XMLTV',
558	'generator-info-url'  => 'http://xmltv.org/',
559});
560
561# first, write channels
562foreach (@channels) {
563	my $id = "id";
564	$id = $_->{'id'}.".canalsatellite-caledonie.com";
565#	$id = $_->{'id'}.".parabolereunion.com" if ($_->{'type'} eq "PARABOLE");
566
567	$writer->write_channel({
568		'id'           => $id,
569		'display-name' => [ [ $_->{'name'} ] ],
570#		'icon'         => [ { 'src' => $_->{'icon'} } ]
571	});
572}
573
574# then, programmes
575foreach (@channels) {
576	my @progs;
577
578	if ($_->{'type'} eq 'CANALSAT') {
579 	   @progs = get_canalsat_programmes_list($_->{'id'}, $opt_offset, $opt_days);
580	}
581
582#	if ($_->{'type'} eq 'PARABOLE') {
583#		@progs = get_parabole_programmes_list($_->{'id'}, $opt_offset, $opt_days);
584#	}
585
586	foreach my $prog (@progs) {
587		$writer->write_programme(\%$prog);
588	}
589}
590
591$writer->end();
592
593