1#!/usr/local/bin/perl -w 2=pod 3 4=head1 NAME 5 6tv_grab_re - Grab TV listings for Nouvelle Caledonie Island (France). 7 8=head1 SYNOPSIS 9 10To configure: tv_grab_nc --configure [--config-file FILE] 11To grab channels listing: tv_grab_nc --list-channels [--output FILE] 12To grab programmes listings: tv_grab_nc [--output FILE] [--offset N] [--days N] [--quiet] 13Slower, detailed grab: tv_grab_nc --slow [--output FILE] [--offset N] [--days N] [--quiet] 14Help: tv_grab_nc --help 15 16=head1 DESCRIPTION 17 18Output TV listings for Canal Satellite Caledonie channels available in 19Nouvelle Caledonie Island. The data comes from www.canalsatellite-caledonie.com. 20The default is to grab listing only for the current day. By default program descriptions are not downloaded, so if you want description and credits, you should 21activate the --slow option. 22 23B<--configure> Grab channels informations and ask for channel type and names. 24 25B<--output FILE> Write to FILE rather than standard output. 26 27B<--days N> Grab N days, rather than only for the current day. 28 29B<--offset N> Start grabbing for N days in the future, eg offset 1 30means start with tomorrow. 31 32B<--slow> Get additional information from the website, like program 33description and credits. 34 35B<--quiet> Suppress the progress messages normally written to standard 36error. 37 38B<--version> Show the version of the grabber. 39 40B<--help> Print a help message and exit. 41 42=head1 SEE ALSO 43 44L<xmltv(5)> 45 46=head1 AUTHOR 47 48Eric Castelnau, eric.castelnau@free.fr 49Inspired by hacks from Marcus Westbury <marcus.westbury@gmail.com> 50 51=cut 52 53use XMLTV::Usage <<END 54$0: get Nouvelle Caledonie Island television listings in XMLTV format 55To configure: tv_grab_nc --configure [--config-file FILE] 56To grab channels listing: tv_grab_nc --list-channels [--output FILE] 57To grab programmes listings: tv_grab_nc [--output FILE] [--days N] [-offset N] [--quiet] 58Slower, detailed grab: tv_grab_nc --slow [--output FILE] [--days N] [--offset N] [--quiet] 59END 60 ; 61 62use warnings; 63use strict; 64use XMLTV::Version '$Id: tv_grab_nc,v 1.3 2010/09/02 05:07:40 rmeden Exp $ '; 65use XMLTV::Capabilities qw/baseline manualconfig cache/; 66use XMLTV::Description 'Nouvelle Caledonie Island'; 67use Getopt::Long; 68use HTML::TreeBuilder; 69use HTML::Entities; # parse entities 70use IO::File; 71use URI; 72use Date::Manip; 73use XMLTV; 74use XMLTV::Memoize; 75use XMLTV::Ask; 76use XMLTV::ProgressBar; 77use XMLTV::Mode; 78use XMLTV::Config_file; 79use XMLTV::DST; 80use XMLTV::Get_nice; 81use XMLTV::Memoize; XMLTV::Memoize::check_argv 'get_nice'; 82 83### 84### Main declarations 85### 86my %BROADCASTERS = ( 87 'CANALSAT' => "Canal Satellite Nouvelle Caledonie", 88); 89my $CANALSAT_BASE_URL = "http://srv3.media-overseas.com/"; 90my $CANALSAT_ICON_URL = "http://www.canalsatellite-caledonie.com/lebouquet/leschaines/pageschaines/images_chaines"; 91 92### 93### Options processing 94### 95my ($opt_offset, $opt_days); 96my $opt_help; 97my $opt_output; 98my $opt_quiet; 99my $opt_config_file; 100my $opt_configure; 101my $opt_list_channels; 102my $opt_slow; 103 104GetOptions( 'days=i' => \$opt_days, 105 'offset=i' => \$opt_offset, 106 'help' => \$opt_help, 107 'output=s' => \$opt_output, 108 'quiet' => \$opt_quiet, 109 'configure' => \$opt_configure, 110 'config-file=s' => \$opt_config_file, 111 'list-channels' => \$opt_list_channels, 112 'slow' => \$opt_slow, 113) or usage(0); 114 115# need help 116usage(1) if $opt_help; 117 118# verbose by default 119$opt_quiet = 0; 120 121# number of day to process 122die 'Number of days must not be negative' if (defined $opt_days && $opt_days < 0); 123die 'Number of days must not be more than 5' if (defined $opt_days && $opt_days > 5); 124$opt_days = 1 if not defined $opt_days; 125 126# offset - zero (default) means start from today 127die 'Offset must not be negative' if (defined $opt_offset && $opt_offset < 0); 128$opt_offset = 0 if not defined $opt_offset; 129 130# output file 131$opt_output = '-' if not defined $opt_output; 132 133# slow mode off by default 134$opt_slow = 0 if not defined $opt_slow; 135 136# Now detects if we are in configure mode 137my $mode = XMLTV::Mode::mode('grab', $opt_configure => 'configure', 138 $opt_list_channels => 'list-channels'); 139 140# File that stores which channels to download. 141my $config_file = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_nc', 142 $opt_quiet); 143 144# Content of $config_file 145my @config_lines; 146 147### 148### Global variables 149### 150 151# channels list 152my @channels; 153 154### 155### Sub sections 156### 157sub dprint($) { 158 my $msg = shift; 159 print STDERR "debug: " . $msg; 160} 161 162sub dump_channel($) { 163 my $c = shift; 164 print "type: $c->{'type'}\n"; 165 print "id : $c->{'id'}\n"; 166 print "name: $c->{'name'}\n"; 167 print "icon: $c->{'icon'}\n"; 168} 169 170sub dump_programme($) { 171 my $c = shift; 172 print "channel : $c->{'channel'}\n"; 173 print "title : $c->{'title'}[0][0]\n"; 174 print "start : $c->{'start'}\n"; 175 print "stop : $c->{'stop'}\n"; 176 #print "length : $c->{'length'}sec.\n"; 177 print "category : $c->{'category'}[0][0]\n" if defined $c->{'category'}; 178} 179 180sub new_xmltv_writer() { 181 my %writer_args; 182 my $file = new IO::File(">$opt_output"); 183 die "Cannot write to $opt_output: $!" if not defined $file; 184 $writer_args{OUTPUT} = $file; 185 $writer_args{'encoding'} = 'ISO-8859-1'; 186 return new XMLTV::Writer(%writer_args); 187} 188 189sub get_channels_list($) { 190 my $arg = shift; 191 my @channels; 192 193 if ($arg eq 'CANALSAT') { 194 my $url = "http://srv3.media-overseas.com/FMPro?-db=caledonie.fp5&-lay=M1&-format=csat_caledonie/recherchecaledo.htm&-view"; 195 my $html = get_nice_tree $url; 196 197 my $chaines = $html->look_down('_tag', 'select', 'name', 'idchaine'); 198 foreach my $chaine ($chaines->look_down('_tag', 'option')) { 199 my %channel; 200 201 my $id = $chaine->attr_get_i('value'); 202 next if ($id eq "0...999"); 203 my $title = $chaine->as_text(); 204 205 $channel{'type'} = "CANALSAT"; 206 $channel{'id'} = $id; 207 $channel{'name'} = $title; 208 $channel{'icon'} = "$CANALSAT_ICON_URL/${id}_grand.gif"; 209 210 push @channels,\%channel; 211 } 212 213 $html->delete(); 214 undef $html; 215 } 216 217 return @channels; 218} 219 220sub get_canalsat_programmes_list_slow($%) { 221 my $url = shift(@_); 222 my $p = shift(@_); 223 224 # get request and parse 225 my $html = get_nice_tree $url; 226 227 # look for the résumé 228 my $table = $html->look_down('_tag', 'table', 'width', '480', 'vspace', '0', 'cellspacing', '0', 'cellpadding', '0', 'border', '0', 'align', 'center'); 229 $table->objectify_text(); 230 #$table->dump();dprint("\n\n"); 231 232 my @text = $table->look_down('_tag', '~text'); 233 234 foreach (@text) { 235 my $t = $_->attr_get_i('text'); 236 237 next if ($t =~ /^ /); 238 239 next if (length $t < 7); 240 241 $p->{'desc'} = [ [ $t, "fr" ] ]; 242 } 243 244 # look for director/actors 245 $table = $html->look_down('_tag', 'table', 'width', '621', 'height', '318', 'cellspacing', '4', 'cellpadding', '0', 'border', '0', 'bgcolor', '#CCCCCC', 'align', 'center'); 246 $table->objectify_text(); 247 my $td = $table->look_down('_tag', 'td', 'width', '475', 'height', '107'); 248 my $i = $td->look_down('_tag', 'i'); 249 #$i->dump();dprint("\n\n"); 250 @text = $i->look_down('_tag', '~text'); 251 252 my (@directors, @actors); 253 foreach (@text) { 254 my $t = $_->attr_get_i('text'); 255 256 if ($t =~ /^.*\(.*\) r.alis. en \d{4} de (.*) Avec (.*)/) { 257 push @directors, $1; 258 259 my @a = split(',', $2); 260 foreach (@a) { 261 if ($_ =~ /(.*) \(.*\)/) { 262 push @actors, $1; 263 } 264 } 265 } 266 } 267 268 $p->{credits}{director} = \@directors if @directors; 269 $p->{credits}{actor} = \@actors if @actors; 270 271 $html->delete(); 272 undef $html; 273} 274 275sub get_canalsat_programmes_list($$$) { 276 my ($idchaine, $offset, $days) = @_; 277 die if $offset < 0; 278 die if $days < 1; 279 280 # the progs list to return 281 my @progs = (); 282 283 my $today = ParseDate 'today'; 284 285 for ($offset + 1 .. $offset + $days) { 286 my $n = $_ - 1; 287 288 # the start tag of programs for this day 289 my $start = DateCalc($today, "+ $n days"); 290 my $url_day = UnixDate($start, "%d%%2F%m%%2F%Y"); 291 292 # build the url 293 my $url = "http://srv3.media-overseas.com/FMPro?-db=caledonie.fp5&-format=csat%5fcaledonie%2frechercheresultatscaledo.htm&-error=rechercheerreurreunion.htm&-SortField=presseheuretri&-SortORder=Ascending&-max=99&"; 294 $url .= "-op=eq&jourdate=".$url_day."&"; 295 $url .= "-op=cn&periodejour=a...z&"; 296 $url .= "-op=cn&idchaine=".$idchaine."&"; 297 298 # simulate a click on the submit button 299 my $random = int(rand(42)) + 1; 300 $url .= "-Find.x=".$random."&"; 301 $random = int(rand(41)) + 1; 302 $url .= "-Find.y=".$random; 303 304 # get request and parse 305 my $html = get_nice_tree $url; 306 307 # look for the table of programmes 308 my $table = $html->look_down('_tag', 'table', 'width', '815', 'cellspacing', '2', 'cellpadding', '0', 'border', '0', 'align', 'center'); 309 310 return @progs if not defined $table; 311 312 $table->objectify_text(); 313 314 # look for the list of rows of the table 315 my @rows = $table->look_down('_tag', 'table', 'width', '797', 'height', '53', 'cellspacing', '1', 'cellpadding', '0', 'border', '0', 'align', 'center'); 316 317 # scan each row 318 foreach my $r (@rows) { 319 # the current prog being processed 320 my %prog; 321 my ($tt, $stop); 322 323 $prog{'channel'} = $idchaine.".canalsatellite-caledonie.com"; 324 325 # look for every column 326 my @td = $r->look_down('_tag', 'td'); 327 328 # scan each cellule of the row 329 foreach my $cell (@td) { 330 my @b = $cell->look_down('_tag', '~text'); 331 foreach my $tag (@b) { 332 $tt = $tag->attr_get_i('text'); 333 #$tag->dump(); 334 335 # here is the start hour 336 if ( $tt =~ /(\d\d):(\d\d)/ ) { 337 $start = Date_SetTime($start, $1, $2, 0); 338 my $start_str = UnixDate($start, "%Y%m%d%H%M%S"); 339 $prog{'start'} = $start_str." +0400"; 340 } 341 342 # here is the title with the duration in minutes 343 if ( $tt =~ /(.*)\s\((\d+)\'\).*/ ) { 344 # sometimes title doesn't exist for the first programme 345 next if ($1 eq ""); 346 347 # "Fin des programmes" is not a real tv show 348 next if ($1 eq "Fin des programmes"); 349 350 my $title = $1; 351 352 $prog{'title'} = [ [ $title ] ]; 353 354 $stop = DateCalc($start, "+$2 min"); 355 my $stop_str = UnixDate($stop, "%Y%m%d%H%M%S"); 356 $prog{'stop'} = $stop_str." +0400"; 357 358 # Change the start date because last programme begins 359 # this day (at 23:00 PM) and ends the day after 360 # (at 01:00 AM) 361 my $y = UnixDate($stop, "%Y"); 362 my $m = UnixDate($stop, "%m"); 363 my $d = UnixDate($stop, "%d"); 364 365 $start = Date_SetDateField($stop, "y", $y); 366 $start = Date_SetDateField($start, "m", $m); 367 $start = Date_SetDateField($start, "d", $d); 368 369 # length tag not necessary if start and stop 370 # are presents 371 #$prog{'length'} = $2 * 60; 372 373 # sometime there is also the category 374 my $i = $cell->look_down('_tag', 'span', 'class', 'rouge'); 375 my $ii = $i->look_down('_tag', '~text'); 376 my $category = $ii->attr_get_i('text'); 377 utf8::encode($category) if (utf8::is_utf8($category)); 378 $category =~ s/^\s+//; 379 $category =~ s/\s+$//; 380 $prog{'category'} = [ [ $category, "fr" ] ] if not $category eq ""; 381 } 382 383 # Year of the prog (if present) 384 if ( $tt =~ / - (\d\d\d\d)/ ) { 385 $prog{'date'} = $1; 386 } 387 } 388 389 # get director/actors if --slow was asked 390 if ($opt_slow) { 391 @b = $cell->look_down('_tag', 'a', 'class', 'rouge bold'); 392 foreach (@b) { 393 my $href = "http://srv3.media-overseas.com/".$_->attr_get_i('href'); 394 get_canalsat_programmes_list_slow($href, \%prog); 395 } 396 } 397 } 398 399 # add the current prog to the list if it is valid 400 if (defined $prog{'title'}) { 401 push @progs,\%prog; 402 } 403 } 404 405 $html->delete(); 406 undef $html; 407 } 408 409 return @progs; 410} 411 412### 413### Configure mode 414### 415if ($mode eq 'configure') { 416 XMLTV::Config_file::check_no_overwrite($config_file); 417 418 # ask user to select his broadcasters 419 my @id = sort keys %BROADCASTERS; 420 my @questions = map { "Would you like to download data for '$BROADCASTERS{$_}' ?" } @id; 421 my @responses = ask_many_boolean(1, @questions); 422 423 # retrieve the channels list for each broadcasters 424 foreach (0..$#id) { 425 if ($responses[$_]) { 426 my @ch = get_channels_list($id[$_]); 427 @channels = (@channels, @ch) if @ch; 428 } 429 } 430 431 # ask user to add or not each channel 432 @questions = map { "Add channel $_->{'name'} ?" } @channels; 433 @responses = ask_many_boolean(1, @questions); 434 435 # create configuration file 436 open(CONF, ">$config_file") or die "Cannot write to $config_file: $!"; 437 438 foreach (@channels) { 439 my $r = shift @responses; 440 441 if ($r) { 442 print CONF "channel:"; 443 } 444 else { 445 print CONF "#channel:"; 446 } 447 448# if ( $_->{'type'} eq "CANALSAT" ) 449# { 450 print CONF "$_->{'id'}.canalsatellite-caledonie.com;$_->{'name'}\n"; 451# } 452# else 453# { 454# print CONF "$_->{'id'}.parabolereunion.com;$_->{'name'}\n"; 455# } 456 } 457 458 close CONF or warn "Cannot close $config_file: $!"; 459 say("Finished configuration."); 460 exit(); 461} 462 463### 464### List channels 465### 466if ($mode eq 'list-channels') { 467 # init the XMLTV writer 468 my $writer = new_xmltv_writer(); 469 470 # ask user to select his broadcasters 471 my @id = sort keys %BROADCASTERS; 472 my @questions = map { "Select '$BROADCASTERS{$_}' ?" } @id; 473 my @responses = ask_many_boolean(1, @questions); 474 475 # retrieve the channels list for each broadcasters 476 foreach (0..$#id) { 477 if ($responses[$_]) { 478 my @ch = get_channels_list($id[$_]); 479 @channels = (@channels, @ch) if @ch; 480 } 481 } 482 483 # write the XML header 484 $writer->start({ 485 'generator-info-name' => 'XMLTV', 486 'generator-info-url' => 'http://xmltv.org/', 487 }); 488 489 foreach (@channels) { 490 my $id = "id"; 491 $id = $_->{'id'}.".canalsatellite-caledonie.com"; 492# $id = $_->{'id'}.".parabolereunion.com" if ($_->{'type'} eq "PARABOLE"); 493 494 $writer->write_channel({ 495 'id' => $id, 496 'display-name' => [[ $_->{'name'} ]], 497# 'icon' => [{ 'src' => $_->{'icon'} }] 498 }); 499 } 500 501 $writer->end(); 502 exit(); 503} 504 505### 506### Grab programmes listing 507### 508die if $mode ne 'grab'; 509 510# Now let's do it 511Date_Init("TZ=UTC"); 512 513# Change HTTP Headers to make canalsatellite-caledonie.com happy 514$XMLTV::Get_nice::ua->default_headers->push_header('Keep-Alive'=>'300'); 515$XMLTV::Get_nice::ua->default_headers->push_header('Connection'=>'keep-alive'); 516$XMLTV::Get_nice::ua->default_headers->push_header('Referer'=>'http://srv3.media-overseas.com/FMPro?-db=caledonie.fp5&-lay=M1&-format=csat_caledonie/recherchecaledo.htm&-view'); 517 518# read tv_grab_nc conf file... 519@config_lines = XMLTV::Config_file::read_lines($config_file); 520 521# ...and parse its content 522my $n = 0; 523foreach (@config_lines) { 524 ++$n; 525 next if not defined; 526 527 if ( /^channel:(\d+)\.(.*);(.*)/ ) { 528 my %channel; 529 530 $channel{'id'} = $1; 531 $channel{'name'} = $3; 532 533 if ($2 eq 'canalsatellite-caledonie.com') { 534 $channel{'type'} = "CANALSAT"; 535 $channel{'icon'} = "$CANALSAT_ICON_URL/".$channel{'id'}."_grand.gif"; 536 } 537 538# if ($2 eq 'parabolereunion.com') { 539# $channel{'type'} = "PARABOLE"; 540# $channel{'icon'} = "$PARABOLE_ICON_URL/channel_logo_small".$channel{'id'}.".gif"; 541# } 542 543 push @channels,\%channel; 544 } 545 else { 546 die "$config_file:$n - Bad line channel"; 547 } 548} 549 550die "No working channels configured, so no grabing" if not @channels; 551 552# init the XMLTV writer 553my $writer = new_xmltv_writer(); 554 555# write the XML header 556$writer->start({ 557 'generator-info-name' => 'XMLTV', 558 'generator-info-url' => 'http://xmltv.org/', 559}); 560 561# first, write channels 562foreach (@channels) { 563 my $id = "id"; 564 $id = $_->{'id'}.".canalsatellite-caledonie.com"; 565# $id = $_->{'id'}.".parabolereunion.com" if ($_->{'type'} eq "PARABOLE"); 566 567 $writer->write_channel({ 568 'id' => $id, 569 'display-name' => [ [ $_->{'name'} ] ], 570# 'icon' => [ { 'src' => $_->{'icon'} } ] 571 }); 572} 573 574# then, programmes 575foreach (@channels) { 576 my @progs; 577 578 if ($_->{'type'} eq 'CANALSAT') { 579 @progs = get_canalsat_programmes_list($_->{'id'}, $opt_offset, $opt_days); 580 } 581 582# if ($_->{'type'} eq 'PARABOLE') { 583# @progs = get_parabole_programmes_list($_->{'id'}, $opt_offset, $opt_days); 584# } 585 586 foreach my $prog (@progs) { 587 $writer->write_programme(\%$prog); 588 } 589} 590 591$writer->end(); 592 593