1#!/usr/local/bin/perl -w 2 3=pod 4 5=head1 NAME 6 7tv_extractinfo_en - read English-language listings and extract info 8from programme descriptions. 9 10=head1 SYNOPSIS 11 12tv_extractinfo_en [--help] [--output FILE] [FILE...] 13 14=head1 DESCRIPTION 15 16Read XMLTV data and attempt to extract information from 17English-language programme descriptions, putting it into 18machine-readable form. For example the human-readable text '(repeat)' 19in a programme description might be replaced by the XML element 20<previously-shown>. 21 22B<--output FILE> write to FILE rather than standard output 23 24This tool also attempts to split multipart programmes into their 25constituents, by looking for a description that seems to contain lots 26of times and titles. But this depends on the description following 27one particular style and is useful only for some listings sources 28(Ananova). 29 30If some text is marked with the 'lang' attribute as being some 31language other than English ('en'), it is ignored. 32 33=head1 SEE ALSO 34 35L<xmltv(5)>. 36 37=head1 AUTHOR 38 39Ed Avis, ed@membled.com 40 41=head1 BUGS 42 43Trying to parse human-readable text is always error-prone, more so 44with the simple regexp-based approach used here. But because TV 45listing descriptions usually conform to one of a few set styles, 46tv_extractinfo_en does reasonably well. It is fairly conservative, 47trying to avoid false positives (extracting 'information' which 48isnE<39>t really there) even though this means some false negatives 49(failing to extract information and leaving it in the human-readable 50text). 51 52However, the leftover bits of text after extracting information may 53not form a meaningful English sentence, or the punctuation may be 54wrong. 55 56On the two listings sources currently supported by the XMLTV package, 57this program does a reasonably good job. But it has not been tested 58with every source of anglophone TV listings. 59 60=cut 61 62use strict; 63use XMLTV::Version '$Id: tv_extractinfo_en,v 1.70 2015/07/12 00:46:37 knowledgejunkie Exp $ '; 64use XMLTV::Date; 65use Date::Manip; 66use Carp; 67use Getopt::Long; 68 69BEGIN { 70 if (int(Date::Manip::DateManipVersion) >= 6) { 71 Date::Manip::Date_Init("SetDate=now,UTC"); 72 } else { 73 Date::Manip::Date_Init("TZ=UTC"); 74 } 75} 76 77# Use Log::TraceMessages if installed. 78BEGIN { 79 eval { require Log::TraceMessages }; 80 if ($@) { 81 *t = sub {}; 82 *d = sub { '' }; 83 } 84 else { 85 *t = \&Log::TraceMessages::t; 86 *d = \&Log::TraceMessages::d; 87 Log::TraceMessages::check_argv(); 88 } 89} 90 91# Use Term::ProgressBar if installed. 92use constant Have_bar => eval { require Term::ProgressBar; 1 }; 93 94use XMLTV; 95use XMLTV::TZ qw(gettz offset_to_gmt); 96use XMLTV::Clumps qw(clump_relation relatives fix_clumps nuke_from_rel); 97use XMLTV::Usage <<END 98$0: read English-language listings and extract info from programme descriptions 99usage: $0 [--help] [--output FILE] [FILE...] 100END 101; 102 103 104# There are some seeming bugs in Perl which corrupt the stop time of 105# programmes. They are less in 5.6.1 than 5.6.0 but still there. The 106# debugging assertions cst() and no_shared_scalars() have the effect 107# of stopping the problem (it's a Heisenbug). So one way of making 108# stop times correct is to call this routines regularly. 109# 110# Alternatively, we can limit the script's functionality to work 111# around the bug. It seems to affect stop times, so if we just don't 112# add stop times things should be okay. 113# 114# This flag decides which of the two to pick: slow but with maximum 115# information about stop times, or fast without them. (Stop times can 116# easily be added back in by tv_sort, and they weren't that good 117# anyway, so you should probably leave this off.) 118# 119my $SLOW = 0; 120warn "this version has debugging calls, will run slowly\n" if $SLOW; 121 122# It might turn out that a particular version of perl is needed. 123# BEGIN { 124# eval { require 5.6.1 }; 125# if ($@) { 126# for ($@) { 127# chomp; 128# s/, stopped at .+$//; 129# warn "$_, continuing but output may be wrong\n"; 130# } 131# } 132# } 133 134sub list_names( $ ); 135sub check_same_channel( $ ); 136sub special_category( $ ); 137sub special_multipart( $ ); 138sub special_credits( $ ); 139sub special_radio4( $ ); 140sub special_split_title( $ ); 141sub special_film( $ ); 142sub special_new_series( $ ); 143sub special_year( $ ); 144sub special_tv_movie( $ ); 145sub special_teletext_subtitles( $ ); 146sub special_useless( $ ); 147sub check_time_fits( $$ ); 148sub cst( $ ); 149sub no_shared_scalars( $ ); 150sub has( $$@ ); 151sub hasp( $$$ ); 152sub pd( $ ); 153sub ud( $ ); 154sub nd( $ ); 155sub bn( $ ); 156sub munge( $ ); 157sub multipart_split_desc( $$ ); 158sub clocks_poss( $ ); 159sub time12to24( $ ); 160sub add( $$$ ); 161sub scrub_empty( @ ); 162sub set_stop_time( $$ ); 163sub dump_pseudo_programme( $ ); 164 165# --no-create-sub-titles is an undocumented switch, affecting the 166# splitting of multipart programmes only, to not break a title 167# containing colons into title and sub-title, but always keep it as a 168# single title containing a colon. This is for consistency with some 169# data sources that do this. 170# 171my ($opt_help, $opt_output, $opt_no_create_sub_titles); 172GetOptions('help' => \$opt_help, 'output=s' => \$opt_output, 173 'no-create-sub-titles' => \$opt_no_create_sub_titles) 174 or usage(0); 175usage(1) if $opt_help; 176@ARGV = ('-') if not @ARGV; 177 178#### 179# Language selection stuff. 180# 181my $LANG = 'en'; 182 183# bn(): wrapper for XMLTV::best_name(). Does some memoizing (so 184# assumes that the languages in a list of pairs will not change). 185# 186my %bn; 187sub bn( $ ) { 188 my $pairs = shift; 189 return undef if not defined $pairs; 190 die 'bn(): expected ref to list of [text,lang] pairs' 191 if ref $pairs ne 'ARRAY'; 192 for ($bn{$pairs}) { 193 return $_ if defined; 194 foreach (@$pairs) { 195 carp "found bad [text,lang] pair: $_" if ref ne 'ARRAY'; 196 } 197 return $_ = XMLTV::best_name([ $LANG ], $pairs); 198 } 199} 200 201# pair_ok(): returns whether a [ text, lang ] pair is usable. 202sub pair_ok( $ ) { 203 not defined $_->[1] or $_->[1] =~ /^$LANG(?:_\w+)?$/o; 204} 205 206#### 207# Human name stuff. 208# 209 210# Regular expression to match a name 211my $UC = '[A-Z]'; # upper case 212my $LC = "[a-z]"; # lower case 213my $AC_P = "[\'A-Za-z-]"; # any case with punctuation 214my $NAME_RE; 215{ 216 # One word of a name. Uppercase, anycase then ending in at least 217 # two lowercase. Alternatively, uppercase then lowercase (eg 218 # 'Lee'), all uppercase ('DJ') or just uppercase and an optional dot (for 219 # initials). 220 # 221 my $name_comp_re = "(?:$UC(?:(?:$AC_P+$LC$LC)|(?:$LC+)|(?:$UC+)|\\.?))"; 222 foreach ('Simon', 'McCoy') { 223 die "cannot match name component $_" unless /^$name_comp_re$/o; 224 } 225 foreach ("Valentine's") { 226 die "wrongly matched name component $_" if /^$name_comp_re$/o; 227 } 228 229 # Additional words valid in the middle of names. 230 my $name_join_re = "(?:von|van|de|di|da|van\\s+den|bin|ben|al)"; 231 232 # A name must have at least two components. This excludes those 233 # celebrities known only by first name but it's a reasonable 234 # heuristic for distinguishing between the names of actors and the 235 # names of characters. 236 # 237 my $name_re = "(?:$name_comp_re\\s+(?:(?:(?:$name_comp_re)|$name_join_re)\\s+)*$name_comp_re)"; 238 foreach ('Simon McCoy', 'Annie Wu') { 239 die "cannot match $_" unless /^$name_re$/o; 240 } 241 242 # Special handling for some titles. But others beginning 'the' 243 # are specifically excluded (to avoid 'the Corornation Street 244 # star' parsing as '$NAME_RE star'). 245 # 246 $NAME_RE = "(?<!the\\s)(?:(?:[Tt]he\\s+Rev(?:\\.|erend)\\s+)?$name_re)"; 247} 248 249# Regexp to match a list of names: 'Tom, Dick, and Harry' 250my $NAMES_RE = "(?:$NAME_RE(?:(?:\\s*,\\s*$NAME_RE)*(?:\\s*,?\\s*\\band\\s+$NAME_RE))?(?!\\s*(?:and\\s+$UC|from|[0-9])))"; 251 252# Subroutine to extract the names from this list 253sub list_names( $ ) { 254 die 'usage: list_names(English string listing names)' 255 if @_ != 1; 256 local $_ = shift; die if not defined; 257 t 'list_names() processing string: ' . d $_; 258 my @r; 259 260 s/^($NAME_RE)\s*// or die "bad 'names' '$_'"; 261 push @r, $1; 262 263 while (s/^,?\s*(?:and\s+)?($NAME_RE)\s*//) { 264 push @r, $1; 265 } 266 die "unmatched bit of names $_" unless $_ eq ''; 267 268 return @r; 269} 270 271my @tests = 272 ( 273 [ 'Richard Whiteley and Carol Vorderman', [ 'Richard Whiteley', 'Carol Vorderman' ] ], 274 [ 'show presented by Jonathan Ross, with', [ 'Jonathan Ross' ] ], 275 [ 'Shane Richie, Michael Starke and Scott Wright', 276 [ 'Shane Richie', 'Michael Starke', 'Scott Wright' ] ], 277 [ 'Basil Brush,Barney Harwood and Ugly Yetty present', 278 [ 'Basil Brush', 'Barney Harwood', 'Ugly Yetty'] ], 279 ); 280foreach (@tests) { 281 my ($in, $expected) = @$_; 282 for ($in) { 283 /($NAMES_RE)/o or die "$in doesn't match \$NAMES_RE"; 284 my @out = list_names($1); 285 local $Log::TraceMessages::On = 1; 286 if (d(\@out) ne d($expected)) { 287 die "$in split into " . d(\@out); 288 } 289 } 290} 291 292 293#### 294# Date handling stuff. 295# 296# This loses any information on partial dates (FIXME). 297# 298sub pd( $ ) { 299 for ($_[0]) { 300 return undef if not defined; 301 return parse_date($_); 302 } 303} 304sub ud( $ ) { 305 for ($_[0]) { 306 return undef if not defined; 307 return UnixDate($_, '%q'); 308 } 309} 310sub nd( $ ) { 311 for ($_[0]) { 312 return undef if not defined; 313 return ud(pd($_)); 314 } 315} 316 317# Memoize some subroutines if possible. FIXME commonize to 318# XMLTV::Memoize. 319# 320eval { require Memoize }; 321unless ($@) { 322 foreach (qw(parse_date UnixDate Date_Cmp 323 clocks_poss time12to24)) { 324 Memoize::memoize($_) or die "cannot memoize $_: $!"; 325 } 326} 327 328my $encoding; 329my $credits; 330my %ch; 331my @progs; 332XMLTV::parsefiles_callback(sub( $ ) { $encoding = shift }, 333 sub( $ ) { $credits = shift }, 334 sub( $ ) { my $c = shift; $ch{$c->{id}} = $c }, 335 sub( $ ) { push @progs, shift }, 336 @ARGV); 337# Assume encoding is a superset of ASCII, and that Perl's regular 338# expressions work with it in the current locale. 339# 340 341my $related = clump_relation(\@progs); 342 343# Apply all handlers. We just haphazardly 344# run one after the other; when a programme has been run 345# through all of them in sequence without any changes, we 346# move it to @progs_done. 347# 348# The reason for using _lists_ is that some handlers turn 349# a single programme into several. 350# 351my @progs_done = (); 352my $bar = new Term::ProgressBar('munging programmes', scalar @progs) 353 if Have_bar; 354while (@progs) { 355 # Deal with one more programme from the input, it may transform 356 # itself into one or more programmes which need processing in 357 # turn. When all the offspring are dealt with we have finally 358 # finished with that input and can update the progress bar. 359 # 360 no_shared_scalars(\@progs) if $SLOW; 361 push @progs_done, munge(shift @progs); 362 update $bar if Have_bar; 363} 364if ($SLOW) { cst $_ foreach @progs_done } 365 366my %w_args = (); 367if (defined $opt_output) { 368 my $fh = new IO::File ">$opt_output"; 369 die "cannot write to $opt_output\n" if not $fh; 370 %w_args = (OUTPUT => $fh); 371} 372XMLTV::write_data([ $encoding, $credits, \%ch, \@progs_done ], %w_args); 373exit(); 374 375# Take a programme, munge it and return a list of programmes (empty if 376# the programme was deleted). Uses the global $related to fiddle with 377# other programmes in the same clump. 378# 379sub munge( $ ) { 380# local $Log::TraceMessages::On = 1; 381 t 'munge() ENTRY'; 382 my @todo = (shift); 383 my @done; 384 t 'todo list initialized to: ' . d \@todo; 385 t 'done list initialized to: ' . d \@done; 386 t 'relatives of todo programme: ' . d relatives($related, $todo[0]); 387 388 # Special-case mungers for various programme types. Each of these 389 # should take a single programme and return: a reference to a list of 390 # programmes, if successful; undef, if the programme is to be left 391 # alone. Most special-case handlers will not break up a programme 392 # into several others, so the returned list will have only one 393 # element. 394 # 395 # A handler may modify the programme passed in iff it returns a list 396 # of munged programmes. 397 # 398 # Ones earlier in the list get the chance to run first, so in general 399 # things like splitting programmes (which may be relied on by other 400 # handlers) should come at the top and last-chance guesswork (such as 401 # parsing English text) at the bottom. 402 # 403 my @special_handlers = 404 ( 405 \&special_multipart, 406 \&special_category, 407 \&special_credits, 408 \&special_new_series, 409 \&special_year, 410 \&special_tv_movie, 411 \&special_teletext_subtitles, 412 \&special_useless, 413 414 # There are three handlers specific to Ananova / UK listings. I 415 # haven't yet decided what to do with them: should they be in this 416 # program and enabled with a special flag, or moved into the 417 # Ananova grabber? 418 # 419 # They haven't been ported to the new XMLTV.pm data structures, so 420 # leave them commented for now. 421 # 422 # \&special_radio4, 423 # \&special_split_title, 424 # \&special_film, 425 ); 426 427 PROG: while (@todo) { 428 my $prog = shift @todo; 429 t('running handlers for prog: ' . d($prog)); 430 my $prog_length; 431 if (defined $prog->{stop}) { 432 # Get the programme length in seconds. 433 my $delta = DateCalc($prog->{start}, $prog->{stop}); 434 $prog_length = Delta_Format($delta, 0, '%st') if defined $delta; 435 } 436 foreach (@special_handlers) { 437 t('running handler: ' . d($_)); 438 my $out = $_->($prog); 439 if (defined $out) { 440 t('gave new list of progs: ' . d($out)); 441 die "handler didn't return list of progs" 442 if ref($out) ne 'ARRAY'; 443 if ($SLOW) { cst $_ foreach @$out } 444 check_time_fits($prog, $out); 445 if ($SLOW) { cst $_ foreach @$out } 446 fix_clumps($prog, $out, $related); 447 foreach (@$out) { 448 cst $_ if $SLOW; 449 # Sanity check that length hasn't increased. 450 if (defined $_->{stop}) { 451 my $delta = DateCalc($_->{start}, $_->{stop}); 452 if (defined $prog_length) { 453 my $new_length = Delta_Format($delta, 0, '%st'); 454 if ($new_length > $prog_length) { 455 local $Log::TraceMessages::On = 1; 456 t 'original programme (after handlers run): ' . d $prog; 457 t 'split into: ' . d $out; 458 t 'offending result: ' . d $_; 459 t 'length of result: ' . d $new_length; 460 t 'length of original programme: ' . d $prog_length; 461 die 'split into programme longer than the original'; 462 } 463 } 464 } 465 } 466 push @todo, @$out; 467 next PROG; 468 } 469 t('gave undef'); 470 } 471 cst $prog if $SLOW; 472 t 'none of the handlers fired, finished with this prog'; 473 cst $prog if $SLOW; 474 push @done, $prog; 475 } 476 return @done; 477} 478 479# All the special handlers 480 481# special_category() 482# 483# Some descriptions have the last word as the category: 'blah blah 484# blah. Western' (or 'Western series'). Remove this to the <category> 485# element. 486# 487# Also look for magic words like 'news' or 'interview' and add them as 488# categories. This is mostly so that other handlers can then fire. 489# 490sub special_category( $ ) { 491 t 'special_category() ENTRY'; 492 my $p = shift; 493 my $changed = 0; 494 495 # First, non-destructively look for 'news' in title or desc. 496 foreach (qw(title desc)) { 497 foreach my $pair (grep { pair_ok($_) } @{$p->{$_}}) { 498 t "pair for $_: " . d $pair; 499 if ($pair->[0] =~ /\bnews/i) { 500 t 'matches "news"'; 501 502 if (hasp($p, 'category', sub { $_[0] =~ /\b(?:soap|drama|game show)\b/i })) { 503 t '...but clearly not a news programme'; 504 } 505 else { 506 $changed |= add($p, 'category', 'news'); 507 cst $p if $SLOW; 508 } 509 } 510 if ($pair->[0] =~ /\binterviews\b/i) { 511 t 'matches "interviews"'; 512 $changed |= add($p, 'category', 'talk'); 513 cst $p if $SLOW; 514 } 515 } 516 } 517 518 # Now try the last-word-of-desc munging. 519 my $replacement = sub( $$$$ ) { 520 my ($punct, $adj, $country, $genre) = @_; 521 $changed |= add($p, 'category', lc $genre); 522 if (length $adj or length $country) { 523 return "$punct $adj$country$genre"; 524 } 525 else { 526 $changed = 1; 527 return $punct; 528 } 529 }; 530 foreach (grep { pair_ok($_) } @{$p->{desc}}) { 531 # 'Western' -> '' 532 # 'Western series' -> '' 533 # 'Classic Western' -> 'Classic Western' 534 # etc. 535 # 536 $_->[0] =~ s/(^|\.|\?)\s* 537 (Classic\s+|Award-winning\s+|) 538 (Australian\s+|) 539 ([aA]dventure|[aA]nimation|[bB]iopic|[cC]hiller 540 |[cC]omedy|[dD]ocumentary|[dD]rama|[fF]antasy 541 |[hH]eadlines|[hH]ighlights|[hH]orror|[mM]agazine 542 |[mM]elodrama|[mM]usical|[mM]ystery|[oO]mnibus 543 |[qQ]uiz|[rR]omance|[sS]itcom|[tT]earjerker 544 |[tT]hriller|[wW]estern)\s*(?:series\s*|)$/$replacement->($1, $2, $3, $4)/xe; 545 } 546 547 if ($changed) { 548 t 'some categories found, programme: ' . d $p; 549 scrub_empty($p->{title}, $p->{desc}); 550 t 'after removing empty titles and descs, programme: ' . d $p; 551 return [ $p ]; 552 } 553 else { 554 return undef; 555 } 556} 557 558 559# special_multipart() 560# 561# Often TV listings contain several programmes stuffed into one entry, 562# which might have made sense for a printed guide to save space, but 563# is stupid for electronic data. This special handler looks at the 564# programme description and haphazardly attempts to split the 565# programme into its components. 566# 567# Parameters: a 'programme' hash reference 568# Returns: reference to list of sub-programmes, or undef if programme 569# was not split 570# 571# We find the title using bn(), in other words we look only at 572# the first title. Similarly we use only the first description. But 573# it should work. FIXME should split the secondary title as well! 574# 575sub special_multipart( $ ) { 576# local $Log::TraceMessages::On = 1; 577 die "usage: special_multipart(hashref of programme details)" 578 if @_ != 1; 579 my $p = shift; 580 cst $p if $SLOW; 581 t 'special_multipart() ENTRY'; 582 t 'checking programme descs: ' . d $p->{desc}; 583 my $best_desc = bn($p->{desc}); 584 t 'got best desc: ' . d $best_desc; 585 return undef if not $best_desc; 586 my ($desc, $desc_lang) = @$best_desc; 587 t 'testing description for multipart: ' . d $desc; 588 local $_ = $desc; 589 my @words = split; 590 my @poss_times = split /[ ,;-]/; 591 my @r; 592 my ($p_start, $p_stop) = (pd($p->{start}), pd($p->{stop})); 593 # Assume that the timezone for every time listed in the 594 # description is the same as the timezone for the programme's 595 # start. FIXME will fail when timezone changes partway through. 596 # 597 my $tz = gettz($p->{start}); 598 599 my $day; 600 if (defined $tz) { 601 # Find the base day taking into account timezones. Eg if a 602 # programme starts at 00:45 BST on the 20th and then lists 603 # times as '01:00' etc, the base date for these times is the 604 # 20th, even though the real start time is 23:45 UTC on the 605 # 19th. 606 # 607 $day = pd(UnixDate(Date_ConvTZ($p_start, 'UTC', offset_to_gmt($tz)), '%Q')); 608 } 609 else { 610 $day = pd(UnixDate($p_start, '%q')); 611 } 612 t "day is $day"; 613 # FIXME won't be correct when split programme spans days. 614 615 # Sanity check for a time, that it is within the main programme's 616 # timespan. 617 # 618 my $within_time_period = sub { 619 my $t = shift; 620 t("checking whether $t is in time period $p_start.." 621 . (defined $p_stop ? $p_stop : '')); 622 if (Date_Cmp($t, $p_start) < 0) { 623 # Before start of programme, that makes no sense. 624 return 0; 625 } 626 if (defined $p_stop and Date_Cmp($p_stop, $t) < 0) { 627 # After end of programme, likewise. 628 return 0; 629 } 630 return 1; 631 }; 632 633 # Three different ways of interpreting a time. Return undef if 634 # not valid under that system, a 24 hour hh:mm otherwise. 635 # 636 # FIXME doesn't handle multiparts bridging noon or midnight. 637 # 638 my $as_12h_am = sub { 639 my $w = shift; 640 $w =~ s/[,;.]$//; 641 t "trying $w as 12 hour am time"; 642 clocks_poss($w)->[0] || return undef; 643 return time12to24("$w am"); 644 }; 645 my $as_12h_pm = sub { 646 my $w = shift; 647 $w =~ s/[,;.]$//; 648 t "trying $w as 12 hour pm time"; 649 clocks_poss($w)->[0] || return undef; 650 return time12to24("$w pm"); 651 }; 652 my $as_24h = sub { 653 my $w = shift; 654 $w =~ s/[,;.]$//; 655 t "trying $w as 24 hour time"; 656 clocks_poss($w)->[1] || return undef; 657 $w =~ tr/./:/; 658 return $w; 659 }; 660 661 if (defined $tz) { t "using timezone $tz for interpreting times" } 662 else { t "interpreting times with no timezone (ie UTC)" } 663 664 my ($best_interp, $best_count, 665 $best_first_word_is_time, $best_including_at_time); 666 INTERP: foreach my $interp ($as_24h, $as_12h_am, $as_12h_pm) { 667 t 'testing an interpretation of times'; 668 my $count = 0; 669 my $first_word_is_time = 0; 670 my $including_at_time = 0; 671 my $prev; 672 for (my $pos = 0; $pos < @poss_times; $pos++) { 673 t "testing word $poss_times[$pos] at position $pos"; 674 my $w = $poss_times[$pos]; 675 t "word is '$w'"; 676 my $i = $interp->($w); 677 if (not defined $i) { 678 t "doesn't parse to a time with this interp."; 679 next; 680 } 681 warn "bad 24h returned time: $i" unless $i =~ /^\d?\d:\d\d$/; 682 t "found a time that interprets: $i"; 683 my $t = Date_SetTime($day, $i); 684 die if not defined $t; 685 t "taken as day $day, gets time $t"; 686 $t = Date_ConvTZ($t, offset_to_gmt($tz), 'UTC') if defined $tz; 687 t "after converting to UTC, $t"; 688 if (not $within_time_period->($t)) { 689 # Obviously wrong. One bad time is enough to abandon 690 # this whole interpretation and try another. 691 # 692 t "not within time period, whole interpretation wrong"; 693 next INTERP; 694 } 695 # Don't insist that times be in order, this isn't the case 696 # for all listings (eg 'News at 0700 and 0730; Weather at 697 # 0715'). 698 # 699 700 $prev = $t; 701 ++ $count; 702 if ($pos == 0) { 703 $first_word_is_time = 1; 704 } 705 if ($pos >= 2 706 and $poss_times[$pos - 2] =~ /^[Ii]ncluding$/ 707 and $poss_times[$pos - 1] eq 'at') { 708 $including_at_time = 1; 709 t 'previous words are "including at", setting $including_at_time true'; 710 } 711 } 712 t "found $count matching times and nothing badly wrong"; 713 714 if (not defined $best_interp 715 or $count > $best_count) { 716 t 'best so far'; 717 $best_interp = $interp; 718 $best_count = $count; 719 $best_first_word_is_time = $first_word_is_time; 720 $best_including_at_time = $including_at_time; 721 } 722 } 723 724 if (defined $best_interp) { 725 t "best result found: count $best_count"; 726 t "first word? $best_first_word_is_time"; 727 t "best includes 'at time'? $best_including_at_time"; 728 } 729 else { 730 t "couldn't find any interpretation that worked at all"; 731 } 732 733 # Heuristic. We require at least three valid times to split - or 734 # when the programme description begins with a time, that's also 735 # good enough. Also when the description contains 'including at' 736 # followed by a time. 737 # 738 return undef if not defined $best_interp; 739 return undef unless ($best_count >= 3 740 or $best_first_word_is_time 741 or $best_including_at_time); 742 743# local $Log::TraceMessages::On = 1; 744 t 'looks reasonable, proceed'; 745 746 t 'calling multipart_split_desc() with words and interpretation fn'; 747 my $split = multipart_split_desc(\@words, $best_interp); 748 t 'got result from multipart_split_desc(): ' . d $split; 749 die if not defined $split->[0]; 750 die if not defined $split->[2]; 751 our @pps; local *pps = $split->[0]; 752 t 'got list of pseudo-programmes: ' . d \@pps; 753 if (not @pps) { 754 warn "programme looked like a multipart, but couldn't grok it"; 755 return undef; 756 } 757 if (@pps == 1) { 758 # Didn't really split, perhaps it wasn't a multipart. 759 t 'split into only one, leave unchanged'; 760 return undef; 761 } 762 763 foreach (@pps) { 764 die if not defined; 765 die if not keys %$_; 766 } 767 my $common = $split->[1]; 768 our @errors; local *errors = $split->[2]; 769 770 # We split the first description, and only after checking it did 771 # look like a plausible multipart. So if anything went wrong we 772 # should warn about it. 773 # 774 foreach (@errors) { 775 warn $_; 776 } 777 778 # What was returned is a list of pseudo-programmes, these have 779 # main_desc instead of real [text, lang] descriptions, and hh:mm 780 # 'time' instead of real start time+date. 781 # 782 # At most one of them is allowed to have time undefined; this is 783 # the 'rump' of the parent programme. Whether such a rump exists 784 # depends on what kind of splitting was done. 785 # 786 my $seen_rump = 0; 787 foreach (@pps) { 788 my $time = delete $_->{time}; 789 die if not defined $time and $seen_rump++; 790 if (defined $time) { 791 my $start = Date_SetTime($day, $time); 792 die if not defined $start; 793 $start = Date_ConvTZ($start, offset_to_gmt($tz), 'UTC') if defined $tz; 794 if (Date_Cmp($start, $p->{start}) < 0) { 795 my $dump = dump_pseudo_programme($_); 796 die "subprogramme ($dump, has 'time' $time) " 797 . "starts before main programme ($p->{start}, $p->{title}->[0]->[0])"; 798 } 799 if (defined $p->{stop} and Date_Cmp($p->{stop}, $start) < 0) { 800 my $dump = dump_pseudo_programme($_); 801 die "subprogramme ($dump, has 'time' $time) starts after main one stops"; 802 } 803 804 # Now we store the time in the official 'start' key. But 805 # convert back to the original timezone to look nice. 806 # 807 if (defined $tz) { 808 $_->{start} = ud(Date_ConvTZ($start, 'UTC', offset_to_gmt($tz))) . " $tz"; 809 } 810 else { 811 $_->{start} = ud($start); 812 } 813 } 814 else { 815 $_->{start} = $p->{start}; 816 } 817 818 if (not defined $_->{main_title}) { 819 # A title is needed, normally splitting will find one, but 820 # in case it didn't... 821 # 822 $_->{title} = $p->{title}; 823 } 824 825 # Now deal with each of the main_X fields turning them into 826 # real X. 827 # 828 foreach my $key (qw(desc title sub-title)) { 829 my $v = delete $_->{"main_$key"}; 830 next if not defined $v; 831 $_->{$key} = [ [ $v, $desc_lang ] ]; 832 } 833 834 if (defined $common) { 835 # Add the common text to this programme. So far it has at 836 # most one description in language $desc_lang. 837 # 838 for ($_->{desc}->[0]->[0]) { 839 if (defined and length) { 840 $_ .= '. ' if $_ !~ /[.?!]\s*$/; 841 $_ .= " $common"; 842 } 843 else { 844 $_ = $common; 845 } 846 } 847 $_->{desc}->[0]->[1] = $desc_lang; 848 } 849 850 $_->{channel} = $p->{channel}; 851 t "set channel of split programme to $_->{channel}"; 852 } 853 854 # The last subprogramme should stop at the same time as the 855 # multipart programme stopped. 856 # 857 if (defined $p->{stop}) { 858 t "setting stop time of last subprog to stop time of main prog ($p->{stop})"; 859 set_stop_time($pps[-1], $p->{stop}); 860 } 861 else { t 'main prog had no stop time, not adding to last subprog' } 862 863 # And similarly, the first should start at the same time as the 864 # multipart programme. Add a dummy prog to fill the gap if 865 # necessary. 866 # 867 my $first_sub_start = $pps[0]->{start}; 868 my $cmp = Date_Cmp(pd($first_sub_start), $p_start); 869 if ($cmp < 0) { 870 # Should have caught this already. 871 die 'first subprogramme starts _before_ main programme'; 872 } 873 elsif ($cmp == 0) { 874 # Okay. 875 } 876 elsif ($cmp > 0) { 877 my $dummy = { title => $p->{title}, 878 channel => $p->{channel}, 879 start => $p->{start}, 880 stop => $first_sub_start }; 881 t 'inserting dummy subprogramme: ' . d $dummy; 882 cst $dummy if $SLOW; 883 unshift @pps, $dummy; 884 } 885 else { die } 886 887 if ($SLOW) { cst $_ foreach @pps } 888 scrub_empty($_->{title}, $_->{"sub-title"}, $_->{desc}) foreach @pps; 889 t 'returning new list of programmes: ' . d \@pps; 890 return \@pps; 891} 892# Given a programme description split into a list of words, and a 893# subroutine to interpret times, return a list of the subprogrammes 894# (assuming it is a multipart). 895# 896# Returns [pps, common, errs] where pps is a list of 'pseudo-programmes', 897# hashes containing some of: 898# 899# time: 24h time within the main programme's day, 900# main_title, main_desc, main_sub-title: text in the same language as 901# the desc passed in, 902# 903# and where common is text which belongs to the description of every 904# subprogramme, and errs is a list of errors found (probably quite 905# large if the description was not multipart). 906# 907sub multipart_split_desc( $$ ) { 908 our @words; local *words = shift; 909 my $interp = shift; 910 911 # We need to decide what style of multipart listing this is. 912 # There's the kind that has time - title - description for each 913 # subprogramme. There's the kind that has 'News at time0, time1, 914 # time2; Weather at time3, time4'. And then something more like a 915 # normal English sentence, which of course is the hardest to 916 # parse. We use some heuristics to work out which it is and call 917 # the appropriate 'parsing' routine. 918 # 919 t "testing for 'Including at'"; 920 foreach my $i (0 .. $#words - 1) { 921 t "looking at pos $i, word is $words[$i]"; 922 if ($words[$i] =~ /^[Ii]ncluding$/ and $words[$i + 1] eq 'at') { 923 t 'yup, calling multipart_split_desc_including_at()'; 924 return multipart_split_desc_including_at(\@words, $interp); 925 } 926 } 927 928 t "testing for 'With X at T0, T1; ...'"; 929 if (@words >= 4 and $words[0] =~ /^with$/i) { 930 my $first_lc_word; 931 foreach (@words) { 932 if (not tr/[A-Z]//) { 933 $first_lc_word = $_; 934 last; 935 } 936 } 937 if (defined $first_lc_word and $first_lc_word eq 'at') { 938 return multipart_split_desc_rt(\@words, $interp); 939 } 940 } 941 942 t "looking for two times in a row, or separated only by 'and'"; 943 my $prev_was_time = 0; 944 foreach (@words) { 945 if (defined $interp->($_)) { 946 # Found a time. 947 if ($prev_was_time) { 948 t 'found two times in a row, using multipart_split_desc_simple()'; 949 return multipart_split_desc_simple(\@words, $interp); 950 } 951 $prev_was_time = 1; 952 } 953 elsif ($_ eq 'and') { 954 # Skip. 955 } 956 else { 957 $prev_was_time = 0; 958 } 959 } 960 961 t "looking for pairs of times 'from-to'"; 962 foreach (@words) { 963 if (/^([0-9.:]+)-([0-9.:]+)$/) { 964 my ($from, $to) = ($1, $2); 965 if (defined $interp->($from) and defined $interp->($to)) { 966 return multipart_split_desc_fromto(\@words, $interp); 967 } 968 } 969 } 970 971 t "must be old style of 'time title. description'"; 972 return multipart_split_desc_ananova(\@words, $interp); 973} 974# And these routines handle the different styles. 975sub multipart_split_desc_ananova( $$ ) { 976 our @words; local *words = shift; 977 my $interp = shift; 978 my @r; 979 my @errors; 980 981 # First extract any 'common text' at the start of the programme, 982 # before any sub-programmes. 983 # 984 my $common; 985 while (@words) { 986 my $first = shift @words; 987 if (defined $interp->($first)) { 988 unshift @words, $first; 989 last; 990 } 991 if (defined $common and length $common) { 992 $common .= " $first"; 993 } 994 else { 995 $common = $first; 996 } 997 } 998 t 'common text: ' . d $common; 999 1000 while (@words > 1) { # At least one thing after the time 1001 my $time = shift @words; 1002 my $i = $interp->($time); 1003 if (defined $i) { 1004 my (@title_words, @desc_words); 1005 1006 # Build up a current 'pseudo-programme' with title, 1007 # description and time. It's up to our caller to 1008 # manipulate these simple data structures into real 1009 # programmes. 1010 # 1011 my $curr_pp; 1012 $curr_pp->{time} = $i; 1013 my $done_title = 0; 1014 my @words_orig = @words; 1015 while (@words) { 1016 my $word = shift @words; 1017 1018 if (defined $interp->($word)) { 1019 # Finished this bit of multipart. 1020 unshift @words, $word; 1021 last; 1022 } 1023 elsif (not $done_title) { 1024 if ($word =~ s/[.]$// or $word =~ s/([!?])$/$1/) { 1025 # Finished the title, move on to description. 1026 $done_title = 1; 1027 } 1028 push @title_words, $word; 1029 } 1030 else { 1031 push @desc_words, $word; 1032 } 1033 } 1034 if (not @title_words) { 1035 warn "trouble finding title in multipart"; 1036 if (not @desc_words) { 1037 warn "cannot find title or description in multipart"; 1038 @title_words = ('???'); 1039 } 1040 else { 1041 # Use the description so far as the title. 1042 if ($desc_words[-1] eq 'at') { 1043 pop @desc_words; 1044 } 1045 @title_words = @desc_words; 1046 @desc_words = (); 1047 } 1048 } 1049 1050 # The title sometimes looks like 'History in Action: Women 1051 # in the 20th Century'; this should be broken into main 1052 # title and secondary title. But not 'GNVQ: Is It For You 1053 # 2'. So arbitrarily we check that the main title has at 1054 # least two words. 1055 # 1056 if (@title_words) { 1057 my (@main_title_words, @sub_title_words); 1058 1059 while (@title_words) { 1060 my $word = shift @title_words; 1061 my $main_title_length = @main_title_words + 1; 1062 1063 # Split at colon, sometimes 1064 if ((not $opt_no_create_sub_titles) 1065 and $main_title_length >= 2 and $word =~ s/:$//) { 1066 push @main_title_words, $word; 1067 @sub_title_words = @title_words; 1068 last; 1069 } 1070 else { 1071 push @main_title_words, $word; 1072 } 1073 } 1074 1075 $curr_pp->{main_title} = join(' ', @main_title_words); 1076 $curr_pp->{'main_sub-title'} = join(' ', @sub_title_words) 1077 if @sub_title_words; 1078 } 1079 1080 $curr_pp->{main_desc} = join(' ', @desc_words) if @desc_words; 1081 t 'built sub-programme: ' . d $curr_pp; 1082 push @r, $curr_pp; 1083 } 1084 else { 1085 push @errors, "expected time in multipart description, got $time"; 1086 # Add it to the previous programme, so it doesn't get lost 1087 if (@r) { 1088 my $prev = $r[-1]; 1089 $prev->{main_desc} = '' if not defined $prev->{main_desc}; 1090 $prev->{main_desc} .= $time; 1091 } 1092 else { 1093 # Cannot happen. If @r is empty, this must be the 1094 # first word. 1095 # 1096 warn 'first word of desc is not time, but checked this before'; 1097 # Not worthy of @errors, this is a bug in the code. 1098 } 1099 } 1100 } 1101 foreach (@r) { 1102 die if not keys %$_; 1103 die if not defined $_->{main_title}; 1104 } 1105 t 'returning list of pseudo-programmes: ' . d \@r; 1106 t '...and common text: ' . d $common; 1107 t '...and errors: ' . d \@errors; 1108 return [\@r, $common, \@errors]; 1109} 1110sub multipart_split_desc_rt( $$ ) { 1111 our @words; local *words = shift; 1112 my $interp = shift; 1113 my @errors; 1114 1115 my $with = shift @words; 1116 die if not defined $with; 1117 die if $with !~ /^with$/i; 1118 1119 my @got; 1120 my @title = (); 1121 my @times = (); 1122 my $done_title = 0; 1123 while (@words) { 1124 my $w = shift @words; 1125 if ($w eq 'at') { 1126 $done_title = 1; 1127 next; 1128 } 1129 1130 my $i = $interp->($w); 1131 if (defined $i) { 1132 # It's a time. 1133 if (not $done_title) { 1134 warn "unexpected time $w in multipart description, before 'at'"; 1135 push @errors, $w; 1136 } 1137 else { 1138 push @times, $i; 1139 } 1140 1141 if ($w =~ /[.;]$/) { 1142 # End of the list of times for this programme. 1143 push @got, [ [ @title ], [ @times ] ]; 1144 @title = (); 1145 @times = (); 1146 $done_title = 0; 1147 } 1148 elsif ($w =~ /,$/) { 1149 # List continues. 1150 } 1151 else { 1152 warn "strange time $w"; 1153 } 1154 1155 next; 1156 } 1157 1158 # Not a time, should be part of the title. 1159 if ($done_title) { 1160 warn "strange word $w in multipart description, expected a time"; 1161 push @errors, $w; 1162 } 1163 else { 1164 push @title, $w; 1165 } 1166 } 1167 1168 my @r; 1169 foreach (@got) { 1170 my ($title, $times) = @$_; 1171 foreach (@$times) { 1172 push @r, { main_title => join(' ', @$title), time => $_ }; 1173 } 1174 } 1175 1176 # There is no 'common text' with this splitter. 1177 return [\@r, undef, \@errors]; 1178} 1179# Split the programme by looking for times, but each new programme has 1180# the same words (except times). 1181# 1182sub multipart_split_desc_simple( $$ ) { 1183 our @words; local *words = shift; 1184 my $interp = shift; 1185 1186 my @common; 1187 my @times; 1188 foreach (@words) { 1189 die if not defined; 1190 my $i = $interp->($_); 1191 if (defined $i) { 1192 push @times, $i; 1193 if (@common and ($common[-1] eq 'at' or $common[-1] eq 'and')) { 1194 pop @common; 1195 } 1196 } 1197 else { 1198 push @common, $_; 1199 } 1200 } 1201 1202 my @r; 1203 foreach (@times) { 1204 die if not defined; 1205 push @r, { time => $_ }; 1206 } 1207 1208 # No 'errors' but lots of 'common text'. 1209 return [ \@r, join(' ', @common), [] ]; 1210} 1211sub multipart_split_desc_fromto( $$ ) { 1212 our @words; local *words = shift; 1213 my $interp = shift; 1214 my @r; 1215 my @errors; 1216 1217 # This routine is limited a bit because it's expected to return 1218 # hashes with just 'time'. But we know more than that, we know 1219 # both start time and stop time for each subprogramme. That 1220 # information would be thrown away. 1221 # 1222 # For now, it seems that this kind of multipart programme always 1223 # has one part beginning when the previous one ended, so we can 1224 # just check that this property holds. Then there will be no loss 1225 # of stop-time information. 1226 # 1227 my ($last_start, $last_stop); 1228 my @title = (); 1229 my $done_title = 0; 1230 my @desc = (); 1231 foreach (@words) { 1232 if (/^([0-9.:]+)-([0-9.:]+)$/ 1233 and defined(my $istart = $interp->($1)) 1234 and defined(my $istop = $interp->($2))) { 1235 # It's a pair of times. 1236 if (defined $last_start) { 1237 # Deal with the previous subprogramme. 1238 warn "mismatch between stop time $last_stop and start time $istart" 1239 if $last_stop ne $istart; 1240 my %p = ( time => $last_start, main_title => join(' ', @title) ); 1241 $p{main_desc} = join(' ', @desc) if @desc; 1242 push @r, \%p; 1243 } 1244 ($last_start, $last_stop) = ($istart, $istop); 1245 @title = (); 1246 $done_title = 0; 1247 @desc = (); 1248 } 1249 elsif (/:$/) { 1250 # A colon ends the title. 1251 if (not $done_title) { 1252 (my $tmp = $_) =~ s/:$//; 1253 push @title, $tmp; 1254 $done_title = 1; 1255 } 1256 else { 1257 warn "seen colon in description: '$_'"; 1258 push @desc, $_; 1259 } 1260 } 1261 elsif ($_ eq 'with') { 1262 # Also 'with' can end a title, as in 'News with...'. This 1263 # is probably the only time I've seen a use for the 1264 # convention that words in titles should be capitalized. 1265 # 1266 # The 'with' stuff goes into the description, where some 1267 # other handler can pick it up. 1268 # 1269 $done_title = 1; 1270 push @desc, $_; 1271 } 1272 else { 1273 if (not $done_title) { 1274 push @title, $_; 1275 } 1276 else { 1277 push @desc, $_; 1278 } 1279 } 1280 } 1281 if (defined $last_start) { 1282 my %p = ( time => $last_start, main_title => join(' ', @title) ); 1283 $p{main_desc} = join(' ', @desc) if @desc; 1284 push @r, \%p; 1285 } 1286 1287 return [ \@r, undef, [] ]; 1288} 1289# Really an 'including at' programme should be sandwiched in the 1290# middle of its parent, but the format doesn't allow that so for 1291# simplicity we treat as a multipart. 1292# 1293sub multipart_split_desc_including_at( $$ ) { 1294 our @words; local *words = shift; 1295 my $interp = shift; 1296 my @r; 1297 my @rump; 1298 1299 while (@words) { 1300 my $t; 1301 if (@words >= 4 1302 and $words[0] =~ /^[Ii]ncluding$/ 1303 and $words[1] eq 'at' 1304 and defined ($t = $interp->($words[2])) 1305 and $words[3] =~ /^[A-Z]/) { 1306 shift @words; shift @words; shift @words; 1307 my @title; 1308 while (@words and $words[0] =~ /^[A-Z]/) { 1309 my $w = shift @words; 1310 if ($w =~ s/[.,;]$//) { 1311 push @title, $w; 1312 last; 1313 } 1314 else { 1315 push @title, $w; 1316 } 1317 } 1318 push @r, { time => $t, main_title => join(' ', @title) }; 1319 } 1320 else { 1321 push @rump, shift @words; 1322 } 1323 } 1324 1325 unshift @r, { main_desc => join(' ', @rump) }; 1326 return [ \@r, '', [] ]; 1327} 1328# Is a time string using the 12 hour or 24 hour clock? Returns a pair 1329# of two booleans: the first means it could be 12h, the seecond that 1330# it could be 24h. Expects an h.mm or hh.mm time string. If the 1331# string is not a valid time under either clock, returns [0, 0]. 1332# 1333# Allows eg '5.30' to be a 24 hour time (05:30). 1334# 1335sub clocks_poss( $ ) { 1336 local $_ = shift; 1337 if (not /^(\d\d?)\.(\d\d)$/) { 1338 return [0, 0]; 1339 } 1340 my ($hh, $mm) = ($1, $2); 1341 return [0, 0] if $mm > 59; 1342 return [0, 1] if $hh =~ /^0/; 1343 return [1, 1] if 1 <= $hh && $hh < 13; 1344 return [0, 1] if 13 <= $hh && $hh < 24; 1345 1346 # Do not accept '24:00', '24:01' etc - not until it's proved we 1347 # need to. 1348 # 1349 return [0, 0]; 1350} 1351# Debugging stringification. 1352sub dump_pseudo_programme( $ ) { 1353 my @r; 1354 my $pp = shift; 1355 foreach (qw(time main_title main_desc)) { 1356 push @r, $pp->{$_} if defined $pp->{$_}; 1357 } 1358 return join(' ', @r); 1359} 1360 1361 1362# time12to24() 1363# 1364# Convert a 12 hour time string to a 24 hour one, without anything too 1365# fancy. In particular the timezone is passed through unchanged. 1366# 1367sub time12to24( $ ) { 1368 die 'usage: time12to24(12 hour time string)' if @_ != 1; 1369 local $_ = shift; 1370 die if not defined; 1371 1372 # Remove the timezone and stick it back on afterwards. 1373 my $tz = gettz($_); 1374 s/\Q$tz\E// if defined $tz; 1375 1376 s/\s+//; 1377 my ($hours, $mins, $ampm) = /^(\d\d?)[.:]?(\d\d)\s*(am|pm)$/ 1378 or die "bad time $_"; 1379 if ($ampm eq 'am') { 1380 if (1 <= $hours and $hours < 12) { 1381 $hours = $hours; # 5am = 05:00 1382 } 1383 elsif ($hours == 12) { 1384 $hours = 0; # 12am = 00:00 1385 } 1386 else { die "bad number of hours $hours" } 1387 } 1388 elsif ($ampm eq 'pm') { 1389 if ($hours == 12) { 1390 $hours = 12; # 12pm = 12:00 1391 } 1392 elsif (1 <= $hours and $hours < 12) { 1393 $hours = 12 + $hours; # 5pm = 17:00 1394 } 1395 else { die "bad number of hours $hours" } 1396 } 1397 else { die } 1398 1399 my $r = sprintf('%02d:%02d', $hours, $mins); 1400 $r .= " $tz" if defined $tz; 1401 return $r; 1402} 1403 1404 1405 1406 1407# special_credits() 1408# 1409# Try to sniff out presenter, actor or guest info from the start of the 1410# description and put it into the credits section instead. 1411# 1412# Parameters: one programme (hashref) 1413# Returns: [ modified programme ], or undef 1414# 1415# May modify the programme passed in, if return value is not undef. 1416# But that's okay for a special-case handler. 1417# 1418sub special_credits( $ ) { 1419# local $Log::TraceMessages::On = 1; 1420 die 'usage: special_credits(programme hashref)' if @_ != 1; 1421 my $prog = shift; 1422 t 'special_credits() ENTRY'; 1423 1424 # Caution: we need to make sure $_ is 'live' so updates to it 1425 # change the programme, when calling the extractors. 1426 # 1427 foreach my $pair (grep { pair_ok($_) } @{$prog->{desc}}) { 1428 die if not defined; 1429 t "testing desc: $pair->[0]"; 1430 if (not length $pair->[0]) { 1431 local $Log::TraceMessages::On = 1; 1432 t 'programme with empty desc:' . d $prog; 1433 } 1434 1435 if (s/\b([pP])resenteed\b/$1resented/g) { 1436 t "fixing spelling mistake!"; 1437 return [ $prog ]; 1438 } 1439 1440 # Regexps to apply to the description (currently only the 1441 # first English-language description is matched). The first 1442 # element is a subroutine which should alter $_ and return a 1443 # name or string of names if it succeeds, undef if it fails to 1444 # match. 1445 # 1446 # The first argument of the subroutine is the programme 1447 # itself, but this usually isn't used. In any case, it should 1448 # not be modified except by altering $_. 1449 # 1450 my @extractors = 1451 ( 1452 # Definitely presenter 1453 [ sub { 1454 s{(\b[a-z]\w+\s+)(?:(?:guest|virtual|new\s+)?presenters?)\s+($NAMES_RE)}{$1$2}o 1455 && return $2; 1456 s{((?:^|\.|\?)\s*)($NAMES_RE)\s+(?:(?:presents?)|(?:rounds?\s+up)|(?:introduces?))\b\s*(\.|,|\w|\Z)} 1457 {$1 . uc $3}oe 1458 && return $2; 1459 s{Presenters?\s+($NAMES_RE)}{$1}o 1460 && return $1; 1461 s{,?\s*[cC]o-?presenters?\s+($NAMES_RE)}{}o 1462 && return $1; 1463 s{,?\s*[pP]resented by\s+($NAMES_RE)\b\s*(.|,?\s+\w|\Z)}{uc $2}oe 1464 && return $1; 1465 s{^\s*([hH]eadlines?(?:\s+\S+)?),?\s*[wW]ith\s+($NAMES_RE)\b(?:\.\s*)?}{$1}o 1466 && return $2; 1467 s{,?\s*(?:[iI]ntroduced|[cC]haired)\s+by\s+($NAMES_RE)(?:\.\s*)?}{}o 1468 && return $1; 1469 1470 # This last one is special: it adds 'Last in series' 1471 # which some other handler might pick up. 1472 # 1473 s{((?:^|\.|\?)\s*)($NAMES_RE)\s+concludes?\s+the\s+series\b\s*(?:with\b\s*)?(\.|,|\w|\Z)} 1474 {$1 . 'Last in series. ' . uc $3}oe 1475 && return $2; 1476 1477 return undef; 1478 }, 'presenter' ], 1479 # Leave 'virtual presenter', 'aquatic presenter', 1480 # 'new presenter' alone for now 1481 # 1482 1483 # Might be presenter depending on type of show 1484 [ sub { 1485 if (hasp($_[0], 'category', 1486 sub { $_[0] =~ /\b(?:comedy|drama|childrens?)\b/i }) 1487 and not $prog->{credits}->{presenter}) { 1488 return undef; 1489 } 1490 s{^\s*,?\s*[wW]ith\s+($NAMES_RE)\b(?:(?:\.\s*)?$)?}{}o 1491 && return $1; 1492 s{^\s*(?:[hH]ost\s+)?($NAME_RE) (?:introduces|conducts) (\w)(?![^.,;:!?]*\bto\b)} {uc $2}oe 1493 && return $1; 1494 s{^\s*(?:[hH]ost\s+)?($NAME_RE)\s+(?:explores|examines)\s*}{}o 1495 && return $1; 1496 return undef; 1497 }, 'presenter' ], 1498 1499 [ sub { 1500 s{((?:^|\.|\?)\s*)($NAME_RE)\s+interviews\b\s*(\.|,|\w|\Z)}{$1 . uc $3}oe 1501 && return $2; 1502 return undef; 1503 }, 'presenter' ], # FIXME should be 'host' or 'interviewer' 1504 1505 # 'with' in quiz shows is guest (maybe) 1506 [ sub { 1507 return undef unless hasp($_[0], 'category', 1508 sub { $_[0] =~ /\b(?:quiz|sports?)\b/i }); 1509 s{((?:^|,|\.|\?)\s*)[wW]ith\s*($NAMES_RE)\b(?!\s+among)(\.\s*\S)} 1510 {$1 ne ',' ? "$1$2" : $2}oe 1511 && return $2; 1512 s{((?:^|,|\.|\?)\s*)[wW]ith\s*($NAMES_RE)\b(?!\s+among)(?:\.\s*$)?} 1513 {$1 ne ',' ? $1 : ''}oe 1514 && return $2; 1515 return undef; 1516 }, 'guest' ], 1517 1518 # 'with' in news/children shows is presenter (equally 1519 # dubious). Also a 'with' in a talk show might be 1520 # presenter or might be guest, but at least we know it's 1521 # not actor. 1522 # 1523 [ sub { 1524 return undef 1525 unless hasp($_[0], 'category', 1526 sub { $_[0] =~ /\b(?:news|business|economics?|political|factual|talk|childrens?|game show)\b/i }); 1527 s{(?:^|,|\.|\?)\s*[wW]ith\s*($NAMES_RE)\b(?:\.\s*)?}{}o && return $1; 1528 return undef; 1529 }, 'presenter' ], 1530 1531 [ sub { 1532 # Anything with a 'presenter' does not have actors. 1533 return undef if $prog->{credits}->{presenter}; 1534 s{(?:[Ww]ith\s+)?[gG]uest\s+star\s+($NAMES_RE)\b\s*[,;.]?\s*}{}o 1535 && return $1; 1536 s{^($NAMES_RE) (?:co-)?stars? in\s+(?:this\s+)?}{uc $2}oe 1537 && return $1; 1538 s{\s*($NAMES_RE) (?:co-)?stars?\.?\s*$}{}o 1539 && return $1; 1540 s{(?:^|\.|\?)\s*($NAMES_RE)\s+(?:co-)?stars?\s+as\s*$}{}o 1541 && return $1; 1542 return undef; 1543 }, 'actor' ], 1544 1545 [ sub { 1546 # A discussion of 'a film starring Robin Williams' 1547 # does not itself feature that actor. 1548 # 1549 return undef if $prog->{credits}->{presenter}; 1550 return undef if hasp($_[0], 'category', sub { $_[0] =~ /\barts\b/i }); 1551 s{(?:^|,|\.|\?)\s*[wW]ith\s*($NAMES_RE)\b(?:,|\.|;|$)?}{}o 1552 && return $1; 1553 s{,?\s*(?:(?:[Aa]lso|[Aa]nd)\s+)?(?:[Cc]o-|[Gg]uest-|[Gg]uest\s+)?[Ss]tarring\s+($NAMES_RE)\s*$}{}o 1554 && return $1; 1555 return undef; 1556 }, 'actor' ], 1557 1558 [ sub { 1559 s{,?\s*[wW]ith\s+guests?\s+($NAMES_RE)\b(?:\.\s*)?}{}o 1560 && return $1; 1561 s{((?:^|\.|!|\?)\s*)($NAME_RE)\s+guests(?:$|(?:\s+)|(?:.\s*))}{$1}o 1562 && return $2; 1563 return undef; 1564 }, 'guest' ], 1565 1566 [ sub { 1567 s{(?:^|\.|!|\?|,)(?:[Ww]ritten\s+)?\s*by\s+($NAMES_RE)\b($|\.)}{$2}o 1568 && return $1; 1569 return undef; 1570 }, 'writer' ], 1571 ); 1572 1573 # Run our own little hog-butchering algorithm to match each of 1574 # the subroutines in turn. 1575 # 1576 my $matched = 0; 1577 EXTRACTORS: foreach my $e (@extractors) { 1578 my ($sub, $person) = @$e; 1579 t "running extractor for role $person"; 1580 my $old_length = length $pair->[0]; 1581 my $match; 1582 for ($pair->[0]) { $match = $sub->($prog) } 1583 if (defined $match) { 1584 # Found one or more $person called $match. We add them to 1585 # the list unless they're already in there. We use a 1586 # per-programme cache of this information to avoid 1587 # going through the list each time (basically because 1588 # hashes are more Perlish). 1589 # 1590 t "got list of people: $match"; 1591 my @names = list_names($match); 1592 t 'that is, names: ' . d \@names; 1593 t 'by shortening desc, programme updated to: ' . d $prog; 1594 for my $credits ($prog->{credits}) { 1595 my %seen; 1596 if (lc $person eq 'guest') { 1597 # Impossible for someone to be guest as well 1598 # as another part, so don't add it if already 1599 # listed anywhere. 1600 # 1601 foreach (keys %$credits) { 1602 $seen{$_}++ foreach @{$credits->{$_}}; 1603 } 1604 } 1605 else { 1606 # Cannot add this person if they are already 1607 # given in the same job, or as a guest. 1608 # 1609 foreach (@{$credits->{$person}}, @{$credits->{guest}}) { 1610 $seen{$_}++ && warn "person $_ seen twice"; 1611 } 1612 } 1613 1614 t 'people already known (or ineligible): ' . d \%seen; 1615 foreach (@names) { 1616 t "maybe adding $_ as a $person"; 1617 push @{$credits->{$person}}, $_ unless $seen{$_}++; 1618 } 1619 t '$credits->{$person}=' . d $credits->{$person}; 1620 } 1621 1622 if (length $pair->[0] >= $old_length) { 1623 warn "extractor failed to shorten text: now $pair->[0]"; 1624 } 1625 1626 t 'by adding people, programme updated to: ' . d $prog; 1627 $matched = 1; 1628 goto EXTRACTORS; # start again from beginning of loop 1629 } 1630 } 1631 if ($matched) { 1632 t 'some handlers matched, programme: ' . d $prog; 1633 scrub_empty($prog->{desc}); 1634 t 'after removing empty things, programme: ' . d $prog; 1635 return [ $prog ]; 1636 } 1637 } 1638 return undef; 1639} 1640# has() 1641# 1642# Check whether some attribute of a programme matches a particular 1643# string. For example, does the programme have the category 'quiz'? 1644# This means checking all categories of acceptable language. 1645# 1646# has($programme, 'category', 'quiz'); 1647# 1648sub has( $$@ ) { 1649# local $Log::TraceMessages::On = 1; 1650 my ($p, $attr, @allowed) = @_; 1651 t 'testing whether programme: ' . d $p; 1652 t "has attribute $attr in the list: " . d \@allowed; 1653 my $list = $p->{$attr}; 1654 t 'all [text, lang] pairs for this attr: ' . d $list; 1655 return 0 if not defined $list; 1656 foreach (grep { pair_ok($_) } @$list) { 1657 my ($text, $lang) = @$_; 1658 foreach (@allowed) { 1659 t "testing if $text matches $_ (nocase)"; 1660 return 1 if lc $text eq $_; 1661 } 1662 } 1663 t 'none of them matched, returning false'; 1664 return 0; 1665} 1666# hasp() 1667# 1668# Like has() but instead of a list of strings to compare against, 1669# takes a subroutine reference. This subroutine will be run against 1670# all the text strings of suitable language in turn until it matches 1671# one, when true is returned. If none match, returns false. 1672# 1673# Parameters: 1674# ref to programme hash 1675# name of key to look under 1676# subroutine to apply to each value of key with acceptable language 1677# 1678# Returns: whether subroutine gives true for at least one value. 1679# 1680# The subroutine will get the text value passed in $_[0]. 1681# 1682sub hasp( $$$ ) { 1683# local $Log::TraceMessages::On = 1; 1684 my ($p, $attr, $sub) = @_; 1685 die "expected programme hash as first argument, not $p" 1686 if ref $p ne 'HASH'; 1687 t 'testing whether programme: ' . d $p; 1688 t "has a value for attribute $attr that makes sub return true"; 1689 1690 # FIXME commonize this with has(). 1691 my $list = $p->{$attr}; 1692 t 'all [text, lang] pairs for this attr: ' . d $list; 1693 return 0 if not defined $list; 1694 foreach (grep { pair_ok($_) } @$list) { 1695 my ($text, $lang) = @$_; 1696 t "testing if $text matches"; 1697 return 1 if $sub->($text); 1698 } 1699 t 'none of them matched, returning false'; 1700 return 0; 1701} 1702 1703 1704# special_new_series() 1705# 1706# Contrary to first appearances, the <new /> element in the XML isn't 1707# to indicate a new series - it means something stronger, a whole new 1708# show (not a new season of an existing show). But you can represent 1709# part of the meaning of 'new series' within the episode-num 1710# structure, because obviously a new series means that this is the 1711# first episode of the current series. 1712# 1713# This handler is mostly here to get rid of the 'New series' text at 1714# the start of descriptions, to try and make output from different 1715# grabbers look the same. 1716# 1717sub special_new_series( $ ) { 1718 die 'usage: special_new_series(programme)' if @_ != 1; 1719 my $p = shift; 1720 1721 # Just assume that if it contains 'New series' at the start then 1722 # it's English. 1723 # 1724 my $is_new_series = 0; 1725 foreach (@{$p->{desc}}) { 1726 for ($_->[0]) { 1727 if (s/^New series(?:\.\s*|$)// 1728 or s/^New series (?:of (?:the )?)?(\w)/uc $1/e 1729 ) { 1730 $is_new_series = 1; 1731 } 1732 } 1733 } 1734 1735 return undef if not $is_new_series; 1736 if (defined $p->{'episode-num'}) { 1737 foreach (@{$p->{'episode-num'}}) { 1738 my ($content, $system) = @$_; 1739 next unless $system eq 'xmltv_ns'; 1740 $content =~ m!^\s*(\d+/\d+|\d+|)\s*\.\s*(\d+/\d+|\d+|)\s*\.\s*(\d+/\d+|\d+|)\s*$! 1741 or warn("badly formed xmltv_ns episode-num: $content"), return [ $p ]; 1742 my ($season, $episode, $part) = ($1, $2, $3); 1743 if ($episode ne '' and $episode !~ /^0/) { 1744 warn "new series, but episode number $episode"; 1745 } 1746 elsif ($episode eq '') { 1747 # We now know the information that this is the first 1748 # episode of the series. 1749 # 1750 $episode = '0'; 1751 $content = "$season . $episode . $part"; 1752 $_ = [ $content, $system ]; 1753 last; 1754 } 1755 } 1756 } 1757 else { 1758 # Make a dummy episode num which says nothing other than 1759 # this is the first episode of the series. 1760 # 1761 $p->{'episode-num'} = [ [ ' . 0 . ', 'xmltv_ns' ] ]; 1762 } 1763 scrub_empty($p->{desc}); 1764 return [ $p ]; 1765} 1766 1767 1768# special_year(): take a year at the start of the description and move 1769# it to the 'date' field. 1770# 1771sub special_year( $ ) { 1772 die 'usage: special_new_series(programme)' if @_ != 1; 1773 my $p = shift; 1774 1775 my $year; 1776 foreach (@{$p->{desc}}) { 1777 if ($_->[0] =~ s/^(\d{4})\s+//) { 1778 my $got = $1; 1779 if (defined $year and $got ne $year) { 1780 warn "found different years $year and $got"; 1781 return [ $p ]; 1782 } 1783 $year = $got; 1784 } 1785 } 1786 return undef if not defined $year; 1787 if (defined $p->{date}) { 1788 if ($p->{date} !~ /^\s*$year/) { 1789 warn "found year $year in programme description, but date $p->{date}"; 1790 } 1791 } 1792 else { 1793 $p->{date} = $year; 1794 } 1795 scrub_empty($p->{desc}); 1796 return [ $p ]; 1797} 1798 1799 1800# 'TVM' at start of description means TV movie. 1801sub special_tv_movie( $ ) { 1802 die 'usage: special_tv_movie(programme)' if @_ != 1; 1803 my $p = shift; 1804 my $is_tv_movie = 0; 1805 foreach (@{$p->{desc}}) { 1806 my $lang = $_->[1]; 1807 if (not defined $lang or $lang =~ /^en/) { 1808 if ($_->[0] =~ s/^TVM\b\s*//) { 1809 $is_tv_movie = 1; 1810 } 1811 } 1812 } 1813 return undef if not $is_tv_movie; 1814 add($p, 'category', 'TV movie'); 1815 scrub_empty($p->{desc}); 1816 return [ $p ]; 1817} 1818 1819 1820# '(T)' in description means teletext subtitles. But this should run 1821# after doing any splitting and other stuff. 1822# 1823sub special_teletext_subtitles( $ ) { 1824 die 'usage: special_teletext_subtitles(programme)' if @_ != 1; 1825 my $p = shift; 1826 my $has_t = 0; 1827 foreach (@{$p->{desc}}) { 1828 if ($_->[0] =~ s/\s*\(T\)\s*$//) { 1829 $has_t = 1; 1830 } 1831 } 1832 return undef if not $has_t; 1833 if (defined $p->{subtitles}) { 1834 foreach (@{$p->{subtitles}}) { 1835 return [ $p ] if defined $_->{type} and $_->{type} eq 'teletext'; 1836 } 1837 } 1838 push @{$p->{subtitles}}, { type => 'teletext' }; 1839 scrub_empty($p->{desc}); 1840 return [ $p ]; 1841} 1842 1843 1844# Remove stock phrases that have no meaning. 1845sub special_useless( $ ) { 1846 die 'usage: special_useless(programme)' if @_ != 1; 1847 my $p = shift; 1848 1849 # FIXME need to commonize hog-butchering with special_credits(). 1850 my $changed = 0; 1851 foreach (@{$p->{desc}}) { 1852 for ($_->[0]) { 1853 $changed |= s/^(?:a\s+|)round-up\s+of\s+(\w)/uc $1/ie; 1854 $changed |= s/^(\w+[^s])\s+round-up\.?\s*$/$1 . 's'/ie; 1855 $changed |= s/((?:^|\.|!|\?)\s*)Coverage\s+of\s+(\w)/$1 . uc $2/e; 1856 } 1857 } 1858 1859 return [ $p ] if $changed; 1860 return undef; 1861} 1862 1863 1864# special_radio4() 1865# 1866# Split Radio 4 into FM and LW. 1867# 1868sub special_radio4( $ ) { 1869 die 'usage: special_radio4(programme)' if @_ != 1; 1870 my $p = shift; 1871 return undef if $p->{channel} ne 'radio4'; 1872 1873 for ($p->{title}) { 1874 if (s/^\(FM\)\s+//) { 1875 $p->{channel} = 'radio4-fm'; 1876 return [ $p ]; 1877 } 1878 if (s/^\(LW\)\s+//) { 1879 $p->{channel} = 'radio4-lw'; 1880 return [ $p ]; 1881 } 1882 1883 my %fm = ( %$p, channel => 'radio4-fm' ); 1884 my %lw = ( %$p, channel => 'radio4-lw' ); 1885 return [ \%fm, \%lw ]; 1886 } 1887} 1888 1889 1890# special_split_title() 1891# 1892# In addition to the 'programme tacked onto the end of another' 1893# handled by add_clumpidx, you also sometimes see two programmes 1894# totally sharing an entry. For example 'News; Shipping Forecast'. 1895# 1896sub special_split_title( $ ) { 1897 die 'usage: special_split_title(programme)' if @_ != 1; 1898 my $p = shift; 1899 return undef if $p->{title} !~ tr/;//; 1900 1901 # Split the title at ; and make N identical programmes one with 1902 # each title. The programme details are given to only the last of 1903 # the programmes - in the listings data we're getting, normally 1904 # the insignificant programme comes first with the main feature 1905 # last, as in 'News; Radio 3 Lunchtime Concert'. 1906 # 1907 1908 # List of elements which are meta-data and should be kept for all 1909 # the programmes we split into - the rest are given only to the 1910 # last programme. 1911 # 1912 my %meta = (start => 1, stop => 1, 'pdc-start' => 1, 1913 'vps-start' => 1, showview => 1, videoplus => 1, 1914 channel => 1); 1915 # but not clumpidx! 1916 1917 my %p_meta; 1918 foreach (grep { $meta{$_} } keys %$p) { 1919 $p_meta{$_} = $p->{$_}; 1920 } 1921 1922 my @r; 1923 my @titles = split /\s*;+\s*/, $p->{title}; 1924 for (my $i = 0; $i < @titles - 1; $i++) { 1925 push @r, { %p_meta, 1926 title => $titles[$i], 1927 clumpidx => ( "$i/" . scalar @titles ) }; 1928 } 1929 push @r, { %$p, 1930 title => $titles[-1], 1931 clumpidx => ("$#titles/" . scalar @titles) }; 1932 1933 return \@r; 1934} 1935 1936 1937# special_film() 1938# 1939sub special_film( $ ) { 1940 die 'usage: special_film(programme)' if @_ != 1; 1941 my $p = shift; 1942 if (not defined $p->{'sub-title'} or $p->{'sub-title'} ne '(Film)') { 1943 return undef; 1944 } 1945 1946 warn "replacing category $p->{category} with 'film'" 1947 if defined $p->{category}; 1948 $p->{category} = 'film'; 1949 undef $p->{'sub-title'}; 1950 1951 if (defined $p->{desc} and $p->{desc} =~ s/^(\d{4})\s+//) { 1952 warn "found year $1 in description, replacing date $p->{date}" 1953 if defined $p->{date}; 1954 $p->{date} = $1; 1955 } 1956 1957 return [ $p ]; 1958} 1959 1960 1961# add() 1962# 1963# Add a piece of human-readable information to a particular slot, but 1964# only if it isn't there already. For example add the category 1965# 'music', but only if that category isn't already set. This is for 1966# keys that take multiple values and each value is a [ text, lang ] 1967# pair. The language is assumed to be English. 1968# 1969# Parameters: 1970# programme hash to add to 1971# name of key 1972# textual value to add 1973# 1974# Returns: whether the programme was altered. 1975# 1976sub add( $$$ ) { 1977 my ($p, $k, $v) = @_; 1978 if (defined $p->{$k}) { 1979 foreach (@{$p->{$k}}) { 1980 return 0 if $_->[0] eq $v; 1981 } 1982 } 1983 push @{$p->{$k}}, [ $v, $LANG ]; 1984 return 1; 1985} 1986 1987 1988# scrub_empty(): remove empty strings from a list of [text, lang] 1989# pairs. 1990# 1991# Parameters: zero or more listrefs 1992# 1993# Modifies lists passed in, removing all [ '', whatever ] pairs. 1994# 1995sub scrub_empty( @ ) { 1996 foreach (@_) { 1997 @$_ = grep { length $_->[0] } @$_; 1998 } 1999} 2000 2001 2002# Make sure that a programme altered by a special handler does not 2003# spill outside its alotted timespan. This is just a sanity check 2004# before fix_clumps() does its stuff. In a future version we might 2005# remove this restriction and allow special handlers to move 2006# programmes outside their original timeslot. 2007# 2008# Parameters: 2009# original programme 2010# ref to list of new programmes 2011# 2012sub check_time_fits( $$ ) { 2013 my $orig = shift; 2014 my @new = @{shift()}; 2015 my $o_start = pd($orig->{start}); 2016 die if not defined $o_start; 2017 my $o_stop = pd($orig->{stop}); 2018 foreach (@new) { 2019 my $start = pd($_->{start}); 2020 die if not defined $start; 2021 if (Date_Cmp($start, $o_start) < 0) { 2022 die "programme starting at $o_start was split into one starting at $start"; 2023 } 2024 2025 if (defined $o_stop) { 2026 my $stop = pd($_->{stop}); 2027 if (defined $stop and Date_Cmp($o_stop, $stop) < 0) { 2028 die "programme stopping at $o_stop was split into one stopping at $stop"; 2029 } 2030 } 2031 } 2032} 2033 2034 2035 2036# Another sanity check. 2037sub check_same_channel( $ ) { 2038 my $progs = shift; 2039 my $ch; 2040 foreach my $prog (@$progs) { 2041 for ($prog->{channel}) { 2042 if (not defined) { 2043 t 'no channel! ' . d $prog; 2044 croak 'programme has no channel'; 2045 } 2046 if (not defined $ch) { 2047 $ch = $_; 2048 } 2049 elsif ($ch eq $_) { 2050 # Okay. 2051 } 2052 else { 2053 # Cannot use croak() due to this error message: 2054 # 2055 # Bizarre copy of ARRAY in aassign at /usr/lib/perl5/5.6.0/Carp/Heavy.pm line 79. 2056 # 2057 local $Log::TraceMessages::On = 1; 2058 t 'same clump, different channels: ' . d($progs->[0]) . ' and ' . d($prog); 2059 die "programmes in same clump have different channels: $_, $ch"; 2060 } 2061 } 2062 } 2063} 2064 2065 2066# There is a very hard to track down bug where stop times mysteriously 2067# get set to something ridiculous. It varies from one perl version to 2068# another (hence the version check at the top) but still occurs even 2069# with 5.6.1. To track it down I have isolated all code that sets 2070# stop times in this subroutine. 2071# 2072sub set_stop_time( $$ ) { 2073 my $p = shift; 2074 my $s = shift; 2075 2076 if ($SLOW) { 2077 # Another mysterious-bug-preventing line, see no_shared_scalars(). 2078 my $dummy = "$s"; 2079 2080 $p->{stop} = $s; 2081 } 2082 else { 2083 # Don't set stop times at all. 2084 delete $p->{stop}; 2085 } 2086} 2087 2088# More debugging aids. 2089sub cst( $ ) { 2090 my $p = shift; 2091 croak "prog $p->{title}->[0]->[0] has bogus stop time" 2092 if exists $p->{stop} and $p->{stop} eq 'boogus FIXME XXX'; 2093} 2094 2095sub no_shared_scalars( $ ) { 2096 my %seen; 2097 foreach my $h (@{$_[0]}) { 2098 foreach my $k (keys %$h) { 2099 my $ref = \ ($h->{$k}); 2100 my $addr = "$ref"; 2101 $seen{$addr}++ && die "scalar $addr seen twice"; 2102 } 2103 } 2104} 2105