1#!/usr/local/bin/perl -w
2
3=pod
4
5=head1 NAME
6
7tv_extractinfo_en - read English-language listings and extract info
8from programme descriptions.
9
10=head1 SYNOPSIS
11
12tv_extractinfo_en [--help] [--output FILE] [FILE...]
13
14=head1 DESCRIPTION
15
16Read XMLTV data and attempt to extract information from
17English-language programme descriptions, putting it into
18machine-readable form.  For example the human-readable text '(repeat)'
19in a programme description might be replaced by the XML element
20<previously-shown>.
21
22B<--output FILE> write to FILE rather than standard output
23
24This tool also attempts to split multipart programmes into their
25constituents, by looking for a description that seems to contain lots
26of times and titles.  But this depends on the description following
27one particular style and is useful only for some listings sources
28(Ananova).
29
30If some text is marked with the 'lang' attribute as being some
31language other than English ('en'), it is ignored.
32
33=head1 SEE ALSO
34
35L<xmltv(5)>.
36
37=head1 AUTHOR
38
39Ed Avis, ed@membled.com
40
41=head1 BUGS
42
43Trying to parse human-readable text is always error-prone, more so
44with the simple regexp-based approach used here.  But because TV
45listing descriptions usually conform to one of a few set styles,
46tv_extractinfo_en does reasonably well.  It is fairly conservative,
47trying to avoid false positives (extracting 'information' which
48isnE<39>t really there) even though this means some false negatives
49(failing to extract information and leaving it in the human-readable
50text).
51
52However, the leftover bits of text after extracting information may
53not form a meaningful English sentence, or the punctuation may be
54wrong.
55
56On the two listings sources currently supported by the XMLTV package,
57this program does a reasonably good job.  But it has not been tested
58with every source of anglophone TV listings.
59
60=cut
61
62use strict;
63use XMLTV::Version '$Id: tv_extractinfo_en,v 1.70 2015/07/12 00:46:37 knowledgejunkie Exp $ ';
64use XMLTV::Date;
65use Date::Manip;
66use Carp;
67use Getopt::Long;
68
69BEGIN {
70    if (int(Date::Manip::DateManipVersion) >= 6) {
71	Date::Manip::Date_Init("SetDate=now,UTC");
72    } else {
73	Date::Manip::Date_Init("TZ=UTC");
74    }
75}
76
77# Use Log::TraceMessages if installed.
78BEGIN {
79    eval { require Log::TraceMessages };
80    if ($@) {
81	*t = sub {};
82	*d = sub { '' };
83    }
84    else {
85	*t = \&Log::TraceMessages::t;
86	*d = \&Log::TraceMessages::d;
87	Log::TraceMessages::check_argv();
88    }
89}
90
91# Use Term::ProgressBar if installed.
92use constant Have_bar => eval { require Term::ProgressBar; 1 };
93
94use XMLTV;
95use XMLTV::TZ qw(gettz offset_to_gmt);
96use XMLTV::Clumps qw(clump_relation relatives fix_clumps nuke_from_rel);
97use XMLTV::Usage <<END
98$0: read English-language listings and extract info from programme descriptions
99usage: $0 [--help] [--output FILE] [FILE...]
100END
101;
102
103
104# There are some seeming bugs in Perl which corrupt the stop time of
105# programmes.  They are less in 5.6.1 than 5.6.0 but still there.  The
106# debugging assertions cst() and no_shared_scalars() have the effect
107# of stopping the problem (it's a Heisenbug).  So one way of making
108# stop times correct is to call this routines regularly.
109#
110# Alternatively, we can limit the script's functionality to work
111# around the bug.  It seems to affect stop times, so if we just don't
112# add stop times things should be okay.
113#
114# This flag decides which of the two to pick: slow but with maximum
115# information about stop times, or fast without them.  (Stop times can
116# easily be added back in by tv_sort, and they weren't that good
117# anyway, so you should probably leave this off.)
118#
119my $SLOW = 0;
120warn "this version has debugging calls, will run slowly\n" if $SLOW;
121
122# It might turn out that a particular version of perl is needed.
123# BEGIN {
124#     eval { require 5.6.1 };
125#     if ($@) {
126# 	for ($@) {
127# 	    chomp;
128# 	    s/, stopped at .+$//;
129# 	    warn "$_, continuing but output may be wrong\n";
130# 	}
131#     }
132# }
133
134sub list_names( $ );
135sub check_same_channel( $ );
136sub special_category( $ );
137sub special_multipart( $ );
138sub special_credits( $ );
139sub special_radio4( $ );
140sub special_split_title( $ );
141sub special_film( $ );
142sub special_new_series( $ );
143sub special_year( $ );
144sub special_tv_movie( $ );
145sub special_teletext_subtitles( $ );
146sub special_useless( $ );
147sub check_time_fits( $$ );
148sub cst( $ );
149sub no_shared_scalars( $ );
150sub has( $$@ );
151sub hasp( $$$ );
152sub pd( $ );
153sub ud( $ );
154sub nd( $ );
155sub bn( $ );
156sub munge( $ );
157sub multipart_split_desc( $$ );
158sub clocks_poss( $ );
159sub time12to24( $ );
160sub add( $$$ );
161sub scrub_empty( @ );
162sub set_stop_time( $$ );
163sub dump_pseudo_programme( $ );
164
165# --no-create-sub-titles is an undocumented switch, affecting the
166# splitting of multipart programmes only, to not break a title
167# containing colons into title and sub-title, but always keep it as a
168# single title containing a colon.  This is for consistency with some
169# data sources that do this.
170#
171my ($opt_help, $opt_output, $opt_no_create_sub_titles);
172GetOptions('help' => \$opt_help, 'output=s' => \$opt_output,
173	   'no-create-sub-titles' => \$opt_no_create_sub_titles)
174  or usage(0);
175usage(1) if $opt_help;
176@ARGV = ('-') if not @ARGV;
177
178####
179# Language selection stuff.
180#
181my $LANG = 'en';
182
183# bn(): wrapper for XMLTV::best_name().  Does some memoizing (so
184# assumes that the languages in a list of pairs will not change).
185#
186my %bn;
187sub bn( $ ) {
188    my $pairs = shift;
189    return undef if not defined $pairs;
190    die 'bn(): expected ref to list of [text,lang] pairs'
191      if ref $pairs ne 'ARRAY';
192    for ($bn{$pairs}) {
193	return $_ if defined;
194	foreach (@$pairs) {
195	    carp "found bad [text,lang] pair: $_" if ref ne 'ARRAY';
196	}
197	return $_ = XMLTV::best_name([ $LANG ], $pairs);
198    }
199}
200
201# pair_ok(): returns whether a [ text, lang ] pair is usable.
202sub pair_ok( $ ) {
203    not defined $_->[1] or $_->[1] =~ /^$LANG(?:_\w+)?$/o;
204}
205
206####
207# Human name stuff.
208#
209
210# Regular expression to match a name
211my $UC = '[A-Z]';         # upper case
212my $LC = "[a-z]";         # lower case
213my $AC_P = "[\'A-Za-z-]"; # any case with punctuation
214my $NAME_RE;
215{
216    # One word of a name.  Uppercase, anycase then ending in at least
217    # two lowercase.  Alternatively, uppercase then lowercase (eg
218    # 'Lee'), all uppercase ('DJ') or just uppercase and an optional dot (for
219    # initials).
220    #
221    my $name_comp_re = "(?:$UC(?:(?:$AC_P+$LC$LC)|(?:$LC+)|(?:$UC+)|\\.?))";
222    foreach ('Simon', 'McCoy') {
223	die "cannot match name component $_" unless /^$name_comp_re$/o;
224    }
225    foreach ("Valentine's") {
226	die "wrongly matched name component $_" if /^$name_comp_re$/o;
227    }
228
229    # Additional words valid in the middle of names.
230    my $name_join_re = "(?:von|van|de|di|da|van\\s+den|bin|ben|al)";
231
232    # A name must have at least two components.  This excludes those
233    # celebrities known only by first name but it's a reasonable
234    # heuristic for distinguishing between the names of actors and the
235    # names of characters.
236    #
237    my $name_re = "(?:$name_comp_re\\s+(?:(?:(?:$name_comp_re)|$name_join_re)\\s+)*$name_comp_re)";
238    foreach ('Simon McCoy', 'Annie Wu') {
239	die "cannot match $_" unless /^$name_re$/o;
240    }
241
242    # Special handling for some titles.  But others beginning 'the'
243    # are specifically excluded (to avoid 'the Corornation Street
244    # star' parsing as '$NAME_RE star').
245    #
246    $NAME_RE = "(?<!the\\s)(?:(?:[Tt]he\\s+Rev(?:\\.|erend)\\s+)?$name_re)";
247}
248
249# Regexp to match a list of names: 'Tom, Dick, and Harry'
250my $NAMES_RE = "(?:$NAME_RE(?:(?:\\s*,\\s*$NAME_RE)*(?:\\s*,?\\s*\\band\\s+$NAME_RE))?(?!\\s*(?:and\\s+$UC|from|[0-9])))";
251
252# Subroutine to extract the names from this list
253sub list_names( $ ) {
254    die 'usage: list_names(English string listing names)'
255      if @_ != 1;
256    local $_ = shift; die if not defined;
257    t 'list_names() processing string: ' . d $_;
258    my @r;
259
260    s/^($NAME_RE)\s*// or die "bad 'names' '$_'";
261    push @r, $1;
262
263    while (s/^,?\s*(?:and\s+)?($NAME_RE)\s*//) {
264	push @r, $1;
265    }
266    die "unmatched bit of names $_" unless $_ eq '';
267
268    return @r;
269}
270
271my @tests =
272  (
273   [ 'Richard Whiteley and Carol Vorderman', [ 'Richard Whiteley', 'Carol Vorderman' ] ],
274   [ 'show presented by Jonathan Ross, with', [ 'Jonathan Ross' ] ],
275   [ 'Shane Richie, Michael Starke and Scott Wright',
276     [ 'Shane Richie', 'Michael Starke', 'Scott Wright' ] ],
277   [ 'Basil Brush,Barney Harwood and Ugly Yetty present',
278     [ 'Basil Brush', 'Barney Harwood', 'Ugly Yetty'] ],
279  );
280foreach (@tests) {
281    my ($in, $expected) = @$_;
282    for ($in) {
283	/($NAMES_RE)/o or die "$in doesn't match \$NAMES_RE";
284	my @out = list_names($1);
285	local $Log::TraceMessages::On = 1;
286	if (d(\@out) ne d($expected)) {
287	    die "$in split into " . d(\@out);
288	}
289    }
290}
291
292
293####
294# Date handling stuff.
295#
296# This loses any information on partial dates (FIXME).
297#
298sub pd( $ ) {
299    for ($_[0]) {
300	return undef if not defined;
301	return parse_date($_);
302    }
303}
304sub ud( $ ) {
305    for ($_[0]) {
306	return undef if not defined;
307	return UnixDate($_, '%q');
308    }
309}
310sub nd( $ ) {
311    for ($_[0]) {
312	return undef if not defined;
313	return ud(pd($_));
314    }
315}
316
317# Memoize some subroutines if possible.  FIXME commonize to
318# XMLTV::Memoize.
319#
320eval { require Memoize };
321unless ($@) {
322    foreach (qw(parse_date UnixDate Date_Cmp
323		clocks_poss time12to24)) {
324	Memoize::memoize($_) or die "cannot memoize $_: $!";
325    }
326}
327
328my $encoding;
329my $credits;
330my %ch;
331my @progs;
332XMLTV::parsefiles_callback(sub( $ ) { $encoding = shift },
333			   sub( $ ) { $credits = shift },
334			   sub( $ ) { my $c = shift; $ch{$c->{id}} = $c },
335			   sub( $ ) { push @progs, shift },
336			   @ARGV);
337# Assume encoding is a superset of ASCII, and that Perl's regular
338# expressions work with it in the current locale.
339#
340
341my $related = clump_relation(\@progs);
342
343# Apply all handlers.  We just haphazardly
344# run one after the other; when a programme has been run
345# through all of them in sequence without any changes, we
346# move it to @progs_done.
347#
348# The reason for using _lists_ is that some handlers turn
349# a single programme into several.
350#
351my @progs_done = ();
352my $bar = new Term::ProgressBar('munging programmes', scalar @progs)
353  if Have_bar;
354while (@progs) {
355    # Deal with one more programme from the input, it may transform
356    # itself into one or more programmes which need processing in
357    # turn.  When all the offspring are dealt with we have finally
358    # finished with that input and can update the progress bar.
359    #
360    no_shared_scalars(\@progs) if $SLOW;
361    push @progs_done, munge(shift @progs);
362    update $bar if Have_bar;
363}
364if ($SLOW) { cst $_ foreach @progs_done }
365
366my %w_args = ();
367if (defined $opt_output) {
368    my $fh = new IO::File ">$opt_output";
369    die "cannot write to $opt_output\n" if not $fh;
370    %w_args = (OUTPUT => $fh);
371}
372XMLTV::write_data([ $encoding, $credits, \%ch, \@progs_done ], %w_args);
373exit();
374
375# Take a programme, munge it and return a list of programmes (empty if
376# the programme was deleted).  Uses the global $related to fiddle with
377# other programmes in the same clump.
378#
379sub munge( $ ) {
380#    local $Log::TraceMessages::On = 1;
381    t 'munge() ENTRY';
382    my @todo = (shift);
383    my @done;
384    t 'todo list initialized to: ' . d \@todo;
385    t 'done list initialized to: ' . d \@done;
386    t 'relatives of todo programme: ' . d relatives($related, $todo[0]);
387
388    # Special-case mungers for various programme types.  Each of these
389    # should take a single programme and return: a reference to a list of
390    # programmes, if successful; undef, if the programme is to be left
391    # alone.  Most special-case handlers will not break up a programme
392    # into several others, so the returned list will have only one
393    # element.
394    #
395    # A handler may modify the programme passed in iff it returns a list
396    # of munged programmes.
397    #
398    # Ones earlier in the list get the chance to run first, so in general
399    # things like splitting programmes (which may be relied on by other
400    # handlers) should come at the top and last-chance guesswork (such as
401    # parsing English text) at the bottom.
402    #
403    my @special_handlers =
404      (
405       \&special_multipart,
406       \&special_category,
407       \&special_credits,
408       \&special_new_series,
409       \&special_year,
410       \&special_tv_movie,
411       \&special_teletext_subtitles,
412       \&special_useless,
413
414       # There are three handlers specific to Ananova / UK listings.  I
415       # haven't yet decided what to do with them: should they be in this
416       # program and enabled with a special flag, or moved into the
417       # Ananova grabber?
418       #
419       # They haven't been ported to the new XMLTV.pm data structures, so
420       # leave them commented for now.
421       #
422       #   \&special_radio4,
423       #   \&special_split_title,
424       #   \&special_film,
425      );
426
427  PROG: while (@todo) {
428	my $prog = shift @todo;
429	t('running handlers for prog: ' . d($prog));
430	my $prog_length;
431	if (defined $prog->{stop}) {
432	    # Get the programme length in seconds.
433	    my $delta = DateCalc($prog->{start}, $prog->{stop});
434	    $prog_length = Delta_Format($delta, 0, '%st') if defined $delta;
435	}
436	foreach (@special_handlers) {
437	    t('running handler: ' . d($_));
438	    my $out = $_->($prog);
439	    if (defined $out) {
440		t('gave new list of progs: ' . d($out));
441		die "handler didn't return list of progs"
442		  if ref($out) ne 'ARRAY';
443		if ($SLOW) { cst $_ foreach @$out }
444		check_time_fits($prog, $out);
445		if ($SLOW) { cst $_ foreach @$out }
446		fix_clumps($prog, $out, $related);
447		foreach (@$out) {
448		    cst $_ if $SLOW;
449		    # Sanity check that length hasn't increased.
450		    if (defined $_->{stop}) {
451			my $delta = DateCalc($_->{start}, $_->{stop});
452			if (defined $prog_length) {
453			    my $new_length = Delta_Format($delta, 0, '%st');
454			    if ($new_length > $prog_length) {
455				local $Log::TraceMessages::On = 1;
456				t 'original programme (after handlers run): ' . d $prog;
457				t 'split into: ' . d $out;
458				t 'offending result: ' . d $_;
459				t 'length of result: ' . d $new_length;
460				t 'length of original programme: ' . d $prog_length;
461				die 'split into programme longer than the original';
462			    }
463			}
464		    }
465		}
466		push @todo, @$out;
467		next PROG;
468	    }
469	    t('gave undef');
470	}
471	cst $prog if $SLOW;
472	t 'none of the handlers fired, finished with this prog';
473	cst $prog if $SLOW;
474	push @done, $prog;
475    }
476    return @done;
477}
478
479# All the special handlers
480
481# special_category()
482#
483# Some descriptions have the last word as the category: 'blah blah
484# blah.  Western' (or 'Western series').  Remove this to the <category>
485# element.
486#
487# Also look for magic words like 'news' or 'interview' and add them as
488# categories.  This is mostly so that other handlers can then fire.
489#
490sub special_category( $ ) {
491    t 'special_category() ENTRY';
492    my $p = shift;
493    my $changed = 0;
494
495    # First, non-destructively look for 'news' in title or desc.
496    foreach (qw(title desc)) {
497	foreach my $pair (grep { pair_ok($_) } @{$p->{$_}}) {
498	    t "pair for $_: " . d $pair;
499	    if ($pair->[0] =~ /\bnews/i) {
500		t 'matches "news"';
501
502		if (hasp($p, 'category', sub { $_[0] =~ /\b(?:soap|drama|game show)\b/i })) {
503		    t '...but clearly not a news programme';
504		}
505		else {
506		    $changed |= add($p, 'category', 'news');
507		    cst $p if $SLOW;
508		}
509	    }
510	    if ($pair->[0] =~ /\binterviews\b/i) {
511		t 'matches "interviews"';
512		$changed |= add($p, 'category', 'talk');
513		cst $p if $SLOW;
514	    }
515	}
516    }
517
518    # Now try the last-word-of-desc munging.
519    my $replacement = sub( $$$$ ) {
520	my ($punct, $adj, $country, $genre) = @_;
521	$changed |= add($p, 'category', lc $genre);
522	if (length $adj or length $country) {
523	    return "$punct $adj$country$genre";
524	}
525	else {
526	    $changed = 1;
527	    return $punct;
528	}
529    };
530    foreach (grep { pair_ok($_) } @{$p->{desc}}) {
531	# 'Western'         -> ''
532	# 'Western series'  -> ''
533	# 'Classic Western' -> 'Classic Western'
534	# etc.
535	#
536	$_->[0] =~ s/(^|\.|\?)\s*
537	  (Classic\s+|Award-winning\s+|)
538	    (Australian\s+|)
539	      ([aA]dventure|[aA]nimation|[bB]iopic|[cC]hiller
540	       |[cC]omedy|[dD]ocumentary|[dD]rama|[fF]antasy
541	       |[hH]eadlines|[hH]ighlights|[hH]orror|[mM]agazine
542	       |[mM]elodrama|[mM]usical|[mM]ystery|[oO]mnibus
543	       |[qQ]uiz|[rR]omance|[sS]itcom|[tT]earjerker
544	       |[tT]hriller|[wW]estern)\s*(?:series\s*|)$/$replacement->($1, $2, $3, $4)/xe;
545    }
546
547    if ($changed) {
548	t 'some categories found, programme: ' . d $p;
549	scrub_empty($p->{title}, $p->{desc});
550	t 'after removing empty titles and descs, programme: ' . d $p;
551	return [ $p ];
552    }
553    else {
554	return undef;
555    }
556}
557
558
559# special_multipart()
560#
561# Often TV listings contain several programmes stuffed into one entry,
562# which might have made sense for a printed guide to save space, but
563# is stupid for electronic data.  This special handler looks at the
564# programme description and haphazardly attempts to split the
565# programme into its components.
566#
567# Parameters: a 'programme' hash reference
568# Returns: reference to list of sub-programmes, or undef if programme
569#          was not split
570#
571# We find the title using bn(), in other words we look only at
572# the first title.  Similarly we use only the first description.  But
573# it should work.  FIXME should split the secondary title as well!
574#
575sub special_multipart( $ ) {
576#    local $Log::TraceMessages::On = 1;
577    die "usage: special_multipart(hashref of programme details)"
578      if @_ != 1;
579    my $p = shift;
580    cst $p if $SLOW;
581    t 'special_multipart() ENTRY';
582    t 'checking programme descs: ' . d $p->{desc};
583    my $best_desc = bn($p->{desc});
584    t 'got best desc: ' . d $best_desc;
585    return undef if not $best_desc;
586    my ($desc, $desc_lang) = @$best_desc;
587    t 'testing description for multipart: ' . d $desc;
588    local $_ = $desc;
589    my @words = split;
590    my @poss_times = split /[ ,;-]/;
591    my @r;
592    my ($p_start, $p_stop) = (pd($p->{start}), pd($p->{stop}));
593    # Assume that the timezone for every time listed in the
594    # description is the same as the timezone for the programme's
595    # start.  FIXME will fail when timezone changes partway through.
596    #
597    my $tz = gettz($p->{start});
598
599    my $day;
600    if (defined $tz) {
601	# Find the base day taking into account timezones.  Eg if a
602	# programme starts at 00:45 BST on the 20th and then lists
603	# times as '01:00' etc, the base date for these times is the
604	# 20th, even though the real start time is 23:45 UTC on the
605	# 19th.
606	#
607	$day = pd(UnixDate(Date_ConvTZ($p_start, 'UTC', offset_to_gmt($tz)), '%Q'));
608    }
609    else {
610	$day = pd(UnixDate($p_start, '%q'));
611    }
612    t "day is $day";
613    # FIXME won't be correct when split programme spans days.
614
615    # Sanity check for a time, that it is within the main programme's
616    # timespan.
617    #
618    my $within_time_period = sub {
619	my $t = shift;
620	t("checking whether $t is in time period $p_start.."
621	  . (defined $p_stop ? $p_stop : ''));
622	if (Date_Cmp($t, $p_start) < 0) {
623	    # Before start of programme, that makes no sense.
624	    return 0;
625	}
626	if (defined $p_stop and Date_Cmp($p_stop, $t) < 0) {
627	    # After end of programme, likewise.
628	    return 0;
629	}
630	return 1;
631    };
632
633    # Three different ways of interpreting a time.  Return undef if
634    # not valid under that system, a 24 hour hh:mm otherwise.
635    #
636    # FIXME doesn't handle multiparts bridging noon or midnight.
637    #
638    my $as_12h_am = sub {
639	my $w = shift;
640	$w =~ s/[,;.]$//;
641	t "trying $w as 12 hour am time";
642	clocks_poss($w)->[0] || return undef;
643	return time12to24("$w am");
644    };
645    my $as_12h_pm = sub {
646	my $w = shift;
647	$w =~ s/[,;.]$//;
648	t "trying $w as 12 hour pm time";
649	clocks_poss($w)->[0] || return undef;
650	return time12to24("$w pm");
651    };
652    my $as_24h = sub {
653	my $w = shift;
654	$w =~ s/[,;.]$//;
655	t "trying $w as 24 hour time";
656	clocks_poss($w)->[1] || return undef;
657	$w =~ tr/./:/;
658	return $w;
659    };
660
661    if (defined $tz) { t "using timezone $tz for interpreting times" }
662    else { t "interpreting times with no timezone (ie UTC)" }
663
664    my ($best_interp, $best_count,
665	$best_first_word_is_time, $best_including_at_time);
666  INTERP: foreach my $interp ($as_24h, $as_12h_am, $as_12h_pm) {
667	t 'testing an interpretation of times';
668	my $count = 0;
669	my $first_word_is_time = 0;
670	my $including_at_time = 0;
671	my $prev;
672	for (my $pos = 0; $pos < @poss_times; $pos++) {
673	    t "testing word $poss_times[$pos] at position $pos";
674	    my $w = $poss_times[$pos];
675	    t "word is '$w'";
676	    my $i = $interp->($w);
677	    if (not defined $i) {
678		t "doesn't parse to a time with this interp.";
679		next;
680	    }
681	    warn "bad 24h returned time: $i" unless $i =~ /^\d?\d:\d\d$/;
682	    t "found a time that interprets: $i";
683	    my $t = Date_SetTime($day, $i);
684	    die if not defined $t;
685	    t "taken as day $day, gets time $t";
686	    $t = Date_ConvTZ($t, offset_to_gmt($tz), 'UTC') if defined $tz;
687	    t "after converting to UTC, $t";
688	    if (not $within_time_period->($t)) {
689		# Obviously wrong.  One bad time is enough to abandon
690		# this whole interpretation and try another.
691		#
692		t "not within time period, whole interpretation wrong";
693		next INTERP;
694	    }
695	    # Don't insist that times be in order, this isn't the case
696	    # for all listings (eg 'News at 0700 and 0730; Weather at
697	    # 0715').
698	    #
699
700	    $prev = $t;
701	    ++ $count;
702	    if ($pos == 0) {
703		$first_word_is_time = 1;
704	    }
705	    if ($pos >= 2
706		and $poss_times[$pos - 2] =~ /^[Ii]ncluding$/
707		and $poss_times[$pos - 1] eq 'at') {
708		$including_at_time = 1;
709		t 'previous words are "including at", setting $including_at_time true';
710	    }
711	}
712	t "found $count matching times and nothing badly wrong";
713
714	if (not defined $best_interp
715	    or $count > $best_count) {
716	    t 'best so far';
717	    $best_interp = $interp;
718	    $best_count = $count;
719	    $best_first_word_is_time = $first_word_is_time;
720	    $best_including_at_time = $including_at_time;
721	}
722    }
723
724    if (defined $best_interp) {
725	t "best result found: count $best_count";
726	t "first word? $best_first_word_is_time";
727	t "best includes 'at time'? $best_including_at_time";
728    }
729    else {
730	t "couldn't find any interpretation that worked at all";
731    }
732
733    # Heuristic.  We require at least three valid times to split - or
734    # when the programme description begins with a time, that's also
735    # good enough.  Also when the description contains 'including at'
736    # followed by a time.
737    #
738    return undef if not defined $best_interp;
739    return undef unless ($best_count >= 3
740			 or $best_first_word_is_time
741			 or $best_including_at_time);
742
743#    local $Log::TraceMessages::On = 1;
744    t 'looks reasonable, proceed';
745
746    t 'calling multipart_split_desc() with words and interpretation fn';
747    my $split = multipart_split_desc(\@words, $best_interp);
748    t 'got result from multipart_split_desc(): ' . d $split;
749    die if not defined $split->[0];
750    die if not defined $split->[2];
751    our @pps; local *pps = $split->[0];
752    t 'got list of pseudo-programmes: ' . d \@pps;
753    if (not @pps) {
754	warn "programme looked like a multipart, but couldn't grok it";
755	return undef;
756    }
757    if (@pps == 1) {
758	# Didn't really split, perhaps it wasn't a multipart.
759	t 'split into only one, leave unchanged';
760	return undef;
761    }
762
763    foreach (@pps) {
764	die if not defined;
765	die if not keys %$_;
766    }
767    my $common = $split->[1];
768    our @errors; local *errors = $split->[2];
769
770    # We split the first description, and only after checking it did
771    # look like a plausible multipart.  So if anything went wrong we
772    # should warn about it.
773    #
774    foreach (@errors) {
775	warn $_;
776    }
777
778    # What was returned is a list of pseudo-programmes, these have
779    # main_desc instead of real [text, lang] descriptions, and hh:mm
780    # 'time' instead of real start time+date.
781    #
782    # At most one of them is allowed to have time undefined; this is
783    # the 'rump' of the parent programme.  Whether such a rump exists
784    # depends on what kind of splitting was done.
785    #
786    my $seen_rump = 0;
787    foreach (@pps) {
788	my $time = delete $_->{time};
789	die if not defined $time and $seen_rump++;
790	if (defined $time) {
791	    my $start = Date_SetTime($day, $time);
792	    die if not defined $start;
793	    $start = Date_ConvTZ($start, offset_to_gmt($tz), 'UTC') if defined $tz;
794	    if (Date_Cmp($start, $p->{start}) < 0) {
795		my $dump = dump_pseudo_programme($_);
796		die "subprogramme ($dump, has 'time' $time) "
797		  . "starts before main programme ($p->{start}, $p->{title}->[0]->[0])";
798	    }
799	    if (defined $p->{stop} and Date_Cmp($p->{stop}, $start) < 0) {
800		my $dump = dump_pseudo_programme($_);
801		die "subprogramme ($dump, has 'time' $time) starts after main one stops";
802	    }
803
804	    # Now we store the time in the official 'start' key.  But
805	    # convert back to the original timezone to look nice.
806	    #
807	    if (defined $tz) {
808		$_->{start} = ud(Date_ConvTZ($start, 'UTC', offset_to_gmt($tz))) . " $tz";
809	    }
810	    else {
811		$_->{start} = ud($start);
812	    }
813	}
814	else {
815	    $_->{start} = $p->{start};
816	}
817
818	if (not defined $_->{main_title}) {
819	    # A title is needed, normally splitting will find one, but
820	    # in case it didn't...
821	    #
822	    $_->{title} = $p->{title};
823	}
824
825	# Now deal with each of the main_X fields turning them into
826	# real X.
827	#
828	foreach my $key (qw(desc title sub-title)) {
829	    my $v = delete $_->{"main_$key"};
830	    next if not defined $v;
831	    $_->{$key} = [ [ $v, $desc_lang ] ];
832	}
833
834	if (defined $common) {
835	    # Add the common text to this programme.  So far it has at
836	    # most one description in language $desc_lang.
837	    #
838	    for ($_->{desc}->[0]->[0]) {
839		if (defined and length) {
840		    $_ .= '. ' if $_ !~ /[.?!]\s*$/;
841		    $_ .= " $common";
842		}
843		else {
844		    $_ = $common;
845		}
846	    }
847	    $_->{desc}->[0]->[1] = $desc_lang;
848	}
849
850	$_->{channel} = $p->{channel};
851	t "set channel of split programme to $_->{channel}";
852    }
853
854    # The last subprogramme should stop at the same time as the
855    # multipart programme stopped.
856    #
857    if (defined $p->{stop}) {
858	t "setting stop time of last subprog to stop time of main prog ($p->{stop})";
859	set_stop_time($pps[-1], $p->{stop});
860    }
861    else { t 'main prog had no stop time, not adding to last subprog' }
862
863    # And similarly, the first should start at the same time as the
864    # multipart programme.  Add a dummy prog to fill the gap if
865    # necessary.
866    #
867    my $first_sub_start = $pps[0]->{start};
868    my $cmp = Date_Cmp(pd($first_sub_start), $p_start);
869    if ($cmp < 0) {
870	# Should have caught this already.
871	die 'first subprogramme starts _before_ main programme';
872    }
873    elsif ($cmp == 0) {
874	# Okay.
875    }
876    elsif ($cmp > 0) {
877	my $dummy = { title   => $p->{title},
878		      channel => $p->{channel},
879		      start   => $p->{start},
880		      stop    => $first_sub_start };
881	t 'inserting dummy subprogramme: ' . d $dummy;
882	cst $dummy if $SLOW;
883	unshift @pps, $dummy;
884    }
885    else { die }
886
887    if ($SLOW) { cst $_ foreach @pps }
888    scrub_empty($_->{title}, $_->{"sub-title"}, $_->{desc}) foreach @pps;
889    t 'returning new list of programmes: ' . d \@pps;
890    return \@pps;
891}
892# Given a programme description split into a list of words, and a
893# subroutine to interpret times, return a list of the subprogrammes
894# (assuming it is a multipart).
895#
896# Returns [pps, common, errs] where pps is a list of 'pseudo-programmes',
897# hashes containing some of:
898#
899#  time: 24h time within the main programme's day,
900#  main_title, main_desc, main_sub-title: text in the same language as
901#  the desc passed in,
902#
903# and where common is text which belongs to the description of every
904# subprogramme, and errs is a list of errors found (probably quite
905# large if the description was not multipart).
906#
907sub multipart_split_desc( $$ ) {
908    our @words; local *words = shift;
909    my $interp = shift;
910
911    # We need to decide what style of multipart listing this is.
912    # There's the kind that has time - title - description for each
913    # subprogramme.  There's the kind that has 'News at time0, time1,
914    # time2; Weather at time3, time4'.  And then something more like a
915    # normal English sentence, which of course is the hardest to
916    # parse.  We use some heuristics to work out which it is and call
917    # the appropriate 'parsing' routine.
918    #
919    t "testing for 'Including at'";
920    foreach my $i (0 .. $#words - 1) {
921	t "looking at pos $i, word is $words[$i]";
922	if ($words[$i] =~ /^[Ii]ncluding$/ and $words[$i + 1] eq 'at') {
923	    t 'yup, calling multipart_split_desc_including_at()';
924	    return multipart_split_desc_including_at(\@words, $interp);
925	}
926    }
927
928    t "testing for 'With X at T0, T1; ...'";
929    if (@words >= 4 and $words[0] =~ /^with$/i) {
930	my $first_lc_word;
931	foreach (@words) {
932	    if (not tr/[A-Z]//) {
933		$first_lc_word = $_;
934		last;
935	    }
936	}
937	if (defined $first_lc_word and $first_lc_word eq 'at') {
938	    return multipart_split_desc_rt(\@words, $interp);
939	}
940    }
941
942    t "looking for two times in a row, or separated only by 'and'";
943    my $prev_was_time = 0;
944    foreach (@words) {
945	if (defined $interp->($_)) {
946	    # Found a time.
947	    if ($prev_was_time) {
948		t 'found two times in a row, using multipart_split_desc_simple()';
949		return multipart_split_desc_simple(\@words, $interp);
950	    }
951	    $prev_was_time = 1;
952	}
953	elsif ($_ eq 'and') {
954	    # Skip.
955	}
956	else {
957	    $prev_was_time = 0;
958	}
959    }
960
961    t "looking for pairs of times 'from-to'";
962    foreach (@words) {
963	if (/^([0-9.:]+)-([0-9.:]+)$/) {
964	    my ($from, $to) = ($1, $2);
965	    if (defined $interp->($from) and defined $interp->($to)) {
966		return multipart_split_desc_fromto(\@words, $interp);
967	    }
968	}
969    }
970
971    t "must be old style of 'time title. description'";
972    return multipart_split_desc_ananova(\@words, $interp);
973}
974# And these routines handle the different styles.
975sub multipart_split_desc_ananova( $$ ) {
976    our @words; local *words = shift;
977    my $interp = shift;
978    my @r;
979    my @errors;
980
981    # First extract any 'common text' at the start of the programme,
982    # before any sub-programmes.
983    #
984    my $common;
985    while (@words) {
986	my $first = shift @words;
987	if (defined $interp->($first)) {
988	    unshift @words, $first;
989	    last;
990	}
991	if (defined $common and length $common) {
992	    $common .= " $first";
993	}
994	else {
995	    $common = $first;
996	}
997    }
998    t 'common text: ' . d $common;
999
1000    while (@words > 1) { # At least one thing after the time
1001	my $time = shift @words;
1002	my $i = $interp->($time);
1003	if (defined $i) {
1004	    my (@title_words, @desc_words);
1005
1006	    # Build up a current 'pseudo-programme' with title,
1007	    # description and time.  It's up to our caller to
1008	    # manipulate these simple data structures into real
1009	    # programmes.
1010	    #
1011	    my $curr_pp;
1012	    $curr_pp->{time} = $i;
1013	    my $done_title = 0;
1014	    my @words_orig = @words;
1015	    while (@words) {
1016		my $word = shift @words;
1017
1018		if (defined $interp->($word)) {
1019		    # Finished this bit of multipart.
1020		    unshift @words, $word;
1021		    last;
1022		}
1023		elsif (not $done_title) {
1024		    if ($word =~ s/[.]$// or $word =~ s/([!?])$/$1/) {
1025			# Finished the title, move on to description.
1026			$done_title = 1;
1027		    }
1028		    push @title_words, $word;
1029		}
1030		else {
1031		    push @desc_words, $word;
1032		}
1033	    }
1034	    if (not @title_words) {
1035		warn "trouble finding title in multipart";
1036		if (not @desc_words) {
1037		    warn "cannot find title or description in multipart";
1038		    @title_words = ('???');
1039		}
1040		else {
1041		    # Use the description so far as the title.
1042		    if ($desc_words[-1] eq 'at') {
1043			pop @desc_words;
1044		    }
1045		    @title_words = @desc_words;
1046		    @desc_words = ();
1047		}
1048	    }
1049
1050	    # The title sometimes looks like 'History in Action: Women
1051	    # in the 20th Century'; this should be broken into main
1052	    # title and secondary title.  But not 'GNVQ: Is It For You
1053	    # 2'.  So arbitrarily we check that the main title has at
1054	    # least two words.
1055	    #
1056	    if (@title_words) {
1057		my (@main_title_words, @sub_title_words);
1058
1059		while (@title_words) {
1060		    my $word = shift @title_words;
1061		    my $main_title_length = @main_title_words + 1;
1062
1063		    # Split at colon, sometimes
1064		    if ((not $opt_no_create_sub_titles)
1065			and $main_title_length >= 2 and $word =~ s/:$//) {
1066			push @main_title_words, $word;
1067			@sub_title_words = @title_words;
1068			last;
1069		    }
1070		    else {
1071			push @main_title_words, $word;
1072		    }
1073		}
1074
1075		$curr_pp->{main_title} = join(' ', @main_title_words);
1076		$curr_pp->{'main_sub-title'} = join(' ', @sub_title_words)
1077		  if @sub_title_words;
1078	    }
1079
1080	    $curr_pp->{main_desc} = join(' ', @desc_words) if @desc_words;
1081	    t 'built sub-programme: ' . d $curr_pp;
1082	    push @r, $curr_pp;
1083	}
1084	else {
1085	    push @errors, "expected time in multipart description, got $time";
1086	    # Add it to the previous programme, so it doesn't get lost
1087	    if (@r) {
1088		my $prev = $r[-1];
1089		$prev->{main_desc} = '' if not defined $prev->{main_desc};
1090		$prev->{main_desc} .= $time;
1091	    }
1092	    else {
1093		# Cannot happen.  If @r is empty, this must be the
1094		# first word.
1095		#
1096		warn 'first word of desc is not time, but checked this before';
1097		# Not worthy of @errors, this is a bug in the code.
1098	    }
1099	}
1100    }
1101    foreach (@r) {
1102	die if not keys %$_;
1103	die if not defined $_->{main_title};
1104    }
1105    t 'returning list of pseudo-programmes: ' . d \@r;
1106    t '...and common text: ' . d $common;
1107    t '...and errors: ' . d \@errors;
1108    return [\@r, $common, \@errors];
1109}
1110sub multipart_split_desc_rt( $$ ) {
1111    our @words; local *words = shift;
1112    my $interp = shift;
1113    my @errors;
1114
1115    my $with = shift @words;
1116    die if not defined $with;
1117    die if $with !~ /^with$/i;
1118
1119    my @got;
1120    my @title = ();
1121    my @times = ();
1122    my $done_title = 0;
1123    while (@words) {
1124	my $w = shift @words;
1125	if ($w eq 'at') {
1126	    $done_title = 1;
1127	    next;
1128	}
1129
1130	my $i = $interp->($w);
1131	if (defined $i) {
1132	    # It's a time.
1133	    if (not $done_title) {
1134		warn "unexpected time $w in multipart description, before 'at'";
1135		push @errors, $w;
1136	    }
1137	    else {
1138		push @times, $i;
1139	    }
1140
1141	    if ($w =~ /[.;]$/) {
1142		# End of the list of times for this programme.
1143		push @got, [ [ @title ], [ @times ] ];
1144		@title = ();
1145		@times = ();
1146		$done_title = 0;
1147	    }
1148	    elsif ($w =~ /,$/) {
1149		# List continues.
1150	    }
1151	    else {
1152		warn "strange time $w";
1153	    }
1154
1155	    next;
1156	}
1157
1158	# Not a time, should be part of the title.
1159	if ($done_title) {
1160	    warn "strange word $w in multipart description, expected a time";
1161	    push @errors, $w;
1162	}
1163	else {
1164	    push @title, $w;
1165	}
1166    }
1167
1168    my @r;
1169    foreach (@got) {
1170	my ($title, $times) = @$_;
1171	foreach (@$times) {
1172	    push @r, { main_title => join(' ', @$title), time => $_ };
1173	}
1174    }
1175
1176    # There is no 'common text' with this splitter.
1177    return [\@r, undef, \@errors];
1178}
1179# Split the programme by looking for times, but each new programme has
1180# the same words (except times).
1181#
1182sub multipart_split_desc_simple( $$ ) {
1183    our @words; local *words = shift;
1184    my $interp = shift;
1185
1186    my @common;
1187    my @times;
1188    foreach (@words) {
1189	die if not defined;
1190	my $i = $interp->($_);
1191	if (defined $i) {
1192	    push @times, $i;
1193	    if (@common and ($common[-1] eq 'at' or $common[-1] eq 'and')) {
1194		pop @common;
1195	    }
1196	}
1197	else {
1198	    push @common, $_;
1199	}
1200    }
1201
1202    my @r;
1203    foreach (@times) {
1204	die if not defined;
1205	push @r, { time => $_ };
1206    }
1207
1208    # No 'errors' but lots of 'common text'.
1209    return [ \@r, join(' ', @common), [] ];
1210}
1211sub multipart_split_desc_fromto( $$ ) {
1212    our @words; local *words = shift;
1213    my $interp = shift;
1214    my @r;
1215    my @errors;
1216
1217    # This routine is limited a bit because it's expected to return
1218    # hashes with just 'time'.  But we know more than that, we know
1219    # both start time and stop time for each subprogramme.  That
1220    # information would be thrown away.
1221    #
1222    # For now, it seems that this kind of multipart programme always
1223    # has one part beginning when the previous one ended, so we can
1224    # just check that this property holds.  Then there will be no loss
1225    # of stop-time information.
1226    #
1227    my ($last_start, $last_stop);
1228    my @title = ();
1229    my $done_title = 0;
1230    my @desc = ();
1231    foreach (@words) {
1232	if (/^([0-9.:]+)-([0-9.:]+)$/
1233	    and defined(my $istart = $interp->($1))
1234	    and defined(my $istop = $interp->($2))) {
1235	    # It's a pair of times.
1236	    if (defined $last_start) {
1237		# Deal with the previous subprogramme.
1238		warn "mismatch between stop time $last_stop and start time $istart"
1239		  if $last_stop ne $istart;
1240		my %p = ( time => $last_start, main_title => join(' ', @title) );
1241		$p{main_desc} = join(' ', @desc) if @desc;
1242		push @r, \%p;
1243	    }
1244	    ($last_start, $last_stop) = ($istart, $istop);
1245	    @title = ();
1246	    $done_title = 0;
1247	    @desc = ();
1248	}
1249	elsif (/:$/) {
1250	    # A colon ends the title.
1251	    if (not $done_title) {
1252		(my $tmp = $_) =~ s/:$//;
1253		push @title, $tmp;
1254		$done_title = 1;
1255	    }
1256	    else {
1257		warn "seen colon in description: '$_'";
1258		push @desc, $_;
1259	    }
1260	}
1261	elsif ($_ eq 'with') {
1262	    # Also 'with' can end a title, as in 'News with...'.  This
1263	    # is probably the only time I've seen a use for the
1264	    # convention that words in titles should be capitalized.
1265	    #
1266	    # The 'with' stuff goes into the description, where some
1267	    # other handler can pick it up.
1268	    #
1269	    $done_title = 1;
1270	    push @desc, $_;
1271	}
1272	else {
1273	    if (not $done_title) {
1274		push @title, $_;
1275	    }
1276	    else  {
1277		push @desc, $_;
1278	    }
1279	}
1280    }
1281    if (defined $last_start) {
1282	my %p = ( time => $last_start, main_title => join(' ', @title) );
1283	$p{main_desc} = join(' ', @desc) if @desc;
1284	push @r, \%p;
1285    }
1286
1287    return [ \@r, undef, [] ];
1288}
1289# Really an 'including at' programme should be sandwiched in the
1290# middle of its parent, but the format doesn't allow that so for
1291# simplicity we treat as a multipart.
1292#
1293sub multipart_split_desc_including_at( $$ ) {
1294    our @words; local *words = shift;
1295    my $interp = shift;
1296    my @r;
1297    my @rump;
1298
1299    while (@words) {
1300	my $t;
1301	if (@words >= 4
1302	    and $words[0] =~ /^[Ii]ncluding$/
1303	    and $words[1] eq 'at'
1304	    and defined ($t = $interp->($words[2]))
1305	    and $words[3] =~ /^[A-Z]/) {
1306	    shift @words; shift @words; shift @words;
1307	    my @title;
1308	    while (@words and $words[0] =~ /^[A-Z]/) {
1309		my $w = shift @words;
1310		if ($w =~ s/[.,;]$//) {
1311		    push @title, $w;
1312		    last;
1313		}
1314		else {
1315		    push @title, $w;
1316		}
1317	    }
1318	    push @r, { time => $t, main_title => join(' ', @title) };
1319	}
1320	else {
1321	    push @rump, shift @words;
1322	}
1323    }
1324
1325    unshift @r, { main_desc => join(' ', @rump) };
1326    return [ \@r, '', [] ];
1327}
1328# Is a time string using the 12 hour or 24 hour clock?  Returns a pair
1329# of two booleans: the first means it could be 12h, the seecond that
1330# it could be 24h.  Expects an h.mm or hh.mm time string.  If the
1331# string is not a valid time under either clock, returns [0, 0].
1332#
1333# Allows eg '5.30' to be a 24 hour time (05:30).
1334#
1335sub clocks_poss( $ ) {
1336    local $_ = shift;
1337    if (not /^(\d\d?)\.(\d\d)$/) {
1338	return [0, 0];
1339    }
1340    my ($hh, $mm) = ($1, $2);
1341    return [0, 0] if $mm > 59;
1342    return [0, 1] if $hh =~ /^0/;
1343    return [1, 1] if 1 <= $hh && $hh < 13;
1344    return [0, 1] if 13 <= $hh && $hh < 24;
1345
1346    # Do not accept '24:00', '24:01' etc - not until it's proved we
1347    # need to.
1348    #
1349    return [0, 0];
1350}
1351# Debugging stringification.
1352sub dump_pseudo_programme( $ ) {
1353    my @r;
1354    my $pp = shift;
1355    foreach (qw(time main_title main_desc)) {
1356	push @r, $pp->{$_} if defined $pp->{$_};
1357    }
1358    return join(' ', @r);
1359}
1360
1361
1362# time12to24()
1363#
1364# Convert a 12 hour time string to a 24 hour one, without anything too
1365# fancy.  In particular the timezone is passed through unchanged.
1366#
1367sub time12to24( $ ) {
1368    die 'usage: time12to24(12 hour time string)' if @_ != 1;
1369    local $_ = shift;
1370    die if not defined;
1371
1372    # Remove the timezone and stick it back on afterwards.
1373    my $tz = gettz($_);
1374    s/\Q$tz\E// if defined $tz;
1375
1376    s/\s+//;
1377    my ($hours, $mins, $ampm) = /^(\d\d?)[.:]?(\d\d)\s*(am|pm)$/
1378      or die "bad time $_";
1379    if ($ampm eq 'am') {
1380	if (1 <= $hours and $hours < 12) {
1381	    $hours = $hours;      # 5am = 05:00
1382	}
1383	elsif ($hours == 12) {
1384	    $hours = 0;           # 12am = 00:00
1385	}
1386	else { die "bad number of hours $hours" }
1387    }
1388    elsif ($ampm eq 'pm') {
1389	if ($hours == 12) {
1390	    $hours = 12;          # 12pm = 12:00
1391	}
1392	elsif (1 <= $hours and $hours < 12) {
1393	    $hours = 12 + $hours; # 5pm = 17:00
1394	}
1395	else { die "bad number of hours $hours" }
1396    }
1397    else { die }
1398
1399    my $r = sprintf('%02d:%02d', $hours, $mins);
1400    $r .= " $tz" if defined $tz;
1401    return $r;
1402}
1403
1404
1405
1406
1407# special_credits()
1408#
1409# Try to sniff out presenter, actor or guest info from the start of the
1410# description and put it into the credits section instead.
1411#
1412# Parameters: one programme (hashref)
1413# Returns: [ modified programme ], or undef
1414#
1415# May modify the programme passed in, if return value is not undef.
1416# But that's okay for a special-case handler.
1417#
1418sub special_credits( $ ) {
1419#    local $Log::TraceMessages::On = 1;
1420    die 'usage: special_credits(programme hashref)' if @_ != 1;
1421    my $prog = shift;
1422    t 'special_credits() ENTRY';
1423
1424    # Caution: we need to make sure $_ is 'live' so updates to it
1425    # change the programme, when calling the extractors.
1426    #
1427    foreach my $pair (grep { pair_ok($_) } @{$prog->{desc}}) {
1428	die if not defined;
1429	t "testing desc: $pair->[0]";
1430	if (not length $pair->[0]) {
1431	    local $Log::TraceMessages::On = 1;
1432	    t 'programme with empty desc:' . d $prog;
1433	}
1434
1435	if (s/\b([pP])resenteed\b/$1resented/g) {
1436	    t "fixing spelling mistake!";
1437	    return [ $prog ];
1438	}
1439
1440	# Regexps to apply to the description (currently only the
1441	# first English-language description is matched).  The first
1442	# element is a subroutine which should alter $_ and return a
1443	# name or string of names if it succeeds, undef if it fails to
1444	# match.
1445	#
1446	# The first argument of the subroutine is the programme
1447	# itself, but this usually isn't used.  In any case, it should
1448	# not be modified except by altering $_.
1449	#
1450 	my @extractors =
1451 	  (
1452	   # Definitely presenter
1453	   [ sub {
1454		 s{(\b[a-z]\w+\s+)(?:(?:guest|virtual|new\s+)?presenters?)\s+($NAMES_RE)}{$1$2}o
1455		   && return $2;
1456		 s{((?:^|\.|\?)\s*)($NAMES_RE)\s+(?:(?:presents?)|(?:rounds?\s+up)|(?:introduces?))\b\s*(\.|,|\w|\Z)}
1457		   {$1 . uc $3}oe
1458		     && return $2;
1459		 s{Presenters?\s+($NAMES_RE)}{$1}o
1460		   && return $1;
1461		 s{,?\s*[cC]o-?presenters?\s+($NAMES_RE)}{}o
1462		   && return $1;
1463		 s{,?\s*[pP]resented by\s+($NAMES_RE)\b\s*(.|,?\s+\w|\Z)}{uc $2}oe
1464		   && return $1;
1465		 s{^\s*([hH]eadlines?(?:\s+\S+)?),?\s*[wW]ith\s+($NAMES_RE)\b(?:\.\s*)?}{$1}o
1466		   && return $2;
1467		 s{,?\s*(?:[iI]ntroduced|[cC]haired)\s+by\s+($NAMES_RE)(?:\.\s*)?}{}o
1468		   && return $1;
1469
1470		 # This last one is special: it adds 'Last in series'
1471		 # which some other handler might pick up.
1472		 #
1473		 s{((?:^|\.|\?)\s*)($NAMES_RE)\s+concludes?\s+the\s+series\b\s*(?:with\b\s*)?(\.|,|\w|\Z)}
1474		   {$1 . 'Last in series.  ' . uc $3}oe
1475		     && return $2;
1476
1477		 return undef;
1478	     }, 'presenter' ],
1479	   # Leave 'virtual presenter', 'aquatic presenter',
1480	   # 'new presenter' alone for now
1481	   #
1482
1483	   # Might be presenter depending on type of show
1484 	   [ sub {
1485		 if (hasp($_[0], 'category',
1486			  sub { $_[0] =~ /\b(?:comedy|drama|childrens?)\b/i })
1487		     and not $prog->{credits}->{presenter}) {
1488		     return undef;
1489		 }
1490		 s{^\s*,?\s*[wW]ith\s+($NAMES_RE)\b(?:(?:\.\s*)?$)?}{}o
1491		   && return $1;
1492		 s{^\s*(?:[hH]ost\s+)?($NAME_RE) (?:introduces|conducts) (\w)(?![^.,;:!?]*\bto\b)}                    {uc $2}oe
1493		   && return $1;
1494		 s{^\s*(?:[hH]ost\s+)?($NAME_RE)\s+(?:explores|examines)\s*}{}o
1495		   && return $1;
1496		 return undef;
1497	     }, 'presenter' ],
1498
1499	   [ sub {
1500		 s{((?:^|\.|\?)\s*)($NAME_RE)\s+interviews\b\s*(\.|,|\w|\Z)}{$1 . uc $3}oe
1501		   && return $2;
1502		 return undef;
1503	     }, 'presenter' ], # FIXME should be 'host' or 'interviewer'
1504
1505	   # 'with' in quiz shows is guest (maybe)
1506	   [ sub {
1507		 return undef unless hasp($_[0], 'category',
1508					  sub { $_[0] =~ /\b(?:quiz|sports?)\b/i });
1509		 s{((?:^|,|\.|\?)\s*)[wW]ith\s*($NAMES_RE)\b(?!\s+among)(\.\s*\S)}
1510		   {$1 ne ',' ? "$1$2" : $2}oe
1511		     && return $2;
1512		 s{((?:^|,|\.|\?)\s*)[wW]ith\s*($NAMES_RE)\b(?!\s+among)(?:\.\s*$)?}
1513		   {$1 ne ',' ? $1 : ''}oe
1514		     && return $2;
1515		 return undef;
1516	     }, 'guest' ],
1517
1518	   # 'with' in news/children shows is presenter (equally
1519	   # dubious).  Also a 'with' in a talk show might be
1520	   # presenter or might be guest, but at least we know it's
1521	   # not actor.
1522	   #
1523	   [ sub {
1524		 return undef
1525		   unless hasp($_[0], 'category',
1526			       sub { $_[0] =~ /\b(?:news|business|economics?|political|factual|talk|childrens?|game show)\b/i });
1527		 s{(?:^|,|\.|\?)\s*[wW]ith\s*($NAMES_RE)\b(?:\.\s*)?}{}o && return $1;
1528		 return undef;
1529	     }, 'presenter' ],
1530
1531	   [ sub {
1532		 # Anything with a 'presenter' does not have actors.
1533		 return undef if $prog->{credits}->{presenter};
1534		 s{(?:[Ww]ith\s+)?[gG]uest\s+star\s+($NAMES_RE)\b\s*[,;.]?\s*}{}o
1535		   && return $1;
1536		 s{^($NAMES_RE) (?:co-)?stars? in\s+(?:this\s+)?}{uc $2}oe
1537		   && return $1;
1538		 s{\s*($NAMES_RE) (?:co-)?stars?\.?\s*$}{}o
1539		   && return $1;
1540		 s{(?:^|\.|\?)\s*($NAMES_RE)\s+(?:co-)?stars?\s+as\s*$}{}o
1541		   && return $1;
1542		 return undef;
1543	     }, 'actor' ],
1544
1545	   [ sub {
1546		 # A discussion of 'a film starring Robin Williams'
1547		 # does not itself feature that actor.
1548		 #
1549		 return undef if $prog->{credits}->{presenter};
1550		 return undef if hasp($_[0], 'category', sub { $_[0] =~ /\barts\b/i });
1551		 s{(?:^|,|\.|\?)\s*[wW]ith\s*($NAMES_RE)\b(?:,|\.|;|$)?}{}o
1552		   && return $1;
1553		 s{,?\s*(?:(?:[Aa]lso|[Aa]nd)\s+)?(?:[Cc]o-|[Gg]uest-|[Gg]uest\s+)?[Ss]tarring\s+($NAMES_RE)\s*$}{}o
1554		   && return $1;
1555		 return undef;
1556	     }, 'actor' ],
1557
1558	   [ sub {
1559		 s{,?\s*[wW]ith\s+guests?\s+($NAMES_RE)\b(?:\.\s*)?}{}o
1560		   && return $1;
1561		 s{((?:^|\.|!|\?)\s*)($NAME_RE)\s+guests(?:$|(?:\s+)|(?:.\s*))}{$1}o
1562		   && return $2;
1563		 return undef;
1564	     }, 'guest' ],
1565
1566	   [ sub {
1567		 s{(?:^|\.|!|\?|,)(?:[Ww]ritten\s+)?\s*by\s+($NAMES_RE)\b($|\.)}{$2}o
1568		   && return $1;
1569		 return undef;
1570	     }, 'writer' ],
1571	  );
1572
1573	# Run our own little hog-butchering algorithm to match each of
1574	# the subroutines in turn.
1575	#
1576	my $matched = 0;
1577	EXTRACTORS: foreach my $e (@extractors) {
1578	    my ($sub, $person) = @$e;
1579	    t "running extractor for role $person";
1580	    my $old_length = length $pair->[0];
1581	    my $match;
1582	    for ($pair->[0]) { $match = $sub->($prog) }
1583	    if (defined $match) {
1584		# Found one or more $person called $match.  We add them to
1585		# the list unless they're already in there.  We use a
1586		# per-programme cache of this information to avoid
1587		# going through the list each time (basically because
1588		# hashes are more Perlish).
1589		#
1590		t "got list of people: $match";
1591		my @names = list_names($match);
1592		t 'that is, names: ' . d \@names;
1593		t 'by shortening desc, programme updated to: ' . d $prog;
1594		for my $credits ($prog->{credits}) {
1595		    my %seen;
1596		    if (lc $person eq 'guest') {
1597			# Impossible for someone to be guest as well
1598			# as another part, so don't add it if already
1599			# listed anywhere.
1600			#
1601			foreach (keys %$credits) {
1602			    $seen{$_}++ foreach @{$credits->{$_}};
1603			}
1604		    }
1605		    else {
1606			# Cannot add this person if they are already
1607			# given in the same job, or as a guest.
1608			#
1609			foreach (@{$credits->{$person}}, @{$credits->{guest}}) {
1610			    $seen{$_}++ && warn "person $_ seen twice";
1611			}
1612		    }
1613
1614		    t 'people already known (or ineligible): ' . d \%seen;
1615		    foreach (@names) {
1616			t "maybe adding $_ as a $person";
1617			push @{$credits->{$person}}, $_ unless $seen{$_}++;
1618		    }
1619		    t '$credits->{$person}=' . d $credits->{$person};
1620		}
1621
1622		if (length $pair->[0] >= $old_length) {
1623		    warn "extractor failed to shorten text: now $pair->[0]";
1624		}
1625
1626		t 'by adding people, programme updated to: ' . d $prog;
1627		$matched = 1;
1628		goto EXTRACTORS; # start again from beginning of loop
1629	    }
1630	}
1631	if ($matched) {
1632	    t 'some handlers matched, programme: ' . d $prog;
1633	    scrub_empty($prog->{desc});
1634	    t 'after removing empty things, programme: ' . d $prog;
1635            return [ $prog ];
1636        }
1637    }
1638    return undef;
1639}
1640# has()
1641#
1642# Check whether some attribute of a programme matches a particular
1643# string.  For example, does the programme have the category 'quiz'?
1644# This means checking all categories of acceptable language.
1645#
1646#   has($programme, 'category', 'quiz');
1647#
1648sub has( $$@ ) {
1649#    local $Log::TraceMessages::On = 1;
1650    my ($p, $attr, @allowed) = @_;
1651    t 'testing whether programme: ' . d $p;
1652    t "has attribute $attr in the list: " . d \@allowed;
1653    my $list = $p->{$attr};
1654    t 'all [text, lang] pairs for this attr: ' . d $list;
1655    return 0 if not defined $list;
1656    foreach (grep { pair_ok($_) } @$list) {
1657	my ($text, $lang) = @$_;
1658	foreach (@allowed) {
1659	    t "testing if $text matches $_ (nocase)";
1660	    return 1 if lc $text eq $_;
1661	}
1662    }
1663    t 'none of them matched, returning false';
1664    return 0;
1665}
1666# hasp()
1667#
1668# Like has() but instead of a list of strings to compare against,
1669# takes a subroutine reference.  This subroutine will be run against
1670# all the text strings of suitable language in turn until it matches
1671# one, when true is returned.  If none match, returns false.
1672#
1673# Parameters:
1674#   ref to programme hash
1675#   name of key to look under
1676#   subroutine to apply to each value of key with acceptable language
1677#
1678# Returns: whether subroutine gives true for at least one value.
1679#
1680# The subroutine will get the text value passed in $_[0].
1681#
1682sub hasp( $$$ ) {
1683#    local $Log::TraceMessages::On = 1;
1684    my ($p, $attr, $sub) = @_;
1685    die "expected programme hash as first argument, not $p"
1686      if ref $p ne 'HASH';
1687    t 'testing whether programme: ' . d $p;
1688    t "has a value for attribute $attr that makes sub return true";
1689
1690    # FIXME commonize this with has().
1691    my $list = $p->{$attr};
1692    t 'all [text, lang] pairs for this attr: ' . d $list;
1693    return 0 if not defined $list;
1694    foreach (grep { pair_ok($_) } @$list) {
1695	my ($text, $lang) = @$_;
1696	t "testing if $text matches";
1697	return 1 if $sub->($text);
1698    }
1699    t 'none of them matched, returning false';
1700    return 0;
1701}
1702
1703
1704# special_new_series()
1705#
1706# Contrary to first appearances, the <new /> element in the XML isn't
1707# to indicate a new series - it means something stronger, a whole new
1708# show (not a new season of an existing show).  But you can represent
1709# part of the meaning of 'new series' within the episode-num
1710# structure, because obviously a new series means that this is the
1711# first episode of the current series.
1712#
1713# This handler is mostly here to get rid of the 'New series' text at
1714# the start of descriptions, to try and make output from different
1715# grabbers look the same.
1716#
1717sub special_new_series( $ ) {
1718    die 'usage: special_new_series(programme)' if @_ != 1;
1719    my $p = shift;
1720
1721    # Just assume that if it contains 'New series' at the start then
1722    # it's English.
1723    #
1724    my $is_new_series = 0;
1725    foreach (@{$p->{desc}}) {
1726	for ($_->[0]) {
1727	    if (s/^New series(?:\.\s*|$)//
1728		or s/^New series (?:of (?:the )?)?(\w)/uc $1/e
1729	       ) {
1730		$is_new_series = 1;
1731	    }
1732	}
1733    }
1734
1735    return undef if not $is_new_series;
1736    if (defined $p->{'episode-num'}) {
1737	foreach (@{$p->{'episode-num'}}) {
1738	    my ($content, $system) = @$_;
1739	    next unless $system eq 'xmltv_ns';
1740	    $content =~ m!^\s*(\d+/\d+|\d+|)\s*\.\s*(\d+/\d+|\d+|)\s*\.\s*(\d+/\d+|\d+|)\s*$!
1741		or warn("badly formed xmltv_ns episode-num: $content"), return [ $p ];
1742	    my ($season, $episode, $part) = ($1, $2, $3);
1743	    if ($episode ne '' and $episode !~ /^0/) {
1744		warn "new series, but episode number $episode";
1745	    }
1746	    elsif ($episode eq '') {
1747		# We now know the information that this is the first
1748		# episode of the series.
1749		#
1750		$episode = '0';
1751		$content = "$season . $episode . $part";
1752		$_ = [ $content, $system ];
1753		last;
1754	    }
1755	}
1756    }
1757    else {
1758	# Make a dummy episode num which says nothing other than
1759	# this is the first episode of the series.
1760	#
1761	$p->{'episode-num'} = [ [ ' . 0 . ', 'xmltv_ns' ] ];
1762    }
1763    scrub_empty($p->{desc});
1764    return [ $p ];
1765}
1766
1767
1768# special_year(): take a year at the start of the description and move
1769# it to the 'date' field.
1770#
1771sub special_year( $ ) {
1772    die 'usage: special_new_series(programme)' if @_ != 1;
1773    my $p = shift;
1774
1775    my $year;
1776    foreach (@{$p->{desc}}) {
1777	if ($_->[0] =~ s/^(\d{4})\s+//) {
1778	    my $got = $1;
1779	    if (defined $year and $got ne $year) {
1780		warn "found different years $year and $got";
1781		return [ $p ];
1782	    }
1783	    $year = $got;
1784	}
1785    }
1786    return undef if not defined $year;
1787    if (defined $p->{date}) {
1788	if ($p->{date} !~ /^\s*$year/) {
1789	    warn "found year $year in programme description, but date $p->{date}";
1790	}
1791    }
1792    else {
1793	$p->{date} = $year;
1794    }
1795    scrub_empty($p->{desc});
1796    return [ $p ];
1797}
1798
1799
1800# 'TVM' at start of description means TV movie.
1801sub special_tv_movie( $ ) {
1802    die 'usage: special_tv_movie(programme)' if @_ != 1;
1803    my $p = shift;
1804    my $is_tv_movie = 0;
1805    foreach (@{$p->{desc}}) {
1806	my $lang = $_->[1];
1807	if (not defined $lang or $lang =~ /^en/) {
1808	    if ($_->[0] =~ s/^TVM\b\s*//) {
1809		$is_tv_movie = 1;
1810	    }
1811	}
1812    }
1813    return undef if not $is_tv_movie;
1814    add($p, 'category', 'TV movie');
1815    scrub_empty($p->{desc});
1816    return [ $p ];
1817}
1818
1819
1820# '(T)' in description means teletext subtitles.  But this should run
1821# after doing any splitting and other stuff.
1822#
1823sub special_teletext_subtitles( $ ) {
1824    die 'usage: special_teletext_subtitles(programme)' if @_ != 1;
1825    my $p = shift;
1826    my $has_t = 0;
1827    foreach (@{$p->{desc}}) {
1828	if ($_->[0] =~ s/\s*\(T\)\s*$//) {
1829	    $has_t = 1;
1830	}
1831    }
1832    return undef if not $has_t;
1833    if (defined $p->{subtitles}) {
1834	foreach (@{$p->{subtitles}}) {
1835	    return [ $p ] if defined $_->{type} and $_->{type} eq 'teletext';
1836	}
1837    }
1838    push @{$p->{subtitles}}, { type => 'teletext' };
1839    scrub_empty($p->{desc});
1840    return [ $p ];
1841}
1842
1843
1844# Remove stock phrases that have no meaning.
1845sub special_useless( $ ) {
1846    die 'usage: special_useless(programme)' if @_ != 1;
1847    my $p = shift;
1848
1849    # FIXME need to commonize hog-butchering with special_credits().
1850    my $changed = 0;
1851    foreach (@{$p->{desc}}) {
1852	for ($_->[0]) {
1853	    $changed |= s/^(?:a\s+|)round-up\s+of\s+(\w)/uc $1/ie;
1854	    $changed |= s/^(\w+[^s])\s+round-up\.?\s*$/$1 . 's'/ie;
1855	    $changed |= s/((?:^|\.|!|\?)\s*)Coverage\s+of\s+(\w)/$1 . uc $2/e;
1856	}
1857    }
1858
1859    return [ $p ] if $changed;
1860    return undef;
1861}
1862
1863
1864# special_radio4()
1865#
1866# Split Radio 4 into FM and LW.
1867#
1868sub special_radio4( $ ) {
1869    die 'usage: special_radio4(programme)' if @_ != 1;
1870    my $p = shift;
1871    return undef if $p->{channel} ne 'radio4';
1872
1873    for ($p->{title}) {
1874	if (s/^\(FM\)\s+//) {
1875	    $p->{channel} = 'radio4-fm';
1876	    return [ $p ];
1877	}
1878	if (s/^\(LW\)\s+//) {
1879	    $p->{channel} = 'radio4-lw';
1880	    return [ $p ];
1881	}
1882
1883	my %fm = ( %$p, channel => 'radio4-fm' );
1884	my %lw = ( %$p, channel => 'radio4-lw' );
1885	return [ \%fm, \%lw ];
1886    }
1887}
1888
1889
1890# special_split_title()
1891#
1892# In addition to the 'programme tacked onto the end of another'
1893# handled by add_clumpidx, you also sometimes see two programmes
1894# totally sharing an entry.  For example 'News; Shipping Forecast'.
1895#
1896sub special_split_title( $ ) {
1897    die 'usage: special_split_title(programme)' if @_ != 1;
1898    my $p = shift;
1899    return undef if $p->{title} !~ tr/;//;
1900
1901    # Split the title at ; and make N identical programmes one with
1902    # each title.  The programme details are given to only the last of
1903    # the programmes - in the listings data we're getting, normally
1904    # the insignificant programme comes first with the main feature
1905    # last, as in 'News; Radio 3 Lunchtime Concert'.
1906    #
1907
1908    # List of elements which are meta-data and should be kept for all
1909    # the programmes we split into - the rest are given only to the
1910    # last programme.
1911    #
1912    my %meta = (start => 1, stop => 1, 'pdc-start' => 1,
1913		'vps-start' => 1, showview => 1, videoplus => 1,
1914		channel => 1);
1915    # but not clumpidx!
1916
1917    my %p_meta;
1918    foreach (grep { $meta{$_} } keys %$p) {
1919	$p_meta{$_} = $p->{$_};
1920    }
1921
1922    my @r;
1923    my @titles = split /\s*;+\s*/, $p->{title};
1924    for (my $i = 0; $i < @titles - 1; $i++) {
1925	push @r, { %p_meta,
1926		title => $titles[$i],
1927		clumpidx => ( "$i/" . scalar @titles ) };
1928    }
1929    push @r, { %$p,
1930		title => $titles[-1],
1931		clumpidx => ("$#titles/" . scalar @titles) };
1932
1933    return \@r;
1934}
1935
1936
1937# special_film()
1938#
1939sub special_film( $ ) {
1940    die 'usage: special_film(programme)' if @_ != 1;
1941    my $p = shift;
1942    if (not defined $p->{'sub-title'} or $p->{'sub-title'} ne '(Film)') {
1943	return undef;
1944    }
1945
1946    warn "replacing category $p->{category} with 'film'"
1947      if defined $p->{category};
1948    $p->{category} = 'film';
1949    undef $p->{'sub-title'};
1950
1951    if (defined $p->{desc} and $p->{desc} =~ s/^(\d{4})\s+//) {
1952	warn "found year $1 in description, replacing date $p->{date}"
1953	  if defined $p->{date};
1954	$p->{date} = $1;
1955    }
1956
1957    return [ $p ];
1958}
1959
1960
1961# add()
1962#
1963# Add a piece of human-readable information to a particular slot, but
1964# only if it isn't there already.  For example add the category
1965# 'music', but only if that category isn't already set.  This is for
1966# keys that take multiple values and each value is a [ text, lang ]
1967# pair.  The language is assumed to be English.
1968#
1969# Parameters:
1970#   programme hash to add to
1971#   name of key
1972#   textual value to add
1973#
1974# Returns: whether the programme was altered.
1975#
1976sub add( $$$ ) {
1977    my ($p, $k, $v) = @_;
1978    if (defined $p->{$k}) {
1979	foreach (@{$p->{$k}}) {
1980	    return 0 if $_->[0] eq $v;
1981	}
1982    }
1983    push @{$p->{$k}}, [ $v, $LANG ];
1984    return 1;
1985}
1986
1987
1988# scrub_empty(): remove empty strings from a list of [text, lang]
1989# pairs.
1990#
1991# Parameters: zero or more listrefs
1992#
1993# Modifies lists passed in, removing all [ '', whatever ] pairs.
1994#
1995sub scrub_empty( @ ) {
1996    foreach (@_) {
1997	@$_ = grep { length $_->[0] } @$_;
1998    }
1999}
2000
2001
2002# Make sure that a programme altered by a special handler does not
2003# spill outside its alotted timespan.  This is just a sanity check
2004# before fix_clumps() does its stuff.  In a future version we might
2005# remove this restriction and allow special handlers to move
2006# programmes outside their original timeslot.
2007#
2008# Parameters:
2009#   original programme
2010#   ref to list of new programmes
2011#
2012sub check_time_fits( $$ ) {
2013    my $orig = shift;
2014    my @new = @{shift()};
2015    my $o_start = pd($orig->{start});
2016    die if not defined $o_start;
2017    my $o_stop = pd($orig->{stop});
2018    foreach (@new) {
2019	my $start = pd($_->{start});
2020	die if not defined $start;
2021	if (Date_Cmp($start, $o_start) < 0) {
2022	    die "programme starting at $o_start was split into one starting at $start";
2023	}
2024
2025	if (defined $o_stop) {
2026	    my $stop = pd($_->{stop});
2027	    if (defined $stop and Date_Cmp($o_stop, $stop) < 0) {
2028		die "programme stopping at $o_stop was split into one stopping at $stop";
2029	    }
2030	}
2031    }
2032}
2033
2034
2035
2036# Another sanity check.
2037sub check_same_channel( $ ) {
2038    my $progs = shift;
2039    my $ch;
2040    foreach my $prog (@$progs) {
2041	for ($prog->{channel}) {
2042	    if (not defined) {
2043		t 'no channel! ' . d $prog;
2044		croak 'programme has no channel';
2045	    }
2046	    if (not defined $ch) {
2047		$ch = $_;
2048	    }
2049	    elsif ($ch eq $_) {
2050		# Okay.
2051	    }
2052	    else {
2053		# Cannot use croak() due to this error message:
2054		#
2055		# Bizarre copy of ARRAY in aassign at /usr/lib/perl5/5.6.0/Carp/Heavy.pm line 79.
2056		#
2057		local $Log::TraceMessages::On = 1;
2058		t 'same clump, different channels: ' . d($progs->[0]) . ' and ' . d($prog);
2059		die "programmes in same clump have different channels: $_, $ch";
2060	    }
2061	}
2062    }
2063}
2064
2065
2066# There is a very hard to track down bug where stop times mysteriously
2067# get set to something ridiculous.  It varies from one perl version to
2068# another (hence the version check at the top) but still occurs even
2069# with 5.6.1.  To track it down I have isolated all code that sets
2070# stop times in this subroutine.
2071#
2072sub set_stop_time( $$ ) {
2073    my $p = shift;
2074    my $s = shift;
2075
2076    if ($SLOW) {
2077	# Another mysterious-bug-preventing line, see no_shared_scalars().
2078	my $dummy = "$s";
2079
2080	$p->{stop} = $s;
2081    }
2082    else {
2083	# Don't set stop times at all.
2084	delete $p->{stop};
2085    }
2086}
2087
2088# More debugging aids.
2089sub cst( $ ) {
2090    my $p = shift;
2091    croak "prog $p->{title}->[0]->[0] has bogus stop time"
2092      if exists $p->{stop} and $p->{stop} eq 'boogus FIXME XXX';
2093}
2094
2095sub no_shared_scalars( $ ) {
2096    my %seen;
2097    foreach my $h (@{$_[0]}) {
2098	foreach my $k (keys %$h) {
2099	    my $ref = \ ($h->{$k});
2100	    my $addr = "$ref";
2101	    $seen{$addr}++ && die "scalar $addr seen twice";
2102	}
2103    }
2104}
2105