1# -*- perl -*-
2# $Id: XMLTV.pm.in,v 1.175 2017/11/28 05:28:04 rmeden Exp $
3package XMLTV;
4
5use strict;
6use base 'Exporter';
7our @EXPORT = ();
8our @EXPORT_OK = qw(read_data parse parsefile parsefiles write_data
9		    best_name list_channel_keys list_programme_keys);
10
11# For the time being the version of this library is tied to that of
12# the xmltv package as a whole.  This number should be checked by the
13# mkdist tool.
14#
15our $VERSION = '0.5.70';
16
17# Work around changing behaviour of XML::Twig.  On some systems (like
18# mine) it always returns UTF-8 data unless KeepEncoding is specified.
19# However the encoding() method tells you the encoding of the original
20# document, not of the data you receive.  To be sure of what you're
21# getting, it is easiest on such a system to not give KeepEncoding and
22# just use UTF-8.
23#
24# But on other systems (seemingly perl 5.8 and above), XML::Twig tries
25# to keep the original document's encoding in the strings returned.
26# You then have to call encoding() to find out what you're getting.
27# To make sure of this behaviour we set KeepEncoding to true on such a
28# system.
29#
30# Setting KeepEncoding true everywhere seems to do no harm, it's a
31# pity that we lose conversion to UTF-8 but at least it's the same
32# everywhere.  So the library is distributed with this flag on.
33#
34my $KEEP_ENCODING = 1;
35
36# We need a way of telling parsefiles_callback() to optionally *not* die when presented with multiple encodings,
37# but without affecting any other packages which uses it (i.e. so a new sub param is out of the question)
38# - best I can think of for the minute is a global  (yuk)
39my $DIE_ON_MULTIPLE_ENCODINGS = 1;
40
41my %warned_unknown_key;
42sub warn_unknown_keys( $$ );
43
44=pod
45
46=head1 NAME
47
48XMLTV - Perl extension to read and write TV listings in XMLTV format
49
50=head1 SYNOPSIS
51
52  use XMLTV;
53  my $data = XMLTV::parsefile('tv.xml');
54  my ($encoding, $credits, $ch, $progs) = @$data;
55  my $langs = [ 'en', 'fr' ];
56  print 'source of listings is: ', $credits->{'source-info-name'}, "\n"
57      if defined $credits->{'source-info-name'};
58  foreach (values %$ch) {
59      my ($text, $lang) = @{XMLTV::best_name($langs, $_->{'display-name'})};
60      print "channel $_->{id} has name $text\n";
61      print "...in language $lang\n" if defined $lang;
62  }
63  foreach (@$progs) {
64      print "programme on channel $_->{channel} at time $_->{start}\n";
65      next if not defined $_->{desc};
66      foreach (@{$_->{desc}}) {
67          my ($text, $lang) = @$_;
68          print "has description $text\n";
69          print "...in language $lang\n" if defined $lang;
70      }
71  }
72
73The value of $data will be something a bit like:
74
75  [ 'UTF-8',
76    { 'source-info-name' => 'Ananova', 'generator-info-name' => 'XMLTV' },
77    { 'radio-4.bbc.co.uk' => { 'display-name' => [ [ 'en',  'BBC Radio 4' ],
78						   [ 'en',  'Radio 4'     ],
79						   [ undef, '4'           ] ],
80			       'id' => 'radio-4.bbc.co.uk' },
81      ... },
82    [ { start => '200111121800', title => [ [ 'Simpsons', 'en' ] ],
83        channel => 'radio-4.bbc.co.uk' },
84      ... ] ]
85
86=head1 DESCRIPTION
87
88This module provides an interface to read and write files in XMLTV
89format (a TV listings format defined by xmltv.dtd).  In general element
90names in the XML correspond to hash keys in the Perl data structure.
91You can think of this module as a bit like B<XML::Simple>, but
92specialized to the XMLTV file format.
93
94The Perl data structure corresponding to an XMLTV file has four
95elements.  The first gives the character encoding used for text data,
96typically UTF-8 or ISO-8859-1.  (The encoding value could also be
97undef meaning 'unknown', when the library canE<39>t work out what it
98is.)  The second element gives the attributes of the root <tv>
99element, which give information about the source of the TV listings.
100The third element is a list of channels, each list element being a
101hash corresponding to one <channel> element.  The fourth element is
102similarly a list of programmes.  More details about the data structure
103are given later.  The easiest way to find out what it looks like is to
104load some small XMLTV files and use B<Data::Dumper> to print out the
105resulting structure.
106
107=head1 USAGE
108
109=cut
110
111use XML::Twig;
112use XML::Writer 0.600;
113use Date::Manip;
114use Carp;
115use Data::Dumper;
116
117# Use Lingua::Preferred if available, else kludge a replacement.
118sub my_which_lang { return $_[1]->[0] }
119BEGIN {
120    eval { require Lingua::Preferred };
121    *which_lang = $@ ? \&my_which_lang : \&Lingua::Preferred::which_lang;
122}
123
124# Use Log::TraceMessages if installed.
125BEGIN {
126    eval { require Log::TraceMessages };
127    if ($@) {
128	*t = sub {};
129	*d = sub { '' };
130    }
131    else {
132	*t = \&Log::TraceMessages::t;
133	*d = \&Log::TraceMessages::d;
134    }
135}
136
137# Attributes and subelements of channel.  Each subelement additionally
138# needs a handler defined.  Multiplicity is given for both, but for
139# attributes the only allowable values are '1' and '?'.
140#
141# Ordering of attributes is not really important, but we keep the same
142# order as they are given in the DTD so that output looks nice.
143#
144# The ordering of the subelements list gives the order in which these
145# elements must appear in the DTD.  In fact, these lists just
146# duplicate information in the DTD and add details of what handlers
147# to call.
148#
149our @Channel_Attributes = ([ 'id', '1' ]);
150our @Channel_Handlers =
151  (
152   [ 'display-name', 'with-lang', '+' ],
153   [ 'icon',         'icon',      '*' ],
154   [ 'url',          'scalar',    '*' ],
155  );
156
157# Same for <programme> elements.
158our @Programme_Attributes =
159  (
160   [ 'start',     '1' ],
161   [ 'stop',      '?' ],
162   [ 'pdc-start', '?' ],
163   [ 'vps-start', '?' ],
164   [ 'showview',  '?' ],
165   [ 'videoplus', '?' ],
166   [ 'channel',   '1' ],
167   [ 'clumpidx',  '?' ],
168  );
169our @Programme_Handlers =
170  (
171   [ 'title',            'with-lang',          '+' ],
172   [ 'sub-title',        'with-lang',          '*' ],
173   [ 'desc',             'with-lang/m',        '*' ],
174   [ 'credits',          'credits',            '?' ],
175   [ 'date',             'scalar',             '?' ],
176   [ 'category',         'with-lang',          '*' ],
177   [ 'keyword',          'with-lang',          '*' ],
178   [ 'language',         'with-lang',          '?' ],
179   [ 'orig-language',    'with-lang',          '?' ],
180   [ 'length',           'length',             '?' ],
181   [ 'icon',             'icon',               '*' ],
182   [ 'url',              'scalar',             '*' ],
183   [ 'country',          'with-lang',          '*' ],
184   [ 'episode-num',      'episode-num',        '*' ],
185   [ 'video',            'video',              '?' ],
186   [ 'audio',            'audio',              '?' ],
187   [ 'previously-shown', 'previously-shown',   '?' ],
188   [ 'premiere',         'with-lang/em',       '?' ],
189   [ 'last-chance',      'with-lang/em',       '?' ],
190   [ 'new',              'presence',           '?' ],
191   [ 'subtitles',        'subtitles',          '*' ],
192   [ 'rating',           'rating',             '*' ],
193   [ 'star-rating',      'star-rating',        '*' ],
194  );
195
196# And a hash mapping names like 'with-lang' to pairs of subs.  The
197# first for reading, the second for writing.  Note that the writers
198# alter the passed-in data as a side effect!  (If the writing sub is
199# called with an undef XML::Writer then it writes nothing but still
200# warns for (most) bad data checks - and still alters the data.)
201#
202our %Handlers = ();
203
204# Undocumented interface for adding extensions to the XMLTV format:
205# first add an entry to @XMLTV::Channel_Handlers or
206# @XMLTV::Programme_Handlers with your new element's name, 'type' and
207# multiplicity.  The 'type' should be a string you invent yourself.
208# Then $XMLTV::Handlers{'type'} should be a pair of subroutines, a
209# reader and a writer.  (Unless you want to use one of the existing
210# types such as 'with-lang' or 'scalar'.)
211#
212# Note that elements and attributes beginning 'x-' are skipped over
213# _automatically_, so you can't parse them with this method.  A better
214# way to add extensions is needed - doing this not encouraged but is
215# sometimes necessary.
216#
217
218# read_data() is a deprecated name for parsefile().
219sub read_data( $ ) { # FIXME remove altogether
220    warn "XMLTV::read_data() deprecated, use XMLTV::parsefile() instead\n";
221    &parsefile;
222}
223
224# Private.
225sub sanity( $ ) {
226    for (shift) {
227    	croak 'no <tv> element found' if not /<tv/;
228    }
229}
230
231=over
232
233=item parse(document)
234
235Takes an XMLTV document (a string) and returns the Perl data
236structure.  It is assumed that the document is valid XMLTV; if not
237the routine may die() with an error (although the current implementation
238just warns and continues for most small errors).
239
240The first element of the listref returned, the encoding, may vary
241according to the encoding of the input document, the versions of perl
242and C<XML::Parser> installed, the configuration of the XMLTV library
243and other factors including, but not limited to, the phase of the
244moon.  With luck it should always be either the encoding of the input
245file or UTF-8.
246
247Attributes and elements in the XML file whose names begin with 'x-'
248are skipped silently.  You can use these to include information which
249is not currently handled by the XMLTV format, or by this module.
250
251=cut
252
253sub parse( $ ) {
254    my $str = shift;
255    sanity($str);
256    # FIXME commonize with parsefiles()
257    my ($encoding, $credits);
258    my %channels;
259    my @programmes;
260    parse_callback($str,
261		   sub { $encoding = shift },
262		   sub { $credits = shift },
263		   sub { for (shift) { $channels{$_->{id}} = $_ } },
264		   sub { push @programmes, shift });
265    return [ $encoding, $credits, \%channels, \@programmes ];
266}
267
268=pod
269
270=item parsefiles(filename...)
271
272Like C<parse()> but takes one or more filenames instead of a string
273document.  The data returned is the merging of those file contents:
274the programmes will be concatenated in their original order, the
275channels just put together in arbitrary order (ordering of channels
276should not matter).
277
278It is necessary that each file have the same character encoding, if
279not, an exception is thrown.  Ideally the credits information would
280also be the same between all the files, since there is no obvious way to
281merge it - but if the credits information differs from one file to the
282next, one file is picked arbitrarily to provide credits and a warning
283is printed.  If two files give differing channel definitions for the
284same XMLTV channel id, then one is picked arbitrarily and a warning
285is printed.
286
287In the simple case, with just one file, you neednE<39>t worry
288about mismatching of encodings, credits or channels.
289
290The deprecated function C<parsefile()> is a wrapper allowing just one
291filename.
292
293=cut
294
295sub parsefiles( @ ) {
296    die 'one or more filenames required' if not @_;
297    my ($encoding, $credits);
298    my %channels;
299    my @programmes;
300    parsefiles_callback(sub { $encoding = shift },
301			sub { $credits = shift },
302			sub { for (shift) { $channels{$_->{id}} = $_ } },
303			sub { push @programmes, shift },
304			@_);
305    return [ $encoding, $credits, \%channels, \@programmes ];
306}
307
308sub parsefile( $ ) { parsefiles(@_) }
309
310=pod
311
312=item parse_callback(document, encoding_callback, credits_callback,
313                     channel_callback, programme_callback)
314
315An alternative interface.  Whereas C<parse()> reads the whole document
316and then returns a finished data structure, with this routine you
317specify a subroutine to be called as each <channel> element is read
318and another for each <programme> element.
319
320The first argument is the document to parse.  The remaining arguments
321are code references, one for each part of the document.
322
323The callback for encoding will be called once with a string giving the
324encoding.  In present releases of this module, it is also possible for
325the value to be undefined meaning 'unknown', but itE<39>s hoped that
326future releases will always be able to figure out the encoding used.
327
328The callback for credits will be called once with a hash reference.
329For channels and programmes, the appropriate function will be called
330zero or more times depending on how many channels / programmes are
331found in the file.
332
333The four subroutines will be called in order, that is, the encoding
334and credits will be done before the channel handler is called and all
335the channels will be dealt with before the first programme handler is
336called.
337
338If any of the code references is undef, nothing is called for that part
339of the file.
340
341For backwards compatibility, if the value for 'encoding callback' is
342not a code reference but a scalar reference, then the encoding found
343will be stored in that scalar.  Similarly if the 'credits callback'
344is a scalar reference, the scalar it points to will be set to point
345to the hash of credits.  This style of interface is deprecated: new
346code should just use four callbacks.
347
348For example:
349
350    my $document = '<tv>...</tv>';
351
352    my $encoding;
353    sub encoding_cb( $ ) { $encoding = shift }
354
355    my $credits;
356    sub credits_cb( $ ) { $credits = shift }
357
358    # The callback for each channel populates this hash.
359    my %channels;
360    sub channel_cb( $ ) {
361	my $c = shift;
362	$channels{$c->{id}} = $c;
363    }
364
365    # The callback for each programme.  We know that channels are
366    # always read before programmes, so the %channels hash will be
367    # fully populated.
368    #
369    sub programme_cb( $ ) {
370        my $p = shift;
371        print "got programme: $p->{title}->[0]->[0]\n";
372        my $c = $channels{$p->{channel}};
373        print 'channel name is: ', $c->{'display-name'}->[0]->[0], "\n";
374    }
375
376    # Let's go.
377    XMLTV::parse_callback($document, \&encoding_cb, \&credits_cb,
378                          \&channel_cb, \&programme_cb);
379
380=cut
381
382# Private.
383sub new_doc_callback( $$$$ ) {
384    my ($enc_cb, $cred_cb, $ch_cb, $p_cb) = @_;
385    t 'creating new XML::Twig';
386    t '\@Channel_Handlers=' . d \@Channel_Handlers;
387    t '\@Programme_Handlers=' . d \@Programme_Handlers;
388    new XML::Twig(StartTagHandlers =>
389		  { '/tv' => sub {
390			my ($t, $node) = @_;
391			my $enc;
392			if ($KEEP_ENCODING) {
393			    t 'KeepEncoding on, get original encoding';
394			    $enc = $t->encoding();
395			}
396			else {
397			    t 'assuming UTF-8 encoding';
398			    $enc = 'UTF-8';
399			}
400
401			if (defined $enc_cb) {
402			    for (ref $enc_cb) {
403				if ($_ eq 'CODE') {
404				    $enc_cb->($enc);
405				}
406				elsif ($_ eq 'SCALAR') {
407				    $$enc_cb = $enc;
408				}
409				else {
410				    die "callback should be code ref or scalar ref, or undef";
411				}
412			    }
413			}
414
415			if (defined $cred_cb) {
416			    my $cred = get_attrs($node);
417			    for (ref $cred_cb) {
418				if ($_ eq 'CODE') {
419				    $cred_cb->($cred);
420				}
421				elsif ($_ eq 'SCALAR') {
422				    $$cred_cb = $cred;
423				}
424				else {
425				    die "callback should be code ref or scalar ref, or undef";
426				}
427			    }
428			}
429			# Most of the above code can be removed in the
430			# next release.
431			#
432		    },
433		  },
434
435		  TwigHandlers =>
436		  { '/tv/channel'   => sub {
437			my ($t, $node) = @_;
438			die if not defined $node;
439			my $c = node_to_channel($node);
440			$t->purge();
441			if (not $c) {
442			    warn "skipping bad channel element\n";
443			}
444			else {
445			    $ch_cb->($c);
446			}
447		    },
448
449		    '/tv/programme' => sub {
450			my ($t, $node) = @_;
451			die if not defined $node;
452			my $p = node_to_programme($node);
453			$t->purge();
454			if (not $p) {
455			    warn "skipping bad programme element\n";
456			}
457			else {
458			    $p_cb->($p);
459			}
460		    },
461		  },
462
463		  KeepEncoding => $KEEP_ENCODING,
464		 );
465}
466
467sub parse_callback( $$$$$ ) {
468    my ($str, $enc_cb, $cred_cb, $ch_cb, $p_cb) = @_;
469    sanity($str);
470    new_doc_callback($enc_cb, $cred_cb, $ch_cb, $p_cb)->parse($str);
471}
472
473=pod
474
475=item parsefiles_callback(encoding_callback, credits_callback,
476                          channel_callback, programme_callback,
477                          filenames...)
478
479As C<parse_callback()> but takes one or more filenames to open,
480merging their contents in the same manner as C<parsefiles()>.  Note
481that the reading is still gradual - you get the channels and
482programmes one at a time, as they are read.
483
484Note that the same <channel> may be present in more than one file, so
485the channel callback will get called more than once.  ItE<39>s your
486responsibility to weed out duplicate channel elements (since writing
487them out again requires that each have a unique id).
488
489For compatibility, there is an alias C<parsefile_callback()> which is
490the same but takes only a single filename, B<before> the callback
491arguments.  This is deprecated.
492
493=cut
494
495sub parsefile_callback( $$$$$ ) {
496    my ($f, $enc_cb, $cred_cb, $ch_cb, $p_cb) = @_;
497    parsefiles_callback($enc_cb, $cred_cb, $ch_cb, $p_cb, $f);
498}
499
500sub parsefiles_callback( $$$$@ ) {
501    my ($enc_cb, $cred_cb, $ch_cb, $p_cb, @files) = @_;
502    die "one or more files required" if not @files;
503    my $all_encoding; my $have_encoding = 0;
504    my $all_credits;
505    my %all_channels;
506
507    my $do_next_file;    # sub to parse file ( defined below)
508    my $do_file_number;  # current file in @files array
509
510    my $my_enc_cb = sub( $ ) {
511	my $e = shift;
512	t 'encoding callback';
513	if ($have_encoding) {
514	    t 'seen encoding before, just check';
515	    my ($da, $de) = (defined $all_encoding, defined $e);
516	    if (not $da and not $de) {
517		warn "two files both have unspecified character encodings, hope they're the same\n";
518	    }
519	    elsif (not $da and $de) {
520		##warn "encoding $e not being returned to caller\n";
521		$all_encoding = $e;
522	    }
523	    elsif ($da and not $de) {
524		warn "input file with unspecified encoding, assuming same as others ($all_encoding)\n";
525	    }
526	    elsif ($da and $de) {
527		if (uc($all_encoding) ne uc($e)) {
528		    if ( defined $DIE_ON_MULTIPLE_ENCODINGS && !$DIE_ON_MULTIPLE_ENCODINGS ) {
529			warn "this file's encoding $e differs from others' $all_encoding \n";
530		    } else {
531		    die "this file's encoding $e differs from others' $all_encoding - aborting\n";
532		}
533	    }
534	    }
535	    else { die }
536	    t 'have encoding, call user';
537	    $enc_cb->($e, $do_file_number) if $enc_cb;
538	}
539	else {
540	    t 'not seen encoding before, call user';
541	    $enc_cb->($e, $do_file_number) if $enc_cb;
542	    $all_encoding = $e;
543	    $have_encoding = 1;
544	}
545    };
546
547    my $my_cred_cb = sub( $ ) {
548	my $c = shift;
549	$Data::Dumper::Sortkeys = 1; # ensure consistent order of dumped hash
550	if (defined $all_credits) {
551	    if (Dumper($all_credits) ne Dumper($c)) {
552		warn "different files have different credits, picking one arbitrarily\n";
553		# In fact, we pick the last file in the list since this is the
554		# first to be opened.
555		#
556	    }
557	}
558	else {
559	    $cred_cb->($c) if $cred_cb;
560	    $all_credits = $c;
561	}
562    };
563
564    my $my_ch_cb = sub( $ ) {
565	my $c = shift;
566	my $id = $c->{id};
567	$Data::Dumper::Sortkeys = 1; # ensure consistent order of dumped hash
568	if (defined $all_channels{$id} and Dumper($all_channels{$id}) ne Dumper($c)) {
569	    warn "differing channels with id $id, picking one arbitrarily\n";
570	}
571	else {
572	    $all_channels{$id} = $c;
573	    $ch_cb->($c, $do_file_number) if $ch_cb;
574	}
575    };
576
577    my $my_p_cb = sub( $ ) {
578	my $doing_file = $do_file_number;
579
580	$do_next_file->(); # if any
581
582	$do_file_number = $doing_file;
583	$p_cb->(@_, $do_file_number) if $p_cb;
584    };
585
586    $do_next_file = sub() {
587	while (@files) {
588	    # Last first.
589	    my $f = pop @files;
590
591	    $do_file_number = scalar @files;
592
593	    # In older versions of perl there were segmentation faults
594	    # when calling die() inside the parsing callbacks, so we
595	    # needed to override $SIG{__DIE__} here.  Assume that
596	    # newer perls don't have this issue.
597	    #
598
599	    my $t = new_doc_callback($my_enc_cb, $my_cred_cb, $my_ch_cb, $my_p_cb);
600	    $t->parsefile($f);
601	}
602    };
603
604    # Let's go.
605    $do_next_file->();
606}
607
608=pod
609
610=item write_data(data, options...)
611
612Takes a data structure and writes it as XML to standard output.  Any
613extra arguments are passed on to XML::WriterE<39>s constructor, for example
614
615    my $f = new IO::File '>out.xml'; die if not $f;
616    write_data($data, OUTPUT => $f);
617
618The encoding used for the output is given by the first element of the
619data.
620
621Normally, there will be a warning for any Perl data which is not
622understood and cannot be written as XMLTV, such as strange keys in
623hashes.  But as an exception, any hash key beginning with an
624underscore will be skipped over silently.  You can store 'internal use
625only' data this way.
626
627If a programme or channel hash contains a key beginning with 'debug',
628this key and its value will be written out as a comment inside the
629<programme> or <channel> element.  This lets you include small
630debugging messages in the XML output.
631
632=cut
633
634sub write_data( $;@ ) {
635    my $data = shift;
636    my $writer = new XMLTV::Writer(encoding => $data->[0], @_);
637    $writer->start($data->[1]);
638    $writer->write_channels($data->[2]);
639    $writer->write_programme($_) foreach @{$data->[3]};
640    $writer->end();
641}
642
643
644# Private.
645#
646# get_attrs()
647#
648# Given a node, return a hashref of its attributes.  Skips over
649# the 'x-whatever' attributes.
650#
651sub get_attrs( $ ) {
652    my $node = shift; die if not defined $node;
653    my %r = %{$node->atts()};
654    foreach (keys %r) {
655	if (/^x-/) {
656	    delete $r{$_};
657	}
658	else {
659	    tidy(\$r{$_});
660	}
661    }
662    return \%r;
663}
664
665
666# Private.
667#
668# get_text()
669#
670# Given a node containing only text, return that text (with whitespace
671# either side stripped).  If the node has no children (as in
672# <foo></foo> or <foo />), this is considered to be the empty string.
673#
674# Parameter: whether newlines are allowed (defaults to false)
675#
676sub get_text( $;$ ) {
677    my $node = shift;
678    my $allow_nl = shift; $allow_nl = 0 if not defined $allow_nl;
679    my @children = get_subelements($node);
680    if (@children == 0) {
681	return '';
682    }
683    elsif (@children == 1) {
684	my $v = $children[0]->pcdata();
685	t 'got pcdata: ' . d $v;
686	if (not defined $v) {
687	    my $name = get_name($node);
688	    warn "node $name expected to contain text has other stuff\n";
689	}
690	else {
691	    # Just hope that the encoding we got uses \n...
692	    if (not $allow_nl and $v =~ tr/\n//d) {
693		my $name = get_name($node);
694		warn "removing newlines from content of node $name\n";
695	    }
696	    tidy(\$v);
697	}
698	t 'returning: ' . d $v;
699	return $v;
700    }
701    elsif (@children > 1) {
702	my $name = get_name($node);
703	warn "node $name expected to contain text has more than one child\n";
704	return undef;
705    }
706    else { die }
707}
708
709# Private.  Clean up parsed text.  Takes ref to scalar.
710sub tidy( $ ) {
711    our $v; local *v = shift; die if not defined $v;
712    if ($XML::Twig::VERSION < 3.01 || $KEEP_ENCODING) {
713	# Old versions of XML::Twig had stupid behaviour with
714	# entities - and so do the new ones if KeepEncoding is on.
715	#
716	for ($v) {
717	    s/&gt;/>/g;
718	    s/&lt;/</g;
719	    s/&apos;/\'/g;
720	    s/&quot;/\"/g;
721	    s/&amp;/&/g;	# needs to be last
722	}
723    }
724    else {
725	t 'new XML::Twig, not KeepEncoding, entities already dealt with';
726    }
727
728    for ($v) {
729	s/^\s+//;
730	s/\s+$//;
731
732	# On Windows there seems to be an inconsistency between
733	# XML::Twig and XML::Writer.  The former returns text with
734	# \r\n line endings to the application, but the latter adds \r
735	# characters to text outputted.  So reading some text and
736	# writing it again accumulates an extra \r character.  We fix
737	# this by removing \r from the input here.
738	#
739	tr/\r//d;
740    }
741}
742
743# Private.
744#
745# get_subelements()
746#
747# Return a list of all subelements of a node.  Whitespace is
748# ignored; anything else that isn't a subelement is warned about.
749# Skips over elements with name 'x-whatever'.
750#
751sub get_subelements( $ ) {
752    grep { (my $tmp = get_name($_)) !~ /^x-/ } $_[0]->children();
753}
754
755# Private.
756#
757# get_name()
758#
759# Return the element name of a node.
760#
761sub get_name( $ ) { $_[0]->gi() }
762
763# Private.
764#
765# dump_node()
766#
767# Return some information about a node for debugging.
768#
769sub dump_node( $ ) {
770    my $n = shift;
771    # Doesn't seem to be easy way to get 'type' of node.
772    my $r = 'name: ' . get_name($n) . "\n";
773    for (trunc($n->text())) {
774	$r .= "value: $_\n" if defined and length;
775    }
776    return $r;
777}
778# Private.  Truncate a string to a reasonable length and add '...' if
779# necessary.
780#
781sub trunc {
782    local $_ = shift;
783    return undef if not defined;
784    if (length > 1000) {
785	return substr($_, 0, 1000) . '...';
786    }
787    return $_;
788}
789
790=pod
791
792=item best_name(languages, pairs [, comparator])
793
794The XMLTV format contains many places where human-readable text is
795given an optional 'lang' attribute, to allow mixed languages.  This is
796represented in Perl as a pair [ text, lang ], although the second
797element may be missing or undef if the language is unknown.  When
798several alernatives for an element (such as <title>) can be given, the
799representation is a list of [ text, lang ] pairs.  Given such a list,
800what is the best text to use?  It depends on the userE<39>s preferred
801language.
802
803This function takes a list of acceptable languages and a list of [string,
804language] pairs, and finds the best one to use.  This means first finding
805the appropriate language and then picking the 'best' string in that
806language.
807
808The best is normally defined as the first one found in a usable
809language, since the XMLTV format puts the most canonical versions
810first.  But you can pass in your own comparison function, for example
811if you want to choose the shortest piece of text that is in an
812acceptable language.
813
814The acceptable languages should be a reference to a list of language
815codes looking like 'ru', or like 'de_DE'.  The text pairs should be a
816reference to a list of pairs [ string, language ].  (As a special case
817if this list is empty or undef, that means no text is present, and the
818result is undef.)  The third argument if present should be a cmp-style
819function that compares two strings of text and returns 1 if the first
820argument is better, -1 if the second better, 0 if theyE<39>re equally
821good.
822
823Returns: [s, l] pair, where s is the best of the strings to use and l
824is its language.  This pair is 'live' - it is one of those from the
825list passed in.  So you can use C<best_name()> to find the best pair
826from a list and then modify the content of that pair.
827
828(This routine depends on the C<Lingua::Preferred> module being
829installed; if that module is missing then the first available
830language is always chosen.)
831
832Example:
833
834    my $langs = [ 'de', 'fr' ]; # German or French, please
835
836    # Say we found the following under $p->{title} for a programme $p.
837    my $pairs = [ [ 'La CitE des enfants perdus', 'fr' ],
838                  [ 'The City of Lost Children', 'en_US' ] ];
839
840    my $best = best_name($langs, $pairs);
841    print "chose title $best->[0]\n";
842
843=cut
844
845sub best_name( $$;$ ) {
846    my ($wanted_langs, $pairs, $compare) = @_;
847    t 'best_name() ENTRY';
848    t 'wanted langs: ' . d $wanted_langs;
849    t '[text,lang] pairs: ' . d $pairs;
850    t 'comparison fn: ' . d $compare;
851    return undef if not defined $pairs;
852    my @pairs = @$pairs;
853
854    my @avail_langs;
855    my (%seen_lang, $seen_undef);
856    # Collect the list of available languages.
857    foreach (map { $_->[1] } @pairs) {
858	if (defined) {
859	    next if $seen_lang{$_}++;
860	}
861	else {
862	    next if $seen_undef++;
863	}
864	push @avail_langs, $_;
865    }
866
867    my $pref_lang = which_lang($wanted_langs, \@avail_langs);
868
869    # Gather up [text, lang] pairs which have the desired language.
870    my @candidates;
871    foreach (@pairs) {
872	my ($text, $lang) = @$_;
873	next unless ((not defined $lang)
874		     or (defined $pref_lang and $lang eq $pref_lang));
875	push @candidates, $_;
876    }
877
878    return undef if not @candidates;
879
880    # If a comparison function was passed in, use it to compare the
881    # text strings from the candidate pairs.
882    #
883    @candidates = sort { $compare->($a->[0], $b->[0]) } @candidates
884      if defined $compare;
885
886    # Pick the first candidate.  This will be the one ordered first by
887    # the comparison function if given, otherwise the earliest in the
888    # original list.
889    #
890    return $candidates[0];
891}
892
893
894=item list_channel_keys(), list_programme_keys()
895
896Some users of this module may wish to enquire at runtime about which
897keys a programme or channel hash can contain.  The data in the hash
898comes from the attributes and subelements of the corresponding element
899in the XML.  The values of attributes are simply stored as strings,
900while subelements are processed with a handler which may return a
901complex data structure.  These subroutines returns a hash mapping key
902to handler name and multiplicity.  This lets you know what data types
903can be expected under each key.  For keys which come from attributes
904rather than subelements, the handler is set to 'scalar', just as for
905subelements which give a simple string.  See L<"DATA STRUCTURE"> for
906details on what the different handler names mean.
907
908It is not possible to find out which keys are mandatory and which
909optional, only a list of all those which might possibly be present.
910An example use of these routines is the L<tv_grep> program, which
911creates its allowed command line arguments from the names of programme
912subelements.
913
914=cut
915
916# Private.
917sub list_keys( $$ ) {
918    my %r;
919
920    # Attributes.
921    foreach (@{shift()}) {
922	my ($k, $mult) = @$_;
923	$r{$k} = [ 'scalar', $mult ];
924    }
925
926    # Subelements.
927    foreach (@{shift()}) {
928	my ($k, $h_name, $mult) = @$_;
929	$r{$k} = [ $h_name, $mult ];
930    }
931
932    return \%r;
933}
934# Public.
935sub list_channel_keys() {
936    list_keys(\@Channel_Attributes, \@Channel_Handlers);
937}
938sub list_programme_keys() {
939    list_keys(\@Programme_Attributes, \@Programme_Handlers);
940}
941
942=pod
943
944=item catfiles(w_args, filename...)
945
946Concatenate several listings files, writing the output to somewhere
947specified by C<w_args>.  Programmes are catenated together, channels
948are merged, for credits we just take the first and warn if the others
949differ.
950
951The first argument is a hash reference giving information to pass to
952C<XMLTV::Writer>E<39>s constructor.  But do not specify encoding, this
953will be taken from the input files.  C<catfiles()> will abort if the
954input files have different encodings, unless the 'UTF8'=1 argument
955is passed in.
956
957=cut
958
959sub catfiles( $@ ) {
960    my $w_args = shift;
961    my $w;
962    my $enc;	# encoding of current file
963    my @encs;	# encoding of all files being catenated
964    my %seen_ch;
965    my %seen_progs;
966
967    $DIE_ON_MULTIPLE_ENCODINGS = ( defined $w_args->{UTF8} ? 0 : 1 );
968    $Data::Dumper::Sortkeys = 1; # ensure consistent order of dumped hash
969
970    XMLTV::parsefiles_callback
971      (sub {
972	   $enc = shift;
973	   my $do_file = shift;
974	   $encs[$do_file] = (defined $enc ? $enc : 'unknown');
975	   t "file $do_file = $enc" if defined $enc;
976	   $w = new XMLTV::Writer(%$w_args, encoding => ( defined $w_args->{UTF8} ? 'UTF-8' : $enc ) )  if !defined $w;
977       },
978       sub { $w->start(shift) },
979       sub {
980	   my $c = shift;
981	   my $id = $c->{id};
982	   if (not defined $seen_ch{$id}) {
983
984	       my $do_file = shift;
985	       if (defined $w_args->{UTF8}) {
986	          if (uc($encs[$do_file]) ne 'UTF-8' && $encs[$do_file] ne 'unknown') {
987	             # recode the incoming channel name
988		     t 'recoding channel from '.$encs[$do_file].' to UTF-8';
989		     require XMLTV::Data::Recursive::Encode;
990		     $c = XMLTV::Data::Recursive::Encode->from_to($c, $encs[$do_file], 'UTF-8');
991	          }
992	       }
993
994	       $w->write_channel($c);
995	       $seen_ch{$id} = $c;
996	   }
997	   elsif (Dumper($seen_ch{$id}) eq Dumper($c)) {
998	       # They're identical, okay.
999	   }
1000	   else {
1001	       warn "channel $id may differ between two files, "
1002		 . "picking one arbitrarily\n";
1003	   }
1004       },
1005       sub {
1006	   my $p = shift;
1007	   my $do_file = shift;
1008	   if (defined $w_args->{UTF8}) {
1009	      if (uc($encs[$do_file]) ne 'UTF-8' && $encs[$do_file] ne 'unknown') {
1010	         # recode the incoming programme
1011		 t 'recoding prog from '.$encs[$do_file].' to UTF-8';
1012		 require XMLTV::Data::Recursive::Encode;
1013		 $p = XMLTV::Data::Recursive::Encode->from_to($p, $encs[$do_file], 'UTF-8');
1014	      }
1015	   }
1016	   if (! $seen_progs{ $p->{start} . "|" . $p->{title}[0][0] . "|" . $p->{channel} }++) {
1017	      $w->write_programme($p);
1018	   }
1019	   else {
1020	      # warn "duplicate programme detected, skipping\n"
1021	      # . "  " . $p->{start} . "|" . $p->{stop} . "|" . $p->{title}[0][0] . "|" . $p->{channel} . "\n";
1022	   }
1023       },
1024       @_);
1025    $w->end();
1026}
1027
1028=pod
1029
1030=item cat(data, ...)
1031
1032Concatenate (and merge) listings data.  Programmes are catenated
1033together, channels are merged, for credits we just take the first and
1034warn if the others differ (except that the 'date' of the result is the
1035latest date of all the inputs).
1036
1037Whereas C<catfiles()> reads and writes files, this function takes
1038already-parsed listings data and returns some more listings data.  It
1039is much more memory-hungry.
1040
1041=cut
1042
1043sub cat( @ ) { cat_aux(1, @_) }
1044
1045=pod
1046
1047=item cat_noprogrammes
1048
1049Like C<cat()> but ignores the programme data and just returns
1050encoding, credits and channels.  This is in case for scalability
1051reasons you want to handle programmes individually, but still
1052merge the smaller data.
1053
1054=cut
1055
1056sub cat_noprogrammes( @ ) { cat_aux(0, @_) }
1057
1058sub cat_aux( @ ) {
1059    my $all_encoding;
1060    my ($all_credits_nodate, $all_credits_date);
1061    my %all_channels;
1062    my @all_progs;
1063    my $do_progs = shift;
1064
1065    $Data::Dumper::Sortkeys = 1; # ensure consistent order of dumped hash
1066
1067    foreach (@_) {
1068	t 'doing arg: ' . d $_;
1069	my ($encoding, $credits, $channels, $progs) = @$_;
1070
1071	if (not defined $all_encoding) {
1072	    $all_encoding = $encoding;
1073	}
1074	elsif ($encoding ne $all_encoding) {
1075	    die "different files have different encodings, cannot continue\n";
1076	}
1077
1078	# If the credits are different between files there's not a lot
1079	# we can do to merge them.  Apart from 'date', that is.  There
1080	# we can say that the date of the concatenated listings is the
1081	# newest date from all the sources.
1082	#
1083 	my %credits_nodate = %$credits; # copy
1084 	my $d = delete $credits_nodate{date};
1085 	if (defined $d) {
1086	    # Need to 'require' rather than 'use' this because
1087	    # XMLTV.pm is loaded during the build process and
1088	    # XMLTV::Date isn't available then.  Urgh.
1089	    #
1090	    require XMLTV::Date;
1091 	    my $dp = XMLTV::Date::parse_date($d);
1092 	    for ($all_credits_date) {
1093 		if (not defined
1094		    or Date_Cmp(XMLTV::Date::parse_date($_), $dp) < 0) {
1095 		    $_ = $d;
1096 		}
1097 	    }
1098 	}
1099
1100 	# Now in uniqueness checks ignore the date.
1101	if (not defined $all_credits_nodate) {
1102	    $all_credits_nodate = \%credits_nodate;
1103	}
1104	elsif (Dumper(\%credits_nodate) ne Dumper($all_credits_nodate)) {
1105	    warn "different files have different credits, taking from first file\n";
1106	}
1107
1108	foreach (keys %$channels) {
1109	    if (not defined $all_channels{$_}) {
1110		$all_channels{$_} = $channels->{$_};
1111	    }
1112	    elsif (Dumper($all_channels{$_}) ne Dumper($channels->{$_})) {
1113		warn "channel $_ differs between two files, taking first appearance\n";
1114	    }
1115	}
1116
1117	push @all_progs, @$progs if $do_progs;
1118    }
1119
1120    $all_encoding = 'UTF-8' if not defined $all_encoding;
1121
1122    my %all_credits;
1123    %all_credits = %$all_credits_nodate
1124      if defined $all_credits_nodate;
1125    $all_credits{date} = $all_credits_date
1126      if defined $all_credits_date;
1127
1128    if ($do_progs) {
1129        @all_progs = reverse @all_progs;
1130        my %seen_progs;
1131        my @dupe_indexes = reverse(grep { $seen_progs{ $all_progs[$_]->{start} . "|" . $all_progs[$_]->{stop} . "|" . $all_progs[$_]->{title}[0][0] . "|" . $all_progs[$_]->{channel} }++ } 0..$#all_progs);
1132        foreach my $item (@dupe_indexes) {
1133            # warn "duplicate programme detected, skipping\n"
1134            #      . "  " . $all_progs[$item]->{start} . "|" . $all_progs[$item]->{stop} . "|" . $all_progs[$item]->{title}[0][0] . "|" . $all_progs[$item]->{channel} . "\n";
1135            splice (@all_progs,$item,1);
1136        }
1137        @all_progs = reverse @all_progs;
1138
1139	return [ $all_encoding, \%all_credits, \%all_channels, \@all_progs ];
1140    }
1141    else {
1142	return [ $all_encoding, \%all_credits, \%all_channels ];
1143    }
1144}
1145
1146
1147# For each subelement of programme, we define a subroutine to read it
1148# and one to write it.  The reader takes an node for a single
1149# subelement and returns its value as a Perl scalar (warning and
1150# returning undef if error).  The writer takes an XML::Writer, an
1151# element name and a scalar value and writes a subelement for that
1152# value.  Note that the element name is passed in to the writer just
1153# for symmetry, so that neither the writer or the reader have to know
1154# what their element is called.
1155#
1156=pod
1157
1158=back
1159
1160=head1 DATA STRUCTURE
1161
1162For completeness, we describe more precisely how channels and
1163programmes are represented in Perl.  Each element of the channels list
1164is a hashref corresponding to one <channel> element, and likewise for
1165programmes.  The possible keys of a channel (programme) hash are the
1166names of attributes or subelements of <channel> (<programme>).
1167
1168The values for attributes are not processed in any way; an attribute
1169C<fred="jim"> in the XML will become a hash element with key C<'fred'>,
1170value C<'jim'>.
1171
1172But for subelements, there is further processing needed to turn the
1173XML content of a subelement into Perl data.  What is done depends on
1174what type of data is stored under that subelement.  Also, if a certain
1175element can appear several times then the hash key for that element
1176points to a list of values rather than just one.
1177
1178The conversion of a subelementE<39>s content to and from Perl data is
1179done by a handler.  The most common handler is I<with-lang>, used for
1180human-readable text content plus an optional 'lang' attribute.  There
1181are other handlers for other data structures in the file format.
1182Often two subelements will share the same handler, since they hold the
1183same type of data.  The handlers defined are as follows; note that
1184many of them will silently strip leading and trailing whitespace in
1185element content.  Look at the DTD itself for an explanation of the
1186whole file format.
1187
1188Unless specified otherwise, it is not allowed for an element expected
1189to contain text to have empty content, nor for the text to contain
1190newline characters.
1191
1192=over
1193
1194=item I<credits>
1195
1196Turns a list of credits (for director, actor, writer, etc.) into a
1197hash mapping 'role' to a list of names.  The names in each role are
1198kept in the same order.
1199
1200=cut
1201
1202$Handlers{credits}->[0] = sub( $ ) {
1203    my $node = shift;
1204    my @roles = qw(director actor writer adapter producer composer
1205                   editor presenter commentator guest);
1206    my %known_role; ++$known_role{$_} foreach @roles;
1207    my %r;
1208    foreach (get_subelements($node)) {
1209	my $role = get_name($_);
1210	unless ($known_role{$role}++) {
1211		warn "unknown thing in credits: $role";
1212		next;
1213	}
1214	my %attrs = %{get_attrs($_)};
1215	my $character = $attrs{role} if exists $attrs{role};
1216	if (defined $character) {
1217		push @{$r{$role}}, [ get_text($_), $character ] ;
1218	} else {
1219		push @{$r{$role}}, get_text($_);
1220	}
1221    }
1222    return \%r;
1223};
1224
1225$Handlers{'credits'}->[1] = sub( $$$ ) {
1226    my ($w, $e, $v) = @_; die if not defined $v;
1227    my %h = %$v;
1228    return if not %h; # don't write empty element
1229    t 'writing credits: ' . d \%h;
1230    # TODO some 'do nothing' setting in XML::Writer to replace this
1231    # convention of passing undef.
1232    #
1233    $w->startTag($e) if $w;
1234    foreach ( qw[director actor writer adapter producer composer
1235                 editor presenter commentator guest] ) {
1236	next unless defined $h{$_};
1237	my @people = @{delete $h{$_}};
1238	foreach my $person (@people) {
1239	    	die if not defined $person;
1240		if (ref($person) eq 'ARRAY') {
1241			if ( defined @{$person}[1] && @{$person}[1] ne '' ) {
1242				$w->dataElement($_, @{$person}[0], 'role' => @{$person}[1] ) if $w;
1243			} else {
1244				$w->dataElement($_, @{$person}[0]) if $w;
1245			}
1246		} else {
1247			$w->dataElement($_, $person) if $w;
1248		}
1249        }
1250    }
1251    warn_unknown_keys($e, \%h);
1252    $w->endTag($e) if $w;
1253};
1254
1255=pod
1256
1257=item I<scalar>
1258
1259Reads and writes a simple string as the content of the XML element.
1260
1261=cut
1262
1263$Handlers{scalar}->[0] = sub( $ ) {
1264    my $node = shift;
1265    return get_text($node);
1266};
1267$Handlers{scalar}->[1] = sub( $$$ ) {
1268    my ($w, $e, $v) = @_;
1269    t 'scalar';
1270    $w->dataElement($e, $v) if $w;
1271};
1272
1273=pod
1274
1275=item I<length>
1276
1277Converts the content of a <length> element into a number of seconds
1278(so <length units="minutes">5</minutes> would be returned as 300).  On
1279writing out again tries to convert a number of seconds to a time in
1280minutes or hours if that would look better.
1281
1282=cut
1283
1284$Handlers{length}->[0] = sub( $ ) {
1285    my $node = shift; die if not defined $node;
1286    my %attrs = %{get_attrs($node)};
1287    my $d = get_text($node);
1288    if ($d =~ /^\s*$/) {
1289	warn "empty 'length' element";
1290	return undef;
1291    }
1292    if ($d !~ tr/0-9// or $d =~ tr/0-9//c) {
1293	warn "bad content of 'length' element: $d";
1294	return undef;
1295    }
1296    my $units = $attrs{units};
1297    if (not defined $units) {
1298	warn "missing 'units' attr in 'length' element";
1299	return undef;
1300    }
1301    # We want to return a length in seconds.
1302    if ($units eq 'seconds') {
1303	# Okay.
1304    }
1305    elsif ($units eq 'minutes') {
1306	$d *= 60;
1307    }
1308    elsif ($units eq 'hours') {
1309	$d *= 60 * 60;
1310    }
1311    else {
1312	warn "bad value of 'units': $units";
1313	return undef;
1314    }
1315    return $d;
1316};
1317$Handlers{length}->[1] = sub( $$$ ) {
1318    my ($w, $e, $v) = @_;
1319    t 'length';
1320    my $units;
1321    if ($v % 3600 == 0) {
1322	$units = 'hours';
1323	$v /= 3600;
1324    }
1325    elsif ($v % 60 == 0) {
1326	$units = 'minutes';
1327	$v /= 60;
1328    }
1329    else {
1330	$units = 'seconds';
1331    }
1332    $w->dataElement($e, $v, units => $units) if $w;
1333};
1334
1335=pod
1336
1337=item I<episode-num>
1338
1339The representation in Perl of XMLTVE<39>s odd episode numbers is as a
1340pair of [ content, system ].  As specified by the DTD, if the system is
1341not given in the file then 'onscreen' is assumed.  Whitespace in the
1342'xmltv_ns' system is unimportant, so on reading it is normalized to
1343a single space on either side of each dot.
1344
1345=cut
1346
1347$Handlers{'episode-num'}->[0] = sub( $ ) {
1348    my $node = shift; die if not defined $node;
1349    my %attrs = %{get_attrs($node)};
1350    my $system = $attrs{system};
1351    $system = 'onscreen' if not defined $system;
1352    my $content = get_text($node);
1353    if ($system eq 'xmltv_ns') {
1354	# Make it look nice.
1355	$content =~ s/\s+//g;
1356	$content =~ s/\./ . /g;
1357    }
1358    return [ $content, $system ];
1359};
1360$Handlers{'episode-num'}->[1] = sub( $$$ ) {
1361    my ($w, $e, $v) = @_;
1362    t 'episode number';
1363    if (not ref $v or ref $v ne 'ARRAY') {
1364	warn "not writing episode-num whose content is not an array";
1365	return;
1366    }
1367    my ($content, $system) = @$v;
1368    $system = 'onscreen' if not defined $system;
1369    $w->dataElement($e, $content, system => $system) if $w;
1370};
1371
1372=pod
1373
1374=item I<video>
1375
1376The <video> section is converted to a hash.  The <present> subelement
1377corresponds to the key 'present' of this hash, 'yes' and 'no' are
1378converted to Booleans.  The same applies to <colour>.  The content of
1379the <aspect> subelement is stored under the key 'aspect'.  These keys
1380can be missing in the hash just as the subelements can be missing in
1381the XML.
1382
1383=cut
1384
1385$Handlers{video}->[0] = sub ( $ ) {
1386    my $node = shift;
1387    my %r;
1388    foreach (get_subelements($node)) {
1389	my $name = get_name($_);
1390	my $value = get_text($_);
1391	if ($name eq 'present') {
1392	    warn "'present' seen twice" if defined $r{present};
1393	    $r{present} = decode_boolean($value);
1394	}
1395	elsif ($name eq 'colour') {
1396	    warn "'colour' seen twice" if defined $r{colour};
1397	    $r{colour} = decode_boolean($value);
1398	}
1399	elsif ($name eq 'aspect') {
1400	    warn "'aspect' seen twice" if defined $r{aspect};
1401	    $value =~ /^\d+:\d+$/ or warn "bad aspect ratio: $value";
1402	    $r{aspect} = $value;
1403	}
1404	elsif ($name eq 'quality') {
1405	    warn "'quality' seen twice" if defined $r{quality};
1406	    $r{quality} = $value;
1407	}
1408    }
1409    return \%r;
1410};
1411$Handlers{video}->[1] = sub( $$$ ) {
1412    my ($w, $e, $v) = @_;
1413    t "'video' element";
1414    my %h = %$v;
1415    return if not %h; # don't write empty element
1416    $w->startTag($e) if $w;
1417    if (defined (my $val = delete $h{present})) {
1418	$w->dataElement('present', encode_boolean($val)) if $w;
1419    }
1420    if (defined (my $val = delete $h{colour})) {
1421	$w->dataElement('colour', encode_boolean($val)) if $w;
1422    }
1423    if (defined (my $val = delete $h{aspect})) {
1424	$w->dataElement('aspect', $val) if $w;
1425    }
1426    if (defined (my $val = delete $h{quality})) {
1427	$w->dataElement('quality', $val) if $w;
1428    }
1429    warn_unknown_keys("zz $e", \%h);
1430    $w->endTag($e) if $w;
1431};
1432
1433=pod
1434
1435=item I<audio>
1436
1437This is similar to I<video>.  <present> is a Boolean value, while
1438the content of <stereo> is stored unchanged.
1439
1440=cut
1441
1442$Handlers{audio}->[0] = sub( $ ) {
1443    my $node = shift;
1444    my %r;
1445    foreach (get_subelements($node)) {
1446	my $name = get_name($_);
1447	my $value = get_text($_);
1448	if ($name eq 'present') {
1449	    warn "'present' seen twice" if defined $r{present};
1450	    $r{present} = decode_boolean($value);
1451	}
1452	elsif ($name eq 'stereo') {
1453	    warn "'stereo' seen twice" if defined $r{stereo};
1454	    if ($value eq '') {
1455		warn "empty 'stereo' element not permitted, should be <stereo>stereo</stereo>";
1456		$value = 'stereo';
1457	    }
1458	    warn "bad value for 'stereo': '$value'"
1459	      if ($value ne 'mono'
1460              and $value ne 'stereo'
1461              and $value ne 'bilingual'
1462              and $value ne 'surround'
1463              and $value ne 'dolby digital'
1464              and $value ne 'dolby');
1465	    $r{stereo} = $value;
1466	}
1467    }
1468    return \%r;
1469};
1470$Handlers{audio}->[1] = sub( $$$ ) {
1471    my ($w, $e, $v) = @_;
1472    my %h = %$v;
1473    return if not %h; # don't write empty element
1474    $w->startTag($e) if $w;
1475    if (defined (my $val = delete $h{present})) {
1476	$w->dataElement('present', encode_boolean($val)) if $w;
1477    }
1478    if (defined (my $val = delete $h{stereo})) {
1479	$w->dataElement('stereo', $val) if $w;
1480    }
1481    warn_unknown_keys($e, \%h);
1482    $w->endTag($e) if $w;
1483};
1484
1485=pod
1486
1487=item I<previously-shown>
1488
1489The 'start' and 'channel' attributes are converted to keys in a hash.
1490
1491=cut
1492
1493$Handlers{'previously-shown'}->[0] = sub( $ ) {
1494    my $node = shift; die if not defined $node;
1495    my %attrs = %{get_attrs($node)};
1496    my $r = {};
1497    foreach (qw(start channel)) {
1498	my $v = delete $attrs{$_};
1499	$r->{$_} = $v if defined $v;
1500    }
1501    foreach (keys %attrs) {
1502	warn "unknown attribute $_ in previously-shown";
1503    }
1504    return $r;
1505};
1506$Handlers{'previously-shown'}->[1] = sub( $$$ ) {
1507    my ($w, $e, $v) = @_;
1508    $w->emptyTag($e, %$v) if $w;
1509};
1510
1511=pod
1512
1513=item I<presence>
1514
1515The content of the element is ignored: it signfies something by its
1516very presence.  So the conversion from XML to Perl is a constant true
1517value whenever the element is found; the conversion from Perl to XML
1518is to write out the element if true, donE<39>t write anything if false.
1519
1520=cut
1521
1522$Handlers{presence}->[0] = sub( $ ) {
1523    my $node = shift;
1524    # The 'new' element is empty, it signifies newness by its very
1525    # presence.
1526    #
1527    return 1;
1528};
1529$Handlers{presence}->[1] = sub( $$$ ) {
1530    my ($w, $e, $v) = @_;
1531    if (not $v) {
1532	# Not new, so don't create an element.
1533    }
1534    else {
1535	$w->emptyTag($e) if $w;
1536    }
1537};
1538
1539=pod
1540
1541=item I<subtitles>
1542
1543The 'type' attribute and the 'language' subelement (both optional)
1544become keys in a hash.  But see I<language> for what to pass as the
1545value of that element.
1546
1547=cut
1548
1549$Handlers{subtitles}->[0] = sub( $ ) {
1550    my $node = shift; die if not defined $node;
1551    my %attrs = %{get_attrs($node)};
1552    my %r;
1553    $r{type} = $attrs{type} if defined $attrs{type};
1554    foreach (get_subelements($node)) {
1555	my $name = get_name($_);
1556	if ($name eq 'language') {
1557	    warn "'language' seen twice" if defined $r{language};
1558	    $r{language} = read_with_lang($_, 0, 0);
1559	}
1560	else {
1561	    warn "bad content of 'subtitles' element: $name";
1562	}
1563    }
1564    return \%r;
1565};
1566$Handlers{subtitles}->[1] = sub( $$$ ) {
1567    my ($w, $e, $v) = @_;
1568    t 'subtitles';
1569    my ($type, $language) = ($v->{type}, $v->{language});
1570    my %attrs; $attrs{type} = $type if defined $type;
1571    if (defined $language) {
1572	$w->startTag($e, %attrs) if $w;
1573	write_with_lang($w, 'language', $language, 0, 0);
1574	$w->endTag($e) if $w;
1575    }
1576    else {
1577	$w->emptyTag($e, %attrs) if $w;
1578    }
1579};
1580
1581=pod
1582
1583=item I<rating>
1584
1585The rating is represented as a tuple of [ rating, system, icons ].
1586The last element is itself a listref of structures returned by the
1587I<icon> handler.
1588
1589=cut
1590
1591$Handlers{rating}->[0] = sub( $ ) {
1592    my $node = shift; die if not defined $node;
1593    my %attrs = %{get_attrs($node)};
1594    my $system = delete $attrs{system} if exists $attrs{system};
1595    foreach (keys %attrs) {
1596	warn "unknown attribute in rating: $_";
1597    }
1598    my @children = get_subelements($node);
1599
1600    # First child node is value.
1601    my $value_node = shift @children;
1602    if (not defined $value_node) {
1603	warn "missing 'value' element inside rating";
1604	return undef;
1605    }
1606    if ((my $name = get_name($value_node)) ne 'value') {
1607	warn "expected 'value' node inside rating, got '$name'";
1608	return undef;
1609    }
1610
1611    my $rating = read_value($value_node);
1612
1613    # Remaining children are icons.
1614    my @icons = map { read_icon($_) } @children;
1615
1616    return [ $rating, $system, \@icons ];
1617};
1618$Handlers{rating}->[1] = sub( $$$ ) {
1619    my ($w, $e, $v) = @_;
1620    if (not ref $v or ref $v ne 'ARRAY') {
1621	warn "not writing rating whose content is not an array";
1622	return;
1623    }
1624    my ($rating, $system, $icons) = @$v;
1625    if (defined $system) {
1626	$w->startTag($e, system => $system) if $w;
1627    }
1628    else {
1629	$w->startTag($e) if $w;
1630    }
1631
1632    write_value($w, 'value', $rating) if $w;
1633    if ($w) { write_icon($w, 'icon', $_) foreach @$icons };
1634    $w->endTag($e) if $w;
1635};
1636
1637=pod
1638
1639=item I<star-rating>
1640
1641In XML this is a string 'X/Y' plus a list of icons.  In Perl represented
1642as a pair [ rating, icons ] similar to I<rating>.
1643
1644Multiple star ratings are now supported. For backward compatibility,
1645you may specify a single [rating,icon] or the preferred double array
1646[[rating,system,icon],[rating2,system2,icon2]] (like 'ratings')
1647
1648
1649=cut
1650
1651$Handlers{'star-rating'}->[0] = sub( $ ) {
1652    my $node = shift;
1653    my %attrs = %{get_attrs($node)};
1654    my $system = delete $attrs{system} if exists $attrs{system};
1655    my @children = get_subelements($node);
1656
1657    # First child node is value.
1658    my $value_node = shift @children;
1659    if (not defined $value_node) {
1660	warn "missing 'value' element inside star-rating";
1661	return undef;
1662    }
1663    if ((my $name = get_name($value_node)) ne 'value') {
1664	warn "expected 'value' node inside star-rating, got '$name'";
1665	return undef;
1666    }
1667    my $rating = read_value($value_node);
1668
1669    # Remaining children are icons.
1670    my @icons = map { read_icon($_) } @children;
1671
1672    return [ $rating, $system, \@icons ];
1673};
1674$Handlers{'star-rating'}->[1] = sub ( $$$ ) {
1675    my ($w, $e, $v) = @_;
1676#
1677# 10/31/2007 star-rating can now have multiple values (and system=)
1678# let's make it so old code still works!
1679#
1680    if (not ref $v or ref $v ne 'ARRAY') {
1681	   $v=[$v];
1682#	   warn "not writing star-rating whose content is not an array";
1683#	return;
1684    }
1685    my ($rating, $system, $icons) = @$v;
1686    if (defined $system) {
1687	$w->startTag($e, system => $system) if $w;
1688    }
1689    else {
1690	$w->startTag($e) if $w;
1691    }
1692    write_value($w, 'value', $rating) if $w;
1693    if ($w) { write_icon($w, 'icon', $_) foreach @$icons };
1694    $w->endTag($e) if $w;
1695};
1696
1697=pod
1698
1699=item I<icon>
1700
1701An icon in XMLTV files is like the <img> element in HTML.  It is
1702represented in Perl as a hashref with 'src' and optionally 'width'
1703and 'height' keys.
1704
1705=cut
1706
1707sub write_icon( $$$ ) {
1708    my ($w, $e, $v) = @_;
1709    croak "no 'src' attribute for icon\n" if not defined $v->{src};
1710    croak "bad width $v->{width} for icon\n"
1711      if defined $v->{width} and $v->{width} !~ /^\d+$/;
1712    croak "bad height $v->{height} for icon\n"
1713      if defined $v->{height} and $v->{height} !~ /^\d+$/;
1714
1715    foreach (keys %$v) {
1716	warn "unrecognized key in icon: $_\n"
1717	  if $_ ne 'src' and $_ ne 'width' and $_ ne 'height';
1718    }
1719
1720    $w->emptyTag($e, %$v);
1721}
1722sub read_icon( $ ) {
1723    my $node = shift; die if not defined $node;
1724    my %attrs = %{get_attrs($node)};
1725    warn "missing 'src' attribute in icon" if not defined $attrs{src};
1726    return \%attrs;
1727}
1728$Handlers{icon}->[0] = \&read_icon;
1729$Handlers{icon}->[1] = sub( $$$ ) {
1730    my ($w, $e, $v) = @_;
1731    write_icon($w, $e, $v) if $w;
1732};
1733
1734# To keep things tidy some elements that can have icons store their
1735# textual content inside a subelement called 'value'.  These two
1736# routines are a bit trivial but they're here for consistency.
1737#
1738sub read_value( $ ) {
1739    my $value_node = shift;
1740    my $v = get_text($value_node);
1741    if (not defined $v or $v eq '') {
1742	warn "no content of 'value' element";
1743	return undef;
1744    }
1745    return $v;
1746}
1747sub write_value( $$$ ) {
1748    my ($w, $e, $v) = @_;
1749    $w->dataElement($e, $v) if $w;
1750};
1751
1752
1753# Booleans in XMLTV files are 'yes' or 'no'.
1754sub decode_boolean( $ ) {
1755    my $value = shift;
1756    if ($value eq 'no') {
1757	return 0;
1758    }
1759    elsif ($value eq 'yes') {
1760	return 1;
1761    }
1762    else {
1763	warn "bad boolean: $value";
1764	return undef;
1765    }
1766}
1767sub encode_boolean( $ ) {
1768    my $v = shift;
1769    warn "expected a Perl boolean like 0 or 1, not '$v'\n"
1770      if $v and $v != 1;
1771    return $v ? 'yes' : 'no';
1772}
1773
1774
1775=pod
1776
1777=item I<with-lang>
1778
1779In XML something like title can be either <title>Foo</title>
1780or <title lang="en">Foo</title>.  In Perl these are stored as
1781[ 'Foo' ] and [ 'Foo', 'en' ].  For the former [ 'Foo', undef ]
1782would also be okay.
1783
1784This handler also has two modifiers which may be added to the name
1785after '/'.  I</e> means that empty text is allowed, and will be
1786returned as the empty tuple [], to mean that the element is present
1787but has no text.  When writing with I</e>, undef will also be
1788understood as present-but-empty.  You cannot however specify a
1789language if the text is empty.
1790
1791The modifier I</m> means that the text is allowed to span multiple
1792lines.
1793
1794So for example I<with-lang/em> is a handler for text with language,
1795where the text may be empty and may contain newlines.  Note that the
1796I<with-lang-or-empty> of earlier releases has been replaced by
1797I<with-lang/e>.
1798
1799=cut
1800
1801sub read_with_lang( $$$ ) {
1802    my ($node, $allow_empty, $allow_nl) = @_;
1803    die if not defined $node;
1804    my %attrs = %{get_attrs($node)};
1805    my $lang = $attrs{lang} if exists $attrs{lang};
1806    my $value = get_text($node, $allow_nl);
1807    if (not length $value) {
1808	if (not $allow_empty) {
1809	    warn 'empty string for with-lang value';
1810	    return undef;
1811	}
1812	warn 'empty string may not have language' if defined $lang;
1813	return [];
1814    }
1815    if (defined $lang) {
1816	return [ $value, $lang ];
1817    }
1818    else {
1819	return [ $value ];
1820    }
1821}
1822$Handlers{'with-lang'}->[0]    = sub( $ ) { read_with_lang($_[0], 0, 0) };
1823$Handlers{'with-lang/'}->[0]   = sub( $ ) { read_with_lang($_[0], 0, 0) };
1824$Handlers{'with-lang/e'}->[0]  = sub( $ ) { read_with_lang($_[0], 1, 0) };
1825$Handlers{'with-lang/m'}->[0]  = sub( $ ) { read_with_lang($_[0], 0, 1) };
1826$Handlers{'with-lang/em'}->[0] = sub( $ ) { read_with_lang($_[0], 1, 1) };
1827$Handlers{'with-lang/me'}->[0] = sub( $ ) { read_with_lang($_[0], 1, 1) };
1828
1829sub write_with_lang( $$$$$ ) {
1830    my ($w, $e, $v, $allow_empty, $allow_nl) = @_;
1831    if (not ref $v or ref $v ne 'ARRAY') {
1832	warn "not writing with-lang whose content is not an array";
1833	return;
1834    }
1835
1836    if (not @$v) {
1837	if (not $allow_empty) {
1838	    warn "not writing no content for $e";
1839	    return;
1840	}
1841	$v = [ '' ];
1842    }
1843
1844    my ($text, $lang) = @$v;
1845    t 'writing character data: ' . d $text;
1846    if (not defined $text) {
1847	warn "not writing undefined value for $e";
1848	return;
1849    }
1850
1851#
1852# strip whitespace silently.
1853# we used to use a warn, but later on the code catches this and drops the record
1854#
1855    my $old_text = $text;
1856    $text =~ s/^\s+//;
1857    $text =~ s/\s+$//;
1858
1859    if (not length $text) {
1860	if (not $allow_empty) {
1861	    warn "not writing empty content for $e";
1862	    return;
1863	}
1864	if (defined $lang) {
1865	    warn "not writing empty content with language for $e";
1866	    return;
1867	}
1868	$w->emptyTag($e) if $w;
1869	return;
1870    }
1871
1872    if (not $allow_nl and $text =~ tr/\n//) {
1873	warn "not writing text containing newlines for $e";
1874	return;
1875    }
1876
1877    if (defined $lang) {
1878	$w->dataElement($e, $text, lang => $lang) if $w;
1879    }
1880    else {
1881	$w->dataElement($e, $text) if $w;
1882    }
1883}
1884$Handlers{'with-lang'}->[1]    = sub( $$$ ) { write_with_lang($_[0], $_[1], $_[2], 0, 0) };
1885$Handlers{'with-lang/'}->[1]   = sub( $$$ ) { write_with_lang($_[0], $_[1], $_[2], 0, 0) };
1886$Handlers{'with-lang/e'}->[1]  = sub( $$$ ) { write_with_lang($_[0], $_[1], $_[2], 1, 0) };
1887$Handlers{'with-lang/m'}->[1]  = sub( $$$ ) { write_with_lang($_[0], $_[1], $_[2], 0, 1) };
1888$Handlers{'with-lang/em'}->[1] = sub( $$$ ) { write_with_lang($_[0], $_[1], $_[2], 1, 1) };
1889$Handlers{'with-lang/me'}->[1] = sub( $$$ ) { write_with_lang($_[0], $_[1], $_[2], 1, 1) };
1890
1891# Sanity check.
1892foreach (keys %Handlers) {
1893    my $v = $Handlers{$_};
1894    if (@$v != 2
1895        or ref($v->[0]) ne 'CODE'
1896        or ref($v->[1]) ne 'CODE') {
1897        die "bad handler pair for $_\n";
1898    }
1899}
1900
1901=pod
1902
1903=back
1904
1905Now, which handlers are used for which subelements (keys) of channels
1906and programmes?  And what is the multiplicity (should you expect a
1907single value or a list of values)?
1908
1909The following tables map subelements of <channel> and of <programme>
1910to the handlers used to read and write them.  Many elements have their
1911own handler with the same name, and most of the others use
1912I<with-lang>.  The third column specifies the multiplicity of the
1913element: B<*> (any number) will give a list of values in Perl, B<+>
1914(one or more) will give a nonempty list, B<?> (maybe one) will give a
1915scalar, and B<1> (exactly one) will give a scalar which is not undef.
1916
1917=head2 Handlers for <channel>
1918
1919@CHANNEL_HANDLERS
1920
1921=head2 Handlers for <programme>
1922
1923@PROGRAMME_HANDLERS
1924
1925At present, no parsing or validation on dates is done because dates
1926may be partially specified in XMLTV.  For example '2001' means that
1927the year is known but not the month, day or time of day.  Maybe in the
1928future dates will be automatically converted to and from
1929B<Date::Manip> objects.  For now they just use the I<scalar> handler.
1930Similar remarks apply to URLs.
1931
1932=cut
1933
1934# Private.
1935sub node_to_programme( $ ) {
1936    my $node = shift; die if not defined $node;
1937    my %programme;
1938
1939    # Attributes of programme element.
1940    %programme = %{get_attrs($node)};
1941    t 'attributes: ' . d \%programme;
1942
1943    # Check the required attributes are there.  As with most checking,
1944    # this isn't an alternative to using a validator but it does save
1945    # some headscratching during debugging.
1946    #
1947    foreach (qw(start channel)) {
1948	if (not defined $programme{$_}) {
1949	    warn "programme missing '$_' attribute\n";
1950	    return undef;
1951	}
1952    }
1953    my @known_attrs = map { $_->[0] } @Programme_Attributes;
1954    my %ka; ++$ka{$_} foreach @known_attrs;
1955    foreach (keys %programme) {
1956	unless ($ka{$_}) {
1957	    warn "deleting unknown attribute '$_'";
1958	    delete $programme{$_};
1959	}
1960    }
1961
1962    call_handlers_read($node, \@Programme_Handlers, \%programme);
1963    return \%programme;
1964}
1965
1966
1967# Private.
1968sub node_to_channel( $ ) {
1969    my $node = shift; die if not defined $node;
1970    my %channel;
1971    t 'node_to_channel() ENTRY';
1972
1973    %channel = %{get_attrs($node)};
1974    t 'attributes: ' . d \%channel;
1975    if (not defined $channel{id}) {
1976	warn "channel missing 'id' attribute\n";
1977    }
1978    foreach (keys %channel) {
1979	unless (/^_/ or $_ eq 'id') {
1980	    warn "deleting unknown attribute '$_'";
1981	    delete $channel{$_};
1982	}
1983    }
1984
1985    t '\@Channel_Handlers=' . d \@Channel_Handlers;
1986    call_handlers_read($node, \@Channel_Handlers, \%channel);
1987    return \%channel;
1988}
1989
1990
1991
1992# Private.
1993#
1994# call_handlers_read()
1995#
1996# Read the subelements of a node according to a list giving a
1997# handler subroutine for each subelement.
1998#
1999# Parameters:
2000#   node
2001#   Reference to list of handlers: tuples of
2002#     [element-name, handler-name, multiplicity]
2003#   Reference to hash for storing results
2004#
2005# Warns if errors, but attempts to contine.
2006#
2007sub call_handlers_read( $$$ ) {
2008    my ($node, $handlers, $r) = @_;
2009    t 'call_handlers_read() using handlers: ' . d $handlers;
2010
2011    die unless ref($r) eq 'HASH';
2012    our %r; local *r = $r;
2013    t 'going through each child of node';
2014
2015    # Current position in handlers.  We expect to read the subelements
2016    # in the correct order as specified by the DTD.
2017    #
2018    my $handler_pos = 0;
2019
2020    SUBELEMENT: foreach (get_subelements($node)) {
2021	t 'doing subelement';
2022	my $name = get_name($_);
2023	t "tag name: $name";
2024
2025	# Search for a handler - from $handler_pos onwards.  But
2026	# first, just warn if somebody is trying to use an element in
2027	# the wrong place (trying to go backwards in the list).
2028	#
2029	my $found_pos;
2030	foreach my $i (0 .. $handler_pos - 1) {
2031	    if ($name eq $handlers->[$i]->[0]) {
2032		warn "element $name not expected here";
2033		next SUBELEMENT;
2034	    }
2035	}
2036	for (my $i = $handler_pos; $i < @$handlers; $i++) {
2037	    if ($handlers->[$i]->[0] eq $name) {
2038		t 'found handler';
2039		$found_pos = $i;
2040		last;
2041	    }
2042	    else {
2043		t "doesn't match name $handlers->[$i]->[0]";
2044		my ($handler_name, $h, $multiplicity)
2045		  = @{$handlers->[$i]};
2046		die if not defined $handler_name;
2047		die if $handler_name eq '';
2048
2049		# Before we skip over this element, check that we got
2050		# the necessary values for it.
2051		#
2052		if ($multiplicity eq '?') {
2053		    # Don't need to check whether this set.
2054		}
2055		elsif ($multiplicity eq '1') {
2056		    if (not defined $r{$handler_name}) {
2057			warn "no element $handler_name found";
2058		    }
2059		}
2060		elsif ($multiplicity eq '*') {
2061		    # It's okay if nothing was ever set.  We don't
2062		    # insist on putting in an empty list.
2063		    #
2064		}
2065		elsif ($multiplicity eq '+') {
2066		    if (not defined $r{$handler_name}) {
2067			warn "no element $handler_name found";
2068		    }
2069		    elsif (not @{$r{$handler_name}}) {
2070			warn "strangely, empty list for $handler_name";
2071		    }
2072		}
2073		else {
2074		    warn "bad value of $multiplicity: $!";
2075		}
2076	    }
2077	}
2078	if (not defined $found_pos) {
2079	    warn "unknown element $name";
2080	    next;
2081	}
2082	# Next time we begin searching from this position.
2083	$handler_pos = $found_pos;
2084
2085	# Call the handler.
2086	t 'calling handler';
2087	my ($handler_name, $h_name, $multiplicity)
2088	  = @{$handlers->[$found_pos]};
2089	die if $handler_name ne $name;
2090	my $h = $Handlers{$h_name}; die "no handler $h_name" if not $h;
2091	my $result = $h->[0]->($_); # call reader sub
2092	t 'result: ' . d $result;
2093	warn("skipping bad $name\n"), next if not defined $result;
2094
2095	# Now set the value.  We can't do multiplicity checking yet
2096	# because there might be more elements of this type still to
2097	# come.
2098	#
2099	if ($multiplicity eq '?' or $multiplicity eq '1') {
2100	    warn "seen $name twice"
2101	      if defined $r{$name};
2102	    $r{$name} = $result;
2103	}
2104	elsif ($multiplicity eq '*' or $multiplicity eq '+') {
2105	    push @{$r{$name}}, $result;
2106	}
2107	else {
2108	    warn "bad multiplicity: $multiplicity";
2109	}
2110    }
2111}
2112
2113sub warn_unknown_keys( $$ ) {
2114    my $elem_name = shift;
2115    our %k; local *k = shift;
2116    foreach (keys %k) {
2117        /^_/
2118          or $warned_unknown_key{$elem_name}->{$_}++
2119          or warn "unknown key $_ in $elem_name hash\n";
2120    }
2121}
2122
2123package XMLTV::Writer;
2124use base 'XML::Writer';
2125use Carp;
2126
2127use Date::Manip qw/UnixDate DateCalc/;
2128
2129# Use Log::TraceMessages if installed.
2130BEGIN {
2131    eval { require Log::TraceMessages };
2132    if ($@) {
2133	*t = sub {};
2134	*d = sub { '' };
2135    }
2136    else {
2137	*t = \&Log::TraceMessages::t;
2138	*d = \&Log::TraceMessages::d;
2139    }
2140}
2141
2142BEGIN {
2143    if (int(Date::Manip::DateManipVersion) >= 6) {
2144	Date::Manip::Date_Init("SetDate=now,UTC");
2145    } else {
2146	Date::Manip::Date_Init("TZ=UTC");
2147    }
2148}
2149
2150# Override dataElement() to refuse writing empty or whitespace
2151# elements.
2152#
2153sub dataElement( $$$@ ) {
2154    my ($self, $elem, $content, @rest) = @_;
2155    if ($content !~ /\S/) {
2156        warn "not writing empty content for $elem";
2157        return;
2158    }
2159    return $self->SUPER::dataElement($elem, $content, @rest);
2160}
2161
2162=pod
2163
2164=head1 WRITING
2165
2166When reading a file you have the choice of using C<parse()> to gulp
2167the whole file and return a data structure, or using
2168C<parse_callback()> to get the programmes one at a time, although
2169channels and other data are still read all at once.
2170
2171There is a similar choice when writing data: the C<write_data()>
2172routine prints a whole XMLTV document at once, but if you want to
2173write an XMLTV document incrementally you can manually create an
2174C<XMLTV::Writer> object and call methods on it.  Synopsis:
2175
2176  use XMLTV;
2177  my $w = new XMLTV::Writer();
2178  $w->comment("Hello from XML::Writer's comment() method");
2179  $w->start({ 'generator-info-name' => 'Example code in pod' });
2180  my %ch = (id => 'test-channel', 'display-name' => [ [ 'Test', 'en' ] ]);
2181  $w->write_channel(\%ch);
2182  my %prog = (channel => 'test-channel', start => '200203161500',
2183	      title => [ [ 'News', 'en' ] ]);
2184  $w->write_programme(\%prog);
2185  $w->end();
2186
2187XMLTV::Writer inherits from XML::Writer, and provides the following extra
2188or overridden methods:
2189
2190=over
2191
2192=item new(), the constructor
2193
2194Creates an XMLTV::Writer object and starts writing an XMLTV file, printing
2195the DOCTYPE line.  Arguments are passed on to XML::WriterE<39>s constructor,
2196except for the following:
2197
2198the 'encoding' key if present gives the XML character encoding.
2199For example:
2200
2201  my $w = new XMLTV::Writer(encoding => 'ISO-8859-1');
2202
2203If encoding is not specified, XML::WriterE<39>s default is used
2204(currently UTF-8).
2205
2206XMLTW::Writer can also filter out specific days from the data. This is
2207useful if the datasource provides data for periods of time that does not
2208match the days that the user has asked for. The filtering is controlled
2209with the days, offset and cutoff arguments:
2210
2211  my $w = new XMLTV::Writer(
2212      offset => 1,
2213      days => 2,
2214      cutoff => "050000" );
2215
2216In this example, XMLTV::Writer will discard all entries that do not have
2217starttimes larger than or equal to 05:00 tomorrow and less than 05:00
2218two days after tomorrow. The time offset is stripped off the starttime before
2219the comparison is made.
2220
2221=cut
2222
2223sub new {
2224    my $proto = shift;
2225    my $class = ref($proto) || $proto;
2226    my %args = @_;
2227    croak 'OUTPUT requires a filehandle, not a filename or anything else'
2228      if exists $args{OUTPUT} and not ref $args{OUTPUT};
2229
2230    # force OUTPUT explicitly to standard output to avoid warnings about
2231    # undefined OUTPUT in XML::Writer where it tests against 'self'
2232    if (!exists $args{OUTPUT}) {
2233        $args{OUTPUT} = \*STDOUT;
2234    }
2235    my $encoding = delete $args{encoding};
2236    my $days = delete $args{days};
2237    my $offset = delete $args{offset};
2238    my $cutoff = delete $args{cutoff};
2239
2240    my $self = $class->SUPER::new(DATA_MODE => 1, DATA_INDENT => 2, %args);
2241    bless($self, $class);
2242
2243    if (defined $encoding) {
2244	$self->xmlDecl($encoding);
2245    }
2246    else {
2247	# XML::Writer puts in 'encoding="UTF-8"' even if you don't ask
2248	# for it.
2249	#
2250	warn "assuming default UTF-8 encoding for output\n";
2251	$self->xmlDecl();
2252    }
2253
2254#    $Log::TraceMessages::On = 1;
2255    $self->{mintime} = "19700101000000";
2256    $self->{maxtime} = "29991231235959";
2257
2258
2259    if (defined( $days ) and defined( $offset ) and defined( $cutoff )) {
2260      $self->{mintime} = UnixDate(
2261          DateCalc( "today", "+" . $offset . " days" ),
2262          "%Y%m%d") . $cutoff;
2263      t "using mintime $self->{mintime}";
2264
2265      $self->{maxtime} = UnixDate(
2266          DateCalc("today", "+" . ($offset+$days) . " days"),
2267          "%Y%m%d" ) . $cutoff;
2268      t "using maxtime $self->{maxtime}";
2269    }
2270    elsif (defined( $days ) or defined( $offset ) or defined($cutoff)) {
2271      croak 'You must specify days, offset and cutoff or none of them';
2272    }
2273
2274    {
2275	local $^W = 0; $self->doctype('tv', undef, 'xmltv.dtd');
2276    }
2277    $self->{xmltv_writer_state} = 'new';
2278    return $self;
2279}
2280
2281=pod
2282
2283=item start()
2284
2285Write the start of the <tv> element.  Parameter is a hashref which gives
2286the attributes of this element.
2287
2288=cut
2289
2290sub start {
2291    my $self = shift;
2292    die 'usage: XMLTV::Writer->start(hashref of attrs)' if @_ != 1;
2293    my $attrs = shift;
2294
2295    for ($self->{xmltv_writer_state}) {
2296	if ($_ eq 'new') {
2297	    # Okay.
2298	}
2299	elsif ($_ eq 'channels' or $_ eq 'programmes') {
2300	    croak 'cannot call start() more than once on XMLTV::Writer';
2301	}
2302	elsif ($_ eq 'end') {
2303	    croak 'cannot do anything with end()ed XMLTV::Writer';
2304	}
2305	else { die }
2306
2307	$_ = 'channels';
2308    }
2309    $self->startTag('tv', order_attrs(%{$attrs}));
2310}
2311
2312=pod
2313
2314=item write_channels()
2315
2316Write several channels at once.  Parameter is a reference to a hash
2317mapping channel id to channel details.  They will be written sorted
2318by id, which is reasonable since the order of channels in an XMLTV
2319file isnE<39>t significant.
2320
2321=cut
2322
2323sub write_channels {
2324    my ($self, $channels) = @_;
2325    t('write_channels(' . d($self) . ', ' . d($channels) . ') ENTRY');
2326    croak 'expected hashref of channels' if ref $channels ne 'HASH';
2327
2328    for ($self->{xmltv_writer_state}) {
2329	if ($_ eq 'new') {
2330	    croak 'must call start() on XMLTV::Writer first';
2331	}
2332	elsif ($_ eq 'channels') {
2333	    # Okay.
2334	}
2335	elsif ($_ eq 'programmes') {
2336	    croak 'cannot write channels after writing programmes';
2337	}
2338	elsif ($_ eq 'end') {
2339	    croak 'cannot do anything with end()ed XMLTV::Writer';
2340	}
2341	else { die }
2342    }
2343
2344    my @ids = sort keys %$channels;
2345    t 'sorted list of channel ids: ' . d \@ids;
2346    foreach (@ids) {
2347	t "writing channel with id $_";
2348	my $ch = $channels->{$_};
2349	$self->write_channel($ch);
2350    }
2351    t('write_channels() EXIT');
2352}
2353
2354=pod
2355
2356=item write_channel()
2357
2358Write a single channel.  You can call this routine if you want, but
2359most of the time C<write_channels()> is a better interface.
2360
2361=cut
2362
2363sub write_channel {
2364    my ($self, $ch) = @_;
2365    croak 'undef channel hash passed' if not defined $ch;
2366    croak "expected a hashref, got: $ch" if ref $ch ne 'HASH';
2367
2368    for ($self->{xmltv_writer_state}) {
2369	if ($_ eq 'new') {
2370	    croak 'must call start() on XMLTV::Writer first';
2371	}
2372	elsif ($_ eq 'channels') {
2373	    # Okay.
2374	}
2375	elsif ($_ eq 'programmes') {
2376	    croak 'cannot write channels after writing programmes';
2377	}
2378	elsif ($_ eq 'end') {
2379	    croak 'cannot do anything with end()ed XMLTV::Writer';
2380	}
2381	else { die }
2382    }
2383
2384    my %ch = %$ch; # make a copy
2385    my $id = delete $ch{id};
2386    die "no 'id' attribute in channel" if not defined $id;
2387    write_element_with_handlers($self, 'channel', { id => $id },
2388				\@XMLTV::Channel_Handlers, \%ch);
2389}
2390
2391=pod
2392
2393=item write_programme()
2394
2395Write details for a single programme as XML.
2396
2397=cut
2398
2399sub write_programme {
2400    my $self = shift;
2401    die 'usage: XMLTV::Writer->write_programme(programme hash)' if @_ != 1;
2402    my $ref = shift;
2403    croak 'write_programme() expects programme hashref'
2404      if ref $ref ne 'HASH';
2405    t('write_programme(' . d($self) . ', ' . d($ref) . ') ENTRY');
2406
2407    for ($self->{xmltv_writer_state}) {
2408	if ($_ eq 'new') {
2409	    croak 'must call start() on XMLTV::Writer first';
2410	}
2411	elsif ($_ eq 'channels') {
2412	    $_ = 'programmes';
2413	}
2414	elsif ($_ eq 'programmes') {
2415	    # Okay.
2416	}
2417	elsif ($_ eq 'end') {
2418	    croak 'cannot do anything with end()ed XMLTV::Writer';
2419	}
2420	else { die }
2421    }
2422
2423    # We make a copy of the programme hash and delete elements from it
2424    # as they are dealt with; then we can easily spot any unhandled
2425    # elements at the end.
2426    #
2427    my %p = %$ref;
2428
2429    # First deal with those hash keys that refer to metadata on when
2430    # the programme is broadcast.  After taking those out of the hash,
2431    # we can use the handlers to output individual details.
2432    #
2433    my %attrs;
2434    die if not @XMLTV::Programme_Attributes;
2435    foreach (@XMLTV::Programme_Attributes) {
2436	my ($name, $mult) = @$_;
2437	t "looking for key $name";
2438	my $val = delete $p{$name};
2439	if ($mult eq '?') {
2440	    # No need to check anything.
2441	}
2442	elsif ($mult eq '1') {
2443	    if (not defined $val) {
2444		warn "programme hash missing $name key, skipping";
2445		return;
2446	    }
2447	}
2448	else { die "bad multiplicity for attribute: $mult" }
2449	$attrs{$name} = $val if defined $val;
2450    }
2451
2452    # We use string comparisons without timeoffsets for comparing times.
2453    my( $start ) = split( /\s+/, $attrs{start} );
2454    if( $start lt $self->{mintime} or
2455        $start ge $self->{maxtime} ) {
2456      t "skipping programme with start $attrs{start}";
2457      return;
2458    }
2459
2460    t "beginning 'programme' element";
2461    write_element_with_handlers($self, 'programme', \%attrs,
2462				\@XMLTV::Programme_Handlers, \%p);
2463}
2464
2465=pod
2466
2467=item end()
2468
2469Say youE<39>ve finished writing programmes.  This ends the <tv> element
2470and the file.
2471
2472=cut
2473
2474sub end {
2475    my $self = shift;
2476
2477    for ($self->{xmltv_writer_state}) {
2478	if ($_ eq 'new') {
2479	    croak 'must call start() on XMLTV::Writer first';
2480	}
2481	elsif ($_ eq 'channels' or $_ eq 'programmes') {
2482	    $_ = 'end';
2483	}
2484	elsif ($_ eq 'end') {
2485	    croak 'cannot do anything with end()ed XMLTV::Writer';
2486	}
2487	else { die }
2488    }
2489
2490    $self->endTag('tv');
2491    $self->SUPER::end(@_);
2492}
2493
2494
2495# Private.
2496# order_attrs()
2497#
2498# In XML the order of attributes is not significant.  But to make
2499# things look nice we try to output them in the same order as given in
2500# the DTD.
2501#
2502# Takes a list of (key, value, key, value, ...) and returns one with
2503# keys in a nice-looking order.
2504#
2505sub order_attrs {
2506    die "expected even number of elements, from a hash"
2507      if @_ % 2;
2508    my @a = ((map { $_->[0] } (@XMLTV::Channel_Attributes,
2509			       @XMLTV::Programme_Attributes)),
2510	     qw(date source-info-url source-info-name source-data-url
2511		generator-info-name generator-info-url));
2512
2513    my @r;
2514    my %in = @_;
2515    foreach (@a) {
2516	if (exists $in{$_}) {
2517	    my $v = delete $in{$_};
2518	    push @r, $_, $v;
2519	}
2520    }
2521
2522    foreach (sort keys %in) {
2523	warn "unknown attribute $_" unless /^_/;
2524	push @r, $_, $in{$_};
2525    }
2526
2527    return @r;
2528}
2529
2530
2531# Private.
2532#
2533# Writes the elements of a hash to an XMLTV::Writer using a list of
2534# handlers.  Deletes keys (modifying the hash passed in) as they are
2535# written.
2536#
2537# Requires all mandatory keys be present in the hash - if you're not
2538# sure then use check_multiplicity() first.
2539#
2540# Returns true if the element was successfully written, or if any
2541# errors found don't look serious enough to cause bad XML.  If the
2542# XML::Writer object passed in is undef then nothing is written (since
2543# the write handlers are coded like that.)
2544#
2545sub call_handlers_write( $$$ ) {
2546    my ($self, $handlers, $input) = @_;
2547    t 'writing input hash: ' . d $input;
2548    die if not defined $input;
2549
2550    my $bad = 0;
2551    foreach (@$handlers) {
2552	my ($name, $h_name, $multiplicity) = @$_;
2553	my $h = $XMLTV::Handlers{$h_name}; die "no handler $h_name" if not $h;
2554	my $writer = $h->[1]; die if not defined $writer;
2555	t "doing handler for $name$multiplicity";
2556	local $SIG{__WARN__} = sub {
2557	    warn "$name element: $_[0]";
2558	    $bad = 1;
2559	};
2560	my $val = delete $input->{$name};
2561	t 'got value(s): ' . d $val;
2562	if ($multiplicity eq '1') {
2563	    $writer->($self, $name, $val);
2564	}
2565	elsif ($multiplicity eq '?') {
2566	    $writer->($self, $name, $val) if defined $val;
2567	}
2568	elsif ($multiplicity eq '*' or $multiplicity eq '+') {
2569	    croak "value for key $name should be an array ref"
2570	      if defined $val and ref $val ne 'ARRAY';
2571	    foreach (@{$val}) {
2572		t 'writing value: ' . d $_;
2573		$writer->($self, $name, $_);
2574		t 'finished writing multiple values';
2575	    }
2576	}
2577	else {
2578	    warn "bad multiplicity specifier: $multiplicity";
2579	}
2580    }
2581    t 'leftover keys: ' . d([ sort keys %$input ]);
2582    return not $bad;
2583}
2584
2585
2586# Private.
2587#
2588# Warns about missing keys that are supposed to be mandatory.  Returns
2589# true iff everything is okay.
2590#
2591sub check_multiplicity( $$ ) {
2592    my ($handlers, $input) = @_;
2593    foreach (@$handlers) {
2594	my ($name, $h_name, $multiplicity) = @$_;
2595	t "checking handler for $name: $h_name with multiplicity $multiplicity";
2596	if ($multiplicity eq '1') {
2597	    if (not defined $input->{$name}) {
2598		warn "hash missing value for $name";
2599		return 0;
2600	    }
2601	}
2602	elsif ($multiplicity eq '?') {
2603	    # Okay if not present.
2604	}
2605	elsif ($multiplicity eq '*') {
2606	    # Not present, or undef, is treated as empty list.
2607	}
2608	elsif ($multiplicity eq '+') {
2609	    t 'one or more, checking for a listref with no undef values';
2610	    my $val = $input->{$name};
2611	    if (not defined $val) {
2612		warn "hash missing value for $name (expected list)";
2613		return 0;
2614	    }
2615	    if (ref($val) ne 'ARRAY') {
2616		die "hash has bad contents for $name (expected list)";
2617		return 0;
2618	    }
2619
2620	    t 'all values: ' . d $val;
2621            my @new_val = grep { defined } @$val;
2622	    t 'values that are defined: ' . d \@new_val;
2623            if (@new_val != @$val) {
2624                warn "hash had some undef elements in list for $name, removed";
2625                @$val = @new_val;
2626            }
2627
2628	    if (not @$val) {
2629		warn "hash has empty list of $name properties (expected at least one)";
2630		return 0;
2631	    }
2632	}
2633	else {
2634	    warn "bad multiplicity specifier: $multiplicity";
2635	}
2636    }
2637    return 1;
2638}
2639
2640
2641# Private.
2642#
2643# Write a complete element with attributes, and subelements written
2644# using call_handlers_write().  The advantage over doing it by hand is
2645# that if some required keys are missing, nothing is written (rather
2646# than an incomplete and invalid element).
2647#
2648sub write_element_with_handlers( $$$$$ ) {
2649    my ($w, $name, $attrs, $handlers, $hash) = @_;
2650    if (not check_multiplicity($handlers, $hash)) {
2651        warn "keys missing in $name hash, not writing";
2652        return;
2653    }
2654
2655    # Special 'debug' keys written as comments inside the element.
2656    my %debug_keys;
2657    foreach (grep /^debug/, keys %$hash) {
2658	$debug_keys{$_} = delete $hash->{$_};
2659    }
2660
2661    # Call all the handlers with no writer object and make sure
2662    # they're happy.
2663    #
2664    if (not call_handlers_write(undef, $handlers, { %$hash })) {
2665	warn "bad data inside $name element, not writing\n";
2666	return;
2667    }
2668
2669    $w->startTag($name, order_attrs(%$attrs));
2670    foreach (sort keys %debug_keys) {
2671	my $val = $debug_keys{$_};
2672	$w->comment((defined $val) ? "$_: $val" : $_);
2673    }
2674    call_handlers_write($w, $handlers, $hash);
2675    XMLTV::warn_unknown_keys($name, $hash);
2676    $w->endTag($name);
2677}
2678
2679=pod
2680
2681=back
2682
2683=head1 AUTHOR
2684
2685Ed Avis, ed@membled.com
2686
2687=head1 SEE ALSO
2688
2689The file format is defined by the DTD xmltv.dtd, which is included in
2690the xmltv package along with this module.  It should be installed in
2691your systemE<39>s standard place for SGML and XML DTDs.
2692
2693The xmltv package has a web page at
2694<http://xmltv.org/> which carries
2695information about the file format and the various tools and apps which
2696are distributed with this module.
2697
2698=cut
2699
27001;
2701