1# -*- perl -*- 2# $Id: XMLTV.pm.in,v 1.175 2017/11/28 05:28:04 rmeden Exp $ 3package XMLTV; 4 5use strict; 6use base 'Exporter'; 7our @EXPORT = (); 8our @EXPORT_OK = qw(read_data parse parsefile parsefiles write_data 9 best_name list_channel_keys list_programme_keys); 10 11# For the time being the version of this library is tied to that of 12# the xmltv package as a whole. This number should be checked by the 13# mkdist tool. 14# 15our $VERSION = '0.5.70'; 16 17# Work around changing behaviour of XML::Twig. On some systems (like 18# mine) it always returns UTF-8 data unless KeepEncoding is specified. 19# However the encoding() method tells you the encoding of the original 20# document, not of the data you receive. To be sure of what you're 21# getting, it is easiest on such a system to not give KeepEncoding and 22# just use UTF-8. 23# 24# But on other systems (seemingly perl 5.8 and above), XML::Twig tries 25# to keep the original document's encoding in the strings returned. 26# You then have to call encoding() to find out what you're getting. 27# To make sure of this behaviour we set KeepEncoding to true on such a 28# system. 29# 30# Setting KeepEncoding true everywhere seems to do no harm, it's a 31# pity that we lose conversion to UTF-8 but at least it's the same 32# everywhere. So the library is distributed with this flag on. 33# 34my $KEEP_ENCODING = 1; 35 36# We need a way of telling parsefiles_callback() to optionally *not* die when presented with multiple encodings, 37# but without affecting any other packages which uses it (i.e. so a new sub param is out of the question) 38# - best I can think of for the minute is a global (yuk) 39my $DIE_ON_MULTIPLE_ENCODINGS = 1; 40 41my %warned_unknown_key; 42sub warn_unknown_keys( $$ ); 43 44=pod 45 46=head1 NAME 47 48XMLTV - Perl extension to read and write TV listings in XMLTV format 49 50=head1 SYNOPSIS 51 52 use XMLTV; 53 my $data = XMLTV::parsefile('tv.xml'); 54 my ($encoding, $credits, $ch, $progs) = @$data; 55 my $langs = [ 'en', 'fr' ]; 56 print 'source of listings is: ', $credits->{'source-info-name'}, "\n" 57 if defined $credits->{'source-info-name'}; 58 foreach (values %$ch) { 59 my ($text, $lang) = @{XMLTV::best_name($langs, $_->{'display-name'})}; 60 print "channel $_->{id} has name $text\n"; 61 print "...in language $lang\n" if defined $lang; 62 } 63 foreach (@$progs) { 64 print "programme on channel $_->{channel} at time $_->{start}\n"; 65 next if not defined $_->{desc}; 66 foreach (@{$_->{desc}}) { 67 my ($text, $lang) = @$_; 68 print "has description $text\n"; 69 print "...in language $lang\n" if defined $lang; 70 } 71 } 72 73The value of $data will be something a bit like: 74 75 [ 'UTF-8', 76 { 'source-info-name' => 'Ananova', 'generator-info-name' => 'XMLTV' }, 77 { 'radio-4.bbc.co.uk' => { 'display-name' => [ [ 'en', 'BBC Radio 4' ], 78 [ 'en', 'Radio 4' ], 79 [ undef, '4' ] ], 80 'id' => 'radio-4.bbc.co.uk' }, 81 ... }, 82 [ { start => '200111121800', title => [ [ 'Simpsons', 'en' ] ], 83 channel => 'radio-4.bbc.co.uk' }, 84 ... ] ] 85 86=head1 DESCRIPTION 87 88This module provides an interface to read and write files in XMLTV 89format (a TV listings format defined by xmltv.dtd). In general element 90names in the XML correspond to hash keys in the Perl data structure. 91You can think of this module as a bit like B<XML::Simple>, but 92specialized to the XMLTV file format. 93 94The Perl data structure corresponding to an XMLTV file has four 95elements. The first gives the character encoding used for text data, 96typically UTF-8 or ISO-8859-1. (The encoding value could also be 97undef meaning 'unknown', when the library canE<39>t work out what it 98is.) The second element gives the attributes of the root <tv> 99element, which give information about the source of the TV listings. 100The third element is a list of channels, each list element being a 101hash corresponding to one <channel> element. The fourth element is 102similarly a list of programmes. More details about the data structure 103are given later. The easiest way to find out what it looks like is to 104load some small XMLTV files and use B<Data::Dumper> to print out the 105resulting structure. 106 107=head1 USAGE 108 109=cut 110 111use XML::Twig; 112use XML::Writer 0.600; 113use Date::Manip; 114use Carp; 115use Data::Dumper; 116 117# Use Lingua::Preferred if available, else kludge a replacement. 118sub my_which_lang { return $_[1]->[0] } 119BEGIN { 120 eval { require Lingua::Preferred }; 121 *which_lang = $@ ? \&my_which_lang : \&Lingua::Preferred::which_lang; 122} 123 124# Use Log::TraceMessages if installed. 125BEGIN { 126 eval { require Log::TraceMessages }; 127 if ($@) { 128 *t = sub {}; 129 *d = sub { '' }; 130 } 131 else { 132 *t = \&Log::TraceMessages::t; 133 *d = \&Log::TraceMessages::d; 134 } 135} 136 137# Attributes and subelements of channel. Each subelement additionally 138# needs a handler defined. Multiplicity is given for both, but for 139# attributes the only allowable values are '1' and '?'. 140# 141# Ordering of attributes is not really important, but we keep the same 142# order as they are given in the DTD so that output looks nice. 143# 144# The ordering of the subelements list gives the order in which these 145# elements must appear in the DTD. In fact, these lists just 146# duplicate information in the DTD and add details of what handlers 147# to call. 148# 149our @Channel_Attributes = ([ 'id', '1' ]); 150our @Channel_Handlers = 151 ( 152 [ 'display-name', 'with-lang', '+' ], 153 [ 'icon', 'icon', '*' ], 154 [ 'url', 'scalar', '*' ], 155 ); 156 157# Same for <programme> elements. 158our @Programme_Attributes = 159 ( 160 [ 'start', '1' ], 161 [ 'stop', '?' ], 162 [ 'pdc-start', '?' ], 163 [ 'vps-start', '?' ], 164 [ 'showview', '?' ], 165 [ 'videoplus', '?' ], 166 [ 'channel', '1' ], 167 [ 'clumpidx', '?' ], 168 ); 169our @Programme_Handlers = 170 ( 171 [ 'title', 'with-lang', '+' ], 172 [ 'sub-title', 'with-lang', '*' ], 173 [ 'desc', 'with-lang/m', '*' ], 174 [ 'credits', 'credits', '?' ], 175 [ 'date', 'scalar', '?' ], 176 [ 'category', 'with-lang', '*' ], 177 [ 'keyword', 'with-lang', '*' ], 178 [ 'language', 'with-lang', '?' ], 179 [ 'orig-language', 'with-lang', '?' ], 180 [ 'length', 'length', '?' ], 181 [ 'icon', 'icon', '*' ], 182 [ 'url', 'scalar', '*' ], 183 [ 'country', 'with-lang', '*' ], 184 [ 'episode-num', 'episode-num', '*' ], 185 [ 'video', 'video', '?' ], 186 [ 'audio', 'audio', '?' ], 187 [ 'previously-shown', 'previously-shown', '?' ], 188 [ 'premiere', 'with-lang/em', '?' ], 189 [ 'last-chance', 'with-lang/em', '?' ], 190 [ 'new', 'presence', '?' ], 191 [ 'subtitles', 'subtitles', '*' ], 192 [ 'rating', 'rating', '*' ], 193 [ 'star-rating', 'star-rating', '*' ], 194 ); 195 196# And a hash mapping names like 'with-lang' to pairs of subs. The 197# first for reading, the second for writing. Note that the writers 198# alter the passed-in data as a side effect! (If the writing sub is 199# called with an undef XML::Writer then it writes nothing but still 200# warns for (most) bad data checks - and still alters the data.) 201# 202our %Handlers = (); 203 204# Undocumented interface for adding extensions to the XMLTV format: 205# first add an entry to @XMLTV::Channel_Handlers or 206# @XMLTV::Programme_Handlers with your new element's name, 'type' and 207# multiplicity. The 'type' should be a string you invent yourself. 208# Then $XMLTV::Handlers{'type'} should be a pair of subroutines, a 209# reader and a writer. (Unless you want to use one of the existing 210# types such as 'with-lang' or 'scalar'.) 211# 212# Note that elements and attributes beginning 'x-' are skipped over 213# _automatically_, so you can't parse them with this method. A better 214# way to add extensions is needed - doing this not encouraged but is 215# sometimes necessary. 216# 217 218# read_data() is a deprecated name for parsefile(). 219sub read_data( $ ) { # FIXME remove altogether 220 warn "XMLTV::read_data() deprecated, use XMLTV::parsefile() instead\n"; 221 &parsefile; 222} 223 224# Private. 225sub sanity( $ ) { 226 for (shift) { 227 croak 'no <tv> element found' if not /<tv/; 228 } 229} 230 231=over 232 233=item parse(document) 234 235Takes an XMLTV document (a string) and returns the Perl data 236structure. It is assumed that the document is valid XMLTV; if not 237the routine may die() with an error (although the current implementation 238just warns and continues for most small errors). 239 240The first element of the listref returned, the encoding, may vary 241according to the encoding of the input document, the versions of perl 242and C<XML::Parser> installed, the configuration of the XMLTV library 243and other factors including, but not limited to, the phase of the 244moon. With luck it should always be either the encoding of the input 245file or UTF-8. 246 247Attributes and elements in the XML file whose names begin with 'x-' 248are skipped silently. You can use these to include information which 249is not currently handled by the XMLTV format, or by this module. 250 251=cut 252 253sub parse( $ ) { 254 my $str = shift; 255 sanity($str); 256 # FIXME commonize with parsefiles() 257 my ($encoding, $credits); 258 my %channels; 259 my @programmes; 260 parse_callback($str, 261 sub { $encoding = shift }, 262 sub { $credits = shift }, 263 sub { for (shift) { $channels{$_->{id}} = $_ } }, 264 sub { push @programmes, shift }); 265 return [ $encoding, $credits, \%channels, \@programmes ]; 266} 267 268=pod 269 270=item parsefiles(filename...) 271 272Like C<parse()> but takes one or more filenames instead of a string 273document. The data returned is the merging of those file contents: 274the programmes will be concatenated in their original order, the 275channels just put together in arbitrary order (ordering of channels 276should not matter). 277 278It is necessary that each file have the same character encoding, if 279not, an exception is thrown. Ideally the credits information would 280also be the same between all the files, since there is no obvious way to 281merge it - but if the credits information differs from one file to the 282next, one file is picked arbitrarily to provide credits and a warning 283is printed. If two files give differing channel definitions for the 284same XMLTV channel id, then one is picked arbitrarily and a warning 285is printed. 286 287In the simple case, with just one file, you neednE<39>t worry 288about mismatching of encodings, credits or channels. 289 290The deprecated function C<parsefile()> is a wrapper allowing just one 291filename. 292 293=cut 294 295sub parsefiles( @ ) { 296 die 'one or more filenames required' if not @_; 297 my ($encoding, $credits); 298 my %channels; 299 my @programmes; 300 parsefiles_callback(sub { $encoding = shift }, 301 sub { $credits = shift }, 302 sub { for (shift) { $channels{$_->{id}} = $_ } }, 303 sub { push @programmes, shift }, 304 @_); 305 return [ $encoding, $credits, \%channels, \@programmes ]; 306} 307 308sub parsefile( $ ) { parsefiles(@_) } 309 310=pod 311 312=item parse_callback(document, encoding_callback, credits_callback, 313 channel_callback, programme_callback) 314 315An alternative interface. Whereas C<parse()> reads the whole document 316and then returns a finished data structure, with this routine you 317specify a subroutine to be called as each <channel> element is read 318and another for each <programme> element. 319 320The first argument is the document to parse. The remaining arguments 321are code references, one for each part of the document. 322 323The callback for encoding will be called once with a string giving the 324encoding. In present releases of this module, it is also possible for 325the value to be undefined meaning 'unknown', but itE<39>s hoped that 326future releases will always be able to figure out the encoding used. 327 328The callback for credits will be called once with a hash reference. 329For channels and programmes, the appropriate function will be called 330zero or more times depending on how many channels / programmes are 331found in the file. 332 333The four subroutines will be called in order, that is, the encoding 334and credits will be done before the channel handler is called and all 335the channels will be dealt with before the first programme handler is 336called. 337 338If any of the code references is undef, nothing is called for that part 339of the file. 340 341For backwards compatibility, if the value for 'encoding callback' is 342not a code reference but a scalar reference, then the encoding found 343will be stored in that scalar. Similarly if the 'credits callback' 344is a scalar reference, the scalar it points to will be set to point 345to the hash of credits. This style of interface is deprecated: new 346code should just use four callbacks. 347 348For example: 349 350 my $document = '<tv>...</tv>'; 351 352 my $encoding; 353 sub encoding_cb( $ ) { $encoding = shift } 354 355 my $credits; 356 sub credits_cb( $ ) { $credits = shift } 357 358 # The callback for each channel populates this hash. 359 my %channels; 360 sub channel_cb( $ ) { 361 my $c = shift; 362 $channels{$c->{id}} = $c; 363 } 364 365 # The callback for each programme. We know that channels are 366 # always read before programmes, so the %channels hash will be 367 # fully populated. 368 # 369 sub programme_cb( $ ) { 370 my $p = shift; 371 print "got programme: $p->{title}->[0]->[0]\n"; 372 my $c = $channels{$p->{channel}}; 373 print 'channel name is: ', $c->{'display-name'}->[0]->[0], "\n"; 374 } 375 376 # Let's go. 377 XMLTV::parse_callback($document, \&encoding_cb, \&credits_cb, 378 \&channel_cb, \&programme_cb); 379 380=cut 381 382# Private. 383sub new_doc_callback( $$$$ ) { 384 my ($enc_cb, $cred_cb, $ch_cb, $p_cb) = @_; 385 t 'creating new XML::Twig'; 386 t '\@Channel_Handlers=' . d \@Channel_Handlers; 387 t '\@Programme_Handlers=' . d \@Programme_Handlers; 388 new XML::Twig(StartTagHandlers => 389 { '/tv' => sub { 390 my ($t, $node) = @_; 391 my $enc; 392 if ($KEEP_ENCODING) { 393 t 'KeepEncoding on, get original encoding'; 394 $enc = $t->encoding(); 395 } 396 else { 397 t 'assuming UTF-8 encoding'; 398 $enc = 'UTF-8'; 399 } 400 401 if (defined $enc_cb) { 402 for (ref $enc_cb) { 403 if ($_ eq 'CODE') { 404 $enc_cb->($enc); 405 } 406 elsif ($_ eq 'SCALAR') { 407 $$enc_cb = $enc; 408 } 409 else { 410 die "callback should be code ref or scalar ref, or undef"; 411 } 412 } 413 } 414 415 if (defined $cred_cb) { 416 my $cred = get_attrs($node); 417 for (ref $cred_cb) { 418 if ($_ eq 'CODE') { 419 $cred_cb->($cred); 420 } 421 elsif ($_ eq 'SCALAR') { 422 $$cred_cb = $cred; 423 } 424 else { 425 die "callback should be code ref or scalar ref, or undef"; 426 } 427 } 428 } 429 # Most of the above code can be removed in the 430 # next release. 431 # 432 }, 433 }, 434 435 TwigHandlers => 436 { '/tv/channel' => sub { 437 my ($t, $node) = @_; 438 die if not defined $node; 439 my $c = node_to_channel($node); 440 $t->purge(); 441 if (not $c) { 442 warn "skipping bad channel element\n"; 443 } 444 else { 445 $ch_cb->($c); 446 } 447 }, 448 449 '/tv/programme' => sub { 450 my ($t, $node) = @_; 451 die if not defined $node; 452 my $p = node_to_programme($node); 453 $t->purge(); 454 if (not $p) { 455 warn "skipping bad programme element\n"; 456 } 457 else { 458 $p_cb->($p); 459 } 460 }, 461 }, 462 463 KeepEncoding => $KEEP_ENCODING, 464 ); 465} 466 467sub parse_callback( $$$$$ ) { 468 my ($str, $enc_cb, $cred_cb, $ch_cb, $p_cb) = @_; 469 sanity($str); 470 new_doc_callback($enc_cb, $cred_cb, $ch_cb, $p_cb)->parse($str); 471} 472 473=pod 474 475=item parsefiles_callback(encoding_callback, credits_callback, 476 channel_callback, programme_callback, 477 filenames...) 478 479As C<parse_callback()> but takes one or more filenames to open, 480merging their contents in the same manner as C<parsefiles()>. Note 481that the reading is still gradual - you get the channels and 482programmes one at a time, as they are read. 483 484Note that the same <channel> may be present in more than one file, so 485the channel callback will get called more than once. ItE<39>s your 486responsibility to weed out duplicate channel elements (since writing 487them out again requires that each have a unique id). 488 489For compatibility, there is an alias C<parsefile_callback()> which is 490the same but takes only a single filename, B<before> the callback 491arguments. This is deprecated. 492 493=cut 494 495sub parsefile_callback( $$$$$ ) { 496 my ($f, $enc_cb, $cred_cb, $ch_cb, $p_cb) = @_; 497 parsefiles_callback($enc_cb, $cred_cb, $ch_cb, $p_cb, $f); 498} 499 500sub parsefiles_callback( $$$$@ ) { 501 my ($enc_cb, $cred_cb, $ch_cb, $p_cb, @files) = @_; 502 die "one or more files required" if not @files; 503 my $all_encoding; my $have_encoding = 0; 504 my $all_credits; 505 my %all_channels; 506 507 my $do_next_file; # sub to parse file ( defined below) 508 my $do_file_number; # current file in @files array 509 510 my $my_enc_cb = sub( $ ) { 511 my $e = shift; 512 t 'encoding callback'; 513 if ($have_encoding) { 514 t 'seen encoding before, just check'; 515 my ($da, $de) = (defined $all_encoding, defined $e); 516 if (not $da and not $de) { 517 warn "two files both have unspecified character encodings, hope they're the same\n"; 518 } 519 elsif (not $da and $de) { 520 ##warn "encoding $e not being returned to caller\n"; 521 $all_encoding = $e; 522 } 523 elsif ($da and not $de) { 524 warn "input file with unspecified encoding, assuming same as others ($all_encoding)\n"; 525 } 526 elsif ($da and $de) { 527 if (uc($all_encoding) ne uc($e)) { 528 if ( defined $DIE_ON_MULTIPLE_ENCODINGS && !$DIE_ON_MULTIPLE_ENCODINGS ) { 529 warn "this file's encoding $e differs from others' $all_encoding \n"; 530 } else { 531 die "this file's encoding $e differs from others' $all_encoding - aborting\n"; 532 } 533 } 534 } 535 else { die } 536 t 'have encoding, call user'; 537 $enc_cb->($e, $do_file_number) if $enc_cb; 538 } 539 else { 540 t 'not seen encoding before, call user'; 541 $enc_cb->($e, $do_file_number) if $enc_cb; 542 $all_encoding = $e; 543 $have_encoding = 1; 544 } 545 }; 546 547 my $my_cred_cb = sub( $ ) { 548 my $c = shift; 549 $Data::Dumper::Sortkeys = 1; # ensure consistent order of dumped hash 550 if (defined $all_credits) { 551 if (Dumper($all_credits) ne Dumper($c)) { 552 warn "different files have different credits, picking one arbitrarily\n"; 553 # In fact, we pick the last file in the list since this is the 554 # first to be opened. 555 # 556 } 557 } 558 else { 559 $cred_cb->($c) if $cred_cb; 560 $all_credits = $c; 561 } 562 }; 563 564 my $my_ch_cb = sub( $ ) { 565 my $c = shift; 566 my $id = $c->{id}; 567 $Data::Dumper::Sortkeys = 1; # ensure consistent order of dumped hash 568 if (defined $all_channels{$id} and Dumper($all_channels{$id}) ne Dumper($c)) { 569 warn "differing channels with id $id, picking one arbitrarily\n"; 570 } 571 else { 572 $all_channels{$id} = $c; 573 $ch_cb->($c, $do_file_number) if $ch_cb; 574 } 575 }; 576 577 my $my_p_cb = sub( $ ) { 578 my $doing_file = $do_file_number; 579 580 $do_next_file->(); # if any 581 582 $do_file_number = $doing_file; 583 $p_cb->(@_, $do_file_number) if $p_cb; 584 }; 585 586 $do_next_file = sub() { 587 while (@files) { 588 # Last first. 589 my $f = pop @files; 590 591 $do_file_number = scalar @files; 592 593 # In older versions of perl there were segmentation faults 594 # when calling die() inside the parsing callbacks, so we 595 # needed to override $SIG{__DIE__} here. Assume that 596 # newer perls don't have this issue. 597 # 598 599 my $t = new_doc_callback($my_enc_cb, $my_cred_cb, $my_ch_cb, $my_p_cb); 600 $t->parsefile($f); 601 } 602 }; 603 604 # Let's go. 605 $do_next_file->(); 606} 607 608=pod 609 610=item write_data(data, options...) 611 612Takes a data structure and writes it as XML to standard output. Any 613extra arguments are passed on to XML::WriterE<39>s constructor, for example 614 615 my $f = new IO::File '>out.xml'; die if not $f; 616 write_data($data, OUTPUT => $f); 617 618The encoding used for the output is given by the first element of the 619data. 620 621Normally, there will be a warning for any Perl data which is not 622understood and cannot be written as XMLTV, such as strange keys in 623hashes. But as an exception, any hash key beginning with an 624underscore will be skipped over silently. You can store 'internal use 625only' data this way. 626 627If a programme or channel hash contains a key beginning with 'debug', 628this key and its value will be written out as a comment inside the 629<programme> or <channel> element. This lets you include small 630debugging messages in the XML output. 631 632=cut 633 634sub write_data( $;@ ) { 635 my $data = shift; 636 my $writer = new XMLTV::Writer(encoding => $data->[0], @_); 637 $writer->start($data->[1]); 638 $writer->write_channels($data->[2]); 639 $writer->write_programme($_) foreach @{$data->[3]}; 640 $writer->end(); 641} 642 643 644# Private. 645# 646# get_attrs() 647# 648# Given a node, return a hashref of its attributes. Skips over 649# the 'x-whatever' attributes. 650# 651sub get_attrs( $ ) { 652 my $node = shift; die if not defined $node; 653 my %r = %{$node->atts()}; 654 foreach (keys %r) { 655 if (/^x-/) { 656 delete $r{$_}; 657 } 658 else { 659 tidy(\$r{$_}); 660 } 661 } 662 return \%r; 663} 664 665 666# Private. 667# 668# get_text() 669# 670# Given a node containing only text, return that text (with whitespace 671# either side stripped). If the node has no children (as in 672# <foo></foo> or <foo />), this is considered to be the empty string. 673# 674# Parameter: whether newlines are allowed (defaults to false) 675# 676sub get_text( $;$ ) { 677 my $node = shift; 678 my $allow_nl = shift; $allow_nl = 0 if not defined $allow_nl; 679 my @children = get_subelements($node); 680 if (@children == 0) { 681 return ''; 682 } 683 elsif (@children == 1) { 684 my $v = $children[0]->pcdata(); 685 t 'got pcdata: ' . d $v; 686 if (not defined $v) { 687 my $name = get_name($node); 688 warn "node $name expected to contain text has other stuff\n"; 689 } 690 else { 691 # Just hope that the encoding we got uses \n... 692 if (not $allow_nl and $v =~ tr/\n//d) { 693 my $name = get_name($node); 694 warn "removing newlines from content of node $name\n"; 695 } 696 tidy(\$v); 697 } 698 t 'returning: ' . d $v; 699 return $v; 700 } 701 elsif (@children > 1) { 702 my $name = get_name($node); 703 warn "node $name expected to contain text has more than one child\n"; 704 return undef; 705 } 706 else { die } 707} 708 709# Private. Clean up parsed text. Takes ref to scalar. 710sub tidy( $ ) { 711 our $v; local *v = shift; die if not defined $v; 712 if ($XML::Twig::VERSION < 3.01 || $KEEP_ENCODING) { 713 # Old versions of XML::Twig had stupid behaviour with 714 # entities - and so do the new ones if KeepEncoding is on. 715 # 716 for ($v) { 717 s/>/>/g; 718 s/</</g; 719 s/'/\'/g; 720 s/"/\"/g; 721 s/&/&/g; # needs to be last 722 } 723 } 724 else { 725 t 'new XML::Twig, not KeepEncoding, entities already dealt with'; 726 } 727 728 for ($v) { 729 s/^\s+//; 730 s/\s+$//; 731 732 # On Windows there seems to be an inconsistency between 733 # XML::Twig and XML::Writer. The former returns text with 734 # \r\n line endings to the application, but the latter adds \r 735 # characters to text outputted. So reading some text and 736 # writing it again accumulates an extra \r character. We fix 737 # this by removing \r from the input here. 738 # 739 tr/\r//d; 740 } 741} 742 743# Private. 744# 745# get_subelements() 746# 747# Return a list of all subelements of a node. Whitespace is 748# ignored; anything else that isn't a subelement is warned about. 749# Skips over elements with name 'x-whatever'. 750# 751sub get_subelements( $ ) { 752 grep { (my $tmp = get_name($_)) !~ /^x-/ } $_[0]->children(); 753} 754 755# Private. 756# 757# get_name() 758# 759# Return the element name of a node. 760# 761sub get_name( $ ) { $_[0]->gi() } 762 763# Private. 764# 765# dump_node() 766# 767# Return some information about a node for debugging. 768# 769sub dump_node( $ ) { 770 my $n = shift; 771 # Doesn't seem to be easy way to get 'type' of node. 772 my $r = 'name: ' . get_name($n) . "\n"; 773 for (trunc($n->text())) { 774 $r .= "value: $_\n" if defined and length; 775 } 776 return $r; 777} 778# Private. Truncate a string to a reasonable length and add '...' if 779# necessary. 780# 781sub trunc { 782 local $_ = shift; 783 return undef if not defined; 784 if (length > 1000) { 785 return substr($_, 0, 1000) . '...'; 786 } 787 return $_; 788} 789 790=pod 791 792=item best_name(languages, pairs [, comparator]) 793 794The XMLTV format contains many places where human-readable text is 795given an optional 'lang' attribute, to allow mixed languages. This is 796represented in Perl as a pair [ text, lang ], although the second 797element may be missing or undef if the language is unknown. When 798several alernatives for an element (such as <title>) can be given, the 799representation is a list of [ text, lang ] pairs. Given such a list, 800what is the best text to use? It depends on the userE<39>s preferred 801language. 802 803This function takes a list of acceptable languages and a list of [string, 804language] pairs, and finds the best one to use. This means first finding 805the appropriate language and then picking the 'best' string in that 806language. 807 808The best is normally defined as the first one found in a usable 809language, since the XMLTV format puts the most canonical versions 810first. But you can pass in your own comparison function, for example 811if you want to choose the shortest piece of text that is in an 812acceptable language. 813 814The acceptable languages should be a reference to a list of language 815codes looking like 'ru', or like 'de_DE'. The text pairs should be a 816reference to a list of pairs [ string, language ]. (As a special case 817if this list is empty or undef, that means no text is present, and the 818result is undef.) The third argument if present should be a cmp-style 819function that compares two strings of text and returns 1 if the first 820argument is better, -1 if the second better, 0 if theyE<39>re equally 821good. 822 823Returns: [s, l] pair, where s is the best of the strings to use and l 824is its language. This pair is 'live' - it is one of those from the 825list passed in. So you can use C<best_name()> to find the best pair 826from a list and then modify the content of that pair. 827 828(This routine depends on the C<Lingua::Preferred> module being 829installed; if that module is missing then the first available 830language is always chosen.) 831 832Example: 833 834 my $langs = [ 'de', 'fr' ]; # German or French, please 835 836 # Say we found the following under $p->{title} for a programme $p. 837 my $pairs = [ [ 'La CitE des enfants perdus', 'fr' ], 838 [ 'The City of Lost Children', 'en_US' ] ]; 839 840 my $best = best_name($langs, $pairs); 841 print "chose title $best->[0]\n"; 842 843=cut 844 845sub best_name( $$;$ ) { 846 my ($wanted_langs, $pairs, $compare) = @_; 847 t 'best_name() ENTRY'; 848 t 'wanted langs: ' . d $wanted_langs; 849 t '[text,lang] pairs: ' . d $pairs; 850 t 'comparison fn: ' . d $compare; 851 return undef if not defined $pairs; 852 my @pairs = @$pairs; 853 854 my @avail_langs; 855 my (%seen_lang, $seen_undef); 856 # Collect the list of available languages. 857 foreach (map { $_->[1] } @pairs) { 858 if (defined) { 859 next if $seen_lang{$_}++; 860 } 861 else { 862 next if $seen_undef++; 863 } 864 push @avail_langs, $_; 865 } 866 867 my $pref_lang = which_lang($wanted_langs, \@avail_langs); 868 869 # Gather up [text, lang] pairs which have the desired language. 870 my @candidates; 871 foreach (@pairs) { 872 my ($text, $lang) = @$_; 873 next unless ((not defined $lang) 874 or (defined $pref_lang and $lang eq $pref_lang)); 875 push @candidates, $_; 876 } 877 878 return undef if not @candidates; 879 880 # If a comparison function was passed in, use it to compare the 881 # text strings from the candidate pairs. 882 # 883 @candidates = sort { $compare->($a->[0], $b->[0]) } @candidates 884 if defined $compare; 885 886 # Pick the first candidate. This will be the one ordered first by 887 # the comparison function if given, otherwise the earliest in the 888 # original list. 889 # 890 return $candidates[0]; 891} 892 893 894=item list_channel_keys(), list_programme_keys() 895 896Some users of this module may wish to enquire at runtime about which 897keys a programme or channel hash can contain. The data in the hash 898comes from the attributes and subelements of the corresponding element 899in the XML. The values of attributes are simply stored as strings, 900while subelements are processed with a handler which may return a 901complex data structure. These subroutines returns a hash mapping key 902to handler name and multiplicity. This lets you know what data types 903can be expected under each key. For keys which come from attributes 904rather than subelements, the handler is set to 'scalar', just as for 905subelements which give a simple string. See L<"DATA STRUCTURE"> for 906details on what the different handler names mean. 907 908It is not possible to find out which keys are mandatory and which 909optional, only a list of all those which might possibly be present. 910An example use of these routines is the L<tv_grep> program, which 911creates its allowed command line arguments from the names of programme 912subelements. 913 914=cut 915 916# Private. 917sub list_keys( $$ ) { 918 my %r; 919 920 # Attributes. 921 foreach (@{shift()}) { 922 my ($k, $mult) = @$_; 923 $r{$k} = [ 'scalar', $mult ]; 924 } 925 926 # Subelements. 927 foreach (@{shift()}) { 928 my ($k, $h_name, $mult) = @$_; 929 $r{$k} = [ $h_name, $mult ]; 930 } 931 932 return \%r; 933} 934# Public. 935sub list_channel_keys() { 936 list_keys(\@Channel_Attributes, \@Channel_Handlers); 937} 938sub list_programme_keys() { 939 list_keys(\@Programme_Attributes, \@Programme_Handlers); 940} 941 942=pod 943 944=item catfiles(w_args, filename...) 945 946Concatenate several listings files, writing the output to somewhere 947specified by C<w_args>. Programmes are catenated together, channels 948are merged, for credits we just take the first and warn if the others 949differ. 950 951The first argument is a hash reference giving information to pass to 952C<XMLTV::Writer>E<39>s constructor. But do not specify encoding, this 953will be taken from the input files. C<catfiles()> will abort if the 954input files have different encodings, unless the 'UTF8'=1 argument 955is passed in. 956 957=cut 958 959sub catfiles( $@ ) { 960 my $w_args = shift; 961 my $w; 962 my $enc; # encoding of current file 963 my @encs; # encoding of all files being catenated 964 my %seen_ch; 965 my %seen_progs; 966 967 $DIE_ON_MULTIPLE_ENCODINGS = ( defined $w_args->{UTF8} ? 0 : 1 ); 968 $Data::Dumper::Sortkeys = 1; # ensure consistent order of dumped hash 969 970 XMLTV::parsefiles_callback 971 (sub { 972 $enc = shift; 973 my $do_file = shift; 974 $encs[$do_file] = (defined $enc ? $enc : 'unknown'); 975 t "file $do_file = $enc" if defined $enc; 976 $w = new XMLTV::Writer(%$w_args, encoding => ( defined $w_args->{UTF8} ? 'UTF-8' : $enc ) ) if !defined $w; 977 }, 978 sub { $w->start(shift) }, 979 sub { 980 my $c = shift; 981 my $id = $c->{id}; 982 if (not defined $seen_ch{$id}) { 983 984 my $do_file = shift; 985 if (defined $w_args->{UTF8}) { 986 if (uc($encs[$do_file]) ne 'UTF-8' && $encs[$do_file] ne 'unknown') { 987 # recode the incoming channel name 988 t 'recoding channel from '.$encs[$do_file].' to UTF-8'; 989 require XMLTV::Data::Recursive::Encode; 990 $c = XMLTV::Data::Recursive::Encode->from_to($c, $encs[$do_file], 'UTF-8'); 991 } 992 } 993 994 $w->write_channel($c); 995 $seen_ch{$id} = $c; 996 } 997 elsif (Dumper($seen_ch{$id}) eq Dumper($c)) { 998 # They're identical, okay. 999 } 1000 else { 1001 warn "channel $id may differ between two files, " 1002 . "picking one arbitrarily\n"; 1003 } 1004 }, 1005 sub { 1006 my $p = shift; 1007 my $do_file = shift; 1008 if (defined $w_args->{UTF8}) { 1009 if (uc($encs[$do_file]) ne 'UTF-8' && $encs[$do_file] ne 'unknown') { 1010 # recode the incoming programme 1011 t 'recoding prog from '.$encs[$do_file].' to UTF-8'; 1012 require XMLTV::Data::Recursive::Encode; 1013 $p = XMLTV::Data::Recursive::Encode->from_to($p, $encs[$do_file], 'UTF-8'); 1014 } 1015 } 1016 if (! $seen_progs{ $p->{start} . "|" . $p->{title}[0][0] . "|" . $p->{channel} }++) { 1017 $w->write_programme($p); 1018 } 1019 else { 1020 # warn "duplicate programme detected, skipping\n" 1021 # . " " . $p->{start} . "|" . $p->{stop} . "|" . $p->{title}[0][0] . "|" . $p->{channel} . "\n"; 1022 } 1023 }, 1024 @_); 1025 $w->end(); 1026} 1027 1028=pod 1029 1030=item cat(data, ...) 1031 1032Concatenate (and merge) listings data. Programmes are catenated 1033together, channels are merged, for credits we just take the first and 1034warn if the others differ (except that the 'date' of the result is the 1035latest date of all the inputs). 1036 1037Whereas C<catfiles()> reads and writes files, this function takes 1038already-parsed listings data and returns some more listings data. It 1039is much more memory-hungry. 1040 1041=cut 1042 1043sub cat( @ ) { cat_aux(1, @_) } 1044 1045=pod 1046 1047=item cat_noprogrammes 1048 1049Like C<cat()> but ignores the programme data and just returns 1050encoding, credits and channels. This is in case for scalability 1051reasons you want to handle programmes individually, but still 1052merge the smaller data. 1053 1054=cut 1055 1056sub cat_noprogrammes( @ ) { cat_aux(0, @_) } 1057 1058sub cat_aux( @ ) { 1059 my $all_encoding; 1060 my ($all_credits_nodate, $all_credits_date); 1061 my %all_channels; 1062 my @all_progs; 1063 my $do_progs = shift; 1064 1065 $Data::Dumper::Sortkeys = 1; # ensure consistent order of dumped hash 1066 1067 foreach (@_) { 1068 t 'doing arg: ' . d $_; 1069 my ($encoding, $credits, $channels, $progs) = @$_; 1070 1071 if (not defined $all_encoding) { 1072 $all_encoding = $encoding; 1073 } 1074 elsif ($encoding ne $all_encoding) { 1075 die "different files have different encodings, cannot continue\n"; 1076 } 1077 1078 # If the credits are different between files there's not a lot 1079 # we can do to merge them. Apart from 'date', that is. There 1080 # we can say that the date of the concatenated listings is the 1081 # newest date from all the sources. 1082 # 1083 my %credits_nodate = %$credits; # copy 1084 my $d = delete $credits_nodate{date}; 1085 if (defined $d) { 1086 # Need to 'require' rather than 'use' this because 1087 # XMLTV.pm is loaded during the build process and 1088 # XMLTV::Date isn't available then. Urgh. 1089 # 1090 require XMLTV::Date; 1091 my $dp = XMLTV::Date::parse_date($d); 1092 for ($all_credits_date) { 1093 if (not defined 1094 or Date_Cmp(XMLTV::Date::parse_date($_), $dp) < 0) { 1095 $_ = $d; 1096 } 1097 } 1098 } 1099 1100 # Now in uniqueness checks ignore the date. 1101 if (not defined $all_credits_nodate) { 1102 $all_credits_nodate = \%credits_nodate; 1103 } 1104 elsif (Dumper(\%credits_nodate) ne Dumper($all_credits_nodate)) { 1105 warn "different files have different credits, taking from first file\n"; 1106 } 1107 1108 foreach (keys %$channels) { 1109 if (not defined $all_channels{$_}) { 1110 $all_channels{$_} = $channels->{$_}; 1111 } 1112 elsif (Dumper($all_channels{$_}) ne Dumper($channels->{$_})) { 1113 warn "channel $_ differs between two files, taking first appearance\n"; 1114 } 1115 } 1116 1117 push @all_progs, @$progs if $do_progs; 1118 } 1119 1120 $all_encoding = 'UTF-8' if not defined $all_encoding; 1121 1122 my %all_credits; 1123 %all_credits = %$all_credits_nodate 1124 if defined $all_credits_nodate; 1125 $all_credits{date} = $all_credits_date 1126 if defined $all_credits_date; 1127 1128 if ($do_progs) { 1129 @all_progs = reverse @all_progs; 1130 my %seen_progs; 1131 my @dupe_indexes = reverse(grep { $seen_progs{ $all_progs[$_]->{start} . "|" . $all_progs[$_]->{stop} . "|" . $all_progs[$_]->{title}[0][0] . "|" . $all_progs[$_]->{channel} }++ } 0..$#all_progs); 1132 foreach my $item (@dupe_indexes) { 1133 # warn "duplicate programme detected, skipping\n" 1134 # . " " . $all_progs[$item]->{start} . "|" . $all_progs[$item]->{stop} . "|" . $all_progs[$item]->{title}[0][0] . "|" . $all_progs[$item]->{channel} . "\n"; 1135 splice (@all_progs,$item,1); 1136 } 1137 @all_progs = reverse @all_progs; 1138 1139 return [ $all_encoding, \%all_credits, \%all_channels, \@all_progs ]; 1140 } 1141 else { 1142 return [ $all_encoding, \%all_credits, \%all_channels ]; 1143 } 1144} 1145 1146 1147# For each subelement of programme, we define a subroutine to read it 1148# and one to write it. The reader takes an node for a single 1149# subelement and returns its value as a Perl scalar (warning and 1150# returning undef if error). The writer takes an XML::Writer, an 1151# element name and a scalar value and writes a subelement for that 1152# value. Note that the element name is passed in to the writer just 1153# for symmetry, so that neither the writer or the reader have to know 1154# what their element is called. 1155# 1156=pod 1157 1158=back 1159 1160=head1 DATA STRUCTURE 1161 1162For completeness, we describe more precisely how channels and 1163programmes are represented in Perl. Each element of the channels list 1164is a hashref corresponding to one <channel> element, and likewise for 1165programmes. The possible keys of a channel (programme) hash are the 1166names of attributes or subelements of <channel> (<programme>). 1167 1168The values for attributes are not processed in any way; an attribute 1169C<fred="jim"> in the XML will become a hash element with key C<'fred'>, 1170value C<'jim'>. 1171 1172But for subelements, there is further processing needed to turn the 1173XML content of a subelement into Perl data. What is done depends on 1174what type of data is stored under that subelement. Also, if a certain 1175element can appear several times then the hash key for that element 1176points to a list of values rather than just one. 1177 1178The conversion of a subelementE<39>s content to and from Perl data is 1179done by a handler. The most common handler is I<with-lang>, used for 1180human-readable text content plus an optional 'lang' attribute. There 1181are other handlers for other data structures in the file format. 1182Often two subelements will share the same handler, since they hold the 1183same type of data. The handlers defined are as follows; note that 1184many of them will silently strip leading and trailing whitespace in 1185element content. Look at the DTD itself for an explanation of the 1186whole file format. 1187 1188Unless specified otherwise, it is not allowed for an element expected 1189to contain text to have empty content, nor for the text to contain 1190newline characters. 1191 1192=over 1193 1194=item I<credits> 1195 1196Turns a list of credits (for director, actor, writer, etc.) into a 1197hash mapping 'role' to a list of names. The names in each role are 1198kept in the same order. 1199 1200=cut 1201 1202$Handlers{credits}->[0] = sub( $ ) { 1203 my $node = shift; 1204 my @roles = qw(director actor writer adapter producer composer 1205 editor presenter commentator guest); 1206 my %known_role; ++$known_role{$_} foreach @roles; 1207 my %r; 1208 foreach (get_subelements($node)) { 1209 my $role = get_name($_); 1210 unless ($known_role{$role}++) { 1211 warn "unknown thing in credits: $role"; 1212 next; 1213 } 1214 my %attrs = %{get_attrs($_)}; 1215 my $character = $attrs{role} if exists $attrs{role}; 1216 if (defined $character) { 1217 push @{$r{$role}}, [ get_text($_), $character ] ; 1218 } else { 1219 push @{$r{$role}}, get_text($_); 1220 } 1221 } 1222 return \%r; 1223}; 1224 1225$Handlers{'credits'}->[1] = sub( $$$ ) { 1226 my ($w, $e, $v) = @_; die if not defined $v; 1227 my %h = %$v; 1228 return if not %h; # don't write empty element 1229 t 'writing credits: ' . d \%h; 1230 # TODO some 'do nothing' setting in XML::Writer to replace this 1231 # convention of passing undef. 1232 # 1233 $w->startTag($e) if $w; 1234 foreach ( qw[director actor writer adapter producer composer 1235 editor presenter commentator guest] ) { 1236 next unless defined $h{$_}; 1237 my @people = @{delete $h{$_}}; 1238 foreach my $person (@people) { 1239 die if not defined $person; 1240 if (ref($person) eq 'ARRAY') { 1241 if ( defined @{$person}[1] && @{$person}[1] ne '' ) { 1242 $w->dataElement($_, @{$person}[0], 'role' => @{$person}[1] ) if $w; 1243 } else { 1244 $w->dataElement($_, @{$person}[0]) if $w; 1245 } 1246 } else { 1247 $w->dataElement($_, $person) if $w; 1248 } 1249 } 1250 } 1251 warn_unknown_keys($e, \%h); 1252 $w->endTag($e) if $w; 1253}; 1254 1255=pod 1256 1257=item I<scalar> 1258 1259Reads and writes a simple string as the content of the XML element. 1260 1261=cut 1262 1263$Handlers{scalar}->[0] = sub( $ ) { 1264 my $node = shift; 1265 return get_text($node); 1266}; 1267$Handlers{scalar}->[1] = sub( $$$ ) { 1268 my ($w, $e, $v) = @_; 1269 t 'scalar'; 1270 $w->dataElement($e, $v) if $w; 1271}; 1272 1273=pod 1274 1275=item I<length> 1276 1277Converts the content of a <length> element into a number of seconds 1278(so <length units="minutes">5</minutes> would be returned as 300). On 1279writing out again tries to convert a number of seconds to a time in 1280minutes or hours if that would look better. 1281 1282=cut 1283 1284$Handlers{length}->[0] = sub( $ ) { 1285 my $node = shift; die if not defined $node; 1286 my %attrs = %{get_attrs($node)}; 1287 my $d = get_text($node); 1288 if ($d =~ /^\s*$/) { 1289 warn "empty 'length' element"; 1290 return undef; 1291 } 1292 if ($d !~ tr/0-9// or $d =~ tr/0-9//c) { 1293 warn "bad content of 'length' element: $d"; 1294 return undef; 1295 } 1296 my $units = $attrs{units}; 1297 if (not defined $units) { 1298 warn "missing 'units' attr in 'length' element"; 1299 return undef; 1300 } 1301 # We want to return a length in seconds. 1302 if ($units eq 'seconds') { 1303 # Okay. 1304 } 1305 elsif ($units eq 'minutes') { 1306 $d *= 60; 1307 } 1308 elsif ($units eq 'hours') { 1309 $d *= 60 * 60; 1310 } 1311 else { 1312 warn "bad value of 'units': $units"; 1313 return undef; 1314 } 1315 return $d; 1316}; 1317$Handlers{length}->[1] = sub( $$$ ) { 1318 my ($w, $e, $v) = @_; 1319 t 'length'; 1320 my $units; 1321 if ($v % 3600 == 0) { 1322 $units = 'hours'; 1323 $v /= 3600; 1324 } 1325 elsif ($v % 60 == 0) { 1326 $units = 'minutes'; 1327 $v /= 60; 1328 } 1329 else { 1330 $units = 'seconds'; 1331 } 1332 $w->dataElement($e, $v, units => $units) if $w; 1333}; 1334 1335=pod 1336 1337=item I<episode-num> 1338 1339The representation in Perl of XMLTVE<39>s odd episode numbers is as a 1340pair of [ content, system ]. As specified by the DTD, if the system is 1341not given in the file then 'onscreen' is assumed. Whitespace in the 1342'xmltv_ns' system is unimportant, so on reading it is normalized to 1343a single space on either side of each dot. 1344 1345=cut 1346 1347$Handlers{'episode-num'}->[0] = sub( $ ) { 1348 my $node = shift; die if not defined $node; 1349 my %attrs = %{get_attrs($node)}; 1350 my $system = $attrs{system}; 1351 $system = 'onscreen' if not defined $system; 1352 my $content = get_text($node); 1353 if ($system eq 'xmltv_ns') { 1354 # Make it look nice. 1355 $content =~ s/\s+//g; 1356 $content =~ s/\./ . /g; 1357 } 1358 return [ $content, $system ]; 1359}; 1360$Handlers{'episode-num'}->[1] = sub( $$$ ) { 1361 my ($w, $e, $v) = @_; 1362 t 'episode number'; 1363 if (not ref $v or ref $v ne 'ARRAY') { 1364 warn "not writing episode-num whose content is not an array"; 1365 return; 1366 } 1367 my ($content, $system) = @$v; 1368 $system = 'onscreen' if not defined $system; 1369 $w->dataElement($e, $content, system => $system) if $w; 1370}; 1371 1372=pod 1373 1374=item I<video> 1375 1376The <video> section is converted to a hash. The <present> subelement 1377corresponds to the key 'present' of this hash, 'yes' and 'no' are 1378converted to Booleans. The same applies to <colour>. The content of 1379the <aspect> subelement is stored under the key 'aspect'. These keys 1380can be missing in the hash just as the subelements can be missing in 1381the XML. 1382 1383=cut 1384 1385$Handlers{video}->[0] = sub ( $ ) { 1386 my $node = shift; 1387 my %r; 1388 foreach (get_subelements($node)) { 1389 my $name = get_name($_); 1390 my $value = get_text($_); 1391 if ($name eq 'present') { 1392 warn "'present' seen twice" if defined $r{present}; 1393 $r{present} = decode_boolean($value); 1394 } 1395 elsif ($name eq 'colour') { 1396 warn "'colour' seen twice" if defined $r{colour}; 1397 $r{colour} = decode_boolean($value); 1398 } 1399 elsif ($name eq 'aspect') { 1400 warn "'aspect' seen twice" if defined $r{aspect}; 1401 $value =~ /^\d+:\d+$/ or warn "bad aspect ratio: $value"; 1402 $r{aspect} = $value; 1403 } 1404 elsif ($name eq 'quality') { 1405 warn "'quality' seen twice" if defined $r{quality}; 1406 $r{quality} = $value; 1407 } 1408 } 1409 return \%r; 1410}; 1411$Handlers{video}->[1] = sub( $$$ ) { 1412 my ($w, $e, $v) = @_; 1413 t "'video' element"; 1414 my %h = %$v; 1415 return if not %h; # don't write empty element 1416 $w->startTag($e) if $w; 1417 if (defined (my $val = delete $h{present})) { 1418 $w->dataElement('present', encode_boolean($val)) if $w; 1419 } 1420 if (defined (my $val = delete $h{colour})) { 1421 $w->dataElement('colour', encode_boolean($val)) if $w; 1422 } 1423 if (defined (my $val = delete $h{aspect})) { 1424 $w->dataElement('aspect', $val) if $w; 1425 } 1426 if (defined (my $val = delete $h{quality})) { 1427 $w->dataElement('quality', $val) if $w; 1428 } 1429 warn_unknown_keys("zz $e", \%h); 1430 $w->endTag($e) if $w; 1431}; 1432 1433=pod 1434 1435=item I<audio> 1436 1437This is similar to I<video>. <present> is a Boolean value, while 1438the content of <stereo> is stored unchanged. 1439 1440=cut 1441 1442$Handlers{audio}->[0] = sub( $ ) { 1443 my $node = shift; 1444 my %r; 1445 foreach (get_subelements($node)) { 1446 my $name = get_name($_); 1447 my $value = get_text($_); 1448 if ($name eq 'present') { 1449 warn "'present' seen twice" if defined $r{present}; 1450 $r{present} = decode_boolean($value); 1451 } 1452 elsif ($name eq 'stereo') { 1453 warn "'stereo' seen twice" if defined $r{stereo}; 1454 if ($value eq '') { 1455 warn "empty 'stereo' element not permitted, should be <stereo>stereo</stereo>"; 1456 $value = 'stereo'; 1457 } 1458 warn "bad value for 'stereo': '$value'" 1459 if ($value ne 'mono' 1460 and $value ne 'stereo' 1461 and $value ne 'bilingual' 1462 and $value ne 'surround' 1463 and $value ne 'dolby digital' 1464 and $value ne 'dolby'); 1465 $r{stereo} = $value; 1466 } 1467 } 1468 return \%r; 1469}; 1470$Handlers{audio}->[1] = sub( $$$ ) { 1471 my ($w, $e, $v) = @_; 1472 my %h = %$v; 1473 return if not %h; # don't write empty element 1474 $w->startTag($e) if $w; 1475 if (defined (my $val = delete $h{present})) { 1476 $w->dataElement('present', encode_boolean($val)) if $w; 1477 } 1478 if (defined (my $val = delete $h{stereo})) { 1479 $w->dataElement('stereo', $val) if $w; 1480 } 1481 warn_unknown_keys($e, \%h); 1482 $w->endTag($e) if $w; 1483}; 1484 1485=pod 1486 1487=item I<previously-shown> 1488 1489The 'start' and 'channel' attributes are converted to keys in a hash. 1490 1491=cut 1492 1493$Handlers{'previously-shown'}->[0] = sub( $ ) { 1494 my $node = shift; die if not defined $node; 1495 my %attrs = %{get_attrs($node)}; 1496 my $r = {}; 1497 foreach (qw(start channel)) { 1498 my $v = delete $attrs{$_}; 1499 $r->{$_} = $v if defined $v; 1500 } 1501 foreach (keys %attrs) { 1502 warn "unknown attribute $_ in previously-shown"; 1503 } 1504 return $r; 1505}; 1506$Handlers{'previously-shown'}->[1] = sub( $$$ ) { 1507 my ($w, $e, $v) = @_; 1508 $w->emptyTag($e, %$v) if $w; 1509}; 1510 1511=pod 1512 1513=item I<presence> 1514 1515The content of the element is ignored: it signfies something by its 1516very presence. So the conversion from XML to Perl is a constant true 1517value whenever the element is found; the conversion from Perl to XML 1518is to write out the element if true, donE<39>t write anything if false. 1519 1520=cut 1521 1522$Handlers{presence}->[0] = sub( $ ) { 1523 my $node = shift; 1524 # The 'new' element is empty, it signifies newness by its very 1525 # presence. 1526 # 1527 return 1; 1528}; 1529$Handlers{presence}->[1] = sub( $$$ ) { 1530 my ($w, $e, $v) = @_; 1531 if (not $v) { 1532 # Not new, so don't create an element. 1533 } 1534 else { 1535 $w->emptyTag($e) if $w; 1536 } 1537}; 1538 1539=pod 1540 1541=item I<subtitles> 1542 1543The 'type' attribute and the 'language' subelement (both optional) 1544become keys in a hash. But see I<language> for what to pass as the 1545value of that element. 1546 1547=cut 1548 1549$Handlers{subtitles}->[0] = sub( $ ) { 1550 my $node = shift; die if not defined $node; 1551 my %attrs = %{get_attrs($node)}; 1552 my %r; 1553 $r{type} = $attrs{type} if defined $attrs{type}; 1554 foreach (get_subelements($node)) { 1555 my $name = get_name($_); 1556 if ($name eq 'language') { 1557 warn "'language' seen twice" if defined $r{language}; 1558 $r{language} = read_with_lang($_, 0, 0); 1559 } 1560 else { 1561 warn "bad content of 'subtitles' element: $name"; 1562 } 1563 } 1564 return \%r; 1565}; 1566$Handlers{subtitles}->[1] = sub( $$$ ) { 1567 my ($w, $e, $v) = @_; 1568 t 'subtitles'; 1569 my ($type, $language) = ($v->{type}, $v->{language}); 1570 my %attrs; $attrs{type} = $type if defined $type; 1571 if (defined $language) { 1572 $w->startTag($e, %attrs) if $w; 1573 write_with_lang($w, 'language', $language, 0, 0); 1574 $w->endTag($e) if $w; 1575 } 1576 else { 1577 $w->emptyTag($e, %attrs) if $w; 1578 } 1579}; 1580 1581=pod 1582 1583=item I<rating> 1584 1585The rating is represented as a tuple of [ rating, system, icons ]. 1586The last element is itself a listref of structures returned by the 1587I<icon> handler. 1588 1589=cut 1590 1591$Handlers{rating}->[0] = sub( $ ) { 1592 my $node = shift; die if not defined $node; 1593 my %attrs = %{get_attrs($node)}; 1594 my $system = delete $attrs{system} if exists $attrs{system}; 1595 foreach (keys %attrs) { 1596 warn "unknown attribute in rating: $_"; 1597 } 1598 my @children = get_subelements($node); 1599 1600 # First child node is value. 1601 my $value_node = shift @children; 1602 if (not defined $value_node) { 1603 warn "missing 'value' element inside rating"; 1604 return undef; 1605 } 1606 if ((my $name = get_name($value_node)) ne 'value') { 1607 warn "expected 'value' node inside rating, got '$name'"; 1608 return undef; 1609 } 1610 1611 my $rating = read_value($value_node); 1612 1613 # Remaining children are icons. 1614 my @icons = map { read_icon($_) } @children; 1615 1616 return [ $rating, $system, \@icons ]; 1617}; 1618$Handlers{rating}->[1] = sub( $$$ ) { 1619 my ($w, $e, $v) = @_; 1620 if (not ref $v or ref $v ne 'ARRAY') { 1621 warn "not writing rating whose content is not an array"; 1622 return; 1623 } 1624 my ($rating, $system, $icons) = @$v; 1625 if (defined $system) { 1626 $w->startTag($e, system => $system) if $w; 1627 } 1628 else { 1629 $w->startTag($e) if $w; 1630 } 1631 1632 write_value($w, 'value', $rating) if $w; 1633 if ($w) { write_icon($w, 'icon', $_) foreach @$icons }; 1634 $w->endTag($e) if $w; 1635}; 1636 1637=pod 1638 1639=item I<star-rating> 1640 1641In XML this is a string 'X/Y' plus a list of icons. In Perl represented 1642as a pair [ rating, icons ] similar to I<rating>. 1643 1644Multiple star ratings are now supported. For backward compatibility, 1645you may specify a single [rating,icon] or the preferred double array 1646[[rating,system,icon],[rating2,system2,icon2]] (like 'ratings') 1647 1648 1649=cut 1650 1651$Handlers{'star-rating'}->[0] = sub( $ ) { 1652 my $node = shift; 1653 my %attrs = %{get_attrs($node)}; 1654 my $system = delete $attrs{system} if exists $attrs{system}; 1655 my @children = get_subelements($node); 1656 1657 # First child node is value. 1658 my $value_node = shift @children; 1659 if (not defined $value_node) { 1660 warn "missing 'value' element inside star-rating"; 1661 return undef; 1662 } 1663 if ((my $name = get_name($value_node)) ne 'value') { 1664 warn "expected 'value' node inside star-rating, got '$name'"; 1665 return undef; 1666 } 1667 my $rating = read_value($value_node); 1668 1669 # Remaining children are icons. 1670 my @icons = map { read_icon($_) } @children; 1671 1672 return [ $rating, $system, \@icons ]; 1673}; 1674$Handlers{'star-rating'}->[1] = sub ( $$$ ) { 1675 my ($w, $e, $v) = @_; 1676# 1677# 10/31/2007 star-rating can now have multiple values (and system=) 1678# let's make it so old code still works! 1679# 1680 if (not ref $v or ref $v ne 'ARRAY') { 1681 $v=[$v]; 1682# warn "not writing star-rating whose content is not an array"; 1683# return; 1684 } 1685 my ($rating, $system, $icons) = @$v; 1686 if (defined $system) { 1687 $w->startTag($e, system => $system) if $w; 1688 } 1689 else { 1690 $w->startTag($e) if $w; 1691 } 1692 write_value($w, 'value', $rating) if $w; 1693 if ($w) { write_icon($w, 'icon', $_) foreach @$icons }; 1694 $w->endTag($e) if $w; 1695}; 1696 1697=pod 1698 1699=item I<icon> 1700 1701An icon in XMLTV files is like the <img> element in HTML. It is 1702represented in Perl as a hashref with 'src' and optionally 'width' 1703and 'height' keys. 1704 1705=cut 1706 1707sub write_icon( $$$ ) { 1708 my ($w, $e, $v) = @_; 1709 croak "no 'src' attribute for icon\n" if not defined $v->{src}; 1710 croak "bad width $v->{width} for icon\n" 1711 if defined $v->{width} and $v->{width} !~ /^\d+$/; 1712 croak "bad height $v->{height} for icon\n" 1713 if defined $v->{height} and $v->{height} !~ /^\d+$/; 1714 1715 foreach (keys %$v) { 1716 warn "unrecognized key in icon: $_\n" 1717 if $_ ne 'src' and $_ ne 'width' and $_ ne 'height'; 1718 } 1719 1720 $w->emptyTag($e, %$v); 1721} 1722sub read_icon( $ ) { 1723 my $node = shift; die if not defined $node; 1724 my %attrs = %{get_attrs($node)}; 1725 warn "missing 'src' attribute in icon" if not defined $attrs{src}; 1726 return \%attrs; 1727} 1728$Handlers{icon}->[0] = \&read_icon; 1729$Handlers{icon}->[1] = sub( $$$ ) { 1730 my ($w, $e, $v) = @_; 1731 write_icon($w, $e, $v) if $w; 1732}; 1733 1734# To keep things tidy some elements that can have icons store their 1735# textual content inside a subelement called 'value'. These two 1736# routines are a bit trivial but they're here for consistency. 1737# 1738sub read_value( $ ) { 1739 my $value_node = shift; 1740 my $v = get_text($value_node); 1741 if (not defined $v or $v eq '') { 1742 warn "no content of 'value' element"; 1743 return undef; 1744 } 1745 return $v; 1746} 1747sub write_value( $$$ ) { 1748 my ($w, $e, $v) = @_; 1749 $w->dataElement($e, $v) if $w; 1750}; 1751 1752 1753# Booleans in XMLTV files are 'yes' or 'no'. 1754sub decode_boolean( $ ) { 1755 my $value = shift; 1756 if ($value eq 'no') { 1757 return 0; 1758 } 1759 elsif ($value eq 'yes') { 1760 return 1; 1761 } 1762 else { 1763 warn "bad boolean: $value"; 1764 return undef; 1765 } 1766} 1767sub encode_boolean( $ ) { 1768 my $v = shift; 1769 warn "expected a Perl boolean like 0 or 1, not '$v'\n" 1770 if $v and $v != 1; 1771 return $v ? 'yes' : 'no'; 1772} 1773 1774 1775=pod 1776 1777=item I<with-lang> 1778 1779In XML something like title can be either <title>Foo</title> 1780or <title lang="en">Foo</title>. In Perl these are stored as 1781[ 'Foo' ] and [ 'Foo', 'en' ]. For the former [ 'Foo', undef ] 1782would also be okay. 1783 1784This handler also has two modifiers which may be added to the name 1785after '/'. I</e> means that empty text is allowed, and will be 1786returned as the empty tuple [], to mean that the element is present 1787but has no text. When writing with I</e>, undef will also be 1788understood as present-but-empty. You cannot however specify a 1789language if the text is empty. 1790 1791The modifier I</m> means that the text is allowed to span multiple 1792lines. 1793 1794So for example I<with-lang/em> is a handler for text with language, 1795where the text may be empty and may contain newlines. Note that the 1796I<with-lang-or-empty> of earlier releases has been replaced by 1797I<with-lang/e>. 1798 1799=cut 1800 1801sub read_with_lang( $$$ ) { 1802 my ($node, $allow_empty, $allow_nl) = @_; 1803 die if not defined $node; 1804 my %attrs = %{get_attrs($node)}; 1805 my $lang = $attrs{lang} if exists $attrs{lang}; 1806 my $value = get_text($node, $allow_nl); 1807 if (not length $value) { 1808 if (not $allow_empty) { 1809 warn 'empty string for with-lang value'; 1810 return undef; 1811 } 1812 warn 'empty string may not have language' if defined $lang; 1813 return []; 1814 } 1815 if (defined $lang) { 1816 return [ $value, $lang ]; 1817 } 1818 else { 1819 return [ $value ]; 1820 } 1821} 1822$Handlers{'with-lang'}->[0] = sub( $ ) { read_with_lang($_[0], 0, 0) }; 1823$Handlers{'with-lang/'}->[0] = sub( $ ) { read_with_lang($_[0], 0, 0) }; 1824$Handlers{'with-lang/e'}->[0] = sub( $ ) { read_with_lang($_[0], 1, 0) }; 1825$Handlers{'with-lang/m'}->[0] = sub( $ ) { read_with_lang($_[0], 0, 1) }; 1826$Handlers{'with-lang/em'}->[0] = sub( $ ) { read_with_lang($_[0], 1, 1) }; 1827$Handlers{'with-lang/me'}->[0] = sub( $ ) { read_with_lang($_[0], 1, 1) }; 1828 1829sub write_with_lang( $$$$$ ) { 1830 my ($w, $e, $v, $allow_empty, $allow_nl) = @_; 1831 if (not ref $v or ref $v ne 'ARRAY') { 1832 warn "not writing with-lang whose content is not an array"; 1833 return; 1834 } 1835 1836 if (not @$v) { 1837 if (not $allow_empty) { 1838 warn "not writing no content for $e"; 1839 return; 1840 } 1841 $v = [ '' ]; 1842 } 1843 1844 my ($text, $lang) = @$v; 1845 t 'writing character data: ' . d $text; 1846 if (not defined $text) { 1847 warn "not writing undefined value for $e"; 1848 return; 1849 } 1850 1851# 1852# strip whitespace silently. 1853# we used to use a warn, but later on the code catches this and drops the record 1854# 1855 my $old_text = $text; 1856 $text =~ s/^\s+//; 1857 $text =~ s/\s+$//; 1858 1859 if (not length $text) { 1860 if (not $allow_empty) { 1861 warn "not writing empty content for $e"; 1862 return; 1863 } 1864 if (defined $lang) { 1865 warn "not writing empty content with language for $e"; 1866 return; 1867 } 1868 $w->emptyTag($e) if $w; 1869 return; 1870 } 1871 1872 if (not $allow_nl and $text =~ tr/\n//) { 1873 warn "not writing text containing newlines for $e"; 1874 return; 1875 } 1876 1877 if (defined $lang) { 1878 $w->dataElement($e, $text, lang => $lang) if $w; 1879 } 1880 else { 1881 $w->dataElement($e, $text) if $w; 1882 } 1883} 1884$Handlers{'with-lang'}->[1] = sub( $$$ ) { write_with_lang($_[0], $_[1], $_[2], 0, 0) }; 1885$Handlers{'with-lang/'}->[1] = sub( $$$ ) { write_with_lang($_[0], $_[1], $_[2], 0, 0) }; 1886$Handlers{'with-lang/e'}->[1] = sub( $$$ ) { write_with_lang($_[0], $_[1], $_[2], 1, 0) }; 1887$Handlers{'with-lang/m'}->[1] = sub( $$$ ) { write_with_lang($_[0], $_[1], $_[2], 0, 1) }; 1888$Handlers{'with-lang/em'}->[1] = sub( $$$ ) { write_with_lang($_[0], $_[1], $_[2], 1, 1) }; 1889$Handlers{'with-lang/me'}->[1] = sub( $$$ ) { write_with_lang($_[0], $_[1], $_[2], 1, 1) }; 1890 1891# Sanity check. 1892foreach (keys %Handlers) { 1893 my $v = $Handlers{$_}; 1894 if (@$v != 2 1895 or ref($v->[0]) ne 'CODE' 1896 or ref($v->[1]) ne 'CODE') { 1897 die "bad handler pair for $_\n"; 1898 } 1899} 1900 1901=pod 1902 1903=back 1904 1905Now, which handlers are used for which subelements (keys) of channels 1906and programmes? And what is the multiplicity (should you expect a 1907single value or a list of values)? 1908 1909The following tables map subelements of <channel> and of <programme> 1910to the handlers used to read and write them. Many elements have their 1911own handler with the same name, and most of the others use 1912I<with-lang>. The third column specifies the multiplicity of the 1913element: B<*> (any number) will give a list of values in Perl, B<+> 1914(one or more) will give a nonempty list, B<?> (maybe one) will give a 1915scalar, and B<1> (exactly one) will give a scalar which is not undef. 1916 1917=head2 Handlers for <channel> 1918 1919@CHANNEL_HANDLERS 1920 1921=head2 Handlers for <programme> 1922 1923@PROGRAMME_HANDLERS 1924 1925At present, no parsing or validation on dates is done because dates 1926may be partially specified in XMLTV. For example '2001' means that 1927the year is known but not the month, day or time of day. Maybe in the 1928future dates will be automatically converted to and from 1929B<Date::Manip> objects. For now they just use the I<scalar> handler. 1930Similar remarks apply to URLs. 1931 1932=cut 1933 1934# Private. 1935sub node_to_programme( $ ) { 1936 my $node = shift; die if not defined $node; 1937 my %programme; 1938 1939 # Attributes of programme element. 1940 %programme = %{get_attrs($node)}; 1941 t 'attributes: ' . d \%programme; 1942 1943 # Check the required attributes are there. As with most checking, 1944 # this isn't an alternative to using a validator but it does save 1945 # some headscratching during debugging. 1946 # 1947 foreach (qw(start channel)) { 1948 if (not defined $programme{$_}) { 1949 warn "programme missing '$_' attribute\n"; 1950 return undef; 1951 } 1952 } 1953 my @known_attrs = map { $_->[0] } @Programme_Attributes; 1954 my %ka; ++$ka{$_} foreach @known_attrs; 1955 foreach (keys %programme) { 1956 unless ($ka{$_}) { 1957 warn "deleting unknown attribute '$_'"; 1958 delete $programme{$_}; 1959 } 1960 } 1961 1962 call_handlers_read($node, \@Programme_Handlers, \%programme); 1963 return \%programme; 1964} 1965 1966 1967# Private. 1968sub node_to_channel( $ ) { 1969 my $node = shift; die if not defined $node; 1970 my %channel; 1971 t 'node_to_channel() ENTRY'; 1972 1973 %channel = %{get_attrs($node)}; 1974 t 'attributes: ' . d \%channel; 1975 if (not defined $channel{id}) { 1976 warn "channel missing 'id' attribute\n"; 1977 } 1978 foreach (keys %channel) { 1979 unless (/^_/ or $_ eq 'id') { 1980 warn "deleting unknown attribute '$_'"; 1981 delete $channel{$_}; 1982 } 1983 } 1984 1985 t '\@Channel_Handlers=' . d \@Channel_Handlers; 1986 call_handlers_read($node, \@Channel_Handlers, \%channel); 1987 return \%channel; 1988} 1989 1990 1991 1992# Private. 1993# 1994# call_handlers_read() 1995# 1996# Read the subelements of a node according to a list giving a 1997# handler subroutine for each subelement. 1998# 1999# Parameters: 2000# node 2001# Reference to list of handlers: tuples of 2002# [element-name, handler-name, multiplicity] 2003# Reference to hash for storing results 2004# 2005# Warns if errors, but attempts to contine. 2006# 2007sub call_handlers_read( $$$ ) { 2008 my ($node, $handlers, $r) = @_; 2009 t 'call_handlers_read() using handlers: ' . d $handlers; 2010 2011 die unless ref($r) eq 'HASH'; 2012 our %r; local *r = $r; 2013 t 'going through each child of node'; 2014 2015 # Current position in handlers. We expect to read the subelements 2016 # in the correct order as specified by the DTD. 2017 # 2018 my $handler_pos = 0; 2019 2020 SUBELEMENT: foreach (get_subelements($node)) { 2021 t 'doing subelement'; 2022 my $name = get_name($_); 2023 t "tag name: $name"; 2024 2025 # Search for a handler - from $handler_pos onwards. But 2026 # first, just warn if somebody is trying to use an element in 2027 # the wrong place (trying to go backwards in the list). 2028 # 2029 my $found_pos; 2030 foreach my $i (0 .. $handler_pos - 1) { 2031 if ($name eq $handlers->[$i]->[0]) { 2032 warn "element $name not expected here"; 2033 next SUBELEMENT; 2034 } 2035 } 2036 for (my $i = $handler_pos; $i < @$handlers; $i++) { 2037 if ($handlers->[$i]->[0] eq $name) { 2038 t 'found handler'; 2039 $found_pos = $i; 2040 last; 2041 } 2042 else { 2043 t "doesn't match name $handlers->[$i]->[0]"; 2044 my ($handler_name, $h, $multiplicity) 2045 = @{$handlers->[$i]}; 2046 die if not defined $handler_name; 2047 die if $handler_name eq ''; 2048 2049 # Before we skip over this element, check that we got 2050 # the necessary values for it. 2051 # 2052 if ($multiplicity eq '?') { 2053 # Don't need to check whether this set. 2054 } 2055 elsif ($multiplicity eq '1') { 2056 if (not defined $r{$handler_name}) { 2057 warn "no element $handler_name found"; 2058 } 2059 } 2060 elsif ($multiplicity eq '*') { 2061 # It's okay if nothing was ever set. We don't 2062 # insist on putting in an empty list. 2063 # 2064 } 2065 elsif ($multiplicity eq '+') { 2066 if (not defined $r{$handler_name}) { 2067 warn "no element $handler_name found"; 2068 } 2069 elsif (not @{$r{$handler_name}}) { 2070 warn "strangely, empty list for $handler_name"; 2071 } 2072 } 2073 else { 2074 warn "bad value of $multiplicity: $!"; 2075 } 2076 } 2077 } 2078 if (not defined $found_pos) { 2079 warn "unknown element $name"; 2080 next; 2081 } 2082 # Next time we begin searching from this position. 2083 $handler_pos = $found_pos; 2084 2085 # Call the handler. 2086 t 'calling handler'; 2087 my ($handler_name, $h_name, $multiplicity) 2088 = @{$handlers->[$found_pos]}; 2089 die if $handler_name ne $name; 2090 my $h = $Handlers{$h_name}; die "no handler $h_name" if not $h; 2091 my $result = $h->[0]->($_); # call reader sub 2092 t 'result: ' . d $result; 2093 warn("skipping bad $name\n"), next if not defined $result; 2094 2095 # Now set the value. We can't do multiplicity checking yet 2096 # because there might be more elements of this type still to 2097 # come. 2098 # 2099 if ($multiplicity eq '?' or $multiplicity eq '1') { 2100 warn "seen $name twice" 2101 if defined $r{$name}; 2102 $r{$name} = $result; 2103 } 2104 elsif ($multiplicity eq '*' or $multiplicity eq '+') { 2105 push @{$r{$name}}, $result; 2106 } 2107 else { 2108 warn "bad multiplicity: $multiplicity"; 2109 } 2110 } 2111} 2112 2113sub warn_unknown_keys( $$ ) { 2114 my $elem_name = shift; 2115 our %k; local *k = shift; 2116 foreach (keys %k) { 2117 /^_/ 2118 or $warned_unknown_key{$elem_name}->{$_}++ 2119 or warn "unknown key $_ in $elem_name hash\n"; 2120 } 2121} 2122 2123package XMLTV::Writer; 2124use base 'XML::Writer'; 2125use Carp; 2126 2127use Date::Manip qw/UnixDate DateCalc/; 2128 2129# Use Log::TraceMessages if installed. 2130BEGIN { 2131 eval { require Log::TraceMessages }; 2132 if ($@) { 2133 *t = sub {}; 2134 *d = sub { '' }; 2135 } 2136 else { 2137 *t = \&Log::TraceMessages::t; 2138 *d = \&Log::TraceMessages::d; 2139 } 2140} 2141 2142BEGIN { 2143 if (int(Date::Manip::DateManipVersion) >= 6) { 2144 Date::Manip::Date_Init("SetDate=now,UTC"); 2145 } else { 2146 Date::Manip::Date_Init("TZ=UTC"); 2147 } 2148} 2149 2150# Override dataElement() to refuse writing empty or whitespace 2151# elements. 2152# 2153sub dataElement( $$$@ ) { 2154 my ($self, $elem, $content, @rest) = @_; 2155 if ($content !~ /\S/) { 2156 warn "not writing empty content for $elem"; 2157 return; 2158 } 2159 return $self->SUPER::dataElement($elem, $content, @rest); 2160} 2161 2162=pod 2163 2164=head1 WRITING 2165 2166When reading a file you have the choice of using C<parse()> to gulp 2167the whole file and return a data structure, or using 2168C<parse_callback()> to get the programmes one at a time, although 2169channels and other data are still read all at once. 2170 2171There is a similar choice when writing data: the C<write_data()> 2172routine prints a whole XMLTV document at once, but if you want to 2173write an XMLTV document incrementally you can manually create an 2174C<XMLTV::Writer> object and call methods on it. Synopsis: 2175 2176 use XMLTV; 2177 my $w = new XMLTV::Writer(); 2178 $w->comment("Hello from XML::Writer's comment() method"); 2179 $w->start({ 'generator-info-name' => 'Example code in pod' }); 2180 my %ch = (id => 'test-channel', 'display-name' => [ [ 'Test', 'en' ] ]); 2181 $w->write_channel(\%ch); 2182 my %prog = (channel => 'test-channel', start => '200203161500', 2183 title => [ [ 'News', 'en' ] ]); 2184 $w->write_programme(\%prog); 2185 $w->end(); 2186 2187XMLTV::Writer inherits from XML::Writer, and provides the following extra 2188or overridden methods: 2189 2190=over 2191 2192=item new(), the constructor 2193 2194Creates an XMLTV::Writer object and starts writing an XMLTV file, printing 2195the DOCTYPE line. Arguments are passed on to XML::WriterE<39>s constructor, 2196except for the following: 2197 2198the 'encoding' key if present gives the XML character encoding. 2199For example: 2200 2201 my $w = new XMLTV::Writer(encoding => 'ISO-8859-1'); 2202 2203If encoding is not specified, XML::WriterE<39>s default is used 2204(currently UTF-8). 2205 2206XMLTW::Writer can also filter out specific days from the data. This is 2207useful if the datasource provides data for periods of time that does not 2208match the days that the user has asked for. The filtering is controlled 2209with the days, offset and cutoff arguments: 2210 2211 my $w = new XMLTV::Writer( 2212 offset => 1, 2213 days => 2, 2214 cutoff => "050000" ); 2215 2216In this example, XMLTV::Writer will discard all entries that do not have 2217starttimes larger than or equal to 05:00 tomorrow and less than 05:00 2218two days after tomorrow. The time offset is stripped off the starttime before 2219the comparison is made. 2220 2221=cut 2222 2223sub new { 2224 my $proto = shift; 2225 my $class = ref($proto) || $proto; 2226 my %args = @_; 2227 croak 'OUTPUT requires a filehandle, not a filename or anything else' 2228 if exists $args{OUTPUT} and not ref $args{OUTPUT}; 2229 2230 # force OUTPUT explicitly to standard output to avoid warnings about 2231 # undefined OUTPUT in XML::Writer where it tests against 'self' 2232 if (!exists $args{OUTPUT}) { 2233 $args{OUTPUT} = \*STDOUT; 2234 } 2235 my $encoding = delete $args{encoding}; 2236 my $days = delete $args{days}; 2237 my $offset = delete $args{offset}; 2238 my $cutoff = delete $args{cutoff}; 2239 2240 my $self = $class->SUPER::new(DATA_MODE => 1, DATA_INDENT => 2, %args); 2241 bless($self, $class); 2242 2243 if (defined $encoding) { 2244 $self->xmlDecl($encoding); 2245 } 2246 else { 2247 # XML::Writer puts in 'encoding="UTF-8"' even if you don't ask 2248 # for it. 2249 # 2250 warn "assuming default UTF-8 encoding for output\n"; 2251 $self->xmlDecl(); 2252 } 2253 2254# $Log::TraceMessages::On = 1; 2255 $self->{mintime} = "19700101000000"; 2256 $self->{maxtime} = "29991231235959"; 2257 2258 2259 if (defined( $days ) and defined( $offset ) and defined( $cutoff )) { 2260 $self->{mintime} = UnixDate( 2261 DateCalc( "today", "+" . $offset . " days" ), 2262 "%Y%m%d") . $cutoff; 2263 t "using mintime $self->{mintime}"; 2264 2265 $self->{maxtime} = UnixDate( 2266 DateCalc("today", "+" . ($offset+$days) . " days"), 2267 "%Y%m%d" ) . $cutoff; 2268 t "using maxtime $self->{maxtime}"; 2269 } 2270 elsif (defined( $days ) or defined( $offset ) or defined($cutoff)) { 2271 croak 'You must specify days, offset and cutoff or none of them'; 2272 } 2273 2274 { 2275 local $^W = 0; $self->doctype('tv', undef, 'xmltv.dtd'); 2276 } 2277 $self->{xmltv_writer_state} = 'new'; 2278 return $self; 2279} 2280 2281=pod 2282 2283=item start() 2284 2285Write the start of the <tv> element. Parameter is a hashref which gives 2286the attributes of this element. 2287 2288=cut 2289 2290sub start { 2291 my $self = shift; 2292 die 'usage: XMLTV::Writer->start(hashref of attrs)' if @_ != 1; 2293 my $attrs = shift; 2294 2295 for ($self->{xmltv_writer_state}) { 2296 if ($_ eq 'new') { 2297 # Okay. 2298 } 2299 elsif ($_ eq 'channels' or $_ eq 'programmes') { 2300 croak 'cannot call start() more than once on XMLTV::Writer'; 2301 } 2302 elsif ($_ eq 'end') { 2303 croak 'cannot do anything with end()ed XMLTV::Writer'; 2304 } 2305 else { die } 2306 2307 $_ = 'channels'; 2308 } 2309 $self->startTag('tv', order_attrs(%{$attrs})); 2310} 2311 2312=pod 2313 2314=item write_channels() 2315 2316Write several channels at once. Parameter is a reference to a hash 2317mapping channel id to channel details. They will be written sorted 2318by id, which is reasonable since the order of channels in an XMLTV 2319file isnE<39>t significant. 2320 2321=cut 2322 2323sub write_channels { 2324 my ($self, $channels) = @_; 2325 t('write_channels(' . d($self) . ', ' . d($channels) . ') ENTRY'); 2326 croak 'expected hashref of channels' if ref $channels ne 'HASH'; 2327 2328 for ($self->{xmltv_writer_state}) { 2329 if ($_ eq 'new') { 2330 croak 'must call start() on XMLTV::Writer first'; 2331 } 2332 elsif ($_ eq 'channels') { 2333 # Okay. 2334 } 2335 elsif ($_ eq 'programmes') { 2336 croak 'cannot write channels after writing programmes'; 2337 } 2338 elsif ($_ eq 'end') { 2339 croak 'cannot do anything with end()ed XMLTV::Writer'; 2340 } 2341 else { die } 2342 } 2343 2344 my @ids = sort keys %$channels; 2345 t 'sorted list of channel ids: ' . d \@ids; 2346 foreach (@ids) { 2347 t "writing channel with id $_"; 2348 my $ch = $channels->{$_}; 2349 $self->write_channel($ch); 2350 } 2351 t('write_channels() EXIT'); 2352} 2353 2354=pod 2355 2356=item write_channel() 2357 2358Write a single channel. You can call this routine if you want, but 2359most of the time C<write_channels()> is a better interface. 2360 2361=cut 2362 2363sub write_channel { 2364 my ($self, $ch) = @_; 2365 croak 'undef channel hash passed' if not defined $ch; 2366 croak "expected a hashref, got: $ch" if ref $ch ne 'HASH'; 2367 2368 for ($self->{xmltv_writer_state}) { 2369 if ($_ eq 'new') { 2370 croak 'must call start() on XMLTV::Writer first'; 2371 } 2372 elsif ($_ eq 'channels') { 2373 # Okay. 2374 } 2375 elsif ($_ eq 'programmes') { 2376 croak 'cannot write channels after writing programmes'; 2377 } 2378 elsif ($_ eq 'end') { 2379 croak 'cannot do anything with end()ed XMLTV::Writer'; 2380 } 2381 else { die } 2382 } 2383 2384 my %ch = %$ch; # make a copy 2385 my $id = delete $ch{id}; 2386 die "no 'id' attribute in channel" if not defined $id; 2387 write_element_with_handlers($self, 'channel', { id => $id }, 2388 \@XMLTV::Channel_Handlers, \%ch); 2389} 2390 2391=pod 2392 2393=item write_programme() 2394 2395Write details for a single programme as XML. 2396 2397=cut 2398 2399sub write_programme { 2400 my $self = shift; 2401 die 'usage: XMLTV::Writer->write_programme(programme hash)' if @_ != 1; 2402 my $ref = shift; 2403 croak 'write_programme() expects programme hashref' 2404 if ref $ref ne 'HASH'; 2405 t('write_programme(' . d($self) . ', ' . d($ref) . ') ENTRY'); 2406 2407 for ($self->{xmltv_writer_state}) { 2408 if ($_ eq 'new') { 2409 croak 'must call start() on XMLTV::Writer first'; 2410 } 2411 elsif ($_ eq 'channels') { 2412 $_ = 'programmes'; 2413 } 2414 elsif ($_ eq 'programmes') { 2415 # Okay. 2416 } 2417 elsif ($_ eq 'end') { 2418 croak 'cannot do anything with end()ed XMLTV::Writer'; 2419 } 2420 else { die } 2421 } 2422 2423 # We make a copy of the programme hash and delete elements from it 2424 # as they are dealt with; then we can easily spot any unhandled 2425 # elements at the end. 2426 # 2427 my %p = %$ref; 2428 2429 # First deal with those hash keys that refer to metadata on when 2430 # the programme is broadcast. After taking those out of the hash, 2431 # we can use the handlers to output individual details. 2432 # 2433 my %attrs; 2434 die if not @XMLTV::Programme_Attributes; 2435 foreach (@XMLTV::Programme_Attributes) { 2436 my ($name, $mult) = @$_; 2437 t "looking for key $name"; 2438 my $val = delete $p{$name}; 2439 if ($mult eq '?') { 2440 # No need to check anything. 2441 } 2442 elsif ($mult eq '1') { 2443 if (not defined $val) { 2444 warn "programme hash missing $name key, skipping"; 2445 return; 2446 } 2447 } 2448 else { die "bad multiplicity for attribute: $mult" } 2449 $attrs{$name} = $val if defined $val; 2450 } 2451 2452 # We use string comparisons without timeoffsets for comparing times. 2453 my( $start ) = split( /\s+/, $attrs{start} ); 2454 if( $start lt $self->{mintime} or 2455 $start ge $self->{maxtime} ) { 2456 t "skipping programme with start $attrs{start}"; 2457 return; 2458 } 2459 2460 t "beginning 'programme' element"; 2461 write_element_with_handlers($self, 'programme', \%attrs, 2462 \@XMLTV::Programme_Handlers, \%p); 2463} 2464 2465=pod 2466 2467=item end() 2468 2469Say youE<39>ve finished writing programmes. This ends the <tv> element 2470and the file. 2471 2472=cut 2473 2474sub end { 2475 my $self = shift; 2476 2477 for ($self->{xmltv_writer_state}) { 2478 if ($_ eq 'new') { 2479 croak 'must call start() on XMLTV::Writer first'; 2480 } 2481 elsif ($_ eq 'channels' or $_ eq 'programmes') { 2482 $_ = 'end'; 2483 } 2484 elsif ($_ eq 'end') { 2485 croak 'cannot do anything with end()ed XMLTV::Writer'; 2486 } 2487 else { die } 2488 } 2489 2490 $self->endTag('tv'); 2491 $self->SUPER::end(@_); 2492} 2493 2494 2495# Private. 2496# order_attrs() 2497# 2498# In XML the order of attributes is not significant. But to make 2499# things look nice we try to output them in the same order as given in 2500# the DTD. 2501# 2502# Takes a list of (key, value, key, value, ...) and returns one with 2503# keys in a nice-looking order. 2504# 2505sub order_attrs { 2506 die "expected even number of elements, from a hash" 2507 if @_ % 2; 2508 my @a = ((map { $_->[0] } (@XMLTV::Channel_Attributes, 2509 @XMLTV::Programme_Attributes)), 2510 qw(date source-info-url source-info-name source-data-url 2511 generator-info-name generator-info-url)); 2512 2513 my @r; 2514 my %in = @_; 2515 foreach (@a) { 2516 if (exists $in{$_}) { 2517 my $v = delete $in{$_}; 2518 push @r, $_, $v; 2519 } 2520 } 2521 2522 foreach (sort keys %in) { 2523 warn "unknown attribute $_" unless /^_/; 2524 push @r, $_, $in{$_}; 2525 } 2526 2527 return @r; 2528} 2529 2530 2531# Private. 2532# 2533# Writes the elements of a hash to an XMLTV::Writer using a list of 2534# handlers. Deletes keys (modifying the hash passed in) as they are 2535# written. 2536# 2537# Requires all mandatory keys be present in the hash - if you're not 2538# sure then use check_multiplicity() first. 2539# 2540# Returns true if the element was successfully written, or if any 2541# errors found don't look serious enough to cause bad XML. If the 2542# XML::Writer object passed in is undef then nothing is written (since 2543# the write handlers are coded like that.) 2544# 2545sub call_handlers_write( $$$ ) { 2546 my ($self, $handlers, $input) = @_; 2547 t 'writing input hash: ' . d $input; 2548 die if not defined $input; 2549 2550 my $bad = 0; 2551 foreach (@$handlers) { 2552 my ($name, $h_name, $multiplicity) = @$_; 2553 my $h = $XMLTV::Handlers{$h_name}; die "no handler $h_name" if not $h; 2554 my $writer = $h->[1]; die if not defined $writer; 2555 t "doing handler for $name$multiplicity"; 2556 local $SIG{__WARN__} = sub { 2557 warn "$name element: $_[0]"; 2558 $bad = 1; 2559 }; 2560 my $val = delete $input->{$name}; 2561 t 'got value(s): ' . d $val; 2562 if ($multiplicity eq '1') { 2563 $writer->($self, $name, $val); 2564 } 2565 elsif ($multiplicity eq '?') { 2566 $writer->($self, $name, $val) if defined $val; 2567 } 2568 elsif ($multiplicity eq '*' or $multiplicity eq '+') { 2569 croak "value for key $name should be an array ref" 2570 if defined $val and ref $val ne 'ARRAY'; 2571 foreach (@{$val}) { 2572 t 'writing value: ' . d $_; 2573 $writer->($self, $name, $_); 2574 t 'finished writing multiple values'; 2575 } 2576 } 2577 else { 2578 warn "bad multiplicity specifier: $multiplicity"; 2579 } 2580 } 2581 t 'leftover keys: ' . d([ sort keys %$input ]); 2582 return not $bad; 2583} 2584 2585 2586# Private. 2587# 2588# Warns about missing keys that are supposed to be mandatory. Returns 2589# true iff everything is okay. 2590# 2591sub check_multiplicity( $$ ) { 2592 my ($handlers, $input) = @_; 2593 foreach (@$handlers) { 2594 my ($name, $h_name, $multiplicity) = @$_; 2595 t "checking handler for $name: $h_name with multiplicity $multiplicity"; 2596 if ($multiplicity eq '1') { 2597 if (not defined $input->{$name}) { 2598 warn "hash missing value for $name"; 2599 return 0; 2600 } 2601 } 2602 elsif ($multiplicity eq '?') { 2603 # Okay if not present. 2604 } 2605 elsif ($multiplicity eq '*') { 2606 # Not present, or undef, is treated as empty list. 2607 } 2608 elsif ($multiplicity eq '+') { 2609 t 'one or more, checking for a listref with no undef values'; 2610 my $val = $input->{$name}; 2611 if (not defined $val) { 2612 warn "hash missing value for $name (expected list)"; 2613 return 0; 2614 } 2615 if (ref($val) ne 'ARRAY') { 2616 die "hash has bad contents for $name (expected list)"; 2617 return 0; 2618 } 2619 2620 t 'all values: ' . d $val; 2621 my @new_val = grep { defined } @$val; 2622 t 'values that are defined: ' . d \@new_val; 2623 if (@new_val != @$val) { 2624 warn "hash had some undef elements in list for $name, removed"; 2625 @$val = @new_val; 2626 } 2627 2628 if (not @$val) { 2629 warn "hash has empty list of $name properties (expected at least one)"; 2630 return 0; 2631 } 2632 } 2633 else { 2634 warn "bad multiplicity specifier: $multiplicity"; 2635 } 2636 } 2637 return 1; 2638} 2639 2640 2641# Private. 2642# 2643# Write a complete element with attributes, and subelements written 2644# using call_handlers_write(). The advantage over doing it by hand is 2645# that if some required keys are missing, nothing is written (rather 2646# than an incomplete and invalid element). 2647# 2648sub write_element_with_handlers( $$$$$ ) { 2649 my ($w, $name, $attrs, $handlers, $hash) = @_; 2650 if (not check_multiplicity($handlers, $hash)) { 2651 warn "keys missing in $name hash, not writing"; 2652 return; 2653 } 2654 2655 # Special 'debug' keys written as comments inside the element. 2656 my %debug_keys; 2657 foreach (grep /^debug/, keys %$hash) { 2658 $debug_keys{$_} = delete $hash->{$_}; 2659 } 2660 2661 # Call all the handlers with no writer object and make sure 2662 # they're happy. 2663 # 2664 if (not call_handlers_write(undef, $handlers, { %$hash })) { 2665 warn "bad data inside $name element, not writing\n"; 2666 return; 2667 } 2668 2669 $w->startTag($name, order_attrs(%$attrs)); 2670 foreach (sort keys %debug_keys) { 2671 my $val = $debug_keys{$_}; 2672 $w->comment((defined $val) ? "$_: $val" : $_); 2673 } 2674 call_handlers_write($w, $handlers, $hash); 2675 XMLTV::warn_unknown_keys($name, $hash); 2676 $w->endTag($name); 2677} 2678 2679=pod 2680 2681=back 2682 2683=head1 AUTHOR 2684 2685Ed Avis, ed@membled.com 2686 2687=head1 SEE ALSO 2688 2689The file format is defined by the DTD xmltv.dtd, which is included in 2690the xmltv package along with this module. It should be installed in 2691your systemE<39>s standard place for SGML and XML DTDs. 2692 2693The xmltv package has a web page at 2694<http://xmltv.org/> which carries 2695information about the file format and the various tools and apps which 2696are distributed with this module. 2697 2698=cut 2699 27001; 2701