1package XMLTV::ValidateFile;
2
3use strict;
4
5BEGIN {
6    use Exporter   ();
7    our (@ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);
8
9    @ISA         = qw(Exporter);
10    @EXPORT      = qw( );
11    %EXPORT_TAGS = ( );     # eg: TAG => [ qw!name1 name2! ],
12    @EXPORT_OK   = qw/LoadDtd ValidateFile/;
13}
14our @EXPORT_OK;
15
16use XML::LibXML;
17use File::Slurp qw/read_file/;
18use XMLTV::Supplement qw/GetSupplement/;
19
20our $REQUIRE_CHANNEL_ID=1;
21
22my( $dtd, $parser );
23
24=head1 NAME
25
26XMLTV::ValidateFile - Validates an XMLTV file
27
28=head1 DESCRIPTION
29
30Utility library that validates that a file is correct according to
31http://wiki.xmltv.org/index.php/XMLTVFormat.
32
33
34=head1 EXPORTED FUNCTIONS
35
36All these functions are exported on demand.
37
38=over 4
39
40=cut
41
42=item LoadDtd
43
44Load the xmltv dtd. Takes a single parameter which is the name of
45the xmltv dtd file.
46
47LoadDtd must be called before ValidateFile can be called.
48
49=cut
50
51sub LoadDtd {
52    my( $dtd_file ) = @_;
53
54    my $dtd_str = read_file($dtd_file)
55	or die "Failed to read $dtd_file";
56
57    $dtd = XML::LibXML::Dtd->parse_string($dtd_str);
58}
59
60=item ValidateFile
61
62Validate that a file is valid according to the XMLTV dtd and try to check
63that it contains valid information. ValidateFile takes a filename as parameter
64and optionally also a day and an offset and prints error messages to STDERR.
65
66ValidateFile returns a list of errors that it found with the file. Each
67error takes the form of a keyword:
68
69ValidateFile checks the following:
70
71=over
72
73=item notwell
74
75The file is not well-formed XML.
76
77=item notdtd
78
79The file does not follow the XMLTV DTD.
80
81=item unknownid
82
83No channel-entry found for a channelid that is used in a programme-entry.
84
85=item duplicatechannel
86
87More than one channel-entry found for a channelid.
88
89=item noprogrammes
90
91No programme entries were found in the file.
92
93=item channelnoprogramme
94
95There are no programme entries for one of the channels listed with a
96channel-entry.
97
98=item invalidid
99
100An xmltvid does not look like a proper id, i.e. it does not  match
101/^[-a-zA-Z0-9]+(\.[-a-zA-Z0-9]+)+$/.
102
103=item noid
104
105A programme-entry without an id was found.
106
107=item emptytitle
108
109A programme entry with an empty or missing title was found.
110
111=item emptydescription
112
113A programme entry with an empty desc-element was found. The desc-element
114shall be omitted if there is no description.
115
116=item badstart
117
118A programme entry with an invalid start-time was found.
119
120=item badstop
121
122A programme entry with an invalid stop-time was found.
123
124=item badepisode
125
126A programme entry with an invalid episode number was found.
127
128=item badiso8859
129
130The file is encoded in iso-8859 but contains characters that
131have no meaning in iso-8859 (or are control characters).
132If it's iso-8859-1 aka Latin 1 it might be some characters in windows-1252 encoding.
133
134=item badutf8
135
136The file is encoded in utf-8 but contains characters that look strange.
1371) Mis-encoded single characters represented with [EF][BF][BD] bytes
1382) Mis-encoded single characters represented with [C3][AF][C2][BF][C2][BD] bytes
1393) Mis-encoded single characters in range [C2][80-9F]
140
141=back
142
143If no errors are found, an empty list is returned.
144
145=cut
146
147my %errors;
148my %timezoneerrors;
149
150sub ValidateFile {
151    my( $file ) = @_;
152
153    if( not defined( $parser ) ) {
154	$parser = XML::LibXML->new();
155	$parser->line_numbers(1);
156    }
157
158    if( not defined( $dtd ) ) {
159	my $dtd_str = GetSupplement( undef, 'xmltv.dtd');
160	$dtd = XML::LibXML::Dtd->parse_string( $dtd_str );
161    }
162
163    %errors = ();
164
165    my $doc;
166
167    eval { $doc = $parser->parse_file( $file ); };
168
169    if ( $@ ) {
170	w( "The file is not well-formed xml:\n$@ ", 'notwell');
171	return (keys %errors);
172    }
173
174    eval { $doc->validate( $dtd ) };
175    if ( $@ ) {
176	w( "The file is not valid according to the xmltv dtd:\n $@",
177	   'notvalid' );
178	return (keys %errors);
179    }
180
181    if( $doc->encoding() =~ m/^iso-8859-\d+$/i ) {
182	verify_iso8859xx( $file, $doc->encoding() );
183    } elsif( $doc->encoding() =~ m/^utf-8$/i ) {
184	verify_utf8( $file );
185    }
186    verify_entities( $file );
187
188    my $w = sub {
189	my( $p, $msg, $id ) = @_;
190	w( "Line " . $p->line_number() . " $msg", $id );
191    };
192
193    my %channels;
194
195    my $ns = $doc->find( "//channel" );
196
197    foreach my $ch ($ns->get_nodelist) {
198	my $channelid = $ch->findvalue('@id');
199	my $display_name = $ch->findvalue('display-name/text()');
200
201	$w->( $ch, "Invalid channel-id '$channelid'", 'invalidid' )
202	    if $channelid !~ /^[-a-zA-Z0-9]+(\.[-a-zA-Z0-9]+)+$/;
203
204	$w->( $ch, "Duplicate channel-tag for '$channelid'", 'duplicateid' )
205	    if defined( $channels{$channelid} );
206
207	$channels{$channelid} = 0;
208    }
209
210    $ns = $doc->find( "//programme" );
211    if ($ns->size() == 0) {
212	w( "No programme entries found.", 'noprogrammes' );
213	return (keys %errors);
214    }
215
216    foreach my $p ($ns->get_nodelist) {
217	my $channelid = $p->findvalue('@channel');
218	my $start = $p->findvalue('@start');
219	my $stop = $p->findvalue('@stop');
220	my $title = $p->findvalue('title/text()');
221	my $desc;
222	$desc = $p->findvalue('desc/text()')
223	    if $p->findvalue( 'count(desc)' );
224
225	my $xmltv_episode = $p->findvalue('episode-num[@system="xmltv_ns"]' );
226
227	if ($REQUIRE_CHANNEL_ID and not exists( $channels{$channelid} )) {
228	    $w->( $p, "Channel '$channelid' does not have a <channel>-entry.",
229		  'unknownid' );
230	    $channels{$channelid} = 0;
231	}
232
233	$channels{$channelid}++;
234
235	$w->( $p, "Empty title", 'emptytitle' )
236	    if $title =~ /^\s*$/;
237
238	$w->( $p, "Empty description", 'emptydescription' )
239	    if defined($desc) and $desc =~ /^\s*$/;
240
241	$w->( $p, "Invalid start-time '$start'", 'badstart' )
242	    if not verify_time( $start );
243
244	$w->( $p, "Invalid stop-time '$stop'", 'badstop' )
245	    if $stop ne "" and not verify_time( $stop );
246
247	if( $xmltv_episode =~ /\S/ ) {
248	    $w->($p, "Invalid episode-number '$xmltv_episode'", 'badepisode' )
249		if $xmltv_episode !~ /^\s*\d* (\s* \/ \s*\d+)? \s* \.
250		                       \s*\d* (\s* \/ \s*\d+)? \s* \.
251		                       \s*\d* (\s* \/ \s*\d+)? \s* $/x;
252	}
253    }
254
255    foreach my $channel (keys %channels) {
256	if ($channels{$channel} == 0) {
257	    w( "No programme entries found for $channel",
258	       'channelnoprogramme' );
259	}
260    }
261
262    return (keys %errors);
263}
264
265sub verify_time
266{
267    my( $timestamp ) = @_;
268
269    my( $date, $time, $tz ) =
270	($timestamp =~ /^(\d{8})(\d{4,6})(\s+([A-Z]+|[+-]\d{4})){0,1}$/ );
271
272    return 0 unless defined $time;
273
274    if( not defined( $tz ) )
275    {
276	if( not defined( $timezoneerrors{$tz} ) ) {
277	    w( "No timezone specified", 'missingtimezone' );
278	    $timezoneerrors{$tz}++;
279	    return 0;
280	}
281    }
282
283    if( $tz =~ /[a-zA-Z]/ ) {
284	if( not defined( $timezoneerrors{$tz} ) ) {
285	    w( "Invalid timezone '$tz'", 'invalidtimezone' );
286	    $timezoneerrors{$tz}++;
287	    return 0;
288	}
289    }
290
291    return 1;
292}
293
294sub verify_iso8859xx
295{
296    # code points not used in iso-8859 according to http://de.wikipedia.org/wiki/ISO_8859
297    my %unused_iso8859 = (
298        'iso-8859-1'  => undef,
299        'iso-8859-2'  => undef,
300        'iso-8859-3'  => '\xa5\xae\xbe\xc3\xd0\xe3\xf0',
301        'iso-8859-4'  => undef,
302        'iso-8859-5'  => undef,
303        'iso-8859-6'  => '\xa1-\xa3\xa5-\xab\xae-\xba\xbc-\xbe\xc0\xdb-\xdf\xf3-xff',
304        'iso-8859-7'  => '\xae\xd2\xff',
305        'iso-8859-8'  => '\xa1\xbf-\xde\xfb-\xfc\xff',
306        'iso-8859-9'  => undef,
307        'iso-8859-10' => undef,
308        'iso-8859-11' => '\xdb-\xde\xfc-\xff',
309        'iso-8859-12' => undef,
310        'iso-8859-13' => undef,
311        'iso-8859-14' => undef,
312        'iso-8859-15' => undef,
313    );
314    # code points of unusual control characters used in iso-8859 according to http://de.wikipedia.org/wiki/ISO_8859
315    my %unusual_iso8859 = (
316        'iso-8859-1'  => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad',
317        'iso-8859-2'  => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad',
318        'iso-8859-3'  => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad',
319        'iso-8859-4'  => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad',
320        'iso-8859-5'  => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad',
321        'iso-8859-6'  => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad',
322        'iso-8859-7'  => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad',
323        'iso-8859-8'  => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad',
324        'iso-8859-9'  => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad',
325        'iso-8859-10' => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad',
326        'iso-8859-11' => '\x00-\x08\x0b-\x1f\x7f-\xa0',
327        'iso-8859-12' => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad',
328        'iso-8859-13' => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad',
329        'iso-8859-14' => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad',
330        'iso-8859-15' => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad',
331    );
332    my( $filename, $encoding ) = @_;
333    $encoding = lc( $encoding );
334
335    my $file_str = read_file($filename);
336    my $unusual = $unusual_iso8859{$encoding};
337    my $unused = $unused_iso8859{$encoding};
338
339    if( defined( $unusual ) ) {
340        if( $file_str =~ m/[$unusual]+/ ) {
341            my ($hintpre, $hint, $hintpost) = ( $file_str =~ m/(.{0,15})([$unusual]+)(.{0,15})/ );
342            w( "file contains unexpected control characters"
343               . "\nlook here \"" . $hintpre . $hint . $hintpost . "\""
344               . sprintf( "\n%*s", 12+length( $hintpre ) , "^" )
345               , 'badiso8859' );
346        }
347    }
348
349    if( defined( $unused ) ) {
350        if( $file_str =~ m/[$unused]+/ ) {
351            my ($hintpre, $hint, $hintpost) = ( $file_str =~ m/(.{0,15})([$unused]+)(.{0,15})/ );
352            w( "file contains bytes without meaning in " . $encoding
353               . "\nlook here \"" . $hintpre . $hint . $hintpost . "\""
354               . sprintf( "\n%*s", 12+length( $hintpre ) , "^" )
355               , 'badiso8859' );
356        }
357    }
358
359    return 1;
360}
361
362# inspired by utf8 fixups in _uk_rt
363sub verify_utf8 {
364    my( $filename ) = @_;
365
366    my $file_str = read_file($filename);
367
368    # 1) Mis-encoded single characters represented with [EF][BF][BD] bytes
369    if( $file_str =~ m/\xEF\xBF\xBD]/ ) {
370        my ($hintpre, $hint, $hintpost) = ( $file_str =~ m/(.{0,15})(\xEF\xBF\xBD)(.{0,15})/ );
371        w( "file contains misencoded characters"
372           . "\nlook here \"" . $hintpre . $hint . $hintpost . "\""
373           . sprintf( "\n%*s", 11+length( $hintpre ) , "^^^" )
374           , 'badutf8' );
375    }
376
377    # 2) Mis-encoded single characters represented with [C3][AF][C2][BF][C2][BD] bytes
378    if( $file_str =~ m/\xC3\xAF\xC2\xBF\xC2\xBD/ ) {
379        my ($hintpre, $hint, $hintpost) = ( $file_str =~ m/(.{0,15})(\xC3\xAF\xC2\xBF\xC2\xBD)(.{0,15})/ );
380        w( "file contains misencoded characters"
381           . "\nlook here \"" . $hintpre . $hint . $hintpost . "\""
382           . sprintf( "\n%*s", 11+length( $hintpre ) , "^^^^^^" )
383           , 'badutf8' );
384    }
385
386    # 3) Mis-encoded single characters in range [C2][80-9F]
387    if( $file_str =~ m/\xC2[\x80-\x9F]/ ) {
388        my ($hintpre, $hint, $hintpost) = ( $file_str =~ m/(.{0,15})(\xC2[\x80-\x9F])(.{0,15})/ );
389        w( "file contains unexpected control characters, misencoded windows-1252?"
390           . "\nlook here \"" . $hintpre . $hint . $hintpost . "\""
391           . sprintf( "\n%*s", 11+length( $hintpre ) , "^^" )
392           , 'badutf8' );
393    }
394
395    # 4) The first two (C0 and C1) could only be used for overlong encoding of basic ASCII characters.
396    if( $file_str =~ m/[\xC0-\xC1]/ ) {
397        my ($hintpre, $hint, $hintpost) = ( $file_str =~ m/(.{0,15})([\xC0-\xC1])(.{0,15})/ );
398        w( "file contains bytes that should never appear in utf-8"
399           . "\nlook here \"" . $hintpre . $hint . $hintpost . "\""
400           . sprintf( "\n%*s", 11+length( $hintpre ) , "^" )
401           , 'badutf8' );
402    }
403
404    # 5) start bytes of sequences that could only encode numbers larger than the 0x10FFFF limit of Unicode.
405    if( $file_str =~ m/[\xF5-\xFF]/ ) {
406        my ($hintpre, $hint, $hintpost) = ( $file_str =~ m/(.{0,15})([\xF5-\xFF])(.{0,15})/ );
407        w( "file contains bytes that should never appear in utf-8"
408           . "\nlook here \"" . $hintpre . $hint . $hintpost . "\""
409           . sprintf( "\n%*s", 11+length( $hintpre ) , "^" )
410           , 'badutf8' );
411    }
412
413    # 6) first continuation byte missing after start of sequence
414    if( $file_str =~ m/[\xC2-\xF4][\x00-\x7F\xC0-\xFF]/ ) {
415        my ($hintpre, $hint, $hintpost) = ( $file_str =~ m/(.{0,15})([\xC2-\xF4][\x00-\x7F\xC0-\xFF])(.{0,15})/ );
416        w( "file contains an utf-8 sequence with missing continuation bytes"
417           . "\nlook here \"" . $hintpre . $hint . $hintpost . "\""
418           . sprintf( "\n%*s", 11+length( $hintpre )+1 , "^" )
419           , 'badutf8' );
420    }
421
422    # 7) second continuation byte missing after start of sequence
423    if( $file_str =~ m/[\xE0-\xF4][\x80-\xBF][\x00-\x7F\xC0-\xFF]/ ) {
424        my ($hintpre, $hint, $hintpost) = ( $file_str =~ m/(.{0,15})([\xE0-\xF4][\x80-\xBF][\x00-\x7F\xC0-\xFF])(.{0,15})/ );
425        w( "file contains an utf-8 sequence with missing continuation bytes"
426           . "\nlook here \"" . $hintpre . $hint . $hintpost . "\""
427           . sprintf( "\n%*s", 11+length( $hintpre )+2 , "^" )
428           , 'badutf8' );
429    }
430
431    # 8) third continuation byte missing after start of sequence
432    if( $file_str =~ m/[\xF0-\xF4][\x80-\xBF][\x80-\xBF][\x00-\x7F\xC0-\xFF]/ ) {
433        my ($hintpre, $hint, $hintpost) = ( $file_str =~ m/(.{0,15})([\xF0-\xF4][\x80-\xBF][\x80-\xBF][\x00-\x7F\xC0-\xFF])(.{0,15})/ );
434        w( "file contains an utf-8 sequence with missing continuation bytes"
435           . "\nlook here \"" . $hintpre . $hint . $hintpost . "\""
436           . sprintf( "\n%*s", 11+length( $hintpre )+3 , "^" )
437           , 'badutf8' );
438    }
439
440    return 1;
441}
442
443sub verify_entities
444{
445    my( $filename ) = @_;
446
447    my $file_str = read_file($filename);
448
449    if( $file_str =~ m/&[^#].+?;/ ) {
450        my ($entity) = ( $file_str =~ m/&([^#].+?);/ );
451        my %fiveentities = ('quot' => 1, 'amp' => 1, 'apos' => 1, 'lt' => 1, 'gt' => 1);
452        if (!exists($fiveentities{$entity})) {
453            w( "file contains undefined entity: $entity", 'badentity' );
454        }
455    }
456
457    return 1;
458}
459
460sub w {
461    my( $msg, $id ) = @_;
462    print "$msg\n";
463    $errors{$id}++ if defined $id;
464}
465
466
4671;
468
469=back
470
471=head1 BUGS
472
473It is currently necessary to specify the path to the xmltv dtd-file.
474This should not be necessary.
475
476=head1 COPYRIGHT
477
478Copyright (C) 2006 Mattias Holmlund.
479
480This program is free software; you can redistribute it and/or
481modify it under the terms of the GNU General Public License
482as published by the Free Software Foundation; either version 2
483of the License, or (at your option) any later version.
484
485This program is distributed in the hope that it will be useful,
486but WITHOUT ANY WARRANTY; without even the implied warranty of
487MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
488GNU General Public License for more details.
489
490You should have received a copy of the GNU General Public License
491along with this program; if not, write to the Free Software
492Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
493
494=cut
495
496### Setup indentation in Emacs
497## Local Variables:
498## perl-indent-level: 4
499## perl-continued-statement-offset: 4
500## perl-continued-brace-offset: 0
501## perl-brace-offset: -4
502## perl-brace-imaginary-offset: 0
503## perl-label-offset: -2
504## cperl-indent-level: 4
505## cperl-brace-offset: 0
506## cperl-continued-brace-offset: 0
507## cperl-label-offset: -2
508## cperl-extra-newline-before-brace: t
509## cperl-merge-trailing-else: nil
510## cperl-continued-statement-offset: 2
511## indent-tabs-mode: t
512## End:
513