1package XMLTV::ValidateFile; 2 3use strict; 4 5BEGIN { 6 use Exporter (); 7 our (@ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS); 8 9 @ISA = qw(Exporter); 10 @EXPORT = qw( ); 11 %EXPORT_TAGS = ( ); # eg: TAG => [ qw!name1 name2! ], 12 @EXPORT_OK = qw/LoadDtd ValidateFile/; 13} 14our @EXPORT_OK; 15 16use XML::LibXML; 17use File::Slurp qw/read_file/; 18use XMLTV::Supplement qw/GetSupplement/; 19 20our $REQUIRE_CHANNEL_ID=1; 21 22my( $dtd, $parser ); 23 24=head1 NAME 25 26XMLTV::ValidateFile - Validates an XMLTV file 27 28=head1 DESCRIPTION 29 30Utility library that validates that a file is correct according to 31http://wiki.xmltv.org/index.php/XMLTVFormat. 32 33 34=head1 EXPORTED FUNCTIONS 35 36All these functions are exported on demand. 37 38=over 4 39 40=cut 41 42=item LoadDtd 43 44Load the xmltv dtd. Takes a single parameter which is the name of 45the xmltv dtd file. 46 47LoadDtd must be called before ValidateFile can be called. 48 49=cut 50 51sub LoadDtd { 52 my( $dtd_file ) = @_; 53 54 my $dtd_str = read_file($dtd_file) 55 or die "Failed to read $dtd_file"; 56 57 $dtd = XML::LibXML::Dtd->parse_string($dtd_str); 58} 59 60=item ValidateFile 61 62Validate that a file is valid according to the XMLTV dtd and try to check 63that it contains valid information. ValidateFile takes a filename as parameter 64and optionally also a day and an offset and prints error messages to STDERR. 65 66ValidateFile returns a list of errors that it found with the file. Each 67error takes the form of a keyword: 68 69ValidateFile checks the following: 70 71=over 72 73=item notwell 74 75The file is not well-formed XML. 76 77=item notdtd 78 79The file does not follow the XMLTV DTD. 80 81=item unknownid 82 83No channel-entry found for a channelid that is used in a programme-entry. 84 85=item duplicatechannel 86 87More than one channel-entry found for a channelid. 88 89=item noprogrammes 90 91No programme entries were found in the file. 92 93=item channelnoprogramme 94 95There are no programme entries for one of the channels listed with a 96channel-entry. 97 98=item invalidid 99 100An xmltvid does not look like a proper id, i.e. it does not match 101/^[-a-zA-Z0-9]+(\.[-a-zA-Z0-9]+)+$/. 102 103=item noid 104 105A programme-entry without an id was found. 106 107=item emptytitle 108 109A programme entry with an empty or missing title was found. 110 111=item emptydescription 112 113A programme entry with an empty desc-element was found. The desc-element 114shall be omitted if there is no description. 115 116=item badstart 117 118A programme entry with an invalid start-time was found. 119 120=item badstop 121 122A programme entry with an invalid stop-time was found. 123 124=item badepisode 125 126A programme entry with an invalid episode number was found. 127 128=item badiso8859 129 130The file is encoded in iso-8859 but contains characters that 131have no meaning in iso-8859 (or are control characters). 132If it's iso-8859-1 aka Latin 1 it might be some characters in windows-1252 encoding. 133 134=item badutf8 135 136The file is encoded in utf-8 but contains characters that look strange. 1371) Mis-encoded single characters represented with [EF][BF][BD] bytes 1382) Mis-encoded single characters represented with [C3][AF][C2][BF][C2][BD] bytes 1393) Mis-encoded single characters in range [C2][80-9F] 140 141=back 142 143If no errors are found, an empty list is returned. 144 145=cut 146 147my %errors; 148my %timezoneerrors; 149 150sub ValidateFile { 151 my( $file ) = @_; 152 153 if( not defined( $parser ) ) { 154 $parser = XML::LibXML->new(); 155 $parser->line_numbers(1); 156 } 157 158 if( not defined( $dtd ) ) { 159 my $dtd_str = GetSupplement( undef, 'xmltv.dtd'); 160 $dtd = XML::LibXML::Dtd->parse_string( $dtd_str ); 161 } 162 163 %errors = (); 164 165 my $doc; 166 167 eval { $doc = $parser->parse_file( $file ); }; 168 169 if ( $@ ) { 170 w( "The file is not well-formed xml:\n$@ ", 'notwell'); 171 return (keys %errors); 172 } 173 174 eval { $doc->validate( $dtd ) }; 175 if ( $@ ) { 176 w( "The file is not valid according to the xmltv dtd:\n $@", 177 'notvalid' ); 178 return (keys %errors); 179 } 180 181 if( $doc->encoding() =~ m/^iso-8859-\d+$/i ) { 182 verify_iso8859xx( $file, $doc->encoding() ); 183 } elsif( $doc->encoding() =~ m/^utf-8$/i ) { 184 verify_utf8( $file ); 185 } 186 verify_entities( $file ); 187 188 my $w = sub { 189 my( $p, $msg, $id ) = @_; 190 w( "Line " . $p->line_number() . " $msg", $id ); 191 }; 192 193 my %channels; 194 195 my $ns = $doc->find( "//channel" ); 196 197 foreach my $ch ($ns->get_nodelist) { 198 my $channelid = $ch->findvalue('@id'); 199 my $display_name = $ch->findvalue('display-name/text()'); 200 201 $w->( $ch, "Invalid channel-id '$channelid'", 'invalidid' ) 202 if $channelid !~ /^[-a-zA-Z0-9]+(\.[-a-zA-Z0-9]+)+$/; 203 204 $w->( $ch, "Duplicate channel-tag for '$channelid'", 'duplicateid' ) 205 if defined( $channels{$channelid} ); 206 207 $channels{$channelid} = 0; 208 } 209 210 $ns = $doc->find( "//programme" ); 211 if ($ns->size() == 0) { 212 w( "No programme entries found.", 'noprogrammes' ); 213 return (keys %errors); 214 } 215 216 foreach my $p ($ns->get_nodelist) { 217 my $channelid = $p->findvalue('@channel'); 218 my $start = $p->findvalue('@start'); 219 my $stop = $p->findvalue('@stop'); 220 my $title = $p->findvalue('title/text()'); 221 my $desc; 222 $desc = $p->findvalue('desc/text()') 223 if $p->findvalue( 'count(desc)' ); 224 225 my $xmltv_episode = $p->findvalue('episode-num[@system="xmltv_ns"]' ); 226 227 if ($REQUIRE_CHANNEL_ID and not exists( $channels{$channelid} )) { 228 $w->( $p, "Channel '$channelid' does not have a <channel>-entry.", 229 'unknownid' ); 230 $channels{$channelid} = 0; 231 } 232 233 $channels{$channelid}++; 234 235 $w->( $p, "Empty title", 'emptytitle' ) 236 if $title =~ /^\s*$/; 237 238 $w->( $p, "Empty description", 'emptydescription' ) 239 if defined($desc) and $desc =~ /^\s*$/; 240 241 $w->( $p, "Invalid start-time '$start'", 'badstart' ) 242 if not verify_time( $start ); 243 244 $w->( $p, "Invalid stop-time '$stop'", 'badstop' ) 245 if $stop ne "" and not verify_time( $stop ); 246 247 if( $xmltv_episode =~ /\S/ ) { 248 $w->($p, "Invalid episode-number '$xmltv_episode'", 'badepisode' ) 249 if $xmltv_episode !~ /^\s*\d* (\s* \/ \s*\d+)? \s* \. 250 \s*\d* (\s* \/ \s*\d+)? \s* \. 251 \s*\d* (\s* \/ \s*\d+)? \s* $/x; 252 } 253 } 254 255 foreach my $channel (keys %channels) { 256 if ($channels{$channel} == 0) { 257 w( "No programme entries found for $channel", 258 'channelnoprogramme' ); 259 } 260 } 261 262 return (keys %errors); 263} 264 265sub verify_time 266{ 267 my( $timestamp ) = @_; 268 269 my( $date, $time, $tz ) = 270 ($timestamp =~ /^(\d{8})(\d{4,6})(\s+([A-Z]+|[+-]\d{4})){0,1}$/ ); 271 272 return 0 unless defined $time; 273 274 if( not defined( $tz ) ) 275 { 276 if( not defined( $timezoneerrors{$tz} ) ) { 277 w( "No timezone specified", 'missingtimezone' ); 278 $timezoneerrors{$tz}++; 279 return 0; 280 } 281 } 282 283 if( $tz =~ /[a-zA-Z]/ ) { 284 if( not defined( $timezoneerrors{$tz} ) ) { 285 w( "Invalid timezone '$tz'", 'invalidtimezone' ); 286 $timezoneerrors{$tz}++; 287 return 0; 288 } 289 } 290 291 return 1; 292} 293 294sub verify_iso8859xx 295{ 296 # code points not used in iso-8859 according to http://de.wikipedia.org/wiki/ISO_8859 297 my %unused_iso8859 = ( 298 'iso-8859-1' => undef, 299 'iso-8859-2' => undef, 300 'iso-8859-3' => '\xa5\xae\xbe\xc3\xd0\xe3\xf0', 301 'iso-8859-4' => undef, 302 'iso-8859-5' => undef, 303 'iso-8859-6' => '\xa1-\xa3\xa5-\xab\xae-\xba\xbc-\xbe\xc0\xdb-\xdf\xf3-xff', 304 'iso-8859-7' => '\xae\xd2\xff', 305 'iso-8859-8' => '\xa1\xbf-\xde\xfb-\xfc\xff', 306 'iso-8859-9' => undef, 307 'iso-8859-10' => undef, 308 'iso-8859-11' => '\xdb-\xde\xfc-\xff', 309 'iso-8859-12' => undef, 310 'iso-8859-13' => undef, 311 'iso-8859-14' => undef, 312 'iso-8859-15' => undef, 313 ); 314 # code points of unusual control characters used in iso-8859 according to http://de.wikipedia.org/wiki/ISO_8859 315 my %unusual_iso8859 = ( 316 'iso-8859-1' => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad', 317 'iso-8859-2' => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad', 318 'iso-8859-3' => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad', 319 'iso-8859-4' => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad', 320 'iso-8859-5' => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad', 321 'iso-8859-6' => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad', 322 'iso-8859-7' => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad', 323 'iso-8859-8' => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad', 324 'iso-8859-9' => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad', 325 'iso-8859-10' => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad', 326 'iso-8859-11' => '\x00-\x08\x0b-\x1f\x7f-\xa0', 327 'iso-8859-12' => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad', 328 'iso-8859-13' => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad', 329 'iso-8859-14' => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad', 330 'iso-8859-15' => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad', 331 ); 332 my( $filename, $encoding ) = @_; 333 $encoding = lc( $encoding ); 334 335 my $file_str = read_file($filename); 336 my $unusual = $unusual_iso8859{$encoding}; 337 my $unused = $unused_iso8859{$encoding}; 338 339 if( defined( $unusual ) ) { 340 if( $file_str =~ m/[$unusual]+/ ) { 341 my ($hintpre, $hint, $hintpost) = ( $file_str =~ m/(.{0,15})([$unusual]+)(.{0,15})/ ); 342 w( "file contains unexpected control characters" 343 . "\nlook here \"" . $hintpre . $hint . $hintpost . "\"" 344 . sprintf( "\n%*s", 12+length( $hintpre ) , "^" ) 345 , 'badiso8859' ); 346 } 347 } 348 349 if( defined( $unused ) ) { 350 if( $file_str =~ m/[$unused]+/ ) { 351 my ($hintpre, $hint, $hintpost) = ( $file_str =~ m/(.{0,15})([$unused]+)(.{0,15})/ ); 352 w( "file contains bytes without meaning in " . $encoding 353 . "\nlook here \"" . $hintpre . $hint . $hintpost . "\"" 354 . sprintf( "\n%*s", 12+length( $hintpre ) , "^" ) 355 , 'badiso8859' ); 356 } 357 } 358 359 return 1; 360} 361 362# inspired by utf8 fixups in _uk_rt 363sub verify_utf8 { 364 my( $filename ) = @_; 365 366 my $file_str = read_file($filename); 367 368 # 1) Mis-encoded single characters represented with [EF][BF][BD] bytes 369 if( $file_str =~ m/\xEF\xBF\xBD]/ ) { 370 my ($hintpre, $hint, $hintpost) = ( $file_str =~ m/(.{0,15})(\xEF\xBF\xBD)(.{0,15})/ ); 371 w( "file contains misencoded characters" 372 . "\nlook here \"" . $hintpre . $hint . $hintpost . "\"" 373 . sprintf( "\n%*s", 11+length( $hintpre ) , "^^^" ) 374 , 'badutf8' ); 375 } 376 377 # 2) Mis-encoded single characters represented with [C3][AF][C2][BF][C2][BD] bytes 378 if( $file_str =~ m/\xC3\xAF\xC2\xBF\xC2\xBD/ ) { 379 my ($hintpre, $hint, $hintpost) = ( $file_str =~ m/(.{0,15})(\xC3\xAF\xC2\xBF\xC2\xBD)(.{0,15})/ ); 380 w( "file contains misencoded characters" 381 . "\nlook here \"" . $hintpre . $hint . $hintpost . "\"" 382 . sprintf( "\n%*s", 11+length( $hintpre ) , "^^^^^^" ) 383 , 'badutf8' ); 384 } 385 386 # 3) Mis-encoded single characters in range [C2][80-9F] 387 if( $file_str =~ m/\xC2[\x80-\x9F]/ ) { 388 my ($hintpre, $hint, $hintpost) = ( $file_str =~ m/(.{0,15})(\xC2[\x80-\x9F])(.{0,15})/ ); 389 w( "file contains unexpected control characters, misencoded windows-1252?" 390 . "\nlook here \"" . $hintpre . $hint . $hintpost . "\"" 391 . sprintf( "\n%*s", 11+length( $hintpre ) , "^^" ) 392 , 'badutf8' ); 393 } 394 395 # 4) The first two (C0 and C1) could only be used for overlong encoding of basic ASCII characters. 396 if( $file_str =~ m/[\xC0-\xC1]/ ) { 397 my ($hintpre, $hint, $hintpost) = ( $file_str =~ m/(.{0,15})([\xC0-\xC1])(.{0,15})/ ); 398 w( "file contains bytes that should never appear in utf-8" 399 . "\nlook here \"" . $hintpre . $hint . $hintpost . "\"" 400 . sprintf( "\n%*s", 11+length( $hintpre ) , "^" ) 401 , 'badutf8' ); 402 } 403 404 # 5) start bytes of sequences that could only encode numbers larger than the 0x10FFFF limit of Unicode. 405 if( $file_str =~ m/[\xF5-\xFF]/ ) { 406 my ($hintpre, $hint, $hintpost) = ( $file_str =~ m/(.{0,15})([\xF5-\xFF])(.{0,15})/ ); 407 w( "file contains bytes that should never appear in utf-8" 408 . "\nlook here \"" . $hintpre . $hint . $hintpost . "\"" 409 . sprintf( "\n%*s", 11+length( $hintpre ) , "^" ) 410 , 'badutf8' ); 411 } 412 413 # 6) first continuation byte missing after start of sequence 414 if( $file_str =~ m/[\xC2-\xF4][\x00-\x7F\xC0-\xFF]/ ) { 415 my ($hintpre, $hint, $hintpost) = ( $file_str =~ m/(.{0,15})([\xC2-\xF4][\x00-\x7F\xC0-\xFF])(.{0,15})/ ); 416 w( "file contains an utf-8 sequence with missing continuation bytes" 417 . "\nlook here \"" . $hintpre . $hint . $hintpost . "\"" 418 . sprintf( "\n%*s", 11+length( $hintpre )+1 , "^" ) 419 , 'badutf8' ); 420 } 421 422 # 7) second continuation byte missing after start of sequence 423 if( $file_str =~ m/[\xE0-\xF4][\x80-\xBF][\x00-\x7F\xC0-\xFF]/ ) { 424 my ($hintpre, $hint, $hintpost) = ( $file_str =~ m/(.{0,15})([\xE0-\xF4][\x80-\xBF][\x00-\x7F\xC0-\xFF])(.{0,15})/ ); 425 w( "file contains an utf-8 sequence with missing continuation bytes" 426 . "\nlook here \"" . $hintpre . $hint . $hintpost . "\"" 427 . sprintf( "\n%*s", 11+length( $hintpre )+2 , "^" ) 428 , 'badutf8' ); 429 } 430 431 # 8) third continuation byte missing after start of sequence 432 if( $file_str =~ m/[\xF0-\xF4][\x80-\xBF][\x80-\xBF][\x00-\x7F\xC0-\xFF]/ ) { 433 my ($hintpre, $hint, $hintpost) = ( $file_str =~ m/(.{0,15})([\xF0-\xF4][\x80-\xBF][\x80-\xBF][\x00-\x7F\xC0-\xFF])(.{0,15})/ ); 434 w( "file contains an utf-8 sequence with missing continuation bytes" 435 . "\nlook here \"" . $hintpre . $hint . $hintpost . "\"" 436 . sprintf( "\n%*s", 11+length( $hintpre )+3 , "^" ) 437 , 'badutf8' ); 438 } 439 440 return 1; 441} 442 443sub verify_entities 444{ 445 my( $filename ) = @_; 446 447 my $file_str = read_file($filename); 448 449 if( $file_str =~ m/&[^#].+?;/ ) { 450 my ($entity) = ( $file_str =~ m/&([^#].+?);/ ); 451 my %fiveentities = ('quot' => 1, 'amp' => 1, 'apos' => 1, 'lt' => 1, 'gt' => 1); 452 if (!exists($fiveentities{$entity})) { 453 w( "file contains undefined entity: $entity", 'badentity' ); 454 } 455 } 456 457 return 1; 458} 459 460sub w { 461 my( $msg, $id ) = @_; 462 print "$msg\n"; 463 $errors{$id}++ if defined $id; 464} 465 466 4671; 468 469=back 470 471=head1 BUGS 472 473It is currently necessary to specify the path to the xmltv dtd-file. 474This should not be necessary. 475 476=head1 COPYRIGHT 477 478Copyright (C) 2006 Mattias Holmlund. 479 480This program is free software; you can redistribute it and/or 481modify it under the terms of the GNU General Public License 482as published by the Free Software Foundation; either version 2 483of the License, or (at your option) any later version. 484 485This program is distributed in the hope that it will be useful, 486but WITHOUT ANY WARRANTY; without even the implied warranty of 487MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 488GNU General Public License for more details. 489 490You should have received a copy of the GNU General Public License 491along with this program; if not, write to the Free Software 492Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 493 494=cut 495 496### Setup indentation in Emacs 497## Local Variables: 498## perl-indent-level: 4 499## perl-continued-statement-offset: 4 500## perl-continued-brace-offset: 0 501## perl-brace-offset: -4 502## perl-brace-imaginary-offset: 0 503## perl-label-offset: -2 504## cperl-indent-level: 4 505## cperl-brace-offset: 0 506## cperl-continued-brace-offset: 0 507## cperl-label-offset: -2 508## cperl-extra-newline-before-brace: t 509## cperl-merge-trailing-else: nil 510## cperl-continued-statement-offset: 2 511## indent-tabs-mode: t 512## End: 513