ValidateFile.pm - OpenGrok cross reference for /dports/textproc/p5-xmltv/xmltv-0.5.70/lib/ValidateFile.pm

package XMLTV::ValidateFile;

use strict;

BEGIN {
    use Exporter   ();
    our (@ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);

    @ISA         = qw(Exporter);
    @EXPORT      = qw( );
    %EXPORT_TAGS = ( );     # eg: TAG => [ qw!name1 name2! ],
    @EXPORT_OK   = qw/LoadDtd ValidateFile/;
}
our @EXPORT_OK;

use XML::LibXML;
use File::Slurp qw/read_file/;
use XMLTV::Supplement qw/GetSupplement/;

our $REQUIRE_CHANNEL_ID=1;

my( $dtd, $parser );

=head1 NAME

XMLTV::ValidateFile - Validates an XMLTV file

=head1 DESCRIPTION

Utility library that validates that a file is correct according to
http://wiki.xmltv.org/index.php/XMLTVFormat.


=head1 EXPORTED FUNCTIONS

All these functions are exported on demand.

=over 4

=cut

=item LoadDtd

Load the xmltv dtd. Takes a single parameter which is the name of
the xmltv dtd file.

LoadDtd must be called before ValidateFile can be called.

=cut

sub LoadDtd {
    my( $dtd_file ) = @_;

    my $dtd_str = read_file($dtd_file)
	or die "Failed to read $dtd_file";

    $dtd = XML::LibXML::Dtd->parse_string($dtd_str);
}

=item ValidateFile

Validate that a file is valid according to the XMLTV dtd and try to check
that it contains valid information. ValidateFile takes a filename as parameter
and optionally also a day and an offset and prints error messages to STDERR.

ValidateFile returns a list of errors that it found with the file. Each
error takes the form of a keyword:

ValidateFile checks the following:

=over

=item notwell

The file is not well-formed XML.

=item notdtd

The file does not follow the XMLTV DTD.

=item unknownid

No channel-entry found for a channelid that is used in a programme-entry.

=item duplicatechannel

More than one channel-entry found for a channelid.

=item noprogrammes

No programme entries were found in the file.

=item channelnoprogramme

There are no programme entries for one of the channels listed with a
channel-entry.

=item invalidid

An xmltvid does not look like a proper id, i.e. it does not  match
/^[-a-zA-Z0-9]+(\.[-a-zA-Z0-9]+)+$/.

=item noid

A programme-entry without an id was found.

=item emptytitle

A programme entry with an empty or missing title was found.

=item emptydescription

A programme entry with an empty desc-element was found. The desc-element
shall be omitted if there is no description.

=item badstart

A programme entry with an invalid start-time was found.

=item badstop

A programme entry with an invalid stop-time was found.

=item badepisode

A programme entry with an invalid episode number was found.

=item badiso8859

The file is encoded in iso-8859 but contains characters that
have no meaning in iso-8859 (or are control characters).
If it's iso-8859-1 aka Latin 1 it might be some characters in windows-1252 encoding.

=item badutf8

The file is encoded in utf-8 but contains characters that look strange.
1) Mis-encoded single characters represented with [EF][BF][BD] bytes
2) Mis-encoded single characters represented with [C3][AF][C2][BF][C2][BD] bytes
3) Mis-encoded single characters in range [C2][80-9F]

=back

If no errors are found, an empty list is returned.

=cut

my %errors;
my %timezoneerrors;

sub ValidateFile {
    my( $file ) = @_;

    if( not defined( $parser ) ) {
	$parser = XML::LibXML->new();
	$parser->line_numbers(1);
    }

    if( not defined( $dtd ) ) {
	my $dtd_str = GetSupplement( undef, 'xmltv.dtd');
	$dtd = XML::LibXML::Dtd->parse_string( $dtd_str );
    }

    %errors = ();

    my $doc;

    eval { $doc = $parser->parse_file( $file ); };

    if ( $@ ) {
	w( "The file is not well-formed xml:\n$@ ", 'notwell');
	return (keys %errors);
    }

    eval { $doc->validate( $dtd ) };
    if ( $@ ) {
	w( "The file is not valid according to the xmltv dtd:\n $@",
	   'notvalid' );
	return (keys %errors);
    }

    if( $doc->encoding() =~ m/^iso-8859-\d+$/i ) {
	verify_iso8859xx( $file, $doc->encoding() );
    } elsif( $doc->encoding() =~ m/^utf-8$/i ) {
	verify_utf8( $file );
    }
    verify_entities( $file );

    my $w = sub {
	my( $p, $msg, $id ) = @_;
	w( "Line " . $p->line_number() . " $msg", $id );
    };

    my %channels;

    my $ns = $doc->find( "//channel" );

    foreach my $ch ($ns->get_nodelist) {
	my $channelid = $ch->findvalue('@id');
	my $display_name = $ch->findvalue('display-name/text()');

	$w->( $ch, "Invalid channel-id '$channelid'", 'invalidid' )
	    if $channelid !~ /^[-a-zA-Z0-9]+(\.[-a-zA-Z0-9]+)+$/;

	$w->( $ch, "Duplicate channel-tag for '$channelid'", 'duplicateid' )
	    if defined( $channels{$channelid} );

	$channels{$channelid} = 0;
    }

    $ns = $doc->find( "//programme" );
    if ($ns->size() == 0) {
	w( "No programme entries found.", 'noprogrammes' );
	return (keys %errors);
    }

    foreach my $p ($ns->get_nodelist) {
	my $channelid = $p->findvalue('@channel');
	my $start = $p->findvalue('@start');
	my $stop = $p->findvalue('@stop');
	my $title = $p->findvalue('title/text()');
	my $desc;
	$desc = $p->findvalue('desc/text()')
	    if $p->findvalue( 'count(desc)' );

	my $xmltv_episode = $p->findvalue('episode-num[@system="xmltv_ns"]' );

	if ($REQUIRE_CHANNEL_ID and not exists( $channels{$channelid} )) {
	    $w->( $p, "Channel '$channelid' does not have a <channel>-entry.",
		  'unknownid' );
	    $channels{$channelid} = 0;
	}

	$channels{$channelid}++;

	$w->( $p, "Empty title", 'emptytitle' )
	    if $title =~ /^\s*$/;

	$w->( $p, "Empty description", 'emptydescription' )
	    if defined($desc) and $desc =~ /^\s*$/;

	$w->( $p, "Invalid start-time '$start'", 'badstart' )
	    if not verify_time( $start );

	$w->( $p, "Invalid stop-time '$stop'", 'badstop' )
	    if $stop ne "" and not verify_time( $stop );

	if( $xmltv_episode =~ /\S/ ) {
	    $w->($p, "Invalid episode-number '$xmltv_episode'", 'badepisode' )
		if $xmltv_episode !~ /^\s*\d* (\s* \/ \s*\d+)? \s* \.
		                       \s*\d* (\s* \/ \s*\d+)? \s* \.
		                       \s*\d* (\s* \/ \s*\d+)? \s* $/x;
	}
    }

    foreach my $channel (keys %channels) {
	if ($channels{$channel} == 0) {
	    w( "No programme entries found for $channel",
	       'channelnoprogramme' );
	}
    }

    return (keys %errors);
}

sub verify_time
{
    my( $timestamp ) = @_;

    my( $date, $time, $tz ) =
	($timestamp =~ /^(\d{8})(\d{4,6})(\s+([A-Z]+|[+-]\d{4})){0,1}$/ );

    return 0 unless defined $time;

    if( not defined( $tz ) )
    {
	if( not defined( $timezoneerrors{$tz} ) ) {
	    w( "No timezone specified", 'missingtimezone' );
	    $timezoneerrors{$tz}++;
	    return 0;
	}
    }

    if( $tz =~ /[a-zA-Z]/ ) {
	if( not defined( $timezoneerrors{$tz} ) ) {
	    w( "Invalid timezone '$tz'", 'invalidtimezone' );
	    $timezoneerrors{$tz}++;
	    return 0;
	}
    }

    return 1;
}

sub verify_iso8859xx
{
    # code points not used in iso-8859 according to http://de.wikipedia.org/wiki/ISO_8859
    my %unused_iso8859 = (
        'iso-8859-1'  => undef,
        'iso-8859-2'  => undef,
        'iso-8859-3'  => '\xa5\xae\xbe\xc3\xd0\xe3\xf0',
        'iso-8859-4'  => undef,
        'iso-8859-5'  => undef,
        'iso-8859-6'  => '\xa1-\xa3\xa5-\xab\xae-\xba\xbc-\xbe\xc0\xdb-\xdf\xf3-xff',
        'iso-8859-7'  => '\xae\xd2\xff',
        'iso-8859-8'  => '\xa1\xbf-\xde\xfb-\xfc\xff',
        'iso-8859-9'  => undef,
        'iso-8859-10' => undef,
        'iso-8859-11' => '\xdb-\xde\xfc-\xff',
        'iso-8859-12' => undef,
        'iso-8859-13' => undef,
        'iso-8859-14' => undef,
        'iso-8859-15' => undef,
    );
    # code points of unusual control characters used in iso-8859 according to http://de.wikipedia.org/wiki/ISO_8859
    my %unusual_iso8859 = (
        'iso-8859-1'  => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad',
        'iso-8859-2'  => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad',
        'iso-8859-3'  => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad',
        'iso-8859-4'  => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad',
        'iso-8859-5'  => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad',
        'iso-8859-6'  => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad',
        'iso-8859-7'  => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad',
        'iso-8859-8'  => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad',
        'iso-8859-9'  => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad',
        'iso-8859-10' => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad',
        'iso-8859-11' => '\x00-\x08\x0b-\x1f\x7f-\xa0',
        'iso-8859-12' => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad',
        'iso-8859-13' => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad',
        'iso-8859-14' => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad',
        'iso-8859-15' => '\x00-\x08\x0b-\x1f\x7f-\xa0\xad',
    );
    my( $filename, $encoding ) = @_;
    $encoding = lc( $encoding );

    my $file_str = read_file($filename);
    my $unusual = $unusual_iso8859{$encoding};
    my $unused = $unused_iso8859{$encoding};

    if( defined( $unusual ) ) {
        if( $file_str =~ m/[$unusual]+/ ) {
            my ($hintpre, $hint, $hintpost) = ( $file_str =~ m/(.{0,15})([$unusual]+)(.{0,15})/ );
            w( "file contains unexpected control characters"
               . "\nlook here \"" . $hintpre . $hint . $hintpost . "\""
               . sprintf( "\n%*s", 12+length( $hintpre ) , "^" )
               , 'badiso8859' );
        }
    }

    if( defined( $unused ) ) {
        if( $file_str =~ m/[$unused]+/ ) {
            my ($hintpre, $hint, $hintpost) = ( $file_str =~ m/(.{0,15})([$unused]+)(.{0,15})/ );
            w( "file contains bytes without meaning in " . $encoding
               . "\nlook here \"" . $hintpre . $hint . $hintpost . "\""
               . sprintf( "\n%*s", 12+length( $hintpre ) , "^" )
               , 'badiso8859' );
        }
    }

    return 1;
}

# inspired by utf8 fixups in _uk_rt
sub verify_utf8 {
    my( $filename ) = @_;

    my $file_str = read_file($filename);

    # 1) Mis-encoded single characters represented with [EF][BF][BD] bytes
    if( $file_str =~ m/\xEF\xBF\xBD]/ ) {
        my ($hintpre, $hint, $hintpost) = ( $file_str =~ m/(.{0,15})(\xEF\xBF\xBD)(.{0,15})/ );
        w( "file contains misencoded characters"
           . "\nlook here \"" . $hintpre . $hint . $hintpost . "\""
           . sprintf( "\n%*s", 11+length( $hintpre ) , "^^^" )
           , 'badutf8' );
    }

    # 2) Mis-encoded single characters represented with [C3][AF][C2][BF][C2][BD] bytes
    if( $file_str =~ m/\xC3\xAF\xC2\xBF\xC2\xBD/ ) {
        my ($hintpre, $hint, $hintpost) = ( $file_str =~ m/(.{0,15})(\xC3\xAF\xC2\xBF\xC2\xBD)(.{0,15})/ );
        w( "file contains misencoded characters"
           . "\nlook here \"" . $hintpre . $hint . $hintpost . "\""
           . sprintf( "\n%*s", 11+length( $hintpre ) , "^^^^^^" )
           , 'badutf8' );
    }

    # 3) Mis-encoded single characters in range [C2][80-9F]
    if( $file_str =~ m/\xC2[\x80-\x9F]/ ) {
        my ($hintpre, $hint, $hintpost) = ( $file_str =~ m/(.{0,15})(\xC2[\x80-\x9F])(.{0,15})/ );
        w( "file contains unexpected control characters, misencoded windows-1252?"
           . "\nlook here \"" . $hintpre . $hint . $hintpost . "\""
           . sprintf( "\n%*s", 11+length( $hintpre ) , "^^" )
           , 'badutf8' );
    }

    # 4) The first two (C0 and C1) could only be used for overlong encoding of basic ASCII characters.
    if( $file_str =~ m/[\xC0-\xC1]/ ) {
        my ($hintpre, $hint, $hintpost) = ( $file_str =~ m/(.{0,15})([\xC0-\xC1])(.{0,15})/ );
        w( "file contains bytes that should never appear in utf-8"
           . "\nlook here \"" . $hintpre . $hint . $hintpost . "\""
           . sprintf( "\n%*s", 11+length( $hintpre ) , "^" )
           , 'badutf8' );
    }

    # 5) start bytes of sequences that could only encode numbers larger than the 0x10FFFF limit of Unicode.
    if( $file_str =~ m/[\xF5-\xFF]/ ) {
        my ($hintpre, $hint, $hintpost) = ( $file_str =~ m/(.{0,15})([\xF5-\xFF])(.{0,15})/ );
        w( "file contains bytes that should never appear in utf-8"
           . "\nlook here \"" . $hintpre . $hint . $hintpost . "\""
           . sprintf( "\n%*s", 11+length( $hintpre ) , "^" )
           , 'badutf8' );
    }

    # 6) first continuation byte missing after start of sequence
    if( $file_str =~ m/[\xC2-\xF4][\x00-\x7F\xC0-\xFF]/ ) {
        my ($hintpre, $hint, $hintpost) = ( $file_str =~ m/(.{0,15})([\xC2-\xF4][\x00-\x7F\xC0-\xFF])(.{0,15})/ );
        w( "file contains an utf-8 sequence with missing continuation bytes"
           . "\nlook here \"" . $hintpre . $hint . $hintpost . "\""
           . sprintf( "\n%*s", 11+length( $hintpre )+1 , "^" )
           , 'badutf8' );
    }

    # 7) second continuation byte missing after start of sequence
    if( $file_str =~ m/[\xE0-\xF4][\x80-\xBF][\x00-\x7F\xC0-\xFF]/ ) {
        my ($hintpre, $hint, $hintpost) = ( $file_str =~ m/(.{0,15})([\xE0-\xF4][\x80-\xBF][\x00-\x7F\xC0-\xFF])(.{0,15})/ );
        w( "file contains an utf-8 sequence with missing continuation bytes"
           . "\nlook here \"" . $hintpre . $hint . $hintpost . "\""
           . sprintf( "\n%*s", 11+length( $hintpre )+2 , "^" )
           , 'badutf8' );
    }

    # 8) third continuation byte missing after start of sequence
    if( $file_str =~ m/[\xF0-\xF4][\x80-\xBF][\x80-\xBF][\x00-\x7F\xC0-\xFF]/ ) {
        my ($hintpre, $hint, $hintpost) = ( $file_str =~ m/(.{0,15})([\xF0-\xF4][\x80-\xBF][\x80-\xBF][\x00-\x7F\xC0-\xFF])(.{0,15})/ );
        w( "file contains an utf-8 sequence with missing continuation bytes"
           . "\nlook here \"" . $hintpre . $hint . $hintpost . "\""
           . sprintf( "\n%*s", 11+length( $hintpre )+3 , "^" )
           , 'badutf8' );
    }

    return 1;
}

sub verify_entities
{
    my( $filename ) = @_;

    my $file_str = read_file($filename);

    if( $file_str =~ m/&[^#].+?;/ ) {
        my ($entity) = ( $file_str =~ m/&([^#].+?);/ );
        my %fiveentities = ('quot' => 1, 'amp' => 1, 'apos' => 1, 'lt' => 1, 'gt' => 1);
        if (!exists($fiveentities{$entity})) {
            w( "file contains undefined entity: $entity", 'badentity' );
        }
    }

    return 1;
}

sub w {
    my( $msg, $id ) = @_;
    print "$msg\n";
    $errors{$id}++ if defined $id;
}


1;

=back

=head1 BUGS

It is currently necessary to specify the path to the xmltv dtd-file.
This should not be necessary.

=head1 COPYRIGHT

Copyright (C) 2006 Mattias Holmlund.

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

=cut

### Setup indentation in Emacs
## Local Variables:
## perl-indent-level: 4
## perl-continued-statement-offset: 4
## perl-continued-brace-offset: 0
## perl-brace-offset: -4
## perl-brace-imaginary-offset: 0
## perl-label-offset: -2
## cperl-indent-level: 4
## cperl-brace-offset: 0
## cperl-continued-brace-offset: 0
## cperl-label-offset: -2
## cperl-extra-newline-before-brace: t
## cperl-merge-trailing-else: nil
## cperl-continued-statement-offset: 2
## indent-tabs-mode: t
## End: