1use strict;
2use warnings;
3use utf8;
4
5use FileHandle   ();
6use HTML::Parser ();
7use Test::More tests => 7;
8
9my $HTML = <<'HTML';
10
11<!DOCTYPE HTML>
12
13<body>
14
15Various entities.  The parser must never break them in the middle:
16
17&#x2F
18&#x2F;
19&#200
20&#3030;
21&#XFFFF;
22&aring-&Aring;
23
24<ul>
25<li><a href="foo 'bar' baz>" id=33>This is a link</a>
26<li><a href='foo "bar" baz> &aring' id=34>This is another one</a>
27</ul>
28
29<p><div align="center"><img src="http://www.perl.com/perl.gif"
30alt="camel"></div>
31
32<!-- this is
33a comment --> and this is not.
34
35<!-- this is the kind of >comment< -- --> that Netscape hates -->
36
37< this > was not a tag. <this is/not either>
38
39</body>
40
41HTML
42
43#-------------------------------------------------------------------
44
45{
46
47    package P;
48    require HTML::Parser;
49    our @ISA   = qw(HTML::Parser);
50    our $OUT   = '';
51    our $COUNT = 0;
52
53    sub new {
54        my $class = shift;
55        my $self  = $class->SUPER::new;
56        $OUT = '';
57        die "Can only have one" if $COUNT++;
58        $self;
59    }
60
61    sub DESTROY {
62        my $self = shift;
63        eval { $self->SUPER::DESTROY; };
64        $COUNT--;
65    }
66
67    sub declaration {
68        my ($self, $decl) = @_;
69        $OUT .= "[[$decl]]|";
70    }
71
72    sub start {
73        my ($self, $tag, $attr) = @_;
74        $attr = join("/", map "$_=$attr->{$_}", sort keys %$attr);
75        $attr = "/$attr" if length $attr;
76        $OUT .= "<<$tag$attr>>|";
77    }
78
79    sub end {
80        my ($self, $tag) = @_;
81        $OUT .= ">>$tag<<|";
82    }
83
84    sub comment {
85        my ($self, $comment) = @_;
86        $OUT .= "##$comment##|";
87    }
88
89    sub text {
90        my ($self, $text) = @_;
91
92        #$text =~ s/\n/\\n/g;
93        #$text =~ s/\t/\\t/g;
94        #$text =~ s/ /�/g;
95        $OUT .= "$text|";
96    }
97
98    sub result {
99        $OUT;
100    }
101}
102
103my $last_res;
104for my $chunksize (64 * 1024, 64, 13, 3, 1, "file", "filehandle") {
105
106#for $chunksize (1) {
107    if ($chunksize =~ /^file/) {
108
109        #print "Parsing from $chunksize";
110    }
111    else {
112        #print "Parsing using $chunksize byte chunks";
113    }
114    my $p = P->new;
115
116    if ($chunksize =~ /^file/) {
117
118        # First we must create the file
119        my $tmpfile = "tmp-$$.html";
120        my $file    = $tmpfile;
121        die "$file already exists" if -e $file;
122        open(my $fh, '>', $file) or die "Can't create $file: $!";
123        binmode $fh;
124        print {$fh} $HTML;
125        close($fh);
126
127        if ($chunksize eq "filehandle") {
128            my $fh = FileHandle->new($file) || die "Can't open $file: $!";
129            $file = $fh;
130        }
131
132        # then we can parse it.
133        $p->parse_file($file);
134        close $file if $chunksize eq "filehandle";
135        unlink($tmpfile) || warn "Can't unlink $tmpfile: $!";
136    }
137    else {
138        my $copy = $HTML;
139        while (length $copy) {
140            my $chunk = substr($copy, 0, $chunksize);
141            substr($copy, 0, $chunksize) = '';
142            $p->parse($chunk);
143        }
144        $p->eof;
145    }
146
147    my $res = $p->result;
148    my $bad;
149
150    # Then we start looking for things that should not happen
151    if ($res =~ /\s\|\s/) {
152        diag "broken space";
153        $bad++;
154    }
155    for (
156        # Make sure entities are not broken
157        '&#x2F', '&#x2F;', '&#200', '&#3030;', '&#XFFFF;', '&aring', '&Aring',
158
159        # Some elements that should be produced
160        "|[[DOCTYPE HTML]]|", "|## this is\na comment ##|",
161        "|<<ul>>|\n|<<li>>|<<a/href=foo 'bar' baz>/id=33>>|",
162        '|<<li>>|<<a/href=foo "bar" baz> å/id=34>>', "|>>ul<<|",
163        "|>>body<<|\n\n|",
164        )
165    {
166        if (index($res, $_) < 0) {
167            diag "Can't find '$_' in parsed document";
168            $bad++;
169        }
170    }
171
172    diag $res if $bad || $ENV{PRINT_RESULTS};
173
174    # And we check that we get the same result all the time
175    $res =~ s/\|//g;    # remove all break marks
176    if ($last_res && $res ne $last_res) {
177        diag "The result is not the same as last time";
178        $bad++;
179    }
180    $last_res = $res;
181
182    unless ($res =~ /Various entities/) {
183        diag "Some text must be missing";
184        $bad++;
185    }
186
187    ok(!$bad);
188}
189