1use strict; 2use warnings; 3use utf8; 4 5use FileHandle (); 6use HTML::Parser (); 7use Test::More tests => 7; 8 9my $HTML = <<'HTML'; 10 11<!DOCTYPE HTML> 12 13<body> 14 15Various entities. The parser must never break them in the middle: 16 17/ 18/ 19È 20௖ 21 22å-Å 23 24<ul> 25<li><a href="foo 'bar' baz>" id=33>This is a link</a> 26<li><a href='foo "bar" baz> å' id=34>This is another one</a> 27</ul> 28 29<p><div align="center"><img src="http://www.perl.com/perl.gif" 30alt="camel"></div> 31 32<!-- this is 33a comment --> and this is not. 34 35<!-- this is the kind of >comment< -- --> that Netscape hates --> 36 37< this > was not a tag. <this is/not either> 38 39</body> 40 41HTML 42 43#------------------------------------------------------------------- 44 45{ 46 47 package P; 48 require HTML::Parser; 49 our @ISA = qw(HTML::Parser); 50 our $OUT = ''; 51 our $COUNT = 0; 52 53 sub new { 54 my $class = shift; 55 my $self = $class->SUPER::new; 56 $OUT = ''; 57 die "Can only have one" if $COUNT++; 58 $self; 59 } 60 61 sub DESTROY { 62 my $self = shift; 63 eval { $self->SUPER::DESTROY; }; 64 $COUNT--; 65 } 66 67 sub declaration { 68 my ($self, $decl) = @_; 69 $OUT .= "[[$decl]]|"; 70 } 71 72 sub start { 73 my ($self, $tag, $attr) = @_; 74 $attr = join("/", map "$_=$attr->{$_}", sort keys %$attr); 75 $attr = "/$attr" if length $attr; 76 $OUT .= "<<$tag$attr>>|"; 77 } 78 79 sub end { 80 my ($self, $tag) = @_; 81 $OUT .= ">>$tag<<|"; 82 } 83 84 sub comment { 85 my ($self, $comment) = @_; 86 $OUT .= "##$comment##|"; 87 } 88 89 sub text { 90 my ($self, $text) = @_; 91 92 #$text =~ s/\n/\\n/g; 93 #$text =~ s/\t/\\t/g; 94 #$text =~ s/ /�/g; 95 $OUT .= "$text|"; 96 } 97 98 sub result { 99 $OUT; 100 } 101} 102 103my $last_res; 104for my $chunksize (64 * 1024, 64, 13, 3, 1, "file", "filehandle") { 105 106#for $chunksize (1) { 107 if ($chunksize =~ /^file/) { 108 109 #print "Parsing from $chunksize"; 110 } 111 else { 112 #print "Parsing using $chunksize byte chunks"; 113 } 114 my $p = P->new; 115 116 if ($chunksize =~ /^file/) { 117 118 # First we must create the file 119 my $tmpfile = "tmp-$$.html"; 120 my $file = $tmpfile; 121 die "$file already exists" if -e $file; 122 open(my $fh, '>', $file) or die "Can't create $file: $!"; 123 binmode $fh; 124 print {$fh} $HTML; 125 close($fh); 126 127 if ($chunksize eq "filehandle") { 128 my $fh = FileHandle->new($file) || die "Can't open $file: $!"; 129 $file = $fh; 130 } 131 132 # then we can parse it. 133 $p->parse_file($file); 134 close $file if $chunksize eq "filehandle"; 135 unlink($tmpfile) || warn "Can't unlink $tmpfile: $!"; 136 } 137 else { 138 my $copy = $HTML; 139 while (length $copy) { 140 my $chunk = substr($copy, 0, $chunksize); 141 substr($copy, 0, $chunksize) = ''; 142 $p->parse($chunk); 143 } 144 $p->eof; 145 } 146 147 my $res = $p->result; 148 my $bad; 149 150 # Then we start looking for things that should not happen 151 if ($res =~ /\s\|\s/) { 152 diag "broken space"; 153 $bad++; 154 } 155 for ( 156 # Make sure entities are not broken 157 '/', '/', 'È', '௖', '', 'å', 'Å', 158 159 # Some elements that should be produced 160 "|[[DOCTYPE HTML]]|", "|## this is\na comment ##|", 161 "|<<ul>>|\n|<<li>>|<<a/href=foo 'bar' baz>/id=33>>|", 162 '|<<li>>|<<a/href=foo "bar" baz> å/id=34>>', "|>>ul<<|", 163 "|>>body<<|\n\n|", 164 ) 165 { 166 if (index($res, $_) < 0) { 167 diag "Can't find '$_' in parsed document"; 168 $bad++; 169 } 170 } 171 172 diag $res if $bad || $ENV{PRINT_RESULTS}; 173 174 # And we check that we get the same result all the time 175 $res =~ s/\|//g; # remove all break marks 176 if ($last_res && $res ne $last_res) { 177 diag "The result is not the same as last time"; 178 $bad++; 179 } 180 $last_res = $res; 181 182 unless ($res =~ /Various entities/) { 183 diag "Some text must be missing"; 184 $bad++; 185 } 186 187 ok(!$bad); 188} 189