# The encoding detection heuristic will choose UTF8 or Latin-1.  The current
# implementation will usually treat CP1252 (aka "Win-Latin-1") as Latin-1 but
# can be fooled into seeing it as UTF8.
#
# Note 1: Neither guess is 'correct' since even if we choose Latin-1, all the
#         smart quote symbols will be rendered as control characters
#
# Note 2: the guess is only applied if the source POD omits =encoding, so
#         CP1252 source will render correctly if properly declared
#
# Each case below feeds a small POD document (with no =encoding line) through
# Pod::Simple::XMLOutStream->_out, splits the result into lines, and pulls the
# guessed encoding name out of the "Non-ASCII ... Assuming <name>" text found
# in that output.

BEGIN {
    # When run from the perl core build, work from t/ against the core lib.
    if($ENV{PERL_CORE}) {
        chdir 't';
        @INC = '../lib';
    }
}

use strict;
use warnings;
use Test;
BEGIN { plan tests => 5 };

# Test 1: the script compiled and the plan was accepted.
ok 1;

use Pod::Simple::DumpAsXML;
use Pod::Simple::XMLOutStream;


# Test 2:
# Initial, isolated, non-ASCII byte triggers Latin-1 guess and later
# multi-byte sequence is not considered by heuristic.

my @output_lines = split m/[\cm\cj]+/, Pod::Simple::XMLOutStream->_out( qq{

=head1 NAME

Em::Dash \x97 \x91CAF\xC9\x92

=cut

} );

my($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};
if( $guess ) {
    if( $guess eq 'ISO8859-1' ) {
        # The 0x97 byte must survive as the C1 control character, either as
        # the raw byte or as a numeric character reference (hex or decimal).
        if( grep m{Dash (\x97|&#x97;|&#151;)}, @output_lines ) {
            ok 1;
        } else {
            ok 0;
            print "# failed to find expected control character in output\n";
        }
    } else {
        ok 0;
        print "# parser guessed wrong encoding expected 'ISO8859-1' got '$guess'\n";
    }
} else {
    ok 0;
    print "# parser failed to detect non-ASCII bytes in input\n";
}


# Test 3:
# Initial smart-quote character triggers Latin-1 guess as expected

@output_lines = split m/[\cm\cj]+/, Pod::Simple::XMLOutStream->_out( qq{

=head1 NAME

Smart::Quote - \x91FUT\xC9\x92

=cut

} );

($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};
if( $guess ) {
    if( $guess eq 'ISO8859-1' ) {
        ok 1;
    } else {
        ok 0;
        print "# parser guessed wrong encoding expected 'ISO8859-1' got '$guess'\n";
    }
} else {
    ok 0;
    print "# parser failed to detect non-ASCII bytes in input\n";
}


# Test 4:
# Initial accented character followed by 'smart' apostrophe causes heuristic
# to choose UTF8 (a rather contrived example)

@output_lines = split m/[\cm\cj]+/, Pod::Simple::XMLOutStream->_out( qq{

=head1 NAME

Smart::Apostrophe::Fail - L\xC9\x92STRANGE

=cut

} );

($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};
if( $guess ) {
    if( $guess eq 'UTF-8' ) {
        ok 1;
    } else {
        ok 0;
        print "# parser guessed wrong encoding expected 'UTF-8' got '$guess'\n";
    }
} else {
    ok 0;
    print "# parser failed to detect non-ASCII bytes in input\n";
}


# Test 5:
# The previous example used a CP1252 byte sequence that also happened to be a
# valid UTF8 byte sequence.  In this example the heuristic also guesses 'wrong'
# despite the byte sequence not being valid UTF8 (it's too short).  This could
# arguably be 'fixed' by using a less naive regex.

@output_lines = split m/[\cm\cj]+/, Pod::Simple::XMLOutStream->_out( qq{

=head1 NAME

Smart::Apostrophe::Fail - L\xE9\x92Strange

=cut

} );

($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};
if( $guess ) {
    if( $guess eq 'UTF-8' ) {
        ok 1;
    } else {
        ok 0;
        print "# parser guessed wrong encoding expected 'UTF-8' got '$guess'\n";
    }
} else {
    ok 0;
    print "# parser failed to detect non-ASCII bytes in input\n";
}


exit;