1# The encoding detection heuristic will choose UTF8 or Latin-1.  The current
2# implementation will usually treat CP1252 (aka "Win-Latin-1") as Latin-1 but
3# can be fooled into seeing it as UTF8.
4#
5# Note 1: Neither guess is 'correct' since even if we choose Latin-1, all the
6#         smart quote symbols will be rendered as control characters
7#
8# Note 2: the guess is only applied if the source POD omits =encoding, so
9#         CP1252 source will render correctly if properly declared
10#
11
# When PERL_CORE is set in the environment, switch into the t/ directory and
# restrict the module search path to ../lib before anything else is loaded.
BEGIN {
    my $in_core = $ENV{PERL_CORE};
    if ($in_core) {
        chdir 't';
        @INC = ('../lib');
    }
}
18
use strict;
use Test;

# Five checks in total: one smoke test plus four encoding-guess scenarios.
BEGIN { plan( tests => 5 ) }

# Test 1: smoke test -- passes as soon as the script starts running.
ok(1);

use Pod::Simple::DumpAsXML;
use Pod::Simple::XMLOutStream;
27
28
# Test 2: an initial, isolated, non-ASCII byte (\x97) triggers the Latin-1
# guess, and the later multi-byte sequence is not considered by the heuristic.

my @output_lines = split m/[\cm\cj]+/, Pod::Simple::XMLOutStream->_out( qq{

=head1 NAME

Em::Dash \x97 \x91CAF\xC9\x92

=cut

} );

# Extract the encoding named in the parser's "Non-ASCII ... Assuming <name>"
# complaint; $guess stays undefined when no such complaint was emitted.
my($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};
if( $guess ) {
  if( $guess eq 'ISO8859-1' ) {
    # Under the Latin-1 guess, byte \x97 is a control character.  Accept it
    # either as the raw byte or as a hexadecimal or decimal numeric character
    # reference.  The references are spelled out in plain ASCII on purpose: a
    # literal em-dash here would be a multi-byte sequence in this source file
    # and could never match ASCII-escaped output.
    if( grep m{Dash (?:\x97|&#x97;|&#151;)}, @output_lines ) {
      ok 1;
    } else {
      ok 0;
      print "# failed to find expected control character in output\n";
    }
  } else {
    ok 0;
    print "# parser guessed wrong encoding expected 'ISO8859-1' got '$guess'\n";
  }
} else {
  ok 0;
  print "# parser failed to detect non-ASCII bytes in input\n";
}
59
60
# Test 3: when the first non-ASCII byte is a CP1252 smart quote (\x91), the
# heuristic is expected to settle on Latin-1.

@output_lines = split(
  m/[\cm\cj]+/,
  Pod::Simple::XMLOutStream->_out( qq{

=head1 NAME

Smart::Quote - \x91FUT\xC9\x92

=cut

} )
);

# Grab the encoding name reported in the "Non-ASCII ... Assuming" complaint.
($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};

if( !$guess ) {
  ok 0;
  print "# parser failed to detect non-ASCII bytes in input\n";
}
elsif( $guess ne 'ISO8859-1' ) {
  ok 0;
  print "# parser guessed wrong encoding expected 'ISO8859-1' got '$guess'\n";
}
else {
  ok 1;
}
85
86
# Test 4: an accented character (\xC9) immediately followed by a 'smart'
# apostrophe (\x92) leads the heuristic to pick UTF8.  This is a rather
# contrived example.

@output_lines = split(
  m/[\cm\cj]+/,
  Pod::Simple::XMLOutStream->_out( qq{

=head1 NAME

Smart::Apostrophe::Fail - L\xC9\x92STRANGE

=cut

} )
);

# Grab the encoding name reported in the "Non-ASCII ... Assuming" complaint.
($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};

if( !$guess ) {
  ok 0;
  print "# parser failed to detect non-ASCII bytes in input\n";
}
elsif( $guess ne 'UTF-8' ) {
  ok 0;
  print "# parser guessed wrong encoding expected 'UTF-8' got '$guess'\n";
}
else {
  ok 1;
}
112
113
# Test 5: the previous example used a CP1252 byte sequence that was also a
# valid UTF8 byte sequence.  Here the sequence is not valid UTF8 (it is too
# short), yet the heuristic still guesses 'wrong' and reports UTF8.  This
# could arguably be 'fixed' by using a less naive regex.

@output_lines = split(
  m/[\cm\cj]+/,
  Pod::Simple::XMLOutStream->_out( qq{

=head1 NAME

Smart::Apostrophe::Fail - L\xE9\x92Strange

=cut

} )
);

# Grab the encoding name reported in the "Non-ASCII ... Assuming" complaint.
($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};

if( !$guess ) {
  ok 0;
  print "# parser failed to detect non-ASCII bytes in input\n";
}
elsif( $guess ne 'UTF-8' ) {
  ok 0;
  print "# parser guessed wrong encoding expected 'UTF-8' got '$guess'\n";
}
else {
  ok 1;
}


# All planned checks have run.
exit;
144