1 // Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 use super::*;
11 use data::*;
12 use handles::*;
13 use variant::*;
14 // Rust 1.14.0 requires the following despite the asterisk above.
15 use super::in_inclusive_range32;
16 
17 pub struct Big5Decoder {
18     lead: Option<u8>,
19 }
20 
21 impl Big5Decoder {
new() -> VariantDecoder22     pub fn new() -> VariantDecoder {
23         VariantDecoder::Big5(Big5Decoder { lead: None })
24     }
25 
in_neutral_state(&self) -> bool26     pub fn in_neutral_state(&self) -> bool {
27         self.lead.is_none()
28     }
29 
plus_one_if_lead(&self, byte_length: usize) -> Option<usize>30     fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
31         byte_length.checked_add(match self.lead {
32             None => 0,
33             Some(_) => 1,
34         })
35     }
36 
max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize>37     pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
38         // If there is a lead but the next byte isn't a valid trail, an
39         // error is generated for the lead (+1). Then another iteration checks
40         // space, which needs +1 to account for the possibility of astral
41         // output or combining pair.
42         checked_add(1, self.plus_one_if_lead(byte_length))
43     }
44 
max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize>45     pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
46         // No need to account for REPLACEMENT CHARACTERS.
47         // Cases:
48         // ASCII: 1 to 1
49         // Valid pair: 2 to 2, 2 to 3 or 2 to 4, i.e. worst case 2 to 4
50         // lead set and first byte is trail: 1 to 4 worst case
51         //
52         // When checking for space for the last byte:
53         // no lead: the last byte must be ASCII (or fatal error): 1 to 1
54         // lead set: space for 4 bytes was already checked when reading the
55         // lead, hence the last lead and the last trail together are worst
56         // case 2 to 4.
57         //
58         // If lead set and the input is a single trail byte, the worst-case
59         // output is 4, so we need to add one before multiplying if lead is
60         // set.
61         //
62         // Finally, add two so that if input is non-zero, the output is at
63         // least 4.
64         checked_add(2, checked_mul(2, self.plus_one_if_lead(byte_length)))
65     }
66 
max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize>67     pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
68         // If there is a lead but the next byte isn't a valid trail, an
69         // error is generated for the lead (+(1*3)). Then another iteration
70         // checks space, which needs +3 to account for the possibility of astral
71         // output or combining pair. In between start and end, the worst case
72         // is that every byte is bad: *3.
73         checked_add(3, checked_mul(3, self.plus_one_if_lead(byte_length)))
74     }
75 
76     ascii_compatible_two_byte_decoder_functions!(
77         {
78             // If lead is between 0x81 and 0xFE, inclusive,
79             // subtract offset 0x81.
80             let non_ascii_minus_offset =
81                 non_ascii.wrapping_sub(0x81);
82             if non_ascii_minus_offset > (0xFE - 0x81) {
83                 return (DecoderResult::Malformed(1, 0),
84                         source.consumed(),
85                         handle.written());
86             }
87             non_ascii_minus_offset
88         },
89         {
90             // If trail is between 0x40 and 0x7E, inclusive,
91             // subtract offset 0x40. Else if trail is
92             // between 0xA1 and 0xFE, inclusive, subtract
93             // offset 0x62.
94             // TODO: Find out which range is more probable.
95             let mut trail_minus_offset =
96                 byte.wrapping_sub(0x40);
97             if trail_minus_offset > (0x7E - 0x40) {
98                 let trail_minus_range_start =
99                     byte.wrapping_sub(0xA1);
100                 if trail_minus_range_start >
101                    (0xFE - 0xA1) {
102                     if byte < 0x80 {
103                         return (DecoderResult::Malformed(1, 0),
104                                 unread_handle_trail.unread(),
105                                 handle.written());
106                     }
107                     return (DecoderResult::Malformed(2, 0),
108                             unread_handle_trail.consumed(),
109                             handle.written());
110                 }
111                 trail_minus_offset = byte - 0x62;
112             }
113             let pointer = lead_minus_offset as usize *
114                           157usize +
115                           trail_minus_offset as usize;
116             let rebased_pointer = pointer.wrapping_sub(942);
117             let low_bits = big5_low_bits(rebased_pointer);
118             if low_bits == 0 {
119                 match pointer {
120                     1133 => {
121                         handle.write_big5_combination(0x00CAu16,
122                                                       0x0304u16)
123                     }
124                     1135 => {
125                         handle.write_big5_combination(0x00CAu16,
126                                                       0x030Cu16)
127                     }
128                     1164 => {
129                         handle.write_big5_combination(0x00EAu16,
130                                                       0x0304u16)
131                     }
132                     1166 => {
133                         handle.write_big5_combination(0x00EAu16,
134                                                       0x030Cu16)
135                     }
136                     _ => {
137                         if byte < 0x80 {
138                             return (DecoderResult::Malformed(1, 0),
139                                     unread_handle_trail.unread(),
140                                     handle.written());
141                         }
142                         return (DecoderResult::Malformed(2, 0),
143                                 unread_handle_trail.consumed(),
144                                 handle.written());
145                     }
146                 }
147             } else if big5_is_astral(rebased_pointer) {
148                 handle.write_astral(u32::from(low_bits) |
149                                     0x20000u32)
150             } else {
151                 handle.write_bmp_excl_ascii(low_bits)
152             }
153         },
154         self,
155         non_ascii,
156         byte,
157         lead_minus_offset,
158         unread_handle_trail,
159         source,
160         handle,
161         'outermost,
162         copy_ascii_from_check_space_astral,
163         check_space_astral,
164         false);
165 }
166 
167 pub struct Big5Encoder;
168 
169 impl Big5Encoder {
new(encoding: &'static Encoding) -> Encoder170     pub fn new(encoding: &'static Encoding) -> Encoder {
171         Encoder::new(encoding, VariantEncoder::Big5(Big5Encoder))
172     }
173 
max_buffer_length_from_utf16_without_replacement( &self, u16_length: usize, ) -> Option<usize>174     pub fn max_buffer_length_from_utf16_without_replacement(
175         &self,
176         u16_length: usize,
177     ) -> Option<usize> {
178         // Astral: 2 to 2
179         // ASCII: 1 to 1
180         // Other: 1 to 2
181         u16_length.checked_mul(2)
182     }
183 
max_buffer_length_from_utf8_without_replacement( &self, byte_length: usize, ) -> Option<usize>184     pub fn max_buffer_length_from_utf8_without_replacement(
185         &self,
186         byte_length: usize,
187     ) -> Option<usize> {
188         // Astral: 4 to 2
189         // Upper BMP: 3 to 2
190         // Lower BMP: 2 to 2
191         // ASCII: 1 to 1
192         byte_length.checked_add(1)
193     }
194 
195     ascii_compatible_encoder_functions!(
196         {
197             // For simplicity, unified ideographs
198             // in the pointer range 11206...11212 are handled
199             // as Level 1 Hanzi.
200             if let Some((lead, trail)) = big5_level1_hanzi_encode(bmp) {
201                 handle.write_two(lead, trail)
202             } else {
203                 let pointer = if let Some(pointer) = big5_box_encode(bmp) {
204                     pointer
205                 } else if let Some(pointer) = big5_other_encode(bmp) {
206                     pointer
207                 } else {
208                     return (
209                         EncoderResult::unmappable_from_bmp(bmp),
210                         source.consumed(),
211                         handle.written(),
212                     );
213                 };
214                 let lead = pointer / 157 + 0x81;
215                 let remainder = pointer % 157;
216                 let trail = if remainder < 0x3F {
217                     remainder + 0x40
218                 } else {
219                     remainder + 0x62
220                 };
221                 handle.write_two(lead as u8, trail as u8)
222             }
223         },
224         {
225             if in_inclusive_range32(astral as u32, 0x2008A, 0x2F8A6) {
226                 if let Some(rebased_pointer) = big5_astral_encode(astral as u16) {
227                     // big5_astral_encode returns rebased pointer,
228                     // so adding 0x87 instead of 0x81.
229                     let lead = rebased_pointer / 157 + 0x87;
230                     let remainder = rebased_pointer % 157;
231                     let trail = if remainder < 0x3F {
232                         remainder + 0x40
233                     } else {
234                         remainder + 0x62
235                     };
236                     handle.write_two(lead as u8, trail as u8)
237                 } else {
238                     return (
239                         EncoderResult::Unmappable(astral),
240                         source.consumed(),
241                         handle.written(),
242                     );
243                 }
244             } else {
245                 return (
246                     EncoderResult::Unmappable(astral),
247                     source.consumed(),
248                     handle.written(),
249                 );
250             }
251         },
252         bmp,
253         astral,
254         self,
255         source,
256         handle,
257         copy_ascii_to_check_space_two,
258         check_space_two,
259         false
260     );
261 }
262 
263 // Any copyright to the test code below this comment is dedicated to the
264 // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
265 
#[cfg(test)]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Decodes `bytes` as Big5 via the shared `decode` test helper and
    /// checks the result against `expect`.
    fn decode_big5(bytes: &[u8], expect: &str) {
        decode(BIG5, bytes, expect);
    }

    /// Encodes `string` as Big5 via the shared `encode` test helper and
    /// checks the result against `expect`.
    fn encode_big5(string: &str, expect: &[u8]) {
        encode(BIG5, string, expect);
    }

    #[test]
    fn test_big5_decode() {
        // Each entry is (Big5 input bytes, expected decoded text).
        let cases: &[(&[u8], &str)] = &[
            // Empty
            (b"", ""),
            // ASCII
            (&[0x61, 0x62], "\u{0061}\u{0062}"),
            // Edge cases
            (&[0x87, 0x40], "\u{43F0}"),
            (&[0xFE, 0xFE], "\u{79D4}"),
            (&[0xFE, 0xFD], "\u{2910D}"),
            (&[0x88, 0x62], "\u{00CA}\u{0304}"),
            (&[0x88, 0x64], "\u{00CA}\u{030C}"),
            (&[0x88, 0x66], "\u{00CA}"),
            (&[0x88, 0xA3], "\u{00EA}\u{0304}"),
            (&[0x88, 0xA5], "\u{00EA}\u{030C}"),
            (&[0x88, 0xA7], "\u{00EA}"),
            (&[0x99, 0xD4], "\u{8991}"),
            (&[0x99, 0xD5], "\u{27967}"),
            (&[0x99, 0xD6], "\u{8A29}"),
            // Edge cases surrounded with ASCII
            (&[0x61, 0x87, 0x40, 0x62], "\u{0061}\u{43F0}\u{0062}"),
            (&[0x61, 0xFE, 0xFE, 0x62], "\u{0061}\u{79D4}\u{0062}"),
            (&[0x61, 0xFE, 0xFD, 0x62], "\u{0061}\u{2910D}\u{0062}"),
            (&[0x61, 0x88, 0x62, 0x62], "\u{0061}\u{00CA}\u{0304}\u{0062}"),
            (&[0x61, 0x88, 0x64, 0x62], "\u{0061}\u{00CA}\u{030C}\u{0062}"),
            (&[0x61, 0x88, 0x66, 0x62], "\u{0061}\u{00CA}\u{0062}"),
            (&[0x61, 0x88, 0xA3, 0x62], "\u{0061}\u{00EA}\u{0304}\u{0062}"),
            (&[0x61, 0x88, 0xA5, 0x62], "\u{0061}\u{00EA}\u{030C}\u{0062}"),
            (&[0x61, 0x88, 0xA7, 0x62], "\u{0061}\u{00EA}\u{0062}"),
            (&[0x61, 0x99, 0xD4, 0x62], "\u{0061}\u{8991}\u{0062}"),
            (&[0x61, 0x99, 0xD5, 0x62], "\u{0061}\u{27967}\u{0062}"),
            (&[0x61, 0x99, 0xD6, 0x62], "\u{0061}\u{8A29}\u{0062}"),
            // Bad sequences
            (&[0x80, 0x61], "\u{FFFD}\u{0061}"),
            (&[0xFF, 0x61], "\u{FFFD}\u{0061}"),
            (&[0xFE, 0x39], "\u{FFFD}\u{0039}"),
            (&[0x87, 0x66], "\u{FFFD}\u{0066}"),
            (&[0x81, 0x40], "\u{FFFD}\u{0040}"),
            (&[0x61, 0x81], "\u{0061}\u{FFFD}"),
        ];
        for &(bytes, expect) in cases {
            decode_big5(bytes, expect);
        }
    }

    #[test]
    fn test_big5_encode() {
        // Each entry is (input text, expected Big5 output bytes).
        let cases: &[(&str, &[u8])] = &[
            // Empty
            ("", b""),
            // ASCII
            ("\u{0061}\u{0062}", b"\x61\x62"),
            // Edge cases
            ("\u{9EA6}\u{0061}", b"&#40614;\x61"),
            ("\u{2626B}\u{0061}", b"&#156267;\x61"),
            ("\u{3000}", b"\xA1\x40"),
            ("\u{20AC}", b"\xA3\xE1"),
            ("\u{4E00}", b"\xA4\x40"),
            ("\u{27607}", b"\xC8\xA4"),
            ("\u{FFE2}", b"\xC8\xCD"),
            ("\u{79D4}", b"\xFE\xFE"),
            // Not in index
            ("\u{2603}\u{0061}", b"&#9731;\x61"),
            // duplicate low bits
            ("\u{203B5}", b"\xFD\x6A"),
            ("\u{25605}", b"\xFE\x46"),
            // prefer last
            ("\u{2550}", b"\xF9\xF9"),
        ];
        for &(string, expect) in cases {
            encode_big5(string, expect);
        }
    }

    /// Decodes the bundled input fixture in one call and compares it with
    /// the bundled reference text; the fixture is expected to report errors.
    #[test]
    fn test_big5_decode_all() {
        let input = include_bytes!("test_data/big5_in.txt");
        let expectation = include_str!("test_data/big5_in_ref.txt");
        let decoded = BIG5.decode_without_bom_handling(input);
        let (cow, had_errors) = decoded;
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&cow[..], expectation);
    }

    /// Encodes the bundled text fixture in one call and compares it with
    /// the bundled reference bytes; no errors are expected and the output
    /// encoding must be Big5.
    #[test]
    fn test_big5_encode_all() {
        let input = include_str!("test_data/big5_out.txt");
        let expectation = include_bytes!("test_data/big5_out_ref.txt");
        let encoded = BIG5.encode(input);
        let (cow, encoding, had_errors) = encoded;
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(encoding, BIG5);
        assert_eq!(&cow[..], &expectation[..]);
    }

    /// Two consecutive low surrogates must each come out as the numeric
    /// character reference for U+FFFD.
    #[test]
    fn test_big5_encode_from_two_low_surrogates() {
        let expectation = b"&#65533;&#65533;";
        let lone_low_surrogates = [0xDC00u16, 0xDEDEu16];
        let mut output = [0u8; 40];
        let mut encoder = BIG5.new_encoder();
        let (result, read, written, had_errors) =
            encoder.encode_from_utf16(&lone_low_surrogates, &mut output[..], true);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 2);
        assert_eq!(written, expectation.len());
        assert!(had_errors);
        assert_eq!(&output[..written], expectation);
    }
}
422