1 // Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9
10 use handles::*;
11 use data::*;
12 use variant::*;
13 use super::*;
14 // Rust 1.14.0 requires the following despite the asterisk above.
15 use super::in_inclusive_range16;
16
/// States of the ISO-2022-JP decoder state machine.
///
/// The first four variants are the "output" states selected by a complete
/// escape sequence; the remaining three are transient states used while a
/// two-byte character or an escape sequence is being read.
#[derive(Copy,Clone)]
enum Iso2022JpDecoderState {
    /// Selected by `ESC ( B`; also the initial state.
    Ascii,
    /// Selected by `ESC ( J`; like ASCII except that 0x5C decodes to U+00A5
    /// and 0x7E decodes to U+203E.
    Roman,
    /// Selected by `ESC ( I`; bytes 0x21..=0x5F decode to half-width
    /// katakana starting at U+FF61.
    Katakana,
    /// Selected by `ESC $ @` or `ESC $ B`; the next byte is expected to be
    /// the first byte of a two-byte sequence.
    LeadByte,
    /// A lead byte has been stored; the next byte completes the two-byte
    /// sequence.
    TrailByte,
    /// 0x1B has been seen; expecting 0x24 (`$`) or 0x28 (`(`).
    EscapeStart,
    /// 0x1B followed by 0x24 or 0x28 has been seen; expecting the final
    /// byte of the escape sequence.
    Escape,
}
27
/// Streaming decoder state for ISO-2022-JP.
pub struct Iso2022JpDecoder {
    /// Current state of the state machine, including the transient
    /// lead/trail/escape states.
    decoder_state: Iso2022JpDecoderState,
    /// State most recently selected by a complete escape sequence; the
    /// decoder falls back to it when an escape sequence turns out invalid.
    output_state: Iso2022JpDecoderState, // only takes 1 of first 4 values
    /// Either the stored first byte of a two-byte sequence or the byte
    /// (0x24 / 0x28) that followed 0x1B in an escape sequence; zero when
    /// nothing is stored.
    lead: u8,
    /// Set when an escape sequence has just completed and cleared when any
    /// other byte is processed; used to flag an escape sequence that
    /// immediately follows another one as malformed.
    output_flag: bool,
    /// True when `lead` holds a byte from a rejected escape sequence that
    /// still has to be processed as ordinary input on the next call.
    pending_prepended: bool,
}
35
36 impl Iso2022JpDecoder {
new() -> VariantDecoder37 pub fn new() -> VariantDecoder {
38 VariantDecoder::Iso2022Jp(
39 Iso2022JpDecoder {
40 decoder_state: Iso2022JpDecoderState::Ascii,
41 output_state: Iso2022JpDecoderState::Ascii,
42 lead: 0u8,
43 output_flag: false,
44 pending_prepended: false,
45 }
46 )
47 }
48
extra_to_input_from_state(&self, byte_length: usize) -> Option<usize>49 fn extra_to_input_from_state(&self, byte_length: usize) -> Option<usize> {
50 byte_length.checked_add(
51 if self.lead == 0 || self.pending_prepended {
52 0
53 } else {
54 1
55 } +
56 match self.decoder_state {
57 Iso2022JpDecoderState::Escape |
58 Iso2022JpDecoderState::EscapeStart => 1,
59 _ => 0,
60 }
61 )
62 }
63
extra_to_output_from_state(&self) -> usize64 fn extra_to_output_from_state(&self) -> usize {
65 if self.lead != 0 && self.pending_prepended {
66 1 + self.output_flag as usize
67 } else {
68 self.output_flag as usize
69 }
70 }
71
max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize>72 pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
73 checked_add(
74 self.extra_to_output_from_state(),
75 self.extra_to_input_from_state(byte_length),
76 )
77 }
78
max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize>79 pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
80 // worst case: 1 to 3 (half-width katakana)
81 self.max_utf8_buffer_length(byte_length)
82 }
83
max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize>84 pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
85 checked_mul(
86 3,
87 checked_add(
88 self.extra_to_output_from_state(),
89 self.extra_to_input_from_state(byte_length),
90 ),
91 )
92 }
93
// Generates the decode functions for ISO-2022-JP from the code blocks
// below. NOTE(review): the role of each block (preamble, per-iteration
// preamble, end-of-input handling, per-byte body) is defined by the
// `decoder_functions!` macro, which is not visible here; the labels in the
// comments below are inferred from what each block does — confirm against
// the macro definition.
decoder_functions!(
    {
        // Preamble: flush a byte left over from a rejected escape sequence
        // before reading any new input.
        if self.pending_prepended {
            // lead was set in EscapeStart and "prepended"
            // in Escape.
            debug_assert!(self.lead == 0x24u8 || self.lead == 0x28u8);
            match dest.check_space_bmp() {
                Space::Full(_) => {
                    return (DecoderResult::OutputFull, 0, 0);
                }
                Space::Available(destination_handle) => {
                    self.pending_prepended = false;
                    self.output_flag = false;
                    // Interpret the leftover byte according to the state
                    // that was in effect before the escape attempt.
                    match self.decoder_state {
                        Iso2022JpDecoderState::Ascii |
                        Iso2022JpDecoderState::Roman => {
                            destination_handle.write_ascii(self.lead);
                            self.lead = 0x0u8;
                        }
                        Iso2022JpDecoderState::Katakana => {
                            destination_handle
                                .write_upper_bmp(self.lead as u16 - 0x21u16 + 0xFF61u16);
                            self.lead = 0x0u8;
                        }
                        Iso2022JpDecoderState::LeadByte => {
                            // The leftover byte stays in `lead` and acts as
                            // the first byte of a two-byte sequence.
                            self.decoder_state = Iso2022JpDecoderState::TrailByte;
                        }
                        _ => unreachable!(),
                    }
                }
            }
        }
    },
    {},
    {
        // Presumably the end-of-input step: an unfinished two-byte
        // sequence or escape sequence is reported as malformed and the
        // decoder falls back to the last selected output state.
        match self.decoder_state {
            Iso2022JpDecoderState::TrailByte |
            Iso2022JpDecoderState::EscapeStart => {
                self.decoder_state = self.output_state;
                return (DecoderResult::Malformed(1, 0), src_consumed, dest.written());
            }
            Iso2022JpDecoderState::Escape => {
                // The byte after 0x1B is still in `lead`; mark it so that
                // the preamble processes it on the next call.
                self.pending_prepended = true;
                self.decoder_state = self.output_state;
                return (DecoderResult::Malformed(1, 1), src_consumed, dest.written());
            }
            _ => {}
        }
    },
    {
        // Per-byte body: `b` is the current input byte.
        match self.decoder_state {
            Iso2022JpDecoderState::Ascii => {
                if b == 0x1Bu8 {
                    self.decoder_state = Iso2022JpDecoderState::EscapeStart;
                    continue;
                }
                self.output_flag = false;
                // Non-ASCII bytes and 0x0E / 0x0F are rejected.
                if b > 0x7Fu8 || b == 0x0Eu8 || b == 0x0Fu8 {
                    return (DecoderResult::Malformed(1, 0),
                            unread_handle.consumed(),
                            destination_handle.written());
                }
                destination_handle.write_ascii(b);
                continue;
            }
            Iso2022JpDecoderState::Roman => {
                if b == 0x1Bu8 {
                    self.decoder_state = Iso2022JpDecoderState::EscapeStart;
                    continue;
                }
                self.output_flag = false;
                // 0x5C and 0x7E differ from ASCII in this state.
                if b == 0x5Cu8 {
                    destination_handle.write_mid_bmp(0x00A5u16);
                    continue;
                }
                if b == 0x7Eu8 {
                    destination_handle.write_upper_bmp(0x203Eu16);
                    continue;
                }
                if b > 0x7Fu8 || b == 0x0Eu8 || b == 0x0Fu8 {
                    return (DecoderResult::Malformed(1, 0),
                            unread_handle.consumed(),
                            destination_handle.written());
                }
                destination_handle.write_ascii(b);
                continue;
            }
            Iso2022JpDecoderState::Katakana => {
                if b == 0x1Bu8 {
                    self.decoder_state = Iso2022JpDecoderState::EscapeStart;
                    continue;
                }
                self.output_flag = false;
                // 0x21..=0x5F map linearly onto U+FF61..=U+FF9F.
                if b >= 0x21u8 && b <= 0x5Fu8 {
                    destination_handle.write_upper_bmp(b as u16 - 0x21u16 + 0xFF61u16);
                    continue;
                }
                return (DecoderResult::Malformed(1, 0),
                        unread_handle.consumed(),
                        destination_handle.written());
            }
            Iso2022JpDecoderState::LeadByte => {
                if b == 0x1Bu8 {
                    self.decoder_state = Iso2022JpDecoderState::EscapeStart;
                    continue;
                }
                self.output_flag = false;
                // Remember a valid lead byte and wait for its trail byte.
                if b >= 0x21u8 && b <= 0x7Eu8 {
                    self.lead = b;
                    self.decoder_state = Iso2022JpDecoderState::TrailByte;
                    continue;
                }
                return (DecoderResult::Malformed(1, 0),
                        unread_handle.consumed(),
                        destination_handle.written());
            }
            Iso2022JpDecoderState::TrailByte => {
                if b == 0x1Bu8 {
                    self.decoder_state = Iso2022JpDecoderState::EscapeStart;
                    // The byte in error is the previous
                    // lead byte.
                    return (DecoderResult::Malformed(1, 1),
                            unread_handle.consumed(),
                            destination_handle.written());
                }
                self.decoder_state = Iso2022JpDecoderState::LeadByte;
                let jis0208_lead_minus_offset = self.lead - 0x21;
                let byte = b;
                let handle = destination_handle;
                // The code below uses else after continue in
                // order to retain the structure seen in EUC-JP.
                let trail_minus_offset = byte.wrapping_sub(0x21);
                // Fast-track Hiragana (60% according to Lunde)
                // and Katakana (10% according to Lunde).
                if jis0208_lead_minus_offset == 0x03 && trail_minus_offset < 0x53 {
                    // Hiragana
                    handle.write_upper_bmp(0x3041 + trail_minus_offset as u16);
                    continue;
                } else if jis0208_lead_minus_offset == 0x04 && trail_minus_offset < 0x56 {
                    // Katakana
                    handle.write_upper_bmp(0x30A1 + trail_minus_offset as u16);
                    continue;
                } else if trail_minus_offset > (0xFE - 0xA1) {
                    // Trail byte outside the 94-cell row.
                    return (DecoderResult::Malformed(2, 0),
                            unread_handle.consumed(),
                            handle.written());
                } else {
                    // Combine lead and trail into a single table pointer
                    // and try each lookup table in turn.
                    let pointer = mul_94(jis0208_lead_minus_offset) +
                                  trail_minus_offset as usize;
                    let level1_pointer = pointer.wrapping_sub(1410);
                    if level1_pointer < JIS0208_LEVEL1_KANJI.len() {
                        handle.write_upper_bmp(JIS0208_LEVEL1_KANJI[level1_pointer]);
                        continue;
                    } else {
                        let level2_pointer = pointer.wrapping_sub(4418);
                        if level2_pointer < JIS0208_LEVEL2_AND_ADDITIONAL_KANJI.len() {
                            handle.write_upper_bmp(
                                JIS0208_LEVEL2_AND_ADDITIONAL_KANJI[level2_pointer],
                            );
                            continue;
                        } else {
                            let ibm_pointer = pointer.wrapping_sub(8272);
                            if ibm_pointer < IBM_KANJI.len() {
                                handle.write_upper_bmp(IBM_KANJI[ibm_pointer]);
                                continue;
                            } else if let Some(bmp) = jis0208_symbol_decode(pointer) {
                                handle.write_bmp_excl_ascii(bmp);
                                continue;
                            } else if let Some(bmp) = jis0208_range_decode(pointer) {
                                handle.write_bmp_excl_ascii(bmp);
                                continue;
                            } else {
                                return (DecoderResult::Malformed(2, 0),
                                        unread_handle.consumed(),
                                        handle.written());
                            }
                        }
                    }
                }
            }
            Iso2022JpDecoderState::EscapeStart => {
                // Only `$` (0x24) and `(` (0x28) can continue an escape
                // sequence; the byte is kept in `lead`.
                if b == 0x24u8 || b == 0x28u8 {
                    self.lead = b;
                    self.decoder_state = Iso2022JpDecoderState::Escape;
                    continue;
                }
                self.output_flag = false;
                self.decoder_state = self.output_state;
                // The current byte is unread so that it is processed again
                // in the restored state.
                return (DecoderResult::Malformed(1, 0),
                        unread_handle.unread(),
                        destination_handle.written());
            }
            Iso2022JpDecoderState::Escape => {
                // Map the final byte of the escape sequence to the state
                // it selects, if any.
                let mut state: Option<Iso2022JpDecoderState> = None;
                if self.lead == 0x28u8 && b == 0x42u8 {
                    state = Some(Iso2022JpDecoderState::Ascii);
                } else if self.lead == 0x28u8 && b == 0x4Au8 {
                    state = Some(Iso2022JpDecoderState::Roman);
                } else if self.lead == 0x28u8 && b == 0x49u8 {
                    state = Some(Iso2022JpDecoderState::Katakana);
                } else if self.lead == 0x24u8 && (b == 0x40u8 || b == 0x42u8) {
                    state = Some(Iso2022JpDecoderState::LeadByte);
                }
                match state {
                    Some(s) => {
                        self.lead = 0x0u8;
                        self.decoder_state = s;
                        self.output_state = s;
                        let flag = self.output_flag;
                        self.output_flag = true;
                        if flag {
                            // We had an escape sequence
                            // immediately following another
                            // escape sequence. Therefore,
                            // the first one of these was
                            // useless.
                            return (DecoderResult::Malformed(3, 3),
                                    unread_handle.consumed(),
                                    destination_handle.written());
                        }
                        continue;
                    }
                    None => {
                        // self.lead is still the previous
                        // byte. It will be processed in
                        // the preamble upon next call.
                        self.pending_prepended = true;
                        self.output_flag = false;
                        self.decoder_state = self.output_state;
                        // The byte in error is not the
                        // current or the previous byte but
                        // the one before those (lone 0x1B).
                        return (DecoderResult::Malformed(1, 1),
                                unread_handle.unread(),
                                destination_handle.written());
                    }
                }
            }
        }
    },
    self,
    src_consumed,
    dest,
    source,
    b,
    destination_handle,
    unread_handle,
    check_space_bmp
);
343 }
344
345
346 #[cfg_attr(feature = "cargo-clippy", allow(if_let_redundant_pattern_matching, if_same_then_else))]
is_mapped_for_two_byte_encode(bmp: u16) -> bool347 fn is_mapped_for_two_byte_encode(bmp: u16) -> bool {
348 // The code below uses else after return to
349 // keep the same structure as in EUC-JP.
350 // Lunde says 60% Hiragana, 30% Kanji, 10% Katakana
351 let bmp_minus_hiragana = bmp.wrapping_sub(0x3041);
352 if bmp_minus_hiragana < 0x53 {
353 true
354 } else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
355 if 0x4EDD == bmp {
356 true
357 } else if let Some(_) = jis0208_level1_kanji_shift_jis_encode(bmp) {
358 // Use the shift_jis variant, because we don't care about the
359 // byte values here.
360 true
361 } else if let Some(_) = jis0208_level2_and_additional_kanji_encode(bmp) {
362 true
363 } else if let Some(_) = position(&IBM_KANJI[..], bmp) {
364 true
365 } else {
366 false
367 }
368 } else {
369 let bmp_minus_katakana = bmp.wrapping_sub(0x30A1);
370 if bmp_minus_katakana < 0x56 {
371 true
372 } else {
373 let bmp_minus_space = bmp.wrapping_sub(0x3000);
374 if bmp_minus_space < 3 {
375 // fast-track common punctuation
376 true
377 } else if in_inclusive_range16(bmp, 0xFF61, 0xFF9F) {
378 true
379 } else if bmp == 0x2212 {
380 true
381 } else if let Some(_) = jis0208_range_encode(bmp) {
382 true
383 } else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D) || bmp == 0xF929 || bmp == 0xF9DC {
384 true
385 } else if let Some(_) = ibm_symbol_encode(bmp) {
386 true
387 } else if let Some(_) = jis0208_symbol_encode(bmp) {
388 true
389 } else {
390 false
391 }
392 }
393 }
394 }
395
/// Character set currently selected in the encoder's output stream.
enum Iso2022JpEncoderState {
    /// Initial state; re-entered by writing `ESC ( B`.
    Ascii,
    /// Entered by writing `ESC ( J`; U+00A5 is written as 0x5C and U+203E
    /// as 0x7E.
    Roman,
    /// Entered by writing `ESC $ B`; characters are written as two bytes.
    Jis0208,
}
401
/// Streaming encoder state for ISO-2022-JP.
pub struct Iso2022JpEncoder {
    /// Character set that the most recently written escape sequence
    /// selected (`Ascii` before anything has been written).
    state: Iso2022JpEncoderState,
}
405
406 impl Iso2022JpEncoder {
new(encoding: &'static Encoding) -> Encoder407 pub fn new(encoding: &'static Encoding) -> Encoder {
408 Encoder::new(
409 encoding,
410 VariantEncoder::Iso2022Jp(Iso2022JpEncoder { state: Iso2022JpEncoderState::Ascii }),
411 )
412 }
413
has_pending_state(&self) -> bool414 pub fn has_pending_state(&self) -> bool {
415 match self.state {
416 Iso2022JpEncoderState::Ascii => false,
417 _ => true,
418 }
419 }
420
421
max_buffer_length_from_utf16_without_replacement(&self, u16_length: usize) -> Option<usize>422 pub fn max_buffer_length_from_utf16_without_replacement(&self,
423 u16_length: usize)
424 -> Option<usize> {
425 // Worst case: every other character is ASCII/Roman and every other
426 // JIS0208.
427 // Two UTF-16 input units:
428 // Transition to Roman: 3
429 // Roman/ASCII: 1
430 // Transition to JIS0208: 3
431 // JIS0208: 2
432 // End transition: 3
433 checked_add_opt(
434 checked_add(3, u16_length.checked_mul(4)),
435 checked_div(u16_length.checked_add(1), 2),
436 )
437 }
438
max_buffer_length_from_utf8_without_replacement(&self, byte_length: usize) -> Option<usize>439 pub fn max_buffer_length_from_utf8_without_replacement(&self,
440 byte_length: usize)
441 -> Option<usize> {
442 // Worst case: every other character is ASCII/Roman and every other
443 // JIS0208.
444 // Three UTF-8 input units: 1 ASCII, 2 JIS0208
445 // Transition to ASCII: 3
446 // Roman/ASCII: 1
447 // Transition to JIS0208: 3
448 // JIS0208: 2
449 // End transition: 3
450 checked_add(3, byte_length.checked_mul(3))
451 }
452
// Generates the encode functions for ISO-2022-JP from the code blocks
// below. NOTE(review): the role of each block is defined by the
// `encoder_functions!` macro, which is not visible here; the first block
// looks like the end-of-input step and the second like the per-character
// body (`c` is the current character) — confirm against the macro
// definition.
encoder_functions!(
    {
        // If a non-ASCII character set is still selected, switch the
        // output back to ASCII with `ESC ( B`.
        match self.state {
            Iso2022JpEncoderState::Ascii => {}
            _ => {
                match dest.check_space_three() {
                    Space::Full(dst_written) => {
                        return (EncoderResult::OutputFull, src_consumed, dst_written);
                    }
                    Space::Available(destination_handle) => {
                        self.state = Iso2022JpEncoderState::Ascii;
                        destination_handle.write_three(0x1Bu8, 0x28u8, 0x42u8);
                    }
                }
            }
        }
    },
    {
        match self.state {
            Iso2022JpEncoderState::Ascii => {
                // U+000E, U+000F and U+001B are reported as unmappable
                // U+FFFD rather than being written.
                if c == '\u{0E}' || c == '\u{0F}' || c == '\u{1B}' {
                    return (EncoderResult::Unmappable('\u{FFFD}'),
                            unread_handle.consumed(),
                            destination_handle.written());
                }
                if c <= '\u{7F}' {
                    destination_handle.write_one(c as u8);
                    continue;
                }
                // Characters that need another state: write the escape
                // sequence, unread the character and let the next
                // iteration encode it in the new state.
                if c == '\u{A5}' || c == '\u{203E}' {
                    self.state = Iso2022JpEncoderState::Roman;
                    destination_handle.write_three(0x1Bu8, 0x28u8, 0x4Au8);
                    unread_handle.unread();
                    continue;
                }
                if c > '\u{FFFF}' {
                    return (EncoderResult::Unmappable(c),
                            unread_handle.consumed(),
                            destination_handle.written());
                }
                // Yes, if c is in index, we'll search
                // again in the Jis0208 state, but this
                // encoder is not worth optimizing.
                if is_mapped_for_two_byte_encode(c as u16) {
                    self.state = Iso2022JpEncoderState::Jis0208;
                    destination_handle.write_three(0x1Bu8, 0x24u8, 0x42u8);
                    unread_handle.unread();
                    continue;
                }
                return (EncoderResult::Unmappable(c),
                        unread_handle.consumed(),
                        destination_handle.written());
            }
            Iso2022JpEncoderState::Roman => {
                if c == '\u{0E}' || c == '\u{0F}' || c == '\u{1B}' {
                    return (EncoderResult::Unmappable('\u{FFFD}'),
                            unread_handle.consumed(),
                            destination_handle.written());
                }
                // U+005C and U+007E cannot be written in the Roman state
                // (those bytes mean U+00A5 and U+203E here), so switch
                // back to ASCII first.
                if c == '\u{5C}' || c == '\u{7E}' {
                    self.state = Iso2022JpEncoderState::Ascii;
                    destination_handle.write_three(0x1Bu8, 0x28u8, 0x42u8);
                    unread_handle.unread();
                    continue;
                }
                if c <= '\u{7F}' {
                    destination_handle.write_one(c as u8);
                    continue;
                }
                if c == '\u{A5}' {
                    destination_handle.write_one(0x5Cu8);
                    continue;
                }
                if c == '\u{203E}' {
                    destination_handle.write_one(0x7Eu8);
                    continue;
                }
                if c > '\u{FFFF}' {
                    return (EncoderResult::Unmappable(c),
                            unread_handle.consumed(),
                            destination_handle.written());
                }
                // Yes, if c is in index, we'll search
                // again in the Jis0208 state, but this
                // encoder is not worth optimizing.
                if is_mapped_for_two_byte_encode(c as u16) {
                    self.state = Iso2022JpEncoderState::Jis0208;
                    destination_handle.write_three(0x1Bu8, 0x24u8, 0x42u8);
                    unread_handle.unread();
                    continue;
                }
                return (EncoderResult::Unmappable(c),
                        unread_handle.consumed(),
                        destination_handle.written());
            }
            Iso2022JpEncoderState::Jis0208 => {
                // ASCII-range characters (including the forbidden control
                // characters) are handled after switching back to ASCII.
                if c <= '\u{7F}' {
                    self.state = Iso2022JpEncoderState::Ascii;
                    destination_handle.write_three(0x1Bu8, 0x28u8, 0x42u8);
                    unread_handle.unread();
                    continue;
                }
                if c == '\u{A5}' || c == '\u{203E}' {
                    self.state = Iso2022JpEncoderState::Roman;
                    destination_handle.write_three(0x1Bu8, 0x28u8, 0x4Au8);
                    unread_handle.unread();
                    continue;
                }
                if c > '\u{FFFF}' {
                    // Transition to ASCII here in order
                    // not to make it the responsibility
                    // of the caller.
                    self.state = Iso2022JpEncoderState::Ascii;
                    return (EncoderResult::Unmappable(c),
                            unread_handle.consumed(),
                            destination_handle
                                .write_three_return_written(0x1Bu8, 0x28u8, 0x42u8));
                }
                let bmp = c as u16;
                let handle = destination_handle;
                // The code below uses else after continue to
                // keep the same structure as in EUC-JP.
                // Lunde says 60% Hiragana, 30% Kanji, 10% Katakana
                let bmp_minus_hiragana = bmp.wrapping_sub(0x3041);
                if bmp_minus_hiragana < 0x53 {
                    handle.write_two(0x24, 0x21 + bmp_minus_hiragana as u8);
                    continue;
                } else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
                    if 0x4EDD == bmp {
                        // Ideograph on the symbol row!
                        handle.write_two(0x21, 0xB8 - 0x80);
                        continue;
                    } else if let Some((lead, trail)) =
                        jis0208_level1_kanji_iso_2022_jp_encode(bmp) {
                        handle.write_two(lead, trail);
                        continue;
                    } else if let Some(pos) = jis0208_level2_and_additional_kanji_encode(bmp) {
                        let lead = (pos / 94) + (0xD0 - 0x80);
                        let trail = (pos % 94) + 0x21;
                        handle.write_two(lead as u8, trail as u8);
                        continue;
                    } else if let Some(pos) = position(&IBM_KANJI[..], bmp) {
                        let lead = (pos / 94) + (0xF9 - 0x80);
                        let trail = (pos % 94) + 0x21;
                        handle.write_two(lead as u8, trail as u8);
                        continue;
                    } else {
                        // Unmappable: leave the stream in ASCII before
                        // reporting the error.
                        self.state = Iso2022JpEncoderState::Ascii;
                        return (EncoderResult::Unmappable(c),
                                unread_handle.consumed(),
                                handle.write_three_return_written(0x1Bu8, 0x28u8, 0x42u8));
                    }
                } else {
                    let bmp_minus_katakana = bmp.wrapping_sub(0x30A1);
                    if bmp_minus_katakana < 0x56 {
                        handle.write_two(0x25, 0x21 + bmp_minus_katakana as u8);
                        continue;
                    } else {
                        let bmp_minus_space = bmp.wrapping_sub(0x3000);
                        if bmp_minus_space < 3 {
                            // fast-track common punctuation
                            handle.write_two(0x21, 0x21 + bmp_minus_space as u8);
                            continue;
                        }
                        let bmp_minus_half_width = bmp.wrapping_sub(0xFF61);
                        if bmp_minus_half_width <= (0xFF9F - 0xFF61) {
                            // We have half-width katakana. The lead is either
                            // row 1 or 5 of JIS X 0208, so the lookup table
                            // only stores the trail.
                            let lead = if bmp != 0xFF70 &&
                                          in_inclusive_range16(bmp, 0xFF66, 0xFF9D) {
                                0x25u8
                            } else {
                                0x21u8
                            };
                            let trail = ISO_2022_JP_HALF_WIDTH_TRAIL[bmp_minus_half_width as
                            usize];
                            handle.write_two(lead, trail);
                            continue;
                        } else if bmp == 0x2212 {
                            handle.write_two(0x21, 0x5D);
                            continue;
                        } else if let Some(pointer) = jis0208_range_encode(bmp) {
                            let lead = (pointer / 94) + 0x21;
                            let trail = (pointer % 94) + 0x21;
                            handle.write_two(lead as u8, trail as u8);
                            continue;
                        } else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D) || bmp == 0xF929 ||
                                  bmp == 0xF9DC {
                            // Guaranteed to be found in IBM_KANJI
                            let pos = position(&IBM_KANJI[..], bmp).unwrap();
                            let lead = (pos / 94) + (0xF9 - 0x80);
                            let trail = (pos % 94) + 0x21;
                            handle.write_two(lead as u8, trail as u8);
                            continue;
                        } else if let Some(pointer) = ibm_symbol_encode(bmp) {
                            let lead = (pointer / 94) + 0x21;
                            let trail = (pointer % 94) + 0x21;
                            handle.write_two(lead as u8, trail as u8);
                            continue;
                        } else if let Some(pointer) = jis0208_symbol_encode(bmp) {
                            let lead = (pointer / 94) + 0x21;
                            let trail = (pointer % 94) + 0x21;
                            handle.write_two(lead as u8, trail as u8);
                            continue;
                        } else {
                            // Unmappable: leave the stream in ASCII before
                            // reporting the error.
                            self.state = Iso2022JpEncoderState::Ascii;
                            return (EncoderResult::Unmappable(c),
                                    unread_handle.consumed(),
                                    handle.write_three_return_written(0x1Bu8, 0x28u8, 0x42u8));
                        }
                    }
                }
            }
        }
    },
    self,
    src_consumed,
    source,
    dest,
    c,
    destination_handle,
    unread_handle,
    check_space_three
);
678 }
679
680 // Any copyright to the test code below this comment is dedicated to the
681 // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
682
683 #[cfg(test)]
684 mod tests {
685 use super::super::testing::*;
686 use super::super::*;
687
decode_iso_2022_jp(bytes: &[u8], expect: &str)688 fn decode_iso_2022_jp(bytes: &[u8], expect: &str) {
689 decode(ISO_2022_JP, bytes, expect);
690 }
691
encode_iso_2022_jp(string: &str, expect: &[u8])692 fn encode_iso_2022_jp(string: &str, expect: &[u8]) {
693 encode(ISO_2022_JP, string, expect);
694 }
695
// Table of hand-written decode cases: each call passes an input byte
// string and the text the decoder is expected to produce (U+FFFD marks a
// malformed sequence).
#[test]
fn test_iso_2022_jp_decode() {
    // Empty
    decode_iso_2022_jp(b"", &"");

    // ASCII
    decode_iso_2022_jp(b"\x61\x62", "\u{0061}\u{0062}");
    decode_iso_2022_jp(b"\x7F\x0E\x0F", "\u{007F}\u{FFFD}\u{FFFD}");

    // Partial escapes
    decode_iso_2022_jp(b"\x1B", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B$", "\u{FFFD}$");
    decode_iso_2022_jp(b"\x1B(", "\u{FFFD}(");
    decode_iso_2022_jp(b"\x1B.", "\u{FFFD}.");

    // ISO escapes
    // Only the escapes expected to yield "" are accepted; for the others
    // the ESC becomes U+FFFD and the remaining bytes decode as text.
    decode_iso_2022_jp(b"\x1B(B", ""); // ASCII
    decode_iso_2022_jp(b"\x1B(J", ""); // Roman
    decode_iso_2022_jp(b"\x1B$@", ""); // 0208
    decode_iso_2022_jp(b"\x1B$B", ""); // 0208
    decode_iso_2022_jp(b"\x1B$(D", "\u{FFFD}$(D"); // 2012
    decode_iso_2022_jp(b"\x1B$A", "\u{FFFD}$A"); // GB2312
    decode_iso_2022_jp(b"\x1B$(C", "\u{FFFD}$(C"); // KR
    decode_iso_2022_jp(b"\x1B.A", "\u{FFFD}.A"); // Latin-1
    decode_iso_2022_jp(b"\x1B.F", "\u{FFFD}.F"); // Greek
    decode_iso_2022_jp(b"\x1B(I", ""); // Half-width Katakana
    decode_iso_2022_jp(b"\x1B$(O", "\u{FFFD}$(O"); // 2013
    decode_iso_2022_jp(b"\x1B$(P", "\u{FFFD}$(P"); // 2013
    decode_iso_2022_jp(b"\x1B$(Q", "\u{FFFD}$(Q"); // 2013
    decode_iso_2022_jp(b"\x1B$)C", "\u{FFFD}$)C"); // KR
    decode_iso_2022_jp(b"\x1B$)A", "\u{FFFD}$)A"); // GB2312
    decode_iso_2022_jp(b"\x1B$)G", "\u{FFFD}$)G"); // CNS
    decode_iso_2022_jp(b"\x1B$*H", "\u{FFFD}$*H"); // CNS
    decode_iso_2022_jp(b"\x1B$)E", "\u{FFFD}$)E"); // IR
    decode_iso_2022_jp(b"\x1B$+I", "\u{FFFD}$+I"); // CNS
    decode_iso_2022_jp(b"\x1B$+J", "\u{FFFD}$+J"); // CNS
    decode_iso_2022_jp(b"\x1B$+K", "\u{FFFD}$+K"); // CNS
    decode_iso_2022_jp(b"\x1B$+L", "\u{FFFD}$+L"); // CNS
    decode_iso_2022_jp(b"\x1B$+M", "\u{FFFD}$+M"); // CNS
    decode_iso_2022_jp(b"\x1B$(@", "\u{FFFD}$(@"); // 0208
    decode_iso_2022_jp(b"\x1B$(A", "\u{FFFD}$(A"); // GB2312
    decode_iso_2022_jp(b"\x1B$(B", "\u{FFFD}$(B"); // 0208
    decode_iso_2022_jp(b"\x1B%G", "\u{FFFD}%G"); // UTF-8

    // ASCII
    decode_iso_2022_jp(b"\x5B", "\u{005B}");
    decode_iso_2022_jp(b"\x5C", "\u{005C}");
    decode_iso_2022_jp(b"\x7E", "\u{007E}");
    decode_iso_2022_jp(b"\x0E", "\u{FFFD}");
    decode_iso_2022_jp(b"\x0F", "\u{FFFD}");
    decode_iso_2022_jp(b"\x80", "\u{FFFD}");
    decode_iso_2022_jp(b"\xFF", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(B\x5B", "\u{005B}");
    decode_iso_2022_jp(b"\x1B(B\x5C", "\u{005C}");
    decode_iso_2022_jp(b"\x1B(B\x7E", "\u{007E}");
    decode_iso_2022_jp(b"\x1B(B\x0E", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(B\x0F", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(B\x80", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(B\xFF", "\u{FFFD}");

    // Roman
    decode_iso_2022_jp(b"\x1B(J\x5B", "\u{005B}");
    decode_iso_2022_jp(b"\x1B(J\x5C", "\u{00A5}");
    decode_iso_2022_jp(b"\x1B(J\x7E", "\u{203E}");
    decode_iso_2022_jp(b"\x1B(J\x0E", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(J\x0F", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(J\x80", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(J\xFF", "\u{FFFD}");

    // Katakana
    decode_iso_2022_jp(b"\x1B(I\x20", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(I\x21", "\u{FF61}");
    decode_iso_2022_jp(b"\x1B(I\x5F", "\u{FF9F}");
    decode_iso_2022_jp(b"\x1B(I\x60", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(I\x0E", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(I\x0F", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(I\x80", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(I\xFF", "\u{FFFD}");

    // 0208 differences from 1978 to 1983
    // (both escapes are expected to decode to the same characters)
    decode_iso_2022_jp(b"\x1B$@\x54\x64", "\u{58FA}");
    decode_iso_2022_jp(b"\x1B$@\x44\x5B", "\u{58F7}");
    decode_iso_2022_jp(b"\x1B$@\x74\x21", "\u{582F}");
    decode_iso_2022_jp(b"\x1B$@\x36\x46", "\u{5C2D}");
    decode_iso_2022_jp(b"\x1B$@\x28\x2E", "\u{250F}");
    decode_iso_2022_jp(b"\x1B$B\x54\x64", "\u{58FA}");
    decode_iso_2022_jp(b"\x1B$B\x44\x5B", "\u{58F7}");
    decode_iso_2022_jp(b"\x1B$B\x74\x21", "\u{582F}");
    decode_iso_2022_jp(b"\x1B$B\x36\x46", "\u{5C2D}");
    decode_iso_2022_jp(b"\x1B$B\x28\x2E", "\u{250F}");

    // Broken 0208
    decode_iso_2022_jp(b"\x1B$B\x28\x41", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B$@\x80\x54\x64", "\u{FFFD}\u{58FA}");
    decode_iso_2022_jp(b"\x1B$B\x28\x80", "\u{FFFD}");

    // Transitions
    decode_iso_2022_jp(b"\x1B(B\x5C\x1B(J\x5C", "\u{005C}\u{00A5}");
    decode_iso_2022_jp(b"\x1B(B\x5C\x1B(I\x21", "\u{005C}\u{FF61}");
    decode_iso_2022_jp(b"\x1B(B\x5C\x1B$@\x54\x64", "\u{005C}\u{58FA}");
    decode_iso_2022_jp(b"\x1B(B\x5C\x1B$B\x54\x64", "\u{005C}\u{58FA}");

    decode_iso_2022_jp(b"\x1B(J\x5C\x1B(B\x5C", "\u{00A5}\u{005C}");
    decode_iso_2022_jp(b"\x1B(J\x5C\x1B(I\x21", "\u{00A5}\u{FF61}");
    decode_iso_2022_jp(b"\x1B(J\x5C\x1B$@\x54\x64", "\u{00A5}\u{58FA}");
    decode_iso_2022_jp(b"\x1B(J\x5C\x1B$B\x54\x64", "\u{00A5}\u{58FA}");

    decode_iso_2022_jp(b"\x1B(I\x21\x1B(J\x5C", "\u{FF61}\u{00A5}");
    decode_iso_2022_jp(b"\x1B(I\x21\x1B(B\x5C", "\u{FF61}\u{005C}");
    decode_iso_2022_jp(b"\x1B(I\x21\x1B$@\x54\x64", "\u{FF61}\u{58FA}");
    decode_iso_2022_jp(b"\x1B(I\x21\x1B$B\x54\x64", "\u{FF61}\u{58FA}");

    decode_iso_2022_jp(b"\x1B$@\x54\x64\x1B(J\x5C", "\u{58FA}\u{00A5}");
    decode_iso_2022_jp(b"\x1B$@\x54\x64\x1B(I\x21", "\u{58FA}\u{FF61}");
    decode_iso_2022_jp(b"\x1B$@\x54\x64\x1B(B\x5C", "\u{58FA}\u{005C}");
    decode_iso_2022_jp(b"\x1B$@\x54\x64\x1B$B\x54\x64", "\u{58FA}\u{58FA}");

    decode_iso_2022_jp(b"\x1B$B\x54\x64\x1B(J\x5C", "\u{58FA}\u{00A5}");
    decode_iso_2022_jp(b"\x1B$B\x54\x64\x1B(I\x21", "\u{58FA}\u{FF61}");
    decode_iso_2022_jp(b"\x1B$B\x54\x64\x1B$@\x54\x64", "\u{58FA}\u{58FA}");
    decode_iso_2022_jp(b"\x1B$B\x54\x64\x1B(B\x5C", "\u{58FA}\u{005C}");

    // Empty transitions
    // (an escape immediately followed by another escape yields one U+FFFD)
    decode_iso_2022_jp(b"\x1B(B\x1B(J", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(B\x1B(I", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(B\x1B$@", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(B\x1B$B", "\u{FFFD}");

    decode_iso_2022_jp(b"\x1B(J\x1B(B", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(J\x1B(I", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(J\x1B$@", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(J\x1B$B", "\u{FFFD}");

    decode_iso_2022_jp(b"\x1B(I\x1B(J", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(I\x1B(B", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(I\x1B$@", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B(I\x1B$B", "\u{FFFD}");

    decode_iso_2022_jp(b"\x1B$@\x1B(J", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B$@\x1B(I", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B$@\x1B(B", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B$@\x1B$B", "\u{FFFD}");

    decode_iso_2022_jp(b"\x1B$B\x1B(J", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B$B\x1B(I", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B$B\x1B$@", "\u{FFFD}");
    decode_iso_2022_jp(b"\x1B$B\x1B(B", "\u{FFFD}");

    // Transitions to self
    decode_iso_2022_jp(b"\x1B(B\x5C\x1B(B\x5C", "\u{005C}\u{005C}");
    decode_iso_2022_jp(b"\x1B(J\x5C\x1B(J\x5C", "\u{00A5}\u{00A5}");
    decode_iso_2022_jp(b"\x1B(I\x21\x1B(I\x21", "\u{FF61}\u{FF61}");
    decode_iso_2022_jp(b"\x1B$@\x54\x64\x1B$@\x54\x64", "\u{58FA}\u{58FA}");
    decode_iso_2022_jp(b"\x1B$B\x54\x64\x1B$B\x54\x64", "\u{58FA}\u{58FA}");
}
851
// Table of hand-written encode cases: each call passes an input string and
// the exact bytes the encoder is expected to produce.
//
// Unmappable characters are expected to appear as decimal numeric
// character references (`&#128169;` for U+1F4A9, `&#65533;` for U+FFFD).
// These must be spelled out in ASCII: a byte string literal may not
// contain raw non-ASCII characters.
#[test]
fn test_iso_2022_jp_encode() {
    // Empty
    encode_iso_2022_jp("", b"");

    // ASCII
    encode_iso_2022_jp("ab", b"ab");
    encode_iso_2022_jp("\u{1F4A9}", b"&#128169;");
    encode_iso_2022_jp("\x1B", b"&#65533;");
    encode_iso_2022_jp("\x0E", b"&#65533;");
    encode_iso_2022_jp("\x0F", b"&#65533;");

    // Roman
    encode_iso_2022_jp("a\u{00A5}b", b"a\x1B(J\x5Cb\x1B(B");
    encode_iso_2022_jp("a\u{203E}b", b"a\x1B(J\x7Eb\x1B(B");
    encode_iso_2022_jp("a\u{00A5}b\x5C", b"a\x1B(J\x5Cb\x1B(B\x5C");
    encode_iso_2022_jp("a\u{203E}b\x7E", b"a\x1B(J\x7Eb\x1B(B\x7E");
    encode_iso_2022_jp("\u{00A5}\u{1F4A9}", b"\x1B(J\x5C&#128169;\x1B(B");
    encode_iso_2022_jp("\u{00A5}\x1B", b"\x1B(J\x5C&#65533;\x1B(B");
    encode_iso_2022_jp("\u{00A5}\x0E", b"\x1B(J\x5C&#65533;\x1B(B");
    encode_iso_2022_jp("\u{00A5}\x0F", b"\x1B(J\x5C&#65533;\x1B(B");
    encode_iso_2022_jp("\u{00A5}\u{58FA}", b"\x1B(J\x5C\x1B$B\x54\x64\x1B(B");

    // Half-width Katakana
    encode_iso_2022_jp("\u{FF61}", b"\x1B$B\x21\x23\x1B(B");
    encode_iso_2022_jp("\u{FF65}", b"\x1B$B\x21\x26\x1B(B");
    encode_iso_2022_jp("\u{FF66}", b"\x1B$B\x25\x72\x1B(B");
    encode_iso_2022_jp("\u{FF70}", b"\x1B$B\x21\x3C\x1B(B");
    encode_iso_2022_jp("\u{FF9D}", b"\x1B$B\x25\x73\x1B(B");
    encode_iso_2022_jp("\u{FF9E}", b"\x1B$B\x21\x2B\x1B(B");
    encode_iso_2022_jp("\u{FF9F}", b"\x1B$B\x21\x2C\x1B(B");

    // 0208
    encode_iso_2022_jp("\u{58FA}", b"\x1B$B\x54\x64\x1B(B");
    encode_iso_2022_jp("\u{58FA}\u{250F}", b"\x1B$B\x54\x64\x28\x2E\x1B(B");
    encode_iso_2022_jp("\u{58FA}\u{1F4A9}", b"\x1B$B\x54\x64\x1B(B&#128169;");
    encode_iso_2022_jp("\u{58FA}\x1B", b"\x1B$B\x54\x64\x1B(B&#65533;");
    encode_iso_2022_jp("\u{58FA}\x0E", b"\x1B$B\x54\x64\x1B(B&#65533;");
    encode_iso_2022_jp("\u{58FA}\x0F", b"\x1B$B\x54\x64\x1B(B&#65533;");
    encode_iso_2022_jp("\u{58FA}\u{00A5}", b"\x1B$B\x54\x64\x1B(J\x5C\x1B(B");
    encode_iso_2022_jp("\u{58FA}a", b"\x1B$B\x54\x64\x1B(Ba");
}
895
// Decodes the bundled input fixture and compares the result with the
// reference text; the fixture is expected to contain malformed sequences.
#[test]
fn test_iso_2022_jp_decode_all() {
    let (decoded, saw_errors) =
        ISO_2022_JP.decode_without_bom_handling(include_bytes!("test_data/iso_2022_jp_in.txt"));
    assert!(saw_errors, "Should have had errors.");
    assert_eq!(&decoded[..], include_str!("test_data/iso_2022_jp_in_ref.txt"));
}
904
// Encodes the bundled text fixture and compares the result with the
// reference bytes; the fixture is expected to be fully mappable.
#[test]
fn test_iso_2022_jp_encode_all() {
    let reference = include_bytes!("test_data/iso_2022_jp_out_ref.txt");
    let (encoded, used_encoding, saw_errors) =
        ISO_2022_JP.encode(include_str!("test_data/iso_2022_jp_out.txt"));
    assert!(!saw_errors, "Should not have had errors.");
    assert_eq!(used_encoding, ISO_2022_JP);
    assert_eq!(&encoded[..], &reference[..]);
}
914
// Checks that the buffer size reported after switching to half-width
// katakana is large enough for one input byte that decodes to a
// three-byte UTF-8 sequence.
#[test]
fn test_iso_2022_jp_half_width_katakana_length() {
    let mut decoder = ISO_2022_JP.new_decoder();
    let mut buffer = [0u8; 20];

    // First call: only the `ESC ( I` escape sequence, which produces no
    // output.
    let (status, consumed, produced) =
        decoder.decode_to_utf8_without_replacement(b"\x1B\x28\x49", &mut buffer, false);
    assert_eq!(status, DecoderResult::InputEmpty);
    assert_eq!(consumed, 3);
    assert_eq!(produced, 0);

    // Second call: one katakana byte into a buffer of exactly the size the
    // decoder asked for.
    let capacity = decoder
        .max_utf8_buffer_length_without_replacement(1)
        .unwrap();
    let (status, consumed, produced) =
        decoder.decode_to_utf8_without_replacement(b"\x21", &mut buffer[..capacity], true);
    assert_eq!(status, DecoderResult::InputEmpty);
    assert_eq!(consumed, 1);
    assert_eq!(produced, 3);
    // U+FF61 in UTF-8.
    assert_eq!(&buffer[..3], &[0xEFu8, 0xBDu8, 0xA1u8][..]);
}
940
// Checks that the buffer size reported after a lone ESC is large enough
// for both the replacement character and the following ASCII character.
#[test]
fn test_iso_2022_jp_length_after_escape() {
    let mut decoder = ISO_2022_JP.new_decoder();
    let mut buffer = [0u16; 20];

    // First call: a lone ESC is held back, so nothing is written yet.
    let (status, consumed, produced, saw_errors) =
        decoder.decode_to_utf16(b"\x1B", &mut buffer, false);
    assert_eq!(status, CoderResult::InputEmpty);
    assert_eq!(consumed, 1);
    assert_eq!(produced, 0);
    assert!(!saw_errors);

    // Second call: "A" cannot continue the escape, so U+FFFD and "A" are
    // both written into a buffer of exactly the requested size.
    let capacity = decoder.max_utf16_buffer_length(1).unwrap();
    let (status, consumed, produced, saw_errors) =
        decoder.decode_to_utf16(b"A", &mut buffer[..capacity], true);
    assert_eq!(status, CoderResult::InputEmpty);
    assert_eq!(consumed, 1);
    assert_eq!(produced, 2);
    assert!(saw_errors);
    assert_eq!(&buffer[..2], &[0xFFFDu16, 0x0041u16][..]);
}
965
// Two unpaired low surrogates must each be reported as an error and
// written as the numeric character reference for U+FFFD.
//
// The expectation is spelled in ASCII (`&#65533;`) because a byte string
// literal may not contain raw non-ASCII characters.
#[test]
fn test_iso_2022_jp_encode_from_two_low_surrogates() {
    let expectation = b"&#65533;&#65533;";
    let mut output = [0u8; 40];
    let mut encoder = ISO_2022_JP.new_encoder();
    let (result, read, written, had_errors) =
        encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true);
    assert_eq!(result, CoderResult::InputEmpty);
    assert_eq!(read, 2);
    assert_eq!(written, expectation.len());
    assert!(had_errors);
    assert_eq!(&output[..written], expectation);
}
979
980 }
981