1// Copyright 2013 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package japanese
6
7import (
8	"unicode/utf8"
9
10	"golang.org/x/text/encoding"
11	"golang.org/x/text/encoding/internal"
12	"golang.org/x/text/encoding/internal/identifier"
13	"golang.org/x/text/transform"
14)
15
// ISO2022JP is the ISO-2022-JP encoding.
var ISO2022JP encoding.Encoding = &iso2022JP

// iso2022JP is the value that ISO2022JP points to. It bundles the decoder and
// encoder constructors defined in this file with the name "ISO-2022-JP" and
// the identifier.ISO2022JP constant.
var iso2022JP = internal.Encoding{
	internal.FuncEncoding{iso2022JPNewDecoder, iso2022JPNewEncoder},
	"ISO-2022-JP",
	identifier.ISO2022JP,
}
24
25func iso2022JPNewDecoder() transform.Transformer {
26	return new(iso2022JPDecoder)
27}
28
29func iso2022JPNewEncoder() transform.Transformer {
30	return new(iso2022JPEncoder)
31}
32
// Character-set states tracked by the decoder and the encoder. asciiState is
// the zero value, so a newly allocated decoder or encoder starts in it.
const (
	// asciiState: the decoder enters it on ESC ( B or ESC ( J (and on a
	// newline seen in a two-byte state); the encoder announces it with
	// ESC ( B.
	asciiState = iota
	// katakanaState: single-byte katakana, selected by ESC ( I.
	katakanaState
	// jis0208State: two-byte characters; the decoder enters it on ESC $ @ or
	// ESC $ B, the encoder announces it with ESC $ B.
	jis0208State
	// jis0212State: two-byte characters, entered by the decoder on
	// ESC $ ( D. The encoder in this file never selects it.
	jis0212State
)

// asciiEsc is the ESC byte (0x1b) that begins every escape sequence.
const asciiEsc = 0x1b
41
42type iso2022JPDecoder int
43
44func (d *iso2022JPDecoder) Reset() {
45	*d = asciiState
46}
47
48func (d *iso2022JPDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
49	r, size := rune(0), 0
50	for ; nSrc < len(src); nSrc += size {
51		c0 := src[nSrc]
52		if c0 >= utf8.RuneSelf {
53			r, size = '\ufffd', 1
54			goto write
55		}
56
57		if c0 == asciiEsc {
58			if nSrc+2 >= len(src) {
59				if !atEOF {
60					return nDst, nSrc, transform.ErrShortSrc
61				}
62				// TODO: is it correct to only skip 1??
63				r, size = '\ufffd', 1
64				goto write
65			}
66			size = 3
67			c1 := src[nSrc+1]
68			c2 := src[nSrc+2]
69			switch {
70			case c1 == '$' && (c2 == '@' || c2 == 'B'): // 0x24 {0x40, 0x42}
71				*d = jis0208State
72				continue
73			case c1 == '$' && c2 == '(': // 0x24 0x28
74				if nSrc+3 >= len(src) {
75					if !atEOF {
76						return nDst, nSrc, transform.ErrShortSrc
77					}
78					r, size = '\ufffd', 1
79					goto write
80				}
81				size = 4
82				if src[nSrc+3] == 'D' {
83					*d = jis0212State
84					continue
85				}
86			case c1 == '(' && (c2 == 'B' || c2 == 'J'): // 0x28 {0x42, 0x4A}
87				*d = asciiState
88				continue
89			case c1 == '(' && c2 == 'I': // 0x28 0x49
90				*d = katakanaState
91				continue
92			}
93			r, size = '\ufffd', 1
94			goto write
95		}
96
97		switch *d {
98		case asciiState:
99			r, size = rune(c0), 1
100
101		case katakanaState:
102			if c0 < 0x21 || 0x60 <= c0 {
103				r, size = '\ufffd', 1
104				goto write
105			}
106			r, size = rune(c0)+(0xff61-0x21), 1
107
108		default:
109			if c0 == 0x0a {
110				*d = asciiState
111				r, size = rune(c0), 1
112				goto write
113			}
114			if nSrc+1 >= len(src) {
115				if !atEOF {
116					return nDst, nSrc, transform.ErrShortSrc
117				}
118				r, size = '\ufffd', 1
119				goto write
120			}
121			size = 2
122			c1 := src[nSrc+1]
123			i := int(c0-0x21)*94 + int(c1-0x21)
124			if *d == jis0208State && i < len(jis0208Decode) {
125				r = rune(jis0208Decode[i])
126			} else if *d == jis0212State && i < len(jis0212Decode) {
127				r = rune(jis0212Decode[i])
128			} else {
129				r = '\ufffd'
130				goto write
131			}
132			if r == 0 {
133				r = '\ufffd'
134			}
135		}
136
137	write:
138		if nDst+utf8.RuneLen(r) > len(dst) {
139			return nDst, nSrc, transform.ErrShortDst
140		}
141		nDst += utf8.EncodeRune(dst[nDst:], r)
142	}
143	return nDst, nSrc, err
144}
145
146type iso2022JPEncoder int
147
148func (e *iso2022JPEncoder) Reset() {
149	*e = asciiState
150}
151
// Transform implements the transform.Transformer interface. It encodes UTF-8
// from src as ISO-2022-JP in dst and returns the number of bytes written to
// dst (nDst) and consumed from src (nSrc).
//
// *e records which character set the output is currently in. Bytes below
// utf8.RuneSelf are written unchanged in asciiState; runes in
// [0xff61, 0xffa0) that fall inside the encode5 range are written as a single
// byte in katakanaState; runes whose encode-table entry is tagged jis0208 are
// written as two bytes in jis0208State. The matching three-byte escape
// sequence is written first whenever the state has to change.
//
// For any other rune the output is switched back to ASCII and
// internal.ErrASCIIReplacement is returned with nSrc still pointing at that
// rune. transform.ErrShortSrc is returned for an incomplete trailing rune when
// atEOF is false, and transform.ErrShortDst when dst lacks room for the next
// output unit. When atEOF is true and no error occurred, the output is
// returned to ASCII before Transform returns.
func (e *iso2022JPEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	r, size := rune(0), 0
	for ; nSrc < len(src); nSrc += size {
		r = rune(src[nSrc])

		// Decode a 1-byte rune.
		if r < utf8.RuneSelf {
			size = 1

		} else {
			// Decode a multi-byte rune.
			r, size = utf8.DecodeRune(src[nSrc:])
			if size == 1 {
				// All valid runes of size 1 (those below utf8.RuneSelf) were
				// handled above. We have invalid UTF-8 or we haven't seen the
				// full character yet.
				if !atEOF && !utf8.FullRune(src[nSrc:]) {
					err = transform.ErrShortSrc
					break
				}
			}

			// func init checks that the switch covers all tables.
			//
			// http://encoding.spec.whatwg.org/#iso-2022-jp says that "the index jis0212
			// is not used by the iso-2022-jp encoder due to lack of widespread support".
			//
			// TODO: do we have to special-case U+00A5 and U+203E, as per
			// http://encoding.spec.whatwg.org/#iso-2022-jp
			// Doing so would mean that "\u00a5" would not be preserved
			// after an encode-decode round trip.
			//
			// Each case replaces r with its table entry. Only entries whose
			// bits above tableShift equal jis0208 are encoded; anything else
			// falls through to the error path below, which no longer needs
			// the original rune.
			switch {
			case encode0Low <= r && r < encode0High:
				if r = rune(encode0[r-encode0Low]); r>>tableShift == jis0208 {
					goto writeJIS
				}
			case encode1Low <= r && r < encode1High:
				if r = rune(encode1[r-encode1Low]); r>>tableShift == jis0208 {
					goto writeJIS
				}
			case encode2Low <= r && r < encode2High:
				if r = rune(encode2[r-encode2Low]); r>>tableShift == jis0208 {
					goto writeJIS
				}
			case encode3Low <= r && r < encode3High:
				if r = rune(encode3[r-encode3Low]); r>>tableShift == jis0208 {
					goto writeJIS
				}
			case encode4Low <= r && r < encode4High:
				if r = rune(encode4[r-encode4Low]); r>>tableShift == jis0208 {
					goto writeJIS
				}
			case encode5Low <= r && r < encode5High:
				// These runes are written in the single-byte katakana set
				// instead of being looked up in encode5.
				if 0xff61 <= r && r < 0xffa0 {
					goto writeKatakana
				}
				if r = rune(encode5[r-encode5Low]); r>>tableShift == jis0208 {
					goto writeJIS
				}
			}

			// Switch back to ASCII state in case of error so that an ASCII
			// replacement character can be written in the correct state.
			if *e != asciiState {
				if nDst+3 > len(dst) {
					err = transform.ErrShortDst
					break
				}
				*e = asciiState
				dst[nDst+0] = asciiEsc
				dst[nDst+1] = '('
				dst[nDst+2] = 'B'
				nDst += 3
			}
			// break leaves the loop without running its post statement, so
			// nSrc still points at the rune that could not be encoded.
			err = internal.ErrASCIIReplacement
			break
		}

		// ASCII byte: make sure the output is in the ASCII state first.
		if *e != asciiState {
			// Room is needed for the 3-byte escape sequence plus the byte
			// itself.
			if nDst+4 > len(dst) {
				err = transform.ErrShortDst
				break
			}
			*e = asciiState
			dst[nDst+0] = asciiEsc
			dst[nDst+1] = '('
			dst[nDst+2] = 'B'
			nDst += 3
		} else if nDst >= len(dst) {
			err = transform.ErrShortDst
			break
		}
		dst[nDst] = uint8(r)
		nDst++
		continue

	writeJIS:
		// Two-byte character; r holds the encode-table entry here.
		if *e != jis0208State {
			// Room is needed for the 3-byte escape sequence plus two bytes.
			if nDst+5 > len(dst) {
				err = transform.ErrShortDst
				break
			}
			*e = jis0208State
			dst[nDst+0] = asciiEsc
			dst[nDst+1] = '$'
			dst[nDst+2] = 'B'
			nDst += 3
		} else if nDst+2 > len(dst) {
			err = transform.ErrShortDst
			break
		}
		// In Go, & binds tighter than +, so each output byte is 0x21 plus a
		// codeMask-masked field of the table entry.
		dst[nDst+0] = 0x21 + uint8(r>>codeShift)&codeMask
		dst[nDst+1] = 0x21 + uint8(r)&codeMask
		nDst += 2
		continue

	writeKatakana:
		// Single-byte katakana; r is still the original rune here.
		if *e != katakanaState {
			// Room is needed for the 3-byte escape sequence plus one byte.
			if nDst+4 > len(dst) {
				err = transform.ErrShortDst
				break
			}
			*e = katakanaState
			dst[nDst+0] = asciiEsc
			dst[nDst+1] = '('
			dst[nDst+2] = 'I'
			nDst += 3
		} else if nDst >= len(dst) {
			err = transform.ErrShortDst
			break
		}
		// Maps 0xff61..0xff9f linearly onto 0x21..0x5f.
		dst[nDst] = uint8(r - (0xff61 - 0x21))
		nDst++
		continue
	}
	// At the end of the input, leave the output in the ASCII state. This is
	// skipped when an error is pending.
	if atEOF && err == nil && *e != asciiState {
		if nDst+3 > len(dst) {
			err = transform.ErrShortDst
		} else {
			*e = asciiState
			dst[nDst+0] = asciiEsc
			dst[nDst+1] = '('
			dst[nDst+2] = 'B'
			nDst += 3
		}
	}
	return nDst, nSrc, err
}
300