1 /*
2  * iso2022s.c - support for ISO-2022 subset encodings.
3  */
4 
5 #ifndef ENUM_CHARSETS
6 
7 #include <stdio.h>
8 #include <string.h>
9 #include <assert.h>
10 
11 #include "charset.h"
12 #include "internal.h"
13 #include "sbcsdat.h"
14 
/*
 * ASCII control codes that can introduce an escape sequence in the
 * decoder below: Shift Out, Shift In and Escape.
 */
#define SO (0x0E)
#define SI (0x0F)
#define ESC (0x1B)
18 
/*
 * One entry in an ISO 2022 subset's table of recognised escape
 * sequences: the bytes that make up the sequence, plus a
 * description of what it does to the encoding state.
 */
struct iso2022_escape {
    /* The bytes of the sequence, as a NUL-terminated string. */
    char const *sequence;

    /*
     * Effect of the sequence on the s1 state word: s1 is ANDed with
     * `andbits' and the result is then XORed with `xorbits'.
     */
    unsigned long andbits;
    unsigned long xorbits;

    /*
     * Used on output, to decide which escape sequences must be sent
     * to reach the state we want: `subcharset' is the sub-charset
     * this sequence selects and `container' is where it puts it.
     *
     * `container' should be in the range 0-3. It may also have the
     * RO flag ORed into it, which marks this as not being a
     * preferred container for this charset during output.
     */
    int container;
    int subcharset;
};

/* Flag for `container': not a preferred container on output. */
#define RO 0x80
34 
/*
 * Complete description of one ISO 2022 subset encoding, shared by
 * the generic reader and writer in this file.
 */
struct iso2022 {
    /*
     * The escape sequences this subset supports, and how many there
     * are. The table MUST be sorted in ASCII order: the reader
     * relies on that to narrow down the candidates one input byte
     * at a time.
     */
    const struct iso2022_escape *escapes;
    int nescapes;

    /*
     * Sub-charsets of a subset are numbered from 0 upwards, and
     * nbytes[i] is the number of bytes per character in sub-charset
     * i. (This is a string rather than an int array only because a
     * string literal is more convenient to write in an initialiser.)
     */
    const char *nbytes;

    /*
     * How to reset the encoding state on output. Each character of
     * this string is one plus an index into `escapes' (plus one so
     * that NUL can terminate the list). The escapes are applied in
     * the order given, and any that would leave the state exactly
     * as it was is not actually output.
     */
    const char *reset;

    /*
     * Starting value for the s1 state word, for subsets whose
     * containers should not all begin holding sub-charset 0. The
     * top bit of this value must be set.
     */
    unsigned long s1;

    /*
     * Some subsets _mandate_ an initial shift sequence at the start
     * of the output; if this one does, this is that sequence,
     * otherwise NULL. We do not insist on seeing it on input, but it
     * should still appear in `escapes' so that it is swallowed
     * quietly when it does turn up.
     */
    const char *initial_sequence;

    /*
     * Nonzero if this is an 8-bit ISO 2022 subset.
     */
    int eightbit;

    /*
     * The functions that do the real character translation:
     * `to_ucs' maps the accumulated bytes of one character in a
     * given sub-charset to the value to be emitted, and `from_ucs'
     * finds the sub-charset and bytes for a character, returning
     * zero if it cannot be represented.
     */
    long (*to_ucs)(int subcharset, unsigned long bytes);
    int (*from_ucs)(long ucs, int *subcharset, unsigned long *bytes);
};
89 
/*
 * Discard a partially accumulated character while decoding.
 *
 * Zeroes the accumulated-byte count (bits 27:24 of s1) and the
 * accumulated character data (bits 23:0 of s0), emits an ERROR token
 * in place of the incomplete character, and then clears bits 30:29
 * of s1 so that we drop out of an SS2/SS3 container (or the 8-bit
 * pseudo-container 2) back to plain SI/SO.
 */
static void iso2022s_discard_partial(charset_state *state,
                                     void (*emit)(void *ctx, long int output),
                                     void *emitctx)
{
    state->s1 &= ~0x0F000000;
    state->s0 &= 0xFF000000;
    emit(emitctx, ERROR);
    if (state->s1 & 0x60000000)
        state->s1 &= 0x9FFFFFFF;
}

/*
 * Generic decoder for ISO 2022 subsets: process one byte of input.
 *
 *  - `charset': its `data' member must point at the struct iso2022
 *    describing the subset being decoded.
 *  - `input_chr': the next byte of encoded input.
 *  - `state': decoding state carried between calls. An s1 with its
 *    top bit clear is taken to mean "not yet initialised" and is
 *    replaced by the subset's initial s1.
 *  - `emit', `emitctx': callback invoked for every output value.
 *    The values emitted are: the result of the subset's to_ucs
 *    function for each completed character; control characters,
 *    verbatim; the bytes of an escape sequence that turned out not
 *    to be one we recognise, verbatim; and ERROR for a character
 *    that was cut short.
 */
static void read_iso2022s(charset_spec const *charset, long int input_chr,
                          charset_state *state,
                          void (*emit)(void *ctx, long int output),
                          void *emitctx)
{
    struct iso2022 const *iso = (struct iso2022 const *)charset->data;

    /*
     * For reading ISO-2022 subsets, we divide up our state
     * variables as follows:
     *
     *  - The top byte of s0 (bits 31:24) indicates, if nonzero,
     *    that we are part-way through a recognised ISO-2022 escape
     *    sequence. Five of those bits (31:27) give the index of
     *    the first member of the escapes list matching what we
     *    have so far; the remaining three (26:24) give the number
     *    of characters we have seen so far.
     *
     *  - The top bit of s1 (bit 31) is non-zero at all times, to
     *    indicate that we have performed any necessary
     *    initialisation. When we start, we detect a zero s1 and
     *    respond to it by initialising the default container
     *    contents.
     *
     *  - The next three bits of s1 (bits 30:28) indicate which
     *    _container_ is currently selected. This isn't quite as
     *    simple as it sounds, since we have to preserve memory of
     *    which of the SI/SO containers we came from when we're
     *    temporarily in SS2/SS3. Hence, what happens is:
     *     + bit 28 indicates SI/SO.
     *     + if we're in an SS2/SS3 container, that's indicated by
     *       the two bits above that being nonzero and holding
     *       either 2 or 3.
     *     + Hence: 0 is SI, 1 is SO, 4 is SS2-from-SI, 5 is
     *       SS2-from-SO, 6 is SS3-from-SI, 7 is SS3-from-SO.
     *     + For added fun: in an _8-bit_ ISO 2022 subset, we have
     *       the further special value 2, which means that we're
     *       theoretically in SI but the current character being
     *       accumulated is composed of 8-bit characters and will
     *       therefore be interpreted as if in SO.
     *
     *  - The next nibble of s1 (27:24) indicates how many bytes
     *    have been accumulated in the current character.
     *
     *  - The remaining three bytes of s1 are divided into four
     *    six-bit sections, and each section gives the current
     *    sub-charset selected in one of the possible containers.
     *    (Those containers are SI, SO, SS2 and SS3, respectively
     *    and in order from the bottom of s1 to the top.)
     *
     *  - The bottom 24 bits of s0 give the accumulated character
     *    data so far.
     *
     * (Note that this means s1 contains all the parts of the state
     * which might need to be operated on by escape sequences.
     * Cunning, eh?)
     */

    if (!(state->s1 & 0x80000000)) {
        state->s1 = iso->s1;
    }

    /*
     * So. Firstly, we process escape sequences, if we're in the
     * middle of one or if we see a possible introducer (SI, SO,
     * ESC).
     */
    if ((state->s0 >> 24) ||
        (input_chr == SO || input_chr == SI || input_chr == ESC)) {
        int n = (state->s0 >> 24) & 7, i = (state->s0 >> 27), oi = i, j;

        /*
         * If this is the start of an escape sequence, we might be
         * in mid-character. If so, clear the character state and
         * emit an error token for the incomplete character.
         */
        if (state->s1 & 0x0F000000)
            iso2022s_discard_partial(state, emit, emitctx);

        /*
         * Walk forward from the first candidate `oi' through the
         * entries sharing the n-byte prefix seen so far, looking
         * for the first whose next byte is not less than input_chr.
         * This is where the table's ASCII ordering is relied upon.
         */
        j = i;
        while (j < iso->nescapes &&
               !memcmp(iso->escapes[j].sequence,
                       iso->escapes[oi].sequence, n)) {
            if (iso->escapes[j].sequence[n] < input_chr)
                i = ++j;
            else
                break;
        }
        if (i >= iso->nescapes ||
            memcmp(iso->escapes[i].sequence,
                   iso->escapes[oi].sequence, n) ||
            iso->escapes[i].sequence[n] != input_chr) {
            /*
             * This character does not appear in any valid escape
             * sequence. Therefore, we must emit all the characters
             * we had previously swallowed, plus this one, and
             * return to non-escape-sequence state.
             */
            for (j = 0; j < n; j++)
                emit(emitctx, iso->escapes[oi].sequence[j]);
            emit(emitctx, input_chr);
            state->s0 = 0;
            return;
        }

        /*
         * Otherwise, we have found an additional character in our
         * escape sequence. See if we have reached the _end_ of our
         * sequence (and therefore must process the sequence).
         */
        n++;
        if (!iso->escapes[i].sequence[n]) {
            state->s0 = 0;
            state->s1 &= iso->escapes[i].andbits;
            state->s1 ^= iso->escapes[i].xorbits;
            return;
        }

        /*
         * Failing _that_, we simply update our escape-sequence-
         * tracking state.
         *
         * The shifts are done in unsigned long: i may legitimately
         * be as large as 31, and shifting an int that far left
         * would overflow it, which is undefined behaviour.
         */
        assert(i < 32 && n < 8);
        state->s0 = ((unsigned long)i << 27) | ((unsigned long)n << 24);
        return;
    }

    /*
     * If this isn't an escape sequence, it must be part of a
     * character. One possibility is that it's a control character
     * (00-20 or 7F-9F; also in non-8-bit ISO 2022 subsets I'm
     * going to treat all top-half characters as controls), in
     * which case we output it verbatim.
     */
    if (input_chr < 0x21 ||
        (input_chr > 0x7E && (!iso->eightbit || input_chr < 0xA0))) {
        /*
         * We might be in mid-multibyte-character. If so, clear the
         * character state and emit an error token for the
         * incomplete character.
         */
        if (state->s1 & 0x0F000000)
            iso2022s_discard_partial(state, emit, emitctx);

        emit(emitctx, input_chr);
        return;
    }

    /*
     * Otherwise, accumulate character data.
     */
    {
        unsigned long chr;
        int chrlen, cont, subcharset, bytes;

        /*
         * Verify that we've seen the right kind of character for
         * what we're currently doing. This only matters in 8-bit
         * subsets.
         */
        if (iso->eightbit) {
            cont = (state->s1 >> 28) & 7;
            /*
             * If cont==0, we're entitled to see either GL or GR
             * characters. If cont==2, we expect only GR; otherwise
             * we expect only GL.
             *
             * If we see a GR character while cont==0, we set
             * cont=2 immediately.
             */
            if ((cont == 2 && !(input_chr & 0x80)) ||
                (cont != 0 && cont != 2 && (input_chr & 0x80))) {
                /*
                 * Clear the previous character; it was prematurely
                 * terminated by this error.
                 */
                iso2022s_discard_partial(state, emit, emitctx);
            }

            if (cont == 0 && (input_chr & 0x80)) {
                state->s1 |= 0x20000000;
            }
        }

        /* The current character and its length. */
        chr = ((state->s0 & 0x00FFFFFF) << 8) | (input_chr & 0x7F);
        chrlen = ((state->s1 >> 24) & 0xF) + 1;
        /*
         * The current sub-charset. Container values above 1 (the
         * 8-bit special value 2, and 4-7 for SS2/SS3) are halved to
         * give the index of the six-bit field to read: 1, 2 or 3.
         */
        cont = (state->s1 >> 28) & 7;
        if (cont > 1) cont >>= 1;
        subcharset = (state->s1 >> (6*cont)) & 0x3F;
        /* The number of bytes-per-character in that sub-charset. */
        bytes = iso->nbytes[subcharset];

        /*
         * If this character is now complete, we convert and emit
         * it. Otherwise, we simply update the state and return.
         */
        if (chrlen >= bytes) {
            emit(emitctx, iso->to_ucs(subcharset, chr));
            chr = chrlen = 0;
            /*
             * If we were in the SS2 or SS3 container, we
             * automatically exit it.
             */
            if (state->s1 & 0x60000000)
                state->s1 &= 0x9FFFFFFF;
        }
        state->s0 = (state->s0 & 0xFF000000) | chr;
        state->s1 = (state->s1 & 0xF0FFFFFF) | (chrlen << 24);
    }
}
328 
/*
 * Generic encoder for ISO 2022 subsets: output one character.
 *
 *  - `charset': its `data' member must point at the struct iso2022
 *    describing the subset being encoded.
 *  - `input_chr': the character to encode, or -1 to ask for the
 *    encoding state to be reset (the subset's `reset' escapes are
 *    output as needed and no character is written).
 *  - `state': encoding state carried between calls. An s1 with its
 *    top bit clear is taken to mean "not yet initialised": it is
 *    replaced by the subset's initial s1 and the subset's mandatory
 *    initial sequence, if any, is output.
 *  - `emit', `emitctx': callback invoked once per output byte.
 *
 * Returns TRUE on success. Returns FALSE, having output nothing and
 * left the state alone, if the subset's from_ucs function cannot
 * represent the character or if input_chr is a negative value other
 * than -1.
 */
static int write_iso2022s(charset_spec const *charset, long int input_chr,
                          charset_state *state,
                          void (*emit)(void *ctx, long int output),
                          void *emitctx)
{
    struct iso2022 const *iso = (struct iso2022 const *)charset->data;
    int subcharset = 0, len, i, j, cont, topbit = 0;
    unsigned long bytes = 0;

    /*
     * For output, our s1 state variable contains most of the same
     * stuff as it did for input - initial-state indicator bit,
     * current container, and current subcharset selected in each
     * container.
     */

    /*
     * The only negative input with a meaning is -1. Anything below
     * that would skip from_ucs and then fall through to the encoding
     * code with `subcharset' and `bytes' never having been set, so
     * refuse it here.
     */
    if (input_chr < -1)
        return FALSE;

    /*
     * Analyse the character and find out what subcharset it needs
     * to go in.
     */
    if (input_chr >= 0 && !iso->from_ucs(input_chr, &subcharset, &bytes))
        return FALSE;

    if (!(state->s1 & 0x80000000)) {
        state->s1 = iso->s1;
        if (iso->initial_sequence)
            for (i = 0; iso->initial_sequence[i]; i++)
                emit(emitctx, iso->initial_sequence[i]);
    }

    if (input_chr == -1) {
        unsigned long oldstate;
        int k;

        /*
         * Special case: reset encoding state. Apply each escape
         * named in `reset' to the state, and output it only if
         * doing so changed anything.
         */
        for (i = 0; iso->reset[i]; i++) {
            j = iso->reset[i] - 1;
            oldstate = state->s1;
            state->s1 &= iso->escapes[j].andbits;
            state->s1 ^= iso->escapes[j].xorbits;
            if (state->s1 != oldstate) {
                /* We must actually emit this sequence. */
                for (k = 0; iso->escapes[j].sequence[k]; k++)
                    emit(emitctx, iso->escapes[j].sequence[k]);
            }
        }

        return TRUE;
    }

    /*
     * Now begins the fun. We now know what subcharset we want. So
     * we must find out which container we should select it into,
     * select it into it if necessary, select that _container_ if
     * necessary, and then output the given bytes.
     *
     * We take the first escape that selects our subcharset and is
     * not flagged RO.
     */
    for (i = 0; i < iso->nescapes; i++)
        if (iso->escapes[i].subcharset == subcharset &&
            !(iso->escapes[i].container & RO))
            break;
    assert(i < iso->nescapes);

    /*
     * We've found the escape sequence which would select this
     * subcharset into a container. However, that subcharset might
     * already _be_ selected in that container! Check before we go
     * to the effort of emitting the sequence.
     */
    cont = iso->escapes[i].container &~ RO;
    if (((state->s1 >> (6*cont)) & 0x3F) != (unsigned)subcharset) {
        for (j = 0; iso->escapes[i].sequence[j]; j++)
            emit(emitctx, iso->escapes[i].sequence[j]);
        state->s1 &= iso->escapes[i].andbits;
        state->s1 ^= iso->escapes[i].xorbits;
    }

    /*
     * Now we know what container our subcharset is in, so we want
     * to select that container.
     */
    if (cont > 1) {
        /* SS2 or SS3; just output the sequence and be done. */
        emit(emitctx, ESC);
        emit(emitctx, 'L' + cont);     /* comes out to 'N' or 'O' */
    } else {
        /*
         * Emit SI or SO, but only if the current container isn't already
         * the right one.
         *
         * Also, in an 8-bit subset, we need not do this; we'll
         * just use 8-bit characters to output SO-container
         * characters.
         */
        if (iso->eightbit && cont == 1 && ((state->s1 >> 28) & 7) == 0) {
            topbit = 0x80;
        } else if (((state->s1 >> 28) & 7) != (unsigned)cont) {
            emit(emitctx, cont ? SO : SI);
            state->s1 = (state->s1 & 0x8FFFFFFF) | (cont << 28);
        }
    }

    /*
     * We're done. Subcharset is selected in container, container
     * is selected. All we need now is to write out the bytes, most
     * significant first.
     */
    len = iso->nbytes[subcharset];
    while (len--)
        emit(emitctx, ((bytes >> (8*len)) & 0xFF) | topbit);

    return TRUE;
}
442 
443 /*
444  * ISO-2022-JP, defined in RFC 1468.
445  */
iso2022jp_to_ucs(int subcharset,unsigned long bytes)446 static long int iso2022jp_to_ucs(int subcharset, unsigned long bytes)
447 {
448     switch (subcharset) {
449       case 1:			       /* JIS X 0201 bottom half */
450 	if (bytes == 0x5C)
451 	    return 0xA5;
452 	else if (bytes == 0x7E)
453 	    return 0x203E;
454 	/* else fall through to ASCII */
455       case 0: return bytes;	       /* one-byte ASCII */
456 	/* (no break needed since all control paths have returned) */
457       case 2: return jisx0208_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
458 					 ((bytes     ) & 0xFF) - 0x21);
459       default: return ERROR;
460     }
461 }
/*
 * Work out how to represent `ucs' in ISO-2022-JP.
 *
 * On success, stores the sub-charset index in *subcharset and the
 * encoded byte(s) in *bytes (first byte in the higher-order
 * position for the two-byte sub-charset) and returns 1. Returns 0,
 * leaving both outputs untouched, if the character is not available
 * in any of the three sub-charsets.
 */
static int iso2022jp_from_ucs(long int ucs, int *subcharset,
                              unsigned long *bytes)
{
    int row, col;

    /* Anything below 0x80 goes out as ASCII. */
    if (ucs < 0x80) {
        *subcharset = 0;
        *bytes = ucs;
        return 1;
    }

    /* The two characters special to the bottom half of JIS X 0201. */
    if (ucs == 0xA5) {
        *subcharset = 1;
        *bytes = 0x5C;
        return 1;
    }
    if (ucs == 0x203E) {
        *subcharset = 1;
        *bytes = 0x7E;
        return 1;
    }

    /* Everything else has to be in JIS X 0208, or we give up. */
    if (!unicode_to_jisx0208(ucs, &row, &col))
        return 0;

    *subcharset = 2;
    *bytes = ((row+0x21) << 8) | (col+0x21);
    return 1;
}
482 static const struct iso2022_escape iso2022jp_escapes[] = {
483     {"\033$@", 0xFFFFFFC0, 0x00000002, -1, -1},   /* we ignore this one */
484     {"\033$B", 0xFFFFFFC0, 0x00000002, 0, 2},
485     {"\033(B", 0xFFFFFFC0, 0x00000000, 0, 0},
486     {"\033(J", 0xFFFFFFC0, 0x00000001, 0, 1},
487 };
488 static const struct iso2022 iso2022jp = {
489     iso2022jp_escapes, lenof(iso2022jp_escapes),
490     "\1\1\2", "\3", 0x80000000, NULL, FALSE,
491     iso2022jp_to_ucs, iso2022jp_from_ucs
492 };
493 const charset_spec charset_CS_ISO2022_JP = {
494     CS_ISO2022_JP, read_iso2022s, write_iso2022s, &iso2022jp
495 };
496 
497 /*
498  * ISO-2022-KR, defined in RFC 1557.
499  */
iso2022kr_to_ucs(int subcharset,unsigned long bytes)500 static long int iso2022kr_to_ucs(int subcharset, unsigned long bytes)
501 {
502     switch (subcharset) {
503       case 0: return bytes;	       /* one-byte ASCII */
504       case 1: return ksx1001_to_unicode(((bytes >> 8) & 0xFF) - 0x21,
505 					((bytes     ) & 0xFF) - 0x21);
506       default: return ERROR;
507     }
508 }
/*
 * Work out how to represent `ucs' in ISO-2022-KR.
 *
 * On success, stores the sub-charset index in *subcharset and the
 * encoded byte(s) in *bytes (first byte in the higher-order
 * position for the two-byte sub-charset) and returns 1. Returns 0,
 * leaving both outputs untouched, if the character is neither below
 * 0x80 nor known to unicode_to_ksx1001().
 */
static int iso2022kr_from_ucs(long int ucs, int *subcharset,
                              unsigned long *bytes)
{
    int row, col;

    /* Anything below 0x80 goes out as ASCII. */
    if (ucs < 0x80) {
        *subcharset = 0;
        *bytes = ucs;
        return 1;
    }

    /* Everything else has to be in KS X 1001, or we give up. */
    if (!unicode_to_ksx1001(ucs, &row, &col))
        return 0;

    *subcharset = 1;
    *bytes = ((row+0x21) << 8) | (col+0x21);
    return 1;
}
525 static const struct iso2022_escape iso2022kr_escapes[] = {
526     {"\016", 0x8FFFFFFF, 0x10000000, -1, -1},
527     {"\017", 0x8FFFFFFF, 0x00000000, 0, 0},
528     {"\033$)C", 0xFFFFF03F, 0x00000040, 1, 1},   /* bits[11:6] <- 1 */
529 };
530 static const struct iso2022 iso2022kr = {
531     iso2022kr_escapes, lenof(iso2022kr_escapes),
532     "\1\2", "\2", 0x80000040, "\033$)C", FALSE,
533     iso2022kr_to_ucs, iso2022kr_from_ucs
534 };
535 const charset_spec charset_CS_ISO2022_KR = {
536     CS_ISO2022_KR, read_iso2022s, write_iso2022s, &iso2022kr
537 };
538 
539 #else /* ENUM_CHARSETS */
540 
/*
 * With ENUM_CHARSETS defined, none of the code above is compiled
 * and this file reduces to the list of charset identifiers it
 * implements. ENUM_CHARSET is not defined in this file; presumably
 * the including code supplies it.
 */
ENUM_CHARSET(CS_ISO2022_JP)
ENUM_CHARSET(CS_ISO2022_KR)
543 
544 #endif /* ENUM_CHARSETS */
545