1 /*
2 * Copyright (c) 1987, Fujitsu LTD. (Itaru ICHIKAWA).
3 * Copyright (c) 1996-2018, The nkf Project.
4 *
5 * This software is provided 'as-is', without any express or implied
6 * warranty. In no event will the authors be held liable for any damages
7 * arising from the use of this software.
8 *
9 * Permission is granted to anyone to use this software for any purpose,
10 * including commercial applications, and to alter it and redistribute it
11 * freely, subject to the following restrictions:
12 *
13 * 1. The origin of this software must not be misrepresented; you must not
14 * claim that you wrote the original software. If you use this software
15 * in a product, an acknowledgment in the product documentation would be
16 * appreciated but is not required.
17 *
18 * 2. Altered source versions must be plainly marked as such, and must not be
19 * misrepresented as being the original software.
20 *
21 * 3. This notice may not be removed or altered from any source distribution.
22 */
23 #define NKF_VERSION "2.1.5"
24 #define NKF_RELEASE_DATE "2018-12-15"
25 #define COPY_RIGHT \
26 "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa).\n" \
27 "Copyright (C) 1996-2018, The nkf Project."
28
29 #include "config.h"
30 #include "nkf.h"
31 #include "utf8tbl.h"
32 #ifdef __WIN32__
33 #include <windows.h>
34 #include <locale.h>
35 #endif
36 #if defined(__OS2__)
37 # define INCL_DOS
38 # define INCL_DOSERRORS
39 # include <os2.h>
40 #endif
41 #include <assert.h>
42
43
44 /* state of output_mode and input_mode
45
46 c2 0 means ASCII
47 JIS_X_0201_1976_K
48 ISO_8859_1
49 JIS_X_0208
50 EOF all termination
51 c1 32bit data
52
53 */
54
55 /* MIME ENCODE */
56
57 #define FIXED_MIME 7
58 #define STRICT_MIME 8
59
60 /* byte order */
61 enum byte_order {
62 ENDIAN_BIG = 1,
63 ENDIAN_LITTLE = 2,
64 ENDIAN_2143 = 3,
65 ENDIAN_3412 = 4
66 };
67
68 /* ASCII CODE */
69
70 #define BS 0x08
71 #define TAB 0x09
72 #define LF 0x0a
73 #define CR 0x0d
74 #define ESC 0x1b
75 #define SP 0x20
76 #define DEL 0x7f
77 #define SI 0x0f
78 #define SO 0x0e
79 #define SS2 0x8e
80 #define SS3 0x8f
81 #define CRLF 0x0D0A
82
83
84 /* encodings */
85
86 enum nkf_encodings {
87 ASCII,
88 ISO_8859_1,
89 ISO_2022_JP,
90 CP50220,
91 CP50221,
92 CP50222,
93 ISO_2022_JP_1,
94 ISO_2022_JP_3,
95 ISO_2022_JP_2004,
96 SHIFT_JIS,
97 WINDOWS_31J,
98 CP10001,
99 EUC_JP,
100 EUCJP_NKF,
101 CP51932,
102 EUCJP_MS,
103 EUCJP_ASCII,
104 SHIFT_JISX0213,
105 SHIFT_JIS_2004,
106 EUC_JISX0213,
107 EUC_JIS_2004,
108 UTF_8,
109 UTF_8N,
110 UTF_8_BOM,
111 UTF8_MAC,
112 UTF_16,
113 UTF_16BE,
114 UTF_16BE_BOM,
115 UTF_16LE,
116 UTF_16LE_BOM,
117 UTF_32,
118 UTF_32BE,
119 UTF_32BE_BOM,
120 UTF_32LE,
121 UTF_32LE_BOM,
122 BINARY,
123 NKF_ENCODING_TABLE_SIZE,
124 JIS_X_0201_1976_K = 0x1013, /* I */ /* JIS C 6220-1969 */
125 /* JIS_X_0201_1976_R = 0x1014, */ /* J */ /* JIS C 6220-1969 */
126 /* JIS_X_0208_1978 = 0x1040, */ /* @ */ /* JIS C 6226-1978 */
127 /* JIS_X_0208_1983 = 0x1087, */ /* B */ /* JIS C 6226-1983 */
128 JIS_X_0208 = 0x1168, /* @B */
129 JIS_X_0212 = 0x1159, /* D */
130 /* JIS_X_0213_2000_1 = 0x1228, */ /* O */
131 JIS_X_0213_2 = 0x1229, /* P */
132 JIS_X_0213_1 = 0x1233 /* Q */
133 };
134
135 static nkf_char s_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
136 static nkf_char e_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
137 static nkf_char w_iconv(nkf_char c2, nkf_char c1, nkf_char c0);
138 static nkf_char w_iconv16(nkf_char c2, nkf_char c1, nkf_char c0);
139 static nkf_char w_iconv32(nkf_char c2, nkf_char c1, nkf_char c0);
140 static void j_oconv(nkf_char c2, nkf_char c1);
141 static void s_oconv(nkf_char c2, nkf_char c1);
142 static void e_oconv(nkf_char c2, nkf_char c1);
143 static void w_oconv(nkf_char c2, nkf_char c1);
144 static void w_oconv16(nkf_char c2, nkf_char c1);
145 static void w_oconv32(nkf_char c2, nkf_char c1);
146
147 typedef struct {
148 const char *name;
149 nkf_char (*iconv)(nkf_char c2, nkf_char c1, nkf_char c0);
150 void (*oconv)(nkf_char c2, nkf_char c1);
151 } nkf_native_encoding;
152
153 nkf_native_encoding NkfEncodingASCII = { "ASCII", e_iconv, e_oconv };
154 nkf_native_encoding NkfEncodingISO_2022_JP = { "ISO-2022-JP", e_iconv, j_oconv };
155 nkf_native_encoding NkfEncodingShift_JIS = { "Shift_JIS", s_iconv, s_oconv };
156 nkf_native_encoding NkfEncodingEUC_JP = { "EUC-JP", e_iconv, e_oconv };
157 nkf_native_encoding NkfEncodingUTF_8 = { "UTF-8", w_iconv, w_oconv };
158 nkf_native_encoding NkfEncodingUTF_16 = { "UTF-16", w_iconv16, w_oconv16 };
159 nkf_native_encoding NkfEncodingUTF_32 = { "UTF-32", w_iconv32, w_oconv32 };
160
161 typedef struct {
162 const int id;
163 const char *name;
164 const nkf_native_encoding *base_encoding;
165 } nkf_encoding;
166
167 nkf_encoding nkf_encoding_table[] = {
168 {ASCII, "US-ASCII", &NkfEncodingASCII},
169 {ISO_8859_1, "ISO-8859-1", &NkfEncodingASCII},
170 {ISO_2022_JP, "ISO-2022-JP", &NkfEncodingISO_2022_JP},
171 {CP50220, "CP50220", &NkfEncodingISO_2022_JP},
172 {CP50221, "CP50221", &NkfEncodingISO_2022_JP},
173 {CP50222, "CP50222", &NkfEncodingISO_2022_JP},
174 {ISO_2022_JP_1, "ISO-2022-JP-1", &NkfEncodingISO_2022_JP},
175 {ISO_2022_JP_3, "ISO-2022-JP-3", &NkfEncodingISO_2022_JP},
176 {ISO_2022_JP_2004, "ISO-2022-JP-2004", &NkfEncodingISO_2022_JP},
177 {SHIFT_JIS, "Shift_JIS", &NkfEncodingShift_JIS},
178 {WINDOWS_31J, "Windows-31J", &NkfEncodingShift_JIS},
179 {CP10001, "CP10001", &NkfEncodingShift_JIS},
180 {EUC_JP, "EUC-JP", &NkfEncodingEUC_JP},
181 {EUCJP_NKF, "eucJP-nkf", &NkfEncodingEUC_JP},
182 {CP51932, "CP51932", &NkfEncodingEUC_JP},
183 {EUCJP_MS, "eucJP-MS", &NkfEncodingEUC_JP},
184 {EUCJP_ASCII, "eucJP-ASCII", &NkfEncodingEUC_JP},
185 {SHIFT_JISX0213, "Shift_JISX0213", &NkfEncodingShift_JIS},
186 {SHIFT_JIS_2004, "Shift_JIS-2004", &NkfEncodingShift_JIS},
187 {EUC_JISX0213, "EUC-JISX0213", &NkfEncodingEUC_JP},
188 {EUC_JIS_2004, "EUC-JIS-2004", &NkfEncodingEUC_JP},
189 {UTF_8, "UTF-8", &NkfEncodingUTF_8},
190 {UTF_8N, "UTF-8N", &NkfEncodingUTF_8},
191 {UTF_8_BOM, "UTF-8-BOM", &NkfEncodingUTF_8},
192 {UTF8_MAC, "UTF8-MAC", &NkfEncodingUTF_8},
193 {UTF_16, "UTF-16", &NkfEncodingUTF_16},
194 {UTF_16BE, "UTF-16BE", &NkfEncodingUTF_16},
195 {UTF_16BE_BOM, "UTF-16BE-BOM", &NkfEncodingUTF_16},
196 {UTF_16LE, "UTF-16LE", &NkfEncodingUTF_16},
197 {UTF_16LE_BOM, "UTF-16LE-BOM", &NkfEncodingUTF_16},
198 {UTF_32, "UTF-32", &NkfEncodingUTF_32},
199 {UTF_32BE, "UTF-32BE", &NkfEncodingUTF_32},
200 {UTF_32BE_BOM, "UTF-32BE-BOM", &NkfEncodingUTF_32},
201 {UTF_32LE, "UTF-32LE", &NkfEncodingUTF_32},
202 {UTF_32LE_BOM, "UTF-32LE-BOM", &NkfEncodingUTF_32},
203 {BINARY, "BINARY", &NkfEncodingASCII},
204 {-1, NULL, NULL}
205 };
206
207 struct {
208 const char *name;
209 const int id;
210 } encoding_name_to_id_table[] = {
211 {"US-ASCII", ASCII},
212 {"ASCII", ASCII},
213 {"646", ASCII},
214 {"ROMAN8", ASCII},
215 {"ISO-2022-JP", ISO_2022_JP},
216 {"ISO2022JP-CP932", CP50220},
217 {"CP50220", CP50220},
218 {"CP50221", CP50221},
219 {"CSISO2022JP", CP50221},
220 {"CP50222", CP50222},
221 {"ISO-2022-JP-1", ISO_2022_JP_1},
222 {"ISO-2022-JP-3", ISO_2022_JP_3},
223 {"ISO-2022-JP-2004", ISO_2022_JP_2004},
224 {"SHIFT_JIS", SHIFT_JIS},
225 {"SJIS", SHIFT_JIS},
226 {"MS_Kanji", SHIFT_JIS},
227 {"PCK", SHIFT_JIS},
228 {"WINDOWS-31J", WINDOWS_31J},
229 {"CSWINDOWS31J", WINDOWS_31J},
230 {"CP932", WINDOWS_31J},
231 {"MS932", WINDOWS_31J},
232 {"CP10001", CP10001},
233 {"EUCJP", EUC_JP},
234 {"EUC-JP", EUC_JP},
235 {"EUCJP-NKF", EUCJP_NKF},
236 {"CP51932", CP51932},
237 {"EUC-JP-MS", EUCJP_MS},
238 {"EUCJP-MS", EUCJP_MS},
239 {"EUCJPMS", EUCJP_MS},
240 {"EUC-JP-ASCII", EUCJP_ASCII},
241 {"EUCJP-ASCII", EUCJP_ASCII},
242 {"SHIFT_JISX0213", SHIFT_JISX0213},
243 {"SHIFT_JIS-2004", SHIFT_JIS_2004},
244 {"EUC-JISX0213", EUC_JISX0213},
245 {"EUC-JIS-2004", EUC_JIS_2004},
246 {"UTF-8", UTF_8},
247 {"UTF-8N", UTF_8N},
248 {"UTF-8-BOM", UTF_8_BOM},
249 {"UTF8-MAC", UTF8_MAC},
250 {"UTF-8-MAC", UTF8_MAC},
251 {"UTF-16", UTF_16},
252 {"UTF-16BE", UTF_16BE},
253 {"UTF-16BE-BOM", UTF_16BE_BOM},
254 {"UTF-16LE", UTF_16LE},
255 {"UTF-16LE-BOM", UTF_16LE_BOM},
256 {"UTF-32", UTF_32},
257 {"UTF-32BE", UTF_32BE},
258 {"UTF-32BE-BOM", UTF_32BE_BOM},
259 {"UTF-32LE", UTF_32LE},
260 {"UTF-32LE-BOM", UTF_32LE_BOM},
261 {"BINARY", BINARY},
262 {NULL, -1}
263 };
264
265 #if defined(DEFAULT_CODE_JIS)
266 #define DEFAULT_ENCIDX ISO_2022_JP
267 #elif defined(DEFAULT_CODE_SJIS)
268 #define DEFAULT_ENCIDX SHIFT_JIS
269 #elif defined(DEFAULT_CODE_WINDOWS_31J)
270 #define DEFAULT_ENCIDX WINDOWS_31J
271 #elif defined(DEFAULT_CODE_EUC)
272 #define DEFAULT_ENCIDX EUC_JP
273 #elif defined(DEFAULT_CODE_UTF8)
274 #define DEFAULT_ENCIDX UTF_8
275 #endif
276
277
/*
 * ASCII-only character classification and conversion helpers.
 *
 * These are locale-independent replacements for the <ctype.h> family.
 * Every parameter is parenthesized so that expression arguments
 * (e.g. `a | b`, `x ? y : z`) are evaluated as a unit.  Note that most
 * of them still evaluate their argument more than once, so do not pass
 * expressions with side effects.
 */
#define is_alnum(c) \
    (('a'<=(c) && (c)<='z')||('A'<=(c) && (c)<='Z')||('0'<=(c) && (c)<='9'))

/* I don't trust portability of toupper */
#define nkf_toupper(c) (('a'<=(c) && (c)<='z')?((c)-('a'-'A')):(c))
#define nkf_isoctal(c) ('0'<=(c) && (c)<='7')
#define nkf_isdigit(c) ('0'<=(c) && (c)<='9')
#define nkf_isxdigit(c) (nkf_isdigit(c) || ('a'<=(c) && (c)<='f') || ('A'<=(c) && (c)<='F'))
#define nkf_isblank(c) ((c) == SP || (c) == TAB)
#define nkf_isspace(c) (nkf_isblank(c) || (c) == CR || (c) == LF)
#define nkf_isalpha(c) (('a' <= (c) && (c) <= 'z') || ('A' <= (c) && (c) <= 'Z'))
#define nkf_isalnum(c) (nkf_isdigit(c) || nkf_isalpha(c))
#define nkf_isprint(c) (SP<=(c) && (c)<='~')
#define nkf_isgraph(c) ('!'<=(c) && (c)<='~')
/* value of a hexadecimal digit character; 0 for anything that is not one */
#define hex2bin(c) (('0'<=(c)&&(c)<='9') ? ((c)-'0') : \
                    ('A'<=(c)&&(c)<='F') ? ((c)-'A'+10) : \
                    ('a'<=(c)&&(c)<='f') ? ((c)-'a'+10) : 0)
/* upper-case hexadecimal digit for the low four bits of c */
#define bin2hex(c) ("0123456789ABCDEF"[(c)&15])
/* true when the second-lowest byte of c2 is the SS3 prefix */
#define is_eucg3(c2) (((unsigned short)(c2) >> 8) == SS3)
/* CR, LF, or a graphic ASCII character other than ? = _ ( ) . and '"' */
#define nkf_noescape_mime(c) (((c) == CR) || ((c) == LF) || \
    (((c) > SP) && ((c) < DEL) && ((c) != '?') && ((c) != '=') && ((c) != '_') \
     && ((c) != '(') && ((c) != ')') && ((c) != '.') && ((c) != 0x22)))

#define is_ibmext_in_sjis(c2) (CP932_TABLE_BEGIN <= (c2) && (c2) <= CP932_TABLE_END)
#define nkf_byte_jisx0201_katakana_p(c) (SP <= (c) && (c) <= 0x5F)
303
304 #define HOLD_SIZE 1024
305 #if defined(INT_IS_SHORT)
306 #define IOBUF_SIZE 2048
307 #else
308 #define IOBUF_SIZE 16384
309 #endif
310
311 #define DEFAULT_J 'B'
312 #define DEFAULT_R 'B'
313
314
315 #define GETA1 0x22
316 #define GETA2 0x2e
317
318
319 /* MIME preprocessor */
320
321 #ifdef EASYWIN /*Easy Win */
322 extern POINT _BufferSize;
323 #endif
324
325 struct input_code{
326 const char *name;
327 nkf_char stat;
328 nkf_char score;
329 nkf_char index;
330 nkf_char buf[3];
331 void (*status_func)(struct input_code *, nkf_char);
332 nkf_char (*iconv_func)(nkf_char c2, nkf_char c1, nkf_char c0);
333 int _file_stat;
334 };
335
336 static const char *input_codename = NULL; /* NULL: unestablished, "": BINARY */
337 static nkf_encoding *input_encoding = NULL;
338 static nkf_encoding *output_encoding = NULL;
339
340 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
341 /* UCS Mapping
342 * 0: Shift_JIS, eucJP-ascii
343 * 1: eucJP-ms
344 * 2: CP932, CP51932
345 * 3: CP10001
346 */
347 #define UCS_MAP_ASCII 0
348 #define UCS_MAP_MS 1
349 #define UCS_MAP_CP932 2
350 #define UCS_MAP_CP10001 3
351 static int ms_ucs_map_f = UCS_MAP_ASCII;
352 #endif
353 #ifdef UTF8_INPUT_ENABLE
354 /* no NEC special, NEC-selected IBM extended and IBM extended characters */
355 static int no_cp932ext_f = FALSE;
356 /* ignore ZERO WIDTH NO-BREAK SPACE */
357 static int no_best_fit_chars_f = FALSE;
358 static int input_endian = ENDIAN_BIG;
359 static int input_bom_f = FALSE;
360 static nkf_char unicode_subchar = '?'; /* the regular substitution character */
361 static void (*encode_fallback)(nkf_char c) = NULL;
362 static void w_status(struct input_code *, nkf_char);
363 #endif
364 #ifdef UTF8_OUTPUT_ENABLE
365 static int output_bom_f = FALSE;
366 static int output_endian = ENDIAN_BIG;
367 #endif
368
369 static void std_putc(nkf_char c);
370 static nkf_char std_getc(FILE *f);
371 static nkf_char std_ungetc(nkf_char c,FILE *f);
372
373 static nkf_char broken_getc(FILE *f);
374 static nkf_char broken_ungetc(nkf_char c,FILE *f);
375
376 static nkf_char mime_getc(FILE *f);
377
378 static void mime_putc(nkf_char c);
379
380 /* buffers */
381
382 #if !defined(PERL_XS) && !defined(WIN32DLL)
383 static unsigned char stdibuf[IOBUF_SIZE];
384 static unsigned char stdobuf[IOBUF_SIZE];
385 #endif
386
387 #define NKF_UNSPECIFIED (-TRUE)
388
389 /* flags */
390 static int unbuf_f = FALSE;
391 static int estab_f = FALSE;
392 static int nop_f = FALSE;
393 static int binmode_f = TRUE; /* binary mode */
394 static int rot_f = FALSE; /* rot14/43 mode */
395 static int hira_f = FALSE; /* hira/kata henkan */
396 static int alpha_f = FALSE; /* convert JIx0208 alphbet to ASCII */
397 static int mime_f = MIME_DECODE_DEFAULT; /* convert MIME B base64 or Q */
398 static int mime_decode_f = FALSE; /* mime decode is explicitly on */
399 static int mimebuf_f = FALSE; /* MIME buffered input */
400 static int broken_f = FALSE; /* convert ESC-less broken JIS */
401 static int iso8859_f = FALSE; /* ISO8859 through */
402 static int mimeout_f = FALSE; /* base64 mode */
403 static int x0201_f = NKF_UNSPECIFIED; /* convert JIS X 0201 */
404 static int iso2022jp_f = FALSE; /* replace non ISO-2022-JP with GETA */
405
406 #ifdef UNICODE_NORMALIZATION
407 static int nfc_f = FALSE;
408 static nkf_char (*i_nfc_getc)(FILE *) = std_getc; /* input of ugetc */
409 static nkf_char (*i_nfc_ungetc)(nkf_char c ,FILE *f) = std_ungetc;
410 #endif
411
412 #ifdef INPUT_OPTION
413 static int cap_f = FALSE;
414 static nkf_char (*i_cgetc)(FILE *) = std_getc; /* input of cgetc */
415 static nkf_char (*i_cungetc)(nkf_char c ,FILE *f) = std_ungetc;
416
417 static int url_f = FALSE;
418 static nkf_char (*i_ugetc)(FILE *) = std_getc; /* input of ugetc */
419 static nkf_char (*i_uungetc)(nkf_char c ,FILE *f) = std_ungetc;
420 #endif
421
/*
 * Tagging scheme for nkf_char values.
 *
 * The top byte (CLASS_MASK) holds a class tag and the low 24 bits
 * (VALUE_MASK) hold the value.  CLASS_UNICODE marks a value that is a
 * Unicode code point; PREFIX_EUCG3 is OR-ed into a character by
 * nkf_char_euc3_new().
 *
 * Macro parameters are parenthesized so expression arguments are masked
 * as a whole.
 */
#define PREFIX_EUCG3 NKF_INT32_C(0x8F00)
#define CLASS_MASK NKF_INT32_C(0xFF000000)
#define CLASS_UNICODE NKF_INT32_C(0x01000000)
#define VALUE_MASK NKF_INT32_C(0x00FFFFFF)
#define UNICODE_BMP_MAX NKF_INT32_C(0x0000FFFF)
#define UNICODE_MAX NKF_INT32_C(0x0010FFFF)
#define nkf_char_euc3_new(c) ((c) | PREFIX_EUCG3)
#define nkf_char_unicode_new(c) ((c) | CLASS_UNICODE)
#define nkf_char_unicode_p(c) (((c) & CLASS_MASK) == CLASS_UNICODE)
#define nkf_char_unicode_bmp_p(c) (((c) & VALUE_MASK) <= UNICODE_BMP_MAX)
#define nkf_char_unicode_value_p(c) (((c) & VALUE_MASK) <= UNICODE_MAX)

/*
 * Combine a UTF-16 surrogate pair into a code point.
 * 0x35FDC00 == (0xD800 << 10) + 0xDC00 - 0x10000, i.e. the surrogate
 * bases minus the supplementary-plane offset folded into one constant.
 */
#define UTF16_TO_UTF32(lead, trail) (((lead) << 10) + (trail) - NKF_INT32_C(0x35FDC00))
435
436 #ifdef NUMCHAR_OPTION
437 static int numchar_f = FALSE;
438 static nkf_char (*i_ngetc)(FILE *) = std_getc; /* input of ugetc */
439 static nkf_char (*i_nungetc)(nkf_char c ,FILE *f) = std_ungetc;
440 #endif
441
442 #ifdef CHECK_OPTION
443 static int noout_f = FALSE;
444 static void no_putc(nkf_char c);
445 static int debug_f = FALSE;
446 static void debug(const char *str);
447 static nkf_char (*iconv_for_check)(nkf_char c2,nkf_char c1,nkf_char c0) = 0;
448 #endif
449
450 static int guess_f = 0; /* 0: OFF, 1: ON, 2: VERBOSE */
451 static void set_input_codename(const char *codename);
452
453 #ifdef EXEC_IO
454 static int exec_f = 0;
455 #endif
456
457 #ifdef SHIFTJIS_CP932
458 /* invert IBM extended characters to others */
459 static int cp51932_f = FALSE;
460
461 /* invert NEC-selected IBM extended characters to IBM extended characters */
462 static int cp932inv_f = TRUE;
463
464 /* static nkf_char cp932_conv(nkf_char c2, nkf_char c1); */
465 #endif /* SHIFTJIS_CP932 */
466
467 static int x0212_f = FALSE;
468 static int x0213_f = FALSE;
469
470 static unsigned char prefix_table[256];
471
472 static void e_status(struct input_code *, nkf_char);
473 static void s_status(struct input_code *, nkf_char);
474
475 struct input_code input_code_list[] = {
476 {"EUC-JP", 0, 0, 0, {0, 0, 0}, e_status, e_iconv, 0},
477 {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
478 #ifdef UTF8_INPUT_ENABLE
479 {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
480 {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
481 {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
482 #endif
483 {NULL, 0, 0, 0, {0, 0, 0}, NULL, NULL, 0}
484 };
485
486 static int mimeout_mode = 0; /* 0, -1, 'Q', 'B', 1, 2 */
487 static int base64_count = 0;
488
489 /* X0208 -> ASCII converter */
490
491 /* fold parameter */
492 static int f_line = 0; /* chars in line */
493 static int f_prev = 0;
494 static int fold_preserve_f = FALSE; /* preserve new lines */
495 static int fold_f = FALSE;
496 static int fold_len = 0;
497
498 /* options */
499 static unsigned char kanji_intro = DEFAULT_J;
500 static unsigned char ascii_intro = DEFAULT_R;
501
502 /* Folding */
503
504 #define FOLD_MARGIN 10
505 #define DEFAULT_FOLD 60
506
507 static int fold_margin = FOLD_MARGIN;
508
509 /* process default */
510
511 static nkf_char
no_connection2(ARG_UNUSED nkf_char c2,ARG_UNUSED nkf_char c1,ARG_UNUSED nkf_char c0)512 no_connection2(ARG_UNUSED nkf_char c2, ARG_UNUSED nkf_char c1, ARG_UNUSED nkf_char c0)
513 {
514 fprintf(stderr,"nkf internal module connection failure.\n");
515 exit(EXIT_FAILURE);
516 return 0; /* LINT */
517 }
518
519 static void
no_connection(nkf_char c2,nkf_char c1)520 no_connection(nkf_char c2, nkf_char c1)
521 {
522 no_connection2(c2,c1,0);
523 }
524
525 static nkf_char (*iconv)(nkf_char c2,nkf_char c1,nkf_char c0) = no_connection2;
526 static void (*oconv)(nkf_char c2,nkf_char c1) = no_connection;
527
528 static void (*o_zconv)(nkf_char c2,nkf_char c1) = no_connection;
529 static void (*o_fconv)(nkf_char c2,nkf_char c1) = no_connection;
530 static void (*o_eol_conv)(nkf_char c2,nkf_char c1) = no_connection;
531 static void (*o_rot_conv)(nkf_char c2,nkf_char c1) = no_connection;
532 static void (*o_hira_conv)(nkf_char c2,nkf_char c1) = no_connection;
533 static void (*o_base64conv)(nkf_char c2,nkf_char c1) = no_connection;
534 static void (*o_iso2022jp_check_conv)(nkf_char c2,nkf_char c1) = no_connection;
535
536 /* static redirections */
537
538 static void (*o_putc)(nkf_char c) = std_putc;
539
540 static nkf_char (*i_getc)(FILE *f) = std_getc; /* general input */
541 static nkf_char (*i_ungetc)(nkf_char c,FILE *f) =std_ungetc;
542
543 static nkf_char (*i_bgetc)(FILE *) = std_getc; /* input of mgetc */
544 static nkf_char (*i_bungetc)(nkf_char c ,FILE *f) = std_ungetc;
545
546 static void (*o_mputc)(nkf_char c) = std_putc ; /* output of mputc */
547
548 static nkf_char (*i_mgetc)(FILE *) = std_getc; /* input of mgetc */
549 static nkf_char (*i_mungetc)(nkf_char c ,FILE *f) = std_ungetc;
550
551 /* for strict mime */
552 static nkf_char (*i_mgetc_buf)(FILE *) = std_getc; /* input of mgetc_buf */
553 static nkf_char (*i_mungetc_buf)(nkf_char c,FILE *f) = std_ungetc;
554
555 /* Global states */
556 static int output_mode = ASCII; /* output kanji mode */
557 static int input_mode = ASCII; /* input kanji mode */
558 static int mime_decode_mode = FALSE; /* MIME mode B base64, Q hex */
559
560 /* X0201 / X0208 conversion tables */
561
562 /* X0201 kana conversion table */
563 /* 90-9F A0-DF */
564 static const unsigned char cv[]= {
565 0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57,
566 0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21,
567 0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29,
568 0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43,
569 0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26,
570 0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d,
571 0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35,
572 0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d,
573 0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46,
574 0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c,
575 0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52,
576 0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e,
577 0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62,
578 0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69,
579 0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d,
580 0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c,
581 0x00,0x00};
582
583
/* X0201 kana conversion table for dakuten (voiced sound mark) */
585 /* 90-9F A0-DF */
586 static const unsigned char dv[]= {
587 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
588 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
589 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
590 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
591 0x00,0x00,0x00,0x00,0x00,0x00,0x25,0x74,
592 0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e,
593 0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36,
594 0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e,
595 0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47,
596 0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00,
597 0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53,
598 0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00,
599 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
600 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
601 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
602 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
603 0x00,0x00};
604
/* X0201 kana conversion table for handakuten (semi-voiced sound mark) */
606 /* 90-9F A0-DF */
607 static const unsigned char ev[]= {
608 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
609 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
610 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
611 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
612 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
613 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
614 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
615 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
616 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
617 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
618 0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54,
619 0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00,
620 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
621 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
622 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
623 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
624 0x00,0x00};
625
/* X0201 kana to X0213 conversion table for handakuten (semi-voiced sound mark) */
627 /* 90-9F A0-DF */
628 static const unsigned char ev_x0213[]= {
629 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
630 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
631 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
632 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
633 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
634 0x00,0x00,0x00,0x00,0x25,0x77,0x25,0x78,
635 0x25,0x79,0x25,0x7a,0x25,0x7b,0x00,0x00,
636 0x00,0x00,0x00,0x00,0x25,0x7c,0x00,0x00,
637 0x00,0x00,0x00,0x00,0x25,0x7d,0x00,0x00,
638 0x25,0x7e,0x00,0x00,0x00,0x00,0x00,0x00,
639 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
640 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
641 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
642 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
643 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
644 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
645 0x00,0x00};
646
647
648 /* X0208 kigou conversion table */
649 /* 0x8140 - 0x819e */
650 static const unsigned char fv[] = {
651
652 0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a,
653 0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00,
654 0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00,
655 0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f,
656 0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27,
657 0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d,
658 0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00,
659 0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00,
660 0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00,
661 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
662 0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40,
663 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
664 } ;
665
666
667
668 static int option_mode = 0;
669 static int file_out_f = FALSE;
670 #ifdef OVERWRITE
671 static int overwrite_f = FALSE;
672 static int preserve_time_f = FALSE;
673 static int backup_f = FALSE;
674 static char *backup_suffix = "";
675 #endif
676
677 static int eolmode_f = 0; /* CR, LF, CRLF */
678 static int input_eol = 0; /* 0: unestablished, EOF: MIXED */
679 static nkf_char prev_cr = 0; /* CR or 0 */
680 #ifdef EASYWIN /*Easy Win */
681 static int end_check;
682 #endif /*Easy Win */
683
/*
 * malloc() wrapper that never returns NULL.
 *
 * A request for zero bytes is rounded up to one byte.  On allocation
 * failure the error is reported with perror() and the process exits
 * with EXIT_FAILURE.
 */
static void *
nkf_xmalloc(size_t size)
{
    void *mem = malloc(size ? size : 1);

    if (!mem) {
        perror("can't malloc");
        exit(EXIT_FAILURE);
    }
    return mem;
}
699
/*
 * realloc() wrapper that never returns NULL.
 *
 * A request for zero bytes is rounded up to one byte.  On failure the
 * error is reported with perror() and the process exits with
 * EXIT_FAILURE, so the caller never has to keep the old pointer around.
 */
static void *
nkf_xrealloc(void *ptr, size_t size)
{
    void *resized = realloc(ptr, size ? size : 1);

    if (!resized) {
        perror("can't realloc");
        exit(EXIT_FAILURE);
    }
    return resized;
}
713
714 #define nkf_xfree(ptr) free(ptr)
715
716 static int
nkf_str_caseeql(const char * src,const char * target)717 nkf_str_caseeql(const char *src, const char *target)
718 {
719 int i;
720 for (i = 0; src[i] && target[i]; i++) {
721 if (nkf_toupper(src[i]) != nkf_toupper(target[i])) return FALSE;
722 }
723 if (src[i] || target[i]) return FALSE;
724 else return TRUE;
725 }
726
727 static nkf_encoding*
nkf_enc_from_index(int idx)728 nkf_enc_from_index(int idx)
729 {
730 if (idx < 0 || NKF_ENCODING_TABLE_SIZE <= idx) {
731 return 0;
732 }
733 return &nkf_encoding_table[idx];
734 }
735
736 static int
nkf_enc_find_index(const char * name)737 nkf_enc_find_index(const char *name)
738 {
739 int i;
740 if (name[0] == 'X' && *(name+1) == '-') name += 2;
741 for (i = 0; encoding_name_to_id_table[i].id >= 0; i++) {
742 if (nkf_str_caseeql(encoding_name_to_id_table[i].name, name)) {
743 return encoding_name_to_id_table[i].id;
744 }
745 }
746 return -1;
747 }
748
749 static nkf_encoding*
nkf_enc_find(const char * name)750 nkf_enc_find(const char *name)
751 {
752 int idx = -1;
753 idx = nkf_enc_find_index(name);
754 if (idx < 0) return 0;
755 return nkf_enc_from_index(idx);
756 }
757
758 #define nkf_enc_name(enc) (enc)->name
759 #define nkf_enc_to_index(enc) (enc)->id
760 #define nkf_enc_to_base_encoding(enc) (enc)->base_encoding
761 #define nkf_enc_to_iconv(enc) nkf_enc_to_base_encoding(enc)->iconv
762 #define nkf_enc_to_oconv(enc) nkf_enc_to_base_encoding(enc)->oconv
763 #define nkf_enc_asciicompat(enc) (\
764 nkf_enc_to_base_encoding(enc) == &NkfEncodingASCII ||\
765 nkf_enc_to_base_encoding(enc) == &NkfEncodingISO_2022_JP)
766 #define nkf_enc_unicode_p(enc) (\
767 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_8 ||\
768 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_16 ||\
769 nkf_enc_to_base_encoding(enc) == &NkfEncodingUTF_32)
770 #define nkf_enc_cp5022x_p(enc) (\
771 nkf_enc_to_index(enc) == CP50220 ||\
772 nkf_enc_to_index(enc) == CP50221 ||\
773 nkf_enc_to_index(enc) == CP50222)
774
775 #ifdef DEFAULT_CODE_LOCALE
/*
 * Return the name of the character set of the current environment, or
 * NULL when the platform offers no way to query it.
 *
 * On Windows and 32-bit OS/2 the name is built in a static buffer, so
 * the returned pointer is overwritten by the next call.
 */
static const char*
nkf_locale_charmap(void)
{
#if defined(HAVE_LANGINFO_H)
    return nl_langinfo(CODESET);
#elif defined(__WIN32__)
    static char codepage_name[16];
    sprintf(codepage_name, "CP%d", GetACP());
    return codepage_name;
#elif defined(__OS2__) && defined(INT_IS_SHORT)
    /* OS/2 1.x */
    return NULL;
#elif defined(__OS2__)
    /* OS/2 32bit */
    static char codepage_name[16];
    ULONG codepages[1], ulncp;
    DosQueryCp(sizeof(codepages), codepages, &ulncp);
    /* code pages 932 and 943 are both reported as Shift_JIS */
    if (codepages[0] == 932 || codepages[0] == 943)
        strcpy(codepage_name, "Shift_JIS");
    else
        sprintf(codepage_name, "CP%lu", codepages[0]);
    return codepage_name;
#else
    return NULL;
#endif
}
803
804 static nkf_encoding*
nkf_locale_encoding(void)805 nkf_locale_encoding(void)
806 {
807 nkf_encoding *enc = 0;
808 const char *encname = nkf_locale_charmap();
809 if (encname)
810 enc = nkf_enc_find(encname);
811 return enc;
812 }
813 #endif /* DEFAULT_CODE_LOCALE */
814
815 static nkf_encoding*
nkf_utf8_encoding(void)816 nkf_utf8_encoding(void)
817 {
818 return &nkf_encoding_table[UTF_8];
819 }
820
821 static nkf_encoding*
nkf_default_encoding(void)822 nkf_default_encoding(void)
823 {
824 nkf_encoding *enc = 0;
825 #ifdef DEFAULT_CODE_LOCALE
826 enc = nkf_locale_encoding();
827 #elif defined(DEFAULT_ENCIDX)
828 enc = nkf_enc_from_index(DEFAULT_ENCIDX);
829 #endif
830 if (!enc) enc = nkf_utf8_encoding();
831 return enc;
832 }
833
/*
 * Fixed-capacity stack of nkf_char values.
 * Created by nkf_buf_new(); elements are added with nkf_buf_push() and
 * removed from the end with nkf_buf_pop().
 */
typedef struct {
    long capa;      /* number of elements ptr has room for */
    long len;       /* number of elements currently stored */
    nkf_char *ptr;  /* element storage, allocated by nkf_buf_new() */
} nkf_buf_t;
839
840 static nkf_buf_t *
nkf_buf_new(int length)841 nkf_buf_new(int length)
842 {
843 nkf_buf_t *buf = nkf_xmalloc(sizeof(nkf_buf_t));
844 buf->ptr = nkf_xmalloc(sizeof(nkf_char) * length);
845 buf->capa = length;
846 buf->len = 0;
847 return buf;
848 }
849
850 #if 0
851 static void
852 nkf_buf_dispose(nkf_buf_t *buf)
853 {
854 nkf_xfree(buf->ptr);
855 nkf_xfree(buf);
856 }
857 #endif
858
859 #define nkf_buf_length(buf) ((buf)->len)
860 #define nkf_buf_empty_p(buf) ((buf)->len == 0)
861
862 static nkf_char
nkf_buf_at(nkf_buf_t * buf,int index)863 nkf_buf_at(nkf_buf_t *buf, int index)
864 {
865 assert(index <= buf->len);
866 return buf->ptr[index];
867 }
868
869 static void
nkf_buf_clear(nkf_buf_t * buf)870 nkf_buf_clear(nkf_buf_t *buf)
871 {
872 buf->len = 0;
873 }
874
875 static void
nkf_buf_push(nkf_buf_t * buf,nkf_char c)876 nkf_buf_push(nkf_buf_t *buf, nkf_char c)
877 {
878 if (buf->capa <= buf->len) {
879 exit(EXIT_FAILURE);
880 }
881 buf->ptr[buf->len++] = c;
882 }
883
884 static nkf_char
nkf_buf_pop(nkf_buf_t * buf)885 nkf_buf_pop(nkf_buf_t *buf)
886 {
887 assert(!nkf_buf_empty_p(buf));
888 return buf->ptr[--buf->len];
889 }
890
891 /* Normalization Form C */
892 #ifndef PERL_XS
893 #ifdef WIN32DLL
894 #define fprintf dllprintf
895 #endif
896
897 static void
version(void)898 version(void)
899 {
900 fprintf(HELP_OUTPUT,"Network Kanji Filter Version " NKF_VERSION " (" NKF_RELEASE_DATE ") \n" COPY_RIGHT "\n");
901 }
902
903 static void
usage(void)904 usage(void)
905 {
906 fprintf(HELP_OUTPUT,
907 "Usage: nkf -[flags] [--] [in file] .. [out file for -O flag]\n"
908 #ifdef UTF8_OUTPUT_ENABLE
909 " j/s/e/w Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
910 " UTF options is -w[8[0],{16,32}[{B,L}[0]]]\n"
911 #else
912 #endif
913 #ifdef UTF8_INPUT_ENABLE
914 " J/S/E/W Specify input encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
915 " UTF option is -W[8,[16,32][B,L]]\n"
916 #else
917 " J/S/E Specify output encoding ISO-2022-JP, Shift_JIS, EUC-JP\n"
918 #endif
919 );
920 fprintf(HELP_OUTPUT,
921 " m[BQSN0] MIME decode [B:base64,Q:quoted,S:strict,N:nonstrict,0:no decode]\n"
922 " M[BQ] MIME encode [B:base64 Q:quoted]\n"
923 " f/F Folding: -f60 or -f or -f60-10 (fold margin 10) F preserve nl\n"
924 );
925 fprintf(HELP_OUTPUT,
926 " Z[0-4] Default/0: Convert JISX0208 Alphabet to ASCII\n"
927 " 1: Kankaku to one space 2: to two spaces 3: HTML Entity\n"
928 " 4: JISX0208 Katakana to JISX0201 Katakana\n"
929 " X,x Convert Halfwidth Katakana to Fullwidth or preserve it\n"
930 );
931 fprintf(HELP_OUTPUT,
932 " O Output to File (DEFAULT 'nkf.out')\n"
933 " L[uwm] Line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n"
934 );
935 fprintf(HELP_OUTPUT,
936 " --ic=<encoding> Specify the input encoding\n"
937 " --oc=<encoding> Specify the output encoding\n"
938 " --hiragana --katakana Hiragana/Katakana Conversion\n"
939 " --katakana-hiragana Converts each other\n"
940 );
941 fprintf(HELP_OUTPUT,
942 #ifdef INPUT_OPTION
943 " --{cap, url}-input Convert hex after ':' or '%%'\n"
944 #endif
945 #ifdef NUMCHAR_OPTION
946 " --numchar-input Convert Unicode Character Reference\n"
947 #endif
948 #ifdef UTF8_INPUT_ENABLE
949 " --fb-{skip, html, xml, perl, java, subchar}\n"
950 " Specify unassigned character's replacement\n"
951 #endif
952 );
953 fprintf(HELP_OUTPUT,
954 #ifdef OVERWRITE
955 " --in-place[=SUF] Overwrite original files\n"
956 " --overwrite[=SUF] Preserve timestamp of original files\n"
957 #endif
958 " -g --guess Guess the input code\n"
959 " -v --version Print the version\n"
960 " --help/-V Print this help / configuration\n"
961 );
962 version();
963 }
964
/*
 * Print a summary of the compile-time configuration to HELP_OUTPUT:
 * version, build date/time, default output encoding, default end of
 * line, and the MIME / JIS X 0201 / help-output defaults.
 *
 * Each setting is chosen by the preprocessor and pasted into the format
 * string through string-literal concatenation.
 */
static void
show_configuration(void)
{
    fprintf(HELP_OUTPUT,
            "Summary of my nkf " NKF_VERSION " (" NKF_RELEASE_DATE ") configuration:\n"
            " Compile-time options:\n"
            " Compiled at: " __DATE__ " " __TIME__ "\n"
           );
    /* the %s argument is only present in the branches whose format uses it */
    fprintf(HELP_OUTPUT,
            " Default output encoding: "
#ifdef DEFAULT_CODE_LOCALE
            "LOCALE (%s)\n", nkf_enc_name(nkf_default_encoding())
#elif defined(DEFAULT_ENCIDX)
            "CONFIG (%s)\n", nkf_enc_name(nkf_default_encoding())
#else
            "NONE\n"
#endif
           );
    fprintf(HELP_OUTPUT,
            " Default output end of line: "
            /* anything other than CR or CRLF is reported as LF */
#if DEFAULT_NEWLINE == CR
            "CR"
#elif DEFAULT_NEWLINE == CRLF
            "CRLF"
#else
            "LF"
#endif
            "\n"
            " Decode MIME encoded string: "
#if MIME_DECODE_DEFAULT
            "ON"
#else
            "OFF"
#endif
            "\n"
            " Convert JIS X 0201 Katakana: "
#if X0201_DEFAULT
            "ON"
#else
            "OFF"
#endif
            "\n"
            " --help, --version output: "
            /* NOTE(review): HELP_OUTPUT_HELP_OUTPUT is not defined in this
               part of the file; confirm that config.h provides it */
#if HELP_OUTPUT_HELP_OUTPUT
            "HELP_OUTPUT"
#else
            "STDOUT"
#endif
            "\n");
}
1015 #endif /*PERL_XS*/
1016
1017 #ifdef OVERWRITE
/*
 * Build the backup file name for `filename` from the pattern `suffix`.
 *
 * If `suffix` contains no '*', the result is `filename` followed by
 * `suffix`.  Otherwise each '*' in `suffix` is replaced by `filename` and
 * every other character of `suffix` is copied unchanged.
 *
 * The result lives in a buffer obtained from nkf_xmalloc() and is returned
 * to the caller.
 */
static char*
get_backup_filename(const char *suffix, const char *filename)
{
    char *backup_filename;
    const char *src;
    int filename_length = strlen(filename);
    int asterisk_count = 0;
    int out = 0;

    for (src = suffix; *src; src++) {
        if (*src == '*') asterisk_count++;
    }

    if (!asterisk_count) {
        /* plain suffix: "<filename><suffix>" */
        out = filename_length + strlen(suffix);
        backup_filename = nkf_xmalloc(out + 1);
        strcpy(backup_filename, filename);
        strcat(backup_filename, suffix);
        backup_filename[out] = '\0';
        return backup_filename;
    }

    /* each '*' (1 byte) grows to filename_length bytes */
    backup_filename = nkf_xmalloc(strlen(suffix) + (asterisk_count * (filename_length - 1)) + 1);
    for (src = suffix; *src; src++) {
        if (*src == '*') {
            memcpy(backup_filename + out, filename, filename_length);
            out += filename_length;
        } else {
            backup_filename[out++] = *src;
        }
    }
    backup_filename[out] = '\0';
    return backup_filename;
}
1052 #endif
1053
1054 #ifdef UTF8_INPUT_ENABLE
1055 static void
nkf_each_char_to_hex(void (* f)(nkf_char c2,nkf_char c1),nkf_char c)1056 nkf_each_char_to_hex(void (*f)(nkf_char c2,nkf_char c1), nkf_char c)
1057 {
1058 int shift = 20;
1059 c &= VALUE_MASK;
1060 while(shift >= 0){
1061 if(c >= NKF_INT32_C(1)<<shift){
1062 while(shift >= 0){
1063 (*f)(0, bin2hex(c>>shift));
1064 shift -= 4;
1065 }
1066 }else{
1067 shift -= 4;
1068 }
1069 }
1070 return;
1071 }
1072
1073 static void
encode_fallback_html(nkf_char c)1074 encode_fallback_html(nkf_char c)
1075 {
1076 (*oconv)(0, '&');
1077 (*oconv)(0, '#');
1078 c &= VALUE_MASK;
1079 if(c >= NKF_INT32_C(1000000))
1080 (*oconv)(0, 0x30+(c/NKF_INT32_C(1000000))%10);
1081 if(c >= NKF_INT32_C(100000))
1082 (*oconv)(0, 0x30+(c/NKF_INT32_C(100000) )%10);
1083 if(c >= 10000)
1084 (*oconv)(0, 0x30+(c/10000 )%10);
1085 if(c >= 1000)
1086 (*oconv)(0, 0x30+(c/1000 )%10);
1087 if(c >= 100)
1088 (*oconv)(0, 0x30+(c/100 )%10);
1089 if(c >= 10)
1090 (*oconv)(0, 0x30+(c/10 )%10);
1091 if(c >= 0)
1092 (*oconv)(0, 0x30+ c %10);
1093 (*oconv)(0, ';');
1094 return;
1095 }
1096
1097 static void
encode_fallback_xml(nkf_char c)1098 encode_fallback_xml(nkf_char c)
1099 {
1100 (*oconv)(0, '&');
1101 (*oconv)(0, '#');
1102 (*oconv)(0, 'x');
1103 nkf_each_char_to_hex(oconv, c);
1104 (*oconv)(0, ';');
1105 return;
1106 }
1107
1108 static void
encode_fallback_java(nkf_char c)1109 encode_fallback_java(nkf_char c)
1110 {
1111 (*oconv)(0, '\\');
1112 c &= VALUE_MASK;
1113 if(!nkf_char_unicode_bmp_p(c)){
1114 int high = (c >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
1115 int low = (c & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
1116 (*oconv)(0, 'u');
1117 (*oconv)(0, bin2hex(high>>12));
1118 (*oconv)(0, bin2hex(high>> 8));
1119 (*oconv)(0, bin2hex(high>> 4));
1120 (*oconv)(0, bin2hex(high ));
1121 (*oconv)(0, '\\');
1122 (*oconv)(0, 'u');
1123 (*oconv)(0, bin2hex(low>>12));
1124 (*oconv)(0, bin2hex(low>> 8));
1125 (*oconv)(0, bin2hex(low>> 4));
1126 (*oconv)(0, bin2hex(low ));
1127 }else{
1128 (*oconv)(0, 'u');
1129 (*oconv)(0, bin2hex(c>>12));
1130 (*oconv)(0, bin2hex(c>> 8));
1131 (*oconv)(0, bin2hex(c>> 4));
1132 (*oconv)(0, bin2hex(c ));
1133 }
1134 return;
1135 }
1136
1137 static void
encode_fallback_perl(nkf_char c)1138 encode_fallback_perl(nkf_char c)
1139 {
1140 (*oconv)(0, '\\');
1141 (*oconv)(0, 'x');
1142 (*oconv)(0, '{');
1143 nkf_each_char_to_hex(oconv, c);
1144 (*oconv)(0, '}');
1145 return;
1146 }
1147
1148 static void
encode_fallback_subchar(nkf_char c)1149 encode_fallback_subchar(nkf_char c)
1150 {
1151 c = unicode_subchar;
1152 (*oconv)((c>>8)&0xFF, c&0xFF);
1153 return;
1154 }
1155 #endif
1156
/*
 * Table of long option names.
 *
 * name  - the long option spelling; several names end in '=' (for example
 *         "ic=", "oc=", "prefix="), presumably marking options that take
 *         a value -- the parser is not visible here, confirm there.
 * alias - a string of short option letters (e.g. "euc" -> "e",
 *         "base64" -> "jMB"); it is empty for many entries, which are
 *         presumably handled by name in the option parser -- confirm.
 *
 * NOTE(review): "guess=" is listed before "guess" and "fb-subchar" before
 * "fb-subchar="; if the parser matches by prefix the order may matter, so
 * keep the existing order when adding entries.
 */
static const struct {
    const char *name;
    const char *alias;
} long_option[] = {
    {"ic=", ""},
    {"oc=", ""},
    {"base64","jMB"},
    {"euc","e"},
    {"euc-input","E"},
    {"fj","jm"},
    {"help",""},
    {"jis","j"},
    {"jis-input","J"},
    {"mac","sLm"},
    {"mime","jM"},
    {"mime-input","m"},
    {"msdos","sLw"},
    {"sjis","s"},
    {"sjis-input","S"},
    {"unix","eLu"},
    {"version","v"},
    {"windows","sLw"},
    {"hiragana","h1"},
    {"katakana","h2"},
    {"katakana-hiragana","h3"},
    {"guess=", ""},
    {"guess", "g2"},
    {"cp932", ""},
    {"no-cp932", ""},
#ifdef X0212_ENABLE
    {"x0212", ""},
#endif
#ifdef UTF8_OUTPUT_ENABLE
    {"utf8", "w"},
    {"utf16", "w16"},
    {"ms-ucs-map", ""},
    {"fb-skip", ""},
    {"fb-html", ""},
    {"fb-xml", ""},
    {"fb-perl", ""},
    {"fb-java", ""},
    {"fb-subchar", ""},
    {"fb-subchar=", ""},
#endif
#ifdef UTF8_INPUT_ENABLE
    {"utf8-input", "W"},
    {"utf16-input", "W16"},
    {"no-cp932ext", ""},
    {"no-best-fit-chars",""},
#endif
#ifdef UNICODE_NORMALIZATION
    {"utf8mac-input", ""},
#endif
#ifdef OVERWRITE
    {"overwrite", ""},
    {"overwrite=", ""},
    {"in-place", ""},
    {"in-place=", ""},
#endif
#ifdef INPUT_OPTION
    {"cap-input", ""},
    {"url-input", ""},
#endif
#ifdef NUMCHAR_OPTION
    {"numchar-input", ""},
#endif
#ifdef CHECK_OPTION
    {"no-output", ""},
    {"debug", ""},
#endif
#ifdef SHIFTJIS_CP932
    {"cp932inv", ""},
#endif
#ifdef EXEC_IO
    {"exec-in", ""},
    {"exec-out", ""},
#endif
    {"prefix=", ""},
};
1236
1237 static void
set_input_encoding(nkf_encoding * enc)1238 set_input_encoding(nkf_encoding *enc)
1239 {
1240 switch (nkf_enc_to_index(enc)) {
1241 case ISO_8859_1:
1242 iso8859_f = TRUE;
1243 break;
1244 case CP50221:
1245 case CP50222:
1246 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1247 case CP50220:
1248 #ifdef SHIFTJIS_CP932
1249 cp51932_f = TRUE;
1250 #endif
1251 #ifdef UTF8_OUTPUT_ENABLE
1252 ms_ucs_map_f = UCS_MAP_CP932;
1253 #endif
1254 break;
1255 case ISO_2022_JP_1:
1256 x0212_f = TRUE;
1257 break;
1258 case ISO_2022_JP_3:
1259 x0212_f = TRUE;
1260 x0213_f = TRUE;
1261 break;
1262 case ISO_2022_JP_2004:
1263 x0212_f = TRUE;
1264 x0213_f = TRUE;
1265 break;
1266 case SHIFT_JIS:
1267 break;
1268 case WINDOWS_31J:
1269 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1270 #ifdef SHIFTJIS_CP932
1271 cp51932_f = TRUE;
1272 #endif
1273 #ifdef UTF8_OUTPUT_ENABLE
1274 ms_ucs_map_f = UCS_MAP_CP932;
1275 #endif
1276 break;
1277 break;
1278 case CP10001:
1279 #ifdef SHIFTJIS_CP932
1280 cp51932_f = TRUE;
1281 #endif
1282 #ifdef UTF8_OUTPUT_ENABLE
1283 ms_ucs_map_f = UCS_MAP_CP10001;
1284 #endif
1285 break;
1286 case EUC_JP:
1287 break;
1288 case EUCJP_NKF:
1289 break;
1290 case CP51932:
1291 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1292 #ifdef SHIFTJIS_CP932
1293 cp51932_f = TRUE;
1294 #endif
1295 #ifdef UTF8_OUTPUT_ENABLE
1296 ms_ucs_map_f = UCS_MAP_CP932;
1297 #endif
1298 break;
1299 case EUCJP_MS:
1300 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1301 #ifdef SHIFTJIS_CP932
1302 cp51932_f = FALSE;
1303 #endif
1304 #ifdef UTF8_OUTPUT_ENABLE
1305 ms_ucs_map_f = UCS_MAP_MS;
1306 #endif
1307 break;
1308 case EUCJP_ASCII:
1309 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1310 #ifdef SHIFTJIS_CP932
1311 cp51932_f = FALSE;
1312 #endif
1313 #ifdef UTF8_OUTPUT_ENABLE
1314 ms_ucs_map_f = UCS_MAP_ASCII;
1315 #endif
1316 break;
1317 case SHIFT_JISX0213:
1318 case SHIFT_JIS_2004:
1319 x0213_f = TRUE;
1320 #ifdef SHIFTJIS_CP932
1321 cp51932_f = FALSE;
1322 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1323 #endif
1324 break;
1325 case EUC_JISX0213:
1326 case EUC_JIS_2004:
1327 x0213_f = TRUE;
1328 #ifdef SHIFTJIS_CP932
1329 cp51932_f = FALSE;
1330 #endif
1331 break;
1332 #ifdef UTF8_INPUT_ENABLE
1333 #ifdef UNICODE_NORMALIZATION
1334 case UTF8_MAC:
1335 nfc_f = TRUE;
1336 break;
1337 #endif
1338 case UTF_16:
1339 case UTF_16BE:
1340 case UTF_16BE_BOM:
1341 input_endian = ENDIAN_BIG;
1342 break;
1343 case UTF_16LE:
1344 case UTF_16LE_BOM:
1345 input_endian = ENDIAN_LITTLE;
1346 break;
1347 case UTF_32:
1348 case UTF_32BE:
1349 case UTF_32BE_BOM:
1350 input_endian = ENDIAN_BIG;
1351 break;
1352 case UTF_32LE:
1353 case UTF_32LE_BOM:
1354 input_endian = ENDIAN_LITTLE;
1355 break;
1356 #endif
1357 }
1358 }
1359
1360 static void
set_output_encoding(nkf_encoding * enc)1361 set_output_encoding(nkf_encoding *enc)
1362 {
1363 switch (nkf_enc_to_index(enc)) {
1364 case CP50220:
1365 #ifdef SHIFTJIS_CP932
1366 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1367 #endif
1368 #ifdef UTF8_OUTPUT_ENABLE
1369 ms_ucs_map_f = UCS_MAP_CP932;
1370 #endif
1371 break;
1372 case CP50221:
1373 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1374 #ifdef SHIFTJIS_CP932
1375 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1376 #endif
1377 #ifdef UTF8_OUTPUT_ENABLE
1378 ms_ucs_map_f = UCS_MAP_CP932;
1379 #endif
1380 break;
1381 case ISO_2022_JP:
1382 #ifdef SHIFTJIS_CP932
1383 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1384 #endif
1385 break;
1386 case ISO_2022_JP_1:
1387 x0212_f = TRUE;
1388 #ifdef SHIFTJIS_CP932
1389 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1390 #endif
1391 break;
1392 case ISO_2022_JP_3:
1393 case ISO_2022_JP_2004:
1394 x0212_f = TRUE;
1395 x0213_f = TRUE;
1396 #ifdef SHIFTJIS_CP932
1397 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1398 #endif
1399 break;
1400 case SHIFT_JIS:
1401 break;
1402 case WINDOWS_31J:
1403 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1404 #ifdef UTF8_OUTPUT_ENABLE
1405 ms_ucs_map_f = UCS_MAP_CP932;
1406 #endif
1407 break;
1408 case CP10001:
1409 #ifdef UTF8_OUTPUT_ENABLE
1410 ms_ucs_map_f = UCS_MAP_CP10001;
1411 #endif
1412 break;
1413 case EUC_JP:
1414 x0212_f = TRUE;
1415 #ifdef SHIFTJIS_CP932
1416 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1417 #endif
1418 #ifdef UTF8_OUTPUT_ENABLE
1419 ms_ucs_map_f = UCS_MAP_ASCII;
1420 #endif
1421 break;
1422 case EUCJP_NKF:
1423 x0212_f = FALSE;
1424 #ifdef SHIFTJIS_CP932
1425 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1426 #endif
1427 #ifdef UTF8_OUTPUT_ENABLE
1428 ms_ucs_map_f = UCS_MAP_ASCII;
1429 #endif
1430 break;
1431 case CP51932:
1432 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1433 #ifdef SHIFTJIS_CP932
1434 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1435 #endif
1436 #ifdef UTF8_OUTPUT_ENABLE
1437 ms_ucs_map_f = UCS_MAP_CP932;
1438 #endif
1439 break;
1440 case EUCJP_MS:
1441 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1442 x0212_f = TRUE;
1443 #ifdef UTF8_OUTPUT_ENABLE
1444 ms_ucs_map_f = UCS_MAP_MS;
1445 #endif
1446 break;
1447 case EUCJP_ASCII:
1448 if (x0201_f == NKF_UNSPECIFIED) x0201_f = FALSE; /* -x specified implicitly */
1449 x0212_f = TRUE;
1450 #ifdef UTF8_OUTPUT_ENABLE
1451 ms_ucs_map_f = UCS_MAP_ASCII;
1452 #endif
1453 break;
1454 case SHIFT_JISX0213:
1455 case SHIFT_JIS_2004:
1456 x0213_f = TRUE;
1457 #ifdef SHIFTJIS_CP932
1458 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1459 #endif
1460 break;
1461 case EUC_JISX0213:
1462 case EUC_JIS_2004:
1463 x0212_f = TRUE;
1464 x0213_f = TRUE;
1465 #ifdef SHIFTJIS_CP932
1466 if (cp932inv_f == TRUE) cp932inv_f = FALSE;
1467 #endif
1468 break;
1469 #ifdef UTF8_OUTPUT_ENABLE
1470 case UTF_8_BOM:
1471 output_bom_f = TRUE;
1472 break;
1473 case UTF_16:
1474 case UTF_16BE_BOM:
1475 output_bom_f = TRUE;
1476 break;
1477 case UTF_16LE:
1478 output_endian = ENDIAN_LITTLE;
1479 output_bom_f = FALSE;
1480 break;
1481 case UTF_16LE_BOM:
1482 output_endian = ENDIAN_LITTLE;
1483 output_bom_f = TRUE;
1484 break;
1485 case UTF_32:
1486 case UTF_32BE_BOM:
1487 output_bom_f = TRUE;
1488 break;
1489 case UTF_32LE:
1490 output_endian = ENDIAN_LITTLE;
1491 output_bom_f = FALSE;
1492 break;
1493 case UTF_32LE_BOM:
1494 output_endian = ENDIAN_LITTLE;
1495 output_bom_f = TRUE;
1496 break;
1497 #endif
1498 }
1499 }
1500
1501 static struct input_code*
find_inputcode_byfunc(nkf_char (* iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))1502 find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1503 {
1504 if (iconv_func){
1505 struct input_code *p = input_code_list;
1506 while (p->name){
1507 if (iconv_func == p->iconv_func){
1508 return p;
1509 }
1510 p++;
1511 }
1512 }
1513 return 0;
1514 }
1515
1516 static void
set_iconv(nkf_char f,nkf_char (* iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))1517 set_iconv(nkf_char f, nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
1518 {
1519 #ifdef INPUT_CODE_FIX
1520 if (f || !input_encoding)
1521 #endif
1522 if (estab_f != f){
1523 estab_f = f;
1524 }
1525
1526 if (iconv_func
1527 #ifdef INPUT_CODE_FIX
1528 && (f == -TRUE || !input_encoding) /* -TRUE means "FORCE" */
1529 #endif
1530 ){
1531 iconv = iconv_func;
1532 }
1533 #ifdef CHECK_OPTION
1534 if (estab_f && iconv_for_check != iconv){
1535 struct input_code *p = find_inputcode_byfunc(iconv);
1536 if (p){
1537 set_input_codename(p->name);
1538 debug(p->name);
1539 }
1540 iconv_for_check = iconv;
1541 }
1542 #endif
1543 }
1544
1545 #ifdef X0212_ENABLE
1546 static nkf_char
x0212_shift(nkf_char c)1547 x0212_shift(nkf_char c)
1548 {
1549 nkf_char ret = c;
1550 c &= 0x7f;
1551 if (is_eucg3(ret)){
1552 if (0x75 <= c && c <= 0x7f){
1553 ret = c + (0x109 - 0x75);
1554 }
1555 }else{
1556 if (0x75 <= c && c <= 0x7f){
1557 ret = c + (0x113 - 0x75);
1558 }
1559 }
1560 return ret;
1561 }
1562
1563
1564 static nkf_char
x0212_unshift(nkf_char c)1565 x0212_unshift(nkf_char c)
1566 {
1567 nkf_char ret = c;
1568 if (0x7f <= c && c <= 0x88){
1569 ret = c + (0x75 - 0x7f);
1570 }else if (0x89 <= c && c <= 0x92){
1571 ret = PREFIX_EUCG3 | 0x80 | (c + (0x75 - 0x89));
1572 }
1573 return ret;
1574 }
1575 #endif /* X0212_ENABLE */
1576
1577 static int
is_x0213_2_in_x0212(nkf_char c1)1578 is_x0213_2_in_x0212(nkf_char c1)
1579 {
1580 static const char x0213_2_table[] =
1581 {0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1};
1582 int ku = c1 - 0x20;
1583 if (ku <= 15)
1584 return x0213_2_table[ku]; /* 1, 3-5, 8, 12-15 */
1585 if (78 <= ku && ku <= 94)
1586 return 1;
1587 return 0;
1588 }
1589
/*
 * Convert the code pair (c2, c1) into a two-byte result stored through
 * p2 and p1 (each is written only when non-null).  Judging by the name
 * this is the EUC -> Shift_JIS direction -- confirm against the callers.
 *
 * Returns 0 when a result was stored and 1 when the input could not be
 * converted.
 */
static nkf_char
e2s_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
{
    nkf_char ndx;
    if (is_eucg3(c2)){
        /* ndx: low seven bits of c2, used as the row byte */
        ndx = c2 & 0x7f;
        if (x0213_f && is_x0213_2_in_x0212(ndx)){
            /* x0213 mode and a row accepted by is_x0213_2_in_x0212(): computed directly */
            if((0x21 <= ndx && ndx <= 0x2F)){
                if (p2) *p2 = ((ndx - 1) >> 1) + 0xec - ndx / 8 * 3;
                if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
                return 0;
            }else if(0x6E <= ndx && ndx <= 0x7E){
                if (p2) *p2 = ((ndx - 1) >> 1) + 0xbe;
                if (p1) *p1 = c1 + ((ndx & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
                return 0;
            }
            return 1;
        }
#ifdef X0212_ENABLE
        else if(nkf_isgraph(ndx)){
            /* table lookup: x0212_shiftjis[row][column]; a zero entry means "no mapping" */
            nkf_char val = 0;
            const unsigned short *ptr;
            ptr = x0212_shiftjis[ndx - 0x21];
            if (ptr){
                val = ptr[(c1 & 0x7f) - 0x21];
            }
            if (val){
                c2 = val >> 8;
                c1 = val & 0xff;
                if (p2) *p2 = c2;
                if (p1) *p1 = c1;
                return 0;
            }
            /* no table entry: relocate the row and fall into the generic path below */
            c2 = x0212_shift(c2);
        }
#endif /* X0212_ENABLE */
    }
    /* generic path: only values up to 0x7F can be converted arithmetically */
    if(0x7F < c2) return 1;
    if (p2) *p2 = ((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1);
    if (p1) *p1 = c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e);
    return 0;
}
1632
/*
 * Convert the byte pair (c2, c1) into a code pair stored through p2 and
 * p1 (each is written only when non-null).  Judging by the name this is
 * the Shift_JIS -> EUC direction -- confirm against the callers.
 *
 * Returns 1 when c1 is above 0xFC, otherwise 0.
 *
 * NOTE(review): the table lookups index with (c1 - 0x40); this assumes
 * the caller never passes c1 below 0x40 -- confirm.
 */
static nkf_char
s2e_conv(nkf_char c2, nkf_char c1, nkf_char *p2, nkf_char *p1)
{
#if defined(SHIFTJIS_CP932) || defined(X0212_ENABLE)
    nkf_char val;
#endif
    /* indexed by [c2 - 0xF0][c1 above 0x9E]; gives the low bits OR-ed into c2 below */
    static const char shift_jisx0213_s1a3_table[5][2] ={ { 1, 8}, { 3, 4}, { 5,12}, {13,14}, {15, 0} };
    if (0xFC < c1) return 1;
#ifdef SHIFTJIS_CP932
    /* replace (c2, c1) by the shiftjis_cp932 table entry when one exists */
    if (!cp932inv_f && !x0213_f && is_ibmext_in_sjis(c2)){
        val = shiftjis_cp932[c2 - CP932_TABLE_BEGIN][c1 - 0x40];
        if (val){
            c2 = val >> 8;
            c1 = val & 0xff;
        }
    }
    /* with cp932inv_f set, replace (c2, c1) by the cp932inv table entry when one exists */
    if (cp932inv_f
        && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
        val = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
        if (val){
            c2 = val >> 8;
            c1 = val & 0xff;
        }
    }
#endif /* SHIFTJIS_CP932 */
#ifdef X0212_ENABLE
    /* a non-zero shiftjis_x0212 entry is the final result */
    if (!x0213_f && is_ibmext_in_sjis(c2)){
        val = shiftjis_x0212[c2 - 0xfa][c1 - 0x40];
        if (val){
            if (val > 0x7FFF){
                /* entries above 0x7FFF are tagged with PREFIX_EUCG3 */
                c2 = PREFIX_EUCG3 | ((val >> 8) & 0x7f);
                c1 = val & 0xff;
            }else{
                c2 = val >> 8;
                c1 = val & 0xff;
            }
            if (p2) *p2 = c2;
            if (p1) *p1 = c1;
            return 0;
        }
    }
#endif
    /* arithmetic conversion for lead bytes of 0x80 and above */
    if(c2 >= 0x80){
        if(x0213_f && c2 >= 0xF0){
            if(c2 <= 0xF3 || (c2 == 0xF4 && c1 < 0x9F)){ /* k=1, 3<=k<=5, k=8, 12<=k<=15 */
                c2 = PREFIX_EUCG3 | 0x20 | shift_jisx0213_s1a3_table[c2 - 0xF0][0x9E < c1];
            }else{ /* 78<=k<=94 */
                c2 = PREFIX_EUCG3 | (c2 * 2 - 0x17B);
                if (0x9E < c1) c2++;
            }
        }else{
#define SJ0162 0x00e1 /* 01 - 62 ku offset */
#define SJ6394 0x0161 /* 63 - 94 ku offset */
            c2 = c2 + c2 - ((c2 <= 0x9F) ? SJ0162 : SJ6394);
            if (0x9E < c1) c2++;
        }
        /* trail byte: values below 0x9F and values from 0x9F up use different offsets */
        if (c1 < 0x9F)
            c1 = c1 - ((c1 > DEL) ? SP : 0x1F);
        else {
            c1 = c1 - 0x7E;
        }
    }

#ifdef X0212_ENABLE
    c2 = x0212_unshift(c2);
#endif
    if (p2) *p2 = c2;
    if (p1) *p1 = c1;
    return 0;
}
1703
1704 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
1705 static void
nkf_unicode_to_utf8(nkf_char val,nkf_char * p1,nkf_char * p2,nkf_char * p3,nkf_char * p4)1706 nkf_unicode_to_utf8(nkf_char val, nkf_char *p1, nkf_char *p2, nkf_char *p3, nkf_char *p4)
1707 {
1708 val &= VALUE_MASK;
1709 if (val < 0x80){
1710 *p1 = val;
1711 *p2 = 0;
1712 *p3 = 0;
1713 *p4 = 0;
1714 }else if (val < 0x800){
1715 *p1 = 0xc0 | (val >> 6);
1716 *p2 = 0x80 | (val & 0x3f);
1717 *p3 = 0;
1718 *p4 = 0;
1719 } else if (nkf_char_unicode_bmp_p(val)) {
1720 *p1 = 0xe0 | (val >> 12);
1721 *p2 = 0x80 | ((val >> 6) & 0x3f);
1722 *p3 = 0x80 | ( val & 0x3f);
1723 *p4 = 0;
1724 } else if (nkf_char_unicode_value_p(val)) {
1725 *p1 = 0xf0 | (val >> 18);
1726 *p2 = 0x80 | ((val >> 12) & 0x3f);
1727 *p3 = 0x80 | ((val >> 6) & 0x3f);
1728 *p4 = 0x80 | ( val & 0x3f);
1729 } else {
1730 *p1 = 0;
1731 *p2 = 0;
1732 *p3 = 0;
1733 *p4 = 0;
1734 }
1735 }
1736
1737 static nkf_char
nkf_utf8_to_unicode(nkf_char c1,nkf_char c2,nkf_char c3,nkf_char c4)1738 nkf_utf8_to_unicode(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
1739 {
1740 nkf_char wc;
1741 if (c1 <= 0x7F) {
1742 /* single byte */
1743 wc = c1;
1744 }
1745 else if (c1 <= 0xC1) {
1746 /* trail byte or invalid */
1747 return -1;
1748 }
1749 else if (c1 <= 0xDF) {
1750 /* 2 bytes */
1751 wc = (c1 & 0x1F) << 6;
1752 wc |= (c2 & 0x3F);
1753 }
1754 else if (c1 <= 0xEF) {
1755 /* 3 bytes */
1756 wc = (c1 & 0x0F) << 12;
1757 wc |= (c2 & 0x3F) << 6;
1758 wc |= (c3 & 0x3F);
1759 }
1760 else if (c2 <= 0xF4) {
1761 /* 4 bytes */
1762 wc = (c1 & 0x0F) << 18;
1763 wc |= (c2 & 0x3F) << 12;
1764 wc |= (c3 & 0x3F) << 6;
1765 wc |= (c4 & 0x3F);
1766 }
1767 else {
1768 return -1;
1769 }
1770 return wc;
1771 }
1772 #endif
1773
1774 #ifdef UTF8_INPUT_ENABLE
1775 static int
unicode_to_jis_common2(nkf_char c1,nkf_char c0,const unsigned short * const * pp,nkf_char psize,nkf_char * p2,nkf_char * p1)1776 unicode_to_jis_common2(nkf_char c1, nkf_char c0,
1777 const unsigned short *const *pp, nkf_char psize,
1778 nkf_char *p2, nkf_char *p1)
1779 {
1780 nkf_char c2;
1781 const unsigned short *p;
1782 unsigned short val;
1783
1784 if (pp == 0) return 1;
1785
1786 c1 -= 0x80;
1787 if (c1 < 0 || psize <= c1) return 1;
1788 p = pp[c1];
1789 if (p == 0) return 1;
1790
1791 c0 -= 0x80;
1792 if (c0 < 0 || sizeof_utf8_to_euc_C2 <= c0) return 1;
1793 val = p[c0];
1794 if (val == 0) return 1;
1795 if (no_cp932ext_f && (
1796 (val>>8) == 0x2D || /* NEC special characters */
1797 val > NKF_INT32_C(0xF300) /* IBM extended characters */
1798 )) return 1;
1799
1800 c2 = val >> 8;
1801 if (val > 0x7FFF){
1802 c2 &= 0x7f;
1803 c2 |= PREFIX_EUCG3;
1804 }
1805 if (c2 == SO) c2 = JIS_X_0201_1976_K;
1806 c1 = val & 0xFF;
1807 if (p2) *p2 = c2;
1808 if (p1) *p1 = c1;
1809 return 0;
1810 }
1811
1812 static int
unicode_to_jis_common(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char * p2,nkf_char * p1)1813 unicode_to_jis_common(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
1814 {
1815 const unsigned short *const *pp;
1816 const unsigned short *const *const *ppp;
1817 static const char no_best_fit_chars_table_C2[] =
1818 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1819 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1820 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2,
1821 0, 0, 1, 1, 0, 1, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1};
1822 static const char no_best_fit_chars_table_C2_ms[] =
1823 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1824 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1825 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
1826 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
1827 static const char no_best_fit_chars_table_932_C2[] =
1828 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1829 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1830 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1831 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0};
1832 static const char no_best_fit_chars_table_932_C3[] =
1833 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1834 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1835 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1836 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1};
1837 nkf_char ret = 0;
1838
1839 if(c2 < 0x80){
1840 *p2 = 0;
1841 *p1 = c2;
1842 }else if(c2 < 0xe0){
1843 if(no_best_fit_chars_f){
1844 if(ms_ucs_map_f == UCS_MAP_CP932){
1845 switch(c2){
1846 case 0xC2:
1847 if(no_best_fit_chars_table_932_C2[c1&0x3F]) return 1;
1848 break;
1849 case 0xC3:
1850 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1851 break;
1852 }
1853 }else if(!cp932inv_f){
1854 switch(c2){
1855 case 0xC2:
1856 if(no_best_fit_chars_table_C2[c1&0x3F]) return 1;
1857 break;
1858 case 0xC3:
1859 if(no_best_fit_chars_table_932_C3[c1&0x3F]) return 1;
1860 break;
1861 }
1862 }else if(ms_ucs_map_f == UCS_MAP_MS){
1863 if(c2 == 0xC2 && no_best_fit_chars_table_C2_ms[c1&0x3F]) return 1;
1864 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1865 switch(c2){
1866 case 0xC2:
1867 switch(c1){
1868 case 0xA2:
1869 case 0xA3:
1870 case 0xA5:
1871 case 0xA6:
1872 case 0xAC:
1873 case 0xAF:
1874 case 0xB8:
1875 return 1;
1876 }
1877 break;
1878 }
1879 }
1880 }
1881 pp =
1882 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_2bytes_932 :
1883 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_2bytes_ms :
1884 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_2bytes_mac :
1885 x0213_f ? utf8_to_euc_2bytes_x0213 :
1886 utf8_to_euc_2bytes;
1887 ret = unicode_to_jis_common2(c2, c1, pp, sizeof_utf8_to_euc_2bytes, p2, p1);
1888 }else if(c0 < 0xF0){
1889 if(no_best_fit_chars_f){
1890 if(ms_ucs_map_f == UCS_MAP_CP932){
1891 if(c2 == 0xE3 && c1 == 0x82 && c0 == 0x94) return 1;
1892 }else if(ms_ucs_map_f == UCS_MAP_MS){
1893 switch(c2){
1894 case 0xE2:
1895 switch(c1){
1896 case 0x80:
1897 if(c0 == 0x94 || c0 == 0x96 || c0 == 0xBE) return 1;
1898 break;
1899 case 0x88:
1900 if(c0 == 0x92) return 1;
1901 break;
1902 }
1903 break;
1904 case 0xE3:
1905 if(c1 == 0x80 || c0 == 0x9C) return 1;
1906 break;
1907 }
1908 }else if(ms_ucs_map_f == UCS_MAP_CP10001){
1909 switch(c2){
1910 case 0xE3:
1911 switch(c1){
1912 case 0x82:
1913 if(c0 == 0x94) return 1;
1914 break;
1915 case 0x83:
1916 if(c0 == 0xBB) return 1;
1917 break;
1918 }
1919 break;
1920 }
1921 }else{
1922 switch(c2){
1923 case 0xE2:
1924 switch(c1){
1925 case 0x80:
1926 if(c0 == 0x95) return 1;
1927 break;
1928 case 0x88:
1929 if(c0 == 0xA5) return 1;
1930 break;
1931 }
1932 break;
1933 case 0xEF:
1934 switch(c1){
1935 case 0xBC:
1936 if(c0 == 0x8D) return 1;
1937 break;
1938 case 0xBD:
1939 if(c0 == 0x9E && !cp932inv_f) return 1;
1940 break;
1941 case 0xBF:
1942 if(0xA0 <= c0 && c0 <= 0xA5) return 1;
1943 break;
1944 }
1945 break;
1946 }
1947 }
1948 }
1949 ppp =
1950 ms_ucs_map_f == UCS_MAP_CP932 ? utf8_to_euc_3bytes_932 :
1951 ms_ucs_map_f == UCS_MAP_MS ? utf8_to_euc_3bytes_ms :
1952 ms_ucs_map_f == UCS_MAP_CP10001 ? utf8_to_euc_3bytes_mac :
1953 x0213_f ? utf8_to_euc_3bytes_x0213 :
1954 utf8_to_euc_3bytes;
1955 ret = unicode_to_jis_common2(c1, c0, ppp[c2 - 0xE0], sizeof_utf8_to_euc_C2, p2, p1);
1956 }else return -1;
1957 #ifdef SHIFTJIS_CP932
1958 if (!ret&& is_eucg3(*p2)) {
1959 if (cp932inv_f) {
1960 if (encode_fallback) ret = 1;
1961 }
1962 else {
1963 nkf_char s2, s1;
1964 if (e2s_conv(*p2, *p1, &s2, &s1) == 0) {
1965 s2e_conv(s2, s1, p2, p1);
1966 }else{
1967 ret = 1;
1968 }
1969 }
1970 }
1971 #endif
1972 return ret;
1973 }
1974
1975 #ifdef UTF8_OUTPUT_ENABLE
/*
 * Search the first `size` rows of `tbl` for a row whose element [0]
 * equals `euc`; on the first match, assign that row's element [2] to the
 * variable `low`, which must exist in the scope where the macro is used.
 * `low` is left untouched when nothing matches.
 *
 * Every macro argument is parenthesized so that expression arguments
 * (e.g. `x & 0xFFFF`) keep their meaning inside the comparison.
 */
#define X0213_SURROGATE_FIND(tbl, size, euc) do { \
        int i; \
        for (i = 0; i < (size); i++) \
            if ((tbl)[i][0] == (euc)) { \
                low = (tbl)[i][2]; \
                break; \
            } \
    } while (0)
1984
/*
 * Look up the value for the code pair (c2, c1) in the euc_to_utf8 /
 * x0212_to_utf8 tables and return it (judging by the table names this is
 * the EUC -> Unicode direction -- confirm).
 *
 * The table row is chosen from c2, x0213_f and ms_ucs_map_f; the column
 * is (c1 & 0x7f) - 0x21.  Returns 0 when an index is out of range, the
 * row is null, or a required surrogate partner is not found.
 */
static nkf_char
e2w_conv(nkf_char c2, nkf_char c1)
{
    const unsigned short *p;

    if (c2 == JIS_X_0201_1976_K) {
        if (ms_ucs_map_f == UCS_MAP_CP10001) {
            /* two columns have fixed results under the CP10001 map */
            switch (c1) {
            case 0x20:
                return 0xA0;
            case 0x7D:
                return 0xA9;
            }
        }
        p = euc_to_utf8_1byte;
#ifdef X0212_ENABLE
    } else if (is_eucg3(c2)){
        /* one pair has a fixed result under the ASCII map */
        if(ms_ucs_map_f == UCS_MAP_ASCII&& c2 == NKF_INT32_C(0x8F22) && c1 == 0x43){
            return 0xA6;
        }
        c2 = (c2&0x7f) - 0x21;
        if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
            p =
                x0213_f ? x0212_to_utf8_2bytes_x0213[c2] :
                x0212_to_utf8_2bytes[c2];
        else
            return 0;
#endif
    } else {
        c2 &= 0x7f;
        c2 = (c2&0x7f) - 0x21;
        if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes)
            p =
                x0213_f ? euc_to_utf8_2bytes_x0213[c2] :
                ms_ucs_map_f == UCS_MAP_ASCII ? euc_to_utf8_2bytes[c2] :
                ms_ucs_map_f == UCS_MAP_CP10001 ? euc_to_utf8_2bytes_mac[c2] :
                euc_to_utf8_2bytes_ms[c2];
        else
            return 0;
    }
    if (!p) return 0;
    c1 = (c1 & 0x7f) - 0x21;
    if (0<=c1 && c1<sizeof_euc_to_utf8_1byte) {
        nkf_char val = p[c1];
        if (x0213_f && 0xD800<=val && val<=0xDBFF) {
            /* the table holds only a high surrogate: find the low one in the
               surrogate table, keyed by the rebuilt two-byte code */
            nkf_char euc = (c2+0x21)<<8 | (c1+0x21);
            nkf_char low = 0;
            if (p==x0212_to_utf8_2bytes_x0213[c2]) {
                X0213_SURROGATE_FIND(x0213_2_surrogate_table, sizeof_x0213_2_surrogate_table, euc);
            } else {
                X0213_SURROGATE_FIND(x0213_1_surrogate_table, sizeof_x0213_1_surrogate_table, euc);
            }
            if (!low) return 0;
            return UTF16_TO_UTF32(val, low);
        } else {
            return val;
        }
    }
    return 0;
}
2045
2046 static nkf_char
e2w_combining(nkf_char comb,nkf_char c2,nkf_char c1)2047 e2w_combining(nkf_char comb, nkf_char c2, nkf_char c1)
2048 {
2049 nkf_char euc;
2050 int i;
2051 for (i = 0; i < sizeof_x0213_combining_chars; i++)
2052 if (x0213_combining_chars[i] == comb)
2053 break;
2054 if (i >= sizeof_x0213_combining_chars)
2055 return 0;
2056 euc = (c2&0x7f)<<8 | (c1&0x7f);
2057 for (i = 0; i < sizeof_x0213_combining_table; i++)
2058 if (x0213_combining_table[i][0] == euc)
2059 return x0213_combining_table[i][1];
2060 return 0;
2061 }
2062 #endif
2063
2064 static nkf_char
w2e_conv(nkf_char c2,nkf_char c1,nkf_char c0,nkf_char * p2,nkf_char * p1)2065 w2e_conv(nkf_char c2, nkf_char c1, nkf_char c0, nkf_char *p2, nkf_char *p1)
2066 {
2067 nkf_char ret = 0;
2068
2069 if (!c1){
2070 *p2 = 0;
2071 *p1 = c2;
2072 }else if (0xc0 <= c2 && c2 <= 0xef) {
2073 ret = unicode_to_jis_common(c2, c1, c0, p2, p1);
2074 #ifdef NUMCHAR_OPTION
2075 if (ret > 0){
2076 if (p2) *p2 = 0;
2077 if (p1) *p1 = nkf_char_unicode_new(nkf_utf8_to_unicode(c2, c1, c0, 0));
2078 ret = 0;
2079 }
2080 #endif
2081 }
2082 return ret;
2083 }
2084
2085 #ifdef UTF8_INPUT_ENABLE
2086 static nkf_char
w16e_conv(nkf_char val,nkf_char * p2,nkf_char * p1)2087 w16e_conv(nkf_char val, nkf_char *p2, nkf_char *p1)
2088 {
2089 nkf_char c1, c2, c3, c4;
2090 nkf_char ret = 0;
2091 val &= VALUE_MASK;
2092 if (val < 0x80) {
2093 *p2 = 0;
2094 *p1 = val;
2095 }
2096 else if (nkf_char_unicode_bmp_p(val)){
2097 nkf_unicode_to_utf8(val, &c1, &c2, &c3, &c4);
2098 ret = unicode_to_jis_common(c1, c2, c3, p2, p1);
2099 if (ret > 0){
2100 *p2 = 0;
2101 *p1 = nkf_char_unicode_new(val);
2102 ret = 0;
2103 }
2104 }
2105 else {
2106 int i;
2107 if (x0213_f) {
2108 c1 = (val >> 10) + NKF_INT32_C(0xD7C0); /* high surrogate */
2109 c2 = (val & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */
2110 for (i = 0; i < sizeof_x0213_1_surrogate_table; i++)
2111 if (x0213_1_surrogate_table[i][1] == c1 && x0213_1_surrogate_table[i][2] == c2) {
2112 val = x0213_1_surrogate_table[i][0];
2113 *p2 = val >> 8;
2114 *p1 = val & 0xFF;
2115 return 0;
2116 }
2117 for (i = 0; i < sizeof_x0213_2_surrogate_table; i++)
2118 if (x0213_2_surrogate_table[i][1] == c1 && x0213_2_surrogate_table[i][2] == c2) {
2119 val = x0213_2_surrogate_table[i][0];
2120 *p2 = PREFIX_EUCG3 | (val >> 8);
2121 *p1 = val & 0xFF;
2122 return 0;
2123 }
2124 }
2125 *p2 = 0;
2126 *p1 = nkf_char_unicode_new(val);
2127 }
2128 return ret;
2129 }
2130 #endif
2131
/*
 * Input-side converter (presumably for EUC-JP input, judging by the name
 * and the eucJP-ms comments below): normalises (c2, c1) -- plus c0 for
 * the three-byte form introduced by 0x8f -- and passes the result to
 * oconv.
 *
 * Returns -1 when c2 is 0x8f and c0 is 0, otherwise 0 after calling oconv.
 */
static nkf_char
e_iconv(nkf_char c2, nkf_char c1, nkf_char c0)
{
    if (c2 == JIS_X_0201_1976_K || c2 == SS2){
        /* with iso2022jp_f set and x0201_f clear the character is replaced
           by the GETA1/GETA2 pair; otherwise c1 is reduced to seven bits */
        if (iso2022jp_f && !x0201_f) {
            c2 = GETA1; c1 = GETA2;
        } else {
            c2 = JIS_X_0201_1976_K;
            c1 &= 0x7f;
        }
#ifdef X0212_ENABLE
    }else if (c2 == 0x8f){
        /* three-byte form: the third byte c0 must be present */
        if (c0 == 0){
            return -1;
        }
        if (!cp51932_f && !x0213_f && 0xF5 <= c1 && c1 <= 0xFE && 0xA1 <= c0 && c0 <= 0xFE) {
            /* encoding is eucJP-ms, so invert to Unicode Private User Area */
            c1 = nkf_char_unicode_new((c1 - 0xF5) * 94 + c0 - 0xA1 + 0xE3AC);
            c2 = 0;
        } else {
            /* fold the 0x8f prefix and the row byte into c2 */
            c2 = (c2 << 8) | (c1 & 0x7f);
            c1 = c0 & 0x7f;
#ifdef SHIFTJIS_CP932
            if (cp51932_f){
                /* round trip through e2s_conv()/s2e_conv() when the first step succeeds */
                nkf_char s2, s1;
                if (e2s_conv(c2, c1, &s2, &s1) == 0){
                    s2e_conv(s2, s1, &c2, &c1);
                    if (c2 < 0x100){
                        c1 &= 0x7f;
                        c2 &= 0x7f;
                    }
                }
            }
#endif /* SHIFTJIS_CP932 */
        }
#endif /* X0212_ENABLE */
    } else if ((c2 == EOF) || (c2 == 0) || c2 < SP || c2 == ISO_8859_1) {
        /* NOP */
    } else {
        if (!cp51932_f && ms_ucs_map_f && 0xF5 <= c2 && c2 <= 0xFE && 0xA1 <= c1 && c1 <= 0xFE) {
            /* encoding is eucJP-ms, so invert to Unicode Private User Area */
            c1 = nkf_char_unicode_new((c2 - 0xF5) * 94 + c1 - 0xA1 + 0xE000);
            c2 = 0;
        } else {
            /* ordinary two-byte character: reduce both bytes to seven bits */
            c1 &= 0x7f;
            c2 &= 0x7f;
#ifdef SHIFTJIS_CP932
            if (cp51932_f && 0x79 <= c2 && c2 <= 0x7c){
                /* rows 0x79..0x7c: round trip through e2s_conv()/s2e_conv() */
                nkf_char s2, s1;
                if (e2s_conv(c2, c1, &s2, &s1) == 0){
                    s2e_conv(s2, s1, &c2, &c1);
                    if (c2 < 0x100){
                        c1 &= 0x7f;
                        c2 &= 0x7f;
                    }
                }
            }
#endif /* SHIFTJIS_CP932 */
        }
    }
    (*oconv)(c2, c1);
    return 0;
}
2195
2196 static nkf_char
s_iconv(ARG_UNUSED nkf_char c2,nkf_char c1,ARG_UNUSED nkf_char c0)2197 s_iconv(ARG_UNUSED nkf_char c2, nkf_char c1, ARG_UNUSED nkf_char c0)
2198 {
2199 if (c2 == JIS_X_0201_1976_K || (0xA1 <= c2 && c2 <= 0xDF)) {
2200 if (iso2022jp_f && !x0201_f) {
2201 c2 = GETA1; c1 = GETA2;
2202 } else {
2203 c1 &= 0x7f;
2204 }
2205 } else if ((c2 == EOF) || (c2 == 0) || c2 < SP) {
2206 /* NOP */
2207 } else if (!x0213_f && 0xF0 <= c2 && c2 <= 0xF9 && 0x40 <= c1 && c1 <= 0xFC) {
2208 /* CP932 UDC */
2209 if(c1 == 0x7F) return 0;
2210 c1 = nkf_char_unicode_new((c2 - 0xF0) * 188 + (c1 - 0x40 - (0x7E < c1)) + 0xE000);
2211 c2 = 0;
2212 } else {
2213 nkf_char ret = s2e_conv(c2, c1, &c2, &c1);
2214 if (ret) return ret;
2215 }
2216 (*oconv)(c2, c1);
2217 return 0;
2218 }
2219
2220 static int
x0213_wait_combining_p(nkf_char wc)2221 x0213_wait_combining_p(nkf_char wc)
2222 {
2223 int i;
2224 for (i = 0; i < sizeof_x0213_combining_table; i++) {
2225 if (x0213_combining_table[i][1] == wc) {
2226 return TRUE;
2227 }
2228 }
2229 return FALSE;
2230 }
2231
2232 static int
x0213_combining_p(nkf_char wc)2233 x0213_combining_p(nkf_char wc)
2234 {
2235 int i;
2236 for (i = 0; i < sizeof_x0213_combining_chars; i++) {
2237 if (x0213_combining_chars[i] == wc) {
2238 return TRUE;
2239 }
2240 }
2241 return FALSE;
2242 }
2243
2244 static nkf_char
w_iconv(nkf_char c1,nkf_char c2,nkf_char c3)2245 w_iconv(nkf_char c1, nkf_char c2, nkf_char c3)
2246 {
2247 nkf_char ret = 0, c4 = 0;
2248 static const char w_iconv_utf8_1st_byte[] =
2249 { /* 0xC0 - 0xFF */
2250 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
2251 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
2252 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
2253 40, 41, 41, 41, 42, 43, 43, 43, 50, 50, 50, 50, 60, 60, 70, 70};
2254
2255 if (c3 > 0xFF) {
2256 c4 = c3 & 0xFF;
2257 c3 >>= 8;
2258 }
2259
2260 if (c1 < 0 || 0xff < c1) {
2261 }else if (c1 == 0) { /* 0 : 1 byte*/
2262 c3 = 0;
2263 } else if ((c1 & 0xC0) == 0x80) { /* 0x80-0xbf : trail byte */
2264 return 0;
2265 } else{
2266 switch (w_iconv_utf8_1st_byte[c1 - 0xC0]) {
2267 case 21:
2268 if (c2 < 0x80 || 0xBF < c2) return 0;
2269 break;
2270 case 30:
2271 if (c3 == 0) return -1;
2272 if (c2 < 0xA0 || 0xBF < c2 || (c3 & 0xC0) != 0x80)
2273 return 0;
2274 break;
2275 case 31:
2276 case 33:
2277 if (c3 == 0) return -1;
2278 if ((c2 & 0xC0) != 0x80 || (c3 & 0xC0) != 0x80)
2279 return 0;
2280 break;
2281 case 32:
2282 if (c3 == 0) return -1;
2283 if (c2 < 0x80 || 0x9F < c2 || (c3 & 0xC0) != 0x80)
2284 return 0;
2285 break;
2286 case 40:
2287 if (c3 == 0) return -2;
2288 if (c2 < 0x90 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2289 return 0;
2290 break;
2291 case 41:
2292 if (c3 == 0) return -2;
2293 if (c2 < 0x80 || 0xBF < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2294 return 0;
2295 break;
2296 case 42:
2297 if (c3 == 0) return -2;
2298 if (c2 < 0x80 || 0x8F < c2 || (c3 & 0xC0) != 0x80 || (c4 & 0xC0) != 0x80)
2299 return 0;
2300 break;
2301 default:
2302 return 0;
2303 break;
2304 }
2305 }
2306 if (c1 == 0 || c1 == EOF){
2307 } else if ((c1 & 0xf8) == 0xf0) { /* 4 bytes */
2308 c2 = nkf_char_unicode_new(nkf_utf8_to_unicode(c1, c2, c3, c4));
2309 c1 = 0;
2310 } else {
2311 if (x0213_f && x0213_wait_combining_p(nkf_utf8_to_unicode(c1, c2, c3, c4)))
2312 return -3;
2313 ret = w2e_conv(c1, c2, c3, &c1, &c2);
2314 }
2315 if (ret == 0){
2316 (*oconv)(c1, c2);
2317 }
2318 return ret;
2319 }
2320
2321 static nkf_char
w_iconv_nocombine(nkf_char c1,nkf_char c2,nkf_char c3)2322 w_iconv_nocombine(nkf_char c1, nkf_char c2, nkf_char c3)
2323 {
2324 /* continue from the line below 'return -3;' in w_iconv() */
2325 nkf_char ret = w2e_conv(c1, c2, c3, &c1, &c2);
2326 if (ret == 0){
2327 (*oconv)(c1, c2);
2328 }
2329 return ret;
2330 }
2331
2332 #define NKF_ICONV_INVALID_CODE_RANGE -13
2333 #define NKF_ICONV_WAIT_COMBINING_CHAR -14
2334 #define NKF_ICONV_NOT_COMBINED -15
2335 static size_t
unicode_iconv(nkf_char wc,int nocombine)2336 unicode_iconv(nkf_char wc, int nocombine)
2337 {
2338 nkf_char c1, c2;
2339 int ret = 0;
2340
2341 if (wc < 0x80) {
2342 c2 = 0;
2343 c1 = wc;
2344 }else if ((wc>>11) == 27) {
2345 /* unpaired surrogate */
2346 return NKF_ICONV_INVALID_CODE_RANGE;
2347 }else if (wc < 0xFFFF) {
2348 if (!nocombine && x0213_f && x0213_wait_combining_p(wc))
2349 return NKF_ICONV_WAIT_COMBINING_CHAR;
2350 ret = w16e_conv(wc, &c2, &c1);
2351 if (ret) return ret;
2352 }else if (wc < 0x10FFFF) {
2353 c2 = 0;
2354 c1 = nkf_char_unicode_new(wc);
2355 } else {
2356 return NKF_ICONV_INVALID_CODE_RANGE;
2357 }
2358 (*oconv)(c2, c1);
2359 return 0;
2360 }
2361
2362 static nkf_char
unicode_iconv_combine(nkf_char wc,nkf_char wc2)2363 unicode_iconv_combine(nkf_char wc, nkf_char wc2)
2364 {
2365 nkf_char c1, c2;
2366 int i;
2367
2368 if (wc2 < 0x80) {
2369 return NKF_ICONV_NOT_COMBINED;
2370 }else if ((wc2>>11) == 27) {
2371 /* unpaired surrogate */
2372 return NKF_ICONV_INVALID_CODE_RANGE;
2373 }else if (wc2 < 0xFFFF) {
2374 if (!x0213_combining_p(wc2))
2375 return NKF_ICONV_NOT_COMBINED;
2376 for (i = 0; i < sizeof_x0213_combining_table; i++) {
2377 if (x0213_combining_table[i][1] == wc &&
2378 x0213_combining_table[i][2] == wc2) {
2379 c2 = x0213_combining_table[i][0] >> 8;
2380 c1 = x0213_combining_table[i][0] & 0x7f;
2381 (*oconv)(c2, c1);
2382 return 0;
2383 }
2384 }
2385 }else if (wc2 < 0x10FFFF) {
2386 return NKF_ICONV_NOT_COMBINED;
2387 } else {
2388 return NKF_ICONV_INVALID_CODE_RANGE;
2389 }
2390 return NKF_ICONV_NOT_COMBINED;
2391 }
2392
2393 static nkf_char
w_iconv_combine(nkf_char c1,nkf_char c2,nkf_char c3,nkf_char c4,nkf_char c5,nkf_char c6)2394 w_iconv_combine(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4, nkf_char c5, nkf_char c6)
2395 {
2396 nkf_char wc, wc2;
2397 wc = nkf_utf8_to_unicode(c1, c2, c3, 0);
2398 wc2 = nkf_utf8_to_unicode(c4, c5, c6, 0);
2399 if (wc2 < 0)
2400 return wc2;
2401 return unicode_iconv_combine(wc, wc2);
2402 }
2403
2404 #define NKF_ICONV_NEED_ONE_MORE_BYTE (size_t)-1
2405 #define NKF_ICONV_NEED_TWO_MORE_BYTES (size_t)-2
2406 static size_t
nkf_iconv_utf_16(nkf_char c1,nkf_char c2,nkf_char c3,nkf_char c4)2407 nkf_iconv_utf_16(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2408 {
2409 nkf_char wc;
2410
2411 if (c1 == EOF) {
2412 (*oconv)(EOF, 0);
2413 return 0;
2414 }
2415
2416 if (input_endian == ENDIAN_BIG) {
2417 if (0xD8 <= c1 && c1 <= 0xDB) {
2418 if (0xDC <= c3 && c3 <= 0xDF) {
2419 wc = UTF16_TO_UTF32(c1 << 8 | c2, c3 << 8 | c4);
2420 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
2421 } else {
2422 wc = c1 << 8 | c2;
2423 }
2424 } else {
2425 if (0xD8 <= c2 && c2 <= 0xDB) {
2426 if (0xDC <= c4 && c4 <= 0xDF) {
2427 wc = UTF16_TO_UTF32(c2 << 8 | c1, c4 << 8 | c3);
2428 } else return NKF_ICONV_NEED_TWO_MORE_BYTES;
2429 } else {
2430 wc = c2 << 8 | c1;
2431 }
2432 }
2433
2434 return (*unicode_iconv)(wc, FALSE);
2435 }
2436
2437 static size_t
nkf_iconv_utf_16_combine(nkf_char c1,nkf_char c2,nkf_char c3,nkf_char c4)2438 nkf_iconv_utf_16_combine(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2439 {
2440 nkf_char wc, wc2;
2441
2442 if (input_endian == ENDIAN_BIG) {
2443 if (0xD8 <= c3 && c3 <= 0xDB) {
2444 return NKF_ICONV_NOT_COMBINED;
2445 } else {
2446 wc = c1 << 8 | c2;
2447 wc2 = c3 << 8 | c4;
2448 }
2449 } else {
2450 if (0xD8 <= c2 && c2 <= 0xDB) {
2451 return NKF_ICONV_NOT_COMBINED;
2452 } else {
2453 wc = c2 << 8 | c1;
2454 wc2 = c4 << 8 | c3;
2455 }
2456 }
2457
2458 return unicode_iconv_combine(wc, wc2);
2459 }
2460
2461 static size_t
nkf_iconv_utf_16_nocombine(nkf_char c1,nkf_char c2)2462 nkf_iconv_utf_16_nocombine(nkf_char c1, nkf_char c2)
2463 {
2464 nkf_char wc;
2465 if (input_endian == ENDIAN_BIG)
2466 wc = c1 << 8 | c2;
2467 else
2468 wc = c2 << 8 | c1;
2469 return (*unicode_iconv)(wc, TRUE);
2470 }
2471
2472 static nkf_char
w_iconv16(nkf_char c2,nkf_char c1,ARG_UNUSED nkf_char c0)2473 w_iconv16(nkf_char c2, nkf_char c1, ARG_UNUSED nkf_char c0)
2474 {
2475 (*oconv)(c2, c1);
2476 return 16; /* different from w_iconv32 */
2477 }
2478
2479 static nkf_char
w_iconv32(nkf_char c2,nkf_char c1,ARG_UNUSED nkf_char c0)2480 w_iconv32(nkf_char c2, nkf_char c1, ARG_UNUSED nkf_char c0)
2481 {
2482 (*oconv)(c2, c1);
2483 return 32; /* different from w_iconv16 */
2484 }
2485
2486 static nkf_char
utf32_to_nkf_char(nkf_char c1,nkf_char c2,nkf_char c3,nkf_char c4)2487 utf32_to_nkf_char(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2488 {
2489 nkf_char wc;
2490
2491 switch(input_endian){
2492 case ENDIAN_BIG:
2493 wc = c2 << 16 | c3 << 8 | c4;
2494 break;
2495 case ENDIAN_LITTLE:
2496 wc = c3 << 16 | c2 << 8 | c1;
2497 break;
2498 case ENDIAN_2143:
2499 wc = c1 << 16 | c4 << 8 | c3;
2500 break;
2501 case ENDIAN_3412:
2502 wc = c4 << 16 | c1 << 8 | c2;
2503 break;
2504 default:
2505 return NKF_ICONV_INVALID_CODE_RANGE;
2506 }
2507 return wc;
2508 }
2509
2510 static size_t
nkf_iconv_utf_32(nkf_char c1,nkf_char c2,nkf_char c3,nkf_char c4)2511 nkf_iconv_utf_32(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2512 {
2513 nkf_char wc;
2514
2515 if (c1 == EOF) {
2516 (*oconv)(EOF, 0);
2517 return 0;
2518 }
2519
2520 wc = utf32_to_nkf_char(c1, c2, c3, c4);
2521 if (wc < 0)
2522 return wc;
2523
2524 return (*unicode_iconv)(wc, FALSE);
2525 }
2526
2527 static nkf_char
nkf_iconv_utf_32_combine(nkf_char c1,nkf_char c2,nkf_char c3,nkf_char c4,nkf_char c5,nkf_char c6,nkf_char c7,nkf_char c8)2528 nkf_iconv_utf_32_combine(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4, nkf_char c5, nkf_char c6, nkf_char c7, nkf_char c8)
2529 {
2530 nkf_char wc, wc2;
2531
2532 wc = utf32_to_nkf_char(c1, c2, c3, c4);
2533 if (wc < 0)
2534 return wc;
2535 wc2 = utf32_to_nkf_char(c5, c6, c7, c8);
2536 if (wc2 < 0)
2537 return wc2;
2538
2539 return unicode_iconv_combine(wc, wc2);
2540 }
2541
2542 static size_t
nkf_iconv_utf_32_nocombine(nkf_char c1,nkf_char c2,nkf_char c3,nkf_char c4)2543 nkf_iconv_utf_32_nocombine(nkf_char c1, nkf_char c2, nkf_char c3, nkf_char c4)
2544 {
2545 nkf_char wc;
2546
2547 wc = utf32_to_nkf_char(c1, c2, c3, c4);
2548 return (*unicode_iconv)(wc, TRUE);
2549 }
2550 #endif
2551
/*
 * Unless output_mode is already ASCII or ISO_8859_1, emit ESC ( followed
 * by ascii_intro and record `mode` as the new output_mode.
 */
#define output_ascii_escape_sequence(mode) do { \
        if (output_mode != ASCII && output_mode != ISO_8859_1) { \
            (*o_putc)(ESC); \
            (*o_putc)('('); \
            (*o_putc)(ascii_intro); \
            output_mode = (mode); \
        } \
    } while (0)
2560
2561 static void
output_escape_sequence(int mode)2562 output_escape_sequence(int mode)
2563 {
2564 if (output_mode == mode)
2565 return;
2566 switch(mode) {
2567 case ISO_8859_1:
2568 (*o_putc)(ESC);
2569 (*o_putc)('.');
2570 (*o_putc)('A');
2571 break;
2572 case JIS_X_0201_1976_K:
2573 (*o_putc)(ESC);
2574 (*o_putc)('(');
2575 (*o_putc)('I');
2576 break;
2577 case JIS_X_0208:
2578 (*o_putc)(ESC);
2579 (*o_putc)('$');
2580 (*o_putc)(kanji_intro);
2581 break;
2582 case JIS_X_0212:
2583 (*o_putc)(ESC);
2584 (*o_putc)('$');
2585 (*o_putc)('(');
2586 (*o_putc)('D');
2587 break;
2588 case JIS_X_0213_1:
2589 (*o_putc)(ESC);
2590 (*o_putc)('$');
2591 (*o_putc)('(');
2592 (*o_putc)('Q');
2593 break;
2594 case JIS_X_0213_2:
2595 (*o_putc)(ESC);
2596 (*o_putc)('$');
2597 (*o_putc)('(');
2598 (*o_putc)('P');
2599 break;
2600 }
2601 output_mode = mode;
2602 }
2603
2604 static void
j_oconv(nkf_char c2,nkf_char c1)2605 j_oconv(nkf_char c2, nkf_char c1)
2606 {
2607 #ifdef NUMCHAR_OPTION
2608 if (c2 == 0 && nkf_char_unicode_p(c1)){
2609 w16e_conv(c1, &c2, &c1);
2610 if (c2 == 0 && nkf_char_unicode_p(c1)){
2611 c2 = c1 & VALUE_MASK;
2612 if (ms_ucs_map_f && 0xE000 <= c2 && c2 <= 0xE757) {
2613 /* CP5022x UDC */
2614 c1 &= 0xFFF;
2615 c2 = 0x7F + c1 / 94;
2616 c1 = 0x21 + c1 % 94;
2617 } else {
2618 if (encode_fallback) (*encode_fallback)(c1);
2619 return;
2620 }
2621 }
2622 }
2623 #endif
2624 if (c2 == 0) {
2625 output_ascii_escape_sequence(ASCII);
2626 (*o_putc)(c1);
2627 }
2628 else if (c2 == EOF) {
2629 output_ascii_escape_sequence(ASCII);
2630 (*o_putc)(EOF);
2631 }
2632 else if (c2 == ISO_8859_1) {
2633 output_ascii_escape_sequence(ISO_8859_1);
2634 (*o_putc)(c1|0x80);
2635 }
2636 else if (c2 == JIS_X_0201_1976_K) {
2637 output_escape_sequence(JIS_X_0201_1976_K);
2638 (*o_putc)(c1);
2639 #ifdef X0212_ENABLE
2640 } else if (is_eucg3(c2)){
2641 output_escape_sequence(x0213_f ? JIS_X_0213_2 : JIS_X_0212);
2642 (*o_putc)(c2 & 0x7f);
2643 (*o_putc)(c1);
2644 #endif
2645 } else {
2646 if(ms_ucs_map_f
2647 ? c2<0x20 || 0x92<c2 || c1<0x20 || 0x7e<c1
2648 : c2<0x20 || 0x7e<c2 || c1<0x20 || 0x7e<c1) return;
2649 output_escape_sequence(x0213_f ? JIS_X_0213_1 : JIS_X_0208);
2650 (*o_putc)(c2);
2651 (*o_putc)(c1);
2652 }
2653 }
2654
2655 static void
e_oconv(nkf_char c2,nkf_char c1)2656 e_oconv(nkf_char c2, nkf_char c1)
2657 {
2658 if (c2 == 0 && nkf_char_unicode_p(c1)){
2659 w16e_conv(c1, &c2, &c1);
2660 if (c2 == 0 && nkf_char_unicode_p(c1)){
2661 c2 = c1 & VALUE_MASK;
2662 if (x0212_f && 0xE000 <= c2 && c2 <= 0xE757) {
2663 /* eucJP-ms UDC */
2664 c1 &= 0xFFF;
2665 c2 = c1 / 94;
2666 c2 += c2 < 10 ? 0x75 : 0x8FEB;
2667 c1 = 0x21 + c1 % 94;
2668 if (is_eucg3(c2)){
2669 (*o_putc)(0x8f);
2670 (*o_putc)((c2 & 0x7f) | 0x080);
2671 (*o_putc)(c1 | 0x080);
2672 }else{
2673 (*o_putc)((c2 & 0x7f) | 0x080);
2674 (*o_putc)(c1 | 0x080);
2675 }
2676 return;
2677 } else {
2678 if (encode_fallback) (*encode_fallback)(c1);
2679 return;
2680 }
2681 }
2682 }
2683
2684 if (c2 == EOF) {
2685 (*o_putc)(EOF);
2686 } else if (c2 == 0) {
2687 output_mode = ASCII;
2688 (*o_putc)(c1);
2689 } else if (c2 == JIS_X_0201_1976_K) {
2690 output_mode = EUC_JP;
2691 (*o_putc)(SS2); (*o_putc)(c1|0x80);
2692 } else if (c2 == ISO_8859_1) {
2693 output_mode = ISO_8859_1;
2694 (*o_putc)(c1 | 0x080);
2695 #ifdef X0212_ENABLE
2696 } else if (is_eucg3(c2)){
2697 output_mode = EUC_JP;
2698 #ifdef SHIFTJIS_CP932
2699 if (!cp932inv_f){
2700 nkf_char s2, s1;
2701 if (e2s_conv(c2, c1, &s2, &s1) == 0){
2702 s2e_conv(s2, s1, &c2, &c1);
2703 }
2704 }
2705 #endif
2706 if (c2 == 0) {
2707 output_mode = ASCII;
2708 (*o_putc)(c1);
2709 }else if (is_eucg3(c2)){
2710 if (x0212_f){
2711 (*o_putc)(0x8f);
2712 (*o_putc)((c2 & 0x7f) | 0x080);
2713 (*o_putc)(c1 | 0x080);
2714 }
2715 }else{
2716 (*o_putc)((c2 & 0x7f) | 0x080);
2717 (*o_putc)(c1 | 0x080);
2718 }
2719 #endif
2720 } else {
2721 if (!nkf_isgraph(c1) || !nkf_isgraph(c2)) {
2722 set_iconv(FALSE, 0);
2723 return; /* too late to rescue this char */
2724 }
2725 output_mode = EUC_JP;
2726 (*o_putc)(c2 | 0x080);
2727 (*o_putc)(c1 | 0x080);
2728 }
2729 }
2730
2731 static void
s_oconv(nkf_char c2,nkf_char c1)2732 s_oconv(nkf_char c2, nkf_char c1)
2733 {
2734 #ifdef NUMCHAR_OPTION
2735 if (c2 == 0 && nkf_char_unicode_p(c1)){
2736 w16e_conv(c1, &c2, &c1);
2737 if (c2 == 0 && nkf_char_unicode_p(c1)){
2738 c2 = c1 & VALUE_MASK;
2739 if (!x0213_f && 0xE000 <= c2 && c2 <= 0xE757) {
2740 /* CP932 UDC */
2741 c1 &= 0xFFF;
2742 c2 = c1 / 188 + (cp932inv_f ? 0xF0 : 0xEB);
2743 c1 = c1 % 188;
2744 c1 += 0x40 + (c1 > 0x3e);
2745 (*o_putc)(c2);
2746 (*o_putc)(c1);
2747 return;
2748 } else {
2749 if(encode_fallback)(*encode_fallback)(c1);
2750 return;
2751 }
2752 }
2753 }
2754 #endif
2755 if (c2 == EOF) {
2756 (*o_putc)(EOF);
2757 return;
2758 } else if (c2 == 0) {
2759 output_mode = ASCII;
2760 (*o_putc)(c1);
2761 } else if (c2 == JIS_X_0201_1976_K) {
2762 output_mode = SHIFT_JIS;
2763 (*o_putc)(c1|0x80);
2764 } else if (c2 == ISO_8859_1) {
2765 output_mode = ISO_8859_1;
2766 (*o_putc)(c1 | 0x080);
2767 #ifdef X0212_ENABLE
2768 } else if (is_eucg3(c2)){
2769 output_mode = SHIFT_JIS;
2770 if (e2s_conv(c2, c1, &c2, &c1) == 0){
2771 (*o_putc)(c2);
2772 (*o_putc)(c1);
2773 }
2774 #endif
2775 } else {
2776 if (!nkf_isprint(c1) || !nkf_isprint(c2)) {
2777 set_iconv(FALSE, 0);
2778 return; /* too late to rescue this char */
2779 }
2780 output_mode = SHIFT_JIS;
2781 e2s_conv(c2, c1, &c2, &c1);
2782
2783 #ifdef SHIFTJIS_CP932
2784 if (cp932inv_f
2785 && CP932INV_TABLE_BEGIN <= c2 && c2 <= CP932INV_TABLE_END){
2786 nkf_char c = cp932inv[c2 - CP932INV_TABLE_BEGIN][c1 - 0x40];
2787 if (c){
2788 c2 = c >> 8;
2789 c1 = c & 0xff;
2790 }
2791 }
2792 #endif /* SHIFTJIS_CP932 */
2793
2794 (*o_putc)(c2);
2795 if (prefix_table[(unsigned char)c1]){
2796 (*o_putc)(prefix_table[(unsigned char)c1]);
2797 }
2798 (*o_putc)(c1);
2799 }
2800 }
2801
2802 #ifdef UTF8_OUTPUT_ENABLE
/*
 * Encode `val` with nkf_unicode_to_utf8() into the caller's c1..c4 and
 * write the result: c1 always, c2..c4 only when non-zero.  The caller must
 * have variables named c1, c2, c3 and c4 in scope; they are overwritten.
 */
#define OUTPUT_UTF8(val) do { \
        nkf_unicode_to_utf8((val), &c1, &c2, &c3, &c4); \
        (*o_putc)(c1); \
        if (c2) (*o_putc)(c2); \
        if (c3) (*o_putc)(c3); \
        if (c4) (*o_putc)(c4); \
    } while (0)
2810
2811 static void
w_oconv(nkf_char c2,nkf_char c1)2812 w_oconv(nkf_char c2, nkf_char c1)
2813 {
2814 nkf_char c3, c4;
2815 nkf_char val, val2;
2816
2817 if (output_bom_f) {
2818 output_bom_f = FALSE;
2819 (*o_putc)('\357');
2820 (*o_putc)('\273');
2821 (*o_putc)('\277');
2822 }
2823
2824 if (c2 == EOF) {
2825 (*o_putc)(EOF);
2826 return;
2827 }
2828
2829 if (c2 == 0 && nkf_char_unicode_p(c1)){
2830 val = c1 & VALUE_MASK;
2831 OUTPUT_UTF8(val);
2832 return;
2833 }
2834
2835 if (c2 == 0) {
2836 (*o_putc)(c1);
2837 } else {
2838 val = e2w_conv(c2, c1);
2839 if (val){
2840 val2 = e2w_combining(val, c2, c1);
2841 if (val2)
2842 OUTPUT_UTF8(val2);
2843 OUTPUT_UTF8(val);
2844 }
2845 }
2846 }
2847
/*
 * Write one UTF-16 code unit given its low and high byte; output_endian
 * decides which of the two goes out first.
 */
#define OUTPUT_UTF16_BYTES(lo, hi) do { \
        if (output_endian != ENDIAN_LITTLE) { \
            (*o_putc)(hi); \
            (*o_putc)(lo); \
        } else { \
            (*o_putc)(lo); \
            (*o_putc)(hi); \
        } \
    } while (0)

/*
 * Write `val` as UTF-16: one code unit when nkf_char_unicode_bmp_p()
 * accepts it, otherwise a surrogate pair provided the masked value does
 * not exceed UNICODE_MAX (larger values are dropped).  `val` must be an
 * lvalue; it and the caller's c1 and c2 are overwritten.
 */
#define OUTPUT_UTF16(val) do { \
        if (!nkf_char_unicode_bmp_p(val)) { \
            val &= VALUE_MASK; \
            if (val <= UNICODE_MAX) { \
                c2 = (val >> 10) + NKF_INT32_C(0xD7C0);   /* high surrogate */ \
                c1 = (val & 0x3FF) + NKF_INT32_C(0xDC00); /* low surrogate */ \
                OUTPUT_UTF16_BYTES(c2 & 0xff, (c2 >> 8) & 0xff); \
                OUTPUT_UTF16_BYTES(c1 & 0xff, (c1 >> 8) & 0xff); \
            } \
        } else { \
            c2 = (val >> 8) & 0xff; \
            c1 = val & 0xff; \
            OUTPUT_UTF16_BYTES(c1, c2); \
        } \
    } while (0)
2873
2874 static void
w_oconv16(nkf_char c2,nkf_char c1)2875 w_oconv16(nkf_char c2, nkf_char c1)
2876 {
2877 if (output_bom_f) {
2878 output_bom_f = FALSE;
2879 OUTPUT_UTF16_BYTES(0xFF, 0xFE);
2880 }
2881
2882 if (c2 == EOF) {
2883 (*o_putc)(EOF);
2884 return;
2885 }
2886
2887 if (c2 == 0 && nkf_char_unicode_p(c1)) {
2888 OUTPUT_UTF16(c1);
2889 } else if (c2) {
2890 nkf_char val, val2;
2891 val = e2w_conv(c2, c1);
2892 if (!val) return;
2893 val2 = e2w_combining(val, c2, c1);
2894 if (val2)
2895 OUTPUT_UTF16(val2);
2896 OUTPUT_UTF16(val);
2897 } else {
2898 OUTPUT_UTF16_BYTES(c1, c2);
2899 }
2900 }
2901
/*
 * Write the low 24 bits of `c` as one UTF-32 unit; the most significant
 * byte is always written as 0.  output_endian selects the byte order.
 * `c` is evaluated three times.
 */
#define OUTPUT_UTF32(c) do { \
        if (output_endian != ENDIAN_LITTLE) { \
            (*o_putc)(0); \
            (*o_putc)(((c) >> 16) & 0xFF); \
            (*o_putc)(((c) >>  8) & 0xFF); \
            (*o_putc)( (c)        & 0xFF); \
        } else { \
            (*o_putc)( (c)        & 0xFF); \
            (*o_putc)(((c) >>  8) & 0xFF); \
            (*o_putc)(((c) >> 16) & 0xFF); \
            (*o_putc)(0); \
        } \
    } while (0)
2915
2916 static void
w_oconv32(nkf_char c2,nkf_char c1)2917 w_oconv32(nkf_char c2, nkf_char c1)
2918 {
2919 if (output_bom_f) {
2920 output_bom_f = FALSE;
2921 if (output_endian == ENDIAN_LITTLE){
2922 (*o_putc)(0xFF);
2923 (*o_putc)(0xFE);
2924 (*o_putc)(0);
2925 (*o_putc)(0);
2926 }else{
2927 (*o_putc)(0);
2928 (*o_putc)(0);
2929 (*o_putc)(0xFE);
2930 (*o_putc)(0xFF);
2931 }
2932 }
2933
2934 if (c2 == EOF) {
2935 (*o_putc)(EOF);
2936 return;
2937 }
2938
2939 if (c2 == ISO_8859_1) {
2940 c1 |= 0x80;
2941 } else if (c2 == 0 && nkf_char_unicode_p(c1)) {
2942 c1 &= VALUE_MASK;
2943 } else if (c2) {
2944 nkf_char val, val2;
2945 val = e2w_conv(c2, c1);
2946 if (!val) return;
2947 val2 = e2w_combining(val, c2, c1);
2948 if (val2)
2949 OUTPUT_UTF32(val2);
2950 c1 = val;
2951 }
2952 OUTPUT_UTF32(c1);
2953 }
2954 #endif
2955
/*
 * Bit flags collected in input_code.score while guessing the input
 * encoding.  Each flag occupies its own bit.
 */
#define SCORE_L2       (1 << 0)     /* Kanji Level 2 */
#define SCORE_KANA     (1 << 1)     /* Halfwidth Katakana */
#define SCORE_DEPEND   (1 << 2)     /* MD Characters */
#define SCORE_CP932    (1 << 3)     /* IBM extended characters */
#define SCORE_X0212    (1 << 4)     /* JIS X 0212 */
#define SCORE_X0213    (1 << 5)     /* JIS X 0213 */
#define SCORE_NO_EXIST (1 << 6)     /* Undefined Characters */
#define SCORE_iMIME    (1 << 7)     /* MIME selected */
#define SCORE_ERROR    (1 << 8)     /* Error */

/* score given to a candidate by status_reset() */
#define SCORE_INIT (SCORE_iMIME)
2967
2968 static const nkf_char score_table_A0[] = {
2969 0, 0, 0, 0,
2970 0, 0, 0, 0,
2971 0, SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND,
2972 SCORE_DEPEND, SCORE_DEPEND, SCORE_DEPEND, SCORE_X0213,
2973 };
2974
2975 static const nkf_char score_table_F0[] = {
2976 SCORE_L2, SCORE_L2, SCORE_L2, SCORE_L2,
2977 SCORE_L2, SCORE_DEPEND, SCORE_X0213, SCORE_X0213,
2978 SCORE_DEPEND, SCORE_DEPEND, SCORE_CP932, SCORE_CP932,
2979 SCORE_CP932, SCORE_X0213, SCORE_X0213, SCORE_ERROR,
2980 };
2981
2982 static const nkf_char score_table_8FA0[] = {
2983 0, SCORE_X0213, SCORE_X0212, SCORE_X0213,
2984 SCORE_X0213, SCORE_X0213, SCORE_X0212, SCORE_X0212,
2985 SCORE_X0213, SCORE_X0212, SCORE_X0212, SCORE_X0212,
2986 SCORE_X0213, SCORE_X0213, SCORE_X0213, SCORE_X0213,
2987 };
2988
2989 static const nkf_char score_table_8FE0[] = {
2990 SCORE_X0212, SCORE_X0212, SCORE_X0212, SCORE_X0212,
2991 SCORE_X0212, SCORE_X0212, SCORE_X0212, SCORE_X0212,
2992 SCORE_X0212, SCORE_X0212, SCORE_X0212, SCORE_X0212,
2993 SCORE_X0212, SCORE_X0212, SCORE_X0213, SCORE_X0213,
2994 };
2995
2996 static const nkf_char score_table_8FF0[] = {
2997 SCORE_X0213, SCORE_X0213, SCORE_X0213, SCORE_X0212,
2998 SCORE_X0212, SCORE_X0213, SCORE_X0213, SCORE_X0213,
2999 SCORE_X0213, SCORE_X0213, SCORE_X0213, SCORE_X0213,
3000 SCORE_X0213, SCORE_X0213, SCORE_X0213, SCORE_X0213,
3001 };
3002
3003 static void
set_code_score(struct input_code * ptr,nkf_char score)3004 set_code_score(struct input_code *ptr, nkf_char score)
3005 {
3006 if (ptr){
3007 ptr->score |= score;
3008 }
3009 }
3010
3011 static void
clr_code_score(struct input_code * ptr,nkf_char score)3012 clr_code_score(struct input_code *ptr, nkf_char score)
3013 {
3014 if (ptr){
3015 ptr->score &= ~score;
3016 }
3017 }
3018
3019 static void
code_score(struct input_code * ptr)3020 code_score(struct input_code *ptr)
3021 {
3022 nkf_char c2 = ptr->buf[0];
3023 nkf_char c1 = ptr->buf[1];
3024 if (c2 < 0){
3025 set_code_score(ptr, SCORE_ERROR);
3026 }else if (c2 == SS2){
3027 set_code_score(ptr, SCORE_KANA);
3028 }else if (c2 == 0x8f){
3029 if ((c1 & 0x70) == 0x20){
3030 set_code_score(ptr, score_table_8FA0[c1 & 0x0f]);
3031 }else if ((c1 & 0x70) == 0x60){
3032 set_code_score(ptr, score_table_8FE0[c1 & 0x0f]);
3033 }else if ((c1 & 0x70) == 0x70){
3034 set_code_score(ptr, score_table_8FF0[c1 & 0x0f]);
3035 }else{
3036 set_code_score(ptr, SCORE_X0212);
3037 }
3038 #ifdef UTF8_OUTPUT_ENABLE
3039 }else if (!e2w_conv(c2, c1)){
3040 set_code_score(ptr, SCORE_NO_EXIST);
3041 #endif
3042 }else if ((c2 & 0x70) == 0x20){
3043 set_code_score(ptr, score_table_A0[c2 & 0x0f]);
3044 }else if ((c2 & 0x70) == 0x70){
3045 set_code_score(ptr, score_table_F0[c2 & 0x0f]);
3046 }else if ((c2 & 0x70) >= 0x50){
3047 set_code_score(ptr, SCORE_L2);
3048 }
3049 }
3050
3051 static void
status_disable(struct input_code * ptr)3052 status_disable(struct input_code *ptr)
3053 {
3054 ptr->stat = -1;
3055 ptr->buf[0] = -1;
3056 code_score(ptr);
3057 if (iconv == ptr->iconv_func) set_iconv(FALSE, 0);
3058 }
3059
3060 static void
status_push_ch(struct input_code * ptr,nkf_char c)3061 status_push_ch(struct input_code *ptr, nkf_char c)
3062 {
3063 ptr->buf[ptr->index++] = c;
3064 }
3065
3066 static void
status_clear(struct input_code * ptr)3067 status_clear(struct input_code *ptr)
3068 {
3069 ptr->stat = 0;
3070 ptr->index = 0;
3071 }
3072
3073 static void
status_reset(struct input_code * ptr)3074 status_reset(struct input_code *ptr)
3075 {
3076 status_clear(ptr);
3077 ptr->score = SCORE_INIT;
3078 }
3079
3080 static void
status_reinit(struct input_code * ptr)3081 status_reinit(struct input_code *ptr)
3082 {
3083 status_reset(ptr);
3084 ptr->_file_stat = 0;
3085 }
3086
3087 static void
status_check(struct input_code * ptr,nkf_char c)3088 status_check(struct input_code *ptr, nkf_char c)
3089 {
3090 if (c <= DEL && estab_f){
3091 status_reset(ptr);
3092 }
3093 }
3094
3095 static void
s_status(struct input_code * ptr,nkf_char c)3096 s_status(struct input_code *ptr, nkf_char c)
3097 {
3098 switch(ptr->stat){
3099 case -1:
3100 status_check(ptr, c);
3101 break;
3102 case 0:
3103 if (c <= DEL){
3104 break;
3105 }else if (nkf_char_unicode_p(c)){
3106 break;
3107 }else if (0xa1 <= c && c <= 0xdf){
3108 status_push_ch(ptr, SS2);
3109 status_push_ch(ptr, c);
3110 code_score(ptr);
3111 status_clear(ptr);
3112 }else if ((0x81 <= c && c < 0xa0) || (0xe0 <= c && c <= 0xea)){
3113 ptr->stat = 1;
3114 status_push_ch(ptr, c);
3115 }else if (0xed <= c && c <= 0xee){
3116 ptr->stat = 3;
3117 status_push_ch(ptr, c);
3118 #ifdef SHIFTJIS_CP932
3119 }else if (is_ibmext_in_sjis(c)){
3120 ptr->stat = 2;
3121 status_push_ch(ptr, c);
3122 #endif /* SHIFTJIS_CP932 */
3123 #ifdef X0212_ENABLE
3124 }else if (0xf0 <= c && c <= 0xfc){
3125 ptr->stat = 1;
3126 status_push_ch(ptr, c);
3127 #endif /* X0212_ENABLE */
3128 }else{
3129 status_disable(ptr);
3130 }
3131 break;
3132 case 1:
3133 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
3134 status_push_ch(ptr, c);
3135 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
3136 code_score(ptr);
3137 status_clear(ptr);
3138 }else{
3139 status_disable(ptr);
3140 }
3141 break;
3142 case 2:
3143 #ifdef SHIFTJIS_CP932
3144 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)) {
3145 status_push_ch(ptr, c);
3146 if (s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]) == 0) {
3147 set_code_score(ptr, SCORE_CP932);
3148 status_clear(ptr);
3149 break;
3150 }
3151 }
3152 #endif /* SHIFTJIS_CP932 */
3153 status_disable(ptr);
3154 break;
3155 case 3:
3156 if ((0x40 <= c && c <= 0x7e) || (0x80 <= c && c <= 0xfc)){
3157 status_push_ch(ptr, c);
3158 s2e_conv(ptr->buf[0], ptr->buf[1], &ptr->buf[0], &ptr->buf[1]);
3159 set_code_score(ptr, SCORE_CP932);
3160 status_clear(ptr);
3161 }else{
3162 status_disable(ptr);
3163 }
3164 break;
3165 }
3166 }
3167
3168 static void
e_status(struct input_code * ptr,nkf_char c)3169 e_status(struct input_code *ptr, nkf_char c)
3170 {
3171 switch (ptr->stat){
3172 case -1:
3173 status_check(ptr, c);
3174 break;
3175 case 0:
3176 if (c <= DEL){
3177 break;
3178 }else if (nkf_char_unicode_p(c)){
3179 break;
3180 }else if (SS2 == c || (0xa1 <= c && c <= 0xfe)){
3181 ptr->stat = 1;
3182 status_push_ch(ptr, c);
3183 #ifdef X0212_ENABLE
3184 }else if (0x8f == c){
3185 ptr->stat = 2;
3186 status_push_ch(ptr, c);
3187 #endif /* X0212_ENABLE */
3188 }else{
3189 status_disable(ptr);
3190 }
3191 break;
3192 case 1:
3193 if (0xa1 <= c && c <= 0xfe){
3194 status_push_ch(ptr, c);
3195 code_score(ptr);
3196 status_clear(ptr);
3197 }else{
3198 status_disable(ptr);
3199 }
3200 break;
3201 #ifdef X0212_ENABLE
3202 case 2:
3203 if (0xa1 <= c && c <= 0xfe){
3204 ptr->stat = 1;
3205 status_push_ch(ptr, c);
3206 }else{
3207 status_disable(ptr);
3208 }
3209 #endif /* X0212_ENABLE */
3210 }
3211 }
3212
3213 #ifdef UTF8_INPUT_ENABLE
3214 static void
w_status(struct input_code * ptr,nkf_char c)3215 w_status(struct input_code *ptr, nkf_char c)
3216 {
3217 switch (ptr->stat){
3218 case -1:
3219 status_check(ptr, c);
3220 break;
3221 case 0:
3222 if (c <= DEL){
3223 break;
3224 }else if (nkf_char_unicode_p(c)){
3225 break;
3226 }else if (0xc0 <= c && c <= 0xdf){
3227 ptr->stat = 1;
3228 status_push_ch(ptr, c);
3229 }else if (0xe0 <= c && c <= 0xef){
3230 ptr->stat = 2;
3231 status_push_ch(ptr, c);
3232 }else if (0xf0 <= c && c <= 0xf4){
3233 ptr->stat = 3;
3234 status_push_ch(ptr, c);
3235 }else{
3236 status_disable(ptr);
3237 }
3238 break;
3239 case 1:
3240 case 2:
3241 if (0x80 <= c && c <= 0xbf){
3242 status_push_ch(ptr, c);
3243 if (ptr->index > ptr->stat){
3244 int bom = (ptr->buf[0] == 0xef && ptr->buf[1] == 0xbb
3245 && ptr->buf[2] == 0xbf);
3246 w2e_conv(ptr->buf[0], ptr->buf[1], ptr->buf[2],
3247 &ptr->buf[0], &ptr->buf[1]);
3248 if (!bom){
3249 code_score(ptr);
3250 }
3251 status_clear(ptr);
3252 }
3253 }else{
3254 status_disable(ptr);
3255 }
3256 break;
3257 case 3:
3258 if (0x80 <= c && c <= 0xbf){
3259 if (ptr->index < ptr->stat){
3260 status_push_ch(ptr, c);
3261 } else {
3262 status_clear(ptr);
3263 }
3264 }else{
3265 status_disable(ptr);
3266 }
3267 break;
3268 }
3269 }
3270 #endif
3271
3272 static void
code_status(nkf_char c)3273 code_status(nkf_char c)
3274 {
3275 int action_flag = 1;
3276 struct input_code *result = 0;
3277 struct input_code *p = input_code_list;
3278 while (p->name){
3279 if (!p->status_func) {
3280 ++p;
3281 continue;
3282 }
3283 if (!p->status_func)
3284 continue;
3285 (p->status_func)(p, c);
3286 if (p->stat > 0){
3287 action_flag = 0;
3288 }else if(p->stat == 0){
3289 if (result){
3290 action_flag = 0;
3291 }else{
3292 result = p;
3293 }
3294 }
3295 ++p;
3296 }
3297
3298 if (action_flag){
3299 if (result && !estab_f){
3300 set_iconv(TRUE, result->iconv_func);
3301 }else if (c <= DEL){
3302 struct input_code *ptr = input_code_list;
3303 while (ptr->name){
3304 status_reset(ptr);
3305 ++ptr;
3306 }
3307 }
3308 }
3309 }
3310
3311 typedef struct {
3312 nkf_buf_t *std_gc_buf;
3313 nkf_char broken_state;
3314 nkf_buf_t *broken_buf;
3315 nkf_char mimeout_state;
3316 nkf_buf_t *nfc_buf;
3317 } nkf_state_t;
3318
3319 static nkf_state_t *nkf_state = NULL;
3320
3321 #define STD_GC_BUFSIZE (256)
3322
3323 static void
nkf_state_init(void)3324 nkf_state_init(void)
3325 {
3326 if (nkf_state) {
3327 nkf_buf_clear(nkf_state->std_gc_buf);
3328 nkf_buf_clear(nkf_state->broken_buf);
3329 nkf_buf_clear(nkf_state->nfc_buf);
3330 }
3331 else {
3332 nkf_state = nkf_xmalloc(sizeof(nkf_state_t));
3333 nkf_state->std_gc_buf = nkf_buf_new(STD_GC_BUFSIZE);
3334 nkf_state->broken_buf = nkf_buf_new(3);
3335 nkf_state->nfc_buf = nkf_buf_new(9);
3336 }
3337 nkf_state->broken_state = 0;
3338 nkf_state->mimeout_state = 0;
3339 }
3340
3341 #ifndef WIN32DLL
3342 static nkf_char
std_getc(FILE * f)3343 std_getc(FILE *f)
3344 {
3345 if (!nkf_buf_empty_p(nkf_state->std_gc_buf)){
3346 return nkf_buf_pop(nkf_state->std_gc_buf);
3347 }
3348 return getc(f);
3349 }
3350 #endif /*WIN32DLL*/
3351
3352 static nkf_char
std_ungetc(nkf_char c,ARG_UNUSED FILE * f)3353 std_ungetc(nkf_char c, ARG_UNUSED FILE *f)
3354 {
3355 nkf_buf_push(nkf_state->std_gc_buf, c);
3356 return c;
3357 }
3358
3359 #ifndef WIN32DLL
3360 static void
std_putc(nkf_char c)3361 std_putc(nkf_char c)
3362 {
3363 if(c!=EOF)
3364 putchar(c);
3365 }
3366 #endif /*WIN32DLL*/
3367
3368 static nkf_char hold_buf[HOLD_SIZE*2];
3369 static int hold_count = 0;
3370 static nkf_char
push_hold_buf(nkf_char c2)3371 push_hold_buf(nkf_char c2)
3372 {
3373 if (hold_count >= HOLD_SIZE*2)
3374 return (EOF);
3375 hold_buf[hold_count++] = c2;
3376 return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count);
3377 }
3378
3379 static int
h_conv(FILE * f,nkf_char c1,nkf_char c2)3380 h_conv(FILE *f, nkf_char c1, nkf_char c2)
3381 {
3382 int ret;
3383 int hold_index;
3384 int fromhold_count;
3385 nkf_char c3, c4;
3386
3387 /** it must NOT be in the kanji shifte sequence */
3388 /** it must NOT be written in JIS7 */
3389 /** and it must be after 2 byte 8bit code */
3390
3391 hold_count = 0;
3392 push_hold_buf(c1);
3393 push_hold_buf(c2);
3394
3395 while ((c2 = (*i_getc)(f)) != EOF) {
3396 if (c2 == ESC){
3397 (*i_ungetc)(c2,f);
3398 break;
3399 }
3400 code_status(c2);
3401 if (push_hold_buf(c2) == EOF || estab_f) {
3402 break;
3403 }
3404 }
3405
3406 if (!estab_f) {
3407 struct input_code *p = input_code_list;
3408 struct input_code *result = p;
3409 if (c2 == EOF) {
3410 code_status(c2);
3411 }
3412 while (p->name) {
3413 if (p->status_func && p->score < result->score) {
3414 result = p;
3415 }
3416 p++;
3417 }
3418 set_iconv(TRUE, result->iconv_func);
3419 }
3420
3421
3422 /** now,
3423 ** 1) EOF is detected, or
3424 ** 2) Code is established, or
3425 ** 3) Buffer is FULL (but last word is pushed)
3426 **
3427 ** in 1) and 3) cases, we continue to use
3428 ** Kanji codes by oconv and leave estab_f unchanged.
3429 **/
3430
3431 ret = c2;
3432 hold_index = 0;
3433 while (hold_index < hold_count){
3434 c1 = hold_buf[hold_index++];
3435 if (nkf_char_unicode_p(c1)) {
3436 (*oconv)(0, c1);
3437 continue;
3438 }
3439 else if (c1 <= DEL){
3440 (*iconv)(0, c1, 0);
3441 continue;
3442 }else if (iconv == s_iconv && 0xa1 <= c1 && c1 <= 0xdf){
3443 (*iconv)(JIS_X_0201_1976_K, c1, 0);
3444 continue;
3445 }
3446 fromhold_count = 1;
3447 if (hold_index < hold_count){
3448 c2 = hold_buf[hold_index++];
3449 fromhold_count++;
3450 }else{
3451 c2 = (*i_getc)(f);
3452 if (c2 == EOF){
3453 c4 = EOF;
3454 break;
3455 }
3456 code_status(c2);
3457 }
3458 c3 = 0;
3459 switch ((*iconv)(c1, c2, 0)) { /* can be EUC/SJIS/UTF-8 */
3460 case -2:
3461 /* 4 bytes UTF-8 */
3462 if (hold_index < hold_count){
3463 c3 = hold_buf[hold_index++];
3464 } else if ((c3 = (*i_getc)(f)) == EOF) {
3465 ret = EOF;
3466 break;
3467 }
3468 code_status(c3);
3469 if (hold_index < hold_count){
3470 c4 = hold_buf[hold_index++];
3471 } else if ((c4 = (*i_getc)(f)) == EOF) {
3472 c3 = ret = EOF;
3473 break;
3474 }
3475 code_status(c4);
3476 (*iconv)(c1, c2, (c3<<8)|c4);
3477 break;
3478 case -3:
3479 /* 4 bytes UTF-8 (check combining character) */
3480 if (hold_index < hold_count){
3481 c3 = hold_buf[hold_index++];
3482 fromhold_count++;
3483 } else if ((c3 = (*i_getc)(f)) == EOF) {
3484 w_iconv_nocombine(c1, c2, 0);
3485 break;
3486 }
3487 if (hold_index < hold_count){
3488 c4 = hold_buf[hold_index++];
3489 fromhold_count++;
3490 } else if ((c4 = (*i_getc)(f)) == EOF) {
3491 w_iconv_nocombine(c1, c2, 0);
3492 if (fromhold_count <= 2)
3493 (*i_ungetc)(c3,f);
3494 else
3495 hold_index--;
3496 continue;
3497 }
3498 if (w_iconv_combine(c1, c2, 0, c3, c4, 0)) {
3499 w_iconv_nocombine(c1, c2, 0);
3500 if (fromhold_count <= 2) {
3501 (*i_ungetc)(c4,f);
3502 (*i_ungetc)(c3,f);
3503 } else if (fromhold_count == 3) {
3504 (*i_ungetc)(c4,f);
3505 hold_index--;
3506 } else {
3507 hold_index -= 2;
3508 }
3509 }
3510 break;
3511 case -1:
3512 /* 3 bytes EUC or UTF-8 */
3513 if (hold_index < hold_count){
3514 c3 = hold_buf[hold_index++];
3515 fromhold_count++;
3516 } else if ((c3 = (*i_getc)(f)) == EOF) {
3517 ret = EOF;
3518 break;
3519 } else {
3520 code_status(c3);
3521 }
3522 if ((*iconv)(c1, c2, c3) == -3) {
3523 /* 6 bytes UTF-8 (check combining character) */
3524 nkf_char c5, c6;
3525 if (hold_index < hold_count){
3526 c4 = hold_buf[hold_index++];
3527 fromhold_count++;
3528 } else if ((c4 = (*i_getc)(f)) == EOF) {
3529 w_iconv_nocombine(c1, c2, c3);
3530 continue;
3531 }
3532 if (hold_index < hold_count){
3533 c5 = hold_buf[hold_index++];
3534 fromhold_count++;
3535 } else if ((c5 = (*i_getc)(f)) == EOF) {
3536 w_iconv_nocombine(c1, c2, c3);
3537 if (fromhold_count == 4)
3538 hold_index--;
3539 else
3540 (*i_ungetc)(c4,f);
3541 continue;
3542 }
3543 if (hold_index < hold_count){
3544 c6 = hold_buf[hold_index++];
3545 fromhold_count++;
3546 } else if ((c6 = (*i_getc)(f)) == EOF) {
3547 w_iconv_nocombine(c1, c2, c3);
3548 if (fromhold_count == 5) {
3549 hold_index -= 2;
3550 } else if (fromhold_count == 4) {
3551 hold_index--;
3552 (*i_ungetc)(c5,f);
3553 } else {
3554 (*i_ungetc)(c5,f);
3555 (*i_ungetc)(c4,f);
3556 }
3557 continue;
3558 }
3559 if (w_iconv_combine(c1, c2, c3, c4, c5, c6)) {
3560 w_iconv_nocombine(c1, c2, c3);
3561 if (fromhold_count == 6) {
3562 hold_index -= 3;
3563 } else if (fromhold_count == 5) {
3564 hold_index -= 2;
3565 (*i_ungetc)(c6,f);
3566 } else if (fromhold_count == 4) {
3567 hold_index--;
3568 (*i_ungetc)(c6,f);
3569 (*i_ungetc)(c5,f);
3570 } else {
3571 (*i_ungetc)(c6,f);
3572 (*i_ungetc)(c5,f);
3573 (*i_ungetc)(c4,f);
3574 }
3575 }
3576 }
3577 break;
3578 }
3579 if (c3 == EOF) break;
3580 }
3581 return ret;
3582 }
3583
3584 /*
3585 * Check and Ignore BOM
3586 */
3587 static void
check_bom(FILE * f)3588 check_bom(FILE *f)
3589 {
3590 int c2;
3591 input_bom_f = FALSE;
3592 switch(c2 = (*i_getc)(f)){
3593 case 0x00:
3594 if((c2 = (*i_getc)(f)) == 0x00){
3595 if((c2 = (*i_getc)(f)) == 0xFE){
3596 if((c2 = (*i_getc)(f)) == 0xFF){
3597 if(!input_encoding){
3598 set_iconv(TRUE, w_iconv32);
3599 }
3600 if (iconv == w_iconv32) {
3601 input_bom_f = TRUE;
3602 input_endian = ENDIAN_BIG;
3603 return;
3604 }
3605 (*i_ungetc)(0xFF,f);
3606 }else (*i_ungetc)(c2,f);
3607 (*i_ungetc)(0xFE,f);
3608 }else if(c2 == 0xFF){
3609 if((c2 = (*i_getc)(f)) == 0xFE){
3610 if(!input_encoding){
3611 set_iconv(TRUE, w_iconv32);
3612 }
3613 if (iconv == w_iconv32) {
3614 input_endian = ENDIAN_2143;
3615 return;
3616 }
3617 (*i_ungetc)(0xFF,f);
3618 }else (*i_ungetc)(c2,f);
3619 (*i_ungetc)(0xFF,f);
3620 }else (*i_ungetc)(c2,f);
3621 (*i_ungetc)(0x00,f);
3622 }else (*i_ungetc)(c2,f);
3623 (*i_ungetc)(0x00,f);
3624 break;
3625 case 0xEF:
3626 if((c2 = (*i_getc)(f)) == 0xBB){
3627 if((c2 = (*i_getc)(f)) == 0xBF){
3628 if(!input_encoding){
3629 set_iconv(TRUE, w_iconv);
3630 }
3631 if (iconv == w_iconv) {
3632 input_bom_f = TRUE;
3633 return;
3634 }
3635 (*i_ungetc)(0xBF,f);
3636 }else (*i_ungetc)(c2,f);
3637 (*i_ungetc)(0xBB,f);
3638 }else (*i_ungetc)(c2,f);
3639 (*i_ungetc)(0xEF,f);
3640 break;
3641 case 0xFE:
3642 if((c2 = (*i_getc)(f)) == 0xFF){
3643 if((c2 = (*i_getc)(f)) == 0x00){
3644 if((c2 = (*i_getc)(f)) == 0x00){
3645 if(!input_encoding){
3646 set_iconv(TRUE, w_iconv32);
3647 }
3648 if (iconv == w_iconv32) {
3649 input_endian = ENDIAN_3412;
3650 return;
3651 }
3652 (*i_ungetc)(0x00,f);
3653 }else (*i_ungetc)(c2,f);
3654 (*i_ungetc)(0x00,f);
3655 }else (*i_ungetc)(c2,f);
3656 if(!input_encoding){
3657 set_iconv(TRUE, w_iconv16);
3658 }
3659 if (iconv == w_iconv16) {
3660 input_endian = ENDIAN_BIG;
3661 input_bom_f = TRUE;
3662 return;
3663 }
3664 (*i_ungetc)(0xFF,f);
3665 }else (*i_ungetc)(c2,f);
3666 (*i_ungetc)(0xFE,f);
3667 break;
3668 case 0xFF:
3669 if((c2 = (*i_getc)(f)) == 0xFE){
3670 if((c2 = (*i_getc)(f)) == 0x00){
3671 if((c2 = (*i_getc)(f)) == 0x00){
3672 if(!input_encoding){
3673 set_iconv(TRUE, w_iconv32);
3674 }
3675 if (iconv == w_iconv32) {
3676 input_endian = ENDIAN_LITTLE;
3677 input_bom_f = TRUE;
3678 return;
3679 }
3680 (*i_ungetc)(0x00,f);
3681 }else (*i_ungetc)(c2,f);
3682 (*i_ungetc)(0x00,f);
3683 }else (*i_ungetc)(c2,f);
3684 if(!input_encoding){
3685 set_iconv(TRUE, w_iconv16);
3686 }
3687 if (iconv == w_iconv16) {
3688 input_endian = ENDIAN_LITTLE;
3689 input_bom_f = TRUE;
3690 return;
3691 }
3692 (*i_ungetc)(0xFE,f);
3693 }else (*i_ungetc)(c2,f);
3694 (*i_ungetc)(0xFF,f);
3695 break;
3696 default:
3697 (*i_ungetc)(c2,f);
3698 break;
3699 }
3700 }
3701
3702 static nkf_char
broken_getc(FILE * f)3703 broken_getc(FILE *f)
3704 {
3705 nkf_char c, c1;
3706
3707 if (!nkf_buf_empty_p(nkf_state->broken_buf)) {
3708 return nkf_buf_pop(nkf_state->broken_buf);
3709 }
3710 c = (*i_bgetc)(f);
3711 if (c=='$' && nkf_state->broken_state != ESC
3712 && (input_mode == ASCII || input_mode == JIS_X_0201_1976_K)) {
3713 c1= (*i_bgetc)(f);
3714 nkf_state->broken_state = 0;
3715 if (c1=='@'|| c1=='B') {
3716 nkf_buf_push(nkf_state->broken_buf, c1);
3717 nkf_buf_push(nkf_state->broken_buf, c);
3718 return ESC;
3719 } else {
3720 (*i_bungetc)(c1,f);
3721 return c;
3722 }
3723 } else if (c=='(' && nkf_state->broken_state != ESC
3724 && (input_mode == JIS_X_0208 || input_mode == JIS_X_0201_1976_K)) {
3725 c1= (*i_bgetc)(f);
3726 nkf_state->broken_state = 0;
3727 if (c1=='J'|| c1=='B') {
3728 nkf_buf_push(nkf_state->broken_buf, c1);
3729 nkf_buf_push(nkf_state->broken_buf, c);
3730 return ESC;
3731 } else {
3732 (*i_bungetc)(c1,f);
3733 return c;
3734 }
3735 } else {
3736 nkf_state->broken_state = c;
3737 return c;
3738 }
3739 }
3740
3741 static nkf_char
broken_ungetc(nkf_char c,ARG_UNUSED FILE * f)3742 broken_ungetc(nkf_char c, ARG_UNUSED FILE *f)
3743 {
3744 if (nkf_buf_length(nkf_state->broken_buf) < 2)
3745 nkf_buf_push(nkf_state->broken_buf, c);
3746 return c;
3747 }
3748
3749 static void
eol_conv(nkf_char c2,nkf_char c1)3750 eol_conv(nkf_char c2, nkf_char c1)
3751 {
3752 if (guess_f && input_eol != EOF) {
3753 if (c2 == 0 && c1 == LF) {
3754 if (!input_eol) input_eol = prev_cr ? CRLF : LF;
3755 else if (input_eol != (prev_cr ? CRLF : LF)) input_eol = EOF;
3756 } else if (c2 == 0 && c1 == CR && input_eol == LF) input_eol = EOF;
3757 else if (!prev_cr);
3758 else if (!input_eol) input_eol = CR;
3759 else if (input_eol != CR) input_eol = EOF;
3760 }
3761 if (prev_cr || (c2 == 0 && c1 == LF)) {
3762 prev_cr = 0;
3763 if (eolmode_f != LF) (*o_eol_conv)(0, CR);
3764 if (eolmode_f != CR) (*o_eol_conv)(0, LF);
3765 }
3766 if (c2 == 0 && c1 == CR) prev_cr = CR;
3767 else if (c2 != 0 || c1 != LF) (*o_eol_conv)(c2, c1);
3768 }
3769
3770 static void
put_newline(void (* func)(nkf_char))3771 put_newline(void (*func)(nkf_char))
3772 {
3773 switch (eolmode_f ? eolmode_f : DEFAULT_NEWLINE) {
3774 case CRLF:
3775 (*func)(0x0D);
3776 (*func)(0x0A);
3777 break;
3778 case CR:
3779 (*func)(0x0D);
3780 break;
3781 case LF:
3782 (*func)(0x0A);
3783 break;
3784 }
3785 }
3786
3787 static void
oconv_newline(void (* func)(nkf_char,nkf_char))3788 oconv_newline(void (*func)(nkf_char, nkf_char))
3789 {
3790 switch (eolmode_f ? eolmode_f : DEFAULT_NEWLINE) {
3791 case CRLF:
3792 (*func)(0, 0x0D);
3793 (*func)(0, 0x0A);
3794 break;
3795 case CR:
3796 (*func)(0, 0x0D);
3797 break;
3798 case LF:
3799 (*func)(0, 0x0A);
3800 break;
3801 }
3802 }
3803
3804 /*
3805 Return value of fold_conv()
3806
3807 LF add newline and output char
3808 CR add newline and output nothing
3809 SP space
3810 0 skip
3811 1 (or else) normal output
3812
3813 fold state in prev (previous character)
3814
3815 >0x80 Japanese (X0208/X0201)
3816 <0x80 ASCII
3817 LF new line
3818 SP space
3819
3820 This fold algorthm does not preserve heading space in a line.
3821 This is the main difference from fmt.
3822 */
3823
3824 #define char_size(c2,c1) (c2?2:1)
3825
3826 static void
fold_conv(nkf_char c2,nkf_char c1)3827 fold_conv(nkf_char c2, nkf_char c1)
3828 {
3829 nkf_char prev0;
3830 nkf_char fold_state;
3831
3832 if (c1== CR && !fold_preserve_f) {
3833 fold_state=0; /* ignore cr */
3834 }else if (c1== LF&&f_prev==CR && fold_preserve_f) {
3835 f_prev = LF;
3836 fold_state=0; /* ignore cr */
3837 } else if (c1== BS) {
3838 if (f_line>0) f_line--;
3839 fold_state = 1;
3840 } else if (c2==EOF && f_line != 0) { /* close open last line */
3841 fold_state = LF;
3842 } else if ((c1==LF && !fold_preserve_f)
3843 || ((c1==CR||(c1==LF&&f_prev!=CR))
3844 && fold_preserve_f)) {
3845 /* new line */
3846 if (fold_preserve_f) {
3847 f_prev = c1;
3848 f_line = 0;
3849 fold_state = CR;
3850 } else if ((f_prev == c1)
3851 || (f_prev == LF)
3852 ) { /* duplicate newline */
3853 if (f_line) {
3854 f_line = 0;
3855 fold_state = LF; /* output two newline */
3856 } else {
3857 f_line = 0;
3858 fold_state = 1;
3859 }
3860 } else {
3861 if (f_prev&0x80) { /* Japanese? */
3862 f_prev = c1;
3863 fold_state = 0; /* ignore given single newline */
3864 } else if (f_prev==SP) {
3865 fold_state = 0;
3866 } else {
3867 f_prev = c1;
3868 if (++f_line<=fold_len)
3869 fold_state = SP;
3870 else {
3871 f_line = 0;
3872 fold_state = CR; /* fold and output nothing */
3873 }
3874 }
3875 }
3876 } else if (c1=='\f') {
3877 f_prev = LF;
3878 f_line = 0;
3879 fold_state = LF; /* output newline and clear */
3880 } else if ((c2==0 && nkf_isblank(c1)) || (c2 == '!' && c1 == '!')) {
3881 /* X0208 kankaku or ascii space */
3882 if (f_prev == SP) {
3883 fold_state = 0; /* remove duplicate spaces */
3884 } else {
3885 f_prev = SP;
3886 if (++f_line<=fold_len)
3887 fold_state = SP; /* output ASCII space only */
3888 else {
3889 f_prev = SP; f_line = 0;
3890 fold_state = CR; /* fold and output nothing */
3891 }
3892 }
3893 } else {
3894 prev0 = f_prev; /* we still need this one... , but almost done */
3895 f_prev = c1;
3896 if (c2 || c2 == JIS_X_0201_1976_K)
3897 f_prev |= 0x80; /* this is Japanese */
3898 f_line += c2 == JIS_X_0201_1976_K ? 1: char_size(c2,c1);
3899 if (f_line<=fold_len) { /* normal case */
3900 fold_state = 1;
3901 } else {
3902 if (f_line>fold_len+fold_margin) { /* too many kinsoku suspension */
3903 f_line = char_size(c2,c1);
3904 fold_state = LF; /* We can't wait, do fold now */
3905 } else if (c2 == JIS_X_0201_1976_K) {
3906 /* simple kinsoku rules return 1 means no folding */
3907 if (c1==(0xde&0x7f)) fold_state = 1; /* $B!+(B*/
3908 else if (c1==(0xdf&0x7f)) fold_state = 1; /* $B!,(B*/
3909 else if (c1==(0xa4&0x7f)) fold_state = 1; /* $B!#(B*/
3910 else if (c1==(0xa3&0x7f)) fold_state = 1; /* $B!$(B*/
3911 else if (c1==(0xa1&0x7f)) fold_state = 1; /* $B!W(B*/
3912 else if (c1==(0xb0&0x7f)) fold_state = 1; /* - */
3913 else if (SP<=c1 && c1<=(0xdf&0x7f)) { /* X0201 */
3914 f_line = 1;
3915 fold_state = LF;/* add one new f_line before this character */
3916 } else {
3917 f_line = 1;
3918 fold_state = LF;/* add one new f_line before this character */
3919 }
3920 } else if (c2==0) {
3921 /* kinsoku point in ASCII */
3922 if ( c1==')'|| /* { [ ( */
3923 c1==']'||
3924 c1=='}'||
3925 c1=='.'||
3926 c1==','||
3927 c1=='!'||
3928 c1=='?'||
3929 c1=='/'||
3930 c1==':'||
3931 c1==';') {
3932 fold_state = 1;
3933 /* just after special */
3934 } else if (!is_alnum(prev0)) {
3935 f_line = char_size(c2,c1);
3936 fold_state = LF;
3937 } else if ((prev0==SP) || /* ignored new f_line */
3938 (prev0==LF)|| /* ignored new f_line */
3939 (prev0&0x80)) { /* X0208 - ASCII */
3940 f_line = char_size(c2,c1);
3941 fold_state = LF;/* add one new f_line before this character */
3942 } else {
3943 fold_state = 1; /* default no fold in ASCII */
3944 }
3945 } else {
3946 if (c2=='!') {
3947 if (c1=='"') fold_state = 1; /* $B!"(B */
3948 else if (c1=='#') fold_state = 1; /* $B!#(B */
3949 else if (c1=='W') fold_state = 1; /* $B!W(B */
3950 else if (c1=='K') fold_state = 1; /* $B!K(B */
3951 else if (c1=='$') fold_state = 1; /* $B!$(B */
3952 else if (c1=='%') fold_state = 1; /* $B!%(B */
3953 else if (c1=='\'') fold_state = 1; /* $B!\(B */
3954 else if (c1=='(') fold_state = 1; /* $B!((B */
3955 else if (c1==')') fold_state = 1; /* $B!)(B */
3956 else if (c1=='*') fold_state = 1; /* $B!*(B */
3957 else if (c1=='+') fold_state = 1; /* $B!+(B */
3958 else if (c1==',') fold_state = 1; /* $B!,(B */
3959 /* default no fold in kinsoku */
3960 else {
3961 fold_state = LF;
3962 f_line = char_size(c2,c1);
3963 /* add one new f_line before this character */
3964 }
3965 } else {
3966 f_line = char_size(c2,c1);
3967 fold_state = LF;
3968 /* add one new f_line before this character */
3969 }
3970 }
3971 }
3972 }
3973 /* terminator process */
3974 switch(fold_state) {
3975 case LF:
3976 oconv_newline(o_fconv);
3977 (*o_fconv)(c2,c1);
3978 break;
3979 case 0:
3980 return;
3981 case CR:
3982 oconv_newline(o_fconv);
3983 break;
3984 case TAB:
3985 case SP:
3986 (*o_fconv)(0,SP);
3987 break;
3988 default:
3989 (*o_fconv)(c2,c1);
3990 }
3991 }
3992
3993 static nkf_char z_prev2=0,z_prev1=0;
3994
3995 static void
z_conv(nkf_char c2,nkf_char c1)3996 z_conv(nkf_char c2, nkf_char c1)
3997 {
3998
3999 /* if (c2) c1 &= 0x7f; assertion */
4000
4001 if (c2 == JIS_X_0201_1976_K && (c1 == 0x20 || c1 == 0x7D || c1 == 0x7E)) {
4002 (*o_zconv)(c2,c1);
4003 return;
4004 }
4005
4006 if (x0201_f) {
4007 if (z_prev2 == JIS_X_0201_1976_K) {
4008 if (c2 == JIS_X_0201_1976_K) {
4009 if (c1 == (0xde&0x7f)) { /* $BByE@(B */
4010 z_prev2 = 0;
4011 (*o_zconv)(dv[(z_prev1-SP)*2], dv[(z_prev1-SP)*2+1]);
4012 return;
4013 } else if (c1 == (0xdf&0x7f) && ev[(z_prev1-SP)*2]) { /* $BH>ByE@(B */
4014 z_prev2 = 0;
4015 (*o_zconv)(ev[(z_prev1-SP)*2], ev[(z_prev1-SP)*2+1]);
4016 return;
4017 } else if (x0213_f && c1 == (0xdf&0x7f) && ev_x0213[(z_prev1-SP)*2]) { /* $BH>ByE@(B */
4018 z_prev2 = 0;
4019 (*o_zconv)(ev_x0213[(z_prev1-SP)*2], ev_x0213[(z_prev1-SP)*2+1]);
4020 return;
4021 }
4022 }
4023 z_prev2 = 0;
4024 (*o_zconv)(cv[(z_prev1-SP)*2], cv[(z_prev1-SP)*2+1]);
4025 }
4026 if (c2 == JIS_X_0201_1976_K) {
4027 if (dv[(c1-SP)*2] || ev[(c1-SP)*2] || (x0213_f && ev_x0213[(c1-SP)*2])) {
4028 /* wait for $BByE@(B or $BH>ByE@(B */
4029 z_prev1 = c1;
4030 z_prev2 = c2;
4031 return;
4032 } else {
4033 (*o_zconv)(cv[(c1-SP)*2], cv[(c1-SP)*2+1]);
4034 return;
4035 }
4036 }
4037 }
4038
4039 if (c2 == EOF) {
4040 (*o_zconv)(c2, c1);
4041 return;
4042 }
4043
4044 if (alpha_f&1 && c2 == 0x23) {
4045 /* JISX0208 Alphabet */
4046 c2 = 0;
4047 } else if (c2 == 0x21) {
4048 /* JISX0208 Kigou */
4049 if (0x21==c1) {
4050 if (alpha_f&2) {
4051 c2 = 0;
4052 c1 = SP;
4053 } else if (alpha_f&4) {
4054 (*o_zconv)(0, SP);
4055 (*o_zconv)(0, SP);
4056 return;
4057 }
4058 } else if (alpha_f&1 && 0x20<c1 && c1<0x7f && fv[c1-0x20]) {
4059 c2 = 0;
4060 c1 = fv[c1-0x20];
4061 }
4062 }
4063
4064 if (alpha_f&8 && c2 == 0) {
4065 /* HTML Entity */
4066 const char *entity = 0;
4067 switch (c1){
4068 case '>': entity = ">"; break;
4069 case '<': entity = "<"; break;
4070 case '\"': entity = """; break;
4071 case '&': entity = "&"; break;
4072 }
4073 if (entity){
4074 while (*entity) (*o_zconv)(0, *entity++);
4075 return;
4076 }
4077 }
4078
4079 if (alpha_f & 16) {
4080 /* JIS X 0208 Katakana to JIS X 0201 Katakana */
4081 if (c2 == 0x21) {
4082 nkf_char c = 0;
4083 switch (c1) {
4084 case 0x23:
4085 /* U+3002 (0x8142) Ideographic Full Stop -> U+FF61 (0xA1) Halfwidth Ideographic Full Stop */
4086 c = 0xA1;
4087 break;
4088 case 0x56:
4089 /* U+300C (0x8175) Left Corner Bracket -> U+FF62 (0xA2) Halfwidth Left Corner Bracket */
4090 c = 0xA2;
4091 break;
4092 case 0x57:
4093 /* U+300D (0x8176) Right Corner Bracket -> U+FF63 (0xA3) Halfwidth Right Corner Bracket */
4094 c = 0xA3;
4095 break;
4096 case 0x22:
4097 /* U+3001 (0x8141) Ideographic Comma -> U+FF64 (0xA4) Halfwidth Ideographic Comma */
4098 c = 0xA4;
4099 break;
4100 case 0x26:
4101 /* U+30FB (0x8145) Katakana Middle Dot -> U+FF65 (0xA5) Halfwidth Katakana Middle Dot */
4102 c = 0xA5;
4103 break;
4104 case 0x3C:
4105 /* U+30FC (0x815B) Katakana-Hiragana Prolonged Sound Mark -> U+FF70 (0xB0) Halfwidth Katakana-Hiragana Prolonged Sound Mark */
4106 c = 0xB0;
4107 break;
4108 case 0x2B:
4109 /* U+309B (0x814A) Katakana-Hiragana Voiced Sound Mark -> U+FF9E (0xDE) Halfwidth Katakana Voiced Sound Mark */
4110 c = 0xDE;
4111 break;
4112 case 0x2C:
4113 /* U+309C (0x814B) Katakana-Hiragana Semi-Voiced Sound Mark -> U+FF9F (0xDF) Halfwidth Katakana Semi-Voiced Sound Mark */
4114 c = 0xDF;
4115 break;
4116 }
4117 if (c) {
4118 (*o_zconv)(JIS_X_0201_1976_K, c);
4119 return;
4120 }
4121 } else if (c2 == 0x25) {
4122 /* JISX0208 Katakana */
4123 static const int fullwidth_to_halfwidth[] =
4124 {
4125 0x0000, 0x2700, 0x3100, 0x2800, 0x3200, 0x2900, 0x3300, 0x2A00,
4126 0x3400, 0x2B00, 0x3500, 0x3600, 0x365E, 0x3700, 0x375E, 0x3800,
4127 0x385E, 0x3900, 0x395E, 0x3A00, 0x3A5E, 0x3B00, 0x3B5E, 0x3C00,
4128 0x3C5E, 0x3D00, 0x3D5E, 0x3E00, 0x3E5E, 0x3F00, 0x3F5E, 0x4000,
4129 0x405E, 0x4100, 0x415E, 0x2F00, 0x4200, 0x425E, 0x4300, 0x435E,
4130 0x4400, 0x445E, 0x4500, 0x4600, 0x4700, 0x4800, 0x4900, 0x4A00,
4131 0x4A5E, 0x4A5F, 0x4B00, 0x4B5E, 0x4B5F, 0x4C00, 0x4C5E, 0x4C5F,
4132 0x4D00, 0x4D5E, 0x4D5F, 0x4E00, 0x4E5E, 0x4E5F, 0x4F00, 0x5000,
4133 0x5100, 0x5200, 0x5300, 0x2C00, 0x5400, 0x2D00, 0x5500, 0x2E00,
4134 0x5600, 0x5700, 0x5800, 0x5900, 0x5A00, 0x5B00, 0x0000, 0x5C00,
4135 0x0000, 0x0000, 0x2600, 0x5D00, 0x335E, 0x0000, 0x0000, 0x365F,
4136 0x375F, 0x385F, 0x395F, 0x3A5F, 0x3E5F, 0x425F, 0x445F, 0x0000
4137 };
4138 if (fullwidth_to_halfwidth[c1-0x20]){
4139 c2 = fullwidth_to_halfwidth[c1-0x20];
4140 (*o_zconv)(JIS_X_0201_1976_K, c2>>8);
4141 if (c2 & 0xFF) {
4142 (*o_zconv)(JIS_X_0201_1976_K, c2&0xFF);
4143 }
4144 return;
4145 }
4146 } else if (c2 == 0 && nkf_char_unicode_p(c1) &&
4147 ((c1&VALUE_MASK) == 0x3099 || (c1&VALUE_MASK) == 0x309A)) { /* $B9g@.MQByE@!&H>ByE@(B */
4148 (*o_zconv)(JIS_X_0201_1976_K, 0x5E + (c1&VALUE_MASK) - 0x3099);
4149 return;
4150 }
4151 }
4152 (*o_zconv)(c2,c1);
4153 }
4154
4155
/*
 * ROT13 for ASCII letters: A-M/a-m move forward by 13, N-Z/n-z move back by
 * 13, everything else is returned unchanged.
 *
 * The argument is fully parenthesized so that an expression such as
 * rot13(x | y) is treated as one value.  It is still evaluated more than
 * once, so do not pass an expression with side effects.
 */
#define rot13(c)  ( \
                   ((c) <  'A') ? (c) : \
                   ((c) <= 'M') ? ((c) + 13) : \
                   ((c) <= 'Z') ? ((c) - 13) : \
                   ((c) <  'a') ? (c) : \
                   ((c) <= 'm') ? ((c) + 13) : \
                   ((c) <= 'z') ? ((c) - 13) : \
                   (c) \
                  )

/*
 * ROT47 for the printable range '!'..'~': '!'..'O' move forward by 47,
 * 'P'..'~' move back by 47, everything else is returned unchanged.
 * Same argument rules as rot13().
 */
#define rot47(c)  ( \
                   ((c) <  '!') ? (c) : \
                   ((c) <= 'O') ? ((c) + 47) : \
                   ((c) <= '~') ? ((c) - 47) : \
                   (c) \
                  )
4172
4173 static void
rot_conv(nkf_char c2,nkf_char c1)4174 rot_conv(nkf_char c2, nkf_char c1)
4175 {
4176 if (c2 == 0 || c2 == JIS_X_0201_1976_K || c2 == ISO_8859_1) {
4177 c1 = rot13(c1);
4178 } else if (c2) {
4179 c1 = rot47(c1);
4180 c2 = rot47(c2);
4181 }
4182 (*o_rot_conv)(c2,c1);
4183 }
4184
4185 static void
hira_conv(nkf_char c2,nkf_char c1)4186 hira_conv(nkf_char c2, nkf_char c1)
4187 {
4188 if (hira_f & 1) {
4189 if (c2 == 0x25) {
4190 if (0x20 < c1 && c1 < 0x74) {
4191 c2 = 0x24;
4192 (*o_hira_conv)(c2,c1);
4193 return;
4194 } else if (c1 == 0x74 && nkf_enc_unicode_p(output_encoding)) {
4195 c2 = 0;
4196 c1 = nkf_char_unicode_new(0x3094);
4197 (*o_hira_conv)(c2,c1);
4198 return;
4199 }
4200 } else if (c2 == 0x21 && (c1 == 0x33 || c1 == 0x34)) {
4201 c1 += 2;
4202 (*o_hira_conv)(c2,c1);
4203 return;
4204 }
4205 }
4206 if (hira_f & 2) {
4207 if (c2 == 0 && c1 == nkf_char_unicode_new(0x3094)) {
4208 c2 = 0x25;
4209 c1 = 0x74;
4210 } else if (c2 == 0x24 && 0x20 < c1 && c1 < 0x74) {
4211 c2 = 0x25;
4212 } else if (c2 == 0x21 && (c1 == 0x35 || c1 == 0x36)) {
4213 c1 -= 2;
4214 }
4215 }
4216 (*o_hira_conv)(c2,c1);
4217 }
4218
4219
4220 static void
iso2022jp_check_conv(nkf_char c2,nkf_char c1)4221 iso2022jp_check_conv(nkf_char c2, nkf_char c1)
4222 {
4223 #define RANGE_NUM_MAX 18
4224 static const nkf_char range[RANGE_NUM_MAX][2] = {
4225 {0x222f, 0x2239,},
4226 {0x2242, 0x2249,},
4227 {0x2251, 0x225b,},
4228 {0x226b, 0x2271,},
4229 {0x227a, 0x227d,},
4230 {0x2321, 0x232f,},
4231 {0x233a, 0x2340,},
4232 {0x235b, 0x2360,},
4233 {0x237b, 0x237e,},
4234 {0x2474, 0x247e,},
4235 {0x2577, 0x257e,},
4236 {0x2639, 0x2640,},
4237 {0x2659, 0x267e,},
4238 {0x2742, 0x2750,},
4239 {0x2772, 0x277e,},
4240 {0x2841, 0x287e,},
4241 {0x4f54, 0x4f7e,},
4242 {0x7425, 0x747e},
4243 };
4244 nkf_char i;
4245 nkf_char start, end, c;
4246
4247 if(c2 >= 0x00 && c2 <= 0x20 && c1 >= 0x7f && c1 <= 0xff) {
4248 c2 = GETA1;
4249 c1 = GETA2;
4250 }
4251 if((c2 >= 0x29 && c2 <= 0x2f) || (c2 >= 0x75 && c2 <= 0x7e)) {
4252 c2 = GETA1;
4253 c1 = GETA2;
4254 }
4255
4256 for (i = 0; i < RANGE_NUM_MAX; i++) {
4257 start = range[i][0];
4258 end = range[i][1];
4259 c = (c2 << 8) + c1;
4260 if (c >= start && c <= end) {
4261 c2 = GETA1;
4262 c1 = GETA2;
4263 }
4264 }
4265 (*o_iso2022jp_check_conv)(c2,c1);
4266 }
4267
4268
4269 /* This converts =?ISO-2022-JP?B?HOGE HOGE?= */
4270
4271 static const unsigned char *mime_pattern[] = {
4272 (const unsigned char *)"\075?EUC-JP?B?",
4273 (const unsigned char *)"\075?SHIFT_JIS?B?",
4274 (const unsigned char *)"\075?ISO-8859-1?Q?",
4275 (const unsigned char *)"\075?ISO-8859-1?B?",
4276 (const unsigned char *)"\075?ISO-2022-JP?B?",
4277 (const unsigned char *)"\075?ISO-2022-JP?B?",
4278 (const unsigned char *)"\075?ISO-2022-JP?Q?",
4279 #if defined(UTF8_INPUT_ENABLE)
4280 (const unsigned char *)"\075?UTF-8?B?",
4281 (const unsigned char *)"\075?UTF-8?Q?",
4282 #endif
4283 (const unsigned char *)"\075?US-ASCII?Q?",
4284 NULL
4285 };
4286
4287
4288 /* $B3:Ev$9$k%3!<%I$NM%@hEY$r>e$2$k$?$a$NL\0u(B */
4289 nkf_char (*mime_priority_func[])(nkf_char c2, nkf_char c1, nkf_char c0) = {
4290 e_iconv, s_iconv, 0, 0, 0, 0, 0,
4291 #if defined(UTF8_INPUT_ENABLE)
4292 w_iconv, w_iconv,
4293 #endif
4294 0,
4295 };
4296
4297 static const nkf_char mime_encode[] = {
4298 EUC_JP, SHIFT_JIS, ISO_8859_1, ISO_8859_1, JIS_X_0208, JIS_X_0201_1976_K, JIS_X_0201_1976_K,
4299 #if defined(UTF8_INPUT_ENABLE)
4300 UTF_8, UTF_8,
4301 #endif
4302 ASCII,
4303 0
4304 };
4305
4306 static const nkf_char mime_encode_method[] = {
4307 'B', 'B','Q', 'B', 'B', 'B', 'Q',
4308 #if defined(UTF8_INPUT_ENABLE)
4309 'B', 'Q',
4310 #endif
4311 'Q',
4312 0
4313 };
4314
4315
4316 /* MIME preprocessor fifo */
4317
4318 #define MIME_BUF_SIZE (1024) /* 2^n ring buffer */
4319 #define MIME_BUF_MASK (MIME_BUF_SIZE-1)
4320 #define mime_input_buf(n) mime_input_state.buf[(n)&MIME_BUF_MASK]
4321 static struct {
4322 unsigned char buf[MIME_BUF_SIZE];
4323 unsigned int top;
4324 unsigned int last; /* decoded */
4325 unsigned int input; /* undecoded */
4326 } mime_input_state;
4327 static nkf_char (*mime_iconv_back)(nkf_char c2,nkf_char c1,nkf_char c0) = NULL;
4328
4329 #define MAXRECOVER 20
4330
4331 static void
mime_input_buf_unshift(nkf_char c)4332 mime_input_buf_unshift(nkf_char c)
4333 {
4334 mime_input_buf(--mime_input_state.top) = (unsigned char)c;
4335 }
4336
4337 static nkf_char
mime_ungetc(nkf_char c,ARG_UNUSED FILE * f)4338 mime_ungetc(nkf_char c, ARG_UNUSED FILE *f)
4339 {
4340 mime_input_buf_unshift(c);
4341 return c;
4342 }
4343
4344 static nkf_char
mime_ungetc_buf(nkf_char c,FILE * f)4345 mime_ungetc_buf(nkf_char c, FILE *f)
4346 {
4347 if (mimebuf_f)
4348 (*i_mungetc_buf)(c,f);
4349 else
4350 mime_input_buf(--mime_input_state.input) = (unsigned char)c;
4351 return c;
4352 }
4353
4354 static nkf_char
mime_getc_buf(FILE * f)4355 mime_getc_buf(FILE *f)
4356 {
4357 /* we don't keep eof of mime_input_buf, because it contains ?= as
4358 a terminator. It was checked in mime_integrity. */
4359 return ((mimebuf_f)?
4360 (*i_mgetc_buf)(f):mime_input_buf(mime_input_state.input++));
4361 }
4362
4363 static void
switch_mime_getc(void)4364 switch_mime_getc(void)
4365 {
4366 if (i_getc!=mime_getc) {
4367 i_mgetc = i_getc; i_getc = mime_getc;
4368 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
4369 if(mime_f==STRICT_MIME) {
4370 i_mgetc_buf = i_mgetc; i_mgetc = mime_getc_buf;
4371 i_mungetc_buf = i_mungetc; i_mungetc = mime_ungetc_buf;
4372 }
4373 }
4374 }
4375
4376 static void
unswitch_mime_getc(void)4377 unswitch_mime_getc(void)
4378 {
4379 if(mime_f==STRICT_MIME) {
4380 i_mgetc = i_mgetc_buf;
4381 i_mungetc = i_mungetc_buf;
4382 }
4383 i_getc = i_mgetc;
4384 i_ungetc = i_mungetc;
4385 if(mime_iconv_back)set_iconv(FALSE, mime_iconv_back);
4386 mime_iconv_back = NULL;
4387 }
4388
4389 static nkf_char
mime_integrity(FILE * f,const unsigned char * p)4390 mime_integrity(FILE *f, const unsigned char *p)
4391 {
4392 nkf_char c,d;
4393 unsigned int q;
4394 /* In buffered mode, read until =? or NL or buffer full
4395 */
4396 mime_input_state.input = mime_input_state.top;
4397 mime_input_state.last = mime_input_state.top;
4398
4399 while(*p) mime_input_buf(mime_input_state.input++) = *p++;
4400 d = 0;
4401 q = mime_input_state.input;
4402 while((c=(*i_getc)(f))!=EOF) {
4403 if (((mime_input_state.input-mime_input_state.top)&MIME_BUF_MASK)==0) {
4404 break; /* buffer full */
4405 }
4406 if (c=='=' && d=='?') {
4407 /* checked. skip header, start decode */
4408 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
4409 /* mime_last_input = mime_input_state.input; */
4410 mime_input_state.input = q;
4411 switch_mime_getc();
4412 return 1;
4413 }
4414 if (!( (c=='+'||c=='/'|| c=='=' || c=='?' || is_alnum(c))))
4415 break;
4416 /* Should we check length mod 4? */
4417 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
4418 d=c;
4419 }
4420 /* In case of Incomplete MIME, no MIME decode */
4421 mime_input_buf(mime_input_state.input++) = (unsigned char)c;
4422 mime_input_state.last = mime_input_state.input; /* point undecoded buffer */
4423 mime_decode_mode = 1; /* no decode on mime_input_buf last in mime_getc */
4424 switch_mime_getc(); /* anyway we need buffered getc */
4425 return 1;
4426 }
4427
4428 static nkf_char
mime_begin_strict(FILE * f)4429 mime_begin_strict(FILE *f)
4430 {
4431 nkf_char c1 = 0;
4432 int i,j,k;
4433 const unsigned char *p,*q;
4434 nkf_char r[MAXRECOVER]; /* recovery buffer, max mime pattern length */
4435
4436 mime_decode_mode = FALSE;
4437 /* =? has been checked */
4438 j = 0;
4439 p = mime_pattern[j];
4440 r[0]='='; r[1]='?';
4441
4442 for(i=2;p[i]>SP;i++) { /* start at =? */
4443 if (((r[i] = c1 = (*i_getc)(f))==EOF) || nkf_toupper(c1) != p[i]) {
4444 /* pattern fails, try next one */
4445 q = p;
4446 while (mime_pattern[++j]) {
4447 p = mime_pattern[j];
4448 for(k=2;k<i;k++) /* assume length(p) > i */
4449 if (p[k]!=q[k]) break;
4450 if (k==i && nkf_toupper(c1)==p[k]) break;
4451 }
4452 p = mime_pattern[j];
4453 if (p) continue; /* found next one, continue */
4454 /* all fails, output from recovery buffer */
4455 (*i_ungetc)(c1,f);
4456 for(j=0;j<i;j++) {
4457 (*oconv)(0,r[j]);
4458 }
4459 return c1;
4460 }
4461 }
4462 mime_decode_mode = p[i-2];
4463
4464 mime_iconv_back = iconv;
4465 set_iconv(FALSE, mime_priority_func[j]);
4466 clr_code_score(find_inputcode_byfunc(mime_priority_func[j]), SCORE_iMIME);
4467
4468 if (mime_decode_mode=='B') {
4469 mimebuf_f = unbuf_f;
4470 if (!unbuf_f) {
4471 /* do MIME integrity check */
4472 return mime_integrity(f,mime_pattern[j]);
4473 }
4474 }
4475 switch_mime_getc();
4476 mimebuf_f = TRUE;
4477 return c1;
4478 }
4479
4480 static nkf_char
mime_begin(FILE * f)4481 mime_begin(FILE *f)
4482 {
4483 nkf_char c1 = 0;
4484 int i,k;
4485
4486 /* In NONSTRICT mode, only =? is checked. In case of failure, we */
4487 /* re-read and convert again from mime_buffer. */
4488
4489 /* =? has been checked */
4490 k = mime_input_state.last;
4491 mime_input_buf(mime_input_state.last++)='='; mime_input_buf(mime_input_state.last++)='?';
4492 for(i=2;i<MAXRECOVER;i++) { /* start at =? */
4493 /* We accept any character type even if it is breaked by new lines */
4494 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4495 if (c1==LF||c1==SP||c1==CR||
4496 c1=='-'||c1=='_'||is_alnum(c1)) continue;
4497 if (c1=='=') {
4498 /* Failed. But this could be another MIME preemble */
4499 (*i_ungetc)(c1,f);
4500 mime_input_state.last--;
4501 break;
4502 }
4503 if (c1!='?') break;
4504 else {
4505 /* c1=='?' */
4506 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4507 if (!(++i<MAXRECOVER) || c1==EOF) break;
4508 if (c1=='b'||c1=='B') {
4509 mime_decode_mode = 'B';
4510 } else if (c1=='q'||c1=='Q') {
4511 mime_decode_mode = 'Q';
4512 } else {
4513 break;
4514 }
4515 c1 = (*i_getc)(f); mime_input_buf(mime_input_state.last++) = (unsigned char)c1;
4516 if (!(++i<MAXRECOVER) || c1==EOF) break;
4517 if (c1!='?') {
4518 mime_decode_mode = FALSE;
4519 }
4520 break;
4521 }
4522 }
4523 switch_mime_getc();
4524 if (!mime_decode_mode) {
4525 /* false MIME premble, restart from mime_buffer */
4526 mime_decode_mode = 1; /* no decode, but read from the mime_buffer */
4527 /* Since we are in MIME mode until buffer becomes empty, */
4528 /* we never go into mime_begin again for a while. */
4529 return c1;
4530 }
4531 /* discard mime preemble, and goto MIME mode */
4532 mime_input_state.last = k;
4533 /* do no MIME integrity check */
4534 return c1; /* used only for checking EOF */
4535 }
4536
4537 #ifdef CHECK_OPTION
4538 static void
no_putc(ARG_UNUSED nkf_char c)4539 no_putc(ARG_UNUSED nkf_char c)
4540 {
4541 ;
4542 }
4543
4544 static void
debug(const char * str)4545 debug(const char *str)
4546 {
4547 if (debug_f){
4548 fprintf(stderr, "%s\n", str ? str : "NULL");
4549 }
4550 }
4551 #endif
4552
4553 static void
set_input_codename(const char * codename)4554 set_input_codename(const char *codename)
4555 {
4556 if (!input_codename) {
4557 input_codename = codename;
4558 } else if (strcmp(codename, input_codename) != 0) {
4559 input_codename = "";
4560 }
4561 }
4562
4563 static const char*
get_guessed_code(void)4564 get_guessed_code(void)
4565 {
4566 if (input_codename && !*input_codename) {
4567 input_codename = "BINARY";
4568 } else {
4569 struct input_code *p = find_inputcode_byfunc(iconv);
4570 if (!input_codename) {
4571 input_codename = "ASCII";
4572 } else if (strcmp(input_codename, "Shift_JIS") == 0) {
4573 if (p->score & (SCORE_DEPEND|SCORE_CP932))
4574 input_codename = "CP932";
4575 } else if (strcmp(input_codename, "EUC-JP") == 0) {
4576 if (p->score & SCORE_X0213)
4577 input_codename = "EUC-JIS-2004";
4578 else if (p->score & (SCORE_X0212))
4579 input_codename = "EUCJP-MS";
4580 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
4581 input_codename = "CP51932";
4582 } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
4583 if (p->score & (SCORE_KANA))
4584 input_codename = "CP50221";
4585 else if (p->score & (SCORE_DEPEND|SCORE_CP932))
4586 input_codename = "CP50220";
4587 }
4588 }
4589 return input_codename;
4590 }
4591
4592 #if !defined(PERL_XS) && !defined(WIN32DLL)
4593 static void
print_guessed_code(char * filename)4594 print_guessed_code(char *filename)
4595 {
4596 if (filename != NULL) printf("%s: ", filename);
4597 if (input_codename && !*input_codename) {
4598 printf("BINARY\n");
4599 } else {
4600 input_codename = get_guessed_code();
4601 if (guess_f == 1) {
4602 printf("%s\n", input_codename);
4603 } else {
4604 printf("%s%s%s%s\n",
4605 input_codename,
4606 iconv != w_iconv16 && iconv != w_iconv32 ? "" :
4607 input_endian == ENDIAN_LITTLE ? " LE" :
4608 input_endian == ENDIAN_BIG ? " BE" :
4609 "[BUG]",
4610 input_bom_f ? " (BOM)" : "",
4611 input_eol == CR ? " (CR)" :
4612 input_eol == LF ? " (LF)" :
4613 input_eol == CRLF ? " (CRLF)" :
4614 input_eol == EOF ? " (MIXED NL)" :
4615 "");
4616 }
4617 }
4618 }
4619 #endif /*WIN32DLL*/
4620
4621 #ifdef INPUT_OPTION
4622
4623 static nkf_char
hex_getc(nkf_char ch,FILE * f,nkf_char (* g)(FILE * f),nkf_char (* u)(nkf_char c,FILE * f))4624 hex_getc(nkf_char ch, FILE *f, nkf_char (*g)(FILE *f), nkf_char (*u)(nkf_char c, FILE *f))
4625 {
4626 nkf_char c1, c2, c3;
4627 c1 = (*g)(f);
4628 if (c1 != ch){
4629 return c1;
4630 }
4631 c2 = (*g)(f);
4632 if (!nkf_isxdigit(c2)){
4633 (*u)(c2, f);
4634 return c1;
4635 }
4636 c3 = (*g)(f);
4637 if (!nkf_isxdigit(c3)){
4638 (*u)(c2, f);
4639 (*u)(c3, f);
4640 return c1;
4641 }
4642 return (hex2bin(c2) << 4) | hex2bin(c3);
4643 }
4644
4645 static nkf_char
cap_getc(FILE * f)4646 cap_getc(FILE *f)
4647 {
4648 return hex_getc(':', f, i_cgetc, i_cungetc);
4649 }
4650
4651 static nkf_char
cap_ungetc(nkf_char c,FILE * f)4652 cap_ungetc(nkf_char c, FILE *f)
4653 {
4654 return (*i_cungetc)(c, f);
4655 }
4656
4657 static nkf_char
url_getc(FILE * f)4658 url_getc(FILE *f)
4659 {
4660 return hex_getc('%', f, i_ugetc, i_uungetc);
4661 }
4662
4663 static nkf_char
url_ungetc(nkf_char c,FILE * f)4664 url_ungetc(nkf_char c, FILE *f)
4665 {
4666 return (*i_uungetc)(c, f);
4667 }
4668 #endif
4669
4670 #ifdef NUMCHAR_OPTION
4671 static nkf_char
numchar_getc(FILE * f)4672 numchar_getc(FILE *f)
4673 {
4674 nkf_char (*g)(FILE *) = i_ngetc;
4675 nkf_char (*u)(nkf_char c ,FILE *f) = i_nungetc;
4676 int i = 0, j;
4677 nkf_char buf[12];
4678 nkf_char c = -1;
4679
4680 buf[i] = (*g)(f);
4681 if (buf[i] == '&'){
4682 buf[++i] = (*g)(f);
4683 if (buf[i] == '#'){
4684 c = 0;
4685 buf[++i] = (*g)(f);
4686 if (buf[i] == 'x' || buf[i] == 'X'){
4687 for (j = 0; j < 7; j++){
4688 buf[++i] = (*g)(f);
4689 if (!nkf_isxdigit(buf[i])){
4690 if (buf[i] != ';'){
4691 c = -1;
4692 }
4693 break;
4694 }
4695 c <<= 4;
4696 c |= hex2bin(buf[i]);
4697 }
4698 }else{
4699 for (j = 0; j < 8; j++){
4700 if (j){
4701 buf[++i] = (*g)(f);
4702 }
4703 if (!nkf_isdigit(buf[i])){
4704 if (buf[i] != ';'){
4705 c = -1;
4706 }
4707 break;
4708 }
4709 c *= 10;
4710 c += hex2bin(buf[i]);
4711 }
4712 }
4713 }
4714 }
4715 if (c != -1){
4716 return nkf_char_unicode_new(c);
4717 }
4718 while (i > 0){
4719 (*u)(buf[i], f);
4720 --i;
4721 }
4722 return buf[0];
4723 }
4724
4725 static nkf_char
numchar_ungetc(nkf_char c,FILE * f)4726 numchar_ungetc(nkf_char c, FILE *f)
4727 {
4728 return (*i_nungetc)(c, f);
4729 }
4730 #endif
4731
4732 #ifdef UNICODE_NORMALIZATION
4733
/*
 * getc stage that replaces byte sequences found in normalization_table[].nfd
 * by the corresponding .nfc sequence.
 *
 * Bytes are read through i_nfc_getc and collected in nkf_state->nfc_buf.
 * The table is bisected with lower/upper/mid, reading more input only when
 * the candidate entry is longer than what has been buffered so far.
 * NOTE(review): the bisection presumes normalization_table is sorted by its
 * nfd byte sequence -- confirm against the table definition.
 *
 * Returns one character; any further buffered characters are handed back
 * through i_nfc_ungetc so that later calls see them.
 */
static nkf_char
nfc_getc(FILE *f)
{
    nkf_char (*g)(FILE *f) = i_nfc_getc;
    nkf_char (*u)(nkf_char c ,FILE *f) = i_nfc_ungetc;
    nkf_buf_t *buf = nkf_state->nfc_buf;
    const unsigned char *array;
    int lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
    nkf_char c = (*g)(f);

    /* EOF, values above one byte, and bytes of the form 10xxxxxx can never
       start a table entry here; pass them through untouched */
    if (c == EOF || c > 0xFF || (c & 0xc0) == 0x80) return c;

    nkf_buf_push(buf, c);
    /* the outer loop restarts the search after a replacement, using the
       lower/upper bounds left over from the previous pass */
    do {
        while (lower <= upper) {
            int mid = (lower+upper) / 2;
            int len;
            array = normalization_table[mid].nfd;
            /* compare the candidate's nfd bytes with the buffered input */
            for (len=0; len < NORMALIZATION_TABLE_NFD_LENGTH && array[len]; len++) {
                if (len >= nkf_buf_length(buf)) {
                    /* candidate is longer than the buffer: read one more */
                    c = (*g)(f);
                    if (c == EOF) {
                        /* input exhausted: force both loops to end */
                        len = 0;
                        lower = 1, upper = 0;
                        break;
                    }
                    nkf_buf_push(buf, c);
                }
                if (array[len] != nkf_buf_at(buf, len)) {
                    /* mismatch: narrow the search range; len = 0 marks
                       "no match" for the test below */
                    if (array[len] < nkf_buf_at(buf, len)) lower = mid + 1;
                    else upper = mid - 1;
                    len = 0;
                    break;
                }
            }
            if (len > 0) {
                /* full match: replace the buffer contents by the nfc form */
                int i;
                array = normalization_table[mid].nfc;
                nkf_buf_clear(buf);
                for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
                    nkf_buf_push(buf, array[i]);
                break;
            }
        }
    } while (lower <= upper);

    /* hand back everything except one character, then return that one
       (assumes nkf_buf_pop removes the most recently pushed element, so the
       first buffered character is the one returned -- TODO confirm) */
    while (nkf_buf_length(buf) > 1) (*u)(nkf_buf_pop(buf), f);
    c = nkf_buf_pop(buf);

    return c;
}
4785
4786 static nkf_char
nfc_ungetc(nkf_char c,FILE * f)4787 nfc_ungetc(nkf_char c, FILE *f)
4788 {
4789 return (*i_nfc_ungetc)(c, f);
4790 }
4791 #endif /* UNICODE_NORMALIZATION */
4792
4793
4794 static nkf_char
base64decode(nkf_char c)4795 base64decode(nkf_char c)
4796 {
4797 int i;
4798 if (c > '@') {
4799 if (c < '[') {
4800 i = c - 'A'; /* A..Z 0-25 */
4801 } else if (c == '_') {
4802 i = '?' /* 63 */ ; /* _ 63 */
4803 } else {
4804 i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */
4805 }
4806 } else if (c > '/') {
4807 i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */
4808 } else if (c == '+' || c == '-') {
4809 i = '>' /* 62 */ ; /* + and - 62 */
4810 } else {
4811 i = '?' /* 63 */ ; /* / 63 */
4812 }
4813 return (i);
4814 }
4815
4816 static nkf_char
mime_getc(FILE * f)4817 mime_getc(FILE *f)
4818 {
4819 nkf_char c1, c2, c3, c4, cc;
4820 nkf_char t1, t2, t3, t4, mode, exit_mode;
4821 nkf_char lwsp_count;
4822 char *lwsp_buf;
4823 char *lwsp_buf_new;
4824 nkf_char lwsp_size = 128;
4825
4826 if (mime_input_state.top != mime_input_state.last) { /* Something is in FIFO */
4827 return mime_input_buf(mime_input_state.top++);
4828 }
4829 if (mime_decode_mode==1 ||mime_decode_mode==FALSE) {
4830 mime_decode_mode=FALSE;
4831 unswitch_mime_getc();
4832 return (*i_getc)(f);
4833 }
4834
4835 if (mimebuf_f == FIXED_MIME)
4836 exit_mode = mime_decode_mode;
4837 else
4838 exit_mode = FALSE;
4839 if (mime_decode_mode == 'Q') {
4840 if ((c1 = (*i_mgetc)(f)) == EOF) return (EOF);
4841 restart_mime_q:
4842 if (c1=='_' && mimebuf_f != FIXED_MIME) return SP;
4843 if (c1<=SP || DEL<=c1) {
4844 mime_decode_mode = exit_mode; /* prepare for quit */
4845 return c1;
4846 }
4847 if (c1!='=' && (c1!='?' || mimebuf_f == FIXED_MIME)) {
4848 return c1;
4849 }
4850
4851 mime_decode_mode = exit_mode; /* prepare for quit */
4852 if ((c2 = (*i_mgetc)(f)) == EOF) return (EOF);
4853 if (c1=='?'&&c2=='=' && mimebuf_f != FIXED_MIME) {
4854 /* end Q encoding */
4855 input_mode = exit_mode;
4856 lwsp_count = 0;
4857 lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char));
4858 while ((c1=(*i_getc)(f))!=EOF) {
4859 switch (c1) {
4860 case LF:
4861 case CR:
4862 if (c1==LF) {
4863 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4864 i_ungetc(SP,f);
4865 continue;
4866 } else {
4867 i_ungetc(c1,f);
4868 }
4869 c1 = LF;
4870 } else {
4871 if ((c1=(*i_getc)(f))!=EOF && c1 == LF) {
4872 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4873 i_ungetc(SP,f);
4874 continue;
4875 } else {
4876 i_ungetc(c1,f);
4877 }
4878 i_ungetc(LF,f);
4879 } else {
4880 i_ungetc(c1,f);
4881 }
4882 c1 = CR;
4883 }
4884 break;
4885 case SP:
4886 case TAB:
4887 lwsp_buf[lwsp_count] = (unsigned char)c1;
4888 if (lwsp_count++>lwsp_size){
4889 lwsp_size <<= 1;
4890 lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4891 lwsp_buf = lwsp_buf_new;
4892 }
4893 continue;
4894 }
4895 break;
4896 }
4897 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
4898 i_ungetc(c1,f);
4899 for(lwsp_count--;lwsp_count>0;lwsp_count--)
4900 i_ungetc(lwsp_buf[lwsp_count],f);
4901 c1 = lwsp_buf[0];
4902 }
4903 nkf_xfree(lwsp_buf);
4904 return c1;
4905 }
4906 if (c1=='='&&c2<SP) { /* this is soft wrap */
4907 while((c1 = (*i_mgetc)(f)) <=SP) {
4908 if (c1 == EOF) return (EOF);
4909 }
4910 mime_decode_mode = 'Q'; /* still in MIME */
4911 goto restart_mime_q;
4912 }
4913 if (c1=='?') {
4914 mime_decode_mode = 'Q'; /* still in MIME */
4915 (*i_mungetc)(c2,f);
4916 return c1;
4917 }
4918 if ((c3 = (*i_mgetc)(f)) == EOF) return (EOF);
4919 if (c2<=SP) return c2;
4920 mime_decode_mode = 'Q'; /* still in MIME */
4921 return ((hex2bin(c2)<<4) + hex2bin(c3));
4922 }
4923
4924 if (mime_decode_mode != 'B') {
4925 mime_decode_mode = FALSE;
4926 return (*i_mgetc)(f);
4927 }
4928
4929
4930 /* Base64 encoding */
4931 /*
4932 MIME allows line break in the middle of
4933 Base64, but we are very pessimistic in decoding
4934 in unbuf mode because MIME encoded code may broken by
4935 less or editor's control sequence (such as ESC-[-K in unbuffered
4936 mode. ignore incomplete MIME.
4937 */
4938 mode = mime_decode_mode;
4939 mime_decode_mode = exit_mode; /* prepare for quit */
4940
4941 while ((c1 = (*i_mgetc)(f))<=SP) {
4942 if (c1==EOF)
4943 return (EOF);
4944 }
4945 mime_c2_retry:
4946 if ((c2 = (*i_mgetc)(f))<=SP) {
4947 if (c2==EOF)
4948 return (EOF);
4949 if (mime_f != STRICT_MIME) goto mime_c2_retry;
4950 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
4951 return c2;
4952 }
4953 if ((c1 == '?') && (c2 == '=')) {
4954 input_mode = ASCII;
4955 lwsp_count = 0;
4956 lwsp_buf = nkf_xmalloc((lwsp_size+5)*sizeof(char));
4957 while ((c1=(*i_getc)(f))!=EOF) {
4958 switch (c1) {
4959 case LF:
4960 case CR:
4961 if (c1==LF) {
4962 if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4963 i_ungetc(SP,f);
4964 continue;
4965 } else {
4966 i_ungetc(c1,f);
4967 }
4968 c1 = LF;
4969 } else {
4970 if ((c1=(*i_getc)(f))!=EOF) {
4971 if (c1==SP) {
4972 i_ungetc(SP,f);
4973 continue;
4974 } else if ((c1=(*i_getc)(f))!=EOF && nkf_isblank(c1)) {
4975 i_ungetc(SP,f);
4976 continue;
4977 } else {
4978 i_ungetc(c1,f);
4979 }
4980 i_ungetc(LF,f);
4981 } else {
4982 i_ungetc(c1,f);
4983 }
4984 c1 = CR;
4985 }
4986 break;
4987 case SP:
4988 case TAB:
4989 lwsp_buf[lwsp_count] = (unsigned char)c1;
4990 if (lwsp_count++>lwsp_size){
4991 lwsp_size <<= 1;
4992 lwsp_buf_new = nkf_xrealloc(lwsp_buf, (lwsp_size+5)*sizeof(char));
4993 lwsp_buf = lwsp_buf_new;
4994 }
4995 continue;
4996 }
4997 break;
4998 }
4999 if (lwsp_count > 0 && (c1 != '=' || (lwsp_buf[lwsp_count-1] != SP && lwsp_buf[lwsp_count-1] != TAB))) {
5000 i_ungetc(c1,f);
5001 for(lwsp_count--;lwsp_count>0;lwsp_count--)
5002 i_ungetc(lwsp_buf[lwsp_count],f);
5003 c1 = lwsp_buf[0];
5004 }
5005 nkf_xfree(lwsp_buf);
5006 return c1;
5007 }
5008 mime_c3_retry:
5009 if ((c3 = (*i_mgetc)(f))<=SP) {
5010 if (c3==EOF)
5011 return (EOF);
5012 if (mime_f != STRICT_MIME) goto mime_c3_retry;
5013 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5014 return c3;
5015 }
5016 mime_c4_retry:
5017 if ((c4 = (*i_mgetc)(f))<=SP) {
5018 if (c4==EOF)
5019 return (EOF);
5020 if (mime_f != STRICT_MIME) goto mime_c4_retry;
5021 if (mimebuf_f!=FIXED_MIME) input_mode = ASCII;
5022 return c4;
5023 }
5024
5025 mime_decode_mode = mode; /* still in MIME sigh... */
5026
5027 /* BASE 64 decoding */
5028
5029 t1 = 0x3f & base64decode(c1);
5030 t2 = 0x3f & base64decode(c2);
5031 t3 = 0x3f & base64decode(c3);
5032 t4 = 0x3f & base64decode(c4);
5033 cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03);
5034 if (c2 != '=') {
5035 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
5036 cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f);
5037 if (c3 != '=') {
5038 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
5039 cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f);
5040 if (c4 != '=')
5041 mime_input_buf(mime_input_state.last++) = (unsigned char)cc;
5042 }
5043 } else {
5044 return c1;
5045 }
5046 return mime_input_buf(mime_input_state.top++);
5047 }
5048
/* Base64 output alphabet, indexed by 6-bit value. */
static const char basis_64[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

/* Capacity (in bytes, excluding one spare slot) of the buffer below. */
#define MIMEOUT_BUF_LENGTH 74
/* Characters held back by the MIME output stage before they are emitted. */
static struct {
    unsigned char buf[MIMEOUT_BUF_LENGTH+1];
    int count;  /* number of bytes currently stored in buf */
} mimeout_state;
5057
5058 /*nkf_char mime_lastchar2, mime_lastchar1;*/
5059
5060 static void
open_mime(nkf_char mode)5061 open_mime(nkf_char mode)
5062 {
5063 const unsigned char *p;
5064 int i;
5065 int j;
5066 p = mime_pattern[0];
5067 for(i=0;mime_pattern[i];i++) {
5068 if (mode == mime_encode[i]) {
5069 p = mime_pattern[i];
5070 break;
5071 }
5072 }
5073 mimeout_mode = mime_encode_method[i];
5074 i = 0;
5075 if (base64_count>45) {
5076 if (mimeout_state.count>0 && nkf_isblank(mimeout_state.buf[i])){
5077 (*o_mputc)(mimeout_state.buf[i]);
5078 i++;
5079 }
5080 put_newline(o_mputc);
5081 (*o_mputc)(SP);
5082 base64_count = 1;
5083 if (mimeout_state.count>0 && nkf_isspace(mimeout_state.buf[i])) {
5084 i++;
5085 }
5086 }
5087 for (;i<mimeout_state.count;i++) {
5088 if (nkf_isspace(mimeout_state.buf[i])) {
5089 (*o_mputc)(mimeout_state.buf[i]);
5090 base64_count ++;
5091 } else {
5092 break;
5093 }
5094 }
5095 while(*p) {
5096 (*o_mputc)(*p++);
5097 base64_count ++;
5098 }
5099 j = mimeout_state.count;
5100 mimeout_state.count = 0;
5101 for (;i<j;i++) {
5102 mime_putc(mimeout_state.buf[i]);
5103 }
5104 }
5105
5106 static void
mime_prechar(nkf_char c2,nkf_char c1)5107 mime_prechar(nkf_char c2, nkf_char c1)
5108 {
5109 if (mimeout_mode > 0){
5110 if (c2 == EOF){
5111 if (base64_count + mimeout_state.count/3*4> 73){
5112 (*o_base64conv)(EOF,0);
5113 oconv_newline(o_base64conv);
5114 (*o_base64conv)(0,SP);
5115 base64_count = 1;
5116 }
5117 } else {
5118 if ((c2 != 0 || c1 > DEL) && base64_count + mimeout_state.count/3*4> 66) {
5119 (*o_base64conv)(EOF,0);
5120 oconv_newline(o_base64conv);
5121 (*o_base64conv)(0,SP);
5122 base64_count = 1;
5123 mimeout_mode = -1;
5124 }
5125 }
5126 } else if (c2) {
5127 if (c2 != EOF && base64_count + mimeout_state.count/3*4> 60) {
5128 mimeout_mode = (output_mode==ASCII ||output_mode == ISO_8859_1) ? 'Q' : 'B';
5129 open_mime(output_mode);
5130 (*o_base64conv)(EOF,0);
5131 oconv_newline(o_base64conv);
5132 (*o_base64conv)(0,SP);
5133 base64_count = 1;
5134 mimeout_mode = -1;
5135 }
5136 }
5137 }
5138
5139 static void
close_mime(void)5140 close_mime(void)
5141 {
5142 (*o_mputc)('?');
5143 (*o_mputc)('=');
5144 base64_count += 2;
5145 mimeout_mode = 0;
5146 }
5147
5148 static void
eof_mime(void)5149 eof_mime(void)
5150 {
5151 switch(mimeout_mode) {
5152 case 'Q':
5153 case 'B':
5154 break;
5155 case 2:
5156 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4)]);
5157 (*o_mputc)('=');
5158 (*o_mputc)('=');
5159 base64_count += 3;
5160 break;
5161 case 1:
5162 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2)]);
5163 (*o_mputc)('=');
5164 base64_count += 2;
5165 break;
5166 }
5167 if (mimeout_mode > 0) {
5168 if (mimeout_f!=FIXED_MIME) {
5169 close_mime();
5170 } else if (mimeout_mode != 'Q')
5171 mimeout_mode = 'B';
5172 }
5173 }
5174
5175 static void
mimeout_addchar(nkf_char c)5176 mimeout_addchar(nkf_char c)
5177 {
5178 switch(mimeout_mode) {
5179 case 'Q':
5180 if (c==CR||c==LF) {
5181 (*o_mputc)(c);
5182 base64_count = 0;
5183 } else if(!nkf_isalnum(c)) {
5184 (*o_mputc)('=');
5185 (*o_mputc)(bin2hex(((c>>4)&0xf)));
5186 (*o_mputc)(bin2hex((c&0xf)));
5187 base64_count += 3;
5188 } else {
5189 (*o_mputc)(c);
5190 base64_count++;
5191 }
5192 break;
5193 case 'B':
5194 nkf_state->mimeout_state=c;
5195 (*o_mputc)(basis_64[c>>2]);
5196 mimeout_mode=2;
5197 base64_count ++;
5198 break;
5199 case 2:
5200 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0x3)<< 4) | ((c & 0xF0) >> 4)]);
5201 nkf_state->mimeout_state=c;
5202 mimeout_mode=1;
5203 base64_count ++;
5204 break;
5205 case 1:
5206 (*o_mputc)(basis_64[((nkf_state->mimeout_state & 0xF) << 2) | ((c & 0xC0) >>6)]);
5207 (*o_mputc)(basis_64[c & 0x3F]);
5208 mimeout_mode='B';
5209 base64_count += 2;
5210 break;
5211 default:
5212 (*o_mputc)(c);
5213 base64_count++;
5214 break;
5215 }
5216 }
5217
5218 static void
mime_putc(nkf_char c)5219 mime_putc(nkf_char c)
5220 {
5221 int i, j;
5222 nkf_char lastchar;
5223
5224 if (mimeout_f == FIXED_MIME){
5225 if (mimeout_mode == 'Q'){
5226 if (base64_count > 71){
5227 if (c!=CR && c!=LF) {
5228 (*o_mputc)('=');
5229 put_newline(o_mputc);
5230 }
5231 base64_count = 0;
5232 }
5233 }else{
5234 if (base64_count > 71){
5235 eof_mime();
5236 put_newline(o_mputc);
5237 base64_count = 0;
5238 }
5239 if (c == EOF) { /* c==EOF */
5240 eof_mime();
5241 }
5242 }
5243 if (c != EOF) { /* c==EOF */
5244 mimeout_addchar(c);
5245 }
5246 return;
5247 }
5248
5249 /* mimeout_f != FIXED_MIME */
5250
5251 if (c == EOF) { /* c==EOF */
5252 if (mimeout_mode == -1 && mimeout_state.count > 1) open_mime(output_mode);
5253 j = mimeout_state.count;
5254 mimeout_state.count = 0;
5255 i = 0;
5256 if (mimeout_mode > 0) {
5257 if (!nkf_isblank(mimeout_state.buf[j-1])) {
5258 for (;i<j;i++) {
5259 if (nkf_isspace(mimeout_state.buf[i]) && base64_count < 71){
5260 break;
5261 }
5262 mimeout_addchar(mimeout_state.buf[i]);
5263 }
5264 eof_mime();
5265 for (;i<j;i++) {
5266 mimeout_addchar(mimeout_state.buf[i]);
5267 }
5268 } else {
5269 for (;i<j;i++) {
5270 mimeout_addchar(mimeout_state.buf[i]);
5271 }
5272 eof_mime();
5273 }
5274 } else {
5275 for (;i<j;i++) {
5276 mimeout_addchar(mimeout_state.buf[i]);
5277 }
5278 }
5279 return;
5280 }
5281
5282 if (mimeout_state.count > 0){
5283 lastchar = mimeout_state.buf[mimeout_state.count - 1];
5284 }else{
5285 lastchar = -1;
5286 }
5287
5288 if (mimeout_mode=='Q') {
5289 if (c <= DEL && (output_mode==ASCII ||output_mode == ISO_8859_1)) {
5290 if (c == CR || c == LF) {
5291 close_mime();
5292 (*o_mputc)(c);
5293 base64_count = 0;
5294 return;
5295 } else if (c <= SP) {
5296 close_mime();
5297 if (base64_count > 70) {
5298 put_newline(o_mputc);
5299 base64_count = 0;
5300 }
5301 if (!nkf_isblank(c)) {
5302 (*o_mputc)(SP);
5303 base64_count++;
5304 }
5305 } else {
5306 if (base64_count > 70) {
5307 close_mime();
5308 put_newline(o_mputc);
5309 (*o_mputc)(SP);
5310 base64_count = 1;
5311 open_mime(output_mode);
5312 }
5313 if (!nkf_noescape_mime(c)) {
5314 mimeout_addchar(c);
5315 return;
5316 }
5317 }
5318 if (c != 0x1B) {
5319 (*o_mputc)(c);
5320 base64_count++;
5321 return;
5322 }
5323 }
5324 }
5325
5326 if (mimeout_mode <= 0) {
5327 if (c <= DEL && (output_mode==ASCII || output_mode == ISO_8859_1 ||
5328 output_mode == UTF_8)) {
5329 if (nkf_isspace(c)) {
5330 int flag = 0;
5331 if (mimeout_mode == -1) {
5332 flag = 1;
5333 }
5334 if (c==CR || c==LF) {
5335 if (flag) {
5336 open_mime(output_mode);
5337 output_mode = 0;
5338 } else {
5339 base64_count = 0;
5340 }
5341 }
5342 for (i=0;i<mimeout_state.count;i++) {
5343 (*o_mputc)(mimeout_state.buf[i]);
5344 if (mimeout_state.buf[i] == CR || mimeout_state.buf[i] == LF){
5345 base64_count = 0;
5346 }else{
5347 base64_count++;
5348 }
5349 }
5350 if (flag) {
5351 eof_mime();
5352 base64_count = 0;
5353 mimeout_mode = 0;
5354 }
5355 mimeout_state.buf[0] = (char)c;
5356 mimeout_state.count = 1;
5357 }else{
5358 if (base64_count > 1
5359 && base64_count + mimeout_state.count > 76
5360 && mimeout_state.buf[0] != CR && mimeout_state.buf[0] != LF){
5361 static const char *str = "boundary=\"";
5362 static int len = 10;
5363 i = 0;
5364
5365 for (; i < mimeout_state.count - len; ++i) {
5366 if (!strncmp((char *)(mimeout_state.buf+i), str, len)) {
5367 i += len - 2;
5368 break;
5369 }
5370 }
5371
5372 if (i == 0 || i == mimeout_state.count - len) {
5373 put_newline(o_mputc);
5374 base64_count = 0;
5375 if (!nkf_isspace(mimeout_state.buf[0])){
5376 (*o_mputc)(SP);
5377 base64_count++;
5378 }
5379 }
5380 else {
5381 int j;
5382 for (j = 0; j <= i; ++j) {
5383 (*o_mputc)(mimeout_state.buf[j]);
5384 }
5385 put_newline(o_mputc);
5386 base64_count = 1;
5387 for (; j <= mimeout_state.count; ++j) {
5388 mimeout_state.buf[j - i] = mimeout_state.buf[j];
5389 }
5390 mimeout_state.count -= i;
5391 }
5392 }
5393 mimeout_state.buf[mimeout_state.count++] = (char)c;
5394 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
5395 open_mime(output_mode);
5396 }
5397 }
5398 return;
5399 }else{
5400 if (lastchar==CR || lastchar == LF){
5401 for (i=0;i<mimeout_state.count;i++) {
5402 (*o_mputc)(mimeout_state.buf[i]);
5403 }
5404 base64_count = 0;
5405 mimeout_state.count = 0;
5406 }
5407 if (lastchar==SP) {
5408 for (i=0;i<mimeout_state.count-1;i++) {
5409 (*o_mputc)(mimeout_state.buf[i]);
5410 base64_count++;
5411 }
5412 mimeout_state.buf[0] = SP;
5413 mimeout_state.count = 1;
5414 }
5415 open_mime(output_mode);
5416 }
5417 }else{
5418 /* mimeout_mode == 'B', 1, 2 */
5419 if (c <= DEL && (output_mode==ASCII || output_mode == ISO_8859_1 ||
5420 output_mode == UTF_8)) {
5421 if (lastchar == CR || lastchar == LF){
5422 if (nkf_isblank(c)) {
5423 for (i=0;i<mimeout_state.count;i++) {
5424 mimeout_addchar(mimeout_state.buf[i]);
5425 }
5426 mimeout_state.count = 0;
5427 } else {
5428 eof_mime();
5429 for (i=0;i<mimeout_state.count;i++) {
5430 (*o_mputc)(mimeout_state.buf[i]);
5431 }
5432 base64_count = 0;
5433 mimeout_state.count = 0;
5434 }
5435 mimeout_state.buf[mimeout_state.count++] = (char)c;
5436 return;
5437 }
5438 if (nkf_isspace(c)) {
5439 for (i=0;i<mimeout_state.count;i++) {
5440 if (SP<mimeout_state.buf[i] && mimeout_state.buf[i]<DEL) {
5441 eof_mime();
5442 for (i=0;i<mimeout_state.count;i++) {
5443 (*o_mputc)(mimeout_state.buf[i]);
5444 base64_count++;
5445 }
5446 mimeout_state.count = 0;
5447 }
5448 }
5449 mimeout_state.buf[mimeout_state.count++] = (char)c;
5450 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
5451 eof_mime();
5452 for (j=0;j<mimeout_state.count;j++) {
5453 (*o_mputc)(mimeout_state.buf[j]);
5454 base64_count++;
5455 }
5456 mimeout_state.count = 0;
5457 }
5458 return;
5459 }
5460 if (mimeout_state.count>0 && SP<c && c!='=') {
5461 mimeout_state.buf[mimeout_state.count++] = (char)c;
5462 if (mimeout_state.count>MIMEOUT_BUF_LENGTH) {
5463 j = mimeout_state.count;
5464 mimeout_state.count = 0;
5465 for (i=0;i<j;i++) {
5466 mimeout_addchar(mimeout_state.buf[i]);
5467 }
5468 }
5469 return;
5470 }
5471 }
5472 }
5473 if (mimeout_state.count>0) {
5474 j = mimeout_state.count;
5475 mimeout_state.count = 0;
5476 for (i=0;i<j;i++) {
5477 if (mimeout_state.buf[i]==CR || mimeout_state.buf[i]==LF)
5478 break;
5479 mimeout_addchar(mimeout_state.buf[i]);
5480 }
5481 if (i<j) {
5482 eof_mime();
5483 base64_count=0;
5484 for (;i<j;i++) {
5485 (*o_mputc)(mimeout_state.buf[i]);
5486 }
5487 open_mime(output_mode);
5488 }
5489 }
5490 mimeout_addchar(c);
5491 }
5492
5493 static void
base64_conv(nkf_char c2,nkf_char c1)5494 base64_conv(nkf_char c2, nkf_char c1)
5495 {
5496 mime_prechar(c2, c1);
5497 (*o_base64conv)(c2,c1);
5498 }
5499
#ifdef HAVE_ICONV_H
/*
 * Experimental bridge to the system iconv(3) library.
 *
 * NOTE(review): elsewhere in this file the identifier `iconv` is used as a
 * file-level input-conversion function pointer (it is compared against
 * w_iconv16/w_iconv32 and passed to find_inputcode_byfunc()).  That name
 * collides with the library function iconv() called below, so one of the
 * two has to be renamed before HAVE_ICONV_H can be enabled -- confirm.
 *
 * Changes from the previous, non-compiling version:
 *   - the typedef now actually names the type;
 *   - nkf_iconv_new() allocates and returns a pointer (NULL on failure)
 *     instead of using "->" on an uninitialised struct value;
 *   - nkf_iconv_convert() reads from its `input` parameter, terminates at
 *     end of input, writes exactly the bytes iconv produced and keeps
 *     unconverted tail bytes for the next round;
 *   - nkf_iconv_close() uses its own parameter and the real member names.
 */
typedef struct nkf_iconv_t {
    iconv_t cd;
    char *input_buffer;
    size_t input_buffer_size;
    char *output_buffer;
    size_t output_buffer_size;
} nkf_iconv_t;

/*
 * Create a converter from FROMCODE to TOCODE.
 * Returns NULL (after printing a diagnostic on stderr) when iconv_open()
 * fails.  Release the result with nkf_iconv_close().
 */
static nkf_iconv_t *
nkf_iconv_new(char *tocode, char *fromcode)
{
    nkf_iconv_t *converter = nkf_xmalloc(sizeof(*converter));

    converter->input_buffer_size = IOBUF_SIZE;
    converter->input_buffer = nkf_xmalloc(converter->input_buffer_size);
    converter->output_buffer_size = IOBUF_SIZE * 2;
    converter->output_buffer = nkf_xmalloc(converter->output_buffer_size);
    converter->cd = iconv_open(tocode, fromcode);
    if (converter->cd == (iconv_t)-1)
    {
        if (errno == EINVAL) {
            fprintf(stderr, "iconv doesn't support %s to %s conversion.\n", fromcode, tocode);
        } else {
            perror("can't iconv_open");
        }
        nkf_xfree(converter->input_buffer);
        nkf_xfree(converter->output_buffer);
        nkf_xfree(converter);
        return NULL;
    }
    return converter;
}

/*
 * Convert everything readable from INPUT (through i_getc) and write the
 * result through o_putc.
 * Returns the number of irreversible conversions reported by iconv(), or
 * (size_t)-1 on error.
 */
static size_t
nkf_iconv_convert(nkf_iconv_t *converter, FILE *input)
{
    size_t invalid = (size_t)0;
    size_t pending = (size_t)0;    /* unconverted bytes at buffer start */
    nkf_char c = 0;

    for (;;) {
        char *in_ptr;
        char *out_ptr;
        size_t in_left, out_left, produced, k, ret;
        int saved_errno;

        /* top up the input buffer until it is full or input ends */
        while (c != EOF && pending < converter->input_buffer_size) {
            c = (*i_getc)(input);
            if (c != EOF) {
                converter->input_buffer[pending++] = (char)c;
            }
        }
        if (pending == 0) break;

        in_ptr = converter->input_buffer;
        in_left = pending;
        out_ptr = converter->output_buffer;
        out_left = converter->output_buffer_size;
        ret = iconv(converter->cd, &in_ptr, &in_left, &out_ptr, &out_left);
        saved_errno = errno;    /* o_putc below may clobber errno */

        produced = converter->output_buffer_size - out_left;
        for (k = 0; k < produced; k++) {
            (*o_putc)((unsigned char)converter->output_buffer[k]);
        }

        /* keep whatever iconv did not consume for the next round */
        if (in_left > 0 && in_ptr != converter->input_buffer)
            memmove(converter->input_buffer, in_ptr, in_left);
        pending = in_left;

        if (ret != (size_t)-1) {
            invalid += ret;
            continue;
        }
        switch (saved_errno) {
        case EINVAL:
            /* incomplete sequence at the end of the buffer: fine if more
               input can still be appended, fatal otherwise */
            if (c == EOF || in_left == converter->input_buffer_size) {
                errno = saved_errno;
                perror("can't iconv");
                return (size_t)-1;
            }
            break;
        case E2BIG:
            /* output was flushed above, so a retry normally suffices;
               grow only when not even one character fitted */
            if (produced == 0) {
                char *grown = realloc(converter->output_buffer,
                                      converter->output_buffer_size * 2);
                if (grown == NULL) {
                    perror("can't realloc");
                    return (size_t)-1;
                }
                converter->output_buffer = grown;
                converter->output_buffer_size *= 2;
            }
            break;
        default:
            errno = saved_errno;
            perror("can't iconv");
            return (size_t)-1;
        }
    }

    /* emit any final shift sequence of a stateful target encoding */
    {
        char *out_ptr = converter->output_buffer;
        size_t out_left = converter->output_buffer_size;
        size_t k;

        if (iconv(converter->cd, NULL, NULL, &out_ptr, &out_left) != (size_t)-1) {
            for (k = 0; k < converter->output_buffer_size - out_left; k++) {
                (*o_putc)((unsigned char)converter->output_buffer[k]);
            }
        }
    }

    return invalid;
}


/* Release a converter created by nkf_iconv_new(); NULL is ignored. */
static void
nkf_iconv_close(nkf_iconv_t *convert)
{
    if (convert == NULL) {
        return;
    }
    nkf_xfree(convert->input_buffer);
    nkf_xfree(convert->output_buffer);
    iconv_close(convert->cd);
    nkf_xfree(convert);
}
#endif
5589
5590
5591 static void
reinit(void)5592 reinit(void)
5593 {
5594 {
5595 struct input_code *p = input_code_list;
5596 while (p->name){
5597 status_reinit(p++);
5598 }
5599 }
5600 unbuf_f = FALSE;
5601 estab_f = FALSE;
5602 nop_f = FALSE;
5603 binmode_f = TRUE;
5604 rot_f = FALSE;
5605 hira_f = FALSE;
5606 alpha_f = FALSE;
5607 mime_f = MIME_DECODE_DEFAULT;
5608 mime_decode_f = FALSE;
5609 mimebuf_f = FALSE;
5610 broken_f = FALSE;
5611 iso8859_f = FALSE;
5612 mimeout_f = FALSE;
5613 x0201_f = NKF_UNSPECIFIED;
5614 iso2022jp_f = FALSE;
5615 #if defined(UTF8_INPUT_ENABLE) || defined(UTF8_OUTPUT_ENABLE)
5616 ms_ucs_map_f = UCS_MAP_ASCII;
5617 #endif
5618 #ifdef UTF8_INPUT_ENABLE
5619 no_cp932ext_f = FALSE;
5620 no_best_fit_chars_f = FALSE;
5621 encode_fallback = NULL;
5622 unicode_subchar = '?';
5623 input_endian = ENDIAN_BIG;
5624 #endif
5625 #ifdef UTF8_OUTPUT_ENABLE
5626 output_bom_f = FALSE;
5627 output_endian = ENDIAN_BIG;
5628 #endif
5629 #ifdef UNICODE_NORMALIZATION
5630 nfc_f = FALSE;
5631 #endif
5632 #ifdef INPUT_OPTION
5633 cap_f = FALSE;
5634 url_f = FALSE;
5635 numchar_f = FALSE;
5636 #endif
5637 #ifdef CHECK_OPTION
5638 noout_f = FALSE;
5639 debug_f = FALSE;
5640 #endif
5641 guess_f = 0;
5642 #ifdef EXEC_IO
5643 exec_f = 0;
5644 #endif
5645 #ifdef SHIFTJIS_CP932
5646 cp51932_f = TRUE;
5647 cp932inv_f = TRUE;
5648 #endif
5649 #ifdef X0212_ENABLE
5650 x0212_f = FALSE;
5651 x0213_f = FALSE;
5652 #endif
5653 {
5654 int i;
5655 for (i = 0; i < 256; i++){
5656 prefix_table[i] = 0;
5657 }
5658 }
5659 hold_count = 0;
5660 mimeout_state.count = 0;
5661 mimeout_mode = 0;
5662 base64_count = 0;
5663 f_line = 0;
5664 f_prev = 0;
5665 fold_preserve_f = FALSE;
5666 fold_f = FALSE;
5667 fold_len = 0;
5668 kanji_intro = DEFAULT_J;
5669 ascii_intro = DEFAULT_R;
5670 fold_margin = FOLD_MARGIN;
5671 o_zconv = no_connection;
5672 o_fconv = no_connection;
5673 o_eol_conv = no_connection;
5674 o_rot_conv = no_connection;
5675 o_hira_conv = no_connection;
5676 o_base64conv = no_connection;
5677 o_iso2022jp_check_conv = no_connection;
5678 o_putc = std_putc;
5679 i_getc = std_getc;
5680 i_ungetc = std_ungetc;
5681 i_bgetc = std_getc;
5682 i_bungetc = std_ungetc;
5683 o_mputc = std_putc;
5684 i_mgetc = std_getc;
5685 i_mungetc = std_ungetc;
5686 i_mgetc_buf = std_getc;
5687 i_mungetc_buf = std_ungetc;
5688 output_mode = ASCII;
5689 input_mode = ASCII;
5690 mime_decode_mode = FALSE;
5691 file_out_f = FALSE;
5692 eolmode_f = 0;
5693 input_eol = 0;
5694 prev_cr = 0;
5695 option_mode = 0;
5696 z_prev2=0,z_prev1=0;
5697 #ifdef CHECK_OPTION
5698 iconv_for_check = 0;
5699 #endif
5700 input_codename = NULL;
5701 input_encoding = NULL;
5702 output_encoding = NULL;
5703 nkf_state_init();
5704 #ifdef WIN32DLL
5705 reinitdll();
5706 #endif /*WIN32DLL*/
5707 }
5708
5709 static int
module_connection(void)5710 module_connection(void)
5711 {
5712 if (input_encoding) set_input_encoding(input_encoding);
5713 if (!output_encoding) {
5714 output_encoding = nkf_default_encoding();
5715 }
5716 if (!output_encoding) {
5717 if (noout_f || guess_f) output_encoding = nkf_enc_from_index(ISO_2022_JP);
5718 else return -1;
5719 }
5720 set_output_encoding(output_encoding);
5721 oconv = nkf_enc_to_oconv(output_encoding);
5722 o_putc = std_putc;
5723 if (nkf_enc_unicode_p(output_encoding))
5724 output_mode = UTF_8;
5725
5726 if (x0201_f == NKF_UNSPECIFIED) {
5727 x0201_f = X0201_DEFAULT;
5728 }
5729
5730 /* replace continuation module, from output side */
5731
5732 /* output redirection */
5733 #ifdef CHECK_OPTION
5734 if (noout_f || guess_f){
5735 o_putc = no_putc;
5736 }
5737 #endif
5738 if (mimeout_f) {
5739 o_mputc = o_putc;
5740 o_putc = mime_putc;
5741 if (mimeout_f == TRUE) {
5742 o_base64conv = oconv; oconv = base64_conv;
5743 }
5744 /* base64_count = 0; */
5745 }
5746
5747 if (eolmode_f || guess_f) {
5748 o_eol_conv = oconv; oconv = eol_conv;
5749 }
5750 if (rot_f) {
5751 o_rot_conv = oconv; oconv = rot_conv;
5752 }
5753 if (iso2022jp_f) {
5754 o_iso2022jp_check_conv = oconv; oconv = iso2022jp_check_conv;
5755 }
5756 if (hira_f) {
5757 o_hira_conv = oconv; oconv = hira_conv;
5758 }
5759 if (fold_f) {
5760 o_fconv = oconv; oconv = fold_conv;
5761 f_line = 0;
5762 }
5763 if (alpha_f || x0201_f) {
5764 o_zconv = oconv; oconv = z_conv;
5765 }
5766
5767 i_getc = std_getc;
5768 i_ungetc = std_ungetc;
5769 /* input redirection */
5770 #ifdef INPUT_OPTION
5771 if (cap_f){
5772 i_cgetc = i_getc; i_getc = cap_getc;
5773 i_cungetc = i_ungetc; i_ungetc= cap_ungetc;
5774 }
5775 if (url_f){
5776 i_ugetc = i_getc; i_getc = url_getc;
5777 i_uungetc = i_ungetc; i_ungetc= url_ungetc;
5778 }
5779 #endif
5780 #ifdef NUMCHAR_OPTION
5781 if (numchar_f){
5782 i_ngetc = i_getc; i_getc = numchar_getc;
5783 i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
5784 }
5785 #endif
5786 #ifdef UNICODE_NORMALIZATION
5787 if (nfc_f){
5788 i_nfc_getc = i_getc; i_getc = nfc_getc;
5789 i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
5790 }
5791 #endif
5792 if (mime_f && mimebuf_f==FIXED_MIME) {
5793 i_mgetc = i_getc; i_getc = mime_getc;
5794 i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
5795 }
5796 if (broken_f & 1) {
5797 i_bgetc = i_getc; i_getc = broken_getc;
5798 i_bungetc = i_ungetc; i_ungetc = broken_ungetc;
5799 }
5800 if (input_encoding) {
5801 set_iconv(-TRUE, nkf_enc_to_iconv(input_encoding));
5802 } else {
5803 set_iconv(FALSE, e_iconv);
5804 }
5805
5806 {
5807 struct input_code *p = input_code_list;
5808 while (p->name){
5809 status_reinit(p++);
5810 }
5811 }
5812 return 0;
5813 }
5814
5815 /*
5816 Conversion main loop. Code detection only.
5817 */
5818
5819 #if !defined(PERL_XS) && !defined(WIN32DLL)
5820 static nkf_char
noconvert(FILE * f)5821 noconvert(FILE *f)
5822 {
5823 nkf_char c;
5824
5825 if (nop_f == 2)
5826 module_connection();
5827 while ((c = (*i_getc)(f)) != EOF)
5828 (*o_putc)(c);
5829 (*o_putc)(EOF);
5830 return 1;
5831 }
5832 #endif
5833
/*
 * Loop-control shorthands for a conversion loop that keeps a pending
 * first byte in a variable named c2 and the current byte in c1.
 *
 * NOTE(review): SKIP and MORE expand to two statements without braces, so
 * they are only safe as a complete statement inside a braced block (an
 * unbraced "if (x) SKIP;" would run the continue unconditionally) --
 * confirm every use site before changing them.
 */
#define NEXT continue /* emit nothing, fetch the next byte */
#define SKIP c2=0;continue /* drop the pending byte, fetch the next byte */
#define MORE c2=c1;continue /* keep c1 as pending byte, one more is needed */
#define SEND (void)0 /* fall through to output c2/c1, then fetch next */
#define LAST break /* leave the loop and go to the closing code */
/*
 * Switch input_mode to `mode`, reset the caller's local shift_mode to 0,
 * and record (and, via debug(), report) "ISO-2022-JP" as the input code
 * name.
 */
#define set_input_mode(mode) do { \
    input_mode = mode; \
    shift_mode = 0; \
    set_input_codename("ISO-2022-JP"); \
    debug("ISO-2022-JP"); \
} while (0)
5845
5846 static int
kanji_convert(FILE * f)5847 kanji_convert(FILE *f)
5848 {
5849 nkf_char c1=0, c2=0, c3=0, c4=0;
5850 int shift_mode = 0; /* 0, 1, 2, 3 */
5851 int g2 = 0;
5852 int is_8bit = FALSE;
5853
5854 if (input_encoding && !nkf_enc_asciicompat(input_encoding)) {
5855 is_8bit = TRUE;
5856 }
5857
5858 input_mode = ASCII;
5859 output_mode = ASCII;
5860
5861 if (module_connection() < 0) {
5862 #if !defined(PERL_XS) && !defined(WIN32DLL)
5863 fprintf(stderr, "no output encoding given\n");
5864 #endif
5865 return -1;
5866 }
5867 check_bom(f);
5868
5869 #ifdef UTF8_INPUT_ENABLE
5870 if(iconv == w_iconv32){
5871 while ((c1 = (*i_getc)(f)) != EOF &&
5872 (c2 = (*i_getc)(f)) != EOF &&
5873 (c3 = (*i_getc)(f)) != EOF &&
5874 (c4 = (*i_getc)(f)) != EOF) {
5875 nkf_char c5, c6, c7, c8;
5876 if (nkf_iconv_utf_32(c1, c2, c3, c4) == (size_t)NKF_ICONV_WAIT_COMBINING_CHAR) {
5877 if ((c5 = (*i_getc)(f)) != EOF &&
5878 (c6 = (*i_getc)(f)) != EOF &&
5879 (c7 = (*i_getc)(f)) != EOF &&
5880 (c8 = (*i_getc)(f)) != EOF) {
5881 if (nkf_iconv_utf_32_combine(c1, c2, c3, c4, c5, c6, c7, c8)) {
5882 (*i_ungetc)(c8, f);
5883 (*i_ungetc)(c7, f);
5884 (*i_ungetc)(c6, f);
5885 (*i_ungetc)(c5, f);
5886 nkf_iconv_utf_32_nocombine(c1, c2, c3, c4);
5887 }
5888 } else {
5889 nkf_iconv_utf_32_nocombine(c1, c2, c3, c4);
5890 }
5891 }
5892 }
5893 goto finished;
5894 }
5895 else if (iconv == w_iconv16) {
5896 while ((c1 = (*i_getc)(f)) != EOF &&
5897 (c2 = (*i_getc)(f)) != EOF) {
5898 size_t ret = nkf_iconv_utf_16(c1, c2, 0, 0);
5899 if (ret == NKF_ICONV_NEED_TWO_MORE_BYTES &&
5900 (c3 = (*i_getc)(f)) != EOF &&
5901 (c4 = (*i_getc)(f)) != EOF) {
5902 nkf_iconv_utf_16(c1, c2, c3, c4);
5903 } else if (ret == (size_t)NKF_ICONV_WAIT_COMBINING_CHAR) {
5904 if ((c3 = (*i_getc)(f)) != EOF &&
5905 (c4 = (*i_getc)(f)) != EOF) {
5906 if (nkf_iconv_utf_16_combine(c1, c2, c3, c4)) {
5907 (*i_ungetc)(c4, f);
5908 (*i_ungetc)(c3, f);
5909 nkf_iconv_utf_16_nocombine(c1, c2);
5910 }
5911 } else {
5912 nkf_iconv_utf_16_nocombine(c1, c2);
5913 }
5914 }
5915 }
5916 goto finished;
5917 }
5918 #endif
5919
5920 while ((c1 = (*i_getc)(f)) != EOF) {
5921 #ifdef INPUT_CODE_FIX
5922 if (!input_encoding)
5923 #endif
5924 code_status(c1);
5925 if (c2) {
5926 /* second byte */
5927 if (c2 > ((input_encoding && nkf_enc_cp5022x_p(input_encoding)) ? 0x92 : DEL)) {
5928 /* in case of 8th bit is on */
5929 if (!estab_f&&!mime_decode_mode) {
5930 /* in case of not established yet */
5931 /* It is still ambiguous */
5932 if (h_conv(f, c2, c1)==EOF) {
5933 LAST;
5934 }
5935 else {
5936 SKIP;
5937 }
5938 }
5939 else {
5940 /* in case of already established */
5941 if (c1 < 0x40) {
5942 /* ignore bogus code */
5943 SKIP;
5944 } else {
5945 SEND;
5946 }
5947 }
5948 }
5949 else {
5950 /* 2nd byte of 7 bit code or SJIS */
5951 SEND;
5952 }
5953 }
5954 else if (nkf_char_unicode_p(c1)) {
5955 (*oconv)(0, c1);
5956 NEXT;
5957 }
5958 else {
5959 /* first byte */
5960 if (input_mode == JIS_X_0208 && DEL <= c1 && c1 < 0x92) {
5961 /* CP5022x */
5962 MORE;
5963 }else if (input_codename && input_codename[0] == 'I' &&
5964 0xA1 <= c1 && c1 <= 0xDF) {
5965 /* JIS X 0201 Katakana in 8bit JIS */
5966 c2 = JIS_X_0201_1976_K;
5967 c1 &= 0x7f;
5968 SEND;
5969 } else if (c1 > DEL) {
5970 /* 8 bit code */
5971 if (!estab_f && !iso8859_f) {
5972 /* not established yet */
5973 MORE;
5974 } else { /* estab_f==TRUE */
5975 if (iso8859_f) {
5976 c2 = ISO_8859_1;
5977 c1 &= 0x7f;
5978 SEND;
5979 }
5980 else if ((iconv == s_iconv && 0xA0 <= c1 && c1 <= 0xDF) ||
5981 (ms_ucs_map_f == UCS_MAP_CP10001 && (c1 == 0xFD || c1 == 0xFE))) {
5982 /* JIS X 0201 */
5983 c2 = JIS_X_0201_1976_K;
5984 c1 &= 0x7f;
5985 SEND;
5986 }
5987 else {
5988 /* already established */
5989 MORE;
5990 }
5991 }
5992 } else if (SP < c1 && c1 < DEL) {
5993 /* in case of Roman characters */
5994 if (shift_mode) {
5995 /* output 1 shifted byte */
5996 if (iso8859_f) {
5997 c2 = ISO_8859_1;
5998 SEND;
5999 } else if (nkf_byte_jisx0201_katakana_p(c1)){
6000 /* output 1 shifted byte */
6001 c2 = JIS_X_0201_1976_K;
6002 SEND;
6003 } else {
6004 /* look like bogus code */
6005 SKIP;
6006 }
6007 } else if (input_mode == JIS_X_0208 || input_mode == JIS_X_0212 ||
6008 input_mode == JIS_X_0213_1 || input_mode == JIS_X_0213_2) {
6009 /* in case of Kanji shifted */
6010 MORE;
6011 } else if (c1 == '=' && mime_f && !mime_decode_mode) {
6012 /* Check MIME code */
6013 if ((c1 = (*i_getc)(f)) == EOF) {
6014 (*oconv)(0, '=');
6015 LAST;
6016 } else if (c1 == '?') {
6017 /* =? is mime conversion start sequence */
6018 if(mime_f == STRICT_MIME) {
6019 /* check in real detail */
6020 if (mime_begin_strict(f) == EOF)
6021 LAST;
6022 SKIP;
6023 } else if (mime_begin(f) == EOF)
6024 LAST;
6025 SKIP;
6026 } else {
6027 (*oconv)(0, '=');
6028 (*i_ungetc)(c1,f);
6029 SKIP;
6030 }
6031 } else {
6032 /* normal ASCII code */
6033 SEND;
6034 }
6035 } else if (c1 == SI && (!is_8bit || mime_decode_mode)) {
6036 shift_mode = 0;
6037 SKIP;
6038 } else if (c1 == SO && (!is_8bit || mime_decode_mode)) {
6039 shift_mode = 1;
6040 SKIP;
6041 } else if (c1 == ESC && (!is_8bit || mime_decode_mode)) {
6042 if ((c1 = (*i_getc)(f)) == EOF) {
6043 (*oconv)(0, ESC);
6044 LAST;
6045 }
6046 else if (c1 == '&') {
6047 /* IRR */
6048 if ((c1 = (*i_getc)(f)) == EOF) {
6049 LAST;
6050 } else {
6051 SKIP;
6052 }
6053 }
6054 else if (c1 == '$') {
6055 /* GZDMx */
6056 if ((c1 = (*i_getc)(f)) == EOF) {
6057 /* don't send bogus code
6058 (*oconv)(0, ESC);
6059 (*oconv)(0, '$'); */
6060 LAST;
6061 } else if (c1 == '@' || c1 == 'B') {
6062 /* JIS X 0208 */
6063 set_input_mode(JIS_X_0208);
6064 SKIP;
6065 } else if (c1 == '(') {
6066 /* GZDM4 */
6067 if ((c1 = (*i_getc)(f)) == EOF) {
6068 /* don't send bogus code
6069 (*oconv)(0, ESC);
6070 (*oconv)(0, '$');
6071 (*oconv)(0, '(');
6072 */
6073 LAST;
6074 } else if (c1 == '@'|| c1 == 'B') {
6075 /* JIS X 0208 */
6076 set_input_mode(JIS_X_0208);
6077 SKIP;
6078 #ifdef X0212_ENABLE
6079 } else if (c1 == 'D'){
6080 set_input_mode(JIS_X_0212);
6081 SKIP;
6082 #endif /* X0212_ENABLE */
6083 } else if (c1 == 'O' || c1 == 'Q'){
6084 set_input_mode(JIS_X_0213_1);
6085 SKIP;
6086 } else if (c1 == 'P'){
6087 set_input_mode(JIS_X_0213_2);
6088 SKIP;
6089 } else {
6090 /* could be some special code */
6091 (*oconv)(0, ESC);
6092 (*oconv)(0, '$');
6093 (*oconv)(0, '(');
6094 (*oconv)(0, c1);
6095 SKIP;
6096 }
6097 } else if (broken_f&0x2) {
6098 /* accept any ESC-(-x as broken code ... */
6099 input_mode = JIS_X_0208;
6100 shift_mode = 0;
6101 SKIP;
6102 } else {
6103 (*oconv)(0, ESC);
6104 (*oconv)(0, '$');
6105 (*oconv)(0, c1);
6106 SKIP;
6107 }
6108 } else if (c1 == '(') {
6109 /* GZD4 */
6110 if ((c1 = (*i_getc)(f)) == EOF) {
6111 /* don't send bogus code
6112 (*oconv)(0, ESC);
6113 (*oconv)(0, '('); */
6114 LAST;
6115 }
6116 else if (c1 == 'I') {
6117 /* JIS X 0201 Katakana */
6118 set_input_mode(JIS_X_0201_1976_K);
6119 shift_mode = 1;
6120 SKIP;
6121 }
6122 else if (c1 == 'B' || c1 == 'J' || c1 == 'H') {
6123 /* ISO-646IRV:1983 or JIS X 0201 Roman or JUNET */
6124 set_input_mode(ASCII);
6125 SKIP;
6126 }
6127 else if (broken_f&0x2) {
6128 set_input_mode(ASCII);
6129 SKIP;
6130 }
6131 else {
6132 (*oconv)(0, ESC);
6133 (*oconv)(0, '(');
6134 SEND;
6135 }
6136 }
6137 else if (c1 == '.') {
6138 /* G2D6 */
6139 if ((c1 = (*i_getc)(f)) == EOF) {
6140 LAST;
6141 }
6142 else if (c1 == 'A') {
6143 /* ISO-8859-1 */
6144 g2 = ISO_8859_1;
6145 SKIP;
6146 }
6147 else {
6148 (*oconv)(0, ESC);
6149 (*oconv)(0, '.');
6150 SEND;
6151 }
6152 }
6153 else if (c1 == 'N') {
6154 /* SS2 */
6155 c1 = (*i_getc)(f);
6156 if (g2 == ISO_8859_1) {
6157 c2 = ISO_8859_1;
6158 SEND;
6159 }else{
6160 (*i_ungetc)(c1, f);
6161 /* lonely ESC */
6162 (*oconv)(0, ESC);
6163 SEND;
6164 }
6165 }
6166 else {
6167 i_ungetc(c1,f);
6168 /* lonely ESC */
6169 (*oconv)(0, ESC);
6170 SKIP;
6171 }
6172 } else if (c1 == ESC && iconv == s_iconv) {
6173 /* ESC in Shift_JIS */
6174 if ((c1 = (*i_getc)(f)) == EOF) {
6175 (*oconv)(0, ESC);
6176 LAST;
6177 } else if (c1 == '$') {
6178 /* J-PHONE emoji */
6179 if ((c1 = (*i_getc)(f)) == EOF) {
6180 LAST;
6181 } else if (('E' <= c1 && c1 <= 'G') ||
6182 ('O' <= c1 && c1 <= 'Q')) {
6183 /*
6184 NUM : 0 1 2 3 4 5
6185 BYTE: G E F O P Q
6186 C%7 : 1 6 0 2 3 4
6187 C%7 : 0 1 2 3 4 5 6
6188 NUM : 2 0 3 4 5 X 1
6189 */
6190 static const nkf_char jphone_emoji_first_table[7] =
6191 {0xE1E0, 0xDFE0, 0xE2E0, 0xE3E0, 0xE4E0, 0xDFE0, 0xE0E0};
6192 c3 = nkf_char_unicode_new(jphone_emoji_first_table[c1 % 7]);
6193 if ((c1 = (*i_getc)(f)) == EOF) LAST;
6194 while (SP <= c1 && c1 <= 'z') {
6195 (*oconv)(0, c1 + c3);
6196 if ((c1 = (*i_getc)(f)) == EOF) LAST;
6197 }
6198 SKIP;
6199 }
6200 else {
6201 (*oconv)(0, ESC);
6202 (*oconv)(0, '$');
6203 SEND;
6204 }
6205 }
6206 else {
6207 i_ungetc(c1,f);
6208 /* lonely ESC */
6209 (*oconv)(0, ESC);
6210 SKIP;
6211 }
6212 } else if (c1 == LF || c1 == CR) {
6213 if (broken_f&4) {
6214 input_mode = ASCII; set_iconv(FALSE, 0);
6215 SEND;
6216 } else if (mime_decode_f && !mime_decode_mode){
6217 if (c1 == LF) {
6218 if ((c1=(*i_getc)(f))!=EOF && c1 == SP) {
6219 i_ungetc(SP,f);
6220 continue;
6221 } else {
6222 i_ungetc(c1,f);
6223 }
6224 c1 = LF;
6225 SEND;
6226 } else { /* if (c1 == CR)*/
6227 if ((c1=(*i_getc)(f))!=EOF) {
6228 if (c1==SP) {
6229 i_ungetc(SP,f);
6230 continue;
6231 } else if (c1 == LF && (c1=(*i_getc)(f))!=EOF && c1 == SP) {
6232 i_ungetc(SP,f);
6233 continue;
6234 } else {
6235 i_ungetc(c1,f);
6236 }
6237 i_ungetc(LF,f);
6238 } else {
6239 i_ungetc(c1,f);
6240 }
6241 c1 = CR;
6242 SEND;
6243 }
6244 }
6245 } else
6246 SEND;
6247 }
6248 /* send: */
6249 switch(input_mode){
6250 case ASCII:
6251 switch ((*iconv)(c2, c1, 0)) { /* can be EUC / SJIS / UTF-8 */
6252 case -2:
6253 /* 4 bytes UTF-8 */
6254 if ((c3 = (*i_getc)(f)) != EOF) {
6255 code_status(c3);
6256 c3 <<= 8;
6257 if ((c4 = (*i_getc)(f)) != EOF) {
6258 code_status(c4);
6259 (*iconv)(c2, c1, c3|c4);
6260 }
6261 }
6262 break;
6263 case -3:
6264 /* 4 bytes UTF-8 (check combining character) */
6265 if ((c3 = (*i_getc)(f)) != EOF) {
6266 if ((c4 = (*i_getc)(f)) != EOF) {
6267 if (w_iconv_combine(c2, c1, 0, c3, c4, 0)) {
6268 (*i_ungetc)(c4, f);
6269 (*i_ungetc)(c3, f);
6270 w_iconv_nocombine(c2, c1, 0);
6271 }
6272 } else {
6273 (*i_ungetc)(c3, f);
6274 w_iconv_nocombine(c2, c1, 0);
6275 }
6276 } else {
6277 w_iconv_nocombine(c2, c1, 0);
6278 }
6279 break;
6280 case -1:
6281 /* 3 bytes EUC or UTF-8 */
6282 if ((c3 = (*i_getc)(f)) != EOF) {
6283 code_status(c3);
6284 if ((*iconv)(c2, c1, c3) == -3) {
6285 /* 6 bytes UTF-8 (check combining character) */
6286 nkf_char c5, c6;
6287 if ((c4 = (*i_getc)(f)) != EOF) {
6288 if ((c5 = (*i_getc)(f)) != EOF) {
6289 if ((c6 = (*i_getc)(f)) != EOF) {
6290 if (w_iconv_combine(c2, c1, c3, c4, c5, c6)) {
6291 (*i_ungetc)(c6, f);
6292 (*i_ungetc)(c5, f);
6293 (*i_ungetc)(c4, f);
6294 w_iconv_nocombine(c2, c1, c3);
6295 }
6296 } else {
6297 (*i_ungetc)(c5, f);
6298 (*i_ungetc)(c4, f);
6299 w_iconv_nocombine(c2, c1, c3);
6300 }
6301 } else {
6302 (*i_ungetc)(c4, f);
6303 w_iconv_nocombine(c2, c1, c3);
6304 }
6305 } else {
6306 w_iconv_nocombine(c2, c1, c3);
6307 }
6308 }
6309 }
6310 break;
6311 }
6312 break;
6313 case JIS_X_0208:
6314 case JIS_X_0213_1:
6315 if (ms_ucs_map_f &&
6316 0x7F <= c2 && c2 <= 0x92 &&
6317 0x21 <= c1 && c1 <= 0x7E) {
6318 /* CP932 UDC */
6319 c1 = nkf_char_unicode_new((c2 - 0x7F) * 94 + c1 - 0x21 + 0xE000);
6320 c2 = 0;
6321 }
6322 (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */
6323 break;
6324 #ifdef X0212_ENABLE
6325 case JIS_X_0212:
6326 (*oconv)(PREFIX_EUCG3 | c2, c1);
6327 break;
6328 #endif /* X0212_ENABLE */
6329 case JIS_X_0213_2:
6330 (*oconv)(PREFIX_EUCG3 | c2, c1);
6331 break;
6332 default:
6333 (*oconv)(input_mode, c1); /* other special case */
6334 }
6335
6336 c2 = 0;
6337 c3 = 0;
6338 continue;
6339 /* goto next_word */
6340 }
6341
6342 finished:
6343 /* epilogue */
6344 (*iconv)(EOF, 0, 0);
6345 if (!input_codename)
6346 {
6347 if (is_8bit) {
6348 struct input_code *p = input_code_list;
6349 struct input_code *result = p;
6350 while (p->name){
6351 if (p->score < result->score) result = p;
6352 ++p;
6353 }
6354 set_input_codename(result->name);
6355 #ifdef CHECK_OPTION
6356 debug(result->name);
6357 #endif
6358 }
6359 }
6360 return 0;
6361 }
6362
6363 /*
6364 * int options(unsigned char *cp)
6365 *
6366 * return values:
6367 * 0: success
6368 * -1: ArgumentError
6369 */
6370 static int
options(unsigned char * cp)6371 options(unsigned char *cp)
6372 {
6373 nkf_char i, j;
6374 unsigned char *p;
6375 unsigned char *cp_back = NULL;
6376 nkf_encoding *enc;
6377
6378 if (option_mode==1)
6379 return 0;
6380 while(*cp && *cp++!='-');
6381 while (*cp || cp_back) {
6382 if(!*cp){
6383 cp = cp_back;
6384 cp_back = NULL;
6385 continue;
6386 }
6387 p = 0;
6388 switch (*cp++) {
6389 case '-': /* literal options */
6390 if (!*cp || *cp == SP) { /* ignore the rest of arguments */
6391 option_mode = 1;
6392 return 0;
6393 }
6394 for (i=0;i<(int)(sizeof(long_option)/sizeof(long_option[0]));i++) {
6395 p = (unsigned char *)long_option[i].name;
6396 for (j=0;*p && *p != '=' && *p == cp[j];p++, j++);
6397 if (*p == cp[j] || cp[j] == SP){
6398 p = &cp[j] + 1;
6399 break;
6400 }
6401 p = 0;
6402 }
6403 if (p == 0) {
6404 #if !defined(PERL_XS) && !defined(WIN32DLL)
6405 fprintf(stderr, "unknown long option: --%s\n", cp);
6406 #endif
6407 return -1;
6408 }
6409 while(*cp && *cp != SP && cp++);
6410 if (long_option[i].alias[0]){
6411 cp_back = cp;
6412 cp = (unsigned char *)long_option[i].alias;
6413 }else{
6414 #ifndef PERL_XS
6415 if (strcmp(long_option[i].name, "help") == 0){
6416 usage();
6417 exit(EXIT_SUCCESS);
6418 }
6419 #endif
6420 if (strcmp(long_option[i].name, "ic=") == 0){
6421 enc = nkf_enc_find((char *)p);
6422 if (!enc) continue;
6423 input_encoding = enc;
6424 continue;
6425 }
6426 if (strcmp(long_option[i].name, "oc=") == 0){
6427 enc = nkf_enc_find((char *)p);
6428 /* if (enc <= 0) continue; */
6429 if (!enc) continue;
6430 output_encoding = enc;
6431 continue;
6432 }
6433 if (strcmp(long_option[i].name, "guess=") == 0){
6434 if (p[0] == '0' || p[0] == '1') {
6435 guess_f = 1;
6436 } else {
6437 guess_f = 2;
6438 }
6439 continue;
6440 }
6441 #ifdef OVERWRITE
6442 if (strcmp(long_option[i].name, "overwrite") == 0){
6443 file_out_f = TRUE;
6444 overwrite_f = TRUE;
6445 preserve_time_f = TRUE;
6446 continue;
6447 }
6448 if (strcmp(long_option[i].name, "overwrite=") == 0){
6449 file_out_f = TRUE;
6450 overwrite_f = TRUE;
6451 preserve_time_f = TRUE;
6452 backup_f = TRUE;
6453 backup_suffix = (char *)p;
6454 continue;
6455 }
6456 if (strcmp(long_option[i].name, "in-place") == 0){
6457 file_out_f = TRUE;
6458 overwrite_f = TRUE;
6459 preserve_time_f = FALSE;
6460 continue;
6461 }
6462 if (strcmp(long_option[i].name, "in-place=") == 0){
6463 file_out_f = TRUE;
6464 overwrite_f = TRUE;
6465 preserve_time_f = FALSE;
6466 backup_f = TRUE;
6467 backup_suffix = (char *)p;
6468 continue;
6469 }
6470 #endif
6471 #ifdef INPUT_OPTION
6472 if (strcmp(long_option[i].name, "cap-input") == 0){
6473 cap_f = TRUE;
6474 continue;
6475 }
6476 if (strcmp(long_option[i].name, "url-input") == 0){
6477 url_f = TRUE;
6478 continue;
6479 }
6480 #endif
6481 #ifdef NUMCHAR_OPTION
6482 if (strcmp(long_option[i].name, "numchar-input") == 0){
6483 numchar_f = TRUE;
6484 continue;
6485 }
6486 #endif
6487 #ifdef CHECK_OPTION
6488 if (strcmp(long_option[i].name, "no-output") == 0){
6489 noout_f = TRUE;
6490 continue;
6491 }
6492 if (strcmp(long_option[i].name, "debug") == 0){
6493 debug_f = TRUE;
6494 continue;
6495 }
6496 #endif
6497 if (strcmp(long_option[i].name, "cp932") == 0){
6498 #ifdef SHIFTJIS_CP932
6499 cp51932_f = TRUE;
6500 cp932inv_f = -TRUE;
6501 #endif
6502 #ifdef UTF8_OUTPUT_ENABLE
6503 ms_ucs_map_f = UCS_MAP_CP932;
6504 #endif
6505 continue;
6506 }
6507 if (strcmp(long_option[i].name, "no-cp932") == 0){
6508 #ifdef SHIFTJIS_CP932
6509 cp51932_f = FALSE;
6510 cp932inv_f = FALSE;
6511 #endif
6512 #ifdef UTF8_OUTPUT_ENABLE
6513 ms_ucs_map_f = UCS_MAP_ASCII;
6514 #endif
6515 continue;
6516 }
6517 #ifdef SHIFTJIS_CP932
6518 if (strcmp(long_option[i].name, "cp932inv") == 0){
6519 cp932inv_f = -TRUE;
6520 continue;
6521 }
6522 #endif
6523
6524 #ifdef X0212_ENABLE
6525 if (strcmp(long_option[i].name, "x0212") == 0){
6526 x0212_f = TRUE;
6527 continue;
6528 }
6529 #endif
6530
6531 #ifdef EXEC_IO
6532 if (strcmp(long_option[i].name, "exec-in") == 0){
6533 exec_f = 1;
6534 return 0;
6535 }
6536 if (strcmp(long_option[i].name, "exec-out") == 0){
6537 exec_f = -1;
6538 return 0;
6539 }
6540 #endif
6541 #if defined(UTF8_OUTPUT_ENABLE) && defined(UTF8_INPUT_ENABLE)
6542 if (strcmp(long_option[i].name, "no-cp932ext") == 0){
6543 no_cp932ext_f = TRUE;
6544 continue;
6545 }
6546 if (strcmp(long_option[i].name, "no-best-fit-chars") == 0){
6547 no_best_fit_chars_f = TRUE;
6548 continue;
6549 }
6550 if (strcmp(long_option[i].name, "fb-skip") == 0){
6551 encode_fallback = NULL;
6552 continue;
6553 }
6554 if (strcmp(long_option[i].name, "fb-html") == 0){
6555 encode_fallback = encode_fallback_html;
6556 continue;
6557 }
6558 if (strcmp(long_option[i].name, "fb-xml") == 0){
6559 encode_fallback = encode_fallback_xml;
6560 continue;
6561 }
6562 if (strcmp(long_option[i].name, "fb-java") == 0){
6563 encode_fallback = encode_fallback_java;
6564 continue;
6565 }
6566 if (strcmp(long_option[i].name, "fb-perl") == 0){
6567 encode_fallback = encode_fallback_perl;
6568 continue;
6569 }
6570 if (strcmp(long_option[i].name, "fb-subchar") == 0){
6571 encode_fallback = encode_fallback_subchar;
6572 continue;
6573 }
6574 if (strcmp(long_option[i].name, "fb-subchar=") == 0){
6575 encode_fallback = encode_fallback_subchar;
6576 unicode_subchar = 0;
6577 if (p[0] != '0'){
6578 /* decimal number */
6579 for (i = 0; i < 7 && nkf_isdigit(p[i]); i++){
6580 unicode_subchar *= 10;
6581 unicode_subchar += hex2bin(p[i]);
6582 }
6583 }else if(p[1] == 'x' || p[1] == 'X'){
6584 /* hexadecimal number */
6585 for (i = 2; i < 8 && nkf_isxdigit(p[i]); i++){
6586 unicode_subchar <<= 4;
6587 unicode_subchar |= hex2bin(p[i]);
6588 }
6589 }else{
6590 /* octal number */
6591 for (i = 1; i < 8 && nkf_isoctal(p[i]); i++){
6592 unicode_subchar *= 8;
6593 unicode_subchar += hex2bin(p[i]);
6594 }
6595 }
6596 w16e_conv(unicode_subchar, &i, &j);
6597 unicode_subchar = i<<8 | j;
6598 continue;
6599 }
6600 #endif
6601 #ifdef UTF8_OUTPUT_ENABLE
6602 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
6603 ms_ucs_map_f = UCS_MAP_MS;
6604 continue;
6605 }
6606 #endif
6607 #ifdef UNICODE_NORMALIZATION
6608 if (strcmp(long_option[i].name, "utf8mac-input") == 0){
6609 nfc_f = TRUE;
6610 continue;
6611 }
6612 #endif
6613 if (strcmp(long_option[i].name, "prefix=") == 0){
6614 if (nkf_isgraph(p[0])){
6615 for (i = 1; nkf_isgraph(p[i]); i++){
6616 prefix_table[p[i]] = p[0];
6617 }
6618 }
6619 continue;
6620 }
6621 #if !defined(PERL_XS) && !defined(WIN32DLL)
6622 fprintf(stderr, "unsupported long option: --%s\n", long_option[i].name);
6623 #endif
6624 return -1;
6625 }
6626 continue;
6627 case 'b': /* buffered mode */
6628 unbuf_f = FALSE;
6629 continue;
6630 case 'u': /* non bufferd mode */
6631 unbuf_f = TRUE;
6632 continue;
6633 case 't': /* transparent mode */
6634 if (*cp=='1') {
6635 /* alias of -t */
6636 cp++;
6637 nop_f = TRUE;
6638 } else if (*cp=='2') {
6639 /*
6640 * -t with put/get
6641 *
6642 * nkf -t2MB hoge.bin | nkf -t2mB | diff -s - hoge.bin
6643 *
6644 */
6645 cp++;
6646 nop_f = 2;
6647 } else
6648 nop_f = TRUE;
6649 continue;
6650 case 'j': /* JIS output */
6651 case 'n':
6652 output_encoding = nkf_enc_from_index(ISO_2022_JP);
6653 continue;
6654 case 'e': /* AT&T EUC output */
6655 output_encoding = nkf_enc_from_index(EUCJP_NKF);
6656 continue;
6657 case 's': /* SJIS output */
6658 output_encoding = nkf_enc_from_index(SHIFT_JIS);
6659 continue;
6660 case 'l': /* ISO8859 Latin-1 support, no conversion */
6661 iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */
6662 input_encoding = nkf_enc_from_index(ISO_8859_1);
6663 continue;
6664 case 'i': /* Kanji IN ESC-$-@/B */
6665 if (*cp=='@'||*cp=='B')
6666 kanji_intro = *cp++;
6667 continue;
6668 case 'o': /* ASCII IN ESC-(-J/B/H */
6669 /* ESC ( H was used in initial JUNET messages */
6670 if (*cp=='J'||*cp=='B'||*cp=='H')
6671 ascii_intro = *cp++;
6672 continue;
6673 case 'h':
6674 /*
6675 bit:1 katakana->hiragana
6676 bit:2 hiragana->katakana
6677 */
6678 if ('9'>= *cp && *cp>='0')
6679 hira_f |= (*cp++ -'0');
6680 else
6681 hira_f |= 1;
6682 continue;
6683 case 'r':
6684 rot_f = TRUE;
6685 continue;
6686 #if defined(MSDOS) || defined(__OS2__)
6687 case 'T':
6688 binmode_f = FALSE;
6689 continue;
6690 #endif
6691 #ifndef PERL_XS
6692 case 'V':
6693 show_configuration();
6694 exit(EXIT_SUCCESS);
6695 break;
6696 case 'v':
6697 version();
6698 exit(EXIT_SUCCESS);
6699 break;
6700 #endif
6701 #ifdef UTF8_OUTPUT_ENABLE
6702 case 'w': /* UTF-{8,16,32} output */
6703 if (cp[0] == '8') {
6704 cp++;
6705 if (cp[0] == '0'){
6706 cp++;
6707 output_encoding = nkf_enc_from_index(UTF_8N);
6708 } else {
6709 output_bom_f = TRUE;
6710 output_encoding = nkf_enc_from_index(UTF_8_BOM);
6711 }
6712 } else {
6713 int enc_idx;
6714 if ('1'== cp[0] && '6'==cp[1]) {
6715 cp += 2;
6716 enc_idx = UTF_16;
6717 } else if ('3'== cp[0] && '2'==cp[1]) {
6718 cp += 2;
6719 enc_idx = UTF_32;
6720 } else {
6721 output_encoding = nkf_enc_from_index(UTF_8);
6722 continue;
6723 }
6724 if (cp[0]=='L') {
6725 cp++;
6726 output_endian = ENDIAN_LITTLE;
6727 output_bom_f = TRUE;
6728 } else if (cp[0] == 'B') {
6729 cp++;
6730 output_bom_f = TRUE;
6731 }
6732 if (cp[0] == '0'){
6733 output_bom_f = FALSE;
6734 cp++;
6735 enc_idx = enc_idx == UTF_16
6736 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
6737 : (output_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE);
6738 } else {
6739 enc_idx = enc_idx == UTF_16
6740 ? (output_endian == ENDIAN_LITTLE ? UTF_16LE_BOM : UTF_16BE_BOM)
6741 : (output_endian == ENDIAN_LITTLE ? UTF_32LE_BOM : UTF_32BE_BOM);
6742 }
6743 output_encoding = nkf_enc_from_index(enc_idx);
6744 }
6745 continue;
6746 #endif
6747 #ifdef UTF8_INPUT_ENABLE
6748 case 'W': /* UTF input */
6749 if (cp[0] == '8') {
6750 cp++;
6751 input_encoding = nkf_enc_from_index(UTF_8);
6752 }else{
6753 int enc_idx;
6754 if ('1'== cp[0] && '6'==cp[1]) {
6755 cp += 2;
6756 input_endian = ENDIAN_BIG;
6757 enc_idx = UTF_16;
6758 } else if ('3'== cp[0] && '2'==cp[1]) {
6759 cp += 2;
6760 input_endian = ENDIAN_BIG;
6761 enc_idx = UTF_32;
6762 } else {
6763 input_encoding = nkf_enc_from_index(UTF_8);
6764 continue;
6765 }
6766 if (cp[0]=='L') {
6767 cp++;
6768 input_endian = ENDIAN_LITTLE;
6769 } else if (cp[0] == 'B') {
6770 cp++;
6771 input_endian = ENDIAN_BIG;
6772 }
6773 enc_idx = (enc_idx == UTF_16
6774 ? (input_endian == ENDIAN_LITTLE ? UTF_16LE : UTF_16BE)
6775 : (input_endian == ENDIAN_LITTLE ? UTF_32LE : UTF_32BE));
6776 input_encoding = nkf_enc_from_index(enc_idx);
6777 }
6778 continue;
6779 #endif
6780 /* Input code assumption */
6781 case 'J': /* ISO-2022-JP input */
6782 input_encoding = nkf_enc_from_index(ISO_2022_JP);
6783 continue;
6784 case 'E': /* EUC-JP input */
6785 input_encoding = nkf_enc_from_index(EUCJP_NKF);
6786 continue;
6787 case 'S': /* Shift_JIS input */
6788 input_encoding = nkf_enc_from_index(SHIFT_JIS);
6789 continue;
6790 case 'Z': /* Convert X0208 alphabet to asii */
6791 /* alpha_f
6792 bit:0 Convert JIS X 0208 Alphabet to ASCII
6793 bit:1 Convert Kankaku to one space
6794 bit:2 Convert Kankaku to two spaces
6795 bit:3 Convert HTML Entity
6796 bit:4 Convert JIS X 0208 Katakana to JIS X 0201 Katakana
6797 */
6798 while ('0'<= *cp && *cp <='4') {
6799 alpha_f |= 1 << (*cp++ - '0');
6800 }
6801 alpha_f |= 1;
6802 continue;
6803 case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */
6804 x0201_f = FALSE; /* No X0201->X0208 conversion */
6805 /* accept X0201
6806 ESC-(-I in JIS, EUC, MS Kanji
6807 SI/SO in JIS, EUC, MS Kanji
6808 SS2 in EUC, JIS, not in MS Kanji
6809 MS Kanji (0xa0-0xdf)
6810 output X0201
6811 ESC-(-I in JIS (0x20-0x5f)
6812 SS2 in EUC (0xa0-0xdf)
6813 0xa0-0xd in MS Kanji (0xa0-0xdf)
6814 */
6815 continue;
6816 case 'X': /* Convert X0201 kana to X0208 */
6817 x0201_f = TRUE;
6818 continue;
6819 case 'F': /* prserve new lines */
6820 fold_preserve_f = TRUE;
6821 case 'f': /* folding -f60 or -f */
6822 fold_f = TRUE;
6823 fold_len = 0;
6824 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6825 fold_len *= 10;
6826 fold_len += *cp++ - '0';
6827 }
6828 if (!(0<fold_len && fold_len<BUFSIZ))
6829 fold_len = DEFAULT_FOLD;
6830 if (*cp=='-') {
6831 fold_margin = 0;
6832 cp++;
6833 while('0'<= *cp && *cp <='9') { /* we don't use atoi here */
6834 fold_margin *= 10;
6835 fold_margin += *cp++ - '0';
6836 }
6837 }
6838 continue;
6839 case 'm': /* MIME support */
6840 /* mime_decode_f = TRUE; */ /* this has too large side effects... */
6841 if (*cp=='B'||*cp=='Q') {
6842 mime_decode_mode = *cp++;
6843 mimebuf_f = FIXED_MIME;
6844 } else if (*cp=='N') {
6845 mime_f = TRUE; cp++;
6846 } else if (*cp=='S') {
6847 mime_f = STRICT_MIME; cp++;
6848 } else if (*cp=='0') {
6849 mime_decode_f = FALSE;
6850 mime_f = FALSE; cp++;
6851 } else {
6852 mime_f = STRICT_MIME;
6853 }
6854 continue;
6855 case 'M': /* MIME output */
6856 if (*cp=='B') {
6857 mimeout_mode = 'B';
6858 mimeout_f = FIXED_MIME; cp++;
6859 } else if (*cp=='Q') {
6860 mimeout_mode = 'Q';
6861 mimeout_f = FIXED_MIME; cp++;
6862 } else {
6863 mimeout_f = TRUE;
6864 }
6865 continue;
6866 case 'B': /* Broken JIS support */
6867 /* bit:0 no ESC JIS
6868 bit:1 allow any x on ESC-(-x or ESC-$-x
6869 bit:2 reset to ascii on NL
6870 */
6871 if ('9'>= *cp && *cp>='0')
6872 broken_f |= 1<<(*cp++ -'0');
6873 else
6874 broken_f |= TRUE;
6875 continue;
6876 #ifndef PERL_XS
6877 case 'O':/* for Output file */
6878 file_out_f = TRUE;
6879 continue;
6880 #endif
6881 case 'c':/* add cr code */
6882 eolmode_f = CRLF;
6883 continue;
6884 case 'd':/* delete cr code */
6885 eolmode_f = LF;
6886 continue;
6887 case 'I': /* ISO-2022-JP output */
6888 iso2022jp_f = TRUE;
6889 continue;
6890 case 'L': /* line mode */
6891 if (*cp=='u') { /* unix */
6892 eolmode_f = LF; cp++;
6893 } else if (*cp=='m') { /* mac */
6894 eolmode_f = CR; cp++;
6895 } else if (*cp=='w') { /* windows */
6896 eolmode_f = CRLF; cp++;
6897 } else if (*cp=='0') { /* no conversion */
6898 eolmode_f = 0; cp++;
6899 }
6900 continue;
6901 #ifndef PERL_XS
6902 case 'g':
6903 if ('2' <= *cp && *cp <= '9') {
6904 guess_f = 2;
6905 cp++;
6906 } else if (*cp == '0' || *cp == '1') {
6907 guess_f = 1;
6908 cp++;
6909 } else {
6910 guess_f = 1;
6911 }
6912 continue;
6913 #endif
6914 case SP:
6915 /* module multiple options in a string are allowed for Perl module */
6916 while(*cp && *cp++!='-');
6917 continue;
6918 default:
6919 #if !defined(PERL_XS) && !defined(WIN32DLL)
6920 fprintf(stderr, "unknown option: -%c\n", *(cp-1));
6921 #endif
6922 /* bogus option but ignored */
6923 return -1;
6924 }
6925 }
6926 return 0;
6927 }
6928
6929 #ifdef WIN32DLL
6930 #include "nkf32dll.c"
6931 #elif defined(PERL_XS)
6932 #else /* WIN32DLL */
6933 int
main(int argc,char ** argv)6934 main(int argc, char **argv)
6935 {
6936 FILE *fin;
6937 unsigned char *cp;
6938
6939 char *outfname = NULL;
6940 char *origfname;
6941
6942 #ifdef EASYWIN /*Easy Win */
6943 _BufferSize.y = 400;/*Set Scroll Buffer Size*/
6944 #endif
6945 #ifdef DEFAULT_CODE_LOCALE
6946 setlocale(LC_CTYPE, "");
6947 #endif
6948 nkf_state_init();
6949
6950 for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) {
6951 cp = (unsigned char *)*argv;
6952 options(cp);
6953 #ifdef EXEC_IO
6954 if (exec_f){
6955 int fds[2], pid;
6956 if (pipe(fds) < 0 || (pid = fork()) < 0){
6957 abort();
6958 }
6959 if (pid == 0){
6960 if (exec_f > 0){
6961 close(fds[0]);
6962 dup2(fds[1], 1);
6963 }else{
6964 close(fds[1]);
6965 dup2(fds[0], 0);
6966 }
6967 execvp(argv[1], &argv[1]);
6968 }
6969 if (exec_f > 0){
6970 close(fds[1]);
6971 dup2(fds[0], 0);
6972 }else{
6973 close(fds[0]);
6974 dup2(fds[1], 1);
6975 }
6976 argc = 0;
6977 break;
6978 }
6979 #endif
6980 }
6981
6982 if (guess_f) {
6983 #ifdef CHECK_OPTION
6984 int debug_f_back = debug_f;
6985 #endif
6986 #ifdef EXEC_IO
6987 int exec_f_back = exec_f;
6988 #endif
6989 #ifdef X0212_ENABLE
6990 int x0212_f_back = x0212_f;
6991 #endif
6992 int x0213_f_back = x0213_f;
6993 int guess_f_back = guess_f;
6994 reinit();
6995 guess_f = guess_f_back;
6996 mime_f = FALSE;
6997 #ifdef CHECK_OPTION
6998 debug_f = debug_f_back;
6999 #endif
7000 #ifdef EXEC_IO
7001 exec_f = exec_f_back;
7002 #endif
7003 x0212_f = x0212_f_back;
7004 x0213_f = x0213_f_back;
7005 }
7006
7007 if (binmode_f == TRUE)
7008 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
7009 if (freopen("","wb",stdout) == NULL)
7010 return (-1);
7011 #else
7012 setbinmode(stdout);
7013 #endif
7014
7015 if (unbuf_f)
7016 setbuf(stdout, (char *) NULL);
7017 else
7018 setvbuffer(stdout, (char *) stdobuf, IOBUF_SIZE);
7019
7020 if (argc == 0) {
7021 if (binmode_f == TRUE)
7022 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
7023 if (freopen("","rb",stdin) == NULL) return (-1);
7024 #else
7025 setbinmode(stdin);
7026 #endif
7027 setvbuffer(stdin, (char *) stdibuf, IOBUF_SIZE);
7028 if (nop_f)
7029 noconvert(stdin);
7030 else {
7031 kanji_convert(stdin);
7032 if (guess_f) print_guessed_code(NULL);
7033 }
7034 } else {
7035 int nfiles = argc;
7036 int is_argument_error = FALSE;
7037 while (argc--) {
7038 input_codename = NULL;
7039 input_eol = 0;
7040 #ifdef CHECK_OPTION
7041 iconv_for_check = 0;
7042 #endif
7043 if ((fin = fopen((origfname = *argv++), "r")) == NULL) {
7044 perror(*(argv-1));
7045 is_argument_error = TRUE;
7046 continue;
7047 } else {
7048 #ifdef OVERWRITE
7049 int fd = 0;
7050 int fd_backup = 0;
7051 #endif
7052
7053 /* reopen file for stdout */
7054 if (file_out_f == TRUE) {
7055 #ifdef OVERWRITE
7056 if (overwrite_f){
7057 outfname = nkf_xmalloc(strlen(origfname)
7058 + strlen(".nkftmpXXXXXX")
7059 + 1);
7060 strcpy(outfname, origfname);
7061 #ifdef MSDOS
7062 {
7063 int i;
7064 for (i = strlen(outfname); i; --i){
7065 if (outfname[i - 1] == '/'
7066 || outfname[i - 1] == '\\'){
7067 break;
7068 }
7069 }
7070 outfname[i] = '\0';
7071 }
7072 strcat(outfname, "ntXXXXXX");
7073 mktemp(outfname);
7074 fd = open(outfname, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL,
7075 S_IREAD | S_IWRITE);
7076 #else
7077 strcat(outfname, ".nkftmpXXXXXX");
7078 fd = mkstemp(outfname);
7079 #endif
7080 if (fd < 0
7081 || (fd_backup = dup(fileno(stdout))) < 0
7082 || dup2(fd, fileno(stdout)) < 0
7083 ){
7084 perror(origfname);
7085 return -1;
7086 }
7087 }else
7088 #endif
7089 if(argc == 1) {
7090 outfname = *argv++;
7091 argc--;
7092 } else {
7093 outfname = "nkf.out";
7094 }
7095
7096 if(freopen(outfname, "w", stdout) == NULL) {
7097 perror (outfname);
7098 return (-1);
7099 }
7100 if (binmode_f == TRUE) {
7101 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
7102 if (freopen("","wb",stdout) == NULL)
7103 return (-1);
7104 #else
7105 setbinmode(stdout);
7106 #endif
7107 }
7108 }
7109 if (binmode_f == TRUE)
7110 #if defined(__OS2__) && (defined(__IBMC__) || defined(__IBMCPP__))
7111 if (freopen("","rb",fin) == NULL)
7112 return (-1);
7113 #else
7114 setbinmode(fin);
7115 #endif
7116 setvbuffer(fin, (char *) stdibuf, IOBUF_SIZE);
7117 if (nop_f)
7118 noconvert(fin);
7119 else {
7120 char *filename = NULL;
7121 kanji_convert(fin);
7122 if (nfiles > 1) filename = origfname;
7123 if (guess_f) print_guessed_code(filename);
7124 }
7125 fclose(fin);
7126 #ifdef OVERWRITE
7127 if (overwrite_f) {
7128 struct stat sb;
7129 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
7130 time_t tb[2];
7131 #else
7132 struct utimbuf tb;
7133 #endif
7134
7135 fflush(stdout);
7136 close(fd);
7137 if (dup2(fd_backup, fileno(stdout)) < 0){
7138 perror("dup2");
7139 }
7140 if (stat(origfname, &sb)) {
7141 fprintf(stderr, "Can't stat %s\n", origfname);
7142 }
7143 /* $B%Q!<%_%C%7%g%s$rI|85(B */
7144 if (chmod(outfname, sb.st_mode)) {
7145 fprintf(stderr, "Can't set permission %s\n", outfname);
7146 }
7147
7148 /* $B%?%$%`%9%?%s%W$rI|85(B */
7149 if(preserve_time_f){
7150 #if defined(MSDOS) && !defined(__MINGW32__) && !defined(__WIN32__) && !defined(__WATCOMC__) && !defined(__EMX__) && !defined(__OS2__) && !defined(__DJGPP__)
7151 tb[0] = tb[1] = sb.st_mtime;
7152 if (utime(outfname, tb)) {
7153 fprintf(stderr, "Can't set timestamp %s\n", outfname);
7154 }
7155 #else
7156 tb.actime = sb.st_atime;
7157 tb.modtime = sb.st_mtime;
7158 if (utime(outfname, &tb)) {
7159 fprintf(stderr, "Can't set timestamp %s\n", outfname);
7160 }
7161 #endif
7162 }
7163 if(backup_f){
7164 char *backup_filename = get_backup_filename(backup_suffix, origfname);
7165 #ifdef MSDOS
7166 unlink(backup_filename);
7167 #endif
7168 if (rename(origfname, backup_filename)) {
7169 perror(backup_filename);
7170 fprintf(stderr, "Can't rename %s to %s\n",
7171 origfname, backup_filename);
7172 }
7173 nkf_xfree(backup_filename);
7174 }else{
7175 #ifdef MSDOS
7176 if (unlink(origfname)){
7177 perror(origfname);
7178 }
7179 #endif
7180 }
7181 if (rename(outfname, origfname)) {
7182 perror(origfname);
7183 fprintf(stderr, "Can't rename %s to %s\n",
7184 outfname, origfname);
7185 }
7186 nkf_xfree(outfname);
7187 }
7188 #endif
7189 }
7190 }
7191 if (is_argument_error)
7192 return(-1);
7193 }
7194 #ifdef EASYWIN /*Easy Win */
7195 if (file_out_f == FALSE)
7196 scanf("%d",&end_check);
7197 else
7198 fclose(stdout);
7199 #else /* for Other OS */
7200 if (file_out_f == TRUE)
7201 fclose(stdout);
7202 #endif /*Easy Win */
7203 return (0);
7204 }
7205 #endif /* WIN32DLL */
7206