1 /* -*- c-basic-offset:2; tab-width:2; indent-tabs-mode:nil -*- */
2
3 #include "ef_sjis_parser.h"
4
5 #include <string.h> /* memset */
6 #include <pobl/bl_mem.h>
7 #include <pobl/bl_debug.h>
8
9 #include "ef_iso2022_parser.h"
10 #include "ef_sjis_env.h"
11 #include "ef_jis_property.h"
12
13 /* --- static functions --- */
14
/*
 * Decode the character at the current position of sjis_parser into *ch.
 *
 *  sjis_parser   parser whose 'str' points at the bytes still to be decoded.
 *  ch            output; zero-filled first, then cs/ch/size/property are set.
 *  is_sjisx0213  non-zero: classify double-byte characters as JIS X 0213
 *                (plane 2 when the lead byte is 0xf0 or above, else plane 1).
 *                zero: classify them as JIS X 0208 or a vendor extension
 *                chosen by ef_get_sjis_output_type().
 *
 * Returns 1 when a character was decoded (the parser is then advanced past
 * it), 0 when the parser is already at end of stream, when the trail byte
 * is not available yet, or when the byte sequence is rejected.  On the two
 * latter cases ef_parser_reset() is called after the ef_parser_mark() made
 * at entry (presumably rewinding to the marked position -- confirm against
 * ef_parser.c).
 */
static int sjis_parser_next_char_intern(ef_parser_t *sjis_parser, ef_char_t *ch,
                                        int is_sjisx0213) {
  u_char c1;

  if (sjis_parser->is_eos) {
    return 0;
  }

  /* initialize */
  ef_parser_mark(sjis_parser);
  memset(ch, 0, sizeof(ef_char_t));

  c1 = *sjis_parser->str;

  if (c1 <= 0x7E) {
    /* Single byte, stored unchanged as US-ASCII. */
    ch->cs = US_ASCII;
    *ch->ch = c1;
    ch->size = 1;
    ch->property = 0;
  } else if (0xa1 <= c1 && c1 <= 0xdf) {
    /* Single byte JIS X 0201 katakana, stored with the MSB cleared. */
    ch->cs = JISX0201_KATA;
    *ch->ch = UNSET_MSB(c1);
    ch->size = 1;
    ch->property = 0;
  } else {
    /* Everything else is handled as the lead byte of a double-byte char. */
    u_char c2;
    u_int16_t sjis_ch;
    ef_charset_t cs;

    if (ef_parser_increment(sjis_parser) == 0) {
      /* No trail byte available. */
      goto shortage;
    }

    c2 = *sjis_parser->str;

    /*
     * specifying character set type.
     */

    sjis_ch = ((c1 << 8) & 0xff00) + (c2 & 0xff);

    if (is_sjisx0213) {
      cs = (c1 >= 0xf0) ? JISX0213_2000_2 : JISX0213_2000_1;
    } else if (ef_get_sjis_output_type() == APPLE_CS) {
      /*
       * XXX
       * this check is not exact , but not a problem for practical use.
       *
       * NOTE: the former test for 0x00fd - 0x00ff was dropped because it
       * could never match: c1 is at least 0x7f on this path, so sjis_ch is
       * never below 0x7f00.
       */
      if ((0x8540 <= sjis_ch && sjis_ch <= 0x886d) ||
          (0xeb41 <= sjis_ch && sjis_ch <= 0xed96)) {
        cs = JISX0208_1983_MAC_EXT;
      } else {
        cs = JISX0208_1983;
      }
    } else /* if( ef_get_sjis_output_type() == MICROSOFT_CS) */
    {
      /*
       * XXX
       * this check is not exact , but not a problem for practical use.
       */
      if (0x8740 <= sjis_ch && sjis_ch <= 0x879c) {
        cs = JISC6226_1978_NEC_EXT;
      } else if (0xed40 <= sjis_ch && sjis_ch <= 0xeefc) {
        cs = JISC6226_1978_NECIBM_EXT;
      } else if (0xfa40 <= sjis_ch && sjis_ch <= 0xfc4b) {
        cs = SJIS_IBM_EXT;
      } else {
        cs = JISX0208_1983;
      }
    }

    /*
     * converting SJIS -> JIS process.
     */

    if (cs == SJIS_IBM_EXT) {
      /*
       * SJIS_IBM_EXT
       *
       * IBM extension characters are placed in the empty space of the
       * Shift-JIS encoding, so they cannot be mapped to jisx0208 which uses
       * only 0x20-0x7f: the decoded bytes could be 0x93,0x94,0x95,0x96...
       * So the Shift-JIS encoded bytes are kept as they are in ef_char_t,
       * tagged with the SJIS_IBM_EXT charset.
       */
    } else if (cs == JISX0213_2000_2) {
      /* JIS row byte for each lead byte when the trail byte is <= 0x9e. */
      static const u_char sjis_upper_to_jisx02132_map_1[] = {
          /* 0xf0 - 0xfc(sjis) */
          0x21, 0x23, 0x25, 0x2d, 0x2f, 0x6f, 0x71, 0x73, 0x75, 0x77, 0x79, 0x7b, 0x7d,
      };

      /* JIS row byte for each lead byte when the trail byte is >= 0x9f. */
      static const u_char sjis_upper_to_jisx02132_map_2[] = {
          /* 0xf0 - 0xfc(sjis) */
          0x28, 0x24, 0x2c, 0x2e, 0x6e, 0x70, 0x72, 0x74, 0x76, 0x78, 0x7a, 0x7c, 0x7e,
      };

      /* c1 >= 0xf0 is checked above */
      if (0xfc < c1) {
#ifdef DEBUG
        bl_warn_printf(BL_DEBUG_TAG " 0x%.2x is illegal upper byte of jisx0213_2.\n", c1);
#endif

        goto error;
      }

      /*
       * Valid trail bytes are 0x40-0x7e and 0x80-0xfc.  Anything else would
       * wrap around or leave the 0x21-0x7e range in the subtractions below,
       * so reject it instead of emitting a bogus code point.
       */
      if (c2 < 0x40 || c2 == 0x7f || 0xfc < c2) {
        goto error;
      }

      if (c2 <= 0x9e) {
        c1 = sjis_upper_to_jisx02132_map_1[c1 - 0xf0];

        if (c2 > 0x7e) {
          c2 -= 0x20; /* 0x80-0x9e -> 0x60-0x7e */
        } else {
          c2 -= 0x1f; /* 0x40-0x7e -> 0x21-0x5f */
        }
      } else {
        c1 = sjis_upper_to_jisx02132_map_2[c1 - 0xf0];
        c2 -= 0x7e; /* 0x9f-0xfc -> 0x21-0x7e */
      }
    } else {
      u_char high;
      u_char low;

      if (0x81 <= c1 && c1 <= 0x9f) {
        high = c1 - 0x71;
      } else if (0xe0 <= c1 && c1 <= 0xfc) {
        high = c1 - 0xb1;
      } else {
        /* 0x7f, 0x80, 0xa0 and 0xfd-0xff are not lead bytes. */
        goto error;
      }

      /*
       * 0x7f is not a trail byte.  Without this check it would decode to
       * the same cell as 0x80, because 0x80 and above are shifted down by
       * one just below.
       */
      if (c2 == 0x7f) {
        goto error;
      }

      high = high * 2 + 1;

      /* Close the gap left by the unused trail byte 0x7f. */
      if (0x80 <= c2) {
        low = c2 - 1;
      } else {
        low = c2;
      }

      if (0x9e <= low && low <= 0xfb) {
        /* Second half of the trail range selects the following JIS row. */
        low -= 0x7d;
        high++;
      } else if (0x40 <= low && low <= 0x9d) {
        low -= 0x1f;
      } else {
        /* Trail byte below 0x40 or above 0xfc. */
        goto error;
      }

      c1 = high;
      c2 = low;
    }

    ch->cs = cs;
    ch->ch[0] = c1;
    ch->ch[1] = c2;
    ch->size = 2;

    if (cs == JISX0208_1983) {
      ch->property = ef_get_jisx0208_1983_property(ch->ch);
    } else if (cs == JISX0213_2000_1) {
      ch->property = ef_get_jisx0213_2000_1_property(ch->ch);
    } else {
      ch->property = 0;
    }
  }

  /* Step past the last byte of the character just decoded. */
  ef_parser_increment(sjis_parser);

  return 1;

error:
shortage:
  ef_parser_reset(sjis_parser);

  return 0;
}
201
/*
 * next_char callback of the plain Shift_JIS parser: forwards to
 * sjis_parser_next_char_intern() with the JIS X 0213 flag off and returns
 * its result unchanged.
 */
static int sjis_parser_next_char(ef_parser_t *sjis_parser, ef_char_t *ch) {
  const int is_sjisx0213 = 0;

  return sjis_parser_next_char_intern(sjis_parser, ch, is_sjisx0213);
}
205
/*
 * next_char callback of the Shift_JISX0213 parser: forwards to
 * sjis_parser_next_char_intern() with the JIS X 0213 flag on and returns
 * its result unchanged.
 */
static int sjisx0213_parser_next_char(ef_parser_t *sjis_parser, ef_char_t *ch) {
  const int is_sjisx0213 = 1;

  return sjis_parser_next_char_intern(sjis_parser, ch, is_sjisx0213);
}
209
/*
 * set_str callback: make the parser read from 'str', which holds 'size'
 * bytes, and zero its marked_left and is_eos fields.
 */
static void sjis_parser_set_str(ef_parser_t *sjis_parser, u_char *str, size_t size) {
  /* Reset the end-of-stream flag and the marked remaining length. */
  sjis_parser->is_eos = 0;
  sjis_parser->marked_left = 0;

  /* Attach the new input buffer. */
  sjis_parser->left = size;
  sjis_parser->str = str;
}
216
/* destroy callback: release the parser object 's' with free(). */
static void sjis_parser_destroy(ef_parser_t *s) {
  free(s);
}
218
219 /* --- global functions --- */
220
ef_sjis_parser_new(void)221 ef_parser_t *ef_sjis_parser_new(void) {
222 ef_parser_t *sjis_parser;
223
224 if ((sjis_parser = malloc(sizeof(ef_parser_t))) == NULL) {
225 return NULL;
226 }
227
228 ef_parser_init(sjis_parser);
229
230 sjis_parser->init = ef_parser_init;
231 sjis_parser->set_str = sjis_parser_set_str;
232 sjis_parser->destroy = sjis_parser_destroy;
233 sjis_parser->next_char = sjis_parser_next_char;
234
235 return sjis_parser;
236 }
237
ef_sjisx0213_parser_new(void)238 ef_parser_t *ef_sjisx0213_parser_new(void) {
239 ef_parser_t *sjis_parser;
240
241 if ((sjis_parser = malloc(sizeof(ef_parser_t))) == NULL) {
242 return NULL;
243 }
244
245 ef_parser_init(sjis_parser);
246
247 sjis_parser->init = ef_parser_init;
248 sjis_parser->set_str = sjis_parser_set_str;
249 sjis_parser->destroy = sjis_parser_destroy;
250 sjis_parser->next_char = sjisx0213_parser_next_char;
251
252 return sjis_parser;
253 }
254