1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this file: Moriyoshi Koizumi <koizumi@gree.co.jp>
22  *
23  */
24 
25 #include <stdint.h>
26 #include <stdbool.h>
27 #include "mbfilter_tl_jisx0201_jisx0208.h"
28 #include "translit_kana_jisx0201_jisx0208.h"
29 
30 /* Apply various transforms to input codepoint, such as converting halfwidth katakana
31  * to fullwidth katakana. `mode` is a bitfield which controls which transforms are
32  * actually performed. The bit values are defined in mbfilter_tl_jisx0201_jix0208.h.
33  * `mode` must not call for transforms which are inverses (i.e. which would cancel
34  * each other out).
35  *
36  * In some cases, successive input codepoints may be merged into one output codepoint.
37  * (That is the purpose of the `next` parameter.) If the `next` codepoint is consumed
38  * and should be skipped over, `*consumed` will be set to true. Otherwise, `*consumed`
39  * will not be modified. If there is no following codepoint, `next` should be zero.
40  *
41  * Again, in some cases, one input codepoint may convert to two output codepoints.
42  * If so, the second output codepoint will be stored in `*second`.
43  *
44  * Return the resulting codepoint. If none of the requested transforms apply, return
45  * the input codepoint unchanged.
46  */
mbfl_convert_kana(int c,int next,bool * consumed,int * second,int mode)47 int mbfl_convert_kana(int c, int next, bool *consumed, int *second, int mode)
48 {
49 	if ((mode & MBFL_FILT_TL_HAN2ZEN_ALL) && c >= 0x21 && c <= 0x7d && c != '"' && c != '\'' && c != '\\') {
50 		return c + 0xfee0;
51 	} else if ((mode & MBFL_FILT_TL_HAN2ZEN_ALPHA) && ((c >= 0x41 && c <= 0x5a) || (c >= 0x61 && c <= 0x7a))) { /* alphabetic */
52 		return c + 0xfee0;
53 	} else if ((mode & MBFL_FILT_TL_HAN2ZEN_NUMERIC) && c >= 0x30 && c <= 0x39) { /* num */
54 		return c + 0xfee0;
55 	} else if ((mode & MBFL_FILT_TL_HAN2ZEN_SPACE) && c == 0x20) { /* space */
56 		return 0x3000;
57 	}
58 
59 	if (mode & (MBFL_FILT_TL_HAN2ZEN_KATAKANA | MBFL_FILT_TL_HAN2ZEN_HIRAGANA)) {
60 		/* Convert Hankaku kana to Zenkaku kana
61 		 * Either all Hankaku kana (including katakana and hiragana) will be converted
62 		 * to Zenkaku katakana, or to Zenkaku hiragana */
63 		if ((mode & MBFL_FILT_TL_HAN2ZEN_KATAKANA) && (mode & MBFL_FILT_TL_HAN2ZEN_GLUE)) {
64 			if (c >= 0xff61 && c <= 0xff9f) {
65 				int n = c - 0xff60;
66 				if (next >= 0xff61 && next <= 0xff9f) {
67 					if (next == 0xff9e && ((n >= 22 && n <= 36) || (n >= 42 && n <= 46))) {
68 						*consumed = true;
69 						return 0x3001 + hankana2zenkana_table[n];
70 					} else if (next == 0xff9e && n == 19) {
71 						*consumed = true;
72 						return 0x30f4;
73 					} else if (next == 0xff9f && n >= 42 && n <= 46) {
74 						*consumed = true;
75 						return 0x3002 + hankana2zenkana_table[n];
76 					}
77 				}
78 
79 				return 0x3000 + hankana2zenkana_table[n];
80 			}
81 		} else if ((mode & MBFL_FILT_TL_HAN2ZEN_HIRAGANA) && (mode & MBFL_FILT_TL_HAN2ZEN_GLUE)) {
82 			if (c >= 0xff61 && c <= 0xff9f) {
83 				int n = c - 0xff60;
84 				if (next >= 0xff61 && next <= 0xff9f) {
85 					if (next == 0xff9e && ((n >= 22 && n <= 36) || (n >= 42 && n <= 46))) {
86 						*consumed = true;
87 						return 0x3001 + hankana2zenhira_table[n];
88 					} else if (next == 0xff9f && n >= 42 && n <= 46) {
89 						*consumed = true;
90 						return 0x3002 + hankana2zenhira_table[n];
91 					}
92 				}
93 
94 				return 0x3000 + hankana2zenhira_table[n];
95 			}
96 		} else if ((mode & MBFL_FILT_TL_HAN2ZEN_KATAKANA) && c >= 0xff61 && c <= 0xff9f) {
97 			return 0x3000 + hankana2zenkana_table[c - 0xff60];
98 		} else if ((mode & MBFL_FILT_TL_HAN2ZEN_HIRAGANA) && c >= 0xff61 && c <= 0xff9f) {
99 			return 0x3000 + hankana2zenhira_table[c - 0xff60];
100 		}
101 	}
102 
103 	if (mode & MBFL_FILT_TL_HAN2ZEN_COMPAT1) { /* special ascii to symbol */
104 		if (c == 0x5c) {
105 			return 0xffe5; /* FULLWIDTH YEN SIGN */
106 		} else if (c == 0xa5) { /* YEN SIGN */
107 			return 0xffe5; /* FULLWIDTH YEN SIGN */
108 		} else if (c == 0x7e) {
109 			return 0xffe3; /* FULLWIDTH MACRON */
110 		} else if (c == 0x203e) { /* OVERLINE */
111 			return 0xffe3; /* FULLWIDTH MACRON */
112 		} else if (c == 0x27) {
113 			return 0x2019; /* RIGHT SINGLE QUOTATION MARK */
114 		} else if (c == 0x22) {
115 			return 0x201d; /* RIGHT DOUBLE QUOTATION MARK */
116 		}
117 	} else if (mode & MBFL_FILT_TL_HAN2ZEN_COMPAT2) { /* special ascii to symbol */
118 		if (c == 0x5c) {
119 			return 0xff3c; /* FULLWIDTH REVERSE SOLIDUS */
120 		} else if (c == 0x7e) {
121 			return 0xff5e; /* FULLWIDTH TILDE */
122 		} else if (c == 0x27) {
123 			return 0xff07; /* FULLWIDTH APOSTROPHE */
124 		} else if (c == 0x22) {
125 			return 0xff02; /* FULLWIDTH QUOTATION MARK */
126 		}
127 	}
128 
129 	if (mode & (MBFL_FILT_TL_ZEN2HAN_ALL | MBFL_FILT_TL_ZEN2HAN_ALPHA | MBFL_FILT_TL_ZEN2HAN_NUMERIC | MBFL_FILT_TL_ZEN2HAN_SPACE)) {
130 		/* Zenkaku to Hankaku */
131 		if ((mode & MBFL_FILT_TL_ZEN2HAN_ALL) && c >= 0xff01 && c <= 0xff5d && c != 0xff02 && c != 0xff07 && c!= 0xff3c) {
132 			/* all except " ' \ ~ */
133 			return c - 0xfee0;
134 		} else if ((mode & MBFL_FILT_TL_ZEN2HAN_ALPHA) && ((c >= 0xff21 && c <= 0xff3a) || (c >= 0xff41 && c <= 0xff5a))) {
135 			return c - 0xfee0;
136 		} else if ((mode & MBFL_FILT_TL_ZEN2HAN_NUMERIC) && (c >= 0xff10 && c <= 0xff19)) {
137 			return c - 0xfee0;
138 		} else if ((mode & MBFL_FILT_TL_ZEN2HAN_SPACE) && (c == 0x3000)) {
139 			return 0x20;
140 		} else if ((mode & MBFL_FILT_TL_ZEN2HAN_ALL) && (c == 0x2212)) { /* MINUS SIGN */
141 			return 0x2d;
142 		}
143 	}
144 
145 	if (mode & (MBFL_FILT_TL_ZEN2HAN_KATAKANA | MBFL_FILT_TL_ZEN2HAN_HIRAGANA)) {
146 		/* Zenkaku kana to hankaku kana */
147 		if ((mode & MBFL_FILT_TL_ZEN2HAN_KATAKANA) && c >= 0x30a1 && c <= 0x30f4) {
148 			/* Zenkaku katakana to hankaku kana */
149 			int n = c - 0x30a1;
150 			if (zenkana2hankana_table[n][1]) {
151 				*second = 0xff00 + zenkana2hankana_table[n][1];
152 			}
153 			return 0xff00 + zenkana2hankana_table[n][0];
154 		} else if ((mode & MBFL_FILT_TL_ZEN2HAN_HIRAGANA) && c >= 0x3041 && c <= 0x3093) {
155 			/* Zenkaku hiragana to hankaku kana */
156 			int n = c - 0x3041;
157 			if (zenkana2hankana_table[n][1]) {
158 				*second = 0xff00 + zenkana2hankana_table[n][1];
159 			}
160 			return 0xff00 + zenkana2hankana_table[n][0];
161 		} else if (c == 0x3001) {
162 			return 0xff64; /* HALFWIDTH IDEOGRAPHIC COMMA */
163 		} else if (c == 0x3002) {
164 			return 0xff61; /* HALFWIDTH IDEOGRAPHIC FULL STOP */
165 		} else if (c == 0x300c) {
166 			return 0xff62; /* HALFWIDTH LEFT CORNER BRACKET */
167 		} else if (c == 0x300d) {
168 			return 0xff63; /* HALFWIDTH RIGHT CORNER BRACKET */
169 		} else if (c == 0x309b) {
170 			return 0xff9e; /* HALFWIDTH KATAKANA VOICED SOUND MARK */
171 		} else if (c == 0x309c) {
172 			return 0xff9f; /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
173 		} else if (c == 0x30fc) {
174 			return 0xff70; /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
175 		} else if (c == 0x30fb) {
176 			return 0xff65; /* HALFWIDTH KATAKANA MIDDLE DOT */
177 		}
178 	} else if (mode & (MBFL_FILT_TL_ZEN2HAN_HIRA2KANA | MBFL_FILT_TL_ZEN2HAN_KANA2HIRA)) {
179 		if ((mode & MBFL_FILT_TL_ZEN2HAN_HIRA2KANA) && ((c >= 0x3041 && c <= 0x3093) || c == 0x309d || c == 0x309e)) {
180 			/* Zenkaku hiragana to Zenkaku katakana */
181 			return c + 0x60;
182 		} else if ((mode & MBFL_FILT_TL_ZEN2HAN_KANA2HIRA) && ((c >= 0x30a1 && c <= 0x30f3) || c == 0x30fd || c == 0x30fe)) {
183 			/* Zenkaku katakana to Zenkaku hiragana */
184 			return c - 0x60;
185 		}
186 	}
187 
188 	if (mode & MBFL_FILT_TL_ZEN2HAN_COMPAT1) { /* special symbol to ascii */
189 		if (c == 0xffe5) { /* FULLWIDTH YEN SIGN */
190 			return 0x5c;
191 		} else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */
192 			return 0x5c;
193 		} else if (c == 0xffe3) { /* FULLWIDTH MACRON */
194 			return 0x7e;
195 		} else if (c == 0x203e) { /* OVERLINE */
196 			return 0x7e;
197 		} else if (c == 0x2018) { /* LEFT SINGLE QUOTATION MARK*/
198 			return 0x27;
199 		} else if (c == 0x2019) { /* RIGHT SINGLE QUOTATION MARK */
200 			return 0x27;
201 		} else if (c == 0x201c) { /* LEFT DOUBLE QUOTATION MARK */
202 			return 0x22;
203 		} else if (c == 0x201d) { /* RIGHT DOUBLE QUOTATION MARK */
204 			return 0x22;
205 		}
206 	}
207 
208 	if (mode & MBFL_FILT_TL_ZEN2HAN_COMPAT2) { /* special symbol to ascii */
209 		if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */
210 			return 0x5c;
211 		} else if (c == 0xff5e) { /* FULLWIDTH TILDE */
212 			return 0x7e;
213 		} else if (c == 0xff07) { /* FULLWIDTH APOSTROPHE */
214 			return 0x27;
215 		} else if (c == 0xff02) { /* FULLWIDTH QUOTATION MARK */
216 			return 0x22;
217 		}
218 	}
219 
220 	return c;
221 }
222 
/* Filter function: pass one codepoint through the kana/width transliteration.
 *
 * mbfl_convert_kana() may merge a codepoint with its successor, so one codepoint of
 * lookahead is required. Each incoming codepoint is therefore held in `filt->cache`
 * and is only converted and sent to `filt->output_function` once the following
 * codepoint arrives (or when the filter is flushed). `filt->opaque` carries the
 * `mode` bitfield which is handed to mbfl_convert_kana().
 *
 * Always returns 0; the return values of the output function are not examined.
 */
int mbfl_filt_tl_jisx0201_jisx0208(int c, mbfl_convert_filter *filt)
{
	int mode = (intptr_t)filt->opaque, second = 0;
	bool consumed = false;

	if (filt->cache) {
		int s = mbfl_convert_kana(filt->cache, c, &consumed, &second, mode);
		/* If `c` was merged into `s`, nothing is left to hold back */
		filt->cache = consumed ? 0 : c;
		(*filt->output_function)(s, filt->data);
		if (second) {
			(*filt->output_function)(second, filt->data);
		}
		if (c == 0 && !consumed) {
			/* A zero codepoint cannot be held in the cache, since `filt->cache == 0`
			 * means no codepoint is cached. Emit it right away, after the codepoint
			 * which preceded it, so that it is not silently dropped. */
			(*filt->output_function)(0, filt->data);
		}
	} else if (c == 0) {
		/* This case has to be handled separately, since `filt->cache == 0` means no
		 * codepoint is cached */
		(*filt->output_function)(0, filt->data);
	} else {
		filt->cache = c;
	}

	return 0;
}
245 
/* Flush function: convert and emit the codepoint still held in `filt->cache`, if
 * any, then chain to `filt->flush_function` when one is set.
 *
 * Returns the result of `filt->flush_function`, or 0 if there is none.
 */
int mbfl_filt_tl_jisx0201_jisx0208_flush(mbfl_convert_filter *filt)
{
	int mode = (intptr_t)filt->opaque, second = 0;
	/* No codepoint follows (`next` is passed as zero), so nothing should be reported
	 * as consumed; a real object is still passed so that this function does not rely
	 * on mbfl_convert_kana() never writing through the pointer. */
	bool consumed = false;

	if (filt->cache) {
		int s = mbfl_convert_kana(filt->cache, 0, &consumed, &second, mode);
		(*filt->output_function)(s, filt->data);
		if (second) {
			(*filt->output_function)(second, filt->data);
		}
		filt->cache = 0;
	}

	if (filt->flush_function) {
		return (*filt->flush_function)(filt->data);
	}

	return 0;
}
265 
266 const struct mbfl_convert_vtbl vtbl_tl_jisx0201_jisx0208 = {
267 	mbfl_no_encoding_wchar,
268 	mbfl_no_encoding_wchar,
269 	mbfl_filt_conv_common_ctor,
270 	NULL,
271 	mbfl_filt_tl_jisx0201_jisx0208,
272 	mbfl_filt_tl_jisx0201_jisx0208_flush,
273 	NULL,
274 };
275