1 /*
2 * "streamable kanji code filter and converter"
3 * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4 *
5 * LICENSE NOTICES
6 *
7 * This file is part of "streamable kanji code filter and converter",
8 * which is distributed under the terms of GNU Lesser General Public
9 * License (version 2) as published by the Free Software Foundation.
10 *
11 * This software is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with "streamable kanji code filter and converter";
18 * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19 * Suite 330, Boston, MA 02111-1307 USA
20 *
21 * The author of this file: Moriyoshi Koizumi <koizumi@gree.co.jp>
22 *
23 */
24
25 #include <stdint.h>
26 #include <stdbool.h>
27 #include "mbfilter_tl_jisx0201_jisx0208.h"
28 #include "translit_kana_jisx0201_jisx0208.h"
29
30 /* Apply various transforms to input codepoint, such as converting halfwidth katakana
31 * to fullwidth katakana. `mode` is a bitfield which controls which transforms are
32 * actually performed. The bit values are defined in mbfilter_tl_jisx0201_jix0208.h.
33 * `mode` must not call for transforms which are inverses (i.e. which would cancel
34 * each other out).
35 *
36 * In some cases, successive input codepoints may be merged into one output codepoint.
37 * (That is the purpose of the `next` parameter.) If the `next` codepoint is consumed
38 * and should be skipped over, `*consumed` will be set to true. Otherwise, `*consumed`
39 * will not be modified. If there is no following codepoint, `next` should be zero.
40 *
41 * Again, in some cases, one input codepoint may convert to two output codepoints.
42 * If so, the second output codepoint will be stored in `*second`.
43 *
44 * Return the resulting codepoint. If none of the requested transforms apply, return
45 * the input codepoint unchanged.
46 */
mbfl_convert_kana(int c,int next,bool * consumed,int * second,int mode)47 int mbfl_convert_kana(int c, int next, bool *consumed, int *second, int mode)
48 {
49 if ((mode & MBFL_FILT_TL_HAN2ZEN_ALL) && c >= 0x21 && c <= 0x7d && c != '"' && c != '\'' && c != '\\') {
50 return c + 0xfee0;
51 } else if ((mode & MBFL_FILT_TL_HAN2ZEN_ALPHA) && ((c >= 0x41 && c <= 0x5a) || (c >= 0x61 && c <= 0x7a))) { /* alphabetic */
52 return c + 0xfee0;
53 } else if ((mode & MBFL_FILT_TL_HAN2ZEN_NUMERIC) && c >= 0x30 && c <= 0x39) { /* num */
54 return c + 0xfee0;
55 } else if ((mode & MBFL_FILT_TL_HAN2ZEN_SPACE) && c == 0x20) { /* space */
56 return 0x3000;
57 }
58
59 if (mode & (MBFL_FILT_TL_HAN2ZEN_KATAKANA | MBFL_FILT_TL_HAN2ZEN_HIRAGANA)) {
60 /* Convert Hankaku kana to Zenkaku kana
61 * Either all Hankaku kana (including katakana and hiragana) will be converted
62 * to Zenkaku katakana, or to Zenkaku hiragana */
63 if ((mode & MBFL_FILT_TL_HAN2ZEN_KATAKANA) && (mode & MBFL_FILT_TL_HAN2ZEN_GLUE)) {
64 if (c >= 0xff61 && c <= 0xff9f) {
65 int n = c - 0xff60;
66 if (next >= 0xff61 && next <= 0xff9f) {
67 if (next == 0xff9e && ((n >= 22 && n <= 36) || (n >= 42 && n <= 46))) {
68 *consumed = true;
69 return 0x3001 + hankana2zenkana_table[n];
70 } else if (next == 0xff9e && n == 19) {
71 *consumed = true;
72 return 0x30f4;
73 } else if (next == 0xff9f && n >= 42 && n <= 46) {
74 *consumed = true;
75 return 0x3002 + hankana2zenkana_table[n];
76 }
77 }
78
79 return 0x3000 + hankana2zenkana_table[n];
80 }
81 } else if ((mode & MBFL_FILT_TL_HAN2ZEN_HIRAGANA) && (mode & MBFL_FILT_TL_HAN2ZEN_GLUE)) {
82 if (c >= 0xff61 && c <= 0xff9f) {
83 int n = c - 0xff60;
84 if (next >= 0xff61 && next <= 0xff9f) {
85 if (next == 0xff9e && ((n >= 22 && n <= 36) || (n >= 42 && n <= 46))) {
86 *consumed = true;
87 return 0x3001 + hankana2zenhira_table[n];
88 } else if (next == 0xff9f && n >= 42 && n <= 46) {
89 *consumed = true;
90 return 0x3002 + hankana2zenhira_table[n];
91 }
92 }
93
94 return 0x3000 + hankana2zenhira_table[n];
95 }
96 } else if ((mode & MBFL_FILT_TL_HAN2ZEN_KATAKANA) && c >= 0xff61 && c <= 0xff9f) {
97 return 0x3000 + hankana2zenkana_table[c - 0xff60];
98 } else if ((mode & MBFL_FILT_TL_HAN2ZEN_HIRAGANA) && c >= 0xff61 && c <= 0xff9f) {
99 return 0x3000 + hankana2zenhira_table[c - 0xff60];
100 }
101 }
102
103 if (mode & MBFL_FILT_TL_HAN2ZEN_COMPAT1) { /* special ascii to symbol */
104 if (c == 0x5c) {
105 return 0xffe5; /* FULLWIDTH YEN SIGN */
106 } else if (c == 0xa5) { /* YEN SIGN */
107 return 0xffe5; /* FULLWIDTH YEN SIGN */
108 } else if (c == 0x7e) {
109 return 0xffe3; /* FULLWIDTH MACRON */
110 } else if (c == 0x203e) { /* OVERLINE */
111 return 0xffe3; /* FULLWIDTH MACRON */
112 } else if (c == 0x27) {
113 return 0x2019; /* RIGHT SINGLE QUOTATION MARK */
114 } else if (c == 0x22) {
115 return 0x201d; /* RIGHT DOUBLE QUOTATION MARK */
116 }
117 } else if (mode & MBFL_FILT_TL_HAN2ZEN_COMPAT2) { /* special ascii to symbol */
118 if (c == 0x5c) {
119 return 0xff3c; /* FULLWIDTH REVERSE SOLIDUS */
120 } else if (c == 0x7e) {
121 return 0xff5e; /* FULLWIDTH TILDE */
122 } else if (c == 0x27) {
123 return 0xff07; /* FULLWIDTH APOSTROPHE */
124 } else if (c == 0x22) {
125 return 0xff02; /* FULLWIDTH QUOTATION MARK */
126 }
127 }
128
129 if (mode & (MBFL_FILT_TL_ZEN2HAN_ALL | MBFL_FILT_TL_ZEN2HAN_ALPHA | MBFL_FILT_TL_ZEN2HAN_NUMERIC | MBFL_FILT_TL_ZEN2HAN_SPACE)) {
130 /* Zenkaku to Hankaku */
131 if ((mode & MBFL_FILT_TL_ZEN2HAN_ALL) && c >= 0xff01 && c <= 0xff5d && c != 0xff02 && c != 0xff07 && c!= 0xff3c) {
132 /* all except " ' \ ~ */
133 return c - 0xfee0;
134 } else if ((mode & MBFL_FILT_TL_ZEN2HAN_ALPHA) && ((c >= 0xff21 && c <= 0xff3a) || (c >= 0xff41 && c <= 0xff5a))) {
135 return c - 0xfee0;
136 } else if ((mode & MBFL_FILT_TL_ZEN2HAN_NUMERIC) && (c >= 0xff10 && c <= 0xff19)) {
137 return c - 0xfee0;
138 } else if ((mode & MBFL_FILT_TL_ZEN2HAN_SPACE) && (c == 0x3000)) {
139 return 0x20;
140 } else if ((mode & MBFL_FILT_TL_ZEN2HAN_ALL) && (c == 0x2212)) { /* MINUS SIGN */
141 return 0x2d;
142 }
143 }
144
145 if (mode & (MBFL_FILT_TL_ZEN2HAN_KATAKANA | MBFL_FILT_TL_ZEN2HAN_HIRAGANA)) {
146 /* Zenkaku kana to hankaku kana */
147 if ((mode & MBFL_FILT_TL_ZEN2HAN_KATAKANA) && c >= 0x30a1 && c <= 0x30f4) {
148 /* Zenkaku katakana to hankaku kana */
149 int n = c - 0x30a1;
150 if (zenkana2hankana_table[n][1]) {
151 *second = 0xff00 + zenkana2hankana_table[n][1];
152 }
153 return 0xff00 + zenkana2hankana_table[n][0];
154 } else if ((mode & MBFL_FILT_TL_ZEN2HAN_HIRAGANA) && c >= 0x3041 && c <= 0x3093) {
155 /* Zenkaku hiragana to hankaku kana */
156 int n = c - 0x3041;
157 if (zenkana2hankana_table[n][1]) {
158 *second = 0xff00 + zenkana2hankana_table[n][1];
159 }
160 return 0xff00 + zenkana2hankana_table[n][0];
161 } else if (c == 0x3001) {
162 return 0xff64; /* HALFWIDTH IDEOGRAPHIC COMMA */
163 } else if (c == 0x3002) {
164 return 0xff61; /* HALFWIDTH IDEOGRAPHIC FULL STOP */
165 } else if (c == 0x300c) {
166 return 0xff62; /* HALFWIDTH LEFT CORNER BRACKET */
167 } else if (c == 0x300d) {
168 return 0xff63; /* HALFWIDTH RIGHT CORNER BRACKET */
169 } else if (c == 0x309b) {
170 return 0xff9e; /* HALFWIDTH KATAKANA VOICED SOUND MARK */
171 } else if (c == 0x309c) {
172 return 0xff9f; /* HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK */
173 } else if (c == 0x30fc) {
174 return 0xff70; /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
175 } else if (c == 0x30fb) {
176 return 0xff65; /* HALFWIDTH KATAKANA MIDDLE DOT */
177 }
178 } else if (mode & (MBFL_FILT_TL_ZEN2HAN_HIRA2KANA | MBFL_FILT_TL_ZEN2HAN_KANA2HIRA)) {
179 if ((mode & MBFL_FILT_TL_ZEN2HAN_HIRA2KANA) && ((c >= 0x3041 && c <= 0x3093) || c == 0x309d || c == 0x309e)) {
180 /* Zenkaku hiragana to Zenkaku katakana */
181 return c + 0x60;
182 } else if ((mode & MBFL_FILT_TL_ZEN2HAN_KANA2HIRA) && ((c >= 0x30a1 && c <= 0x30f3) || c == 0x30fd || c == 0x30fe)) {
183 /* Zenkaku katakana to Zenkaku hiragana */
184 return c - 0x60;
185 }
186 }
187
188 if (mode & MBFL_FILT_TL_ZEN2HAN_COMPAT1) { /* special symbol to ascii */
189 if (c == 0xffe5) { /* FULLWIDTH YEN SIGN */
190 return 0x5c;
191 } else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */
192 return 0x5c;
193 } else if (c == 0xffe3) { /* FULLWIDTH MACRON */
194 return 0x7e;
195 } else if (c == 0x203e) { /* OVERLINE */
196 return 0x7e;
197 } else if (c == 0x2018) { /* LEFT SINGLE QUOTATION MARK*/
198 return 0x27;
199 } else if (c == 0x2019) { /* RIGHT SINGLE QUOTATION MARK */
200 return 0x27;
201 } else if (c == 0x201c) { /* LEFT DOUBLE QUOTATION MARK */
202 return 0x22;
203 } else if (c == 0x201d) { /* RIGHT DOUBLE QUOTATION MARK */
204 return 0x22;
205 }
206 }
207
208 if (mode & MBFL_FILT_TL_ZEN2HAN_COMPAT2) { /* special symbol to ascii */
209 if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */
210 return 0x5c;
211 } else if (c == 0xff5e) { /* FULLWIDTH TILDE */
212 return 0x7e;
213 } else if (c == 0xff07) { /* FULLWIDTH APOSTROPHE */
214 return 0x27;
215 } else if (c == 0xff02) { /* FULLWIDTH QUOTATION MARK */
216 return 0x22;
217 }
218 }
219
220 return c;
221 }
222
/* Filter function: feed one codepoint `c` into the kana transliteration filter.
 *
 * The filter runs one codepoint behind its input. The most recent codepoint is held
 * in `filt->cache` until the following one is known, since mbfl_convert_kana() may
 * need to merge the two. `filt->cache == 0` means that nothing is cached. The
 * conversion mode bitfield is read from `filt->opaque`.
 *
 * Converted codepoints are passed to `filt->output_function`. Its return value is
 * not checked here. Always returns 0.
 */
int mbfl_filt_tl_jisx0201_jisx0208(int c, mbfl_convert_filter *filt)
{
	int mode = (intptr_t)filt->opaque, second = 0;
	bool consumed = false;

	if (filt->cache) {
		/* Convert the cached codepoint, now that its successor `c` is known */
		int s = mbfl_convert_kana(filt->cache, c, &consumed, &second, mode);
		/* If `c` was merged into the cached codepoint, it must not be processed again */
		filt->cache = consumed ? 0 : c;
		(*filt->output_function)(s, filt->data);
		if (second) {
			(*filt->output_function)(second, filt->data);
		}
		if (c == 0 && !consumed) {
			/* A NUL codepoint cannot be held in the cache, since zero means "nothing
			 * cached". Emit it right away, after the codepoint which preceded it;
			 * otherwise it would be silently dropped from the output. */
			(*filt->output_function)(0, filt->data);
		}
	} else if (c == 0) {
		/* This case has to be handled separately, since `filt->cache == 0` means no
		 * codepoint is cached */
		(*filt->output_function)(0, filt->data);
	} else {
		/* Nothing cached yet; hold `c` back until the next codepoint (or flush) */
		filt->cache = c;
	}

	return 0;
}
245
/* Flush function: emit the codepoint still held in `filt->cache` (if any), then
 * invoke `filt->flush_function` if one is set.
 *
 * Returns whatever `filt->flush_function` returns, or 0 if there is none.
 */
int mbfl_filt_tl_jisx0201_jisx0208_flush(mbfl_convert_filter *filt)
{
	int pending = filt->cache;

	if (pending != 0) {
		int mode = (intptr_t)filt->opaque;
		int extra = 0;
		/* No codepoint follows, so `next` is zero. mbfl_convert_kana is then not
		 * expected to write through `consumed`, hence NULL is passed. */
		int converted = mbfl_convert_kana(pending, 0, NULL, &extra, mode);

		(*filt->output_function)(converted, filt->data);
		if (extra != 0) {
			(*filt->output_function)(extra, filt->data);
		}
		filt->cache = 0;
	}

	if (!filt->flush_function) {
		return 0;
	}
	return (*filt->flush_function)(filt->data);
}
265
266 const struct mbfl_convert_vtbl vtbl_tl_jisx0201_jisx0208 = {
267 mbfl_no_encoding_wchar,
268 mbfl_no_encoding_wchar,
269 mbfl_filt_conv_common_ctor,
270 NULL,
271 mbfl_filt_tl_jisx0201_jisx0208,
272 mbfl_filt_tl_jisx0201_jisx0208_flush,
273 NULL,
274 };
275