1 /*
2 * Anthy内部で使う文字列の処理
3 * typedef struct xstr_ {
 *  xchar *str; int len;
5 * } xstr;
6 *
7 * malloc(0);の意味は考えないで0文字の文字列を扱えるような
8 * コーディングをする。free(0)は良い。
9 *
10 * デフォルトの設定では
11 * cstrはCの普通のEUC文字列
12 *
13 * Copyright (C) 2000-2007 TABATA Yusuke
14 *
15 */
16 /*
17 This library is free software; you can redistribute it and/or
18 modify it under the terms of the GNU Lesser General Public
19 License as published by the Free Software Foundation; either
20 version 2 of the License, or (at your option) any later version.
21
22 This library is distributed in the hope that it will be useful,
23 but WITHOUT ANY WARRANTY; without even the implied warranty of
24 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
25 Lesser General Public License for more details.
26
27 You should have received a copy of the GNU Lesser General Public
28 License along with this library; if not, write to the Free Software
29 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
30 */
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
34
35 #include "config.h"
36 /* for ANTHY_*_ENCODING */
37 #include <anthy/anthy.h>
38
39 #include <anthy/xstr.h>
40 #include <anthy/xchar.h>
41 #include "diclib_inner.h"
42
/* Encoding passed to anthy_sputxchar() when anthy_putxchar() prints to stdout */
static int print_encoding;

/* Size of the scratch buffers that receive one encoded xchar plus its NUL */
#define MAX_BYTES_PER_XCHAR 10
47
48 static int
xc_isprint(xchar xc)49 xc_isprint(xchar xc)
50 {
51 return xc > 0;
52 }
53
/** Count how many xchars a C string will occupy once converted.
 *
 * A byte with the high bit set is taken as the first half of a two-byte
 * character and consumes the following byte as well; any other byte
 * counts as one character on its own.
 *
 * @param c NUL-terminated byte string.
 * @return number of characters found in c.
 */
static int
xlengthofcstr(const char *c)
{
  int nbytes = strlen(c);
  int count = 0;
  int pos = 0;

  while (pos < nbytes) {
    /* a high-bit byte pairs with the byte that follows it */
    pos += (c[pos] & 0x80) ? 2 : 1;
    count++;
  }
  return count;
}
70
71 const char *
anthy_utf8_to_ucs4_xchar(const char * s,xchar * res)72 anthy_utf8_to_ucs4_xchar(const char *s, xchar *res)
73 {
74 const unsigned char *str = (const unsigned char *)s;
75 int i, len;
76 xchar cur;
77 cur = str[0];
78 if (str[0] < 0x80) {
79 len = 1;
80 } else if (str[0] < 0xe0) {
81 cur &= 0x1f;
82 len = 2;
83 } else if (str[0] < 0xf0) {
84 cur &= 0x0f;
85 len = 3;
86 } else if (str[0] < 0xf8) {
87 cur &= 0x07;
88 len = 4;
89 } else if (str[0] < 0xfc) {
90 cur &= 0x03;
91 len = 5;
92 } else {
93 cur &= 0x01;
94 len = 6;
95 }
96 str ++;
97 for (i = 1; i < len; i++) {
98 cur <<= 6;
99 cur |= (str[0] & 0x3f);
100 str++;
101 }
102 *res = cur;
103 return (const char *)str;
104 }
105
106 static xstr *
utf8_to_ucs4_xstr(const char * s)107 utf8_to_ucs4_xstr(const char *s)
108 {
109 const unsigned char *str = (const unsigned char *)s;
110 xstr res;
111 res.str = (xchar *)alloca(sizeof(xchar) * strlen(s));
112 res.len = 0;
113
114 while (*str) {
115 xchar cur;
116 str = (const unsigned char *)anthy_utf8_to_ucs4_xchar((const char *)str,
117 &cur);
118 res.str[res.len] = cur;
119 res.len ++;
120 }
121 return anthy_xstr_dup(&res);
122 }
123
124 static int
put_xchar_to_utf8_str(xchar xc,char * buf_)125 put_xchar_to_utf8_str(xchar xc, char *buf_)
126 {
127 int i, len;
128 unsigned char *buf = (unsigned char *)buf_;
129 if (xc < 0x80) {
130 buf[0] = 0;
131 len = 1;
132 } else if (xc < 0x800) {
133 buf[0] = 0xc0;
134 len = 2;
135 } else if (xc < 0x10000) {
136 buf[0] = 0xe0;
137 len = 3;
138 } else if (xc < 0x200000) {
139 buf[0] = 0xf0;
140 len = 4;
141 } else if (xc < 0x400000) {
142 buf[0] = 0xf8;
143 len = 5;
144 } else {
145 buf[0] = 0xfc;
146 len = 6;
147 }
148 for (i = len - 1; i > 0; i--) {
149 buf[i] = (xc & 0x3f) | 0x80;
150 xc >>= 6;
151 }
152 buf[0] += xc;
153 buf[len] = 0;
154 return len;
155 }
156
157 static char *
ucs4_xstr_to_utf8(xstr * xs)158 ucs4_xstr_to_utf8(xstr *xs)
159 {
160 char *buf = alloca(xs->len * 6 + 1);
161 int i, t = 0;
162 buf[0] = 0;
163 for (i = 0; i < xs->len; i++) {
164 xchar xc = xs->str[i];
165 put_xchar_to_utf8_str(xc, &buf[t]);
166 t = strlen(buf);
167 }
168 return strdup(buf);
169 }
170
171 /** Cの文字列をxstrに変更する
172 */
173 xstr *
anthy_cstr_to_xstr(const char * c,int encoding)174 anthy_cstr_to_xstr(const char *c, int encoding)
175 {
176 xstr *x;
177 int i, j, l;
178 if (encoding == ANTHY_UTF8_ENCODING) {
179 return utf8_to_ucs4_xstr(c);
180 }
181 l = xlengthofcstr(c);
182 x = (xstr *)malloc(sizeof(struct xstr_));
183 if (!x) {
184 return NULL;
185 }
186 x->len = l;
187 x->str = malloc(sizeof(xchar)*l);
188 for (i = 0, j = 0; i < l; i++) {
189 if (!(c[j] & 0x80)){
190 x->str[i] = c[j];
191 j++;
192 } else {
193 unsigned char *p = (unsigned char *)&c[j];
194 x->str[i] = (p[1] | (p[0]<<8)) | 0x8080;
195 x->str[i] = anthy_euc_to_ucs(x->str[i]);
196 j++;
197 j++;
198 }
199 }
200 return x;
201 }
202
203 char *
anthy_xstr_to_cstr(xstr * s,int encoding)204 anthy_xstr_to_cstr(xstr *s, int encoding)
205 {
206 int i, j, l;
207 char *p;
208
209 if (encoding == ANTHY_UTF8_ENCODING) {
210 return ucs4_xstr_to_utf8(s);
211 }
212
213 l = s->len;
214 for (i = 0; i < s->len; i++) {
215 int ec = anthy_ucs_to_euc(s->str[i]);
216 if (ec > 255) {
217 l++;
218 }
219 }
220 p = (char *)malloc(l + 1);
221 p[l] = 0;
222 j = 0;
223 for (i = 0; i < s->len; i++) {
224 int ec = anthy_ucs_to_euc(s->str[i]);
225 if (ec < 256) {
226 p[j] = ec;
227 j++;
228 }else{
229 p[j] = ec >> 8;
230 j++;
231 p[j] = ec & 255;
232 j++;
233 }
234 }
235 return p;
236 }
237
238 xstr *
anthy_xstr_dup(xstr * s)239 anthy_xstr_dup(xstr *s)
240 {
241 int i;
242 xstr *x = (xstr *)malloc(sizeof(xstr));
243 x->len = s->len;
244 if (s->len) {
245 x->str = malloc(sizeof(xchar)*s->len);
246 }else{
247 x->str = NULL;
248 }
249 for (i = 0; i < x->len; i++) {
250 x->str[i] = s->str[i];
251 }
252 return x;
253 }
254
255 xchar *
anthy_xstr_dup_str(xstr * s)256 anthy_xstr_dup_str(xstr *s)
257 {
258 xchar *c;
259 int i;
260 if (s->len) {
261 c = malloc(sizeof(xchar)*s->len);
262 }else{
263 c = 0;
264 }
265 for (i = 0; i < s->len; i++) {
266 c[i] = s->str[i];
267 }
268 return c;
269 }
270
271 void
anthy_free_xstr(xstr * x)272 anthy_free_xstr(xstr *x)
273 {
274 if (!x) {
275 return ;
276 }
277 /**/
278 free(x->str);
279 free(x);
280 }
281
282 void
anthy_free_xstr_str(xstr * x)283 anthy_free_xstr_str(xstr *x)
284 {
285 if (!x) {
286 return ;
287 }
288 free(x->str);
289 }
290
291 int
anthy_sputxchar(char * buf,xchar x,int encoding)292 anthy_sputxchar(char *buf, xchar x, int encoding)
293 {
294 if (!xc_isprint(x)) {
295 sprintf(buf, "??");
296 return 2;
297 }
298 if (encoding == ANTHY_UTF8_ENCODING) {
299 return put_xchar_to_utf8_str(x, buf);
300 }
301 x = anthy_ucs_to_euc(x);
302 if (x < 256) {
303 buf[0] = x;
304 buf[1] = 0;
305 return 1;
306 }
307 buf[2] = 0;
308 buf[1] = 0x80 | (x & 255);
309 buf[0] = 0x80 | ((x>>8) & 255);
310 return 2;
311 }
312
313 int
anthy_sputxstr(char * buf,xstr * x,int encoding)314 anthy_sputxstr(char *buf, xstr *x, int encoding)
315 {
316 char b[MAX_BYTES_PER_XCHAR];
317 int i, l = 0;
318 for (i = 0; i < x->len; i++) {
319 anthy_sputxchar(b, x->str[i], encoding);
320 sprintf(&buf[l], "%s", b);
321 l += strlen(b);
322 }
323 return l;
324 }
325
326 int
anthy_snputxstr(char * buf,int n,xstr * x,int encoding)327 anthy_snputxstr(char *buf, int n, xstr *x, int encoding)
328 {
329 char b[MAX_BYTES_PER_XCHAR];
330 int i, l=0;
331 for (i = 0; i < x->len; i++) {
332 anthy_sputxchar(b, x->str[i], encoding);
333 if ((int)strlen(b) + l >= n) {
334 return l;
335 }
336 n -= sprintf(&buf[l], "%s", b);
337 l += strlen(b);
338 }
339 return l;
340 }
341
342 void
anthy_putxchar(xchar x)343 anthy_putxchar(xchar x)
344 {
345 char buf[MAX_BYTES_PER_XCHAR];
346 if (!xc_isprint(x)) {
347 printf("\\%x", x);
348 return ;
349 }
350 anthy_sputxchar(buf, x, print_encoding);
351 printf("%s", buf);
352 }
353
354 void
anthy_putxstr(xstr * x)355 anthy_putxstr(xstr *x)
356 {
357 int i;
358 for (i = 0; i < x->len; i++) {
359 anthy_putxchar(x->str[i]);
360 }
361 }
362
363 void
anthy_putxstrln(xstr * x)364 anthy_putxstrln(xstr *x)
365 {
366 anthy_putxstr(x);
367 printf("\n");
368 }
369
370 xstr*
anthy_xstrcpy(xstr * dest,xstr * src)371 anthy_xstrcpy(xstr *dest, xstr *src)
372 {
373 int i;
374 /* 文字列をコピー */
375 dest->len = src->len;
376 for (i = 0; i < src->len; i++) {
377 dest->str[i] = src->str[i];
378 }
379
380 return dest;
381 }
382 /* 返り値の符号はstrcmpと同じ */
383 int
anthy_xstrcmp(xstr * x1,xstr * x2)384 anthy_xstrcmp(xstr *x1, xstr *x2)
385 {
386 int i, m;
387 if (x1->len < x2->len) {
388 m = x1->len;
389 }else{
390 m = x2->len;
391 }
392 for (i = 0 ; i < m ; i++) {
393 if (x1->str[i] < x2->str[i]) {
394 return -1;
395 }
396 if (x1->str[i] > x2->str[i]) {
397 return 1;
398 }
399 }
400 if (x1->len < x2->len) {
401 return -1;
402 }
403 if (x1->len > x2->len) {
404 return 1;
405 }
406 return 0;
407 }
408
409 /* 返り値の符号はstrncmpと同じ */
410 int
anthy_xstrncmp(xstr * x1,xstr * x2,int n)411 anthy_xstrncmp(xstr *x1, xstr *x2, int n)
412 {
413 int i, m;
414 if (x1->len < x2->len) {
415 m = x1->len;
416 }else{
417 m = x2->len;
418 }
419 if (m > n) m = n;
420 for (i = 0 ; i < m ; i++) {
421 if (x1->str[i] < x2->str[i]) {
422 return -1;
423 }
424 if (x1->str[i] > x2->str[i]) {
425 return 1;
426 }
427 }
428 if (x2->len <= n && x1->len < x2->len) {
429 return -1;
430 }
431 if (x1->len <= n && x1->len > x2->len) {
432 return 1;
433 }
434 return 0;
435 }
436
437
438 xstr *
anthy_xstrcat(xstr * s,xstr * a)439 anthy_xstrcat(xstr *s, xstr *a)
440 {
441 int i, l;
442 if (!s) {
443 s = malloc(sizeof(xstr));
444 s->str = NULL;
445 s->len = 0;
446 }
447 l = s->len + a->len;
448
449 if (l < 1) { /* 辞書もしくは学習データが壊れていた時の対策 */
450 free(s->str);
451 s->str = NULL;
452 s->len = 0;
453 return s;
454 }
455
456 s->str = realloc(s->str, sizeof(xchar)*l);
457 for (i = 0; i < a->len; i ++) {
458 s->str[s->len+i] = a->str[i];
459 }
460 s->len = l;
461 return s;
462 }
463
464 xstr *
anthy_xstrappend(xstr * xs,xchar xc)465 anthy_xstrappend(xstr *xs, xchar xc)
466 {
467 xstr p;
468 xchar q[1];
469 p.len = 1;
470 p.str = q;
471 q[0] = xc;
472 return anthy_xstrcat(xs, &p);
473 }
474
475 long long
anthy_xstrtoll(xstr * x)476 anthy_xstrtoll(xstr *x)
477 {
478 xchar c;
479 int i;
480 long long n = 0;/* 数 */
481 if (!x->len || x->len > 16) {
482 return -1;
483 }
484 if (!(anthy_get_xstr_type(x) & (XCT_NUM | XCT_WIDENUM))) {
485 return -1;
486 }
487 for (i = 0; i < x->len; i++) {
488 c = x->str[i];
489 n *= 10;
490 n += anthy_xchar_to_num(c);
491 }
492 return n;
493 }
494
495 /** 全角の数字を半角にする
496 */
497 xstr *
anthy_xstr_wide_num_to_num(xstr * src_xs)498 anthy_xstr_wide_num_to_num(xstr* src_xs)
499 {
500 int i;
501 xstr *dst_xs;
502 dst_xs = anthy_xstr_dup(src_xs);
503 for (i = 0; i < src_xs->len; ++i) {
504 dst_xs->str[i] = anthy_xchar_wide_num_to_num(src_xs->str[i]);
505 }
506 return dst_xs;
507 }
508
509 /** 平仮名をカタカナに変換する
510 */
511 xstr *
anthy_xstr_hira_to_kata(xstr * src_xs)512 anthy_xstr_hira_to_kata(xstr *src_xs)
513 {
514 xstr *dst_xs;
515 int i, j;
516 dst_xs = anthy_xstr_dup(src_xs);
517
518 for (i = 0 ,j = 0; i < dst_xs->len; i++, j++) {
519 /* 「う゛」のチェック */
520 if (i < dst_xs->len - 1 && dst_xs->str[i] == HK_U
521 && dst_xs->str[i+1] == HK_DDOT) {
522 dst_xs->str[j] = KK_VU;/* ヴ */
523 i++;
524 continue ;
525 }
526 /**/
527 dst_xs->str[j] = dst_xs->str[i];
528 if ((anthy_ucs_to_euc(dst_xs->str[j]) & 0xff00) == 0xa400) {
529 /* ひらがなだったら256足す */
530 dst_xs->str[j] = anthy_ucs_to_euc(dst_xs->str[j]);
531 dst_xs->str[j] += 256;
532 dst_xs->str[j] = anthy_euc_to_ucs(dst_xs->str[j]);
533 }
534 }
535 dst_xs->len = j;
536 return dst_xs;
537 }
538
539 xstr *
anthy_xstr_hira_to_half_kata(xstr * src_xs)540 anthy_xstr_hira_to_half_kata(xstr *src_xs)
541 {
542 int len = src_xs->len;
543 int i, j;
544 xstr *xs;
545 for (i = 0; i < src_xs->len; i++) {
546 const struct half_kana_table *tab = anthy_find_half_kana(src_xs->str[i]);
547 if (tab && tab->mod) {
548 len ++;
549 }
550 }
551 xs = malloc(sizeof(xstr));
552 xs->len = len;
553 xs->str = malloc(sizeof(xchar) * len);
554 j = 0;
555 for (i = 0; i < src_xs->len; i++) {
556 const struct half_kana_table *tab = anthy_find_half_kana(src_xs->str[i]);
557 if (tab) {
558 xs->str[j] = anthy_euc_to_ucs(tab->dst);
559 if (tab->mod) {
560 j++;
561 xs->str[j] = anthy_euc_to_ucs(tab->mod);
562 }
563 } else {
564 xs->str[j] = src_xs->str[i];
565 }
566 j++;
567 }
568 return xs;
569 }
570
571 xstr *
anthy_conv_half_wide(xstr * xs)572 anthy_conv_half_wide(xstr *xs)
573 {
574 int i;
575 xstr *res;
576 for (i = 0; i < xs->len; i++) {
577 if (!anthy_lookup_half_wide(xs->str[i])) {
578 return NULL;
579 }
580 }
581 res = anthy_xstr_dup(xs);
582 for (i = 0; i < xs->len; i++) {
583 res->str[i] = anthy_lookup_half_wide(xs->str[i]);
584 }
585 return res;
586 }
587
588 int
anthy_xstr_hash(xstr * xs)589 anthy_xstr_hash(xstr *xs)
590 {
591 int h,i;
592 h = 0;
593 for (i = 0 ;i < xs->len ;i++) {
594 h *= 97;
595 h += xs->str[i]<<4;
596 h += xs->str[i]>>4;
597 }
598 if (h < 0) {
599 return -h;
600 }
601 return h;
602 }
603
604 static char *
conv_cstr(const char * s,int from,int to)605 conv_cstr(const char *s, int from, int to)
606 {
607 char *res;
608 xstr *xs = anthy_cstr_to_xstr(s, from);
609 if (!xs) {
610 return NULL;
611 }
612 res = anthy_xstr_to_cstr(xs, to);
613 anthy_free_xstr(xs);
614 return res;
615 }
616
617 char *
anthy_conv_euc_to_utf8(const char * s)618 anthy_conv_euc_to_utf8(const char *s)
619 {
620 return conv_cstr(s, ANTHY_EUC_JP_ENCODING, ANTHY_UTF8_ENCODING);
621 }
622
623 char *
anthy_conv_utf8_to_euc(const char * s)624 anthy_conv_utf8_to_euc(const char *s)
625 {
626 return conv_cstr(s, ANTHY_UTF8_ENCODING, ANTHY_EUC_JP_ENCODING);
627 }
628
629 void
anthy_xstr_set_print_encoding(int encoding)630 anthy_xstr_set_print_encoding(int encoding)
631 {
632 print_encoding = encoding;
633 }
634
/** Initialise this module; there is currently nothing to set up.
 *
 * @return always 0.
 */
int
anthy_init_xstr(void)
{
  const int status = 0;

  return status;
}
640
/** Shut this module down; there is currently nothing to release. */
void
anthy_quit_xstr(void)
{
  /* intentionally empty */
}
644