1 /*
2
3 Copyright (c) 2003-2013 uim Project https://github.com/uim/uim
4
5 All rights reserved.
6
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
9 are met:
10
11 1. Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
13 2. Redistributions in binary form must reproduce the above copyright
14 notice, this list of conditions and the following disclaimer in the
15 documentation and/or other materials provided with the distribution.
16 3. Neither the name of authors nor the names of its contributors
17 may be used to endorse or promote products derived from this software
18 without specific prior written permission.
19
20 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND
21 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE
24 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 SUCH DAMAGE.
31
32 */
33
34 #ifdef HAVE_CONFIG_H
35 #include <config.h>
36 #endif
37 #include <stdio.h>
38 #if (!defined(DEBUG) && !defined(NDEBUG))
39 #define NDEBUG
40 #endif
41 #ifdef HAVE_ASSERT_H
42 #include <assert.h>
43 #endif
44 #ifdef HAVE_LOCALE_H
45 #include <locale.h>
46 #endif
47 #ifdef HAVE_WCHAR_H
48 #include <wchar.h>
49 #endif
50 #ifdef HAVE_STRING_H
51 #include <string.h>
52 #endif
53 #ifdef HAVE_STRINGS_H
54 #include <strings.h>
55 #endif
56 #ifdef HAVE_CTYPE_H
57 #include <ctype.h>
58 #endif
59 #ifdef HAVE_STDLIB_H
60 #include <stdlib.h>
61 #endif
62 #ifdef HAVE_LANGINFO_CODESET
63 #include <langinfo.h>
64 #endif
65 #include "uim-fep.h"
66 #include "str.h"
67
/*
 * Smaller of two values.  Each argument may be evaluated twice, so pass
 * only expressions without side effects.
 */
#define min(x, y) ((y) > (x) ? (x) : (y))

/* Nonzero when the encoding reported by get_enc() is UTF-8; set by init_str(). */
static int s_utf8;

/* Helpers private to this file. */
static int byte2width(char *str, int n);
static int byte2width2(char *str, int n);
static int str2wcstr(const char *str, wchar_t **wcstr);
75
init_str(void)76 void init_str(void)
77 {
78 const char *enc;
79
80 if (setlocale(LC_CTYPE, "") == NULL) {
81 printf("locale not supported\n");
82 exit(EXIT_FAILURE);
83 }
84
85 enc = get_enc();
86 s_utf8 = (strcasecmp(enc, "UTF-8") == 0 || strcasecmp(enc, "UTF8") == 0);
87 }
88
/*
 * Return the name of the character encoding of the current locale.
 *
 * - On Cygwin (__CYGWIN32__) the answer is always "EUC-JP".
 * - With HAVE_LANGINFO_CODESET the answer is nl_langinfo(CODESET).
 * - Otherwise the LC_CTYPE locale name is inspected: "ja" (any case)
 *   yields "EUC-JP", a name containing '.' yields the text after the
 *   first '.', and any other name yields "UTF-8".
 */
const char *get_enc(void)
{
#if defined(__CYGWIN32__)
  return "EUC-JP";
#elif defined(HAVE_LANGINFO_CODESET)
  return nl_langinfo(CODESET);
#else
  const char *dot;
  char *locale = setlocale(LC_CTYPE, NULL);

  assert(locale != NULL);

  /* A bare "ja" locale name is taken to mean EUC-JP. */
  if (strcasecmp(locale, "ja") == 0) {
    return "EUC-JP";
  }

  /* Use the part after the first '.', falling back to UTF-8. */
  dot = strchr(locale, '.');
  if (dot == NULL) {
    return "UTF-8";
  }
  return dot + 1;
#endif
}
116
/*
 * Convert the multibyte string str into a newly allocated wide string.
 *
 * str:   NUL-terminated multibyte string; must not be NULL.
 * wcstr: receives the result.  It is set to NULL when str is empty;
 *        otherwise it points to a buffer obtained from uim_malloc()
 *        (callers in this file release it with free()).
 *
 * Returns the number of wide characters stored (0 for an empty string),
 * or -1 when mbstowcs() rejects str.  In the failure case the buffer is
 * still allocated and holds an empty wide string, so callers never see
 * uninitialized elements even when assert() is compiled out.
 */
static int str2wcstr(const char *str, wchar_t **wcstr)
{
  size_t str_byte;
  size_t nr_wchars;

  assert(str != NULL);

  str_byte = strlen(str);
  if (str_byte == 0) {
    *wcstr = NULL;
    return 0;
  }

  /* A multibyte string never yields more wide characters than bytes. */
  *wcstr = uim_malloc(sizeof(wchar_t) * (str_byte + 1));
  nr_wchars = mbstowcs(*wcstr, str, str_byte);
  assert(nr_wchars != (size_t)-1);

  if (nr_wchars == (size_t)-1) {
    /* Conversion failed part-way: discard the partial output. */
    (*wcstr)[0] = L'\0';
    return -1;
  }

  /* mbstowcs() omits the terminator when exactly str_byte characters fit. */
  (*wcstr)[nr_wchars] = L'\0';

  return (int)nr_wchars;
}
138
/*
 * Return the width of the common leading part of str1 and str2.
 * The strings are compared byte by byte and the number of matching bytes
 * is converted to a width with byte2width(), so a match that ends inside
 * a multibyte character only counts the complete characters before it.
 *   compare_str("a", "b") = 0
 *   compare_str("a", "ab") = 1
 *   compare_str("aあ", "aあ") = 3
 *   compare_str("い(0xa4a4)", "あ(0xa4a2)") = 0
 * Both arguments must be non-NULL.
 */
int compare_str(char *str1, char *str2)
{
  int len1;
  int len2;
  int limit;
  int same = 0;

  assert(str1 != NULL && str2 != NULL);

  len1 = strlen(str1);
  len2 = strlen(str2);
  limit = min(len1, len2);

  /* Count the leading bytes that are identical in both strings. */
  while (same < limit && str1[same] == str2[same]) {
    same++;
  }

  return byte2width(str1, same);
}
165
/*
 * Return the width of the common trailing part of str1 and str2.
 *   compare_str_rev("a", "b") = 0
 *   compare_str_rev("a", "ba") = 1
 *   compare_str_rev("aあ", "baあ") = 3
 *   compare_str_rev("□(0xa2a2)", "あ(0xa4a2)") = 0
 * The strings are compared byte by byte from the end.  The tail width of
 * each string is its full width minus the width of the part in front of
 * the matching bytes (measured with byte2width2()).  If the two tail
 * widths disagree, 0 is returned.
 * Both arguments must be non-NULL.
 */
int compare_str_rev(char *str1, char *str2)
{
  int len1;
  int len2;
  int limit;
  int back = 1; /* 1-based distance from the end of each string */
  int tail1;
  int tail2;

  assert(str1 != NULL && str2 != NULL);

  len1 = strlen(str1);
  len2 = strlen(str2);
  limit = min(len1, len2);

  /* Stop at the first differing byte, or one past the shorter string. */
  while (back <= limit && str1[len1 - back] == str2[len2 - back]) {
    back++;
  }

  tail1 = strwidth(str1) - byte2width2((char *)str1, len1 - back + 1);
  tail2 = strwidth(str2) - byte2width2((char *)str2, len2 - back + 1);

  if (tail1 != tail2) {
    return 0;
  }
  return tail1;
}
196
/*
 * Return the width of a string.
 * strwidth("abc") = 3
 * strwidth("あa") = 3
 * strwidth("") = 0
 */
203 #if defined(HAVE_WCSWIDTH) && !defined(__CYGWIN32__)
/*
 * wcswidth()-based implementation.
 *
 * str: NUL-terminated multibyte string; must not be NULL.
 *
 * Returns the value of wcswidth() for the converted string.  Returns 0
 * when str is empty or when str2wcstr() reports a conversion failure
 * (negative count); in that case the wide buffer is not inspected.
 */
int strwidth(const char *str)
{
  int width;
  wchar_t *wcstr = NULL;
  int nr_wchars;

  assert(str != NULL);

  nr_wchars = str2wcstr(str, &wcstr);

  if (nr_wchars <= 0) {
    /*
     * Empty string (wcstr is NULL) or failed conversion (wcstr is
     * allocated but its contents must not be measured).
     */
    free(wcstr);
    return 0;
  }

  width = wcswidth(wcstr, nr_wchars);
  assert(width != -1);
  free(wcstr);
  return width;
}
223 #else
strwidth(const char * str)224 int strwidth(const char *str)
225 {
226 int width = 0;
227
228 assert(str != NULL);
229
230 while (*str != '\0') {
231 if (isascii((unsigned char)*str)) {
232 width++;
233 str++;
234 } else {
235 if (s_utf8) {
236 width += 2;
237 str += 3;
238 } else {
239 /* euc-jp */
240 if ((unsigned char)*str == 0x8e) {
241 /* 半角カタカナ */
242 width++;
243 str += 2;
244 } else if ((unsigned char)*str == 0x8f) {
245 /* G3 */
246 width += 2;
247 str += 3;
248 } else {
249 width += 2;
250 str += 2;
251 }
252 }
253 }
254 }
255 return width;
256 }
257 #endif
258
/*
 * Let substr be the longest leading substring of str that is at most
 * n bytes long; return strwidth(substr).
 * byte2width("abc", 2) = 2
 * byte2width("ああ", 3) = 2 (euc)
 * byte2width("ああ", 4) = 4 (euc)
 * byte2width("ああ", 6) = 4 (euc)
 * byte2width("ああ", 4) = 2 (utf8)
 * byte2width("ああ", 5) = 2 (utf8)
 * byte2width("ああ", 6) = 4 (utf8)
 */
270 #if defined(HAVE_WCSWIDTH) && !defined(__CYGWIN32__)
/*
 * mbsrtowcs()/wcswidth()-based implementation.
 *
 * str: NUL-terminated multibyte string; must not be NULL and must be
 *      writable, because a NUL is temporarily stored into it (and then
 *      restored) to cut it at the requested position.
 * n:   byte limit; values larger than strlen(str) are reduced to it.
 *
 * Returns 0 when n <= 0 or str is empty.  Otherwise the first n bytes
 * are converted; if that succeeds their wcswidth() is returned, and if
 * the conversion fails the width of the part in front of the point where
 * mbsrtowcs() stopped is returned.
 */
static int byte2width(char *str, int n)
{
  const char *src;
  wchar_t *wbuf;
  size_t converted;
  int total;
  int cols;
  char held;

  assert(str != NULL);

  if (n <= 0) {
    return 0;
  }

  total = strlen(str);
  if (total == 0) {
    return 0;
  }

  if (n > total) {
    n = total;
  }

  wbuf = uim_malloc(sizeof(wchar_t) * total);

  /* Cut the string after n bytes for the duration of the conversion. */
  held = str[n];
  str[n] = '\0';
  src = str;
  converted = mbsrtowcs(wbuf, &src, total, NULL);
  str[n] = held;

  if (converted == (size_t)-1) {
    /*
     * src was left at the sequence that could not be converted; measure
     * only what precedes it.
     */
    char *stop = str + (src - str);

    held = *stop;
    *stop = '\0';
    cols = strwidth(str);
    *stop = held;
  } else {
    cols = wcswidth(wbuf, converted);
  }

  free(wbuf);
  assert(cols >= 0);
  return cols;
}
316 #else
byte2width(char * str,int n)317 static int byte2width(char *str, int n)
318 {
319 int width = 0;
320 int byte = 0;
321 int char_width;
322 int char_byte;
323
324 assert(str != NULL);
325
326 if (n <= 0) {
327 return 0;
328 }
329
330 while (*str != '\0') {
331 if (isascii((unsigned char)*str)) {
332 char_width = 1;
333 char_byte = 1;
334 } else {
335 if (s_utf8) {
336 char_byte = 3;
337 char_width = 2;
338 } else {
339 /* euc-jp */
340 if ((unsigned char)*str == 0x8e) {
341 /* 半角カタカナ */
342 char_width = 1;
343 char_byte = 2;
344 } else if ((unsigned char)*str == 0x8f) {
345 /* G3 */
346 char_width = 2;
347 char_byte = 3;
348 } else {
349 char_width = 2;
350 char_byte = 2;
351 }
352 }
353 }
354 byte += char_byte;
355 if (byte == n) {
356 width += char_width;
357 break;
358 } else if (byte > n) {
359 break;
360 }
361 width += char_width;
362 str += char_byte;
363 }
364 return width;
365 }
366 #endif
367
/*
 * Let substr be the shortest leading substring of str that is at least
 * n bytes long; return strwidth(substr).
 * If n > strlen(str), substr = str.
 * byte2width2("abc", 2) = 2
 * byte2width2("ああ", 3) = 4 (euc)
 * byte2width2("ああ", 4) = 4 (euc)
 * byte2width2("ああ", 6) = 4 (euc)
 * byte2width2("ああ", 4) = 4 (utf8)
 * byte2width2("ああ", 5) = 4 (utf8)
 * byte2width2("ああ", 6) = 4 (utf8)
 */
380 #if defined(HAVE_WCSWIDTH) && !defined(__CYGWIN32__)
/*
 * mbsrtowcs()/wcswidth()-based implementation.
 *
 * str: NUL-terminated multibyte string; must not be NULL and must be
 *      writable, because a NUL is temporarily stored into it (and then
 *      restored) to cut it at the requested position.
 * n:   byte count; values larger than strlen(str) are reduced to it.
 *
 * Returns 0 when n <= 0 or str is empty.  Otherwise the first n bytes
 * are converted; if that succeeds their wcswidth() is returned.  If the
 * conversion fails, one more character is converted from the point of
 * failure in the uncut string and the width of everything in front of
 * the resulting position is returned.
 */
static int byte2width2(char *str, int n)
{
  const char *src;
  wchar_t *wbuf;
  size_t converted;
  int total;
  int cols;
  char held;

  assert(str != NULL);

  if (n <= 0) {
    return 0;
  }

  total = strlen(str);
  if (total == 0) {
    return 0;
  }

  if (n > total) {
    n = total;
  }

  wbuf = uim_malloc(sizeof(wchar_t) * total);

  /* Cut the string after n bytes for the duration of the conversion. */
  held = str[n];
  str[n] = '\0';
  src = str;
  converted = mbsrtowcs(wbuf, &src, total, NULL);
  str[n] = held;

  if (converted == (size_t)-1) {
    char *stop;

    /* Step over one more character, now that the cut has been undone. */
    mbsrtowcs(wbuf, &src, 1, NULL);
    /* src becomes NULL only when the string was converted to its end. */
    assert(src != NULL);

    stop = str + (src - str);
    held = *stop;
    *stop = '\0';
    cols = strwidth(str);
    *stop = held;
  } else {
    cols = wcswidth(wbuf, converted);
  }

  free(wbuf);
  assert(cols >= 0);
  return cols;
}
429 #else
byte2width2(char * str,int n)430 static int byte2width2(char *str, int n)
431 {
432 int width = 0;
433 int byte = 0;
434 int char_width;
435 int char_byte;
436
437 assert(str != NULL);
438
439 if (n <= 0) {
440 return 0;
441 }
442
443 while (*str != '\0') {
444 if (isascii((unsigned char)*str)) {
445 char_width = 1;
446 char_byte = 1;
447 } else {
448 if (s_utf8) {
449 char_byte = 3;
450 char_width = 2;
451 } else {
452 /* euc-jp */
453 if ((unsigned char)*str == 0x8e) {
454 /* 半角カタカナ */
455 char_width = 1;
456 char_byte = 2;
457 } else if ((unsigned char)*str == 0x8f) {
458 /* G3 */
459 char_width = 2;
460 char_byte = 3;
461 } else {
462 char_width = 2;
463 char_byte = 2;
464 }
465 }
466 }
467 byte += char_byte;
468 width += char_width;
469 if (byte >= n) {
470 break;
471 }
472 str += char_byte;
473 }
474 return width;
475 }
476 #endif
477
/*
 * Return value: rval[2]
 * Let substr be the longest leading substring of str whose width is at
 * most n; then
 * rval[0] = number of bytes in substr
 * rval[1] = width of substr
 * width2byte("ああ", 3) = [2, 2] (euc)
 * width2byte("ああ", 4) = [4, 4] (euc)
 * width2byte("ああ", 6) = [4, 4] (euc)
 * width2byte("ああ", 3) = [3, 2] (utf8)
 * width2byte("ああ", 4) = [6, 4] (utf8)
 */
489 #if defined(HAVE_WCSWIDTH) && !defined(__CYGWIN32__)
/*
 * wcswidth()-based implementation.
 *
 * str: NUL-terminated multibyte string; must not be NULL.
 * n:   width limit; clamped to the range [0, strlen(str)].
 *
 * Returns a pointer to a static two-element array (overwritten by the
 * next call): [0] is the byte length and [1] the width of the longest
 * prefix, counted in whole wide characters, whose wcswidth() is <= n.
 * An empty str yields [0, 0].
 */
int *width2byte(const char *str, int n)
{
  static int rval[2];
  wchar_t *wide;
  int bytes;
  int count;
  int cols = 0;
  int k;

  assert(str != NULL);

  if (n < 0) {
    n = 0;
  }

  bytes = strlen(str);
  if (bytes == 0) {
    rval[0] = rval[1] = 0;
    return rval;
  }

  if (n > bytes) {
    n = bytes;
  }

  count = str2wcstr(str, &wide);

  /* Shrink the prefix one wide character at a time until it fits. */
  k = count;
  while (k >= 0) {
    cols = wcswidth(wide, k);
    if (cols <= n) {
      /* Truncate here and ask how many bytes the prefix occupies. */
      wide[k] = L'\0';
      bytes = wcstombs(NULL, wide, 0);
      break;
    }
    k--;
  }

  assert((size_t)bytes != (size_t)-1 && cols >= 0);
  rval[0] = bytes;
  rval[1] = cols;
  free(wide);
  return rval;
}
531 #else
width2byte(const char * str,int n)532 int *width2byte(const char *str, int n)
533 {
534 int width = 0;
535 int byte = 0;
536 int char_width;
537 int char_byte;
538 static int rval[2];
539
540 assert(str != NULL);
541
542 for (; *str != '\0'; str++) {
543 if (isascii((unsigned char)*str)) {
544 char_width = 1;
545 char_byte = 1;
546 } else {
547 if (s_utf8) {
548 char_byte = 3;
549 char_width = 2;
550 } else {
551 if ((unsigned char)*str == 0x8e) {
552 char_width = 1;
553 char_byte = 2;
554 } else if ((unsigned char)*str == 0x8f) {
555 /* G3 */
556 char_width = 2;
557 char_byte = 3;
558 } else {
559 char_width = 2;
560 char_byte = 2;
561 }
562 }
563 }
564 if (width + char_width == n) {
565 width += char_width;
566 byte += char_byte;
567 break;
568 } else if (width + char_width > n) {
569 break;
570 }
571 width += char_width;
572 str += char_byte - 1;
573 byte += char_byte;
574 }
575 rval[0] = byte;
576 rval[1] = width;
577 return rval;
578 }
579 #endif
580
/*
 * Return value: rval[2]
 * Let substr be the shortest leading substring of str whose width is at
 * least n; then
 * rval[0] = number of bytes in substr
 * rval[1] = width of substr
 * If n > strwidth(str), substr = str.
 * width2byte2("ああ", 1) = [2, 2] (euc)
 * width2byte2("ああ", 3) = [4, 4] (euc)
 * width2byte2("ああ", 6) = [4, 4] (euc)
 * width2byte2("ああ", 1) = [3, 2] (utf8)
 * width2byte2("ああ", 4) = [6, 4] (utf8)
 */
593 #if defined(HAVE_WCSWIDTH) && !defined(__CYGWIN32__)
/*
 * wcswidth()-based implementation.
 *
 * str: NUL-terminated multibyte string; must not be NULL.
 * n:   requested width; clamped to the range [0, strlen(str)].
 *
 * Returns a pointer to a static two-element array (overwritten by the
 * next call): [0] is the byte length and [1] the width of the shortest
 * prefix, counted in whole wide characters, whose wcswidth() is >= n.
 * When no prefix reaches n, [0] is strlen(str) and [1] is the last width
 * computed.  An empty str yields [0, 0].
 */
int *width2byte2(const char *str, int n)
{
  static int rval[2];
  wchar_t *wide;
  int bytes;
  int count;
  int cols = 0;
  int k;

  assert(str != NULL);

  if (n < 0) {
    n = 0;
  }

  bytes = strlen(str);
  if (bytes == 0) {
    rval[0] = rval[1] = 0;
    return rval;
  }

  if (n > bytes) {
    n = bytes;
  }

  count = str2wcstr(str, &wide);

  /* Grow the prefix one wide character at a time until it is wide enough. */
  k = 0;
  while (k <= count) {
    cols = wcswidth(wide, k);
    if (cols >= n) {
      /* Truncate here and ask how many bytes the prefix occupies. */
      wide[k] = L'\0';
      bytes = wcstombs(NULL, wide, 0);
      break;
    }
    k++;
  }

  assert((size_t)bytes != (size_t)-1 && cols >= 0);
  rval[0] = bytes;
  rval[1] = cols;
  free(wide);
  return rval;
}
635 #else
width2byte2(const char * str,int n)636 int *width2byte2(const char *str, int n)
637 {
638 int width = 0;
639 int byte = 0;
640 int char_width;
641 int char_byte;
642 static int rval[2];
643
644 assert(str != NULL);
645
646 for (; *str != '\0'; str++) {
647 if (isascii((unsigned char)*str)) {
648 char_width = 1;
649 char_byte = 1;
650 } else {
651 if (s_utf8) {
652 char_byte = 3;
653 char_width = 2;
654 } else {
655 if ((unsigned char)*str == 0x8e) {
656 char_width = 1;
657 char_byte = 2;
658 } else if ((unsigned char)*str == 0x8f) {
659 /* G3 */
660 char_width = 2;
661 char_byte = 3;
662 } else {
663 char_width = 2;
664 char_byte = 2;
665 }
666 }
667 }
668 if (width + char_width >= n) {
669 width += char_width;
670 byte += char_byte;
671 break;
672 }
673 width += char_width;
674 str += char_byte - 1;
675 byte += char_byte;
676 }
677 rval[0] = byte;
678 rval[1] = width;
679 return rval;
680 }
681 #endif
682
/*
 * Truncate str in place to its longest leading substring whose width is
 * at most n, i.e. with substr as found by width2byte(str, n),
 *   str[strlen(substr)] = '\0'
 * and return strwidth(substr).
 *   strhead("ああ", 3) = 2 , str = "あ"
 *   strhead("ああ", 4) = 4 , str = "ああ"
 *   strhead("ああ", 6) = 4 , str = "ああ"
 */
int strhead(char *str, int n)
{
  int *fit = width2byte(str, n);
  int bytes = fit[0];
  int cols = fit[1];

  assert(0 <= bytes && bytes <= (int)strlen(str));

  str[bytes] = '\0';
  return cols;
}
698
/*
 * Find the rightmost occurrence of needle within the first haystack_len
 * bytes of haystack and return a pointer to the byte just after it.
 *
 * haystack:     buffer to search; must not be NULL and may contain '\0'.
 * needle:       NUL-terminated string to look for.
 * haystack_len: number of bytes of haystack to consider.
 *
 * Returns NULL when needle is NULL or empty, or when it does not occur.
 *
 * The buffer is scanned from right to left so that the rightmost match
 * is found even when occurrences of needle overlap each other
 * (e.g. needle "aa" in "aaa").
 */
char *rstrstr_len(const char *haystack, const char *needle, int haystack_len)
{
  int needle_len;
  int start;

  assert(haystack != NULL);

  if (needle == NULL) {
    return NULL;
  }

  needle_len = strlen(needle);
  if (needle_len <= 0) {
    return NULL;
  }

  /* start is negative from the outset when needle is longer than haystack. */
  for (start = haystack_len - needle_len; start >= 0; start--) {
    if (memcmp(haystack + start, needle, needle_len) == 0) {
      return (char *)haystack + start + needle_len;
    }
  }
  return NULL;
}
724
/*
 * strstr() for a haystack that may contain '\0'.
 *
 * haystack:     buffer to search; must not be NULL.
 * needle:       NUL-terminated string to look for; must not be NULL.
 * haystack_len: number of bytes of haystack to consider.
 *
 * Returns a pointer to the first occurrence of needle, or NULL when
 * there is none.  An empty needle matches at the start of haystack.
 */
char *strstr_len(const char *haystack, const char *needle, int haystack_len)
{
  int needle_len;
  int last; /* highest offset at which needle still fits */
  int pos;

  assert(haystack != NULL && needle != NULL);

  needle_len = strlen(needle);
  last = haystack_len - needle_len;

  for (pos = 0; pos <= last; pos++) {
    if (memcmp(haystack + pos, needle, needle_len) == 0) {
      return (char *)haystack + pos;
    }
  }
  return NULL;
}
752
#define TAB_WIDTH 4
/*
 * Return a copy of tabstr in which every tab is replaced by TAB_WIDTH
 * spaces.  The copy is obtained from uim_malloc(); the caller is
 * expected to free it.
 */
char *tab2space(const char *tabstr)
{
  const char *src;
  char *spacestr;
  char *dst;
  int tabstr_len = strlen(tabstr);
  int tabcount = 0;

  /* First pass: count the tabs so the result can be sized exactly. */
  for (src = tabstr; *src != '\0'; src++) {
    if (*src == '\t') {
      tabcount++;
    }
  }

  spacestr = uim_malloc((tabstr_len - tabcount) + (TAB_WIDTH * tabcount) + 1);

  /* Second pass: copy, expanding each tab. */
  dst = spacestr;
  for (src = tabstr; *src != '\0'; src++) {
    if (*src == '\t') {
      memset(dst, ' ', TAB_WIDTH);
      dst += TAB_WIDTH;
    } else {
      *dst++ = *src;
    }
  }
  *dst = '\0';

  return spacestr;
}
787