/* Copyright(C) 2004 Brazil

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License as published by the Free Software Foundation; either
  version 2.1 of the License, or (at your option) any later version.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
17 #include "senna_in.h"
18 #include <stdio.h>
19 #include <string.h>
20 #include "ctx.h"
21 #include "str.h"
22 #include "set.h"
23
24 #ifndef __USE_ISOC99
25 #define __USE_ISOC99
26 #endif /* __USE_ISOC99 */
27 #include <math.h>
28
29 static sen_set *prefix = NULL;
30 static sen_set *suffix = NULL;
31
32 #define N_PREFIX 2048
33 #define N_SUFFIX 0
34
35 #define PREFIX_PATH SENNA_HOME PATH_SEPARATOR "prefix"
36 #define SUFFIX_PATH SENNA_HOME PATH_SEPARATOR "suffix"
37
38 inline static void
prefix_init(void)39 prefix_init(void)
40 {
41 int i, *ip;
42 FILE *fp;
43 char buffer[4];
44 prefix = sen_set_open(2, sizeof(int), 0);
45 if (!prefix) { SEN_LOG(sen_log_alert, "sen_set_open on prefix_init failed !"); return; }
46 if ((fp = fopen(PREFIX_PATH, "r"))) {
47 for (i = 0; i < N_PREFIX; i++) {
48 if (!fgets(buffer, 4, fp)) { break; }
49 sen_set_get(prefix, buffer, (void **)&ip);
50 *ip = i;
51 }
52 fclose(fp);
53 }
54 }
55
56 inline static void
suffix_init(void)57 suffix_init(void)
58 {
59 int i;
60 FILE *fp;
61 char buffer[4];
62 suffix = sen_set_open(2, 0, 0);
63 if (!suffix) { SEN_LOG(sen_log_alert, "sen_set_open on suffix_init failed !"); return; }
64 if ((fp = fopen(SUFFIX_PATH, "r"))) {
65 for (i = N_SUFFIX; i; i--) {
66 if (!fgets(buffer, 4, fp)) { break; }
67 sen_set_get(suffix, buffer, NULL);
68 }
69 fclose(fp);
70 }
71 }
72
73 inline size_t
sen_str_charlen_utf8(const unsigned char * str,const unsigned char * end)74 sen_str_charlen_utf8(const unsigned char *str, const unsigned char *end)
75 {
76 /* MEMO: This function allows non-null-terminated string as str. */
77 /* But requires the end of string. */
78 const unsigned char *p = str;
79 if (!*p || p >= end) { return 0; }
80 if (*p & 0x80) {
81 int b, w;
82 size_t size;
83 for (b = 0x40, w = 0; b && (*p & b); b >>= 1, w++);
84 if (!w) {
85 SEN_LOG(sen_log_warning, "invalid utf8 string(1) on sen_str_charlen_utf8");
86 return 0;
87 }
88 for (size = 1; w--; size++) {
89 if (++p >= end || !*p || (*p & 0xc0) != 0x80) {
90 SEN_LOG(sen_log_warning, "invalid utf8 string(2) on sen_str_charlen_utf8");
91 return 0;
92 }
93 }
94 return size;
95 } else {
96 return 1;
97 }
98 return 0;
99 }
100
101 unsigned int
sen_str_charlen(const char * str,sen_encoding encoding)102 sen_str_charlen(const char *str, sen_encoding encoding)
103 {
104 /* MEMO: This function requires null-terminated string as str.*/
105 unsigned char *p = (unsigned char *) str;
106 if (!*p) { return 0; }
107 switch (encoding) {
108 case sen_enc_euc_jp :
109 if (*p & 0x80) {
110 if (*(p + 1)) {
111 return 2;
112 } else {
113 /* This is invalid character */
114 SEN_LOG(sen_log_warning, "invalid euc-jp string end on sen_str_charlen");
115 return 0;
116 }
117 }
118 return 1;
119 break;
120 case sen_enc_utf8 :
121 if (*p & 0x80) {
122 int b, w;
123 size_t size;
124 for (b = 0x40, w = 0; b && (*p & b); b >>= 1, w++);
125 if (!w) {
126 SEN_LOG(sen_log_warning, "invalid utf8 string(1) on sen_str_charlen");
127 return 0;
128 }
129 for (size = 1; w--; size++) {
130 if (!*++p || (*p & 0xc0) != 0x80) {
131 SEN_LOG(sen_log_warning, "invalid utf8 string(2) on sen_str_charlen");
132 return 0;
133 }
134 }
135 return size;
136 } else {
137 return 1;
138 }
139 break;
140 case sen_enc_sjis :
141 if (*p & 0x80) {
142 /* we regard 0xa0 as JIS X 0201 KANA. adjusted to other tools. */
143 if (0xa0 <= *p && *p <= 0xdf) {
144 /* hankaku-kana */
145 return 1;
146 } else if (!(*(p + 1))) {
147 /* This is invalid character */
148 SEN_LOG(sen_log_warning, "invalid sjis string end on sen_str_charlen");
149 return 0;
150 } else {
151 return 2;
152 }
153 } else {
154 return 1;
155 }
156 break;
157 default :
158 return 1;
159 break;
160 }
161 return 0;
162 }
163
164 size_t
sen_str_charlen_nonnull(const char * str,const char * end,sen_encoding encoding)165 sen_str_charlen_nonnull(const char *str, const char *end, sen_encoding encoding)
166 {
167 /* MEMO: This function allows non-null-terminated string as str. */
168 /* But requires the end of string. */
169 unsigned char *p = (unsigned char *) str;
170 if (p >= (unsigned char *)end) { return 0; }
171 switch (encoding) {
172 case sen_enc_euc_jp :
173 if (*p & 0x80) {
174 if ((p + 1) < (unsigned char *)end) {
175 return 2;
176 } else {
177 /* This is invalid character */
178 SEN_LOG(sen_log_warning, "invalid euc-jp string end on sen_str_charlen_nonnull");
179 return 0;
180 }
181 }
182 return 1;
183 break;
184 case sen_enc_utf8 :
185 return sen_str_charlen_utf8(p, (unsigned char *)end);
186 break;
187 case sen_enc_sjis :
188 if (*p & 0x80) {
189 /* we regard 0xa0 as JIS X 0201 KANA. adjusted to other tools. */
190 if (0xa0 <= *p && *p <= 0xdf) {
191 /* hankaku-kana */
192 return 1;
193 } else if (++p >= (unsigned char *)end) {
194 /* This is invalid character */
195 SEN_LOG(sen_log_warning, "invalid sjis string end on sen_str_charlen_nonnull");
196 return 0;
197 } else {
198 return 2;
199 }
200 } else {
201 return 1;
202 }
203 break;
204 default :
205 return 1;
206 break;
207 }
208 return 0;
209 }
210
211 sen_rc
sen_str_fin(void)212 sen_str_fin(void)
213 {
214 if (prefix) { sen_set_close(prefix); }
215 if (suffix) { sen_set_close(suffix); }
216 return sen_success;
217 }
218
219 int
sen_str_get_prefix_order(const char * str)220 sen_str_get_prefix_order(const char *str)
221 {
222 int *ip;
223 if (!str) { return -1; }
224 if (!prefix) { prefix_init(); }
225 if (prefix && sen_set_at(prefix, str, (void **)&ip)) {
226 return *ip;
227 } else {
228 return -1;
229 }
230 }
231
/*
 * ASCII replacements for two-byte symbol characters.
 * normalize_euc() indexes it with (c2 - 0xa4); normalize_sjis() with
 * (c2 - 0x43) or (c2 - 0x44).  An entry of 0 means "no ASCII replacement".
 * The leading comment on each row is the index of its first element.
 */
static unsigned char symbol[] = {
  /*  0 */ ',',  '.',  0,    ':',  ';',  '?',  '!',  0,
  /*  8 */ 0,    0,    '`',  0,    '^',  '~',  '_',  0,
  /* 16 */ 0,    0,    0,    0,    0,    0,    0,    0,
  /* 24 */ 0,    '-',  '-',  '/',  '\\', 0,    0,    '|',
  /* 32 */ 0,    0,    0,    '\'', 0,    '"',  '(',  ')',
  /* 40 */ 0,    0,    '[',  ']',  '{',  '}',  0,    0,
  /* 48 */ 0,    0,    0,    0,    0,    0,    0,    0,
  /* 56 */ '+',  '-',  0,    0,    0,    '=',  0,    '<',
  /* 64 */ '>',  0,    0,    0,    0,    0,    0,    0,
  /* 72 */ 0,    0,    0,    0,    '$',  0,    0,    '%',
  /* 80 */ '#',  '&',  '*',  '@',  0,    0,    0,    0,
  /* 88 */ 0,    0,    0,    0
};
239
240 inline static sen_rc
normalize_euc(sen_nstr * nstr)241 normalize_euc(sen_nstr *nstr)
242 {
243 static uint16_t hankana[] = {
244 0xa1a1, 0xa1a3, 0xa1d6, 0xa1d7, 0xa1a2, 0xa1a6, 0xa5f2, 0xa5a1, 0xa5a3,
245 0xa5a5, 0xa5a7, 0xa5a9, 0xa5e3, 0xa5e5, 0xa5e7, 0xa5c3, 0xa1bc, 0xa5a2,
246 0xa5a4, 0xa5a6, 0xa5a8, 0xa5aa, 0xa5ab, 0xa5ad, 0xa5af, 0xa5b1, 0xa5b3,
247 0xa5b5, 0xa5b7, 0xa5b9, 0xa5bb, 0xa5bd, 0xa5bf, 0xa5c1, 0xa5c4, 0xa5c6,
248 0xa5c8, 0xa5ca, 0xa5cb, 0xa5cc, 0xa5cd, 0xa5ce, 0xa5cf, 0xa5d2, 0xa5d5,
249 0xa5d8, 0xa5db, 0xa5de, 0xa5df, 0xa5e0, 0xa5e1, 0xa5e2, 0xa5e4, 0xa5e6,
250 0xa5e8, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5ef, 0xa5f3, 0xa1ab,
251 0xa1eb
252 };
253 static unsigned char dakuten[] = {
254 0xf4, 0, 0, 0, 0, 0xac, 0, 0xae, 0, 0xb0, 0, 0xb2, 0, 0xb4, 0, 0xb6, 0,
255 0xb8, 0, 0xba, 0, 0xbc, 0, 0xbe, 0, 0xc0, 0, 0xc2, 0, 0, 0xc5, 0, 0xc7,
256 0, 0xc9, 0, 0, 0, 0, 0, 0, 0xd0, 0, 0, 0xd3, 0, 0, 0xd6, 0, 0, 0xd9, 0,
257 0, 0xdc
258 };
259 static unsigned char handaku[] = {
260 0xd1, 0, 0, 0xd4, 0, 0, 0xd7, 0, 0, 0xda, 0, 0, 0xdd
261 };
262 int16_t *ch;
263 sen_ctx *ctx = nstr->ctx;
264 const unsigned char *s, *s_, *e;
265 unsigned char *d, *d0, *d_, b;
266 uint_least8_t *cp, *ctypes, ctype;
267 size_t size = nstr->orig_blen, length = 0;
268 int removeblankp = nstr->flags & SEN_STR_REMOVEBLANK;
269 if (!(nstr->norm = SEN_MALLOC(size * 2 + 1))) {
270 return sen_memory_exhausted;
271 }
272 d0 = (unsigned char *) nstr->norm;
273 if (nstr->flags & SEN_STR_WITH_CHECKS) {
274 if (!(nstr->checks = SEN_MALLOC(size * 2 * sizeof(int16_t) + 1))) {
275 SEN_FREE(nstr->norm);
276 nstr->norm = NULL;
277 return sen_memory_exhausted;
278 }
279 }
280 ch = nstr->checks;
281 if (nstr->flags & SEN_STR_WITH_CTYPES) {
282 if (!(nstr->ctypes = SEN_MALLOC(size + 1))) {
283 SEN_FREE(nstr->checks);
284 SEN_FREE(nstr->norm);
285 nstr->checks = NULL;
286 nstr->norm = NULL;
287 return sen_memory_exhausted;
288 }
289 }
290 cp = ctypes = nstr->ctypes;
291 e = (unsigned char *)nstr->orig + size;
292 for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) {
293 if ((*s & 0x80)) {
294 if (((s + 1) < e) && (*(s + 1) & 0x80)) {
295 unsigned char c1 = *s++, c2 = *s, c3 = 0;
296 switch (c1 >> 4) {
297 case 0x08 :
298 if (c1 == 0x8e && 0xa0 <= c2 && c2 <= 0xdf) {
299 uint16_t c = hankana[c2 - 0xa0];
300 switch (c) {
301 case 0xa1ab :
302 if (d > d0 + 1 && d[-2] == 0xa5
303 && 0xa6 <= d[-1] && d[-1] <= 0xdb && (b = dakuten[d[-1] - 0xa6])) {
304 *(d - 1) = b;
305 if (ch) { ch[-1] += 2; s_ += 2; }
306 continue;
307 } else {
308 *d++ = c >> 8; *d = c & 0xff;
309 }
310 break;
311 case 0xa1eb :
312 if (d > d0 + 1 && d[-2] == 0xa5
313 && 0xcf <= d[-1] && d[-1] <= 0xdb && (b = handaku[d[-1] - 0xcf])) {
314 *(d - 1) = b;
315 if (ch) { ch[-1] += 2; s_ += 2; }
316 continue;
317 } else {
318 *d++ = c >> 8; *d = c & 0xff;
319 }
320 break;
321 default :
322 *d++ = c >> 8; *d = c & 0xff;
323 break;
324 }
325 ctype = sen_str_katakana;
326 } else {
327 *d++ = c1; *d = c2;
328 ctype = sen_str_others;
329 }
330 break;
331 case 0x09 :
332 *d++ = c1; *d = c2;
333 ctype = sen_str_others;
334 break;
335 case 0x0a :
336 switch (c1 & 0x0f) {
337 case 1 :
338 switch (c2) {
339 case 0xbc :
340 *d++ = c1; *d = c2;
341 ctype = sen_str_katakana;
342 break;
343 case 0xb9 :
344 *d++ = c1; *d = c2;
345 ctype = sen_str_kanji;
346 break;
347 case 0xa1 :
348 if (removeblankp) {
349 if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
350 continue;
351 } else {
352 *d = ' ';
353 ctype = SEN_NSTR_BLANK|sen_str_symbol;
354 }
355 break;
356 default :
357 if (c2 >= 0xa4 && (c3 = symbol[c2 - 0xa4])) {
358 *d = c3;
359 ctype = sen_str_symbol;
360 } else {
361 *d++ = c1; *d = c2;
362 ctype = sen_str_others;
363 }
364 break;
365 }
366 break;
367 case 2 :
368 *d++ = c1; *d = c2;
369 ctype = sen_str_symbol;
370 break;
371 case 3 :
372 c3 = c2 - 0x80;
373 if ('a' <= c3 && c3 <= 'z') {
374 ctype = sen_str_alpha;
375 *d = c3;
376 } else if ('A' <= c3 && c3 <= 'Z') {
377 ctype = sen_str_alpha;
378 *d = c3 + 0x20;
379 } else if ('0' <= c3 && c3 <= '9') {
380 ctype = sen_str_digit;
381 *d = c3;
382 } else {
383 ctype = sen_str_others;
384 *d++ = c1; *d = c2;
385 }
386 break;
387 case 4 :
388 *d++ = c1; *d = c2;
389 ctype = sen_str_hiragana;
390 break;
391 case 5 :
392 *d++ = c1; *d = c2;
393 ctype = sen_str_katakana;
394 break;
395 case 6 :
396 case 7 :
397 case 8 :
398 *d++ = c1; *d = c2;
399 ctype = sen_str_symbol;
400 break;
401 default :
402 *d++ = c1; *d = c2;
403 ctype = sen_str_others;
404 break;
405 }
406 break;
407 default :
408 *d++ = c1; *d = c2;
409 ctype = sen_str_kanji;
410 break;
411 }
412 } else {
413 /* skip invalid character */
414 continue;
415 }
416 } else {
417 unsigned char c = *s;
418 switch (c >> 4) {
419 case 0 :
420 case 1 :
421 /* skip unprintable ascii */
422 if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
423 continue;
424 case 2 :
425 if (c == 0x20) {
426 if (removeblankp) {
427 if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
428 continue;
429 } else {
430 *d = ' ';
431 ctype = SEN_NSTR_BLANK|sen_str_symbol;
432 }
433 } else {
434 *d = c;
435 ctype = sen_str_symbol;
436 }
437 break;
438 case 3 :
439 *d = c;
440 ctype = (c <= 0x39) ? sen_str_digit : sen_str_symbol;
441 break;
442 case 4 :
443 *d = ('A' <= c) ? c + 0x20 : c;
444 ctype = (c == 0x40) ? sen_str_symbol : sen_str_alpha;
445 break;
446 case 5 :
447 *d = (c <= 'Z') ? c + 0x20 : c;
448 ctype = (c <= 0x5a) ? sen_str_alpha : sen_str_symbol;
449 break;
450 case 6 :
451 *d = c;
452 ctype = (c == 0x60) ? sen_str_symbol : sen_str_alpha;
453 break;
454 case 7 :
455 *d = c;
456 ctype = (c <= 0x7a) ? sen_str_alpha : (c == 0x7f ? sen_str_others : sen_str_symbol);
457 break;
458 default :
459 *d = c;
460 ctype = sen_str_others;
461 break;
462 }
463 }
464 d++;
465 length++;
466 if (cp) { *cp++ = ctype; }
467 if (ch) {
468 *ch++ = (int16_t)(s + 1 - s_);
469 s_ = s + 1;
470 while (++d_ < d) { *ch++ = 0; }
471 }
472 }
473 if (cp) { *cp = sen_str_null; }
474 *d = '\0';
475 nstr->length = length;
476 nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
477 return sen_success;
478 }
479
480 #ifndef NO_NFKC
481 uint_least8_t sen_nfkc_ctype(const unsigned char *str);
482 const char *sen_nfkc_map1(const unsigned char *str);
483 const char *sen_nfkc_map2(const unsigned char *prefix, const unsigned char *suffix);
484
485 inline static sen_rc
normalize_utf8(sen_nstr * nstr)486 normalize_utf8(sen_nstr *nstr)
487 {
488 int16_t *ch;
489 sen_ctx *ctx = nstr->ctx;
490 const unsigned char *s, *s_, *s__, *p, *p2, *pe, *e;
491 unsigned char *d, *d_, *de;
492 uint_least8_t *cp;
493 size_t length = 0, ls, lp, size = nstr->orig_blen, ds = size * 3;
494 int removeblankp = nstr->flags & SEN_STR_REMOVEBLANK;
495 if (!(nstr->norm = SEN_MALLOC(ds + 1))) {
496 return sen_memory_exhausted;
497 }
498 if (nstr->flags & SEN_STR_WITH_CHECKS) {
499 if (!(nstr->checks = SEN_MALLOC(ds * sizeof(int16_t) + 1))) {
500 SEN_FREE(nstr->norm);
501 nstr->norm = NULL;
502 return sen_memory_exhausted;
503 }
504 }
505 ch = nstr->checks;
506 if (nstr->flags & SEN_STR_WITH_CTYPES) {
507 if (!(nstr->ctypes = SEN_MALLOC(ds + 1))) {
508 if (nstr->checks) {
509 SEN_FREE(nstr->checks); nstr->checks = NULL;
510 }
511 SEN_FREE(nstr->norm); nstr->norm = NULL;
512 return sen_memory_exhausted;
513 }
514 }
515 cp = nstr->ctypes;
516 d = (unsigned char *)nstr->norm;
517 de = d + ds;
518 d_ = NULL;
519 e = (unsigned char *)nstr->orig + size;
520 for (s = s_ = (unsigned char *)nstr->orig; ; s += ls) {
521 if (!(ls = sen_str_charlen_utf8(s, e))) {
522 break;
523 }
524 if ((p = (unsigned char *)sen_nfkc_map1(s))) {
525 pe = p + strlen((char *)p);
526 } else {
527 p = s;
528 pe = p + ls;
529 }
530 if (d_ && (p2 = (unsigned char *)sen_nfkc_map2(d_, p))) {
531 p = p2;
532 pe = p + strlen((char *)p);
533 if (cp) { cp--; }
534 if (ch) {
535 ch -= (d - d_);
536 s_ = s__;
537 }
538 d = d_;
539 length--;
540 }
541 for (; ; p += lp) {
542 if (!(lp = sen_str_charlen_utf8(p, pe))) {
543 break;
544 }
545 if ((*p == ' ' && removeblankp) || *p < 0x20 /* skip unprintable ascii */ ) {
546 if (cp > nstr->ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
547 } else {
548 if (de <= d + lp) {
549 unsigned char *norm;
550 ds += (ds >> 1) + lp;
551 if (!(norm = SEN_REALLOC(nstr->norm, ds + 1))) {
552 if (nstr->ctypes) { SEN_FREE(nstr->ctypes); nstr->ctypes = NULL; }
553 if (nstr->checks) { SEN_FREE(nstr->checks); nstr->checks = NULL; }
554 SEN_FREE(nstr->norm); nstr->norm = NULL;
555 return sen_memory_exhausted;
556 }
557 de = norm + ds;
558 d = norm + (d - (unsigned char *)nstr->norm);
559 nstr->norm = norm;
560 if (ch) {
561 int16_t *checks;
562 if (!(checks = SEN_REALLOC(nstr->checks, ds * sizeof(int16_t)+ 1))) {
563 if (nstr->ctypes) { SEN_FREE(nstr->ctypes); nstr->ctypes = NULL; }
564 SEN_FREE(nstr->checks); nstr->checks = NULL;
565 SEN_FREE(nstr->norm); nstr->norm = NULL;
566 return sen_memory_exhausted;
567 }
568 ch = checks + (ch - nstr->checks);
569 nstr->checks = checks;
570 }
571 if (cp) {
572 uint_least8_t *ctypes;
573 if (!(ctypes = SEN_REALLOC(nstr->ctypes, ds + 1))) {
574 SEN_FREE(nstr->ctypes); nstr->ctypes = NULL;
575 if (nstr->checks) { SEN_FREE(nstr->checks); nstr->checks = NULL; }
576 SEN_FREE(nstr->norm); nstr->norm = NULL;
577 return sen_memory_exhausted;
578 }
579 cp = ctypes + (cp - nstr->ctypes);
580 nstr->ctypes = ctypes;
581 }
582 }
583
584 memcpy(d, p, lp);
585 d_ = d;
586 d += lp;
587 length++;
588 if (cp) { *cp++ = sen_nfkc_ctype(p); }
589 if (ch) {
590 size_t i;
591 if (s_ == s + ls) {
592 *ch++ = -1;
593 } else {
594 *ch++ = (int16_t)(s + ls - s_);
595 s__ = s_;
596 s_ = s + ls;
597 }
598 for (i = lp; i > 1; i--) { *ch++ = 0; }
599 }
600 }
601 }
602 }
603 if (cp) { *cp = sen_str_null; }
604 *d = '\0';
605 nstr->length = length;
606 nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
607 return sen_success;
608 }
609 #endif /* NO_NFKC */
610
611 inline static sen_rc
normalize_sjis(sen_nstr * nstr)612 normalize_sjis(sen_nstr *nstr)
613 {
614 static uint16_t hankana[] = {
615 0x8140, 0x8142, 0x8175, 0x8176, 0x8141, 0x8145, 0x8392, 0x8340, 0x8342,
616 0x8344, 0x8346, 0x8348, 0x8383, 0x8385, 0x8387, 0x8362, 0x815b, 0x8341,
617 0x8343, 0x8345, 0x8347, 0x8349, 0x834a, 0x834c, 0x834e, 0x8350, 0x8352,
618 0x8354, 0x8356, 0x8358, 0x835a, 0x835c, 0x835e, 0x8360, 0x8363, 0x8365,
619 0x8367, 0x8369, 0x836a, 0x836b, 0x836c, 0x836d, 0x836e, 0x8371, 0x8374,
620 0x8377, 0x837a, 0x837d, 0x837e, 0x8380, 0x8381, 0x8382, 0x8384, 0x8386,
621 0x8388, 0x8389, 0x838a, 0x838b, 0x838c, 0x838d, 0x838f, 0x8393, 0x814a,
622 0x814b
623 };
624 static unsigned char dakuten[] = {
625 0x94, 0, 0, 0, 0, 0x4b, 0, 0x4d, 0, 0x4f, 0, 0x51, 0, 0x53, 0, 0x55, 0,
626 0x57, 0, 0x59, 0, 0x5b, 0, 0x5d, 0, 0x5f, 0, 0x61, 0, 0, 0x64, 0, 0x66,
627 0, 0x68, 0, 0, 0, 0, 0, 0, 0x6f, 0, 0, 0x72, 0, 0, 0x75, 0, 0, 0x78, 0,
628 0, 0x7b
629 };
630 static unsigned char handaku[] = {
631 0x70, 0, 0, 0x73, 0, 0, 0x76, 0, 0, 0x79, 0, 0, 0x7c
632 };
633 int16_t *ch;
634 sen_ctx *ctx = nstr->ctx;
635 const unsigned char *s, *s_;
636 unsigned char *d, *d0, *d_, b, *e;
637 uint_least8_t *cp, *ctypes, ctype;
638 size_t size = nstr->orig_blen, length = 0;
639 int removeblankp = nstr->flags & SEN_STR_REMOVEBLANK;
640 if (!(nstr->norm = SEN_MALLOC(size * 2 + 1))) {
641 return sen_memory_exhausted;
642 }
643 d0 = (unsigned char *) nstr->norm;
644 if (nstr->flags & SEN_STR_WITH_CHECKS) {
645 if (!(nstr->checks = SEN_MALLOC(size * 2 * sizeof(int16_t) + 1))) {
646 SEN_FREE(nstr->norm);
647 nstr->norm = NULL;
648 return sen_memory_exhausted;
649 }
650 }
651 ch = nstr->checks;
652 if (nstr->flags & SEN_STR_WITH_CTYPES) {
653 if (!(nstr->ctypes = SEN_MALLOC(size + 1))) {
654 SEN_FREE(nstr->checks);
655 SEN_FREE(nstr->norm);
656 nstr->checks = NULL;
657 nstr->norm = NULL;
658 return sen_memory_exhausted;
659 }
660 }
661 cp = ctypes = nstr->ctypes;
662 e = (unsigned char *)nstr->orig + size;
663 for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) {
664 if ((*s & 0x80)) {
665 if (0xa0 <= *s && *s <= 0xdf) {
666 uint16_t c = hankana[*s - 0xa0];
667 switch (c) {
668 case 0x814a :
669 if (d > d0 + 1 && d[-2] == 0x83
670 && 0x45 <= d[-1] && d[-1] <= 0x7a && (b = dakuten[d[-1] - 0x45])) {
671 *(d - 1) = b;
672 if (ch) { ch[-1]++; s_++; }
673 continue;
674 } else {
675 *d++ = c >> 8; *d = c & 0xff;
676 }
677 break;
678 case 0x814b :
679 if (d > d0 + 1 && d[-2] == 0x83
680 && 0x6e <= d[-1] && d[-1] <= 0x7a && (b = handaku[d[-1] - 0x6e])) {
681 *(d - 1) = b;
682 if (ch) { ch[-1]++; s_++; }
683 continue;
684 } else {
685 *d++ = c >> 8; *d = c & 0xff;
686 }
687 break;
688 default :
689 *d++ = c >> 8; *d = c & 0xff;
690 break;
691 }
692 ctype = sen_str_katakana;
693 } else {
694 if ((s + 1) < e && 0x40 <= *(s + 1) && *(s + 1) <= 0xfc) {
695 unsigned char c1 = *s++, c2 = *s, c3 = 0;
696 if (0x81 <= c1 && c1 <= 0x87) {
697 switch (c1 & 0x0f) {
698 case 1 :
699 switch (c2) {
700 case 0x5b :
701 *d++ = c1; *d = c2;
702 ctype = sen_str_katakana;
703 break;
704 case 0x58 :
705 *d++ = c1; *d = c2;
706 ctype = sen_str_kanji;
707 break;
708 case 0x40 :
709 if (removeblankp) {
710 if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
711 continue;
712 } else {
713 *d = ' ';
714 ctype = SEN_NSTR_BLANK|sen_str_symbol;
715 }
716 break;
717 default :
718 if (0x43 <= c2 && c2 <= 0x7e && (c3 = symbol[c2 - 0x43])) {
719 *d = c3;
720 ctype = sen_str_symbol;
721 } else if (0x7f <= c2 && c2 <= 0x97 && (c3 = symbol[c2 - 0x44])) {
722 *d = c3;
723 ctype = sen_str_symbol;
724 } else {
725 *d++ = c1; *d = c2;
726 ctype = sen_str_others;
727 }
728 break;
729 }
730 break;
731 case 2 :
732 c3 = c2 - 0x1f;
733 if (0x4f <= c2 && c2 <= 0x58) {
734 ctype = sen_str_digit;
735 *d = c2 - 0x1f;
736 } else if (0x60 <= c2 && c2 <= 0x79) {
737 ctype = sen_str_alpha;
738 *d = c2 + 0x01;
739 } else if (0x81 <= c2 && c2 <= 0x9a) {
740 ctype = sen_str_alpha;
741 *d = c2 - 0x20;
742 } else if (0x9f <= c2 && c2 <= 0xf1) {
743 *d++ = c1; *d = c2;
744 ctype = sen_str_hiragana;
745 } else {
746 *d++ = c1; *d = c2;
747 ctype = sen_str_others;
748 }
749 break;
750 case 3 :
751 if (0x40 <= c2 && c2 <= 0x96) {
752 *d++ = c1; *d = c2;
753 ctype = sen_str_katakana;
754 } else {
755 *d++ = c1; *d = c2;
756 ctype = sen_str_symbol;
757 }
758 break;
759 case 4 :
760 case 7 :
761 *d++ = c1; *d = c2;
762 ctype = sen_str_symbol;
763 break;
764 default :
765 *d++ = c1; *d = c2;
766 ctype = sen_str_others;
767 break;
768 }
769 } else {
770 *d++ = c1; *d = c2;
771 ctype = sen_str_kanji;
772 }
773 } else {
774 /* skip invalid character */
775 continue;
776 }
777 }
778 } else {
779 unsigned char c = *s;
780 switch (c >> 4) {
781 case 0 :
782 case 1 :
783 /* skip unprintable ascii */
784 if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
785 continue;
786 case 2 :
787 if (c == 0x20) {
788 if (removeblankp) {
789 if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
790 continue;
791 } else {
792 *d = ' ';
793 ctype = SEN_NSTR_BLANK|sen_str_symbol;
794 }
795 } else {
796 *d = c;
797 ctype = sen_str_symbol;
798 }
799 break;
800 case 3 :
801 *d = c;
802 ctype = (c <= 0x39) ? sen_str_digit : sen_str_symbol;
803 break;
804 case 4 :
805 *d = ('A' <= c) ? c + 0x20 : c;
806 ctype = (c == 0x40) ? sen_str_symbol : sen_str_alpha;
807 break;
808 case 5 :
809 *d = (c <= 'Z') ? c + 0x20 : c;
810 ctype = (c <= 0x5a) ? sen_str_alpha : sen_str_symbol;
811 break;
812 case 6 :
813 *d = c;
814 ctype = (c == 0x60) ? sen_str_symbol : sen_str_alpha;
815 break;
816 case 7 :
817 *d = c;
818 ctype = (c <= 0x7a) ? sen_str_alpha : (c == 0x7f ? sen_str_others : sen_str_symbol);
819 break;
820 default :
821 *d = c;
822 ctype = sen_str_others;
823 break;
824 }
825 }
826 d++;
827 length++;
828 if (cp) { *cp++ = ctype; }
829 if (ch) {
830 *ch++ = (int16_t)(s + 1 - s_);
831 s_ = s + 1;
832 while (++d_ < d) { *ch++ = 0; }
833 }
834 }
835 if (cp) { *cp = sen_str_null; }
836 *d = '\0';
837 nstr->length = length;
838 nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
839 return sen_success;
840 }
841
842 inline static sen_rc
normalize_none(sen_nstr * nstr)843 normalize_none(sen_nstr *nstr)
844 {
845 int16_t *ch;
846 sen_ctx *ctx = nstr->ctx;
847 const unsigned char *s, *s_, *e;
848 unsigned char *d, *d0, *d_;
849 uint_least8_t *cp, *ctypes, ctype;
850 size_t size = nstr->orig_blen, length = 0;
851 int removeblankp = nstr->flags & SEN_STR_REMOVEBLANK;
852 if (!(nstr->norm = SEN_MALLOC(size + 1))) {
853 return sen_memory_exhausted;
854 }
855 d0 = (unsigned char *) nstr->norm;
856 if (nstr->flags & SEN_STR_WITH_CHECKS) {
857 if (!(nstr->checks = SEN_MALLOC(size * sizeof(int16_t) + 1))) {
858 SEN_FREE(nstr->norm);
859 nstr->norm = NULL;
860 return sen_memory_exhausted;
861 }
862 }
863 ch = nstr->checks;
864 if (nstr->flags & SEN_STR_WITH_CTYPES) {
865 if (!(nstr->ctypes = SEN_MALLOC(size + 1))) {
866 SEN_FREE(nstr->checks);
867 SEN_FREE(nstr->norm);
868 nstr->checks = NULL;
869 nstr->norm = NULL;
870 return sen_memory_exhausted;
871 }
872 }
873 cp = ctypes = nstr->ctypes;
874 e = (unsigned char *)nstr->orig + size;
875 for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) {
876 unsigned char c = *s;
877 switch (c >> 4) {
878 case 0 :
879 case 1 :
880 /* skip unprintable ascii */
881 if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
882 continue;
883 case 2 :
884 if (c == 0x20) {
885 if (removeblankp) {
886 if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
887 continue;
888 } else {
889 *d = ' ';
890 ctype = SEN_NSTR_BLANK|sen_str_symbol;
891 }
892 } else {
893 *d = c;
894 ctype = sen_str_symbol;
895 }
896 break;
897 case 3 :
898 *d = c;
899 ctype = (c <= 0x39) ? sen_str_digit : sen_str_symbol;
900 break;
901 case 4 :
902 *d = ('A' <= c) ? c + 0x20 : c;
903 ctype = (c == 0x40) ? sen_str_symbol : sen_str_alpha;
904 break;
905 case 5 :
906 *d = (c <= 'Z') ? c + 0x20 : c;
907 ctype = (c <= 0x5a) ? sen_str_alpha : sen_str_symbol;
908 break;
909 case 6 :
910 *d = c;
911 ctype = (c == 0x60) ? sen_str_symbol : sen_str_alpha;
912 break;
913 case 7 :
914 *d = c;
915 ctype = (c <= 0x7a) ? sen_str_alpha : (c == 0x7f ? sen_str_others : sen_str_symbol);
916 break;
917 default :
918 *d = c;
919 ctype = sen_str_others;
920 break;
921 }
922 d++;
923 length++;
924 if (cp) { *cp++ = ctype; }
925 if (ch) {
926 *ch++ = (int16_t)(s + 1 - s_);
927 s_ = s + 1;
928 while (++d_ < d) { *ch++ = 0; }
929 }
930 }
931 if (cp) { *cp = sen_str_null; }
932 *d = '\0';
933 nstr->length = length;
934 nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
935 return sen_success;
936 }
937
938 /* use cp1252 as latin1 */
939 inline static sen_rc
normalize_latin1(sen_nstr * nstr)940 normalize_latin1(sen_nstr *nstr)
941 {
942 int16_t *ch;
943 sen_ctx *ctx = nstr->ctx;
944 const unsigned char *s, *s_, *e;
945 unsigned char *d, *d0, *d_;
946 uint_least8_t *cp, *ctypes, ctype;
947 size_t size = strlen(nstr->orig), length = 0;
948 int removeblankp = nstr->flags & SEN_STR_REMOVEBLANK;
949 if (!(nstr->norm = SEN_MALLOC(size + 1))) {
950 return sen_memory_exhausted;
951 }
952 d0 = (unsigned char *) nstr->norm;
953 if (nstr->flags & SEN_STR_WITH_CHECKS) {
954 if (!(nstr->checks = SEN_MALLOC(size * sizeof(int16_t) + 1))) {
955 SEN_FREE(nstr->norm);
956 nstr->norm = NULL;
957 return sen_memory_exhausted;
958 }
959 }
960 ch = nstr->checks;
961 if (nstr->flags & SEN_STR_WITH_CTYPES) {
962 if (!(nstr->ctypes = SEN_MALLOC(size + 1))) {
963 SEN_FREE(nstr->checks);
964 SEN_FREE(nstr->norm);
965 nstr->checks = NULL;
966 nstr->norm = NULL;
967 return sen_memory_exhausted;
968 }
969 }
970 cp = ctypes = nstr->ctypes;
971 e = (unsigned char *)nstr->orig + size;
972 for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) {
973 unsigned char c = *s;
974 switch (c >> 4) {
975 case 0 :
976 case 1 :
977 /* skip unprintable ascii */
978 if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
979 continue;
980 case 2 :
981 if (c == 0x20) {
982 if (removeblankp) {
983 if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
984 continue;
985 } else {
986 *d = ' ';
987 ctype = SEN_NSTR_BLANK|sen_str_symbol;
988 }
989 } else {
990 *d = c;
991 ctype = sen_str_symbol;
992 }
993 break;
994 case 3 :
995 *d = c;
996 ctype = (c <= 0x39) ? sen_str_digit : sen_str_symbol;
997 break;
998 case 4 :
999 *d = ('A' <= c) ? c + 0x20 : c;
1000 ctype = (c == 0x40) ? sen_str_symbol : sen_str_alpha;
1001 break;
1002 case 5 :
1003 *d = (c <= 'Z') ? c + 0x20 : c;
1004 ctype = (c <= 0x5a) ? sen_str_alpha : sen_str_symbol;
1005 break;
1006 case 6 :
1007 *d = c;
1008 ctype = (c == 0x60) ? sen_str_symbol : sen_str_alpha;
1009 break;
1010 case 7 :
1011 *d = c;
1012 ctype = (c <= 0x7a) ? sen_str_alpha : (c == 0x7f ? sen_str_others : sen_str_symbol);
1013 break;
1014 case 8 :
1015 if (c == 0x8a || c == 0x8c || c == 0x8e) {
1016 *d = c + 0x10;
1017 ctype = sen_str_alpha;
1018 } else {
1019 *d = c;
1020 ctype = sen_str_symbol;
1021 }
1022 break;
1023 case 9 :
1024 if (c == 0x9a || c == 0x9c || c == 0x9e || c == 0x9f) {
1025 *d = (c == 0x9f) ? c + 0x60 : c;
1026 ctype = sen_str_alpha;
1027 } else {
1028 *d = c;
1029 ctype = sen_str_symbol;
1030 }
1031 break;
1032 case 0x0c :
1033 *d = c + 0x20;
1034 ctype = sen_str_alpha;
1035 break;
1036 case 0x0d :
1037 *d = (c == 0xd7 || c == 0xdf) ? c : c + 0x20;
1038 ctype = (c == 0xd7) ? sen_str_symbol : sen_str_alpha;
1039 break;
1040 case 0x0e :
1041 *d = c;
1042 ctype = sen_str_alpha;
1043 break;
1044 case 0x0f :
1045 *d = c;
1046 ctype = (c == 0xf7) ? sen_str_symbol : sen_str_alpha;
1047 break;
1048 default :
1049 *d = c;
1050 ctype = sen_str_others;
1051 break;
1052 }
1053 d++;
1054 length++;
1055 if (cp) { *cp++ = ctype; }
1056 if (ch) {
1057 *ch++ = (int16_t)(s + 1 - s_);
1058 s_ = s + 1;
1059 while (++d_ < d) { *ch++ = 0; }
1060 }
1061 }
1062 if (cp) { *cp = sen_str_null; }
1063 *d = '\0';
1064 nstr->length = length;
1065 nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
1066 return sen_success;
1067 }
1068
1069 inline static sen_rc
normalize_koi8r(sen_nstr * nstr)1070 normalize_koi8r(sen_nstr *nstr)
1071 {
1072 int16_t *ch;
1073 sen_ctx *ctx = nstr->ctx;
1074 const unsigned char *s, *s_, *e;
1075 unsigned char *d, *d0, *d_;
1076 uint_least8_t *cp, *ctypes, ctype;
1077 size_t size = strlen(nstr->orig), length = 0;
1078 int removeblankp = nstr->flags & SEN_STR_REMOVEBLANK;
1079 if (!(nstr->norm = SEN_MALLOC(size + 1))) {
1080 return sen_memory_exhausted;
1081 }
1082 d0 = (unsigned char *) nstr->norm;
1083 if (nstr->flags & SEN_STR_WITH_CHECKS) {
1084 if (!(nstr->checks = SEN_MALLOC(size * sizeof(int16_t) + 1))) {
1085 SEN_FREE(nstr->norm);
1086 nstr->norm = NULL;
1087 return sen_memory_exhausted;
1088 }
1089 }
1090 ch = nstr->checks;
1091 if (nstr->flags & SEN_STR_WITH_CTYPES) {
1092 if (!(nstr->ctypes = SEN_MALLOC(size + 1))) {
1093 SEN_FREE(nstr->checks);
1094 SEN_FREE(nstr->norm);
1095 nstr->checks = NULL;
1096 nstr->norm = NULL;
1097 return sen_memory_exhausted;
1098 }
1099 }
1100 cp = ctypes = nstr->ctypes;
1101 e = (unsigned char *)nstr->orig + size;
1102 for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) {
1103 unsigned char c = *s;
1104 switch (c >> 4) {
1105 case 0 :
1106 case 1 :
1107 /* skip unprintable ascii */
1108 if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
1109 continue;
1110 case 2 :
1111 if (c == 0x20) {
1112 if (removeblankp) {
1113 if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
1114 continue;
1115 } else {
1116 *d = ' ';
1117 ctype = SEN_NSTR_BLANK|sen_str_symbol;
1118 }
1119 } else {
1120 *d = c;
1121 ctype = sen_str_symbol;
1122 }
1123 break;
1124 case 3 :
1125 *d = c;
1126 ctype = (c <= 0x39) ? sen_str_digit : sen_str_symbol;
1127 break;
1128 case 4 :
1129 *d = ('A' <= c) ? c + 0x20 : c;
1130 ctype = (c == 0x40) ? sen_str_symbol : sen_str_alpha;
1131 break;
1132 case 5 :
1133 *d = (c <= 'Z') ? c + 0x20 : c;
1134 ctype = (c <= 0x5a) ? sen_str_alpha : sen_str_symbol;
1135 break;
1136 case 6 :
1137 *d = c;
1138 ctype = (c == 0x60) ? sen_str_symbol : sen_str_alpha;
1139 break;
1140 case 7 :
1141 *d = c;
1142 ctype = (c <= 0x7a) ? sen_str_alpha : (c == 0x7f ? sen_str_others : sen_str_symbol);
1143 break;
1144 case 0x0a :
1145 *d = c;
1146 ctype = (c == 0xa3) ? sen_str_alpha : sen_str_others;
1147 break;
1148 case 0x0b :
1149 if (c == 0xb3) {
1150 *d = c - 0x10;
1151 ctype = sen_str_alpha;
1152 } else {
1153 *d = c;
1154 ctype = sen_str_others;
1155 }
1156 break;
1157 case 0x0c :
1158 case 0x0d :
1159 *d = c;
1160 ctype = sen_str_alpha;
1161 break;
1162 case 0x0e :
1163 case 0x0f :
1164 *d = c - 0x20;
1165 ctype = sen_str_alpha;
1166 break;
1167 default :
1168 *d = c;
1169 ctype = sen_str_others;
1170 break;
1171 }
1172 d++;
1173 length++;
1174 if (cp) { *cp++ = ctype; }
1175 if (ch) {
1176 *ch++ = (int16_t)(s + 1 - s_);
1177 s_ = s + 1;
1178 while (++d_ < d) { *ch++ = 0; }
1179 }
1180 }
1181 if (cp) { *cp = sen_str_null; }
1182 *d = '\0';
1183 nstr->length = length;
1184 nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
1185 return sen_success;
1186 }
1187
1188 sen_nstr *
sen_nstr_open(const char * str,size_t str_len,sen_encoding encoding,int flags)1189 sen_nstr_open(const char *str, size_t str_len, sen_encoding encoding, int flags)
1190 {
1191 sen_rc rc;
1192 sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
1193 sen_nstr *nstr;
1194 if (!str) { return NULL; }
1195 if (!(nstr = SEN_MALLOC(sizeof(sen_nstr)))) {
1196 SEN_LOG(sen_log_alert, "memory allocation on sen_fakenstr_open failed !");
1197 return NULL;
1198 }
1199 nstr->orig = str;
1200 nstr->orig_blen = str_len;
1201 nstr->norm = NULL;
1202 nstr->norm_blen = 0;
1203 nstr->checks = NULL;
1204 nstr->ctypes = NULL;
1205 nstr->encoding = encoding;
1206 nstr->flags = flags;
1207 nstr->ctx = ctx;
1208 switch (encoding) {
1209 case sen_enc_euc_jp :
1210 rc = normalize_euc(nstr);
1211 break;
1212 case sen_enc_utf8 :
1213 #ifdef NO_NFKC
1214 rc = normalize_none(nstr);
1215 #else /* NO_NFKC */
1216 rc = normalize_utf8(nstr);
1217 #endif /* NO_NFKC */
1218 break;
1219 case sen_enc_sjis :
1220 rc = normalize_sjis(nstr);
1221 break;
1222 case sen_enc_latin1 :
1223 rc = normalize_latin1(nstr);
1224 break;
1225 case sen_enc_koi8r :
1226 rc = normalize_koi8r(nstr);
1227 break;
1228 default :
1229 rc = normalize_none(nstr);
1230 break;
1231 }
1232 if (rc) {
1233 sen_nstr_close(nstr);
1234 return NULL;
1235 }
1236 return nstr;
1237 }
1238
1239 sen_nstr *
sen_fakenstr_open(const char * str,size_t str_len,sen_encoding encoding,int flags)1240 sen_fakenstr_open(const char *str, size_t str_len, sen_encoding encoding, int flags)
1241 {
1242 /* TODO: support SEN_STR_REMOVEBLANK flag and ctypes */
1243 sen_nstr *nstr;
1244 sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
1245
1246 if (!(nstr = SEN_MALLOC(sizeof(sen_nstr)))) {
1247 SEN_LOG(sen_log_alert, "memory allocation on sen_fakenstr_open failed !");
1248 return NULL;
1249 }
1250 if (!(nstr->norm = SEN_MALLOC(str_len + 1))) {
1251 SEN_LOG(sen_log_alert, "memory allocation for keyword on sen_snip_add_cond failed !");
1252 SEN_FREE(nstr);
1253 return NULL;
1254 }
1255 nstr->orig = str;
1256 nstr->orig_blen = str_len;
1257 memcpy(nstr->norm, str, str_len);
1258 nstr->norm[str_len] = '\0';
1259 nstr->norm_blen = str_len;
1260 nstr->ctypes = NULL;
1261 nstr->flags = flags;
1262 nstr->ctx = ctx;
1263
1264 if (flags & SEN_STR_WITH_CHECKS) {
1265 int16_t f = 0;
1266 unsigned char c;
1267 size_t i;
1268 if (!(nstr->checks = (int16_t *) SEN_MALLOC(sizeof(int16_t) * str_len))) {
1269 SEN_FREE(nstr->norm);
1270 SEN_FREE(nstr);
1271 return NULL;
1272 }
1273 switch (encoding) {
1274 case sen_enc_euc_jp:
1275 for (i = 0; i < str_len; i++) {
1276 if (!f) {
1277 c = (unsigned char) str[i];
1278 f = ((c >= 0xa1U && c <= 0xfeU) || c == 0x8eU ? 2 : (c == 0x8fU ? 3 : 1)
1279 );
1280 nstr->checks[i] = f;
1281 } else {
1282 nstr->checks[i] = 0;
1283 }
1284 f--;
1285 }
1286 break;
1287 case sen_enc_sjis:
1288 for (i = 0; i < str_len; i++) {
1289 if (!f) {
1290 c = (unsigned char) str[i];
1291 f = (c >= 0x81U && ((c <= 0x9fU) || (c >= 0xe0U && c <= 0xfcU)) ? 2 : 1);
1292 nstr->checks[i] = f;
1293 } else {
1294 nstr->checks[i] = 0;
1295 }
1296 f--;
1297 }
1298 break;
1299 case sen_enc_utf8:
1300 for (i = 0; i < str_len; i++) {
1301 if (!f) {
1302 c = (unsigned char) str[i];
1303 f = (c & 0x80U ? (c & 0x20U ? (c & 0x10U ? 4 : 3)
1304 : 2)
1305 : 1);
1306 nstr->checks[i] = f;
1307 } else {
1308 nstr->checks[i] = 0;
1309 }
1310 f--;
1311 }
1312 break;
1313 default:
1314 for (i = 0; i < str_len; i++) {
1315 nstr->checks[i] = 1;
1316 }
1317 break;
1318 }
1319 }
1320 else {
1321 nstr->checks = NULL;
1322 }
1323 return nstr;
1324 }
1325
1326 sen_rc
sen_nstr_close(sen_nstr * nstr)1327 sen_nstr_close(sen_nstr *nstr)
1328 {
1329 if (nstr) {
1330 sen_ctx *ctx = nstr->ctx;
1331 if (nstr->norm) { SEN_FREE(nstr->norm); }
1332 if (nstr->ctypes) { SEN_FREE(nstr->ctypes); }
1333 if (nstr->checks) { SEN_FREE(nstr->checks); }
1334 SEN_FREE(nstr);
1335 return sen_success;
1336 } else {
1337 return sen_invalid_argument;
1338 }
1339 }
1340
/*
 * Printable names of the encodings.  sen_enctostr() indexes this table
 * directly with a sen_encoding value and sen_strtoenc() casts a table index
 * back to sen_encoding, so the order presumably has to mirror the
 * declaration order of the sen_encoding enum -- verify when adding entries.
 */
static const char *sen_enc_string[] = {
  "default",
  "none",
  "euc_jp",
  "utf8",
  "sjis",
  "latin1",
  "koi8r"
};
1350
1351 const char *
sen_enctostr(sen_encoding enc)1352 sen_enctostr(sen_encoding enc)
1353 {
1354 if (enc < (sizeof(sen_enc_string) / sizeof(char *))) {
1355 return sen_enc_string[enc];
1356 } else {
1357 return "unknown";
1358 }
1359 }
1360
1361 sen_encoding
sen_strtoenc(const char * str)1362 sen_strtoenc(const char *str)
1363 {
1364 sen_encoding e = sen_enc_euc_jp;
1365 int i = sizeof(sen_enc_string) / sizeof(sen_enc_string[0]);
1366 while (i--) {
1367 if (!strcmp(str, sen_enc_string[i])) {
1368 e = (sen_encoding)i;
1369 }
1370 }
1371 return e;
1372 }
1373
1374 size_t
sen_str_len(const char * str,sen_encoding encoding,const char ** last)1375 sen_str_len(const char *str, sen_encoding encoding, const char **last)
1376 {
1377 size_t len, tlen;
1378 const char *p = NULL;
1379 for (len = 0; ; len++) {
1380 p = str;
1381 if (!(tlen = sen_str_charlen(str, encoding))) {
1382 break;
1383 }
1384 str += tlen;
1385 }
1386 if (last) { *last = p; }
1387 return len;
1388 }
1389
1390 int
sen_isspace(const char * str,sen_encoding encoding)1391 sen_isspace(const char *str, sen_encoding encoding)
1392 {
1393 const unsigned char *s = (const unsigned char *) str;
1394 if (!s) { return 0; }
1395 switch (s[0]) {
1396 case ' ' :
1397 case '\f' :
1398 case '\n' :
1399 case '\r' :
1400 case '\t' :
1401 case '\v' :
1402 return 1;
1403 case 0x81 :
1404 if (encoding == sen_enc_sjis && s[1] == 0x40) { return 2; }
1405 break;
1406 case 0xA1 :
1407 if (encoding == sen_enc_euc_jp && s[1] == 0xA1) { return 2; }
1408 break;
1409 case 0xE3 :
1410 if (encoding == sen_enc_utf8 && s[1] == 0x80 && s[2] == 0x80) { return 3; }
1411 break;
1412 default :
1413 break;
1414 }
1415 return 0;
1416 }
1417
/*
 * Parse an optional '-' followed by decimal digits from [nptr, end).
 *
 * Returns the parsed value.  When the digits do not fit in an int, 0 is
 * returned and *rest is left on the digit that would have overflowed.
 * Otherwise *rest (if rest is non-NULL) points just past the last digit
 * consumed; a lone '-' with no digit consumes nothing and *rest is nptr.
 *
 * The magnitude is accumulated in unsigned arithmetic and range-checked
 * before each step: the former "t < v" test relied on signed overflow
 * (undefined behavior) and missed wraps such as "5000000000".
 */
int
sen_atoi(const char *nptr, const char *end, const char **rest)
{
  /* FIXME: INT_MIN is not supported */
  /* INT_MAX, derived without <limits.h>; assumes int has no padding bits */
  const unsigned int limit = ~0U >> 1;
  const char *p = nptr;
  unsigned int v = 0;
  int negative = 0, sign_only = 0;
  if (p < end && *p == '-') {
    p++;
    negative = 1;
    sign_only = 1;
  }
  while (p < end && *p >= '0' && *p <= '9') {
    unsigned int digit = (unsigned int)(*p - '0');
    if (v > (limit - digit) / 10) { v = 0; break; }
    v = v * 10 + digit;
    sign_only = 0;
    p++;
  }
  if (rest) { *rest = sign_only ? nptr : p; }
  return negative ? -(int)v : (int)v;
}
1439
/*
 * Parse a run of decimal digits from [nptr, end) as an unsigned int.
 *
 * Returns the parsed value.  When the digits do not fit in an unsigned int,
 * 0 is returned and *rest is left on the digit that would have overflowed.
 * Otherwise *rest (if rest is non-NULL) points just past the last digit.
 *
 * The range is checked before each step: the former "t < v" test after the
 * multiplication missed wraps such as "5000000000".
 */
unsigned int
sen_atoui(const char *nptr, const char *end, const char **rest)
{
  const unsigned int limit = ~0U; /* largest unsigned int */
  unsigned int v = 0;
  while (nptr < end && *nptr >= '0' && *nptr <= '9') {
    unsigned int digit = (unsigned int)(*nptr - '0');
    if (v > (limit - digit) / 10) { v = 0; break; }
    v = v * 10 + digit;
    nptr++;
  }
  if (rest) { *rest = nptr; }
  return v;
}
1453
/*
 * Parse an optional '-' followed by decimal digits from [nptr, end) as an
 * int64_t.
 *
 * Returns the parsed value.  When the digits do not fit, 0 is returned and
 * *rest is left on the digit that would have overflowed.  Otherwise *rest
 * (if rest is non-NULL) points just past the last digit consumed; a lone
 * '-' with no digit consumes nothing and *rest is nptr.
 *
 * The magnitude is accumulated in unsigned arithmetic and range-checked
 * before each step: the former "t < v" test relied on signed overflow
 * (undefined behavior) and missed wraps such as "25000000000000000000".
 */
int64_t
sen_atoll(const char *nptr, const char *end, const char **rest)
{
  /* FIXME: INT64_MIN is not supported */
  const uint64_t limit = ~(uint64_t)0 >> 1; /* largest positive int64_t */
  const char *p = nptr;
  uint64_t v = 0;
  int negative = 0, sign_only = 0;
  if (p < end && *p == '-') {
    p++;
    negative = 1;
    sign_only = 1;
  }
  while (p < end && *p >= '0' && *p <= '9') {
    uint64_t digit = (uint64_t)(*p - '0');
    if (v > (limit - digit) / 10) { v = 0; break; }
    v = v * 10 + digit;
    sign_only = 0;
    p++;
  }
  if (rest) { *rest = sign_only ? nptr : p; }
  return negative ? -(int64_t)v : (int64_t)v;
}
1476
/*
 * Parse [nptr, end) as a hexadecimal number (digits 0-9, a-f, A-F).
 *
 * Unlike the decimal parsers, any non-hex character inside the range makes
 * the result 0, with *rest left on that character.  When the value does not
 * fit in an unsigned int the result is 0 as well, with *rest just past the
 * digit that overflowed.  On success *rest equals end.
 *
 * The range is checked before shifting: the former "t < v" test after the
 * multiplication missed wraps such as "1F0000000".
 */
unsigned int
sen_htoui(const char *nptr, const char *end, const char **rest)
{
  const unsigned int limit = ~0U; /* largest unsigned int */
  unsigned int v = 0;
  while (nptr < end) {
    const char c = *nptr;
    unsigned int digit;
    if (c >= '0' && c <= '9') {
      digit = (unsigned int)(c - '0');
    } else if (c >= 'a' && c <= 'f') {
      digit = (unsigned int)(c - 'a') + 10;
    } else if (c >= 'A' && c <= 'F') {
      digit = (unsigned int)(c - 'A') + 10;
    } else {
      v = 0;
      break;
    }
    nptr++;
    /* v * 16 would lose its top hex digit */
    if (v > (limit >> 4)) { v = 0; break; }
    v = (v << 4) | digit;
  }
  if (rest) { *rest = nptr; }
  return v;
}
1521
/*
 * Write the low `len` hexadecimal digits of i (upper case, zero padded,
 * most significant first) to p, followed by a terminating NUL.
 * p must therefore have room for len + 1 bytes.
 */
void
sen_str_itoh(unsigned int i, char *p, unsigned int len)
{
  static const char digits[] = "0123456789ABCDEF";
  unsigned int pos;
  p[len] = '\0';
  /* fill from the least significant digit backwards */
  for (pos = len; pos > 0; pos--) {
    p[pos - 1] = digits[i & 0xf];
    i >>= 4;
  }
}
1533
1534 sen_rc
sen_str_itoa(int i,char * p,char * end,char ** rest)1535 sen_str_itoa(int i, char *p, char *end, char **rest)
1536 {
1537 /* FIXME: INT_MIN is not supported */
1538 char *q;
1539 if (p >= end) { return sen_invalid_argument; }
1540 if (i < 0) {
1541 *p++ = '-';
1542 i = -i;
1543 }
1544 q = p;
1545 do {
1546 if (p >= end) { return sen_invalid_argument; }
1547 *p++ = i % 10 + '0';
1548 } while ((i /= 10) > 0);
1549 if (rest) { *rest = p; }
1550 for (p--; q < p; q++, p--) {
1551 char t = *q;
1552 *q = *p;
1553 *p = t;
1554 }
1555 return sen_success;
1556 }
1557
1558 sen_rc
sen_str_lltoa(int64_t i,char * p,char * end,char ** rest)1559 sen_str_lltoa(int64_t i, char *p, char *end, char **rest)
1560 {
1561 /* FIXME: INT_MIN is not supported */
1562 char *q;
1563 if (p >= end) { return sen_invalid_argument; }
1564 if (i < 0) {
1565 *p++ = '-';
1566 i = -i;
1567 }
1568 q = p;
1569 do {
1570 if (p >= end) { return sen_invalid_argument; }
1571 *p++ = i % 10 + '0';
1572 } while ((i /= 10) > 0);
1573 if (rest) { *rest = p; }
1574 for (p--; q < p; q++, p--) {
1575 char t = *q;
1576 *q = *p;
1577 *p = t;
1578 }
1579 return sen_success;
1580 }
1581
/* I2B: map the low 6 bits of i to one character of the 64-character
   alphabet A-Z, a-z, 0-9, '+', '/'. */
#define I2B(i) \
 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(i) & 0x3f])

/* B2I: inverse of I2B.  Characters outside '+'..'z' yield 0xff; inside that
   range the table (indexed by b - '+') gives the 6-bit value, with \xff
   entries for characters that are not part of the alphabet.  The table
   element is a plain char, so store the result in an unsigned 8-bit
   variable before comparing it with 0xff (as sen_str_btoi() does).
   The argument is evaluated more than once. */
#define B2I(b) \
 (((b) < '+' || 'z' < (b)) ? 0xff : "\x3e\xff\xff\xff\x3f\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\xff\xff\xff\xff\xff\xff\xff\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\xff\xff\xff\xff\xff\xff\x1a\x1b\x1c\x1d\x1e\x1f\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f\x30\x31\x32\x33"[(b) - '+'])

/* MASK: constant XORed into the id by sen_str_itob() before encoding and by
   sen_str_btoi() after decoding. */
#define MASK 0x34d34d34
1589
1590 char *
sen_str_itob(sen_id id,char * p)1591 sen_str_itob(sen_id id, char *p)
1592 {
1593 id ^= MASK;
1594 *p++ = I2B(id >> 24);
1595 *p++ = I2B(id >> 18);
1596 *p++ = I2B(id >> 12);
1597 *p++ = I2B(id >> 6);
1598 *p++ = I2B(id);
1599 return p;
1600 }
1601
1602 sen_id
sen_str_btoi(char * b)1603 sen_str_btoi(char *b)
1604 {
1605 uint8_t i;
1606 sen_id id = 0;
1607 int len = 5;
1608 while (len--) {
1609 char c = *b++;
1610 if ((i = B2I(c)) == 0xff) { return 0; }
1611 id = (id << 6) + i;
1612 }
1613 return id ^ MASK;
1614 }
1615
/* I2B32H: map the low 5 bits of i to one character of the 32-character
   alphabet 0-9, A-V. */
#define I2B32H(i) ("0123456789ABCDEFGHIJKLMNOPQRSTUV"[(i) & 0x1f])
1617
/*
 * Encode a signed 64-bit value as 13 I2B32H characters at p (no terminator)
 * and return the position just past them.  0x8000000000000000 is added
 * first (i.e. the sign bit is flipped); the first character carries the top
 * 4 bits, each following one 5 bits.
 */
char *
sen_str_lltob32h(int64_t i, char *p)
{
  const uint64_t biased = (uint64_t)i + 0x8000000000000000ULL;
  int shift;
  for (shift = 60; shift >= 0; shift -= 5) {
    *p++ = I2B32H(biased >> shift);
  }
  return p;
}
1637
/*
 * Encode an unsigned 64-bit value as 13 characters at p (no terminator) and
 * return the position just past them.  The top bit of i is flipped before
 * encoding; characters 2..13 are plain I2B32H digits of 5 bits each.
 *
 * NOTE(review): for the first character, 0x10 is added to the *character
 * code* returned by I2B32H when the top bit of the input was set, rather
 * than to the digit value before the table lookup.  Kept exactly as is,
 * since existing encoded data may depend on it -- confirm the intent.
 */
char *
sen_str_ulltob32h(uint64_t i, char *p)
{
  char lb = (i >> 59) & 0x10;
  int shift;
  i += 0x8000000000000000ULL;
  *p++ = lb + I2B32H(i >> 60);
  for (shift = 55; shift >= 0; shift -= 5) {
    *p++ = I2B32H(i >> shift);
  }
  return p;
}
1658
/*
 * Split the str_len bytes at str on `delim`, in place.
 *
 * Every delimiter found is overwritten with '\0'.  tokbuf receives, for
 * each token, a pointer to the position where that token ENDS: the
 * overwritten delimiter, or str + str_len for the final token.  At most
 * buf_size entries are stored; scanning stops on the delimiter that fills
 * the last slot.
 *
 * *rest (if rest is non-NULL) is set to the position where scanning
 * stopped.  Returns the number of entries stored; nothing is scanned when
 * buf_size is not positive.
 */
int
sen_str_tok(char *str, size_t str_len, char delim, char **tokbuf, int buf_size, char **rest)
{
  char **slot = tokbuf;
  if (buf_size > 0) {
    char **const slot_limit = tokbuf + buf_size;
    char *const limit = str + str_len;
    for (; str != limit; str++) {
      if (*str != delim) { continue; }
      *str = '\0';
      *slot++ = str;
      if (slot == slot_limit) { break; }
    }
    /* reaching the end of input closes the final token */
    if (str == limit) { *slot++ = str; }
  }
  if (rest) { *rest = str; }
  return slot - tokbuf;
}
1680
1681 inline static void
op_getopt_flag(int * flags,const sen_str_getopt_opt * o,int argc,char * const argv[],int * i)1682 op_getopt_flag(int *flags, const sen_str_getopt_opt *o,
1683 int argc, char * const argv[], int *i)
1684 {
1685 switch (o->op) {
1686 case getopt_op_none:
1687 break;
1688 case getopt_op_on:
1689 *flags |= o->flag;
1690 break;
1691 case getopt_op_off:
1692 *flags &= ~o->flag;
1693 break;
1694 case getopt_op_update:
1695 *flags = o->flag;
1696 break;
1697 default:
1698 return;
1699 }
1700 if (o->arg) {
1701 if (++(*i) < argc) {
1702 *o->arg = argv[*i];
1703 } else {
1704 /* TODO: error */
1705 }
1706 }
1707 }
1708
1709 int
sen_str_getopt(int argc,char * const argv[],const sen_str_getopt_opt * opts,int * flags)1710 sen_str_getopt(int argc, char * const argv[], const sen_str_getopt_opt *opts,
1711 int *flags)
1712 {
1713 int i;
1714 for (i = 1; i < argc; i++) {
1715 const char * v = argv[i];
1716 if (*v == '-') {
1717 const sen_str_getopt_opt *o;
1718 int found;
1719 if (*++v == '-') {
1720 found = 0;
1721 for (o = opts; o->opt != '\0' || o->longopt != NULL; o++) {
1722 if (o->longopt && !strcmp(v, o->longopt)) {
1723 op_getopt_flag(flags, o, argc, argv, &i);
1724 found = 1;
1725 break;
1726 }
1727 }
1728 if (!found) { goto exit; }
1729 } else {
1730 const char *p;
1731 for (p = v; *p; p++) {
1732 found = 0;
1733 for (o = opts; o->opt != '\0' || o->longopt != NULL; o++) {
1734 if (o->opt && *p == o->opt) {
1735 op_getopt_flag(flags, o, argc, argv, &i);
1736 found = 1;
1737 break;
1738 }
1739 }
1740 if (!found) { goto exit; }
1741 }
1742 }
1743 } else {
1744 break;
1745 }
1746 }
1747 return i;
1748 exit:
1749 fprintf(stderr, "cannot recognize option '%s'.\n", argv[i]);
1750 return -1;
1751 }
1752
/* Allocation granularity of sen_rbuf: sen_rbuf_resize() rounds every
   requested size up to a multiple of UNIT_SIZE (4096 bytes). */
#define UNIT_SIZE (1 << 12)
#define UNIT_MASK (UNIT_SIZE - 1)

/* Number of bytes kept in front of buf->head inside each rbuf allocation:
   sen_rbuf_resize() places head this far into the block and
   sen_rbuf_fin() subtracts it again to recover the block address. */
int sen_rbuf_margin_size = 0;
1757
1758 sen_rc
sen_rbuf_init(sen_rbuf * buf,size_t size)1759 sen_rbuf_init(sen_rbuf *buf, size_t size)
1760 {
1761 buf->head = NULL;
1762 buf->curr = NULL;
1763 buf->tail = NULL;
1764 return size ? sen_rbuf_resize(buf, size) : sen_success;
1765 }
1766
1767 sen_rc
sen_rbuf_resize(sen_rbuf * buf,size_t newsize)1768 sen_rbuf_resize(sen_rbuf *buf, size_t newsize)
1769 {
1770 char *head;
1771 sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
1772 newsize += sen_rbuf_margin_size + 1;
1773 newsize = (newsize + (UNIT_MASK)) & ~UNIT_MASK;
1774 head = buf->head - (buf->head ? sen_rbuf_margin_size : 0);
1775 if (!(head = SEN_REALLOC(head, newsize))) { return sen_memory_exhausted; }
1776 buf->curr = head + sen_rbuf_margin_size + SEN_RBUF_VSIZE(buf);
1777 buf->head = head + sen_rbuf_margin_size;
1778 buf->tail = head + newsize;
1779 return sen_success;
1780 }
1781
1782 sen_rc
sen_rbuf_reinit(sen_rbuf * buf,size_t size)1783 sen_rbuf_reinit(sen_rbuf *buf, size_t size)
1784 {
1785 SEN_RBUF_REWIND(buf);
1786 return sen_rbuf_resize(buf, size);
1787 }
1788
1789 sen_rc
sen_rbuf_write(sen_rbuf * buf,const char * str,size_t len)1790 sen_rbuf_write(sen_rbuf *buf, const char *str, size_t len)
1791 {
1792 sen_rc rc = sen_success;
1793 if (SEN_RBUF_REST(buf) < len) {
1794 if ((rc = sen_rbuf_resize(buf, SEN_RBUF_VSIZE(buf) + len))) { return rc; }
1795 }
1796 memcpy(buf->curr, str, len);
1797 buf->curr += len;
1798 return rc;
1799 }
1800
1801 sen_rc
sen_rbuf_reserve(sen_rbuf * buf,size_t len)1802 sen_rbuf_reserve(sen_rbuf *buf, size_t len)
1803 {
1804 sen_rc rc = sen_success;
1805 if (SEN_RBUF_REST(buf) < len) {
1806 if ((rc = sen_rbuf_resize(buf, SEN_RBUF_VSIZE(buf) + len))) { return rc; }
1807 }
1808 return rc;
1809 }
1810
1811 sen_rc
sen_rbuf_space(sen_rbuf * buf,size_t len)1812 sen_rbuf_space(sen_rbuf *buf, size_t len)
1813 {
1814 sen_rc rc = sen_rbuf_reserve(buf, len);
1815 if (!rc) { buf->curr += len; }
1816 return rc;
1817 }
1818
1819 sen_rc
sen_rbuf_itoa(sen_rbuf * buf,int i)1820 sen_rbuf_itoa(sen_rbuf *buf, int i)
1821 {
1822 sen_rc rc = sen_success;
1823 while (sen_str_itoa(i, buf->curr, buf->tail, &buf->curr)) {
1824 if ((rc = sen_rbuf_resize(buf, SEN_RBUF_WSIZE(buf) + UNIT_SIZE))) { return rc; }
1825 }
1826 return rc;
1827 }
1828
1829 sen_rc
sen_rbuf_lltoa(sen_rbuf * buf,int64_t i)1830 sen_rbuf_lltoa(sen_rbuf *buf, int64_t i)
1831 {
1832 sen_rc rc = sen_success;
1833 while (sen_str_lltoa(i, buf->curr, buf->tail, &buf->curr)) {
1834 if ((rc = sen_rbuf_resize(buf, SEN_RBUF_WSIZE(buf) + UNIT_SIZE))) { return rc; }
1835 }
1836 return rc;
1837 }
1838
/*
 * Append a textual form of the double d to buf.
 *
 * NaN is written as "#<nan>", infinities as "#i1/0" / "#i-1/0".  Any other
 * value is formatted with "%#.15g" straight into the buffer and then
 * trimmed in place: redundant trailing zeros of the fraction are dropped
 * (one digit after the '.' is always kept), and a result ending in '.'
 * gets a '0' appended.
 *
 * Returns the sen_rbuf_resize() error if the initial 32-byte reservation
 * fails, sen_success otherwise.
 */
sen_rc
sen_rbuf_ftoa(sen_rbuf *buf, double d)
{
  size_t len = 32;
  sen_rc rc = sen_success;
  /* make room for the formatted number before writing at buf->curr */
  if (SEN_RBUF_REST(buf) < len) {
    if ((rc = sen_rbuf_resize(buf, SEN_RBUF_VSIZE(buf) + len))) { return rc; }
  }
  switch (fpclassify(d)) {
  /* CASE_FP_NAN / CASE_FP_INFINITE are macros defined elsewhere;
     presumably they expand to the matching case labels */
  CASE_FP_NAN
    SEN_RBUF_PUTS(buf, "#<nan>");
    break;
  CASE_FP_INFINITE
    SEN_RBUF_PUTS(buf, d > 0 ? "#i1/0" : "#i-1/0");
    break;
  default :
    /* from here on len is the length of the formatted text */
    len = sprintf(buf->curr, "%#.15g", d);
    if (buf->curr[len - 1] == '.') {
      /* all 15 digits were integral: complete the fraction with a '0' */
      buf->curr += len;
      SEN_RBUF_PUTC(buf, '0');
    } else {
      char *p, *q;
      buf->curr[len] = '\0';
      if ((p = strchr(buf->curr, 'e'))) {
        /* walk back from the exponent over trailing zeros of the mantissa,
           keeping the digit right after the '.', then close the gap */
        for (q = p; *(q - 2) != '.' && *(q - 1) == '0'; q--) { len--; }
        memmove(q, p, buf->curr + len - q);
      } else {
        /* no exponent: just shorten len over the trailing zeros */
        for (q = buf->curr + len; *(q - 2) != '.' && *(q - 1) == '0'; q--) { len--; }
      }
      buf->curr += len;
    }
    break;
  }
  return rc;
}
1874
1875 sen_rc
sen_rbuf_itoh(sen_rbuf * buf,int i)1876 sen_rbuf_itoh(sen_rbuf *buf, int i)
1877 {
1878 size_t len = 8;
1879 sen_rc rc = sen_success;
1880 if (SEN_RBUF_REST(buf) < len) {
1881 if ((rc = sen_rbuf_resize(buf, SEN_RBUF_VSIZE(buf) + len))) { return rc; }
1882 }
1883 sen_str_itoh(i, buf->curr, len);
1884 buf->curr += len;
1885 return rc;
1886 }
1887
1888 sen_rc
sen_rbuf_itob(sen_rbuf * buf,sen_id id)1889 sen_rbuf_itob(sen_rbuf *buf, sen_id id)
1890 {
1891 size_t len = 5;
1892 sen_rc rc = sen_success;
1893 if (SEN_RBUF_REST(buf) < len) {
1894 if ((rc = sen_rbuf_resize(buf, SEN_RBUF_VSIZE(buf) + len))) { return rc; }
1895 }
1896 sen_str_itob(id, buf->curr);
1897 buf->curr += len;
1898 return rc;
1899 }
1900
1901 sen_rc
sen_rbuf_lltob32h(sen_rbuf * buf,int64_t i)1902 sen_rbuf_lltob32h(sen_rbuf *buf, int64_t i)
1903 {
1904 size_t len = 13;
1905 sen_rc rc = sen_success;
1906 if (SEN_RBUF_REST(buf) < len) {
1907 if ((rc = sen_rbuf_resize(buf, SEN_RBUF_VSIZE(buf) + len))) { return rc; }
1908 }
1909 sen_str_lltob32h(i, buf->curr);
1910 buf->curr += len;
1911 return rc;
1912 }
1913
1914 void
sen_rbuf_str_esc(sen_rbuf * buf,const char * s,int len,sen_encoding encoding)1915 sen_rbuf_str_esc(sen_rbuf *buf, const char *s, int len, sen_encoding encoding)
1916 {
1917 const char *e;
1918 unsigned int l;
1919 if (len < 0) { len = strlen(s); }
1920 SEN_RBUF_PUTC(buf, '"');
1921 for (e = s + len; s < e; s += l) {
1922 if (!(l = sen_str_charlen_nonnull(s, e, encoding))) { break; }
1923 if (l == 1) {
1924 switch (*s) {
1925 case '\t' :
1926 sen_rbuf_write(buf, "\\t", 2);
1927 break;
1928 case '\n' :
1929 sen_rbuf_write(buf, "\\n", 2);
1930 break;
1931 case '"' :
1932 sen_rbuf_write(buf, "\\\"", 2);
1933 break;
1934 case '\\' :
1935 sen_rbuf_write(buf, "\\\\", 2);
1936 break;
1937 default :
1938 SEN_RBUF_PUTC(buf, *s);
1939 }
1940 } else {
1941 sen_rbuf_write(buf, s, l);
1942 }
1943 }
1944 SEN_RBUF_PUTC(buf, '"');
1945 }
1946
1947 sen_rc
sen_rbuf_fin(sen_rbuf * buf)1948 sen_rbuf_fin(sen_rbuf *buf)
1949 {
1950 sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
1951 if (buf->head) {
1952 SEN_REALLOC(buf->head - sen_rbuf_margin_size, 0);
1953 buf->head = NULL;
1954 }
1955 return sen_success;
1956 }
1957
/*
 * One chunk of a sen_lbuf: a singly linked list node followed directly by
 * its payload.  sen_lbuf_add() allocates the node with `size` payload bytes
 * starting at val, so val[1] only marks where the payload begins.
 */
struct _sen_lbuf_node {
  sen_lbuf_node *next; /* following chunk, NULL for the last one */
  size_t size;         /* payload size requested from sen_lbuf_add() */
  char val[1];         /* first byte of the payload */
};
1963
1964 sen_rc
sen_lbuf_init(sen_lbuf * buf)1965 sen_lbuf_init(sen_lbuf *buf)
1966 {
1967 buf->head = NULL;
1968 buf->tail = &buf->head;
1969 return sen_success;
1970 }
1971
1972 void *
sen_lbuf_add(sen_lbuf * buf,size_t size)1973 sen_lbuf_add(sen_lbuf *buf, size_t size)
1974 {
1975 sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
1976 sen_lbuf_node *node = SEN_MALLOC(size + (size_t)(&((sen_lbuf_node *)0)->val));
1977 if (!node) { return NULL; }
1978 node->next = NULL;
1979 node->size = size;
1980 *buf->tail = node;
1981 buf->tail = &node->next;
1982 return node->val;
1983 }
1984
1985 sen_rc
sen_lbuf_fin(sen_lbuf * buf)1986 sen_lbuf_fin(sen_lbuf *buf)
1987 {
1988 sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
1989 sen_lbuf_node *cur, *next;
1990 for (cur = buf->head; cur; cur = next) {
1991 next = cur->next;
1992 SEN_FREE(cur);
1993 }
1994 return sen_success;
1995 }
1996
1997 sen_rc
sen_substring(char ** str,char ** str_end,int start,int end,sen_encoding encoding)1998 sen_substring(char **str, char **str_end, int start, int end, sen_encoding encoding)
1999 {
2000 int i;
2001 size_t l;
2002 char *s = *str, *e = *str_end;
2003 for (i = 0; s < e; i++, s += l) {
2004 if (i == start) { *str = s; }
2005 if (!(l = sen_str_charlen_nonnull(s, e, encoding))) {
2006 return sen_invalid_argument;
2007 }
2008 if (i == end) {
2009 *str_end = s;
2010 break;
2011 }
2012 }
2013 return sen_success;
2014 }
2015
2016 int
sen_str_normalize(const char * str,unsigned int str_len,sen_encoding encoding,int flags,char * nstrbuf,int buf_size)2017 sen_str_normalize(const char *str, unsigned int str_len,
2018 sen_encoding encoding, int flags,
2019 char *nstrbuf, int buf_size)
2020 {
2021 int len;
2022 sen_nstr *nstr;
2023 if (!(nstr = sen_nstr_open(str, str_len, encoding, flags))) {
2024 return -1;
2025 }
2026 /* if the buffer size is short to store for the normalized string,
2027 the required size is returned
2028 (to inform the caller to cast me again). */
2029 len = (int)nstr->norm_blen;
2030 if (buf_size > len) {
2031 memcpy(nstrbuf, nstr->norm, len + 1);
2032 } else if (buf_size == len) {
2033 /* NB: non-NULL-terminated */
2034 memcpy(nstrbuf, nstr->norm, len);
2035 }
2036 sen_nstr_close(nstr);
2037 return len;
2038 }
2039