1 /* Copyright(C) 2004 Brazil
2 
3   This library is free software; you can redistribute it and/or
4   modify it under the terms of the GNU Lesser General Public
5   License as published by the Free Software Foundation; either
6   version 2.1 of the License, or (at your option) any later version.
7 
8   This library is distributed in the hope that it will be useful,
9   but WITHOUT ANY WARRANTY; without even the implied warranty of
10   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11   Lesser General Public License for more details.
12 
13   You should have received a copy of the GNU Lesser General Public
14   License along with this library; if not, write to the Free Software
15   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16 */
17 #include "senna_in.h"
18 #include <stdio.h>
19 #include <string.h>
20 #include "ctx.h"
21 #include "str.h"
22 #include "set.h"
23 
24 #ifndef __USE_ISOC99
25 #define __USE_ISOC99
26 #endif /* __USE_ISOC99 */
27 #include <math.h>
28 
/* Lookup sets created on demand by prefix_init() / suffix_init() and
   released by sen_str_fin(). NULL means "not created yet". */
29 static sen_set *prefix = NULL;
30 static sen_set *suffix = NULL;
31 
/* Maximum number of entries read from the prefix / suffix files.
   N_SUFFIX is 0, so the read loop in suffix_init() never executes. */
32 #define N_PREFIX 2048
33 #define N_SUFFIX 0
34 
/* Locations of the data files, built from SENNA_HOME and PATH_SEPARATOR
   (both defined outside this view). */
35 #define PREFIX_PATH SENNA_HOME PATH_SEPARATOR "prefix"
36 #define SUFFIX_PATH SENNA_HOME PATH_SEPARATOR "suffix"
37 
38 inline static void
prefix_init(void)39 prefix_init(void)
40 {
41   int i, *ip;
42   FILE *fp;
43   char buffer[4];
44   prefix = sen_set_open(2, sizeof(int), 0);
45   if (!prefix) { SEN_LOG(sen_log_alert, "sen_set_open on prefix_init failed !"); return; }
46   if ((fp = fopen(PREFIX_PATH, "r"))) {
47     for (i = 0; i < N_PREFIX; i++) {
48       if (!fgets(buffer, 4, fp)) { break; }
49       sen_set_get(prefix, buffer, (void **)&ip);
50       *ip = i;
51     }
52     fclose(fp);
53   }
54 }
55 
56 inline static void
suffix_init(void)57 suffix_init(void)
58 {
59   int i;
60   FILE *fp;
61   char buffer[4];
62   suffix = sen_set_open(2, 0, 0);
63   if (!suffix) { SEN_LOG(sen_log_alert, "sen_set_open on suffix_init failed !"); return; }
64   if ((fp = fopen(SUFFIX_PATH, "r"))) {
65     for (i = N_SUFFIX; i; i--) {
66       if (!fgets(buffer, 4, fp)) { break; }
67       sen_set_get(suffix, buffer, NULL);
68     }
69     fclose(fp);
70   }
71 }
72 
73 inline size_t
sen_str_charlen_utf8(const unsigned char * str,const unsigned char * end)74 sen_str_charlen_utf8(const unsigned char *str, const unsigned char *end)
75 {
76   /* MEMO: This function allows non-null-terminated string as str. */
77   /*       But requires the end of string. */
78   const unsigned char *p = str;
79   if (!*p || p >= end) { return 0; }
80   if (*p & 0x80) {
81     int b, w;
82     size_t size;
83     for (b = 0x40, w = 0; b && (*p & b); b >>= 1, w++);
84     if (!w) {
85       SEN_LOG(sen_log_warning, "invalid utf8 string(1) on sen_str_charlen_utf8");
86       return 0;
87     }
88     for (size = 1; w--; size++) {
89       if (++p >= end || !*p || (*p & 0xc0) != 0x80) {
90         SEN_LOG(sen_log_warning, "invalid utf8 string(2) on sen_str_charlen_utf8");
91         return 0;
92       }
93     }
94     return size;
95   } else {
96     return 1;
97   }
98   return 0;
99 }
100 
101 unsigned int
sen_str_charlen(const char * str,sen_encoding encoding)102 sen_str_charlen(const char *str, sen_encoding encoding)
103 {
104   /* MEMO: This function requires null-terminated string as str.*/
105   unsigned char *p = (unsigned char *) str;
106   if (!*p) { return 0; }
107   switch (encoding) {
108   case sen_enc_euc_jp :
109     if (*p & 0x80) {
110       if (*(p + 1)) {
111         return 2;
112       } else {
113         /* This is invalid character */
114         SEN_LOG(sen_log_warning, "invalid euc-jp string end on sen_str_charlen");
115         return 0;
116       }
117     }
118     return 1;
119     break;
120   case sen_enc_utf8 :
121     if (*p & 0x80) {
122       int b, w;
123       size_t size;
124       for (b = 0x40, w = 0; b && (*p & b); b >>= 1, w++);
125       if (!w) {
126         SEN_LOG(sen_log_warning, "invalid utf8 string(1) on sen_str_charlen");
127         return 0;
128       }
129       for (size = 1; w--; size++) {
130         if (!*++p || (*p & 0xc0) != 0x80) {
131           SEN_LOG(sen_log_warning, "invalid utf8 string(2) on sen_str_charlen");
132           return 0;
133         }
134       }
135       return size;
136     } else {
137       return 1;
138     }
139     break;
140   case sen_enc_sjis :
141     if (*p & 0x80) {
142       /* we regard 0xa0 as JIS X 0201 KANA. adjusted to other tools. */
143       if (0xa0 <= *p && *p <= 0xdf) {
144         /* hankaku-kana */
145         return 1;
146       } else if (!(*(p + 1))) {
147         /* This is invalid character */
148         SEN_LOG(sen_log_warning, "invalid sjis string end on sen_str_charlen");
149         return 0;
150       } else {
151         return 2;
152       }
153     } else {
154       return 1;
155     }
156     break;
157   default :
158     return 1;
159     break;
160   }
161   return 0;
162 }
163 
164 size_t
sen_str_charlen_nonnull(const char * str,const char * end,sen_encoding encoding)165 sen_str_charlen_nonnull(const char *str, const char *end, sen_encoding encoding)
166 {
167   /* MEMO: This function allows non-null-terminated string as str. */
168   /*       But requires the end of string. */
169   unsigned char *p = (unsigned char *) str;
170   if (p >= (unsigned char *)end) { return 0; }
171   switch (encoding) {
172   case sen_enc_euc_jp :
173     if (*p & 0x80) {
174       if ((p + 1) < (unsigned char *)end) {
175         return 2;
176       } else {
177         /* This is invalid character */
178         SEN_LOG(sen_log_warning, "invalid euc-jp string end on sen_str_charlen_nonnull");
179         return 0;
180       }
181     }
182     return 1;
183     break;
184   case sen_enc_utf8 :
185     return sen_str_charlen_utf8(p, (unsigned char *)end);
186     break;
187   case sen_enc_sjis :
188     if (*p & 0x80) {
189       /* we regard 0xa0 as JIS X 0201 KANA. adjusted to other tools. */
190       if (0xa0 <= *p && *p <= 0xdf) {
191         /* hankaku-kana */
192         return 1;
193       } else if (++p >= (unsigned char *)end) {
194         /* This is invalid character */
195         SEN_LOG(sen_log_warning, "invalid sjis string end on sen_str_charlen_nonnull");
196         return 0;
197       } else {
198         return 2;
199       }
200     } else {
201       return 1;
202     }
203     break;
204   default :
205     return 1;
206     break;
207   }
208   return 0;
209 }
210 
211 sen_rc
sen_str_fin(void)212 sen_str_fin(void)
213 {
214   if (prefix) { sen_set_close(prefix); }
215   if (suffix) { sen_set_close(suffix); }
216   return sen_success;
217 }
218 
219 int
sen_str_get_prefix_order(const char * str)220 sen_str_get_prefix_order(const char *str)
221 {
222   int *ip;
223   if (!str) { return -1; }
224   if (!prefix) { prefix_init(); }
225   if (prefix && sen_set_at(prefix, str, (void **)&ip)) {
226     return *ip;
227   } else {
228     return -1;
229   }
230 }
231 
/* Table translating the second byte of a two-byte symbol character into an
   ASCII character; an entry of 0 means "no ASCII equivalent".
   normalize_euc indexes it with (c2 - 0xa4) for lead byte 0xa1;
   normalize_sjis indexes it with (c2 - 0x43) for c2 in 0x43..0x7e and with
   (c2 - 0x44) for c2 in 0x7f..0x97, for lead byte 0x81. */
232 static unsigned char symbol[] = {
233   ',', '.', 0, ':', ';', '?', '!', 0, 0, 0, '`', 0, '^', '~', '_', 0, 0, 0,
234   0, 0, 0, 0, 0, 0, 0, '-', '-', '/', '\\', 0, 0, '|', 0, 0, 0, '\'', 0,
235   '"', '(', ')', 0, 0, '[', ']', '{', '}', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
236   '+', '-', 0, 0, 0, '=', 0, '<', '>', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
237   '$', 0, 0, '%', '#', '&', '*', '@', 0, 0, 0, 0, 0, 0, 0, 0
238 };
239 
/*
 * Normalizes nstr->orig (nstr->orig_blen bytes, treated as EUC-JP) into a
 * newly allocated nstr->norm.
 *
 * What the code below does:
 *  - 0x8e + 0xa0..0xdf (half-width kana) is replaced by the two-byte code
 *    from the hankana table; the marks 0xa1ab / 0xa1eb are merged into the
 *    preceding 0xa5xx character when the dakuten / handaku table has a
 *    non-zero entry for it.
 *  - Lead byte 0xa1: 0xa1a1 is handled as a blank, other symbols are mapped
 *    to ASCII through the symbol table when an entry exists.
 *  - Lead byte 0xa3: two-byte letters and digits become ASCII (letters are
 *    lower-cased).
 *  - ASCII: bytes below 0x20 are dropped, 'A'..'Z' are lower-cased.
 *  - A high-bit byte that is not followed by another high-bit byte is
 *    skipped.
 *
 * Optional outputs, selected by nstr->flags:
 *  - SEN_STR_WITH_CHECKS: nstr->checks receives one int16_t per output byte;
 *    the entry for the first byte of each output character holds the number
 *    of source bytes consumed since the previous emitted character, the
 *    entries for the following bytes are 0.
 *  - SEN_STR_WITH_CTYPES: nstr->ctypes receives one type code per output
 *    character, terminated by sen_str_null.
 *  - SEN_STR_REMOVEBLANK: blanks are dropped instead of being emitted as ' '.
 *
 * On success nstr->length is the number of output characters and
 * nstr->norm_blen the number of output bytes. Returns sen_memory_exhausted
 * (after releasing what was already allocated) when an allocation fails.
 */
240 inline static sen_rc
normalize_euc(sen_nstr * nstr)241 normalize_euc(sen_nstr *nstr)
242 {
243   static uint16_t hankana[] = {
244     0xa1a1, 0xa1a3, 0xa1d6, 0xa1d7, 0xa1a2, 0xa1a6, 0xa5f2, 0xa5a1, 0xa5a3,
245     0xa5a5, 0xa5a7, 0xa5a9, 0xa5e3, 0xa5e5, 0xa5e7, 0xa5c3, 0xa1bc, 0xa5a2,
246     0xa5a4, 0xa5a6, 0xa5a8, 0xa5aa, 0xa5ab, 0xa5ad, 0xa5af, 0xa5b1, 0xa5b3,
247     0xa5b5, 0xa5b7, 0xa5b9, 0xa5bb, 0xa5bd, 0xa5bf, 0xa5c1, 0xa5c4, 0xa5c6,
248     0xa5c8, 0xa5ca, 0xa5cb, 0xa5cc, 0xa5cd, 0xa5ce, 0xa5cf, 0xa5d2, 0xa5d5,
249     0xa5d8, 0xa5db, 0xa5de, 0xa5df, 0xa5e0, 0xa5e1, 0xa5e2, 0xa5e4, 0xa5e6,
250     0xa5e8, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5ef, 0xa5f3, 0xa1ab,
251     0xa1eb
252   };
253   static unsigned char dakuten[] = {
254     0xf4, 0, 0, 0, 0, 0xac, 0, 0xae, 0, 0xb0, 0, 0xb2, 0, 0xb4, 0, 0xb6, 0,
255     0xb8, 0, 0xba, 0, 0xbc, 0, 0xbe, 0, 0xc0, 0, 0xc2, 0, 0, 0xc5, 0, 0xc7,
256     0, 0xc9, 0, 0, 0, 0, 0, 0, 0xd0, 0, 0, 0xd3, 0, 0, 0xd6, 0, 0, 0xd9, 0,
257     0, 0xdc
258   };
259   static unsigned char handaku[] = {
260     0xd1, 0, 0, 0xd4, 0, 0, 0xd7, 0, 0, 0xda, 0, 0, 0xdd
261   };
262   int16_t *ch;
263   sen_ctx *ctx = nstr->ctx;
264   const unsigned char *s, *s_, *e;
265   unsigned char *d, *d0, *d_, b;
266   uint_least8_t *cp, *ctypes, ctype;
267   size_t size = nstr->orig_blen, length = 0;
268   int removeblankp = nstr->flags & SEN_STR_REMOVEBLANK;
  /* Output buffer is sized at twice the input plus a terminator. */
269   if (!(nstr->norm = SEN_MALLOC(size * 2 + 1))) {
270     return sen_memory_exhausted;
271   }
272   d0 = (unsigned char *) nstr->norm;
273   if (nstr->flags & SEN_STR_WITH_CHECKS) {
274     if (!(nstr->checks = SEN_MALLOC(size * 2 * sizeof(int16_t) + 1))) {
275       SEN_FREE(nstr->norm);
276       nstr->norm = NULL;
277       return sen_memory_exhausted;
278     }
279   }
280   ch = nstr->checks;
281   if (nstr->flags & SEN_STR_WITH_CTYPES) {
282     if (!(nstr->ctypes = SEN_MALLOC(size + 1))) {
      /* NOTE(review): checks is freed here even when SEN_STR_WITH_CHECKS was
         not requested; normalize_utf8 guards the same call - confirm that
         SEN_FREE accepts NULL. */
283       SEN_FREE(nstr->checks);
284       SEN_FREE(nstr->norm);
285       nstr->checks = NULL;
286       nstr->norm = NULL;
287       return sen_memory_exhausted;
288     }
289   }
290   cp = ctypes = nstr->ctypes;
291   e = (unsigned char *)nstr->orig + size;
  /* s walks the source, d the output. s_ is the source position already
     accounted for in checks, d_ the output position already covered by a
     checks entry. Every path that emits a character leaves d on its LAST
     byte and falls through to the common bookkeeping at the bottom of the
     loop; paths that emit nothing use `continue`. */
292   for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) {
293     if ((*s & 0x80)) {
294       if (((s + 1) < e) && (*(s + 1) & 0x80)) {
295         unsigned char c1 = *s++, c2 = *s, c3 = 0;
296         switch (c1 >> 4) {
297         case 0x08 :
          /* 0x8e followed by 0xa0..0xdf: half-width kana, widened through
             the hankana table. */
298           if (c1 == 0x8e && 0xa0 <= c2 && c2 <= 0xdf) {
299             uint16_t c = hankana[c2 - 0xa0];
300             switch (c) {
301             case 0xa1ab :
              /* If the previous output character is 0xa5xx and the dakuten
                 table has an entry for it, rewrite that character in place
                 and add the mark's two source bytes to the last checks
                 entry instead of emitting a new character. */
302               if (d > d0 + 1 && d[-2] == 0xa5
303                   && 0xa6 <= d[-1] && d[-1] <= 0xdb && (b = dakuten[d[-1] - 0xa6])) {
304                 *(d - 1) = b;
305                 if (ch) { ch[-1] += 2; s_ += 2; }
306                 continue;
307               } else {
308                 *d++ = c >> 8; *d = c & 0xff;
309               }
310               break;
311             case 0xa1eb :
              /* Same merge as above, using the handaku table. */
312               if (d > d0 + 1 && d[-2] == 0xa5
313                   && 0xcf <= d[-1] && d[-1] <= 0xdb && (b = handaku[d[-1] - 0xcf])) {
314                 *(d - 1) = b;
315                 if (ch) { ch[-1] += 2; s_ += 2; }
316                 continue;
317               } else {
318                 *d++ = c >> 8; *d = c & 0xff;
319               }
320               break;
321             default :
322               *d++ = c >> 8; *d = c & 0xff;
323               break;
324             }
325             ctype = sen_str_katakana;
326           } else {
327             *d++ = c1; *d = c2;
328             ctype = sen_str_others;
329           }
330           break;
331         case 0x09 :
332           *d++ = c1; *d = c2;
333           ctype = sen_str_others;
334           break;
335         case 0x0a :
          /* Lead bytes 0xa0..0xaf: classified by the low nibble. */
336           switch (c1 & 0x0f) {
337           case 1 :
338             switch (c2) {
339             case 0xbc :
340               *d++ = c1; *d = c2;
341               ctype = sen_str_katakana;
342               break;
343             case 0xb9 :
344               *d++ = c1; *d = c2;
345               ctype = sen_str_kanji;
346               break;
347             case 0xa1 :
              /* 0xa1a1 is treated like an ASCII blank: dropped (flagging
                 the previous character as followed by a blank) or emitted
                 as a single ' '. */
348               if (removeblankp) {
349                 if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
350                 continue;
351               } else {
352                 *d = ' ';
353                 ctype = SEN_NSTR_BLANK|sen_str_symbol;
354               }
355               break;
356             default :
357               if (c2 >= 0xa4 && (c3 = symbol[c2 - 0xa4])) {
358                 *d = c3;
359                 ctype = sen_str_symbol;
360               } else {
361                 *d++ = c1; *d = c2;
362                 ctype = sen_str_others;
363               }
364               break;
365             }
366             break;
367           case 2 :
368             *d++ = c1; *d = c2;
369             ctype = sen_str_symbol;
370             break;
371           case 3 :
            /* Clearing the high bit of the second byte yields the ASCII
               letter or digit. */
372             c3 = c2 - 0x80;
373             if ('a' <= c3 && c3 <= 'z') {
374               ctype = sen_str_alpha;
375               *d = c3;
376             } else if ('A' <= c3 && c3 <= 'Z') {
377               ctype = sen_str_alpha;
378               *d = c3 + 0x20;
379             } else if ('0' <= c3 && c3 <= '9') {
380               ctype = sen_str_digit;
381               *d = c3;
382             } else {
383               ctype = sen_str_others;
384               *d++ = c1; *d = c2;
385             }
386             break;
387           case 4 :
388             *d++ = c1; *d = c2;
389             ctype = sen_str_hiragana;
390             break;
391           case 5 :
392             *d++ = c1; *d = c2;
393             ctype = sen_str_katakana;
394             break;
395           case 6 :
396           case 7 :
397           case 8 :
398             *d++ = c1; *d = c2;
399             ctype = sen_str_symbol;
400             break;
401           default :
402             *d++ = c1; *d = c2;
403             ctype = sen_str_others;
404             break;
405           }
406           break;
407         default :
          /* Any other lead byte is copied unchanged and typed as kanji. */
408           *d++ = c1; *d = c2;
409           ctype = sen_str_kanji;
410           break;
411         }
412       } else {
413         /* skip invalid character */
414         continue;
415       }
416     } else {
      /* 7-bit byte: classified by its high nibble. */
417       unsigned char c = *s;
418       switch (c >> 4) {
419       case 0 :
420       case 1 :
421         /* skip unprintable ascii */
422         if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
423         continue;
424       case 2 :
425         if (c == 0x20) {
426           if (removeblankp) {
427             if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
428             continue;
429           } else {
430             *d = ' ';
431             ctype = SEN_NSTR_BLANK|sen_str_symbol;
432           }
433         } else {
434           *d = c;
435           ctype = sen_str_symbol;
436         }
437         break;
438       case 3 :
439         *d = c;
440         ctype = (c <= 0x39) ? sen_str_digit : sen_str_symbol;
441         break;
442       case 4 :
443         *d = ('A' <= c) ? c + 0x20 : c;
444         ctype = (c == 0x40) ? sen_str_symbol : sen_str_alpha;
445         break;
446       case 5 :
447         *d = (c <= 'Z') ? c + 0x20 : c;
448         ctype = (c <= 0x5a) ? sen_str_alpha : sen_str_symbol;
449         break;
450       case 6 :
451         *d = c;
452         ctype = (c == 0x60) ? sen_str_symbol : sen_str_alpha;
453         break;
454       case 7 :
455         *d = c;
456         ctype = (c <= 0x7a) ? sen_str_alpha : (c == 0x7f ? sen_str_others : sen_str_symbol);
457         break;
458       default :
459         *d = c;
460         ctype = sen_str_others;
461         break;
462       }
463     }
    /* Common bookkeeping for an emitted character: step past its last byte,
       record its type, and write its checks entries (source byte count for
       the first output byte, 0 for each further output byte). */
464     d++;
465     length++;
466     if (cp) { *cp++ = ctype; }
467     if (ch) {
468       *ch++ = (int16_t)(s + 1 - s_);
469       s_ = s + 1;
470       while (++d_ < d) { *ch++ = 0; }
471     }
472   }
473   if (cp) { *cp = sen_str_null; }
474   *d = '\0';
475   nstr->length = length;
476   nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
477   return sen_success;
478 }
479 
480 #ifndef NO_NFKC
/* NFKC helpers, declared here and defined outside this view.
   As used by normalize_utf8:
   - sen_nfkc_ctype(str) supplies the character-type code stored in
     nstr->ctypes for the character at str.
   - sen_nfkc_map1(str) returns a NUL-terminated replacement for the
     character at str, or NULL when the character is kept as it is.
   - sen_nfkc_map2(prefix, suffix) returns a NUL-terminated replacement for
     the pair (previous output character, current character), or NULL when
     the pair is not combined. */
481 uint_least8_t sen_nfkc_ctype(const unsigned char *str);
482 const char *sen_nfkc_map1(const unsigned char *str);
483 const char *sen_nfkc_map2(const unsigned char *prefix, const unsigned char *suffix);
484 
/*
 * Normalizes nstr->orig (nstr->orig_blen bytes, treated as UTF-8) into a
 * newly allocated nstr->norm using the sen_nfkc_* helpers.
 *
 * For each source character (measured with sen_str_charlen_utf8):
 *  1. sen_nfkc_map1() may replace it with a NUL-terminated string.
 *  2. sen_nfkc_map2() may combine the previously emitted character with the
 *     (possibly replaced) current one; in that case the output is rewound to
 *     the previous character, which is then overwritten.
 *  3. Every character of the resulting string is appended, except ' ' when
 *     SEN_STR_REMOVEBLANK is set and bytes below 0x20; those only set
 *     SEN_NSTR_BLANK on the previously recorded ctype.
 *
 * The buffers start at three times the input size and are grown by half
 * (plus the pending character length) when the output would not fit.
 *
 * Optional outputs, selected by nstr->flags:
 *  - SEN_STR_WITH_CHECKS: one int16_t per output byte. The first byte of an
 *    output character holds the number of source bytes consumed since the
 *    previous entry, or -1 when the source character was already accounted
 *    for by an earlier output character; the remaining bytes hold 0.
 *  - SEN_STR_WITH_CTYPES: one sen_nfkc_ctype() code per output character,
 *    terminated by sen_str_null.
 *
 * Processing stops at the first position where sen_str_charlen_utf8()
 * returns 0 (end of input, a NUL byte, or a malformed sequence).
 * On success nstr->length is the number of output characters and
 * nstr->norm_blen the number of output bytes. Returns sen_memory_exhausted,
 * with all three buffers released and set to NULL, when an allocation fails.
 */
485 inline static sen_rc
normalize_utf8(sen_nstr * nstr)486 normalize_utf8(sen_nstr *nstr)
487 {
488   int16_t *ch;
489   sen_ctx *ctx = nstr->ctx;
490   const unsigned char *s, *s_, *s__, *p, *p2, *pe, *e;
491   unsigned char *d, *d_, *de;
492   uint_least8_t *cp;
493   size_t length = 0, ls, lp, size = nstr->orig_blen, ds = size * 3;
494   int removeblankp = nstr->flags & SEN_STR_REMOVEBLANK;
495   if (!(nstr->norm = SEN_MALLOC(ds + 1))) {
496     return sen_memory_exhausted;
497   }
498   if (nstr->flags & SEN_STR_WITH_CHECKS) {
499     if (!(nstr->checks = SEN_MALLOC(ds * sizeof(int16_t) + 1))) {
500       SEN_FREE(nstr->norm);
501       nstr->norm = NULL;
502       return sen_memory_exhausted;
503     }
504   }
505   ch = nstr->checks;
506   if (nstr->flags & SEN_STR_WITH_CTYPES) {
507     if (!(nstr->ctypes = SEN_MALLOC(ds + 1))) {
508       if (nstr->checks) {
509         SEN_FREE(nstr->checks); nstr->checks = NULL;
510       }
511       SEN_FREE(nstr->norm); nstr->norm = NULL;
512       return sen_memory_exhausted;
513     }
514   }
515   cp = nstr->ctypes;
  /* d: write position, de: end of the usable output area,
     d_: start of the most recently emitted character (NULL until one has
     been emitted). */
516   d = (unsigned char *)nstr->norm;
517   de = d + ds;
518   d_ = NULL;
519   e = (unsigned char *)nstr->orig + size;
  /* s_ is the source position already accounted for in checks; s__ keeps
     the value s_ had before the last update so that a combination can
     undo it. */
520   for (s = s_ = (unsigned char *)nstr->orig; ; s += ls) {
521     if (!(ls = sen_str_charlen_utf8(s, e))) {
522       break;
523     }
    /* [p, pe) is the text to emit for this source character: the map1
       replacement when there is one, otherwise the original bytes. */
524     if ((p = (unsigned char *)sen_nfkc_map1(s))) {
525       pe = p + strlen((char *)p);
526     } else {
527       p = s;
528       pe = p + ls;
529     }
    /* If the previous output character combines with this one, emit the
       combined form instead and rewind every output cursor so that it
       overwrites the previous character. */
530     if (d_ && (p2 = (unsigned char *)sen_nfkc_map2(d_, p))) {
531       p = p2;
532       pe = p + strlen((char *)p);
533       if (cp) { cp--; }
534       if (ch) {
535         ch -= (d - d_);
536         s_ = s__;
537       }
538       d = d_;
539       length--;
540     }
541     for (; ; p += lp) {
542       if (!(lp = sen_str_charlen_utf8(p, pe))) {
543         break;
544       }
545       if ((*p == ' ' && removeblankp) || *p < 0x20  /* skip unprintable ascii */ ) {
546         if (cp > nstr->ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
547       } else {
        /* Grow all buffers when the next character would not fit. The
           cursors d / ch / cp are re-based onto the new blocks; d_ is
           reassigned right after the copy below. */
548         if (de <= d + lp) {
549           unsigned char *norm;
550           ds += (ds >> 1) + lp;
551           if (!(norm = SEN_REALLOC(nstr->norm, ds + 1))) {
552             if (nstr->ctypes) { SEN_FREE(nstr->ctypes); nstr->ctypes = NULL; }
553             if (nstr->checks) { SEN_FREE(nstr->checks); nstr->checks = NULL; }
554             SEN_FREE(nstr->norm); nstr->norm = NULL;
555             return sen_memory_exhausted;
556           }
557           de = norm + ds;
558           d = norm + (d - (unsigned char *)nstr->norm);
559           nstr->norm = norm;
560           if (ch) {
561             int16_t *checks;
562             if (!(checks = SEN_REALLOC(nstr->checks, ds * sizeof(int16_t)+ 1))) {
563               if (nstr->ctypes) { SEN_FREE(nstr->ctypes); nstr->ctypes = NULL; }
564               SEN_FREE(nstr->checks); nstr->checks = NULL;
565               SEN_FREE(nstr->norm); nstr->norm = NULL;
566               return sen_memory_exhausted;
567             }
568             ch = checks + (ch - nstr->checks);
569             nstr->checks = checks;
570           }
571           if (cp) {
572             uint_least8_t *ctypes;
573             if (!(ctypes = SEN_REALLOC(nstr->ctypes, ds + 1))) {
574               SEN_FREE(nstr->ctypes); nstr->ctypes = NULL;
575               if (nstr->checks) { SEN_FREE(nstr->checks); nstr->checks = NULL; }
576               SEN_FREE(nstr->norm); nstr->norm = NULL;
577               return sen_memory_exhausted;
578             }
579             cp = ctypes + (cp - nstr->ctypes);
580             nstr->ctypes = ctypes;
581           }
582         }
583 
584         memcpy(d, p, lp);
585         d_ = d;
586         d += lp;
587         length++;
588         if (cp) { *cp++ = sen_nfkc_ctype(p); }
589         if (ch) {
590           size_t i;
          /* s_ == s + ls means this source character has already been
             counted by an earlier output character; mark the extra output
             character with -1. */
591           if (s_ == s + ls) {
592             *ch++ = -1;
593           } else {
594             *ch++ = (int16_t)(s + ls - s_);
595             s__ = s_;
596             s_ = s + ls;
597           }
598           for (i = lp; i > 1; i--) { *ch++ = 0; }
599         }
600       }
601     }
602   }
603   if (cp) { *cp = sen_str_null; }
604   *d = '\0';
605   nstr->length = length;
606   nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
607   return sen_success;
608 }
609 #endif /* NO_NFKC */
610 
/*
 * Normalizes nstr->orig (nstr->orig_blen bytes, treated as Shift_JIS) into
 * a newly allocated nstr->norm.
 *
 * What the code below does:
 *  - A single byte 0xa0..0xdf (half-width kana) is replaced by the two-byte
 *    code from the hankana table; the marks 0x814a / 0x814b are merged into
 *    the preceding 0x83xx character when the dakuten / handaku table has a
 *    non-zero entry for it.
 *  - Any other high-bit byte is a lead byte and needs a second byte in
 *    0x40..0xfc; otherwise the lead byte is skipped.
 *  - Lead bytes 0x81..0x87 are classified by their low nibble: 0x8140 is
 *    handled as a blank, other 0x81xx symbols are mapped to ASCII through
 *    the symbol table when an entry exists, and 0x82xx digits and letters
 *    become ASCII (letters are lower-cased). All other lead bytes are
 *    copied unchanged and typed as kanji.
 *  - ASCII: bytes below 0x20 are dropped, 'A'..'Z' are lower-cased.
 *
 * Optional outputs, selected by nstr->flags:
 *  - SEN_STR_WITH_CHECKS: nstr->checks receives one int16_t per output byte;
 *    the entry for the first byte of each output character holds the number
 *    of source bytes consumed since the previous emitted character, the
 *    entries for the following bytes are 0.
 *  - SEN_STR_WITH_CTYPES: nstr->ctypes receives one type code per output
 *    character, terminated by sen_str_null.
 *  - SEN_STR_REMOVEBLANK: blanks are dropped instead of being emitted as ' '.
 *
 * On success nstr->length is the number of output characters and
 * nstr->norm_blen the number of output bytes. Returns sen_memory_exhausted
 * (after releasing what was already allocated) when an allocation fails.
 */
611 inline static sen_rc
normalize_sjis(sen_nstr * nstr)612 normalize_sjis(sen_nstr *nstr)
613 {
614   static uint16_t hankana[] = {
615     0x8140, 0x8142, 0x8175, 0x8176, 0x8141, 0x8145, 0x8392, 0x8340, 0x8342,
616     0x8344, 0x8346, 0x8348, 0x8383, 0x8385, 0x8387, 0x8362, 0x815b, 0x8341,
617     0x8343, 0x8345, 0x8347, 0x8349, 0x834a, 0x834c, 0x834e, 0x8350, 0x8352,
618     0x8354, 0x8356, 0x8358, 0x835a, 0x835c, 0x835e, 0x8360, 0x8363, 0x8365,
619     0x8367, 0x8369, 0x836a, 0x836b, 0x836c, 0x836d, 0x836e, 0x8371, 0x8374,
620     0x8377, 0x837a, 0x837d, 0x837e, 0x8380, 0x8381, 0x8382, 0x8384, 0x8386,
621     0x8388, 0x8389, 0x838a, 0x838b, 0x838c, 0x838d, 0x838f, 0x8393, 0x814a,
622     0x814b
623   };
624   static unsigned char dakuten[] = {
625     0x94, 0, 0, 0, 0, 0x4b, 0, 0x4d, 0, 0x4f, 0, 0x51, 0, 0x53, 0, 0x55, 0,
626     0x57, 0, 0x59, 0, 0x5b, 0, 0x5d, 0, 0x5f, 0, 0x61, 0, 0, 0x64, 0, 0x66,
627     0, 0x68, 0, 0, 0, 0, 0, 0, 0x6f, 0, 0, 0x72, 0, 0, 0x75, 0, 0, 0x78, 0,
628     0, 0x7b
629   };
630   static unsigned char handaku[] = {
631     0x70, 0, 0, 0x73, 0, 0, 0x76, 0, 0, 0x79, 0, 0, 0x7c
632   };
633   int16_t *ch;
634   sen_ctx *ctx = nstr->ctx;
635   const unsigned char *s, *s_;
636   unsigned char *d, *d0, *d_, b, *e;
637   uint_least8_t *cp, *ctypes, ctype;
638   size_t size = nstr->orig_blen, length = 0;
639   int removeblankp = nstr->flags & SEN_STR_REMOVEBLANK;
  /* A one-byte half-width kana expands to two bytes, hence twice the input
     size plus a terminator. */
640   if (!(nstr->norm = SEN_MALLOC(size * 2 + 1))) {
641     return sen_memory_exhausted;
642   }
643   d0 = (unsigned char *) nstr->norm;
644   if (nstr->flags & SEN_STR_WITH_CHECKS) {
645     if (!(nstr->checks = SEN_MALLOC(size * 2 * sizeof(int16_t) + 1))) {
646       SEN_FREE(nstr->norm);
647       nstr->norm = NULL;
648       return sen_memory_exhausted;
649     }
650   }
651   ch = nstr->checks;
652   if (nstr->flags & SEN_STR_WITH_CTYPES) {
653     if (!(nstr->ctypes = SEN_MALLOC(size + 1))) {
      /* NOTE(review): checks is freed here even when SEN_STR_WITH_CHECKS was
         not requested; normalize_utf8 guards the same call - confirm that
         SEN_FREE accepts NULL. */
654       SEN_FREE(nstr->checks);
655       SEN_FREE(nstr->norm);
656       nstr->checks = NULL;
657       nstr->norm = NULL;
658       return sen_memory_exhausted;
659     }
660   }
661   cp = ctypes = nstr->ctypes;
662   e = (unsigned char *)nstr->orig + size;
  /* s walks the source, d the output. s_ is the source position already
     accounted for in checks, d_ the output position already covered by a
     checks entry. Every path that emits a character leaves d on its LAST
     byte and falls through to the common bookkeeping at the bottom of the
     loop; paths that emit nothing use `continue`. */
663   for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) {
664     if ((*s & 0x80)) {
665       if (0xa0 <= *s && *s <= 0xdf) {
        /* Single-byte half-width kana, widened through the hankana table. */
666         uint16_t c = hankana[*s - 0xa0];
667         switch (c) {
668         case 0x814a :
          /* If the previous output character is 0x83xx and the dakuten
             table has an entry for it, rewrite that character in place and
             add the mark's single source byte to the last checks entry
             instead of emitting a new character. */
669           if (d > d0 + 1 && d[-2] == 0x83
670               && 0x45 <= d[-1] && d[-1] <= 0x7a && (b = dakuten[d[-1] - 0x45])) {
671             *(d - 1) = b;
672             if (ch) { ch[-1]++; s_++; }
673             continue;
674           } else {
675             *d++ = c >> 8; *d = c & 0xff;
676           }
677           break;
678         case 0x814b :
          /* Same merge as above, using the handaku table. */
679           if (d > d0 + 1 && d[-2] == 0x83
680               && 0x6e <= d[-1] && d[-1] <= 0x7a && (b = handaku[d[-1] - 0x6e])) {
681             *(d - 1) = b;
682             if (ch) { ch[-1]++; s_++; }
683             continue;
684           } else {
685             *d++ = c >> 8; *d = c & 0xff;
686           }
687           break;
688         default :
689           *d++ = c >> 8; *d = c & 0xff;
690           break;
691         }
692         ctype = sen_str_katakana;
693       } else {
        /* Two-byte character: the second byte must lie in 0x40..0xfc. */
694         if ((s + 1) < e && 0x40 <= *(s + 1) && *(s + 1) <= 0xfc) {
695           unsigned char c1 = *s++, c2 = *s, c3 = 0;
696           if (0x81 <= c1 && c1 <= 0x87) {
697             switch (c1 & 0x0f) {
698             case 1 :
699               switch (c2) {
700               case 0x5b :
701                 *d++ = c1; *d = c2;
702                 ctype = sen_str_katakana;
703                 break;
704               case 0x58 :
705                 *d++ = c1; *d = c2;
706                 ctype = sen_str_kanji;
707                 break;
708               case 0x40 :
                /* 0x8140 is treated like an ASCII blank: dropped (flagging
                   the previous character as followed by a blank) or emitted
                   as a single ' '. */
709                 if (removeblankp) {
710                   if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
711                   continue;
712                 } else {
713                   *d = ' ';
714                   ctype = SEN_NSTR_BLANK|sen_str_symbol;
715                 }
716                 break;
717               default :
                /* Two index offsets into the shared symbol table: 0x43 for
                   second bytes up to 0x7e and 0x44 from 0x7f onwards. */
718                 if (0x43 <= c2 && c2 <= 0x7e && (c3 = symbol[c2 - 0x43])) {
719                   *d = c3;
720                   ctype = sen_str_symbol;
721                 } else if (0x7f <= c2 && c2 <= 0x97 && (c3 = symbol[c2 - 0x44])) {
722                   *d = c3;
723                   ctype = sen_str_symbol;
724                 } else {
725                   *d++ = c1; *d = c2;
726                   ctype = sen_str_others;
727                 }
728                 break;
729               }
730               break;
731             case 2 :
              /* c3 is assigned here but not read again in this case. */
732               c3 = c2 - 0x1f;
733               if (0x4f <= c2 && c2 <= 0x58) {
734                 ctype = sen_str_digit;
735                 *d = c2 - 0x1f;
736               } else if (0x60 <= c2 && c2 <= 0x79) {
737                 ctype = sen_str_alpha;
738                 *d = c2 + 0x01;
739               } else if (0x81 <= c2 && c2 <= 0x9a) {
740                 ctype = sen_str_alpha;
741                 *d = c2 - 0x20;
742               } else if (0x9f <= c2 && c2 <= 0xf1) {
743                 *d++ = c1; *d = c2;
744                 ctype = sen_str_hiragana;
745               } else {
746                 *d++ = c1; *d = c2;
747                 ctype = sen_str_others;
748               }
749               break;
750             case 3 :
751               if (0x40 <= c2 && c2 <= 0x96) {
752                 *d++ = c1; *d = c2;
753                 ctype = sen_str_katakana;
754               } else {
755                 *d++ = c1; *d = c2;
756                 ctype = sen_str_symbol;
757               }
758               break;
759             case 4 :
760             case 7 :
761               *d++ = c1; *d = c2;
762               ctype = sen_str_symbol;
763               break;
764             default :
765               *d++ = c1; *d = c2;
766               ctype = sen_str_others;
767               break;
768             }
769           } else {
            /* Lead bytes outside 0x81..0x87 are copied unchanged and typed
               as kanji. */
770             *d++ = c1; *d = c2;
771             ctype = sen_str_kanji;
772           }
773         } else {
774           /* skip invalid character */
775           continue;
776         }
777       }
778     } else {
      /* 7-bit byte: classified by its high nibble. */
779       unsigned char c = *s;
780       switch (c >> 4) {
781       case 0 :
782       case 1 :
783         /* skip unprintable ascii */
784         if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
785         continue;
786       case 2 :
787         if (c == 0x20) {
788           if (removeblankp) {
789             if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
790             continue;
791           } else {
792             *d = ' ';
793             ctype = SEN_NSTR_BLANK|sen_str_symbol;
794           }
795         } else {
796           *d = c;
797           ctype = sen_str_symbol;
798         }
799         break;
800       case 3 :
801         *d = c;
802         ctype = (c <= 0x39) ? sen_str_digit : sen_str_symbol;
803         break;
804       case 4 :
805         *d = ('A' <= c) ? c + 0x20 : c;
806         ctype = (c == 0x40) ? sen_str_symbol : sen_str_alpha;
807         break;
808       case 5 :
809         *d = (c <= 'Z') ? c + 0x20 : c;
810         ctype = (c <= 0x5a) ? sen_str_alpha : sen_str_symbol;
811         break;
812       case 6 :
813         *d = c;
814         ctype = (c == 0x60) ? sen_str_symbol : sen_str_alpha;
815         break;
816       case 7 :
817         *d = c;
818         ctype = (c <= 0x7a) ? sen_str_alpha : (c == 0x7f ? sen_str_others : sen_str_symbol);
819         break;
820       default :
821         *d = c;
822         ctype = sen_str_others;
823         break;
824       }
825     }
    /* Common bookkeeping for an emitted character: step past its last byte,
       record its type, and write its checks entries (source byte count for
       the first output byte, 0 for each further output byte). */
826     d++;
827     length++;
828     if (cp) { *cp++ = ctype; }
829     if (ch) {
830       *ch++ = (int16_t)(s + 1 - s_);
831       s_ = s + 1;
832       while (++d_ < d) { *ch++ = 0; }
833     }
834   }
835   if (cp) { *cp = sen_str_null; }
836   *d = '\0';
837   nstr->length = length;
838   nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
839   return sen_success;
840 }
841 
842 inline static sen_rc
normalize_none(sen_nstr * nstr)843 normalize_none(sen_nstr *nstr)
844 {
845   int16_t *ch;
846   sen_ctx *ctx = nstr->ctx;
847   const unsigned char *s, *s_, *e;
848   unsigned char *d, *d0, *d_;
849   uint_least8_t *cp, *ctypes, ctype;
850   size_t size = nstr->orig_blen, length = 0;
851   int removeblankp = nstr->flags & SEN_STR_REMOVEBLANK;
852   if (!(nstr->norm = SEN_MALLOC(size + 1))) {
853     return sen_memory_exhausted;
854   }
855   d0 = (unsigned char *) nstr->norm;
856   if (nstr->flags & SEN_STR_WITH_CHECKS) {
857     if (!(nstr->checks = SEN_MALLOC(size * sizeof(int16_t) + 1))) {
858       SEN_FREE(nstr->norm);
859       nstr->norm = NULL;
860       return sen_memory_exhausted;
861     }
862   }
863   ch = nstr->checks;
864   if (nstr->flags & SEN_STR_WITH_CTYPES) {
865     if (!(nstr->ctypes = SEN_MALLOC(size + 1))) {
866       SEN_FREE(nstr->checks);
867       SEN_FREE(nstr->norm);
868       nstr->checks = NULL;
869       nstr->norm = NULL;
870       return sen_memory_exhausted;
871     }
872   }
873   cp = ctypes = nstr->ctypes;
874   e = (unsigned char *)nstr->orig + size;
875   for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) {
876     unsigned char c = *s;
877     switch (c >> 4) {
878     case 0 :
879     case 1 :
880       /* skip unprintable ascii */
881       if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
882       continue;
883     case 2 :
884       if (c == 0x20) {
885         if (removeblankp) {
886           if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
887           continue;
888         } else {
889           *d = ' ';
890           ctype = SEN_NSTR_BLANK|sen_str_symbol;
891         }
892       } else {
893         *d = c;
894         ctype = sen_str_symbol;
895       }
896       break;
897     case 3 :
898       *d = c;
899       ctype = (c <= 0x39) ? sen_str_digit : sen_str_symbol;
900       break;
901     case 4 :
902       *d = ('A' <= c) ? c + 0x20 : c;
903       ctype = (c == 0x40) ? sen_str_symbol : sen_str_alpha;
904       break;
905     case 5 :
906       *d = (c <= 'Z') ? c + 0x20 : c;
907       ctype = (c <= 0x5a) ? sen_str_alpha : sen_str_symbol;
908       break;
909     case 6 :
910       *d = c;
911       ctype = (c == 0x60) ? sen_str_symbol : sen_str_alpha;
912       break;
913     case 7 :
914       *d = c;
915       ctype = (c <= 0x7a) ? sen_str_alpha : (c == 0x7f ? sen_str_others : sen_str_symbol);
916       break;
917     default :
918       *d = c;
919       ctype = sen_str_others;
920       break;
921     }
922     d++;
923     length++;
924     if (cp) { *cp++ = ctype; }
925     if (ch) {
926       *ch++ = (int16_t)(s + 1 - s_);
927       s_ = s + 1;
928       while (++d_ < d) { *ch++ = 0; }
929     }
930   }
931   if (cp) { *cp = sen_str_null; }
932   *d = '\0';
933   nstr->length = length;
934   nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
935   return sen_success;
936 }
937 
938 /* use cp1252 as latin1 */
939 inline static sen_rc
normalize_latin1(sen_nstr * nstr)940 normalize_latin1(sen_nstr *nstr)
941 {
942   int16_t *ch;
943   sen_ctx *ctx = nstr->ctx;
944   const unsigned char *s, *s_, *e;
945   unsigned char *d, *d0, *d_;
946   uint_least8_t *cp, *ctypes, ctype;
947   size_t size = strlen(nstr->orig), length = 0;
948   int removeblankp = nstr->flags & SEN_STR_REMOVEBLANK;
949   if (!(nstr->norm = SEN_MALLOC(size + 1))) {
950     return sen_memory_exhausted;
951   }
952   d0 = (unsigned char *) nstr->norm;
953   if (nstr->flags & SEN_STR_WITH_CHECKS) {
954     if (!(nstr->checks = SEN_MALLOC(size * sizeof(int16_t) + 1))) {
955       SEN_FREE(nstr->norm);
956       nstr->norm = NULL;
957       return sen_memory_exhausted;
958     }
959   }
960   ch = nstr->checks;
961   if (nstr->flags & SEN_STR_WITH_CTYPES) {
962     if (!(nstr->ctypes = SEN_MALLOC(size + 1))) {
963       SEN_FREE(nstr->checks);
964       SEN_FREE(nstr->norm);
965       nstr->checks = NULL;
966       nstr->norm = NULL;
967       return sen_memory_exhausted;
968     }
969   }
970   cp = ctypes = nstr->ctypes;
971   e = (unsigned char *)nstr->orig + size;
972   for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) {
973     unsigned char c = *s;
974     switch (c >> 4) {
975     case 0 :
976     case 1 :
977       /* skip unprintable ascii */
978       if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
979       continue;
980     case 2 :
981       if (c == 0x20) {
982         if (removeblankp) {
983           if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
984           continue;
985         } else {
986           *d = ' ';
987           ctype = SEN_NSTR_BLANK|sen_str_symbol;
988         }
989       } else {
990         *d = c;
991         ctype = sen_str_symbol;
992       }
993       break;
994     case 3 :
995       *d = c;
996       ctype = (c <= 0x39) ? sen_str_digit : sen_str_symbol;
997       break;
998     case 4 :
999       *d = ('A' <= c) ? c + 0x20 : c;
1000       ctype = (c == 0x40) ? sen_str_symbol : sen_str_alpha;
1001       break;
1002     case 5 :
1003       *d = (c <= 'Z') ? c + 0x20 : c;
1004       ctype = (c <= 0x5a) ? sen_str_alpha : sen_str_symbol;
1005       break;
1006     case 6 :
1007       *d = c;
1008       ctype = (c == 0x60) ? sen_str_symbol : sen_str_alpha;
1009       break;
1010     case 7 :
1011       *d = c;
1012       ctype = (c <= 0x7a) ? sen_str_alpha : (c == 0x7f ? sen_str_others : sen_str_symbol);
1013       break;
1014     case 8 :
1015       if (c == 0x8a || c == 0x8c || c == 0x8e) {
1016         *d = c + 0x10;
1017         ctype = sen_str_alpha;
1018       } else {
1019         *d = c;
1020         ctype = sen_str_symbol;
1021       }
1022       break;
1023     case 9 :
1024       if (c == 0x9a || c == 0x9c || c == 0x9e || c == 0x9f) {
1025         *d = (c == 0x9f) ? c + 0x60 : c;
1026         ctype = sen_str_alpha;
1027       } else {
1028         *d = c;
1029         ctype = sen_str_symbol;
1030       }
1031       break;
1032     case 0x0c :
1033       *d = c + 0x20;
1034       ctype = sen_str_alpha;
1035       break;
1036     case 0x0d :
1037       *d = (c == 0xd7 || c == 0xdf) ? c : c + 0x20;
1038       ctype = (c == 0xd7) ? sen_str_symbol : sen_str_alpha;
1039       break;
1040     case 0x0e :
1041       *d = c;
1042       ctype = sen_str_alpha;
1043       break;
1044     case 0x0f :
1045       *d = c;
1046       ctype = (c == 0xf7) ? sen_str_symbol : sen_str_alpha;
1047       break;
1048     default :
1049       *d = c;
1050       ctype = sen_str_others;
1051       break;
1052     }
1053     d++;
1054     length++;
1055     if (cp) { *cp++ = ctype; }
1056     if (ch) {
1057       *ch++ = (int16_t)(s + 1 - s_);
1058       s_ = s + 1;
1059       while (++d_ < d) { *ch++ = 0; }
1060     }
1061   }
1062   if (cp) { *cp = sen_str_null; }
1063   *d = '\0';
1064   nstr->length = length;
1065   nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
1066   return sen_success;
1067 }
1068 
1069 inline static sen_rc
normalize_koi8r(sen_nstr * nstr)1070 normalize_koi8r(sen_nstr *nstr)
1071 {
1072   int16_t *ch;
1073   sen_ctx *ctx = nstr->ctx;
1074   const unsigned char *s, *s_, *e;
1075   unsigned char *d, *d0, *d_;
1076   uint_least8_t *cp, *ctypes, ctype;
1077   size_t size = strlen(nstr->orig), length = 0;
1078   int removeblankp = nstr->flags & SEN_STR_REMOVEBLANK;
1079   if (!(nstr->norm = SEN_MALLOC(size + 1))) {
1080     return sen_memory_exhausted;
1081   }
1082   d0 = (unsigned char *) nstr->norm;
1083   if (nstr->flags & SEN_STR_WITH_CHECKS) {
1084     if (!(nstr->checks = SEN_MALLOC(size * sizeof(int16_t) + 1))) {
1085       SEN_FREE(nstr->norm);
1086       nstr->norm = NULL;
1087       return sen_memory_exhausted;
1088     }
1089   }
1090   ch = nstr->checks;
1091   if (nstr->flags & SEN_STR_WITH_CTYPES) {
1092     if (!(nstr->ctypes = SEN_MALLOC(size + 1))) {
1093       SEN_FREE(nstr->checks);
1094       SEN_FREE(nstr->norm);
1095       nstr->checks = NULL;
1096       nstr->norm = NULL;
1097       return sen_memory_exhausted;
1098     }
1099   }
1100   cp = ctypes = nstr->ctypes;
1101   e = (unsigned char *)nstr->orig + size;
1102   for (s = s_ = (unsigned char *) nstr->orig, d = d_ = d0; s < e; s++) {
1103     unsigned char c = *s;
1104     switch (c >> 4) {
1105     case 0 :
1106     case 1 :
1107       /* skip unprintable ascii */
1108       if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
1109       continue;
1110     case 2 :
1111       if (c == 0x20) {
1112         if (removeblankp) {
1113           if (cp > ctypes) { *(cp - 1) |= SEN_NSTR_BLANK; }
1114           continue;
1115         } else {
1116           *d = ' ';
1117           ctype = SEN_NSTR_BLANK|sen_str_symbol;
1118         }
1119       } else {
1120         *d = c;
1121         ctype = sen_str_symbol;
1122       }
1123       break;
1124     case 3 :
1125       *d = c;
1126       ctype = (c <= 0x39) ? sen_str_digit : sen_str_symbol;
1127       break;
1128     case 4 :
1129       *d = ('A' <= c) ? c + 0x20 : c;
1130       ctype = (c == 0x40) ? sen_str_symbol : sen_str_alpha;
1131       break;
1132     case 5 :
1133       *d = (c <= 'Z') ? c + 0x20 : c;
1134       ctype = (c <= 0x5a) ? sen_str_alpha : sen_str_symbol;
1135       break;
1136     case 6 :
1137       *d = c;
1138       ctype = (c == 0x60) ? sen_str_symbol : sen_str_alpha;
1139       break;
1140     case 7 :
1141       *d = c;
1142       ctype = (c <= 0x7a) ? sen_str_alpha : (c == 0x7f ? sen_str_others : sen_str_symbol);
1143       break;
1144     case 0x0a :
1145       *d = c;
1146       ctype = (c == 0xa3) ? sen_str_alpha : sen_str_others;
1147       break;
1148     case 0x0b :
1149       if (c == 0xb3) {
1150         *d = c - 0x10;
1151         ctype = sen_str_alpha;
1152       } else {
1153         *d = c;
1154         ctype = sen_str_others;
1155       }
1156       break;
1157     case 0x0c :
1158     case 0x0d :
1159       *d = c;
1160       ctype = sen_str_alpha;
1161       break;
1162     case 0x0e :
1163     case 0x0f :
1164       *d = c - 0x20;
1165       ctype = sen_str_alpha;
1166       break;
1167     default :
1168       *d = c;
1169       ctype = sen_str_others;
1170       break;
1171     }
1172     d++;
1173     length++;
1174     if (cp) { *cp++ = ctype; }
1175     if (ch) {
1176       *ch++ = (int16_t)(s + 1 - s_);
1177       s_ = s + 1;
1178       while (++d_ < d) { *ch++ = 0; }
1179     }
1180   }
1181   if (cp) { *cp = sen_str_null; }
1182   *d = '\0';
1183   nstr->length = length;
1184   nstr->norm_blen = (size_t)(d - (unsigned char *)nstr->norm);
1185   return sen_success;
1186 }
1187 
1188 sen_nstr *
sen_nstr_open(const char * str,size_t str_len,sen_encoding encoding,int flags)1189 sen_nstr_open(const char *str, size_t str_len, sen_encoding encoding, int flags)
1190 {
1191   sen_rc rc;
1192   sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
1193   sen_nstr *nstr;
1194   if (!str) { return NULL; }
1195   if (!(nstr = SEN_MALLOC(sizeof(sen_nstr)))) {
1196     SEN_LOG(sen_log_alert, "memory allocation on sen_fakenstr_open failed !");
1197     return NULL;
1198   }
1199   nstr->orig = str;
1200   nstr->orig_blen = str_len;
1201   nstr->norm = NULL;
1202   nstr->norm_blen = 0;
1203   nstr->checks = NULL;
1204   nstr->ctypes = NULL;
1205   nstr->encoding = encoding;
1206   nstr->flags = flags;
1207   nstr->ctx = ctx;
1208   switch (encoding) {
1209   case sen_enc_euc_jp :
1210     rc = normalize_euc(nstr);
1211     break;
1212   case sen_enc_utf8 :
1213 #ifdef NO_NFKC
1214     rc = normalize_none(nstr);
1215 #else /* NO_NFKC */
1216     rc = normalize_utf8(nstr);
1217 #endif /* NO_NFKC */
1218     break;
1219   case sen_enc_sjis :
1220     rc = normalize_sjis(nstr);
1221     break;
1222   case sen_enc_latin1 :
1223     rc = normalize_latin1(nstr);
1224     break;
1225   case sen_enc_koi8r :
1226     rc = normalize_koi8r(nstr);
1227     break;
1228   default :
1229     rc = normalize_none(nstr);
1230     break;
1231   }
1232   if (rc) {
1233     sen_nstr_close(nstr);
1234     return NULL;
1235   }
1236   return nstr;
1237 }
1238 
1239 sen_nstr *
sen_fakenstr_open(const char * str,size_t str_len,sen_encoding encoding,int flags)1240 sen_fakenstr_open(const char *str, size_t str_len, sen_encoding encoding, int flags)
1241 {
1242   /* TODO: support SEN_STR_REMOVEBLANK flag and ctypes */
1243   sen_nstr *nstr;
1244   sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
1245 
1246   if (!(nstr = SEN_MALLOC(sizeof(sen_nstr)))) {
1247     SEN_LOG(sen_log_alert, "memory allocation on sen_fakenstr_open failed !");
1248     return NULL;
1249   }
1250   if (!(nstr->norm = SEN_MALLOC(str_len + 1))) {
1251     SEN_LOG(sen_log_alert, "memory allocation for keyword on sen_snip_add_cond failed !");
1252     SEN_FREE(nstr);
1253     return NULL;
1254   }
1255   nstr->orig = str;
1256   nstr->orig_blen = str_len;
1257   memcpy(nstr->norm, str, str_len);
1258   nstr->norm[str_len] = '\0';
1259   nstr->norm_blen = str_len;
1260   nstr->ctypes = NULL;
1261   nstr->flags = flags;
1262   nstr->ctx = ctx;
1263 
1264   if (flags & SEN_STR_WITH_CHECKS) {
1265     int16_t f = 0;
1266     unsigned char c;
1267     size_t i;
1268     if (!(nstr->checks = (int16_t *) SEN_MALLOC(sizeof(int16_t) * str_len))) {
1269       SEN_FREE(nstr->norm);
1270       SEN_FREE(nstr);
1271       return NULL;
1272     }
1273     switch (encoding) {
1274     case sen_enc_euc_jp:
1275       for (i = 0; i < str_len; i++) {
1276         if (!f) {
1277           c = (unsigned char) str[i];
1278           f = ((c >= 0xa1U && c <= 0xfeU) || c == 0x8eU ? 2 : (c == 0x8fU ? 3 : 1)
1279             );
1280           nstr->checks[i] = f;
1281         } else {
1282           nstr->checks[i] = 0;
1283         }
1284         f--;
1285       }
1286       break;
1287     case sen_enc_sjis:
1288       for (i = 0; i < str_len; i++) {
1289         if (!f) {
1290           c = (unsigned char) str[i];
1291           f = (c >= 0x81U && ((c <= 0x9fU) || (c >= 0xe0U && c <= 0xfcU)) ? 2 : 1);
1292           nstr->checks[i] = f;
1293         } else {
1294           nstr->checks[i] = 0;
1295         }
1296         f--;
1297       }
1298       break;
1299     case sen_enc_utf8:
1300       for (i = 0; i < str_len; i++) {
1301         if (!f) {
1302           c = (unsigned char) str[i];
1303           f = (c & 0x80U ? (c & 0x20U ? (c & 0x10U ? 4 : 3)
1304                            : 2)
1305                : 1);
1306           nstr->checks[i] = f;
1307         } else {
1308           nstr->checks[i] = 0;
1309         }
1310         f--;
1311       }
1312       break;
1313     default:
1314       for (i = 0; i < str_len; i++) {
1315         nstr->checks[i] = 1;
1316       }
1317       break;
1318     }
1319   }
1320   else {
1321     nstr->checks = NULL;
1322   }
1323   return nstr;
1324 }
1325 
1326 sen_rc
sen_nstr_close(sen_nstr * nstr)1327 sen_nstr_close(sen_nstr *nstr)
1328 {
1329   if (nstr) {
1330     sen_ctx *ctx = nstr->ctx;
1331     if (nstr->norm) { SEN_FREE(nstr->norm); }
1332     if (nstr->ctypes) { SEN_FREE(nstr->ctypes); }
1333     if (nstr->checks) { SEN_FREE(nstr->checks); }
1334     SEN_FREE(nstr);
1335     return sen_success;
1336   } else {
1337     return sen_invalid_argument;
1338   }
1339 }
1340 
1341 static const char *sen_enc_string[] = {
1342   "default",
1343   "none",
1344   "euc_jp",
1345   "utf8",
1346   "sjis",
1347   "latin1",
1348   "koi8r"
1349 };
1350 
1351 const char *
sen_enctostr(sen_encoding enc)1352 sen_enctostr(sen_encoding enc)
1353 {
1354   if (enc < (sizeof(sen_enc_string) / sizeof(char *))) {
1355     return sen_enc_string[enc];
1356   } else {
1357     return "unknown";
1358   }
1359 }
1360 
1361 sen_encoding
sen_strtoenc(const char * str)1362 sen_strtoenc(const char *str)
1363 {
1364   sen_encoding e = sen_enc_euc_jp;
1365   int i = sizeof(sen_enc_string) / sizeof(sen_enc_string[0]);
1366   while (i--) {
1367     if (!strcmp(str, sen_enc_string[i])) {
1368       e = (sen_encoding)i;
1369     }
1370   }
1371   return e;
1372 }
1373 
1374 size_t
sen_str_len(const char * str,sen_encoding encoding,const char ** last)1375 sen_str_len(const char *str, sen_encoding encoding, const char **last)
1376 {
1377   size_t len, tlen;
1378   const char *p = NULL;
1379   for (len = 0; ; len++) {
1380     p = str;
1381     if (!(tlen = sen_str_charlen(str, encoding))) {
1382       break;
1383     }
1384     str += tlen;
1385   }
1386   if (last) { *last = p; }
1387   return len;
1388 }
1389 
1390 int
sen_isspace(const char * str,sen_encoding encoding)1391 sen_isspace(const char *str, sen_encoding encoding)
1392 {
1393   const unsigned char *s = (const unsigned char *) str;
1394   if (!s) { return 0; }
1395   switch (s[0]) {
1396   case ' ' :
1397   case '\f' :
1398   case '\n' :
1399   case '\r' :
1400   case '\t' :
1401   case '\v' :
1402     return 1;
1403   case 0x81 :
1404     if (encoding == sen_enc_sjis && s[1] == 0x40) { return 2; }
1405     break;
1406   case 0xA1 :
1407     if (encoding == sen_enc_euc_jp && s[1] == 0xA1) { return 2; }
1408     break;
1409   case 0xE3 :
1410     if (encoding == sen_enc_utf8 && s[1] == 0x80 && s[2] == 0x80) { return 3; }
1411     break;
1412   default :
1413     break;
1414   }
1415   return 0;
1416 }
1417 
/*
 * Parse an optionally '-'-prefixed decimal integer from [nptr, end).
 *
 * When rest is non-NULL it receives the position where parsing stopped; a
 * lone '-' without any digit consumes nothing (*rest == nptr).
 * If the value does not fit in an int, 0 is returned and *rest points at the
 * digit that would have overflowed. The whole int range, including the most
 * negative value, is accepted.
 */
int
sen_atoi(const char *nptr, const char *end, const char **rest)
{
  /* largest int, derived without <limits.h>; assumes unsigned int has
     exactly one more value bit than int */
  const unsigned int int_max = ~0U >> 1;
  const char *p = nptr;
  unsigned int mag = 0, limit = int_max;
  int negative = 0, sign_only = 0;
  if (p < end && *p == '-') {
    p++;
    negative = 1;
    sign_only = 1;
    limit = int_max + 1; /* magnitude of the most negative int */
  }
  while (p < end && *p >= '0' && *p <= '9') {
    unsigned int digit = (unsigned int)(*p - '0');
    /* test before multiplying: mag * 10 + digit must stay <= limit */
    if (mag > (limit - digit) / 10) { mag = 0; break; }
    mag = mag * 10 + digit;
    sign_only = 0;
    p++;
  }
  if (rest) { *rest = sign_only ? nptr : p; }
  if (!negative) { return (int)mag; }
  /* int_max + 1 cannot be negated as an int, build it in two steps */
  return (mag > int_max) ? -(int)int_max - 1 : -(int)mag;
}
1439 
/*
 * Parse an unsigned decimal integer from [nptr, end).
 *
 * When rest is non-NULL it receives the position where parsing stopped.
 * If the value does not fit in an unsigned int, 0 is returned and *rest
 * points at the digit that would have overflowed.
 */
unsigned int
sen_atoui(const char *nptr, const char *end, const char **rest)
{
  const unsigned int uint_max = ~0U;
  unsigned int v = 0;
  while (nptr < end && *nptr >= '0' && *nptr <= '9') {
    unsigned int digit = (unsigned int)(*nptr - '0');
    /* test before multiplying: a wrapped result is not always smaller
       than the previous value */
    if (v > (uint_max - digit) / 10) { v = 0; break; }
    v = v * 10 + digit;
    nptr++;
  }
  if (rest) { *rest = nptr; }
  return v;
}
1453 
/*
 * Parse an optionally '-'-prefixed decimal integer from [nptr, end) into an
 * int64_t.
 *
 * When rest is non-NULL it receives the position where parsing stopped; a
 * lone '-' without any digit consumes nothing (*rest == nptr).
 * If the value does not fit in an int64_t, 0 is returned and *rest points at
 * the digit that would have overflowed. The whole int64_t range, including
 * the most negative value, is accepted.
 */
int64_t
sen_atoll(const char *nptr, const char *end, const char **rest)
{
  const uint64_t max_pos = ~(uint64_t)0 >> 1; /* largest int64_t */
  const char *p = nptr;
  uint64_t mag = 0, limit = max_pos;
  int negative = 0, sign_only = 0;
  if (p < end && *p == '-') {
    p++;
    negative = 1;
    sign_only = 1;
    limit = max_pos + 1; /* magnitude of the most negative int64_t */
  }
  while (p < end && *p >= '0' && *p <= '9') {
    uint64_t digit = (uint64_t)(*p - '0');
    /* test before multiplying: mag * 10 + digit must stay <= limit */
    if (mag > (limit - digit) / 10) { mag = 0; break; }
    mag = mag * 10 + digit;
    sign_only = 0;
    p++;
  }
  if (rest) { *rest = sign_only ? nptr : p; }
  if (!negative) { return (int64_t)mag; }
  /* max_pos + 1 cannot be negated as an int64_t, build it in two steps */
  return (mag > max_pos) ? -(int64_t)max_pos - 1 : -(int64_t)mag;
}
1476 
/*
 * Parse a hexadecimal number (digits 0-9, a-f, A-F, no prefix) from
 * [nptr, end).
 *
 * Unlike sen_atoui(), any non-hex character inside the range makes the
 * result 0; *rest (when rest is non-NULL) then points at that character.
 * If the value does not fit in an unsigned int, 0 is returned and *rest
 * points just past the digit that overflowed.
 */
unsigned int
sen_htoui(const char *nptr, const char *end, const char **rest)
{
  /* largest value that can still take one more hex digit */
  const unsigned int max_before_shift = ~0U >> 4;
  unsigned int v = 0;
  while (nptr < end) {
    unsigned int digit;
    const char c = *nptr;
    if (c >= '0' && c <= '9') {
      digit = (unsigned int)(c - '0');
    } else if (c >= 'a' && c <= 'f') {
      digit = (unsigned int)(c - 'a') + 10;
    } else if (c >= 'A' && c <= 'F') {
      digit = (unsigned int)(c - 'A') + 10;
    } else {
      v = 0;
      break;
    }
    /* the digit is consumed before the overflow test */
    nptr++;
    if (v > max_before_shift) { v = 0; break; }
    v = v * 16 + digit;
  }
  if (rest) { *rest = nptr; }
  return v;
}
1521 
/*
 * Write the low len hex digits of i (upper case, zero padded, most
 * significant first) to p, followed by a NUL: len + 1 bytes are written.
 */
void
sen_str_itoh(unsigned int i, char *p, unsigned int len)
{
  static const char *digits = "0123456789ABCDEF";
  unsigned int pos = len;
  p[pos] = '\0';
  /* fill from the least significant digit backwards */
  while (pos > 0) {
    p[--pos] = digits[i & 0xf];
    i >>= 4;
  }
}
1533 
1534 sen_rc
sen_str_itoa(int i,char * p,char * end,char ** rest)1535 sen_str_itoa(int i, char *p, char *end, char **rest)
1536 {
1537   /* FIXME: INT_MIN is not supported */
1538   char *q;
1539   if (p >= end) { return sen_invalid_argument; }
1540   if (i < 0) {
1541     *p++ = '-';
1542     i = -i;
1543   }
1544   q = p;
1545   do {
1546     if (p >= end) { return sen_invalid_argument; }
1547     *p++ = i % 10 + '0';
1548   } while ((i /= 10) > 0);
1549   if (rest) { *rest = p; }
1550   for (p--; q < p; q++, p--) {
1551     char t = *q;
1552     *q = *p;
1553     *p = t;
1554   }
1555   return sen_success;
1556 }
1557 
1558 sen_rc
sen_str_lltoa(int64_t i,char * p,char * end,char ** rest)1559 sen_str_lltoa(int64_t i, char *p, char *end, char **rest)
1560 {
1561   /* FIXME: INT_MIN is not supported */
1562   char *q;
1563   if (p >= end) { return sen_invalid_argument; }
1564   if (i < 0) {
1565     *p++ = '-';
1566     i = -i;
1567   }
1568   q = p;
1569   do {
1570     if (p >= end) { return sen_invalid_argument; }
1571     *p++ = i % 10 + '0';
1572   } while ((i /= 10) > 0);
1573   if (rest) { *rest = p; }
1574   for (p--; q < p; q++, p--) {
1575     char t = *q;
1576     *q = *p;
1577     *p = t;
1578   }
1579   return sen_success;
1580 }
1581 
1582 #define I2B(i) \
1583  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(i) & 0x3f])
1584 
1585 #define B2I(b) \
1586  (((b) < '+' || 'z' < (b)) ? 0xff : "\x3e\xff\xff\xff\x3f\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\xff\xff\xff\xff\xff\xff\xff\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\xff\xff\xff\xff\xff\xff\x1a\x1b\x1c\x1d\x1e\x1f\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f\x30\x31\x32\x33"[(b) - '+'])
1587 
1588 #define MASK 0x34d34d34
1589 
1590 char *
sen_str_itob(sen_id id,char * p)1591 sen_str_itob(sen_id id, char *p)
1592 {
1593   id ^= MASK;
1594   *p++ = I2B(id >> 24);
1595   *p++ = I2B(id >> 18);
1596   *p++ = I2B(id >> 12);
1597   *p++ = I2B(id >> 6);
1598   *p++ = I2B(id);
1599   return p;
1600 }
1601 
1602 sen_id
sen_str_btoi(char * b)1603 sen_str_btoi(char *b)
1604 {
1605   uint8_t i;
1606   sen_id id = 0;
1607   int len = 5;
1608   while (len--) {
1609     char c = *b++;
1610     if ((i = B2I(c)) == 0xff) { return 0; }
1611     id = (id << 6) + i;
1612   }
1613   return id ^ MASK;
1614 }
1615 
/* low 5 bits of i -> one of the 32 digits 0-9, A-V */
#define I2B32H(i) ("0123456789ABCDEFGHIJKLMNOPQRSTUV"[(i) & 0x1f])

/*
 * Encode i as 13 base-32 digits written to p (no NUL is added).
 * 2^63 is added first, so the most negative value encodes as all '0'.
 * The first digit carries the top 4 bits, the other twelve 5 bits each.
 * Returns p + 13.
 */
char *
sen_str_lltob32h(int64_t i, char *p)
{
  const uint64_t u = (uint64_t)i + 0x8000000000000000ULL;
  int shift;
  for (shift = 60; shift >= 0; shift -= 5) {
    *p++ = I2B32H(u >> shift);
  }
  return p;
}
1637 
/*
 * Encode the unsigned 64-bit value i as 13 characters written to p (no NUL
 * is added). Returns p + 13.
 *
 * 2^63 is added to i (modulo 2^64) before the digits are taken. The first
 * character is the digit for the top 4 bits of that sum, with 0x10 added to
 * the character code when bit 63 of the original i was set.
 * NOTE(review): the 0x10 is added to the character, not to the digit index;
 * confirm this is the intended on-disk format before touching it.
 */
char *
sen_str_ulltob32h(uint64_t i, char *p)
{
  int shift;
  char lb = (i >> 59) & 0x10;
  i += 0x8000000000000000ULL;
  *p++ = lb + I2B32H(i >> 60);
  /* the remaining twelve digits carry 5 bits each */
  for (shift = 55; shift >= 0; shift -= 5) {
    *p++ = I2B32H(i >> shift);
  }
  return p;
}
1658 
/*
 * Split the str_len bytes at str on delim, in place.
 *
 * Every delimiter met is overwritten with NUL and its address is stored in
 * tokbuf, so tokbuf receives the END of each token; the end of the input is
 * stored as the end of the last token. At most buf_size entries are stored:
 * scanning stops on the delimiter that fills the buffer.
 * When rest is non-NULL it receives the position where scanning stopped.
 * Returns the number of entries stored (0 when buf_size <= 0).
 */
int
sen_str_tok(char *str, size_t str_len, char delim, char **tokbuf, int buf_size, char **rest)
{
  char **slot = tokbuf;
  if (buf_size > 0) {
    char **const slot_end = tokbuf + buf_size;
    char *const limit = str + str_len;
    for (; str != limit; str++) {
      if (*str != delim) { continue; }
      *str = '\0';
      *slot++ = str;
      if (slot == slot_end) { break; }
    }
    /* the buffer is only full here when the loop was left early */
    if (slot != slot_end) { *slot++ = str; }
  }
  if (rest) { *rest = str; }
  return slot - tokbuf;
}
1680 
1681 inline static void
op_getopt_flag(int * flags,const sen_str_getopt_opt * o,int argc,char * const argv[],int * i)1682 op_getopt_flag(int *flags, const sen_str_getopt_opt *o,
1683                int argc, char * const argv[], int *i)
1684 {
1685   switch (o->op) {
1686     case getopt_op_none:
1687       break;
1688     case getopt_op_on:
1689       *flags |= o->flag;
1690       break;
1691     case getopt_op_off:
1692       *flags &= ~o->flag;
1693       break;
1694     case getopt_op_update:
1695       *flags = o->flag;
1696       break;
1697     default:
1698       return;
1699   }
1700   if (o->arg) {
1701     if (++(*i) < argc) {
1702       *o->arg = argv[*i];
1703     } else {
1704       /* TODO: error */
1705     }
1706   }
1707 }
1708 
1709 int
sen_str_getopt(int argc,char * const argv[],const sen_str_getopt_opt * opts,int * flags)1710 sen_str_getopt(int argc, char * const argv[], const sen_str_getopt_opt *opts,
1711                int *flags)
1712 {
1713   int i;
1714   for (i = 1; i < argc; i++) {
1715     const char * v = argv[i];
1716     if (*v == '-') {
1717       const sen_str_getopt_opt *o;
1718       int found;
1719       if (*++v == '-') {
1720         found = 0;
1721         for (o = opts; o->opt != '\0' || o->longopt != NULL; o++) {
1722           if (o->longopt && !strcmp(v, o->longopt)) {
1723             op_getopt_flag(flags, o, argc, argv, &i);
1724             found = 1;
1725             break;
1726           }
1727         }
1728         if (!found) { goto exit; }
1729       } else {
1730         const char *p;
1731         for (p = v; *p; p++) {
1732           found = 0;
1733           for (o = opts; o->opt != '\0' || o->longopt != NULL; o++) {
1734             if (o->opt && *p == o->opt) {
1735               op_getopt_flag(flags, o, argc, argv, &i);
1736               found = 1;
1737               break;
1738             }
1739           }
1740           if (!found) { goto exit; }
1741         }
1742       }
1743     } else {
1744       break;
1745     }
1746   }
1747   return i;
1748 exit:
1749   fprintf(stderr, "cannot recognize option '%s'.\n", argv[i]);
1750   return -1;
1751 }
1752 
1753 #define UNIT_SIZE (1 << 12)
1754 #define UNIT_MASK (UNIT_SIZE - 1)
1755 
1756 int sen_rbuf_margin_size = 0;
1757 
1758 sen_rc
sen_rbuf_init(sen_rbuf * buf,size_t size)1759 sen_rbuf_init(sen_rbuf *buf, size_t size)
1760 {
1761   buf->head = NULL;
1762   buf->curr = NULL;
1763   buf->tail = NULL;
1764   return size ? sen_rbuf_resize(buf, size) : sen_success;
1765 }
1766 
1767 sen_rc
sen_rbuf_resize(sen_rbuf * buf,size_t newsize)1768 sen_rbuf_resize(sen_rbuf *buf, size_t newsize)
1769 {
1770   char *head;
1771   sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
1772   newsize += sen_rbuf_margin_size + 1;
1773   newsize = (newsize + (UNIT_MASK)) & ~UNIT_MASK;
1774   head = buf->head - (buf->head ? sen_rbuf_margin_size : 0);
1775   if (!(head = SEN_REALLOC(head, newsize))) { return sen_memory_exhausted; }
1776   buf->curr = head + sen_rbuf_margin_size + SEN_RBUF_VSIZE(buf);
1777   buf->head = head + sen_rbuf_margin_size;
1778   buf->tail = head + newsize;
1779   return sen_success;
1780 }
1781 
1782 sen_rc
sen_rbuf_reinit(sen_rbuf * buf,size_t size)1783 sen_rbuf_reinit(sen_rbuf *buf, size_t size)
1784 {
1785   SEN_RBUF_REWIND(buf);
1786   return sen_rbuf_resize(buf, size);
1787 }
1788 
1789 sen_rc
sen_rbuf_write(sen_rbuf * buf,const char * str,size_t len)1790 sen_rbuf_write(sen_rbuf *buf, const char *str, size_t len)
1791 {
1792   sen_rc rc = sen_success;
1793   if (SEN_RBUF_REST(buf) < len) {
1794     if ((rc = sen_rbuf_resize(buf, SEN_RBUF_VSIZE(buf) + len))) { return rc; }
1795   }
1796   memcpy(buf->curr, str, len);
1797   buf->curr += len;
1798   return rc;
1799 }
1800 
1801 sen_rc
sen_rbuf_reserve(sen_rbuf * buf,size_t len)1802 sen_rbuf_reserve(sen_rbuf *buf, size_t len)
1803 {
1804   sen_rc rc = sen_success;
1805   if (SEN_RBUF_REST(buf) < len) {
1806     if ((rc = sen_rbuf_resize(buf, SEN_RBUF_VSIZE(buf) + len))) { return rc; }
1807   }
1808   return rc;
1809 }
1810 
1811 sen_rc
sen_rbuf_space(sen_rbuf * buf,size_t len)1812 sen_rbuf_space(sen_rbuf *buf, size_t len)
1813 {
1814   sen_rc rc = sen_rbuf_reserve(buf, len);
1815   if (!rc) { buf->curr += len; }
1816   return rc;
1817 }
1818 
1819 sen_rc
sen_rbuf_itoa(sen_rbuf * buf,int i)1820 sen_rbuf_itoa(sen_rbuf *buf, int i)
1821 {
1822   sen_rc rc = sen_success;
1823   while (sen_str_itoa(i, buf->curr, buf->tail, &buf->curr)) {
1824     if ((rc = sen_rbuf_resize(buf, SEN_RBUF_WSIZE(buf) + UNIT_SIZE))) { return rc; }
1825   }
1826   return rc;
1827 }
1828 
1829 sen_rc
sen_rbuf_lltoa(sen_rbuf * buf,int64_t i)1830 sen_rbuf_lltoa(sen_rbuf *buf, int64_t i)
1831 {
1832   sen_rc rc = sen_success;
1833   while (sen_str_lltoa(i, buf->curr, buf->tail, &buf->curr)) {
1834     if ((rc = sen_rbuf_resize(buf, SEN_RBUF_WSIZE(buf) + UNIT_SIZE))) { return rc; }
1835   }
1836   return rc;
1837 }
1838 
1839 sen_rc
sen_rbuf_ftoa(sen_rbuf * buf,double d)1840 sen_rbuf_ftoa(sen_rbuf *buf, double d)
1841 {
1842   size_t len = 32;
1843   sen_rc rc = sen_success;
1844   if (SEN_RBUF_REST(buf) < len) {
1845     if ((rc = sen_rbuf_resize(buf, SEN_RBUF_VSIZE(buf) + len))) { return rc; }
1846   }
1847   switch (fpclassify(d)) {
1848   CASE_FP_NAN
1849     SEN_RBUF_PUTS(buf, "#<nan>");
1850     break;
1851   CASE_FP_INFINITE
1852     SEN_RBUF_PUTS(buf, d > 0 ? "#i1/0" : "#i-1/0");
1853     break;
1854   default :
1855     len = sprintf(buf->curr, "%#.15g", d);
1856     if (buf->curr[len - 1] == '.') {
1857       buf->curr += len;
1858       SEN_RBUF_PUTC(buf, '0');
1859     } else {
1860       char *p, *q;
1861       buf->curr[len] = '\0';
1862       if ((p = strchr(buf->curr, 'e'))) {
1863         for (q = p; *(q - 2) != '.' && *(q - 1) == '0'; q--) { len--; }
1864         memmove(q, p, buf->curr + len - q);
1865       } else {
1866         for (q = buf->curr + len; *(q - 2) != '.' && *(q - 1) == '0'; q--) { len--; }
1867       }
1868       buf->curr += len;
1869     }
1870     break;
1871   }
1872   return rc;
1873 }
1874 
1875 sen_rc
sen_rbuf_itoh(sen_rbuf * buf,int i)1876 sen_rbuf_itoh(sen_rbuf *buf, int i)
1877 {
1878   size_t len = 8;
1879   sen_rc rc = sen_success;
1880   if (SEN_RBUF_REST(buf) < len) {
1881     if ((rc = sen_rbuf_resize(buf, SEN_RBUF_VSIZE(buf) + len))) { return rc; }
1882   }
1883   sen_str_itoh(i, buf->curr, len);
1884   buf->curr += len;
1885   return rc;
1886 }
1887 
1888 sen_rc
sen_rbuf_itob(sen_rbuf * buf,sen_id id)1889 sen_rbuf_itob(sen_rbuf *buf, sen_id id)
1890 {
1891   size_t len = 5;
1892   sen_rc rc = sen_success;
1893   if (SEN_RBUF_REST(buf) < len) {
1894     if ((rc = sen_rbuf_resize(buf, SEN_RBUF_VSIZE(buf) + len))) { return rc; }
1895   }
1896   sen_str_itob(id, buf->curr);
1897   buf->curr += len;
1898   return rc;
1899 }
1900 
1901 sen_rc
sen_rbuf_lltob32h(sen_rbuf * buf,int64_t i)1902 sen_rbuf_lltob32h(sen_rbuf *buf, int64_t i)
1903 {
1904   size_t len = 13;
1905   sen_rc rc = sen_success;
1906   if (SEN_RBUF_REST(buf) < len) {
1907     if ((rc = sen_rbuf_resize(buf, SEN_RBUF_VSIZE(buf) + len))) { return rc; }
1908   }
1909   sen_str_lltob32h(i, buf->curr);
1910   buf->curr += len;
1911   return rc;
1912 }
1913 
1914 void
sen_rbuf_str_esc(sen_rbuf * buf,const char * s,int len,sen_encoding encoding)1915 sen_rbuf_str_esc(sen_rbuf *buf, const char *s, int len, sen_encoding encoding)
1916 {
1917   const char *e;
1918   unsigned int l;
1919   if (len < 0) { len = strlen(s); }
1920   SEN_RBUF_PUTC(buf, '"');
1921   for (e = s + len; s < e; s += l) {
1922     if (!(l = sen_str_charlen_nonnull(s, e, encoding))) { break; }
1923     if (l == 1) {
1924       switch (*s) {
1925       case '\t' :
1926         sen_rbuf_write(buf, "\\t", 2);
1927         break;
1928       case '\n' :
1929         sen_rbuf_write(buf, "\\n", 2);
1930         break;
1931       case '"' :
1932         sen_rbuf_write(buf, "\\\"", 2);
1933         break;
1934       case '\\' :
1935         sen_rbuf_write(buf, "\\\\", 2);
1936         break;
1937       default :
1938         SEN_RBUF_PUTC(buf, *s);
1939       }
1940     } else {
1941       sen_rbuf_write(buf, s, l);
1942     }
1943   }
1944   SEN_RBUF_PUTC(buf, '"');
1945 }
1946 
1947 sen_rc
sen_rbuf_fin(sen_rbuf * buf)1948 sen_rbuf_fin(sen_rbuf *buf)
1949 {
1950   sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
1951   if (buf->head) {
1952     SEN_REALLOC(buf->head - sen_rbuf_margin_size, 0);
1953     buf->head = NULL;
1954   }
1955   return sen_success;
1956 }
1957 
1958 struct _sen_lbuf_node {
1959   sen_lbuf_node *next;
1960   size_t size;
1961   char val[1];
1962 };
1963 
1964 sen_rc
sen_lbuf_init(sen_lbuf * buf)1965 sen_lbuf_init(sen_lbuf *buf)
1966 {
1967   buf->head = NULL;
1968   buf->tail = &buf->head;
1969   return sen_success;
1970 }
1971 
1972 void *
sen_lbuf_add(sen_lbuf * buf,size_t size)1973 sen_lbuf_add(sen_lbuf *buf, size_t size)
1974 {
1975   sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
1976   sen_lbuf_node *node = SEN_MALLOC(size + (size_t)(&((sen_lbuf_node *)0)->val));
1977   if (!node) { return NULL;  }
1978   node->next = NULL;
1979   node->size = size;
1980   *buf->tail = node;
1981   buf->tail = &node->next;
1982   return node->val;
1983 }
1984 
1985 sen_rc
sen_lbuf_fin(sen_lbuf * buf)1986 sen_lbuf_fin(sen_lbuf *buf)
1987 {
1988   sen_ctx *ctx = &sen_gctx; /* todo : replace it with the local ctx */
1989   sen_lbuf_node *cur, *next;
1990   for (cur = buf->head; cur; cur = next) {
1991     next = cur->next;
1992     SEN_FREE(cur);
1993   }
1994   return sen_success;
1995 }
1996 
1997 sen_rc
sen_substring(char ** str,char ** str_end,int start,int end,sen_encoding encoding)1998 sen_substring(char **str, char **str_end, int start, int end, sen_encoding encoding)
1999 {
2000   int i;
2001   size_t l;
2002   char *s = *str, *e = *str_end;
2003   for (i = 0; s < e; i++, s += l) {
2004     if (i == start) { *str = s; }
2005     if (!(l = sen_str_charlen_nonnull(s, e, encoding))) {
2006       return sen_invalid_argument;
2007     }
2008     if (i == end) {
2009       *str_end = s;
2010       break;
2011     }
2012   }
2013   return sen_success;
2014 }
2015 
2016 int
sen_str_normalize(const char * str,unsigned int str_len,sen_encoding encoding,int flags,char * nstrbuf,int buf_size)2017 sen_str_normalize(const char *str, unsigned int str_len,
2018                   sen_encoding encoding, int flags,
2019                   char *nstrbuf, int buf_size)
2020 {
2021   int len;
2022   sen_nstr *nstr;
2023   if (!(nstr = sen_nstr_open(str, str_len, encoding, flags))) {
2024     return -1;
2025   }
2026   /* if the buffer size is short to store for the normalized string,
2027      the required size is returned
2028      (to inform the caller to cast me again). */
2029   len = (int)nstr->norm_blen;
2030   if (buf_size > len) {
2031     memcpy(nstrbuf, nstr->norm, len + 1);
2032   } else if (buf_size == len) {
2033     /* NB: non-NULL-terminated */
2034     memcpy(nstrbuf, nstr->norm, len);
2035   }
2036   sen_nstr_close(nstr);
2037   return len;
2038 }
2039