1 /* -*- c-basic-offset: 2 -*- */
2 /* Copyright(C) 2009-2014 Brazil
3 
4   This library is free software; you can redistribute it and/or
5   modify it under the terms of the GNU Lesser General Public
6   License version 2.1 as published by the Free Software Foundation.
7 
8   This library is distributed in the hope that it will be useful,
9   but WITHOUT ANY WARRANTY; without even the implied warranty of
10   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11   Lesser General Public License for more details.
12 
13   You should have received a copy of the GNU Lesser General Public
14   License along with this library; if not, write to the Free Software
15   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1335  USA
16 */
17 #include "grn.h"
18 #include <string.h>
19 #include <stddef.h>
20 #include "grn_snip.h"
21 #include "grn_ctx.h"
22 
/* Fallback two-argument maximum/minimum helpers, defined only when the
   including headers did not already provide them.  Each argument may be
   evaluated twice, so do not pass expressions with side effects. */
#ifndef MAX
# define MAX(a, b) ((a) > (b) ? (a) : (b))
#endif /* MAX */

#ifndef MIN
# define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif /* MIN */
30 
/*
 * Parity check used to test a byte offset inside an EUC-JP buffer.
 *
 * Counts the run of bytes >= 0x80 that ends immediately before x[y]
 * (i.e. x[y-1], x[y-2], ...), never looking before x[0].
 *
 * Returns 1 when that run has an even length (including an empty run or
 * y == 0) and 0 when it has an odd length.  grn_snip_find_firstbyte() keeps
 * moving its offset while this returns 0.
 *
 * The scan is index based so that no pointer before the start of `x` is
 * ever formed, even when every inspected byte is >= 0x80.
 */
static int
grn_bm_check_euc(const unsigned char *x, const size_t y)
{
  size_t high_run = 0;

  while (high_run < y && x[y - 1 - high_run] >= 0x80U) {
    high_run++;
  }
  /* Same value as the historical (x + y - p) & 1 with p one byte before
     the run. */
  return (int) ((high_run + 1) & 1);
}
38 
/*
 * Parity check used to test a byte offset inside a Shift_JIS buffer.
 *
 * Counts the run of bytes in the ranges 0x81..0x9f or 0xe0..0xfc that ends
 * immediately before x[y], never looking before x[0].
 *
 * Returns 1 when that run has an even length (including an empty run or
 * y == 0) and 0 when it has an odd length.
 *
 * The scan is index based so that no pointer before the start of `x` is
 * ever formed, even when every inspected byte falls in those ranges.
 */
static int
grn_bm_check_sjis(const unsigned char *x, const size_t y)
{
  size_t lead_run = 0;

  while (lead_run < y) {
    const unsigned char c = x[y - 1 - lead_run];
    const int in_lead_range =
      (c >= 0x81U && c <= 0x9fU) || (c >= 0xe0U && c <= 0xfcU);
    if (!in_lead_range) {
      break;
    }
    lead_run++;
  }
  /* Same value as the historical (x + y - p) & 1 with p one byte before
     the run. */
  return (int) ((lead_run + 1) & 1);
}
48 
49 /*
50 static void
51 grn_bm_suffixes(const unsigned char *x, size_t m, size_t *suff)
52 {
53   size_t f, g;
54   intptr_t i;
55   f = 0;
56   suff[m - 1] = m;
57   g = m - 1;
58   for (i = m - 2; i >= 0; --i) {
59     if (i > (intptr_t) g && suff[i + m - 1 - f] < i - g)
60       suff[i] = suff[i + m - 1 - f];
61     else {
62       if (i < (intptr_t) g)
63         g = i;
64       f = i;
65       while (g > 0 && x[g] == x[g + m - 1 - f])
66         --g;
67       suff[i] = f - g;
68     }
69   }
70 }
71 */
72 
73 static void
grn_bm_preBmBc(const unsigned char * x,size_t m,size_t * bmBc)74 grn_bm_preBmBc(const unsigned char *x, size_t m, size_t *bmBc)
75 {
76   size_t i;
77   for (i = 0; i < ASIZE; ++i) {
78     bmBc[i] = m;
79   }
80   for (i = 0; i < m - 1; ++i) {
81     bmBc[(unsigned int) x[i]] = m - (i + 1);
82   }
83 }
84 
/*
 * Record a match found at normalized position `found` and return from the
 * enclosing function.  Expands inside grn_bm_tunedbm() and relies on its
 * locals: cond, flags, found, shift, m, i, string_checks, string_original,
 * string_original_length_in_bytes and string_encoding.
 *
 * Nothing happens (and the enclosing function keeps running) when
 * string_checks[found] is zero.  Otherwise the positive string_checks
 * entries are summed to turn normalized positions into offsets in the
 * original string (presumably each positive entry is the original byte
 * length of one character -- TODO confirm against grn_string), and
 * cond->start_offset, end_offset, last_offset, last_found, found and
 * found_alpha_head are updated.
 */
#define GRN_BM_COMPARE do { \
  if (string_checks[found]) { \
    size_t real_offset = cond->last_offset; \
    size_t alpha_head = cond->found_alpha_head; \
    /* accumulate the original-string offset up to the match */ \
    for (i = cond->last_found; i < found; i++) { \
      if (string_checks[i] > 0) { \
        alpha_head = i; \
        real_offset += string_checks[i]; \
      } \
    } \
    /* a negative check means the match starts inside a character: */ \
    /* step back to the head of that character */ \
    if (string_checks[found] < 0) { \
      real_offset -= string_checks[alpha_head]; \
      cond->last_found = alpha_head; \
    } else { \
      cond->last_found = found; \
    } \
    cond->last_offset = real_offset; \
    cond->start_offset = real_offset; \
    if (flags & GRN_SNIP_SKIP_LEADING_SPACES) { \
      while (cond->start_offset < string_original_length_in_bytes && \
             (i = grn_isspace(string_original + cond->start_offset, \
                              string_encoding))) { \
        cond->start_offset += i; \
      } \
    } \
    /* extend the offset over the matched keyword to get the end */ \
    for (i = cond->last_found; i < found + m; i++) { \
      if (string_checks[i] > 0) { \
        real_offset += string_checks[i]; \
      } \
    } \
    cond->end_offset = real_offset; \
    cond->found = found + shift; \
    cond->found_alpha_head = alpha_head; \
    return; \
  } \
} while (0)
120 
/*
 * Verify a candidate whose last byte already matched (p points one past
 * it).  Expands inside grn_bm_tunedbm() and relies on its locals p, cp, ck,
 * i, m, found and y.
 *
 * The second-to-last byte is compared against the cached `ck`, then the
 * remaining bytes are compared walking backwards.  On a full match `found`
 * is set to the match start and GRN_BM_COMPARE is invoked, which may return
 * from the enclosing function.
 */
#define GRN_BM_BM_COMPARE do { \
  if (p[-2] == ck) { \
    i = 3; \
    while (i <= m && p[-(intptr_t)i] == cp[-(intptr_t)i]) { \
      ++i; \
    } \
    if (i > m) { \
      found = p - y - m; \
      GRN_BM_COMPARE; \
    } \
  } \
} while (0)
131 
132 void
grn_bm_tunedbm(grn_ctx * ctx,snip_cond * cond,grn_obj * string,int flags)133 grn_bm_tunedbm(grn_ctx *ctx, snip_cond *cond, grn_obj *string, int flags)
134 {
135   register unsigned char *limit, ck;
136   register const unsigned char *p, *cp;
137   register size_t *bmBc, delta1, i;
138 
139   const unsigned char *x;
140   unsigned char *y;
141   size_t shift, found;
142 
143   const char *string_original;
144   unsigned int string_original_length_in_bytes;
145   const short *string_checks;
146   grn_encoding string_encoding;
147   const char *string_norm, *keyword_norm;
148   unsigned int n, m;
149 
150   grn_string_get_original(ctx, string,
151                           &string_original, &string_original_length_in_bytes);
152   string_checks = grn_string_get_checks(ctx, string);
153   string_encoding = grn_string_get_encoding(ctx, string);
154   grn_string_get_normalized(ctx, string, &string_norm, &n, NULL);
155   grn_string_get_normalized(ctx, cond->keyword, &keyword_norm, &m, NULL);
156 
157   y = (unsigned char *)string_norm;
158   if (m == 1) {
159     if (n > cond->found) {
160       shift = 1;
161       p = memchr(y + cond->found, keyword_norm[0], n - cond->found);
162       if (p != NULL) {
163         found = p - y;
164         GRN_BM_COMPARE;
165       }
166     }
167     cond->stopflag = SNIPCOND_STOP;
168     return;
169   }
170 
171   x = (unsigned char *)keyword_norm;
172   bmBc = cond->bmBc;
173   shift = cond->shift;
174 
175   /* Restart */
176   p = y + m + cond->found;
177   cp = x + m;
178   ck = cp[-2];
179 
180   /* 12 means 1(initial offset) + 10 (in loop) + 1 (shift) */
181   if (n - cond->found > 12 * m) {
182     limit = y + n - 11 * m;
183     while (p <= limit) {
184       p += bmBc[p[-1]];
185       if(!(delta1 = bmBc[p[-1]])) {
186         goto check;
187       }
188       p += delta1;
189       p += bmBc[p[-1]];
190       p += bmBc[p[-1]];
191       if(!(delta1 = bmBc[p[-1]])) {
192         goto check;
193       }
194       p += delta1;
195       p += bmBc[p[-1]];
196       p += bmBc[p[-1]];
197       if(!(delta1 = bmBc[p[-1]])) {
198         goto check;
199       }
200       p += delta1;
201       p += bmBc[p[-1]];
202       p += bmBc[p[-1]];
203       continue;
204     check:
205       GRN_BM_BM_COMPARE;
206       p += shift;
207     }
208   }
209   /* limit check + search */
210   limit = y + n;
211   while(p <= limit) {
212     if (!(delta1 = bmBc[p[-1]])) {
213       GRN_BM_BM_COMPARE;
214       p += shift;
215     }
216     p += delta1;
217   }
218   cond->stopflag = SNIPCOND_STOP;
219 }
220 
/*
 * Return the number of bytes the range [str, end) occupies once '<', '>',
 * '&' and '"' are replaced by their HTML entities; every other byte counts
 * as one.
 */
static size_t
count_mapped_chars(const char *str, const char *end)
{
  size_t mapped_len = 0;
  const char *cursor = str;

  while (cursor != end) {
    const char c = *cursor++;
    if (c == '<' || c == '>') {
      mapped_len += 4;          /* &lt; or &gt; */
    } else if (c == '&') {
      mapped_len += 5;          /* &amp; */
    } else if (c == '"') {
      mapped_len += 6;          /* &quot; */
    } else {
      mapped_len += 1;
    }
  }
  return mapped_len;
}
247 
248 grn_rc
grn_snip_cond_close(grn_ctx * ctx,snip_cond * cond)249 grn_snip_cond_close(grn_ctx *ctx, snip_cond *cond)
250 {
251   if (!cond) {
252     return GRN_INVALID_ARGUMENT;
253   }
254   if (cond->keyword) {
255     grn_obj_close(ctx, cond->keyword);
256   }
257   return GRN_SUCCESS;
258 }
259 
260 grn_rc
grn_snip_cond_init(grn_ctx * ctx,snip_cond * sc,const char * keyword,unsigned int keyword_len,grn_encoding enc,grn_obj * normalizer,int flags)261 grn_snip_cond_init(grn_ctx *ctx, snip_cond *sc, const char *keyword, unsigned int keyword_len,
262                    grn_encoding enc, grn_obj *normalizer, int flags)
263 {
264   const char *norm;
265   unsigned int norm_blen;
266   int f = GRN_STR_REMOVEBLANK;
267   memset(sc, 0, sizeof(snip_cond));
268   if (!(sc->keyword = grn_string_open(ctx, keyword, keyword_len,
269                                       normalizer, f))) {
270     GRN_LOG(ctx, GRN_LOG_ALERT,
271             "grn_string_open on snip_cond_init failed!");
272     return GRN_NO_MEMORY_AVAILABLE;
273   }
274   grn_string_get_normalized(ctx, sc->keyword, &norm, &norm_blen, NULL);
275   if (!norm_blen) {
276     grn_snip_cond_close(ctx, sc);
277     return GRN_INVALID_ARGUMENT;
278   }
279   if (norm_blen != 1) {
280     grn_bm_preBmBc((unsigned char *)norm, norm_blen, sc->bmBc);
281     sc->shift = sc->bmBc[(unsigned char)norm[norm_blen - 1]];
282     sc->bmBc[(unsigned char)norm[norm_blen - 1]] = 0;
283   }
284   return GRN_SUCCESS;
285 }
286 
287 void
grn_snip_cond_reinit(snip_cond * cond)288 grn_snip_cond_reinit(snip_cond *cond)
289 {
290   cond->found = 0;
291   cond->last_found = 0;
292   cond->last_offset = 0;
293   cond->start_offset = 0;
294   cond->end_offset = 0;
295 
296   cond->count = 0;
297   cond->stopflag = SNIPCOND_NONSTOP;
298 }
299 
300 inline static char *
grn_snip_strndup(grn_ctx * ctx,const char * string,unsigned int string_len)301 grn_snip_strndup(grn_ctx *ctx, const char *string, unsigned int string_len)
302 {
303    char *copied_string;
304 
305    copied_string = GRN_MALLOC(string_len + 1);
306    if (!copied_string) {
307      return NULL;
308    }
309    grn_memcpy(copied_string, string, string_len);
310    copied_string[string_len]= '\0'; /* not required, but for ql use */
311    return copied_string;
312 }
313 
314 inline static grn_rc
grn_snip_cond_set_tag(grn_ctx * ctx,const char ** dest_tag,size_t * dest_tag_len,const char * tag,unsigned int tag_len,const char * default_tag,unsigned int default_tag_len,int copy_tag)315 grn_snip_cond_set_tag(grn_ctx *ctx,
316                       const char **dest_tag, size_t *dest_tag_len,
317                       const char *tag, unsigned int tag_len,
318                       const char *default_tag, unsigned int default_tag_len,
319                       int copy_tag)
320 {
321   if (tag) {
322     if (copy_tag) {
323       char *copied_tag;
324       copied_tag = grn_snip_strndup(ctx, tag, tag_len);
325       if (!copied_tag) {
326         return GRN_NO_MEMORY_AVAILABLE;
327       }
328       *dest_tag = copied_tag;
329     } else {
330       *dest_tag = tag;
331     }
332     *dest_tag_len = tag_len;
333   } else {
334     *dest_tag = default_tag;
335     *dest_tag_len = default_tag_len;
336   }
337   return GRN_SUCCESS;
338 }
339 
340 grn_rc
grn_snip_set_normalizer(grn_ctx * ctx,grn_obj * snip,grn_obj * normalizer)341 grn_snip_set_normalizer(grn_ctx *ctx, grn_obj *snip,
342                         grn_obj *normalizer)
343 {
344   grn_snip *snip_;
345   if (!snip) {
346     return GRN_INVALID_ARGUMENT;
347   }
348 
349   snip_ = (grn_snip *)snip;
350   snip_->normalizer = normalizer;
351   return GRN_SUCCESS;
352 }
353 
354 grn_obj *
grn_snip_get_normalizer(grn_ctx * ctx,grn_obj * snip)355 grn_snip_get_normalizer(grn_ctx *ctx, grn_obj *snip)
356 {
357   grn_snip *snip_;
358 
359   if (!snip) {
360     return NULL;
361   }
362 
363   snip_ = (grn_snip *)snip;
364   return snip_->normalizer;
365 }
366 
367 grn_rc
grn_snip_add_cond(grn_ctx * ctx,grn_obj * snip,const char * keyword,unsigned int keyword_len,const char * opentag,unsigned int opentag_len,const char * closetag,unsigned int closetag_len)368 grn_snip_add_cond(grn_ctx *ctx, grn_obj *snip,
369                   const char *keyword, unsigned int keyword_len,
370                   const char *opentag, unsigned int opentag_len,
371                   const char *closetag, unsigned int closetag_len)
372 {
373   grn_rc rc;
374   int copy_tag;
375   snip_cond *cond;
376   unsigned int norm_blen;
377   grn_snip *snip_;
378 
379   snip_ = (grn_snip *)snip;
380   if (!snip_ || !keyword || !keyword_len || snip_->cond_len >= MAX_SNIP_COND_COUNT) {
381     return GRN_INVALID_ARGUMENT;
382   }
383 
384   cond = snip_->cond + snip_->cond_len;
385   if ((rc = grn_snip_cond_init(ctx, cond, keyword, keyword_len,
386                                snip_->encoding, snip_->normalizer, snip_->flags))) {
387     return rc;
388   }
389   grn_string_get_normalized(ctx, cond->keyword, NULL, &norm_blen, NULL);
390   if (norm_blen > snip_->width) {
391     grn_snip_cond_close(ctx, cond);
392     return GRN_INVALID_ARGUMENT;
393   }
394 
395   copy_tag = snip_->flags & GRN_SNIP_COPY_TAG;
396   rc = grn_snip_cond_set_tag(ctx,
397                              &(cond->opentag), &(cond->opentag_len),
398                              opentag, opentag_len,
399                              snip_->defaultopentag, snip_->defaultopentag_len,
400                              copy_tag);
401   if (rc) {
402     grn_snip_cond_close(ctx, cond);
403     return rc;
404   }
405 
406   rc = grn_snip_cond_set_tag(ctx,
407                              &(cond->closetag), &(cond->closetag_len),
408                              closetag, closetag_len,
409                              snip_->defaultclosetag, snip_->defaultclosetag_len,
410                              copy_tag);
411   if (rc) {
412     if (opentag && copy_tag) {
413       GRN_FREE((void *)cond->opentag);
414     }
415     grn_snip_cond_close(ctx, cond);
416     return rc;
417   }
418 
419   snip_->cond_len++;
420   return GRN_SUCCESS;
421 }
422 
423 static size_t
grn_snip_find_firstbyte(const char * string,grn_encoding encoding,size_t offset,size_t doffset)424 grn_snip_find_firstbyte(const char *string, grn_encoding encoding, size_t offset,
425                         size_t doffset)
426 {
427   switch (encoding) {
428   case GRN_ENC_EUC_JP:
429     while (!(grn_bm_check_euc((unsigned char *) string, offset)))
430       offset += doffset;
431     break;
432   case GRN_ENC_SJIS:
433     if (!(grn_bm_check_sjis((unsigned char *) string, offset)))
434       offset += doffset;
435     break;
436   case GRN_ENC_UTF8:
437     while ((signed char)string[offset] <= (signed char)0xc0)
438       offset += doffset;
439     break;
440   default:
441     break;
442   }
443   return offset;
444 }
445 
446 inline static grn_rc
grn_snip_set_default_tag(grn_ctx * ctx,const char ** dest_tag,size_t * dest_tag_len,const char * tag,unsigned int tag_len,int copy_tag)447 grn_snip_set_default_tag(grn_ctx *ctx,
448                          const char **dest_tag, size_t *dest_tag_len,
449                          const char *tag, unsigned int tag_len,
450                          int copy_tag)
451 {
452   if (copy_tag && tag) {
453     char *copied_tag;
454     copied_tag = grn_snip_strndup(ctx, tag, tag_len);
455     if (!copied_tag) {
456       return GRN_NO_MEMORY_AVAILABLE;
457     }
458     *dest_tag = copied_tag;
459   } else {
460     *dest_tag = tag;
461   }
462   *dest_tag_len = tag_len;
463   return GRN_SUCCESS;
464 }
465 
466 grn_obj *
grn_snip_open(grn_ctx * ctx,int flags,unsigned int width,unsigned int max_results,const char * defaultopentag,unsigned int defaultopentag_len,const char * defaultclosetag,unsigned int defaultclosetag_len,grn_snip_mapping * mapping)467 grn_snip_open(grn_ctx *ctx, int flags, unsigned int width,
468               unsigned int max_results,
469               const char *defaultopentag, unsigned int defaultopentag_len,
470               const char *defaultclosetag, unsigned int defaultclosetag_len,
471               grn_snip_mapping *mapping)
472 {
473   int copy_tag;
474   grn_snip *ret = NULL;
475   if (!(ret = GRN_MALLOC(sizeof(grn_snip)))) {
476     GRN_LOG(ctx, GRN_LOG_ALERT, "grn_snip allocation failed on grn_snip_open");
477     return NULL;
478   }
479   if (max_results > MAX_SNIP_RESULT_COUNT || max_results == 0) {
480     GRN_LOG(ctx, GRN_LOG_WARNING, "max_results is invalid on grn_snip_open");
481     GRN_FREE(ret);
482     return NULL;
483   }
484   GRN_API_ENTER;
485   ret->encoding = ctx->encoding;
486   ret->flags = flags;
487   ret->width = width;
488   ret->max_results = max_results;
489   ret->defaultopentag = NULL;
490   ret->defaultclosetag = NULL;
491 
492   copy_tag = flags & GRN_SNIP_COPY_TAG;
493   if (grn_snip_set_default_tag(ctx,
494                                &(ret->defaultopentag),
495                                &(ret->defaultopentag_len),
496                                defaultopentag, defaultopentag_len,
497                                copy_tag)) {
498     GRN_FREE(ret);
499     GRN_API_RETURN(NULL);
500   }
501 
502   if (grn_snip_set_default_tag(ctx,
503                                &(ret->defaultclosetag),
504                                &(ret->defaultclosetag_len),
505                                defaultclosetag, defaultclosetag_len,
506                                copy_tag)) {
507     if (copy_tag && ret->defaultopentag) {
508       GRN_FREE((void *)ret->defaultopentag);
509     }
510     GRN_FREE(ret);
511     GRN_API_RETURN(NULL);
512   }
513 
514   ret->cond_len = 0;
515   ret->mapping = mapping;
516   ret->nstr = NULL;
517   ret->tag_count = 0;
518   ret->snip_count = 0;
519   if (ret->flags & GRN_SNIP_NORMALIZE) {
520     ret->normalizer = GRN_NORMALIZER_AUTO;
521   } else {
522     ret->normalizer = NULL;
523   }
524 
525   GRN_DB_OBJ_SET_TYPE(ret, GRN_SNIP);
526   {
527     grn_obj *db;
528     grn_id id;
529     db = grn_ctx_db(ctx);
530     id = grn_obj_register(ctx, db, NULL, 0);
531     DB_OBJ(ret)->header.domain = GRN_ID_NIL;
532     DB_OBJ(ret)->range = GRN_ID_NIL;
533     grn_db_obj_init(ctx, db, id, DB_OBJ(ret));
534   }
535 
536   GRN_API_RETURN((grn_obj *)ret);
537 }
538 
539 static grn_rc
exec_clean(grn_ctx * ctx,grn_snip * snip)540 exec_clean(grn_ctx *ctx, grn_snip *snip)
541 {
542   snip_cond *cond, *cond_end;
543   if (snip->nstr) {
544     grn_obj_close(ctx, snip->nstr);
545     snip->nstr = NULL;
546   }
547   snip->tag_count = 0;
548   snip->snip_count = 0;
549   for (cond = snip->cond, cond_end = cond + snip->cond_len;
550        cond < cond_end; cond++) {
551     grn_snip_cond_reinit(cond);
552   }
553   return GRN_SUCCESS;
554 }
555 
556 grn_rc
grn_snip_close(grn_ctx * ctx,grn_snip * snip)557 grn_snip_close(grn_ctx *ctx, grn_snip *snip)
558 {
559   snip_cond *cond, *cond_end;
560   if (!snip) { return GRN_INVALID_ARGUMENT; }
561   GRN_API_ENTER;
562   if (snip->flags & GRN_SNIP_COPY_TAG) {
563     int i;
564     snip_cond *sc;
565     const char *dot = snip->defaultopentag, *dct = snip->defaultclosetag;
566     for (i = snip->cond_len, sc = snip->cond; i; i--, sc++) {
567       if (sc->opentag != dot) { GRN_FREE((void *)sc->opentag); }
568       if (sc->closetag != dct) { GRN_FREE((void *)sc->closetag); }
569     }
570     if (dot) { GRN_FREE((void *)dot); }
571     if (dct) { GRN_FREE((void *)dct); }
572   }
573   if (snip->nstr) {
574     grn_obj_close(ctx, snip->nstr);
575   }
576   for (cond = snip->cond, cond_end = cond + snip->cond_len;
577        cond < cond_end; cond++) {
578     grn_snip_cond_close(ctx, cond);
579   }
580   GRN_FREE(snip);
581   GRN_API_RETURN(GRN_SUCCESS);
582 }
583 
584 grn_rc
grn_snip_exec(grn_ctx * ctx,grn_obj * snip,const char * string,unsigned int string_len,unsigned int * nresults,unsigned int * max_tagged_len)585 grn_snip_exec(grn_ctx *ctx, grn_obj *snip, const char *string, unsigned int string_len,
586               unsigned int *nresults, unsigned int *max_tagged_len)
587 {
588   size_t i;
589   grn_snip *snip_;
590   int f = GRN_STR_WITH_CHECKS|GRN_STR_REMOVEBLANK;
591   if (!snip || !string || !nresults || !max_tagged_len) {
592     return GRN_INVALID_ARGUMENT;
593   }
594   GRN_API_ENTER;
595   snip_ = (grn_snip *)snip;
596   exec_clean(ctx, snip_);
597   *nresults = 0;
598   snip_->nstr = grn_string_open(ctx, string, string_len, snip_->normalizer, f);
599   if (!snip_->nstr) {
600     exec_clean(ctx, snip_);
601     GRN_LOG(ctx, GRN_LOG_ALERT, "grn_string_open on grn_snip_exec failed !");
602     GRN_API_RETURN(ctx->rc);
603   }
604   for (i = 0; i < snip_->cond_len; i++) {
605     grn_bm_tunedbm(ctx, snip_->cond + i, snip_->nstr, snip_->flags);
606   }
607 
608   {
609     _snip_tag_result *tag_result = snip_->tag_result;
610     _snip_result *snip_result = snip_->snip_result;
611     size_t last_end_offset = 0, last_last_end_offset = 0;
612     unsigned int unfound_cond_count = snip_->cond_len;
613 
614     *max_tagged_len = 0;
615     while (1) {
616       size_t tagged_len = 0, last_tag_end = 0;
617       int_least8_t all_stop = 1, found_cond = 0;
618       snip_result->tag_count = 0;
619 
620       while (1) {
621         size_t min_start_offset = (size_t) -1;
622         size_t max_end_offset = 0;
623         snip_cond *cond = NULL;
624 
625         /* get condition which have minimum offset and is not stopped */
626         for (i = 0; i < snip_->cond_len; i++) {
627           if (snip_->cond[i].stopflag == SNIPCOND_NONSTOP &&
628               (min_start_offset > snip_->cond[i].start_offset ||
629                (min_start_offset == snip_->cond[i].start_offset &&
630                 max_end_offset < snip_->cond[i].end_offset))) {
631             min_start_offset = snip_->cond[i].start_offset;
632             max_end_offset = snip_->cond[i].end_offset;
633             cond = &snip_->cond[i];
634           }
635         }
636         if (!cond) {
637           break;
638         }
639         /* check whether condtion is the first condition in snippet */
640         if (snip_result->tag_count == 0) {
641           /* skip condition if the number of rest snippet field is smaller than */
642           /* the number of unfound keywords. */
643           if (snip_->max_results - *nresults <= unfound_cond_count && cond->count > 0) {
644             int_least8_t exclude_other_cond = 1;
645             for (i = 0; i < snip_->cond_len; i++) {
646               if ((snip_->cond + i) != cond
647                   && snip_->cond[i].end_offset <= cond->start_offset + snip_->width
648                   && snip_->cond[i].count == 0) {
649                 exclude_other_cond = 0;
650               }
651             }
652             if (exclude_other_cond) {
653               grn_bm_tunedbm(ctx, cond, snip_->nstr, snip_->flags);
654               continue;
655             }
656           }
657           snip_result->start_offset = cond->start_offset;
658           snip_result->first_tag_result_idx = snip_->tag_count;
659         } else {
660           if (cond->start_offset >= snip_result->start_offset + snip_->width) {
661             break;
662           }
663           /* check nesting to make valid HTML */
664           /* ToDo: allow <test><te>te</te><st>st</st></test> */
665           if (cond->start_offset < last_tag_end) {
666             grn_bm_tunedbm(ctx, cond, snip_->nstr, snip_->flags);
667             continue;
668           }
669         }
670         if (cond->end_offset > snip_result->start_offset + snip_->width) {
671           /* If a keyword gets across a snippet, */
672           /* it was skipped and never to be tagged. */
673           cond->stopflag = SNIPCOND_ACROSS;
674           grn_bm_tunedbm(ctx, cond, snip_->nstr, snip_->flags);
675         } else {
676           found_cond = 1;
677           if (cond->count == 0) {
678             unfound_cond_count--;
679           }
680           cond->count++;
681           last_end_offset = cond->end_offset;
682 
683           tag_result->cond = cond;
684           tag_result->start_offset = cond->start_offset;
685           tag_result->end_offset = last_tag_end = cond->end_offset;
686 
687           snip_result->tag_count++;
688           tag_result++;
689           tagged_len += cond->opentag_len + cond->closetag_len;
690           if (++snip_->tag_count >= MAX_SNIP_TAG_COUNT) {
691             break;
692           }
693           grn_bm_tunedbm(ctx, cond, snip_->nstr, snip_->flags);
694         }
695       }
696       if (!found_cond) {
697         break;
698       }
699       if (snip_result->start_offset + last_end_offset < snip_->width) {
700         snip_result->start_offset = 0;
701       } else {
702         snip_result->start_offset =
703           MAX(MIN
704               ((snip_result->start_offset + last_end_offset - snip_->width) / 2,
705                string_len - snip_->width), last_last_end_offset);
706       }
707       snip_result->start_offset =
708         grn_snip_find_firstbyte(string, snip_->encoding, snip_result->start_offset, 1);
709 
710       snip_result->end_offset = snip_result->start_offset + snip_->width;
711       if (snip_result->end_offset < string_len) {
712         snip_result->end_offset =
713           grn_snip_find_firstbyte(string, snip_->encoding, snip_result->end_offset, -1);
714       } else {
715         snip_result->end_offset = string_len;
716       }
717       last_last_end_offset = snip_result->end_offset;
718 
719       if (snip_->mapping == (grn_snip_mapping *) -1) {
720         tagged_len +=
721           count_mapped_chars(&string[snip_result->start_offset],
722                              &string[snip_result->end_offset]) + 1;
723       } else {
724         tagged_len += snip_result->end_offset - snip_result->start_offset + 1;
725       }
726 
727       *max_tagged_len = MAX(*max_tagged_len, tagged_len);
728 
729       snip_result->last_tag_result_idx = snip_->tag_count - 1;
730       (*nresults)++;
731       snip_result++;
732 
733       if (*nresults == snip_->max_results || snip_->tag_count == MAX_SNIP_TAG_COUNT) {
734         break;
735       }
736       for (i = 0; i < snip_->cond_len; i++) {
737         if (snip_->cond[i].stopflag != SNIPCOND_STOP) {
738           all_stop = 0;
739           snip_->cond[i].stopflag = SNIPCOND_NONSTOP;
740         }
741       }
742       if (all_stop) {
743         break;
744       }
745     }
746   }
747   snip_->snip_count = *nresults;
748   snip_->string = string;
749 
750   snip_->max_tagged_len = *max_tagged_len;
751 
752   GRN_API_RETURN(ctx->rc);
753 }
754 
755 grn_rc
grn_snip_get_result(grn_ctx * ctx,grn_obj * snip,const unsigned int index,char * result,unsigned int * result_len)756 grn_snip_get_result(grn_ctx *ctx, grn_obj *snip, const unsigned int index, char *result, unsigned int *result_len)
757 {
758   char *p;
759   size_t i, j, k;
760   _snip_result *sres;
761   grn_snip *snip_;
762 
763   snip_ = (grn_snip *)snip;
764   if (snip_->snip_count <= index || !snip_->nstr) {
765     return GRN_INVALID_ARGUMENT;
766   }
767 
768   GRN_ASSERT(snip_->snip_count != 0 && snip_->tag_count != 0);
769 
770   GRN_API_ENTER;
771   sres = &snip_->snip_result[index];
772   j = sres->first_tag_result_idx;
773   for (p = result, i = sres->start_offset; i < sres->end_offset; i++) {
774     for (; j <= sres->last_tag_result_idx && snip_->tag_result[j].start_offset == i; j++) {
775       if (snip_->tag_result[j].end_offset > sres->end_offset) {
776         continue;
777       }
778       grn_memcpy(p,
779                  snip_->tag_result[j].cond->opentag,
780                  snip_->tag_result[j].cond->opentag_len);
781       p += snip_->tag_result[j].cond->opentag_len;
782     }
783 
784     if (snip_->mapping == GRN_SNIP_MAPPING_HTML_ESCAPE) {
785       switch (snip_->string[i]) {
786       case '<':
787         *p++ = '&';
788         *p++ = 'l';
789         *p++ = 't';
790         *p++ = ';';
791         break;
792       case '>':
793         *p++ = '&';
794         *p++ = 'g';
795         *p++ = 't';
796         *p++ = ';';
797         break;
798       case '&':
799         *p++ = '&';
800         *p++ = 'a';
801         *p++ = 'm';
802         *p++ = 'p';
803         *p++ = ';';
804         break;
805       case '"':
806         *p++ = '&';
807         *p++ = 'q';
808         *p++ = 'u';
809         *p++ = 'o';
810         *p++ = 't';
811         *p++ = ';';
812         break;
813       default:
814         *p++ = snip_->string[i];
815         break;
816       }
817     } else {
818       *p++ = snip_->string[i];
819     }
820 
821     for (k = sres->last_tag_result_idx;
822          snip_->tag_result[k].end_offset <= sres->end_offset; k--) {
823       /* TODO: avoid all loop */
824       if (snip_->tag_result[k].end_offset == i + 1) {
825         grn_memcpy(p,
826                    snip_->tag_result[k].cond->closetag,
827                    snip_->tag_result[k].cond->closetag_len);
828         p += snip_->tag_result[k].cond->closetag_len;
829       }
830       if (k <= sres->first_tag_result_idx) {
831         break;
832       }
833     };
834   }
835   *p = '\0';
836 
837   if(result_len) { *result_len = (unsigned int)(p - result); }
838   GRN_ASSERT((unsigned int)(p - result) <= snip_->max_tagged_len);
839 
840   GRN_API_RETURN(ctx->rc);
841 }
842