1 #include "forward_index.h"
2 #include "stopwords.h"
3 #include "tokenize.h"
4 #include "toksep.h"
5 #include "rmalloc.h"
6 #include <ctype.h>
7 #include <stdlib.h>
8 #include <strings.h>
9 #include "phonetic_manager.h"
10 
11 typedef struct {
12   RSTokenizer base;
13   char **pos;
14   Stemmer *stemmer;
15 } simpleTokenizer;
16 
// Points the tokenizer at a new text buffer.
// - text/len: the buffer to tokenize and its length
// - options: TOKENIZE_* flags consulted by Next()
static void simpleTokenizer_Start(RSTokenizer *base, char *text, size_t len, uint32_t options) {
  simpleTokenizer *tokenizer = (simpleTokenizer *)base;

  base->ctx.text = text;
  base->ctx.len = len;
  base->ctx.options = options;

  // Next() walks the input through this pointer-to-cursor
  tokenizer->pos = &base->ctx.text;
}
25 
26 // Shortest word which can/should actually be stemmed
27 #define MIN_STEM_CANDIDATE_LEN 4
28 
// Size in bytes of the on-stack buffer that receives a normalized token in TOKENIZE_NOMODIFY mode
30 #define MAX_NORMALIZE_SIZE 128
31 
/**
 * Normalizes a raw token into dst.
 * - s contains the raw token
 * - dst is the destination buffer which receives the normalized text. It may
 *   be the same pointer as s (in-place normalization); otherwise it must have
 *   room for at least *len bytes. The output is NOT NUL-terminated.
 * - len on input contains the length of the raw token; on output contains the
 *   length of the normalized token
 *
 * Normalization rules, applied byte by byte:
 * - uppercase characters are lowercased
 * - control characters are dropped; blanks are dropped unless escaped
 * - an unescaped backslash is dropped and marks the following byte as escaped
 * - everything else is copied verbatim
 *
 * Returns dst.
 */
static char *DefaultNormalize(char *s, char *dst, size_t *len) {
  const size_t origLen = *len;
  size_t dstLen = 0;
  // set to 1 if the previous character was a backslash escape
  int escaped = 0;

  for (size_t ii = 0; ii < origLen; ++ii) {
    // The <ctype.h> classifiers are only defined for values representable as
    // unsigned char (or EOF); a plain char may be negative for non-ASCII bytes.
    const unsigned char ch = (unsigned char)s[ii];

    if (isupper(ch)) {
      dst[dstLen++] = (char)tolower(ch);
    } else if ((isblank(ch) && !escaped) || iscntrl(ch)) {
      // dropped from the output
    } else if (ch == '\\' && !escaped) {
      // the backslash itself is dropped; keep the flag for the next byte
      escaped = 1;
      continue;
    } else {
      // dstLen never exceeds ii, so this is safe even when dst == s
      dst[dstLen++] = s[ii];
    }
    escaped = 0;
  }

  *len = dstLen;
  return dst;
}
70 
71 // tokenize the text in the context
simpleTokenizer_Next(RSTokenizer * base,Token * t)72 uint32_t simpleTokenizer_Next(RSTokenizer *base, Token *t) {
73   TokenizerCtx *ctx = &base->ctx;
74   simpleTokenizer *self = (simpleTokenizer *)base;
75   while (*self->pos != NULL) {
76     // get the next token
77     size_t origLen;
78     char *tok = toksep(self->pos, &origLen);
79 
80     // normalize the token
81     size_t normLen = origLen;
82 
83     char normalized_s[MAX_NORMALIZE_SIZE];
84     char *normBuf;
85     if (ctx->options & TOKENIZE_NOMODIFY) {
86       normBuf = normalized_s;
87       if (normLen > MAX_NORMALIZE_SIZE) {
88         normLen = MAX_NORMALIZE_SIZE;
89       }
90     } else {
91       normBuf = tok;
92     }
93 
94     char *normalized = DefaultNormalize(tok, normBuf, &normLen);
95     // ignore tokens that turn into nothing
96     if (normalized == NULL || normLen == 0) {
97       continue;
98     }
99 
100     // skip stopwords
101     if (StopWordList_Contains(ctx->stopwords, normalized, normLen)) {
102       continue;
103     }
104 
105     *t = (Token){.tok = normalized,
106                  .tokLen = normLen,
107                  .raw = tok,
108                  .rawLen = origLen,
109                  .pos = ++ctx->lastOffset,
110                  .flags = Token_CopyStem,
111                  .phoneticsPrimary = t->phoneticsPrimary};
112 
113     // if we support stemming - try to stem the word
114     if (!(ctx->options & TOKENIZE_NOSTEM) && self->stemmer && normLen >= MIN_STEM_CANDIDATE_LEN) {
115       size_t sl;
116       const char *stem = self->stemmer->Stem(self->stemmer->ctx, tok, normLen, &sl);
117       if (stem) {
118         t->stem = stem;
119         t->stemLen = sl;
120       }
121     }
122 
123     if ((ctx->options & TOKENIZE_PHONETICS) && normLen >= RSGlobalConfig.minPhoneticTermLen) {
124       // VLA: eww
125       if (t->phoneticsPrimary) {
126         rm_free(t->phoneticsPrimary);
127         t->phoneticsPrimary = NULL;
128       }
129       PhoneticManager_ExpandPhonetics(NULL, tok, normLen, &t->phoneticsPrimary, NULL);
130     }
131 
132     return ctx->lastOffset;
133   }
134 
135   return 0;
136 }
137 
// Frees the tokenizer object itself. The stopword list reference is not
// touched here; Tokenizer_Release() is what drops it.
void simpleTokenizer_Free(RSTokenizer *self) {
  rm_free(self);
}
141 
// Re-arms a tokenizer for a new run: installs the stemmer, stopword list and
// options, and restarts token positions from zero. Takes a reference on
// `stopwords` when one is supplied.
static void doReset(RSTokenizer *tokbase, Stemmer *stemmer, StopWordList *stopwords,
                    uint32_t opts) {
  simpleTokenizer *self = (simpleTokenizer *)tokbase;
  TokenizerCtx *ctx = &self->base.ctx;

  self->stemmer = stemmer;
  ctx->lastOffset = 0;
  ctx->options = opts;
  ctx->stopwords = stopwords;

  // The first call happens while the mempool is creating the object, and at
  // that point there is no stopword list to reference.
  if (stopwords == NULL) {
    return;
  }
  StopWordList_Ref(stopwords);
}
155 
NewSimpleTokenizer(Stemmer * stemmer,StopWordList * stopwords,uint32_t opts)156 RSTokenizer *NewSimpleTokenizer(Stemmer *stemmer, StopWordList *stopwords, uint32_t opts) {
157   simpleTokenizer *t = rm_calloc(1, sizeof(*t));
158   t->base.Free = simpleTokenizer_Free;
159   t->base.Next = simpleTokenizer_Next;
160   t->base.Start = simpleTokenizer_Start;
161   t->base.Reset = doReset;
162   t->base.Reset(&t->base, stemmer, stopwords, opts);
163   return &t->base;
164 }
165 
166 static mempool_t *tokpoolLatin_g = NULL;
167 static mempool_t *tokpoolCn_g = NULL;
168 
// Pool allocation callback: creates a simple tokenizer with no stemmer, no
// stopwords and no options; these are supplied later through Reset().
static void *newLatinTokenizerAlloc(void) {
  return NewSimpleTokenizer(NULL, NULL, 0);
}
// Pool allocation callback: creates a Chinese tokenizer with no stemmer, no
// stopwords and no options; these are supplied later through Reset().
static void *newCnTokenizerAlloc(void) {
  return NewChineseTokenizer(NULL, NULL, 0);
}
tokenizerFree(void * p)175 static void tokenizerFree(void *p) {
176   RSTokenizer *t = p;
177   t->Free(t);
178 }
179 
GetTokenizer(RSLanguage language,Stemmer * stemmer,StopWordList * stopwords)180 RSTokenizer *GetTokenizer(RSLanguage language, Stemmer *stemmer, StopWordList *stopwords) {
181   if (language == RS_LANG_CHINESE) {
182     return GetChineseTokenizer(stemmer, stopwords);
183   } else {
184     return GetSimpleTokenizer(stemmer, stopwords);
185   }
186 }
187 
GetChineseTokenizer(Stemmer * stemmer,StopWordList * stopwords)188 RSTokenizer *GetChineseTokenizer(Stemmer *stemmer, StopWordList *stopwords) {
189   if (!tokpoolCn_g) {
190     mempool_options opts = {
191         .isGlobal = 1, .initialCap = 16, .alloc = newCnTokenizerAlloc, .free = tokenizerFree};
192     tokpoolCn_g = mempool_new(&opts);
193   }
194 
195   RSTokenizer *t = mempool_get(tokpoolCn_g);
196   t->Reset(t, stemmer, stopwords, 0);
197   return t;
198 }
199 
GetSimpleTokenizer(Stemmer * stemmer,StopWordList * stopwords)200 RSTokenizer *GetSimpleTokenizer(Stemmer *stemmer, StopWordList *stopwords) {
201   if (!tokpoolLatin_g) {
202     mempool_options opts = {
203         .isGlobal = 1, .initialCap = 16, .alloc = newLatinTokenizerAlloc, .free = tokenizerFree};
204     tokpoolLatin_g = mempool_new(&opts);
205   }
206   RSTokenizer *t = mempool_get(tokpoolLatin_g);
207   t->Reset(t, stemmer, stopwords, 0);
208   return t;
209 }
210 
// Hands a tokenizer back to the pool it belongs to. There is no type tag on
// the tokenizer yet, so the Next callback pointer is used to tell the two
// kinds apart. For the simple tokenizer the stopword list reference is dropped
// before the object is pooled.
void Tokenizer_Release(RSTokenizer *t) {
  if (t->Next != simpleTokenizer_Next) {
    mempool_release(tokpoolCn_g, t);
    return;
  }

  TokenizerCtx *ctx = &t->ctx;
  if (ctx->stopwords) {
    StopWordList_Unref(ctx->stopwords);
    ctx->stopwords = NULL;
  }
  mempool_release(tokpoolLatin_g, t);
}
224