1 #include "forward_index.h"
2 #include "stopwords.h"
3 #include "tokenize.h"
4 #include "toksep.h"
5 #include "rmalloc.h"
6 #include <ctype.h>
7 #include <stdlib.h>
8 #include <strings.h>
9 #include "phonetic_manager.h"
10
11 typedef struct {
12 RSTokenizer base;
13 char **pos;
14 Stemmer *stemmer;
15 } simpleTokenizer;
16
/**
 * Installs a new text buffer to tokenize.
 * - base is the tokenizer (must be a simpleTokenizer)
 * - text / len describe the buffer to scan
 * - options are the TOKENIZE_* flags stored in the context
 */
static void simpleTokenizer_Start(RSTokenizer *base, char *text, size_t len, uint32_t options) {
  base->ctx.text = text;
  base->ctx.len = len;
  base->ctx.options = options;
  // The cursor aliases ctx.text itself, so whatever advances the cursor also
  // updates the text pointer stored in the context.
  ((simpleTokenizer *)base)->pos = &base->ctx.text;
}
25
enum {
  // Shortest word which can/should actually be stemmed
  MIN_STEM_CANDIDATE_LEN = 4,

  // Size in bytes of the normalization scratch buffer
  MAX_NORMALIZE_SIZE = 128
};
31
/**
 * Normalizes a raw token: uppercase letters are lowercased, unescaped blanks
 * and all control characters are dropped, and an unescaped backslash is
 * removed while the character following it is kept.
 * - s is the raw token
 * - dst is the destination buffer that receives the normalized text; it may
 *   be the same pointer as s (in-place) and must have room for *len bytes
 * - len on input holds the length of the raw token; on output it holds the
 *   length of the normalized token
 * Returns dst. No NUL terminator is written.
 */
static char *DefaultNormalize(char *s, char *dst, size_t *len) {
  const size_t origLen = *len;
  size_t dstLen = 0;
  // Non-zero when the previous character was an unescaped backslash
  int escaped = 0;

  // dstLen never exceeds ii, so writing dst[dstLen] cannot clobber input that
  // has not been read yet, even when dst == s.
  for (size_t ii = 0; ii < origLen; ++ii) {
    // The <ctype.h> classifiers are only defined for values representable as
    // unsigned char (or EOF); passing a plain char that holds a byte >= 0x80
    // is undefined behaviour where char is signed.
    const unsigned char ch = (unsigned char)s[ii];

    if (ch == '\\' && !escaped) {
      // Swallow the backslash and remember that the next character is escaped
      escaped = 1;
      continue;
    }

    if (isupper(ch)) {
      dst[dstLen++] = (char)tolower(ch);
    } else if (!((isblank(ch) && !escaped) || iscntrl(ch))) {
      dst[dstLen++] = s[ii];
    }
    // else: unescaped blank or control character, dropped from the output

    escaped = 0;
  }

  *len = dstLen;
  return dst;
}
70
71 // tokenize the text in the context
simpleTokenizer_Next(RSTokenizer * base,Token * t)72 uint32_t simpleTokenizer_Next(RSTokenizer *base, Token *t) {
73 TokenizerCtx *ctx = &base->ctx;
74 simpleTokenizer *self = (simpleTokenizer *)base;
75 while (*self->pos != NULL) {
76 // get the next token
77 size_t origLen;
78 char *tok = toksep(self->pos, &origLen);
79
80 // normalize the token
81 size_t normLen = origLen;
82
83 char normalized_s[MAX_NORMALIZE_SIZE];
84 char *normBuf;
85 if (ctx->options & TOKENIZE_NOMODIFY) {
86 normBuf = normalized_s;
87 if (normLen > MAX_NORMALIZE_SIZE) {
88 normLen = MAX_NORMALIZE_SIZE;
89 }
90 } else {
91 normBuf = tok;
92 }
93
94 char *normalized = DefaultNormalize(tok, normBuf, &normLen);
95 // ignore tokens that turn into nothing
96 if (normalized == NULL || normLen == 0) {
97 continue;
98 }
99
100 // skip stopwords
101 if (StopWordList_Contains(ctx->stopwords, normalized, normLen)) {
102 continue;
103 }
104
105 *t = (Token){.tok = normalized,
106 .tokLen = normLen,
107 .raw = tok,
108 .rawLen = origLen,
109 .pos = ++ctx->lastOffset,
110 .flags = Token_CopyStem,
111 .phoneticsPrimary = t->phoneticsPrimary};
112
113 // if we support stemming - try to stem the word
114 if (!(ctx->options & TOKENIZE_NOSTEM) && self->stemmer && normLen >= MIN_STEM_CANDIDATE_LEN) {
115 size_t sl;
116 const char *stem = self->stemmer->Stem(self->stemmer->ctx, tok, normLen, &sl);
117 if (stem) {
118 t->stem = stem;
119 t->stemLen = sl;
120 }
121 }
122
123 if ((ctx->options & TOKENIZE_PHONETICS) && normLen >= RSGlobalConfig.minPhoneticTermLen) {
124 // VLA: eww
125 if (t->phoneticsPrimary) {
126 rm_free(t->phoneticsPrimary);
127 t->phoneticsPrimary = NULL;
128 }
129 PhoneticManager_ExpandPhonetics(NULL, tok, normLen, &t->phoneticsPrimary, NULL);
130 }
131
132 return ctx->lastOffset;
133 }
134
135 return 0;
136 }
137
/**
 * Free callback of the simple tokenizer: releases the tokenizer object itself.
 * The stopword list referenced by the context is not touched here.
 */
void simpleTokenizer_Free(RSTokenizer *self) {
  simpleTokenizer *tokenizer = (simpleTokenizer *)self;
  rm_free(tokenizer);
}
141
/**
 * Reset callback: re-arms a tokenizer with a new stemmer, stopword list and
 * option flags, and restarts position counting at 0.
 * A reference is taken on `stopwords` when it is non-NULL.
 */
static void doReset(RSTokenizer *tokbase, Stemmer *stemmer, StopWordList *stopwords,
                    uint32_t opts) {
  TokenizerCtx *ctx = &tokbase->ctx;
  ctx->lastOffset = 0;
  ctx->options = opts;
  ctx->stopwords = stopwords;
  ((simpleTokenizer *)tokbase)->stemmer = stemmer;

  // The first call comes from the mempool allocator, which passes a NULL
  // stopword list; there is nothing to reference in that case.
  if (stopwords == NULL) {
    return;
  }
  StopWordList_Ref(stopwords);
}
155
NewSimpleTokenizer(Stemmer * stemmer,StopWordList * stopwords,uint32_t opts)156 RSTokenizer *NewSimpleTokenizer(Stemmer *stemmer, StopWordList *stopwords, uint32_t opts) {
157 simpleTokenizer *t = rm_calloc(1, sizeof(*t));
158 t->base.Free = simpleTokenizer_Free;
159 t->base.Next = simpleTokenizer_Next;
160 t->base.Start = simpleTokenizer_Start;
161 t->base.Reset = doReset;
162 t->base.Reset(&t->base, stemmer, stopwords, opts);
163 return &t->base;
164 }
165
166 static mempool_t *tokpoolLatin_g = NULL;
167 static mempool_t *tokpoolCn_g = NULL;
168
/**
 * Pool allocator callback for simple tokenizers: creates a tokenizer with no
 * stemmer, no stopword list and no options. GetSimpleTokenizer() resets it
 * with real values after taking it from the pool.
 */
static void *newLatinTokenizerAlloc(void) {
  return NewSimpleTokenizer(NULL, NULL, 0);
}
/**
 * Pool allocator callback for Chinese tokenizers: creates a tokenizer with no
 * stemmer, no stopword list and no options. GetChineseTokenizer() resets it
 * after taking it from the pool.
 */
static void *newCnTokenizerAlloc(void) {
  return NewChineseTokenizer(NULL, NULL, 0);
}
tokenizerFree(void * p)175 static void tokenizerFree(void *p) {
176 RSTokenizer *t = p;
177 t->Free(t);
178 }
179
GetTokenizer(RSLanguage language,Stemmer * stemmer,StopWordList * stopwords)180 RSTokenizer *GetTokenizer(RSLanguage language, Stemmer *stemmer, StopWordList *stopwords) {
181 if (language == RS_LANG_CHINESE) {
182 return GetChineseTokenizer(stemmer, stopwords);
183 } else {
184 return GetSimpleTokenizer(stemmer, stopwords);
185 }
186 }
187
GetChineseTokenizer(Stemmer * stemmer,StopWordList * stopwords)188 RSTokenizer *GetChineseTokenizer(Stemmer *stemmer, StopWordList *stopwords) {
189 if (!tokpoolCn_g) {
190 mempool_options opts = {
191 .isGlobal = 1, .initialCap = 16, .alloc = newCnTokenizerAlloc, .free = tokenizerFree};
192 tokpoolCn_g = mempool_new(&opts);
193 }
194
195 RSTokenizer *t = mempool_get(tokpoolCn_g);
196 t->Reset(t, stemmer, stopwords, 0);
197 return t;
198 }
199
GetSimpleTokenizer(Stemmer * stemmer,StopWordList * stopwords)200 RSTokenizer *GetSimpleTokenizer(Stemmer *stemmer, StopWordList *stopwords) {
201 if (!tokpoolLatin_g) {
202 mempool_options opts = {
203 .isGlobal = 1, .initialCap = 16, .alloc = newLatinTokenizerAlloc, .free = tokenizerFree};
204 tokpoolLatin_g = mempool_new(&opts);
205 }
206 RSTokenizer *t = mempool_get(tokpoolLatin_g);
207 t->Reset(t, stemmer, stopwords, 0);
208 return t;
209 }
210
/**
 * Returns a tokenizer to the pool it belongs to.
 * There is no type tag on RSTokenizer, so the implementation is recognized by
 * comparing its Next callback. For simple tokenizers the stopword list
 * reference held in the context is dropped before the tokenizer is pooled.
 */
void Tokenizer_Release(RSTokenizer *t) {
  if (t->Next != simpleTokenizer_Next) {
    // Anything that is not a simple tokenizer goes to the Chinese pool
    mempool_release(tokpoolCn_g, t);
    return;
  }

  if (t->ctx.stopwords != NULL) {
    StopWordList_Unref(t->ctx.stopwords);
    t->ctx.stopwords = NULL;
  }
  mempool_release(tokpoolLatin_g, t);
}
224