1 //
2 // $Id$
3 //
4
5 //
6 // Copyright (c) 2010, Ammar Ali
7 // Copyright (c) 2012, Sphinx Technologies Inc
8 //
9 // All rights reserved
10 //
11 // This program is free software; you can redistribute it and/or modify
12 // it under the terms of the GNU General Public License. You should have
13 // received a copy of the GPL license along with this program; if you
14 // did not, you can find it at http://www.gnu.org/
15 //
16
17 //
18 // The extended ISRI Arabic stemmer
19 //
20 // Original algorithm as described by Kazem Taghva, Rania Elkoury, and
21 // Jeffery Coombs in "Arabic Stemming Without A Root Dictionary", 2005.
22 // DOI 10.1109/ITCC.2005.90
23 //
24 // Extensions (by Ammar Ali) include:
25 // - added stripping of kashida (tatweel) characters;
26 // - added support for matching recurring stem characters;
27 // - added "ef3ou3ala" verb form to the known patterns.
28 //
29
30 #include "sphinx.h"
31
32 /// characters used in affix and word form patterns
33 /// FIXME! not right on big-endian
34 #define AR_ALEF_HA 0xA3D8U // U+0623
35 #define AR_ALEF 0xA7D8U // U+0627
36 #define AR_BA 0xA8D8U // U+0628
37 #define AR_TA_M 0xA9D8U // U+0629
38 #define AR_TA 0xAAD8U // U+062A
39 #define AR_SEEN 0xB3D8U // U+0633
40 #define AR_TATWEEL 0x80D9U // U+0640
41 #define AR_FA 0x81D9U // U+0641
42 #define AR_KAF 0x83D9U // U+0643
43 #define AR_LAM 0x84D9U // U+0644
44 #define AR_MIM 0x85D9U // U+0645
45 #define AR_NOON 0x86D9U // U+0646
46 #define AR_HA 0x87D9U // U+0647
47 #define AR_WAW 0x88D9U // U+0648
48 #define AR_YA 0x8AD9U // U+064A
49
50 /// extension; used for recurring root character matching
51 #define MATCH_M 0xB0DBU // 0x06F0
52 #define MATCH_0 MATCH_M
53 #define MATCH_1 0xB1DBU // 0x06F1
54 #define MATCH_2 0xB2DBU // 0x06F2
55 #define MATCH_3 0xB3DBU // 0x06F3
56 #define MATCH_4 0xB4DBU // 0x06F4
57
58 //////////////////////////////////////////////////////////////////////////
59 // CHARACTER SET TESTS
60 //////////////////////////////////////////////////////////////////////////
61
62 typedef WORD ar_char;
63
64 // AR_HAMZA_SET (U+0621, U+0624, U+0626)
65 #define AR_HAMZA_SET ( c==0xA1D8U || c==0xA4D8U || c==0xA6D8U )
66
67 // ALEF_SET (U+0623, U+0625, U+0671)
68 #define AR_ALEF_SET ( c==0xA3D8U || c==0xA5D8U || c==0xB1D9U )
69
70 // AR_DIACRITIC (>= U+064B &&<=U+0652)
71 #define AR_DIACRITIC(c) \
72 ( c==0x8BD9U || c==0x8CD9U || c==0x8DD9U \
73 || c==0x8ED9U || c==0x8FD9U || c==0x90D9U \
74 || c==0x91D9U || c==0x92D9U )
75
76 // AR_KASHIDA (U+0640)
77 #define AR_KASHIDA(c) ( c==0x80D9U )
78
79 // OPTIMIZE?
80 #define AR_WORD_LENGTH ((int)(strlen((char*)word) / sizeof(ar_char)))
81
82 // FIXME? can crash on misaligned reads
83 #define AR_CHAR_AT(i) (*((ar_char*)(&word[(i * sizeof(ar_char))])))
84
85 #define AR_CHAR_SET(i,c) \
86 { \
87 ar_char *p = (ar_char*) &word[(i * sizeof(ar_char))]; \
88 *p = (ar_char) c; \
89 }
90
91
92 /// remove length chars starting from start, null-terminating to new length
ar_remove(BYTE * word,int start,int length)93 static inline void ar_remove ( BYTE * word, int start, int length )
94 {
95 int remain = ((AR_WORD_LENGTH - length - start) * sizeof(ar_char));
96 ar_char *s = (ar_char*) &word[(start * sizeof(ar_char))];
97 if ( remain > 0 ) {
98 memmove((void*)s, (void*)(s + length), remain);
99 s = s + (remain / sizeof(ar_char));
100 }
101 *s = '\0';
102 }
103
104
105 /// normalize (replace) all occurrences of chars in set to to_char
106 #define AR_NORMALIZE(set, to_char) \
107 { \
108 int wlen = AR_WORD_LENGTH; \
109 while ( wlen > 0 ) { \
110 int ni = wlen - 1; \
111 ar_char c = AR_CHAR_AT(ni); \
112 if ( set ) { \
113 AR_CHAR_SET ( ni, to_char ); \
114 } wlen--; \
115 } \
116 }
117
118
119 /// remove all occurances of chars that match the given macro (e.g. KASHIDA)
120 #define AR_STRIP(_what) \
121 { \
122 int wlen = AR_WORD_LENGTH; \
123 while ( wlen > 0 ) { \
124 int si = wlen - 1; \
125 if ( _what ( AR_CHAR_AT(si) ) ) \
126 ar_remove ( word, si, 1 ); \
127 wlen--; \
128 } \
129 }
130
131
132 /// attempt to match and remove a prefix with the given character count
133 #define AR_PREFIX(count) \
134 { \
135 int match = ar_match_affix ( word, prefix_##count, count, 0 ); \
136 if ( match>=0 ) \
137 ar_remove ( word, 0, count ); \
138 }
139
140
141 /// attempt to match and remove a suffix with the given character count
142 #define AR_SUFFIX(count) \
143 { \
144 int match = ar_match_affix ( word, suffix_##count, count, 1 ); \
145 if ( match>=0 ) \
146 ar_remove ( word, AR_WORD_LENGTH - count, count ); \
147 }
148
149 //////////////////////////////////////////////////////////////////////////
150 // TYPES
151 //////////////////////////////////////////////////////////////////////////
152
153 struct ar_affix_t
154 {
155 ar_char chars[4];
156 };
157
158 struct ar_form_entry_t
159 {
160 int at; // index to match at
161 ar_char cp; // code point to match
162 };
163
164 struct ar_form_t
165 {
166 ar_form_entry_t entry[4];
167 };
168
169 //////////////////////////////////////////////////////////////////////////
170 // PREFIX LOOKUP TABLES
171 //////////////////////////////////////////////////////////////////////////
172
173 /// 3-letter prefixes
174 static struct ar_affix_t prefix_3[] =
175 {
176 { { AR_WAW, AR_LAM, AR_LAM, 0 } },
177 { { AR_WAW, AR_ALEF, AR_LAM, 0 } },
178 { { AR_KAF, AR_ALEF, AR_LAM, 0 } },
179 { { AR_BA, AR_ALEF, AR_LAM, 0 } },
180
181 // Extensions
182 { { AR_ALEF, AR_SEEN, AR_TA, 0 }},
183 { { AR_WAW, AR_BA, AR_MIM, 0 }},
184 { { AR_WAW, AR_BA, AR_ALEF, 0 }},
185 { { 0 } }
186 };
187
188 /// 2-letter prefixes
189 static struct ar_affix_t prefix_2[] =
190 {
191 { { AR_ALEF, AR_LAM, 0 } },
192 { { AR_LAM, AR_LAM, 0 } },
193 { { 0 } }
194 };
195
196 /// 1-letter prefixes
197 static struct ar_affix_t prefix_1[] =
198 {
199 { { AR_ALEF, 0 } },
200 { { AR_BA, 0 } },
201 { { AR_TA, 0 } },
202 { { AR_SEEN, 0 } },
203 { { AR_FA, 0 } },
204 { { AR_LAM, 0 } },
205 { { AR_NOON, 0 } },
206 { { AR_WAW, 0 } },
207 { { AR_YA, 0 } },
208 { { 0 } }
209 };
210
211 //////////////////////////////////////////////////////////////////////////
212 // SUFFIX LOOKUP TABLES
213 //////////////////////////////////////////////////////////////////////////
214
215 /// 3-letter suffixes
216 static struct ar_affix_t suffix_3[] =
217 {
218 { { AR_TA, AR_MIM, AR_LAM, 0 } },
219 { { AR_HA, AR_MIM, AR_LAM, 0 } },
220 { { AR_TA, AR_ALEF, AR_NOON, 0 } },
221 { { AR_TA, AR_YA, AR_NOON, 0 } },
222 { { AR_KAF, AR_MIM, AR_LAM, 0 } },
223 { { 0 } }
224 };
225
226 /// 2-letter suffixes
227 static struct ar_affix_t suffix_2[] = {
228 { { AR_WAW, AR_NOON, 0 } },
229 { { AR_ALEF, AR_TA, 0 } },
230 { { AR_ALEF, AR_NOON, 0 } },
231 { { AR_YA, AR_NOON, 0 } },
232 { { AR_TA, AR_NOON, 0 } },
233 { { AR_KAF, AR_MIM, 0 } },
234 { { AR_HA, AR_NOON, 0 } },
235 { { AR_NOON, AR_ALEF, 0 } },
236 { { AR_YA, AR_ALEF, 0 } },
237 { { AR_HA, AR_ALEF, 0 } },
238 { { AR_TA, AR_MIM, 0 } },
239 { { AR_KAF, AR_NOON, 0 } },
240 { { AR_NOON, AR_YA, 0 } },
241 { { AR_WAW, AR_ALEF, 0 } },
242 { { AR_MIM, AR_ALEF, 0 } },
243 { { AR_HA, AR_MIM, 0 } },
244
245 // extensions
246 { { AR_WAW, AR_HA, 0 }},
247 { { 0 } }
248 };
249
250 /// 1-letter prefixes
251 static struct ar_affix_t suffix_1[] =
252 {
253 { { AR_ALEF, 0 } },
254 { { AR_TA_M, 0 } },
255 { { AR_TA, 0 } },
256 { { AR_KAF, 0 } },
257 { { AR_NOON, 0 } },
258 { { AR_HA, 0 } },
259 { { AR_YA, 0 } },
260 { { 0 } }
261 };
262
263 //////////////////////////////////////////////////////////////////////////
264 // FORMS
265 //////////////////////////////////////////////////////////////////////////
266
267 /// forms for 4 letter words that yield a 3 letter stem
268 static struct ar_form_t form_4_3[] =
269 {
270 { { { 3, AR_TA_M }, { 0xFF, 0 } } },
271 { { { 1, AR_ALEF }, { 0xFF, 0 } } },
272 { { { 0, AR_MIM }, { 0xFF, 0 } } }, // originally at index 05
273 { { { 2, AR_WAW }, { 0xFF, 0 } } },
274 { { { 2, AR_ALEF }, { 0xFF, 0 } } },
275 { { { 2, AR_YA }, { 0xFF, 0 } } },
276 { { { 0xFF, 0 } } }
277 };
278
279 /// forms for 5 letter words that yield a 3 letter stem
280 static struct ar_form_t form_5_3[] =
281 {
282 { { { 0, AR_TA }, { 2, AR_ALEF }, { 0xFF, 0 } } },
283 { { { 0, AR_ALEF }, { 2, AR_TA }, { 0xFF, 0 } } },
284 { { { 0, AR_ALEF }, { 3, AR_ALEF }, { 0xFF, 0 } } },
285 { { { 0, AR_ALEF }, { 2, AR_ALEF }, { 0xFF, 0 } } },
286 { { { 2, AR_ALEF }, { 4, AR_TA_M }, { 0xFF, 0 } } },
287 { { { 3, AR_ALEF }, { 4, AR_NOON }, { 0xFF, 0 } } },
288 { { { 2, AR_WAW }, { 4, AR_TA_M }, { 0xFF, 0 } } },
289 { { { 0, AR_TA }, { 4, AR_TA_M }, { 0xFF, 0 } } },
290 { { { 0, AR_TA }, { 3, AR_YA }, { 0xFF, 0 } } },
291 { { { 0, AR_MIM }, { 4, AR_TA_M }, { 0xFF, 0 } } },
292 { { { 0, AR_MIM }, { 2, AR_ALEF }, { 0xFF, 0 } } },
293 { { { 0, AR_MIM }, { 3, AR_WAW }, { 0xFF, 0 } } },
294 { { { 1, AR_ALEF }, { 3, AR_WAW }, { 0xFF, 0 } } },
295 { { { 1, AR_WAW }, { 2, AR_ALEF }, { 0xFF, 0 } } },
296 { { { 0, AR_MIM }, { 3, AR_ALEF }, { 0xFF, 0 } } },
297 { { { 0, AR_MIM }, { 3, AR_YA }, { 0xFF, 0 } } },
298 { { { 0, AR_ALEF }, { 4, AR_TA_M }, { 0xFF, 0 } } },
299 { { { 2, AR_ALEF }, { 3, AR_NOON }, { 0xFF, 0 } } },
300 { { { 0, AR_MIM }, { 1, AR_NOON }, { 0xFF, 0 } } },
301 { { { 0, AR_MIM }, { 2, AR_TA }, { 0xFF, 0 } } },
302 { { { 1, AR_ALEF }, { 4, AR_TA_M }, { 0xFF, 0 } } },
303 { { { 0, AR_YA }, { 2, AR_TA }, { 0xFF, 0 } } },
304 { { { 0, AR_TA }, { 2, AR_TA }, { 0xFF, 0 } } },
305 { { { 2, AR_ALEF }, { 4, AR_YA }, { 0xFF, 0 } } },
306 { { { 0, AR_ALEF }, { 1, AR_NOON }, { 0xFF, 0 } } },
307
308 // extensions
309 { { { 1, AR_TA }, { 4, AR_WAW }, { 0xFF, 0 } } },
310 { { { 0, AR_MIM }, { 1, AR_TA }, { 0xFF, 0 } } },
311 { { { 0, AR_TA }, { 4, AR_TA }, { 0xFF, 0 } } },
312 { { { 1, AR_ALEF }, { 3, AR_YA }, { 0xFF, 0 } } },
313 { { { 0xFF, 0 } } }
314 };
315
316 /// forms for 5 letter words that yield a 4 letter stem
317 static struct ar_form_t form_5_4[] =
318 {
319 { { { 0, AR_TA }, { 0xFF, 0 } } },
320 { { { 0, AR_ALEF }, { 0xFF, 0 } } },
321 { { { 0, AR_MIM }, { 0xFF, 0 } } },
322 { { { 4, AR_TA_M }, { 0xFF, 0 } } },
323 { { { 2, AR_ALEF }, { 0xFF, 0 } } },
324 { { { 0xFF, 0} }}
325 };
326
327 /// forms for 6 letter words that yield a 3 letter stem
328 static struct ar_form_t form_6_3[] =
329 {
330 { { { 0, AR_ALEF}, { 1, AR_SEEN }, { 2, AR_TA }, { 0xFF, 0 } }},
331 { { { 0, AR_MIM}, { 3, AR_ALEF }, { 5, AR_TA_M }, { 0xFF, 0 } }},
332 { { { 0, AR_ALEF}, { 2, AR_TA }, { 4, AR_ALEF }, { 0xFF, 0 } }},
333
334 // extensions that match recurring 2nd root letter
335 { { { 0, AR_ALEF }, { 3, AR_WAW }, { 4, MATCH_2 }, { 0xFF, 0 } }},
336 { { { 0, AR_MIM }, { 1, AR_SEEN }, { 2, AR_TA }, { 0xFF, 0 } }},
337
338 // extensions
339 { { { 0, AR_MIM }, { 2, AR_ALEF}, { 4, AR_YA }, { 0xFF, 0 } }},
340 { { { 0xFF, 0 } }}
341 };
342
343 /// forms for 6 letter words that yield a 4 letter stem
344 static struct ar_form_t form_6_4[] =
345 {
346 { { { 0, AR_ALEF }, { 4, AR_ALEF }, { 0xFF, 0 } } },
347 { { { 0, AR_MIM }, { 1, AR_TA }, { 0xFF, 0 } } },
348 { { { 0xFF, 0} }}
349 };
350
351
352 /// attempt to match the given word against one of the given affix rules
ar_match_affix(BYTE * word,struct ar_affix_t * affixes,int length,int reverse)353 static int ar_match_affix ( BYTE * word, struct ar_affix_t * affixes, int length, int reverse )
354 {
355 int match = -1, ai = 0;
356 while ( affixes[ai].chars[0] && match<0 )
357 {
358 int ci = 0;
359 while ( affixes[ai].chars[ci] && match<0 )
360 {
361 int wi = ci;
362 if ( reverse )
363 wi = (AR_WORD_LENGTH - length) + ci;
364
365 if ( AR_CHAR_AT(wi)!=affixes[ai].chars[ci] )
366 break;
367
368 ci++;
369 if ( affixes[ai].chars[ci]==0 )
370 match = ai;
371 }
372 ai++;
373 }
374 return match;
375 }
376
377
378 /// attempt to match the given word against one of the given form
379 /// rules, and if found, extract the stem
ar_match_form(BYTE * word,struct ar_form_t * forms)380 static int ar_match_form ( BYTE * word, struct ar_form_t * forms )
381 {
382 int match = -1, fi = 0;
383 while ( forms[fi].entry[0].at!=0xFF && match < 0 )
384 {
385 int pi = 0;
386 while ( forms[fi].entry[pi].at!=0xFF && match<0 )
387 {
388 if ( forms[fi].entry[pi].cp>=MATCH_M && forms[fi].entry[pi].cp<=MATCH_4 )
389 {
390 int index = ( forms[fi].entry[pi].cp - MATCH_M ) >> 8;
391 if ( AR_CHAR_AT(index)!=AR_CHAR_AT(forms[fi].entry[pi].at) )
392 break;
393 } else
394 {
395 if ( forms[fi].entry[pi].cp!=AR_CHAR_AT ( forms[fi].entry[pi].at ) )
396 break;
397 }
398
399 pi++;
400 if ( forms[fi].entry[pi].at==0xFF )
401 match = fi;
402 }
403 fi++;
404 }
405
406 // if match found, extract the stem
407 if ( match>=0 )
408 {
409 int pi = 0;
410 while ( forms[match].entry[pi].at!=0xFF )
411 {
412 ar_remove ( word, (forms[match].entry[pi].at - pi), 1 );
413 pi++;
414 }
415 }
416
417 return match;
418 }
419
420
ar_word_4(BYTE * word)421 static void ar_word_4 ( BYTE * word )
422 {
423 if ( ar_match_form ( word, form_4_3 )>=0 )
424 return;
425 AR_SUFFIX(1);
426 if ( AR_WORD_LENGTH==4 )
427 AR_PREFIX(1);
428 }
429
430
ar_word_5(BYTE * word)431 static void ar_word_5 ( BYTE * word )
432 {
433 if ( ar_match_form ( word, form_5_3 )>=0 )
434 return;
435 AR_SUFFIX(1);
436 if ( AR_WORD_LENGTH==4 )
437 {
438 ar_word_4(word);
439 return;
440 }
441 AR_PREFIX(1);
442 if ( AR_WORD_LENGTH==4 )
443 ar_word_4(word);
444 else if ( AR_WORD_LENGTH==5 )
445 ar_match_form ( word, form_5_4 );
446 }
447
448
ar_word_6(BYTE * word)449 static void ar_word_6 ( BYTE * word )
450 {
451 if ( ar_match_form ( word, form_6_3 )>=0 )
452 return;
453 AR_SUFFIX(1);
454 if ( AR_WORD_LENGTH==5 )
455 {
456 ar_word_5(word);
457 return;
458 }
459 AR_PREFIX(1);
460 if ( AR_WORD_LENGTH==5 )
461 ar_word_5(word);
462 else if ( AR_WORD_LENGTH==6 )
463 ar_match_form ( word, form_6_4 );
464 }
465
466
stem_ar_utf8(BYTE * word)467 void stem_ar_utf8 ( BYTE * word )
468 {
469 AR_STRIP ( AR_DIACRITIC );
470 AR_STRIP ( AR_KASHIDA ); // extension
471
472 AR_NORMALIZE ( AR_HAMZA_SET, AR_ALEF_HA );
473
474 #ifdef AR_STEM_AGGRESSIVE
475 // extension; does both if possible (i.e. 6 and 5, not 6 else 5)
476 if ( AR_WORD_LENGTH>=6 )
477 AR_PREFIX(3);
478 if ( AR_WORD_LENGTH>=5 )
479 AR_PREFIX(2);
480
481 if ( AR_WORD_LENGTH>=6 )
482 AR_SUFFIX(3);
483 if ( AR_WORD_LENGTH>=5 )
484 AR_SUFFIX(2);
485 #else
486 // original; does one only (i.e. 6 or 5, not 6 and 5)
487 if ( AR_WORD_LENGTH>=6 )
488 AR_PREFIX(3)
489 else if ( AR_WORD_LENGTH>=5 )
490 AR_PREFIX(2);
491
492 if ( AR_WORD_LENGTH>=6 )
493 AR_SUFFIX(3)
494 else if ( AR_WORD_LENGTH>=5 )
495 AR_SUFFIX(2);
496 #endif
497
498 AR_NORMALIZE ( AR_ALEF_SET, AR_ALEF );
499
500 switch ( AR_WORD_LENGTH )
501 {
502 case 4: ar_word_4 ( word ); return;
503 case 5: ar_word_5 ( word ); return;
504 case 6: ar_word_6 ( word ); return;
505 case 7:
506 AR_SUFFIX(1);
507 if ( AR_WORD_LENGTH==6 )
508 {
509 ar_word_6(word);
510 return;
511 }
512 AR_PREFIX(1);
513 if ( AR_WORD_LENGTH==6 )
514 ar_word_6(word);
515 return;
516 }
517 }
518
519 //
520 // $Id$
521 //
522