1/*
2* Affix stripping stemming algorithm for Tamil
3* By Damodharan Rajalingam
4*/
5
// String escapes: inside a literal, {name} expands to the stringdef of that
// name and {U+XXXX} denotes a Unicode code point.
stringescapes {}

// ---- Aytham ----
stringdef aytham   '{U+0B83}'

// ---- Uyir: independent vowel letters ----
stringdef a        '{U+0B85}'
stringdef aa       '{U+0B86}'
stringdef i        '{U+0B87}'
stringdef ii       '{U+0B88}'
stringdef u        '{U+0B89}'
stringdef uu       '{U+0B8A}'
stringdef e        '{U+0B8E}'
stringdef ee       '{U+0B8F}'
stringdef ai       '{U+0B90}'
stringdef o        '{U+0B92}'
stringdef oo       '{U+0B93}'
stringdef au       '{U+0B94}'

// ---- Consonant letters ----
// Note: `ta` and `tha` are two names for the same code point (U+0BA4),
// and likewise `llla` and `zha` (U+0BB4). Both spellings are used by the
// routines further down, so both names must stay defined.
stringdef ka       '{U+0B95}'
stringdef nga      '{U+0B99}'
stringdef ca       '{U+0B9A}'
stringdef ja       '{U+0B9C}'
stringdef nya      '{U+0B9E}'
stringdef tta      '{U+0B9F}'
stringdef nna      '{U+0BA3}'
stringdef ta       '{U+0BA4}'
stringdef tha      '{U+0BA4}'
stringdef na       '{U+0BA8}'
stringdef nnna     '{U+0BA9}'
stringdef pa       '{U+0BAA}'
stringdef ma       '{U+0BAE}'
stringdef ya       '{U+0BAF}'
stringdef ra       '{U+0BB0}'
stringdef rra      '{U+0BB1}'
stringdef la       '{U+0BB2}'
stringdef lla      '{U+0BB3}'
stringdef llla     '{U+0BB4}'
stringdef zha      '{U+0BB4}'
stringdef va       '{U+0BB5}'

// ---- Vatamozi: borrowed consonant letters ----
stringdef sha      '{U+0BB6}'
stringdef ssa      '{U+0BB7}'
stringdef sa       '{U+0BB8}'
stringdef ha       '{U+0BB9}'


// ---- Dependent vowel signs (attached to a preceding consonant) ----
stringdef vs_aa    '{U+0BBE}'
stringdef vs_i     '{U+0BBF}'
stringdef vs_ii    '{U+0BC0}'
stringdef vs_u     '{U+0BC1}'
stringdef vs_uu    '{U+0BC2}'
stringdef vs_e     '{U+0BC6}'
stringdef vs_ee    '{U+0BC7}'
stringdef vs_ai    '{U+0BC8}'
stringdef vs_o     '{U+0BCA}'
stringdef vs_oo    '{U+0BCB}'
stringdef vs_au    '{U+0BCC}'

// ---- Pulli ----
stringdef pulli    '{U+0BCD}'

// ---- AU length mark ----
stringdef au_lmark '{U+0BD7}'
73
74
// Every routine defined in this file must be declared here.
routines (
    // length guard
    has_min_length

    // repairs applied after an affix has been removed
    fix_va_start
    fix_ending
    fix_endings

    // prefix strippers
    remove_question_prefixes
    remove_pronoun_prefixes

    // suffix strippers
    remove_question_suffixes
    remove_um
    remove_common_word_endings
    remove_vetrumai_urupukal
    remove_plural_suffix
    remove_command_suffixes
    remove_tense_suffix
    remove_tense_suffixes
)

// Entry point exported to the generated stemmer.
externals ( stem )

// found_a_match        - set by a remove_* routine when it changed the word.
// found_vetrumai_urupu - set by remove_vetrumai_urupukal; read by fix_ending.
booleans (
    found_a_match
    found_vetrumai_urupu
)
98
// Length guard shared by the remove_* routines: signals t only when the
// current word is at least five characters long, f otherwise.
define has_min_length as (
    $(len >= 5)
)
102
// Rewrites `va` + a rounded vowel sign found at the cursor into the
// corresponding independent vowel letter:
//     va+vs_oo -> oo,  va+vs_o -> o,  va+vs_u -> u,  va+vs_uu -> uu
// Signals t if one of the four sequences was replaced, f otherwise.
//
// The earlier form wrote each case as `(try S and [S] <- R)`. The
// `try S and` part was a no-op (`try` always succeeds and `and` rewinds the
// cursor), so the cases are expressed directly as an among with actions.
// None of the four strings is a prefix of another, so the order in which
// they are listed does not matter.
define fix_va_start as (
    [substring] among (
        '{va}{vs_oo}' ( <- '{oo}' )
        '{va}{vs_o}'  ( <- '{o}' )
        '{va}{vs_u}'  ( <- '{u}' )
        '{va}{vs_uu}' ( <- '{uu}' )
    )
)
109
// Keeps applying fix_ending until it signals f, i.e. until no further
// ending repair applies. Always signals t itself.
define fix_endings as (
    do (
        repeat fix_ending
    )
)
113
// Deletes a leading `e` + consonant + pulli sequence, where the consonant is
// one of the ten letters listed below, then lets fix_va_start repair the new
// beginning of the word. Signals f (and changes nothing) when the word does
// not start with such a sequence.
define remove_question_prefixes as (
    [
        '{e}'
        among (
            '{ka}' '{ca}' '{tha}' '{va}' '{na}'
            '{pa}' '{ma}' '{ya}' '{nga}' '{nya}'
        )
        '{pulli}'
    ] delete
    do fix_va_start
)
118
// Repairs the end of the word after an affix has been stripped.
//
// Signals t if an ending was fixed, f otherwise (including when the word has
// three characters or fewer). At most one alternative fires per call; the
// alternatives are tried in the order written, so longer or more specific
// endings must stay ahead of the shorter ones they contain. Call fix_endings
// to repeat until nothing more applies.
//
// All matching is done backwards, i.e. each pattern is a suffix of the word.
define fix_ending as (
    $(len > 3)
    backwards (
        // Trailing na+pulli, na+pulli+ta or na+pulli+ta+pulli: drop it.
        ( [ among('{na}{pulli}' '{na}{pulli}{ta}' '{na}{pulli}{ta}{pulli}') ] delete )
        or
        // Trailing ya+pulli preceded by vowel sign ai / i / ii: drop it.
        ( [ '{ya}{pulli}' test among('{vs_ai}' '{vs_i}' '{vs_ii}') ] delete )
        or
        // tta+pulli followed by pa+pulli or ka+pulli -> lla+pulli.
        ( [ '{tta}{pulli}{pa}{pulli}' or '{tta}{pulli}{ka}{pulli}' ] <- '{lla}{pulli}' )
        or
        // nnna+pulli+rra+pulli -> la+pulli.
        ( [ '{nnna}{pulli}{rra}{pulli}' ] <- '{la}{pulli}' )
        or
        // Disabled variant kept for reference:
        //   ( [ '{rra}{pulli}{ka}{pulli}'  or '{nnna}{pulli}{nnna}{pulli}' ] <- '{la}{pulli}'  )
        // rra+pulli+ka+pulli -> la+pulli.
        ( [ '{rra}{pulli}{ka}{pulli}' ] <- '{la}{pulli}' )
        or
        // tta+pulli+tta+pulli -> tta+vs_u.
        ( [ '{tta}{pulli}{tta}{pulli}' ] <- '{tta}{vs_u}' )
        or
        // Only after remove_vetrumai_urupukal has set its flag:
        // ta+pulli+ta+pulli not preceded by vs_ai -> ma+pulli.
        // (A stray `]` that used to follow the replacement has been removed.)
        ( found_vetrumai_urupu [ '{ta}{pulli}{ta}{pulli}' (test not '{vs_ai}') ] <- '{ma}{pulli}' )
        or
        // vs_u+ka+pulli, optionally followed by a second ka+pulli -> pulli.
        ( [ '{vs_u}{ka}{pulli}' or '{vs_u}{ka}{pulli}{ka}{pulli}' ] <- '{pulli}' )
        or
        // pulli + C1 + pulli + C2 at the very end (reading backwards), with C1
        // and C2 each one of ka/ca/tta/tha/pa/rra: drop all four characters.
        ( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete )
        or
        // A second `[ '{vs_u}{ka}{pulli}' ] <- '{pulli}'` alternative used to sit
        // here; it could never fire because the same literal is already tried
        // three alternatives earlier, so it has been removed.
        //
        // Final pulli preceded by one of ka/ca/tta/tha/pa/rra: drop both.
        ( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete )
        or
        // pulli + consonant + pulli, with the consonant taken from either of
        // the two lists below -> a single pulli.
        ( [ '{pulli}' (among('{ya}' '{ra}' '{la}' '{va}' '{zha}' '{lla}') or among('{nga}' '{nya}' '{nna}' '{na}' '{ma}' '{nnna}')) '{pulli}' ] <- '{pulli}' )
        or
        // Trailing va, ya or va+pulli: drop it.
        ( [ among('{va}' '{ya}' '{va}{pulli}') ] delete )
        or
        // Trailing nnna+vs_u not preceded by one of the listed vowel signs: drop it.
        ( [ '{nnna}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')) ] delete )
        or
        // Trailing nga+pulli: becomes ma+pulli unless vs_ai precedes it ...
        ( [ '{nga}{pulli}' (test not '{vs_ai}') ] <- '{ma}{pulli}' )
        or
        // ... in which case it is simply dropped.
        ( [ '{nga}{pulli}' ] delete )
        or
        // Final pulli directly after a listed vowel sign or another pulli: drop it.
        ( [ '{pulli}' (test (among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}') or '{pulli}')) ] delete )
    )
)
159
// Deletes a leading a / i / u followed by consonant + pulli, where the
// consonant is one of the ten letters listed below. On success it sets
// found_a_match and lets fix_va_start repair the new beginning of the word.
// When no such prefix is present the routine signals f with found_a_match
// left unset.
define remove_pronoun_prefixes as (
    unset found_a_match
    [
        among ( '{a}' '{i}' '{u}' )
        among (
            '{ka}' '{ca}' '{tha}' '{va}' '{na}'
            '{pa}' '{ma}' '{ya}' '{nga}' '{nya}'
        )
        '{pulli}'
    ] delete
    set found_a_match
    do fix_va_start
)
166
// Strips the suffix ka+lla+pulli from the end of the word. The three longer
// forms are tried first because each also rewrites the characters in front
// of the suffix; the bare suffix is the fallback. found_a_match is set only
// if one of the four alternatives applied; otherwise the routine signals f.
define remove_plural_suffix as (
    unset found_a_match
    backwards (
        (
            // vs_u+nga+pulli+ka+lla+pulli, not preceded by ka/ca/tta/tha/pa/rra -> pulli
            ( [ '{vs_u}{nga}{pulli}{ka}{lla}{pulli}' (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}')) ] <- '{pulli}' )
            or
            // rra+pulli+ka+lla+pulli -> la+pulli
            ( [ '{rra}{pulli}{ka}{lla}{pulli}' ] <- '{la}{pulli}' )
            or
            // tta+pulli+ka+lla+pulli -> lla+pulli
            ( [ '{tta}{pulli}{ka}{lla}{pulli}' ] <- '{lla}{pulli}' )
            or
            // plain ka+lla+pulli: delete
            ( [ '{ka}{lla}{pulli}' ] delete )
        )
        set found_a_match
    )
)
177
// For words that pass has_min_length: if the word ends in vowel sign
// oo, ee or aa, replace that sign with pulli and set found_a_match.
// fix_endings is run afterwards whether or not a replacement happened.
define remove_question_suffixes as (
    has_min_length
    unset found_a_match
    backwards (
        do (
            [ '{vs_oo}' or '{vs_ee}' or '{vs_aa}' ] <- '{pulli}'
            set found_a_match
        )
    )
    do fix_endings
)
189
// For words that pass has_min_length: deletes a trailing pa+vs_i or
// va+vs_i and sets found_a_match. Signals f when neither ending is present.
define remove_command_suffixes as (
    has_min_length
    unset found_a_match
    backwards (
        [ '{pa}{vs_i}' or '{va}{vs_i}' ] delete
        set found_a_match
    )
)
198
// For words that pass has_min_length: replaces a trailing vs_u+ma+pulli
// with pulli, sets found_a_match, then applies a single fix_ending pass
// (not the repeated fix_endings). Signals f if the suffix is absent.
define remove_um as (
    unset found_a_match
    has_min_length
    backwards (
        [ '{vs_u}{ma}{pulli}' ] <- '{pulli}'
        set found_a_match
    )
    do fix_ending
)
207
// Removes common words that are written attached to the end of another
// word. These are not true suffixes, but stripping them helps stemming.
//
// Two groups are tried, in order, on words that pass has_min_length:
//   1. endings that begin with a vowel sign (plus la+pulli+la): the whole
//      ending is replaced by pulli;
//   2. endings that begin with a consonant (plus one vs_e-initial form):
//      the ending is deleted outright.
// found_a_match is set when either group applied; otherwise the routine
// signals f. fix_endings runs after a successful removal.
//
// A second, unreachable copy of '{vs_aa}{ka}{vs_i}' that used to end the
// first list has been removed; the literal is already tried earlier in the
// same `or` chain, so the duplicate could never match.
define remove_common_word_endings as (
    unset found_a_match
    has_min_length
    backwards (
        test (
            [
                '{vs_u}{tta}{nnna}{pulli}' or
                '{vs_i}{la}{pulli}{la}{vs_ai}' or
                '{vs_i}{tta}{ma}{pulli}' or
                '{vs_i}{nnna}{pulli}{rra}{vs_i}' or
                '{vs_aa}{ka}{vs_i}' or
                '{vs_aa}{ka}{vs_i}{ya}' or
                '{vs_e}{nnna}{pulli}{rra}{vs_u}' or
                '{vs_u}{lla}{pulli}{lla}' or
                '{vs_u}{tta}{vs_ai}{ya}' or
                '{vs_u}{tta}{vs_ai}' or
                '{vs_e}{nnna}{vs_u}{ma}{pulli}' or
                // la+pulli+la only counts when no listed vowel sign precedes it.
                ('{la}{pulli}{la}' test (not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
                '{vs_e}{nnna}'
            ] <- '{pulli}'
            (set found_a_match)
        )
        or
        test (
            [ among(
                '{pa}{tta}{vs_u}'
                '{pa}{tta}{pulli}{tta}'
                '{pa}{tta}{pulli}{tta}{vs_u}'
                '{pa}{tta}{pulli}{tta}{ta}{vs_u}'
                '{pa}{tta}{pulli}{tta}{nna}'
                '{ka}{vs_u}{ra}{vs_i}{ya}'
                '{pa}{rra}{pulli}{rra}{vs_i}'
                '{va}{vs_i}{tta}{vs_u}'
                '{va}{vs_i}{tta}{pulli}{tta}{vs_u}'
                '{pa}{tta}{vs_i}{ta}{vs_aa}{nnna}'
                '{pa}{tta}{vs_i}'
                '{ta}{vs_aa}{nnna}'
                '{vs_e}{la}{pulli}{la}{vs_aa}{ma}{pulli}'
            ) ] delete
            (set found_a_match)
        )
    )
    do fix_endings
)
251
// Strips one ending from the five groups below, working on words that pass
// has_min_length. The groups are tried in order and the first that applies
// wins:
//   1. nnna+vs_ai                                  -> deleted
//   2. vs_ai-final endings (context-checked)       -> pulli
//   3. a list of vowel-sign-initial endings        -> pulli
//   4. a list of consonant-initial endings         -> deleted
//   5. a final vs_ii                               -> vs_i
// On success both found_a_match and found_vetrumai_urupu are set, a
// remaining trailing vs_i+nnna+pulli is reduced to pulli, and fix_endings
// is run. If no group applies the routine signals f with both flags unset.
define remove_vetrumai_urupukal as (
    unset found_a_match
    unset found_vetrumai_urupu
    has_min_length
    backwards (
        (
            // group 1
            test ( [ '{nnna}{vs_ai}' ] delete )
            or
            // group 2: either of the first two literals must NOT be preceded by
            // ka/ca/tta/tha/pa/rra; failing that, a bare vs_ai is accepted when
            // it IS preceded by one of those consonants with a pulli before it.
            test (
                [
                    ( '{vs_i}{nnna}{vs_ai}' or
                      '{vs_ai}' (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}')) )
                    or
                    ( '{vs_ai}' (test (among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}')) )
                ] <- '{pulli}'
            )
            or
            // group 3
            test (
                [
                    '{vs_o}{tta}{vs_u}' or
                    '{vs_oo}{tta}{vs_u}' or
                    '{vs_i}{la}{pulli}' or
                    '{vs_i}{rra}{pulli}' or
                    ('{vs_i}{nnna}{pulli}' (test not '{ma}')) or
                    '{vs_i}{nnna}{pulli}{rra}{vs_u}' or
                    '{vs_i}{ra}{vs_u}{na}{pulli}{ta}{vs_u}' or
                    '{va}{vs_i}{tta}' or
                    // this ending is only considered for words of 7+ characters
                    ($(len >= 7) '{vs_i}{tta}{ma}{pulli}') or
                    '{vs_aa}{la}{pulli}' or
                    '{vs_u}{tta}{vs_ai}' or
                    '{vs_aa}{ma}{la}{pulli}' or
                    ('{la}{pulli}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
                    '{vs_u}{lla}{pulli}'
                ] <- '{pulli}'
            )
            or
            // group 4
            test (
                [
                    '{ka}{nna}{pulli}' or
                    '{ma}{vs_u}{nnna}{pulli}' or
                    '{ma}{vs_ee}{la}{pulli}' or
                    '{ma}{vs_ee}{rra}{pulli}' or
                    '{ka}{vs_ii}{llla}{pulli}' or
                    '{pa}{vs_i}{nnna}{pulli}' or
                    ('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')))
                ] delete
            )
            or
            // group 5
            test ( [ '{vs_ii}' ] <- '{vs_i}' )
        )
        (set found_a_match)
        (set found_vetrumai_urupu)
        // optional follow-up clean-up of a trailing vs_i+nnna+pulli
        do ( [ '{vs_i}{nnna}{pulli}' ] <- '{pulli}' )
    )
    do fix_endings
)
303
// Calls remove_tense_suffix again and again for as long as the previous
// call reported a change through found_a_match. The flag is primed to true
// so that the first pass always runs.
define remove_tense_suffixes as (
    set found_a_match
    repeat (
        found_a_match
        do remove_tense_suffix
    )
)
308
// Removes one layer of suffix from the end of a word that passes
// has_min_length, and sets found_a_match if anything was removed.
//
// Stage A (at most one of four groups, tried in order):
//   A1. two long fixed endings                       -> deleted
//   A2. a list of consonant-initial endings          -> deleted
//   A3. a list of (mostly) vowel-sign-initial endings -> pulli
//   A4. ka+vs_u / ta+vs_u directly after a pulli     -> deleted
// Stage B (always attempted, independent of stage A):
//   a fixed list of six endings                      -> deleted
// fix_endings is run afterwards in every case.
//
// Changes from the earlier form of this routine, none of which alters the
// result for any input:
//   * four literals in the A3 list ('{ka}{vs_u}{ma}{pulli}',
//     '{ta}{vs_u}{ma}{pulli}', '{rra}{vs_u}{ma}{pulli}' and
//     '{nnna}{vs_e}{nnna}{pulli}') were dropped: A2 already lists each of them
//     and A3 is only tried when A2 failed, so they could never match;
//   * in A4 the opening `[` now sits outside the parenthesised alternatives,
//     so the slice brackets nest visibly.
define remove_tense_suffix as (
    unset found_a_match
    has_min_length
    backwards (
        do (
            // A1
            test (
                [ among(
                    '{ka}{vs_o}{nna}{pulli}{tta}{vs_i}{ra}{pulli}'
                    '{pa}{tta}{vs_u}'
                ) ] delete
                (set found_a_match)
            )
            or
            // A2
            test (
                [
                    '{ma}{vs_aa}{ra}{pulli}' or
                    '{ma}{vs_i}{nnna}{pulli}' or
                    '{nnna}{nnna}{pulli}' or
                    '{nnna}{vs_aa}{nnna}{pulli}' or
                    '{nnna}{vs_aa}{lla}{pulli}' or
                    '{nnna}{vs_aa}{ra}{pulli}' or
                    // va+nnna+pulli only when no independent vowel letter precedes it
                    ('{va}{nnna}{pulli}' test (not among('{a}' '{aa}' '{i}' '{ii}' '{u}' '{uu}' '{e}' '{ee}' '{ai}' '{o}' '{oo}' '{au}')) ) or
                    '{nnna}{lla}{pulli}' or
                    '{va}{lla}{pulli}' or
                    '{nnna}{ra}{pulli}' or
                    '{va}{ra}{pulli}' or
                    '{nnna}' or '{pa}' or '{ka}' or '{ta}' or '{ya}' or
                    '{pa}{nnna}{pulli}' or
                    '{pa}{lla}{pulli}' or
                    '{pa}{ra}{pulli}' or
                    // ta+vs_u only when none of the listed vowel signs precedes it
                    ('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
                    '{vs_i}{rra}{pulli}{rra}{vs_u}' or
                    '{pa}{ma}{pulli}' or
                    '{nnna}{ma}{pulli}' or
                    '{ta}{vs_u}{ma}{pulli}' or
                    '{rra}{vs_u}{ma}{pulli}' or
                    '{ka}{vs_u}{ma}{pulli}' or
                    '{nnna}{vs_e}{nnna}{pulli}' or
                    '{nnna}{vs_ai}' or
                    '{va}{vs_ai}'
                ] delete
                (set found_a_match)
            )
            or
            // A3
            test (
                [
                    // vs_aa+nnna+pulli only when ca does not precede it
                    ('{vs_aa}{nnna}{pulli}' test (not '{ca}')) or
                    '{vs_aa}{lla}{pulli}' or
                    '{vs_aa}{ra}{pulli}' or
                    '{vs_ee}{nnna}{pulli}' or
                    '{vs_aa}' or
                    '{vs_aa}{ma}{pulli}' or
                    '{vs_e}{ma}{pulli}' or
                    '{vs_ee}{ma}{pulli}' or
                    '{vs_oo}{ma}{pulli}' or
                    '{tta}{vs_u}{ma}{pulli}' or
                    '{vs_aa}{ya}{pulli}' or
                    '{nnna}{vs_i}{ra}{pulli}' or
                    '{vs_ii}{ra}{pulli}' or
                    '{vs_ii}{ya}{ra}{pulli}'
                ] <- '{pulli}'
                (set found_a_match)
            )
            or
            // A4
            test (
                [ ( '{ka}{vs_u}' or '{ta}{vs_u}' ) (test '{pulli}') ] delete
                (set found_a_match)
            )
        )
        // Stage B
        do (
            [ among(
                '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}'
                '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}{pulli}'
                '{ka}{vs_i}{nnna}{pulli}{rra}'
                '{ka}{vs_i}{nnna}{pulli}{rra}{pulli}'
                '{ka}{vs_i}{rra}'
                '{ka}{vs_i}{rra}{pulli}'
            ) ] delete
            (set found_a_match)
        )
    )
    do fix_endings
)
391
// Stemmer entry point.
//
// A single fix_ending pass is applied to every word. Words that then fail
// has_min_length are left as they are; longer words go through the prefix
// and suffix strippers in the fixed order below. Each stripper is wrapped
// in `do`, so one that finds nothing to remove does not stop the pipeline.
define stem as (
    // fix_ending consults this flag, so start from a known state
    unset found_vetrumai_urupu
    do fix_ending

    // nothing below runs for short words
    has_min_length

    // prefixes
    do remove_question_prefixes
    do remove_pronoun_prefixes

    // suffixes
    do remove_question_suffixes
    do remove_um
    do remove_common_word_endings
    do remove_vetrumai_urupukal
    do remove_plural_suffix
    do remove_command_suffixes
    do remove_tense_suffixes
)
406