1// Alias: ta
2
3/*
4* Affix stripping stemming algorithm for Tamil
5* By Damodharan Rajalingam
6*/
7
// Inside string literals, {name} expands a stringdef and {U+XXXX} denotes
// a Unicode code point.
stringescapes {}

/* Aytham */
stringdef aytham '{U+0B83}'

/* Uyir - the twelve independent vowels */
stringdef a  '{U+0B85}'
stringdef aa '{U+0B86}'
stringdef i  '{U+0B87}'
stringdef ii '{U+0B88}'
stringdef u  '{U+0B89}'
stringdef uu '{U+0B8A}'
stringdef e  '{U+0B8E}'
stringdef ee '{U+0B8F}'
stringdef ai '{U+0B90}'
stringdef o  '{U+0B92}'
stringdef oo '{U+0B93}'
stringdef au '{U+0B94}'

/*
 * Consonants.
 * Two code points are reachable under two names each:
 *   ta   and tha  are both U+0BA4
 *   llla and zha  are both U+0BB4
 */
stringdef ka   '{U+0B95}'
stringdef nga  '{U+0B99}'
stringdef ca   '{U+0B9A}'
stringdef ja   '{U+0B9C}'
stringdef nya  '{U+0B9E}'
stringdef tta  '{U+0B9F}'
stringdef nna  '{U+0BA3}'
stringdef ta   '{U+0BA4}'
stringdef tha  '{U+0BA4}'
stringdef na   '{U+0BA8}'
stringdef nnna '{U+0BA9}'
stringdef pa   '{U+0BAA}'
stringdef ma   '{U+0BAE}'
stringdef ya   '{U+0BAF}'
stringdef ra   '{U+0BB0}'
stringdef rra  '{U+0BB1}'
stringdef la   '{U+0BB2}'
stringdef lla  '{U+0BB3}'
stringdef llla '{U+0BB4}'
stringdef zha  '{U+0BB4}'
stringdef va   '{U+0BB5}'

/* Vatamozi - borrowed consonants */
stringdef sha '{U+0BB6}'
stringdef ssa '{U+0BB7}'
stringdef sa  '{U+0BB8}'
stringdef ha  '{U+0BB9}'

/* Dependent vowel signs (kombu etc.), prefixed vs_ */
stringdef vs_aa '{U+0BBE}'
stringdef vs_i  '{U+0BBF}'
stringdef vs_ii '{U+0BC0}'
stringdef vs_u  '{U+0BC1}'
stringdef vs_uu '{U+0BC2}'
stringdef vs_e  '{U+0BC6}'
stringdef vs_ee '{U+0BC7}'
stringdef vs_ai '{U+0BC8}'
stringdef vs_o  '{U+0BCA}'
stringdef vs_oo '{U+0BCB}'
stringdef vs_au '{U+0BCC}'

/* Pulli */
stringdef pulli '{U+0BCD}'

/* AU length mark */
stringdef au_lmark '{U+0BD7}'
75
76
// Every routine defined in this file, grouped by role.
routines (
    // length guard
    has_min_length

    // clean-up helpers
    fix_va_start
    fix_ending
    fix_endings

    // prefix removal
    remove_question_prefixes
    remove_pronoun_prefixes

    // suffix removal
    remove_question_suffixes
    remove_um
    remove_common_word_endings
    remove_vetrumai_urupukal
    remove_plural_suffix
    remove_command_suffixes
    remove_tense_suffix
    remove_tense_suffixes
)

// The single routine visible to callers of the generated stemmer.
externals ( stem )

booleans (
    // set by a removal routine once it has matched and edited the word
    found_a_match
    // set by remove_vetrumai_urupukal on a match; consulted by fix_ending
    found_vetrumai_urupu
)
100
// Length guard shared by the removal routines: signals t only while the
// word's len is greater than 4, and f otherwise.
define has_min_length as $(len > 4)
104
// Rewrites 'va' + vowel sign (oo, o, u or uu) found at the cursor as the
// matching independent vowel. Signals f, leaving the word untouched, when
// none of the four sequences is present at the cursor.
//
// The four keys all start with 'va' and differ in their second character,
// so at most one of them can match; the longest-match rule of among never
// has to pick between them.
define fix_va_start as (
    [substring] among(
        '{va}{vs_oo}' (<- '{oo}')
        '{va}{vs_o}'  (<- '{o}')
        '{va}{vs_u}'  (<- '{u}')
        '{va}{vs_uu}' (<- '{uu}')
    )
)
111
// Runs fix_ending again and again until it signals f, i.e. until no rule
// in fix_ending applies any more. Wrapped in `do`, so this routine itself
// always signals t.
define fix_endings as (
    do (
        repeat fix_ending
    )
)
115
// Deletes a prefix made of '{e}', one of the listed consonants and a
// pulli, then gives fix_va_start a chance to repair the new beginning.
// Signals f without changing anything when the prefix is not present.
define remove_question_prefixes as (
    [
        '{e}'
        among(
            '{ka}' '{ca}' '{tha}' '{va}' '{na}'
            '{pa}' '{ma}' '{ya}' '{nga}' '{nya}'
        )
        '{pulli}'
    ]
    delete
    do fix_va_start
)
120
// Gives signal t if an ending was fixed, signal f otherwise.
//
// Words whose len is 3 or less are never touched. The rules are tried in
// order, working backwards from the end of the word, and only the first
// rule that matches is applied - so the order of the `or` chain matters.
// In the comments below, sequences are written in reading order.
define fix_ending as (
    $(len > 3)
    backwards (
        // na+pulli, optionally followed by ta or ta+pulli: drop it
        ( [ among('{na}{pulli}' '{na}{pulli}{ta}' '{na}{pulli}{ta}{pulli}') ] delete )
        or
        // ya+pulli directly after vowel sign ai, i or ii: drop it
        ( [ '{ya}{pulli}' test among('{vs_ai}' '{vs_i}' '{vs_ii}') ] delete )
        or
        ( [ '{tta}{pulli}{pa}{pulli}' or '{tta}{pulli}{ka}{pulli}' ] <- '{lla}{pulli}' )
        or
        ( [ '{nnna}{pulli}{rra}{pulli}' ] <- '{la}{pulli}' )
        or
        ( [ '{rra}{pulli}{ka}{pulli}' ] <- '{la}{pulli}' )
        or
        ( [ '{tta}{pulli}{tta}{pulli}' ] <- '{tta}{vs_u}' )
        or
        // only once remove_vetrumai_urupukal has matched, and only when the
        // sequence is not preceded by vowel sign ai
        ( found_vetrumai_urupu [ '{ta}{pulli}{ta}{pulli}' (test not '{vs_ai}') ] <- '{ma}{pulli}' )
        or
        ( [ '{vs_u}{ka}{pulli}' or '{vs_u}{ka}{pulli}{ka}{pulli}' ] <- '{pulli}' )
        or
        // doubled hard-consonant ending: pulli, consonant, pulli, consonant
        // when read from the end of the word
        ( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete )
        or
        // single consonant+pulli from the same list
        ( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete )
        or
        // pulli + one of the listed consonants + pulli collapses to a pulli
        ( [ '{pulli}' (among('{ya}' '{ra}' '{la}' '{va}' '{zha}' '{lla}') or among('{nga}' '{nya}' '{nna}' '{na}' '{ma}' '{nnna}')) '{pulli}' ] <- '{pulli}' )
        or
        ( [ among('{va}' '{ya}' '{va}{pulli}') ] delete )
        or
        // nnna+u, unless one of the listed vowel signs comes right before it
        ( [ '{nnna}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')) ] delete )
        or
        // nga+pulli becomes ma+pulli, except after vowel sign ai ...
        ( [ '{nga}{pulli}' (test not '{vs_ai}') ] <- '{ma}{pulli}' )
        or
        // ... where it is dropped instead
        ( [ '{nga}{pulli}' ] delete )
        or
        // a pulli that directly follows a vowel sign or another pulli
        ( [ '{pulli}' (test (among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}') or '{pulli}')) ] delete )
    )
)
161
// Deletes a prefix made of '{a}', '{i}' or '{u}', one of the listed
// consonants and a pulli. On a match found_a_match is set and
// fix_va_start is given a chance to repair the new beginning; otherwise
// the routine signals f with found_a_match left unset.
define remove_pronoun_prefixes as (
    unset found_a_match
    [
        among('{a}' '{i}' '{u}')
        among(
            '{ka}' '{ca}' '{tha}' '{va}' '{na}'
            '{pa}' '{ma}' '{ya}' '{nga}' '{nya}'
        )
        '{pulli}'
    ]
    delete
    set found_a_match
    do fix_va_start
)
168
// Strips a word-final ka+lla+pulli, together with a little of what comes
// before it in three special cases. The cases are tried in order and the
// first match wins; found_a_match is set only when one of them applied,
// otherwise the routine signals f.
define remove_plural_suffix as (
    unset found_a_match
    backwards (
        (
            // u+nga+pulli in front, provided no listed consonant precedes it
            (
                [
                    '{vs_u}{nga}{pulli}{ka}{lla}{pulli}'
                    (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}'))
                ] <- '{pulli}'
            )
            or
            // rra+pulli in front
            ( [ '{rra}{pulli}{ka}{lla}{pulli}' ] <- '{la}{pulli}' )
            or
            // tta+pulli in front
            ( [ '{tta}{pulli}{ka}{lla}{pulli}' ] <- '{lla}{pulli}' )
            or
            // the bare suffix
            ( [ '{ka}{lla}{pulli}' ] delete )
        )
        set found_a_match
    )
)
179
// For words that pass has_min_length: a final vowel sign oo, ee or aa is
// replaced by a pulli (setting found_a_match), and fix_endings is then run
// whether or not that replacement happened. Signals f straight away for
// words that fail has_min_length.
define remove_question_suffixes as (
    has_min_length
    unset found_a_match
    backwards (
        do (
            [ among('{vs_oo}' '{vs_ee}' '{vs_aa}') ]
            <- '{pulli}'
            set found_a_match
        )
    )
    do fix_endings
)
191
// For words that pass has_min_length: deletes a final pa+i or va+i and
// sets found_a_match. Signals f, with found_a_match unset, when neither
// ending is present.
define remove_command_suffixes as (
    has_min_length
    unset found_a_match
    backwards (
        [ among('{va}{vs_i}' '{pa}{vs_i}') ]
        delete
        set found_a_match
    )
)
200
// For words that pass has_min_length: replaces a final u+ma+pulli by a
// pulli, sets found_a_match and then applies fix_ending once. Signals f
// without running fix_ending when the ending is not present.
define remove_um as (
    unset found_a_match
    has_min_length
    backwards (
        [ '{vs_u}{ma}{pulli}' ]
        <- '{pulli}'
        set found_a_match
    )
    do fix_ending
)
209
// These are not suffixes actually but are some words that are attached to
// other words and can be removed for stemming.
//
// Applies to words that pass has_min_length only. Two groups are tried in
// order: endings of the first group are replaced by a pulli, endings of
// the second group are deleted outright. On a match found_a_match is set
// and fix_endings is run; with no match the routine signals f.
define remove_common_word_endings as (
    unset found_a_match
    has_min_length
    backwards (
        // Group 1: replaced by a pulli. Alternatives are tried top to
        // bottom and the first that matches wins.
        test (
            [
                '{vs_u}{tta}{nnna}{pulli}' or
                '{vs_i}{la}{pulli}{la}{vs_ai}' or
                '{vs_i}{tta}{ma}{pulli}' or
                '{vs_i}{nnna}{pulli}{rra}{vs_i}' or
                '{vs_aa}{ka}{vs_i}' or
                '{vs_aa}{ka}{vs_i}{ya}' or
                '{vs_e}{nnna}{pulli}{rra}{vs_u}' or
                '{vs_u}{lla}{pulli}{lla}' or
                '{vs_u}{tta}{vs_ai}{ya}' or
                '{vs_u}{tta}{vs_ai}' or
                '{vs_e}{nnna}{vs_u}{ma}{pulli}' or
                // la+pulli+la, unless a listed vowel sign comes right before
                ('{la}{pulli}{la}' test (not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
                '{vs_e}{nnna}'
            ] <- '{pulli}'
            set found_a_match
        )
        or
        // Group 2: deleted. among picks the longest ending that matches.
        test (
            [ among(
                '{pa}{tta}{vs_u}'
                '{pa}{tta}{pulli}{tta}'
                '{pa}{tta}{pulli}{tta}{vs_u}'
                '{pa}{tta}{pulli}{tta}{ta}{vs_u}'
                '{pa}{tta}{pulli}{tta}{nna}'
                '{ka}{vs_u}{ra}{vs_i}{ya}'
                '{pa}{rra}{pulli}{rra}{vs_i}'
                '{va}{vs_i}{tta}{vs_u}'
                '{va}{vs_i}{tta}{pulli}{tta}{vs_u}'
                '{pa}{tta}{vs_i}{ta}{vs_aa}{nnna}'
                '{pa}{tta}{vs_i}'
                '{ta}{vs_aa}{nnna}'
                '{vs_e}{la}{pulli}{la}{vs_aa}{ma}{pulli}'
            ) ] delete
            set found_a_match
        )
    )
    do fix_endings
)
253
// Removes one vetrumai urupu from the end of the word.
//
// Applies to words that pass has_min_length only. Five groups are tried in
// order and the first one that matches is applied. On a match both
// found_a_match and found_vetrumai_urupu are set, a trailing i+nnna+pulli
// (if any) is reduced to a pulli, and fix_endings is run. With no match
// the routine signals f and both flags stay unset.
define remove_vetrumai_urupukal as (
    unset found_a_match
    unset found_vetrumai_urupu
    has_min_length
    backwards (
        (
            // Group 1: nnna+ai is deleted unconditionally.
            test ( [ '{nnna}{vs_ai}' ] delete )
            or
            // Group 2: vowel sign ai becomes a pulli when it is either not
            // preceded by a listed consonant at all, or preceded by
            // pulli + listed consonant.
            test (
                [
                    '{vs_ai}'
                    (
                        (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}'))
                        or
                        (test (among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}'))
                    )
                ] <- '{pulli}'
            )
            or
            // Group 3: endings replaced by a pulli; first match wins.
            test (
                [
                    '{vs_o}{tta}{vs_u}' or
                    '{vs_oo}{tta}{vs_u}' or
                    '{vs_i}{la}{pulli}' or
                    '{vs_i}{rra}{pulli}' or
                    ('{vs_i}{nnna}{pulli}' (test not '{ma}')) or
                    '{vs_i}{nnna}{pulli}{rra}{vs_u}' or
                    '{vs_i}{ra}{vs_u}{na}{pulli}{ta}{vs_u}' or
                    '{va}{vs_i}{tta}' or
                    ($(len >= 7) '{vs_i}{tta}{ma}{pulli}') or
                    '{vs_aa}{la}{pulli}' or
                    '{vs_u}{tta}{vs_ai}' or
                    '{vs_aa}{ma}{la}{pulli}' or
                    ('{la}{pulli}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
                    '{vs_u}{lla}{pulli}'
                ] <- '{pulli}'
            )
            or
            // Group 4: endings deleted outright; first match wins.
            test (
                [
                    '{ka}{nna}{pulli}' or
                    '{ma}{vs_u}{nnna}{pulli}' or
                    '{ma}{vs_ee}{la}{pulli}' or
                    '{ma}{vs_ee}{rra}{pulli}' or
                    '{ka}{vs_ii}{llla}{pulli}' or
                    ('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')))
                ] delete
            )
            or
            // Group 5: a final long ii is shortened to i.
            test ( [ '{vs_ii}' ] <- '{vs_i}' )
        )
        set found_a_match
        set found_vetrumai_urupu
        do ( [ '{vs_i}{nnna}{pulli}' ] <- '{pulli}' )
    )
    do fix_endings
)
305
// Keeps calling remove_tense_suffix for as long as the previous call
// reported a match through found_a_match. The flag is primed to true so
// that the first call always happens.
define remove_tense_suffixes as (
    set found_a_match
    repeat (
        found_a_match
        do remove_tense_suffix
    )
)
310
// Removes one layer of tense suffix and reports through found_a_match
// whether anything was removed.
//
// Applies to words that pass has_min_length only; shorter words make the
// routine signal f with found_a_match unset. Otherwise four groups of
// endings are tried in order (first matching group wins), then a separate
// final deletion step is attempted regardless, and fix_endings is run.
define remove_tense_suffix as (
    unset found_a_match
    has_min_length
    backwards (
        do (
            // Group 1: deleted; among picks the longest match.
            test (
                [ among(
                    '{ka}{vs_o}{nna}{pulli}{tta}{vs_i}{ra}{pulli}'
                    '{pa}{tta}{vs_u}'
                ) ] delete
                set found_a_match
            )
            or
            // Group 2: deleted; alternatives are tried top to bottom.
            test (
                [
                    '{ma}{vs_aa}{ra}{pulli}' or
                    '{ma}{vs_i}{nnna}{pulli}' or
                    '{nnna}{nnna}{pulli}' or
                    '{nnna}{vs_aa}{nnna}{pulli}' or
                    '{nnna}{vs_aa}{lla}{pulli}' or
                    '{nnna}{vs_aa}{ra}{pulli}' or
                    // va+nnna+pulli, unless an independent vowel precedes it
                    ('{va}{nnna}{pulli}' test (not among('{a}' '{aa}' '{i}' '{ii}' '{u}' '{uu}' '{e}' '{ee}' '{ai}' '{o}' '{oo}' '{au}')) ) or
                    '{nnna}{lla}{pulli}' or
                    '{va}{lla}{pulli}' or
                    '{nnna}{ra}{pulli}' or
                    '{va}{ra}{pulli}' or
                    '{nnna}' or '{pa}' or '{ka}' or '{ta}' or '{ya}' or
                    '{pa}{nnna}{pulli}' or
                    '{pa}{lla}{pulli}' or
                    '{pa}{ra}{pulli}' or
                    // ta+u, unless a listed vowel sign precedes it
                    ('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
                    '{vs_i}{rra}{pulli}{rra}{vs_u}' or
                    '{pa}{ma}{pulli}' or
                    '{nnna}{ma}{pulli}' or
                    '{ta}{vs_u}{ma}{pulli}' or
                    '{rra}{vs_u}{ma}{pulli}' or
                    '{ka}{vs_u}{ma}{pulli}' or
                    '{nnna}{vs_e}{nnna}{pulli}' or
                    '{nnna}{vs_ai}' or
                    '{va}{vs_ai}'
                ] delete
                set found_a_match
            )
            or
            // Group 3: replaced by a pulli; alternatives tried top to bottom.
            test (
                [
                    // aa+nnna+pulli, unless ca precedes it
                    ('{vs_aa}{nnna}{pulli}' test (not '{ca}')) or
                    '{vs_aa}{lla}{pulli}' or
                    '{vs_aa}{ra}{pulli}' or
                    '{vs_ee}{nnna}{pulli}' or
                    '{vs_aa}' or
                    '{vs_aa}{ma}{pulli}' or
                    '{vs_e}{ma}{pulli}' or
                    '{vs_ee}{ma}{pulli}' or
                    '{vs_oo}{ma}{pulli}' or
                    '{tta}{vs_u}{ma}{pulli}' or
                    '{vs_aa}{ya}{pulli}' or
                    '{nnna}{vs_i}{ra}{pulli}' or
                    '{vs_ii}{ra}{pulli}' or
                    '{vs_ii}{ya}{ra}{pulli}'
                ] <- '{pulli}'
                set found_a_match
            )
            or
            // Group 4: ka+u directly after a pulli is deleted.
            test (
                [ '{ka}{vs_u}' (test '{pulli}') ] delete
                set found_a_match
            )
        )
        // Attempted after the groups above, whatever their outcome.
        do (
            [ among(
                '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}'
                '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}{pulli}'
                '{ka}{vs_i}{nnna}{pulli}{rra}'
                '{ka}{vs_i}{nnna}{pulli}{rra}{pulli}'
                '{ka}{vs_i}{rra}'
                '{ka}{vs_i}{rra}{pulli}'
            ) ] delete
            set found_a_match
        )
    )
    do fix_endings
)
393
// Entry point of the stemmer.
//
// One pass of fix_ending is applied to every word. Words that then fail
// has_min_length are left as they are (the routine signals f); all other
// words go through the removal routines below in this fixed order. Each
// step is wrapped in `do`, so a step that finds nothing to remove does not
// stop the steps after it.
define stem as (
    unset found_vetrumai_urupu
    do fix_ending
    has_min_length

    // prefixes
    do remove_question_prefixes
    do remove_pronoun_prefixes

    // suffixes
    do remove_question_suffixes
    do remove_um
    do remove_common_word_endings
    do remove_vetrumai_urupukal
    do remove_plural_suffix
    do remove_command_suffixes
    do remove_tense_suffixes
)
408