// Alias: ta

/*
 * Affix stripping stemming algorithm for Tamil
 * By Damodharan Rajalingam
 */

stringescapes {}

/*
 * Symbolic names for the Tamil code points used by the rules.
 * Two pairs are deliberate aliases for the same code point:
 * ta/tha (U+0BA4) and llla/zha (U+0BB4).
 */

/* Aytham */
stringdef aytham    '{U+0B83}'

/* Uyir - independent vowels */
stringdef a         '{U+0B85}'
stringdef aa        '{U+0B86}'
stringdef i         '{U+0B87}'
stringdef ii        '{U+0B88}'
stringdef u         '{U+0B89}'
stringdef uu        '{U+0B8A}'
stringdef e         '{U+0B8E}'
stringdef ee        '{U+0B8F}'
stringdef ai        '{U+0B90}'
stringdef o         '{U+0B92}'
stringdef oo        '{U+0B93}'
stringdef au        '{U+0B94}'

/* Consonants */
stringdef ka        '{U+0B95}'
stringdef nga       '{U+0B99}'
stringdef ca        '{U+0B9A}'
stringdef ja        '{U+0B9C}'
stringdef nya       '{U+0B9E}'
stringdef tta       '{U+0B9F}'
stringdef nna       '{U+0BA3}'
stringdef ta        '{U+0BA4}'
stringdef tha       '{U+0BA4}'
stringdef na        '{U+0BA8}'
stringdef nnna      '{U+0BA9}'
stringdef pa        '{U+0BAA}'
stringdef ma        '{U+0BAE}'
stringdef ya        '{U+0BAF}'
stringdef ra        '{U+0BB0}'
stringdef rra       '{U+0BB1}'
stringdef la        '{U+0BB2}'
stringdef lla       '{U+0BB3}'
stringdef llla      '{U+0BB4}'
stringdef zha       '{U+0BB4}'
stringdef va        '{U+0BB5}'

/* Vatamozi - borrowed */
stringdef sha       '{U+0BB6}'
stringdef ssa       '{U+0BB7}'
stringdef sa        '{U+0BB8}'
stringdef ha        '{U+0BB9}'

/* Dependent vowel signs (kombu etc.) */
stringdef vs_aa     '{U+0BBE}'
stringdef vs_i      '{U+0BBF}'
stringdef vs_ii     '{U+0BC0}'
stringdef vs_u      '{U+0BC1}'
stringdef vs_uu     '{U+0BC2}'
stringdef vs_e      '{U+0BC6}'
stringdef vs_ee     '{U+0BC7}'
stringdef vs_ai     '{U+0BC8}'
stringdef vs_o      '{U+0BCA}'
stringdef vs_oo     '{U+0BCB}'
stringdef vs_au     '{U+0BCC}'

/* Pulli */
stringdef pulli     '{U+0BCD}'

/* AU length mark */
stringdef au_lmark  '{U+0BD7}'


routines (
    has_min_length
    fix_va_start
    fix_ending
    fix_endings
    remove_question_prefixes
    remove_pronoun_prefixes
    remove_question_suffixes
    remove_um
    remove_common_word_endings
    remove_vetrumai_urupukal
    remove_plural_suffix
    remove_command_suffixes
    remove_tense_suffix
    remove_tense_suffixes
)

externals ( stem )

booleans (
    found_a_match
    found_vetrumai_urupu
)

// Signals t only when the word is longer than 4 characters.
define has_min_length as (
    $(len > 4)
)

// At the cursor, rewrites {va} followed by one of the vowel signs
// oo / o / u / uu into the matching independent vowel.
// Signals f when none of the four sequences is present.
define fix_va_start as (
    [substring] among(
        '{va}{vs_oo}' (<- '{oo}')
        '{va}{vs_o}'  (<- '{o}')
        '{va}{vs_u}'  (<- '{u}')
        '{va}{vs_uu}' (<- '{uu}')
    )
)

// Applies fix_ending over and over until it signals f; never fails itself.
define fix_endings as (
    do (repeat fix_ending)
)

// Deletes a prefix made of {e}, one of the listed consonants and a pulli,
// then lets fix_va_start tidy what is left at the cursor.
// Signals f (leaving the word untouched) when that prefix is absent.
define remove_question_prefixes as (
    [ '{e}' among('{ka}' '{ca}' '{tha}' '{va}' '{na}' '{pa}' '{ma}' '{ya}' '{nga}' '{nya}') '{pulli}' ] delete
    do fix_va_start
)

// Gives signal t if an ending was fixed, signal f otherwise.
define fix_ending as (
    // Only words longer than 3 characters are touched.
    $(len > 3)
    // The alternatives are tried in order; the first one that matches at the
    // end of the word is applied and the routine signals t.
    backwards (
        ( [ among('{na}{pulli}' '{na}{pulli}{ta}' '{na}{pulli}{ta}{pulli}') ] delete )
        or
        ( [ '{ya}{pulli}' test among('{vs_ai}' '{vs_i}' '{vs_ii}') ] delete )
        or
        ( [ '{tta}{pulli}{pa}{pulli}' or '{tta}{pulli}{ka}{pulli}' ] <- '{lla}{pulli}' )
        or
        ( [ '{nnna}{pulli}{rra}{pulli}' ] <- '{la}{pulli}' )
        or
        // Previously also covered '{nnna}{pulli}{nnna}{pulli}' (disabled):
        // ( [ '{rra}{pulli}{ka}{pulli}' or '{nnna}{pulli}{nnna}{pulli}' ] <- '{la}{pulli}' )
        ( [ '{rra}{pulli}{ka}{pulli}' ] <- '{la}{pulli}' )
        or
        ( [ '{tta}{pulli}{tta}{pulli}' ] <- '{tta}{vs_u}' )
        or
        // Applies only after remove_vetrumai_urupukal has set its flag.
        // (A stray, unmatched ']' that followed the replacement was dropped;
        // it re-set the slice end to the position it already had.)
        ( found_vetrumai_urupu [ '{ta}{pulli}{ta}{pulli}' (test not '{vs_ai}') ] <- '{ma}{pulli}' )
        or
        ( [ '{vs_u}{ka}{pulli}' or '{vs_u}{ka}{pulli}{ka}{pulli}' ] <- '{pulli}' )
        or
        ( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete )
        or
        // A second '{vs_u}{ka}{pulli}' rule used to sit here; it could never
        // run because the identical rule two alternatives above always
        // succeeds first, so it was removed.
        ( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete )
        or
        ( [ '{pulli}' (among('{ya}' '{ra}' '{la}' '{va}' '{zha}' '{lla}') or among('{nga}' '{nya}' '{nna}' '{na}' '{ma}' '{nnna}')) '{pulli}' ] <- '{pulli}' )
        or
        ( [ among('{va}' '{ya}' '{va}{pulli}') ] delete )
        or
        ( [ '{nnna}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')) ] delete )
        or
        ( [ '{nga}{pulli}' (test not '{vs_ai}') ] <- '{ma}{pulli}' )
        or
        ( [ '{nga}{pulli}' ] delete )
        or
        ( [ '{pulli}' (test (among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}') or '{pulli}')) ] delete )
    )
)

// Deletes a prefix made of {a}, {i} or {u}, one of the listed consonants and
// a pulli, sets found_a_match, then runs fix_va_start on the remainder.
// Signals f with found_a_match unset when the prefix is absent.
define remove_pronoun_prefixes as (
    unset found_a_match
    [ among('{a}' '{i}' '{u}') among('{ka}' '{ca}' '{tha}' '{va}' '{na}' '{pa}' '{ma}' '{ya}' '{nga}' '{nya}') '{pulli}' ] delete
    (set found_a_match)
    do fix_va_start
)

// Strips or rewrites a word-final '{ka}{lla}{pulli}' ending (the longer,
// more specific forms are tried first) and sets found_a_match on success.
define remove_plural_suffix as (
    unset found_a_match
    backwards (
        ( [ '{vs_u}{nga}{pulli}{ka}{lla}{pulli}' (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}')) ] <- '{pulli}' ) or
        ( [ '{rra}{pulli}{ka}{lla}{pulli}' ] <- '{la}{pulli}' ) or
        ( [ '{tta}{pulli}{ka}{lla}{pulli}' ] <- '{lla}{pulli}' ) or
        ( [ '{ka}{lla}{pulli}' ] delete )
        (set found_a_match)
    )
)

// For words longer than 4 characters: replaces a final vowel sign
// oo / ee / aa with a pulli (setting found_a_match), then repairs the ending.
define remove_question_suffixes as (
    has_min_length
    unset found_a_match
    backwards (
        do (
            [ among('{vs_oo}' '{vs_ee}' '{vs_aa}') ] <- '{pulli}'
            (set found_a_match)
        )
    )
    do fix_endings
)

// For words longer than 4 characters: deletes a final '{pa}{vs_i}' or
// '{va}{vs_i}' and sets found_a_match.
define remove_command_suffixes as (
    has_min_length
    unset found_a_match
    backwards (
        [ among('{pa}{vs_i}' '{va}{vs_i}') ] delete
        (set found_a_match)
    )
)

// For words longer than 4 characters: replaces a final '{vs_u}{ma}{pulli}'
// with a pulli, sets found_a_match and applies a single fix_ending pass.
// Signals f (without running fix_ending) when the suffix is absent.
define remove_um as (
    unset found_a_match
    has_min_length
    backwards (
        [ '{vs_u}{ma}{pulli}' ] <- '{pulli}'
        (set found_a_match)
    )
    do fix_ending
)

define remove_common_word_endings as (
    // These are not suffixes actually but are
    // some words that are attached to other words
    // but can be removed for stemming
    unset found_a_match
    has_min_length
    backwards (
        // Endings that begin with a vowel sign (plus '{la}{pulli}{la}'):
        // replaced by a pulli.  A duplicate trailing '{vs_aa}{ka}{vs_i}'
        // alternative was removed; the first occurrence always wins.
        test ( [
            '{vs_u}{tta}{nnna}{pulli}' or
            '{vs_i}{la}{pulli}{la}{vs_ai}' or
            '{vs_i}{tta}{ma}{pulli}' or
            '{vs_i}{nnna}{pulli}{rra}{vs_i}' or
            '{vs_aa}{ka}{vs_i}' or
            '{vs_aa}{ka}{vs_i}{ya}' or
            '{vs_e}{nnna}{pulli}{rra}{vs_u}' or
            '{vs_u}{lla}{pulli}{lla}' or
            '{vs_u}{tta}{vs_ai}{ya}' or
            '{vs_u}{tta}{vs_ai}' or
            '{vs_e}{nnna}{vs_u}{ma}{pulli}' or
            ('{la}{pulli}{la}' test (not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
            '{vs_e}{nnna}'
            ] <- '{pulli}'
            (set found_a_match)
        )
        or
        // Endings that are removed outright.
        test ( [ among(
            '{pa}{tta}{vs_u}'
            '{pa}{tta}{pulli}{tta}'
            '{pa}{tta}{pulli}{tta}{vs_u}'
            '{pa}{tta}{pulli}{tta}{ta}{vs_u}'
            '{pa}{tta}{pulli}{tta}{nna}'
            '{ka}{vs_u}{ra}{vs_i}{ya}'
            '{pa}{rra}{pulli}{rra}{vs_i}'
            '{va}{vs_i}{tta}{vs_u}'
            '{va}{vs_i}{tta}{pulli}{tta}{vs_u}'
            '{pa}{tta}{vs_i}{ta}{vs_aa}{nnna}'
            '{pa}{tta}{vs_i}'
            '{ta}{vs_aa}{nnna}'
            '{vs_e}{la}{pulli}{la}{vs_aa}{ma}{pulli}'
            ) ] delete
            (set found_a_match)
        )
    )
    do fix_endings
)

// Removes one "vetrumai urupu" ending (presumably a case suffix - confirm
// terminology).  On success sets both found_a_match and found_vetrumai_urupu;
// the latter enables an extra rule in fix_ending.
define remove_vetrumai_urupukal as (
    unset found_a_match
    unset found_vetrumai_urupu
    has_min_length
    backwards (
        (
            test ( [ '{nnna}{vs_ai}' ] delete )
            or
            // NOTE(review): any word ending in '{nnna}{vs_ai}' is already
            // consumed by the alternative above, so '{vs_i}{nnna}{vs_ai}'
            // looks unreachable here - confirm intent before removing.
            test ( [ ( '{vs_i}{nnna}{vs_ai}' or
                       '{vs_ai}' (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}'))) or
                     ( '{vs_ai}' (test (among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}')))
                   ] <- '{pulli}'
            )
            or
            test ( [
                '{vs_o}{tta}{vs_u}' or
                '{vs_oo}{tta}{vs_u}' or
                '{vs_i}{la}{pulli}' or
                '{vs_i}{rra}{pulli}' or
                ('{vs_i}{nnna}{pulli}' (test not '{ma}')) or
                '{vs_i}{nnna}{pulli}{rra}{vs_u}' or
                '{vs_i}{ra}{vs_u}{na}{pulli}{ta}{vs_u}' or
                '{va}{vs_i}{tta}' or
                ($(len >= 7) '{vs_i}{tta}{ma}{pulli}') or
                '{vs_aa}{la}{pulli}' or
                '{vs_u}{tta}{vs_ai}' or
                '{vs_aa}{ma}{la}{pulli}' or
                ('{la}{pulli}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
                '{vs_u}{lla}{pulli}'
                ] <- '{pulli}'
            )
            or
            test ( [
                '{ka}{nna}{pulli}' or
                '{ma}{vs_u}{nnna}{pulli}' or
                '{ma}{vs_ee}{la}{pulli}' or
                '{ma}{vs_ee}{rra}{pulli}' or
                '{ka}{vs_ii}{llla}{pulli}' or
                '{pa}{vs_i}{nnna}{pulli}' or
                ('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')))
                ] delete
            )
            or
            test ( [ '{vs_ii}' ] <- '{vs_i}' )
        )
        (set found_a_match)
        (set found_vetrumai_urupu)
        // Each alternative above is wrapped in 'test', so the cursor is back
        // at the end of the (already edited) word for this follow-up rule.
        do ( [ '{vs_i}{nnna}{pulli}' ] <- '{pulli}' )
    )
    do fix_endings
)

// Calls remove_tense_suffix repeatedly for as long as the previous call
// left found_a_match set.
define remove_tense_suffixes as (
    set found_a_match
    repeat ( found_a_match (do remove_tense_suffix) )
)

// Removes one layer of tense marking from a word longer than 4 characters
// and sets found_a_match if anything was changed.
define remove_tense_suffix as (
    unset found_a_match
    has_min_length
    backwards (
        do (
            test ( [ among(
                '{ka}{vs_o}{nna}{pulli}{tta}{vs_i}{ra}{pulli}'
                '{pa}{tta}{vs_u}'
                ) ] delete
                (set found_a_match)
            )
            or
            // Endings that are deleted outright.
            test ( [
                '{ma}{vs_aa}{ra}{pulli}' or
                '{ma}{vs_i}{nnna}{pulli}' or
                '{nnna}{nnna}{pulli}' or
                '{nnna}{vs_aa}{nnna}{pulli}' or
                '{nnna}{vs_aa}{lla}{pulli}' or
                '{nnna}{vs_aa}{ra}{pulli}' or
                ('{va}{nnna}{pulli}' test (not among('{a}' '{aa}' '{i}' '{ii}' '{u}' '{uu}' '{e}' '{ee}' '{ai}' '{o}' '{oo}' '{au}'))) or
                '{nnna}{lla}{pulli}' or
                '{va}{lla}{pulli}' or
                '{nnna}{ra}{pulli}' or
                '{va}{ra}{pulli}' or
                '{nnna}' or '{pa}' or '{ka}' or '{ta}' or '{ya}' or
                '{pa}{nnna}{pulli}' or
                '{pa}{lla}{pulli}' or
                '{pa}{ra}{pulli}' or
                ('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
                '{vs_i}{rra}{pulli}{rra}{vs_u}' or
                '{pa}{ma}{pulli}' or
                '{nnna}{ma}{pulli}' or
                '{ta}{vs_u}{ma}{pulli}' or
                '{rra}{vs_u}{ma}{pulli}' or
                '{ka}{vs_u}{ma}{pulli}' or
                '{nnna}{vs_e}{nnna}{pulli}' or
                '{nnna}{vs_ai}' or
                '{va}{vs_ai}'
                ] delete
                (set found_a_match)
            )
            or
            // Endings that are replaced by a pulli.
            // '{ka}{vs_u}{ma}{pulli}', '{ta}{vs_u}{ma}{pulli}',
            // '{rra}{vs_u}{ma}{pulli}' and '{nnna}{vs_e}{nnna}{pulli}' used to
            // be listed here as well; the delete group above always consumes
            // them first, so those unreachable entries were removed.
            test ( [
                ('{vs_aa}{nnna}{pulli}' test (not '{ca}')) or
                '{vs_aa}{lla}{pulli}' or
                '{vs_aa}{ra}{pulli}' or
                '{vs_ee}{nnna}{pulli}' or
                '{vs_aa}' or
                '{vs_aa}{ma}{pulli}' or
                '{vs_e}{ma}{pulli}' or
                '{vs_ee}{ma}{pulli}' or
                '{vs_oo}{ma}{pulli}' or
                '{tta}{vs_u}{ma}{pulli}' or
                '{vs_aa}{ya}{pulli}' or
                '{nnna}{vs_i}{ra}{pulli}' or
                '{vs_ii}{ra}{pulli}' or
                '{vs_ii}{ya}{ra}{pulli}'
                ] <- '{pulli}'
                (set found_a_match)
            )
            or
            // Slice brackets regrouped so that '[' and ']' pair up visibly;
            // the commands still execute in the same order as before.
            // NOTE(review): '{ta}{vs_u}' preceded by a pulli appears to be
            // caught by the delete group above already - confirm.
            test ( [ ('{ka}{vs_u}' or '{ta}{vs_u}') (test '{pulli}') ] delete
                (set found_a_match)
            )
        )
        // Tried at the end of the word regardless of the outcome above.
        do ( [ among(
            '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}'
            '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}{pulli}'
            '{ka}{vs_i}{nnna}{pulli}{rra}'
            '{ka}{vs_i}{nnna}{pulli}{rra}{pulli}'
            '{ka}{vs_i}{rra}'
            '{ka}{vs_i}{rra}{pulli}'
            ) ] delete
            (set found_a_match)
        )
    )
    do fix_endings
)

// Entry point.  A single fix_ending pass is always attempted; the remaining
// steps only run for words longer than 4 characters.  Every step is wrapped
// in 'do', so a step that does not apply never aborts the pipeline.
define stem as (
    unset found_vetrumai_urupu
    do fix_ending
    has_min_length
    do remove_question_prefixes
    do remove_pronoun_prefixes
    do remove_question_suffixes
    do remove_um
    do remove_common_word_endings
    do remove_vetrumai_urupukal
    do remove_plural_suffix
    do remove_command_suffixes
    do remove_tense_suffixes
)