/*
 * Affix stripping stemming algorithm for Tamil
 * By Damodharan Rajalingam
 */

stringescapes {}

/* Aytham */
stringdef aytham   '{U+0B83}'

/* Uyir - independent vowels */
stringdef a        '{U+0B85}'
stringdef aa       '{U+0B86}'
stringdef i        '{U+0B87}'
stringdef ii       '{U+0B88}'
stringdef u        '{U+0B89}'
stringdef uu       '{U+0B8A}'
stringdef e        '{U+0B8E}'
stringdef ee       '{U+0B8F}'
stringdef ai       '{U+0B90}'
stringdef o        '{U+0B92}'
stringdef oo       '{U+0B93}'
stringdef au       '{U+0B94}'

/* Consonants */
stringdef ka       '{U+0B95}'
stringdef nga      '{U+0B99}'
stringdef ca       '{U+0B9A}'
stringdef ja       '{U+0B9C}'
stringdef nya      '{U+0B9E}'
stringdef tta      '{U+0B9F}'
stringdef nna      '{U+0BA3}'
/* ta and tha are two names for the same code point (U+0BA4). */
stringdef ta       '{U+0BA4}'
stringdef tha      '{U+0BA4}'
stringdef na       '{U+0BA8}'
stringdef nnna     '{U+0BA9}'
stringdef pa       '{U+0BAA}'
stringdef ma       '{U+0BAE}'
stringdef ya       '{U+0BAF}'
stringdef ra       '{U+0BB0}'
stringdef rra      '{U+0BB1}'
stringdef la       '{U+0BB2}'
stringdef lla      '{U+0BB3}'
/* llla and zha are two names for the same code point (U+0BB4). */
stringdef llla     '{U+0BB4}'
stringdef zha      '{U+0BB4}'
stringdef va       '{U+0BB5}'

/* Vatamozi - borrowed */
stringdef sha      '{U+0BB6}'
stringdef ssa      '{U+0BB7}'
stringdef sa       '{U+0BB8}'
stringdef ha       '{U+0BB9}'


/* Dependent vowel signs (kombu etc.) */
stringdef vs_aa    '{U+0BBE}'
stringdef vs_i     '{U+0BBF}'
stringdef vs_ii    '{U+0BC0}'
stringdef vs_u     '{U+0BC1}'
stringdef vs_uu    '{U+0BC2}'
stringdef vs_e     '{U+0BC6}'
stringdef vs_ee    '{U+0BC7}'
stringdef vs_ai    '{U+0BC8}'
stringdef vs_o     '{U+0BCA}'
stringdef vs_oo    '{U+0BCB}'
stringdef vs_au    '{U+0BCC}'

/* Pulli */
stringdef pulli    '{U+0BCD}'

/* AU length mark */
stringdef au_lmark '{U+0BD7}'


routines (
    remove_plural_suffix
    remove_question_suffixes
    remove_question_prefixes
    remove_pronoun_prefixes
    remove_command_suffixes
    remove_um
    remove_vetrumai_urupukal
    fix_va_start
    fix_ending
    fix_endings
    remove_tense_suffix
    remove_tense_suffixes
    remove_common_word_endings
    has_min_length
)

externals ( stem )

booleans (
    found_a_match
    found_vetrumai_urupu
)

// Length guard shared by the stripping routines:
// signals t only when the word has more than 4 characters.
define has_min_length as (
    $(len > 4)
)

// Rewrites "va + vowel sign" found at the cursor into the matching
// independent vowel (oo, o, u or uu).  Signals f when none of the four
// sequences is present.  No listed string is a prefix of another, so the
// longest-match rule of among cannot pick a different entry than a
// first-match chain would.
define fix_va_start as (
    [substring] among(
        '{va}{vs_oo}' (<- '{oo}')
        '{va}{vs_o}'  (<- '{o}')
        '{va}{vs_u}'  (<- '{u}')
        '{va}{vs_uu}' (<- '{uu}')
    )
)

// Applies fix_ending over and over until it signals f.
// The enclosing do makes this routine always signal t.
define fix_endings as (
    do repeat fix_ending
)

// Deletes a leading "e + consonant + pulli" sequence (consonant taken from
// the list below), then lets fix_va_start tidy up what now starts the word.
define remove_question_prefixes as (
    [ '{e}'
      among('{ka}' '{ca}' '{tha}' '{va}' '{na}' '{pa}' '{ma}' '{ya}' '{nga}' '{nya}')
      '{pulli}'
    ] delete
    do fix_va_start
)

// Gives signal t if an ending was fixed, signal f otherwise.
define fix_ending as (
    // Only words with more than 3 characters are touched.
    $(len > 3)
    // Exactly one repair is applied per call: the first alternative of the
    // `or` chain that matches at the end of the word.
    backwards (
        ( [among('{na}{pulli}' '{na}{pulli}{ta}' '{na}{pulli}{ta}{pulli}') ] delete )
        or
        // Drop a final "ya + pulli" only when it follows vs_ai, vs_i or vs_ii.
        ( [ '{ya}{pulli}' test among('{vs_ai}' '{vs_i}' '{vs_ii}') ] delete )
        or
        ( [ '{tta}{pulli}{pa}{pulli}' or '{tta}{pulli}{ka}{pulli}' ] <- '{lla}{pulli}' )
        or
        ( [ '{nnna}{pulli}{rra}{pulli}' ] <- '{la}{pulli}' )
        or
        // Earlier variant, kept disabled:
//      ( [ '{rra}{pulli}{ka}{pulli}' or '{nnna}{pulli}{nnna}{pulli}' ] <- '{la}{pulli}' )
        ( [ '{rra}{pulli}{ka}{pulli}' ] <- '{la}{pulli}' )
        or
        ( [ '{tta}{pulli}{tta}{pulli}' ] <- '{tta}{vs_u}' )
        or
        // Only considered when found_vetrumai_urupu is set.
        // (A stray `]` that used to follow the replacement was removed; it
        // re-marked a slice end after the slice had already been replaced.)
        ( found_vetrumai_urupu [ '{ta}{pulli}{ta}{pulli}' (test not '{vs_ai}') ] <- '{ma}{pulli}' )
        or
        ( [ '{vs_u}{ka}{pulli}' or '{vs_u}{ka}{pulli}{ka}{pulli}' ] <- '{pulli}' )
        or
        ( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete )
        or
        // A second "[ '{vs_u}{ka}{pulli}' ] <- '{pulli}'" alternative used to
        // sit here.  It could never run, because the identical pattern with
        // the identical action is tried two alternatives earlier.
        ( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete )
        or
        ( [ '{pulli}' (among('{ya}' '{ra}' '{la}' '{va}' '{zha}' '{lla}') or among('{nga}' '{nya}' '{nna}' '{na}' '{ma}' '{nnna}')) '{pulli}' ] <- '{pulli}' )
        or
        ( [ among('{va}' '{ya}' '{va}{pulli}') ] delete )
        or
        ( [ '{nnna}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')) ] delete )
        or
        ( [ '{nga}{pulli}' (test not '{vs_ai}') ] <- '{ma}{pulli}' )
        or
        ( [ '{nga}{pulli}' ] delete )
        or
        // Drop a final pulli that directly follows a vowel sign or another pulli.
        ( [ '{pulli}' (test (among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}') or '{pulli}')) ] delete )
    )
)

// Deletes a leading "a / i / u + consonant + pulli" sequence (consonant from
// the list below).  found_a_match is cleared first and set only after the
// deletion happened; fix_va_start is then tried on the new word start.
define remove_pronoun_prefixes as (
    unset found_a_match
    [ among('{a}' '{i}' '{u}') among('{ka}' '{ca}' '{tha}' '{va}' '{na}' '{pa}' '{ma}' '{ya}' '{nga}' '{nya}') '{pulli}' ] delete
    (set found_a_match)
    do fix_va_start
)

// Strips an ending that finishes in "ka + lla + pulli".  The three longer
// forms are rewritten to a replacement ending, the bare form is deleted.
// found_a_match is set only if one of the four alternatives applied;
// otherwise the routine signals f.
define remove_plural_suffix as (
    unset found_a_match
    backwards (
        ( [ '{vs_u}{nga}{pulli}{ka}{lla}{pulli}' (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}')) ] <- '{pulli}' ) or
        ( [ '{rra}{pulli}{ka}{lla}{pulli}' ] <- '{la}{pulli}' ) or
        ( [ '{tta}{pulli}{ka}{lla}{pulli}' ] <- '{lla}{pulli}' ) or
        ( [ '{ka}{lla}{pulli}' ] delete )
        (set found_a_match)
    )
)

// Replaces a final vs_oo, vs_ee or vs_aa with pulli (words longer than
// 4 characters only), then repairs the new ending with fix_endings.
define remove_question_suffixes as (
    has_min_length
    unset found_a_match
    backwards (
        do (
            [ among('{vs_oo}' '{vs_ee}' '{vs_aa}') ] <- '{pulli}'
            (set found_a_match)
        )
    )
    do fix_endings
)

// Deletes a final "pa + vs_i" or "va + vs_i" (words longer than
// 4 characters only).  Signals f when neither ending is present.
define remove_command_suffixes as (
    has_min_length
    unset found_a_match
    backwards (
        [ among('{pa}{vs_i}' '{va}{vs_i}') ] delete
        (set found_a_match)
    )
)

// Replaces a final "vs_u + ma + pulli" with pulli (words longer than
// 4 characters only), then runs a single fix_ending pass.
define remove_um as (
    unset found_a_match
    has_min_length
    backwards (
        [ '{vs_u}{ma}{pulli}' ] <- '{pulli}'
        (set found_a_match)
    )
    do fix_ending
)

define remove_common_word_endings as (
    // These are not suffixes actually but are
    // some words that are attached to other words
    // but can be removed for stemming
    unset found_a_match
    has_min_length
    backwards (
        // Group 1: endings replaced by pulli.  Alternatives are tried in the
        // order written; the first one that matches wins.
        test ( [ '{vs_u}{tta}{nnna}{pulli}' or
                 '{vs_i}{la}{pulli}{la}{vs_ai}' or
                 '{vs_i}{tta}{ma}{pulli}' or
                 '{vs_i}{nnna}{pulli}{rra}{vs_i}' or
                 '{vs_aa}{ka}{vs_i}' or
                 '{vs_aa}{ka}{vs_i}{ya}' or
                 '{vs_e}{nnna}{pulli}{rra}{vs_u}' or
                 '{vs_u}{lla}{pulli}{lla}' or
                 '{vs_u}{tta}{vs_ai}{ya}' or
                 '{vs_u}{tta}{vs_ai}' or
                 '{vs_e}{nnna}{vs_u}{ma}{pulli}' or
                 ('{la}{pulli}{la}' test (not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
                 // A trailing repeat of '{vs_aa}{ka}{vs_i}' was dropped from
                 // the end of this chain: the same string is already listed
                 // above with the same action, so the repeat was unreachable.
                 '{vs_e}{nnna}' ] <- '{pulli}'
               (set found_a_match)
             )
        or
        // Group 2: endings deleted outright.
        test ( [ among('{pa}{tta}{vs_u}'
                       '{pa}{tta}{pulli}{tta}'
                       '{pa}{tta}{pulli}{tta}{vs_u}'
                       '{pa}{tta}{pulli}{tta}{ta}{vs_u}'
                       '{pa}{tta}{pulli}{tta}{nna}'
                       '{ka}{vs_u}{ra}{vs_i}{ya}'
                       '{pa}{rra}{pulli}{rra}{vs_i}'
                       '{va}{vs_i}{tta}{vs_u}'
                       '{va}{vs_i}{tta}{pulli}{tta}{vs_u}'
                       '{pa}{tta}{vs_i}{ta}{vs_aa}{nnna}'
                       '{pa}{tta}{vs_i}'
                       '{ta}{vs_aa}{nnna}'
                       '{vs_e}{la}{pulli}{la}{vs_aa}{ma}{pulli}')
                 ] delete
               (set found_a_match)
             )
    )
    do fix_endings
)

// Strips one of the endings listed below from words longer than
// 4 characters.  On success both found_a_match and found_vetrumai_urupu are
// set, a remaining final "vs_i + nnna + pulli" is additionally replaced by
// pulli, and fix_endings repairs the result.
define remove_vetrumai_urupukal as (
    unset found_a_match
    unset found_vetrumai_urupu
    has_min_length
    backwards (
        (
            test ( ['{nnna}{vs_ai}'] delete )
            or
            // NOTE(review): '{vs_i}{nnna}{vs_ai}' cannot match here, because
            // every word ending that way is already handled by the
            // '{nnna}{vs_ai}' alternative above.  Confirm whether the longer
            // ending was meant to be tried first.
            test ([ ( '{vs_i}{nnna}{vs_ai}' or
                      '{vs_ai}' (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}'))) or
                    ( '{vs_ai}' (test (among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}')))
                  ] <- '{pulli}'
            )
            or
            test ( [
                       '{vs_o}{tta}{vs_u}' or
                       '{vs_oo}{tta}{vs_u}' or
                       '{vs_i}{la}{pulli}' or
                       '{vs_i}{rra}{pulli}' or
                       ('{vs_i}{nnna}{pulli}' (test not '{ma}')) or
                       '{vs_i}{nnna}{pulli}{rra}{vs_u}' or
                       '{vs_i}{ra}{vs_u}{na}{pulli}{ta}{vs_u}' or
                       '{va}{vs_i}{tta}' or
                       // This ending is only accepted on words of 7+ characters.
                       ($(len >= 7) '{vs_i}{tta}{ma}{pulli}') or
                       '{vs_aa}{la}{pulli}' or
                       '{vs_u}{tta}{vs_ai}' or
                       '{vs_aa}{ma}{la}{pulli}' or
                       ('{la}{pulli}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
                       '{vs_u}{lla}{pulli}'
                   ] <- '{pulli}'
            )
            or
            test ( [
                       '{ka}{nna}{pulli}' or
                       '{ma}{vs_u}{nnna}{pulli}' or
                       '{ma}{vs_ee}{la}{pulli}' or
                       '{ma}{vs_ee}{rra}{pulli}' or
                       '{ka}{vs_ii}{llla}{pulli}' or
                       '{pa}{vs_i}{nnna}{pulli}' or
                       ('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')))
                   ] delete
            )
            or
            // Last resort: shorten a final vs_ii to vs_i.
            test ([ '{vs_ii}' ] <- '{vs_i}')
        )
        (set found_a_match)
        (set found_vetrumai_urupu)
        do ( [ '{vs_i}{nnna}{pulli}' ] <- '{pulli}' )
    )
    do fix_endings
)

// Calls remove_tense_suffix repeatedly for as long as the previous call
// left found_a_match set.
define remove_tense_suffixes as (
    set found_a_match
    repeat ( found_a_match (do remove_tense_suffix) )
)

// One stripping pass over words longer than 4 characters.  found_a_match is
// cleared on entry and set whenever something was removed or rewritten, which
// is what drives the loop in remove_tense_suffixes.
define remove_tense_suffix as (
    unset found_a_match
    has_min_length
    backwards (
        do (
            // Group 1: deleted outright.
            test ( [among(
                       '{ka}{vs_o}{nna}{pulli}{tta}{vs_i}{ra}{pulli}'
                       '{pa}{tta}{vs_u}'
                   )] delete
                   (set found_a_match)
            )
            or
            // Group 2: deleted outright; first matching alternative wins.
            test ( [
                       '{ma}{vs_aa}{ra}{pulli}' or
                       '{ma}{vs_i}{nnna}{pulli}' or
                       '{nnna}{nnna}{pulli}' or
                       '{nnna}{vs_aa}{nnna}{pulli}' or
                       '{nnna}{vs_aa}{lla}{pulli}' or
                       '{nnna}{vs_aa}{ra}{pulli}' or
                       ('{va}{nnna}{pulli}' test (not among('{a}' '{aa}' '{i}' '{ii}' '{u}' '{uu}' '{e}' '{ee}' '{ai}' '{o}' '{oo}' '{au}')) ) or
                       '{nnna}{lla}{pulli}' or
                       '{va}{lla}{pulli}' or
                       '{nnna}{ra}{pulli}' or
                       '{va}{ra}{pulli}' or
                       '{nnna}' or '{pa}' or '{ka}' or '{ta}' or '{ya}' or
                       '{pa}{nnna}{pulli}' or
                       '{pa}{lla}{pulli}' or
                       '{pa}{ra}{pulli}' or
                       ('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
                       '{vs_i}{rra}{pulli}{rra}{vs_u}' or
                       '{pa}{ma}{pulli}' or
                       '{nnna}{ma}{pulli}' or
                       '{ta}{vs_u}{ma}{pulli}' or
                       '{rra}{vs_u}{ma}{pulli}' or
                       '{ka}{vs_u}{ma}{pulli}' or
                       '{nnna}{vs_e}{nnna}{pulli}' or
                       '{nnna}{vs_ai}' or
                       '{va}{vs_ai}'
                   ] delete
                   (set found_a_match)
            )
            or
            // Group 3: replaced by pulli.
            // NOTE(review): '{ka}{vs_u}{ma}{pulli}', '{ta}{vs_u}{ma}{pulli}',
            // '{rra}{vs_u}{ma}{pulli}' and '{nnna}{vs_e}{nnna}{pulli}' are
            // also listed in group 2, so a word ending in any of them is
            // consumed there (by deletion) and never reaches this group.
            // Left in place because the action differs; confirm which
            // action was intended.
            test ( [
                       ('{vs_aa}{nnna}{pulli}' test (not '{ca}')) or
                       '{vs_aa}{lla}{pulli}' or
                       '{vs_aa}{ra}{pulli}' or
                       '{vs_ee}{nnna}{pulli}' or
                       '{vs_aa}' or
                       '{vs_aa}{ma}{pulli}' or
                       '{vs_e}{ma}{pulli}' or
                       '{vs_ee}{ma}{pulli}' or
                       '{vs_oo}{ma}{pulli}' or
                       '{ka}{vs_u}{ma}{pulli}' or
                       '{ta}{vs_u}{ma}{pulli}' or
                       '{tta}{vs_u}{ma}{pulli}' or
                       '{rra}{vs_u}{ma}{pulli}' or
                       '{vs_aa}{ya}{pulli}' or
                       '{nnna}{vs_e}{nnna}{pulli}' or
                       '{nnna}{vs_i}{ra}{pulli}' or
                       '{vs_ii}{ra}{pulli}' or
                       '{vs_ii}{ya}{ra}{pulli}'
                   ] <- '{pulli}'
                   (set found_a_match)
            )
            or
            // Group 4: "ka + vs_u" or "ta + vs_u" is deleted only when a
            // pulli comes directly before it.
            test ( ([ '{ka}{vs_u}' or '{ta}{vs_u}' ) (test '{pulli}') ] delete
                   (set found_a_match)
            )
        )
        // Tried on every pass, whether or not the groups above matched.
        do ([among(
                '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}'
                '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}{pulli}'
                '{ka}{vs_i}{nnna}{pulli}{rra}'
                '{ka}{vs_i}{nnna}{pulli}{rra}{pulli}'
                '{ka}{vs_i}{rra}'
                '{ka}{vs_i}{rra}{pulli}'
            )] delete
            (set found_a_match)
        )
    )
    do fix_endings
)

// Entry point.  A single fix_ending pass always runs first; every later step
// is skipped unless the word still has more than 4 characters.  Each step is
// wrapped in `do`, so a step that signals f does not stop the ones after it.
define stem as (
    unset found_vetrumai_urupu
    do fix_ending
    has_min_length
    do remove_question_prefixes
    do remove_pronoun_prefixes
    do remove_question_suffixes
    do remove_um
    do remove_common_word_endings
    do remove_vetrumai_urupukal
    do remove_plural_suffix
    do remove_command_suffixes
    do remove_tense_suffixes
)