1// 2// $Id$ 3// 4 5#undef LOC_TABLE_ENTRY 6#undef LOC_TABLE_INDEX 7#define LOC_TABLE_ENTRY LOC_PREFIX(stem_table_entry_) 8#define LOC_TABLE_INDEX LOC_PREFIX(stem_table_index_) 9 10 11struct LOC_TABLE_ENTRY 12{ 13 LOC_CHAR_TYPE suffix[8]; 14 int remove, len; 15}; 16 17 18struct LOC_TABLE_INDEX 19{ 20 LOC_CHAR_TYPE first; 21 int count; 22}; 23 24 25// TableStringN, where N is a number of chars 26#undef TS1 27#undef TS2 28#undef TS3 29#undef TS4 30#undef TS5 31#define TS1(c1) { RUS::c1 } 32#define TS2(c1,c2) { RUS::c1, RUS::c2 } 33#define TS3(c1,c2,c3) { RUS::c1, RUS::c2, RUS::c3 } 34#define TS4(c1,c2,c3,c4) { RUS::c1, RUS::c2, RUS::c3, RUS::c4 } 35#define TS5(c1,c2,c3,c4,c5) { RUS::c1, RUS::c2, RUS::c3, RUS::c4, RUS::c5 } 36 37 38static LOC_TABLE_INDEX LOC_PREFIX(ru_adj_i)[] = 39{ 40 { RUS::E, 4 }, 41 { RUS::I, 2 }, 42 { RUS::IY, 4 }, 43 { RUS::M, 7 }, 44 { RUS::O, 2 }, 45 { RUS::U, 2 }, 46 { RUS::H, 2 }, 47 { RUS::YU, 4 }, 48 { RUS::YA, 2 }, 49}; 50 51 52static LOC_TABLE_ENTRY LOC_PREFIX(ru_adj)[] = 53{ 54 { TS2(E,E), 2, -1 }, 55 { TS2(I,E), 2, -1 }, 56 { TS2(Y,E), 2, -1 }, 57 { TS2(O,E), 2, -1 }, 58 59 { TS3(I,M,I), 3, -1 }, 60 { TS3(Y,M,I), 3, -1 }, 61 62 { TS2(E,IY), 2, -1 }, 63 { TS2(I,IY), 2, -1 }, 64 { TS2(Y,IY), 2, -1 }, 65 { TS2(O,IY), 2, -1 }, 66 67 { TS3(A,E,M), 0, -1 }, 68 { TS3(U,E,M), 0, -1 }, 69 { TS3(YA,E,M), 0, -1 }, 70 { TS2(E,M), 2, -1 }, 71 { TS2(I,M), 2, -1 }, 72 { TS2(Y,M), 2, -1 }, 73 { TS2(O,M), 2, -1 }, 74 75 { TS3(E,G,O), 3, -1 }, 76 { TS3(O,G,O), 3, -1 }, 77 78 { TS3(E,M,U), 3, -1 }, 79 { TS3(O,M,U), 3, -1 }, 80 81 { TS2(I,H), 2, -1 }, 82 { TS2(Y,H), 2, -1 }, 83 84 { TS2(E,YU), 2, -1 }, 85 { TS2(O,YU), 2, -1 }, 86 { TS2(U,YU), 2, -1 }, 87 { TS2(YU,YU), 2, -1 }, 88 89 { TS2(A,YA), 2, -1 }, 90 { TS2(YA,YA), 2, -1 } 91}; 92 93 94static LOC_TABLE_INDEX LOC_PREFIX(ru_part_i)[] = 95{ 96 { RUS::A, 3 }, 97 { RUS::M, 1 }, 98 { RUS::N, 3 }, 99 { RUS::O, 3 }, 100 { RUS::Y, 3 }, 101 { RUS::SH, 4 }, 102 { RUS::SCH, 5 } 103}; 104 105 106static LOC_TABLE_ENTRY LOC_PREFIX(ru_part)[] = 107{ 108 { TS4(A,N,N,A), 2, -1 }, 109 { TS4(E,N,N,A), 2, -1 }, 110 { TS4(YA,N,N,A), 2, -1 }, 111 112 { TS3(YA,E,M), 2, -1 }, 113 114 { TS3(A,N,N), 1, -1 }, 115 { TS3(E,N,N), 1, -1 }, 116 { TS3(YA,N,N), 1, -1 }, 117 118 { TS4(A,N,N,O), 2, -1 }, 119 { TS4(E,N,N,O), 2, -1 }, 120 { TS4(YA,N,N,O), 2, -1 }, 121 122 { TS4(A,N,N,Y), 2, -1 }, 123 { TS4(E,N,N,Y), 2, -1 }, 124 { TS4(YA,N,N,Y), 2, -1 }, 125 126 { TS3(A,V,SH), 2, -1 }, 127 { TS3(I,V,SH), 3, -1 }, 128 { TS3(Y,V,SH), 3, -1 }, 129 { TS3(YA,V,SH), 2, -1 }, 130 131 { TS3(A,YU,SCH), 2, -1 }, 132 { TS2(A,SCH), 1, -1 }, 133 { TS3(YA,YU,SCH), 2, -1 }, 134 { TS2(YA,SCH), 1, -1 }, 135 { TS3(U,YU,SCH), 3, -1 } 136}; 137 138 139static LOC_TABLE_INDEX LOC_PREFIX(ru_verb_i)[] = 140{ 141 { RUS::A, 7 }, 142 { RUS::E, 9 }, 143 { RUS::I, 4 }, 144 { RUS::IY, 4 }, 145 { RUS::L, 4 }, 146 { RUS::M, 5 }, 147 { RUS::O, 7 }, 148 { RUS::T, 9 }, 149 { RUS::Y, 3 }, 150 { RUS::MYA, 10 }, 151 { RUS::YU, 4 }, 152 { RUS::YA, 1 } 153}; 154 155 156static LOC_TABLE_ENTRY LOC_PREFIX(ru_verb)[] = 157{ 158 { TS3(A,L,A), 3, -1 }, 159 { TS3(A,N,A), 3, -1 }, 160 { TS3(YA,L,A), 3, -1 }, 161 { TS3(YA,N,A), 3, -1 }, 162 { TS3(I,L,A), 3, -1 }, 163 { TS3(Y,L,A), 3, -1 }, 164 { TS3(E,N,A), 3, -1 }, 165 166 { TS4(A,E,T,E), 4, -1 }, 167 { TS4(A,IY,T,E), 4, -1 }, 168 { TS3(MYA,T,E), 3, -1 }, 169 { TS4(U,E,T,E), 4, -1 }, 170 { TS4(YA,E,T,E), 4, -1 }, 171 { TS4(YA,IY,T,E), 4, -1 }, 172 { TS4(E,IY,T,E), 4, -1 }, 173 { TS4(U,IY,T,E), 4, -1 }, 174 { TS3(I,T,E), 3, -1 }, 175 176 { TS3(A,L,I), 3, -1 }, 177 { TS3(YA,L,I), 3, -1 }, 178 { TS3(I,L,I), 3, -1 }, 179 { TS3(Y,L,I), 3, -1 }, 180 181 { TS2(A,IY), 2, -1 }, 182 { TS2(YA,IY), 2, -1 }, 183 { TS2(E,IY), 2, -1 }, 184 { TS2(U,IY), 2, -1 }, 185 186 { TS2(A,L), 2, -1 }, 187 { TS2(YA,L), 2, -1 }, 188 { TS2(I,L), 2, -1 }, 189 { TS2(Y,L), 2, -1 }, 190 191 { TS3(A,E,M), 3, -1 }, 192 { TS3(YA,E,M), 3, -1 }, 193 { TS3(U,E,M), 3, -1 }, 194 { TS2(I,M), 2, -1 }, 195 { TS2(Y,M), 2, -1 }, 196 197 { TS3(A,L,O), 3, -1 }, 198 { TS3(A,N,O), 3, -1 }, 199 { TS3(YA,L,O), 3, -1 }, 200 { TS3(YA,N,O), 3, -1 }, 201 { TS3(I,L,O), 3, -1 }, 202 { TS3(Y,L,O), 3, -1 }, 203 { TS3(E,N,O), 3, -1 }, 204 205 { TS3(A,E,T), 3, -1 }, 206 { TS3(A,YU,T), 3, -1 }, 207 { TS3(YA,E,T), 3, -1 }, 208 { TS3(YA,YU,T), 3, -1 }, 209 { TS2(YA,T), 2, -1 }, 210 { TS3(U,E,T), 3, -1 }, 211 { TS3(U,YU,T), 3, -1 }, 212 { TS2(I,T), 2, -1 }, 213 { TS2(Y,T), 2, -1 }, 214 215 { TS3(A,N,Y), 3, -1 }, 216 { TS3(YA,N,Y), 3, -1 }, 217 { TS3(E,N,Y), 3, -1 }, 218 219 { TS4(A,E,SH,MYA), 4, -1 }, 220 { TS4(U,E,SH,MYA), 4, -1 }, 221 { TS4(YA,E,SH,MYA), 4, -1 }, 222 { TS3(A,T,MYA), 3, -1 }, 223 { TS3(E,T,MYA), 3, -1 }, 224 { TS3(I,T,MYA), 3, -1 }, 225 { TS3(U,T,MYA), 3, -1 }, 226 { TS3(Y,T,MYA), 3, -1 }, 227 { TS3(I,SH,MYA), 3, -1 }, 228 { TS3(YA,T,MYA), 3, -1 }, 229 230 { TS2(A,YU), 2, -1 }, 231 { TS2(U,YU), 2, -1 }, 232 { TS2(YA,YU), 2, -1 }, 233 { TS1(YU), 1, -1 }, 234 235 { TS2(U,YA), 2, -1 } 236}; 237 238 239static LOC_TABLE_INDEX LOC_PREFIX(ru_dear_i)[] = 240{ 241 { RUS::K, 3 }, 242 { RUS::A, 2 }, 243 { RUS::V, 2 }, 244 { RUS::E, 2 }, 245 { RUS::I, 4 }, 246 { RUS::IY, 2 }, 247 { RUS::M, 4 }, 248 { RUS::O, 2 }, 249 { RUS::U, 2 }, 250 { RUS::H, 2 }, 251 { RUS::YU, 2 } 252}; 253 254 255static LOC_TABLE_ENTRY LOC_PREFIX(ru_dear)[] = 256{ 257 { TS3(CH,E,K), 3, -1 }, 258 { TS3(CH,O,K), 3, -1 }, 259 { TS3(N,O,K), 3, -1 }, 260 261 { TS3(CH,K,A), 3, -1 }, 262 { TS3(N, K,A), 3, -1 }, 263 { TS4(CH,K,O,V), 4, -1 }, 264 { TS4(N, K,O,V), 4, -1 }, 265 { TS3(CH,K,E), 3, -1 }, 266 { TS3(N, K,E), 3, -1 }, 267 { TS3(CH,K,I), 3, -1 }, 268 { TS3(N, K,I), 3, -1 }, 269 { TS5(CH,K,A,M,I), 5, -1 }, 270 { TS5(N, K,A,M,I), 5, -1 }, 271 { TS4(CH,K,O,IY), 4, -1 }, 272 { TS4(N, K,O,IY), 4, -1 }, 273 { TS4(CH,K,A,M), 4, -1 }, 274 { TS4(N, K,A,M), 4, -1 }, 275 { TS4(CH,K,O,M), 4, -1 }, 276 { TS4(N, K,O,M), 4, -1 }, 277 { TS3(CH,K,O), 3, -1 }, 278 { TS3(N, K,O), 3, -1 }, 279 { TS3(CH,K,U), 3, -1 }, 280 { TS3(N, K,U), 3, -1 }, 281 { TS4(CH,K,A,H), 4, -1 }, 282 { TS4(N, K,A,H), 4, -1 }, 283 { TS4(CH,K,O,YU), 4, -1 }, 284 { TS4(N, K,O,YU), 4, -1 } 285}; 286 287 288static LOC_TABLE_INDEX LOC_PREFIX(ru_noun_i)[] = 289{ 290 { RUS::A, 1 }, 291 { RUS::V, 2 }, 292 { RUS::E, 3 }, 293 { RUS::I, 6 }, 294 { RUS::IY, 4 }, 295 { RUS::M, 5 }, 296 { RUS::O, 1 }, 297 { RUS::U, 1 }, 298 { RUS::H, 3 }, 299 { RUS::Y, 1 }, 300 { RUS::MYA, 1 }, 301 { RUS::YU, 3 }, 302 { RUS::YA, 3 } 303}; 304 305 306static LOC_TABLE_ENTRY LOC_PREFIX(ru_noun)[] = 307{ 308 { TS1(A), 1, -1 }, 309 310 { TS2(E,V), 2, -1 }, 311 { TS2(O,V), 2, -1 }, 312 313 { TS2(I,E), 2, -1 }, 314 { TS2(MYA,E), 2, -1 }, 315 { TS1(E), 1, -1 }, 316 317 { TS4(I,YA,M,I),4, -1 }, 318 { TS3(YA,M,I), 3, -1 }, 319 { TS3(A,M,I), 3, -1 }, 320 { TS2(E,I), 2, -1 }, 321 { TS2(I,I), 2, -1 }, 322 { TS1(I), 1, -1 }, 323 324 { TS3(I,E,IY), 3, -1 }, 325 { TS2(E,IY), 2, -1 }, 326 { TS2(O,IY), 2, -1 }, 327 { TS2(I,IY), 2, -1 }, 328 329 { TS3(I,YA,M), 3, -1 }, 330 { TS2(YA,M), 2, -1 }, 331 { TS3(I,E,M), 3, -1 }, 332 { TS2(A,M), 2, -1 }, 333 { TS2(O,M), 2, -1 }, 334 335 { TS1(O), 1, -1 }, 336 337 { TS1(U), 1, -1 }, 338 339 { TS2(A,H), 2, -1 }, 340 { TS3(I,YA,H), 3, -1 }, 341 { TS2(YA,H), 2, -1 }, 342 343 { TS1(Y), 1, -1 }, 344 345 { TS1(MYA), 1, -1 }, 346 347 { TS2(I,YU), 2, -1 }, 348 { TS2(MYA,YU), 2, -1 }, 349 { TS1(YU), 1, -1 }, 350 351 { TS2(I,YA), 2, -1 }, 352 { TS2(MYA,YA), 2, -1 }, 353 { TS1(YA), 1, -1 } 354}; 355 356 357int stem_ru_table_i ( LOC_CHAR_TYPE * word, int len, LOC_TABLE_ENTRY * table, LOC_TABLE_INDEX * itable, int icount ) 358{ 359 int i, j, k, m; 360 LOC_CHAR_TYPE l = word[--len]; 361 362 for ( i=0, j=0; i<icount; i++ ) 363 { 364 if ( l==itable[i].first ) 365 { 366 m = itable[i].count; 367 i = j-1; 368 while ( m-- ) 369 { 370 i++; 371 j = table[i].len; 372 k = len; 373 if ( j>k ) 374 continue; 375 for ( ; j>=0; k--, j-- ) 376 if ( word[k]!=table[i].suffix[j] ) 377 break; 378 if ( j>=0 ) 379 continue; 380 return table[i].remove; 381 } 382 return 0; 383 } 384 j += itable[i].count; 385 } 386 return 0; 387} 388 389 390#undef STEM_RU_FUNC 391#define STEM_RU_FUNC(func,table) \ 392 int func ( LOC_CHAR_TYPE * word, int len ) \ 393 { \ 394 return stem_ru_table ( word, len, LOC_PREFIX(table), \ 395 sizeof(LOC_PREFIX(table))/sizeof(LOC_TABLE_ENTRY) ); \ 396 } 397 398#undef STEM_RU_FUNC_I 399#define STEM_RU_FUNC_I(table) \ 400 int LOC_PREFIX(stem_##table##_i) ( LOC_CHAR_TYPE * word, int len ) \ 401 { \ 402 return stem_ru_table_i ( word, len, LOC_PREFIX(table), LOC_PREFIX(table##_i), \ 403 sizeof(LOC_PREFIX(table##_i))/sizeof(LOC_TABLE_INDEX) ); \ 404 } 405 406 407STEM_RU_FUNC_I(ru_adj) 408STEM_RU_FUNC_I(ru_part) 409STEM_RU_FUNC_I(ru_dear) 410STEM_RU_FUNC_I(ru_verb) 411STEM_RU_FUNC_I(ru_noun) 412 413 414static int LOC_PREFIX(stem_ru_adjectival) ( LOC_CHAR_TYPE * word, int len ) 415{ 416 int i = LOC_PREFIX(stem_ru_adj_i) ( word, len ); 417 if ( i ) 418 i += LOC_PREFIX(stem_ru_part_i) ( word, len-i ); 419 return i; 420} 421 422 423static int LOC_PREFIX(stem_ru_verb_ov) ( LOC_CHAR_TYPE * word, int len ) 424{ 425 int i = LOC_PREFIX(stem_ru_verb_i) ( word, len ); 426 if ( i && (len>=i+2) && word[len-i-2] == RUS::O && word[len-i-1] == RUS::V ) 427 return i+2; 428 return i; 429} 430 431 432void LOC_PREFIX(stem_ru_init) () 433{ 434 int i; 435 436 #undef STEM_RU_INIT_TABLE 437 #define STEM_RU_INIT_TABLE(table) \ 438 for ( i=0; i<int(sizeof(LOC_PREFIX(table))/sizeof(LOC_TABLE_ENTRY)); i++ ) \ 439 LOC_PREFIX(table)[i].len = ((int)strlen((char*)LOC_PREFIX(table)[i].suffix)/sizeof(LOC_CHAR_TYPE))- 1; 440 441 STEM_RU_INIT_TABLE(ru_adj) 442 STEM_RU_INIT_TABLE(ru_part) 443 STEM_RU_INIT_TABLE(ru_verb) 444 STEM_RU_INIT_TABLE(ru_noun) 445 STEM_RU_INIT_TABLE(ru_dear) 446} 447 448 449void LOC_PREFIX(stem_ru) ( LOC_CHAR_TYPE * word ) 450{ 451 int r1, r2; 452 int i, len; 453 454 // IsVowel 455 #undef IV 456 #define IV(c) ( \ 457 c==RUS::A || c==RUS::E || c==RUS::YO || c==RUS::I || c==RUS::O || \ 458 c==RUS::U || c==RUS::Y || c==RUS::EE || c==RUS::YU || c==RUS::YA ) 459 460 // EndOfWord 461 #undef EOW 462 #define EOW(_arg) (!(*((unsigned char*)(_arg)))) 463 464 while ( !EOW(word) ) if ( IV(*word) ) break; else word++; 465 if ( !EOW(word) ) word++; else return; 466 len = 0; while ( !EOW(word+len) ) len++; 467 468 r1 = r2 = len; 469 for ( i=-1; i<len-1; i++ ) if ( IV(word[i]) && !IV(word[i+1]) ) { r1 = i+2; break; } 470 for ( i=r1; i<len-1; i++ ) if ( IV(word[i]) && !IV(word[i+1]) ) { r2 = i+2; break; } 471 472 #define C(p) word[len-p] 473 #define W(p,c) ( C(p)==c ) 474 #define XSUFF2(c2,c1) ( W(1,c1) && W(2,c2) ) 475 #define XSUFF3(c3,c2,c1) ( W(1,c1) && W(2,c2) && W(3,c3) ) 476 #define XSUFF4(c4,c3,c2,c1) ( W(1,c1) && W(2,c2) && W(3,c3) && W(4,c4) ) 477 #define XSUFF5(c5,c4,c3,c2,c1) ( W(1,c1) && W(2,c2) && W(3,c3) && W(4,c4) && W(5,c5) ) 478 #define BRK(_arg) { len -= _arg; break; } 479 #define CHK(_func) { i = LOC_PREFIX(_func) ( word, len ); if ( i ) BRK ( i ); } 480 481 for ( ;; ) 482 { 483 CHK ( stem_ru_dear_i ); 484 485 if ( C(1)==RUS::V && len>=2 ) 486 { 487 if ( C(2)==RUS::I || C(2)==RUS::Y || C(2)==RUS::YA ) 488 BRK(2); 489 490 if ( C(2)==RUS::A ) 491 { 492 if ( C(3)==RUS::V && C(4)==RUS::A ) 493 BRK(4); 494 BRK(2); 495 } 496 } 497 498 if ( len>=3 && XSUFF3 ( RUS::V, RUS::SH, RUS::I ) 499 && ( C(4)==RUS::A || C(4)==RUS::I || C(4)==RUS::Y || C(4)==RUS::YA ) ) 500 BRK(4); 501 502 if ( len>=5 && XSUFF5 ( RUS::V, RUS::SH, RUS::I, RUS::S, RUS::MYA ) 503 && ( C(6)==RUS::A || C(6)==RUS::I || C(6)==RUS::Y || C(6)==RUS::YA ) ) 504 BRK(6); 505 506 CHK ( stem_ru_adjectival ); 507 508 if ( len>=2 && ( XSUFF2 ( RUS::S, RUS::MYA ) || XSUFF2 ( RUS::S, RUS::YA ) ) ) 509 { 510 len -= 2; 511 CHK ( stem_ru_adjectival ); 512 CHK ( stem_ru_verb_ov ); 513 } else 514 { 515 CHK ( stem_ru_verb_ov ); 516 } 517 518 CHK ( stem_ru_noun_i ); 519 break; 520 } 521 522 if ( len>0 && ( W(1,RUS::IY) || W(1,RUS::I) ) ) 523 len--; 524 525 if ( len-r2>=3 && XSUFF3 ( RUS::O, RUS::S, RUS::T ) ) 526 len -= 3; 527 else if ( len-r2>=4 && XSUFF4 ( RUS::O, RUS::S, RUS::T, RUS::MYA ) ) 528 len -= 4; 529 530 if ( len>=3 && XSUFF3 ( RUS::E, RUS::IY, RUS::SH ) ) 531 len -= 3; 532 else if ( len>=4 && XSUFF4 ( RUS::E, RUS::IY, RUS::SH, RUS::E ) ) 533 len -= 4; 534 535 if ( len>=2 && XSUFF2 ( RUS::N, RUS::N ) ) 536 len--; 537 538 if ( len>0 && W(1,RUS::MYA) ) 539 len--; 540 541 *((unsigned char*)(word+len)) = '\0'; 542} 543 544// undefine externally defined stuff 545#undef LOC_CHAR_TYPE 546#undef LOC_PREFIX 547#undef RUS 548 549// 550// $Id$ 551// 552