1package porterstemmer 2 3 4 5import ( 6// "log" 7 "unicode" 8) 9 10 11 12func isConsonant(s []rune, i int) bool { 13 14 //DEBUG 15 //log.Printf("isConsonant: [%+v]", string(s[i])) 16 17 result := true 18 19 switch ( s[i] ) { 20 case 'a', 'e', 'i', 'o', 'u': 21 result = false 22 case 'y': 23 if 0 == i { 24 result = true 25 } else { 26 result = !isConsonant(s, i-1) 27 } 28 default: 29 result = true 30 } 31 32 return result 33} 34 35 36 37func measure(s []rune) uint { 38 39 // Initialize. 40 lenS := len(s) 41 result := uint(0) 42 i := 0 43 44 45 // Short Circuit. 46 if 0 == lenS { 47/////////// RETURN 48 return result 49 } 50 51 52 // Ignore (potential) consonant sequence at the beginning of word. 53 for isConsonant(s, i) { 54 55 //DEBUG 56 //log.Printf("[measure([%s])] Eat Consonant [%d] -> [%s]", string(s), i, string(s[i])) 57 58 i++ 59 if i >= lenS { 60/////////////// RETURN 61 return result 62 } 63 } 64 65 66 // For each pair of a vowel sequence followed by a consonant sequence, increment result. 67 Outer: 68 for i < lenS { 69 70 for !isConsonant(s, i) { 71 72 //DEBUG 73 //log.Printf("[measure([%s])] VOWEL [%d] -> [%s]", string(s), i, string(s[i])) 74 75 i++ 76 if i >= lenS { 77 /////////// BREAK 78 break Outer 79 } 80 } 81 for isConsonant(s, i) { 82 83 //DEBUG 84 //log.Printf("[measure([%s])] CONSONANT [%d] -> [%s]", string(s), i, string(s[i])) 85 86 i++ 87 if i >= lenS { 88 result++ 89 /////////// BREAK 90 break Outer 91 } 92 } 93 result++ 94 } 95 96 97 // Return 98 return result 99} 100 101 102 103func hasSuffix(s, suffix []rune) bool { 104 105 lenSMinusOne := len(s) - 1 106 lenSuffixMinusOne := len(suffix) - 1 107 108 if lenSMinusOne <= lenSuffixMinusOne { 109 return false 110 } else if s[lenSMinusOne] != suffix[lenSuffixMinusOne] { // I suspect checking this first should speed this function up in practice. 111/////// RETURN 112 return false 113 } else { 114 115 for i := 0; i < lenSuffixMinusOne ; i++ { 116 117 if suffix[i] != s[lenSMinusOne-lenSuffixMinusOne+i] { 118/////////////// RETURN 119 return false 120 } 121 122 } 123 124 } 125 126 127 return true 128} 129 130 131 132func containsVowel(s []rune) bool { 133 134 lenS := len(s) 135 136 for i := 0 ; i < lenS ; i++ { 137 138 if !isConsonant(s, i) { 139/////////// RETURN 140 return true 141 } 142 143 } 144 145 return false 146} 147 148 149 150func hasRepeatDoubleConsonantSuffix(s []rune) bool { 151 152 // Initialize. 153 lenS := len(s) 154 155 result := false 156 157 158 // Do it! 159 if 2 > lenS { 160 result = false 161 } else if s[lenS-1] == s[lenS-2] && isConsonant(s, lenS-1) { // Will using isConsonant() cause a problem with "YY"? 162 result = true 163 } else { 164 result = false 165 } 166 167 168 // Return, 169 return result 170} 171 172 173 174func hasConsonantVowelConsonantSuffix(s []rune) bool { 175 176 // Initialize. 177 lenS := len(s) 178 179 result := false 180 181 182 // Do it! 183 if 3 > lenS { 184 result = false 185 } else if isConsonant(s, lenS-3) && !isConsonant(s, lenS-2) && isConsonant(s, lenS-1) { 186 result = true 187 } else { 188 result = false 189 } 190 191 192 // Return 193 return result 194} 195 196 197 198func step1a(s []rune) []rune { 199 200 // Initialize. 201 var result []rune = s 202 203 lenS := len(s) 204 205 206 // Do it! 207 if suffix := []rune("sses") ; hasSuffix(s, suffix) { 208 209 lenTrim := 2 210 211 subSlice := s[:lenS-lenTrim] 212 213 result = subSlice 214 } else if suffix := []rune("ies") ; hasSuffix(s, suffix) { 215 lenTrim := 2 216 217 subSlice := s[:lenS-lenTrim] 218 219 result = subSlice 220 } else if suffix := []rune("ss") ; hasSuffix(s, suffix) { 221 222 result = s 223 } else if suffix := []rune("s") ; hasSuffix(s, suffix) { 224 225 lenSuffix := 1 226 227 subSlice := s[:lenS-lenSuffix] 228 229 result = subSlice 230 } 231 232 233 // Return. 234 return result 235} 236 237 238 239func step1b(s []rune) []rune { 240 241 // Initialize. 242 var result []rune = s 243 244 lenS := len(s) 245 246 247 // Do it! 248 if suffix := []rune("eed") ; hasSuffix(s, suffix) { 249 lenSuffix := len(suffix) 250 251 subSlice := s[:lenS-lenSuffix] 252 253 m := measure(subSlice) 254 255 if 0 < m { 256 lenTrim := 1 257 258 result = s[:lenS-lenTrim] 259 } 260 } else if suffix := []rune("ed") ; hasSuffix(s, suffix) { 261 lenSuffix := len(suffix) 262 263 subSlice := s[:lenS-lenSuffix] 264 265 if containsVowel(subSlice) { 266 267 if suffix2 := []rune("at") ; hasSuffix(subSlice, suffix2) { 268 lenTrim := -1 269 270 result = s[:lenS-lenSuffix-lenTrim] 271 } else if suffix2 := []rune("bl") ; hasSuffix(subSlice, suffix2) { 272 lenTrim := -1 273 274 result = s[:lenS-lenSuffix-lenTrim] 275 } else if suffix2 := []rune("iz") ; hasSuffix(subSlice, suffix2) { 276 lenTrim := -1 277 278 result = s[:lenS-lenSuffix-lenTrim] 279 } else if c := subSlice[len(subSlice)-1] ; 'l' != c && 's' != c && 'z' != c && hasRepeatDoubleConsonantSuffix(subSlice) { 280 lenTrim := 1 281 282 lenSubSlice := len(subSlice) 283 284 result = subSlice[:lenSubSlice-lenTrim] 285 } else if c := subSlice[len(subSlice)-1] ; 1 == measure(subSlice) && hasConsonantVowelConsonantSuffix(subSlice) && 'w' != c && 'x' != c && 'y' != c { 286 lenTrim := -1 287 288 result = s[:lenS-lenSuffix-lenTrim] 289 290 result[len(result)-1] = 'e' 291 } else { 292 result = subSlice 293 } 294 295 } 296 } else if suffix := []rune("ing") ; hasSuffix(s, suffix) { 297 lenSuffix := len(suffix) 298 299 subSlice := s[:lenS-lenSuffix] 300 301 if containsVowel(subSlice) { 302 303 if suffix2 := []rune("at") ; hasSuffix(subSlice, suffix2) { 304 lenTrim := -1 305 306 result = s[:lenS-lenSuffix-lenTrim] 307 308 result[len(result)-1] = 'e' 309 } else if suffix2 := []rune("bl") ; hasSuffix(subSlice, suffix2) { 310 lenTrim := -1 311 312 result = s[:lenS-lenSuffix-lenTrim] 313 314 result[len(result)-1] = 'e' 315 } else if suffix2 := []rune("iz") ; hasSuffix(subSlice, suffix2) { 316 lenTrim := -1 317 318 result = s[:lenS-lenSuffix-lenTrim] 319 320 result[len(result)-1] = 'e' 321 } else if c := subSlice[len(subSlice)-1] ; 'l' != c && 's' != c && 'z' != c && hasRepeatDoubleConsonantSuffix(subSlice) { 322 lenTrim := 1 323 324 lenSubSlice := len(subSlice) 325 326 result = subSlice[:lenSubSlice-lenTrim] 327 } else if c := subSlice[len(subSlice)-1] ; 1 == measure(subSlice) && hasConsonantVowelConsonantSuffix(subSlice) && 'w' != c && 'x' != c && 'y' != c { 328 lenTrim := -1 329 330 result = s[:lenS-lenSuffix-lenTrim] 331 332 result[len(result)-1] = 'e' 333 } else { 334 result = subSlice 335 } 336 337 } 338 } 339 340 341 // Return. 342 return result 343} 344 345 346 347func step1c(s []rune) []rune { 348 349 // Initialize. 350 lenS := len(s) 351 352 result := s 353 354 355 // Do it! 356 if 2 > lenS { 357/////////// RETURN 358 return result 359 } 360 361 if 'y' == s[lenS-1] && containsVowel(s[:lenS-1]) { 362 363 result[lenS-1] = 'i'; 364 365 } else if 'Y' == s[lenS-1] && containsVowel(s[:lenS-1]) { 366 367 result[lenS-1] = 'I'; 368 369 } 370 371 372 // Return. 373 return result 374} 375 376 377 378func step2(s []rune) []rune { 379 380 // Initialize. 381 lenS := len(s) 382 383 result := s 384 385 386 // Do it! 387 if suffix := []rune("ational") ; hasSuffix(s, suffix) { 388 if 0 < measure(s[:lenS-len(suffix)]) { 389 result[lenS-5] = 'e' 390 result = result[:lenS-4] 391 } 392 } else if suffix := []rune("tional") ; hasSuffix(s, suffix) { 393 if 0 < measure(s[:lenS-len(suffix)]) { 394 result = result[:lenS-2] 395 } 396 } else if suffix := []rune("enci") ; hasSuffix(s, suffix) { 397 if 0 < measure(s[:lenS-len(suffix)]) { 398 result[lenS-1] = 'e' 399 } 400 } else if suffix := []rune("anci") ; hasSuffix(s, suffix) { 401 if 0 < measure(s[:lenS-len(suffix)]) { 402 result[lenS-1] = 'e' 403 } 404 } else if suffix := []rune("izer") ; hasSuffix(s, suffix) { 405 if 0 < measure(s[:lenS-len(suffix)]) { 406 result = s[:lenS-1] 407 } 408 } else if suffix := []rune("bli") ; hasSuffix(s, suffix) { // --DEPARTURE-- 409// } else if suffix := []rune("abli") ; hasSuffix(s, suffix) { 410 if 0 < measure(s[:lenS-len(suffix)]) { 411 result[lenS-1] = 'e' 412 } 413 } else if suffix := []rune("alli") ; hasSuffix(s, suffix) { 414 if 0 < measure(s[:lenS-len(suffix)]) { 415 result = s[:lenS-2] 416 } 417 } else if suffix := []rune("entli") ; hasSuffix(s, suffix) { 418 if 0 < measure(s[:lenS-len(suffix)]) { 419 result = s[:lenS-2] 420 } 421 } else if suffix := []rune("eli") ; hasSuffix(s, suffix) { 422 if 0 < measure(s[:lenS-len(suffix)]) { 423 result = s[:lenS-2] 424 } 425 } else if suffix := []rune("ousli") ; hasSuffix(s, suffix) { 426 if 0 < measure(s[:lenS-len(suffix)]) { 427 result = s[:lenS-2] 428 } 429 } else if suffix := []rune("ization") ; hasSuffix(s, suffix) { 430 if 0 < measure(s[:lenS-len(suffix)]) { 431 result[lenS-5] = 'e' 432 433 result = s[:lenS-4] 434 } 435 } else if suffix := []rune("ation") ; hasSuffix(s, suffix) { 436 if 0 < measure(s[:lenS-len(suffix)]) { 437 result[lenS-3] = 'e' 438 439 result = s[:lenS-2] 440 } 441 } else if suffix := []rune("ator") ; hasSuffix(s, suffix) { 442 if 0 < measure(s[:lenS-len(suffix)]) { 443 result[lenS-2] = 'e' 444 445 result = s[:lenS-1] 446 } 447 } else if suffix := []rune("alism") ; hasSuffix(s, suffix) { 448 if 0 < measure(s[:lenS-len(suffix)]) { 449 result = s[:lenS-3] 450 } 451 } else if suffix := []rune("iveness") ; hasSuffix(s, suffix) { 452 if 0 < measure(s[:lenS-len(suffix)]) { 453 result = s[:lenS-4] 454 } 455 } else if suffix := []rune("fulness") ; hasSuffix(s, suffix) { 456 if 0 < measure(s[:lenS-len(suffix)]) { 457 result = s[:lenS-4] 458 } 459 } else if suffix := []rune("ousness") ; hasSuffix(s, suffix) { 460 if 0 < measure(s[:lenS-len(suffix)]) { 461 result = s[:lenS-4] 462 } 463 } else if suffix := []rune("aliti") ; hasSuffix(s, suffix) { 464 if 0 < measure(s[:lenS-len(suffix)]) { 465 result = s[:lenS-3] 466 } 467 } else if suffix := []rune("iviti") ; hasSuffix(s, suffix) { 468 if 0 < measure(s[:lenS-len(suffix)]) { 469 result[lenS-3] = 'e' 470 471 result = result[:lenS-2] 472 } 473 } else if suffix := []rune("biliti") ; hasSuffix(s, suffix) { 474 if 0 < measure(s[:lenS-len(suffix)]) { 475 result[lenS-5] = 'l' 476 result[lenS-4] = 'e' 477 478 result = result[:lenS-3] 479 } 480 } else if suffix := []rune("logi") ; hasSuffix(s, suffix) { // --DEPARTURE-- 481 if 0 < measure(s[:lenS-len(suffix)]) { 482 lenTrim := 1 483 484 result = s[:lenS-lenTrim] 485 } 486 } 487 488 489 // Return. 490 return result 491} 492 493 494 495func step3(s []rune) []rune { 496 497 // Initialize. 498 lenS := len(s) 499 result := s 500 501 502 // Do it! 503 if suffix := []rune("icate") ; hasSuffix(s, suffix) { 504 lenSuffix := len(suffix) 505 506 if 0 < measure(s[:lenS-lenSuffix]) { 507 result = result[:lenS-3] 508 } 509 } else if suffix := []rune("ative") ; hasSuffix(s, suffix) { 510 lenSuffix := len(suffix) 511 512 subSlice := s[:lenS-lenSuffix] 513 514 m := measure(subSlice) 515 516 if 0 < m { 517 result = subSlice 518 } 519 } else if suffix := []rune("alize") ; hasSuffix(s, suffix) { 520 lenSuffix := len(suffix) 521 522 if 0 < measure(s[:lenS-lenSuffix]) { 523 result = result[:lenS-3] 524 } 525 } else if suffix := []rune("iciti") ; hasSuffix(s, suffix) { 526 lenSuffix := len(suffix) 527 528 if 0 < measure(s[:lenS-lenSuffix]) { 529 result = result[:lenS-3] 530 } 531 } else if suffix := []rune("ical") ; hasSuffix(s, suffix) { 532 lenSuffix := len(suffix) 533 534 if 0 < measure(s[:lenS-lenSuffix]) { 535 result = result[:lenS-2] 536 } 537 } else if suffix := []rune("ful") ; hasSuffix(s, suffix) { 538 lenSuffix := len(suffix) 539 540 subSlice := s[:lenS-lenSuffix] 541 542 m := measure(subSlice) 543 544 if 0 < m { 545 result = subSlice 546 } 547 } else if suffix := []rune("ness") ; hasSuffix(s, suffix) { 548 lenSuffix := len(suffix) 549 550 subSlice := s[:lenS-lenSuffix] 551 552 m := measure(subSlice) 553 554 if 0 < m { 555 result = subSlice 556 } 557 } 558 559 560 // Return. 561 return result 562} 563 564 565 566func step4(s []rune) []rune { 567 568 // Initialize. 569 lenS := len(s) 570 result := s 571 572 573 // Do it! 574 if suffix := []rune("al") ; hasSuffix(s, suffix) { 575 lenSuffix := len(suffix) 576 577 subSlice := s[:lenS-lenSuffix] 578 579 m := measure(subSlice) 580 581 if 1 < m { 582 result = result[:lenS-lenSuffix] 583 } 584 } else if suffix := []rune("ance") ; hasSuffix(s, suffix) { 585 lenSuffix := len(suffix) 586 587 subSlice := s[:lenS-lenSuffix] 588 589 m := measure(subSlice) 590 591 if 1 < m { 592 result = result[:lenS-lenSuffix] 593 } 594 } else if suffix := []rune("ence") ; hasSuffix(s, suffix) { 595 lenSuffix := len(suffix) 596 597 subSlice := s[:lenS-lenSuffix] 598 599 m := measure(subSlice) 600 601 if 1 < m { 602 result = result[:lenS-lenSuffix] 603 } 604 } else if suffix := []rune("er") ; hasSuffix(s, suffix) { 605 lenSuffix := len(suffix) 606 607 subSlice := s[:lenS-lenSuffix] 608 609 m := measure(subSlice) 610 611 if 1 < m { 612 result = subSlice 613 } 614 } else if suffix := []rune("ic") ; hasSuffix(s, suffix) { 615 lenSuffix := len(suffix) 616 617 subSlice := s[:lenS-lenSuffix] 618 619 m := measure(subSlice) 620 621 if 1 < m { 622 result = subSlice 623 } 624 } else if suffix := []rune("able") ; hasSuffix(s, suffix) { 625 lenSuffix := len(suffix) 626 627 subSlice := s[:lenS-lenSuffix] 628 629 m := measure(subSlice) 630 631 if 1 < m { 632 result = subSlice 633 } 634 } else if suffix := []rune("ible") ; hasSuffix(s, suffix) { 635 lenSuffix := len(suffix) 636 637 subSlice := s[:lenS-lenSuffix] 638 639 m := measure(subSlice) 640 641 if 1 < m { 642 result = subSlice 643 } 644 } else if suffix := []rune("ant") ; hasSuffix(s, suffix) { 645 lenSuffix := len(suffix) 646 647 subSlice := s[:lenS-lenSuffix] 648 649 m := measure(subSlice) 650 651 if 1 < m { 652 result = subSlice 653 } 654 } else if suffix := []rune("ement") ; hasSuffix(s, suffix) { 655 lenSuffix := len(suffix) 656 657 subSlice := s[:lenS-lenSuffix] 658 659 m := measure(subSlice) 660 661 if 1 < m { 662 result = subSlice 663 } 664 } else if suffix := []rune("ment") ; hasSuffix(s, suffix) { 665 lenSuffix := len(suffix) 666 667 subSlice := s[:lenS-lenSuffix] 668 669 m := measure(subSlice) 670 671 if 1 < m { 672 result = subSlice 673 } 674 } else if suffix := []rune("ent") ; hasSuffix(s, suffix) { 675 lenSuffix := len(suffix) 676 677 subSlice := s[:lenS-lenSuffix] 678 679 m := measure(subSlice) 680 681 if 1 < m { 682 result = subSlice 683 } 684 } else if suffix := []rune("ion") ; hasSuffix(s, suffix) { 685 lenSuffix := len(suffix) 686 687 subSlice := s[:lenS-lenSuffix] 688 689 m := measure(subSlice) 690 691 c := subSlice[len(subSlice)-1] 692 693 if 1 < m && ('s' == c || 't' == c) { 694 result = subSlice 695 } 696 } else if suffix := []rune("ou") ; hasSuffix(s, suffix) { 697 lenSuffix := len(suffix) 698 699 subSlice := s[:lenS-lenSuffix] 700 701 m := measure(subSlice) 702 703 if 1 < m { 704 result = subSlice 705 } 706 } else if suffix := []rune("ism") ; hasSuffix(s, suffix) { 707 lenSuffix := len(suffix) 708 709 subSlice := s[:lenS-lenSuffix] 710 711 m := measure(subSlice) 712 713 if 1 < m { 714 result = subSlice 715 } 716 } else if suffix := []rune("ate") ; hasSuffix(s, suffix) { 717 lenSuffix := len(suffix) 718 719 subSlice := s[:lenS-lenSuffix] 720 721 m := measure(subSlice) 722 723 if 1 < m { 724 result = subSlice 725 } 726 } else if suffix := []rune("iti") ; hasSuffix(s, suffix) { 727 lenSuffix := len(suffix) 728 729 subSlice := s[:lenS-lenSuffix] 730 731 m := measure(subSlice) 732 733 if 1 < m { 734 result = subSlice 735 } 736 } else if suffix := []rune("ous") ; hasSuffix(s, suffix) { 737 lenSuffix := len(suffix) 738 739 subSlice := s[:lenS-lenSuffix] 740 741 m := measure(subSlice) 742 743 if 1 < m { 744 result = subSlice 745 } 746 } else if suffix := []rune("ive") ; hasSuffix(s, suffix) { 747 lenSuffix := len(suffix) 748 749 subSlice := s[:lenS-lenSuffix] 750 751 m := measure(subSlice) 752 753 if 1 < m { 754 result = subSlice 755 } 756 } else if suffix := []rune("ize") ; hasSuffix(s, suffix) { 757 lenSuffix := len(suffix) 758 759 subSlice := s[:lenS-lenSuffix] 760 761 m := measure(subSlice) 762 763 if 1 < m { 764 result = subSlice 765 } 766 } 767 768 769 // Return. 770 return result 771} 772 773 774 775func step5a(s []rune) []rune { 776 777 // Initialize. 778 lenS := len(s) 779 result := s 780 781 782 // Do it! 783 if 'e' == s[lenS-1] { 784 lenSuffix := 1 785 786 subSlice := s[:lenS-lenSuffix] 787 788 m := measure(subSlice) 789 790 if 1 < m { 791 result = subSlice 792 } else if c := subSlice[len(subSlice)-1] ; 1 == m && !( hasConsonantVowelConsonantSuffix(subSlice) && 'w' != c && 'x' != c && 'y' != c) { 793 result = subSlice 794 } 795 } 796 797 798 // Return. 799 return result 800} 801 802 803 804func step5b(s []rune) []rune { 805 806 // Initialize. 807 lenS := len(s) 808 result := s 809 810 811 // Do it! 812 if 2 < lenS && 'l' == s[lenS-2] && 'l' == s[lenS-1] { 813 814 lenSuffix := 1 815 816 subSlice := s[:lenS-lenSuffix] 817 818 m := measure(subSlice) 819 820 if 1 < m { 821 result = subSlice 822 } 823 } 824 825 826 // Return. 827 return result 828} 829 830 831 832 833func StemString(s string) string { 834 835 // Convert string to []rune 836 runeArr := []rune(s) 837 838 // Stem. 839 runeArr = Stem(runeArr) 840 841 // Convert []rune to string 842 str := string(runeArr) 843 844 // Return. 845 return str 846} 847 848func Stem(s []rune) []rune { 849 850 // Initialize. 851 lenS := len(s) 852 853 854 // Short circuit. 855 if 0 == lenS { 856/////////// RETURN 857 return s 858 } 859 860 861 // Make all runes lowercase. 862 for i := 0 ; i < lenS ; i++ { 863 s[i] = unicode.ToLower(s[i]) 864 } 865 866 867 // Stem 868 result := StemWithoutLowerCasing(s) 869 870 871 // Return. 872 return result 873} 874 875func StemWithoutLowerCasing(s []rune) []rune { 876 877 // Initialize. 878 lenS := len(s) 879 880 881 // Words that are of length 2 or less is already stemmed. 882 // Don't do anything. 883 if 2 >= lenS { 884/////////// RETURN 885 return s 886 } 887 888 889 // Stem 890 s = step1a(s) 891 s = step1b(s) 892 s = step1c(s) 893 s = step2(s) 894 s = step3(s) 895 s = step4(s) 896 s = step5a(s) 897 s = step5b(s) 898 899 900 // Return. 901 return s 902} 903 904