1package porterstemmer
2
3
4
5import (
6//	"log"
7	"unicode"
8)
9
10
11
// isConsonant reports whether the rune at index i of s counts as a consonant.
//
// The runes 'a', 'e', 'i', 'o' and 'u' are never consonants. A 'y' is a
// consonant when it is the first rune of s; anywhere else it is a consonant
// exactly when the rune before it is NOT one. Every other rune (including
// upper-case letters and non-letters) is treated as a consonant.
//
// i must be a valid index into s.
func isConsonant(s []rune, i int) bool {
	switch s[i] {
	case 'a', 'e', 'i', 'o', 'u':
		return false
	case 'y':
		// The status of a non-leading 'y' is the opposite of its predecessor's.
		return i == 0 || !isConsonant(s, i-1)
	}

	return true
}
34
35
36
37func measure(s []rune) uint {
38
39	// Initialize.
40		lenS := len(s)
41		result := uint(0)
42		i := 0
43
44
45	// Short Circuit.
46		if 0 == lenS {
47/////////// RETURN
48			return result
49		}
50
51
52	// Ignore (potential) consonant sequence at the beginning of word.
53		for isConsonant(s, i) {
54
55			//DEBUG
56			//log.Printf("[measure([%s])] Eat Consonant [%d] -> [%s]", string(s), i, string(s[i]))
57
58			i++
59			if i >= lenS {
60/////////////// RETURN
61				return result
62			}
63		}
64
65
66	// For each pair of a vowel sequence followed by a consonant sequence, increment result.
67		Outer:
68		for i < lenS {
69
70			for !isConsonant(s, i) {
71
72				//DEBUG
73				//log.Printf("[measure([%s])] VOWEL [%d] -> [%s]", string(s), i, string(s[i]))
74
75				i++
76				if i >= lenS {
77		/////////// BREAK
78					break Outer
79				}
80			}
81			for isConsonant(s, i) {
82
83				//DEBUG
84				//log.Printf("[measure([%s])] CONSONANT [%d] -> [%s]", string(s), i, string(s[i]))
85
86				i++
87				if i >= lenS {
88					result++
89		/////////// BREAK
90					break Outer
91				}
92			}
93			result++
94		}
95
96
97	// Return
98		return result
99}
100
101
102
// hasSuffix reports whether s ends with suffix AND is strictly longer than it.
//
// A word that consists of nothing but the suffix is deliberately not a match,
// so callers that strip the suffix are always left with a non-empty stem.
// An empty suffix matches any non-empty s (previously this case panicked with
// an out-of-range index).
func hasSuffix(s, suffix []rune) bool {

	lenS := len(s)
	lenSuffix := len(suffix)

	// s must leave at least one rune in front of the suffix.
	if lenS <= lenSuffix {
		return false
	}

	tail := s[lenS-lenSuffix:]

	// Compare back to front: checking the last rune first rejects most
	// non-matching words after a single comparison.
	for i := lenSuffix - 1; i >= 0; i-- {
		if tail[i] != suffix[i] {
			return false
		}
	}

	return true
}
129
130
131
132func containsVowel(s []rune) bool {
133
134	lenS := len(s)
135
136	for i := 0 ; i < lenS ; i++ {
137
138		if !isConsonant(s, i) {
139/////////// RETURN
140			return true
141		}
142
143	}
144
145	return false
146}
147
148
149
150func hasRepeatDoubleConsonantSuffix(s []rune) bool {
151
152	// Initialize.
153		lenS := len(s)
154
155		result := false
156
157
158	// Do it!
159		if 2 > lenS {
160			result = false
161		} else if s[lenS-1] == s[lenS-2] && isConsonant(s, lenS-1) { // Will using isConsonant() cause a problem with "YY"?
162			result = true
163		} else {
164			result = false
165		}
166
167
168	// Return,
169		return result
170}
171
172
173
174func hasConsonantVowelConsonantSuffix(s []rune) bool {
175
176	// Initialize.
177		lenS := len(s)
178
179		result := false
180
181
182	// Do it!
183		if 3 > lenS {
184			result = false
185		} else if isConsonant(s, lenS-3) && !isConsonant(s, lenS-2) && isConsonant(s, lenS-1) {
186			result = true
187		} else  {
188			result = false
189		}
190
191
192	// Return
193		return result
194}
195
196
197
198func step1a(s []rune) []rune {
199
200	// Initialize.
201		var result []rune = s
202
203		lenS := len(s)
204
205
206	// Do it!
207		if suffix := []rune("sses") ; hasSuffix(s, suffix) {
208
209			lenTrim := 2
210
211			subSlice := s[:lenS-lenTrim]
212
213			result = subSlice
214		} else if suffix := []rune("ies") ; hasSuffix(s, suffix) {
215			lenTrim := 2
216
217			subSlice := s[:lenS-lenTrim]
218
219			result = subSlice
220		} else if suffix := []rune("ss") ; hasSuffix(s, suffix) {
221
222			result = s
223		} else if suffix := []rune("s") ; hasSuffix(s, suffix) {
224
225			lenSuffix := 1
226
227			subSlice := s[:lenS-lenSuffix]
228
229			result = subSlice
230		}
231
232
233	// Return.
234		return result
235}
236
237
238
239func step1b(s []rune) []rune {
240
241	// Initialize.
242		var result []rune = s
243
244		lenS := len(s)
245
246
247	// Do it!
248		if suffix := []rune("eed") ; hasSuffix(s, suffix) {
249			lenSuffix := len(suffix)
250
251			subSlice := s[:lenS-lenSuffix]
252
253			m := measure(subSlice)
254
255			if  0 < m {
256				lenTrim := 1
257
258				result = s[:lenS-lenTrim]
259			}
260		} else if suffix := []rune("ed") ; hasSuffix(s, suffix) {
261			lenSuffix := len(suffix)
262
263			subSlice := s[:lenS-lenSuffix]
264
265			if containsVowel(subSlice) {
266
267				if suffix2 := []rune("at") ; hasSuffix(subSlice, suffix2) {
268					lenTrim := -1
269
270					result = s[:lenS-lenSuffix-lenTrim]
271				} else if suffix2 := []rune("bl") ; hasSuffix(subSlice, suffix2) {
272					lenTrim := -1
273
274					result = s[:lenS-lenSuffix-lenTrim]
275				} else if suffix2 := []rune("iz") ; hasSuffix(subSlice, suffix2) {
276					lenTrim := -1
277
278					result = s[:lenS-lenSuffix-lenTrim]
279				} else if c := subSlice[len(subSlice)-1] ; 'l' != c && 's' != c && 'z' != c && hasRepeatDoubleConsonantSuffix(subSlice) {
280					lenTrim := 1
281
282					lenSubSlice := len(subSlice)
283
284					result = subSlice[:lenSubSlice-lenTrim]
285				} else if c := subSlice[len(subSlice)-1] ; 1 == measure(subSlice) && hasConsonantVowelConsonantSuffix(subSlice) && 'w' != c && 'x' != c && 'y' != c {
286					lenTrim := -1
287
288					result = s[:lenS-lenSuffix-lenTrim]
289
290					result[len(result)-1] = 'e'
291				} else {
292					result = subSlice
293				}
294
295			}
296		} else if suffix := []rune("ing") ; hasSuffix(s, suffix) {
297			lenSuffix := len(suffix)
298
299			subSlice := s[:lenS-lenSuffix]
300
301			if containsVowel(subSlice) {
302
303				if suffix2 := []rune("at") ; hasSuffix(subSlice, suffix2) {
304					lenTrim := -1
305
306					result = s[:lenS-lenSuffix-lenTrim]
307
308					result[len(result)-1] = 'e'
309				} else if suffix2 := []rune("bl") ; hasSuffix(subSlice, suffix2) {
310					lenTrim := -1
311
312					result = s[:lenS-lenSuffix-lenTrim]
313
314					result[len(result)-1] = 'e'
315				} else if suffix2 := []rune("iz") ; hasSuffix(subSlice, suffix2) {
316					lenTrim := -1
317
318					result = s[:lenS-lenSuffix-lenTrim]
319
320					result[len(result)-1] = 'e'
321				} else if c := subSlice[len(subSlice)-1] ; 'l' != c && 's' != c && 'z' != c && hasRepeatDoubleConsonantSuffix(subSlice) {
322					lenTrim := 1
323
324					lenSubSlice := len(subSlice)
325
326					result = subSlice[:lenSubSlice-lenTrim]
327				} else if c := subSlice[len(subSlice)-1] ; 1 == measure(subSlice) && hasConsonantVowelConsonantSuffix(subSlice) && 'w' != c && 'x' != c && 'y' != c {
328					lenTrim := -1
329
330					result = s[:lenS-lenSuffix-lenTrim]
331
332					result[len(result)-1] = 'e'
333				} else {
334					result = subSlice
335				}
336
337			}
338		}
339
340
341	// Return.
342		return result
343}
344
345
346
347func step1c(s []rune) []rune {
348
349	// Initialize.
350		lenS := len(s)
351
352		result := s
353
354
355	// Do it!
356		if 2 > lenS {
357/////////// RETURN
358			return result
359		}
360
361		if 'y' == s[lenS-1] && containsVowel(s[:lenS-1])  {
362
363			result[lenS-1] = 'i';
364
365		} else if 'Y' == s[lenS-1] && containsVowel(s[:lenS-1])  {
366
367			result[lenS-1] = 'I';
368
369		}
370
371
372	// Return.
373		return result
374}
375
376
377
378func step2(s []rune) []rune {
379
380	// Initialize.
381		lenS := len(s)
382
383		result := s
384
385
386	// Do it!
387		if suffix := []rune("ational") ; hasSuffix(s, suffix) {
388			if 0 < measure(s[:lenS-len(suffix)]) {
389				result[lenS-5] = 'e'
390				result = result[:lenS-4]
391			}
392		} else if suffix := []rune("tional") ; hasSuffix(s, suffix) {
393			if 0 < measure(s[:lenS-len(suffix)]) {
394				result = result[:lenS-2]
395			}
396		} else if suffix := []rune("enci") ; hasSuffix(s, suffix) {
397			if 0 < measure(s[:lenS-len(suffix)]) {
398				result[lenS-1] = 'e'
399			}
400		} else if suffix := []rune("anci") ; hasSuffix(s, suffix) {
401			if 0 < measure(s[:lenS-len(suffix)]) {
402				result[lenS-1] = 'e'
403			}
404		} else if suffix := []rune("izer") ; hasSuffix(s, suffix) {
405			if 0 < measure(s[:lenS-len(suffix)]) {
406				result = s[:lenS-1]
407			}
408		} else if suffix := []rune("bli") ; hasSuffix(s, suffix) { // --DEPARTURE--
409//		} else if suffix := []rune("abli") ; hasSuffix(s, suffix) {
410			if 0 < measure(s[:lenS-len(suffix)]) {
411				result[lenS-1] = 'e'
412			}
413		} else if suffix := []rune("alli") ; hasSuffix(s, suffix) {
414			if 0 < measure(s[:lenS-len(suffix)]) {
415				result = s[:lenS-2]
416			}
417		} else if suffix := []rune("entli") ; hasSuffix(s, suffix) {
418			if 0 < measure(s[:lenS-len(suffix)]) {
419				result = s[:lenS-2]
420			}
421		} else if suffix := []rune("eli") ; hasSuffix(s, suffix) {
422			if 0 < measure(s[:lenS-len(suffix)]) {
423				result = s[:lenS-2]
424			}
425		} else if suffix := []rune("ousli") ; hasSuffix(s, suffix) {
426			if 0 < measure(s[:lenS-len(suffix)]) {
427				result = s[:lenS-2]
428			}
429		} else if suffix := []rune("ization") ; hasSuffix(s, suffix) {
430			if 0 < measure(s[:lenS-len(suffix)]) {
431				result[lenS-5] = 'e'
432
433				result = s[:lenS-4]
434			}
435		} else if suffix := []rune("ation") ; hasSuffix(s, suffix) {
436			if 0 < measure(s[:lenS-len(suffix)]) {
437				result[lenS-3] = 'e'
438
439				result = s[:lenS-2]
440			}
441		} else if suffix := []rune("ator") ; hasSuffix(s, suffix) {
442			if 0 < measure(s[:lenS-len(suffix)]) {
443				result[lenS-2] = 'e'
444
445				result = s[:lenS-1]
446			}
447		} else if suffix := []rune("alism") ; hasSuffix(s, suffix) {
448			if 0 < measure(s[:lenS-len(suffix)]) {
449				result = s[:lenS-3]
450			}
451		} else if suffix := []rune("iveness") ; hasSuffix(s, suffix) {
452			if 0 < measure(s[:lenS-len(suffix)]) {
453				result = s[:lenS-4]
454			}
455		} else if suffix := []rune("fulness") ; hasSuffix(s, suffix) {
456			if 0 < measure(s[:lenS-len(suffix)]) {
457				result = s[:lenS-4]
458			}
459		} else if suffix := []rune("ousness") ; hasSuffix(s, suffix) {
460			if 0 < measure(s[:lenS-len(suffix)]) {
461				result = s[:lenS-4]
462			}
463		} else if suffix := []rune("aliti") ; hasSuffix(s, suffix) {
464			if 0 < measure(s[:lenS-len(suffix)]) {
465				result = s[:lenS-3]
466			}
467		} else if suffix := []rune("iviti") ; hasSuffix(s, suffix) {
468			if 0 < measure(s[:lenS-len(suffix)]) {
469				result[lenS-3] = 'e'
470
471				result = result[:lenS-2]
472			}
473		} else if suffix := []rune("biliti") ; hasSuffix(s, suffix) {
474			if 0 < measure(s[:lenS-len(suffix)]) {
475				result[lenS-5] = 'l'
476				result[lenS-4] = 'e'
477
478				result = result[:lenS-3]
479			}
480		} else if suffix := []rune("logi") ; hasSuffix(s, suffix) { // --DEPARTURE--
481			if 0 < measure(s[:lenS-len(suffix)]) {
482				lenTrim := 1
483
484				result = s[:lenS-lenTrim]
485			}
486		}
487
488
489	// Return.
490		return result
491}
492
493
494
495func step3(s []rune) []rune {
496
497	// Initialize.
498		lenS := len(s)
499		result := s
500
501
502	// Do it!
503		if suffix := []rune("icate") ; hasSuffix(s, suffix) {
504			lenSuffix := len(suffix)
505
506			if 0 < measure(s[:lenS-lenSuffix]) {
507				result = result[:lenS-3]
508			}
509		} else if suffix := []rune("ative") ; hasSuffix(s, suffix) {
510			lenSuffix := len(suffix)
511
512			subSlice := s[:lenS-lenSuffix]
513
514			m := measure(subSlice)
515
516			if 0 < m {
517				result = subSlice
518			}
519		} else if suffix := []rune("alize") ; hasSuffix(s, suffix) {
520			lenSuffix := len(suffix)
521
522			if 0 < measure(s[:lenS-lenSuffix]) {
523				result = result[:lenS-3]
524			}
525		} else if suffix := []rune("iciti") ; hasSuffix(s, suffix) {
526			lenSuffix := len(suffix)
527
528			if 0 < measure(s[:lenS-lenSuffix]) {
529				result = result[:lenS-3]
530			}
531		} else if suffix := []rune("ical") ; hasSuffix(s, suffix) {
532			lenSuffix := len(suffix)
533
534			if 0 < measure(s[:lenS-lenSuffix]) {
535				result = result[:lenS-2]
536			}
537		} else if suffix := []rune("ful") ; hasSuffix(s, suffix) {
538			lenSuffix := len(suffix)
539
540			subSlice := s[:lenS-lenSuffix]
541
542			m := measure(subSlice)
543
544			if 0 < m {
545				result = subSlice
546			}
547		} else if suffix := []rune("ness") ; hasSuffix(s, suffix) {
548			lenSuffix := len(suffix)
549
550			subSlice := s[:lenS-lenSuffix]
551
552			m := measure(subSlice)
553
554			if 0 < m {
555				result = subSlice
556			}
557		}
558
559
560	// Return.
561		return result
562}
563
564
565
566func step4(s []rune) []rune {
567
568	// Initialize.
569		lenS := len(s)
570		result := s
571
572
573	// Do it!
574		if suffix := []rune("al") ; hasSuffix(s, suffix) {
575			lenSuffix := len(suffix)
576
577			subSlice := s[:lenS-lenSuffix]
578
579			m := measure(subSlice)
580
581			if 1 < m {
582				result = result[:lenS-lenSuffix]
583			}
584		} else if suffix := []rune("ance") ; hasSuffix(s, suffix) {
585			lenSuffix := len(suffix)
586
587			subSlice := s[:lenS-lenSuffix]
588
589			m := measure(subSlice)
590
591			if 1 < m {
592				result = result[:lenS-lenSuffix]
593			}
594		} else if suffix := []rune("ence") ; hasSuffix(s, suffix) {
595			lenSuffix := len(suffix)
596
597			subSlice := s[:lenS-lenSuffix]
598
599			m := measure(subSlice)
600
601			if 1 < m {
602				result = result[:lenS-lenSuffix]
603			}
604		} else if suffix := []rune("er") ; hasSuffix(s, suffix) {
605			lenSuffix := len(suffix)
606
607			subSlice := s[:lenS-lenSuffix]
608
609			m := measure(subSlice)
610
611			if 1 < m {
612				result = subSlice
613			}
614		} else if suffix := []rune("ic") ; hasSuffix(s, suffix) {
615			lenSuffix := len(suffix)
616
617			subSlice := s[:lenS-lenSuffix]
618
619			m := measure(subSlice)
620
621			if 1 < m {
622				result = subSlice
623			}
624		} else if suffix := []rune("able") ; hasSuffix(s, suffix) {
625			lenSuffix := len(suffix)
626
627			subSlice := s[:lenS-lenSuffix]
628
629			m := measure(subSlice)
630
631			if 1 < m {
632				result = subSlice
633			}
634		} else if suffix := []rune("ible") ; hasSuffix(s, suffix) {
635			lenSuffix := len(suffix)
636
637			subSlice := s[:lenS-lenSuffix]
638
639			m := measure(subSlice)
640
641			if 1 < m {
642				result = subSlice
643			}
644		} else if suffix := []rune("ant") ; hasSuffix(s, suffix) {
645			lenSuffix := len(suffix)
646
647			subSlice := s[:lenS-lenSuffix]
648
649			m := measure(subSlice)
650
651			if 1 < m {
652				result = subSlice
653			}
654		} else if suffix := []rune("ement") ; hasSuffix(s, suffix) {
655			lenSuffix := len(suffix)
656
657			subSlice := s[:lenS-lenSuffix]
658
659			m := measure(subSlice)
660
661			if 1 < m {
662				result = subSlice
663			}
664		} else if suffix := []rune("ment") ; hasSuffix(s, suffix) {
665			lenSuffix := len(suffix)
666
667			subSlice := s[:lenS-lenSuffix]
668
669			m := measure(subSlice)
670
671			if 1 < m {
672				result = subSlice
673			}
674		} else if suffix := []rune("ent") ; hasSuffix(s, suffix) {
675			lenSuffix := len(suffix)
676
677			subSlice := s[:lenS-lenSuffix]
678
679			m := measure(subSlice)
680
681			if 1 < m {
682				result = subSlice
683			}
684		} else if suffix := []rune("ion") ; hasSuffix(s, suffix) {
685			lenSuffix := len(suffix)
686
687			subSlice := s[:lenS-lenSuffix]
688
689			m := measure(subSlice)
690
691			c := subSlice[len(subSlice)-1]
692
693			if 1 < m && ('s' == c || 't' == c) {
694				result = subSlice
695			}
696		} else if suffix := []rune("ou") ; hasSuffix(s, suffix) {
697			lenSuffix := len(suffix)
698
699			subSlice := s[:lenS-lenSuffix]
700
701			m := measure(subSlice)
702
703			if 1 < m {
704				result = subSlice
705			}
706		} else if suffix := []rune("ism") ; hasSuffix(s, suffix) {
707			lenSuffix := len(suffix)
708
709			subSlice := s[:lenS-lenSuffix]
710
711			m := measure(subSlice)
712
713			if 1 < m {
714				result = subSlice
715			}
716		} else if suffix := []rune("ate") ; hasSuffix(s, suffix) {
717			lenSuffix := len(suffix)
718
719			subSlice := s[:lenS-lenSuffix]
720
721			m := measure(subSlice)
722
723			if 1 < m {
724				result = subSlice
725			}
726		} else if suffix := []rune("iti") ; hasSuffix(s, suffix) {
727			lenSuffix := len(suffix)
728
729			subSlice := s[:lenS-lenSuffix]
730
731			m := measure(subSlice)
732
733			if 1 < m {
734				result = subSlice
735			}
736		} else if suffix := []rune("ous") ; hasSuffix(s, suffix) {
737			lenSuffix := len(suffix)
738
739			subSlice := s[:lenS-lenSuffix]
740
741			m := measure(subSlice)
742
743			if 1 < m {
744				result = subSlice
745			}
746		} else if suffix := []rune("ive") ; hasSuffix(s, suffix) {
747			lenSuffix := len(suffix)
748
749			subSlice := s[:lenS-lenSuffix]
750
751			m := measure(subSlice)
752
753			if 1 < m {
754				result = subSlice
755			}
756		} else if suffix := []rune("ize") ; hasSuffix(s, suffix) {
757			lenSuffix := len(suffix)
758
759			subSlice := s[:lenS-lenSuffix]
760
761			m := measure(subSlice)
762
763			if 1 < m {
764				result = subSlice
765			}
766		}
767
768
769	// Return.
770		return result
771}
772
773
774
775func step5a(s []rune) []rune {
776
777	// Initialize.
778		lenS := len(s)
779		result := s
780
781
782	// Do it!
783		if 'e' == s[lenS-1] {
784			lenSuffix := 1
785
786			subSlice := s[:lenS-lenSuffix]
787
788			m := measure(subSlice)
789
790			if 1 < m {
791				result = subSlice
792			} else if c := subSlice[len(subSlice)-1] ; 1 == m && !( hasConsonantVowelConsonantSuffix(subSlice) && 'w' != c && 'x' != c && 'y' != c)  {
793				result = subSlice
794			}
795		}
796
797
798	// Return.
799		return result
800}
801
802
803
804func step5b(s []rune) []rune {
805
806	// Initialize.
807		lenS := len(s)
808		result := s
809
810
811	// Do it!
812		if 2 < lenS && 'l' == s[lenS-2] && 'l' == s[lenS-1] {
813
814			lenSuffix := 1
815
816			subSlice := s[:lenS-lenSuffix]
817
818			m := measure(subSlice)
819
820			if 1 < m {
821				result = subSlice
822			}
823		}
824
825
826	// Return.
827		return result
828}
829
830
831
832
833func StemString(s string) string {
834
835	// Convert string to []rune
836		runeArr := []rune(s)
837
838	// Stem.
839		runeArr = Stem(runeArr)
840
841	// Convert []rune to string
842		str := string(runeArr)
843
844	// Return.
845		return str
846}
847
848func Stem(s []rune) []rune {
849
850	// Initialize.
851		lenS := len(s)
852
853
854	// Short circuit.
855		if 0 == lenS {
856/////////// RETURN
857			return s
858		}
859
860
861	// Make all runes lowercase.
862		for i := 0 ; i < lenS ; i++ {
863			s[i] = unicode.ToLower(s[i])
864		}
865
866
867	// Stem
868		result := StemWithoutLowerCasing(s)
869
870
871	// Return.
872		return result
873}
874
875func StemWithoutLowerCasing(s []rune) []rune {
876
877	// Initialize.
878		lenS := len(s)
879
880
881	// Words that are of length 2 or less is already stemmed.
882	// Don't do anything.
883		if 2 >= lenS {
884/////////// RETURN
885			return s
886		}
887
888
889	// Stem
890		s = step1a(s)
891		s = step1b(s)
892		s = step1c(s)
893		s = step2(s)
894		s = step3(s)
895		s = step4(s)
896		s = step5a(s)
897		s = step5b(s)
898
899
900	// Return.
901		return s
902}
903
904