1// Copyright 2012 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package strings
6
7// stringFinder efficiently finds strings in a source text. It's implemented
8// using the Boyer-Moore string search algorithm:
9// http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
10// http://www.cs.utexas.edu/~moore/publications/fstrpos.pdf (note: this aged
11// document uses 1-based indexing)
12type stringFinder struct {
13	// pattern is the string that we are searching for in the text.
14	pattern string
15
16	// badCharSkip[b] contains the distance between the last byte of pattern
17	// and the rightmost occurrence of b in pattern. If b is not in pattern,
18	// badCharSkip[b] is len(pattern).
19	//
20	// Whenever a mismatch is found with byte b in the text, we can safely
21	// shift the matching frame at least badCharSkip[b] until the next time
22	// the matching char could be in alignment.
23	badCharSkip [256]int
24
25	// goodSuffixSkip[i] defines how far we can shift the matching frame given
26	// that the suffix pattern[i+1:] matches, but the byte pattern[i] does
27	// not. There are two cases to consider:
28	//
29	// 1. The matched suffix occurs elsewhere in pattern (with a different
30	// byte preceding it that we might possibly match). In this case, we can
31	// shift the matching frame to align with the next suffix chunk. For
32	// example, the pattern "mississi" has the suffix "issi" next occurring
33	// (in right-to-left order) at index 1, so goodSuffixSkip[3] ==
34	// shift+len(suffix) == 3+4 == 7.
35	//
36	// 2. If the matched suffix does not occur elsewhere in pattern, then the
37	// matching frame may share part of its prefix with the end of the
38	// matching suffix. In this case, goodSuffixSkip[i] will contain how far
39	// to shift the frame to align this portion of the prefix to the
40	// suffix. For example, in the pattern "abcxxxabc", when the first
41	// mismatch from the back is found to be in position 3, the matching
42	// suffix "xxabc" is not found elsewhere in the pattern. However, its
43	// rightmost "abc" (at position 6) is a prefix of the whole pattern, so
44	// goodSuffixSkip[3] == shift+len(suffix) == 6+5 == 11.
45	goodSuffixSkip []int
46}
47
48func makeStringFinder(pattern string) *stringFinder {
49	f := &stringFinder{
50		pattern:        pattern,
51		goodSuffixSkip: make([]int, len(pattern)),
52	}
53	// last is the index of the last character in the pattern.
54	last := len(pattern) - 1
55
56	// Build bad character table.
57	// Bytes not in the pattern can skip one pattern's length.
58	for i := range f.badCharSkip {
59		f.badCharSkip[i] = len(pattern)
60	}
61	// The loop condition is < instead of <= so that the last byte does not
62	// have a zero distance to itself. Finding this byte out of place implies
63	// that it is not in the last position.
64	for i := 0; i < last; i++ {
65		f.badCharSkip[pattern[i]] = last - i
66	}
67
68	// Build good suffix table.
69	// First pass: set each value to the next index which starts a prefix of
70	// pattern.
71	lastPrefix := last
72	for i := last; i >= 0; i-- {
73		if HasPrefix(pattern, pattern[i+1:]) {
74			lastPrefix = i + 1
75		}
76		// lastPrefix is the shift, and (last-i) is len(suffix).
77		f.goodSuffixSkip[i] = lastPrefix + last - i
78	}
79	// Second pass: find repeats of pattern's suffix starting from the front.
80	for i := 0; i < last; i++ {
81		lenSuffix := longestCommonSuffix(pattern, pattern[1:i+1])
82		if pattern[i-lenSuffix] != pattern[last-lenSuffix] {
83			// (last-i) is the shift, and lenSuffix is len(suffix).
84			f.goodSuffixSkip[last-lenSuffix] = lenSuffix + last - i
85		}
86	}
87
88	return f
89}
90
91func longestCommonSuffix(a, b string) (i int) {
92	for ; i < len(a) && i < len(b); i++ {
93		if a[len(a)-1-i] != b[len(b)-1-i] {
94			break
95		}
96	}
97	return
98}
99
100// next returns the index in text of the first occurrence of the pattern. If
101// the pattern is not found, it returns -1.
102func (f *stringFinder) next(text string) int {
103	i := len(f.pattern) - 1
104	for i < len(text) {
105		// Compare backwards from the end until the first unmatching character.
106		j := len(f.pattern) - 1
107		for j >= 0 && text[i] == f.pattern[j] {
108			i--
109			j--
110		}
111		if j < 0 {
112			return i + 1 // match
113		}
114		i += max(f.badCharSkip[text[i]], f.goodSuffixSkip[j])
115	}
116	return -1
117}
118
119func max(a, b int) int {
120	if a > b {
121		return a
122	}
123	return b
124}
125