1package closestmatch
2
3import (
4	"compress/gzip"
5	"encoding/json"
6	"math/rand"
7	"os"
8	"sort"
9	"strings"
10	"sync"
11)
12
13// ClosestMatch is the structure that contains the
14// substring sizes and carrys a map of the substrings for
15// easy lookup
16type ClosestMatch struct {
17	SubstringSizes []int
18	SubstringToID  map[string]map[uint32]struct{}
19	ID             map[uint32]IDInfo
20	mux            sync.Mutex
21}
22
23// IDInfo carries the information about the keys
24type IDInfo struct {
25	Key           string
26	NumSubstrings int
27}
28
29// New returns a new structure for performing closest matches
30func New(possible []string, subsetSize []int) *ClosestMatch {
31	cm := new(ClosestMatch)
32	cm.SubstringSizes = subsetSize
33	cm.SubstringToID = make(map[string]map[uint32]struct{})
34	cm.ID = make(map[uint32]IDInfo)
35	for i, s := range possible {
36		substrings := cm.splitWord(strings.ToLower(s))
37		cm.ID[uint32(i)] = IDInfo{Key: s, NumSubstrings: len(substrings)}
38		for substring := range substrings {
39			if _, ok := cm.SubstringToID[substring]; !ok {
40				cm.SubstringToID[substring] = make(map[uint32]struct{})
41			}
42			cm.SubstringToID[substring][uint32(i)] = struct{}{}
43		}
44	}
45
46	return cm
47}
48
49// Load can load a previously saved ClosestMatch object from disk
50func Load(filename string) (*ClosestMatch, error) {
51	cm := new(ClosestMatch)
52
53	f, err := os.Open(filename)
54	defer f.Close()
55	if err != nil {
56		return cm, err
57	}
58
59	w, err := gzip.NewReader(f)
60	if err != nil {
61		return cm, err
62	}
63
64	err = json.NewDecoder(w).Decode(&cm)
65	return cm, err
66}
67
68// Add more words to ClosestMatch structure
69func (cm *ClosestMatch) Add(possible []string) {
70	cm.mux.Lock()
71	for i, s := range possible {
72		substrings := cm.splitWord(strings.ToLower(s))
73		cm.ID[uint32(i)] = IDInfo{Key: s, NumSubstrings: len(substrings)}
74		for substring := range substrings {
75			if _, ok := cm.SubstringToID[substring]; !ok {
76				cm.SubstringToID[substring] = make(map[uint32]struct{})
77			}
78			cm.SubstringToID[substring][uint32(i)] = struct{}{}
79		}
80	}
81	cm.mux.Unlock()
82}
83
84// Save writes the current ClosestSave object as a gzipped JSON file
85func (cm *ClosestMatch) Save(filename string) error {
86	f, err := os.Create(filename)
87	if err != nil {
88		return err
89	}
90	defer f.Close()
91	w := gzip.NewWriter(f)
92	defer w.Close()
93	enc := json.NewEncoder(w)
94	// enc.SetIndent("", " ")
95	return enc.Encode(cm)
96}
97
98func (cm *ClosestMatch) worker(id int, jobs <-chan job, results chan<- result) {
99	for j := range jobs {
100		m := make(map[string]int)
101		cm.mux.Lock()
102		if ids, ok := cm.SubstringToID[j.substring]; ok {
103			weight := 1000 / len(ids)
104			for id := range ids {
105				if _, ok2 := m[cm.ID[id].Key]; !ok2 {
106					m[cm.ID[id].Key] = 0
107				}
108				m[cm.ID[id].Key] += 1 + 1000/len(cm.ID[id].Key) + weight
109			}
110		}
111		cm.mux.Unlock()
112		results <- result{m: m}
113	}
114}
115
116type job struct {
117	substring string
118}
119
120type result struct {
121	m map[string]int
122}
123
124func (cm *ClosestMatch) match(searchWord string) map[string]int {
125	searchSubstrings := cm.splitWord(searchWord)
126	searchSubstringsLen := len(searchSubstrings)
127
128	jobs := make(chan job, searchSubstringsLen)
129	results := make(chan result, searchSubstringsLen)
130	workers := 8
131
132	for w := 1; w <= workers; w++ {
133		go cm.worker(w, jobs, results)
134	}
135
136	for substring := range searchSubstrings {
137		jobs <- job{substring: substring}
138	}
139	close(jobs)
140
141	m := make(map[string]int)
142	for a := 1; a <= searchSubstringsLen; a++ {
143		r := <-results
144		for key := range r.m {
145			if _, ok := m[key]; ok {
146				m[key] += r.m[key]
147			} else {
148				m[key] = r.m[key]
149			}
150		}
151	}
152
153	return m
154}
155
156// Closest searches for the `searchWord` and returns the closest match
157func (cm *ClosestMatch) Closest(searchWord string) string {
158	for _, pair := range rankByWordCount(cm.match(searchWord)) {
159		return pair.Key
160	}
161	return ""
162}
163
164// ClosestN searches for the `searchWord` and returns the n closests matches
165func (cm *ClosestMatch) ClosestN(searchWord string, max int) []string {
166	matches := make([]string, 0, max)
167	for i, pair := range rankByWordCount(cm.match(searchWord)) {
168		if i >= max {
169			break
170		}
171		matches = append(matches, pair.Key)
172	}
173	return matches
174}
175
176func rankByWordCount(wordFrequencies map[string]int) PairList {
177	pl := make(PairList, len(wordFrequencies))
178	i := 0
179	for k, v := range wordFrequencies {
180		pl[i] = Pair{k, v}
181		i++
182	}
183	sort.Sort(sort.Reverse(pl))
184	return pl
185}
186
187type Pair struct {
188	Key   string
189	Value int
190}
191
192type PairList []Pair
193
194func (p PairList) Len() int           { return len(p) }
195func (p PairList) Less(i, j int) bool { return p[i].Value < p[j].Value }
196func (p PairList) Swap(i, j int)      { p[i], p[j] = p[j], p[i] }
197
198func (cm *ClosestMatch) splitWord(word string) map[string]struct{} {
199	wordHash := make(map[string]struct{})
200	for _, j := range cm.SubstringSizes {
201		for i := 0; i < len(word)-j+1; i++ {
202			substring := string(word[i : i+j])
203			if len(strings.TrimSpace(substring)) > 0 {
204				wordHash[string(word[i:i+j])] = struct{}{}
205			}
206		}
207	}
208	if len(wordHash) == 0 {
209		wordHash[word] = struct{}{}
210	}
211	return wordHash
212}
213
214// AccuracyMutatingWords runs some basic tests against the wordlist to
215// see how accurate this bag-of-characters method is against
216// the target dataset
217func (cm *ClosestMatch) AccuracyMutatingWords() float64 {
218	rand.Seed(1)
219	percentCorrect := 0.0
220	numTrials := 0.0
221
222	for wordTrials := 0; wordTrials < 200; wordTrials++ {
223
224		var testString, originalTestString string
225		cm.mux.Lock()
226		testStringNum := rand.Intn(len(cm.ID))
227		i := 0
228		for id := range cm.ID {
229			i++
230			if i != testStringNum {
231				continue
232			}
233			originalTestString = cm.ID[id].Key
234			break
235		}
236		cm.mux.Unlock()
237
238		var words []string
239		choice := rand.Intn(3)
240		if choice == 0 {
241			// remove a random word
242			words = strings.Split(originalTestString, " ")
243			if len(words) < 3 {
244				continue
245			}
246			deleteWordI := rand.Intn(len(words))
247			words = append(words[:deleteWordI], words[deleteWordI+1:]...)
248			testString = strings.Join(words, " ")
249		} else if choice == 1 {
250			// remove a random word and reverse
251			words = strings.Split(originalTestString, " ")
252			if len(words) > 1 {
253				deleteWordI := rand.Intn(len(words))
254				words = append(words[:deleteWordI], words[deleteWordI+1:]...)
255				for left, right := 0, len(words)-1; left < right; left, right = left+1, right-1 {
256					words[left], words[right] = words[right], words[left]
257				}
258			} else {
259				continue
260			}
261			testString = strings.Join(words, " ")
262		} else {
263			// remove a random word and shuffle and replace 2 random letters
264			words = strings.Split(originalTestString, " ")
265			if len(words) > 1 {
266				deleteWordI := rand.Intn(len(words))
267				words = append(words[:deleteWordI], words[deleteWordI+1:]...)
268				for i := range words {
269					j := rand.Intn(i + 1)
270					words[i], words[j] = words[j], words[i]
271				}
272			}
273			testString = strings.Join(words, " ")
274			letters := "abcdefghijklmnopqrstuvwxyz"
275			if len(testString) == 0 {
276				continue
277			}
278			ii := rand.Intn(len(testString))
279			testString = testString[:ii] + string(letters[rand.Intn(len(letters))]) + testString[ii+1:]
280			ii = rand.Intn(len(testString))
281			testString = testString[:ii] + string(letters[rand.Intn(len(letters))]) + testString[ii+1:]
282		}
283		closest := cm.Closest(testString)
284		if closest == originalTestString {
285			percentCorrect += 1.0
286		} else {
287			//fmt.Printf("Original: %s, Mutilated: %s, Match: %s\n", originalTestString, testString, closest)
288		}
289		numTrials += 1.0
290	}
291	return 100.0 * percentCorrect / numTrials
292}
293
294// AccuracyMutatingLetters runs some basic tests against the wordlist to
295// see how accurate this bag-of-characters method is against
296// the target dataset when mutating individual letters (adding, removing, changing)
297func (cm *ClosestMatch) AccuracyMutatingLetters() float64 {
298	rand.Seed(1)
299	percentCorrect := 0.0
300	numTrials := 0.0
301
302	for wordTrials := 0; wordTrials < 200; wordTrials++ {
303
304		var testString, originalTestString string
305		cm.mux.Lock()
306		testStringNum := rand.Intn(len(cm.ID))
307		i := 0
308		for id := range cm.ID {
309			i++
310			if i != testStringNum {
311				continue
312			}
313			originalTestString = cm.ID[id].Key
314			break
315		}
316		cm.mux.Unlock()
317		testString = originalTestString
318
319		// letters to replace with
320		letters := "abcdefghijklmnopqrstuvwxyz"
321
322		choice := rand.Intn(3)
323		if choice == 0 {
324			// replace random letter
325			ii := rand.Intn(len(testString))
326			testString = testString[:ii] + string(letters[rand.Intn(len(letters))]) + testString[ii+1:]
327		} else if choice == 1 {
328			// delete random letter
329			ii := rand.Intn(len(testString))
330			testString = testString[:ii] + testString[ii+1:]
331		} else {
332			// add random letter
333			ii := rand.Intn(len(testString))
334			testString = testString[:ii] + string(letters[rand.Intn(len(letters))]) + testString[ii:]
335		}
336		closest := cm.Closest(testString)
337		if closest == originalTestString {
338			percentCorrect += 1.0
339		} else {
340			//fmt.Printf("Original: %s, Mutilated: %s, Match: %s\n", originalTestString, testString, closest)
341		}
342		numTrials += 1.0
343	}
344
345	return 100.0 * percentCorrect / numTrials
346}
347