1package closestmatch 2 3import ( 4 "compress/gzip" 5 "encoding/json" 6 "math/rand" 7 "os" 8 "sort" 9 "strings" 10 "sync" 11) 12 13// ClosestMatch is the structure that contains the 14// substring sizes and carrys a map of the substrings for 15// easy lookup 16type ClosestMatch struct { 17 SubstringSizes []int 18 SubstringToID map[string]map[uint32]struct{} 19 ID map[uint32]IDInfo 20 mux sync.Mutex 21} 22 23// IDInfo carries the information about the keys 24type IDInfo struct { 25 Key string 26 NumSubstrings int 27} 28 29// New returns a new structure for performing closest matches 30func New(possible []string, subsetSize []int) *ClosestMatch { 31 cm := new(ClosestMatch) 32 cm.SubstringSizes = subsetSize 33 cm.SubstringToID = make(map[string]map[uint32]struct{}) 34 cm.ID = make(map[uint32]IDInfo) 35 for i, s := range possible { 36 substrings := cm.splitWord(strings.ToLower(s)) 37 cm.ID[uint32(i)] = IDInfo{Key: s, NumSubstrings: len(substrings)} 38 for substring := range substrings { 39 if _, ok := cm.SubstringToID[substring]; !ok { 40 cm.SubstringToID[substring] = make(map[uint32]struct{}) 41 } 42 cm.SubstringToID[substring][uint32(i)] = struct{}{} 43 } 44 } 45 46 return cm 47} 48 49// Load can load a previously saved ClosestMatch object from disk 50func Load(filename string) (*ClosestMatch, error) { 51 cm := new(ClosestMatch) 52 53 f, err := os.Open(filename) 54 defer f.Close() 55 if err != nil { 56 return cm, err 57 } 58 59 w, err := gzip.NewReader(f) 60 if err != nil { 61 return cm, err 62 } 63 64 err = json.NewDecoder(w).Decode(&cm) 65 return cm, err 66} 67 68// Add more words to ClosestMatch structure 69func (cm *ClosestMatch) Add(possible []string) { 70 cm.mux.Lock() 71 for i, s := range possible { 72 substrings := cm.splitWord(strings.ToLower(s)) 73 cm.ID[uint32(i)] = IDInfo{Key: s, NumSubstrings: len(substrings)} 74 for substring := range substrings { 75 if _, ok := cm.SubstringToID[substring]; !ok { 76 cm.SubstringToID[substring] = make(map[uint32]struct{}) 77 } 78 cm.SubstringToID[substring][uint32(i)] = struct{}{} 79 } 80 } 81 cm.mux.Unlock() 82} 83 84// Save writes the current ClosestSave object as a gzipped JSON file 85func (cm *ClosestMatch) Save(filename string) error { 86 f, err := os.Create(filename) 87 if err != nil { 88 return err 89 } 90 defer f.Close() 91 w := gzip.NewWriter(f) 92 defer w.Close() 93 enc := json.NewEncoder(w) 94 // enc.SetIndent("", " ") 95 return enc.Encode(cm) 96} 97 98func (cm *ClosestMatch) worker(id int, jobs <-chan job, results chan<- result) { 99 for j := range jobs { 100 m := make(map[string]int) 101 cm.mux.Lock() 102 if ids, ok := cm.SubstringToID[j.substring]; ok { 103 weight := 1000 / len(ids) 104 for id := range ids { 105 if _, ok2 := m[cm.ID[id].Key]; !ok2 { 106 m[cm.ID[id].Key] = 0 107 } 108 m[cm.ID[id].Key] += 1 + 1000/len(cm.ID[id].Key) + weight 109 } 110 } 111 cm.mux.Unlock() 112 results <- result{m: m} 113 } 114} 115 116type job struct { 117 substring string 118} 119 120type result struct { 121 m map[string]int 122} 123 124func (cm *ClosestMatch) match(searchWord string) map[string]int { 125 searchSubstrings := cm.splitWord(searchWord) 126 searchSubstringsLen := len(searchSubstrings) 127 128 jobs := make(chan job, searchSubstringsLen) 129 results := make(chan result, searchSubstringsLen) 130 workers := 8 131 132 for w := 1; w <= workers; w++ { 133 go cm.worker(w, jobs, results) 134 } 135 136 for substring := range searchSubstrings { 137 jobs <- job{substring: substring} 138 } 139 close(jobs) 140 141 m := make(map[string]int) 142 for a := 1; a <= searchSubstringsLen; a++ { 143 r := <-results 144 for key := range r.m { 145 if _, ok := m[key]; ok { 146 m[key] += r.m[key] 147 } else { 148 m[key] = r.m[key] 149 } 150 } 151 } 152 153 return m 154} 155 156// Closest searches for the `searchWord` and returns the closest match 157func (cm *ClosestMatch) Closest(searchWord string) string { 158 for _, pair := range rankByWordCount(cm.match(searchWord)) { 159 return pair.Key 160 } 161 return "" 162} 163 164// ClosestN searches for the `searchWord` and returns the n closests matches 165func (cm *ClosestMatch) ClosestN(searchWord string, max int) []string { 166 matches := make([]string, 0, max) 167 for i, pair := range rankByWordCount(cm.match(searchWord)) { 168 if i >= max { 169 break 170 } 171 matches = append(matches, pair.Key) 172 } 173 return matches 174} 175 176func rankByWordCount(wordFrequencies map[string]int) PairList { 177 pl := make(PairList, len(wordFrequencies)) 178 i := 0 179 for k, v := range wordFrequencies { 180 pl[i] = Pair{k, v} 181 i++ 182 } 183 sort.Sort(sort.Reverse(pl)) 184 return pl 185} 186 187type Pair struct { 188 Key string 189 Value int 190} 191 192type PairList []Pair 193 194func (p PairList) Len() int { return len(p) } 195func (p PairList) Less(i, j int) bool { return p[i].Value < p[j].Value } 196func (p PairList) Swap(i, j int) { p[i], p[j] = p[j], p[i] } 197 198func (cm *ClosestMatch) splitWord(word string) map[string]struct{} { 199 wordHash := make(map[string]struct{}) 200 for _, j := range cm.SubstringSizes { 201 for i := 0; i < len(word)-j+1; i++ { 202 substring := string(word[i : i+j]) 203 if len(strings.TrimSpace(substring)) > 0 { 204 wordHash[string(word[i:i+j])] = struct{}{} 205 } 206 } 207 } 208 if len(wordHash) == 0 { 209 wordHash[word] = struct{}{} 210 } 211 return wordHash 212} 213 214// AccuracyMutatingWords runs some basic tests against the wordlist to 215// see how accurate this bag-of-characters method is against 216// the target dataset 217func (cm *ClosestMatch) AccuracyMutatingWords() float64 { 218 rand.Seed(1) 219 percentCorrect := 0.0 220 numTrials := 0.0 221 222 for wordTrials := 0; wordTrials < 200; wordTrials++ { 223 224 var testString, originalTestString string 225 cm.mux.Lock() 226 testStringNum := rand.Intn(len(cm.ID)) 227 i := 0 228 for id := range cm.ID { 229 i++ 230 if i != testStringNum { 231 continue 232 } 233 originalTestString = cm.ID[id].Key 234 break 235 } 236 cm.mux.Unlock() 237 238 var words []string 239 choice := rand.Intn(3) 240 if choice == 0 { 241 // remove a random word 242 words = strings.Split(originalTestString, " ") 243 if len(words) < 3 { 244 continue 245 } 246 deleteWordI := rand.Intn(len(words)) 247 words = append(words[:deleteWordI], words[deleteWordI+1:]...) 248 testString = strings.Join(words, " ") 249 } else if choice == 1 { 250 // remove a random word and reverse 251 words = strings.Split(originalTestString, " ") 252 if len(words) > 1 { 253 deleteWordI := rand.Intn(len(words)) 254 words = append(words[:deleteWordI], words[deleteWordI+1:]...) 255 for left, right := 0, len(words)-1; left < right; left, right = left+1, right-1 { 256 words[left], words[right] = words[right], words[left] 257 } 258 } else { 259 continue 260 } 261 testString = strings.Join(words, " ") 262 } else { 263 // remove a random word and shuffle and replace 2 random letters 264 words = strings.Split(originalTestString, " ") 265 if len(words) > 1 { 266 deleteWordI := rand.Intn(len(words)) 267 words = append(words[:deleteWordI], words[deleteWordI+1:]...) 268 for i := range words { 269 j := rand.Intn(i + 1) 270 words[i], words[j] = words[j], words[i] 271 } 272 } 273 testString = strings.Join(words, " ") 274 letters := "abcdefghijklmnopqrstuvwxyz" 275 if len(testString) == 0 { 276 continue 277 } 278 ii := rand.Intn(len(testString)) 279 testString = testString[:ii] + string(letters[rand.Intn(len(letters))]) + testString[ii+1:] 280 ii = rand.Intn(len(testString)) 281 testString = testString[:ii] + string(letters[rand.Intn(len(letters))]) + testString[ii+1:] 282 } 283 closest := cm.Closest(testString) 284 if closest == originalTestString { 285 percentCorrect += 1.0 286 } else { 287 //fmt.Printf("Original: %s, Mutilated: %s, Match: %s\n", originalTestString, testString, closest) 288 } 289 numTrials += 1.0 290 } 291 return 100.0 * percentCorrect / numTrials 292} 293 294// AccuracyMutatingLetters runs some basic tests against the wordlist to 295// see how accurate this bag-of-characters method is against 296// the target dataset when mutating individual letters (adding, removing, changing) 297func (cm *ClosestMatch) AccuracyMutatingLetters() float64 { 298 rand.Seed(1) 299 percentCorrect := 0.0 300 numTrials := 0.0 301 302 for wordTrials := 0; wordTrials < 200; wordTrials++ { 303 304 var testString, originalTestString string 305 cm.mux.Lock() 306 testStringNum := rand.Intn(len(cm.ID)) 307 i := 0 308 for id := range cm.ID { 309 i++ 310 if i != testStringNum { 311 continue 312 } 313 originalTestString = cm.ID[id].Key 314 break 315 } 316 cm.mux.Unlock() 317 testString = originalTestString 318 319 // letters to replace with 320 letters := "abcdefghijklmnopqrstuvwxyz" 321 322 choice := rand.Intn(3) 323 if choice == 0 { 324 // replace random letter 325 ii := rand.Intn(len(testString)) 326 testString = testString[:ii] + string(letters[rand.Intn(len(letters))]) + testString[ii+1:] 327 } else if choice == 1 { 328 // delete random letter 329 ii := rand.Intn(len(testString)) 330 testString = testString[:ii] + testString[ii+1:] 331 } else { 332 // add random letter 333 ii := rand.Intn(len(testString)) 334 testString = testString[:ii] + string(letters[rand.Intn(len(letters))]) + testString[ii:] 335 } 336 closest := cm.Closest(testString) 337 if closest == originalTestString { 338 percentCorrect += 1.0 339 } else { 340 //fmt.Printf("Original: %s, Mutilated: %s, Match: %s\n", originalTestString, testString, closest) 341 } 342 numTrials += 1.0 343 } 344 345 return 100.0 * percentCorrect / numTrials 346} 347