1package boom 2 3import ( 4 "math" 5 "math/rand" 6) 7 8// MinHash is a variation of the technique for estimating similarity between 9// two sets as presented by Broder in On the resemblance and containment of 10// documents: 11// 12// http://gatekeeper.dec.com/ftp/pub/dec/SRC/publications/broder/positano-final-wpnums.pdf 13// 14// This can be used to cluster or compare documents by splitting the corpus 15// into a bag of words. MinHash returns the approximated similarity ratio of 16// the two bags. The similarity is less accurate for very small bags of words. 17func MinHash(bag1, bag2 []string) float32 { 18 k := len(bag1) + len(bag2) 19 hashes := make([]int, k) 20 for i := 0; i < k; i++ { 21 a := uint(rand.Int()) 22 b := uint(rand.Int()) 23 c := uint(rand.Int()) 24 x := computeHash(a*b*c, a, b, c) 25 hashes[i] = int(x) 26 } 27 28 bitMap := bitMap(bag1, bag2) 29 minHashValues := hashBuckets(2, k) 30 minHash(bag1, 0, minHashValues, bitMap, k, hashes) 31 minHash(bag2, 1, minHashValues, bitMap, k, hashes) 32 return similarity(minHashValues, k) 33} 34 35func minHash(bag []string, bagIndex int, minHashValues [][]int, 36 bitArray map[string][]bool, k int, hashes []int) { 37 index := 0 38 for element := range bitArray { 39 for i := 0; i < k; i++ { 40 if contains(bag, element) { 41 hindex := hashes[index] 42 if hindex < minHashValues[bagIndex][index] { 43 minHashValues[bagIndex][index] = hindex 44 } 45 } 46 } 47 index++ 48 } 49} 50 51func contains(bag []string, element string) bool { 52 for _, e := range bag { 53 if e == element { 54 return true 55 } 56 } 57 return false 58} 59 60func bitMap(bag1, bag2 []string) map[string][]bool { 61 bitArray := map[string][]bool{} 62 for _, element := range bag1 { 63 bitArray[element] = []bool{true, false} 64 } 65 66 for _, element := range bag2 { 67 if _, ok := bitArray[element]; ok { 68 bitArray[element] = []bool{true, true} 69 } else if _, ok := bitArray[element]; !ok { 70 bitArray[element] = []bool{false, true} 71 } 72 } 73 74 return bitArray 75} 76 77func hashBuckets(numSets, k int) [][]int { 78 minHashValues := make([][]int, numSets) 79 for i := 0; i < numSets; i++ { 80 minHashValues[i] = make([]int, k) 81 } 82 83 for i := 0; i < numSets; i++ { 84 for j := 0; j < k; j++ { 85 minHashValues[i][j] = math.MaxInt32 86 } 87 } 88 return minHashValues 89} 90 91func computeHash(x, a, b, u uint) uint { 92 return (a*x + b) >> (32 - u) 93} 94 95func similarity(minHashValues [][]int, k int) float32 { 96 identicalMinHashes := 0 97 for i := 0; i < k; i++ { 98 if minHashValues[0][i] == minHashValues[1][i] { 99 identicalMinHashes++ 100 } 101 } 102 103 return (1.0 * float32(identicalMinHashes)) / float32(k) 104} 105