1package boom
2
3import (
4	"math"
5	"math/rand"
6)
7
8// MinHash is a variation of the technique for estimating similarity between
9// two sets as presented by Broder in On the resemblance and containment of
10// documents:
11//
12// http://gatekeeper.dec.com/ftp/pub/dec/SRC/publications/broder/positano-final-wpnums.pdf
13//
14// This can be used to cluster or compare documents by splitting the corpus
15// into a bag of words. MinHash returns the approximated similarity ratio of
16// the two bags. The similarity is less accurate for very small bags of words.
17func MinHash(bag1, bag2 []string) float32 {
18	k := len(bag1) + len(bag2)
19	hashes := make([]int, k)
20	for i := 0; i < k; i++ {
21		a := uint(rand.Int())
22		b := uint(rand.Int())
23		c := uint(rand.Int())
24		x := computeHash(a*b*c, a, b, c)
25		hashes[i] = int(x)
26	}
27
28	bitMap := bitMap(bag1, bag2)
29	minHashValues := hashBuckets(2, k)
30	minHash(bag1, 0, minHashValues, bitMap, k, hashes)
31	minHash(bag2, 1, minHashValues, bitMap, k, hashes)
32	return similarity(minHashValues, k)
33}
34
35func minHash(bag []string, bagIndex int, minHashValues [][]int,
36	bitArray map[string][]bool, k int, hashes []int) {
37	index := 0
38	for element := range bitArray {
39		for i := 0; i < k; i++ {
40			if contains(bag, element) {
41				hindex := hashes[index]
42				if hindex < minHashValues[bagIndex][index] {
43					minHashValues[bagIndex][index] = hindex
44				}
45			}
46		}
47		index++
48	}
49}
50
51func contains(bag []string, element string) bool {
52	for _, e := range bag {
53		if e == element {
54			return true
55		}
56	}
57	return false
58}
59
60func bitMap(bag1, bag2 []string) map[string][]bool {
61	bitArray := map[string][]bool{}
62	for _, element := range bag1 {
63		bitArray[element] = []bool{true, false}
64	}
65
66	for _, element := range bag2 {
67		if _, ok := bitArray[element]; ok {
68			bitArray[element] = []bool{true, true}
69		} else if _, ok := bitArray[element]; !ok {
70			bitArray[element] = []bool{false, true}
71		}
72	}
73
74	return bitArray
75}
76
77func hashBuckets(numSets, k int) [][]int {
78	minHashValues := make([][]int, numSets)
79	for i := 0; i < numSets; i++ {
80		minHashValues[i] = make([]int, k)
81	}
82
83	for i := 0; i < numSets; i++ {
84		for j := 0; j < k; j++ {
85			minHashValues[i][j] = math.MaxInt32
86		}
87	}
88	return minHashValues
89}
90
91func computeHash(x, a, b, u uint) uint {
92	return (a*x + b) >> (32 - u)
93}
94
95func similarity(minHashValues [][]int, k int) float32 {
96	identicalMinHashes := 0
97	for i := 0; i < k; i++ {
98		if minHashValues[0][i] == minHashValues[1][i] {
99			identicalMinHashes++
100		}
101	}
102
103	return (1.0 * float32(identicalMinHashes)) / float32(k)
104}
105