1// Package histogram provides a Go implementation of BigML's histogram package
2// for Clojure/Java. It is currently experimental.
3package histogram
4
5import (
6	"container/heap"
7	"math"
8	"sort"
9)
10
11type Bin struct {
12	Count int
13	Sum   float64
14}
15
16func (b *Bin) Update(x *Bin) {
17	b.Count += x.Count
18	b.Sum += x.Sum
19}
20
21func (b *Bin) Mean() float64 {
22	return b.Sum / float64(b.Count)
23}
24
25type Bins []*Bin
26
27func (bs Bins) Len() int           { return len(bs) }
28func (bs Bins) Less(i, j int) bool { return bs[i].Mean() < bs[j].Mean() }
29func (bs Bins) Swap(i, j int)      { bs[i], bs[j] = bs[j], bs[i] }
30
31func (bs *Bins) Push(x interface{}) {
32	*bs = append(*bs, x.(*Bin))
33}
34
35func (bs *Bins) Pop() interface{} {
36	return bs.remove(len(*bs) - 1)
37}
38
39func (bs *Bins) remove(n int) *Bin {
40	if n < 0 || len(*bs) < n {
41		return nil
42	}
43	x := (*bs)[n]
44	*bs = append((*bs)[:n], (*bs)[n+1:]...)
45	return x
46}
47
48type Histogram struct {
49	res *reservoir
50}
51
52func New(maxBins int) *Histogram {
53	return &Histogram{res: newReservoir(maxBins)}
54}
55
56func (h *Histogram) Insert(f float64) {
57	h.res.insert(&Bin{1, f})
58	h.res.compress()
59}
60
61func (h *Histogram) Bins() Bins {
62	return h.res.bins
63}
64
65type reservoir struct {
66	n       int
67	maxBins int
68	bins    Bins
69}
70
71func newReservoir(maxBins int) *reservoir {
72	return &reservoir{maxBins: maxBins}
73}
74
75func (r *reservoir) insert(bin *Bin) {
76	r.n += bin.Count
77	i := sort.Search(len(r.bins), func(i int) bool {
78		return r.bins[i].Mean() >= bin.Mean()
79	})
80	if i < 0 || i == r.bins.Len() {
81		// TODO(blake): Maybe use an .insert(i, bin) instead of
82		// performing the extra work of a heap.Push.
83		heap.Push(&r.bins, bin)
84		return
85	}
86	r.bins[i].Update(bin)
87}
88
89func (r *reservoir) compress() {
90	for r.bins.Len() > r.maxBins {
91		minGapIndex := -1
92		minGap := math.MaxFloat64
93		for i := 0; i < r.bins.Len()-1; i++ {
94			gap := gapWeight(r.bins[i], r.bins[i+1])
95			if minGap > gap {
96				minGap = gap
97				minGapIndex = i
98			}
99		}
100		prev := r.bins[minGapIndex]
101		next := r.bins.remove(minGapIndex + 1)
102		prev.Update(next)
103	}
104}
105
106func gapWeight(prev, next *Bin) float64 {
107	return next.Mean() - prev.Mean()
108}
109