1package syntax
2
3import (
4	"crypto/sha1"
5
6	"github.com/mibk/dupl/suffixtree"
7)
8
9type Node struct {
10	Type     int
11	Filename string
12	Pos, End int
13	Children []*Node
14	Owns     int
15}
16
17func NewNode() *Node {
18	return &Node{}
19}
20
21func (n *Node) AddChildren(children ...*Node) {
22	n.Children = append(n.Children, children...)
23}
24
25func (n *Node) Val() int {
26	return n.Type
27}
28
29type Match struct {
30	Hash  string
31	Frags [][]*Node
32}
33
34func Serialize(n *Node) []*Node {
35	stream := make([]*Node, 0, 10)
36	serial(n, &stream)
37	return stream
38}
39
40func serial(n *Node, stream *[]*Node) int {
41	*stream = append(*stream, n)
42	var count int
43	for _, child := range n.Children {
44		count += serial(child, stream)
45	}
46	n.Owns = count
47	return count + 1
48}
49
50// FindSyntaxUnits finds all complete syntax units in the match group and returns them
51// with the corresponding hash.
52func FindSyntaxUnits(data []*Node, m suffixtree.Match, threshold int) Match {
53	if len(m.Ps) == 0 {
54		return Match{}
55	}
56	firstSeq := data[m.Ps[0] : m.Ps[0]+m.Len]
57	indexes := getUnitsIndexes(firstSeq, threshold)
58
59	// TODO: is this really working?
60	indexCnt := len(indexes)
61	if indexCnt > 0 {
62		lasti := indexes[indexCnt-1]
63		firstn := firstSeq[lasti]
64		for i := 1; i < len(m.Ps); i++ {
65			n := data[int(m.Ps[i])+lasti]
66			if firstn.Owns != n.Owns {
67				indexes = indexes[:indexCnt-1]
68				break
69			}
70		}
71	}
72	if len(indexes) == 0 || isCyclic(indexes, firstSeq) || spansMultipleFiles(indexes, firstSeq) {
73		return Match{}
74	}
75
76	match := Match{Frags: make([][]*Node, len(m.Ps))}
77	for i, pos := range m.Ps {
78		match.Frags[i] = make([]*Node, len(indexes))
79		for j, index := range indexes {
80			match.Frags[i][j] = data[int(pos)+index]
81		}
82	}
83
84	lastIndex := indexes[len(indexes)-1]
85	match.Hash = hashSeq(firstSeq[indexes[0] : lastIndex+firstSeq[lastIndex].Owns])
86	return match
87}
88
89func getUnitsIndexes(nodeSeq []*Node, threshold int) []int {
90	var indexes []int
91	var split bool
92	for i := 0; i < len(nodeSeq); {
93		n := nodeSeq[i]
94		switch {
95		case n.Owns >= len(nodeSeq)-i:
96			// not complete syntax unit
97			i++
98			split = true
99			continue
100		case n.Owns+1 < threshold:
101			split = true
102		default:
103			if split {
104				indexes = indexes[:0]
105				split = false
106			}
107			indexes = append(indexes, i)
108		}
109		i += n.Owns + 1
110	}
111	return indexes
112}
113
114// isCyclic finds out whether there is a repetive pattern in the found clone. If positive,
115// it return false to point out that the clone would be redundant.
116func isCyclic(indexes []int, nodes []*Node) bool {
117	cnt := len(indexes)
118	if cnt <= 1 {
119		return false
120	}
121
122	alts := make(map[int]bool)
123	for i := 1; i <= cnt/2; i++ {
124		if cnt%i == 0 {
125			alts[i] = true
126		}
127	}
128
129	for i := 0; i < indexes[cnt/2]; i++ {
130		nstart := nodes[i+indexes[0]]
131	AltLoop:
132		for alt := range alts {
133			for j := alt; j < cnt; j += alt {
134				index := i + indexes[j]
135				if index < len(nodes) {
136					nalt := nodes[index]
137					if nstart.Owns == nalt.Owns && nstart.Type == nalt.Type {
138						continue
139					}
140				} else if i >= indexes[alt] {
141					return true
142				}
143				delete(alts, alt)
144				continue AltLoop
145			}
146		}
147		if len(alts) == 0 {
148			return false
149		}
150	}
151	return true
152}
153
154func spansMultipleFiles(indexes []int, nodes []*Node) bool {
155	if len(indexes) < 2 {
156		return false
157	}
158	f := nodes[indexes[0]].Filename
159	for i := 1; i < len(indexes); i++ {
160		if nodes[indexes[i]].Filename != f {
161			return true
162		}
163	}
164	return false
165}
166
167func hashSeq(nodes []*Node) string {
168	h := sha1.New()
169	bytes := make([]byte, len(nodes))
170	for i, node := range nodes {
171		bytes[i] = byte(node.Type)
172	}
173	h.Write(bytes)
174	return string(h.Sum(nil))
175}
176