1//  Copyright (c) 2014 Couchbase, Inc.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// 		http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package analysis
16
// TokenLocation represents one occurrence of a term at a particular location in
// a field. Start, End and Position have the same meaning as in analysis.Token.
// Field and ArrayPositions identify the field value in the source document.
// See document.Field for details.
//
// TokenFrequency fills in ArrayPositions, Start, End and Position and leaves
// Field empty; every location produced by one TokenFrequency call refers to
// the same ArrayPositions slice. TokenFrequencies.MergeAll later overwrites
// Field with the field name it is given.
type TokenLocation struct {
	Field          string
	ArrayPositions []uint64
	Start          int
	End            int
	Position       int
}
28
// TokenFreq represents all the occurrences of a term in all fields of a
// document.
//
// Term holds the term bytes and Locations the recorded occurrences.
// TokenFrequency leaves Locations nil when term vectors are not requested, so
// len(Locations) is not a substitute for the occurrence count. The count is
// kept in the unexported frequency field and is read through Frequency.
type TokenFreq struct {
	Term      []byte
	Locations []*TokenLocation
	frequency int
}
36
37func (tf *TokenFreq) Frequency() int {
38	return tf.frequency
39}
40
// TokenFrequencies maps document terms to their combined frequencies from all
// fields. TokenFrequency keys each entry by the term bytes converted to a
// string, and MergeAll matches entries from two maps by that key.
type TokenFrequencies map[string]*TokenFreq
44
45func (tfs TokenFrequencies) MergeAll(remoteField string, other TokenFrequencies) {
46	// walk the new token frequencies
47	for tfk, tf := range other {
48		// set the remoteField value in incoming token freqs
49		for _, l := range tf.Locations {
50			l.Field = remoteField
51		}
52		existingTf, exists := tfs[tfk]
53		if exists {
54			existingTf.Locations = append(existingTf.Locations, tf.Locations...)
55			existingTf.frequency = existingTf.frequency + tf.frequency
56		} else {
57			tfs[tfk] = &TokenFreq{
58				Term:      tf.Term,
59				frequency: tf.frequency,
60				Locations: make([]*TokenLocation, len(tf.Locations)),
61			}
62			copy(tfs[tfk].Locations, tf.Locations)
63		}
64	}
65}
66
67func TokenFrequency(tokens TokenStream, arrayPositions []uint64, includeTermVectors bool) TokenFrequencies {
68	rv := make(map[string]*TokenFreq, len(tokens))
69
70	if includeTermVectors {
71		tls := make([]TokenLocation, len(tokens))
72		tlNext := 0
73
74		for _, token := range tokens {
75			tls[tlNext] = TokenLocation{
76				ArrayPositions: arrayPositions,
77				Start:          token.Start,
78				End:            token.End,
79				Position:       token.Position,
80			}
81
82			curr, ok := rv[string(token.Term)]
83			if ok {
84				curr.Locations = append(curr.Locations, &tls[tlNext])
85				curr.frequency++
86			} else {
87				rv[string(token.Term)] = &TokenFreq{
88					Term:      token.Term,
89					Locations: []*TokenLocation{&tls[tlNext]},
90					frequency: 1,
91				}
92			}
93
94			tlNext++
95		}
96	} else {
97		for _, token := range tokens {
98			curr, exists := rv[string(token.Term)]
99			if exists {
100				curr.frequency++
101			} else {
102				rv[string(token.Term)] = &TokenFreq{
103					Term:      token.Term,
104					frequency: 1,
105				}
106			}
107		}
108	}
109
110	return rv
111}
112