//  Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// 		http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14
15package analysis
16
17import (
18	"reflect"
19
20	"github.com/blevesearch/bleve/size"
21)
22
23var reflectStaticSizeTokenLocation int
24var reflectStaticSizeTokenFreq int
25
26func init() {
27	var tl TokenLocation
28	reflectStaticSizeTokenLocation = int(reflect.TypeOf(tl).Size())
29	var tf TokenFreq
30	reflectStaticSizeTokenFreq = int(reflect.TypeOf(tf).Size())
31}
32
// TokenLocation represents one occurrence of a term at a particular location in
// a field. Start, End and Position have the same meaning as in analysis.Token.
// Field and ArrayPositions identify the field value in the source document.
// See document.Field for details.
type TokenLocation struct {
	// Field names the field the occurrence belongs to. TokenFrequency leaves
	// it empty; TokenFrequencies.MergeAll overwrites it with the field name
	// it is given.
	Field string
	// ArrayPositions is the slice passed to TokenFrequency. It is stored as
	// is, so all locations produced by one call share it.
	ArrayPositions []uint64
	// Start, End and Position are copied from the token that produced this
	// location.
	Start    int
	End      int
	Position int
}
44
45func (tl *TokenLocation) Size() int {
46	rv := reflectStaticSizeTokenLocation
47	rv += len(tl.ArrayPositions) * size.SizeOfUint64
48	return rv
49}
50
51// TokenFreq represents all the occurrences of a term in all fields of a
52// document.
53type TokenFreq struct {
54	Term      []byte
55	Locations []*TokenLocation
56	frequency int
57}
58
59func (tf *TokenFreq) Size() int {
60	rv := reflectStaticSizeTokenFreq
61	rv += len(tf.Term)
62	for _, loc := range tf.Locations {
63		rv += loc.Size()
64	}
65	return rv
66}
67
68func (tf *TokenFreq) Frequency() int {
69	return tf.frequency
70}
71
72// TokenFrequencies maps document terms to their combined frequencies from all
73// fields.
74type TokenFrequencies map[string]*TokenFreq
75
76func (tfs TokenFrequencies) Size() int {
77	rv := size.SizeOfMap
78	rv += len(tfs) * (size.SizeOfString + size.SizeOfPtr)
79	for k, v := range tfs {
80		rv += len(k)
81		rv += v.Size()
82	}
83	return rv
84}
85
86func (tfs TokenFrequencies) MergeAll(remoteField string, other TokenFrequencies) {
87	// walk the new token frequencies
88	for tfk, tf := range other {
89		// set the remoteField value in incoming token freqs
90		for _, l := range tf.Locations {
91			l.Field = remoteField
92		}
93		existingTf, exists := tfs[tfk]
94		if exists {
95			existingTf.Locations = append(existingTf.Locations, tf.Locations...)
96			existingTf.frequency = existingTf.frequency + tf.frequency
97		} else {
98			tfs[tfk] = &TokenFreq{
99				Term:      tf.Term,
100				frequency: tf.frequency,
101				Locations: make([]*TokenLocation, len(tf.Locations)),
102			}
103			copy(tfs[tfk].Locations, tf.Locations)
104		}
105	}
106}
107
108func TokenFrequency(tokens TokenStream, arrayPositions []uint64, includeTermVectors bool) TokenFrequencies {
109	rv := make(map[string]*TokenFreq, len(tokens))
110
111	if includeTermVectors {
112		tls := make([]TokenLocation, len(tokens))
113		tlNext := 0
114
115		for _, token := range tokens {
116			tls[tlNext] = TokenLocation{
117				ArrayPositions: arrayPositions,
118				Start:          token.Start,
119				End:            token.End,
120				Position:       token.Position,
121			}
122
123			curr, ok := rv[string(token.Term)]
124			if ok {
125				curr.Locations = append(curr.Locations, &tls[tlNext])
126				curr.frequency++
127			} else {
128				rv[string(token.Term)] = &TokenFreq{
129					Term:      token.Term,
130					Locations: []*TokenLocation{&tls[tlNext]},
131					frequency: 1,
132				}
133			}
134
135			tlNext++
136		}
137	} else {
138		for _, token := range tokens {
139			curr, exists := rv[string(token.Term)]
140			if exists {
141				curr.frequency++
142			} else {
143				rv[string(token.Term)] = &TokenFreq{
144					Term:      token.Term,
145					frequency: 1,
146				}
147			}
148		}
149	}
150
151	return rv
152}
153