1// Copyright (c) 2014 Couchbase, Inc. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package analysis 16 17// TokenLocation represents one occurrence of a term at a particular location in 18// a field. Start, End and Position have the same meaning as in analysis.Token. 19// Field and ArrayPositions identify the field value in the source document. 20// See document.Field for details. 21type TokenLocation struct { 22 Field string 23 ArrayPositions []uint64 24 Start int 25 End int 26 Position int 27} 28 29// TokenFreq represents all the occurrences of a term in all fields of a 30// document. 31type TokenFreq struct { 32 Term []byte 33 Locations []*TokenLocation 34 frequency int 35} 36 37func (tf *TokenFreq) Frequency() int { 38 return tf.frequency 39} 40 41// TokenFrequencies maps document terms to their combined frequencies from all 42// fields. 43type TokenFrequencies map[string]*TokenFreq 44 45func (tfs TokenFrequencies) MergeAll(remoteField string, other TokenFrequencies) { 46 // walk the new token frequencies 47 for tfk, tf := range other { 48 // set the remoteField value in incoming token freqs 49 for _, l := range tf.Locations { 50 l.Field = remoteField 51 } 52 existingTf, exists := tfs[tfk] 53 if exists { 54 existingTf.Locations = append(existingTf.Locations, tf.Locations...) 55 existingTf.frequency = existingTf.frequency + tf.frequency 56 } else { 57 tfs[tfk] = &TokenFreq{ 58 Term: tf.Term, 59 frequency: tf.frequency, 60 Locations: make([]*TokenLocation, len(tf.Locations)), 61 } 62 copy(tfs[tfk].Locations, tf.Locations) 63 } 64 } 65} 66 67func TokenFrequency(tokens TokenStream, arrayPositions []uint64, includeTermVectors bool) TokenFrequencies { 68 rv := make(map[string]*TokenFreq, len(tokens)) 69 70 if includeTermVectors { 71 tls := make([]TokenLocation, len(tokens)) 72 tlNext := 0 73 74 for _, token := range tokens { 75 tls[tlNext] = TokenLocation{ 76 ArrayPositions: arrayPositions, 77 Start: token.Start, 78 End: token.End, 79 Position: token.Position, 80 } 81 82 curr, ok := rv[string(token.Term)] 83 if ok { 84 curr.Locations = append(curr.Locations, &tls[tlNext]) 85 curr.frequency++ 86 } else { 87 rv[string(token.Term)] = &TokenFreq{ 88 Term: token.Term, 89 Locations: []*TokenLocation{&tls[tlNext]}, 90 frequency: 1, 91 } 92 } 93 94 tlNext++ 95 } 96 } else { 97 for _, token := range tokens { 98 curr, exists := rv[string(token.Term)] 99 if exists { 100 curr.frequency++ 101 } else { 102 rv[string(token.Term)] = &TokenFreq{ 103 Term: token.Term, 104 frequency: 1, 105 } 106 } 107 } 108 } 109 110 return rv 111} 112