1// Copyright (c) 2014 Couchbase, Inc. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package analysis 16 17import ( 18 "reflect" 19 20 "github.com/blevesearch/bleve/size" 21) 22 23var reflectStaticSizeTokenLocation int 24var reflectStaticSizeTokenFreq int 25 26func init() { 27 var tl TokenLocation 28 reflectStaticSizeTokenLocation = int(reflect.TypeOf(tl).Size()) 29 var tf TokenFreq 30 reflectStaticSizeTokenFreq = int(reflect.TypeOf(tf).Size()) 31} 32 33// TokenLocation represents one occurrence of a term at a particular location in 34// a field. Start, End and Position have the same meaning as in analysis.Token. 35// Field and ArrayPositions identify the field value in the source document. 36// See document.Field for details. 37type TokenLocation struct { 38 Field string 39 ArrayPositions []uint64 40 Start int 41 End int 42 Position int 43} 44 45func (tl *TokenLocation) Size() int { 46 rv := reflectStaticSizeTokenLocation 47 rv += len(tl.ArrayPositions) * size.SizeOfUint64 48 return rv 49} 50 51// TokenFreq represents all the occurrences of a term in all fields of a 52// document. 53type TokenFreq struct { 54 Term []byte 55 Locations []*TokenLocation 56 frequency int 57} 58 59func (tf *TokenFreq) Size() int { 60 rv := reflectStaticSizeTokenFreq 61 rv += len(tf.Term) 62 for _, loc := range tf.Locations { 63 rv += loc.Size() 64 } 65 return rv 66} 67 68func (tf *TokenFreq) Frequency() int { 69 return tf.frequency 70} 71 72// TokenFrequencies maps document terms to their combined frequencies from all 73// fields. 74type TokenFrequencies map[string]*TokenFreq 75 76func (tfs TokenFrequencies) Size() int { 77 rv := size.SizeOfMap 78 rv += len(tfs) * (size.SizeOfString + size.SizeOfPtr) 79 for k, v := range tfs { 80 rv += len(k) 81 rv += v.Size() 82 } 83 return rv 84} 85 86func (tfs TokenFrequencies) MergeAll(remoteField string, other TokenFrequencies) { 87 // walk the new token frequencies 88 for tfk, tf := range other { 89 // set the remoteField value in incoming token freqs 90 for _, l := range tf.Locations { 91 l.Field = remoteField 92 } 93 existingTf, exists := tfs[tfk] 94 if exists { 95 existingTf.Locations = append(existingTf.Locations, tf.Locations...) 96 existingTf.frequency = existingTf.frequency + tf.frequency 97 } else { 98 tfs[tfk] = &TokenFreq{ 99 Term: tf.Term, 100 frequency: tf.frequency, 101 Locations: make([]*TokenLocation, len(tf.Locations)), 102 } 103 copy(tfs[tfk].Locations, tf.Locations) 104 } 105 } 106} 107 108func TokenFrequency(tokens TokenStream, arrayPositions []uint64, includeTermVectors bool) TokenFrequencies { 109 rv := make(map[string]*TokenFreq, len(tokens)) 110 111 if includeTermVectors { 112 tls := make([]TokenLocation, len(tokens)) 113 tlNext := 0 114 115 for _, token := range tokens { 116 tls[tlNext] = TokenLocation{ 117 ArrayPositions: arrayPositions, 118 Start: token.Start, 119 End: token.End, 120 Position: token.Position, 121 } 122 123 curr, ok := rv[string(token.Term)] 124 if ok { 125 curr.Locations = append(curr.Locations, &tls[tlNext]) 126 curr.frequency++ 127 } else { 128 rv[string(token.Term)] = &TokenFreq{ 129 Term: token.Term, 130 Locations: []*TokenLocation{&tls[tlNext]}, 131 frequency: 1, 132 } 133 } 134 135 tlNext++ 136 } 137 } else { 138 for _, token := range tokens { 139 curr, exists := rv[string(token.Term)] 140 if exists { 141 curr.frequency++ 142 } else { 143 rv[string(token.Term)] = &TokenFreq{ 144 Term: token.Term, 145 frequency: 1, 146 } 147 } 148 } 149 } 150 151 return rv 152} 153