1 // Copyright 2013 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 //
16 // Author: dsites@google.com (Dick Sites)
17 //
18
19 #include "tote.h"
20 #include "lang_script.h" // For LanguageCode in Dump
21
22 #include <stdio.h>
23 #include <string.h> // For memset
24
25 namespace CLD2 {
26
27 // Take a set of <key, value> pairs and tote them up.
28 // After explicitly sorting, retrieve top key, value pairs
29 // Normal use is key=per-script language and value = probability score
Tote()30 Tote::Tote() {
31 in_use_mask_ = 0;
32 byte_count_ = 0;
33 score_count_ = 0;
34 // No need to initialize values
35 }
36
~Tote()37 Tote::~Tote() {
38 }
39
Reinit()40 void Tote::Reinit() {
41 in_use_mask_ = 0;
42 byte_count_ = 0;
43 score_count_ = 0;
44 // No need to initialize values
45 }
46 // Increment count of quadgrams/trigrams/unigrams scored
AddScoreCount()47 void Tote::AddScoreCount() {
48 ++score_count_;
49 }
50
51
Add(uint8 ikey,int idelta)52 void Tote::Add(uint8 ikey, int idelta) {
53 int key_group = ikey >> 2;
54 uint64 groupmask = (1ULL << key_group);
55 if ((in_use_mask_ & groupmask) == 0) {
56 // Initialize this group
57 gscore_[key_group] = 0;
58 in_use_mask_ |= groupmask;
59 }
60 score_[ikey] += idelta;
61 }
62
63
64 // Return current top three keys
CurrentTopThreeKeys(int * key3) const65 void Tote::CurrentTopThreeKeys(int* key3) const {
66 key3[0] = -1;
67 key3[1] = -1;
68 key3[2] = -1;
69 int score3[3] = {-1, -1, -1};
70 uint64 tempmask = in_use_mask_;
71 int base = 0;
72 while (tempmask != 0) {
73 if (tempmask & 1) {
74 // Look at four in-use keys
75 for (int i = 0; i < 4; ++i) {
76 int insert_me = score_[base + i];
77 // Favor lower numbers on ties
78 if (insert_me > score3[2]) {
79 // Insert
80 int insert_at = 2;
81 if (insert_me > score3[1]) {
82 score3[2] = score3[1];
83 key3[2] = key3[1];
84 insert_at = 1;
85 if (insert_me > score3[0]) {
86 score3[1] = score3[0];
87 key3[1] = key3[0];
88 insert_at = 0;
89 }
90 }
91 score3[insert_at] = insert_me;
92 key3[insert_at] = base + i;
93 }
94 }
95 }
96 tempmask >>= 1;
97 base += 4;
98 }
99 }
100
101
102 // Take a set of <key, value> pairs and tote them up.
103 // After explicitly sorting, retrieve top key, value pairs
104 // 0xFFFF in key signifies unused
DocTote()105 DocTote::DocTote() {
106 // No need to initialize score_ or value_
107 incr_count_ = 0;
108 sorted_ = 0;
109 memset(closepair_, 0, sizeof(closepair_));
110 memset(key_, 0xFF, sizeof(key_));
111 }
112
~DocTote()113 DocTote::~DocTote() {
114 }
115
Reinit()116 void DocTote::Reinit() {
117 // No need to initialize score_ or value_
118 incr_count_ = 0;
119 sorted_ = 0;
120 memset(closepair_, 0, sizeof(closepair_));
121 memset(key_, 0xFF, sizeof(key_));
122 runningscore_.Reinit();
123 }
124
125 // Weight reliability by ibytes
126 // Also see three-way associative comments above for Tote
Add(uint16 ikey,int ibytes,int score,int ireliability)127 void DocTote::Add(uint16 ikey, int ibytes,
128 int score, int ireliability) {
129 ++incr_count_;
130
131 // Look for existing entry in top 2 positions of 3, times 8 columns
132 int sub0 = ikey & 15;
133 if (key_[sub0] == ikey) {
134 value_[sub0] += ibytes;
135 score_[sub0] += score;
136 reliability_[sub0] += ireliability * ibytes;
137 return;
138 }
139 // Look for existing entry in other of top 2 positions of 3, times 8 columns
140 int sub1 = sub0 ^ 8;
141 if (key_[sub1] == ikey) {
142 value_[sub1] += ibytes;
143 score_[sub1] += score;
144 reliability_[sub1] += ireliability * ibytes;
145 return;
146 }
147 // Look for existing entry in third position of 3, times 8 columns
148 int sub2 = (ikey & 7) + 16;
149 if (key_[sub2] == ikey) {
150 value_[sub2] += ibytes;
151 score_[sub2] += score;
152 reliability_[sub2] += ireliability * ibytes;
153 return;
154 }
155
156 // Allocate new entry
157 int alloc = -1;
158 if (key_[sub0] == kUnusedKey) {
159 alloc = sub0;
160 } else if (key_[sub1] == kUnusedKey) {
161 alloc = sub1;
162 } else if (key_[sub2] == kUnusedKey) {
163 alloc = sub2;
164 } else {
165 // All choices allocated, need to replace smallest one
166 alloc = sub0;
167 if (value_[sub1] < value_[alloc]) {alloc = sub1;}
168 if (value_[sub2] < value_[alloc]) {alloc = sub2;}
169 }
170 key_[alloc] = ikey;
171 value_[alloc] = ibytes;
172 score_[alloc] = score;
173 reliability_[alloc] = ireliability * ibytes;
174 return;
175 }
176
177 // Find subscript of a given packed language, or -1
Find(uint16 ikey)178 int DocTote::Find(uint16 ikey) {
179 if (sorted_) {
180 // Linear search if sorted
181 for (int sub = 0; sub < kMaxSize_; ++sub) {
182 if (key_[sub] == ikey) {return sub;}
183 }
184 return -1;
185 }
186
187 // Look for existing entry
188 int sub0 = ikey & 15;
189 if (key_[sub0] == ikey) {
190 return sub0;
191 }
192 int sub1 = sub0 ^ 8;
193 if (key_[sub1] == ikey) {
194 return sub1;
195 }
196 int sub2 = (ikey & 7) + 16;
197 if (key_[sub2] == ikey) {
198 return sub2;
199 }
200
201 return -1;
202 }
203
204 // Return current top key
CurrentTopKey()205 int DocTote::CurrentTopKey() {
206 int top_key = 0;
207 int top_value = -1;
208 for (int sub = 0; sub < kMaxSize_; ++sub) {
209 if (key_[sub] == kUnusedKey) {continue;}
210 if (top_value < value_[sub]) {
211 top_value = value_[sub];
212 top_key = key_[sub];
213 }
214 }
215 return top_key;
216 }
217
218
219 // Sort first n entries by decreasing order of value
220 // If key==0 other fields are not valid, treat value as -1
Sort(int n)221 void DocTote::Sort(int n) {
222 // This is n**2, but n is small
223 for (int sub = 0; sub < n; ++sub) {
224 if (key_[sub] == kUnusedKey) {value_[sub] = -1;}
225
226 // Bubble sort key[sub] and entry[sub]
227 for (int sub2 = sub + 1; sub2 < kMaxSize_; ++sub2) {
228 if (key_[sub2] == kUnusedKey) {value_[sub2] = -1;}
229 if (value_[sub] < value_[sub2]) {
230 // swap
231 uint16 tmpk = key_[sub];
232 key_[sub] = key_[sub2];
233 key_[sub2] = tmpk;
234
235 int tmpv = value_[sub];
236 value_[sub] = value_[sub2];
237 value_[sub2] = tmpv;
238
239 double tmps = score_[sub];
240 score_[sub] = score_[sub2];
241 score_[sub2] = tmps;
242
243 int tmpr = reliability_[sub];
244 reliability_[sub] = reliability_[sub2];
245 reliability_[sub2] = tmpr;
246 }
247 }
248 }
249 sorted_ = 1;
250 }
251
Dump(FILE * f)252 void DocTote::Dump(FILE* f) {
253 fprintf(f, "DocTote::Dump\n");
254 for (int sub = 0; sub < kMaxSize_; ++sub) {
255 if (key_[sub] != kUnusedKey) {
256 Language lang = static_cast<Language>(key_[sub]);
257 fprintf(f, "[%2d] %3s %6dB %5dp %4dR,\n", sub, LanguageCode(lang),
258 value_[sub], score_[sub], reliability_[sub]);
259 }
260 }
261 fprintf(f, " %d chunks scored<br>\n", incr_count_);
262 }
263
264 } // End namespace CLD2
265
266