1 // Copyright 2013 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 //
16 // Author: dsites@google.com (Dick Sites)
17 // Updated 2014.01 for dual table lookup
18 //
19
20 #include "scoreonescriptspan.h"
21
22 #include "cldutil.h"
23 #include "debug.h"
24 #include "lang_script.h"
25
26 #include <stdio.h>
27
28 using namespace std;
29
30 namespace CLD2 {
31
// Reliability cutoff, in percent. A chunk whose reliability_delta or
// reliability_score is below this value is treated as unreliable when the
// result vector is built.
static const int kUnreliablePercentThreshold = 75;
33
AddLangProb(uint32 langprob,Tote * chunk_tote)34 void AddLangProb(uint32 langprob, Tote* chunk_tote) {
35 ProcessProbV2Tote(langprob, chunk_tote);
36 }
37
ZeroPSLang(uint32 langprob,Tote * chunk_tote)38 void ZeroPSLang(uint32 langprob, Tote* chunk_tote) {
39 uint8 top1 = (langprob >> 8) & 0xff;
40 chunk_tote->SetScore(top1, 0);
41 }
42
SameCloseSet(uint16 lang1,uint16 lang2)43 bool SameCloseSet(uint16 lang1, uint16 lang2) {
44 int lang1_close_set = LanguageCloseSet(static_cast<Language>(lang1));
45 if (lang1_close_set == 0) {return false;}
46 int lang2_close_set = LanguageCloseSet(static_cast<Language>(lang2));
47 return (lang1_close_set == lang2_close_set);
48 }
49
SameCloseSet(Language lang1,Language lang2)50 bool SameCloseSet(Language lang1, Language lang2) {
51 int lang1_close_set = LanguageCloseSet(lang1);
52 if (lang1_close_set == 0) {return false;}
53 int lang2_close_set = LanguageCloseSet(lang2);
54 return (lang1_close_set == lang2_close_set);
55 }
56
57
58 // Needs expected score per 1KB in scoring context
SetChunkSummary(ULScript ulscript,int first_linear_in_chunk,int offset,int len,const ScoringContext * scoringcontext,const Tote * chunk_tote,ChunkSummary * chunksummary)59 void SetChunkSummary(ULScript ulscript, int first_linear_in_chunk,
60 int offset, int len,
61 const ScoringContext* scoringcontext,
62 const Tote* chunk_tote,
63 ChunkSummary* chunksummary) {
64 int key3[3];
65 chunk_tote->CurrentTopThreeKeys(key3);
66 Language lang1 = FromPerScriptNumber(ulscript, key3[0]);
67 Language lang2 = FromPerScriptNumber(ulscript, key3[1]);
68
69 int actual_score_per_kb = 0;
70 if (len > 0) {
71 actual_score_per_kb = (chunk_tote->GetScore(key3[0]) << 10) / len;
72 }
73 int expected_subscr = lang1 * 4 + LScript4(ulscript);
74 int expected_score_per_kb =
75 scoringcontext->scoringtables->kExpectedScore[expected_subscr];
76
77 chunksummary->offset = offset;
78 chunksummary->chunk_start = first_linear_in_chunk;
79 chunksummary->lang1 = lang1;
80 chunksummary->lang2 = lang2;
81 chunksummary->score1 = chunk_tote->GetScore(key3[0]);
82 chunksummary->score2 = chunk_tote->GetScore(key3[1]);
83 chunksummary->bytes = len;
84 chunksummary->grams = chunk_tote->GetScoreCount();
85 chunksummary->ulscript = ulscript;
86 chunksummary->reliability_delta = ReliabilityDelta(chunksummary->score1,
87 chunksummary->score2,
88 chunksummary->grams);
89 // If lang1/lang2 in same close set, set delta reliability to 100%
90 if (SameCloseSet(lang1, lang2)) {
91 chunksummary->reliability_delta = 100;
92 }
93 chunksummary->reliability_score =
94 ReliabilityExpected(actual_score_per_kb, expected_score_per_kb);
95 }
96
97 // Return true if just lang1 is there: lang2=0 and lang3=0
IsSingleLang(uint32 langprob)98 bool IsSingleLang(uint32 langprob) {
99 // Probably a bug -- which end is lang1? But only used to call empty Boost1
100 return ((langprob & 0x00ffff00) == 0);
101 }
102
103 // Update scoring context distinct_boost for single language quad
AddDistinctBoost1(uint32 langprob,ScoringContext * scoringcontext)104 void AddDistinctBoost1(uint32 langprob, ScoringContext* scoringcontext) {
105 // Probably keep this empty -- not a good enough signal
106 }
107
108 // Update scoring context distinct_boost for distinct octagram
109 // Keep last 4 used. Since these are mostly (except at splices) in
110 // hitbuffer, we might be able to just use a subscript and splice
AddDistinctBoost2(uint32 langprob,ScoringContext * scoringcontext)111 void AddDistinctBoost2(uint32 langprob, ScoringContext* scoringcontext) {
112 // this is called 0..n times per chunk with decoded hitbuffer->distinct...
113 LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn;
114 if (scoringcontext->ulscript != ULScript_Latin) {
115 distinct_boost = &scoringcontext->distinct_boost.othr;
116 }
117 int n = distinct_boost->n;
118 distinct_boost->langprob[n] = langprob;
119 distinct_boost->n = distinct_boost->wrap(n + 1);
120 }
121
122 // For each chunk, add extra weight for language priors (from content-lang and
123 // meta lang=xx) and distinctive tokens
ScoreBoosts(const ScoringContext * scoringcontext,Tote * chunk_tote)124 void ScoreBoosts(const ScoringContext* scoringcontext, Tote* chunk_tote) {
125 // Get boosts for current script
126 const LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;
127 const LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn;
128 const LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn;
129 if (scoringcontext->ulscript != ULScript_Latin) {
130 langprior_boost = &scoringcontext->langprior_boost.othr;
131 langprior_whack = &scoringcontext->langprior_whack.othr;
132 distinct_boost = &scoringcontext->distinct_boost.othr;
133 }
134
135 for (int k = 0; k < kMaxBoosts; ++k) {
136 uint32 langprob = langprior_boost->langprob[k];
137 if (langprob > 0) {AddLangProb(langprob, chunk_tote);}
138 }
139 for (int k = 0; k < kMaxBoosts; ++k) {
140 uint32 langprob = distinct_boost->langprob[k];
141 if (langprob > 0) {AddLangProb(langprob, chunk_tote);}
142 }
143 // boost has a packed set of per-script langs and probabilites
144 // whack has a packed set of per-script lang to be suppressed (zeroed)
145 // When a language in a close set is given as an explicit hint, others in
146 // that set will be whacked here.
147 for (int k = 0; k < kMaxBoosts; ++k) {
148 uint32 langprob = langprior_whack->langprob[k];
149 if (langprob > 0) {ZeroPSLang(langprob, chunk_tote);}
150 }
151 }
152
153
154
155 // At this point, The chunk is described by
156 // hitbuffer->base[cspan->chunk_base .. cspan->chunk_base + cspan->base_len)
157 // hitbuffer->delta[cspan->chunk_delta ... )
158 // hitbuffer->distinct[cspan->chunk_distinct ... )
159 // Scored text is in text[lo..hi) where
160 // lo is 0 or the min of first base/delta/distinct hitbuffer offset and
161 // hi is the min of next base/delta/distinct hitbuffer offset after
162 // base_len, etc.
GetTextSpanOffsets(const ScoringHitBuffer * hitbuffer,const ChunkSpan * cspan,int * lo,int * hi)163 void GetTextSpanOffsets(const ScoringHitBuffer* hitbuffer,
164 const ChunkSpan* cspan, int* lo, int* hi) {
165 // Front of this span
166 int lo_base = hitbuffer->base[cspan->chunk_base].offset;
167 int lo_delta = hitbuffer->delta[cspan->chunk_delta].offset;
168 int lo_distinct = hitbuffer->distinct[cspan->chunk_distinct].offset;
169 // Front of next span
170 int hi_base = hitbuffer->base[cspan->chunk_base +
171 cspan->base_len].offset;
172 int hi_delta = hitbuffer->delta[cspan->chunk_delta +
173 cspan->delta_len].offset;
174 int hi_distinct = hitbuffer->distinct[cspan->chunk_distinct +
175 cspan->distinct_len].offset;
176
177 *lo = 0;
178 // if (cspan->chunk_base > 0) {
179 // *lo = minint(minint(lo_base, lo_delta), lo_distinct);
180 // }
181 *lo = minint(minint(lo_base, lo_delta), lo_distinct);
182 *hi = minint(minint(hi_base, hi_delta), hi_distinct);
183 }
184
185
DiffScore(const CLD2TableSummary * obj,int indirect,uint16 lang1,uint16 lang2)186 int DiffScore(const CLD2TableSummary* obj, int indirect,
187 uint16 lang1, uint16 lang2) {
188 if (indirect < static_cast<int>(obj->kCLDTableSizeOne)) {
189 // Up to three languages at indirect
190 uint32 langprob = obj->kCLDTableInd[indirect];
191 return GetLangScore(langprob, lang1) - GetLangScore(langprob, lang2);
192 } else {
193 // Up to six languages at start + 2 * (indirect - start)
194 indirect += (indirect - obj->kCLDTableSizeOne);
195 uint32 langprob = obj->kCLDTableInd[indirect];
196 uint32 langprob2 = obj->kCLDTableInd[indirect + 1];
197 return (GetLangScore(langprob, lang1) + GetLangScore(langprob2, lang1)) -
198 (GetLangScore(langprob, lang2) + GetLangScore(langprob2, lang2));
199 }
200
201 }
202
203 // Score all the bases, deltas, distincts, boosts for one chunk into chunk_tote
204 // After last chunk there is always a hitbuffer entry with an offset just off
205 // the end of the text.
206 // Sets delta_len, and distinct_len
ScoreOneChunk(const char * text,ULScript ulscript,const ScoringHitBuffer * hitbuffer,int chunk_i,ScoringContext * scoringcontext,ChunkSpan * cspan,Tote * chunk_tote,ChunkSummary * chunksummary)207 void ScoreOneChunk(const char* text, ULScript ulscript,
208 const ScoringHitBuffer* hitbuffer,
209 int chunk_i,
210 ScoringContext* scoringcontext,
211 ChunkSpan* cspan, Tote* chunk_tote,
212 ChunkSummary* chunksummary) {
213 int first_linear_in_chunk = hitbuffer->chunk_start[chunk_i];
214 int first_linear_in_next_chunk = hitbuffer->chunk_start[chunk_i + 1];
215
216 chunk_tote->Reinit();
217 cspan->delta_len = 0;
218 cspan->distinct_len = 0;
219 if (scoringcontext->flags_cld2_verbose) {
220 fprintf(scoringcontext->debug_file, "<br>ScoreOneChunk[%d..%d) ",
221 first_linear_in_chunk, first_linear_in_next_chunk);
222 }
223
224 // 2013.02.05 linear design: just use base and base_len for the span
225 cspan->chunk_base = first_linear_in_chunk;
226 cspan->base_len = first_linear_in_next_chunk - first_linear_in_chunk;
227 for (int i = first_linear_in_chunk; i < first_linear_in_next_chunk; ++i) {
228 uint32 langprob = hitbuffer->linear[i].langprob;
229 AddLangProb(langprob, chunk_tote);
230 if (hitbuffer->linear[i].type <= QUADHIT) {
231 chunk_tote->AddScoreCount(); // Just count quads, not octas
232 }
233 if (hitbuffer->linear[i].type == DISTINCTHIT) {
234 AddDistinctBoost2(langprob, scoringcontext);
235 }
236 }
237
238 // Score language prior boosts
239 // Score distinct word boost
240 ScoreBoosts(scoringcontext, chunk_tote);
241
242 int lo = hitbuffer->linear[first_linear_in_chunk].offset;
243 int hi = hitbuffer->linear[first_linear_in_next_chunk].offset;
244
245 // Chunk_tote: get top langs, scores, etc. and fill in chunk summary
246 SetChunkSummary(ulscript, first_linear_in_chunk, lo, hi - lo,
247 scoringcontext, chunk_tote, chunksummary);
248
249 bool more_to_come = false;
250 bool score_cjk = false;
251 if (scoringcontext->flags_cld2_html) {
252 // Show one chunk in readable output
253 CLD2_Debug(text, lo, hi, more_to_come, score_cjk, hitbuffer,
254 scoringcontext, cspan, chunksummary);
255 }
256
257 scoringcontext->prior_chunk_lang = static_cast<Language>(chunksummary->lang1);
258 }
259
260
261 // Score chunks of text described by hitbuffer, allowing each to be in a
262 // different language, and optionally adjusting the boundaries inbetween.
263 // Set last_cspan to the last chunkspan used
ScoreAllHits(const char * text,ULScript ulscript,bool more_to_come,bool score_cjk,const ScoringHitBuffer * hitbuffer,ScoringContext * scoringcontext,SummaryBuffer * summarybuffer,ChunkSpan * last_cspan)264 void ScoreAllHits(const char* text, ULScript ulscript,
265 bool more_to_come, bool score_cjk,
266 const ScoringHitBuffer* hitbuffer,
267 ScoringContext* scoringcontext,
268 SummaryBuffer* summarybuffer, ChunkSpan* last_cspan) {
269 ChunkSpan prior_cspan = {0, 0, 0, 0, 0, 0};
270 ChunkSpan cspan = {0, 0, 0, 0, 0, 0};
271
272 for (int i = 0; i < hitbuffer->next_chunk_start; ++i) {
273 // Score one chunk
274 // Sets delta_len, and distinct_len
275 Tote chunk_tote;
276 ChunkSummary chunksummary;
277 ScoreOneChunk(text, ulscript,
278 hitbuffer, i,
279 scoringcontext, &cspan, &chunk_tote, &chunksummary);
280
281 // Put result in summarybuffer
282 if (summarybuffer->n < kMaxSummaries) {
283 summarybuffer->chunksummary[summarybuffer->n] = chunksummary;
284 summarybuffer->n += 1;
285 }
286
287 prior_cspan = cspan;
288 cspan.chunk_base += cspan.base_len;
289 cspan.chunk_delta += cspan.delta_len;
290 cspan.chunk_distinct += cspan.distinct_len;
291 }
292
293 // Add one dummy off the end to hold first unused linear_in_chunk
294 int linear_off_end = hitbuffer->next_linear;
295 int offset_off_end = hitbuffer->linear[linear_off_end].offset;
296 ChunkSummary* cs = &summarybuffer->chunksummary[summarybuffer->n];
297 memset(cs, 0, sizeof(ChunkSummary));
298 cs->offset = offset_off_end;
299 cs->chunk_start = linear_off_end;
300 *last_cspan = prior_cspan;
301 }
302
303
SummaryBufferToDocTote(const SummaryBuffer * summarybuffer,bool more_to_come,DocTote * doc_tote)304 void SummaryBufferToDocTote(const SummaryBuffer* summarybuffer,
305 bool more_to_come, DocTote* doc_tote) {
306 int cs_bytes_sum = 0;
307 for (int i = 0; i < summarybuffer->n; ++i) {
308 const ChunkSummary* cs = &summarybuffer->chunksummary[i];
309 int reliability = minint(cs->reliability_delta, cs->reliability_score);
310 // doc_tote uses full languages
311 doc_tote->Add(cs->lang1, cs->bytes, cs->score1, reliability);
312 cs_bytes_sum += cs->bytes;
313 }
314 }
315
// Debugging switch for the result-vector routines. When set to true they
// print the letters-only and original text of each piece to stderr.
static const bool kShowLettersOriginal = false;
318
319
320 // If next chunk language matches last vector language, extend last element
321 // Otherwise add new element to vector
ItemToVector(ScriptScanner * scanner,ResultChunkVector * vec,Language new_lang,int mapped_offset,int mapped_len)322 void ItemToVector(ScriptScanner* scanner,
323 ResultChunkVector* vec, Language new_lang,
324 int mapped_offset, int mapped_len) {
325 uint16 last_vec_lang = static_cast<uint16>(UNKNOWN_LANGUAGE);
326 int last_vec_subscr = vec->size() - 1;
327 if (last_vec_subscr >= 0) {
328 ResultChunk* priorrc = &(*vec)[last_vec_subscr];
329 last_vec_lang = priorrc->lang1;
330 if (new_lang == last_vec_lang) {
331 // Extend prior. Current mapped_offset may be beyond prior end, so do
332 // the arithmetic to include any such gap
333 priorrc->bytes = minint((mapped_offset + mapped_len) - priorrc->offset,
334 kMaxResultChunkBytes);
335 if (kShowLettersOriginal) {
336 // Optionally print the new chunk original text
337 string temp2(&scanner->GetBufferStart()[priorrc->offset],
338 priorrc->bytes);
339 fprintf(stderr, "Item[%d..%d) '%s'<br>\n",
340 priorrc->offset, priorrc->offset + priorrc->bytes,
341 GetHtmlEscapedText(temp2).c_str());
342 }
343 return;
344 }
345 }
346 // Add new vector element
347 ResultChunk rc;
348 rc.offset = mapped_offset;
349 rc.bytes = minint(mapped_len, kMaxResultChunkBytes);
350 rc.lang1 = static_cast<uint16>(new_lang);
351 vec->push_back(rc);
352 if (kShowLettersOriginal) {
353 // Optionally print the new chunk original text
354 string temp2(&scanner->GetBufferStart()[rc.offset], rc.bytes);
355 fprintf(stderr, "Item[%d..%d) '%s'<br>\n",
356 rc.offset, rc.offset + rc.bytes,
357 GetHtmlEscapedText(temp2).c_str());
358 }
359 }
360
PriorVecLang(const ResultChunkVector * vec)361 uint16 PriorVecLang(const ResultChunkVector* vec) {
362 if (vec->empty()) {return static_cast<uint16>(UNKNOWN_LANGUAGE);}
363 return (*vec)[vec->size() - 1].lang1;
364 }
365
NextChunkLang(const SummaryBuffer * summarybuffer,int i)366 uint16 NextChunkLang(const SummaryBuffer* summarybuffer, int i) {
367 if ((i + 1) >= summarybuffer->n) {
368 return static_cast<uint16>(UNKNOWN_LANGUAGE);
369 }
370 return summarybuffer->chunksummary[i + 1].lang1;
371 }
372
373
374
375 // Add n elements of summarybuffer to resultchunk vector:
376 // Each element is letters-only text [offset..offset+bytes)
377 // This maps back to original[Back(offset)..Back(offset+bytes))
378 //
379 // We go out of our way to minimize the variation in the ResultChunkVector,
380 // so that the caller has fewer but more meaningful spans in different
381 // lanaguges, for the likely purpose of translation or spell-check.
382 //
383 // The language of each chunk is lang1, but it might be unreliable for
384 // either of two reasons: its score is relatively too close to the score of
385 // lang2, or its score is too far away from the expected score of real text in
386 // the given language. Unreliable languages are mapped to Unknown.
387 //
SummaryBufferToVector(ScriptScanner * scanner,const char * text,const SummaryBuffer * summarybuffer,bool more_to_come,ResultChunkVector * vec)388 void SummaryBufferToVector(ScriptScanner* scanner, const char* text,
389 const SummaryBuffer* summarybuffer,
390 bool more_to_come, ResultChunkVector* vec) {
391 if (vec == NULL) {return;}
392
393 if (kShowLettersOriginal) {
394 fprintf(stderr, "map2original_ ");
395 scanner->map2original_.DumpWindow();
396 fprintf(stderr, "<br>\n");
397 fprintf(stderr, "map2uplow_ ");
398 scanner->map2uplow_.DumpWindow();
399 fprintf(stderr, "<br>\n");
400 }
401
402 for (int i = 0; i < summarybuffer->n; ++i) {
403 const ChunkSummary* cs = &summarybuffer->chunksummary[i];
404 int unmapped_offset = cs->offset;
405 int unmapped_len = cs->bytes;
406
407 if (kShowLettersOriginal) {
408 // Optionally print the chunk lowercase letters/marks text
409 string temp(&text[unmapped_offset], unmapped_len);
410 fprintf(stderr, "Letters [%d..%d) '%s'<br>\n",
411 unmapped_offset, unmapped_offset + unmapped_len,
412 GetHtmlEscapedText(temp).c_str());
413 }
414
415 int mapped_offset = scanner->MapBack(unmapped_offset);
416
417 // Trim back a little to prefer splicing original at word boundaries
418 if (mapped_offset > 0) {
419 // Size of prior vector entry, if any
420 int prior_size = 0;
421 if (!vec->empty()) {
422 ResultChunk* rc = &(*vec)[vec->size() - 1];
423 prior_size = rc->bytes;
424 }
425 // Maximum back up size to leave at least 3 bytes in prior,
426 // and not entire buffer, and no more than 12 bytes total backup
427 int n_limit = minint(prior_size - 3, mapped_offset);
428 n_limit = minint(n_limit, 12);
429
430 // Backscan over letters, stopping if prior byte is < 0x41
431 // There is some possibility that we will backscan over a different script
432 const char* s = &scanner->GetBufferStart()[mapped_offset];
433 const unsigned char* us = reinterpret_cast<const unsigned char*>(s);
434 int n = 0;
435 while ((n < n_limit) && (us[-n - 1] >= 0x41)) {++n;}
436 if (n >= n_limit) {n = 0;} // New boundary not found within range
437
438 // Also back up exactly one leading punctuation character if '"#@
439 if (n < n_limit) {
440 unsigned char c = us[-n - 1];
441 if ((c == '\'') || (c == '"') || (c == '#') || (c == '@')) {++n;}
442 }
443 // Shrink the previous chunk slightly
444 if (n > 0) {
445 ResultChunk* rc = &(*vec)[vec->size() - 1];
446 rc->bytes -= n;
447 mapped_offset -= n;
448 if (kShowLettersOriginal) {
449 fprintf(stderr, "Back up %d bytes<br>\n", n);
450 // Optionally print the prior chunk original text
451 string temp2(&scanner->GetBufferStart()[rc->offset], rc->bytes);
452 fprintf(stderr, "Prior [%d..%d) '%s'<br>\n",
453 rc->offset, rc->offset + rc->bytes,
454 GetHtmlEscapedText(temp2).c_str());
455 }
456 }
457 }
458
459 int mapped_len =
460 scanner->MapBack(unmapped_offset + unmapped_len) - mapped_offset;
461
462 if (kShowLettersOriginal) {
463 // Optionally print the chunk original text
464 string temp2(&scanner->GetBufferStart()[mapped_offset], mapped_len);
465 fprintf(stderr, "Original[%d..%d) '%s'<br>\n",
466 mapped_offset, mapped_offset + mapped_len,
467 GetHtmlEscapedText(temp2).c_str());
468 }
469
470 Language new_lang = static_cast<Language>(cs->lang1);
471 bool reliability_delta_bad =
472 (cs->reliability_delta < kUnreliablePercentThreshold);
473 bool reliability_score_bad =
474 (cs->reliability_score < kUnreliablePercentThreshold);
475
476 // If the top language matches last vector, ignore reliability_delta
477 uint16 prior_lang = PriorVecLang(vec);
478 if (prior_lang == cs->lang1) {
479 reliability_delta_bad = false;
480 }
481 // If the top language is in same close set as last vector, set up to merge
482 if (SameCloseSet(cs->lang1, prior_lang)) {
483 new_lang = static_cast<Language>(prior_lang);
484 reliability_delta_bad = false;
485 }
486 // If the top two languages are in the same close set and the last vector
487 // language is the second language, set up to merge
488 if (SameCloseSet(cs->lang1, cs->lang2) &&
489 (prior_lang == cs->lang2)) {
490 new_lang = static_cast<Language>(prior_lang);
491 reliability_delta_bad = false;
492 }
493 // If unreliable and the last and next vector languages are both
494 // the second language, set up to merge
495 uint16 next_lang = NextChunkLang(summarybuffer, i);
496 if (reliability_delta_bad &&
497 (prior_lang == cs->lang2) && (next_lang == cs->lang2)) {
498 new_lang = static_cast<Language>(prior_lang);
499 reliability_delta_bad = false;
500 }
501
502 if (reliability_delta_bad || reliability_score_bad) {
503 new_lang = UNKNOWN_LANGUAGE;
504 }
505 ItemToVector(scanner, vec, new_lang, mapped_offset, mapped_len);
506 }
507 }
508
509 // Add just one element to resultchunk vector:
510 // For RTypeNone or RTypeOne
JustOneItemToVector(ScriptScanner * scanner,const char * text,Language lang1,int unmapped_offset,int unmapped_len,ResultChunkVector * vec)511 void JustOneItemToVector(ScriptScanner* scanner, const char* text,
512 Language lang1, int unmapped_offset, int unmapped_len,
513 ResultChunkVector* vec) {
514 if (vec == NULL) {return;}
515
516 if (kShowLettersOriginal) {
517 fprintf(stderr, "map2original_ ");
518 scanner->map2original_.DumpWindow();
519 fprintf(stderr, "<br>\n");
520 fprintf(stderr, "map2uplow_ ");
521 scanner->map2uplow_.DumpWindow();
522 fprintf(stderr, "<br>\n");
523 }
524
525 if (kShowLettersOriginal) {
526 // Optionally print the chunk lowercase letters/marks text
527 string temp(&text[unmapped_offset], unmapped_len);
528 fprintf(stderr, "Letters1 [%d..%d) '%s'<br>\n",
529 unmapped_offset, unmapped_offset + unmapped_len,
530 GetHtmlEscapedText(temp).c_str());
531 }
532
533 int mapped_offset = scanner->MapBack(unmapped_offset);
534 int mapped_len =
535 scanner->MapBack(unmapped_offset + unmapped_len) - mapped_offset;
536
537 if (kShowLettersOriginal) {
538 // Optionally print the chunk original text
539 string temp2(&scanner->GetBufferStart()[mapped_offset], mapped_len);
540 fprintf(stderr, "Original1[%d..%d) '%s'<br>\n",
541 mapped_offset, mapped_offset + mapped_len,
542 GetHtmlEscapedText(temp2).c_str());
543 }
544
545 ItemToVector(scanner, vec, lang1, mapped_offset, mapped_len);
546 }
547
548
// Debugging helper used by the dump routines to turn a piece of text into a
// printable C string. Not thread safe. Defined in getonescriptspan.
char* DisplayPiece(const char* next_byte, int byte_length);
551
// Makes an indirect subscript easier to read when printed.
// If the high bit (0x80000000) is on, take out the high bit and add 2B
// (2000000000) so that table2 entries stand out; otherwise return x unchanged.
inline int PrintableIndirect(int x) {
  const unsigned int kHighBit = 0x80000000u;
  const unsigned int bits = static_cast<unsigned int>(x);
  if ((bits & kHighBit) == 0) {
    return x;
  }
  return (bits & ~kHighBit) + 2000000000;
}
DumpHitBuffer(FILE * df,const char * text,const ScoringHitBuffer * hitbuffer)559 void DumpHitBuffer(FILE* df, const char* text,
560 const ScoringHitBuffer* hitbuffer) {
561 fprintf(df,
562 "<br>DumpHitBuffer[%s, next_base/delta/distinct %d, %d, %d)<br>\n",
563 ULScriptCode(hitbuffer->ulscript),
564 hitbuffer->next_base, hitbuffer->next_delta,
565 hitbuffer->next_distinct);
566 for (int i = 0; i < hitbuffer->maxscoringhits; ++i) {
567 if (i < hitbuffer->next_base) {
568 fprintf(df, "Q[%d]%d,%d,%s ",
569 i, hitbuffer->base[i].offset,
570 PrintableIndirect(hitbuffer->base[i].indirect),
571 DisplayPiece(&text[hitbuffer->base[i].offset], 6));
572 }
573 if (i < hitbuffer->next_delta) {
574 fprintf(df, "DL[%d]%d,%d,%s ",
575 i, hitbuffer->delta[i].offset, hitbuffer->delta[i].indirect,
576 DisplayPiece(&text[hitbuffer->delta[i].offset], 12));
577 }
578 if (i < hitbuffer->next_distinct) {
579 fprintf(df, "D[%d]%d,%d,%s ",
580 i, hitbuffer->distinct[i].offset, hitbuffer->distinct[i].indirect,
581 DisplayPiece(&text[hitbuffer->distinct[i].offset], 12));
582 }
583 if (i < hitbuffer->next_base) {
584 fprintf(df, "<br>\n");
585 }
586 if (i > 50) {break;}
587 }
588 if (hitbuffer->next_base > 50) {
589 int i = hitbuffer->next_base;
590 fprintf(df, "Q[%d]%d,%d,%s ",
591 i, hitbuffer->base[i].offset,
592 PrintableIndirect(hitbuffer->base[i].indirect),
593 DisplayPiece(&text[hitbuffer->base[i].offset], 6));
594 }
595 if (hitbuffer->next_delta > 50) {
596 int i = hitbuffer->next_delta;
597 fprintf(df, "DL[%d]%d,%d,%s ",
598 i, hitbuffer->delta[i].offset, hitbuffer->delta[i].indirect,
599 DisplayPiece(&text[hitbuffer->delta[i].offset], 12));
600 }
601 if (hitbuffer->next_distinct > 50) {
602 int i = hitbuffer->next_distinct;
603 fprintf(df, "D[%d]%d,%d,%s ",
604 i, hitbuffer->distinct[i].offset, hitbuffer->distinct[i].indirect,
605 DisplayPiece(&text[hitbuffer->distinct[i].offset], 12));
606 }
607 fprintf(df, "<br>\n");
608 }
609
610
DumpLinearBuffer(FILE * df,const char * text,const ScoringHitBuffer * hitbuffer)611 void DumpLinearBuffer(FILE* df, const char* text,
612 const ScoringHitBuffer* hitbuffer) {
613 fprintf(df, "<br>DumpLinearBuffer[%d)<br>\n",
614 hitbuffer->next_linear);
615 // Include the dummy entry off the end
616 for (int i = 0; i < hitbuffer->next_linear + 1; ++i) {
617 if ((50 < i) && (i < (hitbuffer->next_linear - 1))) {continue;}
618 fprintf(df, "[%d]%d,%c=%08x,%s<br>\n",
619 i, hitbuffer->linear[i].offset,
620 "UQLD"[hitbuffer->linear[i].type],
621 hitbuffer->linear[i].langprob,
622 DisplayPiece(&text[hitbuffer->linear[i].offset], 6));
623 }
624 fprintf(df, "<br>\n");
625
626 fprintf(df, "DumpChunkStart[%d]<br>\n", hitbuffer->next_chunk_start);
627 for (int i = 0; i < hitbuffer->next_chunk_start + 1; ++i) {
628 fprintf(df, "[%d]%d\n", i, hitbuffer->chunk_start[i]);
629 }
630 fprintf(df, "<br>\n");
631 }
632
633 // Move this verbose debugging output to debug.cc eventually
DumpChunkSummary(FILE * df,const ChunkSummary * cs)634 void DumpChunkSummary(FILE* df, const ChunkSummary* cs) {
635 // Print chunksummary
636 fprintf(df, "%d lin[%d] %s.%d %s.%d %dB %d# %s %dRd %dRs<br>\n",
637 cs->offset,
638 cs->chunk_start,
639 LanguageCode(static_cast<Language>(cs->lang1)),
640 cs->score1,
641 LanguageCode(static_cast<Language>(cs->lang2)),
642 cs->score2,
643 cs->bytes,
644 cs->grams,
645 ULScriptCode(static_cast<ULScript>(cs->ulscript)),
646 cs->reliability_delta,
647 cs->reliability_score);
648 }
649
DumpSummaryBuffer(FILE * df,const SummaryBuffer * summarybuffer)650 void DumpSummaryBuffer(FILE* df, const SummaryBuffer* summarybuffer) {
651 fprintf(df, "<br>DumpSummaryBuffer[%d]<br>\n", summarybuffer->n);
652 fprintf(df, "[i] offset linear[chunk_start] lang.score1 lang.score2 "
653 "bytesB ngrams# script rel_delta rel_score<br>\n");
654 for (int i = 0; i <= summarybuffer->n; ++i) {
655 fprintf(df, "[%d] ", i);
656 DumpChunkSummary(df, &summarybuffer->chunksummary[i]);
657 }
658 fprintf(df, "<br>\n");
659 }
660
661
662
663 // Within hitbufer->linear[]
664 // <-- prior chunk --><-- this chunk -->
665 // | | |
666 // linear0 linear1 linear2
667 // lang0 lang1
668 // The goal of sharpening is to move this_linear to better separate langs
BetterBoundary(const char * text,ScoringHitBuffer * hitbuffer,ScoringContext * scoringcontext,uint16 pslang0,uint16 pslang1,int linear0,int linear1,int linear2)669 int BetterBoundary(const char* text,
670 ScoringHitBuffer* hitbuffer,
671 ScoringContext* scoringcontext,
672 uint16 pslang0, uint16 pslang1,
673 int linear0, int linear1, int linear2) {
674 // Degenerate case, no change
675 if ((linear2 - linear0) <= 8) {return linear1;}
676
677 // Each diff gives pslang0 score - pslang1 score
678 // Running diff has four entries + + + + followed by four entries - - - -
679 // so that this value is maximal at the sharpest boundary between pslang0
680 // (positive diffs) and pslang1 (negative diffs)
681 int running_diff = 0;
682 int diff[8]; // Ring buffer of pslang0-pslang1 differences
683 // Initialize with first 8 diffs
684 for (int i = linear0; i < linear0 + 8; ++i) {
685 int j = i & 7;
686 uint32 langprob = hitbuffer->linear[i].langprob;
687 diff[j] = GetLangScore(langprob, pslang0) -
688 GetLangScore(langprob, pslang1);
689 if (i < linear0 + 4) {
690 // First four diffs pslang0 - pslang1
691 running_diff += diff[j];
692 } else {
693 // Second four diffs -(pslang0 - pslang1)
694 running_diff -= diff[j];
695 }
696 }
697
698 // Now scan for sharpest boundary. j is at left end of 8 entries
699 // To be a boundary, there must be both >0 and <0 entries in the window
700 int better_boundary_value = 0;
701 int better_boundary = linear1;
702 for (int i = linear0; i < linear2 - 8; ++i) {
703 int j = i & 7;
704 if (better_boundary_value < running_diff) {
705 bool has_plus = false;
706 bool has_minus = false;
707 for (int kk = 0; kk < 8; ++kk) {
708 if (diff[kk] > 0) {has_plus = true;}
709 if (diff[kk] < 0) {has_minus = true;}
710 }
711 if (has_plus && has_minus) {
712 better_boundary_value = running_diff;
713 better_boundary = i + 4;
714 }
715 }
716 // Shift right one entry
717 uint32 langprob = hitbuffer->linear[i + 8].langprob;
718 int newdiff = GetLangScore(langprob, pslang0) -
719 GetLangScore(langprob, pslang1);
720 int middiff = diff[(i + 4) & 7];
721 int olddiff = diff[j];
722 diff[j] = newdiff;
723 running_diff -= olddiff; // Remove left
724 running_diff += 2 * middiff; // Convert middle from - to +
725 running_diff -= newdiff; // Insert right
726 }
727
728 if (scoringcontext->flags_cld2_verbose && (linear1 != better_boundary)) {
729 Language lang0 = FromPerScriptNumber(scoringcontext->ulscript, pslang0);
730 Language lang1 = FromPerScriptNumber(scoringcontext->ulscript, pslang1);
731 fprintf(scoringcontext->debug_file, " Better lin[%d=>%d] %s^^%s <br>\n",
732 linear1, better_boundary,
733 LanguageCode(lang0), LanguageCode(lang1));
734 int lin0_off = hitbuffer->linear[linear0].offset;
735 int lin1_off = hitbuffer->linear[linear1].offset;
736 int lin2_off = hitbuffer->linear[linear2].offset;
737 int better_offm1 = hitbuffer->linear[better_boundary - 1].offset;
738 int better_off = hitbuffer->linear[better_boundary].offset;
739 int better_offp1 = hitbuffer->linear[better_boundary + 1].offset;
740 string old0(&text[lin0_off], lin1_off - lin0_off);
741 string old1(&text[lin1_off], lin2_off - lin1_off);
742 string new0(&text[lin0_off], better_offm1 - lin0_off);
743 string new0m1(&text[better_offm1], better_off - better_offm1);
744 string new1(&text[better_off], better_offp1 - better_off);
745 string new1p1(&text[better_offp1], lin2_off - better_offp1);
746 fprintf(scoringcontext->debug_file, "%s^^%s => <br>\n%s^%s^^%s^%s<br>\n",
747 GetHtmlEscapedText(old0).c_str(),
748 GetHtmlEscapedText(old1).c_str(),
749 GetHtmlEscapedText(new0).c_str(),
750 GetHtmlEscapedText(new0m1).c_str(),
751 GetHtmlEscapedText(new1).c_str(),
752 GetHtmlEscapedText(new1p1).c_str());
753 // Slow picture of differences per linear entry
754 int d;
755 for (int i = linear0; i < linear2; ++i) {
756 if (i == better_boundary) {
757 fprintf(scoringcontext->debug_file, "^^ ");
758 }
759 uint32 langprob = hitbuffer->linear[i].langprob;
760 d = GetLangScore(langprob, pslang0) - GetLangScore(langprob, pslang1);
761 const char* s = "=";
762 //if (d > 2) {s = "\xc2\xaf";} // Macron
763 if (d > 2) {s = "#";}
764 else if (d > 0) {s = "+";}
765 else if (d < -2) {s = "_";}
766 else if (d < 0) {s = "-";}
767 fprintf(scoringcontext->debug_file, "%s ", s);
768 }
769 fprintf(scoringcontext->debug_file, " (scale: #+=-_)<br>\n");
770 }
771 return better_boundary;
772 }
773
774
775 // For all but the first summary, if its top language differs from
776 // the previous chunk, refine the boundary
777 // Linearized version
SharpenBoundaries(const char * text,bool more_to_come,ScoringHitBuffer * hitbuffer,ScoringContext * scoringcontext,SummaryBuffer * summarybuffer)778 void SharpenBoundaries(const char* text,
779 bool more_to_come,
780 ScoringHitBuffer* hitbuffer,
781 ScoringContext* scoringcontext,
782 SummaryBuffer* summarybuffer) {
783
784 int prior_linear = summarybuffer->chunksummary[0].chunk_start;
785 uint16 prior_lang = summarybuffer->chunksummary[0].lang1;
786
787 if (scoringcontext->flags_cld2_verbose) {
788 fprintf(scoringcontext->debug_file, "<br>SharpenBoundaries<br>\n");
789 }
790 for (int i = 1; i < summarybuffer->n; ++i) {
791 ChunkSummary* cs = &summarybuffer->chunksummary[i];
792 uint16 this_lang = cs->lang1;
793 if (this_lang == prior_lang) {
794 prior_linear = cs->chunk_start;
795 continue;
796 }
797
798 int this_linear = cs->chunk_start;
799 int next_linear = summarybuffer->chunksummary[i + 1].chunk_start;
800
801 // If this/prior in same close set, don't move boundary
802 if (SameCloseSet(prior_lang, this_lang)) {
803 prior_linear = this_linear;
804 prior_lang = this_lang;
805 continue;
806 }
807
808
809 // Within hitbuffer->linear[]
810 // <-- prior chunk --><-- this chunk -->
811 // | | |
812 // prior_linear this_linear next_linear
813 // prior_lang this_lang
814 // The goal of sharpening is to move this_linear to better separate langs
815
816 uint8 pslang0 = PerScriptNumber(scoringcontext->ulscript,
817 static_cast<Language>(prior_lang));
818 uint8 pslang1 = PerScriptNumber(scoringcontext->ulscript,
819 static_cast<Language>(this_lang));
820 int better_linear = BetterBoundary(text,
821 hitbuffer,
822 scoringcontext,
823 pslang0, pslang1,
824 prior_linear, this_linear, next_linear);
825
826 int old_offset = hitbuffer->linear[this_linear].offset;
827 int new_offset = hitbuffer->linear[better_linear].offset;
828 cs->chunk_start = better_linear;
829 cs->offset = new_offset;
830 // If this_linear moved right, make bytes smaller for this, larger for prior
831 // If this_linear moved left, make bytes larger for this, smaller for prior
832 cs->bytes -= (new_offset - old_offset);
833 summarybuffer->chunksummary[i - 1].bytes += (new_offset - old_offset);
834
835 this_linear = better_linear; // Update so that next chunk doesn't intrude
836
837 // Consider rescoring the two chunks
838
839 // Update for next round (note: using pre-updated boundary)
840 prior_linear = this_linear;
841 prior_lang = this_lang;
842 }
843 }
844
845 // Make a langprob that gives small weight to the default language for ulscript
DefaultLangProb(ULScript ulscript)846 uint32 DefaultLangProb(ULScript ulscript) {
847 Language default_lang = DefaultLanguage(ulscript);
848 return MakeLangProb(default_lang, 1);
849 }
850
851 // Effectively, do a merge-sort based on text offsets
852 // Look up each indirect value in appropriate scoring table and keep
853 // just the resulting langprobs
LinearizeAll(ScoringContext * scoringcontext,bool score_cjk,ScoringHitBuffer * hitbuffer)854 void LinearizeAll(ScoringContext* scoringcontext, bool score_cjk,
855 ScoringHitBuffer* hitbuffer) {
856 const CLD2TableSummary* base_obj; // unigram or quadgram
857 const CLD2TableSummary* base_obj2; // quadgram dual table
858 const CLD2TableSummary* delta_obj; // bigram or octagram
859 const CLD2TableSummary* distinct_obj; // bigram or octagram
860 uint16 base_hit;
861 if (score_cjk) {
862 base_obj = scoringcontext->scoringtables->unigram_compat_obj;
863 base_obj2 = scoringcontext->scoringtables->unigram_compat_obj;
864 delta_obj = scoringcontext->scoringtables->deltabi_obj;
865 distinct_obj = scoringcontext->scoringtables->distinctbi_obj;
866 base_hit = UNIHIT;
867 } else {
868 base_obj = scoringcontext->scoringtables->quadgram_obj;
869 base_obj2 = scoringcontext->scoringtables->quadgram_obj2;
870 delta_obj = scoringcontext->scoringtables->deltaocta_obj;
871 distinct_obj = scoringcontext->scoringtables->distinctocta_obj;
872 base_hit = QUADHIT;
873 }
874
875 int base_limit = hitbuffer->next_base;
876 int delta_limit = hitbuffer->next_delta;
877 int distinct_limit = hitbuffer->next_distinct;
878 int base_i = 0;
879 int delta_i = 0;
880 int distinct_i = 0;
881 int linear_i = 0;
882
883 // Start with an initial base hit for the default language for this script
884 // Inserting this avoids edge effects with no hits at all
885 hitbuffer->linear[linear_i].offset = hitbuffer->lowest_offset;
886 hitbuffer->linear[linear_i].type = base_hit;
887 hitbuffer->linear[linear_i].langprob =
888 DefaultLangProb(scoringcontext->ulscript);
889 ++linear_i;
890
891 while ((base_i < base_limit) || (delta_i < delta_limit) ||
892 (distinct_i < distinct_limit)) {
893 int base_off = hitbuffer->base[base_i].offset;
894 int delta_off = hitbuffer->delta[delta_i].offset;
895 int distinct_off = hitbuffer->distinct[distinct_i].offset;
896
897 // Do delta and distinct first, so that they are not lost at base_limit
898 if ((delta_i < delta_limit) &&
899 (delta_off <= base_off) && (delta_off <= distinct_off)) {
900 // Add delta entry
901 int indirect = hitbuffer->delta[delta_i].indirect;
902 ++delta_i;
903 uint32 langprob = delta_obj->kCLDTableInd[indirect];
904 if (langprob > 0) {
905 hitbuffer->linear[linear_i].offset = delta_off;
906 hitbuffer->linear[linear_i].type = DELTAHIT;
907 hitbuffer->linear[linear_i].langprob = langprob;
908 ++linear_i;
909 }
910 }
911 else if ((distinct_i < distinct_limit) &&
912 (distinct_off <= base_off) && (distinct_off <= delta_off)) {
913 // Add distinct entry
914 int indirect = hitbuffer->distinct[distinct_i].indirect;
915 ++distinct_i;
916 uint32 langprob = distinct_obj->kCLDTableInd[indirect];
917 if (langprob > 0) {
918 hitbuffer->linear[linear_i].offset = distinct_off;
919 hitbuffer->linear[linear_i].type = DISTINCTHIT;
920 hitbuffer->linear[linear_i].langprob = langprob;
921 ++linear_i;
922 }
923 }
924 else {
925 // Add one or two base entries
926 int indirect = hitbuffer->base[base_i].indirect;
927 // First, get right scoring table
928 const CLD2TableSummary* local_base_obj = base_obj;
929 if ((indirect & 0x80000000u) != 0) {
930 local_base_obj = base_obj2;
931 indirect &= ~0x80000000u;
932 }
933 ++base_i;
934 // One langprob in kQuadInd[0..SingleSize),
935 // two in kQuadInd[SingleSize..Size)
936 if (indirect < static_cast<int>(local_base_obj->kCLDTableSizeOne)) {
937 // Up to three languages at indirect
938 uint32 langprob = local_base_obj->kCLDTableInd[indirect];
939 if (langprob > 0) {
940 hitbuffer->linear[linear_i].offset = base_off;
941 hitbuffer->linear[linear_i].type = base_hit;
942 hitbuffer->linear[linear_i].langprob = langprob;
943 ++linear_i;
944 }
945 } else {
946 // Up to six languages at start + 2 * (indirect - start)
947 indirect += (indirect - local_base_obj->kCLDTableSizeOne);
948 uint32 langprob = local_base_obj->kCLDTableInd[indirect];
949 uint32 langprob2 = local_base_obj->kCLDTableInd[indirect + 1];
950 if (langprob > 0) {
951 hitbuffer->linear[linear_i].offset = base_off;
952 hitbuffer->linear[linear_i].type = base_hit;
953 hitbuffer->linear[linear_i].langprob = langprob;
954 ++linear_i;
955 }
956 if (langprob2 > 0) {
957 hitbuffer->linear[linear_i].offset = base_off;
958 hitbuffer->linear[linear_i].type = base_hit;
959 hitbuffer->linear[linear_i].langprob = langprob2;
960 ++linear_i;
961 }
962 }
963 }
964 }
965
966 // Update
967 hitbuffer->next_linear = linear_i;
968
969 // Add a dummy entry off the end, just to capture final offset
970 hitbuffer->linear[linear_i].offset =
971 hitbuffer->base[hitbuffer->next_base].offset;
972 hitbuffer->linear[linear_i].langprob = 0;
973 }
974
975 // Break linear array into chunks of ~20 quadgram hits or ~50 CJK unigram hits
ChunkAll(int letter_offset,bool score_cjk,ScoringHitBuffer * hitbuffer)976 void ChunkAll(int letter_offset, bool score_cjk, ScoringHitBuffer* hitbuffer) {
977 int chunksize;
978 uint16 base_hit;
979 if (score_cjk) {
980 chunksize = kChunksizeUnis;
981 base_hit = UNIHIT;
982 } else {
983 chunksize = kChunksizeQuads;
984 base_hit = QUADHIT;
985 }
986
987 int linear_i = 0;
988 int linear_off_end = hitbuffer->next_linear;
989 int text_i = letter_offset; // Next unseen text offset
990 int next_chunk_start = 0;
991 int bases_left = hitbuffer->next_base;
992 while (bases_left > 0) {
993 // Linearize one chunk
994 int base_len = chunksize; // Default; may be changed below
995 if (bases_left < (chunksize + (chunksize >> 1))) {
996 // If within 1.5 chunks of the end, avoid runts by using it all
997 base_len = bases_left;
998 } else if (bases_left < (2 * chunksize)) {
999 // Avoid runts by splitting 1.5 to 2 chunks in half (about 3/4 each)
1000 base_len = (bases_left + 1) >> 1;
1001 }
1002
1003 hitbuffer->chunk_start[next_chunk_start] = linear_i;
1004 hitbuffer->chunk_offset[next_chunk_start] = text_i;
1005 ++next_chunk_start;
1006
1007 int base_count = 0;
1008 while ((base_count < base_len) && (linear_i < linear_off_end)) {
1009 if (hitbuffer->linear[linear_i].type == base_hit) {++base_count;}
1010 ++linear_i;
1011 }
1012 text_i = hitbuffer->linear[linear_i].offset; // Next unseen text offset
1013 bases_left -= base_len;
1014 }
1015
1016 // If no base hits at all, make a single dummy chunk
1017 if (next_chunk_start == 0) {
1018 hitbuffer->chunk_start[next_chunk_start] = 0;
1019 hitbuffer->chunk_offset[next_chunk_start] = hitbuffer->linear[0].offset;
1020 ++next_chunk_start;
1021 }
1022
1023 // Remember the linear array start of dummy entry
1024 hitbuffer->next_chunk_start = next_chunk_start;
1025
1026 // Add a dummy entry off the end, just to capture final linear subscr
1027 hitbuffer->chunk_start[next_chunk_start] = hitbuffer->next_linear;
1028 hitbuffer->chunk_offset[next_chunk_start] = text_i;
1029 }
1030
1031
1032 // Merge-sort the individual hit arrays, go indirect on the scoring subscripts,
1033 // break linear array into chunks.
1034 //
1035 // Input:
1036 // hitbuffer base, delta, distinct arrays
1037 // Output:
1038 // linear array
1039 // chunk_start array
1040 //
LinearizeHitBuffer(int letter_offset,ScoringContext * scoringcontext,bool more_to_come,bool score_cjk,ScoringHitBuffer * hitbuffer)1041 void LinearizeHitBuffer(int letter_offset,
1042 ScoringContext* scoringcontext,
1043 bool more_to_come, bool score_cjk,
1044 ScoringHitBuffer* hitbuffer) {
1045 LinearizeAll(scoringcontext, score_cjk, hitbuffer);
1046 ChunkAll(letter_offset, score_cjk, hitbuffer);
1047 }
1048
1049
1050
1051 // The hitbuffer is in an awkward form -- three sets of base/delta/distinct
1052 // scores, each with an indirect subscript to one of six scoring tables, some
1053 // of which can yield two langprobs for six languages, others one langprob for
1054 // three languages. The only correlation between base/delta/distinct is their
1055 // offsets into the letters-only text buffer.
1056 //
1057 // SummaryBuffer needs to be built to linear, giving linear offset of start of
1058 // each chunk
1059 //
1060 // So we first do all the langprob lookups and merge-sort by offset to make
1061 // a single linear vector, building a side vector of chunk beginnings as we go.
1062 // The sharpening is simply moving the beginnings, scoring is a simple linear
1063 // sweep, etc.
1064
ProcessHitBuffer(const LangSpan & scriptspan,int letter_offset,ScoringContext * scoringcontext,DocTote * doc_tote,ResultChunkVector * vec,bool more_to_come,bool score_cjk,ScoringHitBuffer * hitbuffer)1065 void ProcessHitBuffer(const LangSpan& scriptspan,
1066 int letter_offset,
1067 ScoringContext* scoringcontext,
1068 DocTote* doc_tote,
1069 ResultChunkVector* vec,
1070 bool more_to_come, bool score_cjk,
1071 ScoringHitBuffer* hitbuffer) {
1072 if (scoringcontext->flags_cld2_verbose) {
1073 fprintf(scoringcontext->debug_file, "Hitbuffer[) ");
1074 DumpHitBuffer(scoringcontext->debug_file, scriptspan.text, hitbuffer);
1075 }
1076
1077 LinearizeHitBuffer(letter_offset, scoringcontext, more_to_come, score_cjk,
1078 hitbuffer);
1079
1080 if (scoringcontext->flags_cld2_verbose) {
1081 fprintf(scoringcontext->debug_file, "Linear[) ");
1082 DumpLinearBuffer(scoringcontext->debug_file, scriptspan.text, hitbuffer);
1083 }
1084
1085 SummaryBuffer summarybuffer;
1086 summarybuffer.n = 0;
1087 ChunkSpan last_cspan;
1088 ScoreAllHits(scriptspan.text, scriptspan.ulscript,
1089 more_to_come, score_cjk, hitbuffer,
1090 scoringcontext,
1091 &summarybuffer, &last_cspan);
1092
1093 if (scoringcontext->flags_cld2_verbose) {
1094 DumpSummaryBuffer(scoringcontext->debug_file, &summarybuffer);
1095 }
1096
1097 if (vec != NULL) {
1098 // Sharpen boundaries of summarybuffer
1099 // This is not a high-performance path
1100 SharpenBoundaries(scriptspan.text, more_to_come, hitbuffer, scoringcontext,
1101 &summarybuffer);
1102 // Show after the sharpening
1103 // CLD2_Debug2(scriptspan.text, more_to_come, score_cjk,
1104 // hitbuffer, scoringcontext, &summarybuffer);
1105
1106 if (scoringcontext->flags_cld2_verbose) {
1107 DumpSummaryBuffer(scoringcontext->debug_file, &summarybuffer);
1108 }
1109 }
1110
1111 SummaryBufferToDocTote(&summarybuffer, more_to_come, doc_tote);
1112 SummaryBufferToVector(scoringcontext->scanner, scriptspan.text,
1113 &summarybuffer, more_to_come, vec);
1114 }
1115
SpliceHitBuffer(ScoringHitBuffer * hitbuffer,int next_offset)1116 void SpliceHitBuffer(ScoringHitBuffer* hitbuffer, int next_offset) {
1117 // Splice hitbuffer and summarybuffer for next round. With big chunks and
1118 // distinctive-word state carried across chunks, we might not need to do this.
1119 hitbuffer->next_base = 0;
1120 hitbuffer->next_delta = 0;
1121 hitbuffer->next_distinct = 0;
1122 hitbuffer->next_linear = 0;
1123 hitbuffer->next_chunk_start = 0;
1124 hitbuffer->lowest_offset = next_offset;
1125 }
1126
1127
1128 // Score RTypeNone or RTypeOne scriptspan into doc_tote and vec, updating
1129 // scoringcontext
ScoreEntireScriptSpan(const LangSpan & scriptspan,ScoringContext * scoringcontext,DocTote * doc_tote,ResultChunkVector * vec)1130 void ScoreEntireScriptSpan(const LangSpan& scriptspan,
1131 ScoringContext* scoringcontext,
1132 DocTote* doc_tote,
1133 ResultChunkVector* vec) {
1134 int bytes = scriptspan.text_bytes;
1135 // Artificially set score to 1024 per 1KB, or 1 per byte
1136 int score = bytes;
1137 int reliability = 100;
1138 // doc_tote uses full languages
1139 Language one_one_lang = DefaultLanguage(scriptspan.ulscript);
1140 doc_tote->Add(one_one_lang, bytes, score, reliability);
1141
1142 if (scoringcontext->flags_cld2_html) {
1143 ChunkSummary chunksummary = {
1144 1, 0,
1145 one_one_lang, UNKNOWN_LANGUAGE, score, 1,
1146 bytes, 0, scriptspan.ulscript, reliability, reliability
1147 };
1148 CLD2_Debug(scriptspan.text, 1, scriptspan.text_bytes,
1149 false, false, NULL,
1150 scoringcontext, NULL, &chunksummary);
1151 }
1152
1153 // First byte is always a space
1154 JustOneItemToVector(scoringcontext->scanner, scriptspan.text,
1155 one_one_lang, 1, bytes - 1, vec);
1156
1157 scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
1158 }
1159
1160 // Score RTypeCJK scriptspan into doc_tote and vec, updating scoringcontext
ScoreCJKScriptSpan(const LangSpan & scriptspan,ScoringContext * scoringcontext,DocTote * doc_tote,ResultChunkVector * vec)1161 void ScoreCJKScriptSpan(const LangSpan& scriptspan,
1162 ScoringContext* scoringcontext,
1163 DocTote* doc_tote,
1164 ResultChunkVector* vec) {
1165 // Allocate three parallel arrays of scoring hits
1166 ScoringHitBuffer* hitbuffer = new ScoringHitBuffer;
1167 hitbuffer->init();
1168 hitbuffer->ulscript = scriptspan.ulscript;
1169
1170 scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
1171 scoringcontext->oldest_distinct_boost = 0;
1172
1173 // Incoming scriptspan has a single leading space at scriptspan.text[0]
1174 // and three trailing spaces then NUL at scriptspan.text[text_bytes + 0/1/2/3]
1175
1176 int letter_offset = 1; // Skip initial space
1177 hitbuffer->lowest_offset = letter_offset;
1178 int letter_limit = scriptspan.text_bytes;
1179 while (letter_offset < letter_limit) {
1180 if (scoringcontext->flags_cld2_verbose) {
1181 fprintf(scoringcontext->debug_file, " ScoreCJKScriptSpan[%d,%d)<br>\n",
1182 letter_offset, letter_limit);
1183 }
1184 //
1185 // Fill up one hitbuffer, possibly splicing onto previous fragment
1186 //
1187 // NOTE: GetUniHits deals with close repeats
1188 // NOTE: After last chunk there is always a hitbuffer entry with an offset
1189 // just off the end of the text = next_offset.
1190 int next_offset = GetUniHits(scriptspan.text, letter_offset, letter_limit,
1191 scoringcontext, hitbuffer);
1192 // NOTE: GetBiHitVectors deals with close repeats,
1193 // does one hash and two lookups (delta and distinct) per word
1194 GetBiHits(scriptspan.text, letter_offset, next_offset,
1195 scoringcontext, hitbuffer);
1196
1197 //
1198 // Score one hitbuffer in chunks to summarybuffer
1199 //
1200 bool more_to_come = next_offset < letter_limit;
1201 bool score_cjk = true;
1202 ProcessHitBuffer(scriptspan, letter_offset, scoringcontext, doc_tote, vec,
1203 more_to_come, score_cjk, hitbuffer);
1204 SpliceHitBuffer(hitbuffer, next_offset);
1205
1206 letter_offset = next_offset;
1207 }
1208
1209 delete hitbuffer;
1210 // Context across buffers is not connected yet
1211 scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
1212 }
1213
1214
1215
1216 // Score RTypeMany scriptspan into doc_tote and vec, updating scoringcontext
1217 // We have a scriptspan with all lowercase text in one script. Look up
1218 // quadgrams and octagrams, saving the hits in three parallel vectors.
1219 // Score from those vectors in chunks, toting each chunk to get a single
1220 // language, and combining into the overall document score. The hit vectors
1221 // in general are not big enough to handle and entire scriptspan, so
1222 // repeat until the entire scriptspan is scored.
1223 // Caller deals with minimizing numbr of runt scriptspans
1224 // This routine deals with minimizing number of runt chunks.
1225 //
1226 // Returns updated scoringcontext
1227 // Returns updated doc_tote
1228 // If vec != NULL, appends to that vector of ResultChunk's
ScoreQuadScriptSpan(const LangSpan & scriptspan,ScoringContext * scoringcontext,DocTote * doc_tote,ResultChunkVector * vec)1229 void ScoreQuadScriptSpan(const LangSpan& scriptspan,
1230 ScoringContext* scoringcontext,
1231 DocTote* doc_tote,
1232 ResultChunkVector* vec) {
1233 // Allocate three parallel arrays of scoring hits
1234 ScoringHitBuffer* hitbuffer = new ScoringHitBuffer;
1235 hitbuffer->init();
1236 hitbuffer->ulscript = scriptspan.ulscript;
1237
1238 scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
1239 scoringcontext->oldest_distinct_boost = 0;
1240
1241 // Incoming scriptspan has a single leading space at scriptspan.text[0]
1242 // and three trailing spaces then NUL at scriptspan.text[text_bytes + 0/1/2/3]
1243
1244 int letter_offset = 1; // Skip initial space
1245 hitbuffer->lowest_offset = letter_offset;
1246 int letter_limit = scriptspan.text_bytes;
1247 while (letter_offset < letter_limit) {
1248 //
1249 // Fill up one hitbuffer, possibly splicing onto previous fragment
1250 //
1251 // NOTE: GetQuadHits deals with close repeats
1252 // NOTE: After last chunk there is always a hitbuffer entry with an offset
1253 // just off the end of the text = next_offset.
1254 int next_offset = GetQuadHits(scriptspan.text, letter_offset, letter_limit,
1255 scoringcontext, hitbuffer);
1256 // If true, there is more text to process in this scriptspan
1257 // NOTE: GetOctaHitVectors deals with close repeats,
1258 // does one hash and two lookups (delta and distinct) per word
1259 GetOctaHits(scriptspan.text, letter_offset, next_offset,
1260 scoringcontext, hitbuffer);
1261
1262 //
1263 // Score one hitbuffer in chunks to summarybuffer
1264 //
1265 bool more_to_come = next_offset < letter_limit;
1266 bool score_cjk = false;
1267 ProcessHitBuffer(scriptspan, letter_offset, scoringcontext, doc_tote, vec,
1268 more_to_come, score_cjk, hitbuffer);
1269 SpliceHitBuffer(hitbuffer, next_offset);
1270
1271 letter_offset = next_offset;
1272 }
1273
1274 delete hitbuffer;
1275 }
1276
1277
1278 // Score one scriptspan into doc_tote and vec, updating scoringcontext
1279 // Inputs:
1280 // One scriptspan of perhaps 40-60KB, all same script lower-case letters
1281 // and single ASCII spaces. First character is a space to allow simple
1282 // begining-of-word detect. End of buffer has three spaces and NUL to
1283 // allow easy scan-to-end-of-word.
1284 // Scoring context of
1285 // scoring tables
1286 // flags
1287 // running boosts
1288 // Outputs:
1289 // Updated doc_tote giving overall languages and byte counts
1290 // Optional updated chunk vector giving offset, length, language
1291 //
1292 // Caller initializes flags, boosts, doc_tote and vec.
1293 // Caller aggregates across multiple scriptspans
1294 // Caller calculates final document result
1295 // Caller deals with detecting and triggering suppression of repeated text.
1296 //
1297 // This top-level routine just chooses the recognition type and calls one of
1298 // the next-level-down routines.
1299 //
ScoreOneScriptSpan(const LangSpan & scriptspan,ScoringContext * scoringcontext,DocTote * doc_tote,ResultChunkVector * vec)1300 void ScoreOneScriptSpan(const LangSpan& scriptspan,
1301 ScoringContext* scoringcontext,
1302 DocTote* doc_tote,
1303 ResultChunkVector* vec) {
1304 if (scoringcontext->flags_cld2_verbose) {
1305 fprintf(scoringcontext->debug_file, "<br>ScoreOneScriptSpan(%s,%d) ",
1306 ULScriptCode(scriptspan.ulscript), scriptspan.text_bytes);
1307 // Optionally print the chunk lowercase letters/marks text
1308 string temp(&scriptspan.text[0], scriptspan.text_bytes);
1309 fprintf(scoringcontext->debug_file, "'%s'",
1310 GetHtmlEscapedText(temp).c_str());
1311 fprintf(scoringcontext->debug_file, "<br>\n");
1312 }
1313 scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
1314 scoringcontext->oldest_distinct_boost = 0;
1315 ULScriptRType rtype = ULScriptRecognitionType(scriptspan.ulscript);
1316 if (scoringcontext->flags_cld2_score_as_quads && (rtype != RTypeCJK)) {
1317 rtype = RTypeMany;
1318 }
1319 switch (rtype) {
1320 case RTypeNone:
1321 case RTypeOne:
1322 ScoreEntireScriptSpan(scriptspan, scoringcontext, doc_tote, vec);
1323 break;
1324 case RTypeCJK:
1325 ScoreCJKScriptSpan(scriptspan, scoringcontext, doc_tote, vec);
1326 break;
1327 case RTypeMany:
1328 ScoreQuadScriptSpan(scriptspan, scoringcontext, doc_tote, vec);
1329 break;
1330 }
1331 }
1332
1333 } // End namespace CLD2
1334
1335