1 // Copyright 2013 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 //
16 // Author: dsites@google.com (Dick Sites)
17 // Updated 2014.01 for dual table lookup
18 //
19 
20 #include "scoreonescriptspan.h"
21 
22 #include "cldutil.h"
23 #include "debug.h"
24 #include "lang_script.h"
25 
26 #include <stdio.h>
27 
28 using namespace std;
29 
30 namespace CLD2 {
31 
// Reliability percentage below which SummaryBufferToVector treats a chunk's
// delta or score reliability as bad and reports the chunk as UNKNOWN_LANGUAGE.
static const int kUnreliablePercentThreshold = 75;
33 
AddLangProb(uint32 langprob,Tote * chunk_tote)34 void AddLangProb(uint32 langprob, Tote* chunk_tote) {
35   ProcessProbV2Tote(langprob, chunk_tote);
36 }
37 
ZeroPSLang(uint32 langprob,Tote * chunk_tote)38 void ZeroPSLang(uint32 langprob, Tote* chunk_tote) {
39   uint8 top1 = (langprob >> 8) & 0xff;
40   chunk_tote->SetScore(top1, 0);
41 }
42 
SameCloseSet(uint16 lang1,uint16 lang2)43 bool SameCloseSet(uint16 lang1, uint16 lang2) {
44   int lang1_close_set = LanguageCloseSet(static_cast<Language>(lang1));
45   if (lang1_close_set == 0) {return false;}
46   int lang2_close_set = LanguageCloseSet(static_cast<Language>(lang2));
47   return (lang1_close_set == lang2_close_set);
48 }
49 
SameCloseSet(Language lang1,Language lang2)50 bool SameCloseSet(Language lang1, Language lang2) {
51   int lang1_close_set = LanguageCloseSet(lang1);
52   if (lang1_close_set == 0) {return false;}
53   int lang2_close_set = LanguageCloseSet(lang2);
54   return (lang1_close_set == lang2_close_set);
55 }
56 
57 
58 // Needs expected score per 1KB in scoring context
SetChunkSummary(ULScript ulscript,int first_linear_in_chunk,int offset,int len,const ScoringContext * scoringcontext,const Tote * chunk_tote,ChunkSummary * chunksummary)59 void SetChunkSummary(ULScript ulscript, int first_linear_in_chunk,
60                      int offset, int len,
61                      const ScoringContext* scoringcontext,
62                      const Tote* chunk_tote,
63                      ChunkSummary* chunksummary) {
64   int key3[3];
65   chunk_tote->CurrentTopThreeKeys(key3);
66   Language lang1 = FromPerScriptNumber(ulscript, key3[0]);
67   Language lang2 = FromPerScriptNumber(ulscript, key3[1]);
68 
69   int actual_score_per_kb = 0;
70   if (len > 0) {
71     actual_score_per_kb = (chunk_tote->GetScore(key3[0]) << 10) / len;
72   }
73   int expected_subscr = lang1 * 4 + LScript4(ulscript);
74   int expected_score_per_kb =
75      scoringcontext->scoringtables->kExpectedScore[expected_subscr];
76 
77   chunksummary->offset = offset;
78   chunksummary->chunk_start = first_linear_in_chunk;
79   chunksummary->lang1 = lang1;
80   chunksummary->lang2 = lang2;
81   chunksummary->score1 = chunk_tote->GetScore(key3[0]);
82   chunksummary->score2 = chunk_tote->GetScore(key3[1]);
83   chunksummary->bytes = len;
84   chunksummary->grams = chunk_tote->GetScoreCount();
85   chunksummary->ulscript = ulscript;
86   chunksummary->reliability_delta = ReliabilityDelta(chunksummary->score1,
87                                                      chunksummary->score2,
88                                                      chunksummary->grams);
89   // If lang1/lang2 in same close set, set delta reliability to 100%
90   if (SameCloseSet(lang1, lang2)) {
91     chunksummary->reliability_delta = 100;
92   }
93   chunksummary->reliability_score =
94      ReliabilityExpected(actual_score_per_kb, expected_score_per_kb);
95 }
96 
97 // Return true if just lang1 is there: lang2=0 and lang3=0
IsSingleLang(uint32 langprob)98 bool IsSingleLang(uint32 langprob) {
99   // Probably a bug -- which end is lang1? But only used to call empty Boost1
100   return ((langprob & 0x00ffff00) == 0);
101 }
102 
103 // Update scoring context distinct_boost for single language quad
AddDistinctBoost1(uint32 langprob,ScoringContext * scoringcontext)104 void AddDistinctBoost1(uint32 langprob, ScoringContext* scoringcontext) {
105   // Probably keep this empty -- not a good enough signal
106 }
107 
108 // Update scoring context distinct_boost for distinct octagram
109 // Keep last 4 used. Since these are mostly (except at splices) in
110 // hitbuffer, we might be able to just use a subscript and splice
AddDistinctBoost2(uint32 langprob,ScoringContext * scoringcontext)111 void AddDistinctBoost2(uint32 langprob, ScoringContext* scoringcontext) {
112 // this is called 0..n times per chunk with decoded hitbuffer->distinct...
113   LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn;
114   if (scoringcontext->ulscript != ULScript_Latin) {
115     distinct_boost = &scoringcontext->distinct_boost.othr;
116   }
117   int n = distinct_boost->n;
118   distinct_boost->langprob[n] = langprob;
119   distinct_boost->n = distinct_boost->wrap(n + 1);
120 }
121 
122 // For each chunk, add extra weight for language priors (from content-lang and
123 // meta lang=xx) and distinctive tokens
ScoreBoosts(const ScoringContext * scoringcontext,Tote * chunk_tote)124 void ScoreBoosts(const ScoringContext* scoringcontext, Tote* chunk_tote) {
125   // Get boosts for current script
126   const LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;
127   const LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn;
128   const LangBoosts* distinct_boost = &scoringcontext->distinct_boost.latn;
129   if (scoringcontext->ulscript != ULScript_Latin) {
130     langprior_boost = &scoringcontext->langprior_boost.othr;
131     langprior_whack = &scoringcontext->langprior_whack.othr;
132     distinct_boost = &scoringcontext->distinct_boost.othr;
133   }
134 
135   for (int k = 0; k < kMaxBoosts; ++k) {
136     uint32 langprob = langprior_boost->langprob[k];
137     if (langprob > 0) {AddLangProb(langprob, chunk_tote);}
138   }
139   for (int k = 0; k < kMaxBoosts; ++k) {
140     uint32 langprob = distinct_boost->langprob[k];
141     if (langprob > 0) {AddLangProb(langprob, chunk_tote);}
142   }
143   // boost has a packed set of per-script langs and probabilites
144   // whack has a packed set of per-script lang to be suppressed (zeroed)
145   // When a language in a close set is given as an explicit hint, others in
146   //  that set will be whacked here.
147   for (int k = 0; k < kMaxBoosts; ++k) {
148     uint32 langprob = langprior_whack->langprob[k];
149     if (langprob > 0) {ZeroPSLang(langprob, chunk_tote);}
150   }
151 }
152 
153 
154 
155 // At this point, The chunk is described by
156 //  hitbuffer->base[cspan->chunk_base .. cspan->chunk_base + cspan->base_len)
157 //  hitbuffer->delta[cspan->chunk_delta ... )
158 //  hitbuffer->distinct[cspan->chunk_distinct ... )
159 // Scored text is in text[lo..hi) where
160 //  lo is 0 or the min of first base/delta/distinct hitbuffer offset and
161 //  hi is the min of next base/delta/distinct hitbuffer offset after
162 //  base_len, etc.
GetTextSpanOffsets(const ScoringHitBuffer * hitbuffer,const ChunkSpan * cspan,int * lo,int * hi)163 void GetTextSpanOffsets(const ScoringHitBuffer* hitbuffer,
164                         const ChunkSpan* cspan, int* lo, int* hi) {
165   // Front of this span
166   int lo_base = hitbuffer->base[cspan->chunk_base].offset;
167   int lo_delta = hitbuffer->delta[cspan->chunk_delta].offset;
168   int lo_distinct = hitbuffer->distinct[cspan->chunk_distinct].offset;
169   // Front of next span
170   int hi_base = hitbuffer->base[cspan->chunk_base +
171     cspan->base_len].offset;
172   int hi_delta = hitbuffer->delta[cspan->chunk_delta +
173     cspan->delta_len].offset;
174   int hi_distinct = hitbuffer->distinct[cspan->chunk_distinct +
175     cspan->distinct_len].offset;
176 
177   *lo = 0;
178 //  if (cspan->chunk_base > 0) {
179 //    *lo = minint(minint(lo_base, lo_delta), lo_distinct);
180 //  }
181   *lo = minint(minint(lo_base, lo_delta), lo_distinct);
182   *hi = minint(minint(hi_base, hi_delta), hi_distinct);
183 }
184 
185 
DiffScore(const CLD2TableSummary * obj,int indirect,uint16 lang1,uint16 lang2)186 int DiffScore(const CLD2TableSummary* obj, int indirect,
187               uint16 lang1, uint16 lang2) {
188   if (indirect < static_cast<int>(obj->kCLDTableSizeOne)) {
189     // Up to three languages at indirect
190     uint32 langprob = obj->kCLDTableInd[indirect];
191     return GetLangScore(langprob, lang1) - GetLangScore(langprob, lang2);
192   } else {
193     // Up to six languages at start + 2 * (indirect - start)
194     indirect += (indirect - obj->kCLDTableSizeOne);
195     uint32 langprob = obj->kCLDTableInd[indirect];
196     uint32 langprob2 = obj->kCLDTableInd[indirect + 1];
197     return (GetLangScore(langprob, lang1) + GetLangScore(langprob2, lang1)) -
198       (GetLangScore(langprob, lang2) + GetLangScore(langprob2, lang2));
199   }
200 
201 }
202 
203 // Score all the bases, deltas, distincts, boosts for one chunk into chunk_tote
204 // After last chunk there is always a hitbuffer entry with an offset just off
205 // the end of the text.
206 // Sets delta_len, and distinct_len
ScoreOneChunk(const char * text,ULScript ulscript,const ScoringHitBuffer * hitbuffer,int chunk_i,ScoringContext * scoringcontext,ChunkSpan * cspan,Tote * chunk_tote,ChunkSummary * chunksummary)207 void ScoreOneChunk(const char* text, ULScript ulscript,
208                    const ScoringHitBuffer* hitbuffer,
209                    int chunk_i,
210                    ScoringContext* scoringcontext,
211                    ChunkSpan* cspan, Tote* chunk_tote,
212                    ChunkSummary* chunksummary) {
213   int first_linear_in_chunk = hitbuffer->chunk_start[chunk_i];
214   int first_linear_in_next_chunk = hitbuffer->chunk_start[chunk_i + 1];
215 
216   chunk_tote->Reinit();
217   cspan->delta_len = 0;
218   cspan->distinct_len = 0;
219   if (scoringcontext->flags_cld2_verbose) {
220     fprintf(scoringcontext->debug_file, "<br>ScoreOneChunk[%d..%d) ",
221             first_linear_in_chunk, first_linear_in_next_chunk);
222   }
223 
224   // 2013.02.05 linear design: just use base and base_len for the span
225   cspan->chunk_base = first_linear_in_chunk;
226   cspan->base_len = first_linear_in_next_chunk - first_linear_in_chunk;
227   for (int i = first_linear_in_chunk; i < first_linear_in_next_chunk; ++i) {
228     uint32 langprob = hitbuffer->linear[i].langprob;
229     AddLangProb(langprob, chunk_tote);
230     if (hitbuffer->linear[i].type <= QUADHIT) {
231       chunk_tote->AddScoreCount();      // Just count quads, not octas
232     }
233     if (hitbuffer->linear[i].type == DISTINCTHIT) {
234       AddDistinctBoost2(langprob, scoringcontext);
235     }
236   }
237 
238   // Score language prior boosts
239   // Score distinct word boost
240   ScoreBoosts(scoringcontext, chunk_tote);
241 
242   int lo = hitbuffer->linear[first_linear_in_chunk].offset;
243   int hi = hitbuffer->linear[first_linear_in_next_chunk].offset;
244 
245   // Chunk_tote: get top langs, scores, etc. and fill in chunk summary
246   SetChunkSummary(ulscript, first_linear_in_chunk, lo, hi - lo,
247                   scoringcontext, chunk_tote, chunksummary);
248 
249   bool more_to_come = false;
250   bool score_cjk = false;
251   if (scoringcontext->flags_cld2_html) {
252     // Show one chunk in readable output
253     CLD2_Debug(text, lo, hi, more_to_come, score_cjk, hitbuffer,
254                scoringcontext, cspan, chunksummary);
255   }
256 
257   scoringcontext->prior_chunk_lang = static_cast<Language>(chunksummary->lang1);
258 }
259 
260 
261 // Score chunks of text described by hitbuffer, allowing each to be in a
262 // different language, and optionally adjusting the boundaries inbetween.
263 // Set last_cspan to the last chunkspan used
ScoreAllHits(const char * text,ULScript ulscript,bool more_to_come,bool score_cjk,const ScoringHitBuffer * hitbuffer,ScoringContext * scoringcontext,SummaryBuffer * summarybuffer,ChunkSpan * last_cspan)264 void ScoreAllHits(const char* text,  ULScript ulscript,
265                   bool more_to_come, bool score_cjk,
266                   const ScoringHitBuffer* hitbuffer,
267                   ScoringContext* scoringcontext,
268                   SummaryBuffer* summarybuffer, ChunkSpan* last_cspan) {
269   ChunkSpan prior_cspan = {0, 0, 0, 0, 0, 0};
270   ChunkSpan cspan = {0, 0, 0, 0, 0, 0};
271 
272   for (int i = 0; i < hitbuffer->next_chunk_start; ++i) {
273     // Score one chunk
274     // Sets delta_len, and distinct_len
275     Tote chunk_tote;
276     ChunkSummary chunksummary;
277     ScoreOneChunk(text, ulscript,
278                   hitbuffer, i,
279                   scoringcontext, &cspan, &chunk_tote, &chunksummary);
280 
281     // Put result in summarybuffer
282     if (summarybuffer->n < kMaxSummaries) {
283       summarybuffer->chunksummary[summarybuffer->n] = chunksummary;
284       summarybuffer->n += 1;
285     }
286 
287     prior_cspan = cspan;
288     cspan.chunk_base += cspan.base_len;
289     cspan.chunk_delta += cspan.delta_len;
290     cspan.chunk_distinct += cspan.distinct_len;
291   }
292 
293   // Add one dummy off the end to hold first unused linear_in_chunk
294   int linear_off_end = hitbuffer->next_linear;
295   int offset_off_end = hitbuffer->linear[linear_off_end].offset;
296   ChunkSummary* cs = &summarybuffer->chunksummary[summarybuffer->n];
297   memset(cs, 0, sizeof(ChunkSummary));
298   cs->offset = offset_off_end;
299   cs->chunk_start = linear_off_end;
300   *last_cspan = prior_cspan;
301 }
302 
303 
SummaryBufferToDocTote(const SummaryBuffer * summarybuffer,bool more_to_come,DocTote * doc_tote)304 void SummaryBufferToDocTote(const SummaryBuffer* summarybuffer,
305                             bool more_to_come, DocTote* doc_tote) {
306   int cs_bytes_sum = 0;
307   for (int i = 0; i < summarybuffer->n; ++i) {
308     const ChunkSummary* cs = &summarybuffer->chunksummary[i];
309     int reliability = minint(cs->reliability_delta, cs->reliability_score);
310     // doc_tote uses full languages
311     doc_tote->Add(cs->lang1, cs->bytes, cs->score1, reliability);
312     cs_bytes_sum += cs->bytes;
313   }
314 }
315 
// Set to true to print the letters-only text and the mapped-back original
// text to stderr while result vectors are being built (debugging only).
static const bool kShowLettersOriginal = false;
318 
319 
320 // If next chunk language matches last vector language, extend last element
321 // Otherwise add new element to vector
ItemToVector(ScriptScanner * scanner,ResultChunkVector * vec,Language new_lang,int mapped_offset,int mapped_len)322 void ItemToVector(ScriptScanner* scanner,
323                   ResultChunkVector* vec, Language new_lang,
324                   int mapped_offset, int mapped_len) {
325   uint16 last_vec_lang = static_cast<uint16>(UNKNOWN_LANGUAGE);
326   int last_vec_subscr = vec->size() - 1;
327   if (last_vec_subscr >= 0) {
328     ResultChunk* priorrc = &(*vec)[last_vec_subscr];
329     last_vec_lang = priorrc->lang1;
330     if (new_lang == last_vec_lang) {
331       // Extend prior. Current mapped_offset may be beyond prior end, so do
332       // the arithmetic to include any such gap
333       priorrc->bytes = minint((mapped_offset + mapped_len) - priorrc->offset,
334                               kMaxResultChunkBytes);
335       if (kShowLettersOriginal) {
336         // Optionally print the new chunk original text
337         string temp2(&scanner->GetBufferStart()[priorrc->offset],
338                      priorrc->bytes);
339         fprintf(stderr, "Item[%d..%d) '%s'<br>\n",
340                 priorrc->offset, priorrc->offset + priorrc->bytes,
341                 GetHtmlEscapedText(temp2).c_str());
342       }
343       return;
344     }
345   }
346   // Add new vector element
347   ResultChunk rc;
348   rc.offset = mapped_offset;
349   rc.bytes = minint(mapped_len, kMaxResultChunkBytes);
350   rc.lang1 = static_cast<uint16>(new_lang);
351   vec->push_back(rc);
352   if (kShowLettersOriginal) {
353     // Optionally print the new chunk original text
354     string temp2(&scanner->GetBufferStart()[rc.offset], rc.bytes);
355     fprintf(stderr, "Item[%d..%d) '%s'<br>\n",
356             rc.offset, rc.offset + rc.bytes,
357             GetHtmlEscapedText(temp2).c_str());
358   }
359 }
360 
PriorVecLang(const ResultChunkVector * vec)361 uint16 PriorVecLang(const ResultChunkVector* vec) {
362   if (vec->empty()) {return static_cast<uint16>(UNKNOWN_LANGUAGE);}
363   return (*vec)[vec->size() - 1].lang1;
364 }
365 
NextChunkLang(const SummaryBuffer * summarybuffer,int i)366 uint16 NextChunkLang(const SummaryBuffer* summarybuffer, int i) {
367   if ((i + 1) >= summarybuffer->n) {
368     return static_cast<uint16>(UNKNOWN_LANGUAGE);
369   }
370   return summarybuffer->chunksummary[i + 1].lang1;
371 }
372 
373 
374 
375 // Add n elements of summarybuffer to resultchunk vector:
376 // Each element is letters-only text [offset..offset+bytes)
377 // This maps back to original[Back(offset)..Back(offset+bytes))
378 //
379 // We go out of our way to minimize the variation in the ResultChunkVector,
380 // so that the caller has fewer but more meaningful spans in different
381 // lanaguges, for the likely purpose of translation or spell-check.
382 //
383 // The language of each chunk is lang1, but it might be unreliable for
384 // either of two reasons: its score is relatively too close to the score of
385 // lang2, or its score is too far away from the expected score of real text in
386 // the given language. Unreliable languages are mapped to Unknown.
387 //
SummaryBufferToVector(ScriptScanner * scanner,const char * text,const SummaryBuffer * summarybuffer,bool more_to_come,ResultChunkVector * vec)388 void SummaryBufferToVector(ScriptScanner* scanner, const char* text,
389                            const SummaryBuffer* summarybuffer,
390                            bool more_to_come, ResultChunkVector* vec) {
391   if (vec == NULL) {return;}
392 
393   if (kShowLettersOriginal) {
394     fprintf(stderr, "map2original_ ");
395     scanner->map2original_.DumpWindow();
396     fprintf(stderr, "<br>\n");
397     fprintf(stderr, "map2uplow_ ");
398     scanner->map2uplow_.DumpWindow();
399     fprintf(stderr, "<br>\n");
400   }
401 
402   for (int i = 0; i < summarybuffer->n; ++i) {
403     const ChunkSummary* cs = &summarybuffer->chunksummary[i];
404     int unmapped_offset = cs->offset;
405     int unmapped_len = cs->bytes;
406 
407     if (kShowLettersOriginal) {
408       // Optionally print the chunk lowercase letters/marks text
409       string temp(&text[unmapped_offset], unmapped_len);
410       fprintf(stderr, "Letters [%d..%d) '%s'<br>\n",
411               unmapped_offset, unmapped_offset + unmapped_len,
412               GetHtmlEscapedText(temp).c_str());
413     }
414 
415     int mapped_offset = scanner->MapBack(unmapped_offset);
416 
417     // Trim back a little to prefer splicing original at word boundaries
418     if (mapped_offset > 0) {
419       // Size of prior vector entry, if any
420       int prior_size = 0;
421       if (!vec->empty()) {
422         ResultChunk* rc = &(*vec)[vec->size() - 1];
423         prior_size = rc->bytes;
424       }
425       // Maximum back up size to leave at least 3 bytes in prior,
426       // and not entire buffer, and no more than 12 bytes total backup
427       int n_limit = minint(prior_size - 3, mapped_offset);
428       n_limit = minint(n_limit, 12);
429 
430       // Backscan over letters, stopping if prior byte is < 0x41
431       // There is some possibility that we will backscan over a different script
432       const char* s = &scanner->GetBufferStart()[mapped_offset];
433       const unsigned char* us = reinterpret_cast<const unsigned char*>(s);
434       int n = 0;
435       while ((n < n_limit) && (us[-n - 1] >= 0x41)) {++n;}
436       if (n >= n_limit) {n = 0;} // New boundary not found within range
437 
438       // Also back up exactly one leading punctuation character if '"#@
439       if (n < n_limit) {
440         unsigned char c = us[-n - 1];
441         if ((c == '\'') || (c == '"') || (c == '#') || (c == '@')) {++n;}
442       }
443       // Shrink the previous chunk slightly
444       if (n > 0) {
445         ResultChunk* rc = &(*vec)[vec->size() - 1];
446         rc->bytes -= n;
447         mapped_offset -= n;
448         if (kShowLettersOriginal) {
449           fprintf(stderr, "Back up %d bytes<br>\n", n);
450           // Optionally print the prior chunk original text
451           string temp2(&scanner->GetBufferStart()[rc->offset], rc->bytes);
452           fprintf(stderr, "Prior   [%d..%d) '%s'<br>\n",
453                   rc->offset, rc->offset + rc->bytes,
454                   GetHtmlEscapedText(temp2).c_str());
455         }
456       }
457     }
458 
459     int mapped_len =
460       scanner->MapBack(unmapped_offset + unmapped_len) - mapped_offset;
461 
462     if (kShowLettersOriginal) {
463       // Optionally print the chunk original text
464       string temp2(&scanner->GetBufferStart()[mapped_offset], mapped_len);
465       fprintf(stderr, "Original[%d..%d) '%s'<br>\n",
466               mapped_offset, mapped_offset + mapped_len,
467               GetHtmlEscapedText(temp2).c_str());
468     }
469 
470     Language new_lang = static_cast<Language>(cs->lang1);
471     bool reliability_delta_bad =
472       (cs->reliability_delta < kUnreliablePercentThreshold);
473     bool reliability_score_bad =
474       (cs->reliability_score < kUnreliablePercentThreshold);
475 
476     // If the top language matches last vector, ignore reliability_delta
477     uint16 prior_lang = PriorVecLang(vec);
478     if (prior_lang == cs->lang1) {
479       reliability_delta_bad = false;
480     }
481     // If the top language is in same close set as last vector, set up to merge
482     if (SameCloseSet(cs->lang1, prior_lang)) {
483       new_lang = static_cast<Language>(prior_lang);
484       reliability_delta_bad = false;
485     }
486     // If the top two languages are in the same close set and the last vector
487     // language is the second language, set up to merge
488     if (SameCloseSet(cs->lang1, cs->lang2) &&
489         (prior_lang == cs->lang2)) {
490       new_lang = static_cast<Language>(prior_lang);
491       reliability_delta_bad = false;
492     }
493     // If unreliable and the last and next vector languages are both
494     // the second language, set up to merge
495     uint16 next_lang = NextChunkLang(summarybuffer, i);
496     if (reliability_delta_bad &&
497         (prior_lang == cs->lang2) && (next_lang == cs->lang2)) {
498       new_lang = static_cast<Language>(prior_lang);
499       reliability_delta_bad = false;
500     }
501 
502     if (reliability_delta_bad || reliability_score_bad) {
503       new_lang = UNKNOWN_LANGUAGE;
504     }
505     ItemToVector(scanner, vec, new_lang, mapped_offset, mapped_len);
506   }
507 }
508 
509 // Add just one element to resultchunk vector:
510 // For RTypeNone or RTypeOne
JustOneItemToVector(ScriptScanner * scanner,const char * text,Language lang1,int unmapped_offset,int unmapped_len,ResultChunkVector * vec)511 void JustOneItemToVector(ScriptScanner* scanner, const char* text,
512                          Language lang1, int unmapped_offset, int unmapped_len,
513                          ResultChunkVector* vec) {
514   if (vec == NULL) {return;}
515 
516   if (kShowLettersOriginal) {
517     fprintf(stderr, "map2original_ ");
518     scanner->map2original_.DumpWindow();
519     fprintf(stderr, "<br>\n");
520     fprintf(stderr, "map2uplow_ ");
521     scanner->map2uplow_.DumpWindow();
522     fprintf(stderr, "<br>\n");
523   }
524 
525   if (kShowLettersOriginal) {
526    // Optionally print the chunk lowercase letters/marks text
527    string temp(&text[unmapped_offset], unmapped_len);
528    fprintf(stderr, "Letters1 [%d..%d) '%s'<br>\n",
529            unmapped_offset, unmapped_offset + unmapped_len,
530            GetHtmlEscapedText(temp).c_str());
531   }
532 
533   int mapped_offset = scanner->MapBack(unmapped_offset);
534   int mapped_len =
535     scanner->MapBack(unmapped_offset + unmapped_len) - mapped_offset;
536 
537   if (kShowLettersOriginal) {
538     // Optionally print the chunk original text
539     string temp2(&scanner->GetBufferStart()[mapped_offset], mapped_len);
540     fprintf(stderr, "Original1[%d..%d) '%s'<br>\n",
541             mapped_offset, mapped_offset + mapped_len,
542             GetHtmlEscapedText(temp2).c_str());
543   }
544 
545   ItemToVector(scanner, vec, lang1, mapped_offset, mapped_len);
546 }
547 
548 
// Debugging helper used by the dump routines in this file to render a
// short piece of text for printing.
// Not thread safe. Defined in getonescriptspan.
char* DisplayPiece(const char* next_byte_, int byte_length_);
551 
// Make an indirect subscript easier to read in debug output.
// If the high bit (0x80000000) is on, take it out and add 2,000,000,000,
// so that table2 entries are easy to spot. Other values are returned as-is.
inline int PrintableIndirect(int x) {
  const unsigned int kHighBit = 0x80000000u;
  if ((x & kHighBit) == 0) {
    return x;
  }
  return (x & ~kHighBit) + 2000000000;
}
DumpHitBuffer(FILE * df,const char * text,const ScoringHitBuffer * hitbuffer)559 void DumpHitBuffer(FILE* df, const char* text,
560                    const ScoringHitBuffer* hitbuffer) {
561   fprintf(df,
562           "<br>DumpHitBuffer[%s, next_base/delta/distinct %d, %d, %d)<br>\n",
563           ULScriptCode(hitbuffer->ulscript),
564           hitbuffer->next_base, hitbuffer->next_delta,
565           hitbuffer->next_distinct);
566   for (int i = 0; i < hitbuffer->maxscoringhits; ++i) {
567     if (i < hitbuffer->next_base) {
568       fprintf(df, "Q[%d]%d,%d,%s ",
569               i, hitbuffer->base[i].offset,
570               PrintableIndirect(hitbuffer->base[i].indirect),
571               DisplayPiece(&text[hitbuffer->base[i].offset], 6));
572     }
573     if (i < hitbuffer->next_delta) {
574       fprintf(df, "DL[%d]%d,%d,%s ",
575               i, hitbuffer->delta[i].offset, hitbuffer->delta[i].indirect,
576               DisplayPiece(&text[hitbuffer->delta[i].offset], 12));
577     }
578     if (i < hitbuffer->next_distinct) {
579       fprintf(df, "D[%d]%d,%d,%s ",
580               i, hitbuffer->distinct[i].offset, hitbuffer->distinct[i].indirect,
581               DisplayPiece(&text[hitbuffer->distinct[i].offset], 12));
582     }
583     if (i < hitbuffer->next_base) {
584       fprintf(df, "<br>\n");
585     }
586     if (i > 50) {break;}
587   }
588   if (hitbuffer->next_base > 50) {
589     int i = hitbuffer->next_base;
590     fprintf(df, "Q[%d]%d,%d,%s ",
591             i, hitbuffer->base[i].offset,
592             PrintableIndirect(hitbuffer->base[i].indirect),
593             DisplayPiece(&text[hitbuffer->base[i].offset], 6));
594   }
595   if (hitbuffer->next_delta > 50) {
596     int i = hitbuffer->next_delta;
597     fprintf(df, "DL[%d]%d,%d,%s ",
598             i, hitbuffer->delta[i].offset, hitbuffer->delta[i].indirect,
599             DisplayPiece(&text[hitbuffer->delta[i].offset], 12));
600   }
601   if (hitbuffer->next_distinct > 50) {
602     int i = hitbuffer->next_distinct;
603     fprintf(df, "D[%d]%d,%d,%s ",
604             i, hitbuffer->distinct[i].offset, hitbuffer->distinct[i].indirect,
605             DisplayPiece(&text[hitbuffer->distinct[i].offset], 12));
606   }
607   fprintf(df, "<br>\n");
608 }
609 
610 
DumpLinearBuffer(FILE * df,const char * text,const ScoringHitBuffer * hitbuffer)611 void DumpLinearBuffer(FILE* df, const char* text,
612                       const ScoringHitBuffer* hitbuffer) {
613   fprintf(df, "<br>DumpLinearBuffer[%d)<br>\n",
614           hitbuffer->next_linear);
615   // Include the dummy entry off the end
616   for (int i = 0; i < hitbuffer->next_linear + 1; ++i) {
617     if ((50 < i) && (i < (hitbuffer->next_linear - 1))) {continue;}
618     fprintf(df, "[%d]%d,%c=%08x,%s<br>\n",
619             i, hitbuffer->linear[i].offset,
620             "UQLD"[hitbuffer->linear[i].type],
621             hitbuffer->linear[i].langprob,
622             DisplayPiece(&text[hitbuffer->linear[i].offset], 6));
623   }
624   fprintf(df, "<br>\n");
625 
626   fprintf(df, "DumpChunkStart[%d]<br>\n", hitbuffer->next_chunk_start);
627   for (int i = 0; i < hitbuffer->next_chunk_start + 1; ++i) {
628     fprintf(df, "[%d]%d\n", i, hitbuffer->chunk_start[i]);
629   }
630   fprintf(df, "<br>\n");
631 }
632 
633 // Move this verbose debugging output to debug.cc eventually
DumpChunkSummary(FILE * df,const ChunkSummary * cs)634 void DumpChunkSummary(FILE* df, const ChunkSummary* cs) {
635   // Print chunksummary
636   fprintf(df, "%d lin[%d] %s.%d %s.%d %dB %d# %s %dRd %dRs<br>\n",
637           cs->offset,
638           cs->chunk_start,
639           LanguageCode(static_cast<Language>(cs->lang1)),
640           cs->score1,
641           LanguageCode(static_cast<Language>(cs->lang2)),
642           cs->score2,
643           cs->bytes,
644           cs->grams,
645           ULScriptCode(static_cast<ULScript>(cs->ulscript)),
646           cs->reliability_delta,
647           cs->reliability_score);
648 }
649 
DumpSummaryBuffer(FILE * df,const SummaryBuffer * summarybuffer)650 void DumpSummaryBuffer(FILE* df, const SummaryBuffer* summarybuffer) {
651   fprintf(df, "<br>DumpSummaryBuffer[%d]<br>\n", summarybuffer->n);
652   fprintf(df, "[i] offset linear[chunk_start] lang.score1 lang.score2 "
653               "bytesB ngrams# script rel_delta rel_score<br>\n");
654   for (int i = 0; i <= summarybuffer->n; ++i) {
655     fprintf(df, "[%d] ", i);
656     DumpChunkSummary(df, &summarybuffer->chunksummary[i]);
657   }
658   fprintf(df, "<br>\n");
659 }
660 
661 
662 
// Within hitbuffer->linear[]
664 // <-- prior chunk --><-- this chunk -->
665 // |                  |                 |
666 // linear0            linear1           linear2
667 //     lang0              lang1
668 // The goal of sharpening is to move this_linear to better separate langs
BetterBoundary(const char * text,ScoringHitBuffer * hitbuffer,ScoringContext * scoringcontext,uint16 pslang0,uint16 pslang1,int linear0,int linear1,int linear2)669 int BetterBoundary(const char* text,
670                    ScoringHitBuffer* hitbuffer,
671                    ScoringContext* scoringcontext,
672                    uint16 pslang0, uint16 pslang1,
673                    int linear0, int linear1, int linear2) {
674   // Degenerate case, no change
675   if ((linear2 - linear0) <= 8) {return linear1;}
676 
677   // Each diff gives pslang0 score - pslang1 score
678   // Running diff has four entries + + + + followed by four entries - - - -
679   // so that this value is maximal at the sharpest boundary between pslang0
680   // (positive diffs) and pslang1 (negative diffs)
681   int running_diff = 0;
682   int diff[8];    // Ring buffer of pslang0-pslang1 differences
683   // Initialize with first 8 diffs
684   for (int i = linear0; i < linear0 + 8; ++i) {
685     int j = i & 7;
686     uint32 langprob = hitbuffer->linear[i].langprob;
687     diff[j] = GetLangScore(langprob, pslang0) -
688        GetLangScore(langprob, pslang1);
689     if (i < linear0 + 4) {
690       // First four diffs pslang0 - pslang1
691       running_diff += diff[j];
692     } else {
693       // Second four diffs -(pslang0 - pslang1)
694       running_diff -= diff[j];
695     }
696   }
697 
698   // Now scan for sharpest boundary. j is at left end of 8 entries
699   // To be a boundary, there must be both >0 and <0 entries in the window
700   int better_boundary_value = 0;
701   int better_boundary = linear1;
702   for (int i = linear0; i < linear2 - 8; ++i) {
703     int j = i & 7;
704     if (better_boundary_value < running_diff) {
705       bool has_plus = false;
706       bool has_minus = false;
707       for (int kk = 0; kk < 8; ++kk) {
708         if (diff[kk] > 0) {has_plus = true;}
709         if (diff[kk] < 0) {has_minus = true;}
710       }
711       if (has_plus && has_minus) {
712         better_boundary_value = running_diff;
713         better_boundary = i + 4;
714       }
715     }
716     // Shift right one entry
717     uint32 langprob = hitbuffer->linear[i + 8].langprob;
718     int newdiff = GetLangScore(langprob, pslang0) -
719        GetLangScore(langprob, pslang1);
720     int middiff = diff[(i + 4) & 7];
721     int olddiff = diff[j];
722     diff[j] = newdiff;
723     running_diff -= olddiff;                  // Remove left
724     running_diff += 2 * middiff;              // Convert middle from - to +
725     running_diff -= newdiff;                  // Insert right
726   }
727 
728   if (scoringcontext->flags_cld2_verbose && (linear1 != better_boundary)) {
729     Language lang0 = FromPerScriptNumber(scoringcontext->ulscript, pslang0);
730     Language lang1 = FromPerScriptNumber(scoringcontext->ulscript, pslang1);
731     fprintf(scoringcontext->debug_file, " Better lin[%d=>%d] %s^^%s <br>\n",
732             linear1, better_boundary,
733             LanguageCode(lang0), LanguageCode(lang1));
734     int lin0_off = hitbuffer->linear[linear0].offset;
735     int lin1_off = hitbuffer->linear[linear1].offset;
736     int lin2_off = hitbuffer->linear[linear2].offset;
737     int better_offm1 = hitbuffer->linear[better_boundary - 1].offset;
738     int better_off = hitbuffer->linear[better_boundary].offset;
739     int better_offp1 = hitbuffer->linear[better_boundary + 1].offset;
740     string old0(&text[lin0_off], lin1_off - lin0_off);
741     string old1(&text[lin1_off], lin2_off - lin1_off);
742     string new0(&text[lin0_off], better_offm1 - lin0_off);
743     string new0m1(&text[better_offm1], better_off - better_offm1);
744     string new1(&text[better_off], better_offp1 - better_off);
745     string new1p1(&text[better_offp1], lin2_off - better_offp1);
746     fprintf(scoringcontext->debug_file, "%s^^%s => <br>\n%s^%s^^%s^%s<br>\n",
747             GetHtmlEscapedText(old0).c_str(),
748             GetHtmlEscapedText(old1).c_str(),
749             GetHtmlEscapedText(new0).c_str(),
750             GetHtmlEscapedText(new0m1).c_str(),
751             GetHtmlEscapedText(new1).c_str(),
752             GetHtmlEscapedText(new1p1).c_str());
753     // Slow picture of differences per linear entry
754     int d;
755     for (int i = linear0; i < linear2; ++i) {
756       if (i == better_boundary) {
757         fprintf(scoringcontext->debug_file, "^^ ");
758       }
759       uint32 langprob = hitbuffer->linear[i].langprob;
760       d = GetLangScore(langprob, pslang0) - GetLangScore(langprob, pslang1);
761       const char* s = "=";
762       //if (d > 2) {s = "\xc2\xaf";}    // Macron
763       if (d > 2) {s = "#";}
764       else if (d > 0) {s = "+";}
765       else if (d < -2) {s = "_";}
766       else if (d < 0) {s = "-";}
767       fprintf(scoringcontext->debug_file, "%s ", s);
768     }
769     fprintf(scoringcontext->debug_file, " &nbsp;&nbsp;(scale: #+=-_)<br>\n");
770   }
771   return better_boundary;
772 }
773 
774 
775 // For all but the first summary, if its top language differs from
776 // the previous chunk, refine the boundary
777 // Linearized version
SharpenBoundaries(const char * text,bool more_to_come,ScoringHitBuffer * hitbuffer,ScoringContext * scoringcontext,SummaryBuffer * summarybuffer)778 void SharpenBoundaries(const char* text,
779                        bool more_to_come,
780                        ScoringHitBuffer* hitbuffer,
781                        ScoringContext* scoringcontext,
782                        SummaryBuffer* summarybuffer) {
783 
784   int prior_linear = summarybuffer->chunksummary[0].chunk_start;
785   uint16 prior_lang = summarybuffer->chunksummary[0].lang1;
786 
787   if (scoringcontext->flags_cld2_verbose) {
788     fprintf(scoringcontext->debug_file, "<br>SharpenBoundaries<br>\n");
789   }
790   for (int i = 1; i < summarybuffer->n; ++i) {
791     ChunkSummary* cs = &summarybuffer->chunksummary[i];
792     uint16 this_lang = cs->lang1;
793     if (this_lang == prior_lang) {
794       prior_linear = cs->chunk_start;
795       continue;
796     }
797 
798     int this_linear = cs->chunk_start;
799     int next_linear = summarybuffer->chunksummary[i + 1].chunk_start;
800 
801     // If this/prior in same close set, don't move boundary
802     if (SameCloseSet(prior_lang, this_lang)) {
803       prior_linear = this_linear;
804       prior_lang = this_lang;
805       continue;
806     }
807 
808 
809     // Within hitbuffer->linear[]
810     // <-- prior chunk --><-- this chunk -->
811     // |                  |                 |
812     // prior_linear       this_linear       next_linear
813     //     prior_lang         this_lang
814     // The goal of sharpening is to move this_linear to better separate langs
815 
816     uint8 pslang0 = PerScriptNumber(scoringcontext->ulscript,
817                                     static_cast<Language>(prior_lang));
818     uint8 pslang1 = PerScriptNumber(scoringcontext->ulscript,
819                                     static_cast<Language>(this_lang));
820     int better_linear = BetterBoundary(text,
821                                        hitbuffer,
822                                        scoringcontext,
823                                        pslang0, pslang1,
824                                        prior_linear, this_linear, next_linear);
825 
826     int old_offset = hitbuffer->linear[this_linear].offset;
827     int new_offset = hitbuffer->linear[better_linear].offset;
828     cs->chunk_start = better_linear;
829     cs->offset = new_offset;
830     // If this_linear moved right, make bytes smaller for this, larger for prior
831     // If this_linear moved left, make bytes larger for this, smaller for prior
832     cs->bytes -= (new_offset - old_offset);
833     summarybuffer->chunksummary[i - 1].bytes += (new_offset - old_offset);
834 
835     this_linear = better_linear;    // Update so that next chunk doesn't intrude
836 
837     // Consider rescoring the two chunks
838 
839     // Update for next round (note: using pre-updated boundary)
840     prior_linear = this_linear;
841     prior_lang = this_lang;
842   }
843 }
844 
845 // Make a langprob that gives small weight to the default language for ulscript
DefaultLangProb(ULScript ulscript)846 uint32 DefaultLangProb(ULScript ulscript) {
847   Language default_lang = DefaultLanguage(ulscript);
848   return MakeLangProb(default_lang, 1);
849 }
850 
851 // Effectively, do a merge-sort based on text offsets
852 // Look up each indirect value in appropriate scoring table and keep
853 // just the resulting langprobs
LinearizeAll(ScoringContext * scoringcontext,bool score_cjk,ScoringHitBuffer * hitbuffer)854 void LinearizeAll(ScoringContext* scoringcontext, bool score_cjk,
855                   ScoringHitBuffer* hitbuffer) {
856   const CLD2TableSummary* base_obj;       // unigram or quadgram
857   const CLD2TableSummary* base_obj2;      // quadgram dual table
858   const CLD2TableSummary* delta_obj;      // bigram or octagram
859   const CLD2TableSummary* distinct_obj;   // bigram or octagram
860   uint16 base_hit;
861   if (score_cjk) {
862     base_obj = scoringcontext->scoringtables->unigram_compat_obj;
863     base_obj2 = scoringcontext->scoringtables->unigram_compat_obj;
864     delta_obj = scoringcontext->scoringtables->deltabi_obj;
865     distinct_obj = scoringcontext->scoringtables->distinctbi_obj;
866     base_hit = UNIHIT;
867   } else {
868     base_obj = scoringcontext->scoringtables->quadgram_obj;
869     base_obj2 = scoringcontext->scoringtables->quadgram_obj2;
870     delta_obj = scoringcontext->scoringtables->deltaocta_obj;
871     distinct_obj = scoringcontext->scoringtables->distinctocta_obj;
872     base_hit = QUADHIT;
873   }
874 
875   int base_limit = hitbuffer->next_base;
876   int delta_limit = hitbuffer->next_delta;
877   int distinct_limit = hitbuffer->next_distinct;
878   int base_i = 0;
879   int delta_i = 0;
880   int distinct_i = 0;
881   int linear_i = 0;
882 
883   // Start with an initial base hit for the default language for this script
884   // Inserting this avoids edge effects with no hits at all
885   hitbuffer->linear[linear_i].offset = hitbuffer->lowest_offset;
886   hitbuffer->linear[linear_i].type = base_hit;
887   hitbuffer->linear[linear_i].langprob =
888     DefaultLangProb(scoringcontext->ulscript);
889   ++linear_i;
890 
891   while ((base_i < base_limit) || (delta_i < delta_limit) ||
892          (distinct_i < distinct_limit)) {
893     int base_off = hitbuffer->base[base_i].offset;
894     int delta_off = hitbuffer->delta[delta_i].offset;
895     int distinct_off = hitbuffer->distinct[distinct_i].offset;
896 
897     // Do delta and distinct first, so that they are not lost at base_limit
898     if ((delta_i < delta_limit) &&
899         (delta_off <= base_off) && (delta_off <= distinct_off)) {
900       // Add delta entry
901       int indirect = hitbuffer->delta[delta_i].indirect;
902       ++delta_i;
903       uint32 langprob = delta_obj->kCLDTableInd[indirect];
904       if (langprob > 0) {
905         hitbuffer->linear[linear_i].offset = delta_off;
906         hitbuffer->linear[linear_i].type = DELTAHIT;
907         hitbuffer->linear[linear_i].langprob = langprob;
908         ++linear_i;
909       }
910     }
911     else if ((distinct_i < distinct_limit) &&
912              (distinct_off <= base_off) && (distinct_off <= delta_off)) {
913       // Add distinct entry
914       int indirect = hitbuffer->distinct[distinct_i].indirect;
915       ++distinct_i;
916       uint32 langprob = distinct_obj->kCLDTableInd[indirect];
917       if (langprob > 0) {
918         hitbuffer->linear[linear_i].offset = distinct_off;
919         hitbuffer->linear[linear_i].type = DISTINCTHIT;
920         hitbuffer->linear[linear_i].langprob = langprob;
921         ++linear_i;
922       }
923     }
924     else {
925       // Add one or two base entries
926       int indirect = hitbuffer->base[base_i].indirect;
927       // First, get right scoring table
928       const CLD2TableSummary* local_base_obj = base_obj;
929       if ((indirect & 0x80000000u) != 0) {
930         local_base_obj = base_obj2;
931         indirect &= ~0x80000000u;
932       }
933       ++base_i;
934       // One langprob in kQuadInd[0..SingleSize),
935       // two in kQuadInd[SingleSize..Size)
936       if (indirect < static_cast<int>(local_base_obj->kCLDTableSizeOne)) {
937         // Up to three languages at indirect
938         uint32 langprob = local_base_obj->kCLDTableInd[indirect];
939         if (langprob > 0) {
940           hitbuffer->linear[linear_i].offset = base_off;
941           hitbuffer->linear[linear_i].type = base_hit;
942           hitbuffer->linear[linear_i].langprob = langprob;
943           ++linear_i;
944         }
945       } else {
946         // Up to six languages at start + 2 * (indirect - start)
947         indirect += (indirect - local_base_obj->kCLDTableSizeOne);
948         uint32 langprob = local_base_obj->kCLDTableInd[indirect];
949         uint32 langprob2 = local_base_obj->kCLDTableInd[indirect + 1];
950         if (langprob > 0) {
951           hitbuffer->linear[linear_i].offset = base_off;
952           hitbuffer->linear[linear_i].type = base_hit;
953           hitbuffer->linear[linear_i].langprob = langprob;
954           ++linear_i;
955         }
956         if (langprob2 > 0) {
957           hitbuffer->linear[linear_i].offset = base_off;
958           hitbuffer->linear[linear_i].type = base_hit;
959           hitbuffer->linear[linear_i].langprob = langprob2;
960           ++linear_i;
961         }
962       }
963     }
964   }
965 
966   // Update
967   hitbuffer->next_linear = linear_i;
968 
969   // Add a dummy entry off the end, just to capture final offset
970   hitbuffer->linear[linear_i].offset =
971   hitbuffer->base[hitbuffer->next_base].offset;
972   hitbuffer->linear[linear_i].langprob = 0;
973 }
974 
975 // Break linear array into chunks of ~20 quadgram hits or ~50 CJK unigram hits
ChunkAll(int letter_offset,bool score_cjk,ScoringHitBuffer * hitbuffer)976 void ChunkAll(int letter_offset, bool score_cjk, ScoringHitBuffer* hitbuffer) {
977   int chunksize;
978   uint16 base_hit;
979   if (score_cjk) {
980     chunksize = kChunksizeUnis;
981     base_hit = UNIHIT;
982   } else {
983     chunksize = kChunksizeQuads;
984     base_hit = QUADHIT;
985   }
986 
987   int linear_i = 0;
988   int linear_off_end = hitbuffer->next_linear;
989   int text_i = letter_offset;               // Next unseen text offset
990   int next_chunk_start = 0;
991   int bases_left = hitbuffer->next_base;
992   while (bases_left > 0) {
993     // Linearize one chunk
994     int base_len = chunksize;     // Default; may be changed below
995     if (bases_left < (chunksize + (chunksize >> 1))) {
996       // If within 1.5 chunks of the end, avoid runts by using it all
997       base_len = bases_left;
998     } else if (bases_left < (2 * chunksize)) {
999       // Avoid runts by splitting 1.5 to 2 chunks in half (about 3/4 each)
1000       base_len = (bases_left + 1) >> 1;
1001     }
1002 
1003     hitbuffer->chunk_start[next_chunk_start] = linear_i;
1004     hitbuffer->chunk_offset[next_chunk_start] = text_i;
1005     ++next_chunk_start;
1006 
1007     int base_count = 0;
1008     while ((base_count < base_len) && (linear_i < linear_off_end)) {
1009       if (hitbuffer->linear[linear_i].type == base_hit) {++base_count;}
1010       ++linear_i;
1011     }
1012     text_i = hitbuffer->linear[linear_i].offset;    // Next unseen text offset
1013     bases_left -= base_len;
1014   }
1015 
1016   // If no base hits at all, make a single dummy chunk
1017   if (next_chunk_start == 0) {
1018      hitbuffer->chunk_start[next_chunk_start] = 0;
1019      hitbuffer->chunk_offset[next_chunk_start] = hitbuffer->linear[0].offset;
1020      ++next_chunk_start;
1021   }
1022 
1023   // Remember the linear array start of dummy entry
1024   hitbuffer->next_chunk_start = next_chunk_start;
1025 
1026   // Add a dummy entry off the end, just to capture final linear subscr
1027   hitbuffer->chunk_start[next_chunk_start] = hitbuffer->next_linear;
1028   hitbuffer->chunk_offset[next_chunk_start] = text_i;
1029 }
1030 
1031 
1032 // Merge-sort the individual hit arrays, go indirect on the scoring subscripts,
1033 // break linear array into chunks.
1034 //
1035 // Input:
1036 //  hitbuffer base, delta, distinct arrays
1037 // Output:
1038 //  linear array
1039 //  chunk_start array
1040 //
LinearizeHitBuffer(int letter_offset,ScoringContext * scoringcontext,bool more_to_come,bool score_cjk,ScoringHitBuffer * hitbuffer)1041 void LinearizeHitBuffer(int letter_offset,
1042                         ScoringContext* scoringcontext,
1043                         bool more_to_come, bool score_cjk,
1044                         ScoringHitBuffer* hitbuffer) {
1045   LinearizeAll(scoringcontext, score_cjk, hitbuffer);
1046   ChunkAll(letter_offset, score_cjk, hitbuffer);
1047 }
1048 
1049 
1050 
1051 // The hitbuffer is in an awkward form -- three sets of base/delta/distinct
1052 // scores, each with an indirect subscript to one of six scoring tables, some
1053 // of which can yield two langprobs for six languages, others one langprob for
1054 // three languages. The only correlation between base/delta/distinct is their
1055 // offsets into the letters-only text buffer.
1056 //
1057 // SummaryBuffer needs to be built to linear, giving linear offset of start of
1058 // each chunk
1059 //
1060 // So we first do all the langprob lookups and merge-sort by offset to make
1061 // a single linear vector, building a side vector of chunk beginnings as we go.
1062 // The sharpening is simply moving the beginnings, scoring is a simple linear
1063 // sweep, etc.
1064 
ProcessHitBuffer(const LangSpan & scriptspan,int letter_offset,ScoringContext * scoringcontext,DocTote * doc_tote,ResultChunkVector * vec,bool more_to_come,bool score_cjk,ScoringHitBuffer * hitbuffer)1065 void ProcessHitBuffer(const LangSpan& scriptspan,
1066                       int letter_offset,
1067                       ScoringContext* scoringcontext,
1068                       DocTote* doc_tote,
1069                       ResultChunkVector* vec,
1070                       bool more_to_come, bool score_cjk,
1071                       ScoringHitBuffer* hitbuffer) {
1072   if (scoringcontext->flags_cld2_verbose) {
1073     fprintf(scoringcontext->debug_file, "Hitbuffer[) ");
1074     DumpHitBuffer(scoringcontext->debug_file, scriptspan.text, hitbuffer);
1075   }
1076 
1077   LinearizeHitBuffer(letter_offset, scoringcontext, more_to_come, score_cjk,
1078                      hitbuffer);
1079 
1080   if (scoringcontext->flags_cld2_verbose) {
1081     fprintf(scoringcontext->debug_file, "Linear[) ");
1082     DumpLinearBuffer(scoringcontext->debug_file, scriptspan.text, hitbuffer);
1083   }
1084 
1085   SummaryBuffer summarybuffer;
1086   summarybuffer.n = 0;
1087   ChunkSpan last_cspan;
1088   ScoreAllHits(scriptspan.text, scriptspan.ulscript,
1089                     more_to_come, score_cjk, hitbuffer,
1090                     scoringcontext,
1091                     &summarybuffer, &last_cspan);
1092 
1093   if (scoringcontext->flags_cld2_verbose) {
1094     DumpSummaryBuffer(scoringcontext->debug_file, &summarybuffer);
1095   }
1096 
1097   if (vec != NULL) {
1098     // Sharpen boundaries of summarybuffer
1099     // This is not a high-performance path
1100     SharpenBoundaries(scriptspan.text, more_to_come, hitbuffer, scoringcontext,
1101                       &summarybuffer);
1102     // Show after the sharpening
1103     // CLD2_Debug2(scriptspan.text, more_to_come, score_cjk,
1104     //             hitbuffer, scoringcontext, &summarybuffer);
1105 
1106     if (scoringcontext->flags_cld2_verbose) {
1107       DumpSummaryBuffer(scoringcontext->debug_file, &summarybuffer);
1108     }
1109   }
1110 
1111   SummaryBufferToDocTote(&summarybuffer, more_to_come, doc_tote);
1112   SummaryBufferToVector(scoringcontext->scanner, scriptspan.text,
1113                         &summarybuffer, more_to_come, vec);
1114 }
1115 
SpliceHitBuffer(ScoringHitBuffer * hitbuffer,int next_offset)1116 void SpliceHitBuffer(ScoringHitBuffer* hitbuffer, int next_offset) {
1117   // Splice hitbuffer and summarybuffer for next round. With big chunks and
1118   // distinctive-word state carried across chunks, we might not need to do this.
1119   hitbuffer->next_base = 0;
1120   hitbuffer->next_delta = 0;
1121   hitbuffer->next_distinct = 0;
1122   hitbuffer->next_linear = 0;
1123   hitbuffer->next_chunk_start = 0;
1124   hitbuffer->lowest_offset = next_offset;
1125 }
1126 
1127 
1128 // Score RTypeNone or RTypeOne scriptspan into doc_tote and vec, updating
1129 // scoringcontext
ScoreEntireScriptSpan(const LangSpan & scriptspan,ScoringContext * scoringcontext,DocTote * doc_tote,ResultChunkVector * vec)1130 void ScoreEntireScriptSpan(const LangSpan& scriptspan,
1131                            ScoringContext* scoringcontext,
1132                            DocTote* doc_tote,
1133                            ResultChunkVector* vec) {
1134   int bytes = scriptspan.text_bytes;
1135   // Artificially set score to 1024 per 1KB, or 1 per byte
1136   int score = bytes;
1137   int reliability = 100;
1138   // doc_tote uses full languages
1139   Language one_one_lang = DefaultLanguage(scriptspan.ulscript);
1140   doc_tote->Add(one_one_lang, bytes, score, reliability);
1141 
1142   if (scoringcontext->flags_cld2_html) {
1143     ChunkSummary chunksummary = {
1144       1, 0,
1145       one_one_lang, UNKNOWN_LANGUAGE, score, 1,
1146       bytes, 0, scriptspan.ulscript, reliability, reliability
1147     };
1148     CLD2_Debug(scriptspan.text, 1, scriptspan.text_bytes,
1149                false, false, NULL,
1150                scoringcontext, NULL, &chunksummary);
1151   }
1152 
1153   // First byte is always a space
1154   JustOneItemToVector(scoringcontext->scanner, scriptspan.text,
1155                       one_one_lang, 1, bytes - 1, vec);
1156 
1157   scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
1158 }
1159 
1160 // Score RTypeCJK scriptspan into doc_tote and vec, updating scoringcontext
ScoreCJKScriptSpan(const LangSpan & scriptspan,ScoringContext * scoringcontext,DocTote * doc_tote,ResultChunkVector * vec)1161 void ScoreCJKScriptSpan(const LangSpan& scriptspan,
1162                         ScoringContext* scoringcontext,
1163                         DocTote* doc_tote,
1164                         ResultChunkVector* vec) {
1165   // Allocate three parallel arrays of scoring hits
1166   ScoringHitBuffer* hitbuffer = new ScoringHitBuffer;
1167   hitbuffer->init();
1168   hitbuffer->ulscript = scriptspan.ulscript;
1169 
1170   scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
1171   scoringcontext->oldest_distinct_boost = 0;
1172 
1173   // Incoming scriptspan has a single leading space at scriptspan.text[0]
1174   // and three trailing spaces then NUL at scriptspan.text[text_bytes + 0/1/2/3]
1175 
1176   int letter_offset = 1;        // Skip initial space
1177   hitbuffer->lowest_offset = letter_offset;
1178   int letter_limit = scriptspan.text_bytes;
1179   while (letter_offset < letter_limit) {
1180     if (scoringcontext->flags_cld2_verbose) {
1181       fprintf(scoringcontext->debug_file, " ScoreCJKScriptSpan[%d,%d)<br>\n",
1182               letter_offset, letter_limit);
1183     }
1184     //
1185     // Fill up one hitbuffer, possibly splicing onto previous fragment
1186     //
1187     // NOTE: GetUniHits deals with close repeats
1188     // NOTE: After last chunk there is always a hitbuffer entry with an offset
1189     // just off the end of the text = next_offset.
1190     int next_offset = GetUniHits(scriptspan.text, letter_offset, letter_limit,
1191                                   scoringcontext, hitbuffer);
1192     // NOTE: GetBiHitVectors deals with close repeats,
1193     // does one hash and two lookups (delta and distinct) per word
1194     GetBiHits(scriptspan.text, letter_offset, next_offset,
1195                 scoringcontext, hitbuffer);
1196 
1197     //
1198     // Score one hitbuffer in chunks to summarybuffer
1199     //
1200     bool more_to_come = next_offset < letter_limit;
1201     bool score_cjk = true;
1202     ProcessHitBuffer(scriptspan, letter_offset, scoringcontext, doc_tote, vec,
1203                      more_to_come, score_cjk, hitbuffer);
1204     SpliceHitBuffer(hitbuffer, next_offset);
1205 
1206     letter_offset = next_offset;
1207   }
1208 
1209   delete hitbuffer;
1210   // Context across buffers is not connected yet
1211   scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
1212 }
1213 
1214 
1215 
1216 // Score RTypeMany scriptspan into doc_tote and vec, updating scoringcontext
1217 // We have a scriptspan with all lowercase text in one script. Look up
1218 // quadgrams and octagrams, saving the hits in three parallel vectors.
1219 // Score from those vectors in chunks, toting each chunk to get a single
1220 // language, and combining into the overall document score. The hit vectors
1221 // in general are not big enough to handle and entire scriptspan, so
1222 // repeat until the entire scriptspan is scored.
1223 // Caller deals with minimizing numbr of runt scriptspans
1224 // This routine deals with minimizing number of runt chunks.
1225 //
1226 // Returns updated scoringcontext
1227 // Returns updated doc_tote
1228 // If vec != NULL, appends to that vector of ResultChunk's
ScoreQuadScriptSpan(const LangSpan & scriptspan,ScoringContext * scoringcontext,DocTote * doc_tote,ResultChunkVector * vec)1229 void ScoreQuadScriptSpan(const LangSpan& scriptspan,
1230                          ScoringContext* scoringcontext,
1231                          DocTote* doc_tote,
1232                          ResultChunkVector* vec) {
1233   // Allocate three parallel arrays of scoring hits
1234   ScoringHitBuffer* hitbuffer = new ScoringHitBuffer;
1235   hitbuffer->init();
1236   hitbuffer->ulscript = scriptspan.ulscript;
1237 
1238   scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
1239   scoringcontext->oldest_distinct_boost = 0;
1240 
1241   // Incoming scriptspan has a single leading space at scriptspan.text[0]
1242   // and three trailing spaces then NUL at scriptspan.text[text_bytes + 0/1/2/3]
1243 
1244   int letter_offset = 1;        // Skip initial space
1245   hitbuffer->lowest_offset = letter_offset;
1246   int letter_limit = scriptspan.text_bytes;
1247   while (letter_offset < letter_limit) {
1248     //
1249     // Fill up one hitbuffer, possibly splicing onto previous fragment
1250     //
1251     // NOTE: GetQuadHits deals with close repeats
1252     // NOTE: After last chunk there is always a hitbuffer entry with an offset
1253     // just off the end of the text = next_offset.
1254     int next_offset = GetQuadHits(scriptspan.text, letter_offset, letter_limit,
1255                                   scoringcontext, hitbuffer);
1256     // If true, there is more text to process in this scriptspan
1257     // NOTE: GetOctaHitVectors deals with close repeats,
1258     // does one hash and two lookups (delta and distinct) per word
1259     GetOctaHits(scriptspan.text, letter_offset, next_offset,
1260                 scoringcontext, hitbuffer);
1261 
1262     //
1263     // Score one hitbuffer in chunks to summarybuffer
1264     //
1265     bool more_to_come = next_offset < letter_limit;
1266     bool score_cjk = false;
1267     ProcessHitBuffer(scriptspan, letter_offset, scoringcontext, doc_tote, vec,
1268                      more_to_come, score_cjk, hitbuffer);
1269     SpliceHitBuffer(hitbuffer, next_offset);
1270 
1271     letter_offset = next_offset;
1272   }
1273 
1274   delete hitbuffer;
1275 }
1276 
1277 
1278 // Score one scriptspan into doc_tote and vec, updating scoringcontext
1279 // Inputs:
1280 //  One scriptspan of perhaps 40-60KB, all same script lower-case letters
1281 //    and single ASCII spaces. First character is a space to allow simple
1282 //    begining-of-word detect. End of buffer has three spaces and NUL to
1283 //    allow easy scan-to-end-of-word.
1284 //  Scoring context of
1285 //    scoring tables
1286 //    flags
1287 //    running boosts
1288 // Outputs:
1289 //  Updated doc_tote giving overall languages and byte counts
1290 //  Optional updated chunk vector giving offset, length, language
1291 //
1292 // Caller initializes flags, boosts, doc_tote and vec.
1293 // Caller aggregates across multiple scriptspans
1294 // Caller calculates final document result
1295 // Caller deals with detecting and triggering suppression of repeated text.
1296 //
1297 // This top-level routine just chooses the recognition type and calls one of
1298 // the next-level-down routines.
1299 //
ScoreOneScriptSpan(const LangSpan & scriptspan,ScoringContext * scoringcontext,DocTote * doc_tote,ResultChunkVector * vec)1300 void ScoreOneScriptSpan(const LangSpan& scriptspan,
1301                         ScoringContext* scoringcontext,
1302                         DocTote* doc_tote,
1303                         ResultChunkVector* vec) {
1304   if (scoringcontext->flags_cld2_verbose) {
1305     fprintf(scoringcontext->debug_file, "<br>ScoreOneScriptSpan(%s,%d) ",
1306             ULScriptCode(scriptspan.ulscript), scriptspan.text_bytes);
1307     // Optionally print the chunk lowercase letters/marks text
1308     string temp(&scriptspan.text[0], scriptspan.text_bytes);
1309     fprintf(scoringcontext->debug_file, "'%s'",
1310             GetHtmlEscapedText(temp).c_str());
1311     fprintf(scoringcontext->debug_file, "<br>\n");
1312   }
1313   scoringcontext->prior_chunk_lang = UNKNOWN_LANGUAGE;
1314   scoringcontext->oldest_distinct_boost = 0;
1315   ULScriptRType rtype = ULScriptRecognitionType(scriptspan.ulscript);
1316   if (scoringcontext->flags_cld2_score_as_quads && (rtype != RTypeCJK)) {
1317     rtype = RTypeMany;
1318   }
1319   switch (rtype) {
1320   case RTypeNone:
1321   case RTypeOne:
1322     ScoreEntireScriptSpan(scriptspan, scoringcontext, doc_tote, vec);
1323     break;
1324   case RTypeCJK:
1325     ScoreCJKScriptSpan(scriptspan, scoringcontext, doc_tote, vec);
1326     break;
1327   case RTypeMany:
1328     ScoreQuadScriptSpan(scriptspan, scoringcontext, doc_tote, vec);
1329     break;
1330   }
1331 }
1332 
1333 }       // End namespace CLD2
1334 
1335