1 // Copyright 2013 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 //
16 // Author: dsites@google.com (Dick Sites)
17 // Updated 2014.01 for dual table lookup
18 //
19 
20 #include <stdio.h>
21 #include <string.h>
22 #include <string>
23 #include <vector>
24 
25 #include "cldutil.h"
26 #include "debug.h"
27 #include "integral_types.h"
28 #include "lang_script.h"
29 #include "utf8statetable.h"
30 
31 #ifdef CLD2_DYNAMIC_MODE
32 #include "cld2_dynamic_data.h"
33 #include "cld2_dynamic_data_loader.h"
34 #endif
35 #include "cld2tablesummary.h"
36 #include "compact_lang_det_impl.h"
37 #include "compact_lang_det_hint_code.h"
38 #include "getonescriptspan.h"
39 #include "tote.h"
40 
41 
42 namespace CLD2 {
43 
44 using namespace std;
45 
46 // Linker supplies the right tables, From files
47 // cld_generated_cjk_uni_prop_80.cc  cld2_generated_cjk_compatible.cc
48 // cld_generated_cjk_delta_bi_32.cc  generated_distinct_bi_0.cc
49 // cld2_generated_quad*.cc  cld2_generated_deltaocta*.cc
50 // cld2_generated_distinctocta*.cc
51 // cld_generated_score_quad_octa_1024_256.cc
52 
53 // 2014.01 Now implementing quadgram dual lookup tables, to allow main table
54 //   sizes that are 1/3/5 times a power of two, instead of just powers of two.
55 //   Gives more flexibility of total footprint for CLD2.
56 
// Declared here, defined in another translation unit.
// NOTE(review): presumably the sizes of the language-to-plang and close-set
// tables -- confirm against their definitions.
extern const int kLanguageToPLangSize;
extern const int kCloseSetSize;

// Scoring tables supplied at link time by the generated data files listed in
// the comment above. In the non-dynamic build their addresses are collected
// into kScoringtables below.
extern const UTF8PropObj cld_generated_CjkUni_obj;
extern const CLD2TableSummary kCjkCompat_obj;
extern const CLD2TableSummary kCjkDeltaBi_obj;
extern const CLD2TableSummary kDistinctBiTable_obj;
extern const CLD2TableSummary kQuad_obj;
extern const CLD2TableSummary kQuad_obj2;     // Dual lookup tables
extern const CLD2TableSummary kDeltaOcta_obj;
extern const CLD2TableSummary kDistinctOcta_obj;
extern const short kAvgDeltaOctaScore[];
69 
70 #ifdef CLD2_DYNAMIC_MODE
// CLD2_DYNAMIC_MODE is defined:
// Data will be read from an mmap opened at runtime.
// Every table pointer starts out NULL; loadData() overwrites this struct with
// a copy of the table set returned by the dynamic data loader. The trailing
// comment on each slot names the table that the non-dynamic build stores there.
static ScoringTables kScoringtables = {
  NULL, //&cld_generated_CjkUni_obj,
  NULL, //&kCjkCompat_obj,
  NULL, //&kCjkDeltaBi_obj,
  NULL, //&kDistinctBiTable_obj,
  NULL, //&kQuad_obj,
  NULL, //&kQuad_obj2,
  NULL, //&kDeltaOcta_obj,
  NULL, //&kDistinctOcta_obj,
  NULL, //kAvgDeltaOctaScore,
};
// Set by loadData() and cleared by unloadData(); reported by isDataLoaded().
static bool dynamicDataLoaded = false;
// Table set returned by CLD2DynamicDataLoader::loadDataFile().
static ScoringTables* dynamicTables = NULL;
// Out-parameters filled in by the loader and handed back to it on unload.
// NOTE(review): presumably the base address and byte length of the mapped
// data file -- confirm against the loader.
static void* mmapAddress = NULL;
static int mmapLength = 0;
88 
isDataLoaded()89   bool isDataLoaded() { return dynamicDataLoaded; }
90 
loadData(const char * fileName)91   void loadData(const char* fileName) {
92     if (isDataLoaded()) {
93       unloadData();
94     }
95     dynamicTables = CLD2DynamicDataLoader::loadDataFile(fileName, &mmapAddress, &mmapLength);
96     kScoringtables = *dynamicTables;
97     dynamicDataLoaded = true;
98   };
99 
unloadData()100   void unloadData() {
101     if (!dynamicDataLoaded) return;
102     dynamicDataLoaded = false;
103     // unloading will null all the pointers out.
104     CLD2DynamicDataLoader::unloadData(&dynamicTables, &mmapAddress, &mmapLength);
105   }
106 #else
// This initializes kScoringtables.quadgram_obj etc.
// Non-dynamic build: the struct is const and points directly at the
// link-time tables declared above. The initializer order matches the
// NULL placeholders used in the CLD2_DYNAMIC_MODE variant.
static const ScoringTables kScoringtables = {
  &cld_generated_CjkUni_obj,
  &kCjkCompat_obj,
  &kCjkDeltaBi_obj,
  &kDistinctBiTable_obj,

  &kQuad_obj,
  &kQuad_obj2,                              // Dual lookup tables
  &kDeltaOcta_obj,
  &kDistinctOcta_obj,

  kAvgDeltaOctaScore,
};
121 #endif // #ifdef CLD2_DYNAMIC_MODE
122 
123 
// Hard-coded flag values.
// NOTE(review): the FLAGS_ prefix suggests these stand in for command-line
// flags -- confirm. Being compile-time constants, branches testing them are
// fixed for a given build.
static const bool FLAGS_cld_no_minimum_bytes = false;
static const bool FLAGS_cld_forcewords = true;
// When true, CheapRepWordsInplace marks each deleted word with ". ".
static const bool FLAGS_cld_showme = false;
static const bool FLAGS_cld_echotext = true;
static const int32 FLAGS_cld_textlimit = 160;
static const int32 FLAGS_cld_smoothwidth = 20;
static const bool FLAGS_cld_2011_hints = true;
static const int32 FLAGS_cld_max_lang_tag_scan_kb = 8;

static const bool FLAGS_dbgscore = false;


// Language-hint boosts.
static const int kLangHintInitial = 12;  // Boost language by N initially
static const int kLangHintBoost = 12;    // Boost language by N/16 per quadgram

static const int kShortSpanThresh = 32;       // Bytes
static const int kMaxSecondChanceLen = 1024;  // Look at first 1K of short spans

// Triggers for deciding whether to squeeze at all.
static const int kCheapSqueezeTestThresh = 4096;  // Only look for squeezing
                                                  // after this many text bytes
static const int kCheapSqueezeTestLen = 256;  // Bytes to test to trigger sqz
static const int kSpacesTriggerPercent = 25;  // Trigger sqz if >=25% spaces
static const int kPredictTriggerPercent = 67; // Trigger sqz if >=67% predicted

// Per-chunk thresholds used while squeezing.
static const int kChunksizeDefault = 48;      // Squeeze 48-byte chunks
static const int kSpacesThreshPercent = 25;   // Squeeze if >=25% spaces
static const int kPredictThreshPercent = 40;  // Squeeze if >=40% predicted

// Upper bound on how far BackscanToSpace/ForwardscanToSpace look.
static const int kMaxSpaceScan = 32;          // Bytes

static const int kGoodLang1Percent = 70;
static const int kGoodLang1and2Percent = 93;
static const int kShortTextThresh = 256;      // Bytes

static const int kMinChunkSizeQuads = 4;      // Chunk is at least four quads
static const int kMaxChunkSizeQuads = 1024;   // Chunk is at most 1K quads

static const int kDefaultWordSpan = 256;      // Scan at least this many initial
                                              // bytes with word scoring
static const int kReallyBigWordSpan = 9999999;  // Forces word scoring all text

static const int kMinReliableSeq = 50;      // Record in seq if >= 50% reliable

// Number of int entries in the prediction table; the predictor masks its
// hash with 0xfff, so every hash value indexes inside a 4096-entry table.
static const int kPredictionTableSize = 4096;   // Must be exactly 4096 for
                                                // cheap compressor

static const int kNonEnBoilerplateMinPercent = 17;    // <this => no second
static const int kNonFIGSBoilerplateMinPercent = 20;  // <this => no second
static const int kGoodFirstMinPercent = 26;           // <this => UNK
static const int kGoodFirstReliableMinPercent = 51;   // <this => unreli
static const int kIgnoreMaxPercent = 20;              // >this => unreli
static const int kKeepMinPercent = 2;                 // <this => unreli
176 
177 
178 
179 // Statistically closest language, based on quadgram table
// Those that are far from other languages map to UNKNOWN_LANGUAGE
181 // Subscripted by Language
182 //
183 // From lang_correlation.txt and hand-edits
184 // sed 's/^\([^ ]*\) \([^ ]*\) coef=0\.\(..\).*$/
185 //   (\3 >= kMinCorrPercent) ? \2 : UNKNOWN_LANGUAGE,
186 //   \/\/ \1/' lang_correlation.txt >/tmp/closest_lang_decl.txt
187 //
// Threshold, in percent, that a row's correlation coefficient must reach for
// kClosestAltLanguage to name an alternate language for that row.
static const int kMinCorrPercent = 24;        // Pick off how close you want
                                              // 24 catches PERSIAN <== ARABIC
                                              // but not SPANISH <== PORTUGUESE
// Placeholder alternate used by the coefficient-0 rows of kClosestAltLanguage;
// since 0 < kMinCorrPercent those rows evaluate to UNKNOWN_LANGUAGE anyway.
static Language Unknown = UNKNOWN_LANGUAGE;
192 
// Suspect idea
// Subscripted by Language
//
// Each row has the form
//   (coef >= kMinCorrPercent) ? <alternate> : UNKNOWN_LANGUAGE,  // <language>
// where coef is the two-digit correlation percentage produced by the sed
// recipe above and the trailing comment names the language the row belongs
// to. A row yields its alternate only when coef reaches kMinCorrPercent;
// otherwise it yields UNKNOWN_LANGUAGE. The ChineseT row is a hand-edit (its
// original coefficient-0 form is kept in a comment next to it).
static const Language kClosestAltLanguage[] = {
  (28 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE,  // ENGLISH
  (36 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // DANISH
  (31 >= kMinCorrPercent) ? AFRIKAANS : UNKNOWN_LANGUAGE,  // DUTCH
  (15 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE,  // FINNISH
  (11 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // FRENCH
  (17 >= kMinCorrPercent) ? LUXEMBOURGISH : UNKNOWN_LANGUAGE,  // GERMAN
  (27 >= kMinCorrPercent) ? YIDDISH : UNKNOWN_LANGUAGE,  // HEBREW
  (16 >= kMinCorrPercent) ? CORSICAN : UNKNOWN_LANGUAGE,  // ITALIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Japanese
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Korean
  (41 >= kMinCorrPercent) ? NORWEGIAN_N : UNKNOWN_LANGUAGE,  // NORWEGIAN
  ( 5 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // POLISH
  (23 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // PORTUGUESE
  (33 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE,  // RUSSIAN
  (28 >= kMinCorrPercent) ? GALICIAN : UNKNOWN_LANGUAGE,  // SPANISH
  (17 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // SWEDISH
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Chinese
  (42 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // CZECH
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GREEK
  (35 >= kMinCorrPercent) ? FAROESE : UNKNOWN_LANGUAGE,  // ICELANDIC
  ( 7 >= kMinCorrPercent) ? LITHUANIAN : UNKNOWN_LANGUAGE,  // LATVIAN
  ( 7 >= kMinCorrPercent) ? LATVIAN : UNKNOWN_LANGUAGE,  // LITHUANIAN
  ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // ROMANIAN
  ( 4 >= kMinCorrPercent) ? SLOVAK : UNKNOWN_LANGUAGE,  // HUNGARIAN
  (15 >= kMinCorrPercent) ? FINNISH : UNKNOWN_LANGUAGE,  // ESTONIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Ignore
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // Unknown
  (33 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // BULGARIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CROATIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SERBIAN
  (24 >= kMinCorrPercent) ? SCOTS_GAELIC : UNKNOWN_LANGUAGE,  // IRISH
  (28 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // GALICIAN
  ( 8 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE,  // TAGALOG
  (29 >= kMinCorrPercent) ? AZERBAIJANI : UNKNOWN_LANGUAGE,  // TURKISH
  (28 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // UKRAINIAN
  (37 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE,  // HINDI
  (29 >= kMinCorrPercent) ? BULGARIAN : UNKNOWN_LANGUAGE,  // MACEDONIAN
  (14 >= kMinCorrPercent) ? ASSAMESE : UNKNOWN_LANGUAGE,  // BENGALI
  (46 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE,  // INDONESIAN
  ( 9 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE,  // LATIN
  (46 >= kMinCorrPercent) ? INDONESIAN : UNKNOWN_LANGUAGE,  // MALAY
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // MALAYALAM
  ( 4 >= kMinCorrPercent) ? BRETON : UNKNOWN_LANGUAGE,  // WELSH
  ( 8 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // NEPALI
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // TELUGU
  ( 3 >= kMinCorrPercent) ? ESPERANTO : UNKNOWN_LANGUAGE,  // ALBANIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // TAMIL
  (22 >= kMinCorrPercent) ? UKRAINIAN : UNKNOWN_LANGUAGE,  // BELARUSIAN
  (15 >= kMinCorrPercent) ? SUNDANESE : UNKNOWN_LANGUAGE,  // JAVANESE
  (19 >= kMinCorrPercent) ? CATALAN : UNKNOWN_LANGUAGE,  // OCCITAN
  (27 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // URDU
  (36 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // BIHARI
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GUJARATI
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // THAI
  (24 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // ARABIC
  (19 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // CATALAN
  ( 4 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // ESPERANTO
  ( 3 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // BASQUE
  ( 9 >= kMinCorrPercent) ? LATIN : UNKNOWN_LANGUAGE,  // INTERLINGUA
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // KANNADA
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PUNJABI
  (24 >= kMinCorrPercent) ? IRISH : UNKNOWN_LANGUAGE,  // SCOTS_GAELIC
  ( 7 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // SWAHILI
  (28 >= kMinCorrPercent) ? SERBIAN : UNKNOWN_LANGUAGE,  // SLOVENIAN
  (37 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // MARATHI
  ( 3 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // MALTESE
  ( 1 >= kMinCorrPercent) ? YORUBA : UNKNOWN_LANGUAGE,  // VIETNAMESE
  (15 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // FRISIAN
  (42 >= kMinCorrPercent) ? CZECH : UNKNOWN_LANGUAGE,  // SLOVAK
  // Original ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ChineseT
  (24 >= kMinCorrPercent) ? CHINESE : UNKNOWN_LANGUAGE,  // ChineseT
  (35 >= kMinCorrPercent) ? ICELANDIC : UNKNOWN_LANGUAGE,  // FAROESE
  (15 >= kMinCorrPercent) ? JAVANESE : UNKNOWN_LANGUAGE,  // SUNDANESE
  (17 >= kMinCorrPercent) ? TAJIK : UNKNOWN_LANGUAGE,  // UZBEK
  ( 7 >= kMinCorrPercent) ? TIGRINYA : UNKNOWN_LANGUAGE,  // AMHARIC
  (29 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE,  // AZERBAIJANI
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // GEORGIAN
  ( 7 >= kMinCorrPercent) ? AMHARIC : UNKNOWN_LANGUAGE,  // TIGRINYA
  (27 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE,  // PERSIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // BOSNIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SINHALESE
  (41 >= kMinCorrPercent) ? NORWEGIAN : UNKNOWN_LANGUAGE,  // NORWEGIAN_N
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PORTUGUESE_P
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // PORTUGUESE_B
  (37 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE,  // XHOSA
  (37 >= kMinCorrPercent) ? XHOSA : UNKNOWN_LANGUAGE,  // ZULU
  ( 2 >= kMinCorrPercent) ? SPANISH : UNKNOWN_LANGUAGE,  // GUARANI
  (29 >= kMinCorrPercent) ? TSWANA : UNKNOWN_LANGUAGE,  // SESOTHO
  ( 7 >= kMinCorrPercent) ? TURKISH : UNKNOWN_LANGUAGE,  // TURKMEN
  ( 8 >= kMinCorrPercent) ? KAZAKH : UNKNOWN_LANGUAGE,  // KYRGYZ
  ( 5 >= kMinCorrPercent) ? FRENCH : UNKNOWN_LANGUAGE,  // BRETON
  ( 3 >= kMinCorrPercent) ? GANDA : UNKNOWN_LANGUAGE,  // TWI
  (27 >= kMinCorrPercent) ? HEBREW : UNKNOWN_LANGUAGE,  // YIDDISH
  (28 >= kMinCorrPercent) ? SLOVENIAN : UNKNOWN_LANGUAGE,  // SERBO_CROATIAN
  (12 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE,  // SOMALI
  ( 9 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE,  // UIGHUR
  (15 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // KURDISH
  ( 6 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE,  // MONGOLIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ARMENIAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // LAOTHIAN
  ( 8 >= kMinCorrPercent) ? URDU : UNKNOWN_LANGUAGE,  // SINDHI
  (10 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // RHAETO_ROMANCE
  (31 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // AFRIKAANS
  (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // LUXEMBOURGISH
  ( 2 >= kMinCorrPercent) ? SCOTS : UNKNOWN_LANGUAGE,  // BURMESE
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // KHMER
  (45 >= kMinCorrPercent) ? DZONGKHA : UNKNOWN_LANGUAGE,  // TIBETAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // DHIVEHI
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CHEROKEE
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // SYRIAC
  ( 8 >= kMinCorrPercent) ? DUTCH : UNKNOWN_LANGUAGE,  // LIMBU
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ORIYA
  (14 >= kMinCorrPercent) ? BENGALI : UNKNOWN_LANGUAGE,  // ASSAMESE
  (16 >= kMinCorrPercent) ? ITALIAN : UNKNOWN_LANGUAGE,  // CORSICAN
  ( 5 >= kMinCorrPercent) ? INTERLINGUA : UNKNOWN_LANGUAGE,  // INTERLINGUE
  ( 8 >= kMinCorrPercent) ? KYRGYZ : UNKNOWN_LANGUAGE,  // KAZAKH
  ( 4 >= kMinCorrPercent) ? SWAHILI : UNKNOWN_LANGUAGE,  // LINGALA
  (11 >= kMinCorrPercent) ? RUSSIAN : UNKNOWN_LANGUAGE,  // MOLDAVIAN
  (19 >= kMinCorrPercent) ? PERSIAN : UNKNOWN_LANGUAGE,  // PASHTO
  ( 5 >= kMinCorrPercent) ? AYMARA : UNKNOWN_LANGUAGE,  // QUECHUA
  ( 5 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // SHONA
  (17 >= kMinCorrPercent) ? UZBEK : UNKNOWN_LANGUAGE,  // TAJIK
  (13 >= kMinCorrPercent) ? BASHKIR : UNKNOWN_LANGUAGE,  // TATAR
  (11 >= kMinCorrPercent) ? SAMOAN : UNKNOWN_LANGUAGE,  // TONGA
  ( 2 >= kMinCorrPercent) ? TWI : UNKNOWN_LANGUAGE,  // YORUBA
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_ENGLISH_BASED
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_FRENCH_BASED
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_PORTUGUESE_BASED
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // CREOLES_AND_PIDGINS_OTHER
  ( 6 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // MAORI
  ( 3 >= kMinCorrPercent) ? OROMO : UNKNOWN_LANGUAGE,  // WOLOF
  ( 1 >= kMinCorrPercent) ? MONGOLIAN : UNKNOWN_LANGUAGE,  // ABKHAZIAN
  ( 8 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE,  // AFAR
  ( 5 >= kMinCorrPercent) ? QUECHUA : UNKNOWN_LANGUAGE,  // AYMARA
  (13 >= kMinCorrPercent) ? TATAR : UNKNOWN_LANGUAGE,  // BASHKIR
  ( 3 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // BISLAMA
  (45 >= kMinCorrPercent) ? TIBETAN : UNKNOWN_LANGUAGE,  // DZONGKHA
  ( 4 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // FIJIAN
  ( 7 >= kMinCorrPercent) ? INUPIAK : UNKNOWN_LANGUAGE,  // GREENLANDIC
  ( 3 >= kMinCorrPercent) ? AFAR : UNKNOWN_LANGUAGE,  // HAUSA
  ( 3 >= kMinCorrPercent) ? OCCITAN : UNKNOWN_LANGUAGE,  // HAITIAN_CREOLE
  ( 7 >= kMinCorrPercent) ? GREENLANDIC : UNKNOWN_LANGUAGE,  // INUPIAK
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // INUKTITUT
  ( 4 >= kMinCorrPercent) ? HINDI : UNKNOWN_LANGUAGE,  // KASHMIRI
  (30 >= kMinCorrPercent) ? RUNDI : UNKNOWN_LANGUAGE,  // KINYARWANDA
  ( 2 >= kMinCorrPercent) ? TAGALOG : UNKNOWN_LANGUAGE,  // MALAGASY
  (17 >= kMinCorrPercent) ? GERMAN : UNKNOWN_LANGUAGE,  // NAURU
  (12 >= kMinCorrPercent) ? SOMALI : UNKNOWN_LANGUAGE,  // OROMO
  (30 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // RUNDI
  (11 >= kMinCorrPercent) ? TONGA : UNKNOWN_LANGUAGE,  // SAMOAN
  ( 1 >= kMinCorrPercent) ? LINGALA : UNKNOWN_LANGUAGE,  // SANGO
  (32 >= kMinCorrPercent) ? MARATHI : UNKNOWN_LANGUAGE,  // SANSKRIT
  (16 >= kMinCorrPercent) ? ZULU : UNKNOWN_LANGUAGE,  // SISWANT
  ( 5 >= kMinCorrPercent) ? SISWANT : UNKNOWN_LANGUAGE,  // TSONGA
  (29 >= kMinCorrPercent) ? SESOTHO : UNKNOWN_LANGUAGE,  // TSWANA
  ( 2 >= kMinCorrPercent) ? ESTONIAN : UNKNOWN_LANGUAGE,  // VOLAPUK
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // ZHUANG
  ( 1 >= kMinCorrPercent) ? MALAY : UNKNOWN_LANGUAGE,  // KHASI
  (28 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // SCOTS
  (15 >= kMinCorrPercent) ? KINYARWANDA : UNKNOWN_LANGUAGE,  // GANDA
  ( 7 >= kMinCorrPercent) ? ENGLISH : UNKNOWN_LANGUAGE,  // MANX
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // MONTENEGRIN

  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // AKAN
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // IGBO
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // MAURITIAN_CREOLE
  ( 0 >= kMinCorrPercent) ? Unknown : UNKNOWN_LANGUAGE,  // HAWAIIAN
};
364 
365 // COMPILE_ASSERT(arraysize(kClosestAltLanguage) == NUM_LANGUAGES,
366 //                kClosestAltLanguage_has_incorrect_size);
367 
368 
FlagFinish(int flags)369 inline bool FlagFinish(int flags) {return (flags & kCLDFlagFinish) != 0;}
FlagSqueeze(int flags)370 inline bool FlagSqueeze(int flags) {return (flags & kCLDFlagSqueeze) != 0;}
FlagRepeats(int flags)371 inline bool FlagRepeats(int flags) {return (flags & kCLDFlagRepeats) != 0;}
FlagTop40(int flags)372 inline bool FlagTop40(int flags) {return (flags & kCLDFlagTop40) != 0;}
FlagShort(int flags)373 inline bool FlagShort(int flags) {return (flags & kCLDFlagShort) != 0;}
FlagHint(int flags)374 inline bool FlagHint(int flags) {return (flags & kCLDFlagHint) != 0;}
FlagUseWords(int flags)375 inline bool FlagUseWords(int flags) {return (flags & kCLDFlagUseWords) != 0;}
376 
377 
378   // Defines Top40 packed languages
379 
380   // Google top 40 languages
381   //
382   // Tier 0/1 Language enum list (16)
383   //   ENGLISH, /*no en_GB,*/ FRENCH, ITALIAN, GERMAN, SPANISH,    // E - FIGS
384   //   DUTCH, CHINESE, CHINESE_T, JAPANESE, KOREAN,
385   //   PORTUGUESE, RUSSIAN, POLISH, TURKISH, THAI,
386   //   ARABIC,
387   //
388   // Tier 2 Language enum list (22)
389   //   SWEDISH, FINNISH, DANISH, /*no pt-PT,*/ ROMANIAN, HUNGARIAN,
390   //   HEBREW, INDONESIAN, CZECH, GREEK, NORWEGIAN,
391   //   VIETNAMESE, BULGARIAN, CROATIAN, LITHUANIAN, SLOVAK,
392   //   TAGALOG, SLOVENIAN, SERBIAN, CATALAN, LATVIAN,
393   //   UKRAINIAN, HINDI,
394   //
395   //   use SERBO_CROATIAN instead of BOSNIAN, SERBIAN, CROATIAN, MONTENEGRIN(21)
396   //
397   // Include IgnoreMe (TG_UNKNOWN_LANGUAGE, 25+1) as a top 40
398 
399 
DemoteNotTop40(Tote * chunk_tote,uint16 psplus_one)400 void DemoteNotTop40(Tote* chunk_tote, uint16 psplus_one) {
401   // REVISIT
402 }
403 
PrintText(FILE * f,Language cur_lang,const string & temp)404 void PrintText(FILE* f, Language cur_lang, const string& temp) {
405   if (temp.size() == 0) {return;}
406   fprintf(f, "PrintText[%s]%s<br>\n", LanguageName(cur_lang), temp.c_str());
407 }
408 
409 
410 //------------------------------------------------------------------------------
411 // For --cld_html debugging output. Not thread safe
412 //------------------------------------------------------------------------------
// File-scope mutable state for the debugging output; shared by every caller
// in the process, which is why the banner above warns it is not thread safe.
static Language prior_lang = UNKNOWN_LANGUAGE;
static bool prior_unreliable = false;
415 
416 //------------------------------------------------------------------------------
417 // End For --cld_html debugging output
418 //------------------------------------------------------------------------------
419 
420 
421 // Backscan to word boundary, returning how many bytes n to go back
422 // so that src - n is non-space ans src - n - 1 is space.
423 // If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary
BackscanToSpace(const char * src,int limit)424 int BackscanToSpace(const char* src, int limit) {
425   int n = 0;
426   limit = minint(limit, kMaxSpaceScan);
427   while (n < limit) {
428     if (src[-n - 1] == ' ') {return n;}    // We are at _X
429     ++n;
430   }
431   n = 0;
432   while (n < limit) {
433     if ((src[-n] & 0xc0) != 0x80) {return n;}    // We are at char begin
434     ++n;
435   }
436   return 0;
437 }
438 
439 // Forwardscan to word boundary, returning how many bytes n to go forward
440 // so that src + n is non-space ans src + n - 1 is space.
441 // If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary
ForwardscanToSpace(const char * src,int limit)442 int ForwardscanToSpace(const char* src, int limit) {
443   int n = 0;
444   limit = minint(limit, kMaxSpaceScan);
445   while (n < limit) {
446     if (src[n] == ' ') {return n + 1;}    // We are at _X
447     ++n;
448   }
449   n = 0;
450   while (n < limit) {
451     if ((src[n] & 0xc0) != 0x80) {return n;}    // We are at char begin
452     ++n;
453   }
454   return 0;
455 }
456 
457 
// This uses a cheap predictor to get a measure of compression, and
// hence a measure of repetitiveness. It works on complete UTF-8 characters
// instead of bytes, because three-byte UTF-8 Indic, etc. text compress highly
// all the time when done with a byte-based count. Sigh.
//
// To allow running prediction across multiple chunks, caller passes in current
// 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
//
// Params:
//   isrc, src_len  text to scan; exactly src_len bytes are read
//   hash           in/out: running 12-bit hash (must be in 0..4095 on entry)
//   tbl            in/out: prediction table, indexed by the hash
//
// Returns the number of *bytes* correctly predicted, increments by 1..4 for
// each correctly-predicted character.
//
// A multi-byte character that is cut off by src_len is treated as a shorter
// character made of just the bytes that are present, so the scan never reads
// past isrc + src_len and the result never exceeds src_len. For text that
// ends on a character boundary the result is the same as packing each
// character's bytes big-endian into one int.
//

// TODO(dsites) make this use just one byte per UTF-8 char and incr by charlen

int CountPredictedBytes(const char* isrc, int src_len, int* hash, int* tbl) {
  int p_count = 0;
  const unsigned char* src = reinterpret_cast<const unsigned char*>(isrc);
  const unsigned char* srclimit = src + src_len;
  int local_hash = *hash;

  while (src < srclimit) {
    const unsigned int lead = src[0];

    // Character length implied by the lead byte. One-byte and stray
    // continuation bytes (00xxxxxx 01xxxxxx 10xxxxxx) count as length 1.
    int incr = 1;
    if (lead >= 0xc0) {
      if ((lead & 0xe0) == 0xc0) {
        incr = 2;
      } else if ((lead & 0xf0) == 0xe0) {
        incr = 3;
      } else {
        incr = 4;
      }
    }
    // Never consume bytes beyond the caller's limit.
    const int remaining = static_cast<int>(srclimit - src);
    if (incr > remaining) {incr = remaining;}

    // Pack the character's bytes big-endian. Done in unsigned arithmetic:
    // shifting a lead byte >= 0x80 left by 24 overflows a signed int.
    unsigned int packed = lead;
    for (int i = 1; i < incr; ++i) {
      packed = (packed << 8) | src[i];
    }
    const int c = static_cast<int>(packed);
    src += incr;

    const int p = tbl[local_hash];      // Prediction
    tbl[local_hash] = c;                // Update prediction
    if (c == p) {
      p_count += incr;                  // Count bytes of good predictions
    }

    local_hash = static_cast<int>(
        ((static_cast<unsigned int>(local_hash) << 4) ^ packed) & 0xfff);
  }
  *hash = local_hash;
  return p_count;
}
514 
515 
516 
// Counts the ASCII spaces in src, working on four bytes per step, which is a
// little faster than one-at-a-time.
// Only the first (src_len rounded down to a multiple of 4) bytes are examined;
// the 0..3 odd bytes at the end are not counted.
int CountSpaces4(const char* src, int src_len) {
  const int usable_len = src_len & ~3;   // Drop the odd bytes at the end
  int spaces = 0;
  int pos = 0;
  while (pos < usable_len) {
    const char* quad = src + pos;
    spaces += (quad[0] == ' ') + (quad[1] == ' ') +
              (quad[2] == ' ') + (quad[3] == ' ');
    pos += 4;
  }
  return spaces;
}
529 
530 
531 // Remove words of text that have more than half their letters predicted
532 // correctly by our cheap predictor, moving the remaining words in-place
533 // to the front of the input buffer.
534 //
535 // To allow running prediction across multiple chunks, caller passes in current
536 // 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
537 //
538 // Return the new, possibly-shorter length
539 //
540 // Result Buffer ALWAYS has leading space and trailing space space space NUL,
541 // if input does
542 //
CheapRepWordsInplace(char * isrc,int src_len,int * hash,int * tbl)543 int CheapRepWordsInplace(char* isrc, int src_len, int* hash, int* tbl) {
544   const uint8* src = reinterpret_cast<const uint8*>(isrc);
545   const uint8* srclimit = src + src_len;
546   char* dst = isrc;
547   int local_hash = *hash;
548   char* word_dst = dst;           // Start of next word
549   int good_predict_bytes = 0;
550   int word_length_bytes = 0;
551 
552   while (src < srclimit) {
553     int c = src[0];
554     int incr = 1;
555     *dst++ = c;
556 
557     if (c == ' ') {
558       if ((good_predict_bytes * 2) > word_length_bytes) {
559         // Word is well-predicted: backup to start of this word
560         dst = word_dst;
561         if (FLAGS_cld_showme) {
562           // Mark the deletion point with period
563           // Don't repeat multiple periods
564           // Cannot mark with more bytes or may overwrite unseen input
565           if ((isrc < (dst - 2)) && (dst[-2] != '.')) {
566             *dst++ = '.';
567             *dst++ = ' ';
568           }
569         }
570       }
571       word_dst = dst;              // Start of next word
572       good_predict_bytes = 0;
573       word_length_bytes = 0;
574     }
575 
576     // Pick up one char and length
577     if (c < 0xc0) {
578       // One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
579       // Do nothing more
580     } else if ((c & 0xe0) == 0xc0) {
581       // Two-byte
582       *dst++ = src[1];
583       c = (c << 8) | src[1];
584       incr = 2;
585     } else if ((c & 0xf0) == 0xe0) {
586       // Three-byte
587       *dst++ = src[1];
588       *dst++ = src[2];
589       c = (c << 16) | (src[1] << 8) | src[2];
590       incr = 3;
591     } else {
592       // Four-byte
593       *dst++ = src[1];
594       *dst++ = src[2];
595       *dst++ = src[3];
596       c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
597       incr = 4;
598     }
599     src += incr;
600     word_length_bytes += incr;
601 
602     int p = tbl[local_hash];            // Prediction
603     tbl[local_hash] = c;                // Update prediction
604     if (c == p) {
605       good_predict_bytes += incr;       // Count good predictions
606     }
607 
608     local_hash = ((local_hash << 4) ^ c) & 0xfff;
609   }
610 
611   *hash = local_hash;
612 
613   if ((dst - isrc) < (src_len - 3)) {
614     // Pad and make last char clean UTF-8 by putting following spaces
615     dst[0] = ' ';
616     dst[1] = ' ';
617     dst[2] = ' ';
618     dst[3] = '\0';
619   } else  if ((dst - isrc) < src_len) {
620     // Make last char clean UTF-8 by putting following space off the end
621     dst[0] = ' ';
622   }
623 
624   return static_cast<int>(dst - isrc);
625 }
626 
627 
// This alternate form overwrites redundant words with '.' characters instead
// of deleting them, thus avoiding corrupting the backmap used to generate a
// vector of original-text ranges.
//
// A word is overwritten when more than half of its bytes were predicted
// correctly by the cheap predictor; the space that ends the word is kept.
//
// Params:
//   isrc, src_len  buffer to process in place; exactly src_len bytes are read
//   hash           in/out: running 12-bit hash (must be in 0..4095 on entry)
//   tbl            in/out: prediction table, indexed by the hash
//
// Returns the output length.
//
// A multi-byte character that is cut off by src_len is treated as a shorter
// character made of just the bytes that are present, so no byte beyond
// isrc + src_len is read or written by the scan and the returned length never
// exceeds src_len.
int CheapRepWordsInplaceOverwrite(char* isrc, int src_len, int* hash, int* tbl) {
  const unsigned char* src = reinterpret_cast<const unsigned char*>(isrc);
  const unsigned char* srclimit = src + src_len;
  char* dst = isrc;
  int local_hash = *hash;
  char* word_dst = dst;           // Start of next word
  int good_predict_bytes = 0;
  int word_length_bytes = 0;

  while (src < srclimit) {
    const unsigned int lead = src[0];
    *dst++ = static_cast<char>(lead);

    if (lead == ' ') {
      if ((good_predict_bytes * 2) > word_length_bytes) {
        // Word [word_dst..dst-1) is well-predicted: overwrite
        for (char* p = word_dst; p < dst - 1; ++p) {*p = '.';}
      }
      word_dst = dst;              // Start of next word
      good_predict_bytes = 0;
      word_length_bytes = 0;
    }

    // Character length implied by the lead byte. One-byte and stray
    // continuation bytes (00xxxxxx 01xxxxxx 10xxxxxx) count as length 1.
    int incr = 1;
    if (lead >= 0xc0) {
      if ((lead & 0xe0) == 0xc0) {
        incr = 2;
      } else if ((lead & 0xf0) == 0xe0) {
        incr = 3;
      } else {
        incr = 4;
      }
    }
    // Never consume bytes beyond the caller's limit.
    const int remaining = static_cast<int>(srclimit - src);
    if (incr > remaining) {incr = remaining;}

    // Copy the trailing bytes and pack the character big-endian. Done in
    // unsigned arithmetic: shifting a lead byte >= 0x80 left by 24 overflows
    // a signed int.
    unsigned int packed = lead;
    for (int i = 1; i < incr; ++i) {
      const unsigned int trail = src[i];
      *dst++ = static_cast<char>(trail);
      packed = (packed << 8) | trail;
    }
    const int c = static_cast<int>(packed);
    src += incr;
    word_length_bytes += incr;

    const int p = tbl[local_hash];      // Prediction
    tbl[local_hash] = c;                // Update prediction
    if (c == p) {
      good_predict_bytes += incr;       // Count good predictions
    }

    local_hash = static_cast<int>(
        ((static_cast<unsigned int>(local_hash) << 4) ^ packed) & 0xfff);
  }

  *hash = local_hash;

  // Same tail padding as CheapRepWordsInplace: it only fires when the output
  // ended up shorter than the input.
  if ((dst - isrc) < (src_len - 3)) {
    // Pad and make last char clean UTF-8 by putting following spaces
    dst[0] = ' ';
    dst[1] = ' ';
    dst[2] = ' ';
    dst[3] = '\0';
  } else  if ((dst - isrc) < src_len) {
    // Make last char clean UTF-8 by putting following space off the end
    dst[0] = ' ';
  }

  return static_cast<int>(dst - isrc);
}
704 
705 
706 // Remove portions of text that have a high density of spaces, or that are
707 // overly repetitive, squeezing the remaining text in-place to the front of the
708 // input buffer.
709 //
710 // Squeezing looks at density of space/prediced chars in fixed-size chunks,
711 // specified by chunksize. A chunksize <= 0 uses the default size of 48 bytes.
712 //
713 // Return the new, possibly-shorter length
714 //
715 // Result Buffer ALWAYS has leading space and trailing space space space NUL,
716 // if input does
717 //
CheapSqueezeInplace(char * isrc,int src_len,int ichunksize)718 int CheapSqueezeInplace(char* isrc,
719                                             int src_len,
720                                             int ichunksize) {
721   char* src = isrc;
722   char* dst = src;
723   char* srclimit = src + src_len;
724   bool skipping = false;
725 
726   int hash = 0;
727   // Allocate local prediction table.
728   int* predict_tbl = new int[kPredictionTableSize];
729   memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
730 
731   int chunksize = ichunksize;
732   if (chunksize == 0) {chunksize = kChunksizeDefault;}
733   int space_thresh = (chunksize * kSpacesThreshPercent) / 100;
734   int predict_thresh = (chunksize * kPredictThreshPercent) / 100;
735 
736   while (src < srclimit) {
737     int remaining_bytes = srclimit - src;
738     int len = minint(chunksize, remaining_bytes);
739     // Make len land us on a UTF-8 character boundary.
740     // Ah. Also fixes mispredict because we could get out of phase
741     // Loop always terminates at trailing space in buffer
742     while ((src[len] & 0xc0) == 0x80) {++len;}  // Move past continuation bytes
743 
744     int space_n = CountSpaces4(src, len);
745     int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl);
746     if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) {
747       // Skip the text
748       if (!skipping) {
749         // Keeping-to-skipping transition; do it at a space
750         int n = BackscanToSpace(dst, static_cast<int>(dst - isrc));
751         dst -= n;
752         if (dst == isrc) {
753           // Force a leading space if the first chunk is deleted
754           *dst++ = ' ';
755         }
756         if (FLAGS_cld_showme) {
757           // Mark the deletion point with black square U+25A0
758           *dst++ = static_cast<unsigned char>(0xe2);
759           *dst++ = static_cast<unsigned char>(0x96);
760           *dst++ = static_cast<unsigned char>(0xa0);
761           *dst++ = ' ';
762         }
763         skipping = true;
764       }
765     } else {
766       // Keep the text
767       if (skipping) {
768         // Skipping-to-keeping transition; do it at a space
769         int n = ForwardscanToSpace(src, len);
770         src += n;
771         remaining_bytes -= n;   // Shrink remaining length
772         len -= n;
773         skipping = false;
774       }
775       // "len" can be negative in some cases
776       if (len > 0) {
777         memmove(dst, src, len);
778         dst += len;
779       }
780     }
781     src += len;
782   }
783 
784   if ((dst - isrc) < (src_len - 3)) {
785     // Pad and make last char clean UTF-8 by putting following spaces
786     dst[0] = ' ';
787     dst[1] = ' ';
788     dst[2] = ' ';
789     dst[3] = '\0';
790   } else   if ((dst - isrc) < src_len) {
791     // Make last char clean UTF-8 by putting following space off the end
792     dst[0] = ' ';
793   }
794 
795   // Deallocate local prediction table
796   delete[] predict_tbl;
797   return static_cast<int>(dst - isrc);
798 }
799 
800 // This alternate form overwrites redundant words, thus avoiding corrupting the
801 // backmap for generate a vector of original-text ranges.
CheapSqueezeInplaceOverwrite(char * isrc,int src_len,int ichunksize)802 int CheapSqueezeInplaceOverwrite(char* isrc,
803                                             int src_len,
804                                             int ichunksize) {
805   char* src = isrc;
806   char* dst = src;
807   char* srclimit = src + src_len;
808   bool skipping = false;
809 
810   int hash = 0;
811   // Allocate local prediction table.
812   int* predict_tbl = new int[kPredictionTableSize];
813   memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
814 
815   int chunksize = ichunksize;
816   if (chunksize == 0) {chunksize = kChunksizeDefault;}
817   int space_thresh = (chunksize * kSpacesThreshPercent) / 100;
818   int predict_thresh = (chunksize * kPredictThreshPercent) / 100;
819 
820   // Always keep first byte (space)
821   ++src;
822   ++dst;
823   while (src < srclimit) {
824     int remaining_bytes = srclimit - src;
825     int len = minint(chunksize, remaining_bytes);
826     // Make len land us on a UTF-8 character boundary.
827     // Ah. Also fixes mispredict because we could get out of phase
828     // Loop always terminates at trailing space in buffer
829     while ((src[len] & 0xc0) == 0x80) {++len;}  // Move past continuation bytes
830 
831     int space_n = CountSpaces4(src, len);
832     int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl);
833     if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) {
834       // Overwrite the text [dst-n..dst)
835       if (!skipping) {
836         // Keeping-to-skipping transition; do it at a space
837         int n = BackscanToSpace(dst, static_cast<int>(dst - isrc));
838         // Text [word_dst..dst) is well-predicted: overwrite
839         for (char* p = dst - n; p < dst; ++p) {*p = '.';}
840         skipping = true;
841       }
842       // Overwrite the text [dst..dst+len)
843       for (char* p = dst; p < dst + len; ++p) {*p = '.';}
844       dst[len - 1] = ' ';    // Space at end so we can see what is happening
845     } else {
846       // Keep the text
847       if (skipping) {
848         // Skipping-to-keeping transition; do it at a space
849         int n = ForwardscanToSpace(src, len);
850         // Text [dst..dst+n) is well-predicted: overwrite
851         for (char* p = dst; p < dst + n - 1; ++p) {*p = '.';}
852         skipping = false;
853       }
854     }
855     dst += len;
856     src += len;
857   }
858 
859   if ((dst - isrc) < (src_len - 3)) {
860     // Pad and make last char clean UTF-8 by putting following spaces
861     dst[0] = ' ';
862     dst[1] = ' ';
863     dst[2] = ' ';
864     dst[3] = '\0';
865   } else   if ((dst - isrc) < src_len) {
866     // Make last char clean UTF-8 by putting following space off the end
867     dst[0] = ' ';
868   }
869 
870   // Deallocate local prediction table
871   delete[] predict_tbl;
872   return static_cast<int>(dst - isrc);
873 }
874 
875 // Timing 2.8GHz P4 (dsites 2008.03.20) with 170KB input
876 //  About 90 MB/sec, with or without memcpy, chunksize 48 or 4096
877 //  Just CountSpaces is about 340 MB/sec
878 //  Byte-only CountPredictedBytes is about 150 MB/sec
879 //  Byte-only CountPredictedBytes, conditional tbl[] = is about 85! MB/sec
880 //  Byte-only CountPredictedBytes is about 180 MB/sec, byte tbl, byte/int c
881 //  Unjammed byte-only both = 170 MB/sec
882 //  Jammed byte-only both = 120 MB/sec
883 //  Back to original w/slight updates, 110 MB/sec
884 //
CheapSqueezeTriggerTest(const char * src,int src_len,int testsize)885 bool CheapSqueezeTriggerTest(const char* src, int src_len, int testsize) {
886   // Don't trigger at all on short text
887   if (src_len < testsize) {return false;}
888   int space_thresh = (testsize * kSpacesTriggerPercent) / 100;
889   int predict_thresh = (testsize * kPredictTriggerPercent) / 100;
890   int hash = 0;
891   // Allocate local prediction table.
892   int* predict_tbl = new int[kPredictionTableSize];
893   memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
894 
895   bool retval = false;
896   if ((CountSpaces4(src, testsize) >= space_thresh) ||
897       (CountPredictedBytes(src, testsize, &hash, predict_tbl) >=
898        predict_thresh)) {
899     retval = true;
900   }
901   // Deallocate local prediction table
902   delete[] predict_tbl;
903   return retval;
904 }
905 
906 
907 
908 
909 // Delete any extended languages from doc_tote
RemoveExtendedLanguages(DocTote * doc_tote)910 void RemoveExtendedLanguages(DocTote* doc_tote) {
911   // Now a nop
912 }
913 
// Minimum reliable percent (stored reliability score divided by stored byte
// count) that a language needs in order to be kept as reliable.
static const int kMinReliableKeepPercent = 41;  // Remove lang if reli < this

// For Tier3 languages, require a minimum number of bytes to be first-place lang
static const int kGoodFirstT3MinBytes = 24;         // <this => no first
918 
919 // Move bytes for unreliable langs to another lang or UNKNOWN
920 // doc_tote is sorted, so cannot Add
921 //
922 // If both CHINESE and CHINESET are present and unreliable, do not delete both;
923 // merge both into CHINESE.
924 //
925 //dsites 2009.03.19
926 // we also want to remove Tier3 languages as the first lang if there is very
927 // little text like ej1 ej2 ej3 ej4
928 // maybe fold this back in earlier
929 //
RemoveUnreliableLanguages(DocTote * doc_tote,bool FLAGS_cld2_html,bool FLAGS_cld2_quiet)930 void RemoveUnreliableLanguages(DocTote* doc_tote,
931                                bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) {
932   // Prepass to merge some low-reliablility languages
933   // TODO: this shouldn't really reach in to the internal structure of doc_tote
934   int total_bytes = 0;
935   for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
936     int plang = doc_tote->Key(sub);
937     if (plang == DocTote::kUnusedKey) {continue;}               // Empty slot
938 
939     Language lang = static_cast<Language>(plang);
940     int bytes = doc_tote->Value(sub);
941     int reli = doc_tote->Reliability(sub);
942     if (bytes == 0) {continue;}                     // Zero bytes
943     total_bytes += bytes;
944 
945     // Reliable percent = stored reliable score over stored bytecount
946     int reliable_percent = reli / bytes;
947     if (reliable_percent >= kMinReliableKeepPercent) {continue;}   // Keeper
948 
949     // This language is too unreliable to keep, but we might merge it.
950     Language altlang = UNKNOWN_LANGUAGE;
951     if (lang <= HAWAIIAN) {altlang = kClosestAltLanguage[lang];}
952     if (altlang == UNKNOWN_LANGUAGE) {continue;}    // No alternative
953 
954     // Look for alternative in doc_tote
955     int altsub = doc_tote->Find(altlang);
956     if (altsub < 0) {continue;}                     // No alternative text
957 
958     int bytes2 = doc_tote->Value(altsub);
959     int reli2 = doc_tote->Reliability(altsub);
960     if (bytes2 == 0) {continue;}                    // Zero bytes
961 
962     // Reliable percent is stored reliable score over stored bytecount
963     int reliable_percent2 = reli2 / bytes2;
964 
965     // Merge one language into the other. Break ties toward lower lang #
966     int tosub = altsub;
967     int fromsub = sub;
968     bool into_lang = false;
969     if ((reliable_percent2 < reliable_percent) ||
970         ((reliable_percent2 == reliable_percent) && (lang < altlang))) {
971       tosub = sub;
972       fromsub = altsub;
973       into_lang = true;
974     }
975 
976     // Make sure merged reliability doesn't drop and is enough to avoid delete
977     int newpercent = maxint(reliable_percent, reliable_percent2);
978     newpercent = maxint(newpercent, kMinReliableKeepPercent);
979     int newbytes = bytes + bytes2;
980     int newreli = newpercent * newbytes;
981 
982     doc_tote->SetKey(fromsub, DocTote::kUnusedKey);
983     doc_tote->SetScore(fromsub, 0);
984     doc_tote->SetReliability(fromsub, 0);
985     doc_tote->SetScore(tosub, newbytes);
986     doc_tote->SetReliability(tosub, newreli);
987 
988     // Show fate of unreliable languages if at least 10 bytes
989     if (FLAGS_cld2_html && (newbytes >= 10) &&
990         !FLAGS_cld2_quiet) {
991       if (into_lang) {
992         fprintf(stderr, "{Unreli %s.%dR,%dB => %s} ",
993                 LanguageCode(altlang), reliable_percent2, bytes2,
994                 LanguageCode(lang));
995       } else {
996         fprintf(stderr, "{Unreli %s.%dR,%dB => %s} ",
997                 LanguageCode(lang), reliable_percent, bytes,
998                 LanguageCode(altlang));
999       }
1000     }
1001   }
1002 
1003 
1004   // Pass to delete any remaining unreliable languages
1005   for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
1006     int plang = doc_tote->Key(sub);
1007     if (plang == DocTote::kUnusedKey) {continue;}               // Empty slot
1008 
1009     Language lang = static_cast<Language>(plang);
1010     int bytes = doc_tote->Value(sub);
1011     int reli = doc_tote->Reliability(sub);
1012     if (bytes == 0) {continue;}                     // Zero bytes
1013 
1014     // Reliable percent is stored as reliable score over stored bytecount
1015     int reliable_percent = reli / bytes;
1016     if (reliable_percent >= kMinReliableKeepPercent) {  // Keeper?
1017        continue;                                        // yes
1018     }
1019 
1020     // Delete unreliable entry
1021     doc_tote->SetKey(sub, DocTote::kUnusedKey);
1022     doc_tote->SetScore(sub, 0);
1023     doc_tote->SetReliability(sub, 0);
1024 
1025     // Show fate of unreliable languages if at least 10 bytes
1026     if (FLAGS_cld2_html && (bytes >= 10) &&
1027         !FLAGS_cld2_quiet) {
1028       fprintf(stderr, "{Unreli %s.%dR,%dB} ",
1029               LanguageCode(lang), reliable_percent, bytes);
1030     }
1031   }
1032 
1033   ////if (FLAGS_cld2_html) {fprintf(stderr, "<br>\n");}
1034 }
1035 
1036 
1037 // Move all the text bytes from lower byte-count to higher one
MoveLang1ToLang2(Language lang1,Language lang2,int lang1_sub,int lang2_sub,DocTote * doc_tote,ResultChunkVector * resultchunkvector)1038 void MoveLang1ToLang2(Language lang1, Language lang2,
1039                       int lang1_sub, int lang2_sub,
1040                       DocTote* doc_tote,
1041                       ResultChunkVector* resultchunkvector) {
1042   // In doc_tote, move all the bytes lang1 => lang2
1043   int sum = doc_tote->Value(lang2_sub) + doc_tote->Value(lang1_sub);
1044   doc_tote->SetValue(lang2_sub, sum);
1045   sum = doc_tote->Score(lang2_sub) + doc_tote->Score(lang1_sub);
1046   doc_tote->SetScore(lang2_sub, sum);
1047   sum = doc_tote->Reliability(lang2_sub) + doc_tote->Reliability(lang1_sub);
1048   doc_tote->SetReliability(lang2_sub, sum);
1049 
1050   // Delete old entry
1051   doc_tote->SetKey(lang1_sub, DocTote::kUnusedKey);
1052   doc_tote->SetScore(lang1_sub, 0);
1053   doc_tote->SetReliability(lang1_sub, 0);
1054 
1055   // In resultchunkvector, move all the bytes lang1 => lang2
1056   if (resultchunkvector == NULL) {return;}
1057 
1058   int k = 0;
1059   uint16 prior_lang = UNKNOWN_LANGUAGE;
1060   for (int i = 0; i < static_cast<int>(resultchunkvector->size()); ++i) {
1061     ResultChunk* rc = &(*resultchunkvector)[i];
1062     if (rc->lang1 == lang1) {
1063       // Update entry[i] lang1 => lang2
1064       rc->lang1 = lang2;
1065     }
1066     // One change may produce two merges -- entry before and entry after
1067     if ((rc->lang1 == prior_lang) && (k > 0)) {
1068       // Merge with previous, deleting entry[i]
1069       ResultChunk* prior_rc = &(*resultchunkvector)[k - 1];
1070       prior_rc->bytes += rc->bytes;
1071       // fprintf(stderr, "MoveLang1ToLang2 merged [%d] => [%d]<br>\n", i, k-1);
1072     } else {
1073       // Keep entry[i]
1074       (*resultchunkvector)[k] = (*resultchunkvector)[i];
1075       // fprintf(stderr, "MoveLang1ToLang2 keep [%d] => [%d]<br>\n", i, k);
1076       ++k;
1077     }
1078     prior_lang = rc->lang1;
1079   }
1080   resultchunkvector->resize(k);
1081 }
1082 
1083 
1084 
1085 // Move less likely byte count to more likely for close pairs of languages
1086 // If given, also update resultchunkvector
RefineScoredClosePairs(DocTote * doc_tote,ResultChunkVector * resultchunkvector,bool FLAGS_cld2_html,bool FLAGS_cld2_quiet)1087 void RefineScoredClosePairs(DocTote* doc_tote,
1088                             ResultChunkVector* resultchunkvector,
1089                             bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) {
1090   for (int sub = 0; sub < doc_tote->MaxSize(); ++sub) {
1091     int close_packedlang = doc_tote->Key(sub);
1092     int subscr = LanguageCloseSet(static_cast<Language>(close_packedlang));
1093     if (subscr == 0) {continue;}
1094 
1095     // We have a close pair language -- if the other one is also scored and the
1096     // longword score differs enough, put all our eggs into one basket
1097 
1098     // Nonzero longword score: Go look for the other of this pair
1099     for (int sub2 = sub + 1; sub2 < doc_tote->MaxSize(); ++sub2) {
1100       if (LanguageCloseSet(static_cast<Language>(doc_tote->Key(sub2))) == subscr) {
1101         // We have a matching pair
1102         int close_packedlang2 = doc_tote->Key(sub2);
1103 
1104         // Move all the text bytes from lower byte-count to higher one
1105         int from_sub, to_sub;
1106         Language from_lang, to_lang;
1107         if (doc_tote->Value(sub) < doc_tote->Value(sub2)) {
1108           from_sub = sub;
1109           to_sub = sub2;
1110           from_lang = static_cast<Language>(close_packedlang);
1111           to_lang = static_cast<Language>(close_packedlang2);
1112         } else {
1113           from_sub = sub2;
1114           to_sub = sub;
1115           from_lang = static_cast<Language>(close_packedlang2);
1116           to_lang = static_cast<Language>(close_packedlang);
1117         }
1118 
1119         if ((FLAGS_cld2_html || FLAGS_dbgscore) && !FLAGS_cld2_quiet) {
1120           // Show fate of closepair language
1121           int val = doc_tote->Value(from_sub);           // byte count
1122           int reli = doc_tote->Reliability(from_sub);
1123           int reliable_percent = reli / (val ? val : 1);  // avoid zdiv
1124           fprintf(stderr, "{CloseLangPair: %s.%dR,%dB => %s}<br>\n",
1125                   LanguageCode(from_lang),
1126                   reliable_percent,
1127                   doc_tote->Value(from_sub),
1128                   LanguageCode(to_lang));
1129         }
1130         MoveLang1ToLang2(from_lang, to_lang, from_sub, to_sub,
1131                          doc_tote, resultchunkvector);
1132         break;    // Exit inner for sub2 loop
1133       }
1134     }     // End for sub2
1135   }   // End for sub
1136 }
1137 
1138 
ApplyAllLanguageHints(Tote * chunk_tote,int tote_grams,uint8 * lang_hint_boost)1139 void ApplyAllLanguageHints(Tote* chunk_tote, int tote_grams,
1140                         uint8* lang_hint_boost) {
1141 }
1142 
1143 
PrintHtmlEscapedText(FILE * f,const char * txt,int len)1144 void PrintHtmlEscapedText(FILE* f, const char* txt, int len) {
1145    string temp(txt, len);
1146    fprintf(f, "%s", GetHtmlEscapedText(temp).c_str());
1147 }
1148 
PrintLang(FILE * f,Tote * chunk_tote,Language cur_lang,bool cur_unreliable,Language prior_lang,bool prior_unreliable)1149 void PrintLang(FILE* f, Tote* chunk_tote,
1150               Language cur_lang, bool cur_unreliable,
1151               Language prior_lang, bool prior_unreliable) {
1152   if (cur_lang == prior_lang) {
1153     fprintf(f, "[]");
1154   } else {
1155     fprintf(f, "[%s%s]", LanguageCode(cur_lang), cur_unreliable ? "*" : "");
1156   }
1157 }
1158 
1159 
PrintTopLang(Language top_lang)1160 void PrintTopLang(Language top_lang) {
1161   if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) {
1162     fprintf(stderr, "[] ");
1163   } else {
1164     fprintf(stderr, "[%s] ", LanguageName(top_lang));
1165     prior_lang = top_lang;
1166   }
1167 }
1168 
PrintTopLangSpeculative(Language top_lang)1169 void PrintTopLangSpeculative(Language top_lang) {
1170   fprintf(stderr, "<span style=\"color:#%06X;\">", 0xa0a0a0);
1171   if ((top_lang == prior_lang) && (top_lang != UNKNOWN_LANGUAGE)) {
1172     fprintf(stderr, "[] ");
1173   } else {
1174     fprintf(stderr, "[%s] ", LanguageName(top_lang));
1175     prior_lang = top_lang;
1176   }
1177   fprintf(stderr, "</span>\n");
1178 }
1179 
PrintLangs(FILE * f,const Language * language3,const int * percent3,const int * text_bytes,const bool * is_reliable)1180 void PrintLangs(FILE* f, const Language* language3, const int* percent3,
1181                 const int* text_bytes, const bool* is_reliable) {
1182   fprintf(f, "<br>&nbsp;&nbsp;Initial_Languages ");
1183   if (language3[0] != UNKNOWN_LANGUAGE) {
1184     fprintf(f, "%s%s(%d%%)  ",
1185             LanguageName(language3[0]),
1186             *is_reliable ? "" : "*",
1187             percent3[0]);
1188   }
1189   if (language3[1] != UNKNOWN_LANGUAGE) {
1190     fprintf(f, "%s(%d%%)  ", LanguageName(language3[1]), percent3[1]);
1191   }
1192   if (language3[2] != UNKNOWN_LANGUAGE) {
1193     fprintf(f, "%s(%d%%)  ", LanguageName(language3[2]), percent3[2]);
1194   }
1195   fprintf(f, "%d bytes \n", *text_bytes);
1196 
1197   fprintf(f, "<br>\n");
1198 }
1199 
1200 
1201 // Return internal probability score (sum) per 1024 bytes
GetNormalizedScore(Language lang,ULScript ulscript,int bytecount,int score)1202 double GetNormalizedScore(Language lang, ULScript ulscript,
1203                           int bytecount, int score) {
1204   if (bytecount <= 0) {return 0.0;}
1205   return (score << 10) / bytecount;
1206 }
1207 
1208 // Extract return values before fixups
ExtractLangEtc(DocTote * doc_tote,int total_text_bytes,int * reliable_percent3,Language * language3,int * percent3,double * normalized_score3,int * text_bytes,bool * is_reliable)1209 void ExtractLangEtc(DocTote* doc_tote, int total_text_bytes,
1210                     int* reliable_percent3, Language* language3, int* percent3,
1211                     double*  normalized_score3,
1212                     int* text_bytes, bool* is_reliable) {
1213   reliable_percent3[0] = 0;
1214   reliable_percent3[1] = 0;
1215   reliable_percent3[2] = 0;
1216   language3[0] = UNKNOWN_LANGUAGE;
1217   language3[1] = UNKNOWN_LANGUAGE;
1218   language3[2] = UNKNOWN_LANGUAGE;
1219   percent3[0] = 0;
1220   percent3[1] = 0;
1221   percent3[2] = 0;
1222   normalized_score3[0] = 0.0;
1223   normalized_score3[1] = 0.0;
1224   normalized_score3[2] = 0.0;
1225 
1226   *text_bytes = total_text_bytes;
1227   *is_reliable = false;
1228 
1229   int bytecount1 = 0;
1230   int bytecount2 = 0;
1231   int bytecount3 = 0;
1232 
1233   int lang1 = doc_tote->Key(0);
1234   if ((lang1 != DocTote::kUnusedKey) && (lang1 != UNKNOWN_LANGUAGE)) {
1235     // We have a top language
1236     language3[0] = static_cast<Language>(lang1);
1237     bytecount1 = doc_tote->Value(0);
1238     int reli1 = doc_tote->Reliability(0);
1239     reliable_percent3[0] = reli1 / (bytecount1 ? bytecount1 : 1);  // avoid zdiv
1240     normalized_score3[0] = GetNormalizedScore(language3[0],
1241                                                   ULScript_Common,
1242                                                   bytecount1,
1243                                                   doc_tote->Score(0));
1244   }
1245 
1246   int lang2 = doc_tote->Key(1);
1247   if ((lang2 != DocTote::kUnusedKey) && (lang2 != UNKNOWN_LANGUAGE)) {
1248     language3[1] = static_cast<Language>(lang2);
1249     bytecount2 = doc_tote->Value(1);
1250     int reli2 = doc_tote->Reliability(1);
1251     reliable_percent3[1] = reli2 / (bytecount2 ? bytecount2 : 1);  // avoid zdiv
1252     normalized_score3[1] = GetNormalizedScore(language3[1],
1253                                                   ULScript_Common,
1254                                                   bytecount2,
1255                                                   doc_tote->Score(1));
1256   }
1257 
1258   int lang3 = doc_tote->Key(2);
1259   if ((lang3 != DocTote::kUnusedKey) && (lang3 != UNKNOWN_LANGUAGE)) {
1260     language3[2] = static_cast<Language>(lang3);
1261     bytecount3 = doc_tote->Value(2);
1262     int reli3 = doc_tote->Reliability(2);
1263     reliable_percent3[2] = reli3 / (bytecount3 ? bytecount3 : 1);  // avoid zdiv
1264     normalized_score3[2] = GetNormalizedScore(language3[2],
1265                                                   ULScript_Common,
1266                                                   bytecount3,
1267                                                   doc_tote->Score(2));
1268   }
1269 
1270   // Increase total bytes to sum (top 3) if low for some reason
1271   int total_bytecount12 = bytecount1 + bytecount2;
1272   int total_bytecount123 = total_bytecount12 + bytecount3;
1273   if (total_text_bytes < total_bytecount123) {
1274     total_text_bytes = total_bytecount123;
1275     *text_bytes = total_text_bytes;
1276   }
1277 
1278   // Sum minus previous % gives better roundoff behavior than bytecount/total
1279   int total_text_bytes_div = maxint(1, total_text_bytes);    // Avoid zdiv
1280   percent3[0] = (bytecount1 * 100) / total_text_bytes_div;
1281   percent3[1] = (total_bytecount12 * 100) / total_text_bytes_div;
1282   percent3[2] = (total_bytecount123 * 100) / total_text_bytes_div;
1283   percent3[2] -= percent3[1];
1284   percent3[1] -= percent3[0];
1285 
1286   // Roundoff, say 96% 1.6% 1.4%, will produce non-obvious 96% 1% 2%
1287   // Fix this explicitly
1288   if (percent3[1] < percent3[2]) {
1289     ++percent3[1];
1290     --percent3[2];
1291   }
1292   if (percent3[0] < percent3[1]) {
1293     ++percent3[0];
1294     --percent3[1];
1295   }
1296 
1297   *text_bytes = total_text_bytes;
1298 
1299   if ((lang1 != DocTote::kUnusedKey) && (lang1 != UNKNOWN_LANGUAGE)) {
1300     // We have a top language
1301     // Its reliability is overall result reliability
1302     int bytecount = doc_tote->Value(0);
1303     int reli = doc_tote->Reliability(0);
1304     int reliable_percent = reli / (bytecount ? bytecount : 1);  // avoid zdiv
1305     *is_reliable = (reliable_percent >= kMinReliableKeepPercent);
1306   } else {
1307     // No top language at all. This can happen with zero text or 100% Klingon
1308     // if extended=false. Just return all UNKNOWN_LANGUAGE, unreliable.
1309     *is_reliable = false;
1310   }
1311 
1312   // If ignore percent is too large, set unreliable.
1313   int ignore_percent = 100 - (percent3[0] + percent3[1] + percent3[2]);
1314   if ((ignore_percent > kIgnoreMaxPercent)) {
1315     *is_reliable = false;
1316   }
1317 }
1318 
IsFIGS(Language lang)1319 bool IsFIGS(Language lang) {
1320   if (lang == FRENCH) {return true;}
1321   if (lang == ITALIAN) {return true;}
1322   if (lang == GERMAN) {return true;}
1323   if (lang == SPANISH) {return true;}
1324   return false;
1325 }
1326 
IsEFIGS(Language lang)1327 bool IsEFIGS(Language lang) {
1328   if (lang == ENGLISH) {return true;}
1329   if (lang == FRENCH) {return true;}
1330   if (lang == ITALIAN) {return true;}
1331   if (lang == GERMAN) {return true;}
1332   if (lang == SPANISH) {return true;}
1333   return false;
1334 }
1335 
// Minimum estimated byte counts for a second-place language to be allowed to
// override the first-place language.
// For Tier3 languages, require more bytes of text to override
// the first-place language
// NOTE(review): only kGoodSecondT1T2MinBytes is referenced by CalcSummaryLang
// below; confirm whether kGoodSecondT3MinBytes is used elsewhere.
static const int kGoodSecondT1T2MinBytes = 15;        // <this => no second
static const int kGoodSecondT3MinBytes = 128;         // <this => no second
1340 
1341 // Calculate a single summary language for the document, and its reliability.
1342 // Returns language3[0] or language3[1] or ENGLISH or UNKNOWN_LANGUAGE
1343 // This is the heart of matching human-rater perception.
1344 // reliable_percent3[] is currently unused
1345 //
1346 // Do not return Tier3 second language unless there are at least 128 bytes
CalcSummaryLang(DocTote * doc_tote,int total_text_bytes,const int * reliable_percent3,const Language * language3,const int * percent3,Language * summary_lang,bool * is_reliable,bool FLAGS_cld2_html,bool FLAGS_cld2_quiet)1347 void CalcSummaryLang(DocTote* doc_tote, int total_text_bytes,
1348                      const int* reliable_percent3,
1349                      const Language* language3,
1350                      const int* percent3,
1351                      Language* summary_lang, bool* is_reliable,
1352                      bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) {
1353   // Vector of active languages; changes if we delete some
1354   int slot_count = 3;
1355   int active_slot[3] = {0, 1, 2};
1356 
1357   int ignore_percent = 0;
1358   int return_percent = percent3[0];   // Default to top lang
1359   *summary_lang = language3[0];
1360   *is_reliable = true;
1361   if (percent3[0] < kKeepMinPercent) {*is_reliable = false;}
1362 
1363   // If any of top 3 is IGNORE, remove it and increment ignore_percent
1364   for (int i = 0; i < 3; ++i) {
1365     if (language3[i] == TG_UNKNOWN_LANGUAGE) {
1366       ignore_percent += percent3[i];
1367       // Move the rest up, levaing input vectors unchanged
1368       for (int j=i+1; j < 3; ++j) {
1369         active_slot[j - 1] = active_slot[j];
1370       }
1371       -- slot_count;
1372       // Logically remove Ignore from percentage-text calculation
1373       // (extra 1 in 101 avoids zdiv, biases slightly small)
1374       return_percent = (percent3[0] * 100) / (101 - ignore_percent);
1375       *summary_lang = language3[active_slot[0]];
1376       if (percent3[active_slot[0]] < kKeepMinPercent) {*is_reliable = false;}
1377     }
1378   }
1379 
1380 
1381   // If English and X, where X (not UNK) is big enough,
1382   // assume the English is boilerplate and return X.
1383   // Logically remove English from percentage-text calculation
1384   int second_bytes = (total_text_bytes * percent3[active_slot[1]]) / 100;
1385   // Require more bytes of text for Tier3 languages
1386   int minbytesneeded = kGoodSecondT1T2MinBytes;
1387   int plang_second = PerScriptNumber(ULScript_Latin, language3[active_slot[1]]);
1388 
1389   if ((language3[active_slot[0]] == ENGLISH) &&
1390       (language3[active_slot[1]] != ENGLISH) &&
1391       (language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&
1392       (percent3[active_slot[1]] >= kNonEnBoilerplateMinPercent) &&
1393       (second_bytes >= minbytesneeded)) {
1394     ignore_percent += percent3[active_slot[0]];
1395     return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);
1396     *summary_lang = language3[active_slot[1]];
1397     if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}
1398 
1399   // Else If FIGS and X, where X (not UNK, EFIGS) is big enough,
1400   // assume the FIGS is boilerplate and return X.
1401   // Logically remove FIGS from percentage-text calculation
1402   } else if (IsFIGS(language3[active_slot[0]]) &&
1403              !IsEFIGS(language3[active_slot[1]]) &&
1404              (language3[active_slot[1]] != UNKNOWN_LANGUAGE) &&
1405              (percent3[active_slot[1]] >= kNonFIGSBoilerplateMinPercent) &&
1406              (second_bytes >= minbytesneeded)) {
1407     ignore_percent += percent3[active_slot[0]];
1408     return_percent = (percent3[active_slot[1]] * 100) / (101 - ignore_percent);
1409     *summary_lang = language3[active_slot[1]];
1410     if (percent3[active_slot[1]] < kKeepMinPercent) {*is_reliable = false;}
1411 
1412   // Else we are returning the first language, but want to improve its
1413   // return_percent if the second language should be ignored
1414   } else  if ((language3[active_slot[1]] == ENGLISH) &&
1415               (language3[active_slot[0]] != ENGLISH)) {
1416     ignore_percent += percent3[active_slot[1]];
1417     return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
1418   } else  if (IsFIGS(language3[active_slot[1]]) &&
1419               !IsEFIGS(language3[active_slot[0]])) {
1420     ignore_percent += percent3[active_slot[1]];
1421     return_percent = (percent3[active_slot[0]] * 100) / (101 - ignore_percent);
1422   }
1423 
1424   // If return percent is too small (too many languages), return UNKNOWN
1425   if ((return_percent < kGoodFirstMinPercent)) {
1426     if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
1427       fprintf(stderr, "{Unreli %s %d%% percent too small} ",
1428               LanguageCode(*summary_lang), return_percent);
1429     }
1430     *summary_lang = UNKNOWN_LANGUAGE;
1431     *is_reliable = false;
1432   }
1433 
1434   // If return percent is small, return language but set unreliable.
1435   if ((return_percent < kGoodFirstReliableMinPercent)) {
1436     *is_reliable = false;
1437   }
1438 
1439   // If ignore percent is too large, set unreliable.
1440   ignore_percent = 100 - (percent3[0] + percent3[1] + percent3[2]);
1441   if ((ignore_percent > kIgnoreMaxPercent)) {
1442     *is_reliable = false;
1443   }
1444 
1445   // If we removed all the active languages, return UNKNOWN
1446   if (slot_count == 0) {
1447     if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
1448       fprintf(stderr, "{Unreli %s no languages left} ",
1449               LanguageCode(*summary_lang));
1450     }
1451     *summary_lang = UNKNOWN_LANGUAGE;
1452     *is_reliable = false;
1453   }
1454 }
1455 
AddLangPriorBoost(Language lang,uint32 langprob,ScoringContext * scoringcontext)1456 void AddLangPriorBoost(Language lang, uint32 langprob,
1457                        ScoringContext* scoringcontext) {
1458   // This is called 0..n times with language hints
1459   // but we don't know the script -- so boost either or both Latn, Othr.
1460 
1461   if (IsLatnLanguage(lang)) {
1462     LangBoosts* langprior_boost = &scoringcontext->langprior_boost.latn;
1463     int n = langprior_boost->n;
1464     langprior_boost->langprob[n] = langprob;
1465     langprior_boost->n = langprior_boost->wrap(n + 1);
1466   }
1467 
1468   if (IsOthrLanguage(lang)) {
1469     LangBoosts* langprior_boost = &scoringcontext->langprior_boost.othr;
1470     int n = langprior_boost->n;
1471     langprior_boost->langprob[n] = langprob;
1472     langprior_boost->n = langprior_boost->wrap(n + 1);
1473   }
1474 
1475 }
1476 
AddOneWhack(Language whacker_lang,Language whackee_lang,ScoringContext * scoringcontext)1477 void AddOneWhack(Language whacker_lang, Language whackee_lang,
1478                  ScoringContext* scoringcontext) {
1479   uint32 langprob = MakeLangProb(whackee_lang, 1);
1480   // This logic avoids hr-Latn whacking sr-Cyrl, but still whacks sr-Latn
1481   if (IsLatnLanguage(whacker_lang) && IsLatnLanguage(whackee_lang)) {
1482     LangBoosts* langprior_whack = &scoringcontext->langprior_whack.latn;
1483     int n = langprior_whack->n;
1484     langprior_whack->langprob[n] = langprob;
1485     langprior_whack->n = langprior_whack->wrap(n + 1);
1486   }
1487   if (IsOthrLanguage(whacker_lang) && IsOthrLanguage(whackee_lang)) {
1488     LangBoosts* langprior_whack = &scoringcontext->langprior_whack.othr;
1489     int n = langprior_whack->n;
1490     langprior_whack->langprob[n] = langprob;
1491     langprior_whack->n = langprior_whack->wrap(n + 1);
1492  }
1493 }
1494 
AddCloseLangWhack(Language lang,ScoringContext * scoringcontext)1495 void AddCloseLangWhack(Language lang, ScoringContext* scoringcontext) {
1496   // We do not in general want zh-Hans and zh-Hant to be close pairs,
1497   // but we do here.
1498   if (lang == CLD2::CHINESE) {
1499     AddOneWhack(lang, CLD2::CHINESE_T, scoringcontext);
1500     return;
1501   }
1502   if (lang == CLD2::CHINESE_T) {
1503     AddOneWhack(lang, CLD2::CHINESE, scoringcontext);
1504     return;
1505   }
1506 
1507   int base_lang_set = LanguageCloseSet(lang);
1508   if (base_lang_set == 0) {return;}
1509   // TODO: add an explicit list of each set to avoid this 512-times loop
1510   for (int i = 0; i < kLanguageToPLangSize; ++i) {
1511     Language lang2 = static_cast<Language>(i);
1512     if ((base_lang_set == LanguageCloseSet(lang2)) && (lang != lang2)) {
1513       AddOneWhack(lang, lang2, scoringcontext);
1514     }
1515   }
1516 }
1517 
1518 
ApplyHints(const char * buffer,int buffer_length,bool is_plain_text,const CLDHints * cld_hints,ScoringContext * scoringcontext)1519 void ApplyHints(const char* buffer,
1520                 int buffer_length,
1521                 bool is_plain_text,
1522                 const CLDHints* cld_hints,
1523                 ScoringContext* scoringcontext) {
1524   CLDLangPriors lang_priors;
1525   InitCLDLangPriors(&lang_priors);
1526 
1527   // We now use lang= tags.
1528   // Last look, circa 2008 found only 15% of web pages with lang= tags and
1529   // many of those were wrong. Now (July 2011), we find 44% of web pages have
1530   // lang= tags, and most of them are correct. So we now give them substantial
1531   // weight in each chunk scored.
1532   if (!is_plain_text) {
1533     // Get any contained language tags in first n KB
1534     int32 max_scan_bytes = FLAGS_cld_max_lang_tag_scan_kb << 10;
1535     string lang_tags = GetLangTagsFromHtml(buffer, buffer_length,
1536                                            max_scan_bytes);
1537     SetCLDLangTagsHint(lang_tags, &lang_priors);
1538     if (scoringcontext->flags_cld2_html) {
1539       if (!lang_tags.empty()) {
1540         fprintf(scoringcontext->debug_file, "<br>lang_tags '%s'<br>\n",
1541                 lang_tags.c_str());
1542       }
1543     }
1544   }
1545 
1546   if (cld_hints != NULL) {
1547     if ((cld_hints->content_language_hint != NULL) &&
1548         (cld_hints->content_language_hint[0] != '\0')) {
1549       SetCLDContentLangHint(cld_hints->content_language_hint, &lang_priors);
1550     }
1551 
1552     // Input is from GetTLD(), already lowercased
1553     if ((cld_hints->tld_hint != NULL) && (cld_hints->tld_hint[0] != '\0')) {
1554       SetCLDTLDHint(cld_hints->tld_hint, &lang_priors);
1555     }
1556 
1557     if (cld_hints->encoding_hint != UNKNOWN_ENCODING) {
1558       Encoding enc = static_cast<Encoding>(cld_hints->encoding_hint);
1559       SetCLDEncodingHint(enc, &lang_priors);
1560     }
1561 
1562     if (cld_hints->language_hint != UNKNOWN_LANGUAGE) {
1563       SetCLDLanguageHint(cld_hints->language_hint, &lang_priors);
1564     }
1565   }
1566 
1567   // Keep no more than four different languages with hints
1568   TrimCLDLangPriors(4, &lang_priors);
1569 
1570   if (scoringcontext->flags_cld2_html) {
1571     string print_temp = DumpCLDLangPriors(&lang_priors);
1572     if (!print_temp.empty()) {
1573       fprintf(scoringcontext->debug_file, "DumpCLDLangPriors %s<br>\n",
1574               print_temp.c_str());
1575     }
1576   }
1577 
1578   // Put boosts into ScoringContext
1579   for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {
1580     Language lang = GetCLDPriorLang(lang_priors.prior[i]);
1581     int qprob = GetCLDPriorWeight(lang_priors.prior[i]);
1582     if (qprob > 0) {
1583       uint32 langprob = MakeLangProb(lang, qprob);
1584       AddLangPriorBoost(lang, langprob, scoringcontext);
1585     }
1586   }
1587 
1588   // Put whacks into scoring context
1589   // We do not in general want zh-Hans and zh-Hant to be close pairs,
1590   // but we do here. Use close_set_count[kCloseSetSize] to count zh, zh-Hant
1591   std::vector<int> close_set_count(kCloseSetSize + 1, 0);
1592 
1593   for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {
1594     Language lang = GetCLDPriorLang(lang_priors.prior[i]);
1595     ++close_set_count[LanguageCloseSet(lang)];
1596     if (lang == CLD2::CHINESE) {++close_set_count[kCloseSetSize];}
1597     if (lang == CLD2::CHINESE_T) {++close_set_count[kCloseSetSize];}
1598   }
1599 
1600   // If a boost language is in a close set, force suppressing the others in
1601   // that set, if exactly one of the set is present
1602   for (int i = 0; i < GetCLDLangPriorCount(&lang_priors); ++i) {
1603     Language lang = GetCLDPriorLang(lang_priors.prior[i]);
1604     int qprob = GetCLDPriorWeight(lang_priors.prior[i]);
1605     if (qprob > 0) {
1606       int close_set = LanguageCloseSet(lang);
1607       if ((close_set > 0) && (close_set_count[close_set] == 1)) {
1608         AddCloseLangWhack(lang, scoringcontext);
1609       }
1610       if (((lang == CLD2::CHINESE) || (lang == CLD2::CHINESE_T)) &&
1611           (close_set_count[kCloseSetSize] == 1)) {
1612         AddCloseLangWhack(lang, scoringcontext);
1613       }
1614     }
1615   }
1616 
1617 
1618 
1619 
1620 
1621 
1622 }
1623 
1624 
1625 
1626 // Results language3/percent3/text_bytes must be exactly three items
DetectLanguageSummaryV2(const char * buffer,int buffer_length,bool is_plain_text,const CLDHints * cld_hints,bool allow_extended_lang,int flags,Language plus_one,Language * language3,int * percent3,double * normalized_score3,ResultChunkVector * resultchunkvector,int * text_bytes,bool * is_reliable)1627 Language DetectLanguageSummaryV2(
1628                         const char* buffer,
1629                         int buffer_length,
1630                         bool is_plain_text,
1631                         const CLDHints* cld_hints,
1632                         bool allow_extended_lang,
1633                         int flags,
1634                         Language plus_one,
1635                         Language* language3,
1636                         int* percent3,
1637                         double* normalized_score3,
1638                         ResultChunkVector* resultchunkvector,
1639                         int* text_bytes,
1640                         bool* is_reliable) {
1641   language3[0] = UNKNOWN_LANGUAGE;
1642   language3[1] = UNKNOWN_LANGUAGE;
1643   language3[2] = UNKNOWN_LANGUAGE;
1644   percent3[0] = 0;
1645   percent3[1] = 0;
1646   percent3[2] = 0;
1647   normalized_score3[0] = 0.0;
1648   normalized_score3[1] = 0.0;
1649   normalized_score3[2] = 0.0;
1650   if (resultchunkvector != NULL) {
1651     resultchunkvector->clear();
1652   }
1653   *text_bytes = 0;
1654   *is_reliable = false;
1655 
1656   if ((flags & kCLDFlagEcho) != 0) {
1657      string temp(buffer, buffer_length);
1658      if ((flags & kCLDFlagHtml) != 0) {
1659         fprintf(stderr, "CLD2[%d] '%s'<br>\n",
1660                 buffer_length, GetHtmlEscapedText(temp).c_str());
1661      } else {
1662         fprintf(stderr, "CLD2[%d] '%s'\n",
1663                 buffer_length, GetPlainEscapedText(temp).c_str());
1664      }
1665   }
1666 
1667 #ifdef CLD2_DYNAMIC_MODE
1668   // In dynamic mode, we immediately return UNKNOWN_LANGUAGE if the data file
1669   // hasn't been loaded yet. This is the only sane thing we can do, as there
1670   // are no scoring tables to consult.
1671   bool dataLoaded = isDataLoaded();
1672   if ((flags & kCLDFlagVerbose) != 0) {
1673     fprintf(stderr, "Data loaded: %s\n", (dataLoaded ? "true" : "false"));
1674   }
1675   if (!dataLoaded) {
1676     return UNKNOWN_LANGUAGE;
1677   }
1678 #endif
1679 
1680   // Exit now if no text
1681   if (buffer_length == 0) {return UNKNOWN_LANGUAGE;}
1682   if (kScoringtables.quadgram_obj == NULL) {return UNKNOWN_LANGUAGE;}
1683 
1684   // Document totals
1685   DocTote doc_tote;   // Reliability = 0..100
1686 
1687   // ScoringContext carries state across scriptspans
1688   ScoringContext scoringcontext;
1689   scoringcontext.debug_file = stderr;
1690   scoringcontext.flags_cld2_score_as_quads =
1691     ((flags & kCLDFlagScoreAsQuads) != 0);
1692   scoringcontext.flags_cld2_html = ((flags & kCLDFlagHtml) != 0);
1693   scoringcontext.flags_cld2_cr = ((flags & kCLDFlagCr) != 0);
1694   scoringcontext.flags_cld2_verbose = ((flags & kCLDFlagVerbose) != 0);
1695   scoringcontext.prior_chunk_lang = UNKNOWN_LANGUAGE;
1696   scoringcontext.ulscript = ULScript_Common;
1697   scoringcontext.scoringtables = &kScoringtables;
1698   scoringcontext.scanner = NULL;
1699   scoringcontext.init();            // Clear the internal memory arrays
1700 
1701   // Now thread safe.
1702   bool FLAGS_cld2_html = ((flags & kCLDFlagHtml) != 0);
1703   bool FLAGS_cld2_quiet = ((flags & kCLDFlagQuiet) != 0);
1704 
1705   ApplyHints(buffer, buffer_length, is_plain_text, cld_hints, &scoringcontext);
1706 
1707   // Four individual script totals, Latin, Han, other2, other3
1708   int next_other_tote = 2;
1709   int tote_num = 0;
1710 
1711   // Four totes for up to four different scripts pending at once
1712   Tote totes[4];                  // [0] Latn  [1] Hani  [2] other  [3] other
1713   bool tote_seen[4] = {false, false, false, false};
1714   int tote_grams[4] = {0, 0, 0, 0};     // Number in partial chunk
1715   ULScript tote_script[4] =
1716     {ULScript_Latin, ULScript_Hani, ULScript_Common, ULScript_Common};
1717 
1718   // Loop through text spans in a single script
1719   ScriptScanner ss(buffer, buffer_length, is_plain_text);
1720   LangSpan scriptspan;
1721 
1722   scoringcontext.scanner = &ss;
1723 
1724   scriptspan.text = NULL;
1725   scriptspan.text_bytes = 0;
1726   scriptspan.offset = 0;
1727   scriptspan.ulscript = ULScript_Common;
1728   scriptspan.lang = UNKNOWN_LANGUAGE;
1729 
1730   int total_text_bytes = 0;
1731   int textlimit = FLAGS_cld_textlimit << 10;    // in KB
1732   if (textlimit == 0) {textlimit = 0x7fffffff;}
1733 
1734   int advance_by = 2;                   // Advance 2 bytes
1735   int advance_limit = textlimit >> 3;   // For first 1/8 of max document
1736 
1737   int initial_word_span = kDefaultWordSpan;
1738   if (FLAGS_cld_forcewords) {
1739     initial_word_span = kReallyBigWordSpan;
1740   }
1741 
1742   // Pick up chunk sizes
1743   // Smoothwidth is units of quadgrams, about 2.5 chars (unigrams) each
1744   // Sanity check -- force into a reasonable range
1745   int chunksizequads = FLAGS_cld_smoothwidth;
1746   chunksizequads = minint(maxint(chunksizequads, kMinChunkSizeQuads),
1747                                kMaxChunkSizeQuads);
1748   int chunksizeunis = (chunksizequads * 5) >> 1;
1749 
1750   // Varying short-span limit doesn't work well -- skips too much beyond 20KB
1751   // int spantooshortlimit = advance_by * FLAGS_cld_smoothwidth;
1752   int spantooshortlimit = kShortSpanThresh;
1753 
1754   // For debugging only. Not thread-safe
1755   prior_lang = UNKNOWN_LANGUAGE;
1756   prior_unreliable = false;
1757 
1758   // Allocate full-document prediction table for finding repeating words
1759   int hash = 0;
1760   int* predict_tbl = new int[kPredictionTableSize];
1761   if (FlagRepeats(flags)) {
1762     memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
1763   }
1764 
1765 
1766 
1767   // Loop through scriptspans accumulating number of text bytes in each language
1768   while (ss.GetOneScriptSpanLower(&scriptspan)) {
1769     ULScript ulscript = scriptspan.ulscript;
1770 
1771     // Squeeze out big chunks of text span if asked to
1772     if (FlagSqueeze(flags)) {
1773       // Remove repetitive or mostly-spaces chunks
1774       int newlen;
1775       int chunksize = 0;    // Use the default
1776       if (resultchunkvector != NULL) {
1777          newlen = CheapSqueezeInplaceOverwrite(scriptspan.text,
1778                                                scriptspan.text_bytes,
1779                                                chunksize);
1780       } else {
1781          newlen = CheapSqueezeInplace(scriptspan.text, scriptspan.text_bytes,
1782                                       chunksize);
1783       }
1784       scriptspan.text_bytes = newlen;
1785     } else {
1786       // Check now and then to see if we should be squeezing
1787       if (((kCheapSqueezeTestThresh >> 1) < scriptspan.text_bytes) &&
1788           !FlagFinish(flags)) {
1789         // fprintf(stderr, "CheapSqueezeTriggerTest, "
1790         //                 "first %d bytes of %d (>%d/2)<br>\n",
1791         //         kCheapSqueezeTestLen,
1792         //         scriptspan.text_bytes,
1793         //         kCheapSqueezeTestThresh);
1794 
1795         if (CheapSqueezeTriggerTest(scriptspan.text,
1796                                       scriptspan.text_bytes,
1797                                       kCheapSqueezeTestLen)) {
1798           // Recursive call with big-chunk squeezing set
1799           if (FLAGS_cld2_html || FLAGS_dbgscore) {
1800             fprintf(stderr,
1801                     "<br>---text_bytes[%d] Recursive(Squeeze)---<br><br>\n",
1802                     total_text_bytes);
1803           }
1804           // Deallocate full-document prediction table
1805           delete[] predict_tbl;
1806 
1807           return DetectLanguageSummaryV2(
1808                             buffer,
1809                             buffer_length,
1810                             is_plain_text,
1811                             cld_hints,
1812                             allow_extended_lang,
1813                             flags | kCLDFlagSqueeze,
1814                             plus_one,
1815                             language3,
1816                             percent3,
1817                             normalized_score3,
1818                             resultchunkvector,
1819                             text_bytes,
1820                             is_reliable);
1821         }
1822       }
1823     }
1824 
1825     // Remove repetitive words if asked to
1826     if (FlagRepeats(flags)) {
1827       // Remove repetitive words
1828       int newlen;
1829       if (resultchunkvector != NULL) {
1830         newlen = CheapRepWordsInplaceOverwrite(scriptspan.text,
1831                                                scriptspan.text_bytes,
1832                                                &hash, predict_tbl);
1833       } else {
1834         newlen = CheapRepWordsInplace(scriptspan.text, scriptspan.text_bytes,
1835                                       &hash, predict_tbl);
1836       }
1837       scriptspan.text_bytes = newlen;
1838     }
1839 
1840     // Scoring depends on scriptspan buffer ALWAYS having
1841     // leading space and off-the-end space space space NUL,
1842     // DCHECK(scriptspan.text[0] == ' ');
1843     // DCHECK(scriptspan.text[scriptspan.text_bytes + 0] == ' ');
1844     // DCHECK(scriptspan.text[scriptspan.text_bytes + 1] == ' ');
1845     // DCHECK(scriptspan.text[scriptspan.text_bytes + 2] == ' ');
1846     // DCHECK(scriptspan.text[scriptspan.text_bytes + 3] == '\0');
1847 
1848     // The real scoring
1849     // Accumulate directly into the document total, or accmulate in one of four
1850     // chunk totals. The purpose of the multiple chunk totals is to piece
1851     // together short choppy pieces of text in alternating scripts. One total is
1852     // dedicated to Latin text, one to Han text, and the other two are dynamicly
1853     // assigned.
1854 
1855     scoringcontext.ulscript = scriptspan.ulscript;
1856     // FLAGS_cld2_html = scoringcontext.flags_cld2_html;
1857 
1858     ScoreOneScriptSpan(scriptspan,
1859                        &scoringcontext,
1860                        &doc_tote,
1861                        resultchunkvector);
1862 
1863     total_text_bytes += scriptspan.text_bytes;
1864   }     // End while (ss.GetOneScriptSpanLower())
1865 
1866   // Deallocate full-document prediction table
1867   delete[] predict_tbl;
1868 
1869   if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
1870     // If no forced <cr>, put one in front of dump
1871     if (!scoringcontext.flags_cld2_cr) {fprintf(stderr, "<br>\n");}
1872     doc_tote.Dump(stderr);
1873   }
1874 
1875 
1876   // If extended langauges are disallowed, remove them here
1877   if (!allow_extended_lang) {
1878     RemoveExtendedLanguages(&doc_tote);
1879   }
1880 
1881   // Force close pairs to one or the other
1882   // If given, also update resultchunkvector
1883   RefineScoredClosePairs(&doc_tote, resultchunkvector,
1884                          FLAGS_cld2_html, FLAGS_cld2_quiet);
1885 
1886 
1887   // Calculate return results
1888   // Find top three byte counts in tote heap
1889   int reliable_percent3[3];
1890 
1891   // Cannot use Add, etc. after sorting
1892   doc_tote.Sort(3);
1893 
1894   ExtractLangEtc(&doc_tote, total_text_bytes,
1895                  reliable_percent3, language3, percent3, normalized_score3,
1896                  text_bytes, is_reliable);
1897 
1898   bool have_good_answer = false;
1899   if (FlagFinish(flags)) {
1900     // Force a result
1901     have_good_answer = true;
1902   } else if (total_text_bytes <= kShortTextThresh) {
1903     // Don't recurse on short text -- we already did word scores
1904     have_good_answer = true;
1905   } else if (*is_reliable &&
1906              (percent3[0] >= kGoodLang1Percent)) {
1907     have_good_answer = true;
1908   } else if (*is_reliable &&
1909              ((percent3[0] + percent3[1]) >= kGoodLang1and2Percent)) {
1910     have_good_answer = true;
1911   }
1912 
1913 
1914   if (have_good_answer) {
1915     // This is the real, non-recursive return
1916 
1917     // Move bytes for unreliable langs to another lang or UNKNOWN
1918     RemoveUnreliableLanguages(&doc_tote, FLAGS_cld2_html, FLAGS_cld2_quiet);
1919 
1920     // Redo the result extraction after the removal above
1921     doc_tote.Sort(3);
1922     ExtractLangEtc(&doc_tote, total_text_bytes,
1923                    reliable_percent3, language3, percent3, normalized_score3,
1924                    text_bytes, is_reliable);
1925 
1926 
1927 
1928     Language summary_lang;
1929     CalcSummaryLang(&doc_tote, total_text_bytes,
1930                     reliable_percent3, language3, percent3,
1931                     &summary_lang, is_reliable,
1932                     FLAGS_cld2_html, FLAGS_cld2_quiet);
1933 
1934     if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
1935       for (int i = 0; i < 3; ++i) {
1936         if (language3[i] != UNKNOWN_LANGUAGE) {
1937           fprintf(stderr, "%s.%dR(%d%%) ",
1938                   LanguageCode(language3[i]),
1939                   reliable_percent3[i],
1940                   percent3[i]);
1941         }
1942       }
1943 
1944       fprintf(stderr, "%d bytes ", total_text_bytes);
1945       fprintf(stderr, "= %s%c ",
1946               LanguageName(summary_lang), *is_reliable ? ' ' : '*');
1947       fprintf(stderr, "<br><br>\n");
1948     }
1949 
1950     // Slightly condensed if quiet
1951     if (FLAGS_cld2_html && FLAGS_cld2_quiet) {
1952       fprintf(stderr, "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; ");
1953       for (int i = 0; i < 3; ++i) {
1954         if (language3[i] != UNKNOWN_LANGUAGE) {
1955           fprintf(stderr, "&nbsp;&nbsp;%s %d%% ",
1956                   LanguageCode(language3[i]),
1957                   percent3[i]);
1958         }
1959       }
1960       fprintf(stderr, "= %s%c ",
1961               LanguageName(summary_lang), *is_reliable ? ' ' : '*');
1962       fprintf(stderr, "<br>\n");
1963     }
1964 
1965     return summary_lang;
1966   }
1967 
1968   // Not a good answer -- do recursive call to refine
1969   if ((FLAGS_cld2_html || FLAGS_dbgscore) && !FLAGS_cld2_quiet) {
1970     // This is what we hope to improve on in the recursive call, if any
1971     PrintLangs(stderr, language3, percent3, text_bytes, is_reliable);
1972   }
1973 
1974   // For restriction to Top40 + one, the one is 1st/2nd lang that is not Top40
1975   // For this purpose, we treate "Ignore" as top40
1976   Language new_plus_one = UNKNOWN_LANGUAGE;
1977 
1978   if (total_text_bytes < kShortTextThresh) {
1979       // Short text: Recursive call with top40 and short set
1980       if (FLAGS_cld2_html || FLAGS_dbgscore) {
1981         fprintf(stderr, "&nbsp;&nbsp;---text_bytes[%d] "
1982                 "Recursive(Top40/Rep/Short/Words)---<br><br>\n",
1983                 total_text_bytes);
1984       }
1985       return DetectLanguageSummaryV2(
1986                         buffer,
1987                         buffer_length,
1988                         is_plain_text,
1989                         cld_hints,
1990                         allow_extended_lang,
1991                         flags | kCLDFlagTop40 | kCLDFlagRepeats |
1992                           kCLDFlagShort | kCLDFlagUseWords | kCLDFlagFinish,
1993                         new_plus_one,
1994                         language3,
1995                         percent3,
1996                         normalized_score3,
1997                         resultchunkvector,
1998                         text_bytes,
1999                         is_reliable);
2000   }
2001 
2002   // Longer text: Recursive call with top40 set
2003   if (FLAGS_cld2_html || FLAGS_dbgscore) {
2004     fprintf(stderr,
2005             "&nbsp;&nbsp;---text_bytes[%d] Recursive(Top40/Rep)---<br><br>\n",
2006             total_text_bytes);
2007   }
2008   return DetectLanguageSummaryV2(
2009                         buffer,
2010                         buffer_length,
2011                         is_plain_text,
2012                         cld_hints,
2013                         allow_extended_lang,
2014                         flags | kCLDFlagTop40 | kCLDFlagRepeats |
2015                           kCLDFlagFinish,
2016                         new_plus_one,
2017                         language3,
2018                         percent3,
2019                         normalized_score3,
2020                         resultchunkvector,
2021                         text_bytes,
2022                         is_reliable);
2023 }
2024 
2025 
2026 // For debugging and wrappers. Not thread safe.
2027 static char temp_detectlanguageversion[32];
2028 
2029 // Return version text string
2030 // String is "code_version - data_build_date"
DetectLanguageVersion()2031 const char* DetectLanguageVersion() {
2032   if (kScoringtables.quadgram_obj == NULL) {return "";}
2033   sprintf(temp_detectlanguageversion,
2034           "V2.0 - %u", kScoringtables.quadgram_obj->kCLDTableBuildDate);
2035   return temp_detectlanguageversion;
2036 }
2037 
2038 
2039 }       // End namespace CLD2
2040