1 // Copyright 2016 Google Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 ////////////////////////////////////////////////////////////////////////////////
16 
17 #include "compact_enc_det/compact_enc_det.h"
18 
19 #include <math.h>                       // for sqrt
20 #include <stddef.h>                     // for size_t
21 #include <stdio.h>                      // for printf, fprintf, NULL, etc
22 #include <stdlib.h>                     // for qsort
23 #include <string.h>                     // for memset, memcpy, memcmp, etc
24 #include <memory>
25 #include <string>                       // for string, operator==, etc
26 
27 #include "compact_enc_det/compact_enc_det_hint_code.h"
28 #include "util/string_util.h"
29 #include "util/basictypes.h"
30 #include "util/commandlineflags.h"
31 #include "util/logging.h"
32 
33 using std::string;
34 
35 // TODO as of 2007.10.09:
36 //
37 // Consider font=TT-BHxxx as user-defined => binary
38 // Demote GB18030 if no 8x3x pair
39 // Map byte2 ascii punct to 0x60, digits to 0x7e, gets them into hires
40 // Consider removing/ignoring bytes 01-1F to avoid crap pollution
41 // Possibly boost declared encoding in robust scan
42 // googlebot tiny files
43 // look for ranges of encodings
44 // consider tags just as > < within aligned block of 32
45 // flag too few characters in postproc (Latin 6 problem)
46 // Remove slow scan beyond 16KB
47 // Consider removing kMostLikelyEncoding or cut it in half
48 
49 
50 // A note on mixed encodings
51 //
52 // The most common encoding error on the web is a page containing a mixture of
53 // CP-1252 and UTF-8. A less common encoding error is a third-party feed that
54 // has been converted from CP-1252 to UTF-8 and then those bytes converted a
55 // second time to UTF-8. CED originally attempted to detect these error cases
// by using two synthetic encodings, UTF8CP1252 and UTF8UTF8. The intended
// implementation was to start these just below CP1252 and UTF8 respectively in
// overall likelihood, and allow 1252 and UTF8 to fall behind if mixtures are
59 // found.
60 //
61 // The UTF8UTF8 encoding is a possible outcome from CED, but unfortunately the
62 // UTF8CP1252 internal encoding was added late and not put into encodings.proto,
63 // so at the final step it is mapped to UTF8UTF8 also. This was a bad idea and
64 // is removed in this November 2011 CL.
65 //
66 // Mixed encoding detection never worked out as well as envisioned, so the
67 // ced_allow_utf8utf8 flag normally disables all this.
68 //
69 // The effect is that CP-1252 and UTF-8 mixtures will usually be detected as
70 // UTF8, and the inputconverter code for UTF8 normally will convert bare
71 // CP-1252 bytes to UTF-8, instead of the less-helpful FFFD substitution. UTF-8
72 // and double-UTF-8 mixtures will be detected as UTF-8, and the double
73 // conversion will stand.
74 //
75 // However, it is occasionally useful to use CED to detect double-converted
76 // UTF-8 coming from third-party data feeds, so they can be fixed at the source.
// For this purpose, the UTF8UTF8 encoding remains available under the
78 // ced_allow_utf8utf8 flag.
79 //
80 // When UTF8UTF8 is detected, the inputconverter code will undo the double
81 // conversion, giving good text.
82 
83 // Norbert Runge has noted these words in CP1252 that are mistakenly identified
84 // as UTF-8 because of the last pair of characters:
85 //  NESTLÉ®               0xC9 0xAE U+00C9 U+00AE   C9AE = U+026E;SMALL LEZH
86 //  drauß\u2019           0xDF 0x92 U+00DF U+2019   DF92 = U+07D2;NKO LETTER N
87 //  Mutterschoß\u201c     0xDF 0x93 U+00DF U+201C   DF93 = U+07D3;NKO LETTER BA
88 //  Schoß\u201c           0xDF 0x93 U+00DF U+201C
//  weiß\u201c            0xDF 0x93 U+00DF U+201C
90 //  Schnellfuß\u201c      0xDF 0x93 U+00DF U+201C
91 //  süß«                  0xDF 0xAB U+00DF U+00AB   DFAB = U+07EB;NKO HIGH TONE
92 // These four byte combinations now explicitly boost Latin1/CP1252.
93 
94 // And for reference, here are a couple of Portuguese spellings
95 // that may be mistaken as double-byte encodings.
96 //   informações          0xE7 0xF5
97 //   traição              0xE7 0xE3
98 
99 
// Version string; nothing in this part of the file reads it.
static const char* kVersion = "2.2";

// Off by default: the mixed-encoding note above explains that UTF8UTF8
// detection is normally disabled.
DEFINE_bool(ced_allow_utf8utf8, false, "Allow the UTF8UTF8 encoding, "
                                       "to handle mixtures of CP1252 "
                                       "converted to UTF-8 zero, one, "
                                       "or two times");
// Scan limits, in Kbytes, for the slow (7-bit-only) and fast detection paths.
DEFINE_int32(enc_detect_slow_max_kb, 16,
             "Maximum number of Kbytes to examine for "
             "7-bit-only (2022, Hz, UTF7) encoding detect. "
             "You are unlikely to want to change this.");
DEFINE_int32(enc_detect_fast_max_kb, 256,
             "Maximum number of Kbytes to examine for encoding detect. "
             "You are unlikely to want to change this.");

// Units are 30 per bit (see the help text), so the default of 300 is 10 bits,
// i.e. the "min 1024x difference" row of the table below.
DEFINE_int32(ced_reliable_difference, 300, "30 * Bits of minimum probablility "
             "difference 1st - 2nd to be considered reliable \n"
             "  2 corresponds to min 4x difference\n"
             "  4 corresponds to min 16x difference\n"
             "  8 corresponds to min 256x difference\n"
             "  10 corresponds to min 1024x difference\n"
             "  20 corresponds to min 1Mx difference.");

// Text debug output options
DEFINE_bool(enc_detect_summary, false,
            "Print first 16 interesting pairs at exit.");
DEFINE_bool(counts, false, "Count major-section usage");

// PostScript debug output options
DEFINE_bool(enc_detect_detail, false,
             "Print PostScript of every update, to stderr.");
DEFINE_bool(enc_detect_detail2, false,
             "More PostScript detail of every update, to stderr.");
DEFINE_bool(enc_detect_source, false, "Include source text in detail");

// NOTE(review): the next remark presumably applies to the two "watch"
// encoding names below -- confirm.
// Encoding name must exactly match FIRST column of kI18NInfoByEncoding in
// lang_enc.cc

// The following flags are not in use. They are replaced with constants to
// avoid static initialization.

//DEFINE_string(enc_detect_watch1, "", "Do detail2 about this encoding name.");
//DEFINE_string(enc_detect_watch2, "", "Do detail2 about this encoding name.");

// Constant stand-ins that keep the FLAGS_ names of the disabled string flags;
// both are the empty string.
static const char* const FLAGS_enc_detect_watch1 = "";
static const char* const FLAGS_enc_detect_watch2 = "";

// Only for experiments. Delete soon.
DEFINE_bool(force127, false, "Force Latin1, Latin2, Latin7 based on trigrams");

// Demo-mode/debugging experiment
DEFINE_bool(demo_nodefault, false,
             "Default to all equal; no boost for declared encoding.");
DEFINE_bool(dirtsimple, false, "Just scan and count for all encodings");
DEFINE_bool(ced_echo_input, false, "Echo ced input to stderr");
153 
154 
// Probability scores are integers in scaled log2 units: XLOG2 (30) units per
// bit, XDECILOG2 (3) units per tenth of a bit.
static const int XDECILOG2 = 3;             // Multiplier for log base 2 ** n/10
static const int XLOG2 = 30;                // Multiplier for log base 2 ** n

static const int kFinalPruneDifference = 10 * XLOG2;
                                            // Final bits of minimum
                                            // probability difference 1st-nth
                                            // to be pruned (10 bits = 300)

// (sic) The misspelled identifier is left as is in case code outside this
// view refers to it.
static const int kInititalPruneDifference = kFinalPruneDifference * 4;
                                            // Initial bits of minimum
                                            // probability difference 1st-nth
                                            // to be pruned (40 bits = 1200)

static const int kPruneDiffDecrement = kFinalPruneDifference;
                                            // Decrements bits of minimum
                                            // probability difference 1st-nth
                                            // to be pruned (10 bits per step)

static const int kSmallInitDiff = 2 * XLOG2;       // bits of minimum
                                            // probability difference, base to
                                            // superset encodings

static const int kBoostInitial = 20 * XLOG2;    // bits of boost for
                                            // initial byte patterns (BOM, 00)

static const int kBadPairWhack = 20 * XLOG2;    // bits of whack for
                                            // one bad pair

static const int kBoostOnePair = 20 * XLOG2;    // bits of boost for
                                            // one good pair in Hz, etc.

static const int kGentleOnePair = 4 * XLOG2;    // bits of boost for
                                            // one good sequence

static const int kGentlePairWhack = 2 * XLOG2;       // bits of whack
                                            // for ill-formed sequence

static const int kGentlePairBoost = 2 * XLOG2;       // bits of boost
                                            // for well-formed sequence

static const int kDeclaredEncBoost = 5 * XDECILOG2;  // bits/10 of boost for
                                            // best declared encoding per bigram
                                            // (5 tenths = half a bit)

static const int kBestEncBoost = 5 * XDECILOG2;     // bits/10 of boost for
                                            // best encoding per bigram
                                            // (5 tenths = half a bit)

static const int kTrigramBoost = 2 * XLOG2; // bits of boost for Latin127 tri

static const int kMaxPairs = 48;            // Max interesting pairs to look at
                                            // If you change this,
                                            // adjust *PruneDiff*

static const int kPruneMask = 0x07;         // Prune every 8 interesting pairs
                                            // (mask selects the low 3 bits)


static const int kBestPairsCount = 16;      // For first N pairs, do extra boost
                                            // based on most likely encoding
                                            // of pair over entire web

static const int kDerateHintsBelow = 12;    // If we have fewer than N bigrams,
                                            // weaken the hints enough that
                                            // unhinted encodings have a hope of
                                            // rising to the top

static const int kMinRescanLength = 800;    // Don't bother rescanning for
                                            // unreliable encoding if fewer
                                            // than this many bytes unscanned.
                                            // We will rescan at most last half
                                            // of this.

static const int kStrongBinary = 12;  // Make F_BINARY the only encoding
static const int kWeakerBinary = 4;   // Make F_BINARY likely encoding

// These are byte counts from front of file
static const int kBinaryHardAsciiLimit = 6 * 1024;  // Not binary if all ASCII
static const int kBinarySoftAsciiLimit = 8 * 1024;  //   "   if mostly ASCII

// We try here to avoid having title text dominate the encoding detection,
// for the not-infrequent error case of title in encoding1, body in encoding2:
// we want to bias toward encoding2 winning.
//
// kMaxBigramsTagTitleText should be a multiple of 2, 3, and 4, so that we
// rarely cut off mid-character in the original (not-yet-detected) encoding.
// This matters most for UTF-8 two- and three-byte codes and for
// Shift-JIS three-byte codes.
static const int kMaxBigramsTagTitleText = 12;      // Keep only some tag text
static const int kWeightshiftForTagTitleText = 4;   // Give text in tags, etc.
                                                    // 1/16 normal weight
                                                    // (a shift of 4 bits)

static const int kStrongPairs = 6;          // Let reliable enc with this many
                                            // pairs overcome missing hint
246 
// Control bits for the internal detector. Each non-empty flag occupies its
// own bit.
enum CEDInternalFlags {
  kCEDNone       = 0,        // The empty flag
  kCEDRescanning = 1 << 0,   // Do not further recurse
  kCEDSlowscore  = 1 << 1,   // Do extra scoring
  kCEDForceTags  = 1 << 2,   // Always examine text inside tags
};
253 
// Forward declaration of the internal detection routine; its definition is
// not in this part of the file.
//
// Takes the CEDInternalFlags control bits, the text buffer and its length,
// the URL / HTTP charset / META charset hint strings, an encoding hint, the
// user-interface language hint, the corpus type, and a switch for ignoring
// 7-bit mail encodings. bytes_consumed, is_reliable and second_best_enc are
// pointers and so presumably outputs, with the detected Encoding as the
// return value -- NOTE(review): confirm against the definition.
Encoding InternalDetectEncoding(
    CEDInternalFlags flags, const char* text, int text_length,
    const char* url_hint, const char* http_charset_hint,
    const char* meta_charset_hint, const int encoding_hint,
    const Language language_hint,  // User interface lang
    const CompactEncDet::TextCorpusType corpus_type,
    bool ignore_7bit_mail_encodings, int* bytes_consumed, bool* is_reliable,
    Encoding* second_best_enc);
263 
// Probability data for one entry of the generated unigram table: summary
// statistics of the two bytes of an aligned bigram, a scaling offset, three
// 256-entry byte-probability arrays, and up to four optional pointers to
// high-resolution bigram deltas.
// NOTE(review): presumably one entry per ranked encoding -- confirm against
// the generated tables.
typedef struct {
  const uint8* hires[4];  // Pointers to possible high-resolution bigram deltas
  uint8 x_bar;          // Average byte2 value
  uint8 y_bar;          // Average byte1 value
  uint8 x_stddev;       // Standard deviation of byte2 value
  uint8 y_stddev;       // Standard deviation of byte1 value
  int so;               // Scaling offset -- add to probabilities below
  uint8 b1[256];        // Unigram probability for first byte of aligned bigram
  uint8 b2[256];        // Unigram probability for second byte of aligned bigram
  uint8 b12[256];       // Unigram probability for cross bytes of aligned bigram
} UnigramEntry;
275 
276 //typedef struct {
277 //  uint8 b12[256*256]; // Bigram probability for aligned bigram
278 //} FullBigramEntry;
279 
280 
281 // Include all the postproc-generated tables here:
282 // RankedEncoding
283 // kMapToEncoding
284 // unigram_table
// kMostLikelyEncoding
286 // kTLDHintProbs
287 // kCharsetHintProbs
288 // HintEntry, kMaxTldKey kMaxTldVector, etc.
289 // =============================================================================
290 
291 #include "compact_enc_det/compact_enc_det_generated_tables.h"
292 
293 
// Short aliases for ranked-encoding names. The F_* targets are not defined in
// this part of the file; presumably they come from the generated tables
// included above.
#define F_ASCII F_Latin1    // "ASCII" is a misnomer, so this code uses "Latin1"

#define F_BINARY F_X_BINARYENC        // We are mid-update for name change
#define F_UTF8UTF8 F_X_UTF8UTF8       // We are mid-update for name change
#define F_BIG5_CP950 F_BIG5           // We are mid-update for name change
#define F_Unicode F_UTF_16LE          // We are mid-update for name change
300 // =============================================================================
301 
302 // 7-bit encodings have at least one "interesting" byte value < 0x80
303 //   (00 0E 1B + ~)
304 // JIS 2022-cn 2022-kr hz utf7
305 // Unicode UTF-16 UTF-32
306 // 8-bit encodings have no interesting byte values < 0x80
307 static const uint32 kSevenBitActive = 0x00000001;   // needs <80 to detect
308 static const uint32 kUTF7Active     = 0x00000002;   // <80 and +
309 static const uint32 kHzActive       = 0x00000004;   // <80 and ~
310 static const uint32 kIso2022Active  = 0x00000008;   // <80 and 1B 0E 0F
311 static const uint32 kUTF8Active     = 0x00000010;
312 static const uint32 kUTF8UTF8Active = 0x00000020;
313 static const uint32 kUTF1632Active  = 0x00000040;   // <80 and 00
314 static const uint32 kBinaryActive   = 0x00000080;   // <80 and 00
315 static const uint32 kTwobyteCode    = 0x00000100;   // Needs 8xxx
316 static const uint32 kIsIndicCode    = 0x00000200;   //
317 static const uint32 kHighAlphaCode  = 0x00000400;   // full alphabet in 8x-Fx
318 static const uint32 kHighAccentCode = 0x00000800;   // accents in 8x-Fx
319 static const uint32 kEUCJPActive    = 0x00001000;   // Have to mess with phase
320 
321 
// Debug only. not thread safe
// Plain file-scope counters, all starting at zero, with no synchronization.
// The code that updates them is outside this part of the file.
static int encdet_used = 0;
static int rescore_used = 0;
static int rescan_used = 0;
static int robust_used = 0;
static int looking_used = 0;
static int doing_used = 0;
329 
330 
331 // For debugging only -- about 256B/entry times about 500 = 128KB
332 // TODO: only allocate this if being used
333 typedef struct {
334   int offset;
335   int best_enc;     // Best ranked encoding for this bigram, or
336                     // -1 for overhead entries
337   string label;
338   int detail_enc_prob[NUM_RANKEDENCODING];
339 } DetailEntry;
340 
// Ranked-encoding indexes for the two debug "watch" encodings; -1 is the
// initial value. NOTE(review): -1 presumably means "none selected" -- confirm
// where these are assigned.
static int watch1_rankedenc = -1;     // Debug. not threadsafe
static int watch2_rankedenc = -1;     // Debug. not threadsafe
// Disabled file-scope debug storage, kept for reference:
////static int next_detail_entry = 0;     // Debug. not threadsafe
////static DetailEntry details[kMaxPairs * 10];  // Allow 10 details per bigram
// End For debugging only
346 
// Which set an interesting pair belongs to.
// Must match kTestPrintableAsciiTildePlus exit codes, minus one
enum PairSet {
  AsciiPair = 0,
  OtherPair = 1,
  NUM_PAIR_SETS = 2
};
349 
// The reasons for pruning
enum PruneReason {
  PRUNE_NORMAL = 0,
  PRUNE_SLOWEND = 1,
  PRUNE_FINAL = 2
};
352 
// Printable names for the two pair sets, in PairSet order.
static const char* kWhatSetName[] = {
  "Ascii",
  "Other"
};
354 
355 
// Scanner state for encodings that shift out / shift in between one-byte and
// two-byte regions (ISO-2022-xx, HZ)
enum StateSoSi {
  SOSI_NONE = 0,
  SOSI_ERROR = 1,
  SOSI_ONEBYTE = 2,
  SOSI_TWOBYTE = 3
};
359 
// All working state for one detection run: source pointers, optional debug
// recording, hint encodings, mini state machines for the special encodings,
// the list of still-active ranked encodings with their probabilities, and
// the buffered "interesting" byte pairs.
typedef struct {
  const uint8* initial_src;       // For calculating byte offsets
  const uint8* limit_src;         // Range of input source
  const uint8* prior_src;         // Source consumed by prior call to BoostPrune
  const uint8* last_pair;         // Last pair inserted into interesting_pairs

  DetailEntry* debug_data;        // Normally NULL. Ptr to debug data for
                                  // FLAGS_enc_detect_detail PostScript data
  int next_detail_entry;          // Debug: index of the next debug_data slot
                                  // to write

  bool done;
  bool reliable;
  bool hints_derated;
  int declared_enc_1;             // From http/meta hint
  int declared_enc_2;             // from http/meta hint
  int prune_count;                // Number of times we have pruned

  int trigram_highwater_mark;       // Byte offset of last trigram processing
  bool looking_for_latin_trigrams;  // True if we should test for doing
                                    //  Latin1/2/7 trigram processing
  bool do_latin_trigrams;           // True if we actually are scoring trigrams

  // Miscellaneous state variables for difficult encodings
  int binary_quadrants_count;     // Number of four bigram quadrants seen:
                                  //  0xxxxxxx0xxxxxxx 0xxxxxxx1xxxxxxx
                                  //  1xxxxxxx0xxxxxxx 1xxxxxxx1xxxxxxx
  int binary_8x4_count;           // Number of 8x4 buckets seen:
  uint32 binary_quadrants_seen;   // Bit[i] set if bigram i.......i....... seen
  uint32 binary_8x4_seen;         // Bit[i] set if bigram iii.....ii...... seen
  int utf7_starts;                // Count of possible UTF-7 beginnings seen
  int prior_utf7_offset;          // Source consumed by prior UTF-7 string
  int next_utf8_ministate;        // Mini state for UTF-8 sequences
  int utf8_minicount[6];          // Number of correct 2- 3- 4-byte seq, errors
  int next_utf8utf8_ministate;    // Mini state for UTF8UTF8 sequences
  int utf8utf8_odd_byte;          // UTF8UTF8 seq has odd number of bytes
  int utf8utf8_minicount[6];      // Number of correct 2- 3- 4-byte seq, errors
  StateSoSi next_2022_state;            // Mini state for 2022 sequences
  StateSoSi next_hz_state;              // Mini state for HZ sequences
  bool next_eucjp_oddphase;             // Mini state for EUC-JP sequences
  int byte32_count[8];            // Count of top 3 bits of byte1 of bigram
                                  // 0x1x 2x3x 4x5x 6x7x 8x9x AxBx CxDx ExFx
  uint32 active_special;          // Bits showing which special cases are active

  // Hint encodings. The http and meta hints are what the document says about
  // itself, or UNKNOWN_ENCODING. The BOM is the initial byte order mark for
  // UTF-xx.
  Encoding tld_hint;              // Top TLD encoding or UNKNOWN
  Encoding http_hint;
  Encoding meta_hint;
  Encoding bom_hint;

  // small cache of previous interesting bigrams
  int next_prior_bigram;
  int prior_bigram[4];
  int prior_binary[1];

  int top_rankedencoding;         // Top two probabilities and families
  int second_top_rankedencoding;
  int top_prob;
  int second_top_prob;
  int prune_difference;           // Prune things this much below the top prob
  int rankedencoding_list_len;                // Number of active encodings
  int rankedencoding_list[NUM_RANKEDENCODING];  // List of active encodings

  int enc_prob[NUM_RANKEDENCODING];           // Cumulative probability per enc
                                              // This is where all the action is
  int hint_prob[NUM_RANKEDENCODING];          // Initial hint probabilities
  int hint_weight[NUM_RANKEDENCODING];        // Number of hints for this enc

  // Two sets -- one for printable ASCII, one for the rest
  int prior_interesting_pair[NUM_PAIR_SETS];  // Pairs consumed by prior call
  int next_interesting_pair[NUM_PAIR_SETS];   // Next pair to write
  char interesting_pairs[NUM_PAIR_SETS][kMaxPairs * 2];   // Two bytes per pair
  int interesting_offsets[NUM_PAIR_SETS][kMaxPairs];      // Src offset of pair
  int interesting_weightshift[NUM_PAIR_SETS][kMaxPairs];  // weightshift of pair
} DetectEncodingState;
433 
434 
435 // Record a debug event that changes probabilities
SetDetailsEncProb(DetectEncodingState * destatep,int offset,int best_enc,const char * label)436 void SetDetailsEncProb(DetectEncodingState* destatep,
437                        int offset, int best_enc, const char* label) {
438   int next = destatep->next_detail_entry;
439   destatep->debug_data[next].offset = offset;
440   destatep->debug_data[next].best_enc = best_enc;
441   destatep->debug_data[next].label = label;
442   memcpy(&destatep->debug_data[next].detail_enc_prob,
443          &destatep->enc_prob,
444          sizeof(destatep->enc_prob));
445   ++destatep->next_detail_entry;
446 }
447 
448 // Record a debug event that changes probabilities, copy offset
SetDetailsEncProbCopyOffset(DetectEncodingState * destatep,int best_enc,const char * label)449 void SetDetailsEncProbCopyOffset(DetectEncodingState* destatep,
450                                  int best_enc, const char* label) {
451   int next = destatep->next_detail_entry;
452   destatep->debug_data[next].offset = destatep->debug_data[next - 1].offset;
453   destatep->debug_data[next].best_enc = best_enc;
454   destatep->debug_data[next].label = label;
455   memcpy(&destatep->debug_data[next].detail_enc_prob,
456          &destatep->enc_prob,
457          sizeof(destatep->enc_prob));
458   ++destatep->next_detail_entry;
459 }
460 
461 // Record a debug event that changes probs and has simple text label
SetDetailsEncLabel(DetectEncodingState * destatep,const char * label)462 void SetDetailsEncLabel(DetectEncodingState* destatep, const char* label) {
463   int next = destatep->next_detail_entry;
464   destatep->debug_data[next].offset = destatep->debug_data[next - 1].offset;
465   destatep->debug_data[next].best_enc = -1;
466   destatep->debug_data[next].label = label;
467   memcpy(&destatep->debug_data[next].detail_enc_prob,
468          &destatep->enc_prob,
469          sizeof(destatep->enc_prob));
470   ++destatep->next_detail_entry;
471 }
472 
473 // Record a debug event that is just a text label, no change in probs
SetDetailsLabel(DetectEncodingState * destatep,const char * label)474 void SetDetailsLabel(DetectEncodingState* destatep, const char* label) {
475   int next = destatep->next_detail_entry;
476   destatep->debug_data[next].offset = destatep->debug_data[next - 1].offset;
477   destatep->debug_data[next].best_enc = -1;
478   destatep->debug_data[next].label = label;
479   memcpy(&destatep->debug_data[next].detail_enc_prob,
480          &destatep->debug_data[next - 1].detail_enc_prob,
481          sizeof(destatep->enc_prob));
482   ++destatep->next_detail_entry;
483 }
484 
485 
// Maps superset encodings to base, to see if 2 encodings are compatible.
// Indexed by Encoding enum value (the number in each comment). The
// COMPILE_ASSERT below requires exactly NUM_ENCODINGS entries.
// (Non-identity mappings are marked "-->" below.)
static const Encoding kMapEncToBaseEncoding[] = {
  ISO_8859_1,       // 0: Teragram ASCII
  ISO_8859_2,       // 1: Teragram Latin2
  ISO_8859_3,       // 2: in BasisTech but not in Teragram
  ISO_8859_4,       // 3: Teragram Latin4
  ISO_8859_5,       // 4: Teragram ISO-8859-5
  ISO_8859_6,       // 5: Teragram Arabic
  ISO_8859_7,       // 6: Teragram Greek
  MSFT_CP1255,      // 7: Teragram Hebrew --> 36
  ISO_8859_9,       // 8: in BasisTech but not in Teragram
  ISO_8859_10,      // 9: in BasisTech but not in Teragram
  JAPANESE_EUC_JP,  // 10: Teragram EUC_JP
  JAPANESE_SHIFT_JIS,  // 11: Teragram SJS
  JAPANESE_JIS,     // 12: Teragram JIS
  CHINESE_BIG5,     // 13: Teragram BIG5
  CHINESE_GB,       // 14: Teragram GB
  CHINESE_EUC_CN,   // 15: Teragram EUC-CN
  KOREAN_EUC_KR,    // 16: Teragram KSC
  UNICODE,          // 17: Teragram Unicode
  CHINESE_EUC_CN,   // 18: Teragram EUC --> 15
  CHINESE_EUC_CN,   // 19: Teragram CNS --> 15
  CHINESE_BIG5,     // 20: Teragram BIG5_CP950 --> 13
  JAPANESE_SHIFT_JIS,   // 21: Teragram CP932 --> 11
  UTF8,             // 22
  UNKNOWN_ENCODING, // 23
  ISO_8859_1,       // 24: ISO_8859_1 with all characters <= 127 --> 0
  RUSSIAN_KOI8_R,   // 25: Teragram KOI8R
  RUSSIAN_CP1251,   // 26: Teragram CP1251
  ISO_8859_1,       // 27: CP1252 aka MSFT euro ascii --> 0
  RUSSIAN_KOI8_RU,  // 28: CP21866 aka KOI8_RU, used for Ukrainian
  MSFT_CP1250,      // 29: CP1250 aka MSFT eastern european
  ISO_8859_1,       // 30: aka ISO_8859_0 aka ISO_8859_1 euroized --> 0
  ISO_8859_9,       // 31: used for Turkish --> 8
  ISO_8859_13,      // 32: used in Baltic countries --> 43
  ISO_8859_11,      // 33: aka TIS-620, used for Thai
  ISO_8859_11,      // 34: used for Thai --> 33
  MSFT_CP1256,      // 35: used for Arabic
  MSFT_CP1255,      // 36: Logical Hebrew Microsoft
  MSFT_CP1255,      // 37: Iso Hebrew Logical --> 36
  MSFT_CP1255,      // 38: Iso Hebrew Visual --> 36
  CZECH_CP852,      // 39
  ISO_8859_2,       // 40: aka ISO_IR_139 aka KOI8_CS --> 1
  MSFT_CP1253,      // 41: used for Greek, but NOT a superset of 8859-7
  RUSSIAN_CP866,    // 42
  ISO_8859_13,      // 43
  ISO_2022_KR,      // 44
  CHINESE_GB,       // 45 GBK --> 14
  CHINESE_GB,       // 46 GB18030 --> 14
  CHINESE_BIG5,     // 47 BIG5_HKSCS --> 13
  ISO_2022_KR,      // 48 ISO_2022_CN --> 44
  TSCII,            // 49 Indic encoding
  TAMIL_MONO,       // 50 Indic encoding - Tamil
  TAMIL_BI,         // 51 Indic encoding - Tamil
  JAGRAN,           // 52 Indic encoding - Devanagari
  MACINTOSH_ROMAN,  // 53
  UTF7,             // 54
  BHASKAR,          // 55 Indic encoding - Devanagari
  HTCHANAKYA,       // 56 Indic encoding - Devanagari
  UTF16BE,          // 57
  UTF16LE,          // 58
  UTF32BE,          // 59
  UTF32LE,          // 60
  BINARYENC,        // 61
  HZ_GB_2312,       // 62
  UTF8UTF8,         // 63
  TAM_ELANGO,       // 64 Elango - Tamil
  TAM_LTTMBARANI,   // 65 Barani - Tamil
  TAM_SHREE,        // 66 Shree - Tamil
  TAM_TBOOMIS,      // 67 TBoomis - Tamil
  TAM_TMNEWS,       // 68 TMNews - Tamil
  TAM_WEBTAMIL,     // 69 Webtamil - Tamil
  KDDI_SHIFT_JIS,         // 70 KDDI Shift_JIS
  DOCOMO_SHIFT_JIS,       // 71 DoCoMo Shift_JIS
  SOFTBANK_SHIFT_JIS,     // 72 SoftBank Shift_JIS
  KDDI_ISO_2022_JP,       // 73 KDDI ISO-2022-JP
  SOFTBANK_ISO_2022_JP,   // 74 SOFTBANK ISO-2022-JP
};

// Fails the build if the table and the Encoding enum get out of step.
COMPILE_ASSERT(arraysize(kMapEncToBaseEncoding) == NUM_ENCODINGS,
               kMapEncToBaseEncoding_has_incorrect_size);
568 
// Maps base encodings to 0, supersets to 1+, undesired to -1.
// Indexed by Encoding enum value (the number in each comment). The
// COMPILE_ASSERT below requires exactly NUM_ENCODINGS entries.
// (Non-identity mappings are marked "-->" below.)
static const int kMapEncToSuperLevel[] = {
  0,       // 0: Teragram ASCII
  0,       // 1: Teragram Latin2
  0,       // 2: in BasisTech but not in Teragram
  0,       // 3: Teragram Latin4
  0,       // 4: Teragram ISO-8859-5
  0,       // 5: Teragram Arabic
  0,       // 6: Teragram Greek
  0,       // 7: Teragram Hebrew
  0,       // 8: in BasisTech but not in Teragram
  0,      // 9: in BasisTech but not in Teragram
  0,      // 10: Teragram EUC_JP
  0,      // 11: Teragram SJS
  0,      // 12: Teragram JIS
  0,      // 13: Teragram BIG5
  0,       // 14: Teragram GB
  0,      // 15: Teragram EUC-CN
  0,      // 16: Teragram KSC
  0,          // 17: Teragram Unicode
  -1,     // 18: Teragram EUC --> 15
  -1,     // 19: Teragram CNS --> 15
  1,      // 20: Teragram BIG5_CP950 --> 13
  1,      // 21: Teragram CP932 --> 11
  0,             // 22
  -1,     // 23
  -1,       // 24: ISO_8859_1 with all characters <= 127 --> 0
  0,      // 25: Teragram KOI8R
  0,      // 26: Teragram CP1251
  1,       // 27: CP1252 aka MSFT euro ascii --> 0
  0,      // 28: CP21866 aka KOI8_RU, used for Ukrainian
  0,      // 29: CP1250 aka MSFT eastern european
  1,       // 30: aka ISO_8859_0 aka ISO_8859_1 euroized --> 0
  0,       // 31: used for Turkish
  1,      // 32: used in Baltic countries --> 43
  0,      // 33: aka TIS-620, used for Thai
  1,      // 34: used for Thai --> 33
  0,      // 35: used for Arabic
  0,      // 36: Logical Hebrew Microsoft
  -1,      // 37: Iso Hebrew Logical --> 36
  -1,       // 38: Iso Hebrew Visual --> 36
  0,      // 39
  1,       // 40: aka ISO_IR_139 aka KOI8_CS --> 1
  0,       // 41: used for Greek, NOT superset of 8859-7
  0,      // 42
  0,      // 43
  0,      // 44
  1,       // 45 GBK --> 14
  1,       // 46 GB18030 --> 14
  1,      // 47 BIG5_HKSCS --> 13
  1,      // 48 ISO_2022_CN --> 44
  0,      // 49 Indic encoding
  0,       // 50 Indic encoding - Tamil
  0,         // 51 Indic encoding - Tamil
  0,           // 52 Indic encoding - Devanagari
  0,      // 53
  0,      // 54
  0,      // 55 Indic encoding - Devanagari
  0,      // 56 Indic encoding - Devanagari
  0,          // 57
  0,          // 58
  0,          // 59
  0,          // 60
  0,        // 61
  0,       // 62
  2,         // 63
  0, 0, 0, 0, 0, 0,         // 64-69: add six more Tamil
  0, 0, 0, 0, 0,            // 70-74: add five encodings with emoji
};

// Fails the build if the table and the Encoding enum get out of step.
COMPILE_ASSERT(arraysize(kMapEncToSuperLevel) == NUM_ENCODINGS,
               kMapEncToSuperLevel_has_incorrect_size);
642 
643 
644 
// Subscripted by Encoding enum value
// Each entry is a sum of k...Code / k...Active flag bits describing that
// encoding. The ...Encoding() predicates later in this file (SevenBitEncoding,
// TwoByteEncoding, IndicEncoding, HighAlphaEncoding, HighAccentEncoding) test
// individual bits of these entries. An entry of 0 carries no flags.
static const uint32 kSpecialMask[] = {
  kHighAccentCode,                    // 0
  kHighAccentCode,
  kHighAccentCode,
  kHighAccentCode,
  kHighAlphaCode,                     // 4
  kHighAlphaCode,
  kHighAlphaCode,
  kHighAlphaCode,
  kHighAccentCode,
  kHighAccentCode,

  kTwobyteCode + kEUCJPActive,        // 10 euc-jp
  kTwobyteCode,
  kSevenBitActive + kIso2022Active,   // jis
  kTwobyteCode,
  kTwobyteCode,
  kTwobyteCode,
  kTwobyteCode,
  kSevenBitActive + kUTF1632Active,   // Unicode
  kTwobyteCode,
  kTwobyteCode,

  kTwobyteCode,                       // 20
  kTwobyteCode,
  kUTF8Active,                        // UTF-8
  0,
  0,
  kHighAlphaCode,                     // 25
  kHighAlphaCode,
  kHighAccentCode,
  kHighAlphaCode,
  kHighAccentCode,

  kHighAccentCode,                   // 30
  kHighAccentCode,
  kHighAccentCode,
  kHighAlphaCode,
  kHighAlphaCode,
  kHighAlphaCode,                    // 35
  kHighAlphaCode,
  kHighAlphaCode,
  kHighAlphaCode,
  0,

  0,                                  // 40
  kHighAlphaCode,
  kHighAlphaCode,
  kHighAccentCode,
  kSevenBitActive + kIso2022Active,   // 2022-kr
  kTwobyteCode,
  kTwobyteCode,
  kTwobyteCode,
  kSevenBitActive + kIso2022Active,   // 2022-cn
  kHighAlphaCode + kIsIndicCode,       // 49 TSCII

  kHighAlphaCode + kIsIndicCode,       // 50 TAMIL_MONO
  kHighAlphaCode + kIsIndicCode,       // 51 TAMIL_BI
  kHighAlphaCode + kIsIndicCode,       // 52 JAGRAN
  kHighAccentCode,                     // 53 MACINTOSH_ROMAN
  kSevenBitActive + kUTF7Active,      // 54 UTF-7
  kHighAlphaCode + kIsIndicCode,       // 55 BHASKAR Indic encoding - Devanagari
  kHighAlphaCode + kIsIndicCode,       // 56 HTCHANAKYA Indic encoding - Devanagari
  kSevenBitActive + kUTF1632Active,   // 57 UTF16BE
  kSevenBitActive + kUTF1632Active,   // 58 UTF16LE
  kSevenBitActive + kUTF1632Active,   // 59 UTF32BE
  kSevenBitActive + kUTF1632Active,   // 60 UTF32LE

  kSevenBitActive + kBinaryActive,    // 61 BINARYENC
  kSevenBitActive + kHzActive,        // 62 HZ_GB_2312
  kHighAccentCode + kUTF8Active + kUTF8UTF8Active,      // 63 UTF8UTF8
  kHighAlphaCode + kIsIndicCode,       // 64 Elango - Tamil
  kHighAlphaCode + kIsIndicCode,       // 65 Barani - Tamil
  kHighAlphaCode + kIsIndicCode,       // 66 Shree - Tamil
  kHighAlphaCode + kIsIndicCode,       // 67 TBoomis - Tamil
  kHighAlphaCode + kIsIndicCode,       // 68 TMNews - Tamil
  kHighAlphaCode + kIsIndicCode,       // 69 Webtamil - Tamil
  kTwobyteCode,                       // 70 KDDI Shift_JIS
  kTwobyteCode,                       // 71 DoCoMo Shift_JIS
  kTwobyteCode,                       // 72 SoftBank Shift_JIS
  kSevenBitActive + kIso2022Active,   // 73 KDDI-ISO-2022-JP
  kSevenBitActive + kIso2022Active,   // 74 SOFTBANK-ISO-2022-JP
};

// Compile-time check: one kSpecialMask entry for each of the NUM_ENCODINGS
// encodings.
COMPILE_ASSERT(arraysize(kSpecialMask) == NUM_ENCODINGS,
               kSpecialMask_has_incorrect_size);
732 
733 
734 /***
735   kHighAlphaCode -- full alphabet in 8x-Fx range, not just accents
736 
737   ISO_8859_5,       // 4: Teragram ISO-8859-5 Cyrl      UL bd
738   RUSSIAN_CP1251,   // 26: Teragram CP1251              UL cdef
739   RUSSIAN_KOI8_R,   // 25: Teragram KOI8R               LU cdef
740   RUSSIAN_KOI8_RU,  // 28: CP21866 aka KOI8_RU,         LU cdef
741   RUSSIAN_CP866,     // 42                              89ae
742 
743   ISO_8859_6,       // 5: Teragram Arabic               nocase cde
744   MSFT_CP1256,      // 35: used for Arabic              nocase cde
745 
746   ISO_8859_7,       // 6: Teragram Greek                UL cdef
747   MSFT_CP1253,       // 41: used for Greek              UL cdef
748 
749   ISO_8859_8,       // 7: Teragram Hebrew               nocase ef
750   MSFT_CP1255,      // 36: Logical Hebrew Microsoft     nocase ef
751   ISO_8859_8_I,     // 37: Iso Hebrew Logical           nocase ef
752   HEBREW_VISUAL,    // 38: Iso Hebrew Visual            nocase ef
753 
754   ISO_8859_11,      // 33: aka TIS-620, used for Thai   nocase abcde
755   MSFT_CP874,       // 34: used for Thai                nocase abcde
756 
757   TSCII,             // 49                              8-f
758   TAMIL_MONO,        // 50
759   TAMIL_BI,          // 51
760   JAGRAN,            // 52
761   BHASKAR,           // 55 Indic encoding - Devanagari
762   HTCHANAKYA,        // 56 Indic encoding - Devanagari
763 ***/
764 
// We can scan bytes using this at about 500 MB/sec 2.8GHz P4
// Slow scan uses this, stopping on NUL ESC SO SI bad C0 and + ~
// We allow FF, 0x0C, here because it gives a better result for old
// Ascii text formatted for a TTY
// non-zero exits scan loop -- 1 for printable ASCII, 2 otherwise
//
// Indexed by byte value; each source row below covers 32 consecutive bytes.
// In this table the only 1 entries are '+' (0x2B) and '~' (0x7E).
// HT, LF, FF, CR and the remaining 0x20..0x7D bytes are 0 (keep scanning);
// the other C0 controls, DEL (0x7F) and all bytes 0x80..0xFF are 2.
static const char kTestPrintableAsciiTildePlus[256] = {
  2,2,2,2,2,2,2,2, 2,0,0,2,0,0,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // 00..1F
  0,0,0,0,0,0,0,0, 0,0,0,1,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 20..3F
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 40..5F
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,1,2,  // 60..7F

  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // 80..9F
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // A0..BF
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // C0..DF
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // E0..FF
};
781 
// We can scan bytes using this at about 550 MB/sec 2.8GHz P4
// Slow scan uses this, stopping on NUL ESC SO SI and bad C0
// after Hz and UTF7 are pruned away
// We allow Form Feed, 0x0C, here
//
// Indexed by byte value; each source row below covers 32 consecutive bytes.
// Entries are 0 for HT, LF, FF, CR and 0x20..0x7E (including '+' and '~'),
// and 2 for every other byte. Unlike kTestPrintableAsciiTildePlus, no entry
// is 1.
static const char kTestPrintableAscii[256] = {
  2,2,2,2,2,2,2,2, 2,0,0,2,0,0,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // 00..1F
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 20..3F
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 40..5F
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,2,  // 60..7F

  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // 80..9F
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // A0..BF
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // C0..DF
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // E0..FF
};
797 
// Used in first-four-byte testing
// Indexed by byte value; each source row below covers 32 consecutive bytes.
// Entries are 1 for HT (0x09), LF (0x0A), CR (0x0D) and 0x20..0x7E, and 0 for
// everything else. Note that Form Feed (0x0C) is 0 here, unlike the scan
// tables above, which let it through.
static const char kIsPrintableAscii[256] = {
  0,0,0,0,0,0,0,0, 0,1,1,0,0,1,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 00..1F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 20..3F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,  // 40..5F
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,0,  // 60..7F

  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 80..9F
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // A0..BF
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // C0..DF
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // E0..FF
};
810 
811 
// Indexed by byte value; each source row below covers 16 consecutive bytes.
// Gives the 6-bit value of a base64 digit: 'A'..'Z' -> 0..25,
// 'a'..'z' -> 26..51, '0'..'9' -> 52..61, '+' -> 62, '/' -> 63.
// Every other byte, including '=', maps to -1.
static const signed char kBase64Value[256] = {
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,   // 00..0F
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,   // 10..1F
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,62,-1,-1,-1,63,   // 20..2F  + /
  52,53,54,55,56,57,58,59, 60,61,-1,-1,-1,-1,-1,-1,   // 30..3F  0-9

  -1, 0, 1, 2, 3, 4, 5, 6,  7, 8, 9,10,11,12,13,14,   // 40..4F  A-O
  15,16,17,18,19,20,21,22, 23,24,25,-1,-1,-1,-1,-1,   // 50..5F  P-Z
  -1,26,27,28,29,30,31,32, 33,34,35,36,37,38,39,40,   // 60..6F  a-o
  41,42,43,44,45,46,47,48, 49,50,51,-1,-1,-1,-1,-1,   // 70..7F  p-z

  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,   // 80..FF: none valid
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,

  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
};
833 
834 
835 // Subscripted by <state, byte/16>
836 // Accepts Cx->8x Dx->8x Ex->8x->8x Fx->8x->8x->8x
837 //
838 // Fixed Problem: GB has sequences like B2DB B8D6 BDE1 B9B9
839 // which we can mis-parse as an error byte followed by good UTF-8:
840 //                                      B2 DBB8 D6BD E1B9B9
841 // To counteract this, we now require an ASCII7 byte to resync out
842 // of the error state
843 // Next problem: good UTF-8 with bad byte
844 // efbc a012 eea4 bee7 b280 c2b7
845 // efbca0 12 eea4be e7b280 c2b7
846 //        ^^ bad byte
847 // fix: change state0 byte 1x to be don't-care
848 //
849 // Short UTF-8 ending in ASCII7 byte should resync immediately:
850 // E0 20 E0 A6 AA should give one error and resync at 2nd E0
851 //
// Next-state table. Rows are the current state [0..7]; the 16 columns are the
// byte's high nibble (0x..Fx). In state 0, 8x..Bx go to the error state 7,
// Cx/Dx start a 2-byte sequence, Ex a 3-byte one and Fx a 4-byte one.
// In states 1..7 column 1 (bytes 1x) goes to error; the other 0x..7x columns
// go back to state 0.
static const char kMiniUTF8State[8][16] = {
  {0,0,0,0,0,0,0,0, 7,7,7,7,1,1,2,4,},      // [0] start char (allow cr/lf/ht)
  {0,7,0,0,0,0,0,0, 0,0,0,0,7,7,7,7,},      // [1] continue 1 of 2
  {0,7,0,0,0,0,0,0, 3,3,3,3,7,7,7,7,},      // [2] continue 1 of 3
  {0,7,0,0,0,0,0,0, 0,0,0,0,7,7,7,7,},      // [3] continue 2 of 3
  {0,7,0,0,0,0,0,0, 5,5,5,5,7,7,7,7,},      // [4] continue 1 of 4
  {0,7,0,0,0,0,0,0, 6,6,6,6,7,7,7,7,},      // [5] continue 2 of 4
  {0,7,0,0,0,0,0,0, 0,0,0,0,7,7,7,7,},      // [6] continue 3 of 4
  {0,7,0,0,0,0,0,0, 7,7,7,7,7,7,7,7,},      // [7] error, soak up continues,
                                            // ONLY resync after Ascii char
                                            //     then restart
};
// Counter to increment: 0-don'tcare 1-error 2-good_2B 3-good_3B 4-good_4B
// Same [8][16] shape and row labels as kMiniUTF8State. A good_NB value
// appears only in the row for the last continuation byte of an N-byte
// sequence (rows [1], [3] and [6]), in the 8x..Bx columns.
static const char kMiniUTF8Count[8][16] = {
  {0,0,0,0,0,0,0,0, 1,1,1,1,0,0,0,0,},      // [0] start char (allow cr/lf/ht)
  {1,1,1,1,1,1,1,1, 2,2,2,2,1,1,1,1,},      // [1] continue 1 of 2
  {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,},      // [2] continue 1 of 3
  {1,1,1,1,1,1,1,1, 3,3,3,3,1,1,1,1,},      // [3] continue 2 of 3
  {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,},      // [4] continue 1 of 4
  {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,},      // [5] continue 2 of 4
  {1,1,1,1,1,1,1,1, 4,4,4,4,1,1,1,1,},      // [6] continue 3 of 4
  {0,1,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,},      // [7] error, soak up continues,
                                            //     then restart
};
876 
877 // Subscripted by <state, f(byte1) + g(byte2)>
878 // where f(x)= E2->4, Cx->8 and C3->12 and 0 otherwise
879 // and g(x) = (x >> 4) & 3        8x->0 9x->1 Ax->2 Bx->3 Cx->0, etc.
880 //                                (no checking for illegal bytes)
881 // Here are example patterns of CP1252 converted to UTF-8 0/1/2 times. We want
882 // to detect two, so we can back-convert to one.
883 // zero one    two                 pattern
884 // ---- ------ ----------------    -----------------
885 // 81   C281   C382C281            C3->8x->C2->xx
886 // 98   CB9C   C38BC593            C3->8x->C5->xx
887 // C3   C383   C383C692            C3->8x->C6->xx
888 // C8   C388   C383CB86            C3->8x->CB->xx
889 // 83   C692   C386E28099          C3->8x->E2->xx->8x
890 // 80   E282AC C3A2E2809AC2AC      C3->A2->E2->xx->xx->Cx->xx
891 // 92   E28099 C3A2E282ACE284A2    C3->A2->E2->xx->xx->E2->xx->xx
892 //
893 // We also want to detect bare-byte extra UTF-8 conversions:
894 // zero one    two                 pattern
895 // ---- ------ ----------------    -----------------
896 // C3   C3     C383                C3->8x->C2->xx
897 // D3   D3     C393                C3->9x->C2->xx->C2->xx
898 // E3   E3     C3A3                C3->Ax->C2->xx->C2->xx->C2->xx
// F3   F3     C3B3                C3->Bx->C2->xx->C2->xx->C2->xx->C2->xx
900 //
901 
902 /**
903 CP1252 => UTF8 => UTF8UTF8
904 80 => E282AC => C3A2E2809AC2AC
905 81 => C281 => C382C281
906 82 => E2809A => C3A2E282ACC5A1
907 83 => C692 => C386E28099
908 84 => E2809E => C3A2E282ACC5BE
909 85 => E280A6 => C3A2E282ACC2A6
910 86 => E280A0 => C3A2E282ACC2A0
911 87 => E280A1 => C3A2E282ACC2A1
912 88 => CB86 => C38BE280A0
913 89 => E280B0 => C3A2E282ACC2B0
914 8A => C5A0 => C385C2A0
915 8B => E280B9 => C3A2E282ACC2B9
916 8C => C592 => C385E28099
917 8D => C28D => C382C28D
918 8E => C5BD => C385C2BD
919 8F => C28F => C382C28F
920 90 => C290 => C382C290
921 91 => E28098 => C3A2E282ACCB9C
922 92 => E28099 => C3A2E282ACE284A2
923 93 => E2809C => C3A2E282ACC593
924 94 => E2809D => C3A2E282ACC29D
925 95 => E280A2 => C3A2E282ACC2A2
926 96 => E28093 => C3A2E282ACE2809C
927 97 => E28094 => C3A2E282ACE2809D
928 98 => CB9C => C38BC593
929 99 => E284A2 => C3A2E2809EC2A2
930 9A => C5A1 => C385C2A1
931 9B => E280BA => C3A2E282ACC2BA
932 9C => C593 => C385E2809C
933 9D => C29D => C382C29D
934 9E => C5BE => C385C2BE
935 9F => C5B8 => C385C2B8
936 A0 => C2A0 => C382C2A0
937 A1 => C2A1 => C382C2A1
938 A2 => C2A2 => C382C2A2
939 A3 => C2A3 => C382C2A3
940 A4 => C2A4 => C382C2A4
941 A5 => C2A5 => C382C2A5
942 A6 => C2A6 => C382C2A6
943 A7 => C2A7 => C382C2A7
944 A8 => C2A8 => C382C2A8
945 A9 => C2A9 => C382C2A9
946 AA => C2AA => C382C2AA
947 AB => C2AB => C382C2AB
948 AC => C2AC => C382C2AC
949 AD => C2AD => C382C2AD
950 AE => C2AE => C382C2AE
951 AF => C2AF => C382C2AF
952 B0 => C2B0 => C382C2B0
953 B1 => C2B1 => C382C2B1
954 B2 => C2B2 => C382C2B2
955 B3 => C2B3 => C382C2B3
956 B4 => C2B4 => C382C2B4
957 B5 => C2B5 => C382C2B5
958 B6 => C2B6 => C382C2B6
959 B7 => C2B7 => C382C2B7
960 B8 => C2B8 => C382C2B8
961 B9 => C2B9 => C382C2B9
962 BA => C2BA => C382C2BA
963 BB => C2BB => C382C2BB
964 BC => C2BC => C382C2BC
965 BD => C2BD => C382C2BD
966 BE => C2BE => C382C2BE
967 BF => C2BF => C382C2BF
968 C0 => C380 => C383E282AC
969 C1 => C381 => C383C281
970 C2 => C382 => C383E2809A
971 C3 => C383 => C383C692
972 C4 => C384 => C383E2809E
973 C5 => C385 => C383E280A6
974 C6 => C386 => C383E280A0
975 C7 => C387 => C383E280A1
976 C8 => C388 => C383CB86
977 C9 => C389 => C383E280B0
978 CA => C38A => C383C5A0
979 CB => C38B => C383E280B9
980 CC => C38C => C383C592
981 CD => C38D => C383C28D
982 CE => C38E => C383C5BD
983 CF => C38F => C383C28F
984 D0 => C390 => C383C290
985 D1 => C391 => C383E28098
986 D2 => C392 => C383E28099
987 D3 => C393 => C383E2809C
988 D4 => C394 => C383E2809D
989 D5 => C395 => C383E280A2
990 D6 => C396 => C383E28093
991 D7 => C397 => C383E28094
992 D8 => C398 => C383CB9C
993 D9 => C399 => C383E284A2
994 DA => C39A => C383C5A1
995 DB => C39B => C383E280BA
996 DC => C39C => C383C593
997 DD => C39D => C383C29D
998 DE => C39E => C383C5BE
999 DF => C39F => C383C5B8
1000 E0 => C3A0 => C383C2A0
1001 E1 => C3A1 => C383C2A1
1002 E2 => C3A2 => C383C2A2
1003 E3 => C3A3 => C383C2A3
1004 E4 => C3A4 => C383C2A4
1005 E5 => C3A5 => C383C2A5
1006 E6 => C3A6 => C383C2A6
1007 E7 => C3A7 => C383C2A7
1008 E8 => C3A8 => C383C2A8
1009 E9 => C3A9 => C383C2A9
1010 EA => C3AA => C383C2AA
1011 EB => C3AB => C383C2AB
1012 EC => C3AC => C383C2AC
1013 ED => C3AD => C383C2AD
1014 EE => C3AE => C383C2AE
1015 EF => C3AF => C383C2AF
1016 F0 => C3B0 => C383C2B0
1017 F1 => C3B1 => C383C2B1
1018 F2 => C3B2 => C383C2B2
1019 F3 => C3B3 => C383C2B3
1020 F4 => C3B4 => C383C2B4
1021 F5 => C3B5 => C383C2B5
1022 F6 => C3B6 => C383C2B6
1023 F7 => C3B7 => C383C2B7
1024 F8 => C3B8 => C383C2B8
1025 F9 => C3B9 => C383C2B9
1026 FA => C3BA => C383C2BA
1027 FB => C3BB => C383C2BB
1028 FC => C3BC => C383C2BC
1029 FD => C3BD => C383C2BD
1030 FE => C3BE => C383C2BE
1031 FF => C3BF => C383C2BF
1032 **/
1033 
1034 // Subscripted by <state, f(byte1) + g(byte2)>
1035 // where f(x)= E2->4, C2/5/6/B->8 and C3->12 and 0 otherwise
1036 // and g(x) = (x >> 4) & 3        8x->0 9x->1 Ax->2 Bx->3 Cx->0, etc.
1037 
1038 // 81   C281   C382C281            C3->8x->C2->xx
1039 // 98   CB9C   C38BC593            C3->8x->C5->xx
1040 // C3   C383   C383C692            C3->8x->C6->xx
1041 // C8   C388   C383CB86            C3->8x->CB->xx
1042 //                                 [0]     [2]   [0]
1043 // 83   C692   C386E28099          C3->8x->E2->xx->xx
1044 //   odd_byte=0                    [0]     [2]       [0+]  odd_byte flipped
1045 //   odd_byte=1                    [0+]    [2] [0]   [0]   odd_byte unflipped
1046 // 80   E282AC C3A2E2809AC2AC      C3->A2->E2->xx->xx->Cx->xx
1047 //   odd_byte=0                    [0]     [3]         [4]   [0+]
1048 //   odd_byte=1                    [0+]    [3] [4]     [4]   [0]
1049 // 92   E28099 C3A2E282ACE284A2    C3->A2->E2->xx->xx->E2->xx->xx
1050 //   odd_byte=0                    [0]     [3]         [4] [0]   [0]
1051 //   odd_byte=1                    [0+]    [3] [4]     [4]       [0+]
1052 //
1053 // When an E2xxxx sequence is encountered, we absorb the two bytes E2xx and flip
1054 // the odd_byte state. If that goes from 0 to 1, the next pair is offset up
1055 // by one byte, picking up the two bytes just after E2xxxx. If odd_byte goes
1056 // from 1 to 0, the next two bytes picked up are the two bytes xxxx of E2xxxx.
1057 // These are absorbed with no error in state 0 or state 4
1058 //
1059 // C3   C3     C383                C3->8x->C2->xx
1060 // D3   D3     C393                C3->9x->C2->xx->C2->xx
1061 // E3   E3     C3A3                C3->Ax->C2->xx->C2->xx->C2->xx
// F3   F3     C3B3                C3->Bx->C2->xx->C2->xx->C2->xx->C2->xx
1063 // Counter3 for Fx Ex sequences is incremented at last C2
1064 
// Next-state table for the UTF8UTF8 (doubly-converted) mini state machine.
// Rows are the current state [0..7]; the column is the byte-pair subscript
// computed by UTF88Sub(): columns 0-3 any other lead byte, 4-7 lead E2,
// 8-11 lead C2/C5/C6/CB, 12-15 lead C3, with the low two bits taken from
// bits 5..4 of the second byte.
static const char kMiniUTF8UTF8State[8][16] = {
  // xxxx  E2xx     CXxx    C3xx
  //       8 9 a b  8 9 a b 8 9 a b
  {0,0,0,0,1,1,1,1, 1,1,1,1,2,2,3,5,},      // [0] looking for C38x/C3Ax/2020/8x8x, or err
  {0,0,0,0,1,1,1,1, 1,1,1,1,2,2,3,5,},      // [1] error, back to looking
  {1,1,1,1,0,0,0,0, 0,0,0,0,1,1,1,1,},      // [2] C38x looking for CXxx/E2xxxx
  //       + + + +                          //      E2xxxx flips odd_byte
  {1,1,1,1,4,4,4,4, 7,7,7,7,1,1,1,1,},      // [3] C3Ax looking for E2xx or C2xxC2xx
  //       + + + +                          //      E2xxxx flips odd_byte
  {4,4,4,4,0,0,0,0, 0,0,0,0,1,1,1,1,},      // [4] C3AxE2xx-- looking for C2xx/E2xxxx
  //       + + + +                          //      E2xxxx flips odd_byte
  {1,1,1,1,1,1,1,1, 6,6,6,6,1,1,1,1,},      // [5] C3Bx -- looking for C2xxC2xxC2xx
  {1,1,1,1,1,1,1,1, 7,7,7,7,1,1,1,1,},      // [6] C3Bx -- looking for C2xxC2xx
  {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,},      // [7] C3Bx -- looking for C2xx
};
// Counter to increment: 0-don'tcare 1-error 2-good_2B 3-good_3B 4-good_4B
// Same [8][16] shape, row labels and column grouping as kMiniUTF8UTF8State:
// rows are the current state, columns the UTF88Sub() byte-pair subscript.
static const char kMiniUTF8UTF8Count[8][16] = {
  // xxxx  E2xx     C2Xx    C3xx
  //       8 9 a b  8 9 a b 8 9 a b
  {0,0,0,0,1,1,1,1, 1,1,1,1,0,0,0,0,},      // [0] looking for C38x/C3Ax/2020/8x8x, or err
  {0,0,0,0,1,1,1,1, 1,1,1,1,0,0,0,0,},      // [1] error, back to looking
  {1,1,1,1,3,3,3,3, 2,2,2,2,1,1,1,1,},      // [2] C38x looking for CXxx/E2xxxx
  //       + + + +                          //      E2xxxx flips odd_byte
  {1,1,1,1,0,0,0,0, 0,0,0,0,1,1,1,1,},      // [3] C3Ax looking for E2xx
  //       + + + +                          //      E2xxxx flips odd_byte
  {1,1,1,1,4,4,4,4, 4,4,4,4,1,1,1,1,},      // [4] C3AxE2xx-- looking for C2xx/E2xxxx
  //       + + + +                          //      E2xxxx flips odd_byte
  {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,},      // [5] C3Bx -- looking for C2xxC2xxC2xx
  {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,},      // [6] C3Bx -- looking for C2xxC2xx
  {1,1,1,1,1,1,1,1, 3,3,3,3,1,1,1,1,},      // [7] C3Bx -- looking for C2xx
};
1096 
// Same [8][16] shape, row labels and column grouping as the two UTF8UTF8
// tables above. The only nonzero entries are the E2xx columns (4-7) of states
// [2], [3] and [4], i.e. the places the row comments mark as
// "E2xxxx flips odd_byte".
// NOTE(review): presumably a 1 here tells the scanner to toggle
// utf8utf8_odd_byte -- confirm against the code that reads this table.
static const char kMiniUTF8UTF8Odd[8][16] = {
  // xxxx  E2xx     C2Xx    C3xx
  //       8 9 a b  8 9 a b 8 9 a b
  {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,},      // [0] looking for C38x/C3Ax/2020/8x8x, or err
  {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,},      // [1] error, back to looking
  {0,0,0,0,1,1,1,1, 0,0,0,0,0,0,0,0,},      // [2] C38x looking for CXxx/E2xxxx
  //       + + + +                          //      E2xxxx flips odd_byte
  {0,0,0,0,1,1,1,1, 0,0,0,0,0,0,0,0,},      // [3] C3Ax looking for E2xx
  //       + + + +                          //      E2xxxx flips odd_byte
  {0,0,0,0,1,1,1,1, 0,0,0,0,0,0,0,0,},      // [4] C3AxE2xx-- looking for C2xx/E2xxxx
  //       + + + +                          //      E2xxxx flips odd_byte
  {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,},      // [5] C3Bx -- looking for C2xxC2xxC2xx
  {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,},      // [6] C3Bx -- looking for C2xxC2xx
  {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,},      // [7] C3Bx -- looking for C2xx
};
1112 
// Turn a pair of bytes into the subscript (0..15) for UTF8UTF8 tables above.
// The lead byte s0 selects a group of four columns:
//   C3 -> 12, C2/C5/C6/CB -> 8, E2 -> 4, anything else -> 0
// and bits 5..4 of the second byte s1 select the column within the group.
int UTF88Sub(char s0, char s1) {
  int group = 0;
  switch (static_cast<unsigned char>(s0)) {
    case 0xc3:
      group = 12;
      break;
    case 0xc2:
    case 0xc5:
    case 0xc6:
    case 0xcb:
      group = 8;
      break;
    case 0xe2:
      group = 4;
      break;
    default:
      // Other lead bytes, including the remaining Cx values, add nothing
      break;
  }
  return ((s1 >> 4) & 0x03) + group;
}
1128 
1129 
1130 
1131 
1132 
1133 // Default probability for an encoding rankedencoding
1134 // Based on a scan of 55M web pages
1135 // These values are 255 - log base 2**1/10 (occurrences / total)
// Large values are most likely. This is the reverse of some Google code
1137 // 255 = 1.0, 245 = 1/2, 235 = 1/4, 15 = 1/2**24, 0 = 0 (< 1/50M)
1138 //
1139 // TODO change this to be per encoding, not permuted
1140 //
1141 
1142 
1143 // Support function for unit test program
1144 // Return ranked encoding corresponding to enc
1145 // (also exported to compact_enc_det_text.cc)
BackmapEncodingToRankedEncoding(Encoding enc)1146 int CompactEncDet::BackmapEncodingToRankedEncoding(Encoding enc) {
1147   for (int i = 0; i < NUM_RANKEDENCODING; ++i) {
1148     if (kMapToEncoding[i] == enc) {
1149       return i;
1150     }
1151   }
1152   return -1;
1153 }
1154 
1155 
DecodeActive(uint32 active)1156 string DecodeActive(uint32 active) {
1157   string temp("");
1158   if (active & kBinaryActive) {
1159     temp.append("Binary ");
1160   }
1161   if (active & kUTF1632Active) {
1162     temp.append("UTF1632 ");
1163   }
1164   if (active & kUTF8UTF8Active) {
1165     temp.append("UTF8UTF8 ");
1166   }
1167   if (active & kUTF8Active) {
1168     temp.append("UTF8 ");
1169   }
1170   if (active & kIso2022Active) {
1171     temp.append("Iso2022 ");
1172   }
1173   if (active & kHzActive) {
1174     temp.append("Hz ");
1175   }
1176   if (active & kUTF7Active) {
1177     temp.append("UTF7A ");
1178   }
1179   if (active & kSevenBitActive) {
1180     temp.append("SevenBit ");
1181   }
1182   if (active & kIsIndicCode) {
1183     temp.append("Indic ");
1184   }
1185   if (active & kHighAlphaCode) {
1186     temp.append("HighAlpha ");
1187   }
1188   if (active & kHighAccentCode) {
1189     temp.append("HighAccent ");
1190   }
1191   if (active & kEUCJPActive) {
1192     temp.append("EUCJP ");
1193   }
1194   return temp;
1195 }
1196 
SevenBitEncoding(int enc)1197 static inline bool SevenBitEncoding(int enc) {
1198   return ((kSpecialMask[enc] & kSevenBitActive) != 0);
1199 }
TwoByteEncoding(int enc)1200 static inline bool TwoByteEncoding(int enc) {
1201   return ((kSpecialMask[enc] & kTwobyteCode) != 0);
1202 }
IndicEncoding(int enc)1203 static inline bool IndicEncoding(int enc) {
1204   return ((kSpecialMask[enc] & kIsIndicCode) != 0);
1205 }
HighAlphaEncoding(int enc)1206 static inline bool HighAlphaEncoding(int enc) {
1207   return ((kSpecialMask[enc] & kHighAlphaCode) != 0);
1208 }
HighAccentEncoding(int enc)1209 static inline bool HighAccentEncoding(int enc) {
1210   return ((kSpecialMask[enc] & kHighAccentCode) != 0);
1211 }
1212 
1213 
AnyActive(DetectEncodingState * destatep)1214 static inline bool AnyActive(DetectEncodingState* destatep) {
1215   return (destatep->active_special != 0);
1216 }
SevenBitActive(DetectEncodingState * destatep)1217 static inline bool SevenBitActive(DetectEncodingState* destatep) {
1218   return (destatep->active_special & kSevenBitActive) != 0;
1219 }
HzActive(DetectEncodingState * destatep)1220 static inline bool HzActive(DetectEncodingState* destatep) {
1221   return (destatep->active_special & kHzActive) != 0;
1222 }
Iso2022Active(DetectEncodingState * destatep)1223 static inline bool Iso2022Active(DetectEncodingState* destatep) {
1224   return (destatep->active_special & kIso2022Active) != 0;
1225 }
UTF8Active(DetectEncodingState * destatep)1226 static inline bool UTF8Active(DetectEncodingState* destatep) {
1227   return (destatep->active_special & kUTF8Active) != 0;
1228 }
UTF8UTF8Active(DetectEncodingState * destatep)1229 static inline bool UTF8UTF8Active(DetectEncodingState* destatep) {
1230   return (destatep->active_special & kUTF8UTF8Active) != 0;
1231 }
UTF1632Active(DetectEncodingState * destatep)1232 static inline bool UTF1632Active(DetectEncodingState* destatep) {
1233   return (destatep->active_special & kUTF1632Active) != 0;
1234 }
BinaryActive(DetectEncodingState * destatep)1235 static inline bool BinaryActive(DetectEncodingState* destatep) {
1236   return (destatep->active_special & kBinaryActive) != 0;
1237 }
UTF7OrHzActive(DetectEncodingState * destatep)1238 static inline bool UTF7OrHzActive(DetectEncodingState* destatep) {
1239   return (destatep->active_special & (kHzActive + kUTF7Active)) != 0;
1240 }
EUCJPActive(DetectEncodingState * destatep)1241 static inline bool EUCJPActive(DetectEncodingState* destatep) {
1242   return ((destatep->active_special & kEUCJPActive) != 0);
1243 }
OtherActive(DetectEncodingState * destatep)1244 static inline bool OtherActive(DetectEncodingState* destatep) {
1245   return (destatep->active_special & (kIso2022Active + kBinaryActive +
1246                                       kUTF8Active + kUTF8UTF8Active +
1247                                       kUTF1632Active + kEUCJPActive)) != 0;
1248 }
1249 
1250 
CEDFlagRescanning(CEDInternalFlags flags)1251 static inline bool CEDFlagRescanning(CEDInternalFlags flags) {
1252   return (flags & kCEDRescanning) != 0;
1253 }
1254 
CEDFlagForceTags(CEDInternalFlags flags)1255 static inline bool CEDFlagForceTags(CEDInternalFlags flags) {
1256   return (flags & kCEDForceTags) != 0;
1257 }
1258 
1259 
// Larger of two ints
static inline int maxint(int a, int b) {
  if (a > b) {
    return a;
  }
  return b;
}
// Smaller of two ints
static inline int minint(int a, int b) {
  if (a < b) {
    return a;
  }
  return b;
}
1262 
MyRankedEncName(int r_enc)1263 static inline const char* MyRankedEncName(int r_enc) {
1264   return MyEncodingName(kMapToEncoding[r_enc]);
1265 }
1266 
1267 
// Only for debugging. not thread safe
// State shared by the PsSource* debug-dump routines below.
// NOTE(review): kPsSourceWidth is not referenced in the nearby code; the
// width actually used is whatever is passed to PsSourceInit -- confirm.
static const int kPsSourceWidth = 32;
static int pssourcenext = 0;    // debug only. not threadsafe. dump only >= this
static int pssourcewidth = 0;   // debug only. Source bytes per dumped line
// Marks for the most recently dumped line, two characters per source byte.
// Allocated by PsSourceInit, released by PsSourceFinish.
static char* pssource_mark_buffer = NULL;
// Count of lines dumped by PsSource since PsSourceInit
int next_do_src_line;
// Source offsets of the last 16 dumped lines, indexed by line count mod 16
int do_src_offset[16];
1275 
1276 
PsSourceInit(int len)1277 void PsSourceInit(int len) {
1278    pssourcenext = 0;
1279    pssourcewidth = len;
1280    delete[] pssource_mark_buffer;
1281    // Allocate 2 Ascii characters per input byte
1282    pssource_mark_buffer = new char[(pssourcewidth * 2) + 8];  // 8 = overscan
1283    memset(pssource_mark_buffer, ' ', pssourcewidth * 2);
1284    memset(pssource_mark_buffer + (pssourcewidth * 2), '\0', 8);
1285 
1286    next_do_src_line = 0;
1287    memset(do_src_offset, 0, sizeof(do_src_offset));
1288 }
1289 
PsSourceFinish()1290 void PsSourceFinish() {
1291   // Print preceding mark buffer
1292   int j = (pssourcewidth * 2) - 1;
1293   while ((0 <= j) && (pssource_mark_buffer[j] == ' ')) {--j;}   // trim
1294   pssource_mark_buffer[j + 1] = '\0';
1295   fprintf(stderr, "(      %s) do-src\n", pssource_mark_buffer);
1296   memset(pssource_mark_buffer, ' ', pssourcewidth * 2);
1297   memset(pssource_mark_buffer + (pssourcewidth * 2), '\0', 8);
1298 
1299   delete[] pssource_mark_buffer;
1300   pssource_mark_buffer = NULL;
1301 }
1302 
1303 // Dump aligned len bytes src... if not already dumped
PsSource(const uint8 * src,const uint8 * isrc,const uint8 * srclimit)1304 void PsSource(const uint8* src, const uint8* isrc, const uint8* srclimit) {
1305   int offset = src - isrc;
1306   offset -= (offset % pssourcewidth);     // round down to multiple of len bytes
1307   if (offset < pssourcenext) {
1308     return;
1309   }
1310   pssourcenext = offset + pssourcewidth;  // Min offset for next dump
1311 
1312   // Print preceding mark buffer
1313   int j = (pssourcewidth * 2) - 1;
1314   while ((0 <= j) && (pssource_mark_buffer[j] == ' ')) {--j;}   // trim
1315   pssource_mark_buffer[j + 1] = '\0';
1316   fprintf(stderr, "(      %s) do-src\n", pssource_mark_buffer);
1317   memset(pssource_mark_buffer, ' ', pssourcewidth * 2);
1318   memset(pssource_mark_buffer + (pssourcewidth * 2), '\0', 8);
1319 
1320   // Print source bytes
1321   const uint8* src_aligned = isrc + offset;
1322   int length = srclimit - src_aligned;
1323   length = minint(pssourcewidth, length);
1324 
1325   fprintf(stderr, "(%05x ", offset);
1326   for (int i = 0; i < length; ++i) {
1327     char c = src_aligned[i];
1328     if (c == '\n') {c = ' ';}
1329     if (c == '\r') {c = ' ';}
1330     if (c == '\t') {c = ' ';}
1331     if (c == '(') {
1332       fprintf(stderr, "%s", "\\( ");
1333     } else if (c == ')') {
1334       fprintf(stderr, "%s", "\\) ");
1335     } else if (c == '\\') {
1336       fprintf(stderr, "%s", "\\\\ ");
1337     } else if ((0x20 <= c) && (c <= 0x7e)) {
1338       fprintf(stderr, "%c ", c);
1339     } else {
1340       fprintf(stderr, "%02x", c);
1341     }
1342   }
1343   fprintf(stderr, ") do-src\n");
1344   // Remember which source offsets are where, mod 16
1345   do_src_offset[next_do_src_line & 0x0f] = offset;
1346   ++next_do_src_line;
1347 }
1348 
1349 // Mark bytes in just-previous source bytes
PsMark(const uint8 * src,int len,const uint8 * isrc,int weightshift)1350 void PsMark(const uint8* src, int len, const uint8* isrc, int weightshift) {
1351   int offset = src - isrc;
1352   offset = (offset % pssourcewidth);     // mod len bytes
1353   char mark = (weightshift == 0) ? '-' : 'x';
1354 
1355   pssource_mark_buffer[(offset * 2)] = '=';
1356   pssource_mark_buffer[(offset * 2) + 1] = '=';
1357   for (int i = 1; i < len; ++i) {
1358     pssource_mark_buffer[(offset + i) * 2] = mark;
1359     pssource_mark_buffer[((offset + i) * 2) + 1] = mark;
1360   }
1361 }
1362 
1363 
1364 // Highlight trigram bytes in just-previous source bytes
1365 // Unfortunately, we have to skip back N lines since source was printed for
1366 // up to 8 bigrams before we get here. Match on src+1 to handle 0/31 better
PsHighlight(const uint8 * src,const uint8 * isrc,int trigram_val,int n)1367 void PsHighlight(const uint8* src, const uint8* isrc, int trigram_val, int n) {
1368   int offset = (src + 1) - isrc;
1369   int offset32 = (offset % pssourcewidth);    // mod len bytes
1370   offset -= offset32;                     // round down to multiple of len bytes
1371 
1372   for (int i = 1; i <= 16; ++i) {
1373     if (do_src_offset[(next_do_src_line - i) & 0x0f] == offset) {
1374       fprintf(stderr, "%d %d %d do-highlight%d\n",
1375               i, offset32 - 1, trigram_val, n);
1376       break;
1377     }
1378   }
1379 }
1380 
1381 
InitDetectEncodingState(DetectEncodingState * destatep)1382 void InitDetectEncodingState(DetectEncodingState* destatep) {
1383   destatep->initial_src = NULL;       // Filled in by caller
1384   destatep->limit_src = NULL;
1385   destatep->prior_src = NULL;
1386   destatep->last_pair = NULL;
1387 
1388   destatep->debug_data = NULL;
1389   destatep->next_detail_entry = 0;
1390 
1391   destatep->done = false;
1392   destatep->reliable = false;
1393   destatep->hints_derated = false;
1394   //destatep->declared_enc_1 init in ApplyHints
1395   //destatep->declared_enc_2 init in ApplyHints
1396   destatep->prune_count = 0;
1397 
1398   destatep->trigram_highwater_mark = 0;
1399   destatep->looking_for_latin_trigrams = false;
1400   destatep->do_latin_trigrams = false;
1401 
1402   // Miscellaneous state variables for difficult encodings
1403   destatep->binary_quadrants_count = 0;
1404   destatep->binary_8x4_count = 0;
1405   destatep->binary_quadrants_seen = 0;
1406   destatep->binary_8x4_seen = 0;
1407   destatep->utf7_starts = 0;
1408   destatep->prior_utf7_offset = 0;
1409   destatep->next_utf8_ministate = 0;
1410   for (int i = 0; i < 6; i++) {destatep->utf8_minicount[i] = 0;}
1411   destatep->next_utf8utf8_ministate = 0;
1412   destatep->utf8utf8_odd_byte = 0;
1413   for (int i = 0; i < 6; i++) {destatep->utf8utf8_minicount[i] = 0;}
1414   destatep->next_2022_state = SOSI_NONE;
1415   destatep->next_hz_state = SOSI_NONE;
1416   destatep->next_eucjp_oddphase = false;
1417   for (int i = 0; i < 8; i++) {destatep->byte32_count[i] = 0;}
1418   destatep->active_special = 0xffffffff;
1419   destatep->tld_hint = UNKNOWN_ENCODING;
1420   destatep->http_hint = UNKNOWN_ENCODING;
1421   destatep->meta_hint = UNKNOWN_ENCODING;
1422   destatep->bom_hint = UNKNOWN_ENCODING;
1423   destatep->top_rankedencoding = 0;         // ASCII [seven-bit] is the default
1424   destatep->second_top_rankedencoding = 0;  // ASCII [seven-bit] is the default
1425   destatep->top_prob = -1;
1426   destatep->second_top_prob = -1;
1427   // This is wide for first pruning, shrinks for 2nd and later
1428   destatep->prune_difference = kInititalPruneDifference;
1429 
1430   destatep->next_prior_bigram = 0;
1431   destatep->prior_bigram[0] = -1;
1432   destatep->prior_bigram[1] = -1;
1433   destatep->prior_bigram[2] = -1;
1434   destatep->prior_bigram[3] = -1;
1435 
1436   destatep->prior_binary[0] = -1;
1437 
1438   // Initialize with all but Indic encodings, which we never detect
1439   int k = 0;
1440   for (int rankedencoding = 0;
1441         rankedencoding < NUM_RANKEDENCODING;
1442         rankedencoding++) {
1443     Encoding enc = kMapToEncoding[rankedencoding];
1444     if (!IndicEncoding(enc)) {
1445       destatep->rankedencoding_list[k++] = rankedencoding;
1446     }
1447   }
1448   destatep->rankedencoding_list_len = k;
1449 
1450   // This is where all the action is
1451   memset(destatep->enc_prob, 0, sizeof(destatep->enc_prob));
1452 
1453   memset(destatep->hint_prob, 0, sizeof(destatep->hint_prob));
1454   memset(destatep->hint_weight, 0, sizeof(destatep->hint_weight));
1455 
1456   destatep->prior_interesting_pair[AsciiPair] = 0;
1457   destatep->prior_interesting_pair[OtherPair] = 0;
1458   destatep->next_interesting_pair[AsciiPair] = 0;
1459   destatep->next_interesting_pair[OtherPair] = 0;
1460   // interesting_pairs/offsets/weightshifts not initialized; no need
1461 }
1462 
1463 // Probability strings are uint8, with zeros removed via simple run-length:
1464 //  (<skip-take byte> <data bytes>)*
1465 // skip-take:
1466 //  00  end
1467 //  x0  skip 16 x locations, take 0 data values
1468 //  xy  skip x locations, take y data values
1469 // Multiply all the incoming values by 3 to account for 3x unigram sums
1470 //
1471 // {{0x77,0x69,0x6e,0x64,0x31,0x32,0x35,0x35,
1472 //   0x01,0xc2,0x10,0x41,0xfe,0x71,0xba,0x00,}}, // "wind1255"
1473 //
1474 // Weight is 0..100 percent
1475 //
1476 // Returns subscript of largest (most probable) value
1477 //
1478 
1479 
1480 //  {{0x6e,0x6c,0x5f,0x5f, 0x05,0xb2,0xae,0xa0,0x32,0xa1,0x36,0x31,0x42,0x39,0x3b,0x33,0x45,0x11,0x6f,0x00,}}, // "nl__"
1481 //        // ASCII-7-bit=178  Latin1=174  UTF8=160  GB=50  CP1252=161  BIG5=49  Latin2=66  CP1251=57  CP1256=59  CP1250=51  Latin5=69  ISO-8859-15=111  [top ASCII-7-bit]
ApplyCompressedProb(const unsigned char * iprob,int len,int weight,DetectEncodingState * destatep)1482 int ApplyCompressedProb(const unsigned char* iprob, int len,
1483                          int weight, DetectEncodingState* destatep) {
1484   int* dst = &destatep->enc_prob[0];
1485   int* dst2 = &destatep->hint_weight[0];
1486   const uint8* prob = reinterpret_cast<const uint8*>(iprob);
1487   const uint8* problimit = prob + len;
1488 
1489   int largest = -1;
1490   int subscript_of_largest = 0;
1491 
1492   // Continue with first byte and subsequent ones
1493   while (prob < problimit) {
1494     int skiptake = *prob++;
1495     int skip = (skiptake & 0xf0) >> 4;
1496     int take = skiptake & 0x0f;
1497     if (skiptake == 00) {
1498       break;
1499     } else if (take == 0) {
1500       dst += (skip << 4);
1501       dst2 += (skip << 4);
1502     } else {
1503       dst += skip;    // Normal case
1504       dst2 += skip;  // Normal case
1505       for (int i = 0; i < take; i++) {
1506         int enc = static_cast<int>(dst - &destatep->enc_prob[0]) + i;
1507         if (largest < prob[i]) {
1508           largest = prob[i];
1509           subscript_of_largest = enc;
1510         }
1511 
1512         int increment = prob[i] * 3;    // The actual increment
1513 
1514         // Do maximum of previous hints plus this new one
1515         if (weight > 0) {
1516           increment = (increment * weight)  / 100;
1517           dst[i] = maxint(dst[i], increment);
1518           dst2[i] = 1;              // New total weight
1519         }
1520       }
1521       prob += take;
1522       dst += take;
1523       dst2 += take;
1524     }
1525   }
1526   return subscript_of_largest;
1527 }
1528 
1529 
// Returns subscript of largest (most probable) value [for unit test]
//
// iprob/len is a compressed probability vector in skip-take form: each
// control byte xy means skip x subscripts then take y data bytes, x0 means
// skip 16*x subscripts, and 00 ends the vector.
// The first occurrence of the largest data value wins. Returns 0 if the
// vector holds no non-zero data value.
// A run that is cut off by len is decoded only up to len; nothing is read
// at or beyond iprob+len.
int TopCompressedProb(const unsigned char* iprob, int len) {
  const unsigned char* prob = iprob;
  const unsigned char* problimit = prob + len;
  int next_prob_sub = 0;
  int topprob = 0;
  int toprankenc = 0;

  while (prob < problimit) {
    int skiptake = *prob++;
    if (skiptake == 0) {
      break;
    }
    int skip = (skiptake & 0xf0) >> 4;
    int take = skiptake & 0x0f;

    if (take == 0) {
      // Long skip, no data bytes follow
      next_prob_sub += (skip << 4);
      continue;
    }

    next_prob_sub += skip;    // Normal case

    // Never take more values than there are bytes left in the vector
    int bytes_left = static_cast<int>(problimit - prob);
    if (take > bytes_left) {
      take = bytes_left;
    }

    for (int i = 0; i < take; i++) {
      if (topprob < prob[i]) {
        topprob = prob[i];
        toprankenc = next_prob_sub + i;
      }
    }
    prob += take;
    next_prob_sub += take;
  }
  return toprankenc;
}
1560 
1561 
1562 // Find subscript of matching key in first 8 bytes of sorted hint array, or -1
HintBinaryLookup8(const HintEntry * hintprobs,int hintprobssize,const char * norm_key)1563 int HintBinaryLookup8(const HintEntry* hintprobs, int hintprobssize,
1564                      const char* norm_key) {
1565   // Key is always in range [lo..hi)
1566   int lo = 0;
1567   int hi = hintprobssize;
1568   while (lo < hi) {
1569     int mid = (lo + hi) >> 1;
1570     int comp = memcmp(&hintprobs[mid].key_prob[0], norm_key, 8);
1571     if (comp < 0) {
1572       lo = mid + 1;
1573     } else if (comp > 0) {
1574       hi = mid;
1575     } else {
1576       return mid;
1577     }
1578   }
1579   return -1;
1580 }
1581 
1582 // Find subscript of matching key in first 4 bytes of sorted hint array, or -1
HintBinaryLookup4(const HintEntry * hintprobs,int hintprobssize,const char * norm_key)1583 int HintBinaryLookup4(const HintEntry* hintprobs, int hintprobssize,
1584                      const char* norm_key) {
1585   // Key is always in range [lo..hi)
1586   int lo = 0;
1587   int hi = hintprobssize;
1588   while (lo < hi) {
1589     int mid = (lo + hi) >> 1;
1590     int comp = memcmp(&hintprobs[mid].key_prob[0], norm_key, 4);
1591     if (comp < 0) {
1592       lo = mid + 1;
1593     } else if (comp > 0) {
1594       hi = mid;
1595     } else {
1596       return mid;
1597     }
1598   }
1599   return -1;
1600 }
1601 
Boost(DetectEncodingState * destatep,int r_enc,int boost)1602 static inline void Boost(DetectEncodingState* destatep, int r_enc, int boost) {
1603   destatep->enc_prob[r_enc] += boost;
1604 }
1605 
Whack(DetectEncodingState * destatep,int r_enc,int whack)1606 static inline void Whack(DetectEncodingState* destatep, int r_enc, int whack) {
1607   destatep->enc_prob[r_enc] -= whack;
1608 }
1609 
1610 // Apply initial probability hint based on top level domain name
1611 // Weight is 0..100 percent
1612 // Return 1 if name match found
ApplyTldHint(const char * url_tld_hint,int weight,DetectEncodingState * destatep)1613 int ApplyTldHint(const char* url_tld_hint, int weight,
1614                   DetectEncodingState* destatep) {
1615   if (url_tld_hint[0] == '~') {
1616     return 0;
1617   }
1618   string normalized_tld = MakeChar4(string(url_tld_hint));
1619   int n = HintBinaryLookup4(kTLDHintProbs, kTLDHintProbsSize,
1620                            normalized_tld.c_str());
1621   if (n >= 0) {
1622     // TLD is four bytes, probability table is ~12 bytes
1623     int best_sub = ApplyCompressedProb(&kTLDHintProbs[n].key_prob[kMaxTldKey],
1624                                        kMaxTldVector, weight, destatep);
1625     // Never boost ASCII7; do CP1252 instead
1626     if (best_sub == F_ASCII_7_bit) {best_sub = F_CP1252;}
1627     destatep->declared_enc_1 = best_sub;
1628     if (destatep->debug_data != NULL) {
1629       // Show TLD hint
1630       SetDetailsEncProb(destatep, 0, best_sub, url_tld_hint);
1631     }
1632     return 1;
1633   }
1634   return 0;
1635 }
1636 
1637 // Apply initial probability hint based on charset= name
1638 // Weight is 0..100 percent
1639 // Return 1 if name match found
ApplyCharsetHint(const char * charset_hint,int weight,DetectEncodingState * destatep)1640 int ApplyCharsetHint(const char* charset_hint, int weight,
1641                       DetectEncodingState* destatep) {
1642   if (charset_hint[0] == '~') {
1643     return 0;
1644   }
1645   string normalized_charset = MakeChar44(string(charset_hint));
1646   int n = HintBinaryLookup8(kCharsetHintProbs, kCharsetHintProbsSize,
1647                            normalized_charset.c_str());
1648   if (n >= 0) {
1649     // Charset is eight bytes, probability table is ~eight bytes
1650     int best_sub = ApplyCompressedProb(&kCharsetHintProbs[n].key_prob[kMaxCharsetKey],
1651                                        kMaxCharsetVector, weight, destatep);
1652     // Never boost ASCII7; do CP1252 instead
1653     if (best_sub == F_ASCII_7_bit) {best_sub = F_CP1252;}
1654     destatep->declared_enc_1 = best_sub;
1655 
1656     // If first explicitly declared charset is confusable with Latin1/1252, put
1657     // both declared forms in declared_enc_*, displacing Latin1/1252.
1658     // This avoids a bit of Latin1 creep.
1659     // Also boost the declared encoding and its pair
1660     // TODO: This should all be folded into postproc-enc-detect.cc
1661     if ((destatep->http_hint == UNKNOWN_ENCODING) &&
1662         (destatep->meta_hint == UNKNOWN_ENCODING)) {
1663       // This is the first charset=hint
1664       switch (best_sub) {
1665       case F_Latin2:            // 8859-2 Latin2, east euro
1666         destatep->declared_enc_2 = F_CP1250;
1667         Boost(destatep, F_Latin2, kGentleOnePair);
1668         Boost(destatep, F_CP1250, kGentleOnePair);
1669         break;
1670       case F_CP1250:
1671         destatep->declared_enc_2 = F_Latin2;
1672         Boost(destatep, F_Latin2, kGentleOnePair);
1673         Boost(destatep, F_CP1250, kGentleOnePair);
1674         break;
1675 
1676       case F_Latin3:            // 8859-3 Latin3, south euro, Esperanto
1677         destatep->declared_enc_2 = F_ASCII_7_bit;
1678         Boost(destatep, F_Latin3, kGentleOnePair);
1679         break;
1680 
1681       case F_Latin4:            // 8859-4 Latin4, north euro
1682         destatep->declared_enc_2 = F_ASCII_7_bit;
1683         Boost(destatep, F_Latin4, kGentleOnePair);
1684         break;
1685 
1686       case F_ISO_8859_5:        // 8859-5 Cyrillic
1687         destatep->declared_enc_2 = F_ASCII_7_bit;       // Don't boost 1251
1688         Boost(destatep, F_ISO_8859_5, kGentleOnePair);  // (too different)
1689         break;
1690       case F_CP1251:
1691         destatep->declared_enc_2 = F_ASCII_7_bit;       // Don't boost -5
1692         Boost(destatep, F_CP1251, kGentleOnePair);      // (too different)
1693         break;
1694 
1695       case F_Arabic:            // 8859-6 Arabic
1696         destatep->declared_enc_2 = F_CP1256;
1697         Boost(destatep, F_Arabic, kGentleOnePair);
1698         Boost(destatep, F_CP1256, kGentleOnePair);
1699         break;
1700       case F_CP1256:
1701         destatep->declared_enc_2 = F_Arabic;
1702         Boost(destatep, F_Arabic, kGentleOnePair);
1703         Boost(destatep, F_CP1256, kGentleOnePair);
1704         break;
1705 
1706       case F_Greek:             // 8859-7 Greek
1707         destatep->declared_enc_2 = F_CP1253;
1708         Boost(destatep, F_Greek, kGentleOnePair);
1709         Boost(destatep, F_CP1253, kGentleOnePair);
1710         break;
1711       case F_CP1253:
1712         destatep->declared_enc_2 = F_Greek;
1713         Boost(destatep, F_Greek, kGentleOnePair);
1714         Boost(destatep, F_CP1253, kGentleOnePair);
1715         break;
1716 
1717       case F_Hebrew:            // 8859-8 Hebrew
1718         destatep->declared_enc_2 = F_CP1255;
1719         Boost(destatep, F_Hebrew, kGentleOnePair);
1720         Boost(destatep, F_CP1255, kGentleOnePair);
1721         break;
1722       case F_CP1255:
1723         destatep->declared_enc_2 = F_Hebrew;
1724         Boost(destatep, F_Hebrew, kGentleOnePair);
1725         Boost(destatep, F_CP1255, kGentleOnePair);
1726         break;
1727 
1728       case F_Latin5:            // 8859-9 Latin5, Turkish
1729         destatep->declared_enc_2 = F_ASCII_7_bit;       // Don't boost 1254
1730         Boost(destatep, F_Latin5, kGentleOnePair);      // (too different)
1731         break;
1732       case F_CP1254:
1733         destatep->declared_enc_2 = F_ASCII_7_bit;       // Don't boost Latin5
1734         Boost(destatep, F_CP1254, kGentleOnePair);      // (too different)
1735         break;
1736 
1737       case F_Latin6:            // 8859-10 Latin6, Nordic
1738         destatep->declared_enc_2 = F_ASCII_7_bit;
1739         Boost(destatep, F_Latin6, kGentleOnePair);
1740         break;
1741 
1742       case F_ISO_8859_11:       // 8859-11 Thai,
1743         destatep->declared_enc_2 = F_CP874;
1744         Boost(destatep, F_ISO_8859_11, kGentleOnePair);
1745         Boost(destatep, F_CP874, kGentleOnePair);
1746         break;
1747       case F_CP874:
1748         destatep->declared_enc_2 = F_ISO_8859_11;
1749         Boost(destatep, F_ISO_8859_11, kGentleOnePair);
1750         Boost(destatep, F_CP874, kGentleOnePair);
1751         break;
1752 
1753       case F_ISO_8859_13:       // 8859-13 Latin7, Baltic
1754         destatep->declared_enc_2 = F_CP1257;
1755         Boost(destatep, F_ISO_8859_13, kGentleOnePair);
1756         Boost(destatep, F_CP1257, kGentleOnePair);
1757         break;
1758       case F_CP1257:
1759         destatep->declared_enc_2 = F_ISO_8859_13;
1760         Boost(destatep, F_ISO_8859_13, kGentleOnePair);
1761         Boost(destatep, F_CP1257, kGentleOnePair);
1762         break;
1763 
1764       case F_ISO_8859_15:       // 8859-15 Latin9, Latin0, Euro-ized Latin1
1765         destatep->declared_enc_2 = F_ASCII_7_bit;
1766         Boost(destatep, F_ISO_8859_15, kGentleOnePair);
1767         break;
1768 
1769 
1770         // Greek all-caps is confusable with KOI8x all-lower and Hebrew.
1771         // This turns some Greek documents into Cyrillic, etc. by mistake.
1772         // Greek and Hebrew are boosted explicitly above; do KOI8x here.
1773         // Boosting the declared encodingmakes it harder for the wrong one to
1774         // creep up.
1775       case F_KOI8R:
1776         Boost(destatep, F_KOI8R, kGentleOnePair);
1777         break;
1778       case F_KOI8U:
1779         Boost(destatep, F_KOI8U, kGentleOnePair);
1780         break;
1781 
1782       default:
1783         break;
1784       }
1785     }
1786 
1787     if (destatep->debug_data != NULL) {
1788       // Show charset hint
1789       SetDetailsEncProb(destatep, 0, best_sub, charset_hint);
1790     }
1791 
1792     //
1793     // Some fix-ups for the declared encodings
1794     //
1795 
1796     // If non-UTF8, non-Latin1/1252 encoding declared, disable UTF8 combos
1797     // TODO: This should all be folded into postproc-enc-detect.cc
1798     if ((best_sub != F_UTF8) &&
1799         (best_sub != F_Latin1) &&
1800         (best_sub != F_CP1252)) {
1801       Whack(destatep, F_UTF8UTF8, kBadPairWhack * 4);         // demote
1802     }
1803 
1804     // Latin2 and CP1250 differ in the overlap part, such as B1 or B9
1805     // The initial probabilites for charset=Latin2 explicitly put CP1250
1806     // down twice as far as normal, and vice versa. This is done in
1807     // postproc-enc-detect.cc
1808 
1809     // If charset=user-defined, treat as Binary --
1810     // we can safely only do low ASCII, might be Indic
1811     if (normalized_charset.substr(0,4) == "user") {
1812       Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
1813     }
1814 
1815     return 1;
1816   }
1817   return 0;
1818 }
1819 
1820 // Apply initial probability hint based on caller-supplied encoding
1821 // Negative hint whacks ~encoding, non-negative boosts encoding
1822 //
1823 // Negative hints are an experiment to see if they might be useful.
1824 // Not operator used instead of unary minus to allow specifying not-zero
ApplyEncodingHint(const int encoding_hint,int weight,DetectEncodingState * destatep)1825 int ApplyEncodingHint(const int encoding_hint, int weight,
1826                        DetectEncodingState* destatep) {
1827   Encoding enc_hint = static_cast<Encoding>((encoding_hint < 0) ?
1828                                             ~encoding_hint : encoding_hint);
1829   // Map to the right internal subscript
1830   int rankedenc_hint = CompactEncDet::BackmapEncodingToRankedEncoding(enc_hint);
1831 
1832   // I'm not sure how strong this hint should be. Weight 100% = 1 bigram
1833   int increment = (kBoostOnePair * weight) / 100;
1834 
1835   if (encoding_hint < 0) {
1836     destatep->enc_prob[rankedenc_hint] -= increment;
1837   } else {
1838     destatep->enc_prob[rankedenc_hint] += increment;
1839   }
1840 
1841   if (destatep->debug_data != NULL) {
1842     // Show encoding hint
1843     SetDetailsEncProb(destatep, 0, -1, MyEncodingName(enc_hint));
1844   }
1845   return 1;
1846 }
1847 
1848 // Apply initial probability hint based on user interface language
1849 // Weight is 0..100 percent
1850 // Return 1 if name match found
ApplyUILanguageHint(const Language language_hint,int weight,DetectEncodingState * destatep)1851 int ApplyUILanguageHint(const Language language_hint,
1852                         int weight, DetectEncodingState* destatep) {
1853   if (language_hint == UNKNOWN_LANGUAGE) {
1854     return 0;
1855   }
1856   string normalized_lang = MakeChar8(LanguageName(language_hint));
1857   int n = HintBinaryLookup8(kLangHintProbs, kLangHintProbsSize,
1858                            normalized_lang.c_str());
1859   if (n >= 0) {
1860     // Language is eight bytes, probability table is ~eight bytes
1861     int best_sub = ApplyCompressedProb(&kLangHintProbs[n].key_prob[kMaxLangKey],
1862                                        kMaxLangVector, weight, destatep);
1863     // Never boost ASCII7; do CP1252 instead
1864     if (best_sub == F_ASCII_7_bit) {best_sub = F_CP1252;}
1865     destatep->declared_enc_1 = best_sub;
1866     if (destatep->debug_data != NULL) {
1867       // Show language hint
1868       SetDetailsEncProb(destatep, 0, best_sub, normalized_lang.c_str());
1869     }
1870     return 1;
1871   }
1872   return 0;
1873 }
1874 
1875 // Apply initial probability hint based on corpus type (web, email, etc)
1876 // Return 1 if name match found
ApplyDefaultHint(const CompactEncDet::TextCorpusType corpus_type,DetectEncodingState * destatep)1877 int ApplyDefaultHint(const CompactEncDet::TextCorpusType corpus_type,
1878                       DetectEncodingState* destatep) {
1879 
1880   for (int i = 0; i < NUM_RANKEDENCODING; i++) {
1881     // Set the default probability
1882     destatep->enc_prob[i] = kDefaultProb[i] * 3;
1883     // Deliberately set 2022 seven-bit encodings to zero,
1884     // so we can look for actual use
1885     // TODO: This should all be folded into postproc-enc-detect.cc
1886     if (SevenBitEncoding(kMapToEncoding[i])) {
1887       destatep->enc_prob[i] = 0;
1888     }
1889   }
1890 
1891   //  A little corpus distinction
1892   switch (corpus_type) {
1893   case CompactEncDet::WEB_CORPUS:
1894   case CompactEncDet::XML_CORPUS:
1895     // Allow double-converted UTF-8 to start nearly equal to normal UTF-8
1896     destatep->enc_prob[F_UTF8UTF8] =
1897       destatep->enc_prob[F_UTF8] - kSmallInitDiff;
1898   break;
1899   case CompactEncDet::QUERY_CORPUS:
1900   case CompactEncDet::EMAIL_CORPUS:
1901   default:
1902     break;
1903   }
1904 
1905   if (FLAGS_demo_nodefault) {
1906     // Demo, make initial probs all zero
1907     for (int i = 0; i < NUM_RANKEDENCODING; i++) {
1908       destatep->enc_prob[i] = 0;
1909     }
1910   }
1911 
1912   if (destatep->debug_data != NULL) {
1913     // Show default hint
1914     SetDetailsEncProb(destatep, 0, -1, "Default");
1915   }
1916   return 1;
1917 }
1918 
1919 
1920 
// Do reverse search for c in [str..str+len)
// Note: initial pointer is to FRONT of string, not back
// Returns a pointer to the last occurrence of c, or NULL if there is none
// (always NULL when len is 0)
const char* MyMemrchr(const char* str, char c, size_t len) {
  // Count down with an index so that no pointer before str is ever formed
  for (size_t i = len; i > 0; --i) {
    if (str[i - 1] == c) {return str + (i - 1);}
  }
  return NULL;
}


// Minimum real URL is 11 bytes: "http://a.bc" -- shorter is assumed to be TLD
// Now that we are no longer trying to do Indic font-based encodings, we
// don't need the full URL and can go back to simple TLD. This test remains for
// backwards compatibility with any caller using full URL.
static const int kMinURLLength = 11;

// Extract TLD from a full URL or just a TLD
// Return hostname and length if a full URL
//
// url_hint      full URL (preferred), bare top-level domain name, or NULL
// tld_hint      output buffer of tld_hint_len bytes; always NUL-terminated.
//               Receives the TLD, truncated to fit, or "~" if none was found
// ret_host_start, ret_host_len
//               hostname (without port) inside url_hint if url_hint is a
//               full URL, else NULL and 0
//
// A string of kMinURLLength or more bytes counts as a full URL only if its
// first '/' is part of "://" with no '.' in front of it; a long string that
// fails this test yields "~". Shorter strings are copied to tld_hint as-is.
// If tld_hint is NULL or tld_hint_len <= 0, tld_hint is not written and the
// hostname outputs are NULL and 0.
void ExtractTLD(const char* url_hint, char* tld_hint, int tld_hint_len,
                const char** ret_host_start, int* ret_host_len) {
  *ret_host_start = NULL;
  *ret_host_len = 0;
  if ((tld_hint == NULL) || (tld_hint_len <= 0)) {
    // No room for even the terminating NUL
    return;
  }

  // Default answer is "~", which the hint routines treat as no hint
  strncpy(tld_hint, "~", tld_hint_len);
  tld_hint[tld_hint_len - 1] = '\0';

  int url_len = (url_hint != NULL) ? static_cast<int>(strlen(url_hint)) : 0;
  if (url_len == 0) {
    // Empty TLD
    return;
  }

  // Minimum real URL is 11 bytes: "http://a.bc" -- shorter is assumed to be TLD
  if (url_len < kMinURLLength) {
    strncpy(tld_hint, url_hint, tld_hint_len);
    tld_hint[tld_hint_len - 1] = '\0';
    return;
  }

  // See if it really is a URL: need :// with no dot in front of it.
  // Only the presence of a dot matters, so plain memchr is sufficient.
  const char* first_slash = strchr(url_hint, '/');
  bool is_url = (first_slash != NULL) && (first_slash != url_hint) &&
                (first_slash[-1] == ':') && (first_slash[1] == '/') &&
                (memchr(url_hint, '.', first_slash - url_hint) == NULL);
  if (!is_url) {
    // Long, but not a URL: leave "~"
    return;
  }

  const char* hostname_start = first_slash + 2;
  const char* hostname_end = strchr(hostname_start, '/');
  if (hostname_end == NULL) {
    // No slash; end is first byte off end of the URL string
    hostname_end = url_hint + url_len;
  }
  size_t hostname_len = hostname_end - hostname_start;
  const char* port_start =
    static_cast<const char*>(memchr(hostname_start, ':', hostname_len));
  if (port_start != NULL) {
    // Port; shorten hostname
    hostname_end = port_start;
    hostname_len = hostname_end - hostname_start;
  }

  const char* tld_start = MyMemrchr(hostname_start, '.', hostname_len);
  if (tld_start != NULL) {
    // Remember the TLD we just found, truncated to fit the buffer
    int tld_len = static_cast<int>(hostname_end - tld_start) - 1;
    if (tld_len > (tld_hint_len - 1)) {
      tld_len = tld_hint_len - 1;
    }
    memcpy(tld_hint, tld_start + 1, tld_len);
    tld_hint[tld_len] = '\0';
  }
  *ret_host_start = hostname_start;
  *ret_host_len = static_cast<int>(hostname_len);
}
2000 
// Apply hints, if any, to probabilities
// NOTE: Encoding probabilities are all zero at this point
//
// Hints are applied in this order: HTTP charset, META charset, caller-supplied
// encoding, UI language, then top-level domain. hint_count is the running sum
// of the ApplyXxx return values; it decides whether and how strongly the TLD
// hint is used, and whether the default distribution is used instead.
// A NULL or '~'-prefixed charset hint, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE and
// a NULL url_hint each mean "no hint of this kind".
// On return, destatep->declared_enc_1/2, the *_hint fields,
// looking_for_latin_trigrams/do_latin_trigrams and enc_prob[] are set, and
// hint_prob[] holds a copy of enc_prob[].
void ApplyHints(const char* url_hint,
                const char* http_charset_hint,
                const char* meta_charset_hint,
                const int encoding_hint,
                const Language language_hint,
                const CompactEncDet::TextCorpusType corpus_type,
                DetectEncodingState* destatep) {
  int hint_count = 0;
  // url_hint can either be a full URL (preferred) or just top-level domain name
  // Extract the TLD from a full URL and use it for
  // a normal TLD hint

  char tld_hint[16];
  const char* hostname_start = NULL;
  int hostname_len = 0;
  ExtractTLD(url_hint, tld_hint, sizeof(tld_hint),
             &hostname_start, &hostname_len);


  // Initial hints give slight boost to Ascii-7-bit and code page 1252
  // Before each ApplyXxx call below, enc_1 is copied to enc_2; the ApplyXxx
  // routine then updates declared_enc_1 if it finds a match
  // This gives a boost to 1252 if one of HTTP/META is specified,
  // but this could be the wrong thing to do if Latin2/3/4/etc. is specified
  destatep->declared_enc_1 = F_CP1252;
  destatep->declared_enc_2 = F_ASCII_7_bit;

  // Applying various hints takes max of new hint and any old hint.
  // This does better on multiple hints than a weighted average

  // Weight is 0..100 percent
  if ((http_charset_hint != NULL) && (http_charset_hint[0] != '~')) {
    destatep->declared_enc_2 = destatep->declared_enc_1;
    hint_count += ApplyCharsetHint(http_charset_hint, 100, destatep);
    // Recorded whether or not the charset name matched; on no match
    // declared_enc_1 still has its prior value
    destatep->http_hint = kMapToEncoding[destatep->declared_enc_1];
    if ((destatep->declared_enc_1 == F_CP1252) ||
        (destatep->declared_enc_1 == F_Latin1)) {
      destatep->looking_for_latin_trigrams = true;
    }
  }
  if ((meta_charset_hint != NULL) && (meta_charset_hint[0] != '~')) {
    destatep->declared_enc_2 = destatep->declared_enc_1;
    hint_count += ApplyCharsetHint(meta_charset_hint, 100, destatep);
    // Recorded whether or not the charset name matched; on no match
    // declared_enc_1 still has its prior value
    destatep->meta_hint = kMapToEncoding[destatep->declared_enc_1];
    if ((destatep->declared_enc_1 == F_CP1252) ||
        (destatep->declared_enc_1 == F_Latin1)) {
      destatep->looking_for_latin_trigrams = true;
    }
  }
  if (encoding_hint != UNKNOWN_ENCODING) {
    destatep->declared_enc_2 = destatep->declared_enc_1;
    hint_count += ApplyEncodingHint(encoding_hint, 50, destatep);
  }
  if (language_hint != UNKNOWN_LANGUAGE) {
    destatep->declared_enc_2 = destatep->declared_enc_1;
    hint_count += ApplyUILanguageHint(language_hint, 50, destatep);
  }
  // Use top level domain at 100% if no other hint was available, or at 50%
  // if exactly one other hint was available and the TLD is not .com
  if (url_hint != NULL) {
    destatep->tld_hint = CompactEncDet::TopEncodingOfTLDHint(tld_hint);
    if (hint_count == 0) {
      // Apply with weight 100%
      destatep->declared_enc_2 = destatep->declared_enc_1;
      hint_count += ApplyTldHint(tld_hint, 100, destatep);
      if ((destatep->declared_enc_1 == F_CP1252) ||
          (destatep->declared_enc_1 == F_Latin1)) {
        destatep->looking_for_latin_trigrams = true;
      }
      if (strcmp("hu", tld_hint) == 0) {
        // Hungarian is particularly difficult to separate Latin2 from Latin1,
        // so always look for trigram scanning if bare TLD=hu hint
        destatep->looking_for_latin_trigrams = true;
      }
    // Treat .com as no TLD hint at all
    } else if ((hint_count == 1) && (strcmp("com", tld_hint) != 0)) {
      // Either shift weighting or consider doing no TLD here -- seems to
      // distract from correct charset= hints. Or perhaps apply only if
      // charset = Latin1/1252...
      // Apply with weight 50%
      destatep->declared_enc_2 = destatep->declared_enc_1;
      hint_count += ApplyTldHint(tld_hint, 50, destatep);
      if ((destatep->declared_enc_1 == F_CP1252) ||
          (destatep->declared_enc_1 == F_Latin1)) {
        destatep->looking_for_latin_trigrams = true;  // These need trigrams
      }
    }
    // Else ignore TLD hint entirely
  }

  // Use all-web default distribution if not even a TLD hint
  if (hint_count == 0) {
    destatep->looking_for_latin_trigrams = true;    // Default needs trigrams
    destatep->declared_enc_2 = destatep->declared_enc_1;
    hint_count += ApplyDefaultHint(corpus_type, destatep);
  }


// ISO-Microsoft Pairs
//    F_Latin1, F_CP1252,
//    F_Latin2, F_CP1250,   NOT really strict subset/superset pairs
//    F_Latin3,
//    F_Latin4,
//    F_ISO_8859_5, F_CP1251,
//    F_Arabic, F_CP1256,   NOT
//    F_Greek,  F_CP1253,   NOT really pairs
//                              (or upgrade incvt to make Greek use CP)
//    F_Hebrew, F_CP1255,   NOT really pairs
//    F_Latin5, F_CP1254,
//    F_Latin6,
//    F_ISO_8859_11,
//    F_ISO_8859_13, F_CP1257,
//    F_ISO_8859_15,
// ISO-Microsoft Pairs

  // Get important families started together
  // This should fall out of the initialization vectors for charset,
  // but we need to get rid of families altogether
  //
  // TODO make this more graceful

  // Add small bias for subsets

  // Subtract small bias for supersets: each superset starts kSmallInitDiff
  // below its base encoding, replacing whatever value it had from the hints
  destatep->enc_prob[F_CP932] = destatep->enc_prob[F_SJS] - kSmallInitDiff;

  destatep->enc_prob[F_GBK] = destatep->enc_prob[F_GB] - kSmallInitDiff;
  destatep->enc_prob[F_GB18030] = destatep->enc_prob[F_GB] - kSmallInitDiff;

  destatep->enc_prob[F_BIG5_CP950] = destatep->enc_prob[F_BIG5] -
    kSmallInitDiff;
  destatep->enc_prob[F_BIG5_HKSCS] = destatep->enc_prob[F_BIG5] -
    kSmallInitDiff;

  // Deliberate over-bias Ascii7 and underbias Binary [unneeded]
  // destatep->enc_prob[F_ASCII_7_bit] = destatep->enc_prob[F_ASCII_7_bit] + kSmallInitDiff;
  // destatep->enc_prob[F_BINARY] = destatep->enc_prob[F_BINARY] - (kBoostInitial / 2);

  if (destatep->debug_data != NULL) {
    // Show state at end of hints
    SetDetailsEncProb(destatep, 0, -1, "Endhints");
    if(FLAGS_enc_detect_detail2) {
      // Add a line showing the watched encoding(s)
      if (watch1_rankedenc >= 0) {
        SetDetailsEncProb(destatep, 0,
                          watch1_rankedenc, FLAGS_enc_detect_watch1);
      }
      if (watch2_rankedenc >= 0) {
        SetDetailsEncProb(destatep, 0,
                          watch2_rankedenc, FLAGS_enc_detect_watch2);
      }
    }     // End detail2
  }

  // If duplicate hints, set second one to ASCII_7BIT to prevent double-boost
  if (destatep->declared_enc_1 == destatep->declared_enc_2) {
    destatep->declared_enc_2 = F_ASCII_7_bit;
  }

  if (FLAGS_force127) {
    // Flag forces Latin trigram scanning on from the start
    destatep->do_latin_trigrams = true;
    if (FLAGS_enc_detect_source) {
      // NOTE(review): src is passed as 0 here, so PsHighlight computes
      // (src + 1) - isrc from a null pointer -- confirm this is intended
      PsHighlight(0, destatep->initial_src, 0, 2);
    }
  }


  // Usage counters, only maintained when FLAGS_counts is set
  if (FLAGS_counts && destatep->looking_for_latin_trigrams) {++looking_used;}
  if (FLAGS_counts && destatep->do_latin_trigrams) {++doing_used;}

  //
  // At this point, destatep->enc_prob[] is an initial probability vector based
  // on the given hints/default. In general, it spreads out least-likely
  // encodings to be about 2**-25 below the most-likely encoding.
  // For input text with lots of bigrams, an unlikely encoding can rise to
  // the top at a rate of about 2**6 per bigram, and more commonly 2**2 per
  // bigram. So more than 4 bigrams and commonly more than 12 are
  // needed to overcome the initial hints when the least-likely encoding
  // is in fact the correct answer. So if the entire text has very few bigrams
  // (as a two-word query might), it can be impossible for the correct
  // encoding to win.
  //
  // To compensate for this, we take the initial hint vector and effectively
  // apply it at the rate of 1/16 every bigram for the first 16 bigrams. The
  // actual mechanism is done just before the last prune.
  //

  // Remember Initial hint probabilities
  memcpy(destatep->hint_prob, destatep->enc_prob, sizeof(destatep->enc_prob));
}
2191 
2192 // Look for specific high-value patterns in the first 4 bytes
2193 // Byte order marks (BOM)
2194 //  EFBBBF    UTF-8
2195 //  FEFF      UTF-16 BE
2196 //  FFFE      UTF-16 LE
2197 //  FFFE0000  UTF-32 BE
2198 //  0000FEFF  UTF-32 LE
2199 //
2200 // Likely UTF-x of seven-bit ASCII
2201 //  00xx      UTF-16 BE  xx printable ASCII
2202 //  xx00      UTF-16 LE
2203 //  000000xx  UTF-32 BE
2204 //  xx000000  UTF-32 LE
2205 //
InitialBytesBoost(const uint8 * src,int text_length,DetectEncodingState * destatep)2206 void InitialBytesBoost(const uint8* src,
2207                        int text_length,
2208                        DetectEncodingState* destatep) {
2209   if (text_length < 4) {return;}
2210 
2211   uint32 pair01 = (src[0] << 8) | src[1];
2212   uint32 pair23 = (src[2] << 8) | src[3];
2213   uint32 quad0123 = (pair01 << 16) | pair23;
2214 
2215   bool utf_16_indication = false;
2216   bool utf_32_indication = false;
2217   int best_enc = -1;
2218 
2219   // Byte order marks
2220   // UTF-8
2221   if ((quad0123 & 0xffffff00) == 0xEFBBBF00) {
2222     destatep->bom_hint = UTF8;
2223     Boost(destatep, F_UTF8, kBoostInitial * 2);
2224     Boost(destatep, F_UTF8UTF8, kBoostInitial * 2);
2225     best_enc = F_UTF8;
2226   // UTF-32 (test before UTF-16)
2227   } else if (quad0123 == 0x0000FEFF) {
2228     destatep->bom_hint = UTF32BE;
2229     Boost(destatep, F_UTF_32BE, kBoostInitial * 2);
2230     best_enc = F_UTF_32BE;
2231   } else if (quad0123 == 0xFFFE0000) {
2232     destatep->bom_hint = UTF32LE;
2233     Boost(destatep, F_UTF_32LE, kBoostInitial * 2);
2234     best_enc = F_UTF_32LE;
2235   // UTF-16
2236   } else if (pair01 == 0xFEFF) {
2237     destatep->bom_hint = UTF16BE;
2238     Boost(destatep, F_UTF_16BE, kBoostInitial * 3);
2239     best_enc = F_UTF_16BE;
2240   } else if (pair01 == 0xFFFE) {
2241     destatep->bom_hint = UTF16LE;
2242     Boost(destatep, F_UTF_16LE, kBoostInitial * 3);
2243     best_enc = F_UTF_16LE;
2244 
2245   // Possible seven-bit ASCII encoded as UTF-16/32
2246   // UTF-32 (test before UTF-16)
2247   } else if (((quad0123 & 0xffffff00) == 0) &&
2248              (kIsPrintableAscii[src[3]] != 0)) {
2249     Boost(destatep, F_UTF_32BE, kBoostInitial);
2250     Whack(destatep, F_UTF_32LE, kBadPairWhack);         // Illegal char
2251     best_enc = F_UTF_32BE;
2252   } else if (((quad0123 & 0x00ffffff) == 0) &&
2253              (kIsPrintableAscii[src[0]] != 0)) {
2254     Boost(destatep, F_UTF_32LE, kBoostInitial);
2255     Whack(destatep, F_UTF_32BE, kBadPairWhack);         // Illegal char
2256     best_enc = F_UTF_32LE;
2257   } else if ((src[0] == 0x00) && (kIsPrintableAscii[src[1]] != 0)) {
2258     Boost(destatep, F_UTF_16BE, kBoostInitial);
2259     best_enc = F_UTF_16BE;
2260   } else if ((src[1] == 0x00) && (kIsPrintableAscii[src[0]] != 0)) {
2261     Boost(destatep, F_UTF_16LE, kBoostInitial);
2262     best_enc = F_UTF_16LE;
2263 
2264   // Whack if 0000 or FFFF
2265   // UTF-32 (test before UTF-16)
2266   } else if (quad0123 == 0x00000000) {
2267     Whack(destatep, F_UTF_32BE, kBadPairWhack);         // Illegal char
2268     Whack(destatep, F_UTF_32LE, kBadPairWhack);
2269     Whack(destatep, F_UTF_16BE, kBadPairWhack);
2270     Whack(destatep, F_UTF_16LE, kBadPairWhack);
2271     best_enc = -1;
2272   } else if (quad0123 == 0xffffffff) {
2273     Whack(destatep, F_UTF_32BE, kBadPairWhack);         // Illegal char
2274     Whack(destatep, F_UTF_32LE, kBadPairWhack);
2275     Whack(destatep, F_UTF_16BE, kBadPairWhack);
2276     Whack(destatep, F_UTF_16LE, kBadPairWhack);
2277     best_enc = -1;
2278   } else if (pair01 == 0x0000) {
2279     Whack(destatep, F_UTF_16BE, kBadPairWhack);         // Illegal char
2280     Whack(destatep, F_UTF_16LE, kBadPairWhack);
2281     best_enc = -1;
2282   } else if (pair01 == 0xffff) {
2283     Whack(destatep, F_UTF_16BE, kBadPairWhack);         // Illegal char
2284     Whack(destatep, F_UTF_16LE, kBadPairWhack);
2285     best_enc = -1;
2286 
2287 
2288   // These are the first four bytes of some known binary file formats
2289 
2290   // Boost BINARY bigtime if JPEG FFD8FFxx
2291   // Boost BINARY bigtime if png  89504E47  (.PNG)
2292   // Boost BINARY bigtime if gif  47494638  (GIF8)
2293   // Boost BINARY bigtime if zip  504B0304  (PK..)
2294   // Boost BINARY bigtime if gzip 1F8B08xx
2295   // Boost BINARY bigtime if gzip 78DAxxxx
2296   // Boost BINARY if PDF 25504446 (%PDF)
2297   // Boost BINARY if SWF (FWSx or CWSx where x <= 0x1f)
2298   } else if ((quad0123 & 0xffffff00) == 0xFFD8FF00) {       // JPEG FFD8FFxx
2299     Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2300   } else if (quad0123 == 0x89504E47) {                      // Hex 89 P N G
2301     Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2302   } else if (quad0123 == 0x47494638) {                      // Hex GIF8
2303     Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2304   } else if (quad0123 == 0x504B0304) {                      // Hex P K 03 04
2305     Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2306   } else if ((quad0123 & 0xffffff00) == 0x1F8B0800) {       // gzip 1F8B08xx
2307     Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2308   } else if (pair01 == 0x78DA) {                            // gzip 78DAxxxx
2309     Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2310   } else if (quad0123 == 0x25504446) {                      // Hex %PDF
2311     Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2312   } else if ((quad0123 & 0xffffff1f) == 0x66535700) {       // Hex FWSx
2313     Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2314   } else if ((quad0123 & 0xffffff1f) == 0x63535700) {       // Hex CWSx
2315     Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2316 
2317   // More binary detect prefixes
2318   // 7F E L F   Executable and linking format
2319   // M M 00 *   TIFF (little-endian)
2320   // * 00 M M   TIFF (big-endian)
2321   // 01 f c p   Final cut pro
2322   } else if (quad0123 == 0x7F454C46) {                      // Hex 7F E L F
2323     Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2324   } else if (quad0123 == 0x4D4D002A) {                      // Hex M M 00 *
2325     Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2326   } else if (quad0123 == 0x2A004D4D) {                      // Hex * 00 M M
2327     Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2328   } else if (quad0123 == 0x01666370) {                      // Hex 01 f c p
2329     Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2330 
2331   // More binary detect prefixes; all-ASCII names; heavy weight to avoid ASCII
2332   // prefix overcoming binary
2333   // C C S D    USGS ISIS 3-D cube files
2334   // S I M P    FITS image header    "SIMPLE "
2335   } else if (quad0123 == 0x43435344) {                      // Hex C C S D
2336     Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2337   } else if (quad0123 == 0x53494D50) {                      // Hex S I M P
2338     Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2339 
2340   // More binary detect prefixes; all-ASCII names; lighter weight
2341   // H W P      Hangul word processor
2342   // 8 B P S    Photoshop
2343   // P D S _    xx "PDS_VERSION_ID "
2344   } else if (quad0123 == 0x48575020) {                      // Hex H W P
2345     if ((19 <= text_length) &&
2346         (memcmp(src, "HWP.Document.File.V", 19) == 0)) {
2347       Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2348     } else if ((19 <= text_length) &&
2349                (memcmp(src, "HWP Document File V", 19) == 0)) {
2350       Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2351     } else {
2352       Boost(destatep, F_BINARY, kBoostInitial * kWeakerBinary);
2353     }
2354   } else if (quad0123 == 0x38425053) {                      // Hex 8 B P S
2355     Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2356   } else if (quad0123 == 0x5044535F) {                      // Hex P D S _
2357     if ((14 <= text_length) && (memcmp(src, "PDS_VERSION_ID", 14) == 0)) {
2358       Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2359     } else {
2360       Boost(destatep, F_BINARY, kBoostInitial * kWeakerBinary);
2361     }
2362   }
2363 
2364   // There are several main Windows EXE file formats.
2365   // Not examined here (prefix too short; never see them in Google pipeline)
2366   // M Z        DOS .exe  Mark Zbikowski
2367   // N E        DOS 4.0 16-bit
2368   // L E        OS/2 VxD drivers
2369   // L X        OS/2
2370   // P E        Windows NT
2371 
2372 
2373   // More user-defined
2374   // http://www.freenet.am/armscii/ Armenian
2375 
2376   // If any hints or BOM, etc. keep UTF 16/32 around
2377   if ((destatep->enc_prob[F_UTF_16BE] > 0) ||
2378       (destatep->enc_prob[F_UTF_16LE] > 0)) {
2379     utf_16_indication = true;
2380   }
2381   if ((destatep->enc_prob[F_UTF_32BE] > 0) ||
2382       (destatep->enc_prob[F_UTF_32LE] > 0)) {
2383     utf_32_indication = true;
2384   }
2385 
2386 
2387   // Kill UTF16/32 right now if no positive indication of them
2388   // Otherwise, they tend to rise to the top in 7-bit files with an
2389   // occasional 0x02 byte  in some comment or javascript
2390   if (!utf_16_indication) {
2391     Whack(destatep, F_UTF_16BE, kBadPairWhack * 8);
2392     Whack(destatep, F_UTF_16LE, kBadPairWhack * 8);
2393     Whack(destatep, F_Unicode, kBadPairWhack * 8);
2394   }
2395   if (!utf_32_indication) {
2396     Whack(destatep, F_UTF_32BE, kBadPairWhack * 8);
2397     Whack(destatep, F_UTF_32LE, kBadPairWhack * 8);
2398   }
2399 
2400   // Usually kill mixed encodings
2401   if (!FLAGS_ced_allow_utf8utf8) {
2402     Whack(destatep, F_UTF8UTF8, kBadPairWhack * 8);
2403   }
2404   // 2011.11.07 never use UTF8CP1252 -- answer will be UTF8 instead
2405   Whack(destatep, F_UTF8CP1252, kBadPairWhack * 8);
2406 
2407   if (destatep->debug_data != NULL) {
2408     // Show first four bytes of the input
2409     char buff[16];
2410     snprintf(buff, sizeof(buff), "%04x%04x", pair01, pair23);
2411     SetDetailsEncProb(destatep, 0, best_enc, buff);
2412   }
2413 }
2414 
2415 
2416 
// qsort()-style comparator over ints that produces descending order:
// returns 1 when *v1 is smaller than *v2, -1 when it is larger, 0 when equal.
int IntCompare(const void* v1, const void* v2) {
  const int lhs = *static_cast<const int*>(v1);
  const int rhs = *static_cast<const int*>(v2);
  // Each comparison yields 0 or 1, so the difference is exactly -1, 0 or 1.
  return static_cast<int>(lhs < rhs) - static_cast<int>(lhs > rhs);
}
2425 
Base64Char(uint8 c)2426 bool Base64Char(uint8 c) {
2427   if (('A' <= c) && (c <= 'Z')) {return true;}
2428   if (('a' <= c) && (c <= 'z')) {return true;}
2429   if (('0' <= c) && (c <= '9')) {return true;}
2430   if ('+' == c) {return true;}
2431   if ('/' == c) {return true;}
2432   return false;
2433 }
2434 
Base64ScanLen(const uint8 * start,const uint8 * limit)2435 int Base64ScanLen(const uint8* start, const uint8* limit) {
2436   // We have a plausible beginning; scan entire base64 string
2437   const uint8* ib64str = start;
2438   const uint8* b64str = ib64str;
2439   const uint8* b64strlimit = limit;
2440   // if starts with + +++, assume it is drawing, so bogus
2441   if (((limit - start) > 3) && (start[0] == '+') &&
2442     (start[1] == '+') && (start[2] == '+')) {
2443     return 81;
2444   }
2445   // Scan over base64
2446   while ((b64str < b64strlimit) && (kBase64Value[*b64str++] >= 0))  {
2447   }
2448   b64str--;      // We overshot by 1
2449   return b64str - ib64str;
2450 }
2451 
2452 // Input is at least 8-character legal base64 string after +.
2453 // But might be say + "Presse+Termine"
GoodUnicodeFromBase64(const uint8 * start,const uint8 * limit)2454 bool GoodUnicodeFromBase64(const uint8* start, const uint8* limit) {
2455   // Reject base64 string len N if density of '+' is > 1 + N/16 (expect 1/64)
2456   // Reject base64 string len N if density of A-Z is < 1 + N/16 (expect 26/64)
2457   // Reject base64 string len N if density of a-z is < 1 + N/16 (expect 26/64)
2458   // Reject base64 string len N if density of 0-9 is < 1 + N/32 (expect 10/64)
2459   // NOTE: this requires at least one lower AND one upper AND one digit to pass
2460   //
2461   int plus_count = 0;
2462   int lower_count = 0;
2463   int upper_count = 0;
2464   int digit_count = 0;
2465   int len = limit - start;
2466   for (const uint8* src = start; src < limit; ++src) {
2467     uint8 c = *src;
2468     if (('a' <= c) && (c <= 'z')) {
2469       ++lower_count;
2470     } else if (('A' <= c) && (c <= 'Z')) {
2471       ++upper_count;
2472     } else if (('0' <= c) && (c <= '0')) {
2473       ++digit_count;
2474     } else if (*src == '+') {
2475       ++plus_count;
2476     }
2477   }
2478 
2479   if (plus_count > (1 + (len >> 4))) {return false;}
2480   if (lower_count < (1 + (len >> 4))) {return false;}
2481   if (upper_count < (1 + (len >> 4))) {return false;}
2482   if (digit_count < (1 + (len >> 5))) {return false;}
2483 
2484   // checking the last character to reduce false positive
2485   // since the last character may be padded to 0 bits at the end.
2486   // refer to http://en.wikipedia.org/wiki/UTF-7
2487   int nmod8 = len & 7;
2488   const uint8 last = *(start+len-1);
2489   // When UTF-7 string length%8=3, the last two bits must be padded as 0
2490   if ((nmod8 == 3) && (kBase64Value[last] & 3)) {return false;}
2491   // When UTF-7 string length%8=6, the last four bits must be padded as 0
2492   if ((nmod8 == 6) && (kBase64Value[last] & 15)) {return false;}
2493   return true;
2494 }
2495 
2496 // Prune here after N bytes
2497 // Boost here for seven-bit sequences (at every prune)
2498 // if (sevenbitrankedencoding)
2499 //   + UTF7   scan and boost/demote len mod 8 = 0 3 6
2500 //   ~ Hz     scan and boost/demote len mod 8 = 0 2 4 6
2501 //   1B 2022  scan and boost/demote len mod 8 = 0 2 4 6
2502 //   0E 2022  scan and boost/demote len mod 8 = 0 2 4 6
2503 //   [0F 2022  boost/demote]
2504 //   00 UTF16/32  scan and boost/demote offset = even/odd
2505 //
2506 // If still some seven-bit possibilities > pure ASCII,
2507 // scan each possibility for clearer prob, s.t. about
2508 // two good sequences is a clear win
2509 // A-Z 00-19 00xx-64xx   (B = 04xx)
2510 // a-z 1A-33 68xx-CCxx   (f = 7Cxx)
2511 // 0-9 34-3D D0xx-F4xx   (1 = D4xx)
2512 // +   3E    F8xx
2513 // /   3F    FCxx
2514 // do another chunk  with slow scan
2515 
2516 
2517 // Boost, whack, or leave alone UTF-7 probablilty
UTF7BoostWhack(DetectEncodingState * destatep,int next_pair,uint8 byte2)2518 void UTF7BoostWhack(DetectEncodingState* destatep, int next_pair, uint8 byte2) {
2519   int off = destatep->interesting_offsets[AsciiPair][next_pair];
2520   if (off >= destatep->prior_utf7_offset) {
2521     // Not part of a previous successful UTF-7 string
2522     ++destatep->utf7_starts;
2523 
2524     if (byte2 == '-') {
2525       // +- encoding for '+'  neutral
2526     } else if (!Base64Char(byte2)) {
2527       // Not base64 -- not UTF-7, whack
2528       Whack(destatep, F_UTF7, kBadPairWhack);                 // Illegal pair
2529     } else {
2530       // Starts with base64 byte, might be a good UTF7 sequence
2531       const uint8* start = destatep->initial_src + off + 1;   // over the +
2532       int n = Base64ScanLen(start, destatep->limit_src);
2533       int nmod8 = n & 7;
2534       if ((n == 3) || (n == 6)) {
2535         // short but legal -- treat as neutral
2536       } else if ((nmod8 == 0) | (nmod8 == 3) | (nmod8 == 6)) {
2537         // Good length. Check for good Unicode.
2538         if (GoodUnicodeFromBase64(start, start + n)) {
2539           // Good length and Unicode, boost
2540           Boost(destatep, F_UTF7, kBoostOnePair);         // Found good
2541           destatep->prior_utf7_offset = off + n + 1;
2542         } else {
2543           // Bad Unicode. Whack
2544           Whack(destatep, F_UTF7, kBadPairWhack);         // Illegal length
2545         }
2546       } else {
2547         // Bad length. Whack
2548         Whack(destatep, F_UTF7, kBadPairWhack);         // Illegal length
2549       }
2550     }
2551   }
2552 }
2553 
2554 // Boost, whack, or leave alone HZ probablilty
HzBoostWhack(DetectEncodingState * destatep,uint8 byte2)2555 void HzBoostWhack(DetectEncodingState* destatep, uint8 byte2) {
2556   if ((byte2 == '{') || (byte2 == '}')) {
2557     Boost(destatep, F_HZ_GB_2312, kBoostOnePair);         // Found ~{ or ~}
2558   } else if ((byte2 == '~') || (byte2 == '\n')) {
2559     destatep->enc_prob[F_HZ_GB_2312] += 0;                // neutral
2560   } else {
2561     Whack(destatep, F_HZ_GB_2312, kBadPairWhack);         // Illegal pair
2562   }
2563 }
2564 
2565 // Boost, whack, or leave alone BINARY probablilty
BinaryBoostWhack(DetectEncodingState * destatep,uint8 byte1,uint8 byte2)2566 void BinaryBoostWhack(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) {
2567   int quadrant = ((byte1 & 0x80) >> 6) | ((byte2 & 0x80) >> 7);
2568   int bucket8x4 = ((byte1 & 0xe0) >> 3) | ((byte2 & 0xc0) >> 6);
2569   uint32 quad_mask = 1 << quadrant;
2570   uint32 bucket8x4_mask = 1 << bucket8x4;
2571   if ((destatep->binary_quadrants_seen & quad_mask) == 0) {
2572     destatep->binary_quadrants_seen |= quad_mask;
2573     destatep->binary_quadrants_count += 1;
2574     if (destatep->binary_quadrants_count == 4) {
2575       Boost(destatep, F_BINARY, kBoostOnePair * 2);   // Found all 4 quadrants,
2576                                                       // boost 2 pairs
2577     }
2578   }
2579   if ((destatep->binary_8x4_seen & bucket8x4_mask) == 0) {
2580     destatep->binary_8x4_seen |= bucket8x4_mask;
2581     destatep->binary_8x4_count += 1;
2582     if (destatep->binary_8x4_count >= 11) {
2583       Boost(destatep, F_BINARY, kBoostOnePair * 4);   // Found 11+/20 buckets,
2584                                                       // boost 4 pairs each time
2585     }
2586   }
2587 }
2588 
2589 
2590 // Demote UTF-16/32 on 0000 or FFFF, favoring Binary
UTF1632BoostWhack(DetectEncodingState * destatep,int offset,uint8 byte1)2591 void UTF1632BoostWhack(DetectEncodingState* destatep, int offset, uint8 byte1) {
2592   if (byte1 == 0) {     // We have 0000
2593     Whack(destatep, F_UTF_16BE, kBadPairWhack);           // Illegal pair
2594     Whack(destatep, F_UTF_16LE, kBadPairWhack);           // Illegal pair
2595     switch (offset & 3) {
2596     case 0:         // We get called with 0 4 8, etc. for ASCII/BMP as UTF-32BE
2597       Whack(destatep, F_UTF_32LE, kBadPairWhack);         // Illegal pair
2598       Boost(destatep, F_UTF_32BE, kSmallInitDiff);        // Good pair
2599       break;
2600     case 1:         // We get called with 1 5 9, etc. for ASCII as UTF-32LE
2601     case 2:         // We get called with 2 6 10, etc. for BMP as UTF-32LE
2602       Whack(destatep, F_UTF_32BE, kBadPairWhack);         // Illegal pair
2603       Boost(destatep, F_UTF_32LE, kSmallInitDiff);        // Good pair
2604       break;
2605     case 3:         // ambiguous
2606       break;
2607     }
2608   } else {              // We have ffff
2609     Whack(destatep, F_UTF_32BE, kBadPairWhack);           // Illegal pair
2610     Whack(destatep, F_UTF_32LE, kBadPairWhack);           // Illegal pair
2611     Whack(destatep, F_UTF_16BE, kBadPairWhack);           // Illegal pair
2612     Whack(destatep, F_UTF_16LE, kBadPairWhack);           // Illegal pair
2613   }
2614 }
2615 
2616 // Make even offset
UTF16MakeEven(DetectEncodingState * destatep,int next_pair)2617 void UTF16MakeEven(DetectEncodingState* destatep, int next_pair) {
2618   destatep->interesting_offsets[OtherPair][next_pair] &= ~1;
2619 }
2620 
ConsecutivePair(DetectEncodingState * destatep,int i)2621 bool ConsecutivePair(DetectEncodingState* destatep, int i) {
2622   if (i <= 0) {
2623     return false;
2624   }
2625   return destatep->interesting_offsets[OtherPair][i] ==
2626          (destatep->interesting_offsets[OtherPair][i - 1] + 2);
2627 }
2628 
2629 // boost, whack, or leave alone UTF-8 probablilty
2630 // Any whacks are also applied to UTF8UTF8; CheckUTF8UTF8Seq assumes good UTF8
2631 // Returns total boost
CheckUTF8Seq(DetectEncodingState * destatep,int weightshift)2632 int CheckUTF8Seq(DetectEncodingState* destatep, int weightshift) {
2633   int startcount = destatep->prior_interesting_pair[OtherPair];
2634   int endcount = destatep->next_interesting_pair[OtherPair];
2635 
2636   int demotion_count = 0;
2637   for (int i = startcount; i < endcount; ++i) {
2638     int sub;
2639     char* s = &destatep->interesting_pairs[OtherPair][i * 2];
2640     // Demote four byte patterns that are more likely Latin1 than UTF-8
2641     // C9AE, DF92, DF93, DFAB. See note at top.
2642     // Demotion also boosts Latin1 and CP1252
2643     uint8 s0 = static_cast<uint8>(s[0]);
2644     uint8 s1 = static_cast<uint8>(s[1]);
2645     if ((s0 == 0xc9) && (s1 == 0xae)) {++demotion_count;}
2646     if ((s0 == 0xdf) && (s1 == 0x92)) {++demotion_count;}
2647     if ((s0 == 0xdf) && (s1 == 0x93)) {++demotion_count;}
2648     if ((s0 == 0xdf) && (s1 == 0xab)) {++demotion_count;}
2649 
2650     if (!ConsecutivePair(destatep, i)) {
2651       // Insert a blank into the sequence; avoid wrong splices
2652       sub = (' ' >> 4) & 0x0f;
2653       ++destatep->utf8_minicount[
2654           static_cast<int>(kMiniUTF8Count[static_cast<int>(destatep->next_utf8_ministate)][sub])];
2655       destatep->next_utf8_ministate =
2656         kMiniUTF8State[destatep->next_utf8_ministate][sub];
2657     }
2658     // Byte 0
2659     sub = (s0 >> 4) & 0x0f;
2660     ++destatep->utf8_minicount[
2661         static_cast<int>(kMiniUTF8Count[static_cast<int>(destatep->next_utf8_ministate)][sub])];
2662     destatep->next_utf8_ministate =
2663       kMiniUTF8State[destatep->next_utf8_ministate][sub];
2664     // Byte 1
2665     sub = (s1 >> 4) & 0x0f;
2666     ++destatep->utf8_minicount[
2667         static_cast<int>(kMiniUTF8Count[static_cast<int>(destatep->next_utf8_ministate)][sub])];
2668     destatep->next_utf8_ministate =
2669       kMiniUTF8State[destatep->next_utf8_ministate][sub];
2670     DCHECK((0 <= destatep->next_utf8_ministate) &&
2671            (destatep->next_utf8_ministate < 8));
2672   }
2673 
2674 
2675   // For the four specific byte combinations above, Latin1/CP1252 is more likely
2676   if (demotion_count > 0) {
2677     Boost(destatep, F_Latin1, kGentleOnePair * demotion_count);
2678     Boost(destatep, F_CP1252, kGentleOnePair * demotion_count);
2679   }
2680 
2681   // Boost UTF8 for completed good sequences
2682   int total_boost = 2 * destatep->utf8_minicount[2] +
2683                     3 * destatep->utf8_minicount[3] +
2684                     4 * destatep->utf8_minicount[4];
2685   // But not so much for demoted bytes
2686   total_boost -= (3 * demotion_count);
2687 
2688   total_boost *= kGentleOnePair;
2689   total_boost >>= weightshift;
2690   // Design: boost both UTF8 and UTF8UTF8 for each good sequence
2691   Boost(destatep, F_UTF8, total_boost);
2692   Boost(destatep, F_UTF8UTF8, total_boost);
2693 
2694   destatep->utf8_minicount[5] += destatep->utf8_minicount[2];   // total chars
2695   destatep->utf8_minicount[5] += destatep->utf8_minicount[3];   // total chars
2696   destatep->utf8_minicount[5] += destatep->utf8_minicount[4];   // total chars
2697   destatep->utf8_minicount[2] = 0;
2698   destatep->utf8_minicount[3] = 0;
2699   destatep->utf8_minicount[4] = 0;
2700 
2701   // Whack (2 bytes) for errors
2702   int error_whack = 2 * destatep->utf8_minicount[1];
2703   error_whack *= kGentlePairWhack;
2704   error_whack >>= weightshift;
2705   Whack(destatep, F_UTF8, error_whack);
2706   Whack(destatep, F_UTF8UTF8, error_whack);
2707   destatep->utf8_minicount[1] = 0;
2708 
2709   return total_boost - error_whack;
2710 }
2711 
2712 
2713 // Boost, whack, or leave alone UTF8UTF8 probablilty
2714 //
2715 // We are looking for
2716 // (1) chars ONLY in set UTF8(0080)..UTF8(00FF), including for 80..9F the
2717 //     MS CP1252 mappings, and
2718 // (2) sequences of 2 or more such characters
2719 //
2720 // If so, we could be looking at some non-7-bit encoding extra-converted
2721 // to UTF-8. The most common observed is CP1252->UTF8 twice,
2722 //    1252=>UTF8 : 1252=>UTF8
2723 // where the colon means "take those bytes and pretend that they are 1252".
2724 // We have a couple of examples of BIG5 bytes converted as though
2725 // they were 1252,
2726 //    BIG5 : 1252=>UTF8
2727 //
2728 // Of course, we don't want correctly converted 1252 to be flagged here
2729 //    1252=>UTF8
2730 // So we want the input high bytes to be in pairs or longer, hence the
2731 // output UTF8 in groups of four bytes or more
2732 //
2733 // Good chars: C2xx, C3xx,
2734 // Good chars: C592, C593, C5A0, C5A1, C5B8, C5BD, C5BE, C692, CB86, CB9C
2735 // Good chars: E280xx E282AC E284A2
2736 //             C2xx 1100001x 10xxxxxx   (128/128)
2737 //             C5xx 11000101 10xx00xx   (16/4)
2738 //             C5xx 11000101 10111xxx   (8/3)
2739 //             C692 11000110 10010010   (1/1)
2740 //             CBxx 11001011 100xx1x0   (8/2)
2741 //             E28x 11100010 10000xx0   (4/3)
2742 //
2743 // Returns total boost
CheckUTF8UTF8Seq(DetectEncodingState * destatep,int weightshift)2744 int CheckUTF8UTF8Seq(DetectEncodingState* destatep, int weightshift) {
2745   int this_pair = destatep->prior_interesting_pair[OtherPair];
2746   int startbyteoffset = this_pair * 2;
2747   int endbyteoffset = destatep->next_interesting_pair[OtherPair] * 2;
2748   char* startbyte = &destatep->interesting_pairs[OtherPair][startbyteoffset];
2749   char* endbyte = &destatep->interesting_pairs[OtherPair][endbyteoffset];
2750 
2751   int pair_number = this_pair;
2752   for (char* s = startbyte; s < endbyte; s += 2) {
2753     int next = destatep->next_utf8utf8_ministate;
2754     if (!ConsecutivePair(destatep, pair_number)) {
2755       // Insert two blanks into the sequence to avoid wrong splices
2756       // go back to no odd-byte offset
2757       destatep->utf8utf8_odd_byte = 0;
2758       int sub = UTF88Sub(' ', ' ');
2759       ++destatep->utf8utf8_minicount[static_cast<int>(kMiniUTF8UTF8Count[next][sub])];
2760       next = kMiniUTF8UTF8State[next][sub];
2761     }
2762 
2763     int odd = destatep->utf8utf8_odd_byte;
2764     if (s + 1 + odd >= endbyte) continue;
2765     int sub = UTF88Sub(s[0 + odd], s[1 + odd]);
2766     destatep->utf8utf8_odd_byte ^= kMiniUTF8UTF8Odd[next][sub];
2767     ++destatep->utf8utf8_minicount[
2768         static_cast<int>(kMiniUTF8UTF8Count[next][sub])];
2769     destatep->next_utf8utf8_ministate = kMiniUTF8UTF8State[next][sub];
2770     ++pair_number;
2771   }
2772 
2773   // Boost for completed good sequences; each count covers two chars.
2774   // Design: boost UTF8UTF8 above UTF8 for each good sequence
2775   int total_boost = (2) * destatep->utf8utf8_minicount[2] +
2776                     (2) * destatep->utf8utf8_minicount[3] +
2777                     (2) * destatep->utf8utf8_minicount[4];
2778   total_boost *= kGentleOnePair;
2779   total_boost >>= weightshift;
2780   Boost(destatep, F_UTF8UTF8, total_boost);
2781 
2782   // Track total characters
2783   destatep->utf8utf8_minicount[5] += destatep->utf8utf8_minicount[2];
2784   destatep->utf8utf8_minicount[5] += destatep->utf8utf8_minicount[3];
2785   destatep->utf8utf8_minicount[5] += destatep->utf8utf8_minicount[4];
2786   destatep->utf8utf8_minicount[2] = 0;
2787   destatep->utf8utf8_minicount[3] = 0;
2788   destatep->utf8utf8_minicount[4] = 0;
2789 
2790   // Design: Do not whack UTF8UTF8 below UTF8 for each bad sequence
2791 
2792   destatep->utf8utf8_minicount[1] = 0;
2793   return total_boost;
2794 }
2795 
2796 
2797 // We give a gentle boost for each paired SO ... SI, whack others
CheckIso2022ActiveSeq(DetectEncodingState * destatep)2798 void CheckIso2022ActiveSeq(DetectEncodingState* destatep) {
2799   int this_pair = destatep->prior_interesting_pair[OtherPair];
2800   int startbyteoffset = this_pair * 2;
2801   int endbyteoffset = destatep->next_interesting_pair[OtherPair] * 2;
2802   char* startbyte = &destatep->interesting_pairs[OtherPair][startbyteoffset];
2803   char* endbyte = &destatep->interesting_pairs[OtherPair][endbyteoffset];
2804 
2805   // Initial <esc> char must precede SO/SI
2806   // HZ_GB_2312 has no alternation constraint on 1- and 2-byte segments
2807   // ISO-2022-JP (JIS) has no alternation constraint on 1- and 2-byte segments
2808   // ISO-2022-CN has no alternation constraint on 1- and 2-byte segments
2809   // ISO-2022-KR requires alternation between 1- and 2-byte segments
2810   // JIS:
2811   //  <esc> ( B ISO-2022-JP     [1b 28 42]  SI to ASCII
2812   //  <esc> ( J ISO-2022-JP     [1b 28 4a]  SI to X0201
2813   //  <esc> $ @ ISO-2022-JP     [1b 24 40]  SO to X0208-78 twobyte
2814   //  <esc> $ B ISO-2022-JP     [1b 24 42]  SO to X0208-83 twobyte
2815   for (char* s = startbyte; s < endbyte; s += 2) {
2816     if (s[0] == 0x1b) {
2817       if (s[1] == 0x24) {
2818         // <esc> $  is SO
2819         destatep->next_2022_state = SOSI_TWOBYTE;       // SO to two-byte
2820       } else if (s[1] == 0x28) {
2821         if (destatep->next_2022_state == SOSI_TWOBYTE) {
2822           Boost(destatep, F_JIS, kGentlePairBoost);
2823         } else if (destatep->next_2022_state == SOSI_ONEBYTE) {
2824           Whack(destatep, F_JIS, kGentlePairWhack);
2825         }
2826         destatep->next_2022_state = SOSI_ONEBYTE;       // JIS SI to one-byte
2827       } else {
2828         Whack(destatep, F_JIS, kBadPairWhack);
2829         Whack(destatep, F_ISO_2022_CN, kBadPairWhack);
2830         Whack(destatep, F_ISO_2022_KR, kBadPairWhack);
2831         destatep->next_2022_state = SOSI_ERROR;     // not 2022
2832       }
2833     } else if (s[0] == 0x0e)  {
2834       // <so>
2835       Whack(destatep, F_JIS, kBadPairWhack);
2836       if (destatep->next_2022_state != SOSI_NONE) {
2837         destatep->next_2022_state = SOSI_TWOBYTE;       // SO to two-byte
2838       } else {
2839         // ESC required before SO/SI
2840         Whack(destatep, F_ISO_2022_CN, kBadPairWhack * 4);
2841         Whack(destatep, F_ISO_2022_KR, kBadPairWhack * 4);
2842         destatep->next_2022_state = SOSI_ERROR;     // SO not after SI
2843       }
2844     } else if (s[0] == 0x0f)  {
2845       // <si>
2846       Whack(destatep, F_JIS, kBadPairWhack);
2847       if (destatep->next_2022_state != SOSI_NONE) {
2848         if (destatep->next_2022_state == SOSI_TWOBYTE) {
2849           Boost(destatep, F_ISO_2022_CN, kGentlePairBoost);
2850           Boost(destatep, F_ISO_2022_KR, kGentlePairBoost);
2851         } else if (destatep->next_2022_state == SOSI_ONEBYTE) {
2852           Whack(destatep, F_ISO_2022_CN, kGentlePairWhack);
2853           Whack(destatep, F_ISO_2022_KR, kGentlePairWhack);
2854         }
2855         destatep->next_2022_state = SOSI_ONEBYTE;       // SI to one-byte
2856       } else {
2857         // ESC required before SO/SI
2858         Whack(destatep, F_ISO_2022_CN, kBadPairWhack * 4);
2859         Whack(destatep, F_ISO_2022_KR, kBadPairWhack * 4);
2860         destatep->next_2022_state = SOSI_ERROR;     // SI not after SO
2861       }
2862     } else if (s[0] <= 0x1f)  {
2863       // Some other control code. Allow ht lf [ff] cr
2864       if ((s[0] != 0x09) && (s[0] != 0x0a) &&
2865           (s[0] != 0x0c) && (s[0] != 0x0d)) {
2866         // Otherwise these can float to the top on bad bytes
2867         Whack(destatep, F_JIS, kBadPairWhack);
2868         Whack(destatep, F_ISO_2022_CN, kBadPairWhack);
2869         Whack(destatep, F_ISO_2022_KR, kBadPairWhack);
2870       }
2871     }
2872   }
2873 
2874   // If no start, keep the probability pinned at zero (or below)
2875   if (destatep->next_2022_state == SOSI_NONE) {
2876     destatep->enc_prob[F_ISO_2022_CN] =
2877       minint(0, destatep->enc_prob[F_ISO_2022_CN]);
2878     destatep->enc_prob[F_ISO_2022_KR] =
2879       minint(0, destatep->enc_prob[F_ISO_2022_KR]);
2880     destatep->enc_prob[F_JIS] =
2881       minint(0, destatep->enc_prob[F_JIS]);
2882   }
2883 }
2884 
2885 // We give a gentle boost for each paired ~{ ... ~}, whack others
CheckHzActiveSeq(DetectEncodingState * destatep)2886 void CheckHzActiveSeq(DetectEncodingState* destatep) {
2887   int this_pair = destatep->prior_interesting_pair[AsciiPair];
2888   int startbyteoffset = this_pair * 2;
2889   int endbyteoffset = destatep->next_interesting_pair[AsciiPair] * 2;
2890   char* startbyte = &destatep->interesting_pairs[AsciiPair][startbyteoffset];
2891   char* endbyte = &destatep->interesting_pairs[AsciiPair][endbyteoffset];
2892 
2893   for (char* s = startbyte; s < endbyte; s += 2) {
2894     // Look for initial ~{ pair
2895     if ((s[0] == '~') && (s[1] == '{')) {
2896       destatep->next_hz_state = SOSI_TWOBYTE;       // SO to two-byte
2897     }
2898     // Also look for closing ~} pair
2899     if ((s[0] == '~') && (s[1] == '}'))  {
2900       if (destatep->next_hz_state == SOSI_TWOBYTE) {
2901         Boost(destatep, F_HZ_GB_2312, kGentlePairBoost);
2902       } else if (destatep->next_hz_state == SOSI_ONEBYTE) {
2903         Whack(destatep, F_HZ_GB_2312, kGentlePairWhack);
2904       }
2905       destatep->next_hz_state = SOSI_ONEBYTE;       // SI to one-byte
2906     }
2907   }
2908 
2909   // If no start, keep the probability pinned at zero (or below)
2910   if (destatep->next_hz_state == SOSI_NONE) {
2911     destatep->enc_prob[F_HZ_GB_2312] =
2912       minint(0, destatep->enc_prob[F_HZ_GB_2312]);
2913   }
2914 }
2915 
2916 // We give a gentle boost after an odd number of 8Fxxxx triples, which
2917 // put subsequent bigrams out of phase until a low byte or another 8Fxxxx
CheckEucJpSeq(DetectEncodingState * destatep)2918 void CheckEucJpSeq(DetectEncodingState* destatep) {
2919   int this_pair = destatep->prior_interesting_pair[OtherPair];
2920   int startbyteoffset = this_pair * 2;
2921   int endbyteoffset = destatep->next_interesting_pair[OtherPair] * 2;
2922   char* startbyte = &destatep->interesting_pairs[OtherPair][startbyteoffset];
2923   char* endbyte = &destatep->interesting_pairs[OtherPair][endbyteoffset];
2924 
2925   for (char* s = startbyte; s < endbyte; s += 2) {
2926     // Boost if out of phase (otherwise, EUC-JP will score badly after 8Fxxxx)
2927     if (destatep->next_eucjp_oddphase) {
2928       //printf("  EucJp boost[%02x%02x]\n", s[0], s[1]);    // TEMP
2929       Boost(destatep, F_EUC_JP, kGentlePairBoost * 2);
2930     }
2931 
2932     uint8 s0 = static_cast<uint8>(s[0]);
2933     uint8 s1 = static_cast<uint8>(s[1]);
2934     // Look for phase flip at 8F
2935     if ((s0 & 0x80) == 0x00) {
2936       destatep->next_eucjp_oddphase = false;
2937     } else if (s0 == 0x8f) {
2938       destatep->next_eucjp_oddphase = !destatep->next_eucjp_oddphase;
2939     }
2940     if ((s1 & 0x80) == 0x00) {
2941       destatep->next_eucjp_oddphase = false;
2942     } else if (s1 == 0x8f) {
2943       destatep->next_eucjp_oddphase = !destatep->next_eucjp_oddphase;
2944     }
2945   }
2946 }
2947 
2948 // Boost, whack, or leave alone BINARY probablilty
2949 // Also called if UTF 16/32 active
CheckBinaryDensity(const uint8 * src,DetectEncodingState * destatep,int delta_otherpairs)2950 void CheckBinaryDensity(const uint8* src, DetectEncodingState* destatep,
2951                         int delta_otherpairs) {
2952   // No change if not much gathered information
2953   if (delta_otherpairs == 0) {
2954     // Only ASCII pairs this call
2955     return;
2956   }
2957   int next_pair = destatep->next_interesting_pair[OtherPair];
2958 
2959   // Look at density of interesting pairs [0..src)
2960   int delta_offset =  static_cast<int>(src - destatep->initial_src);   // actual
2961 
2962   // Look at density of interesting pairs [0..next_interesting)
2963   int low_byte = destatep->interesting_offsets[OtherPair][0];
2964   //int high_byte = destatep->interesting_offsets[OtherPair][next_pair - 1] + 2;
2965   //int byte_span = high_byte - low_byte;
2966   int byte_span = delta_offset - low_byte;
2967 
2968   // If all ASCII for the first 4KB, reject
2969   // If mostly ASCII in the first 5KB, reject
2970   if ((low_byte >= kBinaryHardAsciiLimit) || (delta_offset >= kBinarySoftAsciiLimit)) {
2971     // Not binary early enough in text
2972     Whack(destatep, F_BINARY, kBadPairWhack * 4);
2973     Whack(destatep, F_UTF_32BE, kBadPairWhack * 4);
2974     Whack(destatep, F_UTF_32LE, kBadPairWhack * 4);
2975     Whack(destatep, F_UTF_16BE, kBadPairWhack * 4);
2976     Whack(destatep, F_UTF_16LE, kBadPairWhack * 4);
2977     return;
2978   }
2979 
2980   // Density 1.0 for N pairs takes 2*N bytes
2981   // Whack if < 1/16 after first non_ASCII pair
2982   if ((next_pair * 2 * 16) < byte_span) {
2983     // Not dense enough
2984     Whack(destatep, F_BINARY, kBadPairWhack * 4);
2985     Whack(destatep, F_UTF_32BE, kBadPairWhack * 4);
2986     Whack(destatep, F_UTF_32LE, kBadPairWhack * 4);
2987     Whack(destatep, F_UTF_16BE, kBadPairWhack * 4);
2988     Whack(destatep, F_UTF_16LE, kBadPairWhack * 4);
2989   }
2990 
2991   if (next_pair < 8) {
2992     // Fewer than 8 non-ASCII total; too soon to boost
2993     return;
2994   }
2995 
2996   // Density 1.0 for N pairs takes 2*N bytes
2997   // Boost if density >= 1/4, whack if < 1/16
2998   if ((next_pair * 2 * 4) >= byte_span) {
2999     // Very dense
3000     // Only boost if at least 2 quadrants seen
3001     if (destatep->binary_quadrants_count >= 2) {
3002       Boost(destatep, F_BINARY, kSmallInitDiff);
3003       Boost(destatep, F_UTF_32BE, kSmallInitDiff);
3004       Boost(destatep, F_UTF_32LE, kSmallInitDiff);
3005       Boost(destatep, F_UTF_16BE, kSmallInitDiff);
3006       Boost(destatep, F_UTF_16LE, kSmallInitDiff);
3007     }
3008   }
3009 }
3010 
3011 
3012 // Look at a number of special-case encodings whose reliable detection depends
3013 // on sequencing or other properties
3014 // AsciiPair probibilities (UTF7 and HZ) are all done here
ActiveSpecialBoostWhack(const uint8 * src,DetectEncodingState * destatep)3015 void ActiveSpecialBoostWhack(const uint8* src, DetectEncodingState* destatep) {
3016   int delta_asciipairs = destatep->next_interesting_pair[AsciiPair] -
3017     destatep->prior_interesting_pair[AsciiPair];
3018   int delta_otherpairs = destatep->next_interesting_pair[OtherPair] -
3019     destatep->prior_interesting_pair[OtherPair];
3020 
3021   // The two pure ASCII encodings
3022   if (UTF7OrHzActive(destatep) && (delta_asciipairs > 0)) {
3023     // Adjust per pair
3024     for (int i = 0; i < delta_asciipairs; ++i) {
3025       int next_pair = destatep->prior_interesting_pair[AsciiPair] + i;
3026       uint8 byte1 = destatep->interesting_pairs[AsciiPair][next_pair * 2 + 0];
3027       uint8 byte2 = destatep->interesting_pairs[AsciiPair][next_pair * 2 + 1];
3028       if (byte1 == '+') {
3029         // Boost, whack, or leave alone UTF-7 probablilty
3030         UTF7BoostWhack(destatep, next_pair, byte2);
3031         if (destatep->debug_data != NULL) {
3032           // Show UTF7 entry
3033           char buff[16];
3034           snprintf(buff, sizeof(buff), "%02x%02x+", byte1, byte2);
3035           SetDetailsEncProb(destatep,
3036                             destatep->interesting_offsets[AsciiPair][next_pair],
3037                             kMostLikelyEncoding[(byte1 << 8) + byte2],
3038                             buff);
3039         }
3040       } else if (byte1 == '~') {
3041         // Boost, whack, or leave alone HZ probablilty
3042         HzBoostWhack(destatep, byte2);
3043         if (destatep->debug_data != NULL) {
3044           // Show Hz entry
3045           char buff[16];
3046           snprintf(buff, sizeof(buff), "%02x%02x~", byte1, byte2);
3047           SetDetailsEncProb(destatep,
3048                             destatep->interesting_offsets[AsciiPair][next_pair],
3049                             kMostLikelyEncoding[(byte1 << 8) + byte2],
3050                             buff);
3051         }
3052       }
3053     }
3054 
3055     // Kill UTF-7 now if at least 8 + pairs and not confirmed valid UTF-7
3056     if ((destatep->utf7_starts >= 8) && (destatep->prior_utf7_offset == 0)) {
3057       Whack(destatep, F_UTF7, kBadPairWhack * 8);         // flush
3058     }
3059   }
3060 
3061 
3062 
3063   // All the other encodings
3064   if (OtherActive(destatep) && (delta_otherpairs > 0)) {
3065     // Adjust per pair
3066     int biggest_weightshift = 0;
3067     for (int i = 0; i < delta_otherpairs; ++i) {
3068       int next_pair = destatep->prior_interesting_pair[OtherPair] + i;
3069       uint8 byte1 = destatep->interesting_pairs[OtherPair][next_pair * 2 + 0];
3070       uint8 byte2 = destatep->interesting_pairs[OtherPair][next_pair * 2 + 1];
3071       int off = destatep->interesting_offsets[OtherPair][next_pair];
3072       int weightshift = destatep->interesting_weightshift[OtherPair][next_pair];
3073       biggest_weightshift = maxint(biggest_weightshift, weightshift);
3074 
3075       if (byte1 == 0x00) {
3076         if (byte2 == 0x00) {
3077           UTF1632BoostWhack(destatep, off, byte1);
3078         } else if ((kIsPrintableAscii[byte2] != 0) && ((off & 1) != 0)) {
3079           // We have 00xx at an odd offset. Turn into preceding even offset
3080           // for possible Ascii text in UTF-16LE or UTF-32LE (vs BE)
3081           // This will cascade into caller's probability update
3082           // 00 is illegal for all other encodings, so it doesn't matter to them
3083           UTF16MakeEven(destatep, next_pair);
3084         }
3085         if (destatep->debug_data != NULL) {
3086           // Show 0000 detail entry for this bigram
3087           char buff[16];
3088           snprintf(buff, sizeof(buff), "%02x%02xZ", byte1, byte2);
3089           SetDetailsEncProb(destatep,
3090                             destatep->interesting_offsets[OtherPair][next_pair],
3091                             kMostLikelyEncoding[(byte1 << 8) + byte2],
3092                             buff);
3093         }
3094       }
3095       if (byte1 == 0xff) {
3096         if (byte2 == 0xff) {
3097           UTF1632BoostWhack(destatep, off, byte1);
3098         }
3099         if (destatep->debug_data != NULL) {
3100           // Show FFFF detail entry for this bigram
3101           char buff[16];
3102           snprintf(buff, sizeof(buff), "%02x%02xF", byte1, byte2);
3103           SetDetailsEncProb(destatep,
3104                             destatep->interesting_offsets[OtherPair][next_pair],
3105                             kMostLikelyEncoding[(byte1 << 8) + byte2],
3106                             buff);
3107         }
3108       }
3109       if (BinaryActive(destatep)) {
3110         BinaryBoostWhack(destatep, byte1, byte2);
3111       }
3112     }         // End for i
3113 
3114     // Adjust per entire-pair-span
3115     if (UTF8Active(destatep)) {
3116       CheckUTF8Seq(destatep, biggest_weightshift);
3117     }
3118 
3119     if (UTF8UTF8Active(destatep)) {
3120       CheckUTF8UTF8Seq(destatep, biggest_weightshift);
3121     }
3122 
3123     if (Iso2022Active(destatep)) {
3124       CheckIso2022ActiveSeq(destatep);
3125     }
3126 
3127     if (HzActive(destatep)) {
3128       CheckHzActiveSeq(destatep);
3129     }
3130 
3131     if (EUCJPActive(destatep)) {
3132       CheckEucJpSeq(destatep);
3133     }
3134 
3135     if (BinaryActive(destatep) || UTF1632Active(destatep)) {
3136       CheckBinaryDensity(src, destatep, delta_otherpairs);
3137     }
3138   }
3139   // ISO-2022 do OK on their own, using stright probabilities? Not on bad bytes
3140 
3141   if (destatep->debug_data != NULL) {
3142     // Show sequencing result
3143     SetDetailsEncLabel(destatep, "seq");
3144   }
3145 }
3146 
3147 
PrintTopEnc(DetectEncodingState * destatep,int n)3148 void PrintTopEnc(DetectEncodingState* destatep, int n) {
3149   // Print top n or fewer
3150   int temp_sort[NUM_RANKEDENCODING];
3151   for (int j = 0; j < destatep->rankedencoding_list_len; ++j) {
3152     int rankedencoding = destatep->rankedencoding_list[j];
3153     temp_sort[j] = destatep->enc_prob[rankedencoding];
3154   }
3155 
3156   qsort(temp_sort, destatep->rankedencoding_list_len,
3157         sizeof(temp_sort[0]), IntCompare);
3158 
3159   int top_n = minint(n, destatep->rankedencoding_list_len);
3160   int showme = temp_sort[top_n - 1];    // Print this value and above
3161 
3162   printf("rankedencodingList top %d: ", top_n);
3163   for (int j = 0; j < destatep->rankedencoding_list_len; ++j) {
3164     int rankedencoding = destatep->rankedencoding_list[j];
3165     if (showme <= destatep->enc_prob[rankedencoding]) {
3166       printf("%s=%d ",
3167              MyEncodingName(kMapToEncoding[rankedencoding]),
3168              destatep->enc_prob[rankedencoding]);
3169     }
3170   }
3171   printf("\n\n");
3172 }
3173 
3174 // If the same bigram repeats, don't boost its best encoding too much
RepeatedBigram(DetectEncodingState * destatep,uint8 byte1,uint8 byte2)3175 bool RepeatedBigram(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) {
3176   int this_bigram = (byte1 << 8) | byte2;
3177   // If 00xx 01xx 02xx ... 1fxx, take out bottom 4 bits of xx.
3178   // This ignores parts of Yahoo 0255 0254 0243 0247 0245 0243 0250 0255 ...
3179   // It may screw up UTF-16BE
3180   // It may screw up ISO-2022 (1b24 suppresses 1b28)
3181   if (byte1 < 0x20) {
3182     this_bigram &= 0xfff0;
3183   }
3184   if (this_bigram == destatep->prior_bigram[0]) {return true;}
3185   if (this_bigram == destatep->prior_bigram[1]) {return true;}
3186   if (this_bigram == destatep->prior_bigram[2]) {return true;}
3187   if (this_bigram == destatep->prior_bigram[3]) {return true;}
3188   // Round-robin replacement
3189   destatep->prior_bigram[destatep->next_prior_bigram] = this_bigram;
3190   destatep->next_prior_bigram = (destatep->next_prior_bigram + 1) & 3;
3191   return false;
3192 }
3193 
3194 // Sometimes illegal bytes are used as markers between text that Javascript
3195 // is going to decode. Don't overboost the Binary encoding for markers 01-FF.
3196 // Just count first pair per 8x4 bucket
RepeatedBinary(DetectEncodingState * destatep,uint8 byte1,uint8 byte2)3197 bool RepeatedBinary(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) {
3198   int bucket8x4 = ((byte1 & 0xe0) >> 3) | ((byte2 & 0xc0) >> 6);
3199   uint32 bucket8x4_mask = 1 << bucket8x4;
3200   if ((destatep->binary_8x4_seen & bucket8x4_mask) == 0) {
3201     destatep->binary_8x4_seen |= bucket8x4_mask;
3202     destatep->binary_8x4_count += 1;
3203     return false;
3204   }
3205   return true;
3206 }
3207 
3208 
3209 
3210 
3211 // Find current top two rankedencoding probabilities
ReRank(DetectEncodingState * destatep)3212 void ReRank(DetectEncodingState* destatep) {
3213   destatep->top_prob = -1;
3214   destatep->second_top_prob = -1;
3215   // Leave unchanged
3216   //destatep->top_rankedencoding =
3217   //  destatep->rankedencoding_list[0];     // Just to make well-defined
3218   //destatep->second_top_rankedencoding =
3219   //  destatep->rankedencoding_list[1];     // Just to make well-defined
3220   for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
3221     int rankedencoding = destatep->rankedencoding_list[j];
3222     if (destatep->top_prob < destatep->enc_prob[rankedencoding]) {
3223       // Make sure top 2 are in different superset groups
3224       if (kMapEncToBaseEncoding[kMapToEncoding[destatep->top_rankedencoding]] !=
3225           kMapEncToBaseEncoding[kMapToEncoding[rankedencoding]]) {
3226         destatep->second_top_prob =
3227           destatep->top_prob;             // old top to second
3228         destatep->second_top_rankedencoding =
3229           destatep->top_rankedencoding;   // old top to second
3230       }
3231       destatep->top_prob = destatep->enc_prob[rankedencoding];
3232       destatep->top_rankedencoding = rankedencoding;
3233     } else if (destatep->second_top_prob < destatep->enc_prob[rankedencoding]) {
3234       if (kMapEncToBaseEncoding[kMapToEncoding[destatep->top_rankedencoding]] !=
3235           kMapEncToBaseEncoding[kMapToEncoding[rankedencoding]]) {
3236         destatep->second_top_prob = destatep->enc_prob[rankedencoding];
3237         destatep->second_top_rankedencoding = rankedencoding;
3238       }
3239     }
3240   }
3241 }
3242 
SimplePrune(DetectEncodingState * destatep,int prune_diff)3243 void SimplePrune(DetectEncodingState* destatep, int prune_diff) {
3244   // Prune the list of active encoding families
3245   int keep_prob = destatep->top_prob - prune_diff;
3246 
3247   destatep->active_special = 0;
3248   int k = 0;
3249   for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
3250     bool keep = true;
3251     int rankedencoding = destatep->rankedencoding_list[j];
3252 
3253     // If count is too low, ditch it
3254     if (destatep->enc_prob[rankedencoding] < keep_prob) {keep = false;}
3255 
3256     // Keep it. This will always keep at least top_prob rankedencoding
3257     if (keep) {
3258       destatep->active_special |= kSpecialMask[kMapToEncoding[rankedencoding]];
3259       destatep->rankedencoding_list[k++] = rankedencoding;
3260     }
3261   }
3262 
3263   destatep->rankedencoding_list_len = k;
3264 }
3265 
3266 // Recalculate reliable
CalcReliable(DetectEncodingState * destatep)3267 void CalcReliable(DetectEncodingState* destatep) {
3268   // Encoding result is reliable if big difference in top two, or if
3269   // only Ascii7 ever encountered
3270   // Also reliable if exactly one OtherPair and it's best encoding matches top
3271   destatep->reliable = false;
3272   if (destatep->next_interesting_pair[OtherPair] == 0) {
3273     // Only 7-bit ASCII
3274     destatep->reliable = true;
3275     return;
3276   }
3277   if ((destatep->top_prob - destatep->second_top_prob) >=
3278       FLAGS_ced_reliable_difference) {
3279     destatep->reliable = true;
3280     return;
3281   }
3282   if (destatep->next_interesting_pair[OtherPair] == 1) {
3283     uint8 byte1 = destatep->interesting_pairs[OtherPair][0];
3284     uint8 byte2 = destatep->interesting_pairs[OtherPair][1];
3285     int best_enc = kMostLikelyEncoding[(byte1 << 8) + byte2];
3286     if (best_enc == destatep->top_rankedencoding) {
3287       destatep->reliable = true;
3288       return;
3289     }
3290   }
3291 
3292   // If we pruned to one encoding, we are done
3293   if (destatep->rankedencoding_list_len == 1) {
3294     destatep->reliable = true;
3295     destatep->done = true;
3296     return;
3297   }
3298 
3299   // If we pruned to two or three encodings in the same *superset/subset
3300   // rankedencoding*  and enough pairs, we are done. Else keep going
3301   if (destatep->rankedencoding_list_len == 2) {
3302     Encoding enc0 = kMapToEncoding[destatep->rankedencoding_list[0]];
3303     Encoding enc1 = kMapToEncoding[destatep->rankedencoding_list[1]];
3304     if (kMapEncToBaseEncoding[enc0] == kMapEncToBaseEncoding[enc1]) {
3305       if (destatep->prune_count >= 3) {
3306         destatep->reliable = true;
3307         destatep->done = true;
3308         return;
3309       }
3310     }
3311   } else if (destatep->rankedencoding_list_len == 3) {
3312     Encoding enc0 = kMapToEncoding[destatep->rankedencoding_list[0]];
3313     Encoding enc1 = kMapToEncoding[destatep->rankedencoding_list[1]];
3314     Encoding enc2 = kMapToEncoding[destatep->rankedencoding_list[2]];
3315     Encoding base0 = kMapEncToBaseEncoding[enc0];
3316     Encoding base1 = kMapEncToBaseEncoding[enc1];
3317     Encoding base2 = kMapEncToBaseEncoding[enc2];
3318 
3319     if ((base0 == base1) && (base0 == base2)) {
3320       if (destatep->prune_count >= 3) {
3321         destatep->reliable = true;
3322         destatep->done = true;
3323         return;
3324       }
3325     }
3326   }
3327 
3328 }
3329 
3330 
3331 // Find current top two rankedencoding probabilities
FindTop2(DetectEncodingState * destatep,int * first_renc,int * second_renc,int * first_prob,int * second_prob)3332 void FindTop2(DetectEncodingState* destatep,
3333               int* first_renc, int* second_renc,
3334               int* first_prob, int* second_prob) {
3335   *first_prob = -1;
3336   *second_prob = -1;
3337   *first_renc = 0;
3338   *second_renc = 0;
3339   for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
3340     int rankedencoding = destatep->rankedencoding_list[j];
3341     if (*first_prob < destatep->enc_prob[rankedencoding]) {
3342       *second_prob = *first_prob;             // old top to second
3343       *second_renc = *first_renc;   // old top to second
3344       *first_prob = destatep->enc_prob[rankedencoding];
3345       *first_renc = rankedencoding;
3346     } else if (*second_prob < destatep->enc_prob[rankedencoding]) {
3347       *second_prob = destatep->enc_prob[rankedencoding];
3348       *second_renc = rankedencoding;
3349     }
3350   }
3351 }
3352 
3353 
PrintRankedEncodingList(DetectEncodingState * destatep,const char * str)3354 void PrintRankedEncodingList(DetectEncodingState* destatep, const char* str) {
3355   printf("Current ranked encoding list %s\n", str);
3356   for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
3357     int rankedencoding = destatep->rankedencoding_list[j];
3358     if ((rankedencoding < 0) || (rankedencoding > NUM_RANKEDENCODING)) {
3359       printf(" [%d] BOGUS rankedencoding = %d\n", j, rankedencoding);
3360     } else {
3361       printf(" [%d] rankedencoding = %d %-12.12s enc_prob = %d\n",
3362              j, rankedencoding, MyRankedEncName(rankedencoding),
3363              destatep->enc_prob[rankedencoding]);
3364     }
3365   }
3366   printf("End current ranked encoding list\n\n");
3367 }
3368 
3369 
3370 
3371 
// kMapToFiveBits: 256-entry lookup table, indexed by byte value, that folds
// each byte into a 5-bit class (0..31):
//   0       every byte in 00-7F that is not a letter (controls, digits,
//           punctuation)
//   1..26   ASCII letters A-Z and a-z, case folded
//   27..30  high-half bytes treated as vowels a, e, i, o/u
//   31      every other high-half byte
// That is 33 classes, so HU shares the value 30 with HO.
// The two-letter macros below exist only to keep the table readable and are
// undefined again right after it.
3372 // Map unencoded bytes down to five bits, largely preserving letters
3373 // This design struggles to put 33 values into 5 bits.
3374 #define XX 0    // Punctuation (00-7F range)
3375 #define HA 27   // High vowel a in Latin1/2/sometimes7
3376 #define HE 28   // High vowel e
3377 #define HI 29   // High vowel i
3378 #define HO 30   // High vowel o
3379 #define HU 30   // High vowel u on top of HO
3380 #define Hc 31   // High consonant (80-FF range)
3381 static const char kMapToFiveBits[256] = {
3382   XX,XX,XX,XX,XX,XX,XX,XX,  XX,XX,XX,XX,XX,XX,XX,XX,  // 00-0F
3383   XX,XX,XX,XX,XX,XX,XX,XX,  XX,XX,XX,XX,XX,XX,XX,XX,  // 10-1F
3384   XX,XX,XX,XX,XX,XX,XX,XX,  XX,XX,XX,XX,XX,XX,XX,XX,  // 20-2F
3385   XX,XX,XX,XX,XX,XX,XX,XX,  XX,XX,XX,XX,XX,XX,XX,XX,  // 30-3F (digits too)
3386 
3387   XX, 1, 2, 3, 4, 5, 6, 7,   8, 9,10,11,12,13,14,15,  // 40-4F  @ A-O
3388   16,17,18,19,20,21,22,23,  24,25,26,XX,XX,XX,XX,XX,  // 50-5F  P-Z
3389   XX, 1, 2, 3, 4, 5, 6, 7,   8, 9,10,11,12,13,14,15,  // 60-6F  ` a-o
3390   16,17,18,19,20,21,22,23,  24,25,26,XX,XX,XX,XX,XX,  // 70-7F  p-z
3391 
3392   Hc,HA,Hc,Hc,Hc,Hc,Hc,Hc,  HO,Hc,Hc,Hc,Hc,Hc,Hc,Hc,  // 80-8F
3393   Hc,HA,Hc,Hc,Hc,Hc,Hc,Hc,  HO,Hc,Hc,Hc,Hc,Hc,Hc,Hc,  // 90-9F
3394   Hc,HA,Hc,Hc,Hc,Hc,Hc,Hc,  HO,Hc,Hc,Hc,Hc,Hc,Hc,Hc,  // A0-AF
3395   Hc,HA,Hc,Hc,Hc,Hc,Hc,Hc,  HO,Hc,Hc,Hc,Hc,Hc,Hc,Hc,  // B0-BF
3396 
3397   Hc,HA,HA,HA,HA,Hc,Hc,Hc,  Hc,HE,HE,HE,HI,HI,HI,Hc,  // C0-CF
3398   Hc,Hc,Hc,HO,HO,HO,HO,Hc,  Hc,HU,HU,HU,HU,Hc,Hc,Hc,  // D0-DF
3399   Hc,HA,HA,HA,HA,Hc,Hc,Hc,  Hc,HE,HE,HE,HI,HI,HI,Hc,  // E0-EF
3400   Hc,Hc,Hc,HO,HO,HO,HO,Hc,  Hc,HU,HU,HU,HU,Hc,Hc,Hc,  // F0-FF
3401 
3402 };
3403 #undef XX
3404 #undef HA
3405 #undef HE
3406 #undef HI
3407 #undef HO
3408 #undef HU
3409 #undef Hc
3410 
// Codes naming which of Latin1 / Latin2 / Latin7 a trigram points to.
// NOTE(review): presumably these are the two-bit values stored in the
// Latin1/2/7 trigram boost table, with 0 meaning no preference -- confirm
// against the lookup code.
3411 static const int kTriLatin1Likely = 1;
3412 static const int kTriLatin2Likely = 2;
3413 static const int kTriLatin7Likely = 3;
3414 
3415 // Each table entry has 32 times two bits, selected by byte[2]
3416 // Entry subscript is selected by byte[0] and byte[1]
3417 // Latin1/2/7 boost vector, generated 2007.09.26 by postproc-enc-detect-short.cc
3418 static const uint64 kLatin127Trigrams[1024] = {
3419 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
3420 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
3421 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
3422 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
3423 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
3424 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
3425 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
3426 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
3427 0x0000000000000000ULL, 0x304080c0402c3330ULL, 0x0008400004000000ULL, 0x082800000c200000ULL,
3428 0x23a0000420800030ULL, 0x00000000000ccc00ULL, 0x0500100100100000ULL, 0x0388400000200010ULL,
3429 0x0000000000000c00ULL, 0xd0f0300740f0cf00ULL, 0x2aa0a2a22882a2acULL, 0x081d800000000080ULL,
3430 0x0c82000020000000ULL, 0x200a03c000a00000ULL, 0x0008400400290000ULL, 0x0400870000000000ULL,
3431 0x00f040c00000c080ULL, 0x0008004000000410ULL, 0x0020300000000030ULL, 0x00a030002c300000ULL,
3432 0x0c8030c020a00000ULL, 0x15410030f0f4c000ULL, 0x3000000300a00000ULL, 0xa2880980a0880a88ULL,
3433 0x0900300000000000ULL, 0x0000040100300000ULL, 0x0888820020a00000ULL, 0xc044002242010000ULL,
3434 0x000000121d300040ULL, 0x40100040440c0d54ULL, 0x00008423102f8144ULL, 0x0b40808400000280ULL,
3435 0x0000000000000000ULL, 0x0680a000000c0000ULL, 0x0880008020aa0000ULL, 0x2aaa0141010a4940ULL,
3436 0xcb80000000010000ULL, 0x2280000000000000ULL, 0x5248000001800000ULL, 0x8000401004040010ULL,
3437 0x1540010201001010ULL, 0x0080080400000000ULL, 0x5a00044040000108ULL, 0x0288000282080008ULL,
3438 0x4800008002200000ULL, 0x4a00000000010100ULL, 0x8a88040080000800ULL, 0x0140800000000400ULL,
3439 0x40010050000c0000ULL, 0x0000008000000000ULL, 0x0028000020140040ULL, 0x8620401401005308ULL,
3440 0xc082000000000400ULL, 0x05c0b004c0240600ULL, 0x0288000080000000ULL, 0x0000014000000000ULL,
3441 0x00000000040000c0ULL, 0x8001861008004280ULL, 0x0200000000000300ULL, 0x0000240242288620ULL,
3442 0x801000c05434c200ULL, 0x9020162040a2d2b4ULL, 0x0021840000240704ULL, 0x2a80280080084908ULL,
3443 0x0000000000000000ULL, 0x0500004000000040ULL, 0x0080000000040000ULL, 0x0108058104440000ULL,
3444 0x0900000000040000ULL, 0x00c0000000208008ULL, 0x2000005000000000ULL, 0x0080000000050000ULL,
3445 0x0808000000001080ULL, 0x9880810100308000ULL, 0x2285480080081a08ULL, 0x8a80000080080000ULL,
3446 0x1450000000600010ULL, 0x2210000100000000ULL, 0x8a88000100011000ULL, 0x1541804000000010ULL,
3447 0xc084011140040100ULL, 0x0000000000000800ULL, 0x0400000000000030ULL, 0x2a800000a0890128ULL,
3448 0x1140a00054000104ULL, 0x1440000101200404ULL, 0x028800400400d800ULL, 0x0000000000000000ULL,
3449 0x0000000000002330ULL, 0x0020820228a02280ULL, 0xa2888a02aa8008a8ULL, 0xd0040a0044202500ULL,
3450 0x8000044104a29424ULL, 0xc000100178b2c5b4ULL, 0x0000810100241504ULL, 0xd040030000380008ULL,
3451 0x0000000000000000ULL, 0x26c08c0000200130ULL, 0x4a08000110080000ULL, 0x2aa0004001080800ULL,
3452 0x0aac000000004000ULL, 0x2000000000200000ULL, 0x4240000100020000ULL, 0x4100000080000000ULL,
3453 0x4900040000000000ULL, 0x0800000400300040ULL, 0x6a80000000040800ULL, 0x2a08182000588008ULL,
3454 0x0a00000c81000008ULL, 0x0a000c0010000000ULL, 0x8a88001080280808ULL, 0x0020000200300600ULL,
3455 0xaac00000900a0000ULL, 0x0000100004000000ULL, 0x0020081020000000ULL, 0x8220105010084110ULL,
3456 0x4a80800000004000ULL, 0x050000c0c0200000ULL, 0x288c000084000000ULL, 0xa048082280000000ULL,
3457 0x0000000000000000ULL, 0x8000900000032080ULL, 0xee889e81b8880820ULL, 0xc2200a8142800424ULL,
3458 0xc020141543361010ULL, 0x10a000204a801634ULL, 0x3a808800802a00a0ULL, 0x28808b00803d0800ULL,
3459 0x0000000000000000ULL, 0x0020000000000030ULL, 0x0808400121010040ULL, 0x0c28240100200040ULL,
3460 0x2008200028800000ULL, 0xc10004c80f30c030ULL, 0x0400440114100000ULL, 0x2208200280a22220ULL,
3461 0x0600000030c01000ULL, 0x1201001040c00000ULL, 0x0aa02ea22aa22aa0ULL, 0x30008000000200a0ULL,
3462 0x20c8400400800000ULL, 0x08280b0420800000ULL, 0x0800100000210000ULL, 0x10000300c0100400ULL,
3463 0xc8c0000420000000ULL, 0x1000000010000000ULL, 0x0420000400000000ULL, 0x0220000500204000ULL,
3464 0x2200000420000000ULL, 0x0000540400000000ULL, 0x0000000020000000ULL, 0x00080c00a0810080ULL,
3465 0x1540000000043000ULL, 0x0000000000100000ULL, 0x2e88a22220200a20ULL, 0xc06030e34ea503a0ULL,
3466 0x0001100204048500ULL, 0x000000e0000c0d54ULL, 0x3000820310a31400ULL, 0x13088c0320e00280ULL,
3467 0x0000000000000000ULL, 0x0480000000200000ULL, 0x4000200100000000ULL, 0x0000300040040000ULL,
3468 0x4400000000000000ULL, 0x0401000002240000ULL, 0x0540000000040000ULL, 0x4004010000000000ULL,
3469 0x4001111001100000ULL, 0x2880000000300040ULL, 0x4040004040002404ULL, 0x0200000000000000ULL,
3470 0x0140040000100000ULL, 0x4040010040040080ULL, 0x0a00140000041004ULL, 0x0000a00400808000ULL,
3471 0x1010200000430040ULL, 0x0010000000000000ULL, 0x0540000000104000ULL, 0x1400114005000000ULL,
3472 0x0000204000440010ULL, 0x0500000000004400ULL, 0x4500000018000400ULL, 0x0000400000000000ULL,
3473 0x000000300000cc00ULL, 0x0100001011300000ULL, 0x0040000000000000ULL, 0xc0e0000248a00444ULL,
3474 0x0000040020340144ULL, 0x0000046445105454ULL, 0x32a0a80280880128ULL, 0x0880040000100100ULL,
3475 0x0000000000000000ULL, 0x14003000030c0004ULL, 0x4a04001100000000ULL, 0x0a00108010000000ULL,
3476 0x28a8004000200248ULL, 0x0100040000b00000ULL, 0x42000000000008c0ULL, 0x6008044010550010ULL,
3477 0x0800401000010400ULL, 0x080080040cf80000ULL, 0x5080000001001010ULL, 0x2a80100000000000ULL,
3478 0xcc8010010d401100ULL, 0x0200000001001000ULL, 0x0480001004001000ULL, 0x8d00800040b40210ULL,
3479 0x6200800000300000ULL, 0x0000010000000000ULL, 0x0428004100010000ULL, 0x4320105141501100ULL,
3480 0xe28c0000000c1000ULL, 0xd5c000c3c0e00300ULL, 0x0001000000100200ULL, 0x1004010202400008ULL,
3481 0x0000000000003000ULL, 0x2aa038a0800aab08ULL, 0x2a88038000000000ULL, 0xc220040242f09720ULL,
3482 0x8020200200ba0420ULL, 0x0020106105101004ULL, 0x0480800000220400ULL, 0x2280100080000008ULL,
3483 0x0000000000000000ULL, 0x9000000000200000ULL, 0x0001000000100000ULL, 0x2aa40c0000080800ULL,
3484 0x0040000040010000ULL, 0x0040000000c01000ULL, 0x4000000040000400ULL, 0x0000001000200000ULL,
3485 0x0000010000000000ULL, 0x05808004000c0000ULL, 0x50400c0000000400ULL, 0x020040008f000040ULL,
3486 0x0800000000100000ULL, 0x0000000000000000ULL, 0x0a08440000004000ULL, 0x0064000400008200ULL,
3487 0x0010010010034170ULL, 0x0000000010000000ULL, 0x0100204021000000ULL, 0x022000d000010100ULL,
3488 0x0840300000c00000ULL, 0x1400000040204400ULL, 0x09800c0040000000ULL, 0x0209708000000000ULL,
3489 0x000000000000c040ULL, 0x90000c50204040a0ULL, 0x0000000000000000ULL, 0x00e1500040200004ULL,
3490 0x8020260540204494ULL, 0x0020026150201054ULL, 0x0281800380105634ULL, 0x0884900481105000ULL,
3491 0x0000000000000000ULL, 0x84203c00002c0200ULL, 0xc089040000000000ULL, 0xc2a8100040200004ULL,
3492 0xe00c1c0000000000ULL, 0x0ce1330080200080ULL, 0x0000000000200000ULL, 0xc400110000404010ULL,
3493 0x0088400000000000ULL, 0x00083cc00c00c00cULL, 0xcac01c00c000580cULL, 0xe300b0f000100000ULL,
3494 0x0300000000000000ULL, 0xc0000f0000000000ULL, 0xc3c01c0400000000ULL, 0x81008004c0f40000ULL,
3495 0xc3d8003000000440ULL, 0x0000000000000000ULL, 0xc430000000000000ULL, 0x0060000000001000ULL,
3496 0x0800000000000000ULL, 0x00c03300f0fc0008ULL, 0x3000000400200010ULL, 0xa2a80892a0880a28ULL,
3497 0x0500000040000004ULL, 0x0000000000000000ULL, 0xc80032070c200020ULL, 0x0220820060a296a0ULL,
3498 0x802084021db486a0ULL, 0x00000d60080c0080ULL, 0xb281803313a32428ULL, 0x1808300320300000ULL,
3499 0x0000000000000000ULL, 0x85208cc0ccac1f20ULL, 0x2081000186100808ULL, 0x22a80880000a0808ULL,
3500 0xaaa8086880000000ULL, 0x802084800a2e9200ULL, 0xa280000000002008ULL, 0xa000000080080400ULL,
3501 0x2080010000000008ULL, 0x802020c00c028c80ULL, 0x2080000000140810ULL, 0x2a80086080080008ULL,
3502 0x2a800000a8000800ULL, 0xaa881800a2080800ULL, 0xaa98004080280808ULL, 0x004483d0c0300000ULL,
3503 0xa280002080080000ULL, 0x0000000000300000ULL, 0x22a1030000000008ULL, 0xa8a0301088880880ULL,
3504 0xaa80002080222808ULL, 0x85400c03fc030400ULL, 0x8a88000000000008ULL, 0xa008008010080008ULL,
3505 0x0000000000010000ULL, 0x0040100000301040ULL, 0x28800000a0002008ULL, 0x122482306cbc0eacULL,
3506 0x8020224222b8c6a0ULL, 0x802002004a82c284ULL, 0x0aa08fc440a41c80ULL, 0x888080d181385098ULL,
3507 0x0000000000000000ULL, 0x00c0b000000c0080ULL, 0x2208001000000800ULL, 0x0a28000000200000ULL,
3508 0x0000000300000000ULL, 0x00c1040000200000ULL, 0x0203020000000000ULL, 0x0248000000020000ULL,
3509 0x0000840000100000ULL, 0x0a808c00c000008cULL, 0x5200040040000004ULL, 0x02000c00000080a0ULL,
3510 0x0b0c000020000000ULL, 0x0b04000001000000ULL, 0x088c0010002000c0ULL, 0x80e08b00c0030c20ULL,
3511 0x0280000200014040ULL, 0x0000000000000000ULL, 0x0e20a0a008000020ULL, 0x0e280fd03f00111cULL,
3512 0x200080c020001000ULL, 0x8cc00c02c02f0400ULL, 0x480c0001000c404cULL, 0x0208014281080808ULL,
3513 0x000000000000fcfcULL, 0x004403300cf00030ULL, 0x2200000000004400ULL, 0x02202000c08c0c20ULL,
3514 0x02202022683a80a0ULL, 0x4020228028008c00ULL, 0x32208cc0002c0200ULL, 0x3ec00c0080304008ULL,
3515 0x0000000000000000ULL, 0x34000c00002c0000ULL, 0x0b00000100100030ULL, 0x0823018000000000ULL,
3516 0x0e8c001c01e00000ULL, 0x1200800600330000ULL, 0x4000110000000000ULL, 0x0080000300000000ULL,
3517 0x0800000000000000ULL, 0x08c08c04000c0000ULL, 0x0080400000880000ULL, 0x0a08000080c00008ULL,
3518 0x0800000304400000ULL, 0x0208000000c00000ULL, 0x2888300080400800ULL, 0x8dc0204400000000ULL,
3519 0xc0000000c0800000ULL, 0x0000c10000000000ULL, 0x24000c4010c00000ULL, 0x272000541d811000ULL,
3520 0x0200400000001000ULL, 0x0400000400001004ULL, 0xc08c007004001000ULL, 0x2048004000000000ULL,
3521 0x000000000003fcfcULL, 0x2aa030000cf8c800ULL, 0xe280000000000000ULL, 0x0a21008142000340ULL,
3522 0x0021002000b61040ULL, 0x800004064006d444ULL, 0x3aa0800300230008ULL, 0x0b00030000300000ULL,
3523 0x0000000000000000ULL, 0x01c080000000040cULL, 0x0100000000004000ULL, 0x0aa8018010001000ULL,
3524 0x0800000000100000ULL, 0x3000000000008c00ULL, 0x5400000013000000ULL, 0x02c0c00004004010ULL,
3525 0x5241100010000c00ULL, 0x0e00080000000808ULL, 0x5281000000000800ULL, 0x0a08108020000800ULL,
3526 0x0a80000000005210ULL, 0x0100000041000000ULL, 0x2a88000002080110ULL, 0x8520800000c00080ULL,
3527 0x01000010108c0100ULL, 0x0000000000000000ULL, 0x42a0420080000000ULL, 0x0020001004010010ULL,
3528 0xc4000000000c0000ULL, 0x01000c00c0200400ULL, 0x4600000100000000ULL, 0x0000000000000000ULL,
3529 0x0010001000000010ULL, 0x910400900820d030ULL, 0x2280000000000000ULL, 0xc2212004400040e4ULL,
3530 0x8001000000b61420ULL, 0xa00002a248e810b4ULL, 0x32008000002c0008ULL, 0x0c010034803c5010ULL,
3531 0x0000000000000000ULL, 0x85008002002c0000ULL, 0x0204001000004010ULL, 0x0120008000200000ULL,
3532 0x000010000c2000c0ULL, 0xccc0000000200000ULL, 0x0400000c00100040ULL, 0x0003300100004100ULL,
3533 0x4000551040000004ULL, 0x0e0080000c820808ULL, 0xc000000000080800ULL, 0xc803000000000000ULL,
3534 0x0a4000c000200000ULL, 0x0040000000c00000ULL, 0x0918145000405000ULL, 0x81400000c0300400ULL,
3535 0x0050000000000000ULL, 0xd000045000000000ULL, 0x0400004000400000ULL, 0x0420104010000110ULL,
3536 0x0700000000203000ULL, 0x34800300c0e00704ULL, 0x4440100044000400ULL, 0x0040000040000000ULL,
3537 0x0030000044000000ULL, 0xeaaca0008808c880ULL, 0x0a01000000200000ULL, 0x1220a300403ccf20ULL,
3538 0x002024c200b61044ULL, 0x802014346aa2d434ULL, 0x30008c00c0820c44ULL, 0x0a000000000c4800ULL,
3539 0x0000000000000000ULL, 0x0000404000340c90ULL, 0x08a8a10820800280ULL, 0x8128009022201000ULL,
3540 0x0020808228a000a0ULL, 0x0020400100410000ULL, 0x0400000110000000ULL, 0xa609000000200000ULL,
3541 0x8008330000d00000ULL, 0x8060100040404010ULL, 0xeaa00ea0ea00808cULL, 0x200c8020a0000020ULL,
3542 0x0408800020200000ULL, 0x0189001403200000ULL, 0xc00800000000c000ULL, 0x200430c00c300000ULL,
3543 0x0100300100004000ULL, 0x0000040000000000ULL, 0x2420000400001000ULL, 0x89a1200400000000ULL,
3544 0x20c8a000208c0000ULL, 0x8080000000000000ULL, 0x28a0108020210080ULL, 0xa2a84800a0880988ULL,
3545 0x258008000400c000ULL, 0x0140000000100000ULL, 0xa028a222a0aa0228ULL, 0xc060012054044040ULL,
3546 0x0010010400000000ULL, 0x00000050150c0114ULL, 0x0000008010c20010ULL, 0xaa088000a0200880ULL,
3547 0x0000000000000000ULL, 0x0700b0c0000c0000ULL, 0x2200040000080030ULL, 0x2aa8808040240800ULL,
3548 0x08b0500000000100ULL, 0x1000830400200000ULL, 0x4204000010000000ULL, 0x40c2200050040050ULL,
3549 0x0104404001010000ULL, 0x1a808c8103c00030ULL, 0x30900010c0000b00ULL, 0x200812b283000008ULL,
3550 0x000c000020e00000ULL, 0x2140000000400000ULL, 0x0288000080200000ULL, 0x8060a200c8a20280ULL,
3551 0x0400114010215000ULL, 0x0000000000000000ULL, 0x082b200002000010ULL, 0x22a0030000031000ULL,
3552 0x008100001000000cULL, 0x05400c00c0230400ULL, 0xca3000003c080100ULL, 0x0000000020000004ULL,
3553 0x0000000100000000ULL, 0x8004320813f5c000ULL, 0xa280080200000800ULL, 0xc22000044e334c20ULL,
3554 0x000004146e361024ULL, 0x800126806aa0d584ULL, 0xb000a0040023c41cULL, 0x0a083000803053d8ULL,
3555 0x0000000000000000ULL, 0x0000100000020000ULL, 0x0000000010000010ULL, 0x0000000045040004ULL,
3556 0x0000000000100000ULL, 0x0000020400000010ULL, 0x0003015000000000ULL, 0x0400000000000000ULL,
3557 0x0000000400000000ULL, 0x0100000000000800ULL, 0x0000001000000000ULL, 0x0000000000000000ULL,
3558 0x0000000040000000ULL, 0x0000000000000000ULL, 0x0004001000000000ULL, 0x0008001000000000ULL,
3559 0x0010000000000004ULL, 0x0000010100001000ULL, 0x0004000000000004ULL, 0x0000014040050014ULL,
3560 0x0014000000000040ULL, 0x5540000000041000ULL, 0x0000000000000000ULL, 0x0000040000000d00ULL,
3561 0x0000000000000000ULL, 0x0000000000100000ULL, 0x0001000000000000ULL, 0x0000000000000000ULL,
3562 0x0000000000000000ULL, 0x0000000000000000ULL, 0x4500000000040400ULL, 0x0000800000000400ULL,
3563 0x0000000000000000ULL, 0x13e080000020000cULL, 0xcf00001005100000ULL, 0x04a8008000200300ULL,
3564 0x00280100100000c0ULL, 0x1c8c000040200000ULL, 0x0600005000100000ULL, 0x050800000c104000ULL,
3565 0x4c10101000110000ULL, 0x0c00000000300000ULL, 0x22040c00100000c0ULL, 0x0800700010100000ULL,
3566 0x0000000000001000ULL, 0x0a08000010000040ULL, 0x0800034004210010ULL, 0x04e0000400000000ULL,
3567 0x0800030020000000ULL, 0x0000005000000000ULL, 0x0400110101304110ULL, 0x0428000010a01000ULL,
3568 0x060b000000800010ULL, 0x35810c00c020c000ULL, 0x00800c4321800000ULL, 0x4208088020000080ULL,
3569 0x040000111003ff00ULL, 0x0020900020202080ULL, 0x22888180a8000888ULL, 0x0225200542005420ULL,
3570 0x2020040400340020ULL, 0x10300424500cc444ULL, 0x3081a00400e00200ULL, 0x33001300c0300000ULL,
3571 0x0000000000000000ULL, 0x04003c0000000000ULL, 0x0a04001000100100ULL, 0x1408000001000000ULL,
3572 0x1800000044100000ULL, 0x3400040400000300ULL, 0x5000040801000040ULL, 0x4088401040000040ULL,
3573 0x1010110130100000ULL, 0xca800c3000300000ULL, 0x5a01000000080100ULL, 0x020280000cd01300ULL,
3574 0x0302000410200010ULL, 0x0000102000300000ULL, 0x0b09000000000000ULL, 0x20008004c4800004ULL,
3575 0x28c0410010000000ULL, 0x0004015041000050ULL, 0x0a01006000200200ULL, 0x0020d00000100040ULL,
3576 0x0010a00100900000ULL, 0x3500bf00c0030300ULL, 0x080c010000200d00ULL, 0x2248000004020010ULL,
3577 0x0000c00000000000ULL, 0x8044b00200e08000ULL, 0xaaa82aa2aa8a2aa8ULL, 0x0220002241c08604ULL,
3578 0x4200260440328444ULL, 0x68001226103008b4ULL, 0x3a0080c0b0000400ULL, 0x2a804804803c4008ULL,
3579 0x0000000000000000ULL, 0x04008c0300000400ULL, 0x008000c0000c0000ULL, 0x088001000000001cULL,
3580 0x0840000001000010ULL, 0x0400000000200c00ULL, 0x4244000101040000ULL, 0x4238007011100000ULL,
3581 0x1000d00100000010ULL, 0x1d00800400300000ULL, 0x4204080c00000000ULL, 0x2a88080080000008ULL,
3582 0x08001c0200001000ULL, 0x0a00000400000000ULL, 0x8a88003080080000ULL, 0x0521800400300000ULL,
3583 0x3200051000201000ULL, 0x0000000000000000ULL, 0x0020801404000000ULL, 0x322010401c0c101cULL,
3584 0x0c01100013000000ULL, 0x04003000c0204000ULL, 0x088c0020a0cc0000ULL, 0x2200000080000018ULL,
3585 0x0404000044000000ULL, 0x82a0b000008820b0ULL, 0x0000040020440000ULL, 0xc2650004403f1420ULL,
3586 0x0021340241b64464ULL, 0x8020040242c2d474ULL, 0x32018c0480288000ULL, 0x00800b0080300000ULL,
3587 0x0000000000000000ULL, 0x05008c0000040130ULL, 0xc0d8000000800000ULL, 0x0020000020200200ULL,
3588 0x23a2000120204000ULL, 0x5052100550104150ULL, 0x1000101100040000ULL, 0xc40001c301000000ULL,
3589 0x8288000000c00000ULL, 0x5150040144d01404ULL, 0xea8c0ea028ae088cULL, 0xc31010c000000c80ULL,
3590 0x0002000060000000ULL, 0xc80800f030000000ULL, 0x0000000400300000ULL, 0xc00080c00ff0c344ULL,
3591 0x00080001200c0000ULL, 0x0000050080000000ULL, 0x0328000300300000ULL, 0x082030000cc01040ULL,
3592 0xeb08800100004000ULL, 0x8030003300c80f00ULL, 0xfb0d0000e4ac0000ULL, 0x0020006080000008ULL,
3593 0x0500100100040000ULL, 0x1140000000000000ULL, 0xcb883330a0e00000ULL, 0xc000010050000080ULL,
3594 0x0010104005b54150ULL, 0x40111d5155001554ULL, 0x80000070140f0004ULL, 0x0b0830c3a0003380ULL,
3595 0x0000000000000000ULL, 0x04c13000000f830cULL, 0x2808000000000000ULL, 0x2810000000000800ULL,
3596 0x08c0080004400000ULL, 0x04c0240300801c20ULL, 0x4040000080000004ULL, 0x0000400100100010ULL,
3597 0x020001008000c0c0ULL, 0x1d008c000c3c0000ULL, 0x0080003000000800ULL, 0x2288080080000008ULL,
3598 0x0a84004020220000ULL, 0x0800080000100000ULL, 0xaa80004080400008ULL, 0x8024000400c01660ULL,
3599 0x80841c2001000104ULL, 0x0001000000000000ULL, 0x0020028020020280ULL, 0x0860404011900100ULL,
3600 0xec80080200000000ULL, 0x010103c100200400ULL, 0x0200004000000000ULL, 0x0000000000400400ULL,
3601 0x000010000003fcfcULL, 0x8040083238c20000ULL, 0x08800220a0920a00ULL, 0x08210004483c0c24ULL,
3602 0xc020240740b0a200ULL, 0x802006014a201494ULL, 0x3201233070ac0e00ULL, 0x08002806033a48a0ULL,
3603 0x0000000000000000ULL, 0x8020820028a00680ULL, 0x2000002000000104ULL, 0x22a80801100a0808ULL,
3604 0xa2a8002080000000ULL, 0xa000800008a08000ULL, 0x0000100000400000ULL, 0x8000002100000000ULL,
3605 0x0000010000004404ULL, 0xa2a0088080000888ULL, 0x0000000010400800ULL, 0xa280082080080008ULL,
3606 0x2280000080010008ULL, 0x2000000000000000ULL, 0x228800008c080808ULL, 0x8021828002a98200ULL,
3607 0xa200002000080000ULL, 0x0000040000000000ULL, 0x22a0000080000000ULL, 0x202882c200800080ULL,
3608 0xa000000001004000ULL, 0x000000c808a00600ULL, 0x0000000010000000ULL, 0x000001000000040cULL,
3609 0x0000000000000000ULL, 0x802002a2a8aa82a0ULL, 0x20000024a8088228ULL, 0x8020820001000000ULL,
3610 0x8020000000808280ULL, 0x8000000000000000ULL, 0x0020800000200280ULL, 0x2080082280a00888ULL,
3611 0x0000000000000000ULL, 0x0000015000000040ULL, 0x0000040000040000ULL, 0x0100010010001000ULL,
3612 0x0000003210008000ULL, 0x0000000404000000ULL, 0x0000000000000400ULL, 0x0200000000000000ULL,
3613 0x0000000000000100ULL, 0x5180014400004050ULL, 0x1000000014000000ULL, 0x4200000000000000ULL,
3614 0x0040200000000000ULL, 0x0201004000000000ULL, 0x0a00000000000010ULL, 0x0040200000800000ULL,
3615 0x0040051000000500ULL, 0x0000000100800400ULL, 0x6000000000000000ULL, 0x0000000000000000ULL,
3616 0x280000c1400040ccULL, 0x4180001000000000ULL, 0x00000000c1000104ULL, 0x0000000000000000ULL,
3617 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0080000000c00000ULL, 0x0004006066004000ULL,
3618 0x0000005000040440ULL, 0x0000106005804044ULL, 0x0000a10511004440ULL, 0x0000000000000110ULL,
3619 0x0000000000000000ULL, 0x0000000000080000ULL, 0xeb0808a020800080ULL, 0x29a80081002a1800ULL,
3620 0x0b2c000202100100ULL, 0x0001000000888000ULL, 0x2280102010000000ULL, 0x020000602a004110ULL,
3621 0x8a800160a6108100ULL, 0x0280000000000020ULL, 0x8a8000a0a8808208ULL, 0x0280882080500308ULL,
3622 0x0b18010020804100ULL, 0xeb080000c0080080ULL, 0x2b08000000810130ULL, 0x0000000008040020ULL,
3623 0xaa0a08e082894140ULL, 0x0000000000000000ULL, 0x202081409010001cULL, 0x8aa8805082806000ULL,
3624 0xeb082900289c0000ULL, 0x0000000000008000ULL, 0xf80c2e20002e0000ULL, 0xa288080420880888ULL,
3625 0x0000010000000000ULL, 0x0000000000102000ULL, 0x22880000a8a80808ULL, 0x022022a22aa880a0ULL,
3626 0x0000222222aa0620ULL, 0x0000022002800000ULL, 0x208080004028a000ULL, 0x2b888800801c0828ULL,
3627 0x0000000000000000ULL, 0x22e0828280a08028ULL, 0xaa88002082080308ULL, 0x0ea80080410a0040ULL,
3628 0x2a28222000a00000ULL, 0x8aa2808028a0a2a0ULL, 0x0200001000000000ULL, 0x82080000a0000000ULL,
3629 0x8800000082000808ULL, 0x2a008a0000300888ULL, 0x0a80080080080808ULL, 0xaa882800840b0808ULL,
3630 0x0a80000080000040ULL, 0xea080820a0000000ULL, 0xaa88080080080808ULL, 0x8040a2800a8024a0ULL,
3631 0xaa800020a0080808ULL, 0x0000040000000000ULL, 0x2a280a0080080880ULL, 0x2a20081080008a00ULL,
3632 0x2a88882088aa0008ULL, 0x81800202c0a01480ULL, 0xea88082082200000ULL, 0xaa88002080080008ULL,
3633 0x0000100000000000ULL, 0x802082a22aa0a2a0ULL, 0x2e80000000000000ULL, 0x0220a2a26aa0a2a8ULL,
3634 0x800022a2228a22a0ULL, 0x880002212e82c0b0ULL, 0x02a0aa0002a82228ULL, 0x2d808b0080380008ULL,
3635 0x0000000000000000ULL, 0x000407551c154244ULL, 0x2a00208088a02228ULL, 0x12a82182a2402a88ULL,
3636 0xe32821e020826d00ULL, 0x801130100ccc1330ULL, 0x028010c000841008ULL, 0x88a08002a0a664a0ULL,
3637 0x0048270080000100ULL, 0x00001f010cd10f30ULL, 0xe2242ce22aaea2a0ULL, 0xc2c00cc20ae22460ULL,
3638 0xe208003128021c10ULL, 0x2a2021c010821080ULL, 0x2a88202082202020ULL, 0x4010111104941410ULL,
3639 0xc80c02c182b00080ULL, 0x0000040000000000ULL, 0xe28030068002c300ULL, 0x2aa02024a2a22228ULL,
3640 0xe20889328aa22080ULL, 0x0000000000210100ULL, 0xaa0028e0a9b221a0ULL, 0x2000008080400000ULL,
3641 0x0000010041150404ULL, 0x0000105114410100ULL, 0xeaa82aa6aaaaaaa8ULL, 0x000000f44300c434ULL,
3642 0x0000222222b00020ULL, 0x0000002000000000ULL, 0x0000004014000000ULL, 0x0039b3f73fbcd3fcULL,
3643 0x0000000000000000ULL, 0x0000104015045040ULL, 0x20a80490a08800a0ULL, 0x40a8258410a909a0ULL,
3644 0xe0a8a2022aa2e2a0ULL, 0xc111010014000500ULL, 0x2080044041840004ULL, 0x28a8200220a2aba0ULL,
3645 0x008400a0a2840800ULL, 0x0101015451009464ULL, 0x20000ea0e02c2c2cULL, 0xe2a828a2aca2aaa8ULL,
3646 0x682020a228a222a0ULL, 0xe8882ae22aa2a2a0ULL, 0xe9a80e6022a24140ULL, 0x0011055005001040ULL,
3647 0x2aa8208229a0aaa4ULL, 0x0000040000000000ULL, 0x28a0228026a62260ULL, 0xe2a020a422a2a020ULL,
3648 0xe808a0022aa1a220ULL, 0x0000010014000100ULL, 0x28ac22802aa2a020ULL, 0x0020000000000000ULL,
3649 0x0100010100040000ULL, 0x0000000000000000ULL, 0x22a822a22a8aaaa0ULL, 0x0000000000000000ULL,
3650 0x0000102410800100ULL, 0x0000000000000000ULL, 0x0000000002000000ULL, 0x00000fb2a08c0aa8ULL,
3651 0x0000000000000000ULL, 0x4010005015440140ULL, 0x18c81c00b180001cULL, 0x2800048021820800ULL,
3652 0x8ab820c06a802580ULL, 0x00100170f4040000ULL, 0x4000144041041404ULL, 0x0ac800d0002e440cULL,
3653 0x20880820a2000808ULL, 0x400000f03f300c00ULL, 0xaa000ea22aa22aa0ULL, 0xa2880ac0a8942a20ULL,
3654 0xaa880a81a1804188ULL, 0xeea022a0aaa02080ULL, 0xaaa820a2aaa66120ULL, 0x0000005115800150ULL,
3655 0x2a880920a0840040ULL, 0x0000040000000000ULL, 0xaea82222aaa22a28ULL, 0x8a28041260055150ULL,
3656 0xa28824008aa28880ULL, 0x0000025014019000ULL, 0xea882ae02aa200a0ULL, 0x0000000000000000ULL,
3657 0x0000000040000400ULL, 0x0000000000000000ULL, 0xaaa82aa22aaaaaa0ULL, 0x0000000000000000ULL,
3658 0x0000000000000000ULL, 0x002003003c80c000ULL, 0x0000020014000000ULL, 0x00200010a0980a20ULL,
3659 0x0000000000000000ULL, 0x0020001200801240ULL, 0x0a88000089800020ULL, 0xcaa00080a1000000ULL,
3660 0x0a200c0020a04080ULL, 0x4002034003840880ULL, 0x4690500190000050ULL, 0x2228004000601000ULL,
3661 0x0a803f00803f400cULL, 0x400033e24dd0cf34ULL, 0xaa80a2a229a220a0ULL, 0x0a224000002c0000ULL,
3662 0x028000202000008cULL, 0x0a08000070000030ULL, 0x00800c040020000cULL, 0x0000000002850000ULL,
3663 0x02881cc310200000ULL, 0x0000040004000000ULL, 0xcba8000400000080ULL, 0xcaa02c0680000000ULL,
3664 0xcc880002008c4080ULL, 0x300000f007f0cf0cULL, 0x0a80001080a00000ULL, 0x820880802a880a80ULL,
3665 0x0000050001040004ULL, 0x0000011000000000ULL, 0x0a8020a2a0202000ULL, 0x0000022202008000ULL,
3666 0x0000222212808000ULL, 0x0020226010000000ULL, 0x000033f33ff3c33cULL, 0x00288002a08c02a8ULL,
3667 0x0000000000000000ULL, 0x04408e0000008200ULL, 0x0808004000900000ULL, 0x0aa8200010ca00c0ULL,
3668 0x0ba80101005d4010ULL, 0x00018604802c8288ULL, 0x00049400101c0000ULL, 0x000c101110505010ULL,
3669 0x0000000000100000ULL, 0x30000c00c022000cULL, 0xd0c00dd0d51d431cULL, 0x0008000010100000ULL,
3670 0x000c1001a0280000ULL, 0x0bc80000c0000000ULL, 0x0a00000080280000ULL, 0x8000a00220308420ULL,
3671 0x0808000010301000ULL, 0x0000040000000000ULL, 0x0d00031480100000ULL, 0x07200000108c0300ULL,
3672 0x0bc0a0c000004000ULL, 0x8000b002c0208480ULL, 0x340c0100118c111cULL, 0x8008008020890000ULL,
3673 0x0000000000040010ULL, 0x0020b00320c1d0b0ULL, 0x00002000000c0000ULL, 0x0020be226e2008a0ULL,
3674 0x002010c03fb0a6a0ULL, 0x00202e222aaec284ULL, 0x00008f0000208400ULL, 0x0000000000300000ULL,
3675 };
3676 // Latin1 6%, Latin2 11%, Latin7 3%
3677 
3678 
3679 
// Debugging aid only. Decodes a packed trigram subscript (three 5-bit codes,
// first character in the highest bits) into a printable 3-character string.
// The text is written into one shared static buffer that every call
// overwrites, so this is not thread-safe.
static char tri_string[4];
char* Latin127Str(int trisub) {
  // One printable stand-in per 5-bit code value
  static const char kCodeChars[] = "_abcdefghijklmnopqrstuvwxyzAEIOC";
  const int kCodeMask = 0x1f;

  int shift = 10;
  for (int pos = 0; pos < 3; ++pos, shift -= 5) {
    tri_string[pos] = kCodeChars[(trisub >> shift) & kCodeMask];
  }
  tri_string[3] = '\0';
  return tri_string;
}
3689 
3690 // Returns two bits per three-byte trigram, indicating
3691 // dont-care, Latin1 likely, Latin2 likely, and Latin7 (ISO-8859-13) likely
TrigramValue(const uint8 * trisrc)3692 int TrigramValue(const uint8* trisrc) {
3693   int byte0_p = kMapToFiveBits[trisrc[0]];
3694   int byte1_p = kMapToFiveBits[trisrc[1]];
3695   int byte2_p = kMapToFiveBits[trisrc[2]];
3696   int subscr = ((byte0_p) << 5) | byte1_p;
3697   int temp = static_cast<int>((kLatin127Trigrams[subscr] >> (byte2_p * 2)));
3698   //printf("%s=%d ", Latin127Str((subscr << 5) | byte2_p), temp & 3);
3699   return temp & 3;
3700 }
3701 
3702 
3703 // Put out trigrams for surrounding 32 bytes for Latin encodings
3704 // Return true if more Latin2 & 7 than Latin1
BoostLatin127Trigrams(int tri_block_offset,DetectEncodingState * destatep)3705 bool BoostLatin127Trigrams(int tri_block_offset,
3706                            DetectEncodingState* destatep) {
3707   //printf("BoostLatin127Trigrams[%06x]\n", tri_block_offset);
3708   int excess_latin27 = 0;
3709   int srclen = destatep->limit_src - destatep->initial_src;
3710   int hi_limit = minint(tri_block_offset + 32, srclen - 2);
3711   const uint8* trisrc = &destatep->initial_src[tri_block_offset];
3712   const uint8* trisrclimit = &destatep->initial_src[hi_limit];
3713   while (trisrc < trisrclimit) {
3714     // Selectively boost Latin1, Latin2, or Latin7 and friends
3715     int trigram_val = TrigramValue(trisrc);
3716     if (trigram_val != 0) {
3717       if (FLAGS_enc_detect_source) {
3718         PsHighlight(trisrc, destatep->initial_src, trigram_val, 1);
3719       }
3720       if (trigram_val == kTriLatin1Likely) {
3721         Boost(destatep, F_Latin1, kTrigramBoost);
3722         Boost(destatep, F_CP1252, kTrigramBoost);
3723         // We don't want to upset the relative rank of a declared 8859-15
3724         Boost(destatep, F_ISO_8859_15, kTrigramBoost);
3725         --excess_latin27;
3726       } else if (trigram_val == kTriLatin2Likely) {
3727         Boost(destatep, F_Latin2, kTrigramBoost);
3728         Boost(destatep, F_CP1250, kTrigramBoost);
3729         ++excess_latin27;
3730       } else if (trigram_val == kTriLatin7Likely) {
3731         Boost(destatep, F_ISO_8859_13, kTrigramBoost);
3732         Boost(destatep, F_CP1257, kTrigramBoost);
3733         // We don't want to upset the relative rank of a declared 8859-4 or -6
3734         // for Estonian
3735         Boost(destatep, F_Latin4, kTrigramBoost);
3736         Boost(destatep, F_Latin6, kTrigramBoost);
3737         ++excess_latin27;
3738       }
3739     }
3740 
3741     ++trisrc;
3742   }
3743   //printf("\n");
3744 
3745   return (0 < excess_latin27);
3746 }
3747 
3748 
3749 
3750 // Boost any encodings that need extra detection help, then prune
3751 // src is first unscanned byte
3752 // slowend means extra pruning when dropping out of initial slow scan
3753 // final means last call -- no bigram at src
BoostPrune(const uint8 * src,DetectEncodingState * destatep,int prunereason)3754 void BoostPrune(const uint8* src, DetectEncodingState* destatep,
3755                 int prunereason) {
3756   int delta_asciipairs = destatep->next_interesting_pair[AsciiPair] -
3757     destatep->prior_interesting_pair[AsciiPair];
3758   int delta_otherpairs = destatep->next_interesting_pair[OtherPair] -
3759     destatep->prior_interesting_pair[OtherPair];
3760 
3761   if (prunereason == PRUNE_FINAL) {
3762     // We are about done
3763     // If we get here with very little accumulated data, the initial hints
3764     // were too strong, so we derate them to n+1 / 12 for n bigrams
3765     if (!destatep->hints_derated  &&
3766         (destatep->next_interesting_pair[OtherPair] < kDerateHintsBelow)) {
3767       int n = destatep->next_interesting_pair[OtherPair];
3768 
3769       // Map N pairs to (N+1)/12 portions of the initial hints, etc.
3770       // Floor of 3/12 -- 1/12 and 2/12 are too easy to overcome
3771       int m = maxint(3, (n + 1));
3772       for (int i = 0; i < NUM_RANKEDENCODING; ++i) {
3773         int original_delta = destatep->hint_prob[i];
3774         int scaled_delta = (original_delta * m) / kDerateHintsBelow;
3775         destatep->enc_prob[i] -= original_delta;
3776         destatep->enc_prob[i] += scaled_delta;
3777       }
3778       destatep->hints_derated = true;
3779       if (destatep->debug_data != NULL) {
3780         // Show derated-hint result
3781         char buff[32];
3782         snprintf(buff, sizeof(buff), "Hints %d/%d", m, kDerateHintsBelow);
3783         SetDetailsEncLabel(destatep, buff);
3784       }
3785     }
3786   }
3787 
3788 
3789   ++destatep->prune_count;
3790 
3791   if (prunereason != PRUNE_FINAL) {
3792     // Early outs
3793     if (destatep->rankedencoding_list_len <= 1) {            // nothing to prune
3794       destatep->done = true;
3795       return;
3796     }
3797 
3798     if ((destatep->prune_count > 0) &&
3799         (delta_asciipairs + delta_otherpairs) == 0) {
3800       // Nothing to do; must have just been called earlier
3801       return;
3802     }
3803   }
3804 
3805 
3806 
3807   // INCREMENT
3808   // ====================
  // Accumulate OtherPair probabilities over all active families
  // AsciiPair probabilities are all done in ActiveSpecialBoostWhack
3811   uint8 prior_bad_byte1 = ' ';    // won't match first bad pair
3812   uint8 prior_bad_byte2 = ' ';    // won't match first bad pair
3813   uint8 or_byte1 = 0;             // Track if any current pair has a high bit
3814   int counted_otherpairs = 0;
3815   uint8 prior_byte1x2x = 0;
3816   for (int i = 0; i < delta_otherpairs; ++i) {
3817     int watch1_incr = 0;
3818     int watch2_incr = 0;
3819     int next_pair = destatep->prior_interesting_pair[OtherPair] + i;
3820 
3821     uint8 byte1 = destatep->interesting_pairs[OtherPair][next_pair * 2 + 0];
3822     uint8 byte2 = destatep->interesting_pairs[OtherPair][next_pair * 2 + 1];
3823     uint8 byte1x2x = (byte1 & 0xf0) | ((byte2 >> 4) & 0x0f);
3824     int weightshift = destatep->interesting_weightshift[OtherPair][next_pair];
3825 
3826     int offset_byte12 = destatep->interesting_offsets[OtherPair][next_pair];
3827 
3828     // To help distinguish some Cyrillic, Arabic, Greek, Hebrew, Thai
3829     // Remember if this is a CDEF pair immediately following the previous pair
3830     // 8xxx CxCx or CxCx 8xxx
3831     bool next_pair_consec_hi = false;
3832     if (ConsecutivePair(destatep, next_pair)) {
3833       if ((byte1x2x & 0xcc) == 0xcc) {                // 8xxx CxCx
3834         next_pair_consec_hi = true;
3835       } else if ((prior_byte1x2x & 0xcc) == 0xcc) {   // CxCx 8xxx
3836         next_pair_consec_hi = true;
3837       }
3838     }
3839     //printf("prior/cur/consec %02x %02x %d\n",
3840     // prior_byte1x2x, byte1x2x, next_pair_consec_hi);
3841     prior_byte1x2x = byte1x2x;
3842 
3843     or_byte1 |= byte1;
3844     uint8 byte1f = byte1;
3845     // Flip top bit of subscript to better separate quadrant 4 (esp. for Hebrew)
3846     byte1f ^= (byte2 & 0x80);
3847 
3848     // If the same bigram occurred recently, don't increment again
3849     bool pair_used = false;
3850     if (!RepeatedBigram(destatep, byte1, byte2)) {
3851       ++counted_otherpairs;
3852       pair_used = true;
3853       // Boost both charset= declared encodings, so
3854       // Nearly-same probability nearby encoding doesn't drift to the top
3855       if (!FLAGS_demo_nodefault) {
3856         destatep->enc_prob[destatep->declared_enc_1] += kDeclaredEncBoost >> weightshift;
3857         destatep->enc_prob[destatep->declared_enc_2] += kDeclaredEncBoost >> weightshift;
3858       }
3859       bool was_bad_pair = false;
3860       for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
3861         int incr_shift = 0;
3862         int rankedencoding = destatep->rankedencoding_list[j];
3863         Encoding enc = kMapToEncoding[rankedencoding];
3864 
3865         // For binary, Skip over repeated marker bytes, such as 02, FF, etc.
3866         if ((rankedencoding == F_BINARY) &&
3867             RepeatedBinary(destatep, byte1, byte2)) {
3868           incr_shift = 2;       // count 1/4 as much if repeated
3869         }
3870 
3871         // If byte 1x2x for this encoding is exactly zero, illegal byte pair
3872         // Don't increment, but instead penalize
3873         const UnigramEntry* ue = &unigram_table[rankedencoding];
3874         if (ue->b12[byte1x2x] == 0) {
3875           // Don't whack consecutive duplicate bad pairs -- overkill
3876           if ((byte1 != prior_bad_byte1) || (byte2 != prior_bad_byte2)) {
3877             // Extra whack for illegal pair in this encoding
3878             Whack(destatep, rankedencoding, kBadPairWhack >> weightshift);
3879             was_bad_pair = true;
3880           }
3881         } else {
3882           // OK to do the real increment
3883           int incr = ue->b1[byte1f] + ue->b2[byte2] + ue->b12[byte1x2x];
3884           if ((ue->b12[byte1x2x] & 0x01) != 0) {
3885             // Use a more-precise table
3886             int byte32x32 = ((byte1 & 0x1f) << 5) | (byte2 & 0x1f);
3887             int hiressub = (byte2 & 0x60) >> 5;   // select w/bits 5&6 of byte 2
3888             DCHECK(ue->hires[hiressub] != NULL);
3889             incr += ue->hires[hiressub][byte32x32];
3890           } else {
3891             // Default final offset
3892             incr += ue->so;
3893           }
3894           incr >>= incr_shift;
3895 
3896           incr >>= weightshift;
3897           destatep->enc_prob[rankedencoding] += incr;   // The actual increment
3898 
3899           if (FLAGS_enc_detect_detail2) {
3900             if (watch1_rankedenc == rankedencoding) {watch1_incr = incr;}
3901             if (watch2_rankedenc == rankedencoding) {watch2_incr = incr;}
3902           }
3903         }
3904 
3905 
3906         // If consecutive pair of high bytes, give slight boost to one-byte
3907         // encodings that have a full alphabet in the high bytes
3908         if (next_pair_consec_hi && HighAlphaEncoding(enc)) {
3909           Boost(destatep, rankedencoding, kDeclaredEncBoost >> weightshift);
3910         }
3911       }     // End for j < rankedencoding_list_len
3912 
3913       if (was_bad_pair) {
3914         prior_bad_byte1 = byte1;
3915         prior_bad_byte2 = byte2;
3916       }
3917 
3918       // Fold in per-bigram most likely encoding for first N bigrams
3919       if (next_pair < kBestPairsCount) {
3920         int best_enc = kMostLikelyEncoding[(byte1 << 8) + byte2];
3921         Boost(destatep, best_enc, kBestEncBoost >> weightshift);
3922       }
3923 
3924       // Possibly score 32 trigrams around a bigram to better separate
3925       // Latin1 from Latin2 and Latin7. Especially helpful for detecting
3926       // mis-labelled Hungarian latin2.
3927       // If looking and at bigram 0,8,16,... do full scoring, else just 1 tri
3928       if (destatep->do_latin_trigrams ||
3929           destatep->looking_for_latin_trigrams) {
3930         // If just looking, do full scan every 8 times
3931         // Just look up one trigram the other 7 and do full scan if Latin2,7
3932         bool scan32 = false;
3933         const uint8* trisrc = &destatep->initial_src[offset_byte12 - 1];
3934         if (!destatep->do_latin_trigrams) {
3935           if ((i & 7) == 0 || trisrc + 3 > destatep->limit_src) {
3936             scan32 = true;
3937           } else {
3938             scan32 = (kTriLatin1Likely < TrigramValue(trisrc));
3939           }
3940         }
3941         if (destatep->do_latin_trigrams || scan32) {
3942           // Just score each block of 32 bytes once
3943           int tri_block_offset = offset_byte12 & ~0x1f;
3944           if (destatep->trigram_highwater_mark <= tri_block_offset) {
3945             bool turnon = BoostLatin127Trigrams(tri_block_offset, destatep);
3946             if (FLAGS_counts && !destatep->do_latin_trigrams && turnon) {
3947               ++doing_used;    // First time
3948             }
3949             if (FLAGS_enc_detect_source) {
3950               if (!destatep->do_latin_trigrams && turnon) {
3951                 // First time
3952                 PsHighlight(trisrc, destatep->initial_src, 0, 2);
3953               }
3954             }
3955             destatep->do_latin_trigrams |= turnon;
3956             destatep->trigram_highwater_mark = tri_block_offset + 32;
3957           }
3958         }
3959       }
3960 
3961     }       // end if RepeatedBigram()
3962 
3963     // Keep track of initial byte high 3 bits
3964     ++destatep->byte32_count[byte1 >> 5];
3965 
3966 
3967     // TODO: boost subset/superset also
3968     // Boost(destatep, kRelatedEncoding[best_enc], kBestEncBoost);
3969 
3970     if (destatep->debug_data != NULL) {
3971       // Show detail entry for this bigram
3972       char buff[16];
3973       snprintf(buff, sizeof(buff), "%c%02x%02x%c%c",
3974                pair_used ? ' ' : '[',
3975                byte1,
3976                byte2,
3977                pair_used ? ' ' : ']',
3978                (weightshift == 0) ? ' ' : '-');
3979 
3980       SetDetailsEncProb(destatep,
3981                         destatep->interesting_offsets[OtherPair][next_pair],
3982                         kMostLikelyEncoding[(byte1 << 8) + byte2],
3983                         buff);
3984     }
3985     if (FLAGS_enc_detect_detail2) {
3986       if ((watch1_incr != 0) || (watch2_incr != 0)) {
3987         // Show increment detail for this encoding
3988         char buff[32];
3989         snprintf(buff, sizeof(buff), "%c%d %c%d",
3990                  (watch1_incr < 0) ? '-' : '+', watch1_incr,
3991                  (watch2_incr < 0) ? '-' : '+', watch2_incr);
3992         SetDetailsEncLabel(destatep, buff);
3993       }
3994     }
3995   }       // End for i
3996 
3997 
3998   // If no high bit on, demote all the two-byte codes
3999   // WAS BUG. This was inside the loop above and should be outside
4000   if ((counted_otherpairs > 0) && ((or_byte1 & 0x80) == 0)) {
4001     // No high bit in this group (just 02xx, etc.). Whack 2-byte codes
4002     // This keeps SJS from creeping past Latin1 on illegal C0 bytes
4003     for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
4004       int rankedencoding = destatep->rankedencoding_list[j];
4005       Encoding enc = kMapToEncoding[rankedencoding];
4006       if (TwoByteEncoding(enc)) {
4007         Whack(destatep, rankedencoding, kGentlePairWhack * counted_otherpairs);
4008       }
4009     }
4010   }
4011 
4012 
4013   // BOOST
4014   // ====================
4015   if (AnyActive(destatep)) {
4016     ActiveSpecialBoostWhack(src, destatep);
4017   }
4018 
4019   // Update for next time
4020   destatep->prior_src = src;
4021   destatep->prior_interesting_pair[AsciiPair] =
4022     destatep->next_interesting_pair[AsciiPair];
4023   destatep->prior_interesting_pair[OtherPair] =
4024     destatep->next_interesting_pair[OtherPair];
4025 
4026 
4027   // Do any pre-prune final adjustments
4028   // ====================
4029   if (prunereason == PRUNE_FINAL) {
4030     // If UTF8 not in base state, whack
4031     if (destatep->next_utf8_ministate != 0) {
4032       Whack(destatep, F_UTF8, kGentlePairWhack * 2 * 1);
4033     }
4034     // If UTF8UTF8 not in base state, whack
4035     if (destatep->next_utf8utf8_ministate != 0) {
4036       Whack(destatep, F_UTF8UTF8, kGentlePairWhack * 2 * 1);
4037     }
4038 
4039     // If no valid UTF-8 char ever seen, whack
4040     if (destatep->utf8_minicount[5] == 0) {
4041       Whack(destatep, F_UTF8, kBadPairWhack * 8);           // No sequence
4042       Whack(destatep, F_UTF8UTF8, kBadPairWhack * 8);       // No sequence
4043     }
4044 
4045     // If no valid UTF8UTF8 char ever seen, whack
4046     if (destatep->utf8utf8_minicount[5] == 0) {
4047       Whack(destatep, F_UTF8UTF8, kBadPairWhack * 8);       // No sequence
4048     }
4049 
4050     // If not all four binary quadrants, whack BINARY;
4051     // worth 2 pair if 3 quads, 4 pair if 1 or 2 quads
4052     if (destatep->binary_quadrants_count < 4) {
4053       if (destatep->binary_quadrants_count == 3) {
4054         Whack(destatep, F_BINARY, kBadPairWhack * 2);
4055       } else {
4056         Whack(destatep, F_BINARY, kBadPairWhack * 4);
4057       }
4058     }
4059 
4060     // If 1st pair is 1b24, choose between ISO-2022-xx
4061     //  <esc> $ ) C ISO-2022-KR   [1b 24 29 43]
4062     //  <esc> $ ) A ISO-2022-CN   [1b 24 29 41]
4063     //  <esc> $ ) G ISO-2022-CN   [1b 24 29 47]
4064     //  <esc> $ * H ISO-2022-CN   [1b 24 2a 48]
4065     //  <esc> ( B ISO-2022-JP     [1b 28 42]  to ASCII
4066     //  <esc> ( J ISO-2022-JP     [1b 28 4a]  to X0201
4067     //  <esc> $ @ ISO-2022-JP     [1b 24 40]  to X0208-78 twobyte
4068     //  <esc> $ B ISO-2022-JP     [1b 24 42]  to X0208-83 twobyte
4069     if ((destatep->next_interesting_pair[OtherPair] >= 1) &&
4070         Iso2022Active(destatep)) {
4071       if ((destatep->interesting_pairs[OtherPair][0] == 0x1b) &&
4072           (destatep->interesting_pairs[OtherPair][1] == 0x24)) {
4073         int offset = destatep->interesting_offsets[OtherPair][0];
4074         const uint8* esc_src = destatep->initial_src + offset;
4075         if ((destatep->initial_src + offset) < (destatep->limit_src - 3)) {
4076           if ((esc_src[2] == ')') && (esc_src[3] == 'C')) {
4077             Boost(destatep, F_ISO_2022_KR, kBoostOnePair);
4078             Whack(destatep, F_ISO_2022_CN, kBadPairWhack);
4079             Whack(destatep, F_JIS, kBadPairWhack);
4080           } else if ((esc_src[2] == ')') && ((esc_src[3] == 'A') ||
4081                                              (esc_src[3] == 'G'))) {
4082             Boost(destatep, F_ISO_2022_CN, kBoostOnePair);
4083             Whack(destatep, F_ISO_2022_KR, kBadPairWhack);
4084             Whack(destatep, F_JIS, kBadPairWhack);
4085           } else if ((esc_src[2] == '@') || (esc_src[2] == 'B')) {
4086             Boost(destatep, F_JIS, kBoostOnePair);
4087             Whack(destatep, F_ISO_2022_CN, kBadPairWhack);
4088             Whack(destatep, F_ISO_2022_KR, kBadPairWhack);
4089           }
4090         } else {
4091           // Incomplete escape sequence. Whack them all
4092           Whack(destatep, F_JIS, kBadPairWhack);
4093           Whack(destatep, F_ISO_2022_CN, kBadPairWhack);
4094           Whack(destatep, F_ISO_2022_KR, kBadPairWhack);
4095         }
4096       }
4097     }
4098     if (destatep->debug_data != NULL) {
4099       SetDetailsEncLabel(destatep, "pre-final");
4100     }
4101   }
4102 
4103   // PRUNE
4104   // ====================
4105   // Find current top two rankedencoding probabilities
4106   ReRank(destatep);
4107 
4108   if (prunereason == PRUNE_SLOWEND) {
4109     if (destatep->debug_data != NULL) {
4110       SetDetailsEncLabel(destatep, "slow-end");
4111     }
4112   }
4113 
  // Keep every rankedencoding with probability >= top_prob - prune_difference
4115   int prune_diff = destatep->prune_difference;
4116   // If the top encoding is BINARY, it might be overstated, and we might
4117   // therefore prune away the real encoding. Make the pruning delta
4118   // twice as big.
4119   if (destatep->top_rankedencoding == F_BINARY) {
4120     prune_diff *= 2;
4121   }
4122   int keep_prob = destatep->top_prob - prune_diff;
4123 
4124   // Tighten pruning difference (we start wide) for next time
4125   if (destatep->prune_difference > kFinalPruneDifference) {
4126     int decrement = kPruneDiffDecrement;
4127     // If only ASCII pairs, small tighten; if some non-ASCII, full tighten
4128     if (counted_otherpairs == 0) {
4129       decrement >>= 1;
4130     }
4131     destatep->prune_difference -= decrement;
4132   }
4133 
4134   // Prune the list of active encoding families
4135   destatep->active_special = 0;
4136   int k = 0;
4137   for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
4138     bool keep = true;
4139     int rankedencoding = destatep->rankedencoding_list[j];
4140 
4141     // If count is too low, ditch it
4142     if (destatep->enc_prob[rankedencoding] < keep_prob) {
4143       keep = false;
4144     }
4145 
4146     // If at end of slow section, ditch any 7-bit with zero evidence so far
4147     if ((prunereason == PRUNE_SLOWEND) &&
4148         SevenBitEncoding(kMapToEncoding[rankedencoding]) &&
4149         (destatep->enc_prob[rankedencoding] <= 0) &&
4150         (rankedencoding != destatep->top_rankedencoding)) {
4151       keep = false;
4152     }
4153 
4154     // Keep it. This will always keep at least top_prob rankedencoding
4155     if (keep) {
4156       destatep->active_special |= kSpecialMask[kMapToEncoding[rankedencoding]];
4157       destatep->rankedencoding_list[k++] = rankedencoding;
4158     }
4159   }
4160 
4161   if (destatep->debug_data != NULL) {
4162     char buff[32];
4163     snprintf(buff, sizeof(buff), "%d prune", prune_diff / XLOG2);
4164     SetDetailsEncLabel(destatep, buff);
4165   }
4166   destatep->rankedencoding_list_len = k;
4167 
4168 
4169 
4170   // Force final result in some cases
4171   // Do any post-prune final adjustments
4172   if (prunereason == PRUNE_FINAL) {
4173     // If no high-byte pairs, result is ASCII7, BINARY, UTF7, 2022, or HZ
4174     if (destatep->next_interesting_pair[OtherPair] == 0) {
4175       if ((destatep->top_rankedencoding != F_BINARY) &&
4176           (destatep->top_rankedencoding != F_UTF7) &&
4177           (destatep->top_rankedencoding != F_ISO_2022_CN) &&
4178           (destatep->top_rankedencoding != F_ISO_2022_KR) &&
4179           (destatep->top_rankedencoding != F_JIS) &&
4180           (destatep->top_rankedencoding != F_HZ_GB_2312)) {
4181         destatep->top_rankedencoding = F_ASCII_7_bit;
4182         Boost(destatep, F_ASCII_7_bit, kBoostOnePair * 2);
4183       }
4184     }
4185 
4186     // If some 89 pairs, not ISO_8859_x  and vice versa
4187     if (destatep->byte32_count[4] > 0) {
4188       switch (destatep->top_rankedencoding) {
4189       case F_ASCII:         // ISO-8859-1
4190         destatep->top_rankedencoding = F_CP1252;
4191         // Better: destatep->enc_prob[F_ASCII] <==> destatep->enc_prob[F_CP1252]
4192         Boost(destatep, F_CP1252, kBoostOnePair * 2);
4193         break;
4194       case F_Latin2:        // ISO-8859-2
4195         // Don't swap back; not superset
4196         //destatep->top_rankedencoding = F_CP1250;
4197         //Boost(destatep, F_CP1250, kBoostOnePair * 2);
4198         break;
4199       case F_Arabic:         // ISO-8859-6
4200         destatep->top_rankedencoding = F_CP1256;
4201         Boost(destatep, F_CP1256, kBoostOnePair * 2);
4202         break;
4203       case F_Greek:         // ISO-8859-7
4204         // Don't swap -- not proper superset
4205         // Capital Alpha tonos at 0xB6 in ISO-8859-7, 0xA2 in CP1253
4206         //destatep->top_rankedencoding = F_CP1253;
4207         //Boost(destatep, F_CP1253, kBoostOnePair * 2);
4208         break;
4209       case F_Hebrew:        // ISO-8859-8
4210         // Don't swap -- visual vs. logical
4211         //destatep->top_rankedencoding = F_CP1255;
4212         //Boost(destatep, F_CP1255, kBoostOnePair * 2);
4213         break;
4214       case F_Latin5:        // ISO-8859-9
4215         destatep->top_rankedencoding = F_CP1254;
4216         Boost(destatep, F_CP1254, kBoostOnePair * 2);
4217         break;
4218       case F_ISO_8859_11:   // ISO-8859-11
4219         destatep->top_rankedencoding = F_CP874;
4220         Boost(destatep, F_CP874, kBoostOnePair * 2);
4221         break;
4222       }
4223     } else {
4224       switch (destatep->top_rankedencoding) {
4225       case F_CP1252:        // ISO-8859-1
4226         destatep->top_rankedencoding = F_ASCII;
4227         Boost(destatep, F_ASCII, kBoostOnePair * 2);
4228         break;
4229       case F_CP1250:        // ISO-8859-2
4230         // Don't swap back; not superset
4231         //destatep->top_rankedencoding = F_Latin2;
4232         //Boost(destatep, F_Latin2, kBoostOnePair * 2);
4233         break;
4234       case F_CP1256:        // ISO-8859-6
4235         // Don't swap back -- not proper superset
4236         //destatep->top_rankedencoding = F_Arabic;
4237         //Boost(destatep, F_Arabic, kBoostOnePair * 2);
4238         break;
4239       case F_CP1253:        // ISO-8859-7
4240         // Don't swap back -- not proper superset
4241         //destatep->top_rankedencoding = F_Greek;
4242         //Boost(destatep, F_Greek, kBoostOnePair * 2);
4243         break;
4244       case F_CP1255:        // ISO-8859-8
4245         // Don't swap back -- not proper superset
4246         //destatep->top_rankedencoding = F_Hebrew;
4247         //Boost(destatep, F_Hebrew, kBoostOnePair * 2);
4248         break;
4249       case F_CP1254:        // ISO-8859-9
4250         destatep->top_rankedencoding = F_Latin5;
4251         Boost(destatep, F_Latin5, kBoostOnePair * 2);
4252         break;
4253       case F_CP874:         // ISO-8859-11
4254         destatep->top_rankedencoding = F_ISO_8859_11;
4255         Boost(destatep, F_ISO_8859_11, kBoostOnePair * 2);
4256         break;
4257       }
4258     }
4259 
4260     if (destatep->debug_data != NULL) {
4261       char buff[32];
4262       snprintf(buff, sizeof(buff), "final %d",
4263                static_cast<int>(src - destatep->initial_src));
4264       SetDetailsEncLabel(destatep, buff);
4265 
4266       // Show winning encoding and its delta log base2 from 2nd-best
4267       // Divide delta by XLOG2 to get log base 2
4268       int delta = destatep->top_prob - destatep->second_top_prob;
4269       if (delta < (2 * XLOG2)) {
4270         delta /= XDECILOG2;
4271         snprintf(buff, sizeof(buff), "+%d.%d %s ",
4272                  delta / 10, delta % 10,
4273                  MyEncodingName(kMapToEncoding[destatep->top_rankedencoding]));
4274       } else if (delta < (50 * XLOG2)) {
4275         delta /= XLOG2;
4276         snprintf(buff, sizeof(buff), "+%d %s",
4277                  delta,
4278                  MyEncodingName(kMapToEncoding[destatep->top_rankedencoding]));
4279       } else {
4280         snprintf(buff, sizeof(buff), "%s",
4281                  MyEncodingName(kMapToEncoding[destatep->top_rankedencoding]));
4282       }
4283       SetDetailsEncProbCopyOffset(destatep, destatep->top_rankedencoding, buff);
4284     }
4285   }
4286 
4287 
4288   // FINISH
4289   // ====================
4290   // Eventual encoding result is reliable if big difference in top two, or if
4291   // only Ascii7 ever encountered
4292   // Also reliable if exactly one OtherPair and it's best encoding matches top
4293   destatep->reliable = false;
4294   if (destatep->next_interesting_pair[OtherPair] == 0) {
4295     // Only 7-bit ASCII
4296     destatep->reliable = true;
4297   }
4298   if ((destatep->top_prob - destatep->second_top_prob) >=
4299       FLAGS_ced_reliable_difference) {
4300     destatep->reliable = true;
4301   }
4302   if (destatep->next_interesting_pair[OtherPair] == 1) {
4303     uint8 byte1 = destatep->interesting_pairs[OtherPair][0];
4304     uint8 byte2 = destatep->interesting_pairs[OtherPair][1];
4305     int best_enc = kMostLikelyEncoding[(byte1 << 8) + byte2];
4306     if (best_enc == destatep->top_rankedencoding) {
4307       destatep->reliable = true;
4308     }
4309   }
4310 
4311   // If we pruned to one encoding, we are done
4312   if (destatep->rankedencoding_list_len == 1) {
4313     destatep->reliable = true;
4314     destatep->done = true;
4315   }
4316 
4317   // If we pruned to two or three encodings in the same *superset/subset
4318   // rankedencoding*  and enough pairs, we are done. Else keep going
4319   if (destatep->rankedencoding_list_len == 2) {
4320     Encoding enc0 = kMapToEncoding[destatep->rankedencoding_list[0]];
4321     Encoding enc1 = kMapToEncoding[destatep->rankedencoding_list[1]];
4322     if (kMapEncToBaseEncoding[enc0] == kMapEncToBaseEncoding[enc1]) {
4323       if (destatep->prune_count >= 3) {
4324         destatep->reliable = true;
4325         destatep->done = true;
4326       }
4327     }
4328   } else if (destatep->rankedencoding_list_len == 3) {
4329     Encoding enc0 = kMapToEncoding[destatep->rankedencoding_list[0]];
4330     Encoding enc1 = kMapToEncoding[destatep->rankedencoding_list[1]];
4331     Encoding enc2 = kMapToEncoding[destatep->rankedencoding_list[2]];
4332     Encoding base0 = kMapEncToBaseEncoding[enc0];
4333     Encoding base1 = kMapEncToBaseEncoding[enc1];
4334     Encoding base2 = kMapEncToBaseEncoding[enc2];
4335 
4336     if ((base0 == base1) && (base0 == base2)) {
4337       if (destatep->prune_count >= 3) {
4338         destatep->reliable = true;
4339         destatep->done = true;
4340       }
4341     }
4342   }
4343 }
4344 
4345 
4346 // Accumulate aligned byte-pair at src
4347 // Occasionally, calc boost for some encodings and then prune the active list
4348 // weightshift is used to give low weight some text, such as inside tags
4349 // Returns true if pruning occurred
IncrementAndBoostPrune(const uint8 * src,int remaining_length,DetectEncodingState * destatep,int weightshift,int exit_reason)4350 bool IncrementAndBoostPrune(const uint8* src,
4351                             int remaining_length,
4352                             DetectEncodingState* destatep,
4353                             int weightshift,
4354                             int exit_reason) {
4355   destatep->last_pair = src;
4356   // Pick up byte pair, or very last byte plus 0x20
4357   uint8 byte1 = src[0];
4358   uint8 byte2 = 0x20;
4359   if (1 < remaining_length) {byte2 = src[1];}
4360 
4361   // whatset=0 for Ascii + ~, 1 for all others; see kTestPrintableAsciiTildePlus
4362   int whatset = exit_reason - 1;
4363   int next_pair = destatep->next_interesting_pair[whatset];
4364 
4365   if (next_pair > 16) {
4366     // If not clear by 16 bigrams, stop accumulating + ~ 00
4367     if (byte1 == '+') {return false;}
4368     if (byte1 == '~') {return false;}
4369     if (byte1 == 0x00) {return false;}
4370   }
4371 
4372   // Remember pair in appropriate list
4373   if (next_pair >= kMaxPairs) {
4374     // We have filled up our alloted space for interesting pairs with no
4375     // decision. If ASCII pairs full, just skip until end of slow loop; if
4376     // non-Ascii pairs full, force done
4377     if (whatset == OtherPair) {
4378       destatep->done = true;
4379     }
4380   } else {
4381     int offset = static_cast<int>(src - destatep->initial_src);
4382     destatep->interesting_pairs[whatset][next_pair * 2 + 0] = byte1;
4383     destatep->interesting_pairs[whatset][next_pair * 2 + 1] = byte2;
4384     destatep->interesting_offsets[whatset][next_pair] = offset;
4385     destatep->interesting_weightshift[whatset][next_pair] = weightshift;
4386     ++destatep->next_interesting_pair[whatset];
4387     ++next_pair;
4388   }
4389 
4390   // Prune now and then , but always if forced to be done
4391   if (destatep->done || ((next_pair & kPruneMask) == 0)) {  // Prune every M
4392     BoostPrune(src + 2, destatep, PRUNE_NORMAL);  // src+2 first unscanned byte
4393                                                   // may be off end of input
4394     return true;
4395   }
4396   return false;
4397 }
4398 
DumpSummary(DetectEncodingState * destatep,int whatset,int n)4399 void DumpSummary(DetectEncodingState* destatep, int whatset, int n) {
4400   printf("  %sSummary[%2d]: ", kWhatSetName[whatset],
4401          destatep->next_interesting_pair[whatset]);
4402   int limit = minint(n, destatep->next_interesting_pair[whatset]);
4403   for (int i = 0; i < limit; ++i) {
4404     printf("%02x%02x ",
4405            destatep->interesting_pairs[whatset][i * 2 + 0],
4406            destatep->interesting_pairs[whatset][i * 2 + 1]);
4407     if ((i & 7) == 7) {printf("  ");}
4408   }
4409   printf("\n");
4410 }
4411 
BeginDetail(DetectEncodingState * destatep)4412 void BeginDetail(DetectEncodingState* destatep) {
4413   fprintf(stderr, "%d [", NUM_RANKEDENCODING);
4414   for (int e = 0; e < NUM_RANKEDENCODING; ++e) {
4415     fprintf(stderr, "(%s)",  MyRankedEncName(e));
4416     if ((e % 10) == 9) {fprintf(stderr, "\n    ");}
4417   }
4418   fprintf(stderr, "] size-detail\n");
4419   destatep->next_detail_entry = 0;
4420 }
4421 
// Single (printable ASCII) character standing for the gap between bigrams:
//   0 or more than 31 -> ' '
//   16..31            -> '+'
//   3..15             -> '_'
//   anything else, i.e. 1, 2 and negative values -> '='
char DetailOffsetChar(int delta) {
  if ((delta == 0) || (delta > 31)) {
    return ' ';
  }
  if (delta > 15) {
    return '+';
  }
  return (delta > 2) ? '_' : '=';
}
4430 
DumpDetail(DetectEncodingState * destatep)4431 void DumpDetail(DetectEncodingState* destatep) {
4432   // Turn all counts into delta from previous entry
4433   fprintf(stderr, "%d count-detail\n", destatep->next_detail_entry);
4434   // Rewrite, recording deltas
4435   for (int z = destatep->next_detail_entry - 1; z > 0; --z) {
4436     destatep->debug_data[z].offset -= destatep->debug_data[z - 1].offset;
4437     for (int e = 0; e < NUM_RANKEDENCODING; ++e) {
4438       destatep->debug_data[z].detail_enc_prob[e] -=
4439         destatep->debug_data[z - 1].detail_enc_prob[e];
4440     }
4441   }
4442   // Now print
4443   for (int z = 0; z < destatep->next_detail_entry; ++z) {
4444     // Highlight some entries ending in '!' with light red underbar
4445     int len = destatep->debug_data[z].label.size();
4446     if (destatep->debug_data[z].label[len - 1] == '!') {
4447       fprintf(stderr, "1 0.9 0.9 do-flag\n");
4448     }
4449     fprintf(stderr, "(%c%s) %d [",
4450             DetailOffsetChar(destatep->debug_data[z].offset),
4451             destatep->debug_data[z].label.c_str(),
4452             destatep->debug_data[z].best_enc);
4453     for (int e = 0; e < NUM_RANKEDENCODING; ++e) {
4454       fprintf(stderr, "%d ", destatep->debug_data[z].detail_enc_prob[e]);
4455       if ((e % 10) == 9) {fprintf(stderr, "  ");}
4456     }
4457     fprintf(stderr, "] do-detail-e\n");
4458   }
4459   // Get ready for next time,if any
4460   destatep->next_detail_entry = 0;
4461 }
4462 
// Write to stderr the marker line that ends the current debug detail
// section and starts a new one labeled with buff, followed by a blank line.
void PsRecurse(const char* buff) {
  fprintf(stderr,
          "() end-detail (%s) start-detail\n\n",
          buff);
}
4466 
DumpReliable(DetectEncodingState * destatep)4467 void DumpReliable(DetectEncodingState* destatep) {
4468   printf("Not reliable: ");
4469 
4470   // Find center of gravity of OtherPair list
4471   int x_sum = 0;
4472   int y_sum = 0;
4473   int count = destatep->next_interesting_pair[OtherPair];
4474   for (int i = 0; i < count; ++i) {
4475     uint8 byte1 = destatep->interesting_pairs[OtherPair][i * 2 + 0];
4476     uint8 byte2 = destatep->interesting_pairs[OtherPair][i * 2 + 1];
4477     x_sum += byte2;
4478     y_sum += byte1;
4479   }
4480   if (count == 0) {count = 1;}    // adoid zdiv
4481   int x_bar = x_sum / count;
4482   int y_bar = y_sum / count;
4483   printf("center %02X,%02X\n", x_bar, y_bar);
4484 
4485   double closest_dist = 999.0;
4486   int closest = 0;
4487   for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
4488     int rankedencoding = destatep->rankedencoding_list[j];
4489     const UnigramEntry* ue = &unigram_table[rankedencoding];
4490     printf("  %8s = %4d at %02x,%02x +/- %02X,%02X ",
4491            MyEncodingName(kMapToEncoding[rankedencoding]),
4492            destatep->enc_prob[rankedencoding],
4493            ue->x_bar, ue->y_bar,
4494            ue->x_stddev, ue->y_stddev);
4495     double x_diff = x_bar - ue->x_bar;
4496     double y_diff = y_bar - ue->y_bar;
4497     double dist = sqrt((x_diff * x_diff) + (y_diff * y_diff));
4498     printf("(%3.1f)\n", dist);
4499 
4500     if (closest_dist > dist) {
4501       closest_dist = dist;
4502       closest = rankedencoding;
4503     }
4504   }
4505   printf("Closest=%s (%3.1f)\n",
4506          MyEncodingName(kMapToEncoding[closest]), closest_dist);
4507 
4508   for (int i = 0; i < 8; ++i) {
4509     // Demote by distance to CG and see if that helps, or just quit
4510   }
4511 }
4512 
4513 // Scan short single lines quickly for all printable ASCII
4514 // Return true if all bytes are in [20..7F], false otherwise
QuickPrintableAsciiScan(const char * text,int text_length)4515 bool QuickPrintableAsciiScan(const char* text, int text_length) {
4516   const uint8* src = reinterpret_cast<const uint8*>(text);
4517   const uint8* srclimit = src + text_length;
4518   const uint8* srclimit8 = srclimit - 7;
4519   while (src < srclimit8) {
4520     // Exits on any byte outside [0x20..0x7E] range (HT LF CR exit)
4521     uint8 mask = 0;
4522     for (int i = 0; i < 8; ++i) mask |= (src[i]-0x20)|(src[i]+0x01);
4523     if ((mask & 0x80) != 0) break;
4524     src += 8;
4525   }
4526   while (src < srclimit) {
4527     uint8 uc = *src++;
4528     if (kIsPrintableAscii[uc] == 0) {return false;}
4529   }
4530   return true;
4531 }
4532 
4533 static const int kMaxScanBack = 192;
4534 
4535 // Return true if text is inside a tag or JS comment
TextInsideTag(const uint8 * isrc,const uint8 * src,const uint8 * srclimit)4536 bool TextInsideTag(const uint8* isrc, const uint8* src, const uint8* srclimit) {
4537   const uint8* srcbacklimit = src - kMaxScanBack;
4538   if (srcbacklimit < isrc) {
4539     srcbacklimit = isrc;
4540   }
4541   const uint8* ss = src - 1;
4542   while (srcbacklimit <= ss) {
4543     uint8 c = *ss--;
4544     if ((c & ~0x02) == '<') {
4545       // We found preceding < 3C or > 3E nearby
4546       // Even cheaper: if inside a tag, we don't care what tag; return true
4547       if (c == '<') {
4548         return true;
4549       }
4550       // See if we are just after <title>...
4551       if ((c == '>') && (isrc <= (ss - 5)) &&
4552           (ss[-5] == '<') &&
4553           ((ss[-4] | 0x20) == 't') &&
4554           ((ss[-3] | 0x20) == 'i') &&
4555           ((ss[-2] | 0x20) == 't') &&
4556           ((ss[-1] | 0x20) == 'l') &&
4557           ((ss[-0] | 0x20) == 'e')) {
4558         return true;
4559       }
4560       // See if we are just after <SCRIPT language=javascript>...
4561       if ((c == '>') && (isrc <= (ss - 5)) &&
4562           (ss[-5] == 's') &&
4563           ((ss[-4] | 0x20) == 'c') &&
4564           ((ss[-3] | 0x20) == 'r') &&
4565           ((ss[-2] | 0x20) == 'i') &&
4566           ((ss[-1] | 0x20) == 'p') &&
4567           ((ss[-0] | 0x20) == 't')) {
4568         return true;
4569       }
4570       // Not in a tag
4571       return false;
4572     // See if we are just after JavaScript comment /* ...
4573     } else if (c == '/') {
4574       if (((ss + 2) < srclimit) && (ss[2] == '*')) {
4575         // We backscanned to /*
4576         return true;
4577       }
4578     }
4579   }
4580 
4581   return false;
4582 }
4583 
// Starting with the byte after src, look for the next '<' or '>' and return
// a pointer to the byte just past it. Bytes up to and including srclimit are
// examined. If no such byte is found, return src + 2.
const uint8* SkipToTagEnd(const uint8* src, const uint8* srclimit) {
  for (const uint8* scan = src + 1; scan <= srclimit; ++scan) {
    const uint8 c = *scan;
    if ((c == '<') || (c == '>')) {
      return scan + 1;
    }
  }
  // Always make progress, otherwise we get an infinite loop
  return src + 2;
}
4594 
4595 
4596 // Take a watch string and map to a ranked encoding. If no match, return -1
LookupWatchEnc(const string & watch_str)4597 int LookupWatchEnc(const string& watch_str) {
4598   int watchval = -1;
4599   // Mixed encoding maps to enc=UTF8UTF8
4600   if (watch_str == "UTF8UTF8") {
4601     watchval = F_UTF8UTF8;
4602   } else {
4603     Encoding enc;
4604     if (EncodingFromName(watch_str.c_str(), &enc)) {
4605       watchval = CompactEncDet::BackmapEncodingToRankedEncoding(enc);
4606     }
4607   }
4608   return watchval;
4609 }
4610 
4611 // Return true if enc and enc2 are equal or one is a subset of the other
4612 // or either is UNKNOWN
4613 // also UTF8UTF8 is compatible with both Latin1 and UTF8
CompatibleEnc(Encoding enc,Encoding enc2)4614 bool CompatibleEnc(Encoding enc, Encoding enc2) {
4615   if (enc < 0) {return false;}
4616   if (NUM_ENCODINGS <= enc) {return false;}
4617   if (enc2 < 0) {return false;}
4618   if (NUM_ENCODINGS <= enc2) {return false;}
4619   if (enc == enc2) {return true;}
4620   if (kMapEncToBaseEncoding[enc] == kMapEncToBaseEncoding[enc2]) {return true;}
4621 
4622   if (enc == ASCII_7BIT) {return true;}
4623   if (enc2 == ASCII_7BIT) {return true;}
4624   if (enc == UNKNOWN_ENCODING) {return true;}
4625   if (enc2 == UNKNOWN_ENCODING) {return true;}
4626   if (enc == UTF8UTF8) {
4627     if (enc2 == UTF8) {return true;}
4628     if (kMapEncToBaseEncoding[enc2] == ISO_8859_1) {return true;}
4629   }
4630   if (enc2 == UTF8UTF8) {
4631     if (enc == UTF8) {return true;}
4632     if (kMapEncToBaseEncoding[enc] == ISO_8859_1) {return true;}
4633   }
4634 
4635   return false;
4636 }
4637 
4638 // Return superset of enc and enc2, which must be compatible
SupersetEnc(Encoding enc,Encoding enc2)4639 Encoding SupersetEnc(Encoding enc, Encoding enc2) {
4640   //printf("  SupersetEnc (%s, ", MyEncodingName(enc)); // TEMP
4641   //printf("%s) ", MyEncodingName(enc2));
4642   //printf("= %s\n",
4643   //       MyEncodingName(kMapEncToSuperLevel[enc] >= kMapEncToSuperLevel[enc2] ?
4644   //                      enc :enc2));
4645   if (kMapEncToSuperLevel[enc] >= kMapEncToSuperLevel[enc2]) {
4646     return enc;
4647   }
4648   return enc2;
4649 }
4650 
4651 
4652 // If unreliable, try rescoring to separate some encodings
Rescore(Encoding enc,const uint8 * isrc,const uint8 * srctextlimit,DetectEncodingState * destatep)4653 Encoding Rescore(Encoding enc, const uint8* isrc,
4654                  const uint8* srctextlimit, DetectEncodingState* destatep) {
4655   if (FLAGS_counts) {++rescore_used;}
4656   Encoding new_enc = enc;
4657 
4658   bool rescore_change = false;
4659 
4660   int count = destatep->next_interesting_pair[OtherPair];
4661   int text_length = srctextlimit - isrc;
4662   for (int i = 0; i < count; ++i) {
4663     int bigram_offset = destatep->interesting_offsets[OtherPair][i];
4664     uint8 byte0 = (0 < bigram_offset) ?
4665         isrc[bigram_offset - 1] : 0x20;
4666     uint8 byte1 = isrc[bigram_offset + 0];  // Known to have high bit on
4667     uint8 byte2 = ((bigram_offset + 1) < text_length) ?
4668         isrc[bigram_offset + 1] : 0x20;
4669     uint8 byte3 = ((bigram_offset + 2) < text_length) ?
4670         isrc[bigram_offset + 2] : 0x20;
4671     int high_hash = ((byte0 & 0xc0) >> 0) |
4672                     ((byte1 & 0xc0) >> 1) |
4673                     ((byte2 & 0xc0) >> 4) |
4674                     ((byte3 & 0xc0) >> 6);    // 00112233
4675 
4676     // Boost HighAccent encodings for Ascii bit patterns
4677     //  0x1x  0x0x
4678     //  1010  1010
4679     //  0010  0000
4680     //
4681     if ((high_hash & 0xaa) == 0x20) {
4682       for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
4683         int rankedencoding = destatep->rankedencoding_list[j];
4684         if (HighAccentEncoding(kMapToEncoding[rankedencoding])) {
4685           // TODO: also want to boost Shift-JIS here if byte1 is Ax..Dx
4686           // TEMP
4687           //printf("  Rescore[%02x] %s +%d\n",
4688           //       high_hash, MyRankedEncName(rankedencoding), kGentlePairBoost);
4689           Boost(destatep, rankedencoding, kGentlePairBoost);
4690           rescore_change = true;
4691         }
4692       }
4693     }
4694 
4695     // Whack HighAccent encodings for high bit patterns
4696     //  1x1x  1x1x
4697     //  1010  1010
4698     //  1010  1010
4699     //
4700     if ((high_hash & 0xaa) == 0xaa) {
4701       for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
4702         int rankedencoding = destatep->rankedencoding_list[j];
4703         if (HighAccentEncoding(kMapToEncoding[rankedencoding])) {
4704           // TEMP
4705           //printf("  Rescore[%02x] %s -%d\n",
4706           //       high_hash, MyRankedEncName(rankedencoding), kGentlePairBoost);
4707           Whack(destatep, rankedencoding, kGentlePairBoost);
4708           rescore_change = true;
4709         }
4710       }
4711     }
4712 
4713   }
4714 
4715   if (rescore_change) {
4716     ReRank(destatep);
4717     new_enc = kMapToEncoding[destatep->top_rankedencoding];
4718 
4719     if (destatep->debug_data != NULL) {
4720       char buff[32];
4721       snprintf(buff, sizeof(buff), "=Rescore %s", MyEncodingName(new_enc));
4722       SetDetailsEncProb(destatep,
4723                         0,
4724                         CompactEncDet::BackmapEncodingToRankedEncoding(new_enc),
4725                         buff);
4726       //// DumpDetail(destatep);
4727     }
4728 
4729     SimplePrune(destatep, kFinalPruneDifference);
4730     CalcReliable(destatep);
4731   }
4732 
4733   //if (new_enc != enc) {
4734   //  // TEMP
4735   //  printf("  Rescore new top encoding = %s\n",
4736   //         MyRankedEncName(destatep->top_rankedencoding));
4737   //}
4738 
4739   return new_enc;
4740 }
4741 
4742 
4743 // Given an encoding, add its corresponding ranked encoding to the set
AddToSet(Encoding enc,int * list_len,int * list)4744 void AddToSet(Encoding enc, int* list_len, int* list) {
4745   // TEMP print
4746   int item = CompactEncDet::BackmapEncodingToRankedEncoding(enc);
4747   for (int i = 0; i < *list_len; ++i) {
4748     if (list[i] == item) {
4749       return;                 // Already in the set; don't add again
4750     }
4751   }
4752   list[(*list_len)++] = item;
4753 }
4754 
4755 
// Tuning limits for RobustScan().
// Bigram count that must be exceeded before the scan may stop early
static constexpr int kMinRobustBigramCount = 1000;
// KB of input that must be passed before the scan may stop early
static constexpr int kMinKBToRobustScan = 64;
// KB of input beyond which the scan never looks
static constexpr int kMaxKBToRobustScan = 256;
4759 
4760 // Scan the first 64K or so, just doing raw bigram increments on given
4761 // probability list.
4762 // No fancy duplicate filtering or anything else here.
4763 // Returns number of bigrams counted
RobustScan(const char * text,int text_length,int robust_renc_list_len,int * robust_renc_list,int * robust_renc_probs)4764 int RobustScan(const char* text,
4765                 int text_length,
4766                 int robust_renc_list_len,
4767                 int* robust_renc_list,
4768                 int* robust_renc_probs) {
4769   if (FLAGS_counts) {++robust_used;}
4770   // Zero all the result probabilities
4771   for (int i = 0; i < robust_renc_list_len; ++i) {
4772     robust_renc_probs[i] = 0;
4773   }
4774   int max_fast_len = minint(text_length, (kMaxKBToRobustScan << 10));
4775   const uint8* isrc = reinterpret_cast<const uint8*>(text);
4776   const uint8* src = isrc;
4777   const uint8* srclimitfast2 = isrc + max_fast_len - 1;
4778   const uint8* srclimitfast4 = isrc + max_fast_len - 3;
4779 
4780   int min_fast_len = minint(text_length, (kMinKBToRobustScan << 10));
4781   const uint8* srclimitmin = isrc + min_fast_len - 1;
4782 
4783   int bigram_count = 0;
4784 
4785   if (FLAGS_enc_detect_source) {
4786     PsSourceInit(kPsSourceWidth);
4787     fprintf(stderr, "(RobustScan) do-src\n");
4788   }
4789 
4790   // Sum over a big chunk of the input
4791   // Faster loop, no 7-bit-encodings possible, approx 3000 GB/sec
4792   //====================================
4793   while (src < srclimitfast2) {
4794     // Skip to next interesting bigram
4795 
4796     while (src < srclimitfast4) {
4797       if (((src[0] | src[1] | src[2] | src[3]) & 0x80) != 0) break;
4798       src += 4;
4799     }
4800 
4801     while (src < srclimitfast2) {
4802       if ((src[0] & 0x80) != 0) break;
4803       src++;
4804     }
4805 
4806     if (src < srclimitfast2) {
4807       // We found a bigram with high bit on
4808       // Next 5 lines commented out so we don't show all the source.
4809       //const uint8* srctextlimit = isrc + text_length;
4810       //if (FLAGS_enc_detect_source) {
4811       //  PsSource(src, isrc, srctextlimit);
4812       //  PsMark(src, 2, isrc, 0);
4813       //}
4814 
4815       uint8 byte1 = src[0];
4816       uint8 byte2 = src[1];
4817       uint8 byte1x2x = (byte1 & 0xf0) | ((byte2 >> 4) & 0x0f);
4818       uint8 byte1f = byte1;
4819       // Flip top bit of subscript to better separate quadrant 4 (esp. for Hebrew)
4820       byte1f ^= (byte2 & 0x80);
4821 
4822       // The real increments
4823       for (int j = 0; j < robust_renc_list_len; ++j) {
4824         int rankedencoding = robust_renc_list[j];
4825         const UnigramEntry* ue = &unigram_table[rankedencoding];
4826         int incr = ue->b1[byte1f] + ue->b2[byte2] + ue->b12[byte1x2x];
4827         if ((ue->b12[byte1x2x] & 0x01) != 0) {
4828           // Use a more-precise table
4829           int byte32x32 = ((byte1 & 0x1f) << 5) | (byte2 & 0x1f);
4830           int hiressub = (byte2 & 0x60) >> 5;   // select w/bits 5&6 of byte 2
4831           DCHECK(ue->hires[hiressub] != NULL);
4832           incr += ue->hires[hiressub][byte32x32];
4833         } else {
4834           // Default final offset
4835           incr += ue->so;
4836         }
4837         robust_renc_probs[j] += incr;
4838       }
4839 
4840       src += 2;       // Continue after this bigram
4841       ++bigram_count;
4842 
4843       // Stop after 1000 bigrams reached, if at least 64KB scanned
4844       if ((bigram_count > kMinRobustBigramCount) && (src > srclimitmin)) {
4845         break;
4846       }
4847 
4848     }
4849   }
4850 
4851   if (FLAGS_enc_detect_source) {
4852     fprintf(stderr, "(  bigram_count = %d) do-src\n", bigram_count);
4853     if (bigram_count == 0) {bigram_count = 1;}    // zdiv
4854     for (int i = 0; i < robust_renc_list_len; ++i) {
4855       fprintf(stderr, "(  enc[%-12.12s] = %7d (avg %d)) do-src\n",
4856               MyRankedEncName(robust_renc_list[i]), robust_renc_probs[i],
4857               robust_renc_probs[i] / bigram_count);
4858     }
4859     PsSourceFinish();
4860   }
4861 
4862   return bigram_count;
4863 }
4864 
4865 // If unreliable, rescan middle of document to see if we can get a better
4866 // answer. Rescan is only worthwhile if there are ~200 bytes or more left,
4867 // since the detector takes as much as 96 bytes of bigrams to decide.
Rescan(Encoding enc,const uint8 * isrc,const uint8 * src,const uint8 * srctextlimit,const char * url_hint,const char * http_charset_hint,const char * meta_charset_hint,const int encoding_hint,const Language language_hint,const CompactEncDet::TextCorpusType corpus_type,bool ignore_7bit_mail_encodings,DetectEncodingState * destatep)4868 Encoding Rescan(Encoding enc,
4869                 const uint8* isrc,
4870                 const uint8* src,
4871                 const uint8* srctextlimit,
4872                 const char* url_hint,
4873                 const char* http_charset_hint,
4874                 const char* meta_charset_hint,
4875                 const int encoding_hint,
4876                 const Language language_hint,
4877                 const CompactEncDet::TextCorpusType corpus_type,
4878                 bool ignore_7bit_mail_encodings,
4879                 DetectEncodingState* destatep) {
4880   bool enc_is_reliable = destatep->reliable;
4881   Encoding new_enc = enc;
4882   Encoding second_best_enc =
4883     kMapToEncoding[destatep->second_top_rankedencoding];
4884 
4885   if (FLAGS_counts) {++rescan_used;}
4886 
4887   int scanned_bytes = src - isrc;
4888   int unscanned_bytes = srctextlimit - src;
4889   int text_length = srctextlimit - isrc;
4890   bool empty_rescan = true;
4891 
4892   // See if enough bytes left to bother doing rescan
4893   if (kMinRescanLength < unscanned_bytes) {
4894     const char* text = reinterpret_cast<const char*>(isrc);
4895 
4896     Encoding one_hint = destatep->http_hint;
4897     if ((one_hint == UNKNOWN_ENCODING) &&
4898         (destatep->meta_hint != UNKNOWN_ENCODING)) {
4899       one_hint = destatep->meta_hint;
4900     }
4901     if ((one_hint == UNKNOWN_ENCODING) &&
4902         (destatep->bom_hint != UNKNOWN_ENCODING)) {
4903       one_hint = destatep->bom_hint;
4904     }
4905 
4906     // Go to an even offset to keep UTF-16 in synch
4907     int middle_offset = (scanned_bytes + (unscanned_bytes / 2)) & ~1;
4908     CHECK(middle_offset <= text_length);
4909 
4910     // Look back a bit for a low byte to synchronize, else hope for the best.
4911     const uint8* srcbacklimit = isrc + middle_offset - kMaxScanBack;
4912     if (srcbacklimit < src) {
4913       srcbacklimit = src;
4914     }
4915     const uint8* ss = isrc + middle_offset - 1;
4916     while (srcbacklimit <= ss) {
4917       if ((*ss & 0x80) == 0) {break;}
4918       --ss;
4919     }
4920     // Leave middle offset unchanged unless we found a low byte
4921     if (srcbacklimit <= ss) {
4922       // Align to low byte or high byte just after it, whichever is even
4923       middle_offset = (ss - isrc + 1) & ~1;     // Even to keep UTF-16 in sync
4924     }
4925     CHECK(middle_offset <= text_length);
4926 
4927     if (destatep->debug_data != NULL) {
4928       SetDetailsEncLabel(destatep, ">> Rescan");
4929       // Print the current chart before recursive call
4930       DumpDetail(destatep);
4931 
4932       char buff[32];
4933       snprintf(buff, sizeof(buff), ">> Rescan[%d..%d]",
4934                middle_offset, text_length);
4935       PsRecurse(buff);
4936     }
4937 
4938     int mid_bytes_consumed;
4939     bool mid_is_reliable;
4940     Encoding mid_second_best_enc;
4941     CEDInternalFlags newflags = static_cast<CEDInternalFlags>(
4942       kCEDRescanning + kCEDForceTags);
4943     // Recursive call for rescan of half of remaining
4944     Encoding mid_enc = InternalDetectEncoding(
4945                              newflags,
4946                              text + middle_offset,
4947                              text_length - middle_offset,
4948                              url_hint,
4949                              http_charset_hint,
4950                              meta_charset_hint,
4951                              encoding_hint,
4952                              language_hint,   // User interface lang
4953                              corpus_type,
4954                              ignore_7bit_mail_encodings,
4955                              &mid_bytes_consumed,
4956                              &mid_is_reliable,
4957                              &mid_second_best_enc);
4958     destatep->reliable = mid_is_reliable;
4959 
4960     empty_rescan = (mid_enc == ASCII_7BIT);
4961 
4962     // Not the right decision if, e.g. enc=Greek, mid=ASCII7, one=KSC
4963     // hence the !empty_rescan term
4964     if (!empty_rescan && CompatibleEnc(one_hint, mid_enc)) {
4965       // Encoding we just found is compatible with the
4966       // single hint (if any); return superset
4967       new_enc = SupersetEnc(one_hint, mid_enc);
4968     }
4969 
4970     // If original and mid are compatible, and both reliable,
4971     // return new_enc = SupersetEnc(enc, mid_enc)
4972     //
4973     // This avoids too much weight on a bogus hint causing a RobustScan
4974     // that gets the wrong answer
4975     if (!empty_rescan && mid_is_reliable && enc_is_reliable &&
4976         CompatibleEnc(enc, mid_enc)) {
4977       new_enc = SupersetEnc(enc, mid_enc);
4978       return new_enc;
4979     }
4980 
4981     // if mid unreliable, robustscan
4982     // if mid empty, robustscan
4983     // if original and mid not compatible, robustscan
4984     // if mid and one_hint not compatible, robustscan
4985 
4986     // If we found conflicting data, drop back and do a robust scan of a big
4987     // chunk of the input over a set of candidate encodings
4988     //
4989     if (!mid_is_reliable ||
4990         empty_rescan ||
4991         !CompatibleEnc(enc, mid_enc) ||
4992         !CompatibleEnc(one_hint, mid_enc)) {
4993       int robust_renc_list_len;         // Number of active encodings
4994       int robust_renc_list[NUM_RANKEDENCODING];   // List of ranked encodings
4995       int robust_renc_probs[NUM_RANKEDENCODING];  // List of matching probs
4996 
4997       robust_renc_list_len = 0;
4998       AddToSet(enc, &robust_renc_list_len, robust_renc_list);
4999       AddToSet(second_best_enc, &robust_renc_list_len, robust_renc_list);
5000       AddToSet(mid_enc, &robust_renc_list_len, robust_renc_list);
5001       AddToSet(mid_second_best_enc, &robust_renc_list_len, robust_renc_list);
5002       if (destatep->http_hint != UNKNOWN_ENCODING) {
5003         AddToSet(destatep->http_hint, &robust_renc_list_len, robust_renc_list);
5004       }
5005       if (destatep->meta_hint != UNKNOWN_ENCODING) {
5006         AddToSet(destatep->meta_hint, &robust_renc_list_len, robust_renc_list);
5007       }
5008       if (destatep->bom_hint != UNKNOWN_ENCODING) {
5009         AddToSet(destatep->bom_hint, &robust_renc_list_len, robust_renc_list);
5010       }
5011       if (destatep->tld_hint != UNKNOWN_ENCODING) {
5012         AddToSet(destatep->tld_hint, &robust_renc_list_len, robust_renc_list);
5013       }
5014 
5015       // Separate simple scan
5016       // =====================
5017       if (destatep->debug_data != NULL) {
5018         SetDetailsEncLabel(destatep, ">> RobustScan");
5019         // Print the current chart before recursive call
5020         DumpDetail(destatep);
5021 
5022         char buff[32];
5023         snprintf(buff, sizeof(buff), ">> RobustScan[0..%d]", text_length);
5024         PsRecurse(buff);
5025       }
5026 
5027       int bigram_count = RobustScan(text, text_length,
5028                  robust_renc_list_len, robust_renc_list, robust_renc_probs);
5029 
5030       // Default to new_enc and update if something better was found
5031       int best_prob = -1;
5032       // TEMP print
5033       for (int i = 0; i < robust_renc_list_len; ++i) {
5034         if (best_prob < robust_renc_probs[i]) {
5035           best_prob = robust_renc_probs[i];
5036           new_enc = kMapToEncoding[robust_renc_list[i]];
5037         }
5038       }
5039 
5040       if (destatep->debug_data != NULL) {
5041         char buff[32];
5042         snprintf(buff, sizeof(buff), "=Robust[%d] %s",
5043                  bigram_count, MyEncodingName(new_enc));
5044         SetDetailsEncProb(destatep,
5045                           0,
5046                           CompactEncDet::BackmapEncodingToRankedEncoding(new_enc),
5047                           buff);
5048       }
5049     }
5050   }     // End if enough bytes
5051 
5052   return new_enc;
5053 }
5054 
5055 // With no hints at all, and perhaps on rescan, we relax our pickiness
5056 // and go ahead and accept the top multibyte encodings, even though
5057 // strictly their web pages should have declared an explicit encoding to
5058 // avoid the HTML standard's default ISO-8859-1.
NoHintsCloseEnoughCompatible(Encoding top_enc)5059 bool NoHintsCloseEnoughCompatible(Encoding top_enc) {
5060   // First test accepts degenerate cases plus UTF8 and UTF8UTF8
5061   if (CompatibleEnc(UTF8, top_enc)) {return true;}
5062 
5063   // The rest look for exact match of base encoding
5064   Encoding base_enc = kMapEncToBaseEncoding[top_enc];
5065   if (base_enc == JAPANESE_EUC_JP) {return true;}
5066   if (base_enc == JAPANESE_SHIFT_JIS) {return true;}
5067   if (base_enc == CHINESE_BIG5) {return true;}
5068   if (base_enc == CHINESE_GB) {return true;}
5069   if (base_enc == KOREAN_EUC_KR) {return true;}
5070   return false;
5071 }
5072 
5073 
5074 
5075 // Scan raw bytes and detect most likely encoding
5076 // Design goals:
5077 //   Skip over big initial stretches of seven-bit ASCII bytes very quickly
5078 //   Thread safe
5079 //   Works equally well on
5080 //    50-byte queries,
5081 //    5000-byte email and
5082 //    50000-byte web pages
5083 // Length 0 input returns ISO_8859_1 (ASCII) encoding
5084 // Setting ignore_7bit_mail_encodings effectively turns off detection of
5085 //  UTF-7, HZ, and ISO-2022-xx
InternalDetectEncoding(CEDInternalFlags flags,const char * text,int text_length,const char * url_hint,const char * http_charset_hint,const char * meta_charset_hint,const int encoding_hint,const Language language_hint,const CompactEncDet::TextCorpusType corpus_type,bool ignore_7bit_mail_encodings,int * bytes_consumed,bool * is_reliable,Encoding * second_best_enc)5086 Encoding InternalDetectEncoding(
5087     CEDInternalFlags flags, const char* text, int text_length,
5088     const char* url_hint, const char* http_charset_hint,
5089     const char* meta_charset_hint, const int encoding_hint,
5090     const Language language_hint,  // User interface lang
5091     const CompactEncDet::TextCorpusType corpus_type,
5092     bool ignore_7bit_mail_encodings, int* bytes_consumed, bool* is_reliable,
5093     Encoding* second_best_enc) {
5094   *bytes_consumed = 0;
5095   *is_reliable = false;
5096   *second_best_enc = ASCII_7BIT;
5097 
5098   if (text_length == 0) {
5099     // Follow the spec. Text might be NULL.
5100     *is_reliable = true;
5101     return ISO_8859_1;
5102   }
5103 
5104   // For very short (20-50 byte) input strings that are highly likely to be
5105   // all printable ASCII, our startup overhead might dominate. We have to do the
5106   // full detection if the ISO-2022-xx, HZ, or UTF-7 encodings are possible.
5107   // Otherwise, we can do a quick scan for printable ASCII.
5108   if ((text_length <= 500) && ignore_7bit_mail_encodings &&
5109       QuickPrintableAsciiScan(text, text_length)) {
5110     *is_reliable = true;
5111     return ASCII_7BIT;
5112   }
5113 
5114   // Go for the full boat detection
5115   DetectEncodingState destate;
5116   InitDetectEncodingState(&destate);
5117 
5118   std::unique_ptr<DetailEntry[]> scoped_debug_data;
5119   if (FLAGS_enc_detect_detail) {
5120     // Allocate max 10 details per bigram
5121     scoped_debug_data.reset(new DetailEntry[kMaxPairs * 10]);
5122     destate.debug_data = scoped_debug_data.get();
5123     // NOTE: destate and scoped_debug_data have exactly the same scope
5124     // All other FLAGS_enc_detect_detail tests use destate.debug_data != NULL
5125   }
5126 
5127   // Get text length limits
5128   // Typically, we scan the first 16KB looking for all encodings, then
5129   // scan the rest (up to 256KB) a bit faster by no longer looking for
5130   // interesting bytes below 0x80. This allows us to skip over runs of
5131   // 7-bit-ASCII much more quickly.
5132   int slow_len = minint(text_length, (FLAGS_enc_detect_slow_max_kb << 10));
5133   int fast_len = minint(text_length, (FLAGS_enc_detect_fast_max_kb << 10));
5134 
5135   // Initialize pointers.
5136   // In general, we do not look at last 3 bytes of input in the fast scan
5137   // We do, however want to look at the last byte or so in the slow scan,
  // especially in the case of a very short text whose only interesting
5139   // information is a 3-byte UTF-8 character in the last three bytes.
5140   // If necessary, we fake a last bigram with 0x20 space as a pad byte.
5141   const uint8* isrc = reinterpret_cast<const uint8*>(text);
5142   const uint8* src = isrc;
5143   const uint8* srctextlimit = isrc + text_length;
5144   const uint8* srclimitslow2 = isrc + slow_len - 1;
5145   const uint8* srclimitfast2 = isrc + fast_len - 1;
5146   const uint8* srclimitfast4 = isrc + fast_len - 3;
5147   if (srclimitslow2 > srclimitfast2) {
5148     srclimitslow2 = srclimitfast2;
5149   }
5150   destate.initial_src = isrc;
5151   destate.limit_src = srclimitfast2 + 1;      // May include last byte
5152   destate.prior_src = isrc;
5153   destate.last_pair = isrc - 2;
5154 
5155   const char* scan_table = kTestPrintableAsciiTildePlus;
5156   if (ignore_7bit_mail_encodings) {
5157     // Caller wants to ignore UTF-7, HZ, ISO-2022-xx
5158     // Don't stop on + (for UTF-7), nor on ~ (for HZ)
5159     scan_table = kTestPrintableAscii;
5160   }
5161   int exit_reason = 0;
5162 
5163   if (destate.debug_data != NULL) {
5164     BeginDetail(&destate);
5165     // Take any incoming watch encoding name and backmap to the corresponding
5166     // ranked enum value
5167     watch1_rankedenc = LookupWatchEnc(FLAGS_enc_detect_watch1);
5168     if (watch1_rankedenc >= 0) {
5169       fprintf(stderr, "/track-me %d def\n", watch1_rankedenc);
5170     }
5171 
5172     watch2_rankedenc = LookupWatchEnc(FLAGS_enc_detect_watch2);
5173     if (watch2_rankedenc >= 0) {
5174       fprintf(stderr, "/track-me2 %d def\n", watch2_rankedenc);
5175     }
5176 
5177     fprintf(stderr, "%% kDerateHintsBelow = %d\n", kDerateHintsBelow);
5178   }
5179   if (FLAGS_enc_detect_source) {
5180     PsSourceInit(kPsSourceWidth);
5181     PsSource(src, isrc, srctextlimit);
5182     PsMark(src, 4, isrc, 0);
5183   }
5184 
5185   // Apply hints, if any, to probabilities
  // NOTE: Encoding probabilities are all zero at this point
5187   ApplyHints(url_hint,
5188              http_charset_hint,
5189              meta_charset_hint,
5190              encoding_hint,
5191              language_hint,
5192              corpus_type,
5193              &destate);
5194 
5195   // NOTE: probabilities up to this point are subject to derating for
5196   // small numbers of bigrams.
5197   // Probability changes after this point are not derated.
5198 
5199   // Do first 4 bytes to pick off strong markers
5200   InitialBytesBoost(isrc, text_length, &destate);
5201 
5202   bool ignored_some_tag_text = false;
5203   int tag_text_bigram_count = 0;
5204 
5205   // Slower loop, approx 500 MB/sec (2.8 GHz P4)
5206   // ASSERT(srclimitslow2 <= srclimitfast2);
5207   //====================================
5208  DoMoreSlowLoop:
5209   while (src < srclimitslow2) {
5210     // Skip to next interesting byte (this is the slower part)
5211     while (src < srclimitslow2) {
5212       uint8 uc = *src++;
5213       if (scan_table[uc] != 0) {exit_reason = scan_table[uc]; src--; break;}
5214     }
5215 
5216     if (src < srclimitslow2) {
5217       if (FLAGS_enc_detect_source) {
5218         PsSource(src, isrc, srctextlimit);    // don't mark yet
5219       }
5220 
5221       int weightshift = 0;
5222       // In the first 16KB, derate new text run inside <title>...</title> and
5223       // inside <!-- ... -->
5224       if (////((destate.last_pair + 6) <= src) &&             // if beyond last one
5225           ////(tag_text_bigram_count < kMaxBigramsTagTitleText) &&
5226           (corpus_type == CompactEncDet::WEB_CORPUS) &&   // and web page
5227           !CEDFlagForceTags(flags)) {                     // and OK to skip
5228         ////if (TextInsideTag(destate.last_pair + 2, src, srclimitslow2)) {
5229         if (TextInsideTag(isrc, src, srclimitslow2)) {
5230           if (tag_text_bigram_count >= kMaxBigramsTagTitleText) {
5231             ignored_some_tag_text = true;
5232             src = SkipToTagEnd(src, srclimitslow2);
5233             continue;
5234           } else {
5235             weightshift = kWeightshiftForTagTitleText;
5236             ++tag_text_bigram_count;
5237           }
5238         }
5239       }
5240       if (FLAGS_enc_detect_source) {
5241         PsMark(src, 2, isrc, weightshift);
5242       }
5243       // Saves byte pair and offset
5244       bool pruned = IncrementAndBoostPrune(src, srctextlimit - src,
5245                                            &destate, weightshift, exit_reason);
5246       // Advance; if inside tag, advance to end of tag
5247       if (weightshift == 0) {
5248         src += exit_reason;               // 1 Ascii, 2 other
5249       } else {
5250         src += exit_reason;               // 1 Ascii, 2 other
5251         //// src = SkipToTagEnd(src, srclimitslow2);
5252       }
5253 
5254       if (pruned) {
5255         // Scoring and active encodings have been updated
5256         if (destate.done) {break;}
5257         // Check if all the reasons for the slow loop have been pruned
5258         // If so, go to fast loop
5259         if (!SevenBitActive(&destate)) {break;}
5260       }
5261     }
5262   }
5263   //====================================
5264 
5265   // We reached the end of a slow scan, possibly because no more SevenBitActive,
5266   // or possibly are at end of source.
5267   // If we are exactly at the end of the source, make sure we look at the very
5268   // last byte.
5269   bool very_last_byte_incremented = false;
5270   if (src == (srctextlimit - 1)) {
5271     exit_reason = scan_table[*src];
5272     if (exit_reason != 0) {
5273       // The very last byte is an interesting byte
5274       // Saves byte pair and offset
5275       //printf("Interesting very last slow byte = 0x%02x\n", *src);
5276       IncrementAndBoostPrune(src, srctextlimit - src, &destate, 0, exit_reason);
5277       very_last_byte_incremented = true;
5278     }
5279   }
5280 
5281   if (FLAGS_enc_detect_source) {
5282     PsSource(src, isrc, srctextlimit);
5283     PsMark(src, 2, isrc, 0);
5284   }
5285   // Force a pruning based on whatever we have
5286   // Delete the seven-bit encodings if there is no evidence of them so far
5287   BoostPrune(src, &destate, PRUNE_SLOWEND);
5288 
5289   if (!destate.done) {
5290     // If not clear yet on 7-bit-encodings and more bytes, do more slow
5291     if (SevenBitActive(&destate) && (src < srclimitfast2)) {
5292       // Increment limit by another xxxK
5293       slow_len += (FLAGS_enc_detect_slow_max_kb << 10);
5294       srclimitslow2 = isrc + slow_len - 1;
5295       if (srclimitslow2 > srclimitfast2) {
5296         srclimitslow2 = srclimitfast2;
5297       }
5298       if (!UTF7OrHzActive(&destate)) {
5299         // We can switch to table that does not stop on + ~
5300         scan_table = kTestPrintableAscii;
5301       }
5302       goto DoMoreSlowLoop;
5303     }
5304 
5305 
5306     exit_reason = 2;
5307     // Faster loop, no 7-bit-encodings possible, approx 3000 GB/sec
5308     //====================================
5309     while (src < srclimitfast2) {
5310       // Skip to next interesting byte (this is the faster part)
5311       while (src < srclimitfast4) {
5312         if (((src[0] | src[1] | src[2] | src[3]) & 0x80) != 0) break;
5313         src += 4;
5314       }
5315 
5316       while (src < srclimitfast2) {
5317         if ((src[0] & 0x80) != 0) break;
5318         src++;
5319       }
5320 
5321       if (src < srclimitfast2) {
5322         if (FLAGS_enc_detect_source) {
5323           PsSource(src, isrc, srctextlimit);
5324           PsMark(src, 2, isrc, 0);
5325         }
5326         // saves byte pair and offset
5327         bool pruned = IncrementAndBoostPrune(src, srctextlimit - src,
5328                                              &destate, 0, exit_reason);
5329         src += exit_reason;               // 1 Ascii, 2 other
5330         if (pruned) {
5331           // Scoring and active encodings have been updated
5332           if (destate.done) {break;}
5333         }
5334       }
5335     }
5336     //====================================
5337     // We reached the end of fast scan
5338 
5339     // If we are exactly at the end of the source, make sure we look at the very
5340     // last byte.
5341     if (src == (srctextlimit - 1) && !very_last_byte_incremented) {
5342       exit_reason = scan_table[*src];
5343       if (exit_reason != 0) {
5344         // The very last byte is an interesting byte
5345         // Saves byte pair and offset
5346         //printf("Interesting very last fast byte = 0x%02x\n", *src);
5347         IncrementAndBoostPrune(src, srctextlimit - src, &destate, 0, exit_reason);
5348         very_last_byte_incremented = true;
5349       }
5350     }
5351 
5352   }     // End if !done
5353 
5354   if (FLAGS_enc_detect_source) {
5355     PsSource(src, isrc, srctextlimit);
5356     PsMark(src, 2, isrc, 0);
5357   }
5358   // Force a pruning based on whatever we have
5359   BoostPrune(src, &destate, PRUNE_FINAL);
5360 
5361   if (FLAGS_enc_detect_summary) {
5362     DumpSummary(&destate, AsciiPair, 32);
5363     DumpSummary(&destate, OtherPair, 32);
5364   }
5365   if (FLAGS_enc_detect_source) {
5366     PsSourceFinish();
5367   }
5368   if (destate.debug_data != NULL) {
5369     //// DumpDetail(&destate);
5370   }
5371 
5372 
5373   if (ignored_some_tag_text &&
5374       (kMapToEncoding[destate.top_rankedencoding] == ASCII_7BIT)) {
5375     // There were some interesting bytes, but only in tag text.
5376     // Recursive call to reprocess looking at the tags this time.
5377 
5378     if (destate.debug_data != NULL) {
5379       SetDetailsEncLabel(&destate, ">> Recurse/tags");
5380       // Print the current chart before recursive call
5381       DumpDetail(&destate);
5382 
5383       char buff[32];
5384       snprintf(buff, sizeof(buff), ">> Recurse for tags");
5385       PsRecurse(buff);
5386     }
5387 
5388     // Recursive call for high bytes in tags [no longer used, 1/16 tag score]
5389     Encoding enc2 = InternalDetectEncoding(
5390                              kCEDForceTags,  // force
5391                              text,
5392                              text_length,
5393                              url_hint,
5394                              http_charset_hint,
5395                              meta_charset_hint,
5396                              encoding_hint,
5397                              language_hint,
5398                              corpus_type,
5399                              ignore_7bit_mail_encodings,
5400                              bytes_consumed,
5401                              is_reliable,
5402                              second_best_enc);
5403 
5404     if (destate.debug_data != NULL) {
5405       // Show winning encoding and dump PostScript
5406       char buff[32];
5407       snprintf(buff, sizeof(buff), "=2 %s", MyEncodingName(enc2));
5408       SetDetailsEncProb(&destate,
5409                         0,
5410                         CompactEncDet::BackmapEncodingToRankedEncoding(enc2),
5411                         buff);
5412       DumpDetail(&destate);
5413     }
5414 
5415     return enc2;
5416   }
5417 
5418 
5419   // If the detected encoding does not match default/hints, or if the hints
5420   // conflict with each other, mark as unreliable. This can be used to trigger
5421   // further scoring.
5422   // Three buckets of input documents;
5423   // ~19% of the web no hints, and top == 7bit, Latin1, or CP1252
5424   // ~79% of the web one or more hints, all same encoding X and top == X
5425   // ~ 2% of the web one or more hints that are inconsistent
5426 
5427   Encoding top_enc = kMapToEncoding[destate.top_rankedencoding];
5428   Encoding one_hint = destate.http_hint;
5429   if ((one_hint == UNKNOWN_ENCODING) &&
5430       (destate.meta_hint != UNKNOWN_ENCODING)) {
5431     one_hint = destate.meta_hint;
5432   }
5433   if ((one_hint == UNKNOWN_ENCODING) &&
5434       (destate.bom_hint != UNKNOWN_ENCODING)) {
5435     one_hint = destate.bom_hint;
5436   }
5437 
5438   bool found_compatible_encoding = true;
5439   if (one_hint == UNKNOWN_ENCODING) {
5440     // [~14% of the web] No hints, and top == 7bit, Latin1, or CP1252
5441     if (!CompatibleEnc(ISO_8859_1, top_enc)) {
5442       found_compatible_encoding = false;
5443       // If there is nothing but a TLD hint and its top encoding matches, OK
5444       if ((destate.tld_hint != UNKNOWN_ENCODING) &&
5445           CompatibleEnc(destate.tld_hint, top_enc)) {
5446         found_compatible_encoding = true;
5447       }
5448     }
5449   } else if (CompatibleEnc(one_hint, destate.http_hint) &&
5450              CompatibleEnc(one_hint, destate.meta_hint) &&
5451              CompatibleEnc(one_hint, destate.bom_hint)) {
5452     // [~83% of the web] One or more hints, all same encoding X and top == X
5453     if (!CompatibleEnc(one_hint, top_enc)) {
5454       // [~ 2% of the web] Oops, not the declared encoding
5455       found_compatible_encoding = false;
5456     }
5457   } else {
5458     // [~ 3% of the web] Two or more hints that are inconsistent
5459     one_hint = UNKNOWN_ENCODING;
5460     found_compatible_encoding = false;
5461   }
5462 
5463   // If we turned Latin1 into Latin2 or 7 via trigrams, don't fail it here
5464   if (destate.do_latin_trigrams) {
5465     if (CompatibleEnc(kMapToEncoding[F_Latin1], top_enc) ||
5466         CompatibleEnc(kMapToEncoding[F_Latin2], top_enc) ||
5467         CompatibleEnc(kMapToEncoding[F_CP1250], top_enc) ||
5468         CompatibleEnc(kMapToEncoding[F_ISO_8859_13], top_enc)) {
5469       found_compatible_encoding = true;
5470       destate.reliable = true;
5471     }
5472   }
5473 
5474   // If top encoding is not compatible with the hints, but it is reliably
5475   // UTF-8, accept it anyway.
5476   // This will perform badly with mixed UTF-8 prefix plus another encoding in
5477   // the body if done too early, so we want to be rescanning.
5478   if (!found_compatible_encoding &&
5479       destate.reliable &&
5480       NoHintsCloseEnoughCompatible(top_enc) &&
5481       (destate.next_interesting_pair[OtherPair] >= kStrongPairs) &&
5482       CEDFlagRescanning(flags)) {
5483     found_compatible_encoding = true;
5484   }
5485 
5486   // Hold off on this so Rescan() can see if the original encoding was reliable
5487   //if (!found_compatible_encoding) {
5488   //  destate.reliable = false;
5489   //}
5490 
5491   // If unreliable, try rescoring to separate some encodings
5492   if (!destate.reliable || !found_compatible_encoding) {
5493     top_enc = Rescore(top_enc, isrc, srctextlimit, &destate);
5494   }
5495 
5496   *second_best_enc = kMapToEncoding[destate.second_top_rankedencoding];
5497 
5498   // If unreliable, and not already rescanning,
5499   // rescan middle of document to see if we can get a better
5500   // answer. Rescan is only worthwhile if there are ~200 bytes or more left,
5501   // since the detector takes as much as 96 bytes of bigrams to decide.
5502   //
5503   // CANNOT retry ISO-2022-xx HZ etc. because no declaration escape at the front
5504   // or we may land in the middle of some partial state. Skip them all.
5505   //
5506   if ((!destate.reliable || !found_compatible_encoding) &&
5507       !CEDFlagRescanning(flags) &&
5508       !SevenBitEncoding(top_enc)) {
5509     top_enc = Rescan(top_enc,
5510                      isrc,
5511                      src,
5512                      srctextlimit,
5513                      url_hint,
5514                      http_charset_hint,
5515                      meta_charset_hint,
5516                      encoding_hint,
5517                      language_hint,
5518                      corpus_type,
5519                      ignore_7bit_mail_encodings,
5520                      &destate);
5521   } else {
5522     if (!found_compatible_encoding) {
5523       destate.reliable = false;
5524     }
5525   }
5526 
5527   if (destate.debug_data != NULL) {
5528     // Dump PostScript
5529     DumpDetail(&destate);
5530   }
5531 
5532   *bytes_consumed = src - isrc + 1;       // We looked 1 byte beyond src
5533   *is_reliable = destate.reliable;
5534   return top_enc;
5535 }
5536 
DetectEncoding(const char * text,int text_length,const char * url_hint,const char * http_charset_hint,const char * meta_charset_hint,const int encoding_hint,const Language language_hint,const TextCorpusType corpus_type,bool ignore_7bit_mail_encodings,int * bytes_consumed,bool * is_reliable)5537 Encoding CompactEncDet::DetectEncoding(
5538     const char* text, int text_length, const char* url_hint,
5539     const char* http_charset_hint, const char* meta_charset_hint,
5540     const int encoding_hint,
5541     const Language language_hint,  // User interface lang
5542     const TextCorpusType corpus_type, bool ignore_7bit_mail_encodings,
5543     int* bytes_consumed, bool* is_reliable) {
5544   if (FLAGS_ced_echo_input) {
5545     string temp(text, text_length);
5546     fprintf(stderr, "CompactEncDet::DetectEncoding()\n%s\n\n", temp.c_str());
5547   }
5548 
5549   if (FLAGS_counts) {
5550     encdet_used = 0;
5551     rescore_used = 0;
5552     rescan_used = 0;
5553     robust_used = 0;
5554     looking_used = 0;
5555     doing_used = 0;
5556     ++encdet_used;
5557   }
5558   if (FLAGS_dirtsimple) {
5559     // Just count first 64KB bigram encoding probabilities for each encoding
5560     int robust_renc_list_len;         // Number of active encodings
5561     int robust_renc_list[NUM_RANKEDENCODING];   // List of ranked encodings
5562     int robust_renc_probs[NUM_RANKEDENCODING];  // List of matching probs
5563 
5564     for (int i = 0; i < NUM_RANKEDENCODING; ++i) {
5565       robust_renc_list[i] = i;
5566     }
5567     robust_renc_list_len = NUM_RANKEDENCODING;
5568 
5569     RobustScan(text, text_length,
5570                  robust_renc_list_len, robust_renc_list, robust_renc_probs);
5571 
5572     // Pick off best encoding
5573     int best_prob = -1;
5574     Encoding enc = UNKNOWN_ENCODING;
5575     for (int i = 0; i < robust_renc_list_len; ++i) {
5576       if (best_prob < robust_renc_probs[i]) {
5577         best_prob = robust_renc_probs[i];
5578         enc = kMapToEncoding[robust_renc_list[i]];
5579       }
5580     }
5581 
5582     *bytes_consumed = minint(text_length, (kMaxKBToRobustScan << 10));
5583     *is_reliable = true;
5584     if (FLAGS_counts) {
5585       printf("CEDcounts ");
5586       while (encdet_used--) {printf("encdet ");}
5587       while (rescore_used--) {printf("rescore ");}
5588       while (rescan_used--) {printf("rescan ");}
5589       while (robust_used--) {printf("robust ");}
5590       while (looking_used--) {printf("looking ");}
5591       while (doing_used--) {printf("doing ");}
5592       printf("\n");
5593     }
5594 
5595     return enc;
5596   }
5597 
5598   Encoding second_best_enc;
5599   Encoding enc = InternalDetectEncoding(kCEDNone,
5600                            text,
5601                            text_length,
5602                            url_hint,
5603                            http_charset_hint,
5604                            meta_charset_hint,
5605                            encoding_hint,
5606                            language_hint,   // User interface lang
5607                            corpus_type,
5608                            ignore_7bit_mail_encodings,
5609                            bytes_consumed,
5610                            is_reliable,
5611                            &second_best_enc);
5612   if (FLAGS_counts) {
5613     printf("CEDcounts ");
5614     while (encdet_used--) {printf("encdet ");}
5615     while (rescore_used--) {printf("rescore ");}
5616     while (rescan_used--) {printf("rescan ");}
5617     while (robust_used--) {printf("robust ");}
5618     while (looking_used--) {printf("looking ");}
5619     while (doing_used--) {printf("doing ");}
5620     printf("\n");
5621   }
5622 
5623 #if defined(HTML5_MODE)
5624   // Map all the Shift-JIS variants to Shift-JIS when used in Japanese locale.
5625   if (language_hint == JAPANESE && IsShiftJisOrVariant(enc)) {
5626     enc = JAPANESE_SHIFT_JIS;
5627   }
5628 
5629   // 7-bit encodings (except ISO-2022-JP), and some obscure encodings not
5630   // supported in WHATWG encoding standard are marked as ASCII to keep the raw
5631   // bytes intact.
5632   switch (enc) {
5633     case ISO_2022_KR:
5634     case ISO_2022_CN:
5635     case HZ_GB_2312:
5636     case UTF7:
5637     case UTF16LE:
5638     case UTF16BE:
5639 
5640     case CHINESE_EUC_DEC:
5641     case CHINESE_CNS:
5642     case CHINESE_BIG5_CP950:
5643     case JAPANESE_CP932:
5644     case MSFT_CP874:
5645     case TSCII:
5646     case TAMIL_MONO:
5647     case TAMIL_BI:
5648     case JAGRAN:
5649     case BHASKAR:
5650     case HTCHANAKYA:
5651     case BINARYENC:
5652     case UTF8UTF8:
5653     case TAM_ELANGO:
5654     case TAM_LTTMBARANI:
5655     case TAM_SHREE:
5656     case TAM_TBOOMIS:
5657     case TAM_TMNEWS:
5658     case TAM_WEBTAMIL:
5659     case KDDI_SHIFT_JIS:
5660     case DOCOMO_SHIFT_JIS:
5661     case SOFTBANK_SHIFT_JIS:
5662     case KDDI_ISO_2022_JP:
5663     case SOFTBANK_ISO_2022_JP:
5664       enc = ASCII_7BIT;
5665       break;
5666     default:
5667       break;
5668   }
5669 #endif
5670 
5671   return enc;
5672 }
5673 
5674 
5675 // Return top encoding hint for given string
TopEncodingOfLangHint(const char * name)5676 Encoding CompactEncDet::TopEncodingOfLangHint(const char* name) {
5677   string normalized_lang = MakeChar8(string(name));
5678   int n = HintBinaryLookup8(kLangHintProbs, kLangHintProbsSize,
5679                            normalized_lang.c_str());
5680   if (n < 0) {return UNKNOWN_ENCODING;}
5681 
5682   // Charset is eight bytes, probability table is eight bytes
5683   int toprankenc =
5684     TopCompressedProb(&kLangHintProbs[n].key_prob[kMaxLangKey],
5685                       kMaxLangVector);
5686   return kMapToEncoding[toprankenc];
5687 }
5688 
5689 // Return top encoding hint for given string
TopEncodingOfTLDHint(const char * name)5690 Encoding CompactEncDet::TopEncodingOfTLDHint(const char* name) {
5691   string normalized_tld = MakeChar4(string(name));
5692   int n = HintBinaryLookup4(kTLDHintProbs, kTLDHintProbsSize,
5693                            normalized_tld.c_str());
5694   if (n < 0) {return UNKNOWN_ENCODING;}
5695 
5696   // TLD is four bytes, probability table is 12 bytes
5697   int toprankenc =
5698     TopCompressedProb(&kTLDHintProbs[n].key_prob[kMaxTldKey],
5699                       kMaxTldVector);
5700   return kMapToEncoding[toprankenc];
5701 }
5702 
5703 // Return top encoding hint for given string
TopEncodingOfCharsetHint(const char * name)5704 Encoding CompactEncDet::TopEncodingOfCharsetHint(const char* name) {
5705   string normalized_charset = MakeChar44(string(name));
5706   int n = HintBinaryLookup8(kCharsetHintProbs, kCharsetHintProbsSize,
5707                            normalized_charset.c_str());
5708   if (n < 0) {return UNKNOWN_ENCODING;}
5709 
5710   // Charset is eight bytes, probability table is eight bytes
5711   int toprankenc =
5712     TopCompressedProb(&kCharsetHintProbs[n].key_prob[kMaxCharsetKey],
5713                       kMaxCharsetVector);
5714   return kMapToEncoding[toprankenc];
5715 }
5716 
Version(void)5717 const char* CompactEncDet::Version(void) {
5718   return kVersion;
5719 }
5720