1 // Copyright 2016 Google Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 ////////////////////////////////////////////////////////////////////////////////
16
17 #include "compact_enc_det/compact_enc_det.h"
18
19 #include <math.h> // for sqrt
20 #include <stddef.h> // for size_t
21 #include <stdio.h> // for printf, fprintf, NULL, etc
22 #include <stdlib.h> // for qsort
23 #include <string.h> // for memset, memcpy, memcmp, etc
24 #include <memory>
25 #include <string> // for string, operator==, etc
26
27 #include "compact_enc_det/compact_enc_det_hint_code.h"
28 #include "util/string_util.h"
29 #include "util/basictypes.h"
30 #include "util/commandlineflags.h"
31 #include "util/logging.h"
32
33 using std::string;
34
35 // TODO as of 2007.10.09:
36 //
37 // Consider font=TT-BHxxx as user-defined => binary
38 // Demote GB18030 if no 8x3x pair
39 // Map byte2 ascii punct to 0x60, digits to 0x7e, gets them into hires
40 // Consider removing/ignoring bytes 01-1F to avoid crap pollution
41 // Possibly boost declared encoding in robust scan
42 // googlebot tiny files
43 // look for ranges of encodings
44 // consider tags just as > < within aligned block of 32
45 // flag too few characters in postproc (Latin 6 problem)
46 // Remove slow scan beyond 16KB
47 // Consider removing kMostLikelyEncoding or cut it in half
48
49
50 // A note on mixed encodings
51 //
52 // The most common encoding error on the web is a page containing a mixture of
53 // CP-1252 and UTF-8. A less common encoding error is a third-party feed that
54 // has been converted from CP-1252 to UTF-8 and then those bytes converted a
55 // second time to UTF-8. CED originally attempted to detect these error cases
56 // by using two synthetic encodings, UTF8CP1252 and UTF8UTF8. The intended
57 // implementation was to start these just below CP1252 and UTF8 respectively in
// overall likelihood, and allow 1252 and UTF8 to fall behind if mixtures are
59 // found.
60 //
61 // The UTF8UTF8 encoding is a possible outcome from CED, but unfortunately the
62 // UTF8CP1252 internal encoding was added late and not put into encodings.proto,
63 // so at the final step it is mapped to UTF8UTF8 also. This was a bad idea and
64 // is removed in this November 2011 CL.
65 //
66 // Mixed encoding detection never worked out as well as envisioned, so the
67 // ced_allow_utf8utf8 flag normally disables all this.
68 //
69 // The effect is that CP-1252 and UTF-8 mixtures will usually be detected as
70 // UTF8, and the inputconverter code for UTF8 normally will convert bare
71 // CP-1252 bytes to UTF-8, instead of the less-helpful FFFD substitution. UTF-8
72 // and double-UTF-8 mixtures will be detected as UTF-8, and the double
73 // conversion will stand.
74 //
75 // However, it is occasionally useful to use CED to detect double-converted
76 // UTF-8 coming from third-party data feeds, so they can be fixed at the source.
77 // For this purpose, the UTF8UTF8 encoding remains available under the
78 // ced_allow_utf8utf8 flag.
79 //
80 // When UTF8UTF8 is detected, the inputconverter code will undo the double
81 // conversion, giving good text.
82
83 // Norbert Runge has noted these words in CP1252 that are mistakenly identified
84 // as UTF-8 because of the last pair of characters:
85 // NESTLÉ® 0xC9 0xAE U+00C9 U+00AE C9AE = U+026E;SMALL LEZH
86 // drauß\u2019 0xDF 0x92 U+00DF U+2019 DF92 = U+07D2;NKO LETTER N
87 // Mutterschoß\u201c 0xDF 0x93 U+00DF U+201C DF93 = U+07D3;NKO LETTER BA
88 // Schoß\u201c 0xDF 0x93 U+00DF U+201C
// weiß\u201c 0xDF 0x93 U+00DF U+201C
90 // Schnellfuß\u201c 0xDF 0x93 U+00DF U+201C
91 // süß« 0xDF 0xAB U+00DF U+00AB DFAB = U+07EB;NKO HIGH TONE
92 // These four byte combinations now explicitly boost Latin1/CP1252.
93
94 // And for reference, here are a couple of Portuguese spellings
95 // that may be mistaken as double-byte encodings.
96 // informações 0xE7 0xF5
97 // traição 0xE7 0xE3
98
99
// Version string of this encoding detector.
// Both the characters and the pointer itself are const, so the version cannot
// be reseated or modified after static initialization.
static const char* const kVersion = "2.2";
101
102 DEFINE_bool(ced_allow_utf8utf8, false, "Allow the UTF8UTF8 encoding, "
103 "to handle mixtures of CP1252 "
104 "converted to UTF-8 zero, one, "
105 "or two times");
106 DEFINE_int32(enc_detect_slow_max_kb, 16,
107 "Maximum number of Kbytes to examine for "
108 "7-bit-only (2022, Hz, UTF7) encoding detect. "
109 "You are unlikely to want to change this.");
110 DEFINE_int32(enc_detect_fast_max_kb, 256,
111 "Maximum number of Kbytes to examine for encoding detect. "
112 "You are unlikely to want to change this.");
113
114 DEFINE_int32(ced_reliable_difference, 300, "30 * Bits of minimum probablility "
115 "difference 1st - 2nd to be considered reliable \n"
116 " 2 corresponds to min 4x difference\n"
117 " 4 corresponds to min 16x difference\n"
118 " 8 corresponds to min 256x difference\n"
119 " 10 corresponds to min 1024x difference\n"
120 " 20 corresponds to min 1Mx difference.");
121
122 // Text debug output options
123 DEFINE_bool(enc_detect_summary, false,
124 "Print first 16 interesting pairs at exit.");
125 DEFINE_bool(counts, false, "Count major-section usage");
126
127 // PostScript debug output options
128 DEFINE_bool(enc_detect_detail, false,
129 "Print PostScript of every update, to stderr.");
130 DEFINE_bool(enc_detect_detail2, false,
131 "More PostScript detail of every update, to stderr.");
132 DEFINE_bool(enc_detect_source, false, "Include source text in detail");
133 // Encoding name must exactly match FIRST column of kI18NInfoByEncoding in
134 // lang_enc.cc
135
136 // Following flags are not in use. Replace them with constants to
137 // avoid static initialization.
138
139 //DEFINE_string(enc_detect_watch1, "", "Do detail2 about this encoding name.");
140 //DEFINE_string(enc_detect_watch2, "", "Do detail2 about this encoding name.");
141
142 static const char* const FLAGS_enc_detect_watch1 = "";
143 static const char* const FLAGS_enc_detect_watch2 = "";
144
145 // Only for experiments. Delete soon.
146 DEFINE_bool(force127, false, "Force Latin1, Latin2, Latin7 based on trigrams");
147
148 // Demo-mode/debugging experiment
149 DEFINE_bool(demo_nodefault, false,
150 "Default to all equal; no boost for declared encoding.");
151 DEFINE_bool(dirtsimple, false, "Just scan and count for all encodings");
152 DEFINE_bool(ced_echo_input, false, "Echo ced input to stderr");
153
154
// Score scaling. Scores are scaled base-2 logarithms:
//   XLOG2     -- multiplier for log base 2 ** n      (whole bits)
//   XDECILOG2 -- multiplier for log base 2 ** n/10   (tenths of a bit)
static const int XDECILOG2 = 3;
static const int XLOG2 = 30;

// Pruning thresholds: an encoding whose score trails the top score by at
// least this many bits of probability difference (1st - nth) is pruned.
// Final value of the threshold:
static const int kFinalPruneDifference = 10 * XLOG2;
// Initial value of the threshold:
static const int kInititalPruneDifference = 4 * kFinalPruneDifference;
// Amount by which the threshold is decremented:
static const int kPruneDiffDecrement = kFinalPruneDifference;

// Bits of minimum probability difference, base to superset encodings.
static const int kSmallInitDiff = 2 * XLOG2;

// Bits of boost for initial byte patterns (BOM, 00).
static const int kBoostInitial = 20 * XLOG2;

// Bits of whack for one bad pair.
static const int kBadPairWhack = 20 * XLOG2;

// Bits of boost for one good pair in Hz, etc.
static const int kBoostOnePair = 20 * XLOG2;

// Bits of boost for one good sequence.
static const int kGentleOnePair = 4 * XLOG2;

// Bits of whack for an ill-formed sequence.
static const int kGentlePairWhack = 2 * XLOG2;

// Bits of boost for a well-formed sequence.
static const int kGentlePairBoost = 2 * XLOG2;

// Bits/10 of boost for the best declared encoding per bigram.
static const int kDeclaredEncBoost = 5 * XDECILOG2;

// Bits/10 of boost for the best encoding per bigram.
static const int kBestEncBoost = 5 * XDECILOG2;

// Bits of boost for a Latin127 trigram.
static const int kTrigramBoost = 2 * XLOG2;

// Maximum number of interesting pairs to look at.
// If you change this, adjust the *PruneDiff* constants above.
static const int kMaxPairs = 48;

// Prune every 8 interesting pairs.
static const int kPruneMask = 0x07;
208
209
// For the first N pairs, do an extra boost based on the most likely encoding
// of the pair over the entire web.
static const int kBestPairsCount = 16;

// If we have fewer than N bigrams, weaken the hints enough that unhinted
// encodings have a hope of rising to the top.
static const int kDerateHintsBelow = 12;

// Don't bother rescanning for an unreliable encoding if fewer than this many
// bytes are unscanned. We will rescan at most the last half of this.
static const int kMinRescanLength = 800;

// Binary-detection strengths.
static const int kStrongBinary = 12;  // Make F_BINARY the only encoding
static const int kWeakerBinary = 4;   // Make F_BINARY the likely encoding

// Byte counts measured from the front of the file.
static const int kBinaryHardAsciiLimit = 6 * 1024;  // Not binary if all ASCII
static const int kBinarySoftAsciiLimit = 8 * 1024;  // Not binary if mostly
                                                    // ASCII

// We try here to avoid having title text dominate the encoding detection,
// for the not-infrequent error case of title in encoding1, body in encoding2:
// we want to bias toward encoding2 winning.
//
// kMaxBigramsTagTitleText should be a multiple of 2, 3, and 4, so that we
// rarely cut off mid-character in the original (not-yet-detected) encoding.
// This matters most for UTF-8 two- and three-byte codes and for
// Shift-JIS three-byte codes.
static const int kMaxBigramsTagTitleText = 12;  // Keep only some tag text

// Right-shift applied to the weight of text in tags, etc.: a shift of 4 gives
// that text 1/16 of normal weight.
static const int kWeightshiftForTagTitleText = 4;

// Let a reliable encoding with this many pairs overcome a missing hint.
static const int kStrongPairs = 6;
246
// Internal option bits for the detector. Each non-zero value is a distinct
// single bit.
enum CEDInternalFlags {
  kCEDNone       = 0,       // The empty flag
  kCEDRescanning = 1 << 0,  // Do not further recurse
  kCEDSlowscore  = 1 << 1,  // Do extra scoring
  kCEDForceTags  = 1 << 2,  // Always examine text inside tags
};
253
254 // Forward declaration
255 Encoding InternalDetectEncoding(
256 CEDInternalFlags flags, const char* text, int text_length,
257 const char* url_hint, const char* http_charset_hint,
258 const char* meta_charset_hint, const int encoding_hint,
259 const Language language_hint, // User interface lang
260 const CompactEncDet::TextCorpusType corpus_type,
261 bool ignore_7bit_mail_encodings, int* bytes_consumed, bool* is_reliable,
262 Encoding* second_best_enc);
263
264 typedef struct {
265 const uint8* hires[4]; // Pointers to possible high-resolution bigram deltas
266 uint8 x_bar; // Average byte2 value
267 uint8 y_bar; // Average byte1 value
268 uint8 x_stddev; // Standard deviation of byte2 value
269 uint8 y_stddev; // Standard deviation of byte1 value
270 int so; // Scaling offset -- add to probabilities below
271 uint8 b1[256]; // Unigram probability for first byte of aligned bigram
272 uint8 b2[256]; // Unigram probability for second byte of aligned bigram
273 uint8 b12[256]; // Unigram probability for cross bytes of aligned bigram
274 } UnigramEntry;
275
276 //typedef struct {
277 // uint8 b12[256*256]; // Bigram probability for aligned bigram
278 //} FullBigramEntry;
279
280
281 // Include all the postproc-generated tables here:
282 // RankedEncoding
283 // kMapToEncoding
284 // unigram_table
// kMostLikelyEncoding
286 // kTLDHintProbs
287 // kCharsetHintProbs
288 // HintEntry, kMaxTldKey kMaxTldVector, etc.
289 // =============================================================================
290
291 #include "compact_enc_det/compact_enc_det_generated_tables.h"
292
293
// Aliases for ranked-encoding names.

// "ASCII" is a misnomer, so this code uses "Latin1".
#define F_ASCII F_Latin1

// The remaining aliases exist because we are mid-update for a name change.
#define F_BINARY F_X_BINARYENC
#define F_UTF8UTF8 F_X_UTF8UTF8
#define F_BIG5_CP950 F_BIG5
#define F_Unicode F_UTF_16LE
300 // =============================================================================
301
302 // 7-bit encodings have at least one "interesting" byte value < 0x80
303 // (00 0E 1B + ~)
304 // JIS 2022-cn 2022-kr hz utf7
305 // Unicode UTF-16 UTF-32
306 // 8-bit encodings have no interesting byte values < 0x80
307 static const uint32 kSevenBitActive = 0x00000001; // needs <80 to detect
308 static const uint32 kUTF7Active = 0x00000002; // <80 and +
309 static const uint32 kHzActive = 0x00000004; // <80 and ~
310 static const uint32 kIso2022Active = 0x00000008; // <80 and 1B 0E 0F
311 static const uint32 kUTF8Active = 0x00000010;
312 static const uint32 kUTF8UTF8Active = 0x00000020;
313 static const uint32 kUTF1632Active = 0x00000040; // <80 and 00
314 static const uint32 kBinaryActive = 0x00000080; // <80 and 00
315 static const uint32 kTwobyteCode = 0x00000100; // Needs 8xxx
316 static const uint32 kIsIndicCode = 0x00000200; //
317 static const uint32 kHighAlphaCode = 0x00000400; // full alphabet in 8x-Fx
318 static const uint32 kHighAccentCode = 0x00000800; // accents in 8x-Fx
319 static const uint32 kEUCJPActive = 0x00001000; // Have to mess with phase
320
321
// Usage counters, one per major section. Debug only; these are plain
// file-scope ints and are not thread safe.
static int encdet_used = 0;
static int rescore_used = 0;
static int rescan_used = 0;
static int robust_used = 0;
static int looking_used = 0;
static int doing_used = 0;
329
330
331 // For debugging only -- about 256B/entry times about 500 = 128KB
332 // TODO: only allocate this if being used
333 typedef struct {
334 int offset;
335 int best_enc; // Best ranked encoding for this bigram, or
336 // -1 for overhead entries
337 string label;
338 int detail_enc_prob[NUM_RANKEDENCODING];
339 } DetailEntry;
340
// Watched ranked encodings; -1 means nothing is being watched.
// Debug only, not threadsafe.
static int watch1_rankedenc = -1;
static int watch2_rankedenc = -1;
// Former file-scope debug storage, kept for reference (not threadsafe):
////static int next_detail_entry = 0; // Debug. not threadsafe
////static DetailEntry details[kMaxPairs * 10]; // Allow 10 details per bigram
// End For debugging only
346
// The two sets of interesting pairs.
// Values must match the kTestPrintableAsciiTildePlus exit codes, minus one.
enum PairSet {
  AsciiPair = 0,
  OtherPair = 1,
  NUM_PAIR_SETS = 2
};
349
// The reasons for pruning.
enum PruneReason {
  PRUNE_NORMAL,
  PRUNE_SLOWEND,
  PRUNE_FINAL
};
352
// Printable names for the pair sets, in the order "Ascii", "Other".
// The element pointers are const as well as the characters, so the table
// cannot be modified after static initialization.
static const char* const kWhatSetName[] = {"Ascii", "Other"};
354
355
// State for encodings that do shift-out/shift-in between one- and two-byte
// regions (ISO-2022-xx, HZ).
enum StateSoSi {
  SOSI_NONE,
  SOSI_ERROR,
  SOSI_ONEBYTE,
  SOSI_TWOBYTE
};
359
360 typedef struct {
361 const uint8* initial_src; // For calculating byte offsets
362 const uint8* limit_src; // Range of input source
363 const uint8* prior_src; // Source consumed by prior call to BoostPrune
364 const uint8* last_pair; // Last pair inserted into interesting_pairs
365
366 DetailEntry* debug_data; // Normally NULL. Ptr to debug data for
367 // FLAGS_enc_detect_detail PostScript data
368 int next_detail_entry; // Debug
369
370 bool done;
371 bool reliable;
372 bool hints_derated;
373 int declared_enc_1; // From http/meta hint
374 int declared_enc_2; // from http/meta hint
375 int prune_count; // Number of times we have pruned
376
377 int trigram_highwater_mark; // Byte offset of last trigram processing
378 bool looking_for_latin_trigrams; // True if we should test for doing
379 // Latin1/2/7 trigram processing
380 bool do_latin_trigrams; // True if we actually are scoring trigrams
381
382 // Miscellaneous state variables for difficult encodings
383 int binary_quadrants_count; // Number of four bigram quadrants seen:
384 // 0xxxxxxx0xxxxxxx 0xxxxxxx1xxxxxx
385 // 1xxxxxxx0xxxxxxx 1xxxxxxx1xxxxxx
386 int binary_8x4_count; // Number of 8x4 buckets seen:
387 uint32 binary_quadrants_seen; // Bit[i] set if bigram i.......i....... seen
388 uint32 binary_8x4_seen; // Bit[i] set if bigram iii.....ii...... seen
389 int utf7_starts; // Count of possible UTF-7 beginnings seen
390 int prior_utf7_offset; // Source consumed by prior UTF-7 string
391 int next_utf8_ministate; // Mini state for UTF-8 sequences
392 int utf8_minicount[6]; // Number of correct 2- 3- 4-byte seq, errors
393 int next_utf8utf8_ministate; // Mini state for UTF8UTF8 sequences
394 int utf8utf8_odd_byte; // UTF8UTF8 seq has odd number of bytes
395 int utf8utf8_minicount[6]; // Number of correct 2- 3- 4-byte seq, errors
396 StateSoSi next_2022_state; // Mini state for 2022 sequences
397 StateSoSi next_hz_state; // Mini state for HZ sequences
398 bool next_eucjp_oddphase; // Mini state for EUC-JP sequences
399 int byte32_count[8]; // Count of top 3 bits of byte1 of bigram
400 // 0x1x 2x3x 4x5x 6x7x 8x9x AxBx CxDx ExFx
401 uint32 active_special; // Bits showing which special cases are active
402
403 Encoding tld_hint; // Top TLD encoding or UNKNOWN
404 Encoding http_hint; // What the document says about itself or
405 Encoding meta_hint; // UNKNOWN_ENCODING. BOM is initial byte
406 Encoding bom_hint; // order mark for UTF-xx
407
408 // small cache of previous interesting bigrams
409 int next_prior_bigram;
410 int prior_bigram[4];
411 int prior_binary[1];
412
413 int top_rankedencoding; // Top two probabilities and families
414 int second_top_rankedencoding;
415 int top_prob;
416 int second_top_prob;
417 int prune_difference; // Prune things this much below the top prob
418 int rankedencoding_list_len; // Number of active encodings
419 int rankedencoding_list[NUM_RANKEDENCODING]; // List of active encodings
420 //
421 int enc_prob[NUM_RANKEDENCODING]; // Cumulative probability per enc
422 // This is where all the action is
423 int hint_prob[NUM_RANKEDENCODING]; // Initial hint probabilities
424 int hint_weight[NUM_RANKEDENCODING]; // Number of hints for this enc
425
426 // Two sets -- one for printable ASCII, one for the rest
427 int prior_interesting_pair[NUM_PAIR_SETS]; // Pairs consumed by prior call
428 int next_interesting_pair[NUM_PAIR_SETS]; // Next pair to write
429 char interesting_pairs[NUM_PAIR_SETS][kMaxPairs * 2]; // Two bytes per pair
430 int interesting_offsets[NUM_PAIR_SETS][kMaxPairs]; // Src offset of pair
431 int interesting_weightshift[NUM_PAIR_SETS][kMaxPairs]; // weightshift of pair
432 } DetectEncodingState;
433
434
435 // Record a debug event that changes probabilities
SetDetailsEncProb(DetectEncodingState * destatep,int offset,int best_enc,const char * label)436 void SetDetailsEncProb(DetectEncodingState* destatep,
437 int offset, int best_enc, const char* label) {
438 int next = destatep->next_detail_entry;
439 destatep->debug_data[next].offset = offset;
440 destatep->debug_data[next].best_enc = best_enc;
441 destatep->debug_data[next].label = label;
442 memcpy(&destatep->debug_data[next].detail_enc_prob,
443 &destatep->enc_prob,
444 sizeof(destatep->enc_prob));
445 ++destatep->next_detail_entry;
446 }
447
448 // Record a debug event that changes probabilities, copy offset
SetDetailsEncProbCopyOffset(DetectEncodingState * destatep,int best_enc,const char * label)449 void SetDetailsEncProbCopyOffset(DetectEncodingState* destatep,
450 int best_enc, const char* label) {
451 int next = destatep->next_detail_entry;
452 destatep->debug_data[next].offset = destatep->debug_data[next - 1].offset;
453 destatep->debug_data[next].best_enc = best_enc;
454 destatep->debug_data[next].label = label;
455 memcpy(&destatep->debug_data[next].detail_enc_prob,
456 &destatep->enc_prob,
457 sizeof(destatep->enc_prob));
458 ++destatep->next_detail_entry;
459 }
460
461 // Record a debug event that changes probs and has simple text label
SetDetailsEncLabel(DetectEncodingState * destatep,const char * label)462 void SetDetailsEncLabel(DetectEncodingState* destatep, const char* label) {
463 int next = destatep->next_detail_entry;
464 destatep->debug_data[next].offset = destatep->debug_data[next - 1].offset;
465 destatep->debug_data[next].best_enc = -1;
466 destatep->debug_data[next].label = label;
467 memcpy(&destatep->debug_data[next].detail_enc_prob,
468 &destatep->enc_prob,
469 sizeof(destatep->enc_prob));
470 ++destatep->next_detail_entry;
471 }
472
473 // Record a debug event that is just a text label, no change in probs
SetDetailsLabel(DetectEncodingState * destatep,const char * label)474 void SetDetailsLabel(DetectEncodingState* destatep, const char* label) {
475 int next = destatep->next_detail_entry;
476 destatep->debug_data[next].offset = destatep->debug_data[next - 1].offset;
477 destatep->debug_data[next].best_enc = -1;
478 destatep->debug_data[next].label = label;
479 memcpy(&destatep->debug_data[next].detail_enc_prob,
480 &destatep->debug_data[next - 1].detail_enc_prob,
481 sizeof(destatep->enc_prob));
482 ++destatep->next_detail_entry;
483 }
484
485
486 // Maps superset encodings to base, to see if 2 encodings are compatible
487 // (Non-identity mappings are marked "-->" below.)
488 static const Encoding kMapEncToBaseEncoding[] = {
489 ISO_8859_1, // 0: Teragram ASCII
490 ISO_8859_2, // 1: Teragram Latin2
491 ISO_8859_3, // 2: in BasisTech but not in Teragram
492 ISO_8859_4, // 3: Teragram Latin4
493 ISO_8859_5, // 4: Teragram ISO-8859-5
494 ISO_8859_6, // 5: Teragram Arabic
495 ISO_8859_7, // 6: Teragram Greek
496 MSFT_CP1255, // 7: Teragram Hebrew --> 36
497 ISO_8859_9, // 8: in BasisTech but not in Teragram
498 ISO_8859_10, // 9: in BasisTech but not in Teragram
499 JAPANESE_EUC_JP, // 10: Teragram EUC_JP
500 JAPANESE_SHIFT_JIS, // 11: Teragram SJS
501 JAPANESE_JIS, // 12: Teragram JIS
502 CHINESE_BIG5, // 13: Teragram BIG5
503 CHINESE_GB, // 14: Teragram GB
504 CHINESE_EUC_CN, // 15: Teragram EUC-CN
505 KOREAN_EUC_KR, // 16: Teragram KSC
506 UNICODE, // 17: Teragram Unicode
507 CHINESE_EUC_CN, // 18: Teragram EUC --> 15
508 CHINESE_EUC_CN, // 19: Teragram CNS --> 15
509 CHINESE_BIG5, // 20: Teragram BIG5_CP950 --> 13
510 JAPANESE_SHIFT_JIS, // 21: Teragram CP932 --> 11
511 UTF8, // 22
512 UNKNOWN_ENCODING, // 23
513 ISO_8859_1, // 24: ISO_8859_1 with all characters <= 127 --> 0
514 RUSSIAN_KOI8_R, // 25: Teragram KOI8R
515 RUSSIAN_CP1251, // 26: Teragram CP1251
516 ISO_8859_1, // 27: CP1252 aka MSFT euro ascii --> 0
517 RUSSIAN_KOI8_RU, // 28: CP21866 aka KOI8_RU, used for Ukrainian
518 MSFT_CP1250, // 29: CP1250 aka MSFT eastern european
519 ISO_8859_1, // 30: aka ISO_8859_0 aka ISO_8859_1 euroized --> 0
520 ISO_8859_9, // 31: used for Turkish
521 ISO_8859_13, // 32: used in Baltic countries --> 43
522 ISO_8859_11, // 33: aka TIS-620, used for Thai
523 ISO_8859_11, // 34: used for Thai --> 33
524 MSFT_CP1256, // 35: used for Arabic
525 MSFT_CP1255, // 36: Logical Hebrew Microsoft
526 MSFT_CP1255, // 37: Iso Hebrew Logical --> 36
527 MSFT_CP1255, // 38: Iso Hebrew Visual --> 36
528 CZECH_CP852, // 39
529 ISO_8859_2, // 40: aka ISO_IR_139 aka KOI8_CS --> 1
530 MSFT_CP1253, // 41: used for Greek, but NOT a superset of 8859-7
531 RUSSIAN_CP866, // 42
532 ISO_8859_13, // 43
533 ISO_2022_KR, // 44
534 CHINESE_GB, // 45 GBK --> 14
535 CHINESE_GB, // 46 GB18030 --> 14
536 CHINESE_BIG5, // 47 BIG5_HKSCS --> 13
537 ISO_2022_KR, // 48 ISO_2022_CN --> 44
538 TSCII, // 49 Indic encoding
539 TAMIL_MONO, // 50 Indic encoding - Tamil
540 TAMIL_BI, // 51 Indic encoding - Tamil
541 JAGRAN, // 52 Indic encoding - Devanagari
542 MACINTOSH_ROMAN, // 53
543 UTF7, // 54
544 BHASKAR, // 55 Indic encoding - Devanagari
545 HTCHANAKYA, // 56 Indic encoding - Devanagari
546 UTF16BE, // 57
547 UTF16LE, // 58
548 UTF32BE, // 59
549 UTF32LE, // 60
550 BINARYENC, // 61
551 HZ_GB_2312, // 62
552 UTF8UTF8, // 63
553 TAM_ELANGO, // 64 Elango - Tamil
554 TAM_LTTMBARANI, // 65 Barani - Tamil
555 TAM_SHREE, // 66 Shree - Tamil
556 TAM_TBOOMIS, // 67 TBoomis - Tamil
557 TAM_TMNEWS, // 68 TMNews - Tamil
558 TAM_WEBTAMIL, // 69 Webtamil - Tamil
559 KDDI_SHIFT_JIS, // 70 KDDI Shift_JIS
560 DOCOMO_SHIFT_JIS, // 71 DoCoMo Shift_JIS
561 SOFTBANK_SHIFT_JIS, // 72 SoftBank Shift_JIS
562 KDDI_ISO_2022_JP, // 73 KDDI ISO-2022-JP
563 SOFTBANK_ISO_2022_JP, // 74 SOFTBANK ISO-2022-JP
564 };
565
566 COMPILE_ASSERT(arraysize(kMapEncToBaseEncoding) == NUM_ENCODINGS,
567 kMapEncToBaseEncoding_has_incorrect_size);
568
// Maps base encodings to 0, supersets to 1+, undesired to -1.
// Subscripted by Encoding enum value; the leading comment on each row is that
// subscript. "-->" gives the subscript of the related base encoding, as in
// kMapEncToBaseEncoding.
static const int kMapEncToSuperLevel[] = {
  /*  0 */  0,  // Teragram ASCII
  /*  1 */  0,  // Teragram Latin2
  /*  2 */  0,  // in BasisTech but not in Teragram
  /*  3 */  0,  // Teragram Latin4
  /*  4 */  0,  // Teragram ISO-8859-5
  /*  5 */  0,  // Teragram Arabic
  /*  6 */  0,  // Teragram Greek
  /*  7 */  0,  // Teragram Hebrew
  /*  8 */  0,  // in BasisTech but not in Teragram
  /*  9 */  0,  // in BasisTech but not in Teragram
  /* 10 */  0,  // Teragram EUC_JP
  /* 11 */  0,  // Teragram SJS
  /* 12 */  0,  // Teragram JIS
  /* 13 */  0,  // Teragram BIG5
  /* 14 */  0,  // Teragram GB
  /* 15 */  0,  // Teragram EUC-CN
  /* 16 */  0,  // Teragram KSC
  /* 17 */  0,  // Teragram Unicode
  /* 18 */ -1,  // Teragram EUC --> 15
  /* 19 */ -1,  // Teragram CNS --> 15
  /* 20 */  1,  // Teragram BIG5_CP950 --> 13
  /* 21 */  1,  // Teragram CP932 --> 11
  /* 22 */  0,
  /* 23 */ -1,
  /* 24 */ -1,  // ISO_8859_1 with all characters <= 127 --> 0
  /* 25 */  0,  // Teragram KOI8R
  /* 26 */  0,  // Teragram CP1251
  /* 27 */  1,  // CP1252 aka MSFT euro ascii --> 0
  /* 28 */  0,  // CP21866 aka KOI8_RU, used for Ukrainian
  /* 29 */  0,  // CP1250 aka MSFT eastern european
  /* 30 */  1,  // aka ISO_8859_0 aka ISO_8859_1 euroized --> 0
  /* 31 */  0,  // used for Turkish
  /* 32 */  1,  // used in Baltic countries --> 43
  /* 33 */  0,  // aka TIS-620, used for Thai
  /* 34 */  1,  // used for Thai --> 33
  /* 35 */  0,  // used for Arabic
  /* 36 */  0,  // Logical Hebrew Microsoft
  /* 37 */ -1,  // Iso Hebrew Logical --> 36
  /* 38 */ -1,  // Iso Hebrew Visual --> 36
  /* 39 */  0,
  /* 40 */  1,  // aka ISO_IR_139 aka KOI8_CS --> 1
  /* 41 */  0,  // used for Greek, NOT superset of 8859-7
  /* 42 */  0,
  /* 43 */  0,
  /* 44 */  0,
  /* 45 */  1,  // GBK --> 14
  /* 46 */  1,  // GB18030 --> 14
  /* 47 */  1,  // BIG5_HKSCS --> 13
  /* 48 */  1,  // ISO_2022_CN --> 44
  /* 49 */  0,  // Indic encoding
  /* 50 */  0,  // Indic encoding - Tamil
  /* 51 */  0,  // Indic encoding - Tamil
  /* 52 */  0,  // Indic encoding - Devanagari
  /* 53 */  0,
  /* 54 */  0,
  /* 55 */  0,  // Indic encoding - Devanagari
  /* 56 */  0,  // Indic encoding - Devanagari
  /* 57 */  0,
  /* 58 */  0,
  /* 59 */  0,
  /* 60 */  0,
  /* 61 */  0,
  /* 62 */  0,
  /* 63 */  2,
  // 64..69: six more Tamil encodings
  0, 0, 0, 0, 0, 0,
  // 70..74: five encodings with emoji
  0, 0, 0, 0, 0,
};
639
640 COMPILE_ASSERT(arraysize(kMapEncToSuperLevel) == NUM_ENCODINGS,
641 kMapEncToSuperLevel_has_incorrect_size);
642
643
644
645 // Subscripted by Encoding enum value
646 static const uint32 kSpecialMask[] = {
647 kHighAccentCode, // 0
648 kHighAccentCode,
649 kHighAccentCode,
650 kHighAccentCode,
651 kHighAlphaCode, // 4
652 kHighAlphaCode,
653 kHighAlphaCode,
654 kHighAlphaCode,
655 kHighAccentCode,
656 kHighAccentCode,
657
658 kTwobyteCode + kEUCJPActive, // 10 euc-jp
659 kTwobyteCode,
660 kSevenBitActive + kIso2022Active, // jis
661 kTwobyteCode,
662 kTwobyteCode,
663 kTwobyteCode,
664 kTwobyteCode,
665 kSevenBitActive + kUTF1632Active, // Unicode
666 kTwobyteCode,
667 kTwobyteCode,
668
669 kTwobyteCode, // 20
670 kTwobyteCode,
671 kUTF8Active, // UTF-8
672 0,
673 0,
674 kHighAlphaCode, // 25
675 kHighAlphaCode,
676 kHighAccentCode,
677 kHighAlphaCode,
678 kHighAccentCode,
679
680 kHighAccentCode, // 30
681 kHighAccentCode,
682 kHighAccentCode,
683 kHighAlphaCode,
684 kHighAlphaCode,
685 kHighAlphaCode, // 35
686 kHighAlphaCode,
687 kHighAlphaCode,
688 kHighAlphaCode,
689 0,
690
691 0, // 40
692 kHighAlphaCode,
693 kHighAlphaCode,
694 kHighAccentCode,
695 kSevenBitActive + kIso2022Active, // 2022-kr
696 kTwobyteCode,
697 kTwobyteCode,
698 kTwobyteCode,
699 kSevenBitActive + kIso2022Active, // 2022-cn
700 kHighAlphaCode + kIsIndicCode, // 49 TSCII
701
702 kHighAlphaCode + kIsIndicCode, // 50 TAMIL_MONO
703 kHighAlphaCode + kIsIndicCode, // 51 TAMIL_BI
704 kHighAlphaCode + kIsIndicCode, // 52 JAGRAN
705 kHighAccentCode, // 53 MACINTOSH_ROMAN
706 kSevenBitActive + kUTF7Active, // 54 UTF-7
707 kHighAlphaCode + kIsIndicCode, // 55 BHASKAR Indic encoding - Devanagari
708 kHighAlphaCode + kIsIndicCode, // 56 HTCHANAKYA Indic encoding - Devanagari
709 kSevenBitActive + kUTF1632Active, // 57 UTF16BE
710 kSevenBitActive + kUTF1632Active, // 58 UTF16LE
711 kSevenBitActive + kUTF1632Active, // 59 UTF32BE
712 kSevenBitActive + kUTF1632Active, // 60 UTF32LE
713
714 kSevenBitActive + kBinaryActive, // 61 BINARYENC
715 kSevenBitActive + kHzActive, // 62 HZ_GB_2312
716 kHighAccentCode + kUTF8Active + kUTF8UTF8Active, // 63 UTF8UTF8
717 kHighAlphaCode + kIsIndicCode, // 64 Elango - Tamil
718 kHighAlphaCode + kIsIndicCode, // 65 Barani - Tamil
719 kHighAlphaCode + kIsIndicCode, // 66 Shree - Tamil
720 kHighAlphaCode + kIsIndicCode, // 67 TBoomis - Tamil
721 kHighAlphaCode + kIsIndicCode, // 68 TMNews - Tamil
722 kHighAlphaCode + kIsIndicCode, // 69 Webtamil - Tamil
723 kTwobyteCode, // 70 KDDI Shift_JIS
724 kTwobyteCode, // 71 DoCoMo Shift_JIS
725 kTwobyteCode, // 72 SoftBank Shift_JIS
726 kSevenBitActive + kIso2022Active, // 73 KDDI-ISO-2022-JP
727 kSevenBitActive + kIso2022Active, // 74 SOFTBANK-ISO-2022-JP
728 };
729
730 COMPILE_ASSERT(arraysize(kSpecialMask) == NUM_ENCODINGS,
731 kSpecialMask_has_incorrect_size);
732
733
734 /***
735 kHighAlphaCode -- full alphabet in 8x-Fx range, not just accents
736
737 ISO_8859_5, // 4: Teragram ISO-8859-5 Cyrl UL bd
738 RUSSIAN_CP1251, // 26: Teragram CP1251 UL cdef
739 RUSSIAN_KOI8_R, // 25: Teragram KOI8R LU cdef
740 RUSSIAN_KOI8_RU, // 28: CP21866 aka KOI8_RU, LU cdef
741 RUSSIAN_CP866, // 42 89ae
742
743 ISO_8859_6, // 5: Teragram Arabic nocase cde
744 MSFT_CP1256, // 35: used for Arabic nocase cde
745
746 ISO_8859_7, // 6: Teragram Greek UL cdef
747 MSFT_CP1253, // 41: used for Greek UL cdef
748
749 ISO_8859_8, // 7: Teragram Hebrew nocase ef
750 MSFT_CP1255, // 36: Logical Hebrew Microsoft nocase ef
751 ISO_8859_8_I, // 37: Iso Hebrew Logical nocase ef
752 HEBREW_VISUAL, // 38: Iso Hebrew Visual nocase ef
753
754 ISO_8859_11, // 33: aka TIS-620, used for Thai nocase abcde
755 MSFT_CP874, // 34: used for Thai nocase abcde
756
757 TSCII, // 49 8-f
758 TAMIL_MONO, // 50
759 TAMIL_BI, // 51
760 JAGRAN, // 52
761 BHASKAR, // 55 Indic encoding - Devanagari
762 HTCHANAKYA, // 56 Indic encoding - Devanagari
763 ***/
764
// Byte classification used by the slow scan, stopping on NUL ESC SO SI,
// bad C0, and + ~.
// We can scan bytes using this at about 500 MB/sec 2.8GHz P4.
// We allow FF, 0x0C, here because it gives a better result for old
// Ascii text formatted for a TTY.
// A non-zero entry exits the scan loop -- 1 for printable ASCII ('+' and
// '~'), 2 otherwise. Sixteen entries per row; the row comment is the first
// byte value of that row.
static const char kTestPrintableAsciiTildePlus[256] = {
  2,2,2,2,2,2,2,2, 2,0,0,2,0,0,2,2,  // 0x00  (TAB LF FF CR are 0)
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // 0x10
  0,0,0,0,0,0,0,0, 0,0,0,1,0,0,0,0,  // 0x20  ('+' is 1)
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x30
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x40
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x50
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x60
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,1,2,  // 0x70  ('~' is 1, DEL is 2)

  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // 0x80
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // 0x90
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // 0xA0
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // 0xB0
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // 0xC0
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // 0xD0
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // 0xE0
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // 0xF0
};
781
// Byte classification used by the slow scan after Hz and UTF7 are pruned
// away, stopping on NUL ESC SO SI and bad C0.
// We can scan bytes using this at about 550 MB/sec 2.8GHz P4.
// We allow Form Feed, 0x0C, here.
// Sixteen entries per row; the row comment is the first byte value of that
// row.
static const char kTestPrintableAscii[256] = {
  2,2,2,2,2,2,2,2, 2,0,0,2,0,0,2,2,  // 0x00  (TAB LF FF CR are 0)
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // 0x10
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x20
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x30
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x40
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x50
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x60
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,2,  // 0x70  (DEL is 2)

  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // 0x80
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // 0x90
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // 0xA0
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // 0xB0
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // 0xC0
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // 0xD0
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // 0xE0
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,  // 0xF0
};
797
// Used in first-four-byte testing
// Subscripted by byte value: 1 for HT, LF, CR and 0x20..0x7E, 0 for every
// other byte.  Unlike the two scan tables above, Form Feed (0x0C) is 0 here.
static const char kIsPrintableAscii[256] = {
  0,0,0,0,0,0,0,0, 0,1,1,0,0,1,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,0,

  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
};
810
811
// Base64 digit values, subscripted by byte value:
//   'A'..'Z' -> 0..25, 'a'..'z' -> 26..51, '0'..'9' -> 52..61,
//   '+' -> 62, '/' -> 63, and -1 for every other byte (including '=').
static const signed char kBase64Value[256] = {
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,62,-1,-1,-1,63,
  52,53,54,55,56,57,58,59, 60,61,-1,-1,-1,-1,-1,-1,

  -1, 0, 1, 2, 3, 4, 5, 6,  7, 8, 9,10,11,12,13,14,
  15,16,17,18,19,20,21,22, 23,24,25,-1,-1,-1,-1,-1,
  -1,26,27,28,29,30,31,32, 33,34,35,36,37,38,39,40,
  41,42,43,44,45,46,47,48, 49,50,51,-1,-1,-1,-1,-1,

  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,

  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1, -1,-1,-1,-1,-1,-1,-1,-1,
};
833
834
835 // Subscripted by <state, byte/16>
836 // Accepts Cx->8x Dx->8x Ex->8x->8x Fx->8x->8x->8x
837 //
838 // Fixed Problem: GB has sequences like B2DB B8D6 BDE1 B9B9
839 // which we can mis-parse as an error byte followed by good UTF-8:
840 // B2 DBB8 D6BD E1B9B9
841 // To counteract this, we now require an ASCII7 byte to resync out
842 // of the error state
843 // Next problem: good UTF-8 with bad byte
844 // efbc a012 eea4 bee7 b280 c2b7
845 // efbca0 12 eea4be e7b280 c2b7
846 // ^^ bad byte
847 // fix: change state0 byte 1x to be don't-care
848 //
849 // Short UTF-8 ending in ASCII7 byte should resync immediately:
850 // E0 20 E0 A6 AA should give one error and resync at 2nd E0
851 //
// Next-state table: row is the current state, column is the high nibble of
// the incoming byte (0x..7x are ASCII7, 8x..Bx are continuation bytes,
// Cx/Dx start 2-byte, Ex starts 3-byte, Fx starts 4-byte sequences).
// State 7 is the error state.
static const char kMiniUTF8State[8][16] = {
  {0,0,0,0,0,0,0,0, 7,7,7,7,1,1,2,4,},  // [0] start char (allow cr/lf/ht)
  {0,7,0,0,0,0,0,0, 0,0,0,0,7,7,7,7,},  // [1] continue 1 of 2
  {0,7,0,0,0,0,0,0, 3,3,3,3,7,7,7,7,},  // [2] continue 1 of 3
  {0,7,0,0,0,0,0,0, 0,0,0,0,7,7,7,7,},  // [3] continue 2 of 3
  {0,7,0,0,0,0,0,0, 5,5,5,5,7,7,7,7,},  // [4] continue 1 of 4
  {0,7,0,0,0,0,0,0, 6,6,6,6,7,7,7,7,},  // [5] continue 2 of 4
  {0,7,0,0,0,0,0,0, 0,0,0,0,7,7,7,7,},  // [6] continue 3 of 4
  {0,7,0,0,0,0,0,0, 7,7,7,7,7,7,7,7,},  // [7] error, soak up continues,
                                        //     ONLY resync after Ascii char
                                        //     then restart
};
// Companion to kMiniUTF8State, subscripted the same way
// (row = current state, column = high nibble of the incoming byte).
// Counter to increment: 0-don'tcare 1-error 2-good_2B 3-good_3B 4-good_4B
// A good_nB count is given on the final continuation byte of a sequence.
static const char kMiniUTF8Count[8][16] = {
  {0,0,0,0,0,0,0,0, 1,1,1,1,0,0,0,0,},  // [0] start char (allow cr/lf/ht)
  {1,1,1,1,1,1,1,1, 2,2,2,2,1,1,1,1,},  // [1] continue 1 of 2
  {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,},  // [2] continue 1 of 3
  {1,1,1,1,1,1,1,1, 3,3,3,3,1,1,1,1,},  // [3] continue 2 of 3
  {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,},  // [4] continue 1 of 4
  {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,},  // [5] continue 2 of 4
  {1,1,1,1,1,1,1,1, 4,4,4,4,1,1,1,1,},  // [6] continue 3 of 4
  {0,1,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,},  // [7] error, soak up continues,
                                        //     then restart
};
876
877 // Subscripted by <state, f(byte1) + g(byte2)>
// where f(x)= E2->4, C2/C5/C6/CB->8 and C3->12 and 0 otherwise
879 // and g(x) = (x >> 4) & 3 8x->0 9x->1 Ax->2 Bx->3 Cx->0, etc.
880 // (no checking for illegal bytes)
881 // Here are example patterns of CP1252 converted to UTF-8 0/1/2 times. We want
882 // to detect two, so we can back-convert to one.
883 // zero one two pattern
884 // ---- ------ ---------------- -----------------
885 // 81 C281 C382C281 C3->8x->C2->xx
886 // 98 CB9C C38BC593 C3->8x->C5->xx
887 // C3 C383 C383C692 C3->8x->C6->xx
888 // C8 C388 C383CB86 C3->8x->CB->xx
889 // 83 C692 C386E28099 C3->8x->E2->xx->8x
890 // 80 E282AC C3A2E2809AC2AC C3->A2->E2->xx->xx->Cx->xx
891 // 92 E28099 C3A2E282ACE284A2 C3->A2->E2->xx->xx->E2->xx->xx
892 //
893 // We also want to detect bare-byte extra UTF-8 conversions:
894 // zero one two pattern
895 // ---- ------ ---------------- -----------------
896 // C3 C3 C383 C3->8x->C2->xx
897 // D3 D3 C393 C3->9x->C2->xx->C2->xx
898 // E3 E3 C3A3 C3->Ax->C2->xx->C2->xx->C2->xx
// F3 F3 C3B3 C3->Bx->C2->xx->C2->xx->C2->xx->C2->xx
900 //
901
902 /**
903 CP1252 => UTF8 => UTF8UTF8
904 80 => E282AC => C3A2E2809AC2AC
905 81 => C281 => C382C281
906 82 => E2809A => C3A2E282ACC5A1
907 83 => C692 => C386E28099
908 84 => E2809E => C3A2E282ACC5BE
909 85 => E280A6 => C3A2E282ACC2A6
910 86 => E280A0 => C3A2E282ACC2A0
911 87 => E280A1 => C3A2E282ACC2A1
912 88 => CB86 => C38BE280A0
913 89 => E280B0 => C3A2E282ACC2B0
914 8A => C5A0 => C385C2A0
915 8B => E280B9 => C3A2E282ACC2B9
916 8C => C592 => C385E28099
917 8D => C28D => C382C28D
918 8E => C5BD => C385C2BD
919 8F => C28F => C382C28F
920 90 => C290 => C382C290
921 91 => E28098 => C3A2E282ACCB9C
922 92 => E28099 => C3A2E282ACE284A2
923 93 => E2809C => C3A2E282ACC593
924 94 => E2809D => C3A2E282ACC29D
925 95 => E280A2 => C3A2E282ACC2A2
926 96 => E28093 => C3A2E282ACE2809C
927 97 => E28094 => C3A2E282ACE2809D
928 98 => CB9C => C38BC593
929 99 => E284A2 => C3A2E2809EC2A2
930 9A => C5A1 => C385C2A1
931 9B => E280BA => C3A2E282ACC2BA
932 9C => C593 => C385E2809C
933 9D => C29D => C382C29D
934 9E => C5BE => C385C2BE
935 9F => C5B8 => C385C2B8
936 A0 => C2A0 => C382C2A0
937 A1 => C2A1 => C382C2A1
938 A2 => C2A2 => C382C2A2
939 A3 => C2A3 => C382C2A3
940 A4 => C2A4 => C382C2A4
941 A5 => C2A5 => C382C2A5
942 A6 => C2A6 => C382C2A6
943 A7 => C2A7 => C382C2A7
944 A8 => C2A8 => C382C2A8
945 A9 => C2A9 => C382C2A9
946 AA => C2AA => C382C2AA
947 AB => C2AB => C382C2AB
948 AC => C2AC => C382C2AC
949 AD => C2AD => C382C2AD
950 AE => C2AE => C382C2AE
951 AF => C2AF => C382C2AF
952 B0 => C2B0 => C382C2B0
953 B1 => C2B1 => C382C2B1
954 B2 => C2B2 => C382C2B2
955 B3 => C2B3 => C382C2B3
956 B4 => C2B4 => C382C2B4
957 B5 => C2B5 => C382C2B5
958 B6 => C2B6 => C382C2B6
959 B7 => C2B7 => C382C2B7
960 B8 => C2B8 => C382C2B8
961 B9 => C2B9 => C382C2B9
962 BA => C2BA => C382C2BA
963 BB => C2BB => C382C2BB
964 BC => C2BC => C382C2BC
965 BD => C2BD => C382C2BD
966 BE => C2BE => C382C2BE
967 BF => C2BF => C382C2BF
968 C0 => C380 => C383E282AC
969 C1 => C381 => C383C281
970 C2 => C382 => C383E2809A
971 C3 => C383 => C383C692
972 C4 => C384 => C383E2809E
973 C5 => C385 => C383E280A6
974 C6 => C386 => C383E280A0
975 C7 => C387 => C383E280A1
976 C8 => C388 => C383CB86
977 C9 => C389 => C383E280B0
978 CA => C38A => C383C5A0
979 CB => C38B => C383E280B9
980 CC => C38C => C383C592
981 CD => C38D => C383C28D
982 CE => C38E => C383C5BD
983 CF => C38F => C383C28F
984 D0 => C390 => C383C290
985 D1 => C391 => C383E28098
986 D2 => C392 => C383E28099
987 D3 => C393 => C383E2809C
988 D4 => C394 => C383E2809D
989 D5 => C395 => C383E280A2
990 D6 => C396 => C383E28093
991 D7 => C397 => C383E28094
992 D8 => C398 => C383CB9C
993 D9 => C399 => C383E284A2
994 DA => C39A => C383C5A1
995 DB => C39B => C383E280BA
996 DC => C39C => C383C593
997 DD => C39D => C383C29D
998 DE => C39E => C383C5BE
999 DF => C39F => C383C5B8
1000 E0 => C3A0 => C383C2A0
1001 E1 => C3A1 => C383C2A1
1002 E2 => C3A2 => C383C2A2
1003 E3 => C3A3 => C383C2A3
1004 E4 => C3A4 => C383C2A4
1005 E5 => C3A5 => C383C2A5
1006 E6 => C3A6 => C383C2A6
1007 E7 => C3A7 => C383C2A7
1008 E8 => C3A8 => C383C2A8
1009 E9 => C3A9 => C383C2A9
1010 EA => C3AA => C383C2AA
1011 EB => C3AB => C383C2AB
1012 EC => C3AC => C383C2AC
1013 ED => C3AD => C383C2AD
1014 EE => C3AE => C383C2AE
1015 EF => C3AF => C383C2AF
1016 F0 => C3B0 => C383C2B0
1017 F1 => C3B1 => C383C2B1
1018 F2 => C3B2 => C383C2B2
1019 F3 => C3B3 => C383C2B3
1020 F4 => C3B4 => C383C2B4
1021 F5 => C3B5 => C383C2B5
1022 F6 => C3B6 => C383C2B6
1023 F7 => C3B7 => C383C2B7
1024 F8 => C3B8 => C383C2B8
1025 F9 => C3B9 => C383C2B9
1026 FA => C3BA => C383C2BA
1027 FB => C3BB => C383C2BB
1028 FC => C3BC => C383C2BC
1029 FD => C3BD => C383C2BD
1030 FE => C3BE => C383C2BE
1031 FF => C3BF => C383C2BF
1032 **/
1033
1034 // Subscripted by <state, f(byte1) + g(byte2)>
1035 // where f(x)= E2->4, C2/5/6/B->8 and C3->12 and 0 otherwise
1036 // and g(x) = (x >> 4) & 3 8x->0 9x->1 Ax->2 Bx->3 Cx->0, etc.
1037
1038 // 81 C281 C382C281 C3->8x->C2->xx
1039 // 98 CB9C C38BC593 C3->8x->C5->xx
1040 // C3 C383 C383C692 C3->8x->C6->xx
1041 // C8 C388 C383CB86 C3->8x->CB->xx
1042 // [0] [2] [0]
1043 // 83 C692 C386E28099 C3->8x->E2->xx->xx
1044 // odd_byte=0 [0] [2] [0+] odd_byte flipped
1045 // odd_byte=1 [0+] [2] [0] [0] odd_byte unflipped
1046 // 80 E282AC C3A2E2809AC2AC C3->A2->E2->xx->xx->Cx->xx
1047 // odd_byte=0 [0] [3] [4] [0+]
1048 // odd_byte=1 [0+] [3] [4] [4] [0]
1049 // 92 E28099 C3A2E282ACE284A2 C3->A2->E2->xx->xx->E2->xx->xx
1050 // odd_byte=0 [0] [3] [4] [0] [0]
1051 // odd_byte=1 [0+] [3] [4] [4] [0+]
1052 //
1053 // When an E2xxxx sequence is encountered, we absorb the two bytes E2xx and flip
1054 // the odd_byte state. If that goes from 0 to 1, the next pair is offset up
1055 // by one byte, picking up the two bytes just after E2xxxx. If odd_byte goes
1056 // from 1 to 0, the next two bytes picked up are the two bytes xxxx of E2xxxx.
1057 // These are absorbed with no error in state 0 or state 4
1058 //
1059 // C3 C3 C383 C3->8x->C2->xx
1060 // D3 D3 C393 C3->9x->C2->xx->C2->xx
1061 // E3 E3 C3A3 C3->Ax->C2->xx->C2->xx->C2->xx
// F3 F3 C3B3 C3->Bx->C2->xx->C2->xx->C2->xx->C2->xx
1063 // Counter3 for Fx Ex sequences is incremented at last C2
1064
// Next-state table for the UTF8UTF8 (doubly-converted) scanner.
// Row is the current state; column is the pair subscript f(byte1) + g(byte2)
// described above (the value UTF88Sub() computes for a byte pair).
static const char kMiniUTF8UTF8State[8][16] = {
  // xxxx     E2xx      CXxx     C3xx
  //                    8 9 a b  8 9 a b  8 9 a b
  {0,0,0,0,1,1,1,1, 1,1,1,1,2,2,3,5,},  // [0] looking for C38x/C3Ax/2020/8x8x, or err
  {0,0,0,0,1,1,1,1, 1,1,1,1,2,2,3,5,},  // [1] error, back to looking
  {1,1,1,1,0,0,0,0, 0,0,0,0,1,1,1,1,},  // [2] C38x looking for CXxx/E2xxxx
  //       + + + +                      //      E2xxxx flips odd_byte
  {1,1,1,1,4,4,4,4, 7,7,7,7,1,1,1,1,},  // [3] C3Ax looking for E2xx or C2xxC2xx
  //       + + + +                      //      E2xxxx flips odd_byte
  {4,4,4,4,0,0,0,0, 0,0,0,0,1,1,1,1,},  // [4] C3AxE2xx-- looking for C2xx/E2xxxx
  //       + + + +                      //      E2xxxx flips odd_byte
  {1,1,1,1,1,1,1,1, 6,6,6,6,1,1,1,1,},  // [5] C3Bx -- looking for C2xxC2xxC2xx
  {1,1,1,1,1,1,1,1, 7,7,7,7,1,1,1,1,},  // [6] C3Bx -- looking for C2xxC2xx
  {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,},  // [7] C3Bx -- looking for C2xx
};
// Companion to kMiniUTF8UTF8State, subscripted the same way
// (row = current state, column = pair subscript f(byte1) + g(byte2)).
// Counter to increment: 0-don'tcare 1-error 2-good_2B 3-good_3B 4-good_4B
static const char kMiniUTF8UTF8Count[8][16] = {
  // xxxx     E2xx      C2Xx     C3xx
  //                    8 9 a b  8 9 a b  8 9 a b
  {0,0,0,0,1,1,1,1, 1,1,1,1,0,0,0,0,},  // [0] looking for C38x/C3Ax/2020/8x8x, or err
  {0,0,0,0,1,1,1,1, 1,1,1,1,0,0,0,0,},  // [1] error, back to looking
  {1,1,1,1,3,3,3,3, 2,2,2,2,1,1,1,1,},  // [2] C38x looking for CXxx/E2xxxx
  //       + + + +                      //      E2xxxx flips odd_byte
  {1,1,1,1,0,0,0,0, 0,0,0,0,1,1,1,1,},  // [3] C3Ax looking for E2xx
  //       + + + +                      //      E2xxxx flips odd_byte
  {1,1,1,1,4,4,4,4, 4,4,4,4,1,1,1,1,},  // [4] C3AxE2xx-- looking for C2xx/E2xxxx
  //       + + + +                      //      E2xxxx flips odd_byte
  {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,},  // [5] C3Bx -- looking for C2xxC2xxC2xx
  {1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,},  // [6] C3Bx -- looking for C2xxC2xx
  {1,1,1,1,1,1,1,1, 3,3,3,3,1,1,1,1,},  // [7] C3Bx -- looking for C2xx
};
1096
// Companion to kMiniUTF8UTF8State, subscripted the same way
// (row = current state, column = pair subscript f(byte1) + g(byte2)).
// Entry is 1 where an E2xx pair is absorbed in states 2, 3 and 4, i.e. the
// places where the E2xxxx sequence flips odd_byte; 0 everywhere else.
static const char kMiniUTF8UTF8Odd[8][16] = {
  // xxxx     E2xx      C2Xx     C3xx
  //                    8 9 a b  8 9 a b  8 9 a b
  {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,},  // [0] looking for C38x/C3Ax/2020/8x8x, or err
  {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,},  // [1] error, back to looking
  {0,0,0,0,1,1,1,1, 0,0,0,0,0,0,0,0,},  // [2] C38x looking for CXxx/E2xxxx
  //       + + + +                      //      E2xxxx flips odd_byte
  {0,0,0,0,1,1,1,1, 0,0,0,0,0,0,0,0,},  // [3] C3Ax looking for E2xx
  //       + + + +                      //      E2xxxx flips odd_byte
  {0,0,0,0,1,1,1,1, 0,0,0,0,0,0,0,0,},  // [4] C3AxE2xx-- looking for C2xx/E2xxxx
  //       + + + +                      //      E2xxxx flips odd_byte
  {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,},  // [5] C3Bx -- looking for C2xxC2xxC2xx
  {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,},  // [6] C3Bx -- looking for C2xxC2xx
  {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,},  // [7] C3Bx -- looking for C2xx
};
1112
// Turn a pair of bytes into the column subscript (0..15) for the UTF8UTF8
// tables above.
//   s0 picks the group of four columns:
//        C3 -> 12, C2/C5/C6/CB -> 8, E2 -> 4, anything else -> 0
//   s1 picks the column inside the group from bits 5..4:
//        8x -> 0, 9x -> 1, Ax -> 2, Bx -> 3 (Cx -> 0, etc.)
// Both bytes are converted to unsigned before any shifting or comparison, so
// the result does not depend on whether plain char is signed.
int UTF88Sub(char s0, char s1) {
  const unsigned int lead = static_cast<unsigned char>(s0);
  const unsigned int trail = static_cast<unsigned char>(s1);
  const int column = static_cast<int>((trail >> 4) & 0x03);

  switch (lead) {
    case 0xc3:
      return column + 12;
    case 0xc2:
    case 0xc5:
    case 0xc6:
    case 0xcb:
      return column + 8;
    case 0xe2:
      return column + 4;
    default:
      return column;      // Includes the other Cx lead bytes
  }
}
1128
1129
1130
1131
1132
1133 // Default probability for an encoding rankedencoding
1134 // Based on a scan of 55M web pages
1135 // These values are 255 - log base 2**1/10 (occurrences / total)
// Large values are most likely. This is the reverse of some Google code
1137 // 255 = 1.0, 245 = 1/2, 235 = 1/4, 15 = 1/2**24, 0 = 0 (< 1/50M)
1138 //
1139 // TODO change this to be per encoding, not permuted
1140 //
1141
1142
1143 // Support function for unit test program
1144 // Return ranked encoding corresponding to enc
1145 // (also exported to compact_enc_det_text.cc)
BackmapEncodingToRankedEncoding(Encoding enc)1146 int CompactEncDet::BackmapEncodingToRankedEncoding(Encoding enc) {
1147 for (int i = 0; i < NUM_RANKEDENCODING; ++i) {
1148 if (kMapToEncoding[i] == enc) {
1149 return i;
1150 }
1151 }
1152 return -1;
1153 }
1154
1155
DecodeActive(uint32 active)1156 string DecodeActive(uint32 active) {
1157 string temp("");
1158 if (active & kBinaryActive) {
1159 temp.append("Binary ");
1160 }
1161 if (active & kUTF1632Active) {
1162 temp.append("UTF1632 ");
1163 }
1164 if (active & kUTF8UTF8Active) {
1165 temp.append("UTF8UTF8 ");
1166 }
1167 if (active & kUTF8Active) {
1168 temp.append("UTF8 ");
1169 }
1170 if (active & kIso2022Active) {
1171 temp.append("Iso2022 ");
1172 }
1173 if (active & kHzActive) {
1174 temp.append("Hz ");
1175 }
1176 if (active & kUTF7Active) {
1177 temp.append("UTF7A ");
1178 }
1179 if (active & kSevenBitActive) {
1180 temp.append("SevenBit ");
1181 }
1182 if (active & kIsIndicCode) {
1183 temp.append("Indic ");
1184 }
1185 if (active & kHighAlphaCode) {
1186 temp.append("HighAlpha ");
1187 }
1188 if (active & kHighAccentCode) {
1189 temp.append("HighAccent ");
1190 }
1191 if (active & kEUCJPActive) {
1192 temp.append("EUCJP ");
1193 }
1194 return temp;
1195 }
1196
SevenBitEncoding(int enc)1197 static inline bool SevenBitEncoding(int enc) {
1198 return ((kSpecialMask[enc] & kSevenBitActive) != 0);
1199 }
TwoByteEncoding(int enc)1200 static inline bool TwoByteEncoding(int enc) {
1201 return ((kSpecialMask[enc] & kTwobyteCode) != 0);
1202 }
IndicEncoding(int enc)1203 static inline bool IndicEncoding(int enc) {
1204 return ((kSpecialMask[enc] & kIsIndicCode) != 0);
1205 }
HighAlphaEncoding(int enc)1206 static inline bool HighAlphaEncoding(int enc) {
1207 return ((kSpecialMask[enc] & kHighAlphaCode) != 0);
1208 }
HighAccentEncoding(int enc)1209 static inline bool HighAccentEncoding(int enc) {
1210 return ((kSpecialMask[enc] & kHighAccentCode) != 0);
1211 }
1212
1213
AnyActive(DetectEncodingState * destatep)1214 static inline bool AnyActive(DetectEncodingState* destatep) {
1215 return (destatep->active_special != 0);
1216 }
SevenBitActive(DetectEncodingState * destatep)1217 static inline bool SevenBitActive(DetectEncodingState* destatep) {
1218 return (destatep->active_special & kSevenBitActive) != 0;
1219 }
HzActive(DetectEncodingState * destatep)1220 static inline bool HzActive(DetectEncodingState* destatep) {
1221 return (destatep->active_special & kHzActive) != 0;
1222 }
Iso2022Active(DetectEncodingState * destatep)1223 static inline bool Iso2022Active(DetectEncodingState* destatep) {
1224 return (destatep->active_special & kIso2022Active) != 0;
1225 }
UTF8Active(DetectEncodingState * destatep)1226 static inline bool UTF8Active(DetectEncodingState* destatep) {
1227 return (destatep->active_special & kUTF8Active) != 0;
1228 }
UTF8UTF8Active(DetectEncodingState * destatep)1229 static inline bool UTF8UTF8Active(DetectEncodingState* destatep) {
1230 return (destatep->active_special & kUTF8UTF8Active) != 0;
1231 }
UTF1632Active(DetectEncodingState * destatep)1232 static inline bool UTF1632Active(DetectEncodingState* destatep) {
1233 return (destatep->active_special & kUTF1632Active) != 0;
1234 }
BinaryActive(DetectEncodingState * destatep)1235 static inline bool BinaryActive(DetectEncodingState* destatep) {
1236 return (destatep->active_special & kBinaryActive) != 0;
1237 }
UTF7OrHzActive(DetectEncodingState * destatep)1238 static inline bool UTF7OrHzActive(DetectEncodingState* destatep) {
1239 return (destatep->active_special & (kHzActive + kUTF7Active)) != 0;
1240 }
EUCJPActive(DetectEncodingState * destatep)1241 static inline bool EUCJPActive(DetectEncodingState* destatep) {
1242 return ((destatep->active_special & kEUCJPActive) != 0);
1243 }
OtherActive(DetectEncodingState * destatep)1244 static inline bool OtherActive(DetectEncodingState* destatep) {
1245 return (destatep->active_special & (kIso2022Active + kBinaryActive +
1246 kUTF8Active + kUTF8UTF8Active +
1247 kUTF1632Active + kEUCJPActive)) != 0;
1248 }
1249
1250
CEDFlagRescanning(CEDInternalFlags flags)1251 static inline bool CEDFlagRescanning(CEDInternalFlags flags) {
1252 return (flags & kCEDRescanning) != 0;
1253 }
1254
CEDFlagForceTags(CEDInternalFlags flags)1255 static inline bool CEDFlagForceTags(CEDInternalFlags flags) {
1256 return (flags & kCEDForceTags) != 0;
1257 }
1258
1259
// Returns the larger of a and b.
static inline int maxint(int a, int b) {
  if (a > b) {
    return a;
  }
  return b;
}
// Returns the smaller of a and b.
static inline int minint(int a, int b) {
  if (a < b) {
    return a;
  }
  return b;
}
1262
MyRankedEncName(int r_enc)1263 static inline const char* MyRankedEncName(int r_enc) {
1264 return MyEncodingName(kMapToEncoding[r_enc]);
1265 }
1266
1267
// State shared by the Ps* debug-dump routines below.
// Only for debugging. not thread safe
static const int kPsSourceWidth = 32;
static int pssourcenext = 0;    // debug only. not threadsafe. dump only >= this
static int pssourcewidth = 0;   // debug only. Source bytes per dumped line;
                                // set by PsSourceInit
// Two mark characters per source byte plus overscan; allocated by
// PsSourceInit, released by PsSourceFinish
static char* pssource_mark_buffer = NULL;
// Count of source lines dumped so far; low four bits index do_src_offset
int next_do_src_line;
// Source offsets of the most recently dumped lines, stored mod 16
int do_src_offset[16];
1275
1276
PsSourceInit(int len)1277 void PsSourceInit(int len) {
1278 pssourcenext = 0;
1279 pssourcewidth = len;
1280 delete[] pssource_mark_buffer;
1281 // Allocate 2 Ascii characters per input byte
1282 pssource_mark_buffer = new char[(pssourcewidth * 2) + 8]; // 8 = overscan
1283 memset(pssource_mark_buffer, ' ', pssourcewidth * 2);
1284 memset(pssource_mark_buffer + (pssourcewidth * 2), '\0', 8);
1285
1286 next_do_src_line = 0;
1287 memset(do_src_offset, 0, sizeof(do_src_offset));
1288 }
1289
PsSourceFinish()1290 void PsSourceFinish() {
1291 // Print preceding mark buffer
1292 int j = (pssourcewidth * 2) - 1;
1293 while ((0 <= j) && (pssource_mark_buffer[j] == ' ')) {--j;} // trim
1294 pssource_mark_buffer[j + 1] = '\0';
1295 fprintf(stderr, "( %s) do-src\n", pssource_mark_buffer);
1296 memset(pssource_mark_buffer, ' ', pssourcewidth * 2);
1297 memset(pssource_mark_buffer + (pssourcewidth * 2), '\0', 8);
1298
1299 delete[] pssource_mark_buffer;
1300 pssource_mark_buffer = NULL;
1301 }
1302
1303 // Dump aligned len bytes src... if not already dumped
PsSource(const uint8 * src,const uint8 * isrc,const uint8 * srclimit)1304 void PsSource(const uint8* src, const uint8* isrc, const uint8* srclimit) {
1305 int offset = src - isrc;
1306 offset -= (offset % pssourcewidth); // round down to multiple of len bytes
1307 if (offset < pssourcenext) {
1308 return;
1309 }
1310 pssourcenext = offset + pssourcewidth; // Min offset for next dump
1311
1312 // Print preceding mark buffer
1313 int j = (pssourcewidth * 2) - 1;
1314 while ((0 <= j) && (pssource_mark_buffer[j] == ' ')) {--j;} // trim
1315 pssource_mark_buffer[j + 1] = '\0';
1316 fprintf(stderr, "( %s) do-src\n", pssource_mark_buffer);
1317 memset(pssource_mark_buffer, ' ', pssourcewidth * 2);
1318 memset(pssource_mark_buffer + (pssourcewidth * 2), '\0', 8);
1319
1320 // Print source bytes
1321 const uint8* src_aligned = isrc + offset;
1322 int length = srclimit - src_aligned;
1323 length = minint(pssourcewidth, length);
1324
1325 fprintf(stderr, "(%05x ", offset);
1326 for (int i = 0; i < length; ++i) {
1327 char c = src_aligned[i];
1328 if (c == '\n') {c = ' ';}
1329 if (c == '\r') {c = ' ';}
1330 if (c == '\t') {c = ' ';}
1331 if (c == '(') {
1332 fprintf(stderr, "%s", "\\( ");
1333 } else if (c == ')') {
1334 fprintf(stderr, "%s", "\\) ");
1335 } else if (c == '\\') {
1336 fprintf(stderr, "%s", "\\\\ ");
1337 } else if ((0x20 <= c) && (c <= 0x7e)) {
1338 fprintf(stderr, "%c ", c);
1339 } else {
1340 fprintf(stderr, "%02x", c);
1341 }
1342 }
1343 fprintf(stderr, ") do-src\n");
1344 // Remember which source offsets are where, mod 16
1345 do_src_offset[next_do_src_line & 0x0f] = offset;
1346 ++next_do_src_line;
1347 }
1348
1349 // Mark bytes in just-previous source bytes
PsMark(const uint8 * src,int len,const uint8 * isrc,int weightshift)1350 void PsMark(const uint8* src, int len, const uint8* isrc, int weightshift) {
1351 int offset = src - isrc;
1352 offset = (offset % pssourcewidth); // mod len bytes
1353 char mark = (weightshift == 0) ? '-' : 'x';
1354
1355 pssource_mark_buffer[(offset * 2)] = '=';
1356 pssource_mark_buffer[(offset * 2) + 1] = '=';
1357 for (int i = 1; i < len; ++i) {
1358 pssource_mark_buffer[(offset + i) * 2] = mark;
1359 pssource_mark_buffer[((offset + i) * 2) + 1] = mark;
1360 }
1361 }
1362
1363
1364 // Highlight trigram bytes in just-previous source bytes
1365 // Unfortunately, we have to skip back N lines since source was printed for
1366 // up to 8 bigrams before we get here. Match on src+1 to handle 0/31 better
PsHighlight(const uint8 * src,const uint8 * isrc,int trigram_val,int n)1367 void PsHighlight(const uint8* src, const uint8* isrc, int trigram_val, int n) {
1368 int offset = (src + 1) - isrc;
1369 int offset32 = (offset % pssourcewidth); // mod len bytes
1370 offset -= offset32; // round down to multiple of len bytes
1371
1372 for (int i = 1; i <= 16; ++i) {
1373 if (do_src_offset[(next_do_src_line - i) & 0x0f] == offset) {
1374 fprintf(stderr, "%d %d %d do-highlight%d\n",
1375 i, offset32 - 1, trigram_val, n);
1376 break;
1377 }
1378 }
1379 }
1380
1381
InitDetectEncodingState(DetectEncodingState * destatep)1382 void InitDetectEncodingState(DetectEncodingState* destatep) {
1383 destatep->initial_src = NULL; // Filled in by caller
1384 destatep->limit_src = NULL;
1385 destatep->prior_src = NULL;
1386 destatep->last_pair = NULL;
1387
1388 destatep->debug_data = NULL;
1389 destatep->next_detail_entry = 0;
1390
1391 destatep->done = false;
1392 destatep->reliable = false;
1393 destatep->hints_derated = false;
1394 //destatep->declared_enc_1 init in ApplyHints
1395 //destatep->declared_enc_2 init in ApplyHints
1396 destatep->prune_count = 0;
1397
1398 destatep->trigram_highwater_mark = 0;
1399 destatep->looking_for_latin_trigrams = false;
1400 destatep->do_latin_trigrams = false;
1401
1402 // Miscellaneous state variables for difficult encodings
1403 destatep->binary_quadrants_count = 0;
1404 destatep->binary_8x4_count = 0;
1405 destatep->binary_quadrants_seen = 0;
1406 destatep->binary_8x4_seen = 0;
1407 destatep->utf7_starts = 0;
1408 destatep->prior_utf7_offset = 0;
1409 destatep->next_utf8_ministate = 0;
1410 for (int i = 0; i < 6; i++) {destatep->utf8_minicount[i] = 0;}
1411 destatep->next_utf8utf8_ministate = 0;
1412 destatep->utf8utf8_odd_byte = 0;
1413 for (int i = 0; i < 6; i++) {destatep->utf8utf8_minicount[i] = 0;}
1414 destatep->next_2022_state = SOSI_NONE;
1415 destatep->next_hz_state = SOSI_NONE;
1416 destatep->next_eucjp_oddphase = false;
1417 for (int i = 0; i < 8; i++) {destatep->byte32_count[i] = 0;}
1418 destatep->active_special = 0xffffffff;
1419 destatep->tld_hint = UNKNOWN_ENCODING;
1420 destatep->http_hint = UNKNOWN_ENCODING;
1421 destatep->meta_hint = UNKNOWN_ENCODING;
1422 destatep->bom_hint = UNKNOWN_ENCODING;
1423 destatep->top_rankedencoding = 0; // ASCII [seven-bit] is the default
1424 destatep->second_top_rankedencoding = 0; // ASCII [seven-bit] is the default
1425 destatep->top_prob = -1;
1426 destatep->second_top_prob = -1;
1427 // This is wide for first pruning, shrinks for 2nd and later
1428 destatep->prune_difference = kInititalPruneDifference;
1429
1430 destatep->next_prior_bigram = 0;
1431 destatep->prior_bigram[0] = -1;
1432 destatep->prior_bigram[1] = -1;
1433 destatep->prior_bigram[2] = -1;
1434 destatep->prior_bigram[3] = -1;
1435
1436 destatep->prior_binary[0] = -1;
1437
1438 // Initialize with all but Indic encodings, which we never detect
1439 int k = 0;
1440 for (int rankedencoding = 0;
1441 rankedencoding < NUM_RANKEDENCODING;
1442 rankedencoding++) {
1443 Encoding enc = kMapToEncoding[rankedencoding];
1444 if (!IndicEncoding(enc)) {
1445 destatep->rankedencoding_list[k++] = rankedencoding;
1446 }
1447 }
1448 destatep->rankedencoding_list_len = k;
1449
1450 // This is where all the action is
1451 memset(destatep->enc_prob, 0, sizeof(destatep->enc_prob));
1452
1453 memset(destatep->hint_prob, 0, sizeof(destatep->hint_prob));
1454 memset(destatep->hint_weight, 0, sizeof(destatep->hint_weight));
1455
1456 destatep->prior_interesting_pair[AsciiPair] = 0;
1457 destatep->prior_interesting_pair[OtherPair] = 0;
1458 destatep->next_interesting_pair[AsciiPair] = 0;
1459 destatep->next_interesting_pair[OtherPair] = 0;
1460 // interesting_pairs/offsets/weightshifts not initialized; no need
1461 }
1462
1463 // Probability strings are uint8, with zeros removed via simple run-length:
1464 // (<skip-take byte> <data bytes>)*
1465 // skip-take:
1466 // 00 end
1467 // x0 skip 16 x locations, take 0 data values
1468 // xy skip x locations, take y data values
1469 // Multiply all the incoming values by 3 to account for 3x unigram sums
1470 //
1471 // {{0x77,0x69,0x6e,0x64,0x31,0x32,0x35,0x35,
1472 // 0x01,0xc2,0x10,0x41,0xfe,0x71,0xba,0x00,}}, // "wind1255"
1473 //
1474 // Weight is 0..100 percent
1475 //
1476 // Returns subscript of largest (most probable) value
1477 //
1478
1479
1480 // {{0x6e,0x6c,0x5f,0x5f, 0x05,0xb2,0xae,0xa0,0x32,0xa1,0x36,0x31,0x42,0x39,0x3b,0x33,0x45,0x11,0x6f,0x00,}}, // "nl__"
1481 // // ASCII-7-bit=178 Latin1=174 UTF8=160 GB=50 CP1252=161 BIG5=49 Latin2=66 CP1251=57 CP1256=59 CP1250=51 Latin5=69 ISO-8859-15=111 [top ASCII-7-bit]
ApplyCompressedProb(const char * iprob,int len,int weight,DetectEncodingState * destatep)1482 int ApplyCompressedProb(const char* iprob, int len,
1483 int weight, DetectEncodingState* destatep) {
1484 int* dst = &destatep->enc_prob[0];
1485 int* dst2 = &destatep->hint_weight[0];
1486 const uint8* prob = reinterpret_cast<const uint8*>(iprob);
1487 const uint8* problimit = prob + len;
1488
1489 int largest = -1;
1490 int subscript_of_largest = 0;
1491
1492 // Continue with first byte and subsequent ones
1493 while (prob < problimit) {
1494 int skiptake = *prob++;
1495 int skip = (skiptake & 0xf0) >> 4;
1496 int take = skiptake & 0x0f;
1497 if (skiptake == 00) {
1498 break;
1499 } else if (take == 0) {
1500 dst += (skip << 4);
1501 dst2 += (skip << 4);
1502 } else {
1503 dst += skip; // Normal case
1504 dst2 += skip; // Normal case
1505 for (int i = 0; i < take; i++) {
1506 int enc = static_cast<int>(dst - &destatep->enc_prob[0]) + i;
1507 if (largest < prob[i]) {
1508 largest = prob[i];
1509 subscript_of_largest = enc;
1510 }
1511
1512 int increment = prob[i] * 3; // The actual increment
1513
1514 // Do maximum of previous hints plus this new one
1515 if (weight > 0) {
1516 increment = (increment * weight) / 100;
1517 dst[i] = maxint(dst[i], increment);
1518 dst2[i] = 1; // New total weight
1519 }
1520 }
1521 prob += take;
1522 dst += take;
1523 dst2 += take;
1524 }
1525 }
1526 return subscript_of_largest;
1527 }
1528
1529
// Returns subscript of largest (most probable) value [for unit test]
// iprob is a compressed probability string of at most len bytes: a sequence of
// skip-take bytes, each followed by its data values, ended by a 00 byte or by
// the length.  Returns 0 when there is no value greater than zero.
// A take count that claims more data bytes than remain before the end of the
// input is cut down to what is there, so a truncated string is never read
// past iprob + len.
int TopCompressedProb(const char* iprob, int len) {
  const unsigned char* prob = reinterpret_cast<const unsigned char*>(iprob);
  const unsigned char* const problimit = prob + len;
  int next_prob_sub = 0;
  int topprob = 0;
  int toprankenc = 0;

  while (prob < problimit) {
    const int skiptake = *prob++;
    if (skiptake == 0) {
      break;
    }
    const int skip = (skiptake & 0xf0) >> 4;
    int take = skiptake & 0x0f;
    if (take == 0) {
      next_prob_sub += (skip << 4);
      continue;
    }

    next_prob_sub += skip;        // Normal case
    const int available = static_cast<int>(problimit - prob);
    if (take > available) {
      take = available;           // Truncated input: use only what is present
    }
    for (int i = 0; i < take; i++) {
      if (topprob < prob[i]) {
        topprob = prob[i];
        toprankenc = next_prob_sub + i;
      }
    }
    prob += take;
    next_prob_sub += take;
  }
  return toprankenc;
}
1560
1561
1562 // Find subscript of matching key in first 8 bytes of sorted hint array, or -1
HintBinaryLookup8(const HintEntry * hintprobs,int hintprobssize,const char * norm_key)1563 int HintBinaryLookup8(const HintEntry* hintprobs, int hintprobssize,
1564 const char* norm_key) {
1565 // Key is always in range [lo..hi)
1566 int lo = 0;
1567 int hi = hintprobssize;
1568 while (lo < hi) {
1569 int mid = (lo + hi) >> 1;
1570 int comp = memcmp(&hintprobs[mid].key_prob[0], norm_key, 8);
1571 if (comp < 0) {
1572 lo = mid + 1;
1573 } else if (comp > 0) {
1574 hi = mid;
1575 } else {
1576 return mid;
1577 }
1578 }
1579 return -1;
1580 }
1581
1582 // Find subscript of matching key in first 4 bytes of sorted hint array, or -1
HintBinaryLookup4(const HintEntry * hintprobs,int hintprobssize,const char * norm_key)1583 int HintBinaryLookup4(const HintEntry* hintprobs, int hintprobssize,
1584 const char* norm_key) {
1585 // Key is always in range [lo..hi)
1586 int lo = 0;
1587 int hi = hintprobssize;
1588 while (lo < hi) {
1589 int mid = (lo + hi) >> 1;
1590 int comp = memcmp(&hintprobs[mid].key_prob[0], norm_key, 4);
1591 if (comp < 0) {
1592 lo = mid + 1;
1593 } else if (comp > 0) {
1594 hi = mid;
1595 } else {
1596 return mid;
1597 }
1598 }
1599 return -1;
1600 }
1601
Boost(DetectEncodingState * destatep,int r_enc,int boost)1602 static inline void Boost(DetectEncodingState* destatep, int r_enc, int boost) {
1603 destatep->enc_prob[r_enc] += boost;
1604 }
1605
Whack(DetectEncodingState * destatep,int r_enc,int whack)1606 static inline void Whack(DetectEncodingState* destatep, int r_enc, int whack) {
1607 destatep->enc_prob[r_enc] -= whack;
1608 }
1609
1610 // Apply initial probability hint based on top level domain name
1611 // Weight is 0..100 percent
1612 // Return 1 if name match found
ApplyTldHint(const char * url_tld_hint,int weight,DetectEncodingState * destatep)1613 int ApplyTldHint(const char* url_tld_hint, int weight,
1614 DetectEncodingState* destatep) {
1615 if (url_tld_hint[0] == '~') {
1616 return 0;
1617 }
1618 string normalized_tld = MakeChar4(string(url_tld_hint));
1619 int n = HintBinaryLookup4(kTLDHintProbs, kTLDHintProbsSize,
1620 normalized_tld.c_str());
1621 if (n >= 0) {
1622 // TLD is four bytes, probability table is ~12 bytes
1623 int best_sub = ApplyCompressedProb(&kTLDHintProbs[n].key_prob[kMaxTldKey],
1624 kMaxTldVector, weight, destatep);
1625 // Never boost ASCII7; do CP1252 instead
1626 if (best_sub == F_ASCII_7_bit) {best_sub = F_CP1252;}
1627 destatep->declared_enc_1 = best_sub;
1628 if (destatep->debug_data != NULL) {
1629 // Show TLD hint
1630 SetDetailsEncProb(destatep, 0, best_sub, url_tld_hint);
1631 }
1632 return 1;
1633 }
1634 return 0;
1635 }
1636
1637 // Apply initial probability hint based on charset= name
1638 // Weight is 0..100 percent
1639 // Return 1 if name match found
ApplyCharsetHint(const char * charset_hint,int weight,DetectEncodingState * destatep)1640 int ApplyCharsetHint(const char* charset_hint, int weight,
1641 DetectEncodingState* destatep) {
1642 if (charset_hint[0] == '~') {
1643 return 0;
1644 }
1645 string normalized_charset = MakeChar44(string(charset_hint));
1646 int n = HintBinaryLookup8(kCharsetHintProbs, kCharsetHintProbsSize,
1647 normalized_charset.c_str());
1648 if (n >= 0) {
1649 // Charset is eight bytes, probability table is ~eight bytes
1650 int best_sub = ApplyCompressedProb(&kCharsetHintProbs[n].key_prob[kMaxCharsetKey],
1651 kMaxCharsetVector, weight, destatep);
1652 // Never boost ASCII7; do CP1252 instead
1653 if (best_sub == F_ASCII_7_bit) {best_sub = F_CP1252;}
1654 destatep->declared_enc_1 = best_sub;
1655
1656 // If first explicitly declared charset is confusable with Latin1/1252, put
1657 // both declared forms in declared_enc_*, displacing Latin1/1252.
1658 // This avoids a bit of Latin1 creep.
1659 // Also boost the declared encoding and its pair
1660 // TODO: This should all be folded into postproc-enc-detect.cc
1661 if ((destatep->http_hint == UNKNOWN_ENCODING) &&
1662 (destatep->meta_hint == UNKNOWN_ENCODING)) {
1663 // This is the first charset=hint
1664 switch (best_sub) {
1665 case F_Latin2: // 8859-2 Latin2, east euro
1666 destatep->declared_enc_2 = F_CP1250;
1667 Boost(destatep, F_Latin2, kGentleOnePair);
1668 Boost(destatep, F_CP1250, kGentleOnePair);
1669 break;
1670 case F_CP1250:
1671 destatep->declared_enc_2 = F_Latin2;
1672 Boost(destatep, F_Latin2, kGentleOnePair);
1673 Boost(destatep, F_CP1250, kGentleOnePair);
1674 break;
1675
1676 case F_Latin3: // 8859-3 Latin3, south euro, Esperanto
1677 destatep->declared_enc_2 = F_ASCII_7_bit;
1678 Boost(destatep, F_Latin3, kGentleOnePair);
1679 break;
1680
1681 case F_Latin4: // 8859-4 Latin4, north euro
1682 destatep->declared_enc_2 = F_ASCII_7_bit;
1683 Boost(destatep, F_Latin4, kGentleOnePair);
1684 break;
1685
1686 case F_ISO_8859_5: // 8859-5 Cyrillic
1687 destatep->declared_enc_2 = F_ASCII_7_bit; // Don't boost 1251
1688 Boost(destatep, F_ISO_8859_5, kGentleOnePair); // (too different)
1689 break;
1690 case F_CP1251:
1691 destatep->declared_enc_2 = F_ASCII_7_bit; // Don't boost -5
1692 Boost(destatep, F_CP1251, kGentleOnePair); // (too different)
1693 break;
1694
1695 case F_Arabic: // 8859-6 Arabic
1696 destatep->declared_enc_2 = F_CP1256;
1697 Boost(destatep, F_Arabic, kGentleOnePair);
1698 Boost(destatep, F_CP1256, kGentleOnePair);
1699 break;
1700 case F_CP1256:
1701 destatep->declared_enc_2 = F_Arabic;
1702 Boost(destatep, F_Arabic, kGentleOnePair);
1703 Boost(destatep, F_CP1256, kGentleOnePair);
1704 break;
1705
1706 case F_Greek: // 8859-7 Greek
1707 destatep->declared_enc_2 = F_CP1253;
1708 Boost(destatep, F_Greek, kGentleOnePair);
1709 Boost(destatep, F_CP1253, kGentleOnePair);
1710 break;
1711 case F_CP1253:
1712 destatep->declared_enc_2 = F_Greek;
1713 Boost(destatep, F_Greek, kGentleOnePair);
1714 Boost(destatep, F_CP1253, kGentleOnePair);
1715 break;
1716
1717 case F_Hebrew: // 8859-8 Hebrew
1718 destatep->declared_enc_2 = F_CP1255;
1719 Boost(destatep, F_Hebrew, kGentleOnePair);
1720 Boost(destatep, F_CP1255, kGentleOnePair);
1721 break;
1722 case F_CP1255:
1723 destatep->declared_enc_2 = F_Hebrew;
1724 Boost(destatep, F_Hebrew, kGentleOnePair);
1725 Boost(destatep, F_CP1255, kGentleOnePair);
1726 break;
1727
1728 case F_Latin5: // 8859-9 Latin5, Turkish
1729 destatep->declared_enc_2 = F_ASCII_7_bit; // Don't boost 1254
1730 Boost(destatep, F_Latin5, kGentleOnePair); // (too different)
1731 break;
1732 case F_CP1254:
1733 destatep->declared_enc_2 = F_ASCII_7_bit; // Don't boost Latin5
1734 Boost(destatep, F_CP1254, kGentleOnePair); // (too different)
1735 break;
1736
1737 case F_Latin6: // 8859-10 Latin6, Nordic
1738 destatep->declared_enc_2 = F_ASCII_7_bit;
1739 Boost(destatep, F_Latin6, kGentleOnePair);
1740 break;
1741
1742 case F_ISO_8859_11: // 8859-11 Thai,
1743 destatep->declared_enc_2 = F_CP874;
1744 Boost(destatep, F_ISO_8859_11, kGentleOnePair);
1745 Boost(destatep, F_CP874, kGentleOnePair);
1746 break;
1747 case F_CP874:
1748 destatep->declared_enc_2 = F_ISO_8859_11;
1749 Boost(destatep, F_ISO_8859_11, kGentleOnePair);
1750 Boost(destatep, F_CP874, kGentleOnePair);
1751 break;
1752
1753 case F_ISO_8859_13: // 8859-13 Latin7, Baltic
1754 destatep->declared_enc_2 = F_CP1257;
1755 Boost(destatep, F_ISO_8859_13, kGentleOnePair);
1756 Boost(destatep, F_CP1257, kGentleOnePair);
1757 break;
1758 case F_CP1257:
1759 destatep->declared_enc_2 = F_ISO_8859_13;
1760 Boost(destatep, F_ISO_8859_13, kGentleOnePair);
1761 Boost(destatep, F_CP1257, kGentleOnePair);
1762 break;
1763
1764 case F_ISO_8859_15: // 8859-15 Latin9, Latin0, Euro-ized Latin1
1765 destatep->declared_enc_2 = F_ASCII_7_bit;
1766 Boost(destatep, F_ISO_8859_15, kGentleOnePair);
1767 break;
1768
1769
1770 // Greek all-caps is confusable with KOI8x all-lower and Hebrew.
1771 // This turns some Greek documents into Cyrillic, etc. by mistake.
1772 // Greek and Hebrew are boosted explicitly above; do KOI8x here.
1773 // Boosting the declared encodingmakes it harder for the wrong one to
1774 // creep up.
1775 case F_KOI8R:
1776 Boost(destatep, F_KOI8R, kGentleOnePair);
1777 break;
1778 case F_KOI8U:
1779 Boost(destatep, F_KOI8U, kGentleOnePair);
1780 break;
1781
1782 default:
1783 break;
1784 }
1785 }
1786
1787 if (destatep->debug_data != NULL) {
1788 // Show charset hint
1789 SetDetailsEncProb(destatep, 0, best_sub, charset_hint);
1790 }
1791
1792 //
1793 // Some fix-ups for the declared encodings
1794 //
1795
1796 // If non-UTF8, non-Latin1/1252 encoding declared, disable UTF8 combos
1797 // TODO: This should all be folded into postproc-enc-detect.cc
1798 if ((best_sub != F_UTF8) &&
1799 (best_sub != F_Latin1) &&
1800 (best_sub != F_CP1252)) {
1801 Whack(destatep, F_UTF8UTF8, kBadPairWhack * 4); // demote
1802 }
1803
1804 // Latin2 and CP1250 differ in the overlap part, such as B1 or B9
1805 // The initial probabilites for charset=Latin2 explicitly put CP1250
1806 // down twice as far as normal, and vice versa. This is done in
1807 // postproc-enc-detect.cc
1808
1809 // If charset=user-defined, treat as Binary --
1810 // we can safely only do low ASCII, might be Indic
1811 if (normalized_charset.substr(0,4) == "user") {
1812 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
1813 }
1814
1815 return 1;
1816 }
1817 return 0;
1818 }
1819
1820 // Apply initial probability hint based on caller-supplied encoding
1821 // Negative hint whacks ~encoding, non-negative boosts encoding
1822 //
1823 // Negative hints are an experiment to see if they might be useful.
1824 // Not operator used instead of unary minus to allow specifying not-zero
ApplyEncodingHint(const int encoding_hint,int weight,DetectEncodingState * destatep)1825 int ApplyEncodingHint(const int encoding_hint, int weight,
1826 DetectEncodingState* destatep) {
1827 Encoding enc_hint = static_cast<Encoding>((encoding_hint < 0) ?
1828 ~encoding_hint : encoding_hint);
1829 // Map to the right internal subscript
1830 int rankedenc_hint = CompactEncDet::BackmapEncodingToRankedEncoding(enc_hint);
1831
1832 // I'm not sure how strong this hint should be. Weight 100% = 1 bigram
1833 int increment = (kBoostOnePair * weight) / 100;
1834
1835 if (encoding_hint < 0) {
1836 destatep->enc_prob[rankedenc_hint] -= increment;
1837 } else {
1838 destatep->enc_prob[rankedenc_hint] += increment;
1839 }
1840
1841 if (destatep->debug_data != NULL) {
1842 // Show encoding hint
1843 SetDetailsEncProb(destatep, 0, -1, MyEncodingName(enc_hint));
1844 }
1845 return 1;
1846 }
1847
1848 // Apply initial probability hint based on user interface language
1849 // Weight is 0..100 percent
1850 // Return 1 if name match found
ApplyUILanguageHint(const Language language_hint,int weight,DetectEncodingState * destatep)1851 int ApplyUILanguageHint(const Language language_hint,
1852 int weight, DetectEncodingState* destatep) {
1853 if (language_hint == UNKNOWN_LANGUAGE) {
1854 return 0;
1855 }
1856 string normalized_lang = MakeChar8(LanguageName(language_hint));
1857 int n = HintBinaryLookup8(kLangHintProbs, kLangHintProbsSize,
1858 normalized_lang.c_str());
1859 if (n >= 0) {
1860 // Language is eight bytes, probability table is ~eight bytes
1861 int best_sub = ApplyCompressedProb(&kLangHintProbs[n].key_prob[kMaxLangKey],
1862 kMaxLangVector, weight, destatep);
1863 // Never boost ASCII7; do CP1252 instead
1864 if (best_sub == F_ASCII_7_bit) {best_sub = F_CP1252;}
1865 destatep->declared_enc_1 = best_sub;
1866 if (destatep->debug_data != NULL) {
1867 // Show language hint
1868 SetDetailsEncProb(destatep, 0, best_sub, normalized_lang.c_str());
1869 }
1870 return 1;
1871 }
1872 return 0;
1873 }
1874
1875 // Apply initial probability hint based on corpus type (web, email, etc)
1876 // Return 1 if name match found
ApplyDefaultHint(const CompactEncDet::TextCorpusType corpus_type,DetectEncodingState * destatep)1877 int ApplyDefaultHint(const CompactEncDet::TextCorpusType corpus_type,
1878 DetectEncodingState* destatep) {
1879
1880 for (int i = 0; i < NUM_RANKEDENCODING; i++) {
1881 // Set the default probability
1882 destatep->enc_prob[i] = kDefaultProb[i] * 3;
1883 // Deliberately set 2022 seven-bit encodings to zero,
1884 // so we can look for actual use
1885 // TODO: This should all be folded into postproc-enc-detect.cc
1886 if (SevenBitEncoding(kMapToEncoding[i])) {
1887 destatep->enc_prob[i] = 0;
1888 }
1889 }
1890
1891 // A little corpus distinction
1892 switch (corpus_type) {
1893 case CompactEncDet::WEB_CORPUS:
1894 case CompactEncDet::XML_CORPUS:
1895 // Allow double-converted UTF-8 to start nearly equal to normal UTF-8
1896 destatep->enc_prob[F_UTF8UTF8] =
1897 destatep->enc_prob[F_UTF8] - kSmallInitDiff;
1898 break;
1899 case CompactEncDet::QUERY_CORPUS:
1900 case CompactEncDet::EMAIL_CORPUS:
1901 default:
1902 break;
1903 }
1904
1905 if (FLAGS_demo_nodefault) {
1906 // Demo, make initial probs all zero
1907 for (int i = 0; i < NUM_RANKEDENCODING; i++) {
1908 destatep->enc_prob[i] = 0;
1909 }
1910 }
1911
1912 if (destatep->debug_data != NULL) {
1913 // Show default hint
1914 SetDetailsEncProb(destatep, 0, -1, "Default");
1915 }
1916 return 1;
1917 }
1918
1919
1920
// Do reverse search for c in [str..str+len)
// Note: initial pointer is to FRONT of string, not back
// Returns a pointer to the last byte equal to c, or NULL if c does not occur
// in the range (including when len is 0).
const char* MyMemrchr(const char* str, char c, size_t len) {
  // Scan backward by index. Decrementing a pointer to one before str (as a
  // pointer-based loop must do to terminate) is undefined behavior.
  for (size_t remaining = len; remaining > 0; --remaining) {
    if (str[remaining - 1] == c) {return str + (remaining - 1);}
  }
  return NULL;
}
1930
1931
// Minimum real URL is 11 bytes: "http://a.bc" -- shorter is assumed to be TLD
// Now that we are no longer trying to do Indic font-based encodings, we
// don't need the full URL and can go back to simple TLD. This test remains for
// backwards compatibility with any caller using full URL.
static const int kMinURLLength = 11;
1937
1938 // Extract TLD from a full URL or just a TLD
1939 // Return hostname and length if a full URL
ExtractTLD(const char * url_hint,char * tld_hint,int tld_hint_len,const char ** ret_host_start,int * ret_host_len)1940 void ExtractTLD(const char* url_hint, char* tld_hint, int tld_hint_len,
1941 const char** ret_host_start, int* ret_host_len) {
1942 // url_hint can either be a full URL (preferred) or just top-level domain name
1943 // Extract the TLD from a full URL and use it for
1944 // a normal TLD hint
1945
1946 strncpy(tld_hint, "~", tld_hint_len);
1947 tld_hint[tld_hint_len - 1] = '\0';
1948 *ret_host_start = NULL;
1949 *ret_host_len = 0;
1950
1951 int url_len = (url_hint != NULL) ? strlen(url_hint) : 0;
1952 if (url_len == 0) {
1953 // Empty TLD
1954 return;
1955 }
1956
1957 // Minimum real URL is 11 bytes: "http://a.bc" -- shorter is assumed to be TLD
1958 if (kMinURLLength <= url_len) {
1959 // See if it really is a URL
1960 const char* first_slash = strchr(url_hint, '/');
1961 if ((first_slash != NULL) && (first_slash != url_hint) &&
1962 (first_slash[-1] == ':') && (first_slash[1] == '/') &&
1963 (memrchr(url_hint, '.', first_slash - url_hint) == NULL)) {
1964 // We found :// and no dot in front of it, so declare a real URL
1965
1966 const char* hostname_start = first_slash + 2;
1967 const char* hostname_end = strchr(hostname_start, '/');
1968 if (hostname_end == NULL) {
1969 // No slash; end is first byte off end of the URL string
1970 hostname_end = url_hint + url_len;
1971 }
1972 size_t hostname_len = hostname_end - hostname_start;
1973 const char* port_start =
1974 (const char*)memchr(hostname_start, ':', hostname_len);
1975 if (port_start != NULL) {
1976 // Port; shorten hostname
1977 hostname_end = port_start;
1978 hostname_len = hostname_end - hostname_start;
1979 }
1980
1981 const char* tld_start = MyMemrchr(hostname_start, '.', hostname_len);
1982 if (tld_start != NULL) {
1983 // Remember the TLD we just found
1984 int tld_len = hostname_start + hostname_len - tld_start - 1;
1985 if (tld_len > (tld_hint_len - 1)) {
1986 tld_len = tld_hint_len - 1;
1987 }
1988 memcpy(tld_hint, tld_start + 1, tld_len);
1989 tld_hint[tld_len] = '\0';
1990 }
1991 *ret_host_start = hostname_start;
1992 *ret_host_len = hostname_len;
1993 return;
1994 }
1995 } else {
1996 strncpy(tld_hint, url_hint, tld_hint_len);
1997 tld_hint[tld_hint_len - 1] = '\0';
1998 }
1999 }
2000
// Apply hints, if any, to probabilities
// NOTE: Encoding probabilities are all zero at this point
//
// Hints are applied in this order: HTTP charset, META charset, caller-supplied
// encoding, UI language, then TLD (only when at most one earlier hint was
// applied). If nothing was applied, the corpus default distribution is used.
// On exit the resulting enc_prob vector is saved in hint_prob.
void ApplyHints(const char* url_hint,
                const char* http_charset_hint,
                const char* meta_charset_hint,
                const int encoding_hint,
                const Language language_hint,
                const CompactEncDet::TextCorpusType corpus_type,
                DetectEncodingState* destatep) {
  // Number of hints successfully applied so far
  int hint_count = 0;
  // url_hint can either be a full URL (preferred) or just top-level domain name
  // Extract the TLD from a full URL and use it for
  // a normal TLD hint

  char tld_hint[16];
  const char* hostname_start = NULL;
  int hostname_len = 0;
  ExtractTLD(url_hint, tld_hint, sizeof(tld_hint),
             &hostname_start, &hostname_len);


  // Initial hints give slight boost to Ascii-7-bit and code page 1252
  // Before each ApplyXxx call below, declared_enc_1 is copied to
  // declared_enc_2; the ApplyXxx routine may then update declared_enc_1.
  // This gives a boost to 1252 if one of HTTP/META is specified,
  // but this could be the wrong thing to do if Latin2/3/4/etc. is specified
  destatep->declared_enc_1 = F_CP1252;
  destatep->declared_enc_2 = F_ASCII_7_bit;

  // Applying various hints takes max of new hint and any old hint.
  // This does better on multiple hints than a weighted average

  // Weight is 0..100 percent
  if ((http_charset_hint != NULL) && (http_charset_hint[0] != '~')) {
    destatep->declared_enc_2 = destatep->declared_enc_1;
    hint_count += ApplyCharsetHint(http_charset_hint, 100, destatep);
    destatep->http_hint = kMapToEncoding[destatep->declared_enc_1];
    if ((destatep->declared_enc_1 == F_CP1252) ||
        (destatep->declared_enc_1 == F_Latin1)) {
      destatep->looking_for_latin_trigrams = true;
    }
  }
  if ((meta_charset_hint != NULL) && (meta_charset_hint[0] != '~')) {
    destatep->declared_enc_2 = destatep->declared_enc_1;
    hint_count += ApplyCharsetHint(meta_charset_hint, 100, destatep);
    destatep->meta_hint = kMapToEncoding[destatep->declared_enc_1];
    if ((destatep->declared_enc_1 == F_CP1252) ||
        (destatep->declared_enc_1 == F_Latin1)) {
      destatep->looking_for_latin_trigrams = true;
    }
  }
  if (encoding_hint != UNKNOWN_ENCODING) {
    destatep->declared_enc_2 = destatep->declared_enc_1;
    hint_count += ApplyEncodingHint(encoding_hint, 50, destatep);
  }
  if (language_hint != UNKNOWN_LANGUAGE) {
    destatep->declared_enc_2 = destatep->declared_enc_1;
    hint_count += ApplyUILanguageHint(language_hint, 50, destatep);
  }
  // Use the top level domain only if <=1 other hint was applied:
  //   no other hint:  apply the TLD at 100%
  //   one other hint: apply the TLD at 50%, unless the TLD is "com"
  //   otherwise:      ignore the TLD
  if (url_hint != NULL) {
    destatep->tld_hint = CompactEncDet::TopEncodingOfTLDHint(tld_hint);
    if (hint_count == 0) {
      // Apply with weight 100%
      destatep->declared_enc_2 = destatep->declared_enc_1;
      hint_count += ApplyTldHint(tld_hint, 100, destatep);
      if ((destatep->declared_enc_1 == F_CP1252) ||
          (destatep->declared_enc_1 == F_Latin1)) {
        destatep->looking_for_latin_trigrams = true;
      }
      if (strcmp("hu", tld_hint) == 0) {
        // Hungarian is particularly difficult to separate Latin2 from Latin1,
        // so always look for trigram scanning if bare TLD=hu hint
        destatep->looking_for_latin_trigrams = true;
      }
    // With exactly one other hint, treat .com as no TLD hint at all
    } else if ((hint_count == 1) && (strcmp("com", tld_hint) != 0)) {
      // Either shift weighting or consider doing no TLD here -- seems to
      // distract from correct charset= hints. Or perhaps apply only if
      // charset = Latin1/1252...
      // Apply with weight 50%
      destatep->declared_enc_2 = destatep->declared_enc_1;
      hint_count += ApplyTldHint(tld_hint, 50, destatep);
      if ((destatep->declared_enc_1 == F_CP1252) ||
          (destatep->declared_enc_1 == F_Latin1)) {
        destatep->looking_for_latin_trigrams = true;    // These need trigrams
      }
    }
    // Else ignore TLD hint entirely
  }

  // Use all-web default distribution if not even a TLD hint
  if (hint_count == 0) {
    destatep->looking_for_latin_trigrams = true;    // Default needs trigrams
    destatep->declared_enc_2 = destatep->declared_enc_1;
    hint_count += ApplyDefaultHint(corpus_type, destatep);
  }


// ISO-Microsoft Pairs
//    F_Latin1, F_CP1252,
//    F_Latin2, F_CP1250,   NOT really strict subset/superset pairs
//    F_Latin3,
//    F_Latin4,
//    F_ISO_8859_5, F_CP1251,
//    F_Arabic, F_CP1256,   NOT
//    F_Greek, F_CP1253,    NOT really pairs
//                          (or upgrade incvt to make Greek use CP)
//    F_Hebrew, F_CP1255,   NOT really pairs
//    F_Latin5, F_CP1254,
//    F_Latin6,
//    F_ISO_8859_11,
//    F_ISO_8859_13, F_CP1257,
//    F_ISO_8859_15,
// ISO-Microsoft Pairs

  // Get important families started together
  // This should fall out of the initialization vectors for charset,
  // but we need to get rid of families altogether
  //
  // TODO make this more graceful

  // Add small bias for subsets

  // Subtract small bias for supersets: each superset encoding starts
  // kSmallInitDiff below its base encoding
  destatep->enc_prob[F_CP932] = destatep->enc_prob[F_SJS] - kSmallInitDiff;

  destatep->enc_prob[F_GBK] = destatep->enc_prob[F_GB] - kSmallInitDiff;
  destatep->enc_prob[F_GB18030] = destatep->enc_prob[F_GB] - kSmallInitDiff;

  destatep->enc_prob[F_BIG5_CP950] = destatep->enc_prob[F_BIG5] -
    kSmallInitDiff;
  destatep->enc_prob[F_BIG5_HKSCS] = destatep->enc_prob[F_BIG5] -
    kSmallInitDiff;

  // Deliberate over-bias Ascii7 and underbias Binary [unneeded]
  // destatep->enc_prob[F_ASCII_7_bit] = destatep->enc_prob[F_ASCII_7_bit] + kSmallInitDiff;
  // destatep->enc_prob[F_BINARY] = destatep->enc_prob[F_BINARY] - (kBoostInitial / 2);

  if (destatep->debug_data != NULL) {
    // Show state at end of hints
    SetDetailsEncProb(destatep, 0, -1, "Endhints");
    if(FLAGS_enc_detect_detail2) {
      // Add a line showing the watched encoding(s)
      if (watch1_rankedenc >= 0) {
        SetDetailsEncProb(destatep, 0,
                          watch1_rankedenc, FLAGS_enc_detect_watch1);
      }
      if (watch2_rankedenc >= 0) {
        SetDetailsEncProb(destatep, 0,
                          watch2_rankedenc, FLAGS_enc_detect_watch2);
      }
    }     // End detail2
  }

  // If duplicate hints, set second one to ASCII_7BIT to prevent double-boost
  if (destatep->declared_enc_1 == destatep->declared_enc_2) {
    destatep->declared_enc_2 = F_ASCII_7_bit;
  }

  // FLAGS_force127 turns on Latin trigram processing unconditionally
  if (FLAGS_force127) {
    destatep->do_latin_trigrams = true;
    if (FLAGS_enc_detect_source) {
      PsHighlight(0, destatep->initial_src, 0, 2);
    }
  }


  // Optional usage counters
  if (FLAGS_counts && destatep->looking_for_latin_trigrams) {++looking_used;}
  if (FLAGS_counts && destatep->do_latin_trigrams) {++doing_used;}

  //
  // At this point, destatep->enc_prob[] is an initial probability vector based
  // on the given hints/default. In general, it spreads out least-likely
  // encodings to be about 2**-25 below the most-likely encoding.
  // For input text with lots of bigrams, an unlikely encoding can rise to
  // the top at a rate of about 2**6 per bigram, and more commonly 2**2 per
  // bigram. So more than 4 bigrams and commonly more than 12 are
  // needed to overcome the initial hints when the least-likely encoding
  // is in fact the correct answer. So if the entire text has very few bigrams
  // (as a two-word query might), it can be impossible for the correct
  // encoding to win.
  //
  // To compensate for this, we take the initial hint vector and effectively
  // apply it at the rate of 1/16 every bigram for the first 16 bigrams. The
  // actual mechanism is done just before the last prune.
  //

  // Remember Initial hint probabilities
  memcpy(destatep->hint_prob, destatep->enc_prob, sizeof(destatep->enc_prob));
}
2191
2192 // Look for specific high-value patterns in the first 4 bytes
2193 // Byte order marks (BOM)
2194 // EFBBBF UTF-8
2195 // FEFF UTF-16 BE
2196 // FFFE UTF-16 LE
2197 // FFFE0000 UTF-32 BE
2198 // 0000FEFF UTF-32 LE
2199 //
2200 // Likely UTF-x of seven-bit ASCII
2201 // 00xx UTF-16 BE xx printable ASCII
2202 // xx00 UTF-16 LE
2203 // 000000xx UTF-32 BE
2204 // xx000000 UTF-32 LE
2205 //
InitialBytesBoost(const uint8 * src,int text_length,DetectEncodingState * destatep)2206 void InitialBytesBoost(const uint8* src,
2207 int text_length,
2208 DetectEncodingState* destatep) {
2209 if (text_length < 4) {return;}
2210
2211 uint32 pair01 = (src[0] << 8) | src[1];
2212 uint32 pair23 = (src[2] << 8) | src[3];
2213 uint32 quad0123 = (pair01 << 16) | pair23;
2214
2215 bool utf_16_indication = false;
2216 bool utf_32_indication = false;
2217 int best_enc = -1;
2218
2219 // Byte order marks
2220 // UTF-8
2221 if ((quad0123 & 0xffffff00) == 0xEFBBBF00) {
2222 destatep->bom_hint = UTF8;
2223 Boost(destatep, F_UTF8, kBoostInitial * 2);
2224 Boost(destatep, F_UTF8UTF8, kBoostInitial * 2);
2225 best_enc = F_UTF8;
2226 // UTF-32 (test before UTF-16)
2227 } else if (quad0123 == 0x0000FEFF) {
2228 destatep->bom_hint = UTF32BE;
2229 Boost(destatep, F_UTF_32BE, kBoostInitial * 2);
2230 best_enc = F_UTF_32BE;
2231 } else if (quad0123 == 0xFFFE0000) {
2232 destatep->bom_hint = UTF32LE;
2233 Boost(destatep, F_UTF_32LE, kBoostInitial * 2);
2234 best_enc = F_UTF_32LE;
2235 // UTF-16
2236 } else if (pair01 == 0xFEFF) {
2237 destatep->bom_hint = UTF16BE;
2238 Boost(destatep, F_UTF_16BE, kBoostInitial * 3);
2239 best_enc = F_UTF_16BE;
2240 } else if (pair01 == 0xFFFE) {
2241 destatep->bom_hint = UTF16LE;
2242 Boost(destatep, F_UTF_16LE, kBoostInitial * 3);
2243 best_enc = F_UTF_16LE;
2244
2245 // Possible seven-bit ASCII encoded as UTF-16/32
2246 // UTF-32 (test before UTF-16)
2247 } else if (((quad0123 & 0xffffff00) == 0) &&
2248 (kIsPrintableAscii[src[3]] != 0)) {
2249 Boost(destatep, F_UTF_32BE, kBoostInitial);
2250 Whack(destatep, F_UTF_32LE, kBadPairWhack); // Illegal char
2251 best_enc = F_UTF_32BE;
2252 } else if (((quad0123 & 0x00ffffff) == 0) &&
2253 (kIsPrintableAscii[src[0]] != 0)) {
2254 Boost(destatep, F_UTF_32LE, kBoostInitial);
2255 Whack(destatep, F_UTF_32BE, kBadPairWhack); // Illegal char
2256 best_enc = F_UTF_32LE;
2257 } else if ((src[0] == 0x00) && (kIsPrintableAscii[src[1]] != 0)) {
2258 Boost(destatep, F_UTF_16BE, kBoostInitial);
2259 best_enc = F_UTF_16BE;
2260 } else if ((src[1] == 0x00) && (kIsPrintableAscii[src[0]] != 0)) {
2261 Boost(destatep, F_UTF_16LE, kBoostInitial);
2262 best_enc = F_UTF_16LE;
2263
2264 // Whack if 0000 or FFFF
2265 // UTF-32 (test before UTF-16)
2266 } else if (quad0123 == 0x00000000) {
2267 Whack(destatep, F_UTF_32BE, kBadPairWhack); // Illegal char
2268 Whack(destatep, F_UTF_32LE, kBadPairWhack);
2269 Whack(destatep, F_UTF_16BE, kBadPairWhack);
2270 Whack(destatep, F_UTF_16LE, kBadPairWhack);
2271 best_enc = -1;
2272 } else if (quad0123 == 0xffffffff) {
2273 Whack(destatep, F_UTF_32BE, kBadPairWhack); // Illegal char
2274 Whack(destatep, F_UTF_32LE, kBadPairWhack);
2275 Whack(destatep, F_UTF_16BE, kBadPairWhack);
2276 Whack(destatep, F_UTF_16LE, kBadPairWhack);
2277 best_enc = -1;
2278 } else if (pair01 == 0x0000) {
2279 Whack(destatep, F_UTF_16BE, kBadPairWhack); // Illegal char
2280 Whack(destatep, F_UTF_16LE, kBadPairWhack);
2281 best_enc = -1;
2282 } else if (pair01 == 0xffff) {
2283 Whack(destatep, F_UTF_16BE, kBadPairWhack); // Illegal char
2284 Whack(destatep, F_UTF_16LE, kBadPairWhack);
2285 best_enc = -1;
2286
2287
2288 // These are the first four bytes of some known binary file formats
2289
2290 // Boost BINARY bigtime if JPEG FFD8FFxx
2291 // Boost BINARY bigtime if png 89504E47 (.PNG)
2292 // Boost BINARY bigtime if gif 47494638 (GIF8)
2293 // Boost BINARY bigtime if zip 504B0304 (PK..)
2294 // Boost BINARY bigtime if gzip 1F8B08xx
2295 // Boost BINARY bigtime if gzip 78DAxxxx
2296 // Boost BINARY if PDF 25504446 (%PDF)
2297 // Boost BINARY if SWF (FWSx or CWSx where x <= 0x1f)
2298 } else if ((quad0123 & 0xffffff00) == 0xFFD8FF00) { // JPEG FFD8FFxx
2299 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2300 } else if (quad0123 == 0x89504E47) { // Hex 89 P N G
2301 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2302 } else if (quad0123 == 0x47494638) { // Hex GIF8
2303 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2304 } else if (quad0123 == 0x504B0304) { // Hex P K 03 04
2305 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2306 } else if ((quad0123 & 0xffffff00) == 0x1F8B0800) { // gzip 1F8B08xx
2307 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2308 } else if (pair01 == 0x78DA) { // gzip 78DAxxxx
2309 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2310 } else if (quad0123 == 0x25504446) { // Hex %PDF
2311 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2312 } else if ((quad0123 & 0xffffff1f) == 0x66535700) { // Hex FWSx
2313 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2314 } else if ((quad0123 & 0xffffff1f) == 0x63535700) { // Hex CWSx
2315 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2316
2317 // More binary detect prefixes
2318 // 7F E L F Executable and linking format
2319 // M M 00 * TIFF (little-endian)
2320 // * 00 M M TIFF (big-endian)
2321 // 01 f c p Final cut pro
2322 } else if (quad0123 == 0x7F454C46) { // Hex 7F E L F
2323 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2324 } else if (quad0123 == 0x4D4D002A) { // Hex M M 00 *
2325 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2326 } else if (quad0123 == 0x2A004D4D) { // Hex * 00 M M
2327 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2328 } else if (quad0123 == 0x01666370) { // Hex 01 f c p
2329 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2330
2331 // More binary detect prefixes; all-ASCII names; heavy weight to avoid ASCII
2332 // prefix overcoming binary
2333 // C C S D USGS ISIS 3-D cube files
2334 // S I M P FITS image header "SIMPLE "
2335 } else if (quad0123 == 0x43435344) { // Hex C C S D
2336 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2337 } else if (quad0123 == 0x53494D50) { // Hex S I M P
2338 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2339
2340 // More binary detect prefixes; all-ASCII names; lighter weight
2341 // H W P Hangul word processor
2342 // 8 B P S Photoshop
2343 // P D S _ xx "PDS_VERSION_ID "
2344 } else if (quad0123 == 0x48575020) { // Hex H W P
2345 if ((19 <= text_length) &&
2346 (memcmp(src, "HWP.Document.File.V", 19) == 0)) {
2347 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2348 } else if ((19 <= text_length) &&
2349 (memcmp(src, "HWP Document File V", 19) == 0)) {
2350 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2351 } else {
2352 Boost(destatep, F_BINARY, kBoostInitial * kWeakerBinary);
2353 }
2354 } else if (quad0123 == 0x38425053) { // Hex 8 B P S
2355 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2356 } else if (quad0123 == 0x5044535F) { // Hex P D S _
2357 if ((14 <= text_length) && (memcmp(src, "PDS_VERSION_ID", 14) == 0)) {
2358 Boost(destatep, F_BINARY, kBoostInitial * kStrongBinary);
2359 } else {
2360 Boost(destatep, F_BINARY, kBoostInitial * kWeakerBinary);
2361 }
2362 }
2363
2364 // There are several main Windows EXE file formats.
2365 // Not examined here (prefix too short; never see them in Google pipeline)
2366 // M Z DOS .exe Mark Zbikowski
2367 // N E DOS 4.0 16-bit
2368 // L E OS/2 VxD drivers
2369 // L X OS/2
2370 // P E Windows NT
2371
2372
2373 // More user-defined
2374 // http://www.freenet.am/armscii/ Armenian
2375
2376 // If any hints or BOM, etc. keep UTF 16/32 around
2377 if ((destatep->enc_prob[F_UTF_16BE] > 0) ||
2378 (destatep->enc_prob[F_UTF_16LE] > 0)) {
2379 utf_16_indication = true;
2380 }
2381 if ((destatep->enc_prob[F_UTF_32BE] > 0) ||
2382 (destatep->enc_prob[F_UTF_32LE] > 0)) {
2383 utf_32_indication = true;
2384 }
2385
2386
2387 // Kill UTF16/32 right now if no positive indication of them
2388 // Otherwise, they tend to rise to the top in 7-bit files with an
2389 // occasional 0x02 byte in some comment or javascript
2390 if (!utf_16_indication) {
2391 Whack(destatep, F_UTF_16BE, kBadPairWhack * 8);
2392 Whack(destatep, F_UTF_16LE, kBadPairWhack * 8);
2393 Whack(destatep, F_Unicode, kBadPairWhack * 8);
2394 }
2395 if (!utf_32_indication) {
2396 Whack(destatep, F_UTF_32BE, kBadPairWhack * 8);
2397 Whack(destatep, F_UTF_32LE, kBadPairWhack * 8);
2398 }
2399
2400 // Usually kill mixed encodings
2401 if (!FLAGS_ced_allow_utf8utf8) {
2402 Whack(destatep, F_UTF8UTF8, kBadPairWhack * 8);
2403 }
2404 // 2011.11.07 never use UTF8CP1252 -- answer will be UTF8 instead
2405 Whack(destatep, F_UTF8CP1252, kBadPairWhack * 8);
2406
2407 if (destatep->debug_data != NULL) {
2408 // Show first four bytes of the input
2409 char buff[16];
2410 snprintf(buff, sizeof(buff), "%04x%04x", pair01, pair23);
2411 SetDetailsEncProb(destatep, 0, best_enc, buff);
2412 }
2413 }
2414
2415
2416
// qsort-style comparator over ints that yields DESCENDING order:
// returns 1 if *v1 < *v2, -1 if *v1 > *v2, and 0 if they are equal.
int IntCompare(const void* v1, const void* v2) {
  const int lhs = *static_cast<const int*>(v1);
  const int rhs = *static_cast<const int*>(v2);
  // Each comparison is 0 or 1; at most one of them is 1
  return static_cast<int>(lhs < rhs) - static_cast<int>(lhs > rhs);
}
2425
Base64Char(uint8 c)2426 bool Base64Char(uint8 c) {
2427 if (('A' <= c) && (c <= 'Z')) {return true;}
2428 if (('a' <= c) && (c <= 'z')) {return true;}
2429 if (('0' <= c) && (c <= '9')) {return true;}
2430 if ('+' == c) {return true;}
2431 if ('/' == c) {return true;}
2432 return false;
2433 }
2434
Base64ScanLen(const uint8 * start,const uint8 * limit)2435 int Base64ScanLen(const uint8* start, const uint8* limit) {
2436 // We have a plausible beginning; scan entire base64 string
2437 const uint8* ib64str = start;
2438 const uint8* b64str = ib64str;
2439 const uint8* b64strlimit = limit;
2440 // if starts with + +++, assume it is drawing, so bogus
2441 if (((limit - start) > 3) && (start[0] == '+') &&
2442 (start[1] == '+') && (start[2] == '+')) {
2443 return 81;
2444 }
2445 // Scan over base64
2446 while ((b64str < b64strlimit) && (kBase64Value[*b64str++] >= 0)) {
2447 }
2448 b64str--; // We overshot by 1
2449 return b64str - ib64str;
2450 }
2451
2452 // Input is at least 8-character legal base64 string after +.
2453 // But might be say + "Presse+Termine"
GoodUnicodeFromBase64(const uint8 * start,const uint8 * limit)2454 bool GoodUnicodeFromBase64(const uint8* start, const uint8* limit) {
2455 // Reject base64 string len N if density of '+' is > 1 + N/16 (expect 1/64)
2456 // Reject base64 string len N if density of A-Z is < 1 + N/16 (expect 26/64)
2457 // Reject base64 string len N if density of a-z is < 1 + N/16 (expect 26/64)
2458 // Reject base64 string len N if density of 0-9 is < 1 + N/32 (expect 10/64)
2459 // NOTE: this requires at least one lower AND one upper AND one digit to pass
2460 //
2461 int plus_count = 0;
2462 int lower_count = 0;
2463 int upper_count = 0;
2464 int digit_count = 0;
2465 int len = limit - start;
2466 for (const uint8* src = start; src < limit; ++src) {
2467 uint8 c = *src;
2468 if (('a' <= c) && (c <= 'z')) {
2469 ++lower_count;
2470 } else if (('A' <= c) && (c <= 'Z')) {
2471 ++upper_count;
2472 } else if (('0' <= c) && (c <= '0')) {
2473 ++digit_count;
2474 } else if (*src == '+') {
2475 ++plus_count;
2476 }
2477 }
2478
2479 if (plus_count > (1 + (len >> 4))) {return false;}
2480 if (lower_count < (1 + (len >> 4))) {return false;}
2481 if (upper_count < (1 + (len >> 4))) {return false;}
2482 if (digit_count < (1 + (len >> 5))) {return false;}
2483
2484 // checking the last character to reduce false positive
2485 // since the last character may be padded to 0 bits at the end.
2486 // refer to http://en.wikipedia.org/wiki/UTF-7
2487 int nmod8 = len & 7;
2488 const uint8 last = *(start+len-1);
2489 // When UTF-7 string length%8=3, the last two bits must be padded as 0
2490 if ((nmod8 == 3) && (kBase64Value[last] & 3)) {return false;}
2491 // When UTF-7 string length%8=6, the last four bits must be padded as 0
2492 if ((nmod8 == 6) && (kBase64Value[last] & 15)) {return false;}
2493 return true;
2494 }
2495
2496 // Prune here after N bytes
2497 // Boost here for seven-bit sequences (at every prune)
2498 // if (sevenbitrankedencoding)
2499 // + UTF7 scan and boost/demote len mod 8 = 0 3 6
2500 // ~ Hz scan and boost/demote len mod 8 = 0 2 4 6
2501 // 1B 2022 scan and boost/demote len mod 8 = 0 2 4 6
2502 // 0E 2022 scan and boost/demote len mod 8 = 0 2 4 6
2503 // [0F 2022 boost/demote]
2504 // 00 UTF16/32 scan and boost/demote offset = even/odd
2505 //
2506 // If still some seven-bit possibilities > pure ASCII,
2507 // scan each possibility for clearer prob, s.t. about
2508 // two good sequences is a clear win
2509 // A-Z 00-19 00xx-64xx (B = 04xx)
2510 // a-z 1A-33 68xx-CCxx (f = 7Cxx)
2511 // 0-9 34-3D D0xx-F4xx (1 = D4xx)
2512 // + 3E F8xx
2513 // / 3F FCxx
2514 // do another chunk with slow scan
2515
2516
2517 // Boost, whack, or leave alone UTF-7 probablilty
UTF7BoostWhack(DetectEncodingState * destatep,int next_pair,uint8 byte2)2518 void UTF7BoostWhack(DetectEncodingState* destatep, int next_pair, uint8 byte2) {
2519 int off = destatep->interesting_offsets[AsciiPair][next_pair];
2520 if (off >= destatep->prior_utf7_offset) {
2521 // Not part of a previous successful UTF-7 string
2522 ++destatep->utf7_starts;
2523
2524 if (byte2 == '-') {
2525 // +- encoding for '+' neutral
2526 } else if (!Base64Char(byte2)) {
2527 // Not base64 -- not UTF-7, whack
2528 Whack(destatep, F_UTF7, kBadPairWhack); // Illegal pair
2529 } else {
2530 // Starts with base64 byte, might be a good UTF7 sequence
2531 const uint8* start = destatep->initial_src + off + 1; // over the +
2532 int n = Base64ScanLen(start, destatep->limit_src);
2533 int nmod8 = n & 7;
2534 if ((n == 3) || (n == 6)) {
2535 // short but legal -- treat as neutral
2536 } else if ((nmod8 == 0) | (nmod8 == 3) | (nmod8 == 6)) {
2537 // Good length. Check for good Unicode.
2538 if (GoodUnicodeFromBase64(start, start + n)) {
2539 // Good length and Unicode, boost
2540 Boost(destatep, F_UTF7, kBoostOnePair); // Found good
2541 destatep->prior_utf7_offset = off + n + 1;
2542 } else {
2543 // Bad Unicode. Whack
2544 Whack(destatep, F_UTF7, kBadPairWhack); // Illegal length
2545 }
2546 } else {
2547 // Bad length. Whack
2548 Whack(destatep, F_UTF7, kBadPairWhack); // Illegal length
2549 }
2550 }
2551 }
2552 }
2553
2554 // Boost, whack, or leave alone HZ probablilty
HzBoostWhack(DetectEncodingState * destatep,uint8 byte2)2555 void HzBoostWhack(DetectEncodingState* destatep, uint8 byte2) {
2556 if ((byte2 == '{') || (byte2 == '}')) {
2557 Boost(destatep, F_HZ_GB_2312, kBoostOnePair); // Found ~{ or ~}
2558 } else if ((byte2 == '~') || (byte2 == '\n')) {
2559 destatep->enc_prob[F_HZ_GB_2312] += 0; // neutral
2560 } else {
2561 Whack(destatep, F_HZ_GB_2312, kBadPairWhack); // Illegal pair
2562 }
2563 }
2564
2565 // Boost, whack, or leave alone BINARY probablilty
BinaryBoostWhack(DetectEncodingState * destatep,uint8 byte1,uint8 byte2)2566 void BinaryBoostWhack(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) {
2567 int quadrant = ((byte1 & 0x80) >> 6) | ((byte2 & 0x80) >> 7);
2568 int bucket8x4 = ((byte1 & 0xe0) >> 3) | ((byte2 & 0xc0) >> 6);
2569 uint32 quad_mask = 1 << quadrant;
2570 uint32 bucket8x4_mask = 1 << bucket8x4;
2571 if ((destatep->binary_quadrants_seen & quad_mask) == 0) {
2572 destatep->binary_quadrants_seen |= quad_mask;
2573 destatep->binary_quadrants_count += 1;
2574 if (destatep->binary_quadrants_count == 4) {
2575 Boost(destatep, F_BINARY, kBoostOnePair * 2); // Found all 4 quadrants,
2576 // boost 2 pairs
2577 }
2578 }
2579 if ((destatep->binary_8x4_seen & bucket8x4_mask) == 0) {
2580 destatep->binary_8x4_seen |= bucket8x4_mask;
2581 destatep->binary_8x4_count += 1;
2582 if (destatep->binary_8x4_count >= 11) {
2583 Boost(destatep, F_BINARY, kBoostOnePair * 4); // Found 11+/20 buckets,
2584 // boost 4 pairs each time
2585 }
2586 }
2587 }
2588
2589
2590 // Demote UTF-16/32 on 0000 or FFFF, favoring Binary
UTF1632BoostWhack(DetectEncodingState * destatep,int offset,uint8 byte1)2591 void UTF1632BoostWhack(DetectEncodingState* destatep, int offset, uint8 byte1) {
2592 if (byte1 == 0) { // We have 0000
2593 Whack(destatep, F_UTF_16BE, kBadPairWhack); // Illegal pair
2594 Whack(destatep, F_UTF_16LE, kBadPairWhack); // Illegal pair
2595 switch (offset & 3) {
2596 case 0: // We get called with 0 4 8, etc. for ASCII/BMP as UTF-32BE
2597 Whack(destatep, F_UTF_32LE, kBadPairWhack); // Illegal pair
2598 Boost(destatep, F_UTF_32BE, kSmallInitDiff); // Good pair
2599 break;
2600 case 1: // We get called with 1 5 9, etc. for ASCII as UTF-32LE
2601 case 2: // We get called with 2 6 10, etc. for BMP as UTF-32LE
2602 Whack(destatep, F_UTF_32BE, kBadPairWhack); // Illegal pair
2603 Boost(destatep, F_UTF_32LE, kSmallInitDiff); // Good pair
2604 break;
2605 case 3: // ambiguous
2606 break;
2607 }
2608 } else { // We have ffff
2609 Whack(destatep, F_UTF_32BE, kBadPairWhack); // Illegal pair
2610 Whack(destatep, F_UTF_32LE, kBadPairWhack); // Illegal pair
2611 Whack(destatep, F_UTF_16BE, kBadPairWhack); // Illegal pair
2612 Whack(destatep, F_UTF_16LE, kBadPairWhack); // Illegal pair
2613 }
2614 }
2615
2616 // Make even offset
UTF16MakeEven(DetectEncodingState * destatep,int next_pair)2617 void UTF16MakeEven(DetectEncodingState* destatep, int next_pair) {
2618 destatep->interesting_offsets[OtherPair][next_pair] &= ~1;
2619 }
2620
ConsecutivePair(DetectEncodingState * destatep,int i)2621 bool ConsecutivePair(DetectEncodingState* destatep, int i) {
2622 if (i <= 0) {
2623 return false;
2624 }
2625 return destatep->interesting_offsets[OtherPair][i] ==
2626 (destatep->interesting_offsets[OtherPair][i - 1] + 2);
2627 }
2628
2629 // boost, whack, or leave alone UTF-8 probablilty
2630 // Any whacks are also applied to UTF8UTF8; CheckUTF8UTF8Seq assumes good UTF8
2631 // Returns total boost
CheckUTF8Seq(DetectEncodingState * destatep,int weightshift)2632 int CheckUTF8Seq(DetectEncodingState* destatep, int weightshift) {
2633 int startcount = destatep->prior_interesting_pair[OtherPair];
2634 int endcount = destatep->next_interesting_pair[OtherPair];
2635
2636 int demotion_count = 0;
2637 for (int i = startcount; i < endcount; ++i) {
2638 int sub;
2639 char* s = &destatep->interesting_pairs[OtherPair][i * 2];
2640 // Demote four byte patterns that are more likely Latin1 than UTF-8
2641 // C9AE, DF92, DF93, DFAB. See note at top.
2642 // Demotion also boosts Latin1 and CP1252
2643 uint8 s0 = static_cast<uint8>(s[0]);
2644 uint8 s1 = static_cast<uint8>(s[1]);
2645 if ((s0 == 0xc9) && (s1 == 0xae)) {++demotion_count;}
2646 if ((s0 == 0xdf) && (s1 == 0x92)) {++demotion_count;}
2647 if ((s0 == 0xdf) && (s1 == 0x93)) {++demotion_count;}
2648 if ((s0 == 0xdf) && (s1 == 0xab)) {++demotion_count;}
2649
2650 if (!ConsecutivePair(destatep, i)) {
2651 // Insert a blank into the sequence; avoid wrong splices
2652 sub = (' ' >> 4) & 0x0f;
2653 ++destatep->utf8_minicount[
2654 static_cast<int>(kMiniUTF8Count[static_cast<int>(destatep->next_utf8_ministate)][sub])];
2655 destatep->next_utf8_ministate =
2656 kMiniUTF8State[destatep->next_utf8_ministate][sub];
2657 }
2658 // Byte 0
2659 sub = (s0 >> 4) & 0x0f;
2660 ++destatep->utf8_minicount[
2661 static_cast<int>(kMiniUTF8Count[static_cast<int>(destatep->next_utf8_ministate)][sub])];
2662 destatep->next_utf8_ministate =
2663 kMiniUTF8State[destatep->next_utf8_ministate][sub];
2664 // Byte 1
2665 sub = (s1 >> 4) & 0x0f;
2666 ++destatep->utf8_minicount[
2667 static_cast<int>(kMiniUTF8Count[static_cast<int>(destatep->next_utf8_ministate)][sub])];
2668 destatep->next_utf8_ministate =
2669 kMiniUTF8State[destatep->next_utf8_ministate][sub];
2670 DCHECK((0 <= destatep->next_utf8_ministate) &&
2671 (destatep->next_utf8_ministate < 8));
2672 }
2673
2674
2675 // For the four specific byte combinations above, Latin1/CP1252 is more likely
2676 if (demotion_count > 0) {
2677 Boost(destatep, F_Latin1, kGentleOnePair * demotion_count);
2678 Boost(destatep, F_CP1252, kGentleOnePair * demotion_count);
2679 }
2680
2681 // Boost UTF8 for completed good sequences
2682 int total_boost = 2 * destatep->utf8_minicount[2] +
2683 3 * destatep->utf8_minicount[3] +
2684 4 * destatep->utf8_minicount[4];
2685 // But not so much for demoted bytes
2686 total_boost -= (3 * demotion_count);
2687
2688 total_boost *= kGentleOnePair;
2689 total_boost >>= weightshift;
2690 // Design: boost both UTF8 and UTF8UTF8 for each good sequence
2691 Boost(destatep, F_UTF8, total_boost);
2692 Boost(destatep, F_UTF8UTF8, total_boost);
2693
2694 destatep->utf8_minicount[5] += destatep->utf8_minicount[2]; // total chars
2695 destatep->utf8_minicount[5] += destatep->utf8_minicount[3]; // total chars
2696 destatep->utf8_minicount[5] += destatep->utf8_minicount[4]; // total chars
2697 destatep->utf8_minicount[2] = 0;
2698 destatep->utf8_minicount[3] = 0;
2699 destatep->utf8_minicount[4] = 0;
2700
2701 // Whack (2 bytes) for errors
2702 int error_whack = 2 * destatep->utf8_minicount[1];
2703 error_whack *= kGentlePairWhack;
2704 error_whack >>= weightshift;
2705 Whack(destatep, F_UTF8, error_whack);
2706 Whack(destatep, F_UTF8UTF8, error_whack);
2707 destatep->utf8_minicount[1] = 0;
2708
2709 return total_boost - error_whack;
2710 }
2711
2712
// Boost, whack, or leave alone UTF8UTF8 probability
2714 //
2715 // We are looking for
2716 // (1) chars ONLY in set UTF8(0080)..UTF8(00FF), including for 80..9F the
2717 // MS CP1252 mappings, and
2718 // (2) sequences of 2 or more such characters
2719 //
2720 // If so, we could be looking at some non-7-bit encoding extra-converted
2721 // to UTF-8. The most common observed is CP1252->UTF8 twice,
2722 // 1252=>UTF8 : 1252=>UTF8
2723 // where the colon means "take those bytes and pretend that they are 1252".
2724 // We have a couple of examples of BIG5 bytes converted as though
2725 // they were 1252,
2726 // BIG5 : 1252=>UTF8
2727 //
2728 // Of course, we don't want correctly converted 1252 to be flagged here
2729 // 1252=>UTF8
2730 // So we want the input high bytes to be in pairs or longer, hence the
2731 // output UTF8 in groups of four bytes or more
2732 //
2733 // Good chars: C2xx, C3xx,
2734 // Good chars: C592, C593, C5A0, C5A1, C5B8, C5BD, C5BE, C692, CB86, CB9C
2735 // Good chars: E280xx E282AC E284A2
2736 // C2xx 1100001x 10xxxxxx (128/128)
2737 // C5xx 11000101 10xx00xx (16/4)
2738 // C5xx 11000101 10111xxx (8/3)
2739 // C692 11000110 10010010 (1/1)
2740 // CBxx 11001011 100xx1x0 (8/2)
2741 // E28x 11100010 10000xx0 (4/3)
2742 //
2743 // Returns total boost
CheckUTF8UTF8Seq(DetectEncodingState * destatep,int weightshift)2744 int CheckUTF8UTF8Seq(DetectEncodingState* destatep, int weightshift) {
2745 int this_pair = destatep->prior_interesting_pair[OtherPair];
2746 int startbyteoffset = this_pair * 2;
2747 int endbyteoffset = destatep->next_interesting_pair[OtherPair] * 2;
2748 char* startbyte = &destatep->interesting_pairs[OtherPair][startbyteoffset];
2749 char* endbyte = &destatep->interesting_pairs[OtherPair][endbyteoffset];
2750
2751 int pair_number = this_pair;
2752 for (char* s = startbyte; s < endbyte; s += 2) {
2753 int next = destatep->next_utf8utf8_ministate;
2754 if (!ConsecutivePair(destatep, pair_number)) {
2755 // Insert two blanks into the sequence to avoid wrong splices
2756 // go back to no odd-byte offset
2757 destatep->utf8utf8_odd_byte = 0;
2758 int sub = UTF88Sub(' ', ' ');
2759 ++destatep->utf8utf8_minicount[static_cast<int>(kMiniUTF8UTF8Count[next][sub])];
2760 next = kMiniUTF8UTF8State[next][sub];
2761 }
2762
2763 int odd = destatep->utf8utf8_odd_byte;
2764 if (s + 1 + odd >= endbyte) continue;
2765 int sub = UTF88Sub(s[0 + odd], s[1 + odd]);
2766 destatep->utf8utf8_odd_byte ^= kMiniUTF8UTF8Odd[next][sub];
2767 ++destatep->utf8utf8_minicount[
2768 static_cast<int>(kMiniUTF8UTF8Count[next][sub])];
2769 destatep->next_utf8utf8_ministate = kMiniUTF8UTF8State[next][sub];
2770 ++pair_number;
2771 }
2772
2773 // Boost for completed good sequences; each count covers two chars.
2774 // Design: boost UTF8UTF8 above UTF8 for each good sequence
2775 int total_boost = (2) * destatep->utf8utf8_minicount[2] +
2776 (2) * destatep->utf8utf8_minicount[3] +
2777 (2) * destatep->utf8utf8_minicount[4];
2778 total_boost *= kGentleOnePair;
2779 total_boost >>= weightshift;
2780 Boost(destatep, F_UTF8UTF8, total_boost);
2781
2782 // Track total characters
2783 destatep->utf8utf8_minicount[5] += destatep->utf8utf8_minicount[2];
2784 destatep->utf8utf8_minicount[5] += destatep->utf8utf8_minicount[3];
2785 destatep->utf8utf8_minicount[5] += destatep->utf8utf8_minicount[4];
2786 destatep->utf8utf8_minicount[2] = 0;
2787 destatep->utf8utf8_minicount[3] = 0;
2788 destatep->utf8utf8_minicount[4] = 0;
2789
2790 // Design: Do not whack UTF8UTF8 below UTF8 for each bad sequence
2791
2792 destatep->utf8utf8_minicount[1] = 0;
2793 return total_boost;
2794 }
2795
2796
2797 // We give a gentle boost for each paired SO ... SI, whack others
CheckIso2022ActiveSeq(DetectEncodingState * destatep)2798 void CheckIso2022ActiveSeq(DetectEncodingState* destatep) {
2799 int this_pair = destatep->prior_interesting_pair[OtherPair];
2800 int startbyteoffset = this_pair * 2;
2801 int endbyteoffset = destatep->next_interesting_pair[OtherPair] * 2;
2802 char* startbyte = &destatep->interesting_pairs[OtherPair][startbyteoffset];
2803 char* endbyte = &destatep->interesting_pairs[OtherPair][endbyteoffset];
2804
2805 // Initial <esc> char must precede SO/SI
2806 // HZ_GB_2312 has no alternation constraint on 1- and 2-byte segments
2807 // ISO-2022-JP (JIS) has no alternation constraint on 1- and 2-byte segments
2808 // ISO-2022-CN has no alternation constraint on 1- and 2-byte segments
2809 // ISO-2022-KR requires alternation between 1- and 2-byte segments
2810 // JIS:
2811 // <esc> ( B ISO-2022-JP [1b 28 42] SI to ASCII
2812 // <esc> ( J ISO-2022-JP [1b 28 4a] SI to X0201
2813 // <esc> $ @ ISO-2022-JP [1b 24 40] SO to X0208-78 twobyte
2814 // <esc> $ B ISO-2022-JP [1b 24 42] SO to X0208-83 twobyte
2815 for (char* s = startbyte; s < endbyte; s += 2) {
2816 if (s[0] == 0x1b) {
2817 if (s[1] == 0x24) {
2818 // <esc> $ is SO
2819 destatep->next_2022_state = SOSI_TWOBYTE; // SO to two-byte
2820 } else if (s[1] == 0x28) {
2821 if (destatep->next_2022_state == SOSI_TWOBYTE) {
2822 Boost(destatep, F_JIS, kGentlePairBoost);
2823 } else if (destatep->next_2022_state == SOSI_ONEBYTE) {
2824 Whack(destatep, F_JIS, kGentlePairWhack);
2825 }
2826 destatep->next_2022_state = SOSI_ONEBYTE; // JIS SI to one-byte
2827 } else {
2828 Whack(destatep, F_JIS, kBadPairWhack);
2829 Whack(destatep, F_ISO_2022_CN, kBadPairWhack);
2830 Whack(destatep, F_ISO_2022_KR, kBadPairWhack);
2831 destatep->next_2022_state = SOSI_ERROR; // not 2022
2832 }
2833 } else if (s[0] == 0x0e) {
2834 // <so>
2835 Whack(destatep, F_JIS, kBadPairWhack);
2836 if (destatep->next_2022_state != SOSI_NONE) {
2837 destatep->next_2022_state = SOSI_TWOBYTE; // SO to two-byte
2838 } else {
2839 // ESC required before SO/SI
2840 Whack(destatep, F_ISO_2022_CN, kBadPairWhack * 4);
2841 Whack(destatep, F_ISO_2022_KR, kBadPairWhack * 4);
2842 destatep->next_2022_state = SOSI_ERROR; // SO not after SI
2843 }
2844 } else if (s[0] == 0x0f) {
2845 // <si>
2846 Whack(destatep, F_JIS, kBadPairWhack);
2847 if (destatep->next_2022_state != SOSI_NONE) {
2848 if (destatep->next_2022_state == SOSI_TWOBYTE) {
2849 Boost(destatep, F_ISO_2022_CN, kGentlePairBoost);
2850 Boost(destatep, F_ISO_2022_KR, kGentlePairBoost);
2851 } else if (destatep->next_2022_state == SOSI_ONEBYTE) {
2852 Whack(destatep, F_ISO_2022_CN, kGentlePairWhack);
2853 Whack(destatep, F_ISO_2022_KR, kGentlePairWhack);
2854 }
2855 destatep->next_2022_state = SOSI_ONEBYTE; // SI to one-byte
2856 } else {
2857 // ESC required before SO/SI
2858 Whack(destatep, F_ISO_2022_CN, kBadPairWhack * 4);
2859 Whack(destatep, F_ISO_2022_KR, kBadPairWhack * 4);
2860 destatep->next_2022_state = SOSI_ERROR; // SI not after SO
2861 }
2862 } else if (s[0] <= 0x1f) {
2863 // Some other control code. Allow ht lf [ff] cr
2864 if ((s[0] != 0x09) && (s[0] != 0x0a) &&
2865 (s[0] != 0x0c) && (s[0] != 0x0d)) {
2866 // Otherwise these can float to the top on bad bytes
2867 Whack(destatep, F_JIS, kBadPairWhack);
2868 Whack(destatep, F_ISO_2022_CN, kBadPairWhack);
2869 Whack(destatep, F_ISO_2022_KR, kBadPairWhack);
2870 }
2871 }
2872 }
2873
2874 // If no start, keep the probability pinned at zero (or below)
2875 if (destatep->next_2022_state == SOSI_NONE) {
2876 destatep->enc_prob[F_ISO_2022_CN] =
2877 minint(0, destatep->enc_prob[F_ISO_2022_CN]);
2878 destatep->enc_prob[F_ISO_2022_KR] =
2879 minint(0, destatep->enc_prob[F_ISO_2022_KR]);
2880 destatep->enc_prob[F_JIS] =
2881 minint(0, destatep->enc_prob[F_JIS]);
2882 }
2883 }
2884
2885 // We give a gentle boost for each paired ~{ ... ~}, whack others
CheckHzActiveSeq(DetectEncodingState * destatep)2886 void CheckHzActiveSeq(DetectEncodingState* destatep) {
2887 int this_pair = destatep->prior_interesting_pair[AsciiPair];
2888 int startbyteoffset = this_pair * 2;
2889 int endbyteoffset = destatep->next_interesting_pair[AsciiPair] * 2;
2890 char* startbyte = &destatep->interesting_pairs[AsciiPair][startbyteoffset];
2891 char* endbyte = &destatep->interesting_pairs[AsciiPair][endbyteoffset];
2892
2893 for (char* s = startbyte; s < endbyte; s += 2) {
2894 // Look for initial ~{ pair
2895 if ((s[0] == '~') && (s[1] == '{')) {
2896 destatep->next_hz_state = SOSI_TWOBYTE; // SO to two-byte
2897 }
2898 // Also look for closing ~} pair
2899 if ((s[0] == '~') && (s[1] == '}')) {
2900 if (destatep->next_hz_state == SOSI_TWOBYTE) {
2901 Boost(destatep, F_HZ_GB_2312, kGentlePairBoost);
2902 } else if (destatep->next_hz_state == SOSI_ONEBYTE) {
2903 Whack(destatep, F_HZ_GB_2312, kGentlePairWhack);
2904 }
2905 destatep->next_hz_state = SOSI_ONEBYTE; // SI to one-byte
2906 }
2907 }
2908
2909 // If no start, keep the probability pinned at zero (or below)
2910 if (destatep->next_hz_state == SOSI_NONE) {
2911 destatep->enc_prob[F_HZ_GB_2312] =
2912 minint(0, destatep->enc_prob[F_HZ_GB_2312]);
2913 }
2914 }
2915
2916 // We give a gentle boost after an odd number of 8Fxxxx triples, which
2917 // put subsequent bigrams out of phase until a low byte or another 8Fxxxx
CheckEucJpSeq(DetectEncodingState * destatep)2918 void CheckEucJpSeq(DetectEncodingState* destatep) {
2919 int this_pair = destatep->prior_interesting_pair[OtherPair];
2920 int startbyteoffset = this_pair * 2;
2921 int endbyteoffset = destatep->next_interesting_pair[OtherPair] * 2;
2922 char* startbyte = &destatep->interesting_pairs[OtherPair][startbyteoffset];
2923 char* endbyte = &destatep->interesting_pairs[OtherPair][endbyteoffset];
2924
2925 for (char* s = startbyte; s < endbyte; s += 2) {
2926 // Boost if out of phase (otherwise, EUC-JP will score badly after 8Fxxxx)
2927 if (destatep->next_eucjp_oddphase) {
2928 //printf(" EucJp boost[%02x%02x]\n", s[0], s[1]); // TEMP
2929 Boost(destatep, F_EUC_JP, kGentlePairBoost * 2);
2930 }
2931
2932 uint8 s0 = static_cast<uint8>(s[0]);
2933 uint8 s1 = static_cast<uint8>(s[1]);
2934 // Look for phase flip at 8F
2935 if ((s0 & 0x80) == 0x00) {
2936 destatep->next_eucjp_oddphase = false;
2937 } else if (s0 == 0x8f) {
2938 destatep->next_eucjp_oddphase = !destatep->next_eucjp_oddphase;
2939 }
2940 if ((s1 & 0x80) == 0x00) {
2941 destatep->next_eucjp_oddphase = false;
2942 } else if (s1 == 0x8f) {
2943 destatep->next_eucjp_oddphase = !destatep->next_eucjp_oddphase;
2944 }
2945 }
2946 }
2947
2948 // Boost, whack, or leave alone BINARY probablilty
2949 // Also called if UTF 16/32 active
CheckBinaryDensity(const uint8 * src,DetectEncodingState * destatep,int delta_otherpairs)2950 void CheckBinaryDensity(const uint8* src, DetectEncodingState* destatep,
2951 int delta_otherpairs) {
2952 // No change if not much gathered information
2953 if (delta_otherpairs == 0) {
2954 // Only ASCII pairs this call
2955 return;
2956 }
2957 int next_pair = destatep->next_interesting_pair[OtherPair];
2958
2959 // Look at density of interesting pairs [0..src)
2960 int delta_offset = static_cast<int>(src - destatep->initial_src); // actual
2961
2962 // Look at density of interesting pairs [0..next_interesting)
2963 int low_byte = destatep->interesting_offsets[OtherPair][0];
2964 //int high_byte = destatep->interesting_offsets[OtherPair][next_pair - 1] + 2;
2965 //int byte_span = high_byte - low_byte;
2966 int byte_span = delta_offset - low_byte;
2967
2968 // If all ASCII for the first 4KB, reject
2969 // If mostly ASCII in the first 5KB, reject
2970 if ((low_byte >= kBinaryHardAsciiLimit) || (delta_offset >= kBinarySoftAsciiLimit)) {
2971 // Not binary early enough in text
2972 Whack(destatep, F_BINARY, kBadPairWhack * 4);
2973 Whack(destatep, F_UTF_32BE, kBadPairWhack * 4);
2974 Whack(destatep, F_UTF_32LE, kBadPairWhack * 4);
2975 Whack(destatep, F_UTF_16BE, kBadPairWhack * 4);
2976 Whack(destatep, F_UTF_16LE, kBadPairWhack * 4);
2977 return;
2978 }
2979
2980 // Density 1.0 for N pairs takes 2*N bytes
2981 // Whack if < 1/16 after first non_ASCII pair
2982 if ((next_pair * 2 * 16) < byte_span) {
2983 // Not dense enough
2984 Whack(destatep, F_BINARY, kBadPairWhack * 4);
2985 Whack(destatep, F_UTF_32BE, kBadPairWhack * 4);
2986 Whack(destatep, F_UTF_32LE, kBadPairWhack * 4);
2987 Whack(destatep, F_UTF_16BE, kBadPairWhack * 4);
2988 Whack(destatep, F_UTF_16LE, kBadPairWhack * 4);
2989 }
2990
2991 if (next_pair < 8) {
2992 // Fewer than 8 non-ASCII total; too soon to boost
2993 return;
2994 }
2995
2996 // Density 1.0 for N pairs takes 2*N bytes
2997 // Boost if density >= 1/4, whack if < 1/16
2998 if ((next_pair * 2 * 4) >= byte_span) {
2999 // Very dense
3000 // Only boost if at least 2 quadrants seen
3001 if (destatep->binary_quadrants_count >= 2) {
3002 Boost(destatep, F_BINARY, kSmallInitDiff);
3003 Boost(destatep, F_UTF_32BE, kSmallInitDiff);
3004 Boost(destatep, F_UTF_32LE, kSmallInitDiff);
3005 Boost(destatep, F_UTF_16BE, kSmallInitDiff);
3006 Boost(destatep, F_UTF_16LE, kSmallInitDiff);
3007 }
3008 }
3009 }
3010
3011
3012 // Look at a number of special-case encodings whose reliable detection depends
3013 // on sequencing or other properties
3014 // AsciiPair probibilities (UTF7 and HZ) are all done here
ActiveSpecialBoostWhack(const uint8 * src,DetectEncodingState * destatep)3015 void ActiveSpecialBoostWhack(const uint8* src, DetectEncodingState* destatep) {
3016 int delta_asciipairs = destatep->next_interesting_pair[AsciiPair] -
3017 destatep->prior_interesting_pair[AsciiPair];
3018 int delta_otherpairs = destatep->next_interesting_pair[OtherPair] -
3019 destatep->prior_interesting_pair[OtherPair];
3020
3021 // The two pure ASCII encodings
3022 if (UTF7OrHzActive(destatep) && (delta_asciipairs > 0)) {
3023 // Adjust per pair
3024 for (int i = 0; i < delta_asciipairs; ++i) {
3025 int next_pair = destatep->prior_interesting_pair[AsciiPair] + i;
3026 uint8 byte1 = destatep->interesting_pairs[AsciiPair][next_pair * 2 + 0];
3027 uint8 byte2 = destatep->interesting_pairs[AsciiPair][next_pair * 2 + 1];
3028 if (byte1 == '+') {
3029 // Boost, whack, or leave alone UTF-7 probablilty
3030 UTF7BoostWhack(destatep, next_pair, byte2);
3031 if (destatep->debug_data != NULL) {
3032 // Show UTF7 entry
3033 char buff[16];
3034 snprintf(buff, sizeof(buff), "%02x%02x+", byte1, byte2);
3035 SetDetailsEncProb(destatep,
3036 destatep->interesting_offsets[AsciiPair][next_pair],
3037 kMostLikelyEncoding[(byte1 << 8) + byte2],
3038 buff);
3039 }
3040 } else if (byte1 == '~') {
3041 // Boost, whack, or leave alone HZ probablilty
3042 HzBoostWhack(destatep, byte2);
3043 if (destatep->debug_data != NULL) {
3044 // Show Hz entry
3045 char buff[16];
3046 snprintf(buff, sizeof(buff), "%02x%02x~", byte1, byte2);
3047 SetDetailsEncProb(destatep,
3048 destatep->interesting_offsets[AsciiPair][next_pair],
3049 kMostLikelyEncoding[(byte1 << 8) + byte2],
3050 buff);
3051 }
3052 }
3053 }
3054
3055 // Kill UTF-7 now if at least 8 + pairs and not confirmed valid UTF-7
3056 if ((destatep->utf7_starts >= 8) && (destatep->prior_utf7_offset == 0)) {
3057 Whack(destatep, F_UTF7, kBadPairWhack * 8); // flush
3058 }
3059 }
3060
3061
3062
3063 // All the other encodings
3064 if (OtherActive(destatep) && (delta_otherpairs > 0)) {
3065 // Adjust per pair
3066 int biggest_weightshift = 0;
3067 for (int i = 0; i < delta_otherpairs; ++i) {
3068 int next_pair = destatep->prior_interesting_pair[OtherPair] + i;
3069 uint8 byte1 = destatep->interesting_pairs[OtherPair][next_pair * 2 + 0];
3070 uint8 byte2 = destatep->interesting_pairs[OtherPair][next_pair * 2 + 1];
3071 int off = destatep->interesting_offsets[OtherPair][next_pair];
3072 int weightshift = destatep->interesting_weightshift[OtherPair][next_pair];
3073 biggest_weightshift = maxint(biggest_weightshift, weightshift);
3074
3075 if (byte1 == 0x00) {
3076 if (byte2 == 0x00) {
3077 UTF1632BoostWhack(destatep, off, byte1);
3078 } else if ((kIsPrintableAscii[byte2] != 0) && ((off & 1) != 0)) {
3079 // We have 00xx at an odd offset. Turn into preceding even offset
3080 // for possible Ascii text in UTF-16LE or UTF-32LE (vs BE)
3081 // This will cascade into caller's probability update
3082 // 00 is illegal for all other encodings, so it doesn't matter to them
3083 UTF16MakeEven(destatep, next_pair);
3084 }
3085 if (destatep->debug_data != NULL) {
3086 // Show 0000 detail entry for this bigram
3087 char buff[16];
3088 snprintf(buff, sizeof(buff), "%02x%02xZ", byte1, byte2);
3089 SetDetailsEncProb(destatep,
3090 destatep->interesting_offsets[OtherPair][next_pair],
3091 kMostLikelyEncoding[(byte1 << 8) + byte2],
3092 buff);
3093 }
3094 }
3095 if (byte1 == 0xff) {
3096 if (byte2 == 0xff) {
3097 UTF1632BoostWhack(destatep, off, byte1);
3098 }
3099 if (destatep->debug_data != NULL) {
3100 // Show FFFF detail entry for this bigram
3101 char buff[16];
3102 snprintf(buff, sizeof(buff), "%02x%02xF", byte1, byte2);
3103 SetDetailsEncProb(destatep,
3104 destatep->interesting_offsets[OtherPair][next_pair],
3105 kMostLikelyEncoding[(byte1 << 8) + byte2],
3106 buff);
3107 }
3108 }
3109 if (BinaryActive(destatep)) {
3110 BinaryBoostWhack(destatep, byte1, byte2);
3111 }
3112 } // End for i
3113
3114 // Adjust per entire-pair-span
3115 if (UTF8Active(destatep)) {
3116 CheckUTF8Seq(destatep, biggest_weightshift);
3117 }
3118
3119 if (UTF8UTF8Active(destatep)) {
3120 CheckUTF8UTF8Seq(destatep, biggest_weightshift);
3121 }
3122
3123 if (Iso2022Active(destatep)) {
3124 CheckIso2022ActiveSeq(destatep);
3125 }
3126
3127 if (HzActive(destatep)) {
3128 CheckHzActiveSeq(destatep);
3129 }
3130
3131 if (EUCJPActive(destatep)) {
3132 CheckEucJpSeq(destatep);
3133 }
3134
3135 if (BinaryActive(destatep) || UTF1632Active(destatep)) {
3136 CheckBinaryDensity(src, destatep, delta_otherpairs);
3137 }
3138 }
3139 // ISO-2022 do OK on their own, using stright probabilities? Not on bad bytes
3140
3141 if (destatep->debug_data != NULL) {
3142 // Show sequencing result
3143 SetDetailsEncLabel(destatep, "seq");
3144 }
3145 }
3146
3147
PrintTopEnc(DetectEncodingState * destatep,int n)3148 void PrintTopEnc(DetectEncodingState* destatep, int n) {
3149 // Print top n or fewer
3150 int temp_sort[NUM_RANKEDENCODING];
3151 for (int j = 0; j < destatep->rankedencoding_list_len; ++j) {
3152 int rankedencoding = destatep->rankedencoding_list[j];
3153 temp_sort[j] = destatep->enc_prob[rankedencoding];
3154 }
3155
3156 qsort(temp_sort, destatep->rankedencoding_list_len,
3157 sizeof(temp_sort[0]), IntCompare);
3158
3159 int top_n = minint(n, destatep->rankedencoding_list_len);
3160 int showme = temp_sort[top_n - 1]; // Print this value and above
3161
3162 printf("rankedencodingList top %d: ", top_n);
3163 for (int j = 0; j < destatep->rankedencoding_list_len; ++j) {
3164 int rankedencoding = destatep->rankedencoding_list[j];
3165 if (showme <= destatep->enc_prob[rankedencoding]) {
3166 printf("%s=%d ",
3167 MyEncodingName(kMapToEncoding[rankedencoding]),
3168 destatep->enc_prob[rankedencoding]);
3169 }
3170 }
3171 printf("\n\n");
3172 }
3173
3174 // If the same bigram repeats, don't boost its best encoding too much
RepeatedBigram(DetectEncodingState * destatep,uint8 byte1,uint8 byte2)3175 bool RepeatedBigram(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) {
3176 int this_bigram = (byte1 << 8) | byte2;
3177 // If 00xx 01xx 02xx ... 1fxx, take out bottom 4 bits of xx.
3178 // This ignores parts of Yahoo 0255 0254 0243 0247 0245 0243 0250 0255 ...
3179 // It may screw up UTF-16BE
3180 // It may screw up ISO-2022 (1b24 suppresses 1b28)
3181 if (byte1 < 0x20) {
3182 this_bigram &= 0xfff0;
3183 }
3184 if (this_bigram == destatep->prior_bigram[0]) {return true;}
3185 if (this_bigram == destatep->prior_bigram[1]) {return true;}
3186 if (this_bigram == destatep->prior_bigram[2]) {return true;}
3187 if (this_bigram == destatep->prior_bigram[3]) {return true;}
3188 // Round-robin replacement
3189 destatep->prior_bigram[destatep->next_prior_bigram] = this_bigram;
3190 destatep->next_prior_bigram = (destatep->next_prior_bigram + 1) & 3;
3191 return false;
3192 }
3193
3194 // Sometimes illegal bytes are used as markers between text that Javascript
3195 // is going to decode. Don't overboost the Binary encoding for markers 01-FF.
3196 // Just count first pair per 8x4 bucket
RepeatedBinary(DetectEncodingState * destatep,uint8 byte1,uint8 byte2)3197 bool RepeatedBinary(DetectEncodingState* destatep, uint8 byte1, uint8 byte2) {
3198 int bucket8x4 = ((byte1 & 0xe0) >> 3) | ((byte2 & 0xc0) >> 6);
3199 uint32 bucket8x4_mask = 1 << bucket8x4;
3200 if ((destatep->binary_8x4_seen & bucket8x4_mask) == 0) {
3201 destatep->binary_8x4_seen |= bucket8x4_mask;
3202 destatep->binary_8x4_count += 1;
3203 return false;
3204 }
3205 return true;
3206 }
3207
3208
3209
3210
3211 // Find current top two rankedencoding probabilities
ReRank(DetectEncodingState * destatep)3212 void ReRank(DetectEncodingState* destatep) {
3213 destatep->top_prob = -1;
3214 destatep->second_top_prob = -1;
3215 // Leave unchanged
3216 //destatep->top_rankedencoding =
3217 // destatep->rankedencoding_list[0]; // Just to make well-defined
3218 //destatep->second_top_rankedencoding =
3219 // destatep->rankedencoding_list[1]; // Just to make well-defined
3220 for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
3221 int rankedencoding = destatep->rankedencoding_list[j];
3222 if (destatep->top_prob < destatep->enc_prob[rankedencoding]) {
3223 // Make sure top 2 are in different superset groups
3224 if (kMapEncToBaseEncoding[kMapToEncoding[destatep->top_rankedencoding]] !=
3225 kMapEncToBaseEncoding[kMapToEncoding[rankedencoding]]) {
3226 destatep->second_top_prob =
3227 destatep->top_prob; // old top to second
3228 destatep->second_top_rankedencoding =
3229 destatep->top_rankedencoding; // old top to second
3230 }
3231 destatep->top_prob = destatep->enc_prob[rankedencoding];
3232 destatep->top_rankedencoding = rankedencoding;
3233 } else if (destatep->second_top_prob < destatep->enc_prob[rankedencoding]) {
3234 if (kMapEncToBaseEncoding[kMapToEncoding[destatep->top_rankedencoding]] !=
3235 kMapEncToBaseEncoding[kMapToEncoding[rankedencoding]]) {
3236 destatep->second_top_prob = destatep->enc_prob[rankedencoding];
3237 destatep->second_top_rankedencoding = rankedencoding;
3238 }
3239 }
3240 }
3241 }
3242
SimplePrune(DetectEncodingState * destatep,int prune_diff)3243 void SimplePrune(DetectEncodingState* destatep, int prune_diff) {
3244 // Prune the list of active encoding families
3245 int keep_prob = destatep->top_prob - prune_diff;
3246
3247 destatep->active_special = 0;
3248 int k = 0;
3249 for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
3250 bool keep = true;
3251 int rankedencoding = destatep->rankedencoding_list[j];
3252
3253 // If count is too low, ditch it
3254 if (destatep->enc_prob[rankedencoding] < keep_prob) {keep = false;}
3255
3256 // Keep it. This will always keep at least top_prob rankedencoding
3257 if (keep) {
3258 destatep->active_special |= kSpecialMask[kMapToEncoding[rankedencoding]];
3259 destatep->rankedencoding_list[k++] = rankedencoding;
3260 }
3261 }
3262
3263 destatep->rankedencoding_list_len = k;
3264 }
3265
3266 // Recalculate reliable
CalcReliable(DetectEncodingState * destatep)3267 void CalcReliable(DetectEncodingState* destatep) {
3268 // Encoding result is reliable if big difference in top two, or if
3269 // only Ascii7 ever encountered
3270 // Also reliable if exactly one OtherPair and it's best encoding matches top
3271 destatep->reliable = false;
3272 if (destatep->next_interesting_pair[OtherPair] == 0) {
3273 // Only 7-bit ASCII
3274 destatep->reliable = true;
3275 return;
3276 }
3277 if ((destatep->top_prob - destatep->second_top_prob) >=
3278 FLAGS_ced_reliable_difference) {
3279 destatep->reliable = true;
3280 return;
3281 }
3282 if (destatep->next_interesting_pair[OtherPair] == 1) {
3283 uint8 byte1 = destatep->interesting_pairs[OtherPair][0];
3284 uint8 byte2 = destatep->interesting_pairs[OtherPair][1];
3285 int best_enc = kMostLikelyEncoding[(byte1 << 8) + byte2];
3286 if (best_enc == destatep->top_rankedencoding) {
3287 destatep->reliable = true;
3288 return;
3289 }
3290 }
3291
3292 // If we pruned to one encoding, we are done
3293 if (destatep->rankedencoding_list_len == 1) {
3294 destatep->reliable = true;
3295 destatep->done = true;
3296 return;
3297 }
3298
3299 // If we pruned to two or three encodings in the same *superset/subset
3300 // rankedencoding* and enough pairs, we are done. Else keep going
3301 if (destatep->rankedencoding_list_len == 2) {
3302 Encoding enc0 = kMapToEncoding[destatep->rankedencoding_list[0]];
3303 Encoding enc1 = kMapToEncoding[destatep->rankedencoding_list[1]];
3304 if (kMapEncToBaseEncoding[enc0] == kMapEncToBaseEncoding[enc1]) {
3305 if (destatep->prune_count >= 3) {
3306 destatep->reliable = true;
3307 destatep->done = true;
3308 return;
3309 }
3310 }
3311 } else if (destatep->rankedencoding_list_len == 3) {
3312 Encoding enc0 = kMapToEncoding[destatep->rankedencoding_list[0]];
3313 Encoding enc1 = kMapToEncoding[destatep->rankedencoding_list[1]];
3314 Encoding enc2 = kMapToEncoding[destatep->rankedencoding_list[2]];
3315 Encoding base0 = kMapEncToBaseEncoding[enc0];
3316 Encoding base1 = kMapEncToBaseEncoding[enc1];
3317 Encoding base2 = kMapEncToBaseEncoding[enc2];
3318
3319 if ((base0 == base1) && (base0 == base2)) {
3320 if (destatep->prune_count >= 3) {
3321 destatep->reliable = true;
3322 destatep->done = true;
3323 return;
3324 }
3325 }
3326 }
3327
3328 }
3329
3330
3331 // Find current top two rankedencoding probabilities
FindTop2(DetectEncodingState * destatep,int * first_renc,int * second_renc,int * first_prob,int * second_prob)3332 void FindTop2(DetectEncodingState* destatep,
3333 int* first_renc, int* second_renc,
3334 int* first_prob, int* second_prob) {
3335 *first_prob = -1;
3336 *second_prob = -1;
3337 *first_renc = 0;
3338 *second_renc = 0;
3339 for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
3340 int rankedencoding = destatep->rankedencoding_list[j];
3341 if (*first_prob < destatep->enc_prob[rankedencoding]) {
3342 *second_prob = *first_prob; // old top to second
3343 *second_renc = *first_renc; // old top to second
3344 *first_prob = destatep->enc_prob[rankedencoding];
3345 *first_renc = rankedencoding;
3346 } else if (*second_prob < destatep->enc_prob[rankedencoding]) {
3347 *second_prob = destatep->enc_prob[rankedencoding];
3348 *second_renc = rankedencoding;
3349 }
3350 }
3351 }
3352
3353
PrintRankedEncodingList(DetectEncodingState * destatep,const char * str)3354 void PrintRankedEncodingList(DetectEncodingState* destatep, const char* str) {
3355 printf("Current ranked encoding list %s\n", str);
3356 for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
3357 int rankedencoding = destatep->rankedencoding_list[j];
3358 if ((rankedencoding < 0) || (rankedencoding > NUM_RANKEDENCODING)) {
3359 printf(" [%d] BOGUS rankedencoding = %d\n", j, rankedencoding);
3360 } else {
3361 printf(" [%d] rankedencoding = %d %-12.12s enc_prob = %d\n",
3362 j, rankedencoding, MyRankedEncName(rankedencoding),
3363 destatep->enc_prob[rankedencoding]);
3364 }
3365 }
3366 printf("End current ranked encoding list\n\n");
3367 }
3368
3369
3370
3371
// Map unencoded bytes down to five bits, largely preserving letters:
//   00-3F            -> 0 (punctuation/digits/controls)
//   A-Z and a-z      -> 1..26 (case folded); other 40-7F bytes -> 0
//   80-FF            -> one of the "high" codes below
// This design struggles to put 33 values into 5 bits, so HU shares a code
// with HO.
#define XX 0    // Punctuation (00-7F range)
#define HA 27   // High vowel a in Latin1/2/sometimes7
#define HE 28   // High vowel e
#define HI 29   // High vowel i
#define HO 30   // High vowel o
#define HU 30   // High vowel u on top of HO
#define Hc 31   // High consonant (80-FF range)

// One macro per distinct 16-byte row; each row is used more than once below.
#define FIVEBITS_ROW_PUNCT \
  XX,XX,XX,XX,XX,XX,XX,XX, XX,XX,XX,XX,XX,XX,XX,XX
#define FIVEBITS_ROW_ALPHA_LO \
  XX, 1, 2, 3, 4, 5, 6, 7,  8, 9,10,11,12,13,14,15
#define FIVEBITS_ROW_ALPHA_HI \
  16,17,18,19,20,21,22,23, 24,25,26,XX,XX,XX,XX,XX
#define FIVEBITS_ROW_HIGH_80 \
  Hc,HA,Hc,Hc,Hc,Hc,Hc,Hc, HO,Hc,Hc,Hc,Hc,Hc,Hc,Hc
#define FIVEBITS_ROW_HIGH_C0 \
  Hc,HA,HA,HA,HA,Hc,Hc,Hc, Hc,HE,HE,HE,HI,HI,HI,Hc
#define FIVEBITS_ROW_HIGH_D0 \
  Hc,Hc,Hc,HO,HO,HO,HO,Hc, Hc,HU,HU,HU,HU,Hc,Hc,Hc

static const char kMapToFiveBits[256] = {
  FIVEBITS_ROW_PUNCT,       // 00-0F
  FIVEBITS_ROW_PUNCT,       // 10-1F
  FIVEBITS_ROW_PUNCT,       // 20-2F
  FIVEBITS_ROW_PUNCT,       // 30-3F

  FIVEBITS_ROW_ALPHA_LO,    // 40-4F  @ A..O
  FIVEBITS_ROW_ALPHA_HI,    // 50-5F  P..Z [ \ ] ^ _
  FIVEBITS_ROW_ALPHA_LO,    // 60-6F  ` a..o
  FIVEBITS_ROW_ALPHA_HI,    // 70-7F  p..z { | } ~ DEL

  FIVEBITS_ROW_HIGH_80,     // 80-8F
  FIVEBITS_ROW_HIGH_80,     // 90-9F
  FIVEBITS_ROW_HIGH_80,     // A0-AF
  FIVEBITS_ROW_HIGH_80,     // B0-BF

  FIVEBITS_ROW_HIGH_C0,     // C0-CF
  FIVEBITS_ROW_HIGH_D0,     // D0-DF
  FIVEBITS_ROW_HIGH_C0,     // E0-EF
  FIVEBITS_ROW_HIGH_D0,     // F0-FF
};

#undef FIVEBITS_ROW_PUNCT
#undef FIVEBITS_ROW_ALPHA_LO
#undef FIVEBITS_ROW_ALPHA_HI
#undef FIVEBITS_ROW_HIGH_80
#undef FIVEBITS_ROW_HIGH_C0
#undef FIVEBITS_ROW_HIGH_D0
#undef XX
#undef HA
#undef HE
#undef HI
#undef HO
#undef HU
#undef Hc
3410
// Trigram verdict codes, one per Latin family (Latin1, Latin2, Latin7).
// NOTE(review): presumably these are the non-zero values of the two-bit
// fields packed into kLatin127Trigrams, with 0 meaning "no boost" -- confirm
// against the trigram lookup code.
static const int kTriLatin1Likely = 1;
static const int kTriLatin2Likely = 2;
static const int kTriLatin7Likely = 3;
3414
// Latin1/2/7 boost vector, generated 2007.09.26 by postproc-enc-detect-short.cc
// Each 64-bit table entry holds 32 two-bit fields; the field within an entry
// is selected by byte[2] of the trigram.
// The entry subscript (0..1023) is selected by byte[0] and byte[1].
3418 static const uint64 kLatin127Trigrams[1024] = {
3419 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
3420 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
3421 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
3422 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
3423 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
3424 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
3425 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
3426 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
3427 0x0000000000000000ULL, 0x304080c0402c3330ULL, 0x0008400004000000ULL, 0x082800000c200000ULL,
3428 0x23a0000420800030ULL, 0x00000000000ccc00ULL, 0x0500100100100000ULL, 0x0388400000200010ULL,
3429 0x0000000000000c00ULL, 0xd0f0300740f0cf00ULL, 0x2aa0a2a22882a2acULL, 0x081d800000000080ULL,
3430 0x0c82000020000000ULL, 0x200a03c000a00000ULL, 0x0008400400290000ULL, 0x0400870000000000ULL,
3431 0x00f040c00000c080ULL, 0x0008004000000410ULL, 0x0020300000000030ULL, 0x00a030002c300000ULL,
3432 0x0c8030c020a00000ULL, 0x15410030f0f4c000ULL, 0x3000000300a00000ULL, 0xa2880980a0880a88ULL,
3433 0x0900300000000000ULL, 0x0000040100300000ULL, 0x0888820020a00000ULL, 0xc044002242010000ULL,
3434 0x000000121d300040ULL, 0x40100040440c0d54ULL, 0x00008423102f8144ULL, 0x0b40808400000280ULL,
3435 0x0000000000000000ULL, 0x0680a000000c0000ULL, 0x0880008020aa0000ULL, 0x2aaa0141010a4940ULL,
3436 0xcb80000000010000ULL, 0x2280000000000000ULL, 0x5248000001800000ULL, 0x8000401004040010ULL,
3437 0x1540010201001010ULL, 0x0080080400000000ULL, 0x5a00044040000108ULL, 0x0288000282080008ULL,
3438 0x4800008002200000ULL, 0x4a00000000010100ULL, 0x8a88040080000800ULL, 0x0140800000000400ULL,
3439 0x40010050000c0000ULL, 0x0000008000000000ULL, 0x0028000020140040ULL, 0x8620401401005308ULL,
3440 0xc082000000000400ULL, 0x05c0b004c0240600ULL, 0x0288000080000000ULL, 0x0000014000000000ULL,
3441 0x00000000040000c0ULL, 0x8001861008004280ULL, 0x0200000000000300ULL, 0x0000240242288620ULL,
3442 0x801000c05434c200ULL, 0x9020162040a2d2b4ULL, 0x0021840000240704ULL, 0x2a80280080084908ULL,
3443 0x0000000000000000ULL, 0x0500004000000040ULL, 0x0080000000040000ULL, 0x0108058104440000ULL,
3444 0x0900000000040000ULL, 0x00c0000000208008ULL, 0x2000005000000000ULL, 0x0080000000050000ULL,
3445 0x0808000000001080ULL, 0x9880810100308000ULL, 0x2285480080081a08ULL, 0x8a80000080080000ULL,
3446 0x1450000000600010ULL, 0x2210000100000000ULL, 0x8a88000100011000ULL, 0x1541804000000010ULL,
3447 0xc084011140040100ULL, 0x0000000000000800ULL, 0x0400000000000030ULL, 0x2a800000a0890128ULL,
3448 0x1140a00054000104ULL, 0x1440000101200404ULL, 0x028800400400d800ULL, 0x0000000000000000ULL,
3449 0x0000000000002330ULL, 0x0020820228a02280ULL, 0xa2888a02aa8008a8ULL, 0xd0040a0044202500ULL,
3450 0x8000044104a29424ULL, 0xc000100178b2c5b4ULL, 0x0000810100241504ULL, 0xd040030000380008ULL,
3451 0x0000000000000000ULL, 0x26c08c0000200130ULL, 0x4a08000110080000ULL, 0x2aa0004001080800ULL,
3452 0x0aac000000004000ULL, 0x2000000000200000ULL, 0x4240000100020000ULL, 0x4100000080000000ULL,
3453 0x4900040000000000ULL, 0x0800000400300040ULL, 0x6a80000000040800ULL, 0x2a08182000588008ULL,
3454 0x0a00000c81000008ULL, 0x0a000c0010000000ULL, 0x8a88001080280808ULL, 0x0020000200300600ULL,
3455 0xaac00000900a0000ULL, 0x0000100004000000ULL, 0x0020081020000000ULL, 0x8220105010084110ULL,
3456 0x4a80800000004000ULL, 0x050000c0c0200000ULL, 0x288c000084000000ULL, 0xa048082280000000ULL,
3457 0x0000000000000000ULL, 0x8000900000032080ULL, 0xee889e81b8880820ULL, 0xc2200a8142800424ULL,
3458 0xc020141543361010ULL, 0x10a000204a801634ULL, 0x3a808800802a00a0ULL, 0x28808b00803d0800ULL,
3459 0x0000000000000000ULL, 0x0020000000000030ULL, 0x0808400121010040ULL, 0x0c28240100200040ULL,
3460 0x2008200028800000ULL, 0xc10004c80f30c030ULL, 0x0400440114100000ULL, 0x2208200280a22220ULL,
3461 0x0600000030c01000ULL, 0x1201001040c00000ULL, 0x0aa02ea22aa22aa0ULL, 0x30008000000200a0ULL,
3462 0x20c8400400800000ULL, 0x08280b0420800000ULL, 0x0800100000210000ULL, 0x10000300c0100400ULL,
3463 0xc8c0000420000000ULL, 0x1000000010000000ULL, 0x0420000400000000ULL, 0x0220000500204000ULL,
3464 0x2200000420000000ULL, 0x0000540400000000ULL, 0x0000000020000000ULL, 0x00080c00a0810080ULL,
3465 0x1540000000043000ULL, 0x0000000000100000ULL, 0x2e88a22220200a20ULL, 0xc06030e34ea503a0ULL,
3466 0x0001100204048500ULL, 0x000000e0000c0d54ULL, 0x3000820310a31400ULL, 0x13088c0320e00280ULL,
3467 0x0000000000000000ULL, 0x0480000000200000ULL, 0x4000200100000000ULL, 0x0000300040040000ULL,
3468 0x4400000000000000ULL, 0x0401000002240000ULL, 0x0540000000040000ULL, 0x4004010000000000ULL,
3469 0x4001111001100000ULL, 0x2880000000300040ULL, 0x4040004040002404ULL, 0x0200000000000000ULL,
3470 0x0140040000100000ULL, 0x4040010040040080ULL, 0x0a00140000041004ULL, 0x0000a00400808000ULL,
3471 0x1010200000430040ULL, 0x0010000000000000ULL, 0x0540000000104000ULL, 0x1400114005000000ULL,
3472 0x0000204000440010ULL, 0x0500000000004400ULL, 0x4500000018000400ULL, 0x0000400000000000ULL,
3473 0x000000300000cc00ULL, 0x0100001011300000ULL, 0x0040000000000000ULL, 0xc0e0000248a00444ULL,
3474 0x0000040020340144ULL, 0x0000046445105454ULL, 0x32a0a80280880128ULL, 0x0880040000100100ULL,
3475 0x0000000000000000ULL, 0x14003000030c0004ULL, 0x4a04001100000000ULL, 0x0a00108010000000ULL,
3476 0x28a8004000200248ULL, 0x0100040000b00000ULL, 0x42000000000008c0ULL, 0x6008044010550010ULL,
3477 0x0800401000010400ULL, 0x080080040cf80000ULL, 0x5080000001001010ULL, 0x2a80100000000000ULL,
3478 0xcc8010010d401100ULL, 0x0200000001001000ULL, 0x0480001004001000ULL, 0x8d00800040b40210ULL,
3479 0x6200800000300000ULL, 0x0000010000000000ULL, 0x0428004100010000ULL, 0x4320105141501100ULL,
3480 0xe28c0000000c1000ULL, 0xd5c000c3c0e00300ULL, 0x0001000000100200ULL, 0x1004010202400008ULL,
3481 0x0000000000003000ULL, 0x2aa038a0800aab08ULL, 0x2a88038000000000ULL, 0xc220040242f09720ULL,
3482 0x8020200200ba0420ULL, 0x0020106105101004ULL, 0x0480800000220400ULL, 0x2280100080000008ULL,
3483 0x0000000000000000ULL, 0x9000000000200000ULL, 0x0001000000100000ULL, 0x2aa40c0000080800ULL,
3484 0x0040000040010000ULL, 0x0040000000c01000ULL, 0x4000000040000400ULL, 0x0000001000200000ULL,
3485 0x0000010000000000ULL, 0x05808004000c0000ULL, 0x50400c0000000400ULL, 0x020040008f000040ULL,
3486 0x0800000000100000ULL, 0x0000000000000000ULL, 0x0a08440000004000ULL, 0x0064000400008200ULL,
3487 0x0010010010034170ULL, 0x0000000010000000ULL, 0x0100204021000000ULL, 0x022000d000010100ULL,
3488 0x0840300000c00000ULL, 0x1400000040204400ULL, 0x09800c0040000000ULL, 0x0209708000000000ULL,
3489 0x000000000000c040ULL, 0x90000c50204040a0ULL, 0x0000000000000000ULL, 0x00e1500040200004ULL,
3490 0x8020260540204494ULL, 0x0020026150201054ULL, 0x0281800380105634ULL, 0x0884900481105000ULL,
3491 0x0000000000000000ULL, 0x84203c00002c0200ULL, 0xc089040000000000ULL, 0xc2a8100040200004ULL,
3492 0xe00c1c0000000000ULL, 0x0ce1330080200080ULL, 0x0000000000200000ULL, 0xc400110000404010ULL,
3493 0x0088400000000000ULL, 0x00083cc00c00c00cULL, 0xcac01c00c000580cULL, 0xe300b0f000100000ULL,
3494 0x0300000000000000ULL, 0xc0000f0000000000ULL, 0xc3c01c0400000000ULL, 0x81008004c0f40000ULL,
3495 0xc3d8003000000440ULL, 0x0000000000000000ULL, 0xc430000000000000ULL, 0x0060000000001000ULL,
3496 0x0800000000000000ULL, 0x00c03300f0fc0008ULL, 0x3000000400200010ULL, 0xa2a80892a0880a28ULL,
3497 0x0500000040000004ULL, 0x0000000000000000ULL, 0xc80032070c200020ULL, 0x0220820060a296a0ULL,
3498 0x802084021db486a0ULL, 0x00000d60080c0080ULL, 0xb281803313a32428ULL, 0x1808300320300000ULL,
3499 0x0000000000000000ULL, 0x85208cc0ccac1f20ULL, 0x2081000186100808ULL, 0x22a80880000a0808ULL,
3500 0xaaa8086880000000ULL, 0x802084800a2e9200ULL, 0xa280000000002008ULL, 0xa000000080080400ULL,
3501 0x2080010000000008ULL, 0x802020c00c028c80ULL, 0x2080000000140810ULL, 0x2a80086080080008ULL,
3502 0x2a800000a8000800ULL, 0xaa881800a2080800ULL, 0xaa98004080280808ULL, 0x004483d0c0300000ULL,
3503 0xa280002080080000ULL, 0x0000000000300000ULL, 0x22a1030000000008ULL, 0xa8a0301088880880ULL,
3504 0xaa80002080222808ULL, 0x85400c03fc030400ULL, 0x8a88000000000008ULL, 0xa008008010080008ULL,
3505 0x0000000000010000ULL, 0x0040100000301040ULL, 0x28800000a0002008ULL, 0x122482306cbc0eacULL,
3506 0x8020224222b8c6a0ULL, 0x802002004a82c284ULL, 0x0aa08fc440a41c80ULL, 0x888080d181385098ULL,
3507 0x0000000000000000ULL, 0x00c0b000000c0080ULL, 0x2208001000000800ULL, 0x0a28000000200000ULL,
3508 0x0000000300000000ULL, 0x00c1040000200000ULL, 0x0203020000000000ULL, 0x0248000000020000ULL,
3509 0x0000840000100000ULL, 0x0a808c00c000008cULL, 0x5200040040000004ULL, 0x02000c00000080a0ULL,
3510 0x0b0c000020000000ULL, 0x0b04000001000000ULL, 0x088c0010002000c0ULL, 0x80e08b00c0030c20ULL,
3511 0x0280000200014040ULL, 0x0000000000000000ULL, 0x0e20a0a008000020ULL, 0x0e280fd03f00111cULL,
3512 0x200080c020001000ULL, 0x8cc00c02c02f0400ULL, 0x480c0001000c404cULL, 0x0208014281080808ULL,
3513 0x000000000000fcfcULL, 0x004403300cf00030ULL, 0x2200000000004400ULL, 0x02202000c08c0c20ULL,
3514 0x02202022683a80a0ULL, 0x4020228028008c00ULL, 0x32208cc0002c0200ULL, 0x3ec00c0080304008ULL,
3515 0x0000000000000000ULL, 0x34000c00002c0000ULL, 0x0b00000100100030ULL, 0x0823018000000000ULL,
3516 0x0e8c001c01e00000ULL, 0x1200800600330000ULL, 0x4000110000000000ULL, 0x0080000300000000ULL,
3517 0x0800000000000000ULL, 0x08c08c04000c0000ULL, 0x0080400000880000ULL, 0x0a08000080c00008ULL,
3518 0x0800000304400000ULL, 0x0208000000c00000ULL, 0x2888300080400800ULL, 0x8dc0204400000000ULL,
3519 0xc0000000c0800000ULL, 0x0000c10000000000ULL, 0x24000c4010c00000ULL, 0x272000541d811000ULL,
3520 0x0200400000001000ULL, 0x0400000400001004ULL, 0xc08c007004001000ULL, 0x2048004000000000ULL,
3521 0x000000000003fcfcULL, 0x2aa030000cf8c800ULL, 0xe280000000000000ULL, 0x0a21008142000340ULL,
3522 0x0021002000b61040ULL, 0x800004064006d444ULL, 0x3aa0800300230008ULL, 0x0b00030000300000ULL,
3523 0x0000000000000000ULL, 0x01c080000000040cULL, 0x0100000000004000ULL, 0x0aa8018010001000ULL,
3524 0x0800000000100000ULL, 0x3000000000008c00ULL, 0x5400000013000000ULL, 0x02c0c00004004010ULL,
3525 0x5241100010000c00ULL, 0x0e00080000000808ULL, 0x5281000000000800ULL, 0x0a08108020000800ULL,
3526 0x0a80000000005210ULL, 0x0100000041000000ULL, 0x2a88000002080110ULL, 0x8520800000c00080ULL,
3527 0x01000010108c0100ULL, 0x0000000000000000ULL, 0x42a0420080000000ULL, 0x0020001004010010ULL,
3528 0xc4000000000c0000ULL, 0x01000c00c0200400ULL, 0x4600000100000000ULL, 0x0000000000000000ULL,
3529 0x0010001000000010ULL, 0x910400900820d030ULL, 0x2280000000000000ULL, 0xc2212004400040e4ULL,
3530 0x8001000000b61420ULL, 0xa00002a248e810b4ULL, 0x32008000002c0008ULL, 0x0c010034803c5010ULL,
3531 0x0000000000000000ULL, 0x85008002002c0000ULL, 0x0204001000004010ULL, 0x0120008000200000ULL,
3532 0x000010000c2000c0ULL, 0xccc0000000200000ULL, 0x0400000c00100040ULL, 0x0003300100004100ULL,
3533 0x4000551040000004ULL, 0x0e0080000c820808ULL, 0xc000000000080800ULL, 0xc803000000000000ULL,
3534 0x0a4000c000200000ULL, 0x0040000000c00000ULL, 0x0918145000405000ULL, 0x81400000c0300400ULL,
3535 0x0050000000000000ULL, 0xd000045000000000ULL, 0x0400004000400000ULL, 0x0420104010000110ULL,
3536 0x0700000000203000ULL, 0x34800300c0e00704ULL, 0x4440100044000400ULL, 0x0040000040000000ULL,
3537 0x0030000044000000ULL, 0xeaaca0008808c880ULL, 0x0a01000000200000ULL, 0x1220a300403ccf20ULL,
3538 0x002024c200b61044ULL, 0x802014346aa2d434ULL, 0x30008c00c0820c44ULL, 0x0a000000000c4800ULL,
3539 0x0000000000000000ULL, 0x0000404000340c90ULL, 0x08a8a10820800280ULL, 0x8128009022201000ULL,
3540 0x0020808228a000a0ULL, 0x0020400100410000ULL, 0x0400000110000000ULL, 0xa609000000200000ULL,
3541 0x8008330000d00000ULL, 0x8060100040404010ULL, 0xeaa00ea0ea00808cULL, 0x200c8020a0000020ULL,
3542 0x0408800020200000ULL, 0x0189001403200000ULL, 0xc00800000000c000ULL, 0x200430c00c300000ULL,
3543 0x0100300100004000ULL, 0x0000040000000000ULL, 0x2420000400001000ULL, 0x89a1200400000000ULL,
3544 0x20c8a000208c0000ULL, 0x8080000000000000ULL, 0x28a0108020210080ULL, 0xa2a84800a0880988ULL,
3545 0x258008000400c000ULL, 0x0140000000100000ULL, 0xa028a222a0aa0228ULL, 0xc060012054044040ULL,
3546 0x0010010400000000ULL, 0x00000050150c0114ULL, 0x0000008010c20010ULL, 0xaa088000a0200880ULL,
3547 0x0000000000000000ULL, 0x0700b0c0000c0000ULL, 0x2200040000080030ULL, 0x2aa8808040240800ULL,
3548 0x08b0500000000100ULL, 0x1000830400200000ULL, 0x4204000010000000ULL, 0x40c2200050040050ULL,
3549 0x0104404001010000ULL, 0x1a808c8103c00030ULL, 0x30900010c0000b00ULL, 0x200812b283000008ULL,
3550 0x000c000020e00000ULL, 0x2140000000400000ULL, 0x0288000080200000ULL, 0x8060a200c8a20280ULL,
3551 0x0400114010215000ULL, 0x0000000000000000ULL, 0x082b200002000010ULL, 0x22a0030000031000ULL,
3552 0x008100001000000cULL, 0x05400c00c0230400ULL, 0xca3000003c080100ULL, 0x0000000020000004ULL,
3553 0x0000000100000000ULL, 0x8004320813f5c000ULL, 0xa280080200000800ULL, 0xc22000044e334c20ULL,
3554 0x000004146e361024ULL, 0x800126806aa0d584ULL, 0xb000a0040023c41cULL, 0x0a083000803053d8ULL,
3555 0x0000000000000000ULL, 0x0000100000020000ULL, 0x0000000010000010ULL, 0x0000000045040004ULL,
3556 0x0000000000100000ULL, 0x0000020400000010ULL, 0x0003015000000000ULL, 0x0400000000000000ULL,
3557 0x0000000400000000ULL, 0x0100000000000800ULL, 0x0000001000000000ULL, 0x0000000000000000ULL,
3558 0x0000000040000000ULL, 0x0000000000000000ULL, 0x0004001000000000ULL, 0x0008001000000000ULL,
3559 0x0010000000000004ULL, 0x0000010100001000ULL, 0x0004000000000004ULL, 0x0000014040050014ULL,
3560 0x0014000000000040ULL, 0x5540000000041000ULL, 0x0000000000000000ULL, 0x0000040000000d00ULL,
3561 0x0000000000000000ULL, 0x0000000000100000ULL, 0x0001000000000000ULL, 0x0000000000000000ULL,
3562 0x0000000000000000ULL, 0x0000000000000000ULL, 0x4500000000040400ULL, 0x0000800000000400ULL,
3563 0x0000000000000000ULL, 0x13e080000020000cULL, 0xcf00001005100000ULL, 0x04a8008000200300ULL,
3564 0x00280100100000c0ULL, 0x1c8c000040200000ULL, 0x0600005000100000ULL, 0x050800000c104000ULL,
3565 0x4c10101000110000ULL, 0x0c00000000300000ULL, 0x22040c00100000c0ULL, 0x0800700010100000ULL,
3566 0x0000000000001000ULL, 0x0a08000010000040ULL, 0x0800034004210010ULL, 0x04e0000400000000ULL,
3567 0x0800030020000000ULL, 0x0000005000000000ULL, 0x0400110101304110ULL, 0x0428000010a01000ULL,
3568 0x060b000000800010ULL, 0x35810c00c020c000ULL, 0x00800c4321800000ULL, 0x4208088020000080ULL,
3569 0x040000111003ff00ULL, 0x0020900020202080ULL, 0x22888180a8000888ULL, 0x0225200542005420ULL,
3570 0x2020040400340020ULL, 0x10300424500cc444ULL, 0x3081a00400e00200ULL, 0x33001300c0300000ULL,
3571 0x0000000000000000ULL, 0x04003c0000000000ULL, 0x0a04001000100100ULL, 0x1408000001000000ULL,
3572 0x1800000044100000ULL, 0x3400040400000300ULL, 0x5000040801000040ULL, 0x4088401040000040ULL,
3573 0x1010110130100000ULL, 0xca800c3000300000ULL, 0x5a01000000080100ULL, 0x020280000cd01300ULL,
3574 0x0302000410200010ULL, 0x0000102000300000ULL, 0x0b09000000000000ULL, 0x20008004c4800004ULL,
3575 0x28c0410010000000ULL, 0x0004015041000050ULL, 0x0a01006000200200ULL, 0x0020d00000100040ULL,
3576 0x0010a00100900000ULL, 0x3500bf00c0030300ULL, 0x080c010000200d00ULL, 0x2248000004020010ULL,
3577 0x0000c00000000000ULL, 0x8044b00200e08000ULL, 0xaaa82aa2aa8a2aa8ULL, 0x0220002241c08604ULL,
3578 0x4200260440328444ULL, 0x68001226103008b4ULL, 0x3a0080c0b0000400ULL, 0x2a804804803c4008ULL,
3579 0x0000000000000000ULL, 0x04008c0300000400ULL, 0x008000c0000c0000ULL, 0x088001000000001cULL,
3580 0x0840000001000010ULL, 0x0400000000200c00ULL, 0x4244000101040000ULL, 0x4238007011100000ULL,
3581 0x1000d00100000010ULL, 0x1d00800400300000ULL, 0x4204080c00000000ULL, 0x2a88080080000008ULL,
3582 0x08001c0200001000ULL, 0x0a00000400000000ULL, 0x8a88003080080000ULL, 0x0521800400300000ULL,
3583 0x3200051000201000ULL, 0x0000000000000000ULL, 0x0020801404000000ULL, 0x322010401c0c101cULL,
3584 0x0c01100013000000ULL, 0x04003000c0204000ULL, 0x088c0020a0cc0000ULL, 0x2200000080000018ULL,
3585 0x0404000044000000ULL, 0x82a0b000008820b0ULL, 0x0000040020440000ULL, 0xc2650004403f1420ULL,
3586 0x0021340241b64464ULL, 0x8020040242c2d474ULL, 0x32018c0480288000ULL, 0x00800b0080300000ULL,
3587 0x0000000000000000ULL, 0x05008c0000040130ULL, 0xc0d8000000800000ULL, 0x0020000020200200ULL,
3588 0x23a2000120204000ULL, 0x5052100550104150ULL, 0x1000101100040000ULL, 0xc40001c301000000ULL,
3589 0x8288000000c00000ULL, 0x5150040144d01404ULL, 0xea8c0ea028ae088cULL, 0xc31010c000000c80ULL,
3590 0x0002000060000000ULL, 0xc80800f030000000ULL, 0x0000000400300000ULL, 0xc00080c00ff0c344ULL,
3591 0x00080001200c0000ULL, 0x0000050080000000ULL, 0x0328000300300000ULL, 0x082030000cc01040ULL,
3592 0xeb08800100004000ULL, 0x8030003300c80f00ULL, 0xfb0d0000e4ac0000ULL, 0x0020006080000008ULL,
3593 0x0500100100040000ULL, 0x1140000000000000ULL, 0xcb883330a0e00000ULL, 0xc000010050000080ULL,
3594 0x0010104005b54150ULL, 0x40111d5155001554ULL, 0x80000070140f0004ULL, 0x0b0830c3a0003380ULL,
3595 0x0000000000000000ULL, 0x04c13000000f830cULL, 0x2808000000000000ULL, 0x2810000000000800ULL,
3596 0x08c0080004400000ULL, 0x04c0240300801c20ULL, 0x4040000080000004ULL, 0x0000400100100010ULL,
3597 0x020001008000c0c0ULL, 0x1d008c000c3c0000ULL, 0x0080003000000800ULL, 0x2288080080000008ULL,
3598 0x0a84004020220000ULL, 0x0800080000100000ULL, 0xaa80004080400008ULL, 0x8024000400c01660ULL,
3599 0x80841c2001000104ULL, 0x0001000000000000ULL, 0x0020028020020280ULL, 0x0860404011900100ULL,
3600 0xec80080200000000ULL, 0x010103c100200400ULL, 0x0200004000000000ULL, 0x0000000000400400ULL,
3601 0x000010000003fcfcULL, 0x8040083238c20000ULL, 0x08800220a0920a00ULL, 0x08210004483c0c24ULL,
3602 0xc020240740b0a200ULL, 0x802006014a201494ULL, 0x3201233070ac0e00ULL, 0x08002806033a48a0ULL,
3603 0x0000000000000000ULL, 0x8020820028a00680ULL, 0x2000002000000104ULL, 0x22a80801100a0808ULL,
3604 0xa2a8002080000000ULL, 0xa000800008a08000ULL, 0x0000100000400000ULL, 0x8000002100000000ULL,
3605 0x0000010000004404ULL, 0xa2a0088080000888ULL, 0x0000000010400800ULL, 0xa280082080080008ULL,
3606 0x2280000080010008ULL, 0x2000000000000000ULL, 0x228800008c080808ULL, 0x8021828002a98200ULL,
3607 0xa200002000080000ULL, 0x0000040000000000ULL, 0x22a0000080000000ULL, 0x202882c200800080ULL,
3608 0xa000000001004000ULL, 0x000000c808a00600ULL, 0x0000000010000000ULL, 0x000001000000040cULL,
3609 0x0000000000000000ULL, 0x802002a2a8aa82a0ULL, 0x20000024a8088228ULL, 0x8020820001000000ULL,
3610 0x8020000000808280ULL, 0x8000000000000000ULL, 0x0020800000200280ULL, 0x2080082280a00888ULL,
3611 0x0000000000000000ULL, 0x0000015000000040ULL, 0x0000040000040000ULL, 0x0100010010001000ULL,
3612 0x0000003210008000ULL, 0x0000000404000000ULL, 0x0000000000000400ULL, 0x0200000000000000ULL,
3613 0x0000000000000100ULL, 0x5180014400004050ULL, 0x1000000014000000ULL, 0x4200000000000000ULL,
3614 0x0040200000000000ULL, 0x0201004000000000ULL, 0x0a00000000000010ULL, 0x0040200000800000ULL,
3615 0x0040051000000500ULL, 0x0000000100800400ULL, 0x6000000000000000ULL, 0x0000000000000000ULL,
3616 0x280000c1400040ccULL, 0x4180001000000000ULL, 0x00000000c1000104ULL, 0x0000000000000000ULL,
3617 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0080000000c00000ULL, 0x0004006066004000ULL,
3618 0x0000005000040440ULL, 0x0000106005804044ULL, 0x0000a10511004440ULL, 0x0000000000000110ULL,
3619 0x0000000000000000ULL, 0x0000000000080000ULL, 0xeb0808a020800080ULL, 0x29a80081002a1800ULL,
3620 0x0b2c000202100100ULL, 0x0001000000888000ULL, 0x2280102010000000ULL, 0x020000602a004110ULL,
3621 0x8a800160a6108100ULL, 0x0280000000000020ULL, 0x8a8000a0a8808208ULL, 0x0280882080500308ULL,
3622 0x0b18010020804100ULL, 0xeb080000c0080080ULL, 0x2b08000000810130ULL, 0x0000000008040020ULL,
3623 0xaa0a08e082894140ULL, 0x0000000000000000ULL, 0x202081409010001cULL, 0x8aa8805082806000ULL,
3624 0xeb082900289c0000ULL, 0x0000000000008000ULL, 0xf80c2e20002e0000ULL, 0xa288080420880888ULL,
3625 0x0000010000000000ULL, 0x0000000000102000ULL, 0x22880000a8a80808ULL, 0x022022a22aa880a0ULL,
3626 0x0000222222aa0620ULL, 0x0000022002800000ULL, 0x208080004028a000ULL, 0x2b888800801c0828ULL,
3627 0x0000000000000000ULL, 0x22e0828280a08028ULL, 0xaa88002082080308ULL, 0x0ea80080410a0040ULL,
3628 0x2a28222000a00000ULL, 0x8aa2808028a0a2a0ULL, 0x0200001000000000ULL, 0x82080000a0000000ULL,
3629 0x8800000082000808ULL, 0x2a008a0000300888ULL, 0x0a80080080080808ULL, 0xaa882800840b0808ULL,
3630 0x0a80000080000040ULL, 0xea080820a0000000ULL, 0xaa88080080080808ULL, 0x8040a2800a8024a0ULL,
3631 0xaa800020a0080808ULL, 0x0000040000000000ULL, 0x2a280a0080080880ULL, 0x2a20081080008a00ULL,
3632 0x2a88882088aa0008ULL, 0x81800202c0a01480ULL, 0xea88082082200000ULL, 0xaa88002080080008ULL,
3633 0x0000100000000000ULL, 0x802082a22aa0a2a0ULL, 0x2e80000000000000ULL, 0x0220a2a26aa0a2a8ULL,
3634 0x800022a2228a22a0ULL, 0x880002212e82c0b0ULL, 0x02a0aa0002a82228ULL, 0x2d808b0080380008ULL,
3635 0x0000000000000000ULL, 0x000407551c154244ULL, 0x2a00208088a02228ULL, 0x12a82182a2402a88ULL,
3636 0xe32821e020826d00ULL, 0x801130100ccc1330ULL, 0x028010c000841008ULL, 0x88a08002a0a664a0ULL,
3637 0x0048270080000100ULL, 0x00001f010cd10f30ULL, 0xe2242ce22aaea2a0ULL, 0xc2c00cc20ae22460ULL,
3638 0xe208003128021c10ULL, 0x2a2021c010821080ULL, 0x2a88202082202020ULL, 0x4010111104941410ULL,
3639 0xc80c02c182b00080ULL, 0x0000040000000000ULL, 0xe28030068002c300ULL, 0x2aa02024a2a22228ULL,
3640 0xe20889328aa22080ULL, 0x0000000000210100ULL, 0xaa0028e0a9b221a0ULL, 0x2000008080400000ULL,
3641 0x0000010041150404ULL, 0x0000105114410100ULL, 0xeaa82aa6aaaaaaa8ULL, 0x000000f44300c434ULL,
3642 0x0000222222b00020ULL, 0x0000002000000000ULL, 0x0000004014000000ULL, 0x0039b3f73fbcd3fcULL,
3643 0x0000000000000000ULL, 0x0000104015045040ULL, 0x20a80490a08800a0ULL, 0x40a8258410a909a0ULL,
3644 0xe0a8a2022aa2e2a0ULL, 0xc111010014000500ULL, 0x2080044041840004ULL, 0x28a8200220a2aba0ULL,
3645 0x008400a0a2840800ULL, 0x0101015451009464ULL, 0x20000ea0e02c2c2cULL, 0xe2a828a2aca2aaa8ULL,
3646 0x682020a228a222a0ULL, 0xe8882ae22aa2a2a0ULL, 0xe9a80e6022a24140ULL, 0x0011055005001040ULL,
3647 0x2aa8208229a0aaa4ULL, 0x0000040000000000ULL, 0x28a0228026a62260ULL, 0xe2a020a422a2a020ULL,
3648 0xe808a0022aa1a220ULL, 0x0000010014000100ULL, 0x28ac22802aa2a020ULL, 0x0020000000000000ULL,
3649 0x0100010100040000ULL, 0x0000000000000000ULL, 0x22a822a22a8aaaa0ULL, 0x0000000000000000ULL,
3650 0x0000102410800100ULL, 0x0000000000000000ULL, 0x0000000002000000ULL, 0x00000fb2a08c0aa8ULL,
3651 0x0000000000000000ULL, 0x4010005015440140ULL, 0x18c81c00b180001cULL, 0x2800048021820800ULL,
3652 0x8ab820c06a802580ULL, 0x00100170f4040000ULL, 0x4000144041041404ULL, 0x0ac800d0002e440cULL,
3653 0x20880820a2000808ULL, 0x400000f03f300c00ULL, 0xaa000ea22aa22aa0ULL, 0xa2880ac0a8942a20ULL,
3654 0xaa880a81a1804188ULL, 0xeea022a0aaa02080ULL, 0xaaa820a2aaa66120ULL, 0x0000005115800150ULL,
3655 0x2a880920a0840040ULL, 0x0000040000000000ULL, 0xaea82222aaa22a28ULL, 0x8a28041260055150ULL,
3656 0xa28824008aa28880ULL, 0x0000025014019000ULL, 0xea882ae02aa200a0ULL, 0x0000000000000000ULL,
3657 0x0000000040000400ULL, 0x0000000000000000ULL, 0xaaa82aa22aaaaaa0ULL, 0x0000000000000000ULL,
3658 0x0000000000000000ULL, 0x002003003c80c000ULL, 0x0000020014000000ULL, 0x00200010a0980a20ULL,
3659 0x0000000000000000ULL, 0x0020001200801240ULL, 0x0a88000089800020ULL, 0xcaa00080a1000000ULL,
3660 0x0a200c0020a04080ULL, 0x4002034003840880ULL, 0x4690500190000050ULL, 0x2228004000601000ULL,
3661 0x0a803f00803f400cULL, 0x400033e24dd0cf34ULL, 0xaa80a2a229a220a0ULL, 0x0a224000002c0000ULL,
3662 0x028000202000008cULL, 0x0a08000070000030ULL, 0x00800c040020000cULL, 0x0000000002850000ULL,
3663 0x02881cc310200000ULL, 0x0000040004000000ULL, 0xcba8000400000080ULL, 0xcaa02c0680000000ULL,
3664 0xcc880002008c4080ULL, 0x300000f007f0cf0cULL, 0x0a80001080a00000ULL, 0x820880802a880a80ULL,
3665 0x0000050001040004ULL, 0x0000011000000000ULL, 0x0a8020a2a0202000ULL, 0x0000022202008000ULL,
3666 0x0000222212808000ULL, 0x0020226010000000ULL, 0x000033f33ff3c33cULL, 0x00288002a08c02a8ULL,
3667 0x0000000000000000ULL, 0x04408e0000008200ULL, 0x0808004000900000ULL, 0x0aa8200010ca00c0ULL,
3668 0x0ba80101005d4010ULL, 0x00018604802c8288ULL, 0x00049400101c0000ULL, 0x000c101110505010ULL,
3669 0x0000000000100000ULL, 0x30000c00c022000cULL, 0xd0c00dd0d51d431cULL, 0x0008000010100000ULL,
3670 0x000c1001a0280000ULL, 0x0bc80000c0000000ULL, 0x0a00000080280000ULL, 0x8000a00220308420ULL,
3671 0x0808000010301000ULL, 0x0000040000000000ULL, 0x0d00031480100000ULL, 0x07200000108c0300ULL,
3672 0x0bc0a0c000004000ULL, 0x8000b002c0208480ULL, 0x340c0100118c111cULL, 0x8008008020890000ULL,
3673 0x0000000000040010ULL, 0x0020b00320c1d0b0ULL, 0x00002000000c0000ULL, 0x0020be226e2008a0ULL,
3674 0x002010c03fb0a6a0ULL, 0x00202e222aaec284ULL, 0x00008f0000208400ULL, 0x0000000000300000ULL,
3675 };
3676 // Latin1 6%, Latin2 11%, Latin7 3%
3677
3678
3679
// Debug-only helper: renders a 15-bit trigram subscript (three 5-bit fields,
// most significant field first) as a three-character string.
// The result lives in one shared static buffer, so this is not thread-safe
// and each call overwrites the previous result.
static char tri_string[4];
char* Latin127Str(int trisub) {
  // One printable character per 5-bit field value
  const char* const kFieldChars = "_abcdefghijklmnopqrstuvwxyzAEIOC";
  int shift = 10;
  for (int k = 0; k < 3; ++k) {
    tri_string[k] = kFieldChars[(trisub >> shift) & 0x1f];
    shift -= 5;
  }
  tri_string[3] = '\0';
  return tri_string;
}
3689
3690 // Returns two bits per three-byte trigram, indicating
3691 // dont-care, Latin1 likely, Latin2 likely, and Latin7 (ISO-8859-13) likely
TrigramValue(const uint8 * trisrc)3692 int TrigramValue(const uint8* trisrc) {
3693 int byte0_p = kMapToFiveBits[trisrc[0]];
3694 int byte1_p = kMapToFiveBits[trisrc[1]];
3695 int byte2_p = kMapToFiveBits[trisrc[2]];
3696 int subscr = ((byte0_p) << 5) | byte1_p;
3697 int temp = static_cast<int>((kLatin127Trigrams[subscr] >> (byte2_p * 2)));
3698 //printf("%s=%d ", Latin127Str((subscr << 5) | byte2_p), temp & 3);
3699 return temp & 3;
3700 }
3701
3702
3703 // Put out trigrams for surrounding 32 bytes for Latin encodings
3704 // Return true if more Latin2 & 7 than Latin1
BoostLatin127Trigrams(int tri_block_offset,DetectEncodingState * destatep)3705 bool BoostLatin127Trigrams(int tri_block_offset,
3706 DetectEncodingState* destatep) {
3707 //printf("BoostLatin127Trigrams[%06x]\n", tri_block_offset);
3708 int excess_latin27 = 0;
3709 int srclen = destatep->limit_src - destatep->initial_src;
3710 int hi_limit = minint(tri_block_offset + 32, srclen - 2);
3711 const uint8* trisrc = &destatep->initial_src[tri_block_offset];
3712 const uint8* trisrclimit = &destatep->initial_src[hi_limit];
3713 while (trisrc < trisrclimit) {
3714 // Selectively boost Latin1, Latin2, or Latin7 and friends
3715 int trigram_val = TrigramValue(trisrc);
3716 if (trigram_val != 0) {
3717 if (FLAGS_enc_detect_source) {
3718 PsHighlight(trisrc, destatep->initial_src, trigram_val, 1);
3719 }
3720 if (trigram_val == kTriLatin1Likely) {
3721 Boost(destatep, F_Latin1, kTrigramBoost);
3722 Boost(destatep, F_CP1252, kTrigramBoost);
3723 // We don't want to upset the relative rank of a declared 8859-15
3724 Boost(destatep, F_ISO_8859_15, kTrigramBoost);
3725 --excess_latin27;
3726 } else if (trigram_val == kTriLatin2Likely) {
3727 Boost(destatep, F_Latin2, kTrigramBoost);
3728 Boost(destatep, F_CP1250, kTrigramBoost);
3729 ++excess_latin27;
3730 } else if (trigram_val == kTriLatin7Likely) {
3731 Boost(destatep, F_ISO_8859_13, kTrigramBoost);
3732 Boost(destatep, F_CP1257, kTrigramBoost);
3733 // We don't want to upset the relative rank of a declared 8859-4 or -6
3734 // for Estonian
3735 Boost(destatep, F_Latin4, kTrigramBoost);
3736 Boost(destatep, F_Latin6, kTrigramBoost);
3737 ++excess_latin27;
3738 }
3739 }
3740
3741 ++trisrc;
3742 }
3743 //printf("\n");
3744
3745 return (0 < excess_latin27);
3746 }
3747
3748
3749
// Boost any encodings that need extra detection help, then prune
//
// Scores the OtherPair bigrams recorded since the previous call, applies
// special boosts and whacks, re-ranks, and prunes the list of active ranked
// encodings. Also recomputes destatep->reliable and may set destatep->done.
//
// src is first unscanned byte; it is saved in destatep->prior_src
// destatep is the detection state, updated in place
// prunereason selects extra work:
//   PRUNE_SLOWEND means extra pruning when dropping out of initial slow scan
//   PRUNE_FINAL means last call -- no bigram at src; hints may be derated and
//     the pre-prune/post-prune final adjustments are applied
//   any other value does just the regular scoring and pruning
void BoostPrune(const uint8* src, DetectEncodingState* destatep,
                int prunereason) {
  // Number of pairs of each kind recorded since the previous call
  int delta_asciipairs = destatep->next_interesting_pair[AsciiPair] -
    destatep->prior_interesting_pair[AsciiPair];
  int delta_otherpairs = destatep->next_interesting_pair[OtherPair] -
    destatep->prior_interesting_pair[OtherPair];

  if (prunereason == PRUNE_FINAL) {
    // We are about done
    // If we get here with very little accumulated data, the initial hints
    // were too strong, so we derate them to n+1 / 12 for n bigrams
    if (!destatep->hints_derated &&
        (destatep->next_interesting_pair[OtherPair] < kDerateHintsBelow)) {
      int n = destatep->next_interesting_pair[OtherPair];

      // Map N pairs to (N+1)/12 portions of the initial hints, etc.
      // Floor of 3/12 -- 1/12 and 2/12 are too easy to overcome
      int m = maxint(3, (n + 1));
      for (int i = 0; i < NUM_RANKEDENCODING; ++i) {
        // Replace the full hint contribution with its scaled-down portion
        int original_delta = destatep->hint_prob[i];
        int scaled_delta = (original_delta * m) / kDerateHintsBelow;
        destatep->enc_prob[i] -= original_delta;
        destatep->enc_prob[i] += scaled_delta;
      }
      destatep->hints_derated = true;     // Only derate once
      if (destatep->debug_data != NULL) {
        // Show derated-hint result
        char buff[32];
        snprintf(buff, sizeof(buff), "Hints %d/%d", m, kDerateHintsBelow);
        SetDetailsEncLabel(destatep, buff);
      }
    }
  }


  ++destatep->prune_count;

  if (prunereason != PRUNE_FINAL) {
    // Early outs
    if (destatep->rankedencoding_list_len <= 1) {     // nothing to prune
      destatep->done = true;
      return;
    }

    // prune_count was just incremented above
    if ((destatep->prune_count > 0) &&
        (delta_asciipairs + delta_otherpairs) == 0) {
      // Nothing to do; must have just been called earlier
      return;
    }
  }



  // INCREMENT
  // ====================
  // Accumulate OtherPair probabilities over all active families
  // AsciiPair probabilities are all done in ActiveSpecialBoostWhack
  uint8 prior_bad_byte1 = ' ';    // won't match first bad pair
  uint8 prior_bad_byte2 = ' ';    // won't match first bad pair
  uint8 or_byte1 = 0;             // Track if any current pair has a high bit
  int counted_otherpairs = 0;     // Pairs actually scored (not repeats)
  uint8 prior_byte1x2x = 0;
  for (int i = 0; i < delta_otherpairs; ++i) {
    int watch1_incr = 0;
    int watch2_incr = 0;
    int next_pair = destatep->prior_interesting_pair[OtherPair] + i;

    uint8 byte1 = destatep->interesting_pairs[OtherPair][next_pair * 2 + 0];
    uint8 byte2 = destatep->interesting_pairs[OtherPair][next_pair * 2 + 1];
    // High nibble of byte1 combined with high nibble of byte2
    uint8 byte1x2x = (byte1 & 0xf0) | ((byte2 >> 4) & 0x0f);
    int weightshift = destatep->interesting_weightshift[OtherPair][next_pair];

    int offset_byte12 = destatep->interesting_offsets[OtherPair][next_pair];

    // To help distinguish some Cyrillic, Arabic, Greek, Hebrew, Thai
    // Remember if this is a CDEF pair immediately following the previous pair
    // 8xxx CxCx or CxCx 8xxx
    bool next_pair_consec_hi = false;
    if (ConsecutivePair(destatep, next_pair)) {
      if ((byte1x2x & 0xcc) == 0xcc) {                  // 8xxx CxCx
        next_pair_consec_hi = true;
      } else if ((prior_byte1x2x & 0xcc) == 0xcc) {     // CxCx 8xxx
        next_pair_consec_hi = true;
      }
    }
    //printf("prior/cur/consec %02x %02x %d\n",
    //       prior_byte1x2x, byte1x2x, next_pair_consec_hi);
    prior_byte1x2x = byte1x2x;

    or_byte1 |= byte1;
    uint8 byte1f = byte1;
    // Flip top bit of subscript to better separate quadrant 4 (esp. for Hebrew)
    byte1f ^= (byte2 & 0x80);

    // If the same bigram occurred recently, don't increment again
    bool pair_used = false;
    if (!RepeatedBigram(destatep, byte1, byte2)) {
      ++counted_otherpairs;
      pair_used = true;
      // Boost both charset= declared encodings, so
      // Nearly-same probability nearby encoding doesn't drift to the top
      if (!FLAGS_demo_nodefault) {
        destatep->enc_prob[destatep->declared_enc_1] += kDeclaredEncBoost >> weightshift;
        destatep->enc_prob[destatep->declared_enc_2] += kDeclaredEncBoost >> weightshift;
      }
      bool was_bad_pair = false;
      for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
        int incr_shift = 0;
        int rankedencoding = destatep->rankedencoding_list[j];
        Encoding enc = kMapToEncoding[rankedencoding];

        // For binary, Skip over repeated marker bytes, such as 02, FF, etc.
        if ((rankedencoding == F_BINARY) &&
            RepeatedBinary(destatep, byte1, byte2)) {
          incr_shift = 2;     // count 1/4 as much if repeated
        }

        // If byte 1x2x for this encoding is exactly zero, illegal byte pair
        // Don't increment, but instead penalize
        const UnigramEntry* ue = &unigram_table[rankedencoding];
        if (ue->b12[byte1x2x] == 0) {
          // Don't whack consecutive duplicate bad pairs -- overkill
          if ((byte1 != prior_bad_byte1) || (byte2 != prior_bad_byte2)) {
            // Extra whack for illegal pair in this encoding
            Whack(destatep, rankedencoding, kBadPairWhack >> weightshift);
            was_bad_pair = true;
          }
        } else {
          // OK to do the real increment
          int incr = ue->b1[byte1f] + ue->b2[byte2] + ue->b12[byte1x2x];
          // Low bit of the b12 entry selects the more-precise table
          if ((ue->b12[byte1x2x] & 0x01) != 0) {
            // Use a more-precise table
            int byte32x32 = ((byte1 & 0x1f) << 5) | (byte2 & 0x1f);
            int hiressub = (byte2 & 0x60) >> 5;   // select w/bits 5&6 of byte 2
            DCHECK(ue->hires[hiressub] != NULL);
            incr += ue->hires[hiressub][byte32x32];
          } else {
            // Default final offset
            incr += ue->so;
          }
          incr >>= incr_shift;

          incr >>= weightshift;
          destatep->enc_prob[rankedencoding] += incr;   // The actual increment

          if (FLAGS_enc_detect_detail2) {
            if (watch1_rankedenc == rankedencoding) {watch1_incr = incr;}
            if (watch2_rankedenc == rankedencoding) {watch2_incr = incr;}
          }
        }


        // If consecutive pair of high bytes, give slight boost to one-byte
        // encodings that have a full alphabet in the high bytes
        if (next_pair_consec_hi && HighAlphaEncoding(enc)) {
          Boost(destatep, rankedencoding, kDeclaredEncBoost >> weightshift);
        }
      }     // End for j < rankedencoding_list_len

      // Remember this pair so an immediate duplicate is not whacked again
      if (was_bad_pair) {
        prior_bad_byte1 = byte1;
        prior_bad_byte2 = byte2;
      }

      // Fold in per-bigram most likely encoding for first N bigrams
      if (next_pair < kBestPairsCount) {
        int best_enc = kMostLikelyEncoding[(byte1 << 8) + byte2];
        Boost(destatep, best_enc, kBestEncBoost >> weightshift);
      }

      // Possibly score 32 trigrams around a bigram to better separate
      // Latin1 from Latin2 and Latin7. Especially helpful for detecting
      // mis-labelled Hungarian latin2.
      // If looking and at bigram 0,8,16,... do full scoring, else just 1 tri
      if (destatep->do_latin_trigrams ||
          destatep->looking_for_latin_trigrams) {
        // If just looking, do full scan every 8 times
        // Just look up one trigram the other 7 and do full scan if Latin2,7
        bool scan32 = false;
        // Trigram starts one byte before the bigram
        // NOTE(review): if offset_byte12 can be 0 this points one byte before
        // initial_src -- confirm that callers never record a pair at offset 0
        // while trigram scoring is enabled
        const uint8* trisrc = &destatep->initial_src[offset_byte12 - 1];
        if (!destatep->do_latin_trigrams) {
          // Too close to the end of input for a single lookup => full scan,
          // which does its own limit handling
          if ((i & 7) == 0 || trisrc + 3 > destatep->limit_src) {
            scan32 = true;
          } else {
            scan32 = (kTriLatin1Likely < TrigramValue(trisrc));
          }
        }
        if (destatep->do_latin_trigrams || scan32) {
          // Just score each block of 32 bytes once
          int tri_block_offset = offset_byte12 & ~0x1f;
          if (destatep->trigram_highwater_mark <= tri_block_offset) {
            bool turnon = BoostLatin127Trigrams(tri_block_offset, destatep);
            if (FLAGS_counts && !destatep->do_latin_trigrams && turnon) {
              ++doing_used;    // First time
            }
            if (FLAGS_enc_detect_source) {
              if (!destatep->do_latin_trigrams && turnon) {
                // First time
                PsHighlight(trisrc, destatep->initial_src, 0, 2);
              }
            }
            // Once turned on, do_latin_trigrams stays on
            destatep->do_latin_trigrams |= turnon;
            destatep->trigram_highwater_mark = tri_block_offset + 32;
          }
        }
      }

    }       // end if RepeatedBigram()

    // Keep track of initial byte high 3 bits
    ++destatep->byte32_count[byte1 >> 5];


    // TODO: boost subset/superset also
    // Boost(destatep, kRelatedEncoding[best_enc], kBestEncBoost);

    if (destatep->debug_data != NULL) {
      // Show detail entry for this bigram
      // Unused (repeated) pairs are bracketed; '-' marks a non-zero weightshift
      char buff[16];
      snprintf(buff, sizeof(buff), "%c%02x%02x%c%c",
               pair_used ? ' ' : '[',
               byte1,
               byte2,
               pair_used ? ' ' : ']',
               (weightshift == 0) ? ' ' : '-');

      SetDetailsEncProb(destatep,
                        destatep->interesting_offsets[OtherPair][next_pair],
                        kMostLikelyEncoding[(byte1 << 8) + byte2],
                        buff);
    }
    if (FLAGS_enc_detect_detail2) {
      if ((watch1_incr != 0) || (watch2_incr != 0)) {
        // Show increment detail for this encoding
        char buff[32];
        snprintf(buff, sizeof(buff), "%c%d %c%d",
                 (watch1_incr < 0) ? '-' : '+', watch1_incr,
                 (watch2_incr < 0) ? '-' : '+', watch2_incr);
        SetDetailsEncLabel(destatep, buff);
      }
    }
  }         // End for i


  // If no high bit on, demote all the two-byte codes
  // WAS BUG. This was inside the loop above and should be outside
  if ((counted_otherpairs > 0) && ((or_byte1 & 0x80) == 0)) {
    // No high bit in this group (just 02xx, etc.). Whack 2-byte codes
    // This keeps SJS from creeping past Latin1 on illegal C0 bytes
    for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
      int rankedencoding = destatep->rankedencoding_list[j];
      Encoding enc = kMapToEncoding[rankedencoding];
      if (TwoByteEncoding(enc)) {
        Whack(destatep, rankedencoding, kGentlePairWhack * counted_otherpairs);
      }
    }
  }


  // BOOST
  // ====================
  if (AnyActive(destatep)) {
    ActiveSpecialBoostWhack(src, destatep);
  }

  // Update for next time
  destatep->prior_src = src;
  destatep->prior_interesting_pair[AsciiPair] =
    destatep->next_interesting_pair[AsciiPair];
  destatep->prior_interesting_pair[OtherPair] =
    destatep->next_interesting_pair[OtherPair];


  // Do any pre-prune final adjustments
  // ====================
  if (prunereason == PRUNE_FINAL) {
    // If UTF8 not in base state, whack
    if (destatep->next_utf8_ministate != 0) {
      Whack(destatep, F_UTF8, kGentlePairWhack * 2 * 1);
    }
    // If UTF8UTF8 not in base state, whack
    if (destatep->next_utf8utf8_ministate != 0) {
      Whack(destatep, F_UTF8UTF8, kGentlePairWhack * 2 * 1);
    }

    // If no valid UTF-8 char ever seen, whack
    if (destatep->utf8_minicount[5] == 0) {
      Whack(destatep, F_UTF8, kBadPairWhack * 8);           // No sequence
      Whack(destatep, F_UTF8UTF8, kBadPairWhack * 8);       // No sequence
    }

    // If no valid UTF8UTF8 char ever seen, whack
    if (destatep->utf8utf8_minicount[5] == 0) {
      Whack(destatep, F_UTF8UTF8, kBadPairWhack * 8);       // No sequence
    }

    // If not all four binary quadrants, whack BINARY;
    // worth 2 pair if 3 quads, 4 pair if 1 or 2 quads
    if (destatep->binary_quadrants_count < 4) {
      if (destatep->binary_quadrants_count == 3) {
        Whack(destatep, F_BINARY, kBadPairWhack * 2);
      } else {
        Whack(destatep, F_BINARY, kBadPairWhack * 4);
      }
    }

    // If 1st pair is 1b24, choose between ISO-2022-xx
    //  <esc> $ ) C ISO-2022-KR   [1b 24 29 43]
    //  <esc> $ ) A ISO-2022-CN   [1b 24 29 41]
    //  <esc> $ ) G ISO-2022-CN   [1b 24 29 47]
    //  <esc> $ * H ISO-2022-CN   [1b 24 2a 48]
    //  <esc> ( B ISO-2022-JP     [1b 28 42]  to ASCII
    //  <esc> ( J ISO-2022-JP     [1b 28 4a]  to X0201
    //  <esc> $ @ ISO-2022-JP     [1b 24 40]  to X0208-78 twobyte
    //  <esc> $ B ISO-2022-JP     [1b 24 42]  to X0208-83 twobyte
    if ((destatep->next_interesting_pair[OtherPair] >= 1) &&
        Iso2022Active(destatep)) {
      if ((destatep->interesting_pairs[OtherPair][0] == 0x1b) &&
          (destatep->interesting_pairs[OtherPair][1] == 0x24)) {
        int offset = destatep->interesting_offsets[OtherPair][0];
        const uint8* esc_src = destatep->initial_src + offset;
        // Need esc_src[2] and esc_src[3] to be inside the input
        if ((destatep->initial_src + offset) < (destatep->limit_src - 3)) {
          if ((esc_src[2] == ')') && (esc_src[3] == 'C')) {
            Boost(destatep, F_ISO_2022_KR, kBoostOnePair);
            Whack(destatep, F_ISO_2022_CN, kBadPairWhack);
            Whack(destatep, F_JIS, kBadPairWhack);
          } else if ((esc_src[2] == ')') && ((esc_src[3] == 'A') ||
                                             (esc_src[3] == 'G'))) {
            Boost(destatep, F_ISO_2022_CN, kBoostOnePair);
            Whack(destatep, F_ISO_2022_KR, kBadPairWhack);
            Whack(destatep, F_JIS, kBadPairWhack);
          } else if ((esc_src[2] == '@') || (esc_src[2] == 'B')) {
            Boost(destatep, F_JIS, kBoostOnePair);
            Whack(destatep, F_ISO_2022_CN, kBadPairWhack);
            Whack(destatep, F_ISO_2022_KR, kBadPairWhack);
          }
        } else {
          // Incomplete escape sequence. Whack them all
          Whack(destatep, F_JIS, kBadPairWhack);
          Whack(destatep, F_ISO_2022_CN, kBadPairWhack);
          Whack(destatep, F_ISO_2022_KR, kBadPairWhack);
        }
      }
    }
    if (destatep->debug_data != NULL) {
      SetDetailsEncLabel(destatep, "pre-final");
    }
  }

  // PRUNE
  // ====================
  // Find current top two rankedencoding probabilities
  ReRank(destatep);

  if (prunereason == PRUNE_SLOWEND) {
    if (destatep->debug_data != NULL) {
      SetDetailsEncLabel(destatep, "slow-end");
    }
  }

  // Keep every rankedencoding with probability >= top_prob - prune_difference
  int prune_diff = destatep->prune_difference;
  // If the top encoding is BINARY, it might be overstated, and we might
  // therefore prune away the real encoding. Make the pruning delta
  // twice as big.
  if (destatep->top_rankedencoding == F_BINARY) {
    prune_diff *= 2;
  }
  int keep_prob = destatep->top_prob - prune_diff;

  // Tighten pruning difference (we start wide) for next time
  if (destatep->prune_difference > kFinalPruneDifference) {
    int decrement = kPruneDiffDecrement;
    // If only ASCII pairs, small tighten; if some non-ASCII, full tighten
    if (counted_otherpairs == 0) {
      decrement >>= 1;
    }
    destatep->prune_difference -= decrement;
  }

  // Prune the list of active encoding families
  // Survivors are compacted in place to the front of rankedencoding_list and
  // active_special is rebuilt from the survivors only
  destatep->active_special = 0;
  int k = 0;
  for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
    bool keep = true;
    int rankedencoding = destatep->rankedencoding_list[j];

    // If count is too low, ditch it
    if (destatep->enc_prob[rankedencoding] < keep_prob) {
      keep = false;
    }

    // If at end of slow section, ditch any 7-bit with zero evidence so far
    if ((prunereason == PRUNE_SLOWEND) &&
        SevenBitEncoding(kMapToEncoding[rankedencoding]) &&
        (destatep->enc_prob[rankedencoding] <= 0) &&
        (rankedencoding != destatep->top_rankedencoding)) {
      keep = false;
    }

    // Keep it. This will always keep at least top_prob rankedencoding
    if (keep) {
      destatep->active_special |= kSpecialMask[kMapToEncoding[rankedencoding]];
      destatep->rankedencoding_list[k++] = rankedencoding;
    }
  }

  if (destatep->debug_data != NULL) {
    char buff[32];
    snprintf(buff, sizeof(buff), "%d prune", prune_diff / XLOG2);
    SetDetailsEncLabel(destatep, buff);
  }
  destatep->rankedencoding_list_len = k;



  // Force final result in some cases
  // Do any post-prune final adjustments
  if (prunereason == PRUNE_FINAL) {
    // If no high-byte pairs, result is ASCII7, BINARY, UTF7, 2022, or HZ
    if (destatep->next_interesting_pair[OtherPair] == 0) {
      if ((destatep->top_rankedencoding != F_BINARY) &&
          (destatep->top_rankedencoding != F_UTF7) &&
          (destatep->top_rankedencoding != F_ISO_2022_CN) &&
          (destatep->top_rankedencoding != F_ISO_2022_KR) &&
          (destatep->top_rankedencoding != F_JIS) &&
          (destatep->top_rankedencoding != F_HZ_GB_2312)) {
        destatep->top_rankedencoding = F_ASCII_7_bit;
        Boost(destatep, F_ASCII_7_bit, kBoostOnePair * 2);
      }
    }

    // If some 89 pairs, not ISO_8859_x and vice versa
    // byte32_count[4] counts pairs whose first byte has high 3 bits == 4
    if (destatep->byte32_count[4] > 0) {
      switch (destatep->top_rankedencoding) {
      case F_ASCII:         // ISO-8859-1
        destatep->top_rankedencoding = F_CP1252;
        // Better: destatep->enc_prob[F_ASCII] <==> destatep->enc_prob[F_CP1252]
        Boost(destatep, F_CP1252, kBoostOnePair * 2);
        break;
      case F_Latin2:        // ISO-8859-2
        // Don't swap back; not superset
        //destatep->top_rankedencoding = F_CP1250;
        //Boost(destatep, F_CP1250, kBoostOnePair * 2);
        break;
      case F_Arabic:        // ISO-8859-6
        destatep->top_rankedencoding = F_CP1256;
        Boost(destatep, F_CP1256, kBoostOnePair * 2);
        break;
      case F_Greek:         // ISO-8859-7
        // Don't swap -- not proper superset
        // Capital Alpha tonos at 0xB6 in ISO-8859-7, 0xA2 in CP1253
        //destatep->top_rankedencoding = F_CP1253;
        //Boost(destatep, F_CP1253, kBoostOnePair * 2);
        break;
      case F_Hebrew:        // ISO-8859-8
        // Don't swap -- visual vs. logical
        //destatep->top_rankedencoding = F_CP1255;
        //Boost(destatep, F_CP1255, kBoostOnePair * 2);
        break;
      case F_Latin5:        // ISO-8859-9
        destatep->top_rankedencoding = F_CP1254;
        Boost(destatep, F_CP1254, kBoostOnePair * 2);
        break;
      case F_ISO_8859_11:   // ISO-8859-11
        destatep->top_rankedencoding = F_CP874;
        Boost(destatep, F_CP874, kBoostOnePair * 2);
        break;
      }
    } else {
      switch (destatep->top_rankedencoding) {
      case F_CP1252:        // ISO-8859-1
        destatep->top_rankedencoding = F_ASCII;
        Boost(destatep, F_ASCII, kBoostOnePair * 2);
        break;
      case F_CP1250:        // ISO-8859-2
        // Don't swap back; not superset
        //destatep->top_rankedencoding = F_Latin2;
        //Boost(destatep, F_Latin2, kBoostOnePair * 2);
        break;
      case F_CP1256:        // ISO-8859-6
        // Don't swap back -- not proper superset
        //destatep->top_rankedencoding = F_Arabic;
        //Boost(destatep, F_Arabic, kBoostOnePair * 2);
        break;
      case F_CP1253:        // ISO-8859-7
        // Don't swap back -- not proper superset
        //destatep->top_rankedencoding = F_Greek;
        //Boost(destatep, F_Greek, kBoostOnePair * 2);
        break;
      case F_CP1255:        // ISO-8859-8
        // Don't swap back -- not proper superset
        //destatep->top_rankedencoding = F_Hebrew;
        //Boost(destatep, F_Hebrew, kBoostOnePair * 2);
        break;
      case F_CP1254:        // ISO-8859-9
        destatep->top_rankedencoding = F_Latin5;
        Boost(destatep, F_Latin5, kBoostOnePair * 2);
        break;
      case F_CP874:         // ISO-8859-11
        destatep->top_rankedencoding = F_ISO_8859_11;
        Boost(destatep, F_ISO_8859_11, kBoostOnePair * 2);
        break;
      }
    }

    if (destatep->debug_data != NULL) {
      char buff[32];
      snprintf(buff, sizeof(buff), "final %d",
               static_cast<int>(src - destatep->initial_src));
      SetDetailsEncLabel(destatep, buff);

      // Show winning encoding and its delta log base2 from 2nd-best
      // Divide delta by XLOG2 to get log base 2
      int delta = destatep->top_prob - destatep->second_top_prob;
      if (delta < (2 * XLOG2)) {
        // Small delta: show one decimal digit
        delta /= XDECILOG2;
        snprintf(buff, sizeof(buff), "+%d.%d %s ",
                 delta / 10, delta % 10,
                 MyEncodingName(kMapToEncoding[destatep->top_rankedencoding]));
      } else if (delta < (50 * XLOG2)) {
        delta /= XLOG2;
        snprintf(buff, sizeof(buff), "+%d %s",
                 delta,
                 MyEncodingName(kMapToEncoding[destatep->top_rankedencoding]));
      } else {
        // Very large delta: show just the name
        snprintf(buff, sizeof(buff), "%s",
                 MyEncodingName(kMapToEncoding[destatep->top_rankedencoding]));
      }
      SetDetailsEncProbCopyOffset(destatep, destatep->top_rankedencoding, buff);
    }
  }


  // FINISH
  // ====================
  // Eventual encoding result is reliable if big difference in top two, or if
  // only Ascii7 ever encountered
  // Also reliable if exactly one OtherPair and it's best encoding matches top
  destatep->reliable = false;
  if (destatep->next_interesting_pair[OtherPair] == 0) {
    // Only 7-bit ASCII
    destatep->reliable = true;
  }
  if ((destatep->top_prob - destatep->second_top_prob) >=
      FLAGS_ced_reliable_difference) {
    destatep->reliable = true;
  }
  if (destatep->next_interesting_pair[OtherPair] == 1) {
    uint8 byte1 = destatep->interesting_pairs[OtherPair][0];
    uint8 byte2 = destatep->interesting_pairs[OtherPair][1];
    int best_enc = kMostLikelyEncoding[(byte1 << 8) + byte2];
    if (best_enc == destatep->top_rankedencoding) {
      destatep->reliable = true;
    }
  }

  // If we pruned to one encoding, we are done
  if (destatep->rankedencoding_list_len == 1) {
    destatep->reliable = true;
    destatep->done = true;
  }

  // If we pruned to two or three encodings in the same *superset/subset
  // rankedencoding* and enough pairs, we are done. Else keep going
  // ("enough" is tested here as at least 3 calls counted in prune_count)
  if (destatep->rankedencoding_list_len == 2) {
    Encoding enc0 = kMapToEncoding[destatep->rankedencoding_list[0]];
    Encoding enc1 = kMapToEncoding[destatep->rankedencoding_list[1]];
    if (kMapEncToBaseEncoding[enc0] == kMapEncToBaseEncoding[enc1]) {
      if (destatep->prune_count >= 3) {
        destatep->reliable = true;
        destatep->done = true;
      }
    }
  } else if (destatep->rankedencoding_list_len == 3) {
    Encoding enc0 = kMapToEncoding[destatep->rankedencoding_list[0]];
    Encoding enc1 = kMapToEncoding[destatep->rankedencoding_list[1]];
    Encoding enc2 = kMapToEncoding[destatep->rankedencoding_list[2]];
    Encoding base0 = kMapEncToBaseEncoding[enc0];
    Encoding base1 = kMapEncToBaseEncoding[enc1];
    Encoding base2 = kMapEncToBaseEncoding[enc2];

    if ((base0 == base1) && (base0 == base2)) {
      if (destatep->prune_count >= 3) {
        destatep->reliable = true;
        destatep->done = true;
      }
    }
  }
}
4344
4345
4346 // Accumulate aligned byte-pair at src
4347 // Occasionally, calc boost for some encodings and then prune the active list
4348 // weightshift is used to give low weight some text, such as inside tags
4349 // Returns true if pruning occurred
IncrementAndBoostPrune(const uint8 * src,int remaining_length,DetectEncodingState * destatep,int weightshift,int exit_reason)4350 bool IncrementAndBoostPrune(const uint8* src,
4351 int remaining_length,
4352 DetectEncodingState* destatep,
4353 int weightshift,
4354 int exit_reason) {
4355 destatep->last_pair = src;
4356 // Pick up byte pair, or very last byte plus 0x20
4357 uint8 byte1 = src[0];
4358 uint8 byte2 = 0x20;
4359 if (1 < remaining_length) {byte2 = src[1];}
4360
4361 // whatset=0 for Ascii + ~, 1 for all others; see kTestPrintableAsciiTildePlus
4362 int whatset = exit_reason - 1;
4363 int next_pair = destatep->next_interesting_pair[whatset];
4364
4365 if (next_pair > 16) {
4366 // If not clear by 16 bigrams, stop accumulating + ~ 00
4367 if (byte1 == '+') {return false;}
4368 if (byte1 == '~') {return false;}
4369 if (byte1 == 0x00) {return false;}
4370 }
4371
4372 // Remember pair in appropriate list
4373 if (next_pair >= kMaxPairs) {
4374 // We have filled up our alloted space for interesting pairs with no
4375 // decision. If ASCII pairs full, just skip until end of slow loop; if
4376 // non-Ascii pairs full, force done
4377 if (whatset == OtherPair) {
4378 destatep->done = true;
4379 }
4380 } else {
4381 int offset = static_cast<int>(src - destatep->initial_src);
4382 destatep->interesting_pairs[whatset][next_pair * 2 + 0] = byte1;
4383 destatep->interesting_pairs[whatset][next_pair * 2 + 1] = byte2;
4384 destatep->interesting_offsets[whatset][next_pair] = offset;
4385 destatep->interesting_weightshift[whatset][next_pair] = weightshift;
4386 ++destatep->next_interesting_pair[whatset];
4387 ++next_pair;
4388 }
4389
4390 // Prune now and then , but always if forced to be done
4391 if (destatep->done || ((next_pair & kPruneMask) == 0)) { // Prune every M
4392 BoostPrune(src + 2, destatep, PRUNE_NORMAL); // src+2 first unscanned byte
4393 // may be off end of input
4394 return true;
4395 }
4396 return false;
4397 }
4398
DumpSummary(DetectEncodingState * destatep,int whatset,int n)4399 void DumpSummary(DetectEncodingState* destatep, int whatset, int n) {
4400 printf(" %sSummary[%2d]: ", kWhatSetName[whatset],
4401 destatep->next_interesting_pair[whatset]);
4402 int limit = minint(n, destatep->next_interesting_pair[whatset]);
4403 for (int i = 0; i < limit; ++i) {
4404 printf("%02x%02x ",
4405 destatep->interesting_pairs[whatset][i * 2 + 0],
4406 destatep->interesting_pairs[whatset][i * 2 + 1]);
4407 if ((i & 7) == 7) {printf(" ");}
4408 }
4409 printf("\n");
4410 }
4411
BeginDetail(DetectEncodingState * destatep)4412 void BeginDetail(DetectEncodingState* destatep) {
4413 fprintf(stderr, "%d [", NUM_RANKEDENCODING);
4414 for (int e = 0; e < NUM_RANKEDENCODING; ++e) {
4415 fprintf(stderr, "(%s)", MyRankedEncName(e));
4416 if ((e % 10) == 9) {fprintf(stderr, "\n ");}
4417 }
4418 fprintf(stderr, "] size-detail\n");
4419 destatep->next_detail_entry = 0;
4420 }
4421
// Maps the gap (delta) between consecutive bigrams to one printable ASCII
// character:
//   0 or more than 31 => ' ',  16..31 => '+',  3..15 => '_',
//   anything else (1, 2, or negative) => '='
char DetailOffsetChar(int delta) {
  if ((delta == 0) || (delta > 31)) {
    return ' ';
  }
  if (delta > 15) {
    return '+';
  }
  if (delta > 2) {
    return '_';
  }
  return '=';
}
4430
DumpDetail(DetectEncodingState * destatep)4431 void DumpDetail(DetectEncodingState* destatep) {
4432 // Turn all counts into delta from previous entry
4433 fprintf(stderr, "%d count-detail\n", destatep->next_detail_entry);
4434 // Rewrite, recording deltas
4435 for (int z = destatep->next_detail_entry - 1; z > 0; --z) {
4436 destatep->debug_data[z].offset -= destatep->debug_data[z - 1].offset;
4437 for (int e = 0; e < NUM_RANKEDENCODING; ++e) {
4438 destatep->debug_data[z].detail_enc_prob[e] -=
4439 destatep->debug_data[z - 1].detail_enc_prob[e];
4440 }
4441 }
4442 // Now print
4443 for (int z = 0; z < destatep->next_detail_entry; ++z) {
4444 // Highlight some entries ending in '!' with light red underbar
4445 int len = destatep->debug_data[z].label.size();
4446 if (destatep->debug_data[z].label[len - 1] == '!') {
4447 fprintf(stderr, "1 0.9 0.9 do-flag\n");
4448 }
4449 fprintf(stderr, "(%c%s) %d [",
4450 DetailOffsetChar(destatep->debug_data[z].offset),
4451 destatep->debug_data[z].label.c_str(),
4452 destatep->debug_data[z].best_enc);
4453 for (int e = 0; e < NUM_RANKEDENCODING; ++e) {
4454 fprintf(stderr, "%d ", destatep->debug_data[z].detail_enc_prob[e]);
4455 if ((e % 10) == 9) {fprintf(stderr, " ");}
4456 }
4457 fprintf(stderr, "] do-detail-e\n");
4458 }
4459 // Get ready for next time,if any
4460 destatep->next_detail_entry = 0;
4461 }
4462
// Debug helper: writes an "end-detail" marker followed by a "start-detail"
// marker labelled with buff to stderr.
void PsRecurse(const char* buff) {
  fprintf(stderr,
          "() end-detail (%s) start-detail\n\n",
          buff);
}
4466
DumpReliable(DetectEncodingState * destatep)4467 void DumpReliable(DetectEncodingState* destatep) {
4468 printf("Not reliable: ");
4469
4470 // Find center of gravity of OtherPair list
4471 int x_sum = 0;
4472 int y_sum = 0;
4473 int count = destatep->next_interesting_pair[OtherPair];
4474 for (int i = 0; i < count; ++i) {
4475 uint8 byte1 = destatep->interesting_pairs[OtherPair][i * 2 + 0];
4476 uint8 byte2 = destatep->interesting_pairs[OtherPair][i * 2 + 1];
4477 x_sum += byte2;
4478 y_sum += byte1;
4479 }
4480 if (count == 0) {count = 1;} // adoid zdiv
4481 int x_bar = x_sum / count;
4482 int y_bar = y_sum / count;
4483 printf("center %02X,%02X\n", x_bar, y_bar);
4484
4485 double closest_dist = 999.0;
4486 int closest = 0;
4487 for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
4488 int rankedencoding = destatep->rankedencoding_list[j];
4489 const UnigramEntry* ue = &unigram_table[rankedencoding];
4490 printf(" %8s = %4d at %02x,%02x +/- %02X,%02X ",
4491 MyEncodingName(kMapToEncoding[rankedencoding]),
4492 destatep->enc_prob[rankedencoding],
4493 ue->x_bar, ue->y_bar,
4494 ue->x_stddev, ue->y_stddev);
4495 double x_diff = x_bar - ue->x_bar;
4496 double y_diff = y_bar - ue->y_bar;
4497 double dist = sqrt((x_diff * x_diff) + (y_diff * y_diff));
4498 printf("(%3.1f)\n", dist);
4499
4500 if (closest_dist > dist) {
4501 closest_dist = dist;
4502 closest = rankedencoding;
4503 }
4504 }
4505 printf("Closest=%s (%3.1f)\n",
4506 MyEncodingName(kMapToEncoding[closest]), closest_dist);
4507
4508 for (int i = 0; i < 8; ++i) {
4509 // Demote by distance to CG and see if that helps, or just quit
4510 }
4511 }
4512
4513 // Scan short single lines quickly for all printable ASCII
4514 // Return true if all bytes are in [20..7F], false otherwise
QuickPrintableAsciiScan(const char * text,int text_length)4515 bool QuickPrintableAsciiScan(const char* text, int text_length) {
4516 const uint8* src = reinterpret_cast<const uint8*>(text);
4517 const uint8* srclimit = src + text_length;
4518 const uint8* srclimit8 = srclimit - 7;
4519 while (src < srclimit8) {
4520 // Exits on any byte outside [0x20..0x7E] range (HT LF CR exit)
4521 uint8 mask = 0;
4522 for (int i = 0; i < 8; ++i) mask |= (src[i]-0x20)|(src[i]+0x01);
4523 if ((mask & 0x80) != 0) break;
4524 src += 8;
4525 }
4526 while (src < srclimit) {
4527 uint8 uc = *src++;
4528 if (kIsPrintableAscii[uc] == 0) {return false;}
4529 }
4530 return true;
4531 }
4532
// Upper bound, in bytes, on how far the backward scans in TextInsideTag and
// Rescan look behind their starting position.
static const int kMaxScanBack = 192;
4534
4535 // Return true if text is inside a tag or JS comment
TextInsideTag(const uint8 * isrc,const uint8 * src,const uint8 * srclimit)4536 bool TextInsideTag(const uint8* isrc, const uint8* src, const uint8* srclimit) {
4537 const uint8* srcbacklimit = src - kMaxScanBack;
4538 if (srcbacklimit < isrc) {
4539 srcbacklimit = isrc;
4540 }
4541 const uint8* ss = src - 1;
4542 while (srcbacklimit <= ss) {
4543 uint8 c = *ss--;
4544 if ((c & ~0x02) == '<') {
4545 // We found preceding < 3C or > 3E nearby
4546 // Even cheaper: if inside a tag, we don't care what tag; return true
4547 if (c == '<') {
4548 return true;
4549 }
4550 // See if we are just after <title>...
4551 if ((c == '>') && (isrc <= (ss - 5)) &&
4552 (ss[-5] == '<') &&
4553 ((ss[-4] | 0x20) == 't') &&
4554 ((ss[-3] | 0x20) == 'i') &&
4555 ((ss[-2] | 0x20) == 't') &&
4556 ((ss[-1] | 0x20) == 'l') &&
4557 ((ss[-0] | 0x20) == 'e')) {
4558 return true;
4559 }
4560 // See if we are just after <SCRIPT language=javascript>...
4561 if ((c == '>') && (isrc <= (ss - 5)) &&
4562 (ss[-5] == 's') &&
4563 ((ss[-4] | 0x20) == 'c') &&
4564 ((ss[-3] | 0x20) == 'r') &&
4565 ((ss[-2] | 0x20) == 'i') &&
4566 ((ss[-1] | 0x20) == 'p') &&
4567 ((ss[-0] | 0x20) == 't')) {
4568 return true;
4569 }
4570 // Not in a tag
4571 return false;
4572 // See if we are just after JavaScript comment /* ...
4573 } else if (c == '/') {
4574 if (((ss + 2) < srclimit) && (ss[2] == '*')) {
4575 // We backscanned to /*
4576 return true;
4577 }
4578 }
4579 }
4580
4581 return false;
4582 }
4583
// Look for the next '<' or '>' in the bytes from src + 1 through srclimit
// (inclusive) and return a pointer to the byte just after it.
// If there is none, return src + 2.
const uint8* SkipToTagEnd(const uint8* src, const uint8* srclimit) {
  for (const uint8* p = src + 1; p <= srclimit; ++p) {
    const uint8 c = *p;
    if ((c == '<') || (c == '>')) {
      return p + 1;
    }
  }
  // Always make progress, otherwise the caller would loop forever
  return src + 2;
}
4594
4595
4596 // Take a watch string and map to a ranked encoding. If no match, return -1
LookupWatchEnc(const string & watch_str)4597 int LookupWatchEnc(const string& watch_str) {
4598 int watchval = -1;
4599 // Mixed encoding maps to enc=UTF8UTF8
4600 if (watch_str == "UTF8UTF8") {
4601 watchval = F_UTF8UTF8;
4602 } else {
4603 Encoding enc;
4604 if (EncodingFromName(watch_str.c_str(), &enc)) {
4605 watchval = CompactEncDet::BackmapEncodingToRankedEncoding(enc);
4606 }
4607 }
4608 return watchval;
4609 }
4610
4611 // Return true if enc and enc2 are equal or one is a subset of the other
4612 // or either is UNKNOWN
4613 // also UTF8UTF8 is compatible with both Latin1 and UTF8
CompatibleEnc(Encoding enc,Encoding enc2)4614 bool CompatibleEnc(Encoding enc, Encoding enc2) {
4615 if (enc < 0) {return false;}
4616 if (NUM_ENCODINGS <= enc) {return false;}
4617 if (enc2 < 0) {return false;}
4618 if (NUM_ENCODINGS <= enc2) {return false;}
4619 if (enc == enc2) {return true;}
4620 if (kMapEncToBaseEncoding[enc] == kMapEncToBaseEncoding[enc2]) {return true;}
4621
4622 if (enc == ASCII_7BIT) {return true;}
4623 if (enc2 == ASCII_7BIT) {return true;}
4624 if (enc == UNKNOWN_ENCODING) {return true;}
4625 if (enc2 == UNKNOWN_ENCODING) {return true;}
4626 if (enc == UTF8UTF8) {
4627 if (enc2 == UTF8) {return true;}
4628 if (kMapEncToBaseEncoding[enc2] == ISO_8859_1) {return true;}
4629 }
4630 if (enc2 == UTF8UTF8) {
4631 if (enc == UTF8) {return true;}
4632 if (kMapEncToBaseEncoding[enc] == ISO_8859_1) {return true;}
4633 }
4634
4635 return false;
4636 }
4637
4638 // Return superset of enc and enc2, which must be compatible
SupersetEnc(Encoding enc,Encoding enc2)4639 Encoding SupersetEnc(Encoding enc, Encoding enc2) {
4640 //printf(" SupersetEnc (%s, ", MyEncodingName(enc)); // TEMP
4641 //printf("%s) ", MyEncodingName(enc2));
4642 //printf("= %s\n",
4643 // MyEncodingName(kMapEncToSuperLevel[enc] >= kMapEncToSuperLevel[enc2] ?
4644 // enc :enc2));
4645 if (kMapEncToSuperLevel[enc] >= kMapEncToSuperLevel[enc2]) {
4646 return enc;
4647 }
4648 return enc2;
4649 }
4650
4651
4652 // If unreliable, try rescoring to separate some encodings
Rescore(Encoding enc,const uint8 * isrc,const uint8 * srctextlimit,DetectEncodingState * destatep)4653 Encoding Rescore(Encoding enc, const uint8* isrc,
4654 const uint8* srctextlimit, DetectEncodingState* destatep) {
4655 if (FLAGS_counts) {++rescore_used;}
4656 Encoding new_enc = enc;
4657
4658 bool rescore_change = false;
4659
4660 int count = destatep->next_interesting_pair[OtherPair];
4661 int text_length = srctextlimit - isrc;
4662 for (int i = 0; i < count; ++i) {
4663 int bigram_offset = destatep->interesting_offsets[OtherPair][i];
4664 uint8 byte0 = (0 < bigram_offset) ?
4665 isrc[bigram_offset - 1] : 0x20;
4666 uint8 byte1 = isrc[bigram_offset + 0]; // Known to have high bit on
4667 uint8 byte2 = ((bigram_offset + 1) < text_length) ?
4668 isrc[bigram_offset + 1] : 0x20;
4669 uint8 byte3 = ((bigram_offset + 2) < text_length) ?
4670 isrc[bigram_offset + 2] : 0x20;
4671 int high_hash = ((byte0 & 0xc0) >> 0) |
4672 ((byte1 & 0xc0) >> 1) |
4673 ((byte2 & 0xc0) >> 4) |
4674 ((byte3 & 0xc0) >> 6); // 00112233
4675
4676 // Boost HighAccent encodings for Ascii bit patterns
4677 // 0x1x 0x0x
4678 // 1010 1010
4679 // 0010 0000
4680 //
4681 if ((high_hash & 0xaa) == 0x20) {
4682 for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
4683 int rankedencoding = destatep->rankedencoding_list[j];
4684 if (HighAccentEncoding(kMapToEncoding[rankedencoding])) {
4685 // TODO: also want to boost Shift-JIS here if byte1 is Ax..Dx
4686 // TEMP
4687 //printf(" Rescore[%02x] %s +%d\n",
4688 // high_hash, MyRankedEncName(rankedencoding), kGentlePairBoost);
4689 Boost(destatep, rankedencoding, kGentlePairBoost);
4690 rescore_change = true;
4691 }
4692 }
4693 }
4694
4695 // Whack HighAccent encodings for high bit patterns
4696 // 1x1x 1x1x
4697 // 1010 1010
4698 // 1010 1010
4699 //
4700 if ((high_hash & 0xaa) == 0xaa) {
4701 for (int j = 0; j < destatep->rankedencoding_list_len; j++) {
4702 int rankedencoding = destatep->rankedencoding_list[j];
4703 if (HighAccentEncoding(kMapToEncoding[rankedencoding])) {
4704 // TEMP
4705 //printf(" Rescore[%02x] %s -%d\n",
4706 // high_hash, MyRankedEncName(rankedencoding), kGentlePairBoost);
4707 Whack(destatep, rankedencoding, kGentlePairBoost);
4708 rescore_change = true;
4709 }
4710 }
4711 }
4712
4713 }
4714
4715 if (rescore_change) {
4716 ReRank(destatep);
4717 new_enc = kMapToEncoding[destatep->top_rankedencoding];
4718
4719 if (destatep->debug_data != NULL) {
4720 char buff[32];
4721 snprintf(buff, sizeof(buff), "=Rescore %s", MyEncodingName(new_enc));
4722 SetDetailsEncProb(destatep,
4723 0,
4724 CompactEncDet::BackmapEncodingToRankedEncoding(new_enc),
4725 buff);
4726 //// DumpDetail(destatep);
4727 }
4728
4729 SimplePrune(destatep, kFinalPruneDifference);
4730 CalcReliable(destatep);
4731 }
4732
4733 //if (new_enc != enc) {
4734 // // TEMP
4735 // printf(" Rescore new top encoding = %s\n",
4736 // MyRankedEncName(destatep->top_rankedencoding));
4737 //}
4738
4739 return new_enc;
4740 }
4741
4742
4743 // Given an encoding, add its corresponding ranked encoding to the set
AddToSet(Encoding enc,int * list_len,int * list)4744 void AddToSet(Encoding enc, int* list_len, int* list) {
4745 // TEMP print
4746 int item = CompactEncDet::BackmapEncodingToRankedEncoding(enc);
4747 for (int i = 0; i < *list_len; ++i) {
4748 if (list[i] == item) {
4749 return; // Already in the set; don't add again
4750 }
4751 }
4752 list[(*list_len)++] = item;
4753 }
4754
4755
// Limits used by RobustScan:
//   It never looks beyond the first kMaxKBToRobustScan KB of the text.
//   It stops early once more than kMinRobustBigramCount bigrams have been
//   counted and the scan position is past the first kMinKBToRobustScan KB
//   (or past the end of the text, if the text is shorter than that).
static const int kMinRobustBigramCount = 1000;
static const int kMinKBToRobustScan = 64;
static const int kMaxKBToRobustScan = 256;
4759
4760 // Scan the first 64K or so, just doing raw bigram increments on given
4761 // probability list.
4762 // No fancy duplicate filtering or anything else here.
4763 // Returns number of bigrams counted
RobustScan(const char * text,int text_length,int robust_renc_list_len,int * robust_renc_list,int * robust_renc_probs)4764 int RobustScan(const char* text,
4765 int text_length,
4766 int robust_renc_list_len,
4767 int* robust_renc_list,
4768 int* robust_renc_probs) {
4769 if (FLAGS_counts) {++robust_used;}
4770 // Zero all the result probabilities
4771 for (int i = 0; i < robust_renc_list_len; ++i) {
4772 robust_renc_probs[i] = 0;
4773 }
4774 int max_fast_len = minint(text_length, (kMaxKBToRobustScan << 10));
4775 const uint8* isrc = reinterpret_cast<const uint8*>(text);
4776 const uint8* src = isrc;
4777 const uint8* srclimitfast2 = isrc + max_fast_len - 1;
4778 const uint8* srclimitfast4 = isrc + max_fast_len - 3;
4779
4780 int min_fast_len = minint(text_length, (kMinKBToRobustScan << 10));
4781 const uint8* srclimitmin = isrc + min_fast_len - 1;
4782
4783 int bigram_count = 0;
4784
4785 if (FLAGS_enc_detect_source) {
4786 PsSourceInit(kPsSourceWidth);
4787 fprintf(stderr, "(RobustScan) do-src\n");
4788 }
4789
4790 // Sum over a big chunk of the input
4791 // Faster loop, no 7-bit-encodings possible, approx 3000 GB/sec
4792 //====================================
4793 while (src < srclimitfast2) {
4794 // Skip to next interesting bigram
4795
4796 while (src < srclimitfast4) {
4797 if (((src[0] | src[1] | src[2] | src[3]) & 0x80) != 0) break;
4798 src += 4;
4799 }
4800
4801 while (src < srclimitfast2) {
4802 if ((src[0] & 0x80) != 0) break;
4803 src++;
4804 }
4805
4806 if (src < srclimitfast2) {
4807 // We found a bigram with high bit on
4808 // Next 5 lines commented out so we don't show all the source.
4809 //const uint8* srctextlimit = isrc + text_length;
4810 //if (FLAGS_enc_detect_source) {
4811 // PsSource(src, isrc, srctextlimit);
4812 // PsMark(src, 2, isrc, 0);
4813 //}
4814
4815 uint8 byte1 = src[0];
4816 uint8 byte2 = src[1];
4817 uint8 byte1x2x = (byte1 & 0xf0) | ((byte2 >> 4) & 0x0f);
4818 uint8 byte1f = byte1;
4819 // Flip top bit of subscript to better separate quadrant 4 (esp. for Hebrew)
4820 byte1f ^= (byte2 & 0x80);
4821
4822 // The real increments
4823 for (int j = 0; j < robust_renc_list_len; ++j) {
4824 int rankedencoding = robust_renc_list[j];
4825 const UnigramEntry* ue = &unigram_table[rankedencoding];
4826 int incr = ue->b1[byte1f] + ue->b2[byte2] + ue->b12[byte1x2x];
4827 if ((ue->b12[byte1x2x] & 0x01) != 0) {
4828 // Use a more-precise table
4829 int byte32x32 = ((byte1 & 0x1f) << 5) | (byte2 & 0x1f);
4830 int hiressub = (byte2 & 0x60) >> 5; // select w/bits 5&6 of byte 2
4831 DCHECK(ue->hires[hiressub] != NULL);
4832 incr += ue->hires[hiressub][byte32x32];
4833 } else {
4834 // Default final offset
4835 incr += ue->so;
4836 }
4837 robust_renc_probs[j] += incr;
4838 }
4839
4840 src += 2; // Continue after this bigram
4841 ++bigram_count;
4842
4843 // Stop after 1000 bigrams reached, if at least 64KB scanned
4844 if ((bigram_count > kMinRobustBigramCount) && (src > srclimitmin)) {
4845 break;
4846 }
4847
4848 }
4849 }
4850
4851 if (FLAGS_enc_detect_source) {
4852 fprintf(stderr, "( bigram_count = %d) do-src\n", bigram_count);
4853 if (bigram_count == 0) {bigram_count = 1;} // zdiv
4854 for (int i = 0; i < robust_renc_list_len; ++i) {
4855 fprintf(stderr, "( enc[%-12.12s] = %7d (avg %d)) do-src\n",
4856 MyRankedEncName(robust_renc_list[i]), robust_renc_probs[i],
4857 robust_renc_probs[i] / bigram_count);
4858 }
4859 PsSourceFinish();
4860 }
4861
4862 return bigram_count;
4863 }
4864
// If unreliable, rescan middle of document to see if we can get a better
// answer. Rescan is only worthwhile if there are ~200 bytes or more left,
// since the detector takes as much as 96 bytes of bigrams to decide.
//
//   enc            encoding chosen by the scan so far
//   isrc           start of the text
//   src            where the scan so far stopped
//   srctextlimit   end of the text
//   url_hint .. ignore_7bit_mail_encodings
//                  passed through unchanged to the recursive
//                  InternalDetectEncoding call
//   destatep       state of the scan so far; its reliable flag is overwritten
//                  with the reliability of the mid-document detection
//
// Returns enc unchanged when kMinRescanLength or fewer bytes remain
// unscanned. Otherwise returns the encoding chosen from the combination of
// enc, the single strongest hint, the mid-document result, and (when those
// conflict) a RobustScan over the whole text.
Encoding Rescan(Encoding enc,
                const uint8* isrc,
                const uint8* src,
                const uint8* srctextlimit,
                const char* url_hint,
                const char* http_charset_hint,
                const char* meta_charset_hint,
                const int encoding_hint,
                const Language language_hint,
                const CompactEncDet::TextCorpusType corpus_type,
                bool ignore_7bit_mail_encodings,
                DetectEncodingState* destatep) {
  // Capture these before the recursive call; destatep->reliable is
  // overwritten below
  bool enc_is_reliable = destatep->reliable;
  Encoding new_enc = enc;
  Encoding second_best_enc =
      kMapToEncoding[destatep->second_top_rankedencoding];

  if (FLAGS_counts) {++rescan_used;}

  int scanned_bytes = src - isrc;
  int unscanned_bytes = srctextlimit - src;
  int text_length = srctextlimit - isrc;
  bool empty_rescan = true;

  // See if enough bytes left to bother doing rescan
  if (kMinRescanLength < unscanned_bytes) {
    const char* text = reinterpret_cast<const char*>(isrc);

    // Pick a single hint, in priority order: HTTP, then META, then BOM
    Encoding one_hint = destatep->http_hint;
    if ((one_hint == UNKNOWN_ENCODING) &&
        (destatep->meta_hint != UNKNOWN_ENCODING)) {
      one_hint = destatep->meta_hint;
    }
    if ((one_hint == UNKNOWN_ENCODING) &&
        (destatep->bom_hint != UNKNOWN_ENCODING)) {
      one_hint = destatep->bom_hint;
    }

    // Start halfway through the unscanned part.
    // Go to an even offset to keep UTF-16 in synch
    int middle_offset = (scanned_bytes + (unscanned_bytes / 2)) & ~1;
    CHECK(middle_offset <= text_length);

    // Look back a bit for a low byte to synchronize, else hope for the best.
    // Never look back into the part that was already scanned.
    const uint8* srcbacklimit = isrc + middle_offset - kMaxScanBack;
    if (srcbacklimit < src) {
      srcbacklimit = src;
    }
    const uint8* ss = isrc + middle_offset - 1;
    while (srcbacklimit <= ss) {
      if ((*ss & 0x80) == 0) {break;}
      --ss;
    }
    // Leave middle offset unchanged unless we found a low byte
    if (srcbacklimit <= ss) {
      // Align to low byte or high byte just after it, whichever is even
      middle_offset = (ss - isrc + 1) & ~1;     // Even to keep UTF-16 in sync
    }
    CHECK(middle_offset <= text_length);

    if (destatep->debug_data != NULL) {
      SetDetailsEncLabel(destatep, ">> Rescan");
      // Print the current chart before recursive call
      DumpDetail(destatep);

      char buff[32];
      snprintf(buff, sizeof(buff), ">> Rescan[%d..%d]",
               middle_offset, text_length);
      PsRecurse(buff);
    }

    int mid_bytes_consumed;
    bool mid_is_reliable;
    Encoding mid_second_best_enc;
    CEDInternalFlags newflags = static_cast<CEDInternalFlags>(
        kCEDRescanning + kCEDForceTags);
    // Recursive call for rescan of half of remaining
    Encoding mid_enc = InternalDetectEncoding(
        newflags,
        text + middle_offset,
        text_length - middle_offset,
        url_hint,
        http_charset_hint,
        meta_charset_hint,
        encoding_hint,
        language_hint,   // User interface lang
        corpus_type,
        ignore_7bit_mail_encodings,
        &mid_bytes_consumed,
        &mid_is_reliable,
        &mid_second_best_enc);
    // From here on the caller's state reports the mid-scan reliability
    destatep->reliable = mid_is_reliable;

    // An ASCII_7BIT result means the rescan gave no information
    empty_rescan = (mid_enc == ASCII_7BIT);

    // Not the right decision if, e.g. enc=Greek, mid=ASCII7, one=KSC
    // hence the !empty_rescan term
    if (!empty_rescan && CompatibleEnc(one_hint, mid_enc)) {
      // Encoding we just found is compatible with the
      // single hint (if any); return superset
      new_enc = SupersetEnc(one_hint, mid_enc);
    }

    // If original and mid are compatible, and both reliable,
    // return new_enc = SupersetEnc(enc, mid_enc)
    //
    // This avoids too much weight on a bogus hint causing a RobustScan
    // that gets the wrong answer
    if (!empty_rescan && mid_is_reliable && enc_is_reliable &&
        CompatibleEnc(enc, mid_enc)) {
      new_enc = SupersetEnc(enc, mid_enc);
      return new_enc;
    }

    // if mid unreliable, robustscan
    // if mid empty, robustscan
    // if original and mid not compatible, robustscan
    // if mid and one_hint not compatible, robustscan

    // If we found conflicting data, drop back and do a robust scan of a big
    // chunk of the input over a set of candidate encodings
    //
    if (!mid_is_reliable ||
        empty_rescan ||
        !CompatibleEnc(enc, mid_enc) ||
        !CompatibleEnc(one_hint, mid_enc)) {
      int robust_renc_list_len;                 // Number of active encodings
      int robust_renc_list[NUM_RANKEDENCODING]; // List of ranked encodings
      int robust_renc_probs[NUM_RANKEDENCODING]; // List of matching probs

      // Candidates: best and second best of both scans, plus every known hint.
      // AddToSet skips duplicates.
      robust_renc_list_len = 0;
      AddToSet(enc, &robust_renc_list_len, robust_renc_list);
      AddToSet(second_best_enc, &robust_renc_list_len, robust_renc_list);
      AddToSet(mid_enc, &robust_renc_list_len, robust_renc_list);
      AddToSet(mid_second_best_enc, &robust_renc_list_len, robust_renc_list);
      if (destatep->http_hint != UNKNOWN_ENCODING) {
        AddToSet(destatep->http_hint, &robust_renc_list_len, robust_renc_list);
      }
      if (destatep->meta_hint != UNKNOWN_ENCODING) {
        AddToSet(destatep->meta_hint, &robust_renc_list_len, robust_renc_list);
      }
      if (destatep->bom_hint != UNKNOWN_ENCODING) {
        AddToSet(destatep->bom_hint, &robust_renc_list_len, robust_renc_list);
      }
      if (destatep->tld_hint != UNKNOWN_ENCODING) {
        AddToSet(destatep->tld_hint, &robust_renc_list_len, robust_renc_list);
      }

      // Separate simple scan
      // =====================
      if (destatep->debug_data != NULL) {
        SetDetailsEncLabel(destatep, ">> RobustScan");
        // Print the current chart before recursive call
        DumpDetail(destatep);

        char buff[32];
        snprintf(buff, sizeof(buff), ">> RobustScan[0..%d]", text_length);
        PsRecurse(buff);
      }

      // Note that this scans from the start of the text, not from the middle
      int bigram_count = RobustScan(text, text_length,
                           robust_renc_list_len, robust_renc_list, robust_renc_probs);

      // Default to new_enc and update if something better was found.
      // Strict comparison: on equal scores the earlier candidate is kept, and
      // a candidate must score above -1 to be chosen at all.
      int best_prob = -1;
      for (int i = 0; i < robust_renc_list_len; ++i) {
        if (best_prob < robust_renc_probs[i]) {
          best_prob = robust_renc_probs[i];
          new_enc = kMapToEncoding[robust_renc_list[i]];
        }
      }

      if (destatep->debug_data != NULL) {
        char buff[32];
        snprintf(buff, sizeof(buff), "=Robust[%d] %s",
                 bigram_count, MyEncodingName(new_enc));
        SetDetailsEncProb(destatep,
                          0,
                          CompactEncDet::BackmapEncodingToRankedEncoding(new_enc),
                          buff);
      }
    }
  }     // End if enough bytes

  return new_enc;
}
5054
5055 // With no hints at all, and perhaps on rescan, we relax our pickiness
5056 // and go ahead and accept the top multibyte encodings, even though
5057 // strictly their web pages should have declared an explicit encoding to
5058 // avoid the HTML standard's default ISO-8859-1.
NoHintsCloseEnoughCompatible(Encoding top_enc)5059 bool NoHintsCloseEnoughCompatible(Encoding top_enc) {
5060 // First test accepts degenerate cases plus UTF8 and UTF8UTF8
5061 if (CompatibleEnc(UTF8, top_enc)) {return true;}
5062
5063 // The rest look for exact match of base encoding
5064 Encoding base_enc = kMapEncToBaseEncoding[top_enc];
5065 if (base_enc == JAPANESE_EUC_JP) {return true;}
5066 if (base_enc == JAPANESE_SHIFT_JIS) {return true;}
5067 if (base_enc == CHINESE_BIG5) {return true;}
5068 if (base_enc == CHINESE_GB) {return true;}
5069 if (base_enc == KOREAN_EUC_KR) {return true;}
5070 return false;
5071 }
5072
5073
5074
5075 // Scan raw bytes and detect most likely encoding
5076 // Design goals:
5077 // Skip over big initial stretches of seven-bit ASCII bytes very quickly
5078 // Thread safe
5079 // Works equally well on
5080 // 50-byte queries,
5081 // 5000-byte email and
5082 // 50000-byte web pages
5083 // Length 0 input returns ISO_8859_1 (ASCII) encoding
5084 // Setting ignore_7bit_mail_encodings effectively turns off detection of
5085 // UTF-7, HZ, and ISO-2022-xx
InternalDetectEncoding(CEDInternalFlags flags,const char * text,int text_length,const char * url_hint,const char * http_charset_hint,const char * meta_charset_hint,const int encoding_hint,const Language language_hint,const CompactEncDet::TextCorpusType corpus_type,bool ignore_7bit_mail_encodings,int * bytes_consumed,bool * is_reliable,Encoding * second_best_enc)5086 Encoding InternalDetectEncoding(
5087 CEDInternalFlags flags, const char* text, int text_length,
5088 const char* url_hint, const char* http_charset_hint,
5089 const char* meta_charset_hint, const int encoding_hint,
5090 const Language language_hint, // User interface lang
5091 const CompactEncDet::TextCorpusType corpus_type,
5092 bool ignore_7bit_mail_encodings, int* bytes_consumed, bool* is_reliable,
5093 Encoding* second_best_enc) {
5094 *bytes_consumed = 0;
5095 *is_reliable = false;
5096 *second_best_enc = ASCII_7BIT;
5097
5098 if (text_length == 0) {
5099 // Follow the spec. Text might be NULL.
5100 *is_reliable = true;
5101 return ISO_8859_1;
5102 }
5103
5104 // For very short (20-50 byte) input strings that are highly likely to be
5105 // all printable ASCII, our startup overhead might dominate. We have to do the
5106 // full detection if the ISO-2022-xx, HZ, or UTF-7 encodings are possible.
5107 // Otherwise, we can do a quick scan for printable ASCII.
5108 if ((text_length <= 500) && ignore_7bit_mail_encodings &&
5109 QuickPrintableAsciiScan(text, text_length)) {
5110 *is_reliable = true;
5111 return ASCII_7BIT;
5112 }
5113
5114 // Go for the full boat detection
5115 DetectEncodingState destate;
5116 InitDetectEncodingState(&destate);
5117
5118 std::unique_ptr<DetailEntry[]> scoped_debug_data;
5119 if (FLAGS_enc_detect_detail) {
5120 // Allocate max 10 details per bigram
5121 scoped_debug_data.reset(new DetailEntry[kMaxPairs * 10]);
5122 destate.debug_data = scoped_debug_data.get();
5123 // NOTE: destate and scoped_debug_data have exactly the same scope
5124 // All other FLAGS_enc_detect_detail tests use destate.debug_data != NULL
5125 }
5126
5127 // Get text length limits
5128 // Typically, we scan the first 16KB looking for all encodings, then
5129 // scan the rest (up to 256KB) a bit faster by no longer looking for
5130 // interesting bytes below 0x80. This allows us to skip over runs of
5131 // 7-bit-ASCII much more quickly.
5132 int slow_len = minint(text_length, (FLAGS_enc_detect_slow_max_kb << 10));
5133 int fast_len = minint(text_length, (FLAGS_enc_detect_fast_max_kb << 10));
5134
5135 // Initialize pointers.
5136 // In general, we do not look at last 3 bytes of input in the fast scan
5137 // We do, however want to look at the last byte or so in the slow scan,
  // especially in the case of a very short text whose only interesting
5139 // information is a 3-byte UTF-8 character in the last three bytes.
5140 // If necessary, we fake a last bigram with 0x20 space as a pad byte.
5141 const uint8* isrc = reinterpret_cast<const uint8*>(text);
5142 const uint8* src = isrc;
5143 const uint8* srctextlimit = isrc + text_length;
5144 const uint8* srclimitslow2 = isrc + slow_len - 1;
5145 const uint8* srclimitfast2 = isrc + fast_len - 1;
5146 const uint8* srclimitfast4 = isrc + fast_len - 3;
5147 if (srclimitslow2 > srclimitfast2) {
5148 srclimitslow2 = srclimitfast2;
5149 }
5150 destate.initial_src = isrc;
5151 destate.limit_src = srclimitfast2 + 1; // May include last byte
5152 destate.prior_src = isrc;
5153 destate.last_pair = isrc - 2;
5154
5155 const char* scan_table = kTestPrintableAsciiTildePlus;
5156 if (ignore_7bit_mail_encodings) {
5157 // Caller wants to ignore UTF-7, HZ, ISO-2022-xx
5158 // Don't stop on + (for UTF-7), nor on ~ (for HZ)
5159 scan_table = kTestPrintableAscii;
5160 }
5161 int exit_reason = 0;
5162
5163 if (destate.debug_data != NULL) {
5164 BeginDetail(&destate);
5165 // Take any incoming watch encoding name and backmap to the corresponding
5166 // ranked enum value
5167 watch1_rankedenc = LookupWatchEnc(FLAGS_enc_detect_watch1);
5168 if (watch1_rankedenc >= 0) {
5169 fprintf(stderr, "/track-me %d def\n", watch1_rankedenc);
5170 }
5171
5172 watch2_rankedenc = LookupWatchEnc(FLAGS_enc_detect_watch2);
5173 if (watch2_rankedenc >= 0) {
5174 fprintf(stderr, "/track-me2 %d def\n", watch2_rankedenc);
5175 }
5176
5177 fprintf(stderr, "%% kDerateHintsBelow = %d\n", kDerateHintsBelow);
5178 }
5179 if (FLAGS_enc_detect_source) {
5180 PsSourceInit(kPsSourceWidth);
5181 PsSource(src, isrc, srctextlimit);
5182 PsMark(src, 4, isrc, 0);
5183 }
5184
5185 // Apply hints, if any, to probabilities
  // NOTE: Encoding probabilities are all zero at this point
5187 ApplyHints(url_hint,
5188 http_charset_hint,
5189 meta_charset_hint,
5190 encoding_hint,
5191 language_hint,
5192 corpus_type,
5193 &destate);
5194
5195 // NOTE: probabilities up to this point are subject to derating for
5196 // small numbers of bigrams.
5197 // Probability changes after this point are not derated.
5198
5199 // Do first 4 bytes to pick off strong markers
5200 InitialBytesBoost(isrc, text_length, &destate);
5201
5202 bool ignored_some_tag_text = false;
5203 int tag_text_bigram_count = 0;
5204
5205 // Slower loop, approx 500 MB/sec (2.8 GHz P4)
5206 // ASSERT(srclimitslow2 <= srclimitfast2);
5207 //====================================
5208 DoMoreSlowLoop:
5209 while (src < srclimitslow2) {
5210 // Skip to next interesting byte (this is the slower part)
5211 while (src < srclimitslow2) {
5212 uint8 uc = *src++;
5213 if (scan_table[uc] != 0) {exit_reason = scan_table[uc]; src--; break;}
5214 }
5215
5216 if (src < srclimitslow2) {
5217 if (FLAGS_enc_detect_source) {
5218 PsSource(src, isrc, srctextlimit); // don't mark yet
5219 }
5220
5221 int weightshift = 0;
5222 // In the first 16KB, derate new text run inside <title>...</title> and
5223 // inside <!-- ... -->
5224 if (////((destate.last_pair + 6) <= src) && // if beyond last one
5225 ////(tag_text_bigram_count < kMaxBigramsTagTitleText) &&
5226 (corpus_type == CompactEncDet::WEB_CORPUS) && // and web page
5227 !CEDFlagForceTags(flags)) { // and OK to skip
5228 ////if (TextInsideTag(destate.last_pair + 2, src, srclimitslow2)) {
5229 if (TextInsideTag(isrc, src, srclimitslow2)) {
5230 if (tag_text_bigram_count >= kMaxBigramsTagTitleText) {
5231 ignored_some_tag_text = true;
5232 src = SkipToTagEnd(src, srclimitslow2);
5233 continue;
5234 } else {
5235 weightshift = kWeightshiftForTagTitleText;
5236 ++tag_text_bigram_count;
5237 }
5238 }
5239 }
5240 if (FLAGS_enc_detect_source) {
5241 PsMark(src, 2, isrc, weightshift);
5242 }
5243 // Saves byte pair and offset
5244 bool pruned = IncrementAndBoostPrune(src, srctextlimit - src,
5245 &destate, weightshift, exit_reason);
5246 // Advance; if inside tag, advance to end of tag
5247 if (weightshift == 0) {
5248 src += exit_reason; // 1 Ascii, 2 other
5249 } else {
5250 src += exit_reason; // 1 Ascii, 2 other
5251 //// src = SkipToTagEnd(src, srclimitslow2);
5252 }
5253
5254 if (pruned) {
5255 // Scoring and active encodings have been updated
5256 if (destate.done) {break;}
5257 // Check if all the reasons for the slow loop have been pruned
5258 // If so, go to fast loop
5259 if (!SevenBitActive(&destate)) {break;}
5260 }
5261 }
5262 }
5263 //====================================
5264
5265 // We reached the end of a slow scan, possibly because no more SevenBitActive,
5266 // or possibly are at end of source.
5267 // If we are exactly at the end of the source, make sure we look at the very
5268 // last byte.
5269 bool very_last_byte_incremented = false;
5270 if (src == (srctextlimit - 1)) {
5271 exit_reason = scan_table[*src];
5272 if (exit_reason != 0) {
5273 // The very last byte is an interesting byte
5274 // Saves byte pair and offset
5275 //printf("Interesting very last slow byte = 0x%02x\n", *src);
5276 IncrementAndBoostPrune(src, srctextlimit - src, &destate, 0, exit_reason);
5277 very_last_byte_incremented = true;
5278 }
5279 }
5280
5281 if (FLAGS_enc_detect_source) {
5282 PsSource(src, isrc, srctextlimit);
5283 PsMark(src, 2, isrc, 0);
5284 }
5285 // Force a pruning based on whatever we have
5286 // Delete the seven-bit encodings if there is no evidence of them so far
5287 BoostPrune(src, &destate, PRUNE_SLOWEND);
5288
5289 if (!destate.done) {
5290 // If not clear yet on 7-bit-encodings and more bytes, do more slow
5291 if (SevenBitActive(&destate) && (src < srclimitfast2)) {
5292 // Increment limit by another xxxK
5293 slow_len += (FLAGS_enc_detect_slow_max_kb << 10);
5294 srclimitslow2 = isrc + slow_len - 1;
5295 if (srclimitslow2 > srclimitfast2) {
5296 srclimitslow2 = srclimitfast2;
5297 }
5298 if (!UTF7OrHzActive(&destate)) {
5299 // We can switch to table that does not stop on + ~
5300 scan_table = kTestPrintableAscii;
5301 }
5302 goto DoMoreSlowLoop;
5303 }
5304
5305
5306 exit_reason = 2;
5307 // Faster loop, no 7-bit-encodings possible, approx 3000 GB/sec
5308 //====================================
5309 while (src < srclimitfast2) {
5310 // Skip to next interesting byte (this is the faster part)
5311 while (src < srclimitfast4) {
5312 if (((src[0] | src[1] | src[2] | src[3]) & 0x80) != 0) break;
5313 src += 4;
5314 }
5315
5316 while (src < srclimitfast2) {
5317 if ((src[0] & 0x80) != 0) break;
5318 src++;
5319 }
5320
5321 if (src < srclimitfast2) {
5322 if (FLAGS_enc_detect_source) {
5323 PsSource(src, isrc, srctextlimit);
5324 PsMark(src, 2, isrc, 0);
5325 }
5326 // saves byte pair and offset
5327 bool pruned = IncrementAndBoostPrune(src, srctextlimit - src,
5328 &destate, 0, exit_reason);
5329 src += exit_reason; // 1 Ascii, 2 other
5330 if (pruned) {
5331 // Scoring and active encodings have been updated
5332 if (destate.done) {break;}
5333 }
5334 }
5335 }
5336 //====================================
5337 // We reached the end of fast scan
5338
5339 // If we are exactly at the end of the source, make sure we look at the very
5340 // last byte.
5341 if (src == (srctextlimit - 1) && !very_last_byte_incremented) {
5342 exit_reason = scan_table[*src];
5343 if (exit_reason != 0) {
5344 // The very last byte is an interesting byte
5345 // Saves byte pair and offset
5346 //printf("Interesting very last fast byte = 0x%02x\n", *src);
5347 IncrementAndBoostPrune(src, srctextlimit - src, &destate, 0, exit_reason);
5348 very_last_byte_incremented = true;
5349 }
5350 }
5351
5352 } // End if !done
5353
5354 if (FLAGS_enc_detect_source) {
5355 PsSource(src, isrc, srctextlimit);
5356 PsMark(src, 2, isrc, 0);
5357 }
5358 // Force a pruning based on whatever we have
5359 BoostPrune(src, &destate, PRUNE_FINAL);
5360
5361 if (FLAGS_enc_detect_summary) {
5362 DumpSummary(&destate, AsciiPair, 32);
5363 DumpSummary(&destate, OtherPair, 32);
5364 }
5365 if (FLAGS_enc_detect_source) {
5366 PsSourceFinish();
5367 }
5368 if (destate.debug_data != NULL) {
5369 //// DumpDetail(&destate);
5370 }
5371
5372
5373 if (ignored_some_tag_text &&
5374 (kMapToEncoding[destate.top_rankedencoding] == ASCII_7BIT)) {
5375 // There were some interesting bytes, but only in tag text.
5376 // Recursive call to reprocess looking at the tags this time.
5377
5378 if (destate.debug_data != NULL) {
5379 SetDetailsEncLabel(&destate, ">> Recurse/tags");
5380 // Print the current chart before recursive call
5381 DumpDetail(&destate);
5382
5383 char buff[32];
5384 snprintf(buff, sizeof(buff), ">> Recurse for tags");
5385 PsRecurse(buff);
5386 }
5387
5388 // Recursive call for high bytes in tags [no longer used, 1/16 tag score]
5389 Encoding enc2 = InternalDetectEncoding(
5390 kCEDForceTags, // force
5391 text,
5392 text_length,
5393 url_hint,
5394 http_charset_hint,
5395 meta_charset_hint,
5396 encoding_hint,
5397 language_hint,
5398 corpus_type,
5399 ignore_7bit_mail_encodings,
5400 bytes_consumed,
5401 is_reliable,
5402 second_best_enc);
5403
5404 if (destate.debug_data != NULL) {
5405 // Show winning encoding and dump PostScript
5406 char buff[32];
5407 snprintf(buff, sizeof(buff), "=2 %s", MyEncodingName(enc2));
5408 SetDetailsEncProb(&destate,
5409 0,
5410 CompactEncDet::BackmapEncodingToRankedEncoding(enc2),
5411 buff);
5412 DumpDetail(&destate);
5413 }
5414
5415 return enc2;
5416 }
5417
5418
5419 // If the detected encoding does not match default/hints, or if the hints
5420 // conflict with each other, mark as unreliable. This can be used to trigger
5421 // further scoring.
5422 // Three buckets of input documents;
5423 // ~19% of the web no hints, and top == 7bit, Latin1, or CP1252
5424 // ~79% of the web one or more hints, all same encoding X and top == X
5425 // ~ 2% of the web one or more hints that are inconsistent
5426
5427 Encoding top_enc = kMapToEncoding[destate.top_rankedencoding];
5428 Encoding one_hint = destate.http_hint;
5429 if ((one_hint == UNKNOWN_ENCODING) &&
5430 (destate.meta_hint != UNKNOWN_ENCODING)) {
5431 one_hint = destate.meta_hint;
5432 }
5433 if ((one_hint == UNKNOWN_ENCODING) &&
5434 (destate.bom_hint != UNKNOWN_ENCODING)) {
5435 one_hint = destate.bom_hint;
5436 }
5437
5438 bool found_compatible_encoding = true;
5439 if (one_hint == UNKNOWN_ENCODING) {
5440 // [~14% of the web] No hints, and top == 7bit, Latin1, or CP1252
5441 if (!CompatibleEnc(ISO_8859_1, top_enc)) {
5442 found_compatible_encoding = false;
5443 // If there is nothing but a TLD hint and its top encoding matches, OK
5444 if ((destate.tld_hint != UNKNOWN_ENCODING) &&
5445 CompatibleEnc(destate.tld_hint, top_enc)) {
5446 found_compatible_encoding = true;
5447 }
5448 }
5449 } else if (CompatibleEnc(one_hint, destate.http_hint) &&
5450 CompatibleEnc(one_hint, destate.meta_hint) &&
5451 CompatibleEnc(one_hint, destate.bom_hint)) {
5452 // [~83% of the web] One or more hints, all same encoding X and top == X
5453 if (!CompatibleEnc(one_hint, top_enc)) {
5454 // [~ 2% of the web] Oops, not the declared encoding
5455 found_compatible_encoding = false;
5456 }
5457 } else {
5458 // [~ 3% of the web] Two or more hints that are inconsistent
5459 one_hint = UNKNOWN_ENCODING;
5460 found_compatible_encoding = false;
5461 }
5462
5463 // If we turned Latin1 into Latin2 or 7 via trigrams, don't fail it here
5464 if (destate.do_latin_trigrams) {
5465 if (CompatibleEnc(kMapToEncoding[F_Latin1], top_enc) ||
5466 CompatibleEnc(kMapToEncoding[F_Latin2], top_enc) ||
5467 CompatibleEnc(kMapToEncoding[F_CP1250], top_enc) ||
5468 CompatibleEnc(kMapToEncoding[F_ISO_8859_13], top_enc)) {
5469 found_compatible_encoding = true;
5470 destate.reliable = true;
5471 }
5472 }
5473
5474 // If top encoding is not compatible with the hints, but it is reliably
5475 // UTF-8, accept it anyway.
5476 // This will perform badly with mixed UTF-8 prefix plus another encoding in
5477 // the body if done too early, so we want to be rescanning.
5478 if (!found_compatible_encoding &&
5479 destate.reliable &&
5480 NoHintsCloseEnoughCompatible(top_enc) &&
5481 (destate.next_interesting_pair[OtherPair] >= kStrongPairs) &&
5482 CEDFlagRescanning(flags)) {
5483 found_compatible_encoding = true;
5484 }
5485
5486 // Hold off on this so Rescan() can see if the original encoding was reliable
5487 //if (!found_compatible_encoding) {
5488 // destate.reliable = false;
5489 //}
5490
5491 // If unreliable, try rescoring to separate some encodings
5492 if (!destate.reliable || !found_compatible_encoding) {
5493 top_enc = Rescore(top_enc, isrc, srctextlimit, &destate);
5494 }
5495
5496 *second_best_enc = kMapToEncoding[destate.second_top_rankedencoding];
5497
5498 // If unreliable, and not already rescanning,
5499 // rescan middle of document to see if we can get a better
5500 // answer. Rescan is only worthwhile if there are ~200 bytes or more left,
5501 // since the detector takes as much as 96 bytes of bigrams to decide.
5502 //
5503 // CANNOT retry ISO-2022-xx HZ etc. because no declaration escape at the front
5504 // or we may land in the middle of some partial state. Skip them all.
5505 //
5506 if ((!destate.reliable || !found_compatible_encoding) &&
5507 !CEDFlagRescanning(flags) &&
5508 !SevenBitEncoding(top_enc)) {
5509 top_enc = Rescan(top_enc,
5510 isrc,
5511 src,
5512 srctextlimit,
5513 url_hint,
5514 http_charset_hint,
5515 meta_charset_hint,
5516 encoding_hint,
5517 language_hint,
5518 corpus_type,
5519 ignore_7bit_mail_encodings,
5520 &destate);
5521 } else {
5522 if (!found_compatible_encoding) {
5523 destate.reliable = false;
5524 }
5525 }
5526
5527 if (destate.debug_data != NULL) {
5528 // Dump PostScript
5529 DumpDetail(&destate);
5530 }
5531
5532 *bytes_consumed = src - isrc + 1; // We looked 1 byte beyond src
5533 *is_reliable = destate.reliable;
5534 return top_enc;
5535 }
5536
DetectEncoding(const char * text,int text_length,const char * url_hint,const char * http_charset_hint,const char * meta_charset_hint,const int encoding_hint,const Language language_hint,const TextCorpusType corpus_type,bool ignore_7bit_mail_encodings,int * bytes_consumed,bool * is_reliable)5537 Encoding CompactEncDet::DetectEncoding(
5538 const char* text, int text_length, const char* url_hint,
5539 const char* http_charset_hint, const char* meta_charset_hint,
5540 const int encoding_hint,
5541 const Language language_hint, // User interface lang
5542 const TextCorpusType corpus_type, bool ignore_7bit_mail_encodings,
5543 int* bytes_consumed, bool* is_reliable) {
5544 if (FLAGS_ced_echo_input) {
5545 string temp(text, text_length);
5546 fprintf(stderr, "CompactEncDet::DetectEncoding()\n%s\n\n", temp.c_str());
5547 }
5548
5549 if (FLAGS_counts) {
5550 encdet_used = 0;
5551 rescore_used = 0;
5552 rescan_used = 0;
5553 robust_used = 0;
5554 looking_used = 0;
5555 doing_used = 0;
5556 ++encdet_used;
5557 }
5558 if (FLAGS_dirtsimple) {
5559 // Just count first 64KB bigram encoding probabilities for each encoding
5560 int robust_renc_list_len; // Number of active encodings
5561 int robust_renc_list[NUM_RANKEDENCODING]; // List of ranked encodings
5562 int robust_renc_probs[NUM_RANKEDENCODING]; // List of matching probs
5563
5564 for (int i = 0; i < NUM_RANKEDENCODING; ++i) {
5565 robust_renc_list[i] = i;
5566 }
5567 robust_renc_list_len = NUM_RANKEDENCODING;
5568
5569 RobustScan(text, text_length,
5570 robust_renc_list_len, robust_renc_list, robust_renc_probs);
5571
5572 // Pick off best encoding
5573 int best_prob = -1;
5574 Encoding enc = UNKNOWN_ENCODING;
5575 for (int i = 0; i < robust_renc_list_len; ++i) {
5576 if (best_prob < robust_renc_probs[i]) {
5577 best_prob = robust_renc_probs[i];
5578 enc = kMapToEncoding[robust_renc_list[i]];
5579 }
5580 }
5581
5582 *bytes_consumed = minint(text_length, (kMaxKBToRobustScan << 10));
5583 *is_reliable = true;
5584 if (FLAGS_counts) {
5585 printf("CEDcounts ");
5586 while (encdet_used--) {printf("encdet ");}
5587 while (rescore_used--) {printf("rescore ");}
5588 while (rescan_used--) {printf("rescan ");}
5589 while (robust_used--) {printf("robust ");}
5590 while (looking_used--) {printf("looking ");}
5591 while (doing_used--) {printf("doing ");}
5592 printf("\n");
5593 }
5594
5595 return enc;
5596 }
5597
5598 Encoding second_best_enc;
5599 Encoding enc = InternalDetectEncoding(kCEDNone,
5600 text,
5601 text_length,
5602 url_hint,
5603 http_charset_hint,
5604 meta_charset_hint,
5605 encoding_hint,
5606 language_hint, // User interface lang
5607 corpus_type,
5608 ignore_7bit_mail_encodings,
5609 bytes_consumed,
5610 is_reliable,
5611 &second_best_enc);
5612 if (FLAGS_counts) {
5613 printf("CEDcounts ");
5614 while (encdet_used--) {printf("encdet ");}
5615 while (rescore_used--) {printf("rescore ");}
5616 while (rescan_used--) {printf("rescan ");}
5617 while (robust_used--) {printf("robust ");}
5618 while (looking_used--) {printf("looking ");}
5619 while (doing_used--) {printf("doing ");}
5620 printf("\n");
5621 }
5622
5623 #if defined(HTML5_MODE)
5624 // Map all the Shift-JIS variants to Shift-JIS when used in Japanese locale.
5625 if (language_hint == JAPANESE && IsShiftJisOrVariant(enc)) {
5626 enc = JAPANESE_SHIFT_JIS;
5627 }
5628
5629 // 7-bit encodings (except ISO-2022-JP), and some obscure encodings not
5630 // supported in WHATWG encoding standard are marked as ASCII to keep the raw
5631 // bytes intact.
5632 switch (enc) {
5633 case ISO_2022_KR:
5634 case ISO_2022_CN:
5635 case HZ_GB_2312:
5636 case UTF7:
5637 case UTF16LE:
5638 case UTF16BE:
5639
5640 case CHINESE_EUC_DEC:
5641 case CHINESE_CNS:
5642 case CHINESE_BIG5_CP950:
5643 case JAPANESE_CP932:
5644 case MSFT_CP874:
5645 case TSCII:
5646 case TAMIL_MONO:
5647 case TAMIL_BI:
5648 case JAGRAN:
5649 case BHASKAR:
5650 case HTCHANAKYA:
5651 case BINARYENC:
5652 case UTF8UTF8:
5653 case TAM_ELANGO:
5654 case TAM_LTTMBARANI:
5655 case TAM_SHREE:
5656 case TAM_TBOOMIS:
5657 case TAM_TMNEWS:
5658 case TAM_WEBTAMIL:
5659 case KDDI_SHIFT_JIS:
5660 case DOCOMO_SHIFT_JIS:
5661 case SOFTBANK_SHIFT_JIS:
5662 case KDDI_ISO_2022_JP:
5663 case SOFTBANK_ISO_2022_JP:
5664 enc = ASCII_7BIT;
5665 break;
5666 default:
5667 break;
5668 }
5669 #endif
5670
5671 return enc;
5672 }
5673
5674
5675 // Return top encoding hint for given string
TopEncodingOfLangHint(const char * name)5676 Encoding CompactEncDet::TopEncodingOfLangHint(const char* name) {
5677 string normalized_lang = MakeChar8(string(name));
5678 int n = HintBinaryLookup8(kLangHintProbs, kLangHintProbsSize,
5679 normalized_lang.c_str());
5680 if (n < 0) {return UNKNOWN_ENCODING;}
5681
5682 // Charset is eight bytes, probability table is eight bytes
5683 int toprankenc =
5684 TopCompressedProb(&kLangHintProbs[n].key_prob[kMaxLangKey],
5685 kMaxLangVector);
5686 return kMapToEncoding[toprankenc];
5687 }
5688
5689 // Return top encoding hint for given string
TopEncodingOfTLDHint(const char * name)5690 Encoding CompactEncDet::TopEncodingOfTLDHint(const char* name) {
5691 string normalized_tld = MakeChar4(string(name));
5692 int n = HintBinaryLookup4(kTLDHintProbs, kTLDHintProbsSize,
5693 normalized_tld.c_str());
5694 if (n < 0) {return UNKNOWN_ENCODING;}
5695
5696 // TLD is four bytes, probability table is 12 bytes
5697 int toprankenc =
5698 TopCompressedProb(&kTLDHintProbs[n].key_prob[kMaxTldKey],
5699 kMaxTldVector);
5700 return kMapToEncoding[toprankenc];
5701 }
5702
5703 // Return top encoding hint for given string
TopEncodingOfCharsetHint(const char * name)5704 Encoding CompactEncDet::TopEncodingOfCharsetHint(const char* name) {
5705 string normalized_charset = MakeChar44(string(name));
5706 int n = HintBinaryLookup8(kCharsetHintProbs, kCharsetHintProbsSize,
5707 normalized_charset.c_str());
5708 if (n < 0) {return UNKNOWN_ENCODING;}
5709
5710 // Charset is eight bytes, probability table is eight bytes
5711 int toprankenc =
5712 TopCompressedProb(&kCharsetHintProbs[n].key_prob[kMaxCharsetKey],
5713 kMaxCharsetVector);
5714 return kMapToEncoding[toprankenc];
5715 }
5716
Version(void)5717 const char* CompactEncDet::Version(void) {
5718 return kVersion;
5719 }
5720