1 /* Copyright 2010 Google Inc. All Rights Reserved.
2 
3    Distributed under MIT license.
4    See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5 */
6 
7 /* A (forgetful) hash table to the data seen by the compressor, to
8    help create backward references to previous data. */
9 
10 #ifndef BROTLI_ENC_HASH_H_
11 #define BROTLI_ENC_HASH_H_
12 
13 #include <string.h>  /* memcmp, memset */
14 
15 #include "../common/constants.h"
16 #include "../common/dictionary.h"
17 #include <brotli/types.h>
18 #include "./fast_log.h"
19 #include "./find_match_length.h"
20 #include "./memory.h"
21 #include "./port.h"
22 #include "./quality.h"
23 #include "./static_dict.h"
24 
25 #if defined(__cplusplus) || defined(c_plusplus)
26 extern "C" {
27 #endif
28 
29 /* Pointer to hasher data.
30  *
31  * Excluding initialization and destruction, hasher can be passed as
32  * HasherHandle by value.
33  *
34  * Typically hasher data consists of 3 sections:
35  * * HasherCommon structure
36  * * private structured hasher data, depending on hasher type
37  * * private dynamic hasher data, depending on hasher type and parameters
38  */
39 typedef uint8_t* HasherHandle;
40 
41 typedef struct {
42   BrotliHasherParams params;
43 
44   /* False if hasher needs to be "prepared" before use. */
45   BROTLI_BOOL is_prepared_;
46 
47   size_t dict_num_lookups;
48   size_t dict_num_matches;
49 } HasherCommon;
50 
GetHasherCommon(HasherHandle handle)51 static BROTLI_INLINE HasherCommon* GetHasherCommon(HasherHandle handle) {
52   return (HasherCommon*)handle;
53 }
54 
55 #define score_t size_t
56 
57 static const uint32_t kCutoffTransformsCount = 10;
58 /*   0,  12,   27,    23,    42,    63,    56,    48,    59,    64 */
59 /* 0+0, 4+8, 8+19, 12+11, 16+26, 20+43, 24+32, 28+20, 32+27, 36+28 */
60 static const uint64_t kCutoffTransforms =
61     BROTLI_MAKE_UINT64_T(0x071B520A, 0xDA2D3200);
62 
63 typedef struct HasherSearchResult {
64   size_t len;
65   size_t len_x_code; /* == len ^ len_code */
66   size_t distance;
67   score_t score;
68 } HasherSearchResult;
69 
70 /* kHashMul32 multiplier has these properties:
71    * The multiplier must be odd. Otherwise we may lose the highest bit.
72    * No long streaks of ones or zeros.
73    * There is no effort to ensure that it is a prime, the oddity is enough
74      for this use.
75    * The number has been tuned heuristically against compression benchmarks. */
76 static const uint32_t kHashMul32 = 0x1e35a7bd;
77 static const uint64_t kHashMul64 = BROTLI_MAKE_UINT64_T(0x1e35a7bd, 0x1e35a7bd);
78 static const uint64_t kHashMul64Long =
79     BROTLI_MAKE_UINT64_T(0x1fe35a7bU, 0xd3579bd3U);
80 
Hash14(const uint8_t * data)81 static BROTLI_INLINE uint32_t Hash14(const uint8_t* data) {
82   uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32;
83   /* The higher bits contain more mixture from the multiplication,
84      so we take our results from there. */
85   return h >> (32 - 14);
86 }
87 
PrepareDistanceCache(int * BROTLI_RESTRICT distance_cache,const int num_distances)88 static BROTLI_INLINE void PrepareDistanceCache(
89     int* BROTLI_RESTRICT distance_cache, const int num_distances) {
90   if (num_distances > 4) {
91     int last_distance = distance_cache[0];
92     distance_cache[4] = last_distance - 1;
93     distance_cache[5] = last_distance + 1;
94     distance_cache[6] = last_distance - 2;
95     distance_cache[7] = last_distance + 2;
96     distance_cache[8] = last_distance - 3;
97     distance_cache[9] = last_distance + 3;
98     if (num_distances > 10) {
99       int next_last_distance = distance_cache[1];
100       distance_cache[10] = next_last_distance - 1;
101       distance_cache[11] = next_last_distance + 1;
102       distance_cache[12] = next_last_distance - 2;
103       distance_cache[13] = next_last_distance + 2;
104       distance_cache[14] = next_last_distance - 3;
105       distance_cache[15] = next_last_distance + 3;
106     }
107   }
108 }
109 
110 #define BROTLI_LITERAL_BYTE_SCORE 135
111 #define BROTLI_DISTANCE_BIT_PENALTY 30
112 /* Score must be positive after applying maximal penalty. */
113 #define BROTLI_SCORE_BASE (BROTLI_DISTANCE_BIT_PENALTY * 8 * sizeof(size_t))
114 
115 /* Usually, we always choose the longest backward reference. This function
116    allows for the exception of that rule.
117 
118    If we choose a backward reference that is further away, it will
119    usually be coded with more bits. We approximate this by assuming
120    log2(distance). If the distance can be expressed in terms of the
121    last four distances, we use some heuristic constants to estimate
122    the bits cost. For the first up to four literals we use the bit
123    cost of the literals from the literal cost model, after that we
124    use the average bit cost of the cost model.
125 
126    This function is used to sometimes discard a longer backward reference
127    when it is not much longer and the bit cost for encoding it is more
128    than the saved literals.
129 
130    backward_reference_offset MUST be positive. */
BackwardReferenceScore(size_t copy_length,size_t backward_reference_offset)131 static BROTLI_INLINE score_t BackwardReferenceScore(
132     size_t copy_length, size_t backward_reference_offset) {
133   return BROTLI_SCORE_BASE + BROTLI_LITERAL_BYTE_SCORE * (score_t)copy_length -
134       BROTLI_DISTANCE_BIT_PENALTY * Log2FloorNonZero(backward_reference_offset);
135 }
136 
BackwardReferenceScoreUsingLastDistance(size_t copy_length)137 static BROTLI_INLINE score_t BackwardReferenceScoreUsingLastDistance(
138     size_t copy_length) {
139   return BROTLI_LITERAL_BYTE_SCORE * (score_t)copy_length +
140       BROTLI_SCORE_BASE + 15;
141 }
142 
BackwardReferencePenaltyUsingLastDistance(size_t distance_short_code)143 static BROTLI_INLINE score_t BackwardReferencePenaltyUsingLastDistance(
144     size_t distance_short_code) {
145   return (score_t)39 + ((0x1CA10 >> (distance_short_code & 0xE)) & 0xE);
146 }
147 
TestStaticDictionaryItem(const BrotliDictionary * dictionary,size_t item,const uint8_t * data,size_t max_length,size_t max_backward,HasherSearchResult * out)148 static BROTLI_INLINE BROTLI_BOOL TestStaticDictionaryItem(
149     const BrotliDictionary* dictionary, size_t item, const uint8_t* data,
150     size_t max_length, size_t max_backward, HasherSearchResult* out) {
151   size_t len;
152   size_t dist;
153   size_t offset;
154   size_t matchlen;
155   size_t backward;
156   score_t score;
157   len = item & 0x1F;
158   dist = item >> 5;
159   offset = dictionary->offsets_by_length[len] + len * dist;
160   if (len > max_length) {
161     return BROTLI_FALSE;
162   }
163 
164   matchlen =
165       FindMatchLengthWithLimit(data, &dictionary->data[offset], len);
166   if (matchlen + kCutoffTransformsCount <= len || matchlen == 0) {
167     return BROTLI_FALSE;
168   }
169   {
170     size_t cut = len - matchlen;
171     size_t transform_id =
172         (cut << 2) + (size_t)((kCutoffTransforms >> (cut * 6)) & 0x3F);
173     backward = max_backward + dist + 1 +
174         (transform_id << dictionary->size_bits_by_length[len]);
175   }
176   score = BackwardReferenceScore(matchlen, backward);
177   if (score < out->score) {
178     return BROTLI_FALSE;
179   }
180   out->len = matchlen;
181   out->len_x_code = len ^ matchlen;
182   out->distance = backward;
183   out->score = score;
184   return BROTLI_TRUE;
185 }
186 
SearchInStaticDictionary(const BrotliDictionary * dictionary,const uint16_t * dictionary_hash,HasherHandle handle,const uint8_t * data,size_t max_length,size_t max_backward,HasherSearchResult * out,BROTLI_BOOL shallow)187 static BROTLI_INLINE BROTLI_BOOL SearchInStaticDictionary(
188     const BrotliDictionary* dictionary, const uint16_t* dictionary_hash,
189     HasherHandle handle, const uint8_t* data, size_t max_length,
190     size_t max_backward, HasherSearchResult* out, BROTLI_BOOL shallow) {
191   size_t key;
192   size_t i;
193   BROTLI_BOOL is_match_found = BROTLI_FALSE;
194   HasherCommon* self = GetHasherCommon(handle);
195   if (self->dict_num_matches < (self->dict_num_lookups >> 7)) {
196     return BROTLI_FALSE;
197   }
198   key = Hash14(data) << 1;
199   for (i = 0; i < (shallow ? 1u : 2u); ++i, ++key) {
200     size_t item = dictionary_hash[key];
201     self->dict_num_lookups++;
202     if (item != 0) {
203       BROTLI_BOOL item_matches = TestStaticDictionaryItem(
204           dictionary, item, data, max_length, max_backward, out);
205       if (item_matches) {
206         self->dict_num_matches++;
207         is_match_found = BROTLI_TRUE;
208       }
209     }
210   }
211   return is_match_found;
212 }
213 
214 typedef struct BackwardMatch {
215   uint32_t distance;
216   uint32_t length_and_code;
217 } BackwardMatch;
218 
InitBackwardMatch(BackwardMatch * self,size_t dist,size_t len)219 static BROTLI_INLINE void InitBackwardMatch(BackwardMatch* self,
220     size_t dist, size_t len) {
221   self->distance = (uint32_t)dist;
222   self->length_and_code = (uint32_t)(len << 5);
223 }
224 
InitDictionaryBackwardMatch(BackwardMatch * self,size_t dist,size_t len,size_t len_code)225 static BROTLI_INLINE void InitDictionaryBackwardMatch(BackwardMatch* self,
226     size_t dist, size_t len, size_t len_code) {
227   self->distance = (uint32_t)dist;
228   self->length_and_code =
229       (uint32_t)((len << 5) | (len == len_code ? 0 : len_code));
230 }
231 
BackwardMatchLength(const BackwardMatch * self)232 static BROTLI_INLINE size_t BackwardMatchLength(const BackwardMatch* self) {
233   return self->length_and_code >> 5;
234 }
235 
BackwardMatchLengthCode(const BackwardMatch * self)236 static BROTLI_INLINE size_t BackwardMatchLengthCode(const BackwardMatch* self) {
237   size_t code = self->length_and_code & 31;
238   return code ? code : BackwardMatchLength(self);
239 }
240 
241 #define EXPAND_CAT(a, b) CAT(a, b)
242 #define CAT(a, b) a ## b
243 #define FN(X) EXPAND_CAT(X, HASHER())
244 
245 #define HASHER() H10
246 #define BUCKET_BITS 17
247 #define MAX_TREE_SEARCH_DEPTH 64
248 #define MAX_TREE_COMP_LENGTH 128
249 #include "./hash_to_binary_tree_inc.h"  /* NOLINT(build/include) */
250 #undef MAX_TREE_SEARCH_DEPTH
251 #undef MAX_TREE_COMP_LENGTH
252 #undef BUCKET_BITS
253 #undef HASHER
254 /* MAX_NUM_MATCHES == 64 + MAX_TREE_SEARCH_DEPTH */
255 #define MAX_NUM_MATCHES_H10 128
256 
257 /* For BUCKET_SWEEP == 1, enabling the dictionary lookup makes compression
258    a little faster (0.5% - 1%) and it compresses 0.15% better on small text
259    and HTML inputs. */
260 
261 #define HASHER() H2
262 #define BUCKET_BITS 16
263 #define BUCKET_SWEEP 1
264 #define HASH_LEN 5
265 #define USE_DICTIONARY 1
266 #include "./hash_longest_match_quickly_inc.h"  /* NOLINT(build/include) */
267 #undef BUCKET_SWEEP
268 #undef USE_DICTIONARY
269 #undef HASHER
270 
271 #define HASHER() H3
272 #define BUCKET_SWEEP 2
273 #define USE_DICTIONARY 0
274 #include "./hash_longest_match_quickly_inc.h"  /* NOLINT(build/include) */
275 #undef USE_DICTIONARY
276 #undef BUCKET_SWEEP
277 #undef BUCKET_BITS
278 #undef HASHER
279 
280 #define HASHER() H4
281 #define BUCKET_BITS 17
282 #define BUCKET_SWEEP 4
283 #define USE_DICTIONARY 1
284 #include "./hash_longest_match_quickly_inc.h"  /* NOLINT(build/include) */
285 #undef USE_DICTIONARY
286 #undef HASH_LEN
287 #undef BUCKET_SWEEP
288 #undef BUCKET_BITS
289 #undef HASHER
290 
291 #define HASHER() H5
292 #include "./hash_longest_match_inc.h"  /* NOLINT(build/include) */
293 #undef HASHER
294 
295 #define HASHER() H6
296 #include "./hash_longest_match64_inc.h"  /* NOLINT(build/include) */
297 #undef HASHER
298 
299 #define BUCKET_BITS 15
300 
301 #define NUM_LAST_DISTANCES_TO_CHECK 4
302 #define NUM_BANKS 1
303 #define BANK_BITS 16
304 #define HASHER() H40
305 #include "./hash_forgetful_chain_inc.h"  /* NOLINT(build/include) */
306 #undef HASHER
307 #undef NUM_LAST_DISTANCES_TO_CHECK
308 
309 #define NUM_LAST_DISTANCES_TO_CHECK 10
310 #define HASHER() H41
311 #include "./hash_forgetful_chain_inc.h"  /* NOLINT(build/include) */
312 #undef HASHER
313 #undef NUM_LAST_DISTANCES_TO_CHECK
314 #undef NUM_BANKS
315 #undef BANK_BITS
316 
317 #define NUM_LAST_DISTANCES_TO_CHECK 16
318 #define NUM_BANKS 512
319 #define BANK_BITS 9
320 #define HASHER() H42
321 #include "./hash_forgetful_chain_inc.h"  /* NOLINT(build/include) */
322 #undef HASHER
323 #undef NUM_LAST_DISTANCES_TO_CHECK
324 #undef NUM_BANKS
325 #undef BANK_BITS
326 
327 #undef BUCKET_BITS
328 
329 #define HASHER() H54
330 #define BUCKET_BITS 20
331 #define BUCKET_SWEEP 4
332 #define HASH_LEN 7
333 #define USE_DICTIONARY 0
334 #include "./hash_longest_match_quickly_inc.h"  /* NOLINT(build/include) */
335 #undef USE_DICTIONARY
336 #undef HASH_LEN
337 #undef BUCKET_SWEEP
338 #undef BUCKET_BITS
339 #undef HASHER
340 
341 #undef FN
342 #undef CAT
343 #undef EXPAND_CAT
344 
345 #define FOR_GENERIC_HASHERS(H) H(2) H(3) H(4) H(5) H(6) H(40) H(41) H(42) H(54)
346 #define FOR_ALL_HASHERS(H) FOR_GENERIC_HASHERS(H) H(10)
347 
DestroyHasher(MemoryManager * m,HasherHandle * handle)348 static BROTLI_INLINE void DestroyHasher(
349     MemoryManager* m, HasherHandle* handle) {
350   if (*handle == NULL) return;
351   BROTLI_FREE(m, *handle);
352 }
353 
HasherReset(HasherHandle handle)354 static BROTLI_INLINE void HasherReset(HasherHandle handle) {
355   if (handle == NULL) return;
356   GetHasherCommon(handle)->is_prepared_ = BROTLI_FALSE;
357 }
358 
HasherSize(const BrotliEncoderParams * params,BROTLI_BOOL one_shot,const size_t input_size)359 static BROTLI_INLINE size_t HasherSize(const BrotliEncoderParams* params,
360     BROTLI_BOOL one_shot, const size_t input_size) {
361   size_t result = sizeof(HasherCommon);
362   switch (params->hasher.type) {
363 #define SIZE_(N)                                                         \
364     case N:                                                              \
365       result += HashMemAllocInBytesH ## N(params, one_shot, input_size); \
366       break;
367     FOR_ALL_HASHERS(SIZE_)
368 #undef SIZE_
369     default:
370       break;
371   }
372   return result;
373 }
374 
HasherSetup(MemoryManager * m,HasherHandle * handle,BrotliEncoderParams * params,const uint8_t * data,size_t position,size_t input_size,BROTLI_BOOL is_last)375 static BROTLI_INLINE void HasherSetup(MemoryManager* m, HasherHandle* handle,
376     BrotliEncoderParams* params, const uint8_t* data, size_t position,
377     size_t input_size, BROTLI_BOOL is_last) {
378   HasherHandle self = NULL;
379   HasherCommon* common = NULL;
380   BROTLI_BOOL one_shot = (position == 0 && is_last);
381   if (*handle == NULL) {
382     size_t alloc_size;
383     ChooseHasher(params, &params->hasher);
384     alloc_size = HasherSize(params, one_shot, input_size);
385     self = BROTLI_ALLOC(m, uint8_t, alloc_size);
386     if (BROTLI_IS_OOM(m)) return;
387     *handle = self;
388     common = GetHasherCommon(self);
389     common->params = params->hasher;
390     switch (common->params.type) {
391 #define INITIALIZE_(N)                     \
392       case N:                              \
393         InitializeH ## N(*handle, params); \
394         break;
395       FOR_ALL_HASHERS(INITIALIZE_);
396 #undef INITIALIZE_
397       default:
398         break;
399     }
400     HasherReset(*handle);
401   }
402 
403   self = *handle;
404   common = GetHasherCommon(self);
405   if (!common->is_prepared_) {
406     switch (common->params.type) {
407 #define PREPARE_(N)                                      \
408       case N:                                            \
409         PrepareH ## N(self, one_shot, input_size, data); \
410         break;
411       FOR_ALL_HASHERS(PREPARE_)
412 #undef PREPARE_
413       default: break;
414     }
415     if (position == 0) {
416         common->dict_num_lookups = 0;
417         common->dict_num_matches = 0;
418     }
419     common->is_prepared_ = BROTLI_TRUE;
420   }
421 }
422 
423 /* Custom LZ77 window. */
HasherPrependCustomDictionary(MemoryManager * m,HasherHandle * handle,BrotliEncoderParams * params,const size_t size,const uint8_t * dict)424 static BROTLI_INLINE void HasherPrependCustomDictionary(
425     MemoryManager* m, HasherHandle* handle, BrotliEncoderParams* params,
426     const size_t size, const uint8_t* dict) {
427   size_t overlap;
428   size_t i;
429   HasherHandle self;
430   HasherSetup(m, handle, params, dict, 0, size, BROTLI_FALSE);
431   if (BROTLI_IS_OOM(m)) return;
432   self = *handle;
433   switch (GetHasherCommon(self)->params.type) {
434 #define PREPEND_(N)                             \
435     case N:                                     \
436       overlap = (StoreLookaheadH ## N()) - 1;   \
437       for (i = 0; i + overlap < size; i++) {    \
438         StoreH ## N(self, dict, ~(size_t)0, i); \
439       }                                         \
440       break;
441     FOR_ALL_HASHERS(PREPEND_)
442 #undef PREPEND_
443     default: break;
444   }
445 }
446 
InitOrStitchToPreviousBlock(MemoryManager * m,HasherHandle * handle,const uint8_t * data,size_t mask,BrotliEncoderParams * params,size_t position,size_t input_size,BROTLI_BOOL is_last)447 static BROTLI_INLINE void InitOrStitchToPreviousBlock(
448     MemoryManager* m, HasherHandle* handle, const uint8_t* data, size_t mask,
449     BrotliEncoderParams* params, size_t position, size_t input_size,
450     BROTLI_BOOL is_last) {
451   HasherHandle self;
452   HasherSetup(m, handle, params, data, position, input_size, is_last);
453   if (BROTLI_IS_OOM(m)) return;
454   self = *handle;
455   switch (GetHasherCommon(self)->params.type) {
456 #define INIT_(N)                                                           \
457     case N:                                                                \
458       StitchToPreviousBlockH ## N(self, input_size, position, data, mask); \
459     break;
460     FOR_ALL_HASHERS(INIT_)
461 #undef INIT_
462     default: break;
463   }
464 }
465 
466 #if defined(__cplusplus) || defined(c_plusplus)
467 }  /* extern "C" */
468 #endif
469 
470 #endif  /* BROTLI_ENC_HASH_H_ */
471