1 /* Copyright 2010 Google Inc. All Rights Reserved.
2 
3    Distributed under MIT license.
4    See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5 */
6 
7 /* A (forgetful) hash table to the data seen by the compressor, to
8    help create backward references to previous data. */
9 
10 #ifndef BROTLI_ENC_HASH_H_
11 #define BROTLI_ENC_HASH_H_
12 
13 #include <string.h>  /* memcmp, memset */
14 
15 #include "../common/constants.h"
16 #include "../common/dictionary.h"
17 #include "../common/platform.h"
18 #include <brotli/types.h>
19 #include "./encoder_dict.h"
20 #include "./fast_log.h"
21 #include "./find_match_length.h"
22 #include "./memory.h"
23 #include "./quality.h"
24 #include "./static_dict.h"
25 
26 #if defined(__cplusplus) || defined(c_plusplus)
27 extern "C" {
28 #endif
29 
30 /* Pointer to hasher data.
31  *
32  * Excluding initialization and destruction, hasher can be passed as
33  * HasherHandle by value.
34  *
35  * Typically hasher data consists of 3 sections:
36  * * HasherCommon structure
37  * * private structured hasher data, depending on hasher type
38  * * private dynamic hasher data, depending on hasher type and parameters
39  */
40 typedef uint8_t* HasherHandle;
41 
42 typedef struct {
43   BrotliHasherParams params;
44 
45   /* False if hasher needs to be "prepared" before use. */
46   BROTLI_BOOL is_prepared_;
47 
48   size_t dict_num_lookups;
49   size_t dict_num_matches;
50 } HasherCommon;
51 
GetHasherCommon(HasherHandle handle)52 static BROTLI_INLINE HasherCommon* GetHasherCommon(HasherHandle handle) {
53   return (HasherCommon*)handle;
54 }
55 
56 #define score_t size_t
57 
58 static const uint32_t kCutoffTransformsCount = 10;
59 /*   0,  12,   27,    23,    42,    63,    56,    48,    59,    64 */
60 /* 0+0, 4+8, 8+19, 12+11, 16+26, 20+43, 24+32, 28+20, 32+27, 36+28 */
61 static const uint64_t kCutoffTransforms =
62     BROTLI_MAKE_UINT64_T(0x071B520A, 0xDA2D3200);
63 
64 typedef struct HasherSearchResult {
65   size_t len;
66   size_t distance;
67   score_t score;
68   int len_code_delta; /* == len_code - len */
69 } HasherSearchResult;
70 
71 /* kHashMul32 multiplier has these properties:
72    * The multiplier must be odd. Otherwise we may lose the highest bit.
73    * No long streaks of ones or zeros.
74    * There is no effort to ensure that it is a prime, the oddity is enough
75      for this use.
76    * The number has been tuned heuristically against compression benchmarks. */
77 static const uint32_t kHashMul32 = 0x1E35A7BD;
78 static const uint64_t kHashMul64 = BROTLI_MAKE_UINT64_T(0x1E35A7BD, 0x1E35A7BD);
79 static const uint64_t kHashMul64Long =
80     BROTLI_MAKE_UINT64_T(0x1FE35A7Bu, 0xD3579BD3u);
81 
Hash14(const uint8_t * data)82 static BROTLI_INLINE uint32_t Hash14(const uint8_t* data) {
83   uint32_t h = BROTLI_UNALIGNED_LOAD32LE(data) * kHashMul32;
84   /* The higher bits contain more mixture from the multiplication,
85      so we take our results from there. */
86   return h >> (32 - 14);
87 }
88 
PrepareDistanceCache(int * BROTLI_RESTRICT distance_cache,const int num_distances)89 static BROTLI_INLINE void PrepareDistanceCache(
90     int* BROTLI_RESTRICT distance_cache, const int num_distances) {
91   if (num_distances > 4) {
92     int last_distance = distance_cache[0];
93     distance_cache[4] = last_distance - 1;
94     distance_cache[5] = last_distance + 1;
95     distance_cache[6] = last_distance - 2;
96     distance_cache[7] = last_distance + 2;
97     distance_cache[8] = last_distance - 3;
98     distance_cache[9] = last_distance + 3;
99     if (num_distances > 10) {
100       int next_last_distance = distance_cache[1];
101       distance_cache[10] = next_last_distance - 1;
102       distance_cache[11] = next_last_distance + 1;
103       distance_cache[12] = next_last_distance - 2;
104       distance_cache[13] = next_last_distance + 2;
105       distance_cache[14] = next_last_distance - 3;
106       distance_cache[15] = next_last_distance + 3;
107     }
108   }
109 }
110 
111 #define BROTLI_LITERAL_BYTE_SCORE 135
112 #define BROTLI_DISTANCE_BIT_PENALTY 30
113 /* Score must be positive after applying maximal penalty. */
114 #define BROTLI_SCORE_BASE (BROTLI_DISTANCE_BIT_PENALTY * 8 * sizeof(size_t))
115 
116 /* Usually, we always choose the longest backward reference. This function
117    allows for the exception of that rule.
118 
119    If we choose a backward reference that is further away, it will
120    usually be coded with more bits. We approximate this by assuming
121    log2(distance). If the distance can be expressed in terms of the
122    last four distances, we use some heuristic constants to estimate
123    the bits cost. For the first up to four literals we use the bit
124    cost of the literals from the literal cost model, after that we
125    use the average bit cost of the cost model.
126 
127    This function is used to sometimes discard a longer backward reference
128    when it is not much longer and the bit cost for encoding it is more
129    than the saved literals.
130 
131    backward_reference_offset MUST be positive. */
BackwardReferenceScore(size_t copy_length,size_t backward_reference_offset)132 static BROTLI_INLINE score_t BackwardReferenceScore(
133     size_t copy_length, size_t backward_reference_offset) {
134   return BROTLI_SCORE_BASE + BROTLI_LITERAL_BYTE_SCORE * (score_t)copy_length -
135       BROTLI_DISTANCE_BIT_PENALTY * Log2FloorNonZero(backward_reference_offset);
136 }
137 
BackwardReferenceScoreUsingLastDistance(size_t copy_length)138 static BROTLI_INLINE score_t BackwardReferenceScoreUsingLastDistance(
139     size_t copy_length) {
140   return BROTLI_LITERAL_BYTE_SCORE * (score_t)copy_length +
141       BROTLI_SCORE_BASE + 15;
142 }
143 
BackwardReferencePenaltyUsingLastDistance(size_t distance_short_code)144 static BROTLI_INLINE score_t BackwardReferencePenaltyUsingLastDistance(
145     size_t distance_short_code) {
146   return (score_t)39 + ((0x1CA10 >> (distance_short_code & 0xE)) & 0xE);
147 }
148 
TestStaticDictionaryItem(const BrotliEncoderDictionary * dictionary,size_t item,const uint8_t * data,size_t max_length,size_t max_backward,size_t max_distance,HasherSearchResult * out)149 static BROTLI_INLINE BROTLI_BOOL TestStaticDictionaryItem(
150     const BrotliEncoderDictionary* dictionary, size_t item, const uint8_t* data,
151     size_t max_length, size_t max_backward, size_t max_distance,
152     HasherSearchResult* out) {
153   size_t len;
154   size_t dist;
155   size_t offset;
156   size_t matchlen;
157   size_t backward;
158   score_t score;
159   len = item & 0x1F;
160   dist = item >> 5;
161   offset = dictionary->words->offsets_by_length[len] + len * dist;
162   if (len > max_length) {
163     return BROTLI_FALSE;
164   }
165 
166   matchlen =
167       FindMatchLengthWithLimit(data, &dictionary->words->data[offset], len);
168   if (matchlen + dictionary->cutoffTransformsCount <= len || matchlen == 0) {
169     return BROTLI_FALSE;
170   }
171   {
172     size_t cut = len - matchlen;
173     size_t transform_id = (cut << 2) +
174         (size_t)((dictionary->cutoffTransforms >> (cut * 6)) & 0x3F);
175     backward = max_backward + dist + 1 +
176         (transform_id << dictionary->words->size_bits_by_length[len]);
177   }
178   if (backward > max_distance) {
179     return BROTLI_FALSE;
180   }
181   score = BackwardReferenceScore(matchlen, backward);
182   if (score < out->score) {
183     return BROTLI_FALSE;
184   }
185   out->len = matchlen;
186   out->len_code_delta = (int)len - (int)matchlen;
187   out->distance = backward;
188   out->score = score;
189   return BROTLI_TRUE;
190 }
191 
SearchInStaticDictionary(const BrotliEncoderDictionary * dictionary,HasherHandle handle,const uint8_t * data,size_t max_length,size_t max_backward,size_t max_distance,HasherSearchResult * out,BROTLI_BOOL shallow)192 static BROTLI_INLINE void SearchInStaticDictionary(
193     const BrotliEncoderDictionary* dictionary,
194     HasherHandle handle, const uint8_t* data, size_t max_length,
195     size_t max_backward, size_t max_distance,
196     HasherSearchResult* out, BROTLI_BOOL shallow) {
197   size_t key;
198   size_t i;
199   HasherCommon* self = GetHasherCommon(handle);
200   if (self->dict_num_matches < (self->dict_num_lookups >> 7)) {
201     return;
202   }
203   key = Hash14(data) << 1;
204   for (i = 0; i < (shallow ? 1u : 2u); ++i, ++key) {
205     size_t item = dictionary->hash_table[key];
206     self->dict_num_lookups++;
207     if (item != 0) {
208       BROTLI_BOOL item_matches = TestStaticDictionaryItem(
209           dictionary, item, data, max_length, max_backward, max_distance, out);
210       if (item_matches) {
211         self->dict_num_matches++;
212       }
213     }
214   }
215 }
216 
217 typedef struct BackwardMatch {
218   uint32_t distance;
219   uint32_t length_and_code;
220 } BackwardMatch;
221 
InitBackwardMatch(BackwardMatch * self,size_t dist,size_t len)222 static BROTLI_INLINE void InitBackwardMatch(BackwardMatch* self,
223     size_t dist, size_t len) {
224   self->distance = (uint32_t)dist;
225   self->length_and_code = (uint32_t)(len << 5);
226 }
227 
InitDictionaryBackwardMatch(BackwardMatch * self,size_t dist,size_t len,size_t len_code)228 static BROTLI_INLINE void InitDictionaryBackwardMatch(BackwardMatch* self,
229     size_t dist, size_t len, size_t len_code) {
230   self->distance = (uint32_t)dist;
231   self->length_and_code =
232       (uint32_t)((len << 5) | (len == len_code ? 0 : len_code));
233 }
234 
BackwardMatchLength(const BackwardMatch * self)235 static BROTLI_INLINE size_t BackwardMatchLength(const BackwardMatch* self) {
236   return self->length_and_code >> 5;
237 }
238 
BackwardMatchLengthCode(const BackwardMatch * self)239 static BROTLI_INLINE size_t BackwardMatchLengthCode(const BackwardMatch* self) {
240   size_t code = self->length_and_code & 31;
241   return code ? code : BackwardMatchLength(self);
242 }
243 
244 #define EXPAND_CAT(a, b) CAT(a, b)
245 #define CAT(a, b) a ## b
246 #define FN(X) EXPAND_CAT(X, HASHER())
247 
248 #define HASHER() H10
249 #define BUCKET_BITS 17
250 #define MAX_TREE_SEARCH_DEPTH 64
251 #define MAX_TREE_COMP_LENGTH 128
252 #include "./hash_to_binary_tree_inc.h"  /* NOLINT(build/include) */
253 #undef MAX_TREE_SEARCH_DEPTH
254 #undef MAX_TREE_COMP_LENGTH
255 #undef BUCKET_BITS
256 #undef HASHER
257 /* MAX_NUM_MATCHES == 64 + MAX_TREE_SEARCH_DEPTH */
258 #define MAX_NUM_MATCHES_H10 128
259 
260 /* For BUCKET_SWEEP == 1, enabling the dictionary lookup makes compression
261    a little faster (0.5% - 1%) and it compresses 0.15% better on small text
262    and HTML inputs. */
263 
264 #define HASHER() H2
265 #define BUCKET_BITS 16
266 #define BUCKET_SWEEP 1
267 #define HASH_LEN 5
268 #define USE_DICTIONARY 1
269 #include "./hash_longest_match_quickly_inc.h"  /* NOLINT(build/include) */
270 #undef BUCKET_SWEEP
271 #undef USE_DICTIONARY
272 #undef HASHER
273 
274 #define HASHER() H3
275 #define BUCKET_SWEEP 2
276 #define USE_DICTIONARY 0
277 #include "./hash_longest_match_quickly_inc.h"  /* NOLINT(build/include) */
278 #undef USE_DICTIONARY
279 #undef BUCKET_SWEEP
280 #undef BUCKET_BITS
281 #undef HASHER
282 
283 #define HASHER() H4
284 #define BUCKET_BITS 17
285 #define BUCKET_SWEEP 4
286 #define USE_DICTIONARY 1
287 #include "./hash_longest_match_quickly_inc.h"  /* NOLINT(build/include) */
288 #undef USE_DICTIONARY
289 #undef HASH_LEN
290 #undef BUCKET_SWEEP
291 #undef BUCKET_BITS
292 #undef HASHER
293 
294 #define HASHER() H5
295 #include "./hash_longest_match_inc.h"  /* NOLINT(build/include) */
296 #undef HASHER
297 
298 #define HASHER() H6
299 #include "./hash_longest_match64_inc.h"  /* NOLINT(build/include) */
300 #undef HASHER
301 
302 #define BUCKET_BITS 15
303 
304 #define NUM_LAST_DISTANCES_TO_CHECK 4
305 #define NUM_BANKS 1
306 #define BANK_BITS 16
307 #define HASHER() H40
308 #include "./hash_forgetful_chain_inc.h"  /* NOLINT(build/include) */
309 #undef HASHER
310 #undef NUM_LAST_DISTANCES_TO_CHECK
311 
312 #define NUM_LAST_DISTANCES_TO_CHECK 10
313 #define HASHER() H41
314 #include "./hash_forgetful_chain_inc.h"  /* NOLINT(build/include) */
315 #undef HASHER
316 #undef NUM_LAST_DISTANCES_TO_CHECK
317 #undef NUM_BANKS
318 #undef BANK_BITS
319 
320 #define NUM_LAST_DISTANCES_TO_CHECK 16
321 #define NUM_BANKS 512
322 #define BANK_BITS 9
323 #define HASHER() H42
324 #include "./hash_forgetful_chain_inc.h"  /* NOLINT(build/include) */
325 #undef HASHER
326 #undef NUM_LAST_DISTANCES_TO_CHECK
327 #undef NUM_BANKS
328 #undef BANK_BITS
329 
330 #undef BUCKET_BITS
331 
332 #define HASHER() H54
333 #define BUCKET_BITS 20
334 #define BUCKET_SWEEP 4
335 #define HASH_LEN 7
336 #define USE_DICTIONARY 0
337 #include "./hash_longest_match_quickly_inc.h"  /* NOLINT(build/include) */
338 #undef USE_DICTIONARY
339 #undef HASH_LEN
340 #undef BUCKET_SWEEP
341 #undef BUCKET_BITS
342 #undef HASHER
343 
344 #undef FN
345 #undef CAT
346 #undef EXPAND_CAT
347 
348 #define FOR_GENERIC_HASHERS(H) H(2) H(3) H(4) H(5) H(6) H(40) H(41) H(42) H(54)
349 #define FOR_ALL_HASHERS(H) FOR_GENERIC_HASHERS(H) H(10)
350 
DestroyHasher(MemoryManager * m,HasherHandle * handle)351 static BROTLI_INLINE void DestroyHasher(
352     MemoryManager* m, HasherHandle* handle) {
353   if (*handle == NULL) return;
354   BROTLI_FREE(m, *handle);
355 }
356 
HasherReset(HasherHandle handle)357 static BROTLI_INLINE void HasherReset(HasherHandle handle) {
358   if (handle == NULL) return;
359   GetHasherCommon(handle)->is_prepared_ = BROTLI_FALSE;
360 }
361 
HasherSize(const BrotliEncoderParams * params,BROTLI_BOOL one_shot,const size_t input_size)362 static BROTLI_INLINE size_t HasherSize(const BrotliEncoderParams* params,
363     BROTLI_BOOL one_shot, const size_t input_size) {
364   size_t result = sizeof(HasherCommon);
365   switch (params->hasher.type) {
366 #define SIZE_(N)                                                         \
367     case N:                                                              \
368       result += HashMemAllocInBytesH ## N(params, one_shot, input_size); \
369       break;
370     FOR_ALL_HASHERS(SIZE_)
371 #undef SIZE_
372     default:
373       break;
374   }
375   return result;
376 }
377 
HasherSetup(MemoryManager * m,HasherHandle * handle,BrotliEncoderParams * params,const uint8_t * data,size_t position,size_t input_size,BROTLI_BOOL is_last)378 static BROTLI_INLINE void HasherSetup(MemoryManager* m, HasherHandle* handle,
379     BrotliEncoderParams* params, const uint8_t* data, size_t position,
380     size_t input_size, BROTLI_BOOL is_last) {
381   HasherHandle self = NULL;
382   HasherCommon* common = NULL;
383   BROTLI_BOOL one_shot = (position == 0 && is_last);
384   if (*handle == NULL) {
385     size_t alloc_size;
386     ChooseHasher(params, &params->hasher);
387     alloc_size = HasherSize(params, one_shot, input_size);
388     self = BROTLI_ALLOC(m, uint8_t, alloc_size);
389     if (BROTLI_IS_OOM(m)) return;
390     *handle = self;
391     common = GetHasherCommon(self);
392     common->params = params->hasher;
393     switch (common->params.type) {
394 #define INITIALIZE_(N)                     \
395       case N:                              \
396         InitializeH ## N(*handle, params); \
397         break;
398       FOR_ALL_HASHERS(INITIALIZE_);
399 #undef INITIALIZE_
400       default:
401         break;
402     }
403     HasherReset(*handle);
404   }
405 
406   self = *handle;
407   common = GetHasherCommon(self);
408   if (!common->is_prepared_) {
409     switch (common->params.type) {
410 #define PREPARE_(N)                                      \
411       case N:                                            \
412         PrepareH ## N(self, one_shot, input_size, data); \
413         break;
414       FOR_ALL_HASHERS(PREPARE_)
415 #undef PREPARE_
416       default: break;
417     }
418     if (position == 0) {
419         common->dict_num_lookups = 0;
420         common->dict_num_matches = 0;
421     }
422     common->is_prepared_ = BROTLI_TRUE;
423   }
424 }
425 
InitOrStitchToPreviousBlock(MemoryManager * m,HasherHandle * handle,const uint8_t * data,size_t mask,BrotliEncoderParams * params,size_t position,size_t input_size,BROTLI_BOOL is_last)426 static BROTLI_INLINE void InitOrStitchToPreviousBlock(
427     MemoryManager* m, HasherHandle* handle, const uint8_t* data, size_t mask,
428     BrotliEncoderParams* params, size_t position, size_t input_size,
429     BROTLI_BOOL is_last) {
430   HasherHandle self;
431   HasherSetup(m, handle, params, data, position, input_size, is_last);
432   if (BROTLI_IS_OOM(m)) return;
433   self = *handle;
434   switch (GetHasherCommon(self)->params.type) {
435 #define INIT_(N)                                                           \
436     case N:                                                                \
437       StitchToPreviousBlockH ## N(self, input_size, position, data, mask); \
438     break;
439     FOR_ALL_HASHERS(INIT_)
440 #undef INIT_
441     default: break;
442   }
443 }
444 
445 #if defined(__cplusplus) || defined(c_plusplus)
446 }  /* extern "C" */
447 #endif
448 
449 #endif  /* BROTLI_ENC_HASH_H_ */
450