1 /* Copyright 2010 Google Inc. All Rights Reserved.
2
3 Distributed under MIT license.
4 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5 */
6
7 /* A (forgetful) hash table to the data seen by the compressor, to
8 help create backward references to previous data. */
9
10 #ifndef BROTLI_ENC_HASH_H_
11 #define BROTLI_ENC_HASH_H_
12
13 #include <string.h> /* memcmp, memset */
14
15 #include "../common/constants.h"
16 #include "../common/dictionary.h"
17 #include "../common/platform.h"
18 #include <brotli/types.h>
19 #include "./encoder_dict.h"
20 #include "./fast_log.h"
21 #include "./find_match_length.h"
22 #include "./memory.h"
23 #include "./quality.h"
24 #include "./static_dict.h"
25
26 #if defined(__cplusplus) || defined(c_plusplus)
27 extern "C" {
28 #endif
29
30 /* Pointer to hasher data.
31 *
32 * Excluding initialization and destruction, hasher can be passed as
33 * HasherHandle by value.
34 *
35 * Typically hasher data consists of 3 sections:
36 * * HasherCommon structure
37 * * private structured hasher data, depending on hasher type
38 * * private dynamic hasher data, depending on hasher type and parameters
39 */
40 typedef uint8_t* HasherHandle;
41
42 typedef struct {
43 BrotliHasherParams params;
44
45 /* False if hasher needs to be "prepared" before use. */
46 BROTLI_BOOL is_prepared_;
47
48 size_t dict_num_lookups;
49 size_t dict_num_matches;
50 } HasherCommon;
51
GetHasherCommon(HasherHandle handle)52 static BROTLI_INLINE HasherCommon* GetHasherCommon(HasherHandle handle) {
53 return (HasherCommon*)handle;
54 }
55
56 #define score_t size_t
57
58 static const uint32_t kCutoffTransformsCount = 10;
59 /* 0, 12, 27, 23, 42, 63, 56, 48, 59, 64 */
60 /* 0+0, 4+8, 8+19, 12+11, 16+26, 20+43, 24+32, 28+20, 32+27, 36+28 */
61 static const uint64_t kCutoffTransforms =
62 BROTLI_MAKE_UINT64_T(0x071B520A, 0xDA2D3200);
63
64 typedef struct HasherSearchResult {
65 size_t len;
66 size_t distance;
67 score_t score;
68 int len_code_delta; /* == len_code - len */
69 } HasherSearchResult;
70
71 /* kHashMul32 multiplier has these properties:
72 * The multiplier must be odd. Otherwise we may lose the highest bit.
73 * No long streaks of ones or zeros.
74 * There is no effort to ensure that it is a prime, the oddity is enough
75 for this use.
76 * The number has been tuned heuristically against compression benchmarks. */
77 static const uint32_t kHashMul32 = 0x1E35A7BD;
78 static const uint64_t kHashMul64 = BROTLI_MAKE_UINT64_T(0x1E35A7BD, 0x1E35A7BD);
79 static const uint64_t kHashMul64Long =
80 BROTLI_MAKE_UINT64_T(0x1FE35A7Bu, 0xD3579BD3u);
81
Hash14(const uint8_t * data)82 static BROTLI_INLINE uint32_t Hash14(const uint8_t* data) {
83 uint32_t h = BROTLI_UNALIGNED_LOAD32LE(data) * kHashMul32;
84 /* The higher bits contain more mixture from the multiplication,
85 so we take our results from there. */
86 return h >> (32 - 14);
87 }
88
PrepareDistanceCache(int * BROTLI_RESTRICT distance_cache,const int num_distances)89 static BROTLI_INLINE void PrepareDistanceCache(
90 int* BROTLI_RESTRICT distance_cache, const int num_distances) {
91 if (num_distances > 4) {
92 int last_distance = distance_cache[0];
93 distance_cache[4] = last_distance - 1;
94 distance_cache[5] = last_distance + 1;
95 distance_cache[6] = last_distance - 2;
96 distance_cache[7] = last_distance + 2;
97 distance_cache[8] = last_distance - 3;
98 distance_cache[9] = last_distance + 3;
99 if (num_distances > 10) {
100 int next_last_distance = distance_cache[1];
101 distance_cache[10] = next_last_distance - 1;
102 distance_cache[11] = next_last_distance + 1;
103 distance_cache[12] = next_last_distance - 2;
104 distance_cache[13] = next_last_distance + 2;
105 distance_cache[14] = next_last_distance - 3;
106 distance_cache[15] = next_last_distance + 3;
107 }
108 }
109 }
110
111 #define BROTLI_LITERAL_BYTE_SCORE 135
112 #define BROTLI_DISTANCE_BIT_PENALTY 30
113 /* Score must be positive after applying maximal penalty. */
114 #define BROTLI_SCORE_BASE (BROTLI_DISTANCE_BIT_PENALTY * 8 * sizeof(size_t))
115
116 /* Usually, we always choose the longest backward reference. This function
117 allows for the exception of that rule.
118
119 If we choose a backward reference that is further away, it will
120 usually be coded with more bits. We approximate this by assuming
121 log2(distance). If the distance can be expressed in terms of the
122 last four distances, we use some heuristic constants to estimate
123 the bits cost. For the first up to four literals we use the bit
124 cost of the literals from the literal cost model, after that we
125 use the average bit cost of the cost model.
126
127 This function is used to sometimes discard a longer backward reference
128 when it is not much longer and the bit cost for encoding it is more
129 than the saved literals.
130
131 backward_reference_offset MUST be positive. */
BackwardReferenceScore(size_t copy_length,size_t backward_reference_offset)132 static BROTLI_INLINE score_t BackwardReferenceScore(
133 size_t copy_length, size_t backward_reference_offset) {
134 return BROTLI_SCORE_BASE + BROTLI_LITERAL_BYTE_SCORE * (score_t)copy_length -
135 BROTLI_DISTANCE_BIT_PENALTY * Log2FloorNonZero(backward_reference_offset);
136 }
137
BackwardReferenceScoreUsingLastDistance(size_t copy_length)138 static BROTLI_INLINE score_t BackwardReferenceScoreUsingLastDistance(
139 size_t copy_length) {
140 return BROTLI_LITERAL_BYTE_SCORE * (score_t)copy_length +
141 BROTLI_SCORE_BASE + 15;
142 }
143
BackwardReferencePenaltyUsingLastDistance(size_t distance_short_code)144 static BROTLI_INLINE score_t BackwardReferencePenaltyUsingLastDistance(
145 size_t distance_short_code) {
146 return (score_t)39 + ((0x1CA10 >> (distance_short_code & 0xE)) & 0xE);
147 }
148
TestStaticDictionaryItem(const BrotliEncoderDictionary * dictionary,size_t item,const uint8_t * data,size_t max_length,size_t max_backward,size_t max_distance,HasherSearchResult * out)149 static BROTLI_INLINE BROTLI_BOOL TestStaticDictionaryItem(
150 const BrotliEncoderDictionary* dictionary, size_t item, const uint8_t* data,
151 size_t max_length, size_t max_backward, size_t max_distance,
152 HasherSearchResult* out) {
153 size_t len;
154 size_t dist;
155 size_t offset;
156 size_t matchlen;
157 size_t backward;
158 score_t score;
159 len = item & 0x1F;
160 dist = item >> 5;
161 offset = dictionary->words->offsets_by_length[len] + len * dist;
162 if (len > max_length) {
163 return BROTLI_FALSE;
164 }
165
166 matchlen =
167 FindMatchLengthWithLimit(data, &dictionary->words->data[offset], len);
168 if (matchlen + dictionary->cutoffTransformsCount <= len || matchlen == 0) {
169 return BROTLI_FALSE;
170 }
171 {
172 size_t cut = len - matchlen;
173 size_t transform_id = (cut << 2) +
174 (size_t)((dictionary->cutoffTransforms >> (cut * 6)) & 0x3F);
175 backward = max_backward + dist + 1 +
176 (transform_id << dictionary->words->size_bits_by_length[len]);
177 }
178 if (backward > max_distance) {
179 return BROTLI_FALSE;
180 }
181 score = BackwardReferenceScore(matchlen, backward);
182 if (score < out->score) {
183 return BROTLI_FALSE;
184 }
185 out->len = matchlen;
186 out->len_code_delta = (int)len - (int)matchlen;
187 out->distance = backward;
188 out->score = score;
189 return BROTLI_TRUE;
190 }
191
SearchInStaticDictionary(const BrotliEncoderDictionary * dictionary,HasherHandle handle,const uint8_t * data,size_t max_length,size_t max_backward,size_t max_distance,HasherSearchResult * out,BROTLI_BOOL shallow)192 static BROTLI_INLINE void SearchInStaticDictionary(
193 const BrotliEncoderDictionary* dictionary,
194 HasherHandle handle, const uint8_t* data, size_t max_length,
195 size_t max_backward, size_t max_distance,
196 HasherSearchResult* out, BROTLI_BOOL shallow) {
197 size_t key;
198 size_t i;
199 HasherCommon* self = GetHasherCommon(handle);
200 if (self->dict_num_matches < (self->dict_num_lookups >> 7)) {
201 return;
202 }
203 key = Hash14(data) << 1;
204 for (i = 0; i < (shallow ? 1u : 2u); ++i, ++key) {
205 size_t item = dictionary->hash_table[key];
206 self->dict_num_lookups++;
207 if (item != 0) {
208 BROTLI_BOOL item_matches = TestStaticDictionaryItem(
209 dictionary, item, data, max_length, max_backward, max_distance, out);
210 if (item_matches) {
211 self->dict_num_matches++;
212 }
213 }
214 }
215 }
216
217 typedef struct BackwardMatch {
218 uint32_t distance;
219 uint32_t length_and_code;
220 } BackwardMatch;
221
InitBackwardMatch(BackwardMatch * self,size_t dist,size_t len)222 static BROTLI_INLINE void InitBackwardMatch(BackwardMatch* self,
223 size_t dist, size_t len) {
224 self->distance = (uint32_t)dist;
225 self->length_and_code = (uint32_t)(len << 5);
226 }
227
InitDictionaryBackwardMatch(BackwardMatch * self,size_t dist,size_t len,size_t len_code)228 static BROTLI_INLINE void InitDictionaryBackwardMatch(BackwardMatch* self,
229 size_t dist, size_t len, size_t len_code) {
230 self->distance = (uint32_t)dist;
231 self->length_and_code =
232 (uint32_t)((len << 5) | (len == len_code ? 0 : len_code));
233 }
234
BackwardMatchLength(const BackwardMatch * self)235 static BROTLI_INLINE size_t BackwardMatchLength(const BackwardMatch* self) {
236 return self->length_and_code >> 5;
237 }
238
BackwardMatchLengthCode(const BackwardMatch * self)239 static BROTLI_INLINE size_t BackwardMatchLengthCode(const BackwardMatch* self) {
240 size_t code = self->length_and_code & 31;
241 return code ? code : BackwardMatchLength(self);
242 }
243
244 #define EXPAND_CAT(a, b) CAT(a, b)
245 #define CAT(a, b) a ## b
246 #define FN(X) EXPAND_CAT(X, HASHER())
247
248 #define HASHER() H10
249 #define BUCKET_BITS 17
250 #define MAX_TREE_SEARCH_DEPTH 64
251 #define MAX_TREE_COMP_LENGTH 128
252 #include "./hash_to_binary_tree_inc.h" /* NOLINT(build/include) */
253 #undef MAX_TREE_SEARCH_DEPTH
254 #undef MAX_TREE_COMP_LENGTH
255 #undef BUCKET_BITS
256 #undef HASHER
257 /* MAX_NUM_MATCHES == 64 + MAX_TREE_SEARCH_DEPTH */
258 #define MAX_NUM_MATCHES_H10 128
259
260 /* For BUCKET_SWEEP == 1, enabling the dictionary lookup makes compression
261 a little faster (0.5% - 1%) and it compresses 0.15% better on small text
262 and HTML inputs. */
263
264 #define HASHER() H2
265 #define BUCKET_BITS 16
266 #define BUCKET_SWEEP 1
267 #define HASH_LEN 5
268 #define USE_DICTIONARY 1
269 #include "./hash_longest_match_quickly_inc.h" /* NOLINT(build/include) */
270 #undef BUCKET_SWEEP
271 #undef USE_DICTIONARY
272 #undef HASHER
273
274 #define HASHER() H3
275 #define BUCKET_SWEEP 2
276 #define USE_DICTIONARY 0
277 #include "./hash_longest_match_quickly_inc.h" /* NOLINT(build/include) */
278 #undef USE_DICTIONARY
279 #undef BUCKET_SWEEP
280 #undef BUCKET_BITS
281 #undef HASHER
282
283 #define HASHER() H4
284 #define BUCKET_BITS 17
285 #define BUCKET_SWEEP 4
286 #define USE_DICTIONARY 1
287 #include "./hash_longest_match_quickly_inc.h" /* NOLINT(build/include) */
288 #undef USE_DICTIONARY
289 #undef HASH_LEN
290 #undef BUCKET_SWEEP
291 #undef BUCKET_BITS
292 #undef HASHER
293
294 #define HASHER() H5
295 #include "./hash_longest_match_inc.h" /* NOLINT(build/include) */
296 #undef HASHER
297
298 #define HASHER() H6
299 #include "./hash_longest_match64_inc.h" /* NOLINT(build/include) */
300 #undef HASHER
301
302 #define BUCKET_BITS 15
303
304 #define NUM_LAST_DISTANCES_TO_CHECK 4
305 #define NUM_BANKS 1
306 #define BANK_BITS 16
307 #define HASHER() H40
308 #include "./hash_forgetful_chain_inc.h" /* NOLINT(build/include) */
309 #undef HASHER
310 #undef NUM_LAST_DISTANCES_TO_CHECK
311
312 #define NUM_LAST_DISTANCES_TO_CHECK 10
313 #define HASHER() H41
314 #include "./hash_forgetful_chain_inc.h" /* NOLINT(build/include) */
315 #undef HASHER
316 #undef NUM_LAST_DISTANCES_TO_CHECK
317 #undef NUM_BANKS
318 #undef BANK_BITS
319
320 #define NUM_LAST_DISTANCES_TO_CHECK 16
321 #define NUM_BANKS 512
322 #define BANK_BITS 9
323 #define HASHER() H42
324 #include "./hash_forgetful_chain_inc.h" /* NOLINT(build/include) */
325 #undef HASHER
326 #undef NUM_LAST_DISTANCES_TO_CHECK
327 #undef NUM_BANKS
328 #undef BANK_BITS
329
330 #undef BUCKET_BITS
331
332 #define HASHER() H54
333 #define BUCKET_BITS 20
334 #define BUCKET_SWEEP 4
335 #define HASH_LEN 7
336 #define USE_DICTIONARY 0
337 #include "./hash_longest_match_quickly_inc.h" /* NOLINT(build/include) */
338 #undef USE_DICTIONARY
339 #undef HASH_LEN
340 #undef BUCKET_SWEEP
341 #undef BUCKET_BITS
342 #undef HASHER
343
344 #undef FN
345 #undef CAT
346 #undef EXPAND_CAT
347
348 #define FOR_GENERIC_HASHERS(H) H(2) H(3) H(4) H(5) H(6) H(40) H(41) H(42) H(54)
349 #define FOR_ALL_HASHERS(H) FOR_GENERIC_HASHERS(H) H(10)
350
DestroyHasher(MemoryManager * m,HasherHandle * handle)351 static BROTLI_INLINE void DestroyHasher(
352 MemoryManager* m, HasherHandle* handle) {
353 if (*handle == NULL) return;
354 BROTLI_FREE(m, *handle);
355 }
356
HasherReset(HasherHandle handle)357 static BROTLI_INLINE void HasherReset(HasherHandle handle) {
358 if (handle == NULL) return;
359 GetHasherCommon(handle)->is_prepared_ = BROTLI_FALSE;
360 }
361
HasherSize(const BrotliEncoderParams * params,BROTLI_BOOL one_shot,const size_t input_size)362 static BROTLI_INLINE size_t HasherSize(const BrotliEncoderParams* params,
363 BROTLI_BOOL one_shot, const size_t input_size) {
364 size_t result = sizeof(HasherCommon);
365 switch (params->hasher.type) {
366 #define SIZE_(N) \
367 case N: \
368 result += HashMemAllocInBytesH ## N(params, one_shot, input_size); \
369 break;
370 FOR_ALL_HASHERS(SIZE_)
371 #undef SIZE_
372 default:
373 break;
374 }
375 return result;
376 }
377
HasherSetup(MemoryManager * m,HasherHandle * handle,BrotliEncoderParams * params,const uint8_t * data,size_t position,size_t input_size,BROTLI_BOOL is_last)378 static BROTLI_INLINE void HasherSetup(MemoryManager* m, HasherHandle* handle,
379 BrotliEncoderParams* params, const uint8_t* data, size_t position,
380 size_t input_size, BROTLI_BOOL is_last) {
381 HasherHandle self = NULL;
382 HasherCommon* common = NULL;
383 BROTLI_BOOL one_shot = (position == 0 && is_last);
384 if (*handle == NULL) {
385 size_t alloc_size;
386 ChooseHasher(params, ¶ms->hasher);
387 alloc_size = HasherSize(params, one_shot, input_size);
388 self = BROTLI_ALLOC(m, uint8_t, alloc_size);
389 if (BROTLI_IS_OOM(m)) return;
390 *handle = self;
391 common = GetHasherCommon(self);
392 common->params = params->hasher;
393 switch (common->params.type) {
394 #define INITIALIZE_(N) \
395 case N: \
396 InitializeH ## N(*handle, params); \
397 break;
398 FOR_ALL_HASHERS(INITIALIZE_);
399 #undef INITIALIZE_
400 default:
401 break;
402 }
403 HasherReset(*handle);
404 }
405
406 self = *handle;
407 common = GetHasherCommon(self);
408 if (!common->is_prepared_) {
409 switch (common->params.type) {
410 #define PREPARE_(N) \
411 case N: \
412 PrepareH ## N(self, one_shot, input_size, data); \
413 break;
414 FOR_ALL_HASHERS(PREPARE_)
415 #undef PREPARE_
416 default: break;
417 }
418 if (position == 0) {
419 common->dict_num_lookups = 0;
420 common->dict_num_matches = 0;
421 }
422 common->is_prepared_ = BROTLI_TRUE;
423 }
424 }
425
InitOrStitchToPreviousBlock(MemoryManager * m,HasherHandle * handle,const uint8_t * data,size_t mask,BrotliEncoderParams * params,size_t position,size_t input_size,BROTLI_BOOL is_last)426 static BROTLI_INLINE void InitOrStitchToPreviousBlock(
427 MemoryManager* m, HasherHandle* handle, const uint8_t* data, size_t mask,
428 BrotliEncoderParams* params, size_t position, size_t input_size,
429 BROTLI_BOOL is_last) {
430 HasherHandle self;
431 HasherSetup(m, handle, params, data, position, input_size, is_last);
432 if (BROTLI_IS_OOM(m)) return;
433 self = *handle;
434 switch (GetHasherCommon(self)->params.type) {
435 #define INIT_(N) \
436 case N: \
437 StitchToPreviousBlockH ## N(self, input_size, position, data, mask); \
438 break;
439 FOR_ALL_HASHERS(INIT_)
440 #undef INIT_
441 default: break;
442 }
443 }
444
445 #if defined(__cplusplus) || defined(c_plusplus)
446 } /* extern "C" */
447 #endif
448
449 #endif /* BROTLI_ENC_HASH_H_ */
450