1 /* Copyright 2013 Google Inc. All Rights Reserved.
2 
3    Distributed under MIT license.
4    See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5 */
6 
7 #include "./static_dict.h"
8 
9 #include "../common/dictionary.h"
10 #include "../common/platform.h"
11 #include "../common/transform.h"
12 #include "./encoder_dict.h"
13 #include "./find_match_length.h"
14 
15 #if defined(__cplusplus) || defined(c_plusplus)
16 extern "C" {
17 #endif
18 
Hash(const uint8_t * data)19 static BROTLI_INLINE uint32_t Hash(const uint8_t* data) {
20   uint32_t h = BROTLI_UNALIGNED_LOAD32LE(data) * kDictHashMul32;
21   /* The higher bits contain more mixture from the multiplication,
22      so we take our results from there. */
23   return h >> (32 - kDictNumBits);
24 }
25 
AddMatch(size_t distance,size_t len,size_t len_code,uint32_t * matches)26 static BROTLI_INLINE void AddMatch(size_t distance, size_t len, size_t len_code,
27                                    uint32_t* matches) {
28   uint32_t match = (uint32_t)((distance << 5) + len_code);
29   matches[len] = BROTLI_MIN(uint32_t, matches[len], match);
30 }
31 
DictMatchLength(const BrotliDictionary * dictionary,const uint8_t * data,size_t id,size_t len,size_t maxlen)32 static BROTLI_INLINE size_t DictMatchLength(const BrotliDictionary* dictionary,
33                                             const uint8_t* data,
34                                             size_t id,
35                                             size_t len,
36                                             size_t maxlen) {
37   const size_t offset = dictionary->offsets_by_length[len] + len * id;
38   return FindMatchLengthWithLimit(&dictionary->data[offset], data,
39                                   BROTLI_MIN(size_t, len, maxlen));
40 }
41 
IsMatch(const BrotliDictionary * dictionary,DictWord w,const uint8_t * data,size_t max_length)42 static BROTLI_INLINE BROTLI_BOOL IsMatch(const BrotliDictionary* dictionary,
43     DictWord w, const uint8_t* data, size_t max_length) {
44   if (w.len > max_length) {
45     return BROTLI_FALSE;
46   } else {
47     const size_t offset = dictionary->offsets_by_length[w.len] +
48         (size_t)w.len * (size_t)w.idx;
49     const uint8_t* dict = &dictionary->data[offset];
50     if (w.transform == 0) {
51       /* Match against base dictionary word. */
52       return
53           TO_BROTLI_BOOL(FindMatchLengthWithLimit(dict, data, w.len) == w.len);
54     } else if (w.transform == 10) {
55       /* Match against uppercase first transform.
56          Note that there are only ASCII uppercase words in the lookup table. */
57       return TO_BROTLI_BOOL(dict[0] >= 'a' && dict[0] <= 'z' &&
58               (dict[0] ^ 32) == data[0] &&
59               FindMatchLengthWithLimit(&dict[1], &data[1], w.len - 1u) ==
60               w.len - 1u);
61     } else {
62       /* Match against uppercase all transform.
63          Note that there are only ASCII uppercase words in the lookup table. */
64       size_t i;
65       for (i = 0; i < w.len; ++i) {
66         if (dict[i] >= 'a' && dict[i] <= 'z') {
67           if ((dict[i] ^ 32) != data[i]) return BROTLI_FALSE;
68         } else {
69           if (dict[i] != data[i]) return BROTLI_FALSE;
70         }
71       }
72       return BROTLI_TRUE;
73     }
74   }
75 }
76 
BrotliFindAllStaticDictionaryMatches(const BrotliEncoderDictionary * dictionary,const uint8_t * data,size_t min_length,size_t max_length,uint32_t * matches)77 BROTLI_BOOL BrotliFindAllStaticDictionaryMatches(
78     const BrotliEncoderDictionary* dictionary, const uint8_t* data,
79     size_t min_length, size_t max_length, uint32_t* matches) {
80   BROTLI_BOOL has_found_match = BROTLI_FALSE;
81   {
82     size_t offset = dictionary->buckets[Hash(data)];
83     BROTLI_BOOL end = !offset;
84     while (!end) {
85       DictWord w = dictionary->dict_words[offset++];
86       const size_t l = w.len & 0x1F;
87       const size_t n = (size_t)1 << dictionary->words->size_bits_by_length[l];
88       const size_t id = w.idx;
89       end = !!(w.len & 0x80);
90       w.len = (uint8_t)l;
91       if (w.transform == 0) {
92         const size_t matchlen =
93             DictMatchLength(dictionary->words, data, id, l, max_length);
94         const uint8_t* s;
95         size_t minlen;
96         size_t maxlen;
97         size_t len;
98         /* Transform "" + BROTLI_TRANSFORM_IDENTITY + "" */
99         if (matchlen == l) {
100           AddMatch(id, l, l, matches);
101           has_found_match = BROTLI_TRUE;
102         }
103         /* Transforms "" + BROTLI_TRANSFORM_OMIT_LAST_1 + "" and
104                       "" + BROTLI_TRANSFORM_OMIT_LAST_1 + "ing " */
105         if (matchlen >= l - 1) {
106           AddMatch(id + 12 * n, l - 1, l, matches);
107           if (l + 2 < max_length &&
108               data[l - 1] == 'i' && data[l] == 'n' && data[l + 1] == 'g' &&
109               data[l + 2] == ' ') {
110             AddMatch(id + 49 * n, l + 3, l, matches);
111           }
112           has_found_match = BROTLI_TRUE;
113         }
114         /* Transform "" + BROTLI_TRANSFORM_OMIT_LAST_# + "" (# = 2 .. 9) */
115         minlen = min_length;
116         if (l > 9) minlen = BROTLI_MAX(size_t, minlen, l - 9);
117         maxlen = BROTLI_MIN(size_t, matchlen, l - 2);
118         for (len = minlen; len <= maxlen; ++len) {
119           size_t cut = l - len;
120           size_t transform_id = (cut << 2) +
121               (size_t)((dictionary->cutoffTransforms >> (cut * 6)) & 0x3F);
122           AddMatch(id + transform_id * n, len, l, matches);
123           has_found_match = BROTLI_TRUE;
124         }
125         if (matchlen < l || l + 6 >= max_length) {
126           continue;
127         }
128         s = &data[l];
129         /* Transforms "" + BROTLI_TRANSFORM_IDENTITY + <suffix> */
130         if (s[0] == ' ') {
131           AddMatch(id + n, l + 1, l, matches);
132           if (s[1] == 'a') {
133             if (s[2] == ' ') {
134               AddMatch(id + 28 * n, l + 3, l, matches);
135             } else if (s[2] == 's') {
136               if (s[3] == ' ') AddMatch(id + 46 * n, l + 4, l, matches);
137             } else if (s[2] == 't') {
138               if (s[3] == ' ') AddMatch(id + 60 * n, l + 4, l, matches);
139             } else if (s[2] == 'n') {
140               if (s[3] == 'd' && s[4] == ' ') {
141                 AddMatch(id + 10 * n, l + 5, l, matches);
142               }
143             }
144           } else if (s[1] == 'b') {
145             if (s[2] == 'y' && s[3] == ' ') {
146               AddMatch(id + 38 * n, l + 4, l, matches);
147             }
148           } else if (s[1] == 'i') {
149             if (s[2] == 'n') {
150               if (s[3] == ' ') AddMatch(id + 16 * n, l + 4, l, matches);
151             } else if (s[2] == 's') {
152               if (s[3] == ' ') AddMatch(id + 47 * n, l + 4, l, matches);
153             }
154           } else if (s[1] == 'f') {
155             if (s[2] == 'o') {
156               if (s[3] == 'r' && s[4] == ' ') {
157                 AddMatch(id + 25 * n, l + 5, l, matches);
158               }
159             } else if (s[2] == 'r') {
160               if (s[3] == 'o' && s[4] == 'm' && s[5] == ' ') {
161                 AddMatch(id + 37 * n, l + 6, l, matches);
162               }
163             }
164           } else if (s[1] == 'o') {
165             if (s[2] == 'f') {
166               if (s[3] == ' ') AddMatch(id + 8 * n, l + 4, l, matches);
167             } else if (s[2] == 'n') {
168               if (s[3] == ' ') AddMatch(id + 45 * n, l + 4, l, matches);
169             }
170           } else if (s[1] == 'n') {
171             if (s[2] == 'o' && s[3] == 't' && s[4] == ' ') {
172               AddMatch(id + 80 * n, l + 5, l, matches);
173             }
174           } else if (s[1] == 't') {
175             if (s[2] == 'h') {
176               if (s[3] == 'e') {
177                 if (s[4] == ' ') AddMatch(id + 5 * n, l + 5, l, matches);
178               } else if (s[3] == 'a') {
179                 if (s[4] == 't' && s[5] == ' ') {
180                   AddMatch(id + 29 * n, l + 6, l, matches);
181                 }
182               }
183             } else if (s[2] == 'o') {
184               if (s[3] == ' ') AddMatch(id + 17 * n, l + 4, l, matches);
185             }
186           } else if (s[1] == 'w') {
187             if (s[2] == 'i' && s[3] == 't' && s[4] == 'h' && s[5] == ' ') {
188               AddMatch(id + 35 * n, l + 6, l, matches);
189             }
190           }
191         } else if (s[0] == '"') {
192           AddMatch(id + 19 * n, l + 1, l, matches);
193           if (s[1] == '>') {
194             AddMatch(id + 21 * n, l + 2, l, matches);
195           }
196         } else if (s[0] == '.') {
197           AddMatch(id + 20 * n, l + 1, l, matches);
198           if (s[1] == ' ') {
199             AddMatch(id + 31 * n, l + 2, l, matches);
200             if (s[2] == 'T' && s[3] == 'h') {
201               if (s[4] == 'e') {
202                 if (s[5] == ' ') AddMatch(id + 43 * n, l + 6, l, matches);
203               } else if (s[4] == 'i') {
204                 if (s[5] == 's' && s[6] == ' ') {
205                   AddMatch(id + 75 * n, l + 7, l, matches);
206                 }
207               }
208             }
209           }
210         } else if (s[0] == ',') {
211           AddMatch(id + 76 * n, l + 1, l, matches);
212           if (s[1] == ' ') {
213             AddMatch(id + 14 * n, l + 2, l, matches);
214           }
215         } else if (s[0] == '\n') {
216           AddMatch(id + 22 * n, l + 1, l, matches);
217           if (s[1] == '\t') {
218             AddMatch(id + 50 * n, l + 2, l, matches);
219           }
220         } else if (s[0] == ']') {
221           AddMatch(id + 24 * n, l + 1, l, matches);
222         } else if (s[0] == '\'') {
223           AddMatch(id + 36 * n, l + 1, l, matches);
224         } else if (s[0] == ':') {
225           AddMatch(id + 51 * n, l + 1, l, matches);
226         } else if (s[0] == '(') {
227           AddMatch(id + 57 * n, l + 1, l, matches);
228         } else if (s[0] == '=') {
229           if (s[1] == '"') {
230             AddMatch(id + 70 * n, l + 2, l, matches);
231           } else if (s[1] == '\'') {
232             AddMatch(id + 86 * n, l + 2, l, matches);
233           }
234         } else if (s[0] == 'a') {
235           if (s[1] == 'l' && s[2] == ' ') {
236             AddMatch(id + 84 * n, l + 3, l, matches);
237           }
238         } else if (s[0] == 'e') {
239           if (s[1] == 'd') {
240             if (s[2] == ' ') AddMatch(id + 53 * n, l + 3, l, matches);
241           } else if (s[1] == 'r') {
242             if (s[2] == ' ') AddMatch(id + 82 * n, l + 3, l, matches);
243           } else if (s[1] == 's') {
244             if (s[2] == 't' && s[3] == ' ') {
245               AddMatch(id + 95 * n, l + 4, l, matches);
246             }
247           }
248         } else if (s[0] == 'f') {
249           if (s[1] == 'u' && s[2] == 'l' && s[3] == ' ') {
250             AddMatch(id + 90 * n, l + 4, l, matches);
251           }
252         } else if (s[0] == 'i') {
253           if (s[1] == 'v') {
254             if (s[2] == 'e' && s[3] == ' ') {
255               AddMatch(id + 92 * n, l + 4, l, matches);
256             }
257           } else if (s[1] == 'z') {
258             if (s[2] == 'e' && s[3] == ' ') {
259               AddMatch(id + 100 * n, l + 4, l, matches);
260             }
261           }
262         } else if (s[0] == 'l') {
263           if (s[1] == 'e') {
264             if (s[2] == 's' && s[3] == 's' && s[4] == ' ') {
265               AddMatch(id + 93 * n, l + 5, l, matches);
266             }
267           } else if (s[1] == 'y') {
268             if (s[2] == ' ') AddMatch(id + 61 * n, l + 3, l, matches);
269           }
270         } else if (s[0] == 'o') {
271           if (s[1] == 'u' && s[2] == 's' && s[3] == ' ') {
272             AddMatch(id + 106 * n, l + 4, l, matches);
273           }
274         }
275       } else {
276         /* Set is_all_caps=0 for BROTLI_TRANSFORM_UPPERCASE_FIRST and
277                is_all_caps=1 otherwise (BROTLI_TRANSFORM_UPPERCASE_ALL)
278            transform. */
279         const BROTLI_BOOL is_all_caps =
280             TO_BROTLI_BOOL(w.transform != BROTLI_TRANSFORM_UPPERCASE_FIRST);
281         const uint8_t* s;
282         if (!IsMatch(dictionary->words, w, data, max_length)) {
283           continue;
284         }
285         /* Transform "" + kUppercase{First,All} + "" */
286         AddMatch(id + (is_all_caps ? 44 : 9) * n, l, l, matches);
287         has_found_match = BROTLI_TRUE;
288         if (l + 1 >= max_length) {
289           continue;
290         }
291         /* Transforms "" + kUppercase{First,All} + <suffix> */
292         s = &data[l];
293         if (s[0] == ' ') {
294           AddMatch(id + (is_all_caps ? 68 : 4) * n, l + 1, l, matches);
295         } else if (s[0] == '"') {
296           AddMatch(id + (is_all_caps ? 87 : 66) * n, l + 1, l, matches);
297           if (s[1] == '>') {
298             AddMatch(id + (is_all_caps ? 97 : 69) * n, l + 2, l, matches);
299           }
300         } else if (s[0] == '.') {
301           AddMatch(id + (is_all_caps ? 101 : 79) * n, l + 1, l, matches);
302           if (s[1] == ' ') {
303             AddMatch(id + (is_all_caps ? 114 : 88) * n, l + 2, l, matches);
304           }
305         } else if (s[0] == ',') {
306           AddMatch(id + (is_all_caps ? 112 : 99) * n, l + 1, l, matches);
307           if (s[1] == ' ') {
308             AddMatch(id + (is_all_caps ? 107 : 58) * n, l + 2, l, matches);
309           }
310         } else if (s[0] == '\'') {
311           AddMatch(id + (is_all_caps ? 94 : 74) * n, l + 1, l, matches);
312         } else if (s[0] == '(') {
313           AddMatch(id + (is_all_caps ? 113 : 78) * n, l + 1, l, matches);
314         } else if (s[0] == '=') {
315           if (s[1] == '"') {
316             AddMatch(id + (is_all_caps ? 105 : 104) * n, l + 2, l, matches);
317           } else if (s[1] == '\'') {
318             AddMatch(id + (is_all_caps ? 116 : 108) * n, l + 2, l, matches);
319           }
320         }
321       }
322     }
323   }
324   /* Transforms with prefixes " " and "." */
325   if (max_length >= 5 && (data[0] == ' ' || data[0] == '.')) {
326     BROTLI_BOOL is_space = TO_BROTLI_BOOL(data[0] == ' ');
327     size_t offset = dictionary->buckets[Hash(&data[1])];
328     BROTLI_BOOL end = !offset;
329     while (!end) {
330       DictWord w = dictionary->dict_words[offset++];
331       const size_t l = w.len & 0x1F;
332       const size_t n = (size_t)1 << dictionary->words->size_bits_by_length[l];
333       const size_t id = w.idx;
334       end = !!(w.len & 0x80);
335       w.len = (uint8_t)l;
336       if (w.transform == 0) {
337         const uint8_t* s;
338         if (!IsMatch(dictionary->words, w, &data[1], max_length - 1)) {
339           continue;
340         }
341         /* Transforms " " + BROTLI_TRANSFORM_IDENTITY + "" and
342                       "." + BROTLI_TRANSFORM_IDENTITY + "" */
343         AddMatch(id + (is_space ? 6 : 32) * n, l + 1, l, matches);
344         has_found_match = BROTLI_TRUE;
345         if (l + 2 >= max_length) {
346           continue;
347         }
348         /* Transforms " " + BROTLI_TRANSFORM_IDENTITY + <suffix> and
349                       "." + BROTLI_TRANSFORM_IDENTITY + <suffix>
350         */
351         s = &data[l + 1];
352         if (s[0] == ' ') {
353           AddMatch(id + (is_space ? 2 : 77) * n, l + 2, l, matches);
354         } else if (s[0] == '(') {
355           AddMatch(id + (is_space ? 89 : 67) * n, l + 2, l, matches);
356         } else if (is_space) {
357           if (s[0] == ',') {
358             AddMatch(id + 103 * n, l + 2, l, matches);
359             if (s[1] == ' ') {
360               AddMatch(id + 33 * n, l + 3, l, matches);
361             }
362           } else if (s[0] == '.') {
363             AddMatch(id + 71 * n, l + 2, l, matches);
364             if (s[1] == ' ') {
365               AddMatch(id + 52 * n, l + 3, l, matches);
366             }
367           } else if (s[0] == '=') {
368             if (s[1] == '"') {
369               AddMatch(id + 81 * n, l + 3, l, matches);
370             } else if (s[1] == '\'') {
371               AddMatch(id + 98 * n, l + 3, l, matches);
372             }
373           }
374         }
375       } else if (is_space) {
376         /* Set is_all_caps=0 for BROTLI_TRANSFORM_UPPERCASE_FIRST and
377                is_all_caps=1 otherwise (BROTLI_TRANSFORM_UPPERCASE_ALL)
378            transform. */
379         const BROTLI_BOOL is_all_caps =
380             TO_BROTLI_BOOL(w.transform != BROTLI_TRANSFORM_UPPERCASE_FIRST);
381         const uint8_t* s;
382         if (!IsMatch(dictionary->words, w, &data[1], max_length - 1)) {
383           continue;
384         }
385         /* Transforms " " + kUppercase{First,All} + "" */
386         AddMatch(id + (is_all_caps ? 85 : 30) * n, l + 1, l, matches);
387         has_found_match = BROTLI_TRUE;
388         if (l + 2 >= max_length) {
389           continue;
390         }
391         /* Transforms " " + kUppercase{First,All} + <suffix> */
392         s = &data[l + 1];
393         if (s[0] == ' ') {
394           AddMatch(id + (is_all_caps ? 83 : 15) * n, l + 2, l, matches);
395         } else if (s[0] == ',') {
396           if (!is_all_caps) {
397             AddMatch(id + 109 * n, l + 2, l, matches);
398           }
399           if (s[1] == ' ') {
400             AddMatch(id + (is_all_caps ? 111 : 65) * n, l + 3, l, matches);
401           }
402         } else if (s[0] == '.') {
403           AddMatch(id + (is_all_caps ? 115 : 96) * n, l + 2, l, matches);
404           if (s[1] == ' ') {
405             AddMatch(id + (is_all_caps ? 117 : 91) * n, l + 3, l, matches);
406           }
407         } else if (s[0] == '=') {
408           if (s[1] == '"') {
409             AddMatch(id + (is_all_caps ? 110 : 118) * n, l + 3, l, matches);
410           } else if (s[1] == '\'') {
411             AddMatch(id + (is_all_caps ? 119 : 120) * n, l + 3, l, matches);
412           }
413         }
414       }
415     }
416   }
417   if (max_length >= 6) {
418     /* Transforms with prefixes "e ", "s ", ", " and "\xC2\xA0" */
419     if ((data[1] == ' ' &&
420          (data[0] == 'e' || data[0] == 's' || data[0] == ',')) ||
421         (data[0] == 0xC2 && data[1] == 0xA0)) {
422       size_t offset = dictionary->buckets[Hash(&data[2])];
423       BROTLI_BOOL end = !offset;
424       while (!end) {
425         DictWord w = dictionary->dict_words[offset++];
426         const size_t l = w.len & 0x1F;
427         const size_t n = (size_t)1 << dictionary->words->size_bits_by_length[l];
428         const size_t id = w.idx;
429         end = !!(w.len & 0x80);
430         w.len = (uint8_t)l;
431         if (w.transform == 0 &&
432             IsMatch(dictionary->words, w, &data[2], max_length - 2)) {
433           if (data[0] == 0xC2) {
434             AddMatch(id + 102 * n, l + 2, l, matches);
435             has_found_match = BROTLI_TRUE;
436           } else if (l + 2 < max_length && data[l + 2] == ' ') {
437             size_t t = data[0] == 'e' ? 18 : (data[0] == 's' ? 7 : 13);
438             AddMatch(id + t * n, l + 3, l, matches);
439             has_found_match = BROTLI_TRUE;
440           }
441         }
442       }
443     }
444   }
445   if (max_length >= 9) {
446     /* Transforms with prefixes " the " and ".com/" */
447     if ((data[0] == ' ' && data[1] == 't' && data[2] == 'h' &&
448          data[3] == 'e' && data[4] == ' ') ||
449         (data[0] == '.' && data[1] == 'c' && data[2] == 'o' &&
450          data[3] == 'm' && data[4] == '/')) {
451       size_t offset = dictionary->buckets[Hash(&data[5])];
452       BROTLI_BOOL end = !offset;
453       while (!end) {
454         DictWord w = dictionary->dict_words[offset++];
455         const size_t l = w.len & 0x1F;
456         const size_t n = (size_t)1 << dictionary->words->size_bits_by_length[l];
457         const size_t id = w.idx;
458         end = !!(w.len & 0x80);
459         w.len = (uint8_t)l;
460         if (w.transform == 0 &&
461             IsMatch(dictionary->words, w, &data[5], max_length - 5)) {
462           AddMatch(id + (data[0] == ' ' ? 41 : 72) * n, l + 5, l, matches);
463           has_found_match = BROTLI_TRUE;
464           if (l + 5 < max_length) {
465             const uint8_t* s = &data[l + 5];
466             if (data[0] == ' ') {
467               if (l + 8 < max_length &&
468                   s[0] == ' ' && s[1] == 'o' && s[2] == 'f' && s[3] == ' ') {
469                 AddMatch(id + 62 * n, l + 9, l, matches);
470                 if (l + 12 < max_length &&
471                     s[4] == 't' && s[5] == 'h' && s[6] == 'e' && s[7] == ' ') {
472                   AddMatch(id + 73 * n, l + 13, l, matches);
473                 }
474               }
475             }
476           }
477         }
478       }
479     }
480   }
481   return has_found_match;
482 }
483 
484 #if defined(__cplusplus) || defined(c_plusplus)
485 }  /* extern "C" */
486 #endif
487