1 /*
2  * Copyright (c) Yann Collet, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under both the BSD-style license (found in the
6  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7  * in the COPYING file in the root directory of this source tree).
8  * You may select, at your option, one of the above-listed licenses.
9  */
10 
11 #include "zstd_compress_internal.h"
12 #include "zstd_lazy.h"
13 
14 
15 /*-*************************************
16 *  Binary Tree search
17 ***************************************/
18 
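/* Layout sketch (derived from the code below) :
 * the chainTable doubles as a binary tree : each position idx owns 2 adjacent entries,
 *   bt[2*(idx&btMask)]   : "smaller" child (or next unsorted candidate while unsorted)
 *   bt[2*(idx&btMask)+1] : "larger" child (or ZSTD_DUBT_UNSORTED_MARK while unsorted)
 * ZSTD_updateDUBT() only appends new positions as an unsorted chain;
 * they are sorted lazily into the tree by ZSTD_DUBT_findBestMatch(). */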
19 static void
20 ZSTD_updateDUBT(ZSTD_matchState_t* ms,
21                 const BYTE* ip, const BYTE* iend,
22                 U32 mls)
23 {
24     const ZSTD_compressionParameters* const cParams = &ms->cParams;
25     U32* const hashTable = ms->hashTable;
26     U32  const hashLog = cParams->hashLog;
27 
28     U32* const bt = ms->chainTable;
29     U32  const btLog  = cParams->chainLog - 1;
30     U32  const btMask = (1 << btLog) - 1;
31 
32     const BYTE* const base = ms->window.base;
33     U32 const target = (U32)(ip - base);
34     U32 idx = ms->nextToUpdate;
35 
36     if (idx != target)
37         DEBUGLOG(7, "ZSTD_updateDUBT, from %u to %u (dictLimit:%u)",
38                     idx, target, ms->window.dictLimit);
39     assert(ip + 8 <= iend);   /* condition for ZSTD_hashPtr */
40     (void)iend;
41 
42     assert(idx >= ms->window.dictLimit);   /* condition for valid base+idx */
43     for ( ; idx < target ; idx++) {
44         size_t const h  = ZSTD_hashPtr(base + idx, hashLog, mls);   /* assumption : ip + 8 <= iend */
45         U32    const matchIndex = hashTable[h];
46 
47         U32*   const nextCandidatePtr = bt + 2*(idx&btMask);
48         U32*   const sortMarkPtr  = nextCandidatePtr + 1;
49 
50         DEBUGLOG(8, "ZSTD_updateDUBT: insert %u", idx);
51         hashTable[h] = idx;   /* Update Hash Table */
52         *nextCandidatePtr = matchIndex;   /* update BT like a chain */
53         *sortMarkPtr = ZSTD_DUBT_UNSORTED_MARK;
54     }
55     ms->nextToUpdate = target;
56 }
57 
58 
59 /** ZSTD_insertDUBT1() :
60  *  sort one already inserted but unsorted position
61  *  assumption : curr >= btLow == (curr - btMask)
62  *  doesn't fail */
63 static void
64 ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
65                  U32 curr, const BYTE* inputEnd,
66                  U32 nbCompares, U32 btLow,
67                  const ZSTD_dictMode_e dictMode)
68 {
69     const ZSTD_compressionParameters* const cParams = &ms->cParams;
70     U32* const bt = ms->chainTable;
71     U32  const btLog  = cParams->chainLog - 1;
72     U32  const btMask = (1 << btLog) - 1;
73     size_t commonLengthSmaller=0, commonLengthLarger=0;
74     const BYTE* const base = ms->window.base;
75     const BYTE* const dictBase = ms->window.dictBase;
76     const U32 dictLimit = ms->window.dictLimit;
77     const BYTE* const ip = (curr>=dictLimit) ? base + curr : dictBase + curr;
78     const BYTE* const iend = (curr>=dictLimit) ? inputEnd : dictBase + dictLimit;
79     const BYTE* const dictEnd = dictBase + dictLimit;
80     const BYTE* const prefixStart = base + dictLimit;
81     const BYTE* match;
82     U32* smallerPtr = bt + 2*(curr&btMask);
83     U32* largerPtr  = smallerPtr + 1;
84     U32 matchIndex = *smallerPtr;   /* this candidate is unsorted : next sorted candidate is reached through *smallerPtr, while *largerPtr contains previous unsorted candidate (which is already saved and can be overwritten) */
85     U32 dummy32;   /* to be nullified at the end */
86     U32 const windowValid = ms->window.lowLimit;
87     U32 const maxDistance = 1U << cParams->windowLog;
88     U32 const windowLow = (curr - windowValid > maxDistance) ? curr - maxDistance : windowValid;
89 
90 
91     DEBUGLOG(8, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)",
92                 curr, dictLimit, windowLow);
93     assert(curr >= btLow);
94     assert(ip < iend);   /* condition for ZSTD_count */
95 
96     while (nbCompares-- && (matchIndex > windowLow)) {
97         U32* const nextPtr = bt + 2*(matchIndex & btMask);
98         size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
99         assert(matchIndex < curr);
100         /* note : all candidates are now supposed sorted,
101          * but it's still possible to have nextPtr[1] == ZSTD_DUBT_UNSORTED_MARK
102          * when a real index has the same value as ZSTD_DUBT_UNSORTED_MARK */
103 
104         if ( (dictMode != ZSTD_extDict)
105           || (matchIndex+matchLength >= dictLimit)  /* both in current segment*/
106           || (curr < dictLimit) /* both in extDict */) {
107             const BYTE* const mBase = ( (dictMode != ZSTD_extDict)
108                                      || (matchIndex+matchLength >= dictLimit)) ?
109                                         base : dictBase;
110             assert( (matchIndex+matchLength >= dictLimit)   /* might be wrong if extDict is incorrectly set to 0 */
111                  || (curr < dictLimit) );
112             match = mBase + matchIndex;
113             matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
114         } else {
115             match = dictBase + matchIndex;
116             matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
117             if (matchIndex+matchLength >= dictLimit)
118                 match = base + matchIndex;   /* preparation for next read of match[matchLength] */
119         }
120 
121         DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ",
122                     curr, matchIndex, (U32)matchLength);
123 
124         if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
125         break;   /* drop, to guarantee consistency; misses a bit of compression, but other solutions could corrupt the tree */
126         }
127 
128         if (match[matchLength] < ip[matchLength]) {  /* necessarily within buffer */
129             /* match is smaller than current */
130             *smallerPtr = matchIndex;             /* update smaller idx */
131             commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
132             if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop searching */
133             DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is smaller : next => %u",
134                         matchIndex, btLow, nextPtr[1]);
135             smallerPtr = nextPtr+1;               /* new "candidate" => larger than match, which was smaller than target */
136             matchIndex = nextPtr[1];              /* new matchIndex, larger than previous and closer to current */
137         } else {
138             /* match is larger than current */
139             *largerPtr = matchIndex;
140             commonLengthLarger = matchLength;
141             if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop searching */
142             DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is larger => %u",
143                         matchIndex, btLow, nextPtr[0]);
144             largerPtr = nextPtr;
145             matchIndex = nextPtr[0];
146     }   }
147 
148     *smallerPtr = *largerPtr = 0;
149 }
150 
151 
152 static size_t
153 ZSTD_DUBT_findBetterDictMatch (
154         ZSTD_matchState_t* ms,
155         const BYTE* const ip, const BYTE* const iend,
156         size_t* offsetPtr,
157         size_t bestLength,
158         U32 nbCompares,
159         U32 const mls,
160         const ZSTD_dictMode_e dictMode)
161 {
162     const ZSTD_matchState_t * const dms = ms->dictMatchState;
163     const ZSTD_compressionParameters* const dmsCParams = &dms->cParams;
164     const U32 * const dictHashTable = dms->hashTable;
165     U32         const hashLog = dmsCParams->hashLog;
166     size_t      const h  = ZSTD_hashPtr(ip, hashLog, mls);
167     U32               dictMatchIndex = dictHashTable[h];
168 
169     const BYTE* const base = ms->window.base;
170     const BYTE* const prefixStart = base + ms->window.dictLimit;
171     U32         const curr = (U32)(ip-base);
172     const BYTE* const dictBase = dms->window.base;
173     const BYTE* const dictEnd = dms->window.nextSrc;
174     U32         const dictHighLimit = (U32)(dms->window.nextSrc - dms->window.base);
175     U32         const dictLowLimit = dms->window.lowLimit;
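    /* note : dictIndexDelta (next line) shifts indices from the dictMatchState's index space
     * into the current window's index space, so that curr - (dictMatchIndex + dictIndexDelta)
     * is the actual match distance */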
176     U32         const dictIndexDelta = ms->window.lowLimit - dictHighLimit;
177 
178     U32*        const dictBt = dms->chainTable;
179     U32         const btLog  = dmsCParams->chainLog - 1;
180     U32         const btMask = (1 << btLog) - 1;
181     U32         const btLow = (btMask >= dictHighLimit - dictLowLimit) ? dictLowLimit : dictHighLimit - btMask;
182 
183     size_t commonLengthSmaller=0, commonLengthLarger=0;
184 
185     (void)dictMode;
186     assert(dictMode == ZSTD_dictMatchState);
187 
188     while (nbCompares-- && (dictMatchIndex > dictLowLimit)) {
189         U32* const nextPtr = dictBt + 2*(dictMatchIndex & btMask);
190         size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
191         const BYTE* match = dictBase + dictMatchIndex;
192         matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
193         if (dictMatchIndex+matchLength >= dictHighLimit)
194             match = base + dictMatchIndex + dictIndexDelta;   /* to prepare for next usage of match[matchLength] */
195 
196         if (matchLength > bestLength) {
197             U32 matchIndex = dictMatchIndex + dictIndexDelta;
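            /* heuristic : only prefer the longer match if its extra length outweighs
             * the (approximate) extra cost of encoding a larger offset,
             * estimated as the difference between the log2 of both offsets */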
198             if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
199                 DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
200                     curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + curr - matchIndex, dictMatchIndex, matchIndex);
201                 bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex;
202             }
203             if (ip+matchLength == iend) {   /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
204                 break;   /* drop, to guarantee consistency (miss a little bit of compression) */
205             }
206         }
207 
208         if (match[matchLength] < ip[matchLength]) {
209             if (dictMatchIndex <= btLow) { break; }   /* beyond tree size, stop the search */
210             commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
211             dictMatchIndex = nextPtr[1];              /* new matchIndex larger than previous (closer to current) */
212         } else {
213             /* match is larger than current */
214             if (dictMatchIndex <= btLow) { break; }   /* beyond tree size, stop the search */
215             commonLengthLarger = matchLength;
216             dictMatchIndex = nextPtr[0];
217         }
218     }
219 
220     if (bestLength >= MINMATCH) {
221         U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
222         DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
223                     curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
224     }
225     return bestLength;
226 
227 }
228 
229 
230 static size_t
231 ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
232                         const BYTE* const ip, const BYTE* const iend,
233                         size_t* offsetPtr,
234                         U32 const mls,
235                         const ZSTD_dictMode_e dictMode)
236 {
237     const ZSTD_compressionParameters* const cParams = &ms->cParams;
238     U32*   const hashTable = ms->hashTable;
239     U32    const hashLog = cParams->hashLog;
240     size_t const h  = ZSTD_hashPtr(ip, hashLog, mls);
241     U32          matchIndex  = hashTable[h];
242 
243     const BYTE* const base = ms->window.base;
244     U32    const curr = (U32)(ip-base);
245     U32    const windowLow = ZSTD_getLowestMatchIndex(ms, curr, cParams->windowLog);
246 
247     U32*   const bt = ms->chainTable;
248     U32    const btLog  = cParams->chainLog - 1;
249     U32    const btMask = (1 << btLog) - 1;
250     U32    const btLow = (btMask >= curr) ? 0 : curr - btMask;
251     U32    const unsortLimit = MAX(btLow, windowLow);
252 
253     U32*         nextCandidate = bt + 2*(matchIndex&btMask);
254     U32*         unsortedMark = bt + 2*(matchIndex&btMask) + 1;
255     U32          nbCompares = 1U << cParams->searchLog;
256     U32          nbCandidates = nbCompares;
257     U32          previousCandidate = 0;
258 
259     DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", curr);
260     assert(ip <= iend-8);   /* required for h calculation */
261     assert(dictMode != ZSTD_dedicatedDictSearch);
262 
263     /* reach end of unsorted candidates list */
264     while ( (matchIndex > unsortLimit)
265          && (*unsortedMark == ZSTD_DUBT_UNSORTED_MARK)
266          && (nbCandidates > 1) ) {
267         DEBUGLOG(8, "ZSTD_DUBT_findBestMatch: candidate %u is unsorted",
268                     matchIndex);
269         *unsortedMark = previousCandidate;  /* the unsortedMark becomes a reversed chain, to move up back to original position */
270         previousCandidate = matchIndex;
271         matchIndex = *nextCandidate;
272         nextCandidate = bt + 2*(matchIndex&btMask);
273         unsortedMark = bt + 2*(matchIndex&btMask) + 1;
274         nbCandidates --;
275     }
276 
277     /* nullify last candidate if it's still unsorted
278      * simplification, detrimental to compression ratio, beneficial for speed */
279     if ( (matchIndex > unsortLimit)
280       && (*unsortedMark==ZSTD_DUBT_UNSORTED_MARK) ) {
281         DEBUGLOG(7, "ZSTD_DUBT_findBestMatch: nullify last unsorted candidate %u",
282                     matchIndex);
283         *nextCandidate = *unsortedMark = 0;
284     }
285 
286     /* batch sort stacked candidates */
287     matchIndex = previousCandidate;
288     while (matchIndex) {  /* will end on matchIndex == 0 */
289         U32* const nextCandidateIdxPtr = bt + 2*(matchIndex&btMask) + 1;
290         U32 const nextCandidateIdx = *nextCandidateIdxPtr;
291         ZSTD_insertDUBT1(ms, matchIndex, iend,
292                          nbCandidates, unsortLimit, dictMode);
293         matchIndex = nextCandidateIdx;
294         nbCandidates++;
295     }
296 
297     /* find longest match */
298     {   size_t commonLengthSmaller = 0, commonLengthLarger = 0;
299         const BYTE* const dictBase = ms->window.dictBase;
300         const U32 dictLimit = ms->window.dictLimit;
301         const BYTE* const dictEnd = dictBase + dictLimit;
302         const BYTE* const prefixStart = base + dictLimit;
303         U32* smallerPtr = bt + 2*(curr&btMask);
304         U32* largerPtr  = bt + 2*(curr&btMask) + 1;
305         U32 matchEndIdx = curr + 8 + 1;
306         U32 dummy32;   /* to be nullified at the end */
307         size_t bestLength = 0;
308 
309         matchIndex  = hashTable[h];
310         hashTable[h] = curr;   /* Update Hash Table */
311 
312         while (nbCompares-- && (matchIndex > windowLow)) {
313             U32* const nextPtr = bt + 2*(matchIndex & btMask);
314             size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
315             const BYTE* match;
316 
317             if ((dictMode != ZSTD_extDict) || (matchIndex+matchLength >= dictLimit)) {
318                 match = base + matchIndex;
319                 matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
320             } else {
321                 match = dictBase + matchIndex;
322                 matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
323                 if (matchIndex+matchLength >= dictLimit)
324                     match = base + matchIndex;   /* to prepare for next usage of match[matchLength] */
325             }
326 
327             if (matchLength > bestLength) {
328                 if (matchLength > matchEndIdx - matchIndex)
329                     matchEndIdx = matchIndex + (U32)matchLength;
330                 if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
331                     bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex;
332                 if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
333                     if (dictMode == ZSTD_dictMatchState) {
334                         nbCompares = 0; /* in addition to avoiding checking any
335                                          * further in this loop, make sure we
336                                          * skip checking in the dictionary. */
337                     }
338                     break;   /* drop, to guarantee consistency (miss a little bit of compression) */
339                 }
340             }
341 
342             if (match[matchLength] < ip[matchLength]) {
343                 /* match is smaller than current */
344                 *smallerPtr = matchIndex;             /* update smaller idx */
345                 commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
346                 if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
347                 smallerPtr = nextPtr+1;               /* new candidate => larger than match, which was smaller than current */
348                 matchIndex = nextPtr[1];              /* new matchIndex larger than previous (closer to current) */
349             } else {
350                 /* match is larger than current */
351                 *largerPtr = matchIndex;
352                 commonLengthLarger = matchLength;
353                 if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
354                 largerPtr = nextPtr;
355                 matchIndex = nextPtr[0];
356         }   }
357 
358         *smallerPtr = *largerPtr = 0;
359 
360         if (dictMode == ZSTD_dictMatchState && nbCompares) {
361             bestLength = ZSTD_DUBT_findBetterDictMatch(
362                     ms, ip, iend,
363                     offsetPtr, bestLength, nbCompares,
364                     mls, dictMode);
365         }
366 
367         assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */
368         ms->nextToUpdate = matchEndIdx - 8;   /* skip repetitive patterns */
369         if (bestLength >= MINMATCH) {
370             U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
371             DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
372                         curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
373         }
374         return bestLength;
375     }
376 }
377 
378 
379 /** ZSTD_BtFindBestMatch() : Tree updater, providing best match */
380 FORCE_INLINE_TEMPLATE size_t
381 ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
382                 const BYTE* const ip, const BYTE* const iLimit,
383                       size_t* offsetPtr,
384                 const U32 mls /* template */,
385                 const ZSTD_dictMode_e dictMode)
386 {
387     DEBUGLOG(7, "ZSTD_BtFindBestMatch");
388     if (ip < ms->window.base + ms->nextToUpdate) return 0;   /* skipped area */
389     ZSTD_updateDUBT(ms, ip, iLimit, mls);
390     return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode);
391 }
392 
393 
394 static size_t
395 ZSTD_BtFindBestMatch_selectMLS (  ZSTD_matchState_t* ms,
396                             const BYTE* ip, const BYTE* const iLimit,
397                                   size_t* offsetPtr)
398 {
399     switch(ms->cParams.minMatch)
400     {
401     default : /* includes case 3 */
402     case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
403     case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
404     case 7 :
405     case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
406     }
407 }
408 
409 
410 static size_t ZSTD_BtFindBestMatch_dictMatchState_selectMLS (
411                         ZSTD_matchState_t* ms,
412                         const BYTE* ip, const BYTE* const iLimit,
413                         size_t* offsetPtr)
414 {
415     switch(ms->cParams.minMatch)
416     {
417     default : /* includes case 3 */
418     case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
419     case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
420     case 7 :
421     case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
422     }
423 }
424 
425 
426 static size_t ZSTD_BtFindBestMatch_extDict_selectMLS (
427                         ZSTD_matchState_t* ms,
428                         const BYTE* ip, const BYTE* const iLimit,
429                         size_t* offsetPtr)
430 {
431     switch(ms->cParams.minMatch)
432     {
433     default : /* includes case 3 */
434     case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
435     case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
436     case 7 :
437     case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
438     }
439 }
440 
441 /***********************************
442 * Dedicated dict search
443 ***********************************/
444 
445 void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip)
446 {
447     const BYTE* const base = ms->window.base;
448     U32 const target = (U32)(ip - base);
449     U32* const hashTable = ms->hashTable;
450     U32* const chainTable = ms->chainTable;
451     U32 const chainSize = 1 << ms->cParams.chainLog;
452     U32 idx = ms->nextToUpdate;
453     U32 const minChain = chainSize < target ? target - chainSize : idx;
454     U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG;
455     U32 const cacheSize = bucketSize - 1;
456     U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize;
457     U32 const chainLimit = chainAttempts > 255 ? 255 : chainAttempts;
458 
459     /* We know the hashtable is oversized by a factor of `bucketSize`.
460      * We are going to temporarily pretend `bucketSize == 1`, keeping only a
461      * single entry. We will use the rest of the space to construct a temporary
462      * chaintable.
463      */
464     U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
465     U32* const tmpHashTable = hashTable;
466     U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog);
467     U32 const tmpChainSize = ((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
468     U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx;
469     U32 hashIdx;
470 
471     assert(ms->cParams.chainLog <= 24);
472     assert(ms->cParams.hashLog > ms->cParams.chainLog);
473     assert(idx != 0);
474     assert(tmpMinChain <= minChain);
475 
476     /* fill conventional hash table and conventional chain table */
477     for ( ; idx < target; idx++) {
478         U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch);
479         if (idx >= tmpMinChain) {
480             tmpChainTable[idx - tmpMinChain] = hashTable[h];
481         }
482         tmpHashTable[h] = idx;
483     }
484 
485     /* sort chains into ddss chain table */
486     {
487         U32 chainPos = 0;
488         for (hashIdx = 0; hashIdx < (1U << hashLog); hashIdx++) {
489             U32 count;
490             U32 countBeyondMinChain = 0;
491             U32 i = tmpHashTable[hashIdx];
492             for (count = 0; i >= tmpMinChain && count < cacheSize; count++) {
493                 /* skip through the chain to the first position that won't be
494                  * in the hash cache bucket */
495                 if (i < minChain) {
496                     countBeyondMinChain++;
497                 }
498                 i = tmpChainTable[i - tmpMinChain];
499             }
500             if (count == cacheSize) {
501                 for (count = 0; count < chainLimit;) {
502                     if (i < minChain) {
503                         if (!i || ++countBeyondMinChain > cacheSize) {
504                             /* only allow pulling `cacheSize` number of entries
505                              * into the cache or chainTable beyond `minChain`,
506                              * to replace the entries pulled out of the
507                              * chainTable into the cache. This lets us reach
508                              * back further without increasing the total number
509                              * of entries in the chainTable, guaranteeing the
510                              * DDSS chain table will fit into the space
511                              * allocated for the regular one. */
512                             break;
513                         }
514                     }
515                     chainTable[chainPos++] = i;
516                     count++;
517                     if (i < tmpMinChain) {
518                         break;
519                     }
520                     i = tmpChainTable[i - tmpMinChain];
521                 }
522             } else {
523                 count = 0;
524             }
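            /* pack this bucket's chain descriptor : upper bits = start offset of the chain
             * within chainTable, low 8 bits = its length (count, capped at 255 by chainLimit) */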
525             if (count) {
526                 tmpHashTable[hashIdx] = ((chainPos - count) << 8) + count;
527             } else {
528                 tmpHashTable[hashIdx] = 0;
529             }
530         }
531         assert(chainPos <= chainSize); /* I believe this is guaranteed... */
532     }
533 
534     /* move chain pointers into the last entry of each hash bucket */
535     for (hashIdx = (1 << hashLog); hashIdx; ) {
536         U32 const bucketIdx = --hashIdx << ZSTD_LAZY_DDSS_BUCKET_LOG;
537         U32 const chainPackedPointer = tmpHashTable[hashIdx];
538         U32 i;
539         for (i = 0; i < cacheSize; i++) {
540             hashTable[bucketIdx + i] = 0;
541         }
542         hashTable[bucketIdx + bucketSize - 1] = chainPackedPointer;
543     }
544 
545     /* fill the buckets of the hash table */
546     for (idx = ms->nextToUpdate; idx < target; idx++) {
547         U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch)
548                    << ZSTD_LAZY_DDSS_BUCKET_LOG;
549         U32 i;
550         /* Shift hash cache down 1. */
551         for (i = cacheSize - 1; i; i--)
552             hashTable[h + i] = hashTable[h + i - 1];
553         hashTable[h] = idx;
554     }
555 
556     ms->nextToUpdate = target;
557 }
558 
559 /* Returns the longest match length found in the dedicated dict search structure.
560  * If none are longer than the argument ml, then ml will be returned.
561  */
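/* Bucket layout sketch (as built by ZSTD_dedicatedDictSearch_lazy_loadDictionary()) :
 * hashTable[ddsIdx .. ddsIdx+bucketSize-2] : cache of the most recent positions for this hash
 * hashTable[ddsIdx + bucketSize-1]         : packed pointer (chainIndex << 8 | chainLength)
 *                                            into the overflow chain stored in dms->chainTable */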
562 FORCE_INLINE_TEMPLATE
563 size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts,
564                                             const ZSTD_matchState_t* const dms,
565                                             const BYTE* const ip, const BYTE* const iLimit,
566                                             const BYTE* const prefixStart, const U32 curr,
567                                             const U32 dictLimit, const size_t ddsIdx) {
568     const U32 ddsLowestIndex  = dms->window.dictLimit;
569     const BYTE* const ddsBase = dms->window.base;
570     const BYTE* const ddsEnd  = dms->window.nextSrc;
571     const U32 ddsSize         = (U32)(ddsEnd - ddsBase);
572     const U32 ddsIndexDelta   = dictLimit - ddsSize;
573     const U32 bucketSize      = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG);
574     const U32 bucketLimit     = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1;
575     U32 ddsAttempt;
576     U32 matchIndex;
577 
578     for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) {
579         PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]);
580     }
581 
582     {
583         U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
584         U32 const chainIndex = chainPackedPointer >> 8;
585 
586         PREFETCH_L1(&dms->chainTable[chainIndex]);
587     }
588 
589     for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) {
590         size_t currentMl=0;
591         const BYTE* match;
592         matchIndex = dms->hashTable[ddsIdx + ddsAttempt];
593         match = ddsBase + matchIndex;
594 
595         if (!matchIndex) {
596             return ml;
597         }
598 
599         /* guaranteed by table construction */
600         (void)ddsLowestIndex;
601         assert(matchIndex >= ddsLowestIndex);
602         assert(match+4 <= ddsEnd);
603         if (MEM_read32(match) == MEM_read32(ip)) {
604             /* assumption : matchIndex <= dictLimit-4 (by table construction) */
605             currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
606         }
607 
608         /* save best solution */
609         if (currentMl > ml) {
610             ml = currentMl;
611             *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
612             if (ip+currentMl == iLimit) {
613                 /* best possible, avoids read overflow on next attempt */
614                 return ml;
615             }
616         }
617     }
618 
619     {
620         U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
621         U32 chainIndex = chainPackedPointer >> 8;
622         U32 const chainLength = chainPackedPointer & 0xFF;
623         U32 const chainAttempts = nbAttempts - ddsAttempt;
624         U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts;
625         U32 chainAttempt;
626 
627         for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) {
628             PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]);
629         }
630 
631         for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) {
632             size_t currentMl=0;
633             const BYTE* match;
634             matchIndex = dms->chainTable[chainIndex];
635             match = ddsBase + matchIndex;
636 
637             /* guaranteed by table construction */
638             assert(matchIndex >= ddsLowestIndex);
639             assert(match+4 <= ddsEnd);
640             if (MEM_read32(match) == MEM_read32(ip)) {
641                 /* assumption : matchIndex <= dictLimit-4 (by table construction) */
642                 currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
643             }
644 
645             /* save best solution */
646             if (currentMl > ml) {
647                 ml = currentMl;
648                 *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
649                 if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
650             }
651         }
652     }
653     return ml;
654 }
655 
656 
657 /* *********************************
658 *  Hash Chain
659 ***********************************/
660 #define NEXT_IN_CHAIN(d, mask)   chainTable[(d) & (mask)]
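/* The chain table stores, for each position (modulo chain size), the previous position
 * that hashed to the same bucket; hashTable[h] holds the most recent position for hash h,
 * and NEXT_IN_CHAIN() walks back through older occurrences. */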
661 
662 /* Update chains up to ip (excluded)
663    Assumption : always within prefix (i.e. not within extDict) */
664 FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
665                         ZSTD_matchState_t* ms,
666                         const ZSTD_compressionParameters* const cParams,
667                         const BYTE* ip, U32 const mls)
668 {
669     U32* const hashTable  = ms->hashTable;
670     const U32 hashLog = cParams->hashLog;
671     U32* const chainTable = ms->chainTable;
672     const U32 chainMask = (1 << cParams->chainLog) - 1;
673     const BYTE* const base = ms->window.base;
674     const U32 target = (U32)(ip - base);
675     U32 idx = ms->nextToUpdate;
676 
677     while(idx < target) { /* catch up */
678         size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls);
679         NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
680         hashTable[h] = idx;
681         idx++;
682     }
683 
684     ms->nextToUpdate = target;
685     return hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
686 }
687 
688 U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
689     const ZSTD_compressionParameters* const cParams = &ms->cParams;
690     return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
691 }
692 
693 /* inlining is important to hardwire a hot branch (template emulation) */
694 FORCE_INLINE_TEMPLATE
695 size_t ZSTD_HcFindBestMatch_generic (
696                         ZSTD_matchState_t* ms,
697                         const BYTE* const ip, const BYTE* const iLimit,
698                         size_t* offsetPtr,
699                         const U32 mls, const ZSTD_dictMode_e dictMode)
700 {
701     const ZSTD_compressionParameters* const cParams = &ms->cParams;
702     U32* const chainTable = ms->chainTable;
703     const U32 chainSize = (1 << cParams->chainLog);
704     const U32 chainMask = chainSize-1;
705     const BYTE* const base = ms->window.base;
706     const BYTE* const dictBase = ms->window.dictBase;
707     const U32 dictLimit = ms->window.dictLimit;
708     const BYTE* const prefixStart = base + dictLimit;
709     const BYTE* const dictEnd = dictBase + dictLimit;
710     const U32 curr = (U32)(ip-base);
711     const U32 maxDistance = 1U << cParams->windowLog;
712     const U32 lowestValid = ms->window.lowLimit;
713     const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
714     const U32 isDictionary = (ms->loadedDictEnd != 0);
715     const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
716     const U32 minChain = curr > chainSize ? curr - chainSize : 0;
717     U32 nbAttempts = 1U << cParams->searchLog;
718     size_t ml=4-1;
719 
720     const ZSTD_matchState_t* const dms = ms->dictMatchState;
721     const U32 ddsHashLog = dictMode == ZSTD_dedicatedDictSearch
722                          ? dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG : 0;
723     const size_t ddsIdx = dictMode == ZSTD_dedicatedDictSearch
724                         ? ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG : 0;
725 
726     U32 matchIndex;
727 
728     if (dictMode == ZSTD_dedicatedDictSearch) {
729         const U32* entry = &dms->hashTable[ddsIdx];
730         PREFETCH_L1(entry);
731     }
732 
733     /* HC4 match finder */
734     matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
735 
736     for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) {
737         size_t currentMl=0;
738         if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
739             const BYTE* const match = base + matchIndex;
740             assert(matchIndex >= dictLimit);   /* ensures this is true if dictMode != ZSTD_extDict */
741             if (match[ml] == ip[ml])   /* potentially better */
742                 currentMl = ZSTD_count(ip, match, iLimit);
743         } else {
744             const BYTE* const match = dictBase + matchIndex;
745             assert(match+4 <= dictEnd);
746             if (MEM_read32(match) == MEM_read32(ip))   /* assumption : matchIndex <= dictLimit-4 (by table construction) */
747                 currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
748         }
749 
750         /* save best solution */
751         if (currentMl > ml) {
752             ml = currentMl;
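            /* offsets are returned as ZSTD_REP_MOVE + distance, leaving values
             * below ZSTD_REP_MOVE available to denote repeat-offset codes */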
753             *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE;
754             if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
755         }
756 
757         if (matchIndex <= minChain) break;
758         matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
759     }
760 
761     if (dictMode == ZSTD_dedicatedDictSearch) {
762         ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms,
763                                                   ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
764     } else if (dictMode == ZSTD_dictMatchState) {
765         const U32* const dmsChainTable = dms->chainTable;
766         const U32 dmsChainSize         = (1 << dms->cParams.chainLog);
767         const U32 dmsChainMask         = dmsChainSize - 1;
768         const U32 dmsLowestIndex       = dms->window.dictLimit;
769         const BYTE* const dmsBase      = dms->window.base;
770         const BYTE* const dmsEnd       = dms->window.nextSrc;
771         const U32 dmsSize              = (U32)(dmsEnd - dmsBase);
772         const U32 dmsIndexDelta        = dictLimit - dmsSize;
773         const U32 dmsMinChain = dmsSize > dmsChainSize ? dmsSize - dmsChainSize : 0;
774 
775         matchIndex = dms->hashTable[ZSTD_hashPtr(ip, dms->cParams.hashLog, mls)];
776 
777         for ( ; (matchIndex>=dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) {
778             size_t currentMl=0;
779             const BYTE* const match = dmsBase + matchIndex;
780             assert(match+4 <= dmsEnd);
781             if (MEM_read32(match) == MEM_read32(ip))   /* assumption : matchIndex <= dictLimit-4 (by table construction) */
782                 currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;
783 
784             /* save best solution */
785             if (currentMl > ml) {
786                 ml = currentMl;
787                 *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
788                 if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
789             }
790 
791             if (matchIndex <= dmsMinChain) break;
792 
793             matchIndex = dmsChainTable[matchIndex & dmsChainMask];
794         }
795     }
796 
797     return ml;
798 }
799 
800 
801 FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_selectMLS (
802                         ZSTD_matchState_t* ms,
803                         const BYTE* ip, const BYTE* const iLimit,
804                         size_t* offsetPtr)
805 {
806     switch(ms->cParams.minMatch)
807     {
808     default : /* includes case 3 */
809     case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
810     case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
811     case 7 :
812     case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
813     }
814 }
815 
816 
817 static size_t ZSTD_HcFindBestMatch_dictMatchState_selectMLS (
818                         ZSTD_matchState_t* ms,
819                         const BYTE* ip, const BYTE* const iLimit,
820                         size_t* offsetPtr)
821 {
822     switch(ms->cParams.minMatch)
823     {
824     default : /* includes case 3 */
825     case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
826     case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
827     case 7 :
828     case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
829     }
830 }
831 
832 
833 static size_t ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS (
834                         ZSTD_matchState_t* ms,
835                         const BYTE* ip, const BYTE* const iLimit,
836                         size_t* offsetPtr)
837 {
838     switch(ms->cParams.minMatch)
839     {
840     default : /* includes case 3 */
841     case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dedicatedDictSearch);
842     case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dedicatedDictSearch);
843     case 7 :
844     case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dedicatedDictSearch);
845     }
846 }
847 
848 
849 FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
850                         ZSTD_matchState_t* ms,
851                         const BYTE* ip, const BYTE* const iLimit,
852                         size_t* offsetPtr)
853 {
854     switch(ms->cParams.minMatch)
855     {
856     default : /* includes case 3 */
857     case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
858     case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
859     case 7 :
860     case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
861     }
862 }
863 
864 /* *********************************
865 * (SIMD) Row-based matchfinder
866 ***********************************/
867 /* Constants for row-based hash */
868 #define ZSTD_ROW_HASH_TAG_OFFSET 1                               /* byte offset of hashes in the match state's tagTable from the beginning of a row */
869 #define ZSTD_ROW_HASH_TAG_BITS 8                                 /* nb bits to use for the tag */
870 #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
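/* Tag row layout (as manipulated through a BYTE* below) :
 *   byte 0              : circular "head" counter, updated by ZSTD_row_nextIndex()
 *   bytes 1..rowEntries : one 8-bit tag per entry of the matching hashTable row */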
871 
872 #define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1)
873 
874 typedef U32 ZSTD_VecMask;   /* Clarifies when we are interacting with a U32 representing a mask of matches */
875 
876 #if !defined(ZSTD_NO_INTRINSICS) && defined(__SSE2__) /* SIMD SSE version */
877 
878 #include <emmintrin.h>
879 typedef __m128i ZSTD_Vec128;
880 
881 /* Returns a 128-bit container with 128-bits from src */
882 static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
883   return _mm_loadu_si128((ZSTD_Vec128 const*)src);
884 }
885 
886 /* Returns a ZSTD_Vec128 with the byte "val" packed 16 times */
887 static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
888   return _mm_set1_epi8((char)val);
889 }
890 
891 /* Compares x and y byte by byte, then collapses the 128-bit comparison result
892  * into a 32-bit mask, with one bit per byte taken from each byte's MSB.
893  */
894 static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
895   return (ZSTD_VecMask)_mm_movemask_epi8(_mm_cmpeq_epi8(x, y));
896 }
897 
898 typedef struct {
899   __m128i fst;
900   __m128i snd;
901 } ZSTD_Vec256;
902 
903 static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) {
904   ZSTD_Vec256 v;
905   v.fst = ZSTD_Vec128_read(ptr);
906   v.snd = ZSTD_Vec128_read((ZSTD_Vec128 const*)ptr + 1);
907   return v;
908 }
909 
910 static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
911   ZSTD_Vec256 v;
912   v.fst = ZSTD_Vec128_set8(val);
913   v.snd = ZSTD_Vec128_set8(val);
914   return v;
915 }
916 
917 static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
918   ZSTD_VecMask fstMask;
919   ZSTD_VecMask sndMask;
920   fstMask = ZSTD_Vec128_cmpMask8(x.fst, y.fst);
921   sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd);
922   return fstMask | (sndMask << 16);
923 }
924 
925 #elif !defined(ZSTD_NO_INTRINSICS) && defined(__ARM_NEON) /* SIMD ARM NEON Version */
926 
927 #include <arm_neon.h>
928 typedef uint8x16_t ZSTD_Vec128;
929 
930 static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
931   return vld1q_u8((const BYTE* const)src);
932 }
933 
934 static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
935   return vdupq_n_u8(val);
936 }
937 
938 /* Mimics '_mm_movemask_epi8()' from SSE */
939 static U32 ZSTD_vmovmaskq_u8(ZSTD_Vec128 val) {
940     /* Shift out everything but the MSB bits in each byte */
941     uint16x8_t highBits = vreinterpretq_u16_u8(vshrq_n_u8(val, 7));
942     /* Merge the even lanes together with vsra (right shift and add) */
943     uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(highBits, highBits, 7));
944     uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
945     uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
946     /* Extract the low 8 bits from each lane, merge */
947     return vgetq_lane_u8(paired64, 0) | ((U32)vgetq_lane_u8(paired64, 8) << 8);
948 }
949 
950 static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
951   return (ZSTD_VecMask)ZSTD_vmovmaskq_u8(vceqq_u8(x, y));
952 }
953 
954 typedef struct {
955     uint8x16_t fst;
956     uint8x16_t snd;
957 } ZSTD_Vec256;
958 
959 static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) {
960   ZSTD_Vec256 v;
961   v.fst = ZSTD_Vec128_read(ptr);
962   v.snd = ZSTD_Vec128_read((ZSTD_Vec128 const*)ptr + 1);
963   return v;
964 }
965 
966 static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
967   ZSTD_Vec256 v;
968   v.fst = ZSTD_Vec128_set8(val);
969   v.snd = ZSTD_Vec128_set8(val);
970   return v;
971 }
972 
973 static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
974   ZSTD_VecMask fstMask;
975   ZSTD_VecMask sndMask;
976   fstMask = ZSTD_Vec128_cmpMask8(x.fst, y.fst);
977   sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd);
978   return fstMask | (sndMask << 16);
979 }
980 
981 #else /* Scalar fallback version */
982 
983 #define VEC128_NB_SIZE_T (16 / sizeof(size_t))
984 typedef struct {
985     size_t vec[VEC128_NB_SIZE_T];
986 } ZSTD_Vec128;
987 
988 static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
989     ZSTD_Vec128 ret;
990     ZSTD_memcpy(ret.vec, src, VEC128_NB_SIZE_T*sizeof(size_t));
991     return ret;
992 }
993 
994 static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
995     ZSTD_Vec128 ret = { {0} };
996     int startBit = sizeof(size_t) * 8 - 8;
997     for (;startBit >= 0; startBit -= 8) {
998         unsigned j = 0;
999         for (;j < VEC128_NB_SIZE_T; ++j) {
1000             ret.vec[j] |= ((size_t)val << startBit);
1001         }
1002     }
1003     return ret;
1004 }
1005 
1006 /* Compare x to y, byte by byte, generating a "matches" bitfield */
1007 static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
1008     ZSTD_VecMask res = 0;
1009     unsigned i = 0;
1010     unsigned l = 0;
1011     for (; i < VEC128_NB_SIZE_T; ++i) {
1012         const size_t cmp1 = x.vec[i];
1013         const size_t cmp2 = y.vec[i];
1014         unsigned j = 0;
1015         for (; j < sizeof(size_t); ++j, ++l) {
1016             if (((cmp1 >> j*8) & 0xFF) == ((cmp2 >> j*8) & 0xFF)) {
1017                 res |= ((U32)1 << (j+i*sizeof(size_t)));
1018             }
1019         }
1020     }
1021     return res;
1022 }
1023 
1024 #define VEC256_NB_SIZE_T 2*VEC128_NB_SIZE_T
1025 typedef struct {
1026     size_t vec[VEC256_NB_SIZE_T];
1027 } ZSTD_Vec256;
1028 
1029 static ZSTD_Vec256 ZSTD_Vec256_read(const void* const src) {
1030     ZSTD_Vec256 ret;
1031     ZSTD_memcpy(ret.vec, src, VEC256_NB_SIZE_T*sizeof(size_t));
1032     return ret;
1033 }
1034 
1035 static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
1036     ZSTD_Vec256 ret = { {0} };
1037     int startBit = sizeof(size_t) * 8 - 8;
1038     for (;startBit >= 0; startBit -= 8) {
1039         unsigned j = 0;
1040         for (;j < VEC256_NB_SIZE_T; ++j) {
1041             ret.vec[j] |= ((size_t)val << startBit);
1042         }
1043     }
1044     return ret;
1045 }
1046 
1047 /* Compare x to y, byte by byte, generating a "matches" bitfield */
1048 static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
1049     ZSTD_VecMask res = 0;
1050     unsigned i = 0;
1051     unsigned l = 0;
1052     for (; i < VEC256_NB_SIZE_T; ++i) {
1053         const size_t cmp1 = x.vec[i];
1054         const size_t cmp2 = y.vec[i];
1055         unsigned j = 0;
1056         for (; j < sizeof(size_t); ++j, ++l) {
1057             if (((cmp1 >> j*8) & 0xFF) == ((cmp2 >> j*8) & 0xFF)) {
1058                 res |= ((U32)1 << (j+i*sizeof(size_t)));
1059             }
1060         }
1061     }
1062     return res;
1063 }
1064 
1065 #endif /* !defined(ZSTD_NO_INTRINSICS) && defined(__SSE2__) */
1066 
1067 /* ZSTD_VecMask_next():
1068  * Starting from the LSB, returns the idx of the next non-zero bit.
1069  * Basically counting the nb of trailing zeroes.
1070  */
1071 static U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
1072 #   if defined(_MSC_VER)   /* Visual */
1073     unsigned long r=0;
1074     return _BitScanForward(&r, val) ? (U32)r : 0;
1075 #   elif defined(__GNUC__) && (__GNUC__ >= 3)
1076     return (U32)__builtin_ctz(val);
1077 #   else
1078     /* Software ctz version: http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightMultLookup */
1079     static const U32 multiplyDeBruijnBitPosition[32] =
1080     {
1081         0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
1082         31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
1083     };
1084     return multiplyDeBruijnBitPosition[((U32)((val & -(int)val) * 0x077CB531U)) >> 27];
1085 
1086 #   endif
1087 }
1088 
1089 /* ZSTD_VecMask_rotateRight():
1090  * Rotates a bitfield to the right by "rotation" bits.
1091  * If the rotation is greater than totalBits, the returned mask is 0.
1092  */
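/* e.g. ZSTD_VecMask_rotateRight(0x21, 1, 16) == 0x8010 : bit 0 wraps around to bit 15 */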
1093 FORCE_INLINE_TEMPLATE ZSTD_VecMask
1094 ZSTD_VecMask_rotateRight(ZSTD_VecMask mask, U32 const rotation, U32 const totalBits) {
1095   if (rotation == 0)
1096     return mask;
1097   switch (totalBits) {
1098     default:
1099       assert(0);
1100     case 16:
1101       return (mask >> rotation) | (U16)(mask << (16 - rotation));
1102     case 32:
1103       return (mask >> rotation) | (U32)(mask << (32 - rotation));
1104   }
1105 }
1106 
1107 /* ZSTD_row_nextIndex():
1108  * Returns the next index to insert at within a tagTable row, and updates the "head"
1109  * value to reflect the update. Essentially cycles backwards from [0, {entries per row})
1110  */
1111 FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) {
1112   U32 const next = (*tagRow - 1) & rowMask;
1113   *tagRow = (BYTE)next;
1114   return next;
1115 }
1116 
1117 /* ZSTD_isAligned():
1118  * Checks that a pointer is aligned to "align" bytes which must be a power of 2.
1119  */
1120 MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
1121     assert((align & (align - 1)) == 0);
1122     return (((size_t)ptr) & (align - 1)) == 0;
1123 }
1124 
1125 /* ZSTD_row_prefetch():
1126  * Performs prefetching for the hashTable and tagTable at a given row.
1127  */
1128 FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) {
1129     PREFETCH_L1(hashTable + relRow);
1130     if (rowLog == 5) {
1131         PREFETCH_L1(hashTable + relRow + 16);
1132     }
1133     PREFETCH_L1(tagTable + relRow);
1134     assert(rowLog == 4 || rowLog == 5);
1135     assert(ZSTD_isAligned(hashTable + relRow, 64));                 /* prefetched hash row always 64-byte aligned */
1136     assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on a multiple of 32 or 64 bytes */
1137 }
1138 
1139 /* ZSTD_row_fillHashCache():
1140  * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries,
1141  * but not beyond iLimit.
1142  */
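/* Rationale : hashes are computed ZSTD_ROW_HASH_CACHE_SIZE positions ahead of their use,
 * so the hashTable/tagTable rows they select can be prefetched well before they are searched. */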
1143 static void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,
1144                                    U32 const rowLog, U32 const mls,
1145                                    U32 idx, const BYTE* const iLimit)
1146 {
1147     U32 const* const hashTable = ms->hashTable;
1148     U16 const* const tagTable = ms->tagTable;
1149     U32 const hashLog = ms->rowHashLog;
1150     U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1);
1151     U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch);
1152 
1153     for (; idx < lim; ++idx) {
1154         U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
1155         U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1156         ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
1157         ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;
1158     }
1159 
1160     DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms->hashCache[0], ms->hashCache[1],
1161                                                      ms->hashCache[2], ms->hashCache[3], ms->hashCache[4],
1162                                                      ms->hashCache[5], ms->hashCache[6], ms->hashCache[7]);
1163 }
1164 
1165 /* ZSTD_row_nextCachedHash():
1166  * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at
1167  * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
1168  */
1169 FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
1170                                                   U16 const* tagTable, BYTE const* base,
1171                                                   U32 idx, U32 const hashLog,
1172                                                   U32 const rowLog, U32 const mls)
1173 {
1174     U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
1175     U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1176     ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
1177     {   U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK];
1178         cache[idx & ZSTD_ROW_HASH_CACHE_MASK] = newHash;
1179         return hash;
1180     }
1181 }
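/* Invariant maintained by ZSTD_row_fillHashCache() and ZSTD_row_nextCachedHash() :
 * hashCache[i & ZSTD_ROW_HASH_CACHE_MASK] holds the hash of base + i for the next
 * ZSTD_ROW_HASH_CACHE_SIZE positions still to be processed, so each consumed entry is refilled
 * with the hash located ZSTD_ROW_HASH_CACHE_SIZE positions ahead, keeping the prefetch distance constant. */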
1182 
1183 /* ZSTD_row_update_internal():
1184  * Inserts every position from ms->nextToUpdate up to (but not including) the position of ip into the hash table.
1185  * For each, determines the relative row, and the slot within the {16, 32}-entry row to insert at.
1186  */
1187 FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
1188                                                     U32 const mls, U32 const rowLog,
1189                                                     U32 const rowMask, U32 const useCache)
1190 {
1191     U32* const hashTable = ms->hashTable;
1192     U16* const tagTable = ms->tagTable;
1193     U32 const hashLog = ms->rowHashLog;
1194     const BYTE* const base = ms->window.base;
1195     const U32 target = (U32)(ip - base);
1196     U32 idx = ms->nextToUpdate;
1197 
1198     DEBUGLOG(6, "ZSTD_row_update_internal(): nextToUpdate=%u, current=%u", idx, target);
1199     for (; idx < target; ++idx) {
1200         U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, idx, hashLog, rowLog, mls)
1201                                   : (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
1202         U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1203         U32* const row = hashTable + relRow;
1204         BYTE* tagRow = (BYTE*)(tagTable + relRow);  /* Though tagTable is laid out as a table of U16, each tag is only 1 byte.
1205                                                        Explicit cast allows us to get exact desired position within each row */
1206         U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
1207 
1208         assert(hash == ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls));
1209         ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK;
1210         row[pos] = idx;
1211     }
1212     ms->nextToUpdate = target;
1213 }
1214 
1215 /* ZSTD_row_update():
1216  * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary
1217  * processing.
1218  */
1219 void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
1220     const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
1221     const U32 rowMask = (1u << rowLog) - 1;
1222     const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);
1223 
1224     DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
1225     ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */);
1226 }
1227 
1228 /* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches
1229  * the hash at the nth position in a row of the tagTable.
1230  */
1231 FORCE_INLINE_TEMPLATE
1232 ZSTD_VecMask ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) {
1233     ZSTD_VecMask matches = 0;
1234     if (rowEntries == 16) {
1235         ZSTD_Vec128 hashes        = ZSTD_Vec128_read(tagRow + ZSTD_ROW_HASH_TAG_OFFSET);
1236         ZSTD_Vec128 expandedTags  = ZSTD_Vec128_set8(tag);
1237         matches                   = ZSTD_Vec128_cmpMask8(hashes, expandedTags);
1238     } else if (rowEntries == 32) {
1239         ZSTD_Vec256 hashes        = ZSTD_Vec256_read(tagRow + ZSTD_ROW_HASH_TAG_OFFSET);
1240         ZSTD_Vec256 expandedTags  = ZSTD_Vec256_set8(tag);
1241         matches                   = ZSTD_Vec256_cmpMask8(hashes, expandedTags);
1242     } else {
1243         assert(0);
1244     }
1245     /* Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
1246         to match up with the actual layout of the entries within the hashTable */
1247     return ZSTD_VecMask_rotateRight(matches, head, rowEntries);
1248 }
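/* Note on the returned mask : after the rotation, bit j refers to row position (head + j) & rowMask,
 * so walking the set bits from least to most significant visits matching candidates from the most
 * recently inserted entry to the oldest. */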
1249 
1250 /* The high-level approach of the SIMD row based match finder is as follows:
1251  * - Figure out where to insert the new entry:
1252  *      - Generate a hash for the current position; the low ZSTD_ROW_HASH_TAG_BITS bits of that hash form a 1-byte "short hash", which is our "tag"
1253  *      - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines
1254  *        which row to insert into.
1255  *      - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can
1256  *        be considered as a circular buffer with a "head" index that resides in the tagTable.
1257  *      - Also insert the "tag" into the equivalent row and position in the tagTable.
1258  *          - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry.
1259  *                  The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively,
1260  *                  for alignment/performance reasons, leaving some bytes unused.
1261  * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and
1262  *   generate a bitfield that we can cycle through to check the collisions in the hash table.
1263  * - Pick the longest match.
1264  */
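/* Illustrative sketch of a single insertion, mirroring ZSTD_row_update_internal() above.
 * Comment-only example (not compiled); assumes rowLog == 5 (32-entry rows) and
 * ZSTD_ROW_HASH_TAG_BITS == 8 (1-byte tags) :
 *
 *     U32 const hash     = (U32)ZSTD_hashPtr(base + idx, rowHashLog + 8, mls);
 *     U32 const relRow   = (hash >> 8) << 5;                    // first U32 of the selected 32-entry row
 *     U32* const row     = hashTable + relRow;                  // candidate positions
 *     BYTE* const tagRow = (BYTE*)(tagTable + relRow);          // head byte followed by 1-byte tags
 *     U32 const pos      = ZSTD_row_nextIndex(tagRow, 31);      // advance the circular head
 *     tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)(hash & 0xFF);   // store the "tag"
 *     row[pos]           = idx;                                 // store the candidate position
 */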
1265 FORCE_INLINE_TEMPLATE
1266 size_t ZSTD_RowFindBestMatch_generic (
1267                         ZSTD_matchState_t* ms,
1268                         const BYTE* const ip, const BYTE* const iLimit,
1269                         size_t* offsetPtr,
1270                         const U32 mls, const ZSTD_dictMode_e dictMode,
1271                         const U32 rowLog)
1272 {
1273     U32* const hashTable = ms->hashTable;
1274     U16* const tagTable = ms->tagTable;
1275     U32* const hashCache = ms->hashCache;
1276     const U32 hashLog = ms->rowHashLog;
1277     const ZSTD_compressionParameters* const cParams = &ms->cParams;
1278     const BYTE* const base = ms->window.base;
1279     const BYTE* const dictBase = ms->window.dictBase;
1280     const U32 dictLimit = ms->window.dictLimit;
1281     const BYTE* const prefixStart = base + dictLimit;
1282     const BYTE* const dictEnd = dictBase + dictLimit;
1283     const U32 curr = (U32)(ip-base);
1284     const U32 maxDistance = 1U << cParams->windowLog;
1285     const U32 lowestValid = ms->window.lowLimit;
1286     const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
1287     const U32 isDictionary = (ms->loadedDictEnd != 0);
1288     const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
1289     const U32 rowEntries = (1U << rowLog);
1290     const U32 rowMask = rowEntries - 1;
1291     const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
1292     U32 nbAttempts = 1U << cappedSearchLog;
1293     size_t ml=4-1;
1294 
1295     /* DMS/DDS variables that may be referenced later */
1296     const ZSTD_matchState_t* const dms = ms->dictMatchState;
1297     size_t ddsIdx;
1298     U32 ddsExtraAttempts; /* cctx hash tables are limited in searches, but allow extra searches into DDS */
1299     U32 dmsTag;
1300     U32* dmsRow;
1301     BYTE* dmsTagRow;
1302 
1303     if (dictMode == ZSTD_dedicatedDictSearch) {
1304         const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
1305         {   /* Prefetch DDS hashtable entry */
1306             ddsIdx = ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG;
1307             PREFETCH_L1(&dms->hashTable[ddsIdx]);
1308         }
1309         ddsExtraAttempts = cParams->searchLog > rowLog ? 1U << (cParams->searchLog - rowLog) : 0;
1310     }
1311 
1312     if (dictMode == ZSTD_dictMatchState) {
1313         /* Prefetch DMS rows */
1314         U32* const dmsHashTable = dms->hashTable;
1315         U16* const dmsTagTable = dms->tagTable;
1316         U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
1317         U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1318         dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;
1319         dmsTagRow = (BYTE*)(dmsTagTable + dmsRelRow);
1320         dmsRow = dmsHashTable + dmsRelRow;
1321         ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog);
1322     }
1323 
1324     /* Update the hashTable and tagTable up to (but not including) ip */
1325     ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
1326     {   /* Get the hash for ip, compute the appropriate row */
1327         U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls);
1328         U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
1329         U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
1330         U32* const row = hashTable + relRow;
1331         BYTE* tagRow = (BYTE*)(tagTable + relRow);
1332         U32 const head = *tagRow & rowMask;
1333         U32 matchBuffer[32 /* maximum nb entries per row */];
1334         size_t numMatches = 0;
1335         size_t currMatch = 0;
1336         ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries);
1337 
1338         /* Cycle through the matches and prefetch */
1339         for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
1340             U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
1341             U32 const matchIndex = row[matchPos];
1342             assert(numMatches < rowEntries);
1343             if (matchIndex < lowLimit)
1344                 break;
1345             if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
1346                 PREFETCH_L1(base + matchIndex);
1347             } else {
1348                 PREFETCH_L1(dictBase + matchIndex);
1349             }
1350             matchBuffer[numMatches++] = matchIndex;
1351         }
1352 
1353         /* Speed opt: also insert the current position into the hashtable here. This saves one iteration of the loop
1354            in ZSTD_row_update_internal() at the next search. */
1355         {
1356             U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
1357             tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag;
1358             row[pos] = ms->nextToUpdate++;
1359         }
1360 
1361         /* Return the longest match */
1362         for (; currMatch < numMatches; ++currMatch) {
1363             U32 const matchIndex = matchBuffer[currMatch];
1364             size_t currentMl=0;
1365             assert(matchIndex < curr);
1366             assert(matchIndex >= lowLimit);
1367 
1368             if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
1369                 const BYTE* const match = base + matchIndex;
1370                 assert(matchIndex >= dictLimit);   /* ensures this is true if dictMode != ZSTD_extDict */
1371                 if (match[ml] == ip[ml])   /* potentially better */
1372                     currentMl = ZSTD_count(ip, match, iLimit);
1373             } else {
1374                 const BYTE* const match = dictBase + matchIndex;
1375                 assert(match+4 <= dictEnd);
1376                 if (MEM_read32(match) == MEM_read32(ip))   /* assumption : matchIndex <= dictLimit-4 (by table construction) */
1377                     currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
1378             }
1379 
1380             /* Save best solution */
1381             if (currentMl > ml) {
1382                 ml = currentMl;
1383                 *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE;
1384                 if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
1385             }
1386         }
1387     }
1388 
1389     if (dictMode == ZSTD_dedicatedDictSearch) {
1390         ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms,
1391                                                   ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
1392     } else if (dictMode == ZSTD_dictMatchState) {
1393         /* TODO: Measure and potentially add prefetching to DMS */
1394         const U32 dmsLowestIndex       = dms->window.dictLimit;
1395         const BYTE* const dmsBase      = dms->window.base;
1396         const BYTE* const dmsEnd       = dms->window.nextSrc;
1397         const U32 dmsSize              = (U32)(dmsEnd - dmsBase);
1398         const U32 dmsIndexDelta        = dictLimit - dmsSize;
1399 
1400         {   U32 const head = *dmsTagRow & rowMask;
1401             U32 matchBuffer[32 /* maximum nb row entries */];
1402             size_t numMatches = 0;
1403             size_t currMatch = 0;
1404             ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries);
1405 
1406             for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
1407                 U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
1408                 U32 const matchIndex = dmsRow[matchPos];
1409                 if (matchIndex < dmsLowestIndex)
1410                     break;
1411                 PREFETCH_L1(dmsBase + matchIndex);
1412                 matchBuffer[numMatches++] = matchIndex;
1413             }
1414 
1415             /* Return the longest match */
1416             for (; currMatch < numMatches; ++currMatch) {
1417                 U32 const matchIndex = matchBuffer[currMatch];
1418                 size_t currentMl=0;
1419                 assert(matchIndex >= dmsLowestIndex);
1420                 assert(matchIndex < curr);
1421 
1422                 {   const BYTE* const match = dmsBase + matchIndex;
1423                     assert(match+4 <= dmsEnd);
1424                     if (MEM_read32(match) == MEM_read32(ip))
1425                         currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;
1426                 }
1427 
1428                 if (currentMl > ml) {
1429                     ml = currentMl;
1430                     *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
1431                     if (ip+currentMl == iLimit) break;
1432                 }
1433             }
1434         }
1435     }
1436     return ml;
1437 }
1438 
1439 /* Inlining is important to hardwire a hot branch (template emulation) */
1440 FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_selectMLS (
1441                         ZSTD_matchState_t* ms,
1442                         const BYTE* ip, const BYTE* const iLimit,
1443                         const ZSTD_dictMode_e dictMode, size_t* offsetPtr, const U32 rowLog)
1444 {
1445     switch(ms->cParams.minMatch)
1446     {
1447     default : /* includes case 3 */
1448     case 4 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, dictMode, rowLog);
1449     case 5 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, dictMode, rowLog);
1450     case 7 :
1451     case 6 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, dictMode, rowLog);
1452     }
1453 }
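/* Note on the switch above : the row-based matchfinder is only instantiated for 4-, 5- and 6-byte
 * hashes, so minMatch==3 is bumped up to 4 and minMatch==7 falls back to 6. */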
1454 
1455 FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_selectRowLog (
1456                         ZSTD_matchState_t* ms,
1457                         const BYTE* ip, const BYTE* const iLimit,
1458                         size_t* offsetPtr)
1459 {
1460     const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
1461     switch(cappedSearchLog)
1462     {
1463     default :
1464     case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_noDict, offsetPtr, 4);
1465     case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_noDict, offsetPtr, 5);
1466     }
1467 }
1468 
1469 FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_dictMatchState_selectRowLog(
1470                         ZSTD_matchState_t* ms,
1471                         const BYTE* ip, const BYTE* const iLimit,
1472                         size_t* offsetPtr)
1473 {
1474     const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
1475     switch(cappedSearchLog)
1476     {
1477     default :
1478     case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dictMatchState, offsetPtr, 4);
1479     case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dictMatchState, offsetPtr, 5);
1480     }
1481 }
1482 
1483 FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_dedicatedDictSearch_selectRowLog(
1484                         ZSTD_matchState_t* ms,
1485                         const BYTE* ip, const BYTE* const iLimit,
1486                         size_t* offsetPtr)
1487 {
1488     const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
1489     switch(cappedSearchLog)
1490     {
1491     default :
1492     case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dedicatedDictSearch, offsetPtr, 4);
1493     case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dedicatedDictSearch, offsetPtr, 5);
1494     }
1495 }
1496 
1497 FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_extDict_selectRowLog (
1498                         ZSTD_matchState_t* ms,
1499                         const BYTE* ip, const BYTE* const iLimit,
1500                         size_t* offsetPtr)
1501 {
1502     const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
1503     switch(cappedSearchLog)
1504     {
1505     default :
1506     case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_extDict, offsetPtr, 4);
1507     case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_extDict, offsetPtr, 5);
1508     }
1509 }
1510 
1511 
1512 /* *******************************
1513 *  Common parser - lazy strategy
1514 *********************************/
1515 typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;
1516 
1517 FORCE_INLINE_TEMPLATE size_t
1518 ZSTD_compressBlock_lazy_generic(
1519                         ZSTD_matchState_t* ms, seqStore_t* seqStore,
1520                         U32 rep[ZSTD_REP_NUM],
1521                         const void* src, size_t srcSize,
1522                         const searchMethod_e searchMethod, const U32 depth,
1523                         ZSTD_dictMode_e const dictMode)
1524 {
1525     const BYTE* const istart = (const BYTE*)src;
1526     const BYTE* ip = istart;
1527     const BYTE* anchor = istart;
1528     const BYTE* const iend = istart + srcSize;
1529     const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
1530     const BYTE* const base = ms->window.base;
1531     const U32 prefixLowestIndex = ms->window.dictLimit;
1532     const BYTE* const prefixLowest = base + prefixLowestIndex;
1533     const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
1534 
1535     typedef size_t (*searchMax_f)(
1536                         ZSTD_matchState_t* ms,
1537                         const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
1538 
1539     /**
1540      * This table is indexed first by the four ZSTD_dictMode_e values, and then
1541      * by the three searchMethod_e values. NULLs are placed for configurations
1542      * that should never occur (extDict modes go to the other implementation
1543      * below and there is no DDSS for binary tree search yet).
1544      */
1545     const searchMax_f searchFuncs[4][3] = {
1546         {
1547             ZSTD_HcFindBestMatch_selectMLS,
1548             ZSTD_BtFindBestMatch_selectMLS,
1549             ZSTD_RowFindBestMatch_selectRowLog
1550         },
1551         {
1552             NULL,
1553             NULL,
1554             NULL
1555         },
1556         {
1557             ZSTD_HcFindBestMatch_dictMatchState_selectMLS,
1558             ZSTD_BtFindBestMatch_dictMatchState_selectMLS,
1559             ZSTD_RowFindBestMatch_dictMatchState_selectRowLog
1560         },
1561         {
1562             ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS,
1563             NULL,
1564             ZSTD_RowFindBestMatch_dedicatedDictSearch_selectRowLog
1565         }
1566     };
1567 
1568     searchMax_f const searchMax = searchFuncs[dictMode][(int)searchMethod];
1569     U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;
1570 
1571     const int isDMS = dictMode == ZSTD_dictMatchState;
1572     const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
1573     const int isDxS = isDMS || isDDS;
1574     const ZSTD_matchState_t* const dms = ms->dictMatchState;
1575     const U32 dictLowestIndex      = isDxS ? dms->window.dictLimit : 0;
1576     const BYTE* const dictBase     = isDxS ? dms->window.base : NULL;
1577     const BYTE* const dictLowest   = isDxS ? dictBase + dictLowestIndex : NULL;
1578     const BYTE* const dictEnd      = isDxS ? dms->window.nextSrc : NULL;
1579     const U32 dictIndexDelta       = isDxS ?
1580                                      prefixLowestIndex - (U32)(dictEnd - dictBase) :
1581                                      0;
1582     const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));
1583 
1584     assert(searchMax != NULL);
1585 
1586     DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
1587     ip += (dictAndPrefixLength == 0);
1588     if (dictMode == ZSTD_noDict) {
1589         U32 const curr = (U32)(ip - base);
1590         U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog);
1591         U32 const maxRep = curr - windowLow;
1592         if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
1593         if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
1594     }
1595     if (isDxS) {
1596         /* dictMatchState repCode checks don't currently handle repCode == 0
1597          * disabling. */
1598         assert(offset_1 <= dictAndPrefixLength);
1599         assert(offset_2 <= dictAndPrefixLength);
1600     }
1601 
1602     if (searchMethod == search_rowHash) {
1603         ZSTD_row_fillHashCache(ms, base, rowLog,
1604                             MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
1605                             ms->nextToUpdate, ilimit);
1606     }
1607 
1608     /* Match Loop */
1609 #if defined(__GNUC__) && defined(__x86_64__)
1610     /* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
1611      * code alignment is perturbed. To fix the instability, align the loop on 32 bytes.
1612      */
1613     __asm__(".p2align 5");
1614 #endif
1615     while (ip < ilimit) {
1616         size_t matchLength=0;
1617         size_t offset=0;
1618         const BYTE* start=ip+1;
1619 
1620         /* check repCode */
1621         if (isDxS) {
1622             const U32 repIndex = (U32)(ip - base) + 1 - offset_1;
1623             const BYTE* repMatch = ((dictMode == ZSTD_dictMatchState || dictMode == ZSTD_dedicatedDictSearch)
1624                                 && repIndex < prefixLowestIndex) ?
1625                                    dictBase + (repIndex - dictIndexDelta) :
1626                                    base + repIndex;
1627             if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
1628                 && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
1629                 const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
1630                 matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
1631                 if (depth==0) goto _storeSequence;
1632             }
1633         }
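        /* Note on the "intentional underflow" guard above : when repIndex >= prefixLowestIndex,
         * (prefixLowestIndex-1) - repIndex wraps to a large U32 and the test passes, so prefix
         * matches are always considered. Otherwise the guard requires repIndex <= prefixLowestIndex-4,
         * which keeps the 4-byte read at repMatch from straddling the dictionary/prefix boundary.
         * The later repcode checks in this function rely on the same trick. */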
1634         if ( dictMode == ZSTD_noDict
1635           && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) {
1636             matchLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
1637             if (depth==0) goto _storeSequence;
1638         }
1639 
1640         /* first search (depth 0) */
1641         {   size_t offsetFound = 999999999;
1642             size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
1643             if (ml2 > matchLength)
1644                 matchLength = ml2, start = ip, offset=offsetFound;
1645         }
1646 
1647         if (matchLength < 4) {
1648             ip += ((ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */
1649             continue;
1650         }
1651 
1652         /* let's try to find a better solution */
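        /* Cost model for the deeper search below : a candidate replaces the current best when
         * gain2 > gain1, where a gain is roughly matchLength*3 (or *4 at greater depth) minus
         * ZSTD_highbit32(offset+1), an estimate of the offset cost in bits. Repcode candidates pay
         * no offset cost, and the small constant added to gain1 biases the comparison toward
         * keeping the match already found. */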
1653         if (depth>=1)
1654         while (ip<ilimit) {
1655             ip ++;
1656             if ( (dictMode == ZSTD_noDict)
1657               && (offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
1658                 size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
1659                 int const gain2 = (int)(mlRep * 3);
1660                 int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
1661                 if ((mlRep >= 4) && (gain2 > gain1))
1662                     matchLength = mlRep, offset = 0, start = ip;
1663             }
1664             if (isDxS) {
1665                 const U32 repIndex = (U32)(ip - base) - offset_1;
1666                 const BYTE* repMatch = repIndex < prefixLowestIndex ?
1667                                dictBase + (repIndex - dictIndexDelta) :
1668                                base + repIndex;
1669                 if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
1670                     && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
1671                     const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
1672                     size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
1673                     int const gain2 = (int)(mlRep * 3);
1674                     int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
1675                     if ((mlRep >= 4) && (gain2 > gain1))
1676                         matchLength = mlRep, offset = 0, start = ip;
1677                 }
1678             }
1679             {   size_t offset2=999999999;
1680                 size_t const ml2 = searchMax(ms, ip, iend, &offset2);
1681                 int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
1682                 int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
1683                 if ((ml2 >= 4) && (gain2 > gain1)) {
1684                     matchLength = ml2, offset = offset2, start = ip;
1685                     continue;   /* search a better one */
1686             }   }
1687 
1688             /* let's find an even better one */
1689             if ((depth==2) && (ip<ilimit)) {
1690                 ip ++;
1691                 if ( (dictMode == ZSTD_noDict)
1692                   && (offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
1693                     size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
1694                     int const gain2 = (int)(mlRep * 4);
1695                     int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
1696                     if ((mlRep >= 4) && (gain2 > gain1))
1697                         matchLength = mlRep, offset = 0, start = ip;
1698                 }
1699                 if (isDxS) {
1700                     const U32 repIndex = (U32)(ip - base) - offset_1;
1701                     const BYTE* repMatch = repIndex < prefixLowestIndex ?
1702                                    dictBase + (repIndex - dictIndexDelta) :
1703                                    base + repIndex;
1704                     if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
1705                         && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
1706                         const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
1707                         size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
1708                         int const gain2 = (int)(mlRep * 4);
1709                         int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
1710                         if ((mlRep >= 4) && (gain2 > gain1))
1711                             matchLength = mlRep, offset = 0, start = ip;
1712                     }
1713                 }
1714                 {   size_t offset2=999999999;
1715                     size_t const ml2 = searchMax(ms, ip, iend, &offset2);
1716                     int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
1717                     int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7);
1718                     if ((ml2 >= 4) && (gain2 > gain1)) {
1719                         matchLength = ml2, offset = offset2, start = ip;
1720                         continue;
1721             }   }   }
1722             break;  /* nothing found : store previous solution */
1723         }
1724 
1725         /* NOTE:
1726          * start[-offset+ZSTD_REP_MOVE-1] is undefined behavior:
1727          * (-offset+ZSTD_REP_MOVE-1) is unsigned, and adding it to start
1728          * overflows the pointer, which is undefined behavior.
1729          */
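        /* Offset convention in this parser : offset == 0 selects the repeat offset offset_1, while a
         * regular match carries (distance + ZSTD_REP_MOVE). The subtraction below therefore recovers
         * the raw match distance before the catch-up loop and the repcode history update. */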
1730         /* catch up */
1731         if (offset) {
1732             if (dictMode == ZSTD_noDict) {
1733                 while ( ((start > anchor) & (start - (offset-ZSTD_REP_MOVE) > prefixLowest))
1734                      && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) )  /* only search for offset within prefix */
1735                     { start--; matchLength++; }
1736             }
1737             if (isDxS) {
1738                 U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
1739                 const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
1740                 const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
1741                 while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; }  /* catch up */
1742             }
1743             offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE);
1744         }
1745         /* store sequence */
1746 _storeSequence:
1747         {   size_t const litLength = start - anchor;
1748             ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
1749             anchor = ip = start + matchLength;
1750         }
1751 
1752         /* check immediate repcode */
1753         if (isDxS) {
1754             while (ip <= ilimit) {
1755                 U32 const current2 = (U32)(ip-base);
1756                 U32 const repIndex = current2 - offset_2;
1757                 const BYTE* repMatch = repIndex < prefixLowestIndex ?
1758                         dictBase - dictIndexDelta + repIndex :
1759                         base + repIndex;
1760                 if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex) >= 3 /* intentional overflow */)
1761                    && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
1762                     const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
1763                     matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4;
1764                     offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset;   /* swap offset_2 <=> offset_1 */
1765                     ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
1766                     ip += matchLength;
1767                     anchor = ip;
1768                     continue;
1769                 }
1770                 break;
1771             }
1772         }
1773 
1774         if (dictMode == ZSTD_noDict) {
1775             while ( ((ip <= ilimit) & (offset_2>0))
1776                  && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
1777                 /* store sequence */
1778                 matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
1779                 offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap repcodes */
1780                 ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
1781                 ip += matchLength;
1782                 anchor = ip;
1783                 continue;   /* faster when present ... (?) */
1784     }   }   }
1785 
1786     /* Save reps for next block */
1787     rep[0] = offset_1 ? offset_1 : savedOffset;
1788     rep[1] = offset_2 ? offset_2 : savedOffset;
1789 
1790     /* Return the last literals size */
1791     return (size_t)(iend - anchor);
1792 }
1793 
1794 
1795 size_t ZSTD_compressBlock_btlazy2(
1796         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1797         void const* src, size_t srcSize)
1798 {
1799     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict);
1800 }
1801 
1802 size_t ZSTD_compressBlock_lazy2(
1803         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1804         void const* src, size_t srcSize)
1805 {
1806     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict);
1807 }
1808 
1809 size_t ZSTD_compressBlock_lazy(
1810         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1811         void const* src, size_t srcSize)
1812 {
1813     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict);
1814 }
1815 
1816 size_t ZSTD_compressBlock_greedy(
1817         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1818         void const* src, size_t srcSize)
1819 {
1820     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict);
1821 }
1822 
1823 size_t ZSTD_compressBlock_btlazy2_dictMatchState(
1824         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1825         void const* src, size_t srcSize)
1826 {
1827     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState);
1828 }
1829 
1830 size_t ZSTD_compressBlock_lazy2_dictMatchState(
1831         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1832         void const* src, size_t srcSize)
1833 {
1834     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState);
1835 }
1836 
1837 size_t ZSTD_compressBlock_lazy_dictMatchState(
1838         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1839         void const* src, size_t srcSize)
1840 {
1841     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState);
1842 }
1843 
1844 size_t ZSTD_compressBlock_greedy_dictMatchState(
1845         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1846         void const* src, size_t srcSize)
1847 {
1848     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState);
1849 }
1850 
1851 
1852 size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
1853         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1854         void const* src, size_t srcSize)
1855 {
1856     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch);
1857 }
1858 
1859 size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
1860         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1861         void const* src, size_t srcSize)
1862 {
1863     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch);
1864 }
1865 
1866 size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
1867         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1868         void const* src, size_t srcSize)
1869 {
1870     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch);
1871 }
1872 
1873 /* Row-based matchfinder */
1874 size_t ZSTD_compressBlock_lazy2_row(
1875         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1876         void const* src, size_t srcSize)
1877 {
1878     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict);
1879 }
1880 
1881 size_t ZSTD_compressBlock_lazy_row(
1882         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1883         void const* src, size_t srcSize)
1884 {
1885     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict);
1886 }
1887 
1888 size_t ZSTD_compressBlock_greedy_row(
1889         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1890         void const* src, size_t srcSize)
1891 {
1892     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict);
1893 }
1894 
1895 size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
1896         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1897         void const* src, size_t srcSize)
1898 {
1899     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState);
1900 }
1901 
1902 size_t ZSTD_compressBlock_lazy_dictMatchState_row(
1903         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1904         void const* src, size_t srcSize)
1905 {
1906     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState);
1907 }
1908 
1909 size_t ZSTD_compressBlock_greedy_dictMatchState_row(
1910         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1911         void const* src, size_t srcSize)
1912 {
1913     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState);
1914 }
1915 
1916 
1917 size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
1918         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1919         void const* src, size_t srcSize)
1920 {
1921     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch);
1922 }
1923 
1924 size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
1925         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1926         void const* src, size_t srcSize)
1927 {
1928     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch);
1929 }
1930 
1931 size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
1932         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
1933         void const* src, size_t srcSize)
1934 {
1935     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch);
1936 }
1937 
1938 FORCE_INLINE_TEMPLATE
1939 size_t ZSTD_compressBlock_lazy_extDict_generic(
1940                         ZSTD_matchState_t* ms, seqStore_t* seqStore,
1941                         U32 rep[ZSTD_REP_NUM],
1942                         const void* src, size_t srcSize,
1943                         const searchMethod_e searchMethod, const U32 depth)
1944 {
1945     const BYTE* const istart = (const BYTE*)src;
1946     const BYTE* ip = istart;
1947     const BYTE* anchor = istart;
1948     const BYTE* const iend = istart + srcSize;
1949     const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
1950     const BYTE* const base = ms->window.base;
1951     const U32 dictLimit = ms->window.dictLimit;
1952     const BYTE* const prefixStart = base + dictLimit;
1953     const BYTE* const dictBase = ms->window.dictBase;
1954     const BYTE* const dictEnd  = dictBase + dictLimit;
1955     const BYTE* const dictStart  = dictBase + ms->window.lowLimit;
1956     const U32 windowLog = ms->cParams.windowLog;
1957     const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
1958 
1959     typedef size_t (*searchMax_f)(
1960                         ZSTD_matchState_t* ms,
1961                         const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
1962     const searchMax_f searchFuncs[3] = {
1963         ZSTD_HcFindBestMatch_extDict_selectMLS,
1964         ZSTD_BtFindBestMatch_extDict_selectMLS,
1965         ZSTD_RowFindBestMatch_extDict_selectRowLog
1966     };
1967     searchMax_f searchMax = searchFuncs[(int)searchMethod];
1968     U32 offset_1 = rep[0], offset_2 = rep[1];
1969 
1970     DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
1971 
1972     /* init */
1973     ip += (ip == prefixStart);
1974     if (searchMethod == search_rowHash) {
1975         ZSTD_row_fillHashCache(ms, base, rowLog,
1976                                MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
1977                                ms->nextToUpdate, ilimit);
1978     }
1979 
1980     /* Match Loop */
1981 #if defined(__GNUC__) && defined(__x86_64__)
1982     /* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
1983      * code alignment is perturbed. To fix the instability, align the loop on 32 bytes.
1984      */
1985     __asm__(".p2align 5");
1986 #endif
1987     while (ip < ilimit) {
1988         size_t matchLength=0;
1989         size_t offset=0;
1990         const BYTE* start=ip+1;
1991         U32 curr = (U32)(ip-base);
1992 
1993         /* check repCode */
1994         {   const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr+1, windowLog);
1995             const U32 repIndex = (U32)(curr+1 - offset_1);
1996             const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
1997             const BYTE* const repMatch = repBase + repIndex;
1998             if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */
1999                & (offset_1 < curr+1 - windowLow) ) /* note: we are searching at curr+1 */
2000             if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
2001                 /* repcode detected, we should take it */
2002                 const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
2003                 matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repEnd, prefixStart) + 4;
2004                 if (depth==0) goto _storeSequence;
2005         }   }
2006 
2007         /* first search (depth 0) */
2008         {   size_t offsetFound = 999999999;
2009             size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
2010             if (ml2 > matchLength)
2011                 matchLength = ml2, start = ip, offset=offsetFound;
2012         }
2013 
2014         if (matchLength < 4) {
2015             ip += ((ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */
2016             continue;
2017         }
2018 
2019         /* let's try to find a better solution */
2020         if (depth>=1)
2021         while (ip<ilimit) {
2022             ip ++;
2023             curr++;
2024             /* check repCode */
2025             if (offset) {
2026                 const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
2027                 const U32 repIndex = (U32)(curr - offset_1);
2028                 const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
2029                 const BYTE* const repMatch = repBase + repIndex;
2030                 if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments  */
2031                    & (offset_1 < curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
2032                 if (MEM_read32(ip) == MEM_read32(repMatch)) {
2033                     /* repcode detected */
2034                     const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
2035                     size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
2036                     int const gain2 = (int)(repLength * 3);
2037                     int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
2038                     if ((repLength >= 4) && (gain2 > gain1))
2039                         matchLength = repLength, offset = 0, start = ip;
2040             }   }
2041 
2042             /* search match, depth 1 */
2043             {   size_t offset2=999999999;
2044                 size_t const ml2 = searchMax(ms, ip, iend, &offset2);
2045                 int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
2046                 int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
2047                 if ((ml2 >= 4) && (gain2 > gain1)) {
2048                     matchLength = ml2, offset = offset2, start = ip;
2049                     continue;   /* search a better one */
2050             }   }
2051 
2052             /* let's find an even better one */
2053             if ((depth==2) && (ip<ilimit)) {
2054                 ip ++;
2055                 curr++;
2056                 /* check repCode */
2057                 if (offset) {
2058                     const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
2059                     const U32 repIndex = (U32)(curr - offset_1);
2060                     const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
2061                     const BYTE* const repMatch = repBase + repIndex;
2062                     if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments  */
2063                        & (offset_1 < curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
2064                     if (MEM_read32(ip) == MEM_read32(repMatch)) {
2065                         /* repcode detected */
2066                         const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
2067                         size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
2068                         int const gain2 = (int)(repLength * 4);
2069                         int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
2070                         if ((repLength >= 4) && (gain2 > gain1))
2071                             matchLength = repLength, offset = 0, start = ip;
2072                 }   }
2073 
2074                 /* search match, depth 2 */
2075                 {   size_t offset2=999999999;
2076                     size_t const ml2 = searchMax(ms, ip, iend, &offset2);
2077                     int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
2078                     int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7);
2079                     if ((ml2 >= 4) && (gain2 > gain1)) {
2080                         matchLength = ml2, offset = offset2, start = ip;
2081                         continue;
2082             }   }   }
2083             break;  /* nothing found : store previous solution */
2084         }
2085 
2086         /* catch up */
2087         if (offset) {
2088             U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
2089             const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
2090             const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
2091             while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; }  /* catch up */
2092             offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE);
2093         }
2094 
2095         /* store sequence */
2096 _storeSequence:
2097         {   size_t const litLength = start - anchor;
2098             ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
2099             anchor = ip = start + matchLength;
2100         }
2101 
2102         /* check immediate repcode */
2103         while (ip <= ilimit) {
2104             const U32 repCurrent = (U32)(ip-base);
2105             const U32 windowLow = ZSTD_getLowestMatchIndex(ms, repCurrent, windowLog);
2106             const U32 repIndex = repCurrent - offset_2;
2107             const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
2108             const BYTE* const repMatch = repBase + repIndex;
2109             if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments  */
2110                & (offset_2 < repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
2111             if (MEM_read32(ip) == MEM_read32(repMatch)) {
2112                 /* repcode detected, we should take it */
2113                 const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
2114                 matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
2115                 offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset;   /* swap offset history */
2116                 ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
2117                 ip += matchLength;
2118                 anchor = ip;
2119                 continue;   /* faster when present ... (?) */
2120             }
2121             break;
2122     }   }
2123 
2124     /* Save reps for next block */
2125     rep[0] = offset_1;
2126     rep[1] = offset_2;
2127 
2128     /* Return the last literals size */
2129     return (size_t)(iend - anchor);
2130 }
2131 
2132 
2133 size_t ZSTD_compressBlock_greedy_extDict(
2134         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2135         void const* src, size_t srcSize)
2136 {
2137     return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0);
2138 }
2139 
2140 size_t ZSTD_compressBlock_lazy_extDict(
2141         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2142         void const* src, size_t srcSize)
2144 {
2145     return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1);
2146 }
2147 
2148 size_t ZSTD_compressBlock_lazy2_extDict(
2149         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2150         void const* src, size_t srcSize)
2152 {
2153     return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2);
2154 }
2155 
2156 size_t ZSTD_compressBlock_btlazy2_extDict(
2157         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2158         void const* src, size_t srcSize)
2160 {
2161     return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2);
2162 }
2163 
2164 size_t ZSTD_compressBlock_greedy_extDict_row(
2165         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2166         void const* src, size_t srcSize)
2167 {
2168     return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0);
2169 }
2170 
2171 size_t ZSTD_compressBlock_lazy_extDict_row(
2172         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2173         void const* src, size_t srcSize)
2175 {
2176     return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1);
2177 }
2178 
2179 size_t ZSTD_compressBlock_lazy2_extDict_row(
2180         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
2181         void const* src, size_t srcSize)
2183 {
2184     return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
2185 }
2186