/*
 * Copyright (c) Yann Collet, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */

#include "zstd_compress_internal.h"
#include "zstd_lazy.h"


/*-*************************************
*  Binary Tree search
***************************************/

static void
ZSTD_updateDUBT(ZSTD_matchState_t* ms,
                const BYTE* ip, const BYTE* iend,
                U32 mls)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const hashTable = ms->hashTable;
    U32  const hashLog = cParams->hashLog;

    U32* const bt = ms->chainTable;
    U32  const btLog  = cParams->chainLog - 1;
    U32  const btMask = (1 << btLog) - 1;

    const BYTE* const base = ms->window.base;
    U32 const target = (U32)(ip - base);
    U32 idx = ms->nextToUpdate;

    if (idx != target)
        DEBUGLOG(7, "ZSTD_updateDUBT, from %u to %u (dictLimit:%u)",
                    idx, target, ms->window.dictLimit);
    assert(ip + 8 <= iend);   /* condition for ZSTD_hashPtr */
    (void)iend;

    assert(idx >= ms->window.dictLimit);   /* condition for valid base+idx */
    for ( ; idx < target ; idx++) {
        size_t const h = ZSTD_hashPtr(base + idx, hashLog, mls);   /* assumption : ip + 8 <= iend */
        U32    const matchIndex = hashTable[h];

        U32* const nextCandidatePtr = bt + 2*(idx&btMask);
        U32* const sortMarkPtr = nextCandidatePtr + 1;

        DEBUGLOG(8, "ZSTD_updateDUBT: insert %u", idx);
        hashTable[h] = idx;   /* Update Hash Table */
        *nextCandidatePtr = matchIndex;   /* update BT like a chain */
        *sortMarkPtr = ZSTD_DUBT_UNSORTED_MARK;
    }
    ms->nextToUpdate = target;
}
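
/* Note : insertion above is O(1) per position : each new position is simply
 * prepended to its hash chain and tagged ZSTD_DUBT_UNSORTED_MARK.
 * Actual binary-tree ordering is deferred until a search reaches the
 * candidate (see ZSTD_DUBT_findBestMatch()), so positions that are never
 * searched are never sorted. */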


/** ZSTD_insertDUBT1() :
 *  sort one already inserted but unsorted position
 *  assumption : curr >= btlow == (curr - btmask)
 *  doesn't fail */
static void
ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
                 U32 curr, const BYTE* inputEnd,
                 U32 nbCompares, U32 btLow,
                 const ZSTD_dictMode_e dictMode)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const bt = ms->chainTable;
    U32  const btLog  = cParams->chainLog - 1;
    U32  const btMask = (1 << btLog) - 1;
    size_t commonLengthSmaller=0, commonLengthLarger=0;
    const BYTE* const base = ms->window.base;
    const BYTE* const dictBase = ms->window.dictBase;
    const U32 dictLimit = ms->window.dictLimit;
    const BYTE* const ip = (curr>=dictLimit) ? base + curr : dictBase + curr;
    const BYTE* const iend = (curr>=dictLimit) ? inputEnd : dictBase + dictLimit;
    const BYTE* const dictEnd = dictBase + dictLimit;
    const BYTE* const prefixStart = base + dictLimit;
    const BYTE* match;
    U32* smallerPtr = bt + 2*(curr&btMask);
    U32* largerPtr  = smallerPtr + 1;
    U32 matchIndex = *smallerPtr;   /* this candidate is unsorted : next sorted candidate is reached through *smallerPtr, while *largerPtr contains previous unsorted candidate (which is already saved and can be overwritten) */
    U32 dummy32;   /* to be nullified at the end */
    U32 const windowValid = ms->window.lowLimit;
    U32 const maxDistance = 1U << cParams->windowLog;
    U32 const windowLow = (curr - windowValid > maxDistance) ? curr - maxDistance : windowValid;


    DEBUGLOG(8, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)",
                curr, dictLimit, windowLow);
    assert(curr >= btLow);
    assert(ip < iend);   /* condition for ZSTD_count */

    while (nbCompares-- && (matchIndex > windowLow)) {
        U32* const nextPtr = bt + 2*(matchIndex & btMask);
        size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
        assert(matchIndex < curr);
        /* note : all candidates are now supposed sorted,
         * but it's still possible to have nextPtr[1] == ZSTD_DUBT_UNSORTED_MARK
         * when a real index has the same value as ZSTD_DUBT_UNSORTED_MARK */

        if ( (dictMode != ZSTD_extDict)
          || (matchIndex+matchLength >= dictLimit)  /* both in current segment */
          || (curr < dictLimit) /* both in extDict */) {
            const BYTE* const mBase = ( (dictMode != ZSTD_extDict)
                                     || (matchIndex+matchLength >= dictLimit)) ?
                                        base : dictBase;
            assert( (matchIndex+matchLength >= dictLimit)   /* might be wrong if extDict is incorrectly set to 0 */
                 || (curr < dictLimit) );
            match = mBase + matchIndex;
            matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
        } else {
            match = dictBase + matchIndex;
            matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
            if (matchIndex+matchLength >= dictLimit)
                match = base + matchIndex;   /* preparation for next read of match[matchLength] */
        }

        DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ",
                    curr, matchIndex, (U32)matchLength);

        if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
            break;   /* drop, to guarantee consistency; miss a bit of compression, but other solutions can corrupt the tree */
        }

        if (match[matchLength] < ip[matchLength]) {  /* necessarily within buffer */
            /* match is smaller than current */
            *smallerPtr = matchIndex;             /* update smaller idx */
            commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
            if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop searching */
            DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is smaller : next => %u",
                        matchIndex, btLow, nextPtr[1]);
            smallerPtr = nextPtr+1;               /* new "candidate" => larger than match, which was smaller than target */
            matchIndex = nextPtr[1];              /* new matchIndex, larger than previous and closer to current */
        } else {
            /* match is larger than current */
            *largerPtr = matchIndex;
            commonLengthLarger = matchLength;
            if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop searching */
            DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is larger => %u",
                        matchIndex, btLow, nextPtr[0]);
            largerPtr = nextPtr;
            matchIndex = nextPtr[0];
    }   }

    *smallerPtr = *largerPtr = 0;
}
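
/* Tree node layout reminder : for a position p, bt[2*(p&btMask)] holds its
 * "smaller" child and bt[2*(p&btMask)+1] its "larger" child, where the
 * comparison key is the suffix starting at p. */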


static size_t
ZSTD_DUBT_findBetterDictMatch (
        ZSTD_matchState_t* ms,
        const BYTE* const ip, const BYTE* const iend,
        size_t* offsetPtr,
        size_t bestLength,
        U32 nbCompares,
        U32 const mls,
        const ZSTD_dictMode_e dictMode)
{
    const ZSTD_matchState_t * const dms = ms->dictMatchState;
    const ZSTD_compressionParameters* const dmsCParams = &dms->cParams;
    const U32 * const dictHashTable = dms->hashTable;
    U32         const hashLog = dmsCParams->hashLog;
    size_t      const h  = ZSTD_hashPtr(ip, hashLog, mls);
    U32               dictMatchIndex = dictHashTable[h];

    const BYTE* const base = ms->window.base;
    const BYTE* const prefixStart = base + ms->window.dictLimit;
    U32         const curr = (U32)(ip-base);
    const BYTE* const dictBase = dms->window.base;
    const BYTE* const dictEnd = dms->window.nextSrc;
    U32         const dictHighLimit = (U32)(dms->window.nextSrc - dms->window.base);
    U32         const dictLowLimit = dms->window.lowLimit;
    U32         const dictIndexDelta = ms->window.lowLimit - dictHighLimit;

    U32*        const dictBt = dms->chainTable;
    U32         const btLog  = dmsCParams->chainLog - 1;
    U32         const btMask = (1 << btLog) - 1;
    U32         const btLow = (btMask >= dictHighLimit - dictLowLimit) ? dictLowLimit : dictHighLimit - btMask;

    size_t commonLengthSmaller=0, commonLengthLarger=0;

    (void)dictMode;
    assert(dictMode == ZSTD_dictMatchState);

    while (nbCompares-- && (dictMatchIndex > dictLowLimit)) {
        U32* const nextPtr = dictBt + 2*(dictMatchIndex & btMask);
        size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
        const BYTE* match = dictBase + dictMatchIndex;
        matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
        if (dictMatchIndex+matchLength >= dictHighLimit)
            match = base + dictMatchIndex + dictIndexDelta;   /* to prepare for next usage of match[matchLength] */

        if (matchLength > bestLength) {
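            /* Heuristic : accept the longer match only if its length gain,
             * weighted 4x, exceeds the growth in offset cost, measured in
             * bits (log2 of the match distance). */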
            U32 matchIndex = dictMatchIndex + dictIndexDelta;
            if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
                DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
                    curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + curr - matchIndex, dictMatchIndex, matchIndex);
                bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex;
            }
            if (ip+matchLength == iend) {   /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
                break;   /* drop, to guarantee consistency (miss a little bit of compression) */
            }
        }

        if (match[matchLength] < ip[matchLength]) {
            if (dictMatchIndex <= btLow) { break; }   /* beyond tree size, stop the search */
            commonLengthSmaller = matchLength;   /* all smaller will now have at least this guaranteed common length */
            dictMatchIndex = nextPtr[1];         /* new matchIndex larger than previous (closer to current) */
        } else {
            /* match is larger than current */
            if (dictMatchIndex <= btLow) { break; }   /* beyond tree size, stop the search */
            commonLengthLarger = matchLength;
            dictMatchIndex = nextPtr[0];
        }
    }

    if (bestLength >= MINMATCH) {
        U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
        DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
                    curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
    }
    return bestLength;

}


static size_t
ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
                        const BYTE* const ip, const BYTE* const iend,
                        size_t* offsetPtr,
                        U32 const mls,
                        const ZSTD_dictMode_e dictMode)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32*   const hashTable = ms->hashTable;
    U32    const hashLog = cParams->hashLog;
    size_t const h  = ZSTD_hashPtr(ip, hashLog, mls);
    U32          matchIndex = hashTable[h];

    const BYTE* const base = ms->window.base;
    U32    const curr = (U32)(ip-base);
    U32    const windowLow = ZSTD_getLowestMatchIndex(ms, curr, cParams->windowLog);

    U32*   const bt = ms->chainTable;
    U32    const btLog  = cParams->chainLog - 1;
    U32    const btMask = (1 << btLog) - 1;
    U32    const btLow = (btMask >= curr) ? 0 : curr - btMask;
    U32    const unsortLimit = MAX(btLow, windowLow);

    U32*         nextCandidate = bt + 2*(matchIndex&btMask);
    U32*         unsortedMark = bt + 2*(matchIndex&btMask) + 1;
    U32          nbCompares = 1U << cParams->searchLog;
    U32          nbCandidates = nbCompares;
    U32          previousCandidate = 0;

    DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", curr);
    assert(ip <= iend-8);   /* required for h calculation */
    assert(dictMode != ZSTD_dedicatedDictSearch);

    /* reach end of unsorted candidates list */
    while ( (matchIndex > unsortLimit)
         && (*unsortedMark == ZSTD_DUBT_UNSORTED_MARK)
         && (nbCandidates > 1) ) {
        DEBUGLOG(8, "ZSTD_DUBT_findBestMatch: candidate %u is unsorted",
                    matchIndex);
        *unsortedMark = previousCandidate;  /* the unsortedMark becomes a reversed chain, to move up back to original position */
        previousCandidate = matchIndex;
        matchIndex = *nextCandidate;
        nextCandidate = bt + 2*(matchIndex&btMask);
        unsortedMark = bt + 2*(matchIndex&btMask) + 1;
        nbCandidates --;
    }

    /* nullify last candidate if it's still unsorted
     * simplification, detrimental to compression ratio, beneficial for speed */
    if ( (matchIndex > unsortLimit)
      && (*unsortedMark==ZSTD_DUBT_UNSORTED_MARK) ) {
        DEBUGLOG(7, "ZSTD_DUBT_findBestMatch: nullify last unsorted candidate %u",
                    matchIndex);
        *nextCandidate = *unsortedMark = 0;
    }

    /* batch sort stacked candidates */
    matchIndex = previousCandidate;
    while (matchIndex) {  /* will end on matchIndex == 0 */
        U32* const nextCandidateIdxPtr = bt + 2*(matchIndex&btMask) + 1;
        U32 const nextCandidateIdx = *nextCandidateIdxPtr;
        ZSTD_insertDUBT1(ms, matchIndex, iend,
                         nbCandidates, unsortLimit, dictMode);
        matchIndex = nextCandidateIdx;
        nbCandidates++;
    }
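
    /* Note : the unsorted candidates were stacked newest-first, so the loop
     * above re-inserts them oldest-first; each ZSTD_insertDUBT1() call can
     * then rely on older nodes being already sorted. */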

    /* find longest match */
    {   size_t commonLengthSmaller = 0, commonLengthLarger = 0;
        const BYTE* const dictBase = ms->window.dictBase;
        const U32 dictLimit = ms->window.dictLimit;
        const BYTE* const dictEnd = dictBase + dictLimit;
        const BYTE* const prefixStart = base + dictLimit;
        U32* smallerPtr = bt + 2*(curr&btMask);
        U32* largerPtr  = bt + 2*(curr&btMask) + 1;
        U32 matchEndIdx = curr + 8 + 1;
        U32 dummy32;   /* to be nullified at the end */
        size_t bestLength = 0;

        matchIndex = hashTable[h];
        hashTable[h] = curr;   /* Update Hash Table */

        while (nbCompares-- && (matchIndex > windowLow)) {
            U32* const nextPtr = bt + 2*(matchIndex & btMask);
            size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
            const BYTE* match;

            if ((dictMode != ZSTD_extDict) || (matchIndex+matchLength >= dictLimit)) {
                match = base + matchIndex;
                matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
            } else {
                match = dictBase + matchIndex;
                matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
                if (matchIndex+matchLength >= dictLimit)
                    match = base + matchIndex;   /* to prepare for next usage of match[matchLength] */
            }

            if (matchLength > bestLength) {
                if (matchLength > matchEndIdx - matchIndex)
                    matchEndIdx = matchIndex + (U32)matchLength;
                if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
                    bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex;
                if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
                    if (dictMode == ZSTD_dictMatchState) {
                        nbCompares = 0; /* in addition to avoiding checking any
                                         * further in this loop, make sure we
                                         * skip checking in the dictionary. */
                    }
                    break;   /* drop, to guarantee consistency (miss a little bit of compression) */
                }
            }

            if (match[matchLength] < ip[matchLength]) {
                /* match is smaller than current */
                *smallerPtr = matchIndex;             /* update smaller idx */
                commonLengthSmaller = matchLength;    /* all smaller will now have at least this guaranteed common length */
                if (matchIndex <= btLow) { smallerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
                smallerPtr = nextPtr+1;               /* new "smaller" => larger of match */
                matchIndex = nextPtr[1];              /* new matchIndex larger than previous (closer to current) */
            } else {
                /* match is larger than current */
                *largerPtr = matchIndex;
                commonLengthLarger = matchLength;
                if (matchIndex <= btLow) { largerPtr=&dummy32; break; }   /* beyond tree size, stop the search */
                largerPtr = nextPtr;
                matchIndex = nextPtr[0];
        }   }

        *smallerPtr = *largerPtr = 0;

        if (dictMode == ZSTD_dictMatchState && nbCompares) {
            bestLength = ZSTD_DUBT_findBetterDictMatch(
                    ms, ip, iend,
                    offsetPtr, bestLength, nbCompares,
                    mls, dictMode);
        }

        assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */
        ms->nextToUpdate = matchEndIdx - 8;   /* skip repetitive patterns */
        if (bestLength >= MINMATCH) {
            U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
            DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
                        curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
        }
        return bestLength;
    }
}


/** ZSTD_BtFindBestMatch() : Tree updater, providing best match */
FORCE_INLINE_TEMPLATE size_t
ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
                const BYTE* const ip, const BYTE* const iLimit,
                      size_t* offsetPtr,
                const U32 mls /* template */,
                const ZSTD_dictMode_e dictMode)
{
    DEBUGLOG(7, "ZSTD_BtFindBestMatch");
    if (ip < ms->window.base + ms->nextToUpdate) return 0;   /* skipped area */
    ZSTD_updateDUBT(ms, ip, iLimit, mls);
    return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode);
}


static size_t
ZSTD_BtFindBestMatch_selectMLS (  ZSTD_matchState_t* ms,
                            const BYTE* ip, const BYTE* const iLimit,
                                  size_t* offsetPtr)
{
    switch(ms->cParams.minMatch)
    {
    default : /* includes case 3 */
    case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
    case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
    case 7 :
    case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
    }
}


static size_t ZSTD_BtFindBestMatch_dictMatchState_selectMLS (
                        ZSTD_matchState_t* ms,
                        const BYTE* ip, const BYTE* const iLimit,
                        size_t* offsetPtr)
{
    switch(ms->cParams.minMatch)
    {
    default : /* includes case 3 */
    case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
    case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
    case 7 :
    case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
    }
}


static size_t ZSTD_BtFindBestMatch_extDict_selectMLS (
                        ZSTD_matchState_t* ms,
                        const BYTE* ip, const BYTE* const iLimit,
                        size_t* offsetPtr)
{
    switch(ms->cParams.minMatch)
    {
    default : /* includes case 3 */
    case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
    case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
    case 7 :
    case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
    }
}

/***********************************
* Dedicated dict search
***********************************/

void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip)
{
    const BYTE* const base = ms->window.base;
    U32 const target = (U32)(ip - base);
    U32* const hashTable = ms->hashTable;
    U32* const chainTable = ms->chainTable;
    U32 const chainSize = 1 << ms->cParams.chainLog;
    U32 idx = ms->nextToUpdate;
    U32 const minChain = chainSize < target ? target - chainSize : idx;
    U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG;
    U32 const cacheSize = bucketSize - 1;
    U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize;
    U32 const chainLimit = chainAttempts > 255 ? 255 : chainAttempts;

    /* We know the hashtable is oversized by a factor of `bucketSize`.
     * We are going to temporarily pretend `bucketSize == 1`, keeping only a
     * single entry. We will use the rest of the space to construct a temporary
     * chaintable.
     */
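    /* e.g. with ZSTD_LAZY_DDSS_BUCKET_LOG==2 (bucketSize==4), the table of
     * 2^hashLog buckets of 4 entries is reinterpreted as 2^hashLog
     * single-entry buckets plus 3*2^hashLog spare slots, the latter serving
     * as the temporary chain table below. */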
    U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
    U32* const tmpHashTable = hashTable;
    U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog);
    U32 const tmpChainSize = ((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
    U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx;
    U32 hashIdx;

    assert(ms->cParams.chainLog <= 24);
    assert(ms->cParams.hashLog > ms->cParams.chainLog);
    assert(idx != 0);
    assert(tmpMinChain <= minChain);

    /* fill conventional hash table and conventional chain table */
    for ( ; idx < target; idx++) {
        U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch);
        if (idx >= tmpMinChain) {
            tmpChainTable[idx - tmpMinChain] = hashTable[h];
        }
        tmpHashTable[h] = idx;
    }

    /* sort chains into ddss chain table */
    {
        U32 chainPos = 0;
        for (hashIdx = 0; hashIdx < (1U << hashLog); hashIdx++) {
            U32 count;
            U32 countBeyondMinChain = 0;
            U32 i = tmpHashTable[hashIdx];
            for (count = 0; i >= tmpMinChain && count < cacheSize; count++) {
                /* skip through the chain to the first position that won't be
                 * in the hash cache bucket */
                if (i < minChain) {
                    countBeyondMinChain++;
                }
                i = tmpChainTable[i - tmpMinChain];
            }
            if (count == cacheSize) {
                for (count = 0; count < chainLimit;) {
                    if (i < minChain) {
                        if (!i || ++countBeyondMinChain > cacheSize) {
                            /* only allow pulling `cacheSize` number of entries
                             * into the cache or chainTable beyond `minChain`,
                             * to replace the entries pulled out of the
                             * chainTable into the cache. This lets us reach
                             * back further without increasing the total number
                             * of entries in the chainTable, guaranteeing the
                             * DDSS chain table will fit into the space
                             * allocated for the regular one. */
                            break;
                        }
                    }
                    chainTable[chainPos++] = i;
                    count++;
                    if (i < tmpMinChain) {
                        break;
                    }
                    i = tmpChainTable[i - tmpMinChain];
                }
            } else {
                count = 0;
            }
            if (count) {
                tmpHashTable[hashIdx] = ((chainPos - count) << 8) + count;
            } else {
                tmpHashTable[hashIdx] = 0;
            }
        }
        assert(chainPos <= chainSize); /* I believe this is guaranteed... */
    }

    /* move chain pointers into the last entry of each hash bucket */
    for (hashIdx = (1 << hashLog); hashIdx; ) {
        U32 const bucketIdx = --hashIdx << ZSTD_LAZY_DDSS_BUCKET_LOG;
        U32 const chainPackedPointer = tmpHashTable[hashIdx];
        U32 i;
        for (i = 0; i < cacheSize; i++) {
            hashTable[bucketIdx + i] = 0;
        }
        hashTable[bucketIdx + bucketSize - 1] = chainPackedPointer;
    }

    /* fill the buckets of the hash table */
    for (idx = ms->nextToUpdate; idx < target; idx++) {
        U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch)
                   << ZSTD_LAZY_DDSS_BUCKET_LOG;
        U32 i;
        /* Shift hash cache down 1. */
        for (i = cacheSize - 1; i; i--)
            hashTable[h + i] = hashTable[h + i - 1];
        hashTable[h] = idx;
    }

    ms->nextToUpdate = target;
}
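
/* Resulting layout : each hash bucket stores its cacheSize (= bucketSize-1)
 * most recent positions, and its last slot stores a packed pointer
 * ((chainStart << 8) | chainLength) into the shared chain table holding up
 * to 255 older positions. ZSTD_dedicatedDictSearch_lazy_search() below
 * consumes both parts. */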

/* Returns the longest match length found in the dedicated dict search structure.
 * If none are longer than the argument ml, then ml will be returned.
 */
FORCE_INLINE_TEMPLATE
size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts,
                                            const ZSTD_matchState_t* const dms,
                                            const BYTE* const ip, const BYTE* const iLimit,
                                            const BYTE* const prefixStart, const U32 curr,
                                            const U32 dictLimit, const size_t ddsIdx) {
    const U32 ddsLowestIndex  = dms->window.dictLimit;
    const BYTE* const ddsBase = dms->window.base;
    const BYTE* const ddsEnd  = dms->window.nextSrc;
    const U32 ddsSize         = (U32)(ddsEnd - ddsBase);
    const U32 ddsIndexDelta   = dictLimit - ddsSize;
    const U32 bucketSize      = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG);
    const U32 bucketLimit     = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1;
    U32 ddsAttempt;
    U32 matchIndex;

    for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) {
        PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]);
    }

    {
        U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
        U32 const chainIndex = chainPackedPointer >> 8;

        PREFETCH_L1(&dms->chainTable[chainIndex]);
    }

    for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) {
        size_t currentMl=0;
        const BYTE* match;
        matchIndex = dms->hashTable[ddsIdx + ddsAttempt];
        match = ddsBase + matchIndex;

        if (!matchIndex) {
            return ml;
        }

        /* guaranteed by table construction */
        (void)ddsLowestIndex;
        assert(matchIndex >= ddsLowestIndex);
        assert(match+4 <= ddsEnd);
        if (MEM_read32(match) == MEM_read32(ip)) {
            /* assumption : matchIndex <= dictLimit-4 (by table construction) */
            currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
        }

        /* save best solution */
        if (currentMl > ml) {
            ml = currentMl;
            *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
            if (ip+currentMl == iLimit) {
                /* best possible, avoids read overflow on next attempt */
                return ml;
            }
        }
    }

    {
        U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
        U32 chainIndex = chainPackedPointer >> 8;
        U32 const chainLength = chainPackedPointer & 0xFF;
        U32 const chainAttempts = nbAttempts - ddsAttempt;
        U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts;
        U32 chainAttempt;

        for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) {
            PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]);
        }

        for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) {
            size_t currentMl=0;
            const BYTE* match;
            matchIndex = dms->chainTable[chainIndex];
            match = ddsBase + matchIndex;

            /* guaranteed by table construction */
            assert(matchIndex >= ddsLowestIndex);
            assert(match+4 <= ddsEnd);
            if (MEM_read32(match) == MEM_read32(ip)) {
                /* assumption : matchIndex <= dictLimit-4 (by table construction) */
                currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
            }

            /* save best solution */
            if (currentMl > ml) {
                ml = currentMl;
                *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
                if (ip+currentMl == iLimit) break;   /* best possible, avoids read overflow on next attempt */
            }
        }
    }
    return ml;
}


/* *********************************
*  Hash Chain
***********************************/
#define NEXT_IN_CHAIN(d, mask)   chainTable[(d) & (mask)]
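
/* The chain table keeps, for each position d (mod table size), the previous
 * position whose minMatch-byte prefix hashed to the same bucket; repeatedly
 * applying NEXT_IN_CHAIN() walks candidates from nearest to farthest. */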

/* Update chains up to ip (excluded)
   Assumption : always within prefix (i.e. not within extDict) */
FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
                        ZSTD_matchState_t* ms,
                        const ZSTD_compressionParameters* const cParams,
                        const BYTE* ip, U32 const mls)
{
    U32* const hashTable  = ms->hashTable;
    const U32 hashLog = cParams->hashLog;
    U32* const chainTable = ms->chainTable;
    const U32 chainMask = (1 << cParams->chainLog) - 1;
    const BYTE* const base = ms->window.base;
    const U32 target = (U32)(ip - base);
    U32 idx = ms->nextToUpdate;

    while(idx < target) { /* catch up */
        size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls);
        NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
        hashTable[h] = idx;
        idx++;
    }

    ms->nextToUpdate = target;
    return hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
}

U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
}

/* inlining is important to hardwire a hot branch (template emulation) */
FORCE_INLINE_TEMPLATE
size_t ZSTD_HcFindBestMatch_generic (
                        ZSTD_matchState_t* ms,
                        const BYTE* const ip, const BYTE* const iLimit,
                        size_t* offsetPtr,
                        const U32 mls, const ZSTD_dictMode_e dictMode)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const chainTable = ms->chainTable;
    const U32 chainSize = (1 << cParams->chainLog);
    const U32 chainMask = chainSize-1;
    const BYTE* const base = ms->window.base;
    const BYTE* const dictBase = ms->window.dictBase;
    const U32 dictLimit = ms->window.dictLimit;
    const BYTE* const prefixStart = base + dictLimit;
    const BYTE* const dictEnd = dictBase + dictLimit;
    const U32 curr = (U32)(ip-base);
    const U32 maxDistance = 1U << cParams->windowLog;
    const U32 lowestValid = ms->window.lowLimit;
    const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
    const U32 isDictionary = (ms->loadedDictEnd != 0);
    const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
    const U32 minChain = curr > chainSize ? curr - chainSize : 0;
    U32 nbAttempts = 1U << cParams->searchLog;
    size_t ml=4-1;

    const ZSTD_matchState_t* const dms = ms->dictMatchState;
    const U32 ddsHashLog = dictMode == ZSTD_dedicatedDictSearch
                         ? dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG : 0;
    const size_t ddsIdx = dictMode == ZSTD_dedicatedDictSearch
                        ? ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG : 0;

    U32 matchIndex;

    if (dictMode == ZSTD_dedicatedDictSearch) {
        const U32* entry = &dms->hashTable[ddsIdx];
        PREFETCH_L1(entry);
    }

    /* HC4 match finder */
    matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);

    for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) {
        size_t currentMl=0;
        if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
            const BYTE* const match = base + matchIndex;
            assert(matchIndex >= dictLimit);   /* ensures this is true if dictMode != ZSTD_extDict */
            if (match[ml] == ip[ml])   /* potentially better */
                currentMl = ZSTD_count(ip, match, iLimit);
        } else {
            const BYTE* const match = dictBase + matchIndex;
            assert(match+4 <= dictEnd);
            if (MEM_read32(match) == MEM_read32(ip))   /* assumption : matchIndex <= dictLimit-4 (by table construction) */
                currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
        }

        /* save best solution */
        if (currentMl > ml) {
            ml = currentMl;
            *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE;
            if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
        }

        if (matchIndex <= minChain) break;
        matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
    }

    if (dictMode == ZSTD_dedicatedDictSearch) {
        ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms,
                                                  ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
    } else if (dictMode == ZSTD_dictMatchState) {
        const U32* const dmsChainTable = dms->chainTable;
        const U32 dmsChainSize         = (1 << dms->cParams.chainLog);
        const U32 dmsChainMask         = dmsChainSize - 1;
        const U32 dmsLowestIndex       = dms->window.dictLimit;
        const BYTE* const dmsBase      = dms->window.base;
        const BYTE* const dmsEnd       = dms->window.nextSrc;
        const U32 dmsSize              = (U32)(dmsEnd - dmsBase);
        const U32 dmsIndexDelta        = dictLimit - dmsSize;
        const U32 dmsMinChain = dmsSize > dmsChainSize ? dmsSize - dmsChainSize : 0;

        matchIndex = dms->hashTable[ZSTD_hashPtr(ip, dms->cParams.hashLog, mls)];

        for ( ; (matchIndex>=dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) {
            size_t currentMl=0;
            const BYTE* const match = dmsBase + matchIndex;
            assert(match+4 <= dmsEnd);
            if (MEM_read32(match) == MEM_read32(ip))   /* assumption : matchIndex <= dictLimit-4 (by table construction) */
                currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;

            /* save best solution */
            if (currentMl > ml) {
                ml = currentMl;
                *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
                if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
            }

            if (matchIndex <= dmsMinChain) break;

            matchIndex = dmsChainTable[matchIndex & dmsChainMask];
        }
    }

    return ml;
}


FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_selectMLS (
                        ZSTD_matchState_t* ms,
                        const BYTE* ip, const BYTE* const iLimit,
                        size_t* offsetPtr)
{
    switch(ms->cParams.minMatch)
    {
    default : /* includes case 3 */
    case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
    case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
    case 7 :
    case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
    }
}


static size_t ZSTD_HcFindBestMatch_dictMatchState_selectMLS (
                        ZSTD_matchState_t* ms,
                        const BYTE* ip, const BYTE* const iLimit,
                        size_t* offsetPtr)
{
    switch(ms->cParams.minMatch)
    {
    default : /* includes case 3 */
    case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
    case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
    case 7 :
    case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
    }
}


static size_t ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS (
                        ZSTD_matchState_t* ms,
                        const BYTE* ip, const BYTE* const iLimit,
                        size_t* offsetPtr)
{
    switch(ms->cParams.minMatch)
    {
    default : /* includes case 3 */
    case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dedicatedDictSearch);
    case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dedicatedDictSearch);
    case 7 :
    case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dedicatedDictSearch);
    }
}


FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
                        ZSTD_matchState_t* ms,
                        const BYTE* ip, const BYTE* const iLimit,
                        size_t* offsetPtr)
{
    switch(ms->cParams.minMatch)
    {
    default : /* includes case 3 */
    case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
    case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
    case 7 :
    case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
    }
}

/* *********************************
* (SIMD) Row-based matchfinder
***********************************/
/* Constants for row-based hash */
#define ZSTD_ROW_HASH_TAG_OFFSET 1     /* byte offset of hashes in the match state's tagTable from the beginning of a row */
#define ZSTD_ROW_HASH_TAG_BITS 8       /* nb bits to use for the tag */
#define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)

#define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1)

typedef U32 ZSTD_VecMask;   /* Clarifies when we are interacting with a U32 representing a mask of matches */

#if !defined(ZSTD_NO_INTRINSICS) && defined(__SSE2__) /* SIMD SSE version */

#include <emmintrin.h>
typedef __m128i ZSTD_Vec128;

/* Returns a 128-bit container with 128-bits from src */
static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
    return _mm_loadu_si128((ZSTD_Vec128 const*)src);
}

/* Returns a ZSTD_Vec128 with the byte "val" packed 16 times */
static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
    return _mm_set1_epi8((char)val);
}

/* Do byte-by-byte comparison of x and y. Then collapse the 128-bit resultant mask
 * into a 32-bit mask that is the MSB of each byte.
 */
static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
    return (ZSTD_VecMask)_mm_movemask_epi8(_mm_cmpeq_epi8(x, y));
}

typedef struct {
    __m128i fst;
    __m128i snd;
} ZSTD_Vec256;

static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) {
    ZSTD_Vec256 v;
    v.fst = ZSTD_Vec128_read(ptr);
    v.snd = ZSTD_Vec128_read((ZSTD_Vec128 const*)ptr + 1);
    return v;
}

static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
    ZSTD_Vec256 v;
    v.fst = ZSTD_Vec128_set8(val);
    v.snd = ZSTD_Vec128_set8(val);
    return v;
}

static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
    ZSTD_VecMask fstMask;
    ZSTD_VecMask sndMask;
    fstMask = ZSTD_Vec128_cmpMask8(x.fst, y.fst);
    sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd);
    return fstMask | (sndMask << 16);
}

#elif !defined(ZSTD_NO_INTRINSICS) && defined(__ARM_NEON) /* SIMD ARM NEON Version */

#include <arm_neon.h>
typedef uint8x16_t ZSTD_Vec128;

static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
    return vld1q_u8((const BYTE* const)src);
}

static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
    return vdupq_n_u8(val);
}

/* Mimics '_mm_movemask_epi8()' from SSE */
static U32 ZSTD_vmovmaskq_u8(ZSTD_Vec128 val) {
    /* Shift out everything but the MSB bits in each byte */
    uint16x8_t highBits = vreinterpretq_u16_u8(vshrq_n_u8(val, 7));
    /* Merge the even lanes together with vsra (right shift and add) */
    uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(highBits, highBits, 7));
    uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
    uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
    /* Extract the low 8 bits from each lane, merge */
    return vgetq_lane_u8(paired64, 0) | ((U32)vgetq_lane_u8(paired64, 8) << 8);
}

static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
    return (ZSTD_VecMask)ZSTD_vmovmaskq_u8(vceqq_u8(x, y));
}

typedef struct {
    uint8x16_t fst;
    uint8x16_t snd;
} ZSTD_Vec256;

static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) {
    ZSTD_Vec256 v;
    v.fst = ZSTD_Vec128_read(ptr);
    v.snd = ZSTD_Vec128_read((ZSTD_Vec128 const*)ptr + 1);
    return v;
}

static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
    ZSTD_Vec256 v;
    v.fst = ZSTD_Vec128_set8(val);
    v.snd = ZSTD_Vec128_set8(val);
    return v;
}

static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
    ZSTD_VecMask fstMask;
    ZSTD_VecMask sndMask;
    fstMask = ZSTD_Vec128_cmpMask8(x.fst, y.fst);
    sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd);
    return fstMask | (sndMask << 16);
}

#else /* Scalar fallback version */

#define VEC128_NB_SIZE_T (16 / sizeof(size_t))
typedef struct {
    size_t vec[VEC128_NB_SIZE_T];
} ZSTD_Vec128;

static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
    ZSTD_Vec128 ret;
    ZSTD_memcpy(ret.vec, src, VEC128_NB_SIZE_T*sizeof(size_t));
    return ret;
}

static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
    ZSTD_Vec128 ret = { {0} };
    int startBit = sizeof(size_t) * 8 - 8;
    for (;startBit >= 0; startBit -= 8) {
        unsigned j = 0;
        for (;j < VEC128_NB_SIZE_T; ++j) {
            ret.vec[j] |= ((size_t)val << startBit);
        }
    }
    return ret;
}

/* Compare x to y, byte by byte, generating a "matches" bitfield */
static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
    ZSTD_VecMask res = 0;
    unsigned i = 0;
    unsigned l = 0;
    for (; i < VEC128_NB_SIZE_T; ++i) {
        const size_t cmp1 = x.vec[i];
        const size_t cmp2 = y.vec[i];
        unsigned j = 0;
        for (; j < sizeof(size_t); ++j, ++l) {
            if (((cmp1 >> j*8) & 0xFF) == ((cmp2 >> j*8) & 0xFF)) {
                res |= ((U32)1 << (j+i*sizeof(size_t)));
            }
        }
    }
    return res;
}

#define VEC256_NB_SIZE_T (2*VEC128_NB_SIZE_T)
typedef struct {
    size_t vec[VEC256_NB_SIZE_T];
} ZSTD_Vec256;

static ZSTD_Vec256 ZSTD_Vec256_read(const void* const src) {
    ZSTD_Vec256 ret;
    ZSTD_memcpy(ret.vec, src, VEC256_NB_SIZE_T*sizeof(size_t));
    return ret;
}

static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
    ZSTD_Vec256 ret = { {0} };
    int startBit = sizeof(size_t) * 8 - 8;
    for (;startBit >= 0; startBit -= 8) {
        unsigned j = 0;
        for (;j < VEC256_NB_SIZE_T; ++j) {
            ret.vec[j] |= ((size_t)val << startBit);
        }
    }
    return ret;
}

/* Compare x to y, byte by byte, generating a "matches" bitfield */
static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
    ZSTD_VecMask res = 0;
    unsigned i = 0;
    unsigned l = 0;
    for (; i < VEC256_NB_SIZE_T; ++i) {
        const size_t cmp1 = x.vec[i];
        const size_t cmp2 = y.vec[i];
        unsigned j = 0;
        for (; j < sizeof(size_t); ++j, ++l) {
            if (((cmp1 >> j*8) & 0xFF) == ((cmp2 >> j*8) & 0xFF)) {
                res |= ((U32)1 << (j+i*sizeof(size_t)));
            }
        }
    }
    return res;
}

#endif /* !defined(ZSTD_NO_INTRINSICS) && defined(__SSE2__) */

/* ZSTD_VecMask_next():
 * Starting from the LSB, returns the idx of the next non-zero bit.
 * Basically counting the nb of trailing zeroes.
 */
static U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
#   if defined(_MSC_VER)   /* Visual */
    unsigned long r=0;
    return _BitScanForward(&r, val) ? (U32)r : 0;
#   elif defined(__GNUC__) && (__GNUC__ >= 3)
    return (U32)__builtin_ctz(val);
#   else
    /* Software ctz version: http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightMultLookup */
    static const U32 multiplyDeBruijnBitPosition[32] =
    {
        0,  1, 28,  2, 29, 14, 24,  3, 30, 22, 20, 15, 25, 17,  4,  8,
        31, 27, 13, 23, 21, 19, 16,  7, 26, 12, 18,  6, 11,  5, 10,  9
    };
    return multiplyDeBruijnBitPosition[((U32)((val & -(int)val) * 0x077CB531U)) >> 27];

#   endif
}
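
/* In the software fallback : (val & -val) isolates the lowest set bit;
 * multiplying by the De Bruijn constant 0x077CB531 places a unique 5-bit
 * pattern in the top bits, which indexes the table to recover the bit
 * position. */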

/* ZSTD_VecMask_rotateRight():
 * Rotates a bitfield to the right by "rotation" bits.
 * If the rotation is greater than totalBits, the returned mask is 0.
 */
FORCE_INLINE_TEMPLATE ZSTD_VecMask
ZSTD_VecMask_rotateRight(ZSTD_VecMask mask, U32 const rotation, U32 const totalBits) {
    if (rotation == 0)
        return mask;
    switch (totalBits) {
        default:
            assert(0);
        case 16:
            return (mask >> rotation) | (U16)(mask << (16 - rotation));
        case 32:
            return (mask >> rotation) | (U32)(mask << (32 - rotation));
    }
}
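
/* e.g. with totalBits==16 : mask=0x0005, rotation=1 -> 0x8002
 * (bit 0 wraps around to bit 15). */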

/* ZSTD_row_nextIndex():
 * Returns the next index to insert at within a tagTable row, and updates the "head"
 * value to reflect the update. Essentially cycles backwards from [0, {entries per row})
 */
FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) {
    U32 const next = (*tagRow - 1) & rowMask;
    *tagRow = (BYTE)next;
    return next;
}
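
/* The "head" byte stored at tagRow[0] counts downwards, so the most recently
 * inserted entry always sits at the current head position and older entries
 * age as the head moves : this is what makes each row a circular buffer. */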

/* ZSTD_isAligned():
 * Checks that a pointer is aligned to "align" bytes which must be a power of 2.
 */
MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
    assert((align & (align - 1)) == 0);
    return (((size_t)ptr) & (align - 1)) == 0;
}

/* ZSTD_row_prefetch():
 * Performs prefetching for the hashTable and tagTable at a given row.
 */
FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) {
    PREFETCH_L1(hashTable + relRow);
    if (rowLog == 5) {
        PREFETCH_L1(hashTable + relRow + 16);
    }
    PREFETCH_L1(tagTable + relRow);
    assert(rowLog == 4 || rowLog == 5);
    assert(ZSTD_isAligned(hashTable + relRow, 64));                 /* prefetched hash row always 64-byte aligned */
    assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on a multiple of 32 or 64 bytes */
}

/* ZSTD_row_fillHashCache():
 * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries,
 * but not beyond iLimit.
 */
static void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,
                                   U32 const rowLog, U32 const mls,
                                   U32 idx, const BYTE* const iLimit)
{
    U32 const* const hashTable = ms->hashTable;
    U16 const* const tagTable = ms->tagTable;
    U32 const hashLog = ms->rowHashLog;
    U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1);
    U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch);

    for (; idx < lim; ++idx) {
        U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
        U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
        ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
        ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;
    }

    DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms->hashCache[0], ms->hashCache[1],
                                                     ms->hashCache[2], ms->hashCache[3], ms->hashCache[4],
                                                     ms->hashCache[5], ms->hashCache[6], ms->hashCache[7]);
}
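
/* The hash cache is a small ring of ZSTD_ROW_HASH_CACHE_SIZE precomputed
 * hashes, indexed by (idx & ZSTD_ROW_HASH_CACHE_MASK) : it lets the row
 * matchfinder issue row prefetches ZSTD_ROW_HASH_CACHE_SIZE positions ahead
 * of their actual use. */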

/* ZSTD_row_nextCachedHash():
 * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at
 * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
 */
FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
                                                  U16 const* tagTable, BYTE const* base,
                                                  U32 idx, U32 const hashLog,
                                                  U32 const rowLog, U32 const mls)
{
    U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
    U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
    ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
    {   U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK];
        cache[idx & ZSTD_ROW_HASH_CACHE_MASK] = newHash;
        return hash;
    }
}

/* ZSTD_row_update_internal():
 * Inserts the byte at ip into the appropriate position in the hash table.
 * Determines the relative row, and the position within the {16, 32} entry row to insert at.
 */
FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
                                                    U32 const mls, U32 const rowLog,
                                                    U32 const rowMask, U32 const useCache)
{
    U32* const hashTable = ms->hashTable;
    U16* const tagTable = ms->tagTable;
    U32 const hashLog = ms->rowHashLog;
    const BYTE* const base = ms->window.base;
    const U32 target = (U32)(ip - base);
    U32 idx = ms->nextToUpdate;

    DEBUGLOG(6, "ZSTD_row_update_internal(): nextToUpdate=%u, current=%u", idx, target);
    for (; idx < target; ++idx) {
        U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, idx, hashLog, rowLog, mls)
                                  : (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
        U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
        U32* const row = hashTable + relRow;
        BYTE* tagRow = (BYTE*)(tagTable + relRow);  /* Though tagTable is laid out as a table of U16, each tag is only 1 byte.
                                                       Explicit cast allows us to get exact desired position within each row */
        U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);

        assert(hash == ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls));
        ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK;
        row[pos] = idx;
    }
    ms->nextToUpdate = target;
}

/* ZSTD_row_update():
 * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary
 * processing.
 */
void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
    const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
    const U32 rowMask = (1u << rowLog) - 1;
    const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);

    DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
    ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */);
}

/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches
 * the hash at the nth position in a row of the tagTable.
 */
FORCE_INLINE_TEMPLATE
ZSTD_VecMask ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) {
    ZSTD_VecMask matches = 0;
    if (rowEntries == 16) {
        ZSTD_Vec128 hashes       = ZSTD_Vec128_read(tagRow + ZSTD_ROW_HASH_TAG_OFFSET);
        ZSTD_Vec128 expandedTags = ZSTD_Vec128_set8(tag);
        matches                  = ZSTD_Vec128_cmpMask8(hashes, expandedTags);
    } else if (rowEntries == 32) {
        ZSTD_Vec256 hashes       = ZSTD_Vec256_read(tagRow + ZSTD_ROW_HASH_TAG_OFFSET);
        ZSTD_Vec256 expandedTags = ZSTD_Vec256_set8(tag);
        matches                  = ZSTD_Vec256_cmpMask8(hashes, expandedTags);
    } else {
        assert(0);
    }
    /* Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
       to match up with the actual layout of the entries within the hashTable */
    return ZSTD_VecMask_rotateRight(matches, head, rowEntries);
}

/* The high-level approach of the SIMD row based match finder is as follows:
 * - Figure out where to insert the new entry:
 *      - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag"
 *      - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines
 *        which row to insert into.
 *      - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can
 *        be considered as a circular buffer with a "head" index that resides in the tagTable.
 *      - Also insert the "tag" into the equivalent row and position in the tagTable.
 *          - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry.
 *                  The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively,
 *                  for alignment/performance reasons, leaving some bytes unused.
 * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and
 *   generate a bitfield that we can cycle through to check the collisions in the hash table.
 * - Pick the longest match.
 */
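
/* Worked example (rowLog==4, i.e. 16 entries per row) :
 *   hash = ZSTD_hashPtr(ip, rowHashLog + 8, mls);
 *   row  = (hash >> 8) << 4;   -> selects a 16-entry group in hashTable
 *   tag  = hash & 0xFF;        -> 1-byte fingerprint stored in tagTable
 * A single SIMD compare of the 16 stored tags against `tag` yields a 16-bit
 * mask whose set bits are the only entries worth verifying in hashTable. */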
1265 FORCE_INLINE_TEMPLATE
ZSTD_RowFindBestMatch_generic(ZSTD_matchState_t * ms,const BYTE * const ip,const BYTE * const iLimit,size_t * offsetPtr,const U32 mls,const ZSTD_dictMode_e dictMode,const U32 rowLog)1266 size_t ZSTD_RowFindBestMatch_generic (
1267 ZSTD_matchState_t* ms,
1268 const BYTE* const ip, const BYTE* const iLimit,
1269 size_t* offsetPtr,
1270 const U32 mls, const ZSTD_dictMode_e dictMode,
1271 const U32 rowLog)
1272 {
1273 U32* const hashTable = ms->hashTable;
1274 U16* const tagTable = ms->tagTable;
1275 U32* const hashCache = ms->hashCache;
1276 const U32 hashLog = ms->rowHashLog;
1277 const ZSTD_compressionParameters* const cParams = &ms->cParams;
1278 const BYTE* const base = ms->window.base;
1279 const BYTE* const dictBase = ms->window.dictBase;
1280 const U32 dictLimit = ms->window.dictLimit;
1281 const BYTE* const prefixStart = base + dictLimit;
1282 const BYTE* const dictEnd = dictBase + dictLimit;
1283 const U32 curr = (U32)(ip-base);
1284 const U32 maxDistance = 1U << cParams->windowLog;
1285 const U32 lowestValid = ms->window.lowLimit;
1286 const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
1287 const U32 isDictionary = (ms->loadedDictEnd != 0);
1288 const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
1289 const U32 rowEntries = (1U << rowLog);
1290 const U32 rowMask = rowEntries - 1;
1291 const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
1292 U32 nbAttempts = 1U << cappedSearchLog;
1293 size_t ml=4-1;
1294
1295 /* DMS/DDS variables that may be referenced laster */
    const ZSTD_matchState_t* const dms = ms->dictMatchState;
    size_t ddsIdx;
    U32 ddsExtraAttempts; /* cctx hash tables are limited in searches, but allow extra searches into DDS */
    U32 dmsTag;
    U32* dmsRow;
    BYTE* dmsTagRow;

    if (dictMode == ZSTD_dedicatedDictSearch) {
        const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
        {   /* Prefetch DDS hashtable entry */
            ddsIdx = ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG;
            PREFETCH_L1(&dms->hashTable[ddsIdx]);
        }
        ddsExtraAttempts = cParams->searchLog > rowLog ? 1U << (cParams->searchLog - rowLog) : 0;
    }

    if (dictMode == ZSTD_dictMatchState) {
        /* Prefetch DMS rows */
        U32* const dmsHashTable = dms->hashTable;
        U16* const dmsTagTable = dms->tagTable;
        U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
        U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
        dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;
        dmsTagRow = (BYTE*)(dmsTagTable + dmsRelRow);
        dmsRow = dmsHashTable + dmsRelRow;
        ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog);
    }

    /* Update the hashTable and tagTable up to (but not including) ip */
    ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
    {   /* Get the hash for ip, compute the appropriate row */
        U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls);
        U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
        U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
        U32* const row = hashTable + relRow;
        BYTE* tagRow = (BYTE*)(tagTable + relRow);
        U32 const head = *tagRow & rowMask;
        U32 matchBuffer[32 /* maximum nb entries per row */];
        size_t numMatches = 0;
        size_t currMatch = 0;
        ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries);

        /* Cycle through the matches and prefetch */
        for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
            U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
            U32 const matchIndex = row[matchPos];
            assert(numMatches < rowEntries);
            if (matchIndex < lowLimit)
                break;
            if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
                PREFETCH_L1(base + matchIndex);
            } else {
                PREFETCH_L1(dictBase + matchIndex);
            }
            matchBuffer[numMatches++] = matchIndex;
        }

        /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop
           in ZSTD_row_update_internal() at the next search. */
        {
            U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
            tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag;
            row[pos] = ms->nextToUpdate++;
        }

        /* Return the longest match */
        for (; currMatch < numMatches; ++currMatch) {
            U32 const matchIndex = matchBuffer[currMatch];
            size_t currentMl=0;
            assert(matchIndex < curr);
            assert(matchIndex >= lowLimit);

            if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
                const BYTE* const match = base + matchIndex;
                assert(matchIndex >= dictLimit);   /* ensures this is true if dictMode != ZSTD_extDict */
                if (match[ml] == ip[ml])   /* potentially better */
                    currentMl = ZSTD_count(ip, match, iLimit);
            } else {
                const BYTE* const match = dictBase + matchIndex;
                assert(match+4 <= dictEnd);
                if (MEM_read32(match) == MEM_read32(ip))   /* assumption : matchIndex <= dictLimit-4 (by table construction) */
                    currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
            }

            /* Save best solution */
            if (currentMl > ml) {
                ml = currentMl;
                *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE;
                if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
            }
        }
    }

    if (dictMode == ZSTD_dedicatedDictSearch) {
        ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms,
                                                  ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
    } else if (dictMode == ZSTD_dictMatchState) {
        /* TODO: Measure and potentially add prefetching to DMS */
        const U32 dmsLowestIndex  = dms->window.dictLimit;
        const BYTE* const dmsBase = dms->window.base;
        const BYTE* const dmsEnd  = dms->window.nextSrc;
        const U32 dmsSize         = (U32)(dmsEnd - dmsBase);
        const U32 dmsIndexDelta   = dictLimit - dmsSize;

        {   U32 const head = *dmsTagRow & rowMask;
            U32 matchBuffer[32 /* maximum nb row entries */];
            size_t numMatches = 0;
            size_t currMatch = 0;
            ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries);

            for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
                U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
                U32 const matchIndex = dmsRow[matchPos];
                if (matchIndex < dmsLowestIndex)
                    break;
                PREFETCH_L1(dmsBase + matchIndex);
                matchBuffer[numMatches++] = matchIndex;
            }

            /* Return the longest match */
            for (; currMatch < numMatches; ++currMatch) {
                U32 const matchIndex = matchBuffer[currMatch];
                size_t currentMl=0;
                assert(matchIndex >= dmsLowestIndex);
                assert(matchIndex < curr);

                {   const BYTE* const match = dmsBase + matchIndex;
                    assert(match+4 <= dmsEnd);
                    if (MEM_read32(match) == MEM_read32(ip))
                        currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;
                }

                if (currentMl > ml) {
                    ml = currentMl;
                    *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
                    if (ip+currentMl == iLimit) break;
                }
            }
        }
    }
    return ml;
}

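/* Illustrative sketch (not part of the build) : a simplified stand-in for the
 * SIMD tag-matching step described above ZSTD_RowFindBestMatch_generic(). One
 * SSE2 comparison checks a row of 16 one-byte tags against the current "short
 * hash", and the resulting bitfield is walked with the mask &= (mask-1) idiom,
 * as in the candidate loop above. The real ZSTD_row_getMatchMask() additionally
 * rotates the mask by the row's "head" position and supports 32-entry rows;
 * the names below are hypothetical, and SSE2 plus a GCC-style compiler is assumed. */
#if 0
#include <emmintrin.h>   /* SSE2 intrinsics */
#include <stdio.h>

/* bit i of the result is set when tags[i] == tag */
static unsigned example_getMatchMask16(const unsigned char* tags, unsigned char tag)
{
    const __m128i rowTags = _mm_loadu_si128((const __m128i*)tags);
    const __m128i splat   = _mm_set1_epi8((char)tag);
    const __m128i eq      = _mm_cmpeq_epi8(rowTags, splat);
    return (unsigned)_mm_movemask_epi8(eq);   /* 16-bit mask, one bit per entry */
}

int main(void)
{
    unsigned char tags[16] = { 7, 3, 7, 0, 9, 7, 1, 2, 7, 4, 5, 6, 8, 7, 0, 7 };
    unsigned mask = example_getMatchMask16(tags, 7);
    for (; mask != 0; mask &= (mask - 1)) {   /* clear lowest set bit each pass */
        unsigned const pos = (unsigned)__builtin_ctz(mask);   /* like ZSTD_VecMask_next() */
        printf("candidate at row position %u\n", pos);
    }
    return 0;
}
#endif
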
/* Inlining is important to hardwire a hot branch (template emulation) */
FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_selectMLS (
                        ZSTD_matchState_t* ms,
                        const BYTE* ip, const BYTE* const iLimit,
                        const ZSTD_dictMode_e dictMode, size_t* offsetPtr, const U32 rowLog)
{
    switch(ms->cParams.minMatch)
    {
    default : /* includes case 3 */
    case 4 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, dictMode, rowLog);
    case 5 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, dictMode, rowLog);
    case 7 :
    case 6 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, dictMode, rowLog);
    }
}

FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_selectRowLog (
                        ZSTD_matchState_t* ms,
                        const BYTE* ip, const BYTE* const iLimit,
                        size_t* offsetPtr)
{
    const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
    switch(cappedSearchLog)
    {
    default :
    case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_noDict, offsetPtr, 4);
    case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_noDict, offsetPtr, 5);
    }
}

FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_dictMatchState_selectRowLog(
                        ZSTD_matchState_t* ms,
                        const BYTE* ip, const BYTE* const iLimit,
                        size_t* offsetPtr)
{
    const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
    switch(cappedSearchLog)
    {
    default :
    case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dictMatchState, offsetPtr, 4);
    case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dictMatchState, offsetPtr, 5);
    }
}

FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_dedicatedDictSearch_selectRowLog(
                        ZSTD_matchState_t* ms,
                        const BYTE* ip, const BYTE* const iLimit,
                        size_t* offsetPtr)
{
    const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
    switch(cappedSearchLog)
    {
    default :
    case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dedicatedDictSearch, offsetPtr, 4);
    case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dedicatedDictSearch, offsetPtr, 5);
    }
}

FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_extDict_selectRowLog (
                        ZSTD_matchState_t* ms,
                        const BYTE* ip, const BYTE* const iLimit,
                        size_t* offsetPtr)
{
    const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
    switch(cappedSearchLog)
    {
    default :
    case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_extDict, offsetPtr, 4);
    case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_extDict, offsetPtr, 5);
    }
}

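/* Illustrative sketch (not part of the build) : the "template emulation" used
 * by the select* wrappers above. A generic worker receives the variant as a
 * parameter; each thin wrapper passes a compile-time constant, and forced
 * inlining lets the compiler specialize the worker per call site and delete
 * the dead branches, instead of testing the variant at run time. All names
 * below are hypothetical; a GCC-style compiler is assumed. */
#if 0
#include <stdio.h>

typedef enum { variant_A = 0, variant_B = 1 } variant_e;

/* generic worker : `v` is a constant at every call site */
static inline __attribute__((always_inline))
int worker_generic(int x, const variant_e v)
{
    if (v == variant_A) return x * 2;   /* only code kept in the A instantiation */
    return x + 1;                       /* only code kept in the B instantiation */
}

/* one outlined entry point per variant, like the _selectRowLog wrappers */
static int worker_A(int x) { return worker_generic(x, variant_A); }
static int worker_B(int x) { return worker_generic(x, variant_B); }

int main(void)
{
    printf("%d %d\n", worker_A(10), worker_B(10));   /* prints: 20 11 */
    return 0;
}
#endif
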

/* *******************************
*  Common parser - lazy strategy
*********************************/
typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;

FORCE_INLINE_TEMPLATE size_t
ZSTD_compressBlock_lazy_generic(
                        ZSTD_matchState_t* ms, seqStore_t* seqStore,
                        U32 rep[ZSTD_REP_NUM],
                        const void* src, size_t srcSize,
                        const searchMethod_e searchMethod, const U32 depth,
                        ZSTD_dictMode_e const dictMode)
{
    const BYTE* const istart = (const BYTE*)src;
    const BYTE* ip = istart;
    const BYTE* anchor = istart;
    const BYTE* const iend = istart + srcSize;
    const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
    const BYTE* const base = ms->window.base;
    const U32 prefixLowestIndex = ms->window.dictLimit;
    const BYTE* const prefixLowest = base + prefixLowestIndex;
    const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;

    typedef size_t (*searchMax_f)(
                        ZSTD_matchState_t* ms,
                        const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);

    /**
     * This table is indexed first by the four ZSTD_dictMode_e values, and then
     * by the three searchMethod_e values. NULLs are placed for configurations
     * that should never occur (extDict modes go to the other implementation
     * below, and there is no DDSS for binary tree search yet).
     */
    const searchMax_f searchFuncs[4][3] = {
        {   /* ZSTD_noDict */
            ZSTD_HcFindBestMatch_selectMLS,
            ZSTD_BtFindBestMatch_selectMLS,
            ZSTD_RowFindBestMatch_selectRowLog
        },
        {   /* ZSTD_extDict : handled by ZSTD_compressBlock_lazy_extDict_generic */
            NULL,
            NULL,
            NULL
        },
        {   /* ZSTD_dictMatchState */
            ZSTD_HcFindBestMatch_dictMatchState_selectMLS,
            ZSTD_BtFindBestMatch_dictMatchState_selectMLS,
            ZSTD_RowFindBestMatch_dictMatchState_selectRowLog
        },
        {   /* ZSTD_dedicatedDictSearch : no binary-tree variant yet */
            ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS,
            NULL,
            ZSTD_RowFindBestMatch_dedicatedDictSearch_selectRowLog
        }
    };

    searchMax_f const searchMax = searchFuncs[dictMode][(int)searchMethod];
    U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;

    const int isDMS = dictMode == ZSTD_dictMatchState;
    const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
    const int isDxS = isDMS || isDDS;
    const ZSTD_matchState_t* const dms = ms->dictMatchState;
    const U32 dictLowestIndex = isDxS ? dms->window.dictLimit : 0;
    const BYTE* const dictBase = isDxS ? dms->window.base : NULL;
    const BYTE* const dictLowest = isDxS ? dictBase + dictLowestIndex : NULL;
    const BYTE* const dictEnd = isDxS ? dms->window.nextSrc : NULL;
    const U32 dictIndexDelta = isDxS ?
                               prefixLowestIndex - (U32)(dictEnd - dictBase) :
                               0;
    const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));

    assert(searchMax != NULL);

    DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
    ip += (dictAndPrefixLength == 0);
    if (dictMode == ZSTD_noDict) {
        U32 const curr = (U32)(ip - base);
        U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog);
        U32 const maxRep = curr - windowLow;
        if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
        if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
    }
    if (isDxS) {
        /* dictMatchState repCode checks don't currently handle repCode == 0
         * disabling. */
        assert(offset_1 <= dictAndPrefixLength);
        assert(offset_2 <= dictAndPrefixLength);
    }

    if (searchMethod == search_rowHash) {
        ZSTD_row_fillHashCache(ms, base, rowLog,
                               MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
                               ms->nextToUpdate, ilimit);
    }

    /* Match Loop */
#if defined(__GNUC__) && defined(__x86_64__)
    /* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
     * code alignment is perturbed. To fix the instability, align the loop on a
     * 32-byte boundary.
     */
    __asm__(".p2align 5");
#endif
    while (ip < ilimit) {
        size_t matchLength=0;
        size_t offset=0;
        const BYTE* start=ip+1;

        /* check repCode */
        if (isDxS) {
            const U32 repIndex = (U32)(ip - base) + 1 - offset_1;
            const BYTE* repMatch = ((dictMode == ZSTD_dictMatchState || dictMode == ZSTD_dedicatedDictSearch)
                                && repIndex < prefixLowestIndex) ?
                                   dictBase + (repIndex - dictIndexDelta) :
                                   base + repIndex;
            if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
                && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
                const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
                matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
                if (depth==0) goto _storeSequence;
            }
        }
        if ( dictMode == ZSTD_noDict
          && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) {
            matchLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
            if (depth==0) goto _storeSequence;
        }

        /* first search (depth 0) */
        {   size_t offsetFound = 999999999;
            size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
            if (ml2 > matchLength)
                matchLength = ml2, start = ip, offset=offsetFound;
        }

        if (matchLength < 4) {
            ip += ((ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */
            continue;
        }

        /* let's try to find a better solution */
        if (depth>=1)
        while (ip<ilimit) {
            ip ++;
            if ( (dictMode == ZSTD_noDict)
              && (offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
                size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
                int const gain2 = (int)(mlRep * 3);
                int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
                if ((mlRep >= 4) && (gain2 > gain1))
                    matchLength = mlRep, offset = 0, start = ip;
            }
            if (isDxS) {
                const U32 repIndex = (U32)(ip - base) - offset_1;
                const BYTE* repMatch = repIndex < prefixLowestIndex ?
                               dictBase + (repIndex - dictIndexDelta) :
                               base + repIndex;
                if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
                    && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
                    const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
                    size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
                    int const gain2 = (int)(mlRep * 3);
                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
                    if ((mlRep >= 4) && (gain2 > gain1))
                        matchLength = mlRep, offset = 0, start = ip;
                }
            }
            {   size_t offset2=999999999;
                size_t const ml2 = searchMax(ms, ip, iend, &offset2);
                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
                if ((ml2 >= 4) && (gain2 > gain1)) {
                    matchLength = ml2, offset = offset2, start = ip;
                    continue;   /* search a better one */
            }   }

            /* let's find an even better one */
            if ((depth==2) && (ip<ilimit)) {
                ip ++;
                if ( (dictMode == ZSTD_noDict)
                  && (offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
                    size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
                    int const gain2 = (int)(mlRep * 4);
                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
                    if ((mlRep >= 4) && (gain2 > gain1))
                        matchLength = mlRep, offset = 0, start = ip;
                }
                if (isDxS) {
                    const U32 repIndex = (U32)(ip - base) - offset_1;
                    const BYTE* repMatch = repIndex < prefixLowestIndex ?
                                   dictBase + (repIndex - dictIndexDelta) :
                                   base + repIndex;
                    if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
                        && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
                        const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
                        size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
                        int const gain2 = (int)(mlRep * 4);
                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
                        if ((mlRep >= 4) && (gain2 > gain1))
                            matchLength = mlRep, offset = 0, start = ip;
                    }
                }
                {   size_t offset2=999999999;
                    size_t const ml2 = searchMax(ms, ip, iend, &offset2);
                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7);
                    if ((ml2 >= 4) && (gain2 > gain1)) {
                        matchLength = ml2, offset = offset2, start = ip;
                        continue;
            }   }   }
            break;  /* nothing found : store previous solution */
        }

        /* NOTE:
         * start[-offset+ZSTD_REP_MOVE-1] is undefined behavior.
         * (-offset+ZSTD_REP_MOVE-1) is unsigned, and adding it to start
         * overflows the pointer, which is undefined behavior.
         */
        /* catch up */
        if (offset) {
            if (dictMode == ZSTD_noDict) {
                while ( ((start > anchor) & (start - (offset-ZSTD_REP_MOVE) > prefixLowest))
                     && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) )  /* only search for offset within prefix */
                    { start--; matchLength++; }
            }
            if (isDxS) {
                U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
                const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
                const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
                while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; }  /* catch up */
            }
            offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE);
        }
        /* store sequence */
_storeSequence:
        {   size_t const litLength = start - anchor;
            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
            anchor = ip = start + matchLength;
        }

        /* check immediate repcode */
        if (isDxS) {
            while (ip <= ilimit) {
                U32 const current2 = (U32)(ip-base);
                U32 const repIndex = current2 - offset_2;
                const BYTE* repMatch = repIndex < prefixLowestIndex ?
                        dictBase - dictIndexDelta + repIndex :
                        base + repIndex;
                if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex) >= 3 /* intentional overflow */)
                   && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
                    const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
                    matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4;
                    offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset;   /* swap offset_2 <=> offset_1 */
                    ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
                    ip += matchLength;
                    anchor = ip;
                    continue;
                }
                break;
            }
        }

        if (dictMode == ZSTD_noDict) {
            while ( ((ip <= ilimit) & (offset_2>0))
                 && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
                /* store sequence */
                matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
                offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset;   /* swap repcodes */
                ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
                ip += matchLength;
                anchor = ip;
                continue;   /* faster when present ... (?) */
    }   }   }

    /* Save reps for next block */
    rep[0] = offset_1 ? offset_1 : savedOffset;
    rep[1] = offset_2 ? offset_2 : savedOffset;

    /* Return the last literals size */
    return (size_t)(iend - anchor);
}

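/* Illustrative sketch (not part of the build) : the gain arithmetic used in the
 * "try to find a better solution" loops of ZSTD_compressBlock_lazy_generic().
 * A match scores roughly 4 points per byte (3 in the depth-1 repcode checks)
 * minus log2(offset+1) points for its offset, and the incumbent match gets a
 * small constant bias (+1, +4 or +7), so a later match must be meaningfully
 * longer or closer to win. ZSTD_highbit32() is modelled with __builtin_clz();
 * the names below are hypothetical. */
#if 0
#include <stdio.h>

/* stand-in for ZSTD_highbit32() : position of the highest set bit, v > 0 */
static unsigned highbit32(unsigned v) { return 31u - (unsigned)__builtin_clz(v); }

/* score of a match of length ml at encoded offset `offset` (0 = repcode) */
static int gain(size_t ml, unsigned offset, int bias)
{
    return (int)(ml * 4 - highbit32(offset + 1)) + bias;
}

int main(void)
{
    /* incumbent: length 8 at offset 1000 ; candidate: length 10 at offset 65000 */
    int const gain1 = gain(8, 1000, 4);     /* incumbent gets the +4 bias */
    int const gain2 = gain(10, 65000, 0);
    printf("incumbent=%d candidate=%d -> %s\n",
           gain1, gain2, (gain2 > gain1) ? "switch" : "keep");   /* prints: keep */
    return 0;
}
#endif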

size_t ZSTD_compressBlock_btlazy2(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict);
}

size_t ZSTD_compressBlock_lazy2(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict);
}

size_t ZSTD_compressBlock_lazy(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict);
}

size_t ZSTD_compressBlock_greedy(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict);
}

size_t ZSTD_compressBlock_btlazy2_dictMatchState(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState);
}

size_t ZSTD_compressBlock_lazy2_dictMatchState(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState);
}

size_t ZSTD_compressBlock_lazy_dictMatchState(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState);
}

size_t ZSTD_compressBlock_greedy_dictMatchState(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState);
}


size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch);
}

size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch);
}

size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch);
}

/* Row-based matchfinder */
size_t ZSTD_compressBlock_lazy2_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict);
}

size_t ZSTD_compressBlock_lazy_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict);
}

size_t ZSTD_compressBlock_greedy_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict);
}

size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState);
}

size_t ZSTD_compressBlock_lazy_dictMatchState_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState);
}

size_t ZSTD_compressBlock_greedy_dictMatchState_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState);
}


size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch);
}

size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch);
}

size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch);
}

FORCE_INLINE_TEMPLATE
size_t ZSTD_compressBlock_lazy_extDict_generic(
                        ZSTD_matchState_t* ms, seqStore_t* seqStore,
                        U32 rep[ZSTD_REP_NUM],
                        const void* src, size_t srcSize,
                        const searchMethod_e searchMethod, const U32 depth)
{
    const BYTE* const istart = (const BYTE*)src;
    const BYTE* ip = istart;
    const BYTE* anchor = istart;
    const BYTE* const iend = istart + srcSize;
    const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
    const BYTE* const base = ms->window.base;
    const U32 dictLimit = ms->window.dictLimit;
    const BYTE* const prefixStart = base + dictLimit;
    const BYTE* const dictBase = ms->window.dictBase;
    const BYTE* const dictEnd = dictBase + dictLimit;
    const BYTE* const dictStart = dictBase + ms->window.lowLimit;
    const U32 windowLog = ms->cParams.windowLog;
    const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;

    typedef size_t (*searchMax_f)(
                        ZSTD_matchState_t* ms,
                        const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
    const searchMax_f searchFuncs[3] = {
        ZSTD_HcFindBestMatch_extDict_selectMLS,
        ZSTD_BtFindBestMatch_extDict_selectMLS,
        ZSTD_RowFindBestMatch_extDict_selectRowLog
    };
    searchMax_f searchMax = searchFuncs[(int)searchMethod];
    U32 offset_1 = rep[0], offset_2 = rep[1];

    DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);

    /* init */
    ip += (ip == prefixStart);
    if (searchMethod == search_rowHash) {
        ZSTD_row_fillHashCache(ms, base, rowLog,
                               MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
                               ms->nextToUpdate, ilimit);
    }

    /* Match Loop */
#if defined(__GNUC__) && defined(__x86_64__)
    /* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
     * code alignment is perturbed. To fix the instability, align the loop on a
     * 32-byte boundary.
     */
    __asm__(".p2align 5");
#endif
    while (ip < ilimit) {
        size_t matchLength=0;
        size_t offset=0;
        const BYTE* start=ip+1;
        U32 curr = (U32)(ip-base);

        /* check repCode */
        {   const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr+1, windowLog);
            const U32 repIndex = (U32)(curr+1 - offset_1);
            const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
            const BYTE* const repMatch = repBase + repIndex;
            if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */
               & (offset_1 < curr+1 - windowLow) ) /* note: we are searching at curr+1 */
            if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
                /* repcode detected, we should take it */
                const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                if (depth==0) goto _storeSequence;
        }   }

        /* first search (depth 0) */
        {   size_t offsetFound = 999999999;
            size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
            if (ml2 > matchLength)
                matchLength = ml2, start = ip, offset=offsetFound;
        }

        if (matchLength < 4) {
            ip += ((ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */
            continue;
        }

        /* let's try to find a better solution */
        if (depth>=1)
        while (ip<ilimit) {
            ip ++;
            curr++;
            /* check repCode */
            if (offset) {
                const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
                const U32 repIndex = (U32)(curr - offset_1);
                const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
                const BYTE* const repMatch = repBase + repIndex;
                if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
                   & (offset_1 < curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
                if (MEM_read32(ip) == MEM_read32(repMatch)) {
                    /* repcode detected */
                    const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                    size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                    int const gain2 = (int)(repLength * 3);
                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
                    if ((repLength >= 4) && (gain2 > gain1))
                        matchLength = repLength, offset = 0, start = ip;
            }   }

            /* search match, depth 1 */
            {   size_t offset2=999999999;
                size_t const ml2 = searchMax(ms, ip, iend, &offset2);
                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
                if ((ml2 >= 4) && (gain2 > gain1)) {
                    matchLength = ml2, offset = offset2, start = ip;
                    continue;   /* search a better one */
            }   }

            /* let's find an even better one */
            if ((depth==2) && (ip<ilimit)) {
                ip ++;
                curr++;
                /* check repCode */
                if (offset) {
                    const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
                    const U32 repIndex = (U32)(curr - offset_1);
                    const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
                    const BYTE* const repMatch = repBase + repIndex;
                    if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
                       & (offset_1 < curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
                    if (MEM_read32(ip) == MEM_read32(repMatch)) {
                        /* repcode detected */
                        const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                        size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                        int const gain2 = (int)(repLength * 4);
                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
                        if ((repLength >= 4) && (gain2 > gain1))
                            matchLength = repLength, offset = 0, start = ip;
                }   }

                /* search match, depth 2 */
                {   size_t offset2=999999999;
                    size_t const ml2 = searchMax(ms, ip, iend, &offset2);
                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7);
                    if ((ml2 >= 4) && (gain2 > gain1)) {
                        matchLength = ml2, offset = offset2, start = ip;
                        continue;
            }   }   }
            break;  /* nothing found : store previous solution */
        }

        /* catch up */
        if (offset) {
            U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
            const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
            const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
            while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; }  /* catch up */
            offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE);
        }

        /* store sequence */
_storeSequence:
        {   size_t const litLength = start - anchor;
            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
            anchor = ip = start + matchLength;
        }

        /* check immediate repcode */
        while (ip <= ilimit) {
            const U32 repCurrent = (U32)(ip-base);
            const U32 windowLow = ZSTD_getLowestMatchIndex(ms, repCurrent, windowLog);
            const U32 repIndex = repCurrent - offset_2;
            const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
            const BYTE* const repMatch = repBase + repIndex;
            if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
               & (offset_2 < repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
            if (MEM_read32(ip) == MEM_read32(repMatch)) {
                /* repcode detected, we should take it */
                const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset;   /* swap offset history */
                ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
                ip += matchLength;
                anchor = ip;
                continue;   /* faster when present ... (?) */
            }
            break;
    }   }

    /* Save reps for next block */
    rep[0] = offset_1;
    rep[1] = offset_2;

    /* Return the last literals size */
    return (size_t)(iend - anchor);
}

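/* Illustrative sketch (not part of the build) : the "intentional overflow" test
 * ((U32)((dictLimit-1) - repIndex) >= 3) used by the repcode checks above. One
 * unsigned comparison rejects exactly repIndex values dictLimit-3, dictLimit-2
 * and dictLimit-1, whose 4-byte read would straddle the gap between the extDict
 * segment and the prefix; for repIndex >= dictLimit the subtraction wraps to a
 * huge value, so in-prefix positions still pass. */
#if 0
#include <stdio.h>

int main(void)
{
    unsigned const dictLimit = 100;
    unsigned repIndex;
    for (repIndex = 95; repIndex <= 102; repIndex++) {
        int const testable = ((unsigned)((dictLimit-1) - repIndex) >= 3);
        printf("repIndex=%3u : %s\n", repIndex,
               testable ? "testable" : "skipped (straddles segment boundary)");
    }
    /* prints: 95..96 testable ; 97..99 skipped ; 100..102 testable (wrap-around) */
    return 0;
}
#endif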

size_t ZSTD_compressBlock_greedy_extDict(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0);
}

size_t ZSTD_compressBlock_lazy_extDict(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1);
}

size_t ZSTD_compressBlock_lazy2_extDict(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2);
}

size_t ZSTD_compressBlock_btlazy2_extDict(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2);
}

size_t ZSTD_compressBlock_greedy_extDict_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0);
}

size_t ZSTD_compressBlock_lazy_extDict_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1);
}

size_t ZSTD_compressBlock_lazy2_extDict_row(
        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
}