1 /* LzFind.c -- Match finder for LZ algorithms
2 2021-11-29 : Igor Pavlov : Public domain */
3 
4 #include "Precomp.h"
5 
6 #include <string.h>
7 // #include <stdio.h>
8 
9 #include "CpuArch.h"
10 #include "LzFind.h"
11 #include "LzHash.h"
12 
13 #define kBlockMoveAlign       (1 << 7)    // alignment for memmove()
14 #define kBlockSizeAlign       (1 << 16)   // alignment for block allocation
#define kBlockSizeReserveMin  (1 << 24)   // it's 1/256 of a 4 GB dictionary
16 
17 #define kEmptyHashValue 0
18 
19 #define kMaxValForNormalize ((UInt32)0)
20 // #define kMaxValForNormalize ((UInt32)(1 << 20) + 0xFFF) // for debug
21 
22 // #define kNormalizeAlign (1 << 7) // alignment for speculated accesses
23 
24 #define GET_AVAIL_BYTES(p) \
25   Inline_MatchFinder_GetNumAvailableBytes(p)
26 
27 
28 // #define kFix5HashSize (kHash2Size + kHash3Size + kHash4Size)
29 #define kFix5HashSize kFix4HashSize
30 
31 /*
32  HASH2_CALC:
33    if (hv) match, then cur[0] and cur[1] also match
34 */
35 #define HASH2_CALC hv = GetUi16(cur);
36 
37 // (crc[0 ... 255] & 0xFF) provides one-to-one correspondence to [0 ... 255]
38 
39 /*
40  HASH3_CALC:
41    if (cur[0]) and (h2) match, then cur[1]            also match
42    if (cur[0]) and (hv) match, then cur[1] and cur[2] also match
43 */
44 #define HASH3_CALC { \
45   UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
46   h2 = temp & (kHash2Size - 1); \
47   hv = (temp ^ ((UInt32)cur[2] << 8)) & p->hashMask; }
48 
49 #define HASH4_CALC { \
50   UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
51   h2 = temp & (kHash2Size - 1); \
52   temp ^= ((UInt32)cur[2] << 8); \
53   h3 = temp & (kHash3Size - 1); \
54   hv = (temp ^ (p->crc[cur[3]] << kLzHash_CrcShift_1)) & p->hashMask; }
55 
56 #define HASH5_CALC { \
57   UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
58   h2 = temp & (kHash2Size - 1); \
59   temp ^= ((UInt32)cur[2] << 8); \
60   h3 = temp & (kHash3Size - 1); \
61   temp ^= (p->crc[cur[3]] << kLzHash_CrcShift_1); \
62   /* h4 = temp & p->hash4Mask; */ /* (kHash4Size - 1); */ \
63   hv = (temp ^ (p->crc[cur[4]] << kLzHash_CrcShift_2)) & p->hashMask; }
64 
65 #define HASH_ZIP_CALC hv = ((cur[2] | ((UInt32)cur[0] << 8)) ^ p->crc[cur[1]]) & 0xFFFF;
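
/*
   Explanatory note on the "also match" guarantees above:
   HASH2_CALC sets hv = GetUi16(cur) = cur[0] | ((UInt32)cur[1] << 8),
   so equal (hv) values imply equal (cur[0], cur[1]).
   For HASH3_CALC, h2 = (crc[cur[0]] ^ cur[1]) & (kHash2Size - 1).
   If the cur[0] values match, the two masked values can differ only through cur[1],
   which is a byte value (< 256); since kHash2Size (LzHash.h) is a power of two >= 256,
   equal (h2) then implies equal cur[1].
   For the 3-byte (hv): cur[2] is mixed in as ((UInt32)cur[2] << 8), above the bits
   that cur[1] can affect, and (hashMask >= 0xFFFF) is guaranteed by
   MatchFinder_Create(), so equal (hv) with equal cur[0] implies equal cur[1]
   and cur[2] as well.
*/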
66 
67 
static void LzInWindow_Free(CMatchFinder *p, ISzAllocPtr alloc)
69 {
70   if (!p->directInput)
71   {
72     ISzAlloc_Free(alloc, p->bufferBase);
73     p->bufferBase = NULL;
74   }
75 }
76 
77 
static int LzInWindow_Create2(CMatchFinder *p, UInt32 blockSize, ISzAllocPtr alloc)
79 {
80   if (blockSize == 0)
81     return 0;
82   if (!p->bufferBase || p->blockSize != blockSize)
83   {
84     // size_t blockSizeT;
85     LzInWindow_Free(p, alloc);
86     p->blockSize = blockSize;
87     // blockSizeT = blockSize;
88 
89     // printf("\nblockSize = 0x%x\n", blockSize);
90     /*
91     #if defined _WIN64
92     // we can allocate 4GiB, but still use UInt32 for (p->blockSize)
93     // we use UInt32 type for (p->blockSize), because
94     // we don't want to wrap over 4 GiB,
95     // when we use (p->streamPos - p->pos) that is UInt32.
96     if (blockSize >= (UInt32)0 - (UInt32)kBlockSizeAlign)
97     {
98       blockSizeT = ((size_t)1 << 32);
99       printf("\nchanged to blockSizeT = 4GiB\n");
100     }
101     #endif
102     */
103 
104     p->bufferBase = (Byte *)ISzAlloc_Alloc(alloc, blockSize);
105     // printf("\nbufferBase = %p\n", p->bufferBase);
106     // return 0; // for debug
107   }
108   return (p->bufferBase != NULL);
109 }
110 
static const Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p) { return p->buffer; }
112 
static UInt32 MatchFinder_GetNumAvailableBytes(CMatchFinder *p) { return GET_AVAIL_BYTES(p); }
114 
115 
116 MY_NO_INLINE
static void MatchFinder_ReadBlock(CMatchFinder *p)
118 {
119   if (p->streamEndWasReached || p->result != SZ_OK)
120     return;
121 
122   /* We use (p->streamPos - p->pos) value.
123      (p->streamPos < p->pos) is allowed. */
124 
125   if (p->directInput)
126   {
127     UInt32 curSize = 0xFFFFFFFF - GET_AVAIL_BYTES(p);
128     if (curSize > p->directInputRem)
129       curSize = (UInt32)p->directInputRem;
130     p->directInputRem -= curSize;
131     p->streamPos += curSize;
132     if (p->directInputRem == 0)
133       p->streamEndWasReached = 1;
134     return;
135   }
136 
137   for (;;)
138   {
139     Byte *dest = p->buffer + GET_AVAIL_BYTES(p);
140     size_t size = (size_t)(p->bufferBase + p->blockSize - dest);
141     if (size == 0)
142     {
      /* We call ReadBlock() after NeedMove() and MoveBlock().
         NeedMove() and MoveBlock() provide more than (keepSizeAfter)
         bytes up to the end of (blockSize).
         So this branch is not executed in the normal code flow.
         We can only get here if ReadBlock() is called before NeedMove() / MoveBlock().
      */
149       // p->result = SZ_ERROR_FAIL; // we can show error here
150       return;
151     }
152 
153     // #define kRead 3
154     // if (size > kRead) size = kRead; // for debug
155 
156     p->result = ISeqInStream_Read(p->stream, dest, &size);
157     if (p->result != SZ_OK)
158       return;
159     if (size == 0)
160     {
161       p->streamEndWasReached = 1;
162       return;
163     }
164     p->streamPos += (UInt32)size;
165     if (GET_AVAIL_BYTES(p) > p->keepSizeAfter)
166       return;
    /* Here and in the other (p->keepSizeAfter) checks we keep 1 byte more than was requested by the Create() function;
         (GET_AVAIL_BYTES(p) >= p->keepSizeAfter) is the minimal required size. */
169   }
170 
171   // on exit: (p->result != SZ_OK || p->streamEndWasReached || GET_AVAIL_BYTES(p) > p->keepSizeAfter)
172 }
173 
174 
175 
176 MY_NO_INLINE
void MatchFinder_MoveBlock(CMatchFinder *p)
178 {
179   const size_t offset = (size_t)(p->buffer - p->bufferBase) - p->keepSizeBefore;
180   const size_t keepBefore = (offset & (kBlockMoveAlign - 1)) + p->keepSizeBefore;
181   p->buffer = p->bufferBase + keepBefore;
182   memmove(p->bufferBase,
183       p->bufferBase + (offset & ~((size_t)kBlockMoveAlign - 1)),
184       keepBefore + (size_t)GET_AVAIL_BYTES(p));
185 }
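
/*
   Illustrative example (numbers chosen only for illustration):
   with kBlockMoveAlign = 128, keepSizeBefore = 1000 and
   (p->buffer - p->bufferBase) = 5000:
     offset     = 5000 - 1000 = 4000
     keepBefore = (4000 & 127) + 1000 = 32 + 1000 = 1032
   memmove() copies from (bufferBase + 3968) -- a 128-byte aligned source --
   to bufferBase, moving (1032 + avail) bytes, and p->buffer becomes
   (bufferBase + 1032), so the byte that was at (bufferBase + 5000) is again
   at the current position, with at least keepSizeBefore bytes of history
   before it.
*/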
186 
/* We call MoveBlock() before ReadBlock().
   So MoveBlock() can be a wasteful operation, if the whole input data
   could fit in the current block even without calling MoveBlock().
   In the important case where (dataSize <= historySize),
     the condition (p->blockSize > dataSize + p->keepSizeAfter) is met,
     so there is no MoveBlock() in that case.
*/
194 
int MatchFinder_NeedMove(CMatchFinder *p)
196 {
197   if (p->directInput)
198     return 0;
199   if (p->streamEndWasReached || p->result != SZ_OK)
200     return 0;
201   return ((size_t)(p->bufferBase + p->blockSize - p->buffer) <= p->keepSizeAfter);
202 }
203 
void MatchFinder_ReadIfRequired(CMatchFinder *p)
205 {
206   if (p->keepSizeAfter >= GET_AVAIL_BYTES(p))
207     MatchFinder_ReadBlock(p);
208 }
209 
210 
211 
static void MatchFinder_SetDefaultSettings(CMatchFinder *p)
213 {
214   p->cutValue = 32;
215   p->btMode = 1;
216   p->numHashBytes = 4;
217   p->bigHash = 0;
218 }
219 
220 #define kCrcPoly 0xEDB88320
221 
void MatchFinder_Construct(CMatchFinder *p)
223 {
224   unsigned i;
225   p->bufferBase = NULL;
226   p->directInput = 0;
227   p->hash = NULL;
228   p->expectedDataSize = (UInt64)(Int64)-1;
229   MatchFinder_SetDefaultSettings(p);
230 
231   for (i = 0; i < 256; i++)
232   {
233     UInt32 r = (UInt32)i;
234     unsigned j;
235     for (j = 0; j < 8; j++)
236       r = (r >> 1) ^ (kCrcPoly & ((UInt32)0 - (r & 1)));
237     p->crc[i] = r;
238   }
239 }
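
/*
   Note: the loop above builds the standard 256-entry lookup table for the
   reflected CRC-32 polynomial 0xEDB88320 (the same table used by table-driven
   CRC-32 implementations). It is not used for checksumming here; the
   HASH*_CALC macros use it only as a byte mixer, relying on the property
   noted near HASH3_CALC that (crc[i] & 0xFF) is a permutation of [0 ... 255].
*/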
240 
static void MatchFinder_FreeThisClassMemory(CMatchFinder *p, ISzAllocPtr alloc)
242 {
243   ISzAlloc_Free(alloc, p->hash);
244   p->hash = NULL;
245 }
246 
void MatchFinder_Free(CMatchFinder *p, ISzAllocPtr alloc)
248 {
249   MatchFinder_FreeThisClassMemory(p, alloc);
250   LzInWindow_Free(p, alloc);
251 }
252 
static CLzRef* AllocRefs(size_t num, ISzAllocPtr alloc)
254 {
255   size_t sizeInBytes = (size_t)num * sizeof(CLzRef);
256   if (sizeInBytes / sizeof(CLzRef) != num)
257     return NULL;
258   return (CLzRef *)ISzAlloc_Alloc(alloc, sizeInBytes);
259 }
260 
261 #if (kBlockSizeReserveMin < kBlockSizeAlign * 2)
262   #error Stop_Compiling_Bad_Reserve
263 #endif
264 
265 
266 
static UInt32 GetBlockSize(CMatchFinder *p, UInt32 historySize)
268 {
269   UInt32 blockSize = (p->keepSizeBefore + p->keepSizeAfter);
270   /*
271   if (historySize > kMaxHistorySize)
272     return 0;
273   */
274   // printf("\nhistorySize == 0x%x\n", historySize);
275 
276   if (p->keepSizeBefore < historySize || blockSize < p->keepSizeBefore)  // if 32-bit overflow
277     return 0;
278 
279   {
280     const UInt32 kBlockSizeMax = (UInt32)0 - (UInt32)kBlockSizeAlign;
281     const UInt32 rem = kBlockSizeMax - blockSize;
282     const UInt32 reserve = (blockSize >> (blockSize < ((UInt32)1 << 30) ? 1 : 2))
283         + (1 << 12) + kBlockMoveAlign + kBlockSizeAlign; // do not overflow 32-bit here
284     if (blockSize >= kBlockSizeMax
285         || rem < kBlockSizeReserveMin) // we reject settings that will be slow
286       return 0;
287     if (reserve >= rem)
288       blockSize = kBlockSizeMax;
289     else
290     {
291       blockSize += reserve;
292       blockSize &= ~(UInt32)(kBlockSizeAlign - 1);
293     }
294   }
295   // printf("\n LzFind_blockSize = %x\n", blockSize);
296   // printf("\n LzFind_blockSize = %d\n", blockSize >> 20);
297   return blockSize;
298 }
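
/*
   Worked example (illustrative numbers): if keepSizeBefore + keepSizeAfter
   = 0x110000 (about 1.06 MiB), then blockSize < (1 << 30), so
     reserve = (0x110000 >> 1) + (1 << 12) + kBlockMoveAlign + kBlockSizeAlign
             = 0x88000 + 0x1000 + 0x80 + 0x10000 = 0x99080
     blockSize + reserve = 0x1A9080, rounded down to the 64 KiB boundary
     (kBlockSizeAlign) gives blockSize = 0x1A0000.
   So the allocated block is roughly 1.5x the required minimum, which gives
   ReadBlock() room to read ahead and keeps MoveBlock() calls infrequent.
*/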
299 
300 
int MatchFinder_Create(CMatchFinder *p, UInt32 historySize,
302     UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter,
303     ISzAllocPtr alloc)
304 {
  /* we need one additional byte in (p->keepSizeBefore),
     since we use MoveBlock() after (p->pos++) and before using the dictionary */
307   // keepAddBufferBefore = (UInt32)0xFFFFFFFF - (1 << 22); // for debug
308   p->keepSizeBefore = historySize + keepAddBufferBefore + 1;
309 
310   keepAddBufferAfter += matchMaxLen;
311   /* we need (p->keepSizeAfter >= p->numHashBytes) */
312   if (keepAddBufferAfter < p->numHashBytes)
313     keepAddBufferAfter = p->numHashBytes;
314   // keepAddBufferAfter -= 2; // for debug
315   p->keepSizeAfter = keepAddBufferAfter;
316 
317   if (p->directInput)
318     p->blockSize = 0;
319   if (p->directInput || LzInWindow_Create2(p, GetBlockSize(p, historySize), alloc))
320   {
321     const UInt32 newCyclicBufferSize = historySize + 1; // do not change it
322     UInt32 hs;
323     p->matchMaxLen = matchMaxLen;
324     {
325       // UInt32 hs4;
326       p->fixedHashSize = 0;
327       hs = (1 << 16) - 1;
328       if (p->numHashBytes != 2)
329       {
330         hs = historySize;
331         if (hs > p->expectedDataSize)
332           hs = (UInt32)p->expectedDataSize;
333         if (hs != 0)
334           hs--;
335         hs |= (hs >> 1);
336         hs |= (hs >> 2);
337         hs |= (hs >> 4);
338         hs |= (hs >> 8);
339         // we propagated 16 bits in (hs). Low 16 bits must be set later
340         hs >>= 1;
341         if (hs >= (1 << 24))
342         {
343           if (p->numHashBytes == 3)
344             hs = (1 << 24) - 1;
345           else
346             hs >>= 1;
          /* in (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1)) */
348         }
349 
350         // hs = ((UInt32)1 << 25) - 1; // for test
351 
352         // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2)
353         hs |= (1 << 16) - 1; /* don't change it! */
354 
355         // bt5: we adjust the size with recommended minimum size
356         if (p->numHashBytes >= 5)
357           hs |= (256 << kLzHash_CrcShift_2) - 1;
358       }
359       p->hashMask = hs;
360       hs++;
361 
362       /*
363       hs4 = (1 << 20);
364       if (hs4 > hs)
365         hs4 = hs;
366       // hs4 = (1 << 16); // for test
367       p->hash4Mask = hs4 - 1;
368       */
369 
370       if (p->numHashBytes > 2) p->fixedHashSize += kHash2Size;
371       if (p->numHashBytes > 3) p->fixedHashSize += kHash3Size;
372       // if (p->numHashBytes > 4) p->fixedHashSize += hs4; // kHash4Size;
373       hs += p->fixedHashSize;
374     }
375 
376     {
377       size_t newSize;
378       size_t numSons;
379       p->historySize = historySize;
380       p->hashSizeSum = hs;
381       p->cyclicBufferSize = newCyclicBufferSize; // it must be = (historySize + 1)
382 
383       numSons = newCyclicBufferSize;
384       if (p->btMode)
385         numSons <<= 1;
386       newSize = hs + numSons;
387 
388       // aligned size is not required here, but it can be better for some loops
389       #define NUM_REFS_ALIGN_MASK 0xF
390       newSize = (newSize + NUM_REFS_ALIGN_MASK) & ~(size_t)NUM_REFS_ALIGN_MASK;
391 
392       if (p->hash && p->numRefs == newSize)
393         return 1;
394 
395       MatchFinder_FreeThisClassMemory(p, alloc);
396       p->numRefs = newSize;
397       p->hash = AllocRefs(newSize, alloc);
398 
399       if (p->hash)
400       {
401         p->son = p->hash + p->hashSizeSum;
402         return 1;
403       }
404     }
405   }
406 
407   MatchFinder_Free(p, alloc);
408   return 0;
409 }
410 
411 
static void MatchFinder_SetLimits(CMatchFinder *p)
413 {
414   UInt32 k;
415   UInt32 n = kMaxValForNormalize - p->pos;
416   if (n == 0)
417     n = (UInt32)(Int32)-1;  // we allow (pos == 0) at start even with (kMaxValForNormalize == 0)
418 
419   k = p->cyclicBufferSize - p->cyclicBufferPos;
420   if (k < n)
421     n = k;
422 
423   k = GET_AVAIL_BYTES(p);
424   {
425     const UInt32 ksa = p->keepSizeAfter;
426     UInt32 mm = p->matchMaxLen;
427     if (k > ksa)
428       k -= ksa; // we must limit exactly to keepSizeAfter for ReadBlock
429     else if (k >= mm)
430     {
431       // the limitation for (p->lenLimit) update
432       k -= mm;   // optimization : to reduce the number of checks
433       k++;
434       // k = 1; // non-optimized version : for debug
435     }
436     else
437     {
438       mm = k;
439       if (k != 0)
440         k = 1;
441     }
442     p->lenLimit = mm;
443   }
444   if (k < n)
445     n = k;
446 
447   p->posLimit = p->pos + n;
448 }
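
/*
   Illustrative example: with matchMaxLen = 273, keepSizeAfter = 300 and
   GET_AVAIL_BYTES(p) = 100000, we get k = 100000 - 300 = 99700 and
   lenLimit = 273 (full-length matches are allowed); posLimit is then
   p->pos plus the smallest of 99700, (cyclicBufferSize - cyclicBufferPos),
   and the normalization bound n.
   Near the end of the stream, when fewer than matchMaxLen bytes remain,
   lenLimit is reduced to the number of available bytes instead.
   The numbers above are examples only; the real limits come from the
   MatchFinder_Create() parameters.
*/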
449 
450 
void MatchFinder_Init_LowHash(CMatchFinder *p)
452 {
453   size_t i;
454   CLzRef *items = p->hash;
455   const size_t numItems = p->fixedHashSize;
456   for (i = 0; i < numItems; i++)
457     items[i] = kEmptyHashValue;
458 }
459 
460 
void MatchFinder_Init_HighHash(CMatchFinder *p)
462 {
463   size_t i;
464   CLzRef *items = p->hash + p->fixedHashSize;
465   const size_t numItems = (size_t)p->hashMask + 1;
466   for (i = 0; i < numItems; i++)
467     items[i] = kEmptyHashValue;
468 }
469 
470 
void MatchFinder_Init_4(CMatchFinder *p)
472 {
473   p->buffer = p->bufferBase;
474   {
475     /* kEmptyHashValue = 0 (Zero) is used in hash tables as NO-VALUE marker.
476        the code in CMatchFinderMt expects (pos = 1) */
477     p->pos =
478     p->streamPos =
        1; // it's the smallest optimal value. do not change it
480         // 0; // for debug
481   }
482   p->result = SZ_OK;
483   p->streamEndWasReached = 0;
484 }
485 
486 
487 // (CYC_TO_POS_OFFSET == 0) is expected by some optimized code
488 #define CYC_TO_POS_OFFSET 0
489 // #define CYC_TO_POS_OFFSET 1 // for debug
490 
void MatchFinder_Init(CMatchFinder *p)
492 {
493   MatchFinder_Init_HighHash(p);
494   MatchFinder_Init_LowHash(p);
495   MatchFinder_Init_4(p);
496   // if (readData)
497   MatchFinder_ReadBlock(p);
498 
499   /* if we init (cyclicBufferPos = pos), then we can use one variable
500      instead of both (cyclicBufferPos) and (pos) : only before (cyclicBufferPos) wrapping */
501   p->cyclicBufferPos = (p->pos - CYC_TO_POS_OFFSET); // init with relation to (pos)
502   // p->cyclicBufferPos = 0; // smallest value
503   // p->son[0] = p->son[1] = 0; // unused: we can init skipped record for speculated accesses.
504   MatchFinder_SetLimits(p);
505 }
506 
507 
508 
509 #ifdef MY_CPU_X86_OR_AMD64
510   #if defined(__clang__) && (__clang_major__ >= 8) \
511     || defined(__GNUC__) && (__GNUC__ >= 8) \
512     || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1900)
513       #define USE_SATUR_SUB_128
514       #define USE_AVX2
515       #define ATTRIB_SSE41 __attribute__((__target__("sse4.1")))
516       #define ATTRIB_AVX2 __attribute__((__target__("avx2")))
517   #elif defined(_MSC_VER)
518     #if (_MSC_VER >= 1600)
519       #define USE_SATUR_SUB_128
520       #if (_MSC_VER >= 1900)
521         #define USE_AVX2
522         #include <immintrin.h> // avx
523       #endif
524     #endif
525   #endif
526 
527 // #elif defined(MY_CPU_ARM_OR_ARM64)
528 #elif defined(MY_CPU_ARM64)
529 
530   #if defined(__clang__) && (__clang_major__ >= 8) \
531     || defined(__GNUC__) && (__GNUC__ >= 8)
532       #define USE_SATUR_SUB_128
533     #ifdef MY_CPU_ARM64
534       // #define ATTRIB_SSE41 __attribute__((__target__("")))
535     #else
536       // #define ATTRIB_SSE41 __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
537     #endif
538 
539   #elif defined(_MSC_VER)
540     #if (_MSC_VER >= 1910)
541       #define USE_SATUR_SUB_128
542     #endif
543   #endif
544 
545   #if defined(_MSC_VER) && defined(MY_CPU_ARM64)
546     #include <arm64_neon.h>
547   #else
548     #include <arm_neon.h>
549   #endif
550 
551 #endif
552 
553 /*
554 #ifndef ATTRIB_SSE41
555   #define ATTRIB_SSE41
556 #endif
557 #ifndef ATTRIB_AVX2
558   #define ATTRIB_AVX2
559 #endif
560 */
561 
562 #ifdef USE_SATUR_SUB_128
563 
564 // #define _SHOW_HW_STATUS
565 
566 #ifdef _SHOW_HW_STATUS
567 #include <stdio.h>
568 #define _PRF(x) x
569 _PRF(;)
570 #else
571 #define _PRF(x)
572 #endif
573 
574 #ifdef MY_CPU_ARM_OR_ARM64
575 
576 #ifdef MY_CPU_ARM64
577 // #define FORCE_SATUR_SUB_128
578 #endif
579 
580 typedef uint32x4_t v128;
581 #define SASUB_128(i) \
582    *(v128 *)(void *)(items + (i) * 4) = \
583   vsubq_u32(vmaxq_u32(*(const v128 *)(const void *)(items + (i) * 4), sub2), sub2);
584 
585 #else
586 
587 #include <smmintrin.h> // sse4.1
588 
589 typedef __m128i v128;
590 #define SASUB_128(i) \
591   *(v128 *)(void *)(items + (i) * 4) = \
592   _mm_sub_epi32(_mm_max_epu32(*(const v128 *)(const void *)(items + (i) * 4), sub2), sub2); // SSE 4.1
593 
594 #endif
595 
596 
597 
598 MY_NO_INLINE
599 static
600 #ifdef ATTRIB_SSE41
601 ATTRIB_SSE41
602 #endif
603 void
604 MY_FAST_CALL
LzFind_SaturSub_128(UInt32 subValue, CLzRef *items, const CLzRef *lim)
606 {
607   v128 sub2 =
608     #ifdef MY_CPU_ARM_OR_ARM64
609       vdupq_n_u32(subValue);
610     #else
611       _mm_set_epi32((Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue);
612     #endif
613   do
614   {
615     SASUB_128(0)
616     SASUB_128(1)
617     SASUB_128(2)
618     SASUB_128(3)
619     items += 4 * 4;
620   }
621   while (items != lim);
622 }
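
/*
   Note: LzFind_SaturSub_128() assumes (items) is 16-byte aligned and that
   (lim - items) is a non-zero multiple of 16 CLzRef entries. Both are
   guaranteed by MatchFinder_Normalize3() below, which first handles the
   unaligned head with the scalar SASUB_32 and only passes the 64-byte
   aligned middle part to the vectorized routines.
*/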
623 
624 
625 
626 #ifdef USE_AVX2
627 
628 #include <immintrin.h> // avx
629 
630 #define SASUB_256(i) *(__m256i *)(void *)(items + (i) * 8) = _mm256_sub_epi32(_mm256_max_epu32(*(const __m256i *)(const void *)(items + (i) * 8), sub2), sub2); // AVX2
631 
632 MY_NO_INLINE
633 static
634 #ifdef ATTRIB_AVX2
635 ATTRIB_AVX2
636 #endif
637 void
638 MY_FAST_CALL
LzFind_SaturSub_256(UInt32 subValue, CLzRef *items, const CLzRef *lim)
640 {
641   __m256i sub2 = _mm256_set_epi32(
642       (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue,
643       (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue);
644   do
645   {
646     SASUB_256(0)
647     SASUB_256(1)
648     items += 2 * 8;
649   }
650   while (items != lim);
651 }
652 #endif // USE_AVX2
653 
654 #ifndef FORCE_SATUR_SUB_128
655 typedef void (MY_FAST_CALL *LZFIND_SATUR_SUB_CODE_FUNC)(
656     UInt32 subValue, CLzRef *items, const CLzRef *lim);
657 static LZFIND_SATUR_SUB_CODE_FUNC g_LzFind_SaturSub;
658 #endif // FORCE_SATUR_SUB_128
659 
660 #endif // USE_SATUR_SUB_128
661 
662 
663 // kEmptyHashValue must be zero
664 // #define SASUB_32(i) v = items[i];  m = v - subValue;  if (v < subValue) m = kEmptyHashValue;  items[i] = m;
665 #define SASUB_32(i) v = items[i];  if (v < subValue) v = subValue; items[i] = v - subValue;
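
/*
   SASUB_32 is a scalar saturating subtraction:
     items[i] = (items[i] < subValue) ? 0 : items[i] - subValue;
   e.g. with subValue = 100: 250 -> 150, 100 -> 0, 60 -> 0.
   Stale references (older than the window) therefore collapse to
   kEmptyHashValue, which is why kEmptyHashValue must be zero.
   SASUB_128 / SASUB_256 above implement the same operation on 4 / 8
   values at once using an unsigned max followed by a subtract.
*/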
666 
667 #ifdef FORCE_SATUR_SUB_128
668 
669 #define DEFAULT_SaturSub LzFind_SaturSub_128
670 
671 #else
672 
673 #define DEFAULT_SaturSub LzFind_SaturSub_32
674 
675 MY_NO_INLINE
676 static
677 void
678 MY_FAST_CALL
LzFind_SaturSub_32(UInt32 subValue, CLzRef *items, const CLzRef *lim)
680 {
681   do
682   {
683     UInt32 v;
684     SASUB_32(0)
685     SASUB_32(1)
686     SASUB_32(2)
687     SASUB_32(3)
688     SASUB_32(4)
689     SASUB_32(5)
690     SASUB_32(6)
691     SASUB_32(7)
692     items += 8;
693   }
694   while (items != lim);
695 }
696 
697 #endif
698 
699 
700 MY_NO_INLINE
void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems)
702 {
703   #define K_NORM_ALIGN_BLOCK_SIZE (1 << 6)
704 
705   CLzRef *lim;
706 
707   for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (K_NORM_ALIGN_BLOCK_SIZE - 1)) != 0; numItems--)
708   {
709     UInt32 v;
710     SASUB_32(0);
711     items++;
712   }
713 
714   {
715     #define K_NORM_ALIGN_MASK (K_NORM_ALIGN_BLOCK_SIZE / 4 - 1)
716     lim = items + (numItems & ~(size_t)K_NORM_ALIGN_MASK);
717     numItems &= K_NORM_ALIGN_MASK;
718     if (items != lim)
719     {
720       #if defined(USE_SATUR_SUB_128) && !defined(FORCE_SATUR_SUB_128)
721         if (g_LzFind_SaturSub)
722           g_LzFind_SaturSub(subValue, items, lim);
723         else
724       #endif
725           DEFAULT_SaturSub(subValue, items, lim);
726     }
727     items = lim;
728   }
729 
730 
731   for (; numItems != 0; numItems--)
732   {
733     UInt32 v;
734     SASUB_32(0);
735     items++;
736   }
737 }
738 
739 
740 
741 // call MatchFinder_CheckLimits() only after (p->pos++) update
742 
743 MY_NO_INLINE
static void MatchFinder_CheckLimits(CMatchFinder *p)
745 {
746   if (// !p->streamEndWasReached && p->result == SZ_OK &&
747       p->keepSizeAfter == GET_AVAIL_BYTES(p))
748   {
749     // we try to read only in exact state (p->keepSizeAfter == GET_AVAIL_BYTES(p))
750     if (MatchFinder_NeedMove(p))
751       MatchFinder_MoveBlock(p);
752     MatchFinder_ReadBlock(p);
753   }
754 
755   if (p->pos == kMaxValForNormalize)
  if (GET_AVAIL_BYTES(p) >= p->numHashBytes) // optional optimization for the last bytes of data.
    /*
       if we disabled normalization for the last bytes of data, and
       if (data_size == 4 GiB), we would avoid a wasteful normalization,
       but (pos) would wrap over zero (0) in that case,
       and we could not resume normal operation later.
    */
763   {
764     // MatchFinder_Normalize(p);
765     /* after normalization we need (p->pos >= p->historySize + 1); */
766     /* we can reduce subValue to aligned value, if want to keep alignment
767        of (p->pos) and (p->buffer) for speculated accesses. */
768     const UInt32 subValue = (p->pos - p->historySize - 1) /* & ~(UInt32)(kNormalizeAlign - 1) */;
769     // const UInt32 subValue = (1 << 15); // for debug
770     // printf("\nMatchFinder_Normalize() subValue == 0x%x\n", subValue);
771     size_t numSonRefs = p->cyclicBufferSize;
772     if (p->btMode)
773       numSonRefs <<= 1;
774     Inline_MatchFinder_ReduceOffsets(p, subValue);
775     MatchFinder_Normalize3(subValue, p->hash, (size_t)p->hashSizeSum + numSonRefs);
776   }
777 
778   if (p->cyclicBufferPos == p->cyclicBufferSize)
779     p->cyclicBufferPos = 0;
780 
781   MatchFinder_SetLimits(p);
782 }
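
/*
   Normalization example: with kMaxValForNormalize = 0, normalization triggers
   when (p->pos) wraps around to 0 after position 0xFFFFFFFF. Then
     subValue = 0 - historySize - 1 = 0xFFFFFFFF - historySize,
   Inline_MatchFinder_ReduceOffsets() (LzFind.h) subtracts subValue from the
   position counters, so the new (p->pos) becomes (historySize + 1), and
   MatchFinder_Normalize3() rebases every hash/son reference by the same
   amount, with references older than the window saturating to kEmptyHashValue.
   (The numbers only illustrate the arithmetic; see the comments above for the
   exact rules.)
*/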
783 
784 
785 /*
786   (lenLimit > maxLen)
787 */
788 MY_FORCE_INLINE
static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son,
790     size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue,
791     UInt32 *d, unsigned maxLen)
792 {
793   /*
794   son[_cyclicBufferPos] = curMatch;
795   for (;;)
796   {
797     UInt32 delta = pos - curMatch;
798     if (cutValue-- == 0 || delta >= _cyclicBufferSize)
799       return d;
800     {
801       const Byte *pb = cur - delta;
802       curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)];
803       if (pb[maxLen] == cur[maxLen] && *pb == *cur)
804       {
805         UInt32 len = 0;
806         while (++len != lenLimit)
807           if (pb[len] != cur[len])
808             break;
809         if (maxLen < len)
810         {
811           maxLen = len;
812           *d++ = len;
813           *d++ = delta - 1;
814           if (len == lenLimit)
815             return d;
816         }
817       }
818     }
819   }
820   */
821 
822   const Byte *lim = cur + lenLimit;
823   son[_cyclicBufferPos] = curMatch;
824 
825   do
826   {
827     UInt32 delta;
828 
829     if (curMatch == 0)
830       break;
831     // if (curMatch2 >= curMatch) return NULL;
832     delta = pos - curMatch;
833     if (delta >= _cyclicBufferSize)
834       break;
835     {
836       ptrdiff_t diff;
837       curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)];
838       diff = (ptrdiff_t)0 - (ptrdiff_t)delta;
839       if (cur[maxLen] == cur[(ptrdiff_t)maxLen + diff])
840       {
841         const Byte *c = cur;
842         while (*c == c[diff])
843         {
844           if (++c == lim)
845           {
846             d[0] = (UInt32)(lim - cur);
847             d[1] = delta - 1;
848             return d + 2;
849           }
850         }
851         {
852           const unsigned len = (unsigned)(c - cur);
853           if (maxLen < len)
854           {
855             maxLen = len;
856             d[0] = (UInt32)len;
857             d[1] = delta - 1;
858             d += 2;
859           }
860         }
861       }
862     }
863   }
864   while (--cutValue);
865 
866   return d;
867 }
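
/*
   Output format (for Hc_GetMatchesSpec and for GetMatchesSpec1 below):
   (d) receives (length, distance - 1) pairs with strictly increasing lengths,
   and the returned pointer marks the end of the written pairs.
   For example, matches of length 3 at delta 5 and length 7 at delta 100
   would be reported as { 3, 4, 7, 99 }.
   The caller passes (maxLen) as the length that must be exceeded before
   a pair is emitted.
*/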
868 
869 
870 MY_FORCE_INLINE
UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son,
872     size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue,
873     UInt32 *d, UInt32 maxLen)
874 {
875   CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1;
876   CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1);
877   unsigned len0 = 0, len1 = 0;
878 
879   UInt32 cmCheck;
880 
881   // if (curMatch >= pos) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; }
882 
883   cmCheck = (UInt32)(pos - _cyclicBufferSize);
884   if ((UInt32)pos <= _cyclicBufferSize)
885     cmCheck = 0;
886 
887   if (cmCheck < curMatch)
888   do
889   {
890     const UInt32 delta = pos - curMatch;
891     {
892       CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1);
893       const Byte *pb = cur - delta;
894       unsigned len = (len0 < len1 ? len0 : len1);
895       const UInt32 pair0 = pair[0];
896       if (pb[len] == cur[len])
897       {
898         if (++len != lenLimit && pb[len] == cur[len])
899           while (++len != lenLimit)
900             if (pb[len] != cur[len])
901               break;
902         if (maxLen < len)
903         {
904           maxLen = (UInt32)len;
905           *d++ = (UInt32)len;
906           *d++ = delta - 1;
907           if (len == lenLimit)
908           {
909             *ptr1 = pair0;
910             *ptr0 = pair[1];
911             return d;
912           }
913         }
914       }
915       if (pb[len] < cur[len])
916       {
917         *ptr1 = curMatch;
918         // const UInt32 curMatch2 = pair[1];
919         // if (curMatch2 >= curMatch) { *ptr0 = *ptr1 = kEmptyHashValue;  return NULL; }
920         // curMatch = curMatch2;
921         curMatch = pair[1];
922         ptr1 = pair + 1;
923         len1 = len;
924       }
925       else
926       {
927         *ptr0 = curMatch;
928         curMatch = pair[0];
929         ptr0 = pair;
930         len0 = len;
931       }
932     }
933   }
934   while(--cutValue && cmCheck < curMatch);
935 
936   *ptr0 = *ptr1 = kEmptyHashValue;
937   return d;
938 }
939 
940 
static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son,
942     size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue)
943 {
944   CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1;
945   CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1);
946   unsigned len0 = 0, len1 = 0;
947 
948   UInt32 cmCheck;
949 
950   cmCheck = (UInt32)(pos - _cyclicBufferSize);
951   if ((UInt32)pos <= _cyclicBufferSize)
952     cmCheck = 0;
953 
954   if (// curMatch >= pos ||  // failure
955       cmCheck < curMatch)
956   do
957   {
958     const UInt32 delta = pos - curMatch;
959     {
960       CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1);
961       const Byte *pb = cur - delta;
962       unsigned len = (len0 < len1 ? len0 : len1);
963       if (pb[len] == cur[len])
964       {
965         while (++len != lenLimit)
966           if (pb[len] != cur[len])
967             break;
968         {
969           if (len == lenLimit)
970           {
971             *ptr1 = pair[0];
972             *ptr0 = pair[1];
973             return;
974           }
975         }
976       }
977       if (pb[len] < cur[len])
978       {
979         *ptr1 = curMatch;
980         curMatch = pair[1];
981         ptr1 = pair + 1;
982         len1 = len;
983       }
984       else
985       {
986         *ptr0 = curMatch;
987         curMatch = pair[0];
988         ptr0 = pair;
989         len0 = len;
990       }
991     }
992   }
993   while(--cutValue && cmCheck < curMatch);
994 
995   *ptr0 = *ptr1 = kEmptyHashValue;
996   return;
997 }
998 
999 
1000 #define MOVE_POS \
1001   ++p->cyclicBufferPos; \
1002   p->buffer++; \
1003   { const UInt32 pos1 = p->pos + 1; p->pos = pos1; if (pos1 == p->posLimit) MatchFinder_CheckLimits(p); }
1004 
1005 #define MOVE_POS_RET MOVE_POS return distances;
1006 
1007 MY_NO_INLINE
static void MatchFinder_MovePos(CMatchFinder *p)
1009 {
  /* We get here at the end of the stream data, when (avail < num_hash_bytes).
     We don't update sons[cyclicBufferPos << btMode].
     So that (sons) record will contain junk, and we cannot resume match searching
     in normal operation, even if more input data is provided in the buffer later.
     p->sons[p->cyclicBufferPos << p->btMode] = 0;  // kEmptyHashValue
     if (p->btMode)
        p->sons[(p->cyclicBufferPos << p->btMode) + 1] = 0;  // kEmptyHashValue
  */
1018   MOVE_POS;
1019 }
1020 
1021 #define GET_MATCHES_HEADER2(minLen, ret_op) \
1022   unsigned lenLimit; UInt32 hv; Byte *cur; UInt32 curMatch; \
1023   lenLimit = (unsigned)p->lenLimit; { if (lenLimit < minLen) { MatchFinder_MovePos(p); ret_op; }} \
1024   cur = p->buffer;
1025 
1026 #define GET_MATCHES_HEADER(minLen) GET_MATCHES_HEADER2(minLen, return distances)
1027 #define SKIP_HEADER(minLen)   do { GET_MATCHES_HEADER2(minLen, continue)
1028 
1029 #define MF_PARAMS(p)  lenLimit, curMatch, p->pos, p->buffer, p->son, p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue
1030 
1031 #define SKIP_FOOTER  SkipMatchesSpec(MF_PARAMS(p)); MOVE_POS; } while (--num);
1032 
1033 #define GET_MATCHES_FOOTER_BASE(_maxLen_, func) \
1034   distances = func(MF_PARAMS(p), \
1035   distances, (UInt32)_maxLen_); MOVE_POS_RET;
1036 
1037 #define GET_MATCHES_FOOTER_BT(_maxLen_) \
1038   GET_MATCHES_FOOTER_BASE(_maxLen_, GetMatchesSpec1)
1039 
1040 #define GET_MATCHES_FOOTER_HC(_maxLen_) \
1041   GET_MATCHES_FOOTER_BASE(_maxLen_, Hc_GetMatchesSpec)
1042 
1043 
1044 
1045 #define UPDATE_maxLen { \
1046     const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)d2; \
1047     const Byte *c = cur + maxLen; \
1048     const Byte *lim = cur + lenLimit; \
1049     for (; c != lim; c++) if (*(c + diff) != *c) break; \
1050     maxLen = (unsigned)(c - cur); }
1051 
static UInt32* Bt2_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
1053 {
1054   GET_MATCHES_HEADER(2)
1055   HASH2_CALC;
1056   curMatch = p->hash[hv];
1057   p->hash[hv] = p->pos;
1058   GET_MATCHES_FOOTER_BT(1)
1059 }
1060 
UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
1062 {
1063   GET_MATCHES_HEADER(3)
1064   HASH_ZIP_CALC;
1065   curMatch = p->hash[hv];
1066   p->hash[hv] = p->pos;
1067   GET_MATCHES_FOOTER_BT(2)
1068 }
1069 
1070 
1071 #define SET_mmm  \
1072   mmm = p->cyclicBufferSize; \
1073   if (pos < mmm) \
1074     mmm = pos;
1075 
1076 
static UInt32* Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
1078 {
1079   UInt32 mmm;
1080   UInt32 h2, d2, pos;
1081   unsigned maxLen;
1082   UInt32 *hash;
1083   GET_MATCHES_HEADER(3)
1084 
1085   HASH3_CALC;
1086 
1087   hash = p->hash;
1088   pos = p->pos;
1089 
1090   d2 = pos - hash[h2];
1091 
1092   curMatch = (hash + kFix3HashSize)[hv];
1093 
1094   hash[h2] = pos;
1095   (hash + kFix3HashSize)[hv] = pos;
1096 
1097   SET_mmm
1098 
1099   maxLen = 2;
1100 
1101   if (d2 < mmm && *(cur - d2) == *cur)
1102   {
1103     UPDATE_maxLen
1104     distances[0] = (UInt32)maxLen;
1105     distances[1] = d2 - 1;
1106     distances += 2;
1107     if (maxLen == lenLimit)
1108     {
1109       SkipMatchesSpec(MF_PARAMS(p));
1110       MOVE_POS_RET;
1111     }
1112   }
1113 
1114   GET_MATCHES_FOOTER_BT(maxLen)
1115 }
1116 
1117 
static UInt32* Bt4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
1119 {
1120   UInt32 mmm;
1121   UInt32 h2, h3, d2, d3, pos;
1122   unsigned maxLen;
1123   UInt32 *hash;
1124   GET_MATCHES_HEADER(4)
1125 
1126   HASH4_CALC;
1127 
1128   hash = p->hash;
1129   pos = p->pos;
1130 
1131   d2 = pos - hash                  [h2];
1132   d3 = pos - (hash + kFix3HashSize)[h3];
1133   curMatch = (hash + kFix4HashSize)[hv];
1134 
1135   hash                  [h2] = pos;
1136   (hash + kFix3HashSize)[h3] = pos;
1137   (hash + kFix4HashSize)[hv] = pos;
1138 
1139   SET_mmm
1140 
1141   maxLen = 3;
1142 
1143   for (;;)
1144   {
1145     if (d2 < mmm && *(cur - d2) == *cur)
1146     {
1147       distances[0] = 2;
1148       distances[1] = d2 - 1;
1149       distances += 2;
1150       if (*(cur - d2 + 2) == cur[2])
1151       {
1152         // distances[-2] = 3;
1153       }
1154       else if (d3 < mmm && *(cur - d3) == *cur)
1155       {
1156         d2 = d3;
1157         distances[1] = d3 - 1;
1158         distances += 2;
1159       }
1160       else
1161         break;
1162     }
1163     else if (d3 < mmm && *(cur - d3) == *cur)
1164     {
1165       d2 = d3;
1166       distances[1] = d3 - 1;
1167       distances += 2;
1168     }
1169     else
1170       break;
1171 
1172     UPDATE_maxLen
1173     distances[-2] = (UInt32)maxLen;
1174     if (maxLen == lenLimit)
1175     {
1176       SkipMatchesSpec(MF_PARAMS(p));
1177       MOVE_POS_RET
1178     }
1179     break;
1180   }
1181 
1182   GET_MATCHES_FOOTER_BT(maxLen)
1183 }
1184 
1185 
static UInt32* Bt5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
1187 {
1188   UInt32 mmm;
1189   UInt32 h2, h3, d2, d3, maxLen, pos;
1190   UInt32 *hash;
1191   GET_MATCHES_HEADER(5)
1192 
1193   HASH5_CALC;
1194 
1195   hash = p->hash;
1196   pos = p->pos;
1197 
1198   d2 = pos - hash                  [h2];
1199   d3 = pos - (hash + kFix3HashSize)[h3];
1200   // d4 = pos - (hash + kFix4HashSize)[h4];
1201 
1202   curMatch = (hash + kFix5HashSize)[hv];
1203 
1204   hash                  [h2] = pos;
1205   (hash + kFix3HashSize)[h3] = pos;
1206   // (hash + kFix4HashSize)[h4] = pos;
1207   (hash + kFix5HashSize)[hv] = pos;
1208 
1209   SET_mmm
1210 
1211   maxLen = 4;
1212 
1213   for (;;)
1214   {
1215     if (d2 < mmm && *(cur - d2) == *cur)
1216     {
1217       distances[0] = 2;
1218       distances[1] = d2 - 1;
1219       distances += 2;
1220       if (*(cur - d2 + 2) == cur[2])
1221       {
1222       }
1223       else if (d3 < mmm && *(cur - d3) == *cur)
1224       {
1225         distances[1] = d3 - 1;
1226         distances += 2;
1227         d2 = d3;
1228       }
1229       else
1230         break;
1231     }
1232     else if (d3 < mmm && *(cur - d3) == *cur)
1233     {
1234       distances[1] = d3 - 1;
1235       distances += 2;
1236       d2 = d3;
1237     }
1238     else
1239       break;
1240 
1241     distances[-2] = 3;
1242     if (*(cur - d2 + 3) != cur[3])
1243       break;
1244     UPDATE_maxLen
1245     distances[-2] = (UInt32)maxLen;
1246     if (maxLen == lenLimit)
1247     {
1248       SkipMatchesSpec(MF_PARAMS(p));
1249       MOVE_POS_RET;
1250     }
1251     break;
1252   }
1253 
1254   GET_MATCHES_FOOTER_BT(maxLen)
1255 }
1256 
1257 
static UInt32* Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
1259 {
1260   UInt32 mmm;
1261   UInt32 h2, h3, d2, d3, pos;
1262   unsigned maxLen;
1263   UInt32 *hash;
1264   GET_MATCHES_HEADER(4)
1265 
1266   HASH4_CALC;
1267 
1268   hash = p->hash;
1269   pos = p->pos;
1270 
1271   d2 = pos - hash                  [h2];
1272   d3 = pos - (hash + kFix3HashSize)[h3];
1273   curMatch = (hash + kFix4HashSize)[hv];
1274 
1275   hash                  [h2] = pos;
1276   (hash + kFix3HashSize)[h3] = pos;
1277   (hash + kFix4HashSize)[hv] = pos;
1278 
1279   SET_mmm
1280 
1281   maxLen = 3;
1282 
1283   for (;;)
1284   {
1285     if (d2 < mmm && *(cur - d2) == *cur)
1286     {
1287       distances[0] = 2;
1288       distances[1] = d2 - 1;
1289       distances += 2;
1290       if (*(cur - d2 + 2) == cur[2])
1291       {
1292         // distances[-2] = 3;
1293       }
1294       else if (d3 < mmm && *(cur - d3) == *cur)
1295       {
1296         d2 = d3;
1297         distances[1] = d3 - 1;
1298         distances += 2;
1299       }
1300       else
1301         break;
1302     }
1303     else if (d3 < mmm && *(cur - d3) == *cur)
1304     {
1305       d2 = d3;
1306       distances[1] = d3 - 1;
1307       distances += 2;
1308     }
1309     else
1310       break;
1311 
1312     UPDATE_maxLen
1313     distances[-2] = (UInt32)maxLen;
1314     if (maxLen == lenLimit)
1315     {
1316       p->son[p->cyclicBufferPos] = curMatch;
1317       MOVE_POS_RET;
1318     }
1319     break;
1320   }
1321 
1322   GET_MATCHES_FOOTER_HC(maxLen);
1323 }
1324 
1325 
static UInt32 * Hc5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
1327 {
1328   UInt32 mmm;
1329   UInt32 h2, h3, d2, d3, maxLen, pos;
1330   UInt32 *hash;
1331   GET_MATCHES_HEADER(5)
1332 
1333   HASH5_CALC;
1334 
1335   hash = p->hash;
1336   pos = p->pos;
1337 
1338   d2 = pos - hash                  [h2];
1339   d3 = pos - (hash + kFix3HashSize)[h3];
1340   // d4 = pos - (hash + kFix4HashSize)[h4];
1341 
1342   curMatch = (hash + kFix5HashSize)[hv];
1343 
1344   hash                  [h2] = pos;
1345   (hash + kFix3HashSize)[h3] = pos;
1346   // (hash + kFix4HashSize)[h4] = pos;
1347   (hash + kFix5HashSize)[hv] = pos;
1348 
1349   SET_mmm
1350 
1351   maxLen = 4;
1352 
1353   for (;;)
1354   {
1355     if (d2 < mmm && *(cur - d2) == *cur)
1356     {
1357       distances[0] = 2;
1358       distances[1] = d2 - 1;
1359       distances += 2;
1360       if (*(cur - d2 + 2) == cur[2])
1361       {
1362       }
1363       else if (d3 < mmm && *(cur - d3) == *cur)
1364       {
1365         distances[1] = d3 - 1;
1366         distances += 2;
1367         d2 = d3;
1368       }
1369       else
1370         break;
1371     }
1372     else if (d3 < mmm && *(cur - d3) == *cur)
1373     {
1374       distances[1] = d3 - 1;
1375       distances += 2;
1376       d2 = d3;
1377     }
1378     else
1379       break;
1380 
1381     distances[-2] = 3;
1382     if (*(cur - d2 + 3) != cur[3])
1383       break;
1384     UPDATE_maxLen
1385     distances[-2] = maxLen;
1386     if (maxLen == lenLimit)
1387     {
1388       p->son[p->cyclicBufferPos] = curMatch;
1389       MOVE_POS_RET;
1390     }
1391     break;
1392   }
1393 
1394   GET_MATCHES_FOOTER_HC(maxLen);
1395 }
1396 
1397 
UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
1399 {
1400   GET_MATCHES_HEADER(3)
1401   HASH_ZIP_CALC;
1402   curMatch = p->hash[hv];
1403   p->hash[hv] = p->pos;
1404   GET_MATCHES_FOOTER_HC(2)
1405 }
1406 
1407 
static void Bt2_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
1409 {
1410   SKIP_HEADER(2)
1411   {
1412     HASH2_CALC;
1413     curMatch = p->hash[hv];
1414     p->hash[hv] = p->pos;
1415   }
1416   SKIP_FOOTER
1417 }
1418 
void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
1420 {
1421   SKIP_HEADER(3)
1422   {
1423     HASH_ZIP_CALC;
1424     curMatch = p->hash[hv];
1425     p->hash[hv] = p->pos;
1426   }
1427   SKIP_FOOTER
1428 }
1429 
static void Bt3_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
1431 {
1432   SKIP_HEADER(3)
1433   {
1434     UInt32 h2;
1435     UInt32 *hash;
1436     HASH3_CALC;
1437     hash = p->hash;
1438     curMatch = (hash + kFix3HashSize)[hv];
1439     hash[h2] =
1440     (hash + kFix3HashSize)[hv] = p->pos;
1441   }
1442   SKIP_FOOTER
1443 }
1444 
static void Bt4_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
1446 {
1447   SKIP_HEADER(4)
1448   {
1449     UInt32 h2, h3;
1450     UInt32 *hash;
1451     HASH4_CALC;
1452     hash = p->hash;
1453     curMatch = (hash + kFix4HashSize)[hv];
1454     hash                  [h2] =
1455     (hash + kFix3HashSize)[h3] =
1456     (hash + kFix4HashSize)[hv] = p->pos;
1457   }
1458   SKIP_FOOTER
1459 }
1460 
static void Bt5_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
1462 {
1463   SKIP_HEADER(5)
1464   {
1465     UInt32 h2, h3;
1466     UInt32 *hash;
1467     HASH5_CALC;
1468     hash = p->hash;
1469     curMatch = (hash + kFix5HashSize)[hv];
1470     hash                  [h2] =
1471     (hash + kFix3HashSize)[h3] =
1472     // (hash + kFix4HashSize)[h4] =
1473     (hash + kFix5HashSize)[hv] = p->pos;
1474   }
1475   SKIP_FOOTER
1476 }
1477 
1478 
1479 #define HC_SKIP_HEADER(minLen) \
1480     do { if (p->lenLimit < minLen) { MatchFinder_MovePos(p); num--; continue; } { \
1481     Byte *cur; \
1482     UInt32 *hash; \
1483     UInt32 *son; \
1484     UInt32 pos = p->pos; \
1485     UInt32 num2 = num; \
1486     /* (p->pos == p->posLimit) is not allowed here !!! */ \
1487     { const UInt32 rem = p->posLimit - pos; if (num2 > rem) num2 = rem; } \
1488     num -= num2; \
1489     { const UInt32 cycPos = p->cyclicBufferPos; \
1490       son = p->son + cycPos; \
1491       p->cyclicBufferPos = cycPos + num2; } \
1492     cur = p->buffer; \
1493     hash = p->hash; \
1494     do { \
1495     UInt32 curMatch; \
1496     UInt32 hv;
1497 
1498 
1499 #define HC_SKIP_FOOTER \
1500     cur++;  pos++;  *son++ = curMatch; \
1501     } while (--num2); \
1502     p->buffer = cur; \
1503     p->pos = pos; \
1504     if (pos == p->posLimit) MatchFinder_CheckLimits(p); \
1505     }} while(num); \
1506 
1507 
static void Hc4_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
1509 {
1510   HC_SKIP_HEADER(4)
1511 
1512     UInt32 h2, h3;
1513     HASH4_CALC;
1514     curMatch = (hash + kFix4HashSize)[hv];
1515     hash                  [h2] =
1516     (hash + kFix3HashSize)[h3] =
1517     (hash + kFix4HashSize)[hv] = pos;
1518 
1519   HC_SKIP_FOOTER
1520 }
1521 
1522 
static void Hc5_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
1524 {
1525   HC_SKIP_HEADER(5)
1526 
1527     UInt32 h2, h3;
1528     HASH5_CALC
1529     curMatch = (hash + kFix5HashSize)[hv];
1530     hash                  [h2] =
1531     (hash + kFix3HashSize)[h3] =
1532     // (hash + kFix4HashSize)[h4] =
1533     (hash + kFix5HashSize)[hv] = pos;
1534 
1535   HC_SKIP_FOOTER
1536 }
1537 
1538 
void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
1540 {
1541   HC_SKIP_HEADER(3)
1542 
1543     HASH_ZIP_CALC;
1544     curMatch = hash[hv];
1545     hash[hv] = pos;
1546 
1547   HC_SKIP_FOOTER
1548 }
1549 
1550 
void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable)
1552 {
1553   vTable->Init = (Mf_Init_Func)MatchFinder_Init;
1554   vTable->GetNumAvailableBytes = (Mf_GetNumAvailableBytes_Func)MatchFinder_GetNumAvailableBytes;
1555   vTable->GetPointerToCurrentPos = (Mf_GetPointerToCurrentPos_Func)MatchFinder_GetPointerToCurrentPos;
1556   if (!p->btMode)
1557   {
1558     if (p->numHashBytes <= 4)
1559     {
1560       vTable->GetMatches = (Mf_GetMatches_Func)Hc4_MatchFinder_GetMatches;
1561       vTable->Skip = (Mf_Skip_Func)Hc4_MatchFinder_Skip;
1562     }
1563     else
1564     {
1565       vTable->GetMatches = (Mf_GetMatches_Func)Hc5_MatchFinder_GetMatches;
1566       vTable->Skip = (Mf_Skip_Func)Hc5_MatchFinder_Skip;
1567     }
1568   }
1569   else if (p->numHashBytes == 2)
1570   {
1571     vTable->GetMatches = (Mf_GetMatches_Func)Bt2_MatchFinder_GetMatches;
1572     vTable->Skip = (Mf_Skip_Func)Bt2_MatchFinder_Skip;
1573   }
1574   else if (p->numHashBytes == 3)
1575   {
1576     vTable->GetMatches = (Mf_GetMatches_Func)Bt3_MatchFinder_GetMatches;
1577     vTable->Skip = (Mf_Skip_Func)Bt3_MatchFinder_Skip;
1578   }
1579   else if (p->numHashBytes == 4)
1580   {
1581     vTable->GetMatches = (Mf_GetMatches_Func)Bt4_MatchFinder_GetMatches;
1582     vTable->Skip = (Mf_Skip_Func)Bt4_MatchFinder_Skip;
1583   }
1584   else
1585   {
1586     vTable->GetMatches = (Mf_GetMatches_Func)Bt5_MatchFinder_GetMatches;
1587     vTable->Skip = (Mf_Skip_Func)Bt5_MatchFinder_Skip;
1588   }
1589 }
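
/*
   Usage sketch (illustrative only, not part of the library): a minimal driver
   that feeds a stream through the default BT4 match finder via the vtable.
   The parameter values (dictionary size, matchMaxLen, buffer sizes) and the
   use of the SDK allocator g_Alloc (Alloc.h) are example choices, not
   requirements of this file.

   #include "Alloc.h"   // for g_Alloc

   static void ScanStream(ISeqInStream *inStream)
   {
     CMatchFinder mf;
     IMatchFinder2 vt;
     UInt32 distances[273 * 2 + 2]; // each reported match adds a (len, dist - 1) pair

     LzFindPrepare();              // select the SSE4.1 / AVX2 / NEON normalization path once
     MatchFinder_Construct(&mf);   // defaults: btMode = 1, numHashBytes = 4, cutValue = 32
     mf.stream = inStream;

     // historySize = 1 MiB dictionary, matchMaxLen = 273 (LZMA-style), no extra buffers
     if (!MatchFinder_Create(&mf, (1 << 20), 0, 273, 0, &g_Alloc))
       return;
     MatchFinder_CreateVTable(&mf, &vt);
     vt.Init(&mf);

     while (vt.GetNumAvailableBytes(&mf) != 0)
     {
       const UInt32 *end = vt.GetMatches(&mf, distances);
       // (end - distances) / 2 pairs were written for the current position;
       // a real encoder would pick a match (or a literal) and call
       // vt.Skip(&mf, numBytesCovered - 1) to advance past it.
       (void)end;
     }
     MatchFinder_Free(&mf, &g_Alloc);
   }
*/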
1590 
1591 
1592 
void LzFindPrepare()
1594 {
1595   #ifndef FORCE_SATUR_SUB_128
1596   #ifdef USE_SATUR_SUB_128
1597   LZFIND_SATUR_SUB_CODE_FUNC f = NULL;
1598   #ifdef MY_CPU_ARM_OR_ARM64
1599   {
1600     if (CPU_IsSupported_NEON())
1601     {
1602       // #pragma message ("=== LzFind NEON")
1603       _PRF(printf("\n=== LzFind NEON\n"));
1604       f = LzFind_SaturSub_128;
1605     }
1606     // f = 0; // for debug
1607   }
1608   #else // MY_CPU_ARM_OR_ARM64
1609   if (CPU_IsSupported_SSE41())
1610   {
1611     // #pragma message ("=== LzFind SSE41")
1612     _PRF(printf("\n=== LzFind SSE41\n"));
1613     f = LzFind_SaturSub_128;
1614 
1615     #ifdef USE_AVX2
1616     if (CPU_IsSupported_AVX2())
1617     {
1618       // #pragma message ("=== LzFind AVX2")
1619       _PRF(printf("\n=== LzFind AVX2\n"));
1620       f = LzFind_SaturSub_256;
1621     }
1622     #endif
1623   }
1624   #endif // MY_CPU_ARM_OR_ARM64
1625   g_LzFind_SaturSub = f;
1626   #endif // USE_SATUR_SUB_128
1627   #endif // FORCE_SATUR_SUB_128
1628 }
1629