/* LzFindMt.c -- multithreaded Match finder for LZ algorithms
2021-04-01 : Igor Pavlov : Public domain */

#include "Precomp.h"

#include "CpuArch.h"

#include "LzHash.h"
#include "LzFindMt.h"

// #define LOG_ITERS

#ifdef LOG_ITERS
#include <stdio.h>
static UInt64 g_NumIters_Tree;
static UInt64 g_NumIters_Loop;
#define LOG_ITER(x) x
#else
#define LOG_ITER(x)
#endif

#define kMtHashBlockSize (1 << 17)
#define kMtHashNumBlocks (1 << 1)
#define kMtHashNumBlocksMask (kMtHashNumBlocks - 1)

#define kMtBtBlockSize (1 << 16)
#define kMtBtNumBlocks (1 << 4)
#define kMtBtNumBlocksMask (kMtBtNumBlocks - 1)

/*
  HASH functions:
  We use raw 8/16 bits from a[1] and a[2],
  xored with crc(a[0]) and crc(a[3]).
  We check a[0], a[3] only. We don't need to compare a[1] and a[2] in matches.
  our crc() function provides one-to-one correspondence for low 8-bit values:
    (crc[0...0xFF] & 0xFF) <-> [0...0xFF]
*/
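
/*
  A sketch of why comparing only a[0] (and a[3]) is enough, assuming
  kHash2Size and kHash3Size keep at least the low 8 / 16 bits (as in LzHash.h):
  if two positions fall into the same h3 bucket and their bytes a[0] are
  equal, then crc[a[0]] is identical for both, so the masked value

      h3 = (crc[a[0]] ^ a[1] ^ ((UInt32)a[2] << 8)) & (kHash3Size - 1)

  pins the raw bytes a[1] and a[2] as well; the same argument with
  (kHash2Size - 1) covers a[1] for the 2-byte hash.
*/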

#define MT_HASH2_CALC \
  h2 = (p->crc[cur[0]] ^ cur[1]) & (kHash2Size - 1);

#define MT_HASH3_CALC { \
  UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
  h2 = temp & (kHash2Size - 1); \
  h3 = (temp ^ ((UInt32)cur[2] << 8)) & (kHash3Size - 1); }

/*
#define MT_HASH3_CALC__NO_2 { \
  UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
  h3 = (temp ^ ((UInt32)cur[2] << 8)) & (kHash3Size - 1); }

#define __MT_HASH4_CALC { \
  UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
  h2 = temp & (kHash2Size - 1); \
  temp ^= ((UInt32)cur[2] << 8); \
  h3 = temp & (kHash3Size - 1); \
  h4 = (temp ^ (p->crc[cur[3]] << kLzHash_CrcShift_1)) & p->hash4Mask; }
  // (kHash4Size - 1);
*/


static void MtSync_Construct(CMtSync *p)
{
  p->wasCreated = False;
  p->csWasInitialized = False;
  p->csWasEntered = False;
  Thread_Construct(&p->thread);
  Event_Construct(&p->canStart);
  Event_Construct(&p->wasStarted);
  Event_Construct(&p->wasStopped);
  Semaphore_Construct(&p->freeSemaphore);
  Semaphore_Construct(&p->filledSemaphore);
  p->affinity = 0;
}


MY_NO_INLINE
static void MtSync_GetNextBlock(CMtSync *p)
{
  if (p->needStart)
  {
    p->numProcessedBlocks = 1;
    p->needStart = False;
    p->stopWriting = False;
    p->exit = False;
    Event_Reset(&p->wasStarted);
    Event_Reset(&p->wasStopped);

    Event_Set(&p->canStart);
    Event_Wait(&p->wasStarted);

    // if (mt) MatchFinder_Init_LowHash(mt->MatchFinder);
  }
  else
  {
    CriticalSection_Leave(&p->cs);
    p->csWasEntered = False;
    p->numProcessedBlocks++;
    Semaphore_Release1(&p->freeSemaphore);
  }
  Semaphore_Wait(&p->filledSemaphore);
  CriticalSection_Enter(&p->cs);
  p->csWasEntered = True;
}

/* MtSync_StopWriting must be called if Writing was started */

static void MtSync_StopWriting(CMtSync *p)
{
  UInt32 myNumBlocks = p->numProcessedBlocks;
  if (!Thread_WasCreated(&p->thread) || p->needStart)
    return;
  p->stopWriting = True;
  if (p->csWasEntered)
  {
    CriticalSection_Leave(&p->cs);
    p->csWasEntered = False;
  }
  Semaphore_Release1(&p->freeSemaphore);

  Event_Wait(&p->wasStopped);

  while (myNumBlocks++ != p->numProcessedBlocks)
  {
    Semaphore_Wait(&p->filledSemaphore);
    Semaphore_Release1(&p->freeSemaphore);
  }
  p->needStart = True;
}

static void MtSync_Destruct(CMtSync *p)
{
  if (Thread_WasCreated(&p->thread))
  {
    MtSync_StopWriting(p);
    p->exit = True;
    if (p->needStart)
      Event_Set(&p->canStart);
    Thread_Wait_Close(&p->thread);
  }
  if (p->csWasInitialized)
  {
    CriticalSection_Delete(&p->cs);
    p->csWasInitialized = False;
  }

  Event_Close(&p->canStart);
  Event_Close(&p->wasStarted);
  Event_Close(&p->wasStopped);
  Semaphore_Close(&p->freeSemaphore);
  Semaphore_Close(&p->filledSemaphore);

  p->wasCreated = False;
}

#define RINOK_THREAD(x) { if ((x) != 0) return SZ_ERROR_THREAD; }

static SRes MtSync_Create2(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *obj, UInt32 numBlocks)
{
  WRes wres;
  if (p->wasCreated)
    return SZ_OK;

  RINOK_THREAD(CriticalSection_Init(&p->cs));
  p->csWasInitialized = True;

  RINOK_THREAD(AutoResetEvent_CreateNotSignaled(&p->canStart));
  RINOK_THREAD(AutoResetEvent_CreateNotSignaled(&p->wasStarted));
  RINOK_THREAD(AutoResetEvent_CreateNotSignaled(&p->wasStopped));

  RINOK_THREAD(Semaphore_Create(&p->freeSemaphore, numBlocks, numBlocks));
  RINOK_THREAD(Semaphore_Create(&p->filledSemaphore, 0, numBlocks));

  p->needStart = True;

  if (p->affinity != 0)
    wres = Thread_Create_With_Affinity(&p->thread, startAddress, obj, (CAffinityMask)p->affinity);
  else
    wres = Thread_Create(&p->thread, startAddress, obj);
  RINOK_THREAD(wres);
  p->wasCreated = True;
  return SZ_OK;
}

static SRes MtSync_Create(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *obj, UInt32 numBlocks)
{
  SRes res = MtSync_Create2(p, startAddress, obj, numBlocks);
  if (res != SZ_OK)
    MtSync_Destruct(p);
  return res;
}

// static void MtSync_Init(CMtSync *p) { p->needStart = True; }

#define kMtMaxValForNormalize 0xFFFFFFFF
// #define kMtMaxValForNormalize ((1 << 25) + (1 << 20))


#ifdef MY_CPU_LE_UNALIGN
  #define GetUi24hi_from32(p) ((UInt32)GetUi32(p) >> 8)
#else
  #define GetUi24hi_from32(p) ((p)[1] ^ ((UInt32)(p)[2] << 8) ^ ((UInt32)(p)[3] << 16))
#endif
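/* Both forms of GetUi24hi_from32(p) yield the same value: the bytes
   p[1], p[2], p[3] packed as a 24-bit little-endian integer. The first
   form simply fetches them with one unaligned 32-bit load where
   MY_CPU_LE_UNALIGN permits it. */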

#define GetHeads_DECL(name) \
    static void GetHeads ## name(const Byte *p, UInt32 pos, \
      UInt32 *hash, UInt32 hashMask, UInt32 *heads, UInt32 numHeads, const UInt32 *crc)

#define GetHeads_LOOP(v) \
    for (; numHeads != 0; numHeads--) { \
      const UInt32 value = (v); \
      p++; \
      *heads++ = pos - hash[value]; \
      hash[value] = pos++; }

#define DEF_GetHeads2(name, v, action) \
    GetHeads_DECL(name) { action \
    GetHeads_LOOP(v) }

#define DEF_GetHeads(name, v) DEF_GetHeads2(name, v, ;)

DEF_GetHeads2(2, GetUi16(p), UNUSED_VAR(hashMask); UNUSED_VAR(crc); )
DEF_GetHeads(3,  (crc[p[0]] ^ GetUi16(p + 1)) & hashMask)
DEF_GetHeads2(3b, GetUi16(p) ^ ((UInt32)(p)[2] << 16), UNUSED_VAR(hashMask); UNUSED_VAR(crc); )
// BT3 is not good for crc collisions for big hashMask values.

/*
GetHeads_DECL(3b)
{
  UNUSED_VAR(hashMask);
  UNUSED_VAR(crc);
  {
  const Byte *pLim = p + numHeads;
  if (numHeads == 0)
    return;
  pLim--;
  while (p < pLim)
  {
    UInt32 v1 = GetUi32(p);
    UInt32 v0 = v1 & 0xFFFFFF;
    UInt32 h0, h1;
    p += 2;
    v1 >>= 8;
    h0 = hash[v0]; hash[v0] = pos; heads[0] = pos - h0; pos++;
    h1 = hash[v1]; hash[v1] = pos; heads[1] = pos - h1; pos++;
    heads += 2;
  }
  if (p == pLim)
  {
    UInt32 v0 = GetUi16(p) ^ ((UInt32)(p)[2] << 16);
    *heads = pos - hash[v0];
    hash[v0] = pos;
  }
  }
}
*/

/*
GetHeads_DECL(4)
{
  unsigned sh = 0;
  UNUSED_VAR(crc)
  while ((hashMask & 0x80000000) == 0)
  {
    hashMask <<= 1;
    sh++;
  }
  GetHeads_LOOP((GetUi32(p) * 0xa54a1) >> sh)
}
#define GetHeads4b GetHeads4
*/

#define USE_GetHeads_LOCAL_CRC

#ifdef USE_GetHeads_LOCAL_CRC

GetHeads_DECL(4)
{
  UInt32 crc0[256];
  UInt32 crc1[256];
  {
    unsigned i;
    for (i = 0; i < 256; i++)
    {
      UInt32 v = crc[i];
      crc0[i] = v & hashMask;
      crc1[i] = (v << kLzHash_CrcShift_1) & hashMask;
      // crc1[i] = rotlFixed(v, 8) & hashMask;
    }
  }
  GetHeads_LOOP(crc0[p[0]] ^ crc1[p[3]] ^ (UInt32)GetUi16(p+1))
}

GetHeads_DECL(4b)
{
  UInt32 crc0[256];
  {
    unsigned i;
    for (i = 0; i < 256; i++)
      crc0[i] = crc[i] & hashMask;
  }
  GetHeads_LOOP(crc0[p[0]] ^ GetUi24hi_from32(p))
}

GetHeads_DECL(5)
{
  UInt32 crc0[256];
  UInt32 crc1[256];
  UInt32 crc2[256];
  {
    unsigned i;
    for (i = 0; i < 256; i++)
    {
      UInt32 v = crc[i];
      crc0[i] = v & hashMask;
      crc1[i] = (v << kLzHash_CrcShift_1) & hashMask;
      crc2[i] = (v << kLzHash_CrcShift_2) & hashMask;
    }
  }
  GetHeads_LOOP(crc0[p[0]] ^ crc1[p[3]] ^ crc2[p[4]] ^ (UInt32)GetUi16(p+1))
}

GetHeads_DECL(5b)
{
  UInt32 crc0[256];
  UInt32 crc1[256];
  {
    unsigned i;
    for (i = 0; i < 256; i++)
    {
      UInt32 v = crc[i];
      crc0[i] = v & hashMask;
      crc1[i] = (v << kLzHash_CrcShift_1) & hashMask;
    }
  }
  GetHeads_LOOP(crc0[p[0]] ^ crc1[p[4]] ^ GetUi24hi_from32(p))
}

#else

DEF_GetHeads(4,  (crc[p[0]] ^ (crc[p[3]] << kLzHash_CrcShift_1) ^ (UInt32)GetUi16(p+1)) & hashMask)
DEF_GetHeads(4b, (crc[p[0]] ^ GetUi24hi_from32(p)) & hashMask)
DEF_GetHeads(5,  (crc[p[0]] ^ (crc[p[3]] << kLzHash_CrcShift_1) ^ (crc[p[4]] << kLzHash_CrcShift_2) ^ (UInt32)GetUi16(p + 1)) & hashMask)
DEF_GetHeads(5b, (crc[p[0]] ^ (crc[p[4]] << kLzHash_CrcShift_1) ^ GetUi24hi_from32(p)) & hashMask)

#endif


static void HashThreadFunc(CMatchFinderMt *mt)
{
  CMtSync *p = &mt->hashSync;
  for (;;)
  {
    UInt32 numProcessedBlocks = 0;
    Event_Wait(&p->canStart);
    Event_Set(&p->wasStarted);

    MatchFinder_Init_HighHash(mt->MatchFinder);

    for (;;)
    {
      if (p->exit)
        return;
      if (p->stopWriting)
      {
        p->numProcessedBlocks = numProcessedBlocks;
        Event_Set(&p->wasStopped);
        break;
      }

      {
        CMatchFinder *mf = mt->MatchFinder;
        if (MatchFinder_NeedMove(mf))
        {
          CriticalSection_Enter(&mt->btSync.cs);
          CriticalSection_Enter(&mt->hashSync.cs);
          {
            const Byte *beforePtr = Inline_MatchFinder_GetPointerToCurrentPos(mf);
            ptrdiff_t offset;
            MatchFinder_MoveBlock(mf);
            offset = beforePtr - Inline_MatchFinder_GetPointerToCurrentPos(mf);
            mt->pointerToCurPos -= offset;
            mt->buffer -= offset;
          }
          CriticalSection_Leave(&mt->btSync.cs);
          CriticalSection_Leave(&mt->hashSync.cs);
          continue;
        }

        Semaphore_Wait(&p->freeSemaphore);

        MatchFinder_ReadIfRequired(mf);
        if (mf->pos > (kMtMaxValForNormalize - kMtHashBlockSize))
        {
          UInt32 subValue = (mf->pos - mf->historySize - 1);
          MatchFinder_ReduceOffsets(mf, subValue);
          MatchFinder_Normalize3(subValue, mf->hash + mf->fixedHashSize, (size_t)mf->hashMask + 1);
        }
        {
          UInt32 *heads = mt->hashBuf + ((numProcessedBlocks++) & kMtHashNumBlocksMask) * kMtHashBlockSize;
          UInt32 num = mf->streamPos - mf->pos;
          heads[0] = 2;
          heads[1] = num;
          if (num >= mf->numHashBytes)
          {
            num = num - mf->numHashBytes + 1;
            if (num > kMtHashBlockSize - 2)
              num = kMtHashBlockSize - 2;
            mt->GetHeadsFunc(mf->buffer, mf->pos, mf->hash + mf->fixedHashSize, mf->hashMask, heads + 2, num, mf->crc);
            heads[0] = 2 + num;
          }
          mf->pos += num;
          mf->buffer += num;
        }
      }

      Semaphore_Release1(&p->filledSemaphore);
    }
  }
}

static void MatchFinderMt_GetNextBlock_Hash(CMatchFinderMt *p)
{
  MtSync_GetNextBlock(&p->hashSync);
  p->hashBufPosLimit = p->hashBufPos = ((p->hashSync.numProcessedBlocks - 1) & kMtHashNumBlocksMask) * kMtHashBlockSize;
  p->hashBufPosLimit += p->hashBuf[p->hashBufPos++];
  p->hashNumAvail = p->hashBuf[p->hashBufPos++];
}

#define kEmptyHashValue 0

#define MFMT_GM_INLINE

#ifdef MFMT_GM_INLINE

/*
  we use size_t for _cyclicBufferPos instead of UInt32
  to eliminate "movsx" BUG in old MSVC x64 compiler.
*/

MY_NO_INLINE
static UInt32 *GetMatchesSpecN(UInt32 lenLimit, UInt32 pos, const Byte *cur, CLzRef *son,
    size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 _cutValue,
    UInt32 *d, UInt32 _maxLen, const UInt32 *hash, const UInt32 *limit, UInt32 size, UInt32 *posRes)
{
  do
  {
  UInt32 *_distances = ++d;
  UInt32 delta = *hash++;

  CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1;
  CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1);
  unsigned len0 = 0, len1 = 0;
  UInt32 cutValue = _cutValue;
  unsigned maxLen = (unsigned)_maxLen;

  /*
  #define PREF_STEP 1
  if (size > PREF_STEP)
  {
    UInt32 delta = hash[PREF_STEP - 1];
    if (delta < _cyclicBufferSize)
    {
      size_t cyc1 = _cyclicBufferPos + PREF_STEP;
      CLzRef *pair = son + ((size_t)(cyc1 - delta + ((delta > cyc1) ? _cyclicBufferSize : 0)) << 1);
      Byte b = *(cur + PREF_STEP - delta);
      _distances[0] = pair[0];
      _distances[1] = b;
    }
  }
  */
  if (cutValue == 0 || delta >= _cyclicBufferSize)
  {
    *ptr0 = *ptr1 = kEmptyHashValue;
  }
  else
  for (LOG_ITER(g_NumIters_Tree++);;)
  {
    LOG_ITER(g_NumIters_Loop++);
    {
      CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((_cyclicBufferPos < delta) ? _cyclicBufferSize : 0)) << 1);
      const Byte *pb = cur - delta;
      unsigned len = (len0 < len1 ? len0 : len1);
      UInt32 pair0 = *pair;
      if (pb[len] == cur[len])
      {
        if (++len != lenLimit && pb[len] == cur[len])
          while (++len != lenLimit)
            if (pb[len] != cur[len])
              break;
        if (maxLen < len)
        {
          maxLen = len;
          *d++ = (UInt32)len;
          *d++ = delta - 1;
          if (len == lenLimit)
          {
            UInt32 pair1 = pair[1];
            *ptr1 = pair0;
            *ptr0 = pair1;
            break;
          }
        }
      }
      {
        UInt32 curMatch = pos - delta;
        // delta = pos - *pair;
        // delta = pos - pair[((UInt32)pb[len] - (UInt32)cur[len]) >> 31];
        if (pb[len] < cur[len])
        {
          delta = pos - pair[1];
          *ptr1 = curMatch;
          ptr1 = pair + 1;
          len1 = len;
        }
        else
        {
          delta = pos - *pair;
          *ptr0 = curMatch;
          ptr0 = pair;
          len0 = len;
        }
      }
    }
    if (--cutValue == 0 || delta >= _cyclicBufferSize)
    {
      *ptr0 = *ptr1 = kEmptyHashValue;
      break;
    }
  }
  pos++;
  _cyclicBufferPos++;
  cur++;
  {
    UInt32 num = (UInt32)(d - _distances);
    _distances[-1] = num;
  }
  }
  while (d < limit && --size != 0);
  *posRes = pos;
  return d;
}

#endif


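/*
  BtGetMatches() fills one btBuf block of kMtBtBlockSize UInt32 words.
  Block layout (as written here and read back in
  MatchFinderMt_GetNextBlock_Bt / MatchFinderMt_GetMatches):
    d[0] : number of UInt32 words used in the block
    d[1] : number of bytes available from the block's first position
    then, per processed position, one record:
      numWords, len_1, dist_1 - 1, len_2, dist_2 - 1, ...
    where numWords counts the UInt32 words that follow it in the record.
*/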
static void BtGetMatches(CMatchFinderMt *p, UInt32 *d)
{
  UInt32 numProcessed = 0;
  UInt32 curPos = 2;
  UInt32 limit = kMtBtBlockSize - (p->matchMaxLen * 2); //  * 2

  d[1] = p->hashNumAvail;

  while (curPos < limit)
  {
    if (p->hashBufPos == p->hashBufPosLimit)
    {
      MatchFinderMt_GetNextBlock_Hash(p);
      d[1] = numProcessed + p->hashNumAvail;
      if (p->hashNumAvail >= p->numHashBytes)
        continue;
      d[0] = curPos + p->hashNumAvail;
      d += curPos;
      for (; p->hashNumAvail != 0; p->hashNumAvail--)
        *d++ = 0;
      return;
    }
    {
      UInt32 size = p->hashBufPosLimit - p->hashBufPos;
      UInt32 lenLimit = p->matchMaxLen;
      UInt32 pos = p->pos;
      UInt32 cyclicBufferPos = p->cyclicBufferPos;
      if (lenLimit >= p->hashNumAvail)
        lenLimit = p->hashNumAvail;
      {
        UInt32 size2 = p->hashNumAvail - lenLimit + 1;
        if (size2 < size)
          size = size2;
        size2 = p->cyclicBufferSize - cyclicBufferPos;
        if (size2 < size)
          size = size2;
      }

      #ifndef MFMT_GM_INLINE
      while (curPos < limit && size-- != 0)
      {
        UInt32 *startDistances = d + curPos;
        UInt32 num = (UInt32)(GetMatchesSpec1(lenLimit, pos - p->hashBuf[p->hashBufPos++],
            pos, p->buffer, p->son, cyclicBufferPos, p->cyclicBufferSize, p->cutValue,
            startDistances + 1, p->numHashBytes - 1) - startDistances);
        *startDistances = num - 1;
        curPos += num;
        cyclicBufferPos++;
        pos++;
        p->buffer++;
      }
      #else
      {
        UInt32 posRes;
        curPos = (UInt32)(GetMatchesSpecN(lenLimit, pos, p->buffer, p->son, cyclicBufferPos, p->cyclicBufferSize, p->cutValue,
            d + curPos, p->numHashBytes - 1, p->hashBuf + p->hashBufPos,
            d + limit,
            size, &posRes) - d);
        p->hashBufPos += posRes - pos;
        cyclicBufferPos += posRes - pos;
        p->buffer += posRes - pos;
        pos = posRes;
      }
      #endif

      numProcessed += pos - p->pos;
      p->hashNumAvail -= pos - p->pos;
      p->pos = pos;
      if (cyclicBufferPos == p->cyclicBufferSize)
        cyclicBufferPos = 0;
      p->cyclicBufferPos = cyclicBufferPos;
    }
  }

  d[0] = curPos;
}

static void BtFillBlock(CMatchFinderMt *p, UInt32 globalBlockIndex)
{
  CMtSync *sync = &p->hashSync;
  if (!sync->needStart)
  {
    CriticalSection_Enter(&sync->cs);
    sync->csWasEntered = True;
  }

  BtGetMatches(p, p->btBuf + (globalBlockIndex & kMtBtNumBlocksMask) * kMtBtBlockSize);

  if (p->pos > kMtMaxValForNormalize - kMtBtBlockSize)
  {
    UInt32 subValue = p->pos - p->cyclicBufferSize;
    MatchFinder_Normalize3(subValue, p->son, (size_t)p->cyclicBufferSize * 2);
    p->pos -= subValue;
  }

  if (!sync->needStart)
  {
    CriticalSection_Leave(&sync->cs);
    sync->csWasEntered = False;
  }
}

static void BtThreadFunc(CMatchFinderMt *mt)
{
  CMtSync *p = &mt->btSync;
  for (;;)
  {
    UInt32 blockIndex = 0;
    Event_Wait(&p->canStart);
    Event_Set(&p->wasStarted);
    for (;;)
    {
      if (p->exit)
        return;
      if (p->stopWriting)
      {
        p->numProcessedBlocks = blockIndex;
        MtSync_StopWriting(&mt->hashSync);
        Event_Set(&p->wasStopped);
        break;
      }
      Semaphore_Wait(&p->freeSemaphore);
      BtFillBlock(mt, blockIndex++);
      Semaphore_Release1(&p->filledSemaphore);
    }
  }
}

void MatchFinderMt_Construct(CMatchFinderMt *p)
{
  p->hashBuf = NULL;
  MtSync_Construct(&p->hashSync);
  MtSync_Construct(&p->btSync);
}

static void MatchFinderMt_FreeMem(CMatchFinderMt *p, ISzAllocPtr alloc)
{
  ISzAlloc_Free(alloc, p->hashBuf);
  p->hashBuf = NULL;
}

void MatchFinderMt_Destruct(CMatchFinderMt *p, ISzAllocPtr alloc)
{
  MtSync_Destruct(&p->hashSync);
  MtSync_Destruct(&p->btSync);

  LOG_ITER(
  printf("\nTree %9d * %7d iter = %9d sum \n",
      (UInt32)(g_NumIters_Tree / 1000),
      (UInt32)(((UInt64)g_NumIters_Loop * 1000) / (g_NumIters_Tree + 1)),
      (UInt32)(g_NumIters_Loop / 1000)
      ));

  MatchFinderMt_FreeMem(p, alloc);
}

#define kHashBufferSize (kMtHashBlockSize * kMtHashNumBlocks)
#define kBtBufferSize (kMtBtBlockSize * kMtBtNumBlocks)
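/* With the constants above: kHashBufferSize = 256 Ki and kBtBufferSize = 1 Mi
   UInt32 words, so the single allocation in MatchFinderMt_Create() is
   (kHashBufferSize + kBtBufferSize) * sizeof(UInt32) = 5 MiB. */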

static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE HashThreadFunc2(void *p) { HashThreadFunc((CMatchFinderMt *)p);  return 0; }
static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE BtThreadFunc2(void *p)
{
  Byte allocaDummy[0x180];
  unsigned i = 0;
  for (i = 0; i < 16; i++)
    allocaDummy[i] = (Byte)0;
  if (allocaDummy[0] == 0)
    BtThreadFunc((CMatchFinderMt *)p);
  return 0;
}

SRes MatchFinderMt_Create(CMatchFinderMt *p, UInt32 historySize, UInt32 keepAddBufferBefore,
    UInt32 matchMaxLen, UInt32 keepAddBufferAfter, ISzAllocPtr alloc)
{
  CMatchFinder *mf = p->MatchFinder;
  p->historySize = historySize;
  if (kMtBtBlockSize <= matchMaxLen * 4)
    return SZ_ERROR_PARAM;
  if (!p->hashBuf)
  {
    p->hashBuf = (UInt32 *)ISzAlloc_Alloc(alloc, (kHashBufferSize + kBtBufferSize) * sizeof(UInt32));
    if (!p->hashBuf)
      return SZ_ERROR_MEM;
    p->btBuf = p->hashBuf + kHashBufferSize;
  }
  keepAddBufferBefore += (kHashBufferSize + kBtBufferSize);
  keepAddBufferAfter += kMtHashBlockSize;
  if (!MatchFinder_Create(mf, historySize, keepAddBufferBefore, matchMaxLen, keepAddBufferAfter, alloc))
    return SZ_ERROR_MEM;

  RINOK(MtSync_Create(&p->hashSync, HashThreadFunc2, p, kMtHashNumBlocks));
  RINOK(MtSync_Create(&p->btSync, BtThreadFunc2, p, kMtBtNumBlocks));
  return SZ_OK;
}

/* Call it after ReleaseStream / SetStream */
static void MatchFinderMt_Init(CMatchFinderMt *p)
{
  CMatchFinder *mf = p->MatchFinder;

  p->btBufPos =
  p->btBufPosLimit = 0;
  p->hashBufPos =
  p->hashBufPosLimit = 0;

  /* Init without data reading. We don't want to read data in this thread */
  MatchFinder_Init_3(mf, False);
  MatchFinder_Init_LowHash(mf);

  p->pointerToCurPos = Inline_MatchFinder_GetPointerToCurrentPos(mf);
  p->btNumAvailBytes = 0;
  p->lzPos = p->historySize + 1;

  p->hash = mf->hash;
  p->fixedHashSize = mf->fixedHashSize;
  // p->hash4Mask = mf->hash4Mask;
  p->crc = mf->crc;

  p->son = mf->son;
  p->matchMaxLen = mf->matchMaxLen;
  p->numHashBytes = mf->numHashBytes;
  p->pos = mf->pos;
  p->buffer = mf->buffer;
  p->cyclicBufferPos = mf->cyclicBufferPos;
  p->cyclicBufferSize = mf->cyclicBufferSize;
  p->cutValue = mf->cutValue;
}

/* ReleaseStream is required to finish multithreading */
void MatchFinderMt_ReleaseStream(CMatchFinderMt *p)
{
  MtSync_StopWriting(&p->btSync);
  /* p->MatchFinder->ReleaseStream(); */
}


MY_NO_INLINE
static void MatchFinderMt_GetNextBlock_Bt(CMatchFinderMt *p)
{
  UInt32 blockIndex, k;

  MtSync_GetNextBlock(&p->btSync);

  blockIndex = ((p->btSync.numProcessedBlocks - 1) & kMtBtNumBlocksMask);
  k = blockIndex * kMtBtBlockSize;
  p->btBufPosLimit = k + p->btBuf[k];
  p->btNumAvailBytes = p->btBuf[k + 1];
  p->btBufPos = k + 2;
  if (p->lzPos >= kMtMaxValForNormalize - kMtBtBlockSize)
  {
    MatchFinder_Normalize3(p->lzPos - p->historySize - 1, p->hash, p->fixedHashSize);
    p->lzPos = p->historySize + 1;
  }
}

static const Byte * MatchFinderMt_GetPointerToCurrentPos(CMatchFinderMt *p)
{
  return p->pointerToCurPos;
}

#define GET_NEXT_BLOCK_IF_REQUIRED if (p->btBufPos == p->btBufPosLimit) MatchFinderMt_GetNextBlock_Bt(p);

static UInt32 MatchFinderMt_GetNumAvailableBytes(CMatchFinderMt *p)
{
  GET_NEXT_BLOCK_IF_REQUIRED;
  return p->btNumAvailBytes;
}

static UInt32 * MixMatches2(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d)
{
  UInt32 h2, c2;
  UInt32 *hash = p->hash;
  const Byte *cur = p->pointerToCurPos;
  UInt32 m = p->lzPos;
  MT_HASH2_CALC

  c2 = hash[h2];
  hash[h2] = m;

  if (c2 >= matchMinPos)
    if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m] == cur[0])
    {
      *d++ = 2;
      *d++ = m - c2 - 1;
    }

  return d;
}

static UInt32 * MixMatches3(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d)
{
  UInt32 h2, h3, c2, c3;
  UInt32 *hash = p->hash;
  const Byte *cur = p->pointerToCurPos;
  UInt32 m = p->lzPos;
  MT_HASH3_CALC

  c2 = hash[h2];
  c3 = (hash + kFix3HashSize)[h3];

  hash[h2] = m;
  (hash + kFix3HashSize)[h3] = m;

  if (c2 >= matchMinPos && cur[(ptrdiff_t)c2 - (ptrdiff_t)m] == cur[0])
  {
    d[1] = m - c2 - 1;
    if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m + 2] == cur[2])
    {
      d[0] = 3;
      return d + 2;
    }
    d[0] = 2;
    d += 2;
  }

  if (c3 >= matchMinPos && cur[(ptrdiff_t)c3 - (ptrdiff_t)m] == cur[0])
  {
    *d++ = 3;
    *d++ = m - c3 - 1;
  }

  return d;
}


#define INCREASE_LZ_POS p->lzPos++; p->pointerToCurPos++;

/*
static
UInt32 MatchFinderMt_GetMatches_Bt4(CMatchFinderMt *p, UInt32 *d)
{
  UInt32 pos = p->btBufPos;
  const UInt32 *bt = p->btBuf + pos;
  UInt32 len = *bt++;
  UInt32 matchMinPos;
  const UInt32 *d_base = d;
  UInt32 avail = p->btNumAvailBytes - 1;
  p->btBufPos = pos + 1 + len;

  {
    UInt32 temp1 = p->historySize;
    p->btNumAvailBytes = avail;

    #define BT_HASH_BYTES_MAX 5

    if (len != 0)
      temp1 = bt[1];
    else if (avail < (BT_HASH_BYTES_MAX - 2))
    {
      INCREASE_LZ_POS
      return 0;
    }
    matchMinPos = p->lzPos - temp1;
  }

  for (;;)
  {

  UInt32 h2, h3, c2, c3;
  UInt32 *hash = p->hash;
  const Byte *cur = p->pointerToCurPos;
  UInt32 m = p->lzPos;
  MT_HASH3_CALC

  c2 = hash[h2];
  c3 = (hash + kFix3HashSize)[h3];

  hash[h2] = m;
  (hash + kFix3HashSize)[h3] = m;

  if (c2 >= matchMinPos && cur[(ptrdiff_t)c2 - (ptrdiff_t)m] == cur[0])
  {
    d[1] = m - c2 - 1;
    if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m + 2] == cur[2])
    {
      d[0] = 3;
      d += 2;
      break;
    }
    // else
    {
      d[0] = 2;
      d += 2;
    }
  }
  if (c3 >= matchMinPos && cur[(ptrdiff_t)c3 - (ptrdiff_t)m] == cur[0])
  {
    *d++ = 3;
    *d++ = m - c3 - 1;
  }
  break;
  }

  if (len != 0)
  {
    do
    {
      UInt32 v0 = bt[0];
      UInt32 v1 = bt[1];
      bt += 2;
      d[0] = v0;
      d[1] = v1;
      d += 2;
    }
    while ((len -= 2) != 0);
  }
  INCREASE_LZ_POS
  return (UInt32)(d - d_base);
}
*/


static UInt32 *MixMatches4(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d)
{
  UInt32 h2, h3, /* h4, */ c2, c3 /* , c4 */;
  UInt32 *hash = p->hash;
  const Byte *cur = p->pointerToCurPos;
  UInt32 m = p->lzPos;
  MT_HASH3_CALC
  // MT_HASH4_CALC
  c2 = hash[h2];
  c3 = (hash + kFix3HashSize)[h3];
  // c4 = (hash + kFix4HashSize)[h4];

  hash[h2] = m;
  (hash + kFix3HashSize)[h3] = m;
  // (hash + kFix4HashSize)[h4] = m;

  #define _USE_H2

  #ifdef _USE_H2
  if (c2 >= matchMinPos && cur[(ptrdiff_t)c2 - (ptrdiff_t)m] == cur[0])
  {
    d[1] = m - c2 - 1;
    if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m + 2] == cur[2])
    {
      // d[0] = (cur[(ptrdiff_t)c2 - (ptrdiff_t)m + 3] == cur[3]) ? 4 : 3;
      // return d + 2;

      if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m + 3] == cur[3])
      {
        d[0] = 4;
        return d + 2;
      }
      d[0] = 3;
      d += 2;

      #ifdef _USE_H4
      if (c4 >= matchMinPos)
        if (
          cur[(ptrdiff_t)c4 - (ptrdiff_t)m]     == cur[0] &&
          cur[(ptrdiff_t)c4 - (ptrdiff_t)m + 3] == cur[3]
          )
      {
        *d++ = 4;
        *d++ = m - c4 - 1;
      }
      #endif
      return d;
    }
    d[0] = 2;
    d += 2;
  }
  #endif

  if (c3 >= matchMinPos && cur[(ptrdiff_t)c3 - (ptrdiff_t)m] == cur[0])
  {
    d[1] = m - c3 - 1;
    if (cur[(ptrdiff_t)c3 - (ptrdiff_t)m + 3] == cur[3])
    {
      d[0] = 4;
      return d + 2;
    }
    d[0] = 3;
    d += 2;
  }

  #ifdef _USE_H4
  if (c4 >= matchMinPos)
    if (
      cur[(ptrdiff_t)c4 - (ptrdiff_t)m]     == cur[0] &&
      cur[(ptrdiff_t)c4 - (ptrdiff_t)m + 3] == cur[3]
      )
    {
      *d++ = 4;
      *d++ = m - c4 - 1;
    }
  #endif

  return d;
}


static UInt32 MatchFinderMt2_GetMatches(CMatchFinderMt *p, UInt32 *d)
{
  const UInt32 *bt = p->btBuf + p->btBufPos;
  UInt32 len = *bt++;
  p->btBufPos += 1 + len;
  p->btNumAvailBytes--;
  {
    UInt32 i;
    for (i = 0; i < len; i += 2)
    {
      UInt32 v0 = bt[0];
      UInt32 v1 = bt[1];
      bt += 2;
      d[0] = v0;
      d[1] = v1;
      d += 2;
    }
  }
  INCREASE_LZ_POS
  return len;
}



static UInt32 MatchFinderMt_GetMatches(CMatchFinderMt *p, UInt32 *d)
{
  UInt32 pos = p->btBufPos;
  const UInt32 *bt = p->btBuf + pos;
  UInt32 len = *bt++;
  UInt32 avail = p->btNumAvailBytes - 1;
  p->btNumAvailBytes = avail;
  p->btBufPos = pos + 1 + len;
  if (len == 0)
  {
    #define BT_HASH_BYTES_MAX 5
    if (avail >= (BT_HASH_BYTES_MAX - 1) - 1)
      len = (UInt32)(p->MixMatchesFunc(p, p->lzPos - p->historySize, d) - d);
  }
  else
  {
    /*
      first match pair from BinTree: (match_len, match_dist),
      (match_len >= numHashBytes).
      MixMatchesFunc() inserts only hash matches that are nearer than (match_dist)
    */
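    /* So d2 first receives the nearer short hash matches (from MixMatchesFunc),
       and the (length, distance - 1) pairs from the binary-tree thread are
       appended after them, shorter matches first. */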
    UInt32 *d2;
    d2 = p->MixMatchesFunc(p, p->lzPos - bt[1], d);
    do
    {
      UInt32 v0 = bt[0];
      UInt32 v1 = bt[1];
      bt += 2;
      d2[0] = v0;
      d2[1] = v1;
      d2 += 2;
    }
    while ((len -= 2) != 0);
    len = (UInt32)(d2 - d);
  }
  INCREASE_LZ_POS
  return len;
}

#define SKIP_HEADER2_MT  do { GET_NEXT_BLOCK_IF_REQUIRED
#define SKIP_HEADER_MT(n) SKIP_HEADER2_MT if (p->btNumAvailBytes-- >= (n)) { const Byte *cur = p->pointerToCurPos; UInt32 *hash = p->hash;
#define SKIP_FOOTER_MT } INCREASE_LZ_POS p->btBufPos += p->btBuf[p->btBufPos] + 1; } while (--num != 0);

static void MatchFinderMt0_Skip(CMatchFinderMt *p, UInt32 num)
{
  SKIP_HEADER2_MT { p->btNumAvailBytes--;
  SKIP_FOOTER_MT
}

static void MatchFinderMt2_Skip(CMatchFinderMt *p, UInt32 num)
{
  SKIP_HEADER_MT(2)
      UInt32 h2;
      MT_HASH2_CALC
      hash[h2] = p->lzPos;
  SKIP_FOOTER_MT
}

static void MatchFinderMt3_Skip(CMatchFinderMt *p, UInt32 num)
{
  SKIP_HEADER_MT(3)
      UInt32 h2, h3;
      MT_HASH3_CALC
      (hash + kFix3HashSize)[h3] =
      hash[                h2] =
        p->lzPos;
  SKIP_FOOTER_MT
}

static void MatchFinderMt4_Skip(CMatchFinderMt *p, UInt32 num)
{
  SKIP_HEADER_MT(4)
      UInt32 h2, h3 /*, h4 */;
      MT_HASH3_CALC
      // MT_HASH4_CALC
      // (hash + kFix4HashSize)[h4] =
      (hash + kFix3HashSize)[h3] =
      hash[                h2] =
        p->lzPos;
  SKIP_FOOTER_MT
}

void MatchFinderMt_CreateVTable(CMatchFinderMt *p, IMatchFinder *vTable)
{
  vTable->Init = (Mf_Init_Func)MatchFinderMt_Init;
  vTable->GetNumAvailableBytes = (Mf_GetNumAvailableBytes_Func)MatchFinderMt_GetNumAvailableBytes;
  vTable->GetPointerToCurrentPos = (Mf_GetPointerToCurrentPos_Func)MatchFinderMt_GetPointerToCurrentPos;
  vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt_GetMatches;

  switch (p->MatchFinder->numHashBytes)
  {
    case 2:
      p->GetHeadsFunc = GetHeads2;
      p->MixMatchesFunc = (Mf_Mix_Matches)NULL;
      vTable->Skip = (Mf_Skip_Func)MatchFinderMt0_Skip;
      vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt2_GetMatches;
      break;
    case 3:
      p->GetHeadsFunc = p->MatchFinder->bigHash ? GetHeads3b : GetHeads3;
      p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches2;
      vTable->Skip = (Mf_Skip_Func)MatchFinderMt2_Skip;
      break;
    case 4:
      p->GetHeadsFunc = p->MatchFinder->bigHash ? GetHeads4b : GetHeads4;

      // it's fast inline version of GetMatches()
      // vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt_GetMatches_Bt4;

      p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches3;
      vTable->Skip = (Mf_Skip_Func)MatchFinderMt3_Skip;
      break;
    default:
      p->GetHeadsFunc = p->MatchFinder->bigHash ? GetHeads5b : GetHeads5;
      p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches4;
      vTable->Skip = (Mf_Skip_Func)MatchFinderMt4_Skip;
      break;
  }
}