1 /* LzFindMt.c -- multithreaded Match finder for LZ algorithms
2 2021-04-01 : Igor Pavlov : Public domain */
3
4 #include "Precomp.h"
5
6 #include "CpuArch.h"
7
8 #include "LzHash.h"
9 #include "LzFindMt.h"
10
11 // #define LOG_ITERS
12
13 #ifdef LOG_ITERS
14 #include <stdio.h>
15 static UInt64 g_NumIters_Tree;
16 static UInt64 g_NumIters_Loop;
17 #define LOG_ITER(x) x
18 #else
19 #define LOG_ITER(x)
20 #endif
21
/* hash producer ring: kMtHashNumBlocks blocks of kMtHashBlockSize UInt32s;
   the mask wraps a running block counter onto the ring */
#define kMtHashBlockSize (1 << 17)
#define kMtHashNumBlocks (1 << 1)
#define kMtHashNumBlocksMask (kMtHashNumBlocks - 1)

/* binary-tree (match) producer ring, same layout idea as the hash ring */
#define kMtBtBlockSize (1 << 16)
#define kMtBtNumBlocks (1 << 4)
#define kMtBtNumBlocksMask (kMtBtNumBlocks - 1)
29
30 /*
31 HASH functions:
32 We use raw 8/16 bits from a[1] and a[2],
33 xored with crc(a[0]) and crc(a[3]).
34 We check a[0], a[3] only. We don't need to compare a[1] and a[2] in matches.
35 our crc() function provides one-to-one correspondence for low 8-bit values:
36 (crc[0...0xFF] & 0xFF) <-> [0...0xFF]
37 */
38
/* 2-byte hash: crc-mixed cur[0] xor raw cur[1] (see HASH notes above) */
#define MT_HASH2_CALC \
  h2 = (p->crc[cur[0]] ^ cur[1]) & (kHash2Size - 1);

/* computes the 2-byte and 3-byte hash values from one shared temp */
#define MT_HASH3_CALC { \
  UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
  h2 = temp & (kHash2Size - 1); \
  h3 = (temp ^ ((UInt32)cur[2] << 8)) & (kHash3Size - 1); }
46
47 /*
48 #define MT_HASH3_CALC__NO_2 { \
49 UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
50 h3 = (temp ^ ((UInt32)cur[2] << 8)) & (kHash3Size - 1); }
51
52 #define __MT_HASH4_CALC { \
53 UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
54 h2 = temp & (kHash2Size - 1); \
55 temp ^= ((UInt32)cur[2] << 8); \
56 h3 = temp & (kHash3Size - 1); \
57 h4 = (temp ^ (p->crc[cur[3]] << kLzHash_CrcShift_1)) & p->hash4Mask; }
58 // (kHash4Size - 1);
59 */
60
61
/* Puts all CMtSync fields into the "not created / not running" state.
   No OS objects are allocated here; that happens in MtSync_Create2(). */
static void MtSync_Construct(CMtSync *p)
{
  p->wasCreated = False;
  p->csWasInitialized = False;
  p->csWasEntered = False;
  Thread_Construct(&p->thread);
  Event_Construct(&p->canStart);
  Event_Construct(&p->wasStarted);
  Event_Construct(&p->wasStopped);
  Semaphore_Construct(&p->freeSemaphore);
  Semaphore_Construct(&p->filledSemaphore);
  p->affinity = 0;
}
75
76
MY_NO_INLINE
/* Consumer side: advance to the next filled block.
   On the first call after (re)start it wakes the worker thread; on later
   calls it returns the previously consumed block to the producer.
   On return the caller holds the critical section (csWasEntered == True)
   and one filled block is available for reading. */
static void MtSync_GetNextBlock(CMtSync *p)
{
  if (p->needStart)
  {
    // first block of a new run: reset run state, then start the worker
    // and wait until it acknowledges the start
    p->numProcessedBlocks = 1;
    p->needStart = False;
    p->stopWriting = False;
    p->exit = False;
    Event_Reset(&p->wasStarted);
    Event_Reset(&p->wasStopped);

    Event_Set(&p->canStart);
    Event_Wait(&p->wasStarted);

    // if (mt) MatchFinder_Init_LowHash(mt->MatchFinder);
  }
  else
  {
    // hand the block we just finished reading back to the producer
    CriticalSection_Leave(&p->cs);
    p->csWasEntered = False;
    p->numProcessedBlocks++;
    Semaphore_Release1(&p->freeSemaphore);
  }
  // block until the producer has filled the next block, then lock it
  Semaphore_Wait(&p->filledSemaphore);
  CriticalSection_Enter(&p->cs);
  p->csWasEntered = True;
}
105
/* MtSync_StopWriting must be called if Writing was started */

/* Signals the worker to stop, then drains every block it filled after we
   stopped reading, so both semaphores return to their initial counts.
   Sets needStart so the next MtSync_GetNextBlock restarts the worker. */
static void MtSync_StopWriting(CMtSync *p)
{
  UInt32 myNumBlocks = p->numProcessedBlocks;
  if (!Thread_WasCreated(&p->thread) || p->needStart)
    return;
  p->stopWriting = True;
  if (p->csWasEntered)
  {
    CriticalSection_Leave(&p->cs);
    p->csWasEntered = False;
  }
  // release one free slot so a worker blocked in Semaphore_Wait can wake
  // up and observe stopWriting
  Semaphore_Release1(&p->freeSemaphore);

  Event_Wait(&p->wasStopped);

  // consume blocks the worker produced beyond what we read, restoring
  // the semaphore counters to their creation-time values
  while (myNumBlocks++ != p->numProcessedBlocks)
  {
    Semaphore_Wait(&p->filledSemaphore);
    Semaphore_Release1(&p->freeSemaphore);
  }
  p->needStart = True;
}
130
/* Stops and joins the worker thread (if any), then closes all OS objects.
   Safe to call on a partially created CMtSync. */
static void MtSync_Destruct(CMtSync *p)
{
  if (Thread_WasCreated(&p->thread))
  {
    MtSync_StopWriting(p);
    p->exit = True;
    // the worker waits on canStart between runs; wake it so it can see
    // the exit flag and return
    if (p->needStart)
      Event_Set(&p->canStart);
    Thread_Wait_Close(&p->thread);
  }
  if (p->csWasInitialized)
  {
    CriticalSection_Delete(&p->cs);
    p->csWasInitialized = False;
  }

  Event_Close(&p->canStart);
  Event_Close(&p->wasStarted);
  Event_Close(&p->wasStopped);
  Semaphore_Close(&p->freeSemaphore);
  Semaphore_Close(&p->filledSemaphore);

  p->wasCreated = False;
}
155
/* maps any non-zero WRes from threading primitives to SZ_ERROR_THREAD */
#define RINOK_THREAD(x) { if ((x) != 0) return SZ_ERROR_THREAD; }

/* Creates the synchronization objects and the worker thread.
   freeSemaphore starts full (numBlocks) and filledSemaphore starts empty.
   On failure the object may be partially created; the caller
   (MtSync_Create) is responsible for cleanup via MtSync_Destruct. */
static SRes MtSync_Create2(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *obj, UInt32 numBlocks)
{
  WRes wres;
  if (p->wasCreated)
    return SZ_OK;

  RINOK_THREAD(CriticalSection_Init(&p->cs));
  p->csWasInitialized = True;

  RINOK_THREAD(AutoResetEvent_CreateNotSignaled(&p->canStart));
  RINOK_THREAD(AutoResetEvent_CreateNotSignaled(&p->wasStarted));
  RINOK_THREAD(AutoResetEvent_CreateNotSignaled(&p->wasStopped));

  RINOK_THREAD(Semaphore_Create(&p->freeSemaphore, numBlocks, numBlocks));
  RINOK_THREAD(Semaphore_Create(&p->filledSemaphore, 0, numBlocks));

  p->needStart = True;

  if (p->affinity != 0)
    wres = Thread_Create_With_Affinity(&p->thread, startAddress, obj, (CAffinityMask)p->affinity);
  else
    wres = Thread_Create(&p->thread, startAddress, obj);
  RINOK_THREAD(wres);
  p->wasCreated = True;
  return SZ_OK;
}
184
MtSync_Create(CMtSync * p,THREAD_FUNC_TYPE startAddress,void * obj,UInt32 numBlocks)185 static SRes MtSync_Create(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *obj, UInt32 numBlocks)
186 {
187 SRes res = MtSync_Create2(p, startAddress, obj, numBlocks);
188 if (res != SZ_OK)
189 MtSync_Destruct(p);
190 return res;
191 }
192
193 // static void MtSync_Init(CMtSync *p) { p->needStart = True; }
194
195 #define kMtMaxValForNormalize 0xFFFFFFFF
196 // #define kMtMaxValForNormalize ((1 << 25) + (1 << 20))
197
198
#ifdef MY_CPU_LE_UNALIGN
/* bytes p[1..3] combined as a 24-bit little-endian value (fast path) */
#define GetUi24hi_from32(p) ((UInt32)GetUi32(p) >> 8)
#else
#define GetUi24hi_from32(p) ((p)[1] ^ ((UInt32)(p)[2] << 8) ^ ((UInt32)(p)[3] << 16))
#endif

/* declares GetHeads<name>(): for numHeads consecutive positions, writes
   into heads[] the distance back to the previous position with the same
   hash value, and records the current position in hash[] */
#define GetHeads_DECL(name) \
static void GetHeads ## name(const Byte *p, UInt32 pos, \
    UInt32 *hash, UInt32 hashMask, UInt32 *heads, UInt32 numHeads, const UInt32 *crc)

#define GetHeads_LOOP(v) \
    for (; numHeads != 0; numHeads--) { \
      const UInt32 value = (v); \
      p++; \
      *heads++ = pos - hash[value]; \
      hash[value] = pos++; }

#define DEF_GetHeads2(name, v, action) \
GetHeads_DECL(name) { action \
    GetHeads_LOOP(v) }

#define DEF_GetHeads(name, v) DEF_GetHeads2(name, v, ;)

DEF_GetHeads2(2, GetUi16(p), UNUSED_VAR(hashMask); UNUSED_VAR(crc); )
DEF_GetHeads(3, (crc[p[0]] ^ GetUi16(p + 1)) & hashMask)
DEF_GetHeads2(3b, GetUi16(p) ^ ((UInt32)(p)[2] << 16), UNUSED_VAR(hashMask); UNUSED_VAR(crc); )
// BT3 is not good for crc collisions for big hashMask values.
226
227 /*
228 GetHeads_DECL(3b)
229 {
230 UNUSED_VAR(hashMask);
231 UNUSED_VAR(crc);
232 {
233 const Byte *pLim = p + numHeads;
234 if (numHeads == 0)
235 return;
236 pLim--;
237 while (p < pLim)
238 {
239 UInt32 v1 = GetUi32(p);
240 UInt32 v0 = v1 & 0xFFFFFF;
241 UInt32 h0, h1;
242 p += 2;
243 v1 >>= 8;
244 h0 = hash[v0]; hash[v0] = pos; heads[0] = pos - h0; pos++;
245 h1 = hash[v1]; hash[v1] = pos; heads[1] = pos - h1; pos++;
246 heads += 2;
247 }
248 if (p == pLim)
249 {
250 UInt32 v0 = GetUi16(p) ^ ((UInt32)(p)[2] << 16);
251 *heads = pos - hash[v0];
252 hash[v0] = pos;
253 }
254 }
255 }
256 */
257
258 /*
259 GetHeads_DECL(4)
260 {
261 unsigned sh = 0;
262 UNUSED_VAR(crc)
263 while ((hashMask & 0x80000000) == 0)
264 {
265 hashMask <<= 1;
266 sh++;
267 }
268 GetHeads_LOOP((GetUi32(p) * 0xa54a1) >> sh)
269 }
270 #define GetHeads4b GetHeads4
271 */
272
273 #define USE_GetHeads_LOCAL_CRC
274
275 #ifdef USE_GetHeads_LOCAL_CRC
276
GetHeads_DECL(4)
{
  /* Per-byte crc tables premasked with hashMask, so the hot loop avoids
     the final '& hashMask'; the small local tables stay cache-resident. */
  UInt32 crc0[256];
  UInt32 crc1[256];
  {
    unsigned i;
    for (i = 0; i < 256; i++)
    {
      UInt32 v = crc[i];
      crc0[i] = v & hashMask;
      crc1[i] = (v << kLzHash_CrcShift_1) & hashMask;
      // crc1[i] = rotlFixed(v, 8) & hashMask;
    }
  }
  // 4-byte hash: crc(a[0]) ^ shifted crc(a[3]) ^ raw a[1..2]
  GetHeads_LOOP(crc0[p[0]] ^ crc1[p[3]] ^ (UInt32)GetUi16(p+1))
}
293
GetHeads_DECL(4b)
{
  /* big-hash variant: only byte 0 is crc-mixed; bytes 1..3 are used raw */
  UInt32 crc0[256];
  {
    unsigned i;
    for (i = 0; i < 256; i++)
      crc0[i] = crc[i] & hashMask;
  }
  GetHeads_LOOP(crc0[p[0]] ^ GetUi24hi_from32(p))
}
304
GetHeads_DECL(5)
{
  /* 5-byte hash with three premasked crc tables (bytes 0, 3 and 4) */
  UInt32 crc0[256];
  UInt32 crc1[256];
  UInt32 crc2[256];
  {
    unsigned i;
    for (i = 0; i < 256; i++)
    {
      UInt32 v = crc[i];
      crc0[i] = v & hashMask;
      crc1[i] = (v << kLzHash_CrcShift_1) & hashMask;
      crc2[i] = (v << kLzHash_CrcShift_2) & hashMask;
    }
  }
  GetHeads_LOOP(crc0[p[0]] ^ crc1[p[3]] ^ crc2[p[4]] ^ (UInt32)GetUi16(p+1))
}
322
GetHeads_DECL(5b)
{
  /* big-hash 5-byte variant: crc-mixes bytes 0 and 4, raw bytes 1..3 */
  UInt32 crc0[256];
  UInt32 crc1[256];
  {
    unsigned i;
    for (i = 0; i < 256; i++)
    {
      UInt32 v = crc[i];
      crc0[i] = v & hashMask;
      crc1[i] = (v << kLzHash_CrcShift_1) & hashMask;
    }
  }
  GetHeads_LOOP(crc0[p[0]] ^ crc1[p[4]] ^ GetUi24hi_from32(p))
}
338
339 #else
340
341 DEF_GetHeads(4, (crc[p[0]] ^ (crc[p[3]] << kLzHash_CrcShift_1) ^ (UInt32)GetUi16(p+1)) & hashMask)
342 DEF_GetHeads(4b, (crc[p[0]] ^ GetUi24hi_from32(p)) & hashMask)
343 DEF_GetHeads(5, (crc[p[0]] ^ (crc[p[3]] << kLzHash_CrcShift_1) ^ (crc[p[4]] << kLzHash_CrcShift_2) ^ (UInt32)GetUi16(p + 1)) & hashMask)
344 DEF_GetHeads(5b, (crc[p[0]] ^ (crc[p[4]] << kLzHash_CrcShift_1) ^ GetUi24hi_from32(p)) & hashMask)
345
346 #endif
347
348
/* Hash-worker main loop. Produces hash-head blocks into mt->hashBuf
   (a ring of kMtHashNumBlocks blocks) for the BT thread to consume.
   Block layout: heads[0] = number of UInt32s used (header included),
   heads[1] = input bytes available at block start, heads[2..] = head
   distances. Flow control via freeSemaphore / filledSemaphore. */
static void HashThreadFunc(CMatchFinderMt *mt)
{
  CMtSync *p = &mt->hashSync;
  for (;;)
  {
    UInt32 numProcessedBlocks = 0;
    Event_Wait(&p->canStart);
    Event_Set(&p->wasStarted);

    MatchFinder_Init_HighHash(mt->MatchFinder);

    for (;;)
    {
      if (p->exit)
        return;
      if (p->stopWriting)
      {
        p->numProcessedBlocks = numProcessedBlocks;
        Event_Set(&p->wasStopped);
        break;
      }

      {
        CMatchFinder *mf = mt->MatchFinder;
        if (MatchFinder_NeedMove(mf))
        {
          // take both consumers' locks so nobody reads through stale
          // window pointers while the buffer is being moved
          CriticalSection_Enter(&mt->btSync.cs);
          CriticalSection_Enter(&mt->hashSync.cs);
          {
            const Byte *beforePtr = Inline_MatchFinder_GetPointerToCurrentPos(mf);
            ptrdiff_t offset;
            MatchFinder_MoveBlock(mf);
            offset = beforePtr - Inline_MatchFinder_GetPointerToCurrentPos(mf);
            mt->pointerToCurPos -= offset;
            mt->buffer -= offset;
          }
          CriticalSection_Leave(&mt->btSync.cs);
          CriticalSection_Leave(&mt->hashSync.cs);
          continue;
        }

        Semaphore_Wait(&p->freeSemaphore);

        MatchFinder_ReadIfRequired(mf);
        if (mf->pos > (kMtMaxValForNormalize - kMtHashBlockSize))
        {
          // positions approach the UInt32 limit: rebase all stored offsets
          UInt32 subValue = (mf->pos - mf->historySize - 1);
          MatchFinder_ReduceOffsets(mf, subValue);
          MatchFinder_Normalize3(subValue, mf->hash + mf->fixedHashSize, (size_t)mf->hashMask + 1);
        }
        {
          UInt32 *heads = mt->hashBuf + ((numProcessedBlocks++) & kMtHashNumBlocksMask) * kMtHashBlockSize;
          UInt32 num = mf->streamPos - mf->pos;
          heads[0] = 2;
          heads[1] = num;
          if (num >= mf->numHashBytes)
          {
            num = num - mf->numHashBytes + 1;
            if (num > kMtHashBlockSize - 2)
              num = kMtHashBlockSize - 2;
            mt->GetHeadsFunc(mf->buffer, mf->pos, mf->hash + mf->fixedHashSize, mf->hashMask, heads + 2, num, mf->crc);
            heads[0] = 2 + num;
          }
          mf->pos += num;
          mf->buffer += num;
        }
      }

      Semaphore_Release1(&p->filledSemaphore);
    }
  }
}
421
/* Advances to the next hash block and unpacks its two-word header:
   word 0 = used size (sets hashBufPosLimit), word 1 = bytes available. */
static void MatchFinderMt_GetNextBlock_Hash(CMatchFinderMt *p)
{
  MtSync_GetNextBlock(&p->hashSync);
  p->hashBufPosLimit = p->hashBufPos = ((p->hashSync.numProcessedBlocks - 1) & kMtHashNumBlocksMask) * kMtHashBlockSize;
  p->hashBufPosLimit += p->hashBuf[p->hashBufPos++];
  p->hashNumAvail = p->hashBuf[p->hashBufPos++];
}
429
430 #define kEmptyHashValue 0
431
432 #define MFMT_GM_INLINE
433
434 #ifdef MFMT_GM_INLINE
435
436 /*
437 we use size_t for _cyclicBufferPos instead of UInt32
438 to eliminate "movsx" BUG in old MSVC x64 compiler.
439 */
440
MY_NO_INLINE
/* Batched binary-tree search: for up to 'size' consecutive positions,
   walks/updates the son[] tree and appends (len, dist-1) pairs to d.
   Per position, d gets one count word followed by the pairs; the count
   word holds the number of following UInt32s + 1 (written at the end via
   _distances[-1]). Stops when d reaches 'limit' or 'size' runs out.
   *posRes receives the first unprocessed position. */
static UInt32 *GetMatchesSpecN(UInt32 lenLimit, UInt32 pos, const Byte *cur, CLzRef *son,
    size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 _cutValue,
    UInt32 *d, UInt32 _maxLen, const UInt32 *hash, const UInt32 *limit, UInt32 size, UInt32 *posRes)
{
  do
  {
    UInt32 *_distances = ++d;   // reserve the count word; pairs follow
    UInt32 delta = *hash++;     // distance to the head candidate

    CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1;
    CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1);
    unsigned len0 = 0, len1 = 0;
    UInt32 cutValue = _cutValue;
    unsigned maxLen = (unsigned)_maxLen;

    /*
    #define PREF_STEP 1
    if (size > PREF_STEP)
    {
      UInt32 delta = hash[PREF_STEP - 1];
      if (delta < _cyclicBufferSize)
      {
        size_t cyc1 = _cyclicBufferPos + PREF_STEP;
        CLzRef *pair = son + ((size_t)(cyc1 - delta + ((delta > cyc1) ? _cyclicBufferSize : 0)) << 1);
        Byte b = *(cur + PREF_STEP - delta);
        _distances[0] = pair[0];
        _distances[1] = b;
      }
    }
    */
    if (cutValue == 0 || delta >= _cyclicBufferSize)
    {
      // no usable candidate: terminate both subtrees at this node
      *ptr0 = *ptr1 = kEmptyHashValue;
    }
    else
    for (LOG_ITER(g_NumIters_Tree++);;)
    {
      LOG_ITER(g_NumIters_Loop++);
      {
        // node of the candidate position in the cyclic tree
        CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((_cyclicBufferPos < delta) ? _cyclicBufferSize : 0)) << 1);
        const Byte *pb = cur - delta;
        unsigned len = (len0 < len1 ? len0 : len1);   // common prefix already known
        UInt32 pair0 = *pair;
        if (pb[len] == cur[len])
        {
          if (++len != lenLimit && pb[len] == cur[len])
            while (++len != lenLimit)
              if (pb[len] != cur[len])
                break;
          if (maxLen < len)
          {
            // new best match: record (len, dist-1)
            maxLen = len;
            *d++ = (UInt32)len;
            *d++ = delta - 1;
            if (len == lenLimit)
            {
              // full-length match: splice candidate's children in and stop
              UInt32 pair1 = pair[1];
              *ptr1 = pair0;
              *ptr0 = pair1;
              break;
            }
          }
        }
        {
          UInt32 curMatch = pos - delta;
          // delta = pos - *pair;
          // delta = pos - pair[((UInt32)pb[len] - (UInt32)cur[len]) >> 31];
          if (pb[len] < cur[len])
          {
            // candidate is smaller: descend into its right subtree
            delta = pos - pair[1];
            *ptr1 = curMatch;
            ptr1 = pair + 1;
            len1 = len;
          }
          else
          {
            delta = pos - *pair;
            *ptr0 = curMatch;
            ptr0 = pair;
            len0 = len;
          }
        }
      }
      if (--cutValue == 0 || delta >= _cyclicBufferSize)
      {
        *ptr0 = *ptr1 = kEmptyHashValue;
        break;
      }
    }
    pos++;
    _cyclicBufferPos++;
    cur++;
    {
      // backfill the reserved count word for this position
      UInt32 num = (UInt32)(d - _distances);
      _distances[-1] = num;
    }
  }
  while (d < limit && --size != 0);
  *posRes = pos;
  return d;
}
543
544 #endif
545
546
547
/* Fills one BT output block at d with match records for consecutive
   positions. Block layout: d[0] = used size, d[1] = bytes available,
   then per position a count word plus (len, dist) pairs.
   'limit' leaves matchMaxLen*2 words of headroom so one position's
   records never overflow the block. */
static void BtGetMatches(CMatchFinderMt *p, UInt32 *d)
{
  UInt32 numProcessed = 0;
  UInt32 curPos = 2;
  UInt32 limit = kMtBtBlockSize - (p->matchMaxLen * 2); // * 2

  d[1] = p->hashNumAvail;

  while (curPos < limit)
  {
    if (p->hashBufPos == p->hashBufPosLimit)
    {
      // current hash block exhausted: fetch the next one
      MatchFinderMt_GetNextBlock_Hash(p);
      d[1] = numProcessed + p->hashNumAvail;
      if (p->hashNumAvail >= p->numHashBytes)
        continue;
      // stream tail shorter than numHashBytes: emit empty match lists
      // for the remaining positions and finish
      d[0] = curPos + p->hashNumAvail;
      d += curPos;
      for (; p->hashNumAvail != 0; p->hashNumAvail--)
        *d++ = 0;
      return;
    }
    {
      UInt32 size = p->hashBufPosLimit - p->hashBufPos;
      UInt32 lenLimit = p->matchMaxLen;
      UInt32 pos = p->pos;
      UInt32 cyclicBufferPos = p->cyclicBufferPos;
      if (lenLimit >= p->hashNumAvail)
        lenLimit = p->hashNumAvail;
      {
        // clamp batch size by remaining bytes and by cyclic-buffer wrap
        UInt32 size2 = p->hashNumAvail - lenLimit + 1;
        if (size2 < size)
          size = size2;
        size2 = p->cyclicBufferSize - cyclicBufferPos;
        if (size2 < size)
          size = size2;
      }

      #ifndef MFMT_GM_INLINE
      while (curPos < limit && size-- != 0)
      {
        UInt32 *startDistances = d + curPos;
        UInt32 num = (UInt32)(GetMatchesSpec1(lenLimit, pos - p->hashBuf[p->hashBufPos++],
            pos, p->buffer, p->son, cyclicBufferPos, p->cyclicBufferSize, p->cutValue,
            startDistances + 1, p->numHashBytes - 1) - startDistances);
        *startDistances = num - 1;
        curPos += num;
        cyclicBufferPos++;
        pos++;
        p->buffer++;
      }
      #else
      {
        UInt32 posRes;
        curPos = (UInt32)(GetMatchesSpecN(lenLimit, pos, p->buffer, p->son, cyclicBufferPos, p->cyclicBufferSize, p->cutValue,
            d + curPos, p->numHashBytes - 1, p->hashBuf + p->hashBufPos,
            d + limit,
            size, &posRes) - d);
        p->hashBufPos += posRes - pos;
        cyclicBufferPos += posRes - pos;
        p->buffer += posRes - pos;
        pos = posRes;
      }
      #endif

      numProcessed += pos - p->pos;
      p->hashNumAvail -= pos - p->pos;
      p->pos = pos;
      if (cyclicBufferPos == p->cyclicBufferSize)
        cyclicBufferPos = 0;
      p->cyclicBufferPos = cyclicBufferPos;
    }
  }

  d[0] = curPos;
}
624
/* Fills one ring slot of btBuf with match records, holding hashSync's
   critical section while the hash thread is running, and normalizes the
   tree positions when they approach the UInt32 limit. */
static void BtFillBlock(CMatchFinderMt *p, UInt32 globalBlockIndex)
{
  CMtSync *sync = &p->hashSync;
  // lock only when the hash worker is active; before the first start
  // there is no concurrent writer to protect against
  if (!sync->needStart)
  {
    CriticalSection_Enter(&sync->cs);
    sync->csWasEntered = True;
  }

  BtGetMatches(p, p->btBuf + (globalBlockIndex & kMtBtNumBlocksMask) * kMtBtBlockSize);

  if (p->pos > kMtMaxValForNormalize - kMtBtBlockSize)
  {
    UInt32 subValue = p->pos - p->cyclicBufferSize;
    MatchFinder_Normalize3(subValue, p->son, (size_t)p->cyclicBufferSize * 2);
    p->pos -= subValue;
  }

  if (!sync->needStart)
  {
    CriticalSection_Leave(&sync->cs);
    sync->csWasEntered = False;
  }
}
649
/* BT-worker main loop: fills btBuf ring slots for the LZ coder to
   consume. When asked to stop, it also stops the upstream hash worker
   before acknowledging. */
static void BtThreadFunc(CMatchFinderMt *mt)
{
  CMtSync *p = &mt->btSync;
  for (;;)
  {
    UInt32 blockIndex = 0;
    Event_Wait(&p->canStart);
    Event_Set(&p->wasStarted);
    for (;;)
    {
      if (p->exit)
        return;
      if (p->stopWriting)
      {
        p->numProcessedBlocks = blockIndex;
        // propagate the stop upstream before signalling completion
        MtSync_StopWriting(&mt->hashSync);
        Event_Set(&p->wasStopped);
        break;
      }
      Semaphore_Wait(&p->freeSemaphore);
      BtFillBlock(mt, blockIndex++);
      Semaphore_Release1(&p->filledSemaphore);
    }
  }
}
675
/* Zero-state constructor: no memory is allocated and no threads are
   created here (see MatchFinderMt_Create). */
void MatchFinderMt_Construct(CMatchFinderMt *p)
{
  p->hashBuf = NULL;
  MtSync_Construct(&p->hashSync);
  MtSync_Construct(&p->btSync);
}
682
/* Frees the combined hash/BT buffer (btBuf points into hashBuf). */
static void MatchFinderMt_FreeMem(CMatchFinderMt *p, ISzAllocPtr alloc)
{
  ISzAlloc_Free(alloc, p->hashBuf);
  p->hashBuf = NULL;
}
688
/* Stops and destroys both worker threads, then frees the buffers.
   Threads must be down before the memory they write to is released. */
void MatchFinderMt_Destruct(CMatchFinderMt *p, ISzAllocPtr alloc)
{
  MtSync_Destruct(&p->hashSync);
  MtSync_Destruct(&p->btSync);

  LOG_ITER(
  printf("\nTree %9d * %7d iter = %9d sum \n",
      (UInt32)(g_NumIters_Tree / 1000),
      (UInt32)(((UInt64)g_NumIters_Loop * 1000) / (g_NumIters_Tree + 1)),
      (UInt32)(g_NumIters_Loop / 1000)
      ));

  MatchFinderMt_FreeMem(p, alloc);
}
703
/* total UInt32 counts of the two ring buffers (allocated contiguously) */
#define kHashBufferSize (kMtHashBlockSize * kMtHashNumBlocks)
#define kBtBufferSize (kMtBtBlockSize * kMtBtNumBlocks)

/* OS-thread entry point that adapts the void* signature to HashThreadFunc */
static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE HashThreadFunc2(void *p) { HashThreadFunc((CMatchFinderMt *)p); return 0; }
/* OS-thread entry point for the BT worker.
   NOTE(review): the dummy buffer is written and then tested so the
   compiler cannot eliminate it; presumably this shifts/commits the
   thread's stack frame before the hot loop — confirm intent against
   upstream history before changing. */
static THREAD_FUNC_RET_TYPE THREAD_FUNC_CALL_TYPE BtThreadFunc2(void *p)
{
  Byte allocaDummy[0x180];
  unsigned i = 0;
  for (i = 0; i < 16; i++)
    allocaDummy[i] = (Byte)0;
  if (allocaDummy[0] == 0)
    BtThreadFunc((CMatchFinderMt *)p);
  return 0;
}
718
/* Allocates the shared hash/BT buffers, creates the underlying
   single-threaded match finder, and spawns both worker threads.
   Returns SZ_ERROR_PARAM if matchMaxLen is too large for the BT block
   headroom, SZ_ERROR_MEM / SZ_ERROR_THREAD on resource failure. */
SRes MatchFinderMt_Create(CMatchFinderMt *p, UInt32 historySize, UInt32 keepAddBufferBefore,
    UInt32 matchMaxLen, UInt32 keepAddBufferAfter, ISzAllocPtr alloc)
{
  CMatchFinder *mf = p->MatchFinder;
  p->historySize = historySize;
  if (kMtBtBlockSize <= matchMaxLen * 4)
    return SZ_ERROR_PARAM;
  if (!p->hashBuf)
  {
    // one allocation backs both rings; btBuf starts after the hash ring
    p->hashBuf = (UInt32 *)ISzAlloc_Alloc(alloc, (kHashBufferSize + kBtBufferSize) * sizeof(UInt32));
    if (!p->hashBuf)
      return SZ_ERROR_MEM;
    p->btBuf = p->hashBuf + kHashBufferSize;
  }
  keepAddBufferBefore += (kHashBufferSize + kBtBufferSize);
  keepAddBufferAfter += kMtHashBlockSize;
  if (!MatchFinder_Create(mf, historySize, keepAddBufferBefore, matchMaxLen, keepAddBufferAfter, alloc))
    return SZ_ERROR_MEM;

  RINOK(MtSync_Create(&p->hashSync, HashThreadFunc2, p, kMtHashNumBlocks));
  RINOK(MtSync_Create(&p->btSync, BtThreadFunc2, p, kMtBtNumBlocks));
  return SZ_OK;
}
742
/* Call it after ReleaseStream / SetStream */

/* Resets buffer cursors and mirrors the single-threaded match finder's
   state into the MT wrapper's cached fields. */
static void MatchFinderMt_Init(CMatchFinderMt *p)
{
  CMatchFinder *mf = p->MatchFinder;

  p->btBufPos =
  p->btBufPosLimit = 0;
  p->hashBufPos =
  p->hashBufPosLimit = 0;

  /* Init without data reading. We don't want to read data in this thread */
  MatchFinder_Init_3(mf, False);
  MatchFinder_Init_LowHash(mf);

  p->pointerToCurPos = Inline_MatchFinder_GetPointerToCurrentPos(mf);
  p->btNumAvailBytes = 0;
  p->lzPos = p->historySize + 1;

  p->hash = mf->hash;
  p->fixedHashSize = mf->fixedHashSize;
  // p->hash4Mask = mf->hash4Mask;
  p->crc = mf->crc;

  p->son = mf->son;
  p->matchMaxLen = mf->matchMaxLen;
  p->numHashBytes = mf->numHashBytes;
  p->pos = mf->pos;
  p->buffer = mf->buffer;
  p->cyclicBufferPos = mf->cyclicBufferPos;
  p->cyclicBufferSize = mf->cyclicBufferSize;
  p->cutValue = mf->cutValue;
}
775
/* ReleaseStream is required to finish multithreading */

/* Stops the BT worker; it cascades the stop to the hash worker
   (see BtThreadFunc). */
void MatchFinderMt_ReleaseStream(CMatchFinderMt *p)
{
  MtSync_StopWriting(&p->btSync);
  /* p->MatchFinder->ReleaseStream(); */
}
782
783
MY_NO_INLINE
/* Advances to the next filled BT block, unpacks its two-word header
   (used size, available bytes), and rebases lzPos / the fixed hash
   tables when lzPos approaches the UInt32 limit. */
static void MatchFinderMt_GetNextBlock_Bt(CMatchFinderMt *p)
{
  UInt32 blockIndex, k;

  MtSync_GetNextBlock(&p->btSync);

  blockIndex = ((p->btSync.numProcessedBlocks - 1) & kMtBtNumBlocksMask);
  k = blockIndex * kMtBtBlockSize;
  p->btBufPosLimit = k + p->btBuf[k];
  p->btNumAvailBytes = p->btBuf[k + 1];
  p->btBufPos = k + 2;
  if (p->lzPos >= kMtMaxValForNormalize - kMtBtBlockSize)
  {
    MatchFinder_Normalize3(p->lzPos - p->historySize - 1, p->hash, p->fixedHashSize);
    p->lzPos = p->historySize + 1;
  }
}
802
MatchFinderMt_GetPointerToCurrentPos(CMatchFinderMt * p)803 static const Byte * MatchFinderMt_GetPointerToCurrentPos(CMatchFinderMt *p)
804 {
805 return p->pointerToCurPos;
806 }
807
/* lazily fetch the next BT block once the current one is exhausted */
#define GET_NEXT_BLOCK_IF_REQUIRED if (p->btBufPos == p->btBufPosLimit) MatchFinderMt_GetNextBlock_Bt(p);

/* IMatchFinder callback: bytes available at the current position
   (may block waiting for the BT worker to fill a block). */
static UInt32 MatchFinderMt_GetNumAvailableBytes(CMatchFinderMt *p)
{
  GET_NEXT_BLOCK_IF_REQUIRED;
  return p->btNumAvailBytes;
}
815
/* Appends a length-2 hash match (if any) to d and updates the 2-byte
   hash table. Only byte 0 is re-verified against the window; byte 1 is
   covered by the one-to-one crc mixing (see HASH notes at top of file).
   Returns the advanced output pointer. */
static UInt32 * MixMatches2(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d)
{
  UInt32 h2, c2;
  UInt32 *hash = p->hash;
  const Byte *cur = p->pointerToCurPos;
  UInt32 m = p->lzPos;
  MT_HASH2_CALC

  c2 = hash[h2];
  hash[h2] = m;

  if (c2 >= matchMinPos && cur[(ptrdiff_t)c2 - (ptrdiff_t)m] == cur[0])
  {
    d[0] = 2;
    d[1] = m - c2 - 1;
    d += 2;
  }

  return d;
}
836
/* Appends length-2 and length-3 hash matches (nearer than matchMinPos's
   implied distance) to d and refreshes both fixed hash tables.
   Returns the advanced output pointer. */
static UInt32 * MixMatches3(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d)
{
  UInt32 h2, h3, c2, c3;
  UInt32 *hash = p->hash;
  const Byte *cur = p->pointerToCurPos;
  UInt32 m = p->lzPos;
  MT_HASH3_CALC

  c2 = hash[h2];
  c3 = (hash + kFix3HashSize)[h3];

  hash[h2] = m;
  (hash + kFix3HashSize)[h3] = m;

  if (c2 >= matchMinPos && cur[(ptrdiff_t)c2 - (ptrdiff_t)m] == cur[0])
  {
    d[1] = m - c2 - 1;
    // if byte 2 also matches, the 2-byte candidate is really length 3
    // and supersedes any h3 candidate (it cannot be nearer)
    if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m + 2] == cur[2])
    {
      d[0] = 3;
      return d + 2;
    }
    d[0] = 2;
    d += 2;
  }

  if (c3 >= matchMinPos && cur[(ptrdiff_t)c3 - (ptrdiff_t)m] == cur[0])
  {
    *d++ = 3;
    *d++ = m - c3 - 1;
  }

  return d;
}
871
872
/* advance the logical LZ position and the window pointer together */
#define INCREASE_LZ_POS p->lzPos++; p->pointerToCurPos++;
874
875 /*
876 static
877 UInt32 MatchFinderMt_GetMatches_Bt4(CMatchFinderMt *p, UInt32 *d)
878 {
879 UInt32 pos = p->btBufPos;
880 const UInt32 *bt = p->btBuf + pos;
881 UInt32 len = *bt++;
882 UInt32 matchMinPos;
883 const UInt32 *d_base = d;
884 UInt32 avail = p->btNumAvailBytes - 1;
885 p->btBufPos = pos + 1 + len;
886
887 {
888 UInt32 temp1 = p->historySize;
889 p->btNumAvailBytes = avail;
890
891 #define BT_HASH_BYTES_MAX 5
892
893 if (len != 0)
894 temp1 = bt[1];
895 else if (avail < (BT_HASH_BYTES_MAX - 2))
896 {
897 INCREASE_LZ_POS
898 return 0;
899 }
900 matchMinPos = p->lzPos - temp1;
901 }
902
903 for (;;)
904 {
905
906 UInt32 h2, h3, c2, c3;
907 UInt32 *hash = p->hash;
908 const Byte *cur = p->pointerToCurPos;
909 UInt32 m = p->lzPos;
910 MT_HASH3_CALC
911
912 c2 = hash[h2];
913 c3 = (hash + kFix3HashSize)[h3];
914
915 hash[h2] = m;
916 (hash + kFix3HashSize)[h3] = m;
917
918 if (c2 >= matchMinPos && cur[(ptrdiff_t)c2 - (ptrdiff_t)m] == cur[0])
919 {
920 d[1] = m - c2 - 1;
921 if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m + 2] == cur[2])
922 {
923 d[0] = 3;
924 d += 2;
925 break;
926 }
927 // else
928 {
929 d[0] = 2;
930 d += 2;
931 }
932 }
933 if (c3 >= matchMinPos && cur[(ptrdiff_t)c3 - (ptrdiff_t)m] == cur[0])
934 {
935 *d++ = 3;
936 *d++ = m - c3 - 1;
937 }
938 break;
939 }
940
941 if (len != 0)
942 {
943 do
944 {
945 UInt32 v0 = bt[0];
946 UInt32 v1 = bt[1];
947 bt += 2;
948 d[0] = v0;
949 d[1] = v1;
950 d += 2;
951 }
952 while ((len -= 2) != 0);
953 }
954 INCREASE_LZ_POS
955 return (UInt32)(d - d_base);
956 }
957 */
958
959
/* Appends length-2/3/4 hash matches to d and refreshes the fixed hash
   tables. The h4 table is disabled here (_USE_H4 not defined); those
   #ifdef branches are intentionally dead. Returns the advanced pointer. */
static UInt32 *MixMatches4(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d)
{
  UInt32 h2, h3, /* h4, */ c2, c3 /* , c4 */;
  UInt32 *hash = p->hash;
  const Byte *cur = p->pointerToCurPos;
  UInt32 m = p->lzPos;
  MT_HASH3_CALC
  // MT_HASH4_CALC
  c2 = hash[h2];
  c3 = (hash + kFix3HashSize)[h3];
  // c4 = (hash + kFix4HashSize)[h4];

  hash[h2] = m;
  (hash + kFix3HashSize)[h3] = m;
  // (hash + kFix4HashSize)[h4] = m;

  #define _USE_H2

  #ifdef _USE_H2
  if (c2 >= matchMinPos && cur[(ptrdiff_t)c2 - (ptrdiff_t)m] == cur[0])
  {
    d[1] = m - c2 - 1;
    // extend the h2 candidate as far as bytes 2 and 3 keep matching
    if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m + 2] == cur[2])
    {
      // d[0] = (cur[(ptrdiff_t)c2 - (ptrdiff_t)m + 3] == cur[3]) ? 4 : 3;
      // return d + 2;

      if (cur[(ptrdiff_t)c2 - (ptrdiff_t)m + 3] == cur[3])
      {
        d[0] = 4;
        return d + 2;
      }
      d[0] = 3;
      d += 2;

      #ifdef _USE_H4
      if (c4 >= matchMinPos)
        if (
          cur[(ptrdiff_t)c4 - (ptrdiff_t)m] == cur[0] &&
          cur[(ptrdiff_t)c4 - (ptrdiff_t)m + 3] == cur[3]
          )
      {
        *d++ = 4;
        *d++ = m - c4 - 1;
      }
      #endif
      return d;
    }
    d[0] = 2;
    d += 2;
  }
  #endif

  if (c3 >= matchMinPos && cur[(ptrdiff_t)c3 - (ptrdiff_t)m] == cur[0])
  {
    d[1] = m - c3 - 1;
    if (cur[(ptrdiff_t)c3 - (ptrdiff_t)m + 3] == cur[3])
    {
      d[0] = 4;
      return d + 2;
    }
    d[0] = 3;
    d += 2;
  }

  #ifdef _USE_H4
  if (c4 >= matchMinPos)
    if (
      cur[(ptrdiff_t)c4 - (ptrdiff_t)m] == cur[0] &&
      cur[(ptrdiff_t)c4 - (ptrdiff_t)m + 3] == cur[3]
      )
    {
      *d++ = 4;
      *d++ = m - c4 - 1;
    }
  #endif

  return d;
}
1039
1040
/* GetMatches for the 2-byte match finder: copies the precomputed
   (len, dist) pairs for the current position straight out of the BT
   buffer into d. Returns the number of UInt32s written. */
static UInt32 MatchFinderMt2_GetMatches(CMatchFinderMt *p, UInt32 *d)
{
  const UInt32 *src = p->btBuf + p->btBufPos;
  const UInt32 numInts = *src++;
  p->btBufPos += 1 + numInts;
  p->btNumAvailBytes--;
  {
    const UInt32 *srcLim = src + numInts;
    while (src < srcLim)
    {
      d[0] = src[0];
      d[1] = src[1];
      d += 2;
      src += 2;
    }
  }
  INCREASE_LZ_POS
  return numInts;
}
1062
1063
1064
/* IMatchFinder GetMatches: merges short hash matches (via
   MixMatchesFunc) with the precomputed binary-tree matches for the
   current position. Returns the number of UInt32s written to d. */
static UInt32 MatchFinderMt_GetMatches(CMatchFinderMt *p, UInt32 *d)
{
  UInt32 pos = p->btBufPos;
  const UInt32 *bt = p->btBuf + pos;
  UInt32 len = *bt++;
  UInt32 avail = p->btNumAvailBytes - 1;
  p->btNumAvailBytes = avail;
  p->btBufPos = pos + 1 + len;
  if (len == 0)
  {
    #define BT_HASH_BYTES_MAX 5
    // no BT match: emit hash matches only, limited by the whole window
    if (avail >= (BT_HASH_BYTES_MAX - 1) - 1)
      len = (UInt32)(p->MixMatchesFunc(p, p->lzPos - p->historySize, d) - d);
  }
  else
  {
    /*
      first match pair from BinTree: (match_len, match_dist),
      (match_len >= numHashBytes).
      MixMatchesFunc() inserts only hash matches that are nearer than (match_dist)
    */
    UInt32 *d2;
    d2 = p->MixMatchesFunc(p, p->lzPos - bt[1], d);
    // then append the BT pairs after the hash matches
    do
    {
      UInt32 v0 = bt[0];
      UInt32 v1 = bt[1];
      bt += 2;
      d2[0] = v0;
      d2[1] = v1;
      d2 += 2;
    }
    while ((len -= 2) != 0);
    len = (UInt32)(d2 - d);
  }
  INCREASE_LZ_POS
  return len;
}
1103
/* SKIP helpers: each iteration consumes one position from the BT block;
   SKIP_HEADER_MT(n) additionally exposes cur/hash for updating the fixed
   hash tables when at least n bytes remain. The braces are intentionally
   unbalanced across the HEADER/FOOTER pair. */
#define SKIP_HEADER2_MT do { GET_NEXT_BLOCK_IF_REQUIRED
#define SKIP_HEADER_MT(n) SKIP_HEADER2_MT if (p->btNumAvailBytes-- >= (n)) { const Byte *cur = p->pointerToCurPos; UInt32 *hash = p->hash;
#define SKIP_FOOTER_MT } INCREASE_LZ_POS p->btBufPos += p->btBuf[p->btBufPos] + 1; } while (--num != 0);

/* Skip for numHashBytes == 2: no fixed hash tables to maintain */
static void MatchFinderMt0_Skip(CMatchFinderMt *p, UInt32 num)
{
  SKIP_HEADER2_MT { p->btNumAvailBytes--;
  SKIP_FOOTER_MT
}
1113
/* Skip for numHashBytes == 3: keeps the 2-byte hash table current */
static void MatchFinderMt2_Skip(CMatchFinderMt *p, UInt32 num)
{
  SKIP_HEADER_MT(2)
      UInt32 h2;
      MT_HASH2_CALC
      hash[h2] = p->lzPos;
  SKIP_FOOTER_MT
}
1122
/* Skip for numHashBytes == 4: keeps the 2- and 3-byte hash tables current */
static void MatchFinderMt3_Skip(CMatchFinderMt *p, UInt32 num)
{
  SKIP_HEADER_MT(3)
      UInt32 h2, h3;
      MT_HASH3_CALC
      (hash + kFix3HashSize)[h3] =
      hash[                h2] =
        p->lzPos;
  SKIP_FOOTER_MT
}
1133
/* Skip for numHashBytes >= 5: the 4-byte table is disabled here, so this
   matches MatchFinderMt3_Skip apart from the minimum-bytes threshold */
static void MatchFinderMt4_Skip(CMatchFinderMt *p, UInt32 num)
{
  SKIP_HEADER_MT(4)
      UInt32 h2, h3 /*, h4 */;
      MT_HASH3_CALC
      // MT_HASH4_CALC
      // (hash + kFix4HashSize)[h4] =
      (hash + kFix3HashSize)[h3] =
      hash[                h2] =
        p->lzPos;
  SKIP_FOOTER_MT
}
1146
/* Fills the IMatchFinder vtable and selects the per-numHashBytes
   head-computation, hash-mix and skip functions. */
void MatchFinderMt_CreateVTable(CMatchFinderMt *p, IMatchFinder *vTable)
{
  vTable->Init = (Mf_Init_Func)MatchFinderMt_Init;
  vTable->GetNumAvailableBytes = (Mf_GetNumAvailableBytes_Func)MatchFinderMt_GetNumAvailableBytes;
  vTable->GetPointerToCurrentPos = (Mf_GetPointerToCurrentPos_Func)MatchFinderMt_GetPointerToCurrentPos;
  vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt_GetMatches;

  switch (p->MatchFinder->numHashBytes)
  {
    case 2:
      p->GetHeadsFunc = GetHeads2;
      // no extra hash tables: GetMatches can copy BT pairs directly
      p->MixMatchesFunc = (Mf_Mix_Matches)NULL;
      vTable->Skip = (Mf_Skip_Func)MatchFinderMt0_Skip;
      vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt2_GetMatches;
      break;
    case 3:
      p->GetHeadsFunc = p->MatchFinder->bigHash ? GetHeads3b : GetHeads3;
      p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches2;
      vTable->Skip = (Mf_Skip_Func)MatchFinderMt2_Skip;
      break;
    case 4:
      p->GetHeadsFunc = p->MatchFinder->bigHash ? GetHeads4b : GetHeads4;

      // it's fast inline version of GetMatches()
      // vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt_GetMatches_Bt4;

      p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches3;
      vTable->Skip = (Mf_Skip_Func)MatchFinderMt3_Skip;
      break;
    default:
      p->GetHeadsFunc = p->MatchFinder->bigHash ? GetHeads5b : GetHeads5;
      p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches4;
      vTable->Skip = (Mf_Skip_Func)MatchFinderMt4_Skip;
      break;
  }
}
1183