1 #include "./durchschlag.h" 2 3 #include <algorithm> 4 #include <exception> /* terminate */ 5 6 #include "divsufsort.h" 7 8 /* Pointer to position in text. */ 9 typedef DurchschlagTextIdx TextIdx; 10 11 /* (Sum of) value(s) of slice(s). */ 12 typedef uint32_t Score; 13 14 typedef struct HashSlot { 15 TextIdx next; 16 TextIdx offset; 17 } HashSlot; 18 19 typedef struct MetaSlot { 20 TextIdx mark; 21 Score score; 22 } MetaSlot; 23 24 typedef struct Range { 25 TextIdx start; 26 TextIdx end; 27 } Range; 28 29 typedef struct Candidate { 30 Score score; 31 TextIdx position; 32 } Candidate; 33 34 struct greaterScore { 35 bool operator()(const Candidate& a, const Candidate& b) const { 36 return (a.score > b.score) || 37 ((a.score == b.score) && (a.position < b.position)); 38 } 39 }; 40 41 struct lessScore { 42 bool operator()(const Candidate& a, const Candidate& b) const { 43 return (a.score < b.score) || 44 ((a.score == b.score) && (a.position > b.position)); 45 } 46 }; 47 48 #define CANDIDATE_BUNDLE_SIZE (1 << 18) 49 50 static void fatal(const char* error) { 51 fprintf(stderr, "%s\n", error); 52 std::terminate(); 53 } 54 55 static TextIdx calculateDictionarySize(const std::vector<Range>& ranges) { 56 TextIdx result = 0; 57 for (size_t i = 0; i < ranges.size(); ++i) { 58 const Range& r = ranges[i]; 59 result += r.end - r.start; 60 } 61 return result; 62 } 63 64 static std::string createDictionary( 65 const uint8_t* data, const std::vector<Range>& ranges, size_t limit) { 66 std::string output; 67 output.reserve(calculateDictionarySize(ranges)); 68 for (size_t i = 0; i < ranges.size(); ++i) { 69 const Range& r = ranges[i]; 70 output.insert(output.end(), &data[r.start], &data[r.end]); 71 } 72 if (output.size() > limit) { 73 output.resize(limit); 74 } 75 return output; 76 } 77 78 /* precondition: span > 0 79 precondition: end + span == len(shortcut) */ 80 static Score buildCandidatesList(std::vector<Candidate>* candidates, 81 std::vector<MetaSlot>* map, TextIdx span, const TextIdx* shortcut, 82 TextIdx end) { 83 candidates->resize(0); 84 85 size_t n = map->size(); 86 MetaSlot* slots = map->data(); 87 for (size_t j = 0; j < n; ++j) { 88 slots[j].mark = 0; 89 } 90 91 Score score = 0; 92 /* Consider the whole span, except one last item. The following loop will 93 add the last item to the end of the "chain", evaluate it, and cut one 94 "link" form the beginning. */ 95 for (size_t j = 0; j < span - 1; ++j) { 96 MetaSlot& item = slots[shortcut[j]]; 97 if (item.mark == 0) { 98 score += item.score; 99 } 100 item.mark++; 101 } 102 103 TextIdx i = 0; 104 TextIdx limit = std::min<TextIdx>(end, CANDIDATE_BUNDLE_SIZE); 105 Score maxScore = 0; 106 for (; i < limit; ++i) { 107 TextIdx slice = shortcut[i + span - 1]; 108 MetaSlot& pick = slots[slice]; 109 if (pick.mark == 0) { 110 score += pick.score; 111 } 112 pick.mark++; 113 114 if (score > maxScore) { 115 maxScore = score; 116 } 117 candidates->push_back({score, i}); 118 119 MetaSlot& drop = slots[shortcut[i]]; 120 drop.mark--; 121 if (drop.mark == 0) { 122 score -= drop.score; 123 } 124 } 125 126 std::make_heap(candidates->begin(), candidates->end(), greaterScore()); 127 Score minScore = candidates->at(0).score; 128 for (; i < end; ++i) { 129 TextIdx slice = shortcut[i + span - 1]; 130 MetaSlot& pick = slots[slice]; 131 if (pick.mark == 0) { 132 score += pick.score; 133 } 134 pick.mark++; 135 136 if (score > maxScore) { 137 maxScore = score; 138 } 139 if (score >= minScore) { 140 candidates->push_back({score, i}); 141 std::push_heap(candidates->begin(), candidates->end(), greaterScore()); 142 if (candidates->size() > CANDIDATE_BUNDLE_SIZE && maxScore != minScore) { 143 while (candidates->at(0).score == minScore) { 144 std::pop_heap(candidates->begin(), candidates->end(), greaterScore()); 145 candidates->pop_back(); 146 } 147 minScore = candidates->at(0).score; 148 } 149 } 150 151 MetaSlot& drop = slots[shortcut[i]]; 152 drop.mark--; 153 if (drop.mark == 0) { 154 score -= drop.score; 155 } 156 } 157 158 for (size_t j = 0; j < n; ++j) { 159 slots[j].mark = 0; 160 } 161 162 std::make_heap(candidates->begin(), candidates->end(), lessScore()); 163 return minScore; 164 } 165 166 /* precondition: span > 0 167 precondition: end + span == len(shortcut) */ 168 static Score rebuildCandidatesList(std::vector<TextIdx>* candidates, 169 std::vector<MetaSlot>* map, TextIdx span, const TextIdx* shortcut, 170 TextIdx end, TextIdx* next) { 171 size_t n = candidates->size(); 172 TextIdx* data = candidates->data(); 173 for (size_t i = 0; i < n; ++i) { 174 data[i] = 0; 175 } 176 177 n = map->size(); 178 MetaSlot* slots = map->data(); 179 for (size_t i = 0; i < n; ++i) { 180 slots[i].mark = 0; 181 } 182 183 Score score = 0; 184 /* Consider the whole span, except one last item. The following loop will 185 add the last item to the end of the "chain", evaluate it, and cut one 186 "link" form the beginning. */ 187 for (TextIdx i = 0; i < span - 1; ++i) { 188 MetaSlot& item = slots[shortcut[i]]; 189 if (item.mark == 0) { 190 score += item.score; 191 } 192 item.mark++; 193 } 194 195 Score maxScore = 0; 196 for (TextIdx i = 0; i < end; ++i) { 197 MetaSlot& pick = slots[shortcut[i + span - 1]]; 198 if (pick.mark == 0) { 199 score += pick.score; 200 } 201 pick.mark++; 202 203 if (candidates->size() <= score) { 204 candidates->resize(score + 1); 205 } 206 if (score > maxScore) { 207 maxScore = score; 208 } 209 next[i] = candidates->at(score); 210 candidates->at(score) = i; 211 212 MetaSlot& drop = slots[shortcut[i]]; 213 drop.mark--; 214 if (drop.mark == 0) { 215 score -= drop.score; 216 } 217 } 218 219 for (size_t i = 0; i < n; ++i) { 220 slots[i].mark = 0; 221 } 222 223 candidates->resize(maxScore + 1); 224 return maxScore; 225 } 226 227 static void addRange(std::vector<Range>* ranges, TextIdx start, TextIdx end) { 228 for (auto it = ranges->begin(); it != ranges->end();) { 229 if (end < it->start) { 230 ranges->insert(it, {start, end}); 231 return; 232 } 233 if (it->end < start) { 234 it++; 235 continue; 236 } 237 // Combine with existing. 238 start = std::min(start, it->start); 239 end = std::max(end, it->end); 240 // Remove consumed vector and continue. 241 it = ranges->erase(it); 242 } 243 ranges->push_back({start, end}); 244 } 245 246 std::string durchschlag_generate( 247 size_t dictionary_size_limit, size_t slice_len, size_t block_len, 248 const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) { 249 DurchschlagContext ctx = durchschlag_prepare( 250 slice_len, sample_sizes, sample_data); 251 return durchschlag_generate(DURCHSCHLAG_COLLABORATIVE, 252 dictionary_size_limit, block_len, ctx, sample_data); 253 } 254 255 DurchschlagContext durchschlag_prepare(size_t slice_len, 256 const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) { 257 /* Parameters aliasing */ 258 TextIdx sliceLen = static_cast<TextIdx>(slice_len); 259 if (sliceLen != slice_len) fatal("slice_len is too large"); 260 if (sliceLen < 1) fatal("slice_len is too small"); 261 const uint8_t* data = sample_data; 262 263 TextIdx total = 0; 264 std::vector<TextIdx> offsets; 265 offsets.reserve(sample_sizes.size()); 266 for (size_t i = 0; i < sample_sizes.size(); ++i) { 267 TextIdx delta = static_cast<TextIdx>(sample_sizes[i]); 268 if (delta != sample_sizes[i]) fatal("sample is too large"); 269 if (delta == 0) fatal("0-length samples are prohibited"); 270 TextIdx next_total = total + delta; 271 if (next_total <= total) fatal("corpus is too large"); 272 total = next_total; 273 offsets.push_back(total); 274 } 275 276 if (total < sliceLen) fatal("slice_len is larger than corpus size"); 277 TextIdx end = total - static_cast<TextIdx>(sliceLen) + 1; 278 TextIdx hashLen = 11; 279 while (hashLen < 29 && ((1u << hashLen) < end)) { 280 hashLen += 3; 281 } 282 hashLen -= 3; 283 TextIdx hashMask = (1u << hashLen) - 1u; 284 std::vector<TextIdx> hashHead(1 << hashLen); 285 TextIdx hash = 0; 286 TextIdx lShift = 3; 287 TextIdx rShift = hashLen - lShift; 288 for (TextIdx i = 0; i < sliceLen - 1; ++i) { 289 TextIdx v = data[i]; 290 hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v; 291 } 292 TextIdx lShiftX = (lShift * (sliceLen - 1)) % hashLen; 293 TextIdx rShiftX = hashLen - lShiftX; 294 295 std::vector<HashSlot> map; 296 map.push_back({0, 0}); 297 TextIdx hashSlot = 1; 298 std::vector<TextIdx> sliceMap; 299 sliceMap.reserve(end); 300 for (TextIdx i = 0; i < end; ++i) { 301 TextIdx v = data[i + sliceLen - 1]; 302 TextIdx bucket = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v; 303 v = data[i]; 304 hash = bucket ^ (((v << lShiftX) | (v >> rShiftX)) & hashMask); 305 TextIdx slot = hashHead[bucket]; 306 while (slot != 0) { 307 HashSlot& item = map[slot]; 308 TextIdx start = item.offset; 309 bool miss = false; 310 for (TextIdx j = 0; j < sliceLen; ++j) { 311 if (data[i + j] != data[start + j]) { 312 miss = true; 313 break; 314 } 315 } 316 if (!miss) { 317 sliceMap.push_back(slot); 318 break; 319 } 320 slot = item.next; 321 } 322 if (slot == 0) { 323 map.push_back({hashHead[bucket], i}); 324 hashHead[bucket] = hashSlot; 325 sliceMap.push_back(hashSlot); 326 hashSlot++; 327 } 328 } 329 330 return {total, sliceLen, static_cast<TextIdx>(map.size()), 331 std::move(offsets), std::move(sliceMap)}; 332 } 333 334 DurchschlagContext durchschlag_prepare(size_t slice_len, 335 const std::vector<size_t>& sample_sizes, const DurchschlagIndex& index) { 336 /* Parameters aliasing */ 337 TextIdx sliceLen = static_cast<TextIdx>(slice_len); 338 if (sliceLen != slice_len) fatal("slice_len is too large"); 339 if (sliceLen < 1) fatal("slice_len is too small"); 340 const TextIdx* lcp = index.lcp.data(); 341 const TextIdx* sa = index.sa.data(); 342 343 TextIdx total = 0; 344 std::vector<TextIdx> offsets; 345 offsets.reserve(sample_sizes.size()); 346 for (size_t i = 0; i < sample_sizes.size(); ++i) { 347 TextIdx delta = static_cast<TextIdx>(sample_sizes[i]); 348 if (delta != sample_sizes[i]) fatal("sample is too large"); 349 if (delta == 0) fatal("0-length samples are prohibited"); 350 TextIdx next_total = total + delta; 351 if (next_total <= total) fatal("corpus is too large"); 352 total = next_total; 353 offsets.push_back(total); 354 } 355 356 if (total < sliceLen) fatal("slice_len is larger than corpus size"); 357 TextIdx counter = 1; 358 TextIdx end = total - sliceLen + 1; 359 std::vector<TextIdx> sliceMap(total); 360 TextIdx last = 0; 361 TextIdx current = 1; 362 while (current <= total) { 363 if (lcp[current - 1] < sliceLen) { 364 for (TextIdx i = last; i < current; ++i) { 365 sliceMap[sa[i]] = counter; 366 } 367 counter++; 368 last = current; 369 } 370 current++; 371 } 372 sliceMap.resize(end); 373 374 // Reorder items for the better locality. 375 std::vector<TextIdx> reorder(counter); 376 counter = 1; 377 for (TextIdx i = 0; i < end; ++i) { 378 if (reorder[sliceMap[i]] == 0) { 379 reorder[sliceMap[i]] = counter++; 380 } 381 } 382 for (TextIdx i = 0; i < end; ++i) { 383 sliceMap[i] = reorder[sliceMap[i]]; 384 } 385 386 return {total, sliceLen, counter, std::move(offsets), std::move(sliceMap)}; 387 } 388 389 DurchschlagIndex durchschlag_index(const std::vector<uint8_t>& data) { 390 TextIdx total = static_cast<TextIdx>(data.size()); 391 if (total != data.size()) fatal("corpus is too large"); 392 saidx_t saTotal = static_cast<saidx_t>(total); 393 if (saTotal < 0) fatal("corpus is too large"); 394 if (static_cast<TextIdx>(saTotal) != total) fatal("corpus is too large"); 395 std::vector<TextIdx> sa(total); 396 /* Hopefully, non-negative int32_t values match TextIdx ones. */ 397 if (sizeof(TextIdx) != sizeof(int32_t)) fatal("type length mismatch"); 398 int32_t* saData = reinterpret_cast<int32_t*>(sa.data()); 399 divsufsort(data.data(), saData, saTotal); 400 401 std::vector<TextIdx> isa(total); 402 for (TextIdx i = 0; i < total; ++i) isa[sa[i]] = i; 403 404 // TODO: borrowed -> unknown efficiency. 405 std::vector<TextIdx> lcp(total); 406 TextIdx k = 0; 407 lcp[total - 1] = 0; 408 for (TextIdx i = 0; i < total; ++i) { 409 TextIdx current = isa[i]; 410 if (current == total - 1) { 411 k = 0; 412 continue; 413 } 414 TextIdx j = sa[current + 1]; // Suffix which follow i-th suffix. 415 while ((i + k < total) && (j + k < total) && (data[i + k] == data[j + k])) { 416 ++k; 417 } 418 lcp[current] = k; 419 if (k > 0) --k; 420 } 421 422 return {std::move(lcp), std::move(sa)}; 423 } 424 425 static void ScoreSlices(const std::vector<TextIdx>& offsets, 426 std::vector<MetaSlot>& map, const TextIdx* shortcut, TextIdx end) { 427 TextIdx piece = 0; 428 /* Fresh map contains all zeroes -> initial mark should be different. */ 429 TextIdx mark = 1; 430 for (TextIdx i = 0; i < end; ++i) { 431 if (offsets[piece] == i) { 432 piece++; 433 mark++; 434 } 435 MetaSlot& item = map[shortcut[i]]; 436 if (item.mark != mark) { 437 item.mark = mark; 438 item.score++; 439 } 440 } 441 } 442 443 static std::string durchschlagGenerateExclusive( 444 size_t dictionary_size_limit, size_t block_len, 445 const DurchschlagContext& context, const uint8_t* sample_data) { 446 /* Parameters aliasing */ 447 TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit); 448 if (targetSize != dictionary_size_limit) { 449 fprintf(stderr, "dictionary_size_limit is too large\n"); 450 return ""; 451 } 452 TextIdx sliceLen = context.sliceLen; 453 TextIdx total = context.dataSize; 454 TextIdx blockLen = static_cast<TextIdx>(block_len); 455 if (blockLen != block_len) { 456 fprintf(stderr, "block_len is too large\n"); 457 return ""; 458 } 459 const uint8_t* data = sample_data; 460 const std::vector<TextIdx>& offsets = context.offsets; 461 std::vector<MetaSlot> map(context.numUniqueSlices); 462 const TextIdx* shortcut = context.sliceMap.data(); 463 464 /* Initialization */ 465 if (blockLen < sliceLen) { 466 fprintf(stderr, "sliceLen is larger than block_len\n"); 467 return ""; 468 } 469 if (targetSize < blockLen || total < blockLen) { 470 fprintf(stderr, "block_len is too large\n"); 471 return ""; 472 } 473 TextIdx end = total - sliceLen + 1; 474 ScoreSlices(offsets, map, shortcut, end); 475 TextIdx span = blockLen - sliceLen + 1; 476 end = static_cast<TextIdx>(context.sliceMap.size()) - span; 477 std::vector<TextIdx> candidates; 478 std::vector<TextIdx> next(end); 479 Score maxScore = rebuildCandidatesList( 480 &candidates, &map, span, shortcut, end, next.data()); 481 482 /* Block selection */ 483 const size_t triesLimit = (600 * 1000000) / span; 484 const size_t candidatesLimit = (150 * 1000000) / span; 485 std::vector<Range> ranges; 486 TextIdx mark = 0; 487 size_t numTries = 0; 488 while (true) { 489 TextIdx dictSize = calculateDictionarySize(ranges); 490 size_t numCandidates = 0; 491 if (dictSize > targetSize - blockLen) { 492 break; 493 } 494 if (maxScore == 0) { 495 break; 496 } 497 while (true) { 498 TextIdx candidate = 0; 499 while (maxScore > 0) { 500 if (candidates[maxScore] != 0) { 501 candidate = candidates[maxScore]; 502 candidates[maxScore] = next[candidate]; 503 break; 504 } 505 maxScore--; 506 } 507 if (maxScore == 0) { 508 break; 509 } 510 mark++; 511 numTries++; 512 numCandidates++; 513 Score score = 0; 514 for (size_t j = candidate; j < candidate + span; ++j) { 515 MetaSlot& item = map[shortcut[j]]; 516 if (item.mark != mark) { 517 score += item.score; 518 item.mark = mark; 519 } 520 } 521 if (score < maxScore) { 522 if (numTries < triesLimit && numCandidates < candidatesLimit) { 523 next[candidate] = candidates[score]; 524 candidates[score] = candidate; 525 } else { 526 maxScore = rebuildCandidatesList( 527 &candidates, &map, span, shortcut, end, next.data()); 528 mark = 0; 529 numTries = 0; 530 numCandidates = 0; 531 } 532 continue; 533 } else if (score > maxScore) { 534 fprintf(stderr, "Broken invariant\n"); 535 return ""; 536 } 537 for (TextIdx j = candidate; j < candidate + span; ++j) { 538 MetaSlot& item = map[shortcut[j]]; 539 item.score = 0; 540 } 541 addRange(&ranges, candidate, candidate + blockLen); 542 break; 543 } 544 } 545 546 return createDictionary(data, ranges, targetSize); 547 } 548 549 static std::string durchschlagGenerateCollaborative( 550 size_t dictionary_size_limit, size_t block_len, 551 const DurchschlagContext& context, const uint8_t* sample_data) { 552 /* Parameters aliasing */ 553 TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit); 554 if (targetSize != dictionary_size_limit) { 555 fprintf(stderr, "dictionary_size_limit is too large\n"); 556 return ""; 557 } 558 TextIdx sliceLen = context.sliceLen; 559 TextIdx total = context.dataSize; 560 TextIdx blockLen = static_cast<TextIdx>(block_len); 561 if (blockLen != block_len) { 562 fprintf(stderr, "block_len is too large\n"); 563 return ""; 564 } 565 const uint8_t* data = sample_data; 566 const std::vector<TextIdx>& offsets = context.offsets; 567 std::vector<MetaSlot> map(context.numUniqueSlices); 568 const TextIdx* shortcut = context.sliceMap.data(); 569 570 /* Initialization */ 571 if (blockLen < sliceLen) { 572 fprintf(stderr, "sliceLen is larger than block_len\n"); 573 return ""; 574 } 575 if (targetSize < blockLen || total < blockLen) { 576 fprintf(stderr, "block_len is too large\n"); 577 return ""; 578 } 579 TextIdx end = total - sliceLen + 1; 580 ScoreSlices(offsets, map, shortcut, end); 581 TextIdx span = blockLen - sliceLen + 1; 582 end = static_cast<TextIdx>(context.sliceMap.size()) - span; 583 std::vector<Candidate> candidates; 584 candidates.reserve(CANDIDATE_BUNDLE_SIZE + 1024); 585 Score minScore = buildCandidatesList(&candidates, &map, span, shortcut, end); 586 587 /* Block selection */ 588 std::vector<Range> ranges; 589 TextIdx mark = 0; 590 while (true) { 591 TextIdx dictSize = calculateDictionarySize(ranges); 592 if (dictSize > targetSize - blockLen) { 593 break; 594 } 595 if (minScore == 0 && candidates.empty()) { 596 break; 597 } 598 while (true) { 599 if (candidates.empty()) { 600 minScore = buildCandidatesList(&candidates, &map, span, shortcut, end); 601 mark = 0; 602 } 603 TextIdx candidate = candidates[0].position; 604 Score expectedScore = candidates[0].score; 605 if (expectedScore == 0) { 606 candidates.resize(0); 607 break; 608 } 609 std::pop_heap(candidates.begin(), candidates.end(), lessScore()); 610 candidates.pop_back(); 611 mark++; 612 Score score = 0; 613 for (TextIdx j = candidate; j < candidate + span; ++j) { 614 MetaSlot& item = map[shortcut[j]]; 615 if (item.mark != mark) { 616 score += item.score; 617 item.mark = mark; 618 } 619 } 620 if (score < expectedScore) { 621 if (score >= minScore) { 622 candidates.push_back({score, candidate}); 623 std::push_heap(candidates.begin(), candidates.end(), lessScore()); 624 } 625 continue; 626 } else if (score > expectedScore) { 627 fatal("Broken invariant"); 628 } 629 for (TextIdx j = candidate; j < candidate + span; ++j) { 630 MetaSlot& item = map[shortcut[j]]; 631 item.score = 0; 632 } 633 addRange(&ranges, candidate, candidate + blockLen); 634 break; 635 } 636 } 637 638 return createDictionary(data, ranges, targetSize); 639 } 640 641 std::string durchschlag_generate(DurchschalgResourceStrategy strategy, 642 size_t dictionary_size_limit, size_t block_len, 643 const DurchschlagContext& context, const uint8_t* sample_data) { 644 if (strategy == DURCHSCHLAG_COLLABORATIVE) { 645 return durchschlagGenerateCollaborative( 646 dictionary_size_limit, block_len, context, sample_data); 647 } else { 648 return durchschlagGenerateExclusive( 649 dictionary_size_limit, block_len, context, sample_data); 650 } 651 } 652 653 void durchschlag_distill(size_t slice_len, size_t minimum_population, 654 std::vector<size_t>* sample_sizes, uint8_t* sample_data) { 655 /* Parameters aliasing */ 656 uint8_t* data = sample_data; 657 658 /* Build slice map. */ 659 DurchschlagContext context = durchschlag_prepare( 660 slice_len, *sample_sizes, data); 661 662 /* Calculate slice population. */ 663 const std::vector<TextIdx>& offsets = context.offsets; 664 std::vector<MetaSlot> map(context.numUniqueSlices); 665 const TextIdx* shortcut = context.sliceMap.data(); 666 TextIdx sliceLen = context.sliceLen; 667 TextIdx total = context.dataSize; 668 TextIdx end = total - sliceLen + 1; 669 ScoreSlices(offsets, map, shortcut, end); 670 671 /* Condense samples, omitting unique slices. */ 672 TextIdx readPos = 0; 673 TextIdx writePos = 0; 674 TextIdx lastNonUniquePos = 0; 675 for (TextIdx i = 0; i < sample_sizes->size(); ++i) { 676 TextIdx sampleStart = writePos; 677 TextIdx oldSampleEnd = 678 readPos + static_cast<TextIdx>(sample_sizes->at(i)); 679 while (readPos < oldSampleEnd) { 680 if (readPos < end) { 681 MetaSlot& item = map[shortcut[readPos]]; 682 if (item.score >= minimum_population) { 683 lastNonUniquePos = readPos + sliceLen; 684 } 685 } 686 if (readPos < lastNonUniquePos) { 687 data[writePos++] = data[readPos]; 688 } 689 readPos++; 690 } 691 sample_sizes->at(i) = writePos - sampleStart; 692 } 693 } 694 695 void durchschlag_purify(size_t slice_len, size_t minimum_population, 696 const std::vector<size_t>& sample_sizes, uint8_t* sample_data) { 697 /* Parameters aliasing */ 698 uint8_t* data = sample_data; 699 700 /* Build slice map. */ 701 DurchschlagContext context = durchschlag_prepare( 702 slice_len, sample_sizes, data); 703 704 /* Calculate slice population. */ 705 const std::vector<TextIdx>& offsets = context.offsets; 706 std::vector<MetaSlot> map(context.numUniqueSlices); 707 const TextIdx* shortcut = context.sliceMap.data(); 708 TextIdx sliceLen = context.sliceLen; 709 TextIdx total = context.dataSize; 710 TextIdx end = total - sliceLen + 1; 711 ScoreSlices(offsets, map, shortcut, end); 712 713 /* Rewrite samples, zeroing out unique slices. */ 714 TextIdx lastNonUniquePos = 0; 715 for (TextIdx readPos = 0; readPos < total; ++readPos) { 716 if (readPos < end) { 717 MetaSlot& item = map[shortcut[readPos]]; 718 if (item.score >= minimum_population) { 719 lastNonUniquePos = readPos + sliceLen; 720 } 721 } 722 if (readPos >= lastNonUniquePos) { 723 data[readPos] = 0; 724 } 725 } 726 } 727