///////////////////////////////////////////////////////////////////////////////
//
/// \file       coder.c
/// \brief      Compresses or uncompresses a file
//
//  Author:     Lasse Collin
//
//  This file has been put into the public domain.
//  You can do whatever you want with this file.
//
///////////////////////////////////////////////////////////////////////////////

#include "private.h"


/// Return value type for coder_init().
enum coder_init_ret {
	/// A normal liblzma-based coder was initialized.
	CODER_INIT_NORMAL,

	/// The input should be copied to the output as is.
	CODER_INIT_PASSTHRU,

	/// Initialization failed.
	CODER_INIT_ERROR,
};


/// Operation mode. This file distinguishes MODE_COMPRESS, MODE_DECOMPRESS,
/// and MODE_TEST.
enum operation_mode opt_mode = MODE_COMPRESS;

/// File format to use. When not compressing, FORMAT_AUTO makes coder_init()
/// detect the format from the beginning of the input.
enum format_type opt_format = FORMAT_AUTO;

/// If false (--no-adjust), coder_set_compression_settings() fails instead
/// of switching to single-threaded mode or reducing the dictionary size
/// to meet the memory usage limit.
bool opt_auto_adjust = true;

/// If true (--single-stream), the decoders aren't given LZMA_CONCATENATED
/// and unconsumed input is allowed to remain after decoding.
bool opt_single_stream = false;

/// Block size set with --block-size=SIZE. Zero means it wasn't set.
uint64_t opt_block_size = 0;

/// Block sizes set with --block-list, or NULL if it wasn't used.
/// split_block() treats a zero entry as the end of the list.
uint64_t *opt_block_list = NULL;


/// Stream used to communicate with liblzma
static lzma_stream strm = LZMA_STREAM_INIT;

/// Filter chain used for encoding in all formats, and also for decoding
/// in raw mode. The extra element is for the LZMA_VLI_UNKNOWN terminator.
static lzma_filter filters[LZMA_FILTERS_MAX + 1];

/// Input and output buffers
static io_buf in_buf;
static io_buf out_buf;

/// Number of filters. Zero indicates that we are using a preset.
static uint32_t filters_count = 0;

/// Number of the preset (0-9), possibly ORed with LZMA_PRESET_EXTREME
/// by coder_set_extreme().
static uint32_t preset_number = LZMA_PRESET_DEFAULT;

/// Integrity check type
static lzma_check check;

/// This becomes false if the --check=CHECK option is used.
static bool check_default = true;

/// Indicates if unconsumed input is allowed to remain after
/// decoding has successfully finished. This is set for each file
/// in coder_init().
static bool allow_trailing_input;

#ifdef MYTHREAD_ENABLED
/// Options passed to lzma_stream_encoder_mt_memusage(),
/// lzma_stream_encoder_mt(), and lzma_stream_decoder_mt().
/// The remaining members are filled in by
/// coder_set_compression_settings() and coder_init().
static lzma_mt mt_options = {
	.flags = 0,
	.timeout = 300,
	.filters = filters,
};
#endif


/// Set the integrity check type and remember that it was set explicitly
/// so that coder_set_compression_settings() won't replace it with
/// the default.
extern void
coder_set_check(lzma_check new_check)
{
	check = new_check;
	check_default = false;
	return;
}


/// Free the options of every filter in the custom filter chain and
/// reset filters_count to zero.
static void
forget_filter_chain(void)
{
	// Setting a preset makes us forget a possibly defined custom
	// filter chain.
	while (filters_count > 0) {
		--filters_count;
		free(filters[filters_count].options);
		filters[filters_count].options = NULL;
	}

	return;
}


/// Set the preset level. The bits covered by LZMA_PRESET_LEVEL_MASK are
/// replaced with new_preset while the other bits of preset_number are kept.
/// A possibly defined custom filter chain is forgotten.
extern void
coder_set_preset(uint32_t new_preset)
{
	preset_number &= ~LZMA_PRESET_LEVEL_MASK;
	preset_number |= new_preset;
	forget_filter_chain();
	return;
}


/// Add the LZMA_PRESET_EXTREME flag to the preset and forget a possibly
/// defined custom filter chain.
extern void
coder_set_extreme(void)
{
	preset_number |= LZMA_PRESET_EXTREME;
	forget_filter_chain();
	return;
}


/// Append a filter to the custom filter chain. It is a fatal error if
/// the chain already has LZMA_FILTERS_MAX filters.
///
/// The options pointer is stored as is. forget_filter_chain() will
/// pass it to free() if the chain is later forgotten.
extern void
coder_add_filter(lzma_vli id, void *options)
{
	if (filters_count == LZMA_FILTERS_MAX)
		message_fatal(_("Maximum number of filters is four"));

	filters[filters_count].id = id;
	filters[filters_count].options = options;
	++filters_count;

	// Setting a custom filter chain makes us forget the preset options.
	// This makes a difference if one specifies e.g. "xz -9 --lzma2 -e"
	// where the custom filter chain resets the preset level back to
	// the default 6, making the example equivalent to "xz -6e".
	preset_number = LZMA_PRESET_DEFAULT;

	return;
}


/// Show an error message telling that the memory usage limit is too low,
/// show how much memory would have been needed, and exit.
tuklib_attr_noreturn
static void
memlimit_too_small(uint64_t memory_usage)
{
	message(V_ERROR, _("Memory usage limit is too low for the given "
			"filter setup."));
	message_mem_needed(V_ERROR, memory_usage);
	tuklib_exit(E_ERROR, E_ERROR, false);
}


/// Finalize the settings before any files are coded: pick the default
/// integrity check if none was set, build the filter chain from the preset
/// if no custom chain was given, validate the chain against the file
/// format and --flush-timeout, and finally check the memory usage against
/// the limit. If the limit is exceeded, the number of threads and then
/// the LZMA1/LZMA2 dictionary size are reduced when that is allowed;
/// otherwise this function doesn't return.
extern void
coder_set_compression_settings(void)
{
#ifdef HAVE_LZIP_DECODER
	// .lz compression isn't supported.
	assert(opt_format != FORMAT_LZIP);
#endif

	// The default check type is CRC64, but fall back to CRC32
	// if CRC64 isn't supported by the copy of liblzma we are
	// using. CRC32 is always supported.
	if (check_default) {
		check = LZMA_CHECK_CRC64;
		if (!lzma_check_is_supported(check))
			check = LZMA_CHECK_CRC32;
	}

	// Options for LZMA1 or LZMA2 in case we are using a preset.
	// This is static because filters[0].options will keep pointing
	// to it after this function has returned.
	static lzma_options_lzma opt_lzma;

	if (filters_count == 0) {
		// We are using a preset. This is not a good idea in raw mode
		// except when playing around with things. Different versions
		// of this software may use different options in presets, and
		// thus make uncompressing the raw data difficult.
		if (opt_format == FORMAT_RAW) {
			// The message is shown only if warnings are allowed
			// but the exit status isn't changed.
			message(V_WARNING, _("Using a preset in raw mode "
					"is discouraged."));
			message(V_WARNING, _("The exact options of the "
					"presets may vary between software "
					"versions."));
		}

		// Get the preset for LZMA1 or LZMA2.
		if (lzma_lzma_preset(&opt_lzma, preset_number))
			message_bug();

		// Use LZMA2 except with --format=lzma we use LZMA1.
		filters[0].id = opt_format == FORMAT_LZMA
				? LZMA_FILTER_LZMA1 : LZMA_FILTER_LZMA2;
		filters[0].options = &opt_lzma;
		filters_count = 1;
	}

	// Terminate the filter options array.
	filters[filters_count].id = LZMA_VLI_UNKNOWN;

	// If we are using the .lzma format, allow exactly one filter
	// which has to be LZMA1.
	if (opt_format == FORMAT_LZMA && (filters_count != 1
			|| filters[0].id != LZMA_FILTER_LZMA1))
		message_fatal(_("The .lzma format supports only "
				"the LZMA1 filter"));

	// If we are using the .xz format, make sure that there is no LZMA1
	// filter to prevent LZMA_PROG_ERROR.
	if (opt_format == FORMAT_XZ)
		for (size_t i = 0; i < filters_count; ++i)
			if (filters[i].id == LZMA_FILTER_LZMA1)
				message_fatal(_("LZMA1 cannot be used "
						"with the .xz format"));

	// Print the selected filter chain.
	message_filters_show(V_DEBUG, filters);

	// The --flush-timeout option requires LZMA_SYNC_FLUSH support
	// from the filter chain. Currently threaded encoder doesn't support
	// LZMA_SYNC_FLUSH so single-threaded mode must be used.
	if (opt_mode == MODE_COMPRESS && opt_flush_timeout != 0) {
		for (size_t i = 0; i < filters_count; ++i) {
			switch (filters[i].id) {
			case LZMA_FILTER_LZMA2:
			case LZMA_FILTER_DELTA:
				break;

			default:
				message_fatal(_("The filter chain is "
					"incompatible with --flush-timeout"));
			}
		}

		if (hardware_threads_is_mt()) {
			message(V_WARNING, _("Switching to single-threaded "
					"mode due to --flush-timeout"));
			hardware_threads_set(1);
		}
	}

	// Get the memory usage. Note that if --format=raw was used,
	// we can be decompressing.
	//
	// If multithreaded .xz compression is done, this value will be
	// replaced.
	uint64_t memory_limit = hardware_memlimit_get(opt_mode);

	// UINT64_MAX is the "unsupported" marker checked below. It stays
	// as is if the needed encoder or decoder support wasn't built.
	uint64_t memory_usage = UINT64_MAX;
	if (opt_mode == MODE_COMPRESS) {
#ifdef HAVE_ENCODERS
# ifdef MYTHREAD_ENABLED
		if (opt_format == FORMAT_XZ && hardware_threads_is_mt()) {
			memory_limit = hardware_memlimit_mtenc_get();
			mt_options.threads = hardware_threads_get();
			mt_options.block_size = opt_block_size;
			mt_options.check = check;
			memory_usage = lzma_stream_encoder_mt_memusage(
					&mt_options);
			if (memory_usage != UINT64_MAX)
				message(V_DEBUG, _("Using up to %" PRIu32
						" threads."),
						mt_options.threads);
		} else
# endif
		{
			memory_usage = lzma_raw_encoder_memusage(filters);
		}
#endif
	} else {
#ifdef HAVE_DECODERS
		memory_usage = lzma_raw_decoder_memusage(filters);
#endif
	}

	if (memory_usage == UINT64_MAX)
		message_fatal(_("Unsupported filter chain or filter options"));

	// Print memory usage info before possible dictionary
	// size auto-adjusting.
	//
	// NOTE: If only encoder support was built, we cannot show
	// what the decoder memory usage will be.
	message_mem_needed(V_DEBUG, memory_usage);
#ifdef HAVE_DECODERS
	if (opt_mode == MODE_COMPRESS) {
		const uint64_t decmem = lzma_raw_decoder_memusage(filters);
		if (decmem != UINT64_MAX)
			message(V_DEBUG, _("Decompression will need "
				"%s MiB of memory."), uint64_to_str(
					round_up_to_mib(decmem), 0));
	}
#endif

	if (memory_usage <= memory_limit)
		return;

	// With --format=raw settings are never adjusted to meet
	// the memory usage limit.
	if (opt_format == FORMAT_RAW)
		memlimit_too_small(memory_usage);

	assert(opt_mode == MODE_COMPRESS);

#ifdef HAVE_ENCODERS
# ifdef MYTHREAD_ENABLED
	if (opt_format == FORMAT_XZ && hardware_threads_is_mt()) {
		// Try to reduce the number of threads before
		// adjusting the compression settings down.
		while (mt_options.threads > 1) {
			// Reduce the number of threads by one and check
			// the memory usage.
			--mt_options.threads;
			memory_usage = lzma_stream_encoder_mt_memusage(
					&mt_options);
			if (memory_usage == UINT64_MAX)
				message_bug();

			if (memory_usage <= memory_limit) {
				// The memory usage is now low enough.
				message(V_WARNING, _("Reduced the number of "
					"threads from %s to %s to not exceed "
					"the memory usage limit of %s MiB"),
					uint64_to_str(
						hardware_threads_get(), 0),
					uint64_to_str(mt_options.threads, 1),
					uint64_to_str(round_up_to_mib(
						memory_limit), 2));
				return;
			}
		}

		// If the memory usage limit is only a soft limit (automatic
		// number of threads and no --memlimit-compress), the limit
		// is only used to reduce the number of threads and once at
		// just one thread, the limit is completely ignored. This
		// way -T0 won't use insane amount of memory but at the same
		// time the soft limit will never make xz fail and never make
		// xz change settings that would affect the compressed output.
		if (hardware_memlimit_mtenc_is_default()) {
			message(V_WARNING, _("Reduced the number of threads "
				"from %s to one. The automatic memory usage "
				"limit of %s MiB is still being exceeded. "
				"%s MiB of memory is required. "
				"Continuing anyway."),
				uint64_to_str(hardware_threads_get(), 0),
				uint64_to_str(
					round_up_to_mib(memory_limit), 1),
				uint64_to_str(
					round_up_to_mib(memory_usage), 2));
			return;
		}

		// If --no-adjust was used, we cannot drop to single-threaded
		// mode since it produces different compressed output.
		//
		// NOTE: In xz 5.2.x, --no-adjust also prevented reducing
		// the number of threads. This changed in 5.3.3alpha.
		if (!opt_auto_adjust)
			memlimit_too_small(memory_usage);

		// Switch to single-threaded mode. It uses
		// less memory than using one thread in
		// the multithreaded mode but the output
		// is also different.
		hardware_threads_set(1);
		memory_usage = lzma_raw_encoder_memusage(filters);
		message(V_WARNING, _("Switching to single-threaded mode "
			"to not exceed the memory usage limit of %s MiB"),
			uint64_to_str(round_up_to_mib(memory_limit), 0));
	}
# endif

	if (memory_usage <= memory_limit)
		return;

	// Don't adjust LZMA2 or LZMA1 dictionary size if --no-adjust
	// was specified as that would change the compressed output.
	if (!opt_auto_adjust)
		memlimit_too_small(memory_usage);

	// Look for the first LZMA2 or LZMA1 filter in the chain so we can
	// make it use less RAM. With other filters we don't know what to do,
	// so reaching the terminator without finding one is an error.
	size_t i = 0;
	while (filters[i].id != LZMA_FILTER_LZMA2
			&& filters[i].id != LZMA_FILTER_LZMA1) {
		if (filters[i].id == LZMA_VLI_UNKNOWN)
			memlimit_too_small(memory_usage);

		++i;
	}

	// Decrease the dictionary size until we meet the memory
	// usage limit. First round down to full mebibytes.
	lzma_options_lzma *opt = filters[i].options;
	const uint32_t orig_dict_size = opt->dict_size;
	opt->dict_size &= ~((UINT32_C(1) << 20) - 1);
	while (true) {
		// If it is below 1 MiB, auto-adjusting failed. We could be
		// more sophisticated and scale it down even more, but let's
		// see if many complain about this version.
		//
		// FIXME: Displays the scaled memory usage instead
		// of the original.
		if (opt->dict_size < (UINT32_C(1) << 20))
			memlimit_too_small(memory_usage);

		memory_usage = lzma_raw_encoder_memusage(filters);
		if (memory_usage == UINT64_MAX)
			message_bug();

		// Accept it if it is low enough.
		if (memory_usage <= memory_limit)
			break;

		// Otherwise 1 MiB down and try again. I hope this
		// isn't too slow a method for cases where the original
		// dict_size is very big.
		opt->dict_size -= UINT32_C(1) << 20;
	}

	// Tell the user that we decreased the dictionary size.
	message(V_WARNING, _("Adjusted LZMA%c dictionary size "
			"from %s MiB to %s MiB to not exceed "
			"the memory usage limit of %s MiB"),
			filters[i].id == LZMA_FILTER_LZMA2
				? '2' : '1',
			uint64_to_str(orig_dict_size >> 20, 0),
			uint64_to_str(opt->dict_size >> 20, 1),
			uint64_to_str(round_up_to_mib(memory_limit), 2));
#endif

	return;
}


#ifdef HAVE_DECODERS
/// Return true if the data in in_buf seems to be in the .xz format.
static bool
is_format_xz(void)
{
	// Specify the magic as hex to be compatible with EBCDIC systems.
	static const uint8_t magic[6] = { 0xFD, 0x37, 0x7A, 0x58, 0x5A, 0x00 };
	return strm.avail_in >= sizeof(magic)
			&& memcmp(in_buf.u8, magic, sizeof(magic)) == 0;
}


/// Return true if the data in in_buf seems to be in the .lzma format.
static bool
is_format_lzma(void)
{
	// The .lzma header is 13 bytes.
	if (strm.avail_in < 13)
		return false;

	// Decode the LZMA1 properties.
	lzma_filter filter = { .id = LZMA_FILTER_LZMA1 };
	if (lzma_properties_decode(&filter, NULL, in_buf.u8, 5) != LZMA_OK)
		return false;

	// A hack to ditch tons of false positives: We allow only dictionary
	// sizes that are 2^n or 2^n + 2^(n-1) or UINT32_MAX. LZMA_Alone
	// created only files with 2^n, but accepts any dictionary size.
	// If someone complains, this will be reconsidered.
	//
	// Only dict_size is needed from the decoded options, so they are
	// freed right away.
	lzma_options_lzma *opt = filter.options;
	const uint32_t dict_size = opt->dict_size;
	free(opt);

	if (dict_size != UINT32_MAX) {
		uint32_t d = dict_size - 1;
		d |= d >> 2;
		d |= d >> 3;
		d |= d >> 4;
		d |= d >> 8;
		d |= d >> 16;
		++d;
		if (d != dict_size || dict_size == 0)
			return false;
	}

	// Another hack to ditch false positives: Assume that if the
	// uncompressed size is known, it must be less than 256 GiB.
	// Again, if someone complains, this will be reconsidered.
	//
	// The size is read as a 64-bit little endian integer from
	// the bytes 5-12 of the header. UINT64_MAX is accepted too.
	uint64_t uncompressed_size = 0;
	for (size_t i = 0; i < 8; ++i)
		uncompressed_size |= (uint64_t)(in_buf.u8[5 + i]) << (i * 8);

	if (uncompressed_size != UINT64_MAX
			&& uncompressed_size > (UINT64_C(1) << 38))
		return false;

	return true;
}


#ifdef HAVE_LZIP_DECODER
/// Return true if the data in in_buf seems to be in the .lz format.
static bool
is_format_lzip(void)
{
	// The magic bytes as hex, like in is_format_xz().
	static const uint8_t magic[4] = { 0x4C, 0x5A, 0x49, 0x50 };
	return strm.avail_in >= sizeof(magic)
			&& memcmp(in_buf.u8, magic, sizeof(magic)) == 0;
}
#endif
#endif


/// Detect the input file type (for now, this is done only when
/// decompressing), and initialize an appropriate coder. Return value
/// indicates if a normal liblzma-based coder was initialized
/// (CODER_INIT_NORMAL), if passthru mode should be used
/// (CODER_INIT_PASSTHRU), or if an error occurred (CODER_INIT_ERROR).
/// In the error case a message has already been shown.
static enum coder_init_ret
coder_init(file_pair *pair)
{
	// This is the result if no initialization function gets called
	// below, for example, if the needed encoder or decoder support
	// wasn't built.
	lzma_ret ret = LZMA_PROG_ERROR;

	// In most cases if there is input left when coding finishes,
	// something has gone wrong. Exceptions are --single-stream
	// and decoding .lz files which can contain trailing non-.lz data.
	// These will be handled later in this function.
	allow_trailing_input = false;

	if (opt_mode == MODE_COMPRESS) {
#ifdef HAVE_ENCODERS
		switch (opt_format) {
		case FORMAT_AUTO:
			// args.c ensures that this case is never reached.
			assert(0);
			break;

		case FORMAT_XZ:
# ifdef MYTHREAD_ENABLED
			if (hardware_threads_is_mt())
				ret = lzma_stream_encoder_mt(
						&strm, &mt_options);
			else
# endif
				ret = lzma_stream_encoder(
						&strm, filters, check);
			break;

		case FORMAT_LZMA:
			ret = lzma_alone_encoder(&strm, filters[0].options);
			break;

# ifdef HAVE_LZIP_DECODER
		case FORMAT_LZIP:
			// args.c should disallow this.
			assert(0);
			ret = LZMA_PROG_ERROR;
			break;
# endif

		case FORMAT_RAW:
			ret = lzma_raw_encoder(&strm, filters);
			break;
		}
#endif
	} else {
#ifdef HAVE_DECODERS
		uint32_t flags = 0;

		// It seems silly to warn about unsupported check if the
		// check won't be verified anyway due to --ignore-check.
		if (opt_ignore_check)
			flags |= LZMA_IGNORE_CHECK;
		else
			flags |= LZMA_TELL_UNSUPPORTED_CHECK;

		if (opt_single_stream)
			allow_trailing_input = true;
		else
			flags |= LZMA_CONCATENATED;

		// We abuse FORMAT_AUTO to indicate unknown file format,
		// for which we may consider passthru mode.
		enum format_type init_format = FORMAT_AUTO;

		switch (opt_format) {
		case FORMAT_AUTO:
			// .lz is checked before .lzma since .lzma detection
			// is more complicated (no magic bytes).
			if (is_format_xz())
				init_format = FORMAT_XZ;
# ifdef HAVE_LZIP_DECODER
			else if (is_format_lzip())
				init_format = FORMAT_LZIP;
# endif
			else if (is_format_lzma())
				init_format = FORMAT_LZMA;
			break;

		case FORMAT_XZ:
			if (is_format_xz())
				init_format = FORMAT_XZ;
			break;

		case FORMAT_LZMA:
			if (is_format_lzma())
				init_format = FORMAT_LZMA;
			break;

# ifdef HAVE_LZIP_DECODER
		case FORMAT_LZIP:
			if (is_format_lzip())
				init_format = FORMAT_LZIP;
			break;
# endif

		case FORMAT_RAW:
			init_format = FORMAT_RAW;
			break;
		}

		switch (init_format) {
		case FORMAT_AUTO:
			// Unknown file format. If --decompress --stdout
			// --force have been given, then we copy the input
			// as is to stdout. Checking for MODE_DECOMPRESS
			// is needed, because we don't want to use
			// passthru mode with --test.
			if (opt_mode == MODE_DECOMPRESS
					&& opt_stdout && opt_force) {
				// These are needed for progress info.
				strm.total_in = 0;
				strm.total_out = 0;
				return CODER_INIT_PASSTHRU;
			}

			ret = LZMA_FORMAT_ERROR;
			break;

		case FORMAT_XZ:
# ifdef MYTHREAD_ENABLED
			mt_options.flags = flags;

			mt_options.threads = hardware_threads_get();
			mt_options.memlimit_stop
				= hardware_memlimit_get(MODE_DECOMPRESS);

			// If single-threaded mode was requested, set the
			// memlimit for threading to zero. This forces the
			// decoder to use single-threaded mode which matches
			// the behavior of lzma_stream_decoder().
			//
			// Otherwise use the limit for threaded decompression
			// which has a sane default (users are still free to
			// make it insanely high though).
			mt_options.memlimit_threading
				= mt_options.threads == 1
					? 0 : hardware_memlimit_mtdec_get();

			ret = lzma_stream_decoder_mt(&strm, &mt_options);
# else
			ret = lzma_stream_decoder(&strm,
					hardware_memlimit_get(
						MODE_DECOMPRESS), flags);
# endif
			break;

		case FORMAT_LZMA:
			ret = lzma_alone_decoder(&strm,
					hardware_memlimit_get(
						MODE_DECOMPRESS));
			break;

# ifdef HAVE_LZIP_DECODER
		case FORMAT_LZIP:
			allow_trailing_input = true;
			ret = lzma_lzip_decoder(&strm,
					hardware_memlimit_get(
						MODE_DECOMPRESS), flags);
			break;
# endif

		case FORMAT_RAW:
			// Memory usage has already been checked in
			// coder_set_compression_settings().
			ret = lzma_raw_decoder(&strm, filters);
			break;
		}

		// Try to decode the headers. This will catch too low
		// memory usage limit in case it happens in the first
		// Block of the first Stream, which is where it very
		// probably will happen if it is going to happen.
		//
		// This will also catch unsupported check type which
		// we treat as a warning only. If there are empty
		// concatenated Streams with unsupported check type then
		// the message can be shown more than once here. The loop
		// is used in case there is first a warning about
		// unsupported check type and then the first Block
		// would exceed the memlimit.
		//
		// No output buffer is given to lzma_code() here.
		// coder_normal() sets the output buffer before it
		// continues the coding.
		if (ret == LZMA_OK && init_format != FORMAT_RAW) {
			strm.next_out = NULL;
			strm.avail_out = 0;
			while ((ret = lzma_code(&strm, LZMA_RUN))
					== LZMA_UNSUPPORTED_CHECK)
				message_warning(_("%s: %s"), pair->src_name,
						message_strm(ret));

			// With --single-stream lzma_code won't wait for
			// LZMA_FINISH and thus it can return LZMA_STREAM_END
			// if the file has no uncompressed data inside.
			// So treat LZMA_STREAM_END as LZMA_OK here.
			// When lzma_code() is called again in coder_normal()
			// it will return LZMA_STREAM_END again.
			if (ret == LZMA_STREAM_END)
				ret = LZMA_OK;
		}
#endif
	}

	if (ret != LZMA_OK) {
		message_error(_("%s: %s"), pair->src_name, message_strm(ret));
		if (ret == LZMA_MEMLIMIT_ERROR)
			message_mem_needed(V_ERROR, lzma_memusage(&strm));

		return CODER_INIT_ERROR;
	}

	return CODER_INIT_NORMAL;
}


/// Resolve conflicts between opt_block_size and opt_block_list in single
/// threaded mode. We want to default to opt_block_list, except when it is
/// larger than opt_block_size. If this is the case for the current Block
/// at *list_pos, then we break into smaller Blocks. Otherwise advance
/// to the next Block in opt_block_list, and break apart if needed.
///
/// On return, *block_remaining is the size of the next Block to encode
/// and *next_block_remaining is how much of the entry at *list_pos is
/// still left to be encoded after that Block (zero if nothing).
static void
split_block(uint64_t *block_remaining,
	    uint64_t *next_block_remaining,
	    size_t *list_pos)
{
	if (*next_block_remaining > 0) {
		// The Block at *list_pos has previously been split up.
		assert(!hardware_threads_is_mt());
		assert(opt_block_size > 0);
		assert(opt_block_list != NULL);

		if (*next_block_remaining > opt_block_size) {
			// We have to split the current Block at *list_pos
			// into another opt_block_size length Block.
			*block_remaining = opt_block_size;
		} else {
			// This is the last remaining split Block for the
			// Block at *list_pos.
			*block_remaining = *next_block_remaining;
		}

		*next_block_remaining -= *block_remaining;

	} else {
		// The Block at *list_pos has been finished. Go to the next
		// entry in the list. If the end of the list has been reached,
		// reuse the size of the last Block.
		if (opt_block_list[*list_pos + 1] != 0)
			++*list_pos;

		*block_remaining = opt_block_list[*list_pos];

		// If in single-threaded mode, split up the Block if needed.
		// This is not needed in multi-threaded mode because liblzma
		// will do this due to how threaded encoding works.
		if (!hardware_threads_is_mt() && opt_block_size > 0
				&& *block_remaining > opt_block_size) {
			*next_block_remaining
					= *block_remaining - opt_block_size;
			*block_remaining = opt_block_size;
		}
	}
}


/// Write the data accumulated in out_buf to the destination file, except
/// in MODE_TEST where nothing is written, and make the whole out_buf
/// available to liblzma again.
///
/// Returns true if io_write() returned true, in which case strm isn't
/// touched and the callers stop coding. Otherwise returns false.
static bool
coder_write_output(file_pair *pair)
{
	if (opt_mode != MODE_TEST) {
		if (io_write(pair, &out_buf, IO_BUFFER_SIZE - strm.avail_out))
			return true;
	}

	strm.next_out = out_buf.u8;
	strm.avail_out = IO_BUFFER_SIZE;
	return false;
}


/// Compress or decompress using liblzma.
///
/// Returns true if coding finished successfully. Returns false if
/// the user aborted, reading or writing failed, or liblzma reported
/// an error (a message about it has been shown).
static bool
coder_normal(file_pair *pair)
{
	// Encoder needs to know when we have given all the input to it.
	// The decoders need to know it too when we are using
	// LZMA_CONCATENATED. We need to check for src_eof here, because
	// the first input chunk has been already read if decompressing,
	// and that may have been the only chunk we will read.
	lzma_action action = pair->src_eof ? LZMA_FINISH : LZMA_RUN;

	lzma_ret ret;

	// Assume that something goes wrong.
	bool success = false;

	// block_remaining indicates how many input bytes to encode before
	// finishing the current .xz Block. The Block size is set with
	// --block-size=SIZE and --block-list. They have an effect only when
	// compressing to the .xz format. If block_remaining == UINT64_MAX,
	// only a single block is created.
	uint64_t block_remaining = UINT64_MAX;

	// next_block_remaining for when we are in single-threaded mode and
	// the Block in --block-list is larger than the --block-size=SIZE.
	uint64_t next_block_remaining = 0;

	// Position in opt_block_list. Unused if --block-list wasn't used.
	size_t list_pos = 0;

	// Handle --block-size for single-threaded mode and the first step
	// of --block-list.
	if (opt_mode == MODE_COMPRESS && opt_format == FORMAT_XZ) {
		// --block-size doesn't do anything here in threaded mode,
		// because the threaded encoder will take care of splitting
		// to fixed-sized Blocks.
		if (!hardware_threads_is_mt() && opt_block_size > 0)
			block_remaining = opt_block_size;

		// If --block-list was used, start with the first size.
		//
		// For threaded case, --block-size specifies how big Blocks
		// the encoder needs to be prepared to create at maximum
		// and --block-list will simultaneously cause new Blocks
		// to be started at specified intervals. To keep things
		// logical, the same is done in single-threaded mode. The
		// output is still not identical because in single-threaded
		// mode the size info isn't written into Block Headers.
		if (opt_block_list != NULL) {
			if (block_remaining < opt_block_list[list_pos]) {
				assert(!hardware_threads_is_mt());
				next_block_remaining = opt_block_list[list_pos]
						- block_remaining;
			} else {
				block_remaining = opt_block_list[list_pos];
			}
		}
	}

	strm.next_out = out_buf.u8;
	strm.avail_out = IO_BUFFER_SIZE;

	while (!user_abort) {
		// Fill the input buffer if it is empty and we aren't
		// flushing or finishing.
		if (strm.avail_in == 0 && action == LZMA_RUN) {
			// Never read past the end of the current Block.
			strm.next_in = in_buf.u8;
			strm.avail_in = io_read(pair, &in_buf,
					my_min(block_remaining,
						IO_BUFFER_SIZE));

			if (strm.avail_in == SIZE_MAX)
				break;

			if (pair->src_eof) {
				action = LZMA_FINISH;

			} else if (block_remaining != UINT64_MAX) {
				// Start a new Block once block_remaining
				// bytes of input have been read.
				block_remaining -= strm.avail_in;
				if (block_remaining == 0)
					action = LZMA_FULL_BARRIER;
			}

			if (action == LZMA_RUN && pair->flush_needed)
				action = LZMA_SYNC_FLUSH;
		}

		// Let liblzma do the actual work.
		ret = lzma_code(&strm, action);

		// Write out if the output buffer became full.
		if (strm.avail_out == 0) {
			if (coder_write_output(pair))
				break;
		}

		if (ret == LZMA_STREAM_END && (action == LZMA_SYNC_FLUSH
				|| action == LZMA_FULL_BARRIER)) {
			if (action == LZMA_SYNC_FLUSH) {
				// Flushing completed. Write the pending data
				// out immediately so that the reading side
				// can decompress everything compressed so far.
				if (coder_write_output(pair))
					break;

				// Mark that we haven't seen any new input
				// since the previous flush.
				pair->src_has_seen_input = false;
				pair->flush_needed = false;
			} else {
				// Start a new Block after LZMA_FULL_BARRIER.
				if (opt_block_list == NULL) {
					assert(!hardware_threads_is_mt());
					assert(opt_block_size > 0);
					block_remaining = opt_block_size;
				} else {
					split_block(&block_remaining,
							&next_block_remaining,
							&list_pos);
				}
			}

			// Start a new Block after LZMA_FULL_BARRIER or
			// continue the same block after LZMA_SYNC_FLUSH.
			action = LZMA_RUN;

		} else if (ret != LZMA_OK) {
			// Determine if the return value indicates that we
			// won't continue coding. LZMA_NO_CHECK would be
			// here too if LZMA_TELL_ANY_CHECK was used.
			const bool stop = ret != LZMA_UNSUPPORTED_CHECK;

			if (stop) {
				// Write the remaining bytes even if something
				// went wrong, because that way the user gets
				// as much data as possible, which can be good
				// when trying to get at least some useful
				// data out of damaged files.
				if (coder_write_output(pair))
					break;
			}

			if (ret == LZMA_STREAM_END) {
				if (allow_trailing_input) {
					io_fix_src_pos(pair, strm.avail_in);
					success = true;
					break;
				}

				// Check that there is no trailing garbage.
				// This is needed for LZMA_Alone and raw
				// streams. This is *not* done with .lz files
				// as that format specifically requires
				// allowing trailing garbage.
				if (strm.avail_in == 0 && !pair->src_eof) {
					// Try reading one more byte.
					// Hopefully we don't get any more
					// input, and thus pair->src_eof
					// becomes true.
					strm.avail_in = io_read(
							pair, &in_buf, 1);
					if (strm.avail_in == SIZE_MAX)
						break;

					assert(strm.avail_in == 0
							|| strm.avail_in == 1);
				}

				if (strm.avail_in == 0) {
					assert(pair->src_eof);
					success = true;
					break;
				}

				// We hadn't reached the end of the file.
				ret = LZMA_DATA_ERROR;
				assert(stop);
			}

			// If we get here and stop is true, something went
			// wrong and we print an error. Otherwise it's just
			// a warning and coding can continue.
			if (stop) {
				message_error(_("%s: %s"), pair->src_name,
						message_strm(ret));
			} else {
				message_warning(_("%s: %s"), pair->src_name,
						message_strm(ret));

				// When compressing, all possible errors set
				// stop to true.
				assert(opt_mode != MODE_COMPRESS);
			}

			if (ret == LZMA_MEMLIMIT_ERROR) {
				// Display how much memory it would have
				// actually needed.
				message_mem_needed(V_ERROR,
						lzma_memusage(&strm));
			}

			if (stop)
				break;
		}

		// Show progress information under certain conditions.
		message_progress_update();
	}

	return success;
}


/// Copy from input file to output file without processing the data in any
/// way. This is used only when trying to decompress unrecognized files
/// with --decompress --stdout --force, so the output is always stdout.
1001 static bool 1002 coder_passthru(file_pair *pair) 1003 { 1004 while (strm.avail_in != 0) { 1005 if (user_abort) 1006 return false; 1007 1008 if (io_write(pair, &in_buf, strm.avail_in)) 1009 return false; 1010 1011 strm.total_in += strm.avail_in; 1012 strm.total_out = strm.total_in; 1013 message_progress_update(); 1014 1015 strm.avail_in = io_read(pair, &in_buf, IO_BUFFER_SIZE); 1016 if (strm.avail_in == SIZE_MAX) 1017 return false; 1018 } 1019 1020 return true; 1021 } 1022 1023 1024 extern void 1025 coder_run(const char *filename) 1026 { 1027 // Set and possibly print the filename for the progress message. 1028 message_filename(filename); 1029 1030 // Try to open the input file. 1031 file_pair *pair = io_open_src(filename); 1032 if (pair == NULL) 1033 return; 1034 1035 // Assume that something goes wrong. 1036 bool success = false; 1037 1038 if (opt_mode == MODE_COMPRESS) { 1039 strm.next_in = NULL; 1040 strm.avail_in = 0; 1041 } else { 1042 // Read the first chunk of input data. This is needed 1043 // to detect the input file type. 1044 strm.next_in = in_buf.u8; 1045 strm.avail_in = io_read(pair, &in_buf, IO_BUFFER_SIZE); 1046 } 1047 1048 if (strm.avail_in != SIZE_MAX) { 1049 // Initialize the coder. This will detect the file format 1050 // and, in decompression or testing mode, check the memory 1051 // usage of the first Block too. This way we don't try to 1052 // open the destination file if we see that coding wouldn't 1053 // work at all anyway. This also avoids deleting the old 1054 // "target" file if --force was used. 1055 const enum coder_init_ret init_ret = coder_init(pair); 1056 1057 if (init_ret != CODER_INIT_ERROR && !user_abort) { 1058 // Don't open the destination file when --test 1059 // is used. 1060 if (opt_mode == MODE_TEST || !io_open_dest(pair)) { 1061 // Remember the current time. It is needed 1062 // for progress indicator. 1063 mytime_set_start_time(); 1064 1065 // Initialize the progress indicator. 
1066 // 1067 // NOTE: When reading from stdin, fstat() 1068 // isn't called on it and thus src_st.st_size 1069 // is zero. If stdin pointed to a regular 1070 // file, it would still be possible to know 1071 // the file size but then we would also need 1072 // to take into account the current reading 1073 // position since with stdin it isn't 1074 // necessarily at the beginning of the file. 1075 const bool is_passthru = init_ret 1076 == CODER_INIT_PASSTHRU; 1077 const uint64_t in_size 1078 = pair->src_st.st_size <= 0 1079 ? 0 : (uint64_t)(pair->src_st.st_size); 1080 message_progress_start(&strm, 1081 is_passthru, in_size); 1082 1083 // Do the actual coding or passthru. 1084 if (is_passthru) 1085 success = coder_passthru(pair); 1086 else 1087 success = coder_normal(pair); 1088 1089 message_progress_end(success); 1090 } 1091 } 1092 } 1093 1094 // Close the file pair. It needs to know if coding was successful to 1095 // know if the source or target file should be unlinked. 1096 io_close(pair, success); 1097 1098 return; 1099 } 1100 1101 1102 #ifndef NDEBUG 1103 extern void 1104 coder_free(void) 1105 { 1106 lzma_end(&strm); 1107 return; 1108 } 1109 #endif 1110