///////////////////////////////////////////////////////////////////////////////
//
/// \file       coder.c
/// \brief      Compresses or uncompresses a file
//
//  Author:     Lasse Collin
//
//  This file has been put into the public domain.
//  You can do whatever you want with this file.
//
///////////////////////////////////////////////////////////////////////////////

#include "private.h"


/// Return value type for coder_init().
enum coder_init_ret {
	/// A liblzma-based coder was initialized; use coder_normal().
	CODER_INIT_NORMAL,

	/// The input format wasn't recognized and the data should be
	/// copied as is; use coder_passthru().
	CODER_INIT_PASSTHRU,

	/// Initialization failed and an error message has been shown.
	CODER_INIT_ERROR,
};


/// Operation mode; this file distinguishes compression, decompression,
/// and testing (MODE_COMPRESS, MODE_DECOMPRESS, MODE_TEST).
enum operation_mode opt_mode = MODE_COMPRESS;

/// File format to encode or decode. FORMAT_AUTO means that the format
/// is detected from the input when decompressing.
enum format_type opt_format = FORMAT_AUTO;

/// If false (--no-adjust), the compression settings are never scaled
/// down to meet the memory usage limit.
bool opt_auto_adjust = true;

/// If true (--single-stream), LZMA_CONCATENATED isn't used when decoding
/// and input is allowed to remain after the decoder has finished.
bool opt_single_stream = false;

/// Block size from --block-size=SIZE. Zero means that it wasn't set.
uint64_t opt_block_size = 0;

/// Block sizes from --block-list as an array terminated by a zero entry,
/// or NULL if --block-list wasn't used.
uint64_t *opt_block_list = NULL;


/// Stream used to communicate with liblzma
static lzma_stream strm = LZMA_STREAM_INIT;

/// Filter chain used when encoding in any format, and also when decoding
/// raw data. The extra element holds the LZMA_VLI_UNKNOWN terminator.
static lzma_filter filters[LZMA_FILTERS_MAX + 1];

/// Input and output buffers
static io_buf in_buf;
static io_buf out_buf;

/// Number of filters. Zero indicates that we are using a preset.
static uint32_t filters_count = 0;

/// Number of the preset (0-9)
static uint32_t preset_number = LZMA_PRESET_DEFAULT;

/// Integrity check type
static lzma_check check;

/// This becomes false if the --check=CHECK option is used.
static bool check_default = true;

/// Indicates if unconsumed input is allowed to remain after
/// decoding has successfully finished. This is set for each file
/// in coder_init().
57 static bool allow_trailing_input; 58 59 #ifdef MYTHREAD_ENABLED 60 static lzma_mt mt_options = { 61 .flags = 0, 62 .timeout = 300, 63 .filters = filters, 64 }; 65 #endif 66 67 68 extern void 69 coder_set_check(lzma_check new_check) 70 { 71 check = new_check; 72 check_default = false; 73 return; 74 } 75 76 77 static void 78 forget_filter_chain(void) 79 { 80 // Setting a preset makes us forget a possibly defined custom 81 // filter chain. 82 while (filters_count > 0) { 83 --filters_count; 84 free(filters[filters_count].options); 85 filters[filters_count].options = NULL; 86 } 87 88 return; 89 } 90 91 92 extern void 93 coder_set_preset(uint32_t new_preset) 94 { 95 preset_number &= ~LZMA_PRESET_LEVEL_MASK; 96 preset_number |= new_preset; 97 forget_filter_chain(); 98 return; 99 } 100 101 102 extern void 103 coder_set_extreme(void) 104 { 105 preset_number |= LZMA_PRESET_EXTREME; 106 forget_filter_chain(); 107 return; 108 } 109 110 111 extern void 112 coder_add_filter(lzma_vli id, void *options) 113 { 114 if (filters_count == LZMA_FILTERS_MAX) 115 message_fatal(_("Maximum number of filters is four")); 116 117 filters[filters_count].id = id; 118 filters[filters_count].options = options; 119 ++filters_count; 120 121 // Setting a custom filter chain makes us forget the preset options. 122 // This makes a difference if one specifies e.g. "xz -9 --lzma2 -e" 123 // where the custom filter chain resets the preset level back to 124 // the default 6, making the example equivalent to "xz -6e". 
125 preset_number = LZMA_PRESET_DEFAULT; 126 127 return; 128 } 129 130 131 static void lzma_attribute((__noreturn__)) 132 memlimit_too_small(uint64_t memory_usage) 133 { 134 message(V_ERROR, _("Memory usage limit is too low for the given " 135 "filter setup.")); 136 message_mem_needed(V_ERROR, memory_usage); 137 tuklib_exit(E_ERROR, E_ERROR, false); 138 } 139 140 141 extern void 142 coder_set_compression_settings(void) 143 { 144 #ifdef HAVE_LZIP_DECODER 145 // .lz compression isn't supported. 146 assert(opt_format != FORMAT_LZIP); 147 #endif 148 149 // The default check type is CRC64, but fallback to CRC32 150 // if CRC64 isn't supported by the copy of liblzma we are 151 // using. CRC32 is always supported. 152 if (check_default) { 153 check = LZMA_CHECK_CRC64; 154 if (!lzma_check_is_supported(check)) 155 check = LZMA_CHECK_CRC32; 156 } 157 158 // Options for LZMA1 or LZMA2 in case we are using a preset. 159 static lzma_options_lzma opt_lzma; 160 161 if (filters_count == 0) { 162 // We are using a preset. This is not a good idea in raw mode 163 // except when playing around with things. Different versions 164 // of this software may use different options in presets, and 165 // thus make uncompressing the raw data difficult. 166 if (opt_format == FORMAT_RAW) { 167 // The message is shown only if warnings are allowed 168 // but the exit status isn't changed. 169 message(V_WARNING, _("Using a preset in raw mode " 170 "is discouraged.")); 171 message(V_WARNING, _("The exact options of the " 172 "presets may vary between software " 173 "versions.")); 174 } 175 176 // Get the preset for LZMA1 or LZMA2. 177 if (lzma_lzma_preset(&opt_lzma, preset_number)) 178 message_bug(); 179 180 // Use LZMA2 except with --format=lzma we use LZMA1. 181 filters[0].id = opt_format == FORMAT_LZMA 182 ? LZMA_FILTER_LZMA1 : LZMA_FILTER_LZMA2; 183 filters[0].options = &opt_lzma; 184 filters_count = 1; 185 } 186 187 // Terminate the filter options array. 
188 filters[filters_count].id = LZMA_VLI_UNKNOWN; 189 190 // If we are using the .lzma format, allow exactly one filter 191 // which has to be LZMA1. 192 if (opt_format == FORMAT_LZMA && (filters_count != 1 193 || filters[0].id != LZMA_FILTER_LZMA1)) 194 message_fatal(_("The .lzma format supports only " 195 "the LZMA1 filter")); 196 197 // If we are using the .xz format, make sure that there is no LZMA1 198 // filter to prevent LZMA_PROG_ERROR. 199 if (opt_format == FORMAT_XZ) 200 for (size_t i = 0; i < filters_count; ++i) 201 if (filters[i].id == LZMA_FILTER_LZMA1) 202 message_fatal(_("LZMA1 cannot be used " 203 "with the .xz format")); 204 205 // Print the selected filter chain. 206 message_filters_show(V_DEBUG, filters); 207 208 // The --flush-timeout option requires LZMA_SYNC_FLUSH support 209 // from the filter chain. Currently threaded encoder doesn't support 210 // LZMA_SYNC_FLUSH so single-threaded mode must be used. 211 if (opt_mode == MODE_COMPRESS && opt_flush_timeout != 0) { 212 for (size_t i = 0; i < filters_count; ++i) { 213 switch (filters[i].id) { 214 case LZMA_FILTER_LZMA2: 215 case LZMA_FILTER_DELTA: 216 break; 217 218 default: 219 message_fatal(_("The filter chain is " 220 "incompatible with --flush-timeout")); 221 } 222 } 223 224 if (hardware_threads_is_mt()) { 225 message(V_WARNING, _("Switching to single-threaded " 226 "mode due to --flush-timeout")); 227 hardware_threads_set(1); 228 } 229 } 230 231 // Get the memory usage. Note that if --format=raw was used, 232 // we can be decompressing. 233 // 234 // If multithreaded .xz compression is done, this value will be 235 // replaced. 
236 uint64_t memory_limit = hardware_memlimit_get(opt_mode); 237 uint64_t memory_usage = UINT64_MAX; 238 if (opt_mode == MODE_COMPRESS) { 239 #ifdef HAVE_ENCODERS 240 # ifdef MYTHREAD_ENABLED 241 if (opt_format == FORMAT_XZ && hardware_threads_is_mt()) { 242 memory_limit = hardware_memlimit_mtenc_get(); 243 mt_options.threads = hardware_threads_get(); 244 mt_options.block_size = opt_block_size; 245 mt_options.check = check; 246 memory_usage = lzma_stream_encoder_mt_memusage( 247 &mt_options); 248 if (memory_usage != UINT64_MAX) 249 message(V_DEBUG, _("Using up to %" PRIu32 250 " threads."), 251 mt_options.threads); 252 } else 253 # endif 254 { 255 memory_usage = lzma_raw_encoder_memusage(filters); 256 } 257 #endif 258 } else { 259 #ifdef HAVE_DECODERS 260 memory_usage = lzma_raw_decoder_memusage(filters); 261 #endif 262 } 263 264 if (memory_usage == UINT64_MAX) 265 message_fatal(_("Unsupported filter chain or filter options")); 266 267 // Print memory usage info before possible dictionary 268 // size auto-adjusting. 269 // 270 // NOTE: If only encoder support was built, we cannot show the 271 // what the decoder memory usage will be. 272 message_mem_needed(V_DEBUG, memory_usage); 273 #ifdef HAVE_DECODERS 274 if (opt_mode == MODE_COMPRESS) { 275 const uint64_t decmem = lzma_raw_decoder_memusage(filters); 276 if (decmem != UINT64_MAX) 277 message(V_DEBUG, _("Decompression will need " 278 "%s MiB of memory."), uint64_to_str( 279 round_up_to_mib(decmem), 0)); 280 } 281 #endif 282 283 if (memory_usage <= memory_limit) 284 return; 285 286 // With --format=raw settings are never adjusted to meet 287 // the memory usage limit. 288 if (opt_format == FORMAT_RAW) 289 memlimit_too_small(memory_usage); 290 291 assert(opt_mode == MODE_COMPRESS); 292 293 #ifdef HAVE_ENCODERS 294 # ifdef MYTHREAD_ENABLED 295 if (opt_format == FORMAT_XZ && hardware_threads_is_mt()) { 296 // Try to reduce the number of threads before 297 // adjusting the compression settings down. 
298 while (mt_options.threads > 1) { 299 // Reduce the number of threads by one and check 300 // the memory usage. 301 --mt_options.threads; 302 memory_usage = lzma_stream_encoder_mt_memusage( 303 &mt_options); 304 if (memory_usage == UINT64_MAX) 305 message_bug(); 306 307 if (memory_usage <= memory_limit) { 308 // The memory usage is now low enough. 309 message(V_WARNING, _("Reduced the number of " 310 "threads from %s to %s to not exceed " 311 "the memory usage limit of %s MiB"), 312 uint64_to_str( 313 hardware_threads_get(), 0), 314 uint64_to_str(mt_options.threads, 1), 315 uint64_to_str(round_up_to_mib( 316 memory_limit), 2)); 317 return; 318 } 319 } 320 321 // If the memory usage limit is only a soft limit (automatic 322 // number of threads and no --memlimit-compress), the limit 323 // is only used to reduce the number of threads and once at 324 // just one thread, the limit is completely ignored. This 325 // way -T0 won't use insane amount of memory but at the same 326 // time the soft limit will never make xz fail and never make 327 // xz change settings that would affect the compressed output. 328 if (hardware_memlimit_mtenc_is_default()) { 329 message(V_WARNING, _("Reduced the number of threads " 330 "from %s to one. The automatic memory usage " 331 "limit of %s MiB is still being exceeded. " 332 "%s MiB of memory is required. " 333 "Continuing anyway."), 334 uint64_to_str(hardware_threads_get(), 0), 335 uint64_to_str( 336 round_up_to_mib(memory_limit), 1), 337 uint64_to_str( 338 round_up_to_mib(memory_usage), 2)); 339 return; 340 } 341 342 // If --no-adjust was used, we cannot drop to single-threaded 343 // mode since it produces different compressed output. 344 // 345 // NOTE: In xz 5.2.x, --no-adjust also prevented reducing 346 // the number of threads. This changed in 5.3.3alpha. 347 if (!opt_auto_adjust) 348 memlimit_too_small(memory_usage); 349 350 // Switch to single-threaded mode. 
It uses 351 // less memory than using one thread in 352 // the multithreaded mode but the output 353 // is also different. 354 hardware_threads_set(1); 355 memory_usage = lzma_raw_encoder_memusage(filters); 356 message(V_WARNING, _("Switching to single-threaded mode " 357 "to not exceed the memory usage limit of %s MiB"), 358 uint64_to_str(round_up_to_mib(memory_limit), 0)); 359 } 360 # endif 361 362 if (memory_usage <= memory_limit) 363 return; 364 365 // Don't adjust LZMA2 or LZMA1 dictionary size if --no-adjust 366 // was specified as that would change the compressed output. 367 if (!opt_auto_adjust) 368 memlimit_too_small(memory_usage); 369 370 // Look for the last filter if it is LZMA2 or LZMA1, so we can make 371 // it use less RAM. With other filters we don't know what to do. 372 size_t i = 0; 373 while (filters[i].id != LZMA_FILTER_LZMA2 374 && filters[i].id != LZMA_FILTER_LZMA1) { 375 if (filters[i].id == LZMA_VLI_UNKNOWN) 376 memlimit_too_small(memory_usage); 377 378 ++i; 379 } 380 381 // Decrease the dictionary size until we meet the memory 382 // usage limit. First round down to full mebibytes. 383 lzma_options_lzma *opt = filters[i].options; 384 const uint32_t orig_dict_size = opt->dict_size; 385 opt->dict_size &= ~((UINT32_C(1) << 20) - 1); 386 while (true) { 387 // If it is below 1 MiB, auto-adjusting failed. We could be 388 // more sophisticated and scale it down even more, but let's 389 // see if many complain about this version. 390 // 391 // FIXME: Displays the scaled memory usage instead 392 // of the original. 393 if (opt->dict_size < (UINT32_C(1) << 20)) 394 memlimit_too_small(memory_usage); 395 396 memory_usage = lzma_raw_encoder_memusage(filters); 397 if (memory_usage == UINT64_MAX) 398 message_bug(); 399 400 // Accept it if it is low enough. 401 if (memory_usage <= memory_limit) 402 break; 403 404 // Otherwise 1 MiB down and try again. I hope this 405 // isn't too slow method for cases where the original 406 // dict_size is very big. 
407 opt->dict_size -= UINT32_C(1) << 20; 408 } 409 410 // Tell the user that we decreased the dictionary size. 411 message(V_WARNING, _("Adjusted LZMA%c dictionary size " 412 "from %s MiB to %s MiB to not exceed " 413 "the memory usage limit of %s MiB"), 414 filters[i].id == LZMA_FILTER_LZMA2 415 ? '2' : '1', 416 uint64_to_str(orig_dict_size >> 20, 0), 417 uint64_to_str(opt->dict_size >> 20, 1), 418 uint64_to_str(round_up_to_mib(memory_limit), 2)); 419 #endif 420 421 return; 422 } 423 424 425 #ifdef HAVE_DECODERS 426 /// Return true if the data in in_buf seems to be in the .xz format. 427 static bool 428 is_format_xz(void) 429 { 430 // Specify the magic as hex to be compatible with EBCDIC systems. 431 static const uint8_t magic[6] = { 0xFD, 0x37, 0x7A, 0x58, 0x5A, 0x00 }; 432 return strm.avail_in >= sizeof(magic) 433 && memcmp(in_buf.u8, magic, sizeof(magic)) == 0; 434 } 435 436 437 /// Return true if the data in in_buf seems to be in the .lzma format. 438 static bool 439 is_format_lzma(void) 440 { 441 // The .lzma header is 13 bytes. 442 if (strm.avail_in < 13) 443 return false; 444 445 // Decode the LZMA1 properties. 446 lzma_filter filter = { .id = LZMA_FILTER_LZMA1 }; 447 if (lzma_properties_decode(&filter, NULL, in_buf.u8, 5) != LZMA_OK) 448 return false; 449 450 // A hack to ditch tons of false positives: We allow only dictionary 451 // sizes that are 2^n or 2^n + 2^(n-1) or UINT32_MAX. LZMA_Alone 452 // created only files with 2^n, but accepts any dictionary size. 453 // If someone complains, this will be reconsidered. 
454 lzma_options_lzma *opt = filter.options; 455 const uint32_t dict_size = opt->dict_size; 456 free(opt); 457 458 if (dict_size != UINT32_MAX) { 459 uint32_t d = dict_size - 1; 460 d |= d >> 2; 461 d |= d >> 3; 462 d |= d >> 4; 463 d |= d >> 8; 464 d |= d >> 16; 465 ++d; 466 if (d != dict_size || dict_size == 0) 467 return false; 468 } 469 470 // Another hack to ditch false positives: Assume that if the 471 // uncompressed size is known, it must be less than 256 GiB. 472 // Again, if someone complains, this will be reconsidered. 473 uint64_t uncompressed_size = 0; 474 for (size_t i = 0; i < 8; ++i) 475 uncompressed_size |= (uint64_t)(in_buf.u8[5 + i]) << (i * 8); 476 477 if (uncompressed_size != UINT64_MAX 478 && uncompressed_size > (UINT64_C(1) << 38)) 479 return false; 480 481 return true; 482 } 483 484 485 #ifdef HAVE_LZIP_DECODER 486 /// Return true if the data in in_buf seems to be in the .lz format. 487 static bool 488 is_format_lzip(void) 489 { 490 static const uint8_t magic[4] = { 0x4C, 0x5A, 0x49, 0x50 }; 491 return strm.avail_in >= sizeof(magic) 492 && memcmp(in_buf.u8, magic, sizeof(magic)) == 0; 493 } 494 #endif 495 #endif 496 497 498 /// Detect the input file type (for now, this done only when decompressing), 499 /// and initialize an appropriate coder. Return value indicates if a normal 500 /// liblzma-based coder was initialized (CODER_INIT_NORMAL), if passthru 501 /// mode should be used (CODER_INIT_PASSTHRU), or if an error occurred 502 /// (CODER_INIT_ERROR). 503 static enum coder_init_ret 504 coder_init(file_pair *pair) 505 { 506 lzma_ret ret = LZMA_PROG_ERROR; 507 508 // In most cases if there is input left when coding finishes, 509 // something has gone wrong. Exceptions are --single-stream 510 // and decoding .lz files which can contain trailing non-.lz data. 511 // These will be handled later in this function. 
512 allow_trailing_input = false; 513 514 if (opt_mode == MODE_COMPRESS) { 515 #ifdef HAVE_ENCODERS 516 switch (opt_format) { 517 case FORMAT_AUTO: 518 // args.c ensures this. 519 assert(0); 520 break; 521 522 case FORMAT_XZ: 523 # ifdef MYTHREAD_ENABLED 524 if (hardware_threads_is_mt()) 525 ret = lzma_stream_encoder_mt( 526 &strm, &mt_options); 527 else 528 # endif 529 ret = lzma_stream_encoder( 530 &strm, filters, check); 531 break; 532 533 case FORMAT_LZMA: 534 ret = lzma_alone_encoder(&strm, filters[0].options); 535 break; 536 537 # ifdef HAVE_LZIP_DECODER 538 case FORMAT_LZIP: 539 // args.c should disallow this. 540 assert(0); 541 ret = LZMA_PROG_ERROR; 542 break; 543 # endif 544 545 case FORMAT_RAW: 546 ret = lzma_raw_encoder(&strm, filters); 547 break; 548 } 549 #endif 550 } else { 551 #ifdef HAVE_DECODERS 552 uint32_t flags = 0; 553 554 // It seems silly to warn about unsupported check if the 555 // check won't be verified anyway due to --ignore-check. 556 if (opt_ignore_check) 557 flags |= LZMA_IGNORE_CHECK; 558 else 559 flags |= LZMA_TELL_UNSUPPORTED_CHECK; 560 561 if (opt_single_stream) 562 allow_trailing_input = true; 563 else 564 flags |= LZMA_CONCATENATED; 565 566 // We abuse FORMAT_AUTO to indicate unknown file format, 567 // for which we may consider passthru mode. 568 enum format_type init_format = FORMAT_AUTO; 569 570 switch (opt_format) { 571 case FORMAT_AUTO: 572 // .lz is checked before .lzma since .lzma detection 573 // is more complicated (no magic bytes). 
574 if (is_format_xz()) 575 init_format = FORMAT_XZ; 576 # ifdef HAVE_LZIP_DECODER 577 else if (is_format_lzip()) 578 init_format = FORMAT_LZIP; 579 # endif 580 else if (is_format_lzma()) 581 init_format = FORMAT_LZMA; 582 break; 583 584 case FORMAT_XZ: 585 if (is_format_xz()) 586 init_format = FORMAT_XZ; 587 break; 588 589 case FORMAT_LZMA: 590 if (is_format_lzma()) 591 init_format = FORMAT_LZMA; 592 break; 593 594 # ifdef HAVE_LZIP_DECODER 595 case FORMAT_LZIP: 596 if (is_format_lzip()) 597 init_format = FORMAT_LZIP; 598 break; 599 # endif 600 601 case FORMAT_RAW: 602 init_format = FORMAT_RAW; 603 break; 604 } 605 606 switch (init_format) { 607 case FORMAT_AUTO: 608 // Unknown file format. If --decompress --stdout 609 // --force have been given, then we copy the input 610 // as is to stdout. Checking for MODE_DECOMPRESS 611 // is needed, because we don't want to do use 612 // passthru mode with --test. 613 if (opt_mode == MODE_DECOMPRESS 614 && opt_stdout && opt_force) { 615 // These are needed for progress info. 616 strm.total_in = 0; 617 strm.total_out = 0; 618 return CODER_INIT_PASSTHRU; 619 } 620 621 ret = LZMA_FORMAT_ERROR; 622 break; 623 624 case FORMAT_XZ: 625 # ifdef MYTHREAD_ENABLED 626 mt_options.flags = flags; 627 628 mt_options.threads = hardware_threads_get(); 629 mt_options.memlimit_stop 630 = hardware_memlimit_get(MODE_DECOMPRESS); 631 632 // If single-threaded mode was requested, set the 633 // memlimit for threading to zero. This forces the 634 // decoder to use single-threaded mode which matches 635 // the behavior of lzma_stream_decoder(). 636 // 637 // Otherwise use the limit for threaded decompression 638 // which has a sane default (users are still free to 639 // make it insanely high though). 640 mt_options.memlimit_threading 641 = mt_options.threads == 1 642 ? 
0 : hardware_memlimit_mtdec_get(); 643 644 ret = lzma_stream_decoder_mt(&strm, &mt_options); 645 # else 646 ret = lzma_stream_decoder(&strm, 647 hardware_memlimit_get( 648 MODE_DECOMPRESS), flags); 649 # endif 650 break; 651 652 case FORMAT_LZMA: 653 ret = lzma_alone_decoder(&strm, 654 hardware_memlimit_get( 655 MODE_DECOMPRESS)); 656 break; 657 658 # ifdef HAVE_LZIP_DECODER 659 case FORMAT_LZIP: 660 allow_trailing_input = true; 661 ret = lzma_lzip_decoder(&strm, 662 hardware_memlimit_get( 663 MODE_DECOMPRESS), flags); 664 break; 665 # endif 666 667 case FORMAT_RAW: 668 // Memory usage has already been checked in 669 // coder_set_compression_settings(). 670 ret = lzma_raw_decoder(&strm, filters); 671 break; 672 } 673 674 // Try to decode the headers. This will catch too low 675 // memory usage limit in case it happens in the first 676 // Block of the first Stream, which is where it very 677 // probably will happen if it is going to happen. 678 // 679 // This will also catch unsupported check type which 680 // we treat as a warning only. If there are empty 681 // concatenated Streams with unsupported check type then 682 // the message can be shown more than once here. The loop 683 // is used in case there is first a warning about 684 // unsupported check type and then the first Block 685 // would exceed the memlimit. 686 if (ret == LZMA_OK && init_format != FORMAT_RAW) { 687 strm.next_out = NULL; 688 strm.avail_out = 0; 689 while ((ret = lzma_code(&strm, LZMA_RUN)) 690 == LZMA_UNSUPPORTED_CHECK) 691 message_warning("%s: %s", pair->src_name, 692 message_strm(ret)); 693 694 // With --single-stream lzma_code won't wait for 695 // LZMA_FINISH and thus it can return LZMA_STREAM_END 696 // if the file has no uncompressed data inside. 697 // So treat LZMA_STREAM_END as LZMA_OK here. 698 // When lzma_code() is called again in coder_normal() 699 // it will return LZMA_STREAM_END again. 
700 if (ret == LZMA_STREAM_END) 701 ret = LZMA_OK; 702 } 703 #endif 704 } 705 706 if (ret != LZMA_OK) { 707 message_error("%s: %s", pair->src_name, message_strm(ret)); 708 if (ret == LZMA_MEMLIMIT_ERROR) 709 message_mem_needed(V_ERROR, lzma_memusage(&strm)); 710 711 return CODER_INIT_ERROR; 712 } 713 714 return CODER_INIT_NORMAL; 715 } 716 717 718 /// Resolve conflicts between opt_block_size and opt_block_list in single 719 /// threaded mode. We want to default to opt_block_list, except when it is 720 /// larger than opt_block_size. If this is the case for the current Block 721 /// at *list_pos, then we break into smaller Blocks. Otherwise advance 722 /// to the next Block in opt_block_list, and break apart if needed. 723 static void 724 split_block(uint64_t *block_remaining, 725 uint64_t *next_block_remaining, 726 size_t *list_pos) 727 { 728 if (*next_block_remaining > 0) { 729 // The Block at *list_pos has previously been split up. 730 assert(!hardware_threads_is_mt()); 731 assert(opt_block_size > 0); 732 assert(opt_block_list != NULL); 733 734 if (*next_block_remaining > opt_block_size) { 735 // We have to split the current Block at *list_pos 736 // into another opt_block_size length Block. 737 *block_remaining = opt_block_size; 738 } else { 739 // This is the last remaining split Block for the 740 // Block at *list_pos. 741 *block_remaining = *next_block_remaining; 742 } 743 744 *next_block_remaining -= *block_remaining; 745 746 } else { 747 // The Block at *list_pos has been finished. Go to the next 748 // entry in the list. If the end of the list has been reached, 749 // reuse the size of the last Block. 750 if (opt_block_list[*list_pos + 1] != 0) 751 ++*list_pos; 752 753 *block_remaining = opt_block_list[*list_pos]; 754 755 // If in single-threaded mode, split up the Block if needed. 756 // This is not needed in multi-threaded mode because liblzma 757 // will do this due to how threaded encoding works. 
758 if (!hardware_threads_is_mt() && opt_block_size > 0 759 && *block_remaining > opt_block_size) { 760 *next_block_remaining 761 = *block_remaining - opt_block_size; 762 *block_remaining = opt_block_size; 763 } 764 } 765 } 766 767 768 static bool 769 coder_write_output(file_pair *pair) 770 { 771 if (opt_mode != MODE_TEST) { 772 if (io_write(pair, &out_buf, IO_BUFFER_SIZE - strm.avail_out)) 773 return true; 774 } 775 776 strm.next_out = out_buf.u8; 777 strm.avail_out = IO_BUFFER_SIZE; 778 return false; 779 } 780 781 782 /// Compress or decompress using liblzma. 783 static bool 784 coder_normal(file_pair *pair) 785 { 786 // Encoder needs to know when we have given all the input to it. 787 // The decoders need to know it too when we are using 788 // LZMA_CONCATENATED. We need to check for src_eof here, because 789 // the first input chunk has been already read if decompressing, 790 // and that may have been the only chunk we will read. 791 lzma_action action = pair->src_eof ? LZMA_FINISH : LZMA_RUN; 792 793 lzma_ret ret; 794 795 // Assume that something goes wrong. 796 bool success = false; 797 798 // block_remaining indicates how many input bytes to encode before 799 // finishing the current .xz Block. The Block size is set with 800 // --block-size=SIZE and --block-list. They have an effect only when 801 // compressing to the .xz format. If block_remaining == UINT64_MAX, 802 // only a single block is created. 803 uint64_t block_remaining = UINT64_MAX; 804 805 // next_block_remaining for when we are in single-threaded mode and 806 // the Block in --block-list is larger than the --block-size=SIZE. 807 uint64_t next_block_remaining = 0; 808 809 // Position in opt_block_list. Unused if --block-list wasn't used. 810 size_t list_pos = 0; 811 812 // Handle --block-size for single-threaded mode and the first step 813 // of --block-list. 
814 if (opt_mode == MODE_COMPRESS && opt_format == FORMAT_XZ) { 815 // --block-size doesn't do anything here in threaded mode, 816 // because the threaded encoder will take care of splitting 817 // to fixed-sized Blocks. 818 if (!hardware_threads_is_mt() && opt_block_size > 0) 819 block_remaining = opt_block_size; 820 821 // If --block-list was used, start with the first size. 822 // 823 // For threaded case, --block-size specifies how big Blocks 824 // the encoder needs to be prepared to create at maximum 825 // and --block-list will simultaneously cause new Blocks 826 // to be started at specified intervals. To keep things 827 // logical, the same is done in single-threaded mode. The 828 // output is still not identical because in single-threaded 829 // mode the size info isn't written into Block Headers. 830 if (opt_block_list != NULL) { 831 if (block_remaining < opt_block_list[list_pos]) { 832 assert(!hardware_threads_is_mt()); 833 next_block_remaining = opt_block_list[list_pos] 834 - block_remaining; 835 } else { 836 block_remaining = opt_block_list[list_pos]; 837 } 838 } 839 } 840 841 strm.next_out = out_buf.u8; 842 strm.avail_out = IO_BUFFER_SIZE; 843 844 while (!user_abort) { 845 // Fill the input buffer if it is empty and we aren't 846 // flushing or finishing. 847 if (strm.avail_in == 0 && action == LZMA_RUN) { 848 strm.next_in = in_buf.u8; 849 strm.avail_in = io_read(pair, &in_buf, 850 my_min(block_remaining, 851 IO_BUFFER_SIZE)); 852 853 if (strm.avail_in == SIZE_MAX) 854 break; 855 856 if (pair->src_eof) { 857 action = LZMA_FINISH; 858 859 } else if (block_remaining != UINT64_MAX) { 860 // Start a new Block after every 861 // opt_block_size bytes of input. 862 block_remaining -= strm.avail_in; 863 if (block_remaining == 0) 864 action = LZMA_FULL_BARRIER; 865 } 866 867 if (action == LZMA_RUN && pair->flush_needed) 868 action = LZMA_SYNC_FLUSH; 869 } 870 871 // Let liblzma do the actual work. 
872 ret = lzma_code(&strm, action); 873 874 // Write out if the output buffer became full. 875 if (strm.avail_out == 0) { 876 if (coder_write_output(pair)) 877 break; 878 } 879 880 if (ret == LZMA_STREAM_END && (action == LZMA_SYNC_FLUSH 881 || action == LZMA_FULL_BARRIER)) { 882 if (action == LZMA_SYNC_FLUSH) { 883 // Flushing completed. Write the pending data 884 // out immediately so that the reading side 885 // can decompress everything compressed so far. 886 if (coder_write_output(pair)) 887 break; 888 889 // Mark that we haven't seen any new input 890 // since the previous flush. 891 pair->src_has_seen_input = false; 892 pair->flush_needed = false; 893 } else { 894 // Start a new Block after LZMA_FULL_BARRIER. 895 if (opt_block_list == NULL) { 896 assert(!hardware_threads_is_mt()); 897 assert(opt_block_size > 0); 898 block_remaining = opt_block_size; 899 } else { 900 split_block(&block_remaining, 901 &next_block_remaining, 902 &list_pos); 903 } 904 } 905 906 // Start a new Block after LZMA_FULL_FLUSH or continue 907 // the same block after LZMA_SYNC_FLUSH. 908 action = LZMA_RUN; 909 910 } else if (ret != LZMA_OK) { 911 // Determine if the return value indicates that we 912 // won't continue coding. LZMA_NO_CHECK would be 913 // here too if LZMA_TELL_ANY_CHECK was used. 914 const bool stop = ret != LZMA_UNSUPPORTED_CHECK; 915 916 if (stop) { 917 // Write the remaining bytes even if something 918 // went wrong, because that way the user gets 919 // as much data as possible, which can be good 920 // when trying to get at least some useful 921 // data out of damaged files. 922 if (coder_write_output(pair)) 923 break; 924 } 925 926 if (ret == LZMA_STREAM_END) { 927 if (allow_trailing_input) { 928 io_fix_src_pos(pair, strm.avail_in); 929 success = true; 930 break; 931 } 932 933 // Check that there is no trailing garbage. 934 // This is needed for LZMA_Alone and raw 935 // streams. 
This is *not* done with .lz files 936 // as that format specifically requires 937 // allowing trailing garbage. 938 if (strm.avail_in == 0 && !pair->src_eof) { 939 // Try reading one more byte. 940 // Hopefully we don't get any more 941 // input, and thus pair->src_eof 942 // becomes true. 943 strm.avail_in = io_read( 944 pair, &in_buf, 1); 945 if (strm.avail_in == SIZE_MAX) 946 break; 947 948 assert(strm.avail_in == 0 949 || strm.avail_in == 1); 950 } 951 952 if (strm.avail_in == 0) { 953 assert(pair->src_eof); 954 success = true; 955 break; 956 } 957 958 // We hadn't reached the end of the file. 959 ret = LZMA_DATA_ERROR; 960 assert(stop); 961 } 962 963 // If we get here and stop is true, something went 964 // wrong and we print an error. Otherwise it's just 965 // a warning and coding can continue. 966 if (stop) { 967 message_error("%s: %s", pair->src_name, 968 message_strm(ret)); 969 } else { 970 message_warning("%s: %s", pair->src_name, 971 message_strm(ret)); 972 973 // When compressing, all possible errors set 974 // stop to true. 975 assert(opt_mode != MODE_COMPRESS); 976 } 977 978 if (ret == LZMA_MEMLIMIT_ERROR) { 979 // Display how much memory it would have 980 // actually needed. 981 message_mem_needed(V_ERROR, 982 lzma_memusage(&strm)); 983 } 984 985 if (stop) 986 break; 987 } 988 989 // Show progress information under certain conditions. 990 message_progress_update(); 991 } 992 993 return success; 994 } 995 996 997 /// Copy from input file to output file without processing the data in any 998 /// way. This is used only when trying to decompress unrecognized files 999 /// with --decompress --stdout --force, so the output is always stdout. 
1000 static bool 1001 coder_passthru(file_pair *pair) 1002 { 1003 while (strm.avail_in != 0) { 1004 if (user_abort) 1005 return false; 1006 1007 if (io_write(pair, &in_buf, strm.avail_in)) 1008 return false; 1009 1010 strm.total_in += strm.avail_in; 1011 strm.total_out = strm.total_in; 1012 message_progress_update(); 1013 1014 strm.avail_in = io_read(pair, &in_buf, IO_BUFFER_SIZE); 1015 if (strm.avail_in == SIZE_MAX) 1016 return false; 1017 } 1018 1019 return true; 1020 } 1021 1022 1023 extern void 1024 coder_run(const char *filename) 1025 { 1026 // Set and possibly print the filename for the progress message. 1027 message_filename(filename); 1028 1029 // Try to open the input file. 1030 file_pair *pair = io_open_src(filename); 1031 if (pair == NULL) 1032 return; 1033 1034 // Assume that something goes wrong. 1035 bool success = false; 1036 1037 if (opt_mode == MODE_COMPRESS) { 1038 strm.next_in = NULL; 1039 strm.avail_in = 0; 1040 } else { 1041 // Read the first chunk of input data. This is needed 1042 // to detect the input file type. 1043 strm.next_in = in_buf.u8; 1044 strm.avail_in = io_read(pair, &in_buf, IO_BUFFER_SIZE); 1045 } 1046 1047 if (strm.avail_in != SIZE_MAX) { 1048 // Initialize the coder. This will detect the file format 1049 // and, in decompression or testing mode, check the memory 1050 // usage of the first Block too. This way we don't try to 1051 // open the destination file if we see that coding wouldn't 1052 // work at all anyway. This also avoids deleting the old 1053 // "target" file if --force was used. 1054 const enum coder_init_ret init_ret = coder_init(pair); 1055 1056 if (init_ret != CODER_INIT_ERROR && !user_abort) { 1057 // Don't open the destination file when --test 1058 // is used. 1059 if (opt_mode == MODE_TEST || !io_open_dest(pair)) { 1060 // Remember the current time. It is needed 1061 // for progress indicator. 1062 mytime_set_start_time(); 1063 1064 // Initialize the progress indicator. 
1065 // 1066 // NOTE: When reading from stdin, fstat() 1067 // isn't called on it and thus src_st.st_size 1068 // is zero. If stdin pointed to a regular 1069 // file, it would still be possible to know 1070 // the file size but then we would also need 1071 // to take into account the current reading 1072 // position since with stdin it isn't 1073 // necessarily at the beginning of the file. 1074 const bool is_passthru = init_ret 1075 == CODER_INIT_PASSTHRU; 1076 const uint64_t in_size 1077 = pair->src_st.st_size <= 0 1078 ? 0 : (uint64_t)(pair->src_st.st_size); 1079 message_progress_start(&strm, 1080 is_passthru, in_size); 1081 1082 // Do the actual coding or passthru. 1083 if (is_passthru) 1084 success = coder_passthru(pair); 1085 else 1086 success = coder_normal(pair); 1087 1088 message_progress_end(success); 1089 } 1090 } 1091 } 1092 1093 // Close the file pair. It needs to know if coding was successful to 1094 // know if the source or target file should be unlinked. 1095 io_close(pair, success); 1096 1097 return; 1098 } 1099 1100 1101 #ifndef NDEBUG 1102 extern void 1103 coder_free(void) 1104 { 1105 lzma_end(&strm); 1106 return; 1107 } 1108 #endif 1109