1-- There are three LZ77 encoders at choice here: 2-- 3-- 1/ LZ77_using_LZHuf, based on LZHuf 4-- 5-- 2/ LZ77_using_IZ, based on Info-Zip's Zip's deflate.c which is 6-- actually the LZ77 part of Zip's compression. 7-- 8-- 3/ LZ77_using_BT4, based on LZMA SDK's BT4 algorithm. 9-- 10-- Variant 1/, LZ77_using_LZHuf, is working since 2009. Two problems: it is slow 11-- and not well adapted to the Deflate format (mediocre compression). 12-- 13-- Variant 2/, LZ77_using_IZ, is much faster, and better suited for Deflate. 14-- It has been added on 05-Mar-2016. 15-- The code is tailored and optimized for a single set of 16-- the String_buffer_size, Look_Ahead, Threshold LZ77 parameters - those for Deflate. 17-- 18-- Variant 3/, LZ77_using_BT4, was added on 06-Sep-2016. 19-- The seems to be the best match finder for LZMA on data of the >= 1 MB scale. 20 21-- To do: 22-- 23-- 2/ 24-- - LZ77 / IZ: similar to the test with TOO_FAR, try to cluster distances around 25-- values needing less extra bits (may not work at all...) 26-- - LZ77 / IZ: tune TOO_FAR (max: 32767), see http://optipng.sf.net/pngtech/too_far.html 27-- "TOO_FAR in zlib Is Not Too Far" for discussion 28 29-- Legal licensing note: 30 31-- Copyright (c) 2016 .. 2018 Gautier de Montmollin (maintainer of the Ada version) 32-- SWITZERLAND 33 34-- Permission is hereby granted, free of charge, to any person obtaining a copy 35-- of this software and associated documentation files (the "Software"), to deal 36-- in the Software without restriction, including without limitation the rights 37-- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 38-- copies of the Software, and to permit persons to whom the Software is 39-- furnished to do so, subject to the following conditions: 40 41-- The above copyright notice and this permission notice shall be included in 42-- all copies or substantial portions of the Software. 43 44-- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 45-- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 46-- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 47-- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 48-- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 49-- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 50-- THE SOFTWARE. 51 52-- NB: this is the MIT License, as found 21-Aug-2016 on the site 53-- http://www.opensource.org/licenses/mit-license.php 54 55with Ada.Unchecked_Deallocation; 56with Interfaces; use Interfaces; 57with System; 58 59package body LZ77 is 60 61 -- System.Word_Size: 13.3(8): A word is the largest amount of storage 62 -- that can be conveniently and efficiently manipulated by the hardware, 63 -- given the implementation's run-time model. 64 -- 65 min_bits_32: constant:= Integer'Max(32, System.Word_Size); 66 min_bits_16: constant:= Integer'Max(16, System.Word_Size); 67 68 -- We define an Integer type which is at least 32 bits, but n bits 69 -- on a native n (> 32) bits architecture (no performance hit on 64+ 70 -- bits architectures). 71 -- Integer_M16 not needed: Integer already guarantees 16 bits 72 -- 73 type Integer_M32 is range -2**(min_bits_32-1) .. 2**(min_bits_32-1) - 1; 74 subtype Natural_M32 is Integer_M32 range 0..Integer_M32'Last; 75 76 type Unsigned_M16 is mod 2**min_bits_16; 77 type Unsigned_M32 is mod 2**min_bits_32; 78 79 procedure Encode is 80 81 ----------------------- 82 -- LZHuf algorithm -- 83 ----------------------- 84 85 procedure LZ77_using_LZHuf is 86 -- Based on LZHUF by OKUMURA & YOSHIZAKI. 87 -- Here the adaptive Huffman coding is thrown away: 88 -- algorithm is used only to find matching patterns. 89 90 N_Char : constant Integer:= 256-Threshold+Look_Ahead; 91 -- Character code (= 0..N_CHAR-1) 92 Max_Table : constant Integer:= N_Char*2-1; 93 94 type Text_Buffer is array ( 0..String_buffer_size+Look_Ahead-1 ) of Byte; 95 empty_buffer: constant Text_Buffer:= (others=> 32); -- ' ' 96 97 -- > The Huffman frequency handling is made generic so we have 98 -- one copy of the tree and of the frequency table for Encode 99 -- and one for Decode 100 101 generic 102 package Huffman is 103 --- Pointing parent nodes. 104 --- Area [Max_Table..(Max_Table + N_CHAR - 1)] are pointers for leaves 105 Parent: array ( 0..Max_Table+N_Char-1 ) of Natural; 106 --- Pointing children nodes (son[], son[] + 1) 107 Son : array ( 0..Max_Table-1 ) of Natural; 108 109 Root_Position : constant Integer:= Max_Table-1; -- (can be always Son'last ?) 110 111 procedure Start; 112 procedure Update_Freq_Tree( C0: Natural ); 113 end Huffman; 114 115 package body Huffman is 116 117 Freq: array ( 0..Max_Table ) of Natural; -- Cumulative freq table 118 119 Max_Freq: constant := 16#8000#; 120 -- ^-- update when cumulative frequency reaches to this value 121 122 procedure Start is 123 I: Natural; 124 begin 125 for J in 0 .. N_Char-1 loop 126 Freq(J):= 1; 127 Son (J):= J + Max_Table; 128 Parent(J + Max_Table):= J; 129 end loop; 130 131 I:= 0; 132 for J in N_Char .. Root_Position loop 133 Freq(J):= Freq(I)+Freq(I+1); 134 Son (J):= I; 135 Parent(I):= J; 136 Parent(I+1):= J; 137 I:= I + 2; 138 end loop; 139 140 Freq( Freq'Last ):= 16#FFFF#; -- ( Max_Table ) 141 Parent( Root_Position ):= 0; 142 end Start; 143 144 procedure Update_Freq_Tree( C0: Natural ) is 145 146 procedure Reconstruct_Freq_Tree is 147 I,J,K,F,L: Natural; 148 begin 149 -- Halven cumulative freq for leaf nodes 150 J:= 0; 151 for I in 0 .. Root_Position loop 152 if Son(I) >= Max_Table then 153 Freq(J):= (Freq(I)+1) / 2; 154 Son (J):= Son(I); 155 J:= J + 1; 156 end if; 157 end loop; 158 159 -- Make a tree : first, connect children nodes 160 I:= 0; 161 for J in N_Char .. Root_Position loop -- J : free nodes 162 K:= I+1; 163 F:= Freq(I) + Freq(K); -- new frequency 164 Freq(J):= F; 165 K:= J-1; 166 while F < Freq(K) loop 167 K:= K-1; 168 end loop; 169 170 K:= K+1; 171 L:= J-K; -- 2007: fix: was L:= (J-K)*2, memcopy parameter remain 172 173 Freq( K+1 .. K+L ):= Freq( K .. K+L-1 ); -- shift by one cell right 174 Freq(K):= F; 175 Son ( K+1 .. K+L ):= Son ( K .. K+L-1 ); -- shift by one cell right 176 Son (K):= I; 177 I:= I + 2; 178 end loop; 179 180 -- Connect parent nodes 181 for I in 0 .. Max_Table-1 loop 182 K:= Son(I); 183 Parent(K):= I; 184 if K < Max_Table then 185 Parent(K+1):= I; 186 end if; 187 end loop; 188 189 end Reconstruct_Freq_Tree; 190 191 C,I,J,K,L: Natural; 192 193 begin -- Update_Freq_Tree; 194 if Freq( Root_Position ) = Max_Freq then 195 Reconstruct_Freq_Tree; 196 end if; 197 C:= Parent(C0 + Max_Table); 198 loop 199 Freq(C):= Freq(C) + 1; 200 K:= Freq(C); 201 -- Swap nodes to keep the tree freq-ordered 202 L:= C+1; 203 if K > Freq(L) then 204 while K > Freq(L+1) loop 205 L:= L + 1; 206 end loop; 207 208 Freq(C):= Freq(L); 209 Freq(L):= K; 210 211 I:= Son(C); 212 Parent(I):= L; 213 if I < Max_Table then 214 Parent(I+1):= L; 215 end if; 216 217 J:= Son(L); 218 Son(L):= I; 219 220 Parent(J):= C; 221 if J < Max_Table then 222 Parent(J+1):= C; 223 end if; 224 Son(C):= J; 225 226 C := L; 227 end if; 228 C:= Parent(C); 229 exit when C=0; 230 end loop; -- do it until reaching the root 231 end Update_Freq_Tree; 232 233 end Huffman; 234 235 Node_Nil : constant Integer:= String_buffer_size; -- End of tree's node 236 237 Lson,Dad: array ( 0..String_buffer_size ) of Natural; 238 Rson: array ( 0..String_buffer_size + 256 ) of Natural; 239 240 procedure Init_Tree is 241 begin 242 for I in String_buffer_size+1 .. Rson'Last loop 243 Rson(I) := Node_Nil; 244 end loop; -- root 245 for I in 0 .. String_buffer_size-1 loop 246 Dad(I) := Node_Nil; 247 end loop; -- node 248 end Init_Tree; 249 250 Match_Position : Natural; 251 Match_Length : Natural; 252 253 Text_Buf: Text_Buffer:= empty_buffer; 254 255 procedure Insert_Node (R: Integer) is 256 I,P: Integer; 257 Geq: Boolean:= True; 258 C: Natural; 259 begin 260 P:= String_buffer_size + 1 + Integer(Text_Buf(R)); 261 Rson(R):= Node_Nil; 262 Lson(R):= Node_Nil; 263 Match_Length := 0; 264 loop 265 if Geq then 266 if Rson(P) = Node_Nil then 267 Rson(P):= R; 268 Dad(R) := P; 269 return; 270 end if; 271 P:= Rson(P); 272 else 273 if Lson(P) = Node_Nil then 274 Lson(P):= R; 275 Dad(R) := P; 276 return; 277 end if; 278 P:= Lson(P); 279 end if; 280 I:= 1; 281 while I < Look_Ahead and then Text_Buf(R+I) = Text_Buf(P+I) loop 282 I:= I + 1; 283 end loop; 284 285 Geq:= Text_Buf(R+I) >= Text_Buf(P+I) or I = Look_Ahead; 286 287 if I > Threshold then 288 if I > Match_Length then 289 Match_Position := (R-P) mod String_buffer_size - 1; 290 Match_Length:= I; 291 exit when Match_Length >= Look_Ahead; 292 end if; 293 if I = Match_Length then 294 C:= (R-P) mod String_buffer_size - 1; 295 if C < Match_Position then 296 Match_Position:= C; 297 end if; 298 end if; 299 end if; 300 end loop; 301 302 Dad (R):= Dad (P); 303 Lson(R):= Lson(P); 304 Rson(R):= Rson(P); 305 Dad(Lson(P)):= R; 306 Dad(Rson(P)):= R; 307 if Rson(Dad(P)) = P then 308 Rson(Dad(P)):= R; 309 else 310 Lson(Dad(P)):= R; 311 end if; 312 Dad(P):= Node_Nil; -- remove P 313 end Insert_Node; 314 315 procedure Delete_Node (P: Natural) is 316 Q: Natural; 317 begin 318 if Dad(P) = Node_Nil then -- unregistered 319 return; 320 end if; 321 if Rson(P) = Node_Nil then 322 Q:= Lson(P); 323 elsif Lson(P) = Node_Nil then 324 Q:= Rson(P); 325 else 326 Q:= Lson(P); 327 if Rson(Q) /= Node_Nil then 328 loop 329 Q:= Rson(Q); 330 exit when Rson(Q) = Node_Nil; 331 end loop; 332 Rson(Dad(Q)):= Lson(Q); 333 Dad(Lson(Q)):= Dad(Q); 334 Lson(Q):= Lson(P); 335 Dad(Lson(P)):= Q; 336 end if; 337 Rson(Q):= Rson(P); 338 Dad(Rson(P)):= Q; 339 end if; 340 Dad(Q):= Dad(P); 341 if Rson(Dad(P))=P then 342 Rson(Dad(P)):= Q; 343 else 344 Lson(Dad(P)):= Q; 345 end if; 346 Dad(P):= Node_Nil; 347 end Delete_Node; 348 349 package Huffman_E is new Huffman; 350 351 I,R,S,Last_Match_Length: Natural; 352 Len: Integer; 353 C: Byte; 354 begin 355 if not More_bytes then 356 return; 357 end if; 358 Huffman_E.Start; 359 Init_Tree; 360 S:= 0; 361 R:= String_buffer_size - Look_Ahead; 362 Len:= 0; 363 while Len < Look_Ahead and More_bytes loop 364 Text_Buf(R+Len):= Read_byte; 365 Len:= Len + 1; 366 end loop; 367 368 -- Seems: fill dictionary with default value 369 -- 370 -- for I in 1.. Look_Ahead loop 371 -- Insert_Node(R - I); 372 -- end loop; 373 374 Insert_Node(R); 375 376 loop 377 if Match_Length > Len then 378 Match_Length:= Len; 379 end if; 380 if Match_Length <= Threshold then 381 Match_Length := 1; 382 Huffman_E.Update_Freq_Tree( Natural(Text_Buf(R)) ); 383 Write_literal( Text_Buf(R) ); 384 else 385 Write_DL_code(Match_Position+1, Match_Length); 386 end if; 387 Last_Match_Length := Match_Length; 388 I:= 0; 389 while I < Last_Match_Length and More_bytes loop 390 I:= I + 1; 391 Delete_Node(S); 392 C:= Read_byte; 393 Text_Buf(S):= C; 394 if S < Look_Ahead-1 then 395 Text_Buf(S+String_buffer_size):= C; 396 end if; 397 S:= (S+1) mod String_buffer_size; 398 R:= (R+1) mod String_buffer_size; 399 Insert_Node(R); 400 end loop; 401 402 while I < Last_Match_Length loop 403 I:= I + 1; 404 Delete_Node(S); 405 S := (S+1) mod String_buffer_size; 406 R := (R+1) mod String_buffer_size; 407 Len:= Len - 1; 408 if Len > 0 then 409 Insert_Node(R); 410 end if; 411 end loop; 412 413 exit when Len=0; 414 end loop; 415 end LZ77_using_LZHuf; 416 417 -------------------------- 418 -- Info-Zip algorithm -- 419 -------------------------- 420 421 -- LZ77_using_IZ: based on deflate.c by Jean-Loup Gailly. 422 -- Core description of the algorithm: 423 -- 424 -- The most straightforward technique turns out to be the fastest for 425 -- most input files: try all possible matches and select the longest. 426 -- The key feature of this algorithm is that insertions into the string 427 -- dictionary are very simple and thus fast, and deletions are avoided 428 -- completely. Insertions are performed at each input character, whereas 429 -- string matches are performed only when the previous match ends. So it 430 -- is preferable to spend more time in matches to allow very fast string 431 -- insertions and avoid deletions. The matching algorithm for small 432 -- strings is inspired from that of Rabin & Karp [1]. A brute force approach 433 -- is used to find longer strings when a small match has been found. 434 -- 435 -- The idea of lazy evaluation of matches is due to Jan-Mark Wams. 436 -- 437 -- [1] A description of the Rabin and Karp algorithm is given in the book 438 -- "Algorithms" by R. Sedgewick, Addison-Wesley, p252. 439 -- 440 -- About hashing: chapter 6.4 of The Art of Computer Programming, Volume 3, D.E. Knuth 441 -- Rabin and Karp algorithm: http://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm 442 443 -- Compression level: 0: store, 1: best speed, 9: best compression, 10: variant of level 9 444 -- Ada code: only levels 4 to 10 are supported. 445 446 procedure LZ77_using_IZ(level: Natural) is 447 HASH_BITS: constant:= 15; -- 13..15 448 HASH_SIZE: constant:= 2 ** HASH_BITS; 449 HASH_MASK: constant:= HASH_SIZE - 1; 450 WSIZE : constant Integer_M32:= Integer_M32(String_buffer_size); 451 WMASK : constant Unsigned_M16:= Unsigned_M16(WSIZE - 1); 452 -- HASH_SIZE and WSIZE must be powers of two 453 NIL : constant:= 0; -- Tail of hash chains 454 TOO_FAR : constant:= 4096; -- Matches of length 3 are discarded if their distance exceeds TOO_FAR 455 -- 456 subtype ulg is Unsigned_M32; 457 subtype unsigned is Unsigned_M16; 458 subtype ush is Unsigned_M16; 459 -- subtype long is Integer_M32; 460 -- subtype int is Integer; 461 subtype Pos is Unsigned_M32; -- must be at least 32 bits 462 -- subtype IPos is unsigned; 463 -- A Pos is an index in the character window. IPos is used only for parameter passing. 464 window: array(0 .. 2 * WSIZE - 1) of Byte; 465 -- Sliding window. Input bytes are read into the second half of the window, 466 -- and move to the first half later to keep a dictionary of at least WSIZE 467 -- bytes. With this organization, matches are limited to a distance of 468 -- WSIZE-MAX_MATCH bytes, but this ensures that IO is always 469 -- performed with a length multiple of the block size. 470 prev: array(0..unsigned(WSIZE - 1)) of Pos; 471 -- Link to older string with same hash index. 472 -- This link is maintained only for the last 32K strings. 473 -- An index in this array is thus a window index modulo 32K. 474 head: array(0..unsigned(HASH_SIZE - 1)) of Pos; 475 -- Heads of the hash chains or NIL. 476 window_size: ulg; 477 -- window size, 2*WSIZE except for MMAP or BIG_MEM, where it is the 478 -- input file length plus MIN_LOOKAHEAD. 479 sliding: Boolean; -- Set to False when the input file is already in memory [was: int] 480 ins_h: unsigned; -- hash index of string to be inserted 481 MIN_MATCH: constant Integer_M32:= Integer_M32(Threshold) + 1; -- Deflate: 3 482 MAX_MATCH: constant Integer_M32:= Integer_M32(Look_Ahead); -- Deflate: 258 483 -- Minimum amount of lookahead, except at the end of the input file. 484 MIN_LOOKAHEAD: constant Integer_M32:= MAX_MATCH + MIN_MATCH + 1; -- Deflate: 262 485 -- This LZ77 compression doesn't use the full possible distance range: 32507..32768 unused! 486 MAX_DIST : constant Integer_M32:= WSIZE - MIN_LOOKAHEAD; -- Deflate: 32506 487 H_SHIFT: constant Integer:= Integer((HASH_BITS + MIN_MATCH - 1) / MIN_MATCH); 488 -- Number of bits by which ins_h and del_h must be shifted at each 489 -- input step. It must be such that after MIN_MATCH steps, the oldest 490 -- byte no longer takes part in the hash key, that is: 491 -- H_SHIFT * MIN_MATCH >= HASH_BITS 492 prev_length: Natural_M32; -- [was: unsigned] 493 -- Length of the best match at previous step. Matches not greater than this 494 -- are discarded. This is used in the lazy match evaluation. 495 strstart : Natural_M32; -- start of string to insert [was: unsigned] 496 match_start: Natural_M32; -- start of matching string [was: unsigned] 497 eofile : Boolean; -- flag set at end of input file [was: int] 498 lookahead : Natural_M32; -- number of valid bytes ahead in window [was: unsigned] 499 max_chain_length : unsigned; 500 -- To speed up deflation, hash chains are never searched beyond this length. 501 -- A higher limit improves compression ratio but degrades the speed. 502 max_lazy_match: Natural_M32; -- [was: unsigned] 503 -- Attempt to find a better match only when the current match is strictly 504 -- smaller than this value. This mechanism is used only for compression 505 -- levels >= 4. 506 good_match: Natural_M32; -- [was: unsigned] 507 -- Use a faster search when the previous match is longer than this 508 nice_match: Integer_M32; -- Stop searching when current match exceeds this 509 -- Values for max_lazy_match, good_match, nice_match and max_chain_length, 510 -- depending on the desired pack level (0..9). The values given below have 511 -- been tuned to exclude worst case performance for pathological files. 512 -- Better values may be found for specific files. 513 type config is record 514 good_length : Natural_M32; -- reduce lazy search above this match length [was: ush] 515 max_lazy : Natural_M32; -- do not perform lazy search above this match length 516 nice_length : Integer_M32; -- quit search above this match length 517 max_chain : ush; 518 end record; 519 520 configuration_table: constant array(0..10) of config:= ( 521 -- good lazy nice chain 522 (0, 0, 0, 0), -- 0: store only 523 (4, 4, 8, 4), -- 1: maximum speed, no lazy matches 524 (4, 5, 16, 8), 525 (4, 6, 32, 32), 526 (4, 4, 16, 16), -- 4: lazy matches 527 (8, 16, 32, 32), 528 (8, 16, 128, 128), 529 (8, 32, 128, 256), 530 (32, 128, 258, 1024), 531 (32, 258, 258, 4096), -- 9: maximum compression 532 (34, 258, 258, 4096)); -- "secret" variant of level 9 533 534 -- Update a hash value with the given input byte 535 -- IN assertion: all calls to to UPDATE_HASH are made with consecutive 536 -- input characters, so that a running hash key can be computed from the 537 -- previous key instead of complete recalculation each time. 538 539 procedure UPDATE_HASH(h: in out unsigned; c: Byte) is 540 pragma Inline(UPDATE_HASH); 541 begin 542 h := (unsigned(Shift_Left(Unsigned_32(h), H_SHIFT)) xor unsigned(c)) and HASH_MASK; 543 end UPDATE_HASH; 544 545 -- Insert string starting at s in the dictionary and set match_head to the previous head 546 -- of the hash chain (the most recent string with same hash key). Return 547 -- the previous length of the hash chain. 548 -- IN assertion: all calls to to INSERT_STRING are made with consecutive 549 -- input characters and the first MIN_MATCH bytes of s are valid 550 -- (except for the last MIN_MATCH-1 bytes of the input file). 551 552 procedure INSERT_STRING(s: Integer_M32; match_head: out Natural_M32) is 553 pragma Inline(INSERT_STRING); 554 begin 555 UPDATE_HASH(ins_h, window(s + MIN_MATCH - 1)); 556 match_head := Natural_M32(head(ins_h)); 557 prev(unsigned(s) and WMASK):= Pos(match_head); 558 head(ins_h) := Pos(s); 559 end INSERT_STRING; 560 561 procedure Read_buf(from: Integer_M32; amount: unsigned; actual: out Integer_M32) is 562 need: unsigned:= amount; 563 begin 564 -- put_line("Read buffer: from:" & from'img & "; amount:" & amount'img); 565 actual:= 0; 566 while need > 0 and then More_bytes loop 567 window(from + actual):= Read_byte; 568 actual:= actual + 1; 569 need:= need - 1; 570 end loop; 571 -- put_line("Read buffer: actual:" & actual'img); 572 end Read_buf; 573 574 -- Fill the window when the lookahead becomes insufficient. 575 -- Updates strstart and lookahead, and sets eofile if end of input file. 576 -- 577 -- IN assertion: lookahead < MIN_LOOKAHEAD && strstart + lookahead > 0 578 -- OUT assertions: strstart <= window_size-MIN_LOOKAHEAD 579 -- At least one byte has been read, or eofile is set; file reads are 580 -- performed for at least two bytes (required for the translate_eol option). 581 582 procedure Fill_window is 583 more: unsigned; 584 m: Pos; 585 n: Natural_M32; 586 begin 587 loop 588 more:= unsigned(window_size - ulg(lookahead) - ulg(strstart)); 589 if False then -- C: "if (more == (unsigned)EOF) {" ?... GdM: seems a 16-bit code for EOF 590 -- Very unlikely, but possible on 16 bit machine if strstart == 0 591 -- and lookahead == 1 (input done one byte at time) 592 more:= more - 1; 593 elsif strstart >= WSIZE + MAX_DIST and then sliding then 594 -- By the IN assertion, the window is not empty so we can't confuse 595 -- more == 0 with more == 64K on a 16 bit machine. 596 window(0 .. WSIZE - 1):= window(WSIZE .. 2 * WSIZE - 1); 597 -- GdM: in rare cases (e.g. level 9 on test file "enwik8"), match_start happens 598 -- to be < WSIZE. We do as in the original 16-bit C code: mod 2**16, such that the 599 -- index is the window's range. 600 -- This assumes WSIZE = 2**15, which is checked at startup of LZ77_using_IZ. 601 -- Very likely, match_start is garbage anyway - see http://sf.net/p/infozip/bugs/49/ 602 match_start := Natural_M32( Unsigned_16(match_start) - Unsigned_16(WSIZE mod (2**16)) ); 603 strstart := strstart - WSIZE; -- we now have strstart >= MAX_DIST: 604 for nn in 0 .. unsigned'(HASH_SIZE - 1) loop 605 m := head(nn); 606 if m >= Pos(WSIZE) then 607 head(nn) := m - Pos(WSIZE); 608 else 609 head(nn) := NIL; 610 end if; 611 end loop; 612 -- 613 for nn in 0 .. unsigned(WSIZE - 1) loop 614 m := prev(nn); 615 if m >= Pos(WSIZE) then 616 prev(nn) := m - Pos(WSIZE); 617 else 618 prev(nn) := NIL; 619 end if; 620 -- If n is not on any hash chain, prev[n] is garbage but its value will never be used. 621 end loop; 622 more:= more + unsigned(WSIZE); 623 end if; 624 exit when eofile; 625 -- If there was no sliding: 626 -- strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 && 627 -- more == window_size - lookahead - strstart 628 -- => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1) 629 -- => more >= window_size - 2*WSIZE + 2 630 -- In the MMAP or BIG_MEM case (not yet supported in gzip), 631 -- window_size == input_size + MIN_LOOKAHEAD && 632 -- strstart + lookahead <= input_size => more >= MIN_LOOKAHEAD. 633 -- Otherwise, window_size == 2*WSIZE so more >= 2. 634 -- If there was sliding, more >= WSIZE. So in all cases, more >= 2. 635 -- 636 -- Assert(more >= 2, "more < 2"); 637 -- 638 Read_buf(strstart + lookahead, more, n); 639 if n = 0 then 640 eofile := True; 641 else 642 lookahead := lookahead + n; 643 end if; 644 exit when lookahead >= MIN_LOOKAHEAD or eofile; 645 end loop; 646 -- put_line("Fill done - eofile = " & eofile'img); 647 end Fill_window; 648 649 -- Initialize the "longest match" routines for a new file 650 -- 651 -- IN assertion: window_size is > 0 if the input file is already read or 652 -- mapped in the window array, 0 otherwise. In the first case, 653 -- window_size is sufficient to contain the whole input file plus 654 -- MIN_LOOKAHEAD bytes (to avoid referencing memory beyond the end 655 -- of window when looking for matches towards the end). 656 657 procedure LM_Init (pack_level: Natural) is 658 begin 659 -- Do not slide the window if the whole input is already in memory (window_size > 0) 660 sliding := False; 661 if window_size = 0 then 662 sliding := True; 663 window_size := 2 * ulg(WSIZE); 664 end if; 665 -- Initialize the hash table. 666 -- prev will be initialized on the fly. 667 head:= (others => NIL); 668 -- Set the default configuration parameters: 669 max_lazy_match := configuration_table(pack_level).max_lazy; 670 good_match := configuration_table(pack_level).good_length; 671 nice_match := configuration_table(pack_level).nice_length; 672 max_chain_length := configuration_table(pack_level).max_chain; 673 -- Info-Zip comment: ??? reduce max_chain_length for binary files 674 strstart := 0; 675 Read_buf(0, unsigned(WSIZE), lookahead); 676 if lookahead = 0 then 677 eofile := True; 678 return; 679 end if; 680 eofile := False; 681 -- Make sure that we always have enough lookahead. This is important 682 -- if input comes from a device such as a tty. 683 if lookahead < MIN_LOOKAHEAD then 684 Fill_window; 685 end if; 686 ins_h := 0; 687 for j in 0 .. Natural_M32(MIN_MATCH)-2 loop 688 UPDATE_HASH(ins_h, window(j)); 689 end loop; 690 -- If lookahead < MIN_MATCH, ins_h is garbage, but this is 691 -- not important since only literal bytes will be emitted. 692 end LM_Init; 693 694 -- Set match_start to the longest match starting at the given string and 695 -- return its length. Matches shorter or equal to prev_length are discarded, 696 -- in which case the result is equal to prev_length and match_start is 697 -- garbage. 698 -- IN assertions: current_match is the head of the hash chain for the current 699 -- string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1 700 701 procedure Longest_Match(current_match: in out Integer_M32; longest: out Integer_M32) is 702 chain_length : unsigned := max_chain_length; -- max hash chain length 703 scan : Integer_M32 := strstart; -- current string 704 match : Integer_M32; -- matched string 705 len : Integer_M32; -- length of current match 706 best_len : Integer_M32 := prev_length; -- best match length so far 707 limit : Natural_M32; -- [was: IPos] 708 strend : constant Integer_M32:= strstart + MAX_MATCH; 709 scan_end : Integer_M32:= scan + best_len; 710 begin 711 -- Stop when current_match becomes <= limit. To simplify the code, 712 -- we prevent matches with the string of window index 0. 713 if strstart > MAX_DIST then 714 limit:= strstart - MAX_DIST; 715 else 716 limit:= NIL; 717 end if; 718 -- Do not waste too much time if we already have a good match: 719 if prev_length >= good_match then 720 chain_length := chain_length / 4; 721 end if; 722 -- Assert(strstart <= window_size-MIN_LOOKAHEAD, "insufficient lookahead"); 723 loop 724 -- Assert(current_match < strstart, "no future"); 725 match := current_match; 726 -- Skip to next match if the match length cannot increase 727 -- or if the match length is less than 2: 728 -- 729 -- NB: this is the Not-UNALIGNED_OK variant in the C code. 730 -- Translation of the UNALIGNED_OK variant is left as an exercise ;-). 731 -- (!! worth a try: GNAT optimizes window(match..match+1[3]) to 16[32] bit) 732 -- 733 if window(match + best_len) /= window(scan_end) or else 734 window(match + best_len - 1) /= window(scan_end - 1) or else 735 window(match) /= window(scan) or else 736 window(match + 1) /= window(scan + 1) 737 then 738 match:= match + 1; -- C: continue 739 else 740 -- The check at best_len - 1 can be removed because it will be made 741 -- again later. (This heuristic is not always a win.) 742 -- 743 -- It is not necessary to compare window(scan + 2) and window(match + 2) since they 744 -- are always equal when the other bytes match, given that 745 -- the hash keys are equal and that HASH_BITS >= 8. 746 scan:= scan + 2; 747 match:= match + 2; 748 -- C: The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16. 749 -- It is easy to get rid of this optimization if necessary. 750 -- Ada: see the "else" part below. 751 if MAX_MATCH = 258 then 752 -- We check for insufficient lookahead only every 8th comparison; 753 -- the 256th check will be made at strstart + 258. 754 loop 755 scan:= scan + 1; 756 match:= match + 1; 757 exit when window(scan) /= window(match); 758 scan:= scan + 1; 759 match:= match + 1; 760 exit when window(scan) /= window(match); 761 scan:= scan + 1; 762 match:= match + 1; 763 exit when window(scan) /= window(match); 764 scan:= scan + 1; 765 match:= match + 1; 766 exit when window(scan) /= window(match); 767 scan:= scan + 1; 768 match:= match + 1; 769 exit when window(scan) /= window(match); 770 scan:= scan + 1; 771 match:= match + 1; 772 exit when window(scan) /= window(match); 773 scan:= scan + 1; 774 match:= match + 1; 775 exit when window(scan) /= window(match); 776 scan:= scan + 1; 777 match:= match + 1; 778 exit when window(scan) /= window(match) or else scan >= strend; 779 end loop; 780 else 781 -- We check for insufficient lookahead after every comparison. 782 loop 783 scan:= scan + 1; 784 match:= match + 1; 785 exit when window(scan) /= window(match) or else scan >= strend; 786 end loop; 787 end if; 788 -- Assert(scan <= window+(unsigned)(window_size-1), "wild scan"); 789 len := MAX_MATCH - (strend - scan); 790 scan := strend - MAX_MATCH; 791 if len > best_len then 792 match_start := current_match; 793 best_len := len; 794 exit when len >= nice_match; 795 scan_end := scan + best_len; 796 end if; 797 end if; 798 current_match := Integer_M32(prev(unsigned(current_match) and WMASK)); 799 exit when current_match <= limit; 800 chain_length:= chain_length - 1; 801 exit when chain_length = 0; 802 end loop; 803 longest:= best_len; 804 end Longest_Match; 805 806 procedure LZ77_part_of_IZ_Deflate is 807 hash_head : Natural_M32:= NIL; -- head of hash chain 808 prev_match: Natural_M32; -- previous match [was: IPos] 809 match_available: Boolean:= False; -- set if previous match exists 810 match_length: Natural_M32:= MIN_MATCH - 1; -- length of best match 811 max_insert: Natural_M32; 812 begin 813 match_start:= 0; -- NB: no initialization in deflate.c 814 -- NB: level <= 3 would call deflate_fast; 815 -- 816 -- Process the input block. 817 while lookahead /= 0 loop 818 -- Insert the string window(strstart .. strstart + 2) in the 819 -- dictionary, and set hash_head to the head of the hash chain: 820 if lookahead >= MIN_MATCH then 821 INSERT_STRING(strstart, hash_head); 822 end if; 823 -- Find the longest match, discarding those <= prev_length. 824 prev_length := match_length; 825 prev_match := match_start; 826 match_length := MIN_MATCH - 1; 827 if hash_head /= NIL and then 828 prev_length < max_lazy_match and then 829 strstart - hash_head <= MAX_DIST 830 then 831 -- To simplify the code, we prevent matches with the string 832 -- of window index 0 (in particular we have to avoid a match 833 -- of the string with itself at the start of the input file). 834 -- 835 -- Do not look for matches beyond the end of the input. 836 -- This is necessary to make deflate deterministic. 837 if nice_match > lookahead then 838 nice_match := lookahead; 839 end if; 840 Longest_Match(hash_head, match_length); 841 -- Longest_Match sets match_start 842 if match_length > lookahead then 843 match_length := lookahead; 844 end if; 845 -- Ignore a length 3 match if it is too distant: 846 if match_length = MIN_MATCH and then strstart - match_start > TOO_FAR then 847 -- If prev_match is also MIN_MATCH, match_start is garbage 848 -- but we will ignore the current match anyway. 849 match_length := MIN_MATCH - 1; 850 end if; 851 end if; 852 -- If there was a match at the previous step and the current 853 -- match is not better, output the previous match: 854 if prev_length >= MIN_MATCH and then match_length <= prev_length then 855 max_insert:= strstart + lookahead - MIN_MATCH; 856 -- C: in DEBUG mode: check_match(strstart-1, prev_match, prev_length); 857 -- 858 ------------------------------------ 859 -- Output a Distance-Length code -- 860 ------------------------------------ 861 Write_DL_code(Positive(strstart - 1 - prev_match), Positive(prev_length)); 862 -- Insert in hash table all strings up to the end of the match. 863 -- strstart-1 and strstart are already inserted. 864 lookahead := lookahead - (prev_length-1); 865 prev_length := prev_length - 2; 866 loop 867 strstart:= strstart + 1; 868 if strstart <= max_insert then 869 INSERT_STRING(strstart, hash_head); 870 -- strstart never exceeds WSIZE - MAX_MATCH, so there 871 -- are always MIN_MATCH bytes ahead. 872 end if; 873 prev_length:= prev_length - 1; 874 exit when prev_length = 0; 875 end loop; 876 strstart:= strstart + 1; 877 match_available := False; 878 match_length := MIN_MATCH - 1; 879 elsif match_available then 880 -- If there was no match at the previous position, output a 881 -- single literal. If there was a match but the current match 882 -- is longer, truncate the previous match to a single literal. 883 -- 884 ------------------------ 885 -- Output a literal -- 886 ------------------------ 887 Write_literal(window(strstart-1)); 888 strstart:= strstart + 1; 889 lookahead := lookahead - 1; 890 else 891 -- There is no previous match to compare with, wait for the next step to decide. 892 match_available := True; 893 strstart:= strstart + 1; 894 lookahead := lookahead - 1; 895 end if; 896 -- Assert(strstart <= isize && lookahead <= isize, "a bit too far"); 897 -- 898 -- Make sure that we always have enough lookahead, except 899 -- at the end of the input file. We need MAX_MATCH bytes 900 -- for the next match, plus MIN_MATCH bytes to insert the 901 -- string following the next match. 902 if lookahead < MIN_LOOKAHEAD then 903 Fill_window; 904 end if; 905 end loop; 906 ----------------------------------- 907 -- Output last literal, if any -- 908 ----------------------------------- 909 if match_available then 910 Write_literal(window(strstart-1)); 911 end if; 912 end LZ77_part_of_IZ_Deflate; 913 914 Code_too_clever: exception; 915 begin 916 if Look_Ahead /= 258 or String_buffer_size /= 2 ** 15 or Threshold /= 2 then 917 raise Code_too_clever; -- was optimized for these parameters 918 end if; 919 window_size:= 0; 920 LM_Init(level); 921 LZ77_part_of_IZ_Deflate; 922 end LZ77_using_IZ; 923 924 --------------------------------------------------------------------- 925 -- BT4 - Binary tree of match positions selected with -- 926 -- the leading 2 to 4 bytes of each possible match. -- 927 --------------------------------------------------------------------- 928 929 -- Based on BT4.java by Lasse Collin, itself based on LzFind.c by Igor Pavlov. 930 931 procedure LZ77_using_BT4 is 932 MATCH_LEN_MIN: constant Integer:= Threshold + 1; 933 -- 934 readPos : Integer := -1; 935 cur_literal : Byte; 936 readLimit : Integer := -1; 937 finishing : constant Boolean := False; 938 writePos : Integer := 0; 939 pendingSize : Integer := 0; 940 -- 941 OPTS : constant := 4096; 942 EXTRA_SIZE_BEFORE : constant := OPTS; 943 EXTRA_SIZE_AFTER : constant := OPTS; 944 945 keepSizeBefore : constant Integer:= EXTRA_SIZE_BEFORE + String_buffer_size; 946 keepSizeAfter : constant Integer:= EXTRA_SIZE_AFTER + Look_Ahead; 947 reserveSize : constant Integer:= 948 Integer'Min( 949 String_buffer_size / 2 + 950 256 * (2 ** 10), -- 256 KB 951 512 * (2 ** 20) -- 512 MB 952 ); 953 getBufSize: constant Integer:= keepSizeBefore + keepSizeAfter + reserveSize; 954 955 type Int_array is array(Natural range <>) of Integer; 956 type p_Int_array is access Int_array; 957 procedure Dispose is new Ada.Unchecked_Deallocation(Int_array, p_Int_array); 958 959 procedure Normalize(positions: in out Int_array; normalizationOffset: Integer) is 960 begin 961 for i in 0 .. positions'Length - 1 loop 962 if positions(i) <= normalizationOffset then 963 positions(i) := 0; 964 else 965 positions(i) := positions(i) - normalizationOffset; 966 end if; 967 end loop; 968 end Normalize; 969 970 function getAvail return Integer is 971 pragma Inline(getAvail); 972 begin 973 -- !! - 1 added for getting readPos in buf'Range upon: cur_literal:= buf(readPos); 974 return writePos - readPos - 1; 975 end getAvail; 976 977 function movePos(requiredForFlushing, requiredForFinishing: Integer) return Integer is 978 avail: Integer; 979 begin 980 -- assert requiredForFlushing >= requiredForFinishing; 981 readPos := readPos + 1; 982 avail := getAvail; 983 if avail < requiredForFlushing then 984 if avail < requiredForFinishing or else not finishing 985 then 986 pendingSize:= pendingSize + 1; 987 -- GdM: this causes cyclicPos and lzpos not being in sync with readPos, 988 -- the pendingSize value is there for catching up. 989 avail := 0; 990 end if; 991 end if; 992 return avail; 993 end movePos; 994 995 function getHash4Size return Integer is 996 h : Unsigned_32:= Unsigned_32(String_buffer_size - 1); 997 begin 998 h:= h or Shift_Right(h, 1); 999 h:= h or Shift_Right(h, 2); 1000 h:= h or Shift_Right(h, 4); 1001 h:= h or Shift_Right(h, 8); 1002 h:= Shift_Right(h, 1); 1003 h:= h or 16#FFFF#; -- LzFind.c: "don't change it! It's required for Deflate" 1004 if h > 2 ** 24 then 1005 h:= Shift_Right(h, 1); 1006 end if; 1007 return Integer(h + 1); 1008 end getHash4Size; 1009 1010 type Byte_array is array(Natural range <>) of Byte; 1011 type p_Byte_array is access Byte_array; 1012 procedure Dispose is new Ada.Unchecked_Deallocation(Byte_array, p_Byte_array); 1013 1014 package Hash234 is 1015 HASH_2_SIZE : constant := 2 ** 10; 1016 HASH_2_MASK : constant := HASH_2_SIZE - 1; 1017 HASH_3_SIZE : constant := 2 ** 16; 1018 HASH_3_MASK : constant := HASH_3_SIZE - 1; 1019 hash_4_size : constant Integer:= getHash4Size; 1020 hash_4_mask : constant Unsigned_32:= Unsigned_32(hash_4_size) - 1; 1021 -- 1022 hash2Table: Int_array(0..HASH_2_SIZE-1) := (others => 0); -- !! initialization added 1023 hash3Table: Int_array(0..HASH_3_SIZE-1) := (others => 0); -- !! initialization added 1024 hash4Table: p_Int_array; 1025 -- 1026 hash2Value, hash3Value, hash4Value: Unsigned_32:= 0; 1027 -- 1028 procedure calcHashes(buf: Byte_array; off: Integer); 1029 procedure updateTables(pos: Integer); 1030 procedure Normalize(normalizeOffset: Integer); 1031 end Hash234; 1032 1033 package body Hash234 is 1034 1035 crcTable: array(Byte) of Unsigned_32; 1036 CRC32_POLY: constant:= 16#EDB8_8320#; 1037 1038 procedure calcHashes(buf: Byte_array; off: Integer) is 1039 temp: Unsigned_32 := crcTable(buf(off)) xor Unsigned_32(buf(off + 1)); 1040 begin 1041 hash2Value := temp and HASH_2_MASK; 1042 temp:= temp xor Shift_Left(Unsigned_32(buf(off + 2)), 8); 1043 hash3Value := temp and HASH_3_MASK; 1044 temp:= temp xor Shift_Left(crcTable(buf(off + 3)), 5); 1045 hash4Value := temp and hash_4_mask; 1046 end calcHashes; 1047 1048 procedure updateTables(pos: Integer) is 1049 begin 1050 hash2Table(Integer(hash2Value)) := pos; 1051 hash3Table(Integer(hash3Value)) := pos; 1052 hash4Table(Integer(hash4Value)) := pos; 1053 end updateTables; 1054 1055 procedure Normalize(normalizeOffset: Integer) is 1056 begin 1057 Normalize(hash2Table, normalizeOffset); 1058 Normalize(hash3Table, normalizeOffset); 1059 Normalize(hash4Table.all, normalizeOffset); 1060 end Normalize; 1061 1062 r: Unsigned_32; 1063 begin 1064 -- NB: heap allocation used only for convenience because of 1065 -- small default stack sizes on some compilers. 1066 hash4Table:= new Int_array(0..hash_4_size-1); 1067 hash4Table.all:= (others => 0); -- !! initialization added 1068 for i in Byte loop 1069 r:= Unsigned_32(i); 1070 for j in 0 .. 7 loop 1071 if (r and 1) = 0 then 1072 r:= Shift_Right(r, 1); 1073 else 1074 r:= Shift_Right(r, 1) xor CRC32_POLY; 1075 end if; 1076 end loop; 1077 crcTable(i) := r; 1078 end loop; 1079 end Hash234; 1080 1081 niceLen: constant Integer:= Integer'Min(162, Look_Ahead); -- const. was 64 1082 depthLimit: constant:= 48; -- Alternatively: 16 + niceLen / 2 1083 1084 -- !! nicer: unconstr. array of (dist, len) pairs, 1-based array. 1085 1086 type Any_Matches_type(countMax: Integer) is record 1087 count: Integer:= 0; 1088 len : Int_array(0 .. countMax); 1089 dist : Int_array(0 .. countMax); 1090 end record; 1091 1092 -- Subtracting 1 because the shortest match that this match 1093 -- finder can find is 2 bytes, so there's no need to reserve 1094 -- space for one-byte matches. 1095 subtype Matches_type is Any_Matches_type(niceLen - 1); 1096 1097 cyclicSize : constant Integer := String_buffer_size; -- Had: + 1; 1098 cyclicPos : Integer := -1; 1099 lzPos : Integer := cyclicSize; 1100 1101 max_dist: constant Integer:= cyclicSize; 1102 1103 package BT4_Algo is 1104 procedure skip(len: Natural); 1105 pragma Inline(skip); 1106 function getMatches return Matches_type; 1107 end BT4_Algo; 1108 1109 buf : p_Byte_array; 1110 tree : p_Int_array; 1111 1112 package body BT4_Algo is 1113 1114 function movePos return Integer is 1115 avail : constant Integer:= movePos(requiredForFlushing => niceLen, requiredForFinishing => 4); 1116 normalizationOffset: Integer; 1117 begin 1118 -- Put_Line("BT4_movePos"); 1119 if avail /= 0 then 1120 lzPos:= lzPos + 1; 1121 if lzPos = Integer'Last then 1122 normalizationOffset := Integer'Last - cyclicSize; 1123 Hash234.Normalize(normalizationOffset); 1124 Normalize(tree.all, normalizationOffset); 1125 lzPos:= lzPos - normalizationOffset; 1126 end if; 1127 cyclicPos:= cyclicPos + 1; 1128 if cyclicPos = cyclicSize then 1129 -- Put_Line("cyclicPos zeroed"); 1130 cyclicPos := 0; 1131 end if; 1132 end if; 1133 return avail; 1134 end movePos; 1135 1136 Null_position: constant:= -1; -- LzFind.c: kEmptyHashValue, 0 1137 1138 procedure skip_update_tree(niceLenLimit: Integer; currentMatch: in out Integer) is 1139 delta0, depth, ptr0, ptr1, pair, len, len0, len1: Integer; 1140 begin 1141 -- Put("BT4.skip_update_tree... "); 1142 depth := depthLimit; 1143 ptr0 := cyclicPos * 2 + 1; 1144 ptr1 := cyclicPos * 2; 1145 len0 := 0; 1146 len1 := 0; 1147 loop 1148 delta0 := lzPos - currentMatch; 1149 if depth = 0 or else delta0 >= max_dist then 1150 tree(ptr0) := Null_position; 1151 tree(ptr1) := Null_position; 1152 return; 1153 end if; 1154 depth:= depth - 1; 1155 if cyclicPos - delta0 < 0 then 1156 pair:= cyclicSize; 1157 else 1158 pair:= 0; 1159 end if; 1160 pair := (cyclicPos - delta0 + pair) * 2; 1161 len := Integer'Min(len0, len1); 1162 -- Match ? 1163 if buf(readPos + len - delta0) = buf(readPos + len) then 1164 -- No need to look for longer matches than niceLenLimit 1165 -- because we only are updating the tree, not returning 1166 -- matches found to the caller. 1167 loop 1168 len:= len + 1; 1169 if len = niceLenLimit then 1170 tree(ptr1) := tree(pair); 1171 tree(ptr0) := tree(pair + 1); 1172 return; 1173 end if; 1174 exit when buf(readPos + len - delta0) /= buf(readPos + len); 1175 end loop; 1176 end if; 1177 -- Bytes are no more matching. The past value is either smaller... 1178 if buf(readPos + len - delta0) < buf(readPos + len) then 1179 tree(ptr1) := currentMatch; 1180 ptr1 := pair + 1; 1181 currentMatch := tree(ptr1); 1182 len1 := len; 1183 else -- ... or larger 1184 tree(ptr0) := currentMatch; 1185 ptr0 := pair; 1186 currentMatch := tree(ptr0); 1187 len0 := len; 1188 end if; 1189 end loop; 1190 end skip_update_tree; 1191 1192 procedure skip(len: Natural) is 1193 -- 1194 procedure Skip_one is 1195 pragma Inline(Skip_one); 1196 niceLenLimit, avail, currentMatch: Integer; 1197 begin 1198 niceLenLimit := niceLen; 1199 avail := movePos; 1200 if avail < niceLenLimit then 1201 if avail = 0 then 1202 return; 1203 end if; 1204 niceLenLimit := avail; 1205 end if; 1206 Hash234.calcHashes(buf.all, readPos); 1207 currentMatch := Hash234.hash4Table (Integer(Hash234.hash4Value)); 1208 Hash234.updateTables(lzPos); 1209 skip_update_tree(niceLenLimit, currentMatch); 1210 end Skip_one; 1211 -- 1212 begin 1213 for count in reverse 1 .. len loop 1214 Skip_one; 1215 end loop; 1216 end skip; 1217 1218 function getMatches return Matches_type is 1219 matches: Matches_type; 1220 matchLenLimit : Integer := Look_Ahead; 1221 niceLenLimit : Integer := niceLen; 1222 avail: Integer; 1223 delta0, delta2, delta3, currentMatch, 1224 lenBest, depth, ptr0, ptr1, pair, len, len0, len1: Integer; 1225 begin 1226 -- Put("BT4.getMatches... "); 1227 matches.count:= 0; 1228 avail:= movePos; 1229 if avail < matchLenLimit then 1230 if avail = 0 then 1231 return matches; 1232 end if; 1233 matchLenLimit := avail; 1234 if niceLenLimit > avail then 1235 niceLenLimit := avail; 1236 end if; 1237 end if; 1238 -- 1239 Hash234.calcHashes(buf.all, readPos); 1240 delta2 := lzPos - Hash234.hash2Table (Integer(Hash234.hash2Value)); 1241 delta3 := lzPos - Hash234.hash3Table (Integer(Hash234.hash3Value)); 1242 currentMatch := Hash234.hash4Table (Integer(Hash234.hash4Value)); 1243 Hash234.updateTables(lzPos); 1244 -- 1245 lenBest := 0; 1246 -- See if the hash from the first two bytes found a match. 1247 -- The hashing algorithm guarantees that if the first byte 1248 -- matches, also the second byte does, so there's no need to 1249 -- test the second byte. 1250 if delta2 < max_dist and then buf(readPos - delta2) = buf(readPos) then 1251 -- Match of length 2 found and checked. 1252 lenBest := 2; 1253 matches.len(0) := 2; 1254 matches.dist(0) := delta2 - 1; 1255 matches.count := 1; 1256 end if; 1257 -- See if the hash from the first three bytes found a match that 1258 -- is different from the match possibly found by the two-byte hash. 1259 -- Also here the hashing algorithm guarantees that if the first byte 1260 -- matches, also the next two bytes do. 1261 if delta2 /= delta3 and then delta3 < max_dist 1262 and then buf(readPos - delta3) = buf(readPos) 1263 then 1264 -- Match of length 3 found and checked. 1265 lenBest := 3; 1266 matches.count := matches.count + 1; 1267 matches.dist(matches.count - 1) := delta3 - 1; 1268 delta2 := delta3; 1269 end if; 1270 -- If a match was found, see how long it is. 1271 if matches.count > 0 then 1272 while lenBest < matchLenLimit and then buf(readPos + lenBest - delta2) 1273 = buf(readPos + lenBest) 1274 loop 1275 lenBest:= lenBest + 1; 1276 end loop; 1277 matches.len(matches.count - 1) := lenBest; 1278 -- Return if it is long enough (niceLen or reached the end of the dictionary). 1279 if lenBest >= niceLenLimit then 1280 skip_update_tree(niceLenLimit, currentMatch); 1281 return matches; 1282 end if; 1283 end if; 1284 -- Long enough match wasn't found so easily. Look for better matches from the binary tree. 1285 if lenBest < 3 then 1286 lenBest := 3; 1287 end if; 1288 depth := depthLimit; 1289 ptr0 := cyclicPos * 2 + 1; 1290 ptr1 := cyclicPos * 2; 1291 len0 := 0; 1292 len1 := 0; 1293 -- 1294 loop 1295 delta0 := lzPos - currentMatch; 1296 -- Return if the search depth limit has been reached or 1297 -- if the distance of the potential match exceeds the 1298 -- dictionary size. 1299 if depth = 0 or else delta0 >= max_dist then 1300 tree(ptr0) := Null_position; 1301 tree(ptr1) := Null_position; 1302 return matches; 1303 end if; 1304 depth:= depth - 1; 1305 -- 1306 if cyclicPos - delta0 < 0 then 1307 pair:= cyclicSize; 1308 else 1309 pair:= 0; 1310 end if; 1311 pair := (cyclicPos - delta0 + pair) * 2; 1312 len := Integer'Min(len0, len1); 1313 -- Match ? 1314 if buf(readPos + len - delta0) = buf(readPos + len) then 1315 loop 1316 len:= len + 1; 1317 exit when len >= matchLenLimit 1318 or else buf(readPos + len - delta0) /= buf(readPos + len); 1319 end loop; 1320 if len > lenBest then 1321 lenBest := len; 1322 matches.len(matches.count) := len; 1323 matches.dist(matches.count) := delta0 - 1; 1324 matches.count:= matches.count + 1; 1325 if len >= niceLenLimit then 1326 tree(ptr1) := tree(pair); 1327 tree(ptr0) := tree(pair + 1); 1328 return matches; 1329 end if; 1330 end if; 1331 end if; 1332 -- Bytes are no more matching. The past value is either smaller... 1333 if buf(readPos + len - delta0) < buf(readPos + len) then 1334 tree(ptr1) := currentMatch; 1335 ptr1 := pair + 1; 1336 currentMatch := tree(ptr1); 1337 len1 := len; 1338 else -- ... or larger 1339 tree(ptr0) := currentMatch; 1340 ptr0 := pair; 1341 currentMatch := tree(ptr0); 1342 len0 := len; 1343 end if; 1344 end loop; 1345 end getMatches; 1346 1347 begin 1348 -- NB: heap allocation used only for convenience because of 1349 -- small default stack sizes on some compilers. 1350 tree:= new Int_array(0 .. cyclicSize * 2 - 1); 1351 tree.all:= (others => Null_position); 1352 end BT4_Algo; 1353 1354 -- Moves data from the end of the buffer to the beginning, discarding 1355 -- old data and making space for new input. 1356 1357 procedure moveWindow is 1358 -- Align the move to a multiple of 16 bytes (LZMA-friendly, see pos_bits) 1359 moveOffset : constant Integer := ((readPos + 1 - keepSizeBefore) / 16) * 16; 1360 moveSize : constant Integer := writePos - moveOffset; 1361 begin 1362 -- Put_Line(" Move window, size=" & moveSize'Img & " offset=" & moveOffset'Img); 1363 buf(0 .. moveSize - 1):= buf(moveOffset .. moveOffset + moveSize - 1); 1364 readPos := readPos - moveOffset; 1365 readLimit := readLimit - moveOffset; 1366 writePos := writePos - moveOffset; 1367 end moveWindow; 1368 1369 -- Copies new data into the buffer. 1370 function fillWindow(len_initial: Integer) return Integer is 1371 1372 -- Process pending data that hasn't been ran through the match finder yet. 1373 -- Run it through the match finder now if there is enough new data 1374 -- available (readPos < readLimit) that the encoder may encode at 1375 -- least one more input byte. 1376 -- 1377 procedure processPendingBytes is 1378 oldPendingSize: Integer; 1379 begin 1380 if pendingSize > 0 and then readPos < readLimit then 1381 readPos := readPos - pendingSize; 1382 oldPendingSize := pendingSize; 1383 pendingSize := 0; 1384 BT4_Algo.skip(oldPendingSize); 1385 end if; 1386 end processPendingBytes; 1387 -- 1388 len: Integer:= len_initial; 1389 actual_len: Integer:= 0; 1390 begin 1391 -- Put_Line("Fill window - start"); 1392 -- Move the sliding window if needed. 1393 if readPos >= buf'Length - keepSizeAfter then 1394 moveWindow; 1395 end if; 1396 1397 -- Try to fill the dictionary buffer up to its boundary. 1398 if len > buf'Length - writePos then 1399 len := buf'Length - writePos; 1400 end if; 1401 1402 while len > 0 and then More_bytes loop 1403 buf(writePos):= Read_byte; 1404 writePos:= writePos + 1; 1405 len:= len - 1; 1406 actual_len:= actual_len + 1; 1407 end loop; 1408 1409 -- Set the new readLimit but only if there's enough data to allow 1410 -- encoding of at least one more byte. 1411 if writePos >= keepSizeAfter then 1412 readLimit := writePos - keepSizeAfter; 1413 end if; 1414 1415 processPendingBytes; 1416 1417 -- Put_Line("Fill window, requested=" & len_initial'Img & " actual=" & actual_len'Img); 1418 -- Tell the caller how much input we actually copied into the dictionary. 1419 return actual_len; 1420 end fillWindow; 1421 1422 matches : Matches_type; 1423 readAhead : Integer := -1; -- LZMAEncoder.java 1424 1425 function getMatches return Matches_type is 1426 begin 1427 readAhead:= readAhead + 1; 1428 return BT4_Algo.getMatches; 1429 end getMatches; 1430 1431 procedure skip(len: Natural) is 1432 pragma Inline(skip); 1433 begin 1434 readAhead:= readAhead + len; 1435 BT4_Algo.skip(len); 1436 end skip; 1437 1438 -- Small stack of recent distances used for LZ. 1439 subtype Repeat_stack_range is Integer range 0..3; 1440 rep_dist: array(Repeat_stack_range) of Natural := (others => 0); 1441 1442 procedure getNextSymbol is 1443 avail, mainLen, mainDist, newLen, newDist, limit: Integer; 1444 1445 function changePair(smallDist, bigDist: Integer) return Boolean is 1446 pragma Inline(changePair); 1447 begin 1448 return smallDist < bigDist / 128; 1449 end changePair; 1450 1451 -- This function is for debugging. The matches stored in the 'tree' array 1452 -- may be wrong if the variables cyclicPos, lzPos and readPos are not in sync. 1453 -- The issue seems to have been solved now (rev. 489). 1454 function Is_match_correct(shift: Natural) return Boolean is 1455 pragma Inline(Is_match_correct); 1456 paranoid: constant Boolean:= True; 1457 begin 1458 if paranoid then 1459 for i in reverse -1 + shift .. mainLen - 2 + shift loop 1460 if buf(readPos - (mainDist+1) + i) /= buf(readPos + i) then 1461 return False; -- Should not occur (check with code coverage) 1462 end if; 1463 end loop; 1464 end if; 1465 return True; 1466 end Is_match_correct; 1467 1468 function getMatchLen(dist, lenLimit: Integer) return Natural is 1469 pragma Inline(getMatchLen); 1470 backPos: constant Integer := readPos - dist - 1; 1471 len: Integer := 0; 1472 begin 1473 if dist < 1 then 1474 return 0; 1475 end if; 1476 -- @ if readPos+len not in buf.all'Range then 1477 -- @ Put("**** readpos " & buf'Last'Img & readPos'Img); 1478 -- @ end if; 1479 -- @ if backPos+len not in buf.all'Range then 1480 -- @ Put("**** backpos " & buf'Last'Img & backPos'Img); 1481 -- @ end if; 1482 while len < lenLimit and then buf(readPos + len) = buf(backPos + len) loop 1483 len:= len + 1; 1484 end loop; 1485 return len; 1486 end getMatchLen; 1487 1488 procedure Send_first_literal_of_match is 1489 begin 1490 Write_literal(cur_literal); 1491 readAhead := readAhead - 1; 1492 end Send_first_literal_of_match; 1493 1494 procedure Send_DL_code( distance, length: Integer ) is 1495 found_repeat: Integer:= rep_dist'First - 1; 1496 aux: Integer; 1497 begin 1498 Write_DL_code(distance + 1, length); 1499 readAhead := readAhead - length; 1500 if LZMA_friendly then 1501 -- 1502 -- Manage the stack of recent distances in the same way the "MA" part of LZMA does. 1503 -- 1504 for i in rep_dist'Range loop 1505 if distance = rep_dist(i) then 1506 found_repeat:= i; 1507 exit; 1508 end if; 1509 end loop; 1510 if found_repeat >= rep_dist'First then 1511 -- Roll the stack of recent distances up to the item with index found_repeat, 1512 -- which becomes first. If found_repeat = rep_dist'First, no actual change occurs. 1513 aux:= rep_dist(found_repeat); 1514 for i in reverse rep_dist'First + 1 .. found_repeat loop 1515 rep_dist(i) := rep_dist(i-1); 1516 end loop; 1517 rep_dist(rep_dist'First):= aux; 1518 else 1519 -- Shift the stack of recent distances; the new distance becomes the first item. 1520 for i in reverse rep_dist'First + 1 .. rep_dist'Last loop 1521 rep_dist(i) := rep_dist(i-1); 1522 end loop; 1523 rep_dist(0) := distance; 1524 end if; 1525 end if; 1526 end Send_DL_code; 1527 1528 bestRepLen, bestRepIndex, len: Integer; 1529 1530 begin 1531 -- Get the matches for the next byte unless readAhead indicates 1532 -- that we already got the new matches during the previous call 1533 -- to this procedure. 1534 if readAhead = -1 then 1535 matches := getMatches; 1536 end if; 1537 -- @ if readPos not in buf.all'Range then 1538 -- @ Put("**** " & buf'Last'Img & keepSizeAfter'Img & readPos'Img & writePos'Img); 1539 -- @ end if; 1540 cur_literal:= buf(readPos); 1541 -- Get the number of bytes available in the dictionary, but 1542 -- not more than the maximum match length. If there aren't 1543 -- enough bytes remaining to encode a match at all, return 1544 -- immediately to encode this byte as a literal. 1545 avail := Integer'Min(getAvail, Look_Ahead); 1546 if avail < MATCH_LEN_MIN then 1547 -- Put("[a]"); 1548 Send_first_literal_of_match; 1549 return; 1550 end if; 1551 1552 if LZMA_friendly then 1553 -- Look for a match from the previous four different match distances. 1554 bestRepLen := 0; 1555 bestRepIndex := 0; 1556 for rep in Repeat_stack_range loop 1557 len := getMatchLen(rep_dist(rep), avail); 1558 if len >= MATCH_LEN_MIN then 1559 -- If it is long enough, return it. 1560 if len >= niceLen then 1561 skip(len - 1); 1562 -- Put_Line("[DL RA]"); 1563 Send_DL_code(rep_dist(rep), len); 1564 return; 1565 end if; 1566 -- Remember the index and length of the best repeated match. 1567 if len > bestRepLen then 1568 bestRepIndex := rep; 1569 bestRepLen := len; 1570 end if; 1571 end if; 1572 end loop; 1573 end if; 1574 1575 mainLen := 0; 1576 mainDist := 0; 1577 if matches.count > 0 then 1578 mainLen := matches.len(matches.count - 1); 1579 mainDist := matches.dist(matches.count - 1); 1580 if mainLen >= niceLen then 1581 if Is_match_correct(1) then 1582 skip(mainLen - 1); 1583 -- Put_Line("[DL A]" & mainDist'Img & mainLen'Img); 1584 Send_DL_code(mainDist, mainLen); 1585 return; 1586 else 1587 -- Put_Line("Wrong match [A]! pos=" & Integer'Image(lzPos - cyclicSize)); 1588 Send_first_literal_of_match; 1589 return; 1590 end if; 1591 end if; 1592 while matches.count > 1 and then mainLen = matches.len(matches.count - 2) + 1 loop 1593 exit when not changePair(matches.dist(matches.count - 2), mainDist); 1594 matches.count:= matches.count - 1; 1595 mainLen := matches.len(matches.count - 1); 1596 mainDist := matches.dist(matches.count - 1); 1597 end loop; 1598 if mainLen = MATCH_LEN_MIN and then mainDist >= 128 then 1599 mainLen := 1; 1600 end if; 1601 end if; 1602 1603 if LZMA_friendly 1604 and then bestRepLen >= MATCH_LEN_MIN 1605 and then ( bestRepLen + 1 >= mainLen 1606 or else (bestRepLen + 2 >= mainLen and then mainDist >= 2 ** 9) 1607 or else (bestRepLen + 3 >= mainLen and then mainDist >= 2 ** 15) ) 1608 then 1609 skip(bestRepLen - 1); 1610 -- Put_Line("[DL RB]"); 1611 Send_DL_code(rep_dist(bestRepIndex), bestRepLen); 1612 return; 1613 end if; 1614 1615 if mainLen < MATCH_LEN_MIN or else avail <= MATCH_LEN_MIN then 1616 --Put("[b]"); 1617 Send_first_literal_of_match; 1618 return; 1619 end if; 1620 1621 -- Get the next match. Test if it is better than the current match. 1622 -- If so, encode the current byte as a literal. 1623 matches := getMatches; 1624 -- 1625 if matches.count > 0 then 1626 newLen := matches.len(matches.count - 1); 1627 newDist := matches.dist(matches.count - 1); 1628 if (newLen >= mainLen and then newDist < mainDist) 1629 or else (newLen = mainLen + 1 1630 and then not changePair(mainDist, newDist)) 1631 or else newLen > mainLen + 1 1632 or else (newLen + 1 >= mainLen 1633 and then mainLen >= MATCH_LEN_MIN + 1 1634 and then changePair(newDist, mainDist)) 1635 then 1636 --Put("[c]"); 1637 --Put(Character'Val(cur_literal)); 1638 Send_first_literal_of_match; 1639 return; 1640 end if; 1641 end if; 1642 1643 limit := Integer'Max(mainLen - 1, MATCH_LEN_MIN); 1644 for rep in rep_dist'Range loop 1645 if getMatchLen(rep_dist(rep), limit) = limit then 1646 Send_first_literal_of_match; 1647 return; 1648 end if; 1649 end loop; 1650 1651 if Is_match_correct(0) then 1652 skip(mainLen - 2); 1653 -- Put_Line("[DL B]" & mainDist'Img & mainLen'Img); 1654 Send_DL_code(mainDist, mainLen); 1655 else 1656 -- Put_Line("Wrong match [B]!"); 1657 Send_first_literal_of_match; 1658 end if; 1659 end getNextSymbol; 1660 1661 actual_written, avail: Integer; 1662 begin 1663 -- NB: heap allocation used only for convenience because of 1664 -- small default stack sizes on some compilers. 1665 buf:= new Byte_array(0 .. getBufSize); 1666 actual_written:= fillWindow(String_buffer_size); 1667 if actual_written > 0 then 1668 loop 1669 getNextSymbol; 1670 avail:= getAvail; 1671 if avail = 0 then 1672 actual_written:= fillWindow(String_buffer_size); 1673 exit when actual_written = 0; 1674 end if; 1675 end loop; 1676 end if; 1677 Dispose(buf); 1678 Dispose(tree); 1679 Dispose(Hash234.hash4Table); 1680 end LZ77_using_BT4; 1681 1682 begin 1683 case Method is 1684 when LZHuf => 1685 LZ77_using_LZHuf; 1686 when IZ_4 .. IZ_10 => 1687 LZ77_using_IZ( 4 + Method_Type'Pos(Method) - Method_Type'Pos(IZ_4) ); 1688 when BT4 => 1689 LZ77_using_BT4; 1690 end case; 1691 end Encode; 1692 1693end LZ77; 1694