1--  There are three LZ77 encoders at choice here:
2--
3--    1/  LZ77_using_LZHuf, based on LZHuf
4--
5--    2/  LZ77_using_IZ, based on Info-Zip's Zip's deflate.c which is
6--          actually the LZ77 part of Zip's compression.
7--
8--    3/  LZ77_using_BT4, based on LZMA SDK's BT4 algorithm.
9--
10--  Variant 1/, LZ77_using_LZHuf, is working since 2009. Two problems: it is slow
11--     and not well adapted to the Deflate format (mediocre compression).
12--
13--  Variant 2/, LZ77_using_IZ, is much faster, and better suited for Deflate.
14--     It has been added on 05-Mar-2016.
15--     The code is tailored and optimized for a single set of
16--     the String_buffer_size, Look_Ahead, Threshold LZ77 parameters - those for Deflate.
17--
18--  Variant 3/, LZ77_using_BT4, was added on 06-Sep-2016.
19--     The seems to be the best match finder for LZMA on data of the >= 1 MB scale.
20
21--  To do:
22--
23--  2/
24--    - LZ77 / IZ: similar to the test with TOO_FAR, try to cluster distances around
25--        values needing less extra bits (may not work at all...)
26--    - LZ77 / IZ: tune TOO_FAR (max: 32767), see http://optipng.sf.net/pngtech/too_far.html
27--        "TOO_FAR in zlib Is Not Too Far" for discussion
28
29--  Legal licensing note:
30
31--  Copyright (c) 2016 .. 2018 Gautier de Montmollin (maintainer of the Ada version)
32--  SWITZERLAND
33
34--  Permission is hereby granted, free of charge, to any person obtaining a copy
35--  of this software and associated documentation files (the "Software"), to deal
36--  in the Software without restriction, including without limitation the rights
37--  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
38--  copies of the Software, and to permit persons to whom the Software is
39--  furnished to do so, subject to the following conditions:
40
41--  The above copyright notice and this permission notice shall be included in
42--  all copies or substantial portions of the Software.
43
44--  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
45--  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
46--  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
47--  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
48--  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
49--  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
50--  THE SOFTWARE.
51
52-- NB: this is the MIT License, as found 21-Aug-2016 on the site
53-- http://www.opensource.org/licenses/mit-license.php
54
55with Ada.Unchecked_Deallocation;
56with Interfaces; use Interfaces;
57with System;
58
59package body LZ77 is
60
61  --  System.Word_Size: 13.3(8): A word is the largest amount of storage
62  --  that can be conveniently and efficiently manipulated by the hardware,
63  --  given the implementation's run-time model.
64  --
65  min_bits_32: constant:= Integer'Max(32, System.Word_Size);
66  min_bits_16: constant:= Integer'Max(16, System.Word_Size);
67
68  --  We define an Integer type which is at least 32 bits, but n bits
69  --  on a native n (> 32) bits architecture (no performance hit on 64+
70  --  bits architectures).
71  --  Integer_M16 not needed: Integer already guarantees 16 bits
72  --
73  type Integer_M32 is range -2**(min_bits_32-1) .. 2**(min_bits_32-1) - 1;
74  subtype Natural_M32  is Integer_M32 range 0..Integer_M32'Last;
75
76  type Unsigned_M16 is mod 2**min_bits_16;
77  type Unsigned_M32 is mod 2**min_bits_32;
78
79  procedure Encode is
80
81    -----------------------
82    --  LZHuf algorithm  --
83    -----------------------
84
85    procedure LZ77_using_LZHuf is
86      --  Based on LZHUF by OKUMURA & YOSHIZAKI.
87      --  Here the adaptive Huffman coding is thrown away:
88      --  algorithm is used only to find matching patterns.
89
90      N_Char    : constant Integer:= 256-Threshold+Look_Ahead;
91      -- Character code (= 0..N_CHAR-1)
92      Max_Table     : constant Integer:= N_Char*2-1;
93
94      type Text_Buffer is array ( 0..String_buffer_size+Look_Ahead-1 ) of Byte;
95      empty_buffer: constant Text_Buffer:= (others=> 32); -- ' '
96
97      -- > The Huffman frequency handling is made generic so we have
98      --   one copy of the tree and of the frequency table for Encode
99      --   and one for Decode
100
101      generic
102      package Huffman is
103        --- Pointing parent nodes.
104        --- Area [Max_Table..(Max_Table + N_CHAR - 1)] are pointers for leaves
105        Parent:  array ( 0..Max_Table+N_Char-1 ) of Natural;
106        --- Pointing children nodes (son[], son[] + 1)
107        Son   :  array ( 0..Max_Table-1 )  of Natural;
108
109        Root_Position : constant Integer:= Max_Table-1; -- (can be always Son'last ?)
110
111        procedure Start;
112        procedure Update_Freq_Tree( C0: Natural );
113      end Huffman;
114
115      package body Huffman is
116
117        Freq: array ( 0..Max_Table ) of Natural; -- Cumulative freq table
118
119        Max_Freq: constant := 16#8000#;
120        -- ^-- update when cumulative frequency reaches to this value
121
122        procedure Start is
123          I: Natural;
124        begin
125          for J in  0 .. N_Char-1  loop
126            Freq(J):= 1;
127            Son (J):= J + Max_Table;
128            Parent(J + Max_Table):= J;
129          end loop;
130
131          I:= 0;
132          for J in N_Char .. Root_Position  loop
133            Freq(J):= Freq(I)+Freq(I+1);
134            Son (J):= I;
135            Parent(I):= J;
136            Parent(I+1):= J;
137            I:= I + 2;
138          end loop;
139
140          Freq( Freq'Last ):= 16#FFFF#; -- ( Max_Table )
141          Parent( Root_Position ):= 0;
142        end Start;
143
144        procedure Update_Freq_Tree( C0: Natural ) is
145
146          procedure Reconstruct_Freq_Tree is
147            I,J,K,F,L: Natural;
148          begin
149            -- Halven cumulative freq for leaf nodes
150            J:= 0;
151            for I in 0 .. Root_Position loop
152              if Son(I) >= Max_Table then
153                Freq(J):= (Freq(I)+1) / 2;
154                Son (J):= Son(I);
155                J:= J + 1;
156              end if;
157            end loop;
158
159            -- Make a tree : first, connect children nodes
160            I:= 0;
161            for J in N_Char .. Root_Position loop -- J : free nodes
162              K:= I+1;
163              F:= Freq(I) + Freq(K); -- new frequency
164              Freq(J):= F;
165              K:= J-1;
166              while F < Freq(K) loop
167                K:= K-1;
168              end loop;
169
170              K:= K+1;
171              L:= J-K; -- 2007: fix: was L:= (J-K)*2, memcopy parameter remain
172
173              Freq( K+1 .. K+L ):= Freq( K .. K+L-1 ); -- shift by one cell right
174              Freq(K):= F;
175              Son ( K+1 .. K+L ):= Son ( K .. K+L-1 ); -- shift by one cell right
176              Son (K):= I;
177              I:= I + 2;
178            end loop;
179
180            -- Connect parent nodes
181            for I in 0 .. Max_Table-1 loop
182              K:= Son(I);
183              Parent(K):= I;
184              if K < Max_Table then
185                Parent(K+1):= I;
186              end if;
187            end loop;
188
189          end Reconstruct_Freq_Tree;
190
191          C,I,J,K,L: Natural;
192
193        begin -- Update_Freq_Tree;
194          if Freq( Root_Position ) = Max_Freq then
195            Reconstruct_Freq_Tree;
196          end if;
197          C:= Parent(C0 + Max_Table);
198          loop
199            Freq(C):= Freq(C) + 1;
200            K:= Freq(C);
201            -- Swap nodes to keep the tree freq-ordered
202            L:= C+1;
203            if  K > Freq(L) then
204              while K > Freq(L+1) loop
205                L:= L + 1;
206              end loop;
207
208              Freq(C):= Freq(L);
209              Freq(L):= K;
210
211              I:= Son(C);
212              Parent(I):= L;
213              if I < Max_Table then
214                Parent(I+1):= L;
215              end if;
216
217              J:= Son(L);
218              Son(L):= I;
219
220              Parent(J):= C;
221              if J < Max_Table then
222                Parent(J+1):= C;
223              end if;
224              Son(C):= J;
225
226              C := L;
227            end if;
228            C:= Parent(C);
229            exit when C=0;
230          end loop;        -- do it until reaching the root
231        end Update_Freq_Tree;
232
233      end Huffman;
234
235      Node_Nil : constant Integer:= String_buffer_size;    -- End of tree's node
236
237      Lson,Dad:  array ( 0..String_buffer_size       ) of Natural;
238      Rson:      array ( 0..String_buffer_size + 256 ) of Natural;
239
240      procedure Init_Tree is
241      begin
242        for I in String_buffer_size+1 .. Rson'Last loop
243          Rson(I) := Node_Nil;
244        end loop; -- root
245        for I in 0 .. String_buffer_size-1 loop
246          Dad(I)  := Node_Nil;
247        end loop; -- node
248      end Init_Tree;
249
250      Match_Position : Natural;
251      Match_Length   : Natural;
252
253      Text_Buf: Text_Buffer:= empty_buffer;
254
255      procedure Insert_Node (R: Integer) is
256        I,P: Integer;
257        Geq: Boolean:= True;
258        C:   Natural;
259      begin
260        P:= String_buffer_size + 1 + Integer(Text_Buf(R));
261        Rson(R):= Node_Nil;
262        Lson(R):= Node_Nil;
263        Match_Length := 0;
264        loop
265          if Geq then
266            if Rson(P) = Node_Nil then
267              Rson(P):= R;
268              Dad(R) := P;
269              return;
270            end if;
271            P:= Rson(P);
272          else
273            if Lson(P) = Node_Nil then
274              Lson(P):= R;
275              Dad(R) := P;
276              return;
277            end if;
278            P:= Lson(P);
279          end if;
280          I:= 1;
281          while I < Look_Ahead and then Text_Buf(R+I) = Text_Buf(P+I)  loop
282            I:= I + 1;
283          end loop;
284
285          Geq:= Text_Buf(R+I) >= Text_Buf(P+I) or I = Look_Ahead;
286
287          if I > Threshold then
288            if I > Match_Length then
289              Match_Position := (R-P) mod String_buffer_size - 1;
290              Match_Length:= I;
291              exit when Match_Length >= Look_Ahead;
292            end if;
293            if I = Match_Length then
294              C:= (R-P) mod String_buffer_size - 1;
295              if C < Match_Position then
296                Match_Position:= C;
297              end if;
298            end if;
299          end if;
300        end loop;
301
302        Dad (R):= Dad (P);
303        Lson(R):= Lson(P);
304        Rson(R):= Rson(P);
305        Dad(Lson(P)):= R;
306        Dad(Rson(P)):= R;
307        if Rson(Dad(P)) = P then
308          Rson(Dad(P)):= R;
309        else
310          Lson(Dad(P)):= R;
311        end if;
312        Dad(P):= Node_Nil; -- remove P
313      end Insert_Node;
314
315      procedure Delete_Node (P: Natural) is
316        Q: Natural;
317      begin
318        if Dad(P) = Node_Nil then  -- unregistered
319          return;
320        end if;
321        if    Rson(P) = Node_Nil then
322          Q:= Lson(P);
323        elsif Lson(P) = Node_Nil then
324          Q:= Rson(P);
325        else
326          Q:= Lson(P);
327          if Rson(Q) /= Node_Nil then
328            loop
329              Q:= Rson(Q);
330              exit when Rson(Q) = Node_Nil;
331            end loop;
332            Rson(Dad(Q)):= Lson(Q);
333            Dad(Lson(Q)):= Dad(Q);
334            Lson(Q):= Lson(P);
335            Dad(Lson(P)):= Q;
336          end if;
337          Rson(Q):= Rson(P);
338          Dad(Rson(P)):= Q;
339        end if;
340        Dad(Q):= Dad(P);
341        if Rson(Dad(P))=P then
342          Rson(Dad(P)):= Q;
343        else
344          Lson(Dad(P)):= Q;
345        end if;
346        Dad(P):= Node_Nil;
347      end Delete_Node;
348
349      package Huffman_E is new Huffman;
350
351      I,R,S,Last_Match_Length: Natural;
352      Len: Integer;
353      C: Byte;
354    begin
355      if not More_bytes then
356        return;
357      end if;
358      Huffman_E.Start;
359      Init_Tree;
360      S:= 0;
361      R:= String_buffer_size - Look_Ahead;
362      Len:= 0;
363      while Len < Look_Ahead and More_bytes loop
364        Text_Buf(R+Len):= Read_byte;
365        Len:= Len + 1;
366      end loop;
367
368      --  Seems: fill dictionary with default value
369      --
370      --  for I in 1.. Look_Ahead loop
371      --    Insert_Node(R - I);
372      --  end loop;
373
374      Insert_Node(R);
375
376      loop
377        if Match_Length > Len then
378          Match_Length:= Len;
379        end if;
380        if Match_Length <= Threshold then
381          Match_Length := 1;
382          Huffman_E.Update_Freq_Tree( Natural(Text_Buf(R)) );
383          Write_literal( Text_Buf(R) );
384        else
385          Write_DL_code(Match_Position+1, Match_Length);
386        end if;
387        Last_Match_Length := Match_Length;
388        I:= 0;
389        while I < Last_Match_Length and More_bytes loop
390          I:= I + 1;
391          Delete_Node(S);
392          C:= Read_byte;
393          Text_Buf(S):= C;
394          if  S < Look_Ahead-1 then
395            Text_Buf(S+String_buffer_size):= C;
396          end if;
397          S:= (S+1) mod String_buffer_size;
398          R:= (R+1) mod String_buffer_size;
399          Insert_Node(R);
400        end loop;
401
402        while I < Last_Match_Length loop
403          I:= I + 1;
404          Delete_Node(S);
405          S := (S+1) mod String_buffer_size;
406          R := (R+1) mod String_buffer_size;
407          Len:= Len - 1;
408          if Len > 0 then
409            Insert_Node(R);
410          end if;
411        end loop;
412
413        exit when Len=0;
414      end loop;
415    end LZ77_using_LZHuf;
416
417    --------------------------
418    --  Info-Zip algorithm  --
419    --------------------------
420
421    --  LZ77_using_IZ: based on deflate.c by Jean-Loup Gailly.
422    --  Core description of the algorithm:
423    --
424    --     The most straightforward technique turns out to be the fastest for
425    --     most input files: try all possible matches and select the longest.
426    --     The key feature of this algorithm is that insertions into the string
427    --     dictionary are very simple and thus fast, and deletions are avoided
428    --     completely. Insertions are performed at each input character, whereas
429    --     string matches are performed only when the previous match ends. So it
430    --     is preferable to spend more time in matches to allow very fast string
431    --     insertions and avoid deletions. The matching algorithm for small
432    --     strings is inspired from that of Rabin & Karp [1]. A brute force approach
433    --     is used to find longer strings when a small match has been found.
434    --
435    --     The idea of lazy evaluation of matches is due to Jan-Mark Wams.
436    --
437    --     [1] A description of the Rabin and Karp algorithm is given in the book
438    --         "Algorithms" by R. Sedgewick, Addison-Wesley, p252.
439    --
440    --  About hashing: chapter 6.4 of The Art of Computer Programming, Volume 3, D.E. Knuth
441    --  Rabin and Karp algorithm: http://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
442
443    --  Compression level: 0: store, 1: best speed, 9: best compression, 10: variant of level 9
444    --  Ada code: only levels 4 to 10 are supported.
445
446    procedure LZ77_using_IZ(level: Natural) is
447      HASH_BITS: constant:= 15;  --  13..15
448      HASH_SIZE: constant:= 2 ** HASH_BITS;
449      HASH_MASK: constant:= HASH_SIZE - 1;
450      WSIZE    : constant Integer_M32:= Integer_M32(String_buffer_size);
451      WMASK    : constant Unsigned_M16:= Unsigned_M16(WSIZE - 1);
452      --  HASH_SIZE and WSIZE must be powers of two
453      NIL      : constant:= 0;  --  Tail of hash chains
454      TOO_FAR  : constant:= 4096;  --  Matches of length 3 are discarded if their distance exceeds TOO_FAR
455      --
456      subtype ulg is Unsigned_M32;
457      subtype unsigned is Unsigned_M16;
458      subtype ush is Unsigned_M16;
459      --  subtype long is Integer_M32;
460      --  subtype int is Integer;
461      subtype Pos is Unsigned_M32;  --  must be at least 32 bits
462      --  subtype IPos is unsigned;
463      --  A Pos is an index in the character window. IPos is used only for parameter passing.
464      window: array(0 .. 2 * WSIZE - 1) of Byte;
465      --  Sliding window. Input bytes are read into the second half of the window,
466      --  and move to the first half later to keep a dictionary of at least WSIZE
467      --  bytes. With this organization, matches are limited to a distance of
468      --  WSIZE-MAX_MATCH bytes, but this ensures that IO is always
469      --  performed with a length multiple of the block size.
470      prev: array(0..unsigned(WSIZE - 1)) of Pos;
471      --  Link to older string with same hash index.
472      --  This link is maintained only for the last 32K strings.
473      --  An index in this array is thus a window index modulo 32K.
474      head: array(0..unsigned(HASH_SIZE - 1)) of Pos;
475      --  Heads of the hash chains or NIL.
476      window_size: ulg;
477      --  window size, 2*WSIZE except for MMAP or BIG_MEM, where it is the
478      --  input file length plus MIN_LOOKAHEAD.
479      sliding: Boolean;  --  Set to False when the input file is already in memory  [was: int]
480      ins_h: unsigned;   --  hash index of string to be inserted
481      MIN_MATCH: constant Integer_M32:= Integer_M32(Threshold) + 1;  --  Deflate: 3
482      MAX_MATCH: constant Integer_M32:= Integer_M32(Look_Ahead);     --  Deflate: 258
483      --  Minimum amount of lookahead, except at the end of the input file.
484      MIN_LOOKAHEAD: constant Integer_M32:= MAX_MATCH + MIN_MATCH + 1;  --  Deflate: 262
485      --  This LZ77 compression doesn't use the full possible distance range: 32507..32768 unused!
486      MAX_DIST : constant Integer_M32:= WSIZE - MIN_LOOKAHEAD;  --  Deflate: 32506
487      H_SHIFT: constant Integer:= Integer((HASH_BITS + MIN_MATCH - 1) / MIN_MATCH);
488      --  Number of bits by which ins_h and del_h must be shifted at each
489      --  input step. It must be such that after MIN_MATCH steps, the oldest
490      --  byte no longer takes part in the hash key, that is:
491      --  H_SHIFT * MIN_MATCH >= HASH_BITS
492      prev_length: Natural_M32; --  [was: unsigned]
493      --  Length of the best match at previous step. Matches not greater than this
494      --  are discarded. This is used in the lazy match evaluation.
495      strstart   : Natural_M32;   --  start of string to insert [was: unsigned]
496      match_start: Natural_M32;   --  start of matching string [was: unsigned]
497      eofile     : Boolean;       --  flag set at end of input file [was: int]
498      lookahead  : Natural_M32;   --  number of valid bytes ahead in window  [was: unsigned]
499      max_chain_length : unsigned;
500      --  To speed up deflation, hash chains are never searched beyond this length.
501      --  A higher limit improves compression ratio but degrades the speed.
502      max_lazy_match: Natural_M32;  --  [was: unsigned]
503      --  Attempt to find a better match only when the current match is strictly
504      --  smaller than this value. This mechanism is used only for compression
505      --  levels >= 4.
506      good_match: Natural_M32;  --  [was: unsigned]
507      --  Use a faster search when the previous match is longer than this
508      nice_match: Integer_M32;  --  Stop searching when current match exceeds this
509      --  Values for max_lazy_match, good_match, nice_match and max_chain_length,
510      --  depending on the desired pack level (0..9). The values given below have
511      --  been tuned to exclude worst case performance for pathological files.
512      --  Better values may be found for specific files.
513      type config is record
514        good_length  : Natural_M32;  --  reduce lazy search above this match length [was: ush]
515        max_lazy     : Natural_M32;  --  do not perform lazy search above this match length
516        nice_length  : Integer_M32;  --  quit search above this match length
517        max_chain    : ush;
518      end record;
519
520      configuration_table: constant array(0..10) of config:= (
521      --  good lazy nice chain
522          (0,    0,  0,    0),    --  0: store only
523          (4,    4,  8,    4),    --  1: maximum speed, no lazy matches
524          (4,    5, 16,    8),
525          (4,    6, 32,   32),
526          (4,    4, 16,   16),    --  4: lazy matches
527          (8,   16, 32,   32),
528          (8,   16, 128, 128),
529          (8,   32, 128, 256),
530          (32, 128, 258, 1024),
531          (32, 258, 258, 4096),   --  9: maximum compression
532          (34, 258, 258, 4096));  --  "secret" variant of level 9
533
534      --  Update a hash value with the given input byte
535      --  IN  assertion: all calls to to UPDATE_HASH are made with consecutive
536      --     input characters, so that a running hash key can be computed from the
537      --     previous key instead of complete recalculation each time.
538
539      procedure UPDATE_HASH(h: in out unsigned; c: Byte) is
540      pragma Inline(UPDATE_HASH);
541      begin
542        h := (unsigned(Shift_Left(Unsigned_32(h), H_SHIFT)) xor unsigned(c)) and HASH_MASK;
543      end UPDATE_HASH;
544
545      --  Insert string starting at s in the dictionary and set match_head to the previous head
546      --  of the hash chain (the most recent string with same hash key). Return
547      --  the previous length of the hash chain.
548      --  IN  assertion: all calls to to INSERT_STRING are made with consecutive
549      --     input characters and the first MIN_MATCH bytes of s are valid
550      --     (except for the last MIN_MATCH-1 bytes of the input file).
551
552      procedure INSERT_STRING(s: Integer_M32; match_head: out Natural_M32) is
553      pragma Inline(INSERT_STRING);
554      begin
555        UPDATE_HASH(ins_h, window(s + MIN_MATCH - 1));
556        match_head := Natural_M32(head(ins_h));
557        prev(unsigned(s) and WMASK):= Pos(match_head);
558        head(ins_h) := Pos(s);
559      end INSERT_STRING;
560
561      procedure Read_buf(from: Integer_M32; amount: unsigned; actual: out Integer_M32) is
562        need: unsigned:= amount;
563      begin
564        --  put_line("Read buffer: from:" & from'img & ";  amount:" & amount'img);
565        actual:= 0;
566        while need > 0 and then More_bytes loop
567          window(from + actual):= Read_byte;
568          actual:= actual + 1;
569          need:= need - 1;
570        end loop;
571        --  put_line("Read buffer: actual:" & actual'img);
572      end Read_buf;
573
574      --  Fill the window when the lookahead becomes insufficient.
575      --  Updates strstart and lookahead, and sets eofile if end of input file.
576      --
577      --  IN assertion: lookahead < MIN_LOOKAHEAD && strstart + lookahead > 0
578      --  OUT assertions: strstart <= window_size-MIN_LOOKAHEAD
579      --     At least one byte has been read, or eofile is set; file reads are
580      --     performed for at least two bytes (required for the translate_eol option).
581
582      procedure Fill_window is
583        more: unsigned;
584        m: Pos;
585        n: Natural_M32;
586      begin
587        loop
588          more:= unsigned(window_size - ulg(lookahead) - ulg(strstart));
589          if False then  --  C: "if (more == (unsigned)EOF) {" ?... GdM: seems a 16-bit code for EOF
590            --  Very unlikely, but possible on 16 bit machine if strstart == 0
591            --  and lookahead == 1 (input done one byte at time)
592            more:= more - 1;
593          elsif strstart >= WSIZE + MAX_DIST and then sliding then
594            --  By the IN assertion, the window is not empty so we can't confuse
595            --  more == 0 with more == 64K on a 16 bit machine.
596            window(0 .. WSIZE - 1):= window(WSIZE .. 2 * WSIZE - 1);
597            --  GdM: in rare cases (e.g. level 9 on test file "enwik8"), match_start happens
598            --  to be < WSIZE. We do as in the original 16-bit C code: mod 2**16, such that the
599            --  index is the window's range.
600            --  This assumes WSIZE = 2**15, which is checked at startup of LZ77_using_IZ.
601            --  Very likely, match_start is garbage anyway - see http://sf.net/p/infozip/bugs/49/
602            match_start := Natural_M32( Unsigned_16(match_start) - Unsigned_16(WSIZE mod (2**16)) );
603            strstart    := strstart - WSIZE; -- we now have strstart >= MAX_DIST:
604            for nn in 0 .. unsigned'(HASH_SIZE - 1) loop
605              m := head(nn);
606              if m >= Pos(WSIZE) then
607                head(nn) := m - Pos(WSIZE);
608              else
609                head(nn) := NIL;
610              end if;
611            end loop;
612            --
613            for nn in 0 .. unsigned(WSIZE - 1) loop
614              m := prev(nn);
615              if m >= Pos(WSIZE) then
616                prev(nn) := m - Pos(WSIZE);
617              else
618                prev(nn) := NIL;
619              end if;
620              --  If n is not on any hash chain, prev[n] is garbage but its value will never be used.
621            end loop;
622            more:= more + unsigned(WSIZE);
623          end if;
624          exit when eofile;
625          --  If there was no sliding:
626          --     strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
627          --     more == window_size - lookahead - strstart
628          --  => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
629          --  => more >= window_size - 2*WSIZE + 2
630          --  In the MMAP or BIG_MEM case (not yet supported in gzip),
631          --    window_size == input_size + MIN_LOOKAHEAD  &&
632          --    strstart + lookahead <= input_size => more >= MIN_LOOKAHEAD.
633          --  Otherwise, window_size == 2*WSIZE so more >= 2.
634          --  If there was sliding, more >= WSIZE. So in all cases, more >= 2.
635          --
636          --  Assert(more >= 2, "more < 2");
637          --
638          Read_buf(strstart + lookahead, more, n);
639          if n = 0 then
640            eofile := True;
641          else
642            lookahead := lookahead + n;
643          end if;
644          exit when lookahead >= MIN_LOOKAHEAD or eofile;
645        end loop;
646        --  put_line("Fill done - eofile = " & eofile'img);
647      end Fill_window;
648
649      --  Initialize the "longest match" routines for a new file
650      --
651      --  IN assertion: window_size is > 0 if the input file is already read or
652      --     mapped in the window array, 0 otherwise. In the first case,
653      --     window_size is sufficient to contain the whole input file plus
654      --     MIN_LOOKAHEAD bytes (to avoid referencing memory beyond the end
655      --     of window when looking for matches towards the end).
656
657      procedure LM_Init (pack_level: Natural) is
658      begin
659        --  Do not slide the window if the whole input is already in memory (window_size > 0)
660        sliding := False;
661        if window_size = 0 then
662          sliding := True;
663          window_size := 2 * ulg(WSIZE);
664        end if;
665        --  Initialize the hash table.
666        --  prev will be initialized on the fly.
667        head:= (others => NIL);
668        --  Set the default configuration parameters:
669        max_lazy_match   := configuration_table(pack_level).max_lazy;
670        good_match       := configuration_table(pack_level).good_length;
671        nice_match       := configuration_table(pack_level).nice_length;
672        max_chain_length := configuration_table(pack_level).max_chain;
673        --  Info-Zip comment: ??? reduce max_chain_length for binary files
674        strstart := 0;
675        Read_buf(0, unsigned(WSIZE), lookahead);
676        if lookahead = 0 then
677          eofile := True;
678          return;
679        end if;
680        eofile := False;
681        --  Make sure that we always have enough lookahead. This is important
682        --  if input comes from a device such as a tty.
683        if lookahead < MIN_LOOKAHEAD then
684          Fill_window;
685        end if;
686        ins_h := 0;
687        for j in 0 .. Natural_M32(MIN_MATCH)-2 loop
688          UPDATE_HASH(ins_h, window(j));
689        end loop;
690        --  If lookahead < MIN_MATCH, ins_h is garbage, but this is
691        --  not important since only literal bytes will be emitted.
692      end LM_Init;
693
694      --  Set match_start to the longest match starting at the given string and
695      --  return its length. Matches shorter or equal to prev_length are discarded,
696      --  in which case the result is equal to prev_length and match_start is
697      --  garbage.
698      --  IN assertions: current_match is the head of the hash chain for the current
699      --    string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
700
701      procedure Longest_Match(current_match: in out Integer_M32; longest: out Integer_M32) is
702        chain_length : unsigned := max_chain_length;  --  max hash chain length
703        scan         : Integer_M32 := strstart;       --  current string
704        match        : Integer_M32;                   --  matched string
705        len          : Integer_M32;                   --  length of current match
706        best_len     : Integer_M32 := prev_length;    --  best match length so far
707        limit        : Natural_M32;  --  [was: IPos]
708        strend       : constant Integer_M32:= strstart + MAX_MATCH;
709        scan_end     : Integer_M32:= scan + best_len;
710      begin
711        --  Stop when current_match becomes <= limit. To simplify the code,
712        --  we prevent matches with the string of window index 0.
713        if strstart > MAX_DIST then
714          limit:= strstart - MAX_DIST;
715        else
716          limit:= NIL;
717        end if;
718        --  Do not waste too much time if we already have a good match:
719        if prev_length >= good_match then
720          chain_length := chain_length / 4;
721        end if;
722        --  Assert(strstart <= window_size-MIN_LOOKAHEAD, "insufficient lookahead");
723        loop
724          --  Assert(current_match < strstart, "no future");
725          match := current_match;
726          --  Skip to next match if the match length cannot increase
727          --  or if the match length is less than 2:
728          --
729          --  NB: this is the Not-UNALIGNED_OK variant in the C code.
730          --      Translation of the UNALIGNED_OK variant is left as an exercise ;-).
731          --      (!! worth a try: GNAT optimizes window(match..match+1[3]) to 16[32] bit)
732          --
733          if window(match + best_len)     /= window(scan_end) or else
734             window(match + best_len - 1) /= window(scan_end - 1) or else
735             window(match)                /= window(scan) or else
736             window(match + 1)            /= window(scan + 1)
737          then
738            match:= match + 1;  --  C: continue
739          else
740            --  The check at best_len - 1 can be removed because it will be made
741            --  again later. (This heuristic is not always a win.)
742            --
743            --  It is not necessary to compare window(scan + 2) and window(match + 2) since they
744            --  are always equal when the other bytes match, given that
745            --  the hash keys are equal and that HASH_BITS >= 8.
746            scan:= scan + 2;
747            match:= match + 2;
748            --  C: The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.
749            --     It is easy to get rid of this optimization if necessary.
750            --  Ada: see the "else" part below.
751            if MAX_MATCH = 258 then
752              --  We check for insufficient lookahead only every 8th comparison;
753              --  the 256th check will be made at strstart + 258.
754              loop
755                scan:= scan + 1;
756                match:= match + 1;
757                exit when window(scan) /= window(match);
758                scan:= scan + 1;
759                match:= match + 1;
760                exit when window(scan) /= window(match);
761                scan:= scan + 1;
762                match:= match + 1;
763                exit when window(scan) /= window(match);
764                scan:= scan + 1;
765                match:= match + 1;
766                exit when window(scan) /= window(match);
767                scan:= scan + 1;
768                match:= match + 1;
769                exit when window(scan) /= window(match);
770                scan:= scan + 1;
771                match:= match + 1;
772                exit when window(scan) /= window(match);
773                scan:= scan + 1;
774                match:= match + 1;
775                exit when window(scan) /= window(match);
776                scan:= scan + 1;
777                match:= match + 1;
778                exit when window(scan) /= window(match) or else scan >= strend;
779              end loop;
780            else
781              --  We check for insufficient lookahead after every comparison.
782              loop
783                scan:= scan + 1;
784                match:= match + 1;
785                exit when window(scan) /= window(match) or else scan >= strend;
786              end loop;
787            end if;
788            --  Assert(scan <= window+(unsigned)(window_size-1), "wild scan");
789            len := MAX_MATCH - (strend - scan);
790            scan := strend - MAX_MATCH;
791            if len > best_len then
792              match_start := current_match;
793              best_len := len;
794              exit when len >= nice_match;
795              scan_end  := scan + best_len;
796            end if;
797          end if;
798          current_match := Integer_M32(prev(unsigned(current_match) and WMASK));
799          exit when current_match <= limit;
800          chain_length:= chain_length - 1;
801          exit when chain_length = 0;
802        end loop;
803        longest:= best_len;
804      end Longest_Match;
805
806      procedure LZ77_part_of_IZ_Deflate is
807        hash_head : Natural_M32:= NIL;              --  head of hash chain
808        prev_match: Natural_M32;                    --  previous match  [was: IPos]
809        match_available: Boolean:= False;           --  set if previous match exists
810        match_length: Natural_M32:= MIN_MATCH - 1;  --  length of best match
811        max_insert: Natural_M32;
812      begin
813        match_start:= 0;  --  NB: no initialization in deflate.c
814        --  NB: level <= 3 would call deflate_fast;
815        --
816        --  Process the input block.
817        while lookahead /= 0 loop
818          --  Insert the string window(strstart .. strstart + 2) in the
819          --  dictionary, and set hash_head to the head of the hash chain:
820          if lookahead >= MIN_MATCH then
821            INSERT_STRING(strstart, hash_head);
822          end if;
823          --  Find the longest match, discarding those <= prev_length.
824          prev_length  := match_length;
825          prev_match   := match_start;
826          match_length := MIN_MATCH - 1;
827          if hash_head /= NIL and then
828             prev_length < max_lazy_match and then
829             strstart - hash_head <= MAX_DIST
830          then
831            --  To simplify the code, we prevent matches with the string
832            --  of window index 0 (in particular we have to avoid a match
833            --  of the string with itself at the start of the input file).
834            --
835            --  Do not look for matches beyond the end of the input.
836            --  This is necessary to make deflate deterministic.
837            if nice_match > lookahead then
838              nice_match := lookahead;
839            end if;
840            Longest_Match(hash_head, match_length);
841            --  Longest_Match sets match_start
842            if match_length > lookahead then
843              match_length := lookahead;
844            end if;
845            --  Ignore a length 3 match if it is too distant:
846            if match_length = MIN_MATCH and then strstart - match_start > TOO_FAR then
847              --  If prev_match is also MIN_MATCH, match_start is garbage
848              --  but we will ignore the current match anyway.
849              match_length := MIN_MATCH - 1;
850            end if;
851          end if;
852          --  If there was a match at the previous step and the current
853          --  match is not better, output the previous match:
854          if prev_length >= MIN_MATCH and then match_length <= prev_length then
855            max_insert:= strstart + lookahead - MIN_MATCH;
856            --  C: in DEBUG mode: check_match(strstart-1, prev_match, prev_length);
857            --
858            ------------------------------------
859            --  Output a Distance-Length code --
860            ------------------------------------
861            Write_DL_code(Positive(strstart - 1 - prev_match), Positive(prev_length));
862            --  Insert in hash table all strings up to the end of the match.
863            --  strstart-1 and strstart are already inserted.
864            lookahead := lookahead - (prev_length-1);
865            prev_length := prev_length - 2;
866            loop
867              strstart:= strstart + 1;
868              if strstart <= max_insert then
869                INSERT_STRING(strstart, hash_head);
870                --  strstart never exceeds WSIZE - MAX_MATCH, so there
871                --  are always MIN_MATCH bytes ahead.
872              end if;
873              prev_length:= prev_length - 1;
874              exit when prev_length = 0;
875            end loop;
876            strstart:= strstart + 1;
877            match_available := False;
878            match_length := MIN_MATCH - 1;
879          elsif match_available then
880            --  If there was no match at the previous position, output a
881            --  single literal. If there was a match but the current match
882            --  is longer, truncate the previous match to a single literal.
883            --
884            ------------------------
885            --  Output a literal  --
886            ------------------------
887            Write_literal(window(strstart-1));
888            strstart:= strstart + 1;
889            lookahead := lookahead - 1;
890          else
891            --  There is no previous match to compare with, wait for the next step to decide.
892            match_available := True;
893            strstart:= strstart + 1;
894            lookahead := lookahead - 1;
895          end if;
896          --  Assert(strstart <= isize && lookahead <= isize, "a bit too far");
897          --
898          --  Make sure that we always have enough lookahead, except
899          --  at the end of the input file. We need MAX_MATCH bytes
900          --  for the next match, plus MIN_MATCH bytes to insert the
901          --  string following the next match.
902          if lookahead < MIN_LOOKAHEAD then
903            Fill_window;
904          end if;
905        end loop;
906        -----------------------------------
907        --  Output last literal, if any  --
908        -----------------------------------
909        if match_available then
910          Write_literal(window(strstart-1));
911        end if;
912      end LZ77_part_of_IZ_Deflate;
913
914      Code_too_clever: exception;
915    begin
916      if Look_Ahead /= 258 or String_buffer_size /= 2 ** 15 or Threshold /= 2 then
917        raise Code_too_clever;  --  was optimized for these parameters
918      end if;
919      window_size:= 0;
920      LM_Init(level);
921      LZ77_part_of_IZ_Deflate;
922    end LZ77_using_IZ;
923
924    ---------------------------------------------------------------------
925    --  BT4  -  Binary tree of match positions selected with           --
926    --          the leading 2 to 4 bytes of each possible match.       --
927    ---------------------------------------------------------------------
928
929    --  Based on BT4.java by Lasse Collin, itself based on LzFind.c by Igor Pavlov.
930
931    procedure LZ77_using_BT4 is
932      MATCH_LEN_MIN: constant Integer:= Threshold + 1;
933      --
934      readPos     : Integer := -1;
935      cur_literal : Byte;
936      readLimit   : Integer := -1;
937      finishing   : constant Boolean := False;
938      writePos    : Integer :=  0;
939      pendingSize : Integer :=  0;
940      --
941      OPTS : constant := 4096;
942      EXTRA_SIZE_BEFORE : constant :=  OPTS;
943      EXTRA_SIZE_AFTER  : constant :=  OPTS;
944
945      keepSizeBefore : constant Integer:= EXTRA_SIZE_BEFORE + String_buffer_size;
946      keepSizeAfter  : constant Integer:= EXTRA_SIZE_AFTER  + Look_Ahead;
947      reserveSize : constant Integer:=
948        Integer'Min(
949          String_buffer_size / 2 +
950          256 * (2 ** 10),  --  256 KB
951          512 * (2 ** 20)   --  512 MB
952        );
953      getBufSize: constant Integer:= keepSizeBefore + keepSizeAfter + reserveSize;
954
955      type Int_array is array(Natural range <>) of Integer;
956      type p_Int_array is access Int_array;
957      procedure Dispose is new Ada.Unchecked_Deallocation(Int_array, p_Int_array);
958
959      procedure Normalize(positions: in out Int_array; normalizationOffset: Integer) is
960      begin
961        for i in 0 .. positions'Length - 1 loop
962          if positions(i) <= normalizationOffset then
963            positions(i) := 0;
964          else
965            positions(i) := positions(i) - normalizationOffset;
966          end if;
967        end loop;
968      end Normalize;
969
970      function getAvail return Integer is
971      pragma Inline(getAvail);
972      begin
973         --  !! - 1 added for getting readPos in buf'Range upon: cur_literal:= buf(readPos);
974        return writePos - readPos - 1;
975      end getAvail;
976
977      function movePos(requiredForFlushing, requiredForFinishing: Integer) return Integer is
978        avail: Integer;
979      begin
980        --  assert requiredForFlushing >= requiredForFinishing;
981        readPos := readPos + 1;
982        avail   := getAvail;
983        if avail < requiredForFlushing then
984          if avail < requiredForFinishing or else not finishing
985          then
986            pendingSize:= pendingSize + 1;
987            --  GdM: this causes cyclicPos and lzpos not being in sync with readPos,
988            --       the pendingSize value is there for catching up.
989            avail := 0;
990          end if;
991        end if;
992        return avail;
993      end movePos;
994
995      function getHash4Size return Integer is
996        h : Unsigned_32:= Unsigned_32(String_buffer_size - 1);
997      begin
998        h:= h or Shift_Right(h, 1);
999        h:= h or Shift_Right(h, 2);
1000        h:= h or Shift_Right(h, 4);
1001        h:= h or Shift_Right(h, 8);
1002        h:= Shift_Right(h, 1);
1003        h:= h or 16#FFFF#;  --  LzFind.c: "don't change it! It's required for Deflate"
1004        if h > 2 ** 24 then
1005          h:= Shift_Right(h, 1);
1006        end if;
1007        return Integer(h + 1);
1008      end getHash4Size;
1009
1010      type Byte_array is array(Natural range <>) of Byte;
1011      type p_Byte_array is access Byte_array;
1012      procedure Dispose is new Ada.Unchecked_Deallocation(Byte_array, p_Byte_array);
1013
1014      package Hash234 is
1015        HASH_2_SIZE : constant := 2 ** 10;
1016        HASH_2_MASK : constant := HASH_2_SIZE - 1;
1017        HASH_3_SIZE : constant := 2 ** 16;
1018        HASH_3_MASK : constant := HASH_3_SIZE - 1;
1019        hash_4_size : constant Integer:= getHash4Size;
1020        hash_4_mask : constant Unsigned_32:= Unsigned_32(hash_4_size) - 1;
1021        --
1022        hash2Table: Int_array(0..HASH_2_SIZE-1) := (others => 0);  --  !! initialization added
1023        hash3Table: Int_array(0..HASH_3_SIZE-1) := (others => 0);  --  !! initialization added
1024        hash4Table: p_Int_array;
1025        --
1026        hash2Value, hash3Value, hash4Value: Unsigned_32:= 0;
1027        --
1028        procedure calcHashes(buf: Byte_array; off: Integer);
1029        procedure updateTables(pos: Integer);
1030        procedure Normalize(normalizeOffset: Integer);
1031      end Hash234;
1032
1033      package body Hash234 is
1034
1035        crcTable: array(Byte) of Unsigned_32;
1036        CRC32_POLY: constant:= 16#EDB8_8320#;
1037
1038        procedure calcHashes(buf: Byte_array; off: Integer) is
1039          temp: Unsigned_32 := crcTable(buf(off)) xor Unsigned_32(buf(off + 1));
1040        begin
1041          hash2Value := temp and HASH_2_MASK;
1042          temp:= temp xor Shift_Left(Unsigned_32(buf(off + 2)), 8);
1043          hash3Value := temp and HASH_3_MASK;
1044          temp:= temp xor Shift_Left(crcTable(buf(off + 3)), 5);
1045          hash4Value := temp and hash_4_mask;
1046        end calcHashes;
1047
1048        procedure updateTables(pos: Integer) is
1049        begin
1050          hash2Table(Integer(hash2Value)) := pos;
1051          hash3Table(Integer(hash3Value)) := pos;
1052          hash4Table(Integer(hash4Value)) := pos;
1053        end updateTables;
1054
1055        procedure Normalize(normalizeOffset: Integer) is
1056        begin
1057          Normalize(hash2Table, normalizeOffset);
1058          Normalize(hash3Table, normalizeOffset);
1059          Normalize(hash4Table.all, normalizeOffset);
1060        end Normalize;
1061
1062        r: Unsigned_32;
1063      begin
1064        --  NB: heap allocation used only for convenience because of
1065        --      small default stack sizes on some compilers.
1066        hash4Table:= new Int_array(0..hash_4_size-1);
1067        hash4Table.all:= (others => 0);  --  !! initialization added
1068        for i in Byte loop
1069          r:= Unsigned_32(i);
1070          for j in 0 .. 7 loop
1071            if (r and 1) = 0 then
1072              r:= Shift_Right(r, 1);
1073            else
1074              r:= Shift_Right(r, 1) xor CRC32_POLY;
1075            end if;
1076          end loop;
1077          crcTable(i) := r;
1078        end loop;
1079      end Hash234;
1080
1081      niceLen: constant Integer:= Integer'Min(162, Look_Ahead);  --  const. was 64
1082      depthLimit: constant:= 48;  --  Alternatively: 16 + niceLen / 2
1083
1084      --  !! nicer: unconstr. array of (dist, len) pairs, 1-based array.
1085
1086      type Any_Matches_type(countMax: Integer) is record
1087        count: Integer:= 0;
1088        len  : Int_array(0 .. countMax);
1089        dist : Int_array(0 .. countMax);
1090      end record;
1091
1092      --  Subtracting 1 because the shortest match that this match
1093      --  finder can find is 2 bytes, so there's no need to reserve
1094      --  space for one-byte matches.
1095      subtype Matches_type is Any_Matches_type(niceLen - 1);
1096
1097      cyclicSize : constant Integer := String_buffer_size;  --  Had: + 1;
1098      cyclicPos  : Integer := -1;
1099      lzPos      : Integer := cyclicSize;
1100
1101      max_dist: constant Integer:= cyclicSize;
1102
1103      package BT4_Algo is
1104        procedure skip(len: Natural);
1105        pragma Inline(skip);
1106        function getMatches return Matches_type;
1107      end BT4_Algo;
1108
1109      buf : p_Byte_array;
1110      tree : p_Int_array;
1111
1112      package body BT4_Algo is
1113
1114        function movePos return Integer is
1115          avail : constant Integer:= movePos(requiredForFlushing => niceLen, requiredForFinishing => 4);
1116          normalizationOffset: Integer;
1117        begin
1118          --  Put_Line("BT4_movePos");
1119          if avail /= 0 then
1120            lzPos:= lzPos + 1;
1121            if lzPos = Integer'Last then
1122              normalizationOffset := Integer'Last - cyclicSize;
1123              Hash234.Normalize(normalizationOffset);
1124              Normalize(tree.all, normalizationOffset);
1125              lzPos:= lzPos - normalizationOffset;
1126            end if;
1127            cyclicPos:= cyclicPos + 1;
1128            if cyclicPos = cyclicSize then
1129              --  Put_Line("cyclicPos zeroed");
1130              cyclicPos := 0;
1131            end if;
1132          end if;
1133          return avail;
1134        end movePos;
1135
1136        Null_position: constant:= -1;  --  LzFind.c: kEmptyHashValue, 0
1137
1138        procedure skip_update_tree(niceLenLimit: Integer; currentMatch: in out Integer) is
1139          delta0, depth, ptr0, ptr1, pair, len, len0, len1: Integer;
1140        begin
1141          --  Put("BT4.skip_update_tree... ");
1142          depth := depthLimit;
1143          ptr0  := cyclicPos * 2 + 1;
1144          ptr1  := cyclicPos * 2;
1145          len0  := 0;
1146          len1  := 0;
1147          loop
1148            delta0 := lzPos - currentMatch;
1149            if depth = 0 or else delta0 >= max_dist then
1150              tree(ptr0) := Null_position;
1151              tree(ptr1) := Null_position;
1152              return;
1153            end if;
1154            depth:= depth - 1;
1155            if cyclicPos - delta0 < 0 then
1156              pair:= cyclicSize;
1157            else
1158              pair:= 0;
1159            end if;
1160            pair := (cyclicPos - delta0 + pair) * 2;
1161            len  := Integer'Min(len0, len1);
1162            --  Match ?
1163            if buf(readPos + len - delta0) = buf(readPos + len) then
1164              --  No need to look for longer matches than niceLenLimit
1165              --  because we only are updating the tree, not returning
1166              --  matches found to the caller.
1167              loop
1168                len:= len + 1;
1169                if len = niceLenLimit then
1170                  tree(ptr1) := tree(pair);
1171                  tree(ptr0) := tree(pair + 1);
1172                  return;
1173                end if;
1174                exit when buf(readPos + len - delta0) /= buf(readPos + len);
1175              end loop;
1176            end if;
1177            --  Bytes are no more matching. The past value is either smaller...
1178            if buf(readPos + len - delta0) < buf(readPos + len) then
1179              tree(ptr1) := currentMatch;
1180              ptr1 := pair + 1;
1181              currentMatch := tree(ptr1);
1182              len1 := len;
1183            else  --  ... or larger
1184              tree(ptr0) := currentMatch;
1185              ptr0 := pair;
1186              currentMatch := tree(ptr0);
1187              len0 := len;
1188            end if;
1189          end loop;
1190        end skip_update_tree;
1191
1192        procedure skip(len: Natural) is
1193          --
1194          procedure Skip_one is
1195          pragma Inline(Skip_one);
1196            niceLenLimit, avail, currentMatch: Integer;
1197          begin
1198            niceLenLimit := niceLen;
1199            avail := movePos;
1200            if avail < niceLenLimit then
1201              if avail = 0 then
1202                return;
1203              end if;
1204              niceLenLimit := avail;
1205            end if;
1206            Hash234.calcHashes(buf.all, readPos);
1207            currentMatch := Hash234.hash4Table (Integer(Hash234.hash4Value));
1208            Hash234.updateTables(lzPos);
1209            skip_update_tree(niceLenLimit, currentMatch);
1210          end Skip_one;
1211          --
1212        begin
1213          for count in reverse 1 .. len loop
1214            Skip_one;
1215          end loop;
1216        end skip;
1217
1218        function getMatches return Matches_type is
1219          matches: Matches_type;
1220          matchLenLimit : Integer := Look_Ahead;
1221          niceLenLimit  : Integer := niceLen;
1222          avail: Integer;
1223          delta0, delta2, delta3, currentMatch,
1224          lenBest, depth, ptr0, ptr1, pair, len, len0, len1: Integer;
1225        begin
1226          --  Put("BT4.getMatches... ");
1227          matches.count:= 0;
1228          avail:= movePos;
1229          if avail < matchLenLimit then
1230            if avail = 0 then
1231              return matches;
1232            end if;
1233            matchLenLimit := avail;
1234            if niceLenLimit > avail then
1235              niceLenLimit := avail;
1236            end if;
1237          end if;
1238          --
1239          Hash234.calcHashes(buf.all, readPos);
1240          delta2 := lzPos - Hash234.hash2Table (Integer(Hash234.hash2Value));
1241          delta3 := lzPos - Hash234.hash3Table (Integer(Hash234.hash3Value));
1242          currentMatch :=   Hash234.hash4Table (Integer(Hash234.hash4Value));
1243          Hash234.updateTables(lzPos);
1244          --
1245          lenBest := 0;
1246          --  See if the hash from the first two bytes found a match.
1247          --  The hashing algorithm guarantees that if the first byte
1248          --  matches, also the second byte does, so there's no need to
1249          --  test the second byte.
1250          if delta2 < max_dist and then buf(readPos - delta2) = buf(readPos) then
1251            --  Match of length 2 found and checked.
1252            lenBest := 2;
1253            matches.len(0) := 2;
1254            matches.dist(0) := delta2 - 1;
1255            matches.count := 1;
1256          end if;
1257          --  See if the hash from the first three bytes found a match that
1258          --  is different from the match possibly found by the two-byte hash.
1259          --  Also here the hashing algorithm guarantees that if the first byte
1260          --  matches, also the next two bytes do.
1261          if delta2 /= delta3 and then delta3 < max_dist
1262                  and then buf(readPos - delta3) = buf(readPos)
1263          then
1264            --  Match of length 3 found and checked.
1265            lenBest := 3;
1266            matches.count := matches.count + 1;
1267            matches.dist(matches.count - 1) := delta3 - 1;
1268            delta2 := delta3;
1269          end if;
1270          --  If a match was found, see how long it is.
1271          if matches.count > 0 then
1272            while lenBest < matchLenLimit and then buf(readPos + lenBest - delta2)
1273                                                 = buf(readPos + lenBest)
1274            loop
1275              lenBest:= lenBest + 1;
1276            end loop;
1277            matches.len(matches.count - 1) := lenBest;
1278            --  Return if it is long enough (niceLen or reached the end of the dictionary).
1279            if lenBest >= niceLenLimit then
1280              skip_update_tree(niceLenLimit, currentMatch);
1281              return matches;
1282            end if;
1283          end if;
1284          --  Long enough match wasn't found so easily. Look for better matches from the binary tree.
1285          if lenBest < 3 then
1286            lenBest := 3;
1287          end if;
1288          depth := depthLimit;
1289          ptr0  := cyclicPos * 2 + 1;
1290          ptr1  := cyclicPos * 2;
1291          len0  := 0;
1292          len1  := 0;
1293          --
1294          loop
1295            delta0 := lzPos - currentMatch;
1296            --  Return if the search depth limit has been reached or
1297            --  if the distance of the potential match exceeds the
1298            --  dictionary size.
1299            if depth = 0 or else delta0 >= max_dist then
1300              tree(ptr0) := Null_position;
1301              tree(ptr1) := Null_position;
1302              return matches;
1303            end if;
1304            depth:= depth - 1;
1305            --
1306            if cyclicPos - delta0 < 0 then
1307              pair:= cyclicSize;
1308            else
1309              pair:= 0;
1310            end if;
1311            pair := (cyclicPos - delta0 + pair) * 2;
1312            len  := Integer'Min(len0, len1);
1313            --  Match ?
1314            if buf(readPos + len - delta0) = buf(readPos + len) then
1315              loop
1316                len:= len + 1;
1317                exit when len >= matchLenLimit
1318                  or else buf(readPos + len - delta0) /= buf(readPos + len);
1319              end loop;
1320              if len > lenBest then
1321                lenBest := len;
1322                matches.len(matches.count) := len;
1323                matches.dist(matches.count) := delta0 - 1;
1324                matches.count:= matches.count + 1;
1325                if len >= niceLenLimit then
1326                  tree(ptr1) := tree(pair);
1327                  tree(ptr0) := tree(pair + 1);
1328                  return matches;
1329                end if;
1330              end if;
1331            end if;
1332            --  Bytes are no more matching. The past value is either smaller...
1333            if buf(readPos + len - delta0) < buf(readPos + len) then
1334              tree(ptr1) := currentMatch;
1335              ptr1 := pair + 1;
1336              currentMatch := tree(ptr1);
1337              len1 := len;
1338            else  --  ... or larger
1339              tree(ptr0) := currentMatch;
1340              ptr0 := pair;
1341              currentMatch := tree(ptr0);
1342              len0 := len;
1343            end if;
1344          end loop;
1345        end getMatches;
1346
1347      begin
1348        --  NB: heap allocation used only for convenience because of
1349        --      small default stack sizes on some compilers.
1350        tree:= new Int_array(0 .. cyclicSize * 2 - 1);
1351        tree.all:= (others => Null_position);
1352      end BT4_Algo;
1353
1354      --  Moves data from the end of the buffer to the beginning, discarding
1355      --  old data and making space for new input.
1356
1357      procedure moveWindow is
1358        --  Align the move to a multiple of 16 bytes (LZMA-friendly, see pos_bits)
1359        moveOffset : constant Integer := ((readPos + 1 - keepSizeBefore) / 16) * 16;
1360        moveSize   : constant Integer := writePos - moveOffset;
1361      begin
1362        --  Put_Line("  Move window, size=" & moveSize'Img & " offset=" & moveOffset'Img);
1363        buf(0 .. moveSize - 1):= buf(moveOffset .. moveOffset + moveSize - 1);
1364        readPos   := readPos   - moveOffset;
1365        readLimit := readLimit - moveOffset;
1366        writePos  := writePos  - moveOffset;
1367      end moveWindow;
1368
1369     --  Copies new data into the buffer.
1370     function fillWindow(len_initial: Integer) return Integer is
1371
1372       --  Process pending data that hasn't been ran through the match finder yet.
1373       --  Run it through the match finder now if there is enough new data
1374       --  available (readPos < readLimit) that the encoder may encode at
1375       --  least one more input byte.
1376       --
1377       procedure processPendingBytes is
1378         oldPendingSize: Integer;
1379       begin
1380         if pendingSize > 0 and then readPos < readLimit then
1381           readPos := readPos - pendingSize;
1382           oldPendingSize := pendingSize;
1383           pendingSize := 0;
1384           BT4_Algo.skip(oldPendingSize);
1385         end if;
1386       end processPendingBytes;
1387       --
1388       len: Integer:= len_initial;
1389       actual_len: Integer:= 0;
1390     begin
1391        --  Put_Line("Fill window - start");
1392        --  Move the sliding window if needed.
1393        if readPos >= buf'Length - keepSizeAfter then
1394          moveWindow;
1395        end if;
1396
1397        --  Try to fill the dictionary buffer up to its boundary.
1398        if len > buf'Length - writePos then
1399          len := buf'Length - writePos;
1400        end if;
1401
1402        while len > 0 and then More_bytes loop
1403          buf(writePos):= Read_byte;
1404          writePos:= writePos + 1;
1405          len:= len - 1;
1406          actual_len:= actual_len + 1;
1407        end loop;
1408
1409        --  Set the new readLimit but only if there's enough data to allow
1410        --  encoding of at least one more byte.
1411        if writePos >= keepSizeAfter then
1412          readLimit := writePos - keepSizeAfter;
1413        end if;
1414
1415        processPendingBytes;
1416
1417        --  Put_Line("Fill window, requested=" & len_initial'Img & " actual=" & actual_len'Img);
1418        --  Tell the caller how much input we actually copied into the dictionary.
1419        return actual_len;
1420      end fillWindow;
1421
1422      matches : Matches_type;
1423      readAhead : Integer := -1;  -- LZMAEncoder.java
1424
1425      function getMatches return Matches_type is
1426      begin
1427        readAhead:= readAhead + 1;
1428        return BT4_Algo.getMatches;
1429      end getMatches;
1430
1431      procedure skip(len: Natural) is
1432      pragma Inline(skip);
1433      begin
1434        readAhead:= readAhead + len;
1435        BT4_Algo.skip(len);
1436      end skip;
1437
1438      --  Small stack of recent distances used for LZ.
1439      subtype Repeat_stack_range is Integer range 0..3;
1440      rep_dist: array(Repeat_stack_range) of Natural := (others => 0);
1441
1442      procedure getNextSymbol is
1443        avail, mainLen, mainDist, newLen, newDist, limit: Integer;
1444
1445        function changePair(smallDist, bigDist: Integer) return Boolean is
1446        pragma Inline(changePair);
1447        begin
1448          return smallDist < bigDist / 128;
1449        end changePair;
1450
1451        --  This function is for debugging. The matches stored in the 'tree' array
1452        --  may be wrong if the variables cyclicPos, lzPos and readPos are not in sync.
1453        --  The issue seems to have been solved now (rev. 489).
1454        function Is_match_correct(shift: Natural) return Boolean is
1455        pragma Inline(Is_match_correct);
1456          paranoid: constant Boolean:= True;
1457        begin
1458          if paranoid then
1459            for i in reverse -1 + shift .. mainLen - 2 + shift loop
1460              if buf(readPos - (mainDist+1) + i) /= buf(readPos + i) then
1461                return False;  --  Should not occur (check with code coverage)
1462              end if;
1463            end loop;
1464          end if;
1465          return True;
1466        end Is_match_correct;
1467
1468        function getMatchLen(dist, lenLimit: Integer) return Natural is
1469        pragma Inline(getMatchLen);
1470          backPos: constant Integer := readPos - dist - 1;
1471          len: Integer := 0;
1472        begin
1473          if dist < 1 then
1474            return 0;
1475          end if;
1476          --  @ if readPos+len not in buf.all'Range then
1477          --  @   Put("**** readpos " & buf'Last'Img & readPos'Img);
1478          --  @ end if;
1479          --  @ if backPos+len not in buf.all'Range then
1480          --  @   Put("**** backpos " & buf'Last'Img & backPos'Img);
1481          --  @ end if;
1482          while len < lenLimit and then buf(readPos + len) = buf(backPos + len) loop
1483            len:= len + 1;
1484          end loop;
1485          return len;
1486        end getMatchLen;
1487
1488        procedure Send_first_literal_of_match is
1489        begin
1490          Write_literal(cur_literal);
1491          readAhead := readAhead - 1;
1492        end Send_first_literal_of_match;
1493
1494        procedure Send_DL_code( distance, length: Integer ) is
1495          found_repeat: Integer:= rep_dist'First - 1;
1496          aux: Integer;
1497        begin
1498          Write_DL_code(distance + 1, length);
1499          readAhead := readAhead - length;
1500          if LZMA_friendly then
1501            --
1502            --  Manage the stack of recent distances in the same way the "MA" part of LZMA does.
1503            --
1504            for i in rep_dist'Range loop
1505              if distance = rep_dist(i) then
1506                found_repeat:= i;
1507                exit;
1508              end if;
1509            end loop;
1510            if found_repeat >= rep_dist'First then
1511              --  Roll the stack of recent distances up to the item with index found_repeat,
1512              --  which becomes first. If found_repeat = rep_dist'First, no actual change occurs.
1513              aux:= rep_dist(found_repeat);
1514              for i in reverse rep_dist'First + 1 .. found_repeat loop
1515                rep_dist(i) := rep_dist(i-1);
1516              end loop;
1517              rep_dist(rep_dist'First):= aux;
1518            else
1519              --  Shift the stack of recent distances; the new distance becomes the first item.
1520              for i in reverse rep_dist'First + 1 .. rep_dist'Last loop
1521                rep_dist(i) := rep_dist(i-1);
1522              end loop;
1523              rep_dist(0) := distance;
1524            end if;
1525          end if;
1526        end Send_DL_code;
1527
1528        bestRepLen, bestRepIndex, len: Integer;
1529
1530      begin
1531        --  Get the matches for the next byte unless readAhead indicates
1532        --  that we already got the new matches during the previous call
1533        --  to this procedure.
1534        if readAhead = -1 then
1535          matches := getMatches;
1536        end if;
1537        --  @ if readPos not in buf.all'Range then
1538        --  @   Put("**** " & buf'Last'Img & keepSizeAfter'Img & readPos'Img & writePos'Img);
1539        --  @ end if;
1540        cur_literal:= buf(readPos);
1541        --  Get the number of bytes available in the dictionary, but
1542        --  not more than the maximum match length. If there aren't
1543        --  enough bytes remaining to encode a match at all, return
1544        --  immediately to encode this byte as a literal.
1545        avail := Integer'Min(getAvail, Look_Ahead);
1546        if avail < MATCH_LEN_MIN then
1547          --  Put("[a]");
1548          Send_first_literal_of_match;
1549          return;
1550        end if;
1551
1552        if LZMA_friendly then
1553          --  Look for a match from the previous four different match distances.
1554          bestRepLen := 0;
1555          bestRepIndex := 0;
1556          for rep in Repeat_stack_range loop
1557            len := getMatchLen(rep_dist(rep), avail);
1558            if len >= MATCH_LEN_MIN then
1559              --  If it is long enough, return it.
1560              if len >= niceLen then
1561                skip(len - 1);
1562                --  Put_Line("[DL RA]");
1563                Send_DL_code(rep_dist(rep), len);
1564                return;
1565              end if;
1566              --  Remember the index and length of the best repeated match.
1567              if len > bestRepLen then
1568                bestRepIndex := rep;
1569                bestRepLen := len;
1570              end if;
1571            end if;
1572          end loop;
1573        end if;
1574
1575        mainLen := 0;
1576        mainDist := 0;
1577        if matches.count > 0 then
1578          mainLen  := matches.len(matches.count - 1);
1579          mainDist := matches.dist(matches.count - 1);
1580          if mainLen >= niceLen then
1581            if Is_match_correct(1) then
1582              skip(mainLen - 1);
1583              --  Put_Line("[DL A]" & mainDist'Img & mainLen'Img);
1584              Send_DL_code(mainDist, mainLen);
1585              return;
1586            else
1587              --  Put_Line("Wrong match [A]! pos=" & Integer'Image(lzPos - cyclicSize));
1588              Send_first_literal_of_match;
1589              return;
1590            end if;
1591          end if;
1592          while matches.count > 1 and then mainLen = matches.len(matches.count - 2) + 1 loop
1593            exit when not changePair(matches.dist(matches.count - 2), mainDist);
1594            matches.count:= matches.count - 1;
1595            mainLen := matches.len(matches.count - 1);
1596            mainDist := matches.dist(matches.count - 1);
1597          end loop;
1598          if mainLen = MATCH_LEN_MIN and then mainDist >= 128 then
1599            mainLen := 1;
1600          end if;
1601        end if;
1602
1603        if LZMA_friendly
1604             and then bestRepLen >= MATCH_LEN_MIN
1605             and then ( bestRepLen + 1 >= mainLen
1606                        or else (bestRepLen + 2 >= mainLen and then mainDist >= 2 ** 9)
1607                        or else (bestRepLen + 3 >= mainLen and then mainDist >= 2 ** 15) )
1608        then
1609          skip(bestRepLen - 1);
1610          --  Put_Line("[DL RB]");
1611          Send_DL_code(rep_dist(bestRepIndex), bestRepLen);
1612          return;
1613        end if;
1614
1615        if mainLen < MATCH_LEN_MIN or else avail <= MATCH_LEN_MIN then
1616          --Put("[b]");
1617          Send_first_literal_of_match;
1618          return;
1619        end if;
1620
1621        --  Get the next match. Test if it is better than the current match.
1622        --  If so, encode the current byte as a literal.
1623        matches := getMatches;
1624        --
1625        if matches.count > 0 then
1626          newLen  := matches.len(matches.count - 1);
1627          newDist := matches.dist(matches.count - 1);
1628          if (newLen >= mainLen and then newDist < mainDist)
1629                  or else (newLen = mainLen + 1
1630                      and then not changePair(mainDist, newDist))
1631                  or else newLen > mainLen + 1
1632                  or else (newLen + 1 >= mainLen
1633                      and then mainLen >= MATCH_LEN_MIN + 1
1634                      and then changePair(newDist, mainDist))
1635          then
1636            --Put("[c]");
1637            --Put(Character'Val(cur_literal));
1638            Send_first_literal_of_match;
1639            return;
1640          end if;
1641        end if;
1642
1643        limit := Integer'Max(mainLen - 1, MATCH_LEN_MIN);
1644        for rep in rep_dist'Range loop
1645          if getMatchLen(rep_dist(rep), limit) = limit then
1646            Send_first_literal_of_match;
1647            return;
1648          end if;
1649        end loop;
1650
1651        if Is_match_correct(0) then
1652          skip(mainLen - 2);
1653          --  Put_Line("[DL B]" & mainDist'Img & mainLen'Img);
1654          Send_DL_code(mainDist, mainLen);
1655        else
1656          --  Put_Line("Wrong match [B]!");
1657          Send_first_literal_of_match;
1658        end if;
1659      end getNextSymbol;
1660
1661      actual_written, avail: Integer;
1662    begin
1663      --  NB: heap allocation used only for convenience because of
1664      --      small default stack sizes on some compilers.
1665      buf:= new Byte_array(0 .. getBufSize);
1666      actual_written:= fillWindow(String_buffer_size);
1667      if actual_written > 0 then
1668        loop
1669          getNextSymbol;
1670          avail:= getAvail;
1671          if avail = 0 then
1672            actual_written:= fillWindow(String_buffer_size);
1673            exit when actual_written = 0;
1674          end if;
1675        end loop;
1676      end if;
1677      Dispose(buf);
1678      Dispose(tree);
1679      Dispose(Hash234.hash4Table);
1680    end LZ77_using_BT4;
1681
1682  begin
1683    case Method is
1684      when LZHuf =>
1685        LZ77_using_LZHuf;
1686      when IZ_4 .. IZ_10 =>
1687        LZ77_using_IZ( 4 + Method_Type'Pos(Method) -  Method_Type'Pos(IZ_4) );
1688      when BT4 =>
1689        LZ77_using_BT4;
1690    end case;
1691  end Encode;
1692
1693end LZ77;
1694