1--  LZMA library
2----------------
3--  Library for encoding and decoding data streams in the LZMA compression
4--  format invented by Igor Pavlov.
5--
6--  Pure Ada 95+ code, 100% portable: OS-, CPU- and compiler- independent.
7
8--  Legal licensing note:
9
10--  Copyright (c) 2016 .. 2018 Gautier de Montmollin
11--  SWITZERLAND
12
13--  Permission is hereby granted, free of charge, to any person obtaining a copy
14--  of this software and associated documentation files (the "Software"), to deal
15--  in the Software without restriction, including without limitation the rights
16--  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17--  copies of the Software, and to permit persons to whom the Software is
18--  furnished to do so, subject to the following conditions:
19
20--  The above copyright notice and this permission notice shall be included in
21--  all copies or substantial portions of the Software.
22
23--  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24--  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25--  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26--  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27--  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28--  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
29--  THE SOFTWARE.
30
31-- NB: this is the MIT License, as found 21-Aug-2016 on the site
32-- http://www.opensource.org/licenses/mit-license.php
33
34with Ada.Direct_IO;  --  Only used for the type Data_Bytes_Count below.
35with Interfaces;
36with System;
37
38package LZMA is
39
40  --  The compression and decompression procedures are located
41  --  in child packages LZMA.Encoding and LZMA.Decoding respectively.
42
43  --  Bits of last byte being used as context.
44  --    With the value 8, LZMA uses a complete Markov chain for predicting
45  --    a literal from the previous one, like PKZip's Reduce format.
46  subtype Literal_context_bits_range  is Integer range 0..8;
47
48  --  Position mod 2**bits is used, but for literal context only.
49  subtype Literal_position_bits_range is Integer range 0..4;
50
51  --  Position mod 2**bits is used in various places.
52  subtype Position_bits_range         is Integer range 0..4;
53
54  Default_dictionary_size : constant := 2 ** 15;  --  32 KB, like Deflate.
55
56  subtype Byte is Interfaces.Unsigned_8;
57
58  --  Ada.Direct_IO is only there for the Data_Bytes_Count type.
59  --  In case you want to avoid reference to Ada.Direct_IO,
60  --  you can customize the definition of Data_Bytes_Count, provided
61  --  it has enough capacity for counting bytes in the streams involved.
62  package BIO is new Ada.Direct_IO (Byte);
63  subtype Data_Bytes_Count is BIO.Count;
64
65private
66
67  use Interfaces;
68
69  --  These integer types are defined in the LZMA specification
70  --  (DRAFT version, 2015-06-14, by Igor Pavlov)
71
72  type Unsigned is mod 2 ** System.Word_Size;
73  subtype UInt64 is Unsigned_64;
74  subtype UInt32 is Unsigned_32;
75  subtype UInt16 is Unsigned_16;
76
77  ----------------------------
78  --  Finite state machine  --
79  ----------------------------
80
81  States_count : constant := 12;  --  LZMA specification name: "kNumStates"
82  subtype State_range is Unsigned range 0..States_count - 1;
83  type Transition is array(State_range) of State_range;
84
85  ------------------------------------ From ...  0  1  2  3  4  5  6   7   8   9  10  11
86  Update_State_Literal  : constant Transition:= (0, 0, 0, 0, 1, 2, 3,  4,  5,  6,  4,  5);
87  Update_State_Match    : constant Transition:= (7, 7, 7, 7, 7, 7, 7, 10, 10, 10, 10, 10);
88  Update_State_Rep      : constant Transition:= (8, 8, 8, 8, 8, 8, 8, 11, 11, 11, 11, 11);
89  Update_State_ShortRep : constant Transition:= (9, 9, 9, 9, 9, 9, 9, 11, 11, 11, 11, 11);
90
91  --  Context for improving compression of aligned data,
92  --  modulo 2**n = 2, 4, 8 or 16 (max) bytes, or disabled: n = 0.
93  Max_pos_bits : constant := 4;  --  LZMA specification name: "kNumPosBitsMax"
94  Max_pos_states_count : constant := 2**Max_pos_bits;
95  subtype Pos_state_range is Unsigned range 0 .. Max_pos_states_count-1;
96
97  ----------------------------------------
98  --  Probability model for bit coding  --
99  ----------------------------------------
100
101  Probability_model_bits  : constant:= 11;  --  LZMA specification name: "kNumBitModelTotalBits"
102  Probability_model_count : constant:= 2 ** Probability_model_bits;
103
104  Probability_change_bits : constant:= 5;   --  LZMA specification name: "kNumMoveBits"
105
106  --  All probabilities are initialized with p=0.5. LZMA specification name: "PROB_INIT_VAL"
107  Initial_probability : constant := Probability_model_count / 2;
108
109  --  Type for storing probabilities, must have at least Probability_model_bits bits.
110  --  LZMA specification recommends UInt16. LzmaEnc.c uses UInt16 or optionally UInt32.
111  type CProb is new UInt16;
112
113  --  Integer (signed) used as index because there is a -1 (unused) index in Pos_coder_range.
114  type CProb_array is array(Integer range <>) of CProb;
115
116  Align_bits       : constant := 4;  --  LZMA specification name: "kNumAlignBits"
117  Align_table_size : constant := 2 ** Align_bits;
118  Align_mask       : constant := Align_table_size - 1;
119
120  subtype Bits_3_range is Integer range 0 .. 2**3 - 1;
121  subtype Bits_6_range is Integer range 0 .. 2**6 - 1;
122  subtype Bits_8_range is Integer range 0 .. 2**8 - 1;
123  subtype Bits_NAB_range is Integer range 0 .. 2**Align_bits - 1;
124
125  subtype Probs_3_bits is CProb_array(Bits_3_range);
126  subtype Probs_6_bits is CProb_array(Bits_6_range);
127  subtype Probs_8_bits is CProb_array(Bits_8_range);
128  subtype Probs_NAB_bits is CProb_array(Bits_NAB_range);
129
130  --------------------------------------------------
131  --  Probabilities for the binary decision tree  --
132  --------------------------------------------------
133
134  type Probs_state is array(State_range) of CProb;
135  type Probs_state_and_pos_state is array(State_range, Pos_state_range) of CProb;
136
137  type Probs_for_switches is record
138    --  This is the context for the switch between a Literal and a LZ Distance-Length code
139    match     : Probs_state_and_pos_state:= (others => (others => Initial_probability));
140    --  These are contexts for various repetition modes
141    rep       : Probs_state:= (others => Initial_probability);
142    rep_g0    : Probs_state:= (others => Initial_probability);
143    rep_g1    : Probs_state:= (others => Initial_probability);
144    rep_g2    : Probs_state:= (others => Initial_probability);
145    rep0_long : Probs_state_and_pos_state:= (others => (others => Initial_probability));
146  end record;
147
148  ------------------------------------
149  --  Probabilities for LZ lengths  --
150  ------------------------------------
151
152  type Low_mid_coder_probs is array(Pos_state_range) of Probs_3_bits;
153
154  --  Probabilities used for encoding LZ lengths. LZMA specification name: "CLenDecoder"
155  type Probs_for_LZ_Lengths is record
156    choice_1   : CProb               := Initial_probability;  --  0: low coder; 1: mid or high
157    choice_2   : CProb               := Initial_probability;  --  0: mid; 1: high
158    low_coder  : Low_mid_coder_probs := (others => (others => Initial_probability));
159    mid_coder  : Low_mid_coder_probs := (others => (others => Initial_probability));
160    high_coder : Probs_8_bits        := (others => Initial_probability);
161  end record;
162
163  --------------------------------------
164  --  Probabilities for LZ distances  --
165  --------------------------------------
166
167  Len_to_pos_states  : constant := 4;
168  subtype Slot_coder_range is Unsigned range 0 .. Len_to_pos_states - 1;
169  type Slot_coder_probs is array(Slot_coder_range) of Probs_6_bits;
170  Dist_slot_bits: constant:= 6;  --  "kNumPosSlotBits"
171
172  Start_dist_model_index : constant :=  4;  --  "kStartPosModelIndex"
173  End_dist_model_index   : constant := 14;  --  LZMA specification name: "kEndPosModelIndex"
174  Num_full_distances  : constant := 2 ** (End_dist_model_index / 2);  --  "kNumFullDistances"
175
176  --  Pos_coder_range: index -1 is never used as such but appears
177  --  when calling Bit_Tree_Reverse_Encode (as in the original C version, RcTree_ReverseEncode).
178  subtype Pos_coder_range is Integer range -1 .. Num_full_distances - End_dist_model_index;
179  subtype Pos_coder_probs is CProb_array(Pos_coder_range);
180
181  type Probs_for_LZ_Distances is record
182    slot_coder  : Slot_coder_probs := (others => (others => Initial_probability));
183    align_coder : Probs_NAB_bits   := (others => Initial_probability);
184    pos_coder   : Pos_coder_probs  := (others => Initial_probability);
185  end record;
186
187  --------------------------------------
188  --  All probabilities used by LZMA  --
189  --------------------------------------
190
191  type All_probabilities(last_lit_prob_index: Integer) is record
192    --  Literals:
193    lit     : CProb_array(0..last_lit_prob_index):= (others => Initial_probability);
194    --  Distances:
195    dist    : Probs_for_LZ_Distances;
196    --  Lengths:
197    len     : Probs_for_LZ_Lengths;
198    rep_len : Probs_for_LZ_Lengths;
199    --  Decision tree switches:
200    switch  : Probs_for_switches;
201  end record;
202
203  -------------
204  --  Misc.  --
205  -------------
206
207  --  Minimum dictionary (= plain text buffer of n previous bytes)
208  --  size is 4096. LZMA specification name: "LZMA_DIC_MIN"
209  Min_dictionary_size : constant := 2 ** 12;
210
211  --  Log2-style encoding of LZ lengths
212  Len_low_bits     : constant:= 3;
213  Len_low_symbols  : constant:= 2 ** Len_low_bits;
214  Len_mid_bits     : constant:= 3;
215  Len_mid_symbols  : constant:= 2 ** Len_mid_bits;
216  Len_high_bits    : constant:= 8;
217  Len_high_symbols : constant:= 2 ** Len_high_bits;
218  Len_symbols      : constant:= Len_low_symbols + Len_mid_symbols + Len_high_symbols;
219
220  Min_match_length : constant:= 2;  --  "LZMA_MATCH_LEN_MIN"
221  Max_match_length : constant:= Min_match_length + Len_symbols - 1;  --  "LZMA_MATCH_LEN_MAX"
222
223  subtype Match_length_range is Integer range Min_match_length .. Max_match_length;
224
225  --  Fake distance, used as an end-of-stream marker.
226  end_of_stream_magic_distance : constant := 16#FFFF_FFFF#;
227
228  --------------------------------------------------
229  --  Binary values of various decision switches  --
230  --------------------------------------------------
231
232  --  LZ literal vs. DL code
233  Literal_choice : constant:= 0;
234  DL_code_choice : constant:= 1;
235
236  --  Within DL code: "Simple match" vs. "Rep match"
237  Simple_match_choice : constant:= 0;
238  Rep_match_choice    : constant:= 1;
239
240  --  Within "Rep match": "Distance is rep0" vs. "Distance is not rep0"
241  The_distance_is_rep0_choice     : constant:= 0;
242  The_distance_is_not_rep0_choice : constant:= 1;
243  --  Within "Distance is rep0":
244  The_length_is_1_choice     : constant:= 0;
245  The_length_is_not_1_choice : constant:= 1;
246  --  Within "Distance is not rep0": "Distance is rep1" vs. "Distance is not rep1"
247  The_distance_is_rep1_choice     : constant:= 0;
248  The_distance_is_not_rep1_choice : constant:= 1;
249  --  Within "Distance is not rep1": "Distance is rep2" vs. "Distance is not rep2"
250  The_distance_is_rep2_choice     : constant:= 0;
251  The_distance_is_not_rep2_choice : constant:= 1;
252
253  ----------------------
254  --  Range encoding  --
255  ----------------------
256
257  --  Normalization threshold. When the range width is below that value,
258  --  a shift is needed.
259  width_threshold : constant := 2**24;  --  LZMA specification name: "kTopValue"
260
261  --  The following article (the only reference in the LZMA specification)
262  --  explains how range encoding works:
263  --
264  --     G. N. N. Martin, Range encoding: an algorithm for removing redundancy
265  --     from a digitized message, Video & Data Recording Conference,
266  --     Southampton, UK, July 24-27, 1979.
267
268end LZMA;
269