1-- LZMA library 2---------------- 3-- Library for encoding and decoding data streams in the LZMA compression 4-- format invented by Igor Pavlov. 5-- 6-- Pure Ada 95+ code, 100% portable: OS-, CPU- and compiler- independent. 7 8-- Legal licensing note: 9 10-- Copyright (c) 2016 .. 2018 Gautier de Montmollin 11-- SWITZERLAND 12 13-- Permission is hereby granted, free of charge, to any person obtaining a copy 14-- of this software and associated documentation files (the "Software"), to deal 15-- in the Software without restriction, including without limitation the rights 16-- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 17-- copies of the Software, and to permit persons to whom the Software is 18-- furnished to do so, subject to the following conditions: 19 20-- The above copyright notice and this permission notice shall be included in 21-- all copies or substantial portions of the Software. 22 23-- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 24-- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 25-- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 26-- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 27-- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 28-- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 29-- THE SOFTWARE. 30 31-- NB: this is the MIT License, as found 21-Aug-2016 on the site 32-- http://www.opensource.org/licenses/mit-license.php 33 34with Ada.Direct_IO; -- Only used for the type Data_Bytes_Count below. 35with Interfaces; 36with System; 37 38package LZMA is 39 40 -- The compression and decompression procedures are located 41 -- in child packages LZMA.Encoding and LZMA.Decoding respectively. 42 43 -- Bits of last byte being used as context. 44 -- With the value 8, LZMA uses a complete Markov chain for predicting 45 -- a literal from the previous one, like PKZip's Reduce format. 46 subtype Literal_context_bits_range is Integer range 0..8; 47 48 -- Position mod 2**bits is used, but for literal context only. 49 subtype Literal_position_bits_range is Integer range 0..4; 50 51 -- Position mod 2**bits is used in various places. 52 subtype Position_bits_range is Integer range 0..4; 53 54 Default_dictionary_size : constant := 2 ** 15; -- 32 KB, like Deflate. 55 56 subtype Byte is Interfaces.Unsigned_8; 57 58 -- Ada.Direct_IO is only there for the Data_Bytes_Count type. 59 -- In case you want to avoid reference to Ada.Direct_IO, 60 -- you can customize the definition of Data_Bytes_Count, provided 61 -- it has enough capacity for counting bytes in the streams involved. 62 package BIO is new Ada.Direct_IO (Byte); 63 subtype Data_Bytes_Count is BIO.Count; 64 65private 66 67 use Interfaces; 68 69 -- These integer types are defined in the LZMA specification 70 -- (DRAFT version, 2015-06-14, by Igor Pavlov) 71 72 type Unsigned is mod 2 ** System.Word_Size; 73 subtype UInt64 is Unsigned_64; 74 subtype UInt32 is Unsigned_32; 75 subtype UInt16 is Unsigned_16; 76 77 ---------------------------- 78 -- Finite state machine -- 79 ---------------------------- 80 81 States_count : constant := 12; -- LZMA specification name: "kNumStates" 82 subtype State_range is Unsigned range 0..States_count - 1; 83 type Transition is array(State_range) of State_range; 84 85 ------------------------------------ From ... 0 1 2 3 4 5 6 7 8 9 10 11 86 Update_State_Literal : constant Transition:= (0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5); 87 Update_State_Match : constant Transition:= (7, 7, 7, 7, 7, 7, 7, 10, 10, 10, 10, 10); 88 Update_State_Rep : constant Transition:= (8, 8, 8, 8, 8, 8, 8, 11, 11, 11, 11, 11); 89 Update_State_ShortRep : constant Transition:= (9, 9, 9, 9, 9, 9, 9, 11, 11, 11, 11, 11); 90 91 -- Context for improving compression of aligned data, 92 -- modulo 2**n = 2, 4, 8 or 16 (max) bytes, or disabled: n = 0. 93 Max_pos_bits : constant := 4; -- LZMA specification name: "kNumPosBitsMax" 94 Max_pos_states_count : constant := 2**Max_pos_bits; 95 subtype Pos_state_range is Unsigned range 0 .. Max_pos_states_count-1; 96 97 ---------------------------------------- 98 -- Probability model for bit coding -- 99 ---------------------------------------- 100 101 Probability_model_bits : constant:= 11; -- LZMA specification name: "kNumBitModelTotalBits" 102 Probability_model_count : constant:= 2 ** Probability_model_bits; 103 104 Probability_change_bits : constant:= 5; -- LZMA specification name: "kNumMoveBits" 105 106 -- All probabilities are initialized with p=0.5. LZMA specification name: "PROB_INIT_VAL" 107 Initial_probability : constant := Probability_model_count / 2; 108 109 -- Type for storing probabilities, must have at least Probability_model_bits bits. 110 -- LZMA specification recommends UInt16. LzmaEnc.c uses UInt16 or optionally UInt32. 111 type CProb is new UInt16; 112 113 -- Integer (signed) used as index because there is a -1 (unused) index in Pos_coder_range. 114 type CProb_array is array(Integer range <>) of CProb; 115 116 Align_bits : constant := 4; -- LZMA specification name: "kNumAlignBits" 117 Align_table_size : constant := 2 ** Align_bits; 118 Align_mask : constant := Align_table_size - 1; 119 120 subtype Bits_3_range is Integer range 0 .. 2**3 - 1; 121 subtype Bits_6_range is Integer range 0 .. 2**6 - 1; 122 subtype Bits_8_range is Integer range 0 .. 2**8 - 1; 123 subtype Bits_NAB_range is Integer range 0 .. 2**Align_bits - 1; 124 125 subtype Probs_3_bits is CProb_array(Bits_3_range); 126 subtype Probs_6_bits is CProb_array(Bits_6_range); 127 subtype Probs_8_bits is CProb_array(Bits_8_range); 128 subtype Probs_NAB_bits is CProb_array(Bits_NAB_range); 129 130 -------------------------------------------------- 131 -- Probabilities for the binary decision tree -- 132 -------------------------------------------------- 133 134 type Probs_state is array(State_range) of CProb; 135 type Probs_state_and_pos_state is array(State_range, Pos_state_range) of CProb; 136 137 type Probs_for_switches is record 138 -- This is the context for the switch between a Literal and a LZ Distance-Length code 139 match : Probs_state_and_pos_state:= (others => (others => Initial_probability)); 140 -- These are contexts for various repetition modes 141 rep : Probs_state:= (others => Initial_probability); 142 rep_g0 : Probs_state:= (others => Initial_probability); 143 rep_g1 : Probs_state:= (others => Initial_probability); 144 rep_g2 : Probs_state:= (others => Initial_probability); 145 rep0_long : Probs_state_and_pos_state:= (others => (others => Initial_probability)); 146 end record; 147 148 ------------------------------------ 149 -- Probabilities for LZ lengths -- 150 ------------------------------------ 151 152 type Low_mid_coder_probs is array(Pos_state_range) of Probs_3_bits; 153 154 -- Probabilities used for encoding LZ lengths. LZMA specification name: "CLenDecoder" 155 type Probs_for_LZ_Lengths is record 156 choice_1 : CProb := Initial_probability; -- 0: low coder; 1: mid or high 157 choice_2 : CProb := Initial_probability; -- 0: mid; 1: high 158 low_coder : Low_mid_coder_probs := (others => (others => Initial_probability)); 159 mid_coder : Low_mid_coder_probs := (others => (others => Initial_probability)); 160 high_coder : Probs_8_bits := (others => Initial_probability); 161 end record; 162 163 -------------------------------------- 164 -- Probabilities for LZ distances -- 165 -------------------------------------- 166 167 Len_to_pos_states : constant := 4; 168 subtype Slot_coder_range is Unsigned range 0 .. Len_to_pos_states - 1; 169 type Slot_coder_probs is array(Slot_coder_range) of Probs_6_bits; 170 Dist_slot_bits: constant:= 6; -- "kNumPosSlotBits" 171 172 Start_dist_model_index : constant := 4; -- "kStartPosModelIndex" 173 End_dist_model_index : constant := 14; -- LZMA specification name: "kEndPosModelIndex" 174 Num_full_distances : constant := 2 ** (End_dist_model_index / 2); -- "kNumFullDistances" 175 176 -- Pos_coder_range: index -1 is never used as such but appears 177 -- when calling Bit_Tree_Reverse_Encode (as in the original C version, RcTree_ReverseEncode). 178 subtype Pos_coder_range is Integer range -1 .. Num_full_distances - End_dist_model_index; 179 subtype Pos_coder_probs is CProb_array(Pos_coder_range); 180 181 type Probs_for_LZ_Distances is record 182 slot_coder : Slot_coder_probs := (others => (others => Initial_probability)); 183 align_coder : Probs_NAB_bits := (others => Initial_probability); 184 pos_coder : Pos_coder_probs := (others => Initial_probability); 185 end record; 186 187 -------------------------------------- 188 -- All probabilities used by LZMA -- 189 -------------------------------------- 190 191 type All_probabilities(last_lit_prob_index: Integer) is record 192 -- Literals: 193 lit : CProb_array(0..last_lit_prob_index):= (others => Initial_probability); 194 -- Distances: 195 dist : Probs_for_LZ_Distances; 196 -- Lengths: 197 len : Probs_for_LZ_Lengths; 198 rep_len : Probs_for_LZ_Lengths; 199 -- Decision tree switches: 200 switch : Probs_for_switches; 201 end record; 202 203 ------------- 204 -- Misc. -- 205 ------------- 206 207 -- Minimum dictionary (= plain text buffer of n previous bytes) 208 -- size is 4096. LZMA specification name: "LZMA_DIC_MIN" 209 Min_dictionary_size : constant := 2 ** 12; 210 211 -- Log2-style encoding of LZ lengths 212 Len_low_bits : constant:= 3; 213 Len_low_symbols : constant:= 2 ** Len_low_bits; 214 Len_mid_bits : constant:= 3; 215 Len_mid_symbols : constant:= 2 ** Len_mid_bits; 216 Len_high_bits : constant:= 8; 217 Len_high_symbols : constant:= 2 ** Len_high_bits; 218 Len_symbols : constant:= Len_low_symbols + Len_mid_symbols + Len_high_symbols; 219 220 Min_match_length : constant:= 2; -- "LZMA_MATCH_LEN_MIN" 221 Max_match_length : constant:= Min_match_length + Len_symbols - 1; -- "LZMA_MATCH_LEN_MAX" 222 223 subtype Match_length_range is Integer range Min_match_length .. Max_match_length; 224 225 -- Fake distance, used as an end-of-stream marker. 226 end_of_stream_magic_distance : constant := 16#FFFF_FFFF#; 227 228 -------------------------------------------------- 229 -- Binary values of various decision switches -- 230 -------------------------------------------------- 231 232 -- LZ literal vs. DL code 233 Literal_choice : constant:= 0; 234 DL_code_choice : constant:= 1; 235 236 -- Within DL code: "Simple match" vs. "Rep match" 237 Simple_match_choice : constant:= 0; 238 Rep_match_choice : constant:= 1; 239 240 -- Within "Rep match": "Distance is rep0" vs. "Distance is not rep0" 241 The_distance_is_rep0_choice : constant:= 0; 242 The_distance_is_not_rep0_choice : constant:= 1; 243 -- Within "Distance is rep0": 244 The_length_is_1_choice : constant:= 0; 245 The_length_is_not_1_choice : constant:= 1; 246 -- Within "Distance is not rep0": "Distance is rep1" vs. "Distance is not rep1" 247 The_distance_is_rep1_choice : constant:= 0; 248 The_distance_is_not_rep1_choice : constant:= 1; 249 -- Within "Distance is not rep1": "Distance is rep2" vs. "Distance is not rep2" 250 The_distance_is_rep2_choice : constant:= 0; 251 The_distance_is_not_rep2_choice : constant:= 1; 252 253 ---------------------- 254 -- Range encoding -- 255 ---------------------- 256 257 -- Normalization threshold. When the range width is below that value, 258 -- a shift is needed. 259 width_threshold : constant := 2**24; -- LZMA specification name: "kTopValue" 260 261 -- The following article (the only reference in the LZMA specification) 262 -- explains how range encoding works: 263 -- 264 -- G. N. N. Martin, Range encoding: an algorithm for removing redundancy 265 -- from a digitized message, Video & Data Recording Conference, 266 -- Southampton, UK, July 24-27, 1979. 267 268end LZMA; 269