1 /*
2  * lzms_common.c - Common code for LZMS compression and decompression
3  */
4 
5 /*
6  * Copyright (C) 2013-2016 Eric Biggers
7  *
8  * This file is free software; you can redistribute it and/or modify it under
9  * the terms of the GNU Lesser General Public License as published by the Free
10  * Software Foundation; either version 3 of the License, or (at your option) any
11  * later version.
12  *
13  * This file is distributed in the hope that it will be useful, but WITHOUT
14  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
15  * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
16  * details.
17  *
18  * You should have received a copy of the GNU Lesser General Public License
19  * along with this file; if not, see http://www.gnu.org/licenses/.
20  */
21 
22 #ifdef HAVE_CONFIG_H
23 #  include "config.h"
24 #endif
25 
26 #include "wimlib/lzms_common.h"
27 #include "wimlib/unaligned.h"
28 #include "wimlib/x86_cpu_features.h"
29 
30 #ifdef __x86_64__
31 #  include <emmintrin.h>
32 #endif
33 
34 /* Table: offset slot => offset slot base value  */
35 const u32 lzms_offset_slot_base[LZMS_MAX_NUM_OFFSET_SYMS + 1] = {
36 	0x00000001, 0x00000002, 0x00000003, 0x00000004,
37 	0x00000005, 0x00000006, 0x00000007, 0x00000008,
38 	0x00000009, 0x0000000d, 0x00000011, 0x00000015,
39 	0x00000019, 0x0000001d, 0x00000021, 0x00000025,
40 	0x00000029, 0x0000002d, 0x00000035, 0x0000003d,
41 	0x00000045, 0x0000004d, 0x00000055, 0x0000005d,
42 	0x00000065, 0x00000075, 0x00000085, 0x00000095,
43 	0x000000a5, 0x000000b5, 0x000000c5, 0x000000d5,
44 	0x000000e5, 0x000000f5, 0x00000105, 0x00000125,
45 	0x00000145, 0x00000165, 0x00000185, 0x000001a5,
46 	0x000001c5, 0x000001e5, 0x00000205, 0x00000225,
47 	0x00000245, 0x00000265, 0x00000285, 0x000002a5,
48 	0x000002c5, 0x000002e5, 0x00000325, 0x00000365,
49 	0x000003a5, 0x000003e5, 0x00000425, 0x00000465,
50 	0x000004a5, 0x000004e5, 0x00000525, 0x00000565,
51 	0x000005a5, 0x000005e5, 0x00000625, 0x00000665,
52 	0x000006a5, 0x00000725, 0x000007a5, 0x00000825,
53 	0x000008a5, 0x00000925, 0x000009a5, 0x00000a25,
54 	0x00000aa5, 0x00000b25, 0x00000ba5, 0x00000c25,
55 	0x00000ca5, 0x00000d25, 0x00000da5, 0x00000e25,
56 	0x00000ea5, 0x00000f25, 0x00000fa5, 0x00001025,
57 	0x000010a5, 0x000011a5, 0x000012a5, 0x000013a5,
58 	0x000014a5, 0x000015a5, 0x000016a5, 0x000017a5,
59 	0x000018a5, 0x000019a5, 0x00001aa5, 0x00001ba5,
60 	0x00001ca5, 0x00001da5, 0x00001ea5, 0x00001fa5,
61 	0x000020a5, 0x000021a5, 0x000022a5, 0x000023a5,
62 	0x000024a5, 0x000026a5, 0x000028a5, 0x00002aa5,
63 	0x00002ca5, 0x00002ea5, 0x000030a5, 0x000032a5,
64 	0x000034a5, 0x000036a5, 0x000038a5, 0x00003aa5,
65 	0x00003ca5, 0x00003ea5, 0x000040a5, 0x000042a5,
66 	0x000044a5, 0x000046a5, 0x000048a5, 0x00004aa5,
67 	0x00004ca5, 0x00004ea5, 0x000050a5, 0x000052a5,
68 	0x000054a5, 0x000056a5, 0x000058a5, 0x00005aa5,
69 	0x00005ca5, 0x00005ea5, 0x000060a5, 0x000064a5,
70 	0x000068a5, 0x00006ca5, 0x000070a5, 0x000074a5,
71 	0x000078a5, 0x00007ca5, 0x000080a5, 0x000084a5,
72 	0x000088a5, 0x00008ca5, 0x000090a5, 0x000094a5,
73 	0x000098a5, 0x00009ca5, 0x0000a0a5, 0x0000a4a5,
74 	0x0000a8a5, 0x0000aca5, 0x0000b0a5, 0x0000b4a5,
75 	0x0000b8a5, 0x0000bca5, 0x0000c0a5, 0x0000c4a5,
76 	0x0000c8a5, 0x0000cca5, 0x0000d0a5, 0x0000d4a5,
77 	0x0000d8a5, 0x0000dca5, 0x0000e0a5, 0x0000e4a5,
78 	0x0000eca5, 0x0000f4a5, 0x0000fca5, 0x000104a5,
79 	0x00010ca5, 0x000114a5, 0x00011ca5, 0x000124a5,
80 	0x00012ca5, 0x000134a5, 0x00013ca5, 0x000144a5,
81 	0x00014ca5, 0x000154a5, 0x00015ca5, 0x000164a5,
82 	0x00016ca5, 0x000174a5, 0x00017ca5, 0x000184a5,
83 	0x00018ca5, 0x000194a5, 0x00019ca5, 0x0001a4a5,
84 	0x0001aca5, 0x0001b4a5, 0x0001bca5, 0x0001c4a5,
85 	0x0001cca5, 0x0001d4a5, 0x0001dca5, 0x0001e4a5,
86 	0x0001eca5, 0x0001f4a5, 0x0001fca5, 0x000204a5,
87 	0x00020ca5, 0x000214a5, 0x00021ca5, 0x000224a5,
88 	0x000234a5, 0x000244a5, 0x000254a5, 0x000264a5,
89 	0x000274a5, 0x000284a5, 0x000294a5, 0x0002a4a5,
90 	0x0002b4a5, 0x0002c4a5, 0x0002d4a5, 0x0002e4a5,
91 	0x0002f4a5, 0x000304a5, 0x000314a5, 0x000324a5,
92 	0x000334a5, 0x000344a5, 0x000354a5, 0x000364a5,
93 	0x000374a5, 0x000384a5, 0x000394a5, 0x0003a4a5,
94 	0x0003b4a5, 0x0003c4a5, 0x0003d4a5, 0x0003e4a5,
95 	0x0003f4a5, 0x000404a5, 0x000414a5, 0x000424a5,
96 	0x000434a5, 0x000444a5, 0x000454a5, 0x000464a5,
97 	0x000474a5, 0x000484a5, 0x000494a5, 0x0004a4a5,
98 	0x0004b4a5, 0x0004c4a5, 0x0004e4a5, 0x000504a5,
99 	0x000524a5, 0x000544a5, 0x000564a5, 0x000584a5,
100 	0x0005a4a5, 0x0005c4a5, 0x0005e4a5, 0x000604a5,
101 	0x000624a5, 0x000644a5, 0x000664a5, 0x000684a5,
102 	0x0006a4a5, 0x0006c4a5, 0x0006e4a5, 0x000704a5,
103 	0x000724a5, 0x000744a5, 0x000764a5, 0x000784a5,
104 	0x0007a4a5, 0x0007c4a5, 0x0007e4a5, 0x000804a5,
105 	0x000824a5, 0x000844a5, 0x000864a5, 0x000884a5,
106 	0x0008a4a5, 0x0008c4a5, 0x0008e4a5, 0x000904a5,
107 	0x000924a5, 0x000944a5, 0x000964a5, 0x000984a5,
108 	0x0009a4a5, 0x0009c4a5, 0x0009e4a5, 0x000a04a5,
109 	0x000a24a5, 0x000a44a5, 0x000a64a5, 0x000aa4a5,
110 	0x000ae4a5, 0x000b24a5, 0x000b64a5, 0x000ba4a5,
111 	0x000be4a5, 0x000c24a5, 0x000c64a5, 0x000ca4a5,
112 	0x000ce4a5, 0x000d24a5, 0x000d64a5, 0x000da4a5,
113 	0x000de4a5, 0x000e24a5, 0x000e64a5, 0x000ea4a5,
114 	0x000ee4a5, 0x000f24a5, 0x000f64a5, 0x000fa4a5,
115 	0x000fe4a5, 0x001024a5, 0x001064a5, 0x0010a4a5,
116 	0x0010e4a5, 0x001124a5, 0x001164a5, 0x0011a4a5,
117 	0x0011e4a5, 0x001224a5, 0x001264a5, 0x0012a4a5,
118 	0x0012e4a5, 0x001324a5, 0x001364a5, 0x0013a4a5,
119 	0x0013e4a5, 0x001424a5, 0x001464a5, 0x0014a4a5,
120 	0x0014e4a5, 0x001524a5, 0x001564a5, 0x0015a4a5,
121 	0x0015e4a5, 0x001624a5, 0x001664a5, 0x0016a4a5,
122 	0x0016e4a5, 0x001724a5, 0x001764a5, 0x0017a4a5,
123 	0x0017e4a5, 0x001824a5, 0x001864a5, 0x0018a4a5,
124 	0x0018e4a5, 0x001924a5, 0x001964a5, 0x0019e4a5,
125 	0x001a64a5, 0x001ae4a5, 0x001b64a5, 0x001be4a5,
126 	0x001c64a5, 0x001ce4a5, 0x001d64a5, 0x001de4a5,
127 	0x001e64a5, 0x001ee4a5, 0x001f64a5, 0x001fe4a5,
128 	0x002064a5, 0x0020e4a5, 0x002164a5, 0x0021e4a5,
129 	0x002264a5, 0x0022e4a5, 0x002364a5, 0x0023e4a5,
130 	0x002464a5, 0x0024e4a5, 0x002564a5, 0x0025e4a5,
131 	0x002664a5, 0x0026e4a5, 0x002764a5, 0x0027e4a5,
132 	0x002864a5, 0x0028e4a5, 0x002964a5, 0x0029e4a5,
133 	0x002a64a5, 0x002ae4a5, 0x002b64a5, 0x002be4a5,
134 	0x002c64a5, 0x002ce4a5, 0x002d64a5, 0x002de4a5,
135 	0x002e64a5, 0x002ee4a5, 0x002f64a5, 0x002fe4a5,
136 	0x003064a5, 0x0030e4a5, 0x003164a5, 0x0031e4a5,
137 	0x003264a5, 0x0032e4a5, 0x003364a5, 0x0033e4a5,
138 	0x003464a5, 0x0034e4a5, 0x003564a5, 0x0035e4a5,
139 	0x003664a5, 0x0036e4a5, 0x003764a5, 0x0037e4a5,
140 	0x003864a5, 0x0038e4a5, 0x003964a5, 0x0039e4a5,
141 	0x003a64a5, 0x003ae4a5, 0x003b64a5, 0x003be4a5,
142 	0x003c64a5, 0x003ce4a5, 0x003d64a5, 0x003de4a5,
143 	0x003ee4a5, 0x003fe4a5, 0x0040e4a5, 0x0041e4a5,
144 	0x0042e4a5, 0x0043e4a5, 0x0044e4a5, 0x0045e4a5,
145 	0x0046e4a5, 0x0047e4a5, 0x0048e4a5, 0x0049e4a5,
146 	0x004ae4a5, 0x004be4a5, 0x004ce4a5, 0x004de4a5,
147 	0x004ee4a5, 0x004fe4a5, 0x0050e4a5, 0x0051e4a5,
148 	0x0052e4a5, 0x0053e4a5, 0x0054e4a5, 0x0055e4a5,
149 	0x0056e4a5, 0x0057e4a5, 0x0058e4a5, 0x0059e4a5,
150 	0x005ae4a5, 0x005be4a5, 0x005ce4a5, 0x005de4a5,
151 	0x005ee4a5, 0x005fe4a5, 0x0060e4a5, 0x0061e4a5,
152 	0x0062e4a5, 0x0063e4a5, 0x0064e4a5, 0x0065e4a5,
153 	0x0066e4a5, 0x0067e4a5, 0x0068e4a5, 0x0069e4a5,
154 	0x006ae4a5, 0x006be4a5, 0x006ce4a5, 0x006de4a5,
155 	0x006ee4a5, 0x006fe4a5, 0x0070e4a5, 0x0071e4a5,
156 	0x0072e4a5, 0x0073e4a5, 0x0074e4a5, 0x0075e4a5,
157 	0x0076e4a5, 0x0077e4a5, 0x0078e4a5, 0x0079e4a5,
158 	0x007ae4a5, 0x007be4a5, 0x007ce4a5, 0x007de4a5,
159 	0x007ee4a5, 0x007fe4a5, 0x0080e4a5, 0x0081e4a5,
160 	0x0082e4a5, 0x0083e4a5, 0x0084e4a5, 0x0085e4a5,
161 	0x0086e4a5, 0x0087e4a5, 0x0088e4a5, 0x0089e4a5,
162 	0x008ae4a5, 0x008be4a5, 0x008ce4a5, 0x008de4a5,
163 	0x008fe4a5, 0x0091e4a5, 0x0093e4a5, 0x0095e4a5,
164 	0x0097e4a5, 0x0099e4a5, 0x009be4a5, 0x009de4a5,
165 	0x009fe4a5, 0x00a1e4a5, 0x00a3e4a5, 0x00a5e4a5,
166 	0x00a7e4a5, 0x00a9e4a5, 0x00abe4a5, 0x00ade4a5,
167 	0x00afe4a5, 0x00b1e4a5, 0x00b3e4a5, 0x00b5e4a5,
168 	0x00b7e4a5, 0x00b9e4a5, 0x00bbe4a5, 0x00bde4a5,
169 	0x00bfe4a5, 0x00c1e4a5, 0x00c3e4a5, 0x00c5e4a5,
170 	0x00c7e4a5, 0x00c9e4a5, 0x00cbe4a5, 0x00cde4a5,
171 	0x00cfe4a5, 0x00d1e4a5, 0x00d3e4a5, 0x00d5e4a5,
172 	0x00d7e4a5, 0x00d9e4a5, 0x00dbe4a5, 0x00dde4a5,
173 	0x00dfe4a5, 0x00e1e4a5, 0x00e3e4a5, 0x00e5e4a5,
174 	0x00e7e4a5, 0x00e9e4a5, 0x00ebe4a5, 0x00ede4a5,
175 	0x00efe4a5, 0x00f1e4a5, 0x00f3e4a5, 0x00f5e4a5,
176 	0x00f7e4a5, 0x00f9e4a5, 0x00fbe4a5, 0x00fde4a5,
177 	0x00ffe4a5, 0x0101e4a5, 0x0103e4a5, 0x0105e4a5,
178 	0x0107e4a5, 0x0109e4a5, 0x010be4a5, 0x010de4a5,
179 	0x010fe4a5, 0x0111e4a5, 0x0113e4a5, 0x0115e4a5,
180 	0x0117e4a5, 0x0119e4a5, 0x011be4a5, 0x011de4a5,
181 	0x011fe4a5, 0x0121e4a5, 0x0123e4a5, 0x0125e4a5,
182 	0x0127e4a5, 0x0129e4a5, 0x012be4a5, 0x012de4a5,
183 	0x012fe4a5, 0x0131e4a5, 0x0133e4a5, 0x0135e4a5,
184 	0x0137e4a5, 0x013be4a5, 0x013fe4a5, 0x0143e4a5,
185 	0x0147e4a5, 0x014be4a5, 0x014fe4a5, 0x0153e4a5,
186 	0x0157e4a5, 0x015be4a5, 0x015fe4a5, 0x0163e4a5,
187 	0x0167e4a5, 0x016be4a5, 0x016fe4a5, 0x0173e4a5,
188 	0x0177e4a5, 0x017be4a5, 0x017fe4a5, 0x0183e4a5,
189 	0x0187e4a5, 0x018be4a5, 0x018fe4a5, 0x0193e4a5,
190 	0x0197e4a5, 0x019be4a5, 0x019fe4a5, 0x01a3e4a5,
191 	0x01a7e4a5, 0x01abe4a5, 0x01afe4a5, 0x01b3e4a5,
192 	0x01b7e4a5, 0x01bbe4a5, 0x01bfe4a5, 0x01c3e4a5,
193 	0x01c7e4a5, 0x01cbe4a5, 0x01cfe4a5, 0x01d3e4a5,
194 	0x01d7e4a5, 0x01dbe4a5, 0x01dfe4a5, 0x01e3e4a5,
195 	0x01e7e4a5, 0x01ebe4a5, 0x01efe4a5, 0x01f3e4a5,
196 	0x01f7e4a5, 0x01fbe4a5, 0x01ffe4a5, 0x0203e4a5,
197 	0x0207e4a5, 0x020be4a5, 0x020fe4a5, 0x0213e4a5,
198 	0x0217e4a5, 0x021be4a5, 0x021fe4a5, 0x0223e4a5,
199 	0x0227e4a5, 0x022be4a5, 0x022fe4a5, 0x0233e4a5,
200 	0x0237e4a5, 0x023be4a5, 0x023fe4a5, 0x0243e4a5,
201 	0x0247e4a5, 0x024be4a5, 0x024fe4a5, 0x0253e4a5,
202 	0x0257e4a5, 0x025be4a5, 0x025fe4a5, 0x0263e4a5,
203 	0x0267e4a5, 0x026be4a5, 0x026fe4a5, 0x0273e4a5,
204 	0x0277e4a5, 0x027be4a5, 0x027fe4a5, 0x0283e4a5,
205 	0x0287e4a5, 0x028be4a5, 0x028fe4a5, 0x0293e4a5,
206 	0x0297e4a5, 0x029be4a5, 0x029fe4a5, 0x02a3e4a5,
207 	0x02a7e4a5, 0x02abe4a5, 0x02afe4a5, 0x02b3e4a5,
208 	0x02bbe4a5, 0x02c3e4a5, 0x02cbe4a5, 0x02d3e4a5,
209 	0x02dbe4a5, 0x02e3e4a5, 0x02ebe4a5, 0x02f3e4a5,
210 	0x02fbe4a5, 0x0303e4a5, 0x030be4a5, 0x0313e4a5,
211 	0x031be4a5, 0x0323e4a5, 0x032be4a5, 0x0333e4a5,
212 	0x033be4a5, 0x0343e4a5, 0x034be4a5, 0x0353e4a5,
213 	0x035be4a5, 0x0363e4a5, 0x036be4a5, 0x0373e4a5,
214 	0x037be4a5, 0x0383e4a5, 0x038be4a5, 0x0393e4a5,
215 	0x039be4a5, 0x03a3e4a5, 0x03abe4a5, 0x03b3e4a5,
216 	0x03bbe4a5, 0x03c3e4a5, 0x03cbe4a5, 0x03d3e4a5,
217 	0x03dbe4a5, 0x03e3e4a5, 0x03ebe4a5, 0x03f3e4a5,
218 	0x03fbe4a5, 0x0403e4a5, 0x040be4a5, 0x0413e4a5,
219 	0x041be4a5, 0x0423e4a5, 0x042be4a5, 0x0433e4a5,
220 	0x043be4a5, 0x0443e4a5, 0x044be4a5, 0x0453e4a5,
221 	0x045be4a5, 0x0463e4a5, 0x046be4a5, 0x0473e4a5,
222 	0x047be4a5, 0x0483e4a5, 0x048be4a5, 0x0493e4a5,
223 	0x049be4a5, 0x04a3e4a5, 0x04abe4a5, 0x04b3e4a5,
224 	0x04bbe4a5, 0x04c3e4a5, 0x04cbe4a5, 0x04d3e4a5,
225 	0x04dbe4a5, 0x04e3e4a5, 0x04ebe4a5, 0x04f3e4a5,
226 	0x04fbe4a5, 0x0503e4a5, 0x050be4a5, 0x0513e4a5,
227 	0x051be4a5, 0x0523e4a5, 0x052be4a5, 0x0533e4a5,
228 	0x053be4a5, 0x0543e4a5, 0x054be4a5, 0x0553e4a5,
229 	0x055be4a5, 0x0563e4a5, 0x056be4a5, 0x0573e4a5,
230 	0x057be4a5, 0x0583e4a5, 0x058be4a5, 0x0593e4a5,
231 	0x059be4a5, 0x05a3e4a5, 0x05abe4a5, 0x05b3e4a5,
232 	0x05bbe4a5, 0x05c3e4a5, 0x05cbe4a5, 0x05d3e4a5,
233 	0x05dbe4a5, 0x05e3e4a5, 0x05ebe4a5, 0x05f3e4a5,
234 	0x05fbe4a5, 0x060be4a5, 0x061be4a5, 0x062be4a5,
235 	0x063be4a5, 0x064be4a5, 0x065be4a5, 0x465be4a5,
236 	/* The last entry is extra; it is equal to LZMS_MAX_MATCH_OFFSET + 1 and
237 	 * is here to aid binary search.  */
238 };
239 
240 /* Table: offset slot => number of extra offset bits  */
241 const u8 lzms_extra_offset_bits[LZMS_MAX_NUM_OFFSET_SYMS] = {
242 	0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2,
243 	2 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4,
244 	4 , 4 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5,
245 	5 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6,
246 	7 , 7 , 7 , 7 , 7 , 7 , 7 , 7 , 7 , 7 , 7 , 7 , 7 , 7 , 7 , 7,
247 	7 , 7 , 7 , 7 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 8,
248 	8 , 8 , 8 , 8 , 8 , 8 , 8 , 8 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9,
249 	9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9,
250 	9 , 9 , 9 , 9 , 9 , 9 , 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
251 	10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
252 	10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11,
253 	11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
254 	11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12,
255 	12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
256 	12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
257 	12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13,
258 	13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
259 	13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
260 	13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
261 	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
262 	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
263 	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
264 	14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
265 	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
266 	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
267 	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
268 	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16,
269 	16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
270 	16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
271 	16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
272 	16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
273 	16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17,
274 	17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
275 	17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
276 	17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
277 	17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
278 	17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
279 	18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
280 	18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
281 	18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
282 	18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
283 	18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
284 	18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 19,
285 	19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
286 	19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
287 	19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
288 	19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
289 	19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
290 	19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
291 	19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 30,
292 };
293 
294 /* Table: length slot => length slot base value  */
295 const u32 lzms_length_slot_base[LZMS_NUM_LENGTH_SYMS + 1] = {
296 	0x00000001, 0x00000002, 0x00000003, 0x00000004,
297 	0x00000005, 0x00000006, 0x00000007, 0x00000008,
298 	0x00000009, 0x0000000a, 0x0000000b, 0x0000000c,
299 	0x0000000d, 0x0000000e, 0x0000000f, 0x00000010,
300 	0x00000011, 0x00000012, 0x00000013, 0x00000014,
301 	0x00000015, 0x00000016, 0x00000017, 0x00000018,
302 	0x00000019, 0x0000001a, 0x0000001b, 0x0000001d,
303 	0x0000001f, 0x00000021, 0x00000023, 0x00000027,
304 	0x0000002b, 0x0000002f, 0x00000033, 0x00000037,
305 	0x0000003b, 0x00000043, 0x0000004b, 0x00000053,
306 	0x0000005b, 0x0000006b, 0x0000007b, 0x0000008b,
307 	0x0000009b, 0x000000ab, 0x000000cb, 0x000000eb,
308 	0x0000012b, 0x000001ab, 0x000002ab, 0x000004ab,
309 	0x000008ab, 0x000108ab, 0x400108ab,
310 	/* The last entry is extra; it is equal to LZMS_MAX_MATCH_LENGTH + 1 and
311 	 * is here to aid binary search.  */
312 };
313 
314 /* Table: length slot => number of extra length bits  */
315 const u8 lzms_extra_length_bits[LZMS_NUM_LENGTH_SYMS] = {
316 	0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
317 	0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
318 	0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
319 	0 , 0 , 1 , 1 , 1 , 1 , 2 , 2 ,
320 	2 , 2 , 2 , 2 , 3 , 3 , 3 , 3 ,
321 	4 , 4 , 4 , 4 , 4 , 5 , 5 , 6 ,
322 	7 , 8 , 9 , 10, 16, 30,
323 };
324 
325 unsigned
lzms_get_slot(u32 value,const u32 slot_base_tab[],unsigned num_slots)326 lzms_get_slot(u32 value, const u32 slot_base_tab[], unsigned num_slots)
327 {
328 	unsigned l = 0;
329 	unsigned r = num_slots - 1;
330 	for (;;) {
331 		unsigned slot = (l + r) / 2;
332 		if (value >= slot_base_tab[slot]) {
333 			if (value < slot_base_tab[slot + 1])
334 				return slot;
335 			else
336 				l = slot + 1;
337 		} else {
338 			r = slot - 1;
339 		}
340 	}
341 }
342 
343 /* Return the number of offset slots used when processing a buffer having the
344  * specified uncompressed size.  */
345 unsigned
lzms_get_num_offset_slots(size_t uncompressed_size)346 lzms_get_num_offset_slots(size_t uncompressed_size)
347 {
348 	if (uncompressed_size < 2)
349 		return 0;
350 	return 1 + lzms_get_offset_slot(uncompressed_size - 1);
351 }
352 
353 void
lzms_init_probabilities(struct lzms_probabilites * probs)354 lzms_init_probabilities(struct lzms_probabilites *probs)
355 {
356 	struct lzms_probability_entry *entries =
357 		(struct lzms_probability_entry *)probs;
358 	size_t num_entries = sizeof(struct lzms_probabilites) /
359 			     sizeof(struct lzms_probability_entry);
360 	for (size_t i = 0; i < num_entries; i++) {
361 		entries[i].num_recent_zero_bits = LZMS_INITIAL_PROBABILITY;
362 		entries[i].recent_bits = LZMS_INITIAL_RECENT_BITS;
363 	}
364 }
365 
366 void
lzms_init_symbol_frequencies(u32 freqs[],unsigned num_syms)367 lzms_init_symbol_frequencies(u32 freqs[], unsigned num_syms)
368 {
369 	for (unsigned sym = 0; sym < num_syms; sym++)
370 		freqs[sym] = 1;
371 }
372 
373 void
lzms_dilute_symbol_frequencies(u32 freqs[],unsigned num_syms)374 lzms_dilute_symbol_frequencies(u32 freqs[], unsigned num_syms)
375 {
376 	for (unsigned sym = 0; sym < num_syms; sym++)
377 		freqs[sym] = (freqs[sym] >> 1) + 1;
378 }
379 
380 
381 #ifdef __x86_64__
382 static forceinline u8 *
find_next_opcode_sse4_2(u8 * p)383 find_next_opcode_sse4_2(u8 *p)
384 {
385 	const __v16qi potential_opcodes = (__v16qi) {0x48, 0x4C, 0xE8, 0xE9, 0xF0, 0xFF};
386 	__asm__(
387 		"  pcmpestri $0x0, (%[p]), %[potential_opcodes]      \n"
388 		"  jc 2f                                             \n"
389 		"1:                                                  \n"
390 		"  add $0x10, %[p]                                   \n"
391 		"  pcmpestri $0x0, (%[p]), %[potential_opcodes]      \n"
392 		"  jnc 1b                                            \n"
393 		"2:                                                  \n"
394 	#ifdef __ILP32__ /* x32 ABI (x86_64 with 32-bit pointers) */
395 		"  add %%ecx, %[p]                                   \n"
396 	#else
397 		"  add %%rcx, %[p]                                   \n"
398 	#endif
399 		: [p] "+r" (p)
400 		: [potential_opcodes] "x" (potential_opcodes), "a" (6), "d" (16)
401 		: "rcx", "cc"
402 	       );
403 
404 	return p;
405 }
406 #endif /* __x86_64__ */
407 
408 static forceinline u8 *
find_next_opcode_default(u8 * p)409 find_next_opcode_default(u8 *p)
410 {
411 	/*
412 	 * The following table is used to accelerate the common case where the
413 	 * byte has nothing to do with x86 translation and must simply be
414 	 * skipped.  This was faster than the following alternatives:
415 	 *	- Jump table with 256 entries
416 	 *	- Switch statement with default
417 	 */
418 	static const u8 is_potential_opcode[256] = {
419 		[0x48] = 1, [0x4C] = 1, [0xE8] = 1,
420 		[0xE9] = 1, [0xF0] = 1, [0xFF] = 1,
421 	};
422 
423 	for (;;) {
424 		if (is_potential_opcode[*p])
425 			break;
426 		p++;
427 		if (is_potential_opcode[*p])
428 			break;
429 		p++;
430 		if (is_potential_opcode[*p])
431 			break;
432 		p++;
433 		if (is_potential_opcode[*p])
434 			break;
435 		p++;
436 	}
437 	return p;
438 }
439 
440 static forceinline u8 *
translate_if_needed(u8 * data,u8 * p,s32 * last_x86_pos,s32 last_target_usages[],bool undo)441 translate_if_needed(u8 *data, u8 *p, s32 *last_x86_pos,
442 		    s32 last_target_usages[], bool undo)
443 {
444 	s32 max_trans_offset;
445 	s32 opcode_nbytes;
446 	u16 target16;
447 	s32 i;
448 
449 	max_trans_offset = LZMS_X86_MAX_TRANSLATION_OFFSET;
450 
451 	/*
452 	 * p[0] has one of the following values:
453 	 *	0x48 0x4C 0xE8 0xE9 0xF0 0xFF
454 	 */
455 
456 	if (p[0] >= 0xF0) {
457 		if (p[0] & 0x0F) {
458 			/* 0xFF (instruction group)  */
459 			if (p[1] == 0x15) {
460 				/* Call indirect relative  */
461 				opcode_nbytes = 2;
462 				goto have_opcode;
463 			}
464 		} else {
465 			/* 0xF0 (lock prefix)  */
466 			if (p[1] == 0x83 && p[2] == 0x05) {
467 				/* Lock add relative  */
468 				opcode_nbytes = 3;
469 				goto have_opcode;
470 			}
471 		}
472 	} else if (p[0] <= 0x4C) {
473 
474 		/* 0x48 or 0x4C.  In 64-bit code this is a REX prefix byte with
475 		 * W=1, R=[01], X=0, and B=0, and it will be followed by the
476 		 * actual opcode, then additional bytes depending on the opcode.
477 		 * We are most interested in several common instructions that
478 		 * access data relative to the instruction pointer.  These use a
479 		 * 1-byte opcode, followed by a ModR/M byte, followed by a
480 		 * 4-byte displacement.  */
481 
482 		/* Test: does the ModR/M byte indicate RIP-relative addressing?
483 		 * Note: there seems to be a mistake in the format here; the
484 		 * mask really should be 0xC7 instead of 0x07 so that both the
485 		 * MOD and R/M fields of ModR/M are tested, not just R/M.  */
486 		if ((p[2] & 0x07) == 0x05) {
487 			/* Check for the LEA (load effective address) or MOV
488 			 * (move) opcodes.  For MOV there are additional
489 			 * restrictions, although it seems they are only helpful
490 			 * due to the overly lax ModR/M test.  */
491 			if (p[1] == 0x8D ||
492 			    (p[1] == 0x8B && !(p[0] & 0x04) && !(p[2] & 0xF0)))
493 			{
494 				opcode_nbytes = 3;
495 				goto have_opcode;
496 			}
497 		}
498 	} else {
499 		if (p[0] & 0x01) {
500 			/* 0xE9: Jump relative.  Theoretically this would be
501 			 * useful to translate, but in fact it's explicitly
502 			 * excluded.  Most likely it creates too many false
503 			 * positives for the detection algorithm.  */
504 			p += 4;
505 		} else {
506 			/* 0xE8: Call relative.  This is a common case, so it
507 			 * uses a reduced max_trans_offset.  In other words, we
508 			 * have to be more confident that the data actually is
509 			 * x86 machine code before we'll do the translation.  */
510 			opcode_nbytes = 1;
511 			max_trans_offset >>= 1;
512 			goto have_opcode;
513 		}
514 	}
515 
516 	return p + 1;
517 
518 have_opcode:
519 	i = p - data;
520 	p += opcode_nbytes;
521 	if (undo) {
522 		if (i - *last_x86_pos <= max_trans_offset) {
523 			u32 n = get_unaligned_le32(p);
524 			put_unaligned_le32(n - i, p);
525 		}
526 		target16 = i + get_unaligned_le16(p);
527 	} else {
528 		target16 = i + get_unaligned_le16(p);
529 		if (i - *last_x86_pos <= max_trans_offset) {
530 			u32 n = get_unaligned_le32(p);
531 			put_unaligned_le32(n + i, p);
532 		}
533 	}
534 
535 	i += opcode_nbytes + sizeof(le32) - 1;
536 
537 	if (i - last_target_usages[target16] <= LZMS_X86_ID_WINDOW_SIZE)
538 		*last_x86_pos = i;
539 
540 	last_target_usages[target16] = i;
541 
542 	return p + sizeof(le32);
543 }
544 
545 /*
546  * Translate relative addresses embedded in x86 instructions into absolute
547  * addresses (@undo == %false), or undo this translation (@undo == %true).
548  *
549  * Absolute addresses are usually more compressible by LZ factorization.
550  *
551  * @last_target_usages must be a temporary array of length >= 65536.
552  */
553 void
lzms_x86_filter(u8 data[restrict],s32 size,s32 last_target_usages[restrict],bool undo)554 lzms_x86_filter(u8 data[restrict], s32 size,
555 		s32 last_target_usages[restrict], bool undo)
556 {
557 	/*
558 	 * Note: this filter runs unconditionally and uses a custom algorithm to
559 	 * detect data regions that probably contain x86 code.
560 	 *
561 	 * 'last_x86_pos' tracks the most recent position that has a good chance
562 	 * of being the start of an x86 instruction.  When the filter detects a
563 	 * likely x86 instruction, it updates this variable and considers the
564 	 * next LZMS_X86_MAX_TRANSLATION_OFFSET bytes of data as valid for x86
565 	 * translations.
566 	 *
567 	 * If part of the data does not, in fact, contain x86 machine code, then
568 	 * 'last_x86_pos' will, very likely, eventually fall more than
569 	 * LZMS_X86_MAX_TRANSLATION_OFFSET bytes behind the current position.
570 	 * This results in x86 translations being disabled until the next likely
571 	 * x86 instruction is detected.
572 	 *
573 	 * To identify "likely x86 instructions", the algorithm attempts to
574 	 * track the position of the most recent potential relative-addressing
575 	 * instruction that referenced each possible memory address.  If it
576 	 * finds two references to the same memory address within an
577 	 * LZMS_X86_ID_WINDOW_SIZE-byte sized window, then the second reference
578 	 * is flagged as a likely x86 instruction.  Since the instructions
579 	 * considered for translation necessarily use relative addressing, the
580 	 * algorithm does a tentative translation into absolute addresses.  In
581 	 * addition, so that memory addresses can be looked up in an array of
582 	 * reasonable size (in this code, 'last_target_usages'), only the
583 	 * low-order 2 bytes of each address are considered significant.
584 	 */
585 
586 	u8 *p;
587 	u8 *tail_ptr;
588 	s32 last_x86_pos = -LZMS_X86_MAX_TRANSLATION_OFFSET - 1;
589 
590 	if (size <= 17)
591 		return;
592 
593 	for (s32 i = 0; i < 65536; i++)
594 		last_target_usages[i] = -(s32)LZMS_X86_ID_WINDOW_SIZE - 1;
595 
596 	/*
597 	 * Optimization: only check for end-of-buffer when we already have a
598 	 * byte that is a potential opcode for x86 translation.  To do this,
599 	 * overwrite one of the bytes near the end of the buffer, and restore it
600 	 * later.  The correctness of this optimization relies on two
601 	 * characteristics of compressed format:
602 	 *
603 	 *  1. No translation can follow an opcode beginning in the last 16
604 	 *     bytes.
605 	 *  2. A translation following an opcode starting at the last possible
606 	 *     position (17 bytes from the end) never extends more than 7 bytes.
607 	 *     Consequently, we can overwrite any of the bytes starting at
608 	 *     data[(size - 16) + 7] and have no effect on the result, as long
609 	 *     as we restore those bytes later.
610 	 */
611 
612 	/* Note: the very first byte must be ignored completely!  */
613 	p = data + 1;
614 	tail_ptr = &data[size - 16];
615 
616 #ifdef __x86_64__
617 	if (x86_have_cpu_feature(X86_CPU_FEATURE_SSE4_2)) {
618 		u8 saved_byte = *tail_ptr;
619 		*tail_ptr = 0xE8;
620 		for (;;) {
621 			u8 *new_p = find_next_opcode_sse4_2(p);
622 			if (new_p >= tail_ptr - 8)
623 				break;
624 			p = new_p;
625 			p = translate_if_needed(data, p, &last_x86_pos,
626 						last_target_usages, undo);
627 		}
628 		*tail_ptr = saved_byte;
629 	}
630 #endif
631 	{
632 		u8 saved_byte = *(tail_ptr + 8);
633 		*(tail_ptr + 8) = 0xE8;
634 		for (;;) {
635 			p = find_next_opcode_default(p);
636 			if (p >= tail_ptr)
637 				break;
638 			p = translate_if_needed(data, p, &last_x86_pos,
639 						last_target_usages, undo);
640 		}
641 		*(tail_ptr + 8) = saved_byte;
642 	}
643 }
644