1 static char rcsid[] = "$Id: bitpack64-write.c 221731 2020-02-13 19:47:16Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5
6 #include "bitpack64-write.h"
7
8 #ifdef WORDS_BIGENDIAN
9 #include "bigendian.h" /* For FWRITE_UINTS */
10 #else
11 #include "littleendian.h" /* For FWRITE_UINTS */
12 #endif
13
14 #include <stdio.h>
15 #include <stdlib.h>
16 #include <string.h> /* For memset */
17 #include <errno.h>
18 #include "mem.h"
19 #include "assert.h"
20 #include "fopen.h"
21 #include "popcount.h"
22 #include "bitpack64-access.h" /* For Bitpack64_extract_bitpack */
23
24 #ifdef HAVE_SSE2
25 #include <emmintrin.h>
26 #endif
27
28
29 /* #define ALLOW_ODD_PACKSIZES 1 */
30
31 /* #define USE_ONE_FILE_FOR_FIXED 1 */
32
33 #define DIFFERENTIAL_METAINFO_SIZE 2
34 #define PAIRED_METAINFO_SIZE 3
35 #define RANK_METAINFO_SIZE 1 /* A variant of differential, where packsize is always 6 (lg 64) */
36 #define DIRECT_METAINFO_SIZE 1
37 #define BLOCKSIZE 64
38 #define POSITIONS_PAGE 4294967296 /* 2^32 */
39
40 #define BUFFER_SIZE 1000000
41
42
43 /* Note: For offset pointers, where we need fast cumulative sums, we
44 use vertical format (where successive values are in different
45 packed unsigned ints). For lcp, we want raw values, and vertical
46 format is still slightly more efficient than horizontal format. */
47
48 #ifdef HAVE_SSE2
49 static int
write_reg_buffered_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,__m128i OutReg)50 write_reg_buffered_vert (FILE *strm_fp, Positionsptr_T *strm_buffer,
51 int strm_buffer_size, int strm_buffer_i, __m128i OutReg) {
52
53 #if 0
54 /* Type casting method (when we passed in pointer to OutReg). Needs a memory fence. */
55 UINT4 *buffer = (UINT4 *) OutReg;
56 _mm_lfence(); /* Needed to avoid storing incorrect values into strm_buffer */
57 #else
58 /* Storing method. Safer. */
59 UINT4 buffer[4];
60 _mm_store_si128((__m128i *) buffer,OutReg);
61 #endif
62
63 /* printf("Writing %08X %08X %08X %08X\n",buffer[0],buffer[1],buffer[2],buffer[3]); */
64
65 strm_buffer[strm_buffer_i++] = buffer[0];
66 if (strm_buffer_i == strm_buffer_size) {
67 if (FWRITE_UINTS(strm_buffer,strm_buffer_size,strm_fp) != (size_t) strm_buffer_size) {
68 fprintf(stderr,"Error in write_reg_buffered_vert: %s\n",strerror(errno));
69 exit(9);
70 }
71 strm_buffer_i = 0;
72 }
73
74 strm_buffer[strm_buffer_i++] = buffer[1];
75 if (strm_buffer_i == strm_buffer_size) {
76 if (FWRITE_UINTS(strm_buffer,strm_buffer_size,strm_fp) != (size_t) strm_buffer_size) {
77 fprintf(stderr,"Error in write_reg_buffered_vert: %s\n",strerror(errno));
78 exit(9);
79 }
80 strm_buffer_i = 0;
81 }
82
83 strm_buffer[strm_buffer_i++] = buffer[2];
84 if (strm_buffer_i == strm_buffer_size) {
85 if (FWRITE_UINTS(strm_buffer,strm_buffer_size,strm_fp) != (size_t) strm_buffer_size) {
86 fprintf(stderr,"Error in write_reg_buffered_vert: %s\n",strerror(errno));
87 exit(9);
88 }
89 strm_buffer_i = 0;
90 }
91
92 strm_buffer[strm_buffer_i++] = buffer[3];
93 if (strm_buffer_i == strm_buffer_size) {
94 if (FWRITE_UINTS(strm_buffer,strm_buffer_size,strm_fp) != (size_t) strm_buffer_size) {
95 fprintf(stderr,"Error in write_reg_buffered_vert: %s\n",strerror(errno));
96 exit(9);
97 }
98 strm_buffer_i = 0;
99 }
100
101 return strm_buffer_i;
102 }
103 #else
104 static int
write_reg_buffered_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,UINT4 * horizontal,int nwritten)105 write_reg_buffered_vert (FILE *strm_fp, Positionsptr_T *strm_buffer,
106 int strm_buffer_size, int strm_buffer_i,
107 UINT4 *horizontal, int nwritten) {
108 UINT4 vertical[64];
109 int nrows = nwritten/4, row, column, k;
110
111 /* Convert to vertical */
112 for (column = 0; column < 4; column++) {
113 k = column;
114 for (row = 0; row < nrows; row++) {
115 vertical[k] = *horizontal++;
116 k += 4;
117 }
118 }
119
120 /* Send to output buffer */
121 for (k = 0; k < nwritten; k++) {
122 /* printf("Writing %08X\n",vertical[k]); */
123 strm_buffer[strm_buffer_i++] = vertical[k];
124 if (strm_buffer_i == strm_buffer_size) {
125 if (FWRITE_UINTS(strm_buffer,strm_buffer_size,strm_fp) != (size_t) strm_buffer_size) {
126 fprintf(stderr,"Error in write_reg_buffered_vert: %s\n",strerror(errno));
127 exit(9);
128 }
129 strm_buffer_i = 0;
130 }
131 }
132
133 return strm_buffer_i;
134 }
135 #endif
136
137
138
139 static int
write_reg_buffered_horiz(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,UINT4 * values,int nwritten)140 write_reg_buffered_horiz (FILE *strm_fp, Positionsptr_T *strm_buffer,
141 int strm_buffer_size, int strm_buffer_i,
142 UINT4 *values, int nwritten) {
143 int k;
144
145 /* Send to output buffer */
146 for (k = 0; k < nwritten; k++) {
147 /* printf("Writing %08X\n",values[k]); */
148 strm_buffer[strm_buffer_i++] = values[k];
149 if (strm_buffer_i == strm_buffer_size) {
150 if (FWRITE_UINTS(strm_buffer,strm_buffer_size,strm_fp) != (size_t) strm_buffer_size) {
151 fprintf(stderr,"Error in write_reg_buffered_horiz: %s\n",strerror(errno));
152 exit(9);
153 }
154 strm_buffer_i = 0;
155 }
156 }
157
158 return strm_buffer_i;
159 }
160
161
162
163
164 #ifdef HAVE_SSE2
/* Per-width lane masks used by the write_NN_vert routines to truncate
   each 32-bit lane of an input register to the pack width.  They are
   assigned in write_setup(), where maskN gets the low N bits set in all
   four lanes; until write_setup() has run they hold zero (static
   storage). */
static __m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7, mask8,
  mask9, mask10, mask11, mask12, mask13, mask14, mask15, mask16,
  mask17, mask18, mask19, mask20, mask21, mask22, mask23, mask24,
  mask25, mask26, mask27, mask28, mask29, mask30, mask31;
169 #endif
170
171
/* Initializes the file-scope SSE2 masks: maskN receives the value
   2^N - 1 (the low N bits set) broadcast to all four 32-bit lanes.
   Does nothing when HAVE_SSE2 is not defined.  Takes no arguments;
   declared with (void) so that the definition is a proper prototype. */
static void
write_setup (void) {

#ifdef HAVE_SSE2
  mask1 = _mm_set1_epi32((1U << 1) - 1U);
  mask2 = _mm_set1_epi32((1U << 2) - 1U);
  mask3 = _mm_set1_epi32((1U << 3) - 1U);
  mask4 = _mm_set1_epi32((1U << 4) - 1U);
  mask5 = _mm_set1_epi32((1U << 5) - 1U);
  mask6 = _mm_set1_epi32((1U << 6) - 1U);
  mask7 = _mm_set1_epi32((1U << 7) - 1U);
  mask8 = _mm_set1_epi32((1U << 8) - 1U);
  mask9 = _mm_set1_epi32((1U << 9) - 1U);
  mask10 = _mm_set1_epi32((1U << 10) - 1U);
  mask11 = _mm_set1_epi32((1U << 11) - 1U);
  mask12 = _mm_set1_epi32((1U << 12) - 1U);
  mask13 = _mm_set1_epi32((1U << 13) - 1U);
  mask14 = _mm_set1_epi32((1U << 14) - 1U);
  mask15 = _mm_set1_epi32((1U << 15) - 1U);
  mask16 = _mm_set1_epi32((1U << 16) - 1U);
  mask17 = _mm_set1_epi32((1U << 17) - 1U);
  mask18 = _mm_set1_epi32((1U << 18) - 1U);
  mask19 = _mm_set1_epi32((1U << 19) - 1U);
  mask20 = _mm_set1_epi32((1U << 20) - 1U);
  mask21 = _mm_set1_epi32((1U << 21) - 1U);
  mask22 = _mm_set1_epi32((1U << 22) - 1U);
  mask23 = _mm_set1_epi32((1U << 23) - 1U);
  mask24 = _mm_set1_epi32((1U << 24) - 1U);
  mask25 = _mm_set1_epi32((1U << 25) - 1U);
  mask26 = _mm_set1_epi32((1U << 26) - 1U);
  mask27 = _mm_set1_epi32((1U << 27) - 1U);
  mask28 = _mm_set1_epi32((1U << 28) - 1U);
  mask29 = _mm_set1_epi32((1U << 29) - 1U);
  mask30 = _mm_set1_epi32((1U << 30) - 1U);
  mask31 = _mm_set1_epi32((1U << 31) - 1U);
#endif
}
211
212 #ifdef ALLOW_ODD_PACKSIZES
213 /* nwritten = 1 * 4 = 4 unsigned ints */
214 static int
write_01_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)215 write_01_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
216 const __m128i *in = (const __m128i *) _in;
217 __m128i OutReg;
218
219 __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask1);
220 OutReg = InReg;
221 InReg = _mm_and_si128(_mm_load_si128(++in), mask1);
222
223 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
224 InReg = _mm_and_si128(_mm_load_si128(++in), mask1);
225
226 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
227 InReg = _mm_and_si128(_mm_load_si128(++in), mask1);
228
229 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
230 InReg = _mm_and_si128(_mm_load_si128(++in), mask1);
231
232 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
233 InReg = _mm_and_si128(_mm_load_si128(++in), mask1);
234
235 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
236 InReg = _mm_and_si128(_mm_load_si128(++in), mask1);
237
238 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
239 InReg = _mm_and_si128(_mm_load_si128(++in), mask1);
240
241 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
242 InReg = _mm_and_si128(_mm_load_si128(++in), mask1);
243
244 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
245 InReg = _mm_and_si128(_mm_load_si128(++in), mask1);
246
247 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
248 InReg = _mm_and_si128(_mm_load_si128(++in), mask1);
249
250 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
251 InReg = _mm_and_si128(_mm_load_si128(++in), mask1);
252
253 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
254 InReg = _mm_and_si128(_mm_load_si128(++in), mask1);
255
256 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
257 InReg = _mm_and_si128(_mm_load_si128(++in), mask1);
258
259 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
260 InReg = _mm_and_si128(_mm_load_si128(++in), mask1);
261
262 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
263 InReg = _mm_and_si128(_mm_load_si128(++in), mask1);
264
265 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
266 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
267 OutReg);
268
269 return strm_buffer_i;
270 }
271 #endif
272
273
274 #ifdef HAVE_SSE2
275 /* nwritten = 1 * 4 = 4 unsigned ints */
276 static int
write_02_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)277 write_02_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
278 const __m128i *in = (const __m128i *) _in;
279 __m128i OutReg;
280
281 __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask2);
282 OutReg = InReg;
283 InReg = _mm_and_si128(_mm_load_si128(++in), mask2);
284
285 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
286 InReg = _mm_and_si128(_mm_load_si128(++in), mask2);
287
288 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
289 InReg = _mm_and_si128(_mm_load_si128(++in), mask2);
290
291 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
292 InReg = _mm_and_si128(_mm_load_si128(++in), mask2);
293
294 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
295 InReg = _mm_and_si128(_mm_load_si128(++in), mask2);
296
297 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
298 InReg = _mm_and_si128(_mm_load_si128(++in), mask2);
299
300 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
301 InReg = _mm_and_si128(_mm_load_si128(++in), mask2);
302
303 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
304 InReg = _mm_and_si128(_mm_load_si128(++in), mask2);
305
306 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
307 InReg = _mm_and_si128(_mm_load_si128(++in), mask2);
308
309 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
310 InReg = _mm_and_si128(_mm_load_si128(++in), mask2);
311
312 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
313 InReg = _mm_and_si128(_mm_load_si128(++in), mask2);
314
315 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
316 InReg = _mm_and_si128(_mm_load_si128(++in), mask2);
317
318 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
319 InReg = _mm_and_si128(_mm_load_si128(++in), mask2);
320
321 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
322 InReg = _mm_and_si128(_mm_load_si128(++in), mask2);
323
324 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
325 InReg = _mm_and_si128(_mm_load_si128(++in), mask2);
326
327 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
328 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
329 OutReg);
330
331 return strm_buffer_i;
332 }
333 #endif
334
335 static int
pack_02_horiz(UINT4 * out,const UINT4 * in)336 pack_02_horiz (UINT4 *out, const UINT4 *in) {
337 int column;
338
339 for (column = 0; column < 4; column++) {
340 *out |= (*in) % (1U << 2 ) ;
341 ++in;
342 *out |= ( (*in) % (1U << 2 ) ) << 2 ;
343 ++in;
344 *out |= ( (*in) % (1U << 2 ) ) << 4 ;
345 ++in;
346 *out |= ( (*in) % (1U << 2 ) ) << 6 ;
347 ++in;
348 *out |= ( (*in) % (1U << 2 ) ) << 8 ;
349 ++in;
350 *out |= ( (*in) % (1U << 2 ) ) << 10 ;
351 ++in;
352 *out |= ( (*in) % (1U << 2 ) ) << 12 ;
353 ++in;
354 *out |= ( (*in) % (1U << 2 ) ) << 14 ;
355 ++in;
356 *out |= ( (*in) % (1U << 2 ) ) << 16 ;
357 ++in;
358 *out |= ( (*in) % (1U << 2 ) ) << 18 ;
359 ++in;
360 *out |= ( (*in) % (1U << 2 ) ) << 20 ;
361 ++in;
362 *out |= ( (*in) % (1U << 2 ) ) << 22 ;
363 ++in;
364 *out |= ( (*in) % (1U << 2 ) ) << 24 ;
365 ++in;
366 *out |= ( (*in) % (1U << 2 ) ) << 26 ;
367 ++in;
368 *out |= ( (*in) % (1U << 2 ) ) << 28 ;
369 ++in;
370 *out |= ( (*in) % (1U << 2 ) ) << 30 ;
371 ++out;
372 ++in;
373 }
374
375 return 4;
376 }
377
378
379
380 #ifdef ALLOW_ODD_PACKSIZES
381 /* nwritten = 2 * 4 = 8 unsigned ints */
382 static int
write_03_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)383 write_03_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
384 const __m128i *in = (const __m128i *) _in;
385 __m128i OutReg;
386
387 __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask3);
388 OutReg = InReg;
389 InReg = _mm_and_si128(_mm_load_si128(++in), mask3);
390
391 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
392 InReg = _mm_and_si128(_mm_load_si128(++in), mask3);
393
394 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
395 InReg = _mm_and_si128(_mm_load_si128(++in), mask3);
396
397 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
398 InReg = _mm_and_si128(_mm_load_si128(++in), mask3);
399
400 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
401 InReg = _mm_and_si128(_mm_load_si128(++in), mask3);
402
403 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
404 InReg = _mm_and_si128(_mm_load_si128(++in), mask3);
405
406 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
407 InReg = _mm_and_si128(_mm_load_si128(++in), mask3);
408
409 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
410 InReg = _mm_and_si128(_mm_load_si128(++in), mask3);
411
412 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
413 InReg = _mm_and_si128(_mm_load_si128(++in), mask3);
414
415 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
416 InReg = _mm_and_si128(_mm_load_si128(++in), mask3);
417
418 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
419 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
420 OutReg);
421
422 OutReg = _mm_srli_epi32(InReg, 3 - 1);
423 InReg = _mm_and_si128(_mm_load_si128(++in), mask3);
424
425 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
426 InReg = _mm_and_si128(_mm_load_si128(++in), mask3);
427
428 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
429 InReg = _mm_and_si128(_mm_load_si128(++in), mask3);
430
431 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
432 InReg = _mm_and_si128(_mm_load_si128(++in), mask3);
433
434 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
435 InReg = _mm_and_si128(_mm_load_si128(++in), mask3);
436
437 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
438 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
439 OutReg);
440
441 return strm_buffer_i;
442 }
443 #endif
444
445
446
447 #ifdef ALLOW_ODD_PACKSIZES
448 /* nwritten = 3 * 4 = 12 unsigned ints */
449 static int
write_05_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)450 write_05_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
451 const __m128i *in = (const __m128i *) _in;
452 __m128i OutReg;
453
454 __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask5);
455 OutReg = InReg;
456 InReg = _mm_and_si128(_mm_load_si128(++in), mask5);
457
458 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
459 InReg = _mm_and_si128(_mm_load_si128(++in), mask5);
460
461 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
462 InReg = _mm_and_si128(_mm_load_si128(++in), mask5);
463
464 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
465 InReg = _mm_and_si128(_mm_load_si128(++in), mask5);
466
467 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
468 InReg = _mm_and_si128(_mm_load_si128(++in), mask5);
469
470 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
471 InReg = _mm_and_si128(_mm_load_si128(++in), mask5);
472
473 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
474 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
475 OutReg);
476
477 OutReg = _mm_srli_epi32(InReg, 5 - 3);
478 InReg = _mm_and_si128(_mm_load_si128(++in), mask5);
479
480 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
481 InReg = _mm_and_si128(_mm_load_si128(++in), mask5);
482
483 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
484 InReg = _mm_and_si128(_mm_load_si128(++in), mask5);
485
486 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
487 InReg = _mm_and_si128(_mm_load_si128(++in), mask5);
488
489 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
490 InReg = _mm_and_si128(_mm_load_si128(++in), mask5);
491
492 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
493 InReg = _mm_and_si128(_mm_load_si128(++in), mask5);
494
495 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
496 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
497 OutReg);
498
499 OutReg = _mm_srli_epi32(InReg, 5 - 1);
500 InReg = _mm_and_si128(_mm_load_si128(++in), mask5);
501
502 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
503 InReg = _mm_and_si128(_mm_load_si128(++in), mask5);
504
505 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
506 InReg = _mm_and_si128(_mm_load_si128(++in), mask5);
507
508 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
509 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
510 OutReg);
511
512 return strm_buffer_i;
513 }
514 #endif
515
516
517 #ifdef HAVE_SSE2
518 /* nwritten = 3 * 4 = 12 unsigned ints */
519 static int
write_06_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)520 write_06_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
521 const __m128i *in = (const __m128i *) _in;
522 __m128i OutReg;
523
524 __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask6);
525 OutReg = InReg;
526 InReg = _mm_and_si128(_mm_load_si128(++in), mask6);
527
528 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
529 InReg = _mm_and_si128(_mm_load_si128(++in), mask6);
530
531 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
532 InReg = _mm_and_si128(_mm_load_si128(++in), mask6);
533
534 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
535 InReg = _mm_and_si128(_mm_load_si128(++in), mask6);
536
537 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
538 InReg = _mm_and_si128(_mm_load_si128(++in), mask6);
539
540 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
541 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
542 OutReg);
543
544 OutReg = _mm_srli_epi32(InReg, 6 - 4);
545 InReg = _mm_and_si128(_mm_load_si128(++in), mask6);
546
547 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
548 InReg = _mm_and_si128(_mm_load_si128(++in), mask6);
549
550 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
551 InReg = _mm_and_si128(_mm_load_si128(++in), mask6);
552
553 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
554 InReg = _mm_and_si128(_mm_load_si128(++in), mask6);
555
556 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
557 InReg = _mm_and_si128(_mm_load_si128(++in), mask6);
558
559 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
560 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
561 OutReg);
562
563 OutReg = _mm_srli_epi32(InReg, 6 - 2);
564 InReg = _mm_and_si128(_mm_load_si128(++in), mask6);
565
566 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
567 InReg = _mm_and_si128(_mm_load_si128(++in), mask6);
568
569 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
570 InReg = _mm_and_si128(_mm_load_si128(++in), mask6);
571
572 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
573 InReg = _mm_and_si128(_mm_load_si128(++in), mask6);
574
575 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
576 InReg = _mm_and_si128(_mm_load_si128(++in), mask6);
577
578 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
579 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
580 OutReg);
581
582 return strm_buffer_i;
583 }
584 #endif
585
586 static int
pack_06_horiz(UINT4 * out,const UINT4 * in)587 pack_06_horiz (UINT4 *out, const UINT4 *in) {
588 int column;
589
590 for (column = 0; column < 4; column++) {
591 *out |= (*in) % (1U << 6 ) ;
592 ++in;
593 *out |= ( (*in) % (1U << 6 ) ) << 6 ;
594 ++in;
595 *out |= ( (*in) % (1U << 6 ) ) << 12 ;
596 ++in;
597 *out |= ( (*in) % (1U << 6 ) ) << 18 ;
598 ++in;
599 *out |= ( (*in) % (1U << 6 ) ) << 24 ;
600 ++in;
601 *out |= ( (*in) % (1U << 6 ) ) << 30 ;
602 ++out;
603 *out |= ( (*in) % (1U << 6 ) ) >> ( 6 - 4 );
604 ++in;
605 *out |= ( (*in) % (1U << 6 ) ) << 4 ;
606 ++in;
607 *out |= ( (*in) % (1U << 6 ) ) << 10 ;
608 ++in;
609 *out |= ( (*in) % (1U << 6 ) ) << 16 ;
610 ++in;
611 *out |= ( (*in) % (1U << 6 ) ) << 22 ;
612 ++in;
613 *out |= ( (*in) % (1U << 6 ) ) << 28 ;
614 ++out;
615 *out |= ( (*in) % (1U << 6 ) ) >> ( 6 - 2 );
616 ++in;
617 *out |= ( (*in) % (1U << 6 ) ) << 2 ;
618 ++in;
619 *out |= ( (*in) % (1U << 6 ) ) << 8 ;
620 ++in;
621 *out |= ( (*in) % (1U << 6 ) ) << 14 ;
622 ++in;
623 *out |= ( (*in) % (1U << 6 ) ) << 20 ;
624 ++in;
625 *out |= ( (*in) % (1U << 6 ) ) << 26 ;
626 ++out;
627 ++in;
628 }
629
630 return 12;
631 }
632
633
634
635 #ifdef ALLOW_ODD_PACKSIZES
636 /* nwritten = 4 * 4 = 16 unsigned ints */
637 static int
write_07_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)638 write_07_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
639 const __m128i *in = (const __m128i *) _in;
640 __m128i OutReg;
641
642 __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask7);
643 OutReg = InReg;
644 InReg = _mm_and_si128(_mm_load_si128(++in), mask7);
645
646 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
647 InReg = _mm_and_si128(_mm_load_si128(++in), mask7);
648
649 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
650 InReg = _mm_and_si128(_mm_load_si128(++in), mask7);
651
652 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
653 InReg = _mm_and_si128(_mm_load_si128(++in), mask7);
654
655 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
656 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
657 OutReg);
658
659 OutReg = _mm_srli_epi32(InReg, 7 - 3);
660 InReg = _mm_and_si128(_mm_load_si128(++in), mask7);
661
662 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
663 InReg = _mm_and_si128(_mm_load_si128(++in), mask7);
664
665 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
666 InReg = _mm_and_si128(_mm_load_si128(++in), mask7);
667
668 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
669 InReg = _mm_and_si128(_mm_load_si128(++in), mask7);
670
671 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
672 InReg = _mm_and_si128(_mm_load_si128(++in), mask7);
673
674 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
675 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
676 OutReg);
677
678 OutReg = _mm_srli_epi32(InReg, 7 - 6);
679 InReg = _mm_and_si128(_mm_load_si128(++in), mask7);
680
681 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
682 InReg = _mm_and_si128(_mm_load_si128(++in), mask7);
683
684 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
685 InReg = _mm_and_si128(_mm_load_si128(++in), mask7);
686
687 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
688 InReg = _mm_and_si128(_mm_load_si128(++in), mask7);
689
690 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
691 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
692 OutReg);
693
694 OutReg = _mm_srli_epi32(InReg, 7 - 2);
695 InReg = _mm_and_si128(_mm_load_si128(++in), mask7);
696
697 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
698 InReg = _mm_and_si128(_mm_load_si128(++in), mask7);
699
700 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
701 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
702 OutReg);
703
704 return strm_buffer_i;
705 }
706 #endif
707
708
709 #ifdef ALLOW_ODD_PACKSIZES
710 /* nwritten = 5 * 4 = 20 unsigned ints */
711 static int
write_09_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)712 write_09_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
713 const __m128i *in = (const __m128i *) _in;
714 __m128i OutReg;
715
716 __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask9);
717 OutReg = InReg;
718 InReg = _mm_and_si128(_mm_load_si128(++in), mask9);
719
720 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
721 InReg = _mm_and_si128(_mm_load_si128(++in), mask9);
722
723 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
724 InReg = _mm_and_si128(_mm_load_si128(++in), mask9);
725
726 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
727 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
728 OutReg);
729
730 OutReg = _mm_srli_epi32(InReg, 9 - 4);
731 InReg = _mm_and_si128(_mm_load_si128(++in), mask9);
732
733 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
734 InReg = _mm_and_si128(_mm_load_si128(++in), mask9);
735
736 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
737 InReg = _mm_and_si128(_mm_load_si128(++in), mask9);
738
739 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
740 InReg = _mm_and_si128(_mm_load_si128(++in), mask9);
741
742 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
743 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
744 OutReg);
745
746 OutReg = _mm_srli_epi32(InReg, 9 - 8);
747 InReg = _mm_and_si128(_mm_load_si128(++in), mask9);
748
749 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
750 InReg = _mm_and_si128(_mm_load_si128(++in), mask9);
751
752 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
753 InReg = _mm_and_si128(_mm_load_si128(++in), mask9);
754
755 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
756 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
757 OutReg);
758
759 OutReg = _mm_srli_epi32(InReg, 9 - 3);
760 InReg = _mm_and_si128(_mm_load_si128(++in), mask9);
761
762 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
763 InReg = _mm_and_si128(_mm_load_si128(++in), mask9);
764
765 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
766 InReg = _mm_and_si128(_mm_load_si128(++in), mask9);
767
768 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
769 InReg = _mm_and_si128(_mm_load_si128(++in), mask9);
770
771 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
772 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
773 OutReg);
774
775 OutReg = _mm_srli_epi32(InReg, 9 - 7);
776 InReg = _mm_and_si128(_mm_load_si128(++in), mask9);
777
778 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
779 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
780 OutReg);
781
782 return strm_buffer_i;
783 }
784 #endif
785
786
787 #ifdef HAVE_SSE2
788 /* nwritten = 5 * 4 = 20 unsigned ints */
789 static int
write_10_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)790 write_10_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
791 const __m128i *in = (const __m128i *) _in;
792 __m128i OutReg;
793
794 __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask10);
795 OutReg = InReg;
796 InReg = _mm_and_si128(_mm_load_si128(++in), mask10);
797
798 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
799 InReg = _mm_and_si128(_mm_load_si128(++in), mask10);
800
801 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
802 InReg = _mm_and_si128(_mm_load_si128(++in), mask10);
803
804 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
805 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
806 OutReg);
807
808 OutReg = _mm_srli_epi32(InReg, 10 - 8);
809 InReg = _mm_and_si128(_mm_load_si128(++in), mask10);
810
811 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
812 InReg = _mm_and_si128(_mm_load_si128(++in), mask10);
813
814 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
815 InReg = _mm_and_si128(_mm_load_si128(++in), mask10);
816
817 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
818 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
819 OutReg);
820
821 OutReg = _mm_srli_epi32(InReg, 10 - 6);
822 InReg = _mm_and_si128(_mm_load_si128(++in), mask10);
823
824 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
825 InReg = _mm_and_si128(_mm_load_si128(++in), mask10);
826
827 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
828 InReg = _mm_and_si128(_mm_load_si128(++in), mask10);
829
830 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
831 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
832 OutReg);
833
834 OutReg = _mm_srli_epi32(InReg, 10 - 4);
835 InReg = _mm_and_si128(_mm_load_si128(++in), mask10);
836
837 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
838 InReg = _mm_and_si128(_mm_load_si128(++in), mask10);
839
840 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
841 InReg = _mm_and_si128(_mm_load_si128(++in), mask10);
842
843 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
844 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
845 OutReg);
846
847 OutReg = _mm_srli_epi32(InReg, 10 - 2);
848 InReg = _mm_and_si128(_mm_load_si128(++in), mask10);
849
850 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
851 InReg = _mm_and_si128(_mm_load_si128(++in), mask10);
852
853 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
854 InReg = _mm_and_si128(_mm_load_si128(++in), mask10);
855
856 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
857 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
858 OutReg);
859
860 return strm_buffer_i;
861 }
862 #endif
863
864 static int
pack_10_horiz(UINT4 * out,const UINT4 * in)865 pack_10_horiz (UINT4 *out, const UINT4 *in) {
866 int column;
867
868 for (column = 0; column < 4; column++) {
869 *out |= (*in) % (1U << 10 ) ;
870 ++in;
871 *out |= ( (*in) % (1U << 10 ) ) << 10 ;
872 ++in;
873 *out |= ( (*in) % (1U << 10 ) ) << 20 ;
874 ++in;
875 *out |= ( (*in) % (1U << 10 ) ) << 30 ;
876 ++out;
877 *out |= ( (*in) % (1U << 10 ) ) >> ( 10 - 8 );
878 ++in;
879 *out |= ( (*in) % (1U << 10 ) ) << 8 ;
880 ++in;
881 *out |= ( (*in) % (1U << 10 ) ) << 18 ;
882 ++in;
883 *out |= ( (*in) % (1U << 10 ) ) << 28 ;
884 ++out;
885 *out |= ( (*in) % (1U << 10 ) ) >> ( 10 - 6 );
886 ++in;
887 *out |= ( (*in) % (1U << 10 ) ) << 6 ;
888 ++in;
889 *out |= ( (*in) % (1U << 10 ) ) << 16 ;
890 ++in;
891 *out |= ( (*in) % (1U << 10 ) ) << 26 ;
892 ++out;
893 *out |= ( (*in) % (1U << 10 ) ) >> ( 10 - 4 );
894 ++in;
895 *out |= ( (*in) % (1U << 10 ) ) << 4 ;
896 ++in;
897 *out |= ( (*in) % (1U << 10 ) ) << 14 ;
898 ++in;
899 *out |= ( (*in) % (1U << 10 ) ) << 24 ;
900 ++out;
901 *out |= ( (*in) % (1U << 10 ) ) >> ( 10 - 2 );
902 ++in;
903 *out |= ( (*in) % (1U << 10 ) ) << 2 ;
904 ++in;
905 *out |= ( (*in) % (1U << 10 ) ) << 12 ;
906 ++in;
907 *out |= ( (*in) % (1U << 10 ) ) << 22 ;
908 ++out;
909 ++in;
910 }
911
912 return 20;
913 }
914
915
916
917
918 #ifdef ALLOW_ODD_PACKSIZES
919 /* nwritten = 6 * 4 = 24 unsigned ints */
920 static int
write_11_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)921 write_11_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
922 const __m128i *in = (const __m128i *) _in;
923 __m128i OutReg;
924
925 __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask11);
926 OutReg = InReg;
927 InReg = _mm_and_si128(_mm_load_si128(++in), mask11);
928
929 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
930 InReg = _mm_and_si128(_mm_load_si128(++in), mask11);
931
932 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
933 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
934 OutReg);
935
936 OutReg = _mm_srli_epi32(InReg, 11 - 1);
937 InReg = _mm_and_si128(_mm_load_si128(++in), mask11);
938
939 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
940 InReg = _mm_and_si128(_mm_load_si128(++in), mask11);
941
942 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
943 InReg = _mm_and_si128(_mm_load_si128(++in), mask11);
944
945 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
946 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
947 OutReg);
948
949 OutReg = _mm_srli_epi32(InReg, 11 - 2);
950 InReg = _mm_and_si128(_mm_load_si128(++in), mask11);
951
952 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
953 InReg = _mm_and_si128(_mm_load_si128(++in), mask11);
954
955 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
956 InReg = _mm_and_si128(_mm_load_si128(++in), mask11);
957
958 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
959 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
960 OutReg);
961
962 OutReg = _mm_srli_epi32(InReg, 11 - 3);
963 InReg = _mm_and_si128(_mm_load_si128(++in), mask11);
964
965 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
966 InReg = _mm_and_si128(_mm_load_si128(++in), mask11);
967
968 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
969 InReg = _mm_and_si128(_mm_load_si128(++in), mask11);
970
971 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
972 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
973 OutReg);
974
975 OutReg = _mm_srli_epi32(InReg, 11 - 4);
976 InReg = _mm_and_si128(_mm_load_si128(++in), mask11);
977
978 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
979 InReg = _mm_and_si128(_mm_load_si128(++in), mask11);
980
981 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
982 InReg = _mm_and_si128(_mm_load_si128(++in), mask11);
983
984 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
985 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
986 OutReg);
987
988 OutReg = _mm_srli_epi32(InReg, 11 - 5);
989 InReg = _mm_and_si128(_mm_load_si128(++in), mask11);
990
991 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
992 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
993 OutReg);
994
995 return strm_buffer_i;
996 }
997 #endif
998
999
1000 #ifdef HAVE_SSE2
1001 /* nwritten = 6 * 4 = 24 unsigned ints */
1002 static int
write_12_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)1003 write_12_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
1004 const __m128i *in = (const __m128i *) _in;
1005 __m128i OutReg;
1006
1007 __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask12);
1008 OutReg = InReg;
1009 InReg = _mm_and_si128(_mm_load_si128(++in), mask12);
1010
1011 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
1012 InReg = _mm_and_si128(_mm_load_si128(++in), mask12);
1013
1014 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
1015 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1016 OutReg);
1017
1018 OutReg = _mm_srli_epi32(InReg, 12 - 4);
1019 InReg = _mm_and_si128(_mm_load_si128(++in), mask12);
1020
1021 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
1022 InReg = _mm_and_si128(_mm_load_si128(++in), mask12);
1023
1024 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
1025 InReg = _mm_and_si128(_mm_load_si128(++in), mask12);
1026
1027 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
1028 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1029 OutReg);
1030
1031 OutReg = _mm_srli_epi32(InReg, 12 - 8);
1032 InReg = _mm_and_si128(_mm_load_si128(++in), mask12);
1033
1034 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
1035 InReg = _mm_and_si128(_mm_load_si128(++in), mask12);
1036
1037 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
1038 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1039 OutReg);
1040
1041 InReg = _mm_and_si128(_mm_load_si128(++in), mask12);
1042
1043 OutReg = InReg;
1044 InReg = _mm_and_si128(_mm_load_si128(++in), mask12);
1045
1046 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
1047 InReg = _mm_and_si128(_mm_load_si128(++in), mask12);
1048
1049 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
1050 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1051 OutReg);
1052
1053 OutReg = _mm_srli_epi32(InReg, 12 - 4);
1054 InReg = _mm_and_si128(_mm_load_si128(++in), mask12);
1055
1056 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
1057 InReg = _mm_and_si128(_mm_load_si128(++in), mask12);
1058
1059 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
1060 InReg = _mm_and_si128(_mm_load_si128(++in), mask12);
1061
1062 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
1063 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1064 OutReg);
1065
1066 OutReg = _mm_srli_epi32(InReg, 12 - 8);
1067 InReg = _mm_and_si128(_mm_load_si128(++in), mask12);
1068
1069 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
1070 InReg = _mm_and_si128(_mm_load_si128(++in), mask12);
1071
1072 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
1073 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1074 OutReg);
1075
1076 return strm_buffer_i;
1077 }
1078 #endif
1079
1080 static int
pack_12_horiz(UINT4 * out,const UINT4 * in)1081 pack_12_horiz (UINT4 *out, const UINT4 *in) {
1082 int column;
1083
1084 for (column = 0; column < 4; column++) {
1085
1086 *out |= (*in) % (1U << 12 ) ;
1087 ++in;
1088 *out |= ( (*in) % (1U << 12 ) ) << 12 ;
1089 ++in;
1090 *out |= ( (*in) % (1U << 12 ) ) << 24 ;
1091 ++out;
1092 *out |= ( (*in) % (1U << 12 ) ) >> ( 12 - 4 );
1093 ++in;
1094 *out |= ( (*in) % (1U << 12 ) ) << 4 ;
1095 ++in;
1096 *out |= ( (*in) % (1U << 12 ) ) << 16 ;
1097 ++in;
1098 *out |= ( (*in) % (1U << 12 ) ) << 28 ;
1099 ++out;
1100 *out |= ( (*in) % (1U << 12 ) ) >> ( 12 - 8 );
1101 ++in;
1102 *out |= ( (*in) % (1U << 12 ) ) << 8 ;
1103 ++in;
1104 *out |= ( (*in) % (1U << 12 ) ) << 20 ;
1105 ++out;
1106 ++in;
1107 *out |= (*in) % (1U << 12 ) ;
1108 ++in;
1109 *out |= ( (*in) % (1U << 12 ) ) << 12 ;
1110 ++in;
1111 *out |= ( (*in) % (1U << 12 ) ) << 24 ;
1112 ++out;
1113 *out |= ( (*in) % (1U << 12 ) ) >> ( 12 - 4 );
1114 ++in;
1115 *out |= ( (*in) % (1U << 12 ) ) << 4 ;
1116 ++in;
1117 *out |= ( (*in) % (1U << 12 ) ) << 16 ;
1118 ++in;
1119 *out |= ( (*in) % (1U << 12 ) ) << 28 ;
1120 ++out;
1121 *out |= ( (*in) % (1U << 12 ) ) >> ( 12 - 8 );
1122 ++in;
1123 *out |= ( (*in) % (1U << 12 ) ) << 8 ;
1124 ++in;
1125 *out |= ( (*in) % (1U << 12 ) ) << 20 ;
1126 ++out;
1127 ++in;
1128 }
1129
1130 return 24;
1131 }
1132
1133
1134
1135
1136 #ifdef ALLOW_ODD_PACKSIZES
1137 /* nwritten = 7 * 4 = 28 unsigned ints */
1138 static int
write_13_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)1139 write_13_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
1140 const __m128i *in = (const __m128i *) _in;
1141 __m128i OutReg;
1142
1143 __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask13);
1144 OutReg = InReg;
1145 InReg = _mm_and_si128(_mm_load_si128(++in), mask13);
1146
1147 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
1148 InReg = _mm_and_si128(_mm_load_si128(++in), mask13);
1149
1150 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
1151 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1152 OutReg);
1153
1154 OutReg = _mm_srli_epi32(InReg, 13 - 7);
1155 InReg = _mm_and_si128(_mm_load_si128(++in), mask13);
1156
1157 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
1158 InReg = _mm_and_si128(_mm_load_si128(++in), mask13);
1159
1160 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
1161 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1162 OutReg);
1163
1164 OutReg = _mm_srli_epi32(InReg, 13 - 1);
1165 InReg = _mm_and_si128(_mm_load_si128(++in), mask13);
1166
1167 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
1168 InReg = _mm_and_si128(_mm_load_si128(++in), mask13);
1169
1170 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
1171 InReg = _mm_and_si128(_mm_load_si128(++in), mask13);
1172
1173 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
1174 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1175 OutReg);
1176
1177 OutReg = _mm_srli_epi32(InReg, 13 - 8);
1178 InReg = _mm_and_si128(_mm_load_si128(++in), mask13);
1179
1180 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
1181 InReg = _mm_and_si128(_mm_load_si128(++in), mask13);
1182
1183 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
1184 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1185 OutReg);
1186
1187 OutReg = _mm_srli_epi32(InReg, 13 - 2);
1188 InReg = _mm_and_si128(_mm_load_si128(++in), mask13);
1189
1190 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
1191 InReg = _mm_and_si128(_mm_load_si128(++in), mask13);
1192
1193 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
1194 InReg = _mm_and_si128(_mm_load_si128(++in), mask13);
1195
1196 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
1197 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1198 OutReg);
1199
1200 OutReg = _mm_srli_epi32(InReg, 13 - 9);
1201 InReg = _mm_and_si128(_mm_load_si128(++in), mask13);
1202
1203 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
1204 InReg = _mm_and_si128(_mm_load_si128(++in), mask13);
1205
1206 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
1207 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1208 OutReg);
1209
1210 OutReg = _mm_srli_epi32(InReg, 13 - 3);
1211 InReg = _mm_and_si128(_mm_load_si128(++in), mask13);
1212
1213 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
1214 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1215 OutReg);
1216
1217 return strm_buffer_i;
1218 }
1219 #endif
1220
1221
1222 #ifdef HAVE_SSE2
1223 /* nwritten = 7 * 4 = 28 unsigned ints */
1224 static int
write_14_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)1225 write_14_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
1226 const __m128i *in = (const __m128i *) _in;
1227 __m128i OutReg;
1228
1229 __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask14);
1230 OutReg = InReg;
1231 InReg = _mm_and_si128(_mm_load_si128(++in), mask14);
1232
1233 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
1234 InReg = _mm_and_si128(_mm_load_si128(++in), mask14);
1235
1236 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
1237 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1238 OutReg);
1239
1240 OutReg = _mm_srli_epi32(InReg, 14 - 10);
1241 InReg = _mm_and_si128(_mm_load_si128(++in), mask14);
1242
1243 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
1244 InReg = _mm_and_si128(_mm_load_si128(++in), mask14);
1245
1246 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
1247 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1248 OutReg);
1249
1250 OutReg = _mm_srli_epi32(InReg, 14 - 6);
1251 InReg = _mm_and_si128(_mm_load_si128(++in), mask14);
1252
1253 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
1254 InReg = _mm_and_si128(_mm_load_si128(++in), mask14);
1255
1256 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
1257 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1258 OutReg);
1259
1260 OutReg = _mm_srli_epi32(InReg, 14 - 2);
1261 InReg = _mm_and_si128(_mm_load_si128(++in), mask14);
1262
1263 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
1264 InReg = _mm_and_si128(_mm_load_si128(++in), mask14);
1265
1266 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
1267 InReg = _mm_and_si128(_mm_load_si128(++in), mask14);
1268
1269 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
1270 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1271 OutReg);
1272
1273 OutReg = _mm_srli_epi32(InReg, 14 - 12);
1274 InReg = _mm_and_si128(_mm_load_si128(++in), mask14);
1275
1276 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
1277 InReg = _mm_and_si128(_mm_load_si128(++in), mask14);
1278
1279 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
1280 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1281 OutReg);
1282
1283 OutReg = _mm_srli_epi32(InReg, 14 - 8);
1284 InReg = _mm_and_si128(_mm_load_si128(++in), mask14);
1285
1286 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
1287 InReg = _mm_and_si128(_mm_load_si128(++in), mask14);
1288
1289 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
1290 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1291 OutReg);
1292
1293 OutReg = _mm_srli_epi32(InReg, 14 - 4);
1294 InReg = _mm_and_si128(_mm_load_si128(++in), mask14);
1295
1296 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
1297 InReg = _mm_and_si128(_mm_load_si128(++in), mask14);
1298
1299 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
1300 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1301 OutReg);
1302
1303 return strm_buffer_i;
1304 }
1305 #endif
1306
1307
1308 static int
pack_14_horiz(UINT4 * out,const UINT4 * in)1309 pack_14_horiz (UINT4 *out, const UINT4 *in) {
1310 int column;
1311
1312 for (column = 0; column < 4; column++) {
1313 *out |= (*in) % (1U << 14 ) ;
1314 ++in;
1315 *out |= ( (*in) % (1U << 14 ) ) << 14 ;
1316 ++in;
1317 *out |= ( (*in) % (1U << 14 ) ) << 28 ;
1318 ++out;
1319 *out |= ( (*in) % (1U << 14 ) ) >> ( 14 - 10 );
1320 ++in;
1321 *out |= ( (*in) % (1U << 14 ) ) << 10 ;
1322 ++in;
1323 *out |= ( (*in) % (1U << 14 ) ) << 24 ;
1324 ++out;
1325 *out |= ( (*in) % (1U << 14 ) ) >> ( 14 - 6 );
1326 ++in;
1327 *out |= ( (*in) % (1U << 14 ) ) << 6 ;
1328 ++in;
1329 *out |= ( (*in) % (1U << 14 ) ) << 20 ;
1330 ++out;
1331 *out |= ( (*in) % (1U << 14 ) ) >> ( 14 - 2 );
1332 ++in;
1333 *out |= ( (*in) % (1U << 14 ) ) << 2 ;
1334 ++in;
1335 *out |= ( (*in) % (1U << 14 ) ) << 16 ;
1336 ++in;
1337 *out |= ( (*in) % (1U << 14 ) ) << 30 ;
1338 ++out;
1339 *out |= ( (*in) % (1U << 14 ) ) >> ( 14 - 12 );
1340 ++in;
1341 *out |= ( (*in) % (1U << 14 ) ) << 12 ;
1342 ++in;
1343 *out |= ( (*in) % (1U << 14 ) ) << 26 ;
1344 ++out;
1345 *out |= ( (*in) % (1U << 14 ) ) >> ( 14 - 8 );
1346 ++in;
1347 *out |= ( (*in) % (1U << 14 ) ) << 8 ;
1348 ++in;
1349 *out |= ( (*in) % (1U << 14 ) ) << 22 ;
1350 ++out;
1351 *out |= ( (*in) % (1U << 14 ) ) >> ( 14 - 4 );
1352 ++in;
1353 *out |= ( (*in) % (1U << 14 ) ) << 4 ;
1354 ++in;
1355 *out |= ( (*in) % (1U << 14 ) ) << 18 ;
1356 ++out;
1357 ++in;
1358 }
1359
1360 return 28;
1361 }
1362
1363
1364 #ifdef ALLOW_ODD_PACKSIZES
1365 /* nwritten = 8 * 4 = 32 unsigned ints */
1366 static int
write_15_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)1367 write_15_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
1368 const __m128i *in = (const __m128i *) _in;
1369 __m128i OutReg;
1370
1371 __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask15);
1372 OutReg = InReg;
1373 InReg = _mm_and_si128(_mm_load_si128(++in), mask15);
1374
1375 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
1376 InReg = _mm_and_si128(_mm_load_si128(++in), mask15);
1377
1378 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
1379 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1380 OutReg);
1381
1382 OutReg = _mm_srli_epi32(InReg, 15 - 13);
1383 InReg = _mm_and_si128(_mm_load_si128(++in), mask15);
1384
1385 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
1386 InReg = _mm_and_si128(_mm_load_si128(++in), mask15);
1387
1388 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
1389 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1390 OutReg);
1391
1392 OutReg = _mm_srli_epi32(InReg, 15 - 11);
1393 InReg = _mm_and_si128(_mm_load_si128(++in), mask15);
1394
1395 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
1396 InReg = _mm_and_si128(_mm_load_si128(++in), mask15);
1397
1398 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
1399 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1400 OutReg);
1401
1402 OutReg = _mm_srli_epi32(InReg, 15 - 9);
1403 InReg = _mm_and_si128(_mm_load_si128(++in), mask15);
1404
1405 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
1406 InReg = _mm_and_si128(_mm_load_si128(++in), mask15);
1407
1408 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
1409 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1410 OutReg);
1411
1412 OutReg = _mm_srli_epi32(InReg, 15 - 7);
1413 InReg = _mm_and_si128(_mm_load_si128(++in), mask15);
1414
1415 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
1416 InReg = _mm_and_si128(_mm_load_si128(++in), mask15);
1417
1418 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
1419 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1420 OutReg);
1421
1422 OutReg = _mm_srli_epi32(InReg, 15 - 5);
1423 InReg = _mm_and_si128(_mm_load_si128(++in), mask15);
1424
1425 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
1426 InReg = _mm_and_si128(_mm_load_si128(++in), mask15);
1427
1428 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
1429 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1430 OutReg);
1431
1432 OutReg = _mm_srli_epi32(InReg, 15 - 3);
1433 InReg = _mm_and_si128(_mm_load_si128(++in), mask15);
1434
1435 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
1436 InReg = _mm_and_si128(_mm_load_si128(++in), mask15);
1437
1438 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
1439 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1440 OutReg);
1441
1442 OutReg = _mm_srli_epi32(InReg, 15 - 1);
1443 InReg = _mm_and_si128(_mm_load_si128(++in), mask15);
1444
1445 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
1446 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1447 OutReg);
1448
1449 return strm_buffer_i;
1450 }
1451 #endif
1452
1453
1454
1455 #ifdef ALLOW_ODD_PACKSIZES
1456 /* nwritten = 9 * 4 = 36 unsigned ints */
1457 static int
write_17_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)1458 write_17_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
1459 const __m128i *in = (const __m128i *) _in;
1460 __m128i OutReg;
1461
1462 __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask17);
1463 OutReg = InReg;
1464 InReg = _mm_and_si128(_mm_load_si128(++in), mask17);
1465
1466 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
1467 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1468 OutReg);
1469
1470 OutReg = _mm_srli_epi32(InReg, 17 - 2);
1471 InReg = _mm_and_si128(_mm_load_si128(++in), mask17);
1472
1473 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
1474 InReg = _mm_and_si128(_mm_load_si128(++in), mask17);
1475
1476 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
1477 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1478 OutReg);
1479
1480 OutReg = _mm_srli_epi32(InReg, 17 - 4);
1481 InReg = _mm_and_si128(_mm_load_si128(++in), mask17);
1482
1483 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
1484 InReg = _mm_and_si128(_mm_load_si128(++in), mask17);
1485
1486 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
1487 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1488 OutReg);
1489
1490 OutReg = _mm_srli_epi32(InReg, 17 - 6);
1491 InReg = _mm_and_si128(_mm_load_si128(++in), mask17);
1492
1493 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
1494 InReg = _mm_and_si128(_mm_load_si128(++in), mask17);
1495
1496 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
1497 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1498 OutReg);
1499
1500 OutReg = _mm_srli_epi32(InReg, 17 - 8);
1501 InReg = _mm_and_si128(_mm_load_si128(++in), mask17);
1502
1503 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
1504 InReg = _mm_and_si128(_mm_load_si128(++in), mask17);
1505
1506 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
1507 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1508 OutReg);
1509
1510 OutReg = _mm_srli_epi32(InReg, 17 - 10);
1511 InReg = _mm_and_si128(_mm_load_si128(++in), mask17);
1512
1513 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
1514 InReg = _mm_and_si128(_mm_load_si128(++in), mask17);
1515
1516 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
1517 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1518 OutReg);
1519
1520 OutReg = _mm_srli_epi32(InReg, 17 - 12);
1521 InReg = _mm_and_si128(_mm_load_si128(++in), mask17);
1522
1523 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
1524 InReg = _mm_and_si128(_mm_load_si128(++in), mask17);
1525
1526 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
1527 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1528 OutReg);
1529
1530 OutReg = _mm_srli_epi32(InReg, 17 - 14);
1531 InReg = _mm_and_si128(_mm_load_si128(++in), mask17);
1532
1533 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
1534 InReg = _mm_and_si128(_mm_load_si128(++in), mask17);
1535
1536 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
1537 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1538 OutReg);
1539
1540
1541 OutReg = _mm_srli_epi32(InReg, 17 - 16);
1542 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1543 OutReg);
1544
1545 return strm_buffer_i;
1546 }
1547 #endif
1548
1549
1550
1551 #ifdef HAVE_SSE2
1552 /* nwritten = 9 * 4 = 36 unsigned ints */
1553 static int
write_18_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)1554 write_18_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
1555 const __m128i *in = (const __m128i *) _in;
1556 __m128i OutReg;
1557
1558 __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask18);
1559 OutReg = InReg;
1560 InReg = _mm_and_si128(_mm_load_si128(++in), mask18);
1561
1562 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
1563 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1564 OutReg);
1565
1566 OutReg = _mm_srli_epi32(InReg, 18 - 4);
1567 InReg = _mm_and_si128(_mm_load_si128(++in), mask18);
1568
1569 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
1570 InReg = _mm_and_si128(_mm_load_si128(++in), mask18);
1571
1572 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
1573 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1574 OutReg);
1575
1576 OutReg = _mm_srli_epi32(InReg, 18 - 8);
1577 InReg = _mm_and_si128(_mm_load_si128(++in), mask18);
1578
1579 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
1580 InReg = _mm_and_si128(_mm_load_si128(++in), mask18);
1581
1582 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
1583 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1584 OutReg);
1585
1586 OutReg = _mm_srli_epi32(InReg, 18 - 12);
1587 InReg = _mm_and_si128(_mm_load_si128(++in), mask18);
1588
1589 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
1590 InReg = _mm_and_si128(_mm_load_si128(++in), mask18);
1591
1592 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
1593 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1594 OutReg);
1595
1596 OutReg = _mm_srli_epi32(InReg, 18 - 16);
1597 InReg = _mm_and_si128(_mm_load_si128(++in), mask18);
1598
1599 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
1600 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1601 OutReg);
1602
1603 OutReg = _mm_srli_epi32(InReg, 18 - 2);
1604 InReg = _mm_and_si128(_mm_load_si128(++in), mask18);
1605
1606 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
1607 InReg = _mm_and_si128(_mm_load_si128(++in), mask18);
1608
1609 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
1610 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1611 OutReg);
1612
1613 OutReg = _mm_srli_epi32(InReg, 18 - 6);
1614 InReg = _mm_and_si128(_mm_load_si128(++in), mask18);
1615
1616 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
1617 InReg = _mm_and_si128(_mm_load_si128(++in), mask18);
1618
1619 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
1620 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1621 OutReg);
1622
1623 OutReg = _mm_srli_epi32(InReg, 18 - 10);
1624 InReg = _mm_and_si128(_mm_load_si128(++in), mask18);
1625
1626 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
1627 InReg = _mm_and_si128(_mm_load_si128(++in), mask18);
1628
1629 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
1630 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1631 OutReg);
1632
1633 OutReg = _mm_srli_epi32(InReg, 18 - 14);
1634 InReg = _mm_and_si128(_mm_load_si128(++in), mask18);
1635
1636 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
1637 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1638 OutReg);
1639
1640 return strm_buffer_i;
1641 }
1642 #endif
1643
1644 static int
pack_18_horiz(UINT4 * out,const UINT4 * in)1645 pack_18_horiz (UINT4 *out, const UINT4 *in) {
1646 int column;
1647
1648 for (column = 0; column < 4; column++) {
1649 *out |= (*in) % (1U << 18 ) ;
1650 ++in;
1651 *out |= ( (*in) % (1U << 18 ) ) << 18 ;
1652 ++out;
1653 *out |= ( (*in) % (1U << 18 ) ) >> ( 18 - 4 );
1654 ++in;
1655 *out |= ( (*in) % (1U << 18 ) ) << 4 ;
1656 ++in;
1657 *out |= ( (*in) % (1U << 18 ) ) << 22 ;
1658 ++out;
1659 *out |= ( (*in) % (1U << 18 ) ) >> ( 18 - 8 );
1660 ++in;
1661 *out |= ( (*in) % (1U << 18 ) ) << 8 ;
1662 ++in;
1663 *out |= ( (*in) % (1U << 18 ) ) << 26 ;
1664 ++out;
1665 *out |= ( (*in) % (1U << 18 ) ) >> ( 18 - 12 );
1666 ++in;
1667 *out |= ( (*in) % (1U << 18 ) ) << 12 ;
1668 ++in;
1669 *out |= ( (*in) % (1U << 18 ) ) << 30 ;
1670 ++out;
1671 *out |= ( (*in) % (1U << 18 ) ) >> ( 18 - 16 );
1672 ++in;
1673 *out |= ( (*in) % (1U << 18 ) ) << 16 ;
1674 ++out;
1675 *out |= ( (*in) % (1U << 18 ) ) >> ( 18 - 2 );
1676 ++in;
1677 *out |= ( (*in) % (1U << 18 ) ) << 2 ;
1678 ++in;
1679 *out |= ( (*in) % (1U << 18 ) ) << 20 ;
1680 ++out;
1681 *out |= ( (*in) % (1U << 18 ) ) >> ( 18 - 6 );
1682 ++in;
1683 *out |= ( (*in) % (1U << 18 ) ) << 6 ;
1684 ++in;
1685 *out |= ( (*in) % (1U << 18 ) ) << 24 ;
1686 ++out;
1687 *out |= ( (*in) % (1U << 18 ) ) >> ( 18 - 10 );
1688 ++in;
1689 *out |= ( (*in) % (1U << 18 ) ) << 10 ;
1690 ++in;
1691 *out |= ( (*in) % (1U << 18 ) ) << 28 ;
1692 ++out;
1693 *out |= ( (*in) % (1U << 18 ) ) >> ( 18 - 14 );
1694 ++in;
1695 *out |= ( (*in) % (1U << 18 ) ) << 14 ;
1696 ++out;
1697 ++in;
1698 }
1699
1700 return 36;
1701 }
1702
1703
1704
1705 #ifdef ALLOW_ODD_PACKSIZES
1706 /* nwritten = 10 * 4 = 40 unsigned ints */
1707 static int
write_19_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)1708 write_19_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
1709 const __m128i *in = (const __m128i *) _in;
1710 __m128i OutReg;
1711
1712 __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask19);
1713 OutReg = InReg;
1714 InReg = _mm_and_si128(_mm_load_si128(++in), mask19);
1715
1716 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
1717 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1718 OutReg);
1719
1720 OutReg = _mm_srli_epi32(InReg, 19 - 6);
1721 InReg = _mm_and_si128(_mm_load_si128(++in), mask19);
1722
1723 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
1724 InReg = _mm_and_si128(_mm_load_si128(++in), mask19);
1725
1726 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
1727 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1728 OutReg);
1729
1730 OutReg = _mm_srli_epi32(InReg, 19 - 12);
1731 InReg = _mm_and_si128(_mm_load_si128(++in), mask19);
1732
1733 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
1734 InReg = _mm_and_si128(_mm_load_si128(++in), mask19);
1735
1736 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
1737 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1738 OutReg);
1739
1740 OutReg = _mm_srli_epi32(InReg, 19 - 18);
1741 InReg = _mm_and_si128(_mm_load_si128(++in), mask19);
1742
1743 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
1744 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1745 OutReg);
1746
1747 OutReg = _mm_srli_epi32(InReg, 19 - 5);
1748 InReg = _mm_and_si128(_mm_load_si128(++in), mask19);
1749
1750 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
1751 InReg = _mm_and_si128(_mm_load_si128(++in), mask19);
1752
1753 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
1754 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1755 OutReg);
1756
1757 OutReg = _mm_srli_epi32(InReg, 19 - 11);
1758 InReg = _mm_and_si128(_mm_load_si128(++in), mask19);
1759
1760 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
1761 InReg = _mm_and_si128(_mm_load_si128(++in), mask19);
1762
1763 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
1764 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1765 OutReg);
1766
1767 OutReg = _mm_srli_epi32(InReg, 19 - 17);
1768 InReg = _mm_and_si128(_mm_load_si128(++in), mask19);
1769
1770 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
1771 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1772 OutReg);
1773
1774 OutReg = _mm_srli_epi32(InReg, 19 - 4);
1775 InReg = _mm_and_si128(_mm_load_si128(++in), mask19);
1776
1777 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
1778 InReg = _mm_and_si128(_mm_load_si128(++in), mask19);
1779
1780 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
1781 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1782 OutReg);
1783
1784 OutReg = _mm_srli_epi32(InReg, 19 - 10);
1785 InReg = _mm_and_si128(_mm_load_si128(++in), mask19);
1786
1787 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
1788 InReg = _mm_and_si128(_mm_load_si128(++in), mask19);
1789
1790 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
1791 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1792 OutReg);
1793
1794
1795 OutReg = _mm_srli_epi32(InReg, 19 - 16);
1796 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1797 OutReg);
1798
1799 return strm_buffer_i;
1800 }
1801 #endif
1802
1803
1804 #ifdef HAVE_SSE2
1805 /* nwritten = 10 * 4 = 40 unsigned ints */
1806 static int
write_20_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)1807 write_20_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
1808 const __m128i *in = (const __m128i *) _in;
1809 __m128i OutReg;
1810
1811 __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask20);
1812 OutReg = InReg;
1813 InReg = _mm_and_si128(_mm_load_si128(++in), mask20);
1814
1815 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
1816 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1817 OutReg);
1818
1819 OutReg = _mm_srli_epi32(InReg, 20 - 8);
1820 InReg = _mm_and_si128(_mm_load_si128(++in), mask20);
1821
1822 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
1823 InReg = _mm_and_si128(_mm_load_si128(++in), mask20);
1824
1825 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
1826 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1827 OutReg);
1828
1829 OutReg = _mm_srli_epi32(InReg, 20 - 16);
1830 InReg = _mm_and_si128(_mm_load_si128(++in), mask20);
1831
1832 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
1833 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1834 OutReg);
1835
1836 OutReg = _mm_srli_epi32(InReg, 20 - 4);
1837 InReg = _mm_and_si128(_mm_load_si128(++in), mask20);
1838
1839 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
1840 InReg = _mm_and_si128(_mm_load_si128(++in), mask20);
1841
1842 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
1843 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1844 OutReg);
1845
1846 OutReg = _mm_srli_epi32(InReg, 20 - 12);
1847 InReg = _mm_and_si128(_mm_load_si128(++in), mask20);
1848
1849 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
1850 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1851 OutReg);
1852
1853 InReg = _mm_and_si128(_mm_load_si128(++in), mask20);
1854
1855 OutReg = InReg;
1856 InReg = _mm_and_si128(_mm_load_si128(++in), mask20);
1857
1858 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
1859 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1860 OutReg);
1861
1862 OutReg = _mm_srli_epi32(InReg, 20 - 8);
1863 InReg = _mm_and_si128(_mm_load_si128(++in), mask20);
1864
1865 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
1866 InReg = _mm_and_si128(_mm_load_si128(++in), mask20);
1867
1868 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
1869 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1870 OutReg);
1871
1872 OutReg = _mm_srli_epi32(InReg, 20 - 16);
1873 InReg = _mm_and_si128(_mm_load_si128(++in), mask20);
1874
1875 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
1876 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1877 OutReg);
1878
1879 OutReg = _mm_srli_epi32(InReg, 20 - 4);
1880 InReg = _mm_and_si128(_mm_load_si128(++in), mask20);
1881
1882 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
1883 InReg = _mm_and_si128(_mm_load_si128(++in), mask20);
1884
1885 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
1886 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1887 OutReg);
1888
1889 OutReg = _mm_srli_epi32(InReg, 20 - 12);
1890 InReg = _mm_and_si128(_mm_load_si128(++in), mask20);
1891
1892 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
1893 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1894 OutReg);
1895
1896 return strm_buffer_i;
1897 }
1898 #endif
1899
/* Packs 64 input values, 20 bits each, into 40 output words in horizontal
   format: value i occupies bits [20*i, 20*i + 20) of the output bit
   stream, least-significant bit first, so a value may straddle two
   consecutive words.

   Only the low 20 bits of each input are used.  Bits are ORed into out,
   so anything already set there stays set.

   Returns the number of output words covered (always 40). */
static int
pack_20_horiz (UINT4 *out, const UINT4 *in) {
  enum { NBITS = 20, NVALUES = 64, WORDBITS = 32 };
  const UINT4 lowbits = (1U << NBITS) - 1U;
  UINT4 value;
  int shift = 0;		/* bit offset of the next value in *out */
  int i;

  for (i = 0; i < NVALUES; i++) {
    value = in[i] & lowbits;
    *out |= value << shift;

    shift += NBITS;
    if (shift >= WORDBITS) {
      /* Current word is full; shift now counts the bits that spilled over */
      ++out;
      shift -= WORDBITS;
      if (shift > 0) {
	*out |= value >> (NBITS - shift);
      }
    }
  }

  return 40;
}
1959
1960
1961
1962 #ifdef ALLOW_ODD_PACKSIZES
1963 /* nwritten = 11 * 4 = 44 unsigned ints */
1964 static int
write_21_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)1965 write_21_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
1966 const __m128i *in = (const __m128i *) _in;
1967 __m128i OutReg;
1968
1969 __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask21);
1970 OutReg = InReg;
1971 InReg = _mm_and_si128(_mm_load_si128(++in), mask21);
1972
1973 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
1974 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1975 OutReg);
1976
1977 OutReg = _mm_srli_epi32(InReg, 21 - 10);
1978 InReg = _mm_and_si128(_mm_load_si128(++in), mask21);
1979
1980 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
1981 InReg = _mm_and_si128(_mm_load_si128(++in), mask21);
1982
1983 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
1984 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1985 OutReg);
1986
1987 OutReg = _mm_srli_epi32(InReg, 21 - 20);
1988 InReg = _mm_and_si128(_mm_load_si128(++in), mask21);
1989
1990 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
1991 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1992 OutReg);
1993
1994 OutReg = _mm_srli_epi32(InReg, 21 - 9);
1995 InReg = _mm_and_si128(_mm_load_si128(++in), mask21);
1996
1997 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
1998 InReg = _mm_and_si128(_mm_load_si128(++in), mask21);
1999
2000 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
2001 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2002 OutReg);
2003
2004 OutReg = _mm_srli_epi32(InReg, 21 - 19);
2005 InReg = _mm_and_si128(_mm_load_si128(++in), mask21);
2006
2007 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
2008 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2009 OutReg);
2010
2011 OutReg = _mm_srli_epi32(InReg, 21 - 8);
2012 InReg = _mm_and_si128(_mm_load_si128(++in), mask21);
2013
2014 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
2015 InReg = _mm_and_si128(_mm_load_si128(++in), mask21);
2016
2017 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
2018 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2019 OutReg);
2020
2021 OutReg = _mm_srli_epi32(InReg, 21 - 18);
2022 InReg = _mm_and_si128(_mm_load_si128(++in), mask21);
2023
2024 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
2025 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2026 OutReg);
2027
2028 OutReg = _mm_srli_epi32(InReg, 21 - 7);
2029 InReg = _mm_and_si128(_mm_load_si128(++in), mask21);
2030
2031 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
2032 InReg = _mm_and_si128(_mm_load_si128(++in), mask21);
2033
2034 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
2035 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2036 OutReg);
2037
2038 OutReg = _mm_srli_epi32(InReg, 21 - 17);
2039 InReg = _mm_and_si128(_mm_load_si128(++in), mask21);
2040
2041 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
2042 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2043 OutReg);
2044
2045 OutReg = _mm_srli_epi32(InReg, 21 - 6);
2046 InReg = _mm_and_si128(_mm_load_si128(++in), mask21);
2047
2048 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
2049 InReg = _mm_and_si128(_mm_load_si128(++in), mask21);
2050
2051 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
2052 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2053 OutReg);
2054
2055
2056 OutReg = _mm_srli_epi32(InReg, 21 - 16);
2057 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2058 OutReg);
2059
2060 return strm_buffer_i;
2061 }
2062 #endif
2063
2064
2065 #ifdef HAVE_SSE2
2066 /* nwritten = 11 * 4 = 44 unsigned ints */
2067 static int
write_22_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)2068 write_22_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
2069 const __m128i *in = (const __m128i *) _in;
2070 __m128i OutReg;
2071
2072 __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask22);
2073 OutReg = InReg;
2074 InReg = _mm_and_si128(_mm_load_si128(++in), mask22);
2075
2076 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
2077 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2078 OutReg);
2079
2080 OutReg = _mm_srli_epi32(InReg, 22 - 12);
2081 InReg = _mm_and_si128(_mm_load_si128(++in), mask22);
2082
2083 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
2084 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2085 OutReg);
2086
2087 OutReg = _mm_srli_epi32(InReg, 22 - 2);
2088 InReg = _mm_and_si128(_mm_load_si128(++in), mask22);
2089
2090 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
2091 InReg = _mm_and_si128(_mm_load_si128(++in), mask22);
2092
2093 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
2094 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2095 OutReg);
2096
2097 OutReg = _mm_srli_epi32(InReg, 22 - 14);
2098 InReg = _mm_and_si128(_mm_load_si128(++in), mask22);
2099
2100 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
2101 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2102 OutReg);
2103
2104 OutReg = _mm_srli_epi32(InReg, 22 - 4);
2105 InReg = _mm_and_si128(_mm_load_si128(++in), mask22);
2106
2107 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
2108 InReg = _mm_and_si128(_mm_load_si128(++in), mask22);
2109
2110 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
2111 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2112 OutReg);
2113
2114 OutReg = _mm_srli_epi32(InReg, 22 - 16);
2115 InReg = _mm_and_si128(_mm_load_si128(++in), mask22);
2116
2117 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
2118 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2119 OutReg);
2120
2121 OutReg = _mm_srli_epi32(InReg, 22 - 6);
2122 InReg = _mm_and_si128(_mm_load_si128(++in), mask22);
2123
2124 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
2125 InReg = _mm_and_si128(_mm_load_si128(++in), mask22);
2126
2127 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
2128 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2129 OutReg);
2130
2131 OutReg = _mm_srli_epi32(InReg, 22 - 18);
2132 InReg = _mm_and_si128(_mm_load_si128(++in), mask22);
2133
2134 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
2135 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2136 OutReg);
2137
2138 OutReg = _mm_srli_epi32(InReg, 22 - 8);
2139 InReg = _mm_and_si128(_mm_load_si128(++in), mask22);
2140
2141 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
2142 InReg = _mm_and_si128(_mm_load_si128(++in), mask22);
2143
2144 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
2145 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2146 OutReg);
2147
2148 OutReg = _mm_srli_epi32(InReg, 22 - 20);
2149 InReg = _mm_and_si128(_mm_load_si128(++in), mask22);
2150
2151 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
2152 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2153 OutReg);
2154
2155 OutReg = _mm_srli_epi32(InReg, 22 - 10);
2156 InReg = _mm_and_si128(_mm_load_si128(++in), mask22);
2157
2158 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
2159 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2160 OutReg);
2161
2162 return strm_buffer_i;
2163 }
2164 #endif
2165
/* Packs 64 input values, 22 bits each, into 44 output words in horizontal
   format: value i occupies bits [22*i, 22*i + 22) of the output bit
   stream, least-significant bit first, so a value may straddle two
   consecutive words.

   Only the low 22 bits of each input are used.  Bits are ORed into out,
   so anything already set there stays set.

   Returns the number of output words covered (always 44). */
static int
pack_22_horiz (UINT4 *out, const UINT4 *in) {
  enum { NBITS = 22, NVALUES = 64, WORDBITS = 32 };
  const UINT4 lowbits = (1U << NBITS) - 1U;
  UINT4 value;
  int shift = 0;		/* bit offset of the next value in *out */
  int i;

  for (i = 0; i < NVALUES; i++) {
    value = in[i] & lowbits;
    *out |= value << shift;

    shift += NBITS;
    if (shift >= WORDBITS) {
      /* Current word is full; shift now counts the bits that spilled over */
      ++out;
      shift -= WORDBITS;
      if (shift > 0) {
	*out |= value >> (NBITS - shift);
      }
    }
  }

  return 44;
}
2228
2229
2230 #ifdef ALLOW_ODD_PACKSIZES
2231 /* nwritten = 12 * 4 = 48 unsigned ints */
2232 static int
write_23_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)2233 write_23_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
2234 const __m128i *in = (const __m128i *) _in;
2235 __m128i OutReg;
2236
2237 __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask23);
2238 OutReg = InReg;
2239 InReg = _mm_and_si128(_mm_load_si128(++in), mask23);
2240
2241 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
2242 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2243 OutReg);
2244
2245 OutReg = _mm_srli_epi32(InReg, 23 - 14);
2246 InReg = _mm_and_si128(_mm_load_si128(++in), mask23);
2247
2248 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
2249 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2250 OutReg);
2251
2252 OutReg = _mm_srli_epi32(InReg, 23 - 5);
2253 InReg = _mm_and_si128(_mm_load_si128(++in), mask23);
2254
2255 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
2256 InReg = _mm_and_si128(_mm_load_si128(++in), mask23);
2257
2258 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
2259 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2260 OutReg);
2261
2262 OutReg = _mm_srli_epi32(InReg, 23 - 19);
2263 InReg = _mm_and_si128(_mm_load_si128(++in), mask23);
2264
2265 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
2266 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2267 OutReg);
2268
2269 OutReg = _mm_srli_epi32(InReg, 23 - 10);
2270 InReg = _mm_and_si128(_mm_load_si128(++in), mask23);
2271
2272 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
2273 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2274 OutReg);
2275
2276 OutReg = _mm_srli_epi32(InReg, 23 - 1);
2277 InReg = _mm_and_si128(_mm_load_si128(++in), mask23);
2278
2279 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
2280 InReg = _mm_and_si128(_mm_load_si128(++in), mask23);
2281
2282 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
2283 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2284 OutReg);
2285
2286 OutReg = _mm_srli_epi32(InReg, 23 - 15);
2287 InReg = _mm_and_si128(_mm_load_si128(++in), mask23);
2288
2289 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
2290 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2291 OutReg);
2292
2293 OutReg = _mm_srli_epi32(InReg, 23 - 6);
2294 InReg = _mm_and_si128(_mm_load_si128(++in), mask23);
2295
2296 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
2297 InReg = _mm_and_si128(_mm_load_si128(++in), mask23);
2298
2299 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
2300 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2301 OutReg);
2302
2303 OutReg = _mm_srli_epi32(InReg, 23 - 20);
2304 InReg = _mm_and_si128(_mm_load_si128(++in), mask23);
2305
2306 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
2307 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2308 OutReg);
2309
2310 OutReg = _mm_srli_epi32(InReg, 23 - 11);
2311 InReg = _mm_and_si128(_mm_load_si128(++in), mask23);
2312
2313 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
2314 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2315 OutReg);
2316
2317 OutReg = _mm_srli_epi32(InReg, 23 - 2);
2318 InReg = _mm_and_si128(_mm_load_si128(++in), mask23);
2319
2320 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
2321 InReg = _mm_and_si128(_mm_load_si128(++in), mask23);
2322
2323 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
2324 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2325 OutReg);
2326
2327 OutReg = _mm_srli_epi32(InReg, 23 - 16);
2328 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2329 OutReg);
2330
2331 return strm_buffer_i;
2332 }
2333 #endif
2334
2335
2336 #ifdef HAVE_SSE2
2337 /* nwritten = 12 * 4 = 48 unsigned ints */
2338 static int
write_24_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)2339 write_24_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
2340 const __m128i *in = (const __m128i *) _in;
2341 __m128i OutReg;
2342
2343 __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask24);
2344 OutReg = InReg;
2345 InReg = _mm_and_si128(_mm_load_si128(++in), mask24);
2346
2347 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
2348 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2349 OutReg);
2350
2351 OutReg = _mm_srli_epi32(InReg, 24 - 16);
2352 InReg = _mm_and_si128(_mm_load_si128(++in), mask24);
2353
2354 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
2355 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2356 OutReg);
2357
2358 OutReg = _mm_srli_epi32(InReg, 24 - 8);
2359 InReg = _mm_and_si128(_mm_load_si128(++in), mask24);
2360
2361 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
2362 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2363 OutReg);
2364
2365 InReg = _mm_and_si128(_mm_load_si128(++in), mask24);
2366
2367 OutReg = InReg;
2368 InReg = _mm_and_si128(_mm_load_si128(++in), mask24);
2369
2370 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
2371 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2372 OutReg);
2373
2374 OutReg = _mm_srli_epi32(InReg, 24 - 16);
2375 InReg = _mm_and_si128(_mm_load_si128(++in), mask24);
2376
2377 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
2378 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2379 OutReg);
2380
2381 OutReg = _mm_srli_epi32(InReg, 24 - 8);
2382 InReg = _mm_and_si128(_mm_load_si128(++in), mask24);
2383
2384 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
2385 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2386 OutReg);
2387
2388 InReg = _mm_and_si128(_mm_load_si128(++in), mask24);
2389
2390 OutReg = InReg;
2391 InReg = _mm_and_si128(_mm_load_si128(++in), mask24);
2392
2393 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
2394 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2395 OutReg);
2396
2397 OutReg = _mm_srli_epi32(InReg, 24 - 16);
2398 InReg = _mm_and_si128(_mm_load_si128(++in), mask24);
2399
2400 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
2401 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2402 OutReg);
2403
2404 OutReg = _mm_srli_epi32(InReg, 24 - 8);
2405 InReg = _mm_and_si128(_mm_load_si128(++in), mask24);
2406
2407 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
2408 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2409 OutReg);
2410
2411 InReg = _mm_and_si128(_mm_load_si128(++in), mask24);
2412
2413 OutReg = InReg;
2414 InReg = _mm_and_si128(_mm_load_si128(++in), mask24);
2415
2416 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
2417 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2418 OutReg);
2419
2420 OutReg = _mm_srli_epi32(InReg, 24 - 16);
2421 InReg = _mm_and_si128(_mm_load_si128(++in), mask24);
2422
2423 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
2424 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2425 OutReg);
2426
2427 OutReg = _mm_srli_epi32(InReg, 24 - 8);
2428 InReg = _mm_and_si128(_mm_load_si128(++in), mask24);
2429
2430 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
2431 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2432 OutReg);
2433
2434 return strm_buffer_i;
2435 }
2436 #endif
2437
/* Packs 64 input values, 24 bits each, into 48 output words in horizontal
   format: value i occupies bits [24*i, 24*i + 24) of the output bit
   stream, least-significant bit first, so a value may straddle two
   consecutive words.

   Only the low 24 bits of each input are used.  Bits are ORed into out,
   so anything already set there stays set.

   Returns the number of output words covered (always 48). */
static int
pack_24_horiz (UINT4 *out, const UINT4 *in) {
  enum { NBITS = 24, NVALUES = 64, WORDBITS = 32 };
  const UINT4 lowbits = (1U << NBITS) - 1U;
  UINT4 value;
  int shift = 0;		/* bit offset of the next value in *out */
  int i;

  for (i = 0; i < NVALUES; i++) {
    value = in[i] & lowbits;
    *out |= value << shift;

    shift += NBITS;
    if (shift >= WORDBITS) {
      /* Current word is full; shift now counts the bits that spilled over */
      ++out;
      shift -= WORDBITS;
      if (shift > 0) {
	*out |= value >> (NBITS - shift);
      }
    }
  }

  return 48;
}
2499
2500
2501 #ifdef ALLOW_ODD_PACKSIZES
2502 /* nwritten = 13 * 4 = 52 unsigned ints */
2503 static int
write_25_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)2504 write_25_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
2505 const __m128i *in = (const __m128i *) _in;
2506 __m128i OutReg;
2507
2508 __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask25);
2509 OutReg = InReg;
2510 InReg = _mm_and_si128(_mm_load_si128(++in), mask25);
2511
2512 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
2513 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2514 OutReg);
2515
2516 OutReg = _mm_srli_epi32(InReg, 25 - 18);
2517 InReg = _mm_and_si128(_mm_load_si128(++in), mask25);
2518
2519 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
2520 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2521 OutReg);
2522
2523 OutReg = _mm_srli_epi32(InReg, 25 - 11);
2524 InReg = _mm_and_si128(_mm_load_si128(++in), mask25);
2525
2526 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
2527 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2528 OutReg);
2529
2530 OutReg = _mm_srli_epi32(InReg, 25 - 4);
2531 InReg = _mm_and_si128(_mm_load_si128(++in), mask25);
2532
2533 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
2534 InReg = _mm_and_si128(_mm_load_si128(++in), mask25);
2535
2536 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
2537 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2538 OutReg);
2539
2540 OutReg = _mm_srli_epi32(InReg, 25 - 22);
2541 InReg = _mm_and_si128(_mm_load_si128(++in), mask25);
2542
2543 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
2544 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2545 OutReg);
2546
2547 OutReg = _mm_srli_epi32(InReg, 25 - 15);
2548 InReg = _mm_and_si128(_mm_load_si128(++in), mask25);
2549
2550 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
2551 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2552 OutReg);
2553
2554 OutReg = _mm_srli_epi32(InReg, 25 - 8);
2555 InReg = _mm_and_si128(_mm_load_si128(++in), mask25);
2556
2557 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
2558 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2559 OutReg);
2560
2561 OutReg = _mm_srli_epi32(InReg, 25 - 1);
2562 InReg = _mm_and_si128(_mm_load_si128(++in), mask25);
2563
2564 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
2565 InReg = _mm_and_si128(_mm_load_si128(++in), mask25);
2566
2567 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
2568 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2569 OutReg);
2570
2571 OutReg = _mm_srli_epi32(InReg, 25 - 19);
2572 InReg = _mm_and_si128(_mm_load_si128(++in), mask25);
2573
2574 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
2575 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2576 OutReg);
2577
2578 OutReg = _mm_srli_epi32(InReg, 25 - 12);
2579 InReg = _mm_and_si128(_mm_load_si128(++in), mask25);
2580
2581 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
2582 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2583 OutReg);
2584
2585 OutReg = _mm_srli_epi32(InReg, 25 - 5);
2586 InReg = _mm_and_si128(_mm_load_si128(++in), mask25);
2587
2588 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
2589 InReg = _mm_and_si128(_mm_load_si128(++in), mask25);
2590
2591 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
2592 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2593 OutReg);
2594
2595 OutReg = _mm_srli_epi32(InReg, 25 - 23);
2596 InReg = _mm_and_si128(_mm_load_si128(++in), mask25);
2597
2598 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
2599 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2600 OutReg);
2601
2602
2603 OutReg = _mm_srli_epi32(InReg, 25 - 16);
2604 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2605 OutReg);
2606
2607 return strm_buffer_i;
2608 }
2609 #endif
2610
2611
2612 #ifdef HAVE_SSE2
2613 /* nwritten = 13 * 4 = 52 unsigned ints */
2614 static int
write_26_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)2615 write_26_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
2616 const __m128i *in = (const __m128i *) _in;
2617 __m128i OutReg;
2618
2619 __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask26);
2620 OutReg = InReg;
2621 InReg = _mm_and_si128(_mm_load_si128(++in), mask26);
2622
2623 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
2624 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2625 OutReg);
2626
2627 OutReg = _mm_srli_epi32(InReg, 26 - 20);
2628 InReg = _mm_and_si128(_mm_load_si128(++in), mask26);
2629
2630 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
2631 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2632 OutReg);
2633
2634 OutReg = _mm_srli_epi32(InReg, 26 - 14);
2635 InReg = _mm_and_si128(_mm_load_si128(++in), mask26);
2636
2637 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
2638 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2639 OutReg);
2640
2641 OutReg = _mm_srli_epi32(InReg, 26 - 8);
2642 InReg = _mm_and_si128(_mm_load_si128(++in), mask26);
2643
2644 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
2645 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2646 OutReg);
2647
2648 OutReg = _mm_srli_epi32(InReg, 26 - 2);
2649 InReg = _mm_and_si128(_mm_load_si128(++in), mask26);
2650
2651 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
2652 InReg = _mm_and_si128(_mm_load_si128(++in), mask26);
2653
2654 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
2655 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2656 OutReg);
2657
2658 OutReg = _mm_srli_epi32(InReg, 26 - 22);
2659 InReg = _mm_and_si128(_mm_load_si128(++in), mask26);
2660
2661 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
2662 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2663 OutReg);
2664
2665 OutReg = _mm_srli_epi32(InReg, 26 - 16);
2666 InReg = _mm_and_si128(_mm_load_si128(++in), mask26);
2667
2668 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
2669 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2670 OutReg);
2671
2672 OutReg = _mm_srli_epi32(InReg, 26 - 10);
2673 InReg = _mm_and_si128(_mm_load_si128(++in), mask26);
2674
2675 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
2676 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2677 OutReg);
2678
2679 OutReg = _mm_srli_epi32(InReg, 26 - 4);
2680 InReg = _mm_and_si128(_mm_load_si128(++in), mask26);
2681
2682 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
2683 InReg = _mm_and_si128(_mm_load_si128(++in), mask26);
2684
2685 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
2686 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2687 OutReg);
2688
2689 OutReg = _mm_srli_epi32(InReg, 26 - 24);
2690 InReg = _mm_and_si128(_mm_load_si128(++in), mask26);
2691
2692 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
2693 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2694 OutReg);
2695
2696 OutReg = _mm_srli_epi32(InReg, 26 - 18);
2697 InReg = _mm_and_si128(_mm_load_si128(++in), mask26);
2698
2699 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
2700 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2701 OutReg);
2702
2703 OutReg = _mm_srli_epi32(InReg, 26 - 12);
2704 InReg = _mm_and_si128(_mm_load_si128(++in), mask26);
2705
2706 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
2707 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2708 OutReg);
2709
2710 OutReg = _mm_srli_epi32(InReg, 26 - 6);
2711 InReg = _mm_and_si128(_mm_load_si128(++in), mask26);
2712
2713 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
2714 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2715 OutReg);
2716
2717 return strm_buffer_i;
2718 }
2719 #endif
2720
2721
2722 static int
pack_26_horiz(UINT4 * out,const UINT4 * in)2723 pack_26_horiz (UINT4 *out, const UINT4 *in) {
2724 int column;
2725
2726 for (column = 0; column < 4; column++) {
2727 *out |= (*in) % (1U << 26 ) ;
2728 ++in;
2729 *out |= ( (*in) % (1U << 26 ) ) << 26 ;
2730 ++out;
2731 *out |= ( (*in) % (1U << 26 ) ) >> ( 26 - 20 );
2732 ++in;
2733 *out |= ( (*in) % (1U << 26 ) ) << 20 ;
2734 ++out;
2735 *out |= ( (*in) % (1U << 26 ) ) >> ( 26 - 14 );
2736 ++in;
2737 *out |= ( (*in) % (1U << 26 ) ) << 14 ;
2738 ++out;
2739 *out |= ( (*in) % (1U << 26 ) ) >> ( 26 - 8 );
2740 ++in;
2741 *out |= ( (*in) % (1U << 26 ) ) << 8 ;
2742 ++out;
2743 *out |= ( (*in) % (1U << 26 ) ) >> ( 26 - 2 );
2744 ++in;
2745 *out |= ( (*in) % (1U << 26 ) ) << 2 ;
2746 ++in;
2747 *out |= ( (*in) % (1U << 26 ) ) << 28 ;
2748 ++out;
2749 *out |= ( (*in) % (1U << 26 ) ) >> ( 26 - 22 );
2750 ++in;
2751 *out |= ( (*in) % (1U << 26 ) ) << 22 ;
2752 ++out;
2753 *out |= ( (*in) % (1U << 26 ) ) >> ( 26 - 16 );
2754 ++in;
2755 *out |= ( (*in) % (1U << 26 ) ) << 16 ;
2756 ++out;
2757 *out |= ( (*in) % (1U << 26 ) ) >> ( 26 - 10 );
2758 ++in;
2759 *out |= ( (*in) % (1U << 26 ) ) << 10 ;
2760 ++out;
2761 *out |= ( (*in) % (1U << 26 ) ) >> ( 26 - 4 );
2762 ++in;
2763 *out |= ( (*in) % (1U << 26 ) ) << 4 ;
2764 ++in;
2765 *out |= ( (*in) % (1U << 26 ) ) << 30 ;
2766 ++out;
2767 *out |= ( (*in) % (1U << 26 ) ) >> ( 26 - 24 );
2768 ++in;
2769 *out |= ( (*in) % (1U << 26 ) ) << 24 ;
2770 ++out;
2771 *out |= ( (*in) % (1U << 26 ) ) >> ( 26 - 18 );
2772 ++in;
2773 *out |= ( (*in) % (1U << 26 ) ) << 18 ;
2774 ++out;
2775 *out |= ( (*in) % (1U << 26 ) ) >> ( 26 - 12 );
2776 ++in;
2777 *out |= ( (*in) % (1U << 26 ) ) << 12 ;
2778 ++out;
2779 *out |= ( (*in) % (1U << 26 ) ) >> ( 26 - 6 );
2780 ++in;
2781 *out |= ( (*in) % (1U << 26 ) ) << 6 ;
2782 ++out;
2783 ++in;
2784 }
2785
2786 return 52;
2787 }
2788
2789
2790
2791 #ifdef ALLOW_ODD_PACKSIZES
2792 /* nwritten = 14 * 4 = 56 unsigned ints */
2793 static int
write_27_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)2794 write_27_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
2795 const __m128i *in = (const __m128i *) _in;
2796 __m128i OutReg;
2797
2798 __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask27);
2799 OutReg = InReg;
2800 InReg = _mm_and_si128(_mm_load_si128(++in), mask27);
2801
2802 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
2803 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2804 OutReg);
2805
2806 OutReg = _mm_srli_epi32(InReg, 27 - 22);
2807 InReg = _mm_and_si128(_mm_load_si128(++in), mask27);
2808
2809 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
2810 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2811 OutReg);
2812
2813 OutReg = _mm_srli_epi32(InReg, 27 - 17);
2814 InReg = _mm_and_si128(_mm_load_si128(++in), mask27);
2815
2816 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
2817 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2818 OutReg);
2819
2820 OutReg = _mm_srli_epi32(InReg, 27 - 12);
2821 InReg = _mm_and_si128(_mm_load_si128(++in), mask27);
2822
2823 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
2824 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2825 OutReg);
2826
2827 OutReg = _mm_srli_epi32(InReg, 27 - 7);
2828 InReg = _mm_and_si128(_mm_load_si128(++in), mask27);
2829
2830 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
2831 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2832 OutReg);
2833
2834 OutReg = _mm_srli_epi32(InReg, 27 - 2);
2835 InReg = _mm_and_si128(_mm_load_si128(++in), mask27);
2836
2837 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
2838 InReg = _mm_and_si128(_mm_load_si128(++in), mask27);
2839
2840 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
2841 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2842 OutReg);
2843
2844 OutReg = _mm_srli_epi32(InReg, 27 - 24);
2845 InReg = _mm_and_si128(_mm_load_si128(++in), mask27);
2846
2847 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
2848 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2849 OutReg);
2850
2851 OutReg = _mm_srli_epi32(InReg, 27 - 19);
2852 InReg = _mm_and_si128(_mm_load_si128(++in), mask27);
2853
2854 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
2855 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2856 OutReg);
2857
2858 OutReg = _mm_srli_epi32(InReg, 27 - 14);
2859 InReg = _mm_and_si128(_mm_load_si128(++in), mask27);
2860
2861 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
2862 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2863 OutReg);
2864
2865 OutReg = _mm_srli_epi32(InReg, 27 - 9);
2866 InReg = _mm_and_si128(_mm_load_si128(++in), mask27);
2867
2868 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
2869 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2870 OutReg);
2871
2872 OutReg = _mm_srli_epi32(InReg, 27 - 4);
2873 InReg = _mm_and_si128(_mm_load_si128(++in), mask27);
2874
2875 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
2876 InReg = _mm_and_si128(_mm_load_si128(++in), mask27);
2877
2878 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
2879 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2880 OutReg);
2881
2882 OutReg = _mm_srli_epi32(InReg, 27 - 26);
2883 InReg = _mm_and_si128(_mm_load_si128(++in), mask27);
2884
2885 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
2886 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2887 OutReg);
2888
2889 OutReg = _mm_srli_epi32(InReg, 27 - 21);
2890 InReg = _mm_and_si128(_mm_load_si128(++in), mask27);
2891
2892 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
2893 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2894 OutReg);
2895
2896 OutReg = _mm_srli_epi32(InReg, 27 - 16);
2897 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2898 OutReg);
2899
2900 return strm_buffer_i;
2901 }
2902 #endif
2903
2904
2905 #ifdef HAVE_SSE2
2906 /* nwritten = 14 * 4 = 56 unsigned ints */
2907 static int
write_28_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)2908 write_28_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
2909 const __m128i *in = (const __m128i *) _in;
2910 __m128i OutReg;
2911
2912 __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask28);
2913 OutReg = InReg;
2914 InReg = _mm_and_si128(_mm_load_si128(++in), mask28);
2915
2916 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
2917 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2918 OutReg);
2919
2920 OutReg = _mm_srli_epi32(InReg, 28 - 24);
2921 InReg = _mm_and_si128(_mm_load_si128(++in), mask28);
2922
2923 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
2924 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2925 OutReg);
2926
2927 OutReg = _mm_srli_epi32(InReg, 28 - 20);
2928 InReg = _mm_and_si128(_mm_load_si128(++in), mask28);
2929
2930 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
2931 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2932 OutReg);
2933
2934 OutReg = _mm_srli_epi32(InReg, 28 - 16);
2935 InReg = _mm_and_si128(_mm_load_si128(++in), mask28);
2936
2937 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
2938 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2939 OutReg);
2940
2941 OutReg = _mm_srli_epi32(InReg, 28 - 12);
2942 InReg = _mm_and_si128(_mm_load_si128(++in), mask28);
2943
2944 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
2945 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2946 OutReg);
2947
2948 OutReg = _mm_srli_epi32(InReg, 28 - 8);
2949 InReg = _mm_and_si128(_mm_load_si128(++in), mask28);
2950
2951 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
2952 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2953 OutReg);
2954
2955 OutReg = _mm_srli_epi32(InReg, 28 - 4);
2956 InReg = _mm_and_si128(_mm_load_si128(++in), mask28);
2957
2958 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
2959 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2960 OutReg);
2961
2962 InReg = _mm_and_si128(_mm_load_si128(++in), mask28);
2963
2964 OutReg = InReg;
2965 InReg = _mm_and_si128(_mm_load_si128(++in), mask28);
2966
2967 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
2968 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2969 OutReg);
2970
2971 OutReg = _mm_srli_epi32(InReg, 28 - 24);
2972 InReg = _mm_and_si128(_mm_load_si128(++in), mask28);
2973
2974 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
2975 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2976 OutReg);
2977
2978 OutReg = _mm_srli_epi32(InReg, 28 - 20);
2979 InReg = _mm_and_si128(_mm_load_si128(++in), mask28);
2980
2981 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
2982 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2983 OutReg);
2984
2985 OutReg = _mm_srli_epi32(InReg, 28 - 16);
2986 InReg = _mm_and_si128(_mm_load_si128(++in), mask28);
2987
2988 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
2989 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2990 OutReg);
2991
2992 OutReg = _mm_srli_epi32(InReg, 28 - 12);
2993 InReg = _mm_and_si128(_mm_load_si128(++in), mask28);
2994
2995 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
2996 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2997 OutReg);
2998
2999 OutReg = _mm_srli_epi32(InReg, 28 - 8);
3000 InReg = _mm_and_si128(_mm_load_si128(++in), mask28);
3001
3002 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
3003 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3004 OutReg);
3005
3006 OutReg = _mm_srli_epi32(InReg, 28 - 4);
3007 InReg = _mm_and_si128(_mm_load_si128(++in), mask28);
3008
3009 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
3010 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3011 OutReg);
3012
3013 return strm_buffer_i;
3014 }
3015 #endif
3016
3017
3018 static int
pack_28_horiz(UINT4 * out,const UINT4 * in)3019 pack_28_horiz (UINT4 *out, const UINT4 *in) {
3020 int column;
3021
3022 for (column = 0; column < 4; column++) {
3023 *out |= (*in) % (1U << 28 ) ;
3024 ++in;
3025 *out |= ( (*in) % (1U << 28 ) ) << 28 ;
3026 ++out;
3027 *out |= ( (*in) % (1U << 28 ) ) >> ( 28 - 24 );
3028 ++in;
3029 *out |= ( (*in) % (1U << 28 ) ) << 24 ;
3030 ++out;
3031 *out |= ( (*in) % (1U << 28 ) ) >> ( 28 - 20 );
3032 ++in;
3033 *out |= ( (*in) % (1U << 28 ) ) << 20 ;
3034 ++out;
3035 *out |= ( (*in) % (1U << 28 ) ) >> ( 28 - 16 );
3036 ++in;
3037 *out |= ( (*in) % (1U << 28 ) ) << 16 ;
3038 ++out;
3039 *out |= ( (*in) % (1U << 28 ) ) >> ( 28 - 12 );
3040 ++in;
3041 *out |= ( (*in) % (1U << 28 ) ) << 12 ;
3042 ++out;
3043 *out |= ( (*in) % (1U << 28 ) ) >> ( 28 - 8 );
3044 ++in;
3045 *out |= ( (*in) % (1U << 28 ) ) << 8 ;
3046 ++out;
3047 *out |= ( (*in) % (1U << 28 ) ) >> ( 28 - 4 );
3048 ++in;
3049 *out |= ( (*in) % (1U << 28 ) ) << 4 ;
3050 ++out;
3051 ++in;
3052 *out |= (*in) % (1U << 28 ) ;
3053 ++in;
3054 *out |= ( (*in) % (1U << 28 ) ) << 28 ;
3055 ++out;
3056 *out |= ( (*in) % (1U << 28 ) ) >> ( 28 - 24 );
3057 ++in;
3058 *out |= ( (*in) % (1U << 28 ) ) << 24 ;
3059 ++out;
3060 *out |= ( (*in) % (1U << 28 ) ) >> ( 28 - 20 );
3061 ++in;
3062 *out |= ( (*in) % (1U << 28 ) ) << 20 ;
3063 ++out;
3064 *out |= ( (*in) % (1U << 28 ) ) >> ( 28 - 16 );
3065 ++in;
3066 *out |= ( (*in) % (1U << 28 ) ) << 16 ;
3067 ++out;
3068 *out |= ( (*in) % (1U << 28 ) ) >> ( 28 - 12 );
3069 ++in;
3070 *out |= ( (*in) % (1U << 28 ) ) << 12 ;
3071 ++out;
3072 *out |= ( (*in) % (1U << 28 ) ) >> ( 28 - 8 );
3073 ++in;
3074 *out |= ( (*in) % (1U << 28 ) ) << 8 ;
3075 ++out;
3076 *out |= ( (*in) % (1U << 28 ) ) >> ( 28 - 4 );
3077 ++in;
3078 *out |= ( (*in) % (1U << 28 ) ) << 4 ;
3079 ++out;
3080 ++in;
3081 }
3082
3083 return 56;
3084 }
3085
3086
3087
3088 #ifdef ALLOW_ODD_PACKSIZES
3089 /* nwritten = 15 * 4 = 60 unsigned ints */
3090 static int
write_29_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)3091 write_29_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
3092 const __m128i *in = (const __m128i *) _in;
3093 __m128i OutReg;
3094
3095 __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask29);
3096 OutReg = InReg;
3097 InReg = _mm_and_si128(_mm_load_si128(++in), mask29);
3098
3099 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
3100 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3101 OutReg);
3102
3103 OutReg = _mm_srli_epi32(InReg, 29 - 26);
3104 InReg = _mm_and_si128(_mm_load_si128(++in), mask29);
3105
3106 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
3107 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3108 OutReg);
3109
3110 OutReg = _mm_srli_epi32(InReg, 29 - 23);
3111 InReg = _mm_and_si128(_mm_load_si128(++in), mask29);
3112
3113 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
3114 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3115 OutReg);
3116
3117 OutReg = _mm_srli_epi32(InReg, 29 - 20);
3118 InReg = _mm_and_si128(_mm_load_si128(++in), mask29);
3119
3120 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
3121 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3122 OutReg);
3123
3124 OutReg = _mm_srli_epi32(InReg, 29 - 17);
3125 InReg = _mm_and_si128(_mm_load_si128(++in), mask29);
3126
3127 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
3128 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3129 OutReg);
3130
3131 OutReg = _mm_srli_epi32(InReg, 29 - 14);
3132 InReg = _mm_and_si128(_mm_load_si128(++in), mask29);
3133
3134 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
3135 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3136 OutReg);
3137
3138 OutReg = _mm_srli_epi32(InReg, 29 - 11);
3139 InReg = _mm_and_si128(_mm_load_si128(++in), mask29);
3140
3141 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
3142 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3143 OutReg);
3144
3145 OutReg = _mm_srli_epi32(InReg, 29 - 8);
3146 InReg = _mm_and_si128(_mm_load_si128(++in), mask29);
3147
3148 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
3149 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3150 OutReg);
3151
3152 OutReg = _mm_srli_epi32(InReg, 29 - 5);
3153 InReg = _mm_and_si128(_mm_load_si128(++in), mask29);
3154
3155 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
3156 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3157 OutReg);
3158
3159 OutReg = _mm_srli_epi32(InReg, 29 - 2);
3160 InReg = _mm_and_si128(_mm_load_si128(++in), mask29);
3161
3162 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
3163 InReg = _mm_and_si128(_mm_load_si128(++in), mask29);
3164
3165 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
3166 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3167 OutReg);
3168
3169 OutReg = _mm_srli_epi32(InReg, 29 - 28);
3170 InReg = _mm_and_si128(_mm_load_si128(++in), mask29);
3171
3172 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
3173 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3174 OutReg);
3175
3176 OutReg = _mm_srli_epi32(InReg, 29 - 25);
3177 InReg = _mm_and_si128(_mm_load_si128(++in), mask29);
3178
3179 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
3180 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3181 OutReg);
3182
3183 OutReg = _mm_srli_epi32(InReg, 29 - 22);
3184 InReg = _mm_and_si128(_mm_load_si128(++in), mask29);
3185
3186 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
3187 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3188 OutReg);
3189
3190 OutReg = _mm_srli_epi32(InReg, 29 - 19);
3191 InReg = _mm_and_si128(_mm_load_si128(++in), mask29);
3192
3193 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
3194 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3195 OutReg);
3196
3197
3198 OutReg = _mm_srli_epi32(InReg, 29 - 16);
3199 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3200 OutReg);
3201
3202 return strm_buffer_i;
3203 }
3204 #endif
3205
3206
3207 #ifdef HAVE_SSE2
3208 /* nwritten = 15 * 4 = 60 unsigned ints */
3209 static int
write_30_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)3210 write_30_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
3211 const __m128i *in = (const __m128i *) _in;
3212 __m128i OutReg;
3213
3214 __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask30);
3215 OutReg = InReg;
3216 InReg = _mm_and_si128(_mm_load_si128(++in), mask30);
3217
3218 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
3219 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3220 OutReg);
3221
3222 OutReg = _mm_srli_epi32(InReg, 30 - 28);
3223 InReg = _mm_and_si128(_mm_load_si128(++in), mask30);
3224
3225 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
3226 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3227 OutReg);
3228
3229 OutReg = _mm_srli_epi32(InReg, 30 - 26);
3230 InReg = _mm_and_si128(_mm_load_si128(++in), mask30);
3231
3232 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
3233 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3234 OutReg);
3235
3236 OutReg = _mm_srli_epi32(InReg, 30 - 24);
3237 InReg = _mm_and_si128(_mm_load_si128(++in), mask30);
3238
3239 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
3240 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3241 OutReg);
3242
3243 OutReg = _mm_srli_epi32(InReg, 30 - 22);
3244 InReg = _mm_and_si128(_mm_load_si128(++in), mask30);
3245
3246 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
3247 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3248 OutReg);
3249
3250 OutReg = _mm_srli_epi32(InReg, 30 - 20);
3251 InReg = _mm_and_si128(_mm_load_si128(++in), mask30);
3252
3253 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
3254 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3255 OutReg);
3256
3257 OutReg = _mm_srli_epi32(InReg, 30 - 18);
3258 InReg = _mm_and_si128(_mm_load_si128(++in), mask30);
3259
3260 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
3261 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3262 OutReg);
3263
3264 OutReg = _mm_srli_epi32(InReg, 30 - 16);
3265 InReg = _mm_and_si128(_mm_load_si128(++in), mask30);
3266
3267 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
3268 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3269 OutReg);
3270
3271 OutReg = _mm_srli_epi32(InReg, 30 - 14);
3272 InReg = _mm_and_si128(_mm_load_si128(++in), mask30);
3273
3274 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
3275 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3276 OutReg);
3277
3278 OutReg = _mm_srli_epi32(InReg, 30 - 12);
3279 InReg = _mm_and_si128(_mm_load_si128(++in), mask30);
3280
3281 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
3282 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3283 OutReg);
3284
3285 OutReg = _mm_srli_epi32(InReg, 30 - 10);
3286 InReg = _mm_and_si128(_mm_load_si128(++in), mask30);
3287
3288 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
3289 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3290 OutReg);
3291
3292 OutReg = _mm_srli_epi32(InReg, 30 - 8);
3293 InReg = _mm_and_si128(_mm_load_si128(++in), mask30);
3294
3295 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
3296 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3297 OutReg);
3298
3299 OutReg = _mm_srli_epi32(InReg, 30 - 6);
3300 InReg = _mm_and_si128(_mm_load_si128(++in), mask30);
3301
3302 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
3303 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3304 OutReg);
3305
3306 OutReg = _mm_srli_epi32(InReg, 30 - 4);
3307 InReg = _mm_and_si128(_mm_load_si128(++in), mask30);
3308
3309 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
3310 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3311 OutReg);
3312
3313 OutReg = _mm_srli_epi32(InReg, 30 - 2);
3314 InReg = _mm_and_si128(_mm_load_si128(++in), mask30);
3315
3316 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
3317 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3318 OutReg);
3319
3320 return strm_buffer_i;
3321 }
3322 #endif
3323
3324
3325 static int
pack_30_horiz(UINT4 * out,const UINT4 * in)3326 pack_30_horiz (UINT4 *out, const UINT4 *in) {
3327 int column;
3328
3329 for (column = 0; column < 4; column++) {
3330 *out |= (*in) % (1U << 30 ) ;
3331 ++in;
3332 *out |= ( (*in) % (1U << 30 ) ) << 30 ;
3333 ++out;
3334 *out |= ( (*in) % (1U << 30 ) ) >> ( 30 - 28 );
3335 ++in;
3336 *out |= ( (*in) % (1U << 30 ) ) << 28 ;
3337 ++out;
3338 *out |= ( (*in) % (1U << 30 ) ) >> ( 30 - 26 );
3339 ++in;
3340 *out |= ( (*in) % (1U << 30 ) ) << 26 ;
3341 ++out;
3342 *out |= ( (*in) % (1U << 30 ) ) >> ( 30 - 24 );
3343 ++in;
3344 *out |= ( (*in) % (1U << 30 ) ) << 24 ;
3345 ++out;
3346 *out |= ( (*in) % (1U << 30 ) ) >> ( 30 - 22 );
3347 ++in;
3348 *out |= ( (*in) % (1U << 30 ) ) << 22 ;
3349 ++out;
3350 *out |= ( (*in) % (1U << 30 ) ) >> ( 30 - 20 );
3351 ++in;
3352 *out |= ( (*in) % (1U << 30 ) ) << 20 ;
3353 ++out;
3354 *out |= ( (*in) % (1U << 30 ) ) >> ( 30 - 18 );
3355 ++in;
3356 *out |= ( (*in) % (1U << 30 ) ) << 18 ;
3357 ++out;
3358 *out |= ( (*in) % (1U << 30 ) ) >> ( 30 - 16 );
3359 ++in;
3360 *out |= ( (*in) % (1U << 30 ) ) << 16 ;
3361 ++out;
3362 *out |= ( (*in) % (1U << 30 ) ) >> ( 30 - 14 );
3363 ++in;
3364 *out |= ( (*in) % (1U << 30 ) ) << 14 ;
3365 ++out;
3366 *out |= ( (*in) % (1U << 30 ) ) >> ( 30 - 12 );
3367 ++in;
3368 *out |= ( (*in) % (1U << 30 ) ) << 12 ;
3369 ++out;
3370 *out |= ( (*in) % (1U << 30 ) ) >> ( 30 - 10 );
3371 ++in;
3372 *out |= ( (*in) % (1U << 30 ) ) << 10 ;
3373 ++out;
3374 *out |= ( (*in) % (1U << 30 ) ) >> ( 30 - 8 );
3375 ++in;
3376 *out |= ( (*in) % (1U << 30 ) ) << 8 ;
3377 ++out;
3378 *out |= ( (*in) % (1U << 30 ) ) >> ( 30 - 6 );
3379 ++in;
3380 *out |= ( (*in) % (1U << 30 ) ) << 6 ;
3381 ++out;
3382 *out |= ( (*in) % (1U << 30 ) ) >> ( 30 - 4 );
3383 ++in;
3384 *out |= ( (*in) % (1U << 30 ) ) << 4 ;
3385 ++out;
3386 *out |= ( (*in) % (1U << 30 ) ) >> ( 30 - 2 );
3387 ++in;
3388 *out |= ( (*in) % (1U << 30 ) ) << 2 ;
3389 ++out;
3390 ++in;
3391 }
3392
3393 return 60;
3394 }
3395
3396
3397
3398 #ifdef ALLOW_ODD_PACKSIZES
3399 /* nwritten = 16 * 4 = 64 unsigned ints */
3400 static int
write_31_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)3401 write_31_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
3402 const __m128i *in = (const __m128i *) _in;
3403 __m128i OutReg;
3404
3405 __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask31);
3406 OutReg = InReg;
3407 InReg = _mm_and_si128(_mm_load_si128(++in), mask31);
3408
3409 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
3410 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3411 OutReg);
3412
3413 OutReg = _mm_srli_epi32(InReg, 31 - 30);
3414 InReg = _mm_and_si128(_mm_load_si128(++in), mask31);
3415
3416 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
3417 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3418 OutReg);
3419
3420 OutReg = _mm_srli_epi32(InReg, 31 - 29);
3421 InReg = _mm_and_si128(_mm_load_si128(++in), mask31);
3422
3423 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
3424 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3425 OutReg);
3426
3427 OutReg = _mm_srli_epi32(InReg, 31 - 28);
3428 InReg = _mm_and_si128(_mm_load_si128(++in), mask31);
3429
3430 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
3431 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3432 OutReg);
3433
3434 OutReg = _mm_srli_epi32(InReg, 31 - 27);
3435 InReg = _mm_and_si128(_mm_load_si128(++in), mask31);
3436
3437 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
3438 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3439 OutReg);
3440
3441 OutReg = _mm_srli_epi32(InReg, 31 - 26);
3442 InReg = _mm_and_si128(_mm_load_si128(++in), mask31);
3443
3444 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
3445 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3446 OutReg);
3447
3448 OutReg = _mm_srli_epi32(InReg, 31 - 25);
3449 InReg = _mm_and_si128(_mm_load_si128(++in), mask31);
3450
3451 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
3452 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3453 OutReg);
3454
3455 OutReg = _mm_srli_epi32(InReg, 31 - 24);
3456 InReg = _mm_and_si128(_mm_load_si128(++in), mask31);
3457
3458 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
3459 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3460 OutReg);
3461
3462 OutReg = _mm_srli_epi32(InReg, 31 - 23);
3463 InReg = _mm_and_si128(_mm_load_si128(++in), mask31);
3464
3465 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
3466 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3467 OutReg);
3468
3469 OutReg = _mm_srli_epi32(InReg, 31 - 22);
3470 InReg = _mm_and_si128(_mm_load_si128(++in), mask31);
3471
3472 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
3473 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3474 OutReg);
3475
3476 OutReg = _mm_srli_epi32(InReg, 31 - 21);
3477 InReg = _mm_and_si128(_mm_load_si128(++in), mask31);
3478
3479 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
3480 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3481 OutReg);
3482
3483 OutReg = _mm_srli_epi32(InReg, 31 - 20);
3484 InReg = _mm_and_si128(_mm_load_si128(++in), mask31);
3485
3486 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
3487 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3488 OutReg);
3489
3490 OutReg = _mm_srli_epi32(InReg, 31 - 19);
3491 InReg = _mm_and_si128(_mm_load_si128(++in), mask31);
3492
3493 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
3494 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3495 OutReg);
3496
3497 OutReg = _mm_srli_epi32(InReg, 31 - 18);
3498 InReg = _mm_and_si128(_mm_load_si128(++in), mask31);
3499
3500 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
3501 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3502 OutReg);
3503
3504 OutReg = _mm_srli_epi32(InReg, 31 - 17);
3505 InReg = _mm_and_si128(_mm_load_si128(++in), mask31);
3506
3507 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
3508 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3509 OutReg);
3510
3511
3512 OutReg = _mm_srli_epi32(InReg, 31 - 16);
3513 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3514 OutReg);
3515
3516 return strm_buffer_i;
3517 }
3518 #endif
3519
3520
3521 #ifdef HAVE_SSE2
3522 /* nwritten = 16 * 4 = 64 unsigned ints */
3523 static int
write_32_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)3524 write_32_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
3525 const __m128i *in = (const __m128i *) _in;
3526 __m128i OutReg;
3527
3528 __m128i InReg = _mm_load_si128(in);
3529 OutReg = InReg;
3530 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3531 OutReg);
3532
3533 InReg = _mm_load_si128(++in);
3534
3535 OutReg = InReg;
3536 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3537 OutReg);
3538
3539 InReg = _mm_load_si128(++in);
3540
3541 OutReg = InReg;
3542 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3543 OutReg);
3544
3545 InReg = _mm_load_si128(++in);
3546
3547 OutReg = InReg;
3548 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3549 OutReg);
3550
3551 InReg = _mm_load_si128(++in);
3552
3553 OutReg = InReg;
3554 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3555 OutReg);
3556
3557 InReg = _mm_load_si128(++in);
3558
3559 OutReg = InReg;
3560 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3561 OutReg);
3562
3563 InReg = _mm_load_si128(++in);
3564
3565 OutReg = InReg;
3566 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3567 OutReg);
3568
3569 InReg = _mm_load_si128(++in);
3570
3571 OutReg = InReg;
3572 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3573 OutReg);
3574
3575 InReg = _mm_load_si128(++in);
3576
3577 OutReg = InReg;
3578 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3579 OutReg);
3580
3581 InReg = _mm_load_si128(++in);
3582
3583 OutReg = InReg;
3584 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3585 OutReg);
3586
3587 InReg = _mm_load_si128(++in);
3588
3589 OutReg = InReg;
3590 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3591 OutReg);
3592
3593 InReg = _mm_load_si128(++in);
3594
3595 OutReg = InReg;
3596 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3597 OutReg);
3598
3599 InReg = _mm_load_si128(++in);
3600
3601 OutReg = InReg;
3602 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3603 OutReg);
3604
3605 InReg = _mm_load_si128(++in);
3606
3607 OutReg = InReg;
3608 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3609 OutReg);
3610
3611 InReg = _mm_load_si128(++in);
3612
3613 OutReg = InReg;
3614 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3615 OutReg);
3616
3617 InReg = _mm_load_si128(++in);
3618
3619 OutReg = InReg;
3620 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3621 OutReg);
3622
3623 return strm_buffer_i;
3624 }
3625 #endif
3626
3627
3628 static int
pack_32_horiz(UINT4 * out,const UINT4 * in)3629 pack_32_horiz (UINT4 *out, const UINT4 *in) {
3630 int column;
3631
3632 for (column = 0; column < 4; column++) {
3633 *out = *in;
3634 ++out;
3635 ++in;
3636 *out = *in;
3637 ++out;
3638 ++in;
3639 *out = *in;
3640 ++out;
3641 ++in;
3642 *out = *in;
3643 ++out;
3644 ++in;
3645 *out = *in;
3646 ++out;
3647 ++in;
3648 *out = *in;
3649 ++out;
3650 ++in;
3651 *out = *in;
3652 ++out;
3653 ++in;
3654 *out = *in;
3655 ++out;
3656 ++in;
3657 *out = *in;
3658 ++out;
3659 ++in;
3660 *out = *in;
3661 ++out;
3662 ++in;
3663 *out = *in;
3664 ++out;
3665 ++in;
3666 *out = *in;
3667 ++out;
3668 ++in;
3669 *out = *in;
3670 ++out;
3671 ++in;
3672 *out = *in;
3673 ++out;
3674 ++in;
3675 *out = *in;
3676 ++out;
3677 ++in;
3678 *out = *in;
3679 ++out;
3680 ++in;
3681 }
3682
3683 return 64;
3684 }
3685
3686
3687
3688 #ifdef HAVE_SSE2
3689 /* nwritten = 2 * 4 = 8 unsigned ints */
3690 static int
write_04_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)3691 write_04_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
3692 const __m128i *in = (const __m128i *) _in;
3693 __m128i OutReg;
3694
3695 __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask4);
3696 OutReg = InReg;
3697 InReg = _mm_and_si128(_mm_load_si128(++in), mask4);
3698
3699 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
3700 InReg = _mm_and_si128(_mm_load_si128(++in), mask4);
3701
3702 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
3703 InReg = _mm_and_si128(_mm_load_si128(++in), mask4);
3704
3705 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
3706 InReg = _mm_and_si128(_mm_load_si128(++in), mask4);
3707
3708 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
3709 InReg = _mm_and_si128(_mm_load_si128(++in), mask4);
3710
3711 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
3712 InReg = _mm_and_si128(_mm_load_si128(++in), mask4);
3713
3714 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
3715 InReg = _mm_and_si128(_mm_load_si128(++in), mask4);
3716
3717 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
3718 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3719 OutReg);
3720
3721 InReg = _mm_and_si128(_mm_load_si128(++in), mask4);
3722
3723 OutReg = InReg;
3724 InReg = _mm_and_si128(_mm_load_si128(++in), mask4);
3725
3726 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
3727 InReg = _mm_and_si128(_mm_load_si128(++in), mask4);
3728
3729 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
3730 InReg = _mm_and_si128(_mm_load_si128(++in), mask4);
3731
3732 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
3733 InReg = _mm_and_si128(_mm_load_si128(++in), mask4);
3734
3735 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
3736 InReg = _mm_and_si128(_mm_load_si128(++in), mask4);
3737
3738 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
3739 InReg = _mm_and_si128(_mm_load_si128(++in), mask4);
3740
3741 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
3742 InReg = _mm_and_si128(_mm_load_si128(++in), mask4);
3743
3744 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
3745 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3746 OutReg);
3747
3748 return strm_buffer_i;
3749 }
3750 #endif
3751
3752
3753 static int
pack_04_horiz(UINT4 * out,const UINT4 * in)3754 pack_04_horiz (UINT4 *out, const UINT4 *in) {
3755 int column;
3756
3757 for (column = 0; column < 4; column++) {
3758 *out |= (*in) % (1U << 4 ) ;
3759 ++in;
3760 *out |= ( (*in) % (1U << 4 ) ) << 4 ;
3761 ++in;
3762 *out |= ( (*in) % (1U << 4 ) ) << 8 ;
3763 ++in;
3764 *out |= ( (*in) % (1U << 4 ) ) << 12 ;
3765 ++in;
3766 *out |= ( (*in) % (1U << 4 ) ) << 16 ;
3767 ++in;
3768 *out |= ( (*in) % (1U << 4 ) ) << 20 ;
3769 ++in;
3770 *out |= ( (*in) % (1U << 4 ) ) << 24 ;
3771 ++in;
3772 *out |= ( (*in) % (1U << 4 ) ) << 28 ;
3773 ++out;
3774 ++in;
3775 *out |= (*in) % (1U << 4 ) ;
3776 ++in;
3777 *out |= ( (*in) % (1U << 4 ) ) << 4 ;
3778 ++in;
3779 *out |= ( (*in) % (1U << 4 ) ) << 8 ;
3780 ++in;
3781 *out |= ( (*in) % (1U << 4 ) ) << 12 ;
3782 ++in;
3783 *out |= ( (*in) % (1U << 4 ) ) << 16 ;
3784 ++in;
3785 *out |= ( (*in) % (1U << 4 ) ) << 20 ;
3786 ++in;
3787 *out |= ( (*in) % (1U << 4 ) ) << 24 ;
3788 ++in;
3789 *out |= ( (*in) % (1U << 4 ) ) << 28 ;
3790 ++out;
3791 ++in;
3792 }
3793
3794 return 8;
3795 }
3796
3797
3798 #ifdef HAVE_SSE2
3799 /* nwritten = 4 * 4 = 16 unsigned ints */
3800 static int
write_08_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)3801 write_08_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
3802 const __m128i *in = (const __m128i *) _in;
3803 __m128i OutReg;
3804
3805 __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask8);
3806 OutReg = InReg;
3807 InReg = _mm_and_si128(_mm_load_si128(++in), mask8);
3808
3809 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
3810 InReg = _mm_and_si128(_mm_load_si128(++in), mask8);
3811
3812 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
3813 InReg = _mm_and_si128(_mm_load_si128(++in), mask8);
3814
3815 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
3816 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3817 OutReg);
3818
3819 InReg = _mm_and_si128(_mm_load_si128(++in), mask8);
3820
3821 OutReg = InReg;
3822 InReg = _mm_and_si128(_mm_load_si128(++in), mask8);
3823
3824 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
3825 InReg = _mm_and_si128(_mm_load_si128(++in), mask8);
3826
3827 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
3828 InReg = _mm_and_si128(_mm_load_si128(++in), mask8);
3829
3830 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
3831 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3832 OutReg);
3833
3834 InReg = _mm_and_si128(_mm_load_si128(++in), mask8);
3835
3836 OutReg = InReg;
3837 InReg = _mm_and_si128(_mm_load_si128(++in), mask8);
3838
3839 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
3840 InReg = _mm_and_si128(_mm_load_si128(++in), mask8);
3841
3842 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
3843 InReg = _mm_and_si128(_mm_load_si128(++in), mask8);
3844
3845 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
3846 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3847 OutReg);
3848
3849 InReg = _mm_and_si128(_mm_load_si128(++in), mask8);
3850
3851 OutReg = InReg;
3852 InReg = _mm_and_si128(_mm_load_si128(++in), mask8);
3853
3854 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
3855 InReg = _mm_and_si128(_mm_load_si128(++in), mask8);
3856
3857 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
3858 InReg = _mm_and_si128(_mm_load_si128(++in), mask8);
3859
3860 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
3861 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3862 OutReg);
3863
3864 return strm_buffer_i;
3865 }
3866 #endif
3867
3868
3869 static int
pack_08_horiz(UINT4 * out,const UINT4 * in)3870 pack_08_horiz (UINT4 *out, const UINT4 *in) {
3871 int column;
3872
3873 for (column = 0; column < 4; column++) {
3874 *out |= (*in) % (1U << 8 ) ;
3875 ++in;
3876 *out |= ( (*in) % (1U << 8 ) ) << 8 ;
3877 ++in;
3878 *out |= ( (*in) % (1U << 8 ) ) << 16 ;
3879 ++in;
3880 *out |= ( (*in) % (1U << 8 ) ) << 24 ;
3881 ++out;
3882 ++in;
3883 *out |= (*in) % (1U << 8 ) ;
3884 ++in;
3885 *out |= ( (*in) % (1U << 8 ) ) << 8 ;
3886 ++in;
3887 *out |= ( (*in) % (1U << 8 ) ) << 16 ;
3888 ++in;
3889 *out |= ( (*in) % (1U << 8 ) ) << 24 ;
3890 ++out;
3891 ++in;
3892 *out |= (*in) % (1U << 8 ) ;
3893 ++in;
3894 *out |= ( (*in) % (1U << 8 ) ) << 8 ;
3895 ++in;
3896 *out |= ( (*in) % (1U << 8 ) ) << 16 ;
3897 ++in;
3898 *out |= ( (*in) % (1U << 8 ) ) << 24 ;
3899 ++out;
3900 ++in;
3901 *out |= (*in) % (1U << 8 ) ;
3902 ++in;
3903 *out |= ( (*in) % (1U << 8 ) ) << 8 ;
3904 ++in;
3905 *out |= ( (*in) % (1U << 8 ) ) << 16 ;
3906 ++in;
3907 *out |= ( (*in) % (1U << 8 ) ) << 24 ;
3908 ++out;
3909 ++in;
3910 }
3911
3912 return 16;
3913 }
3914
3915
3916 #ifdef HAVE_SSE2
3917 /* nwritten = 8 * 4 = 32 unsigned ints */
3918 static int
write_16_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)3919 write_16_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
3920 const __m128i *in = (const __m128i *) _in;
3921 __m128i OutReg;
3922
3923 __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask16);
3924
3925 OutReg = InReg;
3926 InReg = _mm_and_si128(_mm_load_si128(++in), mask16);
3927
3928 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
3929 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3930 OutReg);
3931
3932 InReg = _mm_and_si128(_mm_load_si128(++in), mask16);
3933
3934 OutReg = InReg;
3935 InReg = _mm_and_si128(_mm_load_si128(++in), mask16);
3936
3937 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
3938 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3939 OutReg);
3940
3941 InReg = _mm_and_si128(_mm_load_si128(++in), mask16);
3942
3943 OutReg = InReg;
3944 InReg = _mm_and_si128(_mm_load_si128(++in), mask16);
3945
3946 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
3947 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3948 OutReg);
3949
3950 InReg = _mm_and_si128(_mm_load_si128(++in), mask16);
3951
3952 OutReg = InReg;
3953 InReg = _mm_and_si128(_mm_load_si128(++in), mask16);
3954
3955 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
3956 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3957 OutReg);
3958
3959 InReg = _mm_and_si128(_mm_load_si128(++in), mask16);
3960
3961 OutReg = InReg;
3962 InReg = _mm_and_si128(_mm_load_si128(++in), mask16);
3963
3964 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
3965 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3966 OutReg);
3967
3968 InReg = _mm_and_si128(_mm_load_si128(++in), mask16);
3969
3970 OutReg = InReg;
3971 InReg = _mm_and_si128(_mm_load_si128(++in), mask16);
3972
3973 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
3974 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3975 OutReg);
3976
3977 InReg = _mm_and_si128(_mm_load_si128(++in), mask16);
3978
3979 OutReg = InReg;
3980 InReg = _mm_and_si128(_mm_load_si128(++in), mask16);
3981
3982 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
3983 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3984 OutReg);
3985
3986 InReg = _mm_and_si128(_mm_load_si128(++in), mask16);
3987
3988 OutReg = InReg;
3989 InReg = _mm_and_si128(_mm_load_si128(++in), mask16);
3990
3991 OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
3992 strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3993 OutReg);
3994
3995 return strm_buffer_i;
3996 }
3997 #endif
3998
3999
4000 static int
pack_16_horiz(UINT4 * out,const UINT4 * in)4001 pack_16_horiz (UINT4 *out, const UINT4 *in) {
4002 int column;
4003
4004 for (column = 0; column < 4; column++) {
4005 *out |= (*in) % (1U << 16 ) ;
4006 ++in;
4007 *out |= ( (*in) % (1U << 16 ) ) << 16 ;
4008 ++out;
4009 ++in;
4010 *out |= (*in) % (1U << 16 ) ;
4011 ++in;
4012 *out |= ( (*in) % (1U << 16 ) ) << 16 ;
4013 ++out;
4014 ++in;
4015 *out |= (*in) % (1U << 16 ) ;
4016 ++in;
4017 *out |= ( (*in) % (1U << 16 ) ) << 16 ;
4018 ++out;
4019 ++in;
4020 *out |= (*in) % (1U << 16 ) ;
4021 ++in;
4022 *out |= ( (*in) % (1U << 16 ) ) << 16 ;
4023 ++out;
4024 ++in;
4025 *out |= (*in) % (1U << 16 ) ;
4026 ++in;
4027 *out |= ( (*in) % (1U << 16 ) ) << 16 ;
4028 ++out;
4029 ++in;
4030 *out |= (*in) % (1U << 16 ) ;
4031 ++in;
4032 *out |= ( (*in) % (1U << 16 ) ) << 16 ;
4033 ++out;
4034 ++in;
4035 *out |= (*in) % (1U << 16 ) ;
4036 ++in;
4037 *out |= ( (*in) % (1U << 16 ) ) << 16 ;
4038 ++out;
4039 ++in;
4040 *out |= (*in) % (1U << 16 ) ;
4041 ++in;
4042 *out |= ( (*in) % (1U << 16 ) ) << 16 ;
4043 ++out;
4044 ++in;
4045 }
4046
4047 return 32;
4048 }
4049
4050
4051 /* Vertical format requires all values in a block to be decoded */
4052 #ifdef HAVE_SSE2
4053 static int
write_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in,int packsize)4054 write_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i,
4055 const UINT4 *_in, int packsize) {
4056
4057 #if 0
4058 int i;
4059
4060 printf("Entering with packsize %d\n",packsize);
4061 for (i = 0; i < BLOCKSIZE; i++) {
4062 printf("%d ",_in[i]);
4063 }
4064 printf("\n");
4065 #endif
4066
4067 switch (packsize) {
4068 #ifdef ALLOW_ODD_PACKSIZES
4069 case 0: return strm_buffer_i;
4070 case 1: return write_01_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4071 case 2: return write_02_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4072 case 3: return write_03_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4073 case 4: return write_04_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4074 case 5: return write_05_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4075 case 6: return write_06_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4076 case 7: return write_07_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4077 case 8: return write_08_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4078 case 9: return write_09_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4079 case 10: return write_10_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4080 case 11: return write_11_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4081 case 12: return write_12_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4082 case 13: return write_13_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4083 case 14: return write_14_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4084 case 15: return write_15_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4085 case 16: return write_16_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4086 case 17: return write_17_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4087 case 18: return write_18_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4088 case 19: return write_19_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4089 case 20: return write_20_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4090 case 21: return write_21_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4091 case 22: return write_22_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4092 case 23: return write_23_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4093 case 24: return write_24_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4094 case 25: return write_25_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4095 case 26: return write_26_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4096 case 27: return write_27_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4097 case 28: return write_28_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4098 case 29: return write_29_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4099 case 30: return write_30_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4100 case 31: return write_31_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4101 case 32: return write_32_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4102 #else
4103 case 0: return strm_buffer_i;
4104 case 2: return write_02_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4105 case 4: return write_04_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4106 case 6: return write_06_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4107 case 8: return write_08_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4108 case 10: return write_10_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4109 case 12: return write_12_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4110 case 14: return write_14_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4111 case 16: return write_16_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4112 case 18: return write_18_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4113 case 20: return write_20_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4114 case 22: return write_22_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4115 case 24: return write_24_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4116 case 26: return write_26_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4117 case 28: return write_28_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4118 case 30: return write_30_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4119 case 32: return write_32_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4120 #endif
4121 default: fprintf(stderr,"packsize of %d not allowed\n",packsize); abort();
4122 }
4123 }
4124
4125 #else
4126
4127 static void
reorder_values_vertically(Positionsptr_T * vertical,const Positionsptr_T * horizontal)4128 reorder_values_vertically (Positionsptr_T *vertical, const Positionsptr_T *horizontal) {
4129 int column, row, k = 0;
4130 Positionsptr_T *out;
4131
4132 out = &(vertical[0]);
4133 for (column = 0; column < 4; column++) {
4134 k = column;
4135 for (row = 0; row < BLOCKSIZE/4; row++) {
4136 *out++ = horizontal[k];
4137 k += 4;
4138 }
4139 }
4140
4141 #if 0
4142 printf("horizontal\n");
4143 for (k = 0; k < BLOCKSIZE; k++) {
4144 if (k % 4 == 0) {
4145 printf("\n");
4146 }
4147 printf("%u ",horizontal[k]);
4148 }
4149 printf("\n");
4150
4151 printf("vertical\n");
4152 for (k = 0; k < BLOCKSIZE; k++) {
4153 if (k % (BLOCKSIZE/4) == 0) {
4154 printf("\n");
4155 }
4156 printf("%u ",vertical[k]);
4157 }
4158 printf("\n");
4159 #endif
4160
4161 return;
4162 }
4163
4164 /* Non-SIMD code cannot write vertical format easily, so using
4165 horizontal code and conversions */
4166 static int
write_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * horizontal,int packsize)4167 write_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i,
4168 const UINT4 *horizontal, int packsize) {
4169 int nwritten;
4170 UINT4 buffer[BLOCKSIZE], vertical[BLOCKSIZE];
4171
4172 #if 0
4173 int i;
4174
4175 printf("Entering with packsize %d\n",packsize);
4176 for (i = 0; i < BLOCKSIZE; i++) {
4177 printf("%d ",_in[i]);
4178 }
4179 printf("\n");
4180 #endif
4181
4182 reorder_values_vertically(vertical,horizontal);
4183 memset((void *) buffer,0,BLOCKSIZE*sizeof(UINT4));
4184
4185 switch (packsize) {
4186 case 0: return strm_buffer_i;
4187 case 2: nwritten = pack_02_horiz(buffer,&(vertical[0])); break;
4188 case 4: nwritten = pack_04_horiz(buffer,&(vertical[0])); break;
4189 case 6: nwritten = pack_06_horiz(buffer,&(vertical[0])); break;
4190 case 8: nwritten = pack_08_horiz(buffer,&(vertical[0])); break;
4191 case 10: nwritten = pack_10_horiz(buffer,&(vertical[0])); break;
4192 case 12: nwritten = pack_12_horiz(buffer,&(vertical[0])); break;
4193 case 14: nwritten = pack_14_horiz(buffer,&(vertical[0])); break;
4194 case 16: nwritten = pack_16_horiz(buffer,&(vertical[0])); break;
4195 case 18: nwritten = pack_18_horiz(buffer,&(vertical[0])); break;
4196 case 20: nwritten = pack_20_horiz(buffer,&(vertical[0])); break;
4197 case 22: nwritten = pack_22_horiz(buffer,&(vertical[0])); break;
4198 case 24: nwritten = pack_24_horiz(buffer,&(vertical[0])); break;
4199 case 26: nwritten = pack_26_horiz(buffer,&(vertical[0])); break;
4200 case 28: nwritten = pack_28_horiz(buffer,&(vertical[0])); break;
4201 case 30: nwritten = pack_30_horiz(buffer,&(vertical[0])); break;
4202 case 32: nwritten = pack_32_horiz(buffer,&(vertical[0])); break;
4203 default: fprintf(stderr,"packsize of %d not allowed\n",packsize); abort();
4204 }
4205
4206 return write_reg_buffered_vert(strm_fp,strm_buffer,
4207 strm_buffer_size,strm_buffer_i,
4208 buffer,nwritten);
4209 }
4210 #endif
4211
4212
4213 /* Columnar order allows just the necessary values in a block to be decoded */
4214 static void
columnar_order(UINT4 * columnar,const UINT4 * vertical)4215 columnar_order (UINT4 *columnar, const UINT4 *vertical) {
4216
4217 columnar[0] = vertical[0]; /* remainder 1 */
4218 columnar[1] = vertical[4]; /* remainder 5 */
4219 columnar[2] = vertical[8]; /* remainder 9 */
4220 columnar[3] = vertical[12]; /* remainder 13 */
4221 columnar[4] = vertical[16]; /* remainder 17 */
4222 columnar[5] = vertical[20]; /* remainder 21 */
4223 columnar[6] = vertical[24]; /* remainder 25 */
4224 columnar[7] = vertical[28]; /* remainder 29 */
4225
4226 columnar[8] = vertical[1]; /* remainder 2 */
4227 columnar[9] = vertical[5]; /* remainder 6 */
4228 columnar[10] = vertical[9]; /* remainder 10 */
4229 columnar[11] = vertical[13]; /* remainder 14 */
4230 columnar[12] = vertical[17]; /* remainder 18 */
4231 columnar[13] = vertical[21]; /* remainder 22 */
4232 columnar[14] = vertical[25]; /* remainder 26 */
4233 columnar[15] = vertical[29]; /* remainder 30 */
4234
4235 columnar[16] = vertical[2]; /* remainder 3 */
4236 columnar[17] = vertical[6]; /* remainder 7 */
4237 columnar[18] = vertical[10]; /* remainder 11 */
4238 columnar[19] = vertical[14]; /* remainder 15 */
4239 columnar[20] = vertical[18]; /* remainder 19 */
4240 columnar[21] = vertical[22]; /* remainder 23 */
4241 columnar[22] = vertical[26]; /* remainder 27 */
4242 columnar[23] = vertical[30]; /* remainder 31 */
4243
4244 columnar[24] = vertical[3]; /* remainder 4 */
4245 columnar[25] = vertical[7]; /* remainder 8 */
4246 columnar[26] = vertical[11]; /* remainder 12 */
4247 columnar[27] = vertical[15]; /* remainder 16 */
4248 columnar[28] = vertical[19]; /* remainder 20 */
4249 columnar[29] = vertical[23]; /* remainder 24 */
4250 columnar[30] = vertical[27]; /* remainder 28 */
4251 columnar[31] = vertical[31]; /* remainder 32 */
4252
4253 columnar[32] = vertical[32]; /* remainder 63 */
4254 columnar[33] = vertical[36]; /* remainder 59 */
4255 columnar[34] = vertical[40]; /* remainder 55 */
4256 columnar[35] = vertical[44]; /* remainder 51 */
4257 columnar[36] = vertical[48]; /* remainder 47 */
4258 columnar[37] = vertical[52]; /* remainder 43 */
4259 columnar[38] = vertical[56]; /* remainder 39 */
4260 columnar[39] = vertical[60]; /* remainder 35 */
4261
4262 columnar[40] = vertical[33]; /* remainder 62 */
4263 columnar[41] = vertical[37]; /* remainder 58 */
4264 columnar[42] = vertical[41]; /* remainder 54 */
4265 columnar[43] = vertical[45]; /* remainder 50 */
4266 columnar[44] = vertical[49]; /* remainder 46 */
4267 columnar[45] = vertical[53]; /* remainder 42 */
4268 columnar[46] = vertical[57]; /* remainder 38 */
4269 columnar[47] = vertical[61]; /* remainder 34 */
4270
4271 columnar[48] = vertical[34]; /* remainder 61 */
4272 columnar[49] = vertical[38]; /* remainder 57 */
4273 columnar[50] = vertical[42]; /* remainder 53 */
4274 columnar[51] = vertical[46]; /* remainder 49 */
4275 columnar[52] = vertical[50]; /* remainder 45 */
4276 columnar[53] = vertical[54]; /* remainder 41 */
4277 columnar[54] = vertical[58]; /* remainder 37 */
4278 columnar[55] = vertical[62]; /* remainder 33 */
4279
4280 columnar[56] = vertical[35]; /* remainder 60 */
4281 columnar[57] = vertical[39]; /* remainder 56 */
4282 columnar[58] = vertical[43]; /* remainder 52 */
4283 columnar[59] = vertical[47]; /* remainder 48 */
4284 columnar[60] = vertical[51]; /* remainder 44 */
4285 columnar[61] = vertical[55]; /* remainder 40 */
4286 columnar[62] = vertical[59]; /* remainder 36 */
4287 columnar[63] = vertical[63]; /* remainder 32 */
4288
4289 return;
4290 }
4291
4292
4293 #ifdef HAVE_SSE2
4294
4295 int
Bitpack64_write_columnar(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in,int packsize)4296 Bitpack64_write_columnar (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i,
4297 const UINT4 *_in, int packsize) {
4298 UINT4 columnar[BLOCKSIZE];
4299
4300 #if 0
4301 int i;
4302
4303 printf("Entering with packsize %d\n",packsize);
4304 for (i = 0; i < BLOCKSIZE; i++) {
4305 printf("%d ",_in[i]);
4306 }
4307 printf("\n");
4308 #endif
4309
4310 columnar_order(columnar,_in);
4311
4312 switch (packsize) {
4313 case 0: return strm_buffer_i;
4314 case 2: return write_02_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,columnar);
4315 case 4: return write_04_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,columnar);
4316 case 6: return write_06_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,columnar);
4317 case 8: return write_08_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,columnar);
4318 case 10: return write_10_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,columnar);
4319 case 12: return write_12_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,columnar);
4320 case 14: return write_14_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,columnar);
4321 case 16: return write_16_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,columnar);
4322 case 18: return write_18_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,columnar);
4323 case 20: return write_20_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,columnar);
4324 case 22: return write_22_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,columnar);
4325 case 24: return write_24_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,columnar);
4326 case 26: return write_26_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,columnar);
4327 case 28: return write_28_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,columnar);
4328 case 30: return write_30_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,columnar);
4329 case 32: return write_32_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,columnar);
4330
4331 default: fprintf(stderr,"packsize of %d not allowed\n",packsize); abort();
4332 }
4333 }
4334
4335 #else
4336
4337 /* Non-SIMD code cannot write vertical format easily, so using
4338 horizontal code and conversions */
4339
4340 int
Bitpack64_write_columnar(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * horizontal,int packsize)4341 Bitpack64_write_columnar (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i,
4342 const UINT4 *horizontal, int packsize) {
4343 int nwritten;
4344 UINT4 buffer[BLOCKSIZE], vertical[BLOCKSIZE];
4345 UINT4 columnar[BLOCKSIZE];
4346
4347 #if 0
4348 int i;
4349
4350 printf("Entering with packsize %d\n",packsize);
4351 for (i = 0; i < BLOCKSIZE; i++) {
4352 printf("%d ",horizontal[i]);
4353 }
4354 printf("\n");
4355 #endif
4356
4357 columnar_order(columnar,horizontal);
4358 reorder_values_vertically(vertical,columnar);
4359 memset((void *) buffer,0,BLOCKSIZE*sizeof(UINT4));
4360
4361 switch (packsize) {
4362 case 0: return strm_buffer_i;
4363 case 2: nwritten = pack_02_horiz(buffer,&(vertical[0])); break;
4364 case 4: nwritten = pack_04_horiz(buffer,&(vertical[0])); break;
4365 case 6: nwritten = pack_06_horiz(buffer,&(vertical[0])); break;
4366 case 8: nwritten = pack_08_horiz(buffer,&(vertical[0])); break;
4367 case 10: nwritten = pack_10_horiz(buffer,&(vertical[0])); break;
4368 case 12: nwritten = pack_12_horiz(buffer,&(vertical[0])); break;
4369 case 14: nwritten = pack_14_horiz(buffer,&(vertical[0])); break;
4370 case 16: nwritten = pack_16_horiz(buffer,&(vertical[0])); break;
4371 case 18: nwritten = pack_18_horiz(buffer,&(vertical[0])); break;
4372 case 20: nwritten = pack_20_horiz(buffer,&(vertical[0])); break;
4373 case 22: nwritten = pack_22_horiz(buffer,&(vertical[0])); break;
4374 case 24: nwritten = pack_24_horiz(buffer,&(vertical[0])); break;
4375 case 26: nwritten = pack_26_horiz(buffer,&(vertical[0])); break;
4376 case 28: nwritten = pack_28_horiz(buffer,&(vertical[0])); break;
4377 case 30: nwritten = pack_30_horiz(buffer,&(vertical[0])); break;
4378 case 32: nwritten = pack_32_horiz(buffer,&(vertical[0])); break;
4379 default: fprintf(stderr,"packsize of %d not allowed\n",packsize); abort();
4380 }
4381
4382 return write_reg_buffered_vert(strm_fp,strm_buffer,
4383 strm_buffer_size,strm_buffer_i,
4384 buffer,nwritten);
4385 }
4386
4387 #endif
4388
4389
4390
4391 /* Horizontal format is slightly more complicated for random access of individual values */
4392 int
Bitpack64_write_horiz(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * horizontal,int packsize)4393 Bitpack64_write_horiz (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i,
4394 const UINT4 *horizontal, int packsize) {
4395 int nwritten;
4396 UINT4 buffer[BLOCKSIZE];
4397
4398 write_setup();
4399
4400 #if 0
4401 int i;
4402
4403 printf("Entering with packsize %d\n",packsize);
4404 for (i = 0; i < BLOCKSIZE; i++) {
4405 printf("%d ",_in[i]);
4406 }
4407 printf("\n");
4408 #endif
4409
4410 memset((void *) buffer,0,BLOCKSIZE*sizeof(UINT4));
4411
4412 switch (packsize) {
4413 case 0: return strm_buffer_i;
4414 case 2: nwritten = pack_02_horiz(buffer,&(horizontal[0])); break;
4415 case 4: nwritten = pack_04_horiz(buffer,&(horizontal[0])); break;
4416 case 6: nwritten = pack_06_horiz(buffer,&(horizontal[0])); break;
4417 case 8: nwritten = pack_08_horiz(buffer,&(horizontal[0])); break;
4418 case 10: nwritten = pack_10_horiz(buffer,&(horizontal[0])); break;
4419 case 12: nwritten = pack_12_horiz(buffer,&(horizontal[0])); break;
4420 case 14: nwritten = pack_14_horiz(buffer,&(horizontal[0])); break;
4421 case 16: nwritten = pack_16_horiz(buffer,&(horizontal[0])); break;
4422 case 18: nwritten = pack_18_horiz(buffer,&(horizontal[0])); break;
4423 case 20: nwritten = pack_20_horiz(buffer,&(horizontal[0])); break;
4424 case 22: nwritten = pack_22_horiz(buffer,&(horizontal[0])); break;
4425 case 24: nwritten = pack_24_horiz(buffer,&(horizontal[0])); break;
4426 case 26: nwritten = pack_26_horiz(buffer,&(horizontal[0])); break;
4427 case 28: nwritten = pack_28_horiz(buffer,&(horizontal[0])); break;
4428 case 30: nwritten = pack_30_horiz(buffer,&(horizontal[0])); break;
4429 case 32: nwritten = pack_32_horiz(buffer,&(horizontal[0])); break;
4430 default: fprintf(stderr,"packsize of %d not allowed\n",packsize); abort();
4431 }
4432
4433 return write_reg_buffered_horiz(strm_fp,strm_buffer,
4434 strm_buffer_size,strm_buffer_i,
4435 buffer,nwritten);
4436 }
4437
4438
4439
4440 /* Processes 64 values at a time. Returns packsize. */
4441 /* Handles first 32 values from the initial value, and the last 32
4442 values from the final value. More efficient since we need to
4443 process only half as many inputs. */
4444 int
Bitpack64_compute_q4_diffs_bidir(UINT4 * diffs,UINT4 * values)4445 Bitpack64_compute_q4_diffs_bidir (UINT4 *diffs, UINT4 *values) {
4446 UINT4 packsize;
4447 int i;
4448 UINT4 maxdiff = 0;
4449 int firstbit;
4450 #ifdef HAVE_BUILTIN_CLZ
4451 #elif defined(HAVE_ASM_BSR)
4452 int msb;
4453 #endif
4454
4455 #if 0
4456 for (i = 0; i < 64; i++) {
4457 assert(values[i+1] >= values[i]);
4458 }
4459 #endif
4460
4461 maxdiff |= (diffs[32] = values[64] - values[63]);
4462 maxdiff |= (diffs[33] = values[64] - values[62]);
4463 maxdiff |= (diffs[34] = values[64] - values[61]);
4464 maxdiff |= (diffs[35] = values[64] - values[60]);
4465 for (i = 36; i < 64; i++) {
4466 maxdiff |= (diffs[i] = values[64+32-(i+1-4)] - values[64+32-(i+1)]);
4467 }
4468 for (i = 31; i >= 4; i--) {
4469 maxdiff |= (diffs[i] = values[i+1] - values[i+1-4]);
4470 }
4471 maxdiff |= (diffs[3] = values[4] - values[0]);
4472 maxdiff |= (diffs[2] = values[3] - values[0]);
4473 maxdiff |= (diffs[1] = values[2] - values[0]);
4474 maxdiff |= (diffs[0] = values[1] - values[0]);
4475
4476 if (maxdiff == 0) {
4477 /* __builtin_clz() behaves oddly on zero */
4478 return 0;
4479
4480 } else {
4481 #ifdef HAVE_BUILTIN_CLZ
4482 firstbit = __builtin_clz(maxdiff);
4483 packsize = 32 - firstbit;
4484 #elif defined(HAVE_ASM_BSR)
4485 asm("bsr %1,%0" : "=r"(msb) : "r"(maxdiff));
4486 packsize = msb + 1;
4487 #else
4488 firstbit = ((maxdiff >> 16) ? clz_table[maxdiff >> 16] : 16 + clz_table[maxdiff]);
4489 packsize = 32 - firstbit;
4490 #endif
4491
4492 #ifdef ALLOW_ODD_PACKSIZES
4493 return packsize;
4494 #else
4495 return (packsize + 1) & ~1; /* Converts packsizes to the next multiple of 2 */
4496 #endif
4497 }
4498 }
4499
4500
4501 #ifdef HAVE_64_BIT
4502 static int
Bitpack64_compute_q4_diffs_bidir_huge(UINT4 * diffs,UINT8 * values)4503 Bitpack64_compute_q4_diffs_bidir_huge (UINT4 *diffs, UINT8 *values) {
4504 UINT4 packsize;
4505 int i;
4506 UINT4 maxdiff = 0;
4507 int firstbit;
4508 #ifdef HAVE_BUILTIN_CLZ
4509 #elif defined(HAVE_ASM_BSR)
4510 int msb;
4511 #endif
4512
4513 #if 0
4514 for (i = 0; i < 64; i++) {
4515 assert(values[i+1] >= values[i]);
4516 }
4517 #endif
4518
4519 maxdiff |= (diffs[32] = (UINT4) (values[64] - values[63]));
4520 maxdiff |= (diffs[33] = (UINT4) (values[64] - values[62]));
4521 maxdiff |= (diffs[34] = (UINT4) (values[64] - values[61]));
4522 maxdiff |= (diffs[35] = (UINT4) (values[64] - values[60]));
4523 for (i = 36; i < 64; i++) {
4524 maxdiff |= (diffs[i] = (UINT4) (values[64+32-(i+1-4)] - values[64+32-(i+1)]));
4525 }
4526 for (i = 31; i >= 4; i--) {
4527 maxdiff |= (diffs[i] = (UINT4) (values[i+1] - values[i+1-4]));
4528 }
4529 maxdiff |= (diffs[3] = (UINT4) (values[4] - values[0]));
4530 maxdiff |= (diffs[2] = (UINT4) (values[3] - values[0]));
4531 maxdiff |= (diffs[1] = (UINT4) (values[2] - values[0]));
4532 maxdiff |= (diffs[0] = (UINT4) (values[1] - values[0]));
4533
4534 if (maxdiff == 0) {
4535 /* __builtin_clz() behaves oddly on zero */
4536 return 0;
4537
4538 } else {
4539 #ifdef HAVE_BUILTIN_CLZ
4540 firstbit = __builtin_clz(maxdiff);
4541 packsize = 32 - firstbit;
4542 #elif defined(HAVE_ASM_BSR)
4543 asm("bsr %1,%0" : "=r"(msb) : "r"(maxdiff));
4544 packsize = msb + 1;
4545 #else
4546 firstbit = ((maxdiff >> 16) ? clz_table[maxdiff >> 16] : 16 + clz_table[maxdiff]);
4547 packsize = 32 - firstbit;
4548 #endif
4549
4550 #ifdef ALLOW_ODD_PACKSIZES
4551 return packsize;
4552 #else
4553 return (packsize + 1) & ~1; /* Converts packsizes to the next multiple of 2 */
4554 #endif
4555 }
4556 }
4557 #endif
4558
4559
4560 #if 0
/* Currently compiled out by the enclosing #if 0.

   Computes the 64 successive (stride-1) differences
   diffs[i] = values[i+1] - values[i] for i = 0..63, reading
   values[0] through values[64].

   Returns 0 if every diff is zero; otherwise the bit width of the OR
   of all diffs, rounded up to an even number unless
   ALLOW_ODD_PACKSIZES is defined. */
static int
compute_q1_diffs (UINT4 *diffs, UINT4 *values) {
  UINT4 packsize;
  int i;
  UINT4 maxdiff = 0;            /* OR of all diffs, not an arithmetic maximum */
  int firstbit;
#ifdef HAVE_BUILTIN_CLZ
#elif defined(HAVE_ASM_BSR)
  int msb;
#endif

#if 0
  for (i = 0; i < 64; i++) {
    assert(values[i+1] >= values[i]);
  }
#endif

  for (i = 63; i >= 0; i--) {
    maxdiff |= (diffs[i] = values[i+1] - values[i]);
  }

  if (maxdiff == 0) {
    /* __builtin_clz() behaves oddly on zero */
    return 0;

  } else {
    /* Three alternative ways of locating the highest set bit */
#ifdef HAVE_BUILTIN_CLZ
    firstbit = __builtin_clz(maxdiff);
    packsize = 32 - firstbit;
#elif defined(HAVE_ASM_BSR)
    asm("bsr %1,%0" : "=r"(msb) : "r"(maxdiff));
    packsize = msb + 1;
#else
    firstbit = ((maxdiff >> 16) ? clz_table[maxdiff >> 16] : 16 + clz_table[maxdiff]);
    packsize = 32 - firstbit;
#endif

#ifdef ALLOW_ODD_PACKSIZES
    return packsize;
#else
    return (packsize + 1) & ~1;	/* Converts packsizes to the next multiple of 2 */
#endif
  }
}
4605 #endif
4606
4607
4608 /* Used by trindex and indexdb_cat programs */
4609 /* We want to store values 0..n, with final value at ascending[n]
4610 possibly stored as the final metainfo value */
4611 /* Stored in columnar order */
4612 void
Bitpack64_write_differential(char * ptrsfile,char * compfile,UINT4 * ascending,Oligospace_T n)4613 Bitpack64_write_differential (char *ptrsfile, char *compfile, UINT4 *ascending, Oligospace_T n) {
4614 FILE *ptrs_fp, *comp_fp;
4615 UINT4 *ptrs, *p;
4616 size_t nptrs;
4617 int i;
4618 Oligospace_T positioni;
4619
4620 /* Buffer is used to avoid frequent writes to the file */
4621 UINT4 *buffer;
4622 int buffer_size = BUFFER_SIZE;
4623 int buffer_i;
4624
4625 UINT4 diffs[BLOCKSIZE], last_block[BLOCKSIZE+1];
4626
4627 UINT4 nwritten;
4628 int packsize;
4629
4630
4631 write_setup();
4632
4633 /* printf("Entered Bitpack64_write_differential with n %llu\n",n); */
4634
4635 /* 2 metavalues: nwritten (pointer) and cumulative sum for block.
4636 Packsize can be computed from difference between successive
4637 pointers, if only even packsizes are allowed */
4638 p = ptrs = (UINT4 *) CALLOC(((n + BLOCKSIZE)/BLOCKSIZE + 1) * DIFFERENTIAL_METAINFO_SIZE,sizeof(UINT4));
4639
4640 if ((comp_fp = FOPEN_WRITE_BINARY(compfile)) == NULL) {
4641 fprintf(stderr,"Can't write to compfile %s: %s\n",compfile,strerror(errno));
4642 abort();
4643 }
4644 buffer = (UINT4 *) CALLOC(buffer_size,sizeof(UINT4));
4645 buffer_i = 0;
4646
4647 nwritten = 0U;
4648
4649 /* Last value of ascending is at ascending[n] */
4650 /* Use <= n instead of < n, because we want ascending[n] to be taken care of by unpack_00, not a check for remainder == 0 */
4651 for (positioni = 0; positioni + BLOCKSIZE <= n; positioni += BLOCKSIZE) {
4652 /* Pointer */
4653 *p++ = nwritten/4; /* In 128-bit registers */
4654
4655 /* Value for start of block */
4656 *p++ = ascending[positioni];
4657
4658 /* Pack block of 64 diffs */
4659 packsize = Bitpack64_compute_q4_diffs_bidir(diffs,&(ascending[positioni]));
4660 buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,packsize);
4661
4662 #ifdef ALLOW_ODD_PACKSIZES
4663 nwritten += 2 * ((packsize + 1) & ~1);
4664 #else
4665 nwritten += 2 * packsize;
4666 #endif
4667 }
4668
4669 /* Old: Check for positioni < n, because if positioni == n, ascending[n] will be taken care of as metainfo */
4670 /* Use <= n instead of < n, because we want ascending[n] to be taken care of by unpack_00, not a check for remainder == 0 */
4671 if (positioni <= n) {
4672 /* Finish last block of 64 */
4673 *p++ = nwritten/4; /* In 128-bit registers */
4674
4675 /* Value for start of block */
4676 *p++ = ascending[positioni];
4677
4678 /* For differential, want <=. For direct, want < */
4679 for (i = 0; i <= (int) (n - positioni); i++) {
4680 last_block[i] = ascending[positioni+i];
4681 }
4682 for ( ; i <= BLOCKSIZE; i++) {
4683 /* Copy last value for rest of block */
4684 last_block[i] = ascending[n];
4685 }
4686
4687 /* Pack block of < 64 diffs */
4688 packsize = Bitpack64_compute_q4_diffs_bidir(diffs,last_block);
4689 buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,packsize);
4690
4691 #ifdef ALLOW_ODD_PACKSIZES
4692 nwritten += 2 * ((packsize + 1) & ~1);
4693 #else
4694 nwritten += 2 * packsize;
4695 #endif
4696 }
4697
4698
4699 /* Write the final pointer, which will point after the end of the file */
4700 *p++ = nwritten/4; /* In 128-bit registers */
4701
4702 /* Value for end of block */
4703 *p++ = ascending[n];
4704
4705 if ((ptrs_fp = FOPEN_WRITE_BINARY(ptrsfile)) == NULL) {
4706 fprintf(stderr,"Can't write to ptrsfile %s: %s\n",ptrsfile,strerror(errno));
4707 abort();
4708 } else {
4709 nptrs = p - ptrs;
4710 if (FWRITE_UINTS(ptrs,nptrs,ptrs_fp) != (size_t) nptrs) {
4711 fprintf(stderr,"Error in Bitpack64_write_differential: %s\n",strerror(errno));
4712 exit(9);
4713 }
4714 FREE(ptrs);
4715 fclose(ptrs_fp);
4716 }
4717
4718 /* Empty buffer */
4719 if (buffer_i > 0) {
4720 if (FWRITE_UINTS(buffer,buffer_i,comp_fp) != (size_t) buffer_i) {
4721 fprintf(stderr,"Error in Bitpack64_write_differential: %s\n",strerror(errno));
4722 exit(9);
4723 }
4724 buffer_i = 0;
4725 }
4726 FREE(buffer);
4727 fclose(comp_fp);
4728
4729 return;
4730 }
4731
4732
4733 static UINT4
compute_ascending(UINT4 * ascending,UINT4 * counts)4734 compute_ascending (UINT4 *ascending, UINT4 *counts) {
4735 int i;
4736
4737 ascending[0] = 0;
4738 for (i = 1; i <= 64; i++) {
4739 ascending[i] = ascending[i-1] + counts[i-1];
4740 }
4741
4742 return ascending[64];
4743 }
4744
4745 static UINT8
compute_ascending_huge(UINT8 * ascending,UINT4 * counts)4746 compute_ascending_huge (UINT8 *ascending, UINT4 *counts) {
4747 int i;
4748
4749 ascending[0] = 0;
4750 for (i = 1; i <= 64; i++) {
4751 ascending[i] = ascending[i-1] + (UINT8) counts[i-1];
4752 }
4753
4754 return ascending[64];
4755 }
4756
4757 #ifdef COUNTS_WITHOUT_COMPRESSION
4758 static void
compare_offsets_huge(UINT8 * ascending,UINT4 * counts_direct,Oligospace_T positioni)4759 compare_offsets_huge (UINT8 *ascending, UINT4 *counts_direct, Oligospace_T positioni) {
4760 int i, j;
4761 UINT8 totalcount = ascending[0];
4762
4763 for (i = 1; i <= 64; i++) {
4764 if (ascending[i] != totalcount + counts_direct[i]) {
4765 fprintf(stderr,"At positioni %llu, element %d, computed with compression %llu != computed without compression %llu\n",
4766 positioni,i,ascending[i],totalcount + counts_direct[i]);
4767 for (j = 0; j < 64; j++) {
4768 fprintf(stderr,"%d: %llu %u\n",j,ascending[j],counts_direct[j]);
4769 }
4770 abort();
4771 }
4772 totalcount += counts_direct[i];
4773 }
4774
4775 return;
4776 }
4777 #endif
4778
4779
4780 /* We want to store values 0..n, with final value at ascending[n]
4781 possibly stored as the final metainfo value */
4782 /* Stored in columnar order */
4783 void
Bitpack64_write_differential_bitpacks(char * ptrsfile,char * compfile,char * packsizes,UINT4 ** bitpacks,Oligospace_T n)4784 Bitpack64_write_differential_bitpacks (char *ptrsfile, char *compfile, char *packsizes, UINT4 **bitpacks,
4785 Oligospace_T n) {
4786 FILE *ptrs_fp, *comp_fp;
4787 UINT4 *ptrs, *p, nregisters;
4788 UINT4 totalcount;
4789 size_t nptrs;
4790 int i;
4791 Oligospace_T positioni, bmer;
4792
4793 /* Buffer is used to avoid frequent writes to the file */
4794 UINT4 *buffer;
4795 int buffer_size = BUFFER_SIZE;
4796 int buffer_i;
4797
4798 UINT4 diffs[BLOCKSIZE], ascending[BLOCKSIZE+1], counts[BLOCKSIZE], last_block[BLOCKSIZE];
4799 int packsize;
4800
4801
4802 write_setup();
4803
4804 /* printf("Entered Bitpack64_write_differential with n %llu\n",n); */
4805
4806 /* 2 metavalues: nwritten (pointer) and cumulative sum for block.
4807 Packsize can be computed from difference between successive
4808 pointers, if only even packsizes are allowed */
4809 p = ptrs = (UINT4 *) CALLOC(((n + BLOCKSIZE)/BLOCKSIZE + 1) * DIFFERENTIAL_METAINFO_SIZE,sizeof(UINT4));
4810
4811 if ((comp_fp = FOPEN_WRITE_BINARY(compfile)) == NULL) {
4812 fprintf(stderr,"Can't write to compfile %s: %s\n",compfile,strerror(errno));
4813 abort();
4814 }
4815 buffer = (UINT4 *) CALLOC(buffer_size,sizeof(UINT4));
4816 buffer_i = 0;
4817
4818 nregisters = 0U;
4819
4820 /* Last value of ascending is at ascending[n] */
4821 /* Use <= n instead of < n, because we want ascending[n] to be taken care of by unpack_00, not a check for remainder == 0 */
4822 totalcount = 0;
4823 for (positioni = 0, bmer = 0; positioni + BLOCKSIZE <= n; positioni += BLOCKSIZE, bmer++) {
4824 /* Pointer */
4825 *p++ = nregisters; /* In 128-bit registers */
4826
4827 /* Value for start of block */
4828 *p++ = totalcount;
4829
4830 /* Pack block of 64 diffs */
4831 Bitpack64_extract_bitpack(counts,packsizes[bmer],bitpacks[bmer]);
4832 totalcount += compute_ascending(ascending,counts);
4833 packsize = Bitpack64_compute_q4_diffs_bidir(diffs,ascending); /* Note: This packsize may differ from packsizes[bmer], because of calculation of diffs */
4834 buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,packsize);
4835
4836 nregisters += packsize / 2;
4837 }
4838
4839 /* Old: Check for positioni < n, because if positioni == n, ascending[n] will be taken care of as metainfo */
4840 /* Use <= n instead of < n, because we want ascending[n] to be taken care of by unpack_00, not a check for remainder == 0 */
4841 /* For nucleotides, expect a single final block where positioni == n */
4842 if (positioni <= n) {
4843 /* Finish last block of 64 */
4844 *p++ = nregisters; /* In 128-bit registers */
4845
4846 /* Value for start of block */
4847 *p++ = totalcount;
4848
4849 if (positioni == n) {
4850 /* Don't have a bitpack at [bmerspace]. Just fills counts with zeroes. */
4851 Bitpack64_extract_bitpack(counts,/*packsize*/0,/*bitpack*/NULL);
4852 } else {
4853 Bitpack64_extract_bitpack(counts,packsizes[bmer],bitpacks[bmer]);
4854 }
4855
4856 /* For differential, want <=. For direct, want < */
4857 for (i = 0; i <= (int) (n - positioni); i++) {
4858 last_block[i] = counts[i];
4859 }
4860 for ( ; i < BLOCKSIZE; i++) {
4861 /* Copy last value for rest of block */
4862 last_block[i] = 0;
4863 }
4864
4865 /* Pack block of < 64 diffs */
4866 totalcount += compute_ascending(ascending,last_block);
4867 packsize = Bitpack64_compute_q4_diffs_bidir(diffs,ascending);
4868 buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,packsize);
4869
4870 nregisters += packsize / 2;
4871 }
4872
4873 /* Write the final pointer, which will point after the end of the file */
4874 *p++ = nregisters; /* In 128-bit registers */
4875
4876 /* Value for end of block */
4877 *p++ = totalcount;
4878
4879 if ((ptrs_fp = FOPEN_WRITE_BINARY(ptrsfile)) == NULL) {
4880 fprintf(stderr,"Can't write to ptrsfile %s: %s\n",ptrsfile,strerror(errno));
4881 abort();
4882 } else {
4883 nptrs = p - ptrs;
4884 if (FWRITE_UINTS(ptrs,nptrs,ptrs_fp) != (size_t) nptrs) {
4885 fprintf(stderr,"Error in Bitpack64_write_differential_bitpacks: %s\n",strerror(errno));
4886 exit(9);
4887 }
4888 FREE(ptrs);
4889 fclose(ptrs_fp);
4890 }
4891
4892 /* Empty buffer */
4893 if (buffer_i > 0) {
4894 if (FWRITE_UINTS(buffer,buffer_i,comp_fp) != (size_t) buffer_i) {
4895 fprintf(stderr,"Error in Bitpack64_write_differential_bitpacks: %s\n",strerror(errno));
4896 exit(9);
4897 }
4898 buffer_i = 0;
4899 }
4900 FREE(buffer);
4901 fclose(comp_fp);
4902
4903 return;
4904 }
4905
4906
4907 #if 0
4908 /* We want to store values 0..n, with final value at ascending[n]
4909 possibly stored as the final metainfo value */
4910 /* Stored in columnar order */
/* Currently compiled out by the enclosing #if 0.

   Variant of Bitpack64_write_differential_bitpacks that writes to
   already-open streams instead of opening files by name, and does not
   close them.

   totalcount: output; reset to 0 on entry and left holding the sum of
     all counts processed
   ptrs_fp: stream that receives the metainfo (pointer and cumulative
     count per block); no trailing end-of-data entry is written
   comp_fp: stream that receives the packed diffs
   packsizes, bitpacks: per-block inputs for Bitpack64_extract_bitpack()
   n: index of the last value

   Returns the number of 128-bit registers of packed data produced. */
UINT4
Bitpack64_append_differential_bitpacks (UINT4 *totalcount, FILE *ptrs_fp, FILE *comp_fp, char *packsizes, UINT4 **bitpacks,
					Oligospace_T n) {
  UINT4 *ptrs, *p, nregisters;
  size_t nptrs;
  int i;
  Oligospace_T positioni, bmer;

  /* Buffer is used to avoid frequent writes to the file */
  UINT4 *buffer;
  int buffer_size = BUFFER_SIZE;
  int buffer_i;

  UINT4 diffs[BLOCKSIZE], ascending[BLOCKSIZE+1], counts[BLOCKSIZE], last_block[BLOCKSIZE];
  int packsize;


  write_setup();

  /* printf("Entered Bitpack64_write_differential with n %llu\n",n); */

  /* 2 metavalues: nwritten (pointer) and cumulative sum for block.
     Packsize can be computed from difference between successive
     pointers, if only even packsizes are allowed */
  p = ptrs = (UINT4 *) CALLOC(((n + BLOCKSIZE)/BLOCKSIZE + 1) * DIFFERENTIAL_METAINFO_SIZE,sizeof(UINT4));

  buffer = (UINT4 *) CALLOC(buffer_size,sizeof(UINT4));
  buffer_i = 0;

  nregisters = 0U;

  /* Last value of ascending is at ascending[n] */
  /* Use <= n instead of < n, because we want ascending[n] to be taken care of by unpack_00, not a check for remainder == 0 */
  *totalcount = 0;
  for (positioni = 0, bmer = 0; positioni + BLOCKSIZE <= n; positioni += BLOCKSIZE, bmer++) {
    /* Pointer */
    *p++ = nregisters;	/* In 128-bit registers */

    /* Value for start of block */
    *p++ = *totalcount;

    /* Pack block of 64 diffs */
    Bitpack64_extract_bitpack(counts,packsizes[bmer],bitpacks[bmer]);
    *totalcount += compute_ascending(ascending,counts);
    packsize = Bitpack64_compute_q4_diffs_bidir(diffs,ascending); /* Note: This packsize may differ from packsizes[bmer], because of calculation of diffs */
    buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,packsize);

    /* 64 values at packsize bits each occupy packsize/2 registers of 128 bits */
    nregisters += packsize / 2;
  }

  /* Old: Check for positioni < n, because if positioni == n, ascending[n] will be taken care of as metainfo */
  /* Use <= n instead of < n, because we want ascending[n] to be taken care of by unpack_00, not a check for remainder == 0 */
  /* For nucleotides, expect a single final block where positioni == n */
  if (positioni <= n) {
    /* Finish last block of 64 */
    *p++ = nregisters;	/* In 128-bit registers */

    /* Value for start of block */
    *p++ = *totalcount;

    if (positioni == n) {
      /* Don't have a bitpack at [bmerspace].  Just fills counts with zeroes. */
      Bitpack64_extract_bitpack(counts,/*packsize*/0,/*bitpack*/NULL);
    } else {
      Bitpack64_extract_bitpack(counts,packsizes[bmer],bitpacks[bmer]);
    }

    /* For differential, want <=.  For direct, want < */
    for (i = 0; i <= (int) (n - positioni); i++) {
      last_block[i] = counts[i];
    }
    for ( ; i < BLOCKSIZE; i++) {
      /* Zero counts for the rest of the block */
      last_block[i] = 0;
    }

    /* Pack block of < 64 diffs */
    *totalcount += compute_ascending(ascending,last_block);
    packsize = Bitpack64_compute_q4_diffs_bidir(diffs,ascending);
    buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,packsize);

    nregisters += packsize / 2;
  }

#if 0
  /* Since we are providing blocks of 64, the meta entry at [64] does get written */
  /* Write the final pointer, which will point after the end of the file */
  *p++ = nregisters;	/* In 128-bit registers */

  /* Value for end of block */
  *p++ = *totalcount;
#endif

  /* Flush all metainfo in one write; the stream is left open for the caller */
  nptrs = p - ptrs;
  if (FWRITE_UINTS(ptrs,nptrs,ptrs_fp) != (size_t) nptrs) {
    fprintf(stderr,"Error in Bitpack64_append_differential_bitpacks: %s\n",strerror(errno));
    exit(9);
  }
  FREE(ptrs);

  /* Empty buffer */
  if (buffer_i > 0) {
    if (FWRITE_UINTS(buffer,buffer_i,comp_fp) != (size_t) buffer_i) {
      fprintf(stderr,"Error in Bitpack64_append_differential_bitpacks: %s\n",strerror(errno));
      exit(9);
    }
    buffer_i = 0;
  }
  FREE(buffer);

  return nregisters;
}
5023 #endif
5024
5025
5026
5027 #if 0
5028 /* We want to store values 0..n, with final value at ascending[n]
5029 possibly stored as the final metainfo value */
5030 /* D4 stored in columnar order, plus D1 stored as direct */
5031 void
5032 Bitpack64_write_differential_paired (char *ptrsfile, char *compfile, UINT4 *ascending, Oligospace_T n) {
5033 FILE *ptrs_fp, *comp_fp;
5034 UINT4 *ptrs, *p;
5035 size_t nptrs;
5036 int i;
5037 Oligospace_T positioni;
5038
5039 /* Buffer is used to avoid frequent writes to the file */
5040 UINT4 *buffer;
5041 int buffer_size = BUFFER_SIZE;
5042 int buffer_i;
5043
5044 UINT4 diffs[BLOCKSIZE], last_block[BLOCKSIZE+1];
5045
5046 UINT4 nwritten;
5047 int packsize;
5048
5049
5050 write_setup();
5051
5052 /* 2 metavalues: nwritten (pointer) and cumulative sum for block.
5053 Packsize can be computed from difference between successive
5054 pointers, if only even packsizes are allowed */
5055 p = ptrs = (UINT4 *) CALLOC(((n + BLOCKSIZE)/BLOCKSIZE + 1) * PAIRED_METAINFO_SIZE,sizeof(UINT4));
5056
5057 if ((comp_fp = FOPEN_WRITE_BINARY(compfile)) == NULL) {
5058 fprintf(stderr,"Can't write to compfile %s: %s\n",compfile,strerror(errno));
5059 abort();
5060 }
5061 buffer = (UINT4 *) CALLOC(buffer_size,sizeof(UINT4));
5062 buffer_i = 0;
5063
5064 nwritten = 0U;
5065
5066 /* Last value of ascending is at ascending[n] */
5067 /* Use <= n instead of < n, because we want ascending[n] to be taken care of by unpack_00, not a check for remainder == 0 */
5068 for (positioni = 0; positioni + BLOCKSIZE <= n; positioni += BLOCKSIZE) {
5069 /* Pointer to D4 */
5070 *p++ = nwritten/4; /* In 128-bit registers */
5071
5072 /* Prefix sum for start of block */
5073 *p++ = ascending[positioni];
5074
5075 /* D4: Pack block of 64 diffs */
5076 packsize = Bitpack64_compute_q4_diffs_bidir(diffs,&(ascending[positioni]));
5077 buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,packsize);
5078
5079 #ifdef ALLOW_ODD_PACKSIZES
5080 nwritten += 2 * ((packsize + 1) & ~1);
5081 #else
5082 nwritten += 2 * packsize;
5083 #endif
5084
5085 /* Pointer to D1 */
5086 *p++ = nwritten/4; /* In 128-bit registers */
5087
5088 /* D1: Pack block of 64 diffs */
5089 packsize = compute_q1_diffs(diffs,&(ascending[positioni]));
5090 buffer_i = write_vert(comp_fp,buffer,buffer_size,buffer_i,diffs,packsize);
5091
5092 #ifdef ALLOW_ODD_PACKSIZES
5093 nwritten += 2 * ((packsize + 1) & ~1);
5094 #else
5095 nwritten += 2 * packsize;
5096 #endif
5097 }
5098
5099 /* Old: Check for positioni < n, because if positioni == n, ascending[n] will be taken care of as metainfo */
5100 /* Use <= n instead of < n, because we want ascending[n] to be taken care of by unpack_00, not a check for remainder == 0 */
5101 if (positioni <= n) {
5102 /* Finish last block of 64 */
5103 /* Pointer to D4 */
5104 *p++ = nwritten/4; /* In 128-bit registers */
5105
5106 /* Prefix sum for start of block */
5107 *p++ = ascending[positioni];
5108
5109 /* For differential, want <=. For direct, want < */
5110 for (i = 0; i <= (int) (n - positioni); i++) {
5111 last_block[i] = ascending[positioni+i];
5112 }
5113 for ( ; i <= BLOCKSIZE; i++) {
5114 /* Copy last value for rest of block */
5115 last_block[i] = ascending[n];
5116 }
5117
5118 /* D4: Pack block of < 64 diffs */
5119 packsize = Bitpack64_compute_q4_diffs_bidir(diffs,last_block);
5120 buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,packsize);
5121
5122 #ifdef ALLOW_ODD_PACKSIZES
5123 nwritten += 2 * ((packsize + 1) & ~1);
5124 #else
5125 nwritten += 2 * packsize;
5126 #endif
5127
5128 /* Pointer to D1 */
5129 *p++ = nwritten/4; /* In 128-bit registers */
5130
5131 /* D1: Pack block of < 64 diffs */
5132 packsize = compute_q1_diffs(diffs,last_block);
5133 buffer_i = write_vert(comp_fp,buffer,buffer_size,buffer_i,diffs,packsize);
5134 }
5135
5136 /* Write the final pointer, which will point after the end of the file */
5137 *p++ = nwritten/4; /* In 128-bit registers */
5138
5139 /* Prefix sum for end of block */
5140 *p++ = ascending[n];
5141
5142 if ((ptrs_fp = FOPEN_WRITE_BINARY(ptrsfile)) == NULL) {
5143 fprintf(stderr,"Can't write to ptrsfile %s: %s\n",ptrsfile,strerror(errno));
5144 abort();
5145 } else {
5146 nptrs = p - ptrs;
5147 if (FWRITE_UINTS(ptrs,nptrs,ptrs_fp) != (size_t) nptrs) {
5148 fprintf(stderr,"Error in Bitpack64_write_differential_paired: %s\n",strerror(errno));
5149 exit(9);
5150 }
5151 FREE(ptrs);
5152 fclose(ptrs_fp);
5153 }
5154
5155 /* Empty buffer */
5156 if (buffer_i > 0) {
5157 if (FWRITE_UINTS(buffer,buffer_i,comp_fp) != (size_t) buffer_i) {
5158 fprintf(stderr,"Error in Bitpack64_write_differential_paired: %s\n",strerror(errno));
5159 exit(9);
5160 }
5161 buffer_i = 0;
5162 }
5163 FREE(buffer);
5164 fclose(comp_fp);
5165
5166 return;
5167 }
5168 #endif
5169
5170
5171
5172
5173 #if 0
5174 /* Worst case:
5175 64 128 192 256
5176 256 256 256 256 */
5177
5178 #define FIXED10_PACKSIZE 10 /* Enough to hold +/- 256 */
5179
5180 /* We want to store values 0..n, with final value at ascending[n]
5181 possibly stored as the final metainfo value */
5182 /* Stored in columnar order */
/* Currently compiled out by the enclosing #if 0.

   Writes an array of values in differential format where every block
   is packed at the fixed width FIXED10_PACKSIZE, so no per-block
   pointer is stored (the pointer writes below are disabled).

   ptrsfile: path of the metainfo file; used only when
     USE_ONE_FILE_FOR_FIXED is not defined
   compfile: path of the file that receives the packed diffs
   ascending: values at indices 0 through n inclusive
   n: index of the last value

   When USE_ONE_FILE_FOR_FIXED is defined, the block start values are
   instead written to comp_fp in groups of 4 UINT4 words, with the last
   group padded with zeroes. */
void
Bitpack64_write_fixed10 (char *ptrsfile, char *compfile, UINT4 *ascending, Oligospace_T n) {
#ifndef USE_ONE_FILE_FOR_FIXED
  FILE *ptrs_fp;
#endif
  FILE *comp_fp;
  UINT4 *ptrs;
  UINT4 ptri;
  int i;
  Oligospace_T positioni;

  /* Buffer is used to avoid frequent writes to the file */
  UINT4 *buffer;
  int buffer_size = BUFFER_SIZE;
  int buffer_i;

  UINT4 diffs[BLOCKSIZE], last_block[BLOCKSIZE+1];

  UINT4 nwritten;
  int packsize;

  write_setup();

  /* 1 metavalue per block: the value at the start of the block.  No
     pointer is needed because every block has the same packed size. */
#ifdef USE_ONE_FILE_FOR_FIXED
  ptrs = (UINT4 *) CALLOC(4,sizeof(UINT4));
  ptri = 0;
#else
  ptrs = (UINT4 *) CALLOC(((n + BLOCKSIZE)/BLOCKSIZE + 1) * RANK_METAINFO_SIZE,sizeof(UINT4));
  ptri = 0;
#endif

  if ((comp_fp = FOPEN_WRITE_BINARY(compfile)) == NULL) {
    fprintf(stderr,"Can't write to compfile %s: %s\n",compfile,strerror(errno));
    abort();
  }
  buffer = (UINT4 *) CALLOC(buffer_size,sizeof(UINT4));
  buffer_i = 0;

  nwritten = 0U;

  /* Last value of ascending is at ascending[n] */
  /* Use <= n instead of < n, because we want ascending[n] to be taken care of by unpack_00, not a check for remainder == 0 */
  for (positioni = 0; positioni + BLOCKSIZE <= n; positioni += BLOCKSIZE) {
#if 0
    /* Pointer */
    ptrs[ptri++] = nwritten/4;	/* In 128-bit registers */
#endif

    /* Value for start of block */
    ptrs[ptri++] = ascending[positioni];
#ifdef USE_ONE_FILE_FOR_FIXED
    /* Flush each full group of 4 start values straight to comp_fp */
    if (ptri == 4) {
      if (FWRITE_UINTS(ptrs,4,comp_fp) != (size_t) 4) {
	fprintf(stderr,"Error in Bitpack64_write_fixed10: %s\n",strerror(errno));
	exit(9);
      }
      ptri = 0;
    }
#endif

    /* Pack block of 64 diffs */
    packsize = Bitpack64_compute_q4_diffs_bidir(diffs,&(ascending[positioni]));
    /* The computed packsize is only checked; the block is written at the fixed width */
    assert(packsize <= FIXED10_PACKSIZE);
    buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,FIXED10_PACKSIZE);

    nwritten += 2 * FIXED10_PACKSIZE;
  }

  /* Old: Check for positioni < n, because if positioni == n, ascending[n] will be taken care of as metainfo */
  /* Use <= n instead of < n, because we want ascending[n] to be taken care of by unpack_00, not a check for remainder == 0 */
  if (positioni <= n) {
#if 0
    /* Finish last block of 64 */
    ptrs[ptri++] = nwritten/4;	/* In 128-bit registers */
#endif

    /* Value for start of block */
    ptrs[ptri++] = ascending[positioni];
#ifdef USE_ONE_FILE_FOR_FIXED
    if (ptri == 4) {
      if (FWRITE_UINTS(ptrs,4,comp_fp) != (size_t) 4) {
	fprintf(stderr,"Error in Bitpack64_write_fixed10: %s\n",strerror(errno));
	exit(9);
      }
      ptri = 0;
    }
#endif

    /* For differential, want <=.  For direct, want < */
    for (i = 0; i <= (int) (n - positioni); i++) {
      last_block[i] = ascending[positioni+i];
    }
    for ( ; i <= BLOCKSIZE; i++) {
      /* Copy last value for rest of block */
      last_block[i] = ascending[n];
    }

    /* Pack block of < 64 diffs */
    packsize = Bitpack64_compute_q4_diffs_bidir(diffs,last_block);
    assert(packsize <= FIXED10_PACKSIZE);
    buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,FIXED10_PACKSIZE);

    nwritten += 2 * FIXED10_PACKSIZE;
  }


#if 0
  /* Write the final pointer, which will point after the end of the file */
  ptrs[ptri++] = nwritten/4;	/* In 128-bit registers */
#endif

  /* Value for end of block */
  ptrs[ptri++] = ascending[n];
#ifdef USE_ONE_FILE_FOR_FIXED
  /* Pad the last group of 4 with zeroes before writing it */
  for (i = ptri; i < 4; i++) {
    ptrs[i] = 0U;
  }
  if (FWRITE_UINTS(ptrs,4,comp_fp) != (size_t) 4) {
    fprintf(stderr,"Error in Bitpack64_write_fixed10: %s\n",strerror(errno));
    exit(9);
  }
#else
  if ((ptrs_fp = FOPEN_WRITE_BINARY(ptrsfile)) == NULL) {
    fprintf(stderr,"Can't write to ptrsfile %s: %s\n",ptrsfile,strerror(errno));
    abort();
  } else {
    if (FWRITE_UINTS(ptrs,ptri,ptrs_fp) != (size_t) ptri) {
      fprintf(stderr,"Error in Bitpack64_write_fixed10: %s\n",strerror(errno));
      exit(9);
    }
    fclose(ptrs_fp);
  }
#endif
  FREE(ptrs);

  /* Empty buffer */
  if (buffer_i > 0) {
    if (FWRITE_UINTS(buffer,buffer_i,comp_fp) != (size_t) buffer_i) {
      fprintf(stderr,"Error in Bitpack64_write_fixed10: %s\n",strerror(errno));
      exit(9);
    }
    buffer_i = 0;
  }
  FREE(buffer);
  fclose(comp_fp);

  return;
}
5334 #endif
5335
5336
5337 void
Bitpack64_write_differential_huge(char * pagesfile,char * ptrsfile,char * compfile,UINT8 * ascending,Oligospace_T n)5338 Bitpack64_write_differential_huge (char *pagesfile, char *ptrsfile, char *compfile,
5339 UINT8 *ascending, Oligospace_T n) {
5340 UINT8 currpage, nextpage;
5341 FILE *pages_fp, *ptrs_fp, *comp_fp;
5342 UINT4 pages[25]; /* Allows us to handle up to 100 billion positions */
5343 UINT4 *ptrs, *p;
5344 size_t nptrs;
5345 Oligospace_T positioni;
5346
5347 /* Buffer is used to avoid frequent writes to the file */
5348 UINT4 *buffer;
5349 int buffer_size = BUFFER_SIZE;
5350 int buffer_i;
5351
5352 UINT4 diffs[BLOCKSIZE];
5353 UINT8 last_block[BLOCKSIZE+1];
5354
5355 int pagei = 0, i;
5356 UINT4 nwritten;
5357 int packsize;
5358
5359
5360 write_setup();
5361
5362 /* 2 metavalues: nwritten (pointer) and cumulative sum for block.
5363 Packsize can be computed from difference between successive
5364 pointers, if only even packsizes are allowed */
5365 p = ptrs = (UINT4 *) CALLOC(((n + BLOCKSIZE)/BLOCKSIZE + 1) * DIFFERENTIAL_METAINFO_SIZE,sizeof(UINT4));
5366
5367 if ((comp_fp = FOPEN_WRITE_BINARY(compfile)) == NULL) {
5368 fprintf(stderr,"Can't write to compfile %s: %s\n",compfile,strerror(errno));
5369 abort();
5370 }
5371 buffer = (UINT4 *) CALLOC(buffer_size,sizeof(UINT4));
5372 buffer_i = 0;
5373
5374 currpage = 0;
5375 nextpage = POSITIONS_PAGE;
5376 nwritten = 0U;
5377
5378 /* Last value of ascending is at ascending[n] */
5379 /* Use <= n instead of < n, because we want ascending[n] to be taken care of by unpack_00, not a check for remainder == 0 */
5380 for (positioni = 0; positioni + BLOCKSIZE <= n; positioni += BLOCKSIZE) {
5381 /* Pointer */
5382 *p++ = nwritten/4; /* In 128-bit registers */
5383
5384 /* Value for start of block */
5385 while (ascending[positioni] >= nextpage) {
5386 fprintf(stderr,"\nAt position %llu (block %llu), ascending %llu >= nextpage %llu",
5387 positioni,positioni/BLOCKSIZE,ascending[positioni],nextpage);
5388 pages[pagei++] = positioni/BLOCKSIZE;
5389 currpage = nextpage;
5390 nextpage += POSITIONS_PAGE;
5391 }
5392 *p++ = ascending[positioni] - currpage;
5393
5394
5395 /* Pack block of 64 diffs */
5396 packsize = Bitpack64_compute_q4_diffs_bidir_huge(diffs,&(ascending[positioni]));
5397 buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,packsize);
5398
5399 #ifdef ALLOW_ODD_PACKSIZES
5400 nwritten += 2 * ((packsize + 1) & ~1);
5401 #else
5402 nwritten += 2 * packsize;
5403 #endif
5404 }
5405
5406 /* Old: Check for positioni < n, because if positioni == n, ascending[n] will be taken care of as metainfo */
5407 /* Use <= n instead of < n, because we want ascending[n] to be taken care of by unpack_00, not a check for remainder == 0 */
5408 if (positioni <= n) {
5409 /* Finish last block of 64 */
5410 *p++ = nwritten/4; /* In 128-bit registers */
5411
5412 /* Value for start of block */
5413 while (ascending[positioni] >= nextpage) {
5414 fprintf(stderr,"\nAt position %llu (block %llu), ascending %llu >= nextpage %llu",
5415 positioni,positioni/BLOCKSIZE,ascending[positioni],nextpage);
5416 pages[pagei++] = positioni/BLOCKSIZE;
5417 currpage = nextpage;
5418 nextpage += POSITIONS_PAGE;
5419 }
5420 *p++ = ascending[positioni] - currpage;
5421
5422 /* For differential, want <=. For direct, want < */
5423 for (i = 0; i <= (int) (n - positioni); i++) {
5424 last_block[i] = ascending[positioni+i] - currpage;
5425 }
5426 for ( ; i <= BLOCKSIZE; i++) {
5427 /* Copy last value for rest of block */
5428 last_block[i] = ascending[n] - currpage;
5429 }
5430
5431 /* Pack block of < 64 diffs */
5432 packsize = Bitpack64_compute_q4_diffs_bidir_huge(diffs,last_block);
5433 buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,packsize);
5434
5435 #ifdef ALLOW_ODD_PACKSIZES
5436 nwritten += 2 * ((packsize + 1) & ~1);
5437 #else
5438 nwritten += 2 * packsize;
5439 #endif
5440 }
5441
5442
5443 /* Write the final pointer, which will point after the end of the file */
5444 *p++ = nwritten/4; /* In 128-bit registers */
5445
5446 /* Value for end of block */
5447 if (ascending[n] >= nextpage) {
5448 fprintf(stderr,"\nAt final oligo %llu (block %llu), ascending %llu >= nextpage %llu",
5449 n,n/BLOCKSIZE,ascending[n],nextpage);
5450 pages[pagei++] = n/BLOCKSIZE;
5451 currpage = nextpage;
5452 /* nextpage += POSITIONS_PAGE; */
5453 }
5454 *p++ = ascending[n] - currpage;
5455
5456
5457 /* Write pages */
5458 if (pagei > 0) {
5459 pages[pagei++] = (UINT4) -1; /* Final value */
5460 if ((pages_fp = FOPEN_WRITE_BINARY(pagesfile)) == NULL) {
5461 fprintf(stderr,"Can't write to pagesfile %s: %s\n",pagesfile,strerror(errno));
5462 abort();
5463 } else {
5464 fprintf(stderr,"\nHave %d pages:",pagei);
5465 for (i = 0; i < pagei; i++) {
5466 fprintf(stderr," %u",pages[i]);
5467 }
5468 fprintf(stderr,"\n");
5469 if (FWRITE_UINTS(pages,pagei,pages_fp) != (size_t) pagei) {
5470 fprintf(stderr,"Error in Bitpack64_write_differential_huge: %s\n",strerror(errno));
5471 exit(9);
5472 }
5473 /* FREE(pages); */
5474 fclose(pages_fp);
5475 }
5476 }
5477
5478 if ((ptrs_fp = FOPEN_WRITE_BINARY(ptrsfile)) == NULL) {
5479 fprintf(stderr,"Can't write to ptrsfile %s: %s\n",ptrsfile,strerror(errno));
5480 abort();
5481 } else {
5482 nptrs = p - ptrs;
5483 if (FWRITE_UINTS(ptrs,nptrs,ptrs_fp) != (size_t) nptrs) {
5484 fprintf(stderr,"Error in Bitpack64_write_differential_huge: %s\n",strerror(errno));
5485 exit(9);
5486 }
5487 FREE(ptrs);
5488 fclose(ptrs_fp);
5489 }
5490
5491 /* Empty buffer */
5492 if (buffer_i > 0) {
5493 if (FWRITE_UINTS(buffer,buffer_i,comp_fp) != (size_t) buffer_i) {
5494 fprintf(stderr,"Error in Bitpack64_write_differential_huge: %s\n",strerror(errno));
5495 exit(9);
5496 }
5497 buffer_i = 0;
5498 }
5499 FREE(buffer);
5500 fclose(comp_fp);
5501
5502 return;
5503 }
5504
5505
5506 void
Bitpack64_write_differential_huge_bitpacks(char * pagesfile,char * ptrsfile,char * compfile,char * packsizes,UINT4 ** bitpacks,UINT4 * counts_direct,Oligospace_T n)5507 Bitpack64_write_differential_huge_bitpacks (char *pagesfile, char *ptrsfile, char *compfile,
5508 char *packsizes, UINT4 **bitpacks,
5509 #ifdef COUNTS_WITHOUT_COMPRESSION
5510 UINT4* counts_direct,
5511 #endif
5512 Oligospace_T n) {
5513 #ifdef CHECK
5514 UINT4 q;
5515 #endif
5516
5517 UINT8 currpage, nextpage;
5518 FILE *pages_fp, *ptrs_fp, *comp_fp;
5519 UINT4 pages[25]; /* Allows us to handle up to 100 billion positions. At q3, means 300 billion nt */
5520 UINT4 *ptrs, *p, nregisters;
5521 UINT8 totalcount;
5522 size_t nptrs;
5523 Oligospace_T positioni, bmer;
5524
5525 /* Buffer is used to avoid frequent writes to the file */
5526 UINT4 *buffer;
5527 int buffer_size = BUFFER_SIZE;
5528 int buffer_i;
5529
5530 UINT4 diffs[BLOCKSIZE], counts[BLOCKSIZE], last_block[BLOCKSIZE];
5531 UINT8 ascending[BLOCKSIZE+1];
5532
5533 int pagei = 0, i;
5534 int packsize;
5535
5536
5537 write_setup();
5538
5539 /* 2 metavalues: nwritten (pointer) and cumulative sum for block.
5540 Packsize can be computed from difference between successive
5541 pointers, if only even packsizes are allowed */
5542 p = ptrs = (UINT4 *) CALLOC(((n + BLOCKSIZE)/BLOCKSIZE + 1) * DIFFERENTIAL_METAINFO_SIZE,sizeof(UINT4));
5543
5544 if ((comp_fp = FOPEN_WRITE_BINARY(compfile)) == NULL) {
5545 fprintf(stderr,"Can't write to compfile %s: %s\n",compfile,strerror(errno));
5546 abort();
5547 }
5548 buffer = (UINT4 *) CALLOC(buffer_size,sizeof(UINT4));
5549 buffer_i = 0;
5550
5551 currpage = 0;
5552 nextpage = POSITIONS_PAGE;
5553 nregisters = 0U;
5554
5555 /* Last value of ascending is at ascending[n] */
5556 /* Use <= n instead of < n, because we want ascending[n] to be taken care of by unpack_00, not a check for remainder == 0 */
5557 totalcount = 0;
5558 for (positioni = 0, bmer = 0; positioni + BLOCKSIZE <= n; positioni += BLOCKSIZE, bmer++) {
5559 /* Pointer */
5560 *p++ = nregisters; /* In 128-bit registers */
5561
5562 /* Value for start of block */
5563 while (totalcount >= nextpage) {
5564 fprintf(stderr,"\nAt position %llu (block %llu), ascending %llu >= nextpage %llu",
5565 positioni,positioni/BLOCKSIZE,totalcount,nextpage);
5566 pages[pagei++] = positioni/BLOCKSIZE;
5567 currpage = nextpage;
5568 nextpage += POSITIONS_PAGE;
5569 }
5570 *p++ = totalcount - currpage;
5571
5572 /* Pack block of 64 diffs */
5573 Bitpack64_extract_bitpack(counts,packsizes[bmer],bitpacks[bmer]);
5574 totalcount += compute_ascending_huge(ascending,counts);
5575 #ifdef COUNTS_WITHOUT_COMPRESSION
5576 compare_offsets_huge(ascending,&(counts_direct[positioni]),positioni);
5577 #endif
5578 packsize = Bitpack64_compute_q4_diffs_bidir_huge(diffs,ascending); /* Note: This packsize may differ from packsizes[bmer], because of calculation of diffs */
5579 buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,packsize);
5580
5581 nregisters += packsize / 2;
5582 }
5583
5584 /* Old: Check for positioni < n, because if positioni == n, ascending[n] will be taken care of as metainfo */
5585 /* Use <= n instead of < n, because we want ascending[n] to be taken care of by unpack_00, not a check for remainder == 0 */
5586 /* For nucleotides, expect a single final block where positioni == n */
5587 if (positioni <= n) {
5588 /* Finish last block of 64 */
5589 *p++ = nregisters; /* In 128-bit registers */
5590
5591 /* Value for start of block */
5592 while (totalcount >= nextpage) {
5593 fprintf(stderr,"\nAt position %llu (block %llu), totalcount %llu >= nextpage %llu",
5594 positioni,positioni/BLOCKSIZE,totalcount,nextpage);
5595 pages[pagei++] = positioni/BLOCKSIZE;
5596 currpage = nextpage;
5597 nextpage += POSITIONS_PAGE;
5598 }
5599 *p++ = totalcount - currpage;
5600
5601 if (positioni == n) {
5602 /* Don't have a bitpack at [bmerspace] */
5603 Bitpack64_extract_bitpack(counts,/*packsize*/0,/*bitpack*/NULL);
5604 } else {
5605 Bitpack64_extract_bitpack(counts,packsizes[bmer],bitpacks[bmer]);
5606 }
5607
5608 /* For differential, want <=. For direct, want < */
5609 for (i = 0; i <= (int) (n - positioni); i++) {
5610 last_block[i] = counts[i];
5611 }
5612 for ( ; i <= BLOCKSIZE; i++) {
5613 /* Copy last value for rest of block */
5614 last_block[i] = 0;
5615 }
5616
5617 /* Pack block of < 64 diffs */
5618 totalcount += compute_ascending_huge(ascending,last_block);
5619 #ifdef COUNTS_WITHOUT_COMPRESSION
5620 /* May not match for a partial block */
5621 /* compare_offsets_huge(ascending,&(counts[positioni]),positioni); */
5622 #endif
5623 packsize = Bitpack64_compute_q4_diffs_bidir_huge(diffs,ascending); /* Note: This packsize may differ from packsizes[bmer], because of calculation of diffs */
5624 buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,packsize);
5625
5626 nregisters += packsize / 2;
5627 }
5628
5629
5630 /* Write the final pointer, which will point after the end of the file */
5631 *p++ = nregisters; /* In 128-bit registers */
5632
5633 /* Value for end of block */
5634 if (totalcount >= nextpage) {
5635 fprintf(stderr,"\nAt final oligo %llu (block %llu), totalcount %llu >= nextpage %llu",
5636 n,n/BLOCKSIZE,totalcount,nextpage);
5637 pages[pagei++] = n/BLOCKSIZE;
5638 currpage = nextpage;
5639 /* nextpage += POSITIONS_PAGE; */
5640 }
5641 *p++ = totalcount - currpage;
5642
5643
5644 /* Write pages */
5645 if (pagei > 0) {
5646 pages[pagei++] = (UINT4) -1; /* Final value */
5647 if ((pages_fp = FOPEN_WRITE_BINARY(pagesfile)) == NULL) {
5648 fprintf(stderr,"Can't write to pagesfile %s: %s\n",pagesfile,strerror(errno));
5649 abort();
5650 } else {
5651 fprintf(stderr,"\nHave %d pages:",pagei);
5652 for (i = 0; i < pagei; i++) {
5653 fprintf(stderr," %u",pages[i]);
5654 }
5655 fprintf(stderr,"\n");
5656 if (FWRITE_UINTS(pages,pagei,pages_fp) != (size_t) pagei) {
5657 fprintf(stderr,"Error in Bitpack64_write_differential_huge: %s\n",strerror(errno));
5658 exit(9);
5659 }
5660 /* FREE(pages); */
5661 fclose(pages_fp);
5662 }
5663 }
5664
5665 if ((ptrs_fp = FOPEN_WRITE_BINARY(ptrsfile)) == NULL) {
5666 fprintf(stderr,"Can't write to ptrsfile %s: %s\n",ptrsfile,strerror(errno));
5667 abort();
5668 } else {
5669 nptrs = p - ptrs;
5670 if (FWRITE_UINTS(ptrs,nptrs,ptrs_fp) != (size_t) nptrs) {
5671 fprintf(stderr,"Error in Bitpack64_write_differential_huge: %s\n",strerror(errno));
5672 exit(9);
5673 }
5674 FREE(ptrs);
5675 fclose(ptrs_fp);
5676 }
5677
5678 /* Empty buffer */
5679 if (buffer_i > 0) {
5680 if (FWRITE_UINTS(buffer,buffer_i,comp_fp) != (size_t) buffer_i) {
5681 fprintf(stderr,"Error in Bitpack64_write_differential_huge: %s\n",strerror(errno));
5682 exit(9);
5683 }
5684 buffer_i = 0;
5685 }
5686 FREE(buffer);
5687 fclose(comp_fp);
5688
5689 return;
5690 }
5691
5692
5693
#if 0
/* Disabled.  Fixed-packsize counterpart of Bitpack64_write_differential_huge:
   every block is written at FIXED10_PACKSIZE, so no per-block pointer into
   compfile is stored, only the block's starting value minus the current
   page base (RANK_METAINFO_SIZE == 1 UINT4 per block, plus a final value
   for ascending[n]).

   With USE_ONE_FILE_FOR_FIXED, the metavalues are written into compfile
   itself in groups of 4 UINT4s (the last group zero-padded); otherwise
   they go to ptrsfile.  pagesfile is written only if some value reaches
   POSITIONS_PAGE (2^32), and is terminated by (UINT4) -1.

   Aborts if a file cannot be opened or if pages[] would overflow; exits
   with status 9 on a write error. */
void
Bitpack64_write_fixed10_huge (char *pagesfile, char *ptrsfile, char *compfile,
                              UINT8 *ascending, Oligospace_T n) {
#ifndef USE_ONE_FILE_FOR_FIXED
  FILE *ptrs_fp;
#endif
  UINT8 currpage, nextpage;
  FILE *pages_fp, *comp_fp;
  UINT4 pages[25];      /* Allows us to handle up to 100 billion positions */
  /* Number of page boundaries we may record, keeping one slot for the final (UINT4) -1 */
  const int max_page_boundaries = (int) (sizeof(pages)/sizeof(pages[0])) - 1;
  UINT4 *ptrs;
  UINT4 ptri;
  Oligospace_T positioni;

  /* Buffer is used to avoid frequent writes to the file */
  UINT4 *buffer;
  int buffer_size = BUFFER_SIZE;
  int buffer_i;

  UINT4 diffs[BLOCKSIZE];
  UINT8 last_block[BLOCKSIZE+1];

  int pagei = 0, i;
  UINT4 nwritten;
  int packsize;


  write_setup();

  /* 1 metavalue: cumulative sum for block.  No pointer is needed,
     because every block has the same packsize */
#ifdef USE_ONE_FILE_FOR_FIXED
  ptrs = (UINT4 *) CALLOC(4,sizeof(UINT4));
  ptri = 0;
#else
  ptrs = (UINT4 *) CALLOC(((n + BLOCKSIZE)/BLOCKSIZE + 1) * RANK_METAINFO_SIZE,sizeof(UINT4));
  ptri = 0;
#endif

  if ((comp_fp = FOPEN_WRITE_BINARY(compfile)) == NULL) {
    fprintf(stderr,"Can't write to compfile %s: %s\n",compfile,strerror(errno));
    abort();
  }
  buffer = (UINT4 *) CALLOC(buffer_size,sizeof(UINT4));
  buffer_i = 0;

  currpage = 0;
  nextpage = POSITIONS_PAGE;
  nwritten = 0U;

  /* Last value of ascending is at ascending[n] */
  /* Use <= n instead of < n, because we want ascending[n] to be taken care of by unpack_00, not a check for remainder == 0 */
  for (positioni = 0; positioni + BLOCKSIZE <= n; positioni += BLOCKSIZE) {
#if 0
    /* Pointer */
    ptrs[ptri++] = nwritten/4;  /* In 128-bit registers */
#endif

    /* Value for start of block */
    while (ascending[positioni] >= nextpage) {
      fprintf(stderr,"\nAt position %llu (block %llu), ascending %llu >= nextpage %llu",
              (unsigned long long) positioni,(unsigned long long) (positioni/BLOCKSIZE),
              (unsigned long long) ascending[positioni],(unsigned long long) nextpage);
      if (pagei >= max_page_boundaries) {
        fprintf(stderr,"\nError in Bitpack64_write_fixed10_huge: more than %d page boundaries needed\n",
                max_page_boundaries);
        abort();
      }
      pages[pagei++] = (UINT4) (positioni/BLOCKSIZE);
      currpage = nextpage;
      nextpage += POSITIONS_PAGE;
    }
    ptrs[ptri++] = (UINT4) (ascending[positioni] - currpage);
#ifdef USE_ONE_FILE_FOR_FIXED
    if (ptri == 4) {
      if (FWRITE_UINTS(ptrs,4,comp_fp) != (size_t) 4) {
        fprintf(stderr,"Error in Bitpack64_write_fixed10_huge: %s\n",strerror(errno));
        exit(9);
      }
      ptri = 0;
    }
#endif

    /* Pack block of 64 diffs */
    packsize = Bitpack64_compute_q4_diffs_bidir_huge(diffs,&(ascending[positioni]));
    assert(packsize <= FIXED10_PACKSIZE);
    buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,FIXED10_PACKSIZE);

    /* Every block occupies FIXED10_PACKSIZE, regardless of the computed packsize */
    nwritten += 2 * FIXED10_PACKSIZE;
  }

  /* Old: Check for positioni < n, because if positioni == n, ascending[n] will be taken care of as metainfo */
  /* Use <= n instead of < n, because we want ascending[n] to be taken care of by unpack_00, not a check for remainder == 0 */
  if (positioni <= n) {
#if 0
    /* Finish last block of 64 */
    ptrs[ptri++] = nwritten/4;  /* In 128-bit registers */
#endif

    /* Value for start of block */
    while (ascending[positioni] >= nextpage) {
      fprintf(stderr,"\nAt position %llu (block %llu), ascending %llu >= nextpage %llu",
              (unsigned long long) positioni,(unsigned long long) (positioni/BLOCKSIZE),
              (unsigned long long) ascending[positioni],(unsigned long long) nextpage);
      if (pagei >= max_page_boundaries) {
        fprintf(stderr,"\nError in Bitpack64_write_fixed10_huge: more than %d page boundaries needed\n",
                max_page_boundaries);
        abort();
      }
      pages[pagei++] = (UINT4) (positioni/BLOCKSIZE);
      currpage = nextpage;
      nextpage += POSITIONS_PAGE;
    }
    ptrs[ptri++] = (UINT4) (ascending[positioni] - currpage);
#ifdef USE_ONE_FILE_FOR_FIXED
    if (ptri == 4) {
      if (FWRITE_UINTS(ptrs,4,comp_fp) != (size_t) 4) {
        fprintf(stderr,"Error in Bitpack64_write_fixed10_huge: %s\n",strerror(errno));
        exit(9);
      }
      ptri = 0;
    }
#endif

    /* For differential, want <=.  For direct, want < */
    for (i = 0; i <= (int) (n - positioni); i++) {
      last_block[i] = ascending[positioni+i] - currpage;
    }
    for ( ; i <= BLOCKSIZE; i++) {
      /* Copy last value for rest of block */
      last_block[i] = ascending[n] - currpage;
    }

    /* Pack block of < 64 diffs */
    packsize = Bitpack64_compute_q4_diffs_bidir_huge(diffs,last_block);
    assert(packsize <= FIXED10_PACKSIZE);
    buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,FIXED10_PACKSIZE);

    nwritten += 2 * FIXED10_PACKSIZE;
  }


#if 0
  /* Write the final pointer, which will point after the end of the file */
  ptrs[ptri++] = nwritten/4;    /* In 128-bit registers */
#endif

  /* Value for end of block */
  if (ascending[n] >= nextpage) {
    fprintf(stderr,"\nAt final oligo %llu (block %llu), ascending %llu >= nextpage %llu",
            (unsigned long long) n,(unsigned long long) (n/BLOCKSIZE),
            (unsigned long long) ascending[n],(unsigned long long) nextpage);
    if (pagei >= max_page_boundaries) {
      fprintf(stderr,"\nError in Bitpack64_write_fixed10_huge: more than %d page boundaries needed\n",
              max_page_boundaries);
      abort();
    }
    pages[pagei++] = (UINT4) (n/BLOCKSIZE);
    currpage = nextpage;
    /* nextpage += POSITIONS_PAGE; */
  }
  ptrs[ptri++] = (UINT4) (ascending[n] - currpage);
#ifdef USE_ONE_FILE_FOR_FIXED
  /* Zero-pad the last group of 4 metavalues */
  for (i = ptri; i < 4; i++) {
    ptrs[i] = 0U;
  }
  if (FWRITE_UINTS(ptrs,4,comp_fp) != (size_t) 4) {
    fprintf(stderr,"Error in Bitpack64_write_fixed10_huge: %s\n",strerror(errno));
    exit(9);
  }
#else
  if ((ptrs_fp = FOPEN_WRITE_BINARY(ptrsfile)) == NULL) {
    fprintf(stderr,"Can't write to ptrsfile %s: %s\n",ptrsfile,strerror(errno));
    abort();
  } else {
    if (FWRITE_UINTS(ptrs,ptri,ptrs_fp) != (size_t) ptri) {
      fprintf(stderr,"Error in Bitpack64_write_fixed10_huge: %s\n",strerror(errno));
      exit(9);
    }
    fclose(ptrs_fp);
  }
#endif
  FREE(ptrs);

  /* Write pages */
  if (pagei > 0) {
    /* pagei <= max_page_boundaries here, so the terminator always fits */
    pages[pagei++] = (UINT4) -1;        /* Final value */
    if ((pages_fp = FOPEN_WRITE_BINARY(pagesfile)) == NULL) {
      fprintf(stderr,"Can't write to pagesfile %s: %s\n",pagesfile,strerror(errno));
      abort();
    } else {
      fprintf(stderr,"\nHave %d pages:",pagei);
      for (i = 0; i < pagei; i++) {
        fprintf(stderr," %u",pages[i]);
      }
      fprintf(stderr,"\n");
      if (FWRITE_UINTS(pages,pagei,pages_fp) != (size_t) pagei) {
        fprintf(stderr,"Error in Bitpack64_write_fixed10_huge: %s\n",strerror(errno));
        exit(9);
      }
      /* FREE(pages); */
      fclose(pages_fp);
    }
  }

  /* Empty buffer */
  if (buffer_i > 0) {
    if (FWRITE_UINTS(buffer,buffer_i,comp_fp) != (size_t) buffer_i) {
      fprintf(stderr,"Error in Bitpack64_write_fixed10_huge: %s\n",strerror(errno));
      exit(9);
    }
    buffer_i = 0;
  }
  FREE(buffer);
  fclose(comp_fp);

  return;
}
#endif
5896
5897
5898
5899 static int
compute_packsize(UINT4 * values)5900 compute_packsize (UINT4 *values) {
5901 UINT4 packsize;
5902 UINT4 maxvalue = 0;
5903 int i;
5904 int firstbit;
5905 #ifdef HAVE_BUILTIN_CLZ
5906 #elif defined(HAVE_ASM_BSR)
5907 int msb;
5908 #endif
5909
5910 for (i = 0; i < 64; i++) {
5911 maxvalue |= values[i];
5912 }
5913
5914 if (maxvalue == 0) {
5915 /* __builtin_clz() behaves oddly on zero */
5916 return 0;
5917
5918 } else {
5919 #ifdef HAVE_BUILTIN_CLZ
5920 firstbit = __builtin_clz(maxvalue);
5921 packsize = 32 - firstbit;
5922 #elif defined(HAVE_ASM_BSR)
5923 asm("bsr %1,%0" : "=r"(msb) : "r"(maxvalue));
5924 packsize = msb + 1;
5925 #else
5926 firstbit = ((maxvalue >> 16) ? clz_table[maxvalue >> 16] : 16 + clz_table[maxvalue]);
5927 packsize = 32 - firstbit;
5928 #endif
5929
5930 #ifdef ALLOW_ODD_PACKSIZES
5931 return packsize;
5932 #else
5933 return (packsize + 1) & ~1; /* Converts packsizes to the next multiple of 2 */
5934 #endif
5935 }
5936 }
5937
5938
#if 0
/* Disabled.  Writes the n raw values direct[0..n-1] (direct[n] does not
   exist) in bitpack64 format, in vertical order.

   compfile receives each block of BLOCKSIZE values packed by write_vert
   at the packsize given by compute_packsize; a final partial block is
   zero-padded to BLOCKSIZE.  ptrsfile receives DIRECT_METAINFO_SIZE (1)
   UINT4 per block, the block's offset into compfile in 128-bit
   registers, followed by one final offset pointing past the end.

   Aborts if a file cannot be opened; exits with status 9 on a write
   error. */
void
Bitpack64_write_direct (char *ptrsfile, char *compfile, UINT4 *direct, Oligospace_T n) {
  FILE *ptrs_fp, *comp_fp;
  UINT4 *ptrs;
  size_t nptrs = 0;
  int k;
  Oligospace_T pos;

  UINT4 *outbuf;
  int outbuf_size = BUFFER_SIZE;
  int outbuf_n = 0;

  UINT4 tail[BLOCKSIZE];

  UINT4 nwritten = 0U;
  int packsize;


  write_setup();

  /* 1 metavalue: nwritten (pointer).  Packsize can be computed from
     difference between successive pointers, if only even packsizes
     are allowed */
  ptrs = (UINT4 *) CALLOC(((n + BLOCKSIZE - 1)/BLOCKSIZE + 1) * DIRECT_METAINFO_SIZE,sizeof(UINT4));

  comp_fp = FOPEN_WRITE_BINARY(compfile);
  if (comp_fp == NULL) {
    fprintf(stderr,"Can't write to compfile %s: %s\n",compfile,strerror(errno));
    abort();
  }
  outbuf = (UINT4 *) CALLOC(outbuf_size,sizeof(UINT4));

  /* Full blocks that are strictly followed by more data */
  pos = 0;
  while (pos + BLOCKSIZE < n) {
    /* Pointer, in 128-bit registers */
    ptrs[nptrs++] = nwritten/4;

    /* Pack block of 64 values */
    packsize = compute_packsize(&(direct[pos]));
    outbuf_n = write_vert(comp_fp,outbuf,outbuf_size,outbuf_n,&(direct[pos]),packsize);

#ifdef ALLOW_ODD_PACKSIZES
    nwritten += 2 * ((packsize + 1) & ~1);
#else
    nwritten += 2 * packsize;
#endif

    pos += BLOCKSIZE;
  }

  /* Last block of up to 64 values, padded with zeros */
  if (pos < n) {
    ptrs[nptrs++] = nwritten/4;         /* In 128-bit registers */

    for (k = 0; pos < n; k++, pos++) {
      tail[k] = direct[pos];
    }
    for ( ; k < BLOCKSIZE; k++) {
      tail[k] = 0;
    }

    packsize = compute_packsize(tail);
    outbuf_n = write_vert(comp_fp,outbuf,outbuf_size,outbuf_n,tail,packsize);

#ifdef ALLOW_ODD_PACKSIZES
    nwritten += 2 * ((packsize + 1) & ~1);
#else
    nwritten += 2 * packsize;
#endif
  }

  /* Final pointer, which points just past the end of the file */
  ptrs[nptrs++] = nwritten/4;           /* In 128-bit registers */

  ptrs_fp = FOPEN_WRITE_BINARY(ptrsfile);
  if (ptrs_fp == NULL) {
    fprintf(stderr,"Can't write to ptrsfile %s: %s\n",ptrsfile,strerror(errno));
    abort();
  }
  if (FWRITE_UINTS(ptrs,nptrs,ptrs_fp) != (size_t) nptrs) {
    fprintf(stderr,"Error in Bitpack64_write_direct: %s\n",strerror(errno));
    exit(9);
  }
  FREE(ptrs);
  fclose(ptrs_fp);

  /* Flush whatever is still sitting in the output buffer */
  if (outbuf_n > 0) {
    if (FWRITE_UINTS(outbuf,outbuf_n,comp_fp) != (size_t) outbuf_n) {
      fprintf(stderr,"Error in Bitpack64_write_direct: %s\n",strerror(errno));
      exit(9);
    }
    outbuf_n = 0;
  }
  FREE(outbuf);
  fclose(comp_fp);

  return;
}
#endif
6045
6046
6047