1 static char rcsid[] = "$Id: bitpack64-write.c 221731 2020-02-13 19:47:16Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 
6 #include "bitpack64-write.h"
7 
8 #ifdef WORDS_BIGENDIAN
9 #include "bigendian.h"		/* For FWRITE_UINTS */
10 #else
11 #include "littleendian.h"	/* For FWRITE_UINTS */
12 #endif
13 
14 #include <stdio.h>
15 #include <stdlib.h>
16 #include <string.h>		/* For memset */
17 #include <errno.h>
18 #include "mem.h"
19 #include "assert.h"
20 #include "fopen.h"
21 #include "popcount.h"
22 #include "bitpack64-access.h"	/* For Bitpack64_extract_bitpack */
23 
24 #ifdef HAVE_SSE2
25 #include <emmintrin.h>
26 #endif
27 
28 
29 /* #define ALLOW_ODD_PACKSIZES 1 */
30 
31 /* #define USE_ONE_FILE_FOR_FIXED 1 */
32 
33 #define DIFFERENTIAL_METAINFO_SIZE 2
34 #define PAIRED_METAINFO_SIZE 3
35 #define RANK_METAINFO_SIZE 1	/* A variant of differential, where packsize is always 6 (lg 64) */
36 #define DIRECT_METAINFO_SIZE 1
37 #define BLOCKSIZE 64
38 #define POSITIONS_PAGE 4294967296 /* 2^32 */
39 
40 #define BUFFER_SIZE 1000000
41 
42 
43 /* Note: For offset pointers, where we need fast cumulative sums, we
44    use vertical format (where successive values are in different
45    packed unsigned ints).  For lcp, we want raw values, and vertical
46    format is still slightly more efficient than horizontal format. */
47 
48 #ifdef HAVE_SSE2
49 static int
write_reg_buffered_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,__m128i OutReg)50 write_reg_buffered_vert (FILE *strm_fp, Positionsptr_T *strm_buffer,
51 			 int strm_buffer_size, int strm_buffer_i, __m128i OutReg) {
52 
53 #if 0
54   /* Type casting method (when we passed in pointer to OutReg).  Needs a memory fence. */
55   UINT4 *buffer = (UINT4 *) OutReg;
56   _mm_lfence();  /* Needed to avoid storing incorrect values into strm_buffer */
57 #else
58   /* Storing method.  Safer.  */
59   UINT4 buffer[4];
60   _mm_store_si128((__m128i *) buffer,OutReg);
61 #endif
62 
63   /* printf("Writing %08X %08X %08X %08X\n",buffer[0],buffer[1],buffer[2],buffer[3]); */
64 
65   strm_buffer[strm_buffer_i++] = buffer[0];
66   if (strm_buffer_i == strm_buffer_size) {
67     if (FWRITE_UINTS(strm_buffer,strm_buffer_size,strm_fp) != (size_t) strm_buffer_size) {
68       fprintf(stderr,"Error in write_reg_buffered_vert: %s\n",strerror(errno));
69       exit(9);
70     }
71     strm_buffer_i = 0;
72   }
73 
74   strm_buffer[strm_buffer_i++] = buffer[1];
75   if (strm_buffer_i == strm_buffer_size) {
76     if (FWRITE_UINTS(strm_buffer,strm_buffer_size,strm_fp) != (size_t) strm_buffer_size) {
77       fprintf(stderr,"Error in write_reg_buffered_vert: %s\n",strerror(errno));
78       exit(9);
79     }
80     strm_buffer_i = 0;
81   }
82 
83   strm_buffer[strm_buffer_i++] = buffer[2];
84   if (strm_buffer_i == strm_buffer_size) {
85     if (FWRITE_UINTS(strm_buffer,strm_buffer_size,strm_fp) != (size_t) strm_buffer_size) {
86       fprintf(stderr,"Error in write_reg_buffered_vert: %s\n",strerror(errno));
87       exit(9);
88     }
89     strm_buffer_i = 0;
90   }
91 
92   strm_buffer[strm_buffer_i++] = buffer[3];
93   if (strm_buffer_i == strm_buffer_size) {
94     if (FWRITE_UINTS(strm_buffer,strm_buffer_size,strm_fp) != (size_t) strm_buffer_size) {
95       fprintf(stderr,"Error in write_reg_buffered_vert: %s\n",strerror(errno));
96       exit(9);
97     }
98     strm_buffer_i = 0;
99   }
100 
101   return strm_buffer_i;
102 }
103 #else
104 static int
write_reg_buffered_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,UINT4 * horizontal,int nwritten)105 write_reg_buffered_vert (FILE *strm_fp, Positionsptr_T *strm_buffer,
106 			 int strm_buffer_size, int strm_buffer_i,
107 			 UINT4 *horizontal, int nwritten) {
108   UINT4 vertical[64];
109   int nrows = nwritten/4, row, column, k;
110 
111   /* Convert to vertical */
112   for (column = 0; column < 4; column++) {
113     k = column;
114     for (row = 0; row < nrows; row++) {
115       vertical[k] = *horizontal++;
116       k += 4;
117     }
118   }
119 
120   /* Send to output buffer */
121   for (k = 0; k < nwritten; k++) {
122     /* printf("Writing %08X\n",vertical[k]); */
123     strm_buffer[strm_buffer_i++] = vertical[k];
124     if (strm_buffer_i == strm_buffer_size) {
125       if (FWRITE_UINTS(strm_buffer,strm_buffer_size,strm_fp) != (size_t) strm_buffer_size) {
126 	fprintf(stderr,"Error in write_reg_buffered_vert: %s\n",strerror(errno));
127 	exit(9);
128       }
129       strm_buffer_i = 0;
130     }
131   }
132 
133   return strm_buffer_i;
134 }
135 #endif
136 
137 
138 
139 static int
write_reg_buffered_horiz(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,UINT4 * values,int nwritten)140 write_reg_buffered_horiz (FILE *strm_fp, Positionsptr_T *strm_buffer,
141 			  int strm_buffer_size, int strm_buffer_i,
142 			  UINT4 *values, int nwritten) {
143   int k;
144 
145   /* Send to output buffer */
146   for (k = 0; k < nwritten; k++) {
147     /* printf("Writing %08X\n",values[k]); */
148     strm_buffer[strm_buffer_i++] = values[k];
149     if (strm_buffer_i == strm_buffer_size) {
150       if (FWRITE_UINTS(strm_buffer,strm_buffer_size,strm_fp) != (size_t) strm_buffer_size) {
151 	fprintf(stderr,"Error in write_reg_buffered_horiz: %s\n",strerror(errno));
152 	exit(9);
153       }
154       strm_buffer_i = 0;
155     }
156   }
157 
158   return strm_buffer_i;
159 }
160 
161 
162 
163 
164 #ifdef HAVE_SSE2
/* maskN holds a value with the low N bits set, replicated in every
   32-bit lane.  Assigned by write_setup. */
static __m128i
  mask1,  mask2,  mask3,  mask4,
  mask5,  mask6,  mask7,  mask8,
  mask9,  mask10, mask11, mask12,
  mask13, mask14, mask15, mask16,
  mask17, mask18, mask19, mask20,
  mask21, mask22, mask23, mask24,
  mask25, mask26, mask27, mask28,
  mask29, mask30, mask31;
169 #endif
170 
171 
/* Initializes the file-level SSE2 masks: maskN receives (2^N - 1),
   i.e. the low N bits set, in each of its four 32-bit lanes.  Does
   nothing when HAVE_SSE2 is not defined. */
static void
write_setup () {

#ifdef HAVE_SSE2
  mask1  = _mm_set1_epi32((1U << 1) - 1U);
  mask2  = _mm_set1_epi32((1U << 2) - 1U);
  mask3  = _mm_set1_epi32((1U << 3) - 1U);
  mask4  = _mm_set1_epi32((1U << 4) - 1U);
  mask5  = _mm_set1_epi32((1U << 5) - 1U);
  mask6  = _mm_set1_epi32((1U << 6) - 1U);
  mask7  = _mm_set1_epi32((1U << 7) - 1U);
  mask8  = _mm_set1_epi32((1U << 8) - 1U);
  mask9  = _mm_set1_epi32((1U << 9) - 1U);
  mask10 = _mm_set1_epi32((1U << 10) - 1U);
  mask11 = _mm_set1_epi32((1U << 11) - 1U);
  mask12 = _mm_set1_epi32((1U << 12) - 1U);
  mask13 = _mm_set1_epi32((1U << 13) - 1U);
  mask14 = _mm_set1_epi32((1U << 14) - 1U);
  mask15 = _mm_set1_epi32((1U << 15) - 1U);
  mask16 = _mm_set1_epi32((1U << 16) - 1U);
  mask17 = _mm_set1_epi32((1U << 17) - 1U);
  mask18 = _mm_set1_epi32((1U << 18) - 1U);
  mask19 = _mm_set1_epi32((1U << 19) - 1U);
  mask20 = _mm_set1_epi32((1U << 20) - 1U);
  mask21 = _mm_set1_epi32((1U << 21) - 1U);
  mask22 = _mm_set1_epi32((1U << 22) - 1U);
  mask23 = _mm_set1_epi32((1U << 23) - 1U);
  mask24 = _mm_set1_epi32((1U << 24) - 1U);
  mask25 = _mm_set1_epi32((1U << 25) - 1U);
  mask26 = _mm_set1_epi32((1U << 26) - 1U);
  mask27 = _mm_set1_epi32((1U << 27) - 1U);
  mask28 = _mm_set1_epi32((1U << 28) - 1U);
  mask29 = _mm_set1_epi32((1U << 29) - 1U);
  mask30 = _mm_set1_epi32((1U << 30) - 1U);
  mask31 = _mm_set1_epi32((1U << 31) - 1U);
#endif

  return;
}
211 
212 #ifdef ALLOW_ODD_PACKSIZES
213 /* nwritten = 1 * 4 = 4 unsigned ints */
214 static int
write_01_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)215 write_01_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
216     const __m128i *in = (const __m128i *) _in;
217     __m128i OutReg;
218 
219     __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask1);
220     OutReg = InReg;
221     InReg = _mm_and_si128(_mm_load_si128(++in), mask1);
222 
223     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
224     InReg = _mm_and_si128(_mm_load_si128(++in), mask1);
225 
226     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
227     InReg = _mm_and_si128(_mm_load_si128(++in), mask1);
228 
229     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
230     InReg = _mm_and_si128(_mm_load_si128(++in), mask1);
231 
232     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
233     InReg = _mm_and_si128(_mm_load_si128(++in), mask1);
234 
235     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
236     InReg = _mm_and_si128(_mm_load_si128(++in), mask1);
237 
238     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
239     InReg = _mm_and_si128(_mm_load_si128(++in), mask1);
240 
241     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
242     InReg = _mm_and_si128(_mm_load_si128(++in), mask1);
243 
244     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
245     InReg = _mm_and_si128(_mm_load_si128(++in), mask1);
246 
247     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
248     InReg = _mm_and_si128(_mm_load_si128(++in), mask1);
249 
250     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
251     InReg = _mm_and_si128(_mm_load_si128(++in), mask1);
252 
253     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
254     InReg = _mm_and_si128(_mm_load_si128(++in), mask1);
255 
256     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
257     InReg = _mm_and_si128(_mm_load_si128(++in), mask1);
258 
259     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
260     InReg = _mm_and_si128(_mm_load_si128(++in), mask1);
261 
262     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
263     InReg = _mm_and_si128(_mm_load_si128(++in), mask1);
264 
265     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
266     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
267 					       OutReg);
268 
269     return strm_buffer_i;
270 }
271 #endif
272 
273 
274 #ifdef HAVE_SSE2
275 /* nwritten = 1 * 4 = 4 unsigned ints */
276 static int
write_02_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)277 write_02_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
278     const __m128i *in = (const __m128i *) _in;
279     __m128i OutReg;
280 
281     __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask2);
282     OutReg = InReg;
283     InReg = _mm_and_si128(_mm_load_si128(++in), mask2);
284 
285     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
286     InReg = _mm_and_si128(_mm_load_si128(++in), mask2);
287 
288     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
289     InReg = _mm_and_si128(_mm_load_si128(++in), mask2);
290 
291     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
292     InReg = _mm_and_si128(_mm_load_si128(++in), mask2);
293 
294     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
295     InReg = _mm_and_si128(_mm_load_si128(++in), mask2);
296 
297     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
298     InReg = _mm_and_si128(_mm_load_si128(++in), mask2);
299 
300     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
301     InReg = _mm_and_si128(_mm_load_si128(++in), mask2);
302 
303     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
304     InReg = _mm_and_si128(_mm_load_si128(++in), mask2);
305 
306     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
307     InReg = _mm_and_si128(_mm_load_si128(++in), mask2);
308 
309     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
310     InReg = _mm_and_si128(_mm_load_si128(++in), mask2);
311 
312     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
313     InReg = _mm_and_si128(_mm_load_si128(++in), mask2);
314 
315     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
316     InReg = _mm_and_si128(_mm_load_si128(++in), mask2);
317 
318     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
319     InReg = _mm_and_si128(_mm_load_si128(++in), mask2);
320 
321     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
322     InReg = _mm_and_si128(_mm_load_si128(++in), mask2);
323 
324     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
325     InReg = _mm_and_si128(_mm_load_si128(++in), mask2);
326 
327     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
328     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
329 					       OutReg);
330 
331     return strm_buffer_i;
332 }
333 #endif
334 
335 static int
pack_02_horiz(UINT4 * out,const UINT4 * in)336 pack_02_horiz (UINT4 *out, const UINT4 *in) {
337   int column;
338 
339   for (column = 0; column < 4; column++) {
340     *out |= (*in)   % (1U << 2 ) ;
341     ++in;
342     *out |= ( (*in)   % (1U << 2 )  ) <<  2 ;
343     ++in;
344     *out |= ( (*in)   % (1U << 2 )  ) <<  4 ;
345     ++in;
346     *out |= ( (*in)   % (1U << 2 )  ) <<  6 ;
347     ++in;
348     *out |= ( (*in)   % (1U << 2 )  ) <<  8 ;
349     ++in;
350     *out |= ( (*in)   % (1U << 2 )  ) <<  10 ;
351     ++in;
352     *out |= ( (*in)   % (1U << 2 )  ) <<  12 ;
353     ++in;
354     *out |= ( (*in)   % (1U << 2 )  ) <<  14 ;
355     ++in;
356     *out |= ( (*in)   % (1U << 2 )  ) <<  16 ;
357     ++in;
358     *out |= ( (*in)   % (1U << 2 )  ) <<  18 ;
359     ++in;
360     *out |= ( (*in)   % (1U << 2 )  ) <<  20 ;
361     ++in;
362     *out |= ( (*in)   % (1U << 2 )  ) <<  22 ;
363     ++in;
364     *out |= ( (*in)   % (1U << 2 )  ) <<  24 ;
365     ++in;
366     *out |= ( (*in)   % (1U << 2 )  ) <<  26 ;
367     ++in;
368     *out |= ( (*in)   % (1U << 2 )  ) <<  28 ;
369     ++in;
370     *out |= ( (*in)   % (1U << 2 )  ) <<  30 ;
371     ++out;
372     ++in;
373   }
374 
375   return 4;
376 }
377 
378 
379 
380 #ifdef ALLOW_ODD_PACKSIZES
381 /* nwritten = 2 * 4 = 8 unsigned ints */
382 static int
write_03_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)383 write_03_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
384     const __m128i *in = (const __m128i *) _in;
385     __m128i OutReg;
386 
387     __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask3);
388     OutReg = InReg;
389     InReg = _mm_and_si128(_mm_load_si128(++in), mask3);
390 
391     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
392     InReg = _mm_and_si128(_mm_load_si128(++in), mask3);
393 
394     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
395     InReg = _mm_and_si128(_mm_load_si128(++in), mask3);
396 
397     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
398     InReg = _mm_and_si128(_mm_load_si128(++in), mask3);
399 
400     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
401     InReg = _mm_and_si128(_mm_load_si128(++in), mask3);
402 
403     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
404     InReg = _mm_and_si128(_mm_load_si128(++in), mask3);
405 
406     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
407     InReg = _mm_and_si128(_mm_load_si128(++in), mask3);
408 
409     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
410     InReg = _mm_and_si128(_mm_load_si128(++in), mask3);
411 
412     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
413     InReg = _mm_and_si128(_mm_load_si128(++in), mask3);
414 
415     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
416     InReg = _mm_and_si128(_mm_load_si128(++in), mask3);
417 
418     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
419     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
420 					       OutReg);
421 
422     OutReg = _mm_srli_epi32(InReg, 3 - 1);
423     InReg = _mm_and_si128(_mm_load_si128(++in), mask3);
424 
425     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
426     InReg = _mm_and_si128(_mm_load_si128(++in), mask3);
427 
428     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
429     InReg = _mm_and_si128(_mm_load_si128(++in), mask3);
430 
431     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
432     InReg = _mm_and_si128(_mm_load_si128(++in), mask3);
433 
434     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
435     InReg = _mm_and_si128(_mm_load_si128(++in), mask3);
436 
437     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
438     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
439 					       OutReg);
440 
441     return strm_buffer_i;
442 }
443 #endif
444 
445 
446 
447 #ifdef ALLOW_ODD_PACKSIZES
448 /* nwritten = 3 * 4 = 12 unsigned ints */
449 static int
write_05_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)450 write_05_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
451     const __m128i *in = (const __m128i *) _in;
452     __m128i OutReg;
453 
454     __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask5);
455     OutReg = InReg;
456     InReg = _mm_and_si128(_mm_load_si128(++in), mask5);
457 
458     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
459     InReg = _mm_and_si128(_mm_load_si128(++in), mask5);
460 
461     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
462     InReg = _mm_and_si128(_mm_load_si128(++in), mask5);
463 
464     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
465     InReg = _mm_and_si128(_mm_load_si128(++in), mask5);
466 
467     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
468     InReg = _mm_and_si128(_mm_load_si128(++in), mask5);
469 
470     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
471     InReg = _mm_and_si128(_mm_load_si128(++in), mask5);
472 
473     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
474     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
475 					       OutReg);
476 
477     OutReg = _mm_srli_epi32(InReg, 5 - 3);
478     InReg = _mm_and_si128(_mm_load_si128(++in), mask5);
479 
480     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
481     InReg = _mm_and_si128(_mm_load_si128(++in), mask5);
482 
483     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
484     InReg = _mm_and_si128(_mm_load_si128(++in), mask5);
485 
486     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
487     InReg = _mm_and_si128(_mm_load_si128(++in), mask5);
488 
489     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
490     InReg = _mm_and_si128(_mm_load_si128(++in), mask5);
491 
492     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
493     InReg = _mm_and_si128(_mm_load_si128(++in), mask5);
494 
495     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
496     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
497 					       OutReg);
498 
499     OutReg = _mm_srli_epi32(InReg, 5 - 1);
500     InReg = _mm_and_si128(_mm_load_si128(++in), mask5);
501 
502     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
503     InReg = _mm_and_si128(_mm_load_si128(++in), mask5);
504 
505     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
506     InReg = _mm_and_si128(_mm_load_si128(++in), mask5);
507 
508     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
509     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
510 					       OutReg);
511 
512     return strm_buffer_i;
513 }
514 #endif
515 
516 
517 #ifdef HAVE_SSE2
518 /* nwritten = 3 * 4 = 12 unsigned ints */
519 static int
write_06_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)520 write_06_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
521     const __m128i *in = (const __m128i *) _in;
522     __m128i OutReg;
523 
524     __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask6);
525     OutReg = InReg;
526     InReg = _mm_and_si128(_mm_load_si128(++in), mask6);
527 
528     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
529     InReg = _mm_and_si128(_mm_load_si128(++in), mask6);
530 
531     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
532     InReg = _mm_and_si128(_mm_load_si128(++in), mask6);
533 
534     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
535     InReg = _mm_and_si128(_mm_load_si128(++in), mask6);
536 
537     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
538     InReg = _mm_and_si128(_mm_load_si128(++in), mask6);
539 
540     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
541     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
542 					       OutReg);
543 
544     OutReg = _mm_srli_epi32(InReg, 6 - 4);
545     InReg = _mm_and_si128(_mm_load_si128(++in), mask6);
546 
547     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
548     InReg = _mm_and_si128(_mm_load_si128(++in), mask6);
549 
550     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
551     InReg = _mm_and_si128(_mm_load_si128(++in), mask6);
552 
553     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
554     InReg = _mm_and_si128(_mm_load_si128(++in), mask6);
555 
556     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
557     InReg = _mm_and_si128(_mm_load_si128(++in), mask6);
558 
559     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
560     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
561 					       OutReg);
562 
563     OutReg = _mm_srli_epi32(InReg, 6 - 2);
564     InReg = _mm_and_si128(_mm_load_si128(++in), mask6);
565 
566     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
567     InReg = _mm_and_si128(_mm_load_si128(++in), mask6);
568 
569     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
570     InReg = _mm_and_si128(_mm_load_si128(++in), mask6);
571 
572     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
573     InReg = _mm_and_si128(_mm_load_si128(++in), mask6);
574 
575     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
576     InReg = _mm_and_si128(_mm_load_si128(++in), mask6);
577 
578     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
579     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
580 					       OutReg);
581 
582     return strm_buffer_i;
583 }
584 #endif
585 
586 static int
pack_06_horiz(UINT4 * out,const UINT4 * in)587 pack_06_horiz (UINT4 *out, const UINT4 *in) {
588   int column;
589 
590   for (column = 0; column < 4; column++) {
591     *out |= (*in)   % (1U << 6 ) ;
592     ++in;
593     *out |= ( (*in)   % (1U << 6 )  ) <<  6 ;
594     ++in;
595     *out |= ( (*in)   % (1U << 6 )  ) <<  12 ;
596     ++in;
597     *out |= ( (*in)   % (1U << 6 )  ) <<  18 ;
598     ++in;
599     *out |= ( (*in)   % (1U << 6 )  ) <<  24 ;
600     ++in;
601     *out |= ( (*in)   % (1U << 6 )  ) <<  30 ;
602     ++out;
603     *out |=  ( (*in)   % (1U << 6 ) ) >> ( 6  -  4 );
604     ++in;
605     *out |= ( (*in)   % (1U << 6 )  ) <<  4 ;
606     ++in;
607     *out |= ( (*in)   % (1U << 6 )  ) <<  10 ;
608     ++in;
609     *out |= ( (*in)   % (1U << 6 )  ) <<  16 ;
610     ++in;
611     *out |= ( (*in)   % (1U << 6 )  ) <<  22 ;
612     ++in;
613     *out |= ( (*in)   % (1U << 6 )  ) <<  28 ;
614     ++out;
615     *out |=  ( (*in)   % (1U << 6 ) ) >> ( 6  -  2 );
616     ++in;
617     *out |= ( (*in)   % (1U << 6 )  ) <<  2 ;
618     ++in;
619     *out |= ( (*in)   % (1U << 6 )  ) <<  8 ;
620     ++in;
621     *out |= ( (*in)   % (1U << 6 )  ) <<  14 ;
622     ++in;
623     *out |= ( (*in)   % (1U << 6 )  ) <<  20 ;
624     ++in;
625     *out |= ( (*in)   % (1U << 6 )  ) <<  26 ;
626     ++out;
627     ++in;
628   }
629 
630   return 12;
631 }
632 
633 
634 
635 #ifdef ALLOW_ODD_PACKSIZES
636 /* nwritten = 4 * 4 = 16 unsigned ints */
637 static int
write_07_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)638 write_07_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
639     const __m128i *in = (const __m128i *) _in;
640     __m128i OutReg;
641 
642     __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask7);
643     OutReg = InReg;
644     InReg = _mm_and_si128(_mm_load_si128(++in), mask7);
645 
646     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
647     InReg = _mm_and_si128(_mm_load_si128(++in), mask7);
648 
649     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
650     InReg = _mm_and_si128(_mm_load_si128(++in), mask7);
651 
652     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
653     InReg = _mm_and_si128(_mm_load_si128(++in), mask7);
654 
655     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
656     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
657 					       OutReg);
658 
659     OutReg = _mm_srli_epi32(InReg, 7 - 3);
660     InReg = _mm_and_si128(_mm_load_si128(++in), mask7);
661 
662     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
663     InReg = _mm_and_si128(_mm_load_si128(++in), mask7);
664 
665     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
666     InReg = _mm_and_si128(_mm_load_si128(++in), mask7);
667 
668     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
669     InReg = _mm_and_si128(_mm_load_si128(++in), mask7);
670 
671     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
672     InReg = _mm_and_si128(_mm_load_si128(++in), mask7);
673 
674     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
675     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
676 					       OutReg);
677 
678     OutReg = _mm_srli_epi32(InReg, 7 - 6);
679     InReg = _mm_and_si128(_mm_load_si128(++in), mask7);
680 
681     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
682     InReg = _mm_and_si128(_mm_load_si128(++in), mask7);
683 
684     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
685     InReg = _mm_and_si128(_mm_load_si128(++in), mask7);
686 
687     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
688     InReg = _mm_and_si128(_mm_load_si128(++in), mask7);
689 
690     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
691     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
692 					       OutReg);
693 
694     OutReg = _mm_srli_epi32(InReg, 7 - 2);
695     InReg = _mm_and_si128(_mm_load_si128(++in), mask7);
696 
697     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
698     InReg = _mm_and_si128(_mm_load_si128(++in), mask7);
699 
700     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
701     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
702 					       OutReg);
703 
704     return strm_buffer_i;
705 }
706 #endif
707 
708 
709 #ifdef ALLOW_ODD_PACKSIZES
710 /* nwritten = 5 * 4 = 20 unsigned ints */
711 static int
write_09_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)712 write_09_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
713     const __m128i *in = (const __m128i *) _in;
714     __m128i OutReg;
715 
716     __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask9);
717     OutReg = InReg;
718     InReg = _mm_and_si128(_mm_load_si128(++in), mask9);
719 
720     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
721     InReg = _mm_and_si128(_mm_load_si128(++in), mask9);
722 
723     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
724     InReg = _mm_and_si128(_mm_load_si128(++in), mask9);
725 
726     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
727     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
728 					       OutReg);
729 
730     OutReg = _mm_srli_epi32(InReg, 9 - 4);
731     InReg = _mm_and_si128(_mm_load_si128(++in), mask9);
732 
733     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
734     InReg = _mm_and_si128(_mm_load_si128(++in), mask9);
735 
736     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
737     InReg = _mm_and_si128(_mm_load_si128(++in), mask9);
738 
739     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
740     InReg = _mm_and_si128(_mm_load_si128(++in), mask9);
741 
742     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
743     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
744 					       OutReg);
745 
746     OutReg = _mm_srli_epi32(InReg, 9 - 8);
747     InReg = _mm_and_si128(_mm_load_si128(++in), mask9);
748 
749     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
750     InReg = _mm_and_si128(_mm_load_si128(++in), mask9);
751 
752     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
753     InReg = _mm_and_si128(_mm_load_si128(++in), mask9);
754 
755     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
756     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
757 					       OutReg);
758 
759     OutReg = _mm_srli_epi32(InReg, 9 - 3);
760     InReg = _mm_and_si128(_mm_load_si128(++in), mask9);
761 
762     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
763     InReg = _mm_and_si128(_mm_load_si128(++in), mask9);
764 
765     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
766     InReg = _mm_and_si128(_mm_load_si128(++in), mask9);
767 
768     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
769     InReg = _mm_and_si128(_mm_load_si128(++in), mask9);
770 
771     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
772     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
773 					       OutReg);
774 
775     OutReg = _mm_srli_epi32(InReg, 9 - 7);
776     InReg = _mm_and_si128(_mm_load_si128(++in), mask9);
777 
778     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
779     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
780 					       OutReg);
781 
782     return strm_buffer_i;
783 }
784 #endif
785 
786 
787 #ifdef HAVE_SSE2
788 /* nwritten = 5 * 4 = 20 unsigned ints */
789 static int
write_10_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)790 write_10_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
791     const __m128i *in = (const __m128i *) _in;
792     __m128i OutReg;
793 
794     __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask10);
795     OutReg = InReg;
796     InReg = _mm_and_si128(_mm_load_si128(++in), mask10);
797 
798     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
799     InReg = _mm_and_si128(_mm_load_si128(++in), mask10);
800 
801     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
802     InReg = _mm_and_si128(_mm_load_si128(++in), mask10);
803 
804     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
805     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
806 					       OutReg);
807 
808     OutReg = _mm_srli_epi32(InReg, 10 - 8);
809     InReg = _mm_and_si128(_mm_load_si128(++in), mask10);
810 
811     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
812     InReg = _mm_and_si128(_mm_load_si128(++in), mask10);
813 
814     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
815     InReg = _mm_and_si128(_mm_load_si128(++in), mask10);
816 
817     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
818     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
819 					       OutReg);
820 
821     OutReg = _mm_srli_epi32(InReg, 10 - 6);
822     InReg = _mm_and_si128(_mm_load_si128(++in), mask10);
823 
824     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
825     InReg = _mm_and_si128(_mm_load_si128(++in), mask10);
826 
827     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
828     InReg = _mm_and_si128(_mm_load_si128(++in), mask10);
829 
830     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
831     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
832 					       OutReg);
833 
834     OutReg = _mm_srli_epi32(InReg, 10 - 4);
835     InReg = _mm_and_si128(_mm_load_si128(++in), mask10);
836 
837     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
838     InReg = _mm_and_si128(_mm_load_si128(++in), mask10);
839 
840     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
841     InReg = _mm_and_si128(_mm_load_si128(++in), mask10);
842 
843     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
844     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
845 					       OutReg);
846 
847     OutReg = _mm_srli_epi32(InReg, 10 - 2);
848     InReg = _mm_and_si128(_mm_load_si128(++in), mask10);
849 
850     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
851     InReg = _mm_and_si128(_mm_load_si128(++in), mask10);
852 
853     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
854     InReg = _mm_and_si128(_mm_load_si128(++in), mask10);
855 
856     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
857     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
858 					       OutReg);
859 
860     return strm_buffer_i;
861 }
862 #endif
863 
864 static int
pack_10_horiz(UINT4 * out,const UINT4 * in)865 pack_10_horiz (UINT4 *out, const UINT4 *in) {
866   int column;
867 
868   for (column = 0; column < 4; column++) {
869     *out |= (*in)   % (1U << 10 ) ;
870     ++in;
871     *out |= ( (*in)   % (1U << 10 )  ) <<  10 ;
872     ++in;
873     *out |= ( (*in)   % (1U << 10 )  ) <<  20 ;
874     ++in;
875     *out |= ( (*in)   % (1U << 10 )  ) <<  30 ;
876     ++out;
877     *out |=  ( (*in)   % (1U << 10 ) ) >> ( 10  -  8 );
878     ++in;
879     *out |= ( (*in)   % (1U << 10 )  ) <<  8 ;
880     ++in;
881     *out |= ( (*in)   % (1U << 10 )  ) <<  18 ;
882     ++in;
883     *out |= ( (*in)   % (1U << 10 )  ) <<  28 ;
884     ++out;
885     *out |=  ( (*in)   % (1U << 10 ) ) >> ( 10  -  6 );
886     ++in;
887     *out |= ( (*in)   % (1U << 10 )  ) <<  6 ;
888     ++in;
889     *out |= ( (*in)   % (1U << 10 )  ) <<  16 ;
890     ++in;
891     *out |= ( (*in)   % (1U << 10 )  ) <<  26 ;
892     ++out;
893     *out |=  ( (*in)   % (1U << 10 ) ) >> ( 10  -  4 );
894     ++in;
895     *out |= ( (*in)   % (1U << 10 )  ) <<  4 ;
896     ++in;
897     *out |= ( (*in)   % (1U << 10 )  ) <<  14 ;
898     ++in;
899     *out |= ( (*in)   % (1U << 10 )  ) <<  24 ;
900     ++out;
901     *out |=  ( (*in)   % (1U << 10 ) ) >> ( 10  -  2 );
902     ++in;
903     *out |= ( (*in)   % (1U << 10 )  ) <<  2 ;
904     ++in;
905     *out |= ( (*in)   % (1U << 10 )  ) <<  12 ;
906     ++in;
907     *out |= ( (*in)   % (1U << 10 )  ) <<  22 ;
908     ++out;
909     ++in;
910   }
911 
912   return 20;
913 }
914 
915 
916 
917 
918 #ifdef ALLOW_ODD_PACKSIZES
919 /* nwritten = 6 * 4 = 24 unsigned ints */
920 static int
write_11_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)921 write_11_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
922     const __m128i *in = (const __m128i *) _in;
923     __m128i OutReg;
924 
925     __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask11);
926     OutReg = InReg;
927     InReg = _mm_and_si128(_mm_load_si128(++in), mask11);
928 
929     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
930     InReg = _mm_and_si128(_mm_load_si128(++in), mask11);
931 
932     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
933     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
934 					       OutReg);
935 
936     OutReg = _mm_srli_epi32(InReg, 11 - 1);
937     InReg = _mm_and_si128(_mm_load_si128(++in), mask11);
938 
939     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
940     InReg = _mm_and_si128(_mm_load_si128(++in), mask11);
941 
942     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
943     InReg = _mm_and_si128(_mm_load_si128(++in), mask11);
944 
945     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
946     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
947 					       OutReg);
948 
949     OutReg = _mm_srli_epi32(InReg, 11 - 2);
950     InReg = _mm_and_si128(_mm_load_si128(++in), mask11);
951 
952     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
953     InReg = _mm_and_si128(_mm_load_si128(++in), mask11);
954 
955     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
956     InReg = _mm_and_si128(_mm_load_si128(++in), mask11);
957 
958     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
959     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
960 					       OutReg);
961 
962     OutReg = _mm_srli_epi32(InReg, 11 - 3);
963     InReg = _mm_and_si128(_mm_load_si128(++in), mask11);
964 
965     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
966     InReg = _mm_and_si128(_mm_load_si128(++in), mask11);
967 
968     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
969     InReg = _mm_and_si128(_mm_load_si128(++in), mask11);
970 
971     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
972     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
973 					       OutReg);
974 
975     OutReg = _mm_srli_epi32(InReg, 11 - 4);
976     InReg = _mm_and_si128(_mm_load_si128(++in), mask11);
977 
978     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
979     InReg = _mm_and_si128(_mm_load_si128(++in), mask11);
980 
981     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
982     InReg = _mm_and_si128(_mm_load_si128(++in), mask11);
983 
984     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
985     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
986 					       OutReg);
987 
988     OutReg = _mm_srli_epi32(InReg, 11 - 5);
989     InReg = _mm_and_si128(_mm_load_si128(++in), mask11);
990 
991     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
992     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
993 					       OutReg);
994 
995     return strm_buffer_i;
996 }
997 #endif
998 
999 
1000 #ifdef HAVE_SSE2
1001 /* nwritten = 6 * 4 = 24 unsigned ints */
1002 static int
write_12_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)1003 write_12_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
1004     const __m128i *in = (const __m128i *) _in;
1005     __m128i OutReg;
1006 
1007     __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask12);
1008     OutReg = InReg;
1009     InReg = _mm_and_si128(_mm_load_si128(++in), mask12);
1010 
1011     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
1012     InReg = _mm_and_si128(_mm_load_si128(++in), mask12);
1013 
1014     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
1015     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1016 					       OutReg);
1017 
1018     OutReg = _mm_srli_epi32(InReg, 12 - 4);
1019     InReg = _mm_and_si128(_mm_load_si128(++in), mask12);
1020 
1021     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
1022     InReg = _mm_and_si128(_mm_load_si128(++in), mask12);
1023 
1024     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
1025     InReg = _mm_and_si128(_mm_load_si128(++in), mask12);
1026 
1027     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
1028     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1029 					       OutReg);
1030 
1031     OutReg = _mm_srli_epi32(InReg, 12 - 8);
1032     InReg = _mm_and_si128(_mm_load_si128(++in), mask12);
1033 
1034     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
1035     InReg = _mm_and_si128(_mm_load_si128(++in), mask12);
1036 
1037     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
1038     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1039 					       OutReg);
1040 
1041     InReg = _mm_and_si128(_mm_load_si128(++in), mask12);
1042 
1043     OutReg = InReg;
1044     InReg = _mm_and_si128(_mm_load_si128(++in), mask12);
1045 
1046     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
1047     InReg = _mm_and_si128(_mm_load_si128(++in), mask12);
1048 
1049     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
1050     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1051 					       OutReg);
1052 
1053     OutReg = _mm_srli_epi32(InReg, 12 - 4);
1054     InReg = _mm_and_si128(_mm_load_si128(++in), mask12);
1055 
1056     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
1057     InReg = _mm_and_si128(_mm_load_si128(++in), mask12);
1058 
1059     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
1060     InReg = _mm_and_si128(_mm_load_si128(++in), mask12);
1061 
1062     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
1063     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1064 					       OutReg);
1065 
1066     OutReg = _mm_srli_epi32(InReg, 12 - 8);
1067     InReg = _mm_and_si128(_mm_load_si128(++in), mask12);
1068 
1069     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
1070     InReg = _mm_and_si128(_mm_load_si128(++in), mask12);
1071 
1072     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
1073     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1074 					       OutReg);
1075 
1076     return strm_buffer_i;
1077 }
1078 #endif
1079 
1080 static int
pack_12_horiz(UINT4 * out,const UINT4 * in)1081 pack_12_horiz (UINT4 *out, const UINT4 *in) {
1082   int column;
1083 
1084   for (column = 0; column < 4; column++) {
1085 
1086     *out |= (*in)   % (1U << 12 ) ;
1087     ++in;
1088     *out |= ( (*in)   % (1U << 12 )  ) <<  12 ;
1089     ++in;
1090     *out |= ( (*in)   % (1U << 12 )  ) <<  24 ;
1091     ++out;
1092     *out |=  ( (*in)   % (1U << 12 ) ) >> ( 12  -  4 );
1093     ++in;
1094     *out |= ( (*in)   % (1U << 12 )  ) <<  4 ;
1095     ++in;
1096     *out |= ( (*in)   % (1U << 12 )  ) <<  16 ;
1097     ++in;
1098     *out |= ( (*in)   % (1U << 12 )  ) <<  28 ;
1099     ++out;
1100     *out |=  ( (*in)   % (1U << 12 ) ) >> ( 12  -  8 );
1101     ++in;
1102     *out |= ( (*in)   % (1U << 12 )  ) <<  8 ;
1103     ++in;
1104     *out |= ( (*in)   % (1U << 12 )  ) <<  20 ;
1105     ++out;
1106     ++in;
1107     *out |= (*in)   % (1U << 12 ) ;
1108     ++in;
1109     *out |= ( (*in)   % (1U << 12 )  ) <<  12 ;
1110     ++in;
1111     *out |= ( (*in)   % (1U << 12 )  ) <<  24 ;
1112     ++out;
1113     *out |=  ( (*in)   % (1U << 12 ) ) >> ( 12  -  4 );
1114     ++in;
1115     *out |= ( (*in)   % (1U << 12 )  ) <<  4 ;
1116     ++in;
1117     *out |= ( (*in)   % (1U << 12 )  ) <<  16 ;
1118     ++in;
1119     *out |= ( (*in)   % (1U << 12 )  ) <<  28 ;
1120     ++out;
1121     *out |=  ( (*in)   % (1U << 12 ) ) >> ( 12  -  8 );
1122     ++in;
1123     *out |= ( (*in)   % (1U << 12 )  ) <<  8 ;
1124     ++in;
1125     *out |= ( (*in)   % (1U << 12 )  ) <<  20 ;
1126     ++out;
1127     ++in;
1128   }
1129 
1130   return 24;
1131 }
1132 
1133 
1134 
1135 
1136 #ifdef ALLOW_ODD_PACKSIZES
1137 /* nwritten = 7 * 4 = 28 unsigned ints */
1138 static int
write_13_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)1139 write_13_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
1140     const __m128i *in = (const __m128i *) _in;
1141     __m128i OutReg;
1142 
1143     __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask13);
1144     OutReg = InReg;
1145     InReg = _mm_and_si128(_mm_load_si128(++in), mask13);
1146 
1147     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
1148     InReg = _mm_and_si128(_mm_load_si128(++in), mask13);
1149 
1150     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
1151     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1152 					       OutReg);
1153 
1154     OutReg = _mm_srli_epi32(InReg, 13 - 7);
1155     InReg = _mm_and_si128(_mm_load_si128(++in), mask13);
1156 
1157     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
1158     InReg = _mm_and_si128(_mm_load_si128(++in), mask13);
1159 
1160     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
1161     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1162 					       OutReg);
1163 
1164     OutReg = _mm_srli_epi32(InReg, 13 - 1);
1165     InReg = _mm_and_si128(_mm_load_si128(++in), mask13);
1166 
1167     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
1168     InReg = _mm_and_si128(_mm_load_si128(++in), mask13);
1169 
1170     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
1171     InReg = _mm_and_si128(_mm_load_si128(++in), mask13);
1172 
1173     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
1174     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1175 					       OutReg);
1176 
1177     OutReg = _mm_srli_epi32(InReg, 13 - 8);
1178     InReg = _mm_and_si128(_mm_load_si128(++in), mask13);
1179 
1180     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
1181     InReg = _mm_and_si128(_mm_load_si128(++in), mask13);
1182 
1183     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
1184     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1185 					       OutReg);
1186 
1187     OutReg = _mm_srli_epi32(InReg, 13 - 2);
1188     InReg = _mm_and_si128(_mm_load_si128(++in), mask13);
1189 
1190     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
1191     InReg = _mm_and_si128(_mm_load_si128(++in), mask13);
1192 
1193     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
1194     InReg = _mm_and_si128(_mm_load_si128(++in), mask13);
1195 
1196     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
1197     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1198 					       OutReg);
1199 
1200     OutReg = _mm_srli_epi32(InReg, 13 - 9);
1201     InReg = _mm_and_si128(_mm_load_si128(++in), mask13);
1202 
1203     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
1204     InReg = _mm_and_si128(_mm_load_si128(++in), mask13);
1205 
1206     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
1207     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1208 					       OutReg);
1209 
1210     OutReg = _mm_srli_epi32(InReg, 13 - 3);
1211     InReg = _mm_and_si128(_mm_load_si128(++in), mask13);
1212 
1213     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
1214     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1215 					       OutReg);
1216 
1217     return strm_buffer_i;
1218 }
1219 #endif
1220 
1221 
1222 #ifdef HAVE_SSE2
1223 /* nwritten = 7 * 4 = 28 unsigned ints */
1224 static int
write_14_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)1225 write_14_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
1226     const __m128i *in = (const __m128i *) _in;
1227     __m128i OutReg;
1228 
1229     __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask14);
1230     OutReg = InReg;
1231     InReg = _mm_and_si128(_mm_load_si128(++in), mask14);
1232 
1233     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
1234     InReg = _mm_and_si128(_mm_load_si128(++in), mask14);
1235 
1236     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
1237     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1238 					       OutReg);
1239 
1240     OutReg = _mm_srli_epi32(InReg, 14 - 10);
1241     InReg = _mm_and_si128(_mm_load_si128(++in), mask14);
1242 
1243     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
1244     InReg = _mm_and_si128(_mm_load_si128(++in), mask14);
1245 
1246     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
1247     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1248 					       OutReg);
1249 
1250     OutReg = _mm_srli_epi32(InReg, 14 - 6);
1251     InReg = _mm_and_si128(_mm_load_si128(++in), mask14);
1252 
1253     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
1254     InReg = _mm_and_si128(_mm_load_si128(++in), mask14);
1255 
1256     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
1257     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1258 					       OutReg);
1259 
1260     OutReg = _mm_srli_epi32(InReg, 14 - 2);
1261     InReg = _mm_and_si128(_mm_load_si128(++in), mask14);
1262 
1263     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
1264     InReg = _mm_and_si128(_mm_load_si128(++in), mask14);
1265 
1266     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
1267     InReg = _mm_and_si128(_mm_load_si128(++in), mask14);
1268 
1269     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
1270     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1271 					       OutReg);
1272 
1273     OutReg = _mm_srli_epi32(InReg, 14 - 12);
1274     InReg = _mm_and_si128(_mm_load_si128(++in), mask14);
1275 
1276     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
1277     InReg = _mm_and_si128(_mm_load_si128(++in), mask14);
1278 
1279     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
1280     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1281 					       OutReg);
1282 
1283     OutReg = _mm_srli_epi32(InReg, 14 - 8);
1284     InReg = _mm_and_si128(_mm_load_si128(++in), mask14);
1285 
1286     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
1287     InReg = _mm_and_si128(_mm_load_si128(++in), mask14);
1288 
1289     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
1290     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1291 					       OutReg);
1292 
1293     OutReg = _mm_srli_epi32(InReg, 14 - 4);
1294     InReg = _mm_and_si128(_mm_load_si128(++in), mask14);
1295 
1296     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
1297     InReg = _mm_and_si128(_mm_load_si128(++in), mask14);
1298 
1299     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
1300     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1301 					       OutReg);
1302 
1303     return strm_buffer_i;
1304 }
1305 #endif
1306 
1307 
1308 static int
pack_14_horiz(UINT4 * out,const UINT4 * in)1309 pack_14_horiz (UINT4 *out, const UINT4 *in) {
1310   int column;
1311 
1312   for (column = 0; column < 4; column++) {
1313     *out |= (*in)   % (1U << 14 ) ;
1314     ++in;
1315     *out |= ( (*in)   % (1U << 14 )  ) <<  14 ;
1316     ++in;
1317     *out |= ( (*in)   % (1U << 14 )  ) <<  28 ;
1318     ++out;
1319     *out |=  ( (*in)   % (1U << 14 ) ) >> ( 14  -  10 );
1320     ++in;
1321     *out |= ( (*in)   % (1U << 14 )  ) <<  10 ;
1322     ++in;
1323     *out |= ( (*in)   % (1U << 14 )  ) <<  24 ;
1324     ++out;
1325     *out |=  ( (*in)   % (1U << 14 ) ) >> ( 14  -  6 );
1326     ++in;
1327     *out |= ( (*in)   % (1U << 14 )  ) <<  6 ;
1328     ++in;
1329     *out |= ( (*in)   % (1U << 14 )  ) <<  20 ;
1330     ++out;
1331     *out |=  ( (*in)   % (1U << 14 ) ) >> ( 14  -  2 );
1332     ++in;
1333     *out |= ( (*in)   % (1U << 14 )  ) <<  2 ;
1334     ++in;
1335     *out |= ( (*in)   % (1U << 14 )  ) <<  16 ;
1336     ++in;
1337     *out |= ( (*in)   % (1U << 14 )  ) <<  30 ;
1338     ++out;
1339     *out |=  ( (*in)   % (1U << 14 ) ) >> ( 14  -  12 );
1340     ++in;
1341     *out |= ( (*in)   % (1U << 14 )  ) <<  12 ;
1342     ++in;
1343     *out |= ( (*in)   % (1U << 14 )  ) <<  26 ;
1344     ++out;
1345     *out |=  ( (*in)   % (1U << 14 ) ) >> ( 14  -  8 );
1346     ++in;
1347     *out |= ( (*in)   % (1U << 14 )  ) <<  8 ;
1348     ++in;
1349     *out |= ( (*in)   % (1U << 14 )  ) <<  22 ;
1350     ++out;
1351     *out |=  ( (*in)   % (1U << 14 ) ) >> ( 14  -  4 );
1352     ++in;
1353     *out |= ( (*in)   % (1U << 14 )  ) <<  4 ;
1354     ++in;
1355     *out |= ( (*in)   % (1U << 14 )  ) <<  18 ;
1356     ++out;
1357     ++in;
1358   }
1359 
1360   return 28;
1361 }
1362 
1363 
1364 #ifdef ALLOW_ODD_PACKSIZES
1365 /* nwritten = 8 * 4 = 32 unsigned ints */
1366 static int
write_15_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)1367 write_15_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
1368     const __m128i *in = (const __m128i *) _in;
1369     __m128i OutReg;
1370 
1371     __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask15);
1372     OutReg = InReg;
1373     InReg = _mm_and_si128(_mm_load_si128(++in), mask15);
1374 
1375     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
1376     InReg = _mm_and_si128(_mm_load_si128(++in), mask15);
1377 
1378     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
1379     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1380 					       OutReg);
1381 
1382     OutReg = _mm_srli_epi32(InReg, 15 - 13);
1383     InReg = _mm_and_si128(_mm_load_si128(++in), mask15);
1384 
1385     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
1386     InReg = _mm_and_si128(_mm_load_si128(++in), mask15);
1387 
1388     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
1389     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1390 					       OutReg);
1391 
1392     OutReg = _mm_srli_epi32(InReg, 15 - 11);
1393     InReg = _mm_and_si128(_mm_load_si128(++in), mask15);
1394 
1395     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
1396     InReg = _mm_and_si128(_mm_load_si128(++in), mask15);
1397 
1398     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
1399     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1400 					       OutReg);
1401 
1402     OutReg = _mm_srli_epi32(InReg, 15 - 9);
1403     InReg = _mm_and_si128(_mm_load_si128(++in), mask15);
1404 
1405     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
1406     InReg = _mm_and_si128(_mm_load_si128(++in), mask15);
1407 
1408     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
1409     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1410 					       OutReg);
1411 
1412     OutReg = _mm_srli_epi32(InReg, 15 - 7);
1413     InReg = _mm_and_si128(_mm_load_si128(++in), mask15);
1414 
1415     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
1416     InReg = _mm_and_si128(_mm_load_si128(++in), mask15);
1417 
1418     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
1419     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1420 					       OutReg);
1421 
1422     OutReg = _mm_srli_epi32(InReg, 15 - 5);
1423     InReg = _mm_and_si128(_mm_load_si128(++in), mask15);
1424 
1425     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
1426     InReg = _mm_and_si128(_mm_load_si128(++in), mask15);
1427 
1428     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
1429     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1430 					       OutReg);
1431 
1432     OutReg = _mm_srli_epi32(InReg, 15 - 3);
1433     InReg = _mm_and_si128(_mm_load_si128(++in), mask15);
1434 
1435     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
1436     InReg = _mm_and_si128(_mm_load_si128(++in), mask15);
1437 
1438     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
1439     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1440 					       OutReg);
1441 
1442     OutReg = _mm_srli_epi32(InReg, 15 - 1);
1443     InReg = _mm_and_si128(_mm_load_si128(++in), mask15);
1444 
1445     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
1446     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1447 					       OutReg);
1448 
1449     return strm_buffer_i;
1450 }
1451 #endif
1452 
1453 
1454 
1455 #ifdef ALLOW_ODD_PACKSIZES
1456 /* nwritten = 9 * 4 = 36 unsigned ints */
1457 static int
write_17_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)1458 write_17_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
1459     const __m128i *in = (const __m128i *) _in;
1460     __m128i OutReg;
1461 
1462     __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask17);
1463     OutReg = InReg;
1464     InReg = _mm_and_si128(_mm_load_si128(++in), mask17);
1465 
1466     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
1467     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1468 					       OutReg);
1469 
1470     OutReg = _mm_srli_epi32(InReg, 17 - 2);
1471     InReg = _mm_and_si128(_mm_load_si128(++in), mask17);
1472 
1473     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
1474     InReg = _mm_and_si128(_mm_load_si128(++in), mask17);
1475 
1476     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
1477     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1478 					       OutReg);
1479 
1480     OutReg = _mm_srli_epi32(InReg, 17 - 4);
1481     InReg = _mm_and_si128(_mm_load_si128(++in), mask17);
1482 
1483     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
1484     InReg = _mm_and_si128(_mm_load_si128(++in), mask17);
1485 
1486     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
1487     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1488 					       OutReg);
1489 
1490     OutReg = _mm_srli_epi32(InReg, 17 - 6);
1491     InReg = _mm_and_si128(_mm_load_si128(++in), mask17);
1492 
1493     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
1494     InReg = _mm_and_si128(_mm_load_si128(++in), mask17);
1495 
1496     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
1497     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1498 					       OutReg);
1499 
1500     OutReg = _mm_srli_epi32(InReg, 17 - 8);
1501     InReg = _mm_and_si128(_mm_load_si128(++in), mask17);
1502 
1503     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
1504     InReg = _mm_and_si128(_mm_load_si128(++in), mask17);
1505 
1506     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
1507     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1508 					       OutReg);
1509 
1510     OutReg = _mm_srli_epi32(InReg, 17 - 10);
1511     InReg = _mm_and_si128(_mm_load_si128(++in), mask17);
1512 
1513     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
1514     InReg = _mm_and_si128(_mm_load_si128(++in), mask17);
1515 
1516     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
1517     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1518 					       OutReg);
1519 
1520     OutReg = _mm_srli_epi32(InReg, 17 - 12);
1521     InReg = _mm_and_si128(_mm_load_si128(++in), mask17);
1522 
1523     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
1524     InReg = _mm_and_si128(_mm_load_si128(++in), mask17);
1525 
1526     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
1527     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1528 					       OutReg);
1529 
1530     OutReg = _mm_srli_epi32(InReg, 17 - 14);
1531     InReg = _mm_and_si128(_mm_load_si128(++in), mask17);
1532 
1533     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
1534     InReg = _mm_and_si128(_mm_load_si128(++in), mask17);
1535 
1536     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
1537     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1538 					       OutReg);
1539 
1540 
1541     OutReg = _mm_srli_epi32(InReg, 17 - 16);
1542     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1543 					       OutReg);
1544 
1545     return strm_buffer_i;
1546 }
1547 #endif
1548 
1549 
1550 
1551 #ifdef HAVE_SSE2
1552 /* nwritten = 9 * 4 = 36 unsigned ints */
1553 static int
write_18_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)1554 write_18_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
1555     const __m128i *in = (const __m128i *) _in;
1556     __m128i OutReg;
1557 
1558     __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask18);
1559     OutReg = InReg;
1560     InReg = _mm_and_si128(_mm_load_si128(++in), mask18);
1561 
1562     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
1563     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1564 					       OutReg);
1565 
1566     OutReg = _mm_srli_epi32(InReg, 18 - 4);
1567     InReg = _mm_and_si128(_mm_load_si128(++in), mask18);
1568 
1569     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
1570     InReg = _mm_and_si128(_mm_load_si128(++in), mask18);
1571 
1572     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
1573     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1574 					       OutReg);
1575 
1576     OutReg = _mm_srli_epi32(InReg, 18 - 8);
1577     InReg = _mm_and_si128(_mm_load_si128(++in), mask18);
1578 
1579     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
1580     InReg = _mm_and_si128(_mm_load_si128(++in), mask18);
1581 
1582     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
1583     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1584 					       OutReg);
1585 
1586     OutReg = _mm_srli_epi32(InReg, 18 - 12);
1587     InReg = _mm_and_si128(_mm_load_si128(++in), mask18);
1588 
1589     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
1590     InReg = _mm_and_si128(_mm_load_si128(++in), mask18);
1591 
1592     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
1593     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1594 					       OutReg);
1595 
1596     OutReg = _mm_srli_epi32(InReg, 18 - 16);
1597     InReg = _mm_and_si128(_mm_load_si128(++in), mask18);
1598 
1599     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
1600     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1601 					       OutReg);
1602 
1603     OutReg = _mm_srli_epi32(InReg, 18 - 2);
1604     InReg = _mm_and_si128(_mm_load_si128(++in), mask18);
1605 
1606     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
1607     InReg = _mm_and_si128(_mm_load_si128(++in), mask18);
1608 
1609     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
1610     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1611 					       OutReg);
1612 
1613     OutReg = _mm_srli_epi32(InReg, 18 - 6);
1614     InReg = _mm_and_si128(_mm_load_si128(++in), mask18);
1615 
1616     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
1617     InReg = _mm_and_si128(_mm_load_si128(++in), mask18);
1618 
1619     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
1620     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1621 					       OutReg);
1622 
1623     OutReg = _mm_srli_epi32(InReg, 18 - 10);
1624     InReg = _mm_and_si128(_mm_load_si128(++in), mask18);
1625 
1626     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
1627     InReg = _mm_and_si128(_mm_load_si128(++in), mask18);
1628 
1629     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
1630     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1631 					       OutReg);
1632 
1633     OutReg = _mm_srli_epi32(InReg, 18 - 14);
1634     InReg = _mm_and_si128(_mm_load_si128(++in), mask18);
1635 
1636     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
1637     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1638 					       OutReg);
1639 
1640     return strm_buffer_i;
1641 }
1642 #endif
1643 
/* Packs 64 input values, 18 bits each, into 36 output words in
   horizontal format.  The input is handled as 4 columns of 16
   consecutive values; each column fills 9 consecutive output words
   (16 * 18 = 288 = 9 * 32 bits).  Only the low 18 bits of every input
   are used.  Results are OR-ed into out, so bits already set there are
   preserved.  Assumes UINT4 is a 32-bit unsigned type.

   Returns the number of output words produced (always 36). */
static int
pack_18_horiz (UINT4 *out, const UINT4 *in) {
  const UINT4 low18 = (1U << 18) - 1U;
  UINT4 value;
  int column, k, bitpos;

  for (column = 0; column < 4; column++) {
    bitpos = 0;			/* Each column starts on a word boundary */
    for (k = 0; k < 16; k++) {
      value = in[k] & low18;
      *out |= value << bitpos;	/* Bits shifted past bit 31 are dropped here... */
      bitpos += 18;
      if (bitpos >= 32) {
	bitpos -= 32;
	++out;
	if (bitpos > 0) {
	  /* ...and carried into the low end of the next word */
	  *out |= value >> (18 - bitpos);
	}
      }
    }
    in += 16;
  }

  return 36;
}
1702 
1703 
1704 
1705 #ifdef ALLOW_ODD_PACKSIZES
1706 /* nwritten = 10 * 4 = 40 unsigned ints */
1707 static int
write_19_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)1708 write_19_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
1709     const __m128i *in = (const __m128i *) _in;
1710     __m128i OutReg;
1711 
1712     __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask19);
1713     OutReg = InReg;
1714     InReg = _mm_and_si128(_mm_load_si128(++in), mask19);
1715 
1716     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
1717     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1718 					       OutReg);
1719 
1720     OutReg = _mm_srli_epi32(InReg, 19 - 6);
1721     InReg = _mm_and_si128(_mm_load_si128(++in), mask19);
1722 
1723     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
1724     InReg = _mm_and_si128(_mm_load_si128(++in), mask19);
1725 
1726     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
1727     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1728 					       OutReg);
1729 
1730     OutReg = _mm_srli_epi32(InReg, 19 - 12);
1731     InReg = _mm_and_si128(_mm_load_si128(++in), mask19);
1732 
1733     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
1734     InReg = _mm_and_si128(_mm_load_si128(++in), mask19);
1735 
1736     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
1737     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1738 					       OutReg);
1739 
1740     OutReg = _mm_srli_epi32(InReg, 19 - 18);
1741     InReg = _mm_and_si128(_mm_load_si128(++in), mask19);
1742 
1743     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
1744     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1745 					       OutReg);
1746 
1747     OutReg = _mm_srli_epi32(InReg, 19 - 5);
1748     InReg = _mm_and_si128(_mm_load_si128(++in), mask19);
1749 
1750     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
1751     InReg = _mm_and_si128(_mm_load_si128(++in), mask19);
1752 
1753     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
1754     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1755 					       OutReg);
1756 
1757     OutReg = _mm_srli_epi32(InReg, 19 - 11);
1758     InReg = _mm_and_si128(_mm_load_si128(++in), mask19);
1759 
1760     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
1761     InReg = _mm_and_si128(_mm_load_si128(++in), mask19);
1762 
1763     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
1764     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1765 					       OutReg);
1766 
1767     OutReg = _mm_srli_epi32(InReg, 19 - 17);
1768     InReg = _mm_and_si128(_mm_load_si128(++in), mask19);
1769 
1770     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
1771     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1772 					       OutReg);
1773 
1774     OutReg = _mm_srli_epi32(InReg, 19 - 4);
1775     InReg = _mm_and_si128(_mm_load_si128(++in), mask19);
1776 
1777     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
1778     InReg = _mm_and_si128(_mm_load_si128(++in), mask19);
1779 
1780     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
1781     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1782 					       OutReg);
1783 
1784     OutReg = _mm_srli_epi32(InReg, 19 - 10);
1785     InReg = _mm_and_si128(_mm_load_si128(++in), mask19);
1786 
1787     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
1788     InReg = _mm_and_si128(_mm_load_si128(++in), mask19);
1789 
1790     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
1791     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1792 					       OutReg);
1793 
1794 
1795     OutReg = _mm_srli_epi32(InReg, 19 - 16);
1796     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1797 					       OutReg);
1798 
1799     return strm_buffer_i;
1800 }
1801 #endif
1802 
1803 
1804 #ifdef HAVE_SSE2
1805 /* nwritten = 10 * 4 = 40 unsigned ints */
1806 static int
write_20_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)1807 write_20_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
1808     const __m128i *in = (const __m128i *) _in;
1809     __m128i OutReg;
1810 
1811     __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask20);
1812     OutReg = InReg;
1813     InReg = _mm_and_si128(_mm_load_si128(++in), mask20);
1814 
1815     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
1816     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1817 					       OutReg);
1818 
1819     OutReg = _mm_srli_epi32(InReg, 20 - 8);
1820     InReg = _mm_and_si128(_mm_load_si128(++in), mask20);
1821 
1822     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
1823     InReg = _mm_and_si128(_mm_load_si128(++in), mask20);
1824 
1825     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
1826     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1827 					       OutReg);
1828 
1829     OutReg = _mm_srli_epi32(InReg, 20 - 16);
1830     InReg = _mm_and_si128(_mm_load_si128(++in), mask20);
1831 
1832     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
1833     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1834 					       OutReg);
1835 
1836     OutReg = _mm_srli_epi32(InReg, 20 - 4);
1837     InReg = _mm_and_si128(_mm_load_si128(++in), mask20);
1838 
1839     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
1840     InReg = _mm_and_si128(_mm_load_si128(++in), mask20);
1841 
1842     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
1843     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1844 					       OutReg);
1845 
1846     OutReg = _mm_srli_epi32(InReg, 20 - 12);
1847     InReg = _mm_and_si128(_mm_load_si128(++in), mask20);
1848 
1849     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
1850     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1851 					       OutReg);
1852 
1853     InReg = _mm_and_si128(_mm_load_si128(++in), mask20);
1854 
1855     OutReg = InReg;
1856     InReg = _mm_and_si128(_mm_load_si128(++in), mask20);
1857 
1858     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
1859     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1860 					       OutReg);
1861 
1862     OutReg = _mm_srli_epi32(InReg, 20 - 8);
1863     InReg = _mm_and_si128(_mm_load_si128(++in), mask20);
1864 
1865     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
1866     InReg = _mm_and_si128(_mm_load_si128(++in), mask20);
1867 
1868     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
1869     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1870 					       OutReg);
1871 
1872     OutReg = _mm_srli_epi32(InReg, 20 - 16);
1873     InReg = _mm_and_si128(_mm_load_si128(++in), mask20);
1874 
1875     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
1876     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1877 					       OutReg);
1878 
1879     OutReg = _mm_srli_epi32(InReg, 20 - 4);
1880     InReg = _mm_and_si128(_mm_load_si128(++in), mask20);
1881 
1882     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
1883     InReg = _mm_and_si128(_mm_load_si128(++in), mask20);
1884 
1885     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
1886     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1887 					       OutReg);
1888 
1889     OutReg = _mm_srli_epi32(InReg, 20 - 12);
1890     InReg = _mm_and_si128(_mm_load_si128(++in), mask20);
1891 
1892     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
1893     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1894 					       OutReg);
1895 
1896     return strm_buffer_i;
1897 }
1898 #endif
1899 
/* Packs 64 input values, 20 bits each, into 40 output words in
   horizontal format.  The input is handled as 4 columns of 16
   consecutive values; each column fills 10 consecutive output words
   (16 * 20 = 320 = 10 * 32 bits).  Only the low 20 bits of every input
   are used.  Results are OR-ed into out, so bits already set there are
   preserved.  Assumes UINT4 is a 32-bit unsigned type.

   Returns the number of output words produced (always 40). */
static int
pack_20_horiz (UINT4 *out, const UINT4 *in) {
  const UINT4 low20 = (1U << 20) - 1U;
  UINT4 value;
  int column, k, bitpos;

  for (column = 0; column < 4; column++) {
    bitpos = 0;			/* Each column starts on a word boundary */
    for (k = 0; k < 16; k++) {
      value = in[k] & low20;
      *out |= value << bitpos;	/* Bits shifted past bit 31 are dropped here... */
      bitpos += 20;
      if (bitpos >= 32) {
	bitpos -= 32;
	++out;
	if (bitpos > 0) {
	  /* ...and carried into the low end of the next word */
	  *out |= value >> (20 - bitpos);
	}
      }
    }
    in += 16;
  }

  return 40;
}
1959 
1960 
1961 
1962 #ifdef ALLOW_ODD_PACKSIZES
1963 /* nwritten = 11 * 4 = 44 unsigned ints */
1964 static int
write_21_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)1965 write_21_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
1966     const __m128i *in = (const __m128i *) _in;
1967     __m128i OutReg;
1968 
1969     __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask21);
1970     OutReg = InReg;
1971     InReg = _mm_and_si128(_mm_load_si128(++in), mask21);
1972 
1973     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
1974     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1975 					       OutReg);
1976 
1977     OutReg = _mm_srli_epi32(InReg, 21 - 10);
1978     InReg = _mm_and_si128(_mm_load_si128(++in), mask21);
1979 
1980     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
1981     InReg = _mm_and_si128(_mm_load_si128(++in), mask21);
1982 
1983     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
1984     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1985 					       OutReg);
1986 
1987     OutReg = _mm_srli_epi32(InReg, 21 - 20);
1988     InReg = _mm_and_si128(_mm_load_si128(++in), mask21);
1989 
1990     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
1991     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
1992 					       OutReg);
1993 
1994     OutReg = _mm_srli_epi32(InReg, 21 - 9);
1995     InReg = _mm_and_si128(_mm_load_si128(++in), mask21);
1996 
1997     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
1998     InReg = _mm_and_si128(_mm_load_si128(++in), mask21);
1999 
2000     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
2001     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2002 					       OutReg);
2003 
2004     OutReg = _mm_srli_epi32(InReg, 21 - 19);
2005     InReg = _mm_and_si128(_mm_load_si128(++in), mask21);
2006 
2007     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
2008     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2009 					       OutReg);
2010 
2011     OutReg = _mm_srli_epi32(InReg, 21 - 8);
2012     InReg = _mm_and_si128(_mm_load_si128(++in), mask21);
2013 
2014     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
2015     InReg = _mm_and_si128(_mm_load_si128(++in), mask21);
2016 
2017     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
2018     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2019 					       OutReg);
2020 
2021     OutReg = _mm_srli_epi32(InReg, 21 - 18);
2022     InReg = _mm_and_si128(_mm_load_si128(++in), mask21);
2023 
2024     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
2025     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2026 					       OutReg);
2027 
2028     OutReg = _mm_srli_epi32(InReg, 21 - 7);
2029     InReg = _mm_and_si128(_mm_load_si128(++in), mask21);
2030 
2031     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
2032     InReg = _mm_and_si128(_mm_load_si128(++in), mask21);
2033 
2034     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
2035     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2036 					       OutReg);
2037 
2038     OutReg = _mm_srli_epi32(InReg, 21 - 17);
2039     InReg = _mm_and_si128(_mm_load_si128(++in), mask21);
2040 
2041     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
2042     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2043 					       OutReg);
2044 
2045     OutReg = _mm_srli_epi32(InReg, 21 - 6);
2046     InReg = _mm_and_si128(_mm_load_si128(++in), mask21);
2047 
2048     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
2049     InReg = _mm_and_si128(_mm_load_si128(++in), mask21);
2050 
2051     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
2052     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2053 					       OutReg);
2054 
2055 
2056     OutReg = _mm_srli_epi32(InReg, 21 - 16);
2057     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2058 					       OutReg);
2059 
2060     return strm_buffer_i;
2061 }
2062 #endif
2063 
2064 
2065 #ifdef HAVE_SSE2
2066 /* nwritten = 11 * 4 = 44 unsigned ints */
2067 static int
write_22_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)2068 write_22_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
2069     const __m128i *in = (const __m128i *) _in;
2070     __m128i OutReg;
2071 
2072     __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask22);
2073     OutReg = InReg;
2074     InReg = _mm_and_si128(_mm_load_si128(++in), mask22);
2075 
2076     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
2077     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2078 					       OutReg);
2079 
2080     OutReg = _mm_srli_epi32(InReg, 22 - 12);
2081     InReg = _mm_and_si128(_mm_load_si128(++in), mask22);
2082 
2083     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
2084     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2085 					       OutReg);
2086 
2087     OutReg = _mm_srli_epi32(InReg, 22 - 2);
2088     InReg = _mm_and_si128(_mm_load_si128(++in), mask22);
2089 
2090     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
2091     InReg = _mm_and_si128(_mm_load_si128(++in), mask22);
2092 
2093     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
2094     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2095 					       OutReg);
2096 
2097     OutReg = _mm_srli_epi32(InReg, 22 - 14);
2098     InReg = _mm_and_si128(_mm_load_si128(++in), mask22);
2099 
2100     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
2101     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2102 					       OutReg);
2103 
2104     OutReg = _mm_srli_epi32(InReg, 22 - 4);
2105     InReg = _mm_and_si128(_mm_load_si128(++in), mask22);
2106 
2107     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
2108     InReg = _mm_and_si128(_mm_load_si128(++in), mask22);
2109 
2110     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
2111     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2112 					       OutReg);
2113 
2114     OutReg = _mm_srli_epi32(InReg, 22 - 16);
2115     InReg = _mm_and_si128(_mm_load_si128(++in), mask22);
2116 
2117     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
2118     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2119 					       OutReg);
2120 
2121     OutReg = _mm_srli_epi32(InReg, 22 - 6);
2122     InReg = _mm_and_si128(_mm_load_si128(++in), mask22);
2123 
2124     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
2125     InReg = _mm_and_si128(_mm_load_si128(++in), mask22);
2126 
2127     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
2128     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2129 					       OutReg);
2130 
2131     OutReg = _mm_srli_epi32(InReg, 22 - 18);
2132     InReg = _mm_and_si128(_mm_load_si128(++in), mask22);
2133 
2134     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
2135     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2136 					       OutReg);
2137 
2138     OutReg = _mm_srli_epi32(InReg, 22 - 8);
2139     InReg = _mm_and_si128(_mm_load_si128(++in), mask22);
2140 
2141     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
2142     InReg = _mm_and_si128(_mm_load_si128(++in), mask22);
2143 
2144     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
2145     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2146 					       OutReg);
2147 
2148     OutReg = _mm_srli_epi32(InReg, 22 - 20);
2149     InReg = _mm_and_si128(_mm_load_si128(++in), mask22);
2150 
2151     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
2152     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2153 					       OutReg);
2154 
2155     OutReg = _mm_srli_epi32(InReg, 22 - 10);
2156     InReg = _mm_and_si128(_mm_load_si128(++in), mask22);
2157 
2158     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
2159     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2160 					       OutReg);
2161 
2162     return strm_buffer_i;
2163 }
2164 #endif
2165 
/* Packs 64 input values, 22 bits each, into 44 output words in
   horizontal format.  The input is handled as 4 columns of 16
   consecutive values; each column fills 11 consecutive output words
   (16 * 22 = 352 = 11 * 32 bits).  Only the low 22 bits of every input
   are used.  Results are OR-ed into out, so bits already set there are
   preserved.  Assumes UINT4 is a 32-bit unsigned type.

   Returns the number of output words produced (always 44). */
static int
pack_22_horiz (UINT4 *out, const UINT4 *in) {
  const UINT4 low22 = (1U << 22) - 1U;
  UINT4 value;
  int column, k, bitpos;

  for (column = 0; column < 4; column++) {
    bitpos = 0;			/* Each column starts on a word boundary */
    for (k = 0; k < 16; k++) {
      value = in[k] & low22;
      *out |= value << bitpos;	/* Bits shifted past bit 31 are dropped here... */
      bitpos += 22;
      if (bitpos >= 32) {
	bitpos -= 32;
	++out;
	if (bitpos > 0) {
	  /* ...and carried into the low end of the next word */
	  *out |= value >> (22 - bitpos);
	}
      }
    }
    in += 16;
  }

  return 44;
}
2228 
2229 
2230 #ifdef ALLOW_ODD_PACKSIZES
2231 /* nwritten = 12 * 4 = 48 unsigned ints */
2232 static int
write_23_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)2233 write_23_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
2234     const __m128i *in = (const __m128i *) _in;
2235     __m128i OutReg;
2236 
2237     __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask23);
2238     OutReg = InReg;
2239     InReg = _mm_and_si128(_mm_load_si128(++in), mask23);
2240 
2241     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
2242     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2243 					       OutReg);
2244 
2245     OutReg = _mm_srli_epi32(InReg, 23 - 14);
2246     InReg = _mm_and_si128(_mm_load_si128(++in), mask23);
2247 
2248     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
2249     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2250 					       OutReg);
2251 
2252     OutReg = _mm_srli_epi32(InReg, 23 - 5);
2253     InReg = _mm_and_si128(_mm_load_si128(++in), mask23);
2254 
2255     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
2256     InReg = _mm_and_si128(_mm_load_si128(++in), mask23);
2257 
2258     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
2259     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2260 					       OutReg);
2261 
2262     OutReg = _mm_srli_epi32(InReg, 23 - 19);
2263     InReg = _mm_and_si128(_mm_load_si128(++in), mask23);
2264 
2265     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
2266     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2267 					       OutReg);
2268 
2269     OutReg = _mm_srli_epi32(InReg, 23 - 10);
2270     InReg = _mm_and_si128(_mm_load_si128(++in), mask23);
2271 
2272     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
2273     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2274 					       OutReg);
2275 
2276     OutReg = _mm_srli_epi32(InReg, 23 - 1);
2277     InReg = _mm_and_si128(_mm_load_si128(++in), mask23);
2278 
2279     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
2280     InReg = _mm_and_si128(_mm_load_si128(++in), mask23);
2281 
2282     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
2283     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2284 					       OutReg);
2285 
2286     OutReg = _mm_srli_epi32(InReg, 23 - 15);
2287     InReg = _mm_and_si128(_mm_load_si128(++in), mask23);
2288 
2289     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
2290     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2291 					       OutReg);
2292 
2293     OutReg = _mm_srli_epi32(InReg, 23 - 6);
2294     InReg = _mm_and_si128(_mm_load_si128(++in), mask23);
2295 
2296     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
2297     InReg = _mm_and_si128(_mm_load_si128(++in), mask23);
2298 
2299     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
2300     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2301 					       OutReg);
2302 
2303     OutReg = _mm_srli_epi32(InReg, 23 - 20);
2304     InReg = _mm_and_si128(_mm_load_si128(++in), mask23);
2305 
2306     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
2307     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2308 					       OutReg);
2309 
2310     OutReg = _mm_srli_epi32(InReg, 23 - 11);
2311     InReg = _mm_and_si128(_mm_load_si128(++in), mask23);
2312 
2313     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
2314     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2315 					       OutReg);
2316 
2317     OutReg = _mm_srli_epi32(InReg, 23 - 2);
2318     InReg = _mm_and_si128(_mm_load_si128(++in), mask23);
2319 
2320     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
2321     InReg = _mm_and_si128(_mm_load_si128(++in), mask23);
2322 
2323     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
2324     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2325 					       OutReg);
2326 
2327     OutReg = _mm_srli_epi32(InReg, 23 - 16);
2328     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2329 					       OutReg);
2330 
2331     return strm_buffer_i;
2332 }
2333 #endif
2334 
2335 
2336 #ifdef HAVE_SSE2
2337 /* nwritten = 12 * 4 = 48 unsigned ints */
2338 static int
write_24_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)2339 write_24_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
2340     const __m128i *in = (const __m128i *) _in;
2341     __m128i OutReg;
2342 
2343     __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask24);
2344     OutReg = InReg;
2345     InReg = _mm_and_si128(_mm_load_si128(++in), mask24);
2346 
2347     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
2348     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2349 					       OutReg);
2350 
2351     OutReg = _mm_srli_epi32(InReg, 24 - 16);
2352     InReg = _mm_and_si128(_mm_load_si128(++in), mask24);
2353 
2354     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
2355     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2356 					       OutReg);
2357 
2358     OutReg = _mm_srli_epi32(InReg, 24 - 8);
2359     InReg = _mm_and_si128(_mm_load_si128(++in), mask24);
2360 
2361     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
2362     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2363 					       OutReg);
2364 
2365     InReg = _mm_and_si128(_mm_load_si128(++in), mask24);
2366 
2367     OutReg = InReg;
2368     InReg = _mm_and_si128(_mm_load_si128(++in), mask24);
2369 
2370     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
2371     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2372 					       OutReg);
2373 
2374     OutReg = _mm_srli_epi32(InReg, 24 - 16);
2375     InReg = _mm_and_si128(_mm_load_si128(++in), mask24);
2376 
2377     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
2378     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2379 					       OutReg);
2380 
2381     OutReg = _mm_srli_epi32(InReg, 24 - 8);
2382     InReg = _mm_and_si128(_mm_load_si128(++in), mask24);
2383 
2384     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
2385     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2386 					       OutReg);
2387 
2388     InReg = _mm_and_si128(_mm_load_si128(++in), mask24);
2389 
2390     OutReg = InReg;
2391     InReg = _mm_and_si128(_mm_load_si128(++in), mask24);
2392 
2393     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
2394     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2395 					       OutReg);
2396 
2397     OutReg = _mm_srli_epi32(InReg, 24 - 16);
2398     InReg = _mm_and_si128(_mm_load_si128(++in), mask24);
2399 
2400     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
2401     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2402 					       OutReg);
2403 
2404     OutReg = _mm_srli_epi32(InReg, 24 - 8);
2405     InReg = _mm_and_si128(_mm_load_si128(++in), mask24);
2406 
2407     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
2408     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2409 					       OutReg);
2410 
2411     InReg = _mm_and_si128(_mm_load_si128(++in), mask24);
2412 
2413     OutReg = InReg;
2414     InReg = _mm_and_si128(_mm_load_si128(++in), mask24);
2415 
2416     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
2417     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2418 					       OutReg);
2419 
2420     OutReg = _mm_srli_epi32(InReg, 24 - 16);
2421     InReg = _mm_and_si128(_mm_load_si128(++in), mask24);
2422 
2423     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
2424     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2425 					       OutReg);
2426 
2427     OutReg = _mm_srli_epi32(InReg, 24 - 8);
2428     InReg = _mm_and_si128(_mm_load_si128(++in), mask24);
2429 
2430     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
2431     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2432 					       OutReg);
2433 
2434     return strm_buffer_i;
2435 }
2436 #endif
2437 
/* Packs 64 input values at 24 bits each into 48 output words, as four
   consecutive columns of 16 inputs -> 12 outputs.  Within a column the
   24-bit values are laid end to end, least significant bits first, so
   four inputs fill exactly three output words.

   Bits are ORed into out, so the caller must supply 48 zeroed words.
   Input bits above bit 23 are ignored.  Always returns 48. */
static int
pack_24_horiz (UINT4 *out, const UINT4 *in) {
  const UINT4 low24 = (1U << 24) - 1U;
  UINT4 value;
  int column, group, shift;

  for (column = 0; column < 4; column++) {
    for (group = 0; group < 4; group++) {
      /* The first value of a group sits at bit 0 of a fresh word */
      *out |= *in++ & low24;

      for (shift = 24; shift > 0; shift -= 8) {
        value = *in++ & low24;
        *out++ |= value << shift;
        if (shift > 8) {
          /* Carry the bits that did not fit into the next word.  The
             last value of a group ends on a word boundary, so it has
             nothing to carry. */
          *out |= value >> (32 - shift);
        }
      }
    }
  }

  return 48;
}
2499 
2500 
2501 #ifdef ALLOW_ODD_PACKSIZES
2502 /* nwritten = 13 * 4 = 52 unsigned ints */
2503 static int
write_25_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)2504 write_25_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
2505     const __m128i *in = (const __m128i *) _in;
2506     __m128i OutReg;
2507 
2508     __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask25);
2509     OutReg = InReg;
2510     InReg = _mm_and_si128(_mm_load_si128(++in), mask25);
2511 
2512     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
2513     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2514 					       OutReg);
2515 
2516     OutReg = _mm_srli_epi32(InReg, 25 - 18);
2517     InReg = _mm_and_si128(_mm_load_si128(++in), mask25);
2518 
2519     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
2520     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2521 					       OutReg);
2522 
2523     OutReg = _mm_srli_epi32(InReg, 25 - 11);
2524     InReg = _mm_and_si128(_mm_load_si128(++in), mask25);
2525 
2526     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
2527     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2528 					       OutReg);
2529 
2530     OutReg = _mm_srli_epi32(InReg, 25 - 4);
2531     InReg = _mm_and_si128(_mm_load_si128(++in), mask25);
2532 
2533     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
2534     InReg = _mm_and_si128(_mm_load_si128(++in), mask25);
2535 
2536     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
2537     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2538 					       OutReg);
2539 
2540     OutReg = _mm_srli_epi32(InReg, 25 - 22);
2541     InReg = _mm_and_si128(_mm_load_si128(++in), mask25);
2542 
2543     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
2544     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2545 					       OutReg);
2546 
2547     OutReg = _mm_srli_epi32(InReg, 25 - 15);
2548     InReg = _mm_and_si128(_mm_load_si128(++in), mask25);
2549 
2550     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
2551     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2552 					       OutReg);
2553 
2554     OutReg = _mm_srli_epi32(InReg, 25 - 8);
2555     InReg = _mm_and_si128(_mm_load_si128(++in), mask25);
2556 
2557     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
2558     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2559 					       OutReg);
2560 
2561     OutReg = _mm_srli_epi32(InReg, 25 - 1);
2562     InReg = _mm_and_si128(_mm_load_si128(++in), mask25);
2563 
2564     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
2565     InReg = _mm_and_si128(_mm_load_si128(++in), mask25);
2566 
2567     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
2568     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2569 					       OutReg);
2570 
2571     OutReg = _mm_srli_epi32(InReg, 25 - 19);
2572     InReg = _mm_and_si128(_mm_load_si128(++in), mask25);
2573 
2574     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
2575     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2576 					       OutReg);
2577 
2578     OutReg = _mm_srli_epi32(InReg, 25 - 12);
2579     InReg = _mm_and_si128(_mm_load_si128(++in), mask25);
2580 
2581     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
2582     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2583 					       OutReg);
2584 
2585     OutReg = _mm_srli_epi32(InReg, 25 - 5);
2586     InReg = _mm_and_si128(_mm_load_si128(++in), mask25);
2587 
2588     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
2589     InReg = _mm_and_si128(_mm_load_si128(++in), mask25);
2590 
2591     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
2592     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2593 					       OutReg);
2594 
2595     OutReg = _mm_srli_epi32(InReg, 25 - 23);
2596     InReg = _mm_and_si128(_mm_load_si128(++in), mask25);
2597 
2598     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
2599     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2600 					       OutReg);
2601 
2602 
2603     OutReg = _mm_srli_epi32(InReg, 25 - 16);
2604     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2605 					       OutReg);
2606 
2607     return strm_buffer_i;
2608 }
2609 #endif
2610 
2611 
2612 #ifdef HAVE_SSE2
2613 /* nwritten = 13 * 4 = 52 unsigned ints */
2614 static int
write_26_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)2615 write_26_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
2616     const __m128i *in = (const __m128i *) _in;
2617     __m128i OutReg;
2618 
2619     __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask26);
2620     OutReg = InReg;
2621     InReg = _mm_and_si128(_mm_load_si128(++in), mask26);
2622 
2623     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
2624     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2625 					       OutReg);
2626 
2627     OutReg = _mm_srli_epi32(InReg, 26 - 20);
2628     InReg = _mm_and_si128(_mm_load_si128(++in), mask26);
2629 
2630     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
2631     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2632 					       OutReg);
2633 
2634     OutReg = _mm_srli_epi32(InReg, 26 - 14);
2635     InReg = _mm_and_si128(_mm_load_si128(++in), mask26);
2636 
2637     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
2638     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2639 					       OutReg);
2640 
2641     OutReg = _mm_srli_epi32(InReg, 26 - 8);
2642     InReg = _mm_and_si128(_mm_load_si128(++in), mask26);
2643 
2644     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
2645     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2646 					       OutReg);
2647 
2648     OutReg = _mm_srli_epi32(InReg, 26 - 2);
2649     InReg = _mm_and_si128(_mm_load_si128(++in), mask26);
2650 
2651     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
2652     InReg = _mm_and_si128(_mm_load_si128(++in), mask26);
2653 
2654     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
2655     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2656 					       OutReg);
2657 
2658     OutReg = _mm_srli_epi32(InReg, 26 - 22);
2659     InReg = _mm_and_si128(_mm_load_si128(++in), mask26);
2660 
2661     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
2662     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2663 					       OutReg);
2664 
2665     OutReg = _mm_srli_epi32(InReg, 26 - 16);
2666     InReg = _mm_and_si128(_mm_load_si128(++in), mask26);
2667 
2668     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
2669     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2670 					       OutReg);
2671 
2672     OutReg = _mm_srli_epi32(InReg, 26 - 10);
2673     InReg = _mm_and_si128(_mm_load_si128(++in), mask26);
2674 
2675     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
2676     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2677 					       OutReg);
2678 
2679     OutReg = _mm_srli_epi32(InReg, 26 - 4);
2680     InReg = _mm_and_si128(_mm_load_si128(++in), mask26);
2681 
2682     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
2683     InReg = _mm_and_si128(_mm_load_si128(++in), mask26);
2684 
2685     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
2686     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2687 					       OutReg);
2688 
2689     OutReg = _mm_srli_epi32(InReg, 26 - 24);
2690     InReg = _mm_and_si128(_mm_load_si128(++in), mask26);
2691 
2692     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
2693     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2694 					       OutReg);
2695 
2696     OutReg = _mm_srli_epi32(InReg, 26 - 18);
2697     InReg = _mm_and_si128(_mm_load_si128(++in), mask26);
2698 
2699     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
2700     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2701 					       OutReg);
2702 
2703     OutReg = _mm_srli_epi32(InReg, 26 - 12);
2704     InReg = _mm_and_si128(_mm_load_si128(++in), mask26);
2705 
2706     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
2707     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2708 					       OutReg);
2709 
2710     OutReg = _mm_srli_epi32(InReg, 26 - 6);
2711     InReg = _mm_and_si128(_mm_load_si128(++in), mask26);
2712 
2713     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
2714     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2715 					       OutReg);
2716 
2717     return strm_buffer_i;
2718 }
2719 #endif
2720 
2721 
/* Packs 64 input values at 26 bits each into 52 output words, as four
   consecutive columns of 16 inputs -> 13 outputs.  Within a column the
   26-bit values are laid end to end, least significant bits first; a
   value that crosses a word boundary has its high bits carried into
   the next word.

   Bits are ORed into out, so the caller must supply 52 zeroed words.
   Input bits above bit 25 are ignored.  Always returns 52. */
static int
pack_26_horiz (UINT4 *out, const UINT4 *in) {
  const UINT4 low26 = (1U << 26) - 1U;
  UINT4 value;
  int column, k;
  int used;			/* Bits already occupied in *out */

  for (column = 0; column < 4; column++) {
    used = 0;
    for (k = 0; k < 16; k++) {
      value = *in++ & low26;
      *out |= value << used;
      used += 26;

      if (used >= 32) {
        /* Current word is complete; move on and carry any overflow */
        out++;
        used -= 32;
        if (used > 0) {
          *out |= value >> (26 - used);
        }
      }
    }
  }

  return 52;
}
2788 
2789 
2790 
2791 #ifdef ALLOW_ODD_PACKSIZES
2792 /* nwritten = 14 * 4 = 56 unsigned ints */
2793 static int
write_27_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)2794 write_27_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
2795     const __m128i *in = (const __m128i *) _in;
2796     __m128i OutReg;
2797 
2798     __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask27);
2799     OutReg = InReg;
2800     InReg = _mm_and_si128(_mm_load_si128(++in), mask27);
2801 
2802     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
2803     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2804 					       OutReg);
2805 
2806     OutReg = _mm_srli_epi32(InReg, 27 - 22);
2807     InReg = _mm_and_si128(_mm_load_si128(++in), mask27);
2808 
2809     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
2810     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2811 					       OutReg);
2812 
2813     OutReg = _mm_srli_epi32(InReg, 27 - 17);
2814     InReg = _mm_and_si128(_mm_load_si128(++in), mask27);
2815 
2816     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
2817     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2818 					       OutReg);
2819 
2820     OutReg = _mm_srli_epi32(InReg, 27 - 12);
2821     InReg = _mm_and_si128(_mm_load_si128(++in), mask27);
2822 
2823     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
2824     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2825 					       OutReg);
2826 
2827     OutReg = _mm_srli_epi32(InReg, 27 - 7);
2828     InReg = _mm_and_si128(_mm_load_si128(++in), mask27);
2829 
2830     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
2831     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2832 					       OutReg);
2833 
2834     OutReg = _mm_srli_epi32(InReg, 27 - 2);
2835     InReg = _mm_and_si128(_mm_load_si128(++in), mask27);
2836 
2837     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
2838     InReg = _mm_and_si128(_mm_load_si128(++in), mask27);
2839 
2840     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
2841     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2842 					       OutReg);
2843 
2844     OutReg = _mm_srli_epi32(InReg, 27 - 24);
2845     InReg = _mm_and_si128(_mm_load_si128(++in), mask27);
2846 
2847     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
2848     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2849 					       OutReg);
2850 
2851     OutReg = _mm_srli_epi32(InReg, 27 - 19);
2852     InReg = _mm_and_si128(_mm_load_si128(++in), mask27);
2853 
2854     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
2855     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2856 					       OutReg);
2857 
2858     OutReg = _mm_srli_epi32(InReg, 27 - 14);
2859     InReg = _mm_and_si128(_mm_load_si128(++in), mask27);
2860 
2861     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
2862     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2863 					       OutReg);
2864 
2865     OutReg = _mm_srli_epi32(InReg, 27 - 9);
2866     InReg = _mm_and_si128(_mm_load_si128(++in), mask27);
2867 
2868     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
2869     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2870 					       OutReg);
2871 
2872     OutReg = _mm_srli_epi32(InReg, 27 - 4);
2873     InReg = _mm_and_si128(_mm_load_si128(++in), mask27);
2874 
2875     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
2876     InReg = _mm_and_si128(_mm_load_si128(++in), mask27);
2877 
2878     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
2879     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2880 					       OutReg);
2881 
2882     OutReg = _mm_srli_epi32(InReg, 27 - 26);
2883     InReg = _mm_and_si128(_mm_load_si128(++in), mask27);
2884 
2885     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
2886     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2887 					       OutReg);
2888 
2889     OutReg = _mm_srli_epi32(InReg, 27 - 21);
2890     InReg = _mm_and_si128(_mm_load_si128(++in), mask27);
2891 
2892     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
2893     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2894 					       OutReg);
2895 
2896     OutReg = _mm_srli_epi32(InReg, 27 - 16);
2897     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2898 					       OutReg);
2899 
2900     return strm_buffer_i;
2901 }
2902 #endif
2903 
2904 
2905 #ifdef HAVE_SSE2
2906 /* nwritten = 14 * 4 = 56 unsigned ints */
2907 static int
write_28_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)2908 write_28_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
2909     const __m128i *in = (const __m128i *) _in;
2910     __m128i OutReg;
2911 
2912     __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask28);
2913     OutReg = InReg;
2914     InReg = _mm_and_si128(_mm_load_si128(++in), mask28);
2915 
2916     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
2917     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2918 					       OutReg);
2919 
2920     OutReg = _mm_srli_epi32(InReg, 28 - 24);
2921     InReg = _mm_and_si128(_mm_load_si128(++in), mask28);
2922 
2923     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
2924     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2925 					       OutReg);
2926 
2927     OutReg = _mm_srli_epi32(InReg, 28 - 20);
2928     InReg = _mm_and_si128(_mm_load_si128(++in), mask28);
2929 
2930     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
2931     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2932 					       OutReg);
2933 
2934     OutReg = _mm_srli_epi32(InReg, 28 - 16);
2935     InReg = _mm_and_si128(_mm_load_si128(++in), mask28);
2936 
2937     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
2938     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2939 					       OutReg);
2940 
2941     OutReg = _mm_srli_epi32(InReg, 28 - 12);
2942     InReg = _mm_and_si128(_mm_load_si128(++in), mask28);
2943 
2944     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
2945     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2946 					       OutReg);
2947 
2948     OutReg = _mm_srli_epi32(InReg, 28 - 8);
2949     InReg = _mm_and_si128(_mm_load_si128(++in), mask28);
2950 
2951     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
2952     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2953 					       OutReg);
2954 
2955     OutReg = _mm_srli_epi32(InReg, 28 - 4);
2956     InReg = _mm_and_si128(_mm_load_si128(++in), mask28);
2957 
2958     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
2959     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2960 					       OutReg);
2961 
2962     InReg = _mm_and_si128(_mm_load_si128(++in), mask28);
2963 
2964     OutReg = InReg;
2965     InReg = _mm_and_si128(_mm_load_si128(++in), mask28);
2966 
2967     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
2968     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2969 					       OutReg);
2970 
2971     OutReg = _mm_srli_epi32(InReg, 28 - 24);
2972     InReg = _mm_and_si128(_mm_load_si128(++in), mask28);
2973 
2974     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
2975     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2976 					       OutReg);
2977 
2978     OutReg = _mm_srli_epi32(InReg, 28 - 20);
2979     InReg = _mm_and_si128(_mm_load_si128(++in), mask28);
2980 
2981     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
2982     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2983 					       OutReg);
2984 
2985     OutReg = _mm_srli_epi32(InReg, 28 - 16);
2986     InReg = _mm_and_si128(_mm_load_si128(++in), mask28);
2987 
2988     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
2989     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2990 					       OutReg);
2991 
2992     OutReg = _mm_srli_epi32(InReg, 28 - 12);
2993     InReg = _mm_and_si128(_mm_load_si128(++in), mask28);
2994 
2995     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
2996     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
2997 					       OutReg);
2998 
2999     OutReg = _mm_srli_epi32(InReg, 28 - 8);
3000     InReg = _mm_and_si128(_mm_load_si128(++in), mask28);
3001 
3002     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
3003     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3004 					       OutReg);
3005 
3006     OutReg = _mm_srli_epi32(InReg, 28 - 4);
3007     InReg = _mm_and_si128(_mm_load_si128(++in), mask28);
3008 
3009     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
3010     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3011 					       OutReg);
3012 
3013     return strm_buffer_i;
3014 }
3015 #endif
3016 
3017 
/* Packs 64 input values at 28 bits each into 56 output words, as four
   consecutive columns of 16 inputs -> 14 outputs.  Within a column the
   28-bit values are laid end to end, least significant bits first, so
   eight inputs fill exactly seven output words.

   Bits are ORed into out, so the caller must supply 56 zeroed words.
   Input bits above bit 27 are ignored.  Always returns 56. */
static int
pack_28_horiz (UINT4 *out, const UINT4 *in) {
  const UINT4 low28 = (1U << 28) - 1U;
  UINT4 value;
  int column, half, shift;

  for (column = 0; column < 4; column++) {
    for (half = 0; half < 2; half++) {
      /* The first value of each half sits at bit 0 of a fresh word */
      *out |= *in++ & low28;

      for (shift = 28; shift > 0; shift -= 4) {
        value = *in++ & low28;
        *out++ |= value << shift;
        if (shift > 4) {
          /* Carry the bits that did not fit into the next word.  The
             last value of a half ends on a word boundary, so it has
             nothing to carry. */
          *out |= value >> (32 - shift);
        }
      }
    }
  }

  return 56;
}
3085 
3086 
3087 
3088 #ifdef ALLOW_ODD_PACKSIZES
3089 /* nwritten = 15 * 4 = 60 unsigned ints */
3090 static int
write_29_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)3091 write_29_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
3092     const __m128i *in = (const __m128i *) _in;
3093     __m128i OutReg;
3094 
3095     __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask29);
3096     OutReg = InReg;
3097     InReg = _mm_and_si128(_mm_load_si128(++in), mask29);
3098 
3099     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
3100     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3101 					       OutReg);
3102 
3103     OutReg = _mm_srli_epi32(InReg, 29 - 26);
3104     InReg = _mm_and_si128(_mm_load_si128(++in), mask29);
3105 
3106     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
3107     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3108 					       OutReg);
3109 
3110     OutReg = _mm_srli_epi32(InReg, 29 - 23);
3111     InReg = _mm_and_si128(_mm_load_si128(++in), mask29);
3112 
3113     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
3114     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3115 					       OutReg);
3116 
3117     OutReg = _mm_srli_epi32(InReg, 29 - 20);
3118     InReg = _mm_and_si128(_mm_load_si128(++in), mask29);
3119 
3120     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
3121     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3122 					       OutReg);
3123 
3124     OutReg = _mm_srli_epi32(InReg, 29 - 17);
3125     InReg = _mm_and_si128(_mm_load_si128(++in), mask29);
3126 
3127     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
3128     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3129 					       OutReg);
3130 
3131     OutReg = _mm_srli_epi32(InReg, 29 - 14);
3132     InReg = _mm_and_si128(_mm_load_si128(++in), mask29);
3133 
3134     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
3135     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3136 					       OutReg);
3137 
3138     OutReg = _mm_srli_epi32(InReg, 29 - 11);
3139     InReg = _mm_and_si128(_mm_load_si128(++in), mask29);
3140 
3141     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
3142     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3143 					       OutReg);
3144 
3145     OutReg = _mm_srli_epi32(InReg, 29 - 8);
3146     InReg = _mm_and_si128(_mm_load_si128(++in), mask29);
3147 
3148     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
3149     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3150 					       OutReg);
3151 
3152     OutReg = _mm_srli_epi32(InReg, 29 - 5);
3153     InReg = _mm_and_si128(_mm_load_si128(++in), mask29);
3154 
3155     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
3156     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3157 					       OutReg);
3158 
3159     OutReg = _mm_srli_epi32(InReg, 29 - 2);
3160     InReg = _mm_and_si128(_mm_load_si128(++in), mask29);
3161 
3162     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
3163     InReg = _mm_and_si128(_mm_load_si128(++in), mask29);
3164 
3165     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
3166     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3167 					       OutReg);
3168 
3169     OutReg = _mm_srli_epi32(InReg, 29 - 28);
3170     InReg = _mm_and_si128(_mm_load_si128(++in), mask29);
3171 
3172     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
3173     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3174 					       OutReg);
3175 
3176     OutReg = _mm_srli_epi32(InReg, 29 - 25);
3177     InReg = _mm_and_si128(_mm_load_si128(++in), mask29);
3178 
3179     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
3180     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3181 					       OutReg);
3182 
3183     OutReg = _mm_srli_epi32(InReg, 29 - 22);
3184     InReg = _mm_and_si128(_mm_load_si128(++in), mask29);
3185 
3186     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
3187     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3188 					       OutReg);
3189 
3190     OutReg = _mm_srli_epi32(InReg, 29 - 19);
3191     InReg = _mm_and_si128(_mm_load_si128(++in), mask29);
3192 
3193     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
3194     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3195 					       OutReg);
3196 
3197 
3198     OutReg = _mm_srli_epi32(InReg, 29 - 16);
3199     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3200 					       OutReg);
3201 
3202     return strm_buffer_i;
3203 }
3204 #endif
3205 
3206 
3207 #ifdef HAVE_SSE2
3208 /* nwritten = 15 * 4 = 60 unsigned ints */
3209 static int
write_30_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)3210 write_30_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
3211     const __m128i *in = (const __m128i *) _in;
3212     __m128i OutReg;
3213 
3214     __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask30);
3215     OutReg = InReg;
3216     InReg = _mm_and_si128(_mm_load_si128(++in), mask30);
3217 
3218     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
3219     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3220 					       OutReg);
3221 
3222     OutReg = _mm_srli_epi32(InReg, 30 - 28);
3223     InReg = _mm_and_si128(_mm_load_si128(++in), mask30);
3224 
3225     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
3226     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3227 					       OutReg);
3228 
3229     OutReg = _mm_srli_epi32(InReg, 30 - 26);
3230     InReg = _mm_and_si128(_mm_load_si128(++in), mask30);
3231 
3232     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
3233     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3234 					       OutReg);
3235 
3236     OutReg = _mm_srli_epi32(InReg, 30 - 24);
3237     InReg = _mm_and_si128(_mm_load_si128(++in), mask30);
3238 
3239     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
3240     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3241 					       OutReg);
3242 
3243     OutReg = _mm_srli_epi32(InReg, 30 - 22);
3244     InReg = _mm_and_si128(_mm_load_si128(++in), mask30);
3245 
3246     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
3247     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3248 					       OutReg);
3249 
3250     OutReg = _mm_srli_epi32(InReg, 30 - 20);
3251     InReg = _mm_and_si128(_mm_load_si128(++in), mask30);
3252 
3253     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
3254     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3255 					       OutReg);
3256 
3257     OutReg = _mm_srli_epi32(InReg, 30 - 18);
3258     InReg = _mm_and_si128(_mm_load_si128(++in), mask30);
3259 
3260     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
3261     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3262 					       OutReg);
3263 
3264     OutReg = _mm_srli_epi32(InReg, 30 - 16);
3265     InReg = _mm_and_si128(_mm_load_si128(++in), mask30);
3266 
3267     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
3268     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3269 					       OutReg);
3270 
3271     OutReg = _mm_srli_epi32(InReg, 30 - 14);
3272     InReg = _mm_and_si128(_mm_load_si128(++in), mask30);
3273 
3274     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
3275     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3276 					       OutReg);
3277 
3278     OutReg = _mm_srli_epi32(InReg, 30 - 12);
3279     InReg = _mm_and_si128(_mm_load_si128(++in), mask30);
3280 
3281     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
3282     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3283 					       OutReg);
3284 
3285     OutReg = _mm_srli_epi32(InReg, 30 - 10);
3286     InReg = _mm_and_si128(_mm_load_si128(++in), mask30);
3287 
3288     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
3289     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3290 					       OutReg);
3291 
3292     OutReg = _mm_srli_epi32(InReg, 30 - 8);
3293     InReg = _mm_and_si128(_mm_load_si128(++in), mask30);
3294 
3295     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
3296     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3297 					       OutReg);
3298 
3299     OutReg = _mm_srli_epi32(InReg, 30 - 6);
3300     InReg = _mm_and_si128(_mm_load_si128(++in), mask30);
3301 
3302     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
3303     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3304 					       OutReg);
3305 
3306     OutReg = _mm_srli_epi32(InReg, 30 - 4);
3307     InReg = _mm_and_si128(_mm_load_si128(++in), mask30);
3308 
3309     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
3310     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3311 					       OutReg);
3312 
3313     OutReg = _mm_srli_epi32(InReg, 30 - 2);
3314     InReg = _mm_and_si128(_mm_load_si128(++in), mask30);
3315 
3316     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
3317     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3318 					       OutReg);
3319 
3320     return strm_buffer_i;
3321 }
3322 #endif
3323 
3324 
3325 static int
pack_30_horiz(UINT4 * out,const UINT4 * in)3326 pack_30_horiz (UINT4 *out, const UINT4 *in) {
3327   int column;
3328 
3329   for (column = 0; column < 4; column++) {
3330     *out |= (*in)   % (1U << 30 ) ;
3331     ++in;
3332     *out |= ( (*in)   % (1U << 30 )  ) <<  30 ;
3333     ++out;
3334     *out |=  ( (*in)   % (1U << 30 ) ) >> ( 30  -  28 );
3335     ++in;
3336     *out |= ( (*in)   % (1U << 30 )  ) <<  28 ;
3337     ++out;
3338     *out |=  ( (*in)   % (1U << 30 ) ) >> ( 30  -  26 );
3339     ++in;
3340     *out |= ( (*in)   % (1U << 30 )  ) <<  26 ;
3341     ++out;
3342     *out |=  ( (*in)   % (1U << 30 ) ) >> ( 30  -  24 );
3343     ++in;
3344     *out |= ( (*in)   % (1U << 30 )  ) <<  24 ;
3345     ++out;
3346     *out |=  ( (*in)   % (1U << 30 ) ) >> ( 30  -  22 );
3347     ++in;
3348     *out |= ( (*in)   % (1U << 30 )  ) <<  22 ;
3349     ++out;
3350     *out |=  ( (*in)   % (1U << 30 ) ) >> ( 30  -  20 );
3351     ++in;
3352     *out |= ( (*in)   % (1U << 30 )  ) <<  20 ;
3353     ++out;
3354     *out |=  ( (*in)   % (1U << 30 ) ) >> ( 30  -  18 );
3355     ++in;
3356     *out |= ( (*in)   % (1U << 30 )  ) <<  18 ;
3357     ++out;
3358     *out |=  ( (*in)   % (1U << 30 ) ) >> ( 30  -  16 );
3359     ++in;
3360     *out |= ( (*in)   % (1U << 30 )  ) <<  16 ;
3361     ++out;
3362     *out |=  ( (*in)   % (1U << 30 ) ) >> ( 30  -  14 );
3363     ++in;
3364     *out |= ( (*in)   % (1U << 30 )  ) <<  14 ;
3365     ++out;
3366     *out |=  ( (*in)   % (1U << 30 ) ) >> ( 30  -  12 );
3367     ++in;
3368     *out |= ( (*in)   % (1U << 30 )  ) <<  12 ;
3369     ++out;
3370     *out |=  ( (*in)   % (1U << 30 ) ) >> ( 30  -  10 );
3371     ++in;
3372     *out |= ( (*in)   % (1U << 30 )  ) <<  10 ;
3373     ++out;
3374     *out |=  ( (*in)   % (1U << 30 ) ) >> ( 30  -  8 );
3375     ++in;
3376     *out |= ( (*in)   % (1U << 30 )  ) <<  8 ;
3377     ++out;
3378     *out |=  ( (*in)   % (1U << 30 ) ) >> ( 30  -  6 );
3379     ++in;
3380     *out |= ( (*in)   % (1U << 30 )  ) <<  6 ;
3381     ++out;
3382     *out |=  ( (*in)   % (1U << 30 ) ) >> ( 30  -  4 );
3383     ++in;
3384     *out |= ( (*in)   % (1U << 30 )  ) <<  4 ;
3385     ++out;
3386     *out |=  ( (*in)   % (1U << 30 ) ) >> ( 30  -  2 );
3387     ++in;
3388     *out |= ( (*in)   % (1U << 30 )  ) <<  2 ;
3389     ++out;
3390     ++in;
3391   }
3392 
3393   return 60;
3394 }
3395 
3396 
3397 
3398 #ifdef ALLOW_ODD_PACKSIZES
3399 /* nwritten = 16 * 4 = 64 unsigned ints */
3400 static int
write_31_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)3401 write_31_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
3402     const __m128i *in = (const __m128i *) _in;
3403     __m128i OutReg;
3404 
3405     __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask31);
3406     OutReg = InReg;
3407     InReg = _mm_and_si128(_mm_load_si128(++in), mask31);
3408 
3409     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
3410     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3411 					       OutReg);
3412 
3413     OutReg = _mm_srli_epi32(InReg, 31 - 30);
3414     InReg = _mm_and_si128(_mm_load_si128(++in), mask31);
3415 
3416     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
3417     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3418 					       OutReg);
3419 
3420     OutReg = _mm_srli_epi32(InReg, 31 - 29);
3421     InReg = _mm_and_si128(_mm_load_si128(++in), mask31);
3422 
3423     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
3424     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3425 					       OutReg);
3426 
3427     OutReg = _mm_srli_epi32(InReg, 31 - 28);
3428     InReg = _mm_and_si128(_mm_load_si128(++in), mask31);
3429 
3430     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
3431     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3432 					       OutReg);
3433 
3434     OutReg = _mm_srli_epi32(InReg, 31 - 27);
3435     InReg = _mm_and_si128(_mm_load_si128(++in), mask31);
3436 
3437     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
3438     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3439 					       OutReg);
3440 
3441     OutReg = _mm_srli_epi32(InReg, 31 - 26);
3442     InReg = _mm_and_si128(_mm_load_si128(++in), mask31);
3443 
3444     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
3445     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3446 					       OutReg);
3447 
3448     OutReg = _mm_srli_epi32(InReg, 31 - 25);
3449     InReg = _mm_and_si128(_mm_load_si128(++in), mask31);
3450 
3451     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
3452     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3453 					       OutReg);
3454 
3455     OutReg = _mm_srli_epi32(InReg, 31 - 24);
3456     InReg = _mm_and_si128(_mm_load_si128(++in), mask31);
3457 
3458     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
3459     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3460 					       OutReg);
3461 
3462     OutReg = _mm_srli_epi32(InReg, 31 - 23);
3463     InReg = _mm_and_si128(_mm_load_si128(++in), mask31);
3464 
3465     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
3466     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3467 					       OutReg);
3468 
3469     OutReg = _mm_srli_epi32(InReg, 31 - 22);
3470     InReg = _mm_and_si128(_mm_load_si128(++in), mask31);
3471 
3472     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
3473     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3474 					       OutReg);
3475 
3476     OutReg = _mm_srli_epi32(InReg, 31 - 21);
3477     InReg = _mm_and_si128(_mm_load_si128(++in), mask31);
3478 
3479     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
3480     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3481 					       OutReg);
3482 
3483     OutReg = _mm_srli_epi32(InReg, 31 - 20);
3484     InReg = _mm_and_si128(_mm_load_si128(++in), mask31);
3485 
3486     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
3487     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3488 					       OutReg);
3489 
3490     OutReg = _mm_srli_epi32(InReg, 31 - 19);
3491     InReg = _mm_and_si128(_mm_load_si128(++in), mask31);
3492 
3493     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
3494     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3495 					       OutReg);
3496 
3497     OutReg = _mm_srli_epi32(InReg, 31 - 18);
3498     InReg = _mm_and_si128(_mm_load_si128(++in), mask31);
3499 
3500     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
3501     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3502 					       OutReg);
3503 
3504     OutReg = _mm_srli_epi32(InReg, 31 - 17);
3505     InReg = _mm_and_si128(_mm_load_si128(++in), mask31);
3506 
3507     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
3508     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3509 					       OutReg);
3510 
3511 
3512     OutReg = _mm_srli_epi32(InReg, 31 - 16);
3513     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3514 					       OutReg);
3515 
3516     return strm_buffer_i;
3517 }
3518 #endif
3519 
3520 
3521 #ifdef HAVE_SSE2
3522 /* nwritten = 16 * 4 = 64 unsigned ints */
3523 static int
write_32_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)3524 write_32_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
3525     const __m128i *in = (const __m128i *) _in;
3526     __m128i OutReg;
3527 
3528     __m128i InReg = _mm_load_si128(in);
3529     OutReg = InReg;
3530     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3531 					       OutReg);
3532 
3533     InReg = _mm_load_si128(++in);
3534 
3535     OutReg = InReg;
3536     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3537 					       OutReg);
3538 
3539     InReg = _mm_load_si128(++in);
3540 
3541     OutReg = InReg;
3542     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3543 					       OutReg);
3544 
3545     InReg = _mm_load_si128(++in);
3546 
3547     OutReg = InReg;
3548     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3549 					       OutReg);
3550 
3551     InReg = _mm_load_si128(++in);
3552 
3553     OutReg = InReg;
3554     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3555 					       OutReg);
3556 
3557     InReg = _mm_load_si128(++in);
3558 
3559     OutReg = InReg;
3560     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3561 					       OutReg);
3562 
3563     InReg = _mm_load_si128(++in);
3564 
3565     OutReg = InReg;
3566     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3567 					       OutReg);
3568 
3569     InReg = _mm_load_si128(++in);
3570 
3571     OutReg = InReg;
3572     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3573 					       OutReg);
3574 
3575     InReg = _mm_load_si128(++in);
3576 
3577     OutReg = InReg;
3578     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3579 					       OutReg);
3580 
3581     InReg = _mm_load_si128(++in);
3582 
3583     OutReg = InReg;
3584     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3585 					       OutReg);
3586 
3587     InReg = _mm_load_si128(++in);
3588 
3589     OutReg = InReg;
3590     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3591 					       OutReg);
3592 
3593     InReg = _mm_load_si128(++in);
3594 
3595     OutReg = InReg;
3596     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3597 					       OutReg);
3598 
3599     InReg = _mm_load_si128(++in);
3600 
3601     OutReg = InReg;
3602     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3603 					       OutReg);
3604 
3605     InReg = _mm_load_si128(++in);
3606 
3607     OutReg = InReg;
3608     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3609 					       OutReg);
3610 
3611     InReg = _mm_load_si128(++in);
3612 
3613     OutReg = InReg;
3614     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3615 					       OutReg);
3616 
3617     InReg = _mm_load_si128(++in);
3618 
3619     OutReg = InReg;
3620     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3621 					       OutReg);
3622 
3623     return strm_buffer_i;
3624 }
3625 #endif
3626 
3627 
3628 static int
pack_32_horiz(UINT4 * out,const UINT4 * in)3629 pack_32_horiz (UINT4 *out, const UINT4 *in) {
3630   int column;
3631 
3632   for (column = 0; column < 4; column++) {
3633     *out = *in;
3634     ++out;
3635     ++in;
3636     *out = *in;
3637     ++out;
3638     ++in;
3639     *out = *in;
3640     ++out;
3641     ++in;
3642     *out = *in;
3643     ++out;
3644     ++in;
3645     *out = *in;
3646     ++out;
3647     ++in;
3648     *out = *in;
3649     ++out;
3650     ++in;
3651     *out = *in;
3652     ++out;
3653     ++in;
3654     *out = *in;
3655     ++out;
3656     ++in;
3657     *out = *in;
3658     ++out;
3659     ++in;
3660     *out = *in;
3661     ++out;
3662     ++in;
3663     *out = *in;
3664     ++out;
3665     ++in;
3666     *out = *in;
3667     ++out;
3668     ++in;
3669     *out = *in;
3670     ++out;
3671     ++in;
3672     *out = *in;
3673     ++out;
3674     ++in;
3675     *out = *in;
3676     ++out;
3677     ++in;
3678     *out = *in;
3679     ++out;
3680     ++in;
3681   }
3682 
3683   return 64;
3684 }
3685 
3686 
3687 
3688 #ifdef HAVE_SSE2
3689 /* nwritten = 2 * 4 = 8 unsigned ints */
3690 static int
write_04_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)3691 write_04_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
3692     const __m128i *in = (const __m128i *) _in;
3693     __m128i OutReg;
3694 
3695     __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask4);
3696     OutReg = InReg;
3697     InReg = _mm_and_si128(_mm_load_si128(++in), mask4);
3698 
3699     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
3700     InReg = _mm_and_si128(_mm_load_si128(++in), mask4);
3701 
3702     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
3703     InReg = _mm_and_si128(_mm_load_si128(++in), mask4);
3704 
3705     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
3706     InReg = _mm_and_si128(_mm_load_si128(++in), mask4);
3707 
3708     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
3709     InReg = _mm_and_si128(_mm_load_si128(++in), mask4);
3710 
3711     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
3712     InReg = _mm_and_si128(_mm_load_si128(++in), mask4);
3713 
3714     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
3715     InReg = _mm_and_si128(_mm_load_si128(++in), mask4);
3716 
3717     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
3718     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3719 					       OutReg);
3720 
3721     InReg = _mm_and_si128(_mm_load_si128(++in), mask4);
3722 
3723     OutReg = InReg;
3724     InReg = _mm_and_si128(_mm_load_si128(++in), mask4);
3725 
3726     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
3727     InReg = _mm_and_si128(_mm_load_si128(++in), mask4);
3728 
3729     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
3730     InReg = _mm_and_si128(_mm_load_si128(++in), mask4);
3731 
3732     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
3733     InReg = _mm_and_si128(_mm_load_si128(++in), mask4);
3734 
3735     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
3736     InReg = _mm_and_si128(_mm_load_si128(++in), mask4);
3737 
3738     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
3739     InReg = _mm_and_si128(_mm_load_si128(++in), mask4);
3740 
3741     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
3742     InReg = _mm_and_si128(_mm_load_si128(++in), mask4);
3743 
3744     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
3745     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3746 					       OutReg);
3747 
3748     return strm_buffer_i;
3749 }
3750 #endif
3751 
3752 
3753 static int
pack_04_horiz(UINT4 * out,const UINT4 * in)3754 pack_04_horiz (UINT4 *out, const UINT4 *in) {
3755   int column;
3756 
3757   for (column = 0; column < 4; column++) {
3758     *out |= (*in)   % (1U << 4 ) ;
3759     ++in;
3760     *out |= ( (*in)   % (1U << 4 )  ) <<  4 ;
3761     ++in;
3762     *out |= ( (*in)   % (1U << 4 )  ) <<  8 ;
3763     ++in;
3764     *out |= ( (*in)   % (1U << 4 )  ) <<  12 ;
3765     ++in;
3766     *out |= ( (*in)   % (1U << 4 )  ) <<  16 ;
3767     ++in;
3768     *out |= ( (*in)   % (1U << 4 )  ) <<  20 ;
3769     ++in;
3770     *out |= ( (*in)   % (1U << 4 )  ) <<  24 ;
3771     ++in;
3772     *out |= ( (*in)   % (1U << 4 )  ) <<  28 ;
3773     ++out;
3774     ++in;
3775     *out |= (*in)   % (1U << 4 ) ;
3776     ++in;
3777     *out |= ( (*in)   % (1U << 4 )  ) <<  4 ;
3778     ++in;
3779     *out |= ( (*in)   % (1U << 4 )  ) <<  8 ;
3780     ++in;
3781     *out |= ( (*in)   % (1U << 4 )  ) <<  12 ;
3782     ++in;
3783     *out |= ( (*in)   % (1U << 4 )  ) <<  16 ;
3784     ++in;
3785     *out |= ( (*in)   % (1U << 4 )  ) <<  20 ;
3786     ++in;
3787     *out |= ( (*in)   % (1U << 4 )  ) <<  24 ;
3788     ++in;
3789     *out |= ( (*in)   % (1U << 4 )  ) <<  28 ;
3790     ++out;
3791     ++in;
3792   }
3793 
3794   return 8;
3795 }
3796 
3797 
3798 #ifdef HAVE_SSE2
3799 /* nwritten = 4 * 4 = 16 unsigned ints */
3800 static int
write_08_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)3801 write_08_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
3802     const __m128i *in = (const __m128i *) _in;
3803     __m128i OutReg;
3804 
3805     __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask8);
3806     OutReg = InReg;
3807     InReg = _mm_and_si128(_mm_load_si128(++in), mask8);
3808 
3809     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
3810     InReg = _mm_and_si128(_mm_load_si128(++in), mask8);
3811 
3812     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
3813     InReg = _mm_and_si128(_mm_load_si128(++in), mask8);
3814 
3815     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
3816     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3817 					       OutReg);
3818 
3819     InReg = _mm_and_si128(_mm_load_si128(++in), mask8);
3820 
3821     OutReg = InReg;
3822     InReg = _mm_and_si128(_mm_load_si128(++in), mask8);
3823 
3824     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
3825     InReg = _mm_and_si128(_mm_load_si128(++in), mask8);
3826 
3827     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
3828     InReg = _mm_and_si128(_mm_load_si128(++in), mask8);
3829 
3830     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
3831     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3832 					       OutReg);
3833 
3834     InReg = _mm_and_si128(_mm_load_si128(++in), mask8);
3835 
3836     OutReg = InReg;
3837     InReg = _mm_and_si128(_mm_load_si128(++in), mask8);
3838 
3839     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
3840     InReg = _mm_and_si128(_mm_load_si128(++in), mask8);
3841 
3842     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
3843     InReg = _mm_and_si128(_mm_load_si128(++in), mask8);
3844 
3845     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
3846     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3847 					       OutReg);
3848 
3849     InReg = _mm_and_si128(_mm_load_si128(++in), mask8);
3850 
3851     OutReg = InReg;
3852     InReg = _mm_and_si128(_mm_load_si128(++in), mask8);
3853 
3854     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
3855     InReg = _mm_and_si128(_mm_load_si128(++in), mask8);
3856 
3857     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
3858     InReg = _mm_and_si128(_mm_load_si128(++in), mask8);
3859 
3860     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
3861     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3862 					       OutReg);
3863 
3864     return strm_buffer_i;
3865 }
3866 #endif
3867 
3868 
3869 static int
pack_08_horiz(UINT4 * out,const UINT4 * in)3870 pack_08_horiz (UINT4 *out, const UINT4 *in) {
3871   int column;
3872 
3873   for (column = 0; column < 4; column++) {
3874     *out |= (*in)   % (1U << 8 ) ;
3875     ++in;
3876     *out |= ( (*in)   % (1U << 8 )  ) <<  8 ;
3877     ++in;
3878     *out |= ( (*in)   % (1U << 8 )  ) <<  16 ;
3879     ++in;
3880     *out |= ( (*in)   % (1U << 8 )  ) <<  24 ;
3881     ++out;
3882     ++in;
3883     *out |= (*in)   % (1U << 8 ) ;
3884     ++in;
3885     *out |= ( (*in)   % (1U << 8 )  ) <<  8 ;
3886     ++in;
3887     *out |= ( (*in)   % (1U << 8 )  ) <<  16 ;
3888     ++in;
3889     *out |= ( (*in)   % (1U << 8 )  ) <<  24 ;
3890     ++out;
3891     ++in;
3892     *out |= (*in)   % (1U << 8 ) ;
3893     ++in;
3894     *out |= ( (*in)   % (1U << 8 )  ) <<  8 ;
3895     ++in;
3896     *out |= ( (*in)   % (1U << 8 )  ) <<  16 ;
3897     ++in;
3898     *out |= ( (*in)   % (1U << 8 )  ) <<  24 ;
3899     ++out;
3900     ++in;
3901     *out |= (*in)   % (1U << 8 ) ;
3902     ++in;
3903     *out |= ( (*in)   % (1U << 8 )  ) <<  8 ;
3904     ++in;
3905     *out |= ( (*in)   % (1U << 8 )  ) <<  16 ;
3906     ++in;
3907     *out |= ( (*in)   % (1U << 8 )  ) <<  24 ;
3908     ++out;
3909     ++in;
3910   }
3911 
3912   return 16;
3913 }
3914 
3915 
3916 #ifdef HAVE_SSE2
3917 /* nwritten = 8 * 4 = 32 unsigned ints */
3918 static int
write_16_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in)3919 write_16_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i, const UINT4 *_in) {
3920     const __m128i *in = (const __m128i *) _in;
3921     __m128i OutReg;
3922 
3923     __m128i InReg = _mm_and_si128(_mm_load_si128(in), mask16);
3924 
3925     OutReg = InReg;
3926     InReg = _mm_and_si128(_mm_load_si128(++in), mask16);
3927 
3928     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
3929     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3930 					       OutReg);
3931 
3932     InReg = _mm_and_si128(_mm_load_si128(++in), mask16);
3933 
3934     OutReg = InReg;
3935     InReg = _mm_and_si128(_mm_load_si128(++in), mask16);
3936 
3937     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
3938     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3939 					       OutReg);
3940 
3941     InReg = _mm_and_si128(_mm_load_si128(++in), mask16);
3942 
3943     OutReg = InReg;
3944     InReg = _mm_and_si128(_mm_load_si128(++in), mask16);
3945 
3946     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
3947     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3948 					       OutReg);
3949 
3950     InReg = _mm_and_si128(_mm_load_si128(++in), mask16);
3951 
3952     OutReg = InReg;
3953     InReg = _mm_and_si128(_mm_load_si128(++in), mask16);
3954 
3955     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
3956     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3957 					       OutReg);
3958 
3959     InReg = _mm_and_si128(_mm_load_si128(++in), mask16);
3960 
3961     OutReg = InReg;
3962     InReg = _mm_and_si128(_mm_load_si128(++in), mask16);
3963 
3964     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
3965     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3966 					       OutReg);
3967 
3968     InReg = _mm_and_si128(_mm_load_si128(++in), mask16);
3969 
3970     OutReg = InReg;
3971     InReg = _mm_and_si128(_mm_load_si128(++in), mask16);
3972 
3973     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
3974     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3975 					       OutReg);
3976 
3977     InReg = _mm_and_si128(_mm_load_si128(++in), mask16);
3978 
3979     OutReg = InReg;
3980     InReg = _mm_and_si128(_mm_load_si128(++in), mask16);
3981 
3982     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
3983     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3984 					       OutReg);
3985 
3986     InReg = _mm_and_si128(_mm_load_si128(++in), mask16);
3987 
3988     OutReg = InReg;
3989     InReg = _mm_and_si128(_mm_load_si128(++in), mask16);
3990 
3991     OutReg =  _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
3992     strm_buffer_i = write_reg_buffered_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,
3993 					       OutReg);
3994 
3995     return strm_buffer_i;
3996 }
3997 #endif
3998 
3999 
/* Packs 64 input values at 16 bits each into 32 output words.  Only
   the low 16 bits of each input are kept: in[2*w] goes into bits
   0..15 of out[w] and in[2*w+1] into bits 16..31.  The bits are OR'ed
   into out, so the caller must supply out[0..31] already cleared.
   Returns the number of output words used (32). */
static int
pack_16_horiz (UINT4 *out, const UINT4 *in) {
  const UINT4 low16 = (1U << 16) - 1;
  int word;

  for (word = 0; word < 32; word++) {
    out[word] |= in[2*word] & low16;
    out[word] |= (in[2*word + 1] & low16) << 16;
  }

  return 32;
}
4049 
4050 
4051 /* Vertical format requires all values in a block to be decoded */
4052 #ifdef HAVE_SSE2
4053 static int
write_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in,int packsize)4054 write_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i,
4055 	    const UINT4 *_in, int packsize) {
4056 
4057 #if 0
4058   int i;
4059 
4060   printf("Entering with packsize %d\n",packsize);
4061   for (i = 0; i < BLOCKSIZE; i++) {
4062     printf("%d ",_in[i]);
4063   }
4064   printf("\n");
4065 #endif
4066 
4067   switch (packsize) {
4068 #ifdef ALLOW_ODD_PACKSIZES
4069   case 0: return strm_buffer_i;
4070   case 1: return write_01_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4071   case 2: return write_02_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4072   case 3: return write_03_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4073   case 4: return write_04_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4074   case 5: return write_05_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4075   case 6: return write_06_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4076   case 7: return write_07_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4077   case 8: return write_08_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4078   case 9: return write_09_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4079   case 10: return write_10_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4080   case 11: return write_11_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4081   case 12: return write_12_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4082   case 13: return write_13_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4083   case 14: return write_14_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4084   case 15: return write_15_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4085   case 16: return write_16_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4086   case 17: return write_17_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4087   case 18: return write_18_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4088   case 19: return write_19_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4089   case 20: return write_20_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4090   case 21: return write_21_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4091   case 22: return write_22_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4092   case 23: return write_23_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4093   case 24: return write_24_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4094   case 25: return write_25_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4095   case 26: return write_26_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4096   case 27: return write_27_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4097   case 28: return write_28_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4098   case 29: return write_29_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4099   case 30: return write_30_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4100   case 31: return write_31_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4101   case 32: return write_32_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4102 #else
4103   case 0: return strm_buffer_i;
4104   case 2: return write_02_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4105   case 4: return write_04_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4106   case 6: return write_06_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4107   case 8: return write_08_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4108   case 10: return write_10_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4109   case 12: return write_12_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4110   case 14: return write_14_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4111   case 16: return write_16_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4112   case 18: return write_18_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4113   case 20: return write_20_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4114   case 22: return write_22_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4115   case 24: return write_24_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4116   case 26: return write_26_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4117   case 28: return write_28_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4118   case 30: return write_30_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4119   case 32: return write_32_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,_in);
4120 #endif
4121   default: fprintf(stderr,"packsize of %d not allowed\n",packsize); abort();
4122   }
4123 }
4124 
4125 #else
4126 
/* Copies the BLOCKSIZE entries of horizontal into vertical so that
   entries whose index has the same value modulo 4 become contiguous:
   vertical[column*(BLOCKSIZE/4) + row] = horizontal[4*row + column],
   for column 0..3 and row 0..BLOCKSIZE/4-1. */
static void
reorder_values_vertically (Positionsptr_T *vertical, const Positionsptr_T *horizontal) {
  const int nrows = BLOCKSIZE/4;
  int dest;

  for (dest = 0; dest < 4*nrows; dest++) {
    /* dest / nrows is the column (0..3); dest % nrows is the row within it */
    vertical[dest] = horizontal[4*(dest % nrows) + dest / nrows];
  }

  return;
}
4163 
4164 /* Non-SIMD code cannot write vertical format easily, so using
4165    horizontal code and conversions */
4166 static int
write_vert(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * horizontal,int packsize)4167 write_vert (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i,
4168 	    const UINT4 *horizontal, int packsize) {
4169   int nwritten;
4170   UINT4 buffer[BLOCKSIZE], vertical[BLOCKSIZE];
4171 
4172 #if 0
4173   int i;
4174 
4175   printf("Entering with packsize %d\n",packsize);
4176   for (i = 0; i < BLOCKSIZE; i++) {
4177     printf("%d ",_in[i]);
4178   }
4179   printf("\n");
4180 #endif
4181 
4182   reorder_values_vertically(vertical,horizontal);
4183   memset((void *) buffer,0,BLOCKSIZE*sizeof(UINT4));
4184 
4185   switch (packsize) {
4186   case 0: return strm_buffer_i;
4187   case 2: nwritten = pack_02_horiz(buffer,&(vertical[0])); break;
4188   case 4: nwritten = pack_04_horiz(buffer,&(vertical[0])); break;
4189   case 6: nwritten = pack_06_horiz(buffer,&(vertical[0])); break;
4190   case 8: nwritten = pack_08_horiz(buffer,&(vertical[0])); break;
4191   case 10: nwritten = pack_10_horiz(buffer,&(vertical[0])); break;
4192   case 12: nwritten = pack_12_horiz(buffer,&(vertical[0])); break;
4193   case 14: nwritten = pack_14_horiz(buffer,&(vertical[0])); break;
4194   case 16: nwritten = pack_16_horiz(buffer,&(vertical[0])); break;
4195   case 18: nwritten = pack_18_horiz(buffer,&(vertical[0])); break;
4196   case 20: nwritten = pack_20_horiz(buffer,&(vertical[0])); break;
4197   case 22: nwritten = pack_22_horiz(buffer,&(vertical[0])); break;
4198   case 24: nwritten = pack_24_horiz(buffer,&(vertical[0])); break;
4199   case 26: nwritten = pack_26_horiz(buffer,&(vertical[0])); break;
4200   case 28: nwritten = pack_28_horiz(buffer,&(vertical[0])); break;
4201   case 30: nwritten = pack_30_horiz(buffer,&(vertical[0])); break;
4202   case 32: nwritten = pack_32_horiz(buffer,&(vertical[0])); break;
4203   default: fprintf(stderr,"packsize of %d not allowed\n",packsize); abort();
4204   }
4205 
4206   return write_reg_buffered_vert(strm_fp,strm_buffer,
4207 				 strm_buffer_size,strm_buffer_i,
4208 				 buffer,nwritten);
4209 }
4210 #endif
4211 
4212 
/* Columnar order allows just the necessary values in a block to be decoded */
/* Rearranges a block of 64 entries.  Each half of 32 is handled the
   same way: the four interleaved lanes of the input (index modulo 4)
   are laid out one after another, 8 entries per lane, so that
     columnar[half + 8*lane + step] = vertical[half + lane + 4*step]
   for half in {0, 32}, lane 0..3 and step 0..7.  In terms of the
   remainder (position within the block) being decoded, the first half
   entry holds remainder lane + 4*step + 1 and the second half entry
   holds remainder 63 - lane - 4*step. */
static void
columnar_order (UINT4 *columnar, const UINT4 *vertical) {
  UINT4 *dest = columnar;
  int half, lane, step;

  for (half = 0; half < 64; half += 32) {
    for (lane = 0; lane < 4; lane++) {
      for (step = 0; step < 8; step++) {
	*dest++ = vertical[half + lane + 4*step];
      }
    }
  }

  return;
}
4291 
4292 
4293 #ifdef HAVE_SSE2
4294 
4295 int
Bitpack64_write_columnar(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * _in,int packsize)4296 Bitpack64_write_columnar (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i,
4297 			  const UINT4 *_in, int packsize) {
4298   UINT4 columnar[BLOCKSIZE];
4299 
4300 #if 0
4301   int i;
4302 
4303   printf("Entering with packsize %d\n",packsize);
4304   for (i = 0; i < BLOCKSIZE; i++) {
4305     printf("%d ",_in[i]);
4306   }
4307   printf("\n");
4308 #endif
4309 
4310   columnar_order(columnar,_in);
4311 
4312   switch (packsize) {
4313   case 0: return strm_buffer_i;
4314   case 2: return write_02_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,columnar);
4315   case 4: return write_04_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,columnar);
4316   case 6: return write_06_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,columnar);
4317   case 8: return write_08_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,columnar);
4318   case 10: return write_10_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,columnar);
4319   case 12: return write_12_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,columnar);
4320   case 14: return write_14_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,columnar);
4321   case 16: return write_16_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,columnar);
4322   case 18: return write_18_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,columnar);
4323   case 20: return write_20_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,columnar);
4324   case 22: return write_22_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,columnar);
4325   case 24: return write_24_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,columnar);
4326   case 26: return write_26_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,columnar);
4327   case 28: return write_28_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,columnar);
4328   case 30: return write_30_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,columnar);
4329   case 32: return write_32_vert(strm_fp,strm_buffer,strm_buffer_size,strm_buffer_i,columnar);
4330 
4331   default: fprintf(stderr,"packsize of %d not allowed\n",packsize); abort();
4332   }
4333 }
4334 
4335 #else
4336 
4337 /* Non-SIMD code cannot write vertical format easily, so using
4338    horizontal code and conversions */
4339 
4340 int
Bitpack64_write_columnar(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * horizontal,int packsize)4341 Bitpack64_write_columnar (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i,
4342 			  const UINT4 *horizontal, int packsize) {
4343   int nwritten;
4344   UINT4 buffer[BLOCKSIZE], vertical[BLOCKSIZE];
4345   UINT4 columnar[BLOCKSIZE];
4346 
4347 #if 0
4348   int i;
4349 
4350   printf("Entering with packsize %d\n",packsize);
4351   for (i = 0; i < BLOCKSIZE; i++) {
4352     printf("%d ",horizontal[i]);
4353   }
4354   printf("\n");
4355 #endif
4356 
4357   columnar_order(columnar,horizontal);
4358   reorder_values_vertically(vertical,columnar);
4359   memset((void *) buffer,0,BLOCKSIZE*sizeof(UINT4));
4360 
4361   switch (packsize) {
4362   case 0: return strm_buffer_i;
4363   case 2: nwritten = pack_02_horiz(buffer,&(vertical[0])); break;
4364   case 4: nwritten = pack_04_horiz(buffer,&(vertical[0])); break;
4365   case 6: nwritten = pack_06_horiz(buffer,&(vertical[0])); break;
4366   case 8: nwritten = pack_08_horiz(buffer,&(vertical[0])); break;
4367   case 10: nwritten = pack_10_horiz(buffer,&(vertical[0])); break;
4368   case 12: nwritten = pack_12_horiz(buffer,&(vertical[0])); break;
4369   case 14: nwritten = pack_14_horiz(buffer,&(vertical[0])); break;
4370   case 16: nwritten = pack_16_horiz(buffer,&(vertical[0])); break;
4371   case 18: nwritten = pack_18_horiz(buffer,&(vertical[0])); break;
4372   case 20: nwritten = pack_20_horiz(buffer,&(vertical[0])); break;
4373   case 22: nwritten = pack_22_horiz(buffer,&(vertical[0])); break;
4374   case 24: nwritten = pack_24_horiz(buffer,&(vertical[0])); break;
4375   case 26: nwritten = pack_26_horiz(buffer,&(vertical[0])); break;
4376   case 28: nwritten = pack_28_horiz(buffer,&(vertical[0])); break;
4377   case 30: nwritten = pack_30_horiz(buffer,&(vertical[0])); break;
4378   case 32: nwritten = pack_32_horiz(buffer,&(vertical[0])); break;
4379   default: fprintf(stderr,"packsize of %d not allowed\n",packsize); abort();
4380   }
4381 
4382   return write_reg_buffered_vert(strm_fp,strm_buffer,
4383 				 strm_buffer_size,strm_buffer_i,
4384 				 buffer,nwritten);
4385 }
4386 
4387 #endif
4388 
4389 
4390 
4391 /* Horizontal format is slightly more complicated for random access of individual values */
4392 int
Bitpack64_write_horiz(FILE * strm_fp,Positionsptr_T * strm_buffer,int strm_buffer_size,int strm_buffer_i,const UINT4 * horizontal,int packsize)4393 Bitpack64_write_horiz (FILE *strm_fp, Positionsptr_T *strm_buffer, int strm_buffer_size, int strm_buffer_i,
4394 		       const UINT4 *horizontal, int packsize) {
4395   int nwritten;
4396   UINT4 buffer[BLOCKSIZE];
4397 
4398   write_setup();
4399 
4400 #if 0
4401   int i;
4402 
4403   printf("Entering with packsize %d\n",packsize);
4404   for (i = 0; i < BLOCKSIZE; i++) {
4405     printf("%d ",_in[i]);
4406   }
4407   printf("\n");
4408 #endif
4409 
4410   memset((void *) buffer,0,BLOCKSIZE*sizeof(UINT4));
4411 
4412   switch (packsize) {
4413   case 0: return strm_buffer_i;
4414   case 2: nwritten = pack_02_horiz(buffer,&(horizontal[0])); break;
4415   case 4: nwritten = pack_04_horiz(buffer,&(horizontal[0])); break;
4416   case 6: nwritten = pack_06_horiz(buffer,&(horizontal[0])); break;
4417   case 8: nwritten = pack_08_horiz(buffer,&(horizontal[0])); break;
4418   case 10: nwritten = pack_10_horiz(buffer,&(horizontal[0])); break;
4419   case 12: nwritten = pack_12_horiz(buffer,&(horizontal[0])); break;
4420   case 14: nwritten = pack_14_horiz(buffer,&(horizontal[0])); break;
4421   case 16: nwritten = pack_16_horiz(buffer,&(horizontal[0])); break;
4422   case 18: nwritten = pack_18_horiz(buffer,&(horizontal[0])); break;
4423   case 20: nwritten = pack_20_horiz(buffer,&(horizontal[0])); break;
4424   case 22: nwritten = pack_22_horiz(buffer,&(horizontal[0])); break;
4425   case 24: nwritten = pack_24_horiz(buffer,&(horizontal[0])); break;
4426   case 26: nwritten = pack_26_horiz(buffer,&(horizontal[0])); break;
4427   case 28: nwritten = pack_28_horiz(buffer,&(horizontal[0])); break;
4428   case 30: nwritten = pack_30_horiz(buffer,&(horizontal[0])); break;
4429   case 32: nwritten = pack_32_horiz(buffer,&(horizontal[0])); break;
4430   default: fprintf(stderr,"packsize of %d not allowed\n",packsize); abort();
4431   }
4432 
4433   return write_reg_buffered_horiz(strm_fp,strm_buffer,
4434 				  strm_buffer_size,strm_buffer_i,
4435 				  buffer,nwritten);
4436 }
4437 
4438 
4439 
4440 /* Processes 64 values at a time.  Returns packsize. */
4441 /* Handles first 32 values from the initial value, and the last 32
4442    values from the final value.  More efficient since we need to
4443    process only half as many inputs. */
4444 int
Bitpack64_compute_q4_diffs_bidir(UINT4 * diffs,UINT4 * values)4445 Bitpack64_compute_q4_diffs_bidir (UINT4 *diffs, UINT4 *values) {
4446   UINT4 packsize;
4447   int i;
4448   UINT4 maxdiff = 0;
4449   int firstbit;
4450 #ifdef HAVE_BUILTIN_CLZ
4451 #elif defined(HAVE_ASM_BSR)
4452   int msb;
4453 #endif
4454 
4455 #if 0
4456   for (i = 0; i < 64; i++) {
4457     assert(values[i+1] >= values[i]);
4458   }
4459 #endif
4460 
4461   maxdiff |= (diffs[32] = values[64] - values[63]);
4462   maxdiff |= (diffs[33] = values[64] - values[62]);
4463   maxdiff |= (diffs[34] = values[64] - values[61]);
4464   maxdiff |= (diffs[35] = values[64] - values[60]);
4465   for (i = 36; i < 64; i++) {
4466     maxdiff |= (diffs[i] = values[64+32-(i+1-4)] - values[64+32-(i+1)]);
4467   }
4468   for (i = 31; i >= 4; i--) {
4469     maxdiff |= (diffs[i] = values[i+1] - values[i+1-4]);
4470   }
4471   maxdiff |= (diffs[3] = values[4] - values[0]);
4472   maxdiff |= (diffs[2] = values[3] - values[0]);
4473   maxdiff |= (diffs[1] = values[2] - values[0]);
4474   maxdiff |= (diffs[0] = values[1] - values[0]);
4475 
4476   if (maxdiff == 0) {
4477     /* __builtin_clz() behaves oddly on zero */
4478     return 0;
4479 
4480   } else {
4481 #ifdef HAVE_BUILTIN_CLZ
4482     firstbit = __builtin_clz(maxdiff);
4483     packsize = 32 - firstbit;
4484 #elif defined(HAVE_ASM_BSR)
4485     asm("bsr %1,%0" : "=r"(msb) : "r"(maxdiff));
4486     packsize = msb + 1;
4487 #else
4488     firstbit = ((maxdiff >> 16) ? clz_table[maxdiff >> 16] : 16 + clz_table[maxdiff]);
4489     packsize = 32 - firstbit;
4490 #endif
4491 
4492 #ifdef ALLOW_ODD_PACKSIZES
4493     return packsize;
4494 #else
4495     return (packsize + 1) & ~1;	/* Converts packsizes to the next multiple of 2 */
4496 #endif
4497   }
4498 }
4499 
4500 
4501 #ifdef HAVE_64_BIT
4502 static int
Bitpack64_compute_q4_diffs_bidir_huge(UINT4 * diffs,UINT8 * values)4503 Bitpack64_compute_q4_diffs_bidir_huge (UINT4 *diffs, UINT8 *values) {
4504   UINT4 packsize;
4505   int i;
4506   UINT4 maxdiff = 0;
4507   int firstbit;
4508 #ifdef HAVE_BUILTIN_CLZ
4509 #elif defined(HAVE_ASM_BSR)
4510   int msb;
4511 #endif
4512 
4513 #if 0
4514   for (i = 0; i < 64; i++) {
4515     assert(values[i+1] >= values[i]);
4516   }
4517 #endif
4518 
4519   maxdiff |= (diffs[32] = (UINT4) (values[64] - values[63]));
4520   maxdiff |= (diffs[33] = (UINT4) (values[64] - values[62]));
4521   maxdiff |= (diffs[34] = (UINT4) (values[64] - values[61]));
4522   maxdiff |= (diffs[35] = (UINT4) (values[64] - values[60]));
4523   for (i = 36; i < 64; i++) {
4524     maxdiff |= (diffs[i] = (UINT4) (values[64+32-(i+1-4)] - values[64+32-(i+1)]));
4525   }
4526   for (i = 31; i >= 4; i--) {
4527     maxdiff |= (diffs[i] = (UINT4) (values[i+1] - values[i+1-4]));
4528   }
4529   maxdiff |= (diffs[3] = (UINT4) (values[4] - values[0]));
4530   maxdiff |= (diffs[2] = (UINT4) (values[3] - values[0]));
4531   maxdiff |= (diffs[1] = (UINT4) (values[2] - values[0]));
4532   maxdiff |= (diffs[0] = (UINT4) (values[1] - values[0]));
4533 
4534   if (maxdiff == 0) {
4535     /* __builtin_clz() behaves oddly on zero */
4536     return 0;
4537 
4538   } else {
4539 #ifdef HAVE_BUILTIN_CLZ
4540     firstbit = __builtin_clz(maxdiff);
4541     packsize = 32 - firstbit;
4542 #elif defined(HAVE_ASM_BSR)
4543     asm("bsr %1,%0" : "=r"(msb) : "r"(maxdiff));
4544     packsize = msb + 1;
4545 #else
4546     firstbit = ((maxdiff >> 16) ? clz_table[maxdiff >> 16] : 16 + clz_table[maxdiff]);
4547     packsize = 32 - firstbit;
4548 #endif
4549 
4550 #ifdef ALLOW_ODD_PACKSIZES
4551     return packsize;
4552 #else
4553     return (packsize + 1) & ~1;	/* Converts packsizes to the next multiple of 2 */
4554 #endif
4555   }
4556 }
4557 #endif
4558 
4559 
#if 0
/* Disabled: this function is compiled out by the enclosing #if 0.
   Reads values[0..64] and stores the 64 successive differences
   diffs[i] = values[i+1] - values[i].  Returns the number of bits
   needed for the OR of all diffs (0 if every diff is 0), rounded up
   to an even number unless ALLOW_ODD_PACKSIZES is defined. */
static int
compute_q1_diffs (UINT4 *diffs, UINT4 *values) {
  UINT4 packsize;
  int i;
  UINT4 maxdiff = 0;		/* OR of all diffs, not a true maximum */
  int firstbit;
#ifdef HAVE_BUILTIN_CLZ
#elif defined(HAVE_ASM_BSR)
  int msb;
#endif

#if 0
  for (i = 0; i < 64; i++) {
    assert(values[i+1] >= values[i]);
  }
#endif

  for (i = 63; i >= 0; i--) {
    maxdiff |= (diffs[i] = values[i+1] - values[i]);
  }

  if (maxdiff == 0) {
    /* __builtin_clz() behaves oddly on zero */
    return 0;

  } else {
    /* Three alternative ways of locating the highest set bit of maxdiff */
#ifdef HAVE_BUILTIN_CLZ
    firstbit = __builtin_clz(maxdiff);
    packsize = 32 - firstbit;
#elif defined(HAVE_ASM_BSR)
    asm("bsr %1,%0" : "=r"(msb) : "r"(maxdiff));
    packsize = msb + 1;
#else
    /* Table lookup on the upper 16 bits if any are set, else on the lower 16 */
    firstbit = ((maxdiff >> 16) ? clz_table[maxdiff >> 16] : 16 + clz_table[maxdiff]);
    packsize = 32 - firstbit;
#endif

#ifdef ALLOW_ODD_PACKSIZES
    return packsize;
#else
    return (packsize + 1) & ~1;	/* Converts packsizes to the next multiple of 2 */
#endif
  }
}
#endif
4606 
4607 
4608 /* Used by trindex and indexdb_cat programs */
4609 /* We want to store values 0..n, with final value at ascending[n]
4610    possibly stored as the final metainfo value */
4611 /* Stored in columnar order */
4612 void
Bitpack64_write_differential(char * ptrsfile,char * compfile,UINT4 * ascending,Oligospace_T n)4613 Bitpack64_write_differential (char *ptrsfile, char *compfile, UINT4 *ascending, Oligospace_T n) {
4614   FILE *ptrs_fp, *comp_fp;
4615   UINT4 *ptrs, *p;
4616   size_t nptrs;
4617   int i;
4618   Oligospace_T positioni;
4619 
4620   /* Buffer is used to avoid frequent writes to the file */
4621   UINT4 *buffer;
4622   int buffer_size = BUFFER_SIZE;
4623   int buffer_i;
4624 
4625   UINT4 diffs[BLOCKSIZE], last_block[BLOCKSIZE+1];
4626 
4627   UINT4 nwritten;
4628   int packsize;
4629 
4630 
4631   write_setup();
4632 
4633   /* printf("Entered Bitpack64_write_differential with n %llu\n",n); */
4634 
4635   /* 2 metavalues: nwritten (pointer) and cumulative sum for block.
4636      Packsize can be computed from difference between successive
4637      pointers, if only even packsizes are allowed */
4638   p = ptrs = (UINT4 *) CALLOC(((n + BLOCKSIZE)/BLOCKSIZE + 1) * DIFFERENTIAL_METAINFO_SIZE,sizeof(UINT4));
4639 
4640   if ((comp_fp = FOPEN_WRITE_BINARY(compfile)) == NULL) {
4641     fprintf(stderr,"Can't write to compfile %s: %s\n",compfile,strerror(errno));
4642     abort();
4643   }
4644   buffer = (UINT4 *) CALLOC(buffer_size,sizeof(UINT4));
4645   buffer_i = 0;
4646 
4647   nwritten = 0U;
4648 
4649   /* Last value of ascending is at ascending[n] */
4650   /* Use <= n instead of < n, because we want ascending[n] to be taken care of by unpack_00, not a check for remainder == 0 */
4651   for (positioni = 0; positioni + BLOCKSIZE <= n; positioni += BLOCKSIZE) {
4652     /* Pointer */
4653     *p++ = nwritten/4;	/* In 128-bit registers */
4654 
4655     /* Value for start of block */
4656     *p++ = ascending[positioni];
4657 
4658     /* Pack block of 64 diffs */
4659     packsize = Bitpack64_compute_q4_diffs_bidir(diffs,&(ascending[positioni]));
4660     buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,packsize);
4661 
4662 #ifdef ALLOW_ODD_PACKSIZES
4663     nwritten += 2 * ((packsize + 1) & ~1);
4664 #else
4665     nwritten += 2 * packsize;
4666 #endif
4667   }
4668 
4669   /* Old: Check for positioni < n, because if positioni == n, ascending[n] will be taken care of as metainfo */
4670   /* Use <= n instead of < n, because we want ascending[n] to be taken care of by unpack_00, not a check for remainder == 0 */
4671   if (positioni <= n) {
4672     /* Finish last block of 64 */
4673     *p++ = nwritten/4;	/* In 128-bit registers */
4674 
4675     /* Value for start of block */
4676     *p++ = ascending[positioni];
4677 
4678     /* For differential, want <=.  For direct, want < */
4679     for (i = 0; i <= (int) (n - positioni); i++) {
4680       last_block[i] = ascending[positioni+i];
4681     }
4682     for ( ; i <= BLOCKSIZE; i++) {
4683       /* Copy last value for rest of block */
4684       last_block[i] = ascending[n];
4685     }
4686 
4687     /* Pack block of < 64 diffs */
4688     packsize = Bitpack64_compute_q4_diffs_bidir(diffs,last_block);
4689     buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,packsize);
4690 
4691 #ifdef ALLOW_ODD_PACKSIZES
4692     nwritten += 2 * ((packsize + 1) & ~1);
4693 #else
4694     nwritten += 2 * packsize;
4695 #endif
4696   }
4697 
4698 
4699   /* Write the final pointer, which will point after the end of the file */
4700   *p++ = nwritten/4;	/* In 128-bit registers */
4701 
4702   /* Value for end of block */
4703   *p++ = ascending[n];
4704 
4705   if ((ptrs_fp = FOPEN_WRITE_BINARY(ptrsfile)) == NULL) {
4706     fprintf(stderr,"Can't write to ptrsfile %s: %s\n",ptrsfile,strerror(errno));
4707     abort();
4708   } else {
4709     nptrs = p - ptrs;
4710     if (FWRITE_UINTS(ptrs,nptrs,ptrs_fp) != (size_t) nptrs) {
4711       fprintf(stderr,"Error in Bitpack64_write_differential: %s\n",strerror(errno));
4712       exit(9);
4713     }
4714     FREE(ptrs);
4715     fclose(ptrs_fp);
4716   }
4717 
4718   /* Empty buffer */
4719   if (buffer_i > 0) {
4720     if (FWRITE_UINTS(buffer,buffer_i,comp_fp) != (size_t) buffer_i) {
4721       fprintf(stderr,"Error in Bitpack64_write_differential: %s\n",strerror(errno));
4722       exit(9);
4723     }
4724     buffer_i = 0;
4725   }
4726   FREE(buffer);
4727   fclose(comp_fp);
4728 
4729   return;
4730 }
4731 
4732 
/* Turns 64 counts into running totals: ascending[0] is set to 0 and
   ascending[i] to ascending[i-1] + counts[i-1] for i = 1..64.
   Returns the grand total, ascending[64]. */
static UINT4
compute_ascending (UINT4 *ascending, UINT4 *counts) {
  UINT4 *sum = ascending;
  UINT4 *count = counts;
  UINT4 *const end = counts + 64;

  *sum = 0;
  while (count < end) {
    sum[1] = sum[0] + *count++;
    sum++;
  }

  return *sum;
}
4744 
/* Turns 64 counts into 8-byte running totals: ascending[0] is set to
   0 and ascending[i] to ascending[i-1] + counts[i-1] for i = 1..64,
   with each count widened to UINT8 before adding.  Returns the grand
   total, ascending[64]. */
static UINT8
compute_ascending_huge (UINT8 *ascending, UINT4 *counts) {
  UINT8 *sum = ascending;
  UINT4 *count = counts;
  UINT4 *const end = counts + 64;

  *sum = 0;
  while (count < end) {
    sum[1] = sum[0] + (UINT8) *count++;
    sum++;
  }

  return *sum;
}
4756 
4757 #ifdef COUNTS_WITHOUT_COMPRESSION
/* Consistency check between two ways of computing offsets.  Starting
   from ascending[0], adds counts_direct[1], counts_direct[2], ... in
   turn and checks after each step that the running total equals
   ascending[i], for i = 1..64.  On the first mismatch, prints the
   position and a dump of elements 0..63 to stderr and aborts.
   NOTE(review): counts_direct[0] is never compared and
   counts_direct[64] is read; confirm this indexing against the caller. */
static void
compare_offsets_huge (UINT8 *ascending, UINT4 *counts_direct, Oligospace_T positioni) {
  UINT8 running = ascending[0];
  UINT8 expected;
  int elt, k;

  for (elt = 1; elt <= 64; elt++) {
    expected = running + counts_direct[elt];
    if (ascending[elt] != expected) {
      fprintf(stderr,"At positioni %llu, element %d, computed with compression %llu != computed without compression %llu\n",
	      positioni,elt,ascending[elt],expected);
      for (k = 0; k < 64; k++) {
	fprintf(stderr,"%d: %llu %u\n",k,ascending[k],counts_direct[k]);
      }
      abort();
    }
    running = expected;
  }

  return;
}
4777 #endif
4778 
4779 
4780 /* We want to store values 0..n, with final value at ascending[n]
4781    possibly stored as the final metainfo value */
4782 /* Stored in columnar order */
4783 void
Bitpack64_write_differential_bitpacks(char * ptrsfile,char * compfile,char * packsizes,UINT4 ** bitpacks,Oligospace_T n)4784 Bitpack64_write_differential_bitpacks (char *ptrsfile, char *compfile, char *packsizes, UINT4 **bitpacks,
4785 				       Oligospace_T n) {
4786   FILE *ptrs_fp, *comp_fp;
4787   UINT4 *ptrs, *p, nregisters;
4788   UINT4 totalcount;
4789   size_t nptrs;
4790   int i;
4791   Oligospace_T positioni, bmer;
4792 
4793   /* Buffer is used to avoid frequent writes to the file */
4794   UINT4 *buffer;
4795   int buffer_size = BUFFER_SIZE;
4796   int buffer_i;
4797 
4798   UINT4 diffs[BLOCKSIZE], ascending[BLOCKSIZE+1], counts[BLOCKSIZE], last_block[BLOCKSIZE];
4799   int packsize;
4800 
4801 
4802   write_setup();
4803 
4804   /* printf("Entered Bitpack64_write_differential with n %llu\n",n); */
4805 
4806   /* 2 metavalues: nwritten (pointer) and cumulative sum for block.
4807      Packsize can be computed from difference between successive
4808      pointers, if only even packsizes are allowed */
4809   p = ptrs = (UINT4 *) CALLOC(((n + BLOCKSIZE)/BLOCKSIZE + 1) * DIFFERENTIAL_METAINFO_SIZE,sizeof(UINT4));
4810 
4811   if ((comp_fp = FOPEN_WRITE_BINARY(compfile)) == NULL) {
4812     fprintf(stderr,"Can't write to compfile %s: %s\n",compfile,strerror(errno));
4813     abort();
4814   }
4815   buffer = (UINT4 *) CALLOC(buffer_size,sizeof(UINT4));
4816   buffer_i = 0;
4817 
4818   nregisters = 0U;
4819 
4820   /* Last value of ascending is at ascending[n] */
4821   /* Use <= n instead of < n, because we want ascending[n] to be taken care of by unpack_00, not a check for remainder == 0 */
4822   totalcount = 0;
4823   for (positioni = 0, bmer = 0; positioni + BLOCKSIZE <= n; positioni += BLOCKSIZE, bmer++) {
4824     /* Pointer */
4825     *p++ = nregisters;	/* In 128-bit registers */
4826 
4827     /* Value for start of block */
4828     *p++ = totalcount;
4829 
4830     /* Pack block of 64 diffs */
4831     Bitpack64_extract_bitpack(counts,packsizes[bmer],bitpacks[bmer]);
4832     totalcount += compute_ascending(ascending,counts);
4833     packsize = Bitpack64_compute_q4_diffs_bidir(diffs,ascending); /* Note: This packsize may differ from packsizes[bmer], because of calculation of diffs */
4834     buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,packsize);
4835 
4836     nregisters += packsize / 2;
4837   }
4838 
4839   /* Old: Check for positioni < n, because if positioni == n, ascending[n] will be taken care of as metainfo */
4840   /* Use <= n instead of < n, because we want ascending[n] to be taken care of by unpack_00, not a check for remainder == 0 */
4841   /* For nucleotides, expect a single final block where positioni == n */
4842   if (positioni <= n) {
4843     /* Finish last block of 64 */
4844     *p++ = nregisters;	/* In 128-bit registers */
4845 
4846     /* Value for start of block */
4847     *p++ = totalcount;
4848 
4849     if (positioni == n) {
4850       /* Don't have a bitpack at [bmerspace].  Just fills counts with zeroes. */
4851       Bitpack64_extract_bitpack(counts,/*packsize*/0,/*bitpack*/NULL);
4852     } else {
4853       Bitpack64_extract_bitpack(counts,packsizes[bmer],bitpacks[bmer]);
4854     }
4855 
4856     /* For differential, want <=.  For direct, want < */
4857     for (i = 0; i <= (int) (n - positioni); i++) {
4858       last_block[i] = counts[i];
4859     }
4860     for ( ; i < BLOCKSIZE; i++) {
4861       /* Copy last value for rest of block */
4862       last_block[i] = 0;
4863     }
4864 
4865     /* Pack block of < 64 diffs */
4866     totalcount += compute_ascending(ascending,last_block);
4867     packsize = Bitpack64_compute_q4_diffs_bidir(diffs,ascending);
4868     buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,packsize);
4869 
4870     nregisters += packsize / 2;
4871   }
4872 
4873   /* Write the final pointer, which will point after the end of the file */
4874   *p++ = nregisters;	/* In 128-bit registers */
4875 
4876   /* Value for end of block */
4877   *p++ = totalcount;
4878 
4879   if ((ptrs_fp = FOPEN_WRITE_BINARY(ptrsfile)) == NULL) {
4880     fprintf(stderr,"Can't write to ptrsfile %s: %s\n",ptrsfile,strerror(errno));
4881     abort();
4882   } else {
4883     nptrs = p - ptrs;
4884     if (FWRITE_UINTS(ptrs,nptrs,ptrs_fp) != (size_t) nptrs) {
4885       fprintf(stderr,"Error in Bitpack64_write_differential_bitpacks: %s\n",strerror(errno));
4886       exit(9);
4887     }
4888     FREE(ptrs);
4889     fclose(ptrs_fp);
4890   }
4891 
4892   /* Empty buffer */
4893   if (buffer_i > 0) {
4894     if (FWRITE_UINTS(buffer,buffer_i,comp_fp) != (size_t) buffer_i) {
4895       fprintf(stderr,"Error in Bitpack64_write_differential_bitpacks: %s\n",strerror(errno));
4896       exit(9);
4897     }
4898     buffer_i = 0;
4899   }
4900   FREE(buffer);
4901   fclose(comp_fp);
4902 
4903   return;
4904 }
4905 
4906 
#if 0
/* Disabled code.  Variant of Bitpack64_write_differential_bitpacks that
   writes to streams the caller has already opened, reports the grand
   total through *totalcount, and returns the number of 128-bit
   registers written.  It does not close either stream.

   We want to store values 0..n, with final value at ascending[n]
   possibly stored as the final metainfo value.  Stored in columnar
   order. */
UINT4
Bitpack64_append_differential_bitpacks (UINT4 *totalcount, FILE *ptrs_fp, FILE *comp_fp, char *packsizes, UINT4 **bitpacks,
					Oligospace_T n) {
  UINT4 *metainfo, *meta;	/* Start of, and cursor into, the metainfo pairs */
  UINT4 nregisters = 0U;	/* 128-bit registers written so far */
  size_t nmeta;
  Oligospace_T blockstart = 0, blocki = 0;
  int nkept, k;
  int packsize;

  /* Output is staged here to avoid frequent writes to the file */
  UINT4 *outbuf;
  int outbuf_size = BUFFER_SIZE;
  int outbuf_i = 0;

  UINT4 diffs[BLOCKSIZE], ascending[BLOCKSIZE+1], counts[BLOCKSIZE], last_block[BLOCKSIZE];


  write_setup();

  /* printf("Entered Bitpack64_write_differential with n %llu\n",n); */

  /* 2 metavalues per block: pointer and cumulative sum at the start of
     the block.  Packsize can be computed from the difference between
     successive pointers, if only even packsizes are allowed. */
  meta = metainfo = (UINT4 *) CALLOC(((n + BLOCKSIZE)/BLOCKSIZE + 1) * DIFFERENTIAL_METAINFO_SIZE,sizeof(UINT4));

  outbuf = (UINT4 *) CALLOC(outbuf_size,sizeof(UINT4));

  /* Complete blocks.  Use <= n instead of < n, because we want
     ascending[n] to be taken care of by unpack_00, not a check for
     remainder == 0. */
  *totalcount = 0;
  while (blockstart + BLOCKSIZE <= n) {
    *meta++ = nregisters;	/* Pointer, in 128-bit registers */
    *meta++ = *totalcount;	/* Value for start of block */

    /* Pack block of 64 diffs */
    Bitpack64_extract_bitpack(counts,packsizes[blocki],bitpacks[blocki]);
    *totalcount += compute_ascending(ascending,counts);
    /* Note: This packsize may differ from packsizes[blocki], because of
       the calculation of diffs */
    packsize = Bitpack64_compute_q4_diffs_bidir(diffs,ascending);
    outbuf_i = Bitpack64_write_columnar(comp_fp,outbuf,outbuf_size,outbuf_i,diffs,packsize);
    nregisters += packsize / 2;

    blockstart += BLOCKSIZE;
    blocki++;
  }

  /* Final, possibly partial, block.  The old code checked blockstart < n.
     For nucleotides, expect a single final block where blockstart == n. */
  if (blockstart <= n) {
    *meta++ = nregisters;	/* Pointer, in 128-bit registers */
    *meta++ = *totalcount;	/* Value for start of block */

    if (blockstart == n) {
      /* Don't have a bitpack at [bmerspace]; a zero packsize with a NULL
         bitpack is expected to fill counts with zeroes */
      Bitpack64_extract_bitpack(counts,/*packsize*/0,/*bitpack*/NULL);
    } else {
      Bitpack64_extract_bitpack(counts,packsizes[blocki],bitpacks[blocki]);
    }

    /* Keep elements 0..(n - blockstart) and zero the rest of the block.
       For differential, the last kept index is inclusive; for direct, it
       would be exclusive. */
    nkept = (int) (n - blockstart) + 1;
    for (k = 0; k < BLOCKSIZE; k++) {
      last_block[k] = (k < nkept) ? counts[k] : 0;
    }

    /* Pack block of < 64 diffs */
    *totalcount += compute_ascending(ascending,last_block);
    packsize = Bitpack64_compute_q4_diffs_bidir(diffs,ascending);
    outbuf_i = Bitpack64_write_columnar(comp_fp,outbuf,outbuf_size,outbuf_i,diffs,packsize);
    nregisters += packsize / 2;
  }

#if 0
  /* Since we are providing blocks of 64, the meta entry at [64] does get written */
  /* Write the final pointer, which will point after the end of the file */
  *meta++ = nregisters;	/* In 128-bit registers */

  /* Value for end of block */
  *meta++ = *totalcount;
#endif

  nmeta = meta - metainfo;
  if (FWRITE_UINTS(metainfo,nmeta,ptrs_fp) != (size_t) nmeta) {
    fprintf(stderr,"Error in Bitpack64_append_differential_bitpacks: %s\n",strerror(errno));
    exit(9);
  }
  FREE(metainfo);

  /* Empty buffer */
  if (outbuf_i > 0) {
    if (FWRITE_UINTS(outbuf,outbuf_i,comp_fp) != (size_t) outbuf_i) {
      fprintf(stderr,"Error in Bitpack64_append_differential_bitpacks: %s\n",strerror(errno));
      exit(9);
    }
  }
  FREE(outbuf);

  return nregisters;
}
#endif
5024 
5025 
5026 
#if 0
/* Disabled code.  Writes each block of ascending[] twice into compfile:
   first its q4 differences in columnar order (D4), then its q1
   differences through write_vert (D1).  ptrsfile receives three UINT4
   metavalues per block: the pointer to D4, the prefix sum at the start
   of the block, and the pointer to D1, followed by a closing pointer
   and ascending[n].

   We want to store values 0..n, with final value at ascending[n]
   possibly stored as the final metainfo value. */
void
Bitpack64_write_differential_paired (char *ptrsfile, char *compfile, UINT4 *ascending, Oligospace_T n) {
  FILE *comp_fp, *ptrs_fp;
  UINT4 *metainfo, *meta;	/* Start of, and cursor into, the metainfo triples */
  UINT4 nwritten = 0U;		/* UINT4 words written so far; 4 per 128-bit register */
  size_t nmeta;
  Oligospace_T blockstart;
  int nremaining, k;
  int packsize;

  /* Output is staged here to avoid frequent writes to the file */
  UINT4 *outbuf;
  int outbuf_size = BUFFER_SIZE;
  int outbuf_i = 0;

  UINT4 diffs[BLOCKSIZE], last_block[BLOCKSIZE+1];


  write_setup();

  /* Metavalues per block: pointers and cumulative sum at the start of
     the block.  Packsize can be computed from the difference between
     successive pointers, if only even packsizes are allowed. */
  meta = metainfo = (UINT4 *) CALLOC(((n + BLOCKSIZE)/BLOCKSIZE + 1) * PAIRED_METAINFO_SIZE,sizeof(UINT4));

  comp_fp = FOPEN_WRITE_BINARY(compfile);
  if (comp_fp == NULL) {
    fprintf(stderr,"Can't write to compfile %s: %s\n",compfile,strerror(errno));
    abort();
  }
  outbuf = (UINT4 *) CALLOC(outbuf_size,sizeof(UINT4));

  /* Complete blocks.  Last value of ascending is at ascending[n].  Use
     <= n instead of < n, because we want ascending[n] to be taken care
     of by unpack_00, not a check for remainder == 0. */
  blockstart = 0;
  while (blockstart + BLOCKSIZE <= n) {
    *meta++ = nwritten/4;		/* Pointer to D4, in 128-bit registers */
    *meta++ = ascending[blockstart];	/* Prefix sum for start of block */

    /* D4: Pack block of 64 diffs */
    packsize = Bitpack64_compute_q4_diffs_bidir(diffs,&(ascending[blockstart]));
    outbuf_i = Bitpack64_write_columnar(comp_fp,outbuf,outbuf_size,outbuf_i,diffs,packsize);

#ifdef ALLOW_ODD_PACKSIZES
    nwritten += 2 * ((packsize + 1) & ~1);
#else
    nwritten += 2 * packsize;
#endif

    *meta++ = nwritten/4;		/* Pointer to D1, in 128-bit registers */

    /* D1: Pack block of 64 diffs */
    packsize = compute_q1_diffs(diffs,&(ascending[blockstart]));
    outbuf_i = write_vert(comp_fp,outbuf,outbuf_size,outbuf_i,diffs,packsize);

#ifdef ALLOW_ODD_PACKSIZES
    nwritten += 2 * ((packsize + 1) & ~1);
#else
    nwritten += 2 * packsize;
#endif

    blockstart += BLOCKSIZE;
  }

  /* Final, possibly partial, block.  The old code checked blockstart < n. */
  if (blockstart <= n) {
    *meta++ = nwritten/4;		/* Pointer to D4, in 128-bit registers */
    *meta++ = ascending[blockstart];	/* Prefix sum for start of block */

    /* Copy the remaining values and repeat ascending[n] through the rest
       of the block.  For differential, the last copied index is
       inclusive; for direct, it would be exclusive. */
    nremaining = (int) (n - blockstart);
    for (k = 0; k <= BLOCKSIZE; k++) {
      last_block[k] = (k <= nremaining) ? ascending[blockstart+k] : ascending[n];
    }

    /* D4: Pack block of < 64 diffs */
    packsize = Bitpack64_compute_q4_diffs_bidir(diffs,last_block);
    outbuf_i = Bitpack64_write_columnar(comp_fp,outbuf,outbuf_size,outbuf_i,diffs,packsize);

#ifdef ALLOW_ODD_PACKSIZES
    nwritten += 2 * ((packsize + 1) & ~1);
#else
    nwritten += 2 * packsize;
#endif

    *meta++ = nwritten/4;		/* Pointer to D1, in 128-bit registers */

    /* D1: Pack block of < 64 diffs */
    packsize = compute_q1_diffs(diffs,last_block);
    outbuf_i = write_vert(comp_fp,outbuf,outbuf_size,outbuf_i,diffs,packsize);
    /* NOTE(review): unlike in the loop above, nwritten is not advanced
       after this last D1 write, so the closing pointer below equals the
       D1 pointer -- confirm whether that is intended. */
  }

  /* Closing pointer, and prefix sum for end of block */
  *meta++ = nwritten/4;	/* In 128-bit registers */
  *meta++ = ascending[n];

  ptrs_fp = FOPEN_WRITE_BINARY(ptrsfile);
  if (ptrs_fp == NULL) {
    fprintf(stderr,"Can't write to ptrsfile %s: %s\n",ptrsfile,strerror(errno));
    abort();
  }
  nmeta = meta - metainfo;
  if (FWRITE_UINTS(metainfo,nmeta,ptrs_fp) != (size_t) nmeta) {
    fprintf(stderr,"Error in Bitpack64_write_differential_paired: %s\n",strerror(errno));
    exit(9);
  }
  FREE(metainfo);
  fclose(ptrs_fp);

  /* Empty buffer */
  if (outbuf_i > 0) {
    if (FWRITE_UINTS(outbuf,outbuf_i,comp_fp) != (size_t) outbuf_i) {
      fprintf(stderr,"Error in Bitpack64_write_differential_paired: %s\n",strerror(errno));
      exit(9);
    }
  }
  FREE(outbuf);
  fclose(comp_fp);

  return;
}
#endif
5169 
5170 
5171 
5172 
#if 0
/* Worst case:
   64 128 192 256
   256 256 256 256 */

#define FIXED10_PACKSIZE 10 	/* Enough to hold +/- 256 */

/* Disabled code.  Writes every block of ascending[] with the fixed
   packsize FIXED10_PACKSIZE, so no per-block pointer is stored: the
   only metavalue per block is the value at the start of the block,
   plus ascending[n] at the end.  With USE_ONE_FILE_FOR_FIXED, the
   metavalues are written into compfile in groups of 4; otherwise they
   go to ptrsfile.

   We want to store values 0..n, with final value at ascending[n]
   possibly stored as the final metainfo value.  Stored in columnar
   order. */
void
Bitpack64_write_fixed10 (char *ptrsfile, char *compfile, UINT4 *ascending, Oligospace_T n) {
#ifndef USE_ONE_FILE_FOR_FIXED
  FILE *ptrs_fp;
#endif
  FILE *comp_fp;
  UINT4 *metainfo;
  UINT4 metai;			/* Number of metavalues currently held in metainfo */
  UINT4 nwritten = 0U;		/* UINT4 words written so far; 4 per 128-bit register */
  Oligospace_T blockstart;
  int nremaining, k;
  int packsize;

  /* Output is staged here to avoid frequent writes to the file */
  UINT4 *outbuf;
  int outbuf_size = BUFFER_SIZE;
  int outbuf_i = 0;

  UINT4 diffs[BLOCKSIZE], last_block[BLOCKSIZE+1];

  write_setup();

  /* In one-file mode only a group of 4 metavalues is held at a time;
     otherwise there is one metavalue per block plus the closing one */
#ifdef USE_ONE_FILE_FOR_FIXED
  metainfo = (UINT4 *) CALLOC(4,sizeof(UINT4));
#else
  metainfo = (UINT4 *) CALLOC(((n + BLOCKSIZE)/BLOCKSIZE + 1) * RANK_METAINFO_SIZE,sizeof(UINT4));
#endif
  metai = 0;

  comp_fp = FOPEN_WRITE_BINARY(compfile);
  if (comp_fp == NULL) {
    fprintf(stderr,"Can't write to compfile %s: %s\n",compfile,strerror(errno));
    abort();
  }
  outbuf = (UINT4 *) CALLOC(outbuf_size,sizeof(UINT4));

  /* Complete blocks.  Last value of ascending is at ascending[n].  Use
     <= n instead of < n, because we want ascending[n] to be taken care
     of by unpack_00, not a check for remainder == 0. */
  blockstart = 0;
  while (blockstart + BLOCKSIZE <= n) {
#if 0
    /* Pointer */
    metainfo[metai++] = nwritten/4;	/* In 128-bit registers */
#endif

    /* Value for start of block */
    metainfo[metai++] = ascending[blockstart];
#ifdef USE_ONE_FILE_FOR_FIXED
    if (metai == 4) {
      if (FWRITE_UINTS(metainfo,4,comp_fp) != (size_t) 4) {
	fprintf(stderr,"Error in Bitpack64_write_fixed10: %s\n",strerror(errno));
	exit(9);
      }
      metai = 0;
    }
#endif

    /* Pack block of 64 diffs */
    packsize = Bitpack64_compute_q4_diffs_bidir(diffs,&(ascending[blockstart]));
    assert(packsize <= FIXED10_PACKSIZE);
    outbuf_i = Bitpack64_write_columnar(comp_fp,outbuf,outbuf_size,outbuf_i,diffs,FIXED10_PACKSIZE);

    nwritten += 2 * FIXED10_PACKSIZE;

    blockstart += BLOCKSIZE;
  }

  /* Final, possibly partial, block.  The old code checked blockstart < n. */
  if (blockstart <= n) {
#if 0
    /* Finish last block of 64 */
    metainfo[metai++] = nwritten/4;	/* In 128-bit registers */
#endif

    /* Value for start of block */
    metainfo[metai++] = ascending[blockstart];
#ifdef USE_ONE_FILE_FOR_FIXED
    if (metai == 4) {
      if (FWRITE_UINTS(metainfo,4,comp_fp) != (size_t) 4) {
	fprintf(stderr,"Error in Bitpack64_write_fixed10: %s\n",strerror(errno));
	exit(9);
      }
      metai = 0;
    }
#endif

    /* Copy the remaining values and repeat ascending[n] through the rest
       of the block.  For differential, the last copied index is
       inclusive; for direct, it would be exclusive. */
    nremaining = (int) (n - blockstart);
    for (k = 0; k <= BLOCKSIZE; k++) {
      last_block[k] = (k <= nremaining) ? ascending[blockstart+k] : ascending[n];
    }

    /* Pack block of < 64 diffs */
    packsize = Bitpack64_compute_q4_diffs_bidir(diffs,last_block);
    assert(packsize <= FIXED10_PACKSIZE);
    outbuf_i = Bitpack64_write_columnar(comp_fp,outbuf,outbuf_size,outbuf_i,diffs,FIXED10_PACKSIZE);

    nwritten += 2 * FIXED10_PACKSIZE;
  }


#if 0
  /* Write the final pointer, which will point after the end of the file */
  metainfo[metai++] = nwritten/4;	/* In 128-bit registers */
#endif

  /* Value for end of block */
  metainfo[metai++] = ascending[n];
#ifdef USE_ONE_FILE_FOR_FIXED
  /* Zero-fill the unused part of the last group of 4 before writing it */
  for (k = metai; k < 4; k++) {
    metainfo[k] = 0U;
  }
  if (FWRITE_UINTS(metainfo,4,comp_fp) != (size_t) 4) {
    fprintf(stderr,"Error in Bitpack64_write_fixed10: %s\n",strerror(errno));
    exit(9);
  }
#else
  ptrs_fp = FOPEN_WRITE_BINARY(ptrsfile);
  if (ptrs_fp == NULL) {
    fprintf(stderr,"Can't write to ptrsfile %s: %s\n",ptrsfile,strerror(errno));
    abort();
  }
  if (FWRITE_UINTS(metainfo,metai,ptrs_fp) != (size_t) metai) {
    fprintf(stderr,"Error in Bitpack64_write_fixed10: %s\n",strerror(errno));
    exit(9);
  }
  fclose(ptrs_fp);
#endif
  FREE(metainfo);

  /* Empty buffer */
  if (outbuf_i > 0) {
    if (FWRITE_UINTS(outbuf,outbuf_i,comp_fp) != (size_t) outbuf_i) {
      fprintf(stderr,"Error in Bitpack64_write_fixed10: %s\n",strerror(errno));
      exit(9);
    }
  }
  FREE(outbuf);
  fclose(comp_fp);

  return;
}
#endif
5335 
5336 
5337 void
Bitpack64_write_differential_huge(char * pagesfile,char * ptrsfile,char * compfile,UINT8 * ascending,Oligospace_T n)5338 Bitpack64_write_differential_huge (char *pagesfile, char *ptrsfile, char *compfile,
5339 				   UINT8 *ascending, Oligospace_T n) {
5340   UINT8 currpage, nextpage;
5341   FILE *pages_fp, *ptrs_fp, *comp_fp;
5342   UINT4 pages[25];	/* Allows us to handle up to 100 billion positions */
5343   UINT4 *ptrs, *p;
5344   size_t nptrs;
5345   Oligospace_T positioni;
5346 
5347   /* Buffer is used to avoid frequent writes to the file */
5348   UINT4 *buffer;
5349   int buffer_size = BUFFER_SIZE;
5350   int buffer_i;
5351 
5352   UINT4 diffs[BLOCKSIZE];
5353   UINT8 last_block[BLOCKSIZE+1];
5354 
5355   int pagei = 0, i;
5356   UINT4 nwritten;
5357   int packsize;
5358 
5359 
5360   write_setup();
5361 
5362   /* 2 metavalues: nwritten (pointer) and cumulative sum for block.
5363      Packsize can be computed from difference between successive
5364      pointers, if only even packsizes are allowed */
5365   p = ptrs = (UINT4 *) CALLOC(((n + BLOCKSIZE)/BLOCKSIZE + 1) * DIFFERENTIAL_METAINFO_SIZE,sizeof(UINT4));
5366 
5367   if ((comp_fp = FOPEN_WRITE_BINARY(compfile)) == NULL) {
5368     fprintf(stderr,"Can't write to compfile %s: %s\n",compfile,strerror(errno));
5369     abort();
5370   }
5371   buffer = (UINT4 *) CALLOC(buffer_size,sizeof(UINT4));
5372   buffer_i = 0;
5373 
5374   currpage = 0;
5375   nextpage = POSITIONS_PAGE;
5376   nwritten = 0U;
5377 
5378   /* Last value of ascending is at ascending[n] */
5379   /* Use <= n instead of < n, because we want ascending[n] to be taken care of by unpack_00, not a check for remainder == 0 */
5380   for (positioni = 0; positioni + BLOCKSIZE <= n; positioni += BLOCKSIZE) {
5381     /* Pointer */
5382     *p++ = nwritten/4;	/* In 128-bit registers */
5383 
5384     /* Value for start of block */
5385     while (ascending[positioni] >= nextpage) {
5386       fprintf(stderr,"\nAt position %llu (block %llu), ascending %llu >= nextpage %llu",
5387 	      positioni,positioni/BLOCKSIZE,ascending[positioni],nextpage);
5388       pages[pagei++] = positioni/BLOCKSIZE;
5389       currpage = nextpage;
5390       nextpage += POSITIONS_PAGE;
5391     }
5392     *p++ = ascending[positioni] - currpage;
5393 
5394 
5395     /* Pack block of 64 diffs */
5396     packsize = Bitpack64_compute_q4_diffs_bidir_huge(diffs,&(ascending[positioni]));
5397     buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,packsize);
5398 
5399 #ifdef ALLOW_ODD_PACKSIZES
5400     nwritten += 2 * ((packsize + 1) & ~1);
5401 #else
5402     nwritten += 2 * packsize;
5403 #endif
5404   }
5405 
5406   /* Old: Check for positioni < n, because if positioni == n, ascending[n] will be taken care of as metainfo */
5407   /* Use <= n instead of < n, because we want ascending[n] to be taken care of by unpack_00, not a check for remainder == 0 */
5408   if (positioni <= n) {
5409     /* Finish last block of 64 */
5410     *p++ = nwritten/4;	/* In 128-bit registers */
5411 
5412     /* Value for start of block */
5413     while (ascending[positioni] >= nextpage) {
5414       fprintf(stderr,"\nAt position %llu (block %llu), ascending %llu >= nextpage %llu",
5415 	      positioni,positioni/BLOCKSIZE,ascending[positioni],nextpage);
5416       pages[pagei++] = positioni/BLOCKSIZE;
5417       currpage = nextpage;
5418       nextpage += POSITIONS_PAGE;
5419     }
5420     *p++ = ascending[positioni] - currpage;
5421 
5422     /* For differential, want <=.  For direct, want < */
5423     for (i = 0; i <= (int) (n - positioni); i++) {
5424       last_block[i] = ascending[positioni+i] - currpage;
5425     }
5426     for ( ; i <= BLOCKSIZE; i++) {
5427       /* Copy last value for rest of block */
5428       last_block[i] = ascending[n] - currpage;
5429     }
5430 
5431     /* Pack block of < 64 diffs */
5432     packsize = Bitpack64_compute_q4_diffs_bidir_huge(diffs,last_block);
5433     buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,packsize);
5434 
5435 #ifdef ALLOW_ODD_PACKSIZES
5436     nwritten += 2 * ((packsize + 1) & ~1);
5437 #else
5438     nwritten += 2 * packsize;
5439 #endif
5440   }
5441 
5442 
5443   /* Write the final pointer, which will point after the end of the file */
5444   *p++ = nwritten/4;	/* In 128-bit registers */
5445 
5446   /* Value for end of block */
5447   if (ascending[n] >= nextpage) {
5448     fprintf(stderr,"\nAt final oligo %llu (block %llu), ascending %llu >= nextpage %llu",
5449 	    n,n/BLOCKSIZE,ascending[n],nextpage);
5450     pages[pagei++] = n/BLOCKSIZE;
5451     currpage = nextpage;
5452     /* nextpage += POSITIONS_PAGE; */
5453   }
5454   *p++ = ascending[n] - currpage;
5455 
5456 
5457   /* Write pages */
5458   if (pagei > 0) {
5459     pages[pagei++] = (UINT4) -1; /* Final value */
5460     if ((pages_fp = FOPEN_WRITE_BINARY(pagesfile)) == NULL) {
5461       fprintf(stderr,"Can't write to pagesfile %s: %s\n",pagesfile,strerror(errno));
5462       abort();
5463     } else {
5464       fprintf(stderr,"\nHave %d pages:",pagei);
5465       for (i = 0; i < pagei; i++) {
5466 	fprintf(stderr," %u",pages[i]);
5467       }
5468       fprintf(stderr,"\n");
5469       if (FWRITE_UINTS(pages,pagei,pages_fp) != (size_t) pagei) {
5470 	fprintf(stderr,"Error in Bitpack64_write_differential_huge: %s\n",strerror(errno));
5471 	exit(9);
5472       }
5473       /* FREE(pages); */
5474       fclose(pages_fp);
5475     }
5476   }
5477 
5478   if ((ptrs_fp = FOPEN_WRITE_BINARY(ptrsfile)) == NULL) {
5479     fprintf(stderr,"Can't write to ptrsfile %s: %s\n",ptrsfile,strerror(errno));
5480     abort();
5481   } else {
5482     nptrs = p - ptrs;
5483     if (FWRITE_UINTS(ptrs,nptrs,ptrs_fp) != (size_t) nptrs) {
5484       fprintf(stderr,"Error in Bitpack64_write_differential_huge: %s\n",strerror(errno));
5485       exit(9);
5486     }
5487     FREE(ptrs);
5488     fclose(ptrs_fp);
5489   }
5490 
5491   /* Empty buffer */
5492   if (buffer_i > 0) {
5493     if (FWRITE_UINTS(buffer,buffer_i,comp_fp) != (size_t) buffer_i) {
5494       fprintf(stderr,"Error in Bitpack64_write_differential_huge: %s\n",strerror(errno));
5495       exit(9);
5496     }
5497     buffer_i = 0;
5498   }
5499   FREE(buffer);
5500   fclose(comp_fp);
5501 
5502   return;
5503 }
5504 
5505 
5506 void
Bitpack64_write_differential_huge_bitpacks(char * pagesfile,char * ptrsfile,char * compfile,char * packsizes,UINT4 ** bitpacks,UINT4 * counts_direct,Oligospace_T n)5507 Bitpack64_write_differential_huge_bitpacks (char *pagesfile, char *ptrsfile, char *compfile,
5508 					    char *packsizes, UINT4 **bitpacks,
5509 #ifdef COUNTS_WITHOUT_COMPRESSION
5510 					    UINT4* counts_direct,
5511 #endif
5512 					    Oligospace_T n) {
5513 #ifdef CHECK
5514   UINT4 q;
5515 #endif
5516 
5517   UINT8 currpage, nextpage;
5518   FILE *pages_fp, *ptrs_fp, *comp_fp;
5519   UINT4 pages[25];	/* Allows us to handle up to 100 billion positions.  At q3, means 300 billion nt */
5520   UINT4 *ptrs, *p, nregisters;
5521   UINT8 totalcount;
5522   size_t nptrs;
5523   Oligospace_T positioni, bmer;
5524 
5525   /* Buffer is used to avoid frequent writes to the file */
5526   UINT4 *buffer;
5527   int buffer_size = BUFFER_SIZE;
5528   int buffer_i;
5529 
5530   UINT4 diffs[BLOCKSIZE], counts[BLOCKSIZE], last_block[BLOCKSIZE];
5531   UINT8 ascending[BLOCKSIZE+1];
5532 
5533   int pagei = 0, i;
5534   int packsize;
5535 
5536 
5537   write_setup();
5538 
5539   /* 2 metavalues: nwritten (pointer) and cumulative sum for block.
5540      Packsize can be computed from difference between successive
5541      pointers, if only even packsizes are allowed */
5542   p = ptrs = (UINT4 *) CALLOC(((n + BLOCKSIZE)/BLOCKSIZE + 1) * DIFFERENTIAL_METAINFO_SIZE,sizeof(UINT4));
5543 
5544   if ((comp_fp = FOPEN_WRITE_BINARY(compfile)) == NULL) {
5545     fprintf(stderr,"Can't write to compfile %s: %s\n",compfile,strerror(errno));
5546     abort();
5547   }
5548   buffer = (UINT4 *) CALLOC(buffer_size,sizeof(UINT4));
5549   buffer_i = 0;
5550 
5551   currpage = 0;
5552   nextpage = POSITIONS_PAGE;
5553   nregisters = 0U;
5554 
5555   /* Last value of ascending is at ascending[n] */
5556   /* Use <= n instead of < n, because we want ascending[n] to be taken care of by unpack_00, not a check for remainder == 0 */
5557   totalcount = 0;
5558   for (positioni = 0, bmer = 0; positioni + BLOCKSIZE <= n; positioni += BLOCKSIZE, bmer++) {
5559     /* Pointer */
5560     *p++ = nregisters;	/* In 128-bit registers */
5561 
5562     /* Value for start of block */
5563     while (totalcount >= nextpage) {
5564       fprintf(stderr,"\nAt position %llu (block %llu), ascending %llu >= nextpage %llu",
5565 	      positioni,positioni/BLOCKSIZE,totalcount,nextpage);
5566       pages[pagei++] = positioni/BLOCKSIZE;
5567       currpage = nextpage;
5568       nextpage += POSITIONS_PAGE;
5569     }
5570     *p++ = totalcount - currpage;
5571 
5572     /* Pack block of 64 diffs */
5573     Bitpack64_extract_bitpack(counts,packsizes[bmer],bitpacks[bmer]);
5574     totalcount += compute_ascending_huge(ascending,counts);
5575 #ifdef COUNTS_WITHOUT_COMPRESSION
5576     compare_offsets_huge(ascending,&(counts_direct[positioni]),positioni);
5577 #endif
5578     packsize = Bitpack64_compute_q4_diffs_bidir_huge(diffs,ascending); /* Note: This packsize may differ from packsizes[bmer], because of calculation of diffs */
5579     buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,packsize);
5580 
5581     nregisters += packsize / 2;
5582   }
5583 
5584   /* Old: Check for positioni < n, because if positioni == n, ascending[n] will be taken care of as metainfo */
5585   /* Use <= n instead of < n, because we want ascending[n] to be taken care of by unpack_00, not a check for remainder == 0 */
5586   /* For nucleotides, expect a single final block where positioni == n */
5587   if (positioni <= n) {
5588     /* Finish last block of 64 */
5589     *p++ = nregisters;	/* In 128-bit registers */
5590 
5591     /* Value for start of block */
5592     while (totalcount >= nextpage) {
5593       fprintf(stderr,"\nAt position %llu (block %llu), totalcount %llu >= nextpage %llu",
5594 	      positioni,positioni/BLOCKSIZE,totalcount,nextpage);
5595       pages[pagei++] = positioni/BLOCKSIZE;
5596       currpage = nextpage;
5597       nextpage += POSITIONS_PAGE;
5598     }
5599     *p++ = totalcount - currpage;
5600 
5601     if (positioni == n) {
5602       /* Don't have a bitpack at [bmerspace] */
5603       Bitpack64_extract_bitpack(counts,/*packsize*/0,/*bitpack*/NULL);
5604     } else {
5605       Bitpack64_extract_bitpack(counts,packsizes[bmer],bitpacks[bmer]);
5606     }
5607 
5608     /* For differential, want <=.  For direct, want < */
5609     for (i = 0; i <= (int) (n - positioni); i++) {
5610       last_block[i] = counts[i];
5611     }
5612     for ( ; i <= BLOCKSIZE; i++) {
5613       /* Copy last value for rest of block */
5614       last_block[i] = 0;
5615     }
5616 
5617     /* Pack block of < 64 diffs */
5618     totalcount += compute_ascending_huge(ascending,last_block);
5619 #ifdef COUNTS_WITHOUT_COMPRESSION
5620     /* May not match for a partial block */
5621     /* compare_offsets_huge(ascending,&(counts[positioni]),positioni); */
5622 #endif
5623     packsize = Bitpack64_compute_q4_diffs_bidir_huge(diffs,ascending); /* Note: This packsize may differ from packsizes[bmer], because of calculation of diffs */
5624     buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,packsize);
5625 
5626     nregisters += packsize / 2;
5627   }
5628 
5629 
5630   /* Write the final pointer, which will point after the end of the file */
5631   *p++ = nregisters;	/* In 128-bit registers */
5632 
5633   /* Value for end of block */
5634   if (totalcount >= nextpage) {
5635     fprintf(stderr,"\nAt final oligo %llu (block %llu), totalcount %llu >= nextpage %llu",
5636 	    n,n/BLOCKSIZE,totalcount,nextpage);
5637     pages[pagei++] = n/BLOCKSIZE;
5638     currpage = nextpage;
5639     /* nextpage += POSITIONS_PAGE; */
5640   }
5641   *p++ = totalcount - currpage;
5642 
5643 
5644   /* Write pages */
5645   if (pagei > 0) {
5646     pages[pagei++] = (UINT4) -1; /* Final value */
5647     if ((pages_fp = FOPEN_WRITE_BINARY(pagesfile)) == NULL) {
5648       fprintf(stderr,"Can't write to pagesfile %s: %s\n",pagesfile,strerror(errno));
5649       abort();
5650     } else {
5651       fprintf(stderr,"\nHave %d pages:",pagei);
5652       for (i = 0; i < pagei; i++) {
5653 	fprintf(stderr," %u",pages[i]);
5654       }
5655       fprintf(stderr,"\n");
5656       if (FWRITE_UINTS(pages,pagei,pages_fp) != (size_t) pagei) {
5657 	fprintf(stderr,"Error in Bitpack64_write_differential_huge: %s\n",strerror(errno));
5658 	exit(9);
5659       }
5660       /* FREE(pages); */
5661       fclose(pages_fp);
5662     }
5663   }
5664 
5665   if ((ptrs_fp = FOPEN_WRITE_BINARY(ptrsfile)) == NULL) {
5666     fprintf(stderr,"Can't write to ptrsfile %s: %s\n",ptrsfile,strerror(errno));
5667     abort();
5668   } else {
5669     nptrs = p - ptrs;
5670     if (FWRITE_UINTS(ptrs,nptrs,ptrs_fp) != (size_t) nptrs) {
5671       fprintf(stderr,"Error in Bitpack64_write_differential_huge: %s\n",strerror(errno));
5672       exit(9);
5673     }
5674     FREE(ptrs);
5675     fclose(ptrs_fp);
5676   }
5677 
5678   /* Empty buffer */
5679   if (buffer_i > 0) {
5680     if (FWRITE_UINTS(buffer,buffer_i,comp_fp) != (size_t) buffer_i) {
5681       fprintf(stderr,"Error in Bitpack64_write_differential_huge: %s\n",strerror(errno));
5682       exit(9);
5683     }
5684     buffer_i = 0;
5685   }
5686   FREE(buffer);
5687   fclose(comp_fp);
5688 
5689   return;
5690 }
5691 
5692 
5693 
5694 #if 0
/* Bitpack64_write_fixed10_huge: writes the cumulative array
   ascending[0..n] in blocks of BLOCKSIZE values, always emitting each
   block through Bitpack64_write_columnar() with FIXED10_PACKSIZE.

   This definition sits inside an "#if 0" region of the file, so it is
   not compiled as the file stands.

   Per block, one metavalue is appended to ptrs: the value of
   ascending[] at the start of the block minus currpage.  Whenever that
   value reaches nextpage, the block index is recorded in pages[] and
   currpage/nextpage advance by POSITIONS_PAGE.  A final metavalue for
   ascending[n] is appended after the last block.

   Output files:
     compfile  - packed diffs (and, if USE_ONE_FILE_FOR_FIXED is
                 defined, the metavalues interleaved in groups of 4)
     ptrsfile  - the metavalues (only when USE_ONE_FILE_FOR_FIXED is
                 not defined)
     pagesfile - the recorded page boundaries followed by (UINT4) -1;
                 written only if at least one page boundary was crossed

   Error handling: abort() if a file cannot be opened for writing,
   exit(9) if FWRITE_UINTS reports fewer items than requested. */
5695 void
5696 Bitpack64_write_fixed10_huge (char *pagesfile, char *ptrsfile, char *compfile,
5697 			      UINT8 *ascending, Oligospace_T n) {
5698 #ifndef USE_ONE_FILE_FOR_FIXED
5699   FILE *ptrs_fp;
5700 #endif
5701   UINT8 currpage, nextpage;
5702   FILE *pages_fp, *comp_fp;
       /* NOTE(review): pagei is never checked against this capacity before pages[pagei++] is stored */
5703   UINT4 pages[25];	/* Allows us to handle up to 100 billion positions */
5704   UINT4 *ptrs;
5705   UINT4 ptri;
5706   Oligospace_T positioni;
5707 
5708   /* Buffer is used to avoid frequent writes to the file */
5709   UINT4 *buffer;
5710   int buffer_size = BUFFER_SIZE;
5711   int buffer_i;
5712 
5713   UINT4 diffs[BLOCKSIZE];
5714   UINT8 last_block[BLOCKSIZE+1];
5715 
5716   int pagei = 0, i;
5717   UINT4 nwritten;
5718   int packsize;
5719 
5720 
5721   write_setup();
5722 
5723   /* 2 metavalues: nwritten (pointer) and cumulative sum for block.
5724      Packsize can be computed from difference between successive
5725      pointers, if only even packsizes are allowed */
       /* NOTE(review): the pointer metavalue is disabled by the nested "#if 0"
          sections below, so only the cumulative sum is stored per block,
          which is consistent with sizing ptrs by RANK_METAINFO_SIZE */
5726 #ifdef USE_ONE_FILE_FOR_FIXED
       /* Small staging area: metavalues are flushed to comp_fp four at a time */
       /* NOTE(review): the cast names UINT while ptrs is UINT4 * -- possibly a typo; confirm UINT exists */
5727   ptrs = (UINT *) CALLOC(4,sizeof(UINT4));
5728   ptri = 0;
5729 #else
5730   ptrs = (UINT4 *) CALLOC(((n + BLOCKSIZE)/BLOCKSIZE + 1) * RANK_METAINFO_SIZE,sizeof(UINT4));
5731   ptri = 0;
5732 #endif
5733 
5734   if ((comp_fp = FOPEN_WRITE_BINARY(compfile)) == NULL) {
5735     fprintf(stderr,"Can't write to compfile %s: %s\n",compfile,strerror(errno));
5736     abort();
5737   }
5738   buffer = (UINT4 *) CALLOC(buffer_size,sizeof(UINT4));
5739   buffer_i = 0;
5740 
5741   currpage = 0;
5742   nextpage = POSITIONS_PAGE;
5743   nwritten = 0U;
5744 
5745   /* Last value of ascending is at ascending[n] */
5746   /* Use <= n instead of < n, because we want ascending[n] to be taken care of by unpack_00, not a check for remainder == 0 */
5747   for (positioni = 0; positioni + BLOCKSIZE <= n; positioni += BLOCKSIZE) {
5748 #if 0
5749     /* Pointer */
5750     ptrs[ptri++] = nwritten/4;	/* In 128-bit registers */
5751 #endif
5752 
5753     /* Value for start of block */
         /* A while loop, so a start value that jumps several pages records one entry per page crossed */
5754     while (ascending[positioni] >= nextpage) {
5755       fprintf(stderr,"\nAt position %llu (block %llu), ascending %llu >= nextpage %llu",
5756 	      (unsigned long long) positioni,(unsigned long long) (positioni/BLOCKSIZE),ascending[positioni],nextpage);
5757       pages[pagei++] = positioni/BLOCKSIZE;
5758       currpage = nextpage;
5759       nextpage += POSITIONS_PAGE;
5760     }
5761     ptrs[ptri++] = ascending[positioni] - currpage;
5762 #ifdef USE_ONE_FILE_FOR_FIXED
5763     if (ptri == 4) {
5764       if (FWRITE_UINTS(ptrs,4,comp_fp) != (size_t) 4) {
5765 	fprintf(stderr,"Error in Bitpack64_write_fixed10_huge: %s\n",strerror(errno));
5766 	exit(9);
5767       }
5768       ptri = 0;
5769     }
5770 #endif
5771 
5772     /* Pack block of 64 diffs */
         /* NOTE(review): compute_q4_diffs_bidir_huge is not visible in this part of the file;
            the enabled code above calls Bitpack64_compute_q4_diffs_bidir_huge with the same
            argument order -- confirm which name exists before re-enabling */
5773     packsize = compute_q4_diffs_bidir_huge(diffs,&(ascending[positioni]));
5774     assert(packsize <= FIXED10_PACKSIZE);
         /* The block is written at the fixed packsize, not at the computed one */
5775     buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,FIXED10_PACKSIZE);
5776 
         /* NOTE(review): nwritten advances by the computed packsize, and its only reads are in the disabled "#if 0" sections */
5777     nwritten += 2 * packsize;
5778   }
5779 
5780   /* Old: Check for positioni < n, because if positioni == n, ascending[n] will be taken care of as metainfo */
5781   /* Use <= n instead of < n, because we want ascending[n] to be taken care of by unpack_00, not a check for remainder == 0 */
5782   if (positioni <= n) {
5783 #if 0
5784     /* Finish last block of 64 */
5785     ptrs[ptri++] = nwritten/4;	/* In 128-bit registers */
5786 #endif
5787 
5788     /* Value for start of block */
5789     while (ascending[positioni] >= nextpage) {
5790       fprintf(stderr,"\nAt position %llu (block %llu), ascending %llu >= nextpage %llu",
5791 	      (unsigned long long) positioni,(unsigned long long) (positioni/BLOCKSIZE),ascending[positioni],nextpage);
5792       pages[pagei++] = positioni/BLOCKSIZE;
5793       currpage = nextpage;
5794       nextpage += POSITIONS_PAGE;
5795     }
5796     ptrs[ptri++] = ascending[positioni] - currpage;
5797 #ifdef USE_ONE_FILE_FOR_FIXED
5798     if (ptri == 4) {
5799       if (FWRITE_UINTS(ptrs,4,comp_fp) != (size_t) 4) {
5800 	fprintf(stderr,"Error in Bitpack64_write_fixed10_huge: %s\n",strerror(errno));
5801 	exit(9);
5802       }
5803       ptri = 0;
5804     }
5805 #endif
5806 
5807     /* For differential, want <=.  For direct, want < */
         /* Copies ascending[positioni..n], rebased to the current page, into the front of last_block */
5808     for (i = 0; i <= (int) (n - positioni); i++) {
5809       last_block[i] = ascending[positioni+i] - currpage;
5810     }
5811     for ( ; i <= BLOCKSIZE; i++) {
5812       /* Copy last value for rest of block */
5813       last_block[i] = ascending[n] - currpage;
5814     }
5815 
5816     /* Pack block of < 64 diffs */
5817     packsize = compute_q4_diffs_bidir_huge(diffs,last_block);
5818     assert(packsize <= FIXED10_PACKSIZE);
5819     buffer_i = Bitpack64_write_columnar(comp_fp,buffer,buffer_size,buffer_i,diffs,FIXED10_PACKSIZE);
5820 
5821     nwritten += 2 * packsize;
5822   }
5823 
5824 
5825 #if 0
5826   /* Write the final pointer, which will point after the end of the file */
5827   ptrs[ptri++] = nwritten/4;	/* In 128-bit registers */
5828 #endif
5829 
5830   /* Value for end of block */
       /* Unlike the block-start checks above, this is a single "if": at most one page is recorded here */
5831   if (ascending[n] >= nextpage) {
5832     fprintf(stderr,"\nAt final oligo %llu (block %llu), ascending %llu >= nextpage %llu",
5833 	    (unsigned long long) n,(unsigned long long) (n/BLOCKSIZE),ascending[n],nextpage);
5834     pages[pagei++] = n/BLOCKSIZE;
5835     currpage = nextpage;
5836     /* nextpage += POSITIONS_PAGE; */
5837   }
5838   ptrs[ptri++] = ascending[n] - currpage;
5839 #ifdef USE_ONE_FILE_FOR_FIXED
       /* Zero-fill the unused slots so that a full group of 4 metavalues is written */
5840   for (i = ptri; i < 4; i++) {
5841     ptrs[i] = 0U;
5842   }
5843   if (FWRITE_UINTS(ptrs,4,comp_fp) != (size_t) 4) {
5844     fprintf(stderr,"Error in Bitpack64_write_fixed10_huge: %s\n",strerror(errno));
5845     exit(9);
5846   }
5847 #else
5848   if ((ptrs_fp = FOPEN_WRITE_BINARY(ptrsfile)) == NULL) {
5849     fprintf(stderr,"Can't write to ptrsfile %s: %s\n",ptrsfile,strerror(errno));
5850     abort();
5851   } else {
5852     if (FWRITE_UINTS(ptrs,ptri,ptrs_fp) != (size_t) ptri) {
5853       fprintf(stderr,"Error in Bitpack64_write_fixed10_huge: %s\n",strerror(errno));
5854       exit(9);
5855     }
5856     fclose(ptrs_fp);
5857   }
5858 #endif
5859   FREE(ptrs);
5860 
5861   /* Write pages */
       /* No pages file is produced when no page boundary was crossed */
5862   if (pagei > 0) {
5863     pages[pagei++] = (UINT4) -1; /* Final value */
5864     if ((pages_fp = FOPEN_WRITE_BINARY(pagesfile)) == NULL) {
5865       fprintf(stderr,"Can't write to pagesfile %s: %s\n",pagesfile,strerror(errno));
5866       abort();
5867     } else {
5868       fprintf(stderr,"\nHave %d pages:",pagei);
5869       for (i = 0; i < pagei; i++) {
5870 	fprintf(stderr," %u",pages[i]);
5871       }
5872       fprintf(stderr,"\n");
5873       if (FWRITE_UINTS(pages,pagei,pages_fp) != (size_t) pagei) {
5874 	fprintf(stderr,"Error in Bitpack64_write_fixed10_huge: %s\n",strerror(errno));
5875 	exit(9);
5876       }
5877       /* FREE(pages); */
5878       fclose(pages_fp);
5879     }
5880   }
5881 
5882   /* Empty buffer */
       /* Whatever is still held in the staging buffer goes to the compressed file before it is closed */
5883   if (buffer_i > 0) {
5884     if (FWRITE_UINTS(buffer,buffer_i,comp_fp) != (size_t) buffer_i) {
5885       fprintf(stderr,"Error in Bitpack64_write_fixed10_huge: %s\n",strerror(errno));
5886       exit(9);
5887     }
5888     buffer_i = 0;
5889   }
5890   FREE(buffer);
5891   fclose(comp_fp);
5892 
5893   return;
5894 }
5895 #endif
5896 
5897 
5898 
5899 static int
compute_packsize(UINT4 * values)5900 compute_packsize (UINT4 *values) {
5901   UINT4 packsize;
5902   UINT4 maxvalue = 0;
5903   int i;
5904   int firstbit;
5905 #ifdef HAVE_BUILTIN_CLZ
5906 #elif defined(HAVE_ASM_BSR)
5907   int msb;
5908 #endif
5909 
5910   for (i = 0; i < 64; i++) {
5911     maxvalue |= values[i];
5912   }
5913 
5914   if (maxvalue == 0) {
5915     /* __builtin_clz() behaves oddly on zero */
5916     return 0;
5917 
5918   } else {
5919 #ifdef HAVE_BUILTIN_CLZ
5920     firstbit = __builtin_clz(maxvalue);
5921     packsize = 32 - firstbit;
5922 #elif defined(HAVE_ASM_BSR)
5923     asm("bsr %1,%0" : "=r"(msb) : "r"(maxvalue));
5924     packsize = msb + 1;
5925 #else
5926     firstbit = ((maxvalue >> 16) ? clz_table[maxvalue >> 16] : 16 + clz_table[maxvalue]);
5927     packsize = 32 - firstbit;
5928 #endif
5929 
5930 #ifdef ALLOW_ODD_PACKSIZES
5931     return packsize;
5932 #else
5933     return (packsize + 1) & ~1;	/* Converts packsizes to the next multiple of 2 */
5934 #endif
5935   }
5936 }
5937 
5938 
5939 #if 0
5940 /* Stores the $n$ values [0..(n-1)] */
5941 /* Want to store values 0..n-1.  The value direct[n] does not exist.  */
5942 /* Stored in vertical order */
/* Bitpack64_write_direct: packs direct[0..n-1] in blocks of BLOCKSIZE
   values and writes two files.

   This definition sits inside an "#if 0" region of the file, so it is
   not compiled as the file stands.

   For each block, the running offset nwritten/4 is appended to ptrs
   before the block is packed; one more pointer is appended after the
   last block.  The packsize of each block comes from
   compute_packsize(), and the data is emitted through write_vert()
   (not visible in this part of the file; presumably it stages words in
   buffer and flushes to comp_fp when full -- confirm).

   Output files:
     ptrsfile - the block pointers
     compfile - the packed blocks

   Error handling: abort() if a file cannot be opened for writing,
   exit(9) if FWRITE_UINTS reports fewer items than requested. */
5943 void
5944 Bitpack64_write_direct (char *ptrsfile, char *compfile, UINT4 *direct, Oligospace_T n) {
5945   FILE *ptrs_fp, *comp_fp;
5946   UINT4 *ptrs, *p;
5947   size_t nptrs;
5948   int i;
5949   Oligospace_T positioni;
5950 
5951   UINT4 *buffer;
5952   int buffer_size = BUFFER_SIZE;
5953   int buffer_i;
5954 
5955   UINT4 last_block[BLOCKSIZE];
5956 
5957   UINT4 nwritten;
5958   int packsize;
5959 
5960 
5961   write_setup();
5962 
5963   /* 1 metavalue: nwritten (pointer).  Packsize can be
5964      computed from difference between successive pointers, if only
5965      even packsizes are allowed */
       /* One pointer per block of BLOCKSIZE values (rounded up), plus one for the final end pointer */
5966   p = ptrs = (UINT4 *) CALLOC(((n + BLOCKSIZE - 1)/BLOCKSIZE + 1) * DIRECT_METAINFO_SIZE,sizeof(UINT4));
5967 
5968   if ((comp_fp = FOPEN_WRITE_BINARY(compfile)) == NULL) {
5969     fprintf(stderr,"Can't write to compfile %s: %s\n",compfile,strerror(errno));
5970     abort();
5971   }
5972   buffer = (UINT4 *) CALLOC(buffer_size,sizeof(UINT4));
5973   buffer_i = 0;
5974 
5975   nwritten = 0U;
5976 
       /* Strict "<": when n is a multiple of BLOCKSIZE, the last full block is
          left for the tail branch below, which copies it through last_block */
5977   for (positioni = 0; positioni + BLOCKSIZE < n; positioni += BLOCKSIZE) {
5978     /* Pointer */
5979     *p++ = nwritten/4;	/* In 128-bit registers */
5980 
5981     /* Pack block of 64 diffs */
5982     packsize = compute_packsize(&(direct[positioni]));
5983     buffer_i = write_vert(comp_fp,buffer,buffer_size,buffer_i,&(direct[positioni]),packsize);
5984 
5985 #ifdef ALLOW_ODD_PACKSIZES
         /* Advance by the packsize rounded up to an even number */
5986     nwritten += 2 * ((packsize + 1) & ~1);
5987 #else
5988     nwritten += 2 * packsize;
5989 #endif
5990   }
5991 
5992   if (positioni < n) {
5993     /* Finish last block of 64 */
5994     *p++ = nwritten/4;	/* In 128-bit registers */
5995 
         /* Copy the remaining values, then zero-pad up to BLOCKSIZE entries */
5996     i = 0;
5997     while (positioni < n) {
5998       last_block[i++] = direct[positioni++];
5999     }
6000     while (i < BLOCKSIZE) {
6001       last_block[i++] = 0;
6002     }
6003 
6004     packsize = compute_packsize(last_block);
6005     buffer_i = write_vert(comp_fp,buffer,buffer_size,buffer_i,last_block,packsize);
6006 
6007 #ifdef ALLOW_ODD_PACKSIZES
6008     nwritten += 2 * ((packsize + 1) & ~1);
6009 #else
6010     nwritten += 2 * packsize;
6011 #endif
6012   }
6013 
6014   /* Write the final pointer, which will point after the end of the
6015      file */
6016   *p++ = nwritten/4;	/* In 128-bit registers */
6017 
6018   if ((ptrs_fp = FOPEN_WRITE_BINARY(ptrsfile)) == NULL) {
6019     fprintf(stderr,"Can't write to ptrsfile %s: %s\n",ptrsfile,strerror(errno));
6020     abort();
6021   } else {
         /* Number of pointers actually stored, taken from how far p has advanced */
6022     nptrs = p - ptrs;
6023     if (FWRITE_UINTS(ptrs,nptrs,ptrs_fp) != (size_t) nptrs) {
6024       fprintf(stderr,"Error in Bitpack64_write_direct: %s\n",strerror(errno));
6025       exit(9);
6026     }
6027     FREE(ptrs);
6028     fclose(ptrs_fp);
6029   }
6030 
6031   /* Empty buffer */
       /* Whatever is still held in the staging buffer goes to the compressed file before it is closed */
6032   if (buffer_i > 0) {
6033     if (FWRITE_UINTS(buffer,buffer_i,comp_fp) != (size_t) buffer_i) {
6034       fprintf(stderr,"Error in Bitpack64_write_direct: %s\n",strerror(errno));
6035       exit(9);
6036     }
6037     buffer_i = 0;
6038   }
6039   FREE(buffer);
6040   fclose(comp_fp);
6041 
6042   return;
6043 }
6044 #endif
6045 
6046 
6047