1 /* This file, swapproc.c, contains general utility routines that are */
2 /* used by other FITSIO routines to swap bytes. */
3
4 /* The FITSIO software was written by William Pence at the High Energy */
/* Astrophysics Science Archive Research Center (HEASARC) at the NASA */
6 /* Goddard Space Flight Center. */
7
8 /* The fast SSE2 and SSSE3 functions were provided by Julian Taylor, ESO */
9
10 #include <string.h>
11 #include <stdlib.h>
12 #include "fitsio2.h"
13
14 /* bswap builtin is available since GCC 4.3 */
15 #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)
16 #define HAVE_BSWAP
17 #endif
18
19 #ifdef __SSSE3__
20 #include <tmmintrin.h>
/*
 * Reorder the 16 bytes stored at `values`, in place, using `mask` as the
 * byte selector passed to _mm_shuffle_epi8.
 * `values` must be 16-byte aligned: the aligned load/store forms are used.
 */
static inline void swap_ssse3(char * values, __m128i mask)
{
    __m128i *const slot = (__m128i *)values;

    _mm_store_si128(slot, _mm_shuffle_epi8(_mm_load_si128(slot), mask));
}
28 #endif
29 #ifdef __SSE2__
30 #include <emmintrin.h>
/*
 * Byte-swap the eight 16-bit lanes stored at `values`, in place.
 * `values` must be 16-byte aligned: the aligned load/store forms are used.
 * Kept alongside the SSSE3 shuffle because it is the faster variant for
 * shorts.
 */
static inline void swap2_sse2(char * values)
{
    __m128i *const slot = (__m128i *)values;
    const __m128i lanes = _mm_load_si128(slot);
    /* per 16-bit lane: high byte moved down, low byte moved up */
    const __m128i high_to_low = _mm_srli_epi16(lanes, 8);
    const __m128i low_to_high = _mm_slli_epi16(lanes, 8);

    _mm_store_si128(slot, _mm_or_si128(high_to_low, low_to_high));
}
42 /* the three shuffles required for 4 and 8 byte variants make
43 * SSE2 slower than bswap */
44
45
/*
 * Number of leading elements to handle one at a time before `addr` reaches
 * an `alignment`-byte boundary.
 *
 *   addr       start address of the array
 *   esize      size of one element in bytes
 *   nvals      number of elements in the array
 *   alignment  required alignment in bytes
 *
 * Returns 0 when `addr` is already a multiple of `alignment`; otherwise
 * (alignment - addr % alignment) / esize.  The result never exceeds `nvals`.
 */
static inline size_t get_peel(void * addr, size_t esize, size_t nvals,
                              size_t alignment)
{
    const size_t misalign = (size_t)addr % alignment;
    size_t lead = 0;

    if (misalign != 0) {
        lead = (alignment - misalign) / esize;
    }
    return (lead > nvals) ? nvals : lead;
}
55 #endif
56
57 /*--------------------------------------------------------------------------*/
/*
 * Scalar byte swap of `nvals` 16-bit values, in place: ( 0 1 -> 1 0 ).
 * The values are treated as unsigned so the right shift brings in zeros.
 * A count of zero or less leaves the buffer untouched.
 */
static void ffswap2_slow(short *svalues, long nvals)
{
    unsigned short *word = (unsigned short *) svalues;
    long left;

    for (left = nvals; left > 0; left--, word++)
    {
        const unsigned short v = *word;

        *word = (unsigned short)((v >> 8) | (v << 8));
    }
}
70 /*--------------------------------------------------------------------------*/
71 #if __SSE2__
void ffswap2(short *svalues,  /* IO - pointer to shorts to be swapped    */
             long nvals)      /* I  - number of shorts to be swapped     */
/*
  swap the bytes in the input short integers: ( 0 1 -> 1 0 )

  SSE2 variant.  The array is processed in three parts:
    1. leading elements, one at a time, until a 16-byte boundary is reached;
    2. whole groups of 8 shorts with the aligned SSE2 kernel;
    3. whatever is left over, one at a time.
  A count of zero or less is a no-op, as in the portable variant.
*/
{
    long ii;
    long vec_end;
    size_t peel;

    /* A negative count would turn into a huge size_t in the arithmetic
       below and send the vector loop far beyond the buffer. */
    if (nvals <= 0) {
        return;
    }

    if ((size_t)svalues % 2 != 0) { /* should not happen */
        ffswap2_slow(svalues, nvals);
        return;
    }

    peel = get_peel((void*)&svalues[0], sizeof(svalues[0]), nvals, 16);

    /* peel <= nvals, so this is peel plus a whole number of 8-short groups */
    vec_end = nvals - (nvals - (long)peel) % 8;

    ffswap2_slow(svalues, (long)peel);
    for (ii = (long)peel; ii < vec_end; ii += 8) {
        swap2_sse2((char*)&svalues[ii]);
    }
    ffswap2_slow(&svalues[ii], nvals - ii);
}
92 #else
void ffswap2(short *svalues,  /* IO - pointer to shorts to be swapped    */
             long nvals)      /* I  - number of shorts to be swapped     */
/*
  swap the bytes in the input short integers: ( 0 1 -> 1 0 )

  Variant compiled when SSE2 is not enabled; it hands the whole array to
  the scalar routine.
*/
{
    ffswap2_slow(svalues, nvals);
}
101 #endif
102 /*--------------------------------------------------------------------------*/
/*
 * Scalar byte reversal of `nvals` 4-byte values, in place:
 * ( 0 1 2 3 -> 3 2 1 0 ).
 * Uses the GCC builtin when HAVE_BSWAP is defined, the Microsoft Visual C++
 * intrinsic (8.0 and later) when available, and a plain byte exchange
 * otherwise.  A count of zero or less leaves the buffer untouched.
 */
static void ffswap4_slow(INT32BIT *ivalues, long nvals)
{
    long idx;

#if defined(HAVE_BSWAP)
    for (idx = 0; idx < nvals; idx++)
    {
        INT32BIT *const word = &ivalues[idx];

        *word = __builtin_bswap32(*word);
    }
#elif defined(_MSC_VER) && (_MSC_VER >= 1400)
    /* intrinsic byte swapping function in Microsoft Visual C++ 8.0 and later */
    unsigned int *const words = (unsigned int *) ivalues;

    for (idx = 0; idx < nvals; idx++)
    {
        words[idx] = _byteswap_ulong(words[idx]);
    }
#else
    for (idx = 0; idx < nvals; idx++)
    {
        char *const b = (char *) &ivalues[idx];
        const char outer = b[0];
        const char inner = b[1];

        /* exchange the outer pair, then the inner pair */
        b[0] = b[3];
        b[3] = outer;
        b[1] = b[2];
        b[2] = inner;
    }
#endif
}
136 /*--------------------------------------------------------------------------*/
137 #ifdef __SSSE3__
void ffswap4(INT32BIT *ivalues,  /* IO - pointer to INT*4 to be swapped       */
             long nvals)         /* I  - number of 4-byte values to be swapped */
/*
  swap the bytes in the input 4-byte integer: ( 0 1 2 3 -> 3 2 1 0 )

  SSSE3 variant.  The array is processed in three parts:
    1. leading elements, one at a time, until a 16-byte boundary is reached;
    2. whole groups of 4 values with the aligned shuffle kernel;
    3. whatever is left over, one at a time.
  A count of zero or less is a no-op, as in the portable variant.
*/
{
    long ii;
    long vec_end;
    size_t peel;
    /* _mm_set_epi8 lists bytes from the highest (15) down to the lowest (0),
       so this selector reverses the bytes inside each 4-byte group. */
    const __m128i cmask4 = _mm_set_epi8(12, 13, 14, 15,
                                        8, 9, 10, 11,
                                        4, 5, 6, 7,
                                        0, 1, 2 ,3);

    /* A negative count would turn into a huge size_t in the arithmetic
       below and send the vector loop far beyond the buffer. */
    if (nvals <= 0) {
        return;
    }

    if ((size_t)ivalues % 4 != 0) { /* should not happen */
        ffswap4_slow(ivalues, nvals);
        return;
    }

    peel = get_peel((void*)&ivalues[0], sizeof(ivalues[0]), nvals, 16);

    /* peel <= nvals, so this is peel plus a whole number of 4-value groups */
    vec_end = nvals - (nvals - (long)peel) % 4;

    ffswap4_slow(ivalues, (long)peel);
    for (ii = (long)peel; ii < vec_end; ii += 4) {
        swap_ssse3((char*)&ivalues[ii], cmask4);
    }
    ffswap4_slow(&ivalues[ii], nvals - ii);
}
161 #else
void ffswap4(INT32BIT *ivalues, long nvals)
/*
  Reverse the byte order of each 4-byte value in place:
  ( 0 1 2 3 -> 3 2 1 0 )

    ivalues   IO - pointer to the INT*4 values to be swapped
    nvals     I  - number of 4-byte values to be swapped

  Variant compiled when SSSE3 is not enabled; it hands the whole array to
  the scalar routine.
*/
{
    ffswap4_slow(ivalues, nvals);
}
170 #endif
171 /*--------------------------------------------------------------------------*/
/*
 * Scalar byte reversal of `nvals` 8-byte values, in place:
 * ( 01234567 -> 76543210 ).
 *
 * The buffer is walked as raw bytes.  On the intrinsic paths each value is
 * copied into an integer temporary with memcpy, swapped, and copied back,
 * so the storage of a double is never read or written through an integer
 * lvalue and no typed 8-byte access is made on the buffer itself (this
 * routine is also the one ffswap8 falls back to for misaligned input).
 * A count of zero or less leaves the buffer untouched.
 */
static void ffswap8_slow(double *dvalues, long nvals)
{
    unsigned char *bytes = (unsigned char *) dvalues;
    long ii;

    for (ii = 0; ii < nvals; ii++, bytes += 8)
    {
#if defined(HAVE_BSWAP)
        unsigned long long bits;

        memcpy(&bits, bytes, 8);
        bits = __builtin_bswap64(bits);
        memcpy(bytes, &bits, 8);
#elif defined(_MSC_VER) && (_MSC_VER >= 1400)
        /* intrinsic byte swapping function in Microsoft Visual C++ 8.0 and later */
        unsigned __int64 bits;

        memcpy(&bits, bytes, 8);
        bits = _byteswap_uint64(bits);
        memcpy(bytes, &bits, 8);
#else
        unsigned char temp;

        /* exchange mirrored byte pairs, outermost first */
        temp = bytes[0];
        bytes[0] = bytes[7];
        bytes[7] = temp;

        temp = bytes[1];
        bytes[1] = bytes[6];
        bytes[6] = temp;

        temp = bytes[2];
        bytes[2] = bytes[5];
        bytes[5] = temp;

        temp = bytes[3];
        bytes[3] = bytes[4];
        bytes[4] = temp;
#endif
    }
}
215 /*--------------------------------------------------------------------------*/
216 #ifdef __SSSE3__
void ffswap8(double *dvalues,  /* IO - pointer to doubles to be swapped     */
             long nvals)       /* I  - number of doubles to be swapped      */
/*
  swap the bytes in the input doubles: ( 01234567 -> 76543210 )

  SSSE3 variant.  The array is processed in three parts:
    1. a leading element, if needed, to reach a 16-byte boundary;
    2. whole pairs of doubles with the aligned shuffle kernel;
    3. whatever is left over, one at a time.
  A count of zero or less is a no-op, as in the portable variant.
*/
{
    long ii;
    long vec_end;
    size_t peel;
    /* _mm_set_epi8 lists bytes from the highest (15) down to the lowest (0),
       so this selector reverses the bytes inside each 8-byte half. */
    const __m128i cmask8 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15,
                                        0, 1, 2 ,3, 4, 5, 6, 7);

    /* A negative count would turn into a huge size_t in the arithmetic
       below and send the vector loop far beyond the buffer. */
    if (nvals <= 0) {
        return;
    }

    if ((size_t)dvalues % 8 != 0) { /* should not happen on amd64 */
        ffswap8_slow(dvalues, nvals);
        return;
    }

    peel = get_peel((void*)&dvalues[0], sizeof(dvalues[0]), nvals, 16);

    /* peel <= nvals, so this is peel plus a whole number of pairs */
    vec_end = nvals - (nvals - (long)peel) % 2;

    ffswap8_slow(dvalues, (long)peel);
    for (ii = (long)peel; ii < vec_end; ii += 2) {
        swap_ssse3((char*)&dvalues[ii], cmask8);
    }
    ffswap8_slow(&dvalues[ii], nvals - ii);
}
238 #else
void ffswap8(double *dvalues, long nvals)
/*
  Reverse the byte order of each double in place:
  ( 01234567 -> 76543210 )

    dvalues   IO - pointer to the doubles to be swapped
    nvals     I  - number of doubles to be swapped

  Variant compiled when SSSE3 is not enabled; it hands the whole array to
  the scalar routine.
*/
{
    ffswap8_slow(dvalues, nvals);
}
247 #endif
248