1 /*
2  * Copyright 2020 The Emscripten Authors.  All rights reserved.
3  * Emscripten is available under two separate licenses, the MIT license and the
4  * University of Illinois/NCSA Open Source License.  Both these licenses can be
5  * found in the LICENSE file.
6  */
7 #pragma once
8 
9 #include <stdio.h>
10 #include <math.h>
11 #include <time.h>
12 #include <inttypes.h>
13 #include <float.h>
14 #include <assert.h>
15 #include <string.h>
16 
// Names the test code uses for its scalar element types: the
// emscripten_align1_* types when building with Emscripten, and the plain
// scalar types on every other compiler.
#if defined(__EMSCRIPTEN__)
#include <emscripten/emscripten.h>
#define align1_int    emscripten_align1_int
#define align1_int64  emscripten_align1_int64
#define align1_float  emscripten_align1_float
#define align1_double emscripten_align1_double
#else
#define align1_int    int
#define align1_int64  int64_t
#define align1_float  float
#define align1_double double
#endif
29 
// Returns the raw bit pattern of the float f as a 32-bit unsigned integer.
// The bytes are copied with memcpy instead of being read through a casted
// pointer, because reading a float object through a uint32_t lvalue breaks
// the strict-aliasing rule.
uint32_t fcastu(float f)
{
	uint32_t bits;
	memcpy(&bits, &f, sizeof bits);
	return bits;
}
// Returns the raw bit pattern of the double f as a 64-bit unsigned integer.
// Uses memcpy rather than a pointer cast so the access does not break the
// strict-aliasing rule.
uint64_t dcastu(double f)
{
	uint64_t bits;
	memcpy(&bits, &f, sizeof bits);
	return bits;
}
// Returns the float whose raw bit pattern is t (the inverse of fcastu).
// Uses memcpy rather than a pointer cast so the access does not break the
// strict-aliasing rule.
float ucastf(uint32_t t)
{
	float value;
	memcpy(&value, &t, sizeof value);
	return value;
}
// Returns the double whose raw bit pattern is t (the inverse of dcastu).
// Uses memcpy rather than a pointer cast so the access does not break the
// strict-aliasing rule.
double ucastd(uint64_t t)
{
	double value;
	memcpy(&value, &t, sizeof value);
	return value;
}
35 
36 // Data used in test. Store them global and access via a getter to confuse optimizer to not "solve" the whole test suite at compile-time,
37 // so that the operation will actually be performed at runtime, and not at compile-time. (Testing the capacity of the compiler to perform
38 // SIMD ops at compile-time would be interesting as well, but that's for another test)
39 float interesting_floats_[] = { -INFINITY, -FLT_MAX, -2.5f, -1.5f, -1.4f, -1.0f, -0.5f, -0.2f, -FLT_MIN, -0.f, 0.f,
40                                 1.401298464e-45f, FLT_MIN, 0.3f, 0.5f, 0.8f, 1.0f, 1.5f, 2.5f, 3.5f, 3.6f, FLT_MAX, INFINITY, NAN,
41                                 ucastf(0x01020304), ucastf(0x80000000), ucastf(0x7FFFFFFF), ucastf(0xFFFFFFFF)
42                             };
43 
44 double interesting_doubles_[] = { -INFINITY, -FLT_MAX, -2.5, -1.5, -1.4, -1.0, -0.5, -0.2, -FLT_MIN, -0.0, 0.0,
45                                 1.401298464e-45, FLT_MIN, 0.3, 0.5, 0.8, 1.0, 1.5, 2.5, 3.5, 3.6, FLT_MAX, INFINITY, NAN,
46                                 ucastd(0x0102030405060708ULL), ucastd(0x8000000000000000ULL),
47                                 ucastd(0x7FFFFFFFFFFFFFFFULL), ucastd(0xFFFFFFFFFFFFFFFFULL)
48                                 };
49 
50 uint32_t interesting_ints_[] = { 0, 1, 2, 3, 0x01020304, 0x10203040, 0x7FFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0x12345678, 0x9ABCDEF1, 0x80000000,
51                                  0x80808080, 0x7F7F7F7F, 0x01010101, 0x11111111, 0x20202020, 0x0F0F0F0F, 0xF0F0F0F0,
52                                  fcastu(-INFINITY), fcastu(-FLT_MAX), fcastu(-2.5f), fcastu(-1.5f), fcastu(-1.4f), fcastu(-1.0f), fcastu(-0.5f),
53                                  fcastu(-0.2f), fcastu(-FLT_MIN), 0xF9301AB9, 0x0039AB12, 0x19302BCD,
54                                  fcastu(1.401298464e-45f), fcastu(FLT_MIN), fcastu(0.3f), fcastu(0.5f), fcastu(0.8f), fcastu(1.0f), fcastu(1.5f),
55                                  fcastu(2.5f), fcastu(3.5f), fcastu(3.6f), fcastu(FLT_MAX), fcastu(INFINITY), fcastu(NAN) };
56 
// Returns true whenever time(NULL) is nonzero. The answer comes from a runtime
// library call, so the compiler should not be able to treat it as a constant.
bool always_true()
{
	const time_t now = time(NULL);
	return now != 0;
}
58 
IsNan(float f)59 bool IsNan(float f) { return (fcastu(f) << 1) > 0xFF000000u; }
60 
// Replaces every occurrence of 'src' in 'str' with 'dst', editing 'str' in place.
// Requires strlen(dst) <= strlen(src) (asserted), so the string never grows.
// An empty 'src' matches nothing and leaves 'str' untouched.
void contract_inplace(char *str, const char *src, const char *dst)
{
	const size_t srcLen = strlen(src);
	const size_t dstLen = strlen(dst);
	assert(dstLen <= srcLen);
	if (srcLen == 0) return; // strstr() finds an empty needle at every position, which would never terminate.

	char *pos;
	while ((pos = strstr(str, src)) != NULL)
	{
		// Write the replacement without its terminator so the text after the match survives.
		memcpy(pos, dst, dstLen);
		if (dstLen < srcLen)
		{
			// Close the gap left by the shorter replacement. Source and destination overlap,
			// so memmove is required; the +1 carries the terminating NUL along.
			memmove(pos + dstLen, pos + srcLen, strlen(pos + srcLen) + 1);
			// Rescan from the replacement itself: closing the gap can form a new match there.
			str = pos;
		}
		else
		{
			// Same-length replacement: continue after it so that dst == src cannot loop forever.
			str = pos + dstLen;
		}
	}
}
79 
80 // sprintf standard does not allow controlling how many leading zeros to use
81 // for printing out the exponent, and different compilers give different
82 // values. Perform a canonicalization step that enforces the printouts are
83 // the same.
CanonicalizeStringComparisons(char * s)84 void CanonicalizeStringComparisons(char *s)
85 {
86 	contract_inplace(s, "e+00", "e+");
87 	contract_inplace(s, "e-00", "e-");
88 	contract_inplace(s, "e+0", "e+");
89 	contract_inplace(s, "e-0", "e-");
90 	contract_inplace(s, "1.#INF", "inf");
91 }
92 
93 // Global test state that is used per-test to determine whether to validate the state of exact NaN bits
94 // in specific functions.
95 extern bool testNaNBits;
96 
97 char *SerializeFloat(float f, char *dstStr, bool approximate = false)
98 {
99 	if (IsNan(f))
100 	{
101 		uint32_t u = fcastu(f);
102 		int numChars = testNaNBits ? sprintf(dstStr, "NaN(0x%8X)", (unsigned int)u) : sprintf(dstStr, "NaN");
103 		return dstStr + numChars;
104 	}
105 	else
106 	{
107 		if (approximate > 0)
108 		{
109 			if (fabs(f) < FLT_MIN) // Flush denormals to zero (for _mm_rcp_ps)
110 				sprintf(dstStr, "%f", copysign(0.f, f));
111 			else if (fabs(f) >= 2.6e22f) // Flush large numbers to infinity (for _mm_rsqrt_ps)
112 				sprintf(dstStr, "%f", copysign(INFINITY, f));
113 			else
114 				sprintf(dstStr, "%.2g", f);
115 		}
116 		else
117 			sprintf(dstStr, "%.9g", f);
118 		CanonicalizeStringComparisons(dstStr);
119 		return dstStr + strlen(dstStr);
120 	}
121 }
122 
SerializeDouble(double f,char * dstStr)123 char *SerializeDouble(double f, char *dstStr)
124 {
125 	if (IsNan(f))
126 	{
127 		uint64_t u = dcastu(f);
128 		int numChars = testNaNBits ? sprintf(dstStr, "NaN(0x%08X%08X)", (unsigned int)(u>>32), (unsigned int)u) : sprintf(dstStr, "NaN");
129 		return dstStr + numChars;
130 	}
131 	else
132 	{
133 		sprintf(dstStr, "%.17g", f);
134 		CanonicalizeStringComparisons(dstStr);
135 		return dstStr + strlen(dstStr);
136 	}
137 }
138 
tostr(__m128 * m,char * outstr)139 void tostr(__m128 *m, char *outstr)
140 {
141 	union { __m128 m; float val[4]; } u;
142 	u.m = *m;
143 	char s[4][32];
144 	SerializeFloat(u.val[0], s[0]);
145 	SerializeFloat(u.val[1], s[1]);
146 	SerializeFloat(u.val[2], s[2]);
147 	SerializeFloat(u.val[3], s[3]);
148 	sprintf(outstr, "[%s,%s,%s,%s]", s[3], s[2], s[1], s[0]);
149 }
150 
tostr_approx(__m128 * m,char * outstr,bool approximate)151 void tostr_approx(__m128 *m, char *outstr, bool approximate)
152 {
153 	union { __m128 m; float val[4]; } u;
154 	u.m = *m;
155 	char s[4][32];
156 	SerializeFloat(u.val[0], s[0], approximate);
157 	SerializeFloat(u.val[1], s[1], approximate);
158 	SerializeFloat(u.val[2], s[2], approximate);
159 	SerializeFloat(u.val[3], s[3], approximate);
160 	sprintf(outstr, "[%s,%s,%s,%s]", s[3], s[2], s[1], s[0]);
161 }
162 
tostr(__m128i * m,char * outstr)163 void tostr(__m128i *m, char *outstr)
164 {
165 	union { __m128i m; uint32_t val[4]; } u;
166 	u.m = *m;
167 	sprintf(outstr, "[0x%08X,0x%08X,0x%08X,0x%08X]", u.val[3], u.val[2], u.val[1], u.val[0]);
168 }
169 
170 #ifdef __SSE2__
171 
tostr(__m128d * m,char * outstr)172 void tostr(__m128d *m, char *outstr)
173 {
174 	union { __m128d m; double val[2]; } u;
175 	u.m = *m;
176 	char s[2][64];
177 	SerializeDouble(u.val[0], s[0]);
178 	SerializeDouble(u.val[1], s[1]);
179 	sprintf(outstr, "[%s,%s]", s[1], s[0]);
180 }
181 
ExtractInRandomOrder(uint32_t * arr,int i,int n,int prime)182 __m128i ExtractInRandomOrder(uint32_t *arr, int i, int n, int prime)
183 {
184 	return _mm_set_epi32(arr[(i*prime)%n], arr[((i+1)*prime)%n], arr[((i+2)*prime)%n], arr[((i+3)*prime)%n]);
185 }
186 
ExtractInRandomOrder(double * arr,int i,int n,int prime)187 __m128d ExtractInRandomOrder(double *arr, int i, int n, int prime)
188 {
189 	return _mm_set_pd(arr[(i*prime)%n], arr[((i+1)*prime)%n]);
190 }
191 #endif
192 
tostr(align1_int * m,char * outstr)193 void tostr(align1_int *m, char *outstr)
194 {
195 	sprintf(outstr, "0x%08X", *m);
196 }
197 
tostr(align1_int64 * m,char * outstr)198 void tostr(align1_int64 *m, char *outstr)
199 {
200 	sprintf(outstr, "0x%08X%08X", (int)(*m >> 32), (int)*m);
201 }
202 
tostr(align1_float * m,char * outstr)203 void tostr(align1_float *m, char *outstr)
204 {
205 	SerializeFloat(*m, outstr);
206 }
207 
tostr(align1_double * m,char * outstr)208 void tostr(align1_double *m, char *outstr)
209 {
210 	SerializeDouble(*m, outstr);
211 }
212 
tostr(align1_double * m,int numElems,char * outstr)213 void tostr(align1_double *m, int numElems, char *outstr)
214 {
215 	char s[2][64];
216 	for(int i = 0; i < numElems; ++i)
217 		SerializeDouble(m[i], s[i]);
218 	switch(numElems)
219 	{
220 		case 1: sprintf(outstr, "{%s}", s[0]); break;
221 		case 2: sprintf(outstr, "{%s,%s}", s[0], s[1]); break;
222 	}
223 }
224 
tostr(align1_float * m,int numElems,char * outstr)225 void tostr(align1_float *m, int numElems, char *outstr)
226 {
227 	char s[4][64];
228 	for(int i = 0; i < numElems; ++i)
229 		SerializeFloat(m[i], s[i]);
230 	switch(numElems)
231 	{
232 		case 1: sprintf(outstr, "{%s}", s[0]); break;
233 		case 2: sprintf(outstr, "{%s,%s}", s[0], s[1]); break;
234 		case 3: sprintf(outstr, "{%s,%s,%s}", s[0], s[1], s[2]); break;
235 		case 4: sprintf(outstr, "{%s,%s,%s,%s}", s[0], s[1], s[2], s[3]); break;
236 	}
237 }
238 
tostr(align1_int * s,int numElems,char * outstr)239 void tostr(align1_int *s, int numElems, char *outstr)
240 {
241 	switch(numElems)
242 	{
243 		case 1: sprintf(outstr, "{0x%08X}", s[0]); break;
244 		case 2: sprintf(outstr, "{0x%08X,0x%08X}", s[0], s[1]); break;
245 		case 3: sprintf(outstr, "{0x%08X,0x%08X,0x%08X}", s[0], s[1], s[2]); break;
246 		case 4: sprintf(outstr, "{0x%08X,0x%08X,0x%08X,0x%08X}", s[0], s[1], s[2], s[3]); break;
247 	}
248 }
249 
tostr(align1_int64 * m,int numElems,char * outstr)250 void tostr(align1_int64 *m, int numElems, char *outstr)
251 {
252 	switch(numElems)
253 	{
254 		case 1: sprintf(outstr, "{0x%08X%08X}", (int)(*m >> 32), (int)*m); break;
255 		case 2: sprintf(outstr, "{0x%08X%08X,0x%08X%08X}", (int)(*m >> 32), (int)*m, (int)(m[1] >> 32), (int)m[1]);
256 	}
257 }
258 
259 // Accessors to the test data in a way that the compiler can't optimize at compile-time.
get_interesting_floats()260 __attribute__((noinline)) float *get_interesting_floats()
261 {
262 	return always_true() ? interesting_floats_ : 0;
263 }
264 
get_interesting_ints()265 __attribute__((noinline)) uint32_t *get_interesting_ints()
266 {
267 	return always_true() ? interesting_ints_ : 0;
268 }
269 
get_interesting_doubles()270 __attribute__((noinline)) double *get_interesting_doubles()
271 {
272 	return always_true() ? interesting_doubles_ : 0;
273 }
274 
ExtractFloatInRandomOrder(float * arr,int i,int n,int prime)275 __m128 ExtractFloatInRandomOrder(float *arr, int i, int n, int prime)
276 {
277 	return _mm_set_ps(arr[(i*prime)%n], arr[((i+1)*prime)%n], arr[((i+2)*prime)%n], arr[((i+3)*prime)%n]);
278 }
279 
280 #ifdef __SSE2__
ExtractDoubleInRandomOrder(double * arr,int i,int n,int prime)281 __m128d ExtractDoubleInRandomOrder(double *arr, int i, int n, int prime)
282 {
283 	return _mm_set_pd(arr[(i*prime)%n], arr[((i+1)*prime)%n]);
284 }
285 #endif
286 
ExtractIntInRandomOrder(unsigned int * arr,int i,int n,int prime)287 __m128 ExtractIntInRandomOrder(unsigned int *arr, int i, int n, int prime)
288 {
289 	return _mm_set_ps(*(float*)&arr[(i*prime)%n], *(float*)&arr[((i+1)*prime)%n], *(float*)&arr[((i+2)*prime)%n], *(float*)&arr[((i+3)*prime)%n]);
290 }
291 
// Shorthand vector builders. The E1* forms call the Extract*InRandomOrder
// helpers with a multiplier of 1, the E2* forms with a multiplier of 1787.
#define E1(arr, i, n) ExtractFloatInRandomOrder((arr), (i), (n), 1)
#define E2(arr, i, n) ExtractFloatInRandomOrder((arr), (i), (n), 1787)

// Double-precision variants.
#define E1_Double(arr, i, n) ExtractDoubleInRandomOrder((arr), (i), (n), 1)
#define E2_Double(arr, i, n) ExtractDoubleInRandomOrder((arr), (i), (n), 1787)

// Integer-bit-pattern variants (the result type is still __m128).
#define E1_Int(arr, i, n) ExtractIntInRandomOrder((arr), (i), (n), 1)
#define E2_Int(arr, i, n) ExtractIntInRandomOrder((arr), (i), (n), 1787)
300 
// For pairs of integer vectors (lhs built with E1_Int, rhs with E2_Int) prints
// func(lhs, rhs) followed by func(rhs, lhs). Both printed lines list the
// operands in (lhs, rhs) order.
#define M128i_M128i_M128i(func) \
	for(int i = 0; i < numInterestingInts / 4; ++i) \
		for(int k = 0; k < 4; ++k) \
			for(int j = 0; j < numInterestingInts / 4; ++j) \
			{ \
				char lhsText[256]; \
				char rhsText[256]; \
				char resultText[256]; \
				__m128i lhs = (__m128i)E1_Int(interesting_ints, i*4+k, numInterestingInts); \
				__m128i rhs = (__m128i)E2_Int(interesting_ints, j*4, numInterestingInts); \
				/* func(lhs, rhs) */ \
				__m128i result = func(lhs, rhs); \
				tostr(&lhs, lhsText); \
				tostr(&rhs, rhsText); \
				tostr(&result, resultText); \
				printf("%s(%s, %s) = %s\n", #func, lhsText, rhsText, resultText); \
				/* func(rhs, lhs) */ \
				result = func(rhs, lhs); \
				tostr(&lhs, lhsText); \
				tostr(&rhs, rhsText); \
				tostr(&result, resultText); \
				printf("%s(%s, %s) = %s\n", #func, lhsText, rhsText, resultText); \
			}
321 
// Prints func(input, Tint) for each float vector that E1 builds from
// interesting_floats at start index i*4+k. Tint is printed with %d.
#define Ret_M128_Tint_body(Ret_type, func, Tint) \
	for(int i = 0; i < numInterestingFloats / 4; ++i) \
		for(int k = 0; k < 4; ++k) \
		{ \
			char inputText[256]; \
			char resultText[256]; \
			__m128 input = E1(interesting_floats, i*4+k, numInterestingFloats); \
			Ret_type result = func(input, Tint); \
			tostr(&input, inputText); \
			tostr(&result, resultText); \
			printf("%s(%s, %d) = %s\n", #func, inputText, Tint, resultText); \
		}
332 
// Prints func(input, Tint) for each double vector that E1_Double builds from
// interesting_doubles at start index i*2+k. Tint is printed with %d.
#define Ret_M128d_Tint_body(Ret_type, func, Tint) \
	for(int i = 0; i < numInterestingDoubles / 2; ++i) \
		for(int k = 0; k < 2; ++k) \
		{ \
			char inputText[256]; \
			char resultText[256]; \
			__m128d input = E1_Double(interesting_doubles, i*2+k, numInterestingDoubles); \
			Ret_type result = func(input, Tint); \
			tostr(&input, inputText); \
			tostr(&result, resultText); \
			printf("%s(%s, %d) = %s\n", #func, inputText, Tint, resultText); \
		}
343 
// Prints func(input, Tint) for each integer vector that E1_Int builds from
// interesting_ints at start index i*4+k. Tint is printed with %d.
#define Ret_M128i_Tint_body(Ret_type, func, Tint) \
	for(int i = 0; i < numInterestingInts / 4; ++i) \
		for(int k = 0; k < 4; ++k) \
		{ \
			char inputText[256]; \
			char resultText[256]; \
			__m128i input = (__m128i)E1_Int(interesting_ints, i*4+k, numInterestingInts); \
			Ret_type result = func(input, Tint); \
			tostr(&input, inputText); \
			tostr(&result, resultText); \
			printf("%s(%s, %d) = %s\n", #func, inputText, Tint, resultText); \
		}
354 
// Prints func(input, interesting_ints[j], Tint) for each integer vector that
// E1_Int builds at start index i*4+k, combined with every scalar in
// interesting_ints.
#define Ret_M128i_int_Tint_body(Ret_type, func, Tint) \
	for(int i = 0; i < numInterestingInts / 4; ++i) \
		for(int j = 0; j < numInterestingInts; ++j) \
			for(int k = 0; k < 4; ++k) \
			{ \
				char inputText[256]; \
				char resultText[256]; \
				__m128i input = (__m128i)E1_Int(interesting_ints, i*4+k, numInterestingInts); \
				Ret_type result = func(input, interesting_ints[j], Tint); \
				tostr(&input, inputText); \
				tostr(&result, resultText); \
				printf("%s(%s, 0x%08X, %d) = %s\n", #func, inputText, interesting_ints[j], Tint, resultText); \
			}
366 
// For pairs of double vectors (lhs built with E1_Double, rhs with E2_Double)
// prints func(lhs, rhs, Tint) followed by func(rhs, lhs, Tint). Both printed
// lines list the operands in (lhs, rhs) order.
#define Ret_M128d_M128d_Tint_body(Ret_type, func, Tint) \
	for(int i = 0; i < numInterestingDoubles / 2; ++i) \
		for(int k = 0; k < 2; ++k) \
			for(int j = 0; j < numInterestingDoubles / 2; ++j) \
			{ \
				char lhsText[256]; \
				char rhsText[256]; \
				char resultText[256]; \
				__m128d lhs = E1_Double(interesting_doubles, i*2+k, numInterestingDoubles); \
				__m128d rhs = E2_Double(interesting_doubles, j*2, numInterestingDoubles); \
				/* func(lhs, rhs, Tint) */ \
				Ret_type result = func(lhs, rhs, Tint); \
				tostr(&lhs, lhsText); \
				tostr(&rhs, rhsText); \
				tostr(&result, resultText); \
				printf("%s(%s, %s, %d) = %s\n", #func, lhsText, rhsText, Tint, resultText); \
				/* func(rhs, lhs, Tint) */ \
				result = func(rhs, lhs, Tint); \
				tostr(&lhs, lhsText); \
				tostr(&rhs, rhsText); \
				tostr(&result, resultText); \
				printf("%s(%s, %s, %d) = %s\n", #func, lhsText, rhsText, Tint, resultText); \
			}
387 
// For pairs of integer vectors (lhs built with E1_Int, rhs with E2_Int) prints
// func(lhs, rhs, Tint) followed by func(rhs, lhs, Tint). Both printed lines
// list the operands in (lhs, rhs) order.
#define Ret_M128i_M128i_Tint_body(Ret_type, func, Tint) \
	for(int i = 0; i < numInterestingInts / 4; ++i) \
		for(int k = 0; k < 4; ++k) \
			for(int j = 0; j < numInterestingInts / 4; ++j) \
			{ \
				char lhsText[256]; \
				char rhsText[256]; \
				char resultText[256]; \
				__m128i lhs = (__m128i)E1_Int(interesting_ints, i*4+k, numInterestingInts); \
				__m128i rhs = (__m128i)E2_Int(interesting_ints, j*4, numInterestingInts); \
				/* func(lhs, rhs, Tint) */ \
				Ret_type result = func(lhs, rhs, Tint); \
				tostr(&lhs, lhsText); \
				tostr(&rhs, rhsText); \
				tostr(&result, resultText); \
				printf("%s(%s, %s, %d) = %s\n", #func, lhsText, rhsText, Tint, resultText); \
				/* func(rhs, lhs, Tint) */ \
				result = func(rhs, lhs, Tint); \
				tostr(&lhs, lhsText); \
				tostr(&rhs, rhsText); \
				tostr(&result, resultText); \
				printf("%s(%s, %s, %d) = %s\n", #func, lhsText, rhsText, Tint, resultText); \
			}
408 
// For pairs of float vectors (lhs built with E1, rhs with E2) prints
// func(lhs, rhs, Tint) followed by func(rhs, lhs, Tint). Both printed lines
// list the operands in (lhs, rhs) order.
#define Ret_M128_M128_Tint_body(Ret_type, func, Tint) \
	for(int i = 0; i < numInterestingFloats / 4; ++i) \
		for(int k = 0; k < 4; ++k) \
			for(int j = 0; j < numInterestingFloats / 4; ++j) \
			{ \
				char lhsText[256]; \
				char rhsText[256]; \
				char resultText[256]; \
				__m128 lhs = E1(interesting_floats, i*4+k, numInterestingFloats); \
				__m128 rhs = E2(interesting_floats, j*4, numInterestingFloats); \
				/* func(lhs, rhs, Tint) */ \
				Ret_type result = func(lhs, rhs, Tint); \
				tostr(&lhs, lhsText); \
				tostr(&rhs, rhsText); \
				tostr(&result, resultText); \
				printf("%s(%s, %s, %d) = %s\n", #func, lhsText, rhsText, Tint, resultText); \
				/* func(rhs, lhs, Tint) */ \
				result = func(rhs, lhs, Tint); \
				tostr(&lhs, lhsText); \
				tostr(&rhs, rhsText); \
				tostr(&result, resultText); \
				printf("%s(%s, %s, %d) = %s\n", #func, lhsText, rhsText, Tint, resultText); \
			}
429 
// Expands F(Ret_type, func, N) once for each of 23 selected literal values of
// N between 0 and 255, in ascending order.
#define const_int8_unroll(Ret_type, F, func) \
	F(Ret_type, func, 0);   F(Ret_type, func, 1);   F(Ret_type, func, 2);   F(Ret_type, func, 3); \
	F(Ret_type, func, 5);   F(Ret_type, func, 7);   F(Ret_type, func, 11);  F(Ret_type, func, 13); \
	F(Ret_type, func, 15);  F(Ret_type, func, 16);  F(Ret_type, func, 17);  F(Ret_type, func, 23); \
	F(Ret_type, func, 29);  F(Ret_type, func, 31);  F(Ret_type, func, 37);  F(Ret_type, func, 43); \
	F(Ret_type, func, 47);  F(Ret_type, func, 59);  F(Ret_type, func, 127); F(Ret_type, func, 128); \
	F(Ret_type, func, 191); F(Ret_type, func, 254); F(Ret_type, func, 255);
454 
// Expands F(Ret_type, func, N) once for every literal N from 0 through 31,
// in ascending order.
#define const_int5_full_unroll(Ret_type, F, func) \
	F(Ret_type, func, 0);  F(Ret_type, func, 1);  F(Ret_type, func, 2);  F(Ret_type, func, 3); \
	F(Ret_type, func, 4);  F(Ret_type, func, 5);  F(Ret_type, func, 6);  F(Ret_type, func, 7); \
	F(Ret_type, func, 8);  F(Ret_type, func, 9);  F(Ret_type, func, 10); F(Ret_type, func, 11); \
	F(Ret_type, func, 12); F(Ret_type, func, 13); F(Ret_type, func, 14); F(Ret_type, func, 15); \
	F(Ret_type, func, 16); F(Ret_type, func, 17); F(Ret_type, func, 18); F(Ret_type, func, 19); \
	F(Ret_type, func, 20); F(Ret_type, func, 21); F(Ret_type, func, 22); F(Ret_type, func, 23); \
	F(Ret_type, func, 24); F(Ret_type, func, 25); F(Ret_type, func, 26); F(Ret_type, func, 27); \
	F(Ret_type, func, 28); F(Ret_type, func, 29); F(Ret_type, func, 30); F(Ret_type, func, 31);
488 
// Test drivers that repeat a *_Tint_body macro for a list of literal integer
// arguments. Most use the const_int8_unroll value list.
#define Ret_M128_Tint(Ret_type, func)        const_int8_unroll(Ret_type, Ret_M128_Tint_body, func)
#define Ret_M128d_Tint(Ret_type, func)       const_int8_unroll(Ret_type, Ret_M128d_Tint_body, func)
#define Ret_M128i_Tint(Ret_type, func)       const_int8_unroll(Ret_type, Ret_M128i_Tint_body, func)
#define Ret_M128i_int_Tint(Ret_type, func)   const_int8_unroll(Ret_type, Ret_M128i_int_Tint_body, func)
#define Ret_M128i_M128i_Tint(Ret_type, func) const_int8_unroll(Ret_type, Ret_M128i_M128i_Tint_body, func)
#define Ret_M128d_M128d_Tint(Ret_type, func) const_int8_unroll(Ret_type, Ret_M128d_M128d_Tint_body, func)
#define Ret_M128_M128_Tint(Ret_type, func)   const_int8_unroll(Ret_type, Ret_M128_M128_Tint_body, func)

// The *_5bits drivers cover every value from 0 through 31 instead.
#define Ret_M128d_M128d_Tint_5bits(Ret_type, func) const_int5_full_unroll(Ret_type, Ret_M128d_M128d_Tint_body, func)
#define Ret_M128_M128_Tint_5bits(Ret_type, func)   const_int5_full_unroll(Ret_type, Ret_M128_M128_Tint_body, func)
498 
// For pairs of double vectors (lhs built with E1_Double, rhs with E2_Double)
// prints func(lhs, rhs) followed by func(rhs, lhs). Both printed lines list
// the operands in (lhs, rhs) order.
#define Ret_M128d_M128d(Ret_type, func) \
	for(int i = 0; i < numInterestingDoubles / 2; ++i) \
		for(int k = 0; k < 2; ++k) \
			for(int j = 0; j < numInterestingDoubles / 2; ++j) \
			{ \
				char lhsText[256]; \
				char rhsText[256]; \
				char resultText[256]; \
				__m128d lhs = E1_Double(interesting_doubles, i*2+k, numInterestingDoubles); \
				__m128d rhs = E2_Double(interesting_doubles, j*2, numInterestingDoubles); \
				/* func(lhs, rhs) */ \
				Ret_type result = func(lhs, rhs); \
				tostr(&lhs, lhsText); \
				tostr(&rhs, rhsText); \
				tostr(&result, resultText); \
				printf("%s(%s, %s) = %s\n", #func, lhsText, rhsText, resultText); \
				/* func(rhs, lhs) */ \
				result = func(rhs, lhs); \
				tostr(&lhs, lhsText); \
				tostr(&rhs, rhsText); \
				tostr(&result, resultText); \
				printf("%s(%s, %s) = %s\n", #func, lhsText, rhsText, resultText); \
			}
519 
// For triples of double vectors (first and third built with E1_Double, second
// with E2_Double) prints func for three rotations of the operands:
// (first, second, third), (second, third, first) and (third, first, second).
// Every printed line lists the operands in (first, second, third) order.
#define Ret_M128d_M128d_M128d(Ret_type, func) \
	for(int i = 0; i < numInterestingDoubles / 2; ++i) \
		for(int k = 0; k < 2; ++k) \
			for(int j = 0; j < numInterestingDoubles / 2; ++j) \
				for(int l = 0; l < numInterestingDoubles / 2; ++l) \
				{ \
					char firstText[256]; \
					char secondText[256]; \
					char thirdText[256]; \
					char resultText[256]; \
					__m128d first = E1_Double(interesting_doubles, i*2+k, numInterestingDoubles); \
					__m128d second = E2_Double(interesting_doubles, j*2, numInterestingDoubles); \
					__m128d third = E1_Double(interesting_doubles, l*2, numInterestingDoubles); \
					/* func(first, second, third) */ \
					Ret_type result = func(first, second, third); \
					tostr(&first, firstText); \
					tostr(&second, secondText); \
					tostr(&third, thirdText); \
					tostr(&result, resultText); \
					printf("%s(%s, %s, %s) = %s\n", #func, firstText, secondText, thirdText, resultText); \
					/* func(second, third, first) */ \
					result = func(second, third, first); \
					tostr(&first, firstText); \
					tostr(&second, secondText); \
					tostr(&third, thirdText); \
					tostr(&result, resultText); \
					printf("%s(%s, %s, %s) = %s\n", #func, firstText, secondText, thirdText, resultText); \
					/* func(third, first, second) */ \
					result = func(third, first, second); \
					tostr(&first, firstText); \
					tostr(&second, secondText); \
					tostr(&third, thirdText); \
					tostr(&result, resultText); \
					printf("%s(%s, %s, %s) = %s\n", #func, firstText, secondText, thirdText, resultText); \
				}
551 
// Prints func(lhs, rhs) where lhs is a double vector built with E1_Double and
// rhs is a float vector built with E2, both at a start index derived from i and k.
// NOTE(review): the loop index j is not used in the body, so each line is
// printed numInterestingDoubles/2 times - confirm whether rhs was meant to
// depend on j before changing it, since existing expected outputs may rely on this.
#define Ret_M128d_M128(Ret_type, func) \
	for(int i = 0; i < numInterestingDoubles / 2; ++i) \
		for(int k = 0; k < 2; ++k) \
			for(int j = 0; j < numInterestingDoubles / 2; ++j) \
			{ \
				char lhsText[256]; \
				char rhsText[256]; \
				char resultText[256]; \
				__m128d lhs = E1_Double(interesting_doubles, i*2+k, numInterestingDoubles); \
				__m128 rhs = E2(interesting_floats, i*4+k, numInterestingFloats); \
				/* func(lhs, rhs) */ \
				Ret_type result = func(lhs, rhs); \
				tostr(&lhs, lhsText); \
				tostr(&rhs, rhsText); \
				tostr(&result, resultText); \
				printf("%s(%s, %s) = %s\n", #func, lhsText, rhsText, resultText); \
			}
566 
// Prints func(input, scalar) for each double vector built with E1_Double at
// start index i*2+k, combined with every entry of interesting_ints taken as an int.
#define Ret_M128d_int(Ret_type, func) \
	for(int i = 0; i < numInterestingDoubles / 2; ++i) \
		for(int k = 0; k < 2; ++k) \
			for(int j = 0; j < numInterestingInts; ++j) \
			{ \
				char inputText[256]; \
				char scalarText[256]; \
				char resultText[256]; \
				__m128d input = E1_Double(interesting_doubles, i*2+k, numInterestingDoubles); \
				int scalar = interesting_ints[j]; \
				Ret_type result = func(input, scalar); \
				tostr(&input, inputText); \
				tostr(&scalar, scalarText); \
				tostr(&result, resultText); \
				printf("%s(%s, %s) = %s\n", #func, inputText, scalarText, resultText); \
			}
580 
// Prints func(input, scalar) for each double vector built with E1_Double at
// start index i*2+k, combined with every 64-bit value whose upper half is
// interesting_ints[j] and whose lower half is interesting_ints[l].
#define Ret_M128d_int64(Ret_type, func) \
	for(int i = 0; i < numInterestingDoubles / 2; ++i) \
		for(int k = 0; k < 2; ++k) \
			for(int j = 0; j < numInterestingInts; ++j) \
				for(int l = 0; l < numInterestingInts; ++l) \
				{ \
					char inputText[256]; \
					char scalarText[256]; \
					char resultText[256]; \
					__m128d input = E1_Double(interesting_doubles, i*2+k, numInterestingDoubles); \
					int64_t scalar = (int64_t)(((uint64_t)interesting_ints[j]) << 32 | (uint64_t)interesting_ints[l]); \
					Ret_type result = func(input, scalar); \
					tostr(&input, inputText); \
					tostr(&scalar, scalarText); \
					tostr(&result, resultText); \
					printf("%s(%s, %s) = %s\n", #func, inputText, scalarText, resultText); \
				}
595 
// Prints func(input) for each double vector that E1_Double builds from
// interesting_doubles at start index i*2+k.
#define Ret_M128d(Ret_type, func) \
	for(int i = 0; i < numInterestingDoubles / 2; ++i) \
		for(int k = 0; k < 2; ++k) \
		{ \
			char inputText[256]; \
			char resultText[256]; \
			__m128d input = E1_Double(interesting_doubles, i*2+k, numInterestingDoubles); \
			Ret_type result = func(input); \
			tostr(&input, inputText); \
			tostr(&result, resultText); \
			printf("%s(%s) = %s\n", #func, inputText, resultText); \
		}
606 
// Prints func(source) for pointers into interesting_doubles, advancing by
// 'inc' elements per step while at least numElemsAccessed elements remain.
// The input is printed as the numElemsAccessed doubles found at the pointer.
#define Ret_DoublePtr(Ret_type, func, numElemsAccessed, inc) \
	for(int i = 0; i+numElemsAccessed <= numInterestingDoubles; i += inc) \
	{ \
		char sourceText[256]; \
		char resultText[256]; \
		double *source = interesting_doubles + i; \
		Ret_type result = func(source); \
		tostr(source, numElemsAccessed, sourceText); \
		tostr(&result, resultText); \
		printf("%s(%s) = %s\n", #func, sourceText, resultText); \
	}
616 
// Prints func(source, vec) for pointers into interesting_doubles (stepping by
// 'inc') combined with integer vectors built with E2_Int. Only the pointed-to
// doubles and the result appear in the printed line; the vector operand is
// not printed.
#define Ret_DoublePtr_M128i(Ret_type, func, numElemsAccessed, inc) \
	for(int i = 0; i+numElemsAccessed <= numInterestingDoubles; i += inc) \
		for(int j = 0; j < numInterestingInts / 4; ++j) \
		{ \
			char sourceText[256]; \
			char resultText[256]; \
			double *source = interesting_doubles + i; \
			__m128i vec = (__m128i)E2_Int(interesting_ints, j*4, numInterestingInts); \
			Ret_type result = func(source, vec); \
			tostr(source, numElemsAccessed, sourceText); \
			tostr(&result, resultText); \
			printf("%s(%s) = %s\n", #func, sourceText, resultText); \
		}
628 
// Scratch buffer that getTempOutFloatStore() clears and hands out a pointer into.
float tempOutFloatStore[16];

// Zero-fills tempOutFloatStore and returns the first address at or after its
// start that is a multiple of alignmentBytes. The rounding is done with a bit
// mask, so alignmentBytes has to be a power of two.
float *getTempOutFloatStore(int alignmentBytes)
{
	memset(tempOutFloatStore, 0, sizeof(tempOutFloatStore));
	const uintptr_t start = (uintptr_t)tempOutFloatStore;
	const uintptr_t roundedUp = (start + alignmentBytes - 1) & ~(alignmentBytes-1);
	return (float*)roundedUp;
}
637 
getTempOutIntStore(int alignmentBytes)638 int *getTempOutIntStore(int alignmentBytes) { return (int*)getTempOutFloatStore(alignmentBytes); }
getTempOutDoubleStore(int alignmentBytes)639 double *getTempOutDoubleStore(int alignmentBytes) { return (double*)getTempOutFloatStore(alignmentBytes); }
640 
// Calls func(dest, input) to store a float vector into the freshly cleared
// temp store at byte offsets 0, alignmentBytes, ... below numBytesWritten, then
// prints the offset, the input vector and the numBytesWritten/sizeof(float)
// floats found at the destination.
#define void_OutFloatPtr_M128(func, Ptr_type, numBytesWritten, alignmentBytes) \
	for(int i = 0; i < numInterestingFloats / 4; ++i) \
		for(int offset = 0; offset < numBytesWritten; offset += alignmentBytes) \
			for(int k = 0; k < 4; ++k) \
			{ \
				char inputText[256]; \
				char storedText[256]; \
				uintptr_t storeBase = (uintptr_t)getTempOutFloatStore(16); \
				__m128 input = E1(interesting_floats, i*4+k, numInterestingFloats); \
				align1_float *dest = (align1_float*)(storeBase + offset); \
				func((Ptr_type)dest, input); \
				tostr(&input, inputText); \
				tostr(dest, numBytesWritten/sizeof(float), storedText); \
				printf("%s(p:align=%d, %s) = %s\n", #func, offset, inputText, storedText); \
			}
654 
// Calls func(dest, intVec, floatVec) against the freshly cleared temp store at
// byte offsets 0, alignmentBytes, ... below numBytesWritten, then prints the
// offset, both vector operands and the numBytesWritten/sizeof(float) floats
// found at the destination.
#define void_OutFloatPtr_M128i_M128(func, Ptr_type, numBytesWritten, alignmentBytes) \
	for(int i = 0; i < numInterestingFloats / 4; ++i) \
		for(int j = 0; j < numInterestingInts / 4; ++j) \
			for(int offset = 0; offset < numBytesWritten; offset += alignmentBytes) \
				for(int k = 0; k < 4; ++k) \
				{ \
					char intVecText[256]; \
					char floatVecText[256]; \
					char storedText[256]; \
					uintptr_t storeBase = (uintptr_t)getTempOutFloatStore(16); \
					__m128i intVec = (__m128i)E1_Int(interesting_ints, j*4, numInterestingInts); \
					__m128 floatVec = E1(interesting_floats, i*4+k, numInterestingFloats); \
					align1_float *dest = (align1_float*)(storeBase + offset); \
					func((Ptr_type)dest, intVec, floatVec); \
					tostr(&intVec, intVecText); \
					tostr(&floatVec, floatVecText); \
					tostr(dest, numBytesWritten/sizeof(float), storedText); \
					printf("%s(p:align=%d, %s, %s) = %s\n", #func, offset, intVecText, floatVecText, storedText); \
				}
671 
// Calls func(dest, input) to store a double vector into the freshly cleared
// temp store at byte offsets 0, alignmentBytes, ... below numBytesWritten, then
// prints the offset, the input vector and the numBytesWritten/sizeof(double)
// doubles found at the destination.
#define void_OutDoublePtr_M128d(func, Ptr_type, numBytesWritten, alignmentBytes) \
	for(int i = 0; i < numInterestingDoubles / 2; ++i) \
		for(int offset = 0; offset < numBytesWritten; offset += alignmentBytes) \
			for(int k = 0; k < 2; ++k) \
			{ \
				char inputText[256]; \
				char storedText[256]; \
				uintptr_t storeBase = (uintptr_t)getTempOutDoubleStore(16); \
				__m128d input = E1_Double(interesting_doubles, i*2+k, numInterestingDoubles); \
				align1_double *dest = (align1_double*)(storeBase + offset); \
				func((Ptr_type)dest, input); \
				tostr(&input, inputText); \
				tostr(dest, numBytesWritten/sizeof(double), storedText); \
				printf("%s(p:align=%d, %s) = %s\n", #func, offset, inputText, storedText); \
			}
685 
// Calls func(dest, intVec, doubleVec) against the freshly cleared temp store at
// byte offsets 0, alignmentBytes, ... below numBytesWritten, then prints the
// offset, both vector operands and the numBytesWritten/sizeof(double) doubles
// found at the destination.
#define void_OutDoublePtr_M128i_M128d(func, Ptr_type, numBytesWritten, alignmentBytes) \
	for(int i = 0; i < numInterestingDoubles / 2; ++i) \
		for(int j = 0; j < numInterestingInts / 4; ++j) \
			for(int offset = 0; offset < numBytesWritten; offset += alignmentBytes) \
				for(int k = 0; k < 2; ++k) \
				{ \
					char intVecText[256]; \
					char doubleVecText[256]; \
					char storedText[256]; \
					uintptr_t storeBase = (uintptr_t)getTempOutDoubleStore(16); \
					__m128i intVec = (__m128i)E1_Int(interesting_ints, j*4, numInterestingInts); \
					__m128d doubleVec = E1_Double(interesting_doubles, i*2+k, numInterestingDoubles); \
					align1_double *dest = (align1_double*)(storeBase + offset); \
					func((Ptr_type)dest, intVec, doubleVec); \
					tostr(&intVec, intVecText); \
					tostr(&doubleVec, doubleVecText); \
					tostr(dest, numBytesWritten/sizeof(double), storedText); \
					printf("%s(p:align=%d, %s, %s) = %s\n", #func, offset, intVecText, doubleVecText, storedText); \
				}
702 
// Calls func(dest, input) to store an integer vector into the freshly cleared
// temp store at byte offsets 0, alignmentBytes, ... below numBytesWritten, then
// prints the offset, the input vector and the ints found at the destination.
// The printed int count is numBytesWritten/sizeof(int), rounded up.
#define void_OutIntPtr_M128i(func, Ptr_type, numBytesWritten, alignmentBytes) \
	for(int i = 0; i < numInterestingInts / 4; ++i) \
		for(int offset = 0; offset < numBytesWritten; offset += alignmentBytes) \
			for(int k = 0; k < 4; ++k) \
			{ \
				char inputText[256]; \
				char storedText[256]; \
				uintptr_t storeBase = (uintptr_t)getTempOutIntStore(16); \
				__m128i input = (__m128i)E1_Int(interesting_ints, i*4+k, numInterestingInts); \
				align1_int *dest = (align1_int*)(storeBase + offset); \
				func((Ptr_type)dest, input); \
				tostr(&input, inputText); \
				tostr(dest, (numBytesWritten+sizeof(int)-1)/sizeof(int), storedText); \
				printf("%s(p:align=%d, %s) = %s\n", #func, offset, inputText, storedText); \
			}
716 
// Calls func(dest, scalar) to store each entry of interesting_ints (taken as
// an int) into the freshly cleared temp store at byte offsets 0,
// alignmentBytes, ... below numBytesWritten, then prints the offset, the scalar
// and the numBytesWritten/sizeof(int) ints found at the destination.
// NOTE(review): the loop index k is not used in the body, so each line is
// printed four times - confirm before changing, since existing expected
// outputs may rely on this.
#define void_OutIntPtr_int(func, Ptr_type, numBytesWritten, alignmentBytes) \
	for(int i = 0; i < numInterestingInts; ++i) \
		for(int offset = 0; offset < numBytesWritten; offset += alignmentBytes) \
			for(int k = 0; k < 4; ++k) \
			{ \
				char scalarText[256]; \
				char storedText[256]; \
				uintptr_t storeBase = (uintptr_t)getTempOutIntStore(16); \
				int scalar = interesting_ints[i]; \
				align1_int *dest = (align1_int*)(storeBase + offset); \
				func((Ptr_type)dest, scalar); \
				tostr(&scalar, scalarText); \
				tostr(dest, numBytesWritten/sizeof(int), storedText); \
				printf("%s(p:align=%d, %s) = %s\n", #func, offset, scalarText, storedText); \
			}
730 
// Calls func(dest, scalar) for every 64-bit value whose upper half is
// interesting_ints[i] and whose lower half is interesting_ints[j], storing
// into the freshly cleared temp store at byte offsets 0, alignmentBytes, ...
// below numBytesWritten. Prints the offset, the scalar and the
// numBytesWritten/sizeof(int64_t) values found at the destination.
#define void_OutIntPtr_int64(func, Ptr_type, numBytesWritten, alignmentBytes) \
	for(int i = 0; i < numInterestingInts; ++i) \
		for(int j = 0; j < numInterestingInts; ++j) \
			for(int offset = 0; offset < numBytesWritten; offset += alignmentBytes) \
			{ \
				char scalarText[256]; \
				char storedText[256]; \
				uintptr_t storeBase = (uintptr_t)getTempOutIntStore(16); \
				int64_t scalar = (int64_t)(((uint64_t)interesting_ints[i]) << 32 | (uint64_t)interesting_ints[j]); \
				align1_int64 *dest = (align1_int64*)(storeBase + offset); \
				func((Ptr_type)dest, scalar); \
				tostr(&scalar, scalarText); \
				tostr(dest, numBytesWritten/sizeof(int64_t), storedText); \
				printf("%s(p:align=%d, %s) = %s\n", #func, offset, scalarText, storedText); \
			}
744 
// Calls func(lhs, rhs, dest) with two integer vectors (lhs built with E1_Int,
// rhs with E2_Int) and a destination inside the freshly cleared temp store at
// byte offsets 0, alignmentBytes, ... below numBytesWritten. Prints both
// vectors, the offset and the numBytesWritten/sizeof(int) ints found at the
// destination.
#define void_M128i_M128i_OutIntPtr(func, Ptr_type, numBytesWritten, alignmentBytes) \
	for(int i = 0; i < numInterestingInts / 4; ++i) \
		for(int j = 0; j < numInterestingInts / 4; ++j) \
			for(int offset = 0; offset < numBytesWritten; offset += alignmentBytes) \
				for(int k = 0; k < 4; ++k) \
				{ \
					char lhsText[256]; \
					char rhsText[256]; \
					char storedText[256]; \
					uintptr_t storeBase = (uintptr_t)getTempOutIntStore(16); \
					__m128i lhs = (__m128i)E1_Int(interesting_ints, i*4+k, numInterestingInts); \
					__m128i rhs = (__m128i)E2_Int(interesting_ints, j*4, numInterestingInts); \
					align1_int *dest = (int*)(storeBase + offset); \
					func(lhs, rhs, (Ptr_type)dest); \
					tostr(&lhs, lhsText); \
					tostr(&rhs, rhsText); \
					tostr(dest, numBytesWritten/sizeof(int), storedText); \
					printf("%s(%s, %s, p:align=%d) = %s\n", #func, lhsText, rhsText, offset, storedText); \
				}
761 
// Runs the unary function `func` (one __m128 argument, result of type Ret_type) on
// operands built by E1 from interesting_floats, one per start index in
// [0, 4 * (numInterestingFloats / 4)), and prints a "func(input) = result" line for
// each call.
#define Ret_M128(Ret_type, func) \
	for (int first = 0; first < (numInterestingFloats / 4) * 4; ++first) { \
		char inText[256], outText[256]; \
		__m128 operand = E1(interesting_floats, first, numInterestingFloats); \
		Ret_type result = func(operand); \
		tostr(&operand, inText); \
		tostr(&result, outText); \
		printf("%s(%s) = %s\n", #func, inText, outText); \
	}
772 
// Runs the unary function `func` (one __m128 argument, result of type Ret_type) on
// operands built by E1 from interesting_floats, one per start index in
// [0, 4 * (numInterestingFloats / 4)). The input is formatted with tostr, while the
// result is formatted with tostr_approx with its last argument set to true.
#define Ret_M128approx(Ret_type, func) \
	for (int first = 0; first < (numInterestingFloats / 4) * 4; ++first) { \
		char inText[256], outText[256]; \
		__m128 operand = E1(interesting_floats, first, numInterestingFloats); \
		Ret_type result = func(operand); \
		tostr(&operand, inText); \
		tostr_approx(&result, outText, true/*approximate*/); \
		printf("%s(%s) = %s\n", #func, inText, outText); \
	}
783 
// Test driver for load-style functions called as `func(float *src)` and returning
// Ret_type. Slides a window of numElemsAccessed floats over interesting_floats,
// advancing by inc elements per step and stopping before the window would run past
// numInterestingFloats, and prints the window contents and the result of each call.
// numElemsAccessed and inc are parenthesized inside expressions so that arguments
// such as `1 << 2` expand with the intended precedence.
#define Ret_FloatPtr(Ret_type, func, numElemsAccessed, inc) \
	for(int i = 0; i+(numElemsAccessed) <= numInterestingFloats; i += (inc)) \
	{ \
		float *ptr = interesting_floats + i; \
		Ret_type ret = func(ptr); \
		char str[256]; tostr(ptr, (numElemsAccessed), str); \
		char str2[256]; tostr(&ret, str2); \
		printf("%s(%s) = %s\n", #func, str, str2); \
	}
793 
// Test driver for functions called as `func(float *src, __m128i arg)` and returning
// Ret_type. Slides a window of numElemsAccessed floats over interesting_floats
// (advancing by inc elements, never running past numInterestingFloats) and pairs
// each window with every __m128i built by E1_Int from interesting_ints at start
// index j*4.
// The printed line now includes the __m128i argument as well; previously only the
// float window was shown, so calls that differed only in that argument produced
// lines with identical inputs. numElemsAccessed and inc are parenthesized inside
// expressions so that arguments such as `1 << 2` expand with the intended precedence.
#define Ret_FloatPtr_M128i(Ret_type, func, numElemsAccessed, inc) \
	for(int i = 0; i+(numElemsAccessed) <= numInterestingFloats; i += (inc)) \
		for(int j = 0; j < numInterestingInts / 4; ++j) \
		{ \
			float *ptr = interesting_floats + i; \
			__m128i m1 = (__m128i)E1_Int(interesting_ints, j*4, numInterestingInts); \
			Ret_type ret = func(ptr, m1); \
			char str[256]; tostr(ptr, (numElemsAccessed), str); \
			char str2[256]; tostr(&m1, str2); \
			char str3[256]; tostr(&ret, str3); \
			printf("%s(%s, %s) = %s\n", #func, str, str2, str3); \
		}
805 
// Test driver for functions taking four scalar floats and returning Ret_type.
// Walks interesting_floats in steps of inc, passing four consecutive elements as
// separate arguments for as long as a full group of four fits, and prints the four
// inputs and the result of each call.
#define Ret_Float4(Ret_type, func, inc) \
	for (int start = 0; start + 4 <= numInterestingFloats; start += inc) { \
		char inText[256], outText[256]; \
		float *quad = &interesting_floats[start]; \
		Ret_type result = func(quad[0], quad[1], quad[2], quad[3]); \
		tostr(quad, 4, inText); \
		tostr(&result, outText); \
		printf("%s(%s) = %s\n", #func, inText, outText); \
	}
815 
// Test driver for functions taking one scalar float and returning Ret_type.
// Visits interesting_floats in steps of inc, calls func on each visited element and
// prints the input and the result.
#define Ret_Float(Ret_type, func, inc) \
	for (int idx = 0; idx < numInterestingFloats; idx += inc) { \
		char inText[256], outText[256]; \
		float *value = &interesting_floats[idx]; \
		Ret_type result = func(*value); \
		tostr(value, 1, inText); \
		tostr(&result, outText); \
		printf("%s(%s) = %s\n", #func, inText, outText); \
	}
825 
// Test driver for load-style functions called as `func((Ptr_type)src)` and returning
// Ret_type. Slides a window of numElemsAccessed 32-bit entries over interesting_ints,
// advancing by inc elements per step and stopping before the window would run past
// numInterestingInts, and prints the window contents (viewed as int) and the result.
// numElemsAccessed and inc are parenthesized inside expressions so that arguments
// such as `1 << 2` expand with the intended precedence.
#define Ret_IntPtr(Ret_type, func, Ptr_type, numElemsAccessed, inc) \
	for(int i = 0; i+(numElemsAccessed) <= numInterestingInts; i += (inc)) \
	{ \
		uint32_t *ptr = interesting_ints + i; \
		Ret_type ret = func((Ptr_type)ptr); \
		char str[256]; tostr((int*)ptr, (numElemsAccessed), str); \
		char str2[256]; tostr(&ret, str2); \
		printf("%s(%s) = %s\n", #func, str, str2); \
	}
835 
// Test driver for functions called as `func(__m128 a, (Ptr_type)src)` and returning
// Ret_type. Operand a is built by E1 from interesting_floats at start index i*4+k;
// src slides over interesting_floats as a window of numElemsAccessed floats,
// advancing by inc elements and never running past numInterestingFloats. Prints the
// vector operand, the window contents and the result of each call.
// numElemsAccessed and inc are parenthesized inside expressions so that arguments
// such as `1 << 2` expand with the intended precedence.
#define Ret_M128_FloatPtr(Ret_type, func, Ptr_type, numElemsAccessed, inc) \
	for(int i = 0; i < numInterestingFloats / 4; ++i) \
		for(int k = 0; k < 4; ++k) \
			for(int j = 0; j+(numElemsAccessed) <= numInterestingFloats; j += (inc)) \
			{ \
				__m128 m1 = E1(interesting_floats, i*4+k, numInterestingFloats); \
				float *ptr = interesting_floats + j; \
				Ret_type ret = func(m1, (Ptr_type)ptr); \
				char str[256]; tostr(&m1, str); \
				char str2[256]; tostr(ptr, (numElemsAccessed), str2); \
				char str3[256]; tostr(&ret, str3); \
				printf("%s(%s, %s) = %s\n", #func, str, str2, str3); \
			}
849 
// Test driver for functions called as `func(__m128d a, (Ptr_type)src)` and returning
// Ret_type. Operand a is built by E1_Double from interesting_doubles at start index
// i*2+k; src slides over interesting_doubles as a window of numElemsAccessed doubles,
// advancing by inc elements and never running past numInterestingDoubles. Prints the
// vector operand, the window contents and the result of each call.
// numElemsAccessed and inc are parenthesized inside expressions so that arguments
// such as `1 << 1` expand with the intended precedence.
#define Ret_M128d_DoublePtr(Ret_type, func, Ptr_type, numElemsAccessed, inc) \
	for(int i = 0; i < numInterestingDoubles / 2; ++i) \
		for(int k = 0; k < 2; ++k) \
			for(int j = 0; j+(numElemsAccessed) <= numInterestingDoubles; j += (inc)) \
			{ \
				__m128d m1 = E1_Double(interesting_doubles, i*2+k, numInterestingDoubles); \
				double *ptr = interesting_doubles + j; \
				Ret_type ret = func(m1, (Ptr_type)ptr); \
				char str[256]; tostr(&m1, str); \
				char str2[256]; tostr(ptr, (numElemsAccessed), str2); \
				char str3[256]; tostr(&ret, str3); \
				printf("%s(%s, %s) = %s\n", #func, str, str2, str3); \
			}
863 
// Runs the unary function `func` (one __m128i argument, result of type Ret_type) on
// operands built by E1_Int from interesting_ints, one per start index in
// [0, 4 * (numInterestingInts / 4)), and prints a "func(input) = result" line for
// each call.
#define Ret_M128i(Ret_type, func) \
	for (int first = 0; first < (numInterestingInts / 4) * 4; ++first) { \
		char inText[256], outText[256]; \
		__m128i operand = (__m128i)E1_Int(interesting_ints, first, numInterestingInts); \
		Ret_type result = func(operand); \
		tostr(&operand, inText); \
		tostr(&result, outText); \
		printf("%s(%s) = %s\n", #func, inText, outText); \
	}
874 
// Runs the binary function `func` (two __m128i arguments, result of type Ret_type).
// The left operand is built by E1_Int at every start index in
// [0, 4 * (numInterestingInts / 4)); the right operand is built by E2_Int at start
// indices that are multiples of four. Every left/right combination is evaluated and
// printed as "func(lhs, rhs) = result".
#define Ret_M128i_M128i(Ret_type, func) \
	for (int first = 0; first < (numInterestingInts / 4) * 4; ++first) \
		for (int group = 0; group < numInterestingInts / 4; ++group) { \
			char lhsText[256], rhsText[256], outText[256]; \
			__m128i lhs = (__m128i)E1_Int(interesting_ints, first, numInterestingInts); \
			__m128i rhs = (__m128i)E2_Int(interesting_ints, group*4, numInterestingInts); \
			Ret_type result = func(lhs, rhs); \
			tostr(&lhs, lhsText); \
			tostr(&rhs, rhsText); \
			tostr(&result, outText); \
			printf("%s(%s, %s) = %s\n", #func, lhsText, rhsText, outText); \
		}
888 
// Runs the ternary function `func` (three __m128i arguments, result of type
// Ret_type). The first operand is built by E1_Int at every start index in
// [0, 4 * (numInterestingInts / 4)); the second by E2_Int and the third by E1_Int,
// both at start indices that are multiples of four. Every combination is evaluated
// and printed as "func(a, b, c) = result".
#define Ret_M128i_M128i_M128i(Ret_type, func) \
	for (int first = 0; first < (numInterestingInts / 4) * 4; ++first) \
		for (int groupB = 0; groupB < numInterestingInts / 4; ++groupB) \
			for (int groupC = 0; groupC < numInterestingInts / 4; ++groupC) { \
				char aText[256], bText[256], cText[256], outText[256]; \
				__m128i a = (__m128i)E1_Int(interesting_ints, first, numInterestingInts); \
				__m128i b = (__m128i)E2_Int(interesting_ints, groupB*4, numInterestingInts); \
				__m128i c = (__m128i)E1_Int(interesting_ints, groupC*4, numInterestingInts); \
				Ret_type result = func(a, b, c); \
				tostr(&a, aText); \
				tostr(&b, bText); \
				tostr(&c, cText); \
				tostr(&result, outText); \
				printf("%s(%s, %s, %s) = %s\n", #func, aText, bText, cText, outText); \
			}
905 
// Test driver for functions taking one integer and returning Ret_type. Calls func
// on every entry of interesting_ints and prints the input (formatted through an
// int* view of the entry) together with the result.
#define Ret_int(Ret_type, func) \
	for (int idx = 0; idx < numInterestingInts; ++idx) { \
		char inText[256], outText[256]; \
		Ret_type result = func(interesting_ints[idx]); \
		tostr((int*)(interesting_ints + idx), inText); \
		tostr(&result, outText); \
		printf("%s(%s) = %s\n", #func, inText, outText); \
	}
914 
// Test driver for functions taking one int64_t and returning Ret_type. Each input is
// assembled from two interesting_ints entries: the outer-loop entry shifted left by
// 32 bits, OR-ed with the inner-loop entry. Every pair is evaluated and printed as
// "func(input) = result".
#define Ret_int64(Ret_type, func) \
	for (int hiIdx = 0; hiIdx < numInterestingInts; ++hiIdx) \
		for (int loIdx = 0; loIdx < numInterestingInts; ++loIdx) { \
			char inText[256], outText[256]; \
			const uint64_t upper = ((uint64_t)interesting_ints[hiIdx]) << 32; \
			const uint64_t lower = (uint64_t)interesting_ints[loIdx]; \
			int64_t packed = (int64_t)(upper | lower); \
			Ret_type result = func(packed); \
			tostr(&packed, inText); \
			tostr(&result, outText); \
			printf("%s(%s) = %s\n", #func, inText, outText); \
		}
925 
// Runs the binary function `func` (two __m128 arguments, result of type Ret_type).
// The left operand is built by E1 at every start index in
// [0, 4 * (numInterestingFloats / 4)); the right operand is built by E2 at start
// indices that are multiples of four. Every left/right combination is evaluated and
// printed as "func(lhs, rhs) = result".
#define Ret_M128_M128(Ret_type, func) \
	for (int first = 0; first < (numInterestingFloats / 4) * 4; ++first) \
		for (int group = 0; group < numInterestingFloats / 4; ++group) { \
			char lhsText[256], rhsText[256], outText[256]; \
			__m128 lhs = E1(interesting_floats, first, numInterestingFloats); \
			__m128 rhs = E2(interesting_floats, group*4, numInterestingFloats); \
			Ret_type result = func(lhs, rhs); \
			tostr(&lhs, lhsText); \
			tostr(&rhs, rhsText); \
			tostr(&result, outText); \
			printf("%s(%s, %s) = %s\n", #func, lhsText, rhsText, outText); \
		}
939 
// Runs the ternary function `func` (three __m128 arguments, result of type
// Ret_type). The first operand is built by E1 at every start index in
// [0, 4 * (numInterestingFloats / 4)); the second by E2 and the third by E1, both at
// start indices that are multiples of four. Every combination is evaluated and
// printed as "func(a, b, c) = result".
#define Ret_M128_M128_M128(Ret_type, func) \
	for (int first = 0; first < (numInterestingFloats / 4) * 4; ++first) \
		for (int groupB = 0; groupB < numInterestingFloats / 4; ++groupB) \
			for (int groupC = 0; groupC < numInterestingFloats / 4; ++groupC) { \
				char aText[256], bText[256], cText[256], outText[256]; \
				__m128 a = E1(interesting_floats, first, numInterestingFloats); \
				__m128 b = E2(interesting_floats, groupB*4, numInterestingFloats); \
				__m128 c = E1(interesting_floats, groupC*4, numInterestingFloats); \
				Ret_type result = func(a, b, c); \
				tostr(&a, aText); \
				tostr(&b, bText); \
				tostr(&c, cText); \
				tostr(&result, outText); \
				printf("%s(%s, %s, %s) = %s\n", #func, aText, bText, cText, outText); \
			}
956 
// Runs the function `func` taking an __m128 and an int and returning Ret_type. The
// vector operand is built by E1 at every start index in
// [0, 4 * (numInterestingFloats / 4)); the scalar operand is each entry of
// interesting_ints converted to int. Every combination is evaluated and printed as
// "func(vector, scalar) = result".
#define Ret_M128_int(Ret_type, func) \
	for (int first = 0; first < (numInterestingFloats / 4) * 4; ++first) \
		for (int intIdx = 0; intIdx < numInterestingInts; ++intIdx) { \
			char vecText[256], intText[256], outText[256]; \
			__m128 vec = E1(interesting_floats, first, numInterestingFloats); \
			int scalar = interesting_ints[intIdx]; \
			Ret_type result = func(vec, scalar); \
			tostr(&vec, vecText); \
			tostr(&scalar, intText); \
			tostr(&result, outText); \
			printf("%s(%s, %s) = %s\n", #func, vecText, intText, outText); \
		}
970