1 /* { dg-do run } */
2 /* { dg-require-effective-target sse4 } */
3 /* { dg-options "-O2 -msse4.1" } */
4 
5 #ifndef CHECK_H
6 #define CHECK_H "sse4_1-check.h"
7 #endif
8 
9 #ifndef TEST
10 #define TEST sse4_1_test
11 #endif
12 
13 #include CHECK_H
14 
15 #include <smmintrin.h>
16 
/* Low-nibble masks: bit j (1 << j) stands for element j.  The digits in a
   name list the bits that are set; N sets none of them and A sets all four.
   The test ORs one of these into the immediate passed to _mm_dp_ps and uses
   the same bits to decide which result elements to compare.  */
#define lmskN  (0)
#define lmsk0  (1 << 0)
#define lmsk1  (1 << 1)
#define lmsk2  (1 << 2)
#define lmsk3  (1 << 3)
#define lmsk01 (lmsk0 | lmsk1)
#define lmsk02 (lmsk0 | lmsk2)
#define lmsk03 (lmsk0 | lmsk3)
#define lmsk12 (lmsk1 | lmsk2)
#define lmsk13 (lmsk1 | lmsk3)
#define lmsk23 (lmsk2 | lmsk3)
#define lmskA  (lmsk01 | lmsk23)

/* High-nibble masks: same naming scheme, with every bit moved up by four.
   The test uses these bits to decide which element products go into the
   reference sum.  */
#define hmskN  (lmskN << 4)
#define hmsk0  (lmsk0 << 4)
#define hmsk1  (lmsk1 << 4)
#define hmsk2  (lmsk2 << 4)
#define hmsk3  (lmsk3 << 4)
#define hmsk01 (lmsk01 << 4)
#define hmsk02 (lmsk02 << 4)
#define hmsk03 (lmsk03 << 4)
#define hmsk12 (lmsk12 << 4)
#define hmsk13 (lmsk13 << 4)
#define hmsk23 (lmsk23 << 4)
#define hmskA  (lmskA << 4)

/* High nibble used for every immediate in the test.  It defaults to all
   four bits unless HIMASK was already defined before this point.  */
#if !defined (HIMASK)
#define HIMASK hmskA
#endif
46 
47 static void
TEST(void)48 TEST (void)
49 {
50   union
51     {
52       __m128 x;
53       float f[4];
54     } val1, val2, res[16];
55   int masks[16];
56   int i, j;
57 
58   val1.f[0] = 2.;
59   val1.f[1] = 3.;
60   val1.f[2] = 4.;
61   val1.f[3] = 5.;
62 
63   val2.f[0] = 10.;
64   val2.f[1] = 100.;
65   val2.f[2] = 1000.;
66   val2.f[3] = 10000.;
67 
68   res[0].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk0);
69   res[1].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk1);
70   res[2].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk2);
71   res[3].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk3);
72   res[4].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk01);
73   res[5].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk02);
74   res[6].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk03);
75   res[7].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk12);
76   res[8].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk13);
77   res[9].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmsk23);
78   res[10].x = _mm_dp_ps (val1.x, val2.x, HIMASK | (0x0F & ~lmsk0));
79   res[11].x = _mm_dp_ps (val1.x, val2.x, HIMASK | (0x0F & ~lmsk1));
80   res[12].x = _mm_dp_ps (val1.x, val2.x, HIMASK | (0x0F & ~lmsk2));
81   res[13].x = _mm_dp_ps (val1.x, val2.x, HIMASK | (0x0F & ~lmsk3));
82   res[14].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmskN);
83   res[15].x = _mm_dp_ps (val1.x, val2.x, HIMASK | lmskA);
84 
85   masks[0] = HIMASK | lmsk0;
86   masks[1] = HIMASK | lmsk1;
87   masks[2] = HIMASK | lmsk2;
88   masks[3] = HIMASK | lmsk3;
89   masks[4] = HIMASK | lmsk01;
90   masks[5] = HIMASK | lmsk02;
91   masks[6] = HIMASK | lmsk03;
92   masks[7] = HIMASK | lmsk12;
93   masks[8] = HIMASK | lmsk13;
94   masks[9] = HIMASK | lmsk23;
95   masks[10] = HIMASK | (0x0F & ~lmsk0);
96   masks[11] = HIMASK | (0x0F & ~lmsk1);
97   masks[12] = HIMASK | (0x0F & ~lmsk2);
98   masks[13] = HIMASK | (0x0F & ~lmsk3);
99   masks[14] = HIMASK | lmskN;
100   masks[15] = HIMASK | lmskA;
101 
102   for (i = 0; i <= 15; i++)
103     {
104       float tmp = 0.;
105 
106       for (j = 0; j < 4; j++)
107 	if ((HIMASK & (0x10 << j)))
108 	  tmp += val1.f[j] * val2.f[j];
109 
110       for (j = 0; j < 4; j++)
111 	if ((masks[i] & (1 << j)) && res[i].f[j] != tmp)
112 	  abort ();
113    }
114 }
115