/* { dg-do compile } */
/* { dg-options "-O2 -march=skylake-avx512 -DDTYPE32" } */

/* Short aliases for the unsigned integer types used by this test.  */
typedef unsigned long long u64;
typedef unsigned int u32;
typedef unsigned short u16;
typedef unsigned char u8;

/* One DTYPE<N> macro is expected to be defined on the command line (the
   dg-options of this file pass -DDTYPE32).  Each branch picks the element
   type DTYPE and binds `byteswap' to the matching byteswapu<N> helper.  */
#ifdef DTYPE32
typedef u32 DTYPE;
#define byteswap byteswapu32
#endif

#ifdef DTYPE16
typedef u16 DTYPE;
#define byteswap byteswapu16
#endif

/* NOTE(review): DTYPE is u16 here although the helper is byteswapu8; this
   looks like a copy of the DTYPE16 branch.  Left unchanged because tests
   built with -DDTYPE8 may depend on the current code shape -- confirm.  */
#ifdef DTYPE8
typedef u16 DTYPE;
#define byteswap byteswapu8
#endif

/* NOTE(review): DTYPE is u16 here as well, so a byteswapu64 result would be
   truncated when stored in a DTYPE variable.  Possibly intended to be u64;
   left unchanged for the same reason as above -- confirm.  */
#ifdef DTYPE64
typedef u16 DTYPE;
#define byteswap byteswapu64
#endif

/* Rotate X right by N bits, written for a 32-bit operand.  N must not be 0
   (that would shift left by 32).  The arguments are not parenthesized, so
   only simple expressions should be passed.  */
#define R(x,n) ( (x >> n) | (x << (32 - n)))

/* XOR of three right-rotations of X: by 2/13/22 (S0) and by 6/11/25 (S1).
   These are the rotation amounts of the SHA-256 big-sigma functions.  */
#define S0(x) (R(x, 2) ^ R(x,13) ^ R(x,22))
#define S1(x) (R(x, 6) ^ R(x,11) ^ R(x,25))

/* One round step.  Needs `tmp1' and `tmp2' declared in the enclosing scope
   and modifies only D and H:
     tmp1 = H + S1(E) + (bitwise: F where E is set, otherwise G) + K + X
     tmp2 = S0(A) + (bitwise majority of A, B, C)
     H = tmp1 + tmp2;  D += tmp1
   The expansion is a bare brace block, so `TT (...);' leaves an extra
   empty statement behind; do not use it as an unbraced if/else body.  */
#define TT(a,b,c,d,e,f,g,h,x,K)                 \
{                                                        \
    tmp1 = h + S1(e) + (g ^ (e & (f ^ g))) + K + x;                \
    tmp2 = S0(a) + ((a & b) | (c & (a | b)));                           \
    h  = tmp1 + tmp2;                                    \
    d += tmp1;                                           \
}

/* Return X with its four bytes in reverse order.
   Step 1 exchanges the two 16-bit halves; step 2 exchanges the two bytes
   inside each half.  */
static inline u32 byteswapu32(u32 x)
{
  x = (x & 0x0000FFFF) << 16 | (x & 0xFFFF0000) >> 16;
  x = (x & 0x00FF00FF) << 8 | (x & 0xFF00FF00) >> 8;
  return x;
}

/* Return X with its two bytes exchanged.
   The mask/shift arithmetic is carried out after integer promotion; the
   assignment back to X keeps only the low 16 bits of the result.  */
static inline u16 byteswapu16(u16 x)
{
  x = (x & 0x00FF) << 8 | (x & 0xFF00) >> 8;
  return x;
}

/* Byte swap of a single byte: returns X unchanged.  Exists so that
   `byteswap' can be bound to a helper for every element width.  */
static inline u8 byteswapu8(u8 x)
{
  return x;
}

/* Return X with its eight bytes in reverse order.
   The low 32 bits are byte-swapped with byteswapu32 and moved to the upper
   half; the high 32 bits are byte-swapped and placed in the lower half.
   The u64 arguments are converted implicitly to byteswapu32's u32
   parameter; the masks guarantee the values already fit.  */
static inline u64 byteswapu64(u64 x)
{
  x = ((u64)(byteswapu32 (x & 0x00000000FFFFFFFF))) << 32 | byteswapu32((x & 0xFFFFFFFF00000000) >> 32);
  return x;
}

/* Apply sixteen TT rounds to an eight-word state.

   IN   sixteen input words; each one is passed through `byteswap' before
        it is used.
   OUT  eight-word state: read as the starting values of a..h, then each
        element is incremented by the corresponding final value.
   C    sixteen per-round constants, C[i] being used in round i.

   NOTE(review): this file is a compile-only test whose dg-final directives
   scan the generated assembly, so the fully unrolled, straight-line shape
   is presumably deliberate -- avoid turning it into loops or arrays.  */
void foo (DTYPE in[16], DTYPE out[8], const DTYPE C[16])
{
    /* tmp1 and tmp2 are the scratch variables the TT macro writes to.  */
    DTYPE tmp1 = 0, tmp2 = 0, a, b, c, d, e, f, g, h;
    DTYPE w0, w1, w2, w3, w4, w5, w6, w7,
	w8, w9, w10, w11, w12, w13, w14, w15;

    /* Load and byte-swap all sixteen input words up front.  */
    w0  = byteswap(in[0]);
    w1  = byteswap(in[1]);
    w2  = byteswap(in[2]);
    w3  = byteswap(in[3]);
    w4  = byteswap(in[4]);
    w5  = byteswap(in[5]);
    w6  = byteswap(in[6]);
    w7  = byteswap(in[7]);
    w8  = byteswap(in[8]);
    w9  = byteswap(in[9]);
    w10 = byteswap(in[10]);
    w11 = byteswap(in[11]);
    w12 = byteswap(in[12]);
    w13 = byteswap(in[13]);
    w14 = byteswap(in[14]);
    w15 = byteswap(in[15]);

    /* Working copies of the state.  */
    a = out[0];
    b = out[1];
    c = out[2];
    d = out[3];
    e = out[4];
    f = out[5];
    g = out[6];
    h = out[7];

    /* Each round shifts the roles of the eight variables by one position
       instead of moving values around.  After eight rounds the argument
       order is back to (a, b, ..., h), so the sixteen rounds below are
       two full cycles.  */
    TT(a, b, c, d, e, f, g, h,  w0, C[0]);
    TT(h, a, b, c, d, e, f, g,  w1, C[1]);
    TT(g, h, a, b, c, d, e, f,  w2, C[2]);
    TT(f, g, h, a, b, c, d, e,  w3, C[3]);
    TT(e, f, g, h, a, b, c, d,  w4, C[4]);
    TT(d, e, f, g, h, a, b, c,  w5, C[5]);
    TT(c, d, e, f, g, h, a, b,  w6, C[6]);
    TT(b, c, d, e, f, g, h, a,  w7, C[7]);
    TT(a, b, c, d, e, f, g, h,  w8, C[8]);
    TT(h, a, b, c, d, e, f, g,  w9, C[9]);
    TT(g, h, a, b, c, d, e, f, w10, C[10]);
    TT(f, g, h, a, b, c, d, e, w11, C[11]);
    TT(e, f, g, h, a, b, c, d, w12, C[12]);
    TT(d, e, f, g, h, a, b, c, w13, C[13]);
    TT(c, d, e, f, g, h, a, b, w14, C[14]);
    TT(b, c, d, e, f, g, h, a, w15, C[15]);

    /* Fold the result back into the caller's state.  */
    out[0] += a;
    out[1] += b;
    out[2] += c;
    out[3] += d;
    out[4] += e;
    out[5] += f;
    out[6] += g;
    out[7] += h;
}

/* { dg-final { scan-assembler "kmovd" } } */
/* { dg-final { scan-assembler-not "knot" } } */
/* { dg-final { scan-assembler-not "kxor" } } */
/* { dg-final { scan-assembler-not "kor" } } */
/* { dg-final { scan-assembler-not "kandn" } } */