1 /* { dg-do compile } */
2 /* { dg-options "-O2 -march=skylake-avx512 -DDTYPE32" } */
3
/* Unsigned integer aliases used throughout this test, narrowest first.
   Spelled with basic types so that the test needs no headers.  */
typedef unsigned char      u8;
typedef unsigned short     u16;
typedef unsigned int       u32;
typedef unsigned long long u64;
8
/* Select the element type DTYPE and the matching byte-swap helper from
   whichever -DDTYPE<width> macro is given on the command line.  Exactly
   one of them is expected to be defined; with none, DTYPE and byteswap
   stay undefined and foo does not compile.  */

#ifdef DTYPE32
typedef u32 DTYPE;
#define byteswap byteswapu32
#endif

#ifdef DTYPE16
typedef u16 DTYPE;
#define byteswap byteswapu16
#endif

/* The 8- and 64-bit variants previously declared DTYPE as u16, which did
   not match the helper they select; each now uses the width it names.  */
#ifdef DTYPE8
typedef u8 DTYPE;
#define byteswap byteswapu8
#endif

#ifdef DTYPE64
typedef u64 DTYPE;
#define byteswap byteswapu64
#endif
28
/* Shift X right by N and OR in X shifted left by 32 - N.  This is a true
   rotate-right only when X is a 32-bit unsigned value.  N must be in
   1..31: a shift count of 32 would be undefined for a 32-bit X.
   Arguments are parenthesized so compound expressions expand correctly;
   X and N are each evaluated twice.  */
#define R(x,n) (((x) >> (n)) | ((x) << (32 - (n))))

/* XOR of three R() results, with counts 2/13/22 and 6/11/25.  */
#define S0(x) (R(x, 2) ^ R(x,13) ^ R(x,22))
#define S1(x) (R(x, 6) ^ R(x,11) ^ R(x,25))

/* One mixing round over the eight working variables A..H with message
   word X and constant K (the shape resembles a SHA-256 round).  Requires
   lvalues named tmp1 and tmp2 in the caller's scope, and writes to the
   D and H arguments.  Wrapped in do { } while (0) so that "TT (...);"
   behaves as a single statement, e.g. in an unbraced if/else.  */
#define TT(a,b,c,d,e,f,g,h,x,K) \
  do \
    { \
      tmp1 = (h) + S1(e) + ((g) ^ ((e) & ((f) ^ (g)))) + (K) + (x); \
      tmp2 = S0(a) + (((a) & (b)) | ((c) & ((a) | (b)))); \
      (h) = tmp1 + tmp2; \
      (d) += tmp1; \
    } \
  while (0)
41
/* Return X with its four bytes in reverse order: first exchange the two
   16-bit halves, then exchange the two bytes inside each half.  */
static inline u32
byteswapu32 (u32 x)
{
  const u32 low_half = x & 0x0000FFFF;
  const u32 high_half = x & 0xFFFF0000;
  const u32 halves_swapped = (low_half << 16) | (high_half >> 16);

  const u32 even_bytes = halves_swapped & 0x00FF00FF;
  const u32 odd_bytes = halves_swapped & 0xFF00FF00;
  return (even_bytes << 8) | (odd_bytes >> 8);
}
48
/* Return X with its two bytes exchanged.  The arithmetic happens after
   integer promotion; the cast narrows the result back to u16.  */
static inline u16
byteswapu16 (u16 x)
{
  return (u16) (((x & 0x00FF) << 8) | ((x & 0xFF00) >> 8));
}
54
/* A single byte has no byte order to reverse, so X is returned unchanged.
   Present so that every DTYPE width has a byteswap helper to select.  */
static inline u8
byteswapu8 (u8 x)
{
  return x;
}
59
/* Return X with its eight bytes in reverse order: byte-swap each 32-bit
   half with byteswapu32, then place the swapped low half in the upper
   32 bits and the swapped high half in the lower 32 bits.  */
static inline u64
byteswapu64 (u64 x)
{
  const u32 low_swapped = byteswapu32 (x & 0x00000000FFFFFFFF);
  const u32 high_swapped = byteswapu32 ((x & 0xFFFFFFFF00000000) >> 32);

  return ((u64) low_swapped << 32) | high_swapped;
}
65
/* Byte-swap the sixteen words of IN, run sixteen TT rounds over working
   copies of OUT[0..7] using C[0..15] as the per-round constants, and add
   the resulting working values back into OUT.

   The body is deliberately kept as straight-line code over individual
   scalars, with loads, rounds and stores in a fixed order: the dg-final
   directives at the end of this file scan the assembly generated for it.  */
void
foo (DTYPE in[16], DTYPE out[8], const DTYPE C[16])
{
  /* Scratch values assigned by every TT expansion.  */
  DTYPE tmp1 = 0, tmp2 = 0;

  /* Byte-swapped copies of the input words.  */
  const DTYPE w0 = byteswap (in[0]);
  const DTYPE w1 = byteswap (in[1]);
  const DTYPE w2 = byteswap (in[2]);
  const DTYPE w3 = byteswap (in[3]);
  const DTYPE w4 = byteswap (in[4]);
  const DTYPE w5 = byteswap (in[5]);
  const DTYPE w6 = byteswap (in[6]);
  const DTYPE w7 = byteswap (in[7]);
  const DTYPE w8 = byteswap (in[8]);
  const DTYPE w9 = byteswap (in[9]);
  const DTYPE w10 = byteswap (in[10]);
  const DTYPE w11 = byteswap (in[11]);
  const DTYPE w12 = byteswap (in[12]);
  const DTYPE w13 = byteswap (in[13]);
  const DTYPE w14 = byteswap (in[14]);
  const DTYPE w15 = byteswap (in[15]);

  /* Working variables, seeded from the current contents of OUT.  */
  DTYPE a = out[0];
  DTYPE b = out[1];
  DTYPE c = out[2];
  DTYPE d = out[3];
  DTYPE e = out[4];
  DTYPE f = out[5];
  DTYPE g = out[6];
  DTYPE h = out[7];

  /* Sixteen rounds.  The argument list shifts by one position per round
     instead of the values being copied between variables; after eight
     rounds the arguments are back in their starting order.  */
  TT (a, b, c, d, e, f, g, h, w0, C[0]);
  TT (h, a, b, c, d, e, f, g, w1, C[1]);
  TT (g, h, a, b, c, d, e, f, w2, C[2]);
  TT (f, g, h, a, b, c, d, e, w3, C[3]);
  TT (e, f, g, h, a, b, c, d, w4, C[4]);
  TT (d, e, f, g, h, a, b, c, w5, C[5]);
  TT (c, d, e, f, g, h, a, b, w6, C[6]);
  TT (b, c, d, e, f, g, h, a, w7, C[7]);
  TT (a, b, c, d, e, f, g, h, w8, C[8]);
  TT (h, a, b, c, d, e, f, g, w9, C[9]);
  TT (g, h, a, b, c, d, e, f, w10, C[10]);
  TT (f, g, h, a, b, c, d, e, w11, C[11]);
  TT (e, f, g, h, a, b, c, d, w12, C[12]);
  TT (d, e, f, g, h, a, b, c, w13, C[13]);
  TT (c, d, e, f, g, h, a, b, w14, C[14]);
  TT (b, c, d, e, f, g, h, a, w15, C[15]);

  /* Accumulate the working variables into the caller's state.  */
  out[0] += a;
  out[1] += b;
  out[2] += c;
  out[3] += d;
  out[4] += e;
  out[5] += f;
  out[6] += g;
  out[7] += h;
}
122
123 /* { dg-final { scan-assembler "kmovd" } } */
124 /* { dg-final { scan-assembler-not "knot" } } */
125 /* { dg-final { scan-assembler-not "kxor" } } */
126 /* { dg-final { scan-assembler-not "kor" } } */
127 /* { dg-final { scan-assembler-not "kandn" } } */
128