1 
2 #ifndef blake2b_compress_sse41_H
3 #define blake2b_compress_sse41_H
4 
/*
 * LOADU(src): load one __m128i from `src` using the unaligned-load
 * intrinsic. `src` may be a pointer of any object type; it is converted
 * through `const void *` before being cast to `const __m128i *`.
 */
#define LOADU(src) _mm_loadu_si128((const __m128i *) (const void *) (src))
/*
 * STOREU(dst, vec): store the __m128i value `vec` to `dst` using the
 * unaligned-store intrinsic. `dst` may be a pointer of any object type;
 * it is converted through `void *` before being cast to `__m128i *`.
 */
#define STOREU(dst, vec) \
    _mm_storeu_si128((__m128i *) (void *) (dst), (vec))
7 
/*
 * _mm_roti_epi64(x, c): rotate each 64-bit lane of `x` right by -(c) bits.
 *
 * `c` is given as a NEGATIVE count (e.g. -32 for a right-rotation by 32);
 * the macro negates it before use. It should be an integer constant so the
 * compiler can fold the selection below down to a single branch.
 *
 * Selection by rotation amount:
 *   32     -> swap the two 32-bit halves of each lane (_mm_shuffle_epi32)
 *   24, 16 -> byte shuffle using the masks `r24` / `r16`
 *   63     -> (x >> 63) ^ (x + x); x + x stands in for x << 1
 *   other  -> generic (x >> n) ^ (x << (64 - n))
 *
 * Requirements at the expansion site:
 *   - `r16` and `r24` must be in scope as __m128i values even when the
 *     chosen rotation does not use them, because every arm of the
 *     conditional is still compiled. They are not defined in this header;
 *     presumably they are the byte-shuffle masks for 16- and 24-bit
 *     rotations -- confirm against the including file.
 *   - `x` appears more than once in the expansion, so it must be free of
 *     side effects.
 *
 * The whole expansion is parenthesised so the conditional expression
 * cannot re-associate with operators surrounding the invocation.
 */
#define _mm_roti_epi64(x, c)                                        \
    ((-(c) == 32)                                                   \
         ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1))          \
     : (-(c) == 24)                                                 \
         ? _mm_shuffle_epi8((x), r24)                               \
     : (-(c) == 16)                                                 \
         ? _mm_shuffle_epi8((x), r16)                               \
     : (-(c) == 63)                                                 \
         ? _mm_xor_si128(_mm_srli_epi64((x), -(c)),                 \
                         _mm_add_epi64((x), (x)))                   \
         : _mm_xor_si128(_mm_srli_epi64((x), -(c)),                 \
                         _mm_slli_epi64((x), 64 - (-(c)))))
20 
/*
 * G1: first half of the mixing step, applied to the low (`*l`) and high
 * (`*h`) register of each row; every register holds two 64-bit lanes.
 *
 * Per lane, with a/b/c/d = row1/row2/row3/row4 and m = b0 (low) or b1 (high):
 *     a = a + m + b
 *     d = ror64(d ^ a, 32)
 *     c = c + d
 *     b = ror64(b ^ c, 24)
 *
 * All row arguments are assigned to and must be modifiable __m128i lvalues;
 * b0/b1 are only read. The rotations go through _mm_roti_epi64, so whatever
 * that macro needs in scope (`r24` here) must be visible at the call site.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement and
 * stays intact under an unbraced if/else or loop. Invoke it with a
 * trailing semicolon.
 */
#define G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1) \
    do {                                                                   \
        /* a += m + b */                                                   \
        row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);            \
        row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);            \
                                                                           \
        /* d = ror64(d ^ a, 32) */                                         \
        row4l = _mm_xor_si128(row4l, row1l);                               \
        row4h = _mm_xor_si128(row4h, row1h);                               \
        row4l = _mm_roti_epi64(row4l, -32);                                \
        row4h = _mm_roti_epi64(row4h, -32);                                \
                                                                           \
        /* c += d */                                                       \
        row3l = _mm_add_epi64(row3l, row4l);                               \
        row3h = _mm_add_epi64(row3h, row4h);                               \
                                                                           \
        /* b = ror64(b ^ c, 24) */                                         \
        row2l = _mm_xor_si128(row2l, row3l);                               \
        row2h = _mm_xor_si128(row2h, row3h);                               \
        row2l = _mm_roti_epi64(row2l, -24);                                \
        row2h = _mm_roti_epi64(row2h, -24);                                \
    } while (0)
39 
/*
 * G2: second half of the mixing step, applied to the low (`*l`) and high
 * (`*h`) register of each row; every register holds two 64-bit lanes.
 *
 * Per lane, with a/b/c/d = row1/row2/row3/row4 and m = b0 (low) or b1 (high):
 *     a = a + m + b
 *     d = ror64(d ^ a, 16)
 *     c = c + d
 *     b = ror64(b ^ c, 63)
 *
 * All row arguments are assigned to and must be modifiable __m128i lvalues;
 * b0/b1 are only read. The rotations go through _mm_roti_epi64, so whatever
 * that macro needs in scope (`r16` here) must be visible at the call site.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement and
 * stays intact under an unbraced if/else or loop. Invoke it with a
 * trailing semicolon.
 */
#define G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1) \
    do {                                                                   \
        /* a += m + b */                                                   \
        row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);            \
        row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);            \
                                                                           \
        /* d = ror64(d ^ a, 16) */                                         \
        row4l = _mm_xor_si128(row4l, row1l);                               \
        row4h = _mm_xor_si128(row4h, row1h);                               \
        row4l = _mm_roti_epi64(row4l, -16);                                \
        row4h = _mm_roti_epi64(row4h, -16);                                \
                                                                           \
        /* c += d */                                                       \
        row3l = _mm_add_epi64(row3l, row4l);                               \
        row3h = _mm_add_epi64(row3h, row4h);                               \
                                                                           \
        /* b = ror64(b ^ c, 63) */                                         \
        row2l = _mm_xor_si128(row2l, row3l);                               \
        row2h = _mm_xor_si128(row2h, row3h);                               \
        row2l = _mm_roti_epi64(row2l, -63);                                \
        row2h = _mm_roti_epi64(row2h, -63);                                \
    } while (0)
58 
/*
 * DIAGONALIZE: re-align rows 2..4 of the 4x4 state of 64-bit words so that
 * a subsequent column-wise G1/G2 operates on the diagonals.
 *
 * Each row is four lanes split as (low register, high register). Writing a
 * row as [w0 w1 | w2 w3]:
 *     row2 -> [w1 w2 | w3 w0]   (rotated by one lane)
 *     row3 -> [w2 w3 | w0 w1]   (halves swapped, i.e. rotated by two)
 *     row4 -> [w3 w0 | w1 w2]   (rotated by three lanes)
 * row1l/row1h are accepted for a uniform signature but are not touched.
 *
 * `t0` and `t1` are NOT parameters: they must exist at the call site as
 * assignable __m128i scratch variables, and they are overwritten.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement;
 * invoke it with a trailing semicolon.
 */
#define DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h) \
    do {                                                                    \
        /* row2: rotate by one lane */                                      \
        t0    = _mm_alignr_epi8(row2h, row2l, 8);                           \
        t1    = _mm_alignr_epi8(row2l, row2h, 8);                           \
        row2l = t0;                                                         \
        row2h = t1;                                                         \
                                                                            \
        /* row3: swap low and high halves */                                \
        t0    = row3l;                                                      \
        row3l = row3h;                                                      \
        row3h = t0;                                                         \
                                                                            \
        /* row4: rotate by three lanes (note t1 -> low, t0 -> high) */      \
        t0    = _mm_alignr_epi8(row4h, row4l, 8);                           \
        t1    = _mm_alignr_epi8(row4l, row4h, 8);                           \
        row4l = t1;                                                         \
        row4h = t0;                                                         \
    } while (0)
73 
/*
 * UNDIAGONALIZE: inverse of DIAGONALIZE; restores rows 2..4 to their
 * column-aligned lane order after the diagonal G1/G2 pass.
 *
 * Writing a row as [w0 w1 | w2 w3] (low register | high register):
 *     row2 -> [w3 w0 | w1 w2]
 *     row3 -> [w2 w3 | w0 w1]   (halves swapped)
 *     row4 -> [w1 w2 | w3 w0]
 * row1l/row1h are accepted for a uniform signature but are not touched.
 *
 * `t0` and `t1` are NOT parameters: they must exist at the call site as
 * assignable __m128i scratch variables, and they are overwritten.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement;
 * invoke it with a trailing semicolon.
 */
#define UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h) \
    do {                                                                      \
        /* row2: undo the one-lane rotation */                                \
        t0    = _mm_alignr_epi8(row2l, row2h, 8);                             \
        t1    = _mm_alignr_epi8(row2h, row2l, 8);                             \
        row2l = t0;                                                           \
        row2h = t1;                                                           \
                                                                              \
        /* row3: swap low and high halves back */                             \
        t0    = row3l;                                                        \
        row3l = row3h;                                                        \
        row3h = t0;                                                           \
                                                                              \
        /* row4: undo the three-lane rotation (t1 -> low, t0 -> high) */      \
        t0    = _mm_alignr_epi8(row4l, row4h, 8);                             \
        t1    = _mm_alignr_epi8(row4h, row4l, 8);                             \
        row4l = t1;                                                           \
        row4h = t0;                                                           \
    } while (0)
88 
89 #include "blake2b-load-sse41.h"
90 
/*
 * ROUND(r): one full round -- a column pass (G1, G2), DIAGONALIZE, a
 * diagonal pass (G1, G2), then UNDIAGONALIZE.
 *
 * `r` must be a literal token: it is pasted into the names
 * LOAD_MSG_<r>_1 .. LOAD_MSG_<r>_4, which are not defined in this file
 * (presumably they come from "blake2b-load-sse41.h" and fill b0/b1 with
 * the message words for that round -- confirm there).
 *
 * Names taken from the call site rather than passed as arguments:
 *   row1l..row4l, row1h..row4h  - state registers, updated in place
 *   b0, b1                      - message operands, overwritten
 * plus anything the nested macros need (e.g. t0/t1 for the
 * (UN)DIAGONALIZE steps, r16/r24 for the rotations).
 *
 * Wrapped in do { } while (0) so the expansion is a single statement;
 * invoke it with a trailing semicolon.
 */
#define ROUND(r)                                                             \
    do {                                                                     \
        /* column step */                                                    \
        LOAD_MSG_##r##_1(b0, b1);                                            \
        G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);  \
        LOAD_MSG_##r##_2(b0, b1);                                            \
        G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);  \
                                                                             \
        /* diagonal step */                                                  \
        DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
        LOAD_MSG_##r##_3(b0, b1);                                            \
        G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);  \
        LOAD_MSG_##r##_4(b0, b1);                                            \
        G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1);  \
        UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h,       \
                      row4h);                                                \
    } while (0)
102 
103 #endif
104