1 /*
2    BLAKE2 reference source code package - optimized C implementations
3 
4    Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
5 
6    To the extent possible under law, the author(s) have dedicated all copyright
7    and related and neighboring rights to this software to the public domain
8    worldwide. This software is distributed without any warranty.
9 
   You should have received a copy of the CC0 Public Domain Dedication along
   with this software. If not, see
   <http://creativecommons.org/publicdomain/zero/1.0/>.
14 */
15 
16 #ifndef blake2b_load_sse41_H
17 #define blake2b_load_sse41_H
18 
/*
 * LOAD_MSG_0_1 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m0.lo, m1.lo }
 *   b1 = { m2.lo, m3.lo }
 * m0..m3 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_0_1(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m0, m1); \
        (b1) = _mm_unpacklo_epi64(m2, m3); \
    } while (0)
24 
/*
 * LOAD_MSG_0_2 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m0.hi, m1.hi }
 *   b1 = { m2.hi, m3.hi }
 * m0..m3 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_0_2(b0, b1) \
    do { \
        (b0) = _mm_unpackhi_epi64(m0, m1); \
        (b1) = _mm_unpackhi_epi64(m2, m3); \
    } while (0)
30 
/*
 * LOAD_MSG_0_3 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m4.lo, m5.lo }
 *   b1 = { m6.lo, m7.lo }
 * m4..m7 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_0_3(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m4, m5); \
        (b1) = _mm_unpacklo_epi64(m6, m7); \
    } while (0)
36 
/*
 * LOAD_MSG_0_4 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m4.hi, m5.hi }
 *   b1 = { m6.hi, m7.hi }
 * m4..m7 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_0_4(b0, b1) \
    do { \
        (b0) = _mm_unpackhi_epi64(m4, m5); \
        (b1) = _mm_unpackhi_epi64(m6, m7); \
    } while (0)
42 
/*
 * LOAD_MSG_1_1 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m7.lo, m2.lo }
 *   b1 = { m4.hi, m6.hi }
 * m2, m4, m6 and m7 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_1_1(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m7, m2); \
        (b1) = _mm_unpackhi_epi64(m4, m6); \
    } while (0)
48 
/*
 * LOAD_MSG_1_2 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m5.lo, m4.lo }
 *   b1 = { m7.hi, m3.lo }   (alignr by 8 bytes over the pair m3:m7)
 * m3, m4, m5 and m7 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_1_2(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m5, m4); \
        (b1) = _mm_alignr_epi8(m3, m7, 8); \
    } while (0)
54 
/*
 * LOAD_MSG_1_3 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m0.hi, m0.lo }   (the 32-bit shuffle swaps m0's two 64-bit lanes)
 *   b1 = { m5.hi, m2.hi }
 * m0, m2 and m5 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_1_3(b0, b1) \
    do { \
        (b0) = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
        (b1) = _mm_unpackhi_epi64(m5, m2); \
    } while (0)
60 
/*
 * LOAD_MSG_1_4 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m6.lo, m1.lo }
 *   b1 = { m3.hi, m1.hi }
 * m1, m3 and m6 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_1_4(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m6, m1); \
        (b1) = _mm_unpackhi_epi64(m3, m1); \
    } while (0)
66 
/*
 * LOAD_MSG_2_1 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m5.hi, m6.lo }   (alignr by 8 bytes over the pair m6:m5)
 *   b1 = { m2.hi, m7.hi }
 * m2, m5, m6 and m7 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_2_1(b0, b1) \
    do { \
        (b0) = _mm_alignr_epi8(m6, m5, 8); \
        (b1) = _mm_unpackhi_epi64(m2, m7); \
    } while (0)
72 
/*
 * LOAD_MSG_2_2 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m4.lo, m0.lo }
 *   b1 = { m1.lo, m6.hi }   (blend mask 0xF0 takes the upper half from m6)
 * m0, m1, m4 and m6 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_2_2(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m4, m0); \
        (b1) = _mm_blend_epi16(m1, m6, 0xF0); \
    } while (0)
78 
/*
 * LOAD_MSG_2_3 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m5.lo, m1.hi }   (blend mask 0xF0 takes the upper half from m1)
 *   b1 = { m3.hi, m4.hi }
 * m1, m3, m4 and m5 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_2_3(b0, b1) \
    do { \
        (b0) = _mm_blend_epi16(m5, m1, 0xF0); \
        (b1) = _mm_unpackhi_epi64(m3, m4); \
    } while (0)
84 
/*
 * LOAD_MSG_2_4 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m7.lo, m3.lo }
 *   b1 = { m0.hi, m2.lo }   (alignr by 8 bytes over the pair m2:m0)
 * m0, m2, m3 and m7 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_2_4(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m7, m3); \
        (b1) = _mm_alignr_epi8(m2, m0, 8); \
    } while (0)
90 
/*
 * LOAD_MSG_3_1 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m3.hi, m1.hi }
 *   b1 = { m6.hi, m5.hi }
 * m1, m3, m5 and m6 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_3_1(b0, b1) \
    do { \
        (b0) = _mm_unpackhi_epi64(m3, m1); \
        (b1) = _mm_unpackhi_epi64(m6, m5); \
    } while (0)
96 
/*
 * LOAD_MSG_3_2 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m4.hi, m0.hi }
 *   b1 = { m6.lo, m7.lo }
 * m0, m4, m6 and m7 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_3_2(b0, b1) \
    do { \
        (b0) = _mm_unpackhi_epi64(m4, m0); \
        (b1) = _mm_unpacklo_epi64(m6, m7); \
    } while (0)
102 
/*
 * LOAD_MSG_3_3 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m1.lo, m2.hi }   (blend mask 0xF0 takes the upper half from m2)
 *   b1 = { m2.lo, m7.hi }   (blend mask 0xF0 takes the upper half from m7)
 * m1, m2 and m7 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_3_3(b0, b1) \
    do { \
        (b0) = _mm_blend_epi16(m1, m2, 0xF0); \
        (b1) = _mm_blend_epi16(m2, m7, 0xF0); \
    } while (0)
108 
/*
 * LOAD_MSG_3_4 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m3.lo, m5.lo }
 *   b1 = { m0.lo, m4.lo }
 * m0, m3, m4 and m5 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_3_4(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m3, m5); \
        (b1) = _mm_unpacklo_epi64(m0, m4); \
    } while (0)
114 
/*
 * LOAD_MSG_4_1 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m4.hi, m2.hi }
 *   b1 = { m1.lo, m5.lo }
 * m1, m2, m4 and m5 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_4_1(b0, b1) \
    do { \
        (b0) = _mm_unpackhi_epi64(m4, m2); \
        (b1) = _mm_unpacklo_epi64(m1, m5); \
    } while (0)
120 
/*
 * LOAD_MSG_4_2 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m0.lo, m3.hi }   (blend mask 0xF0 takes the upper half from m3)
 *   b1 = { m2.lo, m7.hi }   (blend mask 0xF0 takes the upper half from m7)
 * m0, m2, m3 and m7 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_4_2(b0, b1) \
    do { \
        (b0) = _mm_blend_epi16(m0, m3, 0xF0); \
        (b1) = _mm_blend_epi16(m2, m7, 0xF0); \
    } while (0)
126 
/*
 * LOAD_MSG_4_3 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m7.lo, m5.hi }   (blend mask 0xF0 takes the upper half from m5)
 *   b1 = { m3.lo, m1.hi }   (blend mask 0xF0 takes the upper half from m1)
 * m1, m3, m5 and m7 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_4_3(b0, b1) \
    do { \
        (b0) = _mm_blend_epi16(m7, m5, 0xF0); \
        (b1) = _mm_blend_epi16(m3, m1, 0xF0); \
    } while (0)
132 
/*
 * LOAD_MSG_4_4 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m0.hi, m6.lo }   (alignr by 8 bytes over the pair m6:m0)
 *   b1 = { m4.lo, m6.hi }   (blend mask 0xF0 takes the upper half from m6)
 * m0, m4 and m6 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_4_4(b0, b1) \
    do { \
        (b0) = _mm_alignr_epi8(m6, m0, 8); \
        (b1) = _mm_blend_epi16(m4, m6, 0xF0); \
    } while (0)
138 
/*
 * LOAD_MSG_5_1 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m1.lo, m3.lo }
 *   b1 = { m0.lo, m4.lo }
 * m0, m1, m3 and m4 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_5_1(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m1, m3); \
        (b1) = _mm_unpacklo_epi64(m0, m4); \
    } while (0)
144 
/*
 * LOAD_MSG_5_2 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m6.lo, m5.lo }
 *   b1 = { m5.hi, m1.hi }
 * m1, m5 and m6 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_5_2(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m6, m5); \
        (b1) = _mm_unpackhi_epi64(m5, m1); \
    } while (0)
150 
/*
 * LOAD_MSG_5_3 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m2.lo, m3.hi }   (blend mask 0xF0 takes the upper half from m3)
 *   b1 = { m7.hi, m0.hi }
 * m0, m2, m3 and m7 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_5_3(b0, b1) \
    do { \
        (b0) = _mm_blend_epi16(m2, m3, 0xF0); \
        (b1) = _mm_unpackhi_epi64(m7, m0); \
    } while (0)
156 
/*
 * LOAD_MSG_5_4 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m6.hi, m2.hi }
 *   b1 = { m7.lo, m4.hi }   (blend mask 0xF0 takes the upper half from m4)
 * m2, m4, m6 and m7 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_5_4(b0, b1) \
    do { \
        (b0) = _mm_unpackhi_epi64(m6, m2); \
        (b1) = _mm_blend_epi16(m7, m4, 0xF0); \
    } while (0)
162 
/*
 * LOAD_MSG_6_1 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m6.lo, m0.hi }   (blend mask 0xF0 takes the upper half from m0)
 *   b1 = { m7.lo, m2.lo }
 * m0, m2, m6 and m7 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_6_1(b0, b1) \
    do { \
        (b0) = _mm_blend_epi16(m6, m0, 0xF0); \
        (b1) = _mm_unpacklo_epi64(m7, m2); \
    } while (0)
168 
/*
 * LOAD_MSG_6_2 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m2.hi, m7.hi }
 *   b1 = { m6.hi, m5.lo }   (alignr by 8 bytes over the pair m5:m6)
 * m2, m5, m6 and m7 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_6_2(b0, b1) \
    do { \
        (b0) = _mm_unpackhi_epi64(m2, m7); \
        (b1) = _mm_alignr_epi8(m5, m6, 8); \
    } while (0)
174 
/*
 * LOAD_MSG_6_3 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m0.lo, m3.lo }
 *   b1 = { m4.hi, m4.lo }   (the 32-bit shuffle swaps m4's two 64-bit lanes)
 * m0, m3 and m4 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_6_3(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m0, m3); \
        (b1) = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1, 0, 3, 2)); \
    } while (0)
180 
/*
 * LOAD_MSG_6_4 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m3.hi, m1.hi }
 *   b1 = { m1.lo, m5.hi }   (blend mask 0xF0 takes the upper half from m5)
 * m1, m3 and m5 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_6_4(b0, b1) \
    do { \
        (b0) = _mm_unpackhi_epi64(m3, m1); \
        (b1) = _mm_blend_epi16(m1, m5, 0xF0); \
    } while (0)
186 
/*
 * LOAD_MSG_7_1 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m6.hi, m3.hi }
 *   b1 = { m6.lo, m1.hi }   (blend mask 0xF0 takes the upper half from m1)
 * m1, m3 and m6 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_7_1(b0, b1) \
    do { \
        (b0) = _mm_unpackhi_epi64(m6, m3); \
        (b1) = _mm_blend_epi16(m6, m1, 0xF0); \
    } while (0)
192 
/*
 * LOAD_MSG_7_2 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m5.hi, m7.lo }   (alignr by 8 bytes over the pair m7:m5)
 *   b1 = { m0.hi, m4.hi }
 * m0, m4, m5 and m7 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_7_2(b0, b1) \
    do { \
        (b0) = _mm_alignr_epi8(m7, m5, 8); \
        (b1) = _mm_unpackhi_epi64(m0, m4); \
    } while (0)
198 
/*
 * LOAD_MSG_7_3 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m2.hi, m7.hi }
 *   b1 = { m4.lo, m1.lo }
 * m1, m2, m4 and m7 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_7_3(b0, b1) \
    do { \
        (b0) = _mm_unpackhi_epi64(m2, m7); \
        (b1) = _mm_unpacklo_epi64(m4, m1); \
    } while (0)
204 
/*
 * LOAD_MSG_7_4 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m0.lo, m2.lo }
 *   b1 = { m3.lo, m5.lo }
 * m0, m2, m3 and m5 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_7_4(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m0, m2); \
        (b1) = _mm_unpacklo_epi64(m3, m5); \
    } while (0)
210 
/*
 * LOAD_MSG_8_1 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m3.lo, m7.lo }
 *   b1 = { m5.hi, m0.lo }   (alignr by 8 bytes over the pair m0:m5)
 * m0, m3, m5 and m7 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_8_1(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m3, m7); \
        (b1) = _mm_alignr_epi8(m0, m5, 8); \
    } while (0)
216 
/*
 * LOAD_MSG_8_2 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m7.hi, m4.hi }
 *   b1 = { m1.hi, m4.lo }   (alignr by 8 bytes over the pair m4:m1)
 * m1, m4 and m7 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_8_2(b0, b1) \
    do { \
        (b0) = _mm_unpackhi_epi64(m7, m4); \
        (b1) = _mm_alignr_epi8(m4, m1, 8); \
    } while (0)
222 
/*
 * LOAD_MSG_8_3 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = m6 unchanged (both lanes already in the wanted order)
 *   b1 = { m0.hi, m5.lo }   (alignr by 8 bytes over the pair m5:m0)
 * m0, m5 and m6 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_8_3(b0, b1) \
    do { \
        (b0) = m6; \
        (b1) = _mm_alignr_epi8(m5, m0, 8); \
    } while (0)
228 
/*
 * LOAD_MSG_8_4 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m1.lo, m3.hi }   (blend mask 0xF0 takes the upper half from m3)
 *   b1 = m2 unchanged (both lanes already in the wanted order)
 * m1, m2 and m3 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_8_4(b0, b1) \
    do { \
        (b0) = _mm_blend_epi16(m1, m3, 0xF0); \
        (b1) = m2; \
    } while (0)
234 
/*
 * LOAD_MSG_9_1 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m5.lo, m4.lo }
 *   b1 = { m3.hi, m0.hi }
 * m0, m3, m4 and m5 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_9_1(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m5, m4); \
        (b1) = _mm_unpackhi_epi64(m3, m0); \
    } while (0)
240 
/*
 * LOAD_MSG_9_2 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m1.lo, m2.lo }
 *   b1 = { m3.lo, m2.hi }   (blend mask 0xF0 takes the upper half from m2)
 * m1, m2 and m3 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_9_2(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m1, m2); \
        (b1) = _mm_blend_epi16(m3, m2, 0xF0); \
    } while (0)
246 
/*
 * LOAD_MSG_9_3 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m7.hi, m4.hi }
 *   b1 = { m1.hi, m6.hi }
 * m1, m4, m6 and m7 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_9_3(b0, b1) \
    do { \
        (b0) = _mm_unpackhi_epi64(m7, m4); \
        (b1) = _mm_unpackhi_epi64(m1, m6); \
    } while (0)
252 
/*
 * LOAD_MSG_9_4 -- gather 64-bit lanes, written { low lane, high lane }:
 *   b0 = { m5.hi, m7.lo }   (alignr by 8 bytes over the pair m7:m5)
 *   b1 = { m6.lo, m0.lo }
 * m0, m5, m6 and m7 are not parameters; they are read from the expansion site.
 */
#define LOAD_MSG_9_4(b0, b1) \
    do { \
        (b0) = _mm_alignr_epi8(m7, m5, 8); \
        (b1) = _mm_unpacklo_epi64(m6, m0); \
    } while (0)
258 
/*
 * LOAD_MSG_10_1 performs exactly the gather of LOAD_MSG_0_1
 * (b0 = { m0.lo, m1.lo }, b1 = { m2.lo, m3.lo }, as { low, high } 64-bit
 * lanes), so it simply forwards to that macro.
 */
#define LOAD_MSG_10_1(b0, b1) LOAD_MSG_0_1(b0, b1)
264 
/*
 * LOAD_MSG_10_2 performs exactly the gather of LOAD_MSG_0_2
 * (b0 = { m0.hi, m1.hi }, b1 = { m2.hi, m3.hi }, as { low, high } 64-bit
 * lanes), so it simply forwards to that macro.
 */
#define LOAD_MSG_10_2(b0, b1) LOAD_MSG_0_2(b0, b1)
270 
/*
 * LOAD_MSG_10_3 performs exactly the gather of LOAD_MSG_0_3
 * (b0 = { m4.lo, m5.lo }, b1 = { m6.lo, m7.lo }, as { low, high } 64-bit
 * lanes), so it simply forwards to that macro.
 */
#define LOAD_MSG_10_3(b0, b1) LOAD_MSG_0_3(b0, b1)
276 
/*
 * LOAD_MSG_10_4 performs exactly the gather of LOAD_MSG_0_4
 * (b0 = { m4.hi, m5.hi }, b1 = { m6.hi, m7.hi }, as { low, high } 64-bit
 * lanes), so it simply forwards to that macro.
 */
#define LOAD_MSG_10_4(b0, b1) LOAD_MSG_0_4(b0, b1)
282 
/*
 * LOAD_MSG_11_1 performs exactly the gather of LOAD_MSG_1_1
 * (b0 = { m7.lo, m2.lo }, b1 = { m4.hi, m6.hi }, as { low, high } 64-bit
 * lanes), so it simply forwards to that macro.
 */
#define LOAD_MSG_11_1(b0, b1) LOAD_MSG_1_1(b0, b1)
288 
/*
 * LOAD_MSG_11_2 performs exactly the gather of LOAD_MSG_1_2
 * (b0 = { m5.lo, m4.lo }, b1 = { m7.hi, m3.lo }, as { low, high } 64-bit
 * lanes), so it simply forwards to that macro.
 */
#define LOAD_MSG_11_2(b0, b1) LOAD_MSG_1_2(b0, b1)
294 
/*
 * LOAD_MSG_11_3 performs exactly the gather of LOAD_MSG_1_3
 * (b0 = { m0.hi, m0.lo }, b1 = { m5.hi, m2.hi }, as { low, high } 64-bit
 * lanes), so it simply forwards to that macro.
 */
#define LOAD_MSG_11_3(b0, b1) LOAD_MSG_1_3(b0, b1)
300 
/*
 * LOAD_MSG_11_4 performs exactly the gather of LOAD_MSG_1_4
 * (b0 = { m6.lo, m1.lo }, b1 = { m3.hi, m1.hi }, as { low, high } 64-bit
 * lanes), so it simply forwards to that macro.
 */
#define LOAD_MSG_11_4(b0, b1) LOAD_MSG_1_4(b0, b1)
306 
307 #endif
308