1 /*
2    BLAKE2 reference source code package - optimized C implementations
3 
4    Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
5 
6    To the extent possible under law, the author(s) have dedicated all copyright
7    and related and neighboring rights to this software to the public domain
8    worldwide. This software is distributed without any warranty.
9 
10    You should have received a copy of the CC0 Public Domain Dedication along with
11    this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
12 */
13 #pragma once
14 #ifndef __BLAKE2S_LOAD_SSE41_H__
15 #define __BLAKE2S_LOAD_SSE41_H__
16 
/*
 * LOAD_MSG_0_1(buf): picks lanes 0 and 2 of m0 followed by lanes 0 and 2 of m1
 * (mN[i] = 32-bit lane i of mN, lane 0 listed first):
 *   buf = { m0[0], m0[2], m1[0], m1[2] }
 * Assumes TOF/TOI only reinterpret bits between the integer and float vector
 * types (they are defined elsewhere; confirm). m0 and m1 must exist in the
 * caller's scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 0, 2, 4, 6.
 */
#define LOAD_MSG_0_1(buf) \
    buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(2, 0, 2, 0)));
19 
/*
 * LOAD_MSG_0_2(buf): picks lanes 1 and 3 of m0 followed by lanes 1 and 3 of m1
 * (mN[i] = 32-bit lane i of mN, lane 0 listed first):
 *   buf = { m0[1], m0[3], m1[1], m1[3] }
 * Assumes TOF/TOI only reinterpret bits between the integer and float vector
 * types (they are defined elsewhere; confirm). m0 and m1 must exist in the
 * caller's scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 1, 3, 5, 7.
 */
#define LOAD_MSG_0_2(buf) \
    buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(3, 1, 3, 1)));
22 
/*
 * LOAD_MSG_0_3(buf): picks lanes 0 and 2 of m2 followed by lanes 0 and 2 of m3
 * (mN[i] = 32-bit lane i of mN, lane 0 listed first):
 *   buf = { m2[0], m2[2], m3[0], m3[2] }
 * Assumes TOF/TOI only reinterpret bits between the integer and float vector
 * types (they are defined elsewhere; confirm). m2 and m3 must exist in the
 * caller's scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 8, 10, 12, 14.
 */
#define LOAD_MSG_0_3(buf) \
    buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(2, 0, 2, 0)));
25 
/*
 * LOAD_MSG_0_4(buf): picks lanes 1 and 3 of m2 followed by lanes 1 and 3 of m3
 * (mN[i] = 32-bit lane i of mN, lane 0 listed first):
 *   buf = { m2[1], m2[3], m3[1], m3[3] }
 * Assumes TOF/TOI only reinterpret bits between the integer and float vector
 * types (they are defined elsewhere; confirm). m2 and m3 must exist in the
 * caller's scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 9, 11, 13, 15.
 */
#define LOAD_MSG_0_4(buf) \
    buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(3, 1, 3, 1)));
28 
/*
 * LOAD_MSG_1_1(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN, 0 = zero fill from the byte shift):
 *   t0  = { m1[0], m2[1], m1[2], m1[3] }
 *   t1  = {     0, m3[0], m3[1], m3[2] }
 *   t2  = { m1[0], m2[1], m3[1], m3[2] }
 *   buf = { m3[2], m1[0], m2[1], m3[1] }
 * Overwrites t0, t1, t2, which (like m1..m3) must exist in the caller's scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 14, 4, 9, 13.
 */
#define LOAD_MSG_1_1(buf) \
    t0 = _mm_blend_epi16(m1, m2, 0x0C); \
    t1 = _mm_slli_si128(m3, 4); \
    t2 = _mm_blend_epi16(t0, t1, 0xF0); \
    buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));
34 
/*
 * LOAD_MSG_1_2(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN):
 *   t0  = { m2[0], m2[2], m2[0], m2[0] }
 *   t1  = { m1[0], m1[1], m1[2], m3[3] }
 *   t2  = { m2[0], m2[2], m1[2], m3[3] }
 *   buf = { m2[2], m2[0], m3[3], m1[2] }
 * Overwrites t0, t1, t2, which (like m1..m3) must exist in the caller's scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 10, 8, 15, 6.
 */
#define LOAD_MSG_1_2(buf) \
    t0 = _mm_shuffle_epi32(m2, _MM_SHUFFLE(0, 0, 2, 0)); \
    t1 = _mm_blend_epi16(m1, m3, 0xC0); \
    t2 = _mm_blend_epi16(t0, t1, 0xF0); \
    buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 3, 0, 1));
40 
/*
 * LOAD_MSG_1_3(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN, 0 = zero fill from the byte shift):
 *   t0  = {     0, m1[0], m1[1], m1[2] }
 *   t1  = { m2[0], m2[1], m1[1], m2[3] }
 *   t2  = { m0[0], m0[1], m1[1], m2[3] }
 *   buf = { m0[1], m0[0], m2[3], m1[1] }
 * Overwrites t0, t1, t2, which (like m0..m2) must exist in the caller's scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 1, 0, 11, 5.
 */
#define LOAD_MSG_1_3(buf) \
    t0 = _mm_slli_si128(m1, 4); \
    t1 = _mm_blend_epi16(m2, t0, 0x30); \
    t2 = _mm_blend_epi16(m0, t1, 0xF0); \
    buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 3, 0, 1));
46 
/*
 * LOAD_MSG_1_4(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN, 0 = zero fill from the byte shift):
 *   t0  = { m0[2], m1[2], m0[3], m1[3] }
 *   t1  = {     0, m3[0], m3[1], m3[2] }
 *   t2  = { m0[2], m3[0], m0[3], m1[3] }
 *   buf = { m3[0], m0[2], m1[3], m0[3] }
 * Overwrites t0, t1, t2, which (like m0, m1, m3) must exist in the caller's
 * scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 12, 2, 7, 3.
 */
#define LOAD_MSG_1_4(buf) \
    t0 = _mm_unpackhi_epi32(m0, m1); \
    t1 = _mm_slli_si128(m3, 4); \
    t2 = _mm_blend_epi16(t0, t1, 0x0C); \
    buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 3, 0, 1));
52 
/*
 * LOAD_MSG_2_1(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN):
 *   t0  = { m2[2], m3[2], m2[3], m3[3] }
 *   t1  = { m3[0], m1[1], m3[2], m3[3] }
 *   t2  = { m3[0], m1[1], m2[3], m3[3] }
 *   buf = { m2[3], m3[0], m1[1], m3[3] }
 * Overwrites t0, t1, t2, which (like m1..m3) must exist in the caller's scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 11, 12, 5, 15.
 */
#define LOAD_MSG_2_1(buf) \
    t0 = _mm_unpackhi_epi32(m2, m3); \
    t1 = _mm_blend_epi16(m3, m1, 0x0C); \
    t2 = _mm_blend_epi16(t0, t1, 0x0F); \
    buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 1, 0, 2));
58 
/*
 * LOAD_MSG_2_2(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN, 0 = zero fill from the byte shift):
 *   t0  = { m2[0], m0[0], m2[1], m0[1] }
 *   t1  = { m2[0], m0[0], m0[2], m0[3] }
 *   t2  = {     0,     0, m3[0], m3[1] }
 *   buf = { m2[0], m0[0], m0[2], m3[1] }
 * Overwrites t0, t1, t2, which (like m0, m2, m3) must exist in the caller's
 * scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 8, 0, 2, 13.
 */
#define LOAD_MSG_2_2(buf) \
    t0 = _mm_unpacklo_epi32(m2, m0); \
    t1 = _mm_blend_epi16(t0, m0, 0xF0); \
    t2 = _mm_slli_si128(m3, 8); \
    buf = _mm_blend_epi16(t1, t2, 0xC0);
64 
/*
 * LOAD_MSG_2_3(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN, 0 = zero fill from the byte shift):
 *   t0  = { m0[0], m2[1], m2[2], m0[3] }
 *   t1  = { m1[3],     0,     0,     0 }
 *   t2  = { m1[3], m2[1], m2[2], m0[3] }
 *   buf = { m2[2], m0[3], m1[3], m2[1] }
 * Overwrites t0, t1, t2, which (like m0..m2) must exist in the caller's scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 10, 3, 7, 9.
 */
#define LOAD_MSG_2_3(buf) \
    t0 = _mm_blend_epi16(m0, m2, 0x3C); \
    t1 = _mm_srli_si128(m1, 12); \
    t2 = _mm_blend_epi16(t0, t1, 0x03); \
    buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 0, 3, 2));
70 
/*
 * LOAD_MSG_2_4(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN, 0 = zero fill from the byte shift):
 *   t0  = {     0, m3[0], m3[1], m3[2] }
 *   t1  = { m1[0], m0[1], m1[2], m0[3] }
 *   t2  = { m1[0], m0[1], m1[2], m3[2] }
 *   buf = { m3[2], m1[2], m0[1], m1[0] }
 * Overwrites t0, t1, t2, which (like m0, m1, m3) must exist in the caller's
 * scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 14, 6, 1, 4.
 */
#define LOAD_MSG_2_4(buf) \
    t0 = _mm_slli_si128(m3, 4); \
    t1 = _mm_blend_epi16(m0, m1, 0x33); \
    t2 = _mm_blend_epi16(t1, t0, 0xC0); \
    buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0, 1, 2, 3));
76 
/*
 * LOAD_MSG_3_1(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN):
 *   t0  = { m0[2], m1[2], m0[3], m1[3] }
 *   t1  = { m0[3], m2[2], m1[3], m2[3] }
 *   t2  = { m0[3], m3[1], m1[3], m2[3] }
 *   buf = { m1[3], m0[3], m3[1], m2[3] }
 * Overwrites t0, t1, t2, which (like m0..m3) must exist in the caller's scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 7, 3, 13, 11.
 */
#define LOAD_MSG_3_1(buf) \
    t0 = _mm_unpackhi_epi32(m0, m1); \
    t1 = _mm_unpackhi_epi32(t0, m2); \
    t2 = _mm_blend_epi16(t1, m3, 0x0C); \
    buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 1, 0, 2));
82 
/*
 * LOAD_MSG_3_2(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN, 0 = zero fill from the byte shift):
 *   t0  = {     0,     0, m2[0], m2[1] }
 *   t1  = { m3[0], m0[1], m3[2], m3[3] }
 *   t2  = { m3[0], m0[1], m3[2], m2[1] }
 *   buf = { m2[1], m0[1], m3[0], m3[2] }
 * Overwrites t0, t1, t2, which (like m0, m2, m3) must exist in the caller's
 * scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 9, 1, 12, 14.
 */
#define LOAD_MSG_3_2(buf) \
    t0 = _mm_slli_si128(m2, 8); \
    t1 = _mm_blend_epi16(m3, m0, 0x0C); \
    t2 = _mm_blend_epi16(t1, t0, 0xC0); \
    buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 0, 1, 3));
88 
/*
 * LOAD_MSG_3_3(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN):
 *   t0  = { m1[0], m1[1], m0[2], m0[3] }
 *   t1  = { m1[0], m1[1], m0[2], m3[3] }
 *   buf = { m0[2], m1[1], m1[0], m3[3] }
 * Overwrites t0 and t1, which (like m0, m1, m3) must exist in the caller's
 * scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 2, 5, 4, 15.
 */
#define LOAD_MSG_3_3(buf) \
    t0 = _mm_blend_epi16(m0, m1, 0x0F); \
    t1 = _mm_blend_epi16(t0, m3, 0xC0); \
    buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3, 0, 1, 2));
93 
/*
 * LOAD_MSG_3_4(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN):
 *   t0  = { m0[0], m2[0], m0[1], m2[1] }
 *   t1  = { m1[2], m2[2], m1[3], m2[3] }
 *   buf = { m1[2], m2[2], m0[0], m2[0] }   (low half of t1, then low half of t0)
 * Overwrites t0 and t1, which (like m0..m2) must exist in the caller's scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 6, 10, 0, 8.
 */
#define LOAD_MSG_3_4(buf) \
    t0 = _mm_unpacklo_epi32(m0, m2); \
    t1 = _mm_unpackhi_epi32(m1, m2); \
    buf = _mm_unpacklo_epi64(t1, t0);
98 
/*
 * LOAD_MSG_4_1(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN):
 *   t0  = { m1[0], m1[1], m2[0], m2[1] }
 *   t1  = { m0[2], m0[3], m2[2], m2[3] }
 *   t2  = { m0[2], m1[1], m2[2], m2[1] }
 *   buf = { m2[1], m1[1], m0[2], m2[2] }
 * Overwrites t0, t1, t2, which (like m0..m2) must exist in the caller's scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 9, 5, 2, 10.
 */
#define LOAD_MSG_4_1(buf) \
    t0 = _mm_unpacklo_epi64(m1, m2); \
    t1 = _mm_unpackhi_epi64(m0, m2); \
    t2 = _mm_blend_epi16(t0, t1, 0x33); \
    buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 0, 1, 3));
104 
/*
 * LOAD_MSG_4_2(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN):
 *   t0  = { m1[2], m1[3], m3[2], m3[3] }
 *   t1  = { m0[0], m0[1], m1[0], m1[1] }
 *   buf = { m0[0], m1[3], m1[0], m3[3] }
 * Overwrites t0 and t1, which (like m0, m1, m3) must exist in the caller's
 * scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 0, 7, 4, 15.
 */
#define LOAD_MSG_4_2(buf) \
    t0 = _mm_unpackhi_epi64(m1, m3); \
    t1 = _mm_unpacklo_epi64(m0, m1); \
    buf = _mm_blend_epi16(t0, t1, 0x33);
109 
/*
 * LOAD_MSG_4_3(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN):
 *   t0  = { m3[2], m3[3], m1[2], m1[3] }
 *   t1  = { m2[2], m2[3], m0[2], m0[3] }
 *   buf = { m3[2], m2[3], m1[2], m0[3] }   (t1 is the blend base, t0 fills
 *                                           lanes 0 and 2)
 * Overwrites t0 and t1, which (like m0..m3) must exist in the caller's scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 14, 11, 6, 3.
 */
#define LOAD_MSG_4_3(buf) \
    t0 = _mm_unpackhi_epi64(m3, m1); \
    t1 = _mm_unpackhi_epi64(m2, m0); \
    buf = _mm_blend_epi16(t1, t0, 0x33);
114 
/*
 * LOAD_MSG_4_4(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN, 0 = zero fill from the byte shift):
 *   t0  = { m2[0], m0[1], m0[2], m0[3] }
 *   t1  = {     0,     0, m2[0], m0[1] }
 *   t2  = { m3[0], m3[1], m2[0], m0[1] }
 *   buf = { m0[1], m3[0], m2[0], m3[1] }
 * Overwrites t0, t1, t2, which (like m0, m2, m3) must exist in the caller's
 * scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 1, 12, 8, 13.
 */
#define LOAD_MSG_4_4(buf) \
    t0 = _mm_blend_epi16(m0, m2, 0x03); \
    t1 = _mm_slli_si128(t0, 8); \
    t2 = _mm_blend_epi16(t1, m3, 0x0F); \
    buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 2, 0, 3));
120 
/*
 * LOAD_MSG_5_1(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN):
 *   t0  = { m0[2], m1[2], m0[3], m1[3] }
 *   t1  = { m0[0], m2[0], m0[1], m2[1] }
 *   buf = { m0[2], m1[2], m0[0], m2[0] }
 * Overwrites t0 and t1, which (like m0..m2) must exist in the caller's scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 2, 6, 0, 8.
 */
#define LOAD_MSG_5_1(buf) \
    t0 = _mm_unpackhi_epi32(m0, m1); \
    t1 = _mm_unpacklo_epi32(m0, m2); \
    buf = _mm_unpacklo_epi64(t0, t1);
125 
/*
 * LOAD_MSG_5_2(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN, 0 = zero fill from the byte shift):
 *   t0  = { m2[1], m2[2], m2[3],     0 }
 *   t1  = { m3[0], m0[1], m0[2], m0[3] }
 *   buf = { m3[0], m2[2], m2[3], m0[3] }
 * Overwrites t0 and t1, which (like m0, m2, m3) must exist in the caller's
 * scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 12, 10, 11, 3.
 */
#define LOAD_MSG_5_2(buf) \
    t0 = _mm_srli_si128(m2, 4); \
    t1 = _mm_blend_epi16(m0, m3, 0x03); \
    buf = _mm_blend_epi16(t1, t0, 0x3C);
130 
/*
 * LOAD_MSG_5_3(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN, 0 = zero fill from the byte shift):
 *   t0  = { m1[0], m0[1], m1[2], m1[3] }
 *   t1  = { m3[1], m3[2], m3[3],     0 }
 *   t2  = { m1[0], m0[1], m3[3], m1[3] }
 *   buf = { m1[0], m1[3], m3[3], m0[1] }
 * Overwrites t0, t1, t2, which (like m0, m1, m3) must exist in the caller's
 * scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 4, 7, 15, 1.
 */
#define LOAD_MSG_5_3(buf) \
    t0 = _mm_blend_epi16(m1, m0, 0x0C); \
    t1 = _mm_srli_si128(m3, 4); \
    t2 = _mm_blend_epi16(t0, t1, 0x30); \
    buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 2, 3, 0));
136 
/*
 * LOAD_MSG_5_4(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN):
 *   t0  = { m1[0], m1[1], m2[0], m2[1] }
 *   t1  = { m3[1], m3[0], m3[2], m3[0] }
 *   buf = { m3[1], m1[1], m3[2], m2[1] }
 * Overwrites t0 and t1, which (like m1..m3) must exist in the caller's scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 13, 5, 14, 9.
 */
#define LOAD_MSG_5_4(buf) \
    t0 = _mm_unpacklo_epi64(m1, m2); \
    t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(0, 2, 0, 1)); \
    buf = _mm_blend_epi16(t0, t1, 0x33);
141 
/*
 * LOAD_MSG_6_1(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN, 0 = zero fill from the byte shift):
 *   t0  = {     0,     0,     0, m1[0] }
 *   t1  = { m3[0], m0[1], m3[2], m0[3] }
 *   buf = { m3[0], m0[1], m3[2], m1[0] }
 * Overwrites t0 and t1, which (like m0, m1, m3) must exist in the caller's
 * scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 12, 1, 14, 4.
 */
#define LOAD_MSG_6_1(buf) \
    t0 = _mm_slli_si128(m1, 12); \
    t1 = _mm_blend_epi16(m0, m3, 0x33); \
    buf = _mm_blend_epi16(t1, t0, 0xC0);
146 
/*
 * LOAD_MSG_6_2(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN, 0 = zero fill from the byte shift):
 *   t0  = { m3[0], m3[1], m2[2], m3[3] }
 *   t1  = { m1[1], m1[2], m1[3],     0 }
 *   t2  = { m1[1], m3[1], m2[2], m3[3] }
 *   buf = { m1[1], m3[3], m3[1], m2[2] }
 * Overwrites t0, t1, t2, which (like m1..m3) must exist in the caller's scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 5, 15, 13, 10.
 */
#define LOAD_MSG_6_2(buf) \
    t0 = _mm_blend_epi16(m3, m2, 0x30); \
    t1 = _mm_srli_si128(m1, 4); \
    t2 = _mm_blend_epi16(t0, t1, 0x03); \
    buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 3, 0));
152 
/*
 * LOAD_MSG_6_3(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN, 0 = zero fill from the byte shift):
 *   t0    = { m0[0], m0[1], m2[0], m2[1] }
 *   t1    = { m1[1], m1[2], m1[3],     0 }
 *   blend = { m0[0], m1[2], m2[0], m2[1] }   (not stored; t2 is left untouched)
 *   buf   = { m0[0], m1[2], m2[1], m2[0] }
 * Overwrites t0 and t1, which (like m0..m2) must exist in the caller's scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 0, 6, 9, 8.
 */
#define LOAD_MSG_6_3(buf) \
    t0 = _mm_unpacklo_epi64(m0, m2); \
    t1 = _mm_srli_si128(m1, 4); \
    buf = _mm_shuffle_epi32(_mm_blend_epi16(t0, t1, 0x0C), \
                            _MM_SHUFFLE(2, 3, 1, 0));
157 
/*
 * LOAD_MSG_6_4(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN):
 *   t0  = { m1[2], m2[2], m1[3], m2[3] }
 *   t1  = { m0[2], m0[3], m1[3], m2[3] }
 *   buf = { m1[3], m0[3], m0[2], m2[3] }
 * Overwrites t0 and t1, which (like m0..m2) must exist in the caller's scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 7, 3, 2, 11.
 */
#define LOAD_MSG_6_4(buf) \
    t0 = _mm_unpackhi_epi32(m1, m2); \
    t1 = _mm_unpackhi_epi64(m0, t0); \
    buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3, 0, 1, 2));
162 
/*
 * LOAD_MSG_7_1(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN):
 *   t0  = { m0[2], m1[2], m0[3], m1[3] }
 *   t1  = { m3[0], m3[1], m0[3], m1[3] }
 *   buf = { m3[1], m1[3], m3[0], m0[3] }
 * Overwrites t0 and t1, which (like m0, m1, m3) must exist in the caller's
 * scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 13, 7, 12, 3.
 */
#define LOAD_MSG_7_1(buf) \
    t0 = _mm_unpackhi_epi32(m0, m1); \
    t1 = _mm_blend_epi16(t0, m3, 0x0F); \
    buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(2, 0, 3, 1));
167 
/*
 * LOAD_MSG_7_2(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN, 0 = zero fill from the byte shift):
 *   t0  = { m2[0], m2[1], m3[2], m2[3] }
 *   t1  = { m0[1], m0[2], m0[3],     0 }
 *   t2  = { m0[1], m2[1], m3[2], m2[3] }
 *   buf = { m2[3], m3[2], m0[1], m2[1] }
 * Overwrites t0, t1, t2, which (like m0, m2, m3) must exist in the caller's
 * scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 11, 14, 1, 9.
 */
#define LOAD_MSG_7_2(buf) \
    t0 = _mm_blend_epi16(m2, m3, 0x30); \
    t1 = _mm_srli_si128(m0, 4); \
    t2 = _mm_blend_epi16(t0, t1, 0x03); \
    buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 0, 2, 3));
173 
/*
 * LOAD_MSG_7_3(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN):
 *   t0  = { m0[2], m0[3], m3[2], m3[3] }
 *   t1  = { m1[0], m1[1], m2[0], m2[1] }
 *   t2  = { m0[2], m1[1], m2[0], m3[3] }
 *   buf = { m1[1], m3[3], m2[0], m0[2] }
 * Overwrites t0, t1, t2, which (like m0..m3) must exist in the caller's scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 5, 15, 8, 2.
 */
#define LOAD_MSG_7_3(buf) \
    t0 = _mm_unpackhi_epi64(m0, m3); \
    t1 = _mm_unpacklo_epi64(m1, m2); \
    t2 = _mm_blend_epi16(t0, t1, 0x3C); \
    buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0, 2, 3, 1));
179 
/*
 * LOAD_MSG_7_4(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN):
 *   t0  = { m0[0], m1[0], m0[1], m1[1] }
 *   t1  = { m1[2], m2[2], m1[3], m2[3] }
 *   buf = { m0[0], m1[0], m1[2], m2[2] }
 * Overwrites t0 and t1, which (like m0..m2) must exist in the caller's scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 0, 4, 6, 10.
 */
#define LOAD_MSG_7_4(buf) \
    t0 = _mm_unpacklo_epi32(m0, m1); \
    t1 = _mm_unpackhi_epi32(m1, m2); \
    buf = _mm_unpacklo_epi64(t0, t1);
184 
/*
 * LOAD_MSG_8_1(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN):
 *   t0  = { m1[2], m3[2], m1[3], m3[3] }
 *   t1  = { m1[2], m3[2], m0[0], m0[1] }
 *   t2  = { m1[2], m3[2], m0[0], m2[3] }
 *   buf = { m1[2], m3[2], m2[3], m0[0] }
 * The final _mm_shufflehi_epi16 moves 16-bit words in pairs within the upper
 * 64 bits only, which here amounts to swapping 32-bit lanes 2 and 3.
 * Overwrites t0, t1, t2, which (like m0..m3) must exist in the caller's scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 6, 14, 11, 0.
 */
#define LOAD_MSG_8_1(buf) \
    t0 = _mm_unpackhi_epi32(m1, m3); \
    t1 = _mm_unpacklo_epi64(t0, m0); \
    t2 = _mm_blend_epi16(t1, m2, 0xC0); \
    buf = _mm_shufflehi_epi16(t2, _MM_SHUFFLE(1, 0, 3, 2));
190 
/*
 * LOAD_MSG_8_2(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN):
 *   t0  = { m0[2], m3[2], m0[3], m3[3] }
 *   t1  = { m2[0], m2[1], m0[3], m3[3] }
 *   buf = { m3[3], m2[1], m0[3], m2[0] }
 * Overwrites t0 and t1, which (like m0, m2, m3) must exist in the caller's
 * scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 15, 9, 3, 8.
 */
#define LOAD_MSG_8_2(buf) \
    t0 = _mm_unpackhi_epi32(m0, m3); \
    t1 = _mm_blend_epi16(m2, t0, 0xF0); \
    buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(0, 2, 1, 3));
195 
/*
 * LOAD_MSG_8_3(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN, 0 = zero fill from the byte shift):
 *   t0  = { m2[0], m0[1], m2[2], m2[3] }
 *   t1  = {     0, m2[0], m0[1], m2[2] }
 *   buf = { m3[0], m3[1], m0[1], m2[2] }
 * Overwrites t0 and t1, which (like m0, m2, m3) must exist in the caller's
 * scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 12, 13, 1, 10.
 */
#define LOAD_MSG_8_3(buf) \
    t0 = _mm_blend_epi16(m2, m0, 0x0C); \
    t1 = _mm_slli_si128(t0, 4); \
    buf = _mm_blend_epi16(t1, m3, 0x0F);
200 
/*
 * LOAD_MSG_8_4(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN):
 *   t0  = { m1[0], m1[1], m0[2], m1[3] }
 *   buf = { m0[2], m1[3], m1[0], m1[1] }   (64-bit halves of t0 swapped)
 * Overwrites t0, which (like m0 and m1) must exist in the caller's scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 2, 7, 4, 5.
 */
#define LOAD_MSG_8_4(buf) \
    t0 = _mm_blend_epi16(m1, m0, 0x30); \
    buf = _mm_shuffle_epi32(t0, _MM_SHUFFLE(1, 0, 3, 2));
204 
/*
 * LOAD_MSG_9_1(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN):
 *   t0  = { m2[0], m0[1], m0[2], m0[3] }
 *   t1  = { m1[0], m1[1], m2[2], m1[3] }
 *   t2  = { m2[0], m0[1], m2[2], m1[3] }
 *   buf = { m2[2], m2[0], m1[3], m0[1] }
 * Overwrites t0, t1, t2, which (like m0..m2) must exist in the caller's scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 10, 8, 7, 1.
 */
#define LOAD_MSG_9_1(buf) \
    t0 = _mm_blend_epi16(m0, m2, 0x03); \
    t1 = _mm_blend_epi16(m1, m2, 0x30); \
    t2 = _mm_blend_epi16(t1, t0, 0x0F); \
    buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 3, 0, 2));
210 
/*
 * LOAD_MSG_9_2(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN, 0 = zero fill from the byte shift):
 *   t0  = {     0, m0[0], m0[1], m0[2] }
 *   t1  = { m1[0], m1[1], m1[2], m0[2] }
 *   buf = { m0[2], m1[0], m1[2], m1[1] }
 * Overwrites t0 and t1, which (like m0 and m1) must exist in the caller's
 * scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 2, 4, 6, 5.
 */
#define LOAD_MSG_9_2(buf) \
    t0 = _mm_slli_si128(m0, 4); \
    t1 = _mm_blend_epi16(m1, t0, 0xC0); \
    buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(1, 2, 0, 3));
215 
/*
 * LOAD_MSG_9_3(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN):
 *   t0  = { m0[2], m3[2], m0[3], m3[3] }
 *   t1  = { m2[0], m3[0], m2[1], m3[1] }
 *   t2  = { m0[3], m3[3], m2[1], m3[1] }
 *   buf = { m3[3], m2[1], m0[3], m3[1] }
 * Overwrites t0, t1, t2, which (like m0, m2, m3) must exist in the caller's
 * scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 15, 9, 3, 13.
 */
#define LOAD_MSG_9_3(buf) \
    t0 = _mm_unpackhi_epi32(m0, m3); \
    t1 = _mm_unpacklo_epi32(m2, m3); \
    t2 = _mm_unpackhi_epi64(t0, t1); \
    buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 0, 2, 1));
221 
/*
 * LOAD_MSG_9_4(buf): gathers four 32-bit lanes into buf (lane 0 listed first;
 * mN[i] = 32-bit lane i of mN):
 *   t0  = { m3[0], m3[1], m3[2], m2[3] }
 *   t1  = { m0[0], m3[0], m0[1], m3[1] }
 *   t2  = { m0[0], m3[0], m3[2], m2[3] }
 *   buf = { m2[3], m3[2], m3[0], m0[0] }   (t2 with its lanes reversed)
 * Overwrites t0, t1, t2, which (like m0, m2, m3) must exist in the caller's
 * scope.
 * NOTE(review): if m0..m3 hold message words 0-3, 4-7, 8-11, 12-15 (set up
 * elsewhere; confirm), buf = words 11, 14, 12, 0.
 */
#define LOAD_MSG_9_4(buf) \
    t0 = _mm_blend_epi16(m3, m2, 0xC0); \
    t1 = _mm_unpacklo_epi32(m0, m3); \
    t2 = _mm_blend_epi16(t0, t1, 0x0F); \
    buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0, 1, 2, 3));
227 
228 #endif
229 
230