/*
   BLAKE2 reference source code package - optimized C implementations

   Copyright 2012, Samuel Neves <sneves@dei.uc.pt>.  You may use this under the
   terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
   your option.  The terms of these licenses can be found at:

   - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
   - OpenSSL license   : https://www.openssl.org/source/license.html
   - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0

   More information about the BLAKE2 hash function can be found at
   https://blake2.net.
*/
#ifndef BLAKE2B_LOAD_SSE41_H
#define BLAKE2B_LOAD_SSE41_H

/*
   Message-load macros: LOAD_MSG_<r>_<k>(b0, b1), r = 0..11, k = 1..4.

   Every macro expands to one `do { ... } while (0)` statement (the caller
   supplies the trailing semicolon) that assigns first b0 and then b1.  The
   operands m0..m7 are NOT macro parameters; they are picked up from the scope
   in which the macro is expanded.  This header includes nothing itself, so
   the including file must already have made the intrinsics used below
   visible.

   Lanes produced by each form, written {low 64 bits, high 64 bits}:

     _mm_unpacklo_epi64(x, y)                    -> { x.lo, y.lo }
     _mm_unpackhi_epi64(x, y)                    -> { x.hi, y.hi }
     _mm_alignr_epi8(x, y, 8)                    -> { y.hi, x.lo }
     _mm_blend_epi16(x, y, 0xF0)                 -> { x.lo, y.hi }
     _mm_shuffle_epi32(x, _MM_SHUFFLE(1,0,3,2))  -> { x.hi, x.lo }

   The "words" note above each macro lists the resulting lanes as 64-bit
   message-word indices.  NOTE(review): those indices assume that m<i> holds
   word 2i in its low lane and word 2i+1 in its high lane, and <r> is
   presumably the round number - confirm both against the code that loads
   m0..m7 and expands these macros.
*/

/* ---- r = 0 ---- */

/* words: b0 = {0, 2}, b1 = {4, 6} */
#define LOAD_MSG_0_1(b0, b1) \
  do { \
    b0 = _mm_unpacklo_epi64(m0, m1); \
    b1 = _mm_unpacklo_epi64(m2, m3); \
  } while (0)

/* words: b0 = {1, 3}, b1 = {5, 7} */
#define LOAD_MSG_0_2(b0, b1) \
  do { \
    b0 = _mm_unpackhi_epi64(m0, m1); \
    b1 = _mm_unpackhi_epi64(m2, m3); \
  } while (0)

/* words: b0 = {8, 10}, b1 = {12, 14} */
#define LOAD_MSG_0_3(b0, b1) \
  do { \
    b0 = _mm_unpacklo_epi64(m4, m5); \
    b1 = _mm_unpacklo_epi64(m6, m7); \
  } while (0)

/* words: b0 = {9, 11}, b1 = {13, 15} */
#define LOAD_MSG_0_4(b0, b1) \
  do { \
    b0 = _mm_unpackhi_epi64(m4, m5); \
    b1 = _mm_unpackhi_epi64(m6, m7); \
  } while (0)

/* ---- r = 1 ---- */

/* words: b0 = {14, 4}, b1 = {9, 13} */
#define LOAD_MSG_1_1(b0, b1) \
  do { \
    b0 = _mm_unpacklo_epi64(m7, m2); \
    b1 = _mm_unpackhi_epi64(m4, m6); \
  } while (0)

/* words: b0 = {10, 8}, b1 = {15, 6} */
#define LOAD_MSG_1_2(b0, b1) \
  do { \
    b0 = _mm_unpacklo_epi64(m5, m4); \
    b1 = _mm_alignr_epi8(m3, m7, 8); \
  } while (0)

/* words: b0 = {1, 0}, b1 = {11, 5} */
#define LOAD_MSG_1_3(b0, b1) \
  do { \
    b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
    b1 = _mm_unpackhi_epi64(m5, m2); \
  } while (0)

/* words: b0 = {12, 2}, b1 = {7, 3} */
#define LOAD_MSG_1_4(b0, b1) \
  do { \
    b0 = _mm_unpacklo_epi64(m6, m1); \
    b1 = _mm_unpackhi_epi64(m3, m1); \
  } while (0)

/* ---- r = 2 ---- */

/* words: b0 = {11, 12}, b1 = {5, 15} */
#define LOAD_MSG_2_1(b0, b1) \
  do { \
    b0 = _mm_alignr_epi8(m6, m5, 8); \
    b1 = _mm_unpackhi_epi64(m2, m7); \
  } while (0)

/* words: b0 = {8, 0}, b1 = {2, 13} */
#define LOAD_MSG_2_2(b0, b1) \
  do { \
    b0 = _mm_unpacklo_epi64(m4, m0); \
    b1 = _mm_blend_epi16(m1, m6, 0xF0); \
  } while (0)

/* words: b0 = {10, 3}, b1 = {7, 9} */
#define LOAD_MSG_2_3(b0, b1) \
  do { \
    b0 = _mm_blend_epi16(m5, m1, 0xF0); \
    b1 = _mm_unpackhi_epi64(m3, m4); \
  } while (0)

/* words: b0 = {14, 6}, b1 = {1, 4} */
#define LOAD_MSG_2_4(b0, b1) \
  do { \
    b0 = _mm_unpacklo_epi64(m7, m3); \
    b1 = _mm_alignr_epi8(m2, m0, 8); \
  } while (0)

/* ---- r = 3 ---- */

/* words: b0 = {7, 3}, b1 = {13, 11} */
#define LOAD_MSG_3_1(b0, b1) \
  do { \
    b0 = _mm_unpackhi_epi64(m3, m1); \
    b1 = _mm_unpackhi_epi64(m6, m5); \
  } while (0)

/* words: b0 = {9, 1}, b1 = {12, 14} */
#define LOAD_MSG_3_2(b0, b1) \
  do { \
    b0 = _mm_unpackhi_epi64(m4, m0); \
    b1 = _mm_unpacklo_epi64(m6, m7); \
  } while (0)

/* words: b0 = {2, 5}, b1 = {4, 15} */
#define LOAD_MSG_3_3(b0, b1) \
  do { \
    b0 = _mm_blend_epi16(m1, m2, 0xF0); \
    b1 = _mm_blend_epi16(m2, m7, 0xF0); \
  } while (0)

/* words: b0 = {6, 10}, b1 = {0, 8} */
#define LOAD_MSG_3_4(b0, b1) \
  do { \
    b0 = _mm_unpacklo_epi64(m3, m5); \
    b1 = _mm_unpacklo_epi64(m0, m4); \
  } while (0)

/* ---- r = 4 ---- */

/* words: b0 = {9, 5}, b1 = {2, 10} */
#define LOAD_MSG_4_1(b0, b1) \
  do { \
    b0 = _mm_unpackhi_epi64(m4, m2); \
    b1 = _mm_unpacklo_epi64(m1, m5); \
  } while (0)

/* words: b0 = {0, 7}, b1 = {4, 15} */
#define LOAD_MSG_4_2(b0, b1) \
  do { \
    b0 = _mm_blend_epi16(m0, m3, 0xF0); \
    b1 = _mm_blend_epi16(m2, m7, 0xF0); \
  } while (0)

/* words: b0 = {14, 11}, b1 = {6, 3} */
#define LOAD_MSG_4_3(b0, b1) \
  do { \
    b0 = _mm_blend_epi16(m7, m5, 0xF0); \
    b1 = _mm_blend_epi16(m3, m1, 0xF0); \
  } while (0)

/* words: b0 = {1, 12}, b1 = {8, 13} */
#define LOAD_MSG_4_4(b0, b1) \
  do { \
    b0 = _mm_alignr_epi8(m6, m0, 8); \
    b1 = _mm_blend_epi16(m4, m6, 0xF0); \
  } while (0)

/* ---- r = 5 ---- */

/* words: b0 = {2, 6}, b1 = {0, 8} */
#define LOAD_MSG_5_1(b0, b1) \
  do { \
    b0 = _mm_unpacklo_epi64(m1, m3); \
    b1 = _mm_unpacklo_epi64(m0, m4); \
  } while (0)

/* words: b0 = {12, 10}, b1 = {11, 3} */
#define LOAD_MSG_5_2(b0, b1) \
  do { \
    b0 = _mm_unpacklo_epi64(m6, m5); \
    b1 = _mm_unpackhi_epi64(m5, m1); \
  } while (0)

/* words: b0 = {4, 7}, b1 = {15, 1} */
#define LOAD_MSG_5_3(b0, b1) \
  do { \
    b0 = _mm_blend_epi16(m2, m3, 0xF0); \
    b1 = _mm_unpackhi_epi64(m7, m0); \
  } while (0)

/* words: b0 = {13, 5}, b1 = {14, 9} */
#define LOAD_MSG_5_4(b0, b1) \
  do { \
    b0 = _mm_unpackhi_epi64(m6, m2); \
    b1 = _mm_blend_epi16(m7, m4, 0xF0); \
  } while (0)

/* ---- r = 6 ---- */

/* words: b0 = {12, 1}, b1 = {14, 4} */
#define LOAD_MSG_6_1(b0, b1) \
  do { \
    b0 = _mm_blend_epi16(m6, m0, 0xF0); \
    b1 = _mm_unpacklo_epi64(m7, m2); \
  } while (0)

/* words: b0 = {5, 15}, b1 = {13, 10} */
#define LOAD_MSG_6_2(b0, b1) \
  do { \
    b0 = _mm_unpackhi_epi64(m2, m7); \
    b1 = _mm_alignr_epi8(m5, m6, 8); \
  } while (0)

/* words: b0 = {0, 6}, b1 = {9, 8} */
#define LOAD_MSG_6_3(b0, b1) \
  do { \
    b0 = _mm_unpacklo_epi64(m0, m3); \
    b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \
  } while (0)

/* words: b0 = {7, 3}, b1 = {2, 11} */
#define LOAD_MSG_6_4(b0, b1) \
  do { \
    b0 = _mm_unpackhi_epi64(m3, m1); \
    b1 = _mm_blend_epi16(m1, m5, 0xF0); \
  } while (0)

/* ---- r = 7 ---- */

/* words: b0 = {13, 7}, b1 = {12, 3} */
#define LOAD_MSG_7_1(b0, b1) \
  do { \
    b0 = _mm_unpackhi_epi64(m6, m3); \
    b1 = _mm_blend_epi16(m6, m1, 0xF0); \
  } while (0)

/* words: b0 = {11, 14}, b1 = {1, 9} */
#define LOAD_MSG_7_2(b0, b1) \
  do { \
    b0 = _mm_alignr_epi8(m7, m5, 8); \
    b1 = _mm_unpackhi_epi64(m0, m4); \
  } while (0)

/* words: b0 = {5, 15}, b1 = {8, 2} */
#define LOAD_MSG_7_3(b0, b1) \
  do { \
    b0 = _mm_unpackhi_epi64(m2, m7); \
    b1 = _mm_unpacklo_epi64(m4, m1); \
  } while (0)

/* words: b0 = {0, 4}, b1 = {6, 10} */
#define LOAD_MSG_7_4(b0, b1) \
  do { \
    b0 = _mm_unpacklo_epi64(m0, m2); \
    b1 = _mm_unpacklo_epi64(m3, m5); \
  } while (0)

/* ---- r = 8 ---- */

/* words: b0 = {6, 14}, b1 = {11, 0} */
#define LOAD_MSG_8_1(b0, b1) \
  do { \
    b0 = _mm_unpacklo_epi64(m3, m7); \
    b1 = _mm_alignr_epi8(m0, m5, 8); \
  } while (0)

/* words: b0 = {15, 9}, b1 = {3, 8} */
#define LOAD_MSG_8_2(b0, b1) \
  do { \
    b0 = _mm_unpackhi_epi64(m7, m4); \
    b1 = _mm_alignr_epi8(m4, m1, 8); \
  } while (0)

/* words: b0 = {12, 13} (m6 taken as-is), b1 = {1, 10} */
#define LOAD_MSG_8_3(b0, b1) \
  do { \
    b0 = m6; \
    b1 = _mm_alignr_epi8(m5, m0, 8); \
  } while (0)

/* words: b0 = {2, 7}, b1 = {4, 5} (m2 taken as-is) */
#define LOAD_MSG_8_4(b0, b1) \
  do { \
    b0 = _mm_blend_epi16(m1, m3, 0xF0); \
    b1 = m2; \
  } while (0)

/* ---- r = 9 ---- */

/* words: b0 = {10, 8}, b1 = {7, 1} */
#define LOAD_MSG_9_1(b0, b1) \
  do { \
    b0 = _mm_unpacklo_epi64(m5, m4); \
    b1 = _mm_unpackhi_epi64(m3, m0); \
  } while (0)

/* words: b0 = {2, 4}, b1 = {6, 5} */
#define LOAD_MSG_9_2(b0, b1) \
  do { \
    b0 = _mm_unpacklo_epi64(m1, m2); \
    b1 = _mm_blend_epi16(m3, m2, 0xF0); \
  } while (0)

/* words: b0 = {15, 9}, b1 = {3, 13} */
#define LOAD_MSG_9_3(b0, b1) \
  do { \
    b0 = _mm_unpackhi_epi64(m7, m4); \
    b1 = _mm_unpackhi_epi64(m1, m6); \
  } while (0)

/* words: b0 = {11, 14}, b1 = {12, 0} */
#define LOAD_MSG_9_4(b0, b1) \
  do { \
    b0 = _mm_alignr_epi8(m7, m5, 8); \
    b1 = _mm_unpacklo_epi64(m6, m0); \
  } while (0)

/* ---- r = 10, 11 ----
   These perform exactly the same loads as r = 0 and r = 1 respectively, so
   they simply forward to those macros and expand to the same statements. */

#define LOAD_MSG_10_1(b0, b1) LOAD_MSG_0_1(b0, b1)
#define LOAD_MSG_10_2(b0, b1) LOAD_MSG_0_2(b0, b1)
#define LOAD_MSG_10_3(b0, b1) LOAD_MSG_0_3(b0, b1)
#define LOAD_MSG_10_4(b0, b1) LOAD_MSG_0_4(b0, b1)

#define LOAD_MSG_11_1(b0, b1) LOAD_MSG_1_1(b0, b1)
#define LOAD_MSG_11_2(b0, b1) LOAD_MSG_1_2(b0, b1)
#define LOAD_MSG_11_3(b0, b1) LOAD_MSG_1_3(b0, b1)
#define LOAD_MSG_11_4(b0, b1) LOAD_MSG_1_4(b0, b1)

#endif /* BLAKE2B_LOAD_SSE41_H */