/*
   BLAKE2 reference source code package - optimized C implementations

   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>

   To the extent possible under law, the author(s) have dedicated all copyright
   and related and neighboring rights to this software to the public domain
   worldwide. This software is distributed without any warranty.

   You should have received a copy of the CC0 Public Domain Dedication along
   with this software. If not, see
   <http://creativecommons.org/publicdomain/zero/1.0/>.
*/

#ifndef blake2b_load_sse41_H
#define blake2b_load_sse41_H

/*
 * Message-load macros, one per (round, step) pair.
 *
 * LOAD_MSG_<r>_<i>(b0, b1) assigns two 128-bit values to b0 and b1. The
 * values are assembled from the variables m0..m7, which are NOT macro
 * parameters: they must be visible, as __m128i, wherever a macro is expanded.
 * <r> appears to be the round number (0..11) and <i> the load step within
 * the round (1..4) -- confirm against the round macro that pastes these names.
 *
 * This header includes nothing itself. The including translation unit must
 * already have declared the intrinsics used below:
 *   _mm_unpacklo_epi64, _mm_unpackhi_epi64, _mm_shuffle_epi32  (SSE2)
 *   _mm_alignr_epi8                                            (SSSE3)
 *   _mm_blend_epi16                                            (SSE4.1)
 *
 * Reading aid, writing a register as (low 64 bits, high 64 bits):
 *   _mm_unpacklo_epi64(a, b)                      -> (a.low,  b.low)
 *   _mm_unpackhi_epi64(a, b)                      -> (a.high, b.high)
 *   _mm_alignr_epi8(a, b, 8)                      -> (b.high, a.low)
 *   _mm_blend_epi16(a, b, 0xF0)                   -> (a.low,  b.high)
 *   _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)) -> (a.high, a.low)
 *
 * If m<k> holds message words 2k (low) and 2k+1 (high), each macro therefore
 * yields four message words in the order b0.low, b0.high, b1.low, b1.high.
 *
 * Every macro expands to a single do { ... } while (0) statement, so the
 * caller supplies the terminating semicolon.
 */

/* ---- round 0 ---- */

#define LOAD_MSG_0_1(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m0, m1); \
        b1 = _mm_unpacklo_epi64(m2, m3); \
    } while (0)

#define LOAD_MSG_0_2(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m0, m1); \
        b1 = _mm_unpackhi_epi64(m2, m3); \
    } while (0)

#define LOAD_MSG_0_3(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m4, m5); \
        b1 = _mm_unpacklo_epi64(m6, m7); \
    } while (0)

#define LOAD_MSG_0_4(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m4, m5); \
        b1 = _mm_unpackhi_epi64(m6, m7); \
    } while (0)

/* ---- round 1 ---- */

#define LOAD_MSG_1_1(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m7, m2); \
        b1 = _mm_unpackhi_epi64(m4, m6); \
    } while (0)

#define LOAD_MSG_1_2(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m5, m4); \
        b1 = _mm_alignr_epi8(m3, m7, 8); \
    } while (0)

#define LOAD_MSG_1_3(b0, b1) \
    do { \
        b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
        b1 = _mm_unpackhi_epi64(m5, m2); \
    } while (0)

#define LOAD_MSG_1_4(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m6, m1); \
        b1 = _mm_unpackhi_epi64(m3, m1); \
    } while (0)

/* ---- round 2 ---- */

#define LOAD_MSG_2_1(b0, b1) \
    do { \
        b0 = _mm_alignr_epi8(m6, m5, 8); \
        b1 = _mm_unpackhi_epi64(m2, m7); \
    } while (0)

#define LOAD_MSG_2_2(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m4, m0); \
        b1 = _mm_blend_epi16(m1, m6, 0xF0); \
    } while (0)

#define LOAD_MSG_2_3(b0, b1) \
    do { \
        b0 = _mm_blend_epi16(m5, m1, 0xF0); \
        b1 = _mm_unpackhi_epi64(m3, m4); \
    } while (0)

#define LOAD_MSG_2_4(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m7, m3); \
        b1 = _mm_alignr_epi8(m2, m0, 8); \
    } while (0)

/* ---- round 3 ---- */

#define LOAD_MSG_3_1(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m3, m1); \
        b1 = _mm_unpackhi_epi64(m6, m5); \
    } while (0)

#define LOAD_MSG_3_2(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m4, m0); \
        b1 = _mm_unpacklo_epi64(m6, m7); \
    } while (0)

#define LOAD_MSG_3_3(b0, b1) \
    do { \
        b0 = _mm_blend_epi16(m1, m2, 0xF0); \
        b1 = _mm_blend_epi16(m2, m7, 0xF0); \
    } while (0)

#define LOAD_MSG_3_4(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m3, m5); \
        b1 = _mm_unpacklo_epi64(m0, m4); \
    } while (0)

/* ---- round 4 ---- */

#define LOAD_MSG_4_1(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m4, m2); \
        b1 = _mm_unpacklo_epi64(m1, m5); \
    } while (0)

#define LOAD_MSG_4_2(b0, b1) \
    do { \
        b0 = _mm_blend_epi16(m0, m3, 0xF0); \
        b1 = _mm_blend_epi16(m2, m7, 0xF0); \
    } while (0)

#define LOAD_MSG_4_3(b0, b1) \
    do { \
        b0 = _mm_blend_epi16(m7, m5, 0xF0); \
        b1 = _mm_blend_epi16(m3, m1, 0xF0); \
    } while (0)

#define LOAD_MSG_4_4(b0, b1) \
    do { \
        b0 = _mm_alignr_epi8(m6, m0, 8); \
        b1 = _mm_blend_epi16(m4, m6, 0xF0); \
    } while (0)

/* ---- round 5 ---- */

#define LOAD_MSG_5_1(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m1, m3); \
        b1 = _mm_unpacklo_epi64(m0, m4); \
    } while (0)

#define LOAD_MSG_5_2(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m6, m5); \
        b1 = _mm_unpackhi_epi64(m5, m1); \
    } while (0)

#define LOAD_MSG_5_3(b0, b1) \
    do { \
        b0 = _mm_blend_epi16(m2, m3, 0xF0); \
        b1 = _mm_unpackhi_epi64(m7, m0); \
    } while (0)

#define LOAD_MSG_5_4(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m6, m2); \
        b1 = _mm_blend_epi16(m7, m4, 0xF0); \
    } while (0)

/* ---- round 6 ---- */

#define LOAD_MSG_6_1(b0, b1) \
    do { \
        b0 = _mm_blend_epi16(m6, m0, 0xF0); \
        b1 = _mm_unpacklo_epi64(m7, m2); \
    } while (0)

#define LOAD_MSG_6_2(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m2, m7); \
        b1 = _mm_alignr_epi8(m5, m6, 8); \
    } while (0)

#define LOAD_MSG_6_3(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m0, m3); \
        b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1, 0, 3, 2)); \
    } while (0)

#define LOAD_MSG_6_4(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m3, m1); \
        b1 = _mm_blend_epi16(m1, m5, 0xF0); \
    } while (0)

/* ---- round 7 ---- */

#define LOAD_MSG_7_1(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m6, m3); \
        b1 = _mm_blend_epi16(m6, m1, 0xF0); \
    } while (0)

#define LOAD_MSG_7_2(b0, b1) \
    do { \
        b0 = _mm_alignr_epi8(m7, m5, 8); \
        b1 = _mm_unpackhi_epi64(m0, m4); \
    } while (0)

#define LOAD_MSG_7_3(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m2, m7); \
        b1 = _mm_unpacklo_epi64(m4, m1); \
    } while (0)

#define LOAD_MSG_7_4(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m0, m2); \
        b1 = _mm_unpacklo_epi64(m3, m5); \
    } while (0)

/* ---- round 8 ---- */

#define LOAD_MSG_8_1(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m3, m7); \
        b1 = _mm_alignr_epi8(m0, m5, 8); \
    } while (0)

#define LOAD_MSG_8_2(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m7, m4); \
        b1 = _mm_alignr_epi8(m4, m1, 8); \
    } while (0)

/* b0 is m6 unchanged: no shuffle is needed for this pair. */
#define LOAD_MSG_8_3(b0, b1) \
    do { \
        b0 = m6; \
        b1 = _mm_alignr_epi8(m5, m0, 8); \
    } while (0)

/* b1 is m2 unchanged: no shuffle is needed for this pair. */
#define LOAD_MSG_8_4(b0, b1) \
    do { \
        b0 = _mm_blend_epi16(m1, m3, 0xF0); \
        b1 = m2; \
    } while (0)

/* ---- round 9 ---- */

#define LOAD_MSG_9_1(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m5, m4); \
        b1 = _mm_unpackhi_epi64(m3, m0); \
    } while (0)

#define LOAD_MSG_9_2(b0, b1) \
    do { \
        b0 = _mm_unpacklo_epi64(m1, m2); \
        b1 = _mm_blend_epi16(m3, m2, 0xF0); \
    } while (0)

#define LOAD_MSG_9_3(b0, b1) \
    do { \
        b0 = _mm_unpackhi_epi64(m7, m4); \
        b1 = _mm_unpackhi_epi64(m1, m6); \
    } while (0)

#define LOAD_MSG_9_4(b0, b1) \
    do { \
        b0 = _mm_alignr_epi8(m7, m5, 8); \
        b1 = _mm_unpacklo_epi64(m6, m0); \
    } while (0)

/*
 * ---- rounds 10 and 11 ----
 *
 * These load exactly the same values as rounds 0 and 1 respectively, so they
 * are defined as aliases instead of repeating the bodies. The expansion is
 * identical to spelling each body out again.
 */

#define LOAD_MSG_10_1(b0, b1) LOAD_MSG_0_1(b0, b1)
#define LOAD_MSG_10_2(b0, b1) LOAD_MSG_0_2(b0, b1)
#define LOAD_MSG_10_3(b0, b1) LOAD_MSG_0_3(b0, b1)
#define LOAD_MSG_10_4(b0, b1) LOAD_MSG_0_4(b0, b1)

#define LOAD_MSG_11_1(b0, b1) LOAD_MSG_1_1(b0, b1)
#define LOAD_MSG_11_2(b0, b1) LOAD_MSG_1_2(b0, b1)
#define LOAD_MSG_11_3(b0, b1) LOAD_MSG_1_3(b0, b1)
#define LOAD_MSG_11_4(b0, b1) LOAD_MSG_1_4(b0, b1)

#endif