/*
   BLAKE2 reference source code package - optimized C implementations

   Copyright 2012, Samuel Neves <sneves@dei.uc.pt>.  You may use this under the
   terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
   your option.  The terms of these licenses can be found at:

   - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
   - OpenSSL license   : https://www.openssl.org/source/license.html
   - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0

   More information about the BLAKE2 hash function can be found at
   https://blake2.net.
*/
#pragma once
#ifndef __BLAKE2S_LOAD_XOP_H__
#define __BLAKE2S_LOAD_XOP_H__

/*
   TOB(x): turn a 32-bit word index x into a 32-bit byte-selector constant.

   (x)*4*0x01010101 replicates the byte offset 4*x into all four bytes, and
   adding 0x03020100 offsets them by 0,1,2,3.  The result therefore holds the
   four consecutive byte selectors 4x, 4x+1, 4x+2, 4x+3 (least significant
   byte first), e.g. TOB(1) == 0x07060504.  For x in 0..7 every selector byte
   lies in 0..31, i.e. it addresses one byte of a 32-byte (two-vector) source.
*/
#define TOB(x) ((x)*4*0x01010101 + 0x03020100) /* ..or not TOB */

#if 0
/* Basic VPPERM emulation, for testing purposes.

   Selector bytes below 16 take the byte from src1, selector bytes >= 16 take
   byte (sel - 16) from src2.  Only the selector range 0..31 (everything TOB()
   produces for x in 0..7) is handled; this block is compiled out. */
static __m128i _mm_perm_epi8(const __m128i src1, const __m128i src2, const __m128i sel)
{
   const __m128i sixteen = _mm_set1_epi8(16);
   const __m128i t0 = _mm_shuffle_epi8(src1, sel);
   const __m128i s1 = _mm_shuffle_epi8(src2, _mm_sub_epi8(sel, sixteen));
   const __m128i mask = _mm_or_si128(_mm_cmpeq_epi8(sel, sixteen),
                                     _mm_cmpgt_epi8(sel, sixteen)); /* (>=16) = 0xff : 00 */
   return _mm_blendv_epi8(t0, s1, mask);
}
#endif

/*
   LOAD_MSG_<r>_<i>(buf): assemble one 128-bit vector of four 32-bit words
   into `buf` using _mm_perm_epi8.

   Conventions shared by every macro below:
   - They are not self-contained: m0, m1, m2, m3 (inputs) and the scratch
     variables t0 / t1 must already be declared at the expansion site.
     NOTE(review): presumably m0..m3 are the four vectors holding the 16
     message words and <r> is the round number - confirm against the caller.
   - In _mm_perm_epi8(a, b, sel), word indices 0..3 refer to `a` and 4..7 to
     `b` (see TOB above).
   - _mm_set_epi32 takes its arguments from the most significant lane down to
     the least significant one, so the selector lists read right-to-left:
     the last TOB() is output lane 0.
   - When a result needs words from more than two vectors, the macros chain
     permutes through t0 / t1.  Lanes that a later permute overwrites get a
     placeholder selector (TOB(0)) in the earlier step.
   - Each macro expands to complete statements including the final `;`.
     Multi-statement macros are not wrapped in do { } while (0), so they must
     not be used as the unbraced body of an if / for / while.
*/

#define LOAD_MSG_0_1(buf) \
buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(6),TOB(4),TOB(2),TOB(0)) );

#define LOAD_MSG_0_2(buf) \
buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(7),TOB(5),TOB(3),TOB(1)) );

#define LOAD_MSG_0_3(buf) \
buf = _mm_perm_epi8(m2, m3, _mm_set_epi32(TOB(6),TOB(4),TOB(2),TOB(0)) );

#define LOAD_MSG_0_4(buf) \
buf = _mm_perm_epi8(m2, m3, _mm_set_epi32(TOB(7),TOB(5),TOB(3),TOB(1)) );

#define LOAD_MSG_1_1(buf) \
t0 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(0),TOB(5),TOB(0),TOB(0)) ); \
buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(5),TOB(2),TOB(1),TOB(6)) );

#define LOAD_MSG_1_2(buf) \
t1 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(2),TOB(0),TOB(4),TOB(6)) ); \
buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(7),TOB(1),TOB(0)) );

#define LOAD_MSG_1_3(buf) \
t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(5),TOB(0),TOB(0),TOB(1)) ); \
buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(7),TOB(1),TOB(0)) );

#define LOAD_MSG_1_4(buf) \
t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(3),TOB(7),TOB(2),TOB(0)) ); \
buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(4)) );

#define LOAD_MSG_2_1(buf) \
t0 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(0),TOB(1),TOB(0),TOB(7)) ); \
buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(7),TOB(2),TOB(4),TOB(0)) );

#define LOAD_MSG_2_2(buf) \
t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(2),TOB(0),TOB(4)) ); \
buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(5),TOB(2),TOB(1),TOB(0)) );

#define LOAD_MSG_2_3(buf) \
t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(7),TOB(3),TOB(0)) ); \
buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(5),TOB(2),TOB(1),TOB(6)) );

#define LOAD_MSG_2_4(buf) \
t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(4),TOB(1),TOB(6),TOB(0)) ); \
buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(6)) );

#define LOAD_MSG_3_1(buf) \
t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(3),TOB(7)) ); \
t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) ); \
buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(5),TOB(1),TOB(0)) );

#define LOAD_MSG_3_2(buf) \
t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(0),TOB(1),TOB(5)) ); \
buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(6),TOB(4),TOB(1),TOB(0)) );

#define LOAD_MSG_3_3(buf) \
t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(4),TOB(5),TOB(2)) ); \
buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) );

#define LOAD_MSG_3_4(buf) \
t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(0),TOB(6)) ); \
buf = _mm_perm_epi8(t1, m2, _mm_set_epi32(TOB(4),TOB(2),TOB(6),TOB(0)) );

#define LOAD_MSG_4_1(buf) \
t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(2),TOB(5),TOB(0)) ); \
buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(6),TOB(2),TOB(1),TOB(5)) );

#define LOAD_MSG_4_2(buf) \
t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(4),TOB(7),TOB(0)) ); \
buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) );

#define LOAD_MSG_4_3(buf) \
t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(3),TOB(6),TOB(0),TOB(0)) ); \
t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(2),TOB(7),TOB(0)) ); \
buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(6)) );

#define LOAD_MSG_4_4(buf) \
t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(4),TOB(0),TOB(1)) ); \
buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(5),TOB(2),TOB(4),TOB(0)) );

#define LOAD_MSG_5_1(buf) \
t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(6),TOB(2)) ); \
buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(4),TOB(2),TOB(1),TOB(0)) );

#define LOAD_MSG_5_2(buf) \
t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(3),TOB(7),TOB(6),TOB(0)) ); \
buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(4)) );

#define LOAD_MSG_5_3(buf) \
t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(1),TOB(0),TOB(7),TOB(4)) ); \
buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(7),TOB(1),TOB(0)) );

#define LOAD_MSG_5_4(buf) \
t1 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(5),TOB(0),TOB(1),TOB(0)) ); \
buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(6),TOB(1),TOB(5)) );

#define LOAD_MSG_6_1(buf) \
t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(4),TOB(0),TOB(1),TOB(0)) ); \
buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(6),TOB(1),TOB(4)) );

#define LOAD_MSG_6_2(buf) \
t1 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(6),TOB(0),TOB(0),TOB(1)) ); \
buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(5),TOB(7),TOB(0)) );

#define LOAD_MSG_6_3(buf) \
t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(6),TOB(0)) ); \
buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(4),TOB(5),TOB(1),TOB(0)) );

#define LOAD_MSG_6_4(buf) \
t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(2),TOB(3),TOB(7)) ); \
buf = _mm_perm_epi8(t1, m2, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) );

#define LOAD_MSG_7_1(buf) \
t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(3),TOB(0),TOB(7),TOB(0)) ); \
buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(4),TOB(1),TOB(5)) );

#define LOAD_MSG_7_2(buf) \
t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(5),TOB(1),TOB(0),TOB(7)) ); \
buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(6),TOB(0)) );

#define LOAD_MSG_7_3(buf) \
t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(2),TOB(0),TOB(0),TOB(5)) ); \
t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(4),TOB(1),TOB(0)) ); \
buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(7),TOB(0)) );

#define LOAD_MSG_7_4(buf) \
t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(6),TOB(4),TOB(0)) ); \
buf = _mm_perm_epi8(t1, m2, _mm_set_epi32(TOB(6),TOB(2),TOB(1),TOB(0)) );

#define LOAD_MSG_8_1(buf) \
t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(0),TOB(6)) ); \
t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(7),TOB(1),TOB(0)) ); \
buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(6),TOB(0)) );

#define LOAD_MSG_8_2(buf) \
t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(4),TOB(3),TOB(5),TOB(0)) ); \
buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(7)) );

/* No line continuation after the last statement: a trailing backslash here
   would splice whatever line follows into this macro's replacement list. */
#define LOAD_MSG_8_3(buf) \
t0 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(6),TOB(1),TOB(0),TOB(0)) ); \
buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(5),TOB(4)) );

#define LOAD_MSG_8_4(buf) \
buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(5),TOB(4),TOB(7),TOB(2)) );

#define LOAD_MSG_9_1(buf) \
t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(1),TOB(7),TOB(0),TOB(0)) ); \
buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(2),TOB(4),TOB(6)) );

#define LOAD_MSG_9_2(buf) \
buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(5),TOB(6),TOB(4),TOB(2)) );

#define LOAD_MSG_9_3(buf) \
t0 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(3),TOB(5),TOB(0)) ); \
buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(5),TOB(2),TOB(1),TOB(7)) );

#define LOAD_MSG_9_4(buf) \
t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(0),TOB(0),TOB(7)) ); \
buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(4),TOB(6),TOB(0)) );

#endif
