1 /* 2 BLAKE2 reference source code package - optimized C implementations 3 4 Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the 5 terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at 6 your option. The terms of these licenses can be found at: 7 8 - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 9 - OpenSSL license : https://www.openssl.org/source/license.html 10 - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 11 12 More information about the BLAKE2 hash function can be found at 13 https://blake2.net. 14 */ 15 #ifndef BLAKE2S_LOAD_XOP_H 16 #define BLAKE2S_LOAD_XOP_H 17 18 #define TOB(x) ((x)*4*0x01010101 + 0x03020100) /* ..or not TOB */ 19 20 #if 0 21 /* Basic VPPERM emulation, for testing purposes */ 22 static __m128i _mm_perm_epi8(const __m128i src1, const __m128i src2, const __m128i sel) 23 { 24 const __m128i sixteen = _mm_set1_epi8(16); 25 const __m128i t0 = _mm_shuffle_epi8(src1, sel); 26 const __m128i s1 = _mm_shuffle_epi8(src2, _mm_sub_epi8(sel, sixteen)); 27 const __m128i mask = _mm_or_si128(_mm_cmpeq_epi8(sel, sixteen), 28 _mm_cmpgt_epi8(sel, sixteen)); /* (>=16) = 0xff : 00 */ 29 return _mm_blendv_epi8(t0, s1, mask); 30 } 31 #endif 32 33 #define LOAD_MSG_0_1(buf) \ 34 buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(6),TOB(4),TOB(2),TOB(0)) ); 35 36 #define LOAD_MSG_0_2(buf) \ 37 buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(7),TOB(5),TOB(3),TOB(1)) ); 38 39 #define LOAD_MSG_0_3(buf) \ 40 buf = _mm_perm_epi8(m2, m3, _mm_set_epi32(TOB(6),TOB(4),TOB(2),TOB(0)) ); 41 42 #define LOAD_MSG_0_4(buf) \ 43 buf = _mm_perm_epi8(m2, m3, _mm_set_epi32(TOB(7),TOB(5),TOB(3),TOB(1)) ); 44 45 #define LOAD_MSG_1_1(buf) \ 46 t0 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(0),TOB(5),TOB(0),TOB(0)) ); \ 47 buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(5),TOB(2),TOB(1),TOB(6)) ); 48 49 #define LOAD_MSG_1_2(buf) \ 50 t1 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(2),TOB(0),TOB(4),TOB(6)) ); \ 51 buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(7),TOB(1),TOB(0)) ); 52 53 #define LOAD_MSG_1_3(buf) \ 54 t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(5),TOB(0),TOB(0),TOB(1)) ); \ 55 buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(7),TOB(1),TOB(0)) ); 56 57 #define LOAD_MSG_1_4(buf) \ 58 t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(3),TOB(7),TOB(2),TOB(0)) ); \ 59 buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(4)) ); 60 61 #define LOAD_MSG_2_1(buf) \ 62 t0 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(0),TOB(1),TOB(0),TOB(7)) ); \ 63 buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(7),TOB(2),TOB(4),TOB(0)) ); 64 65 #define LOAD_MSG_2_2(buf) \ 66 t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(2),TOB(0),TOB(4)) ); \ 67 buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(5),TOB(2),TOB(1),TOB(0)) ); 68 69 #define LOAD_MSG_2_3(buf) \ 70 t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(7),TOB(3),TOB(0)) ); \ 71 buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(5),TOB(2),TOB(1),TOB(6)) ); 72 73 #define LOAD_MSG_2_4(buf) \ 74 t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(4),TOB(1),TOB(6),TOB(0)) ); \ 75 buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(6)) ); 76 77 #define LOAD_MSG_3_1(buf) \ 78 t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(3),TOB(7)) ); \ 79 t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) ); \ 80 buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(5),TOB(1),TOB(0)) ); 81 82 #define LOAD_MSG_3_2(buf) \ 83 t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(0),TOB(1),TOB(5)) ); \ 84 buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(6),TOB(4),TOB(1),TOB(0)) ); 85 86 #define LOAD_MSG_3_3(buf) \ 87 t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(4),TOB(5),TOB(2)) ); \ 88 buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) ); 89 90 #define LOAD_MSG_3_4(buf) \ 91 t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(0),TOB(6)) ); \ 92 buf = _mm_perm_epi8(t1, m2, _mm_set_epi32(TOB(4),TOB(2),TOB(6),TOB(0)) ); 93 94 #define LOAD_MSG_4_1(buf) \ 95 t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(2),TOB(5),TOB(0)) ); \ 96 buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(6),TOB(2),TOB(1),TOB(5)) ); 97 98 #define LOAD_MSG_4_2(buf) \ 99 t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(4),TOB(7),TOB(0)) ); \ 100 buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) ); 101 102 #define LOAD_MSG_4_3(buf) \ 103 t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(3),TOB(6),TOB(0),TOB(0)) ); \ 104 t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(2),TOB(7),TOB(0)) ); \ 105 buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(6)) ); 106 107 #define LOAD_MSG_4_4(buf) \ 108 t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(4),TOB(0),TOB(1)) ); \ 109 buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(5),TOB(2),TOB(4),TOB(0)) ); 110 111 #define LOAD_MSG_5_1(buf) \ 112 t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(6),TOB(2)) ); \ 113 buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(4),TOB(2),TOB(1),TOB(0)) ); 114 115 #define LOAD_MSG_5_2(buf) \ 116 t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(3),TOB(7),TOB(6),TOB(0)) ); \ 117 buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(4)) ); 118 119 #define LOAD_MSG_5_3(buf) \ 120 t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(1),TOB(0),TOB(7),TOB(4)) ); \ 121 buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(7),TOB(1),TOB(0)) ); 122 123 #define LOAD_MSG_5_4(buf) \ 124 t1 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(5),TOB(0),TOB(1),TOB(0)) ); \ 125 buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(6),TOB(1),TOB(5)) ); 126 127 #define LOAD_MSG_6_1(buf) \ 128 t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(4),TOB(0),TOB(1),TOB(0)) ); \ 129 buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(6),TOB(1),TOB(4)) ); 130 131 #define LOAD_MSG_6_2(buf) \ 132 t1 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(6),TOB(0),TOB(0),TOB(1)) ); \ 133 buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(5),TOB(7),TOB(0)) ); 134 135 #define LOAD_MSG_6_3(buf) \ 136 t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(6),TOB(0)) ); \ 137 buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(4),TOB(5),TOB(1),TOB(0)) ); 138 139 #define LOAD_MSG_6_4(buf) \ 140 t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(2),TOB(3),TOB(7)) ); \ 141 buf = _mm_perm_epi8(t1, m2, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) ); 142 143 #define LOAD_MSG_7_1(buf) \ 144 t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(3),TOB(0),TOB(7),TOB(0)) ); \ 145 buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(4),TOB(1),TOB(5)) ); 146 147 #define LOAD_MSG_7_2(buf) \ 148 t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(5),TOB(1),TOB(0),TOB(7)) ); \ 149 buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(6),TOB(0)) ); 150 151 #define LOAD_MSG_7_3(buf) \ 152 t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(2),TOB(0),TOB(0),TOB(5)) ); \ 153 t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(4),TOB(1),TOB(0)) ); \ 154 buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(7),TOB(0)) ); 155 156 #define LOAD_MSG_7_4(buf) \ 157 t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(6),TOB(4),TOB(0)) ); \ 158 buf = _mm_perm_epi8(t1, m2, _mm_set_epi32(TOB(6),TOB(2),TOB(1),TOB(0)) ); 159 160 #define LOAD_MSG_8_1(buf) \ 161 t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(0),TOB(6)) ); \ 162 t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(7),TOB(1),TOB(0)) ); \ 163 buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(6),TOB(0)) ); 164 165 #define LOAD_MSG_8_2(buf) \ 166 t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(4),TOB(3),TOB(5),TOB(0)) ); \ 167 buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(7)) ); 168 169 #define LOAD_MSG_8_3(buf) \ 170 t0 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(6),TOB(1),TOB(0),TOB(0)) ); \ 171 buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(5),TOB(4)) ); \ 172 173 #define LOAD_MSG_8_4(buf) \ 174 buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(5),TOB(4),TOB(7),TOB(2)) ); 175 176 #define LOAD_MSG_9_1(buf) \ 177 t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(1),TOB(7),TOB(0),TOB(0)) ); \ 178 buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(2),TOB(4),TOB(6)) ); 179 180 #define LOAD_MSG_9_2(buf) \ 181 buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(5),TOB(6),TOB(4),TOB(2)) ); 182 183 #define LOAD_MSG_9_3(buf) \ 184 t0 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(3),TOB(5),TOB(0)) ); \ 185 buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(5),TOB(2),TOB(1),TOB(7)) ); 186 187 #define LOAD_MSG_9_4(buf) \ 188 t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(0),TOB(0),TOB(7)) ); \ 189 buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(4),TOB(6),TOB(0)) ); 190 191 #endif 192