1 /* 2 * Argon2 source code package 3 * 4 * Written by Daniel Dinu and Dmitry Khovratovich, 2015 5 * 6 * This work is licensed under a Creative Commons CC0 1.0 License/Waiver. 7 * 8 * You should have received a copy of the CC0 Public Domain Dedication along 9 * with 10 * this software. If not, see 11 * <http://creativecommons.org/publicdomain/zero/1.0/>. 12 */ 13 14 #include <stdint.h> 15 #include <stdlib.h> 16 #include <string.h> 17 18 #include "argon2-core.h" 19 #include "argon2.h" 20 #include "private/common.h" 21 #include "private/sse2_64_32.h" 22 23 #if defined(HAVE_AVX512FINTRIN_H) && defined(HAVE_AVX2INTRIN_H) && \ 24 defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H) 25 26 # ifdef __GNUC__ 27 # pragma GCC target("sse2") 28 # pragma GCC target("ssse3") 29 # pragma GCC target("sse4.1") 30 # pragma GCC target("avx2") 31 # pragma GCC target("avx512f") 32 # endif 33 34 # ifdef _MSC_VER 35 # include <intrin.h> /* for _mm_set_epi64x */ 36 # endif 37 #include <emmintrin.h> 38 #include <immintrin.h> 39 #include <smmintrin.h> 40 #include <tmmintrin.h> 41 42 # include "blamka-round-avx512f.h" 43 44 static void 45 fill_block(__m512i *state, const uint8_t *ref_block, uint8_t *next_block) 46 { 47 __m512i block_XY[ARGON2_512BIT_WORDS_IN_BLOCK]; 48 uint32_t i; 49 50 for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) { 51 block_XY[i] = state[i] = _mm512_xor_si512( 52 state[i], _mm512_loadu_si512((__m512i const *) (&ref_block[64 * i]))); 53 } 54 55 for (i = 0; i < 2; ++i) { 56 BLAKE2_ROUND_1( 57 state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3], 58 state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]); 59 } 60 61 for (i = 0; i < 2; ++i) { 62 BLAKE2_ROUND_2( 63 state[2 * 0 + i], state[2 * 1 + i], state[2 * 2 + i], state[2 * 3 + i], 64 state[2 * 4 + i], state[2 * 5 + i], state[2 * 6 + i], state[2 * 7 + i]); 65 } 66 67 for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) { 68 state[i] = _mm512_xor_si512(state[i], block_XY[i]); 69 _mm512_storeu_si512((__m512i *) (&next_block[64 * i]), state[i]); 70 } 71 } 72 73 static void 74 fill_block_with_xor(__m512i *state, const uint8_t *ref_block, 75 uint8_t *next_block) 76 { 77 __m512i block_XY[ARGON2_512BIT_WORDS_IN_BLOCK]; 78 uint32_t i; 79 80 for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) { 81 state[i] = _mm512_xor_si512( 82 state[i], _mm512_loadu_si512((__m512i const *) (&ref_block[64 * i]))); 83 block_XY[i] = _mm512_xor_si512( 84 state[i], _mm512_loadu_si512((__m512i const *) (&next_block[64 * i]))); 85 } 86 87 for (i = 0; i < 2; ++i) { 88 BLAKE2_ROUND_1( 89 state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3], 90 state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]); 91 } 92 93 for (i = 0; i < 2; ++i) { 94 BLAKE2_ROUND_2( 95 state[2 * 0 + i], state[2 * 1 + i], state[2 * 2 + i], state[2 * 3 + i], 96 state[2 * 4 + i], state[2 * 5 + i], state[2 * 6 + i], state[2 * 7 + i]); 97 } 98 99 for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) { 100 state[i] = _mm512_xor_si512(state[i], block_XY[i]); 101 _mm512_storeu_si512((__m512i *) (&next_block[64 * i]), state[i]); 102 } 103 } 104 105 static void 106 generate_addresses(const argon2_instance_t *instance, 107 const argon2_position_t *position, uint64_t *pseudo_rands) 108 { 109 block address_block, input_block, tmp_block; 110 uint32_t i; 111 112 init_block_value(&address_block, 0); 113 init_block_value(&input_block, 0); 114 115 if (instance != NULL && position != NULL) { 116 input_block.v[0] = position->pass; 117 input_block.v[1] = position->lane; 118 input_block.v[2] = position->slice; 119 input_block.v[3] = instance->memory_blocks; 120 input_block.v[4] = instance->passes; 121 input_block.v[5] = instance->type; 122 123 for (i = 0; i < instance->segment_length; ++i) { 124 if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) { 125 /* Temporary zero-initialized blocks */ 126 __m512i zero_block[ARGON2_512BIT_WORDS_IN_BLOCK]; 127 __m512i zero2_block[ARGON2_512BIT_WORDS_IN_BLOCK]; 128 129 memset(zero_block, 0, sizeof(zero_block)); 130 memset(zero2_block, 0, sizeof(zero2_block)); 131 init_block_value(&address_block, 0); 132 init_block_value(&tmp_block, 0); 133 /* Increasing index counter */ 134 input_block.v[6]++; 135 /* First iteration of G */ 136 fill_block_with_xor(zero_block, (uint8_t *) &input_block.v, 137 (uint8_t *) &tmp_block.v); 138 /* Second iteration of G */ 139 fill_block_with_xor(zero2_block, (uint8_t *) &tmp_block.v, 140 (uint8_t *) &address_block.v); 141 } 142 143 pseudo_rands[i] = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK]; 144 } 145 } 146 } 147 148 void 149 fill_segment_avx512f(const argon2_instance_t *instance, 150 argon2_position_t position) 151 { 152 block *ref_block = NULL, *curr_block = NULL; 153 uint64_t pseudo_rand, ref_index, ref_lane; 154 uint32_t prev_offset, curr_offset; 155 uint32_t starting_index, i; 156 __m512i state[ARGON2_512BIT_WORDS_IN_BLOCK]; 157 int data_independent_addressing = 1; 158 159 /* Pseudo-random values that determine the reference block position */ 160 uint64_t *pseudo_rands = NULL; 161 162 if (instance == NULL) { 163 return; 164 } 165 166 if (instance->type == Argon2_id && 167 (position.pass != 0 || position.slice >= ARGON2_SYNC_POINTS / 2)) { 168 data_independent_addressing = 0; 169 } 170 171 pseudo_rands = instance->pseudo_rands; 172 173 if (data_independent_addressing) { 174 generate_addresses(instance, &position, pseudo_rands); 175 } 176 177 starting_index = 0; 178 179 if ((0 == position.pass) && (0 == position.slice)) { 180 starting_index = 2; /* we have already generated the first two blocks */ 181 } 182 183 /* Offset of the current block */ 184 curr_offset = position.lane * instance->lane_length + 185 position.slice * instance->segment_length + starting_index; 186 187 if (0 == curr_offset % instance->lane_length) { 188 /* Last block in this lane */ 189 prev_offset = curr_offset + instance->lane_length - 1; 190 } else { 191 /* Previous block */ 192 prev_offset = curr_offset - 1; 193 } 194 195 memcpy(state, ((instance->region->memory + prev_offset)->v), 196 ARGON2_BLOCK_SIZE); 197 198 for (i = starting_index; i < instance->segment_length; 199 ++i, ++curr_offset, ++prev_offset) { 200 /*1.1 Rotating prev_offset if needed */ 201 if (curr_offset % instance->lane_length == 1) { 202 prev_offset = curr_offset - 1; 203 } 204 205 /* 1.2 Computing the index of the reference block */ 206 /* 1.2.1 Taking pseudo-random value from the previous block */ 207 if (data_independent_addressing) { 208 #pragma warning(push) 209 #pragma warning(disable : 6385) 210 pseudo_rand = pseudo_rands[i]; 211 #pragma warning(pop) 212 } else { 213 pseudo_rand = instance->region->memory[prev_offset].v[0]; 214 } 215 216 /* 1.2.2 Computing the lane of the reference block */ 217 ref_lane = ((pseudo_rand >> 32)) % instance->lanes; 218 219 if ((position.pass == 0) && (position.slice == 0)) { 220 /* Can not reference other lanes yet */ 221 ref_lane = position.lane; 222 } 223 224 /* 1.2.3 Computing the number of possible reference block within the 225 * lane. 226 */ 227 position.index = i; 228 ref_index = index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF, 229 ref_lane == position.lane); 230 231 /* 2 Creating a new block */ 232 ref_block = instance->region->memory + 233 instance->lane_length * ref_lane + ref_index; 234 curr_block = instance->region->memory + curr_offset; 235 if (position.pass != 0) { 236 fill_block_with_xor(state, (uint8_t *) ref_block->v, 237 (uint8_t *) curr_block->v); 238 } else { 239 fill_block(state, (uint8_t *) ref_block->v, 240 (uint8_t *) curr_block->v); 241 } 242 } 243 } 244 #endif 245