/*
 * Argon2 source code package
 *
 * Written by Daniel Dinu and Dmitry Khovratovich, 2015
 *
 * This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
 *
 * You should have received a copy of the CC0 Public Domain Dedication along
 * with
 * this software. If not, see
 * <http://creativecommons.org/publicdomain/zero/1.0/>.
 */

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include "argon2-core.h"
#include "argon2.h"
#include "private/common.h"
#include "private/sse2_64_32.h"

/* This whole translation unit is compiled only when the build system found
 * all the intrinsic headers needed for the AVX2 code path. */
#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \
    defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H)

# ifdef __GNUC__
/* Allow AVX2/SSE intrinsics in this file even if the rest of the library is
 * compiled for a baseline ISA; runtime dispatch selects this implementation
 * only on capable CPUs (presumably — dispatch is outside this file, verify). */
#  pragma GCC target("sse2")
#  pragma GCC target("ssse3")
#  pragma GCC target("sse4.1")
#  pragma GCC target("avx2")
# endif

# ifdef _MSC_VER
#  include <intrin.h> /* for _mm_set_epi64x */
# endif
#include <emmintrin.h>
#include <immintrin.h>
#include <smmintrin.h>
#include <tmmintrin.h>

# include "blamka-round-avx2.h"

/*
 * Argon2 compression function G (non-xor variant):
 *   next_block = G(state, ref_block), and state is updated to the result.
 *
 * state      in/out: the previous block held as ARGON2_HWORDS_IN_BLOCK
 *                    256-bit registers; on return it holds the new block.
 * ref_block  in:     the 1024-byte reference block (loaded unaligned).
 * next_block out:    receives the 1024-byte result (stored unaligned).
 */
static void
fill_block(__m256i *state, const uint8_t *ref_block, uint8_t *next_block)
{
    __m256i block_XY[ARGON2_HWORDS_IN_BLOCK];
    uint32_t i;

    /* R = prev XOR ref; keep a copy of R for the final feed-forward XOR. */
    for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
        block_XY[i] = state[i] = _mm256_xor_si256(
            state[i], _mm256_loadu_si256((__m256i const *) (&ref_block[32 * i])));
    }

    /* First set of BLAKE2 rounds: each round mixes a group of 8 consecutive
     * registers (macro semantics defined in blamka-round-avx2.h). */
    for (i = 0; i < 4; ++i) {
        BLAKE2_ROUND_1(state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
                       state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
    }

    /* Second set of BLAKE2 rounds: each round mixes 8 registers taken with
     * stride 4, i.e. the orthogonal grouping to the first set. */
    for (i = 0; i < 4; ++i) {
        BLAKE2_ROUND_2(state[ 0 + i], state[ 4 + i], state[ 8 + i], state[12 + i],
                       state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
    }

    /* Feed-forward: result = P(R) XOR R; store it and keep it in state. */
    for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
        state[i] = _mm256_xor_si256(state[i], block_XY[i]);
        _mm256_storeu_si256((__m256i *) (&next_block[32 * i]), state[i]);
    }
}

/*
 * Argon2 compression function G, xor variant used on passes > 0:
 * the result is additionally XORed with the current contents of
 * next_block before being written back (overwrite-with-xor semantics).
 *
 * Same parameters as fill_block; next_block is both read and written.
 */
static void
fill_block_with_xor(__m256i *state, const uint8_t *ref_block,
                    uint8_t *next_block)
{
    __m256i block_XY[ARGON2_HWORDS_IN_BLOCK];
    uint32_t i;

    /* R = prev XOR ref; the saved copy also folds in the old next_block so
     * the final feed-forward XOR implements "result XOR old contents". */
    for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
        state[i] = _mm256_xor_si256(
            state[i], _mm256_loadu_si256((__m256i const *) (&ref_block[32 * i])));
        block_XY[i] = _mm256_xor_si256(
            state[i], _mm256_loadu_si256((__m256i const *) (&next_block[32 * i])));
    }

    /* First set of BLAKE2 rounds over groups of 8 consecutive registers. */
    for (i = 0; i < 4; ++i) {
        BLAKE2_ROUND_1(state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
                       state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
    }

    /* Second set of BLAKE2 rounds over the stride-4 grouping. */
    for (i = 0; i < 4; ++i) {
        BLAKE2_ROUND_2(state[ 0 + i], state[ 4 + i], state[ 8 + i], state[12 + i],
                       state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
    }

    /* Feed-forward (includes the old next_block contents); store result. */
    for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
        state[i] = _mm256_xor_si256(state[i], block_XY[i]);
        _mm256_storeu_si256((__m256i *) (&next_block[32 * i]), state[i]);
    }
}

/*
 * Precompute the data-independent pseudo-random reference values for one
 * segment (Argon2i addressing, also used for the first half of Argon2id).
 *
 * instance     read-only instance parameters (memory size, passes, type).
 * position     current pass/lane/slice.
 * pseudo_rands out: one 64-bit value per block in the segment; caller must
 *              provide room for instance->segment_length entries.
 *
 * Each address block is produced as G(0, G(0, input_block)), where
 * input_block encodes the position and a per-block counter (v[6]).
 */
static void
generate_addresses(const argon2_instance_t *instance,
                   const argon2_position_t *position, uint64_t *pseudo_rands)
{
    block address_block, input_block, tmp_block;
    uint32_t i;

    init_block_value(&address_block, 0);
    init_block_value(&input_block, 0);

    if (instance != NULL && position != NULL) {
        input_block.v[0] = position->pass;
        input_block.v[1] = position->lane;
        input_block.v[2] = position->slice;
        input_block.v[3] = instance->memory_blocks;
        input_block.v[4] = instance->passes;
        input_block.v[5] = instance->type;

        for (i = 0; i < instance->segment_length; ++i) {
            /* One address block yields ARGON2_ADDRESSES_IN_BLOCK values;
             * regenerate it only when the previous one is exhausted. */
            if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) {
                /* Temporary zero-initialized blocks */
                __m256i zero_block[ARGON2_HWORDS_IN_BLOCK];
                __m256i zero2_block[ARGON2_HWORDS_IN_BLOCK];

                memset(zero_block, 0, sizeof(zero_block));
                memset(zero2_block, 0, sizeof(zero2_block));
                init_block_value(&address_block, 0);
                init_block_value(&tmp_block, 0);
                /* Increasing index counter */
                input_block.v[6]++;
                /* First iteration of G */
                fill_block_with_xor(zero_block, (uint8_t *) &input_block.v,
                                    (uint8_t *) &tmp_block.v);
                /* Second iteration of G */
                fill_block_with_xor(zero2_block, (uint8_t *) &tmp_block.v,
                                    (uint8_t *) &address_block.v);
            }

            pseudo_rands[i] = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK];
        }
    }
}

/*
 * Fill one segment of Argon2 memory using the AVX2 compression function.
 *
 * instance  the Argon2 instance (memory region, geometry, type); a NULL
 *           instance is a no-op.
 * position  which pass/lane/slice to fill (passed by value; .index is
 *           updated locally as the loop advances).
 *
 * For each block in the segment: pick a reference block (data-independently
 * from precomputed pseudo_rands, or data-dependently from the previous
 * block's first word), then compute
 *   memory[curr] = G(memory[prev], memory[ref])      (pass 0)
 *   memory[curr] ^= G(memory[prev], memory[ref])     (later passes)
 */
void
fill_segment_avx2(const argon2_instance_t *instance,
                  argon2_position_t position)
{
    block *ref_block = NULL, *curr_block = NULL;
    uint64_t pseudo_rand, ref_index, ref_lane;
    uint32_t prev_offset, curr_offset;
    uint32_t starting_index, i;
    __m256i state[ARGON2_HWORDS_IN_BLOCK];
    int data_independent_addressing = 1;

    /* Pseudo-random values that determine the reference block position */
    uint64_t *pseudo_rands = NULL;

    if (instance == NULL) {
        return;
    }

    /* Argon2id switches to data-dependent addressing after the first half
     * of the first pass; Argon2i (the only other type reaching this check
     * as written — verify against callers) stays data-independent. */
    if (instance->type == Argon2_id &&
        (position.pass != 0 || position.slice >= ARGON2_SYNC_POINTS / 2)) {
        data_independent_addressing = 0;
    }

    pseudo_rands = instance->pseudo_rands;

    if (data_independent_addressing) {
        generate_addresses(instance, &position, pseudo_rands);
    }

    starting_index = 0;

    if ((0 == position.pass) && (0 == position.slice)) {
        starting_index = 2; /* we have already generated the first two blocks */
    }

    /* Offset of the current block */
    curr_offset = position.lane * instance->lane_length +
                  position.slice * instance->segment_length + starting_index;

    if (0 == curr_offset % instance->lane_length) {
        /* Last block in this lane */
        prev_offset = curr_offset + instance->lane_length - 1;
    } else {
        /* Previous block */
        prev_offset = curr_offset - 1;
    }

    /* Load the previous block into registers once; fill_block keeps it
     * updated, so it is never reloaded inside the loop. */
    memcpy(state, ((instance->region->memory + prev_offset)->v),
           ARGON2_BLOCK_SIZE);

    for (i = starting_index; i < instance->segment_length;
         ++i, ++curr_offset, ++prev_offset) {
        /*1.1 Rotating prev_offset if needed */
        if (curr_offset % instance->lane_length == 1) {
            prev_offset = curr_offset - 1;
        }

        /* 1.2 Computing the index of the reference block */
        /* 1.2.1 Taking pseudo-random value from the previous block */
        if (data_independent_addressing) {
            /* MSVC static-analyzer suppression (C6385: spurious read-overrun
             * report on pseudo_rands); ignored by other compilers. */
#pragma warning(push)
#pragma warning(disable : 6385)
            pseudo_rand = pseudo_rands[i];
#pragma warning(pop)
        } else {
            pseudo_rand = instance->region->memory[prev_offset].v[0];
        }

        /* 1.2.2 Computing the lane of the reference block */
        ref_lane = ((pseudo_rand >> 32)) % instance->lanes;

        if ((position.pass == 0) && (position.slice == 0)) {
            /* Can not reference other lanes yet */
            ref_lane = position.lane;
        }

        /* 1.2.3 Computing the number of possible reference block within the
         * lane.
         */
        position.index = i;
        ref_index = index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,
                                ref_lane == position.lane);

        /* 2 Creating a new block */
        ref_block = instance->region->memory +
                    instance->lane_length * ref_lane + ref_index;
        curr_block = instance->region->memory + curr_offset;
        if (position.pass != 0) {
            /* Later passes overwrite existing blocks: xor the new value in. */
            fill_block_with_xor(state, (uint8_t *) ref_block->v,
                                (uint8_t *) curr_block->v);
        } else {
            fill_block(state, (uint8_t *) ref_block->v,
                       (uint8_t *) curr_block->v);
        }
    }
}
#endif