/*
 * Argon2 source code package
 *
 * Written by Daniel Dinu and Dmitry Khovratovich, 2015
 *
 * This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
 *
 * You should have received a copy of the CC0 Public Domain Dedication along
 * with this software. If not, see
 * <http://creativecommons.org/publicdomain/zero/1.0/>.
 */

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include "argon2-core.h"
#include "argon2.h"
#include "private/common.h"
#include "private/sse2_64_32.h"

#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \
    defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H)

# ifdef __GNUC__
#  pragma GCC target("sse2")
#  pragma GCC target("ssse3")
#  pragma GCC target("sse4.1")
#  pragma GCC target("avx2")
# endif

# ifdef _MSC_VER
#  include <intrin.h> /* for _mm_set_epi64x */
# endif
# include <emmintrin.h>
# include <immintrin.h>
# include <smmintrin.h>
# include <tmmintrin.h>

# include "blamka-round-avx2.h"

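/*
 * Argon2 compression function G. `state` holds the previous block of the
 * lane; it is XORed with `ref_block`, run through the BlaMka permutation,
 * XORed with its pre-permutation value, and the result is written to
 * `next_block` while also being kept in `state` for the next iteration.
 */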
static void
fill_block(__m256i *state, const uint8_t *ref_block, uint8_t *next_block)
{
    __m256i block_XY[ARGON2_HWORDS_IN_BLOCK];
    uint32_t i;

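    /* XOR the reference block into the previous block, keeping a copy of
     * the result for the final feedback XOR */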
    for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
        block_XY[i] = state[i] = _mm256_xor_si256(
            state[i], _mm256_loadu_si256((__m256i const *) (&ref_block[32 * i])));
    }

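    /* BLAKE2 (BlaMka) rounds over each group of eight 256-bit registers */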
    for (i = 0; i < 4; ++i) {
        BLAKE2_ROUND_1(state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
                       state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
    }

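    /* Second set of rounds, over a strided grouping of the registers */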
    for (i = 0; i < 4; ++i) {
        BLAKE2_ROUND_2(state[ 0 + i], state[ 4 + i], state[ 8 + i], state[12 + i],
                       state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
    }

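    /* Feedback XOR with the saved value, then store the new block */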
    for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
        state[i] = _mm256_xor_si256(state[i], block_XY[i]);
        _mm256_storeu_si256((__m256i *) (&next_block[32 * i]), state[i]);
    }
}

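/*
 * Same as fill_block(), except that the prior contents of `next_block` are
 * also XORed into the result. Used for the second and later passes, and for
 * address generation.
 */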
static void
fill_block_with_xor(__m256i *state, const uint8_t *ref_block,
                    uint8_t *next_block)
{
    __m256i block_XY[ARGON2_HWORDS_IN_BLOCK];
    uint32_t i;

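    /* XOR in the reference block; the saved feedback value additionally
     * includes the current contents of next_block */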
    for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
        state[i] = _mm256_xor_si256(
            state[i], _mm256_loadu_si256((__m256i const *) (&ref_block[32 * i])));
        block_XY[i] = _mm256_xor_si256(
            state[i], _mm256_loadu_si256((__m256i const *) (&next_block[32 * i])));
    }

    for (i = 0; i < 4; ++i) {
        BLAKE2_ROUND_1(state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
                       state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
    }

    for (i = 0; i < 4; ++i) {
        BLAKE2_ROUND_2(state[ 0 + i], state[ 4 + i], state[ 8 + i], state[12 + i],
                       state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
    }

    for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
        state[i] = _mm256_xor_si256(state[i], block_XY[i]);
        _mm256_storeu_si256((__m256i *) (&next_block[32 * i]), state[i]);
    }
}

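/*
 * Computes the pseudo-random reference indices for a whole segment when
 * addressing is data-independent (Argon2i, and the first half of the first
 * pass of Argon2id). Each address block is obtained by applying G twice to
 * an input block encoding the current position, and yields
 * ARGON2_ADDRESSES_IN_BLOCK index values.
 */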
static void
generate_addresses(const argon2_instance_t *instance,
                   const argon2_position_t *position, uint64_t *pseudo_rands)
{
    block address_block, input_block, tmp_block;
    uint32_t i;

    init_block_value(&address_block, 0);
    init_block_value(&input_block, 0);

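    /* Words 0..5 of the input block encode the current position; word 6 is
     * a counter incremented once per address block */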
    if (instance != NULL && position != NULL) {
        input_block.v[0] = position->pass;
        input_block.v[1] = position->lane;
        input_block.v[2] = position->slice;
        input_block.v[3] = instance->memory_blocks;
        input_block.v[4] = instance->passes;
        input_block.v[5] = instance->type;

        for (i = 0; i < instance->segment_length; ++i) {
            if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) {
                /* Temporary zero-initialized blocks */
                __m256i zero_block[ARGON2_HWORDS_IN_BLOCK];
                __m256i zero2_block[ARGON2_HWORDS_IN_BLOCK];

                memset(zero_block, 0, sizeof(zero_block));
                memset(zero2_block, 0, sizeof(zero2_block));
                init_block_value(&address_block, 0);
                init_block_value(&tmp_block, 0);
                /* Increment the index counter */
                input_block.v[6]++;
                /* First iteration of G */
                fill_block_with_xor(zero_block, (uint8_t *) &input_block.v,
                                    (uint8_t *) &tmp_block.v);
                /* Second iteration of G */
                fill_block_with_xor(zero2_block, (uint8_t *) &tmp_block.v,
                                    (uint8_t *) &address_block.v);
            }

            pseudo_rands[i] = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK];
        }
    }
}

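/*
 * Fills one segment (the part of a lane covered by a single slice): each
 * block is computed from the previous block in the lane and a reference
 * block whose index is derived either from precomputed addresses or from
 * the previous block itself.
 */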
void
fill_segment_avx2(const argon2_instance_t *instance,
                  argon2_position_t position)
{
    block *ref_block = NULL, *curr_block = NULL;
    uint64_t pseudo_rand, ref_index, ref_lane;
    uint32_t prev_offset, curr_offset;
    uint32_t starting_index, i;
    __m256i state[ARGON2_HWORDS_IN_BLOCK];
    int data_independent_addressing = 1;

    /* Pseudo-random values that determine the reference block position */
    uint64_t *pseudo_rands = NULL;

    if (instance == NULL) {
        return;
    }

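    /* Argon2id is data-independent only during the first half of the first
     * pass; after that, addressing becomes data-dependent */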
    if (instance->type == Argon2_id &&
        (position.pass != 0 || position.slice >= ARGON2_SYNC_POINTS / 2)) {
        data_independent_addressing = 0;
    }

    pseudo_rands = instance->pseudo_rands;

    if (data_independent_addressing) {
        generate_addresses(instance, &position, pseudo_rands);
    }

    starting_index = 0;

    if ((0 == position.pass) && (0 == position.slice)) {
        starting_index = 2; /* we have already generated the first two blocks */
    }

    /* Offset of the current block */
    curr_offset = position.lane * instance->lane_length +
                  position.slice * instance->segment_length + starting_index;

    if (0 == curr_offset % instance->lane_length) {
        /* Last block in this lane */
        prev_offset = curr_offset + instance->lane_length - 1;
    } else {
        /* Previous block */
        prev_offset = curr_offset - 1;
    }

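    /* Load the block preceding the current one into SIMD registers */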
    memcpy(state, ((instance->region->memory + prev_offset)->v),
           ARGON2_BLOCK_SIZE);

    for (i = starting_index; i < instance->segment_length;
         ++i, ++curr_offset, ++prev_offset) {
        /* 1.1 Rotating prev_offset if needed */
        if (curr_offset % instance->lane_length == 1) {
            prev_offset = curr_offset - 1;
        }

        /* 1.2 Computing the index of the reference block */
        /* 1.2.1 Taking pseudo-random value from the previous block */
        if (data_independent_addressing) {
#pragma warning(push)
#pragma warning(disable : 6385)
            pseudo_rand = pseudo_rands[i];
#pragma warning(pop)
        } else {
            pseudo_rand = instance->region->memory[prev_offset].v[0];
        }

        /* 1.2.2 Computing the lane of the reference block */
        ref_lane = ((pseudo_rand >> 32)) % instance->lanes;

        if ((position.pass == 0) && (position.slice == 0)) {
            /* Cannot reference other lanes yet */
            ref_lane = position.lane;
        }

        /* 1.2.3 Computing the number of possible reference blocks within the
         * lane.
         */
        position.index = i;
        ref_index = index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,
                                ref_lane == position.lane);

        /* 2 Creating a new block */
        ref_block = instance->region->memory +
                    instance->lane_length * ref_lane + ref_index;
        curr_block = instance->region->memory + curr_offset;
        if (position.pass != 0) {
            fill_block_with_xor(state, (uint8_t *) ref_block->v,
                                (uint8_t *) curr_block->v);
        } else {
            fill_block(state, (uint8_t *) ref_block->v,
                       (uint8_t *) curr_block->v);
        }
    }
}
#endif