1 // lsh.cpp - written and placed in the public domain by Jeffrey Walton
2 //           Based on the specification and source code provided by
3 //           Korea Internet & Security Agency (KISA) website. Also
4 //           see https://seed.kisa.or.kr/kisa/algorithm/EgovLSHInfo.do
5 //           and https://seed.kisa.or.kr/kisa/Board/22/detailView.do.
6 
7 // We are hitting some sort of GCC bug in the LSH AVX2 code path.
8 // Clang is OK on the AVX2 code path. We believe it is GCC Issue
9 // 82735, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82735. It
10 // makes using zeroupper a little tricky.
11 
12 #include "pch.h"
13 #include "config.h"
14 
15 #include "lsh.h"
16 #include "cpu.h"
17 #include "misc.h"
18 
19 #if defined(CRYPTOPP_SSSE3_AVAILABLE) && defined(CRYPTOPP_ENABLE_64BIT_SSE)
20 
21 #if defined(CRYPTOPP_SSSE3_AVAILABLE)
22 # include <emmintrin.h>
23 # include <tmmintrin.h>
24 #endif
25 
26 #if defined(CRYPTOPP_XOP_AVAILABLE)
27 # include <ammintrin.h>
28 #endif
29 
30 // GCC at 4.5. Clang is unknown. Also see https://stackoverflow.com/a/42493893.
31 #if (CRYPTOPP_GCC_VERSION >= 40500)
32 # include <x86intrin.h>
33 #endif
34 
35 ANONYMOUS_NAMESPACE_BEGIN
36 
/* LSH Constants */

// Size in bytes of one message block; this is the amount compress() consumes
// and the capacity of the last_block buffer kept in the state array.
const unsigned int LSH256_MSG_BLK_BYTE_LEN = 128;
// const unsigned int LSH256_MSG_BLK_BIT_LEN = 1024;
// const unsigned int LSH256_CV_BYTE_LEN = 64;
// Stored in cv_l[0] when lsh256_ssse3_init() derives a chaining value
// for a type other than the two predefined ones.
const unsigned int LSH256_HASH_VAL_MAX_BYTE_LEN = 32;

// const unsigned int MSG_BLK_WORD_LEN = 32;
// Number of 32-bit words in an IV table (8 for cv_l followed by 8 for cv_r).
const unsigned int CV_WORD_LEN = 16;
// Number of 32-bit step constants consumed by one mix() call.
const unsigned int CONST_WORD_LEN = 8;
// const unsigned int HASH_VAL_MAX_WORD_LEN = 8;
// const unsigned int WORD_BIT_LEN = 32;
// Number of mix steps per compression; together with CONST_WORD_LEN it
// sizes the LSH256_StepConstants table.
const unsigned int NUM_STEPS = 26;

// Rotation amounts handed to mix<Alpha, Beta>() on even and odd steps.
const unsigned int ROT_EVEN_ALPHA = 29;
const unsigned int ROT_EVEN_BETA = 1;
const unsigned int ROT_ODD_ALPHA = 5;
const unsigned int ROT_ODD_BETA = 17;

// Algorithm type values. The low 16 bits are the digest length in bytes
// (see LSH_GET_HASHBYTE): 0x20 = 32 bytes, 0x1C = 28 bytes.
const unsigned int LSH_TYPE_256_256 = 0x0000020;
const unsigned int LSH_TYPE_256_224 = 0x000001C;

// const unsigned int LSH_TYPE_224 = LSH_TYPE_256_224;
// const unsigned int LSH_TYPE_256 = LSH_TYPE_256_256;

/* Error Code */

// Return values of the lsh256_ssse3_* workers. The LSH256_Base_* entry
// points turn anything other than LSH_SUCCESS into an Exception.
const unsigned int LSH_SUCCESS = 0x0;
// const unsigned int LSH_ERR_NULL_PTR = 0x2401;
// const unsigned int LSH_ERR_INVALID_ALGTYPE = 0x2402;
const unsigned int LSH_ERR_INVALID_DATABITLEN = 0x2403;
const unsigned int LSH_ERR_INVALID_STATE = 0x2404;

/* Index into our state array */

// Word indices in the caller-supplied word32 state array. Words 0..79 hold
// the chaining value, sub-messages and last block (see LSH256_SSSE3_Context).
const unsigned int AlgorithmType = 80;
const unsigned int RemainingBits = 81;
74 
75 NAMESPACE_END
76 
77 NAMESPACE_BEGIN(CryptoPP)
78 NAMESPACE_BEGIN(LSH)
79 
// lsh256.cpp
// Read-only tables declared here and defined elsewhere (the note above
// names lsh256.cpp). load_iv() reads the IVs with aligned loads, so the
// definitions are presumably 16-byte aligned - TODO confirm at the definition.
extern const word32 LSH256_IV224[CV_WORD_LEN];
extern const word32 LSH256_IV256[CV_WORD_LEN];
// CONST_WORD_LEN words per step, NUM_STEPS steps; indexed through load_sc().
extern const word32 LSH256_StepConstants[CONST_WORD_LEN * NUM_STEPS];
84 
85 NAMESPACE_END  // LSH
86 NAMESPACE_END  // Crypto++
87 
88 ANONYMOUS_NAMESPACE_BEGIN
89 
// Bring the Crypto++ names used below into this anonymous namespace.
using CryptoPP::byte;
using CryptoPP::word32;
using CryptoPP::rotlFixed;
using CryptoPP::rotlConstant;

using CryptoPP::GetBlock;
using CryptoPP::LittleEndian;
using CryptoPP::ConditionalByteReverse;
using CryptoPP::LITTLE_ENDIAN_ORDER;

// Short aliases used throughout this file. Everything except lsh_u8 is a
// word32, so error codes, algorithm types and counters share one width.
typedef byte lsh_u8;
typedef word32 lsh_u32;
typedef word32 lsh_uint;
typedef word32 lsh_err;
typedef word32 lsh_type;

// Tables declared in namespace CryptoPP::LSH above.
using CryptoPP::LSH::LSH256_IV224;
using CryptoPP::LSH::LSH256_IV256;
using CryptoPP::LSH::LSH256_StepConstants;
109 
110 struct LSH256_SSSE3_Context
111 {
LSH256_SSSE3_ContextLSH256_SSSE3_Context112 	LSH256_SSSE3_Context(word32* state, word32 algType, word32& remainingBitLength) :
113 		cv_l(state+0), cv_r(state+8), sub_msgs(state+16),
114 		last_block(reinterpret_cast<byte*>(state+48)),
115 		remain_databitlen(remainingBitLength),
116 		alg_type(static_cast<lsh_type>(algType)) {}
117 
118 	lsh_u32* cv_l;  // start of our state block
119 	lsh_u32* cv_r;
120 	lsh_u32* sub_msgs;
121 	lsh_u8*  last_block;
122 	lsh_u32& remain_databitlen;
123 	lsh_type alg_type;
124 };
125 
// Non-owning view that names the four 8-word sub-message buffers stored at
// word offsets 16, 24, 32 and 40 of the state array.
struct LSH256_SSSE3_Internal
{
	// state - base of the state array (the same pointer a context uses for
	//         cv_l). Marked explicit so that a bare word32* never converts
	//         into this view by accident.
	explicit LSH256_SSSE3_Internal(word32* state) :
		submsg_e_l(state+16), submsg_e_r(state+24),
		submsg_o_l(state+32), submsg_o_r(state+40) { }

	lsh_u32* submsg_e_l; /* even left sub-message  */
	lsh_u32* submsg_e_r; /* even right sub-message */
	lsh_u32* submsg_o_l; /* odd left sub-message   */
	lsh_u32* submsg_o_r; /* odd right sub-message  */
};
137 
138 // const word32 g_gamma256[8] = { 0, 8, 16, 24, 24, 16, 8, 0 };
139 
/* LSH AlgType helper functions */
141 
// Returns true when bits 16..19 of the type value are all zero.
// NOTE(review): the name says LSH512 while the test is "== 0"; nothing in
// the visible code calls this helper - confirm the intended meaning before use.
inline bool LSH_IS_LSH512(lsh_uint val) {
	const lsh_uint family = val & 0xf0000;
	return family == 0;
}
145 
// Extracts the top byte (bits 24..31) of the type value. get_hash() uses it
// as the number of low bits to clear in the last digest byte.
inline lsh_uint LSH_GET_SMALL_HASHBIT(lsh_uint val) {
	const lsh_uint top = val >> 24;
	return top;
}
149 
// Extracts the low 16 bits of the type value. get_hash() uses it as the
// number of digest bytes to copy out.
inline lsh_uint LSH_GET_HASHBYTE(lsh_uint val) {
	const lsh_uint low = val & 0xffff;
	return low;
}
153 
// Digest length in bits: eight times the byte count from LSH_GET_HASHBYTE
// minus the bit count from LSH_GET_SMALL_HASHBIT.
inline lsh_uint LSH_GET_HASHBIT(lsh_uint val) {
	const lsh_uint whole = LSH_GET_HASHBYTE(val) << 3;
	const lsh_uint trimmed = LSH_GET_SMALL_HASHBIT(val);
	return whole - trimmed;
}
157 
// Passes v through ConditionalByteReverse with LITTLE_ENDIAN_ORDER and
// returns the result.
inline lsh_u32 loadLE32(lsh_u32 v) {
	const lsh_u32 converted = ConditionalByteReverse(LITTLE_ENDIAN_ORDER, v);
	return converted;
}
161 
// Rotates x left by r bits by forwarding to rotlFixed.
// Declared inline like every other helper in this anonymous namespace; the
// visible code never calls it, and a non-inline function with internal
// linkage that is never used draws an unused-function warning.
inline lsh_u32 ROTL(lsh_u32 x, lsh_u32 r) {
	return rotlFixed(x, r);
}
165 
166 // Original code relied upon unaligned lsh_u32 buffer
// Copies one 128-byte message block into the four sub-message buffers:
// bytes 0..31 -> even left, 32..63 -> even right, 64..95 -> odd left,
// 96..127 -> odd right. Unaligned loads and stores are used on both sides,
// so msgblk may have any alignment.
inline void load_msg_blk(LSH256_SSSE3_Internal* i_state, const lsh_u8 msgblk[LSH256_MSG_BLK_BYTE_LEN])
{
	CRYPTOPP_ASSERT(i_state != NULLPTR);

	lsh_u32* const even_l = i_state->submsg_e_l;
	lsh_u32* const even_r = i_state->submsg_e_r;
	lsh_u32* const odd_l = i_state->submsg_o_l;
	lsh_u32* const odd_r = i_state->submsg_o_r;

	__m128i chunk;

	chunk = _mm_loadu_si128(CONST_M128_CAST(msgblk+0));
	_mm_storeu_si128(M128_CAST(even_l+0), chunk);
	chunk = _mm_loadu_si128(CONST_M128_CAST(msgblk+16));
	_mm_storeu_si128(M128_CAST(even_l+4), chunk);

	chunk = _mm_loadu_si128(CONST_M128_CAST(msgblk+32));
	_mm_storeu_si128(M128_CAST(even_r+0), chunk);
	chunk = _mm_loadu_si128(CONST_M128_CAST(msgblk+48));
	_mm_storeu_si128(M128_CAST(even_r+4), chunk);

	chunk = _mm_loadu_si128(CONST_M128_CAST(msgblk+64));
	_mm_storeu_si128(M128_CAST(odd_l+0), chunk);
	chunk = _mm_loadu_si128(CONST_M128_CAST(msgblk+80));
	_mm_storeu_si128(M128_CAST(odd_l+4), chunk);

	chunk = _mm_loadu_si128(CONST_M128_CAST(msgblk+96));
	_mm_storeu_si128(M128_CAST(odd_r+0), chunk);
	chunk = _mm_loadu_si128(CONST_M128_CAST(msgblk+112));
	_mm_storeu_si128(M128_CAST(odd_r+4), chunk);
}
192 
// Message expansion for an even step. Each even sub-message vector is
// replaced by (odd vector) + (word-permuted even vector), using 32-bit
// lane additions. The odd buffers are only read.
inline void msg_exp_even(LSH256_SSSE3_Internal* i_state)
{
	CRYPTOPP_ASSERT(i_state != NULLPTR);

	lsh_u32* const even_l = i_state->submsg_e_l;
	lsh_u32* const even_r = i_state->submsg_e_r;
	lsh_u32* const odd_l = i_state->submsg_o_l;
	lsh_u32* const odd_r = i_state->submsg_o_r;

	__m128i odd, even;

	// left half, words 0..3
	odd = _mm_shuffle_epi32(
		_mm_loadu_si128(CONST_M128_CAST(odd_l+0)), _MM_SHUFFLE(3,2,1,0));
	even = _mm_shuffle_epi32(
		_mm_loadu_si128(CONST_M128_CAST(even_l+0)), _MM_SHUFFLE(1,0,2,3));
	_mm_storeu_si128(M128_CAST(even_l+0), _mm_add_epi32(odd, even));

	// left half, words 4..7
	odd = _mm_shuffle_epi32(
		_mm_loadu_si128(CONST_M128_CAST(odd_l+4)), _MM_SHUFFLE(3,2,1,0));
	even = _mm_shuffle_epi32(
		_mm_loadu_si128(CONST_M128_CAST(even_l+4)), _MM_SHUFFLE(2,1,0,3));
	_mm_storeu_si128(M128_CAST(even_l+4), _mm_add_epi32(odd, even));

	// right half, words 0..3
	odd = _mm_shuffle_epi32(
		_mm_loadu_si128(CONST_M128_CAST(odd_r+0)), _MM_SHUFFLE(3,2,1,0));
	even = _mm_shuffle_epi32(
		_mm_loadu_si128(CONST_M128_CAST(even_r+0)), _MM_SHUFFLE(1,0,2,3));
	_mm_storeu_si128(M128_CAST(even_r+0), _mm_add_epi32(odd, even));

	// right half, words 4..7
	odd = _mm_shuffle_epi32(
		_mm_loadu_si128(CONST_M128_CAST(odd_r+4)), _MM_SHUFFLE(3,2,1,0));
	even = _mm_shuffle_epi32(
		_mm_loadu_si128(CONST_M128_CAST(even_r+4)), _MM_SHUFFLE(2,1,0,3));
	_mm_storeu_si128(M128_CAST(even_r+4), _mm_add_epi32(odd, even));
}
226 
// Message expansion for an odd step. Each odd sub-message vector is
// replaced by (even vector) + (word-permuted odd vector), using 32-bit
// lane additions. The even buffers are only read.
inline void msg_exp_odd(LSH256_SSSE3_Internal* i_state)
{
	CRYPTOPP_ASSERT(i_state != NULLPTR);

	lsh_u32* const even_l = i_state->submsg_e_l;
	lsh_u32* const even_r = i_state->submsg_e_r;
	lsh_u32* const odd_l = i_state->submsg_o_l;
	lsh_u32* const odd_r = i_state->submsg_o_r;

	__m128i even, odd;

	// left half, words 0..3
	even = _mm_shuffle_epi32(
		_mm_loadu_si128(CONST_M128_CAST(even_l+0)), _MM_SHUFFLE(3,2,1,0));
	odd = _mm_shuffle_epi32(
		_mm_loadu_si128(CONST_M128_CAST(odd_l+0)), _MM_SHUFFLE(1,0,2,3));
	_mm_storeu_si128(M128_CAST(odd_l+0), _mm_add_epi32(even, odd));

	// left half, words 4..7
	even = _mm_shuffle_epi32(
		_mm_loadu_si128(CONST_M128_CAST(even_l+4)), _MM_SHUFFLE(3,2,1,0));
	odd = _mm_shuffle_epi32(
		_mm_loadu_si128(CONST_M128_CAST(odd_l+4)), _MM_SHUFFLE(2,1,0,3));
	_mm_storeu_si128(M128_CAST(odd_l+4), _mm_add_epi32(even, odd));

	// right half, words 0..3
	even = _mm_shuffle_epi32(
		_mm_loadu_si128(CONST_M128_CAST(even_r+0)), _MM_SHUFFLE(3,2,1,0));
	odd = _mm_shuffle_epi32(
		_mm_loadu_si128(CONST_M128_CAST(odd_r+0)), _MM_SHUFFLE(1,0,2,3));
	_mm_storeu_si128(M128_CAST(odd_r+0), _mm_add_epi32(even, odd));

	// right half, words 4..7
	even = _mm_shuffle_epi32(
		_mm_loadu_si128(CONST_M128_CAST(even_r+4)), _MM_SHUFFLE(3,2,1,0));
	odd = _mm_shuffle_epi32(
		_mm_loadu_si128(CONST_M128_CAST(odd_r+4)), _MM_SHUFFLE(2,1,0,3));
	_mm_storeu_si128(M128_CAST(odd_r+4), _mm_add_epi32(even, odd));
}
260 
// Points *p_const_v at word i of LSH256_StepConstants. No range check is
// made; callers in this file pass multiples of 8.
inline void load_sc(const lsh_u32** p_const_v, size_t i)
{
	CRYPTOPP_ASSERT(p_const_v != NULLPTR);

	const lsh_u32* const entry = LSH256_StepConstants + i;
	*p_const_v = entry;
}
267 
// XORs the even sub-messages into the chaining value:
// cv_l ^= even left sub-message, cv_r ^= even right sub-message.
inline void msg_add_even(lsh_u32 cv_l[8], lsh_u32 cv_r[8], LSH256_SSSE3_Internal* i_state)
{
	CRYPTOPP_ASSERT(i_state != NULLPTR);

	const lsh_u32* const msg_l = i_state->submsg_e_l;
	const lsh_u32* const msg_r = i_state->submsg_e_r;

	__m128i cv, msg;

	cv = _mm_loadu_si128(CONST_M128_CAST(cv_l+0));
	msg = _mm_loadu_si128(CONST_M128_CAST(msg_l+0));
	_mm_storeu_si128(M128_CAST(cv_l+0), _mm_xor_si128(cv, msg));

	cv = _mm_loadu_si128(CONST_M128_CAST(cv_l+4));
	msg = _mm_loadu_si128(CONST_M128_CAST(msg_l+4));
	_mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128(cv, msg));

	cv = _mm_loadu_si128(CONST_M128_CAST(cv_r+0));
	msg = _mm_loadu_si128(CONST_M128_CAST(msg_r+0));
	_mm_storeu_si128(M128_CAST(cv_r+0), _mm_xor_si128(cv, msg));

	cv = _mm_loadu_si128(CONST_M128_CAST(cv_r+4));
	msg = _mm_loadu_si128(CONST_M128_CAST(msg_r+4));
	_mm_storeu_si128(M128_CAST(cv_r+4), _mm_xor_si128(cv, msg));
}
288 
// XORs the odd sub-messages into the chaining value:
// cv_l ^= odd left sub-message, cv_r ^= odd right sub-message.
inline void msg_add_odd(lsh_u32 cv_l[8], lsh_u32 cv_r[8], LSH256_SSSE3_Internal* i_state)
{
	CRYPTOPP_ASSERT(i_state != NULLPTR);

	const lsh_u32* const msg_l = i_state->submsg_o_l;
	const lsh_u32* const msg_r = i_state->submsg_o_r;

	__m128i cv, msg;

	cv = _mm_loadu_si128(CONST_M128_CAST(cv_l));
	msg = _mm_loadu_si128(CONST_M128_CAST(msg_l));
	_mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128(cv, msg));

	cv = _mm_loadu_si128(CONST_M128_CAST(cv_l+4));
	msg = _mm_loadu_si128(CONST_M128_CAST(msg_l+4));
	_mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128(cv, msg));

	cv = _mm_loadu_si128(CONST_M128_CAST(cv_r));
	msg = _mm_loadu_si128(CONST_M128_CAST(msg_r));
	_mm_storeu_si128(M128_CAST(cv_r), _mm_xor_si128(cv, msg));

	cv = _mm_loadu_si128(CONST_M128_CAST(cv_r+4));
	msg = _mm_loadu_si128(CONST_M128_CAST(msg_r+4));
	_mm_storeu_si128(M128_CAST(cv_r+4), _mm_xor_si128(cv, msg));
}
309 
// Adds the eight words of cv_r into cv_l, lane by lane with 32-bit
// wrap-around. Only cv_l is written. Note that mix() also calls this with
// the two halves swapped, so the parameter names denote position only.
inline void add_blk(lsh_u32 cv_l[8], const lsh_u32 cv_r[8])
{
	__m128i acc, addend;

	acc = _mm_loadu_si128(CONST_M128_CAST(cv_l));
	addend = _mm_loadu_si128(CONST_M128_CAST(cv_r));
	_mm_storeu_si128(M128_CAST(cv_l), _mm_add_epi32(acc, addend));

	acc = _mm_loadu_si128(CONST_M128_CAST(cv_l+4));
	addend = _mm_loadu_si128(CONST_M128_CAST(cv_r+4));
	_mm_storeu_si128(M128_CAST(cv_l+4), _mm_add_epi32(acc, addend));
}
319 
// Rotates each of the eight 32-bit words in cv left by R bits, in place.
// With XOP available the rotate intrinsic is used; otherwise the rotation
// is assembled from a left shift, a right shift and an OR.
template <unsigned int R>
inline void rotate_blk(lsh_u32 cv[8])
{
	const __m128i lo = _mm_loadu_si128(CONST_M128_CAST(cv));
	const __m128i hi = _mm_loadu_si128(CONST_M128_CAST(cv+4));

#if defined(CRYPTOPP_XOP_AVAILABLE)
	_mm_storeu_si128(M128_CAST(cv), _mm_roti_epi32(lo, R));
	_mm_storeu_si128(M128_CAST(cv+4), _mm_roti_epi32(hi, R));
#else
	_mm_storeu_si128(M128_CAST(cv), _mm_or_si128(
		_mm_slli_epi32(lo, R), _mm_srli_epi32(lo, 32-R)));
	_mm_storeu_si128(M128_CAST(cv+4), _mm_or_si128(
		_mm_slli_epi32(hi, R), _mm_srli_epi32(hi, 32-R)));
#endif
}
337 
// XORs eight step-constant words from const_v into cv_l, in place.
inline void xor_with_const(lsh_u32* cv_l, const lsh_u32* const_v)
{
	__m128i cv, sc;

	cv = _mm_loadu_si128(CONST_M128_CAST(cv_l));
	sc = _mm_loadu_si128(CONST_M128_CAST(const_v));
	_mm_storeu_si128(M128_CAST(cv_l), _mm_xor_si128(cv, sc));

	cv = _mm_loadu_si128(CONST_M128_CAST(cv_l+4));
	sc = _mm_loadu_si128(CONST_M128_CAST(const_v+4));
	_mm_storeu_si128(M128_CAST(cv_l+4), _mm_xor_si128(cv, sc));
}
347 
// Rotates word k of cv_r left by g_gamma256[k] bits, where
// g_gamma256[8] = { 0, 8, 16, 24, 24, 16, 8, 0 }. Every amount is a whole
// number of bytes, so each half is handled with one byte shuffle.
inline void rotate_msg_gamma(lsh_u32 cv_r[8])
{
	// Byte-shuffle masks for words 0..3 and words 4..7 respectively.
	const __m128i mask_lo = _mm_set_epi8(12,15,14,13, 9,8,11,10, 6,5,4,7, 3,2,1,0);
	const __m128i mask_hi = _mm_set_epi8(15,14,13,12, 10,9,8,11, 5,4,7,6, 0,3,2,1);

	const __m128i lo = _mm_loadu_si128(CONST_M128_CAST(cv_r+0));
	_mm_storeu_si128(M128_CAST(cv_r+0), _mm_shuffle_epi8(lo, mask_lo));

	const __m128i hi = _mm_loadu_si128(CONST_M128_CAST(cv_r+4));
	_mm_storeu_si128(M128_CAST(cv_r+4), _mm_shuffle_epi8(hi, mask_hi));
}
358 
// Word permutation applied after each mix step. Words are first permuted
// inside each 4-word vector, then the four vectors change places:
//   new cv_l[0..3] = permuted old cv_l[4..7]
//   new cv_l[4..7] = permuted old cv_r[4..7]
//   new cv_r[0..3] = permuted old cv_l[0..3]
//   new cv_r[4..7] = permuted old cv_r[0..3]
// All four vectors are loaded before any store, so cv_l and cv_r must not
// overlap; the callers in this file pass the two halves of the state.
inline void word_perm(lsh_u32 cv_l[8], lsh_u32 cv_r[8])
{
	const __m128i l_lo = _mm_shuffle_epi32(
		_mm_loadu_si128(CONST_M128_CAST(cv_l+0)), _MM_SHUFFLE(3,1,0,2));
	const __m128i l_hi = _mm_shuffle_epi32(
		_mm_loadu_si128(CONST_M128_CAST(cv_l+4)), _MM_SHUFFLE(3,1,0,2));
	const __m128i r_lo = _mm_shuffle_epi32(
		_mm_loadu_si128(CONST_M128_CAST(cv_r+0)), _MM_SHUFFLE(1,2,3,0));
	const __m128i r_hi = _mm_shuffle_epi32(
		_mm_loadu_si128(CONST_M128_CAST(cv_r+4)), _MM_SHUFFLE(1,2,3,0));

	_mm_storeu_si128(M128_CAST(cv_l+0), l_hi);
	_mm_storeu_si128(M128_CAST(cv_l+4), r_hi);
	_mm_storeu_si128(M128_CAST(cv_r+0), l_lo);
	_mm_storeu_si128(M128_CAST(cv_r+4), r_lo);
}
379 
380 /* -------------------------------------------------------- *
381 * step function
382 * -------------------------------------------------------- */
383 
// One mix step on the chaining value. Alpha and Beta are the left-rotation
// amounts for this step; const_v supplies its eight step constants.
// The order of the calls is significant.
template <unsigned int Alpha, unsigned int Beta>
inline void mix(lsh_u32 cv_l[8], lsh_u32 cv_r[8], const lsh_u32 const_v[8])
{
	// left = rotl(left + right, Alpha) ^ constants
	add_blk(cv_l, cv_r);
	rotate_blk<Alpha>(cv_l);
	xor_with_const(cv_l, const_v);

	// right = rotl(right + left, Beta)
	add_blk(cv_r, cv_l);
	rotate_blk<Beta>(cv_r);

	// left += right, then the per-word byte rotation of right
	add_blk(cv_l, cv_r);
	rotate_msg_gamma(cv_r);
}
395 
396 /* -------------------------------------------------------- *
397 * compression function
398 * -------------------------------------------------------- */
399 
// Absorbs one 128-byte message block into the chaining value held in ctx.
// The block is copied into the sub-message buffers, then NUM_STEPS mix
// steps run (alternating even and odd), each preceded by a message
// addition and followed by the word permutation. A last even expansion and
// message addition close the function.
inline void compress(LSH256_SSSE3_Context* ctx, const lsh_u8 pdMsgBlk[LSH256_MSG_BLK_BYTE_LEN])
{
	CRYPTOPP_ASSERT(ctx != NULLPTR);

	lsh_u32* const left = ctx->cv_l;
	lsh_u32* const right = ctx->cv_r;

	// The sub-message views sit at fixed offsets from the start of the state.
	LSH256_SSSE3_Internal msgs(left);
	const lsh_u32* sc = NULLPTR;

	load_msg_blk(&msgs, pdMsgBlk);

	// Steps 0 and 1 use the sub-messages exactly as loaded.
	msg_add_even(left, right, &msgs);
	load_sc(&sc, 0);
	mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(left, right, sc);
	word_perm(left, right);

	msg_add_odd(left, right, &msgs);
	load_sc(&sc, 8);
	mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(left, right, sc);
	word_perm(left, right);

	// Remaining step pairs; each pair consumes 16 step-constant words.
	for (size_t pair = 1; pair != NUM_STEPS / 2; ++pair)
	{
		const size_t base = 16 * pair;

		msg_exp_even(&msgs);
		msg_add_even(left, right, &msgs);
		load_sc(&sc, base);
		mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(left, right, sc);
		word_perm(left, right);

		msg_exp_odd(&msgs);
		msg_add_odd(left, right, &msgs);
		load_sc(&sc, base + 8);
		mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(left, right, sc);
		word_perm(left, right);
	}

	// Final message addition.
	msg_exp_even(&msgs);
	msg_add_even(left, right, &msgs);
}
441 
442 /* -------------------------------------------------------- */
443 
// Copies a 16-word IV into the chaining value: iv[0..7] to cv_l and
// iv[8..15] to cv_r. The IV is read with aligned loads, so iv must be
// 16-byte aligned; the destination is written with unaligned stores.
inline void load_iv(lsh_u32 cv_l[8], lsh_u32 cv_r[8], const lsh_u32 iv[16])
{
	const __m128i iv0 = _mm_load_si128(CONST_M128_CAST(iv+ 0));
	_mm_storeu_si128(M128_CAST(cv_l+ 0), iv0);

	const __m128i iv1 = _mm_load_si128(CONST_M128_CAST(iv+ 4));
	_mm_storeu_si128(M128_CAST(cv_l+ 4), iv1);

	const __m128i iv2 = _mm_load_si128(CONST_M128_CAST(iv+ 8));
	_mm_storeu_si128(M128_CAST(cv_r+ 0), iv2);

	const __m128i iv3 = _mm_load_si128(CONST_M128_CAST(iv+12));
	_mm_storeu_si128(M128_CAST(cv_r+ 4), iv3);
}
455 
// Clears all sixteen words of the chaining value (cv_l and cv_r).
inline void zero_iv(lsh_u32 cv_l[8], lsh_u32 cv_r[8])
{
	const __m128i zero = _mm_setzero_si128();

	_mm_storeu_si128(M128_CAST(cv_l+0), zero);
	_mm_storeu_si128(M128_CAST(cv_l+4), zero);
	_mm_storeu_si128(M128_CAST(cv_r+0), zero);
	_mm_storeu_si128(M128_CAST(cv_r+4), zero);
}
463 
// Clears the 32 words of sub-message storage referenced by ctx->sub_msgs.
inline void zero_submsgs(LSH256_SSSE3_Context* ctx)
{
	lsh_u32* const words = ctx->sub_msgs;
	const __m128i zero = _mm_setzero_si128();

	// Eight 128-bit stores, four words each.
	for (unsigned int w = 0; w < 32; w += 4)
		_mm_storeu_si128(M128_CAST(words+w), zero);
}
477 
// Prepares ctx for the 224-bit variant: clears the sub-message buffers and
// loads LSH256_IV224 into the chaining value.
inline void init224(LSH256_SSSE3_Context* ctx)
{
	CRYPTOPP_ASSERT(ctx != NULLPTR);

	zero_submsgs(ctx);

	lsh_u32* const left = ctx->cv_l;
	lsh_u32* const right = ctx->cv_r;
	load_iv(left, right, LSH256_IV224);
}
485 
// Prepares ctx for the 256-bit variant: clears the sub-message buffers and
// loads LSH256_IV256 into the chaining value.
inline void init256(LSH256_SSSE3_Context* ctx)
{
	CRYPTOPP_ASSERT(ctx != NULLPTR);

	zero_submsgs(ctx);

	lsh_u32* const left = ctx->cv_l;
	lsh_u32* const right = ctx->cv_r;
	load_iv(left, right, LSH256_IV256);
}
493 
494 /* -------------------------------------------------------- */
495 
// Finalization: folds the right half of the chaining value into the left
// half with XOR (cv_l ^= cv_r). cv_r is left unchanged.
inline void fin(LSH256_SSSE3_Context* ctx)
{
	CRYPTOPP_ASSERT(ctx != NULLPTR);

	lsh_u32* const left = ctx->cv_l;
	const lsh_u32* const right = ctx->cv_r;

	__m128i l, r;

	l = _mm_loadu_si128(CONST_M128_CAST(left+0));
	r = _mm_loadu_si128(CONST_M128_CAST(right+0));
	_mm_storeu_si128(M128_CAST(left+0), _mm_xor_si128(l, r));

	l = _mm_loadu_si128(CONST_M128_CAST(left+4));
	r = _mm_loadu_si128(CONST_M128_CAST(right+4));
	_mm_storeu_si128(M128_CAST(left+4), _mm_xor_si128(l, r));
}
507 
508 /* -------------------------------------------------------- */
509 
// Copies the digest out of cv_l. The byte count comes from the low 16 bits
// of the algorithm type; when the top byte of the type is non-zero, that
// many low bits of the last copied byte are cleared.
// pbHashVal must have room for LSH_GET_HASHBYTE(alg_type) bytes.
inline void get_hash(LSH256_SSSE3_Context* ctx, lsh_u8* pbHashVal)
{
	CRYPTOPP_ASSERT(ctx != NULLPTR);
	CRYPTOPP_ASSERT(ctx->alg_type != 0);
	CRYPTOPP_ASSERT(pbHashVal != NULLPTR);

	const lsh_uint type = ctx->alg_type;
	const lsh_uint byte_count = LSH_GET_HASHBYTE(type);
	const lsh_uint drop_bits = LSH_GET_SMALL_HASHBIT(type);

	memcpy(pbHashVal, ctx->cv_l, byte_count);

	if (drop_bits != 0)
		pbHashVal[byte_count-1] &= (static_cast<lsh_u8>(0xff) << drop_bits);
}
526 
527 /* -------------------------------------------------------- */
528 
// Resets ctx for a new message. The two predefined types load a stored IV.
// Any other type derives its chaining value by zeroing it, writing the
// maximum digest byte length and the digest bit length into cv_l[0] and
// cv_l[1], and running NUM_STEPS mix steps with no message input.
// Always returns LSH_SUCCESS.
lsh_err lsh256_ssse3_init(LSH256_SSSE3_Context* ctx)
{
	CRYPTOPP_ASSERT(ctx != NULLPTR);
	CRYPTOPP_ASSERT(ctx->alg_type != 0);

	const lsh_u32 type = ctx->alg_type;
	ctx->remain_databitlen = 0;

	if (type == LSH_TYPE_256_256)
	{
		init256(ctx);
		return LSH_SUCCESS;
	}
	if (type == LSH_TYPE_256_224)
	{
		init224(ctx);
		return LSH_SUCCESS;
	}

	// Non-standard type: generate the chaining value.
	lsh_u32* const left = ctx->cv_l;
	lsh_u32* const right = ctx->cv_r;

	zero_iv(left, right);
	left[0] = LSH256_HASH_VAL_MAX_BYTE_LEN;
	left[1] = LSH_GET_HASHBIT(type);

	const lsh_u32* sc = NULLPTR;
	for (size_t pair = 0; pair != NUM_STEPS / 2; ++pair)
	{
		const size_t base = pair * 16;

		load_sc(&sc, base);
		mix<ROT_EVEN_ALPHA, ROT_EVEN_BETA>(left, right, sc);
		word_perm(left, right);

		load_sc(&sc, base + 8);
		mix<ROT_ODD_ALPHA, ROT_ODD_BETA>(left, right, sc);
		word_perm(left, right);
	}

	return LSH_SUCCESS;
}
571 
// Feeds databitlen bits of data into the hash. Input is handled in whole
// bytes (databitlen is asserted to be a multiple of 8). Bytes that do not
// complete a 128-byte block are kept in ctx->last_block and counted, in
// bits, in ctx->remain_databitlen.
//
// Returns LSH_SUCCESS, or LSH_ERR_INVALID_STATE when the buffered count
// already amounts to a full block or more.
lsh_err lsh256_ssse3_update(LSH256_SSSE3_Context* ctx, const lsh_u8* data, size_t databitlen)
{
	CRYPTOPP_ASSERT(ctx != NULLPTR);
	CRYPTOPP_ASSERT(data != NULLPTR);
	CRYPTOPP_ASSERT(databitlen % 8 == 0);
	CRYPTOPP_ASSERT(ctx->alg_type != 0);

	if (databitlen == 0)
		return LSH_SUCCESS;

	// We are byte oriented, so both bit remainders are always 0. The
	// branches that test them are retained but never taken.
	const size_t tail_bits = 0;
	const size_t buffered_bits = 0;

	size_t input_bytes = databitlen >> 3;
	size_t buffered = ctx->remain_databitlen >> 3;

	if (buffered >= LSH256_MSG_BLK_BYTE_LEN)
		return LSH_ERR_INVALID_STATE;
	if (buffered_bits > 0)
		return LSH_ERR_INVALID_DATABITLEN;

	// Not enough for a full block: just append to the buffer.
	if (input_bytes + buffered < LSH256_MSG_BLK_BYTE_LEN)
	{
		memcpy(ctx->last_block + buffered, data, input_bytes);
		ctx->remain_databitlen += static_cast<lsh_uint>(databitlen);
		buffered += static_cast<lsh_uint>(input_bytes);
		if (tail_bits)
			ctx->last_block[buffered] = data[input_bytes] & ((0xff >> tail_bits) ^ 0xff);
		return LSH_SUCCESS;
	}

	// Top up a partially filled buffer and compress it.
	if (buffered > 0)
	{
		const size_t fill = LSH256_MSG_BLK_BYTE_LEN - buffered;
		memcpy(ctx->last_block + buffered, data, fill);
		compress(ctx, ctx->last_block);
		data += fill;
		input_bytes -= fill;
		buffered = 0;
		ctx->remain_databitlen = 0;
	}

	// Compress full blocks straight from the input. The data pointer may be
	// unaligned here, which is why the block loader uses unaligned loads.
	for (; input_bytes >= LSH256_MSG_BLK_BYTE_LEN; input_bytes -= LSH256_MSG_BLK_BYTE_LEN)
	{
		compress(ctx, data);
		data += LSH256_MSG_BLK_BYTE_LEN;
	}

	// Keep whatever is left for the next call.
	if (input_bytes > 0)
	{
		memcpy(ctx->last_block, data, input_bytes);
		ctx->remain_databitlen = static_cast<lsh_uint>(input_bytes << 3);
	}

	if (tail_bits)
	{
		ctx->last_block[input_bytes] = data[input_bytes] & ((0xff >> tail_bits) ^ 0xff);
		ctx->remain_databitlen += tail_bits;
	}

	return LSH_SUCCESS;
}
642 
// Completes the hash: pads the buffered bytes with 0x80 followed by zeros
// up to a full block, compresses that block, folds the chaining value with
// fin() and writes the digest to hashval through get_hash().
//
// Returns LSH_SUCCESS, or LSH_ERR_INVALID_STATE when the buffered count
// already amounts to a full block or more.
lsh_err lsh256_ssse3_final(LSH256_SSSE3_Context* ctx, lsh_u8* hashval)
{
	CRYPTOPP_ASSERT(ctx != NULLPTR);
	CRYPTOPP_ASSERT(hashval != NULLPTR);

	// We are byte oriented, so there are never stray tail bits.
	const size_t used = ctx->remain_databitlen >> 3;
	const size_t stray_bits = 0;

	if (used >= LSH256_MSG_BLK_BYTE_LEN)
		return LSH_ERR_INVALID_STATE;

	lsh_u8* const block = ctx->last_block;

	if (stray_bits)
		block[used] |= (0x1 << (7 - stray_bits));
	else
		block[used] = 0x80;

	memset(block + used + 1, 0, LSH256_MSG_BLK_BYTE_LEN - used - 1);

	compress(ctx, block);

	fin(ctx);
	get_hash(ctx, hashval);

	return LSH_SUCCESS;
}
672 
673 ANONYMOUS_NAMESPACE_END  // Anonymous
674 
NAMESPACE_BEGIN(CryptoPP)675 NAMESPACE_BEGIN(CryptoPP)
676 
// Resets the hash state for a new message. state is the word32 state array;
// state[AlgorithmType] must already hold the algorithm type.
// Throws Exception(OTHER_ERROR) if initialization reports a failure.
extern
void LSH256_Base_Restart_SSSE3(word32* state)
{
	state[RemainingBits] = 0;
	LSH256_SSSE3_Context ctx(state, state[AlgorithmType], state[RemainingBits]);

	if (lsh256_ssse3_init(&ctx) != LSH_SUCCESS)
		throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_ssse3_init failed");
}
687 
// Hashes size bytes from input into the state array.
// Throws Exception(OTHER_ERROR) if the worker reports a failure.
//
// The worker takes a length in bits. Computing 8*size directly can wrap
// size_t when size exceeds SIZE_MAX/8, and the worker would then see a
// truncated (possibly zero) length and skip input. The input is therefore
// fed in pieces whose bit count always fits in size_t. The context refers to
// the state array, so buffered bytes carry over between pieces.
extern
void LSH256_Base_Update_SSSE3(word32* state, const byte *input, size_t size)
{
	LSH256_SSSE3_Context ctx(state, state[AlgorithmType], state[RemainingBits]);

	// Largest byte count whose bit count is representable in size_t.
	const size_t maxPiece = static_cast<size_t>(-1) / 8;

	// do/while keeps the original behavior of calling the worker once even
	// when size is 0.
	do
	{
		const size_t piece = (size < maxPiece) ? size : maxPiece;
		const lsh_err err = lsh256_ssse3_update(&ctx, input, 8*piece);

		if (err != LSH_SUCCESS)
			throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_ssse3_update failed");

		input += piece;
		size -= piece;
	}
	while (size > 0);
}
697 
// Finishes the hash and writes the digest to hash. The third (unnamed)
// parameter is ignored: the number of bytes written is determined by the
// algorithm type stored in state, so hash must be large enough for that.
// Throws Exception(OTHER_ERROR) if finalization reports a failure.
extern
void LSH256_Base_TruncatedFinal_SSSE3(word32* state, byte *hash, size_t)
{
	LSH256_SSSE3_Context ctx(state, state[AlgorithmType], state[RemainingBits]);

	if (lsh256_ssse3_final(&ctx, hash) != LSH_SUCCESS)
		throw Exception(Exception::OTHER_ERROR, "LSH256_Base: lsh256_ssse3_final failed");
}
707 
708 NAMESPACE_END
709 
710 #endif  // CRYPTOPP_SSSE3_AVAILABLE
711