1 /*******************************************************************************
2 Copyright (c) 2009-2020, Intel Corporation
3
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
6
7 * Redistributions of source code must retain the above copyright notice,
8 this list of conditions and the following disclaimer.
9 * Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in the
11 documentation and/or other materials provided with the distribution.
12 * Neither the name of Intel Corporation nor the names of its contributors
13 may be used to endorse or promote products derived from this software
14 without specific prior written permission.
15
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
27
28 /*-----------------------------------------------------------------------
29 * zuc_avx.c
30 *-----------------------------------------------------------------------
31 * An implementation of ZUC, the core algorithm for the
32 * 3GPP Confidentiality and Integrity algorithms.
33 *
34 *-----------------------------------------------------------------------*/
35
36 #include <string.h>
37
38 #include "include/zuc_internal.h"
39 #include "include/wireless_common.h"
40 #include "include/save_xmms.h"
41 #include "include/clear_regs_mem.h"
42 #include "intel-ipsec-mb.h"
43
/* XMM save/restore helpers used by the public entry points in this file;
 * they are only invoked when LINUX is not defined */
#define SAVE_XMMS save_xmms_avx
#define RESTORE_XMMS restore_xmms_avx
/* Called together with CLEAR_SCRATCH_GPS() in SAFE_DATA builds */
#define CLEAR_SCRATCH_SIMD_REGS clear_scratch_xmms_avx

/* Number of buffers handled per call by the *_4_buffer_* routines */
#define NUM_AVX_BUFS 4
/* Size in bytes of one full keystream generation round */
#define KEYSTR_ROUND_LEN 16
50
51 static inline
_zuc_eea3_1_buffer_avx(const void * pKey,const void * pIv,const void * pBufferIn,void * pBufferOut,const uint32_t length)52 void _zuc_eea3_1_buffer_avx(const void *pKey,
53 const void *pIv,
54 const void *pBufferIn,
55 void *pBufferOut,
56 const uint32_t length)
57 {
58 DECLARE_ALIGNED(ZucState_t zucState, 16);
59 DECLARE_ALIGNED(uint8_t keyStream[KEYSTR_ROUND_LEN], 16);
60 const uint64_t *pIn64 = NULL;
61 uint64_t *pOut64 = NULL, *pKeyStream64 = NULL;
62 uint64_t *pTemp64 = NULL, *pdstTemp64 = NULL;
63
64 uint32_t numKeyStreamsPerPkt = length/ KEYSTR_ROUND_LEN;
65 const uint32_t numBytesLeftOver = length % KEYSTR_ROUND_LEN;
66
67 /* initialize the zuc state */
68 asm_ZucInitialization_avx(pKey, pIv, &(zucState));
69
70 /* Loop Over all the Quad-Words in input buffer and XOR with the
71 * 16 bytes of generated keystream */
72 pOut64 = (uint64_t *) pBufferOut;
73 pIn64 = (const uint64_t *) pBufferIn;
74
75 while (numKeyStreamsPerPkt--) {
76 /* Generate the key stream 16 bytes at a time */
77 asm_ZucGenKeystream16B_avx((uint32_t *) &keyStream[0],
78 &zucState);
79
80 /* XOR The Keystream generated with the input buffer here */
81 pKeyStream64 = (uint64_t *) keyStream;
82 asm_XorKeyStream16B_avx(pIn64, pOut64, pKeyStream64);
83 pIn64 += 2;
84 pOut64 += 2;
85 }
86
87 /* Check for remaining 0 to 15 bytes */
88 if (numBytesLeftOver) {
89 /* buffer to store 16 bytes of keystream */
90 DECLARE_ALIGNED(uint8_t tempSrc[KEYSTR_ROUND_LEN], 16);
91 DECLARE_ALIGNED(uint8_t tempDst[KEYSTR_ROUND_LEN], 16);
92 const uint8_t *pIn8 = (const uint8_t *) pBufferIn;
93 uint8_t *pOut8 = (uint8_t *) pBufferOut;
94 const uint64_t num4BRounds = ((numBytesLeftOver - 1) / 4) + 1;
95
96 asm_ZucGenKeystream_avx((uint32_t *) &keyStream[0],
97 &zucState, num4BRounds);
98
99 /* copy the remaining bytes into temporary buffer and XOR with
100 * the 16-bytes of keystream. Then copy on the valid bytes back
101 * to the output buffer */
102
103 memcpy(&tempSrc[0], &pIn8[length - numBytesLeftOver],
104 numBytesLeftOver);
105 pKeyStream64 = (uint64_t *) &keyStream[0];
106 pTemp64 = (uint64_t *) &tempSrc[0];
107 pdstTemp64 = (uint64_t *) &tempDst[0];
108
109 asm_XorKeyStream16B_avx(pTemp64, pdstTemp64,
110 pKeyStream64);
111 memcpy(&pOut8[length - numBytesLeftOver], &tempDst[0],
112 numBytesLeftOver);
113
114 #ifdef SAFE_DATA
115 clear_mem(tempSrc, sizeof(tempSrc));
116 clear_mem(tempDst, sizeof(tempDst));
117 #endif
118 }
119 #ifdef SAFE_DATA
120 /* Clear sensitive data in stack */
121 clear_mem(keyStream, sizeof(keyStream));
122 clear_mem(&zucState, sizeof(zucState));
123 #endif
124 }
125
126 IMB_DLL_LOCAL
_zuc_eea3_4_buffer_avx(const void * const pKey[NUM_AVX_BUFS],const void * const pIv[NUM_AVX_BUFS],const void * const pBufferIn[NUM_AVX_BUFS],void * pBufferOut[NUM_AVX_BUFS],const uint32_t length[NUM_AVX_BUFS])127 void _zuc_eea3_4_buffer_avx(const void * const pKey[NUM_AVX_BUFS],
128 const void * const pIv[NUM_AVX_BUFS],
129 const void * const pBufferIn[NUM_AVX_BUFS],
130 void *pBufferOut[NUM_AVX_BUFS],
131 const uint32_t length[NUM_AVX_BUFS])
132 {
133 DECLARE_ALIGNED(ZucState4_t state, 16);
134 DECLARE_ALIGNED(ZucState_t singlePktState, 16);
135 unsigned int i;
136 /* Calculate the minimum input packet size */
137 uint32_t bytes1 = (length[0] < length[1] ?
138 length[0] : length[1]);
139 uint32_t bytes2 = (length[2] < length[3] ?
140 length[2] : length[3]);
141 /* min number of bytes */
142 uint32_t bytes = (bytes1 < bytes2) ? bytes1 : bytes2;
143 uint32_t numKeyStreamsPerPkt;
144 uint16_t remainBytes[NUM_AVX_BUFS] = {0};
145 DECLARE_ALIGNED(uint8_t keyStr[NUM_AVX_BUFS][KEYSTR_ROUND_LEN], 64);
146 /* structure to store the 4 keys */
147 DECLARE_ALIGNED(ZucKey4_t keys, 64);
148 /* structure to store the 4 IV's */
149 DECLARE_ALIGNED(ZucIv4_t ivs, 64);
150 uint32_t numBytesLeftOver = 0;
151 const uint8_t *pTempBufInPtr = NULL;
152 uint8_t *pTempBufOutPtr = NULL;
153 const uint64_t *pIn64[NUM_AVX_BUFS]= {NULL};
154 uint64_t *pOut64[NUM_AVX_BUFS] = {NULL};
155 uint64_t *pKeyStream64 = NULL;
156
157 /*
158 * Calculate the number of bytes left over for each packet,
159 * and setup the Keys and IVs
160 */
161 for (i = 0; i< NUM_AVX_BUFS; i++) {
162 remainBytes[i] = length[i];
163 keys.pKeys[i] = pKey[i];
164 ivs.pIvs[i] = pIv[i];
165 }
166
167 asm_ZucInitialization_4_avx( &keys, &ivs, &state);
168
169 for (i = 0; i < NUM_AVX_BUFS; i++) {
170 pOut64[i] = (uint64_t *) pBufferOut[i];
171 pIn64[i] = (const uint64_t *) pBufferIn[i];
172 }
173
174 /* Encrypt common length of all buffers */
175 asm_ZucCipher_4_avx(&state, pIn64, pOut64, remainBytes,
176 (uint16_t) bytes);
177
178 /* process each packet separately for the remaining bytes */
179 for (i = 0; i < NUM_AVX_BUFS; i++) {
180 if (remainBytes[i]) {
181 /* need to copy the zuc state to single packet state */
182 singlePktState.lfsrState[0] = state.lfsrState[0][i];
183 singlePktState.lfsrState[1] = state.lfsrState[1][i];
184 singlePktState.lfsrState[2] = state.lfsrState[2][i];
185 singlePktState.lfsrState[3] = state.lfsrState[3][i];
186 singlePktState.lfsrState[4] = state.lfsrState[4][i];
187 singlePktState.lfsrState[5] = state.lfsrState[5][i];
188 singlePktState.lfsrState[6] = state.lfsrState[6][i];
189 singlePktState.lfsrState[7] = state.lfsrState[7][i];
190 singlePktState.lfsrState[8] = state.lfsrState[8][i];
191 singlePktState.lfsrState[9] = state.lfsrState[9][i];
192 singlePktState.lfsrState[10] = state.lfsrState[10][i];
193 singlePktState.lfsrState[11] = state.lfsrState[11][i];
194 singlePktState.lfsrState[12] = state.lfsrState[12][i];
195 singlePktState.lfsrState[13] = state.lfsrState[13][i];
196 singlePktState.lfsrState[14] = state.lfsrState[14][i];
197 singlePktState.lfsrState[15] = state.lfsrState[15][i];
198
199 singlePktState.fR1 = state.fR1[i];
200 singlePktState.fR2 = state.fR2[i];
201
202 numKeyStreamsPerPkt = remainBytes[i] / KEYSTR_ROUND_LEN;
203 numBytesLeftOver = remainBytes[i] % KEYSTR_ROUND_LEN;
204
205 pTempBufInPtr = pBufferIn[i];
206 pTempBufOutPtr = pBufferOut[i];
207
208 /* update the output and input pointers here to point
209 * to the i'th buffers */
210 pOut64[0] = (uint64_t *) &pTempBufOutPtr[length[i] -
211 remainBytes[i]];
212 pIn64[0] = (const uint64_t *) &pTempBufInPtr[length[i] -
213 remainBytes[i]];
214
215 while (numKeyStreamsPerPkt--) {
216 /* Generate the key stream 16 bytes at a time */
217 asm_ZucGenKeystream16B_avx(
218 (uint32_t *) keyStr[0],
219 &singlePktState);
220 pKeyStream64 = (uint64_t *) keyStr[0];
221 asm_XorKeyStream16B_avx(pIn64[0], pOut64[0],
222 pKeyStream64);
223 pIn64[0] += 2;
224 pOut64[0] += 2;
225 }
226
227 /* Check for remaining 0 to 15 bytes */
228 if (numBytesLeftOver) {
229 DECLARE_ALIGNED(uint8_t tempSrc[16], 64);
230 DECLARE_ALIGNED(uint8_t tempDst[16], 64);
231 uint64_t *pTempSrc64;
232 uint64_t *pTempDst64;
233 uint32_t offset = length[i] - numBytesLeftOver;
234 const uint64_t num4BRounds =
235 ((numBytesLeftOver - 1) / 4) + 1;
236
237 asm_ZucGenKeystream_avx((uint32_t *)&keyStr[0],
238 &singlePktState,
239 num4BRounds);
240 /* copy the remaining bytes into temporary
241 * buffer and XOR with the 16 bytes of
242 * keystream. Then copy on the valid bytes back
243 * to the output buffer */
244 memcpy(&tempSrc[0], &pTempBufInPtr[offset],
245 numBytesLeftOver);
246 memset(&tempSrc[numBytesLeftOver], 0,
247 16 - numBytesLeftOver);
248
249 pKeyStream64 = (uint64_t *) &keyStr[0][0];
250 pTempSrc64 = (uint64_t *) &tempSrc[0];
251 pTempDst64 = (uint64_t *) &tempDst[0];
252 asm_XorKeyStream16B_avx(pTempSrc64, pTempDst64,
253 pKeyStream64);
254
255 memcpy(&pTempBufOutPtr[offset],
256 &tempDst[0], numBytesLeftOver);
257 #ifdef SAFE_DATA
258 clear_mem(tempSrc, sizeof(tempSrc));
259 clear_mem(tempDst, sizeof(tempDst));
260 #endif
261 }
262 }
263 }
264 #ifdef SAFE_DATA
265 /* Clear sensitive data in stack */
266 clear_mem(keyStr, sizeof(keyStr));
267 clear_mem(&singlePktState, sizeof(singlePktState));
268 clear_mem(&state, sizeof(state));
269 clear_mem(&keys, sizeof(keys));
270 #endif
271 }
272
zuc_eea3_1_buffer_avx(const void * pKey,const void * pIv,const void * pBufferIn,void * pBufferOut,const uint32_t length)273 void zuc_eea3_1_buffer_avx(const void *pKey,
274 const void *pIv,
275 const void *pBufferIn,
276 void *pBufferOut,
277 const uint32_t length)
278 {
279 #ifndef LINUX
280 DECLARE_ALIGNED(imb_uint128_t xmm_save[10], 16);
281
282 SAVE_XMMS(xmm_save);
283 #endif
284 #ifdef SAFE_PARAM
285 /* Check for NULL pointers */
286 if (pKey == NULL || pIv == NULL || pBufferIn == NULL ||
287 pBufferOut == NULL)
288 return;
289
290 /* Check input data is in range of supported length */
291 if (length < ZUC_MIN_BYTELEN || length > ZUC_MAX_BYTELEN)
292 return;
293 #endif
294 _zuc_eea3_1_buffer_avx(pKey, pIv, pBufferIn, pBufferOut, length);
295
296 #ifdef SAFE_DATA
297 /* Clear sensitive data in registers */
298 CLEAR_SCRATCH_GPS();
299 CLEAR_SCRATCH_SIMD_REGS();
300 #endif
301 #ifndef LINUX
302 RESTORE_XMMS(xmm_save);
303 #endif
304 }
305
zuc_eea3_4_buffer_avx(const void * const pKey[NUM_AVX_BUFS],const void * const pIv[NUM_AVX_BUFS],const void * const pBufferIn[NUM_AVX_BUFS],void * pBufferOut[NUM_AVX_BUFS],const uint32_t length[NUM_AVX_BUFS])306 void zuc_eea3_4_buffer_avx(const void * const pKey[NUM_AVX_BUFS],
307 const void * const pIv[NUM_AVX_BUFS],
308 const void * const pBufferIn[NUM_AVX_BUFS],
309 void *pBufferOut[NUM_AVX_BUFS],
310 const uint32_t length[NUM_AVX_BUFS])
311 {
312 #ifndef LINUX
313 DECLARE_ALIGNED(imb_uint128_t xmm_save[10], 16);
314
315 SAVE_XMMS(xmm_save);
316 #endif
317 #ifdef SAFE_PARAM
318 unsigned int i;
319
320 /* Check for NULL pointers */
321 if (pKey == NULL || pIv == NULL || pBufferIn == NULL ||
322 pBufferOut == NULL || length == NULL)
323 return;
324
325 for (i = 0; i < NUM_AVX_BUFS; i++) {
326 if (pKey[i] == NULL || pIv[i] == NULL ||
327 pBufferIn[i] == NULL || pBufferOut[i] == NULL)
328 return;
329
330 /* Check input data is in range of supported length */
331 if (length[i] < ZUC_MIN_BYTELEN || length[i] > ZUC_MAX_BYTELEN)
332 return;
333 }
334 #endif
335
336 _zuc_eea3_4_buffer_avx(pKey, pIv, pBufferIn, pBufferOut, length);
337
338 #ifdef SAFE_DATA
339 /* Clear sensitive data in registers */
340 CLEAR_SCRATCH_GPS();
341 CLEAR_SCRATCH_SIMD_REGS();
342 #endif
343 #ifndef LINUX
344 RESTORE_XMMS(xmm_save);
345 #endif
346 }
347
zuc_eea3_n_buffer_avx(const void * const pKey[],const void * const pIv[],const void * const pBufferIn[],void * pBufferOut[],const uint32_t length[],const uint32_t numBuffers)348 void zuc_eea3_n_buffer_avx(const void * const pKey[], const void * const pIv[],
349 const void * const pBufferIn[], void *pBufferOut[],
350 const uint32_t length[],
351 const uint32_t numBuffers)
352 {
353 #ifndef LINUX
354 DECLARE_ALIGNED(imb_uint128_t xmm_save[10], 16);
355
356 SAVE_XMMS(xmm_save);
357 #endif
358
359 unsigned int i;
360 unsigned int packetCount = numBuffers;
361
362 #ifdef SAFE_PARAM
363 /* Check for NULL pointers */
364 if (pKey == NULL || pIv == NULL || pBufferIn == NULL ||
365 pBufferOut == NULL || length == NULL)
366 return;
367
368 for (i = 0; i < numBuffers; i++) {
369 if (pKey[i] == NULL || pIv[i] == NULL ||
370 pBufferIn[i] == NULL || pBufferOut[i] == NULL)
371 return;
372
373 /* Check input data is in range of supported length */
374 if (length[i] < ZUC_MIN_BYTELEN || length[i] > ZUC_MAX_BYTELEN)
375 return;
376 }
377 #endif
378 i = 0;
379
380 while(packetCount >= 4) {
381 packetCount -=4;
382 _zuc_eea3_4_buffer_avx(&pKey[i],
383 &pIv[i],
384 &pBufferIn[i],
385 &pBufferOut[i],
386 &length[i]);
387 i+=4;
388 }
389
390 while(packetCount--) {
391 _zuc_eea3_1_buffer_avx(pKey[i],
392 pIv[i],
393 pBufferIn[i],
394 pBufferOut[i],
395 length[i]);
396 i++;
397 }
398 #ifdef SAFE_DATA
399 /* Clear sensitive data in registers */
400 CLEAR_SCRATCH_GPS();
401 CLEAR_SCRATCH_SIMD_REGS();
402 #endif
403 #ifndef LINUX
404 RESTORE_XMMS(xmm_save);
405 #endif
406 }
407
/*
 * Rotate the 64-bit value u left by r bit positions.
 *
 * The count is reduced modulo 64 and the complementary right-shift count is
 * masked as well, so r == 0 (and any multiple of 64) returns u unchanged
 * instead of shifting a 64-bit value by 64 bits, which is undefined
 * behavior in C. Callers in this file pass "bits % 32", which is 0 for
 * every word-aligned message length.
 */
static inline uint64_t rotate_left(uint64_t u, size_t r)
{
        const unsigned int n = (unsigned int) (r & 63);

        return (u << n) | (u >> ((64 - n) & 63));
}
412
/*
 * Read 8 bytes starting at ptr and return them as a uint64_t in host byte
 * order.
 *
 * memcpy is used instead of dereferencing a casted pointer: callers pass
 * addresses inside uint32_t keystream arrays, which are not guaranteed to
 * be 8-byte aligned and must not be accessed through a uint64_t lvalue.
 */
static inline uint64_t load_uint64(const void *ptr)
{
        uint64_t value;

        memcpy(&value, ptr, sizeof(value));
        return value;
}
417
418 static inline
_zuc_eia3_1_buffer_avx(const void * pKey,const void * pIv,const void * pBufferIn,const uint32_t lengthInBits,uint32_t * pMacI)419 void _zuc_eia3_1_buffer_avx(const void *pKey,
420 const void *pIv,
421 const void *pBufferIn,
422 const uint32_t lengthInBits,
423 uint32_t *pMacI)
424 {
425 DECLARE_ALIGNED(ZucState_t zucState, 64);
426 DECLARE_ALIGNED(uint32_t keyStream[4 * 2], 64);
427 const uint32_t keyStreamLengthInBits = KEYSTR_ROUND_LEN * 8;
428 /* generate a key-stream 2 words longer than the input message */
429 const uint32_t N = lengthInBits + (2 * ZUC_WORD_BITS);
430 uint32_t L = (N + 31) / ZUC_WORD_BITS;
431 uint32_t *pZuc = (uint32_t *) &keyStream[0];
432 uint32_t remainingBits = lengthInBits;
433 uint32_t T = 0;
434 const uint8_t *pIn8 = (const uint8_t *) pBufferIn;
435
436 asm_ZucInitialization_avx(pKey, pIv, &(zucState));
437 asm_ZucGenKeystream16B_avx(pZuc, &zucState);
438
439 /* loop over the message bits */
440 while (remainingBits >= keyStreamLengthInBits) {
441 remainingBits -= keyStreamLengthInBits;
442 L -= (keyStreamLengthInBits / 32);
443
444 /* Generate the next key stream 8 bytes or 16 bytes */
445 if (!remainingBits)
446 asm_ZucGenKeystream8B_avx(&keyStream[4], &zucState);
447 else
448 asm_ZucGenKeystream16B_avx(&keyStream[4], &zucState);
449 T = asm_Eia3Round16BAVX(T, keyStream, pIn8);
450 /* Copy the last keystream generated to the first 16 bytes */
451 memcpy(&keyStream[0], &keyStream[4], KEYSTR_ROUND_LEN);
452 pIn8 = &pIn8[KEYSTR_ROUND_LEN];
453 }
454
455 /*
456 * If remaining bits has more than 2 ZUC WORDS (double words),
457 * keystream needs to have up to another 2 ZUC WORDS (8B)
458 */
459 if (remainingBits > (2 * 32))
460 asm_ZucGenKeystream8B_avx(&keyStream[4], &zucState);
461 T ^= asm_Eia3RemainderAVX(&keyStream[0], pIn8, remainingBits);
462 T ^= rotate_left(load_uint64(&keyStream[remainingBits / 32]),
463 remainingBits % 32);
464
465 /* save the final MAC-I result */
466 uint32_t keyBlock = keyStream[L - 1];
467 *pMacI = bswap4(T ^ keyBlock);
468
469 #ifdef SAFE_DATA
470 /* Clear sensitive data (in registers and stack) */
471 clear_mem(keyStream, sizeof(keyStream));
472 clear_mem(&zucState, sizeof(zucState));
473 #endif
474 }
475
476 IMB_DLL_LOCAL
_zuc_eia3_4_buffer_avx(const void * const pKey[NUM_AVX_BUFS],const void * const pIv[NUM_AVX_BUFS],const void * const pBufferIn[NUM_AVX_BUFS],const uint32_t lengthInBits[NUM_AVX_BUFS],uint32_t * pMacI[NUM_AVX_BUFS])477 void _zuc_eia3_4_buffer_avx(const void * const pKey[NUM_AVX_BUFS],
478 const void * const pIv[NUM_AVX_BUFS],
479 const void * const pBufferIn[NUM_AVX_BUFS],
480 const uint32_t lengthInBits[NUM_AVX_BUFS],
481 uint32_t *pMacI[NUM_AVX_BUFS])
482 {
483 unsigned int i;
484 DECLARE_ALIGNED(ZucState4_t state, 64);
485 DECLARE_ALIGNED(ZucState_t singlePktState, 64);
486 DECLARE_ALIGNED(uint8_t keyStr[NUM_AVX_BUFS][2*KEYSTR_ROUND_LEN], 64);
487 /* structure to store the 4 keys */
488 DECLARE_ALIGNED(ZucKey4_t keys, 64);
489 /* structure to store the 4 IV's */
490 DECLARE_ALIGNED(ZucIv4_t ivs, 64);
491 const uint8_t *pIn8[NUM_AVX_BUFS] = {NULL};
492 uint32_t remainCommonBits;
493 uint32_t numKeyStr = 0;
494 uint32_t T[NUM_AVX_BUFS] = {0};
495 const uint32_t keyStreamLengthInBits = KEYSTR_ROUND_LEN * 8;
496 DECLARE_ALIGNED(uint32_t *pKeyStrArr[NUM_AVX_BUFS], 16) = {NULL};
497 unsigned int allCommonBits;
498
499 /* Check if all lengths are equal */
500 if ((lengthInBits[0] == lengthInBits[1]) &&
501 (lengthInBits[0] == lengthInBits[2]) &&
502 (lengthInBits[0] == lengthInBits[3])) {
503 remainCommonBits = lengthInBits[0];
504 allCommonBits = 1;
505 } else {
506 /* Calculate the minimum input packet size */
507 uint32_t bits1 = (lengthInBits[0] < lengthInBits[1] ?
508 lengthInBits[0] : lengthInBits[1]);
509 uint32_t bits2 = (lengthInBits[2] < lengthInBits[3] ?
510 lengthInBits[2] : lengthInBits[3]);
511
512 remainCommonBits = (bits1 < bits2) ? bits1 : bits2;
513 allCommonBits = 0;
514 }
515
516 for (i = 0; i < NUM_AVX_BUFS; i++) {
517 pIn8[i] = (const uint8_t *) pBufferIn[i];
518 pKeyStrArr[i] = (uint32_t *) &keyStr[i][0];
519 keys.pKeys[i] = pKey[i];
520 ivs.pIvs[i] = pIv[i];
521 }
522
523 asm_ZucInitialization_4_avx( &keys, &ivs, &state);
524
525 /* Generate 16 bytes at a time */
526 asm_ZucGenKeystream16B_4_avx(&state, pKeyStrArr);
527
528 /* Point at the next 16 bytes of the key */
529 for (i = 0; i < NUM_AVX_BUFS; i++)
530 pKeyStrArr[i] = (uint32_t *) &keyStr[i][KEYSTR_ROUND_LEN];
531
532 /* loop over the message bits */
533 while (remainCommonBits >= keyStreamLengthInBits) {
534 remainCommonBits -= keyStreamLengthInBits;
535 numKeyStr++;
536 /* Generate the next key stream 8 bytes or 16 bytes */
537 if (!remainCommonBits && allCommonBits)
538 asm_ZucGenKeystream8B_4_avx(&state, pKeyStrArr);
539 else
540 asm_ZucGenKeystream16B_4_avx(&state, pKeyStrArr);
541 for (i = 0; i < NUM_AVX_BUFS; i++) {
542 T[i] = asm_Eia3Round16BAVX(T[i], keyStr[i], pIn8[i]);
543 /* Copy the last keystream generated
544 * to the first 16 bytes */
545 memcpy(&keyStr[i][0], &keyStr[i][KEYSTR_ROUND_LEN],
546 KEYSTR_ROUND_LEN);
547 pIn8[i] = &pIn8[i][KEYSTR_ROUND_LEN];
548 }
549 }
550
551 /* Process each packet separately for the remaining bits */
552 for (i = 0; i < NUM_AVX_BUFS; i++) {
553 const uint32_t N = lengthInBits[i] + (2 * ZUC_WORD_BITS);
554 uint32_t L = ((N + 31) / ZUC_WORD_BITS) -
555 numKeyStr*(keyStreamLengthInBits / 32);
556 uint32_t remainBits = lengthInBits[i] -
557 numKeyStr*keyStreamLengthInBits;
558 uint32_t *keyStr32 = (uint32_t *) keyStr[i];
559
560 /* If remaining bits are more than 8 bytes, we need to generate
561 * at least 8B more of keystream, so we need to copy
562 * the zuc state to single packet state first */
563 if (remainBits > (2*32)) {
564 singlePktState.lfsrState[0] = state.lfsrState[0][i];
565 singlePktState.lfsrState[1] = state.lfsrState[1][i];
566 singlePktState.lfsrState[2] = state.lfsrState[2][i];
567 singlePktState.lfsrState[3] = state.lfsrState[3][i];
568 singlePktState.lfsrState[4] = state.lfsrState[4][i];
569 singlePktState.lfsrState[5] = state.lfsrState[5][i];
570 singlePktState.lfsrState[6] = state.lfsrState[6][i];
571 singlePktState.lfsrState[7] = state.lfsrState[7][i];
572 singlePktState.lfsrState[8] = state.lfsrState[8][i];
573 singlePktState.lfsrState[9] = state.lfsrState[9][i];
574 singlePktState.lfsrState[10] = state.lfsrState[10][i];
575 singlePktState.lfsrState[11] = state.lfsrState[11][i];
576 singlePktState.lfsrState[12] = state.lfsrState[12][i];
577 singlePktState.lfsrState[13] = state.lfsrState[13][i];
578 singlePktState.lfsrState[14] = state.lfsrState[14][i];
579 singlePktState.lfsrState[15] = state.lfsrState[15][i];
580
581 singlePktState.fR1 = state.fR1[i];
582 singlePktState.fR2 = state.fR2[i];
583 }
584
585 while (remainBits >= keyStreamLengthInBits) {
586 remainBits -= keyStreamLengthInBits;
587 L -= (keyStreamLengthInBits / 32);
588
589 /* Generate the next key stream 8 bytes or 16 bytes */
590 if (!remainBits)
591 asm_ZucGenKeystream8B_avx(&keyStr32[4],
592 &singlePktState);
593 else
594 asm_ZucGenKeystream16B_avx(&keyStr32[4],
595 &singlePktState);
596 T[i] = asm_Eia3Round16BAVX(T[i], keyStr32, pIn8[i]);
597 /* Copy the last keystream generated
598 * to the first 16 bytes */
599 memcpy(keyStr32, &keyStr32[4], KEYSTR_ROUND_LEN);
600 pIn8[i] = &pIn8[i][KEYSTR_ROUND_LEN];
601 }
602
603 /*
604 * If remaining bits has more than 2 ZUC WORDS (double words),
605 * keystream needs to have up to another 2 ZUC WORDS (8B)
606 */
607
608 if (remainBits > (2 * 32))
609 asm_ZucGenKeystream8B_avx(&keyStr32[4],
610 &singlePktState);
611
612 uint32_t keyBlock = keyStr32[L - 1];
613
614 T[i] ^= asm_Eia3RemainderAVX(keyStr32, pIn8[i], remainBits);
615 T[i] ^= rotate_left(load_uint64(&keyStr32[remainBits / 32]),
616 remainBits % 32);
617
618 /* save the final MAC-I result */
619 *(pMacI[i]) = bswap4(T[i] ^ keyBlock);
620 }
621
622 #ifdef SAFE_DATA
623 /* Clear sensitive data (in registers and stack) */
624 clear_mem(keyStr, sizeof(keyStr));
625 clear_mem(&singlePktState, sizeof(singlePktState));
626 clear_mem(&state, sizeof(state));
627 clear_mem(&keys, sizeof(keys));
628 #endif
629 }
630
zuc_eia3_1_buffer_avx(const void * pKey,const void * pIv,const void * pBufferIn,const uint32_t lengthInBits,uint32_t * pMacI)631 void zuc_eia3_1_buffer_avx(const void *pKey,
632 const void *pIv,
633 const void *pBufferIn,
634 const uint32_t lengthInBits,
635 uint32_t *pMacI)
636 {
637 #ifndef LINUX
638 DECLARE_ALIGNED(imb_uint128_t xmm_save[10], 16);
639
640 SAVE_XMMS(xmm_save);
641 #endif
642 #ifdef SAFE_PARAM
643 /* Check for NULL pointers */
644 if (pKey == NULL || pIv == NULL || pBufferIn == NULL || pMacI == NULL)
645 return;
646
647 /* Check input data is in range of supported length */
648 if (lengthInBits < ZUC_MIN_BITLEN || lengthInBits > ZUC_MAX_BITLEN)
649 return;
650 #endif
651
652 _zuc_eia3_1_buffer_avx(pKey, pIv, pBufferIn, lengthInBits, pMacI);
653
654 #ifdef SAFE_DATA
655 CLEAR_SCRATCH_GPS();
656 CLEAR_SCRATCH_SIMD_REGS();
657 #endif
658 #ifndef LINUX
659 RESTORE_XMMS(xmm_save);
660 #endif
661 }
662
zuc_eia3_4_buffer_job_avx(const void * const pKey[NUM_AVX_BUFS],const void * const pIv[NUM_AVX_BUFS],const void * const pBufferIn[NUM_AVX_BUFS],uint32_t * pMacI[NUM_AVX_BUFS],const uint16_t lengthInBits[NUM_AVX_BUFS],const void * const job_in_lane[NUM_AVX_BUFS])663 void zuc_eia3_4_buffer_job_avx(const void * const pKey[NUM_AVX_BUFS],
664 const void * const pIv[NUM_AVX_BUFS],
665 const void * const pBufferIn[NUM_AVX_BUFS],
666 uint32_t *pMacI[NUM_AVX_BUFS],
667 const uint16_t lengthInBits[NUM_AVX_BUFS],
668 const void * const job_in_lane[NUM_AVX_BUFS])
669 {
670 unsigned int i;
671 DECLARE_ALIGNED(ZucState4_t state, 64);
672 DECLARE_ALIGNED(ZucState_t singlePktState, 64);
673 DECLARE_ALIGNED(uint8_t keyStr[NUM_AVX_BUFS][2*KEYSTR_ROUND_LEN], 64);
674 /* structure to store the 4 keys */
675 DECLARE_ALIGNED(ZucKey4_t keys, 64);
676 /* structure to store the 4 IV's */
677 DECLARE_ALIGNED(ZucIv4_t ivs, 64);
678 const uint8_t *pIn8[NUM_AVX_BUFS] = {NULL};
679 uint32_t remainCommonBits;
680 uint32_t numKeyStr = 0;
681 uint32_t T[NUM_AVX_BUFS] = {0};
682 const uint32_t keyStreamLengthInBits = KEYSTR_ROUND_LEN * 8;
683 DECLARE_ALIGNED(uint32_t *pKeyStrArr[NUM_AVX_BUFS], 16) = {NULL};
684 unsigned int allCommonBits;
685
686 /* Check if all lengths are equal */
687 if ((lengthInBits[0] == lengthInBits[1]) &&
688 (lengthInBits[0] == lengthInBits[2]) &&
689 (lengthInBits[0] == lengthInBits[3])) {
690 remainCommonBits = lengthInBits[0];
691 allCommonBits = 1;
692 } else {
693 /* Calculate the minimum input packet size */
694 uint32_t bits1 = (lengthInBits[0] < lengthInBits[1] ?
695 lengthInBits[0] : lengthInBits[1]);
696 uint32_t bits2 = (lengthInBits[2] < lengthInBits[3] ?
697 lengthInBits[2] : lengthInBits[3]);
698
699 remainCommonBits = (bits1 < bits2) ? bits1 : bits2;
700 allCommonBits = 0;
701 }
702
703 for (i = 0; i < NUM_AVX_BUFS; i++) {
704 pIn8[i] = (const uint8_t *) pBufferIn[i];
705 pKeyStrArr[i] = (uint32_t *) &keyStr[i][0];
706 keys.pKeys[i] = pKey[i];
707 ivs.pIvs[i] = pIv[i];
708 }
709
710 asm_ZucInitialization_4_avx( &keys, &ivs, &state);
711
712 /* Generate 16 bytes at a time */
713 asm_ZucGenKeystream16B_4_avx(&state, pKeyStrArr);
714
715 /* Point at the next 16 bytes of the key */
716 for (i = 0; i < NUM_AVX_BUFS; i++)
717 pKeyStrArr[i] = (uint32_t *) &keyStr[i][KEYSTR_ROUND_LEN];
718
719 /* loop over the message bits */
720 while (remainCommonBits >= keyStreamLengthInBits) {
721 remainCommonBits -= keyStreamLengthInBits;
722 numKeyStr++;
723 /* Generate the next key stream 8 bytes or 16 bytes */
724 if (!remainCommonBits && allCommonBits)
725 asm_ZucGenKeystream8B_4_avx(&state, pKeyStrArr);
726 else
727 asm_ZucGenKeystream16B_4_avx(&state, pKeyStrArr);
728 for (i = 0; i < NUM_AVX_BUFS; i++) {
729 if (job_in_lane[i] == NULL)
730 continue;
731 T[i] = asm_Eia3Round16BAVX(T[i], keyStr[i], pIn8[i]);
732 /* Copy the last keystream generated
733 * to the first 16 bytes */
734 memcpy(&keyStr[i][0], &keyStr[i][KEYSTR_ROUND_LEN],
735 KEYSTR_ROUND_LEN);
736 pIn8[i] = &pIn8[i][KEYSTR_ROUND_LEN];
737 }
738 }
739
740 /* Process each packet separately for the remaining bits */
741 for (i = 0; i < NUM_AVX_BUFS; i++) {
742 if (job_in_lane[i] == NULL)
743 continue;
744
745 const uint32_t N = lengthInBits[i] + (2 * ZUC_WORD_BITS);
746 uint32_t L = ((N + 31) / ZUC_WORD_BITS) -
747 numKeyStr*(keyStreamLengthInBits / 32);
748 uint32_t remainBits = lengthInBits[i] -
749 numKeyStr*keyStreamLengthInBits;
750 uint32_t *keyStr32 = (uint32_t *) keyStr[i];
751
752 /* If remaining bits are more than 8 bytes, we need to generate
753 * at least 8B more of keystream, so we need to copy
754 * the zuc state to single packet state first */
755 if (remainBits > (2*32)) {
756 singlePktState.lfsrState[0] = state.lfsrState[0][i];
757 singlePktState.lfsrState[1] = state.lfsrState[1][i];
758 singlePktState.lfsrState[2] = state.lfsrState[2][i];
759 singlePktState.lfsrState[3] = state.lfsrState[3][i];
760 singlePktState.lfsrState[4] = state.lfsrState[4][i];
761 singlePktState.lfsrState[5] = state.lfsrState[5][i];
762 singlePktState.lfsrState[6] = state.lfsrState[6][i];
763 singlePktState.lfsrState[7] = state.lfsrState[7][i];
764 singlePktState.lfsrState[8] = state.lfsrState[8][i];
765 singlePktState.lfsrState[9] = state.lfsrState[9][i];
766 singlePktState.lfsrState[10] = state.lfsrState[10][i];
767 singlePktState.lfsrState[11] = state.lfsrState[11][i];
768 singlePktState.lfsrState[12] = state.lfsrState[12][i];
769 singlePktState.lfsrState[13] = state.lfsrState[13][i];
770 singlePktState.lfsrState[14] = state.lfsrState[14][i];
771 singlePktState.lfsrState[15] = state.lfsrState[15][i];
772
773 singlePktState.fR1 = state.fR1[i];
774 singlePktState.fR2 = state.fR2[i];
775 }
776
777 while (remainBits >= keyStreamLengthInBits) {
778 remainBits -= keyStreamLengthInBits;
779 L -= (keyStreamLengthInBits / 32);
780
781 /* Generate the next key stream 8 bytes or 16 bytes */
782 if (!remainBits)
783 asm_ZucGenKeystream8B_avx(&keyStr32[4],
784 &singlePktState);
785 else
786 asm_ZucGenKeystream16B_avx(&keyStr32[4],
787 &singlePktState);
788 T[i] = asm_Eia3Round16BAVX(T[i], keyStr32, pIn8[i]);
789 /* Copy the last keystream generated
790 * to the first 16 bytes */
791 memcpy(keyStr32, &keyStr32[4], KEYSTR_ROUND_LEN);
792 pIn8[i] = &pIn8[i][KEYSTR_ROUND_LEN];
793 }
794
795 /*
796 * If remaining bits has more than 2 ZUC WORDS (double words),
797 * keystream needs to have up to another 2 ZUC WORDS (8B)
798 */
799 if (remainBits > (2 * 32))
800 asm_ZucGenKeystream8B_avx(&keyStr32[4],
801 &singlePktState);
802
803 uint32_t keyBlock = keyStr32[L - 1];
804
805 T[i] ^= asm_Eia3RemainderAVX(keyStr32, pIn8[i], remainBits);
806 T[i] ^= rotate_left(load_uint64(&keyStr32[remainBits / 32]),
807 remainBits % 32);
808
809 /* save the final MAC-I result */
810 *(pMacI[i]) = bswap4(T[i] ^ keyBlock);
811 }
812
813 #ifdef SAFE_DATA
814 /* Clear sensitive data (in registers and stack) */
815 clear_mem(keyStr, sizeof(keyStr));
816 clear_mem(&singlePktState, sizeof(singlePktState));
817 clear_mem(&state, sizeof(state));
818 clear_mem(&keys, sizeof(keys));
819 #endif
820 }
821
zuc_eia3_n_buffer_avx(const void * const pKey[],const void * const pIv[],const void * const pBufferIn[],const uint32_t lengthInBits[],uint32_t * pMacI[],const uint32_t numBuffers)822 void zuc_eia3_n_buffer_avx(const void * const pKey[],
823 const void * const pIv[],
824 const void * const pBufferIn[],
825 const uint32_t lengthInBits[],
826 uint32_t *pMacI[],
827 const uint32_t numBuffers)
828 {
829 #ifndef LINUX
830 DECLARE_ALIGNED(imb_uint128_t xmm_save[10], 16);
831
832 SAVE_XMMS(xmm_save);
833 #endif
834
835 unsigned int i;
836 unsigned int packetCount = numBuffers;
837
838 #ifdef SAFE_PARAM
839 /* Check for NULL pointers */
840 if (pKey == NULL || pIv == NULL || pBufferIn == NULL ||
841 lengthInBits == NULL || pMacI == NULL)
842 return;
843
844 for (i = 0; i < numBuffers; i++) {
845 if (pKey[i] == NULL || pIv[i] == NULL ||
846 pBufferIn[i] == NULL || pMacI[i] == NULL)
847 return;
848
849 /* Check input data is in range of supported length */
850 if (lengthInBits[i] < ZUC_MIN_BITLEN ||
851 lengthInBits[i] > ZUC_MAX_BITLEN)
852 return;
853 }
854 #endif
855 i = 0;
856
857 while(packetCount >= 4) {
858 packetCount -=4;
859 _zuc_eia3_4_buffer_avx(&pKey[i],
860 &pIv[i],
861 &pBufferIn[i],
862 &lengthInBits[i],
863 &pMacI[i]);
864 i+=4;
865 }
866
867 while(packetCount--) {
868 _zuc_eia3_1_buffer_avx(pKey[i],
869 pIv[i],
870 pBufferIn[i],
871 lengthInBits[i],
872 pMacI[i]);
873 i++;
874 }
875
876 #ifdef SAFE_DATA
877 /* Clear sensitive data in registers */
878 CLEAR_SCRATCH_GPS();
879 CLEAR_SCRATCH_SIMD_REGS();
880 #endif
881 #ifndef LINUX
882 RESTORE_XMMS(xmm_save);
883 #endif
884 }
885