1 /*******************************************************************************
2   Copyright (c) 2009-2020, Intel Corporation
3 
4   Redistribution and use in source and binary forms, with or without
5   modification, are permitted provided that the following conditions are met:
6 
7       * Redistributions of source code must retain the above copyright notice,
8         this list of conditions and the following disclaimer.
9       * Redistributions in binary form must reproduce the above copyright
10         notice, this list of conditions and the following disclaimer in the
11         documentation and/or other materials provided with the distribution.
12       * Neither the name of Intel Corporation nor the names of its contributors
13         may be used to endorse or promote products derived from this software
14         without specific prior written permission.
15 
16   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19   DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20   FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23   CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24   OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
27 
28 /*-----------------------------------------------------------------------
29 * zuc_avx.c
30 *-----------------------------------------------------------------------
31 * An implementation of ZUC, the core algorithm for the
32 * 3GPP Confidentiality and Integrity algorithms.
33 *
34 *-----------------------------------------------------------------------*/
35 
36 #include <string.h>
37 
38 #include "include/zuc_internal.h"
39 #include "include/wireless_common.h"
40 #include "include/save_xmms.h"
41 #include "include/clear_regs_mem.h"
42 #include "intel-ipsec-mb.h"
43 
/* AVX variants of the helpers used by the public entry points in this file:
 * SAVE_XMMS/RESTORE_XMMS are only invoked when LINUX is not defined, and
 * CLEAR_SCRATCH_SIMD_REGS only when SAFE_DATA is defined */
44 #define SAVE_XMMS               save_xmms_avx
45 #define RESTORE_XMMS            restore_xmms_avx
46 #define CLEAR_SCRATCH_SIMD_REGS clear_scratch_xmms_avx
47 
/* Number of buffers handled together by the multi-buffer routines */
48 #define NUM_AVX_BUFS 4
/* Number of keystream bytes produced and consumed per round */
49 #define KEYSTR_ROUND_LEN 16
50 
51 static inline
_zuc_eea3_1_buffer_avx(const void * pKey,const void * pIv,const void * pBufferIn,void * pBufferOut,const uint32_t length)52 void _zuc_eea3_1_buffer_avx(const void *pKey,
53                             const void *pIv,
54                             const void *pBufferIn,
55                             void *pBufferOut,
56                             const uint32_t length)
57 {
58         DECLARE_ALIGNED(ZucState_t zucState, 16);
59         DECLARE_ALIGNED(uint8_t keyStream[KEYSTR_ROUND_LEN], 16);
60         const uint64_t *pIn64 = NULL;
61         uint64_t *pOut64 = NULL, *pKeyStream64 = NULL;
62         uint64_t *pTemp64 = NULL, *pdstTemp64 = NULL;
63 
64         uint32_t numKeyStreamsPerPkt = length/ KEYSTR_ROUND_LEN;
65         const uint32_t numBytesLeftOver = length % KEYSTR_ROUND_LEN;
66 
67         /* initialize the zuc state */
68         asm_ZucInitialization_avx(pKey, pIv, &(zucState));
69 
70         /* Loop Over all the Quad-Words in input buffer and XOR with the
71          * 16 bytes of generated keystream */
72         pOut64 = (uint64_t *) pBufferOut;
73         pIn64 = (const uint64_t *) pBufferIn;
74 
75         while (numKeyStreamsPerPkt--) {
76                 /* Generate the key stream 16 bytes at a time */
77                 asm_ZucGenKeystream16B_avx((uint32_t *) &keyStream[0],
78                                            &zucState);
79 
80                 /* XOR The Keystream generated with the input buffer here */
81                 pKeyStream64 = (uint64_t *) keyStream;
82                 asm_XorKeyStream16B_avx(pIn64, pOut64, pKeyStream64);
83                 pIn64 += 2;
84                 pOut64 += 2;
85         }
86 
87         /* Check for remaining 0 to 15 bytes */
88         if (numBytesLeftOver) {
89                 /* buffer to store 16 bytes of keystream */
90                 DECLARE_ALIGNED(uint8_t tempSrc[KEYSTR_ROUND_LEN], 16);
91                 DECLARE_ALIGNED(uint8_t tempDst[KEYSTR_ROUND_LEN], 16);
92                 const uint8_t *pIn8 = (const uint8_t *) pBufferIn;
93                 uint8_t *pOut8 = (uint8_t *) pBufferOut;
94                 const uint64_t num4BRounds = ((numBytesLeftOver - 1) / 4) + 1;
95 
96                 asm_ZucGenKeystream_avx((uint32_t *) &keyStream[0],
97                                         &zucState, num4BRounds);
98 
99                 /* copy the remaining bytes into temporary buffer and XOR with
100                  * the 16-bytes of keystream. Then copy on the valid bytes back
101                  * to the output buffer */
102 
103                 memcpy(&tempSrc[0], &pIn8[length - numBytesLeftOver],
104                        numBytesLeftOver);
105                 pKeyStream64 = (uint64_t *) &keyStream[0];
106                 pTemp64 = (uint64_t *) &tempSrc[0];
107                 pdstTemp64 = (uint64_t *) &tempDst[0];
108 
109                 asm_XorKeyStream16B_avx(pTemp64, pdstTemp64,
110                                         pKeyStream64);
111                 memcpy(&pOut8[length - numBytesLeftOver], &tempDst[0],
112                        numBytesLeftOver);
113 
114 #ifdef SAFE_DATA
115                 clear_mem(tempSrc, sizeof(tempSrc));
116                 clear_mem(tempDst, sizeof(tempDst));
117 #endif
118         }
119 #ifdef SAFE_DATA
120         /* Clear sensitive data in stack */
121         clear_mem(keyStream, sizeof(keyStream));
122         clear_mem(&zucState, sizeof(zucState));
123 #endif
124 }
125 
126 IMB_DLL_LOCAL
_zuc_eea3_4_buffer_avx(const void * const pKey[NUM_AVX_BUFS],const void * const pIv[NUM_AVX_BUFS],const void * const pBufferIn[NUM_AVX_BUFS],void * pBufferOut[NUM_AVX_BUFS],const uint32_t length[NUM_AVX_BUFS])127 void _zuc_eea3_4_buffer_avx(const void * const pKey[NUM_AVX_BUFS],
128                             const void * const pIv[NUM_AVX_BUFS],
129                             const void * const pBufferIn[NUM_AVX_BUFS],
130                             void *pBufferOut[NUM_AVX_BUFS],
131                             const uint32_t length[NUM_AVX_BUFS])
132 {
133         DECLARE_ALIGNED(ZucState4_t state, 16);
134         DECLARE_ALIGNED(ZucState_t singlePktState, 16);
135         unsigned int i;
136         /* Calculate the minimum input packet size */
137         uint32_t bytes1 = (length[0] < length[1] ?
138                            length[0] : length[1]);
139         uint32_t bytes2 = (length[2] < length[3] ?
140                            length[2] : length[3]);
141         /* min number of bytes */
142         uint32_t bytes = (bytes1 < bytes2) ? bytes1 : bytes2;
143         uint32_t numKeyStreamsPerPkt;
144         uint16_t remainBytes[NUM_AVX_BUFS] = {0};
145         DECLARE_ALIGNED(uint8_t keyStr[NUM_AVX_BUFS][KEYSTR_ROUND_LEN], 64);
146         /* structure to store the 4 keys */
147         DECLARE_ALIGNED(ZucKey4_t keys, 64);
148         /* structure to store the 4 IV's */
149         DECLARE_ALIGNED(ZucIv4_t ivs, 64);
150         uint32_t numBytesLeftOver = 0;
151         const uint8_t *pTempBufInPtr = NULL;
152         uint8_t *pTempBufOutPtr = NULL;
153         const uint64_t *pIn64[NUM_AVX_BUFS]= {NULL};
154         uint64_t *pOut64[NUM_AVX_BUFS] = {NULL};
155         uint64_t *pKeyStream64 = NULL;
156 
157         /*
158          * Calculate the number of bytes left over for each packet,
159          * and setup the Keys and IVs
160          */
161         for (i = 0; i< NUM_AVX_BUFS; i++) {
162                 remainBytes[i] = length[i];
163                 keys.pKeys[i] = pKey[i];
164                 ivs.pIvs[i] = pIv[i];
165         }
166 
167         asm_ZucInitialization_4_avx( &keys,  &ivs, &state);
168 
169         for (i = 0; i < NUM_AVX_BUFS; i++) {
170                 pOut64[i] = (uint64_t *) pBufferOut[i];
171                 pIn64[i] = (const uint64_t *) pBufferIn[i];
172         }
173 
174         /* Encrypt common length of all buffers */
175         asm_ZucCipher_4_avx(&state, pIn64, pOut64, remainBytes,
176                             (uint16_t) bytes);
177 
178         /* process each packet separately for the remaining bytes */
179         for (i = 0; i < NUM_AVX_BUFS; i++) {
180                 if (remainBytes[i]) {
181                         /* need to copy the zuc state to single packet state */
182                         singlePktState.lfsrState[0] = state.lfsrState[0][i];
183                         singlePktState.lfsrState[1] = state.lfsrState[1][i];
184                         singlePktState.lfsrState[2] = state.lfsrState[2][i];
185                         singlePktState.lfsrState[3] = state.lfsrState[3][i];
186                         singlePktState.lfsrState[4] = state.lfsrState[4][i];
187                         singlePktState.lfsrState[5] = state.lfsrState[5][i];
188                         singlePktState.lfsrState[6] = state.lfsrState[6][i];
189                         singlePktState.lfsrState[7] = state.lfsrState[7][i];
190                         singlePktState.lfsrState[8] = state.lfsrState[8][i];
191                         singlePktState.lfsrState[9] = state.lfsrState[9][i];
192                         singlePktState.lfsrState[10] = state.lfsrState[10][i];
193                         singlePktState.lfsrState[11] = state.lfsrState[11][i];
194                         singlePktState.lfsrState[12] = state.lfsrState[12][i];
195                         singlePktState.lfsrState[13] = state.lfsrState[13][i];
196                         singlePktState.lfsrState[14] = state.lfsrState[14][i];
197                         singlePktState.lfsrState[15] = state.lfsrState[15][i];
198 
199                         singlePktState.fR1 = state.fR1[i];
200                         singlePktState.fR2 = state.fR2[i];
201 
202                         numKeyStreamsPerPkt = remainBytes[i] / KEYSTR_ROUND_LEN;
203                         numBytesLeftOver = remainBytes[i]  % KEYSTR_ROUND_LEN;
204 
205                         pTempBufInPtr = pBufferIn[i];
206                         pTempBufOutPtr = pBufferOut[i];
207 
208                         /* update the output and input pointers here to point
209                          * to the i'th buffers */
210                         pOut64[0] = (uint64_t *) &pTempBufOutPtr[length[i] -
211                                                                 remainBytes[i]];
212                         pIn64[0] = (const uint64_t *) &pTempBufInPtr[length[i] -
213                                                                 remainBytes[i]];
214 
215                         while (numKeyStreamsPerPkt--) {
216                                 /* Generate the key stream 16 bytes at a time */
217                                 asm_ZucGenKeystream16B_avx(
218                                                        (uint32_t *) keyStr[0],
219                                                        &singlePktState);
220                                 pKeyStream64 = (uint64_t *) keyStr[0];
221                                 asm_XorKeyStream16B_avx(pIn64[0], pOut64[0],
222                                                         pKeyStream64);
223                                 pIn64[0] += 2;
224                                 pOut64[0] += 2;
225                         }
226 
227                         /* Check for remaining 0 to 15 bytes */
228                         if (numBytesLeftOver) {
229                                 DECLARE_ALIGNED(uint8_t tempSrc[16], 64);
230                                 DECLARE_ALIGNED(uint8_t tempDst[16], 64);
231                                 uint64_t *pTempSrc64;
232                                 uint64_t *pTempDst64;
233                                 uint32_t offset = length[i] - numBytesLeftOver;
234                                 const uint64_t num4BRounds =
235                                         ((numBytesLeftOver - 1) / 4) + 1;
236 
237                                 asm_ZucGenKeystream_avx((uint32_t *)&keyStr[0],
238                                                         &singlePktState,
239                                                         num4BRounds);
240                                 /* copy the remaining bytes into temporary
241                                  * buffer and XOR with the 16 bytes of
242                                  * keystream. Then copy on the valid bytes back
243                                  * to the output buffer */
244                                 memcpy(&tempSrc[0], &pTempBufInPtr[offset],
245                                        numBytesLeftOver);
246                                 memset(&tempSrc[numBytesLeftOver], 0,
247                                        16 - numBytesLeftOver);
248 
249                                 pKeyStream64 = (uint64_t *) &keyStr[0][0];
250                                 pTempSrc64 = (uint64_t *) &tempSrc[0];
251                                 pTempDst64 = (uint64_t *) &tempDst[0];
252                                 asm_XorKeyStream16B_avx(pTempSrc64, pTempDst64,
253                                                         pKeyStream64);
254 
255                                 memcpy(&pTempBufOutPtr[offset],
256                                        &tempDst[0], numBytesLeftOver);
257 #ifdef SAFE_DATA
258                                 clear_mem(tempSrc, sizeof(tempSrc));
259                                 clear_mem(tempDst, sizeof(tempDst));
260 #endif
261                         }
262                 }
263         }
264 #ifdef SAFE_DATA
265         /* Clear sensitive data in stack */
266         clear_mem(keyStr, sizeof(keyStr));
267         clear_mem(&singlePktState, sizeof(singlePktState));
268         clear_mem(&state, sizeof(state));
269         clear_mem(&keys, sizeof(keys));
270 #endif
271 }
272 
zuc_eea3_1_buffer_avx(const void * pKey,const void * pIv,const void * pBufferIn,void * pBufferOut,const uint32_t length)273 void zuc_eea3_1_buffer_avx(const void *pKey,
274                            const void *pIv,
275                            const void *pBufferIn,
276                            void *pBufferOut,
277                            const uint32_t length)
278 {
279 #ifndef LINUX
280         DECLARE_ALIGNED(imb_uint128_t xmm_save[10], 16);
281 
282         SAVE_XMMS(xmm_save);
283 #endif
284 #ifdef SAFE_PARAM
285         /* Check for NULL pointers */
286         if (pKey == NULL || pIv == NULL || pBufferIn == NULL ||
287             pBufferOut == NULL)
288                 return;
289 
290         /* Check input data is in range of supported length */
291         if (length < ZUC_MIN_BYTELEN || length > ZUC_MAX_BYTELEN)
292                 return;
293 #endif
294         _zuc_eea3_1_buffer_avx(pKey, pIv, pBufferIn, pBufferOut, length);
295 
296 #ifdef SAFE_DATA
297         /* Clear sensitive data in registers */
298         CLEAR_SCRATCH_GPS();
299         CLEAR_SCRATCH_SIMD_REGS();
300 #endif
301 #ifndef LINUX
302         RESTORE_XMMS(xmm_save);
303 #endif
304 }
305 
zuc_eea3_4_buffer_avx(const void * const pKey[NUM_AVX_BUFS],const void * const pIv[NUM_AVX_BUFS],const void * const pBufferIn[NUM_AVX_BUFS],void * pBufferOut[NUM_AVX_BUFS],const uint32_t length[NUM_AVX_BUFS])306 void zuc_eea3_4_buffer_avx(const void * const pKey[NUM_AVX_BUFS],
307                            const void * const pIv[NUM_AVX_BUFS],
308                            const void * const pBufferIn[NUM_AVX_BUFS],
309                            void *pBufferOut[NUM_AVX_BUFS],
310                            const uint32_t length[NUM_AVX_BUFS])
311 {
312 #ifndef LINUX
313         DECLARE_ALIGNED(imb_uint128_t xmm_save[10], 16);
314 
315         SAVE_XMMS(xmm_save);
316 #endif
317 #ifdef SAFE_PARAM
318         unsigned int i;
319 
320         /* Check for NULL pointers */
321         if (pKey == NULL || pIv == NULL || pBufferIn == NULL ||
322             pBufferOut == NULL || length == NULL)
323                 return;
324 
325         for (i = 0; i < NUM_AVX_BUFS; i++) {
326                 if (pKey[i] == NULL || pIv[i] == NULL ||
327                     pBufferIn[i] == NULL || pBufferOut[i] == NULL)
328                         return;
329 
330                 /* Check input data is in range of supported length */
331                 if (length[i] < ZUC_MIN_BYTELEN || length[i] > ZUC_MAX_BYTELEN)
332                         return;
333         }
334 #endif
335 
336         _zuc_eea3_4_buffer_avx(pKey, pIv, pBufferIn, pBufferOut, length);
337 
338 #ifdef SAFE_DATA
339         /* Clear sensitive data in registers */
340         CLEAR_SCRATCH_GPS();
341         CLEAR_SCRATCH_SIMD_REGS();
342 #endif
343 #ifndef LINUX
344         RESTORE_XMMS(xmm_save);
345 #endif
346 }
347 
zuc_eea3_n_buffer_avx(const void * const pKey[],const void * const pIv[],const void * const pBufferIn[],void * pBufferOut[],const uint32_t length[],const uint32_t numBuffers)348 void zuc_eea3_n_buffer_avx(const void * const pKey[], const void * const pIv[],
349                            const void * const pBufferIn[], void *pBufferOut[],
350                            const uint32_t length[],
351                            const uint32_t numBuffers)
352 {
353 #ifndef LINUX
354         DECLARE_ALIGNED(imb_uint128_t xmm_save[10], 16);
355 
356         SAVE_XMMS(xmm_save);
357 #endif
358 
359         unsigned int i;
360         unsigned int packetCount = numBuffers;
361 
362 #ifdef SAFE_PARAM
363         /* Check for NULL pointers */
364         if (pKey == NULL || pIv == NULL || pBufferIn == NULL ||
365             pBufferOut == NULL || length == NULL)
366                 return;
367 
368         for (i = 0; i < numBuffers; i++) {
369                 if (pKey[i] == NULL || pIv[i] == NULL ||
370                     pBufferIn[i] == NULL || pBufferOut[i] == NULL)
371                         return;
372 
373                 /* Check input data is in range of supported length */
374                 if (length[i] < ZUC_MIN_BYTELEN || length[i] > ZUC_MAX_BYTELEN)
375                         return;
376         }
377 #endif
378         i = 0;
379 
380         while(packetCount >= 4) {
381                 packetCount -=4;
382                 _zuc_eea3_4_buffer_avx(&pKey[i],
383                                        &pIv[i],
384                                        &pBufferIn[i],
385                                        &pBufferOut[i],
386                                        &length[i]);
387                 i+=4;
388         }
389 
390         while(packetCount--) {
391                 _zuc_eea3_1_buffer_avx(pKey[i],
392                                        pIv[i],
393                                        pBufferIn[i],
394                                        pBufferOut[i],
395                                        length[i]);
396                 i++;
397         }
398 #ifdef SAFE_DATA
399         /* Clear sensitive data in registers */
400         CLEAR_SCRATCH_GPS();
401         CLEAR_SCRATCH_SIMD_REGS();
402 #endif
403 #ifndef LINUX
404         RESTORE_XMMS(xmm_save);
405 #endif
406 }
407 
/*
 * Rotate the 64-bit value u left by r bits.
 *
 * The rotation count is reduced modulo 64 and both shift counts are kept in
 * [0, 63], so r == 0 (which callers produce via "bits % 32") no longer
 * results in a shift by 64, which is undefined behaviour in C.
 */
static inline uint64_t rotate_left(uint64_t u, size_t r)
{
        const unsigned int n = (unsigned int) (r & 63);

        /* for n == 0 both operands are u, so the OR leaves u unchanged */
        return (u << n) | (u >> ((64 - n) & 63));
}
412 
/*
 * Read 8 bytes starting at ptr as a native-endian uint64_t.
 *
 * memcpy is used instead of dereferencing a cast pointer: callers pass
 * addresses inside uint32_t arrays, which need not be 8-byte aligned and
 * must not be accessed through a uint64_t lvalue.
 */
static inline uint64_t load_uint64(const void *ptr)
{
        uint64_t value;

        memcpy(&value, ptr, sizeof(value));
        return value;
}
417 
418 static inline
_zuc_eia3_1_buffer_avx(const void * pKey,const void * pIv,const void * pBufferIn,const uint32_t lengthInBits,uint32_t * pMacI)419 void _zuc_eia3_1_buffer_avx(const void *pKey,
420                             const void *pIv,
421                             const void *pBufferIn,
422                             const uint32_t lengthInBits,
423                             uint32_t *pMacI)
424 {
425         DECLARE_ALIGNED(ZucState_t zucState, 64);
426         DECLARE_ALIGNED(uint32_t keyStream[4 * 2], 64);
427         const uint32_t keyStreamLengthInBits = KEYSTR_ROUND_LEN * 8;
428         /* generate a key-stream 2 words longer than the input message */
429         const uint32_t N = lengthInBits + (2 * ZUC_WORD_BITS);
430         uint32_t L = (N + 31) / ZUC_WORD_BITS;
431         uint32_t *pZuc = (uint32_t *) &keyStream[0];
432         uint32_t remainingBits = lengthInBits;
433         uint32_t T = 0;
434         const uint8_t *pIn8 = (const uint8_t *) pBufferIn;
435 
436         asm_ZucInitialization_avx(pKey, pIv, &(zucState));
437         asm_ZucGenKeystream16B_avx(pZuc, &zucState);
438 
439         /* loop over the message bits */
440         while (remainingBits >= keyStreamLengthInBits) {
441                 remainingBits -=  keyStreamLengthInBits;
442                 L -= (keyStreamLengthInBits / 32);
443 
444                 /* Generate the next key stream 8 bytes or 16 bytes */
445                 if (!remainingBits)
446                         asm_ZucGenKeystream8B_avx(&keyStream[4], &zucState);
447                 else
448                         asm_ZucGenKeystream16B_avx(&keyStream[4], &zucState);
449                 T = asm_Eia3Round16BAVX(T, keyStream, pIn8);
450                 /* Copy the last keystream generated to the first 16 bytes */
451                 memcpy(&keyStream[0], &keyStream[4], KEYSTR_ROUND_LEN);
452                 pIn8 = &pIn8[KEYSTR_ROUND_LEN];
453         }
454 
455         /*
456          * If remaining bits has more than 2 ZUC WORDS (double words),
457          * keystream needs to have up to another 2 ZUC WORDS (8B)
458          */
459         if (remainingBits > (2 * 32))
460                 asm_ZucGenKeystream8B_avx(&keyStream[4], &zucState);
461         T ^= asm_Eia3RemainderAVX(&keyStream[0], pIn8, remainingBits);
462         T ^= rotate_left(load_uint64(&keyStream[remainingBits / 32]),
463                          remainingBits % 32);
464 
465         /* save the final MAC-I result */
466         uint32_t keyBlock = keyStream[L - 1];
467         *pMacI = bswap4(T ^ keyBlock);
468 
469 #ifdef SAFE_DATA
470         /* Clear sensitive data (in registers and stack) */
471         clear_mem(keyStream, sizeof(keyStream));
472         clear_mem(&zucState, sizeof(zucState));
473 #endif
474 }
475 
476 IMB_DLL_LOCAL
_zuc_eia3_4_buffer_avx(const void * const pKey[NUM_AVX_BUFS],const void * const pIv[NUM_AVX_BUFS],const void * const pBufferIn[NUM_AVX_BUFS],const uint32_t lengthInBits[NUM_AVX_BUFS],uint32_t * pMacI[NUM_AVX_BUFS])477 void _zuc_eia3_4_buffer_avx(const void * const pKey[NUM_AVX_BUFS],
478                             const void * const pIv[NUM_AVX_BUFS],
479                             const void * const pBufferIn[NUM_AVX_BUFS],
480                             const uint32_t lengthInBits[NUM_AVX_BUFS],
481                             uint32_t *pMacI[NUM_AVX_BUFS])
482 {
483         unsigned int i;
484         DECLARE_ALIGNED(ZucState4_t state, 64);
485         DECLARE_ALIGNED(ZucState_t singlePktState, 64);
486         DECLARE_ALIGNED(uint8_t keyStr[NUM_AVX_BUFS][2*KEYSTR_ROUND_LEN], 64);
487         /* structure to store the 4 keys */
488         DECLARE_ALIGNED(ZucKey4_t keys, 64);
489         /* structure to store the 4 IV's */
490         DECLARE_ALIGNED(ZucIv4_t ivs, 64);
491         const uint8_t *pIn8[NUM_AVX_BUFS] = {NULL};
492         uint32_t remainCommonBits;
493         uint32_t numKeyStr = 0;
494         uint32_t T[NUM_AVX_BUFS] = {0};
495         const uint32_t keyStreamLengthInBits = KEYSTR_ROUND_LEN * 8;
496         DECLARE_ALIGNED(uint32_t *pKeyStrArr[NUM_AVX_BUFS], 16) = {NULL};
497         unsigned int allCommonBits;
498 
499         /* Check if all lengths are equal */
500         if ((lengthInBits[0] == lengthInBits[1]) &&
501             (lengthInBits[0] == lengthInBits[2]) &&
502             (lengthInBits[0] == lengthInBits[3])) {
503                 remainCommonBits = lengthInBits[0];
504                 allCommonBits = 1;
505         } else {
506                 /* Calculate the minimum input packet size */
507                 uint32_t bits1 = (lengthInBits[0] < lengthInBits[1] ?
508                                    lengthInBits[0] : lengthInBits[1]);
509                 uint32_t bits2 = (lengthInBits[2] < lengthInBits[3] ?
510                                    lengthInBits[2] : lengthInBits[3]);
511 
512                 remainCommonBits = (bits1 < bits2) ? bits1 : bits2;
513                 allCommonBits = 0;
514         }
515 
516         for (i = 0; i < NUM_AVX_BUFS; i++) {
517                 pIn8[i] = (const uint8_t *) pBufferIn[i];
518                 pKeyStrArr[i] = (uint32_t *) &keyStr[i][0];
519                 keys.pKeys[i] = pKey[i];
520                 ivs.pIvs[i] = pIv[i];
521         }
522 
523         asm_ZucInitialization_4_avx( &keys,  &ivs, &state);
524 
525         /* Generate 16 bytes at a time */
526         asm_ZucGenKeystream16B_4_avx(&state, pKeyStrArr);
527 
528         /* Point at the next 16 bytes of the key */
529         for (i = 0; i < NUM_AVX_BUFS; i++)
530                 pKeyStrArr[i] = (uint32_t *) &keyStr[i][KEYSTR_ROUND_LEN];
531 
532         /* loop over the message bits */
533         while (remainCommonBits >= keyStreamLengthInBits) {
534                 remainCommonBits -= keyStreamLengthInBits;
535                 numKeyStr++;
536                 /* Generate the next key stream 8 bytes or 16 bytes */
537                 if (!remainCommonBits && allCommonBits)
538                         asm_ZucGenKeystream8B_4_avx(&state, pKeyStrArr);
539                 else
540                         asm_ZucGenKeystream16B_4_avx(&state, pKeyStrArr);
541                 for (i = 0; i < NUM_AVX_BUFS; i++) {
542                         T[i] = asm_Eia3Round16BAVX(T[i], keyStr[i], pIn8[i]);
543                         /* Copy the last keystream generated
544                          * to the first 16 bytes */
545                         memcpy(&keyStr[i][0], &keyStr[i][KEYSTR_ROUND_LEN],
546                                KEYSTR_ROUND_LEN);
547                         pIn8[i] = &pIn8[i][KEYSTR_ROUND_LEN];
548                 }
549         }
550 
551         /* Process each packet separately for the remaining bits */
552         for (i = 0; i < NUM_AVX_BUFS; i++) {
553                 const uint32_t N = lengthInBits[i] + (2 * ZUC_WORD_BITS);
554                 uint32_t L = ((N + 31) / ZUC_WORD_BITS) -
555                              numKeyStr*(keyStreamLengthInBits / 32);
556                 uint32_t remainBits = lengthInBits[i] -
557                                       numKeyStr*keyStreamLengthInBits;
558                 uint32_t *keyStr32 = (uint32_t *) keyStr[i];
559 
560                 /* If remaining bits are more than 8 bytes, we need to generate
561                  * at least 8B more of keystream, so we need to copy
562                  * the zuc state to single packet state first */
563                 if (remainBits > (2*32)) {
564                         singlePktState.lfsrState[0] = state.lfsrState[0][i];
565                         singlePktState.lfsrState[1] = state.lfsrState[1][i];
566                         singlePktState.lfsrState[2] = state.lfsrState[2][i];
567                         singlePktState.lfsrState[3] = state.lfsrState[3][i];
568                         singlePktState.lfsrState[4] = state.lfsrState[4][i];
569                         singlePktState.lfsrState[5] = state.lfsrState[5][i];
570                         singlePktState.lfsrState[6] = state.lfsrState[6][i];
571                         singlePktState.lfsrState[7] = state.lfsrState[7][i];
572                         singlePktState.lfsrState[8] = state.lfsrState[8][i];
573                         singlePktState.lfsrState[9] = state.lfsrState[9][i];
574                         singlePktState.lfsrState[10] = state.lfsrState[10][i];
575                         singlePktState.lfsrState[11] = state.lfsrState[11][i];
576                         singlePktState.lfsrState[12] = state.lfsrState[12][i];
577                         singlePktState.lfsrState[13] = state.lfsrState[13][i];
578                         singlePktState.lfsrState[14] = state.lfsrState[14][i];
579                         singlePktState.lfsrState[15] = state.lfsrState[15][i];
580 
581                         singlePktState.fR1 = state.fR1[i];
582                         singlePktState.fR2 = state.fR2[i];
583                 }
584 
585                 while (remainBits >= keyStreamLengthInBits) {
586                         remainBits -= keyStreamLengthInBits;
587                         L -= (keyStreamLengthInBits / 32);
588 
589                         /* Generate the next key stream 8 bytes or 16 bytes */
590                         if (!remainBits)
591                                 asm_ZucGenKeystream8B_avx(&keyStr32[4],
592                                                           &singlePktState);
593                         else
594                                 asm_ZucGenKeystream16B_avx(&keyStr32[4],
595                                                            &singlePktState);
596                         T[i] = asm_Eia3Round16BAVX(T[i], keyStr32, pIn8[i]);
597                         /* Copy the last keystream generated
598                          * to the first 16 bytes */
599                         memcpy(keyStr32, &keyStr32[4], KEYSTR_ROUND_LEN);
600                         pIn8[i] = &pIn8[i][KEYSTR_ROUND_LEN];
601                 }
602 
603                 /*
604                  * If remaining bits has more than 2 ZUC WORDS (double words),
605                  * keystream needs to have up to another 2 ZUC WORDS (8B)
606                  */
607 
608                 if (remainBits > (2 * 32))
609                         asm_ZucGenKeystream8B_avx(&keyStr32[4],
610                                                   &singlePktState);
611 
612                 uint32_t keyBlock = keyStr32[L - 1];
613 
614                 T[i] ^= asm_Eia3RemainderAVX(keyStr32, pIn8[i], remainBits);
615                 T[i] ^= rotate_left(load_uint64(&keyStr32[remainBits / 32]),
616                                  remainBits % 32);
617 
618                 /* save the final MAC-I result */
619                 *(pMacI[i]) = bswap4(T[i] ^ keyBlock);
620         }
621 
622 #ifdef SAFE_DATA
623         /* Clear sensitive data (in registers and stack) */
624         clear_mem(keyStr, sizeof(keyStr));
625         clear_mem(&singlePktState, sizeof(singlePktState));
626         clear_mem(&state, sizeof(state));
627         clear_mem(&keys, sizeof(keys));
628 #endif
629 }
630 
zuc_eia3_1_buffer_avx(const void * pKey,const void * pIv,const void * pBufferIn,const uint32_t lengthInBits,uint32_t * pMacI)631 void zuc_eia3_1_buffer_avx(const void *pKey,
632                            const void *pIv,
633                            const void *pBufferIn,
634                            const uint32_t lengthInBits,
635                            uint32_t *pMacI)
636 {
637 #ifndef LINUX
638         DECLARE_ALIGNED(imb_uint128_t xmm_save[10], 16);
639 
640         SAVE_XMMS(xmm_save);
641 #endif
642 #ifdef SAFE_PARAM
643         /* Check for NULL pointers */
644         if (pKey == NULL || pIv == NULL || pBufferIn == NULL || pMacI == NULL)
645                 return;
646 
647         /* Check input data is in range of supported length */
648         if (lengthInBits < ZUC_MIN_BITLEN || lengthInBits > ZUC_MAX_BITLEN)
649                 return;
650 #endif
651 
652         _zuc_eia3_1_buffer_avx(pKey, pIv, pBufferIn, lengthInBits, pMacI);
653 
654 #ifdef SAFE_DATA
655         CLEAR_SCRATCH_GPS();
656         CLEAR_SCRATCH_SIMD_REGS();
657 #endif
658 #ifndef LINUX
659         RESTORE_XMMS(xmm_save);
660 #endif
661 }
662 
zuc_eia3_4_buffer_job_avx(const void * const pKey[NUM_AVX_BUFS],const void * const pIv[NUM_AVX_BUFS],const void * const pBufferIn[NUM_AVX_BUFS],uint32_t * pMacI[NUM_AVX_BUFS],const uint16_t lengthInBits[NUM_AVX_BUFS],const void * const job_in_lane[NUM_AVX_BUFS])663 void zuc_eia3_4_buffer_job_avx(const void * const pKey[NUM_AVX_BUFS],
664                                const void * const pIv[NUM_AVX_BUFS],
665                                const void * const pBufferIn[NUM_AVX_BUFS],
666                                uint32_t *pMacI[NUM_AVX_BUFS],
667                                const uint16_t lengthInBits[NUM_AVX_BUFS],
668                                const void * const job_in_lane[NUM_AVX_BUFS])
669 {
670         unsigned int i;
671         DECLARE_ALIGNED(ZucState4_t state, 64);
672         DECLARE_ALIGNED(ZucState_t singlePktState, 64);
673         DECLARE_ALIGNED(uint8_t keyStr[NUM_AVX_BUFS][2*KEYSTR_ROUND_LEN], 64);
674         /* structure to store the 4 keys */
675         DECLARE_ALIGNED(ZucKey4_t keys, 64);
676         /* structure to store the 4 IV's */
677         DECLARE_ALIGNED(ZucIv4_t ivs, 64);
678         const uint8_t *pIn8[NUM_AVX_BUFS] = {NULL};
679         uint32_t remainCommonBits;
680         uint32_t numKeyStr = 0;
681         uint32_t T[NUM_AVX_BUFS] = {0};
682         const uint32_t keyStreamLengthInBits = KEYSTR_ROUND_LEN * 8;
683         DECLARE_ALIGNED(uint32_t *pKeyStrArr[NUM_AVX_BUFS], 16) = {NULL};
684         unsigned int allCommonBits;
685 
686         /* Check if all lengths are equal */
687         if ((lengthInBits[0] == lengthInBits[1]) &&
688             (lengthInBits[0] == lengthInBits[2]) &&
689             (lengthInBits[0] == lengthInBits[3])) {
690                 remainCommonBits = lengthInBits[0];
691                 allCommonBits = 1;
692         } else {
693                 /* Calculate the minimum input packet size */
694                 uint32_t bits1 = (lengthInBits[0] < lengthInBits[1] ?
695                                    lengthInBits[0] : lengthInBits[1]);
696                 uint32_t bits2 = (lengthInBits[2] < lengthInBits[3] ?
697                                    lengthInBits[2] : lengthInBits[3]);
698 
699                 remainCommonBits = (bits1 < bits2) ? bits1 : bits2;
700                 allCommonBits = 0;
701         }
702 
703         for (i = 0; i < NUM_AVX_BUFS; i++) {
704                 pIn8[i] = (const uint8_t *) pBufferIn[i];
705                 pKeyStrArr[i] = (uint32_t *) &keyStr[i][0];
706                 keys.pKeys[i] = pKey[i];
707                 ivs.pIvs[i] = pIv[i];
708         }
709 
710         asm_ZucInitialization_4_avx( &keys,  &ivs, &state);
711 
712         /* Generate 16 bytes at a time */
713         asm_ZucGenKeystream16B_4_avx(&state, pKeyStrArr);
714 
715         /* Point at the next 16 bytes of the key */
716         for (i = 0; i < NUM_AVX_BUFS; i++)
717                 pKeyStrArr[i] = (uint32_t *) &keyStr[i][KEYSTR_ROUND_LEN];
718 
719         /* loop over the message bits */
720         while (remainCommonBits >= keyStreamLengthInBits) {
721                 remainCommonBits -= keyStreamLengthInBits;
722                 numKeyStr++;
723                 /* Generate the next key stream 8 bytes or 16 bytes */
724                 if (!remainCommonBits && allCommonBits)
725                         asm_ZucGenKeystream8B_4_avx(&state, pKeyStrArr);
726                 else
727                         asm_ZucGenKeystream16B_4_avx(&state, pKeyStrArr);
728                 for (i = 0; i < NUM_AVX_BUFS; i++) {
729                         if (job_in_lane[i] == NULL)
730                                 continue;
731                         T[i] = asm_Eia3Round16BAVX(T[i], keyStr[i], pIn8[i]);
732                         /* Copy the last keystream generated
733                          * to the first 16 bytes */
734                         memcpy(&keyStr[i][0], &keyStr[i][KEYSTR_ROUND_LEN],
735                                KEYSTR_ROUND_LEN);
736                         pIn8[i] = &pIn8[i][KEYSTR_ROUND_LEN];
737                 }
738         }
739 
740         /* Process each packet separately for the remaining bits */
741         for (i = 0; i < NUM_AVX_BUFS; i++) {
742                 if (job_in_lane[i] == NULL)
743                         continue;
744 
745                 const uint32_t N = lengthInBits[i] + (2 * ZUC_WORD_BITS);
746                 uint32_t L = ((N + 31) / ZUC_WORD_BITS) -
747                              numKeyStr*(keyStreamLengthInBits / 32);
748                 uint32_t remainBits = lengthInBits[i] -
749                                       numKeyStr*keyStreamLengthInBits;
750                 uint32_t *keyStr32 = (uint32_t *) keyStr[i];
751 
752                 /* If remaining bits are more than 8 bytes, we need to generate
753                  * at least 8B more of keystream, so we need to copy
754                  * the zuc state to single packet state first */
755                 if (remainBits > (2*32)) {
756                         singlePktState.lfsrState[0] = state.lfsrState[0][i];
757                         singlePktState.lfsrState[1] = state.lfsrState[1][i];
758                         singlePktState.lfsrState[2] = state.lfsrState[2][i];
759                         singlePktState.lfsrState[3] = state.lfsrState[3][i];
760                         singlePktState.lfsrState[4] = state.lfsrState[4][i];
761                         singlePktState.lfsrState[5] = state.lfsrState[5][i];
762                         singlePktState.lfsrState[6] = state.lfsrState[6][i];
763                         singlePktState.lfsrState[7] = state.lfsrState[7][i];
764                         singlePktState.lfsrState[8] = state.lfsrState[8][i];
765                         singlePktState.lfsrState[9] = state.lfsrState[9][i];
766                         singlePktState.lfsrState[10] = state.lfsrState[10][i];
767                         singlePktState.lfsrState[11] = state.lfsrState[11][i];
768                         singlePktState.lfsrState[12] = state.lfsrState[12][i];
769                         singlePktState.lfsrState[13] = state.lfsrState[13][i];
770                         singlePktState.lfsrState[14] = state.lfsrState[14][i];
771                         singlePktState.lfsrState[15] = state.lfsrState[15][i];
772 
773                         singlePktState.fR1 = state.fR1[i];
774                         singlePktState.fR2 = state.fR2[i];
775                 }
776 
777                 while (remainBits >= keyStreamLengthInBits) {
778                         remainBits -= keyStreamLengthInBits;
779                         L -= (keyStreamLengthInBits / 32);
780 
781                         /* Generate the next key stream 8 bytes or 16 bytes */
782                         if (!remainBits)
783                                 asm_ZucGenKeystream8B_avx(&keyStr32[4],
784                                                           &singlePktState);
785                         else
786                                 asm_ZucGenKeystream16B_avx(&keyStr32[4],
787                                                            &singlePktState);
788                         T[i] = asm_Eia3Round16BAVX(T[i], keyStr32, pIn8[i]);
789                         /* Copy the last keystream generated
790                          * to the first 16 bytes */
791                         memcpy(keyStr32, &keyStr32[4], KEYSTR_ROUND_LEN);
792                         pIn8[i] = &pIn8[i][KEYSTR_ROUND_LEN];
793                 }
794 
795                 /*
796                  * If remaining bits has more than 2 ZUC WORDS (double words),
797                  * keystream needs to have up to another 2 ZUC WORDS (8B)
798                  */
799                 if (remainBits > (2 * 32))
800                         asm_ZucGenKeystream8B_avx(&keyStr32[4],
801                                                   &singlePktState);
802 
803                 uint32_t keyBlock = keyStr32[L - 1];
804 
805                 T[i] ^= asm_Eia3RemainderAVX(keyStr32, pIn8[i], remainBits);
806                 T[i] ^= rotate_left(load_uint64(&keyStr32[remainBits / 32]),
807                                  remainBits % 32);
808 
809                 /* save the final MAC-I result */
810                 *(pMacI[i]) = bswap4(T[i] ^ keyBlock);
811         }
812 
813 #ifdef SAFE_DATA
814         /* Clear sensitive data (in registers and stack) */
815         clear_mem(keyStr, sizeof(keyStr));
816         clear_mem(&singlePktState, sizeof(singlePktState));
817         clear_mem(&state, sizeof(state));
818         clear_mem(&keys, sizeof(keys));
819 #endif
820 }
821 
zuc_eia3_n_buffer_avx(const void * const pKey[],const void * const pIv[],const void * const pBufferIn[],const uint32_t lengthInBits[],uint32_t * pMacI[],const uint32_t numBuffers)822 void zuc_eia3_n_buffer_avx(const void * const pKey[],
823                            const void * const pIv[],
824                            const void * const pBufferIn[],
825                            const uint32_t lengthInBits[],
826                            uint32_t *pMacI[],
827                            const uint32_t numBuffers)
828 {
829 #ifndef LINUX
830         DECLARE_ALIGNED(imb_uint128_t xmm_save[10], 16);
831 
832         SAVE_XMMS(xmm_save);
833 #endif
834 
835         unsigned int i;
836         unsigned int packetCount = numBuffers;
837 
838 #ifdef SAFE_PARAM
839         /* Check for NULL pointers */
840         if (pKey == NULL || pIv == NULL || pBufferIn == NULL ||
841             lengthInBits == NULL || pMacI == NULL)
842                 return;
843 
844         for (i = 0; i < numBuffers; i++) {
845                 if (pKey[i] == NULL || pIv[i] == NULL ||
846                     pBufferIn[i] == NULL || pMacI[i] == NULL)
847                         return;
848 
849                 /* Check input data is in range of supported length */
850                 if (lengthInBits[i] < ZUC_MIN_BITLEN ||
851                     lengthInBits[i] > ZUC_MAX_BITLEN)
852                         return;
853         }
854 #endif
855         i = 0;
856 
857         while(packetCount >= 4) {
858                 packetCount -=4;
859                 _zuc_eia3_4_buffer_avx(&pKey[i],
860                                        &pIv[i],
861                                        &pBufferIn[i],
862                                        &lengthInBits[i],
863                                        &pMacI[i]);
864                 i+=4;
865         }
866 
867         while(packetCount--) {
868                 _zuc_eia3_1_buffer_avx(pKey[i],
869                                        pIv[i],
870                                        pBufferIn[i],
871                                        lengthInBits[i],
872                                        pMacI[i]);
873                 i++;
874         }
875 
876 #ifdef SAFE_DATA
877         /* Clear sensitive data in registers */
878         CLEAR_SCRATCH_GPS();
879         CLEAR_SCRATCH_SIMD_REGS();
880 #endif
881 #ifndef LINUX
882         RESTORE_XMMS(xmm_save);
883 #endif
884 }
885