1;;
2;; Copyright (c) 2009-2020, Intel Corporation
3;;
4;; Redistribution and use in source and binary forms, with or without
5;; modification, are permitted provided that the following conditions are met:
6;;
7;;     * Redistributions of source code must retain the above copyright notice,
8;;       this list of conditions and the following disclaimer.
9;;     * Redistributions in binary form must reproduce the above copyright
10;;       notice, this list of conditions and the following disclaimer in the
11;;       documentation and/or other materials provided with the distribution.
12;;     * Neither the name of Intel Corporation nor the names of its contributors
13;;       may be used to endorse or promote products derived from this software
14;;       without specific prior written permission.
15;;
16;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26;;
27
28%include "include/os.asm"
29%include "include/reg_sizes.asm"
30%include "include/zuc_sbox.inc"
31
32section .data
33default rel
; EK_d: sixteen 16-bit constants, one per LFSR cell.  key_expand reads entry i
; as a word (via rbx) and make_u31 places its low 15 bits between key byte i
; and IV byte i when building the initial LFSR word.
EK_d:
dw	0x44D7, 0x26BC, 0x626B, 0x135E, 0x5789, 0x35E2, 0x7135, 0x09AF,
dw	0x4D78, 0x2F13, 0x6BC4, 0x1AF1, 0x5E26, 0x3C4D, 0x789A, 0x47AC
37
; Byte-select masks used by NONLIN_FUN to merge the two S-box results.
; They are consumed as 16-byte memory operands (pand/vpand xmm, [mask]),
; so each one is defined as a full 16 bytes.  Only the low 8 bytes influence
; the result (F_R1/F_R2 are extracted from dwords 0 and 1); the upper half
; is explicit zero padding so the load never reads past the defined data.
align 16
mask_S0:
dq      0xff00ff00ff00ff00
dq      0x0000000000000000

align 16
mask_S1:
dq      0x00ff00ff00ff00ff
dq      0x0000000000000000
45
46%ifdef LINUX
47section .note.GNU-stack noalloc noexec nowrite progbits
48%endif
49
50section .text
51
; Byte offsets into the state buffer addressed through pState.
; Dwords 0..15 hold the LFSR cells; the six dwords that follow hold the
; F registers and the last bit-reorganisation outputs, which ZUC_KEYGEN and
; ZUC_KEYGEN_VAR load on entry and store back on exit.
%define OFFSET_FR1      (16*4)
%define OFFSET_FR2      (17*4)
%define OFFSET_BRC_X0   (18*4)
%define OFFSET_BRC_X1   (19*4)
%define OFFSET_BRC_X2   (20*4)
%define OFFSET_BRC_X3   (21*4)
58
;
;   BITS_REORG()
;
;   Builds the four 32-bit words BRC_X0..BRC_X3 from eight LFSR cells.
;   The LFSR is kept as a 16-dword ring at [rsi]; logical cell Sk lives at
;   index (k + round) % 16.
;
;   params
;       %1 - round number
;   in
;       rsi - base address of the 16 LFSR dwords
;   uses
;       eax, ebx, ecx, edx
;   return
;       updates r12d, r13d, r14d, r15d (BRC_X0, BRC_X1, BRC_X2, BRC_X3)
;
%macro  BITS_REORG  1
    ; BRC_X0 = S15[30:15] || S14[15:0]
    mov         r12d, [rsi + ((15 + %1) % 16)*4]    ; r12d = LFSR_S15
    mov         eax,  [rsi + ((14 + %1) % 16)*4]    ; eax  = LFSR_S14
    shr         r12d, 15                            ; upper 16 bits of S15 -> bits 15:0
    shl         eax, 16                             ; lower 16 bits of S14 -> bits 31:16
    shld        r12d, eax, 16                       ; BRC_X0

    ; BRC_X1 = S11[15:0] || S9[30:15]
    mov         r13d, [rsi + ((11 + %1) % 16)*4]    ; r13d = LFSR_S11
    mov         ebx,  [rsi + (( 9 + %1) % 16)*4]    ; ebx  = LFSR_S9
    shl         ebx, 1                              ; S9 bits 30:15 -> bits 31:16
    shld        r13d, ebx, 16                       ; BRC_X1

    ; BRC_X2 = S7[15:0] || S5[30:15]
    mov         r14d, [rsi + (( 7 + %1) % 16)*4]    ; r14d = LFSR_S7
    mov         ecx,  [rsi + (( 5 + %1) % 16)*4]    ; ecx  = LFSR_S5
    shl         ecx, 1                              ; S5 bits 30:15 -> bits 31:16
    shld        r14d, ecx, 16                       ; BRC_X2

    ; BRC_X3 = S2[15:0] || S0[30:15]
    mov         r15d, [rsi + (( 2 + %1) % 16)*4]    ; r15d = LFSR_S2
    mov         edx,  [rsi + (( 0 + %1) % 16)*4]    ; edx  = LFSR_S0
    shl         edx, 1                              ; S0 bits 30:15 -> bits 31:16
    shld        r15d, edx, 16                       ; BRC_X3
%endmacro
99
;
;   NONLIN_FUN()
;
;   Nonlinear function F: optionally produces W, then advances F_R1/F_R2
;   through the two linear transforms and the S-box layer.
;
;   params
;       %1 - 1 to compute W into eax; any other value leaves eax untouched
;       %2 - SSE or SSE_NO_AESNI select the SSE S-box macros,
;            any other value selects the AVX S-box macros
;   in
;       r10d = F_R1, r11d = F_R2
;       r12d = BRC_X0, r13d = BRC_X1, r14d = BRC_X2
;   uses
;       eax (only when %1 == 1), ebx, ecx, rdx, r8d, r9d
;       xmm0-xmm4 (handed to the S0/S1 macros from zuc_sbox.inc)
;       NOTE(review): the earlier header also listed rdi/rsi; nothing in
;       this macro touches them directly, but the S-box macros are defined
;       elsewhere - callers reload rsi after invoking NONLIN_FUN.
;   return
;       eax  = W value (when %1 == 1)
;       r10d = F_R1
;       r11d = F_R2
;
%macro NONLIN_FUN   2
%define %%CALC_W %1 ; [in] Calculate W if 1
%define %%ARCH   %2 ; [in] SSE/SSE_NO_AESNI/AVX

%if (%%CALC_W == 1)
    ; W = (BRC_X0 ^ F_R1) + F_R2
    mov         eax, r12d
    xor         eax, r10d
    add         eax, r11d
%endif

    add         r10d, r13d          ; W1 = F_R1 + BRC_X1 (upper half of r10 is now zero)
    xor         r11d, r14d          ; W2 = F_R2 ^ BRC_X2

    mov         edx, r10d           ; rdx = W1, zero-extended
    shld        edx, r11d, 16       ; P = (W1 << 16) | (W2 >> 16)
    shld        r11d, r10d, 16      ; Q = (W2 << 16) | (W1 >> 16)

    ; U = L1(P) = P ^ (P <<< 2) ^ (P <<< 10) ^ (P <<< 18) ^ (P <<< 24)
    mov         ebx, edx
    rol         ebx, 2
    mov         ecx, edx
    rol         ecx, 10
    mov         r8d, edx
    rol         r8d, 18
    mov         r9d, edx
    rol         r9d, 24
    xor         edx, ebx
    xor         edx, ecx
    xor         edx, r8d
    xor         edx, r9d            ; U in edx, hi(rdx) = 0

    ; V = L2(Q) = Q ^ (Q <<< 8) ^ (Q <<< 14) ^ (Q <<< 22) ^ (Q <<< 30)
    mov         ebx, r11d
    rol         ebx, 8
    mov         ecx, r11d
    rol         ecx, 14
    mov         r8d, r11d
    rol         r8d, 22
    mov         r9d, r11d
    rol         r9d, 30
    xor         r11d, ebx
    xor         r11d, ecx
    xor         r11d, r8d
    xor         r11d, r9d           ; V in r11d, hi(r11) = 0

    ; Pack both words into one 64-bit lane: rdx = V || U
    shl         r11, 32
    xor         rdx, r11

    ; S-box layer: xmm1 goes through S0, xmm0 through S1; the byte masks
    ; then pick alternating bytes from each result before they are merged.
%ifidn %%ARCH, SSE
    movq        xmm0, rdx
    movdqa      xmm1, xmm0
    S0_comput_SSE xmm1, xmm2, xmm3, 0
    S1_comput_SSE xmm0, xmm2, xmm3, xmm4, 0

    pand        xmm0, [rel mask_S1]
    pand        xmm1, [rel mask_S0]
    pxor        xmm0, xmm1

    movd        r10d, xmm0          ; F_R1
    pextrd      r11d, xmm0, 1       ; F_R2
%elifidn %%ARCH, SSE_NO_AESNI
    movq        xmm0, rdx
    movdqa      xmm1, xmm0
    S0_comput_SSE xmm1, xmm2, xmm3, 0
    S1_comput_SSE_NO_AESNI xmm0, xmm2, xmm3, xmm4

    pand        xmm0, [rel mask_S1]
    pand        xmm1, [rel mask_S0]
    pxor        xmm0, xmm1

    movd        r10d, xmm0          ; F_R1
    pextrd      r11d, xmm0, 1       ; F_R2
%else
    vmovq       xmm0, rdx
    vmovdqa     xmm1, xmm0
    S0_comput_AVX xmm1, xmm2, xmm3
    S1_comput_AVX xmm0, xmm2, xmm3, xmm4

    vpand       xmm0, xmm0, [rel mask_S1]
    vpand       xmm1, xmm1, [rel mask_S0]
    vpxor       xmm0, xmm0, xmm1

    vmovd       r10d, xmm0          ; F_R1
    vpextrd     r11d, xmm0, 1       ; F_R2
%endif

%endmacro
197
;
;   LFSR_UPDT()
;
;   Computes the next LFSR cell
;       s16 = (2^15*S15 + 2^17*S13 + 2^21*S10 + 2^20*S4 + (1 + 2^8)*S0 + u)
;             mod (2^31 - 1)
;   and stores it over the slot of the outgoing S0, advancing the ring.
;   As required by the ZUC specification, a result congruent to zero is
;   represented as 2^31 - 1, never as 0.
;
;   params
;       %1 - round number
;   in
;       rsi - base address of the 16 LFSR dwords
;       rax - u: zero in working mode, W >> 1 in initialisation mode
;             (upper 32 bits must be zero)
;   uses
;       rax, rbx, rcx, rdx, r8, r9
;   return
;       eax = new cell value, also written to [rsi + ((0 + %1) % 16)*4]
;
%macro  LFSR_UPDT   1
    ;
    ; ebx = LFSR_S0
    ; ecx = LFSR_S4
    ; edx = LFSR_S10
    ; r8d = LFSR_S13
    ; r9d = LFSR_S15
    ;lea         rsi, [LFSR_STA] ; moved to calling function

    mov         ebx, [rsi + (( 0 + %1) % 16)*4]
    mov         ecx, [rsi + (( 4 + %1) % 16)*4]
    mov         edx, [rsi + ((10 + %1) % 16)*4]
    mov         r8d, [rsi + ((13 + %1) % 16)*4]
    mov         r9d, [rsi + ((15 + %1) % 16)*4]

    ; Calculate 64-bit LFSR feedback (the 32-bit loads above zero-extend,
    ; so the 64-bit shifts and adds cannot lose bits)
    add         rax, rbx
    shl         rbx, 8
    shl         rcx, 20
    shl         rdx, 21
    shl         r8, 17
    shl         r9, 15
    add         rax, rbx
    add         rax, rcx
    add         rax, rdx
    add         rax, r8
    add         rax, r9

    ; Fold once: since 2^31 == 1 (mod 2^31 - 1), low31 + high is congruent
    ; to the full sum and is below 2^31 + 2^22.
    mov         rbx, rax
    and         rax, 0x7FFFFFFF
    shr         rbx, 31
    add         rax, rbx

    ; Final reduction: subtract 2^31 - 1 only when the value is strictly
    ; greater than it.  A value of exactly 2^31 - 1 must be kept (it is the
    ; specified representation of a zero residue), so the move is
    ; conditional on a strictly positive difference (cmovg), not merely a
    ; non-negative one.
    mov         rbx, rax
    sub         rbx, 0x7FFFFFFF
    cmovg       rax, rbx

    ; LFSR_S16 = (LFSR_S15++) = eax
    mov         [rsi + (( 0 + %1) % 16)*4], eax
%endmacro
249
250
;
;   make_u31()
;
;   Packs three fields into one 31-bit LFSR word:
;       Rt = (Ke[7:0] << 23) | (Ek[14:0] << 8) | Iv[7:0]
;   Each shrd shifts Rt right and pulls the low bits of the next field in
;   at the top, so after 8 + 15 + 9 = 32 bits of shifting the fields end up
;   in the order above.  All operands are 32-bit registers.
;
;   params
;       %1 - [out] result register (zeroed first)
;       %2 - [in]  key byte, zero-extended
;       %3 - [in]  EK_d constant, zero-extended
;       %4 - [in]  IV byte, zero-extended
;
%macro  make_u31    4

%define %%Rt        %1
%define %%Ke        %2
%define %%Ek        %3
%define %%Iv        %4
    xor         %%Rt, %%Rt
    shrd        %%Rt, %%Iv, 8       ; Rt[31:24] = Iv[7:0]
    shrd        %%Rt, %%Ek, 15      ; Rt[31:17] = Ek[14:0], Iv moves to [16:9]
    shrd        %%Rt, %%Ke, 9       ; Rt[31:23] = Ke[8:0], Ek -> [22:8], Iv -> [7:0]
%endmacro
265
266
;
;	key_expand()
;
;	Builds two consecutive initial LFSR words (indices %1 and %1 + 1) from
;	the key, the EK_d constants and the IV, and stores them in the state.
;
;	params
;		%1 - index of the first of the two LFSR cells to fill
;	in
;		pKe, pIv - registers selected by the invoking macro (key / IV)
;		rbx      - address of EK_d
;		rax      - address of the state buffer (LFSR cell 0)
;	uses
;		r8d-r15d
;
%macro	key_expand	1
	; ---- LFSR cell %1 ----
	movzx		r10d, byte [pIv + (%1)]
	movzx		r9d,  word [rbx + (%1)*2]
	movzx		r8d,  byte [pKe + (%1)]
	make_u31	r11d, r8d, r9d, r10d
	mov		[rax + (%1)*4], r11d

	; ---- LFSR cell %1 + 1 ----
	movzx		r14d, byte [pIv + (%1 + 1)]
	movzx		r13d, word [rbx + (%1 + 1)*2]
	movzx		r12d, byte [pKe + (%1 + 1)]
	make_u31	r15d, r12d, r13d, r14d
	mov		[rax + (%1 + 1)*4], r15d
%endmacro
283
;
; Initialize internal LFSR
;
; Expands to the full body of an initialisation routine (everything except
; the final ret):
;   1. loads the 16 LFSR cells from key, IV and EK_d (key_expand),
;   2. runs 32 initialisation rounds, feeding W >> 1 back into the LFSR,
;   3. runs one more round in working mode (W not computed, feedback input
;      zero),
;   4. stores F_R1, F_R2 and BRC_X0..X3 behind the LFSR cells in pState.
;
; Arguments arrive in the native calling convention registers (see the
; pKe/pIv/pState defines below).  The callee-saved registers the body
; clobbers are spilled to a 64-byte frame below rbp and restored on exit.
;
%macro ZUC_INIT 1
%define %%ARCH  %1 ; [in] SSE/SSE_NO_AESNI/AVX

%ifdef LINUX
	%define		pKe	rdi
	%define		pIv	rsi
	%define		pState	rdx
%else
	%define		pKe	rcx
	%define		pIv	rdx
	%define		pState	r8
%endif

    ; save the base pointer
    push rbp

    ; set up the frame pointer and allocate a 64-byte stack frame
    ; (rsp is moved below the save area, so this does not rely on a red zone)
    mov rbp, rsp
    sub rsp, 64

    ; Save non-volatile registers
    mov [rbp - 8],  rbx
    mov [rbp - 16], r12
    mov [rbp - 24], r13
    mov [rbp - 32], r14
    mov [rbp - 40], r15
%ifndef LINUX
    ; rdi/rsi are callee-saved in the Windows x64 convention
    mov [rbp - 48], rdi
    mov [rbp - 56], rsi
%endif

    lea rbx, [rel EK_d]     ; load pointer to D
    lea rax, [pState]      ; load pointer to pState
    mov [rbp - 64], pState   ; save pointer to pState (key_expand overwrites r8 on Windows)

    ; Expand key: each invocation fills two LFSR cells
    key_expand  0
    key_expand  2
    key_expand  4
    key_expand  6
    key_expand  8
    key_expand  10
    key_expand  12
    key_expand  14

    ; Set R1 and R2 to zero
    xor         r10, r10
    xor         r11, r11

    ; Shift LFSR 32-times, update state variables
    ; (fully unrolled; N selects the rotating position of S0 in the ring)
%assign N 0
%rep 32
    mov rdx, [rbp - 64]   ; load pointer to pState
    lea rsi, [rdx]

    BITS_REORG  N

    NONLIN_FUN  1, %%ARCH
    shr         eax, 1    ; initialisation mode feeds W >> 1 into the LFSR

    mov rdx, [rbp - 64]   ; re-load pointer to pState
    lea rsi, [rdx]

    LFSR_UPDT   N

%assign N N+1
%endrep

    ; And once more, initial round from keygen phase = 33 times
    mov         rdx, [rbp - 64]   ; load pointer to pState
    lea         rsi, [rdx]


    BITS_REORG  0
    NONLIN_FUN  0, %%ARCH         ; W is not needed for this round
    xor         rax, rax          ; working mode: no feedback input

    mov         rdx, [rbp - 64]   ; load pointer to pState
    lea         rsi, [rdx]

    LFSR_UPDT   0

    mov         rdx, [rbp - 64]   ; load pointer to pState
    lea         rsi, [rdx]

    ; Save ZUC's state variables
    mov         [rsi + (16*4)],r10d  ;F_R1
    mov         [rsi + (17*4)],r11d  ;F_R2
    mov         [rsi + (18*4)],r12d  ;BRC_X0
    mov         [rsi + (19*4)],r13d  ;BRC_X1
    mov         [rsi + (20*4)],r14d  ;BRC_X2
    mov         [rsi + (21*4)],r15d  ;BRC_X3


    ; Restore non-volatile registers
    mov rbx, [rbp - 8]
    mov r12, [rbp - 16]
    mov r13, [rbp - 24]
    mov r14, [rbp - 32]
    mov r15, [rbp - 40]
%ifndef LINUX
    mov rdi, [rbp - 48]
    mov rsi, [rbp - 56]
%endif

    ; release the frame and restore base pointer
    mov rsp, rbp
    pop rbp

%endmacro
397
;
; Generate N*4 bytes of keystream
; for a single buffer (where N is number of rounds)
;
; Expands to the full body of a keystream routine (everything except the
; final ret).  Each unrolled round runs BITS_REORG, NONLIN_FUN and
; LFSR_UPDT and writes one dword (W ^ BRC_X3) to the keystream buffer.
;
; params
;   %1 - SSE / SSE_NO_AESNI / AVX
;   %2 - number of 4-byte rounds to unroll; the callers in this file use
;        2, 4, 8 and 16
;
; For 4 and 8 rounds the 16 LFSR dwords are rotated back with aligned
; 16-byte moves, so pState must be 16-byte aligned for those variants.
;
; Stack frame (below rbp):
;   -8..-40   rbx, r12-r15
;   -48, -56  rdi, rsi (Windows only)
;   -64       current keystream write pointer
;   -72       pState
;   -80       padding, keeps rsp 16-byte aligned inside the body
;
%macro ZUC_KEYGEN 2
%define %%ARCH          %1 ; [in] SSE/SSE_NO_AESNI/AVX
%define %%NUM_ROUNDS    %2 ; [in] Number of 4-byte rounds

%ifdef LINUX
	%define		pKS	rdi
	%define		pState	rsi
%else
	%define		pKS	rcx
	%define		pState	rdx
%endif

%ifidn %%ARCH, AVX
%define %%MOVDQA vmovdqa
%else
%define %%MOVDQA movdqa
%endif

    ; save the base pointer
    push rbp

    ; Set up the frame pointer and allocate the stack frame.  72 bytes are
    ; used; the size is rounded up to 80 so that rsp stays 16-byte aligned,
    ; matching ZUC_INIT and ZUC_KEYGEN_VAR, in case any macro expanded in
    ; the body pushes data or calls out.
    mov rbp, rsp
    sub rsp, 80

    ; Save non-volatile registers
    mov [rbp - 8], rbx
    mov [rbp - 16], r12
    mov [rbp - 24], r13
    mov [rbp - 32], r14
    mov [rbp - 40], r15
%ifndef LINUX
    ; rdi/rsi are callee-saved in the Windows x64 convention
    mov [rbp - 48], rdi
    mov [rbp - 56], rsi
%endif

    ; Load input keystream pointer parameter in RAX
    mov         rax, pKS

    ; Restore ZUC's state variables
    mov         r10d, [pState + OFFSET_FR1]
    mov         r11d, [pState + OFFSET_FR2]
    mov         r12d, [pState + OFFSET_BRC_X0]
    mov         r13d, [pState + OFFSET_BRC_X1]
    mov         r14d, [pState + OFFSET_BRC_X2]
    mov         r15d, [pState + OFFSET_BRC_X3]

    ; Store keystream pointer
    mov [rbp - 64], rax

    ; Store ZUC State Pointer
    mov [rbp - 72], pState

    ; Generate N*4B of keystream in N rounds.
    ; N starts at 1 because ZUC_INIT already advanced the ring by one slot.
%assign N 1
%rep %%NUM_ROUNDS

    mov rdx, [rbp - 72]       ; load *pState
    lea rsi, [rdx]

    BITS_REORG  N
    NONLIN_FUN  1, %%ARCH

    ;Store the keystream
    mov rbx, [rbp - 64]  ; load *pkeystream
    xor eax, r15d        ; Z = W ^ BRC_X3
    mov [rbx], eax
    add rbx, 4          ; increment the pointer
    mov [rbp - 64], rbx   ; save pkeystream

    xor         rax, rax      ; working mode: no feedback input

    mov rdx, [rbp - 72]     ; load *pState
    lea rsi, [rdx]

    LFSR_UPDT   N

%assign N N+1
%endrep

;; Reorder LFSR registers, as not all 16 rounds have been completed
;; (if number of rounds is not 4, 8 or 16, the only possible case is 2,
;; and in that case, we don't have to update the states, as that function
;; call is done at the end of the algorithm).
;; rsi still holds pState here (reloaded before the last LFSR_UPDT).
%if (%%NUM_ROUNDS == 8)
    ; 8 rounds: rotate the ring by 8 dwords (swap the two 32-byte halves)
    %%MOVDQA xmm0, [rsi]
    %%MOVDQA xmm1, [rsi+16]
    %%MOVDQA xmm2, [rsi+32]
    %%MOVDQA xmm3, [rsi+48]

    %%MOVDQA [rsi],    xmm2
    %%MOVDQA [rsi+16], xmm3
    %%MOVDQA [rsi+32], xmm0
    %%MOVDQA [rsi+48], xmm1
%elif (%%NUM_ROUNDS == 4)
    ; 4 rounds: rotate the ring by 4 dwords
    %%MOVDQA xmm0, [rsi]
    %%MOVDQA xmm1, [rsi+16]
    %%MOVDQA xmm2, [rsi+32]
    %%MOVDQA xmm3, [rsi+48]

    %%MOVDQA [rsi],    xmm1
    %%MOVDQA [rsi+16], xmm2
    %%MOVDQA [rsi+32], xmm3
    %%MOVDQA [rsi+48], xmm0
%endif

    mov rsi, [rbp - 72]   ; load pState


    ; Save ZUC's state variables
    mov         [rsi + OFFSET_FR1], r10d
    mov         [rsi + OFFSET_FR2], r11d
    mov         [rsi + OFFSET_BRC_X0], r12d
    mov         [rsi + OFFSET_BRC_X1], r13d
    mov         [rsi + OFFSET_BRC_X2], r14d
    mov         [rsi + OFFSET_BRC_X3], r15d

    ; Restore non-volatile registers
    mov rbx, [rbp - 8]
    mov r12, [rbp - 16]
    mov r13, [rbp - 24]
    mov r14, [rbp - 32]
    mov r15, [rbp - 40]
%ifndef LINUX
    mov rdi, [rbp - 48]
    mov rsi, [rbp - 56]
%endif

    ; release the frame and restore base pointer
    mov rsp, rbp
    pop rbp

%endmacro
534
;
; Generate N*4 bytes of keystream for a single buffer
; (where N is number of rounds, being 16 rounds the maximum)
;
; Expands to the full body of a variable-length keystream routine
; (everything except the final ret).  Sixteen rounds are unrolled; after
; each one the remaining-round counter is decremented and the code leaves
; the sequence when it reaches zero.
;
;   nRounds == 0   nothing is generated and the state is left unchanged
;   nRounds  > 16  exactly 16 rounds (64 bytes) are generated
;
; Unlike ZUC_KEYGEN, the LFSR ring is not rotated back after a partial
; run of rounds.
;
; Stack frame (below rbp):
;   -8..-40   rbx, r12-r15
;   -48, -56  rdi, rsi (Windows only)
;   -64       current keystream write pointer
;   -72       pState
;   -80       remaining rounds
;
%macro ZUC_KEYGEN_VAR 1
%define %%ARCH          %1 ; [in] SSE/SSE_NO_AESNI/AVX

%ifdef LINUX
	%define		pKS	rdi
	%define		pState	rsi
        %define         nRounds rdx
%else
	%define		pKS	rcx
	%define		pState	rdx
        %define         nRounds r8
%endif

%define MAX_ROUNDS 16
    ; save the base pointer
    push rbp

    ; set up the frame pointer and allocate an 80-byte stack frame
    mov rbp, rsp
    sub rsp, 80

    ; Save non-volatile registers
    mov [rbp - 8], rbx
    mov [rbp - 16], r12
    mov [rbp - 24], r13
    mov [rbp - 32], r14
    mov [rbp - 40], r15
%ifndef LINUX
    ; rdi/rsi are callee-saved in the Windows x64 convention
    mov [rbp - 48], rdi
    mov [rbp - 56], rsi
%endif

    mov [rbp - 80], nRounds

    ; Load input keystream pointer parameter in RAX
    mov         rax, pKS

    ; Restore ZUC's state variables
    mov         r10d, [pState + OFFSET_FR1]
    mov         r11d, [pState + OFFSET_FR2]
    mov         r12d, [pState + OFFSET_BRC_X0]
    mov         r13d, [pState + OFFSET_BRC_X1]
    mov         r14d, [pState + OFFSET_BRC_X2]
    mov         r15d, [pState + OFFSET_BRC_X3]

    ; Store keystream pointer
    mov [rbp - 64], rax

    ; Store ZUC State Pointer
    mov [rbp - 72], pState

    ; Nothing to do for a zero round count.  Without this check the first
    ; decrement below would wrap to -1, the zero test would never fire and
    ; all 16 rounds (64 bytes) would be written to the caller's buffer.
    ; The exit path stores back the state values loaded above unchanged.
    cmp         qword [rbp - 80], 0
    je          %%exit_loop

    ; Generate N*4B of keystream in N rounds.
    ; N starts at 1 because ZUC_INIT already advanced the ring by one slot.
%assign N 1
%rep MAX_ROUNDS

    mov rdx, [rbp - 72]       ; load *pState
    lea rsi, [rdx]

    BITS_REORG  N
    NONLIN_FUN  1, %%ARCH

    ;Store the keystream
    mov rbx, [rbp - 64]  ; load *pkeystream
    xor eax, r15d        ; Z = W ^ BRC_X3
    mov [rbx], eax
    add rbx, 4          ; increment the pointer
    mov [rbp - 64], rbx   ; save pkeystream

    xor rax, rax          ; working mode: no feedback input

    mov rdx, [rbp - 72]     ; load *pState
    lea rsi, [rdx]

    LFSR_UPDT   N

    dec qword [rbp - 80] ; numRounds - 1
    jz %%exit_loop
%assign N N+1
%endrep

%%exit_loop:
    mov rsi, [rbp - 72]   ; load pState


    ; Save ZUC's state variables
    mov         [rsi + OFFSET_FR1], r10d
    mov         [rsi + OFFSET_FR2], r11d
    mov         [rsi + OFFSET_BRC_X0], r12d
    mov         [rsi + OFFSET_BRC_X1], r13d
    mov         [rsi + OFFSET_BRC_X2], r14d
    mov         [rsi + OFFSET_BRC_X3], r15d

    ; Restore non-volatile registers
    mov rbx, [rbp - 8]
    mov r12, [rbp - 16]
    mov r13, [rbp - 24]
    mov r14, [rbp - 32]
    mov r15, [rbp - 40]
%ifndef LINUX
    mov rdi, [rbp - 48]
    mov rsi, [rbp - 56]
%endif

    ; release the frame and restore base pointer
    mov rsp, rbp
    pop rbp

%endmacro
646
;;
;; void asm_ZucInitialization_sse(uint8_t *pKey, uint8_t *pIV, uint32_t *pState);
;;
;; Initialises the ZUC state in pState from pKey and pIV (ZUC_INIT, SSE path).
;;
;; WIN64
;;	RCX - pKey
;;	RDX - pIV
;;      R8  - pState
;; LIN64
;;	RDI - pKey
;;	RSI - pIV
;;      RDX - pState
;;
align 16
MKGLOBAL(asm_ZucInitialization_sse,function,internal)
asm_ZucInitialization_sse:

    ZUC_INIT SSE

    ret
666
;;
;; void asm_ZucInitialization_sse_no_aesni(uint8_t *pKey, uint8_t *pIV,
;;                                         uint32_t *pState);
;;
;; Initialises the ZUC state in pState from pKey and pIV
;; (ZUC_INIT, SSE_NO_AESNI path).
;;
;; WIN64
;;	RCX - pKey
;;	RDX - pIV
;;      R8  - pState
;; LIN64
;;	RDI - pKey
;;	RSI - pIV
;;      RDX - pState
;;
align 16
MKGLOBAL(asm_ZucInitialization_sse_no_aesni,function,internal)
asm_ZucInitialization_sse_no_aesni:

    ZUC_INIT SSE_NO_AESNI

    ret
687
;;
;; void asm_ZucInitialization_avx(uint8_t *pKey, uint8_t *pIV, uint32_t *pState);
;;
;; Initialises the ZUC state in pState from pKey and pIV (ZUC_INIT, AVX path).
;;
;; WIN64
;;	RCX - pKey
;;	RDX - pIV
;;      R8  - pState
;; LIN64
;;	RDI - pKey
;;	RSI - pIV
;;      RDX - pState
;;
align 16
MKGLOBAL(asm_ZucInitialization_avx,function,internal)
asm_ZucInitialization_avx:

    ZUC_INIT AVX

    ret
707
;;
;; void asm_ZucGenKeystream8B_sse(void *pKeystream, ZucState_t *pState);
;;
;; Generates 8 bytes of keystream (2 rounds of ZUC_KEYGEN, SSE path).
;;
;; WIN64
;;	RCX - KS (key stream pointer)
;; 	RDX - STATE (state pointer)
;; LIN64
;;	RDI - KS (key stream pointer)
;;	RSI - STATE (state pointer)
;;
align 16
MKGLOBAL(asm_ZucGenKeystream8B_sse,function,internal)
asm_ZucGenKeystream8B_sse:

    ZUC_KEYGEN SSE, 2

    ret
725
;;
;; void asm_ZucGenKeystream8B_sse_no_aesni(void *pKeystream, ZucState_t *pState);
;;
;; Generates 8 bytes of keystream (2 rounds of ZUC_KEYGEN, SSE_NO_AESNI path).
;;
;; WIN64
;;	RCX - KS (key stream pointer)
;; 	RDX - STATE (state pointer)
;; LIN64
;;	RDI - KS (key stream pointer)
;;	RSI - STATE (state pointer)
;;
align 16
MKGLOBAL(asm_ZucGenKeystream8B_sse_no_aesni,function,internal)
asm_ZucGenKeystream8B_sse_no_aesni:

    ZUC_KEYGEN SSE_NO_AESNI, 2

    ret
743
;;
;; void asm_ZucGenKeystream8B_avx(void *pKeystream, ZucState_t *pState);
;;
;; Generates 8 bytes of keystream (2 rounds of ZUC_KEYGEN, AVX path).
;;
;; WIN64
;;	RCX - KS (key stream pointer)
;; 	RDX - STATE (state pointer)
;; LIN64
;;	RDI - KS (key stream pointer)
;;	RSI - STATE (state pointer)
;;
align 16
MKGLOBAL(asm_ZucGenKeystream8B_avx,function,internal)
asm_ZucGenKeystream8B_avx:

    ZUC_KEYGEN AVX, 2

    ret
761
;;
;; void asm_ZucGenKeystream16B_sse(uint32_t * pKeystream, uint32_t * pState);
;;
;; Generates 16 bytes of keystream (4 rounds of ZUC_KEYGEN, SSE path).
;; The 4-round variant rotates the LFSR with aligned 16-byte moves, so
;; pState must be 16-byte aligned.
;;
;; WIN64
;;	RCX - KS (key stream pointer)
;; 	RDX - STATE (state pointer)
;; LIN64
;;	RDI - KS (key stream pointer)
;;	RSI - STATE (state pointer)
;;
align 16
MKGLOBAL(asm_ZucGenKeystream16B_sse,function,internal)
asm_ZucGenKeystream16B_sse:

    ZUC_KEYGEN SSE, 4

    ret
779
;;
;; void asm_ZucGenKeystream16B_sse_no_aesni(uint32_t * pKeystream, uint32_t * pState);
;;
;; Generates 16 bytes of keystream (4 rounds of ZUC_KEYGEN, SSE_NO_AESNI path).
;; The 4-round variant rotates the LFSR with aligned 16-byte moves, so
;; pState must be 16-byte aligned.
;;
;; WIN64
;;	RCX - KS (key stream pointer)
;; 	RDX - STATE (state pointer)
;; LIN64
;;	RDI - KS (key stream pointer)
;;	RSI - STATE (state pointer)
;;
align 16
MKGLOBAL(asm_ZucGenKeystream16B_sse_no_aesni,function,internal)
asm_ZucGenKeystream16B_sse_no_aesni:

    ZUC_KEYGEN SSE_NO_AESNI, 4

    ret
797
;;
;; void asm_ZucGenKeystream64B_avx(uint32_t * pKeystream, uint32_t * pState);
;;
;; Generates 64 bytes of keystream (16 rounds of ZUC_KEYGEN, AVX path).
;;
;; WIN64
;;	RCX - KS (key stream pointer)
;; 	RDX - STATE (state pointer)
;; LIN64
;;	RDI - KS (key stream pointer)
;;	RSI - STATE (state pointer)
;;
align 16
MKGLOBAL(asm_ZucGenKeystream64B_avx,function,internal)
asm_ZucGenKeystream64B_avx:

    ZUC_KEYGEN AVX, 16

    ret
815
;;
;; void asm_ZucGenKeystream32B_avx(uint32_t * pKeystream, uint32_t * pState);
;;
;; Generates 32 bytes of keystream (8 rounds of ZUC_KEYGEN, AVX path).
;; The 8-round variant rotates the LFSR with aligned 16-byte moves, so
;; pState must be 16-byte aligned.
;;
;; WIN64
;;	RCX - KS (key stream pointer)
;; 	RDX - STATE (state pointer)
;; LIN64
;;	RDI - KS (key stream pointer)
;;	RSI - STATE (state pointer)
;;
align 16
MKGLOBAL(asm_ZucGenKeystream32B_avx,function,internal)
asm_ZucGenKeystream32B_avx:

    ZUC_KEYGEN AVX, 8

    ret
833
;;
;; void asm_ZucGenKeystream16B_avx(uint32_t * pKeystream, uint32_t * pState);
;;
;; Generates 16 bytes of keystream (4 rounds of ZUC_KEYGEN, AVX path).
;; The 4-round variant rotates the LFSR with aligned 16-byte moves, so
;; pState must be 16-byte aligned.
;;
;; WIN64
;;	RCX - KS (key stream pointer)
;; 	RDX - STATE (state pointer)
;; LIN64
;;	RDI - KS (key stream pointer)
;;	RSI - STATE (state pointer)
;;
align 16
MKGLOBAL(asm_ZucGenKeystream16B_avx,function,internal)
asm_ZucGenKeystream16B_avx:

    ZUC_KEYGEN AVX, 4

    ret
851
;;
;; void asm_ZucGenKeystream_sse(uint32_t * pKeystream, uint32_t * pState,
;;                              uint64_t numRounds);
;;
;; Generates numRounds * 4 bytes of keystream, at most 16 rounds
;; (ZUC_KEYGEN_VAR, SSE path).
;;
;; WIN64
;;	RCX - KS (key stream pointer)
;; 	RDX - STATE (state pointer)
;; 	R8  - NROUNDS (number of 4B rounds)
;; LIN64
;;	RDI - KS (key stream pointer)
;;	RSI - STATE (state pointer)
;; 	RDX - NROUNDS (number of 4B rounds)
;;
align 16
MKGLOBAL(asm_ZucGenKeystream_sse,function,internal)
asm_ZucGenKeystream_sse:

    ZUC_KEYGEN_VAR SSE

    ret
872
;;
;; void asm_ZucGenKeystream_sse_no_aesni(uint32_t * pKeystream, uint32_t * pState,
;;                                       uint64_t numRounds);
;;
;; Generates numRounds * 4 bytes of keystream, at most 16 rounds
;; (ZUC_KEYGEN_VAR, SSE_NO_AESNI path).
;;
;; WIN64
;;	RCX - KS (key stream pointer)
;; 	RDX - STATE (state pointer)
;; 	R8  - NROUNDS (number of 4B rounds)
;; LIN64
;;	RDI - KS (key stream pointer)
;;	RSI - STATE (state pointer)
;; 	RDX - NROUNDS (number of 4B rounds)
;;
align 16
MKGLOBAL(asm_ZucGenKeystream_sse_no_aesni,function,internal)
asm_ZucGenKeystream_sse_no_aesni:

    ZUC_KEYGEN_VAR SSE_NO_AESNI

    ret
893
;;
;; void asm_ZucGenKeystream_avx(uint32_t * pKeystream, uint32_t * pState,
;;                              uint64_t numRounds);
;;
;; Generates numRounds * 4 bytes of keystream, at most 16 rounds
;; (ZUC_KEYGEN_VAR, AVX path).
;;
;; WIN64
;;	RCX - KS (key stream pointer)
;; 	RDX - STATE (state pointer)
;; 	R8  - NROUNDS (number of 4B rounds)
;; LIN64
;;	RDI - KS (key stream pointer)
;;	RSI - STATE (state pointer)
;; 	RDX - NROUNDS (number of 4B rounds)
;;
align 16
MKGLOBAL(asm_ZucGenKeystream_avx,function,internal)
asm_ZucGenKeystream_avx:

    ZUC_KEYGEN_VAR AVX

    ret
914