;;
;; Copyright (c) 2020, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

%include "include/os.asm"
%include "include/reg_sizes.asm"
%include "include/zuc_sbox.inc"
%include "include/transpose_avx2.asm"
%include "include/memcpy.asm"
%include "mb_mgr_datastruct.asm"

%define APPEND(a,b) a %+ b

section .data
default rel

align 32
Ek_d:
dd      0x0044D700, 0x0026BC00, 0x00626B00, 0x00135E00, 0x00578900, 0x0035E200, 0x00713500, 0x0009AF00
dd      0x004D7800, 0x002F1300, 0x006BC400, 0x001AF100, 0x005E2600, 0x003C4D00, 0x00789A00, 0x0047AC00

align 32
shuf_mask_key:
dd      0x00FFFFFF, 0x01FFFFFF, 0x02FFFFFF, 0x03FFFFFF, 0x04FFFFFF, 0x05FFFFFF, 0x06FFFFFF, 0x07FFFFFF,
dd      0x08FFFFFF, 0x09FFFFFF, 0x0AFFFFFF, 0x0BFFFFFF, 0x0CFFFFFF, 0x0DFFFFFF, 0x0EFFFFFF, 0x0FFFFFFF,

align 32
shuf_mask_iv:
dd      0xFFFFFF00, 0xFFFFFF01, 0xFFFFFF02, 0xFFFFFF03, 0xFFFFFF04, 0xFFFFFF05, 0xFFFFFF06, 0xFFFFFF07,
dd      0xFFFFFF08, 0xFFFFFF09, 0xFFFFFF0A, 0xFFFFFF0B, 0xFFFFFF0C, 0xFFFFFF0D, 0xFFFFFF0E, 0xFFFFFF0F,

align 32
mask31:
dd      0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF,
dd      0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF,

align 32
swap_mask:
db      0x03, 0x02, 0x01, 0x00, 0x07, 0x06, 0x05, 0x04
db      0x0b, 0x0a, 0x09, 0x08, 0x0f, 0x0e, 0x0d, 0x0c
db      0x03, 0x02, 0x01, 0x00, 0x07, 0x06, 0x05, 0x04
db      0x0b, 0x0a, 0x09, 0x08, 0x0f, 0x0e, 0x0d, 0x0c


align 32
S1_S0_shuf:
db      0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E, 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F
db      0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E, 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F

align 32
S0_S1_shuf:
db      0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F, 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,
db      0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F, 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E,

align 32
rev_S1_S0_shuf:
db      0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F
db      0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F

align 32
rev_S0_S1_shuf:
db      0x08, 0x00, 0x09, 0x01, 0x0A, 0x02, 0x0B, 0x03, 0x0C, 0x04, 0x0D, 0x05, 0x0E, 0x06, 0x0F, 0x07
db      0x08, 0x00, 0x09, 0x01, 0x0A, 0x02, 0x0B, 0x03, 0x0C, 0x04, 0x0D, 0x05, 0x0E, 0x06, 0x0F, 0x07

align 32
rot8_mod32:
db      0x03, 0x00, 0x01, 0x02, 0x07, 0x04, 0x05, 0x06,
db      0x0B, 0x08, 0x09, 0x0A, 0x0F, 0x0C, 0x0D, 0x0E
db      0x03, 0x00, 0x01, 0x02, 0x07, 0x04, 0x05, 0x06,
db      0x0B, 0x08, 0x09, 0x0A, 0x0F, 0x0C, 0x0D, 0x0E

align 32
rot16_mod32:
db      0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
db      0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D
db      0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
db      0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D

align 32
rot24_mod32:
db      0x01, 0x02, 0x03, 0x00, 0x05, 0x06, 0x07, 0x04,
db      0x09, 0x0A, 0x0B, 0x08, 0x0D, 0x0E, 0x0F, 0x0C
db      0x01, 0x02, 0x03, 0x00, 0x05, 0x06, 0x07, 0x04,
db      0x09, 0x0A, 0x0B, 0x08, 0x0D, 0x0E, 0x0F, 0x0C

align 16
broadcast_word:
db      0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01
db      0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01

align 16
all_ffs:
dw      0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff

align 16
all_threes:
dw      0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003

align 16
all_fffcs:
dw      0xfffc, 0xfffc, 0xfffc, 0xfffc, 0xfffc, 0xfffc, 0xfffc, 0xfffc

align 16
all_1fs:
dw      0x001f, 0x001f, 0x001f, 0x001f, 0x001f, 0x001f, 0x001f, 0x001f

align 16
all_20s:
dw      0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020

section .text
align 64

%define MASK31  ymm12

%define OFS_R1  (16*(2*16))
%define OFS_R2  (OFS_R1 + (2*16))
%define OFS_X0  (OFS_R2 + (2*16))
%define OFS_X1  (OFS_X0 + (2*16))
%define OFS_X2  (OFS_X1 + (2*16))
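
;; In-memory state layout assumed by the offsets above (derived from the
;; OFS_* definitions; every entry holds one 32-bit word per lane, 8 lanes):
;;   bytes   0..511 : LFSR_S0..LFSR_S15 (16 registers x 32 bytes)
;;   bytes 512..543 : F_R1   (OFS_R1)
;;   bytes 544..575 : F_R2   (OFS_R2)
;;   bytes 576..607 : BRC_X0 (OFS_X0)
;;   bytes 608..639 : BRC_X1 (OFS_X1)
;;   bytes 640..671 : BRC_X2 (OFS_X2)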

%ifidn __OUTPUT_FORMAT__, win64
        %define XMM_STORAGE     16*10
        %define GP_STORAGE      8*8
%else
        %define XMM_STORAGE     0
        %define GP_STORAGE      6*8
%endif

%define VARIABLE_OFFSET XMM_STORAGE + GP_STORAGE
%define GP_OFFSET XMM_STORAGE

%macro FUNC_SAVE 0
        mov     r11, rsp
        sub     rsp, VARIABLE_OFFSET
        and     rsp, ~15

%ifidn __OUTPUT_FORMAT__, win64
        ; xmm6:xmm15 need to be maintained for Windows
        vmovdqa [rsp + 0*16], xmm6
        vmovdqa [rsp + 1*16], xmm7
        vmovdqa [rsp + 2*16], xmm8
        vmovdqa [rsp + 3*16], xmm9
        vmovdqa [rsp + 4*16], xmm10
        vmovdqa [rsp + 5*16], xmm11
        vmovdqa [rsp + 6*16], xmm12
        vmovdqa [rsp + 7*16], xmm13
        vmovdqa [rsp + 8*16], xmm14
        vmovdqa [rsp + 9*16], xmm15
        mov     [rsp + GP_OFFSET + 48], rdi
        mov     [rsp + GP_OFFSET + 56], rsi
%endif
        mov     [rsp + GP_OFFSET],      r12
        mov     [rsp + GP_OFFSET + 8],  r13
        mov     [rsp + GP_OFFSET + 16], r14
        mov     [rsp + GP_OFFSET + 24], r15
        mov     [rsp + GP_OFFSET + 32], rbx
        mov     [rsp + GP_OFFSET + 40], r11 ;; rsp pointer
%endmacro


%macro FUNC_RESTORE 0

%ifidn __OUTPUT_FORMAT__, win64
        vmovdqa xmm6,  [rsp + 0*16]
        vmovdqa xmm7,  [rsp + 1*16]
        vmovdqa xmm8,  [rsp + 2*16]
        vmovdqa xmm9,  [rsp + 3*16]
        vmovdqa xmm10, [rsp + 4*16]
        vmovdqa xmm11, [rsp + 5*16]
        vmovdqa xmm12, [rsp + 6*16]
        vmovdqa xmm13, [rsp + 7*16]
        vmovdqa xmm14, [rsp + 8*16]
        vmovdqa xmm15, [rsp + 9*16]
        mov     rdi, [rsp + GP_OFFSET + 48]
        mov     rsi, [rsp + GP_OFFSET + 56]
%endif
        mov     r12, [rsp + GP_OFFSET]
        mov     r13, [rsp + GP_OFFSET + 8]
        mov     r14, [rsp + GP_OFFSET + 16]
        mov     r15, [rsp + GP_OFFSET + 24]
        mov     rbx, [rsp + GP_OFFSET + 32]
        mov     rsp, [rsp + GP_OFFSET + 40]
%endmacro

; This macro reorders the LFSR registers
; after N rounds (1 <= N <= 15), since the registers
; are shifted every round
;
; The macro clobbers YMM0-15
;
%macro REORDER_LFSR 2
%define %%STATE      %1
%define %%NUM_ROUNDS %2

%if %%NUM_ROUNDS != 16
%assign i 0
%rep 16
    vmovdqa APPEND(ymm,i), [%%STATE + 32*i]
%assign i (i+1)
%endrep

%assign i 0
%assign j %%NUM_ROUNDS
%rep 16
    vmovdqa [%%STATE + 32*i], APPEND(ymm,j)
%assign i (i+1)
%assign j ((j+1) % 16)
%endrep
%endif ;; %%NUM_ROUNDS != 16

%endmacro

;;
;;   make_u31()
;;
%macro  make_u31    4

%define %%Rt        %1
%define %%Ke        %2
%define %%Ek        %3
%define %%Iv        %4
    xor         %%Rt, %%Rt
    shrd        %%Rt, %%Iv, 8
    shrd        %%Rt, %%Ek, 15
    shrd        %%Rt, %%Ke, 9
%endmacro
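
;; Reference for make_u31 (C-like pseudocode), assuming %%Ke holds the 8-bit
;; key byte, %%Ek the 15-bit d constant and %%Iv the 8-bit IV byte, each
;; zero-extended in a 32-bit register; the shrd sequence above packs them as
;; in the ZUC spec (s_i = k_i || d_i || iv_i):
;;
;;   uint32_t make_u31(uint32_t k, uint32_t d, uint32_t iv)
;;   {
;;       return (k << 23) | (d << 8) | iv;
;;   }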


;
;   bits_reorg8()
;
%macro  bits_reorg8 2-3
%define %%STATE     %1 ; [in] ZUC state
%define %%ROUND_NUM %2 ; [in] Round number
%define %%X3        %3 ; [out] YMM register containing X3 of all lanes
    ;
    ; ymm15 = LFSR_S15
    ; ymm14 = LFSR_S14
    ; ymm11 = LFSR_S11
    ; ymm9  = LFSR_S9
    ; ymm7  = LFSR_S7
    ; ymm5  = LFSR_S5
    ; ymm2  = LFSR_S2
    ; ymm0  = LFSR_S0
    ;
    vmovdqa     ymm15, [%%STATE + ((15 + %%ROUND_NUM) % 16)*32]
    vmovdqa     ymm14, [%%STATE + ((14 + %%ROUND_NUM) % 16)*32]
    vmovdqa     ymm11, [%%STATE + ((11 + %%ROUND_NUM) % 16)*32]
    vmovdqa     ymm9,  [%%STATE + (( 9 + %%ROUND_NUM) % 16)*32]
    vmovdqa     ymm7,  [%%STATE + (( 7 + %%ROUND_NUM) % 16)*32]
    vmovdqa     ymm5,  [%%STATE + (( 5 + %%ROUND_NUM) % 16)*32]
    vmovdqa     ymm2,  [%%STATE + (( 2 + %%ROUND_NUM) % 16)*32]
    vmovdqa     ymm0,  [%%STATE + (( 0 + %%ROUND_NUM) % 16)*32]

    vpxor       ymm1, ymm1
    vpslld      ymm15, 1
    vpblendw    ymm3,  ymm14, ymm1, 0xAA
    vpblendw    ymm15, ymm3, ymm15, 0xAA

    vmovdqa     [%%STATE + OFS_X0], ymm15   ; BRC_X0
    vpslld      ymm11, 16
    vpsrld      ymm9, 15
    vpor        ymm11, ymm9
    vmovdqa     [%%STATE + OFS_X1], ymm11   ; BRC_X1
    vpslld      ymm7, 16
    vpsrld      ymm5, 15
    vpor        ymm7, ymm5
    vmovdqa     [%%STATE + OFS_X2], ymm7    ; BRC_X2
%if (%0 == 3)
    vpslld      ymm2, 16
    vpsrld      ymm0, 15
    vpor        %%X3, ymm2, ymm0 ; Store BRC_X3 in YMM register
%endif
%endmacro
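
;
;   Per lane, the macro above implements the ZUC bit-reorganization, where
;   sH/sL denote the high/low 16 bits of each 31-bit LFSR word:
;
;       X0 = s15H || s14L
;       X1 = s11L || s9H
;       X2 = s7L  || s5H
;       X3 = s2L  || s0H   (only when the optional 3rd parameter is passed)
;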

;
;   rot_mod32()
;
;   uses ymm7
;
%macro  rot_mod32   3
%if (%3 == 8)
    vpshufb %1, %2, [rel rot8_mod32]
%elif (%3 == 16)
    vpshufb %1, %2, [rel rot16_mod32]
%elif (%3 == 24)
    vpshufb %1, %2, [rel rot24_mod32]
%else
    vpslld      %1, %2, %3
    vpsrld      ymm7, %2, (32 - %3)

    vpor        %1, ymm7
%endif
%endmacro


;
;   nonlin_fun8()
;
;   return
;       W value, updates F_R1[] / F_R2[]
;
%macro nonlin_fun8  1-2
%define %%STATE     %1  ; [in] ZUC state
%define %%W         %2  ; [out] YMM register to contain W for all lanes

%if (%0 == 2)
    vmovdqa     %%W, [%%STATE + OFS_X0]
    vpxor       %%W, [%%STATE + OFS_R1]
    vpaddd      %%W, [%%STATE + OFS_R2]    ; W = (BRC_X0 ^ F_R1) + F_R2
%endif

    vmovdqa     ymm1, [%%STATE + OFS_R1]
    vmovdqa     ymm2, [%%STATE + OFS_R2]
    vpaddd      ymm1, [%%STATE + OFS_X1]    ; W1 = F_R1 + BRC_X1
    vpxor       ymm2, [%%STATE + OFS_X2]    ; W2 = F_R2 ^ BRC_X2

    vpslld      ymm3, ymm1, 16
    vpsrld      ymm4, ymm1, 16
    vpslld      ymm5, ymm2, 16
    vpsrld      ymm6, ymm2, 16
    vpor        ymm1, ymm3, ymm6
    vpor        ymm2, ymm4, ymm5

    rot_mod32   ymm3, ymm1, 2
    rot_mod32   ymm4, ymm1, 10
    rot_mod32   ymm5, ymm1, 18
    rot_mod32   ymm6, ymm1, 24
    vpxor       ymm1, ymm3
    vpxor       ymm1, ymm4
    vpxor       ymm1, ymm5
    vpxor       ymm1, ymm6      ; ymm1 = U = L1(P)

    rot_mod32   ymm3, ymm2, 8
    rot_mod32   ymm4, ymm2, 14
    rot_mod32   ymm5, ymm2, 22
    rot_mod32   ymm6, ymm2, 30
    vpxor       ymm2, ymm3
    vpxor       ymm2, ymm4
    vpxor       ymm2, ymm5
    vpxor       ymm2, ymm6      ; ymm2 = V = L2(Q)

    ; Shuffle U and V to have all S0 lookups in ymm1 and all S1 lookups in ymm2

    ; Compress all S0 and S1 input values in each register
    vpshufb     ymm1, [rel S0_S1_shuf] ; S0: Bytes 0-7,16-23 S1: Bytes 8-15,24-31
    vpshufb     ymm2, [rel S1_S0_shuf] ; S1: Bytes 0-7,16-23 S0: Bytes 8-15,24-31

    vshufpd     ymm3, ymm1, ymm2, 0xA ; All S0 input values
    vshufpd     ymm4, ymm2, ymm1, 0xA ; All S1 input values

    ; Compute S0 and S1 values
    S0_comput_AVX2  ymm3, ymm1, ymm2
    S1_comput_AVX2  ymm4, ymm1, ymm2, ymm5

    ; Need to shuffle back ymm1 & ymm2 before storing output
    ; (revert what was done before S0 and S1 computations)
    vshufpd    ymm1, ymm3, ymm4, 0xA
    vshufpd    ymm2, ymm4, ymm3, 0xA

    vpshufb     ymm1, [rel rev_S0_S1_shuf]
    vpshufb     ymm2, [rel rev_S1_S0_shuf]

    vmovdqa     [%%STATE + OFS_R1], ymm1
    vmovdqa     [%%STATE + OFS_R2], ymm2
%endmacro
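
;
;   Per lane, the macro above implements the ZUC nonlinear function F
;   (C-like pseudocode; S() is the S-box layer, L1/L2 are the linear
;   transforms built from the rot_mod32 calls, rotl32 a 32-bit left rotation):
;
;       W  = (X0 ^ R1) + R2;   /* only when the W parameter is passed */
;       W1 = R1 + X1;
;       W2 = R2 ^ X2;
;       R1 = S(L1((W1 << 16) | (W2 >> 16)));
;       R2 = S(L2((W2 << 16) | (W1 >> 16)));
;
;       L1(x) = x ^ rotl32(x,2) ^ rotl32(x,10) ^ rotl32(x,18) ^ rotl32(x,24)
;       L2(x) = x ^ rotl32(x,8) ^ rotl32(x,14) ^ rotl32(x,22) ^ rotl32(x,30)
;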

;
;   store32B_kstr8()
;
%macro  store32B_kstr8 8
%define %%DATA32B_L0  %1  ; [in] 32 bytes of keystream for lane 0
%define %%DATA32B_L1  %2  ; [in] 32 bytes of keystream for lane 1
%define %%DATA32B_L2  %3  ; [in] 32 bytes of keystream for lane 2
%define %%DATA32B_L3  %4  ; [in] 32 bytes of keystream for lane 3
%define %%DATA32B_L4  %5  ; [in] 32 bytes of keystream for lane 4
%define %%DATA32B_L5  %6  ; [in] 32 bytes of keystream for lane 5
%define %%DATA32B_L6  %7  ; [in] 32 bytes of keystream for lane 6
%define %%DATA32B_L7  %8  ; [in] 32 bytes of keystream for lane 7

    mov         rcx, [rsp]
    mov         rdx, [rsp + 8]
    mov         r8,  [rsp + 16]
    mov         r9,  [rsp + 24]
    vmovdqu     [rcx], %%DATA32B_L0
    vmovdqu     [rdx], %%DATA32B_L1
    vmovdqu     [r8],  %%DATA32B_L2
    vmovdqu     [r9],  %%DATA32B_L3

    mov         rcx, [rsp + 32]
    mov         rdx, [rsp + 40]
    mov         r8,  [rsp + 48]
    mov         r9,  [rsp + 56]
    vmovdqu     [rcx], %%DATA32B_L4
    vmovdqu     [rdx], %%DATA32B_L5
    vmovdqu     [r8],  %%DATA32B_L6
    vmovdqu     [r9],  %%DATA32B_L7

%endmacro

;
;   store4B_kstr8()
;
;   params
;
;   %1 - YMM register with OFS_X3
;   return
;
%macro  store4B_kstr8 1
    mov         rcx, [rsp]
    mov         rdx, [rsp + 8]
    mov         r8,  [rsp + 16]
    mov         r9,  [rsp + 24]
    vpextrd     [r9],  XWORD(%1), 3
    vpextrd     [r8],  XWORD(%1), 2
    vpextrd     [rdx], XWORD(%1), 1
    vmovd       [rcx], XWORD(%1)
    add         rcx, 4
    add         rdx, 4
    add         r8, 4
    add         r9, 4
    mov         [rsp],      rcx
    mov         [rsp + 8],  rdx
    mov         [rsp + 16], r8
    mov         [rsp + 24], r9

    vextracti128 XWORD(%1), %1, 1
    mov         rcx, [rsp + 32]
    mov         rdx, [rsp + 40]
    mov         r8,  [rsp + 48]
    mov         r9,  [rsp + 56]
    vpextrd     [r9],  XWORD(%1), 3
    vpextrd     [r8],  XWORD(%1), 2
    vpextrd     [rdx], XWORD(%1), 1
    vmovd       [rcx], XWORD(%1)
    add         rcx, 4
    add         rdx, 4
    add         r8, 4
    add         r9, 4
    mov         [rsp + 32], rcx
    mov         [rsp + 40], rdx
    mov         [rsp + 48], r8
    mov         [rsp + 56], r9

%endmacro


;
;   add_mod31()
;       add two 32-bit args and reduce mod (2^31-1)
;   params
;       %1  - arg1/res
;       %2  - arg2
;   uses
;       ymm2
;   return
;       %1
%macro  add_mod31   2
    vpaddd      %1, %2
    vpsrld      ymm2, %1, 31
    vpand       %1, MASK31
    vpaddd      %1, ymm2
%endmacro
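
;   Per 32-bit element, for a, b < 2^31 this computes:
;       c = a + b;
;       c = (c & 0x7FFFFFFF) + (c >> 31);   /* fold the carry back in */
;   which is congruent to (a + b) mod (2^31 - 1) and stays below 2^31.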


;
;   rot_mod31()
;       rotate (mult by pow of 2) 32-bit arg and reduce mod (2^31-1)
;   params
;       %1  - arg
;       %2  - # of bits
;   uses
;       ymm2
;   return
;       %1
%macro  rot_mod31   2

    vpslld      ymm2, %1, %2
    vpsrld      %1, %1, (31 - %2)

    vpor        %1, ymm2
    vpand       %1, MASK31
%endmacro
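
;   Per 32-bit element, for a 31-bit value a this computes the 31-bit rotation
;       r = ((a << k) | (a >> (31 - k))) & 0x7FFFFFFF;
;   i.e. multiplication by 2^k modulo (2^31 - 1).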


;
;   lfsr_updt8()
;
;
%macro  lfsr_updt8  3
%define %%STATE     %1 ; [in] ZUC state
%define %%ROUND_NUM %2 ; [in] Round number
%define %%W         %3 ; [in/clobbered] YMM register to contain W for all lanes
    ;
    ; ymm1  = LFSR_S0
    ; ymm4  = LFSR_S4
    ; ymm10 = LFSR_S10
    ; ymm13 = LFSR_S13
    ; ymm15 = LFSR_S15
    ;
    vmovdqa     ymm1,  [%%STATE + (( 0 + %%ROUND_NUM) % 16)*32]
    vmovdqa     ymm4,  [%%STATE + (( 4 + %%ROUND_NUM) % 16)*32]
    vmovdqa     ymm10, [%%STATE + ((10 + %%ROUND_NUM) % 16)*32]
    vmovdqa     ymm13, [%%STATE + ((13 + %%ROUND_NUM) % 16)*32]
    vmovdqa     ymm15, [%%STATE + ((15 + %%ROUND_NUM) % 16)*32]

    ; Calculate LFSR feedback
    add_mod31   %%W, ymm1
    rot_mod31   ymm1, 8
    add_mod31   %%W, ymm1
    rot_mod31   ymm4, 20
    add_mod31   %%W, ymm4
    rot_mod31   ymm10, 21
    add_mod31   %%W, ymm10
    rot_mod31   ymm13, 17
    add_mod31   %%W, ymm13
    rot_mod31   ymm15, 15
    add_mod31   %%W, ymm15

    vmovdqa     [%%STATE + (( 0 + %%ROUND_NUM) % 16)*32], %%W

    ; LFSR_S16 = (LFSR_S15++) = W
%endmacro
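
;
;   Per lane, the feedback computed above is (all sums mod (2^31 - 1)):
;
;       s16 = v + 2^15*s15 + 2^17*s13 + 2^21*s10 + 2^20*s4 + (1 + 2^8)*s0
;
;   where v is the %%W input: the callers pass W >> 1 during initialization
;   and 0 during keystream generation. s16 overwrites the slot of s0; the
;   logical shift of the register bank is done by rotating slot indices with
;   ROUND_NUM rather than by moving data.
;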

;
; Initialize LFSR registers for a single lane
;
; This macro initializes 8 LFSR registers at a time,
; so it needs to be called twice.
;
; From spec, s_i (LFSR) registers need to be loaded as follows:
;
; For 0 <= i <= 15, let s_i = k_i || d_i || iv_i.
; Where k_i is each byte of the key, d_i is a 15-bit constant
; and iv_i is each byte of the IV.
;
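; For example, for lane 0 the first LFSR word is
;   s_0 = key[0] (8 bits) || d_0 = 0x44D7 (15 bits) || iv[0] (8 bits)
; The Ek_d table above stores each d_i constant already shifted left by 8
; (e.g. 0x0044D700 = 0x44D7 << 8), so it can be OR'ed directly on top of
; the key and IV bytes placed by the shuffle masks.
;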
%macro INIT_LFSR 7
%define %%KEY       %1 ;; [in] Key pointer
%define %%IV        %2 ;; [in] IV pointer
%define %%SHUF_KEY  %3 ;; [in] Shuffle key mask
%define %%SHUF_IV   %4 ;; [in] Shuffle IV mask
%define %%EKD_MASK  %5 ;; [in] Ek_d constants mask
%define %%LFSR      %6 ;; [out] YMM register to contain initialized LFSR regs
%define %%YTMP      %7 ;; [clobbered] YMM temporary register

    vbroadcastf128  %%LFSR, [%%KEY]
    vbroadcastf128  %%YTMP, [%%IV]
    vpshufb         %%LFSR, %%SHUF_KEY
    vpsrld          %%LFSR, 1
    vpshufb         %%YTMP, %%SHUF_IV
    vpor            %%LFSR, %%YTMP
    vpor            %%LFSR, %%EKD_MASK

%endmacro


MKGLOBAL(asm_ZucInitialization_8_avx2,function,internal)
asm_ZucInitialization_8_avx2:

%ifdef LINUX
        %define     pKe     rdi
        %define     pIv     rsi
        %define     pState  rdx
%else
        %define     pKe     rcx
        %define     pIv     rdx
        %define     pState  r8
%endif

    FUNC_SAVE

    ; Zero out R1/R2 (only lower half is used)
    vpxor   ymm0, ymm0
%assign I 0
%rep 2
    vmovdqa [pState + OFS_R1 + I*32], ymm0
%assign I (I + 1)
%endrep

    ;;; Initialize all LFSR registers in two steps:
    ;;; first, registers 0-7, then registers 8-15

%assign off 0
%rep 2
    ; Set read-only registers for shuffle masks for key, IV and Ek_d for 8 registers
    vmovdqa ymm13, [rel shuf_mask_key + off]
    vmovdqa ymm14, [rel shuf_mask_iv + off]
    vmovdqa ymm15, [rel Ek_d + off]

    ; Set 8xLFSR registers for all packets
%assign idx 0
%rep 8
    mov     r9, [pKe+8*idx]  ; Load Key N pointer
    mov     r10, [pIv+8*idx] ; Load IV N pointer
    INIT_LFSR r9, r10, ymm13, ymm14, ymm15, APPEND(ymm, idx), ymm12
%assign idx (idx + 1)
%endrep

    ; Store 8xLFSR registers in memory (reordering first,
    ; so all SX registers are together)
    TRANSPOSE8_U32  ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9

%assign i 0
%rep 8
    vmovdqa [pState + 8*off + 32*i], APPEND(ymm, i)
%assign i (i+1)
%endrep

%assign off (off + 32)
%endrep

    ; Load read-only registers
    vmovdqa  ymm12, [rel mask31]

    mov rax, pState

    ; Shift LFSR 32-times, update state variables
%assign N 0
%rep 32
    bits_reorg8 rax, N
    nonlin_fun8 rax, ymm0
    vpsrld  ymm0,1           ; Shift out LSB of W
    lfsr_updt8  rax, N, ymm0 ; W (ymm0) used in LFSR update - not set to zero
%assign N N+1
%endrep

    ; And once more, initial round from keygen phase = 33 times
    bits_reorg8 rax, 0
    nonlin_fun8 rax

    vpxor    ymm0, ymm0
    lfsr_updt8  rax, 0, ymm0

    FUNC_RESTORE

    ret

;
; Generate N*4 bytes of keystream
; for 8 buffers (where N is number of rounds)
;
%macro KEYGEN_8_AVX2 1
%define %%NUM_ROUNDS    %1 ; [in] Number of 4-byte rounds

%ifdef LINUX
        %define     pState  rdi
        %define     pKS     rsi
%else
        %define     pState  rcx
        %define     pKS     rdx
%endif

    FUNC_SAVE

    ; Store 8 keystream pointers on the stack
    ; and reserve memory for storing keystreams for all 8 buffers
    mov     r10, rsp
    sub     rsp, (8*8 + %%NUM_ROUNDS * 32)
    and     rsp, -32    ; align stack to 32 bytes

%assign i 0
%rep 2
    vmovdqa     ymm0, [pKS + 32*i]
    vmovdqa     [rsp + 32*i], ymm0
%assign i (i+1)
%endrep

    ; Load state pointer in RAX
    mov         rax, pState

    ; Load read-only registers
    vmovdqa     ymm12, [rel mask31]

    ; Generate N*4B of keystream in N rounds
%assign N 1
%rep %%NUM_ROUNDS
    bits_reorg8 rax, N, ymm10
    nonlin_fun8 rax, ymm0
    ; OFS_X3 XOR W (ymm0) and store in stack
    vpxor   ymm10, ymm0
    vmovdqa [rsp + 64 + (N-1)*32], ymm10
    vpxor        ymm0, ymm0
    lfsr_updt8  rax, N, ymm0
%assign N N+1
%endrep

%if (%%NUM_ROUNDS == 8)
    ;; Load all OFS_X3
    vmovdqa xmm0,[rsp + 64]
    vmovdqa xmm1,[rsp + 64 + 32*1]
    vmovdqa xmm2,[rsp + 64 + 32*2]
    vmovdqa xmm3,[rsp + 64 + 32*3]
    vmovdqa xmm4,[rsp + 64 + 16]
    vmovdqa xmm5,[rsp + 64 + 32*1 + 16]
    vmovdqa xmm6,[rsp + 64 + 32*2 + 16]
    vmovdqa xmm7,[rsp + 64 + 32*3 + 16]

    vinserti128 ymm0, ymm0, [rsp + 64 + 32*4], 0x01
    vinserti128 ymm1, ymm1, [rsp + 64 + 32*5], 0x01
    vinserti128 ymm2, ymm2, [rsp + 64 + 32*6], 0x01
    vinserti128 ymm3, ymm3, [rsp + 64 + 32*7], 0x01
    vinserti128 ymm4, ymm4, [rsp + 64 + 32*4 + 16], 0x01
    vinserti128 ymm5, ymm5, [rsp + 64 + 32*5 + 16], 0x01
    vinserti128 ymm6, ymm6, [rsp + 64 + 32*6 + 16], 0x01
    vinserti128 ymm7, ymm7, [rsp + 64 + 32*7 + 16], 0x01

    TRANSPOSE8_U32_PRELOADED ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9

    store32B_kstr8 ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7

    ;; Reorder LFSR registers, as not all 16 rounds have been completed
    ;; (No need to do if NUM_ROUNDS != 8, as it would indicate that
    ;; these would be the final rounds)
    REORDER_LFSR rax, 8

%else ;; NUM_ROUNDS == 8
%assign idx 0
%rep %%NUM_ROUNDS
    vmovdqa APPEND(ymm, idx), [rsp + 64 + idx*32]
    store4B_kstr8 APPEND(ymm, idx)
%assign idx (idx + 1)
%endrep
%endif ;; NUM_ROUNDS == 8

        ;; Clear stack frame containing keystream information
%ifdef SAFE_DATA
        vpxor   ymm0, ymm0
%assign i 0
%rep (2+%%NUM_ROUNDS)
        vmovdqa [rsp + i*32], ymm0
%assign i (i+1)
%endrep
%endif

    ;; Restore rsp pointer
    mov         rsp, r10

    FUNC_RESTORE

%endmacro

;;
;; void asm_ZucGenKeystream32B_8_avx2(state8_t *pSta, u32* pKeyStr[8])
;;
;; WIN64
;;  RCX    - pSta
;;  RDX    - pKeyStr
;;
;; LIN64
;;  RDI    - pSta
;;  RSI    - pKeyStr
;;
MKGLOBAL(asm_ZucGenKeystream32B_8_avx2,function,internal)
asm_ZucGenKeystream32B_8_avx2:

    KEYGEN_8_AVX2 8

    ret

;;
;; void asm_ZucGenKeystream8B_8_avx2(state8_t *pSta, u32* pKeyStr[8])
;;
;; WIN64
;;  RCX    - pSta
;;  RDX    - pKeyStr
;;
;; LIN64
;;  RDI    - pSta
;;  RSI    - pKeyStr
;;
MKGLOBAL(asm_ZucGenKeystream8B_8_avx2,function,internal)
asm_ZucGenKeystream8B_8_avx2:

    KEYGEN_8_AVX2 2

    ret

;;
;; Encrypt N*4B bytes on all 8 buffers,
;; where N is the number of rounds (up to 8).
;; In the final call, an array of final byte counts is read
;; from memory and only these final bytes of
;; plaintext are read and XOR'ed.
%macro CIPHERNx4B_8 4
%define %%NROUNDS        %1
%define %%INITIAL_ROUND  %2
%define %%OFFSET         %3
%define %%LAST_CALL      %4

%ifdef LINUX
%define %%TMP1 r8
%define %%TMP2 r9
%else
%define %%TMP1 rdi
%define %%TMP2 rsi
%endif
        ; Load read-only registers
        vmovdqa ymm12, [rel mask31]

        ; Generate N*4B of keystream in N rounds
%assign N 1
%assign round (%%INITIAL_ROUND + N)
%rep %%NROUNDS
        bits_reorg8 rax, round, ymm10
        nonlin_fun8 rax, ymm0
        ; OFS_X3 XOR W (ymm0)
        vpxor   ymm10, ymm0
        vmovdqa [rsp + (N-1)*32], ymm10
        vpxor   ymm0, ymm0
        lfsr_updt8  rax, round, ymm0
%assign N N+1
%assign round (round + 1)
%endrep

%assign N 0
%assign idx 8
%rep %%NROUNDS
        vmovdqa APPEND(ymm, idx), [rsp + N*32]
%assign N N+1
%assign idx (idx+1)
%endrep

        TRANSPOSE8_U32 ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, \
                       ymm15, ymm0, ymm1
        ;; XOR Input buffer with keystream in rounds of 32B

        mov     r12, [pIn]
        mov     r13, [pIn + 8]
        mov     r14, [pIn + 16]
        mov     r15, [pIn + 24]
%if (%%LAST_CALL == 1)
        ;; Save GP registers
        mov     [rsp + 32*8 + 16 + 8],  %%TMP1
        mov     [rsp + 32*8 + 16 + 16], %%TMP2

        ;; Read in r10 the word containing the number of final bytes to read for each lane
        movzx  r10d, word [rsp + 8*32]
        simd_load_avx2 ymm0, r12 + %%OFFSET, r10, %%TMP1, %%TMP2
        movzx  r10d, word [rsp + 8*32 + 2]
        simd_load_avx2 ymm1, r13 + %%OFFSET, r10, %%TMP1, %%TMP2
        movzx  r10d, word [rsp + 8*32 + 4]
        simd_load_avx2 ymm2, r14 + %%OFFSET, r10, %%TMP1, %%TMP2
        movzx  r10d, word [rsp + 8*32 + 6]
        simd_load_avx2 ymm3, r15 + %%OFFSET, r10, %%TMP1, %%TMP2
%else
        vmovdqu ymm0, [r12 + %%OFFSET]
        vmovdqu ymm1, [r13 + %%OFFSET]
        vmovdqu ymm2, [r14 + %%OFFSET]
        vmovdqu ymm3, [r15 + %%OFFSET]
%endif

        mov     r12, [pIn + 32]
        mov     r13, [pIn + 40]
        mov     r14, [pIn + 48]
        mov     r15, [pIn + 56]
%if (%%LAST_CALL == 1)
        movzx  r10d, word [rsp + 8*32 + 8]
        simd_load_avx2 ymm4, r12 + %%OFFSET, r10, %%TMP1, %%TMP2
        movzx  r10d, word [rsp + 8*32 + 10]
        simd_load_avx2 ymm5, r13 + %%OFFSET, r10, %%TMP1, %%TMP2
        movzx  r10d, word [rsp + 8*32 + 12]
        simd_load_avx2 ymm6, r14 + %%OFFSET, r10, %%TMP1, %%TMP2
        movzx  r10d, word [rsp + 8*32 + 14]
        simd_load_avx2 ymm7, r15 + %%OFFSET, r10, %%TMP1, %%TMP2
%else
        vmovdqu ymm4, [r12 + %%OFFSET]
        vmovdqu ymm5, [r13 + %%OFFSET]
        vmovdqu ymm6, [r14 + %%OFFSET]
        vmovdqu ymm7, [r15 + %%OFFSET]
%endif
        ; Shuffle all keystreams and XOR with plaintext
%assign %%I 0
%assign %%J 8
%rep 8
        vpshufb ymm %+ %%J, [rel swap_mask]
        vpxor   ymm %+ %%J, ymm %+ %%I
%assign %%I (%%I + 1)
%assign %%J (%%J + 1)
%endrep

        ;; Write output
        mov     r12, [pOut]
        mov     r13, [pOut + 8]
        mov     r14, [pOut + 16]
        mov     r15, [pOut + 24]

%if (%%LAST_CALL == 1)
        add     r12, %%OFFSET
        add     r13, %%OFFSET
        add     r14, %%OFFSET
        add     r15, %%OFFSET
        ;; Read in r10 the word containing the number of final bytes to write for each lane
        movzx  r10d, word [rsp + 8*32]
        simd_store_avx2 r12, ymm8,  r10, %%TMP1, %%TMP2
        movzx  r10d, word [rsp + 8*32 + 2]
        simd_store_avx2 r13, ymm9,  r10, %%TMP1, %%TMP2
        movzx  r10d, word [rsp + 8*32 + 4]
        simd_store_avx2 r14, ymm10, r10, %%TMP1, %%TMP2
        movzx  r10d, word [rsp + 8*32 + 6]
        simd_store_avx2 r15, ymm11, r10, %%TMP1, %%TMP2
%else
        vmovdqu [r12 + %%OFFSET], ymm8
        vmovdqu [r13 + %%OFFSET], ymm9
        vmovdqu [r14 + %%OFFSET], ymm10
        vmovdqu [r15 + %%OFFSET], ymm11
%endif

        mov     r12, [pOut + 32]
        mov     r13, [pOut + 40]
        mov     r14, [pOut + 48]
        mov     r15, [pOut + 56]

%if (%%LAST_CALL == 1)
        add     r12, %%OFFSET
        add     r13, %%OFFSET
        add     r14, %%OFFSET
        add     r15, %%OFFSET
        movzx  r10d, word [rsp + 8*32 + 8]
        simd_store_avx2 r12, ymm12, r10, %%TMP1, %%TMP2
        movzx  r10d, word [rsp + 8*32 + 10]
        simd_store_avx2 r13, ymm13, r10, %%TMP1, %%TMP2
        movzx  r10d, word [rsp + 8*32 + 12]
        simd_store_avx2 r14, ymm14, r10, %%TMP1, %%TMP2
        movzx  r10d, word [rsp + 8*32 + 14]
        simd_store_avx2 r15, ymm15, r10, %%TMP1, %%TMP2

        ; Restore registers
        mov     %%TMP1, [rsp + 32*8 + 16 + 8]
        mov     %%TMP2, [rsp + 32*8 + 16 + 16]
%else
        vmovdqu [r12 + %%OFFSET], ymm12
        vmovdqu [r13 + %%OFFSET], ymm13
        vmovdqu [r14 + %%OFFSET], ymm14
        vmovdqu [r15 + %%OFFSET], ymm15
%endif

%endmacro

;;
;; void asm_ZucCipher_8_avx2(state8_t *pSta, u64 *pIn[8],
;;                           u64 *pOut[8], u16 lengths[8], u64 min_length);
;;
;; WIN64
;;  RCX    - pSta
;;  RDX    - pIn
;;  R8     - pOut
;;  R9     - lengths
;;  rsp + 40 - min_length
;;
;; LIN64
;;  RDI - pSta
;;  RSI - pIn
;;  RDX - pOut
;;  RCX - lengths
;;  R8  - min_length
;;
MKGLOBAL(asm_ZucCipher_8_avx2,function,internal)
asm_ZucCipher_8_avx2:

%ifdef LINUX
        %define         pState  rdi
        %define         pIn     rsi
        %define         pOut    rdx
        %define         lengths rcx
        %define         arg5    r8
%else
        %define         pState  rcx
        %define         pIn     rdx
        %define         pOut    r8
        %define         lengths r9
        %define         arg5    [rsp + 40]
%endif

%define min_length r10
%define buf_idx r11

        mov     min_length, arg5

        or      min_length, min_length
        jz      exit_cipher32

        FUNC_SAVE

        ;; Convert all lengths from UINT16_MAX (indicating that lane is not valid) to min length
        vmovd   xmm0, DWORD(min_length)
        vpshufb xmm0, xmm0, [rel broadcast_word]
        vmovdqa xmm1, [lengths]
        vpcmpeqw xmm2, xmm2 ;; Get all ff's in XMM register
        vpcmpeqw xmm3, xmm1, xmm2 ;; Mask with FFFF in NULL jobs

        vpand   xmm4, xmm3, xmm0 ;; Length of valid job in all NULL jobs
        vpxor   xmm2, xmm3 ;; Mask with 0000 in NULL jobs
        vpand   xmm1, xmm2 ;; Zero out lengths of NULL jobs
        vpor    xmm1, xmm4 ;; XMM1 contains updated lengths

        ; Round up to nearest multiple of 4 bytes
        vpaddw  xmm0, [rel all_threes]
        vpand   xmm0, [rel all_fffcs]

        ; Calculate remaining bytes to encrypt after function call
        vpsubw  xmm2, xmm1, xmm0
        vpxor   xmm3, xmm3
        vpcmpgtw xmm4, xmm2, xmm3 ;; Mask with FFFF in lengths > 0
        ; Set to zero the lengths of the lanes which are going to be completed (lengths < 0)
        vpand   xmm2, xmm4
        vmovdqa [lengths], xmm2 ; Update in memory the final updated lengths

        ; Calculate number of bytes to encrypt after round of 32 bytes (up to 31 bytes),
        ; for each lane, and store it in stack to be used in the last round
        vpsubw  xmm1, xmm2 ; Bytes to encrypt in all lanes
        vpand   xmm1, [rel all_1fs] ; Number of final bytes (up to 31 bytes) for each lane
        vpcmpeqw xmm2, xmm1, xmm3 ;; Mask with FFFF in lengths == 0
        vpand   xmm2, [rel all_20s] ;; 32 in positions where lengths was 0
        vpor    xmm1, xmm2          ;; Number of final bytes (up to 32 bytes) for each lane
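
        ;; Illustrative example: with min_length = 42 and a lane of length 60,
        ;; ceil4(42) = 44 bytes get encrypted in this call, the lane's stored
        ;; remaining length becomes 60 - 44 = 16, and its count of final bytes
        ;; for the last partial 32-byte round is 44 & 0x1f = 12.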

        ; Allocate stack frame to store keystreams (32*8 bytes), number of final bytes (16 bytes),
        ; space for rsp (8 bytes) and 2 GP registers (16 bytes) that will be clobbered later
        mov     rax, rsp
        sub     rsp, (32*8 + 16 + 16 + 8)
        and     rsp, -32    ; align stack to 32 bytes
        xor     buf_idx, buf_idx
        vmovdqu [rsp + 32*8], xmm1
        mov     [rsp + 32*8 + 16], rax
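
        ;; Stack frame layout used below (derived from the offsets in this
        ;; function and in CIPHERNx4B_8):
        ;;   [rsp + 0   .. rsp + 255] keystream scratch (32 bytes per round)
        ;;   [rsp + 256 .. rsp + 271] per-lane final byte counts (8 words)
        ;;   [rsp + 272]              original rsp (restored before returning)
        ;;   [rsp + 280], [rsp + 288] saved GP registers (%%TMP1/%%TMP2)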

        ; Load state pointer in RAX
        mov     rax, pState

loop_cipher64:
        cmp     min_length, 64
        jl      exit_loop_cipher64

        CIPHERNx4B_8 8, 0, buf_idx, 0

        add     buf_idx, 32
        sub     min_length, 32

        CIPHERNx4B_8 8, 8, buf_idx, 0

        add     buf_idx, 32
        sub     min_length, 32

        jmp     loop_cipher64
exit_loop_cipher64:

        ; Check if at least 32 bytes are left to encrypt
        cmp     min_length, 32
        jl      less_than_32

        CIPHERNx4B_8 8, 0, buf_idx, 0
        REORDER_LFSR rax, 8

        add     buf_idx, 32
        sub     min_length, 32

        ; Check if there are more bytes left to encrypt
less_than_32:

        mov     r15, min_length
        add     r15, 3
        shr     r15, 2 ;; number of rounds left (round up length to nearest multiple of 4B)
        jz      exit_final_rounds

_final_rounds_is_1_8:
        cmp     r15, 4
        je      _num_final_rounds_is_4
        jl      _final_rounds_is_1_3

        ; Final rounds 5-8
        cmp     r15, 8
        je      _num_final_rounds_is_8
        cmp     r15, 7
        je      _num_final_rounds_is_7
        cmp     r15, 6
        je      _num_final_rounds_is_6
        cmp     r15, 5
        je      _num_final_rounds_is_5

_final_rounds_is_1_3:
        cmp     r15, 3
        je      _num_final_rounds_is_3
        cmp     r15, 2
        je      _num_final_rounds_is_2

        jmp     _num_final_rounds_is_1

        ; Perform encryption of last bytes (<= 31 bytes) and reorder LFSR registers
%assign I 1
%rep 8
APPEND(_num_final_rounds_is_,I):
        CIPHERNx4B_8 I, 0, buf_idx, 1
        REORDER_LFSR rax, I
        add     buf_idx, (I*4)
        jmp     exit_final_rounds
%assign I (I + 1)
%endrep

exit_final_rounds:
        ;; update in/out pointers

        ; Broadcast buf_idx in all qwords of ymm0
        vmovq           xmm0, buf_idx
        vpshufd         xmm0, xmm0, 0x44
        vperm2f128      ymm0, ymm0, 0x0
        vpaddq          ymm1, ymm0, [pIn]
        vpaddq          ymm2, ymm0, [pIn + 32]
        vmovdqa         [pIn], ymm1
        vmovdqa         [pIn + 32], ymm2
        vpaddq          ymm1, ymm0, [pOut]
        vpaddq          ymm2, ymm0, [pOut + 32]
        vmovdqa         [pOut], ymm1
        vmovdqa         [pOut + 32], ymm2

        ;; Clear stack frame containing keystream information
%ifdef SAFE_DATA
        vpxor   ymm0, ymm0
%assign i 0
%rep 8
        vmovdqa [rsp + i*32], ymm0
%assign i (i+1)
%endrep
%endif
        ; Restore rsp
        mov     rsp, [rsp + 32*8 + 16]

        FUNC_RESTORE

exit_cipher32:

        ret

;----------------------------------------------------------------------------------------
;----------------------------------------------------------------------------------------

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif