;;
;; Copyright (c) 2020, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

%include "include/os.asm"
%include "imb_job.asm"
%include "include/clear_regs.asm"
%include "include/const.inc"
%include "include/reg_sizes.asm"
%include "include/transpose_avx512.asm"
%include "include/aes_common.asm"

section .data
default rel
align 16
constants:
dd      0x61707865, 0x3320646e, 0x79622d32, 0x6b206574

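;; Block counter increments 1-4 and 5-8, in dword 0 of each 128-bit lane,
;; used to construct the counters of the next 4 or 8 states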
align 64
add_1_4:
dd      0x00000001, 0x00000000, 0x00000000, 0x00000000
dd      0x00000002, 0x00000000, 0x00000000, 0x00000000
dd      0x00000003, 0x00000000, 0x00000000, 0x00000000
dd      0x00000004, 0x00000000, 0x00000000, 0x00000000

align 64
add_5_8:
dd      0x00000005, 0x00000000, 0x00000000, 0x00000000
dd      0x00000006, 0x00000000, 0x00000000, 0x00000000
dd      0x00000007, 0x00000000, 0x00000000, 0x00000000
dd      0x00000008, 0x00000000, 0x00000000, 0x00000000

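;; Increment of 16 for every dword, added to the 16 per-state block
;; counters after each 1KB iteration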
align 64
add_16:
dd      0x00000010, 0x00000010, 0x00000010, 0x00000010
dd      0x00000010, 0x00000010, 0x00000010, 0x00000010
dd      0x00000010, 0x00000010, 0x00000010, 0x00000010
dd      0x00000010, 0x00000010, 0x00000010, 0x00000010

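;; Initial block counters 1-16 for the 16 parallel states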
align 64
set_1_16:
dd      0x00000001, 0x00000002, 0x00000003, 0x00000004
dd      0x00000005, 0x00000006, 0x00000007, 0x00000008
dd      0x00000009, 0x0000000a, 0x0000000b, 0x0000000c
dd      0x0000000d, 0x0000000e, 0x0000000f, 0x00000010

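;; Entry i (i = 1-63) is a 64-bit mask with the i lowest bits set, used
;; for a partial final block; entry 0 (all ones) covers a full final block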
align 64
len_to_mask:
dq      0xffffffffffffffff, 0x0000000000000001
dq      0x0000000000000003, 0x0000000000000007
dq      0x000000000000000f, 0x000000000000001f
dq      0x000000000000003f, 0x000000000000007f
dq      0x00000000000000ff, 0x00000000000001ff
dq      0x00000000000003ff, 0x00000000000007ff
dq      0x0000000000000fff, 0x0000000000001fff
dq      0x0000000000003fff, 0x0000000000007fff
dq      0x000000000000ffff, 0x000000000001ffff
dq      0x000000000003ffff, 0x000000000007ffff
dq      0x00000000000fffff, 0x00000000001fffff
dq      0x00000000003fffff, 0x00000000007fffff
dq      0x0000000000ffffff, 0x0000000001ffffff
dq      0x0000000003ffffff, 0x0000000007ffffff
dq      0x000000000fffffff, 0x000000001fffffff
dq      0x000000003fffffff, 0x000000007fffffff
dq      0x00000000ffffffff, 0x00000001ffffffff
dq      0x00000003ffffffff, 0x00000007ffffffff
dq      0x0000000fffffffff, 0x0000001fffffffff
dq      0x0000003fffffffff, 0x0000007fffffffff
dq      0x000000ffffffffff, 0x000001ffffffffff
dq      0x000003ffffffffff, 0x000007ffffffffff
dq      0x00000fffffffffff, 0x00001fffffffffff
dq      0x00003fffffffffff, 0x00007fffffffffff
dq      0x0000ffffffffffff, 0x0001ffffffffffff
dq      0x0003ffffffffffff, 0x0007ffffffffffff
dq      0x000fffffffffffff, 0x001fffffffffffff
dq      0x003fffffffffffff, 0x007fffffffffffff
dq      0x00ffffffffffffff, 0x01ffffffffffffff
dq      0x03ffffffffffffff, 0x07ffffffffffffff
dq      0x0fffffffffffffff, 0x1fffffffffffffff
dq      0x3fffffffffffffff, 0x7fffffffffffffff

%define APPEND(a,b) a %+ b

%ifdef LINUX
%define arg1    rdi
%else
%define arg1    rcx
%endif

%define job     arg1

section .text

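;;
;; Applies a 3-operand instruction (%1) to 4 pairs of ZMM registers,
;; destination doubling as first source (%2-%5 op= %6-%9)
;;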
%macro ZMM_OP_X4 9
        ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 16, %1,%2,%3,%4,%5,%2,%3,%4,%5,%6,%7,%8,%9
%endmacro

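;;
;; Rotates each dword of 4 ZMM registers left by the same amount
;;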
%macro ZMM_ROLS_X4 5
%define %%ZMM_OP1_1      %1
%define %%ZMM_OP1_2      %2
%define %%ZMM_OP1_3      %3
%define %%ZMM_OP1_4      %4
%define %%BITS_TO_ROTATE %5

        vprold  %%ZMM_OP1_1, %%BITS_TO_ROTATE
        vprold  %%ZMM_OP1_2, %%BITS_TO_ROTATE
        vprold  %%ZMM_OP1_3, %%BITS_TO_ROTATE
        vprold  %%ZMM_OP1_4, %%BITS_TO_ROTATE

%endmacro

;
; Macro adding original state values to processed state values
; and transposing 16x16 u32 from first 16 ZMM registers,
; creating keystreams.
; Note that the registers are transposed in a different
; order, so the first register (IN00), containing row 0,
; will not hold column 0 of the matrix but column 1,
; and similarly for the other registers.
; This is done to minimize the number of registers clobbered.
; Once transposition is done, the keystream is XOR'd with the
; plaintext and the output buffer is written.
;
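; The transpose is done in three stages: dword unpacks (vpunpckl/hdq),
; qword unpacks (vpunpckl/hqdq) and 128-bit lane shuffles (vshufi64x2),
; interleaved with the state additions, XORs and stores.
;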
%macro GENERATE_1K_KS_AND_ENCRYPT 35
%define %%IN00_KS01  %1 ; [in/clobbered] Input row 0 of state, bytes 64-127 of keystream
%define %%IN01_KS02  %2 ; [in/clobbered] Input row 1 of state, bytes 128-191 of keystream
%define %%IN02_KS15  %3 ; [in/clobbered] Input row 2 of state, bytes 960-1023 of keystream
%define %%IN03_KS04  %4 ; [in/clobbered] Input row 3 of state, bytes 256-319 of keystream
%define %%IN04_KS08  %5 ; [in/clobbered] Input row 4 of state, bytes 512-575 of keystream
%define %%IN05       %6 ; [in/clobbered] Input row 5 of state (used as temporary)
%define %%IN06_KS13  %7 ; [in/clobbered] Input row 6 of state, bytes 832-895 of keystream
%define %%IN07_KS07  %8 ; [in/clobbered] Input row 7 of state, bytes 448-511 of keystream
%define %%IN08_KS05  %9 ; [in/clobbered] Input row 8 of state, bytes 320-383 of keystream
%define %%IN09_KS00 %10 ; [in/clobbered] Input row 9 of state, bytes 0-63 of keystream
%define %%IN10_KS06 %11 ; [in/clobbered] Input row 10 of state, bytes 384-447 of keystream
%define %%IN11_KS11 %12 ; [in/clobbered] Input row 11 of state, bytes 704-767 of keystream
%define %%IN12_KS12 %13 ; [in/clobbered] Input row 12 of state, bytes 768-831 of keystream
%define %%IN13_KS03 %14 ; [in/clobbered] Input row 13 of state, bytes 192-255 of keystream
%define %%IN14_KS14 %15 ; [in/clobbered] Input row 14 of state, bytes 896-959 of keystream
%define %%IN15      %16 ; [in/clobbered] Input row 15 of state (used as temporary)
%define %%IN_ORIG00_KS09  %17 ; [in/clobbered] Original input row 0, bytes 576-639 of keystream
%define %%IN_ORIG01_KS10  %18 ; [in/clobbered] Original input row 1, bytes 640-703 of keystream
%define %%IN_ORIG02  %19 ; [in] Original input row 2
%define %%IN_ORIG03  %20 ; [in] Original input row 3
%define %%IN_ORIG04  %21 ; [in] Original input row 4
%define %%IN_ORIG05  %22 ; [in] Original input row 5
%define %%IN_ORIG06  %23 ; [in] Original input row 6
%define %%IN_ORIG07  %24 ; [in] Original input row 7
%define %%IN_ORIG08  %25 ; [in] Original input row 8
%define %%IN_ORIG09  %26 ; [in] Original input row 9
%define %%IN_ORIG10  %27 ; [in] Original input row 10
%define %%IN_ORIG11  %28 ; [in] Original input row 11
%define %%IN_ORIG12  %29 ; [in] Original input row 12
%define %%IN_ORIG13  %30 ; [in] Original input row 13
%define %%IN_ORIG14  %31 ; [in] Original input row 14
%define %%IN_ORIG15  %32 ; [in] Original input row 15
%define %%SRC        %33 ; [in] Source pointer
%define %%DST        %34 ; [in] Destination pointer
%define %%OFF        %35 ; [in] Offset into src/dst pointers

        vpaddd %%IN00_KS01, %%IN_ORIG00_KS09
        vpaddd %%IN01_KS02, %%IN_ORIG01_KS10
        vpaddd %%IN02_KS15, %%IN_ORIG02
        vpaddd %%IN03_KS04, %%IN_ORIG03

        ;; Deal with first lanes 0-7
        ; T0, T1 free
        vpunpckldq      %%IN_ORIG00_KS09, %%IN00_KS01, %%IN01_KS02
        vpunpckhdq      %%IN00_KS01, %%IN00_KS01, %%IN01_KS02
        vpunpckldq      %%IN_ORIG01_KS10, %%IN02_KS15, %%IN03_KS04
        vpunpckhdq      %%IN02_KS15, %%IN02_KS15, %%IN03_KS04

        ; IN01_KS02, IN03_KS04 free
        vpunpcklqdq     %%IN03_KS04, %%IN_ORIG00_KS09, %%IN_ORIG01_KS10
        vpunpckhqdq     %%IN01_KS02, %%IN_ORIG00_KS09, %%IN_ORIG01_KS10
        vpunpcklqdq     %%IN_ORIG00_KS09, %%IN00_KS01, %%IN02_KS15
        vpunpckhqdq     %%IN00_KS01, %%IN00_KS01, %%IN02_KS15

        vpaddd %%IN04_KS08, %%IN_ORIG04
        vpaddd %%IN05, %%IN_ORIG05
        vpaddd %%IN06_KS13, %%IN_ORIG06
        vpaddd %%IN07_KS07, %%IN_ORIG07

        ; IN02_KS15, T1 free
        vpunpckldq      %%IN_ORIG01_KS10, %%IN04_KS08, %%IN05
        vpunpckhdq      %%IN04_KS08, %%IN04_KS08, %%IN05
        vpunpckldq      %%IN02_KS15, %%IN06_KS13, %%IN07_KS07
        vpunpckhdq      %%IN06_KS13, %%IN06_KS13, %%IN07_KS07

        ; IN07_KS07, IN05 free
        vpunpcklqdq     %%IN07_KS07, %%IN_ORIG01_KS10, %%IN02_KS15
        vpunpckhqdq     %%IN05, %%IN_ORIG01_KS10, %%IN02_KS15
        vpunpcklqdq     %%IN02_KS15, %%IN04_KS08, %%IN06_KS13
        vpunpckhqdq     %%IN04_KS08, %%IN04_KS08, %%IN06_KS13

        ; T1, IN06_KS13 free
        vshufi64x2      %%IN_ORIG01_KS10, %%IN03_KS04, %%IN07_KS07, 0x44
        vshufi64x2      %%IN03_KS04, %%IN03_KS04, %%IN07_KS07, 0xee
        vshufi64x2      %%IN06_KS13, %%IN01_KS02, %%IN05, 0x44
        vshufi64x2      %%IN01_KS02, %%IN01_KS02, %%IN05, 0xee
        vshufi64x2      %%IN07_KS07, %%IN_ORIG00_KS09, %%IN02_KS15, 0x44
        vshufi64x2      %%IN02_KS15, %%IN_ORIG00_KS09, %%IN02_KS15, 0xee
        vshufi64x2      %%IN05, %%IN00_KS01, %%IN04_KS08, 0x44
        vshufi64x2      %%IN00_KS01, %%IN00_KS01, %%IN04_KS08, 0xee

        ;; Deal with lanes 8-15
        vpaddd %%IN08_KS05, %%IN_ORIG08
        vpaddd %%IN09_KS00, %%IN_ORIG09
        vpaddd %%IN10_KS06, %%IN_ORIG10
        vpaddd %%IN11_KS11, %%IN_ORIG11

        vpunpckldq      %%IN_ORIG00_KS09, %%IN08_KS05, %%IN09_KS00
        vpunpckhdq      %%IN08_KS05, %%IN08_KS05, %%IN09_KS00
        vpunpckldq      %%IN04_KS08, %%IN10_KS06, %%IN11_KS11
        vpunpckhdq      %%IN10_KS06, %%IN10_KS06, %%IN11_KS11

        vpunpcklqdq     %%IN09_KS00, %%IN_ORIG00_KS09, %%IN04_KS08
        vpunpckhqdq     %%IN04_KS08, %%IN_ORIG00_KS09, %%IN04_KS08
        vpunpcklqdq     %%IN11_KS11, %%IN08_KS05, %%IN10_KS06
        vpunpckhqdq     %%IN08_KS05, %%IN08_KS05, %%IN10_KS06

        vpaddd %%IN12_KS12, %%IN_ORIG12
        vpaddd %%IN13_KS03, %%IN_ORIG13
        vpaddd %%IN14_KS14, %%IN_ORIG14
        vpaddd %%IN15, %%IN_ORIG15

        vpunpckldq      %%IN_ORIG00_KS09, %%IN12_KS12, %%IN13_KS03
        vpunpckhdq      %%IN12_KS12, %%IN12_KS12, %%IN13_KS03
        vpunpckldq      %%IN10_KS06, %%IN14_KS14, %%IN15
        vpunpckhdq      %%IN14_KS14, %%IN14_KS14, %%IN15

        vpunpcklqdq     %%IN13_KS03, %%IN_ORIG00_KS09, %%IN10_KS06
        vpunpckhqdq     %%IN10_KS06, %%IN_ORIG00_KS09, %%IN10_KS06
        vpunpcklqdq     %%IN15, %%IN12_KS12, %%IN14_KS14
        vpunpckhqdq     %%IN12_KS12, %%IN12_KS12, %%IN14_KS14

        vshufi64x2      %%IN14_KS14, %%IN09_KS00, %%IN13_KS03, 0x44
        vshufi64x2      %%IN09_KS00, %%IN09_KS00, %%IN13_KS03, 0xee
        vshufi64x2      %%IN_ORIG00_KS09, %%IN04_KS08, %%IN10_KS06, 0x44
        vshufi64x2      %%IN10_KS06, %%IN04_KS08, %%IN10_KS06, 0xee
        vshufi64x2      %%IN13_KS03, %%IN11_KS11, %%IN15, 0x44
        vshufi64x2      %%IN11_KS11, %%IN11_KS11, %%IN15, 0xee
        vshufi64x2      %%IN15, %%IN08_KS05, %%IN12_KS12, 0x44
        vshufi64x2      %%IN08_KS05, %%IN08_KS05, %%IN12_KS12, 0xee

        vshufi64x2      %%IN12_KS12, %%IN03_KS04, %%IN09_KS00, 0xdd
        vpxorq          %%IN12_KS12, [%%SRC + %%OFF + 64*12]
        vmovdqu64       [%%DST + %%OFF + 64*12], %%IN12_KS12

        vshufi64x2      %%IN04_KS08, %%IN03_KS04, %%IN09_KS00, 0x88
        vpxorq          %%IN04_KS08, [%%SRC + %%OFF + 64*8]
        vmovdqu64       [%%DST + %%OFF + 64*8], %%IN04_KS08

        vshufi64x2      %%IN09_KS00, %%IN_ORIG01_KS10, %%IN14_KS14, 0x88
        vpxorq          %%IN09_KS00, [%%SRC + %%OFF]
        vmovdqu64       [%%DST + %%OFF], %%IN09_KS00

        vshufi64x2      %%IN03_KS04, %%IN_ORIG01_KS10, %%IN14_KS14, 0xdd
        vpxorq          %%IN03_KS04, [%%SRC + %%OFF + 64*4]
        vmovdqu64       [%%DST + %%OFF + 64*4], %%IN03_KS04

        vshufi64x2      %%IN14_KS14, %%IN02_KS15, %%IN11_KS11, 0xdd
        vpxorq          %%IN14_KS14, [%%SRC + %%OFF + 64*14]
        vmovdqu64       [%%DST + %%OFF + 64*14], %%IN14_KS14

        vshufi64x2      %%IN_ORIG01_KS10, %%IN02_KS15, %%IN11_KS11, 0x88
        vpxorq          %%IN_ORIG01_KS10, [%%SRC + %%OFF + 64*10]
        vmovdqu64       [%%DST + %%OFF + 64*10], %%IN_ORIG01_KS10

        vshufi64x2      %%IN11_KS11, %%IN00_KS01, %%IN08_KS05, 0x88
        vpxorq          %%IN11_KS11, [%%SRC + %%OFF + 64*11]
        vmovdqu64       [%%DST + %%OFF + 64*11], %%IN11_KS11

        vshufi64x2      %%IN02_KS15, %%IN00_KS01, %%IN08_KS05, 0xdd
        vpxorq          %%IN02_KS15, [%%SRC + %%OFF + 64*15]
        vmovdqu64       [%%DST + %%OFF + 64*15], %%IN02_KS15

        vshufi64x2      %%IN00_KS01, %%IN06_KS13, %%IN_ORIG00_KS09, 0x88
        vpxorq          %%IN00_KS01, [%%SRC + %%OFF + 64*1]
        vmovdqu64       [%%DST + %%OFF + 64*1], %%IN00_KS01

        vshufi64x2      %%IN08_KS05, %%IN06_KS13, %%IN_ORIG00_KS09, 0xdd
        vpxorq          %%IN08_KS05, [%%SRC + %%OFF + 64*5]
        vmovdqu64       [%%DST + %%OFF + 64*5], %%IN08_KS05

        vshufi64x2      %%IN_ORIG00_KS09, %%IN01_KS02, %%IN10_KS06, 0x88
        vpxorq          %%IN_ORIG00_KS09, [%%SRC + %%OFF + 64*9]
        vmovdqu64       [%%DST + %%OFF + 64*9], %%IN_ORIG00_KS09

        vshufi64x2      %%IN06_KS13, %%IN01_KS02, %%IN10_KS06, 0xdd
        vpxorq          %%IN06_KS13, [%%SRC + %%OFF + 64*13]
        vmovdqu64       [%%DST + %%OFF + 64*13], %%IN06_KS13

        vshufi64x2      %%IN01_KS02, %%IN07_KS07, %%IN13_KS03, 0x88
        vpxorq          %%IN01_KS02, [%%SRC + %%OFF + 64*2]
        vmovdqu64       [%%DST + %%OFF + 64*2], %%IN01_KS02

        vshufi64x2      %%IN10_KS06, %%IN07_KS07, %%IN13_KS03, 0xdd
        vpxorq          %%IN10_KS06, [%%SRC + %%OFF + 64*6]
        vmovdqu64       [%%DST + %%OFF + 64*6], %%IN10_KS06

        vshufi64x2      %%IN13_KS03, %%IN05, %%IN15, 0x88
        vpxorq          %%IN13_KS03, [%%SRC + %%OFF + 64*3]
        vmovdqu64       [%%DST + %%OFF + 64*3], %%IN13_KS03

        vshufi64x2      %%IN07_KS07, %%IN05, %%IN15, 0xdd
        vpxorq          %%IN07_KS07, [%%SRC + %%OFF + 64*7]
        vmovdqu64       [%%DST + %%OFF + 64*7], %%IN07_KS07
%endmacro

;;
;; Performs a quarter round on all 4 columns,
;; resulting in a full round
;;
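;; The scalar quarter round being vectorized (as in RFC 7539):
;;   a += b; d ^= a; d <<<= 16;
;;   c += d; b ^= c; b <<<= 12;
;;   a += b; d ^= a; d <<<= 8;
;;   c += d; b ^= c; b <<<= 7;
;;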
%macro QUARTER_ROUND_X4 4
%define %%A %1 ;; [in/out] ZMM register containing value A of all 4 columns
%define %%B %2 ;; [in/out] ZMM register containing value B of all 4 columns
%define %%C %3 ;; [in/out] ZMM register containing value C of all 4 columns
%define %%D %4 ;; [in/out] ZMM register containing value D of all 4 columns

        vpaddd          %%A, %%B
        vpxorq          %%D, %%A
        vprold          %%D, 16
        vpaddd          %%C, %%D
        vpxorq          %%B, %%C
        vprold          %%B, 12
        vpaddd          %%A, %%B
        vpxorq          %%D, %%A
        vprold          %%D, 8
        vpaddd          %%C, %%D
        vpxorq          %%B, %%C
        vprold          %%B, 7

%endmacro

;;
;; Rotates the registers to prepare the data
;; from column round to diagonal round
;;
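;; Each 128-bit lane holds one 4x4 state (rows A, B, C, D); rotating
;; rows B, C and D by 1, 2 and 3 dwords respectively lines the
;; diagonals up as columns, so the column quarter round can be reused
;;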
%macro COLUMN_TO_DIAG 3
%define %%B %1 ;; [in/out] ZMM register containing value B of all 4 columns
%define %%C %2 ;; [in/out] ZMM register containing value C of all 4 columns
%define %%D %3 ;; [in/out] ZMM register containing value D of all 4 columns

        vpshufd         %%B, %%B, 0x39 ; 0b00111001 ;; 0,3,2,1
        vpshufd         %%C, %%C, 0x4E ; 0b01001110 ;; 1,0,3,2
        vpshufd         %%D, %%D, 0x93 ; 0b10010011 ;; 2,1,0,3

%endmacro

;;
;; Rotates the registers to prepare the data
;; from diagonal round to column round
;;
%macro DIAG_TO_COLUMN 3
%define %%B %1 ;; [in/out] ZMM register containing value B of all 4 columns
%define %%C %2 ;; [in/out] ZMM register containing value C of all 4 columns
%define %%D %3 ;; [in/out] ZMM register containing value D of all 4 columns

        vpshufd         %%B, %%B, 0x93 ; 0b10010011 ;; 2,1,0,3
        vpshufd         %%C, %%C, 0x4E ; 0b01001110 ;; 1,0,3,2
        vpshufd         %%D, %%D, 0x39 ; 0b00111001 ;; 0,3,2,1

%endmacro
;;
;; Generates up to 64*8 bytes of keystream
;;
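;; Each ZMM register holds row A, B, C or D of 4 states (one state per
;; 128-bit lane); the _L/_H register sets cover states 1-4 and 5-8
;;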
%macro GENERATE_512_KS 21
%define %%A_L_KS0        %1  ;; [out] ZMM A / Bytes 0-63    of KS
%define %%B_L_KS1        %2  ;; [out] ZMM B / Bytes 64-127  of KS
%define %%C_L_KS2        %3  ;; [out] ZMM C / Bytes 128-191 of KS
%define %%D_L_KS3        %4  ;; [out] ZMM D / Bytes 192-255 of KS
%define %%A_H_KS4        %5  ;; [out] ZMM A / Bytes 256-319 of KS (or "none" if NUM_BLOCKS == 4)
%define %%B_H_KS5        %6  ;; [out] ZMM B / Bytes 320-383 of KS (or "none" if NUM_BLOCKS == 4)
%define %%C_H_KS6        %7  ;; [out] ZMM C / Bytes 384-447 of KS (or "none" if NUM_BLOCKS == 4)
%define %%D_H_KS7        %8  ;; [out] ZMM D / Bytes 448-511 of KS (or "none" if NUM_BLOCKS == 4)
%define %%STATE_IN_A_L   %9  ;; [in] ZMM containing state "A" part
%define %%STATE_IN_B_L   %10 ;; [in] ZMM containing state "B" part
%define %%STATE_IN_C_L   %11 ;; [in] ZMM containing state "C" part
%define %%STATE_IN_D_L   %12 ;; [in] ZMM containing state "D" part
%define %%STATE_IN_A_H   %13 ;; [in] ZMM containing state "A" part (or "none" if NUM_BLOCKS == 4)
%define %%STATE_IN_B_H   %14 ;; [in] ZMM containing state "B" part (or "none" if NUM_BLOCKS == 4)
%define %%STATE_IN_C_H   %15 ;; [in] ZMM containing state "C" part (or "none" if NUM_BLOCKS == 4)
%define %%STATE_IN_D_H   %16 ;; [in] ZMM containing state "D" part (or "none" if NUM_BLOCKS == 4)
%define %%ZTMP0          %17 ;; [clobbered] Temp ZMM reg
%define %%ZTMP1          %18 ;; [clobbered] Temp ZMM reg
%define %%ZTMP2          %19 ;; [clobbered] Temp ZMM reg
%define %%ZTMP3          %20 ;; [clobbered] Temp ZMM reg
%define %%NUM_BLOCKS     %21 ;; [in] Num blocks to encrypt (4 or 8)

        vmovdqa64       %%A_L_KS0, %%STATE_IN_A_L
        vmovdqa64       %%B_L_KS1, %%STATE_IN_B_L
        vmovdqa64       %%C_L_KS2, %%STATE_IN_C_L
        vmovdqa64       %%D_L_KS3, %%STATE_IN_D_L
%if %%NUM_BLOCKS == 8
        vmovdqa64       %%A_H_KS4, %%STATE_IN_A_H
        vmovdqa64       %%B_H_KS5, %%STATE_IN_B_H
        vmovdqa64       %%C_H_KS6, %%STATE_IN_C_H
        vmovdqa64       %%D_H_KS7, %%STATE_IN_D_H
%endif
%rep 10
%if %%NUM_BLOCKS == 4
        QUARTER_ROUND_X4 %%A_L_KS0, %%B_L_KS1, %%C_L_KS2, %%D_L_KS3
        COLUMN_TO_DIAG %%B_L_KS1, %%C_L_KS2, %%D_L_KS3
        QUARTER_ROUND_X4 %%A_L_KS0, %%B_L_KS1, %%C_L_KS2, %%D_L_KS3
        DIAG_TO_COLUMN %%B_L_KS1, %%C_L_KS2, %%D_L_KS3
%else
        QUARTER_ROUND_X4 %%A_L_KS0, %%B_L_KS1, %%C_L_KS2, %%D_L_KS3
        QUARTER_ROUND_X4 %%A_H_KS4, %%B_H_KS5, %%C_H_KS6, %%D_H_KS7
        COLUMN_TO_DIAG %%B_L_KS1, %%C_L_KS2, %%D_L_KS3
        COLUMN_TO_DIAG %%B_H_KS5, %%C_H_KS6, %%D_H_KS7
        QUARTER_ROUND_X4 %%A_L_KS0, %%B_L_KS1, %%C_L_KS2, %%D_L_KS3
        QUARTER_ROUND_X4 %%A_H_KS4, %%B_H_KS5, %%C_H_KS6, %%D_H_KS7
        DIAG_TO_COLUMN %%B_L_KS1, %%C_L_KS2, %%D_L_KS3
        DIAG_TO_COLUMN %%B_H_KS5, %%C_H_KS6, %%D_H_KS7
%endif ;; %%NUM_BLOCKS == 4
%endrep

        vpaddd %%A_L_KS0, %%STATE_IN_A_L
        vpaddd %%B_L_KS1, %%STATE_IN_B_L
        vpaddd %%C_L_KS2, %%STATE_IN_C_L
        vpaddd %%D_L_KS3, %%STATE_IN_D_L

        TRANSPOSE4_U128_INPLACE %%A_L_KS0, %%B_L_KS1, %%C_L_KS2, %%D_L_KS3, \
                                %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
%if %%NUM_BLOCKS == 8
        vpaddd %%A_H_KS4, %%STATE_IN_A_H
        vpaddd %%B_H_KS5, %%STATE_IN_B_H
        vpaddd %%C_H_KS6, %%STATE_IN_C_H
        vpaddd %%D_H_KS7, %%STATE_IN_D_H

        TRANSPOSE4_U128_INPLACE %%A_H_KS4, %%B_H_KS5, %%C_H_KS6, %%D_H_KS7, \
                                %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
%endif
%endmacro

;;
;; Performs a full chacha20 round on 16 states,
;; consisting of 4 quarter rounds, which are done in parallel
;;
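;; Each ZMM register holds the same dword of all 16 states (one state
;; per 32-bit lane), so column and diagonal rounds need no data
;; shuffling; only the register operands passed in change
;;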
%macro CHACHA20_ROUND 16
%define %%ZMM_DWORD_A1  %1  ;; [in/out] ZMM register containing dword A for first quarter round
%define %%ZMM_DWORD_A2  %2  ;; [in/out] ZMM register containing dword A for second quarter round
%define %%ZMM_DWORD_A3  %3  ;; [in/out] ZMM register containing dword A for third quarter round
%define %%ZMM_DWORD_A4  %4  ;; [in/out] ZMM register containing dword A for fourth quarter round
%define %%ZMM_DWORD_B1  %5  ;; [in/out] ZMM register containing dword B for first quarter round
%define %%ZMM_DWORD_B2  %6  ;; [in/out] ZMM register containing dword B for second quarter round
%define %%ZMM_DWORD_B3  %7  ;; [in/out] ZMM register containing dword B for third quarter round
%define %%ZMM_DWORD_B4  %8  ;; [in/out] ZMM register containing dword B for fourth quarter round
%define %%ZMM_DWORD_C1  %9  ;; [in/out] ZMM register containing dword C for first quarter round
%define %%ZMM_DWORD_C2 %10  ;; [in/out] ZMM register containing dword C for second quarter round
%define %%ZMM_DWORD_C3 %11  ;; [in/out] ZMM register containing dword C for third quarter round
%define %%ZMM_DWORD_C4 %12  ;; [in/out] ZMM register containing dword C for fourth quarter round
%define %%ZMM_DWORD_D1 %13  ;; [in/out] ZMM register containing dword D for first quarter round
%define %%ZMM_DWORD_D2 %14  ;; [in/out] ZMM register containing dword D for second quarter round
%define %%ZMM_DWORD_D3 %15  ;; [in/out] ZMM register containing dword D for third quarter round
%define %%ZMM_DWORD_D4 %16  ;; [in/out] ZMM register containing dword D for fourth quarter round

        ; A += B
        ZMM_OP_X4 vpaddd, %%ZMM_DWORD_A1, %%ZMM_DWORD_A2, %%ZMM_DWORD_A3, %%ZMM_DWORD_A4, \
                          %%ZMM_DWORD_B1, %%ZMM_DWORD_B2, %%ZMM_DWORD_B3, %%ZMM_DWORD_B4
        ; D ^= A
        ZMM_OP_X4 vpxorq, %%ZMM_DWORD_D1, %%ZMM_DWORD_D2, %%ZMM_DWORD_D3, %%ZMM_DWORD_D4, \
                          %%ZMM_DWORD_A1, %%ZMM_DWORD_A2, %%ZMM_DWORD_A3, %%ZMM_DWORD_A4

        ; D <<< 16
        ZMM_ROLS_X4 %%ZMM_DWORD_D1, %%ZMM_DWORD_D2, %%ZMM_DWORD_D3, %%ZMM_DWORD_D4, 16

        ; C += D
        ZMM_OP_X4 vpaddd, %%ZMM_DWORD_C1, %%ZMM_DWORD_C2, %%ZMM_DWORD_C3, %%ZMM_DWORD_C4, \
                          %%ZMM_DWORD_D1, %%ZMM_DWORD_D2, %%ZMM_DWORD_D3, %%ZMM_DWORD_D4
        ; B ^= C
        ZMM_OP_X4 vpxorq, %%ZMM_DWORD_B1, %%ZMM_DWORD_B2, %%ZMM_DWORD_B3, %%ZMM_DWORD_B4, \
                          %%ZMM_DWORD_C1, %%ZMM_DWORD_C2, %%ZMM_DWORD_C3, %%ZMM_DWORD_C4

        ; B <<< 12
        ZMM_ROLS_X4 %%ZMM_DWORD_B1, %%ZMM_DWORD_B2, %%ZMM_DWORD_B3, %%ZMM_DWORD_B4, 12

        ; A += B
        ZMM_OP_X4 vpaddd, %%ZMM_DWORD_A1, %%ZMM_DWORD_A2, %%ZMM_DWORD_A3, %%ZMM_DWORD_A4, \
                          %%ZMM_DWORD_B1, %%ZMM_DWORD_B2, %%ZMM_DWORD_B3, %%ZMM_DWORD_B4
        ; D ^= A
        ZMM_OP_X4 vpxorq, %%ZMM_DWORD_D1, %%ZMM_DWORD_D2, %%ZMM_DWORD_D3, %%ZMM_DWORD_D4, \
                          %%ZMM_DWORD_A1, %%ZMM_DWORD_A2, %%ZMM_DWORD_A3, %%ZMM_DWORD_A4

        ; D <<< 8
        ZMM_ROLS_X4 %%ZMM_DWORD_D1, %%ZMM_DWORD_D2, %%ZMM_DWORD_D3, %%ZMM_DWORD_D4, 8

        ; C += D
        ZMM_OP_X4 vpaddd, %%ZMM_DWORD_C1, %%ZMM_DWORD_C2, %%ZMM_DWORD_C3, %%ZMM_DWORD_C4, \
                          %%ZMM_DWORD_D1, %%ZMM_DWORD_D2, %%ZMM_DWORD_D3, %%ZMM_DWORD_D4
        ; B ^= C
        ZMM_OP_X4 vpxorq, %%ZMM_DWORD_B1, %%ZMM_DWORD_B2, %%ZMM_DWORD_B3, %%ZMM_DWORD_B4, \
                          %%ZMM_DWORD_C1, %%ZMM_DWORD_C2, %%ZMM_DWORD_C3, %%ZMM_DWORD_C4

        ; B <<< 7
        ZMM_ROLS_X4 %%ZMM_DWORD_B1, %%ZMM_DWORD_B2, %%ZMM_DWORD_B3, %%ZMM_DWORD_B4, 7
%endmacro

;;
;; Generates 64*16 bytes of keystream and encrypts 1KB of input data
;;
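;; The 16 states are kept as a 16x16 dword matrix (ZMM_DWORDi holding
;; dword i of every state); 10 double rounds (20 rounds) are run before
;; the keystream is transposed out, XOR'd with the input and stored
;;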
%macro ENCRYPT_1K 35
%define %%ZMM_DWORD0       %1   ;; [clobbered] ZMM to contain dword 0 of all states
%define %%ZMM_DWORD1       %2   ;; [clobbered] ZMM to contain dword 1 of all states
%define %%ZMM_DWORD2       %3   ;; [clobbered] ZMM to contain dword 2 of all states
%define %%ZMM_DWORD3       %4   ;; [clobbered] ZMM to contain dword 3 of all states
%define %%ZMM_DWORD4       %5   ;; [clobbered] ZMM to contain dword 4 of all states
%define %%ZMM_DWORD5       %6   ;; [clobbered] ZMM to contain dword 5 of all states
%define %%ZMM_DWORD6       %7   ;; [clobbered] ZMM to contain dword 6 of all states
%define %%ZMM_DWORD7       %8   ;; [clobbered] ZMM to contain dword 7 of all states
%define %%ZMM_DWORD8       %9   ;; [clobbered] ZMM to contain dword 8 of all states
%define %%ZMM_DWORD9       %10  ;; [clobbered] ZMM to contain dword 9 of all states
%define %%ZMM_DWORD10      %11  ;; [clobbered] ZMM to contain dword 10 of all states
%define %%ZMM_DWORD11      %12  ;; [clobbered] ZMM to contain dword 11 of all states
%define %%ZMM_DWORD12      %13  ;; [clobbered] ZMM to contain dword 12 of all states
%define %%ZMM_DWORD13      %14  ;; [clobbered] ZMM to contain dword 13 of all states
%define %%ZMM_DWORD14      %15  ;; [clobbered] ZMM to contain dword 14 of all states
%define %%ZMM_DWORD15      %16  ;; [clobbered] ZMM to contain dword 15 of all states
%define %%ZMM_DWORD_ORIG0  %17  ;; [in/clobbered] ZMM containing dword 0 of all states / Temp ZMM register
%define %%ZMM_DWORD_ORIG1  %18  ;; [in/clobbered] ZMM containing dword 1 of all states / Temp ZMM register
%define %%ZMM_DWORD_ORIG2  %19  ;; [in] ZMM containing dword 2 of all states
%define %%ZMM_DWORD_ORIG3  %20  ;; [in] ZMM containing dword 3 of all states
%define %%ZMM_DWORD_ORIG4  %21  ;; [in] ZMM containing dword 4 of all states
%define %%ZMM_DWORD_ORIG5  %22  ;; [in] ZMM containing dword 5 of all states
%define %%ZMM_DWORD_ORIG6  %23  ;; [in] ZMM containing dword 6 of all states
%define %%ZMM_DWORD_ORIG7  %24  ;; [in] ZMM containing dword 7 of all states
%define %%ZMM_DWORD_ORIG8  %25  ;; [in] ZMM containing dword 8 of all states
%define %%ZMM_DWORD_ORIG9  %26  ;; [in] ZMM containing dword 9 of all states
%define %%ZMM_DWORD_ORIG10 %27  ;; [in] ZMM containing dword 10 of all states
%define %%ZMM_DWORD_ORIG11 %28  ;; [in] ZMM containing dword 11 of all states
%define %%ZMM_DWORD_ORIG12 %29  ;; [in] ZMM containing dword 12 of all states
%define %%ZMM_DWORD_ORIG13 %30  ;; [in] ZMM containing dword 13 of all states
%define %%ZMM_DWORD_ORIG14 %31  ;; [in] ZMM containing dword 14 of all states
%define %%ZMM_DWORD_ORIG15 %32  ;; [in] ZMM containing dword 15 of all states
%define %%SRC              %33  ;; [in] Source pointer
%define %%DST              %34  ;; [in] Destination pointer
%define %%OFF              %35  ;; [in] Offset into src/dst pointers

%assign i 0
%rep 16
        vmovdqa64 APPEND(%%ZMM_DWORD, i), APPEND(%%ZMM_DWORD_ORIG, i)
%assign i (i + 1)
%endrep

%rep 10

        ;;; Each double round consists of 8 quarter rounds:
        ;;; 4 column quarter rounds followed by 4 diagonal quarter rounds
        ;;; For the 4 column quarter rounds:
        ;;; A = 0, 1, 2, 3;   B = 4, 5, 6, 7;
        ;;; C = 8, 9, 10, 11; D = 12, 13, 14, 15
        CHACHA20_ROUND %%ZMM_DWORD0, %%ZMM_DWORD1, %%ZMM_DWORD2, %%ZMM_DWORD3, \
                       %%ZMM_DWORD4, %%ZMM_DWORD5, %%ZMM_DWORD6, %%ZMM_DWORD7, \
                       %%ZMM_DWORD8, %%ZMM_DWORD9, %%ZMM_DWORD10, %%ZMM_DWORD11, \
                       %%ZMM_DWORD12, %%ZMM_DWORD13, %%ZMM_DWORD14, %%ZMM_DWORD15
        ;;; For the 4 diagonal quarter rounds:
        ;;; A = 0, 1, 2, 3;   B = 5, 6, 7, 4;
        ;;; C = 10, 11, 8, 9; D = 15, 12, 13, 14
        CHACHA20_ROUND %%ZMM_DWORD0, %%ZMM_DWORD1, %%ZMM_DWORD2, %%ZMM_DWORD3, \
                       %%ZMM_DWORD5, %%ZMM_DWORD6, %%ZMM_DWORD7, %%ZMM_DWORD4, \
                       %%ZMM_DWORD10, %%ZMM_DWORD11, %%ZMM_DWORD8, %%ZMM_DWORD9, \
                       %%ZMM_DWORD15, %%ZMM_DWORD12, %%ZMM_DWORD13, %%ZMM_DWORD14
%endrep

        ;; Add original states to processed states, transpose
        ;; these states to form the 64*16 bytes of keystream,
        ;; XOR with plaintext and write ciphertext out
        GENERATE_1K_KS_AND_ENCRYPT %%ZMM_DWORD0, %%ZMM_DWORD1, %%ZMM_DWORD2, %%ZMM_DWORD3, \
                                   %%ZMM_DWORD4, %%ZMM_DWORD5, %%ZMM_DWORD6, %%ZMM_DWORD7, \
                                   %%ZMM_DWORD8, %%ZMM_DWORD9, %%ZMM_DWORD10, %%ZMM_DWORD11, \
                                   %%ZMM_DWORD12, %%ZMM_DWORD13, %%ZMM_DWORD14, %%ZMM_DWORD15, \
                                   %%ZMM_DWORD_ORIG0, %%ZMM_DWORD_ORIG1, %%ZMM_DWORD_ORIG2, \
                                   %%ZMM_DWORD_ORIG3, %%ZMM_DWORD_ORIG4, %%ZMM_DWORD_ORIG5, \
                                   %%ZMM_DWORD_ORIG6, %%ZMM_DWORD_ORIG7, %%ZMM_DWORD_ORIG8, \
                                   %%ZMM_DWORD_ORIG9, %%ZMM_DWORD_ORIG10, %%ZMM_DWORD_ORIG11, \
                                   %%ZMM_DWORD_ORIG12, %%ZMM_DWORD_ORIG13, %%ZMM_DWORD_ORIG14, \
                                   %%ZMM_DWORD_ORIG15, %%SRC, %%DST, %%OFF
%endmacro

;
; Macro adding original state values to processed state values
; and transposing 16x16 u32 from first 16 ZMM registers,
; creating keystreams.
; Note that the registers are transposed in a different
; order, so the first register (IN00), containing row 0,
; will not hold column 0 of the matrix but column 1,
; and similarly for the other registers.
; This is done to minimize the number of registers clobbered.
;
%macro ADD_TRANSPOSE_STATE_KS 32
%define %%IN00_OUT01  %1 ; [in/out] Input row 0, Output column 1
%define %%IN01_OUT02  %2 ; [in/out] Input row 1, Output column 2
%define %%IN02_OUT15  %3 ; [in/out] Input row 2, Output column 15
%define %%IN03_OUT04  %4 ; [in/out] Input row 3, Output column 4
%define %%IN04_OUT08  %5 ; [in/out] Input row 4, Output column 8
%define %%IN05_OUT09  %6 ; [in/out] Input row 5, Output column 9
%define %%IN06_OUT13  %7 ; [in/out] Input row 6, Output column 13
%define %%IN07_OUT07  %8 ; [in/out] Input row 7, Output column 7
%define %%IN08_OUT05  %9 ; [in/out] Input row 8, Output column 5
%define %%IN09_OUT00 %10 ; [in/out] Input row 9, Output column 0
%define %%IN10_OUT06 %11 ; [in/out] Input row 10, Output column 6
%define %%IN11_OUT11 %12 ; [in/out] Input row 11, Output column 11
%define %%IN12_OUT12 %13 ; [in/out] Input row 12, Output column 12
%define %%IN13_OUT03 %14 ; [in/out] Input row 13, Output column 3
%define %%IN14_OUT14 %15 ; [in/out] Input row 14, Output column 14
%define %%IN15_OUT10 %16 ; [in/out] Input row 15, Output column 10
%define %%IN_ORIG00  %17 ; [in/clobbered] Original input row 0
%define %%IN_ORIG01  %18 ; [in/clobbered] Original input row 1
%define %%IN_ORIG02  %19 ; [in] Original input row 2
%define %%IN_ORIG03  %20 ; [in] Original input row 3
%define %%IN_ORIG04  %21 ; [in] Original input row 4
%define %%IN_ORIG05  %22 ; [in] Original input row 5
%define %%IN_ORIG06  %23 ; [in] Original input row 6
%define %%IN_ORIG07  %24 ; [in] Original input row 7
%define %%IN_ORIG08  %25 ; [in] Original input row 8
%define %%IN_ORIG09  %26 ; [in] Original input row 9
%define %%IN_ORIG10  %27 ; [in] Original input row 10
%define %%IN_ORIG11  %28 ; [in] Original input row 11
%define %%IN_ORIG12  %29 ; [in] Original input row 12
%define %%IN_ORIG13  %30 ; [in] Original input row 13
%define %%IN_ORIG14  %31 ; [in] Original input row 14
%define %%IN_ORIG15  %32 ; [in] Original input row 15

        vpaddd %%IN00_OUT01, %%IN_ORIG00
        vpaddd %%IN01_OUT02, %%IN_ORIG01
        vpaddd %%IN02_OUT15, %%IN_ORIG02
        vpaddd %%IN03_OUT04, %%IN_ORIG03

        ;; Deal with first lanes 0-7
        ; T0, T1 free
        vpunpckldq      %%IN_ORIG00, %%IN00_OUT01, %%IN01_OUT02
        vpunpckhdq      %%IN00_OUT01, %%IN00_OUT01, %%IN01_OUT02
        vpunpckldq      %%IN_ORIG01, %%IN02_OUT15, %%IN03_OUT04
        vpunpckhdq      %%IN02_OUT15, %%IN02_OUT15, %%IN03_OUT04

        ; IN01_OUT02, IN03_OUT04 free
        vpunpcklqdq     %%IN03_OUT04, %%IN_ORIG00, %%IN_ORIG01
        vpunpckhqdq     %%IN01_OUT02, %%IN_ORIG00, %%IN_ORIG01
        vpunpcklqdq     %%IN_ORIG00, %%IN00_OUT01, %%IN02_OUT15
        vpunpckhqdq     %%IN00_OUT01, %%IN00_OUT01, %%IN02_OUT15

        vpaddd %%IN04_OUT08, %%IN_ORIG04
        vpaddd %%IN05_OUT09, %%IN_ORIG05
        vpaddd %%IN06_OUT13, %%IN_ORIG06
        vpaddd %%IN07_OUT07, %%IN_ORIG07

        ; IN02_OUT15, T1 free
        vpunpckldq      %%IN_ORIG01, %%IN04_OUT08, %%IN05_OUT09
        vpunpckhdq      %%IN04_OUT08, %%IN04_OUT08, %%IN05_OUT09
        vpunpckldq      %%IN02_OUT15, %%IN06_OUT13, %%IN07_OUT07
        vpunpckhdq      %%IN06_OUT13, %%IN06_OUT13, %%IN07_OUT07

        ; IN07_OUT07, IN05_OUT09 free
        vpunpcklqdq     %%IN07_OUT07, %%IN_ORIG01, %%IN02_OUT15
        vpunpckhqdq     %%IN05_OUT09, %%IN_ORIG01, %%IN02_OUT15
        vpunpcklqdq     %%IN02_OUT15, %%IN04_OUT08, %%IN06_OUT13
        vpunpckhqdq     %%IN04_OUT08, %%IN04_OUT08, %%IN06_OUT13

        ; T1, IN06_OUT13 free
        vshufi64x2      %%IN_ORIG01, %%IN03_OUT04, %%IN07_OUT07, 0x44
        vshufi64x2      %%IN03_OUT04, %%IN03_OUT04, %%IN07_OUT07, 0xee
        vshufi64x2      %%IN06_OUT13, %%IN01_OUT02, %%IN05_OUT09, 0x44
        vshufi64x2      %%IN01_OUT02, %%IN01_OUT02, %%IN05_OUT09, 0xee
        vshufi64x2      %%IN07_OUT07, %%IN_ORIG00, %%IN02_OUT15, 0x44
        vshufi64x2      %%IN02_OUT15, %%IN_ORIG00, %%IN02_OUT15, 0xee
        vshufi64x2      %%IN05_OUT09, %%IN00_OUT01, %%IN04_OUT08, 0x44
        vshufi64x2      %%IN00_OUT01, %%IN00_OUT01, %%IN04_OUT08, 0xee

        ;; Deal with lanes 8-15
        vpaddd %%IN08_OUT05, %%IN_ORIG08
        vpaddd %%IN09_OUT00, %%IN_ORIG09
        vpaddd %%IN10_OUT06, %%IN_ORIG10
        vpaddd %%IN11_OUT11, %%IN_ORIG11

        vpunpckldq      %%IN_ORIG00, %%IN08_OUT05, %%IN09_OUT00
        vpunpckhdq      %%IN08_OUT05, %%IN08_OUT05, %%IN09_OUT00
        vpunpckldq      %%IN04_OUT08, %%IN10_OUT06, %%IN11_OUT11
        vpunpckhdq      %%IN10_OUT06, %%IN10_OUT06, %%IN11_OUT11

        vpunpcklqdq     %%IN09_OUT00, %%IN_ORIG00, %%IN04_OUT08
        vpunpckhqdq     %%IN04_OUT08, %%IN_ORIG00, %%IN04_OUT08
        vpunpcklqdq     %%IN11_OUT11, %%IN08_OUT05, %%IN10_OUT06
        vpunpckhqdq     %%IN08_OUT05, %%IN08_OUT05, %%IN10_OUT06

        vpaddd %%IN12_OUT12, %%IN_ORIG12
        vpaddd %%IN13_OUT03, %%IN_ORIG13
        vpaddd %%IN14_OUT14, %%IN_ORIG14
        vpaddd %%IN15_OUT10, %%IN_ORIG15

        vpunpckldq      %%IN_ORIG00, %%IN12_OUT12, %%IN13_OUT03
        vpunpckhdq      %%IN12_OUT12, %%IN12_OUT12, %%IN13_OUT03
        vpunpckldq      %%IN10_OUT06, %%IN14_OUT14, %%IN15_OUT10
        vpunpckhdq      %%IN14_OUT14, %%IN14_OUT14, %%IN15_OUT10

        vpunpcklqdq     %%IN13_OUT03, %%IN_ORIG00, %%IN10_OUT06
        vpunpckhqdq     %%IN10_OUT06, %%IN_ORIG00, %%IN10_OUT06
        vpunpcklqdq     %%IN15_OUT10, %%IN12_OUT12, %%IN14_OUT14
        vpunpckhqdq     %%IN12_OUT12, %%IN12_OUT12, %%IN14_OUT14

        vshufi64x2      %%IN14_OUT14, %%IN09_OUT00, %%IN13_OUT03, 0x44
        vshufi64x2      %%IN09_OUT00, %%IN09_OUT00, %%IN13_OUT03, 0xee
        vshufi64x2      %%IN_ORIG00, %%IN04_OUT08, %%IN10_OUT06, 0x44
        vshufi64x2      %%IN10_OUT06, %%IN04_OUT08, %%IN10_OUT06, 0xee
        vshufi64x2      %%IN13_OUT03, %%IN11_OUT11, %%IN15_OUT10, 0x44
        vshufi64x2      %%IN11_OUT11, %%IN11_OUT11, %%IN15_OUT10, 0xee
        vshufi64x2      %%IN15_OUT10, %%IN08_OUT05, %%IN12_OUT12, 0x44
        vshufi64x2      %%IN08_OUT05, %%IN08_OUT05, %%IN12_OUT12, 0xee

        vshufi64x2      %%IN12_OUT12, %%IN03_OUT04, %%IN09_OUT00, 0xdd
        vshufi64x2      %%IN04_OUT08, %%IN03_OUT04, %%IN09_OUT00, 0x88
        vshufi64x2      %%IN03_OUT04, %%IN_ORIG01, %%IN14_OUT14, 0xdd
        vshufi64x2      %%IN09_OUT00, %%IN_ORIG01, %%IN14_OUT14, 0x88
        vshufi64x2      %%IN14_OUT14, %%IN02_OUT15, %%IN11_OUT11, 0xdd
        vshufi64x2      %%IN_ORIG01, %%IN02_OUT15, %%IN11_OUT11, 0x88
        vshufi64x2      %%IN11_OUT11, %%IN00_OUT01, %%IN08_OUT05, 0x88
        vshufi64x2      %%IN02_OUT15, %%IN00_OUT01, %%IN08_OUT05, 0xdd
        vshufi64x2      %%IN00_OUT01, %%IN06_OUT13, %%IN_ORIG00, 0x88
        vshufi64x2      %%IN08_OUT05, %%IN06_OUT13, %%IN_ORIG00, 0xdd
        vshufi64x2      %%IN_ORIG00, %%IN01_OUT02, %%IN10_OUT06, 0x88
        vshufi64x2      %%IN06_OUT13, %%IN01_OUT02, %%IN10_OUT06, 0xdd
        vshufi64x2      %%IN01_OUT02, %%IN07_OUT07, %%IN13_OUT03, 0x88
        vshufi64x2      %%IN10_OUT06, %%IN07_OUT07, %%IN13_OUT03, 0xdd
        vshufi64x2      %%IN13_OUT03, %%IN05_OUT09, %%IN15_OUT10, 0x88
        vshufi64x2      %%IN07_OUT07, %%IN05_OUT09, %%IN15_OUT10, 0xdd

        vmovdqa64       %%IN05_OUT09, %%IN_ORIG00
        vmovdqa64       %%IN15_OUT10, %%IN_ORIG01
%endmacro

;;
;; Generates 64*16 bytes of keystream
;;
%macro GENERATE_1K_KS 32
%define %%ZMM_DWORD0       %1   ;; [out] ZMM containing dword 0 of all states and bytes 64-127  of keystream
%define %%ZMM_DWORD1       %2   ;; [out] ZMM containing dword 1 of all states and bytes 128-191 of keystream
%define %%ZMM_DWORD2       %3   ;; [out] ZMM containing dword 2 of all states and bytes 960-1023 of keystream
%define %%ZMM_DWORD3       %4   ;; [out] ZMM containing dword 3 of all states and bytes 256-319 of keystream
%define %%ZMM_DWORD4       %5   ;; [out] ZMM containing dword 4 of all states and bytes 512-575 of keystream
%define %%ZMM_DWORD5       %6   ;; [out] ZMM containing dword 5 of all states and bytes 576-639 of keystream
%define %%ZMM_DWORD6       %7   ;; [out] ZMM containing dword 6 of all states and bytes 832-895 of keystream
%define %%ZMM_DWORD7       %8   ;; [out] ZMM containing dword 7 of all states and bytes 448-511 of keystream
%define %%ZMM_DWORD8       %9   ;; [out] ZMM containing dword 8 of all states and bytes 320-383 of keystream
%define %%ZMM_DWORD9       %10  ;; [out] ZMM containing dword 9 of all states and bytes 0-63 of keystream
%define %%ZMM_DWORD10      %11  ;; [out] ZMM containing dword 10 of all states and bytes 384-447 of keystream
%define %%ZMM_DWORD11      %12  ;; [out] ZMM containing dword 11 of all states and bytes 704-767 of keystream
%define %%ZMM_DWORD12      %13  ;; [out] ZMM containing dword 12 of all states and bytes 768-831 of keystream
%define %%ZMM_DWORD13      %14  ;; [out] ZMM containing dword 13 of all states and bytes 192-255 of keystream
%define %%ZMM_DWORD14      %15  ;; [out] ZMM containing dword 14 of all states and bytes 896-959 of keystream
%define %%ZMM_DWORD15      %16  ;; [out] ZMM containing dword 15 of all states and bytes 640-703 of keystream
%define %%ZMM_DWORD_ORIG0  %17  ;; [in/clobbered] ZMM containing dword 0 of all states / Temp ZMM register
%define %%ZMM_DWORD_ORIG1  %18  ;; [in/clobbered] ZMM containing dword 1 of all states / Temp ZMM register
%define %%ZMM_DWORD_ORIG2  %19  ;; [in] ZMM containing dword 2 of all states
%define %%ZMM_DWORD_ORIG3  %20  ;; [in] ZMM containing dword 3 of all states
%define %%ZMM_DWORD_ORIG4  %21  ;; [in] ZMM containing dword 4 of all states
%define %%ZMM_DWORD_ORIG5  %22  ;; [in] ZMM containing dword 5 of all states
%define %%ZMM_DWORD_ORIG6  %23  ;; [in] ZMM containing dword 6 of all states
%define %%ZMM_DWORD_ORIG7  %24  ;; [in] ZMM containing dword 7 of all states
%define %%ZMM_DWORD_ORIG8  %25  ;; [in] ZMM containing dword 8 of all states
%define %%ZMM_DWORD_ORIG9  %26  ;; [in] ZMM containing dword 9 of all states
%define %%ZMM_DWORD_ORIG10 %27  ;; [in] ZMM containing dword 10 of all states
%define %%ZMM_DWORD_ORIG11 %28  ;; [in] ZMM containing dword 11 of all states
%define %%ZMM_DWORD_ORIG12 %29  ;; [in] ZMM containing dword 12 of all states
%define %%ZMM_DWORD_ORIG13 %30  ;; [in] ZMM containing dword 13 of all states
%define %%ZMM_DWORD_ORIG14 %31  ;; [in] ZMM containing dword 14 of all states
%define %%ZMM_DWORD_ORIG15 %32  ;; [in] ZMM containing dword 15 of all states

%assign i 0
%rep 16
        vmovdqa64 APPEND(%%ZMM_DWORD, i), APPEND(%%ZMM_DWORD_ORIG, i)
%assign i (i + 1)
%endrep

%rep 10

        ;;; Each double round consists of 8 quarter rounds:
        ;;; 4 column quarter rounds followed by 4 diagonal quarter rounds
        ;;; For the 4 column quarter rounds:
        ;;; A = 0, 1, 2, 3;   B = 4, 5, 6, 7;
        ;;; C = 8, 9, 10, 11; D = 12, 13, 14, 15
        CHACHA20_ROUND %%ZMM_DWORD0, %%ZMM_DWORD1, %%ZMM_DWORD2, %%ZMM_DWORD3, \
                       %%ZMM_DWORD4, %%ZMM_DWORD5, %%ZMM_DWORD6, %%ZMM_DWORD7, \
                       %%ZMM_DWORD8, %%ZMM_DWORD9, %%ZMM_DWORD10, %%ZMM_DWORD11, \
                       %%ZMM_DWORD12, %%ZMM_DWORD13, %%ZMM_DWORD14, %%ZMM_DWORD15
        ;;; For the 4 diagonal quarter rounds:
        ;;; A = 0, 1, 2, 3;   B = 5, 6, 7, 4;
        ;;; C = 10, 11, 8, 9; D = 15, 12, 13, 14
        CHACHA20_ROUND %%ZMM_DWORD0, %%ZMM_DWORD1, %%ZMM_DWORD2, %%ZMM_DWORD3, \
                       %%ZMM_DWORD5, %%ZMM_DWORD6, %%ZMM_DWORD7, %%ZMM_DWORD4, \
                       %%ZMM_DWORD10, %%ZMM_DWORD11, %%ZMM_DWORD8, %%ZMM_DWORD9, \
                       %%ZMM_DWORD15, %%ZMM_DWORD12, %%ZMM_DWORD13, %%ZMM_DWORD14
%endrep

        ;; Add original states to processed states and transpose
        ;; these states to form the 64*16 bytes of keystream
        ADD_TRANSPOSE_STATE_KS %%ZMM_DWORD0, %%ZMM_DWORD1, %%ZMM_DWORD2, %%ZMM_DWORD3, \
                               %%ZMM_DWORD4, %%ZMM_DWORD5, %%ZMM_DWORD6, %%ZMM_DWORD7, \
                               %%ZMM_DWORD8, %%ZMM_DWORD9, %%ZMM_DWORD10, %%ZMM_DWORD11, \
                               %%ZMM_DWORD12, %%ZMM_DWORD13, %%ZMM_DWORD14, %%ZMM_DWORD15, \
                               %%ZMM_DWORD_ORIG0, %%ZMM_DWORD_ORIG1, %%ZMM_DWORD_ORIG2, \
                               %%ZMM_DWORD_ORIG3, %%ZMM_DWORD_ORIG4, %%ZMM_DWORD_ORIG5, \
                               %%ZMM_DWORD_ORIG6, %%ZMM_DWORD_ORIG7, %%ZMM_DWORD_ORIG8, \
                               %%ZMM_DWORD_ORIG9, %%ZMM_DWORD_ORIG10, %%ZMM_DWORD_ORIG11, \
                               %%ZMM_DWORD_ORIG12, %%ZMM_DWORD_ORIG13, %%ZMM_DWORD_ORIG14, \
                               %%ZMM_DWORD_ORIG15
%endmacro

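;;
;; Encrypts 1-16 64-byte blocks, XOR'ing previously generated keystream
;; with the input; the final block may be partial and is handled with a
;; mask register
;;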
%macro ENCRYPT_1_16_BLOCKS 22
%define %%KS0         %1 ; [in/clobbered] Bytes 0-63 of keystream
%define %%KS1         %2 ; [in/clobbered] Bytes 64-127 of keystream
%define %%KS2         %3 ; [in/clobbered] Bytes 128-191 of keystream
%define %%KS3         %4 ; [in/clobbered] Bytes 192-255 of keystream
%define %%KS4         %5 ; [in/clobbered] Bytes 256-319 of keystream
%define %%KS5         %6 ; [in/clobbered] Bytes 320-383 of keystream
%define %%KS6         %7 ; [in/clobbered] Bytes 384-447 of keystream
%define %%KS7         %8 ; [in/clobbered] Bytes 448-511 of keystream
%define %%KS8         %9 ; [in/clobbered] Bytes 512-575 of keystream
%define %%KS9        %10 ; [in/clobbered] Bytes 576-639 of keystream
%define %%KS10       %11 ; [in/clobbered] Bytes 640-703 of keystream
%define %%KS11       %12 ; [in/clobbered] Bytes 704-767 of keystream
%define %%KS12       %13 ; [in/clobbered] Bytes 768-831 of keystream
%define %%KS13       %14 ; [in/clobbered] Bytes 832-895 of keystream
%define %%KS14       %15 ; [in/clobbered] Bytes 896-959 of keystream
%define %%KS15       %16 ; [in/clobbered] Bytes 960-1023 of keystream
%define %%ZTMP       %17 ; [clobbered] Temporary ZMM register
%define %%SRC        %18 ; [in] Source pointer
%define %%DST        %19 ; [in] Destination pointer
%define %%OFF        %20 ; [in] Offset into src/dst pointers
%define %%KMASK      %21 ; [in] Mask register for final block
%define %%NUM_BLOCKS %22 ; [in] Number of blocks to encrypt

        ; XOR keystreams with blocks of input data
%assign %%I 0
%rep (%%NUM_BLOCKS - 1)
        vpxorq    APPEND(%%KS, %%I), [%%SRC + %%OFF + 64*%%I]
%assign %%I (%%I + 1)
%endrep
        ; Final block, which might have fewer than 64 bytes, so a mask register is used
        vmovdqu8 %%ZTMP{%%KMASK}, [%%SRC + %%OFF + 64*%%I]
        vpxorq  APPEND(%%KS, %%I), %%ZTMP

        ; Write out blocks of ciphertext
%assign %%I 0
%rep (%%NUM_BLOCKS - 1)
        vmovdqu8 [%%DST + %%OFF + 64*%%I], APPEND(%%KS, %%I)
%assign %%I (%%I + 1)
%endrep
        vmovdqu8 [%%DST + %%OFF + 64*%%I]{%%KMASK}, APPEND(%%KS, %%I)
%endmacro

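;;
;; Prepares the "A", "B", "C", "D" parts of the next 4 or 8 ChaCha20
;; states from the constants, key, IV and last block counter
;;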
%macro PREPARE_NEXT_STATES_4_TO_8 15
%define %%STATE_IN_A_L   %1  ;; [out] ZMM containing state "A" part for states 1-4
%define %%STATE_IN_B_L   %2  ;; [out] ZMM containing state "B" part for states 1-4
%define %%STATE_IN_C_L   %3  ;; [out] ZMM containing state "C" part for states 1-4
%define %%STATE_IN_D_L   %4  ;; [out] ZMM containing state "D" part for states 1-4
%define %%STATE_IN_A_H   %5  ;; [out] ZMM containing state "A" part for states 5-8 (or "none" if NUM_BLOCKS == 4)
%define %%STATE_IN_B_H   %6  ;; [out] ZMM containing state "B" part for states 5-8 (or "none" if NUM_BLOCKS == 4)
%define %%STATE_IN_C_H   %7  ;; [out] ZMM containing state "C" part for states 5-8 (or "none" if NUM_BLOCKS == 4)
%define %%STATE_IN_D_H   %8  ;; [out] ZMM containing state "D" part for states 5-8 (or "none" if NUM_BLOCKS == 4)
%define %%ZTMP0          %9  ;; [clobbered] ZMM temp reg
%define %%ZTMP1          %10 ;; [clobbered] ZMM temp reg
%define %%LAST_BLK_CNT   %11 ;; [in] Last block counter
%define %%IV             %12 ;; [in] Pointer to IV
%define %%KEYS           %13 ;; [in/clobbered] Pointer to keys
%define %%KMASK          %14 ;; [clobbered] Mask register
%define %%NUM_BLOCKS     %15 ;; [in] Number of state blocks to prepare (numerical)

        ;; Prepare next 8 states (or 4, if 4 or fewer blocks are left)
        vbroadcastf64x2  %%STATE_IN_B_L, [%%KEYS]            ; Load key bytes 0-15
        vbroadcastf64x2  %%STATE_IN_C_L, [%%KEYS + 16]       ; Load key bytes 16-31
        mov       %%KEYS, 0xfff ; Reuse %%KEYS register, as it is not going to be used again
        kmovq     %%KMASK, %%KEYS
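        ;; Load the 12-byte nonce into dwords 0-2, then shift it into
        ;; dwords 1-3, leaving dword 0 free for the block counter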
        vmovdqu8  XWORD(%%STATE_IN_D_L){%%KMASK}, [%%IV] ; Load Nonce (12 bytes)
        vpslldq   XWORD(%%STATE_IN_D_L), 4
        vshufi64x2 %%STATE_IN_D_L, %%STATE_IN_D_L, 0 ; Broadcast 128 bits to 512 bits
        vbroadcastf64x2 %%STATE_IN_A_L, [rel constants]

%if %%NUM_BLOCKS == 8
        ;; Prepare chacha states 5-8
        vmovdqa64 %%STATE_IN_A_H, %%STATE_IN_A_L
        vmovdqa64 %%STATE_IN_B_H, %%STATE_IN_B_L
        vmovdqa64 %%STATE_IN_C_H, %%STATE_IN_C_L
        vmovdqa64 %%STATE_IN_D_H, %%STATE_IN_D_L
%endif

        ; Broadcast last block counter
        vmovq   XWORD(%%ZTMP0), %%LAST_BLK_CNT
        vshufi32x4 %%ZTMP0, %%ZTMP0, 0x00
%if %%NUM_BLOCKS == 4
        ; Add 1-4 to construct next block counters
        vpaddq  %%ZTMP0, [rel add_1_4]
        vporq   %%STATE_IN_D_L, %%ZTMP0
%else
        ; Add 1-8 to construct next block counters
        vmovdqa64 %%ZTMP1, %%ZTMP0
        vpaddq  %%ZTMP0, [rel add_1_4]
        vpaddq  %%ZTMP1, [rel add_5_8]
        vporq   %%STATE_IN_D_L, %%ZTMP0
        vporq   %%STATE_IN_D_H, %%ZTMP1
%endif
%endmacro

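;; submit_job_chacha20_enc_dec_avx512(job) - job pointer in arg1,
;; returns the job pointer in rax with the status updated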
align 32
MKGLOBAL(submit_job_chacha20_enc_dec_avx512,function,internal)
submit_job_chacha20_enc_dec_avx512:

%define src     r8
%define dst     r9
%define len     r10
%define iv      r11
%define keys    rdx
%define tmp     rdx
%define off     rax

        xor     off, off

        mov     tmp, 0xffffffffffffffff
        kmovq   k1, tmp

        mov     len, [job + _msg_len_to_cipher_in_bytes]
        mov     src, [job + _src]
        add     src, [job + _cipher_start_src_offset_in_bytes]
        mov     dst, [job + _dst]
        mov     keys, [job + _enc_keys]
        mov     iv, [job + _iv]

        ; If 64*8 bytes or less, prepare states directly for up to 8 blocks
        cmp     len, 64*8
        jbe     exit_loop

        ; Prepare first 16 chacha20 states from IV, key, constants and counter values
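        ;; Layout: zmm(i) will hold dword i of all 16 states, one state
        ;; per 32-bit lane, forming a 16x16 dword matrix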
        vpbroadcastd zmm0, [rel constants]
        vpbroadcastd zmm1, [rel constants + 4]
        vpbroadcastd zmm2, [rel constants + 8]
        vpbroadcastd zmm3, [rel constants + 12]

        vpbroadcastd zmm4, [keys]
        vpbroadcastd zmm5, [keys + 4]
        vpbroadcastd zmm6, [keys + 8]
        vpbroadcastd zmm7, [keys + 12]
        vpbroadcastd zmm8, [keys + 16]
        vpbroadcastd zmm9, [keys + 20]
        vpbroadcastd zmm10, [keys + 24]
        vpbroadcastd zmm11, [keys + 28]

        vpbroadcastd zmm13, [iv]
        vpbroadcastd zmm14, [iv + 4]
        vpbroadcastd zmm15, [iv + 8]
        ;; Set first 16 counter values
        vmovdqa64 zmm12, [rel set_1_16]

        cmp     len, 64*16
        jb      exit_loop

align 32
start_loop:
        ENCRYPT_1K zmm16, zmm17, zmm18, zmm19, zmm20, zmm21, zmm22, zmm23, \
                   zmm24, zmm25, zmm26, zmm27, zmm28, zmm29, zmm30, zmm31, \
                   zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, \
                   zmm9, zmm10, zmm11, zmm12, zmm13, zmm14, zmm15, src, dst, off

        ; Update remaining length
        sub     len, 64*16
        add     off, 64*16

        ; Reload the first two registers (zmm0 and zmm1),
        ; as they have been overwritten by the previous macro
        vpbroadcastd zmm0, [rel constants]
        vpbroadcastd zmm1, [rel constants + 4]

        ; Increment counter values
        vpaddd      zmm12, [rel add_16]

        cmp     len, 64*16
        jae     start_loop

exit_loop:

        ; Check if there are leftover blocks (less than 16*64 bytes)
        or      len, len
        jz      no_partial_block

        cmp     len, 64*8
        ja      more_than_8_blocks_left

        cmp     len, 64*4
        ja      more_than_4_blocks_left

        ;; up to 4 blocks left

        ; Get last block counter by dividing offset by 64
        shr     off, 6
        PREPARE_NEXT_STATES_4_TO_8 zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, \
                                   zmm8, zmm9, off, iv, keys, k2, 4
        shl     off, 6 ; Restore offset

        ; Use the same first 4 registers as the output of GENERATE_1K_KS,
        ; so that common code can be used later for encryption
        GENERATE_512_KS zmm25, zmm16, zmm17, zmm29, none, none, none, none, \
                        zmm0, zmm1, zmm2, zmm3, none, none, none, none, \
                        zmm8, zmm9, zmm10, zmm11, 4

        jmp ks_gen_done

more_than_4_blocks_left:
        ;; up to 8 blocks left

        ; Get last block counter by dividing offset by 64
        shr     off, 6
        PREPARE_NEXT_STATES_4_TO_8 zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, \
                                   zmm8, zmm9, off, iv, keys, k2, 8
        shl     off, 6 ; Restore offset

        ; Use the same first 8 registers as the output of GENERATE_1K_KS,
        ; so that common code can be used later for encryption
        GENERATE_512_KS zmm25, zmm16, zmm17, zmm29, zmm19, zmm24, zmm26, zmm23, \
                        zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, \
                        zmm8, zmm9, zmm10, zmm11, 8

        jmp ks_gen_done
more_than_8_blocks_left:
        ; Generate another 64*16 bytes of keystream and XOR only the leftover plaintext
        GENERATE_1K_KS zmm16, zmm17, zmm18, zmm19, zmm20, zmm21, zmm22, zmm23, \
                       zmm24, zmm25, zmm26, zmm27, zmm28, zmm29, zmm30, zmm31, \
                       zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, \
                       zmm9, zmm10, zmm11, zmm12, zmm13, zmm14, zmm15

ks_gen_done:

        ; Calculate number of final blocks (ceil(len / 64))
        mov     tmp, len
        add     tmp, 63
        shr     tmp, 6
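
        ; Dispatch (binary-search style) on the number of final blocks (1-16)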

        cmp     tmp, 8
        je      final_num_blocks_is_8
        jb      final_num_blocks_is_1_7

        ; Final blocks 9-16
        cmp     tmp, 12
        je      final_num_blocks_is_12
        jb      final_num_blocks_is_9_11

        ; Final blocks 13-16
        cmp     tmp, 14
        je      final_num_blocks_is_14
        jb      final_num_blocks_is_13

        cmp     tmp, 15
        je      final_num_blocks_is_15
        jmp     final_num_blocks_is_16

final_num_blocks_is_9_11:
        cmp     tmp, 10
        je      final_num_blocks_is_10
        jb      final_num_blocks_is_9
        ja      final_num_blocks_is_11

final_num_blocks_is_1_7:
        ; Final blocks 1-7
        cmp     tmp, 4
        je      final_num_blocks_is_4
        jb      final_num_blocks_is_1_3

        ; Final blocks 5-7
        cmp     tmp, 6
        je      final_num_blocks_is_6
        jb      final_num_blocks_is_5
        ja      final_num_blocks_is_7

final_num_blocks_is_1_3:
        cmp     tmp, 2
        je      final_num_blocks_is_2
        ja      final_num_blocks_is_3

        ; 1 final block if no jump
%assign I 1
%rep 16
APPEND(final_num_blocks_is_, I):

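        ; len & 63 == 0 selects entry 0 of len_to_mask (all ones),
        ; i.e. the final block is full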
        lea     tmp, [rel len_to_mask]
        and     len, 63
        kmovq   k1, [tmp + len*8]

APPEND(no_mask_update, I):
        ENCRYPT_1_16_BLOCKS zmm25, zmm16, zmm17, zmm29, zmm19, zmm24, zmm26, zmm23, \
                            zmm20, zmm21, zmm31, zmm27, zmm28, zmm22, zmm30, zmm18, \
                            zmm0, src, dst, off, k1, I
        jmp     no_partial_block

%assign I (I + 1)
%endrep

no_partial_block:

%ifdef SAFE_DATA
        clear_all_zmms_asm
%endif
        mov     rax, job
        or      dword [rax + _status], STS_COMPLETED_AES

        ret

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif