1;;
2;; Copyright (c) 2020, Intel Corporation
3;;
4;; Redistribution and use in source and binary forms, with or without
5;; modification, are permitted provided that the following conditions are met:
6;;
7;;     * Redistributions of source code must retain the above copyright notice,
8;;       this list of conditions and the following disclaimer.
9;;     * Redistributions in binary form must reproduce the above copyright
10;;       notice, this list of conditions and the following disclaimer in the
11;;       documentation and/or other materials provided with the distribution.
12;;     * Neither the name of Intel Corporation nor the names of its contributors
13;;       may be used to endorse or promote products derived from this software
14;;       without specific prior written permission.
15;;
16;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26;;
27%use smartalign
28
29%include "imb_job.asm"
30%include "include/reg_sizes.asm"
31%include "include/os.asm"
32%include "include/clear_regs.asm"
33%include "include/aes_common.asm"
34%include "mb_mgr_datastruct.asm"
35
36default rel
37
38extern ethernet_fcs_avx512_local
39
40;; In System V AMD64 ABI
41;;	callee saves: RBX, RBP, R12-R15
42;; Windows x64 ABI
43;;	callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15
44
45%define CONCAT(a,b) a %+ b
46
47struc STACKFRAME
48_rsp_save:      resq    1
49_job_save:      resq    1
50_gpr_save:	resq	4
51endstruc
52
53%ifdef LINUX
54%define arg1	rdi
55%define arg2	rsi
56%define arg3	rdx
57%else
58%define arg1	rcx
59%define arg2	rdx
60%define arg3	r8
61%endif
62
63%define job     arg1
64
65%define tmp1	rbx
66%define tmp2	rbp
67%define tmp3	r10
68%define tmp4	r11
69%define tmp5	r12
70%define tmp6	r13
71%define tmp7	r8
72%define tmp8	r9
73
74
75section .data
76
77;;; Precomputed constants for CRC32 (Ethernet FCS)
78;;;   Details of the CRC algorithm and 4 byte buffer of
79;;;   {0x01, 0x02, 0x03, 0x04}:
80;;;     Result     Poly       Init        RefIn  RefOut  XorOut
81;;;     0xB63CFBCD 0x04C11DB7 0xFFFFFFFF  true   true    0xFFFFFFFF
82
83align 16
84rk5:
85        dq 0x00000000ccaa009e, 0x0000000163cd6124
86rk7:
87        dq 0x00000001f7011640, 0x00000001db710640
88
89align 16
90
91fold_by_16: ;; fold by 16x128-bits
92        dq 0x00000000e95c1271, 0x00000000ce3371cb
93fold_by_8: ;; fold by 8x128-bits
94        dq 0x000000014a7fe880, 0x00000001e88ef372
95fold_by_4: ;; fold by 4x128-bits
96        dq 0x00000001c6e41596, 0x0000000154442bd4
97fold_by_2: ;; fold by 2x128-bits
98        dq 0x000000015a546366, 0x00000000f1da05aa
99fold_by_1: ;; fold by 1x128-bits
100        dq 0x00000000ccaa009e, 0x00000001751997d0
101
102align 16
103pshufb_shf_table:
104        ;;  use these values for shift registers with the pshufb instruction
105        dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
106        dq 0x0706050403020100, 0x000e0d0c0b0a0908
107
108align 16
109init_crc_value:
110        dq 0x00000000FFFFFFFF, 0x0000000000000000
111
112align 16
113mask:
114        dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
115
116align 16
117mask2:
118        dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
119align 16
120mask3:
121        dq 0x8080808080808080, 0x8080808080808080
122
123align 16
124mask_out_top_bytes:
125        dq 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
126        dq 0x0000000000000000, 0x0000000000000000
127
128;;; partial block read/write table
129align 64
130byte_len_to_mask_table:
131        dw      0x0000, 0x0001, 0x0003, 0x0007,
132        dw      0x000f, 0x001f, 0x003f, 0x007f,
133        dw      0x00ff, 0x01ff, 0x03ff, 0x07ff,
134        dw      0x0fff, 0x1fff, 0x3fff, 0x7fff,
135        dw      0xffff
136
137section .text
138
139;; ===================================================================
140;; ===================================================================
141;; CRC multiply before XOR against data block
142;; ===================================================================
143%macro CRC_CLMUL 4
144%define %%XCRC_IN_OUT   %1 ; [in/out] XMM with CRC (can be anything if "no_crc" below)
145%define %%XCRC_MUL      %2 ; [in] XMM with CRC constant  (can be anything if "no_crc" below)
146%define %%XCRC_DATA     %3 ; [in] XMM with data block
147%define %%XTMP          %4 ; [clobbered] temporary XMM
148
149        vpclmulqdq      %%XTMP, %%XCRC_IN_OUT, %%XCRC_MUL, 0x01
150        vpclmulqdq      %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%XCRC_MUL, 0x10
151        vpternlogq      %%XCRC_IN_OUT, %%XTMP, %%XCRC_DATA, 0x96 ; XCRC = XCRC ^ XTMP ^ DATA
152%endmacro
153
154;; ===================================================================
155;; ===================================================================
156;; CRC32 calculation on 16 byte data
157;; ===================================================================
158%macro CRC_UPDATE16 6
159%define %%INP           %1  ; [in/out] GP with input text pointer or "no_load"
160%define %%XCRC_IN_OUT   %2  ; [in/out] XMM with CRC (can be anything if "no_crc" below)
161%define %%XCRC_MUL      %3  ; [in] XMM with CRC multiplier constant
162%define %%TXMM1         %4  ; [clobbered|in] XMM temporary or data in (no_load)
163%define %%TXMM2         %5  ; [clobbered] XMM temporary
164%define %%CRC_TYPE      %6  ; [in] "first_crc" or "next_crc" or "no_crc"
165
166        ;; load data and increment in pointer
167%ifnidn %%INP, no_load
168        vmovdqu64       %%TXMM1, [%%INP]
169        add             %%INP,  16
170%endif
171
172        ;; CRC calculation
173%ifidn %%CRC_TYPE, next_crc
174        CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%TXMM1, %%TXMM2
175%endif
176%ifidn %%CRC_TYPE, first_crc
177        ;; in the first run just XOR initial CRC with the first block
178        vpxorq          %%XCRC_IN_OUT, %%TXMM1
179%endif
180
181%endmacro
182
183;; ===================================================================
184;; ===================================================================
185;; Barrett reduction from 128-bits to 32-bits modulo Ethernet FCS polynomial
186;; ===================================================================
187%macro CRC32_REDUCE_128_TO_32 5
188%define %%CRC   %1         ; [out] GP to store 32-bit Ethernet FCS value
189%define %%XCRC  %2         ; [in/clobbered] XMM with CRC
190%define %%XT1   %3         ; [clobbered] temporary xmm register
191%define %%XT2   %4         ; [clobbered] temporary xmm register
192%define %%XT3   %5         ; [clobbered] temporary xmm register
193
194%define %%XCRCKEY %%XT3
195
196        ;;  compute crc of a 128-bit value
197        vmovdqa64       %%XCRCKEY, [rel rk5]
198
199        ;; 64b fold
200        vpclmulqdq      %%XT1, %%XCRC, %%XCRCKEY, 0x00
201        vpsrldq         %%XCRC, %%XCRC, 8
202        vpxorq          %%XCRC, %%XCRC, %%XT1
203
204        ;; 32b fold
205        vpslldq         %%XT1, %%XCRC, 4
206        vpclmulqdq      %%XT1, %%XT1, %%XCRCKEY, 0x10
207        vpxorq          %%XCRC, %%XCRC, %%XT1
208
209%%_crc_barrett:
210        ;; Barrett reduction
211        vpandq          %%XCRC, [rel mask2]
212        vmovdqa64       %%XT1, %%XCRC
213        vmovdqa64       %%XT2, %%XCRC
214        vmovdqa64       %%XCRCKEY, [rel rk7]
215
216        vpclmulqdq      %%XCRC, %%XCRCKEY, 0x00
217        vpxorq          %%XCRC, %%XT2
218        vpandq          %%XCRC, [rel mask]
219        vmovdqa64       %%XT2, %%XCRC
220        vpclmulqdq      %%XCRC, %%XCRCKEY, 0x10
221        vpternlogq      %%XCRC, %%XT2, %%XT1, 0x96 ; XCRC = XCRC ^ XT2 ^ XT1
222        vpextrd         DWORD(%%CRC), %%XCRC, 2 ; 32-bit CRC value
223        not             DWORD(%%CRC)
224%endmacro
225
226;; ===================================================================
227;; ===================================================================
228;; Barrett reduction from 64-bits to 32-bits modulo Ethernet FCS polynomial
229;; ===================================================================
230%macro CRC32_REDUCE_64_TO_32 5
231%define %%CRC   %1         ; [out] GP to store 32-bit Ethernet FCS value
232%define %%XCRC  %2         ; [in/clobbered] XMM with CRC
233%define %%XT1   %3         ; [clobbered] temporary xmm register
234%define %%XT2   %4         ; [clobbered] temporary xmm register
235%define %%XT3   %5         ; [clobbered] temporary xmm register
236
237%define %%XCRCKEY %%XT3
238
239        ;; Barrett reduction
240        vpandq          %%XCRC, [rel mask2]
241        vmovdqa64       %%XT1, %%XCRC
242        vmovdqa64       %%XT2, %%XCRC
243        vmovdqa64       %%XCRCKEY, [rel rk7]
244
245        vpclmulqdq      %%XCRC, %%XCRCKEY, 0x00
246        vpxorq          %%XCRC, %%XT2
247        vpandq          %%XCRC, [rel mask]
248        vmovdqa64       %%XT2, %%XCRC
249        vpclmulqdq      %%XCRC, %%XCRCKEY, 0x10
250        vpternlogq      %%XCRC, %%XT2, %%XT1, 0x96 ; XCRC = XCRC ^ XT2 ^ XT1
251        vpextrd         DWORD(%%CRC), %%XCRC, 2 ; 32-bit CRC value
252        not             DWORD(%%CRC)
253%endmacro
254
255;; ===================================================================
256;; ===================================================================
257;; ETHERNET FCS CRC
258;; ===================================================================
259%macro ETHERNET_FCS_CRC 9
260%define %%p_in          %1  ; [in] pointer to the buffer (GPR)
261%define %%bytes_to_crc  %2  ; [in] number of bytes in the buffer (GPR)
262%define %%ethernet_fcs  %3  ; [out] GPR to put CRC value into (32 bits)
263%define %%xcrc          %4  ; [in] initial CRC value (xmm)
264%define %%tmp           %5  ; [clobbered] temporary GPR
265%define %%xcrckey       %6  ; [clobbered] temporary XMM / CRC multiplier
266%define %%xtmp1         %7  ; [clobbered] temporary XMM
267%define %%xtmp2         %8  ; [clobbered] temporary XMM
268%define %%xtmp3         %9  ; [clobbered] temporary XMM
269
270        ;; load CRC constants
271        vmovdqa64       %%xcrckey, [rel fold_by_1]
272
273        cmp             %%bytes_to_crc, 32
274        jae             %%_at_least_32_bytes
275
276        ;; less than 32 bytes
277        cmp             %%bytes_to_crc, 16
278        je              %%_exact_16_left
279        jl              %%_less_than_16_left
280
281        ;; load the plain-text
282        vmovdqu64       %%xtmp1, [%%p_in]
283        vpxorq          %%xcrc, %%xtmp1   ; xor the initial crc value
284        add             %%p_in, 16
285        sub             %%bytes_to_crc, 16
286        jmp             %%_crc_two_xmms
287
288%%_exact_16_left:
289        vmovdqu64       %%xtmp1, [%%p_in]
290        vpxorq          %%xcrc, %%xtmp1 ; xor the initial CRC value
291        jmp             %%_128_done
292
293%%_less_than_16_left:
294        lea             %%tmp, [rel byte_len_to_mask_table]
295        kmovw           k1, [%%tmp + %%bytes_to_crc*2]
296        vmovdqu8        %%xtmp1{k1}{z}, [%%p_in]
297
298        vpxorq          %%xcrc, %%xtmp1 ; xor the initial CRC value
299
300        cmp             %%bytes_to_crc, 4
301        jb              %%_less_than_4_left
302
303        lea             %%tmp, [rel pshufb_shf_table]
304        vmovdqu64       %%xtmp1, [%%tmp + %%bytes_to_crc]
305        vpshufb         %%xcrc, %%xtmp1
306        jmp             %%_128_done
307
308%%_less_than_4_left:
309        ;; less than 4 bytes left
310        cmp             %%bytes_to_crc, 3
311        jne             %%_less_than_3_left
312        vpslldq         %%xcrc, 5
313        jmp             %%_do_barret
314
315%%_less_than_3_left:
316        cmp             %%bytes_to_crc, 2
317        jne             %%_less_than_2_left
318        vpslldq         %%xcrc, 6
319        jmp             %%_do_barret
320
321%%_less_than_2_left:
322        vpslldq         %%xcrc, 7
323
324%%_do_barret:
325        CRC32_REDUCE_64_TO_32 %%ethernet_fcs, %%xcrc, %%xtmp1, %%xtmp2, %%xcrckey
326        jmp             %%_64_done
327
328%%_at_least_32_bytes:
329        CRC_UPDATE16 %%p_in, %%xcrc, %%xcrckey, %%xtmp1, %%xtmp2, first_crc
330        sub             %%bytes_to_crc, 16
331
332%%_main_loop:
333        cmp             %%bytes_to_crc, 16
334        jb              %%_exit_loop
335        CRC_UPDATE16 %%p_in, %%xcrc, %%xcrckey, %%xtmp1, %%xtmp2, next_crc
336        sub             %%bytes_to_crc, 16
337        jz              %%_128_done
338        jmp             %%_main_loop
339
340%%_exit_loop:
341
342        ;; Partial bytes left - complete CRC calculation
343%%_crc_two_xmms:
344        lea             %%tmp, [rel pshufb_shf_table]
345        vmovdqu64       %%xtmp2, [%%tmp + %%bytes_to_crc]
346        vmovdqu64       %%xtmp1, [%%p_in - 16 + %%bytes_to_crc]  ; xtmp1 = data for CRC
347        vmovdqa64       %%xtmp3, %%xcrc
348        vpshufb         %%xcrc, %%xtmp2  ; top num_bytes with LSB xcrc
349        vpxorq          %%xtmp2, [rel mask3]
350        vpshufb         %%xtmp3, %%xtmp2 ; bottom (16 - num_bytes) with MSB xcrc
351
352        ;; data num_bytes (top) blended with MSB bytes of CRC (bottom)
353        vpblendvb       %%xtmp3, %%xtmp1, %%xtmp2
354
355        ;; final CRC calculation
356        CRC_CLMUL %%xcrc, %%xcrckey, %%xtmp3, %%xtmp1
357
358%%_128_done:
359        CRC32_REDUCE_128_TO_32 %%ethernet_fcs, %%xcrc, %%xtmp1, %%xtmp2, %%xcrckey
360%%_64_done:
361%endmacro
362
363;; ===================================================================
364;; ===================================================================
365;; AES128/256 CBC decryption on 1 to 16 blocks
366;; ===================================================================
367%macro AES_CBC_DEC_1_TO_16 17
368%define %%SRC           %1  ; [in] GP with pointer to source buffer
369%define %%DST           %2  ; [in] GP with pointer to destination buffer
370%define %%NUMBL         %3  ; [in] numerical constant with number of blocks to process
371%define %%OFFS          %4  ; [in/out] GP with src/dst buffer offset
372%define %%NBYTES        %5  ; [in/out] GP with number of bytes to decrypt
373%define %%KEY_PTR       %6  ; [in] GP with pointer to expanded AES decrypt keys
374%define %%ZIV           %7  ; [in/out] IV in / last cipher text block on out (xmm0 - xmm15)
375%define %%NROUNDS       %8  ; [in] number of rounds; numerical value
376%define %%CIPHER_00_03  %9  ; [out] ZMM next 0-3 cipher blocks
377%define %%CIPHER_04_07  %10 ; [out] ZMM next 4-7 cipher blocks
378%define %%CIPHER_08_11  %11 ; [out] ZMM next 8-11 cipher blocks
379%define %%CIPHER_12_15  %12 ; [out] ZMM next 12-15 cipher blocks
380%define %%ZT1           %13 ; [clobbered] ZMM temporary
381%define %%ZT2           %14 ; [clobbered] ZMM temporary
382%define %%ZT3           %15 ; [clobbered] ZMM temporary
383%define %%ZT4           %16 ; [clobbered] ZMM temporary
384%define %%ZT5           %17 ; [clobbered] ZMM temporary
385
386        ;; /////////////////////////////////////////////////
387        ;; load cipher text
388        ZMM_LOAD_BLOCKS_0_16 %%NUMBL, %%SRC, %%OFFS, \
389                %%CIPHER_00_03, %%CIPHER_04_07, \
390                %%CIPHER_08_11, %%CIPHER_12_15
391
392        ;; /////////////////////////////////////////////////
393        ;; prepare cipher text blocks for an XOR after AES-DEC rounds
394        valignq         %%ZT1, %%CIPHER_00_03, %%ZIV, 6
395%if %%NUMBL > 4
396        valignq         %%ZT2, %%CIPHER_04_07, %%CIPHER_00_03, 6
397%endif
398%if %%NUMBL > 8
399        valignq         %%ZT3, %%CIPHER_08_11, %%CIPHER_04_07, 6
400%endif
401%if %%NUMBL > 12
402        valignq         %%ZT4, %%CIPHER_12_15, %%CIPHER_08_11, 6
403%endif
404
405        ;; /////////////////////////////////////////////////
406        ;; update IV with the last cipher block
407%if %%NUMBL < 4
408        valignq         %%ZIV, %%CIPHER_00_03, %%CIPHER_00_03, ((%%NUMBL % 4) * 2)
409%elif %%NUMBL == 4
410        vmovdqa64       %%ZIV, %%CIPHER_00_03
411%elif %%NUMBL < 8
412        valignq         %%ZIV, %%CIPHER_04_07, %%CIPHER_04_07, ((%%NUMBL % 4) * 2)
413%elif %%NUMBL == 8
414        vmovdqa64       %%ZIV, %%CIPHER_04_07
415%elif %%NUMBL < 12
416        valignq         %%ZIV, %%CIPHER_08_11, %%CIPHER_08_11, ((%%NUMBL % 4) * 2)
417%elif %%NUMBL == 12
418        vmovdqa64       %%ZIV, %%CIPHER_08_11
419%elif %%NUMBL < 16
420        valignq         %%ZIV, %%CIPHER_12_15, %%CIPHER_12_15, ((%%NUMBL % 4) * 2)
421%else ;; %%NUMBL == 16
422        vmovdqa64       %%ZIV, %%CIPHER_12_15
423%endif
424
425        ;; /////////////////////////////////////////////////
426        ;; AES rounds including XOR
427%assign j 0
428%rep (%%NROUNDS + 2)
429     vbroadcastf64x2    %%ZT5, [%%KEY_PTR + (j * 16)]
430     ZMM_AESDEC_ROUND_BLOCKS_0_16 %%CIPHER_00_03, %%CIPHER_04_07, \
431                        %%CIPHER_08_11, %%CIPHER_12_15, \
432                        %%ZT5, j, %%ZT1, %%ZT2, %%ZT3, %%ZT4, \
433                        %%NUMBL, %%NROUNDS
434%assign j (j + 1)
435%endrep
436
437        ;; /////////////////////////////////////////////////
438        ;; write plain text back to output
439        ZMM_STORE_BLOCKS_0_16 %%NUMBL, %%DST, %%OFFS, \
440                %%CIPHER_00_03, %%CIPHER_04_07, \
441                %%CIPHER_08_11, %%CIPHER_12_15
442
443        ;; /////////////////////////////////////////////////
444        ;; update lengths and offset
445        add             %%OFFS, (%%NUMBL * 16)
446        sub             %%NBYTES, (%%NUMBL * 16)
447%endmacro       ;; AES_CBC_DEC_1_TO_16
448
449;; ===================================================================
450;; ===================================================================
451;; CRC32 on 1 to 16 blocks (first_crc case only)
452;; ===================================================================
453%macro CRC32_FIRST_1_TO_16 13
454%define %%CRC_MUL    %1  ; [in] XMM with CRC multiplier
455%define %%CRC_IN_OUT %2  ; [in/out] current CRC value
456%define %%XTMP       %3  ; [clobbered] temporary XMM
457%define %%XTMP2      %4  ; [clobbered] temporary XMM
458%define %%NUMBL      %5  ; [in] number of blocks of clear text to compute CRC on
459%define %%ZCRCIN0    %6  ; [in] clear text 4 blocks
460%define %%ZCRCIN1    %7  ; [in] clear text 4 blocks
461%define %%ZCRCIN2    %8  ; [in] clear text 4 blocks
462%define %%ZCRCIN3    %9  ; [in] clear text 4 blocks
463%define %%ZCRCSUM0   %10 ; [clobbered] temporary ZMM
464%define %%ZCRCSUM1   %11 ; [clobbered] temporary ZMM
465%define %%ZCRCSUM2   %12 ; [clobbered] temporary ZMM
466%define %%ZCRCSUM3   %13 ; [clobbered] temporary ZMM
467
468%xdefine %%ZTMP0 ZWORD(%%XTMP)
469%xdefine %%ZTMP1 ZWORD(%%XTMP2)
470
471%if (%%NUMBL == 0)
472        ;; do nothing
473%elif (%%NUMBL == 1)
474        vpxorq          %%CRC_IN_OUT, XWORD(%%ZCRCIN0)
475%elif (%%NUMBL == 16)
476        vmovdqa64       %%ZCRCSUM0, %%ZCRCIN0
477        vmovdqa64       %%ZCRCSUM1, %%ZCRCIN1
478        vmovdqa64       %%ZCRCSUM2, %%ZCRCIN2
479        vmovdqa64       %%ZCRCSUM3, %%ZCRCIN3
480
481        ;; Add current CRC sum into block 0
482        vmovdqa64       %%CRC_IN_OUT, %%CRC_IN_OUT
483        vpxorq          %%ZCRCSUM0, %%ZCRCSUM0, ZWORD(%%CRC_IN_OUT)
484        ;; fold 16 x 128 bits -> 8 x 128 bits
485        vbroadcastf64x2 %%ZTMP0, [rel fold_by_8]
486        vpclmulqdq      %%ZTMP1, %%ZCRCSUM0, %%ZTMP0, 0x01
487        vpclmulqdq      %%ZCRCSUM0, %%ZCRCSUM0, %%ZTMP0, 0x10
488        vpternlogq      %%ZCRCSUM0, %%ZCRCSUM2, %%ZTMP1, 0x96
489
490        vpclmulqdq      %%ZTMP1, %%ZCRCSUM1, %%ZTMP0, 0x01
491        vpclmulqdq      %%ZCRCSUM1, %%ZCRCSUM1, %%ZTMP0, 0x10
492        vpternlogq      %%ZCRCSUM1, %%ZCRCSUM3, %%ZTMP1, 0x96
493
494        ;; fold 8 x 128 bits -> 4 x 128 bits
495        vbroadcastf64x2 %%ZTMP0, [rel fold_by_4]
496        vpclmulqdq      %%ZTMP1, %%ZCRCSUM0, %%ZTMP0, 0x01
497        vpclmulqdq      %%ZCRCSUM0, %%ZCRCSUM0, %%ZTMP0, 0x10
498        vpternlogq      %%ZCRCSUM0, %%ZCRCSUM1, %%ZTMP1, 0x96
499
500        ;; fold 4 x 128 bits -> 2 x 128 bits
501        vbroadcastf64x2 YWORD(%%ZTMP0), [rel fold_by_2]
502        vextracti64x4   YWORD(%%ZCRCSUM1), %%ZCRCSUM0, 1
503        vpclmulqdq      YWORD(%%ZTMP1), YWORD(%%ZCRCSUM0), YWORD(%%ZTMP0), 0x01
504        vpclmulqdq      YWORD(%%ZCRCSUM0), YWORD(%%ZCRCSUM0), YWORD(%%ZTMP0), 0x10
505        vpternlogq      YWORD(%%ZCRCSUM0), YWORD(%%ZCRCSUM1), YWORD(%%ZTMP1), 0x96
506
507        ;; fold 2 x 128 bits -> 1 x 128 bits
508        vmovdqa64       XWORD(%%ZTMP0), [rel fold_by_1]
509        vextracti64x2   XWORD(%%ZCRCSUM1), YWORD(%%ZCRCSUM0), 1
510        vpclmulqdq      XWORD(%%ZTMP1), XWORD(%%ZCRCSUM0), XWORD(%%ZTMP0), 0x01
511        vpclmulqdq      XWORD(%%ZCRCSUM0), XWORD(%%ZCRCSUM0), XWORD(%%ZTMP0), 0x10
512        vpternlogq      XWORD(%%ZCRCSUM0), XWORD(%%ZCRCSUM1), XWORD(%%ZTMP1), 0x96
513        vmovdqa64       %%CRC_IN_OUT, XWORD(%%ZCRCSUM0)
514
515%else
516
517        vpxorq          %%ZCRCSUM0, %%ZCRCSUM0
518        vpxorq          %%ZCRCSUM1, %%ZCRCSUM1
519        vpxorq          %%ZCRCSUM2, %%ZCRCSUM2
520        vpxorq          %%ZCRCSUM3, %%ZCRCSUM3
521
522        vmovdqa64       %%ZCRCSUM0, %%ZCRCIN0
523%if %%NUMBL > 4
524        vmovdqa64       %%ZCRCSUM1, %%ZCRCIN1
525%endif
526%if %%NUMBL > 8
527        vmovdqa64       %%ZCRCSUM2, %%ZCRCIN2
528%endif
529%if %%NUMBL > 12
530        vmovdqa64       %%ZCRCSUM3, %%ZCRCIN3
531%endif
532
533        ;; Add current CRC sum into block 0
534        vmovdqa64       %%CRC_IN_OUT, %%CRC_IN_OUT
535        vpxorq          %%ZCRCSUM0, %%ZCRCSUM0, ZWORD(%%CRC_IN_OUT)
536
537%assign blocks_left %%NUMBL
538
539%if (%%NUMBL >= 12)
540        vbroadcastf64x2 %%ZTMP0, [rel fold_by_4]
541        vpclmulqdq      %%ZTMP1, %%ZCRCSUM0, %%ZTMP0, 0x01
542        vpclmulqdq      %%ZCRCSUM0, %%ZCRCSUM0, %%ZTMP0, 0x10
543        vpternlogq      %%ZCRCSUM0, %%ZCRCSUM1, %%ZTMP1, 0x96
544
545        vpclmulqdq      %%ZTMP1, %%ZCRCSUM0, %%ZTMP0, 0x01
546        vpclmulqdq      %%ZCRCSUM0, %%ZCRCSUM0, %%ZTMP0, 0x10
547        vpternlogq      %%ZCRCSUM0, %%ZCRCSUM2, %%ZTMP1, 0x96
548        vmovdqa64       %%ZCRCSUM1, %%ZCRCSUM3
549
550%assign blocks_left (blocks_left - 8)
551
552%elif (%%NUMBL >= 8)
553        vbroadcastf64x2 %%ZTMP0, [rel fold_by_4]
554        vpclmulqdq      %%ZTMP1, %%ZCRCSUM0, %%ZTMP0, 0x01
555        vpclmulqdq      %%ZCRCSUM0, %%ZCRCSUM0, %%ZTMP0, 0x10
556        vpternlogq      %%ZCRCSUM0, %%ZCRCSUM1, %%ZTMP1, 0x96
557        vmovdqa64       %%ZCRCSUM1, %%ZCRCSUM2
558
559%assign blocks_left (blocks_left - 4)
560%endif
561
562        ;; 1 to 8 blocks left in ZCRCSUM0 and ZCRCSUM1
563
564%if blocks_left >= 4
565        ;; fold 4 x 128 bits -> 2 x 128 bits
566        vbroadcastf64x2 YWORD(%%ZTMP0), [rel fold_by_2]
567        vextracti64x4   YWORD(%%ZCRCSUM3), %%ZCRCSUM0, 1
568        vpclmulqdq      YWORD(%%ZTMP1), YWORD(%%ZCRCSUM0), YWORD(%%ZTMP0), 0x01
569        vpclmulqdq      YWORD(%%ZCRCSUM0), YWORD(%%ZCRCSUM0), YWORD(%%ZTMP0), 0x10
570        vpternlogq      YWORD(%%ZCRCSUM0), YWORD(%%ZCRCSUM3), YWORD(%%ZTMP1), 0x96
571
572        ;; fold 2 x 128 bits -> 1 x 128 bits
573        vmovdqa64       XWORD(%%ZTMP0), [rel fold_by_1]
574        vextracti64x2   XWORD(%%ZCRCSUM3), YWORD(%%ZCRCSUM0), 1
575        vpclmulqdq      XWORD(%%ZTMP1), XWORD(%%ZCRCSUM0), XWORD(%%ZTMP0), 0x01
576        vpclmulqdq      XWORD(%%ZCRCSUM0), XWORD(%%ZCRCSUM0), XWORD(%%ZTMP0), 0x10
577        vpternlogq      XWORD(%%ZCRCSUM0), XWORD(%%ZCRCSUM3), XWORD(%%ZTMP1), 0x96
578
579        vmovdqa64       %%CRC_IN_OUT, XWORD(%%ZCRCSUM0)
580
581        vmovdqa64       %%ZCRCSUM0, %%ZCRCSUM1
582
583%assign blocks_left (blocks_left - 4)
584
585%else
586        vmovdqa64       %%CRC_IN_OUT, XWORD(%%ZCRCSUM0)
587        vshufi64x2      %%ZCRCSUM0, %%ZCRCSUM0, %%ZCRCSUM0, 0011_1001b
588
589%assign blocks_left (blocks_left - 1)
590%endif
591
592%rep blocks_left
593        vmovdqa64       %%XTMP, XWORD(%%ZCRCSUM0)
594        CRC_CLMUL       %%CRC_IN_OUT, %%CRC_MUL, %%XTMP, %%XTMP2
595        vshufi64x2      %%ZCRCSUM0, %%ZCRCSUM0, %%ZCRCSUM0, 0011_1001b
596%endrep
597
598%endif  ;; %%NUMBL > 0
599
600%endmacro       ;; CRC32_FIRST_1_TO_16
601
602;; ===================================================================
603;; ===================================================================
604;; Stitched AES128/256 CBC decryption & CRC32 on 16 blocks
605;; ===================================================================
606%macro AES_CBC_DEC_CRC32_16 22
607%define %%SRC        %1  ; [in] GP with pointer to source buffer
608%define %%DST        %2  ; [in] GP with pointer to destination buffer
609%define %%OFFS       %3  ; [in/out] GP with src/dst buffer offset
610%define %%NBYTES     %4  ; [in/out] GP with number of bytes to decrypt
611%define %%KEY_PTR    %5  ; [in] GP with pointer to expanded AES decrypt keys
612%define %%ZIV        %6  ; [in/out] IV in / last cipher text block on out
613%define %%ZD0        %7  ; [clobbered] temporary ZMM
614%define %%ZD1        %8  ; [clobbered] temporary ZMM
615%define %%ZD2        %9  ; [clobbered] temporary ZMM
616%define %%ZD3        %10 ; [clobbered] temporary ZMM
617%define %%ZC0        %11 ; [clobbered] temporary ZMM
618%define %%ZC1        %12 ; [clobbered] temporary ZMM
619%define %%ZC2        %13 ; [clobbered] temporary ZMM
620%define %%ZC3        %14 ; [clobbered] temporary ZMM
621%define %%ZTMP0      %15 ; [clobbered] temporary ZMM
622%define %%ZTMP1      %16 ; [clobbered] temporary ZMM
623%define %%NROUNDS    %17 ; [in] Number of rounds (9 or 13)
624%define %%ZCRC_SUM0  %18 ; [in/out] current CRC value
625%define %%ZCRC_SUM1  %19 ; [in/out] current CRC value
626%define %%ZCRC_SUM2  %20 ; [in/out] current CRC value
627%define %%ZCRC_SUM3  %21 ; [in/out] current CRC value
628%define %%LAST_BLOCK %22 ; [out] xmm to store the last clear text block
629
630        ;; /////////////////////////////////////////////////
631        ;; load cipher text blocks
632        ZMM_LOAD_BLOCKS_0_16 16, %%SRC, %%OFFS, \
633                %%ZC0, %%ZC1, %%ZC2, %%ZC3
634
635        ;; /////////////////////////////////////////////////
636        ;; prepare cipher text blocks for an XOR after AES-DEC rounds
637        valignq         %%ZD0, %%ZC0, %%ZIV, 6
638        valignq         %%ZD1, %%ZC1, %%ZC0, 6
639        valignq         %%ZD2, %%ZC2, %%ZC1, 6
640        valignq         %%ZD3, %%ZC3, %%ZC2, 6
641
642        ;; /////////////////////////////////////////////////
643        ;; update IV for the next round (block 3 in ZIV)
644        vmovdqa64       %%ZIV, %%ZC3
645
646        ;; /////////////////////////////////////////////////
647        ;; AES rounds 0 to 10/14 & CRC
648
649%assign round 0
650%rep (%%NROUNDS + 2)
651        ;; /////////////////////////////////////////////////
652        ;; AES decrypt round
653        vbroadcastf64x2 %%ZTMP0, [%%KEY_PTR + (round*16)]
654        ZMM_AESDEC_ROUND_BLOCKS_0_16 %%ZC0, %%ZC1, %%ZC2, %%ZC3, \
655                        %%ZTMP0, round, %%ZD0, %%ZD1, %%ZD2, %%ZD3, \
656                        16, %%NROUNDS
657%assign round (round + 1)
658%endrep
659
660        ;; /////////////////////////////////////////////////
661        ;; store clear text
662        ZMM_STORE_BLOCKS_0_16 16, %%DST, %%OFFS, \
663                %%ZC0, %%ZC1, %%ZC2, %%ZC3
664
665                ;; \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
666                ;; CRC just decrypted blocks
667                vbroadcastf64x2 %%ZTMP0, [rel fold_by_16]
668	        vpclmulqdq	%%ZTMP1, %%ZCRC_SUM0, %%ZTMP0, 0x10
669	        vpclmulqdq	%%ZCRC_SUM0, %%ZCRC_SUM0, %%ZTMP0, 0x01
670                vpternlogq      %%ZCRC_SUM0, %%ZTMP1, %%ZC0, 0x96
671
672                vpclmulqdq	%%ZTMP1, %%ZCRC_SUM1, %%ZTMP0, 0x10
673	        vpclmulqdq	%%ZCRC_SUM1, %%ZCRC_SUM1, %%ZTMP0, 0x01
674                vpternlogq      %%ZCRC_SUM1, %%ZTMP1, %%ZC1, 0x96
675
676	        vpclmulqdq	%%ZTMP1, %%ZCRC_SUM2, %%ZTMP0, 0x10
677	        vpclmulqdq	%%ZCRC_SUM2, %%ZCRC_SUM2, %%ZTMP0, 0x01
678                vpternlogq      %%ZCRC_SUM2, %%ZTMP1, %%ZC2, 0x96
679
680                vpclmulqdq	%%ZTMP1, %%ZCRC_SUM3, %%ZTMP0, 0x10
681	        vpclmulqdq	%%ZCRC_SUM3, %%ZCRC_SUM3, %%ZTMP0, 0x01
682                vpternlogq      %%ZCRC_SUM3, %%ZTMP1, %%ZC3, 0x96
683
684                vextracti64x2   %%LAST_BLOCK, %%ZC3, 3
685
686        ;; /////////////////////////////////////////////////
687        ;; update lengths and offset
688        add             %%OFFS, (16 * 16)
689        sub             %%NBYTES, (16 * 16)
690
691%endmacro       ;; AES_CBC_DEC_CRC32_16
692
693;; ===================================================================
694;; ===================================================================
695;; DOCSIS SEC BPI decryption + CRC32
696;; This macro is handling the case when the two components are
697;; executed together.
698;; ===================================================================
699%macro DOCSIS_DEC_CRC32 40
700%define %%KEYS       %1   ;; [in] GP with pointer to expanded keys (decrypt)
701%define %%SRC        %2   ;; [in] GP with pointer to source buffer
702%define %%DST        %3   ;; [in] GP with pointer to destination buffer
703%define %%NUM_BYTES  %4   ;; [in/clobbered] GP with number of bytes to decrypt
704%define %%KEYS_ENC   %5   ;; [in] GP with pointer to expanded keys (encrypt)
705%define %%GT1        %6   ;; [clobbered] temporary GP
706%define %%GT2        %7   ;; [clobbered] temporary GP
707%define %%XCRC_INIT  %8   ;; [in/out] CRC initial value
708%define %%XIV        %9   ;; [in/out] cipher IV
709%define %%ZT1        %10  ;; [clobbered] temporary ZMM
710%define %%ZT2        %11  ;; [clobbered] temporary ZMM
711%define %%ZT3        %12  ;; [clobbered] temporary ZMM
712%define %%ZT4        %13  ;; [clobbered] temporary ZMM
713%define %%ZT5        %14  ;; [clobbered] temporary ZMM
714%define %%ZT6        %15  ;; [clobbered] temporary ZMM
715%define %%ZT7        %16  ;; [clobbered] temporary ZMM
716%define %%ZT8        %17  ;; [clobbered] temporary ZMM
717%define %%ZT9        %18  ;; [clobbered] temporary ZMM
718%define %%ZT10       %19  ;; [clobbered] temporary ZMM
719%define %%ZT11       %20  ;; [clobbered] temporary ZMM
720%define %%ZT12       %21  ;; [clobbered] temporary ZMM
721%define %%ZT13       %22  ;; [clobbered] temporary ZMM
722                          ;; no ZT14 - taken by XIV
723                          ;; no ZT15 - taken by CRC_INIT
724%define %%ZT16       %23  ;; [clobbered] temporary ZMM
725%define %%ZT17       %24  ;; [clobbered] temporary ZMM
726%define %%ZT18       %25  ;; [clobbered] temporary ZMM
727%define %%ZT19       %26  ;; [clobbered] temporary ZMM
728%define %%ZT20       %27  ;; [clobbered] temporary ZMM
729%define %%ZT21       %28  ;; [clobbered] temporary ZMM
730%define %%ZT22       %29  ;; [clobbered] temporary ZMM
731%define %%ZT23       %30  ;; [clobbered] temporary ZMM
732%define %%ZT24       %31  ;; [clobbered] temporary ZMM
733%define %%ZT25       %32  ;; [clobbered] temporary ZMM
734%define %%ZT26       %33  ;; [clobbered] temporary ZMM
735%define %%ZT27       %34  ;; [clobbered] temporary ZMM
736%define %%ZT28       %35  ;; [clobbered] temporary ZMM
737%define %%ZT29       %36  ;; [clobbered] temporary ZMM
738%define %%ZT30       %37  ;; [clobbered] temporary ZMM
739%define %%ZT31       %38  ;; [clobbered] temporary ZMM
740%define %%ZT32       %39  ;; [clobbered] temporary ZMM
741%define %%NROUNDS    %40  ;; [in] Number of rounds (9 or 13)
742
743%define %%NUM_BLOCKS %%GT1
744%define %%OFFSET     %%GT2
745
746%xdefine %%ZIV ZWORD(%%XIV)
747
748%xdefine %%XTMP0  XWORD(%%ZT1)
749%xdefine %%XTMP1  XWORD(%%ZT2)
750
751%xdefine %%XCRC_TMP    XWORD(%%ZT3)
752%xdefine %%XCRC_MUL    XWORD(%%ZT4)
753%xdefine %%XCRC_IN_OUT %%XCRC_INIT
754
755%xdefine %%ZCRC0 %%ZT5
756%xdefine %%ZCRC1 %%ZT6
757%xdefine %%ZCRC2 %%ZT7
758%xdefine %%ZCRC3 %%ZT8
759%xdefine %%XCRC0 XWORD(%%ZCRC0)
760
761%xdefine %%ZCIPH0 %%ZT9
762%xdefine %%ZCIPH1 %%ZT10
763%xdefine %%ZCIPH2 %%ZT11
764%xdefine %%ZCIPH3 %%ZT12
765
766%xdefine %%ZTMP0 %%ZT20
767%xdefine %%ZTMP1 %%ZT21
768%xdefine %%ZTMP2 %%ZT22
769%xdefine %%ZTMP3 %%ZT23
770%xdefine %%ZTMP4 %%ZT24
771%xdefine %%ZTMP5 %%ZT25
772%xdefine %%ZTMP6 %%ZT26
773%xdefine %%ZTMP7 %%ZT27
774%xdefine %%ZTMP8 %%ZT28
775%xdefine %%ZTMP9 %%ZT29
776
777%xdefine %%ZCRC_IN_OUT0   ZWORD(%%XCRC_IN_OUT)
778%xdefine %%ZCRC_IN_OUT1   %%ZT30
779%xdefine %%ZCRC_IN_OUT2   %%ZT31
780%xdefine %%ZCRC_IN_OUT3   %%ZT32
781
782        vmovdqa64       %%XCRC_MUL, [rel fold_by_1]
783        vmovdqa64       %%XCRC_INIT, %%XCRC_INIT
784
785        xor     %%OFFSET, %%OFFSET
786
787        cmp     %%NUM_BYTES, 16
788        jb      %%_check_partial_block
789
790        cmp     %%NUM_BYTES, (16 * 16) + 16
791        jb      %%_below_17_blocks
792
793        cmp     %%NUM_BYTES, (32 * 16) + 16
794        jb      %%_below_33_blocks
795
796        ;; =====================================================================
797        ;; =====================================================================
798        ;; Part handling messages bigger-equal 33 blocks
799        ;; - decrypt & crc performed per 16 block basis
800        ;; =====================================================================
801
802        ;; Decrypt 16 blocks first.
803        ;; Make sure IV is in the top 128 bits of ZMM.
804        vshufi64x2      %%ZIV, %%ZIV, %%ZIV, 0000_0000b
805
806        AES_CBC_DEC_1_TO_16     %%SRC, %%DST, 16, %%OFFSET, %%NUM_BYTES, \
807                                %%KEYS, %%ZIV, %%NROUNDS, \
808                                %%ZTMP0, %%ZCRC_IN_OUT1, \
809                                %%ZCRC_IN_OUT2, %%ZCRC_IN_OUT3, \
810                                %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5
811
812        ;; Start of CRC is just reading the data and adding initial value.
813        ;; In the next round multiply and add operations will apply.
814        vpxorq          %%ZCRC_IN_OUT0, %%ZCRC_IN_OUT0, %%ZTMP0
815
816        vextracti64x2   %%XCRC0, %%ZCRC_IN_OUT3, 3
817
818%%_main_loop:
819        cmp     %%NUM_BYTES, (16 * 16) + 16
820        jb      %%_main_loop_exit
821
822        ;; Stitched cipher and CRC on 16 blocks
823        AES_CBC_DEC_CRC32_16    %%SRC, %%DST, %%OFFSET, %%NUM_BYTES, \
824                                %%KEYS, %%ZIV, \
825                                %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, \
826                                %%ZTMP5, %%ZTMP6, %%ZTMP7, %%ZTMP8, %%ZTMP9, \
827                                %%NROUNDS, \
828                                %%ZCRC_IN_OUT0, %%ZCRC_IN_OUT1, \
829                                %%ZCRC_IN_OUT2, %%ZCRC_IN_OUT3, \
830                                %%XCRC0
831
832        jmp     %%_main_loop
833
834%%_main_loop_exit:
835        ;; Up to 16 (inclusive) blocks left to process
836        ;; - decrypt the blocks first
837        ;; - then crc decrypted blocks minus one block
838
839        ;; broadcast IV across ZMM (4th and 1st 128-bit positions are only important really)
840        vshufi64x2      %%ZIV, %%ZIV, %%ZIV, 1111_1111b
841
842        mov     %%NUM_BLOCKS, %%NUM_BYTES
843        shr     %%NUM_BLOCKS, 4
844        and     %%NUM_BLOCKS, 15
845        jz	%%_decrypt_eq0
846
847        cmp     %%NUM_BLOCKS, 8
848        jg      %%_decrypt_gt8
849        je      %%_decrypt_eq8
850
851        ;; 1 to 7 blocks
852	cmp	%%NUM_BLOCKS, 4
853	jg	%%_decrypt_gt4
854	je	%%_decrypt_eq4
855
856%%_decrypt_lt4:
857        ;; 1 to 3 blocks
858	cmp	%%NUM_BLOCKS, 2
859	jg	%%_decrypt_eq3
860	je	%%_decrypt_eq2
861        jmp     %%_decrypt_eq1
862
863%%_decrypt_gt4:
864        ;; 5 to 7
865	cmp	%%NUM_BLOCKS, 6
866	jg	%%_decrypt_eq7
867	je	%%_decrypt_eq6
868        jmp     %%_decrypt_eq5
869
870%%_decrypt_gt8:
871        ;; 9 to 15
872	cmp	%%NUM_BLOCKS, 12
873	jg	%%_decrypt_gt12
874	je	%%_decrypt_eq12
875
876        ;; 9 to 11
877	cmp	%%NUM_BLOCKS, 10
878	jg	%%_decrypt_eq11
879	je	%%_decrypt_eq10
880        jmp     %%_decrypt_eq9
881
882%%_decrypt_gt12:
883        ;; 13 to 15
884	cmp	%%NUM_BLOCKS, 14
885	jg	%%_decrypt_eq15
886	je	%%_decrypt_eq14
887        jmp     %%_decrypt_eq13
888
889%assign number_of_blocks 1
890%rep 15
891%%_decrypt_eq %+ number_of_blocks :
892        ;; decrypt selected number of blocks
893        AES_CBC_DEC_1_TO_16 %%SRC, %%DST, number_of_blocks, %%OFFSET, %%NUM_BYTES, \
894                        %%KEYS, %%ZIV, %%NROUNDS, \
895                        %%ZTMP6, %%ZTMP7, %%ZTMP8, %%ZTMP9, \
896                        %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5
897
898        ;; extract & save the last decrypted block as crc for it is done separately
899        ;; towards the end of this macro
900%if number_of_blocks < 5
901        vextracti64x2   %%XCRC0, %%ZTMP6, (number_of_blocks - 1)
902%elif  number_of_blocks < 9
903        vextracti64x2   %%XCRC0, %%ZTMP7, (number_of_blocks - 4 - 1)
904%elif  number_of_blocks < 13
905        vextracti64x2   %%XCRC0, %%ZTMP8, (number_of_blocks - 8 - 1)
906%else
907        vextracti64x2   %%XCRC0, %%ZTMP9, (number_of_blocks - 12 - 1)
908%endif
909
910        ;; set number of blocks for CRC
911        mov             %%NUM_BLOCKS, (number_of_blocks - 1)
912
913        ;; extract latest IV into XIV for partial block processing
914        vextracti32x4   %%XIV, %%ZIV, 3
915        jmp             %%_decrypt_done_fold_by8
916
917%assign number_of_blocks (number_of_blocks + 1)
918%endrep
919
920%%_decrypt_eq0:
921        ;; Special case. Check if there are full 16 blocks for decrypt
922        ;; - it can happen here because the main loop checks for 17 blocks
923        ;; If yes then decrypt them and fall through to folding/crc section
924        ;; identifying 15 blocks for CRC
925        cmp             %%NUM_BYTES, (16 * 16)
926        jb              %%_cbc_decrypt_done
927
928        AES_CBC_DEC_1_TO_16 %%SRC, %%DST, 16, %%OFFSET, %%NUM_BYTES, \
929                        %%KEYS, %%ZIV, %%NROUNDS, \
930                        %%ZTMP6, %%ZTMP7, %%ZTMP8, %%ZTMP9, \
931                        %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5
932
933        mov             %%NUM_BLOCKS, 15
934        vextracti32x4   %%XIV, %%ZIV, 3
935        vextracti64x2   %%XCRC0, %%ZTMP9, 3
936
937%%_decrypt_done_fold_by8:
938        ;; Register content at this point:
939        ;; ZTMP6 - ZTMP9 => decrypted blocks (16 to 31)
940        ;; ZCRC_IN_OUT0 - ZCRC_IN_OUT3 - fold by 16 CRC sums
941        ;; NUM_BLOCKS - number of blocks to CRC
942
943        ;; fold 16 x 128 bits -> 8 x 128 bits
944        vbroadcastf64x2 %%ZTMP2, [rel fold_by_8]
945        vpclmulqdq      %%ZTMP1, %%ZCRC_IN_OUT0, %%ZTMP2, 0x01
946        vpclmulqdq      %%ZCRC_IN_OUT0, %%ZCRC_IN_OUT0, %%ZTMP2, 0x10
947        vpternlogq      %%ZCRC_IN_OUT0, %%ZCRC_IN_OUT2, %%ZTMP1, 0x96
948
949        vpclmulqdq      %%ZTMP1, %%ZCRC_IN_OUT1, %%ZTMP2, 0x01
950        vpclmulqdq      %%ZCRC_IN_OUT1, %%ZCRC_IN_OUT1, %%ZTMP2, 0x10
951        vpternlogq      %%ZCRC_IN_OUT1, %%ZCRC_IN_OUT3, %%ZTMP1, 0x96
952
953%%_decrypt_done_no_fold_16_to_8:
954        ;; CRC 8 blocks of already decrypted text
955        test            %%NUM_BLOCKS, 8
956        jz              %%_skip_crc_by8
957
958        vbroadcastf64x2 %%ZTMP2, [rel fold_by_8]
959        vpclmulqdq      %%ZTMP1, %%ZCRC_IN_OUT0, %%ZTMP2, 0x01
960        vpclmulqdq      %%ZCRC_IN_OUT0, %%ZCRC_IN_OUT0, %%ZTMP2, 0x10
961        vpternlogq      %%ZCRC_IN_OUT0, %%ZTMP6, %%ZTMP1, 0x96
962
963        vpclmulqdq      %%ZTMP1, %%ZCRC_IN_OUT1, %%ZTMP2, 0x01
964        vpclmulqdq      %%ZCRC_IN_OUT1, %%ZCRC_IN_OUT1, %%ZTMP2, 0x10
965        vpternlogq      %%ZCRC_IN_OUT1, %%ZTMP7, %%ZTMP1, 0x96
966
967        vmovdqa64       %%ZTMP6, %%ZTMP8
968        vmovdqa64       %%ZTMP7, %%ZTMP9
969
970%%_skip_crc_by8:
971        ;; fold 8 x 128 bits -> 4 x 128 bits
972        vbroadcastf64x2 %%ZTMP2, [rel fold_by_4]
973        vpclmulqdq      %%ZTMP1, %%ZCRC_IN_OUT0, %%ZTMP2, 0x01
974        vpclmulqdq      %%ZCRC_IN_OUT0, %%ZCRC_IN_OUT0, %%ZTMP2, 0x10
975        vpternlogq      %%ZCRC_IN_OUT0, %%ZCRC_IN_OUT1, %%ZTMP1, 0x96
976
977        ;; CRC 4 blocks of already decrypted text
978        test            %%NUM_BLOCKS, 4
979        jz              %%_skip_crc_by4
980
981        vpclmulqdq      %%ZTMP1, %%ZCRC_IN_OUT0, %%ZTMP2, 0x01
982        vpclmulqdq      %%ZCRC_IN_OUT0, %%ZCRC_IN_OUT0, %%ZTMP2, 0x10
983        vpternlogq      %%ZCRC_IN_OUT0, %%ZTMP6, %%ZTMP1, 0x96
984
985        vmovdqa64       %%ZTMP6, %%ZTMP7
986
987%%_skip_crc_by4:
988        ;; fold 4 x 128 bits -> 2 x 128 bits
989        vbroadcastf64x2 YWORD(%%ZTMP2), [rel fold_by_2]
990        vextracti64x4   YWORD(%%ZCRC_IN_OUT1), %%ZCRC_IN_OUT0, 1
991        vpclmulqdq      YWORD(%%ZTMP1), YWORD(%%ZCRC_IN_OUT0), YWORD(%%ZTMP2), 0x01
992        vpclmulqdq      YWORD(%%ZCRC_IN_OUT0), YWORD(%%ZCRC_IN_OUT0), YWORD(%%ZTMP2), 0x10
993        vpternlogq      YWORD(%%ZCRC_IN_OUT0), YWORD(%%ZCRC_IN_OUT1), YWORD(%%ZTMP1), 0x96
994
995        ;; CRC 2 blocks of already decrypted text
996        test            %%NUM_BLOCKS, 2
997        jz              %%_skip_crc_by2
998
999        vpclmulqdq      YWORD(%%ZTMP1), YWORD(%%ZCRC_IN_OUT0), YWORD(%%ZTMP2), 0x01
1000        vpclmulqdq      YWORD(%%ZCRC_IN_OUT0), YWORD(%%ZCRC_IN_OUT0), YWORD(%%ZTMP2), 0x10
1001        vpternlogq      YWORD(%%ZCRC_IN_OUT0), YWORD(%%ZTMP6), YWORD(%%ZTMP1), 0x96
1002
1003        vshufi64x2      %%ZTMP6, %%ZTMP6, %%ZTMP6, 1110_1110b
1004
1005%%_skip_crc_by2:
1006        ;; fold 2 x 128 bits -> 1 x 128 bits
1007        vmovdqa64       XWORD(%%ZTMP2), [rel fold_by_1]
1008        vextracti64x2   XWORD(%%ZCRC_IN_OUT1), YWORD(%%ZCRC_IN_OUT0), 1
1009        vpclmulqdq      XWORD(%%ZTMP1), XWORD(%%ZCRC_IN_OUT0), XWORD(%%ZTMP2), 0x01
1010        vpclmulqdq      XWORD(%%ZCRC_IN_OUT0), XWORD(%%ZCRC_IN_OUT0), XWORD(%%ZTMP2), 0x10
1011        vpternlogq      XWORD(%%ZCRC_IN_OUT0), XWORD(%%ZCRC_IN_OUT1), XWORD(%%ZTMP1), 0x96
1012
1013        ;; CRC 1 block of already decrypted text
1014        test            %%NUM_BLOCKS, 1
1015        jz              %%_skip_crc_by1
1016
1017        vpclmulqdq      XWORD(%%ZTMP1), XWORD(%%ZCRC_IN_OUT0), XWORD(%%ZTMP2), 0x01
1018        vpclmulqdq      XWORD(%%ZCRC_IN_OUT0), XWORD(%%ZCRC_IN_OUT0), XWORD(%%ZTMP2), 0x10
1019        vpternlogq      XWORD(%%ZCRC_IN_OUT0), XWORD(%%ZTMP6), XWORD(%%ZTMP1), 0x96
1020
1021%%_skip_crc_by1:
1022        jmp             %%_check_partial_block
1023
1024%%_cbc_decrypt_done:
1025        ;; No blocks left to compute CRC for. Just fold the sums from 16x128-bits into 1x128-bits.
1026        ;; Register content at this point:
1027        ;; ZCRC_IN_OUT0 - ZCRC_IN_OUT3 - fold by 16 CRC sums
1028        ;; XCRC0 - includes the last decrypted block to be passed to partial check case
1029
1030        ;; fold 16 x 128 bits -> 8 x 128 bits
1031        vbroadcastf64x2 %%ZTMP2, [rel fold_by_8]
1032        vpclmulqdq      %%ZTMP1, %%ZCRC_IN_OUT0, %%ZTMP2, 0x01
1033        vpclmulqdq      %%ZCRC_IN_OUT0, %%ZCRC_IN_OUT0, %%ZTMP2, 0x10
1034        vpternlogq      %%ZCRC_IN_OUT0, %%ZCRC_IN_OUT2, %%ZTMP1, 0x96
1035
1036        vpclmulqdq      %%ZTMP1, %%ZCRC_IN_OUT1, %%ZTMP2, 0x01
1037        vpclmulqdq      %%ZCRC_IN_OUT1, %%ZCRC_IN_OUT1, %%ZTMP2, 0x10
1038        vpternlogq      %%ZCRC_IN_OUT1, %%ZCRC_IN_OUT3, %%ZTMP1, 0x96
1039
1040%%_cbc_decrypt_done_fold_8_to_4:
1041        ;; fold 8 x 128 bits -> 4 x 128 bits
1042        vbroadcastf64x2 %%ZTMP2, [rel fold_by_4]
1043        vpclmulqdq      %%ZTMP1, %%ZCRC_IN_OUT0, %%ZTMP2, 0x01
1044        vpclmulqdq      %%ZCRC_IN_OUT0, %%ZCRC_IN_OUT0, %%ZTMP2, 0x10
1045        vpternlogq      %%ZCRC_IN_OUT0, %%ZCRC_IN_OUT1, %%ZTMP1, 0x96
1046
1047        ;; fold 4 x 128 bits -> 2 x 128 bits
1048        vbroadcastf64x2 YWORD(%%ZTMP2), [rel fold_by_2]
1049        vextracti64x4   YWORD(%%ZCRC_IN_OUT1), %%ZCRC_IN_OUT0, 1
1050        vpclmulqdq      YWORD(%%ZTMP1), YWORD(%%ZCRC_IN_OUT0), YWORD(%%ZTMP2), 0x01
1051        vpclmulqdq      YWORD(%%ZCRC_IN_OUT0), YWORD(%%ZCRC_IN_OUT0), YWORD(%%ZTMP2), 0x10
1052        vpternlogq      YWORD(%%ZCRC_IN_OUT0), YWORD(%%ZCRC_IN_OUT1), YWORD(%%ZTMP1), 0x96
1053
1054        ;; fold 2 x 128 bits -> 1 x 128 bits
1055        vmovdqa64       XWORD(%%ZTMP2), [rel fold_by_1]
1056        vextracti64x2   XWORD(%%ZCRC_IN_OUT1), YWORD(%%ZCRC_IN_OUT0), 1
1057        vpclmulqdq      XWORD(%%ZTMP1), XWORD(%%ZCRC_IN_OUT0), XWORD(%%ZTMP2), 0x01
1058        vpclmulqdq      XWORD(%%ZCRC_IN_OUT0), XWORD(%%ZCRC_IN_OUT0), XWORD(%%ZTMP2), 0x10
1059        vpternlogq      XWORD(%%ZCRC_IN_OUT0), XWORD(%%ZCRC_IN_OUT1), XWORD(%%ZTMP1), 0x96
1060
1061        ;; - keep the last block out from the calculation
1062        ;;   (this may be a partial block - additional checks follow)
1063        jmp             %%_check_partial_block
1064
1065
1066        ;; =====================================================================
1067        ;; =====================================================================
1068        ;; Part handling messages from 16 - 32 blocks
1069        ;; =====================================================================
1070%%_below_33_blocks:
1071        ;; Decrypt 16 blocks first
1072        ;; Make sure IV is in the top 128 bits of ZMM.
1073        vshufi64x2      %%ZIV, %%ZIV, %%ZIV, 0000_0000b
1074
1075        AES_CBC_DEC_1_TO_16     %%SRC, %%DST, 16, %%OFFSET, %%NUM_BYTES, \
1076                                %%KEYS, %%ZIV, %%NROUNDS, \
1077                                %%ZTMP0, %%ZCRC_IN_OUT1, \
1078                                %%ZCRC_IN_OUT2, %%ZCRC_IN_OUT3, \
1079                                %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5
1080
1081        ;; Start of CRC is just reading the data and adding initial value.
1082        vpxorq          %%ZCRC_IN_OUT0, %%ZCRC_IN_OUT0, %%ZTMP0
1083
1084        ;; Use fold by 8 approach to start the CRC.
1085        ;; ZCRC_IN_OUT0 and ZCRC_IN_OUT1 include CRC sums.
1086        vbroadcastf64x2 %%ZTMP2, [rel fold_by_8]
1087        vpclmulqdq      %%ZTMP1, %%ZCRC_IN_OUT0, %%ZTMP2, 0x01
1088        vpclmulqdq      %%ZCRC_IN_OUT0, %%ZCRC_IN_OUT0, %%ZTMP2, 0x10
1089        vpternlogq      %%ZCRC_IN_OUT0, %%ZCRC_IN_OUT2, %%ZTMP1, 0x96
1090
1091        vpclmulqdq      %%ZTMP1, %%ZCRC_IN_OUT1, %%ZTMP2, 0x01
1092        vpclmulqdq      %%ZCRC_IN_OUT1, %%ZCRC_IN_OUT1, %%ZTMP2, 0x10
1093        vpternlogq      %%ZCRC_IN_OUT1, %%ZCRC_IN_OUT3, %%ZTMP1, 0x96
1094
1095        ;; Decrypt rest of the message.
1096        mov     %%NUM_BLOCKS, %%NUM_BYTES
1097        shr     %%NUM_BLOCKS, 4
1098        and     %%NUM_BLOCKS, 15
1099        jz	%%_decrypt2_eq0
1100
1101        cmp     %%NUM_BLOCKS, 8
1102        jg      %%_decrypt2_gt8
1103        je      %%_decrypt2_eq8
1104
1105        ;; 1 to 7 blocks
1106	cmp	%%NUM_BLOCKS, 4
1107	jg	%%_decrypt2_gt4
1108	je	%%_decrypt2_eq4
1109
1110%%_decrypt2_lt4:
1111        ;; 1 to 3 blocks
1112	cmp	%%NUM_BLOCKS, 2
1113	jg	%%_decrypt2_eq3
1114	je	%%_decrypt2_eq2
1115        jmp     %%_decrypt2_eq1
1116
1117%%_decrypt2_gt4:
1118        ;; 5 to 7
1119	cmp	%%NUM_BLOCKS, 6
1120	jg	%%_decrypt2_eq7
1121	je	%%_decrypt2_eq6
1122        jmp     %%_decrypt2_eq5
1123
1124%%_decrypt2_gt8:
1125        ;; 9 to 15
1126	cmp	%%NUM_BLOCKS, 12
1127	jg	%%_decrypt2_gt12
1128	je	%%_decrypt2_eq12
1129
1130        ;; 9 to 11
1131	cmp	%%NUM_BLOCKS, 10
1132	jg	%%_decrypt2_eq11
1133	je	%%_decrypt2_eq10
1134        jmp     %%_decrypt2_eq9
1135
1136%%_decrypt2_gt12:
1137        ;; 13 to 15
1138	cmp	%%NUM_BLOCKS, 14
1139	jg	%%_decrypt2_eq15
1140	je	%%_decrypt2_eq14
1141        jmp     %%_decrypt2_eq13
1142
1143%assign number_of_blocks 1
1144%rep 15
1145%%_decrypt2_eq %+ number_of_blocks :
1146        AES_CBC_DEC_1_TO_16 %%SRC, %%DST, number_of_blocks, %%OFFSET, %%NUM_BYTES, \
1147                        %%KEYS, %%ZIV, %%NROUNDS, \
1148                        %%ZTMP6, %%ZTMP7, %%ZTMP8, %%ZTMP9, \
1149                        %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5
1150
1151%if number_of_blocks < 5
1152        vextracti64x2   %%XCRC0, %%ZTMP6, (number_of_blocks - 1)
1153%elif  number_of_blocks < 9
1154        vextracti64x2   %%XCRC0, %%ZTMP7, (number_of_blocks - 4 - 1)
1155%elif  number_of_blocks < 13
1156        vextracti64x2   %%XCRC0, %%ZTMP8, (number_of_blocks - 8 - 1)
1157%else
1158        vextracti64x2   %%XCRC0, %%ZTMP9, (number_of_blocks - 12 - 1)
1159%endif
1160
1161        ;; Update XIV
1162        mov             %%NUM_BLOCKS, (number_of_blocks - 1)
1163
1164        ;; Extract latest IV
1165        vextracti32x4   %%XIV, %%ZIV, 3
1166        jmp             %%_decrypt_done_no_fold_16_to_8
1167
1168%assign number_of_blocks (number_of_blocks + 1)
1169%endrep
1170
1171%%_decrypt2_eq0:
1172        ;; Special case. Check if there are full 16 blocks for decrypt.
1173        ;; If yes then decrypt them and fall through to folding/crc section
1174        ;; identifying 15 blocks for CRC
1175        cmp             %%NUM_BYTES, (16 * 16)
1176        jb              %%_cbc_decrypt_done_fold_8_to_4
1177
1178        AES_CBC_DEC_1_TO_16 %%SRC, %%DST, 16, %%OFFSET, %%NUM_BYTES, \
1179                        %%KEYS, %%ZIV, %%NROUNDS, \
1180                        %%ZTMP6, %%ZTMP7, %%ZTMP8, %%ZTMP9, \
1181                        %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5
1182
1183        mov             %%NUM_BLOCKS, 15
1184        vextracti32x4   %%XIV, %%ZIV, 3
1185        vextracti64x2   %%XCRC0, %%ZTMP9, 3
1186        jmp             %%_decrypt_done_no_fold_16_to_8
1187
1188        ;; =====================================================================
1189        ;; =====================================================================
1190        ;; Part handling messages up to from 1 to 16 blocks
1191        ;; =====================================================================
1192%%_below_17_blocks:
1193        ;; Make sure IV is in the top 128 bits of ZMM.
1194        vshufi64x2      %%ZIV, %%ZIV, %%ZIV, 0000_0000b
1195
1196        mov     %%NUM_BLOCKS, %%NUM_BYTES
1197        shr     %%NUM_BLOCKS, 4
1198        and     %%NUM_BLOCKS, 15
1199        jz	%%_eq16
1200
1201        cmp     %%NUM_BLOCKS, 8
1202        jg      %%_gt8
1203        je      %%_eq8
1204
1205        ;; 1 to 7 blocks
1206	cmp	%%NUM_BLOCKS, 4
1207	jg	%%_gt4
1208	je	%%_eq4
1209
1210%%_lt4:
1211        ;; 1 to 3 blocks
1212	cmp	%%NUM_BLOCKS, 2
1213	jg	%%_eq3
1214	je	%%_eq2
1215        jmp     %%_eq1
1216
1217%%_gt4:
1218        ;; 5 to 7
1219	cmp	%%NUM_BLOCKS, 6
1220	jg	%%_eq7
1221	je	%%_eq6
1222        jmp     %%_eq5
1223
1224%%_gt8:
1225        ;; 9 to 15
1226	cmp	%%NUM_BLOCKS, 12
1227	jg	%%_gt12
1228	je	%%_eq12
1229
1230        ;; 9 to 11
1231	cmp	%%NUM_BLOCKS, 10
1232	jg	%%_eq11
1233	je	%%_eq10
1234        jmp     %%_eq9
1235
1236%%_gt12:
1237        ;; 13 to 15
1238	cmp	%%NUM_BLOCKS, 14
1239	jg	%%_eq15
1240	je	%%_eq14
1241        jmp     %%_eq13
1242
1243%assign number_of_blocks 1
1244%rep 16
1245%%_eq %+ number_of_blocks :
1246        ;; Start building the pipeline by decrypting number of blocks
1247        ;; - later cipher & CRC operations get stitched
1248        AES_CBC_DEC_1_TO_16 %%SRC, %%DST, number_of_blocks, %%OFFSET, %%NUM_BYTES, \
1249                           %%KEYS, %%ZIV, %%NROUNDS, \
1250                           %%ZCRC0, %%ZCRC1, %%ZCRC2, %%ZCRC3, \
1251                           %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5
1252
1253        vextracti32x4       %%XIV, %%ZIV, 3
1254
1255        ;; Less than 16 blocks remaining in the message:
1256        ;; - compute CRC on decrypted blocks (minus one, in case it is the last one)
1257        ;; - then check for any partial block left
1258%assign number_of_blocks_to_crc (number_of_blocks - 1)
1259        CRC32_FIRST_1_TO_16     %%XCRC_MUL, %%XCRC_IN_OUT, %%XTMP0, %%XTMP1, \
1260                                number_of_blocks_to_crc, \
1261                                %%ZCRC0, %%ZCRC1, %%ZCRC2, %%ZCRC3, \
1262                                %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
1263
1264%if number_of_blocks_to_crc == 0
1265%elif number_of_blocks_to_crc < 4
1266        vextracti32x4   %%XCRC0, %%ZCRC0, (number_of_blocks_to_crc % 4)
1267%elif number_of_blocks_to_crc < 8
1268        vextracti32x4   %%XCRC0, %%ZCRC1, (number_of_blocks_to_crc % 4)
1269%elif number_of_blocks_to_crc < 12
1270        vextracti32x4   %%XCRC0, %%ZCRC2, (number_of_blocks_to_crc % 4)
1271%else ;; number_of_blocks_to_crc < 16
1272        vextracti32x4   %%XCRC0, %%ZCRC3, (number_of_blocks_to_crc % 4)
1273%endif
1274        jmp     %%_check_partial_block
1275
1276%assign number_of_blocks (number_of_blocks + 1)
1277%endrep
1278
1279        ;; =====================================================================
1280        ;; =====================================================================
1281        ;; Part handling decrypt & CRC of partial block and
1282        ;; CRC of the second last block.
1283        ;; Register content at entry to this section:
1284        ;;     XCRC0 - last 16 bytes of clear text to compute crc on (optional)
1285        ;;     XCRC_IN_OUT - 128-bit crc fold product
1286        ;;     OFFSET - current offset
1287        ;;     NUM_BYTES - number of bytes left to decrypt
1288        ;;     XIV - IV for decrypt operation
1289        ;; =====================================================================
1290%%_check_partial_block:
1291        or              %%NUM_BYTES, %%NUM_BYTES
1292        jz              %%_no_partial_bytes
1293
1294        ;; AES128/256-CFB on the partial block
1295        lea             %%GT1, [rel byte_len_to_mask_table]
1296        kmovw           k1, [%%GT1 + %%NUM_BYTES*2]
1297        vmovdqu8        %%XTMP1{k1}{z}, [%%SRC + %%OFFSET + 0]
1298        vpxorq          %%XTMP0, %%XIV, [%%KEYS_ENC + 0*16]
1299%assign i 1
1300%rep %%NROUNDS
1301        vaesenc         %%XTMP0, [%%KEYS_ENC + i*16]
1302%assign i (i + 1)
1303%endrep
1304        vaesenclast     %%XTMP0, [%%KEYS_ENC + i*16]
1305        vpxorq          %%XTMP0, %%XTMP0, %%XTMP1
1306        vmovdqu8        [%%DST + %%OFFSET + 0]{k1}, %%XTMP0
1307
1308%%_no_partial_bytes:
1309        ;; At this stage:
1310        ;; - whole message is decrypted the focus moves to complete CRC
1311        ;;     - XCRC_IN_OUT includes folded data from all payload apart from
1312        ;;       the last full block and (potential) partial bytes
1313        ;;     - max 2 blocks (minus 1 byte) remain for CRC calculation
1314        ;; - %%OFFSET == 0 is used to check
1315        ;;   if message consists of partial block only
1316        or      %%OFFSET, %%OFFSET
1317        jz      %%_no_block_pending_crc
1318
1319        ;; Data block(s) was previously decrypted
1320        ;; - move to the last decrypted block
1321        ;; - calculate number of bytes to compute CRC for (less CRC field size)
1322        add     %%NUM_BYTES, (16 - 4)
1323        sub     %%OFFSET, 16
1324        jz      %%_no_partial_bytes__start_crc
1325
1326        cmp     %%NUM_BYTES, 16
1327        jb      %%_no_partial_bytes__lt16
1328
1329        ;; XCRC0 has copy of the last full decrypted block
1330        CRC_UPDATE16   no_load, %%XCRC_IN_OUT, %%XCRC_MUL, %%XCRC0, %%XTMP1, next_crc
1331
1332        sub     %%NUM_BYTES, 16
1333        add     %%OFFSET, 16    ; compensate for the subtract above
1334
1335%%_no_partial_bytes__lt16:
1336        or              %%NUM_BYTES, %%NUM_BYTES
1337        jz              %%_no_partial_bytes__128_done
1338
1339        ;; Partial bytes left - complete CRC calculation
1340        lea             %%GT1, [rel pshufb_shf_table]
1341        vmovdqu64       %%XTMP0, [%%GT1 + %%NUM_BYTES]
1342        lea             %%GT1, [%%DST + %%OFFSET]
1343        vmovdqu64       %%XTMP1, [%%GT1 - 16 + %%NUM_BYTES]  ; xtmp1 = data for CRC
1344        vmovdqa64       %%XCRC_TMP, %%XCRC_IN_OUT
1345        vpshufb         %%XCRC_IN_OUT, %%XTMP0  ; top num_bytes with LSB xcrc
1346        vpxorq          %%XTMP0, [rel mask3]
1347        vpshufb         %%XCRC_TMP, %%XTMP0 ; bottom (16 - num_bytes) with MSB xcrc
1348
1349        ;; data num_bytes (top) blended with MSB bytes of CRC (bottom)
1350        vpblendvb       %%XCRC_TMP, %%XTMP1, %%XTMP0
1351
1352        CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%XCRC_TMP, %%XTMP1
1353
1354%%_no_partial_bytes__128_done:
1355        CRC32_REDUCE_128_TO_32 rax, %%XCRC_IN_OUT, %%XTMP1, %%XTMP0, %%XCRC_TMP
1356        jmp     %%_do_return
1357
1358%%_no_partial_bytes__start_crc:
1359        ;; - CRC was not started yet
1360        ;; - CBC decryption could have taken place and/or CFB
1361        ;; - DST is never modified so it points to start of the buffer that
1362        ;;   is subject of CRC calculation
1363        ETHERNET_FCS_CRC %%DST, %%NUM_BYTES, rax, %%XCRC_IN_OUT, %%GT1, \
1364                         %%XCRC_MUL, %%XTMP0, %%XTMP1, %%XCRC_TMP
1365        jmp     %%_do_return
1366
1367%%_no_block_pending_crc:
1368        ;; Message consists of partial block only (first_crc not employed yet)
1369        ;; - XTMP0 includes clear text from CFB processing above
1370        ;; - k1 includes mask of bytes belonging to the message
1371        ;; - NUM_BYTES is length of cipher, CRC is 4 bytes shorter
1372        ;;     - ignoring hash lengths 1 to 4
1373        cmp             %%NUM_BYTES, 5
1374        jb              %%_do_return
1375
1376        ;; clear top 4 bytes of the data
1377        kshiftrw        k1, k1, 4
1378        vmovdqu8        %%XTMP0{k1}{z}, %%XTMP0
1379        vpxorq          %%XCRC_IN_OUT, %%XTMP0 ; xor the data in
1380
1381        sub             %%NUM_BYTES, 4
1382
1383        ;; CRC calculation for payload lengths below 4 is different
1384        cmp             %%NUM_BYTES, 4
1385        jb              %%_no_block_pending_crc__lt4
1386
1387        ;; 4 or more bytes left
1388        lea             %%GT1, [rel pshufb_shf_table]
1389        vmovdqu64       %%XTMP1, [%%GT1 + %%NUM_BYTES]
1390        vpshufb         %%XCRC_IN_OUT, %%XTMP1
1391
1392        CRC32_REDUCE_128_TO_32 rax, %%XCRC_IN_OUT, %%XTMP0, %%XTMP1, %%XCRC_TMP
1393        jmp             %%_do_return
1394
1395%%_no_block_pending_crc__lt4:
1396        ;; less than 4 bytes left for CRC
1397        cmp             %%NUM_BYTES, 3
1398        jne             %%_no_block_pending_crc__neq3
1399        vpslldq         %%XCRC_IN_OUT, 5
1400        jmp             %%_do_barret
1401
1402%%_no_block_pending_crc__neq3:
1403        cmp             %%NUM_BYTES, 2
1404        jne             %%_no_block_pending_crc__neq2
1405        vpslldq         %%XCRC_IN_OUT, 6
1406        jmp             %%_do_barret
1407
1408%%_no_block_pending_crc__neq2:
1409        vpslldq         %%XCRC_IN_OUT, 7
1410
1411%%_do_barret:
1412        CRC32_REDUCE_64_TO_32 rax, %%XCRC_IN_OUT, %%XTMP0, %%XTMP1, %%XCRC_TMP
1413
1414%%_do_return:
1415        ;; result in rax
1416
1417%endmacro       ;; DOCSIS_DEC_CRC32
1418
1419;; ===================================================================
1420;; ===================================================================
1421;; MACRO IMPLEMENTING API FOR STITCHED DOCSIS DECRYPT AND CRC32
1422;; ===================================================================
1423%macro AES_DOCSIS_DEC_CRC32 1
1424%define %%NROUNDS %1    ; [in] Number of rounds (9 or 13)
1425
1426        mov	        rax, rsp
1427	sub	        rsp, STACKFRAME_size
1428	and	        rsp, -64
1429	mov	        [rsp + _rsp_save], rax	; original SP
1430        mov             [rsp + _gpr_save + 0*8], r12
1431        mov             [rsp + _gpr_save + 1*8], r13
1432        mov             [rsp + _gpr_save + 2*8], rbx
1433        mov             [rsp + _gpr_save + 3*8], rbp
1434
1435        vmovdqa64       xmm15, [rel init_crc_value]
1436
1437        mov             tmp1, [job + _src]
1438        add             tmp1, [job + _hash_start_src_offset_in_bytes]   ; CRC only start
1439
1440        cmp             qword [job + _msg_len_to_cipher_in_bytes], 0
1441        jz              %%aes_docsis_dec_crc32_avx512__no_cipher
1442
1443        mov             tmp2, [job + _cipher_start_src_offset_in_bytes]
1444        cmp             tmp2, [job + _hash_start_src_offset_in_bytes]
1445        jbe             %%aes_docsis_dec_crc32_avx512__skip_aad       ; avoid zero lengths or negative cases
1446
1447        sub             tmp2, [job + _hash_start_src_offset_in_bytes]   ; CRC only size / AAD
1448
1449        ETHERNET_FCS_CRC tmp1, tmp2, rax, xmm15, tmp3, xmm0, xmm1, xmm2, xmm3
1450
1451        not             eax             ; carry CRC value into the combined part
1452        vmovd           xmm15, eax      ; initial CRC value
1453
1454%%aes_docsis_dec_crc32_avx512__skip_aad:
1455        mov             tmp1, [job + _iv]
1456	vmovdqu64       xmm14, [tmp1]   ; load IV
1457
1458        mov             tmp2, [job + _src]
1459        add             tmp2, [job + _cipher_start_src_offset_in_bytes] ; AES start
1460
1461        mov             tmp3, [job + _dst]                              ; AES destination
1462
1463        mov             tmp4, [job + _msg_len_to_cipher_in_bytes]       ; CRC + AES size
1464        mov             tmp5, [job + _dec_keys]
1465        mov             tmp6, [job + _enc_keys]
1466
1467        DOCSIS_DEC_CRC32 tmp5, tmp2, tmp3, tmp4, tmp6, \
1468                         tmp7, tmp8, \
1469                         xmm15, xmm14, \
1470                         zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, \
1471                         zmm8, zmm9, zmm10, zmm11, zmm12, zmm13, \
1472                         zmm16, zmm17, zmm18, zmm19, zmm20, zmm21, zmm22, zmm23, \
1473                         zmm24, zmm25, zmm26, zmm27, zmm28, zmm29, zmm30, zmm31, \
1474                         %%NROUNDS
1475
1476        jmp             %%aes_docsis_dec_crc32_avx512__exit
1477
1478%%aes_docsis_dec_crc32_avx512__no_cipher:
1479        ;; tmp1 - already points to hash start
1480        ;; job is arg1
1481        mov             [rsp + _job_save], job
1482        mov             arg2, [job + _msg_len_to_hash_in_bytes]
1483        xor             arg3, arg3
1484        mov             arg1, tmp1
1485        call            ethernet_fcs_avx512_local
1486        mov             job, [rsp + _job_save]
1487
1488%%aes_docsis_dec_crc32_avx512__exit:
1489        mov             tmp1, [job + _auth_tag_output]
1490	mov             [tmp1], eax        ; store CRC32 value
1491
1492        or              qword [job + _status], STS_COMPLETED_AES
1493
1494        ;; restore stack pointer and registers
1495        mov             r12, [rsp + _gpr_save + 0*8]
1496        mov             r13, [rsp + _gpr_save + 1*8]
1497        mov             rbx, [rsp + _gpr_save + 2*8]
1498        mov             rbp, [rsp + _gpr_save + 3*8]
1499	mov	        rsp, [rsp + _rsp_save]	; original SP
1500
1501%ifdef SAFE_DATA
1502	clear_all_zmms_asm
1503%endif ;; SAFE_DATA
1504%endmacro
1505
1506;; ===================================================================
1507;; ===================================================================
1508;; input: arg1 = job
1509;; ===================================================================
1510align 64
1511MKGLOBAL(aes_docsis128_dec_crc32_vaes_avx512,function,internal)
1512aes_docsis128_dec_crc32_vaes_avx512:
1513
1514        AES_DOCSIS_DEC_CRC32 9
1515
1516        ret
1517
1518align 64
1519MKGLOBAL(aes_docsis256_dec_crc32_vaes_avx512,function,internal)
1520aes_docsis256_dec_crc32_vaes_avx512:
1521
1522        AES_DOCSIS_DEC_CRC32 13
1523
1524        ret
1525
1526
1527%ifdef LINUX
1528section .note.GNU-stack noalloc noexec nowrite progbits
1529%endif
1530