1;; 2;; Copyright (c) 2020, Intel Corporation 3;; 4;; Redistribution and use in source and binary forms, with or without 5;; modification, are permitted provided that the following conditions are met: 6;; 7;; * Redistributions of source code must retain the above copyright notice, 8;; this list of conditions and the following disclaimer. 9;; * Redistributions in binary form must reproduce the above copyright 10;; notice, this list of conditions and the following disclaimer in the 11;; documentation and/or other materials provided with the distribution. 12;; * Neither the name of Intel Corporation nor the names of its contributors 13;; may be used to endorse or promote products derived from this software 14;; without specific prior written permission. 15;; 16;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE 20;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;;
%use smartalign

%include "imb_job.asm"
%include "include/reg_sizes.asm"
%include "include/os.asm"
%include "include/clear_regs.asm"
%include "include/aes_common.asm"
%include "mb_mgr_datastruct.asm"

default rel

extern ethernet_fcs_avx512_local

;; In System V AMD64 ABI
;; callee saves: RBX, RBP, R12-R15
;; Windows x64 ABI
;; callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15

;; Token-pasting helper (NASM %+ operator)
%define CONCAT(a,b) a %+ b

;; Stack frame layout used to save/restore state around the API call
struc STACKFRAME
_rsp_save:      resq    1       ; original RSP (before alignment)
_job_save:      resq    1       ; IMB_JOB pointer
_gpr_save:      resq    4       ; callee-saved GPRs
endstruc

;; First three integer argument registers for the active ABI
%ifdef LINUX
%define arg1    rdi
%define arg2    rsi
%define arg3    rdx
%else
%define arg1    rcx
%define arg2    rdx
%define arg3    r8
%endif

%define job     arg1

;; Scratch GPR aliases (tmp1/tmp2/tmp5/tmp6 are callee-saved: rbx/rbp/r12/r13)
%define tmp1    rbx
%define tmp2    rbp
%define tmp3    r10
%define tmp4    r11
%define tmp5    r12
%define tmp6    r13
%define tmp7    r8
%define tmp8    r9


section .data

;;; Precomputed constants for CRC32 (Ethernet FCS)
;;; Details of the CRC algorithm and 4 byte buffer of
;;; {0x01, 0x02, 0x03, 0x04}:
;;;   Result     Poly       Init        RefIn  RefOut  XorOut
;;;   0xB63CFBCD 0x04C11DB7 0xFFFFFFFF  true   true    0xFFFFFFFF

;; rk5/rk7: reduction constants for the final 128b -> 32b step
;; (rk5 pair folds 128b -> 64b -> 32b, rk7 pair holds the Barrett
;; reduction quotient/polynomial values for the FCS polynomial)
align 16
rk5:
        dq 0x00000000ccaa009e, 0x0000000163cd6124
rk7:
        dq 0x00000001f7011640, 0x00000001db710640

;; fold_by_N: CLMUL folding constant pairs; multiplying the running
;; CRC by fold_by_N is equivalent to shifting it N x 128 bits ahead
;; in the message, allowing N 128-bit lanes to be folded in parallel.
align 16

fold_by_16:     ;; fold by 16x128-bits
        dq 0x00000000e95c1271, 0x00000000ce3371cb
fold_by_8:      ;; fold by 8x128-bits
        dq 0x000000014a7fe880, 0x00000001e88ef372
fold_by_4:      ;; fold by 4x128-bits
        dq 0x00000001c6e41596, 0x0000000154442bd4
fold_by_2:      ;; fold by 2x128-bits
        dq 0x000000015a546366, 0x00000000f1da05aa
fold_by_1:      ;; fold by 1x128-bits
        dq 0x00000000ccaa009e, 0x00000001751997d0

align 16
pshufb_shf_table:
        ;; use these values for shift registers with the pshufb instruction
        ;; (indexing at [table + num_bytes] yields a shuffle mask that
        ;; shifts a register left/right by num_bytes; 0x80 bytes zero lanes)
        dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
        dq 0x0706050403020100, 0x000e0d0c0b0a0908

;; Initial CRC value 0xFFFFFFFF in the low dword (Init per table above)
align 16
init_crc_value:
        dq 0x00000000FFFFFFFF, 0x0000000000000000

;; mask: keep low 64 bits only
align 16
mask:
        dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000

;; mask2: clear the low 32 bits only
align 16
mask2:
        dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
;; mask3: 0x80 in every byte - used to invert a pshufb shuffle mask
;; (flips the "zero this lane" bit in each byte)
align 16
mask3:
        dq 0x8080808080808080, 0x8080808080808080

align 16
mask_out_top_bytes:
        dq 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
        dq 0x0000000000000000, 0x0000000000000000

;;; partial block read/write table
;;; entry [n] is a 16-bit k-mask selecting the low n bytes (0 <= n <= 16)
align 64
byte_len_to_mask_table:
        dw 0x0000, 0x0001, 0x0003, 0x0007,
        dw 0x000f, 0x001f, 0x003f, 0x007f,
        dw 0x00ff, 0x01ff, 0x03ff, 0x07ff,
        dw 0x0fff, 0x1fff, 0x3fff, 0x7fff,
        dw 0xffff

section .text

;; ===================================================================
;; ===================================================================
;; CRC multiply before XOR against data block
;; Folds the running 128-bit CRC forward by the distance encoded in
;; %%XCRC_MUL and accumulates the next data block:
;;   XCRC = (XCRC.lo64 x MUL.hi64) ^ (XCRC.hi64 x MUL.lo64) ^ DATA
;; ===================================================================
%macro CRC_CLMUL 4
%define %%XCRC_IN_OUT   %1 ; [in/out] XMM with CRC (can be anything if "no_crc" below)
%define %%XCRC_MUL      %2 ; [in] XMM with CRC constant (can be anything if "no_crc" below)
%define %%XCRC_DATA     %3 ; [in] XMM with data block
%define %%XTMP          %4 ; [clobbered] temporary XMM

        ;; imm 0x01: src1 low qword x src2 high qword
        vpclmulqdq      %%XTMP, %%XCRC_IN_OUT, %%XCRC_MUL, 0x01
        ;; imm 0x10: src1 high qword x src2 low qword
        vpclmulqdq      %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%XCRC_MUL, 0x10
        ;; imm 0x96 selects a 3-way XOR of the ternary-logic inputs
        vpternlogq      %%XCRC_IN_OUT, %%XTMP, %%XCRC_DATA, 0x96 ; XCRC = XCRC ^ XTMP ^ DATA
%endmacro

;; ===================================================================
;; ===================================================================
;; CRC32 calculation on 16 byte data
;; Optionally loads the next 16 bytes (advancing the pointer), then
;; either folds them into the CRC (next_crc), XORs them with the
;; initial CRC (first_crc) or skips CRC entirely (no_crc).
;; ===================================================================
%macro CRC_UPDATE16 6
%define %%INP           %1 ; [in/out] GP with input text pointer or "no_load"
%define %%XCRC_IN_OUT   %2 ; [in/out] XMM with CRC (can be anything if "no_crc" below)
%define %%XCRC_MUL      %3 ; [in] XMM with CRC multiplier constant
%define %%TXMM1         %4 ; [clobbered|in] XMM temporary or data in (no_load)
%define %%TXMM2         %5 ; [clobbered] XMM temporary
%define %%CRC_TYPE      %6 ; [in] "first_crc" or "next_crc" or "no_crc"

        ;; load data and increment in pointer
%ifnidn %%INP, no_load
        vmovdqu64       %%TXMM1, [%%INP]
        add             %%INP, 16
%endif

        ;; CRC calculation
%ifidn %%CRC_TYPE, next_crc
        ;; fold current CRC by one block and XOR the new data in
        CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%TXMM1, %%TXMM2
%endif
%ifidn %%CRC_TYPE, first_crc
        ;; in the first run just XOR initial CRC with the first block
        vpxorq          %%XCRC_IN_OUT, %%TXMM1
%endif

%endmacro

;; ===================================================================
;; ===================================================================
;; Barrett reduction from 128-bits to 32-bits modulo Ethernet FCS polynomial
;; First folds 128b -> 64b -> 32b+remainder using rk5 constants, then
;; performs the Barrett reduction with rk7 and applies the final
;; bit-NOT (XorOut = 0xFFFFFFFF per the table above).
;; ===================================================================
%macro CRC32_REDUCE_128_TO_32 5
%define %%CRC   %1 ; [out] GP to store 32-bit Ethernet FCS value
%define %%XCRC  %2 ; [in/clobbered] XMM with CRC
%define %%XT1   %3 ; [clobbered] temporary xmm register
%define %%XT2   %4 ; [clobbered] temporary xmm register
%define %%XT3   %5 ; [clobbered] temporary xmm register

%define %%XCRCKEY %%XT3

        ;; compute crc of a 128-bit value
        vmovdqa64       %%XCRCKEY, [rel rk5]

        ;; 64b fold
        vpclmulqdq      %%XT1, %%XCRC, %%XCRCKEY, 0x00
        vpsrldq         %%XCRC, %%XCRC, 8
        vpxorq          %%XCRC, %%XCRC, %%XT1

        ;; 32b fold
        vpslldq         %%XT1, %%XCRC, 4
        vpclmulqdq      %%XT1, %%XT1, %%XCRCKEY, 0x10
        vpxorq          %%XCRC, %%XCRC, %%XT1

%%_crc_barrett:
        ;; Barrett reduction
        vpandq          %%XCRC, [rel mask2]     ; drop low 32 bits
        vmovdqa64       %%XT1, %%XCRC
        vmovdqa64       %%XT2, %%XCRC
        vmovdqa64       %%XCRCKEY, [rel rk7]

        vpclmulqdq      %%XCRC, %%XCRCKEY, 0x00 ; T1 = floor(R / P) approx
        vpxorq          %%XCRC, %%XT2
        vpandq          %%XCRC, [rel mask]      ; keep low 64 bits
        vmovdqa64       %%XT2, %%XCRC
        vpclmulqdq      %%XCRC, %%XCRCKEY, 0x10 ; T2 = T1 * P
        vpternlogq      %%XCRC, %%XT2, %%XT1, 0x96 ; XCRC = XCRC ^ XT2 ^ XT1
        vpextrd         DWORD(%%CRC), %%XCRC, 2 ; 32-bit CRC value
        not             DWORD(%%CRC)            ; XorOut = 0xFFFFFFFF
%endmacro

;; ===================================================================
;; ===================================================================
;; Barrett reduction from 64-bits to 32-bits modulo Ethernet FCS polynomial
;; Same Barrett step as the tail of CRC32_REDUCE_128_TO_32, used when
;; the CRC value is already known to fit in 64 bits (short buffers).
;; ===================================================================
%macro CRC32_REDUCE_64_TO_32 5
%define %%CRC   %1 ; [out] GP to store 32-bit Ethernet FCS value
%define %%XCRC  %2 ; [in/clobbered] XMM with CRC
%define %%XT1   %3 ; [clobbered] temporary xmm register
%define %%XT2   %4 ; [clobbered] temporary xmm register
%define %%XT3   %5 ; [clobbered] temporary xmm register

%define %%XCRCKEY %%XT3

        ;; Barrett reduction
        vpandq          %%XCRC, [rel mask2]     ; drop low 32 bits
        vmovdqa64       %%XT1, %%XCRC
        vmovdqa64       %%XT2, %%XCRC
        vmovdqa64       %%XCRCKEY, [rel rk7]

        vpclmulqdq      %%XCRC, %%XCRCKEY, 0x00
        vpxorq          %%XCRC, %%XT2
        vpandq          %%XCRC, [rel mask]      ; keep low 64 bits
        vmovdqa64       %%XT2, %%XCRC
        vpclmulqdq      %%XCRC, %%XCRCKEY, 0x10
        vpternlogq      %%XCRC, %%XT2, %%XT1, 0x96 ; XCRC = XCRC ^ XT2 ^ XT1
        vpextrd         DWORD(%%CRC), %%XCRC, 2 ; 32-bit CRC value
        not             DWORD(%%CRC)            ; XorOut = 0xFFFFFFFF
%endmacro

;; ===================================================================
;; ===================================================================
;; ETHERNET FCS CRC
;; Computes CRC32 (Ethernet FCS) over %%bytes_to_crc bytes at %%p_in
;; starting from the CRC state in %%xcrc. Handles any length,
;; including sub-16-byte and non-multiple-of-16 buffers.
;; ===================================================================
%macro ETHERNET_FCS_CRC 9
%define %%p_in          %1 ; [in] pointer to the buffer (GPR)
%define %%bytes_to_crc  %2 ; [in] number of bytes in the buffer (GPR)
%define %%ethernet_fcs  %3 ; [out] GPR to put CRC value into (32 bits)
%define %%xcrc          %4 ; [in] initial CRC value (xmm)
%define %%tmp           %5 ; [clobbered] temporary GPR
%define %%xcrckey       %6 ; [clobbered] temporary XMM / CRC multiplier
%define %%xtmp1         %7 ; [clobbered] temporary XMM
%define %%xtmp2         %8 ; [clobbered] temporary XMM
%define %%xtmp3         %9 ; [clobbered] temporary XMM

        ;; load CRC constants
        vmovdqa64       %%xcrckey, [rel fold_by_1]

        cmp             %%bytes_to_crc, 32
        jae             %%_at_least_32_bytes

        ;; less than 32 bytes
        cmp             %%bytes_to_crc, 16
        je              %%_exact_16_left
        jl              %%_less_than_16_left

        ;; 17 to 31 bytes: consume the first 16 bytes here, the
        ;; remaining partial block is handled in %%_crc_two_xmms
        ;; load the plain-text
        vmovdqu64       %%xtmp1, [%%p_in]
        vpxorq          %%xcrc, %%xtmp1 ; xor the initial crc value
        add             %%p_in, 16
        sub             %%bytes_to_crc, 16
        jmp             %%_crc_two_xmms

%%_exact_16_left:
        vmovdqu64       %%xtmp1, [%%p_in]
        vpxorq          %%xcrc, %%xtmp1 ; xor the initial CRC value
        jmp             %%_128_done

%%_less_than_16_left:
        ;; masked load of 1 to 15 bytes (never reads past the buffer)
        lea             %%tmp, [rel byte_len_to_mask_table]
        kmovw           k1, [%%tmp + %%bytes_to_crc*2]
        vmovdqu8        %%xtmp1{k1}{z}, [%%p_in]

        vpxorq          %%xcrc, %%xtmp1 ; xor the initial CRC value

        cmp             %%bytes_to_crc, 4
        jb              %%_less_than_4_left

        ;; 4 to 15 bytes: shift CRC so the valid bytes occupy the top
        ;; of the register, then do the full 128 -> 32 bit reduction
        lea             %%tmp, [rel pshufb_shf_table]
        vmovdqu64       %%xtmp1, [%%tmp + %%bytes_to_crc]
        vpshufb         %%xcrc, %%xtmp1
        jmp             %%_128_done

%%_less_than_4_left:
        ;; less than 4 bytes left
        ;; position the 1-3 data bytes for a 64 -> 32 bit reduction
        cmp             %%bytes_to_crc, 3
        jne             %%_less_than_3_left
        vpslldq         %%xcrc, 5
        jmp             %%_do_barret

%%_less_than_3_left:
        cmp             %%bytes_to_crc, 2
        jne             %%_less_than_2_left
        vpslldq         %%xcrc, 6
        jmp             %%_do_barret

%%_less_than_2_left:
        vpslldq         %%xcrc, 7

%%_do_barret:
        CRC32_REDUCE_64_TO_32 %%ethernet_fcs, %%xcrc, %%xtmp1, %%xtmp2, %%xcrckey
        jmp             %%_64_done

%%_at_least_32_bytes:
        ;; first 16 bytes: just XOR with the initial CRC value
        CRC_UPDATE16 %%p_in, %%xcrc, %%xcrckey, %%xtmp1, %%xtmp2, first_crc
        sub             %%bytes_to_crc, 16

%%_main_loop:
        ;; fold one 16-byte block per iteration while >= 16 bytes remain
        cmp             %%bytes_to_crc, 16
        jb              %%_exit_loop
        CRC_UPDATE16 %%p_in, %%xcrc, %%xcrckey, %%xtmp1, %%xtmp2, next_crc
        sub             %%bytes_to_crc, 16
        jz              %%_128_done
        jmp             %%_main_loop

%%_exit_loop:

        ;; Partial bytes left - complete CRC calculation
%%_crc_two_xmms:
        ;; Handle the final 1-15 bytes by re-reading the last 16 bytes
        ;; of input (guaranteed available here) and blending them with
        ;; the appropriately shifted CRC register.
        lea             %%tmp, [rel pshufb_shf_table]
        vmovdqu64       %%xtmp2, [%%tmp + %%bytes_to_crc]
        vmovdqu64       %%xtmp1, [%%p_in - 16 + %%bytes_to_crc] ; xtmp1 = data for CRC
        vmovdqa64       %%xtmp3, %%xcrc
        vpshufb         %%xcrc, %%xtmp2 ; top num_bytes with LSB xcrc
        vpxorq          %%xtmp2, [rel mask3] ; invert the shuffle mask
        vpshufb         %%xtmp3, %%xtmp2 ; bottom (16 - num_bytes) with MSB xcrc

        ;; data num_bytes (top) blended with MSB bytes of CRC (bottom)
        vpblendvb       %%xtmp3, %%xtmp1, %%xtmp2

        ;; final CRC calculation
        CRC_CLMUL %%xcrc, %%xcrckey, %%xtmp3, %%xtmp1

%%_128_done:
        CRC32_REDUCE_128_TO_32 %%ethernet_fcs, %%xcrc, %%xtmp1, %%xtmp2, %%xcrckey
%%_64_done:
%endmacro

;; ===================================================================
;; ===================================================================
;; AES128/256 CBC decryption on 1 to 16 blocks
;; %%NUMBL is a compile-time constant, so the %if/%elif ladders below
;; select code at macro-expansion time (no runtime branches).
;; ===================================================================
%macro AES_CBC_DEC_1_TO_16 17
%define %%SRC           %1  ; [in] GP with pointer to source buffer
%define %%DST           %2  ; [in] GP with pointer to destination buffer
%define %%NUMBL         %3  ; [in] numerical constant with number of blocks to process
%define %%OFFS          %4  ; [in/out] GP with src/dst buffer offset
%define %%NBYTES        %5  ; [in/out] GP with number of bytes to decrypt
%define %%KEY_PTR       %6  ; [in] GP with pointer to expanded AES decrypt keys
%define %%ZIV           %7  ; [in/out] IV in / last cipher text block on out (xmm0 - xmm15)
%define %%NROUNDS       %8  ; [in] number of rounds; numerical value
%define %%CIPHER_00_03  %9  ; [out] ZMM next 0-3 cipher blocks
%define %%CIPHER_04_07  %10 ; [out] ZMM next 4-7 cipher blocks
%define %%CIPHER_08_11  %11 ; [out] ZMM next 8-11 cipher blocks
%define %%CIPHER_12_15  %12 ; [out] ZMM next 12-15 cipher blocks
%define %%ZT1           %13 ; [clobbered] ZMM temporary
%define %%ZT2           %14 ; [clobbered] ZMM temporary
%define %%ZT3           %15 ; [clobbered] ZMM temporary
%define %%ZT4           %16 ; [clobbered] ZMM temporary
%define %%ZT5           %17 ; [clobbered] ZMM temporary

        ;; /////////////////////////////////////////////////
        ;; load cipher text
        ZMM_LOAD_BLOCKS_0_16 %%NUMBL, %%SRC, %%OFFS, \
                %%CIPHER_00_03, %%CIPHER_04_07, \
                %%CIPHER_08_11, %%CIPHER_12_15

        ;; /////////////////////////////////////////////////
        ;; prepare cipher text blocks for an XOR after AES-DEC rounds
        ;; (CBC chaining: each plain text block = AESDEC(C[n]) ^ C[n-1];
        ;; valignq by 6 qwords shifts the previous cipher block - or the
        ;; IV for block 0 - into each 128-bit lane)
        valignq         %%ZT1, %%CIPHER_00_03, %%ZIV, 6
%if %%NUMBL > 4
        valignq         %%ZT2, %%CIPHER_04_07, %%CIPHER_00_03, 6
%endif
%if %%NUMBL > 8
        valignq         %%ZT3, %%CIPHER_08_11, %%CIPHER_04_07, 6
%endif
%if %%NUMBL > 12
        valignq         %%ZT4, %%CIPHER_12_15, %%CIPHER_08_11, 6
%endif

        ;; /////////////////////////////////////////////////
        ;; update IV with the last cipher block
        ;; (rotate the ZMM holding the last block so that block ends up
        ;; in the bottom lane of %%ZIV)
%if %%NUMBL < 4
        valignq         %%ZIV, %%CIPHER_00_03, %%CIPHER_00_03, ((%%NUMBL % 4) * 2)
%elif %%NUMBL == 4
        vmovdqa64       %%ZIV, %%CIPHER_00_03
%elif %%NUMBL < 8
        valignq         %%ZIV, %%CIPHER_04_07, %%CIPHER_04_07, ((%%NUMBL % 4) * 2)
%elif %%NUMBL == 8
        vmovdqa64       %%ZIV, %%CIPHER_04_07
%elif %%NUMBL < 12
        valignq         %%ZIV, %%CIPHER_08_11, %%CIPHER_08_11, ((%%NUMBL % 4) * 2)
%elif %%NUMBL == 12
        vmovdqa64       %%ZIV, %%CIPHER_08_11
%elif %%NUMBL < 16
        valignq         %%ZIV, %%CIPHER_12_15, %%CIPHER_12_15, ((%%NUMBL % 4) * 2)
%else ;; %%NUMBL == 16
        vmovdqa64       %%ZIV, %%CIPHER_12_15
%endif

        ;; /////////////////////////////////////////////////
        ;; AES rounds including XOR
        ;; (NROUNDS + 2 = initial ARK + NROUNDS rounds + final round)
%assign j 0
%rep (%%NROUNDS + 2)
        vbroadcastf64x2 %%ZT5, [%%KEY_PTR + (j * 16)]
        ZMM_AESDEC_ROUND_BLOCKS_0_16 %%CIPHER_00_03, %%CIPHER_04_07, \
                %%CIPHER_08_11, %%CIPHER_12_15, \
                %%ZT5, j, %%ZT1, %%ZT2, %%ZT3, %%ZT4, \
                %%NUMBL, %%NROUNDS
%assign j (j + 1)
%endrep

        ;; /////////////////////////////////////////////////
        ;; write plain text back to output
        ZMM_STORE_BLOCKS_0_16 %%NUMBL, %%DST, %%OFFS, \
                %%CIPHER_00_03, %%CIPHER_04_07, \
                %%CIPHER_08_11, %%CIPHER_12_15

        ;; /////////////////////////////////////////////////
        ;; update lengths and offset
        add             %%OFFS, (%%NUMBL * 16)
        sub             %%NBYTES, (%%NUMBL * 16)
%endmacro ;; AES_CBC_DEC_1_TO_16

;;
;; ===================================================================
;; ===================================================================
;; CRC32 on 1 to 16 blocks (first_crc case only)
;; Starts the CRC by XOR-ing the initial value into the first block,
;; then folds the %%NUMBL x 128-bit lanes down to a single 128-bit
;; CRC accumulator in %%CRC_IN_OUT. %%NUMBL is a compile-time constant.
;; ===================================================================
%macro CRC32_FIRST_1_TO_16 13
%define %%CRC_MUL    %1  ; [in] XMM with CRC multiplier
%define %%CRC_IN_OUT %2  ; [in/out] current CRC value
%define %%XTMP       %3  ; [clobbered] temporary XMM
%define %%XTMP2      %4  ; [clobbered] temporary XMM
%define %%NUMBL      %5  ; [in] number of blocks of clear text to compute CRC on
%define %%ZCRCIN0    %6  ; [in] clear text 4 blocks
%define %%ZCRCIN1    %7  ; [in] clear text 4 blocks
%define %%ZCRCIN2    %8  ; [in] clear text 4 blocks
%define %%ZCRCIN3    %9  ; [in] clear text 4 blocks
%define %%ZCRCSUM0   %10 ; [clobbered] temporary ZMM
%define %%ZCRCSUM1   %11 ; [clobbered] temporary ZMM
%define %%ZCRCSUM2   %12 ; [clobbered] temporary ZMM
%define %%ZCRCSUM3   %13 ; [clobbered] temporary ZMM

;; full-width (ZMM) views of the two XMM temporaries
%xdefine %%ZTMP0 ZWORD(%%XTMP)
%xdefine %%ZTMP1 ZWORD(%%XTMP2)

%if (%%NUMBL == 0)
        ;; do nothing
%elif (%%NUMBL == 1)
        ;; single block: just XOR the initial CRC value in
        vpxorq          %%CRC_IN_OUT, XWORD(%%ZCRCIN0)
%elif (%%NUMBL == 16)
        ;; full 16 blocks: fold 16 -> 8 -> 4 -> 2 -> 1 x 128 bits
        vmovdqa64       %%ZCRCSUM0, %%ZCRCIN0
        vmovdqa64       %%ZCRCSUM1, %%ZCRCIN1
        vmovdqa64       %%ZCRCSUM2, %%ZCRCIN2
        vmovdqa64       %%ZCRCSUM3, %%ZCRCIN3

        ;; Add current CRC sum into block 0
        ;; (xmm-to-xmm move zeroes ZMM bits [511:128] via EVEX write,
        ;; so ZWORD(%%CRC_IN_OUT) carries the CRC in lane 0 only)
        vmovdqa64       %%CRC_IN_OUT, %%CRC_IN_OUT
        vpxorq          %%ZCRCSUM0, %%ZCRCSUM0, ZWORD(%%CRC_IN_OUT)
        ;; fold 16 x 128 bits -> 8 x 128 bits
        vbroadcastf64x2 %%ZTMP0, [rel fold_by_8]
        vpclmulqdq      %%ZTMP1, %%ZCRCSUM0, %%ZTMP0, 0x01
        vpclmulqdq      %%ZCRCSUM0, %%ZCRCSUM0, %%ZTMP0, 0x10
        vpternlogq      %%ZCRCSUM0, %%ZCRCSUM2, %%ZTMP1, 0x96

        vpclmulqdq      %%ZTMP1, %%ZCRCSUM1, %%ZTMP0, 0x01
        vpclmulqdq      %%ZCRCSUM1, %%ZCRCSUM1, %%ZTMP0, 0x10
        vpternlogq      %%ZCRCSUM1, %%ZCRCSUM3, %%ZTMP1, 0x96

        ;; fold 8 x 128 bits -> 4 x 128 bits
        vbroadcastf64x2 %%ZTMP0, [rel fold_by_4]
        vpclmulqdq      %%ZTMP1, %%ZCRCSUM0, %%ZTMP0, 0x01
        vpclmulqdq      %%ZCRCSUM0, %%ZCRCSUM0, %%ZTMP0, 0x10
        vpternlogq      %%ZCRCSUM0, %%ZCRCSUM1, %%ZTMP1, 0x96

        ;; fold 4 x 128 bits -> 2 x 128 bits
        vbroadcastf64x2 YWORD(%%ZTMP0), [rel fold_by_2]
        vextracti64x4   YWORD(%%ZCRCSUM1), %%ZCRCSUM0, 1
        vpclmulqdq      YWORD(%%ZTMP1), YWORD(%%ZCRCSUM0), YWORD(%%ZTMP0), 0x01
        vpclmulqdq      YWORD(%%ZCRCSUM0), YWORD(%%ZCRCSUM0), YWORD(%%ZTMP0), 0x10
        vpternlogq      YWORD(%%ZCRCSUM0), YWORD(%%ZCRCSUM1), YWORD(%%ZTMP1), 0x96

        ;; fold 2 x 128 bits -> 1 x 128 bits
        vmovdqa64       XWORD(%%ZTMP0), [rel fold_by_1]
        vextracti64x2   XWORD(%%ZCRCSUM1), YWORD(%%ZCRCSUM0), 1
        vpclmulqdq      XWORD(%%ZTMP1), XWORD(%%ZCRCSUM0), XWORD(%%ZTMP0), 0x01
        vpclmulqdq      XWORD(%%ZCRCSUM0), XWORD(%%ZCRCSUM0), XWORD(%%ZTMP0), 0x10
        vpternlogq      XWORD(%%ZCRCSUM0), XWORD(%%ZCRCSUM1), XWORD(%%ZTMP1), 0x96
        vmovdqa64       %%CRC_IN_OUT, XWORD(%%ZCRCSUM0)

%else
        ;; 2 to 15 blocks: fold what is possible vector-wide, then
        ;; finish the remaining lanes one 128-bit block at a time

        vpxorq          %%ZCRCSUM0, %%ZCRCSUM0
        vpxorq          %%ZCRCSUM1, %%ZCRCSUM1
        vpxorq          %%ZCRCSUM2, %%ZCRCSUM2
        vpxorq          %%ZCRCSUM3, %%ZCRCSUM3

        vmovdqa64       %%ZCRCSUM0, %%ZCRCIN0
%if %%NUMBL > 4
        vmovdqa64       %%ZCRCSUM1, %%ZCRCIN1
%endif
%if %%NUMBL > 8
        vmovdqa64       %%ZCRCSUM2, %%ZCRCIN2
%endif
%if %%NUMBL > 12
        vmovdqa64       %%ZCRCSUM3, %%ZCRCIN3
%endif

        ;; Add current CRC sum into block 0
        ;; (xmm-to-xmm move zeroes ZMM bits [511:128] via EVEX write)
        vmovdqa64       %%CRC_IN_OUT, %%CRC_IN_OUT
        vpxorq          %%ZCRCSUM0, %%ZCRCSUM0, ZWORD(%%CRC_IN_OUT)

%assign blocks_left %%NUMBL

%if (%%NUMBL >= 12)
        ;; 12 to 15 blocks: two fold-by-4 steps collapse
        ;; SUM0/SUM1/SUM2 into SUM0; SUM3 becomes the new SUM1
        vbroadcastf64x2 %%ZTMP0, [rel fold_by_4]
        vpclmulqdq      %%ZTMP1, %%ZCRCSUM0, %%ZTMP0, 0x01
        vpclmulqdq      %%ZCRCSUM0, %%ZCRCSUM0, %%ZTMP0, 0x10
        vpternlogq      %%ZCRCSUM0, %%ZCRCSUM1, %%ZTMP1, 0x96

        vpclmulqdq      %%ZTMP1, %%ZCRCSUM0, %%ZTMP0, 0x01
        vpclmulqdq      %%ZCRCSUM0, %%ZCRCSUM0, %%ZTMP0, 0x10
        vpternlogq      %%ZCRCSUM0, %%ZCRCSUM2, %%ZTMP1, 0x96
        vmovdqa64       %%ZCRCSUM1, %%ZCRCSUM3

%assign blocks_left (blocks_left - 8)

%elif (%%NUMBL >= 8)
        ;; 8 to 11 blocks: one fold-by-4 step collapses SUM0/SUM1
        ;; into SUM0; SUM2 becomes the new SUM1
        vbroadcastf64x2 %%ZTMP0, [rel fold_by_4]
        vpclmulqdq      %%ZTMP1, %%ZCRCSUM0, %%ZTMP0, 0x01
        vpclmulqdq      %%ZCRCSUM0, %%ZCRCSUM0, %%ZTMP0, 0x10
        vpternlogq      %%ZCRCSUM0, %%ZCRCSUM1, %%ZTMP1, 0x96
        vmovdqa64       %%ZCRCSUM1, %%ZCRCSUM2

%assign blocks_left (blocks_left - 4)
%endif

        ;; 1 to 8 blocks left in ZCRCSUM0 and ZCRCSUM1

%if blocks_left >= 4
        ;; fold 4 x 128 bits -> 2 x 128 bits
        vbroadcastf64x2 YWORD(%%ZTMP0), [rel fold_by_2]
        vextracti64x4   YWORD(%%ZCRCSUM3), %%ZCRCSUM0, 1
        vpclmulqdq      YWORD(%%ZTMP1), YWORD(%%ZCRCSUM0), YWORD(%%ZTMP0), 0x01
        vpclmulqdq      YWORD(%%ZCRCSUM0), YWORD(%%ZCRCSUM0), YWORD(%%ZTMP0), 0x10
        vpternlogq      YWORD(%%ZCRCSUM0), YWORD(%%ZCRCSUM3), YWORD(%%ZTMP1), 0x96

        ;; fold 2 x 128 bits -> 1 x 128 bits
        vmovdqa64       XWORD(%%ZTMP0), [rel fold_by_1]
        vextracti64x2   XWORD(%%ZCRCSUM3), YWORD(%%ZCRCSUM0), 1
        vpclmulqdq      XWORD(%%ZTMP1), XWORD(%%ZCRCSUM0), XWORD(%%ZTMP0), 0x01
        vpclmulqdq      XWORD(%%ZCRCSUM0), XWORD(%%ZCRCSUM0), XWORD(%%ZTMP0), 0x10
        vpternlogq      XWORD(%%ZCRCSUM0), XWORD(%%ZCRCSUM3), XWORD(%%ZTMP1), 0x96

        vmovdqa64       %%CRC_IN_OUT, XWORD(%%ZCRCSUM0)

        ;; remaining lanes (if any) continue from SUM1
        vmovdqa64       %%ZCRCSUM0, %%ZCRCSUM1

%assign blocks_left (blocks_left - 4)

%else
        ;; fewer than 4 lanes: take lane 0 as CRC and rotate the
        ;; remaining lanes down for the scalar loop below
        vmovdqa64       %%CRC_IN_OUT, XWORD(%%ZCRCSUM0)
        vshufi64x2      %%ZCRCSUM0, %%ZCRCSUM0, %%ZCRCSUM0, 0011_1001b

%assign blocks_left (blocks_left - 1)
%endif

        ;; fold the remaining 128-bit lanes in one at a time,
        ;; rotating ZCRCSUM0 down by one lane per step
%rep blocks_left
        vmovdqa64       %%XTMP, XWORD(%%ZCRCSUM0)
        CRC_CLMUL %%CRC_IN_OUT, %%CRC_MUL, %%XTMP, %%XTMP2
        vshufi64x2      %%ZCRCSUM0, %%ZCRCSUM0, %%ZCRCSUM0, 0011_1001b
%endrep

%endif ;; %%NUMBL > 0

%endmacro ;; CRC32_FIRST_1_TO_16

;; ===================================================================
;; ===================================================================
;; Stitched AES128/256 CBC decryption & CRC32 on 16 blocks
;; Decrypts 16 blocks and folds the 16 just-decrypted plain text
;; blocks into the four fold-by-16 CRC accumulators. The last plain
;; text block is also exported separately (its CRC is finished later).
;; ===================================================================
%macro AES_CBC_DEC_CRC32_16 22
%define %%SRC        %1  ; [in] GP with pointer to source buffer
%define %%DST        %2  ; [in] GP with pointer to destination buffer
%define %%OFFS       %3  ; [in/out] GP with src/dst buffer offset
%define %%NBYTES     %4  ; [in/out] GP with number of bytes to decrypt
%define %%KEY_PTR    %5  ; [in] GP with pointer to expanded AES decrypt keys
%define %%ZIV        %6  ; [in/out] IV in / last cipher text block on out
%define %%ZD0        %7  ; [clobbered] temporary ZMM
%define %%ZD1        %8  ; [clobbered] temporary ZMM
%define %%ZD2        %9  ; [clobbered] temporary ZMM
%define %%ZD3        %10 ; [clobbered] temporary ZMM
%define %%ZC0        %11 ; [clobbered] temporary ZMM
%define %%ZC1        %12 ; [clobbered] temporary ZMM
%define %%ZC2        %13 ; [clobbered] temporary ZMM
%define %%ZC3        %14 ; [clobbered] temporary ZMM
%define %%ZTMP0      %15 ; [clobbered] temporary ZMM
%define %%ZTMP1      %16 ; [clobbered] temporary ZMM
%define %%NROUNDS    %17 ; [in] Number of rounds (9 or 13)
%define %%ZCRC_SUM0  %18 ; [in/out] current CRC value
%define %%ZCRC_SUM1  %19 ; [in/out] current CRC value
%define %%ZCRC_SUM2  %20 ; [in/out] current CRC value
%define %%ZCRC_SUM3  %21 ; [in/out] current CRC value
%define %%LAST_BLOCK %22 ; [out] xmm to store the last clear text block

        ;; /////////////////////////////////////////////////
        ;; load cipher text blocks
        ZMM_LOAD_BLOCKS_0_16 16, %%SRC, %%OFFS, \
                %%ZC0, %%ZC1, %%ZC2, %%ZC3

        ;; /////////////////////////////////////////////////
        ;; prepare cipher text blocks for an XOR after AES-DEC rounds
        ;; (CBC chaining: shift previous cipher block / IV into each lane)
        valignq         %%ZD0, %%ZC0, %%ZIV, 6
        valignq         %%ZD1, %%ZC1, %%ZC0, 6
        valignq         %%ZD2, %%ZC2, %%ZC1, 6
        valignq         %%ZD3, %%ZC3, %%ZC2, 6

        ;; /////////////////////////////////////////////////
        ;; update IV for the next round (block 3 in ZIV)
        vmovdqa64       %%ZIV, %%ZC3

        ;; /////////////////////////////////////////////////
        ;; AES rounds 0 to 10/14 & CRC

%assign round 0
%rep (%%NROUNDS + 2)
        ;; /////////////////////////////////////////////////
        ;; AES decrypt round
        vbroadcastf64x2 %%ZTMP0, [%%KEY_PTR + (round*16)]
        ZMM_AESDEC_ROUND_BLOCKS_0_16 %%ZC0, %%ZC1, %%ZC2, %%ZC3, \
                %%ZTMP0, round, %%ZD0, %%ZD1, %%ZD2, %%ZD3, \
                16, %%NROUNDS
%assign round (round + 1)
%endrep

        ;; /////////////////////////////////////////////////
        ;; store clear text
        ZMM_STORE_BLOCKS_0_16 16, %%DST, %%OFFS, \
                %%ZC0, %%ZC1, %%ZC2, %%ZC3

        ;; \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
        ;; CRC just decrypted blocks
        ;; (fold each accumulator forward by 16 blocks and XOR in the
        ;; corresponding plain text; vpternlogq 0x96 = 3-way XOR)
        vbroadcastf64x2 %%ZTMP0, [rel fold_by_16]
        vpclmulqdq      %%ZTMP1, %%ZCRC_SUM0, %%ZTMP0, 0x10
        vpclmulqdq      %%ZCRC_SUM0, %%ZCRC_SUM0, %%ZTMP0, 0x01
        vpternlogq      %%ZCRC_SUM0, %%ZTMP1, %%ZC0, 0x96

        vpclmulqdq      %%ZTMP1, %%ZCRC_SUM1, %%ZTMP0, 0x10
        vpclmulqdq      %%ZCRC_SUM1, %%ZCRC_SUM1, %%ZTMP0, 0x01
        vpternlogq      %%ZCRC_SUM1, %%ZTMP1, %%ZC1, 0x96

        vpclmulqdq      %%ZTMP1, %%ZCRC_SUM2, %%ZTMP0, 0x10
        vpclmulqdq      %%ZCRC_SUM2, %%ZCRC_SUM2, %%ZTMP0, 0x01
        vpternlogq      %%ZCRC_SUM2, %%ZTMP1, %%ZC2, 0x96

        vpclmulqdq      %%ZTMP1, %%ZCRC_SUM3, %%ZTMP0, 0x10
        vpclmulqdq      %%ZCRC_SUM3, %%ZCRC_SUM3, %%ZTMP0, 0x01
        vpternlogq      %%ZCRC_SUM3, %%ZTMP1, %%ZC3, 0x96

        ;; export last plain text block - its CRC is handled separately
        vextracti64x2   %%LAST_BLOCK, %%ZC3, 3

        ;; /////////////////////////////////////////////////
        ;; update lengths and offset
        add             %%OFFS, (16 * 16)
        sub             %%NBYTES, (16 * 16)

%endmacro ;; AES_CBC_DEC_CRC32_16

;; ===================================================================
;; ===================================================================
;; DOCSIS SEC BPI decryption + CRC32
;; This macro is handling the case when the two components are
;; executed together.
698;; =================================================================== 699%macro DOCSIS_DEC_CRC32 40 700%define %%KEYS %1 ;; [in] GP with pointer to expanded keys (decrypt) 701%define %%SRC %2 ;; [in] GP with pointer to source buffer 702%define %%DST %3 ;; [in] GP with pointer to destination buffer 703%define %%NUM_BYTES %4 ;; [in/clobbered] GP with number of bytes to decrypt 704%define %%KEYS_ENC %5 ;; [in] GP with pointer to expanded keys (encrypt) 705%define %%GT1 %6 ;; [clobbered] temporary GP 706%define %%GT2 %7 ;; [clobbered] temporary GP 707%define %%XCRC_INIT %8 ;; [in/out] CRC initial value 708%define %%XIV %9 ;; [in/out] cipher IV 709%define %%ZT1 %10 ;; [clobbered] temporary ZMM 710%define %%ZT2 %11 ;; [clobbered] temporary ZMM 711%define %%ZT3 %12 ;; [clobbered] temporary ZMM 712%define %%ZT4 %13 ;; [clobbered] temporary ZMM 713%define %%ZT5 %14 ;; [clobbered] temporary ZMM 714%define %%ZT6 %15 ;; [clobbered] temporary ZMM 715%define %%ZT7 %16 ;; [clobbered] temporary ZMM 716%define %%ZT8 %17 ;; [clobbered] temporary ZMM 717%define %%ZT9 %18 ;; [clobbered] temporary ZMM 718%define %%ZT10 %19 ;; [clobbered] temporary ZMM 719%define %%ZT11 %20 ;; [clobbered] temporary ZMM 720%define %%ZT12 %21 ;; [clobbered] temporary ZMM 721%define %%ZT13 %22 ;; [clobbered] temporary ZMM 722 ;; no ZT14 - taken by XIV 723 ;; no ZT15 - taken by CRC_INIT 724%define %%ZT16 %23 ;; [clobbered] temporary ZMM 725%define %%ZT17 %24 ;; [clobbered] temporary ZMM 726%define %%ZT18 %25 ;; [clobbered] temporary ZMM 727%define %%ZT19 %26 ;; [clobbered] temporary ZMM 728%define %%ZT20 %27 ;; [clobbered] temporary ZMM 729%define %%ZT21 %28 ;; [clobbered] temporary ZMM 730%define %%ZT22 %29 ;; [clobbered] temporary ZMM 731%define %%ZT23 %30 ;; [clobbered] temporary ZMM 732%define %%ZT24 %31 ;; [clobbered] temporary ZMM 733%define %%ZT25 %32 ;; [clobbered] temporary ZMM 734%define %%ZT26 %33 ;; [clobbered] temporary ZMM 735%define %%ZT27 %34 ;; [clobbered] temporary ZMM 736%define 
%%ZT28 %35 ;; [clobbered] temporary ZMM 737%define %%ZT29 %36 ;; [clobbered] temporary ZMM 738%define %%ZT30 %37 ;; [clobbered] temporary ZMM 739%define %%ZT31 %38 ;; [clobbered] temporary ZMM 740%define %%ZT32 %39 ;; [clobbered] temporary ZMM 741%define %%NROUNDS %40 ;; [in] Number of rounds (9 or 13) 742 743%define %%NUM_BLOCKS %%GT1 744%define %%OFFSET %%GT2 745 746%xdefine %%ZIV ZWORD(%%XIV) 747 748%xdefine %%XTMP0 XWORD(%%ZT1) 749%xdefine %%XTMP1 XWORD(%%ZT2) 750 751%xdefine %%XCRC_TMP XWORD(%%ZT3) 752%xdefine %%XCRC_MUL XWORD(%%ZT4) 753%xdefine %%XCRC_IN_OUT %%XCRC_INIT 754 755%xdefine %%ZCRC0 %%ZT5 756%xdefine %%ZCRC1 %%ZT6 757%xdefine %%ZCRC2 %%ZT7 758%xdefine %%ZCRC3 %%ZT8 759%xdefine %%XCRC0 XWORD(%%ZCRC0) 760 761%xdefine %%ZCIPH0 %%ZT9 762%xdefine %%ZCIPH1 %%ZT10 763%xdefine %%ZCIPH2 %%ZT11 764%xdefine %%ZCIPH3 %%ZT12 765 766%xdefine %%ZTMP0 %%ZT20 767%xdefine %%ZTMP1 %%ZT21 768%xdefine %%ZTMP2 %%ZT22 769%xdefine %%ZTMP3 %%ZT23 770%xdefine %%ZTMP4 %%ZT24 771%xdefine %%ZTMP5 %%ZT25 772%xdefine %%ZTMP6 %%ZT26 773%xdefine %%ZTMP7 %%ZT27 774%xdefine %%ZTMP8 %%ZT28 775%xdefine %%ZTMP9 %%ZT29 776 777%xdefine %%ZCRC_IN_OUT0 ZWORD(%%XCRC_IN_OUT) 778%xdefine %%ZCRC_IN_OUT1 %%ZT30 779%xdefine %%ZCRC_IN_OUT2 %%ZT31 780%xdefine %%ZCRC_IN_OUT3 %%ZT32 781 782 vmovdqa64 %%XCRC_MUL, [rel fold_by_1] 783 vmovdqa64 %%XCRC_INIT, %%XCRC_INIT 784 785 xor %%OFFSET, %%OFFSET 786 787 cmp %%NUM_BYTES, 16 788 jb %%_check_partial_block 789 790 cmp %%NUM_BYTES, (16 * 16) + 16 791 jb %%_below_17_blocks 792 793 cmp %%NUM_BYTES, (32 * 16) + 16 794 jb %%_below_33_blocks 795 796 ;; ===================================================================== 797 ;; ===================================================================== 798 ;; Part handling messages bigger-equal 33 blocks 799 ;; - decrypt & crc performed per 16 block basis 800 ;; ===================================================================== 801 802 ;; Decrypt 16 blocks first. 
803 ;; Make sure IV is in the top 128 bits of ZMM. 804 vshufi64x2 %%ZIV, %%ZIV, %%ZIV, 0000_0000b 805 806 AES_CBC_DEC_1_TO_16 %%SRC, %%DST, 16, %%OFFSET, %%NUM_BYTES, \ 807 %%KEYS, %%ZIV, %%NROUNDS, \ 808 %%ZTMP0, %%ZCRC_IN_OUT1, \ 809 %%ZCRC_IN_OUT2, %%ZCRC_IN_OUT3, \ 810 %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5 811 812 ;; Start of CRC is just reading the data and adding initial value. 813 ;; In the next round multiply and add operations will apply. 814 vpxorq %%ZCRC_IN_OUT0, %%ZCRC_IN_OUT0, %%ZTMP0 815 816 vextracti64x2 %%XCRC0, %%ZCRC_IN_OUT3, 3 817 818%%_main_loop: 819 cmp %%NUM_BYTES, (16 * 16) + 16 820 jb %%_main_loop_exit 821 822 ;; Stitched cipher and CRC on 16 blocks 823 AES_CBC_DEC_CRC32_16 %%SRC, %%DST, %%OFFSET, %%NUM_BYTES, \ 824 %%KEYS, %%ZIV, \ 825 %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, \ 826 %%ZTMP5, %%ZTMP6, %%ZTMP7, %%ZTMP8, %%ZTMP9, \ 827 %%NROUNDS, \ 828 %%ZCRC_IN_OUT0, %%ZCRC_IN_OUT1, \ 829 %%ZCRC_IN_OUT2, %%ZCRC_IN_OUT3, \ 830 %%XCRC0 831 832 jmp %%_main_loop 833 834%%_main_loop_exit: 835 ;; Up to 16 (inclusive) blocks left to process 836 ;; - decrypt the blocks first 837 ;; - then crc decrypted blocks minus one block 838 839 ;; broadcast IV across ZMM (4th and 1st 128-bit positions are only important really) 840 vshufi64x2 %%ZIV, %%ZIV, %%ZIV, 1111_1111b 841 842 mov %%NUM_BLOCKS, %%NUM_BYTES 843 shr %%NUM_BLOCKS, 4 844 and %%NUM_BLOCKS, 15 845 jz %%_decrypt_eq0 846 847 cmp %%NUM_BLOCKS, 8 848 jg %%_decrypt_gt8 849 je %%_decrypt_eq8 850 851 ;; 1 to 7 blocks 852 cmp %%NUM_BLOCKS, 4 853 jg %%_decrypt_gt4 854 je %%_decrypt_eq4 855 856%%_decrypt_lt4: 857 ;; 1 to 3 blocks 858 cmp %%NUM_BLOCKS, 2 859 jg %%_decrypt_eq3 860 je %%_decrypt_eq2 861 jmp %%_decrypt_eq1 862 863%%_decrypt_gt4: 864 ;; 5 to 7 865 cmp %%NUM_BLOCKS, 6 866 jg %%_decrypt_eq7 867 je %%_decrypt_eq6 868 jmp %%_decrypt_eq5 869 870%%_decrypt_gt8: 871 ;; 9 to 15 872 cmp %%NUM_BLOCKS, 12 873 jg %%_decrypt_gt12 874 je %%_decrypt_eq12 875 876 ;; 9 to 11 877 cmp %%NUM_BLOCKS, 10 
878 jg %%_decrypt_eq11 879 je %%_decrypt_eq10 880 jmp %%_decrypt_eq9 881 882%%_decrypt_gt12: 883 ;; 13 to 15 884 cmp %%NUM_BLOCKS, 14 885 jg %%_decrypt_eq15 886 je %%_decrypt_eq14 887 jmp %%_decrypt_eq13 888 889%assign number_of_blocks 1 890%rep 15 891%%_decrypt_eq %+ number_of_blocks : 892 ;; decrypt selected number of blocks 893 AES_CBC_DEC_1_TO_16 %%SRC, %%DST, number_of_blocks, %%OFFSET, %%NUM_BYTES, \ 894 %%KEYS, %%ZIV, %%NROUNDS, \ 895 %%ZTMP6, %%ZTMP7, %%ZTMP8, %%ZTMP9, \ 896 %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5 897 898 ;; extract & save the last decrypted block as crc for it is done separately 899 ;; towards the end of this macro 900%if number_of_blocks < 5 901 vextracti64x2 %%XCRC0, %%ZTMP6, (number_of_blocks - 1) 902%elif number_of_blocks < 9 903 vextracti64x2 %%XCRC0, %%ZTMP7, (number_of_blocks - 4 - 1) 904%elif number_of_blocks < 13 905 vextracti64x2 %%XCRC0, %%ZTMP8, (number_of_blocks - 8 - 1) 906%else 907 vextracti64x2 %%XCRC0, %%ZTMP9, (number_of_blocks - 12 - 1) 908%endif 909 910 ;; set number of blocks for CRC 911 mov %%NUM_BLOCKS, (number_of_blocks - 1) 912 913 ;; extract latest IV into XIV for partial block processing 914 vextracti32x4 %%XIV, %%ZIV, 3 915 jmp %%_decrypt_done_fold_by8 916 917%assign number_of_blocks (number_of_blocks + 1) 918%endrep 919 920%%_decrypt_eq0: 921 ;; Special case. 
Check if there are full 16 blocks for decrypt 922 ;; - it can happen here because the main loop checks for 17 blocks 923 ;; If yes then decrypt them and fall through to folding/crc section 924 ;; identifying 15 blocks for CRC 925 cmp %%NUM_BYTES, (16 * 16) 926 jb %%_cbc_decrypt_done 927 928 AES_CBC_DEC_1_TO_16 %%SRC, %%DST, 16, %%OFFSET, %%NUM_BYTES, \ 929 %%KEYS, %%ZIV, %%NROUNDS, \ 930 %%ZTMP6, %%ZTMP7, %%ZTMP8, %%ZTMP9, \ 931 %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5 932 933 mov %%NUM_BLOCKS, 15 934 vextracti32x4 %%XIV, %%ZIV, 3 935 vextracti64x2 %%XCRC0, %%ZTMP9, 3 936 937%%_decrypt_done_fold_by8: 938 ;; Register content at this point: 939 ;; ZTMP6 - ZTMP9 => decrypted blocks (16 to 31) 940 ;; ZCRC_IN_OUT0 - ZCRC_IN_OUT3 - fold by 16 CRC sums 941 ;; NUM_BLOCKS - number of blocks to CRC 942 943 ;; fold 16 x 128 bits -> 8 x 128 bits 944 vbroadcastf64x2 %%ZTMP2, [rel fold_by_8] 945 vpclmulqdq %%ZTMP1, %%ZCRC_IN_OUT0, %%ZTMP2, 0x01 946 vpclmulqdq %%ZCRC_IN_OUT0, %%ZCRC_IN_OUT0, %%ZTMP2, 0x10 947 vpternlogq %%ZCRC_IN_OUT0, %%ZCRC_IN_OUT2, %%ZTMP1, 0x96 948 949 vpclmulqdq %%ZTMP1, %%ZCRC_IN_OUT1, %%ZTMP2, 0x01 950 vpclmulqdq %%ZCRC_IN_OUT1, %%ZCRC_IN_OUT1, %%ZTMP2, 0x10 951 vpternlogq %%ZCRC_IN_OUT1, %%ZCRC_IN_OUT3, %%ZTMP1, 0x96 952 953%%_decrypt_done_no_fold_16_to_8: 954 ;; CRC 8 blocks of already decrypted text 955 test %%NUM_BLOCKS, 8 956 jz %%_skip_crc_by8 957 958 vbroadcastf64x2 %%ZTMP2, [rel fold_by_8] 959 vpclmulqdq %%ZTMP1, %%ZCRC_IN_OUT0, %%ZTMP2, 0x01 960 vpclmulqdq %%ZCRC_IN_OUT0, %%ZCRC_IN_OUT0, %%ZTMP2, 0x10 961 vpternlogq %%ZCRC_IN_OUT0, %%ZTMP6, %%ZTMP1, 0x96 962 963 vpclmulqdq %%ZTMP1, %%ZCRC_IN_OUT1, %%ZTMP2, 0x01 964 vpclmulqdq %%ZCRC_IN_OUT1, %%ZCRC_IN_OUT1, %%ZTMP2, 0x10 965 vpternlogq %%ZCRC_IN_OUT1, %%ZTMP7, %%ZTMP1, 0x96 966 967 vmovdqa64 %%ZTMP6, %%ZTMP8 968 vmovdqa64 %%ZTMP7, %%ZTMP9 969 970%%_skip_crc_by8: 971 ;; fold 8 x 128 bits -> 4 x 128 bits 972 vbroadcastf64x2 %%ZTMP2, [rel fold_by_4] 973 vpclmulqdq %%ZTMP1, %%ZCRC_IN_OUT0, %%ZTMP2, 
0x01 974 vpclmulqdq %%ZCRC_IN_OUT0, %%ZCRC_IN_OUT0, %%ZTMP2, 0x10 975 vpternlogq %%ZCRC_IN_OUT0, %%ZCRC_IN_OUT1, %%ZTMP1, 0x96 976 977 ;; CRC 4 blocks of already decrypted text 978 test %%NUM_BLOCKS, 4 979 jz %%_skip_crc_by4 980 981 vpclmulqdq %%ZTMP1, %%ZCRC_IN_OUT0, %%ZTMP2, 0x01 982 vpclmulqdq %%ZCRC_IN_OUT0, %%ZCRC_IN_OUT0, %%ZTMP2, 0x10 983 vpternlogq %%ZCRC_IN_OUT0, %%ZTMP6, %%ZTMP1, 0x96 984 985 vmovdqa64 %%ZTMP6, %%ZTMP7 986 987%%_skip_crc_by4: 988 ;; fold 4 x 128 bits -> 2 x 128 bits 989 vbroadcastf64x2 YWORD(%%ZTMP2), [rel fold_by_2] 990 vextracti64x4 YWORD(%%ZCRC_IN_OUT1), %%ZCRC_IN_OUT0, 1 991 vpclmulqdq YWORD(%%ZTMP1), YWORD(%%ZCRC_IN_OUT0), YWORD(%%ZTMP2), 0x01 992 vpclmulqdq YWORD(%%ZCRC_IN_OUT0), YWORD(%%ZCRC_IN_OUT0), YWORD(%%ZTMP2), 0x10 993 vpternlogq YWORD(%%ZCRC_IN_OUT0), YWORD(%%ZCRC_IN_OUT1), YWORD(%%ZTMP1), 0x96 994 995 ;; CRC 2 blocks of already decrypted text 996 test %%NUM_BLOCKS, 2 997 jz %%_skip_crc_by2 998 999 vpclmulqdq YWORD(%%ZTMP1), YWORD(%%ZCRC_IN_OUT0), YWORD(%%ZTMP2), 0x01 1000 vpclmulqdq YWORD(%%ZCRC_IN_OUT0), YWORD(%%ZCRC_IN_OUT0), YWORD(%%ZTMP2), 0x10 1001 vpternlogq YWORD(%%ZCRC_IN_OUT0), YWORD(%%ZTMP6), YWORD(%%ZTMP1), 0x96 1002 1003 vshufi64x2 %%ZTMP6, %%ZTMP6, %%ZTMP6, 1110_1110b 1004 1005%%_skip_crc_by2: 1006 ;; fold 2 x 128 bits -> 1 x 128 bits 1007 vmovdqa64 XWORD(%%ZTMP2), [rel fold_by_1] 1008 vextracti64x2 XWORD(%%ZCRC_IN_OUT1), YWORD(%%ZCRC_IN_OUT0), 1 1009 vpclmulqdq XWORD(%%ZTMP1), XWORD(%%ZCRC_IN_OUT0), XWORD(%%ZTMP2), 0x01 1010 vpclmulqdq XWORD(%%ZCRC_IN_OUT0), XWORD(%%ZCRC_IN_OUT0), XWORD(%%ZTMP2), 0x10 1011 vpternlogq XWORD(%%ZCRC_IN_OUT0), XWORD(%%ZCRC_IN_OUT1), XWORD(%%ZTMP1), 0x96 1012 1013 ;; CRC 1 block of already decrypted text 1014 test %%NUM_BLOCKS, 1 1015 jz %%_skip_crc_by1 1016 1017 vpclmulqdq XWORD(%%ZTMP1), XWORD(%%ZCRC_IN_OUT0), XWORD(%%ZTMP2), 0x01 1018 vpclmulqdq XWORD(%%ZCRC_IN_OUT0), XWORD(%%ZCRC_IN_OUT0), XWORD(%%ZTMP2), 0x10 1019 vpternlogq XWORD(%%ZCRC_IN_OUT0), XWORD(%%ZTMP6), 
XWORD(%%ZTMP1), 0x96 1020 1021%%_skip_crc_by1: 1022 jmp %%_check_partial_block 1023 1024%%_cbc_decrypt_done: 1025 ;; No blocks left to compute CRC for. Just fold the sums from 16x128-bits into 1x128-bits. 1026 ;; Register content at this point: 1027 ;; ZCRC_IN_OUT0 - ZCRC_IN_OUT3 - fold by 16 CRC sums 1028 ;; XCRC0 - includes the last decrypted block to be passed to partial check case 1029 1030 ;; fold 16 x 128 bits -> 8 x 128 bits 1031 vbroadcastf64x2 %%ZTMP2, [rel fold_by_8] 1032 vpclmulqdq %%ZTMP1, %%ZCRC_IN_OUT0, %%ZTMP2, 0x01 1033 vpclmulqdq %%ZCRC_IN_OUT0, %%ZCRC_IN_OUT0, %%ZTMP2, 0x10 1034 vpternlogq %%ZCRC_IN_OUT0, %%ZCRC_IN_OUT2, %%ZTMP1, 0x96 1035 1036 vpclmulqdq %%ZTMP1, %%ZCRC_IN_OUT1, %%ZTMP2, 0x01 1037 vpclmulqdq %%ZCRC_IN_OUT1, %%ZCRC_IN_OUT1, %%ZTMP2, 0x10 1038 vpternlogq %%ZCRC_IN_OUT1, %%ZCRC_IN_OUT3, %%ZTMP1, 0x96 1039 1040%%_cbc_decrypt_done_fold_8_to_4: 1041 ;; fold 8 x 128 bits -> 4 x 128 bits 1042 vbroadcastf64x2 %%ZTMP2, [rel fold_by_4] 1043 vpclmulqdq %%ZTMP1, %%ZCRC_IN_OUT0, %%ZTMP2, 0x01 1044 vpclmulqdq %%ZCRC_IN_OUT0, %%ZCRC_IN_OUT0, %%ZTMP2, 0x10 1045 vpternlogq %%ZCRC_IN_OUT0, %%ZCRC_IN_OUT1, %%ZTMP1, 0x96 1046 1047 ;; fold 4 x 128 bits -> 2 x 128 bits 1048 vbroadcastf64x2 YWORD(%%ZTMP2), [rel fold_by_2] 1049 vextracti64x4 YWORD(%%ZCRC_IN_OUT1), %%ZCRC_IN_OUT0, 1 1050 vpclmulqdq YWORD(%%ZTMP1), YWORD(%%ZCRC_IN_OUT0), YWORD(%%ZTMP2), 0x01 1051 vpclmulqdq YWORD(%%ZCRC_IN_OUT0), YWORD(%%ZCRC_IN_OUT0), YWORD(%%ZTMP2), 0x10 1052 vpternlogq YWORD(%%ZCRC_IN_OUT0), YWORD(%%ZCRC_IN_OUT1), YWORD(%%ZTMP1), 0x96 1053 1054 ;; fold 2 x 128 bits -> 1 x 128 bits 1055 vmovdqa64 XWORD(%%ZTMP2), [rel fold_by_1] 1056 vextracti64x2 XWORD(%%ZCRC_IN_OUT1), YWORD(%%ZCRC_IN_OUT0), 1 1057 vpclmulqdq XWORD(%%ZTMP1), XWORD(%%ZCRC_IN_OUT0), XWORD(%%ZTMP2), 0x01 1058 vpclmulqdq XWORD(%%ZCRC_IN_OUT0), XWORD(%%ZCRC_IN_OUT0), XWORD(%%ZTMP2), 0x10 1059 vpternlogq XWORD(%%ZCRC_IN_OUT0), XWORD(%%ZCRC_IN_OUT1), XWORD(%%ZTMP1), 0x96 1060 1061 ;; - keep the last block out 
from the calculation 1062 ;; (this may be a partial block - additional checks follow) 1063 jmp %%_check_partial_block 1064 1065 1066 ;; ===================================================================== 1067 ;; ===================================================================== 1068 ;; Part handling messages from 16 - 32 blocks 1069 ;; ===================================================================== 1070%%_below_33_blocks: 1071 ;; Decrypt 16 blocks first 1072 ;; Make sure IV is in the top 128 bits of ZMM. 1073 vshufi64x2 %%ZIV, %%ZIV, %%ZIV, 0000_0000b 1074 1075 AES_CBC_DEC_1_TO_16 %%SRC, %%DST, 16, %%OFFSET, %%NUM_BYTES, \ 1076 %%KEYS, %%ZIV, %%NROUNDS, \ 1077 %%ZTMP0, %%ZCRC_IN_OUT1, \ 1078 %%ZCRC_IN_OUT2, %%ZCRC_IN_OUT3, \ 1079 %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5 1080 1081 ;; Start of CRC is just reading the data and adding initial value. 1082 vpxorq %%ZCRC_IN_OUT0, %%ZCRC_IN_OUT0, %%ZTMP0 1083 1084 ;; Use fold by 8 approach to start the CRC. 1085 ;; ZCRC_IN_OUT0 and ZCRC_IN_OUT1 include CRC sums. 1086 vbroadcastf64x2 %%ZTMP2, [rel fold_by_8] 1087 vpclmulqdq %%ZTMP1, %%ZCRC_IN_OUT0, %%ZTMP2, 0x01 1088 vpclmulqdq %%ZCRC_IN_OUT0, %%ZCRC_IN_OUT0, %%ZTMP2, 0x10 1089 vpternlogq %%ZCRC_IN_OUT0, %%ZCRC_IN_OUT2, %%ZTMP1, 0x96 1090 1091 vpclmulqdq %%ZTMP1, %%ZCRC_IN_OUT1, %%ZTMP2, 0x01 1092 vpclmulqdq %%ZCRC_IN_OUT1, %%ZCRC_IN_OUT1, %%ZTMP2, 0x10 1093 vpternlogq %%ZCRC_IN_OUT1, %%ZCRC_IN_OUT3, %%ZTMP1, 0x96 1094 1095 ;; Decrypt rest of the message. 
1096 mov %%NUM_BLOCKS, %%NUM_BYTES 1097 shr %%NUM_BLOCKS, 4 1098 and %%NUM_BLOCKS, 15 1099 jz %%_decrypt2_eq0 1100 1101 cmp %%NUM_BLOCKS, 8 1102 jg %%_decrypt2_gt8 1103 je %%_decrypt2_eq8 1104 1105 ;; 1 to 7 blocks 1106 cmp %%NUM_BLOCKS, 4 1107 jg %%_decrypt2_gt4 1108 je %%_decrypt2_eq4 1109 1110%%_decrypt2_lt4: 1111 ;; 1 to 3 blocks 1112 cmp %%NUM_BLOCKS, 2 1113 jg %%_decrypt2_eq3 1114 je %%_decrypt2_eq2 1115 jmp %%_decrypt2_eq1 1116 1117%%_decrypt2_gt4: 1118 ;; 5 to 7 1119 cmp %%NUM_BLOCKS, 6 1120 jg %%_decrypt2_eq7 1121 je %%_decrypt2_eq6 1122 jmp %%_decrypt2_eq5 1123 1124%%_decrypt2_gt8: 1125 ;; 9 to 15 1126 cmp %%NUM_BLOCKS, 12 1127 jg %%_decrypt2_gt12 1128 je %%_decrypt2_eq12 1129 1130 ;; 9 to 11 1131 cmp %%NUM_BLOCKS, 10 1132 jg %%_decrypt2_eq11 1133 je %%_decrypt2_eq10 1134 jmp %%_decrypt2_eq9 1135 1136%%_decrypt2_gt12: 1137 ;; 13 to 15 1138 cmp %%NUM_BLOCKS, 14 1139 jg %%_decrypt2_eq15 1140 je %%_decrypt2_eq14 1141 jmp %%_decrypt2_eq13 1142 1143%assign number_of_blocks 1 1144%rep 15 1145%%_decrypt2_eq %+ number_of_blocks : 1146 AES_CBC_DEC_1_TO_16 %%SRC, %%DST, number_of_blocks, %%OFFSET, %%NUM_BYTES, \ 1147 %%KEYS, %%ZIV, %%NROUNDS, \ 1148 %%ZTMP6, %%ZTMP7, %%ZTMP8, %%ZTMP9, \ 1149 %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5 1150 1151%if number_of_blocks < 5 1152 vextracti64x2 %%XCRC0, %%ZTMP6, (number_of_blocks - 1) 1153%elif number_of_blocks < 9 1154 vextracti64x2 %%XCRC0, %%ZTMP7, (number_of_blocks - 4 - 1) 1155%elif number_of_blocks < 13 1156 vextracti64x2 %%XCRC0, %%ZTMP8, (number_of_blocks - 8 - 1) 1157%else 1158 vextracti64x2 %%XCRC0, %%ZTMP9, (number_of_blocks - 12 - 1) 1159%endif 1160 1161 ;; Update XIV 1162 mov %%NUM_BLOCKS, (number_of_blocks - 1) 1163 1164 ;; Extract latest IV 1165 vextracti32x4 %%XIV, %%ZIV, 3 1166 jmp %%_decrypt_done_no_fold_16_to_8 1167 1168%assign number_of_blocks (number_of_blocks + 1) 1169%endrep 1170 1171%%_decrypt2_eq0: 1172 ;; Special case. Check if there are full 16 blocks for decrypt. 
1173 ;; If yes then decrypt them and fall through to folding/crc section 1174 ;; identifying 15 blocks for CRC 1175 cmp %%NUM_BYTES, (16 * 16) 1176 jb %%_cbc_decrypt_done_fold_8_to_4 1177 1178 AES_CBC_DEC_1_TO_16 %%SRC, %%DST, 16, %%OFFSET, %%NUM_BYTES, \ 1179 %%KEYS, %%ZIV, %%NROUNDS, \ 1180 %%ZTMP6, %%ZTMP7, %%ZTMP8, %%ZTMP9, \ 1181 %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5 1182 1183 mov %%NUM_BLOCKS, 15 1184 vextracti32x4 %%XIV, %%ZIV, 3 1185 vextracti64x2 %%XCRC0, %%ZTMP9, 3 1186 jmp %%_decrypt_done_no_fold_16_to_8 1187 1188 ;; ===================================================================== 1189 ;; ===================================================================== 1190 ;; Part handling messages up to from 1 to 16 blocks 1191 ;; ===================================================================== 1192%%_below_17_blocks: 1193 ;; Make sure IV is in the top 128 bits of ZMM. 1194 vshufi64x2 %%ZIV, %%ZIV, %%ZIV, 0000_0000b 1195 1196 mov %%NUM_BLOCKS, %%NUM_BYTES 1197 shr %%NUM_BLOCKS, 4 1198 and %%NUM_BLOCKS, 15 1199 jz %%_eq16 1200 1201 cmp %%NUM_BLOCKS, 8 1202 jg %%_gt8 1203 je %%_eq8 1204 1205 ;; 1 to 7 blocks 1206 cmp %%NUM_BLOCKS, 4 1207 jg %%_gt4 1208 je %%_eq4 1209 1210%%_lt4: 1211 ;; 1 to 3 blocks 1212 cmp %%NUM_BLOCKS, 2 1213 jg %%_eq3 1214 je %%_eq2 1215 jmp %%_eq1 1216 1217%%_gt4: 1218 ;; 5 to 7 1219 cmp %%NUM_BLOCKS, 6 1220 jg %%_eq7 1221 je %%_eq6 1222 jmp %%_eq5 1223 1224%%_gt8: 1225 ;; 9 to 15 1226 cmp %%NUM_BLOCKS, 12 1227 jg %%_gt12 1228 je %%_eq12 1229 1230 ;; 9 to 11 1231 cmp %%NUM_BLOCKS, 10 1232 jg %%_eq11 1233 je %%_eq10 1234 jmp %%_eq9 1235 1236%%_gt12: 1237 ;; 13 to 15 1238 cmp %%NUM_BLOCKS, 14 1239 jg %%_eq15 1240 je %%_eq14 1241 jmp %%_eq13 1242 1243%assign number_of_blocks 1 1244%rep 16 1245%%_eq %+ number_of_blocks : 1246 ;; Start building the pipeline by decrypting number of blocks 1247 ;; - later cipher & CRC operations get stitched 1248 AES_CBC_DEC_1_TO_16 %%SRC, %%DST, number_of_blocks, %%OFFSET, %%NUM_BYTES, \ 1249 %%KEYS, 
%%ZIV, %%NROUNDS, \ 1250 %%ZCRC0, %%ZCRC1, %%ZCRC2, %%ZCRC3, \ 1251 %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5 1252 1253 vextracti32x4 %%XIV, %%ZIV, 3 1254 1255 ;; Less than 16 blocks remaining in the message: 1256 ;; - compute CRC on decrypted blocks (minus one, in case it is the last one) 1257 ;; - then check for any partial block left 1258%assign number_of_blocks_to_crc (number_of_blocks - 1) 1259 CRC32_FIRST_1_TO_16 %%XCRC_MUL, %%XCRC_IN_OUT, %%XTMP0, %%XTMP1, \ 1260 number_of_blocks_to_crc, \ 1261 %%ZCRC0, %%ZCRC1, %%ZCRC2, %%ZCRC3, \ 1262 %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3 1263 1264%if number_of_blocks_to_crc == 0 1265%elif number_of_blocks_to_crc < 4 1266 vextracti32x4 %%XCRC0, %%ZCRC0, (number_of_blocks_to_crc % 4) 1267%elif number_of_blocks_to_crc < 8 1268 vextracti32x4 %%XCRC0, %%ZCRC1, (number_of_blocks_to_crc % 4) 1269%elif number_of_blocks_to_crc < 12 1270 vextracti32x4 %%XCRC0, %%ZCRC2, (number_of_blocks_to_crc % 4) 1271%else ;; number_of_blocks_to_crc < 16 1272 vextracti32x4 %%XCRC0, %%ZCRC3, (number_of_blocks_to_crc % 4) 1273%endif 1274 jmp %%_check_partial_block 1275 1276%assign number_of_blocks (number_of_blocks + 1) 1277%endrep 1278 1279 ;; ===================================================================== 1280 ;; ===================================================================== 1281 ;; Part handling decrypt & CRC of partial block and 1282 ;; CRC of the second last block. 
1283 ;; Register content at entry to this section: 1284 ;; XCRC0 - last 16 bytes of clear text to compute crc on (optional) 1285 ;; XCRC_IN_OUT - 128-bit crc fold product 1286 ;; OFFSET - current offset 1287 ;; NUM_BYTES - number of bytes left to decrypt 1288 ;; XIV - IV for decrypt operation 1289 ;; ===================================================================== 1290%%_check_partial_block: 1291 or %%NUM_BYTES, %%NUM_BYTES 1292 jz %%_no_partial_bytes 1293 1294 ;; AES128/256-CFB on the partial block 1295 lea %%GT1, [rel byte_len_to_mask_table] 1296 kmovw k1, [%%GT1 + %%NUM_BYTES*2] 1297 vmovdqu8 %%XTMP1{k1}{z}, [%%SRC + %%OFFSET + 0] 1298 vpxorq %%XTMP0, %%XIV, [%%KEYS_ENC + 0*16] 1299%assign i 1 1300%rep %%NROUNDS 1301 vaesenc %%XTMP0, [%%KEYS_ENC + i*16] 1302%assign i (i + 1) 1303%endrep 1304 vaesenclast %%XTMP0, [%%KEYS_ENC + i*16] 1305 vpxorq %%XTMP0, %%XTMP0, %%XTMP1 1306 vmovdqu8 [%%DST + %%OFFSET + 0]{k1}, %%XTMP0 1307 1308%%_no_partial_bytes: 1309 ;; At this stage: 1310 ;; - whole message is decrypted the focus moves to complete CRC 1311 ;; - XCRC_IN_OUT includes folded data from all payload apart from 1312 ;; the last full block and (potential) partial bytes 1313 ;; - max 2 blocks (minus 1 byte) remain for CRC calculation 1314 ;; - %%OFFSET == 0 is used to check 1315 ;; if message consists of partial block only 1316 or %%OFFSET, %%OFFSET 1317 jz %%_no_block_pending_crc 1318 1319 ;; Data block(s) was previously decrypted 1320 ;; - move to the last decrypted block 1321 ;; - calculate number of bytes to compute CRC for (less CRC field size) 1322 add %%NUM_BYTES, (16 - 4) 1323 sub %%OFFSET, 16 1324 jz %%_no_partial_bytes__start_crc 1325 1326 cmp %%NUM_BYTES, 16 1327 jb %%_no_partial_bytes__lt16 1328 1329 ;; XCRC0 has copy of the last full decrypted block 1330 CRC_UPDATE16 no_load, %%XCRC_IN_OUT, %%XCRC_MUL, %%XCRC0, %%XTMP1, next_crc 1331 1332 sub %%NUM_BYTES, 16 1333 add %%OFFSET, 16 ; compensate for the subtract above 1334 1335%%_no_partial_bytes__lt16: 
1336 or %%NUM_BYTES, %%NUM_BYTES 1337 jz %%_no_partial_bytes__128_done 1338 1339 ;; Partial bytes left - complete CRC calculation 1340 lea %%GT1, [rel pshufb_shf_table] 1341 vmovdqu64 %%XTMP0, [%%GT1 + %%NUM_BYTES] 1342 lea %%GT1, [%%DST + %%OFFSET] 1343 vmovdqu64 %%XTMP1, [%%GT1 - 16 + %%NUM_BYTES] ; xtmp1 = data for CRC 1344 vmovdqa64 %%XCRC_TMP, %%XCRC_IN_OUT 1345 vpshufb %%XCRC_IN_OUT, %%XTMP0 ; top num_bytes with LSB xcrc 1346 vpxorq %%XTMP0, [rel mask3] 1347 vpshufb %%XCRC_TMP, %%XTMP0 ; bottom (16 - num_bytes) with MSB xcrc 1348 1349 ;; data num_bytes (top) blended with MSB bytes of CRC (bottom) 1350 vpblendvb %%XCRC_TMP, %%XTMP1, %%XTMP0 1351 1352 CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%XCRC_TMP, %%XTMP1 1353 1354%%_no_partial_bytes__128_done: 1355 CRC32_REDUCE_128_TO_32 rax, %%XCRC_IN_OUT, %%XTMP1, %%XTMP0, %%XCRC_TMP 1356 jmp %%_do_return 1357 1358%%_no_partial_bytes__start_crc: 1359 ;; - CRC was not started yet 1360 ;; - CBC decryption could have taken place and/or CFB 1361 ;; - DST is never modified so it points to start of the buffer that 1362 ;; is subject of CRC calculation 1363 ETHERNET_FCS_CRC %%DST, %%NUM_BYTES, rax, %%XCRC_IN_OUT, %%GT1, \ 1364 %%XCRC_MUL, %%XTMP0, %%XTMP1, %%XCRC_TMP 1365 jmp %%_do_return 1366 1367%%_no_block_pending_crc: 1368 ;; Message consists of partial block only (first_crc not employed yet) 1369 ;; - XTMP0 includes clear text from CFB processing above 1370 ;; - k1 includes mask of bytes belonging to the message 1371 ;; - NUM_BYTES is length of cipher, CRC is 4 bytes shorter 1372 ;; - ignoring hash lengths 1 to 4 1373 cmp %%NUM_BYTES, 5 1374 jb %%_do_return 1375 1376 ;; clear top 4 bytes of the data 1377 kshiftrw k1, k1, 4 1378 vmovdqu8 %%XTMP0{k1}{z}, %%XTMP0 1379 vpxorq %%XCRC_IN_OUT, %%XTMP0 ; xor the data in 1380 1381 sub %%NUM_BYTES, 4 1382 1383 ;; CRC calculation for payload lengths below 4 is different 1384 cmp %%NUM_BYTES, 4 1385 jb %%_no_block_pending_crc__lt4 1386 1387 ;; 4 or more bytes left 1388 lea %%GT1, [rel 
pshufb_shf_table]
        vmovdqu64 %%XTMP1, [%%GT1 + %%NUM_BYTES]
        ;; shift the fold sum so only NUM_BYTES message bytes stay significant
        vpshufb %%XCRC_IN_OUT, %%XTMP1

        CRC32_REDUCE_128_TO_32 rax, %%XCRC_IN_OUT, %%XTMP0, %%XTMP1, %%XCRC_TMP
        jmp     %%_do_return

%%_no_block_pending_crc__lt4:
        ;; less than 4 bytes left for CRC
        ;; - shift the sum left so the remaining 1 to 3 message bytes land in
        ;;   the position expected by the 64-bit to 32-bit Barrett reduction
        cmp     %%NUM_BYTES, 3
        jne     %%_no_block_pending_crc__neq3
        vpslldq %%XCRC_IN_OUT, 5        ;; 3 bytes of message left
        jmp     %%_do_barret

%%_no_block_pending_crc__neq3:
        cmp     %%NUM_BYTES, 2
        jne     %%_no_block_pending_crc__neq2
        vpslldq %%XCRC_IN_OUT, 6        ;; 2 bytes of message left
        jmp     %%_do_barret

%%_no_block_pending_crc__neq2:
        vpslldq %%XCRC_IN_OUT, 7        ;; 1 byte of message left

%%_do_barret:
        CRC32_REDUCE_64_TO_32 rax, %%XCRC_IN_OUT, %%XTMP0, %%XTMP1, %%XCRC_TMP

%%_do_return:
        ;; result in rax

%endmacro       ;; DOCSIS_DEC_CRC32

;; ===================================================================
;; ===================================================================
;; MACRO IMPLEMENTING API FOR STITCHED DOCSIS DECRYPT AND CRC32
;; ===================================================================
;; Computes Ethernet FCS (CRC32) over the clear-text (hash) region and,
;; when a cipher region is present, runs the stitched AES-CBC decrypt +
;; CRC32 macro over it (DOCSIS_DEC_CRC32 above).
;; In:    arg1 (job) = pointer to job structure
;; Out:   32-bit CRC stored at [job + _auth_tag_output];
;;        job status OR-ed with STS_COMPLETED_AES
;; Note:  r12, r13, rbx and rbp are preserved on the local stack frame;
;;        all other GPR/SIMD registers used here are treated as scratch.
%macro AES_DOCSIS_DEC_CRC32 1
%define %%NROUNDS %1 ; [in] Number of rounds (9 or 13)

        ;; Build a 64-byte aligned stack frame and save callee-saved GPRs
        mov     rax, rsp
        sub     rsp, STACKFRAME_size
        and     rsp, -64                        ;; align frame to 64 bytes
        mov     [rsp + _rsp_save], rax          ; original SP
        mov     [rsp + _gpr_save + 0*8], r12
        mov     [rsp + _gpr_save + 1*8], r13
        mov     [rsp + _gpr_save + 2*8], rbx
        mov     [rsp + _gpr_save + 3*8], rbp

        ;; xmm15 = initial CRC value (0xFFFFFFFF, see init_crc_value)
        vmovdqa64 xmm15, [rel init_crc_value]

        mov     tmp1, [job + _src]
        add     tmp1, [job + _hash_start_src_offset_in_bytes]   ; CRC only start

        ;; No bytes to cipher => CRC-only job, take the short path
        cmp     qword [job + _msg_len_to_cipher_in_bytes], 0
        jz      %%aes_docsis_dec_crc32_avx512__no_cipher

        mov     tmp2, [job + _cipher_start_src_offset_in_bytes]
        cmp     tmp2, [job + _hash_start_src_offset_in_bytes]
        jbe     %%aes_docsis_dec_crc32_avx512__skip_aad ; avoid zero lengths or negative cases

        ;; CRC the clear-text bytes preceding the cipher region first
        sub     tmp2, [job + _hash_start_src_offset_in_bytes]   ; CRC only size / AAD

        ETHERNET_FCS_CRC tmp1, tmp2, rax, xmm15, tmp3, xmm0, xmm1, xmm2, xmm3

        not     eax                     ; carry CRC value into the combined part
        vmovd   xmm15, eax              ; initial CRC value

%%aes_docsis_dec_crc32_avx512__skip_aad:
        mov     tmp1, [job + _iv]
        vmovdqu64 xmm14, [tmp1]         ; load IV

        mov     tmp2, [job + _src]
        add     tmp2, [job + _cipher_start_src_offset_in_bytes]  ; AES start

        mov     tmp3, [job + _dst]      ; AES destination

        mov     tmp4, [job + _msg_len_to_cipher_in_bytes] ; CRC + AES size
        mov     tmp5, [job + _dec_keys]
        mov     tmp6, [job + _enc_keys]

        ;; Stitched AES-CBC decrypt + CRC32 over the cipher region;
        ;; CRC result is returned in rax
        DOCSIS_DEC_CRC32 tmp5, tmp2, tmp3, tmp4, tmp6, \
                         tmp7, tmp8, \
                         xmm15, xmm14, \
                         zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, \
                         zmm8, zmm9, zmm10, zmm11, zmm12, zmm13, \
                         zmm16, zmm17, zmm18, zmm19, zmm20, zmm21, zmm22, zmm23, \
                         zmm24, zmm25, zmm26, zmm27, zmm28, zmm29, zmm30, zmm31, \
                         %%NROUNDS

        jmp     %%aes_docsis_dec_crc32_avx512__exit

%%aes_docsis_dec_crc32_avx512__no_cipher:
        ;; tmp1 - already points to hash start
        ;; job is arg1 - must be saved/restored around the call below
        mov     [rsp + _job_save], job
        mov     arg2, [job + _msg_len_to_hash_in_bytes]
        xor     arg3, arg3
        mov     arg1, tmp1
        call    ethernet_fcs_avx512_local
        mov     job, [rsp + _job_save]

%%aes_docsis_dec_crc32_avx512__exit:
        ;; eax holds the final CRC32 on every path reaching here
        mov     tmp1, [job + _auth_tag_output]
        mov     [tmp1], eax             ; store CRC32 value

        or      qword [job + _status], STS_COMPLETED_AES

        ;; restore stack pointer and registers
        mov     r12, [rsp + _gpr_save + 0*8]
        mov     r13, [rsp + _gpr_save + 1*8]
        mov     rbx, [rsp + _gpr_save + 2*8]
        mov     rbp, [rsp + _gpr_save + 3*8]
        mov     rsp, [rsp + _rsp_save]  ; original SP

%ifdef SAFE_DATA
        ;; wipe SIMD registers that held key material / plain text
        clear_all_zmms_asm
%endif ;; SAFE_DATA
%endmacro

;; ===================================================================
;; ===================================================================
;; input: arg1 = job
;; ===================================================================
align 64
MKGLOBAL(aes_docsis128_dec_crc32_vaes_avx512,function,internal)
;; Stitched DOCSIS AES128-CBC decrypt + CRC32 (Ethernet FCS), VAES/AVX512.
;; In:  arg1 = job pointer; Out: CRC32 written to job auth tag output.
;; 9 AES rounds => AES-128 key schedule.
aes_docsis128_dec_crc32_vaes_avx512:

        AES_DOCSIS_DEC_CRC32 9

        ret

align 64
MKGLOBAL(aes_docsis256_dec_crc32_vaes_avx512,function,internal)
;; Stitched DOCSIS AES256-CBC decrypt + CRC32 (Ethernet FCS), VAES/AVX512.
;; In:  arg1 = job pointer; Out: CRC32 written to job auth tag output.
;; 13 AES rounds => AES-256 key schedule.
aes_docsis256_dec_crc32_vaes_avx512:

        AES_DOCSIS_DEC_CRC32 13

        ret


%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif