;;
;; Copyright (c) 2020, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26;; 27 28%include "include/os.asm" 29%include "imb_job.asm" 30%include "include/clear_regs.asm" 31%include "include/const.inc" 32%include "include/reg_sizes.asm" 33%include "include/transpose_avx512.asm" 34%include "include/aes_common.asm" 35 36section .data 37default rel 38 39align 16 40constants: 41dd 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 42 43align 64 44add_1_4: 45dd 0x00000001, 0x00000000, 0x00000000, 0x00000000 46dd 0x00000002, 0x00000000, 0x00000000, 0x00000000 47dd 0x00000003, 0x00000000, 0x00000000, 0x00000000 48dd 0x00000004, 0x00000000, 0x00000000, 0x00000000 49 50align 64 51add_5_8: 52dd 0x00000005, 0x00000000, 0x00000000, 0x00000000 53dd 0x00000006, 0x00000000, 0x00000000, 0x00000000 54dd 0x00000007, 0x00000000, 0x00000000, 0x00000000 55dd 0x00000008, 0x00000000, 0x00000000, 0x00000000 56 57align 64 58add_16: 59dd 0x00000010, 0x00000010, 0x00000010, 0x00000010 60dd 0x00000010, 0x00000010, 0x00000010, 0x00000010 61dd 0x00000010, 0x00000010, 0x00000010, 0x00000010 62dd 0x00000010, 0x00000010, 0x00000010, 0x00000010 63 64align 64 65set_1_16: 66dd 0x00000001, 0x00000002, 0x00000003, 0x00000004 67dd 0x00000005, 0x00000006, 0x00000007, 0x00000008 68dd 0x00000009, 0x0000000a, 0x0000000b, 0x0000000c 69dd 0x0000000d, 0x0000000e, 0x0000000f, 0x00000010 70 71align 64 72len_to_mask: 73dq 0xffffffffffffffff, 0x0000000000000001 74dq 0x0000000000000003, 0x0000000000000007 75dq 0x000000000000000f, 0x000000000000001f 76dq 0x000000000000003f, 0x000000000000007f 77dq 0x00000000000000ff, 0x00000000000001ff 78dq 0x00000000000003ff, 0x00000000000007ff 79dq 0x0000000000000fff, 0x0000000000001fff 80dq 0x0000000000003fff, 0x0000000000007fff 81dq 0x000000000000ffff, 0x000000000001ffff 82dq 0x000000000003ffff, 0x000000000007ffff 83dq 0x00000000000fffff, 0x00000000001fffff 84dq 0x00000000003fffff, 0x00000000007fffff 85dq 0x0000000000ffffff, 0x0000000001ffffff 86dq 0x0000000003ffffff, 0x0000000007ffffff 87dq 0x000000000fffffff, 0x000000001fffffff 88dq 0x000000003fffffff, 
0x000000007fffffff 89dq 0x00000000ffffffff, 0x00000001ffffffff 90dq 0x00000003ffffffff, 0x00000007ffffffff 91dq 0x0000000fffffffff, 0x0000001fffffffff 92dq 0x0000003fffffffff, 0x0000007fffffffff 93dq 0x000000ffffffffff, 0x000001ffffffffff 94dq 0x000003ffffffffff, 0x000007ffffffffff 95dq 0x00000fffffffffff, 0x00001fffffffffff 96dq 0x00003fffffffffff, 0x00007fffffffffff 97dq 0x0000ffffffffffff, 0x0001ffffffffffff 98dq 0x0003ffffffffffff, 0x0007ffffffffffff 99dq 0x000fffffffffffff, 0x001fffffffffffff 100dq 0x003fffffffffffff, 0x007fffffffffffff 101dq 0x00ffffffffffffff, 0x01ffffffffffffff 102dq 0x03ffffffffffffff, 0x07ffffffffffffff 103dq 0x0fffffffffffffff, 0x1fffffffffffffff 104dq 0x3fffffffffffffff, 0x7fffffffffffffff 105 106%define APPEND(a,b) a %+ b 107 108%ifdef LINUX 109%define arg1 rdi 110%else 111%define arg1 rcx 112%endif 113 114%define job arg1 115 116section .text 117 118%macro ZMM_OP_X4 9 119 ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 16, %1,%2,%3,%4,%5,%2,%3,%4,%5,%6,%7,%8,%9 120%endmacro 121 122%macro ZMM_ROLS_X4 5 123%define %%ZMM_OP1_1 %1 124%define %%ZMM_OP1_2 %2 125%define %%ZMM_OP1_3 %3 126%define %%ZMM_OP1_4 %4 127%define %%BITS_TO_ROTATE %5 128 129 vprold %%ZMM_OP1_1, %%BITS_TO_ROTATE 130 vprold %%ZMM_OP1_2, %%BITS_TO_ROTATE 131 vprold %%ZMM_OP1_3, %%BITS_TO_ROTATE 132 vprold %%ZMM_OP1_4, %%BITS_TO_ROTATE 133 134%endmacro 135 136; 137; Macro adding original state values to processed state values 138; and transposing 16x16 u32 from first 16 ZMM registers, 139; creating keystreams. 140; Note that the registers are tranposed in a different 141; order, so first register (IN00) containing row 0 142; will not contain the first column of the matrix, but 143; row 1 and same with other registers. 144; This is done to minimize the number of registers clobbered. 145; Once transposition is done, keystream is XOR'd with the plaintext 146; and output buffer is written. 
147; 148%macro GENERATE_1K_KS_AND_ENCRYPT 35 149%define %%IN00_KS01 %1 ; [in/clobbered] Input row 0 of state, bytes 64-127 of keystream 150%define %%IN01_KS02 %2 ; [in/clobbered] Input row 1 of state, bytes 128-191 of keystream 151%define %%IN02_KS15 %3 ; [in/clobbered] Input row 2 of state, bytes 960-1023 of keystream 152%define %%IN03_KS04 %4 ; [in/clobbered] Input row 3 of state, bytes 256-319 of keystream 153%define %%IN04_KS08 %5 ; [in/clobbered] Input row 4 of state, bytes 512-575 of keystream 154%define %%IN05 %6 ; [in/clobbered] Input row 5 of state, bytes 576-639 of keystream 155%define %%IN06_KS13 %7 ; [in/clobbered] Input row 6 of state, bytes 832-895 of keystream 156%define %%IN07_KS07 %8 ; [in/clobbered] Input row 7 of state, bytes 448-511 of keystream 157%define %%IN08_KS05 %9 ; [in/clobbered] Input row 8 of state, bytes 320-383 of keystream 158%define %%IN09_KS00 %10 ; [in/clobbered] Input row 9 of state, bytes 0-63 of keystream 159%define %%IN10_KS06 %11 ; [in/clobbered] Input row 10 of state, bytes 384-447 of keystream 160%define %%IN11_KS11 %12 ; [in/clobbered] Input row 11 of state, bytes 704-767 of keystream 161%define %%IN12_KS12 %13 ; [in/clobbered] Input row 12 of state, bytes 768-831 of keystream 162%define %%IN13_KS03 %14 ; [in/clobbered] Input row 13 of state, bytes 192-255 of keystream 163%define %%IN14_KS14 %15 ; [in/clobbered] Input row 14 of state, bytes 896-959 of keystream 164%define %%IN15 %16 ; [in/clobbered] Input row 15 of state, bytes 640-703 of keystream 165%define %%IN_ORIG00_KS09 %17 ; [in/clobbered] Original input row 0, bytes 576-639 of keystream 166%define %%IN_ORIG01_KS10 %18 ; [in/clobbered] Original input row 1, bytes 640-703 of keystream 167%define %%IN_ORIG02 %19 ; [in] Original input row 2 168%define %%IN_ORIG03 %20 ; [in] Original input row 3 169%define %%IN_ORIG04 %21 ; [in] Original input row 4 170%define %%IN_ORIG05 %22 ; [in] Original input row 5 171%define %%IN_ORIG06 %23 ; [in] Original input row 6 172%define 
%%IN_ORIG07 %24 ; [in] Original input row 7 173%define %%IN_ORIG08 %25 ; [in] Original input row 8 174%define %%IN_ORIG09 %26 ; [in] Original input row 9 175%define %%IN_ORIG10 %27 ; [in] Original input row 10 176%define %%IN_ORIG11 %28 ; [in] Original input row 11 177%define %%IN_ORIG12 %29 ; [in] Original input row 12 178%define %%IN_ORIG13 %30 ; [in] Original input row 13 179%define %%IN_ORIG14 %31 ; [in] Original input row 14 180%define %%IN_ORIG15 %32 ; [in] Original input row 15 181%define %%SRC %33 ; [in] Source pointer 182%define %%DST %34 ; [in] Destination pointer 183%define %%OFF %35 ; [in] Offset into src/dst pointers 184 185 vpaddd %%IN00_KS01, %%IN_ORIG00_KS09 186 vpaddd %%IN01_KS02, %%IN_ORIG01_KS10 187 vpaddd %%IN02_KS15, %%IN_ORIG02 188 vpaddd %%IN03_KS04, %%IN_ORIG03 189 190 ;; Deal with first lanes 0-7 191 ; T0, T1 free 192 vpunpckldq %%IN_ORIG00_KS09, %%IN00_KS01, %%IN01_KS02 193 vpunpckhdq %%IN00_KS01, %%IN00_KS01, %%IN01_KS02 194 vpunpckldq %%IN_ORIG01_KS10, %%IN02_KS15, %%IN03_KS04 195 vpunpckhdq %%IN02_KS15, %%IN02_KS15, %%IN03_KS04 196 197 ; IN01_KS02, IN03_KS04 free 198 vpunpcklqdq %%IN03_KS04, %%IN_ORIG00_KS09, %%IN_ORIG01_KS10 199 vpunpckhqdq %%IN01_KS02, %%IN_ORIG00_KS09, %%IN_ORIG01_KS10 200 vpunpcklqdq %%IN_ORIG00_KS09, %%IN00_KS01, %%IN02_KS15 201 vpunpckhqdq %%IN00_KS01, %%IN00_KS01, %%IN02_KS15 202 203 vpaddd %%IN04_KS08, %%IN_ORIG04 204 vpaddd %%IN05, %%IN_ORIG05 205 vpaddd %%IN06_KS13, %%IN_ORIG06 206 vpaddd %%IN07_KS07, %%IN_ORIG07 207 208 ; IN02_KS15, T1 free 209 vpunpckldq %%IN_ORIG01_KS10, %%IN04_KS08, %%IN05 210 vpunpckhdq %%IN04_KS08, %%IN04_KS08, %%IN05 211 vpunpckldq %%IN02_KS15, %%IN06_KS13, %%IN07_KS07 212 vpunpckhdq %%IN06_KS13, %%IN06_KS13, %%IN07_KS07 213 214 ; IN07_KS07, IN05 free 215 vpunpcklqdq %%IN07_KS07, %%IN_ORIG01_KS10, %%IN02_KS15 216 vpunpckhqdq %%IN05, %%IN_ORIG01_KS10, %%IN02_KS15 217 vpunpcklqdq %%IN02_KS15, %%IN04_KS08, %%IN06_KS13 218 vpunpckhqdq %%IN04_KS08, %%IN04_KS08, %%IN06_KS13 219 220 ; T1, 
IN06_KS13 free 221 vshufi64x2 %%IN_ORIG01_KS10, %%IN03_KS04, %%IN07_KS07, 0x44 222 vshufi64x2 %%IN03_KS04, %%IN03_KS04, %%IN07_KS07, 0xee 223 vshufi64x2 %%IN06_KS13, %%IN01_KS02, %%IN05, 0x44 224 vshufi64x2 %%IN01_KS02, %%IN01_KS02, %%IN05, 0xee 225 vshufi64x2 %%IN07_KS07, %%IN_ORIG00_KS09, %%IN02_KS15, 0x44 226 vshufi64x2 %%IN02_KS15, %%IN_ORIG00_KS09, %%IN02_KS15, 0xee 227 vshufi64x2 %%IN05, %%IN00_KS01, %%IN04_KS08, 0x44 228 vshufi64x2 %%IN00_KS01, %%IN00_KS01, %%IN04_KS08, 0xee 229 230 ;; Deal with lanes 8-15 231 vpaddd %%IN08_KS05, %%IN_ORIG08 232 vpaddd %%IN09_KS00, %%IN_ORIG09 233 vpaddd %%IN10_KS06, %%IN_ORIG10 234 vpaddd %%IN11_KS11, %%IN_ORIG11 235 236 vpunpckldq %%IN_ORIG00_KS09, %%IN08_KS05, %%IN09_KS00 237 vpunpckhdq %%IN08_KS05, %%IN08_KS05, %%IN09_KS00 238 vpunpckldq %%IN04_KS08, %%IN10_KS06, %%IN11_KS11 239 vpunpckhdq %%IN10_KS06, %%IN10_KS06, %%IN11_KS11 240 241 vpunpcklqdq %%IN09_KS00, %%IN_ORIG00_KS09, %%IN04_KS08 242 vpunpckhqdq %%IN04_KS08, %%IN_ORIG00_KS09, %%IN04_KS08 243 vpunpcklqdq %%IN11_KS11, %%IN08_KS05, %%IN10_KS06 244 vpunpckhqdq %%IN08_KS05, %%IN08_KS05, %%IN10_KS06 245 246 vpaddd %%IN12_KS12, %%IN_ORIG12 247 vpaddd %%IN13_KS03, %%IN_ORIG13 248 vpaddd %%IN14_KS14, %%IN_ORIG14 249 vpaddd %%IN15, %%IN_ORIG15 250 251 vpunpckldq %%IN_ORIG00_KS09, %%IN12_KS12, %%IN13_KS03 252 vpunpckhdq %%IN12_KS12, %%IN12_KS12, %%IN13_KS03 253 vpunpckldq %%IN10_KS06, %%IN14_KS14, %%IN15 254 vpunpckhdq %%IN14_KS14, %%IN14_KS14, %%IN15 255 256 vpunpcklqdq %%IN13_KS03, %%IN_ORIG00_KS09, %%IN10_KS06 257 vpunpckhqdq %%IN10_KS06, %%IN_ORIG00_KS09, %%IN10_KS06 258 vpunpcklqdq %%IN15, %%IN12_KS12, %%IN14_KS14 259 vpunpckhqdq %%IN12_KS12, %%IN12_KS12, %%IN14_KS14 260 261 vshufi64x2 %%IN14_KS14, %%IN09_KS00, %%IN13_KS03, 0x44 262 vshufi64x2 %%IN09_KS00, %%IN09_KS00, %%IN13_KS03, 0xee 263 vshufi64x2 %%IN_ORIG00_KS09, %%IN04_KS08, %%IN10_KS06, 0x44 264 vshufi64x2 %%IN10_KS06, %%IN04_KS08, %%IN10_KS06, 0xee 265 vshufi64x2 %%IN13_KS03, %%IN11_KS11, %%IN15, 0x44 266 
vshufi64x2 %%IN11_KS11, %%IN11_KS11, %%IN15, 0xee 267 vshufi64x2 %%IN15, %%IN08_KS05, %%IN12_KS12, 0x44 268 vshufi64x2 %%IN08_KS05, %%IN08_KS05, %%IN12_KS12, 0xee 269 270 vshufi64x2 %%IN12_KS12, %%IN03_KS04, %%IN09_KS00, 0xdd 271 vpxorq %%IN12_KS12, [%%SRC + %%OFF + 64*12] 272 vmovdqu64 [%%DST + %%OFF + 64*12], %%IN12_KS12 273 274 vshufi64x2 %%IN04_KS08, %%IN03_KS04, %%IN09_KS00, 0x88 275 vpxorq %%IN04_KS08, [%%SRC + %%OFF + 64*8] 276 vmovdqu64 [%%DST + %%OFF + 64*8], %%IN04_KS08 277 278 vshufi64x2 %%IN09_KS00, %%IN_ORIG01_KS10, %%IN14_KS14, 0x88 279 vpxorq %%IN09_KS00, [%%SRC + %%OFF] 280 vmovdqu64 [%%DST + %%OFF], %%IN09_KS00 281 282 vshufi64x2 %%IN03_KS04, %%IN_ORIG01_KS10, %%IN14_KS14, 0xdd 283 vpxorq %%IN03_KS04, [%%SRC + %%OFF + 64*4] 284 vmovdqu64 [%%DST + %%OFF + 64*4], %%IN03_KS04 285 286 vshufi64x2 %%IN14_KS14, %%IN02_KS15, %%IN11_KS11, 0xdd 287 vpxorq %%IN14_KS14, [%%SRC + %%OFF + 64*14] 288 vmovdqu64 [%%DST + %%OFF + 64*14], %%IN14_KS14 289 290 vshufi64x2 %%IN_ORIG01_KS10, %%IN02_KS15, %%IN11_KS11, 0x88 291 vpxorq %%IN_ORIG01_KS10, [%%SRC + %%OFF + 64*10] 292 vmovdqu64 [%%DST + %%OFF + 64*10], %%IN_ORIG01_KS10 293 294 vshufi64x2 %%IN11_KS11, %%IN00_KS01, %%IN08_KS05, 0x88 295 vpxorq %%IN11_KS11, [%%SRC + %%OFF + 64*11] 296 vmovdqu64 [%%DST + %%OFF + 64*11], %%IN11_KS11 297 298 vshufi64x2 %%IN02_KS15, %%IN00_KS01, %%IN08_KS05, 0xdd 299 vpxorq %%IN02_KS15, [%%SRC + %%OFF + 64*15] 300 vmovdqu64 [%%DST + %%OFF + 64*15], %%IN02_KS15 301 302 vshufi64x2 %%IN00_KS01, %%IN06_KS13, %%IN_ORIG00_KS09, 0x88 303 vpxorq %%IN00_KS01, [%%SRC + %%OFF + 64*1] 304 vmovdqu64 [%%DST + %%OFF + 64*1], %%IN00_KS01 305 306 vshufi64x2 %%IN08_KS05, %%IN06_KS13, %%IN_ORIG00_KS09, 0xdd 307 vpxorq %%IN08_KS05, [%%SRC + %%OFF + 64*5] 308 vmovdqu64 [%%DST + %%OFF + 64*5], %%IN08_KS05 309 310 vshufi64x2 %%IN_ORIG00_KS09, %%IN01_KS02, %%IN10_KS06, 0x88 311 vpxorq %%IN_ORIG00_KS09, [%%SRC + %%OFF + 64*9] 312 vmovdqu64 [%%DST + %%OFF + 64*9], %%IN_ORIG00_KS09 313 314 vshufi64x2 
%%IN06_KS13, %%IN01_KS02, %%IN10_KS06, 0xdd 315 vpxorq %%IN06_KS13, [%%SRC + %%OFF + 64*13] 316 vmovdqu64 [%%DST + %%OFF + 64*13], %%IN06_KS13 317 318 vshufi64x2 %%IN01_KS02, %%IN07_KS07, %%IN13_KS03, 0x88 319 vpxorq %%IN01_KS02, [%%SRC + %%OFF + 64*2] 320 vmovdqu64 [%%DST + %%OFF + 64*2], %%IN01_KS02 321 322 vshufi64x2 %%IN10_KS06, %%IN07_KS07, %%IN13_KS03, 0xdd 323 vpxorq %%IN10_KS06, [%%SRC + %%OFF + 64*6] 324 vmovdqu64 [%%DST + %%OFF + 64*6], %%IN10_KS06 325 326 vshufi64x2 %%IN13_KS03, %%IN05, %%IN15, 0x88 327 vpxorq %%IN13_KS03, [%%SRC + %%OFF + 64*3] 328 vmovdqu64 [%%DST + %%OFF + 64*3], %%IN13_KS03 329 330 vshufi64x2 %%IN07_KS07, %%IN05, %%IN15, 0xdd 331 vpxorq %%IN07_KS07, [%%SRC + %%OFF + 64*7] 332 vmovdqu64 [%%DST + %%OFF + 64*7], %%IN07_KS07 333%endmacro 334 335;; 336;; Performs a quarter round on all 4 columns, 337;; resulting in a full round 338;; 339%macro QUARTER_ROUND_X4 4 340%define %%A %1 ;; [in/out] ZMM register containing value A of all 4 columns 341%define %%B %2 ;; [in/out] ZMM register containing value B of all 4 columns 342%define %%C %3 ;; [in/out] ZMM register containing value C of all 4 columns 343%define %%D %4 ;; [in/out] ZMM register containing value D of all 4 columns 344 345 vpaddd %%A, %%B 346 vpxorq %%D, %%A 347 vprold %%D, 16 348 vpaddd %%C, %%D 349 vpxorq %%B, %%C 350 vprold %%B, 12 351 vpaddd %%A, %%B 352 vpxorq %%D, %%A 353 vprold %%D, 8 354 vpaddd %%C, %%D 355 vpxorq %%B, %%C 356 vprold %%B, 7 357 358%endmacro 359 360;; 361;; Rotates the registers to prepare the data 362;; from column round to diagonal round 363;; 364%macro COLUMN_TO_DIAG 3 365%define %%B %1 ;; [in/out] ZMM register containing value B of all 4 columns 366%define %%C %2 ;; [in/out] ZMM register containing value C of all 4 columns 367%define %%D %3 ;; [in/out] ZMM register containing value D of all 4 columns 368 369 vpshufd %%B, %%B, 0x39 ; 0b00111001 ;; 0,3,2,1 370 vpshufd %%C, %%C, 0x4E ; 0b01001110 ;; 1,0,3,2 371 vpshufd %%D, %%D, 0x93 ; 0b10010011 ;; 2,1,0,3 
372 373%endmacro 374 375;; 376;; Rotates the registers to prepare the data 377;; from diagonal round to column round 378;; 379%macro DIAG_TO_COLUMN 3 380%define %%B %1 ;; [in/out] ZMM register containing value B of all 4 columns 381%define %%C %2 ;; [in/out] ZMM register containing value C of all 4 columns 382%define %%D %3 ;; [in/out] ZMM register containing value D of all 4 columns 383 384 vpshufd %%B, %%B, 0x93 ; 0b10010011 ; 2,1,0,3 385 vpshufd %%C, %%C, 0x4E ; 0b01001110 ; 1,0,3,2 386 vpshufd %%D, %%D, 0x39 ; 0b00111001 ; 0,3,2,1 387 388%endmacro 389;; 390;; Generates up to 64*8 bytes of keystream 391;; 392%macro GENERATE_512_KS 21 393%define %%A_L_KS0 %1 ;; [out] ZMM A / Bytes 0-63 of KS 394%define %%B_L_KS1 %2 ;; [out] ZMM B / Bytes 64-127 of KS 395%define %%C_L_KS2 %3 ;; [out] ZMM C / Bytes 128-191 of KS 396%define %%D_L_KS3 %4 ;; [out] ZMM D / Bytes 192-255 of KS 397%define %%A_H_KS4 %5 ;; [out] ZMM A / Bytes 256-319 of KS (or "none" in NUM_BLOCKS == 4) 398%define %%B_H_KS5 %6 ;; [out] ZMM B / Bytes 320-383 of KS (or "none" in NUM_BLOCKS == 4) 399%define %%C_H_KS6 %7 ;; [out] ZMM C / Bytes 384-447 of KS (or "none" in NUM_BLOCKS == 4) 400%define %%D_H_KS7 %8 ;; [out] ZMM D / Bytes 448-511 of KS (or "none" in NUM_BLOCKS == 4) 401%define %%STATE_IN_A_L %9 ;; [in] ZMM containing state "A" part 402%define %%STATE_IN_B_L %10 ;; [in] ZMM containing state "B" part 403%define %%STATE_IN_C_L %11 ;; [in] ZMM containing state "C" part 404%define %%STATE_IN_D_L %12 ;; [in] ZMM containing state "D" part 405%define %%STATE_IN_A_H %13 ;; [in] ZMM containing state "A" part (or "none" in NUM_BLOCKS == 4) 406%define %%STATE_IN_B_H %14 ;; [in] ZMM containing state "B" part (or "none" in NUM_BLOCKS == 4) 407%define %%STATE_IN_C_H %15 ;; [in] ZMM containing state "C" part (or "none" in NUM_BLOCKS == 4) 408%define %%STATE_IN_D_H %16 ;; [in] ZMM containing state "D" part (or "none" in NUM_BLOCKS == 4) 409%define %%ZTMP0 %17 ;; [clobbered] Temp ZMM reg 410%define %%ZTMP1 %18 ;; 
[clobbered] Temp ZMM reg 411%define %%ZTMP2 %19 ;; [clobbered] Temp ZMM reg 412%define %%ZTMP3 %20 ;; [clobbered] Temp ZMM reg 413%define %%NUM_BLOCKS %21 ;; [in] Num blocks to encrypt (4 or 8) 414 415 vmovdqa64 %%A_L_KS0, %%STATE_IN_A_L 416 vmovdqa64 %%B_L_KS1, %%STATE_IN_B_L 417 vmovdqa64 %%C_L_KS2, %%STATE_IN_C_L 418 vmovdqa64 %%D_L_KS3, %%STATE_IN_D_L 419%if %%NUM_BLOCKS == 8 420 vmovdqa64 %%A_H_KS4, %%STATE_IN_A_H 421 vmovdqa64 %%B_H_KS5, %%STATE_IN_B_H 422 vmovdqa64 %%C_H_KS6, %%STATE_IN_C_H 423 vmovdqa64 %%D_H_KS7, %%STATE_IN_D_H 424%endif 425%rep 10 426%if %%NUM_BLOCKS == 4 427 QUARTER_ROUND_X4 %%A_L_KS0, %%B_L_KS1, %%C_L_KS2, %%D_L_KS3 428 COLUMN_TO_DIAG %%B_L_KS1, %%C_L_KS2, %%D_L_KS3 429 QUARTER_ROUND_X4 %%A_L_KS0, %%B_L_KS1, %%C_L_KS2, %%D_L_KS3 430 DIAG_TO_COLUMN %%B_L_KS1, %%C_L_KS2, %%D_L_KS3 431%else 432 QUARTER_ROUND_X4 %%A_L_KS0, %%B_L_KS1, %%C_L_KS2, %%D_L_KS3 433 QUARTER_ROUND_X4 %%A_H_KS4, %%B_H_KS5, %%C_H_KS6, %%D_H_KS7 434 COLUMN_TO_DIAG %%B_L_KS1, %%C_L_KS2, %%D_L_KS3 435 COLUMN_TO_DIAG %%B_H_KS5, %%C_H_KS6, %%D_H_KS7 436 QUARTER_ROUND_X4 %%A_L_KS0, %%B_L_KS1, %%C_L_KS2, %%D_L_KS3 437 QUARTER_ROUND_X4 %%A_H_KS4, %%B_H_KS5, %%C_H_KS6, %%D_H_KS7 438 DIAG_TO_COLUMN %%B_L_KS1, %%C_L_KS2, %%D_L_KS3 439 DIAG_TO_COLUMN %%B_H_KS5, %%C_H_KS6, %%D_H_KS7 440%endif ;; %%NUM_BLOCKS == 4 441%endrep 442 443 vpaddd %%A_L_KS0, %%STATE_IN_A_L 444 vpaddd %%B_L_KS1, %%STATE_IN_B_L 445 vpaddd %%C_L_KS2, %%STATE_IN_C_L 446 vpaddd %%D_L_KS3, %%STATE_IN_D_L 447 448 TRANSPOSE4_U128_INPLACE %%A_L_KS0, %%B_L_KS1, %%C_L_KS2, %%D_L_KS3, \ 449 %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3 450%if %%NUM_BLOCKS == 8 451 vpaddd %%A_H_KS4, %%STATE_IN_A_H 452 vpaddd %%B_H_KS5, %%STATE_IN_B_H 453 vpaddd %%C_H_KS6, %%STATE_IN_C_H 454 vpaddd %%D_H_KS7, %%STATE_IN_D_H 455 456 TRANSPOSE4_U128_INPLACE %%A_H_KS4, %%B_H_KS5, %%C_H_KS6, %%D_H_KS7, \ 457 %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3 458%endif 459%endmacro 460 461;; 462;; Performs a full chacha20 round on 16 states, 463;; consisting of 4 
quarter rounds, which are done in parallel 464;; 465%macro CHACHA20_ROUND 16 466%define %%ZMM_DWORD_A1 %1 ;; [in/out] ZMM register containing dword A for first quarter round 467%define %%ZMM_DWORD_A2 %2 ;; [in/out] ZMM register containing dword A for second quarter round 468%define %%ZMM_DWORD_A3 %3 ;; [in/out] ZMM register containing dword A for third quarter round 469%define %%ZMM_DWORD_A4 %4 ;; [in/out] ZMM register containing dword A for fourth quarter round 470%define %%ZMM_DWORD_B1 %5 ;; [in/out] ZMM register containing dword B for first quarter round 471%define %%ZMM_DWORD_B2 %6 ;; [in/out] ZMM register containing dword B for second quarter round 472%define %%ZMM_DWORD_B3 %7 ;; [in/out] ZMM register containing dword B for third quarter round 473%define %%ZMM_DWORD_B4 %8 ;; [in/out] ZMM register containing dword B for fourth quarter round 474%define %%ZMM_DWORD_C1 %9 ;; [in/out] ZMM register containing dword C for first quarter round 475%define %%ZMM_DWORD_C2 %10 ;; [in/out] ZMM register containing dword C for second quarter round 476%define %%ZMM_DWORD_C3 %11 ;; [in/out] ZMM register containing dword C for third quarter round 477%define %%ZMM_DWORD_C4 %12 ;; [in/out] ZMM register containing dword C for fourth quarter round 478%define %%ZMM_DWORD_D1 %13 ;; [in/out] ZMM register containing dword D for first quarter round 479%define %%ZMM_DWORD_D2 %14 ;; [in/out] ZMM register containing dword D for second quarter round 480%define %%ZMM_DWORD_D3 %15 ;; [in/out] ZMM register containing dword D for third quarter round 481%define %%ZMM_DWORD_D4 %16 ;; [in/out] ZMM register containing dword D for fourth quarter round 482 483 ; A += B 484 ZMM_OP_X4 vpaddd, %%ZMM_DWORD_A1, %%ZMM_DWORD_A2, %%ZMM_DWORD_A3, %%ZMM_DWORD_A4, \ 485 %%ZMM_DWORD_B1, %%ZMM_DWORD_B2, %%ZMM_DWORD_B3, %%ZMM_DWORD_B4 486 ; D ^= A 487 ZMM_OP_X4 vpxorq, %%ZMM_DWORD_D1, %%ZMM_DWORD_D2, %%ZMM_DWORD_D3, %%ZMM_DWORD_D4, \ 488 %%ZMM_DWORD_A1, %%ZMM_DWORD_A2, %%ZMM_DWORD_A3, %%ZMM_DWORD_A4 489 490 ; D <<< 
16 491 ZMM_ROLS_X4 %%ZMM_DWORD_D1, %%ZMM_DWORD_D2, %%ZMM_DWORD_D3, %%ZMM_DWORD_D4, 16 492 493 ; C += D 494 ZMM_OP_X4 vpaddd, %%ZMM_DWORD_C1, %%ZMM_DWORD_C2, %%ZMM_DWORD_C3, %%ZMM_DWORD_C4, \ 495 %%ZMM_DWORD_D1, %%ZMM_DWORD_D2, %%ZMM_DWORD_D3, %%ZMM_DWORD_D4 496 ; B ^= C 497 ZMM_OP_X4 vpxorq, %%ZMM_DWORD_B1, %%ZMM_DWORD_B2, %%ZMM_DWORD_B3, %%ZMM_DWORD_B4, \ 498 %%ZMM_DWORD_C1, %%ZMM_DWORD_C2, %%ZMM_DWORD_C3, %%ZMM_DWORD_C4 499 500 ; B <<< 12 501 ZMM_ROLS_X4 %%ZMM_DWORD_B1, %%ZMM_DWORD_B2, %%ZMM_DWORD_B3, %%ZMM_DWORD_B4, 12 502 503 ; A += B 504 ZMM_OP_X4 vpaddd, %%ZMM_DWORD_A1, %%ZMM_DWORD_A2, %%ZMM_DWORD_A3, %%ZMM_DWORD_A4, \ 505 %%ZMM_DWORD_B1, %%ZMM_DWORD_B2, %%ZMM_DWORD_B3, %%ZMM_DWORD_B4 506 ; D ^= A 507 ZMM_OP_X4 vpxorq, %%ZMM_DWORD_D1, %%ZMM_DWORD_D2, %%ZMM_DWORD_D3, %%ZMM_DWORD_D4, \ 508 %%ZMM_DWORD_A1, %%ZMM_DWORD_A2, %%ZMM_DWORD_A3, %%ZMM_DWORD_A4 509 510 ; D <<< 8 511 ZMM_ROLS_X4 %%ZMM_DWORD_D1, %%ZMM_DWORD_D2, %%ZMM_DWORD_D3, %%ZMM_DWORD_D4, 8 512 513 ; C += D 514 ZMM_OP_X4 vpaddd, %%ZMM_DWORD_C1, %%ZMM_DWORD_C2, %%ZMM_DWORD_C3, %%ZMM_DWORD_C4, \ 515 %%ZMM_DWORD_D1, %%ZMM_DWORD_D2, %%ZMM_DWORD_D3, %%ZMM_DWORD_D4 516 ; B ^= C 517 ZMM_OP_X4 vpxorq, %%ZMM_DWORD_B1, %%ZMM_DWORD_B2, %%ZMM_DWORD_B3, %%ZMM_DWORD_B4, \ 518 %%ZMM_DWORD_C1, %%ZMM_DWORD_C2, %%ZMM_DWORD_C3, %%ZMM_DWORD_C4 519 520 ; B <<< 7 521 ZMM_ROLS_X4 %%ZMM_DWORD_B1, %%ZMM_DWORD_B2, %%ZMM_DWORD_B3, %%ZMM_DWORD_B4, 7 522%endmacro 523 524;; 525;; Generates 64*16 bytes of keystream and encrypt up to 1KB of input data 526;; 527%macro ENCRYPT_1K 35 528%define %%ZMM_DWORD0 %1 ;; [clobbered] ZMM to contain dword 0 of all states 529%define %%ZMM_DWORD1 %2 ;; [clobbered] ZMM to contain dword 1 of all states 530%define %%ZMM_DWORD2 %3 ;; [clobbered] ZMM to contain dword 2 of all states 531%define %%ZMM_DWORD3 %4 ;; [clobbered] ZMM to contain dword 3 of all states 532%define %%ZMM_DWORD4 %5 ;; [clobbered] ZMM to contain dword 4 of all states 533%define %%ZMM_DWORD5 %6 ;; [clobbered] ZMM to contain dword 5 
of all states 534%define %%ZMM_DWORD6 %7 ;; [clobbered] ZMM to contain dword 6 of all states 535%define %%ZMM_DWORD7 %8 ;; [clobbered] ZMM to contain dword 7 of all states 536%define %%ZMM_DWORD8 %9 ;; [clobbered] ZMM to contain dword 8 of all states 537%define %%ZMM_DWORD9 %10 ;; [clobbered] ZMM to contain dword 9 of all states 538%define %%ZMM_DWORD10 %11 ;; [clobbered] ZMM to contain dword 10 of all states 539%define %%ZMM_DWORD11 %12 ;; [clobbered] ZMM to contain dword 11 of all states 540%define %%ZMM_DWORD12 %13 ;; [clobbered] ZMM to contain dword 12 of all states 541%define %%ZMM_DWORD13 %14 ;; [clobbered] ZMM to contain dword 13 of all states 542%define %%ZMM_DWORD14 %15 ;; [clobbered] ZMM to contain dword 14 of all states 543%define %%ZMM_DWORD15 %16 ;; [clobbered] ZMM to contain dword 15 of all states 544%define %%ZMM_DWORD_ORIG0 %17 ;; [in/clobbered] ZMM containing dword 0 of all states / Temp ZMM register 545%define %%ZMM_DWORD_ORIG1 %18 ;; [in/clobbered] ZMM containing dword 1 of all states / Temp ZMM register 546%define %%ZMM_DWORD_ORIG2 %19 ;; [in] ZMM containing dword 2 of all states 547%define %%ZMM_DWORD_ORIG3 %20 ;; [in] ZMM containing dword 3 of all states 548%define %%ZMM_DWORD_ORIG4 %21 ;; [in] ZMM containing dword 4 of all states 549%define %%ZMM_DWORD_ORIG5 %22 ;; [in] ZMM containing dword 5 of all states 550%define %%ZMM_DWORD_ORIG6 %23 ;; [in] ZMM containing dword 6 of all states 551%define %%ZMM_DWORD_ORIG7 %24 ;; [in] ZMM containing dword 7 of all states 552%define %%ZMM_DWORD_ORIG8 %25 ;; [in] ZMM containing dword 8 of all states 553%define %%ZMM_DWORD_ORIG9 %26 ;; [in] ZMM containing dword 9 of all states 554%define %%ZMM_DWORD_ORIG10 %27 ;; [in] ZMM containing dword 10 of all states 555%define %%ZMM_DWORD_ORIG11 %28 ;; [in] ZMM containing dword 11 of all states 556%define %%ZMM_DWORD_ORIG12 %29 ;; [in] ZMM containing dword 12 of all states 557%define %%ZMM_DWORD_ORIG13 %30 ;; [in] ZMM containing dword 13 of all states 558%define 
%%ZMM_DWORD_ORIG14 %31 ;; [in] ZMM containing dword 14 of all states 559%define %%ZMM_DWORD_ORIG15 %32 ;; [in] ZMM containing dword 15 of all states 560%define %%SRC %33 ;; [in] Source pointer 561%define %%DST %34 ;; [in] Destination pointer 562%define %%OFF %35 ;; [in] Offset into src/dst pointers 563 564%assign i 0 565%rep 16 566 vmovdqa64 APPEND(%%ZMM_DWORD, i), APPEND(%%ZMM_DWORD_ORIG, i) 567%assign i (i + 1) 568%endrep 569 570%rep 10 571 572 ;;; Each full round consists of 8 quarter rounds, 4 column rounds and 4 diagonal rounds 573 ;;; For first 4 column rounds: 574 ;;; A = 0, 1, 2, 3; B = 4, 5, 6, 7; 575 ;;; C = 8, 9, 10, 11; D = 12, 13, 14, 15 576 CHACHA20_ROUND %%ZMM_DWORD0, %%ZMM_DWORD1, %%ZMM_DWORD2, %%ZMM_DWORD3, \ 577 %%ZMM_DWORD4, %%ZMM_DWORD5, %%ZMM_DWORD6, %%ZMM_DWORD7, \ 578 %%ZMM_DWORD8, %%ZMM_DWORD9, %%ZMM_DWORD10, %%ZMM_DWORD11, \ 579 %%ZMM_DWORD12, %%ZMM_DWORD13, %%ZMM_DWORD14, %%ZMM_DWORD15 580 ;;; For 4 diagonal rounds: 581 ;;; A = 0, 1, 2, 3; B = 5, 6, 7, 4; 582 ;;; C = 10, 11, 8, 9; D = 15, 12, 13, 14 583 CHACHA20_ROUND %%ZMM_DWORD0, %%ZMM_DWORD1, %%ZMM_DWORD2, %%ZMM_DWORD3, \ 584 %%ZMM_DWORD5, %%ZMM_DWORD6, %%ZMM_DWORD7, %%ZMM_DWORD4, \ 585 %%ZMM_DWORD10, %%ZMM_DWORD11, %%ZMM_DWORD8, %%ZMM_DWORD9, \ 586 %%ZMM_DWORD15, %%ZMM_DWORD12, %%ZMM_DWORD13, %%ZMM_DWORD14 587%endrep 588 589 ;; Add original states to processed states, transpose 590 ;; these states to form the 64*16 bytes of keystream, 591 ;; XOR with plaintext and write ciphertext out 592 GENERATE_1K_KS_AND_ENCRYPT %%ZMM_DWORD0, %%ZMM_DWORD1, %%ZMM_DWORD2, %%ZMM_DWORD3, \ 593 %%ZMM_DWORD4, %%ZMM_DWORD5, %%ZMM_DWORD6, %%ZMM_DWORD7, \ 594 %%ZMM_DWORD8, %%ZMM_DWORD9, %%ZMM_DWORD10, %%ZMM_DWORD11, \ 595 %%ZMM_DWORD12, %%ZMM_DWORD13, %%ZMM_DWORD14, %%ZMM_DWORD15, \ 596 %%ZMM_DWORD_ORIG0, %%ZMM_DWORD_ORIG1, %%ZMM_DWORD_ORIG2, \ 597 %%ZMM_DWORD_ORIG3,%%ZMM_DWORD_ORIG4, %%ZMM_DWORD_ORIG5, \ 598 %%ZMM_DWORD_ORIG6, %%ZMM_DWORD_ORIG7, %%ZMM_DWORD_ORIG8, \ 599 %%ZMM_DWORD_ORIG9, 
%%ZMM_DWORD_ORIG10, %%ZMM_DWORD_ORIG11, \ 600 %%ZMM_DWORD_ORIG12, %%ZMM_DWORD_ORIG13, %%ZMM_DWORD_ORIG14, \ 601 %%ZMM_DWORD_ORIG15, %%SRC, %%DST, %%OFF 602%endmacro 603 604; 605; Macro adding original state values to processed state values 606; and transposing 16x16 u32 from first 16 ZMM registers, 607; creating keystreams. 608; Note that the registers are tranposed in a different 609; order, so first register (IN00) containing row 0 610; will not contain the first column of the matrix, but 611; row 1 and same with other registers. 612; This is done to minimize the number of registers clobbered. 613; 614%macro ADD_TRANSPOSE_STATE_KS 32 615%define %%IN00_OUT01 %1 ; [in/out] Input row 0, Output column 1 616%define %%IN01_OUT02 %2 ; [in/out] Input row 1, Output column 2 617%define %%IN02_OUT15 %3 ; [in/out] Input row 2, Output column 15 618%define %%IN03_OUT04 %4 ; [in/out] Input row 3, Output column 4 619%define %%IN04_OUT08 %5 ; [in/out] Input row 4, Output column 8 620%define %%IN05_OUT09 %6 ; [in/out] Input row 5, Output column 9 621%define %%IN06_OUT13 %7 ; [in/out] Input row 6, Output column 13 622%define %%IN07_OUT07 %8 ; [in/out] Input row 7, Output column 7 623%define %%IN08_OUT05 %9 ; [in/out] Input row 8, Output column 5 624%define %%IN09_OUT00 %10 ; [in/out] Input row 9, Output column 0 625%define %%IN10_OUT06 %11 ; [in/out] Input row 10, Output column 6 626%define %%IN11_OUT11 %12 ; [in/out] Input row 11, Output column 11 627%define %%IN12_OUT12 %13 ; [in/out] Input row 12, Output column 12 628%define %%IN13_OUT03 %14 ; [in/out] Input row 13, Output column 3 629%define %%IN14_OUT14 %15 ; [in/out] Input row 14, Output column 14 630%define %%IN15_OUT10 %16 ; [in/out] Input row 15, Output column 10 631%define %%IN_ORIG00 %17 ; [in/clobbered] Original input row 0 632%define %%IN_ORIG01 %18 ; [in/clobbered] Original input row 1 633%define %%IN_ORIG02 %19 ; [in] Original input row 2 634%define %%IN_ORIG03 %20 ; [in] Original input row 3 635%define %%IN_ORIG04 
%21 ; [in] Original input row 4 636%define %%IN_ORIG05 %22 ; [in] Original input row 5 637%define %%IN_ORIG06 %23 ; [in] Original input row 6 638%define %%IN_ORIG07 %24 ; [in] Original input row 7 639%define %%IN_ORIG08 %25 ; [in] Original input row 8 640%define %%IN_ORIG09 %26 ; [in] Original input row 9 641%define %%IN_ORIG10 %27 ; [in] Original input row 10 642%define %%IN_ORIG11 %28 ; [in] Original input row 11 643%define %%IN_ORIG12 %29 ; [in] Original input row 12 644%define %%IN_ORIG13 %30 ; [in] Original input row 13 645%define %%IN_ORIG14 %31 ; [in] Original input row 14 646%define %%IN_ORIG15 %32 ; [in] Original input row 15 647 648 vpaddd %%IN00_OUT01, %%IN_ORIG00 649 vpaddd %%IN01_OUT02, %%IN_ORIG01 650 vpaddd %%IN02_OUT15, %%IN_ORIG02 651 vpaddd %%IN03_OUT04, %%IN_ORIG03 652 653 ;; Deal with first lanes 0-7 654 ; T0, T1 free 655 vpunpckldq %%IN_ORIG00, %%IN00_OUT01, %%IN01_OUT02 656 vpunpckhdq %%IN00_OUT01, %%IN00_OUT01, %%IN01_OUT02 657 vpunpckldq %%IN_ORIG01, %%IN02_OUT15, %%IN03_OUT04 658 vpunpckhdq %%IN02_OUT15, %%IN02_OUT15, %%IN03_OUT04 659 660 ; IN01_OUT02, IN03_OUT04 free 661 vpunpcklqdq %%IN03_OUT04, %%IN_ORIG00, %%IN_ORIG01 662 vpunpckhqdq %%IN01_OUT02, %%IN_ORIG00, %%IN_ORIG01 663 vpunpcklqdq %%IN_ORIG00, %%IN00_OUT01, %%IN02_OUT15 664 vpunpckhqdq %%IN00_OUT01, %%IN00_OUT01, %%IN02_OUT15 665 666 vpaddd %%IN04_OUT08, %%IN_ORIG04 667 vpaddd %%IN05_OUT09, %%IN_ORIG05 668 vpaddd %%IN06_OUT13, %%IN_ORIG06 669 vpaddd %%IN07_OUT07, %%IN_ORIG07 670 671 ; IN02_OUT15, T1 free 672 vpunpckldq %%IN_ORIG01, %%IN04_OUT08, %%IN05_OUT09 673 vpunpckhdq %%IN04_OUT08, %%IN04_OUT08, %%IN05_OUT09 674 vpunpckldq %%IN02_OUT15, %%IN06_OUT13, %%IN07_OUT07 675 vpunpckhdq %%IN06_OUT13, %%IN06_OUT13, %%IN07_OUT07 676 677 ; IN07_OUT07, IN05_OUT09 free 678 vpunpcklqdq %%IN07_OUT07, %%IN_ORIG01, %%IN02_OUT15 679 vpunpckhqdq %%IN05_OUT09, %%IN_ORIG01, %%IN02_OUT15 680 vpunpcklqdq %%IN02_OUT15, %%IN04_OUT08, %%IN06_OUT13 681 vpunpckhqdq %%IN04_OUT08, %%IN04_OUT08, 
%%IN06_OUT13

        ; T1, IN06_OUT13 free
        ;; 128-bit lane shuffles: imm 0x44 selects the low 256 bits of both
        ;; sources, 0xee selects the high 256 bits
        vshufi64x2 %%IN_ORIG01, %%IN03_OUT04, %%IN07_OUT07, 0x44
        vshufi64x2 %%IN03_OUT04, %%IN03_OUT04, %%IN07_OUT07, 0xee
        vshufi64x2 %%IN06_OUT13, %%IN01_OUT02, %%IN05_OUT09, 0x44
        vshufi64x2 %%IN01_OUT02, %%IN01_OUT02, %%IN05_OUT09, 0xee
        vshufi64x2 %%IN07_OUT07, %%IN_ORIG00, %%IN02_OUT15, 0x44
        vshufi64x2 %%IN02_OUT15, %%IN_ORIG00, %%IN02_OUT15, 0xee
        vshufi64x2 %%IN05_OUT09, %%IN00_OUT01, %%IN04_OUT08, 0x44
        vshufi64x2 %%IN00_OUT01, %%IN00_OUT01, %%IN04_OUT08, 0xee

        ;; Deal with lanes 8-15
        ;; Feed-forward: add the original state dwords before transposing
        vpaddd %%IN08_OUT05, %%IN_ORIG08
        vpaddd %%IN09_OUT00, %%IN_ORIG09
        vpaddd %%IN10_OUT06, %%IN_ORIG10
        vpaddd %%IN11_OUT11, %%IN_ORIG11

        ;; 32-bit transpose step: interleave dwords of adjacent registers
        vpunpckldq %%IN_ORIG00, %%IN08_OUT05, %%IN09_OUT00
        vpunpckhdq %%IN08_OUT05, %%IN08_OUT05, %%IN09_OUT00
        vpunpckldq %%IN04_OUT08, %%IN10_OUT06, %%IN11_OUT11
        vpunpckhdq %%IN10_OUT06, %%IN10_OUT06, %%IN11_OUT11

        ;; 64-bit transpose step: interleave qwords
        vpunpcklqdq %%IN09_OUT00, %%IN_ORIG00, %%IN04_OUT08
        vpunpckhqdq %%IN04_OUT08, %%IN_ORIG00, %%IN04_OUT08
        vpunpcklqdq %%IN11_OUT11, %%IN08_OUT05, %%IN10_OUT06
        vpunpckhqdq %%IN08_OUT05, %%IN08_OUT05, %%IN10_OUT06

        vpaddd %%IN12_OUT12, %%IN_ORIG12
        vpaddd %%IN13_OUT03, %%IN_ORIG13
        vpaddd %%IN14_OUT14, %%IN_ORIG14
        vpaddd %%IN15_OUT10, %%IN_ORIG15

        vpunpckldq %%IN_ORIG00, %%IN12_OUT12, %%IN13_OUT03
        vpunpckhdq %%IN12_OUT12, %%IN12_OUT12, %%IN13_OUT03
        vpunpckldq %%IN10_OUT06, %%IN14_OUT14, %%IN15_OUT10
        vpunpckhdq %%IN14_OUT14, %%IN14_OUT14, %%IN15_OUT10

        vpunpcklqdq %%IN13_OUT03, %%IN_ORIG00, %%IN10_OUT06
        vpunpckhqdq %%IN10_OUT06, %%IN_ORIG00, %%IN10_OUT06
        vpunpcklqdq %%IN15_OUT10, %%IN12_OUT12, %%IN14_OUT14
        vpunpckhqdq %%IN12_OUT12, %%IN12_OUT12, %%IN14_OUT14

        ;; 128-bit transpose step for lanes 8-15
        vshufi64x2 %%IN14_OUT14, %%IN09_OUT00, %%IN13_OUT03, 0x44
        vshufi64x2 %%IN09_OUT00, %%IN09_OUT00, %%IN13_OUT03, 0xee
        vshufi64x2 %%IN_ORIG00, %%IN04_OUT08, %%IN10_OUT06, 0x44
        vshufi64x2 %%IN10_OUT06, %%IN04_OUT08, %%IN10_OUT06, 0xee
        vshufi64x2 %%IN13_OUT03, %%IN11_OUT11, %%IN15_OUT10, 0x44
        vshufi64x2 %%IN11_OUT11, %%IN11_OUT11, %%IN15_OUT10, 0xee
        vshufi64x2 %%IN15_OUT10, %%IN08_OUT05, %%IN12_OUT12, 0x44
        vshufi64x2 %%IN08_OUT05, %%IN08_OUT05, %%IN12_OUT12, 0xee

        ;; Final merges combining the two 8-lane halves into the output
        ;; ordering (imm 0x88 = even 128-bit lanes, 0xdd = odd 128-bit lanes)
        vshufi64x2 %%IN12_OUT12, %%IN03_OUT04, %%IN09_OUT00, 0xdd
        vshufi64x2 %%IN04_OUT08, %%IN03_OUT04, %%IN09_OUT00, 0x88
        vshufi64x2 %%IN03_OUT04, %%IN_ORIG01, %%IN14_OUT14, 0xdd
        vshufi64x2 %%IN09_OUT00, %%IN_ORIG01, %%IN14_OUT14, 0x88
        vshufi64x2 %%IN14_OUT14, %%IN02_OUT15, %%IN11_OUT11, 0xdd
        vshufi64x2 %%IN_ORIG01, %%IN02_OUT15, %%IN11_OUT11, 0x88
        vshufi64x2 %%IN11_OUT11, %%IN00_OUT01, %%IN08_OUT05, 0x88
        vshufi64x2 %%IN02_OUT15, %%IN00_OUT01, %%IN08_OUT05, 0xdd
        vshufi64x2 %%IN00_OUT01, %%IN06_OUT13, %%IN_ORIG00, 0x88
        vshufi64x2 %%IN08_OUT05, %%IN06_OUT13, %%IN_ORIG00, 0xdd
        vshufi64x2 %%IN_ORIG00, %%IN01_OUT02, %%IN10_OUT06, 0x88
        vshufi64x2 %%IN06_OUT13, %%IN01_OUT02, %%IN10_OUT06, 0xdd
        vshufi64x2 %%IN01_OUT02, %%IN07_OUT07, %%IN13_OUT03, 0x88
        vshufi64x2 %%IN10_OUT06, %%IN07_OUT07, %%IN13_OUT03, 0xdd
        vshufi64x2 %%IN13_OUT03, %%IN05_OUT09, %%IN15_OUT10, 0x88
        vshufi64x2 %%IN07_OUT07, %%IN05_OUT09, %%IN15_OUT10, 0xdd

        ;; Move results parked in the ORIG temporaries into their output slots
        vmovdqa64 %%IN05_OUT09, %%IN_ORIG00
        vmovdqa64 %%IN15_OUT10, %%IN_ORIG01
%endmacro

;;
;; Generates 64*16 bytes of keystream
;; (runs 16 ChaCha20 states in parallel, "sliced" so that register N holds
;; dword N of all 16 states; after the rounds, the original state is added
;; back and the result transposed into 16 contiguous 64-byte blocks)
;;
%macro GENERATE_1K_KS 32
%define %%ZMM_DWORD0       %1  ;; [out] ZMM containing dword 0 of all states and bytes 64-127 of keystream
%define %%ZMM_DWORD1       %2  ;; [out] ZMM containing dword 1 of all states and bytes 128-191 of keystream
%define %%ZMM_DWORD2       %3  ;; [out] ZMM containing dword 2 of all states and bytes 960-1023 of keystream
%define %%ZMM_DWORD3       %4  ;; [out] ZMM containing dword 3 of all states and bytes 256-319 of keystream
%define %%ZMM_DWORD4       %5  ;; [out] ZMM containing dword 4 of all states and bytes 512-575 of keystream
%define %%ZMM_DWORD5       %6  ;; [out] ZMM containing dword 5 of all states and bytes 576-639 of keystream
%define %%ZMM_DWORD6       %7  ;; [out] ZMM containing dword 6 of all states and bytes 832-895 of keystream
%define %%ZMM_DWORD7       %8  ;; [out] ZMM containing dword 7 of all states and bytes 448-511 of keystream
%define %%ZMM_DWORD8       %9  ;; [out] ZMM containing dword 8 of all states and bytes 320-383 of keystream
%define %%ZMM_DWORD9       %10 ;; [out] ZMM containing dword 9 of all states and bytes 0-63 of keystream
%define %%ZMM_DWORD10      %11 ;; [out] ZMM containing dword 10 of all states and bytes 384-447 of keystream
%define %%ZMM_DWORD11      %12 ;; [out] ZMM containing dword 11 of all states and bytes 704-767 of keystream
%define %%ZMM_DWORD12      %13 ;; [out] ZMM containing dword 12 of all states and bytes 768-831 of keystream
%define %%ZMM_DWORD13      %14 ;; [out] ZMM containing dword 13 of all states and bytes 192-255 of keystream
%define %%ZMM_DWORD14      %15 ;; [out] ZMM containing dword 14 of all states and bytes 896-959 of keystream
%define %%ZMM_DWORD15      %16 ;; [out] ZMM containing dword 15 of all states and bytes 640-703 of keystream
%define %%ZMM_DWORD_ORIG0  %17 ;; [in/clobbered] ZMM containing dword 0 of all states / Temp ZMM register
%define %%ZMM_DWORD_ORIG1  %18 ;; [in/clobbered] ZMM containing dword 1 of all states / Temp ZMM register
%define %%ZMM_DWORD_ORIG2  %19 ;; [in] ZMM containing dword 2 of all states
%define %%ZMM_DWORD_ORIG3  %20 ;; [in] ZMM containing dword 3 of all states
%define %%ZMM_DWORD_ORIG4  %21 ;; [in] ZMM containing dword 4 of all states
%define %%ZMM_DWORD_ORIG5  %22 ;; [in] ZMM containing dword 5 of all states
%define %%ZMM_DWORD_ORIG6  %23 ;; [in] ZMM containing dword 6 of all states
%define %%ZMM_DWORD_ORIG7  %24 ;; [in] ZMM containing dword 7 of all states
%define %%ZMM_DWORD_ORIG8  %25 ;; [in] ZMM containing dword 8 of all states
%define %%ZMM_DWORD_ORIG9  %26 ;; [in] ZMM containing dword 9 of all states
%define %%ZMM_DWORD_ORIG10 %27 ;; [in] ZMM containing dword 10 of all states
%define %%ZMM_DWORD_ORIG11 %28 ;; [in] ZMM containing dword 11 of all states
%define %%ZMM_DWORD_ORIG12 %29 ;; [in] ZMM containing dword 12 of all states
%define %%ZMM_DWORD_ORIG13 %30 ;; [in] ZMM containing dword 13 of all states
%define %%ZMM_DWORD_ORIG14 %31 ;; [in] ZMM containing dword 14 of all states
%define %%ZMM_DWORD_ORIG15 %32 ;; [in] ZMM containing dword 15 of all states

        ;; Working copy of the 16 input state registers
%assign i 0
%rep 16
        vmovdqa64 APPEND(%%ZMM_DWORD, i), APPEND(%%ZMM_DWORD_ORIG, i)
%assign i (i + 1)
%endrep

        ;; 10 double rounds = 20 ChaCha20 rounds
%rep 10

        ;;; Each full round consists of 8 quarter rounds, 4 column rounds and 4 diagonal rounds
        ;;; For first 4 column rounds:
        ;;; A = 0, 1, 2, 3; B = 4, 5, 6, 7;
        ;;; C = 8, 9, 10, 11; D = 12, 13, 14, 15
        CHACHA20_ROUND %%ZMM_DWORD0, %%ZMM_DWORD1, %%ZMM_DWORD2, %%ZMM_DWORD3, \
                       %%ZMM_DWORD4, %%ZMM_DWORD5, %%ZMM_DWORD6, %%ZMM_DWORD7, \
                       %%ZMM_DWORD8, %%ZMM_DWORD9, %%ZMM_DWORD10, %%ZMM_DWORD11, \
                       %%ZMM_DWORD12, %%ZMM_DWORD13, %%ZMM_DWORD14, %%ZMM_DWORD15
        ;;; For 4 diagonal rounds:
        ;;; A = 0, 1, 2, 3; B = 5, 6, 7, 4;
        ;;; C = 10, 11, 8, 9; D = 15, 12, 13, 14
        CHACHA20_ROUND %%ZMM_DWORD0, %%ZMM_DWORD1, %%ZMM_DWORD2, %%ZMM_DWORD3, \
                       %%ZMM_DWORD5, %%ZMM_DWORD6, %%ZMM_DWORD7, %%ZMM_DWORD4, \
                       %%ZMM_DWORD10, %%ZMM_DWORD11, %%ZMM_DWORD8, %%ZMM_DWORD9, \
                       %%ZMM_DWORD15, %%ZMM_DWORD12, %%ZMM_DWORD13, %%ZMM_DWORD14
%endrep

        ;; Add original states to processed states and transpose
        ;; these states to form the 64*16 bytes of keystream
        ADD_TRANSPOSE_STATE_KS %%ZMM_DWORD0, %%ZMM_DWORD1, %%ZMM_DWORD2, %%ZMM_DWORD3, \
                               %%ZMM_DWORD4, %%ZMM_DWORD5, %%ZMM_DWORD6, %%ZMM_DWORD7, \
                               %%ZMM_DWORD8, %%ZMM_DWORD9, %%ZMM_DWORD10, %%ZMM_DWORD11, \
                               %%ZMM_DWORD12, %%ZMM_DWORD13, %%ZMM_DWORD14, %%ZMM_DWORD15, \
                               %%ZMM_DWORD_ORIG0, %%ZMM_DWORD_ORIG1, %%ZMM_DWORD_ORIG2, \
                               %%ZMM_DWORD_ORIG3,%%ZMM_DWORD_ORIG4, %%ZMM_DWORD_ORIG5, \
                               %%ZMM_DWORD_ORIG6, %%ZMM_DWORD_ORIG7, %%ZMM_DWORD_ORIG8, \
                               %%ZMM_DWORD_ORIG9, %%ZMM_DWORD_ORIG10, %%ZMM_DWORD_ORIG11, \
                               %%ZMM_DWORD_ORIG12, %%ZMM_DWORD_ORIG13, %%ZMM_DWORD_ORIG14, \
                               %%ZMM_DWORD_ORIG15
%endmacro

;;
;; XORs 1-16 keystream blocks against src and writes the result to dst;
;; the last block is read/written through %%KMASK so it may be partial.
;;
%macro ENCRYPT_1_16_BLOCKS 22
%define %%KS0        %1  ; [in/clobbered] Bytes 0-63 of keystream
%define %%KS1        %2  ; [in/clobbered] Bytes 64-127 of keystream
%define %%KS2        %3  ; [in/clobbered] Bytes 128-191 of keystream
%define %%KS3        %4  ; [in/clobbered] Bytes 192-255 of keystream
%define %%KS4        %5  ; [in/clobbered] Bytes 256-319 of keystream
%define %%KS5        %6  ; [in/clobbered] Bytes 320-383 of keystream
%define %%KS6        %7  ; [in/clobbered] Bytes 384-447 of keystream
%define %%KS7        %8  ; [in/clobbered] Bytes 448-511 of keystream
%define %%KS8        %9  ; [in/clobbered] Bytes 512-575 of keystream
%define %%KS9        %10 ; [in/clobbered] Bytes 576-639 of keystream
%define %%KS10       %11 ; [in/clobbered] Bytes 640-703 of keystream
%define %%KS11       %12 ; [in/clobbered] Bytes 704-767 of keystream
%define %%KS12       %13 ; [in/clobbered] Bytes 768-831 of keystream
%define %%KS13       %14 ; [in/clobbered] Bytes 832-895 of keystream
%define %%KS14       %15 ; [in/clobbered] Bytes 896-959 of keystream
%define %%KS15       %16 ; [in/clobbered] Bytes 960-1023 of keystream
%define %%ZTMP       %17 ; [clobbered] Temporary ZMM register
%define %%SRC        %18 ; [in] Source pointer
%define %%DST        %19 ; [in] Destination pointer
%define %%OFF        %20 ; [in] Offset into src/dst pointers
%define %%KMASK      %21 ; [in] Mask register for final block
%define %%NUM_BLOCKS %22 ; [in] Number of blocks to encrypt (numerical, 1-16)

        ; XOR Keystreams with blocks of input data
%assign %%I 0
%rep (%%NUM_BLOCKS - 1)
        vpxorq APPEND(%%KS, %%I), [%%SRC + %%OFF + 64*%%I]
%assign %%I (%%I + 1)
%endrep
        ; Final block which might have less than 64 bytes, so mask register is used
        vmovdqu8 %%ZTMP{%%KMASK}, [%%SRC + %%OFF + 64*%%I]
        vpxorq APPEND(%%KS, %%I), %%ZTMP

        ; Write out blocks of ciphertext
%assign %%I 0
%rep (%%NUM_BLOCKS - 1)
        vmovdqu8 [%%DST + %%OFF + 64*%%I], APPEND(%%KS, %%I)
%assign %%I (%%I + 1)
%endrep
        ; Final (possibly partial) block store under the same mask
        vmovdqu8 [%%DST + %%OFF + 64*%%I]{%%KMASK}, APPEND(%%KS, %%I)
%endmacro

;;
;; Builds 4 or 8 full ChaCha20 input states (constants/key/counter+nonce)
;; in row layout: each 128 bits of the A/B/C/D registers holds one state row.
;;
%macro PREPARE_NEXT_STATES_4_TO_8 15
%define %%STATE_IN_A_L  %1  ;; [out] ZMM containing state "A" part for states 1-4
%define %%STATE_IN_B_L  %2  ;; [out] ZMM containing state "B" part for states 1-4
%define %%STATE_IN_C_L  %3  ;; [out] ZMM containing state "C" part for states 1-4
%define %%STATE_IN_D_L  %4  ;; [out] ZMM containing state "D" part for states 1-4
%define %%STATE_IN_A_H  %5  ;; [out] ZMM containing state "A" part for states 5-8 (or "none" in NUM_BLOCKS == 4)
%define %%STATE_IN_B_H  %6  ;; [out] ZMM containing state "B" part for states 5-8 (or "none" in NUM_BLOCKS == 4)
%define %%STATE_IN_C_H  %7  ;; [out] ZMM containing state "C" part for states 5-8 (or "none" in NUM_BLOCKS == 4)
%define %%STATE_IN_D_H  %8  ;; [out] ZMM containing state "D" part for states 5-8 (or "none" in NUM_BLOCKS == 4)
%define %%ZTMP0         %9  ;; [clobbered] ZMM temp reg
%define %%ZTMP1         %10 ;; [clobbered] ZMM temp reg
%define %%LAST_BLK_CNT  %11 ;; [in] Last block counter
%define %%IV            %12 ;; [in] Pointer to IV
%define %%KEYS          %13 ;; [in/clobbered] Pointer to keys
%define %%KMASK         %14 ;; [clobbered] Mask register
%define %%NUM_BLOCKS    %15 ;; [in] Number of state blocks to prepare (numerical)

        ;; Prepare next 8 states (or 4, if 4 or less blocks left)
        vbroadcastf64x2 %%STATE_IN_B_L, [%%KEYS] ; Load key bytes 0-15
        vbroadcastf64x2 %%STATE_IN_C_L, [%%KEYS + 16] ; Load key bytes 16-31
        mov %%KEYS, 0xfff ; Reuse %%KEYS register, as it is not going to be used again
        kmovq %%KMASK, %%KEYS
        vmovdqu8 XWORD(%%STATE_IN_D_L){%%KMASK}, [%%IV] ; Load Nonce (12 bytes)
        ;; Shift nonce up 4 bytes: dword 0 stays zero so the per-block
        ;; counter can be OR'ed in below
        vpslldq XWORD(%%STATE_IN_D_L), 4
        vshufi64x2 %%STATE_IN_D_L, %%STATE_IN_D_L, 0 ; Broadcast 128 bits to 512 bits
        vbroadcastf64x2 %%STATE_IN_A_L, [rel constants] ; Row "A" = ChaCha constants

%if %%NUM_BLOCKS == 8
        ;; Prepare chacha states 4-7 (same constants/key/nonce; only the
        ;; block counters, OR'ed in below, differ)
        vmovdqa64 %%STATE_IN_A_H, %%STATE_IN_A_L
        vmovdqa64 %%STATE_IN_B_H, %%STATE_IN_B_L
        vmovdqa64 %%STATE_IN_C_H, %%STATE_IN_C_L
        vmovdqa64 %%STATE_IN_D_H, %%STATE_IN_D_L
%endif

        ; Broadcast last block counter
        vmovq XWORD(%%ZTMP0), %%LAST_BLK_CNT
        vshufi32x4 %%ZTMP0, %%ZTMP0, 0x00
%if %%NUM_BLOCKS == 4
        ; Add 1-4 to construct next block counters
        vpaddq %%ZTMP0, [rel add_1_4]
        ; OR merges counter into dword 0 of each state's "D" row
        ; (dword 0 is zero after the vpslldq above)
        vporq %%STATE_IN_D_L, %%ZTMP0
%else
        ; Add 1-8 to construct next block counters
        vmovdqa64 %%ZTMP1, %%ZTMP0
        vpaddq %%ZTMP0, [rel add_1_4]
        vpaddq %%ZTMP1, [rel add_5_8]
        vporq %%STATE_IN_D_L, %%ZTMP0
        vporq %%STATE_IN_D_H, %%ZTMP1
%endif
%endmacro

;;
;; ChaCha20 encrypt/decrypt job submission entry point.
;; Processes 64*16 bytes per main-loop iteration; tails of up to 16 blocks
;; are handled with dedicated 4/8/16-block keystream paths and a masked
;; final block.
;; NOTE(review): 'job' is a register alias %define'd earlier in this file
;; (outside this chunk) - presumed to map to the first-argument register
;; of the build's calling convention; confirm against the file header.
;;
align 32
MKGLOBAL(submit_job_chacha20_enc_dec_avx512,function,internal)
submit_job_chacha20_enc_dec_avx512:

;; Register allocation for this function
%define src  r8
%define dst  r9
%define len  r10
%define iv   r11
%define keys rdx
%define tmp  rdx  ; NOTE: aliases 'keys'; only reused after keys is dead
%define off  rax

        xor off, off

        ; k1 = all-ones: full 64-byte lanes until the final partial block
        ; (reloaded from len_to_mask before the tail is encrypted)
        mov tmp, 0xffffffffffffffff
        kmovq k1, tmp

        ; Fetch job parameters: length, source (plus cipher offset),
        ; destination, expanded keys and IV
        mov len, [job + _msg_len_to_cipher_in_bytes]
        mov src, [job + _src]
        add src, [job + _cipher_start_src_offset_in_bytes]
        mov dst, [job + _dst]
        mov keys, [job + _enc_keys]
        mov iv, [job + _iv]

        ; If less than or equal to 64*8 bytes, prepare directly states for up to 8 blocks
        cmp len, 64*8
        jbe exit_loop

        ; Prepare first 16 chacha20 states from IV, key, constants and counter values
        ; (column layout: zmmN holds dword N of all 16 states)
        vpbroadcastd zmm0, [rel constants]
        vpbroadcastd zmm1, [rel constants + 4]
        vpbroadcastd zmm2, [rel constants + 8]
        vpbroadcastd zmm3, [rel constants + 12]

        vpbroadcastd zmm4, [keys]
        vpbroadcastd zmm5, [keys + 4]
        vpbroadcastd zmm6, [keys + 8]
        vpbroadcastd zmm7, [keys + 12]
        vpbroadcastd zmm8, [keys + 16]
        vpbroadcastd zmm9, [keys + 20]
        vpbroadcastd zmm10, [keys + 24]
        vpbroadcastd zmm11, [keys + 28]

        vpbroadcastd zmm13, [iv]
        vpbroadcastd zmm14, [iv + 4]
        vpbroadcastd zmm15, [iv + 8]
        ;; Set first 16 counter values
        vmovdqa64 zmm12, [rel set_1_16]

        cmp len, 64*16
        jb exit_loop

align 32
start_loop:
        ; Encrypt 16 blocks (1 KB) per iteration
        ENCRYPT_1K zmm16, zmm17, zmm18, zmm19, zmm20, zmm21, zmm22, zmm23, \
                   zmm24, zmm25, zmm26, zmm27, zmm28, zmm29, zmm30, zmm31, \
                   zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, \
                   zmm9, zmm10, zmm11, zmm12, zmm13, zmm14, zmm15, src, dst, off

        ; Update remaining length
        sub len, 64*16
        add off, 64*16

        ; Reload first two registers zmm0 and 1,
        ; as they have been overwritten by the previous macros
        vpbroadcastd zmm0, [rel constants]
        vpbroadcastd zmm1, [rel constants + 4]

        ; Increment counter values
        vpaddd zmm12, [rel add_16]

        cmp len, 64*16
        jae start_loop

exit_loop:

        ; Check if there are partial block (less than 16*64 bytes)
        or len, len
        jz no_partial_block

        cmp len, 64*8
        ja more_than_8_blocks_left

        cmp len, 64*4
        ja more_than_4_blocks_left

        ;; up to 4 blocks left

        ; Get last block counter dividing offset by 64
        shr off, 6
        PREPARE_NEXT_STATES_4_TO_8 zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, \
                                   zmm8, zmm9, off, iv, keys, k2, 4
        shl off, 6 ; Restore offset

        ; Use same first 4 registers as the output of GENERATE_1K_KS,
        ; to be able to use common code later on to encrypt
        GENERATE_512_KS zmm25, zmm16, zmm17, zmm29, none, none, none, none, \
                        zmm0, zmm1, zmm2, zmm3, none, none, none, none, \
                        zmm8, zmm9, zmm10, zmm11, 4

        jmp ks_gen_done

more_than_4_blocks_left:
        ;; up to 8 blocks left

        ; Get last block counter dividing offset by 64
        shr off, 6
        ;; up to 8 blocks left
        PREPARE_NEXT_STATES_4_TO_8 zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, \
                                   zmm8, zmm9, off, iv, keys, k2, 8
        shl off, 6 ; Restore offset

        ; Use same first 8 registers as the output of GENERATE_1K_KS,
        ; to be able to use common code later on to encrypt
        GENERATE_512_KS zmm25, zmm16, zmm17, zmm29, zmm19, zmm24, zmm26, zmm23, \
                        zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, \
                        zmm8, zmm9, zmm10, zmm11, 8

        jmp ks_gen_done
more_than_8_blocks_left:
        ; Generate another 64*16 bytes of keystream and XOR only the leftover plaintext
        GENERATE_1K_KS zmm16, zmm17, zmm18, zmm19, zmm20, zmm21, zmm22, zmm23, \
                       zmm24, zmm25, zmm26, zmm27, zmm28, zmm29, zmm30, zmm31, \
                       zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, \
                       zmm9, zmm10, zmm11, zmm12, zmm13, zmm14, zmm15

ks_gen_done:

        ; Calculate number of final blocks (ceil(len / 64), 1-16 here)
        mov tmp, len
        add tmp, 63
        shr tmp, 6

        ; Branch tree dispatching to one of 16 unrolled tail encryptors
        cmp tmp, 8
        je final_num_blocks_is_8
        jb final_num_blocks_is_1_7

        ; Final blocks 9-16
        cmp tmp, 12
        je final_num_blocks_is_12
        jb final_num_blocks_is_9_11

        ; Final blocks 13-16
        cmp tmp, 14
        je final_num_blocks_is_14
        jb final_num_blocks_is_13

        cmp tmp, 15
        je final_num_blocks_is_15
        jmp final_num_blocks_is_16

final_num_blocks_is_9_11:
        cmp tmp, 10
        je final_num_blocks_is_10
        jb final_num_blocks_is_9
        ja final_num_blocks_is_11

final_num_blocks_is_1_7:
        ; Final blocks 1-7
        cmp tmp, 4
        je final_num_blocks_is_4
        jb final_num_blocks_is_1_3

        ; Final blocks 5-7
        cmp tmp, 6
        je final_num_blocks_is_6
        jb final_num_blocks_is_5
        ja final_num_blocks_is_7

final_num_blocks_is_1_3:
        cmp tmp, 2
        je final_num_blocks_is_2
        ja final_num_blocks_is_3

        ; 1 final block if no jump
%assign I 1
%rep 16
APPEND(final_num_blocks_is_, I):

        ; k1 = byte mask for the last block: len & 63 bytes
        ; (index 0 in len_to_mask is all-ones, i.e. a full 64-byte block)
        lea tmp, [rel len_to_mask]
        and len, 63
        kmovq k1, [tmp + len*8]

APPEND(no_mask_update, I):
        ENCRYPT_1_16_BLOCKS zmm25, zmm16, zmm17, zmm29, zmm19, zmm24, zmm26, zmm23, \
                            zmm20, zmm21, zmm31, zmm27, zmm28, zmm22, zmm30, zmm18, \
                            zmm0, src, dst, off, k1, I
        jmp no_partial_block

%assign I (I + 1)
%endrep

no_partial_block:

%ifdef SAFE_DATA
        ; Wipe ZMM registers holding key/keystream material
        clear_all_zmms_asm
%endif
        ; Mark the job as completed and return it
        mov rax, job
        or dword [rax + _status], STS_COMPLETED_AES

        ret

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif