;;
;; Copyright (c) 2020, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;
;; ChaCha20 encrypt/decrypt (SSE) and Poly1305 key generation.
;; NASM/Intel syntax. ABI selected at build time: Linux = SysV AMD64,
;; otherwise Microsoft x64 (see arg1/arg2 defines below).
;;

%include "include/os.asm"
%include "imb_job.asm"
%include "include/memcpy.asm"
%include "include/clear_regs.asm"

section .data
default rel

;; ChaCha20 constant dwords "expa" "nd 3" "2-by" "te k", each broadcast
;; across all 4 lanes for the 4-way parallel (column-vector) state layout
align 16
constants0:
dd      0x61707865, 0x61707865, 0x61707865, 0x61707865

align 16
constants1:
dd      0x3320646e, 0x3320646e, 0x3320646e, 0x3320646e

align 16
constants2:
dd      0x79622d32, 0x79622d32, 0x79622d32, 0x79622d32

align 16
constants3:
dd      0x6b206574, 0x6b206574, 0x6b206574, 0x6b206574

;; All 4 ChaCha20 constants in one XMM (single-state row layout)
align 16
constants:
dd      0x61707865, 0x3320646e, 0x79622d32, 0x6b206574

align 16
dword_1:
dd      0x00000001, 0x00000000, 0x00000000, 0x00000000

align 16
dword_2:
dd      0x00000002, 0x00000000, 0x00000000, 0x00000000

;; Initial block counters for the 4 parallel states (blocks 1..4)
align 16
dword_1_4:
dd      0x00000001, 0x00000002, 0x00000003, 0x00000004

;; Per-iteration counter increment (4 blocks = 256 bytes per loop pass)
align 16
dword_4:
dd      0x00000004, 0x00000004, 0x00000004, 0x00000004

;; pshufb mask: rotate each dword left by 8 bits
align 16
shuf_mask_rotl8:
db      3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14

;; pshufb mask: rotate each dword left by 16 bits
align 16
shuf_mask_rotl16:
db      2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13

;; Poly1305 "r" clamp mask (clears the bits RFC 8439 2.5.1 requires zero)
align 16
poly_clamp_r:
dq      0x0ffffffc0fffffff, 0x0ffffffc0ffffffc

;; Stack frame layout (allocated and 16-byte aligned at function entry)
struc STACK
_STATE:         reso    16      ; Space to store first 4 states
_XMM_SAVE:      reso    2       ; Space to store up to 2 temporary XMM registers
_RSP_SAVE:      resq    1       ; Space to store rsp pointer
endstruc
%define STACK_SIZE STACK_size

%ifdef LINUX
%define arg1    rdi
%define arg2    rsi
%else
%define arg1    rcx
%define arg2    rdx
%endif

%define job     arg1

%define APPEND(a,b) a %+ b

section .text

;; 4x4 32-bit transpose function
;; Rows in on r0-r3; columns out on t0 (col 0), r1 (col 1), r0 (col 2), r3 (col 3)
%macro TRANSPOSE4_U32 6
%define %%r0 %1 ;; [in/out] Input first row / output third column
%define %%r1 %2 ;; [in/out] Input second row / output second column
%define %%r2 %3 ;; [in/clobbered] Input third row
%define %%r3 %4 ;; [in/out] Input fourth row / output fourth column
%define %%t0 %5 ;; [out] Temporary XMM register / output first column
%define %%t1 %6 ;; [clobbered] Temporary XMM register

        movdqa  %%t0, %%r0
        shufps  %%t0, %%r1, 0x44        ; t0 = {b1 b0 a1 a0}
        shufps  %%r0, %%r1, 0xEE        ; r0 = {b3 b2 a3 a2}
        movdqa  %%t1, %%r2
        shufps  %%t1, %%r3, 0x44        ; t1 = {d1 d0 c1 c0}
        shufps  %%r2, %%r3, 0xEE        ; r2 = {d3 d2 c3 c2}

        movdqa  %%r1, %%t0
        shufps  %%r1, %%t1, 0xDD        ; r1 = {d1 c1 b1 a1}
        movdqa  %%r3, %%r0
        shufps  %%r3, %%r2, 0xDD        ; r3 = {d3 c3 b3 a3}
        shufps  %%r0, %%r2, 0x88        ; r0 = {d2 c2 b2 a2}
        shufps  %%t0, %%t1, 0x88        ; t0 = {d0 c0 b0 a0}
%endmacro

; Rotate dwords on a XMM registers to the left N_BITS
; Uses a single pshufb (XTMP untouched) for the 8/16-bit cases,
; shift+or (clobbering XTMP) otherwise
%macro PROLD 3
%define %%XMM_IN %1     ; [in/out] XMM register to be rotated
%define %%N_BITS %2     ; [immediate] Number of bits to rotate
%define %%XTMP   %3     ; [clobbered] XMM temporary register
%if %%N_BITS == 8
        pshufb  %%XMM_IN, [rel shuf_mask_rotl8]
%elif %%N_BITS == 16
        pshufb  %%XMM_IN, [rel shuf_mask_rotl16]
%else
        movdqa  %%XTMP, %%XMM_IN
        psrld   %%XTMP, (32-%%N_BITS)
        pslld   %%XMM_IN, %%N_BITS
        por     %%XMM_IN, %%XTMP
%endif
%endmacro

;;
;; Performs a quarter round on all 4 columns,
;; resulting in a full round
;;
%macro quarter_round 5
%define %%A    %1 ;; [in/out] XMM register containing value A of all 4 columns
%define %%B    %2 ;; [in/out] XMM register containing value B of all 4 columns
%define %%C    %3 ;; [in/out] XMM register containing value C of all 4 columns
%define %%D    %4 ;; [in/out] XMM register containing value D of all 4 columns
%define %%XTMP %5 ;; [clobbered] Temporary XMM register

        paddd   %%A, %%B
        pxor    %%D, %%A
        PROLD   %%D, 16, %%XTMP
        paddd   %%C, %%D
        pxor    %%B, %%C
        PROLD   %%B, 12, %%XTMP
        paddd   %%A, %%B
        pxor    %%D, %%A
        PROLD   %%D, 8, %%XTMP
        paddd   %%C, %%D
        pxor    %%B, %%C
        PROLD   %%B, 7, %%XTMP

%endmacro

;;
;; Same as quarter_round, operating on two independent state sets
;; ("low" and "high" block counts) with interleaved instructions
;;
%macro quarter_round_x2 9
%define %%A_L  %1 ;; [in/out] XMM register containing value A of all 4 columns (low set)
%define %%B_L  %2 ;; [in/out] XMM register containing value B of all 4 columns (low set)
%define %%C_L  %3 ;; [in/out] XMM register containing value C of all 4 columns (low set)
%define %%D_L  %4 ;; [in/out] XMM register containing value D of all 4 columns (low set)
%define %%A_H  %5 ;; [in/out] XMM register containing value A of all 4 columns (high set)
%define %%B_H  %6 ;; [in/out] XMM register containing value B of all 4 columns (high set)
%define %%C_H  %7 ;; [in/out] XMM register containing value C of all 4 columns (high set)
%define %%D_H  %8 ;; [in/out] XMM register containing value D of all 4 columns (high set)
%define %%XTMP %9 ;; [clobbered] Temporary XMM register

        paddd   %%A_L, %%B_L
        paddd   %%A_H, %%B_H
        pxor    %%D_L, %%A_L
        pxor    %%D_H, %%A_H
        PROLD   %%D_L, 16, %%XTMP
        PROLD   %%D_H, 16, %%XTMP
        paddd   %%C_L, %%D_L
        paddd   %%C_H, %%D_H
        pxor    %%B_L, %%C_L
        pxor    %%B_H, %%C_H
        PROLD   %%B_L, 12, %%XTMP
        PROLD   %%B_H, 12, %%XTMP
        paddd   %%A_L, %%B_L
        paddd   %%A_H, %%B_H
        pxor    %%D_L, %%A_L
        pxor    %%D_H, %%A_H
        PROLD   %%D_L, 8, %%XTMP
        PROLD   %%D_H, 8, %%XTMP
        paddd   %%C_L, %%D_L
        paddd   %%C_H, %%D_H
        pxor    %%B_L, %%C_L
        pxor    %%B_H, %%C_H
        PROLD   %%B_L, 7, %%XTMP
        PROLD   %%B_H, 7, %%XTMP

%endmacro

;;
;; Rotates the registers to prepare the data
;; from column round to diagonal round
;;
%macro column_to_diag 3
%define %%B %1 ;; [in/out] XMM register containing value B of all 4 columns
%define %%C %2 ;; [in/out] XMM register containing value C of all 4 columns
%define %%D %3 ;; [in/out] XMM register containing value D of all 4 columns

        pshufd  %%B, %%B, 0x39  ; 0b00111001 ;; 0,3,2,1
        pshufd  %%C, %%C, 0x4E  ; 0b01001110 ;; 1,0,3,2
        pshufd  %%D, %%D, 0x93  ; 0b10010011 ;; 2,1,0,3

%endmacro

;;
;; Rotates the registers to prepare the data
;; from diagonal round to column round
;; (inverse of column_to_diag)
;;
%macro diag_to_column 3
%define %%B %1 ;; [in/out] XMM register containing value B of all 4 columns
%define %%C %2 ;; [in/out] XMM register containing value C of all 4 columns
%define %%D %3 ;; [in/out] XMM register containing value D of all 4 columns

        pshufd  %%B, %%B, 0x93  ; 0b10010011 ; 2,1,0,3
        pshufd  %%C, %%C, 0x4E  ; 0b01001110 ; 1,0,3,2
        pshufd  %%D, %%D, 0x39  ; 0b00111001 ; 0,3,2,1

%endmacro

;;
;; Generates 64 or 128 bytes of keystream
;; States IN A-C are the same for first 64 and last 64 bytes
;; State IN D differ because of the different block count
;; With 9 args a single 64-byte block is produced; with 14 args two
;; blocks (128 bytes) are computed with interleaved quarter rounds.
;; 10 iterations of (column round + diagonal round) = 20 ChaCha rounds.
;;
%macro GENERATE_64_128_KS 9-14
%define %%STATE_IN_A   %1  ;; [in] XMM containing state A
%define %%STATE_IN_B   %2  ;; [in] XMM containing state B
%define %%STATE_IN_C   %3  ;; [in] XMM containing state C
%define %%STATE_IN_D_L %4  ;; [in] XMM containing state D (low block count)
%define %%A_L_KS0      %5  ;; [out] XMM to contain keystream 0-15 bytes
%define %%B_L_KS1      %6  ;; [out] XMM to contain keystream 16-31 bytes
%define %%C_L_KS2      %7  ;; [out] XMM to contain keystream 32-47 bytes
%define %%D_L_KS3      %8  ;; [out] XMM to contain keystream 48-63 bytes
%define %%XTMP         %9  ;; [clobbered] Temporary XMM register
%define %%STATE_IN_D_H %10 ;; [in] XMM containing state D (high block count)
%define %%A_H_KS4      %11 ;; [out] XMM to contain keystream 64-79 bytes
%define %%B_H_KS5      %12 ;; [out] XMM to contain keystream 80-95 bytes
%define %%C_H_KS6      %13 ;; [out] XMM to contain keystream 96-111 bytes
%define %%D_H_KS7      %14 ;; [out] XMM to contain keystream 112-127 bytes

        movdqa  %%A_L_KS0, %%STATE_IN_A
        movdqa  %%B_L_KS1, %%STATE_IN_B
        movdqa  %%C_L_KS2, %%STATE_IN_C
        movdqa  %%D_L_KS3, %%STATE_IN_D_L
%if %0 == 14
        movdqa  %%A_H_KS4, %%STATE_IN_A
        movdqa  %%B_H_KS5, %%STATE_IN_B
        movdqa  %%C_H_KS6, %%STATE_IN_C
        movdqa  %%D_H_KS7, %%STATE_IN_D_H
%endif
%rep 10
%if %0 == 14
        quarter_round_x2 %%A_L_KS0, %%B_L_KS1, %%C_L_KS2, %%D_L_KS3, \
                         %%A_H_KS4, %%B_H_KS5, %%C_H_KS6, %%D_H_KS7, %%XTMP
        column_to_diag %%B_L_KS1, %%C_L_KS2, %%D_L_KS3
        column_to_diag %%B_H_KS5, %%C_H_KS6, %%D_H_KS7
        quarter_round_x2 %%A_L_KS0, %%B_L_KS1, %%C_L_KS2, %%D_L_KS3, \
                         %%A_H_KS4, %%B_H_KS5, %%C_H_KS6, %%D_H_KS7, %%XTMP
        diag_to_column %%B_L_KS1, %%C_L_KS2, %%D_L_KS3
        diag_to_column %%B_H_KS5, %%C_H_KS6, %%D_H_KS7
%else
        quarter_round %%A_L_KS0, %%B_L_KS1, %%C_L_KS2, %%D_L_KS3, %%XTMP
        column_to_diag %%B_L_KS1, %%C_L_KS2, %%D_L_KS3
        quarter_round %%A_L_KS0, %%B_L_KS1, %%C_L_KS2, %%D_L_KS3, %%XTMP
        diag_to_column %%B_L_KS1, %%C_L_KS2, %%D_L_KS3
%endif
%endrep

        ; Final addition of the input state to the working state
        paddd   %%A_L_KS0, %%STATE_IN_A
        paddd   %%B_L_KS1, %%STATE_IN_B
        paddd   %%C_L_KS2, %%STATE_IN_C
        paddd   %%D_L_KS3, %%STATE_IN_D_L
%if %0 == 14
        paddd   %%A_H_KS4, %%STATE_IN_A
        paddd   %%B_H_KS5, %%STATE_IN_B
        paddd   %%C_H_KS6, %%STATE_IN_C
        paddd   %%D_H_KS7, %%STATE_IN_D_H
%endif
%endmacro

; Perform 4 times the operation in first parameter
%macro XMM_OP_X4 9
%define %%OP         %1 ; [immediate] Instruction
%define %%DST_SRC1_1 %2 ; [in/out] First source/Destination 1
%define %%DST_SRC1_2 %3 ; [in/out] First source/Destination 2
%define %%DST_SRC1_3 %4 ; [in/out] First source/Destination 3
%define %%DST_SRC1_4 %5 ; [in/out] First source/Destination 4
%define %%SRC2_1     %6 ; [in] Second source 1
%define %%SRC2_2     %7 ; [in] Second source 2
%define %%SRC2_3     %8 ; [in] Second source 3
%define %%SRC2_4     %9 ; [in] Second source 4

        %%OP    %%DST_SRC1_1, %%SRC2_1
        %%OP    %%DST_SRC1_2, %%SRC2_2
        %%OP    %%DST_SRC1_3, %%SRC2_3
        %%OP    %%DST_SRC1_4, %%SRC2_4
%endmacro

; Rotate 4 XMM registers left by BITS_TO_ROTATE, preserving XTMP's
; contents (via stack spill) when PROLD would clobber it
%macro XMM_ROLS_X4 6
%define %%XMM_OP1_1      %1 ; [in/out] First register to rotate
%define %%XMM_OP1_2      %2 ; [in/out] Second register to rotate
%define %%XMM_OP1_3      %3 ; [in/out] Third register to rotate
%define %%XMM_OP1_4      %4 ; [in/out] Fourth register to rotate
%define %%BITS_TO_ROTATE %5 ; [immediate] Number of bits to rotate left
%define %%XTMP           %6 ; [in/preserved] Temporary XMM (live data, saved/restored if needed)

        ; Store temporary register when bits to rotate is not 8 and 16,
        ; as the register will be clobbered in these cases,
        ; containing needed information
%if %%BITS_TO_ROTATE != 8 && %%BITS_TO_ROTATE != 16
        movdqa  [rsp + _XMM_SAVE], %%XTMP
%endif
        PROLD   %%XMM_OP1_1, %%BITS_TO_ROTATE, %%XTMP
        PROLD   %%XMM_OP1_2, %%BITS_TO_ROTATE, %%XTMP
        PROLD   %%XMM_OP1_3, %%BITS_TO_ROTATE, %%XTMP
        PROLD   %%XMM_OP1_4, %%BITS_TO_ROTATE, %%XTMP
%if %%BITS_TO_ROTATE != 8 && %%BITS_TO_ROTATE != 16
        movdqa  %%XTMP, [rsp + _XMM_SAVE]
%endif
%endmacro

;;
;; Performs a full chacha20 round on 4 states,
;; consisting of 4 quarter rounds, which are done in parallel
;; (all 16 XMM registers are in use; B1/D1 double as scratch for the rotates)
;;
%macro CHACHA20_ROUND 16
%define %%XMM_DWORD_A1 %1  ;; [in/out] XMM register containing dword A for first quarter round
%define %%XMM_DWORD_A2 %2  ;; [in/out] XMM register containing dword A for second quarter round
%define %%XMM_DWORD_A3 %3  ;; [in/out] XMM register containing dword A for third quarter round
%define %%XMM_DWORD_A4 %4  ;; [in/out] XMM register containing dword A for fourth quarter round
%define %%XMM_DWORD_B1 %5  ;; [in/out] XMM register containing dword B for first quarter round
%define %%XMM_DWORD_B2 %6  ;; [in/out] XMM register containing dword B for second quarter round
%define %%XMM_DWORD_B3 %7  ;; [in/out] XMM register containing dword B for third quarter round
%define %%XMM_DWORD_B4 %8  ;; [in/out] XMM register containing dword B for fourth quarter round
%define %%XMM_DWORD_C1 %9  ;; [in/out] XMM register containing dword C for first quarter round
%define %%XMM_DWORD_C2 %10 ;; [in/out] XMM register containing dword C for second quarter round
%define %%XMM_DWORD_C3 %11 ;; [in/out] XMM register containing dword C for third quarter round
%define %%XMM_DWORD_C4 %12 ;; [in/out] XMM register containing dword C for fourth quarter round
%define %%XMM_DWORD_D1 %13 ;; [in/out] XMM register containing dword D for first quarter round
%define %%XMM_DWORD_D2 %14 ;; [in/out] XMM register containing dword D for second quarter round
%define %%XMM_DWORD_D3 %15 ;; [in/out] XMM register containing dword D for third quarter round
%define %%XMM_DWORD_D4 %16 ;; [in/out] XMM register containing dword D for fourth quarter round

        ; A += B
        XMM_OP_X4 paddd, %%XMM_DWORD_A1, %%XMM_DWORD_A2, %%XMM_DWORD_A3, %%XMM_DWORD_A4, \
                         %%XMM_DWORD_B1, %%XMM_DWORD_B2, %%XMM_DWORD_B3, %%XMM_DWORD_B4
        ; D ^= A
        XMM_OP_X4 pxor, %%XMM_DWORD_D1, %%XMM_DWORD_D2, %%XMM_DWORD_D3, %%XMM_DWORD_D4, \
                        %%XMM_DWORD_A1, %%XMM_DWORD_A2, %%XMM_DWORD_A3, %%XMM_DWORD_A4

        ; D <<< 16
        XMM_ROLS_X4 %%XMM_DWORD_D1, %%XMM_DWORD_D2, %%XMM_DWORD_D3, %%XMM_DWORD_D4, 16, \
                    %%XMM_DWORD_B1

        ; C += D
        XMM_OP_X4 paddd, %%XMM_DWORD_C1, %%XMM_DWORD_C2, %%XMM_DWORD_C3, %%XMM_DWORD_C4, \
                         %%XMM_DWORD_D1, %%XMM_DWORD_D2, %%XMM_DWORD_D3, %%XMM_DWORD_D4
        ; B ^= C
        XMM_OP_X4 pxor, %%XMM_DWORD_B1, %%XMM_DWORD_B2, %%XMM_DWORD_B3, %%XMM_DWORD_B4, \
                        %%XMM_DWORD_C1, %%XMM_DWORD_C2, %%XMM_DWORD_C3, %%XMM_DWORD_C4

        ; B <<< 12
        XMM_ROLS_X4 %%XMM_DWORD_B1, %%XMM_DWORD_B2, %%XMM_DWORD_B3, %%XMM_DWORD_B4, 12, \
                    %%XMM_DWORD_D1

        ; A += B
        XMM_OP_X4 paddd, %%XMM_DWORD_A1, %%XMM_DWORD_A2, %%XMM_DWORD_A3, %%XMM_DWORD_A4, \
                         %%XMM_DWORD_B1, %%XMM_DWORD_B2, %%XMM_DWORD_B3, %%XMM_DWORD_B4
        ; D ^= A
        XMM_OP_X4 pxor, %%XMM_DWORD_D1, %%XMM_DWORD_D2, %%XMM_DWORD_D3, %%XMM_DWORD_D4, \
                        %%XMM_DWORD_A1, %%XMM_DWORD_A2, %%XMM_DWORD_A3, %%XMM_DWORD_A4

        ; D <<< 8
        XMM_ROLS_X4 %%XMM_DWORD_D1, %%XMM_DWORD_D2, %%XMM_DWORD_D3, %%XMM_DWORD_D4, 8, \
                    %%XMM_DWORD_B1

        ; C += D
        XMM_OP_X4 paddd, %%XMM_DWORD_C1, %%XMM_DWORD_C2, %%XMM_DWORD_C3, %%XMM_DWORD_C4, \
                         %%XMM_DWORD_D1, %%XMM_DWORD_D2, %%XMM_DWORD_D3, %%XMM_DWORD_D4
        ; B ^= C
        XMM_OP_X4 pxor, %%XMM_DWORD_B1, %%XMM_DWORD_B2, %%XMM_DWORD_B3, %%XMM_DWORD_B4, \
                        %%XMM_DWORD_C1, %%XMM_DWORD_C2, %%XMM_DWORD_C3, %%XMM_DWORD_C4

        ; B <<< 7
        XMM_ROLS_X4 %%XMM_DWORD_B1, %%XMM_DWORD_B2, %%XMM_DWORD_B3, %%XMM_DWORD_B4, 7, \
                    %%XMM_DWORD_D1
%endmacro

;;
;; Encodes 4 Chacha20 states, outputting 256 bytes of keystream
;; Data still needs to be transposed to get the keystream in the correct order
;; Reads the 4 pre-built states from [rsp + _STATE]; 10 double rounds
;; (column + diagonal, via argument rotation on the second call) = 20 rounds.
;;
%macro GENERATE_256_KS 16
%define %%XMM_DWORD_0  %1  ;; [out] XMM register to contain encoded dword 0 of the 4 Chacha20 states
%define %%XMM_DWORD_1  %2  ;; [out] XMM register to contain encoded dword 1 of the 4 Chacha20 states
%define %%XMM_DWORD_2  %3  ;; [out] XMM register to contain encoded dword 2 of the 4 Chacha20 states
%define %%XMM_DWORD_3  %4  ;; [out] XMM register to contain encoded dword 3 of the 4 Chacha20 states
%define %%XMM_DWORD_4  %5  ;; [out] XMM register to contain encoded dword 4 of the 4 Chacha20 states
%define %%XMM_DWORD_5  %6  ;; [out] XMM register to contain encoded dword 5 of the 4 Chacha20 states
%define %%XMM_DWORD_6  %7  ;; [out] XMM register to contain encoded dword 6 of the 4 Chacha20 states
%define %%XMM_DWORD_7  %8  ;; [out] XMM register to contain encoded dword 7 of the 4 Chacha20 states
%define %%XMM_DWORD_8  %9  ;; [out] XMM register to contain encoded dword 8 of the 4 Chacha20 states
%define %%XMM_DWORD_9  %10 ;; [out] XMM register to contain encoded dword 9 of the 4 Chacha20 states
%define %%XMM_DWORD_10 %11 ;; [out] XMM register to contain encoded dword 10 of the 4 Chacha20 states
%define %%XMM_DWORD_11 %12 ;; [out] XMM register to contain encoded dword 11 of the 4 Chacha20 states
%define %%XMM_DWORD_12 %13 ;; [out] XMM register to contain encoded dword 12 of the 4 Chacha20 states
%define %%XMM_DWORD_13 %14 ;; [out] XMM register to contain encoded dword 13 of the 4 Chacha20 states
%define %%XMM_DWORD_14 %15 ;; [out] XMM register to contain encoded dword 14 of the 4 Chacha20 states
%define %%XMM_DWORD_15 %16 ;; [out] XMM register to contain encoded dword 15 of the 4 Chacha20 states

        ; Load the 4 saved input states
%assign i 0
%rep 16
        movdqa  APPEND(%%XMM_DWORD_, i), [rsp + _STATE + 16*i]
%assign i (i + 1)
%endrep

%rep 10
        ; Column round
        CHACHA20_ROUND %%XMM_DWORD_0, %%XMM_DWORD_1, %%XMM_DWORD_2, %%XMM_DWORD_3, \
                       %%XMM_DWORD_4, %%XMM_DWORD_5, %%XMM_DWORD_6, %%XMM_DWORD_7, \
                       %%XMM_DWORD_8, %%XMM_DWORD_9, %%XMM_DWORD_10, %%XMM_DWORD_11, \
                       %%XMM_DWORD_12, %%XMM_DWORD_13, %%XMM_DWORD_14, %%XMM_DWORD_15

        ; Diagonal round (same kernel with rotated register arguments)
        CHACHA20_ROUND %%XMM_DWORD_0, %%XMM_DWORD_1, %%XMM_DWORD_2, %%XMM_DWORD_3, \
                       %%XMM_DWORD_5, %%XMM_DWORD_6, %%XMM_DWORD_7, %%XMM_DWORD_4, \
                       %%XMM_DWORD_10, %%XMM_DWORD_11, %%XMM_DWORD_8, %%XMM_DWORD_9, \
                       %%XMM_DWORD_15, %%XMM_DWORD_12, %%XMM_DWORD_13, %%XMM_DWORD_14
%endrep

        ; Final addition of the input states
%assign i 0
%rep 16
        paddd   APPEND(%%XMM_DWORD_, i), [rsp + _STATE + 16*i]
%assign i (i + 1)
%endrep
%endmacro

;;
;; IMB_JOB *submit_job_chacha20_enc_dec_sse(IMB_JOB *job)
;; In:    job (arg1) - IMB_JOB pointer (src/dst/len/key/IV read from job fields)
;; Out:   rax = job, with _status |= STS_COMPLETED_AES
;; Notes: processes 256 bytes/iteration with 4 parallel states, then
;;        128/64-byte tails and a final partial (<16 B) block.
;;        Uses a private aligned stack frame (STACK struc); original rsp
;;        saved in _RSP_SAVE.
align 32
MKGLOBAL(submit_job_chacha20_enc_dec_sse,function,internal)
submit_job_chacha20_enc_dec_sse:

%define src     r8
%define dst     r9
%define len     r10
%define iv      r11
%define keys    rdx
%define off     rax
%define tmp     iv      ; iv no longer needed once states are built
%define tmp2    keys    ; keys no longer needed once states are built

        ; Read pointers and length
        mov     len, [job + _msg_len_to_cipher_in_bytes]

        ; Check if there is nothing to encrypt
        or      len, len
        jz      exit            ; note: jumps out before the stack frame is set up

        mov     keys, [job + _enc_keys]
        mov     iv, [job + _iv]
        mov     src, [job + _src]
        add     src, [job + _cipher_start_src_offset_in_bytes]
        mov     dst, [job + _dst]

        ; Build 16-byte-aligned stack frame, saving the original rsp
        mov     rax, rsp
        sub     rsp, STACK_SIZE
        and     rsp, -16
        mov     [rsp + _RSP_SAVE], rax  ; save RSP

        xor     off, off

        ; If less than or equal to 64*2 bytes, prepare directly states for
        ; up to 2 blocks
        cmp     len, 64*2
        jbe     check_1_or_2_blocks_left

        ; Prepare first 4 chacha states
        movdqa  xmm0, [rel constants0]
        movdqa  xmm1, [rel constants1]
        movdqa  xmm2, [rel constants2]
        movdqa  xmm3, [rel constants3]

        ; Broadcast 8 dwords from key into XMM4-11
        movdqu  xmm12, [keys]
        movdqu  xmm15, [keys + 16]
        pshufd  xmm4, xmm12, 0x0
        pshufd  xmm5, xmm12, 0x55
        pshufd  xmm6, xmm12, 0xAA
        pshufd  xmm7, xmm12, 0xFF
        pshufd  xmm8, xmm15, 0x0
        pshufd  xmm9, xmm15, 0x55
        pshufd  xmm10, xmm15, 0xAA
        pshufd  xmm11, xmm15, 0xFF

        ; Broadcast 3 dwords from IV into XMM13-15
        movd    xmm13, [iv]
        movd    xmm14, [iv + 4]
        pshufd  xmm13, xmm13, 0
        pshufd  xmm14, xmm14, 0
        movd    xmm15, [iv + 8]
        pshufd  xmm15, xmm15, 0

        ; Set block counters for first 4 Chacha20 states
        movdqa  xmm12, [rel dword_1_4]

        ; Save the 16 state words (xmm0-15) to the stack frame
%assign i 0
%rep 16
        movdqa  [rsp + _STATE + 16*i], xmm %+ i
%assign i (i + 1)
%endrep

        cmp     len, 64*4
        jb      exit_loop

align 32
start_loop:

        ; Generate 256 bytes of keystream
        GENERATE_256_KS xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, \
                        xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15

        ;; Transpose state to get keystream and XOR with plaintext
        ;; to get ciphertext

        ; Save registers to be used as temp registers
        movdqa  [rsp + _XMM_SAVE], xmm14
        movdqa  [rsp + _XMM_SAVE + 16], xmm15

        ; Transpose to get 0-63 bytes of KS
        TRANSPOSE4_U32 xmm0, xmm1, xmm2, xmm3, xmm14, xmm15

        ; KS columns now in xmm14, xmm1, xmm0, xmm3
        ; xmm2, xmm15 free to use
        movdqu  xmm2, [src + off]
        movdqu  xmm15, [src + off + 16*4]
        pxor    xmm14, xmm2
        pxor    xmm1, xmm15
        movdqu  [dst + off], xmm14
        movdqu  [dst + off + 16*4], xmm1

        movdqu  xmm2, [src + off + 16*8]
        movdqu  xmm15, [src + off + 16*12]
        pxor    xmm0, xmm2
        pxor    xmm3, xmm15
        movdqu  [dst + off + 16*8], xmm0
        movdqu  [dst + off + 16*12], xmm3

        ; Restore registers and use xmm0, xmm1 now that they are free
        movdqa  xmm14, [rsp + _XMM_SAVE]
        movdqa  xmm15, [rsp + _XMM_SAVE + 16]

        ; Transpose to get bytes 64-127 of KS
        TRANSPOSE4_U32 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1

        ; KS columns now in xmm0, xmm5, xmm4, xmm7
        ; xmm6, xmm1 free to use
        movdqu  xmm6, [src + off + 16]
        movdqu  xmm1, [src + off + 16*5]
        pxor    xmm0, xmm6
        pxor    xmm5, xmm1
        movdqu  [dst + off + 16], xmm0
        movdqu  [dst + off + 16*5], xmm5

        movdqu  xmm6, [src + off + 16*9]
        movdqu  xmm1, [src + off + 16*13]
        pxor    xmm4, xmm6
        pxor    xmm7, xmm1
        movdqu  [dst + off + 16*9], xmm4
        movdqu  [dst + off + 16*13], xmm7

        ; Transpose to get bytes 128-191 of KS
        TRANSPOSE4_U32 xmm8, xmm9, xmm10, xmm11, xmm0, xmm1

        ; KS columns now in xmm0, xmm9, xmm8, xmm11
        ; xmm10, xmm1 free to use
        movdqu  xmm10, [src + off + 16*2]
        movdqu  xmm1, [src + off + 16*6]
        pxor    xmm0, xmm10
        pxor    xmm9, xmm1
        movdqu  [dst + off + 16*2], xmm0
        movdqu  [dst + off + 16*6], xmm9

        movdqu  xmm10, [src + off + 16*10]
        movdqu  xmm1, [src + off + 16*14]
        pxor    xmm8, xmm10
        pxor    xmm11, xmm1
        movdqu  [dst + off + 16*10], xmm8
        movdqu  [dst + off + 16*14], xmm11

        ; Transpose to get bytes 192-255 of KS
        TRANSPOSE4_U32 xmm12, xmm13, xmm14, xmm15, xmm0, xmm1

        ; KS columns now in xmm0, xmm13, xmm12, xmm15
        ; xmm14, xmm1 free to use
        movdqu  xmm14, [src + off + 16*3]
        movdqu  xmm1, [src + off + 16*7]
        pxor    xmm0, xmm14
        pxor    xmm13, xmm1
        movdqu  [dst + off + 16*3], xmm0
        movdqu  [dst + off + 16*7], xmm13

        movdqu  xmm14, [src + off + 16*11]
        movdqu  xmm1, [src + off + 16*15]
        pxor    xmm12, xmm14
        pxor    xmm15, xmm1
        movdqu  [dst + off + 16*11], xmm12
        movdqu  [dst + off + 16*15], xmm15

        ; Update remaining length
        sub     len, 64*4
        add     off, 64*4

        ; Update counter values (state word 12; _STATE is at offset 0)
        movdqa  xmm12, [rsp + 16*12]
        paddd   xmm12, [rel dword_4]
        movdqa  [rsp + 16*12], xmm12

        cmp     len, 64*4
        jae     start_loop

exit_loop:

        ; Check if there are no more bytes to encrypt
        or      len, len
        jz      no_partial_block

        cmp     len, 64*2
        ja      more_than_2_blocks_left

check_1_or_2_blocks_left:
        cmp     len, 64
        ja      two_blocks_left

        ;; 1 block left

        ; Get last block counter dividing offset by 64
        shr     off, 6

        ; Prepare next chacha state from IV, key
        movdqu  xmm1, [keys]            ; Load key bytes 0-15
        movdqu  xmm2, [keys + 16]       ; Load key bytes 16-31
        ; Read nonce (12 bytes)
        movq    xmm3, [iv]
        pinsrd  xmm3, [iv + 8], 2
        pslldq  xmm3, 4                 ; make room for the counter in dword 0
        movdqa  xmm0, [rel constants]

        ; Insert next block count
        inc     DWORD(off)
        movd    xmm4, DWORD(off)
        por     xmm3, xmm4
        dec     DWORD(off)
        shl     off, 6                  ; Restore offset

        ; Generate 64 bytes of keystream
        GENERATE_64_128_KS xmm0, xmm1, xmm2, xmm3, xmm9, xmm10, xmm11, \
                           xmm12, xmm13

        cmp     len, 64
        jne     less_than_64

        ;; Exactly 64 bytes left

        ; Load plaintext, XOR with KS and store ciphertext
        movdqu  xmm14, [src + off]
        movdqu  xmm15, [src + off + 16]
        pxor    xmm14, xmm9
        pxor    xmm15, xmm10
        movdqu  [dst + off], xmm14
        movdqu  [dst + off + 16], xmm15

        movdqu  xmm14, [src + off + 16*2]
        movdqu  xmm15, [src + off + 16*3]
        pxor    xmm14, xmm11
        pxor    xmm15, xmm12
        movdqu  [dst + off + 16*2], xmm14
        movdqu  [dst + off + 16*3], xmm15

        jmp     no_partial_block

less_than_64:
        ; KS is in xmm9-12; consume 48/32/16 bytes, leaving the next
        ; unused KS register in xmm9 for the final partial block

        cmp     len, 48
        jb      less_than_48

        ; Load plaintext and XOR with keystream
        movdqu  xmm13, [src + off]
        movdqu  xmm14, [src + off + 16]
        movdqu  xmm15, [src + off + 32]

        pxor    xmm13, xmm9
        pxor    xmm14, xmm10
        pxor    xmm15, xmm11

        ; Store resulting ciphertext
        movdqu  [dst + off], xmm13
        movdqu  [dst + off + 16], xmm14
        movdqu  [dst + off + 32], xmm15

        ; Store last KS in xmm9, for partial block
        movdqu  xmm9, xmm12

        sub     len, 48
        add     off, 48

        jmp     check_partial

less_than_48:
        cmp     len, 32
        jb      less_than_32

        ; Load plaintext and XOR with keystream
        movdqu  xmm13, [src + off]
        movdqu  xmm14, [src + off + 16]

        pxor    xmm13, xmm9
        pxor    xmm14, xmm10

        ; Store resulting ciphertext
        movdqu  [dst + off], xmm13
        movdqu  [dst + off + 16], xmm14

        ; Store last KS in xmm9, for partial block
        movdqu  xmm9, xmm11

        sub     len, 32
        add     off, 32

        jmp     check_partial

less_than_32:
        cmp     len, 16
        jb      check_partial

        ; Load plaintext and XOR with keystream
        movdqu  xmm13, [src + off]

        pxor    xmm13, xmm9

        ; Store resulting ciphertext
        movdqu  [dst + off], xmm13

        ; Store last KS in xmm9, for partial block
        movdqu  xmm9, xmm10

        sub     len, 16
        add     off, 16

check_partial:
        ; Handle the final 1-15 bytes (KS in xmm9)
        or      len, len
        jz      no_partial_block

        add     src, off
        add     dst, off
        ; Load plaintext
        simd_load_sse_15_1 xmm8, src, len

        ; XOR KS with plaintext and store resulting ciphertext
        pxor    xmm8, xmm9

        simd_store_sse_15 dst, xmm8, len, tmp, tmp2

        jmp     no_partial_block

two_blocks_left:

        ; Get last block counter dividing offset by 64
        shr     off, 6

        ; Prepare next 2 chacha states from IV, key
        movdqu  xmm1, [keys]            ; Load key bytes 0-15
        movdqu  xmm2, [keys + 16]       ; Load key bytes 16-31
        ; Read nonce (12 bytes)
        movq    xmm3, [iv]
        pinsrd  xmm3, [iv + 8], 2
        pslldq  xmm3, 4                 ; make room for the counter in dword 0

        movdqa  xmm0, [rel constants]

        movdqa  xmm8, xmm3              ; second state D (gets counter+2)

        ; Insert next block counts
        inc     DWORD(off)
        movd    xmm4, DWORD(off)
        por     xmm3, xmm4
        inc     DWORD(off)
        movd    xmm5, DWORD(off)
        por     xmm8, xmm5
        sub     off, 2                  ; undo the two increments
        shl     off, 6                  ; Restore offset

        ; Generate 128 bytes of keystream
        ; (KS 0-63 -> xmm4-7, KS 64-127 -> xmm9-12; xmm13 is scratch)
        GENERATE_64_128_KS xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, \
                           xmm13, xmm8, xmm9, xmm10, xmm11, xmm12

        cmp     len, 128
        jb      between_64_127

        ; Load plaintext, XOR with KS and store ciphertext
        movdqu  xmm14, [src + off]
        movdqu  xmm15, [src + off + 16]
        pxor    xmm14, xmm4
        pxor    xmm15, xmm5
        movdqu  [dst + off], xmm14
        movdqu  [dst + off + 16], xmm15

        movdqu  xmm14, [src + off + 16*2]
        movdqu  xmm15, [src + off + 16*3]
        pxor    xmm14, xmm6
        pxor    xmm15, xmm7
        movdqu  [dst + off + 16*2], xmm14
        movdqu  [dst + off + 16*3], xmm15

        movdqu  xmm14, [src + off + 16*4]
        movdqu  xmm15, [src + off + 16*5]
        pxor    xmm14, xmm9
        pxor    xmm15, xmm10
        movdqu  [dst + off + 16*4], xmm14
        movdqu  [dst + off + 16*5], xmm15

        movdqu  xmm14, [src + off + 16*6]
        movdqu  xmm15, [src + off + 16*7]
        pxor    xmm14, xmm11
        pxor    xmm15, xmm12
        movdqu  [dst + off + 16*6], xmm14
        movdqu  [dst + off + 16*7], xmm15

        jmp     no_partial_block

between_64_127:
        ; Load plaintext, XOR with KS and store ciphertext for first 64 bytes
        movdqu  xmm14, [src + off]
        movdqu  xmm15, [src + off + 16]
        pxor    xmm14, xmm4
        pxor    xmm15, xmm5
        movdqu  [dst + off], xmm14
        movdqu  [dst + off + 16], xmm15

        movdqu  xmm14, [src + off + 16*2]
        movdqu  xmm15, [src + off + 16*3]
        pxor    xmm14, xmm6
        pxor    xmm15, xmm7
        movdqu  [dst + off + 16*2], xmm14
        movdqu  [dst + off + 16*3], xmm15

        sub     len, 64
        add     off, 64
        ; Handle rest up to 63 bytes in "less_than_64"
        ; (second 64 bytes of KS are already in xmm9-12, as that label expects)
        jmp     less_than_64

more_than_2_blocks_left:

        ;; First generate 128 bytes of KS to encrypt next 128 bytes
        ;; (reduces the remainder to <= 128, then re-checks)

        ; Get last block counter dividing offset by 64
        shr     off, 6

        ; Prepare next 2 chacha states from IV, key
        movdqu  xmm1, [keys]            ; Load key bytes 0-15
        movdqu  xmm2, [keys + 16]       ; Load key bytes 16-31
        ; Read nonce (12 bytes)
        movq    xmm3, [iv]
        pinsrd  xmm3, [iv + 8], 2
        pslldq  xmm3, 4                 ; make room for the counter in dword 0

        movdqa  xmm0, [rel constants]

        movdqa  xmm8, xmm3              ; second state D (gets counter+2)

        ; Insert next block counts
        inc     DWORD(off)
        movd    xmm4, DWORD(off)
        por     xmm3, xmm4
        inc     DWORD(off)
        movd    xmm5, DWORD(off)
        por     xmm8, xmm5
        sub     off, 2                  ; undo the two increments
        shl     off, 6                  ; Restore offset

        ; Generate 128 bytes of keystream
        GENERATE_64_128_KS xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, \
                           xmm13, xmm8, xmm9, xmm10, xmm11, xmm12

        ; Load plaintext, XOR with KS and store ciphertext
        movdqu  xmm14, [src + off]
        movdqu  xmm15, [src + off + 16]
        pxor    xmm14, xmm4
        pxor    xmm15, xmm5
        movdqu  [dst + off], xmm14
        movdqu  [dst + off + 16], xmm15

        movdqu  xmm14, [src + off + 16*2]
        movdqu  xmm15, [src + off + 16*3]
        pxor    xmm14, xmm6
        pxor    xmm15, xmm7
        movdqu  [dst + off + 16*2], xmm14
        movdqu  [dst + off + 16*3], xmm15

        movdqu  xmm14, [src + off + 16*4]
        movdqu  xmm15, [src + off + 16*5]
        pxor    xmm14, xmm9
        pxor    xmm15, xmm10
        movdqu  [dst + off + 16*4], xmm14
        movdqu  [dst + off + 16*5], xmm15

        movdqu  xmm14, [src + off + 16*6]
        movdqu  xmm15, [src + off + 16*7]
        pxor    xmm14, xmm11
        pxor    xmm15, xmm12
        movdqu  [dst + off + 16*6], xmm14
        movdqu  [dst + off + 16*7], xmm15

        sub     len, 128
        add     off, 128

        jmp     check_1_or_2_blocks_left

no_partial_block:

%ifdef SAFE_DATA
        clear_all_xmms_sse_asm
        ; Clear stack frame (xmm0 is zero after the macro above)
%assign i 0
%rep 16
        movdqa  [rsp + _STATE + 16*i], xmm0
%assign i (i + 1)
%endrep
        movdqa  [rsp + _XMM_SAVE], xmm0
        movdqa  [rsp + _XMM_SAVE + 16], xmm0
%endif

        ; Restore original stack pointer
        mov     rsp, [rsp + _RSP_SAVE]

exit:
        mov     rax, job
        or      dword [rax + _status], STS_COMPLETED_AES

        ret

;;
;; void poly1305_key_gen_sse(IMB_JOB *job, void *poly_key)
;; In:    job (arg1)      - IMB_JOB with _enc_keys (32-byte key) and _iv (12-byte nonce)
;;        poly_key (arg2) - output buffer, receives 32 bytes (clamped R || S)
;; Clobbers: rax, xmm0-xmm8 (all cleared under SAFE_DATA)
align 32
MKGLOBAL(poly1305_key_gen_sse,function,internal)
poly1305_key_gen_sse:
        ;; prepare chacha state from IV, key (block counter 0)
        mov     rax, [job + _enc_keys]
        movdqa  xmm0, [rel constants]
        movdqu  xmm1, [rax]             ; Load key bytes 0-15
        movdqu  xmm2, [rax + 16]        ; Load key bytes 16-31
        ;; copy nonce (12 bytes)
        mov     rax, [job + _iv]
        movq    xmm3, [rax]
        pinsrd  xmm3, [rax + 8], 2
        pslldq  xmm3, 4                 ; counter dword stays 0

        ;; run one 64-byte chacha20 keystream generation (20 rounds)
        GENERATE_64_128_KS xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8

        ;; clamp R and store poly1305 key
        ;; R = KS[0..15] & 0x0ffffffc0ffffffc0ffffffc0fffffff
        pand    xmm4, [rel poly_clamp_r]
        movdqu  [arg2 + 0 * 16], xmm4   ; R (clamped)
        movdqu  [arg2 + 1 * 16], xmm5   ; S (KS bytes 16-31)

%ifdef SAFE_DATA
        clear_all_xmms_sse_asm
%endif
        ret

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif