1;; 2;; Copyright (c) 2019-2020, Intel Corporation 3;; 4;; Redistribution and use in source and binary forms, with or without 5;; modification, are permitted provided that the following conditions are met: 6;; 7;; * Redistributions of source code must retain the above copyright notice, 8;; this list of conditions and the following disclaimer. 9;; * Redistributions in binary form must reproduce the above copyright 10;; notice, this list of conditions and the following disclaimer in the 11;; documentation and/or other materials provided with the distribution. 12;; * Neither the name of Intel Corporation nor the names of its contributors 13;; may be used to endorse or promote products derived from this software 14;; without specific prior written permission. 15;; 16;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE 20;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;;
;; ZUC-EEA3 (cipher) / ZUC-EIA3 (integrity) multi-buffer manager:
;; submit and flush routines, AVX variant (4 parallel lanes).
;;
;; Common pattern: jobs accumulate in an out-of-order (OOO) manager
;; ("state") until all 4 lanes are filled (submit) or the caller forces
;; processing (flush); then the 4-lane ZUC kernels are called and the job
;; in the lane with the minimum length is completed and returned.
;;
;; ABI: System V AMD64 when LINUX is defined, Microsoft x64 otherwise
;; (arg1..arg6 below abstract the difference).

%include "include/os.asm"
%include "imb_job.asm"
%include "mb_mgr_datastruct.asm"

%include "include/reg_sizes.asm"
%include "include/const.inc"

%define SUBMIT_JOB_ZUC_EEA3 submit_job_zuc_eea3_avx
%define FLUSH_JOB_ZUC_EEA3 flush_job_zuc_eea3_avx
%define SUBMIT_JOB_ZUC_EIA3 submit_job_zuc_eia3_avx
%define FLUSH_JOB_ZUC_EIA3 flush_job_zuc_eia3_avx

section .data
default rel

;; vpshufb mask: replicates bytes 0-1 (one 16-bit word) into all 8 word
;; positions of an XMM register
align 16
broadcast_word:
db      0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01
db      0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01

;; OR-mask setting the upper 4 words of the 8-word lengths vector to 0xFFFF,
;; so that vphminposuw only ever selects one of the 4 real lanes
align 16
all_ffs_top_64bits:
db 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
db 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF

;; Sliding-window mask table used by CLEAR_ZUC_LANE_STATE: a 16-byte load
;; starting at (clear_lane_mask_tab_start - 4*lane) yields an AND-mask with
;; a zero dword exactly at position "lane" and all-ones elsewhere.
;; clear_lane_mask_tab only exists to back that negative-offset read.
clear_lane_mask_tab:
dd 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
dd 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,

clear_lane_mask_tab_start:
dd 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
dd 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,

;; Expansion table: entry i (16 bytes) turns the 4-bit lane bitmask i into
;; four per-lane dword masks (bit set -> 0xFFFFFFFF in that lane's dword)
align 16
bitmask_to_dword_tab:
dd 0x00000000, 0x00000000, 0x00000000, 0x00000000
dd 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000
dd 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000
dd 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
dd 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000
dd 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
dd 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
dd 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
dd 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
dd 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF
dd 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
dd 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
dd 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF
dd 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF
dd 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
dd 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF

extern zuc_eia3_4_buffer_job_avx
extern asm_ZucInitialization_4_avx
extern asm_ZucCipher_4_avx

%ifdef LINUX
%define arg1 rdi
%define arg2 rsi
%define arg3 rdx
%define arg4 rcx
%define arg5 r8
%define arg6 r9
%else
%define arg1 rcx
%define arg2 rdx
%define arg3 r8
%define arg4 r9
;; On Windows, args 5 and 6 are passed on the stack (above the 32-byte
;; shadow space the caller reserves)
%define arg5 [rsp + 32]
%define arg6 [rsp + 40]
%endif

%define state arg1
%define job arg2

%define job_rax rax

; This routine and its callee clobbers all GPRs
struc STACK
_state_save resq 2*(16+2) ; Space for ZUC LFSR + R1-2 (18 rows x 16 bytes)
_gpr_save: resq 10        ; rbx, rbp, r12-r15, (rsi, rdi on Windows), state, job
_null_len_save: resq 1    ; saved lengths vector with 0xFFFF in NULL-job lanes
_rsp_save: resq 1         ; caller's original rsp (frame is realigned to 16)
endstruc

section .text

%define APPEND(a,b) a %+ b
%define APPEND3(a,b,c) a %+ b %+ c

;; Clear state for multiple lanes in the OOO managers
%macro CLEAR_ZUC_STATE 5
%define %%STATE %1 ;; [in] ZUC OOO manager pointer
%define %%LANE_MASK %2 ;; [in/clobbered] bitmask with lanes to clear
%define %%TMP %3 ;; [clobbered] Temporary GP register
%define %%XTMP1 %4 ;; [clobbered] Temporary XMM register
%define %%XTMP2 %5 ;; [clobbered] Temporary XMM register

        ;; Expand the 4-bit lane bitmask into per-lane dword masks
        lea     %%TMP, [rel bitmask_to_dword_tab]
        shl     %%LANE_MASK, 4 ; Multiply by 16 to move through the table
        vmovdqa %%XTMP1, [%%TMP + %%LANE_MASK]

        ;; Clear state for lanes (vpandn zeroes dwords whose mask bit is set)
%assign I 0
%rep (16 + 6)
        vpandn  %%XTMP2, %%XTMP1, [%%STATE + _zuc_state + I*16]
        vmovdqa [%%STATE + _zuc_state + I*16], %%XTMP2

%assign I (I + 1)
%endrep
%endmacro

;; Clear state for a specified lane in the OOO manager
%macro CLEAR_ZUC_LANE_STATE 5
%define %%STATE %1 ;; [in] ZUC OOO manager pointer
%define %%LANE %2 ;; [in/clobbered] lane index
%define %%TMP %3 ;; [clobbered] Temporary GP register
%define %%XTMP1 %4 ;; [clobbered] Temporary XMM register
%define %%XTMP2 %5 ;; [clobbered] Temporary XMM register

        ;; Load a mask with a zero dword at position LANE, ones elsewhere,
        ;; via an unaligned read at (clear_lane_mask_tab_start - LANE*4)
        shl     %%LANE, 2
        lea     %%TMP, [rel clear_lane_mask_tab_start]
        sub     %%TMP, %%LANE
        vmovdqu %%XTMP1, [%%TMP]
%assign I 0
%rep (16 + 6)
        vmovdqa %%XTMP2, [%%STATE + _zuc_state + I*16]
        vpand   %%XTMP2, %%XTMP1
        vmovdqa [%%STATE + _zuc_state + I*16], %%XTMP2
%assign I (I + 1)
%endrep

%endmacro

;-----------------------------------------------------------------------
; JOB* SUBMIT_JOB_ZUC_EEA3(MB_MGR_ZUC_OOO *state, IMB_JOB *job)
; arg 1 : state
; arg 2 : job
;
; Queues the cipher job into a free lane. Returns NULL until all 4 lanes
; are occupied; then initializes any lanes that need it, runs the 4-lane
; cipher kernel up to the minimum length, and returns the finished job.
; Clobbers: all GPRs (saved/restored via stack frame), xmm0-xmm2.
;-----------------------------------------------------------------------
MKGLOBAL(SUBMIT_JOB_ZUC_EEA3,function,internal)
SUBMIT_JOB_ZUC_EEA3:

; idx needs to be in rbp
%define len rbp
%define idx rbp

%define lane r8
%define unused_lanes rbx
%define tmp r11
%define tmp2 r13
%define tmp3 r14
%define min_len r15

        ;; Build a 16-byte-aligned frame; keep caller's rsp for restore
        mov     rax, rsp
        sub     rsp, STACK_size
        and     rsp, -16

        mov     [rsp + _gpr_save + 8*0], rbx
        mov     [rsp + _gpr_save + 8*1], rbp
        mov     [rsp + _gpr_save + 8*2], r12
        mov     [rsp + _gpr_save + 8*3], r13
        mov     [rsp + _gpr_save + 8*4], r14
        mov     [rsp + _gpr_save + 8*5], r15
%ifndef LINUX
        mov     [rsp + _gpr_save + 8*6], rsi
        mov     [rsp + _gpr_save + 8*7], rdi
%endif
        mov     [rsp + _gpr_save + 8*8], state
        mov     [rsp + _gpr_save + 8*9], job
        mov     [rsp + _rsp_save], rax ; original SP

        ;; Pop the next free lane index from the byte-stack of unused lanes
        mov     unused_lanes, [state + _zuc_unused_lanes]
        movzx   lane, BYTE(unused_lanes)
        shr     unused_lanes, 8
        mov     tmp, [job + _iv]
        mov     [state + _zuc_args_IV + lane*8], tmp
        mov     [state + _zuc_unused_lanes], unused_lanes

        mov     [state + _zuc_job_in_lane + lane*8], job
        ; New job that needs init (update bit in zuc_init_not_done bitmask)
        SHIFT_GP 1, lane, tmp, tmp2, left
        or      [state + _zuc_init_not_done], BYTE(tmp)
        not     tmp
        and     [state + _zuc_unused_lane_bitmask], BYTE(tmp)

        ;; Record source (with cipher offset applied), key and destination
        mov     tmp, [job + _src]
        add     tmp, [job + _cipher_start_src_offset_in_bytes]
        mov     [state + _zuc_args_in + lane*8], tmp
        mov     tmp, [job + _enc_keys]
        mov     [state + _zuc_args_keys + lane*8], tmp
        mov     tmp, [job + _dst]
        mov     [state + _zuc_args_out + lane*8], tmp

        ;; insert len into proper lane
        mov     len, [job + _msg_len_to_cipher_in_bytes]

        vmovq   xmm0, [state + _zuc_lens]
        XVPINSRW xmm0, xmm1, tmp, lane, len, scale_x16
        vmovq   [state + _zuc_lens], xmm0

        ;; 0xff sentinel left in the byte-stack means all 4 lanes are in use
        cmp     unused_lanes, 0xff
        jne     return_null_submit_eea3

        ; Set all ffs in top 64 bits to invalid them
        vpor    xmm0, [rel all_ffs_top_64bits]

        ; Find minimum length (searching for zero length,
        ; to retrieve already encrypted buffers)
        vphminposuw xmm1, xmm0
        vpextrw min_len, xmm1, 0 ; min value
        vpextrw idx, xmm1, 1 ; min index (0...3)
        cmp     min_len, 0
        je      len_is_0_submit_eea3

        ; Move state into r12, as register for state will be used
        ; to pass parameter to next function
        mov     r12, state

        ;; Save LFSR + R1/R2 rows so lanes already initialized can be
        ;; restored after the (all-lane) init call below
%assign I 0
%rep (16 + 2)
        vmovdqa xmm0, [r12 + _zuc_state + 16*I]
        vmovdqa [rsp + _state_save + 16*I], xmm0
%assign I (I + 1)
%endrep

        ;; If Windows, reserve memory in stack for parameter transferring
%ifndef LINUX
        ;; 24 bytes for 3 parameters
        sub     rsp, 24
%endif
        lea     arg1, [r12 + _zuc_args_keys]
        lea     arg2, [r12 + _zuc_args_IV]
        lea     arg3, [r12 + _zuc_state]

        call    asm_ZucInitialization_4_avx

%ifndef LINUX
        add     rsp, 24
%endif

        cmp     byte [r12 + _zuc_init_not_done], 0x0f ; Init done for all lanes
        je      skip_submit_restoring_state

        ;; Load mask containing FF's in lanes which init has just been done
        movzx   DWORD(tmp3), byte [r12 + _zuc_init_not_done]
        lea     tmp2, [rel bitmask_to_dword_tab]
        shl     tmp3, 4 ; Multiply by 16 to move through the table
        vmovdqa xmm2, [tmp3 + tmp2]

        ;; Restore state from stack for lanes that did not need init
%assign I 0
%rep (16 + 2)
        vmovdqa xmm0, [rsp + _state_save + 16*I] ; State before init
        vmovdqa xmm1, [r12 + _zuc_state + 16*I] ; State after init

        ; Zero out lanes that need to be restored in current state
        vpand   xmm1, xmm2
        ; Zero out lanes that do not need to be restored in saved state
        vpandn  xmm0, xmm2, xmm0
        vpor    xmm1, xmm0

        vmovdqa [r12 + _zuc_state + 16*I], xmm1 ; Save new state

%assign I (I + 1)
%endrep

skip_submit_restoring_state:
%ifdef SAFE_DATA
        ;; Clear stack containing state info
        vpxor   xmm0, xmm0
%assign I 0
%rep (16 + 2)
        vmovdqa [rsp + _state_save + 16*I], xmm0
%assign I (I + 1)
%endrep
%endif
        mov     byte [r12 + _zuc_init_not_done], 0 ; Init done for all lanes

        ;; If Windows, reserve memory in stack for parameter transferring
%ifndef LINUX
        ;; 40 bytes for 5 parameters
        sub     rsp, 40
%endif
        lea     arg1, [r12 + _zuc_state]
        lea     arg2, [r12 + _zuc_args_in]
        lea     arg3, [r12 + _zuc_args_out]
        lea     arg4, [r12 + _zuc_lens]
        mov     arg5, min_len

        call    asm_ZucCipher_4_avx

%ifndef LINUX
        add     rsp, 40
%endif

        ;; Reload pointers clobbered across the calls
        mov     state, [rsp + _gpr_save + 8*8]
        mov     job, [rsp + _gpr_save + 8*9]

len_is_0_submit_eea3:
        ; process completed job "idx"
        mov     job_rax, [state + _zuc_job_in_lane + idx*8]
        mov     unused_lanes, [state + _zuc_unused_lanes]
        mov     qword [state + _zuc_job_in_lane + idx*8], 0
        or      dword [job_rax + _status], STS_COMPLETED_AES
        ;; Push the freed lane back onto the byte-stack and mark it unused
        shl     unused_lanes, 8
        or      unused_lanes, idx
        mov     [state + _zuc_unused_lanes], unused_lanes
        SHIFT_GP 1, idx, tmp, tmp2, left
        or      [state + _zuc_unused_lane_bitmask], BYTE(tmp)

%ifdef SAFE_DATA
        ; Clear ZUC state of the lane that is returned
        CLEAR_ZUC_LANE_STATE state, idx, tmp, xmm0, xmm1
%endif

return_submit_eea3:

        mov     rbx, [rsp + _gpr_save + 8*0]
        mov     rbp, [rsp + _gpr_save + 8*1]
        mov     r12, [rsp + _gpr_save + 8*2]
        mov     r13, [rsp + _gpr_save + 8*3]
        mov     r14, [rsp + _gpr_save + 8*4]
        mov     r15, [rsp + _gpr_save + 8*5]
%ifndef LINUX
        mov     rsi, [rsp + _gpr_save + 8*6]
        mov     rdi, [rsp + _gpr_save + 8*7]
%endif
        mov     rsp, [rsp + _rsp_save] ; original SP

        ret

return_null_submit_eea3:
        xor     job_rax, job_rax
        jmp     return_submit_eea3

;-----------------------------------------------------------------------
; JOB* FLUSH_JOB_ZUC_EEA3(MB_MGR_ZUC_OOO *state)
; arg 1 : state
;
; Forces processing of queued cipher jobs: duplicates a valid ("good")
; lane's arguments and state into empty lanes, runs init (if pending) and
; the 4-lane cipher kernel, then returns the minimum-length job.
; Returns NULL if the manager is empty.
; Clobbers: all GPRs (saved/restored), xmm0-xmm4.
;-----------------------------------------------------------------------
MKGLOBAL(FLUSH_JOB_ZUC_EEA3,function,internal)
FLUSH_JOB_ZUC_EEA3:

%define unused_lanes rbx
%define tmp1 rbx

%define tmp2 rax

; idx needs to be in rbp
%define tmp rbp
%define idx rbp

%define tmp3 r8
%define tmp4 r9
%define tmp5 r10
%define min_len r14 ; Will be maintained after function calls

        mov     rax, rsp
        sub     rsp, STACK_size
        and     rsp, -16

        mov     [rsp + _gpr_save + 8*0], rbx
        mov     [rsp + _gpr_save + 8*1], rbp
        mov     [rsp + _gpr_save + 8*2], r12
        mov     [rsp + _gpr_save + 8*3], r13
        mov     [rsp + _gpr_save + 8*4], r14
        mov     [rsp + _gpr_save + 8*5], r15
%ifndef LINUX
        mov     [rsp + _gpr_save + 8*6], rsi
        mov     [rsp + _gpr_save + 8*7], rdi
%endif
        mov     [rsp + _gpr_save + 8*8], state
        mov     [rsp + _rsp_save], rax ; original SP

        ; check for empty (bit 39 set <=> 0xFF sentinel byte reached the
        ; 5th stack position, i.e. all 4 lanes are unused)
        mov     unused_lanes, [state + _zuc_unused_lanes]
        bt      unused_lanes, 32+7
        jc      return_null_flush_eea3

        ; Set length = 0xFFFF in NULL jobs
        vmovq   xmm0, [state + _zuc_lens]
        mov     DWORD(tmp3), 0xffff
%assign I 0
%rep 4
        cmp     qword [state + _zuc_job_in_lane + I*8], 0
        jne     APPEND(skip_copy_ffs_,I)
        ;; NOTE(review): legacy-SSE pinsrw inside an AVX (VEX) code path;
        ;; vpinsrw would avoid an SSE/AVX transition penalty - confirm
        pinsrw  xmm0, DWORD(tmp3), I
APPEND(skip_copy_ffs_,I):
%assign I (I+1)
%endrep

        vmovq   [state + _zuc_lens], xmm0

        ; Set all ffs in top 64 bits to invalid them
        vpor    xmm0, [rel all_ffs_top_64bits]

        ; Find minimum length (searching for zero length,
        ; to retrieve already encrypted buffers)
        vphminposuw xmm1, xmm0
        vpextrw min_len, xmm1, 0 ; min value
        vpextrw idx, xmm1, 1 ; min index (0...3)
        cmp     min_len, 0
        je      len_is_0_flush_eea3

        ; copy good_lane to empty lanes
        mov     tmp1, [state + _zuc_args_in + idx*8]
        mov     tmp2, [state + _zuc_args_out + idx*8]
        mov     tmp3, [state + _zuc_args_keys + idx*8]
        mov     tmp4, [state + _zuc_args_IV + idx*8]

%assign I 0
%rep 4
        cmp     qword [state + _zuc_job_in_lane + I*8], 0
        jne     APPEND(skip_eea3_,I)
        mov     [state + _zuc_args_in + I*8], tmp1
        mov     [state + _zuc_args_out + I*8], tmp2
        mov     [state + _zuc_args_keys + I*8], tmp3
        mov     [state + _zuc_args_IV + I*8], tmp4
APPEND(skip_eea3_,I):
%assign I (I+1)
%endrep

        ; Move state into r12, as register for state will be used
        ; to pass parameter to next function
        mov     r12, state

        ;; NOTE(review): this path accesses _zuc_init_not_done with word
        ;; width while the submit path uses byte width - equivalent only if
        ;; the upper byte is always zero; confirm against mb_mgr_datastruct
        cmp     word [r12 + _zuc_init_not_done], 0
        je      skip_flush_init

        ;; Save LFSR + R1/R2 rows before re-running init on all lanes
%assign I 0
%rep (16 + 2)
        vmovdqa xmm0, [r12 + _zuc_state + 16*I]
        vmovdqa [rsp + _state_save + 16*I], xmm0
%assign I (I + 1)
%endrep

        ;; If Windows, reserve memory in stack for parameter transferring
%ifndef LINUX
        ;; 24 bytes for 3 parameters
        sub     rsp, 24
%endif
        lea     arg1, [r12 + _zuc_args_keys]
        lea     arg2, [r12 + _zuc_args_IV]
        lea     arg3, [r12 + _zuc_state]

        call    asm_ZucInitialization_4_avx

%ifndef LINUX
        add     rsp, 24
%endif
        cmp     word [r12 + _zuc_init_not_done], 0x0f ; Init done for all lanes
        je      skip_flush_restoring_state

        ;; Load mask containing FF's in lanes which init has just been done
        movzx   DWORD(tmp3), byte [r12 + _zuc_init_not_done]
        lea     tmp2, [rel bitmask_to_dword_tab]
        shl     tmp3, 4 ; Multiply by 16 to move through the table
        vmovdqa xmm2, [tmp3 + tmp2]

        ;; Restore state from stack for lanes that did not need init
%assign I 0
%rep (16 + 2)
        vmovdqa xmm0, [rsp + _state_save + 16*I] ; State before init
        vmovdqa xmm1, [r12 + _zuc_state + 16*I] ; State after init

        ; Zero out lanes that need to be restored in current state
        vpand   xmm1, xmm2
        ; Zero out lanes that do not need to be restored in saved state
        vpandn  xmm0, xmm2, xmm0
        vpor    xmm1, xmm0

        vmovdqa [r12 + _zuc_state + 16*I], xmm1 ; Save new state
%assign I (I + 1)
%endrep

skip_flush_restoring_state:
%ifdef SAFE_DATA
        ;; Clear stack containing state info
        vpxor   xmm0, xmm0
%assign I 0
%rep (16 + 2)
        vmovdqa [rsp + _state_save + 16*I], xmm0
%assign I (I + 1)
%endrep
%endif
        mov     word [r12 + _zuc_init_not_done], 0 ; Init done for all lanes

skip_flush_init:

        ;; Copy state from good lane to NULL lanes
%assign I 0
%rep (16 + 2)
        ; Read dword from good lane and broadcast to NULL lanes
        mov     r13d, [r12 + _zuc_state + 16*I + idx*4]

        vmovdqa xmm1, [r12 + _zuc_state + 16*I] ; State after init
%assign J 0
%rep 4
        cmp     qword [r12 + _zuc_job_in_lane + J*8], 0
        jne     APPEND3(skip_eea3_copy_,I,J)
        ;; NOTE(review): 2-operand vpinsrd form relies on the assembler
        ;; expanding dst as first source - confirm it assembles as intended
        vpinsrd xmm1, r13d, J
APPEND3(skip_eea3_copy_,I,J):
%assign J (J+1)
%endrep
        vmovdqa [r12 + _zuc_state + 16*I], xmm1 ; Save new state
%assign I (I+1)
%endrep
        ;; If Windows, reserve memory in stack for parameter transferring
%ifndef LINUX
        ;; 40 bytes for 5 parameters
        sub     rsp, 40
%endif
        lea     arg1, [r12 + _zuc_state]
        lea     arg2, [r12 + _zuc_args_in]
        lea     arg3, [r12 + _zuc_args_out]
        lea     arg4, [r12 + _zuc_lens]
        mov     arg5, min_len

        call    asm_ZucCipher_4_avx

%ifndef LINUX
        add     rsp, 40
%endif
        mov     state, [rsp + _gpr_save + 8*8]

        ; Clear ZUC state of the lane that is returned and NULL lanes
%ifdef SAFE_DATA
        SHIFT_GP 1, idx, tmp1, tmp2, left
        movzx   DWORD(tmp3), byte [state + _zuc_unused_lane_bitmask]
        or      tmp3, tmp1 ;; bitmask with NULL lanes and job to return

        CLEAR_ZUC_STATE state, tmp3, tmp2, xmm0, xmm1
        jmp     skip_flush_clear_state
%endif

len_is_0_flush_eea3:
%ifdef SAFE_DATA
        ; Clear ZUC state of the lane that is returned
        ; (copy idx first: the macro clobbers its lane argument)
        mov     tmp2, idx
        CLEAR_ZUC_LANE_STATE state, tmp2, tmp3, xmm0, xmm1

skip_flush_clear_state:
%endif
        ; process completed job "idx"
        mov     job_rax, [state + _zuc_job_in_lane + idx*8]
        mov     unused_lanes, [state + _zuc_unused_lanes]
        mov     qword [state + _zuc_job_in_lane + idx*8], 0
        or      dword [job_rax + _status], STS_COMPLETED_AES
        shl     unused_lanes, 8
        or      unused_lanes, idx
        mov     [state + _zuc_unused_lanes], unused_lanes

        SHIFT_GP 1, idx, tmp3, tmp4, left
        or      [state + _zuc_unused_lane_bitmask], BYTE(tmp3)
return_flush_eea3:

        mov     rbx, [rsp + _gpr_save + 8*0]
        mov     rbp, [rsp + _gpr_save + 8*1]
        mov     r12, [rsp + _gpr_save + 8*2]
        mov     r13, [rsp + _gpr_save + 8*3]
        mov     r14, [rsp + _gpr_save + 8*4]
        mov     r15, [rsp + _gpr_save + 8*5]
%ifndef LINUX
        mov     rsi, [rsp + _gpr_save + 8*6]
        mov     rdi, [rsp + _gpr_save + 8*7]
%endif
        mov     rsp, [rsp + _rsp_save] ; original SP

        ret

return_null_flush_eea3:
        xor     job_rax, job_rax
        jmp     return_flush_eea3

;-----------------------------------------------------------------------
; JOB* SUBMIT_JOB_ZUC_EIA3(MB_MGR_ZUC_OOO *state, IMB_JOB *job)
; arg 1 : state
; arg 2 : job
;
; Queues the authentication job into a free lane. Returns NULL until all
; 4 lanes are occupied; then runs the 4-lane EIA3 kernel (which handles
; init + digest internally) and returns the minimum-length job.
; Clobbers: all GPRs (saved/restored), xmm0-xmm1.
;-----------------------------------------------------------------------
MKGLOBAL(SUBMIT_JOB_ZUC_EIA3,function,internal)
SUBMIT_JOB_ZUC_EIA3:

; idx needs to be in rbp
%define len rbp
%define idx rbp
%define tmp rbp

%define lane r8
%define unused_lanes rbx
%define len2 r13

        mov     rax, rsp
        sub     rsp, STACK_size
        and     rsp, -16

        mov     [rsp + _gpr_save + 8*0], rbx
        mov     [rsp + _gpr_save + 8*1], rbp
        mov     [rsp + _gpr_save + 8*2], r12
        mov     [rsp + _gpr_save + 8*3], r13
        mov     [rsp + _gpr_save + 8*4], r14
        mov     [rsp + _gpr_save + 8*5], r15
%ifndef LINUX
        mov     [rsp + _gpr_save + 8*6], rsi
        mov     [rsp + _gpr_save + 8*7], rdi
%endif
        mov     [rsp + _gpr_save + 8*8], state
        mov     [rsp + _gpr_save + 8*9], job
        mov     [rsp + _rsp_save], rax ; original SP

        ;; Pop next free lane and record job arguments (IV, src+offset,
        ;; key, tag output pointer)
        mov     unused_lanes, [state + _zuc_unused_lanes]
        movzx   lane, BYTE(unused_lanes)
        shr     unused_lanes, 8
        mov     tmp, [job + _zuc_eia3_iv]
        mov     [state + _zuc_args_IV + lane*8], tmp
        mov     [state + _zuc_unused_lanes], unused_lanes

        mov     [state + _zuc_job_in_lane + lane*8], job
        mov     tmp, [job + _src]
        add     tmp, [job + _hash_start_src_offset_in_bytes]
        mov     [state + _zuc_args_in + lane*8], tmp
        mov     tmp, [job + _zuc_eia3_key]
        mov     [state + _zuc_args_keys + lane*8], tmp
        mov     tmp, [job + _auth_tag_output]
        mov     [state + _zuc_args_out + lane*8], tmp

        ;; insert len into proper lane (length is in bits for EIA3)
        mov     len, [job + _msg_len_to_hash_in_bits]

        vmovdqa xmm0, [state + _zuc_lens]
        ;; NOTE(review): tmp and len are both aliased to rbp here; this is
        ;; only safe if XVPINSRW does not use its GP scratch while VAL is
        ;; still live - confirm against include/const.inc
        XVPINSRW xmm0, xmm1, tmp, lane, len, scale_x16
        vmovdqa [state + _zuc_lens], xmm0

        ;; 0xff sentinel left in the byte-stack means all 4 lanes are in use
        cmp     unused_lanes, 0xff
        jne     return_null_submit_eia3

        ; Find minimum length (searching for zero length,
        ; to retrieve already encrypted buffers)
        vphminposuw xmm1, xmm0
        vpextrw len2, xmm1, 0 ; min value
        vpextrw idx, xmm1, 1 ; min index (0...3)
        cmp     len2, 0
        je      len_is_0_submit_eia3

        ; Move state into r11, as register for state will be used
        ; to pass parameter to next function
        mov     r11, state

        ;; If Windows, reserve memory in stack for parameter transferring
%ifndef LINUX
        ;; 48 bytes for 6 parameters (already aligned to 16 bytes)
        sub     rsp, 48
%endif
        lea     arg1, [r11 + _zuc_args_keys]
        lea     arg2, [r11 + _zuc_args_IV]
        lea     arg3, [r11 + _zuc_args_in]
        lea     arg4, [r11 + _zuc_args_out]
%ifdef LINUX
        lea     arg5, [r11 + _zuc_lens]
        lea     arg6, [r11 + _zuc_job_in_lane]
%else
        ;; Windows: args 5/6 are stack slots, so stage through r12
        lea     r12, [r11 + _zuc_lens]
        mov     arg5, r12
        lea     r12, [r11 + _zuc_job_in_lane]
        mov     arg6, r12
%endif

        call    zuc_eia3_4_buffer_job_avx

%ifndef LINUX
        add     rsp, 48
%endif
        mov     state, [rsp + _gpr_save + 8*8]
        mov     job, [rsp + _gpr_save + 8*9]

        ;; Clear all lengths (function will authenticate all buffers)
        mov     qword [state + _zuc_lens], 0

len_is_0_submit_eia3:
        ; process completed job "idx"
        mov     job_rax, [state + _zuc_job_in_lane + idx*8]
        mov     unused_lanes, [state + _zuc_unused_lanes]
        mov     qword [state + _zuc_job_in_lane + idx*8], 0
        or      dword [job_rax + _status], STS_COMPLETED_HMAC
        ;; TODO: fix double store (above setting the length to 0 and now setting to FFFFF)
        mov     word [state + _zuc_lens + idx*2], 0xFFFF
        shl     unused_lanes, 8
        or      unused_lanes, idx
        mov     [state + _zuc_unused_lanes], unused_lanes

return_submit_eia3:

        mov     rbx, [rsp + _gpr_save + 8*0]
        mov     rbp, [rsp + _gpr_save + 8*1]
        mov     r12, [rsp + _gpr_save + 8*2]
        mov     r13, [rsp + _gpr_save + 8*3]
        mov     r14, [rsp + _gpr_save + 8*4]
        mov     r15, [rsp + _gpr_save + 8*5]
%ifndef LINUX
        mov     rsi, [rsp + _gpr_save + 8*6]
        mov     rdi, [rsp + _gpr_save + 8*7]
%endif
        mov     rsp, [rsp + _rsp_save] ; original SP

        ret

return_null_submit_eia3:
        xor     job_rax, job_rax
        jmp     return_submit_eia3

;-----------------------------------------------------------------------
; JOB* FLUSH_JOB_ZUC_EIA3(MB_MGR_ZUC_OOO *state)
; arg 1 : state
;
; Forces processing of queued authentication jobs: copies the good lane's
; arguments and length into NULL lanes (so the kernel has valid work in
; every lane), runs the 4-lane EIA3 kernel, restores 0xFFFF lengths for
; NULL lanes, and returns the minimum-length job. NULL if empty.
; Clobbers: all GPRs (saved/restored), xmm0-xmm4.
; (len2 = r13 is inherited from the global %define above.)
;-----------------------------------------------------------------------
MKGLOBAL(FLUSH_JOB_ZUC_EIA3,function,internal)
FLUSH_JOB_ZUC_EIA3:

%define unused_lanes rbx
%define tmp1 rbx

%define tmp2 rax

; idx needs to be in rbp
%define tmp rbp
%define idx rbp

%define tmp3 r8
%define tmp4 r9
%define tmp5 r10

        mov     rax, rsp
        sub     rsp, STACK_size
        and     rsp, -16

        mov     [rsp + _gpr_save + 8*0], rbx
        mov     [rsp + _gpr_save + 8*1], rbp
        mov     [rsp + _gpr_save + 8*2], r12
        mov     [rsp + _gpr_save + 8*3], r13
        mov     [rsp + _gpr_save + 8*4], r14
        mov     [rsp + _gpr_save + 8*5], r15
%ifndef LINUX
        mov     [rsp + _gpr_save + 8*6], rsi
        mov     [rsp + _gpr_save + 8*7], rdi
%endif
        mov     [rsp + _gpr_save + 8*8], state
        mov     [rsp + _rsp_save], rax ; original SP

        ; check for empty (0xFF sentinel byte in 5th position of the
        ; unused-lanes byte-stack => no lane occupied)
        mov     unused_lanes, [state + _zuc_unused_lanes]
        bt      unused_lanes, 32+7
        jc      return_null_flush_eia3

        ; Find minimum length (searching for zero length,
        ; to retrieve already authenticated buffers)
        vmovdqa xmm0, [state + _zuc_lens]
        vphminposuw xmm1, xmm0
        vpextrw len2, xmm1, 0 ; min value
        vpextrw idx, xmm1, 1 ; min index (0...3)
        cmp     len2, 0
        je      len_is_0_flush_eia3

        ; copy good_lane to empty lanes
        mov     tmp1, [state + _zuc_args_in + idx*8]
        mov     tmp2, [state + _zuc_args_out + idx*8]
        mov     tmp3, [state + _zuc_args_keys + idx*8]
        mov     tmp4, [state + _zuc_args_IV + idx*8]
        mov     WORD(tmp5), [state + _zuc_lens + idx*2]

        ; Set valid length in NULL jobs
        ; (broadcast the good lane's 16-bit length to all 8 word slots;
        ; upper bits of tmp5 are irrelevant after the word shuffle)
        vmovd   xmm0, DWORD(tmp5)
        vpshufb xmm0, xmm0, [rel broadcast_word]
        vmovdqa xmm1, [state + _zuc_lens]

        vpcmpeqw xmm2, xmm2 ;; Get all ff's in XMM register
        vpcmpeqw xmm3, xmm1, xmm2 ;; Mask with FFFF in NULL jobs
        vmovq   tmp5, xmm3
        mov     [rsp + _null_len_save], tmp5 ;; Save lengths with FFFF in NULL jobs

        vpand   xmm4, xmm3, xmm0 ;; Length of valid job in all NULL jobs

        vpxor   xmm2, xmm3 ;; Mask with 0000 in NULL jobs
        vpand   xmm1, xmm2 ;; Zero out lengths of NULL jobs

        vpor    xmm1, xmm4
        vmovq   tmp5, xmm1
        mov     [state + _zuc_lens], tmp5

%assign I 0
%rep 4
        cmp     qword [state + _zuc_job_in_lane + I*8], 0
        jne     APPEND(skip_eia3_,I)
        mov     [state + _zuc_args_in + I*8], tmp1
        mov     [state + _zuc_args_out + I*8], tmp2
        mov     [state + _zuc_args_keys + I*8], tmp3
        mov     [state + _zuc_args_IV + I*8], tmp4
APPEND(skip_eia3_,I):
%assign I (I+1)
%endrep

        ; Move state into r11, as register for state will be used
        ; to pass parameter to next function
        mov     r11, state

%ifndef LINUX
        ;; 48 bytes for 6 parameters (already aligned to 16 bytes)
        sub     rsp, 48
%endif
        lea     arg1, [r11 + _zuc_args_keys]
        lea     arg2, [r11 + _zuc_args_IV]
        lea     arg3, [r11 + _zuc_args_in]
        lea     arg4, [r11 + _zuc_args_out]
%ifdef LINUX
        lea     arg5, [r11 + _zuc_lens]
        lea     arg6, [r11 + _zuc_job_in_lane]
%else
        ;; Windows: args 5/6 are stack slots, so stage through r12
        lea     r12, [r11 + _zuc_lens]
        mov     arg5, r12
        lea     r12, [r11 + _zuc_job_in_lane]
        mov     arg6, r12
%endif

        call    zuc_eia3_4_buffer_job_avx

%ifndef LINUX
        add     rsp, 48
%endif

        mov     tmp5, [rsp + _null_len_save]
        mov     state, [rsp + _gpr_save + 8*8]

        ;; Clear all lengths of valid jobs and set to FFFF to NULL jobs
        mov     qword [state + _zuc_lens], tmp5

len_is_0_flush_eia3:
        ; process completed job "idx"
        mov     job_rax, [state + _zuc_job_in_lane + idx*8]
        mov     unused_lanes, [state + _zuc_unused_lanes]
        mov     qword [state + _zuc_job_in_lane + idx*8], 0
        or      dword [job_rax + _status], STS_COMPLETED_HMAC
        ;; TODO: fix double store (above setting the length to 0 and now setting to FFFFF)
        mov     word [state + _zuc_lens + idx*2], 0xFFFF
        shl     unused_lanes, 8
        or      unused_lanes, idx
        mov     [state + _zuc_unused_lanes], unused_lanes

return_flush_eia3:

        mov     rbx, [rsp + _gpr_save + 8*0]
        mov     rbp, [rsp + _gpr_save + 8*1]
        mov     r12, [rsp + _gpr_save + 8*2]
        mov     r13, [rsp + _gpr_save + 8*3]
        mov     r14, [rsp + _gpr_save + 8*4]
        mov     r15, [rsp + _gpr_save + 8*5]
%ifndef LINUX
        mov     rsi, [rsp + _gpr_save + 8*6]
        mov     rdi, [rsp + _gpr_save + 8*7]
%endif
        mov     rsp, [rsp + _rsp_save] ; original SP

        ret

return_null_flush_eia3:
        xor     job_rax, job_rax
        jmp     return_flush_eia3

%ifdef LINUX
;; Mark the stack non-executable (GNU ld)
section .note.GNU-stack noalloc noexec nowrite progbits
%endif