1;;
2;; Copyright (c) 2019-2020, Intel Corporation
3;;
4;; Redistribution and use in source and binary forms, with or without
5;; modification, are permitted provided that the following conditions are met:
6;;
7;;     * Redistributions of source code must retain the above copyright notice,
8;;       this list of conditions and the following disclaimer.
9;;     * Redistributions in binary form must reproduce the above copyright
10;;       notice, this list of conditions and the following disclaimer in the
11;;       documentation and/or other materials provided with the distribution.
12;;     * Neither the name of Intel Corporation nor the names of its contributors
13;;       may be used to endorse or promote products derived from this software
14;;       without specific prior written permission.
15;;
16;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26;;
27
28%include "include/os.asm"
29%include "imb_job.asm"
30%include "mb_mgr_datastruct.asm"
31
32%include "include/reg_sizes.asm"
33%include "include/const.inc"
34
;; Symbol names (AVX-suffixed) of the four entry points defined in this file
%define SUBMIT_JOB_ZUC_EEA3     submit_job_zuc_eea3_avx
%define FLUSH_JOB_ZUC_EEA3      flush_job_zuc_eea3_avx
%define SUBMIT_JOB_ZUC_EIA3     submit_job_zuc_eia3_avx
%define FLUSH_JOB_ZUC_EIA3      flush_job_zuc_eia3_avx
39
section .data
default rel

;; Byte-shuffle mask that selects source bytes 0 and 1 for every word,
;; i.e. it broadcasts the low word of a register to all 8 words
align 16
broadcast_word:
db      0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01
db      0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01

;; OR-mask that forces the upper 4 words of a length vector to 0xFFFF
align 16
all_ffs_top_64bits:
db      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
db      0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF

;; Sliding mask window used by CLEAR_ZUC_LANE_STATE:
;; a 16-byte load from (clear_lane_mask_tab_start - 4*lane) is all ones
;; except for dword "lane", which is zero.
;; The two tables below must therefore stay contiguous (no padding and
;; no extra elements between them).
clear_lane_mask_tab:
dd      0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
dd      0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF

clear_lane_mask_tab_start:
dd      0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
dd      0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF

;; Expands a 4-bit lane bitmask into a dword mask:
;; entry N (16 bytes, at offset N*16) has dword J set to all ones
;; when bit J of N is set
align 16
bitmask_to_dword_tab:
dd      0x00000000, 0x00000000, 0x00000000, 0x00000000
dd      0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000
dd      0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000
dd      0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
dd      0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000
dd      0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
dd      0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
dd      0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
dd      0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
dd      0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF
dd      0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
dd      0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
dd      0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF
dd      0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF
dd      0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
dd      0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
79
80extern zuc_eia3_4_buffer_job_avx
81extern asm_ZucInitialization_4_avx
82extern asm_ZucCipher_4_avx
83
;; Argument locations, selected by the LINUX build define
%ifndef LINUX
;; LINUX not defined: first four arguments in registers,
;; arguments 5 and 6 in the caller's outgoing stack area
%define arg1    rcx
%define arg2    rdx
%define arg3    r8
%define arg4    r9
%define arg5    [rsp + 32]
%define arg6    [rsp + 40]
%else
;; LINUX defined: all six arguments in registers
%define arg1    rdi
%define arg2    rsi
%define arg3    rdx
%define arg4    rcx
%define arg5    r8
%define arg6    r9
%endif

;; Incoming arguments of the submit/flush routines
%define state   arg1
%define job     arg2

;; Register holding the returned job pointer
%define job_rax rax
104
; Stack frame used by every routine in this file.
; The routines and their callees clobber GPRs, so the values that
; must survive are kept in the save area below.
struc STACK
_state_save:    resq    2*(16+2) ; (16 + 2) x 16 bytes: space for ZUC LFSR + R1-2
_gpr_save:      resq    10       ; saved GPRs, state pointer and job pointer
_null_len_save: resq    1        ; length mask saved across the EIA3 flush call
_rsp_save:      resq    1        ; stack pointer value on entry
endstruc
112
section .text

;; Token-pasting helpers, used to build unique label names inside %rep blocks
%define APPEND(a,b)     a %+ b
%define APPEND3(a,b,c)  a %+ b %+ c
117
;; Zero the ZUC state of several lanes of the OOO manager at once.
;; The lane bitmask is expanded into a per-dword mask through
;; bitmask_to_dword_tab, and every 16-byte row of the state is
;; and-not'ed with that mask.
%macro CLEAR_ZUC_STATE 5
%define %%STATE         %1 ;; [in] ZUC OOO manager pointer
%define %%LANE_MASK     %2 ;; [in/clobbered] bitmask with lanes to clear
%define %%TMP           %3 ;; [clobbered] Temporary GP register
%define %%XTMP1         %4 ;; [clobbered] Temporary XMM register
%define %%XTMP2         %5 ;; [clobbered] Temporary XMM register

        ;; Each table entry is 16 bytes, so scale the bitmask by 16
        lea     %%TMP, [rel bitmask_to_dword_tab]
        shl     %%LANE_MASK, 4
        vmovdqa %%XTMP1, [%%TMP + %%LANE_MASK] ; all ones in the lanes to clear

        ;; Walk the (16 + 6) rows of the state and keep only unselected lanes
%assign I 0
%rep (16 + 6)
        vmovdqa %%XTMP2, [%%STATE + _zuc_state + I*16]
        vpandn  %%XTMP2, %%XTMP1, %%XTMP2       ; row & ~mask
        vmovdqa [%%STATE + _zuc_state + I*16], %%XTMP2

%assign I (I + 1)
%endrep
%endmacro
139
;; Zero the ZUC state of a single lane of the OOO manager.
;; A mask with zeros only in the selected dword is read from the
;; clear_lane_mask_tab window and and'ed into every 16-byte row of the state.
%macro CLEAR_ZUC_LANE_STATE 5
%define %%STATE         %1 ;; [in] ZUC OOO manager pointer
%define %%LANE          %2 ;; [in/clobbered] lane index
%define %%TMP           %3 ;; [clobbered] Temporary GP register
%define %%XTMP1         %4 ;; [clobbered] Temporary XMM register
%define %%XTMP2         %5 ;; [clobbered] Temporary XMM register

        ;; Step back 4 bytes per lane from the start of the window, so the
        ;; zero dword lands on the position of the selected lane
        shl     %%LANE, 2
        lea     %%TMP, [rel clear_lane_mask_tab_start]
        sub     %%TMP, %%LANE
        vmovdqu %%XTMP1, [%%TMP]        ; window is not 16-byte aligned

        ;; Apply the mask to the (16 + 6) rows of the state
%assign I 0
%rep (16 + 6)
        vpand   %%XTMP2, %%XTMP1, [%%STATE + _zuc_state + I*16]
        vmovdqa [%%STATE + _zuc_state + I*16], %%XTMP2
%assign I (I + 1)
%endrep

%endmacro
161
; JOB* SUBMIT_JOB_ZUC_EEA3(MB_MGR_ZUC_OOO *state, IMB_JOB *job)
; arg 1 : state
; arg 2 : job
;
; Stores the job's source, destination, key and IV pointers and its length
; in the lane taken from the low byte of the unused-lanes list, and flags
; that lane as needing initialization.
;
; Returns NULL in rax unless the unused-lanes list equals 0xff after the
; lane has been removed from it. Otherwise the lane with the minimum length
; is selected; when that length is not zero, asm_ZucInitialization_4_avx and
; asm_ZucCipher_4_avx are called first. The job of the selected lane gets
; STS_COMPLETED_AES OR-ed into its status and is returned in rax.
;
; rbx, rbp, r12-r15 (plus rsi/rdi when LINUX is not defined) are saved on
; entry and restored on exit. rbp (idx), r12 (state) and r15 (min_len) are
; read after the calls, so the callees are relied upon to preserve them.
MKGLOBAL(SUBMIT_JOB_ZUC_EEA3,function,internal)
SUBMIT_JOB_ZUC_EEA3:

; idx needs to be in rbp
%define len              rbp
%define idx              rbp

%define lane             r8
%define unused_lanes     rbx
%define tmp              r11
%define tmp2             r13
%define tmp3             r14
%define min_len          r15

        ;; Build a 16-byte aligned frame, remembering the original RSP in rax
        mov     rax, rsp
        sub     rsp, STACK_size
        and     rsp, -16

        mov     [rsp + _gpr_save + 8*0], rbx
        mov     [rsp + _gpr_save + 8*1], rbp
        mov     [rsp + _gpr_save + 8*2], r12
        mov     [rsp + _gpr_save + 8*3], r13
        mov     [rsp + _gpr_save + 8*4], r14
        mov     [rsp + _gpr_save + 8*5], r15
%ifndef LINUX
        mov     [rsp + _gpr_save + 8*6], rsi
        mov     [rsp + _gpr_save + 8*7], rdi
%endif
        mov     [rsp + _gpr_save + 8*8], state
        mov     [rsp + _gpr_save + 8*9], job
        mov     [rsp + _rsp_save], rax  ; original SP

        ;; Take the next free lane (low byte of the unused-lanes list)
        mov     unused_lanes, [state + _zuc_unused_lanes]
        movzx   lane, BYTE(unused_lanes)
        shr     unused_lanes, 8
        mov     tmp, [job + _iv]
        mov     [state + _zuc_args_IV + lane*8], tmp
        mov     [state + _zuc_unused_lanes], unused_lanes

        mov     [state + _zuc_job_in_lane + lane*8], job
        ; New job that needs init (update bit in zuc_init_not_done bitmask)
        SHIFT_GP        1, lane, tmp, tmp2, left
        or      [state + _zuc_init_not_done], BYTE(tmp)
        ; The lane is now in use: clear its bit in the unused-lane bitmask
        not     tmp
        and     [state + _zuc_unused_lane_bitmask], BYTE(tmp)

        ;; Record source (start + cipher offset), key and destination pointers
        mov     tmp, [job + _src]
        add     tmp, [job + _cipher_start_src_offset_in_bytes]
        mov     [state + _zuc_args_in + lane*8], tmp
        mov     tmp, [job + _enc_keys]
        mov     [state + _zuc_args_keys + lane*8], tmp
        mov     tmp, [job + _dst]
        mov     [state + _zuc_args_out + lane*8], tmp

        ;; insert len into proper lane
        mov     len, [job + _msg_len_to_cipher_in_bytes]

        vmovq   xmm0, [state + _zuc_lens]
        XVPINSRW xmm0, xmm1, tmp, lane, len, scale_x16
        vmovq   [state + _zuc_lens], xmm0

        ;; Nothing is processed unless the unused-lanes list is now 0xff
        cmp     unused_lanes, 0xff
        jne     return_null_submit_eea3

        ; Set all ffs in top 64 bits to invalidate them
        vpor    xmm0, [rel all_ffs_top_64bits]

        ; Find minimum length (searching for zero length,
        ; to retrieve already encrypted buffers)
        vphminposuw     xmm1, xmm0
        vpextrw min_len, xmm1, 0   ; min value
        vpextrw idx, xmm1, 1    ; min index (0...3)
        test    min_len, min_len
        jz      len_is_0_submit_eea3

        ; Move state into r12, as register for state will be used
        ; to pass parameter to next function
        mov     r12, state

        ;; Save the current ZUC state on the stack before the initialization call
%assign I 0
%rep (16 + 2)
        vmovdqa  xmm0, [r12 + _zuc_state + 16*I]
        vmovdqa  [rsp + _state_save + 16*I], xmm0
%assign I (I + 1)
%endrep

        ;; If Windows, reserve memory in stack for parameter transferring
%ifndef LINUX
        ;; 32 bytes: the Windows x64 convention requires home space for 4
        ;; register parameters on every call, even when fewer are passed.
        ;; RSP was aligned to 16 bytes on entry, so it stays aligned here.
        sub     rsp, 32
%endif
        lea     arg1, [r12 + _zuc_args_keys]
        lea     arg2, [r12 + _zuc_args_IV]
        lea     arg3, [r12 + _zuc_state]

        call    asm_ZucInitialization_4_avx

%ifndef LINUX
        add     rsp, 32
%endif

        cmp     byte [r12 + _zuc_init_not_done], 0x0f ; Init done for all lanes
        je      skip_submit_restoring_state

        ;; Load mask containing FF's in lanes which init has just been done
        movzx   DWORD(tmp3), byte [r12 + _zuc_init_not_done]
        lea     tmp2, [rel bitmask_to_dword_tab]
        shl     tmp3, 4 ; Multiply by 16 to move through the table
        vmovdqa xmm2, [tmp3 + tmp2]

        ;; Restore state from stack for lanes that did not need init
%assign I 0
%rep (16 + 2)
        vmovdqa  xmm0, [rsp + _state_save + 16*I] ; State before init
        vmovdqa  xmm1, [r12 + _zuc_state + 16*I] ; State after init

        ; Zero out lanes that need to be restored in current state
        vpand   xmm1, xmm2
        ; Zero out lanes that do not need to be restored in saved state
        vpandn  xmm0, xmm2, xmm0
        vpor    xmm1, xmm0

        vmovdqa  [r12 + _zuc_state + 16*I], xmm1 ; Save new state

%assign I (I + 1)
%endrep

skip_submit_restoring_state:
%ifdef SAFE_DATA
        ;; Clear stack containing state info
        vpxor   xmm0, xmm0
%assign I 0
%rep (16 + 2)
        vmovdqa [rsp + _state_save + 16*I], xmm0
%assign I (I + 1)
%endrep
%endif
        mov     byte [r12 + _zuc_init_not_done], 0 ; Init done for all lanes

        ;; If Windows, reserve memory in stack for parameter transferring
%ifndef LINUX
        ;; 48 bytes: 32 bytes of home space + 8 bytes for the 5th parameter
        ;; (arg5 = [rsp + 32]) + 8 bytes of padding so that RSP remains
        ;; 16-byte aligned at the call
        sub     rsp, 48
%endif
        lea     arg1, [r12 + _zuc_state]
        lea     arg2, [r12 + _zuc_args_in]
        lea     arg3, [r12 + _zuc_args_out]
        lea     arg4, [r12 + _zuc_lens]
        mov     arg5, min_len

        call    asm_ZucCipher_4_avx

%ifndef LINUX
        add     rsp, 48
%endif

        ;; The argument registers were reused for the calls: reload them
        mov     state, [rsp + _gpr_save + 8*8]
        mov     job,   [rsp + _gpr_save + 8*9]

len_is_0_submit_eea3:
        ; process completed job "idx"
        mov     job_rax, [state + _zuc_job_in_lane + idx*8]
        mov     unused_lanes, [state + _zuc_unused_lanes]
        mov     qword [state + _zuc_job_in_lane + idx*8], 0
        or      dword [job_rax + _status], STS_COMPLETED_AES
        ; Push the freed lane index back onto the unused-lanes list
        shl     unused_lanes, 8
        or      unused_lanes, idx
        mov     [state + _zuc_unused_lanes], unused_lanes
        SHIFT_GP        1, idx, tmp, tmp2, left
        or      [state + _zuc_unused_lane_bitmask], BYTE(tmp)

%ifdef SAFE_DATA
        ; Clear ZUC state of the lane that is returned
        ; (idx is clobbered here, it is not needed any more)
        CLEAR_ZUC_LANE_STATE state, idx, tmp, xmm0, xmm1
%endif

return_submit_eea3:

        mov     rbx, [rsp + _gpr_save + 8*0]
        mov     rbp, [rsp + _gpr_save + 8*1]
        mov     r12, [rsp + _gpr_save + 8*2]
        mov     r13, [rsp + _gpr_save + 8*3]
        mov     r14, [rsp + _gpr_save + 8*4]
        mov     r15, [rsp + _gpr_save + 8*5]
%ifndef LINUX
        mov     rsi, [rsp + _gpr_save + 8*6]
        mov     rdi, [rsp + _gpr_save + 8*7]
%endif
        mov     rsp, [rsp + _rsp_save]  ; original SP

        ret

return_null_submit_eea3:
        xor     job_rax, job_rax
        jmp     return_submit_eea3
360
; JOB* FLUSH_JOB_ZUC_EEA3(MB_MGR_ZUC_OOO *state)
; arg 1 : state
;
; Returns NULL in rax when bit 39 of the unused-lanes list is set.
; Otherwise the lengths of lanes without a job are set to 0xFFFF and the
; lane with the minimum length is selected. When that length is not zero,
; the arguments and state of the selected lane are copied into the lanes
; without a job, pending initialization is performed and
; asm_ZucCipher_4_avx is called. The job of the selected lane gets
; STS_COMPLETED_AES OR-ed into its status and is returned in rax.
;
; rbx, rbp, r12-r15 (plus rsi/rdi when LINUX is not defined) are saved on
; entry and restored on exit. rbp (idx), r12 (state) and r14 (min_len) are
; read after the calls, so the callees are relied upon to preserve them.
MKGLOBAL(FLUSH_JOB_ZUC_EEA3,function,internal)
FLUSH_JOB_ZUC_EEA3:

%define unused_lanes     rbx
%define tmp1             rbx

%define tmp2             rax

; idx needs to be in rbp
%define tmp              rbp
%define idx              rbp

%define tmp3             r8
%define tmp4             r9
%define tmp5             r10
%define min_len          r14 ; Will be maintained after function calls

        ;; Build a 16-byte aligned frame, remembering the original RSP in rax
        mov     rax, rsp
        sub     rsp, STACK_size
        and     rsp, -16

        mov     [rsp + _gpr_save + 8*0], rbx
        mov     [rsp + _gpr_save + 8*1], rbp
        mov     [rsp + _gpr_save + 8*2], r12
        mov     [rsp + _gpr_save + 8*3], r13
        mov     [rsp + _gpr_save + 8*4], r14
        mov     [rsp + _gpr_save + 8*5], r15
%ifndef LINUX
        mov     [rsp + _gpr_save + 8*6], rsi
        mov     [rsp + _gpr_save + 8*7], rdi
%endif
        mov     [rsp + _gpr_save + 8*8], state
        mov     [rsp + _rsp_save], rax  ; original SP

        ; check for empty
        mov     unused_lanes, [state + _zuc_unused_lanes]
        bt      unused_lanes, 32+7
        jc      return_null_flush_eea3

        ; Set length = 0xFFFF in NULL jobs
        vmovq   xmm0, [state + _zuc_lens]
        mov     DWORD(tmp3), 0xffff
%assign I 0
%rep 4
        cmp     qword [state + _zuc_job_in_lane + I*8], 0
        jne     APPEND(skip_copy_ffs_,I)
        ; VEX-encoded form, consistent with the rest of this AVX routine
        vpinsrw xmm0, xmm0, DWORD(tmp3), I
APPEND(skip_copy_ffs_,I):
%assign I (I+1)
%endrep

        vmovq   [state + _zuc_lens], xmm0

        ; Set all ffs in top 64 bits to invalidate them
        vpor    xmm0, [rel all_ffs_top_64bits]

        ; Find minimum length (searching for zero length,
        ; to retrieve already encrypted buffers)
        vphminposuw     xmm1, xmm0
        vpextrw min_len, xmm1, 0   ; min value
        vpextrw idx, xmm1, 1    ; min index (0...3)
        test    min_len, min_len
        jz      len_is_0_flush_eea3

        ; copy good_lane to empty lanes
        mov     tmp1, [state + _zuc_args_in + idx*8]
        mov     tmp2, [state + _zuc_args_out + idx*8]
        mov     tmp3, [state + _zuc_args_keys + idx*8]
        mov     tmp4, [state + _zuc_args_IV + idx*8]

%assign I 0
%rep 4
        cmp     qword [state + _zuc_job_in_lane + I*8], 0
        jne     APPEND(skip_eea3_,I)
        mov     [state + _zuc_args_in + I*8], tmp1
        mov     [state + _zuc_args_out + I*8], tmp2
        mov     [state + _zuc_args_keys + I*8], tmp3
        mov     [state + _zuc_args_IV + I*8], tmp4
APPEND(skip_eea3_,I):
%assign I (I+1)
%endrep

        ; Move state into r12, as register for state will be used
        ; to pass parameter to next function
        mov     r12, state

        ;; NOTE(review): _zuc_init_not_done is accessed as a word here and as
        ;; a byte in the submit routine and at the movzx below - confirm the
        ;; field width in the MB_MGR_ZUC_OOO structure definition
        cmp     word [r12 + _zuc_init_not_done], 0
        je      skip_flush_init

        ;; Save the current ZUC state on the stack before the initialization call
%assign I 0
%rep (16 + 2)
        vmovdqa  xmm0, [r12 + _zuc_state + 16*I]
        vmovdqa  [rsp + _state_save + 16*I], xmm0
%assign I (I + 1)
%endrep

        ;; If Windows, reserve memory in stack for parameter transferring
%ifndef LINUX
        ;; 32 bytes: the Windows x64 convention requires home space for 4
        ;; register parameters on every call, even when fewer are passed.
        ;; RSP was aligned to 16 bytes on entry, so it stays aligned here.
        sub     rsp, 32
%endif
        lea     arg1, [r12 + _zuc_args_keys]
        lea     arg2, [r12 + _zuc_args_IV]
        lea     arg3, [r12 + _zuc_state]

        call    asm_ZucInitialization_4_avx

%ifndef LINUX
        add     rsp, 32
%endif
        cmp     word [r12 + _zuc_init_not_done], 0x0f ; Init done for all lanes
        je      skip_flush_restoring_state

        ;; Load mask containing FF's in lanes which init has just been done
        movzx   DWORD(tmp3), byte [r12 + _zuc_init_not_done]
        lea     tmp2, [rel bitmask_to_dword_tab]
        shl     tmp3, 4 ; Multiply by 16 to move through the table
        vmovdqa xmm2, [tmp3 + tmp2]

        ;; Restore state from stack for lanes that did not need init
%assign I 0
%rep (16 + 2)
        vmovdqa  xmm0, [rsp + _state_save + 16*I] ; State before init
        vmovdqa  xmm1, [r12 + _zuc_state + 16*I] ; State after init

        ; Zero out lanes that need to be restored in current state
        vpand   xmm1, xmm2
        ; Zero out lanes that do not need to be restored in saved state
        vpandn  xmm0, xmm2, xmm0
        vpor    xmm1, xmm0

        vmovdqa [r12 + _zuc_state + 16*I], xmm1 ; Save new state
%assign I (I + 1)
%endrep

skip_flush_restoring_state:
%ifdef SAFE_DATA
        ;; Clear stack containing state info
        vpxor   xmm0, xmm0
%assign I 0
%rep (16 + 2)
        vmovdqa [rsp + _state_save + 16*I], xmm0
%assign I (I + 1)
%endrep
%endif
        mov     word [r12 + _zuc_init_not_done], 0 ; Init done for all lanes

skip_flush_init:

        ;; Copy state from good lane to NULL lanes
%assign I 0
%rep (16 + 2)
        ; Read dword from good lane and broadcast to NULL lanes
        mov     r13d, [r12 + _zuc_state + 16*I + idx*4]

        vmovdqa xmm1, [r12 + _zuc_state + 16*I] ; State after init
%assign J 0
%rep 4
        cmp     qword [r12 + _zuc_job_in_lane + J*8], 0
        jne     APPEND3(skip_eea3_copy_,I,J)
        vpinsrd xmm1, r13d, J
APPEND3(skip_eea3_copy_,I,J):
%assign J (J+1)
%endrep
        vmovdqa [r12 + _zuc_state + 16*I], xmm1 ; Save new state
%assign I (I+1)
%endrep
        ;; If Windows, reserve memory in stack for parameter transferring
%ifndef LINUX
        ;; 48 bytes: 32 bytes of home space + 8 bytes for the 5th parameter
        ;; (arg5 = [rsp + 32]) + 8 bytes of padding so that RSP remains
        ;; 16-byte aligned at the call
        sub     rsp, 48
%endif
        lea     arg1, [r12 + _zuc_state]
        lea     arg2, [r12 + _zuc_args_in]
        lea     arg3, [r12 + _zuc_args_out]
        lea     arg4, [r12 + _zuc_lens]
        mov     arg5, min_len

        call    asm_ZucCipher_4_avx

%ifndef LINUX
        add     rsp, 48
%endif
        ;; The argument register was reused for the calls: reload it
        mov     state, [rsp + _gpr_save + 8*8]

        ; Clear ZUC state of the lane that is returned and NULL lanes
%ifdef SAFE_DATA
        SHIFT_GP        1, idx, tmp1, tmp2, left
        movzx   DWORD(tmp3), byte [state + _zuc_unused_lane_bitmask]
        or      tmp3, tmp1 ;; bitmask with NULL lanes and job to return

        CLEAR_ZUC_STATE state, tmp3, tmp2, xmm0, xmm1
        jmp     skip_flush_clear_state
%endif

len_is_0_flush_eea3:
%ifdef SAFE_DATA
        ; Clear ZUC state of the lane that is returned
        ; (work on a copy, as idx is still needed below)
        mov     tmp2, idx
        CLEAR_ZUC_LANE_STATE state, tmp2, tmp3, xmm0, xmm1

skip_flush_clear_state:
%endif
        ; process completed job "idx"
        mov     job_rax, [state + _zuc_job_in_lane + idx*8]
        mov     unused_lanes, [state + _zuc_unused_lanes]
        mov     qword [state + _zuc_job_in_lane + idx*8], 0
        or      dword [job_rax + _status], STS_COMPLETED_AES
        ; Push the freed lane index back onto the unused-lanes list
        shl     unused_lanes, 8
        or      unused_lanes, idx
        mov     [state + _zuc_unused_lanes], unused_lanes

        SHIFT_GP        1, idx, tmp3, tmp4, left
        or      [state + _zuc_unused_lane_bitmask], BYTE(tmp3)
return_flush_eea3:

        mov     rbx, [rsp + _gpr_save + 8*0]
        mov     rbp, [rsp + _gpr_save + 8*1]
        mov     r12, [rsp + _gpr_save + 8*2]
        mov     r13, [rsp + _gpr_save + 8*3]
        mov     r14, [rsp + _gpr_save + 8*4]
        mov     r15, [rsp + _gpr_save + 8*5]
%ifndef LINUX
        mov     rsi, [rsp + _gpr_save + 8*6]
        mov     rdi, [rsp + _gpr_save + 8*7]
%endif
        mov     rsp, [rsp + _rsp_save]  ; original SP

        ret

return_null_flush_eea3:
        xor     job_rax, job_rax
        jmp     return_flush_eea3
596
; JOB* SUBMIT_JOB_ZUC_EIA3(MB_MGR_ZUC_OOO *state, IMB_JOB *job)
; arg 1 : state
; arg 2 : job
;
; Places the job (source, key, IV and tag output pointers, length in bits)
; in the lane taken from the low byte of the unused-lanes list.
; Returns NULL in rax unless the unused-lanes list equals 0xff after the
; lane has been removed from it. Otherwise the lane with the minimum length
; is selected; when that length is not zero, zuc_eia3_4_buffer_job_avx is
; called and the low 64 bits of the lengths are cleared. The job of the
; selected lane gets STS_COMPLETED_HMAC OR-ed into its status, its length
; is set to 0xFFFF and the job is returned in rax.
MKGLOBAL(SUBMIT_JOB_ZUC_EIA3,function,internal)
SUBMIT_JOB_ZUC_EIA3:

; idx needs to be in rbp
%define len              rbp
%define idx              rbp
%define tmp              rbp

%define lane             r8
%define unused_lanes     rbx
%define len2             r13

        ;; Aligned stack frame; rax carries the entry value of RSP
        mov     rax, rsp
        sub     rsp, STACK_size
        and     rsp, -16

        ;; Preserve the registers this routine (or its callee) overwrites
        mov     [rsp + _gpr_save + 8*0], rbx
        mov     [rsp + _gpr_save + 8*1], rbp
        mov     [rsp + _gpr_save + 8*2], r12
        mov     [rsp + _gpr_save + 8*3], r13
        mov     [rsp + _gpr_save + 8*4], r14
        mov     [rsp + _gpr_save + 8*5], r15
%ifndef LINUX
        mov     [rsp + _gpr_save + 8*6], rsi
        mov     [rsp + _gpr_save + 8*7], rdi
%endif
        mov     [rsp + _gpr_save + 8*8], state
        mov     [rsp + _gpr_save + 8*9], job
        mov     [rsp + _rsp_save], rax  ; original SP

        ;; Pop the next free lane from the unused-lanes list
        mov     unused_lanes, [state + _zuc_unused_lanes]
        movzx   lane, BYTE(unused_lanes)
        shr     unused_lanes, 8
        mov     tmp, [job + _zuc_eia3_iv]
        mov     [state + _zuc_args_IV + lane*8], tmp
        mov     [state + _zuc_unused_lanes], unused_lanes

        ;; Fill in the per-lane arguments
        mov     [state + _zuc_job_in_lane + lane*8], job
        mov     tmp, [job + _src]
        add     tmp, [job + _hash_start_src_offset_in_bytes]
        mov     [state + _zuc_args_in + lane*8], tmp
        mov     tmp, [job + _zuc_eia3_key]
        mov     [state + _zuc_args_keys + lane*8], tmp
        mov     tmp, [job + _auth_tag_output]
        mov     [state + _zuc_args_out + lane*8], tmp

        ;; insert len into proper lane
        mov     len, [job + _msg_len_to_hash_in_bits]

        ;; NOTE(review): tmp and len are both rbp, so the macro receives the
        ;; same register as scratch and as source - confirm that XVPINSRW
        ;; reads its source before writing its scratch register
        vmovdqa xmm0, [state + _zuc_lens]
        XVPINSRW xmm0, xmm1, tmp, lane, len, scale_x16
        vmovdqa [state + _zuc_lens], xmm0

        ;; No processing unless the unused-lanes list is now 0xff
        cmp     unused_lanes, 0xff
        jne     return_null_submit_eia3

        ; Find minimum length (searching for zero length,
        ; to retrieve already authenticated buffers)
        vphminposuw     xmm1, xmm0
        vpextrw len2, xmm1, 0   ; min value
        vpextrw idx, xmm1, 1    ; min index (0...3)
        test    len2, len2
        jz      len_is_0_submit_eia3

        ; Move state into r11, as register for state will be used
        ; to pass parameter to next function
        mov     r11, state

        ;; If Windows, reserve memory in stack for parameter transferring
%ifndef LINUX
        ;; 48 bytes for 6 parameters (already aligned to 16 bytes)
        sub     rsp, 48
%endif
        lea     arg1, [r11 + _zuc_args_keys]
        lea     arg2, [r11 + _zuc_args_IV]
        lea     arg3, [r11 + _zuc_args_in]
        lea     arg4, [r11 + _zuc_args_out]
%ifndef LINUX
        ;; Parameters 5 and 6 live in memory here: go through r12
        lea     r12, [r11 + _zuc_lens]
        mov     arg5, r12
        lea     r12, [r11 + _zuc_job_in_lane]
        mov     arg6, r12
%else
        lea     arg5, [r11 + _zuc_lens]
        lea     arg6, [r11 + _zuc_job_in_lane]
%endif

        call    zuc_eia3_4_buffer_job_avx

%ifndef LINUX
        add     rsp, 48
%endif
        ;; The argument registers were reused for the call: reload them
        mov     state, [rsp + _gpr_save + 8*8]
        mov     job,   [rsp + _gpr_save + 8*9]

        ;; Clear all lengths (function will authenticate all buffers)
        mov     qword [state + _zuc_lens], 0

len_is_0_submit_eia3:
        ; process completed job "idx"
        mov     job_rax, [state + _zuc_job_in_lane + idx*8]
        mov     unused_lanes, [state + _zuc_unused_lanes]
        mov     qword [state + _zuc_job_in_lane + idx*8], 0
        or      dword [job_rax + _status], STS_COMPLETED_HMAC
        ;; TODO: fix double store (above setting the length to 0 and now setting to FFFF)
        mov     word [state + _zuc_lens + idx*2], 0xFFFF
        ; Return the lane index to the unused-lanes list
        shl     unused_lanes, 8
        or      unused_lanes, idx
        mov     [state + _zuc_unused_lanes], unused_lanes

return_submit_eia3:

        mov     rbx, [rsp + _gpr_save + 8*0]
        mov     rbp, [rsp + _gpr_save + 8*1]
        mov     r12, [rsp + _gpr_save + 8*2]
        mov     r13, [rsp + _gpr_save + 8*3]
        mov     r14, [rsp + _gpr_save + 8*4]
        mov     r15, [rsp + _gpr_save + 8*5]
%ifndef LINUX
        mov     rsi, [rsp + _gpr_save + 8*6]
        mov     rdi, [rsp + _gpr_save + 8*7]
%endif
        mov     rsp, [rsp + _rsp_save]  ; original SP

        ret

return_null_submit_eia3:
        xor     job_rax, job_rax
        jmp     return_submit_eia3
729
; JOB* FLUSH_JOB_ZUC_EIA3(MB_MGR_ZUC_OOO *state)
; arg 1 : state
;
; Returns NULL in rax when bit 39 of the unused-lanes list is set.
; Otherwise the lane with the minimum length is selected. When that length
; is not zero, the arguments and length of the selected lane are copied
; into the NULL lanes (job pointer of 0 / length of 0xFFFF),
; zuc_eia3_4_buffer_job_avx is called and the lengths are then set to 0 for
; valid lanes and 0xFFFF for NULL lanes. The job of the selected lane gets
; STS_COMPLETED_HMAC OR-ed into its status, its length is set to 0xFFFF and
; the job is returned in rax.
;
; rbx, rbp, r12-r15 (plus rsi/rdi when LINUX is not defined) are saved on
; entry and restored on exit. rbp (idx) is read after the call, so the
; callee is relied upon to preserve it.
MKGLOBAL(FLUSH_JOB_ZUC_EIA3,function,internal)
FLUSH_JOB_ZUC_EIA3:

%define unused_lanes     rbx
%define tmp1             rbx

%define tmp2             rax

; idx needs to be in rbp
%define tmp              rbp
%define idx              rbp

%define tmp3             r8
%define tmp4             r9
%define tmp5             r10
; Defined here as well, so this routine does not depend on the alias
; left behind by the submit routine
%define len2             r13

        ;; Build a 16-byte aligned frame, remembering the original RSP in rax
        mov     rax, rsp
        sub     rsp, STACK_size
        and     rsp, -16

        mov     [rsp + _gpr_save + 8*0], rbx
        mov     [rsp + _gpr_save + 8*1], rbp
        mov     [rsp + _gpr_save + 8*2], r12
        mov     [rsp + _gpr_save + 8*3], r13
        mov     [rsp + _gpr_save + 8*4], r14
        mov     [rsp + _gpr_save + 8*5], r15
%ifndef LINUX
        mov     [rsp + _gpr_save + 8*6], rsi
        mov     [rsp + _gpr_save + 8*7], rdi
%endif
        mov     [rsp + _gpr_save + 8*8], state
        mov     [rsp + _rsp_save], rax  ; original SP

        ; check for empty
        mov     unused_lanes, [state + _zuc_unused_lanes]
        bt      unused_lanes, 32+7
        jc      return_null_flush_eia3

        ; Find minimum length (searching for zero length,
        ; to retrieve already authenticated buffers)
        vmovdqa xmm0, [state + _zuc_lens]
        vphminposuw     xmm1, xmm0
        vpextrw len2, xmm1, 0   ; min value
        vpextrw idx, xmm1, 1    ; min index (0...3)
        test    len2, len2
        jz      len_is_0_flush_eia3

        ; copy good_lane to empty lanes
        mov     tmp1, [state + _zuc_args_in + idx*8]
        mov     tmp2, [state + _zuc_args_out + idx*8]
        mov     tmp3, [state + _zuc_args_keys + idx*8]
        mov     tmp4, [state + _zuc_args_IV + idx*8]
        ; Zero-extending load: the full dword is moved into xmm0 below
        movzx   DWORD(tmp5), word [state + _zuc_lens + idx*2]

        ; Set valid length in NULL jobs
        vmovd   xmm0, DWORD(tmp5)
        vpshufb xmm0, xmm0, [rel broadcast_word] ;; Length of good lane in all words
        vmovdqa xmm1, [state + _zuc_lens]

        vpcmpeqw xmm2, xmm2, xmm2 ;; Get all ff's in XMM register
        vpcmpeqw xmm3, xmm1, xmm2 ;; Mask with FFFF in NULL jobs
        vmovq   tmp5, xmm3
        mov     [rsp + _null_len_save], tmp5 ;; Save lengths with FFFF in NULL jobs

        vpand   xmm4, xmm3, xmm0 ;; Length of valid job in all NULL jobs

        vpxor   xmm2, xmm2, xmm3 ;; Mask with 0000 in NULL jobs
        vpand   xmm1, xmm1, xmm2 ;; Zero out lengths of NULL jobs

        vpor    xmm1, xmm1, xmm4
        vmovq   tmp5, xmm1
        mov     [state + _zuc_lens], tmp5

%assign I 0
%rep 4
        cmp     qword [state + _zuc_job_in_lane + I*8], 0
        jne     APPEND(skip_eia3_,I)
        mov     [state + _zuc_args_in + I*8], tmp1
        mov     [state + _zuc_args_out + I*8], tmp2
        mov     [state + _zuc_args_keys + I*8], tmp3
        mov     [state + _zuc_args_IV + I*8], tmp4
APPEND(skip_eia3_,I):
%assign I (I+1)
%endrep

        ; Move state into r11, as register for state will be used
        ; to pass parameter to next function
        mov     r11, state

        ;; If Windows, reserve memory in stack for parameter transferring
%ifndef LINUX
        ;; 48 bytes for 6 parameters (already aligned to 16 bytes)
        sub     rsp, 48
%endif
        lea     arg1, [r11 + _zuc_args_keys]
        lea     arg2, [r11 + _zuc_args_IV]
        lea     arg3, [r11 + _zuc_args_in]
        lea     arg4, [r11 + _zuc_args_out]
%ifdef LINUX
        lea     arg5, [r11 + _zuc_lens]
        lea     arg6, [r11 + _zuc_job_in_lane]
%else
        ;; Parameters 5 and 6 live in memory here: go through r12
        lea     r12, [r11 + _zuc_lens]
        mov     arg5, r12
        lea     r12, [r11 + _zuc_job_in_lane]
        mov     arg6, r12
%endif

        call    zuc_eia3_4_buffer_job_avx

%ifndef LINUX
        add     rsp, 48
%endif

        mov     tmp5, [rsp + _null_len_save]
        mov     state, [rsp + _gpr_save + 8*8]

        ;; Clear all lengths of valid jobs and set to FFFF to NULL jobs
        mov     qword [state + _zuc_lens], tmp5

len_is_0_flush_eia3:
        ; process completed job "idx"
        mov     job_rax, [state + _zuc_job_in_lane + idx*8]
        mov     unused_lanes, [state + _zuc_unused_lanes]
        mov     qword [state + _zuc_job_in_lane + idx*8], 0
        or      dword [job_rax + _status], STS_COMPLETED_HMAC
        ;; TODO: fix double store (above setting the length to 0 and now setting to FFFF)
        mov     word [state + _zuc_lens + idx*2], 0xFFFF
        ; Push the freed lane index back onto the unused-lanes list
        shl     unused_lanes, 8
        or      unused_lanes, idx
        mov     [state + _zuc_unused_lanes], unused_lanes

return_flush_eia3:

        mov     rbx, [rsp + _gpr_save + 8*0]
        mov     rbp, [rsp + _gpr_save + 8*1]
        mov     r12, [rsp + _gpr_save + 8*2]
        mov     r13, [rsp + _gpr_save + 8*3]
        mov     r14, [rsp + _gpr_save + 8*4]
        mov     r15, [rsp + _gpr_save + 8*5]
%ifndef LINUX
        mov     rsi, [rsp + _gpr_save + 8*6]
        mov     rdi, [rsp + _gpr_save + 8*7]
%endif
        mov     rsp, [rsp + _rsp_save]  ; original SP

        ret

return_null_flush_eia3:
        xor     job_rax, job_rax
        jmp     return_flush_eia3
882
;; Emit an empty, non-executable .note.GNU-stack section on LINUX builds
%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
886