;;
;; Copyright (c) 2012-2020, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

%include "include/os.asm"
%include "imb_job.asm"
%include "mb_mgr_datastruct.asm"
%include "include/reg_sizes.asm"

extern md5_x4x2_avx

section .data
default rel
align 16
dupw:	;ddq 0x01000100010001000100010001000100
	dq 0x0100010001000100, 0x0100010001000100
x80:    ;ddq 0x00000000000000000000000000000080
        dq 0x0000000000000080, 0x0000000000000000
x00:    ;ddq 0x00000000000000000000000000000000
        dq 0x0000000000000000, 0x0000000000000000
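;; len_masks[i] selects lane i's 16-bit length word; it is ORed into the
;; lens vector for empty lanes so their length reads as 0xFFFF and they are
;; never picked by the minimum-length search in copy_lane_data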
len_masks:
	;ddq 0x0000000000000000000000000000FFFF
	dq 0x000000000000FFFF, 0x0000000000000000
	;ddq 0x000000000000000000000000FFFF0000
	dq 0x00000000FFFF0000, 0x0000000000000000
	;ddq 0x00000000000000000000FFFF00000000
	dq 0x0000FFFF00000000, 0x0000000000000000
	;ddq 0x0000000000000000FFFF000000000000
	dq 0xFFFF000000000000, 0x0000000000000000
	;ddq 0x000000000000FFFF0000000000000000
	dq 0x0000000000000000, 0x000000000000FFFF
	;ddq 0x00000000FFFF00000000000000000000
	dq 0x0000000000000000, 0x00000000FFFF0000
	;ddq 0x0000FFFF000000000000000000000000
	dq 0x0000000000000000, 0x0000FFFF00000000
	;ddq 0xFFFF0000000000000000000000000000
	dq 0x0000000000000000, 0xFFFF000000000000
one:	dq  1
two:	dq  2
three:	dq  3
four:	dq  4
five:	dq  5
six:	dq  6
seven:	dq  7

section .text

%if 1
%ifdef LINUX
%define arg1	rdi
%define arg2	rsi
%else
%define arg1	rcx
%define arg2	rdx
%endif

%define state	arg1
%define job	arg2
%define len2	arg2


; idx needs to be in rbp
%define idx             rbp

; unused_lanes must be in rax-rdx
%define unused_lanes    rbx
%define lane_data       rbx
%define tmp2		rbx

%define job_rax         rax
%define	tmp1		rax
%define size_offset     rax
%define tmp             rax
%define start_offset    rax

%define tmp3		arg1

%define extra_blocks    arg2
%define p               arg2

%define tmp4		r8
%define tmp5		r9

%endif

; This routine and/or the called routine clobbers all GPRs
struc STACK
_gpr_save:	resq	8
_rsp_save:	resq	1
endstruc

%define APPEND(a,b) a %+ b

; JOB* flush_job_hmac_md5_avx(MB_MGR_HMAC_MD5_OOO *state)
; arg 1 : rcx : state
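;
; Flush path: nothing new is being submitted. If every lane is idle, return
; NULL. Otherwise copy a valid lane's data pointer into each empty lane
; (empty lanes get length 0xFFFF so they never win the minimum search),
; then run the 8-lane MD5 core until the job with the least remaining work
; completes, and return that job.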
MKGLOBAL(flush_job_hmac_md5_avx,function,internal)
flush_job_hmac_md5_avx:

        mov	rax, rsp
        sub	rsp, STACK_size
        and	rsp, -16

	mov	[rsp + _gpr_save + 8*0], rbx
	mov	[rsp + _gpr_save + 8*1], rbp
	mov	[rsp + _gpr_save + 8*2], r12
	mov	[rsp + _gpr_save + 8*3], r13
	mov	[rsp + _gpr_save + 8*4], r14
	mov	[rsp + _gpr_save + 8*5], r15
%ifndef LINUX
	mov	[rsp + _gpr_save + 8*6], rsi
	mov	[rsp + _gpr_save + 8*7], rdi
%endif
	mov	[rsp + _rsp_save], rax	; original SP

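	; if all 8 lanes are idle (free-lane stack is full), there is no job to flush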
	mov	unused_lanes, [state + _unused_lanes_md5]
	bt	unused_lanes, 32+3
	jc	return_null

	; find a lane with a non-null job
	xor	idx, idx
	cmp	qword [state + _ldata_md5 + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
	cmovne	idx, [rel one]
	cmp	qword [state + _ldata_md5 + 2 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
	cmovne	idx, [rel two]
	cmp	qword [state + _ldata_md5 + 3 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
	cmovne	idx, [rel three]
	cmp	qword [state + _ldata_md5 + 4 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
	cmovne	idx, [rel four]
	cmp	qword [state + _ldata_md5 + 5 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
	cmovne	idx, [rel five]
	cmp	qword [state + _ldata_md5 + 6 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
	cmovne	idx, [rel six]
	cmp	qword [state + _ldata_md5 + 7 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
	cmovne	idx, [rel seven]

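; replicate the good lane (idx) into every empty lane and force the empty
; lanes' lengths to 0xFFFF so they are never chosen as the minimum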
copy_lane_data:
	; copy good lane (idx) to empty lanes
	vmovdqa	xmm0, [state + _lens_md5]
	mov	tmp, [state + _args_data_ptr_md5 + PTR_SZ*idx]

%assign I 0
%rep 8
	cmp	qword [state + _ldata_md5 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
	jne	APPEND(skip_,I)
	mov	[state + _args_data_ptr_md5 + PTR_SZ*I], tmp
	vpor	xmm0, xmm0, [rel len_masks + 16*I]
APPEND(skip_,I):
%assign I (I+1)
%endrep

	vmovdqa	[state + _lens_md5], xmm0

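	; find the lane with the smallest remaining length; that lane's job
	; is the next one to complete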
	vphminposuw	xmm1, xmm0
	vpextrw	DWORD(len2), xmm1, 0	; min value
	vpextrw	DWORD(idx), xmm1, 1	; min index (0...7)
	cmp	len2, 0
	je	len_is_0

	vpshufb	xmm1, [rel dupw]	; duplicate min length across all 8 words
	vpsubw	xmm0, xmm0, xmm1
	vmovdqa	[state + _lens_md5], xmm0

	; "state" and "args" are the same address, arg1
	; len is arg2
	call	md5_x4x2_avx
	; state and idx are intact

len_is_0:
	; process completed job "idx"
	imul	lane_data, idx, _HMAC_SHA1_LANE_DATA_size
	lea	lane_data, [state + _ldata_md5 + lane_data]
	mov	DWORD(extra_blocks), [lane_data + _extra_blocks]
	cmp	extra_blocks, 0
	jne	proc_extra_blocks
	cmp	dword [lane_data + _outer_done], 0
	jne	end_loop

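; inner hash done for lane "idx": write the inner digest into outer_block,
; reload the lane's digest with the precomputed key^opad state, set the lane
; length to one block and point it at outer_block, then go hash the outer block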
proc_outer:
	mov	dword [lane_data + _outer_done], 1
	mov	DWORD(size_offset), [lane_data + _size_offset]
	mov	qword [lane_data + _extra_block + size_offset], 0
	mov	word [state + _lens_md5 + 2*idx], 1
	lea	tmp, [lane_data + _outer_block]
	mov	job, [lane_data + _job_in_lane]
	mov	[state + _args_data_ptr_md5 + PTR_SZ*idx], tmp

	vmovd	xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE]
	vpinsrd	xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 1
	vpinsrd	xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 2
	vpinsrd	xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 3
	vmovdqa	[lane_data + _outer_block], xmm0

	mov	tmp, [job + _auth_key_xor_opad]
	vmovdqu	xmm0, [tmp]
	vmovd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], xmm0
	vpextrd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1
	vpextrd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2
	vpextrd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3
	jmp	copy_lane_data

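; the lane still has buffered (padded) input in extra_block; point the lane
; at it so the inner hash can finish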
	align	16
proc_extra_blocks:
	mov	DWORD(start_offset), [lane_data + _start_offset]
	mov	[state + _lens_md5 + 2*idx], WORD(extra_blocks)
	lea	tmp, [lane_data + _extra_block + start_offset]
	mov	[state + _args_data_ptr_md5 + PTR_SZ*idx], tmp
	mov	dword [lane_data + _extra_blocks], 0
	jmp	copy_lane_data

return_null:
	xor	job_rax, job_rax
	jmp	return

	align	16
end_loop:
	mov	job_rax, [lane_data + _job_in_lane]
	mov	qword [lane_data + _job_in_lane], 0
	or	dword [job_rax + _status], STS_COMPLETED_HMAC
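	; push the freed lane index back onto the unused_lanes nibble stack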
	mov	unused_lanes, [state + _unused_lanes_md5]
	shl	unused_lanes, 4
	or	unused_lanes, idx
	mov	[state + _unused_lanes_md5], unused_lanes

	mov	p, [job_rax + _auth_tag_output]

	; copy 12 bytes
	mov	DWORD(tmp2), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE]
	mov	DWORD(tmp4), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE]
	mov	DWORD(tmp5), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE]
;	bswap	DWORD(tmp2)
;	bswap	DWORD(tmp4)
;	bswap	DWORD(tmp5)
	mov	[p + 0*4], DWORD(tmp2)
	mov	[p + 1*4], DWORD(tmp4)
	mov	[p + 2*4], DWORD(tmp5)

        cmp     qword [job_rax + _auth_tag_output_len_in_bytes], 12
        je      clear_ret

        ; copy 16 bytes
        mov	DWORD(tmp5), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE]
        mov	[p + 3*4], DWORD(tmp5)

clear_ret:

%ifdef SAFE_DATA
        vpxor   xmm0, xmm0

        ;; Clear digest (16B), outer_block (16B) and extra_block (64B)
        ;; of returned job and NULL jobs
%assign I 0
%rep 8
	cmp	qword [state + _ldata_md5 + (I*_HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0
	jne	APPEND(skip_clear_,I)

        ;; Clear digest (16 bytes)
%assign J 0
%rep 4
        mov     dword [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*I + J*MD5_DIGEST_ROW_SIZE], 0
%assign J (J+1)
%endrep

        lea     lane_data, [state + _ldata_md5 + (I*_HMAC_SHA1_LANE_DATA_size)]
        ;; Clear first 64 bytes of extra_block
%assign offset 0
%rep 4
        vmovdqa [lane_data + _extra_block + offset], xmm0
%assign offset (offset + 16)
%endrep

        ;; Clear first 16 bytes of outer_block
        vmovdqa [lane_data + _outer_block], xmm0

APPEND(skip_clear_,I):
%assign I (I+1)
%endrep

%endif ;; SAFE_DATA

return:

	mov	rbx, [rsp + _gpr_save + 8*0]
	mov	rbp, [rsp + _gpr_save + 8*1]
	mov	r12, [rsp + _gpr_save + 8*2]
	mov	r13, [rsp + _gpr_save + 8*3]
	mov	r14, [rsp + _gpr_save + 8*4]
	mov	r15, [rsp + _gpr_save + 8*5]
%ifndef LINUX
	mov	rsi, [rsp + _gpr_save + 8*6]
	mov	rdi, [rsp + _gpr_save + 8*7]
%endif
	mov	rsp, [rsp + _rsp_save]	; original SP

	ret

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif