1/*-
2 * Copyright (c) 2023 The FreeBSD Foundation
3 *
4 * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
5 * under sponsorship from the FreeBSD Foundation.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE
27 */
28
29#include <machine/asm.h>
30
31#include "amd64_archlevel.h"
32
/* Alignment directive placed in front of the main loop of each variant below. */
#define ALIGN_TEXT      .p2align 4,0x90 /* 16-byte alignment, nop filled */
34
/*
 * Table of timingsafe_bcmp implementations: names the "scalar" and
 * "baseline" variants that are defined further down in this file.
 * NOTE(review): ARCHFUNCS/ARCHFUNC/ENDARCHFUNCS are macros from
 * amd64_archlevel.h, which is not visible here; presumably they drive
 * the selection between the listed variants -- confirm in that header.
 */
ARCHFUNCS(timingsafe_bcmp)
	ARCHFUNC(timingsafe_bcmp, scalar)
	ARCHFUNC(timingsafe_bcmp, baseline)
ENDARCHFUNCS(timingsafe_bcmp)
39
/*
 * timingsafe_bcmp, scalar variant (general-purpose registers only).
 *
 * In:   %rdi, %rsi = the two buffers, %rdx = length in bytes
 * Out:  %eax = 0 if every compared byte is equal, nonzero otherwise
 * Uses: %rax, %rcx, %r8, %r9, flags
 *
 * Every branch tests only the length, never the buffer contents.
 * Inputs of up to 16 bytes are covered by two loads per buffer, one
 * anchored at the start and one at the end; the two overlap whenever
 * the length is below the maximum of its size class.
 */
ARCHENTRY(timingsafe_bcmp, scalar)
	cmpq	$16, %rdx		# more than 16 bytes: loop path
	ja	.Lscalar_long

	cmpl	$8, %edx		# 9..16 bytes
	ja	.Lscalar_9to16

	cmpl	$4, %edx		# 5..8 bytes
	ja	.Lscalar_5to8

	cmpl	$2, %edx		# 3..4 bytes
	ja	.Lscalar_3to4

	testl	%edx, %edx		# 1..2 bytes
	jne	.Lscalar_1to2

	xorl	%eax, %eax		# zero length: report equal
	ret

	/* 1..2 bytes: first and last byte (the same byte when length is 1) */
.Lscalar_1to2:
	movzbl	(%rdi), %eax
	movzbl	-1(%rdi,%rdx), %ecx
	xorb	(%rsi), %al		# difference bits of the first byte
	xorb	-1(%rsi,%rdx), %cl	# difference bits of the last byte
	orl	%ecx, %eax		# nonzero iff either byte differs
	ret

	/* 3..4 bytes: first and last 16-bit word */
.Lscalar_3to4:
	movzwl	(%rdi), %eax
	movzwl	-2(%rdi,%rdx), %ecx
	xorw	(%rsi), %ax
	xorw	-2(%rsi,%rdx), %cx
	orl	%ecx, %eax
	ret

	/* 5..8 bytes: first and last 32-bit word */
.Lscalar_5to8:
	movl	(%rdi), %eax
	movl	-4(%rdi,%rdx), %ecx
	xorl	(%rsi), %eax
	xorl	-4(%rsi,%rdx), %ecx
	orl	%ecx, %eax
	ret

	/* 9..16 bytes: first and last 64-bit word */
.Lscalar_9to16:
	movq	(%rdi), %rax
	movq	-8(%rdi,%rdx), %rcx
	xorq	(%rsi), %rax
	xorq	-8(%rsi,%rdx), %rcx
	orq	%rcx, %rax
	setne	%al			# a difference confined to the upper
	ret				# half of RAX must still show in EAX

	/*
	 * More than 16 bytes.  RAX collects difference bits; RCX holds the
	 * end offset of the next 16-byte chunk to be examined.
	 */
.Lscalar_long:
	movq	(%rdi), %rax		# bytes 0..15
	movq	8(%rdi), %r9
	movl	$32, %ecx
	xorq	(%rsi), %rax
	xorq	8(%rsi), %r9
	orq	%r9, %rax

	cmpq	%rdx, %rcx		# at most 32 bytes: tail covers the rest
	jae	.Lscalar_tail

	/* main loop: one 16-byte chunk per pass */
	ALIGN_TEXT
.Lscalar_loop:
	movq	-16(%rdi,%rcx), %r8
	movq	-8(%rdi,%rcx), %r9
	xorq	-16(%rsi,%rcx), %r8
	xorq	-8(%rsi,%rcx), %r9
	addq	$16, %rcx
	orq	%r9, %r8
	orq	%r8, %rax

	cmpq	%rdx, %rcx		# repeat while the next chunk ends
	jb	.Lscalar_loop		# before the buffer does

	/* final 16 bytes, addressed from the end; may overlap earlier chunks */
.Lscalar_tail:
	movq	-16(%rdi,%rdx), %r8
	movq	-8(%rdi,%rdx), %r9
	xorq	-16(%rsi,%rdx), %r8
	xorq	-8(%rsi,%rdx), %r9
	orq	%r9, %r8
	orq	%r8, %rax
	setne	%al			# fold all 64 bits of RAX into EAX
	ret
ARCHEND(timingsafe_bcmp, scalar)
122
/*
 * timingsafe_bcmp, baseline variant (SSE2 for inputs above 16 bytes).
 *
 * In:   %rdi, %rsi = the two buffers, %rdx = length in bytes
 * Out:  %eax = 0 if every compared byte is equal, nonzero otherwise
 * Uses: %rax, %rcx, %xmm0-%xmm4, flags
 *
 * Every branch tests only the length, never the buffer contents.
 * Inputs of up to 32 bytes are covered by two loads per buffer, one
 * anchored at the start and one at the end; the two overlap whenever
 * the length is below the maximum of its size class.
 */
ARCHENTRY(timingsafe_bcmp, baseline)
	cmpq	$32, %rdx		# more than 32 bytes: loop path
	ja	.Lbase_long

	cmpl	$16, %edx		# 17..32 bytes
	ja	.Lbase_17to32

	cmpl	$8, %edx		# 9..16 bytes
	ja	.Lbase_9to16

	cmpl	$4, %edx		# 5..8 bytes
	ja	.Lbase_5to8

	cmpl	$2, %edx		# 3..4 bytes
	ja	.Lbase_3to4

	testl	%edx, %edx		# 1..2 bytes
	jne	.Lbase_1to2

	xorl	%eax, %eax		# zero length: report equal
	ret

	/* 1..2 bytes: first and last byte (the same byte when length is 1) */
.Lbase_1to2:
	movzbl	(%rdi), %eax
	movzbl	-1(%rdi,%rdx), %ecx
	xorb	(%rsi), %al		# difference bits of the first byte
	xorb	-1(%rsi,%rdx), %cl	# difference bits of the last byte
	orl	%ecx, %eax		# nonzero iff either byte differs
	ret

	/* 3..4 bytes: first and last 16-bit word */
.Lbase_3to4:
	movzwl	(%rdi), %eax
	movzwl	-2(%rdi,%rdx), %ecx
	xorw	(%rsi), %ax
	xorw	-2(%rsi,%rdx), %cx
	orl	%ecx, %eax
	ret

	/* 5..8 bytes: first and last 32-bit word */
.Lbase_5to8:
	movl	(%rdi), %eax
	movl	-4(%rdi,%rdx), %ecx
	xorl	(%rsi), %eax
	xorl	-4(%rsi,%rdx), %ecx
	orl	%ecx, %eax
	ret

	/* 9..16 bytes: first and last 64-bit word */
.Lbase_9to16:
	movq	(%rdi), %rax
	movq	-8(%rdi,%rdx), %rcx
	xorq	(%rsi), %rax
	xorq	-8(%rsi,%rdx), %rcx
	orq	%rcx, %rax
	setne	%al			# a difference confined to the upper
	ret				# half of RAX must still show in EAX

	/* 17..32 bytes: first and last 16-byte vector */
.Lbase_17to32:
	movdqu		(%rdi), %xmm0
	movdqu		(%rsi), %xmm2
	movdqu		-16(%rdi,%rdx), %xmm1
	movdqu		-16(%rsi,%rdx), %xmm3
	pcmpeqb		%xmm2, %xmm0	# 0xff in every byte lane that matches
	pcmpeqb		%xmm3, %xmm1
	pand		%xmm1, %xmm0	# lane survives only if both vectors match
	pmovmskb	%xmm0, %eax	# one bit per matching lane
	xorl		$0xffff, %eax	# flip: one bit per mismatching lane
	ret

	/*
	 * More than 32 bytes.  XMM4 collects the lanes that matched so far;
	 * RCX holds the end offset of the next 32-byte chunk to be examined.
	 */
.Lbase_long:
	movdqu		(%rdi), %xmm4	# bytes 0..31
	movdqu		(%rsi), %xmm2
	movdqu		16(%rdi), %xmm1
	movdqu		16(%rsi), %xmm3
	movl		$64, %ecx
	pcmpeqb		%xmm2, %xmm4
	pcmpeqb		%xmm3, %xmm1
	pand		%xmm1, %xmm4
	cmpq		%rdx, %rcx	# at most 64 bytes: tail covers the rest
	jae		.Lbase_tail

	/* main loop: one 32-byte chunk per pass */
	ALIGN_TEXT
.Lbase_loop:
	movdqu		-32(%rdi,%rcx), %xmm0
	movdqu		-32(%rsi,%rcx), %xmm2
	movdqu		-16(%rdi,%rcx), %xmm1
	movdqu		-16(%rsi,%rcx), %xmm3
	addq		$32, %rcx
	pcmpeqb		%xmm2, %xmm0
	pcmpeqb		%xmm3, %xmm1
	pand		%xmm1, %xmm0
	pand		%xmm0, %xmm4	# drop lanes that mismatched in this chunk
	cmpq		%rdx, %rcx	# repeat while the next chunk ends
	jb		.Lbase_loop	# before the buffer does

	/* final 32 bytes, addressed from the end; may overlap earlier chunks */
.Lbase_tail:
	movdqu		-32(%rdi,%rdx), %xmm0
	movdqu		-32(%rsi,%rdx), %xmm2
	movdqu		-16(%rdi,%rdx), %xmm1
	movdqu		-16(%rsi,%rdx), %xmm3
	pcmpeqb		%xmm2, %xmm0
	pcmpeqb		%xmm3, %xmm1
	pand		%xmm1, %xmm0
	pand		%xmm4, %xmm0	# merge with the accumulated result
	pmovmskb	%xmm0, %eax	# one bit per lane that always matched
	xorl		$0xffff, %eax	# flip: nonzero iff some lane mismatched
	ret
ARCHEND(timingsafe_bcmp, baseline)
231
	/*
	 * Empty .note.GNU-stack section: by GNU toolchain convention this
	 * declares that the object does not need an executable stack.
	 */
	.section .note.GNU-stack,"",%progbits
233