/*-
 * Copyright (c) 2018-2019 The FreeBSD Foundation
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 1993 The Regents of the University of California.
 * All rights reserved.
 *
 * Portions of this software were developed by
 * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
 * the FreeBSD Foundation.
 *
 * Primarily rewritten and redeveloped by Mateusz Guzik
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */
/*
 * Macros to help implement memcmp(), bcmp(),
 *			    bzero(), memset(),
 *			    memcpy(), bcopy(), memmove()
 */
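/*
 * Each macro takes the instruction sequence it should finish with as its
 * "end" argument, so one body can back several entry points.  A hypothetical
 * instantiation (the ENTRY/END bracketing and the choice of "end=ret" here
 * are illustrative only, not copied from any particular consumer of these
 * macros):
 *
 *	ENTRY(memcmp)
 *		MEMCMP end=ret
 *	END(memcmp)
 *
 *	ENTRY(memmove)
 *		MEMMOVE erms=0 overlap=1 end=ret
 *	END(memmove)
 */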

/*
 * memcmp(b1, b2, len)
 *	  rdi,rsi,rdx
 */
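/*
 * For the small cases (8..16 and 16..32 bytes) the whole range is covered by
 * loads anchored at both ends: one at the start and one at offset len-8 (or
 * len-16), which may overlap.  A minimal C sketch of the 8..16 byte case,
 * equality-only (bcmp-style); the function name and the use of memcpy() for
 * the unaligned 64-bit loads are illustrative (assumes <stdint.h> and
 * <string.h>):
 *
 *	static int
 *	cmp_8_to_16(const unsigned char *a, const unsigned char *b, size_t len)
 *	{
 *		uint64_t a0, b0, a1, b1;
 *
 *		memcpy(&a0, a, 8);
 *		memcpy(&b0, b, 8);
 *		memcpy(&a1, a + len - 8, 8);	// tail load, may overlap head
 *		memcpy(&b1, b + len - 8, 8);
 *		return (a0 != b0 || a1 != b1);
 *	}
 *
 * The macro additionally narrows a detected mismatch down to the first
 * differing byte so that memcmp() can return the byte difference.
 */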
.macro MEMCMP end
	xorl	%eax,%eax
10:
	cmpq	$16,%rdx
	ja	101632f

100816:
	cmpb	$8,%dl
	jl	100408f
	movq	(%rdi),%r8
	movq	(%rsi),%r9
	cmpq	%r8,%r9
	jne	80f
	movq	-8(%rdi,%rdx),%r8
	movq	-8(%rsi,%rdx),%r9
	cmpq	%r8,%r9
	jne	10081608f
	\end
100408:
	cmpb	$4,%dl
	jl	100204f
	movl	(%rdi),%r8d
	movl	(%rsi),%r9d
	cmpl	%r8d,%r9d
	jne	80f
	movl	-4(%rdi,%rdx),%r8d
	movl	-4(%rsi,%rdx),%r9d
	cmpl	%r8d,%r9d
	jne	10040804f
	\end
100204:
	cmpb	$2,%dl
	jl	100001f
	movzwl	(%rdi),%r8d
	movzwl	(%rsi),%r9d
	cmpl	%r8d,%r9d
	jne	1f
	movzwl	-2(%rdi,%rdx),%r8d
	movzwl	-2(%rsi,%rdx),%r9d
	cmpl	%r8d,%r9d
	jne	1f
	\end
100001:
	cmpb	$1,%dl
	jl	100000f
	movzbl	(%rdi),%eax
	movzbl	(%rsi),%r8d
	subl	%r8d,%eax
100000:
	\end
ALIGN_TEXT
101632:
	cmpq	$32,%rdx
	ja	103200f
	movq	(%rdi),%r8
	movq	(%rsi),%r9
	cmpq	%r8,%r9
	jne	80f
	movq	8(%rdi),%r8
	movq	8(%rsi),%r9
	cmpq	%r8,%r9
	jne	10163208f
	movq	-16(%rdi,%rdx),%r8
	movq	-16(%rsi,%rdx),%r9
	cmpq	%r8,%r9
	jne	10163216f
	movq	-8(%rdi,%rdx),%r8
	movq	-8(%rsi,%rdx),%r9
	cmpq	%r8,%r9
	jne	10163224f
	\end
ALIGN_TEXT
103200:
	movq	(%rdi),%r8
	movq	8(%rdi),%r9
	subq	(%rsi),%r8
	subq	8(%rsi),%r9
	orq	%r8,%r9
	jnz	10320000f

	movq	16(%rdi),%r8
	movq	24(%rdi),%r9
	subq	16(%rsi),%r8
	subq	24(%rsi),%r9
	orq	%r8,%r9
	jnz	10320016f

	leaq	32(%rdi),%rdi
	leaq	32(%rsi),%rsi
	subq	$32,%rdx
	cmpq	$32,%rdx
	jae	103200b
	cmpb	$0,%dl
	jne	10b
	\end

/*
 * Mismatch was found.
 *
 * Before we compute it we narrow down the range (16 -> 8 -> 4 bytes).
 */
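/*
 * In C terms, the narrowing keeps re-checking smaller pieces of the
 * mismatching chunk until at most 4 bytes remain, and the final step compares
 * those one by one.  A sketch of that final step (illustrative; a mismatch is
 * already known to exist within a[0..3]):
 *
 *	static int
 *	final_bytes(const unsigned char *a, const unsigned char *b)
 *	{
 *		int i;
 *
 *		for (i = 0; i < 3; i++)
 *			if (a[i] != b[i])
 *				break;
 *		return (a[i] - b[i]);
 *	}
 */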
ALIGN_TEXT
10320016:
	leaq	16(%rdi),%rdi
	leaq	16(%rsi),%rsi
10320000:
	movq	(%rdi),%r8
	movq	(%rsi),%r9
	cmpq	%r8,%r9
	jne	80f
	leaq	8(%rdi),%rdi
	leaq	8(%rsi),%rsi
	jmp	80f
ALIGN_TEXT
10081608:
10163224:
	leaq	-8(%rdi,%rdx),%rdi
	leaq	-8(%rsi,%rdx),%rsi
	jmp	80f
ALIGN_TEXT
10163216:
	leaq	-16(%rdi,%rdx),%rdi
	leaq	-16(%rsi,%rdx),%rsi
	jmp	80f
ALIGN_TEXT
10163208:
	leaq	8(%rdi),%rdi
	leaq	8(%rsi),%rsi
	jmp	80f
ALIGN_TEXT
10040804:
	leaq	-4(%rdi,%rdx),%rdi
	leaq	-4(%rsi,%rdx),%rsi
	jmp	1f

ALIGN_TEXT
80:
	movl	(%rdi),%r8d
	movl	(%rsi),%r9d
	cmpl	%r8d,%r9d
	jne	1f
	leaq	4(%rdi),%rdi
	leaq	4(%rsi),%rsi

/*
 * We have up to 4 bytes to inspect.
 */
1:
	movzbl	(%rdi),%eax
	movzbl	(%rsi),%r8d
	cmpb	%r8b,%al
	jne	2f

	movzbl	1(%rdi),%eax
	movzbl	1(%rsi),%r8d
	cmpb	%r8b,%al
	jne	2f

	movzbl	2(%rdi),%eax
	movzbl	2(%rsi),%r8d
	cmpb	%r8b,%al
	jne	2f

	movzbl	3(%rdi),%eax
	movzbl	3(%rsi),%r8d
2:
	subl	%r8d,%eax
	\end
.endm

/*
 * memmove(dst, src, cnt)
 *         rdi, rsi, rdx
 */

/*
 * Register state at entry is supposed to be as follows:
 * rdi - destination
 * rsi - source
 * rdx - count
 *
 * The macro possibly clobbers the above and: rcx, r8, r9, r10
 * It does not clobber rax nor r11.
 */
.macro MEMMOVE erms overlap end
	/*
	 * For sizes 0..32 all data is read before it is written, so there
	 * is no correctness issue with direction of copying.
	 */
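	/*
	 * The same idea in C for the 16..32 byte case: all four loads (which
	 * may overlap) are issued before any store, so the copy is correct
	 * for overlapping buffers regardless of direction.  A minimal sketch;
	 * the function name and the use of memcpy() for unaligned 64-bit
	 * accesses are illustrative only (assumes <stdint.h>, <string.h>):
	 *
	 *	static void
	 *	copy_16_to_32(char *dst, const char *src, size_t len)
	 *	{
	 *		uint64_t a, b, c, d;
	 *
	 *		memcpy(&a, src, 8);
	 *		memcpy(&b, src + 8, 8);
	 *		memcpy(&c, src + len - 16, 8);
	 *		memcpy(&d, src + len - 8, 8);
	 *		memcpy(dst, &a, 8);
	 *		memcpy(dst + 8, &b, 8);
	 *		memcpy(dst + len - 16, &c, 8);
	 *		memcpy(dst + len - 8, &d, 8);
	 *	}
	 */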
	movq	%rdx,%rcx
	cmpq	$32,%rdx
	jbe	101632f

.if \overlap == 1
	movq	%rdi,%r8
	subq	%rsi,%r8
	cmpq	%rcx,%r8	/* overlapping && src < dst? */
	jb	2f
.endif
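	/*
	 * A C rendering of the test above: the unsigned difference dst - src
	 * wraps to a huge value when src > dst, so a single comparison covers
	 * "regions overlap and src < dst", the only case that needs a
	 * backwards copy.
	 *
	 *	if ((uintptr_t)dst - (uintptr_t)src < (uintptr_t)len) {
	 *		// overlapping with src < dst: copy backwards
	 *	}
	 */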

	/*
	 * rep movs-based copying only wins for larger sizes and the
	 * crossover point differs between implementations, so fall
	 * through to the plain loop below the 256-byte cutoff.
	 */
	cmpq	$256,%rcx
	ja	1256f

103200:
	movq	(%rsi),%rdx
	movq	%rdx,(%rdi)
	movq	8(%rsi),%rdx
	movq	%rdx,8(%rdi)
	movq	16(%rsi),%rdx
	movq	%rdx,16(%rdi)
	movq	24(%rsi),%rdx
	movq	%rdx,24(%rdi)
	leaq	32(%rsi),%rsi
	leaq	32(%rdi),%rdi
	subq	$32,%rcx
	cmpq	$32,%rcx
	jae	103200b
	cmpb	$0,%cl
	jne	101632f
	\end
	ALIGN_TEXT
101632:
	cmpb	$16,%cl
	jl	100816f
	movq	(%rsi),%rdx
	movq	8(%rsi),%r8
	movq	-16(%rsi,%rcx),%r9
	movq	-8(%rsi,%rcx),%r10
	movq	%rdx,(%rdi)
	movq	%r8,8(%rdi)
	movq	%r9,-16(%rdi,%rcx)
	movq	%r10,-8(%rdi,%rcx)
	\end
	ALIGN_TEXT
100816:
	cmpb	$8,%cl
	jl	100408f
	movq	(%rsi),%rdx
	movq	-8(%rsi,%rcx),%r8
	movq	%rdx,(%rdi)
	movq	%r8,-8(%rdi,%rcx)
	\end
	ALIGN_TEXT
100408:
	cmpb	$4,%cl
	jl	100204f
	movl	(%rsi),%edx
	movl	-4(%rsi,%rcx),%r8d
	movl	%edx,(%rdi)
	movl	%r8d,-4(%rdi,%rcx)
	\end
	ALIGN_TEXT
100204:
	cmpb	$2,%cl
	jl	100001f
	movzwl	(%rsi),%edx
	movzwl	-2(%rsi,%rcx),%r8d
	movw	%dx,(%rdi)
	movw	%r8w,-2(%rdi,%rcx)
	\end
	ALIGN_TEXT
100001:
	cmpb	$1,%cl
	jl	100000f
	movb	(%rsi),%dl
	movb	%dl,(%rdi)
100000:
	\end

	/*
	 * 256 or more bytes
	 */
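	/*
	 * If the destination is already 16-byte aligned, rep movs runs
	 * directly.  Otherwise (the "100" path below) the first 16 source
	 * bytes are saved up front, both pointers are advanced to the next
	 * 16-byte boundary of the destination, the count is trimmed
	 * accordingly, and the saved head is stored afterwards.  A C sketch
	 * of that fix-up; bulk_copy() is a placeholder for the rep
	 * movs-based inner copy and the name is illustrative only:
	 *
	 *	static void
	 *	copy_align_dst(char *dst, const char *src, size_t len)
	 *	{
	 *		char head[16];
	 *		size_t skip;
	 *
	 *		memcpy(head, src, 16);
	 *		skip = 16 - ((uintptr_t)dst & 15);	// 1..15 here
	 *		bulk_copy(dst + skip, src + skip, len - skip);
	 *		memcpy(dst, head, 16);
	 *	}
	 */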
	ALIGN_TEXT
1256:
	testb	$15,%dil
	jnz	100f
.if \erms == 1
	rep
	movsb
.else
	shrq	$3,%rcx                         /* copy by 64-bit words */
	rep
	movsq
	movq	%rdx,%rcx
	andl	$7,%ecx                         /* any bytes left? */
	jne	100408b
.endif
	\end
100:
	movq	(%rsi),%r8
	movq	8(%rsi),%r9
	movq	%rdi,%r10
	movq	%rdi,%rcx
	andq	$15,%rcx
	leaq	-16(%rdx,%rcx),%rdx
	neg	%rcx
	leaq	16(%rdi,%rcx),%rdi
	leaq	16(%rsi,%rcx),%rsi
	movq	%rdx,%rcx
.if \erms == 1
	rep
	movsb
	movq	%r8,(%r10)
	movq	%r9,8(%r10)
.else
	shrq	$3,%rcx                         /* copy by 64-bit words */
	rep
	movsq
	movq	%r8,(%r10)
	movq	%r9,8(%r10)
	movq	%rdx,%rcx
	andl	$7,%ecx                         /* any bytes left? */
	jne	100408b
.endif
	\end

.if \overlap == 1
	/*
	 * Copy backwards.
	 */
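	/*
	 * Going from the high addresses down is what makes the overlapping
	 * src < dst case safe: every byte is read before the copy can
	 * overwrite it.  A byte-at-a-time C equivalent (the code below does
	 * the same in 32/16/8/4/2/1-byte chunks, or with std + rep movs for
	 * large sizes):
	 *
	 *	static void
	 *	copy_backwards(char *dst, const char *src, size_t len)
	 *	{
	 *		while (len > 0) {
	 *			len--;
	 *			dst[len] = src[len];
	 *		}
	 *	}
	 */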
	ALIGN_TEXT
2:
	cmpq	$256,%rcx
	ja	2256f

	leaq	-8(%rdi,%rcx),%rdi
	leaq	-8(%rsi,%rcx),%rsi

	cmpq	$32,%rcx
	jb	2016f

2032:
	movq	(%rsi),%rdx
	movq	%rdx,(%rdi)
	movq	-8(%rsi),%rdx
	movq	%rdx,-8(%rdi)
	movq	-16(%rsi),%rdx
	movq	%rdx,-16(%rdi)
	movq	-24(%rsi),%rdx
	movq	%rdx,-24(%rdi)
	leaq	-32(%rsi),%rsi
	leaq	-32(%rdi),%rdi
	subq	$32,%rcx
	cmpq	$32,%rcx
	jae	2032b
	cmpb	$0,%cl
	jne	2016f
	\end
	ALIGN_TEXT
2016:
	cmpb	$16,%cl
	jl	2008f
	movq	(%rsi),%rdx
	movq	%rdx,(%rdi)
	movq	-8(%rsi),%rdx
	movq	%rdx,-8(%rdi)
	subb	$16,%cl
	jz	2000f
	leaq	-16(%rsi),%rsi
	leaq	-16(%rdi),%rdi
2008:
	cmpb	$8,%cl
	jl	2004f
	movq	(%rsi),%rdx
	movq	%rdx,(%rdi)
	subb	$8,%cl
	jz	2000f
	leaq	-8(%rsi),%rsi
	leaq	-8(%rdi),%rdi
2004:
	cmpb	$4,%cl
	jl	2002f
	movl	4(%rsi),%edx
	movl	%edx,4(%rdi)
	subb	$4,%cl
	jz	2000f
	leaq	-4(%rsi),%rsi
	leaq	-4(%rdi),%rdi
2002:
	cmpb	$2,%cl
	jl	2001f
	movw	6(%rsi),%dx
	movw	%dx,6(%rdi)
	subb	$2,%cl
	jz	2000f
	leaq	-2(%rsi),%rsi
	leaq	-2(%rdi),%rdi
2001:
	cmpb	$1,%cl
	jl	2000f
	movb	7(%rsi),%dl
	movb	%dl,7(%rdi)
2000:
	\end
	ALIGN_TEXT
2256:
	std
.if \erms == 1
	leaq	-1(%rdi,%rcx),%rdi
	leaq	-1(%rsi,%rcx),%rsi
	rep
	movsb
	cld
.else
	leaq	-8(%rdi,%rcx),%rdi
	leaq	-8(%rsi,%rcx),%rsi
	shrq	$3,%rcx
	rep
	movsq
	cld
	movq	%rdx,%rcx
	andb	$7,%cl
	jne	2004b
.endif
	\end
.endif
.endm

/*
 * memset(dst, c,   len)
 *        rdi, r10, rdx
 */
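/*
 * The macro expects the caller to have replicated the fill byte into all
 * eight bytes of %r10: the same register feeds the quadword stores, the
 * narrower stores (%r10d/%r10w/%r10b) and, via %rax, rep stos.  The usual
 * broadcast, sketched in C:
 *
 *	uint64_t pattern = (uint64_t)(unsigned char)c * 0x0101010101010101ULL;
 */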
.macro MEMSET erms end
	movq	%rdi,%rax
	movq	%rdx,%rcx

	cmpq	$32,%rcx
	jbe	101632f

	cmpq	$256,%rcx
	ja	1256f

103200:
	movq	%r10,(%rdi)
	movq	%r10,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r10,24(%rdi)
	leaq	32(%rdi),%rdi
	subq	$32,%rcx
	cmpq	$32,%rcx
	ja	103200b
	cmpb	$16,%cl
	ja	201632f
	movq	%r10,-16(%rdi,%rcx)
	movq	%r10,-8(%rdi,%rcx)
	\end
	ALIGN_TEXT
101632:
	cmpb	$16,%cl
	jl	100816f
201632:
	movq	%r10,(%rdi)
	movq	%r10,8(%rdi)
	movq	%r10,-16(%rdi,%rcx)
	movq	%r10,-8(%rdi,%rcx)
	\end
	ALIGN_TEXT
100816:
	cmpb	$8,%cl
	jl	100408f
	movq	%r10,(%rdi)
	movq	%r10,-8(%rdi,%rcx)
	\end
	ALIGN_TEXT
100408:
	cmpb	$4,%cl
	jl	100204f
	movl	%r10d,(%rdi)
	movl	%r10d,-4(%rdi,%rcx)
	\end
	ALIGN_TEXT
100204:
	cmpb	$2,%cl
	jl	100001f
	movw	%r10w,(%rdi)
	movw	%r10w,-2(%rdi,%rcx)
	\end
	ALIGN_TEXT
100001:
	cmpb	$0,%cl
	je	100000f
	movb	%r10b,(%rdi)
100000:
	\end
	ALIGN_TEXT
1256:
	movq	%rdi,%r9
	movq	%r10,%rax
	testl	$15,%edi
	jnz	3f
1:
.if \erms == 1
	rep
	stosb
	movq	%r9,%rax
.else
	movq	%rcx,%rdx
	shrq	$3,%rcx
	rep
	stosq
	movq	%r9,%rax
	andl	$7,%edx
	jnz	2f
	\end
2:
	movq	%r10,-8(%rdi,%rdx)
.endif
	\end
	ALIGN_TEXT
3:
	movq	%r10,(%rdi)
	movq	%r10,8(%rdi)
	movq	%rdi,%r8
	andq	$15,%r8
	leaq	-16(%rcx,%r8),%rcx
	neg	%r8
	leaq	16(%rdi,%r8),%rdi
	jmp	1b
.endm

.macro DUMMYARG
.endm