/*
 * Origin: FreeBSD sys/arm64/arm64/memcmp.S (revision 51a1bf7b).
 */

/* memcmp - compare memory
 *
 * Copyright (c) 2013-2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 */

#include <machine/asm.h>

#define L(l) .L ## l

/* Parameters and result.  */
#define src1		x0
#define src2		x1
#define limit		x2
#define result		w0

/* Internal variables.  */
#define data1		x3
#define data1w		w3
#define data1h		x4
#define data2		x5
#define data2w		w5
#define data2h		x6
#define tmp1		x7
#define tmp2		x8

/*
 * memcmp: compare the first `limit` bytes at src1 and src2.
 *
 * In:   x0 = src1, x1 = src2, x2 = limit (byte count).
 * Out:  w0 = 0 if all compared bytes are equal.  On a mismatch the
 *       8-, 16- and 4-byte paths return -1 or 1 (unsigned ordering of
 *       the first differing byte); the byte loop returns the difference
 *       of the two differing bytes (byte from src1 minus byte from src2).
 * Uses: x3-x7 as scratch and the condition flags; no stack.
 *
 * Strategy by size:
 *   0-7    L(less8): one optional 4-byte compare, then a byte loop.
 *   8-16   two 8-byte compares; the second is positioned to end on the
 *          last byte and may overlap the first.
 *   17-32  two 8-byte compares, then L(last_bytes) on the final 16 bytes.
 *   33+    L(loop16), 16 bytes per iteration, then L(last_bytes).
 *          Above 128 bytes src1 is first rounded down to a 16-byte
 *          boundary.
 */
ENTRY (memcmp)
	/* limit = n - 8; borrow (lo) means fewer than 8 bytes.  */
	subs	limit, limit, 8
	b.lo	L(less8)

	ldr	data1, [src1], 8
	ldr	data2, [src2], 8
	cmp	data1, data2
	b.ne	L(return)

	/* limit = n - 16.  Signed test: it is in [-8..0] for n in 8..16.  */
	subs	limit, limit, 8
	b.gt	L(more16)

	/* 8-16 bytes: src1/src2 already point 8 bytes in, and limit <= 0,
	   so this loads the last 8 bytes of each buffer.  */
	ldr	data1, [src1, limit]
	ldr	data2, [src2, limit]
	b	L(return)

L(more16):
	ldr	data1, [src1], 8
	ldr	data2, [src2], 8
	cmp	data1, data2
	b.ne	L(return)

	/* Jump directly to comparing the last 16 bytes for 32 byte (or less)
	   strings.  */
	subs	limit, limit, 16
	b.ls	L(last_bytes)

	/* We overlap loads between 0-32 bytes at either side of SRC1 when we
	   try to align, so limit it only to strings larger than 128 bytes.
	   (limit is n - 32 here, hence the comparison against 96.)  */
	cmp	limit, 96
	b.ls	L(loop16)

	/* Align src1 and adjust src2 with bytes not yet done.  Both pointers
	   step back by the same amount and limit grows to match, so up to 15
	   already-compared bytes are compared again.  */
	and	tmp1, src1, 15
	add	limit, limit, tmp1
	sub	src1, src1, tmp1
	sub	src2, src2, tmp1

	/* Loop performing 16 bytes per iteration using aligned src1.
	   Limit is pre-decremented by 16 and must be larger than zero.
	   Exit if <= 16 bytes left to do or if the data is not equal.  */
	.p2align 4
L(loop16):
	ldp	data1, data1h, [src1], 16
	ldp	data2, data2h, [src2], 16
	subs	limit, limit, 16
	/* If limit is still > 0 (hi) compare the low words, otherwise force
	   NZCV = 0 (ne).  The high words are compared only if that left eq.  */
	ccmp	data1, data2, 0, hi
	ccmp	data1h, data2h, 0, eq
	b.eq	L(loop16)

	/* Loop left either on a mismatch or because limit ran out; find out
	   which by re-checking the pair just loaded, low word first.  */
	cmp	data1, data2
	b.ne	L(return)
	mov	data1, data1h
	mov	data2, data2h
	cmp	data1, data2
	b.ne	L(return)

	/* Compare last 1-16 bytes using unaligned access.  limit is <= 0
	   here, so the pointers move back to 16 bytes before the end.  */
L(last_bytes):
	add	src1, src1, limit
	add	src2, src2, limit
	ldp	data1, data1h, [src1]
	ldp	data2, data2h, [src2]
	cmp	data1, data2
	b.ne	L(return)
	mov	data1, data1h
	mov	data2, data2h
	cmp	data1, data2

	/* Compare data bytes and set return value to 0, -1 or 1.  */
L(return):
#ifndef __AARCH64EB__
	/* Little-endian: byte-reverse so that the lowest-addressed byte is the
	   most significant and the unsigned compare follows memory order.  */
	rev	data1, data1
	rev	data2, data2
#endif
	cmp	data1, data2
L(ret_eq):
	/* result = 0 if eq, else 1; negated to -1 if data1 < data2 (lo).  */
	cset	result, ne
	cneg	result, result, lo
	ret

	.p2align 4
	/* Compare up to 8 bytes.  Limit is [-8..-1].  */
L(less8):
	/* limit = n - 4; no carry (lo) means fewer than 4 bytes.  */
	adds	limit, limit, 4
	b.lo	L(less4)
	ldr	data1w, [src1], 4
	ldr	data2w, [src2], 4
	cmp	data1w, data2w
	b.ne	L(return)
	sub	limit, limit, 4
L(less4):
	/* limit = number of bytes still to compare (0-3).  When it is zero
	   the adds leaves Z and C set, so L(ret_eq) yields 0.  */
	adds	limit, limit, 4
	b.eq	L(ret_eq)
L(byte_loop):
	ldrb	data1w, [src1], 1
	ldrb	data2w, [src2], 1
	subs	limit, limit, 1
	/* Compare only while bytes remain; otherwise force ne to exit.  */
	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
	b.eq	L(byte_loop)
	sub	result, data1w, data2w
	ret

END (memcmp)