xref: /minix/common/lib/libc/arch/i386/string/memchr.S (revision 0a6a1f1d)
/*
 * Written by J.T. Conklin <jtc@acorntoolworks.com>
 * Public domain.
 */

#include <machine/asm.h>

#if defined(LIBC_SCCS)
	RCSID("$NetBSD: memchr.S,v 1.2 2014/03/22 19:38:46 jakllsch Exp $")
#endif

ENTRY(memchr)
	pushl	%esi
	movl	8(%esp),%eax	/* s: start of buffer */
	movzbl	12(%esp),%ecx	/* c: byte to search for, as unsigned char */
	movl	16(%esp),%esi	/* n: number of bytes to search */
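
	/*
	 * Annotation added for clarity (not part of the original
	 * source): in cdecl terms, the three loads above bind the
	 * standard C prototype
	 *
	 *	void *memchr(const void *s, int c, size_t n);
	 *
	 * with %eax = s, %ecx = (unsigned char)c, %esi = n; the
	 * result is returned in %eax.
	 */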

	/*
	 * Align to word boundary.
	 * Consider unrolling loop?
	 */
	testl	%esi,%esi	/* nbytes == 0? */
	je	.Lzero
.Lalign:
	testb	$3,%al
	je	.Lword_aligned
	cmpb	(%eax),%cl
	je	.Ldone
	incl	%eax
	decl	%esi
	jnz	.Lalign
	jmp	.Lzero
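
	/*
	 * A rough C sketch of the alignment prologue above (added
	 * for clarity, not part of the original source):
	 *
	 *	const unsigned char *p = s;
	 *	while (n != 0 && ((uintptr_t)p & 3) != 0) {
	 *		if (*p == (unsigned char)c)
	 *			return (void *)(uintptr_t)p;
	 *		p++, n--;
	 *	}
	 *	if (n == 0)
	 *		return NULL;
	 */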

.Lword_aligned:
	/* copy char to all bytes in word */
	movb	%cl,%ch
	movl	%ecx,%edx
	sall	$16,%ecx
	orl	%edx,%ecx

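	/*
	 * The broadcast above, in C terms (sketch added for
	 * clarity; 0xcc stands for the search byte):
	 *
	 *	uint32_t mask = (uint8_t)c;	// mask == 0x000000cc
	 *	mask |= mask << 8;		// mask == 0x0000cccc
	 *	mask |= mask << 16;		// mask == 0xcccccccc
	 */
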
	_ALIGN_TEXT
.Lloop:
	cmpl	$3,%esi		/* nbytes >= 4? */
	jbe	.Lbyte
	movl	(%eax),%edx
	addl	$4,%eax
	xorl	%ecx,%edx
	subl	$4,%esi
	subl	$0x01010101,%edx
	testl	$0x80808080,%edx
	je	.Lloop

	/*
	 * In rare cases, the above loop may exit prematurely. We must
	 * return to the loop if none of the bytes in the word are
	 * equal to ch.
	 */

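	/*
	 * Why the loop can exit prematurely: since %edx holds
	 * word ^ mask, a zero byte in %edx means the corresponding
	 * byte of the word equals ch, and the loop detects zero
	 * bytes with a cheap SWAR test. The exact test would be,
	 * in C (sketch added for clarity):
	 *
	 *	int has_zero_byte(uint32_t x)
	 *	{
	 *		return ((x - 0x01010101U) & ~x & 0x80808080U) != 0;
	 *	}
	 *
	 * Dropping the "& ~x" term saves work in the hot loop, but
	 * the test can then also fire when no byte is zero (e.g.
	 * some byte of x is >= 0x81); the byte checks below filter
	 * out those false positives.
	 */
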
	/*
	 * High load-use latency on the Athlon leads to significant
	 * stalls, so we preload the next char as soon as possible
	 * instead of using cmp mem8, reg8.
	 *
	 * Alignment here avoids a stall on the Athlon, even though
	 * it's not a branch target.
	 */
	_ALIGN_TEXT
	cmpb	-4(%eax),%cl	/* 1st byte == ch? */
	movb	-3(%eax),%dl
	jne	1f
	subl	$4,%eax
	jmp	.Ldone

	_ALIGN_TEXT
1:	cmpb	%dl,%cl		/* 2nd byte == ch? */
	movb	-2(%eax),%dl
	jne	1f
	subl	$3,%eax
	jmp	.Ldone

	_ALIGN_TEXT
1:	cmpb	%dl,%cl		/* 3rd byte == ch? */
	movb	-1(%eax),%dl
	jne	1f
	subl	$2,%eax
	jmp	.Ldone

	_ALIGN_TEXT
1:	cmpb	%dl,%cl		/* 4th byte == ch? */
	jne	.Lloop
	decl	%eax
	jmp	.Ldone
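
	/*
	 * What the four checks above compute, as a C sketch (added
	 * for clarity; %eax was already advanced past the word):
	 *
	 *	const unsigned char *w = p;	// one past the word
	 *	if (w[-4] == ch) return (void *)(uintptr_t)(w - 4);
	 *	if (w[-3] == ch) return (void *)(uintptr_t)(w - 3);
	 *	if (w[-2] == ch) return (void *)(uintptr_t)(w - 2);
	 *	if (w[-1] == ch) return (void *)(uintptr_t)(w - 1);
	 *	// none matched: the test fired spuriously; resume loop
	 */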

.Lbyte:
	testl	%esi,%esi
	je	.Lzero
.Lbyte_loop:
	cmpb	(%eax),%cl
	je	.Ldone
	incl	%eax
	decl	%esi
	jnz	.Lbyte_loop

.Lzero:
	xorl	%eax,%eax

.Ldone:
	popl	%esi
	ret
END(memchr)
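
/*
 * For reference, a rough portable C equivalent of the algorithm
 * above (a sketch added for clarity, not part of the original
 * source; the assembly additionally schedules loads early to
 * hide load-use latency):
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	void *
 *	memchr(const void *s, int c, size_t n)
 *	{
 *		const unsigned char *p = s;
 *		unsigned char ch = (unsigned char)c;
 *		uint32_t mask = ch * 0x01010101U;
 *
 *		for (; n != 0 && ((uintptr_t)p & 3) != 0; p++, n--)
 *			if (*p == ch)
 *				return (void *)(uintptr_t)p;
 *		for (; n >= 4; p += 4, n -= 4) {
 *			uint32_t x = *(const uint32_t *)p ^ mask;
 *			if ((x - 0x01010101U) & ~x & 0x80808080U)
 *				break;	// a byte of this word matches
 *		}
 *		for (; n != 0; p++, n--)
 *			if (*p == ch)
 *				return (void *)(uintptr_t)p;
 *		return NULL;
 *	}
 */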