/* xref: /linux/arch/loongarch/lib/memcpy.S (revision 84b9b44b) */
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
 */

#include <asm/alternative-asm.h>
#include <asm/asm.h>
#include <asm/asmmacro.h>
#include <asm/cpu.h>
#include <asm/export.h>
#include <asm/regdef.h>

SYM_FUNC_START(memcpy)
	/*
	 * void *memcpy(void *dst, const void *src, size_t n)
	 *
	 * Runtime-patched dispatcher: on CPUs with hardware unaligned
	 * access support (CPU_FEATURE_UAL) the ALTERNATIVE macro patches
	 * this to branch to __memcpy_fast; otherwise the byte-wise
	 * __memcpy_generic fallback is used.
	 */
	ALTERNATIVE	"b __memcpy_generic", \
			"b __memcpy_fast", CPU_FEATURE_UAL
SYM_FUNC_END(memcpy)
/* excluded from kprobes instrumentation, like the other entries below */
_ASM_NOKPROBE(memcpy)

EXPORT_SYMBOL(memcpy)
23
/*
 * void *__memcpy_generic(void *dst, const void *src, size_t n)
 *
 * a0: dst
 * a1: src
 * a2: n
 *
 * Byte-at-a-time fallback for CPUs without hardware unaligned access.
 * Returns the original dst in a0.
 */
SYM_FUNC_START(__memcpy_generic)
	move	a3, a0			/* save dst; a0 is advanced by the loop */
	beqz	a2, 2f			/* n == 0: nothing to copy */

1:	ld.b	t0, a1, 0		/* *dst++ = *src++, one byte per pass */
	st.b	t0, a0, 0
	addi.d	a0, a0, 1
	addi.d	a1, a1, 1
	addi.d	a2, a2, -1		/* --n */
	bgt	a2, zero, 1b

2:	move	a0, a3			/* return original dst */
	jr	ra
SYM_FUNC_END(__memcpy_generic)
_ASM_NOKPROBE(__memcpy_generic)
46
/*
 * __memcpy_small: copy n = 0..8 bytes (a0 = dst, a1 = src, a2 = n)
 * via a computed jump table.
 *
 * pcaddi t0, 8 yields PC + 8*4 = 32, i.e. the address of entry "0:";
 * every entry below is padded to 32 bytes (.align 5), so the entry for
 * a given n lives at table_base + (n << 5).  Each entry returns
 * directly with jr ra.  a0 (dst) is never modified, so it remains the
 * return value.
 */
	.align	5
SYM_FUNC_START_NOALIGN(__memcpy_small)
	pcaddi	t0, 8			/* t0 = entry "0:", 32 bytes ahead */
	slli.d	a2, a2, 5		/* offset = n * 32 */
	add.d	t0, t0, a2
	jr	t0

	.align	5
0:	jr	ra			/* n == 0: nothing to do */

	.align	5
1:	ld.b	t0, a1, 0		/* n == 1: single byte */
	st.b	t0, a0, 0
	jr	ra

	.align	5
2:	ld.h	t0, a1, 0		/* n == 2: one halfword */
	st.h	t0, a0, 0
	jr	ra

	.align	5
3:	ld.h	t0, a1, 0		/* n == 3: halfword + byte */
	ld.b	t1, a1, 2
	st.h	t0, a0, 0
	st.b	t1, a0, 2
	jr	ra

	.align	5
4:	ld.w	t0, a1, 0		/* n == 4: one word */
	st.w	t0, a0, 0
	jr	ra

	.align	5
5:	ld.w	t0, a1, 0		/* n == 5: word + byte */
	ld.b	t1, a1, 4
	st.w	t0, a0, 0
	st.b	t1, a0, 4
	jr	ra

	.align	5
6:	ld.w	t0, a1, 0		/* n == 6: word + halfword */
	ld.h	t1, a1, 4
	st.w	t0, a0, 0
	st.h	t1, a0, 4
	jr	ra

	.align	5
7:	ld.w	t0, a1, 0		/* n == 7: two overlapping words */
	ld.w	t1, a1, 3		/* bytes 3..6 overlap bytes 0..3 */
	st.w	t0, a0, 0
	st.w	t1, a0, 3
	jr	ra

	.align	5
8:	ld.d	t0, a1, 0		/* n == 8: one doubleword */
	st.d	t0, a0, 0
	jr	ra
SYM_FUNC_END(__memcpy_small)
_ASM_NOKPROBE(__memcpy_small)
106
/*
 * void *__memcpy_fast(void *dst, const void *src, size_t n)
 *
 * a0: dst
 * a1: src
 * a2: n
 *
 * Requires hardware unaligned access (CPU_FEATURE_UAL).  n <= 8 is
 * delegated to __memcpy_small.  Otherwise the first and last 8 bytes
 * are loaded up front into a6/a7 and stored last (at .Llt8); this lets
 * the bulk loop start from an aligned-up destination and lets the
 * descending 64/32/16/8 tail blocks over-read without special-casing
 * the unaligned head and the final partial doubleword.
 */
SYM_FUNC_START(__memcpy_fast)
	sltui	t0, a2, 9
	bnez	t0, __memcpy_small	/* n < 9: use the jump table */

	add.d	a3, a1, a2		/* a3 = src + n (source end) */
	add.d	a2, a0, a2		/* a2 = dst + n (dest end) */
	ld.d	a6, a1, 0		/* first 8 bytes, stored at .Llt8 */
	ld.d	a7, a3, -8		/* last 8 bytes, stored at .Llt8 */

	/* align up destination address */
	andi	t1, a0, 7
	sub.d	t0, zero, t1
	addi.d	t0, t0, 8		/* t0 = 8 - (dst & 7), in 1..8 */
	add.d	a1, a1, t0		/* skip head bytes (covered by a6) */
	add.d	a5, a0, t0		/* a5 = 8-byte-aligned dst cursor */

	addi.d	a4, a3, -64
	bgeu	a1, a4, .Llt64		/* fewer than 64 bytes left */

	/* copy 64 bytes at a time */
.Lloop64:
	ld.d	t0, a1, 0
	ld.d	t1, a1, 8
	ld.d	t2, a1, 16
	ld.d	t3, a1, 24
	ld.d	t4, a1, 32
	ld.d	t5, a1, 40
	ld.d	t6, a1, 48
	ld.d	t7, a1, 56
	addi.d	a1, a1, 64
	st.d	t0, a5, 0
	st.d	t1, a5, 8
	st.d	t2, a5, 16
	st.d	t3, a5, 24
	st.d	t4, a5, 32
	st.d	t5, a5, 40
	st.d	t6, a5, 48
	st.d	t7, a5, 56
	addi.d	a5, a5, 64
	bltu	a1, a4, .Lloop64

	/* copy the remaining bytes */
.Llt64:
	addi.d	a4, a3, -32
	bgeu	a1, a4, .Llt32		/* < 32 bytes remain */
	ld.d	t0, a1, 0
	ld.d	t1, a1, 8
	ld.d	t2, a1, 16
	ld.d	t3, a1, 24
	addi.d	a1, a1, 32
	st.d	t0, a5, 0
	st.d	t1, a5, 8
	st.d	t2, a5, 16
	st.d	t3, a5, 24
	addi.d	a5, a5, 32

.Llt32:
	addi.d	a4, a3, -16
	bgeu	a1, a4, .Llt16		/* < 16 bytes remain */
	ld.d	t0, a1, 0
	ld.d	t1, a1, 8
	addi.d	a1, a1, 16
	st.d	t0, a5, 0
	st.d	t1, a5, 8
	addi.d	a5, a5, 16

.Llt16:
	addi.d	a4, a3, -8
	bgeu	a1, a4, .Llt8		/* < 8 bytes remain */
	ld.d	t0, a1, 0
	st.d	t0, a5, 0

.Llt8:
	st.d	a6, a0, 0		/* head: may overlap bulk copy */
	st.d	a7, a2, -8		/* tail: covers last partial dword */

	/* return */
	jr	ra			/* a0 still holds the original dst */
SYM_FUNC_END(__memcpy_fast)
_ASM_NOKPROBE(__memcpy_fast)
194