/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


/*
 * Optimized memcpy() for ARM.
 *
 * note that memcpy() always returns the destination pointer,
 * so we have to preserve R0.
 */

/*
 * This file has been modified from the original for use in musl libc.
 * The main changes are: addition of .type memcpy,%function to make the
 * code safely callable from thumb mode; adjustment of the return
 * instructions to be compatible with pre-thumb ARM cpus; removal of
 * prefetch code that is not compatible with older cpus; and support
 * for building as thumb 2 and big-endian.
 */
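/* Per the AAPCS, on entry r0 = dest, r1 = src, r2 = n. memcpy() must
 * return dest, so r0 is saved on the stack below and reloaded right
 * before returning.
 */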

.syntax unified

.global memcpy
.type memcpy,%function
memcpy:
	/* The stack must always be 64-bit aligned to be compliant with the
	 * ARM ABI. Since we have to save R0, we might as well save R4,
	 * which we can use for better pipelining of the reads below.
	 */
	.fnstart
	.save       {r0, r4, lr}
	stmfd       sp!, {r0, r4, lr}
	/* Making room for r5-r11 which will be spilled later */
	.pad        #28
	sub         sp, sp, #28

	/* it simplifies things to take care of len<4 early */
	cmp     r2, #4
	blo     copy_last_3_and_return

	/* compute the offset to align the source
	 * offset = (4-(src&3))&3 = -src & 3
	 */
	rsb     r3, r1, #0
	ands    r3, r3, #3
	beq     src_aligned

	/* align source to 32 bits. We need two instructions between an
	 * ldr[b|h] and the corresponding str[b|h] because byte and
	 * half-word loads stall for 2 cycles.
	 */
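	/* r3 is 1, 2 or 3 here. The "lsl #31" below moves bit 0 of r3 into
	 * the N flag and bit 1 into the C flag, so the "mi" instructions
	 * copy one byte and the "cs" instructions copy two more: together
	 * they advance src and dst by exactly r3 bytes.
	 */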
	movs    r12, r3, lsl #31
	sub     r2, r2, r3              /* we know that r3 <= r2 because r2 >= 4 */
	ldrbmi r3, [r1], #1
	ldrbcs r4, [r1], #1
	ldrbcs r12,[r1], #1
	strbmi r3, [r0], #1
	strbcs r4, [r0], #1
	strbcs r12,[r0], #1

src_aligned:

	/* see if src and dst are aligned together (congruent) */
	eor     r12, r0, r1
	tst     r12, #3
	bne     non_congruent

	/* Use post-increment mode for stm to spill r5-r11 to the reserved
	 * stack frame. Don't update sp.
	 */
	stmea   sp, {r5-r11}

	/* align the destination to a cache-line */
	rsb     r3, r0, #0
	ands    r3, r3, #0x1C
	beq     congruent_aligned32
	cmp     r3, r2
	andhi   r3, r2, #0x1C

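	/* r3 now holds the number of bytes (a multiple of 4, at most 0x1C)
	 * needed to reach the next 32-byte boundary, clamped to the number
	 * of whole words actually remaining. The "lsl #28" below moves bit 4
	 * of r3 into C and bit 3 into N, so the ldm/stm pairs copy 16 and/or
	 * 8 bytes, and the tst handles a final word.
	 */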
	/* conditionally copies 0 to 7 words (length in r3) */
	movs    r12, r3, lsl #28
	ldmcs   r1!, {r4, r5, r6, r7}           /* 16 bytes */
	ldmmi   r1!, {r8, r9}                   /*  8 bytes */
	stmcs   r0!, {r4, r5, r6, r7}
	stmmi   r0!, {r8, r9}
	tst     r3, #0x4
	ldrne   r10,[r1], #4                    /*  4 bytes */
	strne   r10,[r0], #4
	sub     r2, r2, r3

congruent_aligned32:
	/*
	 * here the destination is aligned to a cache line (32 bytes)
	 * whenever at least that much data remains; src and dst are both
	 * word-aligned.
	 */

cached_aligned32:
	subs    r2, r2, #32
	blo     less_than_32_left
	/*
	 * We preload a cache-line up to 64 bytes ahead. On the 926, this will
	 * stall only until the requested word is fetched, but the linefill
	 * continues in the background.
	 * While the linefill is going, we write our previous cache-line
	 * into the write-buffer (which should have some free space).
	 * When the linefill is done, the write-buffer will
	 * start draining its content into memory.
	 *
	 * While all this is going on, we load a full cache line into
	 * 8 registers; this cache line should be in the cache by now
	 * (or at least partly in the cache).
	 *
	 * This code should work well regardless of the source/dest alignment.
	 *
	 */

	/* Align the preload register to a cache-line because the cpu does
	 * "critical word first" (the first word requested is loaded first).
	 */
	@ bic           r12, r1, #0x1F
	@ add           r12, r12, #64

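	/* Main copy loop: each iteration moves 32 bytes through r4-r11.
	 * r2 is kept pre-decremented by 32, so "bhs" keeps looping while at
	 * least one more full 32-byte block remains; the over-subtraction is
	 * undone after the loop.
	 */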
1:      ldmia   r1!, { r4-r11 }
	subs    r2, r2, #32

	/*
	 * NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
	 * for ARM9 preload will not be safely guarded by the preceding subs.
	 * When it is safely guarded, the only way to get a SIGSEGV here is
	 * if the caller overstates the length.
	 */
	@ ldrhi         r3, [r12], #32      /* cheap ARM9 preload */
	stmia   r0!, { r4-r11 }
	bhs     1b

	add     r2, r2, #32

less_than_32_left:
	/*
	 * less than 32 bytes left at this point (length in r2)
	 */

	/* skip all this if there is nothing to do, which should
	 * be a common case (when it does run, the code below takes
	 * about 16 cycles)
	 */
	tst     r2, #0x1F
	beq     1f

	/* conditionally copies 0 to 31 bytes */
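	/* Same flag trick as above: "lsl #28" puts bit 4 of r2 into C and
	 * bit 3 into N (16- and 8-byte chunks), "lsl #30" puts bit 2 into C
	 * and bit 1 into N (4- and 2-byte chunks), and the final tst/strbne
	 * handles an odd last byte.
	 */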
	movs    r12, r2, lsl #28
	ldmcs   r1!, {r4, r5, r6, r7}           /* 16 bytes */
	ldmmi   r1!, {r8, r9}                   /*  8 bytes */
	stmcs   r0!, {r4, r5, r6, r7}
	stmmi   r0!, {r8, r9}
	movs    r12, r2, lsl #30
	ldrcs   r3, [r1], #4                    /*  4 bytes */
	ldrhmi r4, [r1], #2                     /*  2 bytes */
	strcs   r3, [r0], #4
	strhmi r4, [r0], #2
	tst     r2, #0x1
	ldrbne r3, [r1]                         /*  last byte  */
	strbne r3, [r0]

	/* we're done! restore everything and return */
1:      ldmfd   sp!, {r5-r11}
	ldmfd   sp!, {r0, r4, lr}
	bx      lr

	/********************************************************************/

non_congruent:
	/*
	 * here source is aligned to 4 bytes
	 * but destination is not.
	 *
	 * in the code below r2 is the number of bytes read
	 * (the number of bytes written is always smaller, because we have
	 * partial words in the shift queue)
	 */
	cmp     r2, #4
	blo     copy_last_3_and_return

	/* Use post-increment mode for stm to spill r5-r11 to the reserved
	 * stack frame. Don't update sp.
	 */
	stmea   sp, {r5-r11}

	/* compute shifts needed to align src to dest */
	rsb     r5, r0, #0
	and     r5, r5, #3                      /* r5 = # bytes in partial words */
	mov     r12, r5, lsl #3         /* r12 = right shift amount, in bits */
	rsb     lr, r12, #32            /* lr = left shift amount, in bits */

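	/* The loops below keep a "shift queue" in r3: the bytes of the last
	 * source word that have been read but not yet written. Roughly, for
	 * each destination word (little-endian case; big-endian swaps the
	 * shift directions):
	 *
	 *     next   = *src++;
	 *     *dst++ = r3 | (next << lr);    // lr = 32 - r12
	 *     r3     = next >> r12;
	 *
	 * This is an illustration of the idea, not a line-by-line equivalent
	 * of the assembly below.
	 */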
	/* read the first word */
	ldr     r3, [r1], #4
	sub     r2, r2, #4

	/* write a partial word (0 to 3 bytes), such that destination
	 * becomes aligned to 32 bits (r5 = number of bytes to copy for
	 * alignment)
	 */
	movs    r5, r5, lsl #31
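	/* Big-endian: each "ror #24" rotates the most significant byte (the
	 * one at the lowest address) down into bits 7:0 before the strb.
	 * Little-endian: store the low byte, then shift the next one down.
	 */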

#if __ARMEB__
	movmi   r3, r3, ror #24
	strbmi  r3, [r0], #1
	movcs   r3, r3, ror #24
	strbcs  r3, [r0], #1
	movcs   r3, r3, ror #24
	strbcs  r3, [r0], #1
#else
	strbmi r3, [r0], #1
	movmi   r3, r3, lsr #8
	strbcs r3, [r0], #1
	movcs   r3, r3, lsr #8
	strbcs r3, [r0], #1
	movcs   r3, r3, lsr #8
#endif

	cmp     r2, #4
	blo     partial_word_tail

#if __ARMEB__
	mov     r3, r3, lsr r12
	mov     r3, r3, lsl r12
#endif

	/* Align destination to 32 bytes (cache line boundary) */
1:      tst     r0, #0x1c
	beq     2f
	ldr     r5, [r1], #4
	sub     r2, r2, #4
#if __ARMEB__
	mov     r4, r5, lsr lr
	orr     r4, r4, r3
	mov     r3, r5, lsl r12
#else
	mov     r4, r5, lsl lr
	orr     r4, r4, r3
	mov     r3, r5, lsr r12
#endif
	str     r4, [r0], #4
	cmp     r2, #4
	bhs     1b
	blo     partial_word_tail

	/* copy 32 bytes at a time */
2:      subs    r2, r2, #32
	blo     less_than_thirtytwo

	/* Use immediate mode for the shifts, because there is an extra cycle
	 * for register shifts, which could account for up to a 50%
	 * performance hit.
	 */

	cmp     r12, #24
	beq     loop24
	cmp     r12, #8
	beq     loop8
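	/* Fall through for r12 == 16. loop8 and loop24 below are the same
	 * loop specialized for r12 == 8 and r12 == 24, with the shift
	 * amounts hard-coded as immediates.
	 */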

loop16:
	ldr     r12, [r1], #4
1:      mov     r4, r12
	ldmia   r1!, {   r5,r6,r7,  r8,r9,r10,r11}
	subs    r2, r2, #32
	ldrhs   r12, [r1], #4
#if __ARMEB__
	orr     r3, r3, r4, lsr #16
	mov     r4, r4, lsl #16
	orr     r4, r4, r5, lsr #16
	mov     r5, r5, lsl #16
	orr     r5, r5, r6, lsr #16
	mov     r6, r6, lsl #16
	orr     r6, r6, r7, lsr #16
	mov     r7, r7, lsl #16
	orr     r7, r7, r8, lsr #16
	mov     r8, r8, lsl #16
	orr     r8, r8, r9, lsr #16
	mov     r9, r9, lsl #16
	orr     r9, r9, r10, lsr #16
	mov     r10, r10, lsl #16
	orr     r10, r10, r11, lsr #16
	stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
	mov     r3, r11, lsl #16
#else
	orr     r3, r3, r4, lsl #16
	mov     r4, r4, lsr #16
	orr     r4, r4, r5, lsl #16
	mov     r5, r5, lsr #16
	orr     r5, r5, r6, lsl #16
	mov     r6, r6, lsr #16
	orr     r6, r6, r7, lsl #16
	mov     r7, r7, lsr #16
	orr     r7, r7, r8, lsl #16
	mov     r8, r8, lsr #16
	orr     r8, r8, r9, lsl #16
	mov     r9, r9, lsr #16
	orr     r9, r9, r10, lsl #16
	mov     r10, r10, lsr #16
	orr     r10, r10, r11, lsl #16
	stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
	mov     r3, r11, lsr #16
#endif
	bhs     1b
	b       less_than_thirtytwo

loop8:
	ldr     r12, [r1], #4
1:      mov     r4, r12
	ldmia   r1!, {   r5,r6,r7,  r8,r9,r10,r11}
	subs    r2, r2, #32
	ldrhs   r12, [r1], #4
#if __ARMEB__
	orr     r3, r3, r4, lsr #24
	mov     r4, r4, lsl #8
	orr     r4, r4, r5, lsr #24
	mov     r5, r5, lsl #8
	orr     r5, r5, r6, lsr #24
	mov     r6, r6, lsl #8
	orr     r6, r6, r7, lsr #24
	mov     r7, r7, lsl #8
	orr     r7, r7, r8, lsr #24
	mov     r8, r8, lsl #8
	orr     r8, r8, r9, lsr #24
	mov     r9, r9, lsl #8
	orr     r9, r9, r10, lsr #24
	mov     r10, r10, lsl #8
	orr     r10, r10, r11, lsr #24
	stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
	mov     r3, r11, lsl #8
#else
	orr     r3, r3, r4, lsl #24
	mov     r4, r4, lsr #8
	orr     r4, r4, r5, lsl #24
	mov     r5, r5, lsr #8
	orr     r5, r5, r6, lsl #24
	mov     r6, r6, lsr #8
	orr     r6, r6, r7, lsl #24
	mov     r7, r7, lsr #8
	orr     r7, r7, r8, lsl #24
	mov     r8, r8, lsr #8
	orr     r8, r8, r9, lsl #24
	mov     r9, r9, lsr #8
	orr     r9, r9, r10, lsl #24
	mov     r10, r10, lsr #8
	orr     r10, r10, r11, lsl #24
	stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
	mov     r3, r11, lsr #8
#endif
	bhs     1b
	b       less_than_thirtytwo

loop24:
	ldr     r12, [r1], #4
1:      mov     r4, r12
	ldmia   r1!, {   r5,r6,r7,  r8,r9,r10,r11}
	subs    r2, r2, #32
	ldrhs   r12, [r1], #4
#if __ARMEB__
	orr     r3, r3, r4, lsr #8
	mov     r4, r4, lsl #24
	orr     r4, r4, r5, lsr #8
	mov     r5, r5, lsl #24
	orr     r5, r5, r6, lsr #8
	mov     r6, r6, lsl #24
	orr     r6, r6, r7, lsr #8
	mov     r7, r7, lsl #24
	orr     r7, r7, r8, lsr #8
	mov     r8, r8, lsl #24
	orr     r8, r8, r9, lsr #8
	mov     r9, r9, lsl #24
	orr     r9, r9, r10, lsr #8
	mov     r10, r10, lsl #24
	orr     r10, r10, r11, lsr #8
	stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
	mov     r3, r11, lsl #24
#else
	orr     r3, r3, r4, lsl #8
	mov     r4, r4, lsr #24
	orr     r4, r4, r5, lsl #8
	mov     r5, r5, lsr #24
	orr     r5, r5, r6, lsl #8
	mov     r6, r6, lsr #24
	orr     r6, r6, r7, lsl #8
	mov     r7, r7, lsr #24
	orr     r7, r7, r8, lsl #8
	mov     r8, r8, lsr #24
	orr     r8, r8, r9, lsl #8
	mov     r9, r9, lsr #24
	orr     r9, r9, r10, lsl #8
	mov     r10, r10, lsr #24
	orr     r10, r10, r11, lsl #8
	stmia   r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
	mov     r3, r11, lsr #24
#endif
	bhs     1b

less_than_thirtytwo:
	/* copy the last 0 to 31 bytes of the source */
	rsb     r12, lr, #32            /* we corrupted r12, recompute it  */
	add     r2, r2, #32
	cmp     r2, #4
	blo     partial_word_tail

1:      ldr     r5, [r1], #4
	sub     r2, r2, #4
#if __ARMEB__
	mov     r4, r5, lsr lr
	orr     r4, r4, r3
	mov     r3, r5, lsl r12
#else
	mov     r4, r5, lsl lr
	orr     r4, r4, r3
	mov     r3, r5, lsr r12
#endif
	str     r4, [r0], #4
	cmp     r2, #4
	bhs     1b

partial_word_tail:
	/* we still have a partial word of source data queued in r3 */
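	/* r3 holds lr/8 bytes that have been read but not yet written.
	 * "lsl #(31-3)" moves bit 3 of lr into N and bit 4 into C, so the
	 * conditional byte stores below flush exactly those lr/8 bytes.
	 */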
	movs    r5, lr, lsl #(31-3)
#if __ARMEB__
	movmi   r3, r3, ror #24
	strbmi r3, [r0], #1
	movcs   r3, r3, ror #24
	strbcs r3, [r0], #1
	movcs   r3, r3, ror #24
	strbcs r3, [r0], #1
#else
	strbmi r3, [r0], #1
	movmi   r3, r3, lsr #8
	strbcs r3, [r0], #1
	movcs   r3, r3, lsr #8
	strbcs r3, [r0], #1
#endif

	/* Refill spilled registers from the stack. Don't update sp. */
	ldmfd   sp, {r5-r11}

copy_last_3_and_return:
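	/* Same "lsl #31" dispatch as in the alignment code near the top:
	 * "mi" copies one byte, "cs" copies two.
	 */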
	movs    r2, r2, lsl #31 /* copy remaining 0, 1, 2 or 3 bytes */
	ldrbmi r2, [r1], #1
	ldrbcs r3, [r1], #1
	ldrbcs r12,[r1]
	strbmi r2, [r0], #1
	strbcs r3, [r0], #1
	strbcs r12,[r0]

	/* we're done! restore sp and the saved registers, and return */
	add     sp,  sp, #28
	ldmfd   sp!, {r0, r4, lr}
	bx      lr