1/*	$NetBSD: memcpy.S,v 1.2 2013/03/17 02:13:10 christos Exp $	*/
2
3/*
4 * Copyright (c) 1996-2002 Eduardo Horvath
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR  ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR  BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 *
25 */
26#include "strmacros.h"
27#if defined(LIBC_SCCS) && !defined(lint)
28RCSID("$NetBSD: memcpy.S,v 1.2 2013/03/17 02:13:10 christos Exp $")
29#endif  /* LIBC_SCCS and not lint */
30
31/*
32 * memcpy
33 * Assumes the regions do not overlap.
34 *
35 * Must not use %g7 (see copyin/copyout).
36 */
37ENTRY(memcpy) /* dest, src, size */
38	/*
39	 * Swap args for bcopy.  Gcc generates calls to memcpy for
40	 * structure assignments.
41	 */
42	mov	%o0, %o3
43	mov	%o1, %o0
44	mov	%o3, %o1
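	! After the swap: %o0 = src, %o1 = dst, %o2 = len (bcopy argument order).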
45#if !defined(_KERNEL) || defined(_RUMPKERNEL)
46ENTRY(bcopy) /* src, dest, size */
47#endif
48#ifdef DEBUG
49#if defined(_KERNEL) && !defined(_RUMPKERNEL)
50	set	pmapdebug, %o4
51	ld	[%o4], %o4
52	btst	0x80, %o4	! PDB_COPY
53	bz,pt	%icc, 3f
54	 nop
55#endif
56	save	%sp, -CC64FSZ, %sp
57	mov	%i0, %o1
58	set	2f, %o0
59	mov	%i1, %o2
60	call	printf
61	 mov	%i2, %o3
62!	ta	1; nop
63	restore
64	.data
652:	.asciz	"memcpy(%p<-%p,%x)\n"
66	_ALIGN
67	.text
683:
69#endif
70
71	cmp	%o2, BCOPY_SMALL
72
73Lmemcpy_start:
74	bge,pt	CCCR, 2f	! if >= this many, go be fancy.
75	 cmp	%o2, 256
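	! NB: the cmp with 256 runs in the delay slot above; its condition
	! codes are consumed by the "bge Lmemcpy_block" test at 2: below
	! when USE_BLOCK_STORE_LOAD is defined.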
76
77	mov	%o1, %o5	! Save memcpy return value
78	/*
79	 * Not much to copy, just do it a byte at a time.
80	 */
81	deccc	%o2		! while (--len >= 0)
82	bl	1f
83	 .empty
840:
85	inc	%o0
86	ldsb	[%o0 - 1], %o4	!	*dst++ = *src++;
87	stb	%o4, [%o1]
88	deccc	%o2
89	bge	0b
90	 inc	%o1
911:
92	retl
93	 mov	%o5, %o0
94	NOTREACHED
95
96	/*
97	 * Plenty of data to copy, so try to do it optimally.
98	 */
992:
100#ifdef USE_BLOCK_STORE_LOAD
101	! If it is big enough, use VIS instructions
102	bge	Lmemcpy_block
103	 nop
104#endif /* USE_BLOCK_STORE_LOAD */
105Lmemcpy_fancy:
106
107	!!
108	!! First align the output to an 8-byte entity
109	!!
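	!!
	!! Copy 1, 2 and then 4 bytes as needed so that the destination
	!! (%l1 after the save) becomes 8-byte aligned; any remaining
	!! source misalignment is handled by the shifting code at
	!! Lmemcpy_common.
	!!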
110
111	save	%sp, -CC64FSZ, %sp
112
113	mov	%i0, %l0
114	mov	%i1, %l1
115
116	mov	%i2, %l2
117	btst	1, %l1
118
119	bz,pt	%icc, 4f
120	 btst	2, %l1
121	ldub	[%l0], %l4				! Load 1st byte
122
123	deccc	1, %l2
124	ble,pn	CCCR, Lmemcpy_finish			! XXXX
125	 inc	1, %l0
126
127	stb	%l4, [%l1]				! Store 1st byte
128	inc	1, %l1					! Update address
129	btst	2, %l1
1304:
131	bz,pt	%icc, 4f
132
133	 btst	1, %l0
134	bz,a	1f
135	 lduh	[%l0], %l4				! Load short
136
137	ldub	[%l0], %l4				! Load bytes
138
139	ldub	[%l0+1], %l3
140	sllx	%l4, 8, %l4
141	or	%l3, %l4, %l4
142
1431:
144	deccc	2, %l2
145	ble,pn	CCCR, Lmemcpy_finish			! XXXX
146	 inc	2, %l0
147	sth	%l4, [%l1]				! Store 1st short
148
149	inc	2, %l1
1504:
151	btst	4, %l1
152	bz,pt	CCCR, 4f
153
154	 btst	3, %l0
155	bz,a,pt	CCCR, 1f
156	 lduw	[%l0], %l4				! Load word -1
157
158	btst	1, %l0
159	bz,a,pt	%icc, 2f
160	 lduh	[%l0], %l4
161
162	ldub	[%l0], %l4
163
164	lduh	[%l0+1], %l3
165	sllx	%l4, 16, %l4
166	or	%l4, %l3, %l4
167
168	ldub	[%l0+3], %l3
169	sllx	%l4, 8, %l4
170	ba,pt	%icc, 1f
171	 or	%l4, %l3, %l4
172
1732:
174	lduh	[%l0+2], %l3
175	sllx	%l4, 16, %l4
176	or	%l4, %l3, %l4
177
1781:
179	deccc	4, %l2
180	ble,pn	CCCR, Lmemcpy_finish		! XXXX
181	 inc	4, %l0
182
183	st	%l4, [%l1]				! Store word
184	inc	4, %l1
1854:
186	!!
187	!! We are now 64-bit aligned in the dest.
188	!!
189Lmemcpy_common:
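	!!
	!! Split the source into an 8-byte-aligned base (%l0) and a byte
	!! offset (%l4).  A zero offset takes the straight doubleword copy
	!! at Lmemcpy_noshift8; otherwise every output doubleword is
	!! assembled from two adjacent source doublewords, as sketched
	!! below.
	!!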
190
191	and	%l0, 7, %l4				! Shift amount
192	andn	%l0, 7, %l0				! Source addr
193
194	brz,pt	%l4, Lmemcpy_noshift8			! No shift version...
195
196	 sllx	%l4, 3, %l4				! In bits
197	mov	8<<3, %l3
198
199	ldx	[%l0], %o0				! Load word -1
200	sub	%l3, %l4, %l3				! Reverse shift
201	deccc	12*8, %l2				! Have enough room?
202
203	sllx	%o0, %l4, %o0
204	bl,pn	CCCR, 2f
205	 and	%l3, 0x38, %l3
206Lmemcpy_unrolled8:
207
208	/*
209	 * This is about as close to optimal as you can get, since
210	 * the shifts require EU0 and cannot be paired, and you have
211	 * 3 dependent operations on the data.
212	 */
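	/*
	 * A rough C sketch of the shifting loop (illustrative only, not
	 * the actual register allocation); "shl" and "shr" stand for the
	 * bit counts kept in %l4 and %l3, with shl + shr == 64:
	 *
	 *	cur = src[0] << shl;
	 *	while (more) {
	 *		next = *++src;
	 *		*dst++ = cur | (next >> shr);
	 *		cur = next << shl;
	 *	}
	 */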
213
214!	ldx	[%l0+0*8], %o0				! Already done
215!	sllx	%o0, %l4, %o0				! Already done
216	ldx	[%l0+1*8], %o1
217	ldx	[%l0+2*8], %o2
218	ldx	[%l0+3*8], %o3
219	ldx	[%l0+4*8], %o4
220	ba,pt	%icc, 1f
221	 ldx	[%l0+5*8], %o5
222	.align	8
2231:
224	srlx	%o1, %l3, %g1
225	inc	6*8, %l0
226
227	sllx	%o1, %l4, %o1
228	or	%g1, %o0, %g6
229	ldx	[%l0+0*8], %o0
230
231	stx	%g6, [%l1+0*8]
232	srlx	%o2, %l3, %g1
233
234	sllx	%o2, %l4, %o2
235	or	%g1, %o1, %g6
236	ldx	[%l0+1*8], %o1
237
238	stx	%g6, [%l1+1*8]
239	srlx	%o3, %l3, %g1
240
241	sllx	%o3, %l4, %o3
242	or	%g1, %o2, %g6
243	ldx	[%l0+2*8], %o2
244
245	stx	%g6, [%l1+2*8]
246	srlx	%o4, %l3, %g1
247
248	sllx	%o4, %l4, %o4
249	or	%g1, %o3, %g6
250	ldx	[%l0+3*8], %o3
251
252	stx	%g6, [%l1+3*8]
253	srlx	%o5, %l3, %g1
254
255	sllx	%o5, %l4, %o5
256	or	%g1, %o4, %g6
257	ldx	[%l0+4*8], %o4
258
259	stx	%g6, [%l1+4*8]
260	srlx	%o0, %l3, %g1
261	deccc	6*8, %l2				! Have enough room?
262
263	sllx	%o0, %l4, %o0				! Next loop
264	or	%g1, %o5, %g6
265	ldx	[%l0+5*8], %o5
266
267	stx	%g6, [%l1+5*8]
268	bge,pt	CCCR, 1b
269	 inc	6*8, %l1
270
271Lmemcpy_unrolled8_cleanup:
272	!!
273	!! Finished 8 byte block, unload the regs.
274	!!
275	srlx	%o1, %l3, %g1
276	inc	5*8, %l0
277
278	sllx	%o1, %l4, %o1
279	or	%g1, %o0, %g6
280
281	stx	%g6, [%l1+0*8]
282	srlx	%o2, %l3, %g1
283
284	sllx	%o2, %l4, %o2
285	or	%g1, %o1, %g6
286
287	stx	%g6, [%l1+1*8]
288	srlx	%o3, %l3, %g1
289
290	sllx	%o3, %l4, %o3
291	or	%g1, %o2, %g6
292
293	stx	%g6, [%l1+2*8]
294	srlx	%o4, %l3, %g1
295
296	sllx	%o4, %l4, %o4
297	or	%g1, %o3, %g6
298
299	stx	%g6, [%l1+3*8]
300	srlx	%o5, %l3, %g1
301
302	sllx	%o5, %l4, %o5
303	or	%g1, %o4, %g6
304
305	stx	%g6, [%l1+4*8]
306	inc	5*8, %l1
307
308	mov	%o5, %o0				! Save our unused data
309	dec	5*8, %l2
3102:
311	inccc	12*8, %l2
312	bz,pn	%icc, Lmemcpy_complete
313
314	!! Unrolled 8 times
315Lmemcpy_aligned8:
316!	ldx	[%l0], %o0				! Already done
317!	sllx	%o0, %l4, %o0				! Shift high word
318
319	 deccc	8, %l2					! Pre-decrement
320	bl,pn	CCCR, Lmemcpy_finish
3211:
322	ldx	[%l0+8], %o1				! Load word 0
323	inc	8, %l0
324
325	srlx	%o1, %l3, %g6
326	or	%g6, %o0, %g6				! Combine
327
328	stx	%g6, [%l1]				! Store result
329	 inc	8, %l1
330
331	deccc	8, %l2
332	bge,pn	CCCR, 1b
333	 sllx	%o1, %l4, %o0
334
335	btst	7, %l2					! Done?
336	bz,pt	CCCR, Lmemcpy_complete
337
338	!!
339	!! Loadup the last dregs into %o0 and shift it into place
340	!!
341	 srlx	%l3, 3, %g6				! # bytes in %o0
342	dec	8, %g6					!  - 8
343	!! n-8 - (by - 8) -> n - by
344	subcc	%l2, %g6, %g0				! # bytes we need
345	ble,pt	%icc, Lmemcpy_finish
346	 nop
347	ldx	[%l0+8], %o1				! Need another word
348	srlx	%o1, %l3, %o1
349	ba,pt	%icc, Lmemcpy_finish
350	 or	%o0, %o1, %o0				! All loaded up.
351
352Lmemcpy_noshift8:
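	!!
	!! Source and destination are both 8-byte aligned: copy 48 bytes
	!! per iteration, then 8 bytes at a time, and fall into
	!! Lmemcpy_finish for any sub-word tail.
	!!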
353	deccc	6*8, %l2				! Have enough room?
354	bl,pn	CCCR, 2f
355	 nop
356	ba,pt	%icc, 1f
357	 nop
358	.align	32
3591:
360	ldx	[%l0+0*8], %o0
361	ldx	[%l0+1*8], %o1
362	ldx	[%l0+2*8], %o2
363	stx	%o0, [%l1+0*8]
364	stx	%o1, [%l1+1*8]
365	stx	%o2, [%l1+2*8]
366
367
368	ldx	[%l0+3*8], %o3
369	ldx	[%l0+4*8], %o4
370	ldx	[%l0+5*8], %o5
371	inc	6*8, %l0
372	stx	%o3, [%l1+3*8]
373	deccc	6*8, %l2
374	stx	%o4, [%l1+4*8]
375	stx	%o5, [%l1+5*8]
376	bge,pt	CCCR, 1b
377	 inc	6*8, %l1
3782:
379	inc	6*8, %l2
3801:
381	deccc	8, %l2
382	bl,pn	%icc, 1f				! < 0 --> sub word
383	 nop
384	ldx	[%l0], %g6
385	inc	8, %l0
386	stx	%g6, [%l1]
387	bg,pt	%icc, 1b				! Exactly 0 --> done
388	 inc	8, %l1
3891:
390	btst	7, %l2					! Done?
391	bz,pt	CCCR, Lmemcpy_complete
392	 clr	%l4
393	ldx	[%l0], %o0
394Lmemcpy_finish:
395
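	!!
	!! Store the final tail (at most 8 bytes).  %o0 holds the
	!! remaining bytes at its most significant end; a doubleword,
	!! word, halfword and byte are stored as %l2's low bits dictate.
	!!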
396	brz,pn	%l2, 2f					! 100% complete?
397	 cmp	%l2, 8					! Exactly 8 bytes?
398	bz,a,pn	CCCR, 2f
399	 stx	%o0, [%l1]
400
401	btst	4, %l2					! Word store?
402	bz	CCCR, 1f
403	 srlx	%o0, 32, %g6				! Shift high word down
404	stw	%g6, [%l1]
405	inc	4, %l1
406	mov	%o0, %g6				! Operate on the low bits
4071:
408	btst	2, %l2
409	mov	%g6, %o0
410	bz	1f
411	 srlx	%o0, 16, %g6
412
413	sth	%g6, [%l1]				! Store short
414	inc	2, %l1
415	mov	%o0, %g6				! Operate on low bytes
4161:
417	mov	%g6, %o0
418	btst	1, %l2					! Byte aligned?
419	bz	2f
420	 srlx	%o0, 8, %g6
421
422	stb	%g6, [%l1]				! Store last byte
423	inc	1, %l1					! Update address
4242:
425Lmemcpy_complete:
426#if 0
427	!!
428	!! verify copy success.
429	!!
430
431	mov	%i0, %o2
432	mov	%i1, %o4
433	mov	%i2, %l4
4340:
435	ldub	[%o2], %o1
436	inc	%o2
437	ldub	[%o4], %o3
438	inc	%o4
439	cmp	%o3, %o1
440	bnz	1f
441	 dec	%l4
442	brnz	%l4, 0b
443	 nop
444	ba	2f
445	 nop
446
4471:
448	set	0f, %o0
449	call	printf
450	 sub	%i2, %l4, %o5
451	set	1f, %o0
452	mov	%i0, %o2
453	mov	%i1, %o1
454	call	printf
455	 mov	%i2, %o3
456	ta	1
457	.data
4580:	.asciz	"memcpy failed: %x@%p != %x@%p byte %d\n"
4591:	.asciz	"memcpy(%p, %p, %lx)\n"
460	.align 8
461	.text
4622:
463#endif
464	ret
465	 restore %i1, %g0, %o0
466
467#ifdef USE_BLOCK_STORE_LOAD
468
469/*
470 * Block copy.  Useful for >256 byte copies.
471 *
472 * Benchmarking has shown this to be consistently slower than
473 * the integer version, so it is disabled.  Maybe someone will
474 * figure out why sometime.
475 */
476
477Lmemcpy_block:
478	sethi	%hi(block_disable), %o3
479	ldx	[ %o3 + %lo(block_disable) ], %o3
480	brnz,pn	%o3, Lmemcpy_fancy
481	!! Make sure our trap table is installed
482	set	_C_LABEL(trapbase), %o5
483	rdpr	%tba, %o3
484	sub	%o3, %o5, %o3
485	brnz,pn	%o3, Lmemcpy_fancy	! No, then don't use block load/store
486	 nop
487#if defined(_KERNEL) && !defined(_RUMPKERNEL)
488/*
489 * Kernel:
490 *
491 * Here we use VIS instructions to do a block clear of a page.
492 * But before we can do that we need to save and enable the FPU.
493 * The last owner of the FPU registers is fplwp, and
494 * fplwp->l_md.md_fpstate is the current fpstate.  If that's not
495 * null, call savefpstate() with it to store our current fp state.
496 *
497 * Next, allocate an aligned fpstate on the stack.  We will properly
498 * nest calls on a particular stack so this should not be a problem.
499 *
500 * Now we grab either curlwp (or if we're on the interrupt stack
501 * lwp0).  We stash its existing fpstate in a local register and
502 * put our new fpstate in curlwp->l_md.md_fpstate.  We point
503 * fplwp at curlwp (or lwp0) and enable the FPU.
504 *
505 * If we are ever preempted, our FPU state will be saved in our
506 * fpstate.  Then, when we're resumed and we take an FPDISABLED
507 * trap, the trap handler will be able to fish our FPU state out
508 * of curlwp (or lwp0).
509 *
510 * On exiting this routine we undo the damage: restore the original
511 * pointer to curlwp->l_md.md_fpstate, clear our fplwp, and disable
512 * the FPU.
513 *
514 *
515 * Register usage, Kernel only (after save):
516 *
517 * %i0		src
518 * %i1		dest
519 * %i2		size
520 *
521 * %l0		XXXX DEBUG old fpstate
522 * %l1		fplwp (hi bits only)
523 * %l2		orig fplwp
524 * %l3		orig fpstate
525 * %l5		curlwp
526 * %l6		old fpstate
527 *
528 * Register usage, Kernel and user:
529 *
530 * %g1		src (retval for memcpy)
531 *
532 * %o0		src
533 * %o1		dest
534 * %o2		end dest
535 * %o5		last safe fetchable address
536 */
537
538	ENABLE_FPU(0)
539
540	mov	%i0, %o0				! Src addr.
541	mov	%i1, %o1				! Store our dest ptr here.
542	mov	%i2, %o2				! Len counter
543#endif	/* _KERNEL && !_RUMPKERNEL */
544
545	!!
546	!! First align the output to a 64-bit entity
547	!!
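	!!
	!! The enclosing aligned source doubleword is kept in %f2 (the
	!! previous one in %f0); faligndata positions the leading bytes,
	!! which are then stored one byte, one halfword and one word at a
	!! time (ASI_FL8_P, ASI_FL16_P, st) until the destination is
	!! 8-byte aligned.
	!!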
548
549	mov	%o1, %g1				! memcpy retval
550	add	%o0, %o2, %o5				! End of source block
551
552	andn	%o0, 7, %o3				! Start of block
553	dec	%o5
554	fzero	%f0
555
556	andn	%o5, BLOCK_ALIGN, %o5			! Last safe addr.
557	ldd	[%o3], %f2				! Load 1st word
558
559	dec	8, %o3					! Move %o3 1 word back
560	btst	1, %o1
561	bz	4f
562
563	 mov	-7, %o4					! Lowest src addr possible
564	alignaddr %o0, %o4, %o4				! Base addr for load.
565
566	cmp	%o3, %o4
567	be,pt	CCCR, 1f				! Already loaded?
568	 mov	%o4, %o3
569	fmovd	%f2, %f0				! No. Shift
570	ldd	[%o3+8], %f2				! And load
5711:
572
573	faligndata	%f0, %f2, %f4			! Isolate 1st byte
574
575	stda	%f4, [%o1] ASI_FL8_P			! Store 1st byte
576	inc	1, %o1					! Update address
577	inc	1, %o0
578	dec	1, %o2
5794:
580	btst	2, %o1
581	bz	4f
582
583	 mov	-6, %o4					! Calculate src - 6
584	alignaddr %o0, %o4, %o4				! calculate shift mask and dest.
585
586	cmp	%o3, %o4				! Addresses same?
587	be,pt	CCCR, 1f
588	 mov	%o4, %o3
589	fmovd	%f2, %f0				! Shuffle data
590	ldd	[%o3+8], %f2				! Load word 0
5911:
592	faligndata %f0, %f2, %f4			! Move 1st short to low part of %f4
593
594	stda	%f4, [%o1] ASI_FL16_P			! Store 1st short
595	dec	2, %o2
596	inc	2, %o1
597	inc	2, %o0
5984:
599	brz,pn	%o2, Lmemcpy_blockfinish			! XXXX
600
601	 btst	4, %o1
602	bz	4f
603
604	mov	-4, %o4
605	alignaddr %o0, %o4, %o4				! calculate shift mask and dest.
606
607	cmp	%o3, %o4				! Addresses same?
608	beq,pt	CCCR, 1f
609	 mov	%o4, %o3
610	fmovd	%f2, %f0				! Shuffle data
611	ldd	[%o3+8], %f2				! Load word 0
6121:
613	faligndata %f0, %f2, %f4			! Move 1st word to low part of %f4
614
615	st	%f5, [%o1]				! Store word
616	dec	4, %o2
617	inc	4, %o1
618	inc	4, %o0
6194:
620	brz,pn	%o2, Lmemcpy_blockfinish			! XXXX
621	!!
622	!! We are now 64-bit aligned in the dest.
623	!!
624Lmemcpy_block_common:
625
626	 mov	-0, %o4
627	alignaddr %o0, %o4, %o4				! base - shift
628
629	cmp	%o3, %o4				! Addresses same?
630	beq,pt	CCCR, 1f
631	 mov	%o4, %o3
632	fmovd	%f2, %f0				! Shuffle data
633	ldd	[%o3+8], %f2				! Load word 0
6341:
635	add	%o3, 8, %o0				! now use %o0 for src
636
637	!!
638	!! Continue until our dest is block aligned
639	!!
640Lmemcpy_block_aligned8:
6411:
642	brz	%o2, Lmemcpy_blockfinish
643	 btst	BLOCK_ALIGN, %o1			! Block aligned?
644	bz	1f
645
646	 faligndata %f0, %f2, %f4			! Generate result
647	deccc	8, %o2
648	ble,pn	%icc, Lmemcpy_blockfinish		! Should never happen
649	 fmovd	%f4, %f48
650
651	std	%f4, [%o1]				! Store result
652	inc	8, %o1
653
654	fmovd	%f2, %f0
655	inc	8, %o0
656	ba,pt	%xcc, 1b				! Not yet.
657	 ldd	[%o0], %f2				! Load next part
658Lmemcpy_block_aligned64:
6591:
660
661/*
662 * 64-byte aligned -- ready for block operations.
663 *
664 * Here we have the destination block aligned, but the
665 * source pointer may not be.  Sub-word alignment will
666 * be handled by faligndata instructions.  But the source
667 * can still be aligned to any of 8 different words
668 * in our 64-byte block, so we have 8 different copy routines.
669 *
670 * Once we figure out our source alignment, we branch
671 * to the appropriate copy routine, which sets up the
672 * alignment for faligndata and loads (sets) the values
673 * into the source registers and does the copy loop.
674 *
675 * When we're down to less than one block to store, we
676 * exit the copy loop and execute cleanup code.
677 *
678 * Block loads and stores are not properly interlocked.
679 * Stores save one reg/cycle, so you can start overwriting
680 * registers the cycle after the store is issued.
681 *
682 * Block loads require a block load to a different register
683 * block or a membar #Sync before accessing the loaded
684 * data.
685 *
686 * Since the faligndata instructions may be offset as far
687 * as 7 registers into a block (if you are shifting source
688 * 7 -> dest 0), you need 3 source register blocks for full
689 * performance: one you are copying, one you are loading,
690 * and one for interlocking.  Otherwise, we would need to
691 * sprinkle the code with membar #Sync and lose the advantage
692 * of running faligndata in parallel with block stores.  This
693 * means we are fetching a full 128 bytes ahead of the stores.
694 * We need to make sure the prefetch does not inadvertently
695 * cross a page boundary and fault on data that we will never
696 * store.
697 *
698 */
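/*
 * A rough sketch (not the literal code): with the source starting
 * "off" doublewords into its 64-byte block, each iteration of the
 * loops below behaves roughly like
 *
 *	for (i = 0; i < 8; i++)
 *		out[i] = align(src_dw[off + i], src_dw[off + i + 1]);
 *	block_store(dst, out); dst += 64;
 *
 * with the source doublewords triple-buffered in %f0-%f14, %f16-%f30
 * and %f48-%f62 so block loads can overlap the faligndata and store
 * work, and %o5 bounding how far ahead we may read.
 */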
699#if 1
700	and	%o0, BLOCK_ALIGN, %o3
701	srax	%o3, 3, %o3				! Isolate the offset
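	! %o3 is now the source offset within the 64-byte block, in
	! doublewords (0-7); the branch tree below tests bits 2, 1 and 0
	! of it to select one of L100-L107.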
702
703	brz	%o3, L100				! 0->0
704	 btst	4, %o3
705	bnz	%xcc, 4f
706	 btst	2, %o3
707	bnz	%xcc, 2f
708	 btst	1, %o3
709	ba,pt	%xcc, L101				! 0->1
710	 nop	/* XXX spitfire bug */
7112:
712	bz	%xcc, L102				! 0->2
713	 nop
714	ba,pt	%xcc, L103				! 0->3
715	 nop	/* XXX spitfire bug */
7164:
717	bnz	%xcc, 2f
718	 btst	1, %o3
719	bz	%xcc, L104				! 0->4
720	 nop
721	ba,pt	%xcc, L105				! 0->5
722	 nop	/* XXX spitfire bug */
7232:
724	bz	%xcc, L106				! 0->6
725	 nop
726	ba,pt	%xcc, L107				! 0->7
727	 nop	/* XXX spitfire bug */
728#else
729
730	!!
731	!! Isolate the word offset, which just happens to be
732	!! the slot in our jump table.
733	!!
734	!! This is 6 insns, most of which cannot be paired,
735	!! which is about the same as the above version.
736	!!
737	rd	%pc, %o4
7381:
739	and	%o0, 0x38, %o3
740	add	%o3, (Lmemcpy_block_jmp - 1b), %o3
741	jmpl	%o4 + %o3, %g0
742	 nop
743
744	!!
745	!! Jump table
746	!!
747
748Lmemcpy_block_jmp:
749	ba,a,pt	%xcc, L100
750	 nop
751	ba,a,pt	%xcc, L101
752	 nop
753	ba,a,pt	%xcc, L102
754	 nop
755	ba,a,pt	%xcc, L103
756	 nop
757	ba,a,pt	%xcc, L104
758	 nop
759	ba,a,pt	%xcc, L105
760	 nop
761	ba,a,pt	%xcc, L106
762	 nop
763	ba,a,pt	%xcc, L107
764	 nop
765#endif
766
767	!!
768	!! Source is block aligned.
769	!!
770	!! Just load a block and go.
771	!!
772L100:
773#ifdef RETURN_NAME
774	sethi	%hi(1f), %g1
775	ba,pt	%icc, 2f
776	 or	%g1, %lo(1f), %g1
7771:
778	.asciz	"L100"
779	.align	8
7802:
781#endif
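	!!
	!! Source block offset 0: block-load 64 bytes at a time, rotating
	!! through %f0-%f14, %f16-%f30 and %f48-%f62, align each pair into
	!! %f32-%f46 and block-store; %o5 guards the read-ahead.
	!!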
782	fmovd	%f0 , %f62
783	ldda	[%o0] ASI_BLK_P, %f0
784	inc	BLOCK_SIZE, %o0
785	cmp	%o0, %o5
786	bleu,a,pn	%icc, 3f
787	 ldda	[%o0] ASI_BLK_P, %f16
788	ba,pt	%icc, 3f
789	 membar #Sync
790
791	.align	32					! ICache align.
7923:
793	faligndata	%f62, %f0, %f32
794	inc	BLOCK_SIZE, %o0
795	faligndata	%f0, %f2, %f34
796	dec	BLOCK_SIZE, %o2
797	faligndata	%f2, %f4, %f36
798	cmp	%o0, %o5
799	faligndata	%f4, %f6, %f38
800	faligndata	%f6, %f8, %f40
801	faligndata	%f8, %f10, %f42
802	faligndata	%f10, %f12, %f44
803	brlez,pn	%o2, Lmemcpy_blockdone
804	 faligndata	%f12, %f14, %f46
805
806	bleu,a,pn	%icc, 2f
807	 ldda	[%o0] ASI_BLK_P, %f48
808	membar	#Sync
8092:
810	stda	%f32, [%o1] ASI_STORE
811	faligndata	%f14, %f16, %f32
812	inc	BLOCK_SIZE, %o0
813	faligndata	%f16, %f18, %f34
814	inc	BLOCK_SIZE, %o1
815	faligndata	%f18, %f20, %f36
816	dec	BLOCK_SIZE, %o2
817	faligndata	%f20, %f22, %f38
818	cmp	%o0, %o5
819	faligndata	%f22, %f24, %f40
820	faligndata	%f24, %f26, %f42
821	faligndata	%f26, %f28, %f44
822	brlez,pn	%o2, Lmemcpy_blockdone
823	 faligndata	%f28, %f30, %f46
824
825	bleu,a,pn	%icc, 2f
826	 ldda	[%o0] ASI_BLK_P, %f0
827	membar	#Sync
8282:
829	stda	%f32, [%o1] ASI_STORE
830	faligndata	%f30, %f48, %f32
831	inc	BLOCK_SIZE, %o0
832	faligndata	%f48, %f50, %f34
833	inc	BLOCK_SIZE, %o1
834	faligndata	%f50, %f52, %f36
835	dec	BLOCK_SIZE, %o2
836	faligndata	%f52, %f54, %f38
837	cmp	%o0, %o5
838	faligndata	%f54, %f56, %f40
839	faligndata	%f56, %f58, %f42
840	faligndata	%f58, %f60, %f44
841	brlez,pn	%o2, Lmemcpy_blockdone
842	 faligndata	%f60, %f62, %f46
843	bleu,a,pn	%icc, 2f
844	 ldda	[%o0] ASI_BLK_P, %f16			! Increment is at top
845	membar	#Sync
8462:
847	stda	%f32, [%o1] ASI_STORE
848	ba	3b
849	 inc	BLOCK_SIZE, %o1
850
851	!!
852	!! Source at BLOCK_ALIGN+8
853	!!
854	!! We need to load almost 1 complete block by hand.
855	!!
856L101:
857#ifdef RETURN_NAME
858	sethi	%hi(1f), %g1
859	ba,pt	%icc, 2f
860	 or	%g1, %lo(1f), %g1
8611:
862	.asciz	"L101"
863	.align	8
8642:
865#endif
866!	fmovd	%f0, %f0				! Hoist fmovd
867	ldd	[%o0], %f2
868	inc	8, %o0
869	ldd	[%o0], %f4
870	inc	8, %o0
871	ldd	[%o0], %f6
872	inc	8, %o0
873	ldd	[%o0], %f8
874	inc	8, %o0
875	ldd	[%o0], %f10
876	inc	8, %o0
877	ldd	[%o0], %f12
878	inc	8, %o0
879	ldd	[%o0], %f14
880	inc	8, %o0
881
882	cmp	%o0, %o5
883	bleu,a,pn	%icc, 3f
884	 ldda	[%o0] ASI_BLK_P, %f16
885	membar #Sync
8863:
887	faligndata	%f0, %f2, %f32
888	inc	BLOCK_SIZE, %o0
889	faligndata	%f2, %f4, %f34
890	cmp	%o0, %o5
891	faligndata	%f4, %f6, %f36
892	dec	BLOCK_SIZE, %o2
893	faligndata	%f6, %f8, %f38
894	faligndata	%f8, %f10, %f40
895	faligndata	%f10, %f12, %f42
896	faligndata	%f12, %f14, %f44
897	bleu,a,pn	%icc, 2f
898	 ldda	[%o0] ASI_BLK_P, %f48
899	membar	#Sync
9002:
901	brlez,pn	%o2, Lmemcpy_blockdone
902	 faligndata	%f14, %f16, %f46
903
904	stda	%f32, [%o1] ASI_STORE
905
906	faligndata	%f16, %f18, %f32
907	inc	BLOCK_SIZE, %o0
908	faligndata	%f18, %f20, %f34
909	inc	BLOCK_SIZE, %o1
910	faligndata	%f20, %f22, %f36
911	cmp	%o0, %o5
912	faligndata	%f22, %f24, %f38
913	dec	BLOCK_SIZE, %o2
914	faligndata	%f24, %f26, %f40
915	faligndata	%f26, %f28, %f42
916	faligndata	%f28, %f30, %f44
917	bleu,a,pn	%icc, 2f
918	 ldda	[%o0] ASI_BLK_P, %f0
919	membar	#Sync
9202:
921	brlez,pn	%o2, Lmemcpy_blockdone
922	 faligndata	%f30, %f48, %f46
923
924	stda	%f32, [%o1] ASI_STORE
925
926	faligndata	%f48, %f50, %f32
927	inc	BLOCK_SIZE, %o0
928	faligndata	%f50, %f52, %f34
929	inc	BLOCK_SIZE, %o1
930	faligndata	%f52, %f54, %f36
931	cmp	%o0, %o5
932	faligndata	%f54, %f56, %f38
933	dec	BLOCK_SIZE, %o2
934	faligndata	%f56, %f58, %f40
935	faligndata	%f58, %f60, %f42
936	faligndata	%f60, %f62, %f44
937	bleu,a,pn	%icc, 2f
938	 ldda	[%o0] ASI_BLK_P, %f16
939	membar	#Sync
9402:
941	brlez,pn	%o2, Lmemcpy_blockdone
942	 faligndata	%f62, %f0, %f46
943
944	stda	%f32, [%o1] ASI_STORE
945	ba	3b
946	 inc	BLOCK_SIZE, %o1
947
948	!!
949	!! Source at BLOCK_ALIGN+16
950	!!
951	!! We need to load 6 doubles by hand.
952	!!
953L102:
954#ifdef RETURN_NAME
955	sethi	%hi(1f), %g1
956	ba,pt	%icc, 2f
957	 or	%g1, %lo(1f), %g1
9581:
959	.asciz	"L102"
960	.align	8
9612:
962#endif
963	ldd	[%o0], %f4
964	inc	8, %o0
965	fmovd	%f0, %f2				! Hoist fmovd
966	ldd	[%o0], %f6
967	inc	8, %o0
968
969	ldd	[%o0], %f8
970	inc	8, %o0
971	ldd	[%o0], %f10
972	inc	8, %o0
973	ldd	[%o0], %f12
974	inc	8, %o0
975	ldd	[%o0], %f14
976	inc	8, %o0
977
978	cmp	%o0, %o5
979	bleu,a,pn	%icc, 3f
980	 ldda	[%o0] ASI_BLK_P, %f16
981	membar #Sync
9823:
983	faligndata	%f2, %f4, %f32
984	inc	BLOCK_SIZE, %o0
985	faligndata	%f4, %f6, %f34
986	cmp	%o0, %o5
987	faligndata	%f6, %f8, %f36
988	dec	BLOCK_SIZE, %o2
989	faligndata	%f8, %f10, %f38
990	faligndata	%f10, %f12, %f40
991	faligndata	%f12, %f14, %f42
992	bleu,a,pn	%icc, 2f
993	 ldda	[%o0] ASI_BLK_P, %f48
994	membar	#Sync
9952:
996	faligndata	%f14, %f16, %f44
997
998	brlez,pn	%o2, Lmemcpy_blockdone
999	 faligndata	%f16, %f18, %f46
1000
1001	stda	%f32, [%o1] ASI_STORE
1002
1003	faligndata	%f18, %f20, %f32
1004	inc	BLOCK_SIZE, %o0
1005	faligndata	%f20, %f22, %f34
1006	inc	BLOCK_SIZE, %o1
1007	faligndata	%f22, %f24, %f36
1008	cmp	%o0, %o5
1009	faligndata	%f24, %f26, %f38
1010	dec	BLOCK_SIZE, %o2
1011	faligndata	%f26, %f28, %f40
1012	faligndata	%f28, %f30, %f42
1013	bleu,a,pn	%icc, 2f
1014	 ldda	[%o0] ASI_BLK_P, %f0
1015	membar	#Sync
10162:
1017	faligndata	%f30, %f48, %f44
1018	brlez,pn	%o2, Lmemcpy_blockdone
1019	 faligndata	%f48, %f50, %f46
1020
1021	stda	%f32, [%o1] ASI_STORE
1022
1023	faligndata	%f50, %f52, %f32
1024	inc	BLOCK_SIZE, %o0
1025	faligndata	%f52, %f54, %f34
1026	inc	BLOCK_SIZE, %o1
1027	faligndata	%f54, %f56, %f36
1028	cmp	%o0, %o5
1029	faligndata	%f56, %f58, %f38
1030	dec	BLOCK_SIZE, %o2
1031	faligndata	%f58, %f60, %f40
1032	faligndata	%f60, %f62, %f42
1033	bleu,a,pn	%icc, 2f
1034	 ldda	[%o0] ASI_BLK_P, %f16
1035	membar	#Sync
10362:
1037	faligndata	%f62, %f0, %f44
1038	brlez,pn	%o2, Lmemcpy_blockdone
1039	 faligndata	%f0, %f2, %f46
1040
1041	stda	%f32, [%o1] ASI_STORE
1042	ba	3b
1043	 inc	BLOCK_SIZE, %o1
1044
1045	!!
1046	!! Source at BLOCK_ALIGN+24
1047	!!
1048	!! We need to load 5 doubles by hand.
1049	!!
1050L103:
1051#ifdef RETURN_NAME
1052	sethi	%hi(1f), %g1
1053	ba,pt	%icc, 2f
1054	 or	%g1, %lo(1f), %g1
10551:
1056	.asciz	"L103"
1057	.align	8
10582:
1059#endif
1060	fmovd	%f0, %f4
1061	ldd	[%o0], %f6
1062	inc	8, %o0
1063	ldd	[%o0], %f8
1064	inc	8, %o0
1065	ldd	[%o0], %f10
1066	inc	8, %o0
1067	ldd	[%o0], %f12
1068	inc	8, %o0
1069	ldd	[%o0], %f14
1070	inc	8, %o0
1071
1072	cmp	%o0, %o5
1073	bleu,a,pn	%icc, 2f
1074	 ldda	[%o0] ASI_BLK_P, %f16
1075	membar #Sync
10762:
1077	inc	BLOCK_SIZE, %o0
10783:
1079	faligndata	%f4, %f6, %f32
1080	cmp	%o0, %o5
1081	faligndata	%f6, %f8, %f34
1082	dec	BLOCK_SIZE, %o2
1083	faligndata	%f8, %f10, %f36
1084	faligndata	%f10, %f12, %f38
1085	faligndata	%f12, %f14, %f40
1086	bleu,a,pn	%icc, 2f
1087	 ldda	[%o0] ASI_BLK_P, %f48
1088	membar	#Sync
10892:
1090	faligndata	%f14, %f16, %f42
1091	inc	BLOCK_SIZE, %o0
1092	faligndata	%f16, %f18, %f44
1093	brlez,pn	%o2, Lmemcpy_blockdone
1094	 faligndata	%f18, %f20, %f46
1095
1096	stda	%f32, [%o1] ASI_STORE
1097
1098	faligndata	%f20, %f22, %f32
1099	cmp	%o0, %o5
1100	faligndata	%f22, %f24, %f34
1101	dec	BLOCK_SIZE, %o2
1102	faligndata	%f24, %f26, %f36
1103	inc	BLOCK_SIZE, %o1
1104	faligndata	%f26, %f28, %f38
1105	faligndata	%f28, %f30, %f40
1106	bleu,a,pn	%icc, 2f
1107	 ldda	[%o0] ASI_BLK_P, %f0
1108	membar	#Sync
11092:
1110	faligndata	%f30, %f48, %f42
1111	inc	BLOCK_SIZE, %o0
1112	faligndata	%f48, %f50, %f44
1113	brlez,pn	%o2, Lmemcpy_blockdone
1114	 faligndata	%f50, %f52, %f46
1115
1116	stda	%f32, [%o1] ASI_STORE
1117
1118	faligndata	%f52, %f54, %f32
1119	cmp	%o0, %o5
1120	faligndata	%f54, %f56, %f34
1121	dec	BLOCK_SIZE, %o2
1122	faligndata	%f56, %f58, %f36
1123	faligndata	%f58, %f60, %f38
1124	inc	BLOCK_SIZE, %o1
1125	faligndata	%f60, %f62, %f40
1126	bleu,a,pn	%icc, 2f
1127	 ldda	[%o0] ASI_BLK_P, %f16
1128	membar	#Sync
11292:
1130	faligndata	%f62, %f0, %f42
1131	inc	BLOCK_SIZE, %o0
1132	faligndata	%f0, %f2, %f44
1133	brlez,pn	%o2, Lmemcpy_blockdone
1134	 faligndata	%f2, %f4, %f46
1135
1136	stda	%f32, [%o1] ASI_STORE
1137	ba	3b
1138	 inc	BLOCK_SIZE, %o1
1139
1140	!!
1141	!! Source at BLOCK_ALIGN+32
1142	!!
1143	!! We need to load 4 doubles by hand.
1144	!!
1145L104:
1146#ifdef RETURN_NAME
1147	sethi	%hi(1f), %g1
1148	ba,pt	%icc, 2f
1149	 or	%g1, %lo(1f), %g1
11501:
1151	.asciz	"L104"
1152	.align	8
11532:
1154#endif
1155	fmovd	%f0, %f6
1156	ldd	[%o0], %f8
1157	inc	8, %o0
1158	ldd	[%o0], %f10
1159	inc	8, %o0
1160	ldd	[%o0], %f12
1161	inc	8, %o0
1162	ldd	[%o0], %f14
1163	inc	8, %o0
1164
1165	cmp	%o0, %o5
1166	bleu,a,pn	%icc, 2f
1167	 ldda	[%o0] ASI_BLK_P, %f16
1168	membar #Sync
11692:
1170	inc	BLOCK_SIZE, %o0
11713:
1172	faligndata	%f6, %f8, %f32
1173	cmp	%o0, %o5
1174	faligndata	%f8, %f10, %f34
1175	dec	BLOCK_SIZE, %o2
1176	faligndata	%f10, %f12, %f36
1177	faligndata	%f12, %f14, %f38
1178	bleu,a,pn	%icc, 2f
1179	 ldda	[%o0] ASI_BLK_P, %f48
1180	membar	#Sync
11812:
1182	faligndata	%f14, %f16, %f40
1183	faligndata	%f16, %f18, %f42
1184	inc	BLOCK_SIZE, %o0
1185	faligndata	%f18, %f20, %f44
1186	brlez,pn	%o2, Lmemcpy_blockdone
1187	 faligndata	%f20, %f22, %f46
1188
1189	stda	%f32, [%o1] ASI_STORE
1190
1191	faligndata	%f22, %f24, %f32
1192	cmp	%o0, %o5
1193	faligndata	%f24, %f26, %f34
1194	faligndata	%f26, %f28, %f36
1195	inc	BLOCK_SIZE, %o1
1196	faligndata	%f28, %f30, %f38
1197	bleu,a,pn	%icc, 2f
1198	 ldda	[%o0] ASI_BLK_P, %f0
1199	membar	#Sync
12002:
1201	faligndata	%f30, %f48, %f40
1202	dec	BLOCK_SIZE, %o2
1203	faligndata	%f48, %f50, %f42
1204	inc	BLOCK_SIZE, %o0
1205	faligndata	%f50, %f52, %f44
1206	brlez,pn	%o2, Lmemcpy_blockdone
1207	 faligndata	%f52, %f54, %f46
1208
1209	stda	%f32, [%o1] ASI_STORE
1210
1211	faligndata	%f54, %f56, %f32
1212	cmp	%o0, %o5
1213	faligndata	%f56, %f58, %f34
1214	faligndata	%f58, %f60, %f36
1215	inc	BLOCK_SIZE, %o1
1216	faligndata	%f60, %f62, %f38
1217	bleu,a,pn	%icc, 2f
1218	 ldda	[%o0] ASI_BLK_P, %f16
1219	membar	#Sync
12202:
1221	faligndata	%f62, %f0, %f40
1222	dec	BLOCK_SIZE, %o2
1223	faligndata	%f0, %f2, %f42
1224	inc	BLOCK_SIZE, %o0
1225	faligndata	%f2, %f4, %f44
1226	brlez,pn	%o2, Lmemcpy_blockdone
1227	 faligndata	%f4, %f6, %f46
1228
1229	stda	%f32, [%o1] ASI_STORE
1230	ba	3b
1231	 inc	BLOCK_SIZE, %o1
1232
1233	!!
1234	!! Source at BLOCK_ALIGN+40
1235	!!
1236	!! We need to load 3 doubles by hand.
1237	!!
1238L105:
1239#ifdef RETURN_NAME
1240	sethi	%hi(1f), %g1
1241	ba,pt	%icc, 2f
1242	 or	%g1, %lo(1f), %g1
12431:
1244	.asciz	"L105"
1245	.align	8
12462:
1247#endif
1248	fmovd	%f0, %f8
1249	ldd	[%o0], %f10
1250	inc	8, %o0
1251	ldd	[%o0], %f12
1252	inc	8, %o0
1253	ldd	[%o0], %f14
1254	inc	8, %o0
1255
1256	cmp	%o0, %o5
1257	bleu,a,pn	%icc, 2f
1258	 ldda	[%o0] ASI_BLK_P, %f16
1259	membar #Sync
12602:
1261	inc	BLOCK_SIZE, %o0
12623:
1263	faligndata	%f8, %f10, %f32
1264	cmp	%o0, %o5
1265	faligndata	%f10, %f12, %f34
1266	faligndata	%f12, %f14, %f36
1267	bleu,a,pn	%icc, 2f
1268	 ldda	[%o0] ASI_BLK_P, %f48
1269	membar	#Sync
12702:
1271	faligndata	%f14, %f16, %f38
1272	dec	BLOCK_SIZE, %o2
1273	faligndata	%f16, %f18, %f40
1274	inc	BLOCK_SIZE, %o0
1275	faligndata	%f18, %f20, %f42
1276	faligndata	%f20, %f22, %f44
1277	brlez,pn	%o2, Lmemcpy_blockdone
1278	 faligndata	%f22, %f24, %f46
1279
1280	stda	%f32, [%o1] ASI_STORE
1281
1282	faligndata	%f24, %f26, %f32
1283	cmp	%o0, %o5
1284	faligndata	%f26, %f28, %f34
1285	dec	BLOCK_SIZE, %o2
1286	faligndata	%f28, %f30, %f36
1287	bleu,a,pn	%icc, 2f
1288	 ldda	[%o0] ASI_BLK_P, %f0
1289	membar	#Sync
12902:
1291	faligndata	%f30, %f48, %f38
1292	inc	BLOCK_SIZE, %o1
1293	faligndata	%f48, %f50, %f40
1294	inc	BLOCK_SIZE, %o0
1295	faligndata	%f50, %f52, %f42
1296	faligndata	%f52, %f54, %f44
1297	brlez,pn	%o2, Lmemcpy_blockdone
1298	 faligndata	%f54, %f56, %f46
1299
1300	stda	%f32, [%o1] ASI_STORE
1301
1302	faligndata	%f56, %f58, %f32
1303	cmp	%o0, %o5
1304	faligndata	%f58, %f60, %f34
1305	dec	BLOCK_SIZE, %o2
1306	faligndata	%f60, %f62, %f36
1307	bleu,a,pn	%icc, 2f
1308	 ldda	[%o0] ASI_BLK_P, %f16
1309	membar	#Sync
13102:
1311	faligndata	%f62, %f0, %f38
1312	inc	BLOCK_SIZE, %o1
1313	faligndata	%f0, %f2, %f40
1314	inc	BLOCK_SIZE, %o0
1315	faligndata	%f2, %f4, %f42
1316	faligndata	%f4, %f6, %f44
1317	brlez,pn	%o2, Lmemcpy_blockdone
1318	 faligndata	%f6, %f8, %f46
1319
1320	stda	%f32, [%o1] ASI_STORE
1321	ba	3b
1322	 inc	BLOCK_SIZE, %o1
1323
1324
1325	!!
1326	!! Source at BLOCK_ALIGN+48
1327	!!
1328	!! We need to load 2 doubles by hand.
1329	!!
1330L106:
1331#ifdef RETURN_NAME
1332	sethi	%hi(1f), %g1
1333	ba,pt	%icc, 2f
1334	 or	%g1, %lo(1f), %g1
13351:
1336	.asciz	"L106"
1337	.align	8
13382:
1339#endif
1340	fmovd	%f0, %f10
1341	ldd	[%o0], %f12
1342	inc	8, %o0
1343	ldd	[%o0], %f14
1344	inc	8, %o0
1345
1346	cmp	%o0, %o5
1347	bleu,a,pn	%icc, 2f
1348	 ldda	[%o0] ASI_BLK_P, %f16
1349	membar #Sync
13502:
1351	inc	BLOCK_SIZE, %o0
13523:
1353	faligndata	%f10, %f12, %f32
1354	cmp	%o0, %o5
1355	faligndata	%f12, %f14, %f34
1356	bleu,a,pn	%icc, 2f
1357	 ldda	[%o0] ASI_BLK_P, %f48
1358	membar	#Sync
13592:
1360	faligndata	%f14, %f16, %f36
1361	dec	BLOCK_SIZE, %o2
1362	faligndata	%f16, %f18, %f38
1363	inc	BLOCK_SIZE, %o0
1364	faligndata	%f18, %f20, %f40
1365	faligndata	%f20, %f22, %f42
1366	faligndata	%f22, %f24, %f44
1367	brlez,pn	%o2, Lmemcpy_blockdone
1368	 faligndata	%f24, %f26, %f46
1369
1370	stda	%f32, [%o1] ASI_STORE
1371
1372	faligndata	%f26, %f28, %f32
1373	cmp	%o0, %o5
1374	faligndata	%f28, %f30, %f34
1375	bleu,a,pn	%icc, 2f
1376	 ldda	[%o0] ASI_BLK_P, %f0
1377	membar	#Sync
13782:
1379	faligndata	%f30, %f48, %f36
1380	dec	BLOCK_SIZE, %o2
1381	faligndata	%f48, %f50, %f38
1382	inc	BLOCK_SIZE, %o1
1383	faligndata	%f50, %f52, %f40
1384	faligndata	%f52, %f54, %f42
1385	inc	BLOCK_SIZE, %o0
1386	faligndata	%f54, %f56, %f44
1387	brlez,pn	%o2, Lmemcpy_blockdone
1388	 faligndata	%f56, %f58, %f46
1389
1390	stda	%f32, [%o1] ASI_STORE
1391
1392	faligndata	%f58, %f60, %f32
1393	cmp	%o0, %o5
1394	faligndata	%f60, %f62, %f34
1395	bleu,a,pn	%icc, 2f
1396	 ldda	[%o0] ASI_BLK_P, %f16
1397	membar	#Sync
13982:
1399	faligndata	%f62, %f0, %f36
1400	dec	BLOCK_SIZE, %o2
1401	faligndata	%f0, %f2, %f38
1402	inc	BLOCK_SIZE, %o1
1403	faligndata	%f2, %f4, %f40
1404	faligndata	%f4, %f6, %f42
1405	inc	BLOCK_SIZE, %o0
1406	faligndata	%f6, %f8, %f44
1407	brlez,pn	%o2, Lmemcpy_blockdone
1408	 faligndata	%f8, %f10, %f46
1409
1410	stda	%f32, [%o1] ASI_STORE
1411	ba	3b
1412	 inc	BLOCK_SIZE, %o1
1413
1414
1415	!!
1416	!! Source at BLOCK_ALIGN+56
1417	!!
1418	!! We need to load 1 double by hand.
1419	!!
1420L107:
1421#ifdef RETURN_NAME
1422	sethi	%hi(1f), %g1
1423	ba,pt	%icc, 2f
1424	 or	%g1, %lo(1f), %g1
14251:
1426	.asciz	"L107"
1427	.align	8
14282:
1429#endif
1430	fmovd	%f0, %f12
1431	ldd	[%o0], %f14
1432	inc	8, %o0
1433
1434	cmp	%o0, %o5
1435	bleu,a,pn	%icc, 2f
1436	 ldda	[%o0] ASI_BLK_P, %f16
1437	membar #Sync
14382:
1439	inc	BLOCK_SIZE, %o0
14403:
1441	faligndata	%f12, %f14, %f32
1442	cmp	%o0, %o5
1443	bleu,a,pn	%icc, 2f
1444	 ldda	[%o0] ASI_BLK_P, %f48
1445	membar	#Sync
14462:
1447	faligndata	%f14, %f16, %f34
1448	dec	BLOCK_SIZE, %o2
1449	faligndata	%f16, %f18, %f36
1450	inc	BLOCK_SIZE, %o0
1451	faligndata	%f18, %f20, %f38
1452	faligndata	%f20, %f22, %f40
1453	faligndata	%f22, %f24, %f42
1454	faligndata	%f24, %f26, %f44
1455	brlez,pn	%o2, Lmemcpy_blockdone
1456	 faligndata	%f26, %f28, %f46
1457
1458	stda	%f32, [%o1] ASI_STORE
1459
1460	faligndata	%f28, %f30, %f32
1461	cmp	%o0, %o5
1462	bleu,a,pn	%icc, 2f
1463	 ldda	[%o0] ASI_BLK_P, %f0
1464	membar	#Sync
14652:
1466	faligndata	%f30, %f48, %f34
1467	dec	BLOCK_SIZE, %o2
1468	faligndata	%f48, %f50, %f36
1469	inc	BLOCK_SIZE, %o1
1470	faligndata	%f50, %f52, %f38
1471	faligndata	%f52, %f54, %f40
1472	inc	BLOCK_SIZE, %o0
1473	faligndata	%f54, %f56, %f42
1474	faligndata	%f56, %f58, %f44
1475	brlez,pn	%o2, Lmemcpy_blockdone
1476	 faligndata	%f58, %f60, %f46
1477
1478	stda	%f32, [%o1] ASI_STORE
1479
1480	faligndata	%f60, %f62, %f32
1481	cmp	%o0, %o5
1482	bleu,a,pn	%icc, 2f
1483	 ldda	[%o0] ASI_BLK_P, %f16
1484	membar	#Sync
14852:
1486	faligndata	%f62, %f0, %f34
1487	dec	BLOCK_SIZE, %o2
1488	faligndata	%f0, %f2, %f36
1489	inc	BLOCK_SIZE, %o1
1490	faligndata	%f2, %f4, %f38
1491	faligndata	%f4, %f6, %f40
1492	inc	BLOCK_SIZE, %o0
1493	faligndata	%f6, %f8, %f42
1494	faligndata	%f8, %f10, %f44
1495
1496	brlez,pn	%o2, Lmemcpy_blockdone
1497	 faligndata	%f10, %f12, %f46
1498
1499	stda	%f32, [%o1] ASI_STORE
1500	ba	3b
1501	 inc	BLOCK_SIZE, %o1
1502
1503Lmemcpy_blockdone:
1504	inc	BLOCK_SIZE, %o2				! Fixup our overcommit
1505	membar	#Sync					! Finish any pending loads
1506#define	FINISH_REG(f)				\
1507	deccc	8, %o2;				\
1508	bl,a	Lmemcpy_blockfinish;		\
1509	 fmovd	f, %f48;			\
1510	std	f, [%o1];			\
1511	inc	8, %o1
1512
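	!!
	!! %f32-%f46 hold aligned data that was generated but not yet
	!! stored.  Each FINISH_REG stores one doubleword while at least
	!! 8 bytes remain; the first register that does not fit is moved
	!! to %f48 for the sub-word code at Lmemcpy_blockfinish.
	!!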
1513	FINISH_REG(%f32)
1514	FINISH_REG(%f34)
1515	FINISH_REG(%f36)
1516	FINISH_REG(%f38)
1517	FINISH_REG(%f40)
1518	FINISH_REG(%f42)
1519	FINISH_REG(%f44)
1520	FINISH_REG(%f46)
1521	FINISH_REG(%f48)
1522#undef FINISH_REG
1523	!!
1524	!! The low 3 bits of %o2 give the sub-word bytes still to be
1525	!! stored [because (x-8) & 0x7 == x & 0x7].
1526	!!
1527Lmemcpy_blockfinish:
1528	brz,pn	%o2, 2f					! 100% complete?
1529	 fmovd	%f48, %f4
1530	cmp	%o2, 8					! Exactly 8 bytes?
1531	bz,a,pn	CCCR, 2f
1532	 std	%f4, [%o1]
1533
1534	btst	4, %o2					! Word store?
1535	bz	CCCR, 1f
1536	 nop
1537	st	%f4, [%o1]
1538	inc	4, %o1
15391:
1540	btst	2, %o2
1541	fzero	%f0
1542	bz	1f
1543
1544	 mov	-6, %o4
1545	alignaddr %o1, %o4, %g0
1546
1547	faligndata %f0, %f4, %f8
1548
1549	stda	%f8, [%o1] ASI_FL16_P			! Store short
1550	inc	2, %o1
15511:
1552	btst	1, %o2					! Byte aligned?
1553	bz	2f
1554
1555	 mov	-7, %o0					! Calculate dest - 7
1556	alignaddr %o1, %o0, %g0				! Calculate shift mask and dest.
1557
1558	faligndata %f0, %f4, %f8			! Move 1st byte to low part of f8
1559
1560	stda	%f8, [%o1] ASI_FL8_P			! Store 1st byte
1561	inc	1, %o1					! Update address
15622:
1563	membar	#Sync
1564#if 0
1565	!!
1566	!! verify copy success.
1567	!!
1568
1569	mov	%i0, %o2
1570	mov	%i1, %o4
1571	mov	%i2, %l4
15720:
1573	ldub	[%o2], %o1
1574	inc	%o2
1575	ldub	[%o4], %o3
1576	inc	%o4
1577	cmp	%o3, %o1
1578	bnz	1f
1579	 dec	%l4
1580	brnz	%l4, 0b
1581	 nop
1582	ba	2f
1583	 nop
1584
15851:
1586	set	block_disable, %o0
1587	stx	%o0, [%o0]
1588
1589	set	0f, %o0
1590	call	prom_printf
1591	 sub	%i2, %l4, %o5
1592	set	1f, %o0
1593	mov	%i0, %o2
1594	mov	%i1, %o1
1595	call	prom_printf
1596	 mov	%i2, %o3
1597	ta	1
1598	.data
1599	_ALIGN
16000:	.asciz	"block memcpy failed: %x@%p != %x@%p byte %d\r\n"
16011:	.asciz	"memcpy(%p, %p, %lx)\r\n"
1602	_ALIGN
1603	.text
16042:
1605#endif
1606#if defined(_KERNEL) && !defined(_RUMPKERNEL)
1607
1608/*
1609 * We've saved our possible fpstate; now disable the FPU
1610 * and continue with life.
1611 */
1612	RESTORE_FPU
1613	ret
1614	 restore	%g1, 0, %o0			! Return DEST for memcpy
1615#endif
1616 	retl
1617	 mov	%g1, %o0
1618/*
1619 * Use block_disable to turn off block insns for
1620 * memcpy/memset
1621 */
1622	.data
1623	.align	8
1624	.globl	block_disable
1625block_disable:	.xword	1
1626	.text
1627#endif	/* USE_BLOCK_STORE_LOAD */
1628