1#include "setarch.h"
2
3#include "defines.h"
4
5#ifdef __H8300SX__
6
	.global _memcpy
_memcpy:
	; void *memcpy(void *dst, const void *src, size_t len)
	;
	; H8SX implementation built around the movmd string-move
	; instruction: movmd.b copies r4 bytes and movmd.l copies r4
	; longwords, each from @er5+ to @er6+, counting r4 down to 0.
	;
	; In:   er0 = dst, er1 = src, (e)r2 = len
	; Out:  er0 is never written after the copy below, so the dst
	;       pointer is returned unchanged.
	; LEN() comes from defines.h; presumably it selects the 16-bit
	; (normal mode) or 32-bit view of its register depending on
	; __NORMAL_MODE__ -- confirm against defines.h.
	;
	; er4-er6 are callee-saved: pushed here and restored by every
	; rts/l below.
	stm.l	er4-er6,@-er7

	; Set up source and destination pointers for movmd.
	mov.l	er0,er6			; er6 = dst (movmd destination)
	mov.l	er1,er5			; er5 = src (movmd source)

	; See whether the copy is long enough to use the movmd.l code.
	; Although the code can handle anything longer than 6 bytes,
	; it can be more expensive than movmd.b for small moves.
	; It's better to use a higher threshold to account for this.
	;
	; Note that the exact overhead of the movmd.l checks depends on
	; the alignments of the length and pointers.  They are faster when
	; er0 & 3 == er1 & 3 == er2 & 3, faster still when these values
	; are 0.  This threshold is a compromise between the various cases.
	cmp	#16,LEN(r2)
	blo	simple

	; movmd.l only works for even addresses.  If one of the addresses
	; is odd and the other is not, fall back on a simple move.
	bld	#0,r5l			; C = src & 1
	bxor	#0,r6l			; C = (src & 1) ^ (dst & 1)
	bcs	simple			; parity differs: bytewise copy only

	; Make the addresses even.  (Here src & 1 == dst & 1, so one
	; byte fixes both.)
	bld	#0,r5l			; C = src & 1
	bcc	word_aligned
	mov.b	@er5+,@er6+		; copy one byte; both pointers now even
	sub	#1,LEN(r2)

word_aligned:
	; See if copying one word would make the first operand longword
	; aligned.  Although this is only really worthwhile if it aligns
	; the second operand as well, it's no worse if doesn't, so it
	; hardly seems worth the overhead of a "band" check.
	bld	#1,r6l			; C = bit 1 of dst
	bcc	fast_copy
	mov.w	@er5+,@er6+		; copy one word; dst now 4-byte aligned
	sub	#2,LEN(r2)

fast_copy:
	; Set (e)r4 to the number of longwords to copy.
	mov	LEN(r2),LEN(r4)
	shlr	#2,LEN(r4)

#ifdef __NORMAL_MODE__
	; 16-bit pointers and size_ts: one movmd.l is enough.  This code
	; is never reached with r4 == 0.
	movmd.l				; copy r4 longwords
	and.w	#3,r2			; r2 = trailing byte count (0-3)
simple:
	mov.w	r2,r4
	beq	quit			; nothing (left) to copy
	movmd.b				; copy r4 bytes
quit:
	rts/l	er4-er6			; restore er4-er6 and return (er0 = dst)
#else
	; Skip the first iteration if the number of longwords is divisible
	; by 0x10000.
	mov.w	r4,r4			; test low word of the longword count
	beq	fast_loop_next

	; This loop copies r4 (!= 0) longwords the first time round and 65536
	; longwords on each iteration after that.
fast_loop:
	movmd.l
fast_loop_next:
	sub.w	#1,e4			; one fewer 65536-longword chunk;
	bhs	fast_loop		; loop until e4 borrows below zero

	; Mop up any left-over bytes.  We could just fall through to the
	; simple code after the "and" but the version below is quicker
	; and only takes 10 more bytes.
	and.w	#3,r2			; r2 = trailing byte count (0-3)
	beq	quit
	mov.w	r2,r4
	movmd.b				; copy the last r4 bytes
quit:
	rts/l	er4-er6			; restore er4-er6 and return (er0 = dst)

simple:
	; Simple bytewise copy.  We need to handle all lengths, including zero.
	; As in fast_loop above: the first movmd.b copies r4 = len & 0xffff
	; bytes, and each later iteration copies 65536 bytes, e2 times.
	mov.w	r2,r4
	beq	simple_loop_next
simple_loop:
	movmd.b
simple_loop_next:
	sub.w	#1,e2			; one fewer 65536-byte chunk;
	bhs	simple_loop		; loop until e2 borrows below zero
	rts/l	er4-er6			; restore er4-er6 and return (er0 = dst)
#endif
100
101#else
102
	.global _memcpy
_memcpy:
	; void *memcpy(void *dst, const void *src, size_t len)
	;
	; Generic H8/300 version: copies BACKWARDS, from the last byte
	; down to the first, using a word loop when dst, src and len are
	; all even and a byte loop otherwise.
	;
	; MOVP/ADDP/CMPP and the AnP/AnL names are macros -- presumably
	; pointer-sized operations and argument-register aliases from
	; setarch.h/defines.h that adapt to normal vs. advanced mode;
	; confirm against those headers.
	;
	; In:   A0P = dst, A1P = src, A2P = len
	; Out:  A0P decremented back to its original value, i.e. dst.
;	MOVP	@(2/4,r7),A0P	; dst
;	MOVP	@(4/8,r7),A1P	; src
;	MOVP	@(6/12,r7),A2P	; len

	MOVP	A0P,A3P	; keep copy of final dst
	ADDP	A2P,A0P	; point to end of dst
	CMPP	A0P,A3P	; see if anything to do
	beq	quit	; len == 0: nothing to copy

	ADDP	A2P,A1P	; point to end of src

	; Let's see if we can do this in words.  A2L starts out holding
	; the low byte of len; OR in the address bytes so that bit 0 of
	; the result is clear only if the length and both buffers' end
	; (and hence start) addresses are all even.
	or	A0L,A2L	; or in the dst end address
	or	A3L,A2L	; or in the dst start address
	or	A1L,A2L	; or in the src end address
	btst	#0,A2L	; see if the lsb is zero
	bne	byteloop	; something is odd: fall back to bytes

wordloop:
#ifdef __NORMAL_MODE__
	sub	#2,A1P	; point to word (16-bit pointer regs here;
			; subs presumably only applies to 32-bit ERn)
#else
	subs	#2,A1P		; point to word
#endif
	mov.w	@A1P,A2		; get word
	mov.w	A2,@-A0P	; save word
	CMPP	A0P,A3P		; at the front again ?
	bne 	wordloop
	rts			; A0P == dst again here

byteloop:
#ifdef __NORMAL_MODE__
	sub	#1,A1P	; point to byte (see wordloop note on sub vs subs)
#else
	subs	#1,A1P		; point to byte
#endif
	mov.b	@A1P,A2L	; get byte
	mov.b	A2L,@-A0P	; save byte
	CMPP	A0P,A3P 	; at the front again ?
	bne 	byteloop

	; return with A0 pointing to dst
quit:	rts
148
149#endif
150