;; memcpy for the Renesas H8/300 family.
;;
;; C equivalent:  void *memcpy(void *dst, const void *src, size_t len)
;;
;; Two mutually exclusive implementations live in this file:
;;   * __H8300SX__ : built around the H8SX block-move insns
;;                   (movmd.b / movmd.l).
;;   * otherwise   : a plain backwards byte/word copy loop.
;; A0P/A1P/A2P/A3P, LEN(), MOVP/ADDP/CMPP are macros from defines.h;
;; presumably they select 16-bit (r*) or 32-bit (er*) registers and
;; insns according to __NORMAL_MODE__ -- confirm there.

#include "setarch.h"

#include "defines.h"

#ifdef __H8300SX__

	.global	_memcpy
_memcpy:
	; Arguments arrive in er0 (dst), er1 (src), (e)r2 (len) --
	; NOTE(review): inferred from the register use below; confirm
	; against the H8 calling convention.
	; movmd copies @er5+ -> @er6+ with r4 as the repeat count, so
	; er4-er6 get clobbered; preserve them across the call.
	stm.l	er4-er6,@-er7

	; Set up source and destination pointers for movmd.
	mov.l	er0,er6			; er6 = dst (movmd destination)
	mov.l	er1,er5			; er5 = src (movmd source)

	; See whether the copy is long enough to use the movmd.l code.
	; Although the code can handle anything longer than 6 bytes,
	; it can be more expensive than movmd.b for small moves.
	; It's better to use a higher threshold to account for this.
	;
	; Note that the exact overhead of the movmd.l checks depends on
	; the alignments of the length and pointers.  They are faster when
	; er0 & 3 == er1 & 3 == er2 & 3, faster still when these values
	; are 0.  This threshold is a compromise between the various cases.
	cmp	#16,LEN(r2)
	blo	simple

	; movmd.l only works for even addresses.  If one of the addresses
	; is odd and the other is not, fall back on a simple move.
	bld	#0,r5l			; C = src bit 0
	bxor	#0,r6l			; C ^= dst bit 0
	bcs	simple			; parities differ -> bytewise copy

	; Make the addresses even.
	bld	#0,r5l			; C = src bit 0 (== dst bit 0 here)
	bcc	word_aligned
	mov.b	@er5+,@er6+		; copy one byte to even both pointers
	sub	#1,LEN(r2)

word_aligned:
	; See if copying one word would make the first operand longword
	; aligned.  Although this is only really worthwhile if it aligns
	; the second operand as well, it's no worse if doesn't, so it
	; hardly seems worth the overhead of a "band" check.
	bld	#1,r6l
	bcc	fast_copy
	mov.w	@er5+,@er6+
	sub	#2,LEN(r2)

fast_copy:
	; Set (e)r4 to the number of longwords to copy.
	mov	LEN(r2),LEN(r4)
	shlr	#2,LEN(r4)

#ifdef __NORMAL_MODE__
	; 16-bit pointers and size_ts: one movmd.l is enough.  This code
	; is never reached with r4 == 0.
	movmd.l
	and.w	#3,r2			; r2 = trailing byte count (0..3)
simple:
	; Bytewise copy of r2 bytes (also the short/unaligned entry point).
	mov.w	r2,r4
	beq	quit			; nothing (left) to copy
	movmd.b
quit:
	rts/l	er4-er6			; restore er4-er6; er0 (dst) untouched

#else
	; 32-bit longword count in er4: r4 feeds movmd.l (r4 == 0 moves
	; 65536 longwords), e4 counts the remaining 64K-longword blocks.
	; Skip the first iteration if the number of longwords is divisible
	; by 0x10000.
	mov.w	r4,r4			; test low word of the count
	beq	fast_loop_next

	; This loop copies r4 (!= 0) longwords the first time round and 65536
	; longwords on each iteration after that.
fast_loop:
	movmd.l				; leaves r4 == 0, i.e. 65536 next time
fast_loop_next:
	sub.w	#1,e4
	bhs	fast_loop

	; Mop up any left-over bytes.  We could just fall through to the
	; simple code after the "and" but the version below is quicker
	; and only takes 10 more bytes.
	and.w	#3,r2			; r2 = trailing byte count (0..3)
	beq	quit
	mov.w	r2,r4
	movmd.b
quit:
	rts/l	er4-er6			; restore er4-er6; er0 (dst) untouched

simple:
	; Simple bytewise copy.  We need to handle all lengths, including zero.
	; First pass moves r2 bytes; each later pass moves 65536 (r4 == 0
	; after movmd.b), with e2 counting the remaining 64K-byte blocks.
	mov.w	r2,r4
	beq	simple_loop_next
simple_loop:
	movmd.b
simple_loop_next:
	sub.w	#1,e2
	bhs	simple_loop
	rts/l	er4-er6			; restore er4-er6; er0 (dst) untouched
#endif

#else

	.global	_memcpy
_memcpy:
	; Arguments are already in registers; these stack loads are kept
	; only for reference.
; MOVP @(2/4,r7),A0P ; dst
; MOVP @(4/8,r7),A1P ; src
; MOVP @(6/12,r7),A2P ; len

	; The copy runs backwards: both pointers start one past the end
	; and are pre-decremented, so A0P finishes back at dst -- which
	; doubles as memcpy's return value (presumably A0P maps to (e)r0;
	; confirm in defines.h).
	MOVP	A0P,A3P			; keep copy of final dst
	ADDP	A2P,A0P			; point to end of dst
	CMPP	A0P,A3P			; see if anything to do
	beq	quit

	ADDP	A2P,A1P			; point to end of src

	; Lets see if we can do this in words: a word copy is safe only
	; if dst, src and len are all even.  A2L already holds the low
	; byte of len; fold in the low bytes of the three pointers.
	or	A0L,A2L			; or in the end-of-dst address
	or	A3L,A2L			; or in the start-of-dst address
	or	A1L,A2L			; or in the end-of-src address
	btst	#0,A2L			; see if the lsb is zero
	bne	byteloop		; anything odd -> bytewise copy

wordloop:
#ifdef __NORMAL_MODE__
	sub	#2,A1P			; point to word
#else
	subs	#2,A1P			; point to word
#endif
	mov.w	@A1P,A2			; get word
	mov.w	A2,@-A0P		; save word
	CMPP	A0P,A3P			; at the front again ?
	bne	wordloop
	rts				; A0P is back at dst here

byteloop:
#ifdef __NORMAL_MODE__
	sub	#1,A1P			; point to byte
#else
	subs	#1,A1P			; point to byte
#endif
	mov.b	@A1P,A2L		; get byte
	mov.b	A2L,@-A0P		; save byte
	CMPP	A0P,A3P			; at the front again ?
	bne	byteloop

	; return with A0 pointing to dst
quit:	rts

#endif