#include "setarch.h"

#include "defines.h"

;-----------------------------------------------------------------------
; void *memcpy (void *dst, const void *src, size_t len)
;
; Two mutually exclusive implementations follow, chosen by the
; preprocessor: an H8SX one built on the movmd block-move instruction,
; and a generic one that copies backwards a word or a byte at a time.
; LEN, MOVP, ADDP, CMPP and the A0..A3 register names are not defined
; in this file; presumably they come from the headers included above.
;-----------------------------------------------------------------------

#ifdef __H8300SX__

	.global	_memcpy
_memcpy:
	; In:  er0 = dst, er1 = src, (e)r2 = len.  er0 is never written,
	; so it still holds dst on return.
	; er4-er6 are saved here and restored by the rts/l at each exit.
	stm.l	er4-er6,@-er7

	; movmd takes its source from er5 and its destination from er6.
	mov.l	er0,er6
	mov.l	er1,er5

	; Copies shorter than 16 bytes go straight to movmd.b.  The
	; longword path works for anything over 6 bytes, but its setup
	; checks can cost more than they save on short moves, so the
	; cut-off is deliberately higher.  How much the checks cost
	; depends on the alignment of the pointers and the length: they
	; are cheapest when er0 & 3 == er1 & 3 == er2 & 3, and cheaper
	; again when all three are 0.  16 is a compromise.
	cmp	#16,LEN(r2)
	blo	bytewise

	; movmd.l needs even addresses.  C = bit 0 of src XOR bit 0 of
	; dst; if the two pointers differ in parity they can never both
	; be even, so use the byte copy instead.
	bld	#0,r5l
	bxor	#0,r6l
	bcs	bytewise

	; Same parity: if the pointers are odd, move one byte to make
	; both even.
	bld	#0,r5l
	bcc	both_even
	mov.b	@er5+,@er6+
	sub	#1,LEN(r2)

both_even:
	; If bit 1 of the destination is set, moving one word makes the
	; destination longword aligned.  That only really pays off when
	; it aligns the source too, but it is no worse when it does not,
	; so a "band" test is not worth its overhead.
	bld	#1,r6l
	bcc	copy_longs
	mov.w	@er5+,@er6+
	sub	#2,LEN(r2)

copy_longs:
	; (e)r4 = remaining length / 4 = longwords to move.
	mov	LEN(r2),LEN(r4)
	shlr	#2,LEN(r4)

#ifdef __NORMAL_MODE__
	; Pointers and size_t are 16 bits wide, so one movmd.l covers
	; the whole count.  r4 is never 0 on arrival here.
	movmd.l
	and.w	#3,r2		; 0-3 bytes still to move
bytewise:
	mov.w	r2,r4		; byte count for movmd.b
	beq	done		; nothing to move
	movmd.b
done:
	rts/l	er4-er6
#else
	; When the low 16 bits of the longword count are 0 (count is a
	; multiple of 0x10000) there is no partial first chunk: enter
	; the loop at its counter check.
	mov.w	r4,r4
	beq	long_chunk_next

	; First pass moves r4 (!= 0) longwords; every later pass moves
	; 65536 longwords.  e4 counts the later passes.
long_chunk:
	movmd.l
long_chunk_next:
	sub.w	#1,e4
	bhs	long_chunk

	; Move the final 0-3 bytes.  Falling into the bytewise code
	; after the "and" would also work, but this dedicated tail is
	; quicker and costs only 10 more bytes.
	and.w	#3,r2
	beq	done
	mov.w	r2,r4
	movmd.b
done:
	rts/l	er4-er6

bytewise:
	; Plain byte copy of the whole length in er2; every length,
	; zero included, must work.  Same chunking as above: r2 bytes
	; first (skipped when r2 == 0), then 65536 bytes for each
	; count held in e2.
	mov.w	r2,r4
	beq	byte_chunk_next
byte_chunk:
	movmd.b
byte_chunk_next:
	sub.w	#1,e2
	bhs	byte_chunk
	rts/l	er4-er6
#endif

#else

	.global	_memcpy
_memcpy:
	; Argument registers (the loads are kept for reference only):
;	MOVP	@(2/4,r7),A0P	; dst
;	MOVP	@(4/8,r7),A1P	; src
;	MOVP	@(6/12,r7),A2P	; len

	; The copy runs from the end of both buffers back to the start,
	; so A0P finishes equal to dst, which is the return value.
	MOVP	A0P,A3P		; A3P = dst, the stop address
	ADDP	A2P,A0P		; A0P = dst + len
	CMPP	A0P,A3P		; anything to copy?
	beq	done

	ADDP	A2P,A1P		; A1P = src + len

	; Word copies are possible only if everything is even.  Merge
	; the low bytes of both ends of dst and the end of src into the
	; low byte of len and test bit 0 of the result.
	or	A0L,A2L		; dst + len
	or	A3L,A2L		; dst
	or	A1L,A2L		; src + len
	btst	#0,A2L		; any odd value?
	bne	copy_byte

copy_word:
#ifdef __NORMAL_MODE__
	sub	#2,A1P
#else
	subs	#2,A1P		; step src back one word
#endif
	mov.w	@A1P,A2		; fetch word
	mov.w	A2,@-A0P	; store it with pre-decrement of dst
	CMPP	A0P,A3P		; reached dst yet?
	bne	copy_word
	rts

copy_byte:
#ifdef __NORMAL_MODE__
	sub	#1,A1P
#else
	subs	#1,A1P		; step src back one byte
#endif
	mov.b	@A1P,A2L	; fetch byte
	mov.b	A2L,@-A0P	; store it with pre-decrement of dst
	CMPP	A0P,A3P		; reached dst yet?
	bne	copy_byte

	; A0 is back at dst here, which is what gets returned.
done:
	rts

#endif