//===----------------------Hexagon builtin routine ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// An optimized version of a memcpy which is equivalent to the following loop:
//
//   volatile unsigned *dest;
//   unsigned *src;
//
//   for (i = 0; i < num_words; ++i)
//     *dest++ = *src++;
//
// The corresponding C prototype for this function would be
// void hexagon_memcpy_forward_vp4cp4n2(volatile unsigned *dest,
//                                      const unsigned *src,
//                                      unsigned num_words);
//
// *** Both dest and src must be aligned to 32-bit boundaries. ***
// The code does not perform any runtime checks for this, and will fail
// in bad ways if this requirement is not met.
//
// The "forward" in the name refers to the fact that the function copies
// the words going forward in memory.  It is incorrect to use this function
// for cases where the original code copied words in any other order.
//
// *** This function is only for the use by the compiler. ***
// The only intended use is for the LLVM compiler to generate calls to
// this function, when a mem-copy loop, like the one above, is detected.
//
// Structure of the routine (a "page" here is 4096 bytes = 1024 words of src):
//   1. Prolog: word-by-word copy up to the next src page boundary (or until
//      num_words is exhausted, whichever comes first).
//   2. Main:   one outer iteration per whole page, each prefetched with a
//      single l2fetch and copied with 512 doubleword loads.
//   3. Epilog: word-by-word copy of the remaining (< 1024) words.

  .text

// Inputs:
//   r0: dest
//   r1: src
//   r2: num_words
//
// Registers written: r0-r5 and p0, plus the loop0/loop1 hardware-loop state.
// On exit r0 and r1 have been advanced past the copied data.

  .globl  hexagon_memcpy_forward_vp4cp4n2
  .balign 32
  .type   hexagon_memcpy_forward_vp4cp4n2,@function
hexagon_memcpy_forward_vp4cp4n2:

  // Compute r3 to be the number of words remaining in the current page.
  // At the same time, compute r4 to be the number of 32-byte blocks
  // remaining in the page (for prefetch).
  {
    r3 = sub(##4096, r1)
    r5 = lsr(r2, #3)            // r5 = num_words / 8 = 32-byte blocks requested.
  }
  {
    // The byte count before end-of-page is in the 12 lowest bits of r3:
    // bits [11:2] are the word count, bits [11:5] the 32-byte block count.
    // (If the address in r1 was already page-aligned, the bits are 0.)
    // Both extracts operate on the r3 computed by the packet above
    // (instructions in a packet read their sources before the packet's
    // results are written).
    r3 = extractu(r3, #10, #2)
    r4 = extractu(r3, #7, #5)
  }
  {
    // Never copy or prefetch more than was actually requested.
    r3 = minu(r2, r3)
    r4 = minu(r5, r4)
  }
  {
    // NOTE(review): per the l2fetch operand layout this constant reads as
    // stride = 0x20 and width = 0x20 bytes, with the low byte (the block
    // count OR-ed in here) as the height -- confirm against the Hexagon PRM.
    r4 = or(r4, ##2105344)      // 2105344 = 0x202000
    p0 = cmp.eq(r3, #0)
    if (p0.new) jump:nt .Lskipprolog  // Nothing to copy before the page end.
  }
    l2fetch(r1, r4)
  {
    loop0(.Lprolog, r3)
    r2 = sub(r2, r3)            // r2 = number of words left after the prolog.
  }
  .falign
.Lprolog:
  {
    // Copy one word; the store consumes the value loaded in this same packet.
    r4 = memw(r1++#4)
    memw(r0++#4) = r4.new
  } :endloop0
.Lskipprolog:
  {
    // Let r3 = number of whole pages left (page = 1024 words).
    r3 = lsr(r2, #10)
    if (cmp.eq(r3.new, #0)) jump:nt .Lskipmain
  }
  {
    // loop1 takes its trip count from the page count held in r3 on entry to
    // this packet; r3 is then reused for the prefetch descriptor.
    loop1(.Lout, r3)
    r2 = extractu(r2, #10, #0)  // r2 = r2 & 1023
    r3 = ##2105472              // r3 = 0x202080 (prefetch info)
  }
    // Iterate over pages.
    // The main loop is only entered with at least 1024 words left, which
    // means the prolog either did not run (src was already page-aligned) or
    // ran all the way to the page boundary.  Either way r1 is page-aligned
    // here, so the doubleword loads below are naturally aligned.  dest is
    // still written with word stores, so it only needs 32-bit alignment.
  .falign
.Lout:
    // Prefetch each individual page.
    l2fetch(r1, r3)
    loop0(.Lpage, #512)         // 512 iterations x 8 bytes = one 4096-byte page.
  .falign
.Lpage:
    r5:4 = memd(r1++#8)
  {
    // Both stores use the value of r0 from before the post-increment:
    // r4 goes to [r0], r5 to [r0+4], then r0 advances by 8.
    memw(r0++#8) = r4
    memw(r0+#4) = r5
  } :endloop0:endloop1
.Lskipmain:
  {
    r3 = ##2105344              // r3 = 0x202000 (prefetch info)
    r4 = lsr(r2, #3)            // r4 = number of 32-byte blocks remaining.
    p0 = cmp.eq(r2, #0)
    if (p0.new) jumpr:nt r31    // No tail words: return to the caller.
  }
  {
    r3 = or(r3, r4)             // Prefetch info with the block count filled in.
    loop0(.Lepilog, r2)
  }
    l2fetch(r1, r3)
  .falign
.Lepilog:
  {
    // Copy the remaining (fewer than 1024) words one at a time.
    r4 = memw(r1++#4)
    memw(r0++#4) = r4.new
  } :endloop0

    jumpr   r31

.size hexagon_memcpy_forward_vp4cp4n2, . - hexagon_memcpy_forward_vp4cp4n2