/*	$NetBSD: memset.S,v 1.1 2014/08/10 05:47:35 matt Exp $	*/

/*-
 * Copyright (c) 2014 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Matt Thomas of 3am Software Foundry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

ENTRY(memset)
	cbz	x2, .Lret
	mov	x15, x0			/* working data pointer */
	cbz	x1, .Lzerofill
	/*
	 * Non-zero fill: replicate the fill byte to all 64 bits of x1.
	 */
	and	x1, x1, #0xff
	orr	x1, x1, x1, lsl #8
	orr	x1, x1, x1, lsl #16
	orr	x1, x1, x1, lsl #32
.Lfilled:
	cmp	x2, #15			/* if it's small, ignore alignment */
	b.ls	.Llast_subqword

	mov	x6, x1
	tst	x15, #15
	b.eq	.Lqword_loop

/*
 * We have at least 16 bytes to write, which means we can reach qword
 * alignment without having to check the amount left.
 */
	tbz	x15, #0, .Lhword_aligned
	strb	w1, [x15], #1
.Lhword_aligned:
	tbz	x15, #1, .Lword_aligned
	strh	w1, [x15], #2
.Lword_aligned:
	tbz	x15, #2, .Ldword_aligned
	str	w1, [x15], #4
.Ldword_aligned:
	tbz	x15, #3, .Lqword_aligned
	str	x1, [x15], #8
/*
 * Now we are qword aligned.  Figure out how many bytes we wrote to get
 * here and subtract them from the length.  If the result is 0, we're done.
 */
.Lqword_aligned:
	sub	x5, x15, x0
	subs	x2, x2, x5
	b.eq	.Lret

/*
 * Write 16 bytes at a time.  If we don't have 16 bytes left, handle the
 * tail below.  Keep looping while there's data to set.
 */
.Lqword_loop:
	subs	x2, x2, #16
	b.mi	.Llast_subqword
	stp	x1, x6, [x15], #16
	b.ne	.Lqword_loop
	ret

/*
 * We have less than a qword to write.  We hope we are aligned, but since
 * unaligned access works, we don't have to be.
 */
.Llast_subqword:
	tbz	x2, #3, .Llast_subdword
	str	x1, [x15], #8
.Llast_subdword:
	tbz	x2, #2, .Llast_subword
	str	w1, [x15], #4
.Llast_subword:
	tbz	x2, #1, .Llast_subhword
	strh	w1, [x15], #2
.Llast_subhword:
	tbz	x2, #0, .Lret
	strb	w1, [x15]
.Lret:	ret
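
/*
 * Worked example of the paths above (illustrative): for
 * memset(p, 0x5a, 40) with p 16-byte aligned, the orrs leave
 * x1 = 0x5a5a5a5a5a5a5a5a, two trips through .Lqword_loop store 32
 * bytes, and the third subs leaves x2 = -8.  Since .Llast_subqword
 * tests only bits 0-3 of x2 and -8 & 0xf == 8, a single str stores
 * the final 8 bytes; this is why the tail code is safe even after
 * x2 has gone negative.
 */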

/*
 * If we are filling with zeros, let's see if we can use the
 *	dc zva, <Xt>
 * instruction to speed things up.
 */
.Lzerofill:
	mrs	x9, dczid_el0
	/*
	 * Make sure the instruction isn't prohibited.
	 */
	tbnz	x9, #4, .Lfilled
	/*
	 * Now find out the block size.
	 */
	ubfx	x9, x9, #0, #4		/* extract low 4 bits */
	add	x9, x9, #2		/* add log2(word) */
	mov	x10, #1			/* the value is log2(words) */
	lsl	x10, x10, x9		/* shift to get the block size */
	cmp	x2, x10			/* are we even filling one block? */
	b.lt	.Lfilled		/* no, do it 16 bytes at a time */
	/*
	 * Now figure out how many aligned blocks we have.
	 */
	sub	x11, x10, #1		/* make block size a mask */
	add	x12, x15, x11		/* round start up to a block boundary */
	asr	x12, x12, x9		/* "starting" block number */
	add	x13, x15, x2		/* get ending address */
	asr	x13, x13, x9		/* "ending" block number */
	cmp	x13, x12		/* how many blocks? */
	b.eq	.Lfilled		/* none, do it 16 bytes at a time */

	/*
	 * Now we have one or more blocks to deal with.  First we need
	 * to get block aligned.
	 */
	and	x7, x15, x11		/* are we already block aligned? */
	cbz	x7, .Lblock_aligned

	sub	x7, x10, x7		/* subtract offset from block length */
	sub	x2, x2, x7		/* subtract that from length */
	lsr	x7, x7, #4		/* whole qwords left once the
					 * sub-qword stores below run */

	tbz	x15, #0, .Lzero_hword_aligned
	strb	wzr, [x15], #1
.Lzero_hword_aligned:
	tbz	x15, #1, .Lzero_word_aligned
	strh	wzr, [x15], #2
.Lzero_word_aligned:
	tbz	x15, #2, .Lzero_dword_aligned
	str	wzr, [x15], #4
.Lzero_dword_aligned:
	tbz	x15, #3, .Lzero_qword_aligned
	str	xzr, [x15], #8
.Lzero_qword_aligned:
	cbz	x7, .Lblock_aligned	/* no qwords? just branch */
	adr	x6, .Lblock_aligned
	sub	x6, x6, x7, lsl #2	/* back up one 4-byte stp per qword */
	br	x6			/* and do it */
	/*
	 * This is valid for cache lines <= 256 bytes.
	 */
	stp	xzr, xzr, [x15], #16
	stp	xzr, xzr, [x15], #16
	stp	xzr, xzr, [x15], #16
	stp	xzr, xzr, [x15], #16
	stp	xzr, xzr, [x15], #16
	stp	xzr, xzr, [x15], #16
	stp	xzr, xzr, [x15], #16
	stp	xzr, xzr, [x15], #16
	stp	xzr, xzr, [x15], #16
	stp	xzr, xzr, [x15], #16
	stp	xzr, xzr, [x15], #16
	stp	xzr, xzr, [x15], #16
	stp	xzr, xzr, [x15], #16
	stp	xzr, xzr, [x15], #16
	stp	xzr, xzr, [x15], #16

/*
 * Now we are block aligned.
 */
.Lblock_aligned:
	subs	x2, x2, x10
	b.mi	.Lblock_done
	dc	zva, x15
	add	x15, x15, x10
	b.ne	.Lblock_aligned
	ret

.Lblock_done:
	and	x2, x2, x11		/* make positive again */
	mov	x6, xzr			/* fill 2nd xword */
	b	.Lqword_loop		/* and finish filling */

END(memset)
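
/*
 * Worked example of the dczid_el0 decode in the zero-fill path
 * (illustrative): bits [3:0] hold log2 of the dc zva block size in
 * 4-byte words, and bit 4 (DZP) prohibits the instruction when set.
 * A CPU reporting 4 in bits [3:0] gives x9 = 4 + 2 = 6 and
 * x10 = 1 << 6 = 64, so each dc zva zeroes one 64-byte block.
 */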