/*
 * Copyright (c) 2012
 *      MIPS Technologies, Inc., California.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "pixman-mips-dspr2-asm.h"

/*
 * This routine could be optimized for MIPS64. The current code only
 * uses MIPS32 instructions.
 */

/*
 * LWHI/LWLO (and SWHI/SWLO) name the two partial-word instructions by the
 * part of the word they handle, so the code below is endian-neutral:
 * "LWHI r, 0(p)" followed by "LWLO r, 3(p)" loads one unaligned word from p.
 */
#ifdef EB
#  define LWHI    lwl     /* high part is left in big-endian */
#  define SWHI    swl     /* high part is left in big-endian */
#  define LWLO    lwr     /* low part is right in big-endian */
#  define SWLO    swr     /* low part is right in big-endian */
#else
#  define LWHI    lwr     /* high part is right in little-endian */
#  define SWHI    swr     /* high part is right in little-endian */
#  define LWLO    lwl     /* low part is left in little-endian */
#  define SWLO    swl     /* low part is left in little-endian */
#endif

/*
 * pixman_mips_fast_memcpy
 *
 * In:   a0 = dst, a1 = src, a2 = byte count
 * Out:  v0 = original dst
 * Writes: AT, v0, v1, a0-a3, t0-t9.  The stack is not touched.
 *
 * Structure:
 *   - counts below 8 (signed compare) go straight to a byte loop;
 *   - if src and dst share the same offset within a word, dst is brought to
 *     a word boundary and data moves with lw/sw in 64-byte, 32-byte and
 *     4-byte steps, then a byte tail;
 *   - otherwise ($unaligned) dst is still brought to a word boundary, and
 *     every source word is assembled with an LWHI/LWLO pair before an
 *     aligned sw.
 *
 * "pref 0" is issued on the source and "pref 30" on the destination;
 * v1 holds the flag that disables "pref 30" near the end of the buffer.
 *
 * Branch delay slots are filled by hand; each delay-slot instruction is
 * indented by one extra space.
 * NOTE(review): this presumably relies on LEAF_MIPS32R2 selecting
 * ".set noreorder" (and ".set noat", since AT is written explicitly) --
 * confirm in pixman-mips-dspr2-asm.h.
 */
LEAF_MIPS32R2(pixman_mips_fast_memcpy)

    slti    AT, a2, 8               /* AT = 1 when fewer than 8 bytes */
    bne     AT, zero, $last8
     move   v0, a0                  /* memcpy returns the dst pointer */

/* Test if the src and dst are word-aligned, or can be made word-aligned */
    xor     t8, a1, a0
    andi    t8, t8, 0x3             /* t8 is a0/a1 word-displacement */

    bne     t8, zero, $unaligned
     negu   a3, a0                  /* both paths start from a3 = -dst */

    andi    a3, a3, 0x3             /* we need to copy a3 bytes to make a0/a1 aligned */
    beq     a3, zero, $chk16w       /* when a3=0 then the dst (a0) is word-aligned */
     subu   a2, a2, a3              /* now a2 is the remaining bytes count */

    LWHI    t8, 0(a1)               /* copy the a3 leading bytes in one go */
    addu    a1, a1, a3
    SWHI    t8, 0(a0)
    addu    a0, a0, a3

/* Now the dst/src are mutually word-aligned with word-aligned addresses */
$chk16w:
    andi    t8, a2, 0x3f            /* any whole 64-byte chunks? */
                                    /* t8 is the byte count after 64-byte chunks */

    beq     a2, t8, $chk8w          /* if a2==t8, no 64-byte chunks */
                                    /* There will be at most 1 32-byte chunk after it */
     subu   a3, a2, t8              /* subtract from a2 the remainder */
                                    /* Here a3 counts bytes in 16w chunks */
    addu    a3, a0, a3              /* Now a3 is the final dst after 64-byte chunks */

    addu    t0, a0, a2              /* t0 is the "past the end" address */
                                    /* (only used to derive t9; the loop reuses t0) */

/*
 * When in the loop we exercise "pref 30, x(a0)", the a0+x should not be past
 * the "t0-32" address
 * This means: for x=128 the last "safe" a0 address is "t0-160"
 * Alternatively, for x=64 the last "safe" a0 address is "t0-96"
 * In the current version we use "pref 30, 128(a0)", so "t0-160" is the limit
 */
    subu    t9, t0, 160             /* t9 is the "last safe pref 30, 128(a0)" address */

    pref    0, 0(a1)                /* bring the first line of src, addr 0 */
    pref    0, 32(a1)               /* bring the second line of src, addr 32 */
    pref    0, 64(a1)               /* bring the third line of src, addr 64 */
    pref    30, 32(a0)              /* safe, as we have at least 64 bytes ahead */
/* In case the a0 > t9 don't use "pref 30" at all */
    sgtu    v1, a0, t9              /* v1 = 1 when a0 is past the safe limit */
    bgtz    v1, $loop16w            /* skip "pref 30, 64(a0)" for too short arrays */
     nop
/* otherwise, start with using pref30 */
    pref    30, 64(a0)
$loop16w:
    pref    0, 96(a1)
    lw      t0, 0(a1)
    bgtz    v1, $skip_pref30_96     /* skip "pref 30, 96(a0)" */
     lw     t1, 4(a1)
    pref    30, 96(a0)              /* continue setting up the dest, addr 96 */
$skip_pref30_96:
    lw      t2, 8(a1)
    lw      t3, 12(a1)
    lw      t4, 16(a1)
    lw      t5, 20(a1)
    lw      t6, 24(a1)
    lw      t7, 28(a1)
    pref    0, 128(a1)              /* bring the next lines of src, addr 128 */

    sw      t0, 0(a0)
    sw      t1, 4(a0)
    sw      t2, 8(a0)
    sw      t3, 12(a0)
    sw      t4, 16(a0)
    sw      t5, 20(a0)
    sw      t6, 24(a0)
    sw      t7, 28(a0)

    lw      t0, 32(a1)
    bgtz    v1, $skip_pref30_128    /* skip "pref 30, 128(a0)" */
     lw     t1, 36(a1)
    pref    30, 128(a0)             /* continue setting up the dest, addr 128 */
$skip_pref30_128:
    lw      t2, 40(a1)
    lw      t3, 44(a1)
    lw      t4, 48(a1)
    lw      t5, 52(a1)
    lw      t6, 56(a1)
    lw      t7, 60(a1)
    pref    0, 160(a1)              /* bring the next lines of src, addr 160 */

    sw      t0, 32(a0)
    sw      t1, 36(a0)
    sw      t2, 40(a0)
    sw      t3, 44(a0)
    sw      t4, 48(a0)
    sw      t5, 52(a0)
    sw      t6, 56(a0)
    sw      t7, 60(a0)

    addiu   a0, a0, 64              /* adding 64 to dest */
    sgtu    v1, a0, t9              /* refresh the "no more pref 30" flag */
    bne     a0, a3, $loop16w
     addiu  a1, a1, 64              /* adding 64 to src */
    move    a2, t8                  /* a2 = bytes left after the 64-byte chunks */

/* Here we have src and dest word-aligned but less than 64-bytes to go */

$chk8w:
    pref    0, 0x0(a1)
    andi    t8, a2, 0x1f            /* is there a 32-byte chunk? */
                                    /* the t8 is the remainder count past 32-bytes */
    beq     a2, t8, $chk1w          /* when a2=t8, no 32-byte chunk */
     nop

    lw      t0, 0(a1)
    lw      t1, 4(a1)
    lw      t2, 8(a1)
    lw      t3, 12(a1)
    lw      t4, 16(a1)
    lw      t5, 20(a1)
    lw      t6, 24(a1)
    lw      t7, 28(a1)
    addiu   a1, a1, 32

    sw      t0, 0(a0)
    sw      t1, 4(a0)
    sw      t2, 8(a0)
    sw      t3, 12(a0)
    sw      t4, 16(a0)
    sw      t5, 20(a0)
    sw      t6, 24(a0)
    sw      t7, 28(a0)
    addiu   a0, a0, 32

$chk1w:
    andi    a2, t8, 0x3             /* now a2 is the remainder past 1w chunks */
    beq     a2, t8, $last8          /* no whole words left */
     subu   a3, t8, a2              /* a3 is count of bytes in 1w chunks */
    addu    a3, a0, a3              /* now a3 is the dst address past the 1w chunks */

/* copying in words (4-byte chunks) */
$wordCopy_loop:
    lw      t3, 0(a1)               /* the first t3 may be equal t0 ... optimize? */
    addiu   a1, a1, 4
    addiu   a0, a0, 4
    bne     a0, a3, $wordCopy_loop
     sw     t3, -4(a0)              /* a0 was already advanced, hence -4 */

/* For the last (<8) bytes */
$last8:
    blez    a2, $leave              /* nothing left to copy */
     addu   a3, a0, a2              /* a3 is the dst address just past the last byte */
$last8loop:
    lb      v1, 0(a1)
    addiu   a1, a1, 1
    addiu   a0, a0, 1
    bne     a0, a3, $last8loop
     sb     v1, -1(a0)              /* a0 was already advanced, hence -1 */

/* Common exit; "$"-prefixed like every other label in this routine */
$leave:
    j       ra
     nop

/*
 * UNALIGNED case
 */

$unaligned:
    /* got here with a3="negu a0" */
    andi    a3, a3, 0x3             /* test if the a0 is word aligned */
    beqz    a3, $ua_chk16w
     subu   a2, a2, a3              /* bytes left after initial a3 bytes */

    LWHI    v1, 0(a1)
    LWLO    v1, 3(a1)               /* v1 = unaligned word at src */
    addu    a1, a1, a3              /* a3 may be here 1, 2 or 3 */
    SWHI    v1, 0(a0)               /* store only the a3 leading bytes */
    addu    a0, a0, a3              /* below the dst will be word aligned (NOTE1) */

$ua_chk16w:
    andi    t8, a2, 0x3f            /* any whole 64-byte chunks? */
                                    /* t8 is the byte count after 64-byte chunks */
    beq     a2, t8, $ua_chk8w       /* if a2==t8, no 64-byte chunks */
                                    /* There will be at most 1 32-byte chunk after it */
     subu   a3, a2, t8              /* subtract from a2 the remainder */
                                    /* Here a3 counts bytes in 16w chunks */
    addu    a3, a0, a3              /* Now a3 is the final dst after 64-byte chunks */

    addu    t0, a0, a2              /* t0 is the "past the end" address */
                                    /* (only used to derive t9; the loop reuses t0) */

    subu    t9, t0, 160             /* t9 is the "last safe pref 30, 128(a0)" address */

    pref    0, 0(a1)                /* bring the first line of src, addr 0 */
    pref    0, 32(a1)               /* bring the second line of src, addr 32 */
    pref    0, 64(a1)               /* bring the third line of src, addr 64 */
    pref    30, 32(a0)              /* safe, as we have at least 64 bytes ahead */
/* In case the a0 > t9 don't use "pref 30" at all */
    sgtu    v1, a0, t9              /* v1 = 1 when a0 is past the safe limit */
    bgtz    v1, $ua_loop16w         /* skip "pref 30, 64(a0)" for too short arrays */
     nop
/* otherwise, start with using pref30 */
    pref    30, 64(a0)
$ua_loop16w:
    pref    0, 96(a1)
    LWHI    t0, 0(a1)
    LWLO    t0, 3(a1)
    LWHI    t1, 4(a1)
    bgtz    v1, $ua_skip_pref30_96  /* skip "pref 30, 96(a0)" */
     LWLO   t1, 7(a1)
    pref    30, 96(a0)              /* continue setting up the dest, addr 96 */
$ua_skip_pref30_96:
    LWHI    t2, 8(a1)
    LWLO    t2, 11(a1)
    LWHI    t3, 12(a1)
    LWLO    t3, 15(a1)
    LWHI    t4, 16(a1)
    LWLO    t4, 19(a1)
    LWHI    t5, 20(a1)
    LWLO    t5, 23(a1)
    LWHI    t6, 24(a1)
    LWLO    t6, 27(a1)
    LWHI    t7, 28(a1)
    LWLO    t7, 31(a1)
    pref    0, 128(a1)              /* bring the next lines of src, addr 128 */

    sw      t0, 0(a0)
    sw      t1, 4(a0)
    sw      t2, 8(a0)
    sw      t3, 12(a0)
    sw      t4, 16(a0)
    sw      t5, 20(a0)
    sw      t6, 24(a0)
    sw      t7, 28(a0)

    LWHI    t0, 32(a1)
    LWLO    t0, 35(a1)
    LWHI    t1, 36(a1)
    bgtz    v1, $ua_skip_pref30_128 /* skip "pref 30, 128(a0)" */
     LWLO   t1, 39(a1)
    pref    30, 128(a0)             /* continue setting up the dest, addr 128 */
$ua_skip_pref30_128:
    LWHI    t2, 40(a1)
    LWLO    t2, 43(a1)
    LWHI    t3, 44(a1)
    LWLO    t3, 47(a1)
    LWHI    t4, 48(a1)
    LWLO    t4, 51(a1)
    LWHI    t5, 52(a1)
    LWLO    t5, 55(a1)
    LWHI    t6, 56(a1)
    LWLO    t6, 59(a1)
    LWHI    t7, 60(a1)
    LWLO    t7, 63(a1)
    pref    0, 160(a1)              /* bring the next lines of src, addr 160 */

    sw      t0, 32(a0)
    sw      t1, 36(a0)
    sw      t2, 40(a0)
    sw      t3, 44(a0)
    sw      t4, 48(a0)
    sw      t5, 52(a0)
    sw      t6, 56(a0)
    sw      t7, 60(a0)

    addiu   a0, a0, 64              /* adding 64 to dest */
    sgtu    v1, a0, t9              /* refresh the "no more pref 30" flag */
    bne     a0, a3, $ua_loop16w
     addiu  a1, a1, 64              /* adding 64 to src */
    move    a2, t8                  /* a2 = bytes left after the 64-byte chunks */

/*
 * Here the dest is word-aligned (the src is not) and there are less than
 * 64 bytes to go
 */

$ua_chk8w:
    pref    0, 0x0(a1)
    andi    t8, a2, 0x1f            /* is there a 32-byte chunk? */
                                    /* the t8 is the remainder count */
    beq     a2, t8, $ua_chk1w       /* when a2=t8, no 32-byte chunk */
     LWHI   t0, 0(a1)               /* delay slot: also runs when the branch is
                                       taken; it only writes t0, which the
                                       taken path never reads */
    LWLO    t0, 3(a1)
    LWHI    t1, 4(a1)
    LWLO    t1, 7(a1)
    LWHI    t2, 8(a1)
    LWLO    t2, 11(a1)
    LWHI    t3, 12(a1)
    LWLO    t3, 15(a1)
    LWHI    t4, 16(a1)
    LWLO    t4, 19(a1)
    LWHI    t5, 20(a1)
    LWLO    t5, 23(a1)
    LWHI    t6, 24(a1)
    LWLO    t6, 27(a1)
    LWHI    t7, 28(a1)
    LWLO    t7, 31(a1)
    addiu   a1, a1, 32

    sw      t0, 0(a0)
    sw      t1, 4(a0)
    sw      t2, 8(a0)
    sw      t3, 12(a0)
    sw      t4, 16(a0)
    sw      t5, 20(a0)
    sw      t6, 24(a0)
    sw      t7, 28(a0)
    addiu   a0, a0, 32

$ua_chk1w:
    andi    a2, t8, 0x3             /* now a2 is the remainder past 1w chunks */
    beq     a2, t8, $ua_smallCopy   /* no whole words left */
     subu   a3, t8, a2              /* a3 is count of bytes in 1w chunks */
    addu    a3, a0, a3              /* now a3 is the dst address past the 1w chunks */

/* copying in words (4-byte chunks) */
$ua_wordCopy_loop:
    LWHI    v1, 0(a1)
    LWLO    v1, 3(a1)
    addiu   a1, a1, 4
    addiu   a0, a0, 4               /* note: dst=a0 is word aligned here, see NOTE1 */
    bne     a0, a3, $ua_wordCopy_loop
     sw     v1, -4(a0)              /* a0 was already advanced, hence -4 */

/* Now less than 4 bytes (value in a2) left to copy */
$ua_smallCopy:
    beqz    a2, $leave              /* nothing left to copy */
     addu   a3, a0, a2              /* a3 is the dst address just past the last byte */
$ua_smallCopy_loop:
    lb      v1, 0(a1)
    addiu   a1, a1, 1
    addiu   a0, a0, 1
    bne     a0, a3, $ua_smallCopy_loop
     sb     v1, -1(a0)              /* a0 was already advanced, hence -1 */

    j       ra
     nop

END(pixman_mips_fast_memcpy)