/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define M	ARG1	/* rdi */
#define X	ARG2	/* rsi */
#define INCX	ARG3	/* rdx */

#define I	%rax

#include "l1param.h"
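
/*
 * Sum of absolute values of a double-precision vector (dasum-style
 * kernel), SSE2.  pcmpeqb/psrlq below build the constant
 * 0x7fffffffffffffff in each 64-bit lane of %xmm15, so andps with
 * that mask clears the sign bit, i.e. computes |x| without a branch.
 * Four independent partial sums are kept in %xmm0-%xmm3 to hide
 * addsd latency; they are folded into %xmm0, the return register,
 * at .L998.
 */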

	PROLOGUE
	PROFCODE

	SAVEREGISTERS

	xorps	%xmm0, %xmm0	/* result is 0 for the early exits below */
	testq	M, M
	jle	.L999
	testq	INCX, INCX
	jle	.L999

	xorps	%xmm1, %xmm1
	xorps	%xmm2, %xmm2
	xorps	%xmm3, %xmm3

	pcmpeqb	%xmm15, %xmm15	/* all ones */
	psrlq	$1, %xmm15	/* 0x7fff... per qword: sign-clear mask */

	salq	$BASE_SHIFT, INCX	/* element stride -> byte stride */
	xorps	%xmm13, %xmm13	/* carried across loop iterations */

	cmpq	$SIZE, INCX
	jne	.L20		/* non-unit stride takes the scalar path */

	testq	$SIZE, X	/* peel one element if X is not */
	je	.L05		/* 16-byte aligned */

	movsd	(X), %xmm0
	addq	$SIZE, X
	andps	%xmm15, %xmm0
	decq	M
	jle	.L999
	ALIGN_3

.L05:
	subq	$-16 * SIZE, X

	movq	M, I
	sarq	$4, I		/* I = M / 16: main loop does 16 per pass */
	jle	.L12

	movaps	-16 * SIZE(X), %xmm4
	movaps	-14 * SIZE(X), %xmm5
	movaps	-12 * SIZE(X), %xmm6
	movaps	-10 * SIZE(X), %xmm7

	movaps	 -8 * SIZE(X), %xmm8
	movaps	 -6 * SIZE(X), %xmm9
	movaps	 -4 * SIZE(X), %xmm10
	movaps	 -2 * SIZE(X), %xmm11

	decq	I
	jle	.L11
	ALIGN_4

.L10:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	andps	%xmm15, %xmm4
	addsd	%xmm13, %xmm3
	pshufd	$0x4e, %xmm4, %xmm12	/* swap qwords: high |x| -> low */
	addsd	%xmm4, %xmm0
	movaps	 0 * SIZE(X), %xmm4

	andps	%xmm15, %xmm5
	addsd	%xmm12, %xmm1
	pshufd	$0x4e, %xmm5, %xmm13
	addsd	%xmm5, %xmm2
	movaps	 2 * SIZE(X), %xmm5

	andps	%xmm15, %xmm6
	addsd	%xmm13, %xmm3
	pshufd	$0x4e, %xmm6, %xmm12
	addsd	%xmm6, %xmm0
	movaps	 4 * SIZE(X), %xmm6

	andps	%xmm15, %xmm7
	addsd	%xmm12, %xmm1
	pshufd	$0x4e, %xmm7, %xmm13
	addsd	%xmm7, %xmm2
	movaps	 6 * SIZE(X), %xmm7

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	andps	%xmm15, %xmm8
	addsd	%xmm13, %xmm3
	pshufd	$0x4e, %xmm8, %xmm12
	addsd	%xmm8, %xmm0
	movaps	 8 * SIZE(X), %xmm8

	andps	%xmm15, %xmm9
	addsd	%xmm12, %xmm1
	pshufd	$0x4e, %xmm9, %xmm13
	addsd	%xmm9, %xmm2
	movaps	10 * SIZE(X), %xmm9

	andps	%xmm15, %xmm10
	addsd	%xmm13, %xmm3
	pshufd	$0x4e, %xmm10, %xmm12
	addsd	%xmm10, %xmm0
	movaps	12 * SIZE(X), %xmm10

	andps	%xmm15, %xmm11
	addsd	%xmm12, %xmm1
	pshufd	$0x4e, %xmm11, %xmm13
	addsd	%xmm11, %xmm2
	movaps	14 * SIZE(X), %xmm11

	subq	$-16 * SIZE, X
	decq	I
	jg	.L10
	ALIGN_4

.L11:
	andps	%xmm15, %xmm4
	addsd	%xmm13, %xmm3
	pshufd	$0x4e, %xmm4, %xmm12
	addsd	%xmm4, %xmm0

	andps	%xmm15, %xmm5
	addsd	%xmm12, %xmm1
	pshufd	$0x4e, %xmm5, %xmm13
	addsd	%xmm5, %xmm2

	andps	%xmm15, %xmm6
	addsd	%xmm13, %xmm3
	pshufd	$0x4e, %xmm6, %xmm12
	addsd	%xmm6, %xmm0

	andps	%xmm15, %xmm7
	addsd	%xmm12, %xmm1
	pshufd	$0x4e, %xmm7, %xmm13
	addsd	%xmm7, %xmm2

	andps	%xmm15, %xmm8
	addsd	%xmm13, %xmm3
	pshufd	$0x4e, %xmm8, %xmm12
	addsd	%xmm8, %xmm0

	andps	%xmm15, %xmm9
	addsd	%xmm12, %xmm1
	pshufd	$0x4e, %xmm9, %xmm13
	addsd	%xmm9, %xmm2

	andps	%xmm15, %xmm10
	addsd	%xmm13, %xmm3
	pshufd	$0x4e, %xmm10, %xmm12
	addsd	%xmm10, %xmm0

	andps	%xmm15, %xmm11
	addsd	%xmm12, %xmm1
	pshufd	$0x4e, %xmm11, %xmm13
	addsd	%xmm11, %xmm2

	addsd	%xmm13, %xmm3
	subq	$-16 * SIZE, X
	ALIGN_3
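
/*
 * Tail of the unit-stride path: .L12-.L15 consume the remaining
 * M % 16 elements in blocks of 8, 4, 2 and finally 1.
 */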
.L12:
	andq	$15, M
	jle	.L998

	testq	$8, M
	je	.L13

	movaps	-16 * SIZE(X), %xmm4
	movaps	-14 * SIZE(X), %xmm5
	movaps	-12 * SIZE(X), %xmm6
	movaps	-10 * SIZE(X), %xmm7
	addq	$8 * SIZE, X

	andps	%xmm15, %xmm4
	pshufd	$0x4e, %xmm4, %xmm12
	addsd	%xmm4, %xmm0
	andps	%xmm15, %xmm5
	addsd	%xmm12, %xmm1
	pshufd	$0x4e, %xmm5, %xmm13
	addsd	%xmm5, %xmm2
	addsd	%xmm13, %xmm3
	andps	%xmm15, %xmm6
	pshufd	$0x4e, %xmm6, %xmm12
	addsd	%xmm6, %xmm0
	andps	%xmm15, %xmm7
	addsd	%xmm12, %xmm1
	pshufd	$0x4e, %xmm7, %xmm13
	addsd	%xmm7, %xmm2
	addsd	%xmm13, %xmm3
	ALIGN_3

.L13:
	testq	$4, M
	je	.L14

	movaps	-16 * SIZE(X), %xmm4
	movaps	-14 * SIZE(X), %xmm5
	addq	$4 * SIZE, X

	andps	%xmm15, %xmm4
	pshufd	$0x4e, %xmm4, %xmm12
	addsd	%xmm4, %xmm0
	andps	%xmm15, %xmm5
	addsd	%xmm12, %xmm1
	pshufd	$0x4e, %xmm5, %xmm13
	addsd	%xmm5, %xmm2
	addsd	%xmm13, %xmm3
	ALIGN_3

.L14:
	testq	$2, M
	je	.L15

	movaps	-16 * SIZE(X), %xmm4
	addq	$2 * SIZE, X
	andps	%xmm15, %xmm4

	pshufd	$0x4e, %xmm4, %xmm5
	addsd	%xmm4, %xmm2
	addsd	%xmm5, %xmm3
	ALIGN_3

.L15:
	testq	$1, M
	je	.L998

	movsd	-16 * SIZE(X), %xmm4
	andps	%xmm15, %xmm4
	addsd	%xmm4, %xmm0
	jmp	.L998
	ALIGN_3

/* Non-unit stride: scalar loads, unrolled by 8. */
.L20:
	movq	M, I
	sarq	$3, I		/* I = M / 8 */
	jle	.L25

	movsd	(X), %xmm4
	addq	INCX, X
	movsd	(X), %xmm5
	addq	INCX, X
	movsd	(X), %xmm6
	addq	INCX, X
	movsd	(X), %xmm7
	addq	INCX, X

	movsd	(X), %xmm8
	addq	INCX, X
	movsd	(X), %xmm9
	addq	INCX, X
	movsd	(X), %xmm10
	addq	INCX, X
	movsd	(X), %xmm11

	decq	I
	jle	.L23
	ALIGN_4

.L22:
	andps	%xmm15, %xmm4
	addq	INCX, X
	addsd	%xmm4, %xmm0
	movsd	(X), %xmm4
	andps	%xmm15, %xmm5
	addq	INCX, X
	addsd	%xmm5, %xmm1
	movsd	(X), %xmm5
	andps	%xmm15, %xmm6
	addq	INCX, X
	addsd	%xmm6, %xmm2
	movsd	(X), %xmm6
	andps	%xmm15, %xmm7
	addq	INCX, X
	addsd	%xmm7, %xmm3
	movsd	(X), %xmm7

	andps	%xmm15, %xmm8
	addq	INCX, X
	addsd	%xmm8, %xmm0
	movsd	(X), %xmm8
	andps	%xmm15, %xmm9
	addq	INCX, X
	addsd	%xmm9, %xmm1
	movsd	(X), %xmm9
	andps	%xmm15, %xmm10
	addq	INCX, X
	addsd	%xmm10, %xmm2
	movsd	(X), %xmm10
	andps	%xmm15, %xmm11
	addq	INCX, X
	addsd	%xmm11, %xmm3
	movsd	(X), %xmm11

	decq	I
	jg	.L22
	ALIGN_4

.L23:
	andps	%xmm15, %xmm4
	addq	INCX, X
	addsd	%xmm4, %xmm0
	andps	%xmm15, %xmm5
	addsd	%xmm5, %xmm1
	andps	%xmm15, %xmm6
	addsd	%xmm6, %xmm2
	andps	%xmm15, %xmm7
	addsd	%xmm7, %xmm3

	andps	%xmm15, %xmm8
	addsd	%xmm8, %xmm0
	andps	%xmm15, %xmm9
	addsd	%xmm9, %xmm1
	andps	%xmm15, %xmm10
	addsd	%xmm10, %xmm2
	andps	%xmm15, %xmm11
	addsd	%xmm11, %xmm3
	ALIGN_3

/* Strided tail: the remaining M % 8 elements in blocks of 4, 2, 1. */
.L25:
	andq	$7, M
	jle	.L998

	testq	$4, M
	je	.L26

	movsd	(X), %xmm4
	addq	INCX, X
	movsd	(X), %xmm5
	addq	INCX, X
	movsd	(X), %xmm6
	andps	%xmm15, %xmm4
	addsd	%xmm4, %xmm0
	addq	INCX, X
	movsd	(X), %xmm7
	andps	%xmm15, %xmm5
	addsd	%xmm5, %xmm1
	addq	INCX, X

	andps	%xmm15, %xmm6
	addsd	%xmm6, %xmm2
	andps	%xmm15, %xmm7
	addsd	%xmm7, %xmm3
	ALIGN_3

.L26:
	testq	$2, M
	je	.L27

	movsd	(X), %xmm4
	addq	INCX, X
	movsd	(X), %xmm5
	addq	INCX, X

	andps	%xmm15, %xmm4
	andps	%xmm15, %xmm5

	addsd	%xmm4, %xmm0
	addsd	%xmm5, %xmm1
	ALIGN_3

.L27:
	testq	$1, M
	je	.L998

	movsd	(X), %xmm4
	andps	%xmm15, %xmm4
	addsd	%xmm4, %xmm0
	ALIGN_3
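
/* Fold the four partial sums; the result is returned in %xmm0. */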
.L998:
	addsd	%xmm1, %xmm0
	addsd	%xmm3, %xmm2
	addsd	%xmm2, %xmm0
	ALIGN_4

.L999:
	RESTOREREGISTERS

	ret

	EPILOGUE