/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/*    The views and conclusions contained in the software and        */
/*    documentation are those of the authors and should not be       */
/*    interpreted as representing official policies, either expressed */
/*    or implied, of The University of Texas at Austin.              */
/*********************************************************************/

/*
 * SASUM kernel (single precision, SSE), x86-64 SysV ABI.
 *
 * Computes the sum of absolute values of an n-element float vector:
 *     float sasum(BLASLONG n, float *x, BLASLONG incx)
 *
 * In:   M    (rdi) = element count n
 *       X    (rsi) = vector base pointer
 *       INCX (rdx) = stride in elements
 * Out:  scalar result in the low dword of %xmm0
 *
 * xmm0-xmm3 are four partial-sum accumulators (summed at .L998);
 * xmm15 holds the sign-clearing mask 0x7FFFFFFF per lane, so
 * `andps %xmm15, v` computes fabsf() lane-wise.
 *
 * NOTE(review): SAVEREGISTERS/RESTOREREGISTERS and the PREFETCH knobs
 * come from common.h / l1param.h (not visible here) — assumed to handle
 * any per-platform register preservation (e.g. Windows xmm6-15).
 */

#define ASSEMBLER
#include "common.h"

#define M	ARG1	/* rdi */
#define X	ARG2	/* rsi */
#define INCX	ARG3	/* rdx */

#define I	%rax	/* loop counter */

#include "l1param.h"

	PROLOGUE
	PROFCODE

	SAVEREGISTERS

	xorps	%xmm0, %xmm0		/* result = 0; also the early-exit return value */
	testq	M, M
	jle	.L999			/* n <= 0: return 0 */
	testq	INCX, INCX
	jle	.L999			/* incx <= 0: return 0 */

	xorps	%xmm1, %xmm1		/* clear remaining accumulators */
	xorps	%xmm2, %xmm2
	xorps	%xmm3, %xmm3

	pcmpeqb	%xmm15, %xmm15		/* all-ones, then ... */
	psrld	$1, %xmm15		/* ... >>1 per dword = 0x7FFFFFFF abs-mask */

	leaq	(, INCX, SIZE), INCX	/* stride: elements -> bytes */

	cmpq	$SIZE, INCX
	jne	.L100			/* non-unit stride: scalar path */

	/* Unit-stride path.  Bias X by +32 elements so the loop can use
	   small negative displacements; subq of a negative 8-bit
	   immediate encodes shorter than a 32-bit addq. */
	subq	$-32 * SIZE, X

	cmpq	$3, M
	jle	.L18			/* too short to align: straight to tails */

	/* Peel up to 3 leading elements until X is 16-byte aligned,
	   so the main loop can use movaps. */
	testq	$4, X
	je	.L05
	movss	-32 * SIZE(X), %xmm0	/* one element to reach 8-byte alignment */
	andps	%xmm15, %xmm0
	addq	$SIZE, X
	decq	M
	jle	.L998
	ALIGN_3

.L05:
	testq	$8, X
	je	.L10

	movsd	-32 * SIZE(X), %xmm1	/* two elements to reach 16-byte alignment */
	andps	%xmm15, %xmm1
	addq	$2 * SIZE, X
	subq	$2, M
	jle	.L998
	ALIGN_3

.L10:
	movq	M, I
	sarq	$5, I			/* I = n / 32: main loop does 32 floats/iter */
	jle	.L14

	/* Software-pipelined: preload 8 vectors (32 floats) ... */
	movaps	-32 * SIZE(X), %xmm4
	movaps	-28 * SIZE(X), %xmm5
	movaps	-24 * SIZE(X), %xmm6
	movaps	-20 * SIZE(X), %xmm7

	movaps	-16 * SIZE(X), %xmm8
	movaps	-12 * SIZE(X), %xmm9
	movaps	 -8 * SIZE(X), %xmm10
	movaps	 -4 * SIZE(X), %xmm11
	decq	I
	jle	.L12			/* only one chunk: drain without reloading */
	ALIGN_3

.L11:
	/* ... then each iteration consumes the 8 in-flight vectors and
	   reloads the next 32 floats, rotating across 4 accumulators
	   to hide addps latency. */
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	andps	%xmm15, %xmm4
	addps	%xmm4,  %xmm0
	movaps	 0 * SIZE(X), %xmm4

	andps	%xmm15, %xmm5
	addps	%xmm5,  %xmm1
	movaps	 4 * SIZE(X), %xmm5

	andps	%xmm15, %xmm6
	addps	%xmm6,  %xmm2
	movaps	 8 * SIZE(X), %xmm6

	andps	%xmm15, %xmm7
	addps	%xmm7,  %xmm3
	movaps	12 * SIZE(X), %xmm7

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	andps	%xmm15, %xmm8
	addps	%xmm8,  %xmm0
	movaps	16 * SIZE(X), %xmm8

	andps	%xmm15, %xmm9
	addps	%xmm9,  %xmm1
	movaps	20 * SIZE(X), %xmm9

	andps	%xmm15, %xmm10
	addps	%xmm10, %xmm2
	movaps	24 * SIZE(X), %xmm10

	andps	%xmm15, %xmm11
	addps	%xmm11, %xmm3
	movaps	28 * SIZE(X), %xmm11

	subq	$-32 * SIZE, X		/* advance 32 elements */
	decq	I
	jg	.L11
	ALIGN_3

.L12:
	/* Drain the final 8 preloaded vectors (no reloads). */
	andps	%xmm15, %xmm4
	addps	%xmm4,  %xmm0
	andps	%xmm15, %xmm5
	addps	%xmm5,  %xmm1

	andps	%xmm15, %xmm6
	addps	%xmm6,  %xmm2
	andps	%xmm15, %xmm7
	addps	%xmm7,  %xmm3

	andps	%xmm15, %xmm8
	addps	%xmm8,  %xmm0
	andps	%xmm15, %xmm9
	addps	%xmm9,  %xmm1

	andps	%xmm15, %xmm10
	addps	%xmm10, %xmm2
	andps	%xmm15, %xmm11
	addps	%xmm11, %xmm3

	subq	$-32 * SIZE, X
	ALIGN_3

.L14:
	/* Tail: 16 remaining elements. */
	testq	$16, M
	je	.L16

	movaps	-32 * SIZE(X), %xmm4
	andps	%xmm15, %xmm4
	addps	%xmm4, %xmm0

	movaps	-28 * SIZE(X), %xmm5
	andps	%xmm15, %xmm5
	addps	%xmm5, %xmm1

	movaps	-24 * SIZE(X), %xmm4
	andps	%xmm15, %xmm4
	addps	%xmm4, %xmm0

	movaps	-20 * SIZE(X), %xmm5
	andps	%xmm15, %xmm5
	addps	%xmm5, %xmm1

	addq	$16 * SIZE, X
	ALIGN_3

.L16:
	/* Tail: 8 remaining elements. */
	testq	$8, M
	je	.L17

	movaps	-32 * SIZE(X), %xmm4
	andps	%xmm15, %xmm4
	addps	%xmm4, %xmm0

	movaps	-28 * SIZE(X), %xmm5
	andps	%xmm15, %xmm5
	addps	%xmm5, %xmm1

	addq	$8 * SIZE, X
	ALIGN_3

.L17:
	/* Tail: 4 remaining elements. */
	testq	$4, M
	je	.L18

	movaps	-32 * SIZE(X), %xmm6
	andps	%xmm15, %xmm6
	addps	%xmm6, %xmm2
	addq	$4 * SIZE, X
	ALIGN_3

.L18:
	/* Tail: 2 remaining elements. */
	testq	$2, M
	je	.L19

#ifdef movsd
	/* When movsd is macro-redefined (to movlps), the load leaves the
	   high quadword of the destination unmodified — clear it first so
	   stale data is not summed in. */
	xorps	%xmm7, %xmm7
#endif
	movsd	-32 * SIZE(X), %xmm7
	andps	%xmm15, %xmm7
	addps	%xmm7, %xmm3
	addq	$2 * SIZE, X
	ALIGN_3

.L19:
	/* Tail: final single element.  movss zeroes the upper lanes,
	   so the full-width andps/addps are safe. */
	testq	$1, M
	je	.L998

	movss	-32 * SIZE(X), %xmm6
	andps	%xmm15, %xmm6
	addps	%xmm6, %xmm2
	jmp	.L998
	ALIGN_4

.L100:
	/* Non-unit-stride path: scalar loads, unrolled 8x,
	   still rotating the 4 accumulators. */
	movq	M, I
	sarq	$3, I			/* I = n / 8 */
	jle	.L105
	ALIGN_4

.L101:
	movss	0 * SIZE(X), %xmm4
	addq	INCX, X
	andps	%xmm15, %xmm4
	addss	%xmm4, %xmm0

	movss	0 * SIZE(X), %xmm5
	addq	INCX, X
	andps	%xmm15, %xmm5
	addss	%xmm5, %xmm1

	movss	0 * SIZE(X), %xmm6
	addq	INCX, X
	andps	%xmm15, %xmm6
	addss	%xmm6, %xmm2

	movss	0 * SIZE(X), %xmm7
	addq	INCX, X
	andps	%xmm15, %xmm7
	addss	%xmm7, %xmm3

	movss	0 * SIZE(X), %xmm8
	addq	INCX, X
	andps	%xmm15, %xmm8
	addss	%xmm8, %xmm0

	movss	0 * SIZE(X), %xmm4
	addq	INCX, X
	andps	%xmm15, %xmm4
	addss	%xmm4, %xmm1

	movss	0 * SIZE(X), %xmm5
	addq	INCX, X
	andps	%xmm15, %xmm5
	addss	%xmm5, %xmm2

	movss	0 * SIZE(X), %xmm6
	addq	INCX, X
	andps	%xmm15, %xmm6
	addss	%xmm6, %xmm3

	decq	I
	jg	.L101
	ALIGN_4

.L105:
	andq	$7, M			/* remaining n % 8 elements */
	jle	.L998
	ALIGN_4

.L106:
	movss	0 * SIZE(X), %xmm4
	andps	%xmm15, %xmm4
	addps	%xmm4, %xmm0
	addq	INCX, X
	decq	M
	jg	.L106
	ALIGN_4

.L998:
	/* Combine the 4 packed accumulators ... */
	addps	%xmm1, %xmm0
	addps	%xmm3, %xmm2
	addps	%xmm2, %xmm0

	/* ... then reduce the 4 lanes of xmm0 to a scalar in lane 0. */
#ifndef HAVE_SSE3
	movhlps	%xmm0, %xmm1		/* lanes 2,3 down onto 0,1 */
	addps	%xmm1, %xmm0

	movaps	%xmm0, %xmm1
	shufps	$1, %xmm0, %xmm0	/* lane 1 down onto lane 0 */
	addss	%xmm1, %xmm0
#else
	haddps	%xmm0, %xmm0
	haddps	%xmm0, %xmm0
#endif
	ALIGN_4

.L999:
	RESTOREREGISTERS

	ret				/* result in xmm0 (zero on early exit) */

	EPILOGUE