/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED  TO, PROCUREMENT OF  SUBSTITUTE   */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.
*/ 37/*********************************************************************/ 38 39#define ASSEMBLER 40#include "common.h" 41 42#define M ARG1 /* rdi */ 43#define X ARG2 /* rsi */ 44#define INCX ARG3 /* rdx */ 45 46#define I %rax 47 48#include "l1param.h" 49 50 PROLOGUE 51 PROFCODE 52 53 SAVEREGISTERS 54 55 xorps %xmm0, %xmm0 56 testq M, M 57 jle .L999 58 testq INCX, INCX 59 jle .L999 60 61 xorps %xmm1, %xmm1 62 xorps %xmm2, %xmm2 63 xorps %xmm3, %xmm3 64 65 pcmpeqb %xmm15, %xmm15 66 psrlq $1, %xmm15 67 68 salq $BASE_SHIFT, INCX 69 70 subq $-16 * SIZE, X 71 72 cmpq $SIZE, INCX 73 jne .L40 74 75 testq $SIZE, X 76 je .L05 77 78 movsd -16 * SIZE(X), %xmm0 79 addq $SIZE, X 80 81 andps %xmm15, %xmm0 82 subq $1, M 83 jle .L999 84 ALIGN_3 85 86.L05: 87 movq M, I 88 sarq $4, I 89 jle .L20 90 91 movaps -16 * SIZE(X), %xmm4 92 movaps -14 * SIZE(X), %xmm5 93 movaps -12 * SIZE(X), %xmm6 94 movaps -10 * SIZE(X), %xmm7 95 96 movaps -8 * SIZE(X), %xmm8 97 movaps -6 * SIZE(X), %xmm9 98 movaps -4 * SIZE(X), %xmm10 99 movaps -2 * SIZE(X), %xmm11 100 101 decq I 102 jle .L11 103 ALIGN_4 104 105.L10: 106#ifdef PREFETCH 107 PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) 108#endif 109 110 andps %xmm15, %xmm4 111 addpd %xmm4, %xmm0 112 movaps 0 * SIZE(X), %xmm4 113 114 andps %xmm15, %xmm5 115 addpd %xmm5, %xmm1 116 movaps 2 * SIZE(X), %xmm5 117 118 andps %xmm15, %xmm6 119 addpd %xmm6, %xmm2 120 movaps 4 * SIZE(X), %xmm6 121 122 andps %xmm15, %xmm7 123 addpd %xmm7, %xmm3 124 movaps 6 * SIZE(X), %xmm7 125 126#if defined(PREFETCH) && !defined(FETCH128) 127 PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) 128#endif 129 130 andps %xmm15, %xmm8 131 addpd %xmm8, %xmm0 132 movaps 8 * SIZE(X), %xmm8 133 134 andps %xmm15, %xmm9 135 addpd %xmm9, %xmm1 136 movaps 10 * SIZE(X), %xmm9 137 138 andps %xmm15, %xmm10 139 addpd %xmm10, %xmm2 140 movaps 12 * SIZE(X), %xmm10 141 142 andps %xmm15, %xmm11 143 addpd %xmm11, %xmm3 144 movaps 14 * SIZE(X), %xmm11 145 146 subq $-16 * SIZE, X 147 decq I 148 jg .L10 149 
ALIGN_4 150 151.L11: 152 andps %xmm15, %xmm4 153 andps %xmm15, %xmm5 154 andps %xmm15, %xmm6 155 andps %xmm15, %xmm7 156 157 addpd %xmm4, %xmm0 158 addpd %xmm5, %xmm1 159 addpd %xmm6, %xmm2 160 addpd %xmm7, %xmm3 161 162 andps %xmm15, %xmm8 163 andps %xmm15, %xmm9 164 andps %xmm15, %xmm10 165 andps %xmm15, %xmm11 166 167 addpd %xmm8, %xmm0 168 addpd %xmm9, %xmm1 169 addpd %xmm10, %xmm2 170 addpd %xmm11, %xmm3 171 172 subq $-16 * SIZE, X 173 ALIGN_3 174 175.L20: 176 andq $15, M 177 jle .L998 178 179 testq $8, M 180 je .L21 181 182 movaps -16 * SIZE(X), %xmm4 183 movaps -14 * SIZE(X), %xmm5 184 movaps -12 * SIZE(X), %xmm6 185 movaps -10 * SIZE(X), %xmm7 186 187 andps %xmm15, %xmm4 188 andps %xmm15, %xmm5 189 andps %xmm15, %xmm6 190 andps %xmm15, %xmm7 191 192 addpd %xmm4, %xmm0 193 addpd %xmm5, %xmm1 194 addpd %xmm6, %xmm2 195 addpd %xmm7, %xmm3 196 addq $8 * SIZE, X 197 ALIGN_3 198 199.L21: 200 testq $4, M 201 je .L22 202 203 movaps -16 * SIZE(X), %xmm4 204 movaps -14 * SIZE(X), %xmm5 205 206 andps %xmm15, %xmm4 207 andps %xmm15, %xmm5 208 addpd %xmm4, %xmm0 209 addpd %xmm5, %xmm1 210 211 addq $4 * SIZE, X 212 ALIGN_3 213 214.L22: 215 testq $2, M 216 je .L23 217 218 movaps -16 * SIZE(X), %xmm6 219 andps %xmm15, %xmm6 220 addpd %xmm6, %xmm3 221 addq $2 * SIZE, X 222 223.L23: 224 testq $1, M 225 je .L998 226 227#ifdef movsd 228 xorps %xmm4, %xmm4 229#endif 230 movsd -16 * SIZE(X), %xmm4 231 andps %xmm15, %xmm4 232 addsd %xmm4, %xmm0 233 jmp .L998 234 ALIGN_3 235 236.L40: 237 movq M, I 238 sarq $3, I 239 jle .L60 240 ALIGN_4 241 242.L50: 243 movsd -16 * SIZE(X), %xmm4 244 addq INCX, X 245 movhpd -16 * SIZE(X), %xmm4 246 addq INCX, X 247 andps %xmm15, %xmm4 248 addpd %xmm4, %xmm0 249 250 movsd -16 * SIZE(X), %xmm5 251 addq INCX, X 252 movhpd -16 * SIZE(X), %xmm5 253 addq INCX, X 254 andps %xmm15, %xmm5 255 addpd %xmm5, %xmm1 256 257 movsd -16 * SIZE(X), %xmm6 258 addq INCX, X 259 movhpd -16 * SIZE(X), %xmm6 260 addq INCX, X 261 andps %xmm15, %xmm6 262 addpd %xmm6, %xmm2 
263 264 movsd -16 * SIZE(X), %xmm7 265 addq INCX, X 266 movhpd -16 * SIZE(X), %xmm7 267 addq INCX, X 268 andps %xmm15, %xmm7 269 addpd %xmm7, %xmm3 270 271 decq I 272 jg .L50 273 ALIGN_4 274 275.L60: 276#ifdef movsd 277 xorps %xmm4, %xmm4 278#endif 279 andq $7, M 280 jle .L998 281 ALIGN_4 282 283.L61: 284 movsd -16 * SIZE(X), %xmm4 285 andps %xmm15, %xmm4 286 addpd %xmm4, %xmm0 287 addq INCX, X 288 decq M 289 jg .L61 290 ALIGN_4 291 292.L998: 293 addpd %xmm1, %xmm0 294 addpd %xmm3, %xmm2 295 addpd %xmm2, %xmm0 296 ALIGN_4 297 298.L999: 299#ifndef HAVE_SSE3 300 movhlps %xmm0, %xmm1 301 addsd %xmm1, %xmm0 302#else 303 haddpd %xmm0, %xmm0 304#endif 305 306 RESTOREREGISTERS 307 308 ret 309 310 EPILOGUE 311 312