/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/* Complex single-precision absolute-value sum (ASUM) kernel, SSE.
 *
 * Computes sum over i of |Re(x[i])| + |Im(x[i])| for a complex vector X
 * of M elements with stride INCX (stride given in complex elements;
 * ZBASE_SHIFT scales it to bytes, 2*SIZE bytes per element).  Absolute
 * value is taken by masking off each float's sign bit, consistent with
 * the BLAS definition of complex ASUM (|re| + |im|, not the modulus).
 *
 * ABI (SysV AMD64; ARG1..ARG3 / PROLOGUE / SAVEREGISTERS come from
 * common.h and are not visible here — register notes from the macro
 * comments below):
 *   In:   M    = ARG1 (rdi)  element count (complex elements)
 *         X    = ARG2 (rsi)  source pointer
 *         INCX = ARG3 (rdx)  stride between elements
 *   Out:  scalar float result in the low dword of %xmm0.
 *   Uses: %rax (loop counter I), %xmm0-%xmm11 (data / partial sums),
 *         %xmm15 (sign-clear mask).
 *
 * Returns 0.0 for M <= 0 or INCX <= 0.
 */

#define ASSEMBLER
#include "common.h"

#define M	ARG1	/* rdi */
#define X	ARG2	/* rsi */
#define INCX	ARG3	/* rdx */

#define I	%rax

#include "l1param.h"	/* PREFETCH / PREFETCHSIZE / PREOFFSET tuning knobs */

	PROLOGUE
	PROFCODE

	SAVEREGISTERS

	pxor	%xmm0, %xmm0		/* partial sum #0; also the 0.0 return value */
	testq	M, M
	jle	.L999			/* M <= 0: nothing to sum */
	testq	INCX, INCX
	jle	.L999			/* non-positive stride: treated as empty */

	pxor	%xmm1, %xmm1		/* three more partial-sum accumulators; */
	pxor	%xmm2, %xmm2		/* four independent chains hide addps    */
	pxor	%xmm3, %xmm3		/* latency in the unrolled loop          */

	/* Build the abs() mask 0x7fffffff in every dword of %xmm15:
	   set all bits, then shift each dword right by 1 to clear the
	   sign bit position.  andps with this mask == fabsf per lane. */
	pcmpeqb	%xmm15, %xmm15
	psrld	$1, %xmm15

	salq	$ZBASE_SHIFT, INCX	/* stride: complex elements -> bytes */

	cmpq	$2 * SIZE, INCX
	jne	.L100			/* non-unit stride: pairwise path at .L100 */

	/* ---- Unit-stride path. -----------------------------------------
	   Bias X forward by 32*SIZE so the hot loop can address with small
	   negative displacements, and double M: from here on the data is
	   treated as a flat array of 2*M floats. */
	subq	$-32 * SIZE, X
	addq	M, M

	cmpq	$3, M
	jle	.L18			/* 3 floats or fewer: skip straight to tail */

	/* Peel one float if X is only 4-byte aligned (X % 8 == 4). */
	testq	$4, X
	je	.L05
	movss	-32 * SIZE(X), %xmm0
	andps	%xmm15, %xmm0
	addq	$SIZE, X
	decq	M
	jle	.L998
	ALIGN_3

.L05:
	/* Peel two floats if X is only 8-byte aligned (X % 16 == 8);
	   after this, X is 16-byte aligned for movaps. */
	testq	$8, X
	je	.L10

#ifdef movsd
	xorps	%xmm1, %xmm1		/* when movsd is aliased (e.g. to movlps)
					   the high half is not zeroed — clear it
					   so stale bits don't enter the sum */
#endif
	movsd	-32 * SIZE(X), %xmm1
	andps	%xmm15, %xmm1
	addq	$2 * SIZE, X
	subq	$2, M
	jle	.L998
	ALIGN_3

.L10:
	/* Main loop: 32 floats (8 aligned 16-byte vectors) per iteration,
	   software-pipelined — the loads for iteration k+1 are issued
	   while iteration k's values are masked and accumulated. */
	movq	M, I
	sarq	$5, I
	jle	.L14

	movaps	-32 * SIZE(X), %xmm4	/* prime the pipeline: first 8 vectors */
	movaps	-28 * SIZE(X), %xmm5
	movaps	-24 * SIZE(X), %xmm6
	movaps	-20 * SIZE(X), %xmm7

	movaps	-16 * SIZE(X), %xmm8
	movaps	-12 * SIZE(X), %xmm9
	movaps	-8 * SIZE(X), %xmm10
	movaps	-4 * SIZE(X), %xmm11
	decq	I
	jle	.L12			/* exactly one 32-float chunk: drain only */
	ALIGN_3

.L11:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	andps	%xmm15, %xmm4		/* |x| of current vector ...           */
	addps	%xmm4, %xmm0		/* ... fold into a partial sum ...     */
	movaps	0 * SIZE(X), %xmm4	/* ... and load the next iteration's   */

	andps	%xmm15, %xmm5
	addps	%xmm5, %xmm1
	movaps	4 * SIZE(X), %xmm5

	andps	%xmm15, %xmm6
	addps	%xmm6, %xmm2
	movaps	8 * SIZE(X), %xmm6

	andps	%xmm15, %xmm7
	addps	%xmm7, %xmm3
	movaps	12 * SIZE(X), %xmm7

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	andps	%xmm15, %xmm8
	addps	%xmm8, %xmm0
	movaps	16 * SIZE(X), %xmm8

	andps	%xmm15, %xmm9
	addps	%xmm9, %xmm1
	movaps	20 * SIZE(X), %xmm9

	andps	%xmm15, %xmm10
	addps	%xmm10, %xmm2
	movaps	24 * SIZE(X), %xmm10

	andps	%xmm15, %xmm11
	addps	%xmm11, %xmm3
	movaps	28 * SIZE(X), %xmm11

	subq	$-32 * SIZE, X		/* advance one 32-float chunk */
	decq	I
	jg	.L11
	ALIGN_3

.L12:
	/* Drain: accumulate the 8 vectors already loaded for the final
	   chunk, then step X past that chunk. */
	andps	%xmm15, %xmm4
	addps	%xmm4, %xmm0
	andps	%xmm15, %xmm5
	addps	%xmm5, %xmm1

	andps	%xmm15, %xmm6
	addps	%xmm6, %xmm2
	andps	%xmm15, %xmm7
	addps	%xmm7, %xmm3

	andps	%xmm15, %xmm8
	addps	%xmm8, %xmm0
	andps	%xmm15, %xmm9
	addps	%xmm9, %xmm1

	andps	%xmm15, %xmm10
	addps	%xmm10, %xmm2
	andps	%xmm15, %xmm11
	addps	%xmm11, %xmm3

	addq	$32 * SIZE, X
	ALIGN_3

.L14:
	testq	$31, M			/* remainder after 32-float chunks? */
	jle	.L998

.L15:
	/* Tail: binary-decomposed remainder — 16, 8, 4, 2, then 1 float. */
	testq	$16, M
	je	.L16

	movaps	-32 * SIZE(X), %xmm4
	andps	%xmm15, %xmm4
	addps	%xmm4, %xmm0

	movaps	-28 * SIZE(X), %xmm5
	andps	%xmm15, %xmm5
	addps	%xmm5, %xmm1

	movaps	-24 * SIZE(X), %xmm4
	andps	%xmm15, %xmm4
	addps	%xmm4, %xmm0

	movaps	-20 * SIZE(X), %xmm5
	andps	%xmm15, %xmm5
	addps	%xmm5, %xmm1

	addq	$16 * SIZE, X
	ALIGN_3

.L16:
	testq	$8, M
	je	.L17

	movaps	-32 * SIZE(X), %xmm4
	andps	%xmm15, %xmm4
	addps	%xmm4, %xmm0

	movaps	-28 * SIZE(X), %xmm5
	andps	%xmm15, %xmm5
	addps	%xmm5, %xmm1

	addq	$8 * SIZE, X
	ALIGN_3

.L17:
	testq	$4, M
	je	.L18

	movaps	-32 * SIZE(X), %xmm6
	andps	%xmm15, %xmm6
	addps	%xmm6, %xmm2
	addq	$4 * SIZE, X
	ALIGN_3

.L18:
	/* Also the entry point for very short vectors (M <= 3 floats). */
	testq	$2, M
	je	.L19

#ifdef movsd
	xorps	%xmm7, %xmm7		/* see the movsd-alias note at .L05 */
#endif
	movsd	-32 * SIZE(X), %xmm7
	andps	%xmm15, %xmm7
	addps	%xmm7, %xmm3
	addq	$2 * SIZE, X
	ALIGN_3

.L19:
	testq	$1, M			/* odd float left over from alignment peel */
	je	.L998

	movss	-32 * SIZE(X), %xmm6
	andps	%xmm15, %xmm6
	addps	%xmm6, %xmm2
	jmp	.L998
	ALIGN_4

.L100:
	/* ---- Non-unit stride path. -------------------------------------
	   Pack two complex elements (re,im pairs) per xmm register via
	   movsd (low half) + movhps (high half); 4 elements per pass. */
	movq	M, I
	sarq	$2, I
	jle	.L105
	ALIGN_4

.L101:
	movsd	(X), %xmm4		/* element i   -> low 2 floats  */
	addq	INCX, X
	movhps	(X), %xmm4		/* element i+1 -> high 2 floats */
	addq	INCX, X

	andps	%xmm15, %xmm4
	addps	%xmm4, %xmm0

	movsd	(X), %xmm5		/* elements i+2, i+3 likewise */
	addq	INCX, X
	movhps	(X), %xmm5
	addq	INCX, X

	andps	%xmm15, %xmm5
	addps	%xmm5, %xmm1

	decq	I
	jg	.L101
	ALIGN_4

.L105:
#ifdef movsd
	xorps	%xmm4, %xmm4		/* see the movsd-alias note at .L05 */
#endif
	andq	$3, M			/* 0..3 complex elements remain */
	jle	.L998
	ALIGN_4

.L106:
	movsd	(X), %xmm4		/* one complex element: (re, im) */
	andps	%xmm15, %xmm4
	addps	%xmm4, %xmm0
	addq	INCX, X
	decq	M
	jg	.L106
	ALIGN_4

.L998:
	/* Combine the four partial sums, then reduce the 4 lanes of
	   %xmm0 horizontally so the scalar result sits in lane 0. */
	addps	%xmm1, %xmm0
	addps	%xmm3, %xmm2
	addps	%xmm2, %xmm0

#ifndef HAVE_SSE3
	movhlps	%xmm0, %xmm1		/* lanes 2,3 -> lanes 0,1 of xmm1 */
	addps	%xmm1, %xmm0		/* xmm0[0,1] = lane0+lane2, lane1+lane3 */

	movaps	%xmm0, %xmm1
	shufps	$1, %xmm0, %xmm0	/* move lane 1 down to lane 0 */
	addss	%xmm1, %xmm0		/* final scalar sum in lane 0 */
#else
	haddps	%xmm0, %xmm0		/* SSE3: two horizontal adds collapse */
	haddps	%xmm0, %xmm0		/* all four lanes into lane 0 */
#endif
	ALIGN_4

.L999:
	RESTOREREGISTERS

	ret

	EPILOGUE