/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * Sum of absolute values of a strided vector, 32-bit x86 with SSE2.
 * Syntax: AT&T, preprocessed with cpp.
 *
 * Stack arguments (read after the two register pushes below):
 *   STACK_M     element count
 *   STACK_X     address of the first element
 *   STACK_INCX  stride between elements, in elements
 *
 * Result: pushed on the x87 stack with fldl (st(0)).  When M <= 0 or
 * INCX <= 0 the accumulators are never touched and the result is 0.0.
 *
 * Saved and restored: %esi, %ebx.  Also written: %eax, %ecx,
 * %xmm0, %xmm1, %xmm3-%xmm7, flags, and the 8 bytes at STACK_M.
 *
 * Elements are handled as 64-bit doubles (addpd / addsd / fldl).  SIZE and
 * BASE_SHIFT are not defined in this file; presumably common.h sets them
 * to the element size in bytes and its log2 -- confirm.
 *
 * NOTE(review): the unit-stride path uses movaps after peeling at most one
 * element, so it assumes X is at least SIZE-aligned; verify that callers
 * guarantee this.
 */

#define ASSEMBLER
#include "common.h"

/* STACK = bytes pushed by this routine (%esi and %ebx) before the
   arguments are read; the leading 4 / 8 / 12 skip the return address
   and the preceding arguments. */
#define STACK	8
#define ARGS	0

#define STACK_M		 4 + STACK + ARGS(%esp)
#define STACK_X		 8 + STACK + ARGS(%esp)
#define STACK_INCX	12 + STACK + ARGS(%esp)

/* Register roles:
     I     loop trip counter
     M     elements still to process
     X     element pointer, biased by +16 * SIZE (see below)
     INCX  stride; scaled to bytes before use */
#define I	%eax
#define M	%ecx
#define X	%esi
#define INCX	%ebx

#include "l1param.h"

	PROLOGUE
	PROFCODE

	pushl	%esi
	pushl	%ebx

	movl	STACK_M, M
	movl	STACK_X, X
	movl	STACK_INCX, INCX

	/* xmm0 / xmm1: two independent partial sums, merged at .L999. */
	xorps	%xmm0, %xmm0
	xorps	%xmm1, %xmm1

	/* Nothing to sum for a non-positive count or stride. */
	testl	M, M
	jle	.L999
	testl	INCX, INCX
	jle	.L999

	/* xmm3 = all ones shifted right by one per 64-bit lane, i.e. every
	   bit except the sign bit; and-ing with it yields |x| per lane. */
	pcmpeqb	%xmm3, %xmm3
	psrlq	$1, %xmm3

	/* Stride: elements -> bytes (it is compared against SIZE below). */
	sall	$BASE_SHIFT, INCX

	/* Bias X by +16 * SIZE; every access below uses a displacement
	   relative to -16 * SIZE to compensate. */
	subl	$-16 * SIZE, X

	/* Any stride other than one element takes the gather path. */
	cmpl	$SIZE, INCX
	jne	.L40

	/* Unit stride: if the SIZE bit of the address is set, consume one
	   element so the movaps loads below start on a 16-byte boundary. */
	testl	$SIZE, X
	je	.L05

	movsd	-16 * SIZE(X), %xmm0
	addl	$SIZE, X

	andps	%xmm3, %xmm0
	subl	$1, M
	jle	.L999
	ALIGN_3

.L05:
	/* I = number of 16-element groups. */
	movl	M, I
	sarl	$4, I
	jle	.L20

	/* Preload the first 8 elements; the loop always accumulates data
	   loaded earlier while fetching the next vectors. */
	movaps	-16 * SIZE(X), %xmm4
	movaps	-14 * SIZE(X), %xmm5
	movaps	-12 * SIZE(X), %xmm6
	movaps	-10 * SIZE(X), %xmm7

	/* The last group is drained at .L11 without loading past it. */
	decl	I
	jle	.L11
	ALIGN_4

.L10:
	/* One iteration: accumulate 16 elements and leave the first 8
	   elements of the following group loaded in xmm4-xmm7. */
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	andps	%xmm3, %xmm4
	addpd	%xmm4, %xmm0
	movaps	 -8 * SIZE(X), %xmm4

	andps	%xmm3, %xmm5
	addpd	%xmm5, %xmm1
	movaps	 -6 * SIZE(X), %xmm5

	andps	%xmm3, %xmm6
	addpd	%xmm6, %xmm0
	movaps	 -4 * SIZE(X), %xmm6

	andps	%xmm3, %xmm7
	addpd	%xmm7, %xmm1
	movaps	 -2 * SIZE(X), %xmm7

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
#endif

	andps	%xmm3, %xmm4
	addpd	%xmm4, %xmm0
	movaps	  0 * SIZE(X), %xmm4

	andps	%xmm3, %xmm5
	addpd	%xmm5, %xmm1
	movaps	  2 * SIZE(X), %xmm5

	andps	%xmm3, %xmm6
	addpd	%xmm6, %xmm0
	movaps	  4 * SIZE(X), %xmm6

	andps	%xmm3, %xmm7
	addpd	%xmm7, %xmm1
	movaps	  6 * SIZE(X), %xmm7

	subl	$-16 * SIZE, X
	decl	I
	jg	.L10
	ALIGN_4

.L11:
	/* Drain: accumulate the 8 preloaded elements, then load and
	   accumulate the final 8 of this group. */
	andps	%xmm3, %xmm4
	addpd	%xmm4, %xmm0
	movaps	 -8 * SIZE(X), %xmm4

	andps	%xmm3, %xmm5
	addpd	%xmm5, %xmm1
	movaps	 -6 * SIZE(X), %xmm5

	andps	%xmm3, %xmm6
	addpd	%xmm6, %xmm0
	movaps	 -4 * SIZE(X), %xmm6

	andps	%xmm3, %xmm7
	addpd	%xmm7, %xmm1
	movaps	 -2 * SIZE(X), %xmm7

	andps	%xmm3, %xmm4
	addpd	%xmm4, %xmm0
	andps	%xmm3, %xmm5
	addpd	%xmm5, %xmm1
	andps	%xmm3, %xmm6
	addpd	%xmm6, %xmm0
	andps	%xmm3, %xmm7
	addpd	%xmm7, %xmm1

	subl	$-16 * SIZE, X
	ALIGN_3

.L20:
	/* Unit-stride remainder: M & 15 elements, handled as 8 / 4 / 2 / 1. */
	andl	$15, M
	jle	.L999

	testl	$8, M
	je	.L21

	movaps	-16 * SIZE(X), %xmm4
	movaps	-14 * SIZE(X), %xmm5
	movaps	-12 * SIZE(X), %xmm6
	movaps	-10 * SIZE(X), %xmm7

	andps	%xmm3, %xmm4
	addpd	%xmm4, %xmm0
	andps	%xmm3, %xmm5
	addpd	%xmm5, %xmm1
	andps	%xmm3, %xmm6
	addpd	%xmm6, %xmm0
	andps	%xmm3, %xmm7
	addpd	%xmm7, %xmm1
	addl	$8 * SIZE, X
	ALIGN_3

.L21:
	testl	$4, M
	je	.L22

	movaps	-16 * SIZE(X), %xmm4
	movaps	-14 * SIZE(X), %xmm5

	andps	%xmm3, %xmm4
	addpd	%xmm4, %xmm0
	andps	%xmm3, %xmm5
	addpd	%xmm5, %xmm1

	addl	$4 * SIZE, X
	ALIGN_3

.L22:
	testl	$2, M
	je	.L23

	movaps	-16 * SIZE(X), %xmm4
	andps	%xmm3, %xmm4
	addpd	%xmm4, %xmm0
	addl	$2 * SIZE, X

.L23:
	testl	$1, M
	je	.L999

	/* If movsd is a preprocessor macro (presumably remapped elsewhere
	   to a load that leaves the upper lane untouched -- confirm), clear
	   xmm4 first so no stale upper lane is carried along. */
#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-16 * SIZE(X), %xmm4
	andps	%xmm3, %xmm4
	addsd	%xmm4, %xmm1
	jmp	.L999
	ALIGN_3

.L40:
	/* Strided path: I = number of 8-element groups. */
	movl	M, I
	sarl	$3, I
	jle	.L60
	ALIGN_4

.L50:
	/* Build each 2-element vector from two strided loads (low lane via
	   movsd, high lane via movhps), then mask and accumulate. */
	movsd	-16 * SIZE(X), %xmm4
	addl	INCX, X
	movhps	-16 * SIZE(X), %xmm4
	addl	INCX, X
	andps	%xmm3, %xmm4
	addpd	%xmm4, %xmm0

	movsd	-16 * SIZE(X), %xmm5
	addl	INCX, X
	movhps	-16 * SIZE(X), %xmm5
	addl	INCX, X
	andps	%xmm3, %xmm5
	addpd	%xmm5, %xmm1

	movsd	-16 * SIZE(X), %xmm6
	addl	INCX, X
	movhps	-16 * SIZE(X), %xmm6
	addl	INCX, X
	andps	%xmm3, %xmm6
	addpd	%xmm6, %xmm0

	movsd	-16 * SIZE(X), %xmm7
	addl	INCX, X
	movhps	-16 * SIZE(X), %xmm7
	addl	INCX, X
	andps	%xmm3, %xmm7
	addpd	%xmm7, %xmm1

	decl	I
	jg	.L50
	ALIGN_4

.L60:
	/* Same macro guard as at .L23: clear xmm4 once before the scalar
	   loop when movsd is a preprocessor macro. */
#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	/* Strided remainder: M & 7 elements, one per iteration. */
	andl	$7, M
	jle	.L999
	ALIGN_4

.L61:
	movsd	-16 * SIZE(X), %xmm4
	andps	%xmm3, %xmm4
	addsd	%xmm4, %xmm0
	addl	INCX, X
	decl	M
	jg	.L61
	ALIGN_4

.L999:
	/* Merge the two accumulators, then add the two lanes together. */
	addpd	%xmm1, %xmm0

#ifndef HAVE_SSE3
	movaps	%xmm0, %xmm1
	unpckhpd %xmm0, %xmm0
	addsd	 %xmm1, %xmm0
#else
	haddpd	%xmm0, %xmm0
#endif

	/* Spill the scalar result into the incoming argument area (the
	   8-byte store covers the M and X slots, no longer needed) and
	   reload it onto the x87 stack as the return value. */
	movsd	%xmm0, STACK_M
	fldl	STACK_M
	popl	%ebx
	popl	%esi
	ret

	EPILOGUE