1/*********************************************************************/ 2/* */ 3/* Optimized BLAS libraries */ 4/* By Kazushige Goto <kgoto@tacc.utexas.edu> */ 5/* */ 6/* Copyright (c) The University of Texas, 2009. All rights reserved. */ 7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING */ 8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF */ 9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE, */ 10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY */ 11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF */ 12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO */ 13/* THE USE OF THE SOFTWARE OR DOCUMENTATION. */ 14/* Under no circumstances shall University be liable for incidental, */ 15/* special, indirect, direct or consequential damages or loss of */ 16/* profits, interruption of business, or related expenses which may */ 17/* arise from use of Software or Documentation, including but not */ 18/* limited to those resulting from defects in Software and/or */ 19/* Documentation, or loss or inaccuracy of data of any kind. 
*/
/*********************************************************************/

/*
 * Dot-product kernel for 32-bit x86 using the x87 FPU (AT&T syntax,
 * run through the C preprocessor).
 *
 * Computes the sum of x[i] * y[i] over n elements, stepping x by incx
 * and y by incy elements.
 *
 * Arguments are read from the stack in this order: n, x, incx, y, incy.
 * When F_INTERFACE is defined, n, incx and incy are pointers that are
 * dereferenced on entry, and a negative increment makes the matching
 * pointer start (n - 1) * |inc| elements further on, so that the
 * backwards walk ends at the address that was passed in.
 *
 * Result:    left in %st(0); the other x87 stack slots used here are
 *            popped again before returning.
 * Preserved: %ebx, %esi, %edi (pushed on entry, popped on exit).
 * Scratch:   %eax, %ecx, %edx, flags.
 *
 * n <= 0 returns 0.0 without reading x or y.
 *
 * PROLOGUE, PROFCODE, EMMS, EPILOGUE, ALIGN_3, FLD, SIZE and BASE_SHIFT
 * are supplied by common.h, which is not visible here; presumably FLD /
 * SIZE / BASE_SHIFT select the element precision -- confirm there.
 */

#define ASSEMBLER
#include "common.h"

/* Bytes pushed by this routine before the arguments are read
   (three pushl instructions of 4 bytes each). */
#define STACK	12
#define ARGS	 0

/* Incoming arguments, addressed relative to %esp after the three
   register saves; the extra 4 skips the return address. */
#define STACK_N		 4 + STACK + ARGS(%esp)
#define STACK_X		 8 + STACK + ARGS(%esp)
#define STACK_INCX	12 + STACK + ARGS(%esp)
#define STACK_Y		16 + STACK + ARGS(%esp)
#define STACK_INCY	20 + STACK + ARGS(%esp)

/* Register roles for the whole routine. */
#define N	%ebx		/* element count                      */
#define X	%esi		/* current position in x              */
#define INCX	%ecx		/* x stride, in bytes after scaling   */
#define Y	%edi		/* current position in y              */
#define INCY	%edx		/* y stride, in bytes after scaling   */


	PROLOGUE

	/* Save the registers that are restored before returning. */
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95)
	EMMS
#endif

	movl	STACK_N,     N
	movl	STACK_X,     X
	movl	STACK_INCX,  INCX
	movl	STACK_Y,     Y
	movl	STACK_INCY,  INCY

#ifdef F_INTERFACE
	/* Scalars arrive by reference in this configuration. */
	movl	(N), N
	movl	(INCX), INCX
	movl	(INCY), INCY
#endif

	/* Convert the strides from elements to bytes. */
	sall	$BASE_SHIFT, INCX
	sall	$BASE_SHIFT, INCY

	/* Four independent partial sums live in %st(0)..%st(3). */
	fldz
	fldz
	fldz
	fldz

	/* Nothing to accumulate for n <= 0.  Without this guard a negative
	   n skips the unrolled loop but can still give a non-zero (n & 3),
	   so the remainder loop would read up to three elements.  .L27
	   folds the four zero accumulators, so the result is 0.0. */
	testl	N, N
	jle	.L27

	/* The contiguous path requires both strides to equal one element. */
	cmpl	$SIZE, INCX
	jne	.L14
	cmpl	$SIZE, INCY
	jne	.L14

	/* ---- unit stride: n / 4 iterations of four elements each ---- */
	movl	N, %eax
	sarl	$2,   %eax
	jle	.L15
	ALIGN_3

.L16:
	/* After each fmulp the product is in %st(0) and the accumulators
	   sit in %st(1)..%st(4); faddp adds into one of them and pops, so
	   %st(k) names accumulator k-1 once the pop has happened. */
	FLD	0 * SIZE(X)
	FLD	0 * SIZE(Y)
	fmulp	%st, %st(1)
	faddp	%st, %st(1)

	FLD	1 * SIZE(X)
	FLD	1 * SIZE(Y)
	fmulp	%st, %st(1)
	faddp	%st, %st(2)

	FLD	2 * SIZE(X)
	FLD	2 * SIZE(Y)
	fmulp	%st, %st(1)
	faddp	%st, %st(3)

	FLD	3 * SIZE(X)
	FLD	3 * SIZE(Y)
	fmulp	%st, %st(1)
	faddp	%st, %st(4)

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	decl	%eax
	jg	.L16
	ALIGN_3

.L15:
	/* ---- unit stride: remaining n % 4 elements ---- */
	movl	N, %eax
	andl	$3,   %eax
	jle	.L27
	ALIGN_3

.L22:
	FLD	(X)
	addl	$SIZE, X
	FLD	(Y)
	fmulp	%st, %st(1)
	addl	$SIZE, Y
	faddp	%st, %st(1)
	decl	%eax
	jg	.L22

	jmp	.L27
	ALIGN_3

.L14:
	/* ---- general stride ---- */
#ifdef F_INTERFACE
	/* For a negative stride, move the pointer forward by
	   (n - 1) * |inc| bytes so the walk runs backwards from there. */
	testl	INCX, INCX
	jge	.L28

	movl	N, %eax
	decl	%eax
	imull	INCX, %eax
	subl	%eax, X
	ALIGN_3

.L28:
	testl	INCY, INCY
	jge	.L29

	movl	N, %eax
	decl	%eax
	imull	INCY, %eax
	subl	%eax, Y
	ALIGN_3

.L29:
#endif
	/* n / 4 iterations of four elements each. */
	movl	N, %eax
	sarl	$2,   %eax
	jle	.L30
	ALIGN_3

.L31:
	FLD	(X)
	addl	INCX, X
	FLD	(Y)
	fmulp	%st, %st(1)
	addl	INCY, Y
	faddp	%st, %st(1)

	FLD	(X)
	addl	INCX, X
	FLD	(Y)
	fmulp	%st, %st(1)
	addl	INCY, Y
	faddp	%st, %st(2)

	FLD	(X)
	addl	INCX, X
	FLD	(Y)
	fmulp	%st, %st(1)
	addl	INCY, Y
	faddp	%st, %st(3)

	FLD	(X)
	addl	INCX, X
	FLD	(Y)
	fmulp	%st, %st(1)
	addl	INCY, Y
	faddp	%st, %st(4)

	decl	%eax
	jg	.L31
	ALIGN_3

.L30:
	/* Remaining n % 4 elements. */
	movl	N, %eax
	andl	$3,   %eax
	jle	.L27
	ALIGN_3

.L37:
	FLD	(X)
	addl	INCX, X
	FLD	(Y)
	fmulp	%st, %st(1)
	addl	INCY, Y
	faddp	%st, %st(1)
	decl	%eax
	jg	.L37
	ALIGN_3

.L27:
	/* Fold the four partial sums into one value, left in %st(0):
	   (a0 + a2) and (a1 + a3) first, then their sum. */
	faddp	%st, %st(2)
	faddp	%st, %st(2)
	faddp	%st, %st(1)

	popl	%ebx
	popl	%esi
	popl	%edi
	ret

	EPILOGUE