1/*********************************************************************/ 2/* */ 3/* Optimized BLAS libraries */ 4/* By Kazushige Goto <kgoto@tacc.utexas.edu> */ 5/* */ 6/* Copyright (c) The University of Texas, 2009. All rights reserved. */ 7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING */ 8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF */ 9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE, */ 10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY */ 11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF */ 12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO */ 13/* THE USE OF THE SOFTWARE OR DOCUMENTATION. */ 14/* Under no circumstances shall University be liable for incidental, */ 15/* special, indirect, direct or consequential damages or loss of */ 16/* profits, interruption of business, or related expenses which may */ 17/* arise from use of Software or Documentation, including but not */ 18/* limited to those resulting from defects in Software and/or */ 19/* Documentation, or loss or inaccuracy of data of any kind. */ 20/*********************************************************************/ 21 22#define ASSEMBLER 23#include "common.h" 24 25#ifdef PENTIUM 26#define P 32 27#endif 28 29#if defined(ATHLON) || defined(OPTERON) 30#define P 32 31#endif 32 33#ifndef P 34#define P DTB_ENTRIES 35#endif 36 37#define STACK 16 38#define ARGS 16 39 40#define PLDA_M 0 + STACK(%esp) 41#define XP 4 + STACK(%esp) 42#define MIN_N 8 + STACK(%esp) 43#define IS 12 + STACK(%esp) 44 45#define M 4 + STACK + ARGS(%esp) 46#define N 8 + STACK + ARGS(%esp) 47#define K 12 + STACK + ARGS(%esp) 48#define ALPHA 16 + STACK + ARGS(%esp) 49 50#define A 32 + STACK + ARGS(%esp) 51#define LDA 36 + STACK + ARGS(%esp) 52#define X 40 + STACK + ARGS(%esp) 53#define INCX 44 + STACK + ARGS(%esp) 54#define Y 48 + STACK + ARGS(%esp) 55#define INCY 52 + STACK + ARGS(%esp) 56#define BUFFER 56 + STACK + ARGS(%esp) 57 58 59 PROLOGUE 60 61 subl $ARGS, %esp 62 pushl %ebp 63 pushl %edi 64 pushl %esi 65 pushl %ebx 66 67 PROFCODE 68 69 FLD ALPHA 70 movl X, %edi 71 72 movl LDA, %ebx 73 sall $BASE_SHIFT, %ebx 74 75 movl $0, IS 76 movl M, %edx 77 movl N, %esi 78 79 test %esi, %esi 80 jle .L79 # goto END 81 test %edx, %edx 82 jle .L79 # goto END 83 84 movl INCY, %eax 85 sall $BASE_SHIFT, %eax 86 movl %eax, INCY 87 88 movl LDA, %eax 89 imull $P, %eax # P * lda 90 subl M ,%eax # P * lda - m 91 sall $BASE_SHIFT, %eax 92 movl %eax, PLDA_M 93 ALIGN_2 94 95.L32: 96 movl IS, %esi 97 movl $P, %edx 98 movl N, %eax 99 subl %esi,%eax # n - is 100 cmpl %edx, %eax 101#ifdef PENTIUM 102 jle .L33 103 movl %edx, %eax 104.L33: 105#else 106 cmovg %edx, %eax 107#endif 108 109 movl %eax, MIN_N 110 movl INCX, %edx 111 112 sall $BASE_SHIFT, %esi 113 leal (%edi, %esi, 1), %esi 114 115 movl %esi, XP 116 cmpl $1, %edx 117 je .L34 # if incx == 1 goto L34 118 119 movl BUFFER, %esi 120 sall $BASE_SHIFT, %edx 121 movl %esi, XP # xp = buffer 122 sarl $2,%eax 123 jle .L35 124 ALIGN_2 125 126.L36: 127 FLD (%edi) 128 addl %edx,%edi # x += incx 129 FLD (%edi) 130 addl %edx,%edi # x += incx 131 FLD (%edi) 132 addl %edx,%edi # x += incx 133 FLD (%edi) 134 addl %edx,%edi # x += incx 135 136 FST 3 * SIZE(%esi) 137 FST 2 * SIZE(%esi) 138 FST 1 * SIZE(%esi) 139 FST 0 * SIZE(%esi) 140 141 addl $4 * SIZE, %esi # xp += 4 142 decl %eax 143 jg .L36 144 ALIGN_3 145 146.L35: 147 movl MIN_N, %eax 148 andl $3, %eax 149 jle .L34 150 ALIGN_2 151 152.L42: 153 FLD (%edi) 154 addl %edx, %edi 155 FST (%esi) 156 addl $SIZE, %esi 157 decl %eax 158 jg .L42 159 ALIGN_3 160 161/* Main Routine */ 162.L34: 163 movl Y, %ecx # c_offset 164 movl M, %ebp 165 sarl $2, %ebp # j = (m >> 2) 166 jle .L47 167 ALIGN_2 168 169.L48: 170 movl A, %edx # a_offset = a 171 fldz 172 addl $4 * SIZE, A # a += 4 173 fldz 174 movl XP, %esi # b_offset = xp 175 fldz 176 movl MIN_N, %eax # i = min_n 177 fldz 178 FLD (%esi) # bt1 = b_offset 179 sarl $1, %eax 180 jle .L51 181 ALIGN_2 182 183#ifdef PENTIUM3 184#define PRESIZE 8 185#else 186#define PRESIZE 24 187#endif 188 189.L80: 190#ifdef PENTIUM3 191 prefetcht1 PRESIZE * SIZE(%edx, %ebx, 1) 192 FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) 193 fmul %st(1), %st # at1 *= bt1 194 195 prefetcht1 PRESIZE * SIZE(%esi) 196 faddp %st, %st(2) # ct1 += at1 197 FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) 198 199 fmul %st(1), %st # at1 *= bt1 200 faddp %st, %st(3) # ct2 += at1 201 FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) 202 203 fmul %st(1), %st # at1 *= bt1 204 faddp %st, %st(4) # ct3 += at1 205 FLD 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) 206 207 fmulp %st, %st(1) 208 faddp %st, %st(4) # ct4 += at1 209 FLD 1 * SIZE(%esi) # bt1 = b_offset 210 211 prefetcht1 PRESIZE * SIZE(%edx, %ebx, 2) 212 addl %ebx, %edx # a_offset += lda 213 FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) 214 215 fmul %st(1), %st # at1 *= bt1 216 faddp %st, %st(2) # ct1 += at1 217 FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) 218 219 fmul %st(1), %st # at1 *= bt1 220 faddp %st, %st(3) # ct2 += at1 221 FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) 222 223 fmul %st(1), %st # at1 *= bt1 224 faddp %st, %st(4) # ct3 += at1 225 FLD 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) 226 227 fmulp %st, %st(1) 228 addl %ebx, %edx 229 faddp %st, %st(4) # ct4 += at1 230 231 FLD 2 * SIZE(%esi) # bt1 = b_offset 232 addl $2 * SIZE, %esi # b_offset += 2 233 234#else 235#ifdef PENTIUM4 236 prefetchnta 8 * SIZE(%esi) 237#endif 238 FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) 239 fmul %st(1), %st # at1 *= bt1 240 faddp %st, %st(2) # ct1 += at1 241 242 FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) 243 fmul %st(1), %st # at1 *= bt1 244 faddp %st, %st(3) # ct2 += at1 245 246 FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) 247 fmul %st(1), %st # at1 *= bt1 248 faddp %st, %st(4) # ct3 += at1 249 250 FLD 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) 251 fmulp %st, %st(1) 252 faddp %st, %st(4) # ct4 += at1 253 FLD 1 * SIZE(%esi) # bt1 = b_offset 254 255 addl %ebx, %edx # a_offset += lda 256 257 FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) 258 fmul %st(1), %st # at1 *= bt1 259 faddp %st, %st(2) # ct1 += at1 260 261 FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) 262 fmul %st(1), %st # at1 *= bt1 263 faddp %st, %st(3) # ct2 += at1 264 265 FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) 266 fmul %st(1), %st # at1 *= bt1 267 faddp %st, %st(4) # ct3 += at1 268 269 FLD 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) 270 fmulp %st, %st(1) 271 faddp %st, %st(4) # ct4 += at1 272 FLD 2 * SIZE(%esi) # bt1 = b_offset 273 274 addl %ebx, %edx 275 addl $2 * SIZE, %esi # b_offset += 2 276#endif 277 decl %eax 278 jg .L80 279 280.L51: 281 movl MIN_N,%eax 282 andl $1, %eax 283 je .L57 284 285 FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) 286 fmul %st(1), %st # at1 *= bt1 287 faddp %st, %st(2) # ct1 += at1 288 289 FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) 290 fmul %st(1), %st # at1 *= bt1 291 faddp %st, %st(3) # ct2 += at1 292 293 FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) 294 fmul %st(1), %st # at1 *= bt1 295 faddp %st, %st(4) # ct3 += at1 296 297 FLD 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) 298 fmulp %st, %st(1) 299 faddp %st, %st(4) # ct4 += at1 300 fldz 301 ALIGN_2 302 303.L57: 304 ffreep %st(0) 305 306 fxch %st(4) 307 fmul %st, %st(4) 308 fmul %st, %st(1) 309 fmul %st, %st(2) 310 fmul %st, %st(3) 311 fxch %st(4) 312 313 movl INCY, %eax 314 315 FLD (%ecx) 316 faddp %st, %st(1) 317 FST (%ecx) 318 addl %eax, %ecx 319 320 FLD (%ecx) 321 faddp %st, %st(1) 322 FST (%ecx) 323 addl %eax, %ecx 324 325 FLD (%ecx) 326 faddp %st, %st(1) 327 FST (%ecx) 328 addl %eax, %ecx 329 330 FLD (%ecx) 331 faddp %st, %st(1) 332 FST (%ecx) 333 addl %eax, %ecx 334 335 decl %ebp # j -- 336 jg .L48 337 ALIGN_3 338 339.L47: 340 movl M, %ebp 341 andl $3, %ebp # j = (m & 3) 342 jle .L60 343 ALIGN_2 344 345.L61: 346 347 movl A, %edx # a_offset = a 348 fldz 349 addl $SIZE, A # a++ 350 fldz 351 movl XP,%esi 352 fldz 353 movl MIN_N,%eax 354 fldz 355 sarl $3,%eax 356 jle .L64 357 ALIGN_2 358 359.L65: 360 FLD 0 * SIZE(%esi) 361 FLD (%edx) 362 fmulp %st, %st(1) 363 faddp %st, %st(1) 364 addl %ebx, %edx 365 366 FLD 1 * SIZE(%esi) 367 FLD (%edx) 368 fmulp %st, %st(1) 369 faddp %st, %st(2) 370 addl %ebx ,%edx 371 372 FLD 2 * SIZE(%esi) 373 FLD (%edx) 374 fmulp %st, %st(1) 375 faddp %st, %st(3) 376 addl %ebx, %edx 377 378 FLD 3 * SIZE(%esi) 379 FLD (%edx) 380 fmulp %st, %st(1) 381 faddp %st, %st(4) 382 addl %ebx, %edx 383 384 FLD 4 * SIZE(%esi) 385 FLD (%edx) 386 fmulp %st, %st(1) 387 faddp %st,%st(1) 388 addl %ebx, %edx 389 390 FLD 5 * SIZE(%esi) 391 FLD (%edx) 392 fmulp %st, %st(1) 393 faddp %st, %st(2) 394 addl %ebx, %edx 395 396 FLD 6 * SIZE(%esi) 397 FLD (%edx) 398 fmulp %st, %st(1) 399 faddp %st,%st(3) 400 addl %ebx, %edx 401 402 FLD 7 * SIZE(%esi) 403 FLD (%edx) 404 fmulp %st, %st(1) 405 faddp %st,%st(4) 406 addl %ebx, %edx 407 408 addl $8 * SIZE, %esi 409 decl %eax 410 jg .L65 411 412.L64: 413 movl MIN_N,%eax 414 andl $7, %eax 415 jle .L70 416 ALIGN_2 417 418.L71: 419 FLD (%esi) 420 addl $SIZE, %esi # b_offset ++ 421 FLD (%edx) 422 fmulp %st, %st(1) 423 addl %ebx, %edx # a_offset += lda 424 faddp %st, %st(1) 425 decl %eax 426 jg .L71 427 ALIGN_2 428 429.L70: 430 faddp %st, %st(1) 431 faddp %st, %st(1) 432 faddp %st, %st(1) 433 434 fmul %st(1), %st 435 movl INCY, %eax 436 FLD (%ecx) 437 faddp %st, %st(1) 438 FST (%ecx) 439 addl %eax, %ecx 440 decl %ebp 441 jg .L61 442 443.L60: 444 movl PLDA_M, %esi 445 addl %esi, A # a += P * lda - m 446 addl $P, IS 447 movl N, %esi 448 cmpl %esi,IS 449 jl .L32 450 451.L79: 452 ffreep %st(0) 453 popl %ebx 454 popl %esi 455 popl %edi 456 popl %ebp 457 addl $ARGS, %esp 458 ret 459 460 EPILOGUE 461