/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED  TO, PROCUREMENT OF  SUBSTITUTE   */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/*    The views and conclusions contained in the software and        */
/*    documentation are those of the authors and should not be       */
/*    interpreted as representing official policies, either expressed */
/*    or implied, of The University of Texas at Austin.              */
/*********************************************************************/

/*
 * Complex GEMV kernel, i386 / x87 FPU, AT&T syntax.
 *
 * Reads its arguments from the stack (cdecl-style):
 *   M, N, K, ALPHA_R, ALPHA_I, A, LDA, X, INCX, Y, INCY, BUFFER.
 * For each of the N columns it accumulates a complex dot product of a
 * column of A against the vector X and updates Y in place
 * (y[j] += alpha * dot).  Sign flips selected by the CONJ / XCONJ
 * conditional-assembly macros give the conjugated variants.
 * NOTE(review): this has the shape of the transposed (x^T * A) ZGEMV
 * kernel from an optimized BLAS — confirm against the build system
 * before relying on that naming.
 *
 * Row blocking: rows are processed in panels of P at a time (P = 88 on
 * PENTIUM, else 400), with X gathered into BUFFER when its stride is
 * not contiguous, so the inner loops always read a unit-stride vector.
 *
 * SIZE (element size in bytes, from common.h) and DOUBLE select the
 * single/double precision layouts; complex strides are 2*SIZE.
 */

#define ASSEMBLER
#include "common.h"

/* Panel height (rows per blocking pass).  Small on PENTIUM to fit its
   cache; 400 otherwise. */
#ifdef PENTIUM
#define P	88
#endif

#ifndef P
#define P	400
#endif

#define STACK	16		/* bytes pushed by the 4 register saves below */
#define ARGS	24		/* local scratch area reserved on entry */

/* Locals, addressed relative to %esp inside the scratch area. */
#define NLDA	 0 + STACK(%esp)	/* byte step from end of panel back to next panel of A */
#define XP	 4 + STACK(%esp)	/* pointer to (possibly buffered) x panel */
#define MIN_M	 8 + STACK(%esp)	/* rows in the current panel: min(M - is, P) */
#define J	12 + STACK(%esp)	/* (unused in this variant) */
#define IS	16 + STACK(%esp)	/* row index of the current panel start */

/* Incoming arguments (above saved regs + scratch). */
#define M	 4 + STACK + ARGS(%esp)
#define N	 8 + STACK + ARGS(%esp)
#define K	12 + STACK + ARGS(%esp)
#ifdef DOUBLE
#define ALPHA_R	16 + STACK + ARGS(%esp)
#define ALPHA_I	24 + STACK + ARGS(%esp)
#define A	32 + STACK + ARGS(%esp)
#define LDA	36 + STACK + ARGS(%esp)
#define X	40 + STACK + ARGS(%esp)
#define INCX	44 + STACK + ARGS(%esp)
#define Y	48 + STACK + ARGS(%esp)
#define INCY	52 + STACK + ARGS(%esp)
#define BUFFER	56 + STACK + ARGS(%esp)
#else
#define ALPHA_R	16 + STACK + ARGS(%esp)
#define ALPHA_I	20 + STACK + ARGS(%esp)
#define A	24 + STACK + ARGS(%esp)
#define LDA	28 + STACK + ARGS(%esp)
#define X	32 + STACK + ARGS(%esp)
#define INCX	36 + STACK + ARGS(%esp)
#define Y	40 + STACK + ARGS(%esp)
#define INCY	44 + STACK + ARGS(%esp)
#define BUFFER	48 + STACK + ARGS(%esp)
#endif

	PROLOGUE

	subl	$ARGS, %esp		# reserve local scratch
	pushl	%ebp			# save callee-saved registers (i386 ABI)
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	/* Keep alpha resident at the bottom of the x87 stack for the
	   whole routine: st = alpha_r, st(1) = alpha_i. */
	FLD	ALPHA_I
	FLD	ALPHA_R

	movl	X, %edi			# %edi = X

	movl	$0, IS			# is = 0 (panel start row)

	movl	M, %ebx
	movl	N, %ecx
	testl	%ebx, %ebx
	jle	.L79			# M <= 0: nothing to do

	testl	%ecx, %ecx
	jle	.L79			# N <= 0: nothing to do

	/* Scale the strides to bytes: complex elements are 2*SIZE bytes. */
	movl	INCX, %esi
	addl	%esi, %esi
	leal	(,%esi,SIZE), %esi
	movl	%esi, INCX		# INCX *= 2 * SIZE

	movl	INCY, %esi
	addl	%esi, %esi
	leal	(, %esi, SIZE), %esi
	movl	%esi, INCY		# INCY *= 2 * SIZE

	movl	LDA, %ebx

	/* NLDA = (P - N * lda) * 2 * SIZE: after sweeping N columns of a
	   panel, A has advanced N*lda elements; adding NLDA (plus the P
	   rows consumed) repositions A at the next panel. */
	movl	N, %eax
	imull	%ebx, %eax
	movl	$P, %esi
	subl	%eax, %esi
	leal	(, %esi, SIZE), %esi
	addl	%esi, %esi
	movl	%esi, NLDA

	leal	(,%ebx,SIZE), %esi
	addl	%esi, %esi
	movl	%esi, LDA		# LDA in bytes (*= 2 * SIZE)
	ALIGN_2

/* ---- Outer blocking loop over row panels of height MIN_M ---- */
.L32:
	movl	IS, %esi

	/* MIN_M = min(M - is, P); PENTIUM lacks cmov, so branch there. */
	movl	$P, %edx
	movl	M, %eax
	subl	%esi, %eax
	cmpl	%edx, %eax
#ifdef PENTIUM
	jle	.L33
	movl	%edx, %eax
.L33:
#else
	cmovg	%edx, %eax
#endif
	movl	%eax, MIN_M

	movl	IS, %ecx
	addl	%ecx, %ecx
	leal	(%edi,%ecx,SIZE), %ecx	# xp = x + is (complex elements)
	movl	INCX, %ebx
	movl	%ecx, XP
	cmpl	$2 * SIZE, %ebx
	je	.L34			# unit stride: use x directly

	/* Non-unit stride: gather this panel of x into BUFFER so the
	   inner loops can read it contiguously.  Two elements per
	   iteration; stores are issued in reverse because FST pops the
	   x87 stack (last-loaded value is stored first). */
	movl	BUFFER, %esi
	movl	MIN_M, %eax
	movl	%esi, XP
	sarl	$1, %eax
	jle	.L35

	ALIGN_3

.L36:
	FLD	0 * SIZE(%edi)
	FLD	1 * SIZE(%edi)
	addl	%ebx, %edi		# x += incx
	FLD	0 * SIZE(%edi)
	FLD	1 * SIZE(%edi)
	addl	%ebx, %edi		# x += incx

	FST	3 * SIZE(%esi)
	FST	2 * SIZE(%esi)
	FST	1 * SIZE(%esi)
	FST	0 * SIZE(%esi)

	addl	$4 * SIZE, %esi		# xp += 4
	decl	%eax
	jg	.L36
	ALIGN_3

.L35:
	/* Copy the leftover element when MIN_M is odd. */
	movl	MIN_M, %eax
	andl	$1, %eax
	jle	.L34

	FLD	0 * SIZE(%edi)
	FLD	1 * SIZE(%edi)
	addl	%ebx, %edi		# x += incx
	FST	1 * SIZE(%esi)
	FST	0 * SIZE(%esi)
	ALIGN_3

/* Main Routine */

.L34:
	movl	Y, %ebp			# coffset = y

	movl	N, %ecx			# %ecx = column counter
	testl	%ecx, %ecx
	jle	.L60
	ALIGN_2

/* ---- Per-column loop: complex dot product of A-column with x ----
   x87 stack during the loop (top to bottom):
     bt1, ct4, ct3, ct2, ct1, alpha_r, alpha_i
   ct1/ct2 and ct3/ct4 are paired partial sums combined at .L70. */
.L61:
	movl	A, %ebx			# a_offset = a (current column)
	fldz				# ct1 = ZERO
	movl	LDA, %edx
	fldz				# ct2 = ZERO

	addl	%ebx, %edx
	fldz				# ct3 = ZERO
	movl	%edx, A			# A advances to the next column
	fldz				# ct4 = ZERO

	movl	XP, %esi

	FLD	(%esi)			# bt1 = *(b_offset + 0)

	movl	MIN_M, %eax
	sarl	$1, %eax		# two complex elements per iteration
	jle	.L64
	ALIGN_3

#define PRESIZE 8

.L65:
#ifdef HAS_PREFETCH
	prefetcht0	PRESIZE * SIZE(%ebx)
	prefetcht0	PRESIZE * SIZE(%esi)
#endif

	/* --- element 0: real part of x --- */
	FLD	0 * SIZE(%ebx)		# at1 = *(a_offset + 0)
	fmul	%st(1), %st		# at1 *= bt1
	faddp	%st, %st(2)		# ct1 += at1

	FMUL	1 * SIZE(%ebx)		# bt1 *= *(a_offset + 1)
#ifndef CONJ
	faddp	%st, %st(2)		# ct2 += bt1
#else
	fsubrp	%st, %st(2)		# ct2 -= bt1 (conjugated A)
#endif
	FLD	1 * SIZE(%esi)		# bt1 = *(b_offset + 1)

	/* --- element 0: imaginary part of x --- */
	FLD	0 * SIZE(%ebx)		# at1 = *(a_offset + 0)
	fmul	%st(1), %st		# at1 *= bt1
	faddp	%st, %st(4)		# ct3 += at1

	FMUL	1 * SIZE(%ebx)		# bt1 *= *(a_offset + 1)
	faddp	%st, %st(4)		# ct4 += bt1
	FLD	2 * SIZE(%esi)		# bt1 = *(b_offset + 2)

	/* --- element 1: real part of x --- */
	FLD	2 * SIZE(%ebx)		# at1 = *(a_offset + 2)
	fmul	%st(1), %st		# at1 *= bt1
	faddp	%st, %st(2)		# ct1 += at1

	FMUL	3 * SIZE(%ebx)		# bt1 *= *(a_offset + 3)
#ifndef CONJ
	faddp	%st, %st(2)		# ct2 += bt1
#else
	fsubrp	%st, %st(2)		# ct2 -= bt1 (conjugated A)
#endif
	FLD	3 * SIZE(%esi)		# bt1 = *(b_offset + 3)

	/* --- element 1: imaginary part of x --- */
	FLD	2 * SIZE(%ebx)		# at1 = *(a_offset + 2)
	fmul	%st(1), %st		# at1 *= bt1
	faddp	%st, %st(4)		# ct3 += at1

	FMUL	3 * SIZE(%ebx)		# bt1 *= *(a_offset + 3)
	faddp	%st, %st(4)		# ct4 += bt1
	FLD	4 * SIZE(%esi)		# bt1 = *(b_offset + 4), next iteration

	addl	$4 * SIZE, %esi
	addl	$4 * SIZE, %ebx
	decl	%eax
	jg	.L65
	ALIGN_3

.L64:
	/* Tail: one leftover complex element when MIN_M is odd. */
	movl	MIN_M, %eax
	andl	$1, %eax
	jle	.L70
	ALIGN_3

.L71:
	FLD	0 * SIZE(%ebx)		# at1 = *(a_offset + 0)
	fmul	%st(1), %st		# at1 *= bt1
	faddp	%st, %st(2)		# ct1 += at1

	FMUL	1 * SIZE(%ebx)		# bt1 *= *(a_offset + 1)
#ifndef CONJ
	faddp	%st, %st(2)		# ct2 += bt1
#else
	fsubrp	%st, %st(2)		# ct2 -= bt1 (conjugated A)
#endif
	FLD	1 * SIZE(%esi)		# bt1 = *(b_offset + 1)

	FLD	0 * SIZE(%ebx)		# at1 = *(a_offset + 0)
	fmul	%st(1), %st		# at1 *= bt1
	faddp	%st, %st(4)		# ct3 += at1

	FMUL	1 * SIZE(%ebx)		# bt1 *= *(a_offset + 1)
	faddp	%st, %st(4)		# ct4 += bt1
	fldz				# dummy to keep stack depth uniform
	ALIGN_3

.L70:
	/* Pop the dead bt1/dummy.  ffreep is not accepted by the SUN
	   assembler, so emit its opcode bytes (DF C0) directly there. */
#ifndef C_SUN
	ffreep	%st(0)
#else
	.byte	0xdf
	.byte	0xc0
#endif

	/* Fold ct1..ct4 into (real, imag) of the dot product; the sign
	   pattern selects the plain / CONJ / XCONJ variants. */
#ifndef XCONJ
#ifndef CONJ
	fsubp	%st, %st(3)
	faddp	%st, %st(1)
#else
	faddp	%st, %st(3)
	faddp	%st, %st(1)
#endif
#else
#ifndef CONJ
	faddp	%st, %st(3)
	fsubp	%st, %st(1)
#else
	fsubp	%st, %st(3)
	fsubp	%st, %st(1)
#endif
#endif

	/* Stack is now: st = ci (imag sum), st(1) = cr (real sum),
	   st(2) = alpha_r, st(3) = alpha_i.
	   Real update: y[0] += cr*alpha_r - ci*alpha_i */
	fld	%st(0)			# duplicate ci
	fmul	%st(4), %st		# ci * alpha_i
	fld	%st(2)			# load cr
	fmul	%st(4), %st		# cr * alpha_r
	fsubp	%st, %st(1)		# cr*alpha_r - ci*alpha_i

	FADD	0 * SIZE(%ebp)
	FST	0 * SIZE(%ebp)		# y[0] += real part

	/* Imag update: y[1] += ci*alpha_r + cr*alpha_i */
	fmul	%st(2), %st		# ci * alpha_r
	fxch	%st(1)
	fmul	%st(3), %st		# cr * alpha_i
	faddp	%st, %st(1)

	FADD	1 * SIZE(%ebp)
	FST	1 * SIZE(%ebp)		# y[1] += imag part
	addl	INCY, %ebp		# y += incy

	decl	%ecx
	jg	.L61
	ALIGN_3

.L60:
	/* Rewind A to the start of the next row panel. */
	movl	A, %ebx
	addl	NLDA, %ebx
	movl	%ebx, A

	addl	$P, IS			# is += P
	movl	M, %esi
	cmpl	%esi, IS
	jl	.L32			# more panels remain
	ALIGN_3

.L79:
	/* Pop the resident alpha_r / alpha_i to leave the x87 stack
	   empty on return (same SUN-assembler workaround as above). */
#ifndef C_SUN
	ffreep	%st(0)
	ffreep	%st(0)
#else
	.byte	0xdf
	.byte	0xc0
	.byte	0xdf
	.byte	0xc0
#endif

	popl	%ebx			# restore callee-saved registers
	popl	%esi
	popl	%edi
	popl	%ebp
	addl	$ARGS, %esp
	ret

	EPILOGUE