/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * Copy kernel: gathers a strided region of A into the contiguous buffer B.
 *
 * Syntax / target
 *   AT&T syntax, 32-bit x86 registers, SSE moves; the file is run through
 *   the C preprocessor.  PROLOGUE, PROFCODE, EPILOGUE, ALIGN_3, ALIGN_4,
 *   SIZE and BASE_SHIFT are not defined here and are expected to come
 *   from common.h.
 *
 * Arguments (all read from the stack, in this order)
 *   M, N, A, LDA, B
 *
 * Traversal
 *   - N is consumed as groups of 4 consecutive elements, then one group
 *     of 2 (if N & 2), then one group of 1 (if N & 1).  A is moved past
 *     a 4- or 2-wide group before the next group starts.
 *   - Inside a group, M counts "lines" that lie one (scaled) LDA apart.
 *     Lines are consumed 4 at a time, then 2 (if M & 2), then 1 (if M & 1).
 *   - For each line the group's 4 / 2 / 1 elements are written to B, line
 *     after line, and B advances by exactly the number of elements stored.
 *
 * Assumptions to confirm against common.h / the build
 *   - Every movsd/movhps moves 8 bytes while offsets step by SIZE, so the
 *     loads only pick up distinct consecutive elements if SIZE is 8.
 *   - LDA is shifted left by BASE_SHIFT, presumably turning an element
 *     count into a byte stride (i.e. BASE_SHIFT == log2(SIZE)).
 *   - All full-width stores to B use movaps, which requires a 16-byte
 *     aligned address; loads from A use movsd/movhps and carry no
 *     alignment requirement.
 *
 * Register usage
 *   %ebp, %edi, %esi and %ebx are pushed on entry and popped before ret.
 *   %eax, %ecx, %edx and %xmm0-%xmm7 are overwritten.  No return value is
 *   set explicitly; %eax is left holding the advanced A pointer.
 */

#define ASSEMBLER
#include "common.h"

/* Prefetch distances (in elements) and the prefetch flavours used.      */
/* WPREFETCHSIZE is defined but not referenced by the code below; the    */
/* write prefetches are expressed in terms of RPREFETCHSIZE.             */
#define RPREFETCHSIZE	8
#define WPREFETCHSIZE	(RPREFETCHSIZE * 4)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht2

/* STACK = bytes occupied by the four register pushes below.             */
#define STACK	16
#define ARGS	 0

/* Stack-resident arguments, addressed relative to %esp after the pushes. */
#define M	 4 + STACK + ARGS(%esp)
#define N	 8 + STACK + ARGS(%esp)
#define ARG_A	12 + STACK + ARGS(%esp)
#define ARG_LDA	16 + STACK + ARGS(%esp)
#define ARG_B	20 + STACK + ARGS(%esp)

/* Register roles.                                                       */
#define A	%eax		/* start of the current group in A         */
#define B	%ebx		/* write cursor in the packed buffer       */
#define LDA	%ebp		/* line stride, scaled by BASE_SHIFT       */
#define A1	%ecx		/* read cursor: current line               */
#define A2	%edx		/* read cursor: current line + 2 * LDA     */
#define I	%esi		/* countdown over blocks of 4 lines        */
#define J	%edi		/* countdown over 4-wide groups            */

	PROLOGUE

	/* Preserve the registers this routine repurposes. */
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	/* Fetch the stride first and scale it, then the two base pointers. */
	movl	ARG_LDA, LDA
	shll	$BASE_SHIFT, LDA

	movl	ARG_A, A
	movl	ARG_B, B

	/* J = N / 4; with no complete 4-wide group go straight to the tails. */
	movl	N, J
	sarl	$2, J
	jz	.Lw2_begin
	ALIGN_3

/* ------------------------------------------------------------------ */
/* Groups of width 4                                                    */
/* ------------------------------------------------------------------ */
.Lw4_group:
	leal	(A, LDA, 2), A2		/* A2 = A + 2 * LDA                  */
	movl	A, A1			/* A1 = A                            */
	addl	$4 * SIZE, A		/* A now addresses the next group    */

	movl	M, I
	sarl	$2, I			/* I = M / 4                         */
	jz	.Lw4_pair
	ALIGN_3

	/* Four lines x four elements per pass -> 16 elements into B. */
.Lw4_quad:
	PREFETCH	RPREFETCHSIZE * SIZE(A1)

	movsd	0 * SIZE(A1), %xmm0
	movhps	1 * SIZE(A1), %xmm0
	movsd	2 * SIZE(A1), %xmm1
	movhps	3 * SIZE(A1), %xmm1

	PREFETCH	RPREFETCHSIZE * SIZE(A1, LDA)

	movsd	0 * SIZE(A1, LDA), %xmm2
	movhps	1 * SIZE(A1, LDA), %xmm2
	movsd	2 * SIZE(A1, LDA), %xmm3
	movhps	3 * SIZE(A1, LDA), %xmm3

	PREFETCH	RPREFETCHSIZE * SIZE(A2)

	movsd	0 * SIZE(A2), %xmm4
	movhps	1 * SIZE(A2), %xmm4
	movsd	2 * SIZE(A2), %xmm5
	movhps	3 * SIZE(A2), %xmm5

	PREFETCH	RPREFETCHSIZE * SIZE(A2, LDA)

	movsd	0 * SIZE(A2, LDA), %xmm6
	movhps	1 * SIZE(A2, LDA), %xmm6
	movsd	2 * SIZE(A2, LDA), %xmm7
	movhps	3 * SIZE(A2, LDA), %xmm7

	PREFETCHW	(RPREFETCHSIZE +  0) * SIZE(B)

	movaps	%xmm0,  0 * SIZE(B)
	movaps	%xmm1,  2 * SIZE(B)
	movaps	%xmm2,  4 * SIZE(B)
	movaps	%xmm3,  6 * SIZE(B)

	PREFETCHW	(RPREFETCHSIZE +  8) * SIZE(B)

	movaps	%xmm4,  8 * SIZE(B)
	movaps	%xmm5, 10 * SIZE(B)
	movaps	%xmm6, 12 * SIZE(B)
	movaps	%xmm7, 14 * SIZE(B)

	leal	(A1, LDA, 4), A1	/* both cursors step four lines      */
	leal	(A2, LDA, 4), A2
	subl	$-16 * SIZE, B		/* B += 16 elements                  */
	decl	I
	jnz	.Lw4_quad
	ALIGN_3

	/* Two leftover lines (bit 1 of M). test clears OF and, for these */
	/* small masks, SF as well, so ZF alone decides the branch.       */
.Lw4_pair:
	testl	$2, M
	jz	.Lw4_single

	movsd	0 * SIZE(A1), %xmm0
	movhps	1 * SIZE(A1), %xmm0
	movsd	2 * SIZE(A1), %xmm1
	movhps	3 * SIZE(A1), %xmm1

	movsd	0 * SIZE(A1, LDA), %xmm2
	movhps	1 * SIZE(A1, LDA), %xmm2
	movsd	2 * SIZE(A1, LDA), %xmm3
	movhps	3 * SIZE(A1, LDA), %xmm3

	movaps	%xmm0, 0 * SIZE(B)
	movaps	%xmm1, 2 * SIZE(B)
	movaps	%xmm2, 4 * SIZE(B)
	movaps	%xmm3, 6 * SIZE(B)

	leal	(A1, LDA, 2), A1	/* step two lines                    */
	subl	$-8 * SIZE, B		/* B += 8 elements                   */
	ALIGN_4

	/* One leftover line (bit 0 of M). */
.Lw4_single:
	testl	$1, M
	jz	.Lw4_next

	movsd	0 * SIZE(A1), %xmm0
	movhps	1 * SIZE(A1), %xmm0
	movsd	2 * SIZE(A1), %xmm1
	movhps	3 * SIZE(A1), %xmm1

	movaps	%xmm0, 0 * SIZE(B)
	movaps	%xmm1, 2 * SIZE(B)
	subl	$-4 * SIZE, B		/* B += 4 elements                   */
	ALIGN_4

.Lw4_next:
	decl	J
	jnz	.Lw4_group
	ALIGN_3

/* ------------------------------------------------------------------ */
/* One group of width 2, present when bit 1 of N is set                 */
/* ------------------------------------------------------------------ */
.Lw2_begin:
	testl	$2, N
	jz	.Lw1_begin

	leal	(A, LDA, 2), A2		/* A2 = A + 2 * LDA                  */
	movl	A, A1			/* A1 = A                            */
	addl	$2 * SIZE, A		/* A now addresses the last group    */

	movl	M, I
	sarl	$2, I			/* I = M / 4                         */
	jz	.Lw2_pair
	ALIGN_3

	/* Four lines x two elements per pass -> 8 elements into B. */
.Lw2_quad:
	movsd	0 * SIZE(A1), %xmm0
	movhps	1 * SIZE(A1), %xmm0
	movsd	0 * SIZE(A1, LDA), %xmm1
	movhps	1 * SIZE(A1, LDA), %xmm1

	movsd	0 * SIZE(A2), %xmm2
	movhps	1 * SIZE(A2), %xmm2
	movsd	0 * SIZE(A2, LDA), %xmm3
	movhps	1 * SIZE(A2, LDA), %xmm3

	movaps	%xmm0, 0 * SIZE(B)
	movaps	%xmm1, 2 * SIZE(B)
	movaps	%xmm2, 4 * SIZE(B)
	movaps	%xmm3, 6 * SIZE(B)

	leal	(A1, LDA, 4), A1
	leal	(A2, LDA, 4), A2
	subl	$-8 * SIZE, B		/* B += 8 elements                   */
	decl	I
	jnz	.Lw2_quad
	ALIGN_3

	/* Two leftover lines. */
.Lw2_pair:
	testl	$2, M
	jz	.Lw2_single

	movsd	0 * SIZE(A1), %xmm0
	movhps	1 * SIZE(A1), %xmm0
	movsd	0 * SIZE(A1, LDA), %xmm1
	movhps	1 * SIZE(A1, LDA), %xmm1

	movaps	%xmm0, 0 * SIZE(B)
	movaps	%xmm1, 2 * SIZE(B)

	leal	(A1, LDA, 2), A1
	subl	$-4 * SIZE, B		/* B += 4 elements                   */
	ALIGN_4

	/* One leftover line. */
.Lw2_single:
	testl	$1, M
	jz	.Lw1_begin

	movsd	0 * SIZE(A1), %xmm0
	movhps	1 * SIZE(A1), %xmm0

	movaps	%xmm0, 0 * SIZE(B)
	subl	$-2 * SIZE, B		/* B += 2 elements                   */
	ALIGN_4

/* ------------------------------------------------------------------ */
/* One group of width 1, present when bit 0 of N is set                 */
/* ------------------------------------------------------------------ */
.Lw1_begin:
	testl	$1, N
	jz	.Ldone

	leal	(A, LDA, 2), A2		/* A2 = A + 2 * LDA                  */
	movl	A, A1			/* A1 = A                            */

	movl	M, I
	sarl	$2, I			/* I = M / 4                         */
	jz	.Lw1_pair
	ALIGN_3

	/* Four lines x one element per pass: the low and high halves of */
	/* each register come from two neighbouring lines.               */
.Lw1_quad:
	movsd	0 * SIZE(A1), %xmm0
	movhps	0 * SIZE(A1, LDA), %xmm0
	movsd	0 * SIZE(A2), %xmm1
	movhps	0 * SIZE(A2, LDA), %xmm1

	movaps	%xmm0, 0 * SIZE(B)
	movaps	%xmm1, 2 * SIZE(B)

	leal	(A1, LDA, 4), A1
	leal	(A2, LDA, 4), A2
	subl	$-4 * SIZE, B		/* B += 4 elements                   */
	decl	I
	jnz	.Lw1_quad
	ALIGN_3

	/* Two leftover lines. */
.Lw1_pair:
	testl	$2, M
	jz	.Lw1_single

	movsd	0 * SIZE(A1), %xmm0
	movhps	0 * SIZE(A1, LDA), %xmm0

	movaps	%xmm0, 0 * SIZE(B)

	leal	(A1, LDA, 2), A1
	subl	$-2 * SIZE, B		/* B += 2 elements                   */
	ALIGN_4

	/* Final single element: an 8-byte movsd store, the only store  */
	/* to B that is not a movaps.                                   */
.Lw1_single:
	testl	$1, M
	jz	.Ldone

	movsd	0 * SIZE(A1), %xmm0
	movsd	%xmm0, 0 * SIZE(B)
	ALIGN_4

	/* Restore the saved registers in reverse push order and return. */
.Ldone:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE