/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Read-prefetch distance, in elements, ahead of each source cursor. */
#define RPREFETCHSIZE	12
#define WPREFETCHSIZE	(RPREFETCHSIZE * 2)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht2

/* STACK: bytes taken by the four pushed registers.                  */
/* ARGS : bytes of local scratch reserved by the subl below.         */
#define STACK	16
#define ARGS	 8

/* Local scratch slots, located just above the saved registers.      */
#define J		 0 + STACK(%esp)
#define BOFFSET2	 4 + STACK(%esp)

/* Incoming stack arguments, located above the scratch area.         */
/* (Offsets assume PROLOGUE itself does not move %esp; confirm       */
/* against common.h.)                                                */
#define M	 4 + STACK + ARGS(%esp)
#define N	 8 + STACK + ARGS(%esp)
#define A	12 + STACK + ARGS(%esp)
#define LDA	16 + STACK + ARGS(%esp)
#define B	20 + STACK + ARGS(%esp)

/*-------------------------------------------------------------------*/
/* Packing copy from A into B, two source lines at a time.           */
/*                                                                   */
/* Inputs (32-bit values read from the stack):                       */
/*   M   - number of source lines; consecutive lines are LDA         */
/*         elements apart                                            */
/*   N   - number of elements consumed from each line                */
/*   A   - source base address                                       */
/*   LDA - distance between consecutive lines, in elements           */
/*   B   - destination base address                                  */
/*                                                                   */
/* Destination layout (indices in elements, p = line-pair index,     */
/* q = index of a 2-element group inside a line):                    */
/*   B[4*p + 2*M*q + 0..1] = line 2p,   elements 2q, 2q+1            */
/*   B[4*p + 2*M*q + 2..3] = line 2p+1, elements 2q, 2q+1            */
/* When N is odd, the last element of each line goes to a tail area  */
/* that starts at B + M * (N & ~1) and grows by two elements per     */
/* line pair. When M is odd, the final line is copied alone:         */
/* its 2-element groups land 2*M elements apart starting at          */
/* B + 4 * (M >> 1), and its odd last element goes to the tail.      */
/*                                                                   */
/* Register roles:                                                   */
/*   ebp = source cursor (start of the current line pair)            */
/*   edx / ecx = first / second line of the current pair             */
/*   edi = destination cursor, eax = working destination pointer     */
/*   esi = 2 * M elements expressed in bytes                         */
/*   ebx = loop counter / scratch                                    */
/*                                                                   */
/* The stores use movaps, so every destination address touched this  */
/* way must be 16-byte aligned.                                      */
/*                                                                   */
/* NOTE(review): each 128-bit value is assembled from a movsd at     */
/* k * SIZE and a movhps at (k + 1) * SIZE, and stored 2 * SIZE      */
/* apart; that only lines up when SIZE is 8 bytes. Confirm whether   */
/* the non-DOUBLE shift below is ever exercised.                     */
/*-------------------------------------------------------------------*/

	PROLOGUE

	subl	$ARGS, %esp		/* reserve J and BOFFSET2 */
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	M, %ebx
	movl	N, %eax
	movl	A, %ebp			/* ebp = source cursor */
	movl	B, %edi			/* edi = destination cursor */

	andl	$-2, %eax
	imull	%ebx, %eax		/* eax = m * (n & ~1) */
	leal	(%edi, %eax, SIZE), %eax
	movl	%eax, BOFFSET2		/* tail area = b + m * (n & ~1) */

	/* esi = 2 * m elements, converted to a byte distance. */
	movl	M, %esi
#ifdef DOUBLE
	sall	$4, %esi
#else
	sall	$3, %esi
#endif

	/* J = m >> 1 line pairs; the flags tested by jle come from the */
	/* shift (movl leaves them untouched), so no pairs means skip.  */
	sarl	$1, %ebx
	movl	%ebx, J
	jle	.Lodd_line
	ALIGN_4

/* ---- one iteration per pair of source lines ---------------------- */
.Lpair_lines:
	movl	LDA, %eax
	movl	N, %ebx
	movl	%ebp, %edx			/* edx = first line of the pair */
	leal	(%ebp, %eax, SIZE), %ecx	/* ecx = second line = first + lda */
	leal	(%ecx, %eax, SIZE), %ebp	/* source cursor += 2 * lda */
	movl	%edi, %eax			/* eax = destination for this pair */
	addl	$4 * SIZE, %edi			/* destination cursor += 4 */

	sarl	$2, %ebx			/* groups of four elements */
	jle	.Lpair_two
	ALIGN_4

/* Four elements from each line per pass; all loads precede stores.   */
.Lpair_quad:
	PREFETCH	RPREFETCHSIZE * SIZE(%edx)
	PREFETCH	RPREFETCHSIZE * SIZE(%ecx)

	movsd	0 * SIZE(%edx), %xmm0
	movhps	1 * SIZE(%edx), %xmm0
	movsd	2 * SIZE(%edx), %xmm4
	movhps	3 * SIZE(%edx), %xmm4

	movsd	0 * SIZE(%ecx), %xmm2
	movhps	1 * SIZE(%ecx), %xmm2
	movsd	2 * SIZE(%ecx), %xmm6
	movhps	3 * SIZE(%ecx), %xmm6

	addl	$4 * SIZE, %edx
	addl	$4 * SIZE, %ecx

	movaps	%xmm0, 0 * SIZE(%eax)	/* elements 0,1 of line one */
	movaps	%xmm2, 2 * SIZE(%eax)	/* elements 0,1 of line two */
	addl	%esi, %eax		/* next group sits 2 * m further on */
	movaps	%xmm4, 0 * SIZE(%eax)	/* elements 2,3 of line one */
	movaps	%xmm6, 2 * SIZE(%eax)	/* elements 2,3 of line two */
	addl	%esi, %eax

	decl	%ebx
	jne	.Lpair_quad
	ALIGN_4

/* Remainder of two elements per line (bit 1 of n). */
.Lpair_two:
	movl	N, %ebx
	testl	$2, %ebx
	je	.Lpair_one

	PREFETCH	RPREFETCHSIZE * SIZE(%edx)
	PREFETCH	RPREFETCHSIZE * SIZE(%ecx)

	movsd	0 * SIZE(%edx), %xmm0
	movhps	1 * SIZE(%edx), %xmm0
	movsd	0 * SIZE(%ecx), %xmm2
	movhps	1 * SIZE(%ecx), %xmm2

	movaps	%xmm0, 0 * SIZE(%eax)
	movaps	%xmm2, 2 * SIZE(%eax)

	addl	$2 * SIZE, %edx
	addl	$2 * SIZE, %ecx
	ALIGN_4

/* Remainder of one element per line (bit 0 of n): goes to the tail.  */
.Lpair_one:
	movl	N, %ebx
	testl	$1, %ebx
	je	.Lpair_next

	movl	BOFFSET2, %eax

	movsd	0 * SIZE(%edx), %xmm0	/* low half  = line one */
	movhps	0 * SIZE(%ecx), %xmm0	/* high half = line two */
	movaps	%xmm0, 0 * SIZE(%eax)

	addl	$2 * SIZE, %eax
	movl	%eax, BOFFSET2		/* tail cursor += 2 */
	ALIGN_4

.Lpair_next:
	decl	J
	jg	.Lpair_lines
	ALIGN_4

/* ---- trailing single line, only when m is odd -------------------- */
.Lodd_line:
	movl	M, %eax
	movl	N, %ebx

	testb	$1, %al
	je	.Ldone

	sarl	$2, %ebx		/* groups of four elements */
	jle	.Lodd_two
	ALIGN_4

.Lodd_quad:
	movsd	0 * SIZE(%ebp), %xmm0
	movhps	1 * SIZE(%ebp), %xmm0
	movsd	2 * SIZE(%ebp), %xmm2
	movhps	3 * SIZE(%ebp), %xmm2

	addl	$4 * SIZE, %ebp

	movaps	%xmm0, 0 * SIZE(%edi)
	addl	%esi, %edi		/* next group sits 2 * m further on */
	movaps	%xmm2, 0 * SIZE(%edi)
	addl	%esi, %edi

	decl	%ebx
	jg	.Lodd_quad
	ALIGN_4

/* Remainder of two elements (bit 1 of n). */
.Lodd_two:
	movl	N, %ebx
	testl	$2, %ebx
	je	.Lodd_one

	movsd	0 * SIZE(%ebp), %xmm0
	movhps	1 * SIZE(%ebp), %xmm0
	movaps	%xmm0, 0 * SIZE(%edi)

	addl	$2 * SIZE, %ebp
	ALIGN_4

/* Remainder of one element (bit 0 of n): written to the tail area.   */
.Lodd_one:
	movl	N, %ebx
	testl	$1, %ebx
	je	.Ldone

	movl	BOFFSET2, %eax

	movsd	0 * SIZE(%ebp), %xmm0
	movsd	%xmm0, 0 * SIZE(%eax)
	ALIGN_4

/* Restore the saved registers in reverse push order and release the  */
/* scratch area.                                                      */
.Ldone:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	addl	$ARGS, %esp
	ret

	EPILOGUE