/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.
*/ 37/*********************************************************************/ 38 39#define ASSEMBLER 40#include "common.h" 41 42#ifndef WINDOWS_ABI 43 44#define M ARG1 /* rdi */ 45#define N ARG2 /* rsi */ 46#define A ARG3 /* rdx */ 47#define LDA ARG4 /* rcx */ 48#define B ARG5 /* r8 */ 49 50#define I %r10 51#define J %rbp 52 53#define AO1 %r9 54#define AO2 %r15 55#define AO3 %r11 56#define AO4 %r14 57#define BO1 %r13 58#define M8 %rbx 59#define BO %rax 60 61#else 62 63#define STACKSIZE 256 64 65#define M ARG1 /* rcx */ 66#define N ARG2 /* rdx */ 67#define A ARG3 /* r8 */ 68#define LDA ARG4 /* r9 */ 69#define OLD_B 40 + 64 + STACKSIZE(%rsp) 70 71#define B %rdi 72 73#define I %r10 74#define J %r11 75 76#define AO1 %r12 77#define AO2 %r13 78#define AO3 %r14 79#define AO4 %r15 80 81#define BO1 %rsi 82#define M8 %rbp 83#define BO %rax 84 85#endif 86 87 PROLOGUE 88 PROFCODE 89 90#ifdef WINDOWS_ABI 91 pushq %rdi 92 pushq %rsi 93#endif 94 pushq %r15 95 pushq %r14 96 pushq %r13 97 pushq %r12 98 pushq %rbp 99 pushq %rbx 100 101#ifdef WINDOWS_ABI 102 subq $STACKSIZE, %rsp 103 104 vmovups %xmm6, 0(%rsp) 105 vmovups %xmm7, 16(%rsp) 106 vmovups %xmm8, 32(%rsp) 107 vmovups %xmm9, 48(%rsp) 108 vmovups %xmm10, 64(%rsp) 109 vmovups %xmm11, 80(%rsp) 110 vmovups %xmm12, 96(%rsp) 111 vmovups %xmm13, 112(%rsp) 112 vmovups %xmm14, 128(%rsp) 113 vmovups %xmm15, 144(%rsp) 114 115 movq OLD_B, B 116#endif 117 118 movq N, %rax 119 andq $-2, %rax 120 imulq M, %rax 121 122 leaq (B, %rax, SIZE), BO1 123 124 leaq (, LDA, SIZE), LDA 125 leaq (, M, SIZE), M8 126 127 movq M, J 128 sarq $1, J 129 jle .L20 130 ALIGN_4 131 132.L01: 133 movq A, AO1 134 leaq (A, LDA ), AO2 135 leaq (A, LDA, 2), A 136 137 movq B, BO 138 addq $4 * SIZE, B 139 140 movq N, I 141 sarq $3, I 142 jle .L10 143 ALIGN_4 144 145 146.L08: 147#ifndef DOUBLE 148 149 vmovsd 0 * SIZE(AO1), %xmm0 150 vmovsd 2 * SIZE(AO1), %xmm2 151 vmovsd 4 * SIZE(AO1), %xmm4 152 vmovsd 6 * SIZE(AO1), %xmm6 153 vmovsd 0 * SIZE(AO2), %xmm1 154 vmovsd 2 * 
SIZE(AO2), %xmm3 155 vmovsd 4 * SIZE(AO2), %xmm5 156 vmovsd 6 * SIZE(AO2), %xmm7 157 158 vmovsd %xmm0, 0 * SIZE(BO) 159 vmovsd %xmm1, 2 * SIZE(BO) 160 leaq (BO, M8, 2), BO 161 162 vmovsd %xmm2, 0 * SIZE(BO) 163 vmovsd %xmm3, 2 * SIZE(BO) 164 leaq (BO, M8, 2), BO 165 166 vmovsd %xmm4, 0 * SIZE(BO) 167 vmovsd %xmm5, 2 * SIZE(BO) 168 leaq (BO, M8, 2), BO 169 170 vmovsd %xmm6, 0 * SIZE(BO) 171 vmovsd %xmm7, 2 * SIZE(BO) 172 leaq (BO, M8, 2), BO 173 174 175#else 176 177 prefetchnta 256(AO1) 178 prefetchnta 256(AO2) 179 vmovups 0 * SIZE(AO1), %xmm0 180 vmovups 2 * SIZE(AO1), %xmm2 181 vmovups 4 * SIZE(AO1), %xmm4 182 vmovups 6 * SIZE(AO1), %xmm6 183 vmovups 0 * SIZE(AO2), %xmm1 184 vmovups 2 * SIZE(AO2), %xmm3 185 vmovups 4 * SIZE(AO2), %xmm5 186 vmovups 6 * SIZE(AO2), %xmm7 187 188 vmovups %xmm0, 0 * SIZE(BO) 189 vmovups %xmm1, 2 * SIZE(BO) 190 leaq (BO, M8, 2), BO 191 192 vmovups %xmm2, 0 * SIZE(BO) 193 vmovups %xmm3, 2 * SIZE(BO) 194 leaq (BO, M8, 2), BO 195 196 vmovups %xmm4, 0 * SIZE(BO) 197 vmovups %xmm5, 2 * SIZE(BO) 198 leaq (BO, M8, 2), BO 199 200 vmovups %xmm6, 0 * SIZE(BO) 201 vmovups %xmm7, 2 * SIZE(BO) 202 leaq (BO, M8, 2), BO 203 204#endif 205 206 addq $8 * SIZE, AO1 207 addq $8 * SIZE, AO2 208 decq I 209 jg .L08 210 ALIGN_4 211 212 213 214.L10: 215 testq $4, N 216 jle .L12 217#ifndef DOUBLE 218 219 vmovsd 0 * SIZE(AO1), %xmm0 220 vmovsd 2 * SIZE(AO1), %xmm2 221 vmovsd 0 * SIZE(AO2), %xmm1 222 vmovsd 2 * SIZE(AO2), %xmm3 223 224 vmovsd %xmm0, 0 * SIZE(BO) 225 vmovsd %xmm1, 2 * SIZE(BO) 226 leaq (BO, M8, 2), BO 227 228 vmovsd %xmm2, 0 * SIZE(BO) 229 vmovsd %xmm3, 2 * SIZE(BO) 230 leaq (BO, M8, 2), BO 231 232 233#else 234 235 vmovups 0 * SIZE(AO1), %xmm0 236 vmovups 2 * SIZE(AO1), %xmm2 237 vmovups 0 * SIZE(AO2), %xmm1 238 vmovups 2 * SIZE(AO2), %xmm3 239 240 vmovups %xmm0, 0 * SIZE(BO) 241 vmovups %xmm1, 2 * SIZE(BO) 242 leaq (BO, M8, 2), BO 243 244 vmovups %xmm2, 0 * SIZE(BO) 245 vmovups %xmm3, 2 * SIZE(BO) 246 leaq (BO, M8, 2), BO 247 248#endif 249 250 
addq $4 * SIZE, AO1 251 addq $4 * SIZE, AO2 252 ALIGN_4 253 254 255.L12: 256 testq $2, N 257 jle .L14 258#ifndef DOUBLE 259 vmovsd 0 * SIZE(AO1), %xmm0 260 vmovsd 0 * SIZE(AO2), %xmm1 261 262 vmovsd %xmm0, 0 * SIZE(BO) 263 vmovsd %xmm1, 2 * SIZE(BO) 264#else 265 vmovups 0 * SIZE(AO1), %xmm0 266 vmovups 0 * SIZE(AO2), %xmm1 267 268 vmovups %xmm0, 0 * SIZE(BO) 269 vmovups %xmm1, 2 * SIZE(BO) 270#endif 271 272 leaq (BO, M8, 2), BO 273 addq $2 * SIZE, AO1 274 addq $2 * SIZE, AO2 275 ALIGN_4 276 277.L14: 278 testq $1, N 279 jle .L19 280 281#ifndef DOUBLE 282 vmovss 0 * SIZE(AO1), %xmm0 283 vmovss 0 * SIZE(AO2), %xmm1 284 285 vmovss %xmm0, 0 * SIZE(BO1) 286 vmovss %xmm1, 1 * SIZE(BO1) 287#else 288 vmovsd 0 * SIZE(AO1), %xmm0 289 vmovhpd 0 * SIZE(AO2), %xmm0 , %xmm0 290 291 vmovups %xmm0, 0 * SIZE(BO1) 292#endif 293 294 addq $2 * SIZE, BO1 295 ALIGN_4 296 297.L19: 298 decq J 299 jg .L01 300 ALIGN_4 301 302.L20: 303 testq $1, M 304 jle .L999 305 ALIGN_4 306 307.L31: 308 movq A, AO1 309 movq B, BO 310 311 movq N, I 312 sarq $1, I 313 jle .L33 314 ALIGN_4 315 316.L32: 317#ifndef DOUBLE 318 vmovsd 0 * SIZE(AO1), %xmm0 319 vmovsd %xmm0, 0 * SIZE(BO) 320#else 321 vmovups 0 * SIZE(AO1), %xmm0 322 vmovups %xmm0, 0 * SIZE(BO) 323#endif 324 325 addq $2 * SIZE, AO1 326 leaq (BO, M8, 2), BO 327 decq I 328 jg .L32 329 ALIGN_4 330 331.L33: 332 testq $1, N 333 jle .L999 334 335#ifndef DOUBLE 336 vmovss 0 * SIZE(AO1), %xmm0 337 vmovss %xmm0, 0 * SIZE(BO1) 338#else 339 vmovsd 0 * SIZE(AO1), %xmm0 340 vmovsd %xmm0, 0 * SIZE(BO1) 341#endif 342 addq $1 * SIZE, BO1 343 ALIGN_4 344 345.L999: 346#ifdef WINDOWS_ABI 347 vmovups 0(%rsp), %xmm6 348 vmovups 16(%rsp), %xmm7 349 vmovups 32(%rsp), %xmm8 350 vmovups 48(%rsp), %xmm9 351 vmovups 64(%rsp), %xmm10 352 vmovups 80(%rsp), %xmm11 353 vmovups 96(%rsp), %xmm12 354 vmovups 112(%rsp), %xmm13 355 vmovups 128(%rsp), %xmm14 356 vmovups 144(%rsp), %xmm15 357 358 addq $STACKSIZE, %rsp 359#endif 360 361 popq %rbx 362 popq %rbp 363 popq %r12 364 popq %r13 
365 popq %r14 366 popq %r15 367#ifdef WINDOWS_ABI 368 popq %rsi 369 popq %rdi 370#endif 371 372 ret 373 374 EPILOGUE 375