/*********************************************************************/
/* */
/* Optimized BLAS libraries */
/* By Kazushige Goto <kgoto@tacc.utexas.edu> */
/* */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE, */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION. */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of */
/* profits, interruption of business, or related expenses which may */
/* arise from use of Software or Documentation, including but not */
/* limited to those resulting from defects in Software and/or */
/* Documentation, or loss or inaccuracy of data of any kind.
*/
/*********************************************************************/

/*
 * Strided copy kernel (SPARC, preprocessed assembly).
 *
 * Copies N elements from X to Y, where every element is two consecutive
 * values of SIZE bytes each (presumably the real and imaginary parts of
 * a complex number, given the ZBASE_SHIFT scaling -- confirm against
 * common.h).
 *
 * Inputs (this routine's input registers):
 *   N    (%i0)  number of two-value elements to copy
 *   X    (%i1)  source address
 *   INCX (%i2)  source stride in elements; shifted left by ZBASE_SHIFT
 *               on entry (presumably turning it into a byte stride)
 *   Y    (%i3)  destination address
 *   INCY (%i4)  destination stride in elements; scaled like INCX
 *
 * Output:
 *   %o0 of the caller is cleared to 0 on every exit path.
 *
 * Scratch:
 *   I (%i5) loop counter, a1..a8 floating-point staging registers,
 *   integer condition codes.
 *
 * PROLOGUE, SAVESP, EPILOGUE, LDF, STF, SIZE and ZBASE_SHIFT are not
 * defined in this file; presumably they are supplied by common.h.
 *
 * NOTE(review): N <= 0 is not rejected explicitly.  N == 0 skips both
 * loops, but for a negative N the remainder "N & 3" can still be
 * positive, so the tail loop would run.  Presumably callers never pass
 * a negative N -- confirm.
 */

#define ASSEMBLER
#include "common.h"

/* Integer register roles. */
#define N	%i0
#define X	%i1
#define INCX	%i2
#define Y	%i3
#define INCY	%i4
#define I	%i5

/*
 * Floating-point staging registers.  The DOUBLE build uses the
 * even-numbered registers %f0..%f14, the other build %f0..%f7.
 */
#ifdef DOUBLE
#define a1	%f0
#define a2	%f2
#define a3	%f4
#define a4	%f6
#define a5	%f8
#define a6	%f10
#define a7	%f12
#define a8	%f14
#else
#define a1	%f0
#define a2	%f1
#define a3	%f2
#define a4	%f3
#define a5	%f4
#define a6	%f5
#define a7	%f6
#define a8	%f7
#endif

/* Prefetch distance, in units of SIZE, used by the contiguous loop. */
#define PREFETCHSIZE 32

	PROLOGUE
	SAVESP

	/* Scale both strides by ZBASE_SHIFT. */
	sll	INCX, ZBASE_SHIFT, INCX
	sll	INCY, ZBASE_SHIFT, INCY

	/*
	 * The contiguous path is taken only when both scaled strides equal
	 * 2 * SIZE (one element); anything else goes to the strided path.
	 */
	cmp	INCX, 2 * SIZE
	bne	.LL50
	nop
	cmp	INCY, 2 * SIZE
	bne	.LL50
	nop

	/* I = N / 4: number of four-element groups. */
	sra	N, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL15
	nop

/* Contiguous main loop: four elements (8 values) per iteration. */
.LL11:
	prefetch [X + PREFETCHSIZE * SIZE], 0
	prefetch [Y + PREFETCHSIZE * SIZE], 0

	LDF	[X + 0 * SIZE], a1
	LDF	[X + 1 * SIZE], a2
	LDF	[X + 2 * SIZE], a3
	LDF	[X + 3 * SIZE], a4
	LDF	[X + 4 * SIZE], a5
	LDF	[X + 5 * SIZE], a6
	LDF	[X + 6 * SIZE], a7
	LDF	[X + 7 * SIZE], a8

	/* Counter and pointer updates are interleaved with the stores. */
	STF	a1, [Y + 0 * SIZE]
	add	I, -1, I
	STF	a2, [Y + 1 * SIZE]
	cmp	I, 0
	STF	a3, [Y + 2 * SIZE]
	add	X, 8 * SIZE, X
	STF	a4, [Y + 3 * SIZE]
	STF	a5, [Y + 4 * SIZE]
	STF	a6, [Y + 5 * SIZE]
	STF	a7, [Y + 6 * SIZE]
	STF	a8, [Y + 7 * SIZE]

	bg,pt	%icc, .LL11
	add	Y, 8 * SIZE, Y		/* delay slot: advance destination */

/* Contiguous tail: I = N & 3 leftover elements. */
.LL15:
	and	N, 3, I
	cmp	I, 0
	ble,a,pn %icc, .LL19
	nop

/* One element (2 values) per iteration. */
.LL16:
	LDF	[X + 0 * SIZE], a1
	LDF	[X + 1 * SIZE], a2
	add	I, -1, I
	cmp	I, 0
	STF	a1, [Y + 0 * SIZE]
	add	X, 2 * SIZE, X
	STF	a2, [Y + 1 * SIZE]
	bg,pt	%icc, .LL16
	add	Y, 2 * SIZE, Y		/* delay slot: advance destination */

/*
 * Exit of the contiguous path.  The delay-slot instruction of "return"
 * runs after the caller's register window is back in place, so clearing
 * %o0 there sets the caller-visible result to 0.  This used to be
 * "clr %g0", which has no effect because %g0 always reads as zero; it
 * now matches the strided exit at .LL59.
 */
.LL19:
	return	%i7 + 8
	clr	%o0

/* Strided path: I = N / 4 four-element groups. */
.LL50:
	sra	N, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL55
	nop

/*
 * Strided main loop: four elements per iteration.  X advances by INCX
 * after each element is loaded, Y by INCY after each element is stored.
 */
.LL51:
	LDF	[X + 0 * SIZE], a1
	LDF	[X + 1 * SIZE], a2
	add	X, INCX, X
	LDF	[X + 0 * SIZE], a3
	LDF	[X + 1 * SIZE], a4
	add	X, INCX, X
	LDF	[X + 0 * SIZE], a5
	LDF	[X + 1 * SIZE], a6
	add	X, INCX, X
	LDF	[X + 0 * SIZE], a7
	LDF	[X + 1 * SIZE], a8
	add	X, INCX, X

	STF	a1, [Y + 0 * SIZE]
	add	I, -1, I
	STF	a2, [Y + 1 * SIZE]
	add	Y, INCY, Y
	cmp	I, 0
	STF	a3, [Y + 0 * SIZE]
	STF	a4, [Y + 1 * SIZE]
	add	Y, INCY, Y
	STF	a5, [Y + 0 * SIZE]
	STF	a6, [Y + 1 * SIZE]
	add	Y, INCY, Y
	STF	a7, [Y + 0 * SIZE]
	STF	a8, [Y + 1 * SIZE]

	bg,pt	%icc, .LL51
	add	Y, INCY, Y		/* delay slot: step past the 4th element */

/* Strided tail: I = N & 3 leftover elements. */
.LL55:
	and	N, 3, I
	cmp	I, 0
	ble,a,pn %icc, .LL59
	nop

/* One element (2 values) per iteration. */
.LL56:
	LDF	[X + 0 * SIZE], a1
	LDF	[X + 1 * SIZE], a2
	add	I, -1, I
	cmp	I, 0
	add	X, INCX, X
	STF	a1, [Y + 0 * SIZE]
	STF	a2, [Y + 1 * SIZE]
	bg,pt	%icc, .LL56
	add	Y, INCY, Y		/* delay slot: advance destination */

/* Exit of the strided path: result 0 in the caller's %o0. */
.LL59:
	return	%i7 + 8
	clr	%o0

	EPILOGUE