/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * Strided minimum reduction (PowerPC, scalar FPU).
 *
 * Walks N elements of X, INCX elements apart, and leaves the smallest
 * value in f1.  If N <= 0 or INCX <= 0 the routine returns 0.0.
 *
 * Inputs (see the register aliases below):
 *   N    - element count; with F_INTERFACE it is a pointer that is
 *          dereferenced through LDINT.
 *   X    - address of the first element.
 *   INCX - stride in elements; with F_INTERFACE also passed by pointer.
 *          It is converted to a byte stride by shifting left BASE_SHIFT.
 *
 * How the minimum is formed: for an accumulator "acc" and element "x"
 *     fsub  d,   acc, x        d = acc - x
 *     fsel  acc, d,   x, acc   acc = (d >= 0.0) ? x : acc
 * fsel takes its last operand whenever d is negative or NaN, so a NaN
 * difference keeps the current accumulator.
 *
 * The main loop keeps eight independent accumulators (f0-f7) and two
 * banks of eight loaded elements (f16-f23 and f24-f31) so that loads,
 * subtracts and selects of different lanes overlap.  The instruction
 * order is hand-scheduled; do not reorder it casually.
 *
 * NOTE(review): LFDUX, LDINT, LL, SP, PROLOGUE, PROFCODE, EPILOGUE and
 * BASE_SHIFT come from common.h, which is not visible here.  LFDUX is
 * presumably a load-with-update (X += INCX, then load from X), which is
 * why X is biased by -INCX before the first load - confirm in common.h.
 */

#define ASSEMBLER
#include "common.h"

/* Integer argument registers. */
#define N	r3
#define X	r4
#define INCX	r5

/* PREA is defined but not referenced anywhere in this file. */
#define PREA	r8

/* Shares f1 with the result, so the early-exit path returns 0.0. */
#define FZERO	f1

/* 18 saved FPRs (144 bytes) + scratch word at offset 144, padded. */
#define STACKSIZE 160

	PROLOGUE
	PROFCODE

	/* Open a private frame and save f14-f31, which are used below. */
	addi	SP, SP, -STACKSIZE
	li	r0,   0

	stfd	f14,    0(SP)
	stfd	f15,    8(SP)
	stfd	f16,   16(SP)
	stfd	f17,   24(SP)

	stfd	f18,   32(SP)
	stfd	f19,   40(SP)
	stfd	f20,   48(SP)
	stfd	f21,   56(SP)

	stfd	f22,   64(SP)
	stfd	f23,   72(SP)
	stfd	f24,   80(SP)
	stfd	f25,   88(SP)

	stfd	f26,   96(SP)
	stfd	f27,  104(SP)
	stfd	f28,  112(SP)
	stfd	f29,  120(SP)

	stfd	f30,  128(SP)
	stfd	f31,  136(SP)

	/* Materialise 0.0 in FZERO (= f1) by bouncing an integer zero   */
	/* through the scratch slot; this is the value returned when the */
	/* argument checks below fail.                                   */
	stw	r0,   144(SP)
	lfs	FZERO,144(SP)

#ifdef F_INTERFACE
	/* Fortran-style interface: N and INCX arrive by reference. */
	LDINT	N,    0(N)
	LDINT	INCX, 0(INCX)
#endif

	/* Element stride -> byte stride. */
	slwi	INCX, INCX, BASE_SHIFT

	/* Bias X back by one stride so the first update-form load lands */
	/* on the first element.                                         */
	sub	X, X, INCX

	/* Nothing to do for N <= 0 or INCX <= 0: return FZERO. */
	cmpwi	cr0, N, 0
	ble-	LL(9999)
	cmpwi	cr0, INCX, 0
	ble-	LL(9999)

	/* Seed all eight accumulators with the first element. */
	LFDUX	f1, X, INCX

	fmr	f0, f1
	subi	N, N, 1
	fmr	f2, f1
	fmr	f3, f1
	fmr	f4, f1
	fmr	f5, f1
	/* CTR = (N - 1) / 16 : number of 16-element groups. */
	srawi.	r0, N, 4
	fmr	f6, f1
	mtspr	CTR,  r0
	fmr	f7, f1
	beq-	LL(150)

	/* Pipeline prologue: load bank A (f16-f23) ... */
	LFDUX	f16, X, INCX
	LFDUX	f17, X, INCX
	LFDUX	f18, X, INCX
	LFDUX	f19, X, INCX
	LFDUX	f20, X, INCX
	LFDUX	f21, X, INCX
	LFDUX	f22, X, INCX
	LFDUX	f23, X, INCX

	/* ... then load bank B (f24-f31) while computing acc - bankA. */
	LFDUX	f24, X, INCX
	fsub	f8,  f0,  f16
	LFDUX	f25, X, INCX
	fsub	f9,  f1,  f17
	LFDUX	f26, X, INCX
	fsub	f10, f2,  f18
	LFDUX	f27, X, INCX
	fsub	f11, f3,  f19
	LFDUX	f28, X, INCX
	fsub	f12, f4,  f20
	LFDUX	f29, X, INCX
	fsub	f13, f5,  f21
	LFDUX	f30, X, INCX
	fsub	f14, f6,  f22
	LFDUX	f31, X, INCX
	fsub	f15, f7,  f23
	/* Only one group in total: skip the steady-state loop. */
	bdz	LL(120)
	.align 4

	/* Steady state: each pass folds the 16 elements already loaded  */
	/* and loads the next 16.  Per lane: commit the pending select,  */
	/* reload the register just consumed, then start the subtract    */
	/* against the other bank.                                       */
LL(110):
	/* First half: fold bank A, refill bank A, diff against bank B. */
	fsel	f0,  f8,  f16, f0
	LFDUX	f16, X, INCX
	fsub	f8,  f0,  f24
	fsel	f1,  f9,  f17, f1
	LFDUX	f17, X, INCX
	fsub	f9,  f1,  f25
	fsel	f2,  f10, f18, f2
	LFDUX	f18, X, INCX
	fsub	f10, f2,  f26
	fsel	f3,  f11, f19, f3
	LFDUX	f19, X, INCX
	fsub	f11, f3,  f27

	fsel	f4,  f12, f20, f4
	LFDUX	f20, X, INCX
	fsub	f12, f4,  f28
	fsel	f5,  f13, f21, f5
	LFDUX	f21, X, INCX
	fsub	f13, f5,  f29
	fsel	f6,  f14, f22, f6
	LFDUX	f22, X, INCX
	fsub	f14, f6,  f30
	fsel	f7,  f15, f23, f7
	LFDUX	f23, X, INCX
	fsub	f15, f7,  f31

	/* Second half: fold bank B, refill bank B, diff against bank A. */
	fsel	f0,  f8,  f24, f0
	LFDUX	f24, X, INCX
	fsub	f8,  f0,  f16
	fsel	f1,  f9,  f25, f1
	LFDUX	f25, X, INCX
	fsub	f9,  f1,  f17
	fsel	f2,  f10, f26, f2
	LFDUX	f26, X, INCX
	fsub	f10, f2,  f18
	fsel	f3,  f11, f27, f3
	LFDUX	f27, X, INCX
	fsub	f11, f3,  f19

	fsel	f4,  f12, f28, f4
	LFDUX	f28, X, INCX
	fsub	f12, f4,  f20
	fsel	f5,  f13, f29, f5
	LFDUX	f29, X, INCX
	fsub	f13, f5,  f21
	fsel	f6,  f14, f30, f6
	LFDUX	f30, X, INCX
	fsub	f14, f6,  f22
	fsel	f7,  f15, f31, f7
	LFDUX	f31, X, INCX
	fsub	f15, f7,  f23
	bdnz	LL(110)
	.align 4

	/* Pipeline epilogue: fold the last 16 loaded elements, no loads. */
LL(120):
	fsel	f0,  f8,  f16, f0
	fsub	f8,  f0,  f24
	fsel	f1,  f9,  f17, f1
	fsub	f9,  f1,  f25
	fsel	f2,  f10, f18, f2
	fsub	f10, f2,  f26
	fsel	f3,  f11, f19, f3
	fsub	f11, f3,  f27

	fsel	f4,  f12, f20, f4
	fsub	f12, f4,  f28
	fsel	f5,  f13, f21, f5
	fsub	f13, f5,  f29
	fsel	f6,  f14, f22, f6
	fsub	f14, f6,  f30
	fsel	f7,  f15, f23, f7
	fsub	f15, f7,  f31

	fsel	f0,  f8,  f24, f0
	fsel	f1,  f9,  f25, f1
	fsel	f2,  f10, f26, f2
	fsel	f3,  f11, f27, f3
	fsel	f4,  f12, f28, f4
	fsel	f5,  f13, f29, f5
	fsel	f6,  f14, f30, f6
	fsel	f7,  f15, f31, f7
	.align 4

	/* Remainder: (N - 1) mod 16 elements, one at a time into f1. */
LL(150):
	andi.	r0,  N, 15
	mtspr	CTR, r0
	beq	LL(999)
	.align 4

LL(160):
	LFDUX	f8,    X, INCX
	fsub	f16, f1, f8
	fsel	f1, f16, f8, f1
	bdnz	LL(160)
	.align 4

	/* Tree-reduce the eight accumulators: 8 -> 4 -> 2 -> 1. */
LL(999):
	fsub	f8,  f0,  f1
	fsub	f9,  f2,  f3
	fsub	f10, f4,  f5
	fsub	f11, f6,  f7

	fsel	f0,  f8,  f1,  f0
	fsel	f2,  f9,  f3,  f2
	fsel	f4,  f10, f5,  f4
	fsel	f6,  f11, f7,  f6

	fsub	f8,  f0,  f2
	fsub	f9,  f4,  f6
	fsel	f0,  f8,  f2,  f0
	fsel	f4,  f9,  f6,  f4

	/* Final select writes the result into f1. */
	fsub	f8,  f0,  f4
	fsel	f1,  f8,  f4,  f0
	.align 4

	/* Common exit: restore f14-f31, drop the frame, return f1. */
LL(9999):
	lfd	f14,    0(SP)
	lfd	f15,    8(SP)
	lfd	f16,   16(SP)
	lfd	f17,   24(SP)

	lfd	f18,   32(SP)
	lfd	f19,   40(SP)
	lfd	f20,   48(SP)
	lfd	f21,   56(SP)

	lfd	f22,   64(SP)
	lfd	f23,   72(SP)
	lfd	f24,   80(SP)
	lfd	f25,   88(SP)

	lfd	f26,   96(SP)
	lfd	f27,  104(SP)
	lfd	f28,  112(SP)
	lfd	f29,  120(SP)

	lfd	f30,  128(SP)
	lfd	f31,  136(SP)

	addi	SP, SP, STACKSIZE
	blr

	EPILOGUE