/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.
*/
/*********************************************************************/

/*
 * Strided dot-product kernel (PowerPC assembly, run through the C
 * preprocessor).
 *
 * Computes the sum over i = 0 .. N-1 of X[i] * Y[i], where consecutive
 * elements of X and Y are INCX and INCY elements apart.
 *
 * Inputs (see the register aliases below):
 *   r3  N     element count; N <= 0 yields a result of zero
 *   r4  X     address of the first vector
 *   r5  INCX  stride of X, in elements
 *   r6  Y     address of the second vector
 *   r7  INCY  stride of Y, in elements
 * When F_INTERFACE is defined, r3, r5 and r7 instead hold the ADDRESSES
 * of N, INCX and INCY and are dereferenced with LDINT.
 *
 * Output:
 *   f1  the accumulated sum (presumably the floating-point return
 *       register of the target ABI; confirm against the ABI in use).
 *
 * PROLOGUE, PROFCODE, EPILOGUE, LL(), SP, CTR, LDINT, LFDUX, FMADD, FADD,
 * SIZE and BASE_SHIFT all come from common.h; the entry symbol itself is
 * emitted by PROLOGUE and is not visible in this file.  The element width
 * is presumably selected there (single vs. double precision); verify
 * against common.h.
 *
 * NOTE(review): all integer arithmetic uses 32-bit word forms (slwi,
 * cmpwi, srawi, mullw), so this looks intended for 32-bit targets only;
 * confirm before building it for a 64-bit ABI.
 *
 * Every preprocessor directive must start its own line and nothing but
 * comments may sit between statements; otherwise the file cannot be
 * preprocessed or assembled.
 */

#define ASSEMBLER
#include "common.h"

/* Argument registers. */
#define N	r3
#define X	r4
#define INCX	r5
#define Y	r6
#define INCY	r7
/* Scratch: byte offset used as the dcbt prefetch distance. */
#define PRE	r8

/* f0 doubles as the zero constant and as the first accumulator. */
#define FZERO	f0

/* Frame: 80 bytes of FPR spill space plus a scratch word at offset 80. */
#define STACKSIZE 96

	PROLOGUE
	PROFCODE

	addi	SP, SP, -STACKSIZE
	li	r0,   0

	/*
	 * f14-f23 are used below as load targets, so they are spilled here
	 * and reloaded before returning.
	 */
	stfd	f14,    0(SP)
	stfd	f15,    8(SP)
	stfd	f16,   16(SP)
	stfd	f17,   24(SP)

	stfd	f18,   32(SP)
	stfd	f19,   40(SP)
	stfd	f20,   48(SP)
	stfd	f21,   56(SP)

	stfd	f22,   64(SP)
	stfd	f23,   72(SP)

	/* Materialise 0.0: store an integer zero and reload it as a float. */
	stw	r0,    80(SP)
	lfs	FZERO, 80(SP)

#ifdef F_INTERFACE
	/* Scalars arrive by reference: replace each address with its value. */
	LDINT	N,    0(N)
	LDINT	INCX, 0(INCX)
	LDINT	INCY, 0(INCY)
#endif

	/*
	 * Scale the strides by 2^BASE_SHIFT (presumably elements -> bytes;
	 * BASE_SHIFT is defined in common.h).
	 */
	slwi	INCX, INCX, BASE_SHIFT
	slwi	INCY, INCY, BASE_SHIFT

	/* Clear the remaining seven accumulators (f0 is already zero). */
	fmr	f1,  FZERO
	fmr	f2,  FZERO
	fmr	f3,  FZERO
	fmr	f4,  FZERO
	fmr	f5,  FZERO
	fmr	f6,  FZERO
	fmr	f7,  FZERO

	li	PRE, 3 * 16 * SIZE

	/* Nothing to do for N <= 0: reduce the zeroed accumulators. */
	cmpwi	cr0, N, 0
	ble-	LL(999)

#ifdef F_INTERFACE
	/*
	 * For a negative stride, move the base pointer by -(N - 1) * INC so
	 * that stepping by the (negative) stride N times stays inside the
	 * same N elements.
	 */
	cmpwi	cr0, INCX, 0
	bge+	LL(102)

	subi	r0, N, 1
	mullw	r0, r0, INCX
	sub	X, X, r0
	.align 4

LL(102):
	cmpwi	cr0, INCY, 0
	bge+	LL(104)

	subi	r0, N, 1
	mullw	r0, r0, INCY
	sub	Y, Y, r0
	.align 4

LL(104):
#endif
	/*
	 * Pre-bias both pointers by one stride: LFDUX is presumably the
	 * update-indexed load, which adds the stride to the pointer BEFORE
	 * loading, so the first load must land on element 0.
	 */
	sub	X, X, INCX
	sub	Y, Y, INCY

	/* CTR = N / 16: number of 16-element blocks. */
	srawi.	r0, N, 4
	mtspr	CTR,  r0
	beq-	LL(150)

	/* Prime the pipeline with the first eight x/y pairs. */
	LFDUX	f8,    X, INCX
	LFDUX	f16,   Y, INCY
	LFDUX	f9,    X, INCX
	LFDUX	f17,   Y, INCY

	LFDUX	f10,   X, INCX
	LFDUX	f18,   Y, INCY
	LFDUX	f11,   X, INCX
	LFDUX	f19,   Y, INCY

	LFDUX	f12,   X, INCX
	LFDUX	f20,   Y, INCY
	LFDUX	f13,   X, INCX
	LFDUX	f21,   Y, INCY

	LFDUX	f14,   X, INCX
	LFDUX	f22,   Y, INCY
	LFDUX	f15,   X, INCX
	LFDUX	f23,   Y, INCY
	/* Only one block in total: go straight to the drain code. */
	bdz	LL(120)
	.align 4

/*
 * Steady state: 16 elements per iteration.  Each FMADD consumes one x/y
 * pair into one of the eight accumulators f0-f7, and the two registers it
 * just read are immediately refilled with the pair eight elements further
 * on.  With PPCG4 defined, dcbt touches the data PRE bytes ahead.
 * The instruction order is hand-scheduled; do not reorder.
 */
LL(110):
	FMADD	f0,  f8,  f16, f0
	LFDUX	f8,    X, INCX
	LFDUX	f16,   Y, INCY
#ifdef PPCG4
	dcbt	X, PRE
#endif
	FMADD	f1,  f9,  f17, f1
	LFDUX	f9,    X, INCX
	LFDUX	f17,   Y, INCY
	FMADD	f2,  f10, f18, f2
	LFDUX	f10,   X, INCX
	LFDUX	f18,   Y, INCY
#ifdef PPCG4
	dcbt	Y, PRE
#endif
	FMADD	f3,  f11, f19, f3
	LFDUX	f11,   X, INCX
	LFDUX	f19,   Y, INCY

	FMADD	f4,  f12, f20, f4
	LFDUX	f12,   X, INCX
	LFDUX	f20,   Y, INCY
#if defined(PPCG4) && defined(DOUBLE)
	dcbt	X, PRE
#endif
	FMADD	f5,  f13, f21, f5
	LFDUX	f13,   X, INCX
	LFDUX	f21,   Y, INCY
	FMADD	f6,  f14, f22, f6
	LFDUX	f14,   X, INCX
	LFDUX	f22,   Y, INCY
#if defined(PPCG4) && defined(DOUBLE)
	dcbt	Y, PRE
#endif
	FMADD	f7,  f15, f23, f7
	LFDUX	f15,   X, INCX
	LFDUX	f23,   Y, INCY

	/* Second half of the block: same pattern on the next eight pairs. */
	FMADD	f0,  f8,  f16, f0
	LFDUX	f8,    X, INCX
	LFDUX	f16,   Y, INCY
#ifdef PPCG4
	dcbt	X, PRE
#endif
	FMADD	f1,  f9,  f17, f1
	LFDUX	f9,    X, INCX
	LFDUX	f17,   Y, INCY
	FMADD	f2,  f10, f18, f2
	LFDUX	f10,   X, INCX
	LFDUX	f18,   Y, INCY
#ifdef PPCG4
	dcbt	Y, PRE
#endif
	FMADD	f3,  f11, f19, f3
	LFDUX	f11,   X, INCX
	LFDUX	f19,   Y, INCY

	FMADD	f4,  f12, f20, f4
	LFDUX	f12,   X, INCX
	LFDUX	f20,   Y, INCY
#if defined(PPCG4) && defined(DOUBLE)
	dcbt	X, PRE
#endif
	FMADD	f5,  f13, f21, f5
	LFDUX	f13,   X, INCX
	LFDUX	f21,   Y, INCY
	FMADD	f6,  f14, f22, f6
	LFDUX	f14,   X, INCX
	LFDUX	f22,   Y, INCY
#if defined(PPCG4) && defined(DOUBLE)
	dcbt	Y, PRE
#endif
	FMADD	f7,  f15, f23, f7
	LFDUX	f15,   X, INCX
	LFDUX	f23,   Y, INCY
	bdnz	LL(110)
	.align 4

/*
 * Drain the final 16-element block: consume the eight pairs already in
 * flight while loading the last eight, then consume those with no further
 * loads so nothing is read past the end of the block.
 */
LL(120):
	FMADD	f0,  f8,  f16, f0
	LFDUX	f8,    X, INCX
	LFDUX	f16,   Y, INCY
	FMADD	f1,  f9,  f17, f1
	LFDUX	f9,    X, INCX
	LFDUX	f17,   Y, INCY
	FMADD	f2,  f10, f18, f2
	LFDUX	f10,   X, INCX
	LFDUX	f18,   Y, INCY
	FMADD	f3,  f11, f19, f3
	LFDUX	f11,   X, INCX
	LFDUX	f19,   Y, INCY

	FMADD	f4,  f12, f20, f4
	LFDUX	f12,   X, INCX
	LFDUX	f20,   Y, INCY
	FMADD	f5,  f13, f21, f5
	LFDUX	f13,   X, INCX
	LFDUX	f21,   Y, INCY
	FMADD	f6,  f14, f22, f6
	LFDUX	f14,   X, INCX
	LFDUX	f22,   Y, INCY
	FMADD	f7,  f15, f23, f7
	LFDUX	f15,   X, INCX
	LFDUX	f23,   Y, INCY

	FMADD	f0,  f8,  f16, f0
	FMADD	f1,  f9,  f17, f1
	FMADD	f2,  f10, f18, f2
	FMADD	f3,  f11, f19, f3
	FMADD	f4,  f12, f20, f4
	FMADD	f5,  f13, f21, f5
	FMADD	f6,  f14, f22, f6
	FMADD	f7,  f15, f23, f7
	.align 4

/* Remainder: the N mod 16 leftover elements, one per iteration, into f0. */
LL(150):
	andi.	r0,  N, 15
	mtspr	CTR, r0
	beq	LL(999)
	.align 4

LL(160):
	LFDUX	f8,    X, INCX
	LFDUX	f16,   Y, INCY
	FMADD	f0,  f8,  f16, f0
	bdnz	LL(160)
	.align 4

/* Pairwise reduction of the eight partial sums; the total ends up in f1. */
LL(999):
	FADD	f0,  f0,  f1
	FADD	f2,  f2,  f3
	FADD	f4,  f4,  f5
	FADD	f6,  f6,  f7

	FADD	f0,  f0,  f2
	FADD	f4,  f4,  f6
	FADD	f1,  f0,  f4

	/* Reload the spilled registers and release the frame. */
	lfd	f14,    0(SP)
	lfd	f15,    8(SP)
	lfd	f16,   16(SP)
	lfd	f17,   24(SP)

	lfd	f18,   32(SP)
	lfd	f19,   40(SP)
	lfd	f20,   48(SP)
	lfd	f21,   56(SP)

	lfd	f22,   64(SP)
	lfd	f23,   72(SP)

	addi	SP, SP, STACKSIZE
	blr

	EPILOGUE