/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/*
 * Square root of a sum of squares over a strided vector.
 *
 * Every value loaded from X is squared with fsmuld (single x single ->
 * double product), the products are summed in double precision with
 * faddd, and fsqrtd is applied to the total.  Two consecutive values
 * ([X + 0 * SIZE] and [X + 1 * SIZE]) are read per element; presumably
 * these are the real and imaginary parts of a single-precision complex
 * number - confirm against the build rule that instantiates this file.
 *
 * Inputs:
 *   N    (%i0)  element count; no element is loaded when N <= 0
 *   X    (%i1)  address of the first element, advanced as data is read
 *   INCX (%i2)  stride; shifted left by ZBASE_SHIFT before being added
 *               to X; no element is loaded when INCX <= 0
 * Output:
 *   c1 (%f0) holds the result at both return sites.
 *
 * Two code paths exist: a contiguous path (scaled INCX == 2 * SIZE) that
 * uses immediate offsets and prefetch, and a general strided path at
 * .LL50 that steps X by INCX.  Both process four elements (eight values)
 * per main-loop iteration and finish with a one-element-at-a-time tail.
 *
 * NOTE(review): PROLOGUE, SAVESP, FCLR, FMOV, SIZE, ZBASE_SHIFT and
 * EPILOGUE are defined outside this file.  c1-c4 and t1-t4 are consumed
 * by faddd as double-precision registers, so confirm that FCLR(0) and
 * FMOV initialise the full register pair and not only the upper half.
 */

/* Integer registers: arguments plus the loop counter I. */
#define N	%i0
#define X	%i1
#define INCX	%i2
#define I	%i3

/* c1-c4: partial sums.  t1-t4: products not yet added to a partial sum. */
#define c1	%f0
#define c2	%f2
#define c3	%f4
#define c4	%f6
#define t1	%f8
#define t2	%f10
#define t3	%f12
#define t4	%f14

/* a1-a8: values loaded from X, used as fsmuld source operands. */
#define a1	%f16
#define a2	%f18
#define a3	%f20
#define a4	%f22
#define a5	%f24
#define a6	%f26
#define a7	%f28
#define a8	%f30

	PROLOGUE
	SAVESP

	/* Initialise c1 via FCLR, then copy it into every other sum/product. */
	FCLR(0)

	FMOV	c1, c2
	FMOV	c1, c3
	FMOV	c1, c4
	FMOV	c1, t1
	FMOV	c1, t2
	FMOV	c1, t3
	FMOV	c1, t4

	/* INCX <= 0: return immediately with c1 as initialised above. */
	cmp	INCX, 0
	ble	.LL20
	/* Delay slot (not annulled, so it runs on both outcomes): scale INCX. */
	sll	INCX, ZBASE_SHIFT, INCX

	/* N <= 0: same early return. */
	cmp	N, 0
	ble	.LL20
	nop

	/* Scaled stride other than 2 * SIZE: take the general strided path. */
	cmp	INCX, 2 * SIZE
	bne	.LL50
	nop

	/* ---- Contiguous path ------------------------------------------- */

	/* I = N / 4 main-loop iterations; skip to the tail when there are none. */
	sra	N, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL15
	nop

	/*
	 * Preload the first four elements.  I is decremented and compared here
	 * so the condition codes are ready for the branch after the loads.
	 */
	ld	[X +  0 * SIZE], a1
	add	I, -1, I
	ld	[X +  1 * SIZE], a2
	cmp	I, 0
	ld	[X +  2 * SIZE], a3
	ld	[X +  3 * SIZE], a4
	ld	[X +  4 * SIZE], a5
	ld	[X +  5 * SIZE], a6
	ld	[X +  6 * SIZE], a7
	ld	[X +  7 * SIZE], a8

	/* Only one group of four: go straight to the drain code at .LL12. */
	ble,pt	%icc, .LL12
	/* Delay slot: step X past the eight values just loaded. */
	add	X, 8 * SIZE, X

/* Prefetch distance ahead of X, in units of SIZE. */
#define PREFETCHSIZE 40

	/*
	 * Software-pipelined main loop.  Each faddd folds in the product left
	 * in tN by the previous fsmuld on that register; each fsmuld squares a
	 * value loaded during the previous iteration; each ld refills an aK
	 * register only after the fsmuld that consumed it.
	 */
.LL11:
	faddd	c1, t1, c1
	fsmuld	a1, a1, t1
	prefetch [X  + PREFETCHSIZE * SIZE], 0

	faddd	c2, t2, c2
	add	I, -1, I
	fsmuld	a2, a2, t2
	ld	[X +  0 * SIZE], a1

	faddd	c3, t3, c3
	/* Sets the condition codes used by the bg at the bottom of the loop. */
	cmp	I, 0
	fsmuld	a3, a3, t3
	ld	[X +  1 * SIZE], a2

	faddd	c4, t4, c4
	fsmuld	a4, a4, t4
	ld	[X +  2 * SIZE], a3

	faddd	c1, t1, c1
	fsmuld	a5, a5, t1
	ld	[X +  3 * SIZE], a4

	faddd	c2, t2, c2
	fsmuld	a6, a6, t2
	ld	[X +  4 * SIZE], a5

	faddd	c3, t3, c3
	fsmuld	a7, a7, t3
	ld	[X +  5 * SIZE], a6

	faddd	c4, t4, c4
	ld	[X +  6 * SIZE], a7
	fsmuld	a8, a8, t4
	add	X, 8 * SIZE, X

	bg,pt	%icc, .LL11
	/* Delay slot: X has already advanced, so -1 * SIZE is the eighth value. */
	ld	[X -  1 * SIZE], a8

	/*
	 * Drain: square the eight values still held in a1-a8.  The last four
	 * products stay pending in t1-t4 and are added later.
	 */
.LL12:
	faddd	c1, t1, c1
	fsmuld	a1, a1, t1
	faddd	c2, t2, c2
	fsmuld	a2, a2, t2

	faddd	c3, t3, c3
	fsmuld	a3, a3, t3
	faddd	c4, t4, c4
	fsmuld	a4, a4, t4

	faddd	c1, t1, c1
	fsmuld	a5, a5, t1
	faddd	c2, t2, c2
	fsmuld	a6, a6, t2

	faddd	c3, t3, c3
	fsmuld	a7, a7, t3
	faddd	c4, t4, c4
	fsmuld	a8, a8, t4

	/* Tail: I = N mod 4 remaining elements, handled one per iteration. */
.LL15:
	and	N, 3, I
	cmp	I, 0
	ble,a,pn %icc, .LL19
	nop

.LL16:
	ld	[X +  0 * SIZE], a1
	add	I, -1, I
	ld	[X +  1 * SIZE], a2
	cmp	I, 0
	/* Add the pending products before t1/t2 are overwritten below. */
	faddd	c1, t1, c1
	faddd	c2, t2, c2
	fsmuld	a1, a1, t1
	fsmuld	a2, a2, t2
	bg,pt	%icc, .LL16
	/* Delay slot: advance to the next contiguous element. */
	add	X, 2 * SIZE, X

	/* Add the last pending products, then combine the four partial sums. */
.LL19:
	faddd	c1, t1, c1
	faddd	c2, t2, c2
	faddd	c3, t3, c3
	faddd	c4, t4, c4

	faddd	c1, c2, c1
	faddd	c3, c4, c3
	faddd	c1, c3, c1

	fsqrtd	c1, c1

	/*
	 * Narrow the result to single precision unless both NEED_F2CCONV and
	 * F_INTERFACE_F2C are defined, in which case it is left as a double.
	 */
#if !defined(NEED_F2CCONV) || !defined(F_INTERFACE_F2C)
	fdtos	c1, c1
#endif
	/* Also the target of the early exits for INCX <= 0 and N <= 0. */
.LL20:
	return	%i7 + 8
	/*
	 * Delay slot: clr of %g0, which is hardwired to zero on SPARC, so this
	 * only fills the slot.  NOTE(review): the second return site at the end
	 * of the strided path uses clr %o0 here instead - confirm whether the
	 * difference is intended.
	 */
	clr	%g0

	/* ---- General strided path -------------------------------------- */

	/* Same structure as the contiguous path; X is stepped by INCX. */
.LL50:
	sra	N, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL55
	nop

	/* Preload four elements, advancing X by INCX after each pair of loads. */
	ld	[X +  0 * SIZE], a1
	ld	[X +  1 * SIZE], a2
	add	X, INCX, X
	ld	[X +  0 * SIZE], a3
	ld	[X +  1 * SIZE], a4
	add	X, INCX, X
	ld	[X +  0 * SIZE], a5
	ld	[X +  1 * SIZE], a6
	add	X, INCX, X
	add	I, -1, I
	ld	[X +  0 * SIZE], a7
	cmp	I, 0
	ld	[X +  1 * SIZE], a8

	/* Only one group of four: go straight to the drain code at .LL52. */
	ble,pt	%icc, .LL52
	/* Delay slot: step past the fourth preloaded element. */
	add	X, INCX, X

	/*
	 * Pipelined main loop: add the previous product, square the value
	 * loaded last iteration, then reload that register for the next one.
	 */
.LL51:
	faddd	c1, t1, c1
	add	I, -1, I
	fsmuld	a1, a1, t1
	ld	[X +  0 * SIZE], a1

	faddd	c2, t2, c2
	/* Sets the condition codes used by the bg at the bottom of the loop. */
	cmp	I, 0
	fsmuld	a2, a2, t2
	ld	[X +  1 * SIZE], a2
	add	X, INCX, X

	faddd	c3, t3, c3
	fsmuld	a3, a3, t3
	ld	[X +  0 * SIZE], a3

	faddd	c4, t4, c4
	fsmuld	a4, a4, t4
	ld	[X +  1 * SIZE], a4
	add	X, INCX, X

	faddd	c1, t1, c1
	fsmuld	a5, a5, t1
	ld	[X +  0 * SIZE], a5

	faddd	c2, t2, c2
	fsmuld	a6, a6, t2
	ld	[X +  1 * SIZE], a6
	add	X, INCX, X

	faddd	c3, t3, c3
	fsmuld	a7, a7, t3
	ld	[X +  0 * SIZE], a7

	faddd	c4, t4, c4
	fsmuld	a8, a8, t4
	ld	[X +  1 * SIZE], a8
	bg,pt	%icc, .LL51
	/* Delay slot: step past the fourth element of this group. */
	add	X, INCX, X

	/* Drain the eight values still held in a1-a8. */
.LL52:
	faddd	c1, t1, c1
	fsmuld	a1, a1, t1
	faddd	c2, t2, c2
	fsmuld	a2, a2, t2

	faddd	c3, t3, c3
	fsmuld	a3, a3, t3
	faddd	c4, t4, c4
	fsmuld	a4, a4, t4

	faddd	c1, t1, c1
	fsmuld	a5, a5, t1
	faddd	c2, t2, c2
	fsmuld	a6, a6, t2

	faddd	c3, t3, c3
	fsmuld	a7, a7, t3
	faddd	c4, t4, c4
	fsmuld	a8, a8, t4

	/* Tail: I = N mod 4 remaining elements, handled one per iteration. */
.LL55:
	and	N, 3, I
	cmp	I, 0
	ble,a,pn %icc, .LL59
	nop

.LL56:
	ld	[X +  0 * SIZE], a1
	add	I, -1, I
	ld	[X +  1 * SIZE], a2
	cmp	I, 0
	/* Add the pending products before t1/t2 are overwritten below. */
	faddd	c1, t1, c1
	faddd	c2, t2, c2
	fsmuld	a1, a1, t1
	fsmuld	a2, a2, t2
	bg,pt	%icc, .LL56
	/* Delay slot: advance to the next element. */
	add	X, INCX, X

	/* Add the last pending products, then combine the four partial sums. */
.LL59:
	faddd	c1, t1, c1
	faddd	c2, t2, c2
	faddd	c3, t3, c3
	faddd	c4, t4, c4

	faddd	c1, c2, c1
	faddd	c3, c4, c3
	faddd	c1, c3, c1

	fsqrtd	c1, c1

	/* Same conditional narrowing to single precision as on the other path. */
#if !defined(NEED_F2CCONV) || !defined(F_INTERFACE_F2C)
	fdtos	c1, c1
#endif

	return	%i7 + 8
	/* Delay slot: zeroes %o0 (the other return site uses clr %g0). */
	clr	%o0

	EPILOGUE