/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/*
 * Position search over a vector of (re, im) pairs.
 *
 * For every element the key |re| + |im| is formed (FABS, FABS, FADD).
 * The routine returns the 1-based position of the element with the
 * largest key, or with the smallest key when USE_MIN is defined.
 *
 * In:   N    (%i0)  number of elements
 *       X    (%i1)  address of the first element
 *       INCX (%i2)  element stride; shifted left by ZBASE_SHIFT on
 *                   entry (presumably elements -> bytes; ZBASE_SHIFT
 *                   is supplied by common.h - confirm there)
 * Out:  %i0         position found, or 0 when N <= 0 or INCX <= 0
 *
 * Four independent accumulators (c1..c4 with positions v1..v4) are
 * kept in the grouped loops and merged at the end.
 *
 * NOTE(review): the conditional moves appear to fire only on a strict
 * greater/less result (FMOVG/movg, FMOVL/movl), so equal keys held in
 * different accumulators are resolved by accumulator order rather
 * than by position.  Confirm whether callers rely on a particular
 * choice among equal keys.
 *
 * PROLOGUE, SAVESP, FCLR, LDF, FABS, FADD, FMOV, FCMP, FMOVG, FMOVL,
 * SIZE and EPILOGUE all come from common.h.
 */

#define ASSEMBLER
#include "common.h"

/* Incoming arguments and the integer trip counter. */
#define N	%i0
#define X	%i1
#define INCX	%i2
#define I	%i3

/* v1..v4: position recorded for each accumulator.
   count:  position of the element (or group base) being examined. */
#define v1	%o0
#define v2	%o1
#define v3	%o2
#define v4	%o3
#define count	%o4

/* c1..c4: best key seen so far per accumulator.
   t1..t8: absolute values / keys of the group being examined.
   a1..a8: raw re/im values of the group being loaded.            */
#ifdef DOUBLE
#define c1	%f0
#define c2	%f2
#define c3	%f4
#define c4	%f6
#define t1	%f8
#define t2	%f10
#define t3	%f12
#define t4	%f14
#define t5	%f16
#define t6	%f18
#define t7	%f20
#define t8	%f22

#define a1	%f24
#define a2	%f26
#define a3	%f28
#define a4	%f30
#define a5	%f32
#define a6	%f34
#define a7	%f36
#define a8	%f38
#else
#define c1	%f0
#define c2	%f1
#define c3	%f2
#define c4	%f3
#define t1	%f4
#define t2	%f5
#define t3	%f6
#define t4	%f7
#define t5	%f8
#define t6	%f9
#define t7	%f10
#define t8	%f11

#define a1	%f12
#define a2	%f13
#define a3	%f14
#define a4	%f15
#define a5	%f16
#define a6	%f17
#define a7	%f18
#define a8	%f19
#endif

/* Direction of the search: "greater" by default, "less" for USE_MIN. */
#ifndef USE_MIN
#define FCMOV	FMOVG
#define CMOV	movg
#else
#define FCMOV	FMOVL
#define CMOV	movl
#endif

/* Look-ahead distance, in SIZE units, used by the prefetch in the
   contiguous grouped loop. */
#define PREFETCHSIZE 32

	PROLOGUE
	SAVESP

	FCLR(0)

	/* N <= 0: leave with 0 (v1 is cleared in the delay slot). */
	cmp	N, 0
	ble	.LL_done
	clr	v1

	/* INCX <= 0: leave with 0.  The shift in the delay slot is
	   executed on both paths. */
	cmp	INCX, 0
	ble	.LL_done
	sll	INCX, ZBASE_SHIFT, INCX

	mov	1, v1

	/* Seed with element 1: c1 = |re| + |im|.  The FADD sits in the
	   delay slot; if no elements remain, position 1 is returned. */
	LDF	[X + 0 * SIZE], c1
	LDF	[X + 1 * SIZE], c2
	add	N, -1, N
	FABS	c1, c1
	add	X, INCX, X
	FABS	c2, c2
	cmp	N, 0
	ble	.LL_done
	FADD	c1, c2, c1

	/* Every accumulator starts from element 1; the next element
	   to examine is at position 2. */
	FMOV	c1, c2
	mov	1, v2
	FMOV	c1, c3
	mov	1, v3
	FMOV	c1, c4
	mov	1, v4
	mov	2, count

	/* A scaled stride of 2 * SIZE means the pairs are adjacent in
	   memory; any other stride takes the strided path. */
	cmp	INCX, 2 * SIZE
	bne	.LL_strided
	nop

	/* ---------------- contiguous path ---------------- */

	/* I = number of four-element groups among the remaining N. */
	sra	N, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL_unit_tail
	nop

	/* Preload the first group: a1..a8 = four (re, im) pairs. */
	LDF	[X + 0 * SIZE], a1
	LDF	[X + 1 * SIZE], a2
	LDF	[X + 2 * SIZE], a3
	LDF	[X + 3 * SIZE], a4

	LDF	[X + 4 * SIZE], a5
	add	I, -1, I
	LDF	[X + 5 * SIZE], a6
	cmp	I, 0
	LDF	[X + 6 * SIZE], a7
	LDF	[X + 7 * SIZE], a8

	/* Only one group: skip straight to the drain step. */
	ble,pt	%icc, .LL_unit_drain
	add	X, 8 * SIZE, X

	/* Each pass consumes the group already held in a1..a8 while
	   loading the following group into the same registers. */
.LL_unit_loop:
	prefetch [X + PREFETCHSIZE * SIZE], 0

	FABS	a1, t1
	LDF	[X + 0 * SIZE], a1
	FABS	a2, t2
	LDF	[X + 1 * SIZE], a2
	FABS	a3, t3
	LDF	[X + 2 * SIZE], a3
	FABS	a4, t4
	LDF	[X + 3 * SIZE], a4

	FABS	a5, t5
	LDF	[X + 4 * SIZE], a5
	FABS	a6, t6
	LDF	[X + 5 * SIZE], a6
	FABS	a7, t7
	LDF	[X + 6 * SIZE], a7
	FABS	a8, t8
	LDF	[X + 7 * SIZE], a8

	/* Keys of the four elements: t1, t3, t5, t7. */
	FADD	t1, t2, t1
	FADD	t3, t4, t3
	FADD	t5, t6, t5
	FADD	t7, t8, t7

	FCMP	%fcc0, t1, c1
	FCMP	%fcc1, t3, c2
	FCMP	%fcc2, t5, c3
	FCMP	%fcc3, t7, c4

	/* All four accumulators record the group base position
	   (count); the per-accumulator offsets are added once, in
	   the final merge. */
	FCMOV	%fcc0, t1, c1
	CMOV	%fcc0, count, v1
	add	I, -1, I
	FCMOV	%fcc1, t3, c2
	CMOV	%fcc1, count, v2
	cmp	I, 0
	FCMOV	%fcc2, t5, c3
	CMOV	%fcc2, count, v3
	FCMOV	%fcc3, t7, c4
	CMOV	%fcc3, count, v4
	add	count, 4, count

	bg,pt	%icc, .LL_unit_loop
	add	X, 8 * SIZE, X

	/* Last group: already loaded, nothing further to fetch. */
.LL_unit_drain:
	FABS	a1, t1
	FABS	a2, t2
	FABS	a3, t3
	FABS	a4, t4

	FABS	a5, t5
	FABS	a6, t6
	FABS	a7, t7
	FABS	a8, t8

	FADD	t1, t2, t1
	FADD	t3, t4, t3
	FADD	t5, t6, t5
	FADD	t7, t8, t7

	FCMP	%fcc0, t1, c1
	FCMP	%fcc1, t3, c2
	FCMP	%fcc2, t5, c3
	FCMP	%fcc3, t7, c4

	FCMOV	%fcc0, t1, c1
	CMOV	%fcc0, count, v1
	FCMOV	%fcc1, t3, c2
	CMOV	%fcc1, count, v2
	FCMOV	%fcc2, t5, c3
	CMOV	%fcc2, count, v3
	FCMOV	%fcc3, t7, c4
	CMOV	%fcc3, count, v4
	add	count, 4, count

	/* I = N & 3 leftover elements, folded one at a time into
	   c1/v1 with count holding the exact position. */
.LL_unit_tail:
	and	N, 3, I
	cmp	I, 0
	ble,a,pn %icc, .LL_unit_merge
	nop

.LL_unit_tail_loop:
	LDF	[X + 0 * SIZE], a1
	LDF	[X + 1 * SIZE], a2

	FABS	a1, t1
	FABS	a2, t2
	FADD	t1, t2, t1
	FCMP	%fcc0, t1, c1
	FCMOV	%fcc0, t1, c1
	CMOV	%fcc0, count, v1
	add	count, 1, count
	add	I, -1, I
	cmp	I, 0
	bg,pt	%icc, .LL_unit_tail_loop
	add	X, 2 * SIZE, X

	/* Merge the accumulators.  v2, v3, v4 hold group base
	   positions, so their offsets (1, 2, 3) are applied first.
	   Pairs (c2 into c1) and (c4 into c3) are merged, then c3 is
	   compared with c1; only the position is updated in that
	   last step. */
.LL_unit_merge:
	FCMP	%fcc0, c2, c1
	add	v2, 1, v2
	FCMP	%fcc1, c4, c3
	add	v3, 2, v3
	add	v4, 3, v4

	FCMOV	%fcc0, c2, c1
	CMOV	%fcc0, v2, v1
	FCMOV	%fcc1, c4, c3
	CMOV	%fcc1, v4, v3
	FCMP	%fcc0, c3, c1
	CMOV	%fcc0, v3, v1

	/* Common exit: hand v1 back through %i0. */
.LL_done:
	mov	v1, %i0
	return	%i7 + 8
	nop

	/* ---------------- strided path ---------------- */

	/* Same scheme as the contiguous path, but X advances by INCX
	   after each (re, im) pair instead of using fixed offsets. */
.LL_strided:
	sra	N, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL_strided_tail
	nop

	/* Preload the first group of four pairs. */
	LDF	[X + 0 * SIZE], a1
	LDF	[X + 1 * SIZE], a2
	add	X, INCX, X
	LDF	[X + 0 * SIZE], a3
	LDF	[X + 1 * SIZE], a4
	add	X, INCX, X
	LDF	[X + 0 * SIZE], a5
	LDF	[X + 1 * SIZE], a6
	add	X, INCX, X
	add	I, -1, I
	LDF	[X + 0 * SIZE], a7
	cmp	I, 0
	LDF	[X + 1 * SIZE], a8
	ble,pt	%icc, .LL_strided_drain
	add	X, INCX, X

.LL_strided_loop:
	FABS	a1, t1
	LDF	[X + 0 * SIZE], a1
	FABS	a2, t2
	LDF	[X + 1 * SIZE], a2
	add	X, INCX, X
	FABS	a3, t3
	LDF	[X + 0 * SIZE], a3
	FABS	a4, t4
	LDF	[X + 1 * SIZE], a4
	add	X, INCX, X

	FABS	a5, t5
	LDF	[X + 0 * SIZE], a5
	FABS	a6, t6
	LDF	[X + 1 * SIZE], a6
	add	X, INCX, X
	FABS	a7, t7
	LDF	[X + 0 * SIZE], a7
	FABS	a8, t8
	LDF	[X + 1 * SIZE], a8

	FADD	t1, t2, t1
	FADD	t3, t4, t3
	FADD	t5, t6, t5
	FADD	t7, t8, t7

	FCMP	%fcc0, t1, c1
	FCMP	%fcc1, t3, c2
	FCMP	%fcc2, t5, c3
	FCMP	%fcc3, t7, c4

	FCMOV	%fcc0, t1, c1
	CMOV	%fcc0, count, v1
	add	I, -1, I
	FCMOV	%fcc1, t3, c2
	CMOV	%fcc1, count, v2
	cmp	I, 0
	FCMOV	%fcc2, t5, c3
	CMOV	%fcc2, count, v3
	FCMOV	%fcc3, t7, c4
	CMOV	%fcc3, count, v4
	add	count, 4, count

	/* The fourth pointer advance of the group rides in the
	   delay slot. */
	bg,pt	%icc, .LL_strided_loop
	add	X, INCX, X

	/* Last group: already loaded, nothing further to fetch. */
.LL_strided_drain:
	FABS	a1, t1
	FABS	a2, t2
	FABS	a3, t3
	FABS	a4, t4

	FABS	a5, t5
	FABS	a6, t6
	FABS	a7, t7
	FABS	a8, t8

	FADD	t1, t2, t1
	FADD	t3, t4, t3
	FADD	t5, t6, t5
	FADD	t7, t8, t7

	FCMP	%fcc0, t1, c1
	FCMP	%fcc1, t3, c2
	FCMP	%fcc2, t5, c3
	FCMP	%fcc3, t7, c4

	FCMOV	%fcc0, t1, c1
	CMOV	%fcc0, count, v1
	FCMOV	%fcc1, t3, c2
	CMOV	%fcc1, count, v2
	FCMOV	%fcc2, t5, c3
	CMOV	%fcc2, count, v3
	FCMOV	%fcc3, t7, c4
	CMOV	%fcc3, count, v4
	add	count, 4, count

	/* I = N & 3 leftover elements, one per pass, into c1/v1. */
.LL_strided_tail:
	and	N, 3, I
	cmp	I, 0
	ble,a,pn %icc, .LL_strided_merge
	nop

.LL_strided_tail_loop:
	LDF	[X + 0 * SIZE], a1
	LDF	[X + 1 * SIZE], a2

	FABS	a1, t1
	add	I, -1, I
	FABS	a2, t2
	cmp	I, 0
	FADD	t1, t2, t1
	FCMP	%fcc0, t1, c1
	FCMOV	%fcc0, t1, c1
	CMOV	%fcc0, count, v1
	add	count, 1, count
	bg,pt	%icc, .LL_strided_tail_loop
	add	X, INCX, X

	/* Merge the accumulators exactly as on the contiguous path:
	   apply offsets 1, 2, 3 to v2, v3, v4, merge pairwise, then
	   take the position of the overall winner. */
.LL_strided_merge:
	FCMP	%fcc0, c2, c1
	add	v2, 1, v2
	FCMP	%fcc1, c4, c3
	add	v3, 2, v3
	add	v4, 3, v4

	FCMOV	%fcc0, c2, c1
	CMOV	%fcc0, v2, v1
	FCMOV	%fcc1, c4, c3
	CMOV	%fcc1, v4, v3
	FCMP	%fcc0, c3, c1
	CMOV	%fcc0, v3, v1

	mov	v1, %i0
	return	%i7 + 8
	nop

	EPILOGUE