/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/*
 * SSE2 kernel (AT&T syntax, run through the C preprocessor).
 *
 * Every element of the input vector is a pair of consecutive doubles
 * (presumably the real and imaginary parts of a complex number; the
 * pairing itself is what the code relies on).  For each element the
 * kernel forms |first| + |second| and keeps the largest such value, or
 * the smallest one when USE_MIN is defined.
 *
 * Inputs:
 *   M    - number of elements
 *   X    - address of the first element
 *   INCX - distance between elements, in elements; it is shifted left
 *          by ZBASE_SHIFT below and then used as the step added to X
 *
 * Output:
 *   The result is left in the low double of %xmm0.  When M <= 0 or
 *   INCX <= 0 no memory is read and %xmm0 is all zero.
 *
 * PROLOGUE, PROFCODE, SAVEREGISTERS, RESTOREREGISTERS, EPILOGUE,
 * ARG1..ARG3, SIZE, ZBASE_SHIFT and the ALIGN_n macros are provided
 * by common.h; the prefetch related macros are expected to come from
 * l1param.h.
 */

#define ASSEMBLER
#include "common.h"

#define M	ARG1	/* rdi */
#define X	ARG2	/* rsi */
#define INCX	ARG3	/* rdx */

#define I	%rax

/* Both lanes hold a mask with every bit set except the top (sign) bit. */
#define ABSMASK	%xmm15

/*
 * Four independent running results, two candidates per register.
 * BEST1 is additionally used as scratch while the first element is
 * set up and during the final horizontal reduction.
 */
#define BEST0	%xmm0
#define BEST1	%xmm1
#define BEST2	%xmm2
#define BEST3	%xmm3

/*
 * Load registers.  CnA / CnB hold component n (0 = first double,
 * 1 = second double) of two elements each: low lane = one element,
 * high lane = the following element.
 */
#define C0A	%xmm4
#define C1A	%xmm5
#define C0B	%xmm6
#define C1B	%xmm7

/* Building with USE_MIN turns every max operation below into a min. */
#ifdef USE_MIN
#define maxpd	minpd
#define maxsd	minsd
#endif

#include "l1param.h"

	PROLOGUE
	PROFCODE

	SAVEREGISTERS

	/* Result is zero unless there is at least one element to read. */
	pxor	BEST0, BEST0
	testq	M, M
	jle	.Ldone
	testq	INCX, INCX
	jle	.Ldone

	salq	$ZBASE_SHIFT, INCX

	/* All ones shifted right by one per 64-bit lane: clears sign bits. */
	pcmpeqb	ABSMASK, ABSMASK
	psrlq	$1, ABSMASK

	/*
	 * Seed all four running results with the value of the first
	 * element, duplicated into both lanes, and consume that element.
	 */
	movsd	0 * SIZE(X), BEST0
	movsd	1 * SIZE(X), BEST1
	addq	INCX, X
	decq	M
	andpd	ABSMASK, BEST0
	andpd	ABSMASK, BEST1
	addpd	BEST1, BEST0
	unpcklpd BEST0, BEST0
	movapd	BEST0, BEST1
	movapd	BEST0, BEST2
	movapd	BEST0, BEST3

	/* A scaled step of exactly two doubles means contiguous elements. */
	cmpq	$2 * SIZE, INCX
	jne	.Lstrided

/* ------------------------------------------------------------------ */
/* Contiguous elements: fixed offsets from X, eight elements per pass. */
/* ------------------------------------------------------------------ */
.Lunit_stride:
	movq	M, I
	sarq	$3, I
	jle	.Lunit_tail
	ALIGN_4

.Lunit_loop8:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	/* Elements 0..3: components gathered into matching lanes. */
	movsd	0 * SIZE(X), C0A
	movsd	1 * SIZE(X), C1A
	movhpd	2 * SIZE(X), C0A
	movhpd	3 * SIZE(X), C1A
	movsd	4 * SIZE(X), C0B
	movsd	5 * SIZE(X), C1B
	movhpd	6 * SIZE(X), C0B
	movhpd	7 * SIZE(X), C1B

	andpd	ABSMASK, C0A
	andpd	ABSMASK, C1A
	addpd	C1A, C0A
	maxpd	C0A, BEST0

	andpd	ABSMASK, C0B
	andpd	ABSMASK, C1B
	addpd	C1B, C0B
	maxpd	C0B, BEST1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	/* Elements 4..7 feed the other two running results. */
	movsd	 8 * SIZE(X), C0A
	movsd	 9 * SIZE(X), C1A
	movhpd	10 * SIZE(X), C0A
	movhpd	11 * SIZE(X), C1A
	movsd	12 * SIZE(X), C0B
	movsd	13 * SIZE(X), C1B
	movhpd	14 * SIZE(X), C0B
	movhpd	15 * SIZE(X), C1B

	andpd	ABSMASK, C0A
	andpd	ABSMASK, C1A
	addpd	C1A, C0A
	maxpd	C0A, BEST2

	andpd	ABSMASK, C0B
	andpd	ABSMASK, C1B
	addpd	C1B, C0B
	maxpd	C0B, BEST3

	addq	$16 * SIZE, X
	decq	I
	jg	.Lunit_loop8
	ALIGN_4

	/* Up to seven elements remain: handle 4, then 2, then 1. */
.Lunit_tail:
	andq	$7, M
	jle	.Lreduce

	testq	$4, M
	je	.Lunit_tail2

	movsd	0 * SIZE(X), C0A
	movsd	1 * SIZE(X), C1A
	movhpd	2 * SIZE(X), C0A
	movhpd	3 * SIZE(X), C1A
	movsd	4 * SIZE(X), C0B
	movsd	5 * SIZE(X), C1B
	movhpd	6 * SIZE(X), C0B
	movhpd	7 * SIZE(X), C1B

	andpd	ABSMASK, C0A
	andpd	ABSMASK, C1A
	andpd	ABSMASK, C0B
	andpd	ABSMASK, C1B

	addpd	C1A, C0A
	addpd	C1B, C0B
	maxpd	C0A, BEST0
	maxpd	C0B, BEST1

	addq	$8 * SIZE, X
	ALIGN_3

.Lunit_tail2:
	testq	$2, M
	je	.Lunit_tail1

	movsd	0 * SIZE(X), C0A
	movsd	1 * SIZE(X), C1A
	movhpd	2 * SIZE(X), C0A
	movhpd	3 * SIZE(X), C1A
	addq	$4 * SIZE, X

	andpd	ABSMASK, C0A
	andpd	ABSMASK, C1A
	addpd	C1A, C0A
	maxpd	C0A, BEST0
	ALIGN_3

.Lunit_tail1:
	testq	$1, M
	je	.Lreduce

	/* Single element: only the low lane carries data, so scalar max. */
	movsd	0 * SIZE(X), C0A
	movsd	1 * SIZE(X), C1A
	andpd	ABSMASK, C0A
	andpd	ABSMASK, C1A
	addpd	C1A, C0A
	maxsd	C0A, BEST2
	jmp	.Lreduce
	ALIGN_4

/* ------------------------------------------------------------------ */
/* General step: X is advanced by INCX after every element.            */
/* ------------------------------------------------------------------ */
.Lstrided:
	movq	M, I
	sarq	$3, I
	jle	.Lstrided_tail
	ALIGN_4

.Lstrided_loop8:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	movsd	0 * SIZE(X), C0A
	movsd	1 * SIZE(X), C1A
	addq	INCX, X
	movhpd	0 * SIZE(X), C0A
	movhpd	1 * SIZE(X), C1A
	addq	INCX, X
	movsd	0 * SIZE(X), C0B
	movsd	1 * SIZE(X), C1B
	addq	INCX, X
	movhpd	0 * SIZE(X), C0B
	movhpd	1 * SIZE(X), C1B
	addq	INCX, X

	andpd	ABSMASK, C0A
	andpd	ABSMASK, C1A
	addpd	C1A, C0A
	maxpd	C0A, BEST0

	andpd	ABSMASK, C0B
	andpd	ABSMASK, C1B
	addpd	C1B, C0B
	maxpd	C0B, BEST1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	movsd	0 * SIZE(X), C0A
	movsd	1 * SIZE(X), C1A
	addq	INCX, X
	movhpd	0 * SIZE(X), C0A
	movhpd	1 * SIZE(X), C1A
	addq	INCX, X
	movsd	0 * SIZE(X), C0B
	movsd	1 * SIZE(X), C1B
	addq	INCX, X
	movhpd	0 * SIZE(X), C0B
	movhpd	1 * SIZE(X), C1B
	addq	INCX, X

	andpd	ABSMASK, C0A
	andpd	ABSMASK, C1A
	addpd	C1A, C0A
	maxpd	C0A, BEST2

	andpd	ABSMASK, C0B
	andpd	ABSMASK, C1B
	addpd	C1B, C0B
	maxpd	C0B, BEST3

	decq	I
	jg	.Lstrided_loop8
	ALIGN_4

	/* Up to seven elements remain: handle 4, then 2, then 1. */
.Lstrided_tail:
	andq	$7, M
	jle	.Lreduce

	testq	$4, M
	je	.Lstrided_tail2

	movsd	0 * SIZE(X), C0A
	movsd	1 * SIZE(X), C1A
	addq	INCX, X
	movhpd	0 * SIZE(X), C0A
	movhpd	1 * SIZE(X), C1A
	addq	INCX, X
	movsd	0 * SIZE(X), C0B
	movsd	1 * SIZE(X), C1B
	addq	INCX, X
	movhpd	0 * SIZE(X), C0B
	movhpd	1 * SIZE(X), C1B
	addq	INCX, X

	andpd	ABSMASK, C0A
	andpd	ABSMASK, C1A
	andpd	ABSMASK, C0B
	andpd	ABSMASK, C1B
	addpd	C1A, C0A
	addpd	C1B, C0B
	maxpd	C0A, BEST0
	maxpd	C0B, BEST1
	ALIGN_3

.Lstrided_tail2:
	testq	$2, M
	je	.Lstrided_tail1

	movsd	0 * SIZE(X), C0A
	movsd	1 * SIZE(X), C1A
	addq	INCX, X
	movhpd	0 * SIZE(X), C0A
	movhpd	1 * SIZE(X), C1A
	addq	INCX, X
	andpd	ABSMASK, C0A
	andpd	ABSMASK, C1A
	addpd	C1A, C0A
	maxpd	C0A, BEST2
	ALIGN_3

.Lstrided_tail1:
	testq	$1, M
	je	.Lreduce

	/* Single element: only the low lane carries data, so scalar max. */
	movsd	0 * SIZE(X), C0A
	movsd	1 * SIZE(X), C1A
	andpd	ABSMASK, C0A
	andpd	ABSMASK, C1A
	addpd	C1A, C0A
	maxsd	C0A, BEST3
	jmp	.Lreduce
	ALIGN_4

/* ------------------------------------------------------------------ */
/* Fold the four two-lane running results into the low lane of BEST0.  */
/* ------------------------------------------------------------------ */
.Lreduce:
	maxpd	BEST1, BEST0
	maxpd	BEST3, BEST2
	maxpd	BEST2, BEST0
	/* Keep a copy, broadcast the high lane, then combine with the low. */
	movapd	BEST0, BEST1
	unpckhpd BEST0, BEST0
	maxsd	BEST1, BEST0
	ALIGN_4

.Ldone:
	RESTOREREGISTERS

	ret

	EPILOGUE