/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define M	ARG1	/* rdi */
#define X	ARG2	/* rsi */
#define INCX	ARG3	/* rdx */

#define I	%rax

/* Building with USE_MIN turns every max reduction below into a min. */
#ifdef USE_MIN
#define maxps	minps
#define maxss	minss
#endif

#include "l1param.h"

/*
 * Single-precision complex "absolute max" reduction kernel (SSE).
 *
 * For each of the M elements read from X the code masks off the sign
 * bits of the real and imaginary parts, adds the two magnitudes
 * (|re| + |im|) and folds the sum into a running max (a running min
 * when USE_MIN is defined).
 *
 * In:   M    (ARG1)  number of complex elements
 *       X    (ARG2)  address of the first element
 *       INCX (ARG3)  stride between consecutive elements; it is shifted
 *                    left by ZBASE_SHIFT before use (presumably turning
 *                    an element count into a byte offset -- confirm
 *                    against common.h)
 * Out:  low lane of %xmm0.  The result is 0.0 when M <= 0 or INCX <= 0.
 * Uses: I (%rax), %xmm0, %xmm1, %xmm4-%xmm9, %xmm15, flags.
 *       M, X and INCX are modified.
 *
 * Register roles:
 *   %xmm0   main accumulator (four lanes in the vector loops)
 *   %xmm1   second accumulator, only updated by the two-element tails
 *   %xmm15  0x7fffffff in every lane (clears the float sign bit)
 *
 * NOTE(review): the scalar loads use 0 * SIZE / 1 * SIZE for the real
 * and imaginary parts, so this assumes SIZE is the size of one float.
 */
	PROLOGUE
	PROFCODE

	SAVEREGISTERS

	pxor	%xmm0, %xmm0		/* result for the early exits below */

	testq	M, M
	jle	.L999			/* nothing to scan */

	/* A zero stride would only ever revisit element 0 and a negative */
	/* one would walk backwards in front of X: report 0 instead.      */
	testq	INCX, INCX
	jle	.L999

	salq	$ZBASE_SHIFT, INCX

	/* xmm15 = all ones shifted right by one bit per dword = 0x7fffffff */
	pcmpeqb	%xmm15, %xmm15
	psrld	$1, %xmm15

	/* Seed both accumulators with |re| + |im| of the first element so */
	/* that the min variant starts from a real value as well.          */
	movss	0 * SIZE(X), %xmm0
	movss	1 * SIZE(X), %xmm1
	addq	INCX, X
	decq	M
	andps	%xmm15, %xmm0
	andps	%xmm15, %xmm1
	addps	%xmm1, %xmm0
	shufps	$0, %xmm0, %xmm0	/* broadcast lane 0 to all lanes */
	movaps	%xmm0, %xmm1

	cmpq	$2 * SIZE, INCX		/* elements packed back to back? */
	jne	.L40

/* ---- contiguous path: INCX == 2 * SIZE ---------------------------- */
.L30:
	movq	M, I
	sarq	$3, I			/* I = number of 8-element groups */
	jle	.L35
	ALIGN_4

.L31:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	/* elements 0..3 */
	movsd	 0 * SIZE(X), %xmm4
	movhps	 2 * SIZE(X), %xmm4
	movsd	 4 * SIZE(X), %xmm5
	movhps	 6 * SIZE(X), %xmm5

	movaps	%xmm4, %xmm6

	shufps	$0x88, %xmm5, %xmm4	/* even lanes: the four real parts */
	shufps	$0xdd, %xmm5, %xmm6	/* odd lanes: the four imag parts  */

	andps	%xmm15, %xmm4
	andps	%xmm15, %xmm6
	addps	%xmm6, %xmm4
	maxps	%xmm4, %xmm0

	/* elements 4..7 */
	movsd	 8 * SIZE(X), %xmm7
	movhps	10 * SIZE(X), %xmm7
	movsd	12 * SIZE(X), %xmm8
	movhps	14 * SIZE(X), %xmm8
	movaps	%xmm7, %xmm9

	shufps	$0x88, %xmm8, %xmm7
	shufps	$0xdd, %xmm8, %xmm9

	andps	%xmm15, %xmm7
	andps	%xmm15, %xmm9
	addps	%xmm9, %xmm7
	maxps	%xmm7, %xmm0

	addq	$16 * SIZE, X
	decq	I
	jg	.L31
	ALIGN_4

.L35:
	andq	$7,  M			/* 0..7 elements left over */
	jle	.L998

	testq	$4, M
	je	.L36

	/* four leftover elements, same scheme as one half of .L31 */
	movsd	0 * SIZE(X), %xmm4
	movhps	2 * SIZE(X), %xmm4
	movsd	4 * SIZE(X), %xmm5
	movhps	6 * SIZE(X), %xmm5
	movaps	%xmm4, %xmm6

	shufps	$0x88, %xmm5, %xmm4
	shufps	$0xdd, %xmm5, %xmm6

	andps	%xmm15, %xmm4
	andps	%xmm15, %xmm6
	addps	%xmm6, %xmm4
	maxps	%xmm4, %xmm0

	addq	$8 * SIZE, X
	ALIGN_3

.L36:
	testq	$2, M
	je	.L37

	/* two leftover elements, handled as scalars; the second one is */
	/* folded into xmm1, which .L998 merges back into xmm0          */
	movss	0 * SIZE(X), %xmm4
	movss	1 * SIZE(X), %xmm5
	movss	2 * SIZE(X), %xmm6
	movss	3 * SIZE(X), %xmm7
	andps	%xmm15, %xmm4
	andps	%xmm15, %xmm5
	andps	%xmm15, %xmm6
	andps	%xmm15, %xmm7
	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6
	maxss	%xmm4, %xmm0
	maxss	%xmm6, %xmm1
	addq	$4 * SIZE, X
	ALIGN_3

.L37:
	testq	$1, M
	je	.L998

	/* last single element */
	movss	0 * SIZE(X), %xmm4
	movss	1 * SIZE(X), %xmm5
	andps	%xmm15, %xmm4
	andps	%xmm15, %xmm5
	addps	%xmm5, %xmm4
	maxss	%xmm4, %xmm0
	jmp	.L998
	ALIGN_4

/* ---- strided path: X advances by INCX after every element --------- */
.L40:
	movq	M, I
	sarq	$3, I			/* I = number of 8-element groups */
	jle	.L45
	ALIGN_4

.L41:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	/* gather elements 0..3, one (re, im) pair per load */
	movsd	0 * SIZE(X), %xmm4
	addq	INCX, X
	movhps	0 * SIZE(X), %xmm4
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm5
	addq	INCX, X
	movhps	0 * SIZE(X), %xmm5
	addq	INCX, X

	movaps	%xmm4, %xmm6

	shufps	$0x88, %xmm5, %xmm4	/* real parts */
	shufps	$0xdd, %xmm5, %xmm6	/* imag parts */

	andps	%xmm15, %xmm4
	andps	%xmm15, %xmm6
	addps	%xmm6, %xmm4
	maxps	%xmm4, %xmm0

	/* gather elements 4..7 */
	movsd	0 * SIZE(X), %xmm7
	addq	INCX, X
	movhps	0 * SIZE(X), %xmm7
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm8
	addq	INCX, X
	movhps	0 * SIZE(X), %xmm8
	addq	INCX, X
	movaps	%xmm7, %xmm9

	shufps	$0x88, %xmm8, %xmm7
	shufps	$0xdd, %xmm8, %xmm9

	andps	%xmm15, %xmm7
	andps	%xmm15, %xmm9
	addps	%xmm9, %xmm7
	maxps	%xmm7, %xmm0

	decq	I
	jg	.L41
	ALIGN_4

.L45:
	andq	$7,  M			/* 0..7 elements left over */
	jle	.L998

	testq	$4, M
	je	.L46

	/* four leftover elements */
	movsd	0 * SIZE(X), %xmm4
	addq	INCX, X
	movhps	0 * SIZE(X), %xmm4
	addq	INCX, X
	movsd	0 * SIZE(X), %xmm5
	addq	INCX, X
	movhps	0 * SIZE(X), %xmm5
	addq	INCX, X
	movaps	%xmm4, %xmm6

	shufps	$0x88, %xmm5, %xmm4
	shufps	$0xdd, %xmm5, %xmm6

	andps	%xmm15, %xmm4
	andps	%xmm15, %xmm6
	addps	%xmm6, %xmm4
	maxps	%xmm4, %xmm0
	ALIGN_3

.L46:
	testq	$2, M
	je	.L47

	/* two leftover elements; the second one goes to xmm1 */
	movss	0 * SIZE(X), %xmm4
	movss	1 * SIZE(X), %xmm5
	addq	INCX, X
	movss	0 * SIZE(X), %xmm6
	movss	1 * SIZE(X), %xmm7
	addq	INCX, X
	andps	%xmm15, %xmm4
	andps	%xmm15, %xmm5
	andps	%xmm15, %xmm6
	andps	%xmm15, %xmm7
	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6
	maxss	%xmm4, %xmm0
	maxss	%xmm6, %xmm1
	ALIGN_3

.L47:
	testq	$1, M
	je	.L998

	/* last single element */
	movss	0 * SIZE(X), %xmm4
	movss	1 * SIZE(X), %xmm5
	andps	%xmm15, %xmm4
	andps	%xmm15, %xmm5
	addps	%xmm5, %xmm4
	maxss	%xmm4, %xmm0
	jmp	.L998
	ALIGN_4

/* ---- horizontal reduction of the accumulators into xmm0 lane 0 ---- */
.L998:
	maxps	%xmm1, %xmm0		/* merge the second accumulator */
	movaps	%xmm0, %xmm1
	movhlps	%xmm0, %xmm0		/* upper two lanes -> lower two */
	maxps	%xmm1, %xmm0
	movaps	%xmm0, %xmm1
	shufps	$1, %xmm0, %xmm0	/* lane 1 -> lane 0 */
	maxss	%xmm1, %xmm0
	ALIGN_4

.L999:
	RESTOREREGISTERS

	ret

	EPILOGUE