/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/*
 * Sum of absolute values of a strided vector of doubles.
 * 32-bit x86, SSE2, AT&T syntax, assembled through the C preprocessor.
 *
 * Arguments (all on the stack, in this order):
 *   M     element count
 *   X     address of the first element
 *   INCX  stride between consecutive elements, in elements
 *
 * Result: the accumulated sum, pushed on the x87 register stack by the
 * final fldl.  If M <= 0 or INCX <= 0 no element is read and the result
 * is 0.0.
 *
 * Register roles:
 *   I    = %eax   trip counter of the unrolled loops
 *   M    = %ecx   elements still to process
 *   X    = %esi   cursor, kept 16 * SIZE bytes ahead of the data it reads
 *   INCX = %ebx   stride, converted to bytes after the argument checks
 *   xmm0, xmm1    two independent partial sums, merged at .L999
 *   xmm3          mask with every bit set except bit 63 of each qword
 *   xmm4 - xmm7   staging registers for loaded elements
 *
 * %esi and %ebx are saved and restored; %eax, %ecx and xmm0-xmm7 (except
 * xmm2) are overwritten.
 *
 * NOTE(review): PROLOGUE, PROFCODE, EPILOGUE, ALIGN_3, ALIGN_4, SIZE,
 * BASE_SHIFT, PREFETCH, PREFETCHSIZE and PREOFFSET are supplied by the
 * included headers.  The comments below assume SIZE is the element size
 * in bytes (8) and BASE_SHIFT is log2(SIZE) -- confirm against common.h
 * and l1param.h.
 */

/* STACK is the number of bytes pushed below the return address by the two
   pushl instructions on entry; the argument offsets are relative to %esp
   after those pushes. */
#define STACK	 8
#define ARGS	 0

#define STACK_M		 4 + STACK + ARGS(%esp)
#define STACK_X		 8 + STACK + ARGS(%esp)
#define STACK_INCX	12 + STACK + ARGS(%esp)

#define I	%eax
#define M	%ecx
#define X	%esi
#define INCX	%ebx

#include "l1param.h"

	PROLOGUE
	PROFCODE

	pushl	%esi
	pushl	%ebx

	movl	STACK_M,    M
	movl	STACK_X,    X
	movl	STACK_INCX, INCX

	/* Clear both accumulators first so that the early exits below
	   fall into .L999 with a sum of zero. */
	xorps	%xmm0, %xmm0
	xorps	%xmm1, %xmm1

	testl	M, M
	jle	.L999
	testl	INCX, INCX
	jle	.L999

	/* xmm3 = all ones, then each qword shifted right by one: bit 63
	   of every qword is clear.  ANDing a double with it drops the
	   sign bit, i.e. yields the absolute value. */
	pcmpeqb	%xmm3, %xmm3
	psrlq	$1, %xmm3

	/* Stride in elements -> stride in bytes (see NOTE(review) above). */
	sall	$BASE_SHIFT, INCX

	/* Bias the cursor by +16 * SIZE; every load below compensates
	   with a displacement starting at -16 * SIZE. */
	subl	$-16 * SIZE, X

	/* Anything other than unit stride takes the element-wise path. */
	cmpl	$SIZE, INCX
	jne	.L40

	/* The unit-stride code below reads with movaps, which requires a
	   16-byte aligned address.  Peeling a single element (next step)
	   can only turn an address that is a multiple of SIZE into a
	   multiple of 16; if X is not even a multiple of SIZE no amount
	   of peeling helps and movaps would fault.  Route such pointers
	   through the strided loop, which only uses movsd/movhps and
	   therefore has no alignment requirement.  With INCX == SIZE it
	   walks exactly the same elements. */
	testl	$SIZE - 1, X
	jne	.L40

	/* X is a multiple of SIZE here.  If it is not a multiple of 16,
	   consume one element so that the rest starts 16-byte aligned. */
	testl	$SIZE, X
	je	.L05

	movsd	-16 * SIZE(X), %xmm0
	addl	$SIZE, X

	andps	%xmm3, %xmm0
	subl	$1, M
	jle	.L999
	ALIGN_3

.L05:
	/* Main unit-stride loop: 16 elements per trip (I = M / 16). */
	movl	M,  I
	sarl	$4, I
	jle	.L20

	/* Preload the first 8 elements; the loop body always works on
	   registers filled by the previous step while loading the next. */
	movaps	-16 * SIZE(X), %xmm4
	movaps	-14 * SIZE(X), %xmm5
	movaps	-12 * SIZE(X), %xmm6
	movaps	-10 * SIZE(X), %xmm7

	/* The last group of 16 is finished in .L11, without loading
	   past it. */
	decl	I
	jle	.L11
	ALIGN_4

.L10:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	/* abs + accumulate elements 0..7, reload with elements 8..15 */
	andps	%xmm3, %xmm4
	addpd	%xmm4, %xmm0
	movaps	 -8 * SIZE(X), %xmm4

	andps	%xmm3, %xmm5
	addpd	%xmm5, %xmm1
	movaps	 -6 * SIZE(X), %xmm5

	andps	%xmm3, %xmm6
	addpd	%xmm6, %xmm0
	movaps	 -4 * SIZE(X), %xmm6

	andps	%xmm3, %xmm7
	addpd	%xmm7, %xmm1
	movaps	 -2 * SIZE(X), %xmm7

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	/* abs + accumulate elements 8..15, reload with the first 8
	   elements of the next group of 16 */
	andps	%xmm3, %xmm4
	addpd	%xmm4, %xmm0
	movaps	  0 * SIZE(X), %xmm4

	andps	%xmm3, %xmm5
	addpd	%xmm5, %xmm1
	movaps	  2 * SIZE(X), %xmm5

	andps	%xmm3, %xmm6
	addpd	%xmm6, %xmm0
	movaps	  4 * SIZE(X), %xmm6

	andps	%xmm3, %xmm7
	addpd	%xmm7, %xmm1
	movaps	  6 * SIZE(X), %xmm7

	subl	$-16 * SIZE, X
	decl	I
	jg	.L10
	ALIGN_4

.L11:
	/* Drain: finish the final group of 16.  Elements 0..7 are already
	   in xmm4-xmm7; load elements 8..15 and stop loading there. */
	andps	%xmm3, %xmm4
	addpd	%xmm4, %xmm0
	movaps	 -8 * SIZE(X), %xmm4

	andps	%xmm3, %xmm5
	addpd	%xmm5, %xmm1
	movaps	 -6 * SIZE(X), %xmm5

	andps	%xmm3, %xmm6
	addpd	%xmm6, %xmm0
	movaps	 -4 * SIZE(X), %xmm6

	andps	%xmm3, %xmm7
	addpd	%xmm7, %xmm1
	movaps	 -2 * SIZE(X), %xmm7

	andps	%xmm3, %xmm4
	addpd	%xmm4, %xmm0
	andps	%xmm3, %xmm5
	addpd	%xmm5, %xmm1
	andps	%xmm3, %xmm6
	addpd	%xmm6, %xmm0
	andps	%xmm3, %xmm7
	addpd	%xmm7, %xmm1

	subl	$-16 * SIZE, X
	ALIGN_3

.L20:
	/* Unit-stride remainder: M & 15 elements, handled in chunks of
	   8, 4, 2 and 1 selected by the corresponding bit of M. */
	andl	$15,  M
	jle	.L999

	testl	$8,  M
	je	.L21

	movaps	-16 * SIZE(X), %xmm4
	movaps	-14 * SIZE(X), %xmm5
	movaps	-12 * SIZE(X), %xmm6
	movaps	-10 * SIZE(X), %xmm7

	andps	%xmm3, %xmm4
	addpd	%xmm4, %xmm0
	andps	%xmm3, %xmm5
	addpd	%xmm5, %xmm1
	andps	%xmm3, %xmm6
	addpd	%xmm6, %xmm0
	andps	%xmm3, %xmm7
	addpd	%xmm7, %xmm1
	addl	$8 * SIZE, X
	ALIGN_3

.L21:
	testl	$4,  M
	je	.L22

	movaps	-16 * SIZE(X), %xmm4
	movaps	-14 * SIZE(X), %xmm5

	andps	%xmm3, %xmm4
	addpd	%xmm4, %xmm0
	andps	%xmm3, %xmm5
	addpd	%xmm5, %xmm1

	addl	$4 * SIZE, X
	ALIGN_3

.L22:
	testl	$2,  M
	je	.L23

	movaps	-16 * SIZE(X), %xmm4
	andps	%xmm3, %xmm4
	addpd	%xmm4, %xmm0
	addl	$2 * SIZE, X

.L23:
	testl	$1,  M
	je	.L999

	/* NOTE(review): when the headers redefine movsd as a macro,
	   xmm4 is cleared before the load -- presumably because the
	   replacement instruction leaves the upper half untouched;
	   confirm against common.h. */
#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-16 * SIZE(X), %xmm4
	andps	%xmm3, %xmm4
	addsd	%xmm4, %xmm1
	jmp	.L999
	ALIGN_3

.L40:
	/* Element-wise path: arbitrary positive stride, and unit stride
	   with a pointer that is not a multiple of SIZE.  Eight elements
	   per trip (I = M / 8); each register is assembled from two
	   scalar loads, so no alignment is required. */
	movl	M,  I
	sarl	$3, I
	jle	.L60
	ALIGN_4

.L50:
	movsd	-16 * SIZE(X), %xmm4
	addl	INCX, X
	movhps	-16 * SIZE(X), %xmm4
	addl	INCX, X
	andps	%xmm3, %xmm4
	addpd	%xmm4, %xmm0

	movsd	-16 * SIZE(X), %xmm5
	addl	INCX, X
	movhps	-16 * SIZE(X), %xmm5
	addl	INCX, X
	andps	%xmm3, %xmm5
	addpd	%xmm5, %xmm1

	movsd	-16 * SIZE(X), %xmm6
	addl	INCX, X
	movhps	-16 * SIZE(X), %xmm6
	addl	INCX, X
	andps	%xmm3, %xmm6
	addpd	%xmm6, %xmm0

	movsd	-16 * SIZE(X), %xmm7
	addl	INCX, X
	movhps	-16 * SIZE(X), %xmm7
	addl	INCX, X
	andps	%xmm3, %xmm7
	addpd	%xmm7, %xmm1

	decl	I
	jg	.L50
	ALIGN_4

.L60:
	/* Element-wise remainder: M & 7 elements, one per trip. */
#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	andl	$7,  M
	jle	.L999
	ALIGN_4

.L61:
	movsd	-16 * SIZE(X), %xmm4
	andps	%xmm3, %xmm4
	addsd	%xmm4, %xmm0
	addl	INCX, X
	decl	M
	jg	.L61
	ALIGN_4

.L999:
	/* Merge the two accumulators, then add the two lanes of xmm0
	   together so the total ends up in its low half. */
	addpd	%xmm1, %xmm0

#ifndef HAVE_SSE3
	movaps	%xmm0, %xmm1
	unpckhpd %xmm0, %xmm0
	addsd	 %xmm1, %xmm0
#else
	haddpd	%xmm0, %xmm0
#endif

	/* Move the result from SSE to x87 through memory.  The 8-byte
	   store reuses the incoming M and X argument slots, which are no
	   longer needed, and must happen before the pops change %esp. */
	movsd	%xmm0, STACK_M
	fldl	STACK_M
	popl	%ebx
	popl	%esi
	ret

	EPILOGUE