/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/*---------------------------------------------------------------------
 * snrm2 kernel: single-precision Euclidean norm.
 *
 *   C equivalent:  float snrm2(BLASLONG n, float *x, BLASLONG incx)
 *
 * In:   M    = n      (ARG1)  element count
 *       X    = x      (ARG2)  source vector
 *       INCX = incx   (ARG3)  stride in elements
 * Out:  xmm0 = sqrt(sum of x[i*incx]^2), rounded back to single.
 *
 * Squares are accumulated in DOUBLE precision across four independent
 * packed partial sums (xmm0..xmm3), which are combined and square-
 * rooted once at the end.  No dynamic rescaling is performed; the
 * widened accumulation is the only overflow/underflow protection.
 * Returns 0 when n <= 0 or incx <= 0 (accumulators are still zero at
 * the .L999 exit).
 *
 * NOTE(review): SIZE (element size in bytes; 4 for single precision),
 * ARGx, PROLOGUE/EPILOGUE, SAVEREGISTERS, ALIGN_x and the PREFETCH
 * macros come from common.h / l1param.h, which are not visible here --
 * comments below assume the usual OpenBLAS definitions; confirm
 * against those headers.
 *-------------------------------------------------------------------*/

#define M	ARG1	/* rdi on SysV: element count n */
#define X	ARG2	/* rsi on SysV: source pointer */
#define INCX	ARG3	/* rdx on SysV: stride (elements; converted to bytes below) */

#define I	%rax	/* loop counter */

#include "l1param.h"

	PROLOGUE
	PROFCODE

	SAVEREGISTERS

	/* Zero the four packed-double accumulators, interleaved with the
	   early-out checks: n <= 0 or incx <= 0 falls through to .L999
	   with xmm0 == 0, i.e. the function returns 0. */
	pxor	%xmm0, %xmm0
	testq	M, M
	jle	.L999
	pxor	%xmm1, %xmm1
	testq	INCX, INCX
	jle	.L999

	pxor	%xmm2, %xmm2
	leaq	(, INCX, SIZE), INCX	/* stride: elements -> bytes */
	pxor	%xmm3, %xmm3
	cmpq	$SIZE, INCX
	jne	.L40			/* non-unit stride: scalar gather path */

	/* Unit stride.  If X sits on an odd 4-byte boundary, peel one
	   element so the paired movsd loads below are 8-byte aligned
	   (presumably an alignment optimization; movsd itself tolerates
	   misaligned addresses -- confirm against l1param.h tuning). */
	testq	$SIZE, X
	je	.L05

	movss	0 * SIZE(X), %xmm4
	cvtss2sd %xmm4, %xmm6		/* widen to double ...            */
	mulsd	%xmm6, %xmm6		/* ... square ...                 */
	addsd	%xmm6, %xmm3		/* ... accumulate                 */
	addq	INCX, X
	decq	M
	jle	.L998			/* n was 1: go reduce and finish  */
	ALIGN_3

.L05:
	movq	M, I
	sarq	$3, I			/* I = n / 8 unrolled iterations */
	jle	.L14

	/* Prime the software pipeline: load the first 8 floats
	   (2 packed singles per movsd) before entering the loop. */
	movsd	0 * SIZE(X), %xmm4
	movsd	2 * SIZE(X), %xmm5
	movsd	4 * SIZE(X), %xmm6
	movsd	6 * SIZE(X), %xmm7
	addq	$8 * SIZE, X
	decq	I
	jle	.L12			/* exactly one unrolled iteration: just drain */
	ALIGN_3

.L10:
	/* Steady state, 8 floats/iteration: convert and square the
	   previously loaded pairs while this iteration's loads issue. */
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	cvtps2pd %xmm4, %xmm8		/* widen 2 floats -> 2 doubles */
	cvtps2pd %xmm5, %xmm9
	cvtps2pd %xmm6, %xmm10
	cvtps2pd %xmm7, %xmm11

	movsd	0 * SIZE(X), %xmm4	/* next iteration's inputs */
	movsd	2 * SIZE(X), %xmm5
	movsd	4 * SIZE(X), %xmm6
	movsd	6 * SIZE(X), %xmm7

	mulpd	%xmm8, %xmm8		/* square each pair ... */
	mulpd	%xmm9, %xmm9
	mulpd	%xmm10, %xmm10
	mulpd	%xmm11, %xmm11

	addpd	%xmm8, %xmm0		/* ... and fold into the four
					   independent partial sums    */
	addpd	%xmm9, %xmm1
	addpd	%xmm10, %xmm2
	addpd	%xmm11, %xmm3

	addq	$8 * SIZE, X
	decq	I
	jg	.L10
	ALIGN_3

.L12:
	/* Drain the pipeline: square/accumulate the final 8 loaded floats. */
	cvtps2pd %xmm4, %xmm8
	cvtps2pd %xmm5, %xmm9
	cvtps2pd %xmm6, %xmm10
	cvtps2pd %xmm7, %xmm11

	mulpd	%xmm8, %xmm8
	mulpd	%xmm9, %xmm9
	mulpd	%xmm10, %xmm10
	mulpd	%xmm11, %xmm11

	addpd	%xmm8, %xmm0
	addpd	%xmm9, %xmm1
	addpd	%xmm10, %xmm2
	addpd	%xmm11, %xmm3
	ALIGN_3


.L14:
	/* Tail: 4 remaining elements (two packed pairs). */
	testq	$4, M
	je	.L15

	movsd	0 * SIZE(X), %xmm4
	movsd	2 * SIZE(X), %xmm5
	cvtps2pd %xmm4, %xmm6
	cvtps2pd %xmm5, %xmm7
	mulpd	%xmm6, %xmm6
	mulpd	%xmm7, %xmm7
	addpd	%xmm6, %xmm0
	addpd	%xmm7, %xmm1
	addq	$4 * SIZE, X
	ALIGN_3

.L15:
	/* Tail: 2 remaining elements (one packed pair). */
	testq	$2, M
	je	.L16

	movsd	0 * SIZE(X), %xmm4
	cvtps2pd %xmm4, %xmm6
	mulpd	%xmm6, %xmm6
	addpd	%xmm6, %xmm2
	addq	$2 * SIZE, X
	ALIGN_3

.L16:
	/* Tail: final odd element, handled in scalar form. */
	testq	$1, M
	je	.L998

	movss	0 * SIZE(X), %xmm4
	cvtss2sd %xmm4, %xmm6
	mulsd	%xmm6, %xmm6
	addsd	%xmm6, %xmm3
	jmp	.L998
	ALIGN_4

.L40:
	/* General-stride path: gather one float at a time, 8 per
	   unrolled iteration, all scalar convert/square/accumulate. */
	movq	M, I
	sarq	$3, I			/* I = n / 8 */
	jle	.L44
	ALIGN_4

.L41:
	movss	(X), %xmm4
	addq	INCX, X
	movss	(X), %xmm5
	addq	INCX, X
	movss	(X), %xmm6
	addq	INCX, X
	movss	(X), %xmm7
	addq	INCX, X
	movss	(X), %xmm8
	addq	INCX, X
	movss	(X), %xmm9
	addq	INCX, X
	movss	(X), %xmm10
	addq	INCX, X
	movss	(X), %xmm11
	addq	INCX, X

	cvtss2sd %xmm4, %xmm4		/* widen all 8 to double */
	cvtss2sd %xmm5, %xmm5
	cvtss2sd %xmm6, %xmm6
	cvtss2sd %xmm7, %xmm7
	cvtss2sd %xmm8, %xmm8
	cvtss2sd %xmm9, %xmm9
	cvtss2sd %xmm10, %xmm10
	cvtss2sd %xmm11, %xmm11

	mulsd	%xmm4, %xmm4		/* square and spread the sums   */
	mulsd	%xmm5, %xmm5		/* across the four accumulators */
	mulsd	%xmm6, %xmm6
	mulsd	%xmm7, %xmm7

	addsd	%xmm4, %xmm0
	addsd	%xmm5, %xmm1
	addsd	%xmm6, %xmm2
	addsd	%xmm7, %xmm3

	mulsd	%xmm8, %xmm8
	mulsd	%xmm9, %xmm9
	mulsd	%xmm10, %xmm10
	mulsd	%xmm11, %xmm11

	addsd	%xmm8, %xmm0
	addsd	%xmm9, %xmm1
	addsd	%xmm10, %xmm2
	addsd	%xmm11, %xmm3

	decq	I
	jg	.L41
	ALIGN_3

.L44:
	/* Strided tail: 4 remaining elements. */
	testq	$4, M
	je	.L45

	movss	(X), %xmm4
	addq	INCX, X
	movss	(X), %xmm5
	addq	INCX, X
	movss	(X), %xmm6
	addq	INCX, X
	movss	(X), %xmm7
	addq	INCX, X

	cvtss2sd %xmm4, %xmm8
	cvtss2sd %xmm5, %xmm9
	cvtss2sd %xmm6, %xmm10
	cvtss2sd %xmm7, %xmm11

	mulsd	%xmm8, %xmm8
	mulsd	%xmm9, %xmm9
	mulsd	%xmm10, %xmm10
	mulsd	%xmm11, %xmm11

	addsd	%xmm8, %xmm0
	addsd	%xmm9, %xmm1
	addsd	%xmm10, %xmm2
	addsd	%xmm11, %xmm3
	ALIGN_3

.L45:
	/* Strided tail: 2 remaining elements. */
	testq	$2, M
	je	.L46

	movss	(X), %xmm4
	addq	INCX, X
	movss	(X), %xmm5
	addq	INCX, X

	cvtss2sd %xmm4, %xmm6
	cvtss2sd %xmm5, %xmm7
	mulsd	%xmm6, %xmm6
	mulsd	%xmm7, %xmm7
	addsd	%xmm6, %xmm1
	addsd	%xmm7, %xmm2
	ALIGN_3

.L46:
	/* Strided tail: final odd element; falls through to the reduce. */
	testq	$1, M
	je	.L998

	movss	(X), %xmm4
	cvtss2sd %xmm4, %xmm6
	mulsd	%xmm6, %xmm6
	addsd	%xmm6, %xmm3
	ALIGN_4

.L998:
	/* Combine the four packed partial sums into xmm0 ... */
	addpd	%xmm1, %xmm0
	addpd	%xmm3, %xmm2
	addpd	%xmm2, %xmm0

	/* ... then reduce the two doubles in xmm0 horizontally. */
#ifndef HAVE_SSE3
	movapd	%xmm0, %xmm1
	unpckhpd %xmm0, %xmm0		/* xmm0 = {high, high} */
	addsd	%xmm1, %xmm0		/* low half = high + low */
#else
	haddpd	%xmm0, %xmm0
#endif
	ALIGN_4

.L999:
	sqrtsd	%xmm0, %xmm0		/* norm = sqrt(sum of squares) */

	cvtsd2ss %xmm0, %xmm0		/* round to single for the return value */

	RESTOREREGISTERS

	ret

	EPILOGUE