/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/*
 * Euclidean-norm kernel on the x87 FPU.
 *
 * Computes sqrt(x[0]^2 + x[1]^2 + ...) over M elements read from X
 * with stride INCX.
 *
 * Inputs (ARG1..ARG3 are macros defined outside this file):
 *   M    - element count
 *   X    - address of the first element
 *   INCX - stride; shifted left by BASE_SHIFT before use (presumably
 *          elements -> bytes, since it is then compared with SIZE;
 *          confirm against common.h)
 *
 * Result:
 *   XDOUBLE defined     - left in st(0)
 *   XDOUBLE not defined - stored through a temporary stack slot and
 *                         loaded into %xmm0 (FST / MOVSD are macros
 *                         from outside this file; FST is presumably a
 *                         popping store - confirm)
 *
 * Early exit: if M <= 0 or INCX <= 0 the routine returns sqrt(0).
 *
 * x87 stack layout inside the unrolled loops: four running sums sit
 * on the stack; each half-iteration pushes four squares on top of
 * them, so all eight x87 registers are occupied at that point.
 *
 * Scratch: I (%rax) is the unrolled-loop counter; M, X and INCX are
 * modified in place.
 */

#define ASSEMBLER
#include "common.h"

#define M	ARG1	/* rdi */
#define X	ARG2	/* rsi */
#define INCX	ARG3	/* rdx */

#define I	%rax

#include "l1param.h"

	PROLOGUE
	PROFCODE

#ifdef WINDOWS_ABI
	emms
#endif

	/* First accumulator; also the value returned on the early exits. */
	fldz
	testq	M, M
	jle	.L999
	testq	INCX, INCX
	jle	.L999

	salq	$BASE_SHIFT, INCX

	/* Three more accumulators: four partial sums in total. */
	fldz
	fldz
	fldz

	/* Scaled stride equal to SIZE selects the contiguous path. */
	cmpq	$SIZE, INCX
	jne	.L40

	/* ---- contiguous path ---------------------------------------- */
	movq	M, I
	sarq	$3, I			/* I = M / 8 unrolled iterations */
	jle	.L20
	ALIGN_4

.L10:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	/* Push the squares of elements 0..3. */
	FLD	0 * SIZE(X)
	fmul	%st(0), %st
	FLD	1 * SIZE(X)
	fmul	%st(0), %st
	FLD	2 * SIZE(X)
	fmul	%st(0), %st
	FLD	3 * SIZE(X)
	fmul	%st(0), %st

	/*
	 * Each faddp pops one square into a different accumulator; the
	 * target index drops by two per step because the pop shifts the
	 * accumulators one slot closer to the top.
	 */
	faddp	%st, %st(7)
	faddp	%st, %st(5)
	faddp	%st, %st(3)
	faddp	%st, %st(1)

	/* Same again for elements 4..7. */
	FLD	4 * SIZE(X)
	fmul	%st(0), %st
	FLD	5 * SIZE(X)
	fmul	%st(0), %st
	FLD	6 * SIZE(X)
	fmul	%st(0), %st
	FLD	7 * SIZE(X)
	fmul	%st(0), %st

	addq	$8 * SIZE, X

	faddp	%st, %st(7)
	faddp	%st, %st(5)
	faddp	%st, %st(3)
	faddp	%st, %st(1)

	decq	I
	jg	.L10
	ALIGN_4

.L20:
	/* Remaining M mod 8 elements, one per iteration. */
	andq	$7, M
	jle	.L998
	ALIGN_4


.L21:
	FLD	(X)
	fmul	%st(0), %st
	faddp	%st,%st(1)		/* add into the topmost accumulator */
	addq	$1 * SIZE, X
	decq	M
	jg	.L21
	jmp	.L998
	ALIGN_4

	/* ---- strided path ------------------------------------------- */
.L40:
	movq	M, I
	sarq	$3, I			/* I = M / 8 unrolled iterations */
	jle	.L60
	ALIGN_4

.L50:
	/* Push four squares, stepping X by the scaled stride each time. */
	FLD	(X)
	addq	INCX, X
	fmul	%st(0), %st
	FLD	(X)
	addq	INCX, X
	fmul	%st(0), %st
	FLD	(X)
	addq	INCX, X
	fmul	%st(0), %st
	FLD	(X)
	addq	INCX, X
	fmul	%st(0), %st

	/* One square into each of the four accumulators. */
	faddp	%st, %st(7)
	faddp	%st, %st(5)
	faddp	%st, %st(3)
	faddp	%st, %st(1)

	/* Second group of four. */
	FLD	(X)
	addq	INCX, X
	fmul	%st(0), %st
	FLD	(X)
	addq	INCX, X
	fmul	%st(0), %st
	FLD	(X)
	addq	INCX, X
	fmul	%st(0), %st
	FLD	(X)
	addq	INCX, X
	fmul	%st(0), %st

	faddp	%st, %st(7)
	faddp	%st, %st(5)
	faddp	%st, %st(3)
	faddp	%st, %st(1)

	decq	I
	jg	.L50
	ALIGN_4

.L60:
	/* Remaining M mod 8 elements, one per iteration. */
	andq	$7, M
	jle	.L998
	ALIGN_4


.L61:
	FLD	(X)
	addq	INCX, X
	fmul	%st(0), %st
	faddp	%st,%st(1)		/* add into the topmost accumulator */
	decq	M
	jg	.L61
	ALIGN_4

	/* Falls through: fold the four accumulators into a single sum. */
.L998:
	faddp	%st,%st(2)
	faddp	%st,%st(1)
	faddp	%st,%st(1)
	ALIGN_4

	/* st(0) holds the sum of squares (or zero on the early exits). */
.L999:
	fsqrt
#ifndef XDOUBLE
	/* Bounce the result through a temporary stack slot into %xmm0. */
	sub	$2 * SIZE, %rsp
	FST	(%rsp)
	MOVSD	(%rsp), %xmm0
	add	$2 * SIZE, %rsp
#endif
	ret

	EPILOGUE
