1 /* Copyright (C) 2005 The Scalable Software Infrastructure Project. All rights reserved. 2 3 Redistribution and use in source and binary forms, with or without 4 modification, are permitted provided that the following conditions are met: 5 1. Redistributions of source code must retain the above copyright 6 notice, this list of conditions and the following disclaimer. 7 2. Redistributions in binary form must reproduce the above copyright 8 notice, this list of conditions and the following disclaimer in the 9 documentation and/or other materials provided with the distribution. 10 3. Neither the name of the project nor the names of its contributors 11 may be used to endorse or promote products derived from this software 12 without specific prior written permission. 13 14 THIS SOFTWARE IS PROVIDED BY THE SCALABLE SOFTWARE INFRASTRUCTURE PROJECT 15 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 16 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17 PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE SCALABLE SOFTWARE INFRASTRUCTURE 18 PROJECT BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, 19 OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 20 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 21 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 22 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 23 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 24 POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27 28 #ifndef __LIS_PRECISION_H__ 29 #define __LIS_PRECISION_H__ 30 31 #ifdef USE_SSE2 32 #include <emmintrin.h> 33 #endif 34 35 #define SPLITTER 134217729.0 36 37 #ifndef USE_QUAD_PRECISION 38 #define LIS_QUAD_DECLAR 39 #else 40 #ifdef USE_SSE2 41 #define LIS_QUAD_DECLAR __m128d bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh 42 #else 43 #define LIS_QUAD_DECLAR double p1,p2,tq,bhi,blo,chi,clo,sh,eh,sl,el,th,tl 44 #endif 45 #endif 46 47 /********************************************************** 48 * LIS_QUAD_MINUS(a_hi,a_lo) * 49 ********************************************************** 50 (a_hi,a_lo) <- (-a_hi,-a_lo) 51 **********************************************************/ 52 53 #define LIS_QUAD_MINUS(a_hi,a_lo) \ 54 (a_hi) = -(a_hi); \ 55 (a_lo) = -(a_lo) 56 57 /********************************************************** 58 * LIS_QUAD_ZERO(a_hi,a_lo) * 59 ********************************************************** 60 (a_hi,a_lo) <- (0,0) 61 **********************************************************/ 62 63 #define LIS_QUAD_ZERO(a_hi,a_lo) \ 64 (a_hi) = 0.0; \ 65 (a_lo) = 0.0 66 67 /********************************************************** 68 * LIS_QUAD_ONE(a_hi,a_lo) * 69 ********************************************************** 70 (a_hi,a_lo) <- (1,0) 71 **********************************************************/ 72 73 #define LIS_QUAD_ONE(a_hi,a_lo) \ 74 (a_hi) = 1.0; \ 75 (a_lo) = 0.0 76 77 /********************************************************** 78 * LIS_QUAD_FAST_TWO_SUM(a,b,r,e) * 79 ********************************************************** 80 |a| > |b| 81 a + b -> (r,e) 82 **********************************************************/ 83 84 #define LIS_QUAD_FAST_TWO_SUM(a,b,r,e) \ 85 (r) = (a) + (b); \ 86 (e) = (b) - ((r) - (a)) 87 88 /********************************************************** 89 * LIS_QUAD_TWO_SUM(a,b,r,e) * 90 ********************************************************** 91 a + b -> (r,e) 92 **********************************************************/ 93 94 #define LIS_QUAD_TWO_SUM(a,b,r,e) \ 95 (r) = (a) + (b); \ 96 th = (r) - (a); \ 97 (e) = ((a) - ((r) - th)) + ((b) - th) 98 99 /********************************************************** 100 * LIS_QUAD_TWO_DIFF(a,b,r,e) * 101 ********************************************************** 102 a - b -> (r,e) 103 **********************************************************/ 104 105 #define LIS_QUAD_TWO_DIFF(a,b,r,e) \ 106 (r) = (a) - (b); \ 107 th = (r) - (a); \ 108 (e) = ((a) - ((r) - th)) - ((b) + th) 109 110 /********************************************************** 111 * LIS_QUAD_SPLIT(b,b_hi,b_lo) * 112 ********************************************************** 113 b -> (b_hi,b_lo) 114 **********************************************************/ 115 116 #define LIS_QUAD_SPLIT(b,b_hi,b_lo) \ 117 tq = SPLITTER * (b); \ 118 (b_hi) = tq - (tq-(b)); \ 119 (b_lo) = (b) - (b_hi); \ 120 121 /********************************************************** 122 * LIS_QUAD_TWO_PROD(a,b,r,e) * 123 ********************************************************** 124 a x b -> (r,e) 125 **********************************************************/ 126 127 #ifndef HAS_FMA 128 #define LIS_QUAD_TWO_PROD(a,b,r,e) \ 129 (r) = (a) * (b); \ 130 LIS_QUAD_SPLIT((a),bhi,blo); \ 131 LIS_QUAD_SPLIT((b),chi,clo); \ 132 (e) = ((bhi*chi-(r))+bhi*clo+blo*chi)+blo*clo 133 #else 134 #define LIS_QUAD_TWO_PROD(a,b,r,e) \ 135 (r) = (-(a)) * (b); \ 136 (e) = (a) * (b) + (r) 137 #endif 138 139 /********************************************************** 140 * LIS_QUAD_TWO_SQR(a,r,e) * 141 ********************************************************** 142 a x a -> (r,e) 143 **********************************************************/ 144 145 #ifndef HAS_FMA 146 #define LIS_QUAD_TWO_SQR(a,r,e) \ 147 (r) = (a) * (a); \ 148 LIS_QUAD_SPLIT((a),bhi,blo); \ 149 (e) = ((bhi*bhi-(r))+2.0*bhi*blo)+blo*blo 150 #else 151 #define LIS_QUAD_TWO_SQR(a,r,e) \ 152 (r) = (-(a)) * (a); \ 153 (e) = (a) * (a) + (r) 154 #endif 155 156 /********************************************************** 157 * LIS_QUAD_MUL(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) * 158 ********************************************************** 159 (a_hi,a_lo) <- (b_hi,b_lo) x (c_hi,c_lo) 160 **********************************************************/ 161 162 #define LIS_QUAD_MUL(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) \ 163 LIS_QUAD_TWO_PROD((b_hi),(c_hi),p1,p2); \ 164 p2 += ((b_hi) * (c_lo)); \ 165 p2 += ((b_lo) * (c_hi)); \ 166 LIS_QUAD_FAST_TWO_SUM(p1,p2,(a_hi),(a_lo)) 167 168 /********************************************************** 169 * LIS_QUAD_MULD(a_hi,a_lo,b_hi,b_lo,c) * 170 ********************************************************** 171 (a_hi,a_lo) <- (b_hi,b_lo) x c 172 **********************************************************/ 173 174 #define LIS_QUAD_MULD(a_hi,a_lo,b_hi,b_lo,c) \ 175 LIS_QUAD_TWO_PROD((b_hi),(c),p1,p2); \ 176 p2 += ((b_lo) * (c)); \ 177 LIS_QUAD_FAST_TWO_SUM(p1,p2,(a_hi),(a_lo)) 178 179 /********************************************************** 180 * LIS_QUAD_SQR(a_hi,a_lo,b_hi,b_lo) * 181 ********************************************************** 182 (a_hi,a_lo) <- (b_hi,b_lo) x (b_hi,b_lo) 183 **********************************************************/ 184 185 #define LIS_QUAD_SQR(a_hi,a_lo,b_hi,b_lo) \ 186 LIS_QUAD_TWO_SQR((b_hi),p1,p2); \ 187 p2 += (2.0*(b_hi) * (b_lo)); \ 188 p2 += ((b_lo) * (b_lo)); \ 189 LIS_QUAD_FAST_TWO_SUM(p1,p2,(a_hi),(a_lo)) 190 191 /********************************************************** 192 * LIS_QUAD_ADD(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) * 193 ********************************************************** 194 (a_hi,a_lo) <- (b_hi,b_lo) + (c_hi,c_lo) 195 **********************************************************/ 196 197 #ifndef USE_FAST_QUAD_ADD 198 #define LIS_QUAD_ADD(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) \ 199 LIS_QUAD_TWO_SUM((b_hi),(c_hi),sh,eh); \ 200 LIS_QUAD_TWO_SUM((b_lo),(c_lo),sl,el); \ 201 eh += sl; \ 202 LIS_QUAD_FAST_TWO_SUM(sh,eh,sh,eh); \ 203 eh += el; \ 204 LIS_QUAD_FAST_TWO_SUM(sh,eh,(a_hi),(a_lo)) 205 #else 206 #define LIS_QUAD_ADD(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) \ 207 LIS_QUAD_TWO_SUM((b_hi),(c_hi),sh,eh); \ 208 eh += (b_lo); \ 209 eh += (c_lo); \ 210 LIS_QUAD_FAST_TWO_SUM(sh,eh,(a_hi),(a_lo)) 211 #endif 212 213 /********************************************************** 214 * LIS_QUAD_DIV(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) * 215 ********************************************************** 216 (a_hi,a_lo) <- (b_hi,b_lo) / (c_hi,c_lo) 217 **********************************************************/ 218 219 #define LIS_QUAD_DIV(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) \ 220 tl = (b_hi) / (c_hi); \ 221 LIS_QUAD_MULD(eh,el,(c_hi),(c_lo),tl); \ 222 LIS_QUAD_TWO_DIFF((b_hi),eh,sh,sl); \ 223 sl -= el; \ 224 sl += (b_lo); \ 225 th = (sh+sl) / (c_hi); \ 226 LIS_QUAD_FAST_TWO_SUM(tl,th,(a_hi),(a_lo)) 227 228 /********************************************************** 229 * LIS_QUAD_SQRT(a_hi,a_lo,b_hi,b_lo) * 230 ********************************************************** 231 (a_hi,a_lo) <- SQRT( (b_hi,b_lo) ) 232 **********************************************************/ 233 234 #define LIS_QUAD_SQRT(a_hi,a_lo,b_hi,b_lo) \ 235 if( (b_hi)==0 ) \ 236 { \ 237 (a_hi) = (a_lo) = 0.0; \ 238 return LIS_FAILS; \ 239 } \ 240 if( (b_hi)<0 ) \ 241 { \ 242 printf("ERROR bh=%e\n",(b_hi)); \ 243 (a_hi) = (a_lo) = 0.0; \ 244 return LIS_FAILS; \ 245 } \ 246 p1 = 1.0 / sqrt((b_hi)); \ 247 p2 = (b_hi) * p1; \ 248 p1 = p1 * 0.5; \ 249 LIS_QUAD_TWO_SQR(p2,chi,clo); \ 250 LIS_QUAD_ADD(th,eh,(b_hi),(b_lo),-chi,-clo); \ 251 p1 = p1 * th; \ 252 LIS_QUAD_FAST_TWO_SUM(p1,p2,(a_hi),(a_lo)) 253 254 /********************************************************** 255 * LIS_QUAD_FMA(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) * 256 ********************************************************** 257 (d_hi,d_lo) <- (a_hi,a_lo) + (b_hi,b_lo) * (c_hi,c_lo) 258 **********************************************************/ 259 260 #define LIS_QUAD_FMA(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) \ 261 LIS_QUAD_MUL(chi,p2,(b_hi),(b_lo),(c_hi),(c_lo)); \ 262 LIS_QUAD_ADD((d_hi),(d_lo),(a_hi),(a_lo),chi,p2) 263 264 /********************************************************** 265 * LIS_QUAD_FMAD(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo,c) * 266 ********************************************************** 267 (d_hi,d_lo) <- (a_hi,a_lo) + (b_hi,b_lo) * c 268 **********************************************************/ 269 270 #define LIS_QUAD_FMAD(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo,c) \ 271 LIS_QUAD_MULD(chi,p2,(b_hi),(b_lo),(c)); \ 272 LIS_QUAD_ADD((d_hi),(d_lo),(a_hi),(a_lo),chi,p2) 273 274 /********************************************************** 275 * LIS_QUAD_FSA(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo) * 276 ********************************************************** 277 (d_hi,d_lo) <- (a_hi,a_lo) + (b_hi,b_lo) * (b_hi,b_lo) 278 **********************************************************/ 279 280 #define LIS_QUAD_FSA(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo) \ 281 LIS_QUAD_SQR(bhi,p2,(b_hi),(b_lo)); \ 282 LIS_QUAD_ADD((d_hi),(d_lo),(a_hi),(a_lo),bhi,p2) 283 284 /********************************************************** 285 * * 286 * SSE2(SD) * 287 * * 288 **********************************************************/ 289 290 /********************************************************** 291 * LIS_QUAD_MUL_SSE2_STORE(a_hi,a_lo) * 292 ********************************************************** 293 (a_hi,a_lo) <- 294 **********************************************************/ 295 296 #define LIS_QUAD_MUL_SSE2_STORE(a_hi,a_lo) \ 297 _mm_store_sd(&(a_hi),sh); \ 298 sh = _mm_sub_sd(sh,bh); \ 299 wh = _mm_sub_sd(wh,sh); \ 300 _mm_store_sd(&(a_lo),wh) 301 302 /********************************************************** 303 * LIS_QUAD_ADD_SSE2_LOAD(b_hi,b_lo,c_hi,c_lo) * 304 ********************************************************** 305 (b_hi,b_lo) (c_hi,c_lo) 306 **********************************************************/ 307 308 #ifndef USE_FAST_QUAD_ADD 309 #define LIS_QUAD_ADD_SSE2_LOAD(b_hi,b_lo,c_hi,c_lo) \ 310 sh = _mm_set_pd((b_lo),(b_hi)); \ 311 bl = _mm_set_pd((c_lo),(c_hi)) 312 #else 313 #define LIS_QUAD_ADD_SSE2_LOAD(b_hi,b_lo,c_hi,c_lo) \ 314 eh = _mm_set_sd((b_hi)); \ 315 cl = _mm_set_sd((b_lo)); \ 316 sh = _mm_set_sd((c_hi)); \ 317 wh = _mm_set_sd((c_lo)) 318 #endif 319 320 /********************************************************** 321 * LIS_QUAD_FMA_SSE2_LOAD(a_hi,a_lo) * 322 ********************************************************** 323 (a_hi,a_lo) 324 **********************************************************/ 325 326 #ifndef USE_FAST_QUAD_ADD 327 #define LIS_QUAD_FMA_SSE2_LOAD(a_hi,a_lo) \ 328 th = _mm_sub_sd(sh,bh); \ 329 wh = _mm_sub_sd(wh,th); \ 330 bl = _mm_set_pd((a_lo),(a_hi)); \ 331 sh = _mm_unpacklo_pd(sh,wh) 332 #else 333 #define LIS_QUAD_FMA_SSE2_LOAD(a_hi,a_lo) \ 334 th = _mm_sub_sd(sh,bh); \ 335 wh = _mm_sub_sd(wh,th); \ 336 eh = _mm_set_sd((a_hi)); \ 337 cl = _mm_set_sd((a_lo)) 338 #endif 339 340 /********************************************************** 341 * LIS_QUAD_MUL_SSE2_CORE(b_hi,b_lo,c_hi,c_lo) * 342 ********************************************************** 343 (b_hi,b_lo) x (c_hi,c_lo) 344 **********************************************************/ 345 346 #define LIS_QUAD_MUL_SSE2_CORE(b_hi,b_lo,c_hi,c_lo) \ 347 sh = _mm_set_pd(SPLITTER,SPLITTER); \ 348 ch = _mm_set_pd((c_hi),(b_hi)); \ 349 bh = _mm_unpackhi_pd(ch,ch); \ 350 tl = _mm_set_pd((b_lo),(c_lo)); \ 351 bh = _mm_mul_sd(bh,ch); \ 352 sh = _mm_mul_pd(sh,ch); \ 353 th = _mm_sub_pd(sh,ch); \ 354 sh = _mm_sub_pd(sh,th); \ 355 th = _mm_mul_pd(ch,tl); \ 356 ch = _mm_sub_pd(ch,sh); \ 357 eh = _mm_unpackhi_pd(sh,ch); \ 358 wh = _mm_unpacklo_pd(sh,ch); \ 359 ch = _mm_unpackhi_pd(ch,wh); \ 360 wh = _mm_mul_pd(wh,eh); \ 361 sh = _mm_mul_pd(sh,ch); \ 362 ch = _mm_unpackhi_pd(sh,sh); \ 363 tl = _mm_unpackhi_pd(wh,wh); \ 364 eh = _mm_unpackhi_pd(th,th); \ 365 wh = _mm_sub_sd(wh,bh); \ 366 wh = _mm_add_sd(wh,ch); \ 367 wh = _mm_add_sd(wh,sh); \ 368 wh = _mm_add_sd(wh,tl); \ 369 wh = _mm_add_sd(wh,eh); \ 370 wh = _mm_add_sd(wh,th); \ 371 sh = _mm_add_sd(wh,bh) 372 373 /********************************************************** 374 * LIS_QUAD_MULD_SSE2_CORE(b_hi,b_lo,c) * 375 ********************************************************** 376 (b_hi,b_lo) x c 377 **********************************************************/ 378 379 #define LIS_QUAD_MULD_SSE2_CORE(b_hi,b_lo,c) \ 380 sh = _mm_set_pd(SPLITTER,SPLITTER); \ 381 ch = _mm_set_pd((b_hi),(c)); \ 382 bh = _mm_unpackhi_pd(ch,ch); \ 383 sl = _mm_load_sd(&(b_lo)); \ 384 bh = _mm_mul_sd(bh,ch); \ 385 sl = _mm_mul_sd(sl,ch); \ 386 sh = _mm_mul_pd(sh,ch); \ 387 th = _mm_sub_pd(sh,ch); \ 388 sh = _mm_sub_pd(sh,th); \ 389 ch = _mm_sub_pd(ch,sh); \ 390 t1 = _mm_unpackhi_pd(sh,ch); \ 391 wh = _mm_unpacklo_pd(sh,ch); \ 392 ch = _mm_unpackhi_pd(ch,wh); \ 393 wh = _mm_mul_pd(wh,t1); \ 394 sh = _mm_mul_pd(sh,ch); \ 395 ch = _mm_unpackhi_pd(sh,sh); \ 396 th = _mm_unpackhi_pd(wh,wh); \ 397 wh = _mm_sub_sd(wh,bh); \ 398 wh = _mm_add_sd(wh,ch); \ 399 wh = _mm_add_sd(wh,sh); \ 400 wh = _mm_add_sd(wh,th); \ 401 wh = _mm_add_sd(wh,sl); \ 402 sh = _mm_add_sd(wh,bh) 403 404 /********************************************************** 405 * LIS_QUAD_SQR_SSE2_CORE(b_hi,b_lo) * 406 ********************************************************** 407 (b_hi,b_lo) x (b_hi,b_lo) 408 **********************************************************/ 409 410 #define LIS_QUAD_SQR_SSE2_CORE(b_hi,b_lo) \ 411 sh = _mm_set_sd(SPLITTER); \ 412 th = _mm_load_sd(&(b_hi)); \ 413 bl = _mm_load_sd(&(b_lo)); \ 414 bh = _mm_mul_sd(th,th); \ 415 sh = _mm_mul_sd(sh,th); \ 416 wh = _mm_sub_sd(sh,th); \ 417 sh = _mm_sub_sd(sh,wh); \ 418 cl = _mm_sub_sd(th,sh); \ 419 wh = _mm_mul_sd(sh,sh); \ 420 sh = _mm_add_sd(sh,sh); \ 421 sh = _mm_mul_sd(sh,cl); \ 422 cl = _mm_mul_sd(cl,cl); \ 423 wh = _mm_sub_sd(wh,bh); \ 424 wh = _mm_add_sd(wh,sh); \ 425 wh = _mm_add_sd(wh,cl); \ 426 th = _mm_add_sd(th,th); \ 427 th = _mm_mul_sd(th,bl); \ 428 bl = _mm_mul_sd(bl,bl); \ 429 wh = _mm_add_sd(wh,th); \ 430 wh = _mm_add_sd(wh,bl); \ 431 sh = _mm_add_sd(wh,bh) 432 433 /********************************************************** 434 * LIS_QUAD_ADD_SSE2_CORE(a_hi,a_lo) * 435 ********************************************************** 436 (a_hi,a_lo) <- (b_hi,b_lo) + (c_hi,c_lo) 437 **********************************************************/ 438 439 #ifndef USE_FAST_QUAD_ADD 440 #define LIS_QUAD_ADD_SSE2_CORE(a_hi,a_lo) \ 441 t0 = _mm_add_pd(bl,sh); \ 442 eh = _mm_sub_pd(t0,bl); \ 443 th = _mm_sub_pd(t0,eh); \ 444 sh = _mm_sub_pd(sh,eh); \ 445 bl = _mm_sub_pd(bl,th); \ 446 bl = _mm_add_pd(bl,sh); \ 447 eh = _mm_unpackhi_pd(bl,bl); \ 448 sh = _mm_unpackhi_pd(t0,t0); \ 449 th = t0; \ 450 bl = _mm_add_sd(bl,sh); \ 451 th = _mm_add_sd(th,bl); \ 452 t0 = _mm_sub_sd(th,t0); \ 453 bl = _mm_sub_sd(bl,t0); \ 454 bl = _mm_add_sd(bl,eh); \ 455 sh = _mm_add_sd(th,bl); \ 456 _mm_store_sd(&(a_hi),sh); \ 457 sh = _mm_sub_sd(sh,th); \ 458 bl = _mm_sub_sd(bl,sh); \ 459 _mm_store_sd(&(a_lo),bl) 460 #else 461 #define LIS_QUAD_ADD_SSE2_CORE(a_hi,a_lo) \ 462 bl = _mm_add_sd(eh,sh); \ 463 th = _mm_sub_sd(bl,eh); \ 464 t0 = _mm_sub_sd(bl,th); \ 465 sh = _mm_sub_sd(sh,th); \ 466 eh = _mm_sub_sd(eh,t0); \ 467 eh = _mm_add_sd(eh,sh); \ 468 eh = _mm_add_sd(eh,cl); \ 469 eh = _mm_add_sd(eh,wh); \ 470 th = _mm_add_sd(bl,eh); \ 471 _mm_store_sd(&(a_hi),th); \ 472 th = _mm_sub_sd(th,bl); \ 473 eh = _mm_sub_sd(eh,th); \ 474 _mm_store_sd(&(a_lo),eh) 475 #endif 476 477 /********************************************************** 478 * LIS_QUAD_MUL_SSE2(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) * 479 ********************************************************** 480 (a_hi,a_lo) <- (b_hi,b_lo) x (c_hi,c_lo) 481 **********************************************************/ 482 483 #define LIS_QUAD_MUL_SSE2(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) \ 484 LIS_QUAD_MUL_SSE2_CORE((b_hi),(b_lo),(c_hi),(c_lo)); \ 485 LIS_QUAD_MUL_SSE2_STORE((a_hi),(a_lo)) 486 487 /********************************************************** 488 * LIS_QUAD_MULD_SSE2(a_hi,a_lo,b_hi,b_lo,c) * 489 ********************************************************** 490 (a_hi,a_lo) <- (b_hi,b_lo) x c 491 **********************************************************/ 492 493 #define LIS_QUAD_MULD_SSE2(a_hi,a_lo,b_hi,b_lo,c) \ 494 LIS_QUAD_MULD_SSE2_CORE((b_hi),(b_lo),(c)); \ 495 LIS_QUAD_MUL_SSE2_STORE((a_hi),(a_lo)) 496 497 /********************************************************** 498 * LIS_QUAD_SQR_SSE2(a_hi,a_lo,b_hi,b_lo) * 499 ********************************************************** 500 (a_hi,a_lo) <- (b_hi,b_lo) x (b_hi,b_lo) 501 **********************************************************/ 502 503 #define LIS_QUAD_SQR_SSE2(a_hi,a_lo,b_hi,b_lo) \ 504 LIS_QUAD_SQR_SSE2_CORE((b_hi),(b_lo)); \ 505 LIS_QUAD_MUL_SSE2_STORE((a_hi),(a_lo)) 506 507 /********************************************************** 508 * LIS_QUAD_ADD_SSE2(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) * 509 ********************************************************** 510 (a_hi,a_lo) <- (b_hi,b_lo) + (c_hi,c_lo) 511 **********************************************************/ 512 513 #define LIS_QUAD_ADD_SSE2(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) \ 514 LIS_QUAD_ADD_SSE2_LOAD((b_hi),(b_lo),(c_hi),(c_lo)); \ 515 LIS_QUAD_ADD_SSE2_CORE((a_hi),(a_lo)) 516 517 /********************************************************** 518 * LIS_QUAD_DIV_SSE2(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) * 519 ********************************************************** 520 (a_hi,a_lo) <- (b_hi,b_lo) / (c_hi,c_lo) 521 **********************************************************/ 522 523 #define LIS_QUAD_DIV_SSE2(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) \ 524 sh = _mm_set_pd(SPLITTER,SPLITTER); \ 525 bh = _mm_set_pd((b_lo),(b_hi)); \ 526 ch = _mm_set_pd((c_lo),(c_hi)); \ 527 p2 = bh; \ 528 wh = ch; \ 529 p2 = _mm_div_sd(p2,ch); \ 530 wh = _mm_unpacklo_pd(wh,p2); \ 531 ch = _mm_move_sd(ch,p2); \ 532 p2 = wh; \ 533 sh = _mm_mul_pd(sh,wh); \ 534 th = _mm_sub_pd(sh,wh); \ 535 sh = _mm_sub_pd(sh,th); \ 536 ch = _mm_mul_pd(ch,wh); \ 537 wh = _mm_sub_pd(wh,sh); \ 538 th = sh; \ 539 p1 = wh; \ 540 th = _mm_unpackhi_pd(th,th); \ 541 p1 = _mm_unpackhi_pd(p1,p1); \ 542 eh = th; \ 543 th = _mm_mul_sd(th,wh); \ 544 eh = _mm_mul_sd(eh,sh); \ 545 wh = _mm_mul_sd(wh,p1); \ 546 sh = _mm_mul_sd(sh,p1); \ 547 p1 = ch; \ 548 p1 = _mm_unpackhi_pd(p1,p1); \ 549 eh = _mm_sub_sd(eh,ch); \ 550 eh = _mm_add_sd(eh,sh); \ 551 eh = _mm_add_sd(eh,th); \ 552 eh = _mm_add_sd(eh,wh); \ 553 eh = _mm_add_sd(eh,p1); \ 554 p1 = ch; \ 555 p1 = _mm_add_sd(p1,eh); \ 556 wh = p1; \ 557 wh = _mm_sub_sd(wh,ch); \ 558 sh = bh; \ 559 sh = _mm_sub_sd(sh,p1); \ 560 eh = _mm_sub_sd(eh,wh); \ 561 th = sh; \ 562 th = _mm_sub_sd(th,bh); \ 563 p1 = _mm_add_sd(p1,th); \ 564 wh = sh; \ 565 wh = _mm_sub_sd(wh,th); \ 566 th = bh; \ 567 th = _mm_unpackhi_pd(th,th); \ 568 bh = _mm_sub_sd(bh,wh); \ 569 bh = _mm_sub_sd(bh,p1); \ 570 bh = _mm_sub_sd(bh,eh); \ 571 bh = _mm_add_sd(bh,th); \ 572 th = p2; \ 573 th = _mm_unpackhi_pd(th,th); \ 574 eh = th; \ 575 sh = _mm_add_sd(sh,bh); \ 576 sh = _mm_div_sd(sh,p2); \ 577 th = _mm_add_sd(th,sh); \ 578 _mm_store_sd(&(a_hi),th); \ 579 th = _mm_sub_sd(th,eh); \ 580 sh = _mm_sub_sd(sh,th); \ 581 _mm_store_sd(&(a_lo),sh) 582 583 /********************************************************** 584 * LIS_QUAD_SQRT_SSE2(a_hi,a_lo,b_hi,b_lo) * 585 ********************************************************** 586 (a_hi,a_lo) <- SQRT( (b_hi,b_lo) ) 587 **********************************************************/ 588 589 #define LIS_QUAD_SQRT_SSE2(a_hi,a_lo,b_hi,b_lo) \ 590 if( (b_hi)==0 ) \ 591 { \ 592 (a_hi) = (a_lo) = 0.0; \ 593 return LIS_FAILS; \ 594 } \ 595 if( (b_hi)<0 ) \ 596 { \ 597 printf("ERROR bh=%e\n",(b_hi)); \ 598 (a_hi) = (a_lo) = 0.0; \ 599 return LIS_FAILS; \ 600 } \ 601 wh = _mm_set_sd(SPLITTER); \ 602 bh = _mm_load_pd(&(b_hi)); \ 603 bh = bh; \ 604 t0 = _mm_set_sd(1.0); \ 605 t1 = _mm_set_sd(0.5); \ 606 t2 = _mm_sqrt_pd(bh); \ 607 t0 = _mm_div_sd(t0,t2); \ 608 t2 = _mm_mul_sd(bh,t0); \ 609 t0 = _mm_mul_sd(t0,t1); \ 610 p1 = _mm_mul_sd(t2,t2); \ 611 wh = _mm_mul_sd(wh,t2); \ 612 t1 = _mm_sub_sd(wh,t2); \ 613 wh = _mm_sub_sd(wh,t1); \ 614 wl = _mm_sub_sd(t2,wh); \ 615 t1 = _mm_mul_sd(wh,wh); \ 616 wh = _mm_add_sd(wh,wh); \ 617 wh = _mm_mul_sd(wh,wl); \ 618 wl = _mm_mul_sd(wl,wl); \ 619 t1 = _mm_sub_sd(t1,p1); \ 620 t1 = _mm_add_sd(t1,wh); \ 621 t1 = _mm_add_sd(t1,wl); \ 622 p1 = _mm_unpacklo_pd(p1,t1); \ 623 sh = _mm_sub_pd(bh,p1); \ 624 eh = _mm_sub_pd(sh,bh); \ 625 th = _mm_sub_pd(sh,eh); \ 626 p1 = _mm_add_pd(p1,eh); \ 627 bh = _mm_sub_pd(bh,th); \ 628 bh = _mm_sub_pd(bh,p1); \ 629 eh = _mm_unpackhi_pd(bh,bh); \ 630 ch = _mm_unpackhi_pd(sh,sh); \ 631 th = sh; \ 632 bh = _mm_add_sd(bh,ch); \ 633 th = _mm_add_sd(th,bh); \ 634 sh = _mm_sub_sd(th,sh); \ 635 bh = _mm_sub_sd(bh,sh); \ 636 bh = _mm_add_sd(bh,eh); \ 637 th = _mm_add_sd(th,bh); \ 638 t0 = _mm_mul_sd(t0,th); \ 639 p1 = _mm_add_sd(t2,t0); \ 640 _mm_store_sd(&(a_hi),p1); \ 641 p1 = _mm_sub_sd(p1,t2); \ 642 t0 = _mm_sub_sd(t0,p1); \ 643 _mm_store_sd(&(a_lo),t0) 644 645 /*************************************************************** 646 * LIS_QUAD_FMA_SSE2(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) * 647 *************************************************************** 648 (d_hi,d_lo) <- (a_hi,a_lo) + (b_hi,b_lo) * (c_hi,c_lo) 649 ***************************************************************/ 650 651 #define LIS_QUAD_FMA_SSE2(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) \ 652 LIS_QUAD_MUL_SSE2_CORE((b_hi),(b_lo),(c_hi),(c_lo)); \ 653 LIS_QUAD_FMA_SSE2_LOAD((a_hi),(a_lo)); \ 654 LIS_QUAD_ADD_SSE2_CORE((d_hi),(d_lo)) 655 656 /*************************************************************** 657 * LIS_QUAD_FMAD_SSE2(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo,c) * 658 *************************************************************** 659 (d_hi,d_lo) <- (a_hi,a_lo) + (b_hi,b_lo) * c 660 ***************************************************************/ 661 662 #define LIS_QUAD_FMAD_SSE2(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo,c) \ 663 LIS_QUAD_MULD_SSE2_CORE((b_hi),(b_lo),(c)); \ 664 LIS_QUAD_FMA_SSE2_LOAD((a_hi),(a_lo)); \ 665 LIS_QUAD_ADD_SSE2_CORE((d_hi),(d_lo)) 666 667 /*************************************************************** 668 * LIS_QUAD_FSA_SSE2(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo) * 669 *************************************************************** 670 (d_hi,d_lo) <- (a_hi,a_lo) + (b_hi,b_lo) * (b_hi,b_lo) 671 ***************************************************************/ 672 673 #define LIS_QUAD_FSA_SSE2(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo) \ 674 LIS_QUAD_SQR_SSE2_CORE((b_hi),(b_lo)); \ 675 LIS_QUAD_FMA_SSE2_LOAD((a_hi),(a_lo)); \ 676 LIS_QUAD_ADD_SSE2_CORE((d_hi),(d_lo)) 677 678 /********************************************************** 679 * * 680 * SSE2(PD) * 681 * * 682 **********************************************************/ 683 684 /********************************************************** 685 * LIS_QUAD_MUL2_SSE2_LOAD(b_hi,b_lo,c_hi,c_lo) * 686 ********************************************************** 687 (b_hi,b_lo) (c_hi,c_lo) 688 **********************************************************/ 689 690 #define LIS_QUAD_MUL2_SSE2_LOAD(b_hi,b_lo,c_hi,c_lo) \ 691 t0 = _mm_set_pd(SPLITTER,SPLITTER); \ 692 bh = _mm_loadu_pd(&(b_hi)); \ 693 ch = _mm_loadu_pd(&(c_hi)); \ 694 bl = _mm_loadu_pd(&(b_lo)); \ 695 cl = _mm_loadu_pd(&(c_lo)) 696 697 /********************************************************** 698 * LIS_QUAD_MULD2_SSE2_LOAD(b_hi,b_lo,c) * 699 ********************************************************** 700 (b_hi,b_lo) (c) 701 **********************************************************/ 702 703 #define LIS_QUAD_MULD2_SSE2_LOAD(b_hi,b_lo,c) \ 704 t0 = _mm_set_pd(SPLITTER,SPLITTER); \ 705 bh = _mm_loadu_pd(&(b_hi)); \ 706 bl = _mm_loadu_pd(&(b_lo)); \ 707 ch = _mm_loadu_pd(&(c)) 708 #define LIS_QUAD_MULD2_SSE2_LOAD_SD(b0_hi,b0_lo,b1_hi,b1_lo,c) \ 709 t0 = _mm_set_pd(SPLITTER,SPLITTER); \ 710 bh = _mm_set_pd((b1_hi),(b0_hi)); \ 711 bl = _mm_set_pd((b1_lo),(b0_lo)); \ 712 ch = _mm_loadu_pd(&(c)) 713 714 /********************************************************** 715 * LIS_QUAD_SQR2_SSE2_LOAD(b_hi,b_lo) * 716 ********************************************************** 717 (b_hi,b_lo) 718 **********************************************************/ 719 720 #define LIS_QUAD_SQR2_SSE2_LOAD(b_hi,b_lo) \ 721 ch = _mm_set_pd(SPLITTER,SPLITTER); \ 722 bh = _mm_loadu_pd(&(b_hi)); \ 723 bl = _mm_loadu_pd(&(b_lo)) 724 725 /********************************************************** 726 * LIS_QUAD_MUL2_SSE2_STORE(a_hi,a_lo) * 727 ********************************************************** 728 (a_hi,a_lo) <- 729 **********************************************************/ 730 731 #define LIS_QUAD_MUL2_SSE2_STORE(a_hi,a_lo) \ 732 _mm_storeu_pd(&(a_hi),ch); \ 733 ch = _mm_sub_pd(ch,p1); \ 734 p2 = _mm_sub_pd(p2,ch); \ 735 _mm_storeu_pd(&(a_lo),p2) 736 #define LIS_QUAD_MUL2_SSE2_STOREU(a_hi,a_lo) \ 737 _mm_storeu_pd(&(a_hi),ch); \ 738 ch = _mm_sub_pd(ch,p1); \ 739 p2 = _mm_sub_pd(p2,ch); \ 740 _mm_storeu_pd(&(a_lo),p2) 741 742 /********************************************************** 743 * LIS_QUAD_ADD2_SSE2_LOAD(b_hi,b_lo,c_hi,c_lo) * 744 ********************************************************** 745 (b_hi,b_lo) (c_hi,c_lo) 746 **********************************************************/ 747 748 #ifndef USE_FAST_QUAD_ADD 749 #define LIS_QUAD_ADD2_SSE2_LOAD(b_hi,b_lo,c_hi,c_lo) \ 750 sh = _mm_set_pd((b_lo),(b_hi)); \ 751 bl = _mm_set_pd((c_lo),(c_hi)) 752 #else 753 #define LIS_QUAD_ADD2_SSE2_LOAD(b_hi,b_lo,c_hi,c_lo) \ 754 eh = _mm_set_sd((b_hi)); \ 755 cl = _mm_set_sd((b_lo)); \ 756 sh = _mm_set_sd((c_hi)); \ 757 wh = _mm_set_sd((c_lo)) 758 #endif 759 760 /********************************************************** 761 * LIS_QUAD_ADD2_SSE2_STORE(a_hi,a_lo) * 762 ********************************************************** 763 (a_hi,a_lo) <- 764 **********************************************************/ 765 766 #define LIS_QUAD_ADD2_SSE2_STORE(a_hi,a_lo) \ 767 _mm_storeu_pd(&(a_hi),sh); \ 768 sh = _mm_sub_pd(sh,th); \ 769 bh = _mm_sub_pd(bh,sh); \ 770 _mm_storeu_pd(&(a_lo),bh) 771 #define LIS_QUAD_ADD2_SSE2_STORE_SD(a0_hi,a0_lo,a1_hi,a1_lo) \ 772 _mm_storel_pd(&(a0_hi),sh); \ 773 _mm_storeh_pd(&(a1_hi),sh); \ 774 sh = _mm_sub_pd(sh,th); \ 775 bh = _mm_sub_pd(bh,sh); \ 776 _mm_storel_pd(&(a0_lo),bh); \ 777 _mm_storeh_pd(&(a1_lo),bh) 778 779 /********************************************************** 780 * LIS_QUAD_FMA2_SSE2_LOAD(a_hi,a_lo) * 781 ********************************************************** 782 (a_hi,a_lo) 783 **********************************************************/ 784 785 #define LIS_QUAD_FMA2_SSE2_LOAD(a_hi,a_lo) \ 786 t1 = _mm_sub_pd(ch,p1); \ 787 p2 = _mm_sub_pd(p2,t1); \ 788 bh = _mm_loadu_pd(&(a_hi)); \ 789 bl = _mm_loadu_pd(&(a_lo)) 790 #define LIS_QUAD_FMA2_SSE2_LOAD_SD(a0_hi,a0_lo,a1_hi,a1_lo) \ 791 t1 = _mm_sub_pd(ch,p1); \ 792 p2 = _mm_sub_pd(p2,t1); \ 793 bh = _mm_set_pd((a1_hi),(a0_hi)); \ 794 bl = _mm_set_pd((a1_lo),(a0_lo)) 795 796 /********************************************************** 797 * LIS_QUAD_MUL2_SSE2_CORE * 798 ********************************************************** 799 (b_hi,b_lo) x (c_hi,c_lo) 800 **********************************************************/ 801 802 #define LIS_QUAD_MUL2_SSE2_CORE \ 803 p1 = _mm_mul_pd(bh,ch); \ 804 sh = _mm_mul_pd(t0,bh); \ 805 sl = _mm_mul_pd(t0,ch); \ 806 th = _mm_sub_pd(sh,bh); \ 807 tl = _mm_sub_pd(sl,ch); \ 808 sh = _mm_sub_pd(sh,th); \ 809 sl = _mm_sub_pd(sl,tl); \ 810 t1 = _mm_mul_pd(bh,cl); \ 811 wh = _mm_sub_pd(bh,sh); \ 812 t2 = _mm_mul_pd(ch,bl); \ 813 wl = _mm_sub_pd(ch,sl); \ 814 t0 = _mm_mul_pd(wh,wl); \ 815 p2 = _mm_mul_pd(sh,sl); \ 816 sh = _mm_mul_pd(sh,wl); \ 817 sl = _mm_mul_pd(sl,wh); \ 818 p2 = _mm_sub_pd(p2,p1); \ 819 p2 = _mm_add_pd(p2,sh); \ 820 p2 = _mm_add_pd(p2,sl); \ 821 p2 = _mm_add_pd(p2,t0); \ 822 p2 = _mm_add_pd(p2,t1); \ 823 p2 = _mm_add_pd(p2,t2); \ 824 ch = _mm_add_pd(p1,p2) 825 826 /********************************************************** 827 * LIS_QUAD_MULD2_SSE2_CORE * 828 ********************************************************** 829 (b_hi,b_lo) x c 830 **********************************************************/ 831 832 #define LIS_QUAD_MULD2_SSE2_CORE \ 833 p1 = _mm_mul_pd(bh,ch); \ 834 bl = _mm_mul_pd(bl,ch); \ 835 sh = _mm_mul_pd(t0,bh); \ 836 th = _mm_sub_pd(sh,bh); \ 837 sh = _mm_sub_pd(sh,th); \ 838 bh = _mm_sub_pd(bh,sh); \ 839 sl = _mm_mul_pd(t0,ch); \ 840 tl = _mm_sub_pd(sl,ch); \ 841 sl = _mm_sub_pd(sl,tl); \ 842 ch = _mm_sub_pd(ch,sl); \ 843 t2 = _mm_mul_pd(bh,ch); \ 844 p2 = _mm_mul_pd(sh,sl); \ 845 t0 = _mm_mul_pd(sh,ch); \ 846 t1 = _mm_mul_pd(sl,bh); \ 847 p2 = _mm_sub_pd(p2,p1); \ 848 p2 = _mm_add_pd(p2,t0); \ 849 p2 = _mm_add_pd(p2,t1); \ 850 p2 = _mm_add_pd(p2,t2); \ 851 p2 = _mm_add_pd(p2,bl); \ 852 ch = _mm_add_pd(p1,p2) 853 854 /********************************************************** 855 * LIS_QUAD_SQR2_SSE2_CORE * 856 ********************************************************** 857 (b_hi,b_lo) x (b_hi,b_lo) 858 **********************************************************/ 859 860 #define LIS_QUAD_SQR2_SSE2_CORE \ 861 p1 = _mm_mul_pd(bh,bh); \ 862 ch = _mm_mul_pd(ch,bh); \ 863 p2 = _mm_sub_pd(ch,bh); \ 864 ch = _mm_sub_pd(ch,p2); \ 865 cl = _mm_sub_pd(bh,ch); \ 866 p2 = _mm_mul_pd(ch,ch); \ 867 ch = _mm_add_pd(ch,ch); \ 868 ch = _mm_mul_pd(ch,cl); \ 869 cl = _mm_mul_pd(cl,cl); \ 870 p2 = _mm_sub_pd(p2,p1); \ 871 p2 = _mm_add_pd(p2,ch); \ 872 p2 = _mm_add_pd(p2,cl); \ 873 bh = _mm_add_pd(bh,bh); \ 874 bh = _mm_mul_pd(bh,bl); \ 875 bl = _mm_mul_pd(bl,bl); \ 876 p2 = _mm_add_pd(p2,bh); \ 877 p2 = _mm_add_pd(p2,bl); \ 878 ch = _mm_add_pd(p1,p2) 879 880 /********************************************************** 881 * LIS_QUAD_ADD2_SSE2_CORE * 882 ********************************************************** 883 (b_hi,b_lo) + (c_hi,c_lo) 884 **********************************************************/ 885 886 #ifndef USE_FAST_QUAD_ADD 887 #define LIS_QUAD_ADD2_SSE2_CORE \ 888 sh = _mm_add_pd(bh,ch); \ 889 th = _mm_sub_pd(sh,bh); \ 890 t0 = _mm_sub_pd(sh,th); \ 891 ch = _mm_sub_pd(ch,th); \ 892 bh = _mm_sub_pd(bh,t0); \ 893 bh = _mm_add_pd(bh,ch); \ 894 sl = _mm_add_pd(bl,p2); \ 895 th = _mm_sub_pd(sl,bl); \ 896 t0 = _mm_sub_pd(sl,th); \ 897 p2 = _mm_sub_pd(p2,th); \ 898 bl = _mm_sub_pd(bl,t0); \ 899 bl = _mm_add_pd(bl,p2); \ 900 bh = _mm_add_pd(bh,sl); \ 901 th = sh; \ 902 th = _mm_add_pd(th,bh); \ 903 sh = _mm_sub_pd(th,sh); \ 904 bh = _mm_sub_pd(bh,sh); \ 905 bh = _mm_add_pd(bh,bl); \ 906 sh = _mm_add_pd(th,bh) 907 #else 908 #define LIS_QUAD_ADD2_SSE2_CORE \ 909 th = _mm_add_pd(bh,ch); \ 910 wh = _mm_sub_pd(th,bh); \ 911 t0 = _mm_sub_pd(th,wh); \ 912 ch = _mm_sub_pd(ch,wh); \ 913 bh = _mm_sub_pd(bh,t0); \ 914 bh = _mm_add_pd(bh,ch); \ 915 bh = _mm_add_pd(bh,bl); \ 916 bh = _mm_add_pd(bh,p2); \ 917 sh = _mm_add_pd(th,bh) 918 #endif 919 920 /********************************************************** 921 * LIS_QUAD_MUL2_SSE2(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) * 922 ********************************************************** 923 (a_hi,a_lo) <- (b_hi,b_lo) x (c_hi,c_lo) 924 **********************************************************/ 925 926 #define LIS_QUAD_MUL2_SSE2(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) \ 927 LIS_QUAD_MUL2_SSE2_LOAD((b_hi),(b_lo),(c_hi),(c_lo)); \ 928 LIS_QUAD_MUL2_SSE2_CORE; \ 929 LIS_QUAD_MUL2_SSE2_STORE((a_hi),(a_lo)) 930 931 /********************************************************** 932 * LIS_QUAD_MULD2_SSE2(a_hi,a_lo,b_hi,b_lo,c) * 933 ********************************************************** 934 (a_hi,a_lo) <- (b_hi,b_lo) x c 935 **********************************************************/ 936 937 #define LIS_QUAD_MULD2_SSE2(a_hi,a_lo,b_hi,b_lo,c) \ 938 LIS_QUAD_MULD2_SSE2_LOAD((b_hi),(b_lo),(c)); \ 939 LIS_QUAD_MULD2_SSE2_CORE; \ 940 LIS_QUAD_MUL2_SSE2_STORE((a_hi),(a_lo)) 941 942 /********************************************************** 943 * LIS_QUAD_SQR2_SSE2(a_hi,a_lo,b_hi,b_lo) * 944 ********************************************************** 945 (a_hi,a_lo) <- (b_hi,b_lo) x (b_hi,b_lo) 946 **********************************************************/ 947 948 #define LIS_QUAD_SQR2_SSE2(a_hi,a_lo,b_hi,b_lo) \ 949 LIS_QUAD_SQR2_SSE2_LOAD((b_hi),(b_lo)); \ 950 LIS_QUAD_SQR2_SSE2_CORE; \ 951 LIS_QUAD_MUL2_SSE2_STORE((a_hi),(a_lo)) 952 953 /********************************************************** 954 * LIS_QUAD_ADD2_SSE2(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) * 955 ********************************************************** 956 (a_hi,a_lo) <- (b_hi,b_lo) + (c_hi,c_lo) 957 **********************************************************/ 958 959 #define LIS_QUAD_ADD2_SSE2(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) \ 960 LIS_QUAD_ADD2_SSE2_LOAD((b_hi),(b_lo),(c_hi),(c_lo)); \ 961 LIS_QUAD_ADD2_SSE2_CORE; \ 962 LIS_QUAD_ADD2_SSE2_STORE((a_hi),(a_lo)) 963 964 /*************************************************************** 965 * LIS_QUAD_FMA2_SSE2(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) * 966 *************************************************************** 967 (d_hi,d_lo) <- (a_hi,a_lo) + (b_hi,b_lo) * (c_hi,c_lo) 968 ***************************************************************/ 969 970 #define LIS_QUAD_FMA2_SSE2(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) \ 971 LIS_QUAD_MUL2_SSE2_LOAD((b_hi),(b_lo),(c_hi),(c_lo)); \ 972 LIS_QUAD_MUL2_SSE2_CORE; \ 973 LIS_QUAD_FMA2_SSE2_LOAD((a_hi),(a_lo)); \ 974 LIS_QUAD_ADD2_SSE2_CORE; \ 975 LIS_QUAD_ADD2_SSE2_STORE((d_hi),(d_lo)) 976 977 /*************************************************************************** 978 * LIS_QUAD_FMAD2_SSE2(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo,c) * 979 * LIS_QUAD_FMAD2_SSE2_LDSD(d_hi,d_lo,a_hi,a_lo,b0_hi,b0_lo,b1_hi,b1_lo,c) * 980 *************************************************************************** 981 (d_hi,d_lo) <- (a_hi,a_lo) + (b_hi,b_lo) * c 982 ***************************************************************/ 983 984 #define LIS_QUAD_FMAD2_SSE2(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo,c) \ 985 LIS_QUAD_MULD2_SSE2_LOAD((b_hi),(b_lo),(c)); \ 986 LIS_QUAD_MULD2_SSE2_CORE; \ 987 LIS_QUAD_FMA2_SSE2_LOAD((a_hi),(a_lo)); \ 988 LIS_QUAD_ADD2_SSE2_CORE; \ 989 LIS_QUAD_ADD2_SSE2_STORE((d_hi),(d_lo)) 990 991 /****************************************************************/ 992 993 #define LIS_QUAD_FMAD2_SSE2_LDSD(d_hi,d_lo,a_hi,a_lo,b0_hi,b0_lo,b1_hi,b1_lo,c) \ 994 LIS_QUAD_MULD2_SSE2_LOAD_SD((b0_hi),(b0_lo),(b1_hi),(b1_lo),(c)); \ 995 LIS_QUAD_MULD2_SSE2_CORE; \ 996 LIS_QUAD_FMA2_SSE2_LOAD((a_hi),(a_lo)); \ 997 LIS_QUAD_ADD2_SSE2_CORE; \ 998 LIS_QUAD_ADD2_SSE2_STORE((d_hi),(d_lo)) 999 1000 /****************************************************************/ 1001 1002 #define LIS_QUAD_FMAD2_SSE2_STSD(d0_hi,d0_lo,d1_hi,d1_lo,a0_hi,a0_lo,a1_hi,a1_lo,b0_hi,b0_lo,b1_hi,b1_lo,c) \ 1003 LIS_QUAD_MULD2_SSE2_LOAD_SD((b0_hi),(b0_lo),(b1_hi),(b1_lo),(c)); \ 1004 LIS_QUAD_MULD2_SSE2_CORE; \ 1005 LIS_QUAD_FMA2_SSE2_LOAD_SD((a0_hi),(a0_lo),(a1_hi),(a1_lo)); \ 1006 LIS_QUAD_ADD2_SSE2_CORE; \ 1007 LIS_QUAD_ADD2_SSE2_STORE_SD((d0_hi),(d0_lo),(d1_hi),(d1_lo)) 1008 1009 /*************************************************************** 1010 * LIS_QUAD_FSA2_SSE2(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo) * 1011 *************************************************************** 1012 (d_hi,d_lo) <- (a_hi,a_lo) + (b_hi,b_lo) * (b_hi,b_lo) 1013 ***************************************************************/ 1014 1015 #define LIS_QUAD_FSA2_SSE2(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo) \ 1016 LIS_QUAD_SQR2_SSE2_LOAD((b_hi),(b_lo)); \ 1017 LIS_QUAD_SQR2_SSE2_CORE; \ 1018 LIS_QUAD_FMA2_SSE2_LOAD((a_hi),(a_lo)); \ 1019 LIS_QUAD_ADD2_SSE2_CORE; \ 1020 LIS_QUAD_ADD2_SSE2_STORE((d_hi),(d_lo)) 1021 1022 1023 1024 1025 1026 extern double *lis_quad_scalar_tmp; 1027 1028 #define LIS_QUAD_SCALAR_MALLOC(s,pos,num) \ 1029 (s).hi = &lis_quad_scalar_tmp[2*(pos)]; \ 1030 (s).lo = &lis_quad_scalar_tmp[2*(pos)+(num)] 1031 1032 #ifdef __cplusplus 1033 extern "C" 1034 { 1035 #endif 1036 1037 extern void lis_quad_x87_fpu_init(LIS_UNSIGNED_INT *cw_old); 1038 extern void lis_quad_x87_fpu_finalize(LIS_UNSIGNED_INT cw); 1039 1040 extern void lis_quad_minus(LIS_QUAD *a); 1041 extern void lis_quad_zero(LIS_QUAD *a); 1042 extern void lis_quad_one(LIS_QUAD *a); 1043 extern void lis_quad_min(LIS_QUAD *a, LIS_QUAD *b, LIS_QUAD *c); 1044 extern void lis_quad_max(LIS_QUAD *a, LIS_QUAD *b, LIS_QUAD *c); 1045 1046 extern void lis_quad_add(LIS_QUAD *a, const LIS_QUAD *b, const LIS_QUAD *c); 1047 extern void lis_quad_sub(LIS_QUAD *a, const LIS_QUAD *b, const LIS_QUAD *c); 1048 extern void lis_quad_mul(LIS_QUAD *a, const LIS_QUAD *b, const LIS_QUAD *c); 1049 extern void lis_quad_mul_dd_d(LIS_QUAD *a, const LIS_QUAD *b, const double c); 1050 extern void lis_quad_sqr(LIS_QUAD *a, const LIS_QUAD *b); 1051 extern void lis_quad_div(LIS_QUAD *a, const LIS_QUAD *b, const LIS_QUAD *c); 1052 extern LIS_INT lis_quad_sqrt(LIS_QUAD *a, const LIS_QUAD *b); 1053 1054 #ifdef __cplusplus 1055 } 1056 #endif 1057 1058 #endif 1059