1 /* Copyright (C) 2005 The Scalable Software Infrastructure Project. All rights reserved.
2 
3    Redistribution and use in source and binary forms, with or without
4    modification, are permitted provided that the following conditions are met:
5    1. Redistributions of source code must retain the above copyright
6       notice, this list of conditions and the following disclaimer.
7    2. Redistributions in binary form must reproduce the above copyright
8       notice, this list of conditions and the following disclaimer in the
9       documentation and/or other materials provided with the distribution.
10    3. Neither the name of the project nor the names of its contributors
11       may be used to endorse or promote products derived from this software
12       without specific prior written permission.
13 
14    THIS SOFTWARE IS PROVIDED BY THE SCALABLE SOFTWARE INFRASTRUCTURE PROJECT
15    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
16    TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17    PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE SCALABLE SOFTWARE INFRASTRUCTURE
18    PROJECT BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
19    OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
20    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
21    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
22    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
24    POSSIBILITY OF SUCH DAMAGE.
25 */
26 
27 
28 #ifndef __LIS_PRECISION_H__
29 #define __LIS_PRECISION_H__
30 
31 #ifdef USE_SSE2
32 	#include <emmintrin.h>
33 #endif
34 
35 #define SPLITTER 134217729.0
36 
37 #ifndef USE_QUAD_PRECISION
38 	#define LIS_QUAD_DECLAR
39 #else
40 #ifdef USE_SSE2
41 	#define LIS_QUAD_DECLAR __m128d bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh
42 #else
43 	#define LIS_QUAD_DECLAR double p1,p2,tq,bhi,blo,chi,clo,sh,eh,sl,el,th,tl
44 #endif
45 #endif
46 
47 /**********************************************************
48  * LIS_QUAD_MINUS(a_hi,a_lo)                              *
49  **********************************************************
50   (a_hi,a_lo) <- (-a_hi,-a_lo)
51  **********************************************************/
52 
53 #define LIS_QUAD_MINUS(a_hi,a_lo)   \
54 				(a_hi) = -(a_hi); \
55 				(a_lo) = -(a_lo)
56 
57 /**********************************************************
58  * LIS_QUAD_ZERO(a_hi,a_lo)                               *
59  **********************************************************
60   (a_hi,a_lo) <- (0,0)
61  **********************************************************/
62 
63 #define LIS_QUAD_ZERO(a_hi,a_lo)   \
64 				(a_hi) = 0.0; \
65 				(a_lo) = 0.0
66 
67 /**********************************************************
68  * LIS_QUAD_ONE(a_hi,a_lo)                                *
69  **********************************************************
70   (a_hi,a_lo) <- (1,0)
71  **********************************************************/
72 
73 #define LIS_QUAD_ONE(a_hi,a_lo)   \
74 				(a_hi) = 1.0; \
75 				(a_lo) = 0.0
76 
77 /**********************************************************
78  * LIS_QUAD_FAST_TWO_SUM(a,b,r,e)                         *
79  **********************************************************
80   |a| > |b|
81   a + b -> (r,e)
82  **********************************************************/
83 
84 #define LIS_QUAD_FAST_TWO_SUM(a,b,r,e)   \
85 				(r) = (a) + (b); \
86 				(e) = (b)  - ((r) - (a))
87 
88 /**********************************************************
89  * LIS_QUAD_TWO_SUM(a,b,r,e)                              *
90  **********************************************************
91   a + b -> (r,e)
92  **********************************************************/
93 
94 #define LIS_QUAD_TWO_SUM(a,b,r,e)   \
95 				(r) = (a) + (b); \
96 				th  = (r) - (a); \
97 				(e) = ((a) - ((r) - th)) + ((b) - th)
98 
99 /**********************************************************
100  * LIS_QUAD_TWO_DIFF(a,b,r,e)                             *
101  **********************************************************
102   a - b -> (r,e)
103  **********************************************************/
104 
105 #define LIS_QUAD_TWO_DIFF(a,b,r,e)   \
106 				(r) = (a) - (b); \
107 				th  = (r) - (a); \
108 				(e) = ((a) - ((r) - th)) - ((b) + th)
109 
110 /**********************************************************
111  * LIS_QUAD_SPLIT(b,b_hi,b_lo)                            *
112  **********************************************************
113   b -> (b_hi,b_lo)
114  **********************************************************/
115 
116 #define LIS_QUAD_SPLIT(b,b_hi,b_lo)   \
117 				tq     = SPLITTER * (b); \
118 				(b_hi) = tq - (tq-(b));  \
119 				(b_lo) = (b) - (b_hi);   \
120 
121 /**********************************************************
122  * LIS_QUAD_TWO_PROD(a,b,r,e)                             *
123  **********************************************************
124   a x b -> (r,e)
125  **********************************************************/
126 
127 #ifndef HAS_FMA
128 #define LIS_QUAD_TWO_PROD(a,b,r,e)   \
129 				(r) = (a) * (b); \
130 				LIS_QUAD_SPLIT((a),bhi,blo); \
131 				LIS_QUAD_SPLIT((b),chi,clo); \
132 				(e) = ((bhi*chi-(r))+bhi*clo+blo*chi)+blo*clo
133 #else
134 #define LIS_QUAD_TWO_PROD(a,b,r,e)   \
135 				(r) = (-(a)) * (b); \
136 				(e) = (a) * (b) + (r)
137 #endif
138 
139 /**********************************************************
140  * LIS_QUAD_TWO_SQR(a,r,e)                                *
141  **********************************************************
142   a x a -> (r,e)
143  **********************************************************/
144 
145 #ifndef HAS_FMA
146 #define LIS_QUAD_TWO_SQR(a,r,e)   \
147 				(r) = (a) * (a); \
148 				LIS_QUAD_SPLIT((a),bhi,blo); \
149 				(e) = ((bhi*bhi-(r))+2.0*bhi*blo)+blo*blo
150 #else
151 #define LIS_QUAD_TWO_SQR(a,r,e)   \
152 				(r) = (-(a)) * (a); \
153 				(e) = (a) * (a) + (r)
154 #endif
155 
156 /**********************************************************
157  * LIS_QUAD_MUL(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo)            *
158  **********************************************************
159   (a_hi,a_lo) <- (b_hi,b_lo) x (c_hi,c_lo)
160  **********************************************************/
161 
162 #define LIS_QUAD_MUL(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) \
163 				LIS_QUAD_TWO_PROD((b_hi),(c_hi),p1,p2); \
164 				p2 += ((b_hi) * (c_lo)); \
165 				p2 += ((b_lo) * (c_hi)); \
166 				LIS_QUAD_FAST_TWO_SUM(p1,p2,(a_hi),(a_lo))
167 
168 /**********************************************************
169  * LIS_QUAD_MULD(a_hi,a_lo,b_hi,b_lo,c)                   *
170  **********************************************************
171   (a_hi,a_lo) <- (b_hi,b_lo) x c
172  **********************************************************/
173 
174 #define LIS_QUAD_MULD(a_hi,a_lo,b_hi,b_lo,c) \
175 				LIS_QUAD_TWO_PROD((b_hi),(c),p1,p2); \
176 				p2 += ((b_lo) * (c)); \
177 				LIS_QUAD_FAST_TWO_SUM(p1,p2,(a_hi),(a_lo))
178 
179 /**********************************************************
180  * LIS_QUAD_SQR(a_hi,a_lo,b_hi,b_lo)                      *
181  **********************************************************
182   (a_hi,a_lo) <- (b_hi,b_lo) x (b_hi,b_lo)
183  **********************************************************/
184 
185 #define LIS_QUAD_SQR(a_hi,a_lo,b_hi,b_lo) \
186 				LIS_QUAD_TWO_SQR((b_hi),p1,p2); \
187 				p2 += (2.0*(b_hi) * (b_lo)); \
188 				p2 += ((b_lo) * (b_lo)); \
189 				LIS_QUAD_FAST_TWO_SUM(p1,p2,(a_hi),(a_lo))
190 
191 /**********************************************************
192  * LIS_QUAD_ADD(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo)            *
193  **********************************************************
194   (a_hi,a_lo) <- (b_hi,b_lo) + (c_hi,c_lo)
195  **********************************************************/
196 
197 #ifndef USE_FAST_QUAD_ADD
198 #define LIS_QUAD_ADD(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) \
199 				LIS_QUAD_TWO_SUM((b_hi),(c_hi),sh,eh); \
200 				LIS_QUAD_TWO_SUM((b_lo),(c_lo),sl,el); \
201 				eh += sl; \
202 				LIS_QUAD_FAST_TWO_SUM(sh,eh,sh,eh); \
203 				eh += el; \
204 				LIS_QUAD_FAST_TWO_SUM(sh,eh,(a_hi),(a_lo))
205 #else
206 #define LIS_QUAD_ADD(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) \
207 				LIS_QUAD_TWO_SUM((b_hi),(c_hi),sh,eh); \
208 				eh += (b_lo); \
209 				eh += (c_lo); \
210 				LIS_QUAD_FAST_TWO_SUM(sh,eh,(a_hi),(a_lo))
211 #endif
212 
213 /**********************************************************
214  * LIS_QUAD_DIV(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo)            *
215  **********************************************************
216   (a_hi,a_lo) <- (b_hi,b_lo) / (c_hi,c_lo)
217  **********************************************************/
218 
219 #define LIS_QUAD_DIV(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) \
220 				tl  = (b_hi) / (c_hi); \
221 				LIS_QUAD_MULD(eh,el,(c_hi),(c_lo),tl); \
222 				LIS_QUAD_TWO_DIFF((b_hi),eh,sh,sl); \
223 				sl -= el; \
224 				sl += (b_lo); \
225 				th  = (sh+sl) / (c_hi); \
226 				LIS_QUAD_FAST_TWO_SUM(tl,th,(a_hi),(a_lo))
227 
228 /**********************************************************
229  * LIS_QUAD_SQRT(a_hi,a_lo,b_hi,b_lo)                     *
230  **********************************************************
231   (a_hi,a_lo) <- SQRT( (b_hi,b_lo) )
232  **********************************************************/
233 
234 #define LIS_QUAD_SQRT(a_hi,a_lo,b_hi,b_lo) \
235 				if( (b_hi)==0 ) \
236 				{ \
237 					(a_hi) = (a_lo) = 0.0; \
238 					return LIS_FAILS; \
239 				} \
240 				if( (b_hi)<0 ) \
241 				{ \
242 					printf("ERROR bh=%e\n",(b_hi)); \
243 					(a_hi) = (a_lo) = 0.0; \
244 					return LIS_FAILS; \
245 				} \
246 				p1 = 1.0 / sqrt((b_hi)); \
247 				p2 = (b_hi) * p1; \
248 				p1 = p1 * 0.5; \
249 				LIS_QUAD_TWO_SQR(p2,chi,clo); \
250 				LIS_QUAD_ADD(th,eh,(b_hi),(b_lo),-chi,-clo); \
251 				p1 = p1 * th; \
252 				LIS_QUAD_FAST_TWO_SUM(p1,p2,(a_hi),(a_lo))
253 
254 /**********************************************************
255  * LIS_QUAD_FMA(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo,c_hi,c_lo)  *
256  **********************************************************
257   (d_hi,d_lo) <- (a_hi,a_lo) + (b_hi,b_lo) * (c_hi,c_lo)
258  **********************************************************/
259 
260 #define LIS_QUAD_FMA(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) \
261 				LIS_QUAD_MUL(chi,p2,(b_hi),(b_lo),(c_hi),(c_lo)); \
262 				LIS_QUAD_ADD((d_hi),(d_lo),(a_hi),(a_lo),chi,p2)
263 
264 /**********************************************************
265  * LIS_QUAD_FMAD(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo,c)         *
266  **********************************************************
267   (d_hi,d_lo) <- (a_hi,a_lo) + (b_hi,b_lo) * c
268  **********************************************************/
269 
270 #define LIS_QUAD_FMAD(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo,c) \
271 				LIS_QUAD_MULD(chi,p2,(b_hi),(b_lo),(c)); \
272 				LIS_QUAD_ADD((d_hi),(d_lo),(a_hi),(a_lo),chi,p2)
273 
274 /**********************************************************
275  * LIS_QUAD_FSA(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo)            *
276  **********************************************************
277   (d_hi,d_lo) <- (a_hi,a_lo) + (b_hi,b_lo) * (b_hi,b_lo)
278  **********************************************************/
279 
280 #define LIS_QUAD_FSA(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo) \
281 				LIS_QUAD_SQR(bhi,p2,(b_hi),(b_lo)); \
282 				LIS_QUAD_ADD((d_hi),(d_lo),(a_hi),(a_lo),bhi,p2)
283 
284 /**********************************************************
285  *                                                        *
286  *                      SSE2(SD)                          *
287  *                                                        *
288  **********************************************************/
289 
290 /**********************************************************
291  * LIS_QUAD_MUL_SSE2_STORE(a_hi,a_lo)                     *
292  **********************************************************
293   (a_hi,a_lo) <-
294  **********************************************************/
295 
296 #define LIS_QUAD_MUL_SSE2_STORE(a_hi,a_lo) \
297 				_mm_store_sd(&(a_hi),sh); \
298 				sh = _mm_sub_sd(sh,bh); \
299 				wh = _mm_sub_sd(wh,sh); \
300 				_mm_store_sd(&(a_lo),wh)
301 
302 /**********************************************************
303  * LIS_QUAD_ADD_SSE2_LOAD(b_hi,b_lo,c_hi,c_lo)            *
304  **********************************************************
305   (b_hi,b_lo)  (c_hi,c_lo)
306  **********************************************************/
307 
308 #ifndef USE_FAST_QUAD_ADD
309 #define LIS_QUAD_ADD_SSE2_LOAD(b_hi,b_lo,c_hi,c_lo) \
310 				sh = _mm_set_pd((b_lo),(b_hi)); \
311 				bl = _mm_set_pd((c_lo),(c_hi))
312 #else
313 #define LIS_QUAD_ADD_SSE2_LOAD(b_hi,b_lo,c_hi,c_lo) \
314  				eh = _mm_set_sd((b_hi)); \
315 				cl = _mm_set_sd((b_lo)); \
316 				sh = _mm_set_sd((c_hi)); \
317 				wh = _mm_set_sd((c_lo))
318 #endif
319 
320 /**********************************************************
321  * LIS_QUAD_FMA_SSE2_LOAD(a_hi,a_lo)                      *
322  **********************************************************
323   (a_hi,a_lo)
324  **********************************************************/
325 
326 #ifndef USE_FAST_QUAD_ADD
327 #define LIS_QUAD_FMA_SSE2_LOAD(a_hi,a_lo) \
328 				th = _mm_sub_sd(sh,bh); \
329 				wh = _mm_sub_sd(wh,th); \
330 				bl = _mm_set_pd((a_lo),(a_hi)); \
331 				sh = _mm_unpacklo_pd(sh,wh)
332 #else
333 #define LIS_QUAD_FMA_SSE2_LOAD(a_hi,a_lo) \
334 				th = _mm_sub_sd(sh,bh); \
335 				wh = _mm_sub_sd(wh,th); \
336  				eh = _mm_set_sd((a_hi)); \
337 				cl = _mm_set_sd((a_lo))
338 #endif
339 
340 /**********************************************************
341  * LIS_QUAD_MUL_SSE2_CORE(b_hi,b_lo,c_hi,c_lo)            *
342  **********************************************************
343   (b_hi,b_lo) x (c_hi,c_lo)
344  **********************************************************/
345 
346 #define LIS_QUAD_MUL_SSE2_CORE(b_hi,b_lo,c_hi,c_lo) \
347 				sh = _mm_set_pd(SPLITTER,SPLITTER); \
348 				ch = _mm_set_pd((c_hi),(b_hi)); \
349 				bh = _mm_unpackhi_pd(ch,ch); \
350 				tl = _mm_set_pd((b_lo),(c_lo)); \
351 				bh = _mm_mul_sd(bh,ch); \
352 				sh = _mm_mul_pd(sh,ch); \
353 				th = _mm_sub_pd(sh,ch); \
354 				sh = _mm_sub_pd(sh,th); \
355 				th = _mm_mul_pd(ch,tl); \
356 				ch = _mm_sub_pd(ch,sh); \
357 				eh = _mm_unpackhi_pd(sh,ch); \
358 				wh = _mm_unpacklo_pd(sh,ch); \
359 				ch = _mm_unpackhi_pd(ch,wh);  \
360 				wh = _mm_mul_pd(wh,eh); \
361 				sh = _mm_mul_pd(sh,ch); \
362 				ch = _mm_unpackhi_pd(sh,sh); \
363 				tl = _mm_unpackhi_pd(wh,wh); \
364 				eh = _mm_unpackhi_pd(th,th); \
365 				wh = _mm_sub_sd(wh,bh); \
366 				wh = _mm_add_sd(wh,ch); \
367 				wh = _mm_add_sd(wh,sh); \
368 				wh = _mm_add_sd(wh,tl); \
369 				wh = _mm_add_sd(wh,eh); \
370 				wh = _mm_add_sd(wh,th); \
371 				sh = _mm_add_sd(wh,bh)
372 
373 /**********************************************************
374  * LIS_QUAD_MULD_SSE2_CORE(b_hi,b_lo,c)                   *
375  **********************************************************
376   (b_hi,b_lo) x c
377  **********************************************************/
378 
379 #define LIS_QUAD_MULD_SSE2_CORE(b_hi,b_lo,c) \
380 				sh = _mm_set_pd(SPLITTER,SPLITTER); \
381 				ch = _mm_set_pd((b_hi),(c)); \
382 				bh = _mm_unpackhi_pd(ch,ch); \
383 				sl = _mm_load_sd(&(b_lo)); \
384 				bh = _mm_mul_sd(bh,ch); \
385 				sl = _mm_mul_sd(sl,ch); \
386 				sh = _mm_mul_pd(sh,ch); \
387 				th = _mm_sub_pd(sh,ch); \
388 				sh = _mm_sub_pd(sh,th); \
389 				ch = _mm_sub_pd(ch,sh); \
390 				t1 = _mm_unpackhi_pd(sh,ch); \
391 				wh = _mm_unpacklo_pd(sh,ch); \
392 				ch = _mm_unpackhi_pd(ch,wh);  \
393 				wh = _mm_mul_pd(wh,t1); \
394 				sh = _mm_mul_pd(sh,ch); \
395 				ch = _mm_unpackhi_pd(sh,sh); \
396 				th = _mm_unpackhi_pd(wh,wh); \
397 				wh = _mm_sub_sd(wh,bh); \
398 				wh = _mm_add_sd(wh,ch); \
399 				wh = _mm_add_sd(wh,sh); \
400 				wh = _mm_add_sd(wh,th); \
401 				wh = _mm_add_sd(wh,sl); \
402 				sh = _mm_add_sd(wh,bh)
403 
404 /**********************************************************
405  * LIS_QUAD_SQR_SSE2_CORE(b_hi,b_lo)                      *
406  **********************************************************
407   (b_hi,b_lo) x (b_hi,b_lo)
408  **********************************************************/
409 
410 #define LIS_QUAD_SQR_SSE2_CORE(b_hi,b_lo) \
411 				sh = _mm_set_sd(SPLITTER); \
412 				th = _mm_load_sd(&(b_hi)); \
413 				bl = _mm_load_sd(&(b_lo)); \
414 				bh = _mm_mul_sd(th,th); \
415 				sh = _mm_mul_sd(sh,th); \
416 				wh = _mm_sub_sd(sh,th); \
417 				sh = _mm_sub_sd(sh,wh); \
418 				cl = _mm_sub_sd(th,sh); \
419 				wh = _mm_mul_sd(sh,sh); \
420 				sh = _mm_add_sd(sh,sh); \
421 				sh = _mm_mul_sd(sh,cl); \
422 				cl = _mm_mul_sd(cl,cl); \
423 				wh = _mm_sub_sd(wh,bh); \
424 				wh = _mm_add_sd(wh,sh); \
425 				wh = _mm_add_sd(wh,cl); \
426 				th = _mm_add_sd(th,th); \
427 				th = _mm_mul_sd(th,bl); \
428 				bl = _mm_mul_sd(bl,bl); \
429 				wh = _mm_add_sd(wh,th); \
430 				wh = _mm_add_sd(wh,bl); \
431 				sh = _mm_add_sd(wh,bh)
432 
433 /**********************************************************
434  * LIS_QUAD_ADD_SSE2_CORE(a_hi,a_lo)                      *
435  **********************************************************
436   (a_hi,a_lo) <- (b_hi,b_lo) + (c_hi,c_lo)
437  **********************************************************/
438 
439 #ifndef USE_FAST_QUAD_ADD
440 #define LIS_QUAD_ADD_SSE2_CORE(a_hi,a_lo) \
441 				t0 = _mm_add_pd(bl,sh); \
442 				eh = _mm_sub_pd(t0,bl); \
443 				th = _mm_sub_pd(t0,eh); \
444 				sh = _mm_sub_pd(sh,eh); \
445 				bl = _mm_sub_pd(bl,th); \
446 				bl = _mm_add_pd(bl,sh); \
447 				eh = _mm_unpackhi_pd(bl,bl); \
448 				sh = _mm_unpackhi_pd(t0,t0); \
449 				th = t0; \
450 				bl = _mm_add_sd(bl,sh); \
451 				th = _mm_add_sd(th,bl); \
452 				t0 = _mm_sub_sd(th,t0); \
453 				bl = _mm_sub_sd(bl,t0); \
454 				bl = _mm_add_sd(bl,eh); \
455 				sh = _mm_add_sd(th,bl); \
456 				_mm_store_sd(&(a_hi),sh); \
457 				sh = _mm_sub_sd(sh,th); \
458 				bl = _mm_sub_sd(bl,sh); \
459 				_mm_store_sd(&(a_lo),bl)
460 #else
461 #define LIS_QUAD_ADD_SSE2_CORE(a_hi,a_lo) \
462 				bl = _mm_add_sd(eh,sh); \
463 				th = _mm_sub_sd(bl,eh); \
464 				t0 = _mm_sub_sd(bl,th); \
465 				sh = _mm_sub_sd(sh,th); \
466 				eh = _mm_sub_sd(eh,t0); \
467 				eh = _mm_add_sd(eh,sh); \
468 				eh = _mm_add_sd(eh,cl); \
469 				eh = _mm_add_sd(eh,wh); \
470 				th = _mm_add_sd(bl,eh); \
471 				_mm_store_sd(&(a_hi),th); \
472 				th = _mm_sub_sd(th,bl); \
473 				eh = _mm_sub_sd(eh,th); \
474 				_mm_store_sd(&(a_lo),eh)
475 #endif
476 
477 /**********************************************************
478  * LIS_QUAD_MUL_SSE2(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo)       *
479  **********************************************************
480   (a_hi,a_lo) <- (b_hi,b_lo) x (c_hi,c_lo)
481  **********************************************************/
482 
483 #define LIS_QUAD_MUL_SSE2(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) \
484 				LIS_QUAD_MUL_SSE2_CORE((b_hi),(b_lo),(c_hi),(c_lo)); \
485 				LIS_QUAD_MUL_SSE2_STORE((a_hi),(a_lo))
486 
487 /**********************************************************
488  * LIS_QUAD_MULD_SSE2(a_hi,a_lo,b_hi,b_lo,c)              *
489  **********************************************************
490   (a_hi,a_lo) <- (b_hi,b_lo) x c
491  **********************************************************/
492 
493 #define LIS_QUAD_MULD_SSE2(a_hi,a_lo,b_hi,b_lo,c) \
494 				LIS_QUAD_MULD_SSE2_CORE((b_hi),(b_lo),(c)); \
495 				LIS_QUAD_MUL_SSE2_STORE((a_hi),(a_lo))
496 
497 /**********************************************************
498  * LIS_QUAD_SQR_SSE2(a_hi,a_lo,b_hi,b_lo)                 *
499  **********************************************************
500   (a_hi,a_lo) <- (b_hi,b_lo) x (b_hi,b_lo)
501  **********************************************************/
502 
503 #define LIS_QUAD_SQR_SSE2(a_hi,a_lo,b_hi,b_lo) \
504 				LIS_QUAD_SQR_SSE2_CORE((b_hi),(b_lo)); \
505 				LIS_QUAD_MUL_SSE2_STORE((a_hi),(a_lo))
506 
507 /**********************************************************
508  * LIS_QUAD_ADD_SSE2(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo)       *
509  **********************************************************
510   (a_hi,a_lo) <- (b_hi,b_lo) + (c_hi,c_lo)
511  **********************************************************/
512 
513 #define LIS_QUAD_ADD_SSE2(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) \
514 				LIS_QUAD_ADD_SSE2_LOAD((b_hi),(b_lo),(c_hi),(c_lo)); \
515 				LIS_QUAD_ADD_SSE2_CORE((a_hi),(a_lo))
516 
517 /**********************************************************
518  * LIS_QUAD_DIV_SSE2(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo)       *
519  **********************************************************
520   (a_hi,a_lo) <- (b_hi,b_lo) / (c_hi,c_lo)
521  **********************************************************/
522 
523 #define LIS_QUAD_DIV_SSE2(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) \
524 				sh = _mm_set_pd(SPLITTER,SPLITTER); \
525 				bh = _mm_set_pd((b_lo),(b_hi)); \
526 				ch = _mm_set_pd((c_lo),(c_hi)); \
527 				p2 = bh; \
528 				wh = ch; \
529 				p2 = _mm_div_sd(p2,ch); \
530 				wh = _mm_unpacklo_pd(wh,p2); \
531 				ch = _mm_move_sd(ch,p2); \
532 				p2 = wh; \
533 				sh = _mm_mul_pd(sh,wh); \
534 				th = _mm_sub_pd(sh,wh); \
535 				sh = _mm_sub_pd(sh,th); \
536 				ch = _mm_mul_pd(ch,wh); \
537 				wh = _mm_sub_pd(wh,sh); \
538 				th = sh; \
539 				p1 = wh; \
540 				th = _mm_unpackhi_pd(th,th); \
541 				p1 = _mm_unpackhi_pd(p1,p1); \
542 				eh = th; \
543 				th = _mm_mul_sd(th,wh); \
544 				eh = _mm_mul_sd(eh,sh); \
545 				wh = _mm_mul_sd(wh,p1); \
546 				sh = _mm_mul_sd(sh,p1); \
547 				p1 = ch; \
548 				p1 = _mm_unpackhi_pd(p1,p1); \
549 				eh = _mm_sub_sd(eh,ch); \
550 				eh = _mm_add_sd(eh,sh); \
551 				eh = _mm_add_sd(eh,th); \
552 				eh = _mm_add_sd(eh,wh); \
553 				eh = _mm_add_sd(eh,p1); \
554 				p1 = ch; \
555 				p1 = _mm_add_sd(p1,eh); \
556 				wh = p1; \
557 				wh = _mm_sub_sd(wh,ch); \
558 				sh = bh; \
559 				sh = _mm_sub_sd(sh,p1); \
560 				eh = _mm_sub_sd(eh,wh); \
561 				th = sh; \
562 				th = _mm_sub_sd(th,bh); \
563 				p1 = _mm_add_sd(p1,th); \
564 				wh = sh; \
565 				wh = _mm_sub_sd(wh,th); \
566 				th = bh; \
567 				th = _mm_unpackhi_pd(th,th); \
568 				bh = _mm_sub_sd(bh,wh); \
569 				bh = _mm_sub_sd(bh,p1); \
570 				bh = _mm_sub_sd(bh,eh); \
571 				bh = _mm_add_sd(bh,th); \
572 				th = p2; \
573 				th = _mm_unpackhi_pd(th,th); \
574 				eh = th; \
575 				sh = _mm_add_sd(sh,bh); \
576 				sh = _mm_div_sd(sh,p2); \
577 				th = _mm_add_sd(th,sh); \
578 				_mm_store_sd(&(a_hi),th); \
579 				th = _mm_sub_sd(th,eh); \
580 				sh = _mm_sub_sd(sh,th); \
581 				_mm_store_sd(&(a_lo),sh)
582 
583 /**********************************************************
584  * LIS_QUAD_SQRT_SSE2(a_hi,a_lo,b_hi,b_lo)                *
585  **********************************************************
586   (a_hi,a_lo) <- SQRT( (b_hi,b_lo) )
587  **********************************************************/
588 
589 #define LIS_QUAD_SQRT_SSE2(a_hi,a_lo,b_hi,b_lo) \
590 				if( (b_hi)==0 ) \
591 				{ \
592 					(a_hi) = (a_lo) = 0.0; \
593 					return LIS_FAILS; \
594 				} \
595 				if( (b_hi)<0 ) \
596 				{ \
597 					printf("ERROR bh=%e\n",(b_hi)); \
598 					(a_hi) = (a_lo) = 0.0; \
599 					return LIS_FAILS; \
600 				} \
601 				wh = _mm_set_sd(SPLITTER); \
602 				bh = _mm_load_pd(&(b_hi)); \
603 				bh = bh; \
604 				t0 = _mm_set_sd(1.0); \
605 				t1 = _mm_set_sd(0.5); \
606 				t2 = _mm_sqrt_pd(bh); \
607 				t0 = _mm_div_sd(t0,t2); \
608 				t2 = _mm_mul_sd(bh,t0); \
609 				t0 = _mm_mul_sd(t0,t1); \
610 				p1 = _mm_mul_sd(t2,t2); \
611 				wh = _mm_mul_sd(wh,t2); \
612 				t1 = _mm_sub_sd(wh,t2); \
613 				wh = _mm_sub_sd(wh,t1); \
614 				wl = _mm_sub_sd(t2,wh); \
615 				t1 = _mm_mul_sd(wh,wh); \
616 				wh = _mm_add_sd(wh,wh); \
617 				wh = _mm_mul_sd(wh,wl); \
618 				wl = _mm_mul_sd(wl,wl); \
619 				t1 = _mm_sub_sd(t1,p1); \
620 				t1 = _mm_add_sd(t1,wh); \
621 				t1 = _mm_add_sd(t1,wl); \
622 				p1 = _mm_unpacklo_pd(p1,t1); \
623 				sh = _mm_sub_pd(bh,p1); \
624 				eh = _mm_sub_pd(sh,bh); \
625 				th = _mm_sub_pd(sh,eh); \
626 				p1 = _mm_add_pd(p1,eh); \
627 				bh = _mm_sub_pd(bh,th); \
628 				bh = _mm_sub_pd(bh,p1); \
629 				eh = _mm_unpackhi_pd(bh,bh); \
630 				ch = _mm_unpackhi_pd(sh,sh); \
631 				th = sh; \
632 				bh = _mm_add_sd(bh,ch); \
633 				th = _mm_add_sd(th,bh); \
634 				sh = _mm_sub_sd(th,sh); \
635 				bh = _mm_sub_sd(bh,sh); \
636 				bh = _mm_add_sd(bh,eh); \
637 				th = _mm_add_sd(th,bh); \
638 				t0 = _mm_mul_sd(t0,th); \
639 				p1 = _mm_add_sd(t2,t0); \
640 				_mm_store_sd(&(a_hi),p1); \
641 				p1 = _mm_sub_sd(p1,t2); \
642 				t0 = _mm_sub_sd(t0,p1); \
643 				_mm_store_sd(&(a_lo),t0)
644 
645 /***************************************************************
646  * LIS_QUAD_FMA_SSE2(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo,c_hi,c_lo)  *
647  ***************************************************************
648   (d_hi,d_lo) <- (a_hi,a_lo) + (b_hi,b_lo) * (c_hi,c_lo)
649  ***************************************************************/
650 
651 #define LIS_QUAD_FMA_SSE2(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) \
652 				LIS_QUAD_MUL_SSE2_CORE((b_hi),(b_lo),(c_hi),(c_lo)); \
653 				LIS_QUAD_FMA_SSE2_LOAD((a_hi),(a_lo)); \
654 				LIS_QUAD_ADD_SSE2_CORE((d_hi),(d_lo))
655 
656 /***************************************************************
657  * LIS_QUAD_FMAD_SSE2(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo,c)         *
658  ***************************************************************
659   (d_hi,d_lo) <- (a_hi,a_lo) + (b_hi,b_lo) * c
660  ***************************************************************/
661 
662 #define LIS_QUAD_FMAD_SSE2(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo,c) \
663 				LIS_QUAD_MULD_SSE2_CORE((b_hi),(b_lo),(c)); \
664 				LIS_QUAD_FMA_SSE2_LOAD((a_hi),(a_lo)); \
665 				LIS_QUAD_ADD_SSE2_CORE((d_hi),(d_lo))
666 
667 /***************************************************************
668  * LIS_QUAD_FSA_SSE2(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo)            *
669  ***************************************************************
670   (d_hi,d_lo) <- (a_hi,a_lo) + (b_hi,b_lo) * (b_hi,b_lo)
671  ***************************************************************/
672 
673 #define LIS_QUAD_FSA_SSE2(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo) \
674 				LIS_QUAD_SQR_SSE2_CORE((b_hi),(b_lo)); \
675 				LIS_QUAD_FMA_SSE2_LOAD((a_hi),(a_lo)); \
676 				LIS_QUAD_ADD_SSE2_CORE((d_hi),(d_lo))
677 
678 /**********************************************************
679  *                                                        *
680  *                      SSE2(PD)                          *
681  *                                                        *
682  **********************************************************/
683 
684 /**********************************************************
685  * LIS_QUAD_MUL2_SSE2_LOAD(b_hi,b_lo,c_hi,c_lo)           *
686  **********************************************************
687   (b_hi,b_lo)  (c_hi,c_lo)
688  **********************************************************/
689 
690 #define LIS_QUAD_MUL2_SSE2_LOAD(b_hi,b_lo,c_hi,c_lo) \
691 				t0 = _mm_set_pd(SPLITTER,SPLITTER); \
692 				bh = _mm_loadu_pd(&(b_hi)); \
693 				ch = _mm_loadu_pd(&(c_hi)); \
694 				bl = _mm_loadu_pd(&(b_lo)); \
695 				cl = _mm_loadu_pd(&(c_lo))
696 
697 /**********************************************************
698  * LIS_QUAD_MULD2_SSE2_LOAD(b_hi,b_lo,c)                  *
699  **********************************************************
700   (b_hi,b_lo)  (c)
701  **********************************************************/
702 
703 #define LIS_QUAD_MULD2_SSE2_LOAD(b_hi,b_lo,c) \
704 				t0 = _mm_set_pd(SPLITTER,SPLITTER); \
705 				bh = _mm_loadu_pd(&(b_hi)); \
706 				bl = _mm_loadu_pd(&(b_lo)); \
707 				ch = _mm_loadu_pd(&(c))
708 #define LIS_QUAD_MULD2_SSE2_LOAD_SD(b0_hi,b0_lo,b1_hi,b1_lo,c) \
709 				t0 = _mm_set_pd(SPLITTER,SPLITTER); \
710 				bh = _mm_set_pd((b1_hi),(b0_hi)); \
711 				bl = _mm_set_pd((b1_lo),(b0_lo)); \
712 				ch = _mm_loadu_pd(&(c))
713 
714 /**********************************************************
715  * LIS_QUAD_SQR2_SSE2_LOAD(b_hi,b_lo)                     *
716  **********************************************************
717   (b_hi,b_lo)
718  **********************************************************/
719 
720 #define LIS_QUAD_SQR2_SSE2_LOAD(b_hi,b_lo) \
721 				ch = _mm_set_pd(SPLITTER,SPLITTER); \
722 				bh = _mm_loadu_pd(&(b_hi)); \
723 				bl = _mm_loadu_pd(&(b_lo))
724 
725 /**********************************************************
726  * LIS_QUAD_MUL2_SSE2_STORE(a_hi,a_lo)                    *
727  **********************************************************
728   (a_hi,a_lo) <-
729  **********************************************************/
730 
731 #define LIS_QUAD_MUL2_SSE2_STORE(a_hi,a_lo) \
732 				_mm_storeu_pd(&(a_hi),ch); \
733 				ch = _mm_sub_pd(ch,p1); \
734 				p2 = _mm_sub_pd(p2,ch); \
735 				_mm_storeu_pd(&(a_lo),p2)
736 #define LIS_QUAD_MUL2_SSE2_STOREU(a_hi,a_lo) \
737 				_mm_storeu_pd(&(a_hi),ch); \
738 				ch = _mm_sub_pd(ch,p1); \
739 				p2 = _mm_sub_pd(p2,ch); \
740 				_mm_storeu_pd(&(a_lo),p2)
741 
742 /**********************************************************
743  * LIS_QUAD_ADD2_SSE2_LOAD(b_hi,b_lo,c_hi,c_lo)           *
744  **********************************************************
745   (b_hi,b_lo)  (c_hi,c_lo)
746  **********************************************************/
747 
748 #ifndef USE_FAST_QUAD_ADD
749 #define LIS_QUAD_ADD2_SSE2_LOAD(b_hi,b_lo,c_hi,c_lo) \
750 				sh = _mm_set_pd((b_lo),(b_hi)); \
751 				bl = _mm_set_pd((c_lo),(c_hi))
752 #else
753 #define LIS_QUAD_ADD2_SSE2_LOAD(b_hi,b_lo,c_hi,c_lo) \
754  				eh = _mm_set_sd((b_hi)); \
755 				cl = _mm_set_sd((b_lo)); \
756 				sh = _mm_set_sd((c_hi)); \
757 				wh = _mm_set_sd((c_lo))
758 #endif
759 
760 /**********************************************************
761  * LIS_QUAD_ADD2_SSE2_STORE(a_hi,a_lo)                    *
762  **********************************************************
763   (a_hi,a_lo) <-
764  **********************************************************/
765 
766 #define LIS_QUAD_ADD2_SSE2_STORE(a_hi,a_lo) \
767 				_mm_storeu_pd(&(a_hi),sh); \
768 				sh = _mm_sub_pd(sh,th); \
769 				bh = _mm_sub_pd(bh,sh); \
770 				_mm_storeu_pd(&(a_lo),bh)
771 #define LIS_QUAD_ADD2_SSE2_STORE_SD(a0_hi,a0_lo,a1_hi,a1_lo) \
772 				_mm_storel_pd(&(a0_hi),sh); \
773 				_mm_storeh_pd(&(a1_hi),sh); \
774 				sh = _mm_sub_pd(sh,th); \
775 				bh = _mm_sub_pd(bh,sh); \
776 				_mm_storel_pd(&(a0_lo),bh); \
777 				_mm_storeh_pd(&(a1_lo),bh)
778 
779 /**********************************************************
780  * LIS_QUAD_FMA2_SSE2_LOAD(a_hi,a_lo)                     *
781  **********************************************************
782   (a_hi,a_lo)
783  **********************************************************/
784 
785 #define LIS_QUAD_FMA2_SSE2_LOAD(a_hi,a_lo) \
786 				t1 = _mm_sub_pd(ch,p1); \
787 				p2 = _mm_sub_pd(p2,t1); \
788 				bh = _mm_loadu_pd(&(a_hi)); \
789 				bl = _mm_loadu_pd(&(a_lo))
790 #define LIS_QUAD_FMA2_SSE2_LOAD_SD(a0_hi,a0_lo,a1_hi,a1_lo) \
791 				t1 = _mm_sub_pd(ch,p1); \
792 				p2 = _mm_sub_pd(p2,t1); \
793 				bh = _mm_set_pd((a1_hi),(a0_hi)); \
794 				bl = _mm_set_pd((a1_lo),(a0_lo))
795 
796 /**********************************************************
797  * LIS_QUAD_MUL2_SSE2_CORE                                *
798  **********************************************************
799   (b_hi,b_lo) x (c_hi,c_lo)
800  **********************************************************/
801 
802 #define LIS_QUAD_MUL2_SSE2_CORE \
803 				p1 = _mm_mul_pd(bh,ch); \
804 				sh = _mm_mul_pd(t0,bh); \
805 				sl = _mm_mul_pd(t0,ch); \
806 				th = _mm_sub_pd(sh,bh); \
807 				tl = _mm_sub_pd(sl,ch); \
808 				sh = _mm_sub_pd(sh,th); \
809 				sl = _mm_sub_pd(sl,tl); \
810 				t1 = _mm_mul_pd(bh,cl); \
811 				wh = _mm_sub_pd(bh,sh); \
812 				t2 = _mm_mul_pd(ch,bl); \
813 				wl = _mm_sub_pd(ch,sl); \
814 				t0 = _mm_mul_pd(wh,wl); \
815 				p2 = _mm_mul_pd(sh,sl); \
816 				sh = _mm_mul_pd(sh,wl); \
817 				sl = _mm_mul_pd(sl,wh); \
818 				p2 = _mm_sub_pd(p2,p1); \
819 				p2 = _mm_add_pd(p2,sh); \
820 				p2 = _mm_add_pd(p2,sl); \
821 				p2 = _mm_add_pd(p2,t0); \
822 				p2 = _mm_add_pd(p2,t1); \
823 				p2 = _mm_add_pd(p2,t2); \
824 				ch = _mm_add_pd(p1,p2)
825 
826 /**********************************************************
827  * LIS_QUAD_MULD2_SSE2_CORE                               *
828  **********************************************************
829   (b_hi,b_lo) x c
830  **********************************************************/
831 
832 #define LIS_QUAD_MULD2_SSE2_CORE \
833 				p1 = _mm_mul_pd(bh,ch); \
834 				bl = _mm_mul_pd(bl,ch); \
835 				sh = _mm_mul_pd(t0,bh); \
836 				th = _mm_sub_pd(sh,bh); \
837 				sh = _mm_sub_pd(sh,th); \
838 				bh = _mm_sub_pd(bh,sh); \
839 				sl = _mm_mul_pd(t0,ch); \
840 				tl = _mm_sub_pd(sl,ch); \
841 				sl = _mm_sub_pd(sl,tl); \
842 				ch = _mm_sub_pd(ch,sl); \
843 				t2 = _mm_mul_pd(bh,ch); \
844 				p2 = _mm_mul_pd(sh,sl); \
845 				t0 = _mm_mul_pd(sh,ch); \
846 				t1 = _mm_mul_pd(sl,bh); \
847 				p2 = _mm_sub_pd(p2,p1); \
848 				p2 = _mm_add_pd(p2,t0); \
849 				p2 = _mm_add_pd(p2,t1); \
850 				p2 = _mm_add_pd(p2,t2); \
851 				p2 = _mm_add_pd(p2,bl); \
852 				ch = _mm_add_pd(p1,p2)
853 
854 /**********************************************************
855  * LIS_QUAD_SQR2_SSE2_CORE                                *
856  **********************************************************
857   (b_hi,b_lo) x (b_hi,b_lo)
858  **********************************************************/
859 
860 #define LIS_QUAD_SQR2_SSE2_CORE \
861 				p1 = _mm_mul_pd(bh,bh); \
862 				ch = _mm_mul_pd(ch,bh); \
863 				p2 = _mm_sub_pd(ch,bh); \
864 				ch = _mm_sub_pd(ch,p2); \
865 				cl = _mm_sub_pd(bh,ch); \
866 				p2 = _mm_mul_pd(ch,ch); \
867 				ch = _mm_add_pd(ch,ch); \
868 				ch = _mm_mul_pd(ch,cl); \
869 				cl = _mm_mul_pd(cl,cl); \
870 				p2 = _mm_sub_pd(p2,p1); \
871 				p2 = _mm_add_pd(p2,ch); \
872 				p2 = _mm_add_pd(p2,cl); \
873 				bh = _mm_add_pd(bh,bh); \
874 				bh = _mm_mul_pd(bh,bl); \
875 				bl = _mm_mul_pd(bl,bl); \
876 				p2 = _mm_add_pd(p2,bh); \
877 				p2 = _mm_add_pd(p2,bl); \
878 				ch = _mm_add_pd(p1,p2)
879 
880 /**********************************************************
881  * LIS_QUAD_ADD2_SSE2_CORE                                *
882  **********************************************************
883   (b_hi,b_lo) + (c_hi,c_lo)
884  **********************************************************/
885 
886 #ifndef USE_FAST_QUAD_ADD
887 #define LIS_QUAD_ADD2_SSE2_CORE \
888 				sh = _mm_add_pd(bh,ch); \
889 				th = _mm_sub_pd(sh,bh); \
890 				t0 = _mm_sub_pd(sh,th); \
891 				ch = _mm_sub_pd(ch,th); \
892 				bh = _mm_sub_pd(bh,t0); \
893 				bh = _mm_add_pd(bh,ch); \
894 				sl = _mm_add_pd(bl,p2); \
895 				th = _mm_sub_pd(sl,bl); \
896 				t0 = _mm_sub_pd(sl,th); \
897 				p2 = _mm_sub_pd(p2,th); \
898 				bl = _mm_sub_pd(bl,t0); \
899 				bl = _mm_add_pd(bl,p2); \
900 				bh = _mm_add_pd(bh,sl); \
901 				th = sh; \
902 				th = _mm_add_pd(th,bh); \
903 				sh = _mm_sub_pd(th,sh); \
904 				bh = _mm_sub_pd(bh,sh); \
905 				bh = _mm_add_pd(bh,bl); \
906 				sh = _mm_add_pd(th,bh)
907 #else
908 #define LIS_QUAD_ADD2_SSE2_CORE \
909 				th = _mm_add_pd(bh,ch); \
910 				wh = _mm_sub_pd(th,bh); \
911 				t0 = _mm_sub_pd(th,wh); \
912 				ch = _mm_sub_pd(ch,wh); \
913 				bh = _mm_sub_pd(bh,t0); \
914 				bh = _mm_add_pd(bh,ch); \
915 				bh = _mm_add_pd(bh,bl); \
916 				bh = _mm_add_pd(bh,p2); \
917 				sh = _mm_add_pd(th,bh)
918 #endif
919 
920 /**********************************************************
921  * LIS_QUAD_MUL2_SSE2(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo)      *
922  **********************************************************
923   (a_hi,a_lo) <- (b_hi,b_lo) x (c_hi,c_lo)
924  **********************************************************/
925 
926 #define LIS_QUAD_MUL2_SSE2(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) \
927 				LIS_QUAD_MUL2_SSE2_LOAD((b_hi),(b_lo),(c_hi),(c_lo)); \
928 				LIS_QUAD_MUL2_SSE2_CORE; \
929 				LIS_QUAD_MUL2_SSE2_STORE((a_hi),(a_lo))
930 
931 /**********************************************************
932  * LIS_QUAD_MULD2_SSE2(a_hi,a_lo,b_hi,b_lo,c)             *
933  **********************************************************
934   (a_hi,a_lo) <- (b_hi,b_lo) x c
935  **********************************************************/
936 
937 #define LIS_QUAD_MULD2_SSE2(a_hi,a_lo,b_hi,b_lo,c) \
938 				LIS_QUAD_MULD2_SSE2_LOAD((b_hi),(b_lo),(c)); \
939 				LIS_QUAD_MULD2_SSE2_CORE; \
940 				LIS_QUAD_MUL2_SSE2_STORE((a_hi),(a_lo))
941 
942 /**********************************************************
943  * LIS_QUAD_SQR2_SSE2(a_hi,a_lo,b_hi,b_lo)                *
944  **********************************************************
945   (a_hi,a_lo) <- (b_hi,b_lo) x (b_hi,b_lo)
946  **********************************************************/
947 
948 #define LIS_QUAD_SQR2_SSE2(a_hi,a_lo,b_hi,b_lo) \
949 				LIS_QUAD_SQR2_SSE2_LOAD((b_hi),(b_lo)); \
950 				LIS_QUAD_SQR2_SSE2_CORE; \
951 				LIS_QUAD_MUL2_SSE2_STORE((a_hi),(a_lo))
952 
953 /**********************************************************
954  * LIS_QUAD_ADD2_SSE2(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo)      *
955  **********************************************************
956   (a_hi,a_lo) <- (b_hi,b_lo) + (c_hi,c_lo)
957  **********************************************************/
958 
959 #define LIS_QUAD_ADD2_SSE2(a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) \
960 				LIS_QUAD_ADD2_SSE2_LOAD((b_hi),(b_lo),(c_hi),(c_lo)); \
961 				LIS_QUAD_ADD2_SSE2_CORE; \
962 				LIS_QUAD_ADD2_SSE2_STORE((a_hi),(a_lo))
963 
964 /***************************************************************
965  * LIS_QUAD_FMA2_SSE2(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) *
966  ***************************************************************
967   (d_hi,d_lo) <- (a_hi,a_lo) + (b_hi,b_lo) * (c_hi,c_lo)
968  ***************************************************************/
969 
970 #define LIS_QUAD_FMA2_SSE2(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo,c_hi,c_lo) \
971 				LIS_QUAD_MUL2_SSE2_LOAD((b_hi),(b_lo),(c_hi),(c_lo)); \
972 				LIS_QUAD_MUL2_SSE2_CORE; \
973 				LIS_QUAD_FMA2_SSE2_LOAD((a_hi),(a_lo)); \
974 				LIS_QUAD_ADD2_SSE2_CORE; \
975 				LIS_QUAD_ADD2_SSE2_STORE((d_hi),(d_lo))
976 
977 /***************************************************************************
978  * LIS_QUAD_FMAD2_SSE2(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo,c)                    *
979  * LIS_QUAD_FMAD2_SSE2_LDSD(d_hi,d_lo,a_hi,a_lo,b0_hi,b0_lo,b1_hi,b1_lo,c) *
980  ***************************************************************************
981   (d_hi,d_lo) <- (a_hi,a_lo) + (b_hi,b_lo) * c
982  ***************************************************************/
983 
984 #define LIS_QUAD_FMAD2_SSE2(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo,c) \
985 				LIS_QUAD_MULD2_SSE2_LOAD((b_hi),(b_lo),(c)); \
986 				LIS_QUAD_MULD2_SSE2_CORE; \
987 				LIS_QUAD_FMA2_SSE2_LOAD((a_hi),(a_lo)); \
988 				LIS_QUAD_ADD2_SSE2_CORE; \
989 				LIS_QUAD_ADD2_SSE2_STORE((d_hi),(d_lo))
990 
991 /****************************************************************/
992 
993 #define LIS_QUAD_FMAD2_SSE2_LDSD(d_hi,d_lo,a_hi,a_lo,b0_hi,b0_lo,b1_hi,b1_lo,c) \
994 				LIS_QUAD_MULD2_SSE2_LOAD_SD((b0_hi),(b0_lo),(b1_hi),(b1_lo),(c)); \
995 				LIS_QUAD_MULD2_SSE2_CORE; \
996 				LIS_QUAD_FMA2_SSE2_LOAD((a_hi),(a_lo)); \
997 				LIS_QUAD_ADD2_SSE2_CORE; \
998 				LIS_QUAD_ADD2_SSE2_STORE((d_hi),(d_lo))
999 
1000 /****************************************************************/
1001 
1002 #define LIS_QUAD_FMAD2_SSE2_STSD(d0_hi,d0_lo,d1_hi,d1_lo,a0_hi,a0_lo,a1_hi,a1_lo,b0_hi,b0_lo,b1_hi,b1_lo,c) \
1003 				LIS_QUAD_MULD2_SSE2_LOAD_SD((b0_hi),(b0_lo),(b1_hi),(b1_lo),(c)); \
1004 				LIS_QUAD_MULD2_SSE2_CORE; \
1005 				LIS_QUAD_FMA2_SSE2_LOAD_SD((a0_hi),(a0_lo),(a1_hi),(a1_lo)); \
1006 				LIS_QUAD_ADD2_SSE2_CORE; \
1007 				LIS_QUAD_ADD2_SSE2_STORE_SD((d0_hi),(d0_lo),(d1_hi),(d1_lo))
1008 
1009 /***************************************************************
1010  * LIS_QUAD_FSA2_SSE2(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo)           *
1011  ***************************************************************
1012   (d_hi,d_lo) <- (a_hi,a_lo) + (b_hi,b_lo) * (b_hi,b_lo)
1013  ***************************************************************/
1014 
1015 #define LIS_QUAD_FSA2_SSE2(d_hi,d_lo,a_hi,a_lo,b_hi,b_lo) \
1016 				LIS_QUAD_SQR2_SSE2_LOAD((b_hi),(b_lo)); \
1017 				LIS_QUAD_SQR2_SSE2_CORE; \
1018 				LIS_QUAD_FMA2_SSE2_LOAD((a_hi),(a_lo)); \
1019 				LIS_QUAD_ADD2_SSE2_CORE; \
1020 				LIS_QUAD_ADD2_SSE2_STORE((d_hi),(d_lo))
1021 
1022 
1023 
1024 
1025 
1026 extern double *lis_quad_scalar_tmp;
1027 
1028 #define LIS_QUAD_SCALAR_MALLOC(s,pos,num) \
1029 				(s).hi = &lis_quad_scalar_tmp[2*(pos)]; \
1030 				(s).lo = &lis_quad_scalar_tmp[2*(pos)+(num)]
1031 
1032 #ifdef __cplusplus
1033 extern "C"
1034 {
1035 #endif
1036 
1037 extern void lis_quad_x87_fpu_init(LIS_UNSIGNED_INT *cw_old);
1038 extern void lis_quad_x87_fpu_finalize(LIS_UNSIGNED_INT cw);
1039 
1040 extern void lis_quad_minus(LIS_QUAD *a);
1041 extern void lis_quad_zero(LIS_QUAD *a);
1042 extern void lis_quad_one(LIS_QUAD *a);
1043 extern void lis_quad_min(LIS_QUAD *a, LIS_QUAD *b, LIS_QUAD *c);
1044 extern void lis_quad_max(LIS_QUAD *a, LIS_QUAD *b, LIS_QUAD *c);
1045 
1046 extern void lis_quad_add(LIS_QUAD *a, const LIS_QUAD *b, const LIS_QUAD *c);
1047 extern void lis_quad_sub(LIS_QUAD *a, const LIS_QUAD *b, const LIS_QUAD *c);
1048 extern void lis_quad_mul(LIS_QUAD *a, const LIS_QUAD *b, const LIS_QUAD *c);
1049 extern void lis_quad_mul_dd_d(LIS_QUAD *a, const LIS_QUAD *b, const double c);
1050 extern void lis_quad_sqr(LIS_QUAD *a, const LIS_QUAD *b);
1051 extern void lis_quad_div(LIS_QUAD *a, const LIS_QUAD *b, const LIS_QUAD *c);
1052 extern LIS_INT  lis_quad_sqrt(LIS_QUAD *a, const LIS_QUAD *b);
1053 
1054 #ifdef __cplusplus
1055 }
1056 #endif
1057 
1058 #endif
1059