1 /* Copyright (C) 2005 The Scalable Software Infrastructure Project. All rights reserved.
2
3 Redistribution and use in source and binary forms, with or without
4 modification, are permitted provided that the following conditions are met:
5 1. Redistributions of source code must retain the above copyright
6 notice, this list of conditions and the following disclaimer.
7 2. Redistributions in binary form must reproduce the above copyright
8 notice, this list of conditions and the following disclaimer in the
9 documentation and/or other materials provided with the distribution.
10 3. Neither the name of the project nor the names of its contributors
11 may be used to endorse or promote products derived from this software
12 without specific prior written permission.
13
14 THIS SOFTWARE IS PROVIDED BY THE SCALABLE SOFTWARE INFRASTRUCTURE PROJECT
15 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
16 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE SCALABLE SOFTWARE INFRASTRUCTURE
18 PROJECT BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
19 OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
20 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
21 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
22 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
24 POSSIBILITY OF SUCH DAMAGE.
25 */
26
27 #ifdef HAVE_CONFIG_H
28 #include "lis_config.h"
29 #else
30 #ifdef HAVE_CONFIG_WIN_H
31 #include "lis_config_win.h"
32 #endif
33 #endif
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #ifdef HAVE_MALLOC_H
38 #include <malloc.h>
39 #endif
40 #include <string.h>
41 #include <math.h>
42 #ifdef USE_SSE2
43 #include <emmintrin.h>
44 #endif
45 #ifdef _OPENMP
46 #include <omp.h>
47 #endif
48 #ifdef USE_MPI
49 #include <mpi.h>
50 #endif
51 #include "lislib.h"
52
53
54 #ifdef USE_QUAD_PRECISION
lis_matvec_csr_mp(LIS_MATRIX A,LIS_VECTOR X,LIS_VECTOR Y)55 void lis_matvec_csr_mp(LIS_MATRIX A, LIS_VECTOR X, LIS_VECTOR Y)
56 {
57 LIS_INT i,j,n;
58 LIS_INT is,ie,j0;
59 LIS_INT *jj0;
60 LIS_SCALAR *vv0;
61 LIS_SCALAR *x,*y,*xl,*yl;
62 LIS_QUAD_DECLAR;
63
64
65 n = A->n;
66 x = X->value;
67 y = Y->value;
68 xl = X->value_lo;
69 yl = Y->value_lo;
70 if( A->is_splited )
71 {
72 #ifdef _OPENMP
73 #ifndef USE_SSE2
74 #pragma omp parallel for private(i,j,is,ie,j0,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
75 #else
76 #pragma omp parallel for private(i,j,is,ie,j0,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
77 #endif
78 #endif
79 for(i=0;i<n;i++)
80 {
81 #ifndef USE_SSE2
82 LIS_QUAD_MULD(y[i],yl[i],x[i],xl[i],A->D->value[i]);
83 #else
84 LIS_QUAD_MULD_SSE2(y[i],yl[i],x[i],xl[i],A->D->value[i]);
85 #endif
86 is = A->L->ptr[i];
87 ie = A->L->ptr[i+1];
88 for(j=is;j<ie-0;j+=1)
89 {
90 j0 = A->L->index[j+0];
91 #ifndef USE_SSE2
92 LIS_QUAD_FMAD(y[i],yl[i],y[i],yl[i],x[j0],xl[j0],A->L->value[j]);
93 #else
94 LIS_QUAD_FMAD_SSE2(y[i],yl[i],y[i],yl[i],x[j0],xl[j0],A->L->value[j]);
95 #endif
96 }
97 is = A->U->ptr[i];
98 ie = A->U->ptr[i+1];
99 for(j=is;j<ie-0;j+=1)
100 {
101 j0 = A->U->index[j+0];
102 #ifndef USE_SSE2
103 LIS_QUAD_FMAD(y[i],yl[i],y[i],yl[i],x[j0],xl[j0],A->U->value[j]);
104 #else
105 LIS_QUAD_FMAD_SSE2(y[i],yl[i],y[i],yl[i],x[j0],xl[j0],A->U->value[j]);
106 #endif
107 }
108 }
109 }
110 else
111 {
112 jj0 = A->index;
113 vv0 = A->value;
114 #ifdef _OPENMP
115 #ifndef USE_SSE2
116 #pragma omp parallel for private(i,j,is,ie,j0,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
117 #else
118 #pragma omp parallel for private(i,j,is,ie,j0,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
119 #endif
120 #endif
121 for(i=0;i<n;i++)
122 {
123 y[i] = yl[i] = 0.0;
124
125 is = A->ptr[i];
126 ie = A->ptr[i+1];
127 for(j=is;j<ie-0;j+=1)
128 {
129 j0 = jj0[j+0];
130 #ifndef USE_SSE2
131 LIS_QUAD_FMAD(y[i],yl[i],y[i],yl[i],x[j0],xl[j0],vv0[j]);
132 #else
133 LIS_QUAD_FMAD_SSE2(y[i],yl[i],y[i],yl[i],x[j0],xl[j0],vv0[j]);
134 #endif
135 }
136 }
137 }
138 }
139
lis_matvec_csr_mp2(LIS_MATRIX A,LIS_VECTOR X,LIS_VECTOR Y)140 void lis_matvec_csr_mp2(LIS_MATRIX A, LIS_VECTOR X, LIS_VECTOR Y)
141 {
142 LIS_INT i,j,n;
143 LIS_INT is,ie;
144 LIS_INT j0,j1;
145 LIS_INT *jj0;
146 LIS_SCALAR *vv0;
147 LIS_SCALAR *x,*y,*xl,*yl;
148 LIS_QUAD_PD tt;
149 LIS_QUAD_DECLAR;
150
151 n = A->n;
152 x = X->value;
153 y = Y->value;
154 xl = X->value_lo;
155 yl = Y->value_lo;
156 if( A->is_splited )
157 {
158 #ifdef _OPENMP
159 #ifndef USE_SSE2
160 #pragma omp parallel for private(i,j,is,ie,j0,j1,tt,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
161 #else
162 #pragma omp parallel for private(i,j,is,ie,j0,j1,tt,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
163 #endif
164 #endif
165 for(i=0;i<n;i++)
166 {
167 #ifndef USE_SSE2
168 LIS_QUAD_MULD(y[i],yl[i],x[i],xl[i],A->D->value[i]);
169 #else
170 LIS_QUAD_MULD_SSE2(y[i],yl[i],x[i],xl[i],A->D->value[i]);
171 #endif
172
173 tt.hi[0] = tt.hi[1] = tt.lo[0] = tt.lo[1] = 0.0;
174 is = A->L->ptr[i];
175 ie = A->L->ptr[i+1];
176 for(j=is;j<ie-1;j+=2)
177 {
178 j0 = A->L->index[j+0];
179 j1 = A->L->index[j+1];
180 #ifdef USE_SSE2
181 LIS_QUAD_FMAD2_SSE2_LDSD(tt.hi[0],tt.lo[0],tt.hi[0],tt.lo[0],x[j0],xl[j0],x[j1],xl[j1],A->L->value[j]);
182 #endif
183 }
184 for(;j<ie;j++)
185 {
186 j0 = A->L->index[j+0];
187 #ifdef USE_SSE2
188 LIS_QUAD_FMAD_SSE2(y[i],yl[i],y[i],yl[i],x[j0],xl[j0],A->L->value[j]);
189 #endif
190 }
191 is = A->U->ptr[i];
192 ie = A->U->ptr[i+1];
193 for(j=is;j<ie-1;j+=2)
194 {
195 j0 = A->U->index[j+0];
196 j1 = A->U->index[j+1];
197 #ifdef USE_SSE2
198 LIS_QUAD_FMAD2_SSE2_LDSD(tt.hi[0],tt.lo[0],tt.hi[0],tt.lo[0],x[j0],xl[j0],x[j1],xl[j1],A->U->value[j]);
199 #endif
200 }
201 for(;j<ie;j++)
202 {
203 j0 = A->U->index[j+0];
204 #ifdef USE_SSE2
205 LIS_QUAD_FMAD_SSE2(y[i],yl[i],y[i],yl[i],x[j0],xl[j0],A->U->value[j]);
206 #endif
207 }
208 #ifdef USE_SSE2
209 LIS_QUAD_ADD_SSE2(y[i],yl[i],y[i],yl[i],tt.hi[0],tt.lo[0]);
210 LIS_QUAD_ADD_SSE2(y[i],yl[i],y[i],yl[i],tt.hi[1],tt.lo[1]);
211 #endif
212 }
213 }
214 else
215 {
216 jj0 = A->index;
217 vv0 = A->value;
218 #ifdef _OPENMP
219 #ifndef USE_SSE2
220 #pragma omp parallel for private(i,j,is,ie,j0,j1,tt,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
221 #else
222 #pragma omp parallel for private(i,j,is,ie,j0,j1,tt,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
223 #endif
224 #endif
225 for(i=0;i<n;i++)
226 {
227 tt.hi[0] = tt.hi[1] = tt.lo[0] = tt.lo[1] = 0.0;
228
229 is = A->ptr[i];
230 ie = A->ptr[i+1];
231 for(j=is;j<ie-1;j+=2)
232 {
233 j0 = jj0[j+0];
234 j1 = jj0[j+1];
235 #ifdef USE_SSE2
236 LIS_QUAD_FMAD2_SSE2_LDSD(tt.hi[0],tt.lo[0],tt.hi[0],tt.lo[0],x[j0],xl[j0],x[j1],xl[j1],vv0[j]);
237 #endif
238 }
239 #ifdef USE_SSE2
240 LIS_QUAD_ADD_SSE2(y[i],yl[i],tt.hi[0],tt.lo[0],tt.hi[1],tt.lo[1]);
241 #endif
242 for(;j<ie;j++)
243 {
244 j0 = jj0[j+0];
245 #ifdef USE_SSE2
246 LIS_QUAD_FMAD_SSE2(y[i],yl[i],y[i],yl[i],x[j0],xl[j0],vv0[j]);
247 #endif
248 }
249 }
250 }
251 }
252
lis_matvech_csr_mp(LIS_MATRIX A,LIS_VECTOR X,LIS_VECTOR Y)253 void lis_matvech_csr_mp(LIS_MATRIX A, LIS_VECTOR X, LIS_VECTOR Y)
254 {
255 LIS_INT i,j,js,je,jj;
256 LIS_INT n,np;
257 LIS_QUAD_PTR tt0;
258 LIS_SCALAR *x,*y,*xl,*yl;
259 #ifdef _OPENMP
260 LIS_INT k,nprocs;
261 LIS_SCALAR *ww,*wwl;
262 #endif
263 LIS_QUAD_DECLAR;
264
265 n = A->n;
266 np = A->np;
267 x = X->value;
268 y = Y->value;
269 xl = X->value_lo;
270 yl = Y->value_lo;
271 tt0.hi = &X->work[0];
272 tt0.lo = &X->work[1];
273 if( A->is_splited )
274 {
275 #ifdef _OPENMP
276 nprocs = omp_get_max_threads();
277 ww = (LIS_SCALAR *)lis_malloc( 2*nprocs*np*sizeof(LIS_SCALAR),"lis_matvech_csr_mp::ww" );
278 wwl = &ww[nprocs*np];
279 #ifndef USE_SSE2
280 #pragma omp parallel private(i,j,js,je,jj,k,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
281 #else
282 #pragma omp parallel private(i,j,js,je,jj,k,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
283 #endif
284 {
285 k = omp_get_thread_num();
286 #pragma omp for
287 for(j=0;j<nprocs;j++)
288 {
289 memset( &ww[j*np], 0, np*sizeof(LIS_SCALAR) );
290 memset( &wwl[j*np], 0, np*sizeof(LIS_SCALAR) );
291 }
292 #pragma omp for
293 for(i=0; i<n; i++)
294 {
295 js = A->L->ptr[i];
296 je = A->L->ptr[i+1];
297 for(j=js;j<je;j++)
298 {
299 jj = k*np+A->L->index[j];
300 #ifndef USE_SSE2
301 LIS_QUAD_FMAD(ww[jj],wwl[jj],ww[jj],wwl[jj],x[i],xl[i],A->L->value[j]);
302 #else
303 LIS_QUAD_FMAD_SSE2(ww[jj],wwl[jj],ww[jj],wwl[jj],x[i],xl[i],A->L->value[j]);
304 #endif
305 }
306 js = A->U->ptr[i];
307 je = A->U->ptr[i+1];
308 for(j=js;j<je;j++)
309 {
310 jj = k*np+A->U->index[j];
311 #ifndef USE_SSE2
312 LIS_QUAD_FMAD(ww[jj],wwl[jj],ww[jj],wwl[jj],x[i],xl[i],A->U->value[j]);
313 #else
314 LIS_QUAD_FMAD_SSE2(ww[jj],wwl[jj],ww[jj],wwl[jj],x[i],xl[i],A->U->value[j]);
315 #endif
316 }
317 }
318 #pragma omp for
319 for(i=0;i<np;i++)
320 {
321 #ifndef USE_SSE2
322 LIS_QUAD_MULD(y[i],yl[i],x[i],xl[i],A->D->value[i]);
323 #else
324 LIS_QUAD_MULD_SSE2(y[i],yl[i],x[i],xl[i],A->D->value[i]);
325 #endif
326 for(j=0;j<nprocs;j++)
327 {
328 #ifndef USE_SSE2
329 LIS_QUAD_ADD(y[i],yl[i],y[i],yl[i],ww[j*np+i],wwl[j*np+i]);
330 #else
331 LIS_QUAD_ADD_SSE2(y[i],yl[i],y[i],yl[i],ww[j*np+i],wwl[j*np+i]);
332 #endif
333 }
334 }
335 }
336 lis_free(ww);
337 #else
338 for(i=0; i<np; i++)
339 {
340 #ifndef USE_SSE2
341 LIS_QUAD_MULD(y[i],yl[i],x[i],xl[i],A->D->value[i]);
342 #else
343 LIS_QUAD_MULD_SSE2(y[i],yl[i],x[i],xl[i],A->D->value[i]);
344 #endif
345 }
346 for(i=0; i<n; i++)
347 {
348 js = A->L->ptr[i];
349 je = A->L->ptr[i+1];
350 for(j=js;j<je;j++)
351 {
352 jj = A->L->index[j];
353 #ifndef USE_SSE2
354 LIS_QUAD_FMAD(y[jj],yl[jj],y[jj],yl[jj],x[i],xl[i],A->L->value[j]);
355 #else
356 LIS_QUAD_FMAD_SSE2(y[jj],yl[jj],y[jj],yl[jj],x[i],xl[i],A->L->value[j]);
357 #endif
358 }
359 js = A->U->ptr[i];
360 je = A->U->ptr[i+1];
361 for(j=js;j<je;j++)
362 {
363 jj = A->U->index[j];
364 #ifndef USE_SSE2
365 LIS_QUAD_FMAD(y[jj],yl[jj],y[jj],yl[jj],x[i],xl[i],A->U->value[j]);
366 #else
367 LIS_QUAD_FMAD_SSE2(y[jj],yl[jj],y[jj],yl[jj],x[i],xl[i],A->U->value[j]);
368 #endif
369 }
370 }
371 #endif
372 }
373 else
374 {
375 #ifdef _OPENMP
376 nprocs = omp_get_max_threads();
377 ww = (LIS_SCALAR *)lis_malloc( 2*nprocs*np*sizeof(LIS_SCALAR),"lis_matvech_csr_mp::ww" );
378 wwl = &ww[nprocs*np];
379 #ifndef USE_SSE2
380 #pragma omp parallel private(i,j,js,je,jj,k,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
381 #else
382 #pragma omp parallel private(i,j,js,je,jj,k,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
383 #endif
384 {
385 k = omp_get_thread_num();
386 #pragma omp for
387 for(j=0;j<nprocs;j++)
388 {
389 memset( &ww[j*np], 0, np*sizeof(LIS_SCALAR) );
390 memset( &wwl[j*np], 0, np*sizeof(LIS_SCALAR) );
391 }
392 #pragma omp for
393 for(i=0; i<n; i++)
394 {
395 js = A->ptr[i];
396 je = A->ptr[i+1];
397 for(j=js;j<je;j++)
398 {
399 jj = k*np+A->index[j];
400 #ifndef USE_SSE2
401 LIS_QUAD_FMAD(ww[jj],wwl[jj],ww[jj],wwl[jj],x[i],xl[i],A->value[j]);
402 #else
403 LIS_QUAD_FMAD_SSE2(ww[jj],wwl[jj],ww[jj],wwl[jj],x[i],xl[i],A->value[j]);
404 #endif
405 }
406 }
407 #pragma omp for
408 for(i=0;i<np;i++)
409 {
410 y[i] = yl[i] = 0.0;
411 for(j=0;j<nprocs;j++)
412 {
413 #ifndef USE_SSE2
414 LIS_QUAD_ADD(y[i],yl[i],y[i],yl[i],ww[j*np+i],wwl[j*np+i]);
415 #else
416 LIS_QUAD_ADD_SSE2(y[i],yl[i],y[i],yl[i],ww[j*np+i],wwl[j*np+i]);
417 #endif
418 }
419 }
420 }
421 lis_free(ww);
422 #else
423 for(i=0; i<np; i++)
424 {
425 y[i] = 0.0;
426 yl[i] = 0.0;
427 }
428 for(i=0; i<n; i++)
429 {
430 js = A->ptr[i];
431 je = A->ptr[i+1];
432 tt0.hi[0] = x[i];
433 tt0.lo[0] = xl[i];
434 for(j=js;j<je;j++)
435 {
436 jj = A->index[j];
437 #ifndef USE_SSE2
438 LIS_QUAD_FMAD(y[jj],yl[jj],y[jj],yl[jj],tt0.hi[0],tt0.lo[0],A->value[j]);
439 #else
440 LIS_QUAD_FMAD_SSE2(y[jj],yl[jj],y[jj],yl[jj],tt0.hi[0],tt0.lo[0],A->value[j]);
441 #endif
442 }
443 }
444 #endif
445 }
446 }
447
lis_matvech_csr_mp2(LIS_MATRIX A,LIS_VECTOR X,LIS_VECTOR Y)448 void lis_matvech_csr_mp2(LIS_MATRIX A, LIS_VECTOR X, LIS_VECTOR Y)
449 {
450 LIS_INT i,j,js,je,j0,j1;
451 LIS_INT n,np;
452 LIS_QUAD_PTR tt0;
453 LIS_SCALAR *x,*y,*xl,*yl;
454 #ifdef _OPENMP
455 LIS_INT k,nprocs;
456 LIS_SCALAR *ww,*wwl;
457 #endif
458 LIS_QUAD_DECLAR;
459
460 n = A->n;
461 np = A->np;
462 x = X->value;
463 y = Y->value;
464 xl = X->value_lo;
465 yl = Y->value_lo;
466 tt0.hi = &X->work[0];
467 tt0.lo = &X->work[2];
468 if( A->is_splited )
469 {
470 #ifdef _OPENMP
471 nprocs = omp_get_max_threads();
472 ww = (LIS_SCALAR *)lis_malloc( 2*nprocs*np*sizeof(LIS_SCALAR), "lis_matvech_csr_mp2::ww" );
473 wwl = &ww[nprocs*np];
474 #ifndef USE_SSE2
475 #pragma omp parallel private(i,j,js,je,j0,j1,k,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
476 #else
477 #pragma omp parallel private(i,j,js,je,j0,j1,k,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
478 #endif
479 {
480 k = omp_get_thread_num();
481 #pragma omp for
482 for(i=0; i<np; i++)
483 {
484 #ifndef USE_SSE2
485 LIS_QUAD_MULD(ww[i],wwl[i],x[i],xl[i],A->D->value[i]);
486 #else
487 LIS_QUAD_MULD_SSE2(ww[i],wwl[i],x[i],xl[i],A->D->value[i]);
488 #endif
489 }
490 #pragma omp for
491 for(i=0; i<n; i++)
492 {
493 js = A->L->ptr[i];
494 je = A->L->ptr[i+1];
495 for(j=js;j<je-1;j+=2)
496 {
497 j0 = k*np + A->L->index[j];
498 j1 = k*np + A->L->index[j+1];
499 #ifdef USE_SSE2
500 LIS_QUAD_FMAD2_SSE2_STSD(ww[j0],wwl[j0],ww[j1],wwl[j1],ww[j0],wwl[j0],ww[j1],wwl[j1],x[i],xl[i],x[i],xl[i],A->L->value[j]);
501 #endif
502 }
503 for(;j<je;j++)
504 {
505 j0 = A->L->index[j];
506 #ifdef USE_SSE2
507 LIS_QUAD_FMAD_SSE2(ww[j0],wwl[j0],ww[j0],wwl[j0],x[i],xl[i],A->L->value[j]);
508 #endif
509 }
510 js = A->U->ptr[i];
511 je = A->U->ptr[i+1];
512 for(j=js;j<je-1;j+=2)
513 {
514 j0 = k*np + A->U->index[j];
515 j1 = k*np + A->U->index[j+1];
516 #ifdef USE_SSE2
517 LIS_QUAD_FMAD2_SSE2_STSD(ww[j0],wwl[j0],ww[j1],wwl[j1],ww[j0],wwl[j0],ww[j1],wwl[j1],x[i],xl[i],x[i],xl[i],A->U->value[j]);
518 #endif
519 }
520 for(;j<je;j++)
521 {
522 j0 = A->U->index[j];
523 #ifdef USE_SSE2
524 LIS_QUAD_FMAD_SSE2(ww[j0],wwl[j0],ww[j0],wwl[j0],x[i],xl[i],A->U->value[j]);
525 #endif
526 }
527 }
528 #pragma omp for
529 for(i=0;i<np;i++)
530 {
531 y[i] = yl[i] = 0.0;
532 for(j=0;j<nprocs;j++)
533 {
534 #ifdef USE_SSE2
535 LIS_QUAD_ADD_SSE2(y[i],yl[i],y[i],yl[i],ww[j*np+i],wwl[j*np+i]);
536 #endif
537 }
538 }
539 }
540 lis_free(ww);
541 #else
542 for(i=0; i<np; i++)
543 {
544 #ifndef USE_SSE2
545 LIS_QUAD_MULD(y[i],yl[i],x[i],xl[i],A->D->value[i]);
546 #else
547 LIS_QUAD_MULD_SSE2(y[i],yl[i],x[i],xl[i],A->D->value[i]);
548 #endif
549 }
550 for(i=0; i<n; i++)
551 {
552 js = A->L->ptr[i];
553 je = A->L->ptr[i+1];
554 for(j=js;j<je-1;j+=2)
555 {
556 j0 = A->L->index[j];
557 j1 = A->L->index[j+1];
558 #ifdef USE_SSE2
559 LIS_QUAD_FMAD2_SSE2_STSD(y[j0],yl[j0],y[j1],yl[j1],y[j0],yl[j0],y[j1],yl[j1],x[i],xl[i],x[i],xl[i],A->L->value[j]);
560 #endif
561 }
562 for(;j<je;j++)
563 {
564 j0 = A->L->index[j];
565 #ifdef USE_SSE2
566 LIS_QUAD_FMAD_SSE2(y[j0],yl[j0],y[j0],yl[j0],tt0.hi[0],tt0.lo[0],A->L->value[j]);
567 #endif
568 }
569 js = A->U->ptr[i];
570 je = A->U->ptr[i+1];
571 for(j=js;j<je-1;j+=2)
572 {
573 j0 = A->U->index[j];
574 j1 = A->U->index[j+1];
575 #ifdef USE_SSE2
576 LIS_QUAD_FMAD2_SSE2_STSD(y[j0],yl[j0],y[j1],yl[j1],y[j0],yl[j0],y[j1],yl[j1],x[i],xl[i],x[i],xl[i],A->U->value[j]);
577 #endif
578 }
579 for(;j<je;j++)
580 {
581 j0 = A->U->index[j];
582 #ifdef USE_SSE2
583 LIS_QUAD_FMAD_SSE2(y[j0],yl[j0],y[j0],yl[j0],tt0.hi[0],tt0.lo[0],A->U->value[j]);
584 #endif
585 }
586 }
587 #endif
588 }
589 else
590 {
591 #ifdef _OPENMP
592 nprocs = omp_get_max_threads();
593 ww = (LIS_SCALAR *)lis_malloc( 2*nprocs*np*sizeof(LIS_SCALAR), "lis_matvech_csr_mp2::ww" );
594 wwl = &ww[nprocs*np];
595 #ifndef USE_SSE2
596 #pragma omp parallel private(i,j,js,je,j0,j1,k,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
597 #else
598 #pragma omp parallel private(i,j,js,je,j0,j1,k,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
599 #endif
600 {
601 k = omp_get_thread_num();
602 #pragma omp for
603 for(j=0;j<nprocs;j++)
604 {
605 memset( &ww[j*np], 0, np*sizeof(LIS_SCALAR) );
606 memset( &wwl[j*np], 0, np*sizeof(LIS_SCALAR) );
607 }
608 #pragma omp for
609 for(i=0; i<n; i++)
610 {
611 js = A->ptr[i];
612 je = A->ptr[i+1];
613 for(j=js;j<je-1;j+=2)
614 {
615 j0 = k*np + A->index[j];
616 j1 = k*np + A->index[j+1];
617 #ifdef USE_SSE2
618 LIS_QUAD_FMAD2_SSE2_STSD(ww[j0],wwl[j0],ww[j1],wwl[j1],ww[j0],wwl[j0],ww[j1],wwl[j1],x[i],xl[i],x[i],xl[i],A->value[j]);
619 #endif
620 }
621 for(;j<je;j++)
622 {
623 j0 = A->index[j];
624 #ifdef USE_SSE2
625 LIS_QUAD_FMAD_SSE2(ww[j0],wwl[j0],ww[j0],wwl[j0],x[i],xl[i],A->value[j]);
626 #endif
627 }
628 }
629 #pragma omp for
630 for(i=0;i<np;i++)
631 {
632 y[i] = yl[i] = 0.0;
633 for(j=0;j<nprocs;j++)
634 {
635 #ifdef USE_SSE2
636 LIS_QUAD_ADD_SSE2(y[i],yl[i],y[i],yl[i],ww[j*np+i],wwl[j*np+i]);
637 #endif
638 }
639 }
640 }
641 lis_free(ww);
642 #else
643 for(i=0; i<np; i++)
644 {
645 y[i] = 0.0;
646 yl[i] = 0.0;
647 }
648 for(i=0; i<n; i++)
649 {
650 js = A->ptr[i];
651 je = A->ptr[i+1];
652 for(j=js;j<je-1;j+=2)
653 {
654 j0 = A->index[j];
655 j1 = A->index[j+1];
656 #ifdef USE_SSE2
657 LIS_QUAD_FMAD2_SSE2_STSD(y[j0],yl[j0],y[j1],yl[j1],y[j0],yl[j0],y[j1],yl[j1],x[i],xl[i],x[i],xl[i],A->value[j]);
658 #endif
659 }
660 for(;j<je;j++)
661 {
662 j0 = A->index[j];
663 #ifdef USE_SSE2
664 LIS_QUAD_FMAD_SSE2(y[j0],yl[j0],y[j0],yl[j0],x[i],xl[i],A->value[j]);
665 #endif
666 }
667 }
668 #endif
669 }
670 }
671 #endif
672
673 #ifdef USE_QUAD_PRECISION
lis_matvec_csc_mp(LIS_MATRIX A,LIS_VECTOR X,LIS_VECTOR Y)674 void lis_matvec_csc_mp(LIS_MATRIX A, LIS_VECTOR X, LIS_VECTOR Y)
675 {
676 LIS_INT i,j,js,je,jj;
677 LIS_INT n,np;
678 LIS_QUAD_PTR tt0;
679 LIS_SCALAR *x,*y,*xl,*yl;
680 #ifdef _OPENMP
681 LIS_INT k,nprocs;
682 LIS_SCALAR *ww,*wwl;
683 #endif
684 LIS_QUAD_DECLAR;
685
686 n = A->n;
687 np = A->np;
688 x = X->value;
689 y = Y->value;
690 xl = X->value_lo;
691 yl = Y->value_lo;
692 tt0.hi = &X->work[0];
693 tt0.lo = &X->work[1];
694 if( A->is_splited )
695 {
696 #ifdef _OPENMP
697 nprocs = omp_get_max_threads();
698 ww = (LIS_SCALAR *)lis_malloc( 2*nprocs*np*sizeof(LIS_SCALAR),"lis_matvec_csr_mp::ww" );
699 wwl = &ww[nprocs*np];
700 #ifndef USE_SSE2
701 #pragma omp parallel private(i,j,js,je,jj,k,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
702 #else
703 #pragma omp parallel private(i,j,js,je,jj,k,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
704 #endif
705 {
706 k = omp_get_thread_num();
707 #pragma omp for
708 for(j=0;j<nprocs;j++)
709 {
710 memset( &ww[j*np], 0, np*sizeof(LIS_SCALAR) );
711 memset( &wwl[j*np], 0, np*sizeof(LIS_SCALAR) );
712 }
713 #pragma omp for
714 for(i=0; i<np; i++)
715 {
716 js = A->L->ptr[i];
717 je = A->L->ptr[i+1];
718 for(j=js;j<je;j++)
719 {
720 jj = k*np+A->L->index[j];
721 #ifndef USE_SSE2
722 LIS_QUAD_FMAD(ww[jj],wwl[jj],ww[jj],wwl[jj],x[i],xl[i],A->L->value[j]);
723 #else
724 LIS_QUAD_FMAD_SSE2(ww[jj],wwl[jj],ww[jj],wwl[jj],x[i],xl[i],A->L->value[j]);
725 #endif
726 }
727 js = A->U->ptr[i];
728 je = A->U->ptr[i+1];
729 for(j=js;j<je;j++)
730 {
731 jj = k*np+A->U->index[j];
732 #ifndef USE_SSE2
733 LIS_QUAD_FMAD(ww[jj],wwl[jj],ww[jj],wwl[jj],x[i],xl[i],A->U->value[j]);
734 #else
735 LIS_QUAD_FMAD_SSE2(ww[jj],wwl[jj],ww[jj],wwl[jj],x[i],xl[i],A->U->value[j]);
736 #endif
737 }
738 }
739 #pragma omp for
740 for(i=0;i<n;i++)
741 {
742 #ifndef USE_SSE2
743 LIS_QUAD_MULD(y[i],yl[i],x[i],xl[i],A->D->value[i]);
744 #else
745 LIS_QUAD_MULD_SSE2(y[i],yl[i],x[i],xl[i],A->D->value[i]);
746 #endif
747 for(j=0;j<nprocs;j++)
748 {
749 #ifndef USE_SSE2
750 LIS_QUAD_ADD(y[i],yl[i],y[i],yl[i],ww[j*np+i],wwl[j*np+i]);
751 #else
752 LIS_QUAD_ADD_SSE2(y[i],yl[i],y[i],yl[i],ww[j*np+i],wwl[j*np+i]);
753 #endif
754 }
755 }
756 }
757 lis_free(ww);
758 #else
759 for(i=0; i<n; i++)
760 {
761 #ifndef USE_SSE2
762 LIS_QUAD_MULD(y[i],yl[i],x[i],xl[i],A->D->value[i]);
763 #else
764 LIS_QUAD_MULD_SSE2(y[i],yl[i],x[i],xl[i],A->D->value[i]);
765 #endif
766 }
767 for(i=0; i<np; i++)
768 {
769 js = A->L->ptr[i];
770 je = A->L->ptr[i+1];
771 for(j=js;j<je;j++)
772 {
773 jj = A->L->index[j];
774 #ifndef USE_SSE2
775 LIS_QUAD_FMAD(y[jj],yl[jj],y[jj],yl[jj],x[i],xl[i],A->L->value[j]);
776 #else
777 LIS_QUAD_FMAD_SSE2(y[jj],yl[jj],y[jj],yl[jj],x[i],xl[i],A->L->value[j]);
778 #endif
779 }
780 js = A->U->ptr[i];
781 je = A->U->ptr[i+1];
782 for(j=js;j<je;j++)
783 {
784 jj = A->U->index[j];
785 #ifndef USE_SSE2
786 LIS_QUAD_FMAD(y[jj],yl[jj],y[jj],yl[jj],x[i],xl[i],A->U->value[j]);
787 #else
788 LIS_QUAD_FMAD_SSE2(y[jj],yl[jj],y[jj],yl[jj],x[i],xl[i],A->U->value[j]);
789 #endif
790 }
791 }
792 #endif
793 }
794 else
795 {
796 #ifdef _OPENMP
797 nprocs = omp_get_max_threads();
798 ww = (LIS_SCALAR *)lis_malloc( 2*nprocs*np*sizeof(LIS_SCALAR),"lis_matvec_csr_mp::ww" );
799 wwl = &ww[nprocs*np];
800 #ifndef USE_SSE2
801 #pragma omp parallel private(i,j,js,je,jj,k,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
802 #else
803 #pragma omp parallel private(i,j,js,je,jj,k,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
804 #endif
805 {
806 k = omp_get_thread_num();
807 #pragma omp for
808 for(j=0;j<nprocs;j++)
809 {
810 memset( &ww[j*np], 0, np*sizeof(LIS_SCALAR) );
811 memset( &wwl[j*np], 0, np*sizeof(LIS_SCALAR) );
812 }
813 #pragma omp for
814 for(i=0; i<np; i++)
815 {
816 js = A->ptr[i];
817 je = A->ptr[i+1];
818 for(j=js;j<je;j++)
819 {
820 jj = k*np+A->L->index[j];
821 #ifndef USE_SSE2
822 LIS_QUAD_FMAD(ww[jj],wwl[jj],ww[jj],wwl[jj],x[i],xl[i],A->L->value[j]);
823 #else
824 LIS_QUAD_FMAD_SSE2(ww[jj],wwl[jj],ww[jj],wwl[jj],x[i],xl[i],A->L->value[j]);
825 #endif
826 }
827 }
828 #pragma omp for
829 for(i=0;i<np;i++)
830 {
831 y[i] = yl[i] = 0.0;
832 for(j=0;j<nprocs;j++)
833 {
834 #ifndef USE_SSE2
835 LIS_QUAD_ADD(y[i],yl[i],y[i],yl[i],ww[j*np+i],wwl[j*np+i]);
836 #else
837 LIS_QUAD_ADD_SSE2(y[i],yl[i],y[i],yl[i],ww[j*np+i],wwl[j*np+i]);
838 #endif
839 }
840 }
841 }
842 lis_free(ww);
843 #else
844 for(i=0; i<n; i++)
845 {
846 y[i] = 0.0;
847 yl[i] = 0.0;
848 }
849 for(i=0; i<np; i++)
850 {
851 js = A->ptr[i];
852 je = A->ptr[i+1];
853 tt0.hi[0] = x[i];
854 tt0.lo[0] = xl[i];
855 for(j=js;j<je;j++)
856 {
857 jj = A->index[j];
858 #ifndef USE_SSE2
859 LIS_QUAD_FMAD(y[jj],yl[jj],y[jj],yl[jj],tt0.hi[0],tt0.lo[0],A->value[j]);
860 #else
861 LIS_QUAD_FMAD_SSE2(y[jj],yl[jj],y[jj],yl[jj],tt0.hi[0],tt0.lo[0],A->value[j]);
862 #endif
863 }
864 }
865 #endif
866 }
867 }
868
lis_matvec_csc_mp2(LIS_MATRIX A,LIS_VECTOR X,LIS_VECTOR Y)869 void lis_matvec_csc_mp2(LIS_MATRIX A, LIS_VECTOR X, LIS_VECTOR Y)
870 {
871 LIS_INT i,j,js,je,j0,j1;
872 LIS_INT n,np;
873 LIS_QUAD_PTR tt0;
874 LIS_SCALAR *x,*y,*xl,*yl;
875 #ifdef _OPENMP
876 LIS_INT k,nprocs;
877 LIS_SCALAR *ww,*wwl;
878 #endif
879 LIS_QUAD_DECLAR;
880
881 n = A->n;
882 np = A->np;
883 x = X->value;
884 y = Y->value;
885 xl = X->value_lo;
886 yl = Y->value_lo;
887 tt0.hi = &X->work[0];
888 tt0.lo = &X->work[2];
889 if( A->is_splited )
890 {
891 #ifdef _OPENMP
892 nprocs = omp_get_max_threads();
893 ww = (LIS_SCALAR *)lis_malloc( 2*nprocs*np*sizeof(LIS_SCALAR), "lis_matvec_csr_mp2::ww" );
894 wwl = &ww[nprocs*np];
895 #ifndef USE_SSE2
896 #pragma omp parallel private(i,j,js,je,j0,j1,k,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
897 #else
898 #pragma omp parallel private(i,j,js,je,j0,j1,k,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
899 #endif
900 {
901 k = omp_get_thread_num();
902 #pragma omp for
903 for(j=0;j<nprocs;j++)
904 {
905 memset( &ww[j*np], 0, np*sizeof(LIS_SCALAR) );
906 memset( &wwl[j*np], 0, np*sizeof(LIS_SCALAR) );
907 }
908 #pragma omp for
909 for(i=0; i<np; i++)
910 {
911 js = A->ptr[i];
912 je = A->ptr[i+1];
913 for(j=js;j<je-1;j+=2)
914 {
915 j0 = k*np + A->index[j];
916 j1 = k*np + A->index[j+1];
917 #ifdef USE_SSE2
918 LIS_QUAD_FMAD2_SSE2_STSD(ww[j0],wwl[j0],ww[j1],wwl[j1],ww[j0],wwl[j0],ww[j1],wwl[j1],x[i],xl[i],x[i],xl[i],A->value[j]);
919 #endif
920 }
921 for(;j<je;j++)
922 {
923 j0 = A->index[j];
924 #ifdef USE_SSE2
925 LIS_QUAD_FMAD_SSE2(ww[j0],wwl[j0],ww[j0],wwl[j0],x[i],xl[i],A->value[j]);
926 #endif
927 }
928 }
929 #pragma omp for
930 for(i=0;i<n;i++)
931 {
932 y[i] = yl[i] = 0.0;
933 for(j=0;j<nprocs;j++)
934 {
935 #ifdef USE_SSE2
936 LIS_QUAD_ADD_SSE2(y[i],yl[i],y[i],yl[i],ww[j*np+i],wwl[j*np+i]);
937 #endif
938 }
939 }
940 }
941 lis_free(ww);
942 #else
943 for(i=0; i<n; i++)
944 {
945 #ifndef USE_SSE2
946 LIS_QUAD_MULD(y[i],yl[i],x[i],xl[i],A->D->value[i]);
947 #else
948 LIS_QUAD_MULD_SSE2(y[i],yl[i],x[i],xl[i],A->D->value[i]);
949 #endif
950 }
951 for(i=0; i<np; i++)
952 {
953 js = A->L->ptr[i];
954 je = A->L->ptr[i+1];
955 for(j=js;j<je-1;j+=2)
956 {
957 j0 = A->L->index[j];
958 j1 = A->L->index[j+1];
959 #ifdef USE_SSE2
960 LIS_QUAD_FMAD2_SSE2_STSD(y[j0],yl[j0],y[j1],yl[j1],y[j0],yl[j0],y[j1],yl[j1],x[i],xl[i],x[i],xl[i],A->L->value[j]);
961 #endif
962 }
963 for(;j<je;j++)
964 {
965 j0 = A->L->index[j];
966 #ifdef USE_SSE2
967 LIS_QUAD_FMAD_SSE2(y[j0],yl[j0],y[j0],yl[j0],tt0.hi[0],tt0.lo[0],A->L->value[j]);
968 #endif
969 }
970 js = A->U->ptr[i];
971 je = A->U->ptr[i+1];
972 for(j=js;j<je-1;j+=2)
973 {
974 j0 = A->U->index[j];
975 j1 = A->U->index[j+1];
976 #ifdef USE_SSE2
977 LIS_QUAD_FMAD2_SSE2_STSD(y[j0],yl[j0],y[j1],yl[j1],y[j0],yl[j0],y[j1],yl[j1],x[i],xl[i],x[i],xl[i],A->U->value[j]);
978 #endif
979 }
980 for(;j<je;j++)
981 {
982 j0 = A->U->index[j];
983 #ifdef USE_SSE2
984 LIS_QUAD_FMAD_SSE2(y[j0],yl[j0],y[j0],yl[j0],tt0.hi[0],tt0.lo[0],A->U->value[j]);
985 #endif
986 }
987 }
988 #endif
989 }
990 else
991 {
992 #ifdef _OPENMP
993 nprocs = omp_get_max_threads();
994 ww = (LIS_SCALAR *)lis_malloc( 2*nprocs*np*sizeof(LIS_SCALAR), "lis_matvec_csr_mp2::ww" );
995 wwl = &ww[nprocs*np];
996 #ifndef USE_SSE2
997 #pragma omp parallel private(i,j,js,je,j0,j1,k,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
998 #else
999 #pragma omp parallel private(i,j,js,je,j0,j1,k,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
1000 #endif
1001 {
1002 k = omp_get_thread_num();
1003 #pragma omp for
1004 for(j=0;j<nprocs;j++)
1005 {
1006 memset( &ww[j*np], 0, np*sizeof(LIS_SCALAR) );
1007 memset( &wwl[j*np], 0, np*sizeof(LIS_SCALAR) );
1008 }
1009 #pragma omp for
1010 for(i=0; i<np; i++)
1011 {
1012 js = A->ptr[i];
1013 je = A->ptr[i+1];
1014 for(j=js;j<je-1;j+=2)
1015 {
1016 j0 = k*np + A->index[j];
1017 j1 = k*np + A->index[j+1];
1018 #ifdef USE_SSE2
1019 LIS_QUAD_FMAD2_SSE2_STSD(ww[j0],wwl[j0],ww[j1],wwl[j1],ww[j0],wwl[j0],ww[j1],wwl[j1],x[i],xl[i],x[i],xl[i],A->value[j]);
1020 #endif
1021 }
1022 for(;j<je;j++)
1023 {
1024 j0 = A->index[j];
1025 #ifdef USE_SSE2
1026 LIS_QUAD_FMAD_SSE2(ww[j0],wwl[j0],ww[j0],wwl[j0],x[i],xl[i],A->value[j]);
1027 #endif
1028 }
1029 }
1030 #pragma omp for
1031 for(i=0;i<n;i++)
1032 {
1033 y[i] = yl[i] = 0.0;
1034 for(j=0;j<nprocs;j++)
1035 {
1036 #ifdef USE_SSE2
1037 LIS_QUAD_ADD_SSE2(y[i],yl[i],y[i],yl[i],ww[j*np+i],wwl[j*np+i]);
1038 #endif
1039 }
1040 }
1041 }
1042 lis_free(ww);
1043 #else
1044 for(i=0; i<n; i++)
1045 {
1046 y[i] = 0.0;
1047 yl[i] = 0.0;
1048 }
1049 for(i=0; i<np; i++)
1050 {
1051 js = A->ptr[i];
1052 je = A->ptr[i+1];
1053 for(j=js;j<je-1;j+=2)
1054 {
1055 j0 = A->index[j];
1056 j1 = A->index[j+1];
1057 #ifdef USE_SSE2
1058 LIS_QUAD_FMAD2_SSE2_STSD(y[j0],yl[j0],y[j1],yl[j1],y[j0],yl[j0],y[j1],yl[j1],x[i],xl[i],x[i],xl[i],A->value[j]);
1059 #endif
1060 }
1061 for(;j<je;j++)
1062 {
1063 j0 = A->index[j];
1064 #ifdef USE_SSE2
1065 LIS_QUAD_FMAD_SSE2(y[j0],yl[j0],y[j0],yl[j0],x[i],xl[i],A->value[j]);
1066 #endif
1067 }
1068 }
1069 #endif
1070 }
1071 }
1072
lis_matvech_csc_mp(LIS_MATRIX A,LIS_VECTOR X,LIS_VECTOR Y)1073 void lis_matvech_csc_mp(LIS_MATRIX A, LIS_VECTOR X, LIS_VECTOR Y)
1074 {
1075 LIS_INT i,j,np;
1076 LIS_INT is,ie,j0;
1077 LIS_INT *jj0;
1078 LIS_SCALAR *vv0;
1079 LIS_SCALAR *x,*y,*xl,*yl;
1080 LIS_QUAD_DECLAR;
1081
1082
1083 np = A->np;
1084 x = X->value;
1085 y = Y->value;
1086 xl = X->value_lo;
1087 yl = Y->value_lo;
1088 if( A->is_splited )
1089 {
1090 #ifdef _OPENMP
1091 #ifndef USE_SSE2
1092 #pragma omp parallel for private(i,j,is,ie,j0,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
1093 #else
1094 #pragma omp parallel for private(i,j,is,ie,j0,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
1095 #endif
1096 #endif
1097 for(i=0;i<np;i++)
1098 {
1099 #ifndef USE_SSE2
1100 LIS_QUAD_MULD(y[i],yl[i],x[i],xl[i],A->D->value[i]);
1101 #else
1102 LIS_QUAD_MULD_SSE2(y[i],yl[i],x[i],xl[i],A->D->value[i]);
1103 #endif
1104 is = A->L->ptr[i];
1105 ie = A->L->ptr[i+1];
1106 for(j=is;j<ie-0;j+=1)
1107 {
1108 j0 = A->L->index[j+0];
1109 #ifndef USE_SSE2
1110 LIS_QUAD_FMAD(y[i],yl[i],y[i],yl[i],x[j0],xl[j0],A->L->value[j]);
1111 #else
1112 LIS_QUAD_FMAD_SSE2(y[i],yl[i],y[i],yl[i],x[j0],xl[j0],A->L->value[j]);
1113 #endif
1114 }
1115 is = A->U->ptr[i];
1116 ie = A->U->ptr[i+1];
1117 for(j=is;j<ie-0;j+=1)
1118 {
1119 j0 = A->U->index[j+0];
1120 #ifndef USE_SSE2
1121 LIS_QUAD_FMAD(y[i],yl[i],y[i],yl[i],x[j0],xl[j0],A->U->value[j]);
1122 #else
1123 LIS_QUAD_FMAD_SSE2(y[i],yl[i],y[i],yl[i],x[j0],xl[j0],A->U->value[j]);
1124 #endif
1125 }
1126 }
1127 }
1128 else
1129 {
1130 jj0 = A->index;
1131 vv0 = A->value;
1132 #ifdef _OPENMP
1133 #ifndef USE_SSE2
1134 #pragma omp parallel for private(i,j,is,ie,j0,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
1135 #else
1136 #pragma omp parallel for private(i,j,is,ie,j0,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
1137 #endif
1138 #endif
1139 for(i=0;i<np;i++)
1140 {
1141 y[i] = yl[i] = 0.0;
1142
1143 is = A->ptr[i];
1144 ie = A->ptr[i+1];
1145 for(j=is;j<ie-0;j+=1)
1146 {
1147 j0 = jj0[j+0];
1148 #ifndef USE_SSE2
1149 LIS_QUAD_FMAD(y[i],yl[i],y[i],yl[i],x[j0],xl[j0],vv0[j]);
1150 #else
1151 LIS_QUAD_FMAD_SSE2(y[i],yl[i],y[i],yl[i],x[j0],xl[j0],vv0[j]);
1152 #endif
1153 }
1154 }
1155 }
1156 }
1157
lis_matvech_csc_mp2(LIS_MATRIX A,LIS_VECTOR X,LIS_VECTOR Y)1158 void lis_matvech_csc_mp2(LIS_MATRIX A, LIS_VECTOR X, LIS_VECTOR Y)
1159 {
1160 LIS_INT i,j,np;
1161 LIS_INT is,ie;
1162 LIS_INT j0,j1;
1163 LIS_INT *jj0;
1164 LIS_SCALAR *vv0;
1165 LIS_SCALAR *x,*y,*xl,*yl;
1166 LIS_QUAD_PD tt;
1167 LIS_QUAD_DECLAR;
1168
1169 np = A->np;
1170 x = X->value;
1171 y = Y->value;
1172 xl = X->value_lo;
1173 yl = Y->value_lo;
1174 if( A->is_splited )
1175 {
1176 #ifdef _OPENMP
1177 #ifndef USE_SSE2
1178 #pragma omp parallel for private(i,j,is,ie,j0,j1,tt,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
1179 #else
1180 #pragma omp parallel for private(i,j,is,ie,j0,j1,tt,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
1181 #endif
1182 #endif
1183 for(i=0;i<np;i++)
1184 {
1185 #ifndef USE_SSE2
1186 LIS_QUAD_MULD(y[i],yl[i],x[i],xl[i],A->D->value[i]);
1187 #else
1188 LIS_QUAD_MULD_SSE2(y[i],yl[i],x[i],xl[i],A->D->value[i]);
1189 #endif
1190
1191 tt.hi[0] = tt.hi[1] = tt.lo[0] = tt.lo[1] = 0.0;
1192 is = A->L->ptr[i];
1193 ie = A->L->ptr[i+1];
1194 for(j=is;j<ie-1;j+=2)
1195 {
1196 j0 = A->L->index[j+0];
1197 j1 = A->L->index[j+1];
1198 #ifdef USE_SSE2
1199 LIS_QUAD_FMAD2_SSE2_LDSD(tt.hi[0],tt.lo[0],tt.hi[0],tt.lo[0],x[j0],xl[j0],x[j1],xl[j1],A->L->value[j]);
1200 #endif
1201 }
1202 for(;j<ie;j++)
1203 {
1204 j0 = A->L->index[j+0];
1205 #ifdef USE_SSE2
1206 LIS_QUAD_FMAD_SSE2(y[i],yl[i],y[i],yl[i],x[j0],xl[j0],A->L->value[j]);
1207 #endif
1208 }
1209 is = A->U->ptr[i];
1210 ie = A->U->ptr[i+1];
1211 for(j=is;j<ie-1;j+=2)
1212 {
1213 j0 = A->U->index[j+0];
1214 j1 = A->U->index[j+1];
1215 #ifdef USE_SSE2
1216 LIS_QUAD_FMAD2_SSE2_LDSD(tt.hi[0],tt.lo[0],tt.hi[0],tt.lo[0],x[j0],xl[j0],x[j1],xl[j1],A->U->value[j]);
1217 #endif
1218 }
1219 for(;j<ie;j++)
1220 {
1221 j0 = A->U->index[j+0];
1222 #ifdef USE_SSE2
1223 LIS_QUAD_FMAD_SSE2(y[i],yl[i],y[i],yl[i],x[j0],xl[j0],A->U->value[j]);
1224 #endif
1225 }
1226 #ifdef USE_SSE2
1227 LIS_QUAD_ADD_SSE2(y[i],yl[i],y[i],yl[i],tt.hi[0],tt.lo[0]);
1228 LIS_QUAD_ADD_SSE2(y[i],yl[i],y[i],yl[i],tt.hi[1],tt.lo[1]);
1229 #endif
1230 }
1231 }
1232 else
1233 {
1234 jj0 = A->index;
1235 vv0 = A->value;
1236 #ifdef _OPENMP
1237 #ifndef USE_SSE2
1238 #pragma omp parallel for private(i,j,is,ie,j0,j1,tt,p1,p2,tq,bhi,blo,chi,clo,sh,sl,th,tl,eh,el)
1239 #else
1240 #pragma omp parallel for private(i,j,is,ie,j0,j1,tt,bh,ch,sh,wh,th,bl,cl,sl,wl,tl,p1,p2,t0,t1,t2,eh)
1241 #endif
1242 #endif
1243 for(i=0;i<np;i++)
1244 {
1245 tt.hi[0] = tt.hi[1] = tt.lo[0] = tt.lo[1] = 0.0;
1246
1247 is = A->ptr[i];
1248 ie = A->ptr[i+1];
1249 for(j=is;j<ie-1;j+=2)
1250 {
1251 j0 = jj0[j+0];
1252 j1 = jj0[j+1];
1253 #ifdef USE_SSE2
1254 LIS_QUAD_FMAD2_SSE2_LDSD(tt.hi[0],tt.lo[0],tt.hi[0],tt.lo[0],x[j0],xl[j0],x[j1],xl[j1],vv0[j]);
1255 #endif
1256 }
1257 #ifdef USE_SSE2
1258 LIS_QUAD_ADD_SSE2(y[i],yl[i],tt.hi[0],tt.lo[0],tt.hi[1],tt.lo[1]);
1259 #endif
1260 for(;j<ie;j++)
1261 {
1262 j0 = jj0[j+0];
1263 #ifdef USE_SSE2
1264 LIS_QUAD_FMAD_SSE2(y[i],yl[i],y[i],yl[i],x[j0],xl[j0],vv0[j]);
1265 #endif
1266 }
1267 }
1268 }
1269 }
1270 #endif
1271