1 /* -----------------------------------------------------------------
2 * Programmer(s): David J. Gardner @ LLNL
3 * -----------------------------------------------------------------
4 * Acknowledgements: This NVECTOR module is based on the NVECTOR
5 * Serial module by Scott D. Cohen, Alan C.
6 * Hindmarsh, Radu Serban, and Aaron Collier
7 * @ LLNL
8 * -----------------------------------------------------------------
9 * SUNDIALS Copyright Start
10 * Copyright (c) 2002-2020, Lawrence Livermore National Security
11 * and Southern Methodist University.
12 * All rights reserved.
13 *
14 * See the top-level LICENSE and NOTICE files for details.
15 *
16 * SPDX-License-Identifier: BSD-3-Clause
17 * SUNDIALS Copyright End
18 * -----------------------------------------------------------------
19 * This is the implementation file for a POSIX Threads (Pthreads)
20 * implementation of the NVECTOR package using a LOCAL array of
21 * structures to pass data to threads.
22 * -----------------------------------------------------------------*/
23
24 #include <stdio.h>
25 #include <stdlib.h>
26
27 #include <nvector/nvector_pthreads.h>
28 #include <sundials/sundials_math.h>
29 #include <math.h> /* define NAN */
30
31 #define ZERO RCONST(0.0)
32 #define HALF RCONST(0.5)
33 #define ONE RCONST(1.0)
34 #define ONEPT5 RCONST(1.5)
35
36 /* Private functions for special cases of vector operations */
37 static void VCopy_Pthreads(N_Vector x, N_Vector z); /* z=x */
38 static void VSum_Pthreads(N_Vector x, N_Vector y, N_Vector z); /* z=x+y */
39 static void VDiff_Pthreads(N_Vector x, N_Vector y, N_Vector z); /* z=x-y */
40 static void VNeg_Pthreads(N_Vector x, N_Vector z); /* z=-x */
41 static void VScaleSum_Pthreads(realtype c, N_Vector x, N_Vector y, N_Vector z); /* z=c(x+y) */
42 static void VScaleDiff_Pthreads(realtype c, N_Vector x, N_Vector y, N_Vector z); /* z=c(x-y) */
43 static void VLin1_Pthreads(realtype a, N_Vector x, N_Vector y, N_Vector z); /* z=ax+y */
44 static void VLin2_Pthreads(realtype a, N_Vector x, N_Vector y, N_Vector z); /* z=ax-y */
45 static void Vaxpy_Pthreads(realtype a, N_Vector x, N_Vector y); /* y <- ax+y */
46 static void VScaleBy_Pthreads(realtype a, N_Vector x); /* x <- ax */
47
48 /* Private functions for special cases of vector array operations */
49 static int VSumVectorArray_Pthreads(int nvec, N_Vector* X, N_Vector* Y, N_Vector* Z); /* Z=X+Y */
50 static int VDiffVectorArray_Pthreads(int nvec, N_Vector* X, N_Vector* Y, N_Vector* Z); /* Z=X-Y */
51 static int VScaleSumVectorArray_Pthreads(int nvec, realtype c, N_Vector* X, N_Vector* Y, N_Vector* Z); /* Z=c(X+Y) */
52 static int VScaleDiffVectorArray_Pthreads(int nvec, realtype c, N_Vector* X, N_Vector* Y, N_Vector* Z); /* Z=c(X-Y) */
53 static int VLin1VectorArray_Pthreads(int nvec, realtype a, N_Vector* X, N_Vector* Y, N_Vector* Z); /* Z=aX+Y */
54 static int VLin2VectorArray_Pthreads(int nvec, realtype a, N_Vector* X, N_Vector* Y, N_Vector* Z); /* Z=aX-Y */
55 static int VaxpyVectorArray_Pthreads(int nvec, realtype a, N_Vector* X, N_Vector* Y); /* Y <- aX+Y */
56
57 /* Pthread companion functions for vector operations */
58 static void *N_VLinearSum_PT(void *thread_data);
59 static void *N_VConst_PT(void *thread_data);
60 static void *N_VProd_PT(void *thread_data);
61 static void *N_VDiv_PT(void *thread_data);
62 static void *N_VScale_PT(void *thread_data);
63 static void *N_VAbs_PT(void *thread_data);
64 static void *N_VInv_PT(void *thread_data);
65 static void *N_VAddConst_PT(void *thread_data);
66 static void *N_VCompare_PT(void *thread_data);
67 static void *N_VDotProd_PT(void *thread_data);
68 static void *N_VMaxNorm_PT(void *thread_data);
69 static void *N_VWSqrSum_PT(void *thread_data);
70 static void *N_VMin_PT(void *thread_data);
71 static void *N_VWL2Norm_PT(void *thread_data);
72 static void *N_VL1Norm_PT(void *thread_data);
73 static void *N_VInvTest_PT(void *thread_data);
74 static void *N_VWSqrSumMask_PT(void *thread_data);
75 static void *N_VConstrMask_PT(void *thread_data);
76 static void *N_VMinQuotient_PT(void *thread_data);
77
78 /* Pthread companion functions special cases of vector operations */
79 static void *VCopy_PT(void *thread_data);
80 static void *VSum_PT(void *thread_data);
81 static void *VDiff_PT(void *thread_data);
82 static void *VNeg_PT(void *thread_data);
83 static void *VScaleSum_PT(void *thread_data);
84 static void *VScaleDiff_PT(void *thread_data);
85 static void *VLin1_PT(void *thread_data);
86 static void *VLin2_PT(void *thread_data);
87 static void *VScaleBy_PT(void *thread_data);
88 static void *Vaxpy_PT(void *thread_data);
89
90 /* Pthread companion functions for fused vector operations */
91 static void *N_VLinearCombination_PT(void *thread_data);
92 static void *N_VScaleAddMulti_PT(void *thread_data);
93 static void *N_VDotProdMulti_PT(void *thread_data);
94
95 /* Pthread companion functions for vector array operations */
96 static void *N_VLinearSumVectorArray_PT(void *thread_data);
97 static void *N_VScaleVectorArray_PT(void *thread_data);
98 static void *N_VConstVectorArray_PT(void *thread_data);
99 static void *N_VWrmsNormVectorArray_PT(void *thread_data);
100 static void *N_VWrmsNormMaskVectorArray_PT(void *thread_data);
101 static void *N_VScaleAddMultiVectorArray_PT(void *thread_data);
102 static void *N_VLinearCombinationVectorArray_PT(void *thread_data);
103
104 /* Pthread companion functions special cases of vector array operations */
105 static void *VSumVectorArray_PT(void *thread_data);
106 static void *VDiffVectorArray_PT(void *thread_data);
107 static void *VScaleSumVectorArray_PT(void *thread_data);
108 static void *VScaleDiffVectorArray_PT(void *thread_data);
109 static void *VLin1VectorArray_PT(void *thread_data);
110 static void *VLin2VectorArray_PT(void *thread_data);
111 static void *VaxpyVectorArray_PT(void *thread_data);
112
113 /* Function to determine loop values for threads */
114 static void N_VSplitLoop(int myid, int *nthreads, sunindextype *N,
115 sunindextype *start, sunindextype *end);
116
117 /* Function to initialize thread data */
118 static void N_VInitThreadData(Pthreads_Data *thread_data);
119
120 /*
121 * -----------------------------------------------------------------
122 * exported functions
123 * -----------------------------------------------------------------
124 */
125
126 /* ----------------------------------------------------------------
127 * Returns vector type ID. Used to identify vector implementation
128 * from abstract N_Vector interface.
129 */
N_VGetVectorID_Pthreads(N_Vector v)130 N_Vector_ID N_VGetVectorID_Pthreads(N_Vector v)
131 {
132 return SUNDIALS_NVEC_PTHREADS;
133 }
134
135 /* ----------------------------------------------------------------------------
136 * Function to create a new empty vector
137 */
138
N_VNewEmpty_Pthreads(sunindextype length,int num_threads)139 N_Vector N_VNewEmpty_Pthreads(sunindextype length, int num_threads)
140 {
141 N_Vector v;
142 N_VectorContent_Pthreads content;
143
144 /* Create an empty vector object */
145 v = NULL;
146 v = N_VNewEmpty();
147 if (v == NULL) return(NULL);
148
149 /* Attach operations */
150
151 /* constructors, destructors, and utility operations */
152 v->ops->nvgetvectorid = N_VGetVectorID_Pthreads;
153 v->ops->nvclone = N_VClone_Pthreads;
154 v->ops->nvcloneempty = N_VCloneEmpty_Pthreads;
155 v->ops->nvdestroy = N_VDestroy_Pthreads;
156 v->ops->nvspace = N_VSpace_Pthreads;
157 v->ops->nvgetarraypointer = N_VGetArrayPointer_Pthreads;
158 v->ops->nvsetarraypointer = N_VSetArrayPointer_Pthreads;
159 v->ops->nvgetlength = N_VGetLength_Pthreads;
160
161 /* standard vector operations */
162 v->ops->nvlinearsum = N_VLinearSum_Pthreads;
163 v->ops->nvconst = N_VConst_Pthreads;
164 v->ops->nvprod = N_VProd_Pthreads;
165 v->ops->nvdiv = N_VDiv_Pthreads;
166 v->ops->nvscale = N_VScale_Pthreads;
167 v->ops->nvabs = N_VAbs_Pthreads;
168 v->ops->nvinv = N_VInv_Pthreads;
169 v->ops->nvaddconst = N_VAddConst_Pthreads;
170 v->ops->nvdotprod = N_VDotProd_Pthreads;
171 v->ops->nvmaxnorm = N_VMaxNorm_Pthreads;
172 v->ops->nvwrmsnormmask = N_VWrmsNormMask_Pthreads;
173 v->ops->nvwrmsnorm = N_VWrmsNorm_Pthreads;
174 v->ops->nvmin = N_VMin_Pthreads;
175 v->ops->nvwl2norm = N_VWL2Norm_Pthreads;
176 v->ops->nvl1norm = N_VL1Norm_Pthreads;
177 v->ops->nvcompare = N_VCompare_Pthreads;
178 v->ops->nvinvtest = N_VInvTest_Pthreads;
179 v->ops->nvconstrmask = N_VConstrMask_Pthreads;
180 v->ops->nvminquotient = N_VMinQuotient_Pthreads;
181
182 /* fused and vector array operations are disabled (NULL) by default */
183
184 /* local reduction operations */
185 v->ops->nvdotprodlocal = N_VDotProd_Pthreads;
186 v->ops->nvmaxnormlocal = N_VMaxNorm_Pthreads;
187 v->ops->nvminlocal = N_VMin_Pthreads;
188 v->ops->nvl1normlocal = N_VL1Norm_Pthreads;
189 v->ops->nvinvtestlocal = N_VInvTest_Pthreads;
190 v->ops->nvconstrmasklocal = N_VConstrMask_Pthreads;
191 v->ops->nvminquotientlocal = N_VMinQuotient_Pthreads;
192 v->ops->nvwsqrsumlocal = N_VWSqrSumLocal_Pthreads;
193 v->ops->nvwsqrsummasklocal = N_VWSqrSumMaskLocal_Pthreads;
194
195 /* Create content */
196 content = NULL;
197 content = (N_VectorContent_Pthreads) malloc(sizeof *content);
198 if (content == NULL) { N_VDestroy(v); return(NULL); }
199
200 /* Attach content */
201 v->content = content;
202
203 /* Initialize content */
204 content->length = length;
205 content->num_threads = num_threads;
206 content->own_data = SUNFALSE;
207 content->data = NULL;
208
209 return(v);
210 }
211
212 /* ----------------------------------------------------------------------------
213 * Function to create a new vector
214 */
215
N_VNew_Pthreads(sunindextype length,int num_threads)216 N_Vector N_VNew_Pthreads(sunindextype length, int num_threads)
217 {
218 N_Vector v;
219 realtype *data;
220
221 v = NULL;
222 v = N_VNewEmpty_Pthreads(length, num_threads);
223 if (v == NULL) return(NULL);
224
225 /* Create data */
226 if (length > 0) {
227
228 /* Allocate memory */
229 data = NULL;
230 data = (realtype *) malloc(length * sizeof(realtype));
231 if(data == NULL) { N_VDestroy_Pthreads(v); return(NULL); }
232
233 /* Attach data */
234 NV_OWN_DATA_PT(v) = SUNTRUE;
235 NV_DATA_PT(v) = data;
236
237 }
238
239 return(v);
240 }
241
242 /* ----------------------------------------------------------------------------
243 * Function to create a vector with user data component
244 */
245
N_VMake_Pthreads(sunindextype length,int num_threads,realtype * v_data)246 N_Vector N_VMake_Pthreads(sunindextype length, int num_threads, realtype *v_data)
247 {
248 N_Vector v;
249
250 v = NULL;
251 v = N_VNewEmpty_Pthreads(length, num_threads);
252 if (v == NULL) return(NULL);
253
254 if (length > 0) {
255 /* Attach data */
256 NV_OWN_DATA_PT(v) = SUNFALSE;
257 NV_DATA_PT(v) = v_data;
258 }
259
260 return(v);
261 }
262
263 /* ----------------------------------------------------------------------------
264 * Function to create an array of new vectors.
265 */
266
N_VCloneVectorArray_Pthreads(int count,N_Vector w)267 N_Vector* N_VCloneVectorArray_Pthreads(int count, N_Vector w)
268 {
269 N_Vector* vs;
270 int j;
271
272 if (count <= 0) return(NULL);
273
274 vs = NULL;
275 vs = (N_Vector*) malloc(count * sizeof(N_Vector));
276 if(vs == NULL) return(NULL);
277
278 for (j = 0; j < count; j++) {
279 vs[j] = NULL;
280 vs[j] = N_VClone_Pthreads(w);
281 if (vs[j] == NULL) {
282 N_VDestroyVectorArray_Pthreads(vs, j-1);
283 return(NULL);
284 }
285 }
286
287 return(vs);
288 }
289
290 /* ----------------------------------------------------------------------------
291 * Function to create an array of new vectors with NULL data array.
292 */
293
N_VCloneVectorArrayEmpty_Pthreads(int count,N_Vector w)294 N_Vector* N_VCloneVectorArrayEmpty_Pthreads(int count, N_Vector w)
295 {
296 N_Vector* vs;
297 int j;
298
299 if (count <= 0) return(NULL);
300
301 vs = NULL;
302 vs = (N_Vector*) malloc(count * sizeof(N_Vector));
303 if(vs == NULL) return(NULL);
304
305 for (j = 0; j < count; j++) {
306 vs[j] = NULL;
307 vs[j] = N_VCloneEmpty_Pthreads(w);
308 if (vs[j] == NULL) {
309 N_VDestroyVectorArray_Pthreads(vs, j-1);
310 return(NULL);
311 }
312 }
313
314 return(vs);
315 }
316
317 /* ----------------------------------------------------------------------------
318 * Function to free an array created with N_VCloneVectorArray_Pthreads
319 */
320
N_VDestroyVectorArray_Pthreads(N_Vector * vs,int count)321 void N_VDestroyVectorArray_Pthreads(N_Vector* vs, int count)
322 {
323 int j;
324
325 for (j = 0; j < count; j++) N_VDestroy_Pthreads(vs[j]);
326
327 free(vs); vs = NULL;
328
329 return;
330 }
331
332 /* ----------------------------------------------------------------------------
333 * Function to return number of vector elements
334 */
N_VGetLength_Pthreads(N_Vector v)335 sunindextype N_VGetLength_Pthreads(N_Vector v)
336 {
337 return NV_LENGTH_PT(v);
338 }
339
340 /* ----------------------------------------------------------------------------
341 * Function to print a vector to stdout
342 */
343
N_VPrint_Pthreads(N_Vector x)344 void N_VPrint_Pthreads(N_Vector x)
345 {
346 N_VPrintFile_Pthreads(x, stdout);
347 }
348
349 /* ----------------------------------------------------------------------------
350 * Function to print a vector to outfile
351 */
352
N_VPrintFile_Pthreads(N_Vector x,FILE * outfile)353 void N_VPrintFile_Pthreads(N_Vector x, FILE *outfile)
354 {
355 sunindextype i, N;
356 realtype *xd;
357
358 xd = NULL;
359
360 N = NV_LENGTH_PT(x);
361 xd = NV_DATA_PT(x);
362
363 for (i = 0; i < N; i++) {
364 #if defined(SUNDIALS_EXTENDED_PRECISION)
365 fprintf(outfile, "%11.8Lg\n", xd[i]);
366 #elif defined(SUNDIALS_DOUBLE_PRECISION)
367 fprintf(outfile, "%11.8g\n", xd[i]);
368 #else
369 fprintf(outfile, "%11.8g\n", xd[i]);
370 #endif
371 }
372 fprintf(outfile, "\n");
373
374 return;
375 }
376
377 /*
378 * -----------------------------------------------------------------
379 * implementation of vector operations
380 * -----------------------------------------------------------------
381 */
382
383 /* ----------------------------------------------------------------------------
384 * Create new vector from existing vector without attaching data
385 */
386
N_VCloneEmpty_Pthreads(N_Vector w)387 N_Vector N_VCloneEmpty_Pthreads(N_Vector w)
388 {
389 N_Vector v;
390 N_VectorContent_Pthreads content;
391
392 if (w == NULL) return(NULL);
393
394 /* Create vector */
395 v = NULL;
396 v = N_VNewEmpty();
397 if (v == NULL) return(NULL);
398
399 /* Attach operations */
400 if (N_VCopyOps(w, v)) { N_VDestroy(v); return(NULL); }
401
402 /* Create content */
403 content = NULL;
404 content = (N_VectorContent_Pthreads) malloc(sizeof *content);
405 if (content == NULL) { N_VDestroy(v); return(NULL); }
406
407 /* Attach content */
408 v->content = content;
409
410 /* Initialize content */
411 content->length = NV_LENGTH_PT(w);
412 content->num_threads = NV_NUM_THREADS_PT(w);
413 content->own_data = SUNFALSE;
414 content->data = NULL;
415
416 return(v);
417 }
418
419
420 /* ----------------------------------------------------------------------------
421 * Create new vector from existing vector and attach data
422 */
423
N_VClone_Pthreads(N_Vector w)424 N_Vector N_VClone_Pthreads(N_Vector w)
425 {
426 N_Vector v;
427 realtype *data;
428 sunindextype length;
429
430 v = NULL;
431 v = N_VCloneEmpty_Pthreads(w);
432 if (v == NULL) return(NULL);
433
434 length = NV_LENGTH_PT(w);
435
436 /* Create data */
437 if (length > 0) {
438
439 /* Allocate memory */
440 data = NULL;
441 data = (realtype *) malloc(length * sizeof(realtype));
442 if(data == NULL) { N_VDestroy_Pthreads(v); return(NULL); }
443
444 /* Attach data */
445 NV_OWN_DATA_PT(v) = SUNTRUE;
446 NV_DATA_PT(v) = data;
447
448 }
449
450 return(v);
451 }
452
453 /* ----------------------------------------------------------------------------
454 * Destroy vector and free vector memory
455 */
456
N_VDestroy_Pthreads(N_Vector v)457 void N_VDestroy_Pthreads(N_Vector v)
458 {
459 if (v == NULL) return;
460
461 /* free content */
462 if (v->content != NULL) {
463 if (NV_OWN_DATA_PT(v) && NV_DATA_PT(v) != NULL) {
464 free(NV_DATA_PT(v));
465 NV_DATA_PT(v) = NULL;
466 }
467 free(v->content);
468 v->content = NULL;
469 }
470
471 /* free ops and vector */
472 if (v->ops != NULL) { free(v->ops); v->ops = NULL; }
473 free(v); v = NULL;
474
475 return;
476 }
477
478 /* ----------------------------------------------------------------------------
479 * Get storage requirement for vector
480 */
481
N_VSpace_Pthreads(N_Vector v,sunindextype * lrw,sunindextype * liw)482 void N_VSpace_Pthreads(N_Vector v, sunindextype *lrw, sunindextype *liw)
483 {
484 *lrw = NV_LENGTH_PT(v);
485 *liw = 1;
486
487 return;
488 }
489
490
491 /* ----------------------------------------------------------------------------
492 * Get vector data pointer
493 */
494
N_VGetArrayPointer_Pthreads(N_Vector v)495 realtype *N_VGetArrayPointer_Pthreads(N_Vector v)
496 {
497 return((realtype *) NV_DATA_PT(v));
498 }
499
500
501 /* ----------------------------------------------------------------------------
502 * Set vector data pointer
503 */
504
N_VSetArrayPointer_Pthreads(realtype * v_data,N_Vector v)505 void N_VSetArrayPointer_Pthreads(realtype *v_data, N_Vector v)
506 {
507 if (NV_LENGTH_PT(v) > 0) NV_DATA_PT(v) = v_data;
508
509 return;
510 }
511
512
513 /* ----------------------------------------------------------------------------
514 * Compute linear sum z[i] = a*x[i]+b*y[i]
515 */
516
N_VLinearSum_Pthreads(realtype a,N_Vector x,realtype b,N_Vector y,N_Vector z)517 void N_VLinearSum_Pthreads(realtype a, N_Vector x, realtype b, N_Vector y, N_Vector z)
518 {
519 sunindextype N;
520 int i, nthreads;
521 pthread_t *threads;
522 Pthreads_Data *thread_data;
523 pthread_attr_t attr;
524
525 realtype c;
526 N_Vector v1, v2;
527 booleantype test;
528
529 if ((b == ONE) && (z == y)) { /* BLAS usage: axpy y <- ax+y */
530 Vaxpy_Pthreads(a,x,y);
531 return;
532 }
533
534 if ((a == ONE) && (z == x)) { /* BLAS usage: axpy x <- by+x */
535 Vaxpy_Pthreads(b,y,x);
536 return;
537 }
538
539 /* Case: a == b == 1.0 */
540
541 if ((a == ONE) && (b == ONE)) {
542 VSum_Pthreads(x, y, z);
543 return;
544 }
545
546 /* Cases: (1) a == 1.0, b = -1.0, (2) a == -1.0, b == 1.0 */
547
548 if ((test = ((a == ONE) && (b == -ONE))) || ((a == -ONE) && (b == ONE))) {
549 v1 = test ? y : x;
550 v2 = test ? x : y;
551 VDiff_Pthreads(v2, v1, z);
552 return;
553 }
554
555 /* Cases: (1) a == 1.0, b == other or 0.0, (2) a == other or 0.0, b == 1.0 */
556 /* if a or b is 0.0, then user should have called N_VScale */
557
558 if ((test = (a == ONE)) || (b == ONE)) {
559 c = test ? b : a;
560 v1 = test ? y : x;
561 v2 = test ? x : y;
562 VLin1_Pthreads(c, v1, v2, z);
563 return;
564 }
565
566 /* Cases: (1) a == -1.0, b != 1.0, (2) a != 1.0, b == -1.0 */
567
568 if ((test = (a == -ONE)) || (b == -ONE)) {
569 c = test ? b : a;
570 v1 = test ? y : x;
571 v2 = test ? x : y;
572 VLin2_Pthreads(c, v1, v2, z);
573 return;
574 }
575
576 /* Case: a == b */
577 /* catches case both a and b are 0.0 - user should have called N_VConst */
578
579 if (a == b) {
580 VScaleSum_Pthreads(a, x, y, z);
581 return;
582 }
583
584 /* Case: a == -b */
585
586 if (a == -b) {
587 VScaleDiff_Pthreads(a, x, y, z);
588 return;
589 }
590
591 /* Do all cases not handled above:
592 (1) a == other, b == 0.0 - user should have called N_VScale
593 (2) a == 0.0, b == other - user should have called N_VScale
594 (3) a,b == other, a !=b, a != -b */
595
596 /* allocate threads and thread data structs */
597 N = NV_LENGTH_PT(x);
598 nthreads = NV_NUM_THREADS_PT(x);
599 threads = (pthread_t *) malloc(nthreads*sizeof(pthread_t));
600 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
601
602 /* set thread attributes */
603 pthread_attr_init(&attr);
604 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
605
606 for (i=0; i<nthreads; i++) {
607 /* initialize thread data */
608 N_VInitThreadData(&thread_data[i]);
609
610 /* compute start and end loop index for thread */
611 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
612
613 /* pack thread data */
614 thread_data[i].c1 = a;
615 thread_data[i].c2 = b;
616 thread_data[i].v1 = NV_DATA_PT(x);
617 thread_data[i].v2 = NV_DATA_PT(y);
618 thread_data[i].v3 = NV_DATA_PT(z);
619
620 /* create threads and call pthread companion function */
621 pthread_create(&threads[i], &attr, N_VLinearSum_PT, (void *) &thread_data[i]);
622 }
623
624 /* wait for all threads to finish */
625 for (i=0; i<nthreads; i++) {
626 pthread_join(threads[i], NULL);
627 }
628
629 /* clean up and return */
630 pthread_attr_destroy(&attr);
631 free(threads);
632 free(thread_data);
633
634 return;
635 }
636
637 /* ----------------------------------------------------------------------------
638 * Pthread companion function to N_VLinearSum
639 */
640
N_VLinearSum_PT(void * thread_data)641 static void *N_VLinearSum_PT(void *thread_data)
642 {
643 sunindextype i, start, end;
644 realtype a, b;
645 realtype *xd, *yd, *zd;
646 Pthreads_Data *my_data;
647
648 /* extract thread data */
649 my_data = (Pthreads_Data *) thread_data;
650
651 a = my_data->c1;
652 b = my_data->c2;
653 xd = my_data->v1;
654 yd = my_data->v2;
655 zd = my_data->v3;
656
657 start = my_data->start;
658 end = my_data->end;
659
660 /* compute linear sum */
661 for (i = start; i < end; i++){
662 zd[i] = (a*xd[i])+(b*yd[i]);
663 }
664
665 /* exit */
666 pthread_exit(NULL);
667 }
668
669
670 /* ----------------------------------------------------------------------------
671 * Assigns constant value to all vector elements, z[i] = c
672 */
673
N_VConst_Pthreads(realtype c,N_Vector z)674 void N_VConst_Pthreads(realtype c, N_Vector z)
675 {
676 sunindextype N;
677 int i, nthreads;
678 pthread_t *threads;
679 Pthreads_Data *thread_data;
680 pthread_attr_t attr;
681
682 /* allocate threads and thread data structs */
683 N = NV_LENGTH_PT(z);
684 nthreads = NV_NUM_THREADS_PT(z);
685 threads = malloc(nthreads*sizeof(pthread_t));
686 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
687
688 /* set thread attributes */
689 pthread_attr_init(&attr);
690 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
691
692 for (i=0; i<nthreads; i++) {
693 /* initialize thread data */
694 N_VInitThreadData(&thread_data[i]);
695
696 /* compute start and end loop index for thread */
697 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
698
699 /* pack thread data */
700 thread_data[i].c1 = c;
701 thread_data[i].v1 = NV_DATA_PT(z);
702
703 /* create threads and call pthread companion function */
704 pthread_create(&threads[i], &attr, N_VConst_PT, (void *) &thread_data[i]);
705 }
706
707 /* wait for all threads to finish */
708 for (i=0; i<nthreads; i++) {
709 pthread_join(threads[i], NULL);
710 }
711
712 /* clean up and return */
713 pthread_attr_destroy(&attr);
714 free(threads);
715 free(thread_data);
716
717 return;
718 }
719
720
721 /* ----------------------------------------------------------------------------
722 * Pthread companion function to N_VConst
723 */
724
N_VConst_PT(void * thread_data)725 static void *N_VConst_PT(void *thread_data)
726 {
727 sunindextype i, start, end;
728 realtype c;
729 realtype *zd;
730 Pthreads_Data *my_data;
731
732 /* extract thread data */
733 my_data = (Pthreads_Data *) thread_data;
734
735 c = my_data->c1;
736 zd = my_data->v1;
737
738 start = my_data->start;
739 end = my_data->end;
740
741 /* assign constant values */
742 for (i = start; i < end; i++)
743 zd[i] = c;
744
745 /* exit */
746 pthread_exit(NULL);
747 }
748
749
750 /* ----------------------------------------------------------------------------
751 * Compute componentwise product z[i] = x[i]*y[i]
752 */
753
N_VProd_Pthreads(N_Vector x,N_Vector y,N_Vector z)754 void N_VProd_Pthreads(N_Vector x, N_Vector y, N_Vector z)
755 {
756 sunindextype N;
757 int i, nthreads;
758 pthread_t *threads;
759 Pthreads_Data *thread_data;
760 pthread_attr_t attr;
761
762 /* allocate threads and thread data structs */
763 N = NV_LENGTH_PT(x);
764 nthreads = NV_NUM_THREADS_PT(x);
765 threads = malloc(nthreads*sizeof(pthread_t));
766 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
767
768 /* set thread attributes */
769 pthread_attr_init(&attr);
770 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
771
772 for (i=0; i<nthreads; i++) {
773 /* initialize thread data */
774 N_VInitThreadData(&thread_data[i]);
775
776 /* compute start and end loop index for thread */
777 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
778
779 /* pack thread data */
780 thread_data[i].v1 = NV_DATA_PT(x);
781 thread_data[i].v2 = NV_DATA_PT(y);
782 thread_data[i].v3 = NV_DATA_PT(z);
783
784 /* create threads and call pthread companion function */
785 pthread_create(&threads[i], &attr, N_VProd_PT, (void *) &thread_data[i]);
786 }
787
788 /* wait for all threads to finish */
789 for (i=0; i<nthreads; i++) {
790 pthread_join(threads[i], NULL);
791 }
792
793 /* clean up and exit */
794 pthread_attr_destroy(&attr);
795 free(threads);
796 free(thread_data);
797
798 return;
799 }
800
801 /* ----------------------------------------------------------------------------
802 * Pthread companion function to N_VProd
803 */
804
N_VProd_PT(void * thread_data)805 static void *N_VProd_PT(void *thread_data)
806 {
807 sunindextype i, start, end;
808 realtype *xd, *yd, *zd;
809 Pthreads_Data *my_data;
810
811 /* extract thread data */
812 my_data = (Pthreads_Data *) thread_data;
813
814 xd = my_data->v1;
815 yd = my_data->v2;
816 zd = my_data->v3;
817
818 start = my_data->start;
819 end = my_data->end;
820
821 /* compute componentwise product */
822 for (i = start; i < end; i++)
823 zd[i] = xd[i]*yd[i];
824
825 /* exit */
826 pthread_exit(NULL);
827 }
828
829
830 /* ----------------------------------------------------------------------------
831 * Compute componentwise division z[i] = x[i]/y[i]
832 */
833
N_VDiv_Pthreads(N_Vector x,N_Vector y,N_Vector z)834 void N_VDiv_Pthreads(N_Vector x, N_Vector y, N_Vector z)
835 {
836 sunindextype N;
837 int i, nthreads;
838 pthread_t *threads;
839 Pthreads_Data *thread_data;
840 pthread_attr_t attr;
841
842 /* allocate threads and thread data structs */
843 N = NV_LENGTH_PT(x);
844 nthreads = NV_NUM_THREADS_PT(x);
845 threads = malloc(nthreads*sizeof(pthread_t));
846 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
847
848 /* set thread attributes */
849 pthread_attr_init(&attr);
850 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
851
852 for (i=0; i<nthreads; i++) {
853 /* initialize thread data */
854 N_VInitThreadData(&thread_data[i]);
855
856 /* compute start and end loop index for thread */
857 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
858
859 /* pack thread data */
860 thread_data[i].v1 = NV_DATA_PT(x);
861 thread_data[i].v2 = NV_DATA_PT(y);
862 thread_data[i].v3 = NV_DATA_PT(z);
863
864 /* create threads and call pthread companion function */
865 pthread_create(&threads[i], &attr, N_VDiv_PT, (void *) &thread_data[i]);
866 }
867
868 /* wait for all threads to finish */
869 for (i=0; i<nthreads; i++) {
870 pthread_join(threads[i], NULL);
871 }
872
873 /* clean up and return */
874 pthread_attr_destroy(&attr);
875 free(threads);
876 free(thread_data);
877
878 return;
879 }
880
881
882 /* ----------------------------------------------------------------------------
883 * Pthread companion function to N_VDiv
884 */
885
N_VDiv_PT(void * thread_data)886 static void *N_VDiv_PT(void *thread_data)
887 {
888 sunindextype i, start, end;
889 realtype *xd, *yd, *zd;
890 Pthreads_Data *my_data;
891
892 /* extract thread data */
893 my_data = (Pthreads_Data *) thread_data;
894
895 xd = my_data->v1;
896 yd = my_data->v2;
897 zd = my_data->v3;
898
899 start = my_data->start;
900 end = my_data->end;
901
902 /* compute componentwise division */
903 for (i = start; i < end; i++)
904 zd[i] = xd[i]/yd[i];
905
906 /* exit */
907 pthread_exit(NULL);
908 }
909
910
911 /* ----------------------------------------------------------------------------
912 * Compute scaler multiplication z[i] = c*x[i]
913 */
914
N_VScale_Pthreads(realtype c,N_Vector x,N_Vector z)915 void N_VScale_Pthreads(realtype c, N_Vector x, N_Vector z)
916 {
917 sunindextype N;
918 int i, nthreads;
919 pthread_t *threads;
920 Pthreads_Data *thread_data;
921 pthread_attr_t attr;
922
923 if (z == x) { /* BLAS usage: scale x <- cx */
924 VScaleBy_Pthreads(c, x);
925 return;
926 }
927
928 if (c == ONE) {
929 VCopy_Pthreads(x, z);
930 } else if (c == -ONE) {
931 VNeg_Pthreads(x, z);
932 } else {
933 /* allocate threads and thread data structs */
934 N = NV_LENGTH_PT(x);
935 nthreads = NV_NUM_THREADS_PT(x);
936 threads = malloc(nthreads*sizeof(pthread_t));
937 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
938
939 /* set thread attributes */
940 pthread_attr_init(&attr);
941 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
942
943 for (i=0; i<nthreads; i++) {
944 /* initialize thread data */
945 N_VInitThreadData(&thread_data[i]);
946
947 /* compute start and end loop index for thread */
948 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
949
950 /* pack thread data */
951 thread_data[i].c1 = c;
952 thread_data[i].v1 = NV_DATA_PT(x);
953 thread_data[i].v2 = NV_DATA_PT(z);
954
955 /* create threads and call pthread companion function */
956 pthread_create(&threads[i], &attr, N_VScale_PT, (void *) &thread_data[i]);
957 }
958
959 /* wait for all threads to finish */
960 for (i=0; i<nthreads; i++) {
961 pthread_join(threads[i], NULL);
962 }
963
964 /* clean up */
965 pthread_attr_destroy(&attr);
966 free(threads);
967 free(thread_data);
968 }
969
970 return;
971 }
972
973
974 /* ----------------------------------------------------------------------------
975 * Pthread companion function to N_VScale
976 */
977
N_VScale_PT(void * thread_data)978 static void *N_VScale_PT(void *thread_data)
979 {
980 sunindextype i, start, end;
981 realtype c;
982 realtype *xd, *zd;
983 Pthreads_Data *my_data;
984
985 /* extract thread data */
986 my_data = (Pthreads_Data *) thread_data;
987
988 c = my_data->c1;
989 xd = my_data->v1;
990 zd = my_data->v2;
991
992 start = my_data->start;
993 end = my_data->end;
994
995 /* compute scaler multiplication */
996 for (i = start; i < end; i++)
997 zd[i] = c*xd[i];
998
999 /* exit */
1000 pthread_exit(NULL);
1001 }
1002
1003
1004 /* ----------------------------------------------------------------------------
1005 * Compute absolute value of vector components z[i] = SUNRabs(x[i])
1006 */
1007
N_VAbs_Pthreads(N_Vector x,N_Vector z)1008 void N_VAbs_Pthreads(N_Vector x, N_Vector z)
1009 {
1010 sunindextype N;
1011 int i, nthreads;
1012 pthread_t *threads;
1013 Pthreads_Data *thread_data;
1014 pthread_attr_t attr;
1015
1016 /* allocate threads and thread data structs */
1017 N = NV_LENGTH_PT(x);
1018 nthreads = NV_NUM_THREADS_PT(x);
1019 threads = malloc(nthreads*sizeof(pthread_t));
1020 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
1021
1022 /* set thread attributes */
1023 pthread_attr_init(&attr);
1024 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
1025
1026 for (i=0; i<nthreads; i++) {
1027 /* initialize thread data */
1028 N_VInitThreadData(&thread_data[i]);
1029
1030 /* compute start and end loop index for thread */
1031 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
1032
1033 /* pack thread data */
1034 thread_data[i].v1 = NV_DATA_PT(x);
1035 thread_data[i].v2 = NV_DATA_PT(z);
1036
1037 /* create threads and call pthread companion function */
1038 pthread_create(&threads[i], &attr, N_VAbs_PT, (void *) &thread_data[i]);
1039 }
1040
1041 /* wait for all threads to finish */
1042 for (i=0; i<nthreads; i++) {
1043 pthread_join(threads[i], NULL);
1044 }
1045
1046 /* clean up and return */
1047 pthread_attr_destroy(&attr);
1048 free(threads);
1049 free(thread_data);
1050
1051 return;
1052 }
1053
1054
1055 /* ----------------------------------------------------------------------------
1056 * Pthread companion function to N_VAbs
1057 */
1058
N_VAbs_PT(void * thread_data)1059 static void *N_VAbs_PT(void *thread_data)
1060 {
1061 sunindextype i, start, end;
1062 realtype *xd, *zd;
1063 Pthreads_Data *my_data;
1064
1065 /* extract thread data */
1066 my_data = (Pthreads_Data *) thread_data;
1067
1068 xd = my_data->v1;
1069 zd = my_data->v2;
1070
1071 start = my_data->start;
1072 end = my_data->end;
1073
1074 /* compute absolute value of components */
1075 for (i = start; i < end; i++)
1076 zd[i] = SUNRabs(xd[i]);
1077
1078 /* exit */
1079 pthread_exit(NULL);
1080 }
1081
1082
1083 /* ----------------------------------------------------------------------------
1084 * Compute componentwise inverse z[i] = 1 / x[i]
1085 */
1086
N_VInv_Pthreads(N_Vector x,N_Vector z)1087 void N_VInv_Pthreads(N_Vector x, N_Vector z)
1088 {
1089 sunindextype N;
1090 int i, nthreads;
1091 pthread_t *threads;
1092 Pthreads_Data *thread_data;
1093 pthread_attr_t attr;
1094
1095 /* allocate threads and thread data structs */
1096 N = NV_LENGTH_PT(x);
1097 nthreads = NV_NUM_THREADS_PT(x);
1098 threads = malloc(nthreads*sizeof(pthread_t));
1099 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
1100
1101 /* set thread attributes */
1102 pthread_attr_init(&attr);
1103 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
1104
1105 for (i=0; i<nthreads; i++) {
1106 /* initialize thread data */
1107 N_VInitThreadData(&thread_data[i]);
1108
1109 /* compute start and end loop index for thread */
1110 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
1111
1112 /* pack thread data */
1113 thread_data[i].v1 = NV_DATA_PT(x);
1114 thread_data[i].v2 = NV_DATA_PT(z);
1115
1116 /* create threads and call pthread companion function */
1117 pthread_create(&threads[i], &attr, N_VInv_PT, (void *) &thread_data[i]);
1118 }
1119
1120 /* wait for all threads to finish */
1121 for (i=0; i<nthreads; i++) {
1122 pthread_join(threads[i], NULL);
1123 }
1124
1125 /* clean up and return */
1126 pthread_attr_destroy(&attr);
1127 free(threads);
1128 free(thread_data);
1129
1130 return;
1131 }
1132
1133
1134 /* ----------------------------------------------------------------------------
1135 * Pthread companion function to N_VInv
1136 */
1137
N_VInv_PT(void * thread_data)1138 static void *N_VInv_PT(void *thread_data)
1139 {
1140 sunindextype i, start, end;
1141 realtype *xd, *zd;
1142 Pthreads_Data *my_data;
1143
1144 /* extract thread data */
1145 my_data = (Pthreads_Data *) thread_data;
1146
1147 xd = my_data->v1;
1148 zd = my_data->v2;
1149
1150 start = my_data->start;
1151 end = my_data->end;
1152
1153 /* compute componentwise inverse */
1154 for (i = start; i < end; i++)
1155 zd[i] = ONE/xd[i];
1156
1157 /* exit */
1158 pthread_exit(NULL);
1159 }
1160
1161
1162 /* ----------------------------------------------------------------------------
1163 * Compute componentwise addition of a scaler to a vector z[i] = x[i] + b
1164 */
1165
N_VAddConst_Pthreads(N_Vector x,realtype b,N_Vector z)1166 void N_VAddConst_Pthreads(N_Vector x, realtype b, N_Vector z)
1167 {
1168 sunindextype N;
1169 int i, nthreads;
1170 pthread_t *threads;
1171 Pthreads_Data *thread_data;
1172 pthread_attr_t attr;
1173
1174 /* allocate threads and thread data structs */
1175 N = NV_LENGTH_PT(x);
1176 nthreads = NV_NUM_THREADS_PT(x);
1177 threads = malloc(nthreads*sizeof(pthread_t));
1178 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
1179
1180 /* set thread attributes */
1181 pthread_attr_init(&attr);
1182 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
1183
1184 for (i=0; i<nthreads; i++) {
1185 /* initialize thread data */
1186 N_VInitThreadData(&thread_data[i]);
1187
1188 /* compute start and end loop index for thread */
1189 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
1190
1191 /* pack thread data */
1192 thread_data[i].c1 = b;
1193 thread_data[i].v1 = NV_DATA_PT(x);
1194 thread_data[i].v2 = NV_DATA_PT(z);
1195
1196 /* create threads and call pthread companion function */
1197 pthread_create(&threads[i], &attr, N_VAddConst_PT, (void *) &thread_data[i]);
1198 }
1199
1200 /* wait for all threads to finish */
1201 for (i=0; i<nthreads; i++) {
1202 pthread_join(threads[i], NULL);
1203 }
1204
1205 /* clean up and return */
1206 pthread_attr_destroy(&attr);
1207 free(threads);
1208 free(thread_data);
1209
1210 return;
1211 }
1212
1213
1214 /* ----------------------------------------------------------------------------
1215 * Pthread companion function to N_VAddConst
1216 */
1217
N_VAddConst_PT(void * thread_data)1218 static void *N_VAddConst_PT(void *thread_data)
1219 {
1220 sunindextype i, start, end;
1221 realtype b;
1222 realtype *xd, *zd;
1223 Pthreads_Data *my_data;
1224
1225 /* extract thread data */
1226 my_data = (Pthreads_Data *) thread_data;
1227
1228 b = my_data->c1;
1229 xd = my_data->v1;
1230 zd = my_data->v2;
1231
1232 start = my_data->start;
1233 end = my_data->end;
1234
1235 /* compute componentwise constant addition */
1236 for (i = start; i < end; i++)
1237 zd[i] = xd[i] + b;
1238
1239 /* exit */
1240 pthread_exit(NULL);
1241 }
1242
1243
1244 /* ----------------------------------------------------------------------------
1245 * Computes the dot product of two vectors, a = sum(x[i]*y[i])
1246 */
1247
N_VDotProd_Pthreads(N_Vector x,N_Vector y)1248 realtype N_VDotProd_Pthreads(N_Vector x, N_Vector y)
1249 {
1250 sunindextype N;
1251 int i, nthreads;
1252 pthread_t *threads;
1253 Pthreads_Data *thread_data;
1254 pthread_attr_t attr;
1255 pthread_mutex_t global_mutex;
1256 realtype sum = ZERO;
1257
1258 /* allocate threads and thread data structs */
1259 N = NV_LENGTH_PT(x);
1260 nthreads = NV_NUM_THREADS_PT(x);
1261 threads = malloc(nthreads*sizeof(pthread_t));
1262 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
1263
1264 /* set thread attributes */
1265 pthread_attr_init(&attr);
1266 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
1267
1268 /* lock for reduction */
1269 pthread_mutex_init(&global_mutex, NULL);
1270
1271 for (i=0; i<nthreads; i++) {
1272 /* initialize thread data */
1273 N_VInitThreadData(&thread_data[i]);
1274
1275 /* compute start and end loop index for thread */
1276 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
1277
1278 /* pack thread data */
1279 thread_data[i].v1 = NV_DATA_PT(x);
1280 thread_data[i].v2 = NV_DATA_PT(y);
1281 thread_data[i].global_val = ∑
1282 thread_data[i].global_mutex = &global_mutex;
1283
1284 /* create threads and call pthread companion function */
1285 pthread_create(&threads[i], &attr, N_VDotProd_PT, (void *) &thread_data[i]);
1286 }
1287
1288 /* wait for all threads to finish */
1289 for (i=0; i<nthreads; i++) {
1290 pthread_join(threads[i], NULL);
1291 }
1292
1293 /* clean up and return */
1294 pthread_attr_destroy(&attr);
1295 pthread_mutex_destroy(&global_mutex);
1296 free(threads);
1297 free(thread_data);
1298
1299 return(sum);
1300 }
1301
1302
1303 /* ----------------------------------------------------------------------------
1304 * Pthread companion function to N_VDotProd
1305 */
1306
N_VDotProd_PT(void * thread_data)1307 static void *N_VDotProd_PT(void *thread_data)
1308 {
1309 sunindextype i, start, end;
1310 realtype *xd, *yd;
1311 realtype local_sum, *global_sum;
1312 Pthreads_Data *my_data;
1313 pthread_mutex_t *global_mutex;
1314
1315 /* extract thread data */
1316 my_data = (Pthreads_Data *) thread_data;
1317
1318 xd = my_data->v1;
1319 yd = my_data->v2;
1320
1321 global_sum = my_data->global_val;
1322 global_mutex = my_data->global_mutex;
1323
1324 start = my_data->start;
1325 end = my_data->end;
1326
1327 /* compute dot product */
1328 local_sum = ZERO;
1329 for (i = start; i < end; i++)
1330 local_sum += xd[i] * yd[i];
1331
1332 /* update global sum */
1333 pthread_mutex_lock(global_mutex);
1334 *global_sum += local_sum;
1335 pthread_mutex_unlock(global_mutex);
1336
1337 /* exit */
1338 pthread_exit(NULL);
1339 }
1340
1341
1342 /* ----------------------------------------------------------------------------
1343 * Computes max norm of the vector
1344 */
1345
N_VMaxNorm_Pthreads(N_Vector x)1346 realtype N_VMaxNorm_Pthreads(N_Vector x)
1347 {
1348 sunindextype N;
1349 int i, nthreads;
1350 pthread_t *threads;
1351 Pthreads_Data *thread_data;
1352 pthread_attr_t attr;
1353 pthread_mutex_t global_mutex;
1354 realtype max = ZERO;
1355
1356 /* allocate threads and thread data structs */
1357 N = NV_LENGTH_PT(x);
1358 nthreads = NV_NUM_THREADS_PT(x);
1359 threads = malloc(nthreads*sizeof(pthread_t));
1360 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
1361
1362 /* set thread attributes */
1363 pthread_attr_init(&attr);
1364 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
1365
1366 /* lock for reduction */
1367 pthread_mutex_init(&global_mutex, NULL);
1368
1369 for (i=0; i<nthreads; i++) {
1370 /* initialize thread data */
1371 N_VInitThreadData(&thread_data[i]);
1372
1373 /* compute start and end loop index for thread */
1374 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
1375
1376 /* pack thread data */
1377 thread_data[i].v1 = NV_DATA_PT(x);
1378 thread_data[i].global_val = &max;
1379 thread_data[i].global_mutex = &global_mutex;
1380
1381 /* create threads and call pthread companion function */
1382 pthread_create(&threads[i], &attr, N_VMaxNorm_PT, (void *) &thread_data[i]);
1383 }
1384
1385 /* wait for all threads to finish */
1386 for (i=0; i<nthreads; i++) {
1387 pthread_join(threads[i], NULL);
1388 }
1389
1390 /* clean up and return */
1391 pthread_attr_destroy(&attr);
1392 pthread_mutex_destroy(&global_mutex);
1393 free(threads);
1394 free(thread_data);
1395
1396 return(max);
1397 }
1398
1399
1400 /* ----------------------------------------------------------------------------
1401 * Pthread companion function to N_VMaxNorm
1402 */
1403
N_VMaxNorm_PT(void * thread_data)1404 static void *N_VMaxNorm_PT(void *thread_data)
1405 {
1406 sunindextype i, start, end;
1407 realtype *xd;
1408 realtype local_max, *global_max;
1409 Pthreads_Data *my_data;
1410 pthread_mutex_t *global_mutex;
1411
1412 /* extract thread data */
1413 my_data = (Pthreads_Data *) thread_data;
1414
1415 xd = my_data->v1;
1416
1417 global_max = my_data->global_val;
1418 global_mutex = my_data->global_mutex;
1419
1420 start = my_data->start;
1421 end = my_data->end;
1422
1423 /* find local max */
1424 local_max = ZERO;
1425 for (i = start; i < end; i++)
1426 if (SUNRabs(xd[i]) > local_max) local_max = SUNRabs(xd[i]);
1427
1428 /* update global max */
1429 pthread_mutex_lock(global_mutex);
1430 if (local_max > *global_max) {
1431 *global_max = local_max;
1432 }
1433 pthread_mutex_unlock(global_mutex);
1434
1435 /* exit */
1436 pthread_exit(NULL);
1437 }
1438
1439
1440 /* ----------------------------------------------------------------------------
1441 * Computes weighted root mean square norm of a vector
1442 */
1443
N_VWrmsNorm_Pthreads(N_Vector x,N_Vector w)1444 realtype N_VWrmsNorm_Pthreads(N_Vector x, N_Vector w)
1445 {
1446 return(SUNRsqrt(N_VWSqrSumLocal_Pthreads(x, w)/(NV_LENGTH_PT(x))));
1447 }
1448
1449
1450 /* ----------------------------------------------------------------------------
1451 * Computes weighted square sum of a vector
1452 */
1453
N_VWSqrSumLocal_Pthreads(N_Vector x,N_Vector w)1454 realtype N_VWSqrSumLocal_Pthreads(N_Vector x, N_Vector w)
1455 {
1456 sunindextype N;
1457 int i, nthreads;
1458 pthread_t *threads;
1459 Pthreads_Data *thread_data;
1460 pthread_attr_t attr;
1461 pthread_mutex_t global_mutex;
1462 realtype sum = ZERO;
1463
1464 /* allocate threads and thread data structs */
1465 N = NV_LENGTH_PT(x);
1466 nthreads = NV_NUM_THREADS_PT(x);
1467 threads = malloc(nthreads*sizeof(pthread_t));
1468 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
1469
1470 /* set thread attributes */
1471 pthread_attr_init(&attr);
1472 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
1473
1474 /* lock for reduction */
1475 pthread_mutex_init(&global_mutex, NULL);
1476
1477 for (i=0; i<nthreads; i++) {
1478 /* initialize thread data */
1479 N_VInitThreadData(&thread_data[i]);
1480
1481 /* compute start and end loop index for thread */
1482 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
1483
1484 /* pack thread data */
1485 thread_data[i].v1 = NV_DATA_PT(x);
1486 thread_data[i].v2 = NV_DATA_PT(w);
1487 thread_data[i].global_val = ∑
1488 thread_data[i].global_mutex = &global_mutex;
1489
1490 /* create threads and call pthread companion function */
1491 pthread_create(&threads[i], &attr, N_VWSqrSum_PT, (void *) &thread_data[i]);
1492 }
1493
1494 /* wait for all threads to finish */
1495 for (i=0; i<nthreads; i++) {
1496 pthread_join(threads[i], NULL);
1497 }
1498
1499 /* clean up and return */
1500 pthread_attr_destroy(&attr);
1501 pthread_mutex_destroy(&global_mutex);
1502 free(threads);
1503 free(thread_data);
1504
1505 return(sum);
1506 }
1507
1508
1509 /* ----------------------------------------------------------------------------
1510 * Pthread companion function to N_VWrmsNorm
1511 */
1512
N_VWSqrSum_PT(void * thread_data)1513 static void *N_VWSqrSum_PT(void *thread_data)
1514 {
1515 sunindextype i, start, end;
1516 realtype *xd, *wd;
1517 realtype local_sum, *global_sum;
1518 Pthreads_Data *my_data;
1519 pthread_mutex_t *global_mutex;
1520
1521 /* extract thread data */
1522 my_data = (Pthreads_Data *) thread_data;
1523
1524 xd = my_data->v1;
1525 wd = my_data->v2;
1526
1527 global_sum = my_data->global_val;
1528 global_mutex = my_data->global_mutex;
1529
1530 start = my_data->start;
1531 end = my_data->end;
1532
1533 /* compute wrms norm */
1534 local_sum = ZERO;
1535 for (i = start; i < end; i++)
1536 local_sum += SUNSQR(xd[i] * wd[i]);
1537
1538 /* update global sum */
1539 pthread_mutex_lock(global_mutex);
1540 *global_sum += local_sum;
1541 pthread_mutex_unlock(global_mutex);
1542
1543 /* exit */
1544 pthread_exit(NULL);
1545 }
1546
1547
1548 /* ----------------------------------------------------------------------------
1549 * Computes weighted root mean square norm of a masked vector
1550 */
1551
N_VWrmsNormMask_Pthreads(N_Vector x,N_Vector w,N_Vector id)1552 realtype N_VWrmsNormMask_Pthreads(N_Vector x, N_Vector w, N_Vector id)
1553 {
1554 return(SUNRsqrt(N_VWSqrSumMaskLocal_Pthreads(x, w, id)/(NV_LENGTH_PT(x))));
1555 }
1556
1557
1558 /* ----------------------------------------------------------------------------
1559 * Computes weighted square sum of a masked vector
1560 */
1561
N_VWSqrSumMaskLocal_Pthreads(N_Vector x,N_Vector w,N_Vector id)1562 realtype N_VWSqrSumMaskLocal_Pthreads(N_Vector x, N_Vector w, N_Vector id)
1563 {
1564 sunindextype N;
1565 int i, nthreads;
1566 pthread_t *threads;
1567 Pthreads_Data *thread_data;
1568 pthread_attr_t attr;
1569 pthread_mutex_t global_mutex;
1570 realtype sum = ZERO;
1571
1572 /* allocate threads and thread data structs */
1573 N = NV_LENGTH_PT(x);
1574 nthreads = NV_NUM_THREADS_PT(x);
1575 threads = malloc(nthreads*sizeof(pthread_t));
1576 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
1577
1578 /* set thread attributes */
1579 pthread_attr_init(&attr);
1580 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
1581
1582 /* lock for reduction */
1583 pthread_mutex_init(&global_mutex, NULL);
1584
1585 for (i=0; i<nthreads; i++) {
1586 /* initialize thread data */
1587 N_VInitThreadData(&thread_data[i]);
1588
1589 /* compute start and end loop index for thread */
1590 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
1591
1592 /* pack thread data */
1593 thread_data[i].v1 = NV_DATA_PT(x);
1594 thread_data[i].v2 = NV_DATA_PT(w);
1595 thread_data[i].v3 = NV_DATA_PT(id);
1596 thread_data[i].global_val = ∑
1597 thread_data[i].global_mutex = &global_mutex;
1598
1599 /* create threads and call pthread companion function */
1600 pthread_create(&threads[i], &attr, N_VWSqrSumMask_PT, (void *) &thread_data[i]);
1601 }
1602
1603 /* wait for all threads to finish */
1604 for (i=0; i<nthreads; i++) {
1605 pthread_join(threads[i], NULL);
1606 }
1607
1608 /* clean up and return */
1609 pthread_attr_destroy(&attr);
1610 pthread_mutex_destroy(&global_mutex);
1611 free(threads);
1612 free(thread_data);
1613
1614 return(sum);
1615 }
1616
1617
1618 /* ----------------------------------------------------------------------------
1619 * Pthread companion function to N_VWSqrSumMask
1620 */
1621
N_VWSqrSumMask_PT(void * thread_data)1622 static void *N_VWSqrSumMask_PT(void *thread_data)
1623 {
1624 sunindextype i, start, end;
1625 realtype *xd, *wd, *idd;
1626 realtype local_sum, *global_sum;
1627 Pthreads_Data *my_data;
1628 pthread_mutex_t *global_mutex;
1629
1630 /* extract thread data */
1631 my_data = (Pthreads_Data *) thread_data;
1632
1633 xd = my_data->v1;
1634 wd = my_data->v2;
1635 idd = my_data->v3;
1636
1637 global_sum = my_data->global_val;
1638 global_mutex = my_data->global_mutex;
1639
1640 start = my_data->start;
1641 end = my_data->end;
1642
1643 /* compute wrms norm with mask */
1644 local_sum = ZERO;
1645 for (i = start; i < end; i++) {
1646 if (idd[i] > ZERO)
1647 local_sum += SUNSQR(xd[i]*wd[i]);
1648 }
1649
1650 /* update global sum */
1651 pthread_mutex_lock(global_mutex);
1652 *global_sum += local_sum;
1653 pthread_mutex_unlock(global_mutex);
1654
1655 /* exit */
1656 pthread_exit(NULL);
1657 }
1658
1659
1660 /* ----------------------------------------------------------------------------
1661 * Finds the minimun component of a vector
1662 */
1663
N_VMin_Pthreads(N_Vector x)1664 realtype N_VMin_Pthreads(N_Vector x)
1665 {
1666 sunindextype N;
1667 int i, nthreads;
1668 pthread_t *threads;
1669 Pthreads_Data *thread_data;
1670 pthread_attr_t attr;
1671 pthread_mutex_t global_mutex;
1672 realtype min;
1673
1674 /* initialize global min */
1675 min = NV_Ith_PT(x,0);
1676
1677 /* allocate threads and thread data structs */
1678 N = NV_LENGTH_PT(x);
1679 nthreads = NV_NUM_THREADS_PT(x);
1680 threads = malloc(nthreads*sizeof(pthread_t));
1681 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
1682
1683 /* set thread attributes */
1684 pthread_attr_init(&attr);
1685 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
1686
1687 /* lock for reduction */
1688 pthread_mutex_init(&global_mutex, NULL);
1689
1690 for (i=0; i<nthreads; i++) {
1691 /* initialize thread data */
1692 N_VInitThreadData(&thread_data[i]);
1693
1694 /* compute start and end loop index for thread */
1695 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
1696
1697 /* pack thread data */
1698 thread_data[i].v1 = NV_DATA_PT(x);
1699 thread_data[i].global_val = &min;
1700 thread_data[i].global_mutex = &global_mutex;
1701
1702 /* create threads and call pthread companion function */
1703 pthread_create(&threads[i], &attr, N_VMin_PT, (void *) &thread_data[i]);
1704 }
1705
1706 /* wait for all threads to finish */
1707 for (i=0; i<nthreads; i++) {
1708 pthread_join(threads[i], NULL);
1709 }
1710
1711 /* clean up and return */
1712 pthread_attr_destroy(&attr);
1713 pthread_mutex_destroy(&global_mutex);
1714 free(threads);
1715 free(thread_data);
1716
1717 return(min);
1718 }
1719
1720
1721 /* ----------------------------------------------------------------------------
1722 * Pthread companion function to N_VMin
1723 */
1724
N_VMin_PT(void * thread_data)1725 static void *N_VMin_PT(void *thread_data)
1726 {
1727 sunindextype i, start, end;
1728 realtype *xd;
1729 realtype local_min, *global_min;
1730 Pthreads_Data *my_data;
1731 pthread_mutex_t *global_mutex;
1732
1733 /* extract thread data */
1734 my_data = (Pthreads_Data *) thread_data;
1735
1736 xd = my_data->v1;
1737
1738 global_min = my_data->global_val;
1739 global_mutex = my_data->global_mutex;
1740
1741 start = my_data->start;
1742 end = my_data->end;
1743
1744 /* find local min */
1745 local_min = *global_min;
1746 for (i = start; i < end; i++) {
1747 if (xd[i] < local_min)
1748 local_min = xd[i];
1749 }
1750
1751 /* update global min */
1752 pthread_mutex_lock(global_mutex);
1753 if (local_min < *global_min)
1754 *global_min = local_min;
1755 pthread_mutex_unlock(global_mutex);
1756
1757 /* exit */
1758 pthread_exit(NULL);
1759 }
1760
1761
1762 /* ----------------------------------------------------------------------------
1763 * Computes weighted L2 norm of a vector
1764 */
1765
N_VWL2Norm_Pthreads(N_Vector x,N_Vector w)1766 realtype N_VWL2Norm_Pthreads(N_Vector x, N_Vector w)
1767 {
1768 sunindextype N;
1769 int i, nthreads;
1770 pthread_t *threads;
1771 Pthreads_Data *thread_data;
1772 pthread_attr_t attr;
1773 pthread_mutex_t global_mutex;
1774 realtype sum = ZERO;
1775
1776 /* allocate threads and thread data structs */
1777 N = NV_LENGTH_PT(x);
1778 nthreads = NV_NUM_THREADS_PT(x);
1779 threads = malloc(nthreads*sizeof(pthread_t));
1780 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
1781
1782 /* set thread attributes */
1783 pthread_attr_init(&attr);
1784 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
1785
1786 /* lock for reduction */
1787 pthread_mutex_init(&global_mutex, NULL);
1788
1789 for (i=0; i<nthreads; i++) {
1790 /* initialize thread data */
1791 N_VInitThreadData(&thread_data[i]);
1792
1793 /* compute start and end loop index for thread */
1794 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
1795
1796 /* pack thread data */
1797 thread_data[i].v1 = NV_DATA_PT(x);
1798 thread_data[i].v2 = NV_DATA_PT(w);
1799 thread_data[i].global_val = ∑
1800 thread_data[i].global_mutex = &global_mutex;
1801
1802 /* create threads and call pthread companion function */
1803 pthread_create(&threads[i], &attr, N_VWL2Norm_PT, (void *) &thread_data[i]);
1804 }
1805
1806 /* wait for all threads to finish */
1807 for (i=0; i<nthreads; i++) {
1808 pthread_join(threads[i], NULL);
1809 }
1810
1811 /* clean up and return */
1812 pthread_attr_destroy(&attr);
1813 pthread_mutex_destroy(&global_mutex);
1814 free(threads);
1815 free(thread_data);
1816
1817 return(SUNRsqrt(sum));
1818 }
1819
1820
1821 /* ----------------------------------------------------------------------------
1822 * Pthread companion function to N_VWL2Norm
1823 */
1824
N_VWL2Norm_PT(void * thread_data)1825 static void *N_VWL2Norm_PT(void *thread_data)
1826 {
1827 sunindextype i, start, end;
1828 realtype *xd, *wd;
1829 realtype local_sum, *global_sum;
1830 Pthreads_Data *my_data;
1831 pthread_mutex_t *global_mutex;
1832
1833 /* extract thread data */
1834 my_data = (Pthreads_Data *) thread_data;
1835
1836 xd = my_data->v1;
1837 wd = my_data->v2;
1838
1839 global_sum = my_data->global_val;
1840 global_mutex = my_data->global_mutex;
1841
1842 start = my_data->start;
1843 end = my_data->end;
1844
1845 /* compute WL2 norm */
1846 local_sum = ZERO;
1847 for (i = start; i < end; i++)
1848 local_sum += SUNSQR(xd[i]*wd[i]);
1849
1850 /* update global sum */
1851 pthread_mutex_lock(global_mutex);
1852 *global_sum += local_sum;
1853 pthread_mutex_unlock(global_mutex);
1854
1855 /* exit */
1856 pthread_exit(NULL);
1857 }
1858
1859
1860 /* ----------------------------------------------------------------------------
1861 * Computes L1 norm of a vector
1862 */
1863
N_VL1Norm_Pthreads(N_Vector x)1864 realtype N_VL1Norm_Pthreads(N_Vector x)
1865 {
1866 sunindextype N;
1867 int i, nthreads;
1868 pthread_t *threads;
1869 Pthreads_Data *thread_data;
1870 pthread_attr_t attr;
1871 pthread_mutex_t global_mutex;
1872 realtype sum = ZERO;
1873
1874 /* allocate threads and thread data structs */
1875 N = NV_LENGTH_PT(x);
1876 nthreads = NV_NUM_THREADS_PT(x);
1877 threads = malloc(nthreads*sizeof(pthread_t));
1878 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
1879
1880 /* set thread attributes */
1881 pthread_attr_init(&attr);
1882 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
1883
1884 /* lock for reduction */
1885 pthread_mutex_init(&global_mutex, NULL);
1886
1887 for (i=0; i<nthreads; i++) {
1888 /* initialize thread data */
1889 N_VInitThreadData(&thread_data[i]);
1890
1891 /* compute start and end loop index for thread */
1892 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
1893
1894 /* pack thread data */
1895 thread_data[i].v1 = NV_DATA_PT(x);
1896 thread_data[i].global_val = ∑
1897 thread_data[i].global_mutex = &global_mutex;
1898
1899 /* create threads and call pthread companion function */
1900 pthread_create(&threads[i], &attr, N_VL1Norm_PT, (void *) &thread_data[i]);
1901 }
1902
1903 /* wait for all threads to finish */
1904 for (i=0; i<nthreads; i++) {
1905 pthread_join(threads[i], NULL);
1906 }
1907
1908 /* clean up and return */
1909 pthread_attr_destroy(&attr);
1910 pthread_mutex_destroy(&global_mutex);
1911 free(threads);
1912 free(thread_data);
1913
1914 return(sum);
1915 }
1916
1917
1918 /* ----------------------------------------------------------------------------
1919 * Pthread companion function to N_VL1Norm
1920 */
1921
N_VL1Norm_PT(void * thread_data)1922 static void *N_VL1Norm_PT(void *thread_data)
1923 {
1924 sunindextype i, start, end;
1925 realtype *xd;
1926 realtype local_sum, *global_sum;
1927 Pthreads_Data *my_data;
1928 pthread_mutex_t *global_mutex;
1929
1930 /* extract thread data */
1931 my_data = (Pthreads_Data *) thread_data;
1932
1933 xd = my_data->v1;
1934
1935 global_sum = my_data->global_val;
1936 global_mutex = my_data->global_mutex;
1937
1938 start = my_data->start;
1939 end = my_data->end;
1940
1941 /* compute L1 norm */
1942 local_sum = ZERO;
1943 for (i = start; i < end; i++)
1944 local_sum += SUNRabs(xd[i]);
1945
1946 /* update global sum */
1947 pthread_mutex_lock(global_mutex);
1948 *global_sum += local_sum;
1949 pthread_mutex_unlock(global_mutex);
1950
1951 /* exit */
1952 pthread_exit(NULL);
1953 }
1954
1955
1956 /* ----------------------------------------------------------------------------
1957 * Compare vector component values to a scaler
1958 */
1959
N_VCompare_Pthreads(realtype c,N_Vector x,N_Vector z)1960 void N_VCompare_Pthreads(realtype c, N_Vector x, N_Vector z)
1961 {
1962 sunindextype N;
1963 int i, nthreads;
1964 pthread_t *threads;
1965 Pthreads_Data *thread_data;
1966 pthread_attr_t attr;
1967
1968 /* allocate threads and thread data structs */
1969 N = NV_LENGTH_PT(x);
1970 nthreads = NV_NUM_THREADS_PT(x);
1971 threads = malloc(nthreads*sizeof(pthread_t));
1972 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
1973
1974 /* set thread attributes */
1975 pthread_attr_init(&attr);
1976 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
1977
1978 for (i=0; i<nthreads; i++) {
1979 /* initialize thread data */
1980 N_VInitThreadData(&thread_data[i]);
1981
1982 /* compute start and end loop index for thread */
1983 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
1984
1985 /* pack thread data */
1986 thread_data[i].c1 = c;
1987 thread_data[i].v1 = NV_DATA_PT(x);
1988 thread_data[i].v2 = NV_DATA_PT(z);
1989
1990 /* create threads and call pthread companion function */
1991 pthread_create(&threads[i], &attr, N_VCompare_PT, (void *) &thread_data[i]);
1992 }
1993
1994 /* wait for all threads to finish */
1995 for (i=0; i<nthreads; i++) {
1996 pthread_join(threads[i], NULL);
1997 }
1998
1999 /* clean up and return */
2000 pthread_attr_destroy(&attr);
2001 free(threads);
2002 free(thread_data);
2003
2004 return;
2005 }
2006
2007 /* ----------------------------------------------------------------------------
2008 * Pthread companion function to N_VCompare
2009 */
2010
N_VCompare_PT(void * thread_data)2011 static void *N_VCompare_PT(void *thread_data)
2012 {
2013 sunindextype i, start, end;
2014 realtype c;
2015 realtype *xd, *zd;
2016 Pthreads_Data *my_data;
2017
2018 /* extract thread data */
2019 my_data = (Pthreads_Data *) thread_data;
2020
2021 c = my_data->c1;
2022 xd = my_data->v1;
2023 zd = my_data->v2;
2024
2025 start = my_data->start;
2026 end = my_data->end;
2027
2028 /* compare component to scaler */
2029 for (i = start; i < end; i++)
2030 zd[i] = (SUNRabs(xd[i]) >= c) ? ONE : ZERO;
2031
2032 /* exit */
2033 pthread_exit(NULL);
2034 }
2035
2036
2037 /* ----------------------------------------------------------------------------
2038 * Compute componentwise inverse z[i] = ONE/x[i] and check if x[i] == ZERO
2039 */
2040
N_VInvTest_Pthreads(N_Vector x,N_Vector z)2041 booleantype N_VInvTest_Pthreads(N_Vector x, N_Vector z)
2042 {
2043 sunindextype N;
2044 int i, nthreads;
2045 pthread_t *threads;
2046 Pthreads_Data *thread_data;
2047 pthread_attr_t attr;
2048
2049 realtype val = ZERO;
2050
2051 /* allocate threads and thread data structs */
2052 N = NV_LENGTH_PT(x);
2053 nthreads = NV_NUM_THREADS_PT(x);
2054 threads = malloc(nthreads*sizeof(pthread_t));
2055 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
2056
2057 /* set thread attributes */
2058 pthread_attr_init(&attr);
2059 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
2060
2061 for (i=0; i<nthreads; i++) {
2062 /* initialize thread data */
2063 N_VInitThreadData(&thread_data[i]);
2064
2065 /* compute start and end loop index for thread */
2066 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
2067
2068 /* pack thread data */
2069 thread_data[i].v1 = NV_DATA_PT(x);
2070 thread_data[i].v2 = NV_DATA_PT(z);
2071 thread_data[i].global_val = &val;
2072
2073 /* create threads and call pthread companion function */
2074 pthread_create(&threads[i], &attr, N_VInvTest_PT, (void *) &thread_data[i]);
2075 }
2076
2077 /* wait for all threads to finish */
2078 for (i=0; i<nthreads; i++) {
2079 pthread_join(threads[i], NULL);
2080 }
2081
2082 /* clean up and return */
2083 pthread_attr_destroy(&attr);
2084 free(threads);
2085 free(thread_data);
2086
2087 if (val > ZERO)
2088 return (SUNFALSE);
2089 else
2090 return (SUNTRUE);
2091 }
2092
2093
2094 /* ----------------------------------------------------------------------------
2095 * Pthread companion function to N_VInvTest
2096 */
2097
N_VInvTest_PT(void * thread_data)2098 static void *N_VInvTest_PT(void *thread_data)
2099 {
2100 sunindextype i, start, end;
2101 realtype *xd, *zd;
2102 realtype local_val, *global_val;
2103 Pthreads_Data *my_data;
2104
2105 /* extract thread data */
2106 my_data = (Pthreads_Data *) thread_data;
2107
2108 xd = my_data->v1;
2109 zd = my_data->v2;
2110
2111 global_val = my_data->global_val;
2112
2113 start = my_data->start;
2114 end = my_data->end;
2115
2116 /* compute inverse with check for divide by ZERO */
2117 local_val = ZERO;
2118 for (i = start; i < end; i++) {
2119 if (xd[i] == ZERO)
2120 local_val = ONE;
2121 else
2122 zd[i] = ONE/xd[i];
2123 }
2124
2125 /* update global val */
2126 if (local_val > ZERO) {
2127 *global_val = local_val;
2128 }
2129
2130 /* exit */
2131 pthread_exit(NULL);
2132 }
2133
2134
2135 /* ----------------------------------------------------------------------------
2136 * Compute constraint mask of a vector
2137 */
2138
N_VConstrMask_Pthreads(N_Vector c,N_Vector x,N_Vector m)2139 booleantype N_VConstrMask_Pthreads(N_Vector c, N_Vector x, N_Vector m)
2140 {
2141 sunindextype N;
2142 int i, nthreads;
2143 pthread_t *threads;
2144 Pthreads_Data *thread_data;
2145 pthread_attr_t attr;
2146
2147 realtype val = ZERO;
2148
2149 /* allocate threads and thread data structs */
2150 N = NV_LENGTH_PT(x);
2151 nthreads = NV_NUM_THREADS_PT(x);
2152 threads = malloc(nthreads*sizeof(pthread_t));
2153 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
2154
2155 /* set thread attributes */
2156 pthread_attr_init(&attr);
2157 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
2158
2159 for (i=0; i<nthreads; i++) {
2160 /* initialize thread data */
2161 N_VInitThreadData(&thread_data[i]);
2162
2163 /* compute start and end loop index for thread */
2164 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
2165
2166 /* pack thread data */
2167 thread_data[i].v1 = NV_DATA_PT(c);
2168 thread_data[i].v2 = NV_DATA_PT(x);
2169 thread_data[i].v3 = NV_DATA_PT(m);
2170 thread_data[i].global_val = &val;
2171
2172 /* create threads and call pthread companion function */
2173 pthread_create(&threads[i], &attr, N_VConstrMask_PT, (void *) &thread_data[i]);
2174 }
2175
2176 /* wait for all threads to finish */
2177 for (i=0; i<nthreads; i++) {
2178 pthread_join(threads[i], NULL);
2179 }
2180
2181 /* clean up and return */
2182 pthread_attr_destroy(&attr);
2183 free(threads);
2184 free(thread_data);
2185
2186 if (val > ZERO)
2187 return(SUNFALSE);
2188 else
2189 return(SUNTRUE);
2190 }
2191
2192
2193 /* ----------------------------------------------------------------------------
2194 * Pthread companion function to N_VConstrMask
2195 */
2196
N_VConstrMask_PT(void * thread_data)2197 static void *N_VConstrMask_PT(void *thread_data)
2198 {
2199 sunindextype i, start, end;
2200 realtype *cd, *xd, *md;
2201 realtype local_val, *global_val;
2202 Pthreads_Data *my_data;
2203
2204 /* extract thread data */
2205 my_data = (Pthreads_Data *) thread_data;
2206
2207 cd = my_data->v1;
2208 xd = my_data->v2;
2209 md = my_data->v3;
2210
2211 global_val = my_data->global_val;
2212
2213 start = my_data->start;
2214 end = my_data->end;
2215
2216 /* compute constraint mask */
2217 local_val = ZERO;
2218 for (i = start; i < end; i++) {
2219 md[i] = ZERO;
2220
2221 /* Continue if no constraints were set for the variable */
2222 if (cd[i] == ZERO)
2223 continue;
2224
2225 /* Check if a set constraint has been violated */
2226 if ((SUNRabs(cd[i]) > ONEPT5 && xd[i]*cd[i] <= ZERO) ||
2227 (SUNRabs(cd[i]) > HALF && xd[i]*cd[i] < ZERO)) {
2228 local_val = md[i] = ONE;
2229 }
2230 }
2231
2232 /* update global val */
2233 if (local_val > ZERO) {
2234 *global_val = local_val;
2235 }
2236
2237 /* exit */
2238 pthread_exit(NULL);
2239 }
2240
2241
2242 /* ----------------------------------------------------------------------------
2243 * Compute minimum componentwise quotient
2244 */
2245
N_VMinQuotient_Pthreads(N_Vector num,N_Vector denom)2246 realtype N_VMinQuotient_Pthreads(N_Vector num, N_Vector denom)
2247 {
2248 sunindextype N;
2249 int i, nthreads;
2250 pthread_t *threads;
2251 Pthreads_Data *thread_data;
2252 pthread_attr_t attr;
2253 pthread_mutex_t global_mutex;
2254 realtype min = BIG_REAL;
2255
2256 /* allocate threads and thread data structs */
2257 N = NV_LENGTH_PT(num);
2258 nthreads = NV_NUM_THREADS_PT(num);
2259 threads = malloc(nthreads*sizeof(pthread_t));
2260 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
2261
2262 /* set thread attributes */
2263 pthread_attr_init(&attr);
2264 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
2265
2266 /* lock for reduction */
2267 pthread_mutex_init(&global_mutex, NULL);
2268
2269 for (i=0; i<nthreads; i++) {
2270 /* initialize thread data */
2271 N_VInitThreadData(&thread_data[i]);
2272
2273 /* compute start and end loop index for thread */
2274 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
2275
2276 /* pack thread data */
2277 thread_data[i].v1 = NV_DATA_PT(num);
2278 thread_data[i].v2 = NV_DATA_PT(denom);
2279 thread_data[i].global_val = &min;
2280 thread_data[i].global_mutex = &global_mutex;
2281
2282 /* create threads and call pthread companion function */
2283 pthread_create(&threads[i], &attr, N_VMinQuotient_PT, (void *) &thread_data[i]);
2284 }
2285
2286 /* wait for all threads to finish */
2287 for (i=0; i<nthreads; i++) {
2288 pthread_join(threads[i], NULL);
2289 }
2290
2291 /* clean up and return */
2292 pthread_attr_destroy(&attr);
2293 pthread_mutex_destroy(&global_mutex);
2294 free(threads);
2295 free(thread_data);
2296
2297 return(min);
2298 }
2299
2300
2301 /* ----------------------------------------------------------------------------
2302 * Pthread companion function to N_VConstrMask
2303 */
2304
N_VMinQuotient_PT(void * thread_data)2305 static void *N_VMinQuotient_PT(void *thread_data)
2306 {
2307 sunindextype i, start, end;
2308 realtype *nd, *dd;
2309 realtype local_min, *global_min;
2310 Pthreads_Data *my_data;
2311 pthread_mutex_t *global_mutex;
2312
2313 /* extract thread data */
2314 my_data = (Pthreads_Data *) thread_data;
2315
2316 nd = my_data->v1;
2317 dd = my_data->v2;
2318
2319 global_min = my_data->global_val;
2320 global_mutex = my_data->global_mutex;
2321
2322 start = my_data->start;
2323 end = my_data->end;
2324
2325 /* compute minimum quotient */
2326 local_min = BIG_REAL;
2327 for (i = start; i < end; i++) {
2328 if (dd[i] == ZERO)
2329 continue;
2330 local_min = SUNMIN(local_min, nd[i]/dd[i]);
2331 }
2332
2333 /* update global min */
2334 pthread_mutex_lock(global_mutex);
2335 if (local_min < *global_min)
2336 *global_min = local_min;
2337 pthread_mutex_unlock(global_mutex);
2338
2339 /* exit */
2340 pthread_exit(NULL);
2341 }
2342
2343
2344 /*
2345 * -----------------------------------------------------------------------------
2346 * fused vector operations
2347 * -----------------------------------------------------------------------------
2348 */
2349
2350
2351 /* -----------------------------------------------------------------------------
2352 * Compute the linear combination z = c[i]*X[i]
2353 */
2354
N_VLinearCombination_Pthreads(int nvec,realtype * c,N_Vector * X,N_Vector z)2355 int N_VLinearCombination_Pthreads(int nvec, realtype* c, N_Vector* X, N_Vector z)
2356 {
2357 sunindextype N;
2358 int i, nthreads;
2359 pthread_t *threads;
2360 Pthreads_Data *thread_data;
2361 pthread_attr_t attr;
2362
2363 /* invalid number of vectors */
2364 if (nvec < 1) return(-1);
2365
2366 /* should have called N_VScale */
2367 if (nvec == 1) {
2368 N_VScale_Pthreads(c[0], X[0], z);
2369 return(0);
2370 }
2371
2372 /* should have called N_VLinearSum */
2373 if (nvec == 2) {
2374 N_VLinearSum_Pthreads(c[0], X[0], c[1], X[1], z);
2375 return(0);
2376 }
2377
2378 /* get vector length and data array */
2379 N = NV_LENGTH_PT(z);
2380 nthreads = NV_NUM_THREADS_PT(z);
2381 threads = malloc(nthreads*sizeof(pthread_t));
2382 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
2383
2384 /* set thread attributes */
2385 pthread_attr_init(&attr);
2386 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
2387
2388 for (i=0; i<nthreads; i++) {
2389 /* initialize thread data */
2390 N_VInitThreadData(&thread_data[i]);
2391
2392 /* compute start and end loop index for thread */
2393 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
2394
2395 /* pack thread data */
2396 thread_data[i].nvec = nvec;
2397 thread_data[i].cvals = c;
2398 thread_data[i].Y1 = X;
2399 thread_data[i].x1 = z;
2400
2401 /* create threads and call pthread companion function */
2402 pthread_create(&threads[i], &attr, N_VLinearCombination_PT, (void *) &thread_data[i]);
2403 }
2404
2405 /* wait for all threads to finish */
2406 for (i=0; i<nthreads; i++) {
2407 pthread_join(threads[i], NULL);
2408 }
2409
2410 /* clean up and return */
2411 pthread_attr_destroy(&attr);
2412 free(threads);
2413 free(thread_data);
2414
2415 return(0);
2416 }
2417
2418
2419 /* -----------------------------------------------------------------------------
2420 * Pthread companion function to N_VLinearCombination
2421 */
2422
N_VLinearCombination_PT(void * thread_data)2423 static void *N_VLinearCombination_PT(void *thread_data)
2424 {
2425 Pthreads_Data *my_data;
2426 sunindextype j, start, end;
2427
2428 int i;
2429 realtype* c=NULL;
2430 realtype* xd=NULL;
2431 realtype* zd=NULL;
2432
2433 /* extract thread data */
2434 my_data = (Pthreads_Data *) thread_data;
2435
2436 start = my_data->start;
2437 end = my_data->end;
2438
2439 c = my_data->cvals;
2440 zd = NV_DATA_PT(my_data->x1);
2441
2442 /*
2443 * X[0] += c[i]*X[i], i = 1,...,nvec-1
2444 */
2445 if ((my_data->Y1[0] == my_data->x1) && (c[0] == ONE)) {
2446 for (i=1; i<my_data->nvec; i++) {
2447 xd = NV_DATA_PT(my_data->Y1[i]);
2448 for (j=start; j<end; j++) {
2449 zd[j] += c[i] * xd[j];
2450 }
2451 }
2452 pthread_exit(NULL);
2453 }
2454
2455 /*
2456 * X[0] = c[0] * X[0] + sum{ c[i] * X[i] }, i = 1,...,nvec-1
2457 */
2458 if (my_data->Y1[0] == my_data->x1) {
2459 for (j=start; j<end; j++) {
2460 zd[j] *= c[0];
2461 }
2462 for (i=1; i<my_data->nvec; i++) {
2463 xd = NV_DATA_PT(my_data->Y1[i]);
2464 for (j=start; j<end; j++) {
2465 zd[j] += c[i] * xd[j];
2466 }
2467 }
2468 pthread_exit(NULL);
2469 }
2470
2471 /*
2472 * z = sum{ c[i] * X[i] }, i = 0,...,nvec-1
2473 */
2474 xd = NV_DATA_PT(my_data->Y1[0]);
2475 for (j=start; j<end; j++) {
2476 zd[j] = c[0] * xd[j];
2477 }
2478 for (i=1; i<my_data->nvec; i++) {
2479 xd = NV_DATA_PT(my_data->Y1[i]);
2480 for (j=start; j<end; j++) {
2481 zd[j] += c[i] * xd[j];
2482 }
2483 }
2484 pthread_exit(NULL);
2485 }
2486
2487
2488 /* -----------------------------------------------------------------------------
2489 * Compute multiple linear sums Z[i] = Y[i] + a*x
2490 */
2491
N_VScaleAddMulti_Pthreads(int nvec,realtype * a,N_Vector x,N_Vector * Y,N_Vector * Z)2492 int N_VScaleAddMulti_Pthreads(int nvec, realtype* a, N_Vector x, N_Vector* Y, N_Vector* Z)
2493 {
2494 sunindextype N;
2495 int i, nthreads;
2496 pthread_t *threads;
2497 Pthreads_Data *thread_data;
2498 pthread_attr_t attr;
2499
2500 /* invalid number of vectors */
2501 if (nvec < 1) return(-1);
2502
2503 /* should have called N_VLinearSum */
2504 if (nvec == 1) {
2505 N_VLinearSum_Pthreads(a[0], x, ONE, Y[0], Z[0]);
2506 return(0);
2507 }
2508
2509 /* get vector length and data array */
2510 N = NV_LENGTH_PT(x);
2511 nthreads = NV_NUM_THREADS_PT(x);
2512 threads = malloc(nthreads*sizeof(pthread_t));
2513 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
2514
2515 /* set thread attributes */
2516 pthread_attr_init(&attr);
2517 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
2518
2519 for (i=0; i<nthreads; i++) {
2520 /* initialize thread data */
2521 N_VInitThreadData(&thread_data[i]);
2522
2523 /* compute start and end loop index for thread */
2524 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
2525
2526 /* pack thread data */
2527 thread_data[i].nvec = nvec;
2528 thread_data[i].cvals = a;
2529 thread_data[i].x1 = x;
2530 thread_data[i].Y1 = Y;
2531 thread_data[i].Y2 = Z;
2532
2533 /* create threads and call pthread companion function */
2534 pthread_create(&threads[i], &attr, N_VScaleAddMulti_PT, (void *) &thread_data[i]);
2535 }
2536
2537 /* wait for all threads to finish */
2538 for (i=0; i<nthreads; i++) {
2539 pthread_join(threads[i], NULL);
2540 }
2541
2542 /* clean up and return */
2543 pthread_attr_destroy(&attr);
2544 free(threads);
2545 free(thread_data);
2546
2547 return(0);
2548 }
2549
2550
2551 /* -----------------------------------------------------------------------------
2552 * Pthread companion function to N_VScaleAddMulti
2553 */
2554
N_VScaleAddMulti_PT(void * thread_data)2555 static void *N_VScaleAddMulti_PT(void *thread_data)
2556 {
2557 Pthreads_Data *my_data;
2558 sunindextype j, start, end;
2559
2560 int i;
2561 realtype* a=NULL;
2562 realtype* xd=NULL;
2563 realtype* yd=NULL;
2564 realtype* zd=NULL;
2565
2566 /* extract thread data */
2567 my_data = (Pthreads_Data *) thread_data;
2568
2569 start = my_data->start;
2570 end = my_data->end;
2571
2572 a = my_data->cvals;
2573 xd = NV_DATA_PT(my_data->x1);
2574
2575 /*
2576 * Y[i][j] += a[i] * x[j]
2577 */
2578 if (my_data->Y1 == my_data->Y2) {
2579 for (i=0; i<my_data->nvec; i++) {
2580 yd = NV_DATA_PT(my_data->Y1[i]);
2581 for (j=start; j<end; j++) {
2582 yd[j] += a[i] * xd[j];
2583 }
2584 }
2585 pthread_exit(NULL);
2586 }
2587
2588 /*
2589 * Z[i][j] = Y[i][j] + a[i] * x[j]
2590 */
2591 for (i=0; i<my_data->nvec; i++) {
2592 yd = NV_DATA_PT(my_data->Y1[i]);
2593 zd = NV_DATA_PT(my_data->Y2[i]);
2594 for (j=start; j<end; j++) {
2595 zd[j] = a[i] * xd[j] + yd[j];
2596 }
2597 }
2598 pthread_exit(NULL);
2599 }
2600
2601
2602 /* -----------------------------------------------------------------------------
2603 * Compute the dot product of a vector with multiple vectors, a[i] = sum(x*Y[i])
2604 */
2605
N_VDotProdMulti_Pthreads(int nvec,N_Vector x,N_Vector * Y,realtype * dotprods)2606 int N_VDotProdMulti_Pthreads(int nvec, N_Vector x, N_Vector* Y, realtype* dotprods)
2607 {
2608 sunindextype N;
2609 int i, nthreads;
2610 pthread_t *threads;
2611 Pthreads_Data *thread_data;
2612 pthread_attr_t attr;
2613 pthread_mutex_t global_mutex;
2614
2615 /* invalid number of vectors */
2616 if (nvec < 1) return(-1);
2617
2618 /* should have called N_VDotProd */
2619 if (nvec == 1) {
2620 dotprods[0] = N_VDotProd_Pthreads(x, Y[0]);
2621 return(0);
2622 }
2623
2624 /* initialize output array */
2625 for (i=0; i<nvec; i++)
2626 dotprods[i] = ZERO;
2627
2628 /* allocate threads and thread data structs */
2629 N = NV_LENGTH_PT(x);
2630 nthreads = NV_NUM_THREADS_PT(x);
2631 threads = malloc(nthreads*sizeof(pthread_t));
2632 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
2633
2634 /* set thread attributes */
2635 pthread_attr_init(&attr);
2636 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
2637
2638 /* lock for reduction */
2639 pthread_mutex_init(&global_mutex, NULL);
2640
2641 for (i=0; i<nthreads; i++) {
2642 /* initialize thread data */
2643 N_VInitThreadData(&thread_data[i]);
2644
2645 /* compute start and end loop index for thread */
2646 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
2647
2648 /* pack thread data */
2649 thread_data[i].nvec = nvec;
2650 thread_data[i].x1 = x;
2651 thread_data[i].Y1 = Y;
2652 thread_data[i].cvals = dotprods;
2653
2654 thread_data[i].global_mutex = &global_mutex;
2655
2656 /* create threads and call pthread companion function */
2657 pthread_create(&threads[i], &attr, N_VDotProdMulti_PT, (void *) &thread_data[i]);
2658 }
2659
2660 /* wait for all threads to finish */
2661 for (i=0; i<nthreads; i++) {
2662 pthread_join(threads[i], NULL);
2663 }
2664
2665 /* clean up and return */
2666 pthread_attr_destroy(&attr);
2667 pthread_mutex_destroy(&global_mutex);
2668 free(threads);
2669 free(thread_data);
2670
2671 return(0);
2672 }
2673
2674
2675 /* -----------------------------------------------------------------------------
2676 * Pthread companion function to N_VDotProdMulti
2677 */
2678
N_VDotProdMulti_PT(void * thread_data)2679 static void *N_VDotProdMulti_PT(void *thread_data)
2680 {
2681 Pthreads_Data *my_data;
2682 sunindextype j, start, end;
2683 pthread_mutex_t *lock;
2684
2685 int i;
2686 realtype sum;
2687 realtype* dotprods=NULL;
2688 realtype* xd=NULL;
2689 realtype* yd=NULL;
2690
2691 /* extract thread data */
2692 my_data = (Pthreads_Data *) thread_data;
2693
2694 start = my_data->start;
2695 end = my_data->end;
2696 lock = my_data->global_mutex;
2697
2698 xd = NV_DATA_PT(my_data->x1);
2699 dotprods = my_data->cvals;
2700
2701 /* compute multiple dot products */
2702 for (i=0; i<my_data->nvec; i++) {
2703 yd = NV_DATA_PT(my_data->Y1[i]);
2704 sum = ZERO;
2705 for (j=start; j<end; j++) {
2706 sum += xd[j] * yd[j];
2707 }
2708 /* update global sum */
2709 pthread_mutex_lock(lock);
2710 dotprods[i] += sum;
2711 pthread_mutex_unlock(lock);
2712 }
2713
2714 /* exit */
2715 pthread_exit(NULL);
2716 }
2717
2718
2719 /*
2720 * -----------------------------------------------------------------------------
2721 * vector array operations
2722 * -----------------------------------------------------------------------------
2723 */
2724
2725
2726 /* -----------------------------------------------------------------------------
2727 * Compute multiple linear sums Z[i] = a*X[i] + b*Y[i]
2728 */
2729
N_VLinearSumVectorArray_Pthreads(int nvec,realtype a,N_Vector * X,realtype b,N_Vector * Y,N_Vector * Z)2730 int N_VLinearSumVectorArray_Pthreads(int nvec, realtype a, N_Vector* X,
2731 realtype b, N_Vector* Y, N_Vector* Z)
2732 {
2733 sunindextype N;
2734 int i, nthreads;
2735 pthread_t *threads;
2736 Pthreads_Data *thread_data;
2737 pthread_attr_t attr;
2738
2739 realtype c;
2740 N_Vector* V1;
2741 N_Vector* V2;
2742 booleantype test;
2743
2744 /* invalid number of vectors */
2745 if (nvec < 1) return(-1);
2746
2747 /* should have called N_VLinearSum */
2748 if (nvec == 1) {
2749 N_VLinearSum_Pthreads(a, X[0], b, Y[0], Z[0]);
2750 return(0);
2751 }
2752
2753 /* BLAS usage: axpy y <- ax+y */
2754 if ((b == ONE) && (Z == Y))
2755 return(VaxpyVectorArray_Pthreads(nvec, a, X, Y));
2756
2757 /* BLAS usage: axpy x <- by+x */
2758 if ((a == ONE) && (Z == X))
2759 return(VaxpyVectorArray_Pthreads(nvec, b, Y, X));
2760
2761 /* Case: a == b == 1.0 */
2762 if ((a == ONE) && (b == ONE))
2763 return(VSumVectorArray_Pthreads(nvec, X, Y, Z));
2764
2765 /* Cases: */
2766 /* (1) a == 1.0, b = -1.0, */
2767 /* (2) a == -1.0, b == 1.0 */
2768 if ((test = ((a == ONE) && (b == -ONE))) || ((a == -ONE) && (b == ONE))) {
2769 V1 = test ? Y : X;
2770 V2 = test ? X : Y;
2771 return(VDiffVectorArray_Pthreads(nvec, V2, V1, Z));
2772 }
2773
2774 /* Cases: */
2775 /* (1) a == 1.0, b == other or 0.0, */
2776 /* (2) a == other or 0.0, b == 1.0 */
2777 /* if a or b is 0.0, then user should have called N_VScale */
2778 if ((test = (a == ONE)) || (b == ONE)) {
2779 c = test ? b : a;
2780 V1 = test ? Y : X;
2781 V2 = test ? X : Y;
2782 return(VLin1VectorArray_Pthreads(nvec, c, V1, V2, Z));
2783 }
2784
2785 /* Cases: */
2786 /* (1) a == -1.0, b != 1.0, */
2787 /* (2) a != 1.0, b == -1.0 */
2788 if ((test = (a == -ONE)) || (b == -ONE)) {
2789 c = test ? b : a;
2790 V1 = test ? Y : X;
2791 V2 = test ? X : Y;
2792 return(VLin2VectorArray_Pthreads(nvec, c, V1, V2, Z));
2793 }
2794
2795 /* Case: a == b */
2796 /* catches case both a and b are 0.0 - user should have called N_VConst */
2797 if (a == b)
2798 return(VScaleSumVectorArray_Pthreads(nvec, a, X, Y, Z));
2799
2800 /* Case: a == -b */
2801 if (a == -b)
2802 return(VScaleDiffVectorArray_Pthreads(nvec, a, X, Y, Z));
2803
2804 /* Do all cases not handled above: */
2805 /* (1) a == other, b == 0.0 - user should have called N_VScale */
2806 /* (2) a == 0.0, b == other - user should have called N_VScale */
2807 /* (3) a,b == other, a !=b, a != -b */
2808
2809 /* get vector length and data array */
2810 N = NV_LENGTH_PT(Z[0]);
2811 nthreads = NV_NUM_THREADS_PT(Z[0]);
2812 threads = malloc(nthreads*sizeof(pthread_t));
2813 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
2814
2815 /* set thread attributes */
2816 pthread_attr_init(&attr);
2817 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
2818
2819 for (i=0; i<nthreads; i++) {
2820 /* initialize thread data */
2821 N_VInitThreadData(&thread_data[i]);
2822
2823 /* compute start and end loop index for thread */
2824 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
2825
2826 /* pack thread data */
2827 thread_data[i].nvec = nvec;
2828 thread_data[i].c1 = a;
2829 thread_data[i].c2 = b;
2830 thread_data[i].Y1 = X;
2831 thread_data[i].Y2 = Y;
2832 thread_data[i].Y3 = Z;
2833
2834 /* create threads and call pthread companion function */
2835 pthread_create(&threads[i], &attr, N_VLinearSumVectorArray_PT,
2836 (void *) &thread_data[i]);
2837 }
2838
2839 /* wait for all threads to finish */
2840 for (i=0; i<nthreads; i++) {
2841 pthread_join(threads[i], NULL);
2842 }
2843
2844 /* clean up and return */
2845 pthread_attr_destroy(&attr);
2846 free(threads);
2847 free(thread_data);
2848
2849 return(0);
2850 }
2851
2852
2853 /* -----------------------------------------------------------------------------
2854 * Pthread companion function to N_VLinearSumVectorArray
2855 */
2856
N_VLinearSumVectorArray_PT(void * thread_data)2857 static void *N_VLinearSumVectorArray_PT(void *thread_data)
2858 {
2859 Pthreads_Data *my_data;
2860 sunindextype j, start, end;
2861
2862 int i;
2863 realtype a, b;
2864 realtype* xd=NULL;
2865 realtype* yd=NULL;
2866 realtype* zd=NULL;
2867
2868 /* extract thread data */
2869 my_data = (Pthreads_Data *) thread_data;
2870
2871 start = my_data->start;
2872 end = my_data->end;
2873
2874 a = my_data->c1;
2875 b = my_data->c2;
2876
2877 /* compute linear sum for each vector pair in vector arrays */
2878 for (i=0; i<my_data->nvec; i++) {
2879 xd = NV_DATA_PT(my_data->Y1[i]);
2880 yd = NV_DATA_PT(my_data->Y2[i]);
2881 zd = NV_DATA_PT(my_data->Y3[i]);
2882 for (j=start; j<end; j++) {
2883 zd[j] = a * xd[j] + b * yd[j];
2884 }
2885 }
2886
2887 /* exit */
2888 pthread_exit(NULL);
2889 }
2890
2891
2892 /* -----------------------------------------------------------------------------
2893 * Scale multiple vectors Z[i] = c[i]*X[i]
2894 */
2895
N_VScaleVectorArray_Pthreads(int nvec,realtype * c,N_Vector * X,N_Vector * Z)2896 int N_VScaleVectorArray_Pthreads(int nvec, realtype* c, N_Vector* X, N_Vector* Z)
2897 {
2898 sunindextype N;
2899 int i, nthreads;
2900 pthread_t *threads;
2901 Pthreads_Data *thread_data;
2902 pthread_attr_t attr;
2903
2904 /* invalid number of vectors */
2905 if (nvec < 1) return(-1);
2906
2907 /* should have called N_VScale */
2908 if (nvec == 1) {
2909 N_VScale_Pthreads(c[0], X[0], Z[0]);
2910 return(0);
2911 }
2912
2913 /* get vector length and data array */
2914 N = NV_LENGTH_PT(Z[0]);
2915 nthreads = NV_NUM_THREADS_PT(Z[0]);
2916 threads = malloc(nthreads*sizeof(pthread_t));
2917 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
2918
2919 /* set thread attributes */
2920 pthread_attr_init(&attr);
2921 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
2922
2923 for (i=0; i<nthreads; i++) {
2924 /* initialize thread data */
2925 N_VInitThreadData(&thread_data[i]);
2926
2927 /* compute start and end loop index for thread */
2928 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
2929
2930 /* pack thread data */
2931 thread_data[i].nvec = nvec;
2932 thread_data[i].cvals = c;
2933 thread_data[i].Y1 = X;
2934 thread_data[i].Y2 = Z;
2935
2936 /* create threads and call pthread companion function */
2937 pthread_create(&threads[i], &attr, N_VScaleVectorArray_PT,
2938 (void *) &thread_data[i]);
2939 }
2940
2941 /* wait for all threads to finish */
2942 for (i=0; i<nthreads; i++) {
2943 pthread_join(threads[i], NULL);
2944 }
2945
2946 /* clean up and return */
2947 pthread_attr_destroy(&attr);
2948 free(threads);
2949 free(thread_data);
2950
2951 return(0);
2952 }
2953
2954
2955 /* -----------------------------------------------------------------------------
2956 * Pthread companion function to N_VScaleVectorArray
2957 */
2958
N_VScaleVectorArray_PT(void * thread_data)2959 static void *N_VScaleVectorArray_PT(void *thread_data)
2960 {
2961 Pthreads_Data *my_data;
2962 sunindextype j, start, end;
2963
2964 int i;
2965 realtype* c;
2966 realtype* xd=NULL;
2967 realtype* zd=NULL;
2968
2969 /* extract thread data */
2970 my_data = (Pthreads_Data *) thread_data;
2971
2972 start = my_data->start;
2973 end = my_data->end;
2974
2975 c = my_data->cvals;
2976
2977 /*
2978 * X[i] *= c[i]
2979 */
2980 if (my_data->Y1 == my_data->Y2) {
2981 for (i=0; i<my_data->nvec; i++) {
2982 xd = NV_DATA_PT(my_data->Y1[i]);
2983 for (j=start; j<end; j++) {
2984 xd[j] *= c[i];
2985 }
2986 }
2987 pthread_exit(NULL);
2988 }
2989
2990 /*
2991 * Z[i] = c[i] * X[i]
2992 */
2993 for (i=0; i<my_data->nvec; i++) {
2994 xd = NV_DATA_PT(my_data->Y1[i]);
2995 zd = NV_DATA_PT(my_data->Y2[i]);
2996 for (j=start; j<end; j++) {
2997 zd[j] = c[i] * xd[j];
2998 }
2999 }
3000 pthread_exit(NULL);
3001 }
3002
3003
3004 /* -----------------------------------------------------------------------------
3005 * Set multiple vectors to a constant value Z[i] = c
3006 */
3007
N_VConstVectorArray_Pthreads(int nvec,realtype c,N_Vector * Z)3008 int N_VConstVectorArray_Pthreads(int nvec, realtype c, N_Vector* Z)
3009 {
3010 sunindextype N;
3011 int i, nthreads;
3012 pthread_t *threads;
3013 Pthreads_Data *thread_data;
3014 pthread_attr_t attr;
3015
3016 /* invalid number of vectors */
3017 if (nvec < 1) return(-1);
3018
3019 /* should have called N_VConst */
3020 if (nvec == 1) {
3021 N_VConst_Pthreads(c, Z[0]);
3022 return(0);
3023 }
3024
3025 /* get vector length and data array */
3026 N = NV_LENGTH_PT(Z[0]);
3027 nthreads = NV_NUM_THREADS_PT(Z[0]);
3028 threads = malloc(nthreads*sizeof(pthread_t));
3029 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
3030
3031 /* set thread attributes */
3032 pthread_attr_init(&attr);
3033 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
3034
3035 for (i=0; i<nthreads; i++) {
3036 /* initialize thread data */
3037 N_VInitThreadData(&thread_data[i]);
3038
3039 /* compute start and end loop index for thread */
3040 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
3041
3042 /* pack thread data */
3043 thread_data[i].nvec = nvec;
3044 thread_data[i].c1 = c;
3045 thread_data[i].Y1 = Z;
3046
3047 /* create threads and call pthread companion function */
3048 pthread_create(&threads[i], &attr, N_VConstVectorArray_PT,
3049 (void *) &thread_data[i]);
3050 }
3051
3052 /* wait for all threads to finish */
3053 for (i=0; i<nthreads; i++) {
3054 pthread_join(threads[i], NULL);
3055 }
3056
3057 /* clean up and return */
3058 pthread_attr_destroy(&attr);
3059 free(threads);
3060 free(thread_data);
3061
3062 return(0);
3063 }
3064
3065
3066 /* -----------------------------------------------------------------------------
3067 * Pthread companion function to N_VConstVectorArray
3068 */
3069
N_VConstVectorArray_PT(void * thread_data)3070 static void *N_VConstVectorArray_PT(void *thread_data)
3071 {
3072 Pthreads_Data *my_data;
3073 sunindextype j, start, end;
3074
3075 int i;
3076 realtype* zd=NULL;
3077
3078 /* extract thread data */
3079 my_data = (Pthreads_Data *) thread_data;
3080
3081 start = my_data->start;
3082 end = my_data->end;
3083
3084 /* set each vector in the vector array to a constant */
3085 for (i=0; i<my_data->nvec; i++) {
3086 zd = NV_DATA_PT(my_data->Y1[i]);
3087 for (j=start; j<end; j++) {
3088 zd[j] = my_data->c1;
3089 }
3090 }
3091
3092 /* exit */
3093 pthread_exit(NULL);
3094 }
3095
3096
3097 /* ----------------------------------------------------------------------------
3098 * Compute the weighted root mean square norm of multiple vectors
3099 */
3100
N_VWrmsNormVectorArray_Pthreads(int nvec,N_Vector * X,N_Vector * W,realtype * nrm)3101 int N_VWrmsNormVectorArray_Pthreads(int nvec, N_Vector* X, N_Vector* W, realtype* nrm)
3102 {
3103 sunindextype N;
3104 int i, nthreads;
3105 pthread_t *threads;
3106 Pthreads_Data *thread_data;
3107 pthread_attr_t attr;
3108 pthread_mutex_t global_mutex;
3109
3110 /* invalid number of vectors */
3111 if (nvec < 1) return(-1);
3112
3113 /* should have called N_VWrmsNorm */
3114 if (nvec == 1) {
3115 nrm[0] = N_VWrmsNorm_Pthreads(X[0], W[0]);
3116 return(0);
3117 }
3118
3119 /* initialize output array */
3120 for (i=0; i<nvec; i++)
3121 nrm[i] = ZERO;
3122
3123 /* allocate threads and thread data structs */
3124 N = NV_LENGTH_PT(X[0]);
3125 nthreads = NV_NUM_THREADS_PT(X[0]);
3126 threads = malloc(nthreads*sizeof(pthread_t));
3127 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
3128
3129 /* set thread attributes */
3130 pthread_attr_init(&attr);
3131 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
3132
3133 /* lock for reduction */
3134 pthread_mutex_init(&global_mutex, NULL);
3135
3136 for (i=0; i<nthreads; i++) {
3137 /* initialize thread data */
3138 N_VInitThreadData(&thread_data[i]);
3139
3140 /* compute start and end loop index for thread */
3141 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
3142
3143 /* pack thread data */
3144 thread_data[i].nvec = nvec;
3145 thread_data[i].Y1 = X;
3146 thread_data[i].Y2 = W;
3147 thread_data[i].cvals = nrm;
3148
3149 thread_data[i].global_mutex = &global_mutex;
3150
3151 /* create threads and call pthread companion function */
3152 pthread_create(&threads[i], &attr, N_VWrmsNormVectorArray_PT,
3153 (void *) &thread_data[i]);
3154 }
3155
3156 /* wait for all threads to finish */
3157 for (i=0; i<nthreads; i++) {
3158 pthread_join(threads[i], NULL);
3159 }
3160
3161 /* finalize wrms calculation */
3162 for (i=0; i<nvec; i++)
3163 nrm[i] = SUNRsqrt(nrm[i]/N);
3164
3165 /* clean up and return */
3166 pthread_attr_destroy(&attr);
3167 pthread_mutex_destroy(&global_mutex);
3168 free(threads);
3169 free(thread_data);
3170
3171 return(0);
3172 }
3173
3174
3175 /* ----------------------------------------------------------------------------
3176 * Pthread companion function to N_VWrmsNormVectorArray
3177 */
3178
N_VWrmsNormVectorArray_PT(void * thread_data)3179 static void *N_VWrmsNormVectorArray_PT(void *thread_data)
3180 {
3181 Pthreads_Data *my_data;
3182 sunindextype j, start, end;
3183 pthread_mutex_t *lock;
3184
3185 int i;
3186 realtype sum;
3187 realtype* nrm=NULL;
3188 realtype* xd=NULL;
3189 realtype* wd=NULL;
3190
3191 /* extract thread data */
3192 my_data = (Pthreads_Data *) thread_data;
3193
3194 start = my_data->start;
3195 end = my_data->end;
3196 lock = my_data->global_mutex;
3197
3198 nrm = my_data->cvals;
3199
3200 /* compute the WRMS norm for each vector in the vector array */
3201 for (i=0; i<my_data->nvec; i++) {
3202 xd = NV_DATA_PT(my_data->Y1[i]);
3203 wd = NV_DATA_PT(my_data->Y2[i]);
3204 sum = ZERO;
3205 for (j=start; j<end; j++) {
3206 sum += SUNSQR(xd[j] * wd[j]);
3207 }
3208 /* update global sum */
3209 pthread_mutex_lock(lock);
3210 nrm[i] += sum;
3211 pthread_mutex_unlock(lock);
3212 }
3213
3214 /* exit */
3215 pthread_exit(NULL);
3216 }
3217
3218
3219 /* ----------------------------------------------------------------------------
3220 * Compute the weighted root mean square norm of multiple vectors
3221 */
3222
N_VWrmsNormMaskVectorArray_Pthreads(int nvec,N_Vector * X,N_Vector * W,N_Vector id,realtype * nrm)3223 int N_VWrmsNormMaskVectorArray_Pthreads(int nvec, N_Vector* X, N_Vector* W,
3224 N_Vector id, realtype* nrm)
3225 {
3226 sunindextype N;
3227 int i, nthreads;
3228 pthread_t *threads;
3229 Pthreads_Data *thread_data;
3230 pthread_attr_t attr;
3231 pthread_mutex_t global_mutex;
3232
3233 /* invalid number of vectors */
3234 if (nvec < 1) return(-1);
3235
3236 /* should have called N_VWrmsNorm */
3237 if (nvec == 1) {
3238 nrm[0] = N_VWrmsNormMask_Pthreads(X[0], W[0], id);
3239 return(0);
3240 }
3241
3242 /* initialize output array */
3243 for (i=0; i<nvec; i++)
3244 nrm[i] = ZERO;
3245
3246 /* allocate threads and thread data structs */
3247 N = NV_LENGTH_PT(X[0]);
3248 nthreads = NV_NUM_THREADS_PT(X[0]);
3249 threads = malloc(nthreads*sizeof(pthread_t));
3250 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
3251
3252 /* set thread attributes */
3253 pthread_attr_init(&attr);
3254 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
3255
3256 /* lock for reduction */
3257 pthread_mutex_init(&global_mutex, NULL);
3258
3259 for (i=0; i<nthreads; i++) {
3260 /* initialize thread data */
3261 N_VInitThreadData(&thread_data[i]);
3262
3263 /* compute start and end loop index for thread */
3264 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
3265
3266 /* pack thread data */
3267 thread_data[i].nvec = nvec;
3268 thread_data[i].Y1 = X;
3269 thread_data[i].Y2 = W;
3270 thread_data[i].x1 = id;
3271 thread_data[i].cvals = nrm;
3272
3273 thread_data[i].global_mutex = &global_mutex;
3274
3275 /* create threads and call pthread companion function */
3276 pthread_create(&threads[i], &attr, N_VWrmsNormMaskVectorArray_PT,
3277 (void *) &thread_data[i]);
3278 }
3279
3280 /* wait for all threads to finish */
3281 for (i=0; i<nthreads; i++) {
3282 pthread_join(threads[i], NULL);
3283 }
3284
3285 /* finalize wrms calculation */
3286 for (i=0; i<nvec; i++)
3287 nrm[i] = SUNRsqrt(nrm[i]/N);
3288
3289 /* clean up and return */
3290 pthread_attr_destroy(&attr);
3291 pthread_mutex_destroy(&global_mutex);
3292 free(threads);
3293 free(thread_data);
3294
3295 return(0);
3296 }
3297
3298
3299 /* ----------------------------------------------------------------------------
3300 * Pthread companion function to N_VWrmsNormVectorArray
3301 */
3302
N_VWrmsNormMaskVectorArray_PT(void * thread_data)3303 static void *N_VWrmsNormMaskVectorArray_PT(void *thread_data)
3304 {
3305 Pthreads_Data *my_data;
3306 sunindextype j, start, end;
3307 pthread_mutex_t *lock;
3308
3309 int i;
3310 realtype sum;
3311 realtype* nrm=NULL;
3312 realtype* xd=NULL;
3313 realtype* wd=NULL;
3314 realtype* idd=NULL;
3315
3316 /* extract thread data */
3317 my_data = (Pthreads_Data *) thread_data;
3318
3319 start = my_data->start;
3320 end = my_data->end;
3321 lock = my_data->global_mutex;
3322
3323 nrm = my_data->cvals;
3324 idd = NV_DATA_PT(my_data->x1);
3325
3326 /* compute the WRMS norm for each vector in the vector array */
3327 for (i=0; i<my_data->nvec; i++) {
3328 xd = NV_DATA_PT(my_data->Y1[i]);
3329 wd = NV_DATA_PT(my_data->Y2[i]);
3330 sum = ZERO;
3331 for (j=start; j<end; j++) {
3332 if (idd[j] > ZERO)
3333 sum += SUNSQR(xd[j] * wd[j]);
3334 }
3335 /* update global sum */
3336 pthread_mutex_lock(lock);
3337 nrm[i] += sum;
3338 pthread_mutex_unlock(lock);
3339 }
3340
3341 /* exit */
3342 pthread_exit(NULL);
3343 }
3344
3345
3346 /* -----------------------------------------------------------------------------
3347 * Scale and add a vector to multiple vectors Z = Y + a*X
3348 */
3349
N_VScaleAddMultiVectorArray_Pthreads(int nvec,int nsum,realtype * a,N_Vector * X,N_Vector ** Y,N_Vector ** Z)3350 int N_VScaleAddMultiVectorArray_Pthreads(int nvec, int nsum, realtype* a,
3351 N_Vector* X, N_Vector** Y, N_Vector** Z)
3352 {
3353 sunindextype N;
3354 int i, j, nthreads;
3355 pthread_t *threads;
3356 Pthreads_Data *thread_data;
3357 pthread_attr_t attr;
3358
3359 int retval;
3360 N_Vector* YY;
3361 N_Vector* ZZ;
3362
3363 /* invalid number of vectors */
3364 if (nvec < 1) return(-1);
3365 if (nsum < 1) return(-1);
3366
3367 /* ---------------------------
3368 * Special cases for nvec == 1
3369 * --------------------------- */
3370
3371 if (nvec == 1) {
3372
3373 /* should have called N_VLinearSum */
3374 if (nsum == 1) {
3375 N_VLinearSum_Pthreads(a[0], X[0], ONE, Y[0][0], Z[0][0]);
3376 return(0);
3377 }
3378
3379 /* should have called N_VScaleAddMulti */
3380 YY = (N_Vector*) malloc(nsum * sizeof(N_Vector));
3381 ZZ = (N_Vector*) malloc(nsum * sizeof(N_Vector));
3382
3383 for (j=0; j<nsum; j++) {
3384 YY[j] = Y[j][0];
3385 ZZ[j] = Z[j][0];
3386 }
3387
3388 retval = N_VScaleAddMulti_Pthreads(nsum, a, X[0], YY, ZZ);
3389
3390 free(YY);
3391 free(ZZ);
3392 return(retval);
3393 }
3394
3395 /* --------------------------
3396 * Special cases for nvec > 1
3397 * -------------------------- */
3398
3399 /* should have called N_VLinearSumVectorArray */
3400 if (nsum == 1) {
3401 retval = N_VLinearSumVectorArray_Pthreads(nvec, a[0], X, ONE, Y[0], Z[0]);
3402 return(retval);
3403 }
3404
3405 /* ----------------------------
3406 * Compute multiple linear sums
3407 * ---------------------------- */
3408
3409 /* get vector length and data array */
3410 N = NV_LENGTH_PT(X[0]);
3411 nthreads = NV_NUM_THREADS_PT(X[0]);
3412 threads = malloc(nthreads*sizeof(pthread_t));
3413 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
3414
3415 /* set thread attributes */
3416 pthread_attr_init(&attr);
3417 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
3418
3419 for (i=0; i<nthreads; i++) {
3420 /* initialize thread data */
3421 N_VInitThreadData(&thread_data[i]);
3422
3423 /* compute start and end loop index for thread */
3424 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
3425
3426 /* pack thread data */
3427 thread_data[i].nvec = nvec;
3428 thread_data[i].nsum = nsum;
3429 thread_data[i].cvals = a;
3430 thread_data[i].Y1 = X;
3431 thread_data[i].ZZ1 = Y;
3432 thread_data[i].ZZ2 = Z;
3433
3434 /* create threads and call pthread companion function */
3435 pthread_create(&threads[i], &attr, N_VScaleAddMultiVectorArray_PT,
3436 (void *) &thread_data[i]);
3437 }
3438
3439 /* wait for all threads to finish */
3440 for (i=0; i<nthreads; i++) {
3441 pthread_join(threads[i], NULL);
3442 }
3443
3444 /* clean up and return */
3445 pthread_attr_destroy(&attr);
3446 free(threads);
3447 free(thread_data);
3448
3449 return(0);
3450 }
3451
3452
3453 /* -----------------------------------------------------------------------------
3454 * Pthread companion function to N_VScaleAddMultiVectorArray
3455 */
3456
N_VScaleAddMultiVectorArray_PT(void * thread_data)3457 static void *N_VScaleAddMultiVectorArray_PT(void *thread_data)
3458 {
3459 Pthreads_Data *my_data;
3460 sunindextype k, start, end;
3461
3462 int i, j;
3463 realtype* a=NULL;
3464 realtype* xd=NULL;
3465 realtype* yd=NULL;
3466 realtype* zd=NULL;
3467
3468 /* extract thread data */
3469 my_data = (Pthreads_Data *) thread_data;
3470
3471 start = my_data->start;
3472 end = my_data->end;
3473
3474 a = my_data->cvals;
3475
3476 /*
3477 * Y[i][j] += a[i] * x[j]
3478 */
3479 if (my_data->ZZ1 == my_data->ZZ2) {
3480 for (i=0; i<my_data->nvec; i++) {
3481 xd = NV_DATA_PT(my_data->Y1[i]);
3482 for (j=0; j<my_data->nsum; j++){
3483 yd = NV_DATA_PT(my_data->ZZ1[j][i]);
3484 for (k=start; k<end; k++) {
3485 yd[k] += a[j] * xd[k];
3486 }
3487 }
3488 }
3489 pthread_exit(NULL);
3490 }
3491
3492 /*
3493 * Z[i][j] = Y[i][j] + a[i] * x[j]
3494 */
3495 for (i=0; i<my_data->nvec; i++) {
3496 xd = NV_DATA_PT(my_data->Y1[i]);
3497 for (j=0; j<my_data->nsum; j++){
3498 yd = NV_DATA_PT(my_data->ZZ1[j][i]);
3499 zd = NV_DATA_PT(my_data->ZZ2[j][i]);
3500 for (k=start; k<end; k++) {
3501 zd[k] = a[j] * xd[k] + yd[k];
3502 }
3503 }
3504 }
3505 pthread_exit(NULL);
3506 }
3507
3508
3509 /* -----------------------------------------------------------------------------
3510 * Compute a linear combination for multiple vectors
3511 */
3512
N_VLinearCombinationVectorArray_Pthreads(int nvec,int nsum,realtype * c,N_Vector ** X,N_Vector * Z)3513 int N_VLinearCombinationVectorArray_Pthreads(int nvec, int nsum, realtype* c,
3514 N_Vector** X, N_Vector* Z)
3515 {
3516 sunindextype N;
3517 int i, j, nthreads;
3518 pthread_t *threads;
3519 Pthreads_Data *thread_data;
3520 pthread_attr_t attr;
3521
3522 int retval;
3523 realtype* ctmp;
3524 N_Vector* Y;
3525
3526 /* invalid number of vectors */
3527 if (nvec < 1) return(-1);
3528 if (nsum < 1) return(-1);
3529
3530 /* ---------------------------
3531 * Special cases for nvec == 1
3532 * --------------------------- */
3533
3534 if (nvec == 1) {
3535
3536 /* should have called N_VScale */
3537 if (nsum == 1) {
3538 N_VScale_Pthreads(c[0], X[0][0], Z[0]);
3539 return(0);
3540 }
3541
3542 /* should have called N_VLinearSum */
3543 if (nsum == 2) {
3544 N_VLinearSum_Pthreads(c[0], X[0][0], c[1], X[1][0], Z[0]);
3545 return(0);
3546 }
3547
3548 /* should have called N_VLinearCombination */
3549 Y = (N_Vector*) malloc(nsum * sizeof(N_Vector));
3550
3551 for (i=0; i<nsum; i++) {
3552 Y[i] = X[i][0];
3553 }
3554
3555 retval = N_VLinearCombination_Pthreads(nsum, c, Y, Z[0]);
3556
3557 free(Y);
3558 return(retval);
3559 }
3560
3561 /* --------------------------
3562 * Special cases for nvec > 1
3563 * -------------------------- */
3564
3565 /* should have called N_VScaleVectorArray */
3566 if (nsum == 1) {
3567
3568 ctmp = (realtype*) malloc(nvec * sizeof(realtype));
3569
3570 for (j=0; j<nvec; j++) {
3571 ctmp[j] = c[0];
3572 }
3573
3574 retval = N_VScaleVectorArray_Pthreads(nvec, ctmp, X[0], Z);
3575
3576 free(ctmp);
3577 return(retval);
3578 }
3579
3580 /* should have called N_VLinearSumVectorArray */
3581 if (nsum == 2) {
3582 retval = N_VLinearSumVectorArray_Pthreads(nvec, c[0], X[0], c[1], X[1], Z);
3583 return(retval);
3584 }
3585
3586 /* --------------------------
3587 * Compute linear combination
3588 * -------------------------- */
3589
3590 /* get vector length and data array */
3591 N = NV_LENGTH_PT(Z[0]);
3592 nthreads = NV_NUM_THREADS_PT(Z[0]);
3593 threads = malloc(nthreads*sizeof(pthread_t));
3594 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
3595
3596 /* set thread attributes */
3597 pthread_attr_init(&attr);
3598 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
3599
3600 for (i=0; i<nthreads; i++) {
3601 /* initialize thread data */
3602 N_VInitThreadData(&thread_data[i]);
3603
3604 /* compute start and end loop index for thread */
3605 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
3606
3607 /* pack thread data */
3608 thread_data[i].nvec = nvec;
3609 thread_data[i].nsum = nsum;
3610 thread_data[i].cvals = c;
3611 thread_data[i].ZZ1 = X;
3612 thread_data[i].Y1 = Z;
3613
3614 /* create threads and call pthread companion function */
3615 pthread_create(&threads[i], &attr, N_VLinearCombinationVectorArray_PT,
3616 (void *) &thread_data[i]);
3617 }
3618
3619 /* wait for all threads to finish */
3620 for (i=0; i<nthreads; i++) {
3621 pthread_join(threads[i], NULL);
3622 }
3623
3624 /* clean up and return */
3625 pthread_attr_destroy(&attr);
3626 free(threads);
3627 free(thread_data);
3628
3629 return(0);
3630 }
3631
3632
3633 /* -----------------------------------------------------------------------------
3634 * Pthread companion function to N_VLinearCombinationVectorArray
3635 */
3636
N_VLinearCombinationVectorArray_PT(void * thread_data)3637 static void *N_VLinearCombinationVectorArray_PT(void *thread_data)
3638 {
3639 Pthreads_Data *my_data;
3640 sunindextype k, start, end;
3641
3642 int i; /* vector arrays index in summation [0,nsum) */
3643 int j; /* vector index in vector array [0,nvec) */
3644 realtype* c=NULL;
3645 realtype* zd=NULL;
3646 realtype* xd=NULL;
3647
3648 /* extract thread data */
3649 my_data = (Pthreads_Data *) thread_data;
3650
3651 start = my_data->start;
3652 end = my_data->end;
3653
3654 c = my_data->cvals;
3655
3656 /*
3657 * X[0][j] += c[i]*X[i][j], i = 1,...,nvec-1
3658 */
3659 if ((my_data->ZZ1[0] == my_data->Y1) && (c[0] == ONE)) {
3660 for (j=0; j<my_data->nvec; j++) {
3661 zd = NV_DATA_PT(my_data->Y1[j]);
3662 for (i=1; i<my_data->nsum; i++) {
3663 xd = NV_DATA_PT(my_data->ZZ1[i][j]);
3664 for (k=start; k<end; k++) {
3665 zd[k] += c[i] * xd[k];
3666 }
3667 }
3668 }
3669 pthread_exit(NULL);
3670 }
3671
3672 /*
3673 * X[0][j] = c[0] * X[0][j] + sum{ c[i] * X[i][j] }, i = 1,...,nvec-1
3674 */
3675 if (my_data->ZZ1[0] == my_data->Y1) {
3676 for (j=0; j<my_data->nvec; j++) {
3677 zd = NV_DATA_PT(my_data->Y1[j]);
3678 for (k=start; k<end; k++) {
3679 zd[k] *= c[0];
3680 }
3681 for (i=1; i<my_data->nsum; i++) {
3682 xd = NV_DATA_PT(my_data->ZZ1[i][j]);
3683 for (k=start; k<end; k++) {
3684 zd[k] += c[i] * xd[k];
3685 }
3686 }
3687 }
3688 pthread_exit(NULL);
3689 }
3690
3691 /*
3692 * Z[j] = sum{ c[i] * X[i][j] }, i = 0,...,nvec-1
3693 */
3694 for (j=0; j<my_data->nvec; j++) {
3695 xd = NV_DATA_PT(my_data->ZZ1[0][j]);
3696 zd = NV_DATA_PT(my_data->Y1[j]);
3697 for (k=start; k<end; k++) {
3698 zd[k] = c[0] * xd[k];
3699 }
3700 for (i=1; i<my_data->nsum; i++) {
3701 xd = NV_DATA_PT(my_data->ZZ1[i][j]);
3702 for (k=start; k<end; k++) {
3703 zd[k] += c[i] * xd[k];
3704 }
3705 }
3706 }
3707 pthread_exit(NULL);
3708 }
3709
3710
3711 /*
3712 * -----------------------------------------------------------------
3713 * private functions for special cases of vector operations
3714 * -----------------------------------------------------------------
3715 */
3716
3717
3718 /* ----------------------------------------------------------------------------
3719 * Copy vector components into second vector
3720 */
3721
VCopy_Pthreads(N_Vector x,N_Vector z)3722 static void VCopy_Pthreads(N_Vector x, N_Vector z)
3723 {
3724 sunindextype N;
3725 int i, nthreads;
3726 pthread_t *threads;
3727 Pthreads_Data *thread_data;
3728 pthread_attr_t attr;
3729
3730 /* allocate threads and thread data structs */
3731 N = NV_LENGTH_PT(x);
3732 nthreads = NV_NUM_THREADS_PT(x);
3733 threads = malloc(nthreads*sizeof(pthread_t));
3734 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
3735
3736 /* set thread attributes */
3737 pthread_attr_init(&attr);
3738 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
3739
3740 for (i=0; i<nthreads; i++) {
3741 /* initialize thread data */
3742 N_VInitThreadData(&thread_data[i]);
3743
3744 /* compute start and end loop index for thread */
3745 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
3746
3747 /* pack thread data */
3748 thread_data[i].v1 = NV_DATA_PT(x);
3749 thread_data[i].v2 = NV_DATA_PT(z);
3750
3751 /* create threads and call pthread companion function */
3752 pthread_create(&threads[i], &attr, VCopy_PT, (void *) &thread_data[i]);
3753 }
3754
3755 /* wait for all threads to finish */
3756 for (i=0; i<nthreads; i++) {
3757 pthread_join(threads[i], NULL);
3758 }
3759
3760 /* clean up and return */
3761 pthread_attr_destroy(&attr);
3762 free(threads);
3763 free(thread_data);
3764
3765 return;
3766 }
3767
3768
3769 /* ----------------------------------------------------------------------------
3770 * Pthread companion function to VCopy
3771 */
3772
VCopy_PT(void * thread_data)3773 static void *VCopy_PT(void *thread_data)
3774 {
3775 sunindextype i, start, end;
3776 realtype *xd, *zd;
3777 Pthreads_Data *my_data;
3778
3779 /* extract thread data */
3780 my_data = (Pthreads_Data *) thread_data;
3781
3782 xd = my_data->v1;
3783 zd = my_data->v2;
3784
3785 start = my_data->start;
3786 end = my_data->end;
3787
3788 /* copy vector components */
3789 for (i = start; i < end; i++)
3790 zd[i] = xd[i];
3791
3792 /* exit */
3793 pthread_exit(NULL);
3794 }
3795
3796
3797 /* ----------------------------------------------------------------------------
3798 * Compute vector sum
3799 */
3800
VSum_Pthreads(N_Vector x,N_Vector y,N_Vector z)3801 static void VSum_Pthreads(N_Vector x, N_Vector y, N_Vector z)
3802 {
3803 sunindextype N;
3804 int i, nthreads;
3805 pthread_t *threads;
3806 Pthreads_Data *thread_data;
3807 pthread_attr_t attr;
3808
3809 /* allocate threads and thread data structs */
3810 N = NV_LENGTH_PT(x);
3811 nthreads = NV_NUM_THREADS_PT(x);
3812 threads = malloc(nthreads*sizeof(pthread_t));
3813 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
3814
3815 /* set thread attributes */
3816 pthread_attr_init(&attr);
3817 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
3818
3819 for (i=0; i<nthreads; i++) {
3820 /* initialize thread data */
3821 N_VInitThreadData(&thread_data[i]);
3822
3823 /* compute start and end loop index for thread */
3824 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
3825
3826 /* pack thread data */
3827 thread_data[i].v1 = NV_DATA_PT(x);
3828 thread_data[i].v2 = NV_DATA_PT(y);
3829 thread_data[i].v3 = NV_DATA_PT(z);
3830
3831 /* create threads and call pthread companion function */
3832 pthread_create(&threads[i], &attr, VSum_PT, (void *) &thread_data[i]);
3833 }
3834
3835 /* wait for all threads to finish */
3836 for (i=0; i<nthreads; i++) {
3837 pthread_join(threads[i], NULL);
3838 }
3839
3840 /* clean up and return */
3841 pthread_attr_destroy(&attr);
3842 free(threads);
3843 free(thread_data);
3844
3845 return;
3846 }
3847
3848
3849 /* ----------------------------------------------------------------------------
3850 * Pthread companion function to VSum
3851 */
3852
VSum_PT(void * thread_data)3853 static void *VSum_PT(void *thread_data)
3854 {
3855 sunindextype i, start, end;
3856 realtype *xd, *yd, *zd;
3857 Pthreads_Data *my_data;
3858
3859 /* extract thread data */
3860 my_data = (Pthreads_Data *) thread_data;
3861
3862 xd = my_data->v1;
3863 yd = my_data->v2;
3864 zd = my_data->v3;
3865
3866 start = my_data->start;
3867 end = my_data->end;
3868
3869 /* compute vector sum */
3870 for (i = start; i < end; i++)
3871 zd[i] = xd[i] + yd[i];
3872
3873 /* exit */
3874 pthread_exit(NULL);
3875 }
3876
3877
3878 /* ----------------------------------------------------------------------------
3879 * Compute vector difference
3880 */
3881
VDiff_Pthreads(N_Vector x,N_Vector y,N_Vector z)3882 static void VDiff_Pthreads(N_Vector x, N_Vector y, N_Vector z)
3883 {
3884 sunindextype N;
3885 int i, nthreads;
3886 pthread_t *threads;
3887 Pthreads_Data *thread_data;
3888 pthread_attr_t attr;
3889
3890 /* allocate threads and thread data structs */
3891 N = NV_LENGTH_PT(x);
3892 nthreads = NV_NUM_THREADS_PT(x);
3893 threads = malloc(nthreads*sizeof(pthread_t));
3894 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
3895
3896 /* set thread attributes */
3897 pthread_attr_init(&attr);
3898 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
3899
3900 for (i=0; i<nthreads; i++) {
3901 /* initialize thread data */
3902 N_VInitThreadData(&thread_data[i]);
3903
3904 /* compute start and end loop index for thread */
3905 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
3906
3907 /* pack thread data */
3908 thread_data[i].v1 = NV_DATA_PT(x);
3909 thread_data[i].v2 = NV_DATA_PT(y);
3910 thread_data[i].v3 = NV_DATA_PT(z);
3911
3912 /* create threads and call pthread companion function */
3913 pthread_create(&threads[i], &attr, VDiff_PT, (void *) &thread_data[i]);
3914 }
3915
3916 /* wait for all threads to finish */
3917 for (i=0; i<nthreads; i++) {
3918 pthread_join(threads[i], NULL);
3919 }
3920
3921 /* clean up and return */
3922 pthread_attr_destroy(&attr);
3923 free(threads);
3924 free(thread_data);
3925
3926 return;
3927 }
3928
3929
3930 /* ----------------------------------------------------------------------------
3931 * Pthread companion function to VDiff
3932 */
3933
VDiff_PT(void * thread_data)3934 static void *VDiff_PT(void *thread_data)
3935 {
3936 sunindextype i, start, end;
3937 realtype *xd, *yd, *zd;
3938 Pthreads_Data *my_data;
3939
3940 /* extract thread data */
3941 my_data = (Pthreads_Data *) thread_data;
3942
3943 xd = my_data->v1;
3944 yd = my_data->v2;
3945 zd = my_data->v3;
3946
3947 start = my_data->start;
3948 end = my_data->end;
3949
3950 /* compute vector difference */
3951 for (i = start; i < end; i++)
3952 zd[i] = xd[i] - yd[i];
3953
3954 /* exit */
3955 pthread_exit(NULL);
3956 }
3957
3958
3959 /* ----------------------------------------------------------------------------
3960 * Compute the negative of a vector
3961 */
3962
VNeg_Pthreads(N_Vector x,N_Vector z)3963 static void VNeg_Pthreads(N_Vector x, N_Vector z)
3964 {
3965 sunindextype N;
3966 int i, nthreads;
3967 pthread_t *threads;
3968 Pthreads_Data *thread_data;
3969 pthread_attr_t attr;
3970
3971 /* allocate threads and thread data structs */
3972 N = NV_LENGTH_PT(x);
3973 nthreads = NV_NUM_THREADS_PT(x);
3974 threads = malloc(nthreads*sizeof(pthread_t));
3975 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
3976
3977 /* set thread attributes */
3978 pthread_attr_init(&attr);
3979 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
3980
3981 for (i=0; i<nthreads; i++) {
3982 /* initialize thread data */
3983 N_VInitThreadData(&thread_data[i]);
3984
3985 /* compute start and end loop index for thread */
3986 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
3987
3988 /* pack thread data */
3989 thread_data[i].v1 = NV_DATA_PT(x);
3990 thread_data[i].v2 = NV_DATA_PT(z);
3991
3992 /* create threads and call pthread companion function */
3993 pthread_create(&threads[i], &attr, VNeg_PT, (void *) &thread_data[i]);
3994 }
3995
3996 /* wait for all threads to finish */
3997 for (i=0; i<nthreads; i++) {
3998 pthread_join(threads[i], NULL);
3999 }
4000
4001 /* clean up and return */
4002 pthread_attr_destroy(&attr);
4003 free(threads);
4004 free(thread_data);
4005
4006 return;
4007 }
4008
4009
4010 /* ----------------------------------------------------------------------------
4011 * Pthread companion function to VNeg
4012 */
4013
VNeg_PT(void * thread_data)4014 static void *VNeg_PT(void *thread_data)
4015 {
4016 sunindextype i, start, end;
4017 realtype *xd, *zd;
4018 Pthreads_Data *my_data;
4019
4020 /* extract thread data */
4021 my_data = (Pthreads_Data *) thread_data;
4022
4023 xd = my_data->v1;
4024 zd = my_data->v2;
4025
4026 start = my_data->start;
4027 end = my_data->end;
4028
4029 /* compute negative of vector */
4030 for (i = start; i < end; i++)
4031 zd[i] = -xd[i];
4032
4033 /* exit */
4034 pthread_exit(NULL);
4035 }
4036
4037
4038 /* ----------------------------------------------------------------------------
4039 * Compute scaled vector sum
4040 */
4041
VScaleSum_Pthreads(realtype c,N_Vector x,N_Vector y,N_Vector z)4042 static void VScaleSum_Pthreads(realtype c, N_Vector x, N_Vector y, N_Vector z)
4043 {
4044 sunindextype N;
4045 int i, nthreads;
4046 pthread_t *threads;
4047 Pthreads_Data *thread_data;
4048 pthread_attr_t attr;
4049
4050 /* allocate threads and thread data structs */
4051 N = NV_LENGTH_PT(x);
4052 nthreads = NV_NUM_THREADS_PT(x);
4053 threads = malloc(nthreads*sizeof(pthread_t));
4054 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
4055
4056 /* set thread attributes */
4057 pthread_attr_init(&attr);
4058 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
4059
4060 for (i=0; i<nthreads; i++) {
4061 /* initialize thread data */
4062 N_VInitThreadData(&thread_data[i]);
4063
4064 /* compute start and end loop index for thread */
4065 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
4066
4067 /* pack thread data */
4068 thread_data[i].c1 = c;
4069 thread_data[i].v1 = NV_DATA_PT(x);
4070 thread_data[i].v2 = NV_DATA_PT(y);
4071 thread_data[i].v3 = NV_DATA_PT(z);
4072
4073 /* create threads and call pthread companion function */
4074 pthread_create(&threads[i], &attr, VScaleSum_PT, (void *) &thread_data[i]);
4075 }
4076
4077 /* wait for all threads to finish */
4078 for (i=0; i<nthreads; i++) {
4079 pthread_join(threads[i], NULL);
4080 }
4081
4082 /* clean up and return */
4083 pthread_attr_destroy(&attr);
4084 free(threads);
4085 free(thread_data);
4086
4087 return;
4088 }
4089
4090
4091 /* ----------------------------------------------------------------------------
4092 * Pthread companion function to VScaleSum
4093 */
4094
VScaleSum_PT(void * thread_data)4095 static void *VScaleSum_PT(void *thread_data)
4096 {
4097 sunindextype i, start, end;
4098 realtype c;
4099 realtype *xd, *yd, *zd;
4100 Pthreads_Data *my_data;
4101
4102 /* extract thread data */
4103 my_data = (Pthreads_Data *) thread_data;
4104
4105 c = my_data->c1;
4106 xd = my_data->v1;
4107 yd = my_data->v2;
4108 zd = my_data->v3;
4109
4110 start = my_data->start;
4111 end = my_data->end;
4112
4113 /* compute scaled vector sum */
4114 for (i = start; i < end; i++)
4115 zd[i] = c*(xd[i] + yd[i]);
4116
4117 /* exit */
4118 pthread_exit(NULL);
4119 }
4120
4121
4122 /* ----------------------------------------------------------------------------
4123 * Compute scaled vector difference
4124 */
4125
VScaleDiff_Pthreads(realtype c,N_Vector x,N_Vector y,N_Vector z)4126 static void VScaleDiff_Pthreads(realtype c, N_Vector x, N_Vector y, N_Vector z)
4127 {
4128 sunindextype N;
4129 int i, nthreads;
4130 pthread_t *threads;
4131 Pthreads_Data *thread_data;
4132 pthread_attr_t attr;
4133
4134 /* allocate threads and thread data structs */
4135 N = NV_LENGTH_PT(x);
4136 nthreads = NV_NUM_THREADS_PT(x);
4137 threads = malloc(nthreads*sizeof(pthread_t));
4138 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
4139
4140 /* set thread attributes */
4141 pthread_attr_init(&attr);
4142 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
4143
4144 for (i=0; i<nthreads; i++) {
4145 /* initialize thread data */
4146 N_VInitThreadData(&thread_data[i]);
4147
4148 /* compute start and end loop index for thread */
4149 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
4150
4151 /* pack thread data */
4152 thread_data[i].c1 = c;
4153 thread_data[i].v1 = NV_DATA_PT(x);
4154 thread_data[i].v2 = NV_DATA_PT(y);
4155 thread_data[i].v3 = NV_DATA_PT(z);
4156
4157 /* create threads and call pthread companion function */
4158 pthread_create(&threads[i], &attr, VScaleDiff_PT, (void *) &thread_data[i]);
4159 }
4160
4161 /* wait for all threads to finish */
4162 for (i=0; i<nthreads; i++) {
4163 pthread_join(threads[i], NULL);
4164 }
4165
4166 /* clean up and return */
4167 pthread_attr_destroy(&attr);
4168 free(threads);
4169 free(thread_data);
4170
4171 return;
4172 }
4173
4174
4175 /* ----------------------------------------------------------------------------
4176 * Pthread companion function to VScaleDiff
4177 */
4178
VScaleDiff_PT(void * thread_data)4179 static void *VScaleDiff_PT(void *thread_data)
4180 {
4181 sunindextype i, start, end;
4182 realtype c;
4183 realtype *xd, *yd, *zd;
4184 Pthreads_Data *my_data;
4185
4186 /* extract thread data */
4187 my_data = (Pthreads_Data *) thread_data;
4188
4189 c = my_data->c1;
4190 xd = my_data->v1;
4191 yd = my_data->v2;
4192 zd = my_data->v3;
4193
4194 start = my_data->start;
4195 end = my_data->end;
4196
4197 /* compute scaled vector difference */
4198 for (i = start; i < end; i++)
4199 zd[i] = c*(xd[i] - yd[i]);
4200
4201 /* exit */
4202 pthread_exit(NULL);
4203 }
4204
4205
4206 /* ----------------------------------------------------------------------------
4207 * Compute vector sum z[i] = a*x[i]+y[i]
4208 */
4209
VLin1_Pthreads(realtype a,N_Vector x,N_Vector y,N_Vector z)4210 static void VLin1_Pthreads(realtype a, N_Vector x, N_Vector y, N_Vector z)
4211 {
4212 sunindextype N;
4213 int i, nthreads;
4214 pthread_t *threads;
4215 Pthreads_Data *thread_data;
4216 pthread_attr_t attr;
4217
4218 /* allocate threads and thread data structs */
4219 N = NV_LENGTH_PT(x);
4220 nthreads = NV_NUM_THREADS_PT(x);
4221 threads = malloc(nthreads*sizeof(pthread_t));
4222 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
4223
4224 /* set thread attributes */
4225 pthread_attr_init(&attr);
4226 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
4227
4228 for (i=0; i<nthreads; i++) {
4229 /* initialize thread data */
4230 N_VInitThreadData(&thread_data[i]);
4231
4232 /* compute start and end loop index for thread */
4233 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
4234
4235 /* pack thread data */
4236 thread_data[i].c1 = a;
4237 thread_data[i].v1 = NV_DATA_PT(x);
4238 thread_data[i].v2 = NV_DATA_PT(y);
4239 thread_data[i].v3 = NV_DATA_PT(z);
4240
4241 /* create threads and call pthread companion function */
4242 pthread_create(&threads[i], &attr, VLin1_PT, (void *) &thread_data[i]);
4243 }
4244
4245 /* wait for all threads to finish */
4246 for (i=0; i<nthreads; i++) {
4247 pthread_join(threads[i], NULL);
4248 }
4249
4250 /* clean up and return */
4251 pthread_attr_destroy(&attr);
4252 free(threads);
4253 free(thread_data);
4254
4255 return;
4256 }
4257
4258
4259 /* ----------------------------------------------------------------------------
4260 * Pthread companion function to VLin1
4261 */
4262
VLin1_PT(void * thread_data)4263 static void *VLin1_PT(void *thread_data)
4264 {
4265 sunindextype i, start, end;
4266 realtype a;
4267 realtype *xd, *yd, *zd;
4268 Pthreads_Data *my_data;
4269
4270 /* extract thread data */
4271 my_data = (Pthreads_Data *) thread_data;
4272
4273 a = my_data->c1;
4274 xd = my_data->v1;
4275 yd = my_data->v2;
4276 zd = my_data->v3;
4277
4278 start = my_data->start;
4279 end = my_data->end;
4280
4281 /* compute vector sum */
4282 for (i = start; i < end; i++)
4283 zd[i] = (a*xd[i]) + yd[i];
4284
4285 /* exit */
4286 pthread_exit(NULL);
4287 }
4288
4289
4290 /* ----------------------------------------------------------------------------
4291 * Compute vector difference z[i] = a*x[i]-y[i]
4292 */
4293
VLin2_Pthreads(realtype a,N_Vector x,N_Vector y,N_Vector z)4294 static void VLin2_Pthreads(realtype a, N_Vector x, N_Vector y, N_Vector z)
4295 {
4296 sunindextype N;
4297 int i, nthreads;
4298 pthread_t *threads;
4299 Pthreads_Data *thread_data;
4300 pthread_attr_t attr;
4301
4302 /* allocate threads and thread data structs */
4303 N = NV_LENGTH_PT(x);
4304 nthreads = NV_NUM_THREADS_PT(x);
4305 threads = malloc(nthreads*sizeof(pthread_t));
4306 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
4307
4308 /* set thread attributes */
4309 pthread_attr_init(&attr);
4310 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
4311
4312 for (i=0; i<nthreads; i++) {
4313 /* initialize thread data */
4314 N_VInitThreadData(&thread_data[i]);
4315
4316 /* compute start and end loop index for thread */
4317 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
4318
4319 /* pack thread data */
4320 thread_data[i].c1 = a;
4321 thread_data[i].v1 = NV_DATA_PT(x);
4322 thread_data[i].v2 = NV_DATA_PT(y);
4323 thread_data[i].v3 = NV_DATA_PT(z);
4324
4325 /* create threads and call pthread companion function */
4326 pthread_create(&threads[i], &attr, VLin2_PT, (void *) &thread_data[i]);
4327 }
4328
4329 /* wait for all threads to finish */
4330 for (i=0; i<nthreads; i++) {
4331 pthread_join(threads[i], NULL);
4332 }
4333
4334 /* clean up and return */
4335 pthread_attr_destroy(&attr);
4336 free(threads);
4337 free(thread_data);
4338
4339 return;
4340 }
4341
4342 /* ----------------------------------------------------------------------------
4343 * Pthread companion function to VLin2
4344 */
4345
VLin2_PT(void * thread_data)4346 static void *VLin2_PT(void *thread_data)
4347 {
4348 sunindextype i, start, end;
4349 realtype a;
4350 realtype *xd, *yd, *zd;
4351 Pthreads_Data *my_data;
4352
4353 /* extract thread data */
4354 my_data = (Pthreads_Data *) thread_data;
4355
4356 a = my_data->c1;
4357 xd = my_data->v1;
4358 yd = my_data->v2;
4359 zd = my_data->v3;
4360
4361 start = my_data->start;
4362 end = my_data->end;
4363
4364 /* compute vector difference */
4365 for (i = start; i < end; i++)
4366 zd[i] = (a*xd[i]) - yd[i];
4367
4368 /* exit */
4369 pthread_exit(NULL);
4370 }
4371
4372
4373 /* ----------------------------------------------------------------------------
4374 * Compute special cases of linear sum
4375 */
4376
Vaxpy_Pthreads(realtype a,N_Vector x,N_Vector y)4377 static void Vaxpy_Pthreads(realtype a, N_Vector x, N_Vector y)
4378 {
4379 sunindextype N;
4380 int i, nthreads;
4381 pthread_t *threads;
4382 Pthreads_Data *thread_data;
4383 pthread_attr_t attr;
4384
4385 /* allocate threads and thread data structs */
4386 N = NV_LENGTH_PT(x);
4387 nthreads = NV_NUM_THREADS_PT(x);
4388 threads = malloc(nthreads*sizeof(pthread_t));
4389 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
4390
4391 /* set thread attributes */
4392 pthread_attr_init(&attr);
4393 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
4394
4395 for (i=0; i<nthreads; i++) {
4396 /* initialize thread data */
4397 N_VInitThreadData(&thread_data[i]);
4398
4399 /* compute start and end loop index for thread */
4400 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
4401
4402 /* pack thread data */
4403 thread_data[i].c1 = a;
4404 thread_data[i].v1 = NV_DATA_PT(x);
4405 thread_data[i].v2 = NV_DATA_PT(y);
4406
4407 /* create threads and call pthread companion function */
4408 pthread_create(&threads[i], &attr, Vaxpy_PT, (void *) &thread_data[i]);
4409 }
4410
4411 /* wait for all threads to finish */
4412 for (i=0; i<nthreads; i++) {
4413 pthread_join(threads[i], NULL);
4414 }
4415
4416 /* clean up and return */
4417 pthread_attr_destroy(&attr);
4418 free(threads);
4419 free(thread_data);
4420
4421 return;
4422 }
4423
4424 /* ----------------------------------------------------------------------------
4425 * Pthread companion function to Vaxpy
4426 */
4427
Vaxpy_PT(void * thread_data)4428 static void *Vaxpy_PT(void *thread_data)
4429 {
4430 sunindextype i, start, end;
4431 realtype a;
4432 realtype *xd, *yd;
4433 Pthreads_Data *my_data;
4434
4435 /* extract thread data */
4436 my_data = (Pthreads_Data *) thread_data;
4437
4438 a = my_data->c1;
4439 xd = my_data->v1;
4440 yd = my_data->v2;
4441
4442 start = my_data->start;
4443 end = my_data->end;
4444
4445 /* compute axpy */
4446 if (a == ONE) {
4447 for (i = start; i < end; i++)
4448 yd[i] += xd[i];
4449
4450 /* exit */
4451 pthread_exit(NULL);
4452 }
4453
4454 if (a == -ONE) {
4455 for (i = start; i < end; i++)
4456 yd[i] -= xd[i];
4457
4458 /* exit */
4459 pthread_exit(NULL);
4460 }
4461
4462 for (i = start; i < end; i++)
4463 yd[i] += a*xd[i];
4464
4465 /* return */
4466 pthread_exit(NULL);
4467 }
4468
4469
4470 /* ----------------------------------------------------------------------------
4471 * Compute scaled vector
4472 */
4473
VScaleBy_Pthreads(realtype a,N_Vector x)4474 static void VScaleBy_Pthreads(realtype a, N_Vector x)
4475 {
4476 sunindextype N;
4477 int i, nthreads;
4478 pthread_t *threads;
4479 Pthreads_Data *thread_data;
4480 pthread_attr_t attr;
4481
4482 /* allocate threads and thread data structs */
4483 N = NV_LENGTH_PT(x);
4484 nthreads = NV_NUM_THREADS_PT(x);
4485 threads = malloc(nthreads*sizeof(pthread_t));
4486 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
4487
4488 /* set thread attributes */
4489 pthread_attr_init(&attr);
4490 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
4491
4492 for (i=0; i<nthreads; i++) {
4493 /* initialize thread data */
4494 N_VInitThreadData(&thread_data[i]);
4495
4496 /* compute start and end loop index for thread */
4497 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
4498
4499 /* pack thread data */
4500 thread_data[i].c1 = a;
4501 thread_data[i].v1 = NV_DATA_PT(x);
4502
4503 /* create threads and call pthread companion function */
4504 pthread_create(&threads[i], &attr, VScaleBy_PT, (void *) &thread_data[i]);
4505 }
4506
4507 /* wait for all threads to finish */
4508 for (i=0; i<nthreads; i++) {
4509 pthread_join(threads[i], NULL);
4510 }
4511
4512 /* clean up and return */
4513 pthread_attr_destroy(&attr);
4514 free(threads);
4515 free(thread_data);
4516
4517 return;
4518 }
4519
4520
4521 /* ----------------------------------------------------------------------------
4522 * Pthread companion function to VScaleBy
4523 */
4524
VScaleBy_PT(void * thread_data)4525 static void *VScaleBy_PT(void *thread_data)
4526 {
4527 sunindextype i, start, end;
4528 realtype a;
4529 realtype *xd;
4530 Pthreads_Data *my_data;
4531
4532 /* extract thread data */
4533 my_data = (Pthreads_Data *) thread_data;
4534
4535 a = my_data->c1;
4536 xd = my_data->v1;
4537
4538 start = my_data->start;
4539 end = my_data->end;
4540
4541 /* compute scaled vector */
4542 for (i = start; i < end; i++)
4543 xd[i] *= a;
4544
4545 /* exit */
4546 pthread_exit(NULL);
4547 }
4548
4549
4550 /*
4551 * -----------------------------------------------------------------------------
4552 * private functions for special cases of vector array operations
4553 * -----------------------------------------------------------------------------
4554 */
4555
VSumVectorArray_Pthreads(int nvec,N_Vector * X,N_Vector * Y,N_Vector * Z)4556 static int VSumVectorArray_Pthreads(int nvec, N_Vector* X, N_Vector* Y, N_Vector* Z)
4557 {
4558 sunindextype N;
4559 int i, nthreads;
4560 pthread_t *threads;
4561 Pthreads_Data *thread_data;
4562 pthread_attr_t attr;
4563
4564 /* allocate threads and thread data structs */
4565 N = NV_LENGTH_PT(X[0]);
4566 nthreads = NV_NUM_THREADS_PT(X[0]);
4567 threads = malloc(nthreads*sizeof(pthread_t));
4568 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
4569
4570 /* set thread attributes */
4571 pthread_attr_init(&attr);
4572 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
4573
4574 /* pack thread data, distribute loop indices, and create threads/call kernel */
4575 for (i=0; i<nthreads; i++) {
4576 N_VInitThreadData(&thread_data[i]);
4577
4578 thread_data[i].nvec = nvec;
4579 thread_data[i].Y1 = X;
4580 thread_data[i].Y2 = Y;
4581 thread_data[i].Y3 = Z;
4582
4583 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
4584
4585 pthread_create(&threads[i], &attr, VSumVectorArray_PT, (void *) &thread_data[i]);
4586 }
4587
4588 /* wait for all threads to finish */
4589 for (i=0; i<nthreads; i++)
4590 pthread_join(threads[i], NULL);
4591
4592 /* clean up and return */
4593 pthread_attr_destroy(&attr);
4594 free(threads);
4595 free(thread_data);
4596
4597 return(0);
4598 }
4599
VSumVectorArray_PT(void * thread_data)4600 static void *VSumVectorArray_PT(void *thread_data)
4601 {
4602 Pthreads_Data *my_data;
4603 sunindextype j, start, end;
4604
4605 int i;
4606 realtype* xd=NULL;
4607 realtype* yd=NULL;
4608 realtype* zd=NULL;
4609
4610 my_data = (Pthreads_Data *) thread_data;
4611 start = my_data->start;
4612 end = my_data->end;
4613
4614 for (i=0; i<my_data->nvec; i++) {
4615 xd = NV_DATA_PT(my_data->Y1[i]);
4616 yd = NV_DATA_PT(my_data->Y2[i]);
4617 zd = NV_DATA_PT(my_data->Y3[i]);
4618 for (j=start; j<end; j++)
4619 zd[j] = xd[j] + yd[j];
4620 }
4621
4622 pthread_exit(NULL);
4623 }
4624
4625
VDiffVectorArray_Pthreads(int nvec,N_Vector * X,N_Vector * Y,N_Vector * Z)4626 static int VDiffVectorArray_Pthreads(int nvec, N_Vector* X, N_Vector* Y, N_Vector* Z)
4627 {
4628 sunindextype N;
4629 int i, nthreads;
4630 pthread_t *threads;
4631 Pthreads_Data *thread_data;
4632 pthread_attr_t attr;
4633
4634 /* allocate threads and thread data structs */
4635 N = NV_LENGTH_PT(X[0]);
4636 nthreads = NV_NUM_THREADS_PT(X[0]);
4637 threads = malloc(nthreads*sizeof(pthread_t));
4638 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
4639
4640 /* set thread attributes */
4641 pthread_attr_init(&attr);
4642 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
4643
4644 /* pack thread data, distribute loop indices, and create threads/call kernel */
4645 for (i=0; i<nthreads; i++) {
4646 N_VInitThreadData(&thread_data[i]);
4647
4648 thread_data[i].nvec = nvec;
4649 thread_data[i].Y1 = X;
4650 thread_data[i].Y2 = Y;
4651 thread_data[i].Y3 = Z;
4652
4653 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
4654
4655 pthread_create(&threads[i], &attr, VDiffVectorArray_PT, (void *) &thread_data[i]);
4656 }
4657
4658 /* wait for all threads to finish */
4659 for (i=0; i<nthreads; i++)
4660 pthread_join(threads[i], NULL);
4661
4662 /* clean up and return */
4663 pthread_attr_destroy(&attr);
4664 free(threads);
4665 free(thread_data);
4666
4667 return(0);
4668 }
4669
VDiffVectorArray_PT(void * thread_data)4670 static void *VDiffVectorArray_PT(void *thread_data)
4671 {
4672 Pthreads_Data *my_data;
4673 sunindextype j, start, end;
4674
4675 int i;
4676 realtype* xd=NULL;
4677 realtype* yd=NULL;
4678 realtype* zd=NULL;
4679
4680 my_data = (Pthreads_Data *) thread_data;
4681 start = my_data->start;
4682 end = my_data->end;
4683
4684 for (i=0; i<my_data->nvec; i++) {
4685 xd = NV_DATA_PT(my_data->Y1[i]);
4686 yd = NV_DATA_PT(my_data->Y2[i]);
4687 zd = NV_DATA_PT(my_data->Y3[i]);
4688 for (j=start; j<end; j++)
4689 zd[j] = xd[j] - yd[j];
4690 }
4691
4692 pthread_exit(NULL);
4693 }
4694
4695
VScaleSumVectorArray_Pthreads(int nvec,realtype c,N_Vector * X,N_Vector * Y,N_Vector * Z)4696 static int VScaleSumVectorArray_Pthreads(int nvec, realtype c, N_Vector* X, N_Vector* Y, N_Vector* Z)
4697 {
4698 sunindextype N;
4699 int i, nthreads;
4700 pthread_t *threads;
4701 Pthreads_Data *thread_data;
4702 pthread_attr_t attr;
4703
4704 /* allocate threads and thread data structs */
4705 N = NV_LENGTH_PT(X[0]);
4706 nthreads = NV_NUM_THREADS_PT(X[0]);
4707 threads = malloc(nthreads*sizeof(pthread_t));
4708 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
4709
4710 /* set thread attributes */
4711 pthread_attr_init(&attr);
4712 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
4713
4714 /* pack thread data, distribute loop indices, and create threads/call kernel */
4715 for (i=0; i<nthreads; i++) {
4716 N_VInitThreadData(&thread_data[i]);
4717
4718 thread_data[i].nvec = nvec;
4719 thread_data[i].c1 = c;
4720 thread_data[i].Y1 = X;
4721 thread_data[i].Y2 = Y;
4722 thread_data[i].Y3 = Z;
4723
4724 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
4725
4726 pthread_create(&threads[i], &attr, VScaleSumVectorArray_PT, (void *) &thread_data[i]);
4727 }
4728
4729 /* wait for all threads to finish */
4730 for (i=0; i<nthreads; i++)
4731 pthread_join(threads[i], NULL);
4732
4733 /* clean up and return */
4734 pthread_attr_destroy(&attr);
4735 free(threads);
4736 free(thread_data);
4737
4738 return(0);
4739 }
4740
VScaleSumVectorArray_PT(void * thread_data)4741 static void *VScaleSumVectorArray_PT(void *thread_data)
4742 {
4743 Pthreads_Data *my_data;
4744 sunindextype j, start, end;
4745
4746 int i;
4747 realtype c;
4748 realtype* xd=NULL;
4749 realtype* yd=NULL;
4750 realtype* zd=NULL;
4751
4752 my_data = (Pthreads_Data *) thread_data;
4753 start = my_data->start;
4754 end = my_data->end;
4755 c = my_data->c1;
4756
4757 for (i=0; i<my_data->nvec; i++) {
4758 xd = NV_DATA_PT(my_data->Y1[i]);
4759 yd = NV_DATA_PT(my_data->Y2[i]);
4760 zd = NV_DATA_PT(my_data->Y3[i]);
4761 for (j=start; j<end; j++)
4762 zd[j] = c * (xd[j] + yd[j]);
4763 }
4764
4765 pthread_exit(NULL);
4766 }
4767
4768
VScaleDiffVectorArray_Pthreads(int nvec,realtype c,N_Vector * X,N_Vector * Y,N_Vector * Z)4769 static int VScaleDiffVectorArray_Pthreads(int nvec, realtype c, N_Vector* X, N_Vector* Y, N_Vector* Z)
4770 {
4771 sunindextype N;
4772 int i, nthreads;
4773 pthread_t *threads;
4774 Pthreads_Data *thread_data;
4775 pthread_attr_t attr;
4776
4777 /* allocate threads and thread data structs */
4778 N = NV_LENGTH_PT(X[0]);
4779 nthreads = NV_NUM_THREADS_PT(X[0]);
4780 threads = malloc(nthreads*sizeof(pthread_t));
4781 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
4782
4783 /* set thread attributes */
4784 pthread_attr_init(&attr);
4785 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
4786
4787 /* pack thread data, distribute loop indices, and create threads/call kernel */
4788 for (i=0; i<nthreads; i++) {
4789 N_VInitThreadData(&thread_data[i]);
4790
4791 thread_data[i].nvec = nvec;
4792 thread_data[i].c1 = c;
4793 thread_data[i].Y1 = X;
4794 thread_data[i].Y2 = Y;
4795 thread_data[i].Y3 = Z;
4796
4797 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
4798
4799 pthread_create(&threads[i], &attr, VScaleDiffVectorArray_PT, (void *) &thread_data[i]);
4800 }
4801
4802 /* wait for all threads to finish */
4803 for (i=0; i<nthreads; i++)
4804 pthread_join(threads[i], NULL);
4805
4806 /* clean up and return */
4807 pthread_attr_destroy(&attr);
4808 free(threads);
4809 free(thread_data);
4810
4811 return(0);
4812 }
4813
VScaleDiffVectorArray_PT(void * thread_data)4814 static void *VScaleDiffVectorArray_PT(void *thread_data)
4815 {
4816 Pthreads_Data *my_data;
4817 sunindextype j, start, end;
4818
4819 int i;
4820 realtype c;
4821 realtype* xd=NULL;
4822 realtype* yd=NULL;
4823 realtype* zd=NULL;
4824
4825 my_data = (Pthreads_Data *) thread_data;
4826 start = my_data->start;
4827 end = my_data->end;
4828 c = my_data->c1;
4829
4830 for (i=0; i<my_data->nvec; i++) {
4831 xd = NV_DATA_PT(my_data->Y1[i]);
4832 yd = NV_DATA_PT(my_data->Y2[i]);
4833 zd = NV_DATA_PT(my_data->Y3[i]);
4834 for (j=start; j<end; j++)
4835 zd[j] = c * (xd[j] - yd[j]);
4836 }
4837
4838 pthread_exit(NULL);
4839 }
4840
4841
VLin1VectorArray_Pthreads(int nvec,realtype a,N_Vector * X,N_Vector * Y,N_Vector * Z)4842 static int VLin1VectorArray_Pthreads(int nvec, realtype a, N_Vector* X, N_Vector* Y, N_Vector* Z)
4843 {
4844 sunindextype N;
4845 int i, nthreads;
4846 pthread_t *threads;
4847 Pthreads_Data *thread_data;
4848 pthread_attr_t attr;
4849
4850 /* allocate threads and thread data structs */
4851 N = NV_LENGTH_PT(X[0]);
4852 nthreads = NV_NUM_THREADS_PT(X[0]);
4853 threads = malloc(nthreads*sizeof(pthread_t));
4854 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
4855
4856 /* set thread attributes */
4857 pthread_attr_init(&attr);
4858 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
4859
4860 /* pack thread data, distribute loop indices, and create threads/call kernel */
4861 for (i=0; i<nthreads; i++) {
4862 N_VInitThreadData(&thread_data[i]);
4863
4864 thread_data[i].nvec = nvec;
4865 thread_data[i].c1 = a;
4866 thread_data[i].Y1 = X;
4867 thread_data[i].Y2 = Y;
4868 thread_data[i].Y3 = Z;
4869
4870 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
4871
4872 pthread_create(&threads[i], &attr, VLin1VectorArray_PT, (void *) &thread_data[i]);
4873 }
4874
4875 /* wait for all threads to finish */
4876 for (i=0; i<nthreads; i++)
4877 pthread_join(threads[i], NULL);
4878
4879 /* clean up and return */
4880 pthread_attr_destroy(&attr);
4881 free(threads);
4882 free(thread_data);
4883
4884 return(0);
4885 }
4886
VLin1VectorArray_PT(void * thread_data)4887 static void *VLin1VectorArray_PT(void *thread_data)
4888 {
4889 Pthreads_Data *my_data;
4890 sunindextype j, start, end;
4891
4892 int i;
4893 realtype a;
4894 realtype* xd=NULL;
4895 realtype* yd=NULL;
4896 realtype* zd=NULL;
4897
4898 my_data = (Pthreads_Data *) thread_data;
4899 start = my_data->start;
4900 end = my_data->end;
4901 a = my_data->c1;
4902
4903 for (i=0; i<my_data->nvec; i++) {
4904 xd = NV_DATA_PT(my_data->Y1[i]);
4905 yd = NV_DATA_PT(my_data->Y2[i]);
4906 zd = NV_DATA_PT(my_data->Y3[i]);
4907 for (j=start; j<end; j++)
4908 zd[j] = (a * xd[j]) + yd[j];
4909 }
4910
4911 pthread_exit(NULL);
4912 }
4913
4914
VLin2VectorArray_Pthreads(int nvec,realtype a,N_Vector * X,N_Vector * Y,N_Vector * Z)4915 static int VLin2VectorArray_Pthreads(int nvec, realtype a, N_Vector* X, N_Vector* Y, N_Vector* Z)
4916 {
4917 sunindextype N;
4918 int i, nthreads;
4919 pthread_t *threads;
4920 Pthreads_Data *thread_data;
4921 pthread_attr_t attr;
4922
4923 /* allocate threads and thread data structs */
4924 N = NV_LENGTH_PT(X[0]);
4925 nthreads = NV_NUM_THREADS_PT(X[0]);
4926 threads = malloc(nthreads*sizeof(pthread_t));
4927 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
4928
4929 /* set thread attributes */
4930 pthread_attr_init(&attr);
4931 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
4932
4933 /* pack thread data, distribute loop indices, and create threads/call kernel */
4934 for (i=0; i<nthreads; i++) {
4935 N_VInitThreadData(&thread_data[i]);
4936
4937 thread_data[i].nvec = nvec;
4938 thread_data[i].c1 = a;
4939 thread_data[i].Y1 = X;
4940 thread_data[i].Y2 = Y;
4941 thread_data[i].Y3 = Z;
4942
4943 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
4944
4945 pthread_create(&threads[i], &attr, VLin2VectorArray_PT, (void *) &thread_data[i]);
4946 }
4947
4948 /* wait for all threads to finish */
4949 for (i=0; i<nthreads; i++)
4950 pthread_join(threads[i], NULL);
4951
4952 /* clean up and return */
4953 pthread_attr_destroy(&attr);
4954 free(threads);
4955 free(thread_data);
4956
4957 return(0);
4958 }
4959
VLin2VectorArray_PT(void * thread_data)4960 static void *VLin2VectorArray_PT(void *thread_data)
4961 {
4962 Pthreads_Data *my_data;
4963 sunindextype j, start, end;
4964
4965 int i;
4966 realtype a;
4967 realtype* xd=NULL;
4968 realtype* yd=NULL;
4969 realtype* zd=NULL;
4970
4971 my_data = (Pthreads_Data *) thread_data;
4972 start = my_data->start;
4973 end = my_data->end;
4974 a = my_data->c1;
4975
4976 for (i=0; i<my_data->nvec; i++) {
4977 xd = NV_DATA_PT(my_data->Y1[i]);
4978 yd = NV_DATA_PT(my_data->Y2[i]);
4979 zd = NV_DATA_PT(my_data->Y3[i]);
4980 for (j=start; j<end; j++)
4981 zd[j] = (a * xd[j]) - yd[j];
4982 }
4983
4984 pthread_exit(NULL);
4985 }
4986
4987
VaxpyVectorArray_Pthreads(int nvec,realtype a,N_Vector * X,N_Vector * Y)4988 static int VaxpyVectorArray_Pthreads(int nvec, realtype a, N_Vector* X, N_Vector* Y)
4989 {
4990 sunindextype N;
4991 int i, nthreads;
4992 pthread_t *threads;
4993 Pthreads_Data *thread_data;
4994 pthread_attr_t attr;
4995
4996 /* allocate threads and thread data structs */
4997 N = NV_LENGTH_PT(X[0]);
4998 nthreads = NV_NUM_THREADS_PT(X[0]);
4999 threads = malloc(nthreads*sizeof(pthread_t));
5000 thread_data = (Pthreads_Data *) malloc(nthreads*sizeof(struct _Pthreads_Data));
5001
5002 /* set thread attributes */
5003 pthread_attr_init(&attr);
5004 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
5005
5006 /* pack thread data, distribute loop indices, and create threads/call kernel */
5007 for (i=0; i<nthreads; i++) {
5008 N_VInitThreadData(&thread_data[i]);
5009
5010 thread_data[i].nvec = nvec;
5011 thread_data[i].c1 = a;
5012 thread_data[i].Y1 = X;
5013 thread_data[i].Y2 = Y;
5014
5015 N_VSplitLoop(i, &nthreads, &N, &thread_data[i].start, &thread_data[i].end);
5016
5017 pthread_create(&threads[i], &attr, VaxpyVectorArray_PT, (void *) &thread_data[i]);
5018 }
5019
5020 /* wait for all threads to finish */
5021 for (i=0; i<nthreads; i++)
5022 pthread_join(threads[i], NULL);
5023
5024 /* clean up and return */
5025 pthread_attr_destroy(&attr);
5026 free(threads);
5027 free(thread_data);
5028
5029 return(0);
5030 }
5031
VaxpyVectorArray_PT(void * thread_data)5032 static void *VaxpyVectorArray_PT(void *thread_data)
5033 {
5034 Pthreads_Data *my_data;
5035 sunindextype j, start, end;
5036
5037 int i;
5038 realtype a;
5039 realtype* xd=NULL;
5040 realtype* yd=NULL;
5041
5042 my_data = (Pthreads_Data *) thread_data;
5043 start = my_data->start;
5044 end = my_data->end;
5045 a = my_data->c1;
5046
5047 if (a == ONE) {
5048 for (i=0; i<my_data->nvec; i++) {
5049 xd = NV_DATA_PT(my_data->Y1[i]);
5050 yd = NV_DATA_PT(my_data->Y2[i]);
5051 for (j=start; j<end; j++)
5052 yd[j] += xd[j];
5053 }
5054 pthread_exit(NULL);
5055 }
5056
5057 if (a == -ONE) {
5058 for (i=0; i<my_data->nvec; i++) {
5059 xd = NV_DATA_PT(my_data->Y1[i]);
5060 yd = NV_DATA_PT(my_data->Y2[i]);
5061 for (j=start; j<end; j++)
5062 yd[j] -= xd[j];
5063 }
5064 pthread_exit(NULL);
5065 }
5066
5067 for (i=0; i<my_data->nvec; i++) {
5068 xd = NV_DATA_PT(my_data->Y1[i]);
5069 yd = NV_DATA_PT(my_data->Y2[i]);
5070 for (j=start; j<end; j++)
5071 yd[j] += a * xd[j];
5072 }
5073 pthread_exit(NULL);
5074 }
5075
5076
5077 /*
5078 * -----------------------------------------------------------------
5079 * private utility functions
5080 * -----------------------------------------------------------------
5081 */
5082
5083 /* ----------------------------------------------------------------------------
5084 * Determine loop indices for a thread
5085 */
5086
N_VSplitLoop(int myid,int * nthreads,sunindextype * N,sunindextype * start,sunindextype * end)5087 static void N_VSplitLoop(int myid, int *nthreads, sunindextype *N,
5088 sunindextype *start, sunindextype *end)
5089 {
5090 sunindextype q, r; /* quotient and remainder */
5091
5092 /* work per thread and leftover work */
5093 q = *N / *nthreads;
5094 r = *N % *nthreads;
5095
5096 /* assign work */
5097 if (myid < r) {
5098 *start = myid * q + myid;
5099 *end = *start + q + 1;
5100 } else {
5101 *start = myid * q + r;
5102 *end = *start + q;
5103 }
5104 }
5105
5106
5107 /* ----------------------------------------------------------------------------
5108 * Initialize values of local thread data struct
5109 */
5110
N_VInitThreadData(Pthreads_Data * thread_data)5111 static void N_VInitThreadData(Pthreads_Data *thread_data)
5112 {
5113 thread_data->start = -1;
5114 thread_data->end = -1;
5115
5116 #if __STDC_VERSION__ >= 199901L
5117 thread_data->c1 = NAN;
5118 thread_data->c2 = NAN;
5119 #else
5120 thread_data->c1 = ZERO;
5121 thread_data->c2 = ZERO;
5122 #endif
5123
5124 thread_data->v1 = NULL;
5125 thread_data->v2 = NULL;
5126 thread_data->v3 = NULL;
5127 thread_data->global_val = NULL;
5128 thread_data->global_mutex = NULL;
5129
5130 thread_data->nvec = ZERO;
5131 thread_data->nsum = ZERO;
5132 thread_data->cvals = NULL;
5133 thread_data->Y1 = NULL;
5134 thread_data->Y2 = NULL;
5135 thread_data->Y3 = NULL;
5136 }
5137
5138
5139 /*
5140 * -----------------------------------------------------------------
5141 * Enable / Disable fused and vector array operations
5142 * -----------------------------------------------------------------
5143 */
5144
N_VEnableFusedOps_Pthreads(N_Vector v,booleantype tf)5145 int N_VEnableFusedOps_Pthreads(N_Vector v, booleantype tf)
5146 {
5147 /* check that vector is non-NULL */
5148 if (v == NULL) return(-1);
5149
5150 /* check that ops structure is non-NULL */
5151 if (v->ops == NULL) return(-1);
5152
5153 if (tf) {
5154 /* enable all fused vector operations */
5155 v->ops->nvlinearcombination = N_VLinearCombination_Pthreads;
5156 v->ops->nvscaleaddmulti = N_VScaleAddMulti_Pthreads;
5157 v->ops->nvdotprodmulti = N_VDotProdMulti_Pthreads;
5158 /* enable all vector array operations */
5159 v->ops->nvlinearsumvectorarray = N_VLinearSumVectorArray_Pthreads;
5160 v->ops->nvscalevectorarray = N_VScaleVectorArray_Pthreads;
5161 v->ops->nvconstvectorarray = N_VConstVectorArray_Pthreads;
5162 v->ops->nvwrmsnormvectorarray = N_VWrmsNormVectorArray_Pthreads;
5163 v->ops->nvwrmsnormmaskvectorarray = N_VWrmsNormMaskVectorArray_Pthreads;
5164 v->ops->nvscaleaddmultivectorarray = N_VScaleAddMultiVectorArray_Pthreads;
5165 v->ops->nvlinearcombinationvectorarray = N_VLinearCombinationVectorArray_Pthreads;
5166 } else {
5167 /* disable all fused vector operations */
5168 v->ops->nvlinearcombination = NULL;
5169 v->ops->nvscaleaddmulti = NULL;
5170 v->ops->nvdotprodmulti = NULL;
5171 /* disable all vector array operations */
5172 v->ops->nvlinearsumvectorarray = NULL;
5173 v->ops->nvscalevectorarray = NULL;
5174 v->ops->nvconstvectorarray = NULL;
5175 v->ops->nvwrmsnormvectorarray = NULL;
5176 v->ops->nvwrmsnormmaskvectorarray = NULL;
5177 v->ops->nvscaleaddmultivectorarray = NULL;
5178 v->ops->nvlinearcombinationvectorarray = NULL;
5179 }
5180
5181 /* return success */
5182 return(0);
5183 }
5184
5185
N_VEnableLinearCombination_Pthreads(N_Vector v,booleantype tf)5186 int N_VEnableLinearCombination_Pthreads(N_Vector v, booleantype tf)
5187 {
5188 /* check that vector is non-NULL */
5189 if (v == NULL) return(-1);
5190
5191 /* check that ops structure is non-NULL */
5192 if (v->ops == NULL) return(-1);
5193
5194 /* enable/disable operation */
5195 if (tf)
5196 v->ops->nvlinearcombination = N_VLinearCombination_Pthreads;
5197 else
5198 v->ops->nvlinearcombination = NULL;
5199
5200 /* return success */
5201 return(0);
5202 }
5203
N_VEnableScaleAddMulti_Pthreads(N_Vector v,booleantype tf)5204 int N_VEnableScaleAddMulti_Pthreads(N_Vector v, booleantype tf)
5205 {
5206 /* check that vector is non-NULL */
5207 if (v == NULL) return(-1);
5208
5209 /* check that ops structure is non-NULL */
5210 if (v->ops == NULL) return(-1);
5211
5212 /* enable/disable operation */
5213 if (tf)
5214 v->ops->nvscaleaddmulti = N_VScaleAddMulti_Pthreads;
5215 else
5216 v->ops->nvscaleaddmulti = NULL;
5217
5218 /* return success */
5219 return(0);
5220 }
5221
N_VEnableDotProdMulti_Pthreads(N_Vector v,booleantype tf)5222 int N_VEnableDotProdMulti_Pthreads(N_Vector v, booleantype tf)
5223 {
5224 /* check that vector is non-NULL */
5225 if (v == NULL) return(-1);
5226
5227 /* check that ops structure is non-NULL */
5228 if (v->ops == NULL) return(-1);
5229
5230 /* enable/disable operation */
5231 if (tf)
5232 v->ops->nvdotprodmulti = N_VDotProdMulti_Pthreads;
5233 else
5234 v->ops->nvdotprodmulti = NULL;
5235
5236 /* return success */
5237 return(0);
5238 }
5239
N_VEnableLinearSumVectorArray_Pthreads(N_Vector v,booleantype tf)5240 int N_VEnableLinearSumVectorArray_Pthreads(N_Vector v, booleantype tf)
5241 {
5242 /* check that vector is non-NULL */
5243 if (v == NULL) return(-1);
5244
5245 /* check that ops structure is non-NULL */
5246 if (v->ops == NULL) return(-1);
5247
5248 /* enable/disable operation */
5249 if (tf)
5250 v->ops->nvlinearsumvectorarray = N_VLinearSumVectorArray_Pthreads;
5251 else
5252 v->ops->nvlinearsumvectorarray = NULL;
5253
5254 /* return success */
5255 return(0);
5256 }
5257
N_VEnableScaleVectorArray_Pthreads(N_Vector v,booleantype tf)5258 int N_VEnableScaleVectorArray_Pthreads(N_Vector v, booleantype tf)
5259 {
5260 /* check that vector is non-NULL */
5261 if (v == NULL) return(-1);
5262
5263 /* check that ops structure is non-NULL */
5264 if (v->ops == NULL) return(-1);
5265
5266 /* enable/disable operation */
5267 if (tf)
5268 v->ops->nvscalevectorarray = N_VScaleVectorArray_Pthreads;
5269 else
5270 v->ops->nvscalevectorarray = NULL;
5271
5272 /* return success */
5273 return(0);
5274 }
5275
N_VEnableConstVectorArray_Pthreads(N_Vector v,booleantype tf)5276 int N_VEnableConstVectorArray_Pthreads(N_Vector v, booleantype tf)
5277 {
5278 /* check that vector is non-NULL */
5279 if (v == NULL) return(-1);
5280
5281 /* check that ops structure is non-NULL */
5282 if (v->ops == NULL) return(-1);
5283
5284 /* enable/disable operation */
5285 if (tf)
5286 v->ops->nvconstvectorarray = N_VConstVectorArray_Pthreads;
5287 else
5288 v->ops->nvconstvectorarray = NULL;
5289
5290 /* return success */
5291 return(0);
5292 }
5293
N_VEnableWrmsNormVectorArray_Pthreads(N_Vector v,booleantype tf)5294 int N_VEnableWrmsNormVectorArray_Pthreads(N_Vector v, booleantype tf)
5295 {
5296 /* check that vector is non-NULL */
5297 if (v == NULL) return(-1);
5298
5299 /* check that ops structure is non-NULL */
5300 if (v->ops == NULL) return(-1);
5301
5302 /* enable/disable operation */
5303 if (tf)
5304 v->ops->nvwrmsnormvectorarray = N_VWrmsNormVectorArray_Pthreads;
5305 else
5306 v->ops->nvwrmsnormvectorarray = NULL;
5307
5308 /* return success */
5309 return(0);
5310 }
5311
N_VEnableWrmsNormMaskVectorArray_Pthreads(N_Vector v,booleantype tf)5312 int N_VEnableWrmsNormMaskVectorArray_Pthreads(N_Vector v, booleantype tf)
5313 {
5314 /* check that vector is non-NULL */
5315 if (v == NULL) return(-1);
5316
5317 /* check that ops structure is non-NULL */
5318 if (v->ops == NULL) return(-1);
5319
5320 /* enable/disable operation */
5321 if (tf)
5322 v->ops->nvwrmsnormmaskvectorarray = N_VWrmsNormMaskVectorArray_Pthreads;
5323 else
5324 v->ops->nvwrmsnormmaskvectorarray = NULL;
5325
5326 /* return success */
5327 return(0);
5328 }
5329
N_VEnableScaleAddMultiVectorArray_Pthreads(N_Vector v,booleantype tf)5330 int N_VEnableScaleAddMultiVectorArray_Pthreads(N_Vector v, booleantype tf)
5331 {
5332 /* check that vector is non-NULL */
5333 if (v == NULL) return(-1);
5334
5335 /* check that ops structure is non-NULL */
5336 if (v->ops == NULL) return(-1);
5337
5338 /* enable/disable operation */
5339 if (tf)
5340 v->ops->nvscaleaddmultivectorarray = N_VScaleAddMultiVectorArray_Pthreads;
5341 else
5342 v->ops->nvscaleaddmultivectorarray = NULL;
5343
5344 /* return success */
5345 return(0);
5346 }
5347
N_VEnableLinearCombinationVectorArray_Pthreads(N_Vector v,booleantype tf)5348 int N_VEnableLinearCombinationVectorArray_Pthreads(N_Vector v, booleantype tf)
5349 {
5350 /* check that vector is non-NULL */
5351 if (v == NULL) return(-1);
5352
5353 /* check that ops structure is non-NULL */
5354 if (v->ops == NULL) return(-1);
5355
5356 /* enable/disable operation */
5357 if (tf)
5358 v->ops->nvlinearcombinationvectorarray = N_VLinearCombinationVectorArray_Pthreads;
5359 else
5360 v->ops->nvlinearcombinationvectorarray = NULL;
5361
5362 /* return success */
5363 return(0);
5364 }
5365