1 #if HAVE_CONFIG_H
2 # include "config.h"
3 #endif
4
5 /* $Id: vector.c,v 1.32.6.4 2007-08-29 17:32:32 manoj Exp $ */
6 #include "armcip.h"
7 #include "copy.h"
8 #include "acc.h"
9 #include "memlock.h"
10 #include <stdio.h>
11 #include <assert.h>
12
/* Protocol identifiers recorded per target process in armci_prot_switch_fence. */
#define SERVER_GET   1
#define SERVER_NBGET 2
#define DIRECT_GET   3
#define DIRECT_NBGET 4
#define SERVER_PUT   5
#define SERVER_NBPUT 6
#define DIRECT_PUT   7
#define DIRECT_NBPUT 8


/*
 * DO_FENCE(proc, prot)
 *
 * If prot is one of the DIRECT_* protocols and the protocol last recorded
 * for proc is SERVER_PUT, call ARMCI_DoFence(proc).  In every case, record
 * prot as the protocol last used with proc.
 *
 * The expansion is a single statement (do { ... } while(0)), so the macro
 * can be used as the body of an unbraced if/else: the fence test and the
 * state update are then both controlled by that condition.
 * Both arguments are evaluated more than once; pass side-effect-free
 * expressions.
 */
# define DO_FENCE(proc_,prot_) do {\
        if(((prot_)==DIRECT_GET || (prot_)==DIRECT_NBGET ||\
            (prot_)==DIRECT_PUT || (prot_)==DIRECT_NBPUT) &&\
           armci_prot_switch_fence[(proc_)]==SERVER_PUT)\
            ARMCI_DoFence(proc_);\
        armci_prot_switch_fence[(proc_)]=(prot_);\
    } while(0)
35
36 /*
37 typedef struct {
38 float real;
39 float imag;
40 } complex_t;
41
42 typedef struct {
43 double real;
44 double imag;
45 } dcomplex_t;
46 */
47
48 /*
49 void I_ACCUMULATE(void* scale, int elems, void*src, void* dst)
50 {
51 int j;
52 int *a=(int*)dst, *b=(int*)src;
53 int alpha = *(int*)scale;
54
55 for(j=0;j<elems;j++) a[j] += alpha*b[j];
56 }
57 */
58
59
/*
 * ACCUMULATE(DTYPE, scale, elems, src, dst)
 * Treats src and dst as arrays of DTYPE and performs
 *     dst[k] += (*scale) * src[k]   for k = 0 .. elems-1.
 * Expands to a brace-enclosed block; it is invoked without a trailing
 * semicolon.
 */
#define ACCUMULATE( DTYPE, scale, elems, src, dst) {\
        DTYPE *acc_out_ =(DTYPE *)(dst);\
        DTYPE *acc_in_ =(DTYPE *)(src);\
        DTYPE acc_factor_ = *(DTYPE *)(scale);\
        int acc_idx_ = 0;\
        while(acc_idx_ < (elems)){\
            acc_out_[acc_idx_] += acc_factor_*acc_in_[acc_idx_];\
            ++acc_idx_;\
        }\
    }
67
/*
 * ACCUMULATE_RA(DTYPE, elems, src, dst)
 * Treats src and dst as arrays of DTYPE and performs
 *     dst[k] ^= src[k]   for k = 0 .. elems-1.
 * Expands to a brace-enclosed block; it is invoked without a trailing
 * semicolon.
 */
#define ACCUMULATE_RA( DTYPE, elems, src, dst) {\
        DTYPE *ra_out_ =(DTYPE *)(dst);\
        DTYPE *ra_in_ =(DTYPE *)(src);\
        int ra_idx_ = 0;\
        while(ra_idx_ < (elems)){\
            ra_out_[ra_idx_] ^= ra_in_[ra_idx_];\
            ++ra_idx_;\
        }\
    }
74
/*
 * CPL_ACCUMULATE(DTYPE, scale, elems, src, dst)
 * Complex scaled accumulate.  DTYPE must have .real and .imag members.
 * For k = 0 .. elems-1:
 *     dst[k] += (*scale) * src[k]      (complex multiplication)
 * The real part of dst[k] is updated before the imaginary part is computed.
 * Expands to a brace-enclosed block; it is invoked without a trailing
 * semicolon.
 */
#define CPL_ACCUMULATE( DTYPE, scale, elems, src, dst) {\
        DTYPE *cpl_out_ =(DTYPE *)(dst);\
        DTYPE *cpl_in_ =(DTYPE *)(src);\
        DTYPE cpl_factor_ = *(DTYPE *)(scale);\
        int cpl_idx_ = 0;\
        while(cpl_idx_ < (elems)){\
            DTYPE *cpl_o_ = &cpl_out_[cpl_idx_];\
            DTYPE *cpl_s_ = &cpl_in_[cpl_idx_];\
            cpl_o_->real += cpl_factor_.real*cpl_s_->real - cpl_factor_.imag*cpl_s_->imag;\
            cpl_o_->imag += cpl_factor_.imag*cpl_s_->real + cpl_factor_.real*cpl_s_->imag;\
            ++cpl_idx_;\
        }\
    }
85
86 extern int* armci_prot_switch_fence;
87 extern int armci_prot_switch_preproc;
88 extern int armci_prot_switch_preop;
89
90
/*
 * Compute the address range covered by a set of equally sized segments and
 * lock it.
 *
 *   ptr_array  start addresses of the segments (at least one entry is read)
 *   len        number of entries in ptr_array
 *   bytes      size of each segment in bytes
 *   proc       process ID handed to ARMCI_LOCKMEM
 *
 * The locked range runs from the lowest start address to the last byte of
 * the segment with the highest start address.
 */
void armci_lockmem_scatter(void *ptr_array[], int len, int bytes, int proc)
{
    void *lowest  = ptr_array[0];
    void *highest = ptr_array[0];
    int k = 0;

    while (k < len) {
        lowest  = ARMCI_MIN(ptr_array[k], lowest);
        highest = ARMCI_MAX(ptr_array[k], highest);
        k++;
    }

    /* extend the upper bound to the last byte of the highest segment */
    highest = bytes - 1 + (char *)highest;

    ARMCI_LOCKMEM(lowest, highest, proc);
}
110
111
112
/*
 * Scaled accumulate over one vector descriptor.
 *
 * For every index i < dsc.ptr_array_len, the segment dsc.src_ptr_array[i] is
 * accumulated into dsc.dst_ptr_array[i]; each segment is dsc.bytes long and
 * is interpreted according to op:
 *
 *   ARMCI_ACC_INT / LNG / FLT / DBL   dst += (*scale) * src
 *   ARMCI_ACC_CPL / DCP               complex dst += (*scale) * src
 *   ARMCI_ACC_RA                      dst ^= src on long elements
 *                                     (scale is not used)
 *
 * armci_die() is called when dsc.bytes is not a multiple of the element size
 * or when op is none of the codes above.
 *
 * When lockit is nonzero the destination range is locked through
 * armci_lockmem_scatter() before the update, and ARMCI_UNLOCKMEM(proc) is
 * called afterwards.
 */
void armci_scatter_acc(int op, void *scale, armci_giov_t dsc,
                       int proc, int lockit)
{
#   define ITERATOR for(i = 0; i< dsc.ptr_array_len; i++)
    /* Set size/elems for an element of SZ bytes and verify that dsc.bytes
       holds a whole number of such elements. */
#   define VACC_SETUP_(SZ) \
        size = (SZ);\
        elems = dsc.bytes/size;\
        if(dsc.bytes%size) armci_die("ARMCI vector accumulate: bytes not consistent with datatype",dsc.bytes)

    int i, elems, size;

    if (lockit) {
        armci_lockmem_scatter(dsc.dst_ptr_array, dsc.ptr_array_len,
                              dsc.bytes, proc);
    }

    switch (op) {

    case ARMCI_ACC_INT:
        VACC_SETUP_(sizeof(int));
        ITERATOR {
            ACCUMULATE(int, scale, elems, dsc.src_ptr_array[i], dsc.dst_ptr_array[i])
        }
        break;

    case ARMCI_ACC_LNG:
        VACC_SETUP_(sizeof(long));
        ITERATOR {
            ACCUMULATE(long, scale, elems, dsc.src_ptr_array[i], dsc.dst_ptr_array[i])
        }
        break;

    case ARMCI_ACC_DBL:
        VACC_SETUP_(sizeof(double));
        ITERATOR {
            ACCUMULATE(double, scale, elems, dsc.src_ptr_array[i], dsc.dst_ptr_array[i])
        }
        break;

    case ARMCI_ACC_DCP:
        VACC_SETUP_(2*sizeof(double));
        ITERATOR {
            CPL_ACCUMULATE(dcomplex_t, scale, elems, dsc.src_ptr_array[i], dsc.dst_ptr_array[i])
        }
        break;

    case ARMCI_ACC_CPL:
        VACC_SETUP_(2*sizeof(float));
        ITERATOR {
            CPL_ACCUMULATE(complex_t, scale, elems, dsc.src_ptr_array[i], dsc.dst_ptr_array[i])
        }
        break;

    case ARMCI_ACC_FLT:
        VACC_SETUP_(sizeof(float));
        ITERATOR {
            ACCUMULATE(float, scale, elems, dsc.src_ptr_array[i], dsc.dst_ptr_array[i])
        }
        break;

    case ARMCI_ACC_RA:
        VACC_SETUP_(sizeof(long));
        ITERATOR {
            ACCUMULATE_RA(long,elems, dsc.src_ptr_array[i], dsc.dst_ptr_array[i])
        }
        break;

    default:
        armci_die("ARMCI vector accumulate: operation not supported",op);
    }

    if (lockit) ARMCI_UNLOCKMEM(proc);

#   undef VACC_SETUP_
}
188
189
190 #ifdef ACC_COPY
191 # define PWORKLEN 2048
192 static void *pwork[PWORKLEN]; /* work array of pointers */
193 #endif
194
armci_acc_vector(int op,void * scale,armci_giov_t darr[],int len,int proc)195 int armci_acc_vector(int op, /* operation code */
196 void *scale, /* pointer to scale factor in accumulate */
197 armci_giov_t darr[], /* descriptor array */
198 int len, /* length of descriptor array */
199 int proc /* remote process(or) ID */
200 )
201 {
202 int i;
203
204 #if defined(ACC_COPY)
205 if(proc == armci_me ){
206 #endif
207 for(i = 0; i< len; i++) armci_scatter_acc(op, scale, darr[i], proc, 1);
208 #if defined(ACC_COPY)
209 }else{
210 for(i = 0; i< len; i++){
211 armci_giov_t dr = darr[i];
212 int j, rc, nb;
213 if(dr.bytes > BUFSIZE/2){
214 /* for large segments use strided implementation */
215 for(j=0; j< dr.ptr_array_len; j++){
216 rc = armci_acc_copy_strided(op, scale,proc,
217 dr.src_ptr_array[j], NULL, dr.dst_ptr_array[j],NULL,
218 &dr.bytes, 0);
219 if(rc)return(rc);
220 }
221 }else{
222 armci_giov_t dl;
223 /*lock memory:should optimize it to lock only a chunk at a time*/
224 armci_lockmem_scatter(dr.dst_ptr_array, dr.ptr_array_len, dr.bytes, proc);
225 /* copy as many blocks as possible into the local buffer */
226 dl.bytes = dr.bytes;
227 nb = ARMCI_MIN(PWORKLEN,BUFSIZE/dr.bytes);
228 for(j=0; j< dr.ptr_array_len; j+= nb){
229 int nblocks = ARMCI_MIN(nb, dr.ptr_array_len -j);
230 int k;
231 /* setup vector descriptor for remote memory copy
232 to bring data into buffer*/
233 dl.ptr_array_len = nblocks;
234 dl.src_ptr_array = dr.dst_ptr_array + j; /* GET destination becomes source for copy */
235 for(k=0; k< nblocks; k++) pwork[k] = k*dl.bytes + (char*)armci_internal_buffer;
236 dl.dst_ptr_array = pwork;
237 /* get data to the local buffer */
238 rc = armci_copy_vector(GET, &dl, 1, proc);
239 if(rc){ ARMCI_UNLOCKMEM(proc); return(rc);}
240 /* update source array for accumulate */
241 dl.src_ptr_array = dr.src_ptr_array +j;
242 /* do scatter accumulate updating copy of data in buffer */
243 armci_scatter_acc(op, scale, dl, armci_me, 0);
244 /* modify descriptor-now source becomes destination for PUT*/
245 dl.dst_ptr_array = dr.dst_ptr_array + j;
246 dl.src_ptr_array = pwork;
247 /* put data back */
248 rc = armci_copy_vector(PUT, &dl, 1, proc);
249 FENCE_NODE(proc);
250 if(rc){ ARMCI_UNLOCKMEM(proc); return(rc);}
251 }
252 ARMCI_UNLOCKMEM(proc);
253 }
254 }/*endfor*/
255 }
256 #endif
257
258 return 0;
259 }
260
261
262
263
/*
 * Copy every segment described by darr[0..len-1], one segment at a time.
 *
 *   op    operation code (PUT or GET)
 *   darr  descriptor array
 *   len   length of the descriptor array
 *   proc  remote process(or) ID
 *
 * When SAMECLUSNODE(proc) is true each segment is copied with armci_copy()
 * and op is not examined.  Otherwise PUT uses armci_put() (after
 * UPDATE_FENCE_STATE for each descriptor), GET uses armci_get(), and any
 * other op ends in armci_die().
 *
 * Always returns 0.
 */
int armci_copy_vector(int op,             /* operation code */
                      armci_giov_t darr[],/* descriptor array */
                      int len,            /* length of descriptor array */
                      int proc            /* remote process(or) ID */
                     )
{
    int i,s,shmem= SAMECLUSNODE(proc);
    /* not referenced by name below; presumably consumed by macros used in
       this function -- TODO confirm before removing */
    int armci_th_idx = ARMCI_THREAD_IDX;

    if (shmem) {
        /* local/shared memory copy */
        for (i = 0; i < len; i++) {
            armci_giov_t *d = &darr[i];
            for (s = 0; s < d->ptr_array_len; s++) {
                armci_copy(d->src_ptr_array[s],d->dst_ptr_array[s],d->bytes);
            }
        }
        return 0;
    }

    switch (op) {
    case PUT:
        for (i = 0; i < len; i++) {
            armci_giov_t *d = &darr[i];

            UPDATE_FENCE_STATE(proc, PUT, d->ptr_array_len);

            for (s = 0; s < d->ptr_array_len; s++) {
                armci_put(d->src_ptr_array[s],d->dst_ptr_array[s],
                          d->bytes, proc);
            }
        }
        break;

    case GET:
        for (i = 0; i < len; i++) {
            armci_giov_t *d = &darr[i];
            for (s = 0; s < d->ptr_array_len; s++) {
                armci_get(d->src_ptr_array[s],d->dst_ptr_array[s],
                          d->bytes,proc);
            }
        }
        break;

    default:
        armci_die("armci_copy_vector: wrong optype",op);
    }

    return 0;
}
310
311
/*
 * Pack the source segments of darr[0..len-1] into buf.
 * Segments are written back to back, in descriptor order and then in
 * segment order, each occupying darr[i].bytes bytes of buf.
 */
void armci_vector_to_buf(armci_giov_t darr[], int len, void* buf)
{
    char *cursor = (char*)buf;
    int i, s;

    for (i = 0; i < len; i++) {
        armci_giov_t *d = &darr[i];
        s = 0;
        while (s < d->ptr_array_len) {
            armci_copy(d->src_ptr_array[s],cursor,d->bytes);
            cursor += d->bytes;
            s++;
        }
    }
}
323
324
/*
 * Unpack buf into the destination segments of darr[0..len-1].
 * buf is read back to back, in descriptor order and then in segment order,
 * darr[i].bytes bytes per segment.
 */
void armci_vector_from_buf(armci_giov_t darr[], int len, void* buf)
{
    char *cursor = (char*)buf;
    int i, s;

    for (i = 0; i < len; i++) {
        armci_giov_t *d = &darr[i];
        s = 0;
        while (s < d->ptr_array_len) {
            armci_copy(cursor, d->dst_ptr_array[s],d->bytes);
            cursor += d->bytes;
            s++;
        }
    }
}
337
/*
 * Blocking vector put.
 *
 * Argument checks, in this order:
 *   FAIL   len < 1
 *   FAIL2  a descriptor has a NULL src_ptr_array or dst_ptr_array
 *   FAIL3  a descriptor has bytes < 1
 *   FAIL4  a descriptor has ptr_array_len < 1
 *   FAIL5  proc is outside [0, armci_nproc)
 *
 * The transfer uses armci_copy_vector() when SAMECLUSNODE(proc) is true and
 * armci_pack_vector() otherwise.  Returns 0 on success, FAIL6 when the
 * transfer routine reports a nonzero code.
 */
int PARMCI_PutV( armci_giov_t darr[], /* descriptor array */
                 int len,             /* length of descriptor array */
                 int proc             /* remote process(or) ID */
               )
{
    int rc=0, i,direct=1;

    if (len < 1) return FAIL;

    for (i = 0; i < len; i++) {
        armci_giov_t *d = &darr[i];
        if (!d->src_ptr_array || !d->dst_ptr_array) return FAIL2;
        if (d->bytes < 1) return FAIL3;
        if (d->ptr_array_len < 1) return FAIL4;
    }

    if (!(proc >= 0 && proc < armci_nproc)) return FAIL5;

    ORDER(PUT,proc); /* ensure ordering */
    direct=SAMECLUSNODE(proc);

    if (direct) {
        /* Kept as a bare `if` in front of the macro: how much of DO_FENCE
           this test controls is decided by the macro's expansion, so the
           statement shape must not change. */
        if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_PUT);
        rc = armci_copy_vector(PUT, darr, len, proc);
    } else {
        DO_FENCE(proc,SERVER_PUT);
        rc = armci_pack_vector(PUT, NULL, darr, len, proc,NULL);
    }

    return rc ? FAIL6 : 0;
}
369
370
/*
 * Blocking vector get.
 *
 * Argument checks, in this order:
 *   FAIL   len < 1
 *   FAIL2  a descriptor has a NULL src_ptr_array or dst_ptr_array
 *   FAIL3  a descriptor has bytes < 1
 *   FAIL4  a descriptor has ptr_array_len < 1
 *   FAIL5  proc is outside [0, armci_nproc)
 *
 * Unless QUADRICS is defined, the direct path (armci_copy_vector) is taken
 * only when SAMECLUSNODE(proc) is true; with QUADRICS defined `direct` keeps
 * its initial value of 1.  The other path uses armci_pack_vector().
 * Returns 0 on success, FAIL6 when the transfer routine reports a nonzero
 * code.
 */
int PARMCI_GetV( armci_giov_t darr[], /* descriptor array */
                 int len,             /* length of descriptor array */
                 int proc             /* remote process(or) ID */
               )
{
    int rc=0, i,direct=1;

    if (len < 1) return FAIL;

    for (i = 0; i < len; i++) {
        armci_giov_t *d = &darr[i];
        if (!d->src_ptr_array || !d->dst_ptr_array) return FAIL2;
        if (d->bytes < 1) return FAIL3;
        if (d->ptr_array_len < 1) return FAIL4;
    }

    if (!(proc >= 0 && proc < armci_nproc)) return FAIL5;

    ORDER(GET,proc); /* ensure ordering */
#ifndef QUADRICS
    direct=SAMECLUSNODE(proc);
#endif

    if (direct) {
        /* Kept as a bare `if` in front of the macro: how much of DO_FENCE
           this test controls is decided by the macro's expansion, so the
           statement shape must not change. */
        if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_GET);
        rc = armci_copy_vector(GET, darr, len, proc);
    } else {
        DO_FENCE(proc,SERVER_GET);
        rc = armci_pack_vector(GET, NULL, darr, len, proc,NULL);
    }

    return rc ? FAIL6 : 0;
}
404
405
406
407
/*
 * Blocking vector accumulate.
 *
 * Argument checks, in this order:
 *   FAIL   len < 1
 *   FAIL2  a descriptor has a NULL src_ptr_array or dst_ptr_array
 *   FAIL3  a descriptor has bytes < 1
 *   FAIL4  a descriptor has ptr_array_len < 1
 *   FAIL5  proc is outside [0, armci_nproc)
 *
 * The accumulate is done by armci_acc_vector() when SAMECLUSNODE(proc) is
 * true and by armci_pack_vector() otherwise.  Building with ACC_COPY defined
 * and ACC_SMP undefined stops at the #error directive below.
 * Returns 0 on success, FAIL6 when the chosen routine reports a nonzero code.
 */
int PARMCI_AccV( int op,              /* operation code */
                 void *scale,         /* scaling factor for accumulate */
                 armci_giov_t darr[], /* descriptor array */
                 int len,             /* length of descriptor array */
                 int proc             /* remote process(or) ID */
               )
{
    int rc=0, i,direct=0;

    if (len < 1) return FAIL;

    for (i = 0; i < len; i++) {
        armci_giov_t *d = &darr[i];
        if (!d->src_ptr_array || !d->dst_ptr_array) return FAIL2;
        if (d->bytes < 1) return FAIL3;
        if (d->ptr_array_len < 1) return FAIL4;
    }

    if (!(proc >= 0 && proc < armci_nproc)) return FAIL5;

    ORDER(op,proc); /* ensure ordering */
    direct=SAMECLUSNODE(proc);
#   if defined(ACC_COPY) && !defined(ACC_SMP)
    if(armci_me != proc) direct=0;
#   error "grrr"
#   endif

    if (!direct) {
        DO_FENCE(proc,SERVER_PUT);
        rc = armci_pack_vector(op, scale, darr, len, proc,NULL);
    } else {
        rc = armci_acc_vector( op, scale, darr, len, proc);
    }

    return rc ? FAIL6 : 0;
}
442
443
444 /*****************************************************************************/
445
446 /*\ Non-blocking vector API
447 \*/
/*
 * Non-blocking vector put.
 *
 * Argument checks, in this order:
 *   FAIL   len < 1
 *   FAIL2  a descriptor has a NULL src_ptr_array or dst_ptr_array
 *   FAIL3  a descriptor has bytes < 1
 *   FAIL4  a descriptor has ptr_array_len < 1
 *   FAIL5  proc is outside [0, armci_nproc)
 *
 * If usr_hdl is an aggregate handle (agg_flag == SET) and proc is not on the
 * same cluster node, the descriptors are saved with
 * armci_agg_save_giov_descriptor() and its result is returned directly.
 * For a non-aggregate request the handle fields (tag, op, proc, bufid) are
 * filled in, or an implicit handle is obtained when usr_hdl is NULL.
 *
 * The transfer uses armci_copy_vector() when SAMECLUSNODE(proc) is true and
 * armci_pack_vector() otherwise.  Returns 0 on success, FAIL6 when the
 * transfer routine reports a nonzero code.
 */
int PARMCI_NbPutV( armci_giov_t darr[], /* descriptor array */
                   int len,             /* length of descriptor array */
                   int proc,            /* remote process(or) ID */
                   armci_hdl_t* usr_hdl /* non-blocking request handle */
                 )
{
    armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl;
    int rc=0, i,direct=1;

    if (len < 1) return FAIL;

    for (i = 0; i < len; i++) {
        armci_giov_t *d = &darr[i];
        if (!d->src_ptr_array || !d->dst_ptr_array) return FAIL2;
        if (d->bytes < 1) return FAIL3;
        if (d->ptr_array_len < 1) return FAIL4;
    }

    if (!(proc >= 0 && proc < armci_nproc)) return FAIL5;

    direct=SAMECLUSNODE(proc);

    if (nb_handle && nb_handle->agg_flag == SET) {
        /* aggregate put: only requests that are not direct are queued */
        if (!direct) {
            return armci_agg_save_giov_descriptor(darr, len, proc, PUT, nb_handle);
        }
    } else {
        /*ORDER(PUT,proc); ensure ordering */
        UPDATE_FENCE_INFO(proc);

        if (!nb_handle) {
            nb_handle = armci_set_implicit_handle(PUT, proc);
        } else {
            /* set tag and op in the nb handle */
            nb_handle->tag = GET_NEXT_NBTAG();
            nb_handle->op  = PUT;
            nb_handle->proc= proc;
            nb_handle->bufid=NB_NONE;
        }
    }

    if (direct) {
        /* Kept as a bare `if` in front of the macro: how much of DO_FENCE
           this test controls is decided by the macro's expansion, so the
           statement shape must not change. */
        if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_PUT);
        rc = armci_copy_vector(PUT, darr, len, proc);
    } else {
        DO_FENCE(proc,SERVER_NBPUT);
        rc = armci_pack_vector(PUT, NULL, darr, len, proc,nb_handle);
    }

    return rc ? FAIL6 : 0;
}
502
/*
 * Non-blocking vector get.
 *
 * Argument checks, in this order:
 *   FAIL   len < 1
 *   FAIL2  a descriptor has a NULL src_ptr_array or dst_ptr_array
 *   FAIL3  a descriptor has bytes < 1
 *   FAIL4  a descriptor has ptr_array_len < 1
 *   FAIL5  proc is outside [0, armci_nproc)
 *
 * If usr_hdl is an aggregate handle (agg_flag == SET) and proc is not on the
 * same cluster node, the descriptors are saved with
 * armci_agg_save_giov_descriptor() and its result is returned directly.
 * For a non-aggregate request the handle fields (tag, op, proc, bufid) are
 * filled in, or an implicit handle is obtained when usr_hdl is NULL.
 *
 * The transfer uses armci_copy_vector() when SAMECLUSNODE(proc) is true and
 * armci_pack_vector() otherwise.  Returns 0 on success, FAIL6 when the
 * transfer routine reports a nonzero code.
 */
int PARMCI_NbGetV( armci_giov_t darr[], /* descriptor array */
                   int len,             /* length of descriptor array */
                   int proc,            /* remote process(or) ID */
                   armci_hdl_t* usr_hdl /* non-blocking request handle */
                 )
{
    armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl;
    int rc=0, i,direct=1;

    if (len < 1) return FAIL;

    for (i = 0; i < len; i++) {
        armci_giov_t *d = &darr[i];
        if (!d->src_ptr_array || !d->dst_ptr_array) return FAIL2;
        if (d->bytes < 1) return FAIL3;
        if (d->ptr_array_len < 1) return FAIL4;
    }

    if (!(proc >= 0 && proc < armci_nproc)) return FAIL5;

    direct=SAMECLUSNODE(proc);

    if (nb_handle && nb_handle->agg_flag == SET) {
        /* aggregate get: only requests that are not direct are queued */
        if (!direct) {
            return armci_agg_save_giov_descriptor(darr, len, proc, GET, nb_handle);
        }
    } else {
        /* ORDER(GET,proc); ensure ordering */
        if (!nb_handle) {
            nb_handle = armci_set_implicit_handle(GET, proc);
        } else {
            nb_handle->tag = GET_NEXT_NBTAG();
            nb_handle->op  = GET;
            nb_handle->proc= proc;
            nb_handle->bufid=NB_NONE;
        }
    }

    if (direct) {
        /* Kept as a bare `if` in front of the macro: how much of DO_FENCE
           this test controls is decided by the macro's expansion, so the
           statement shape must not change. */
        if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_GET);
        rc = armci_copy_vector(GET, darr, len, proc);
    } else {
        DO_FENCE(proc,SERVER_NBGET);
        rc = armci_pack_vector(GET, NULL, darr, len, proc,nb_handle);
    }

    return rc ? FAIL6 : 0;
}
554
555
/*
 * Non-blocking vector accumulate.
 *
 * Argument checks, in this order:
 *   FAIL   len < 1
 *   FAIL2  a descriptor has a NULL src_ptr_array or dst_ptr_array
 *   FAIL3  a descriptor has bytes < 1
 *   FAIL4  a descriptor has ptr_array_len < 1
 *   FAIL5  proc is outside [0, armci_nproc)
 *
 * The handle fields (tag, op, proc, bufid) are filled in, or an implicit
 * handle is obtained when usr_hdl is NULL.  The accumulate is done by
 * armci_acc_vector() on the direct path and by armci_pack_vector() otherwise.
 * The direct path requires SAMECLUSNODE(proc); with ACC_COPY defined and
 * ACC_SMP undefined it additionally requires proc == armci_me.
 * Returns 0 on success, FAIL6 when the chosen routine reports a nonzero code.
 */
int PARMCI_NbAccV( int op,              /* operation code */
                   void *scale,         /* scaling factor for accumulate */
                   armci_giov_t darr[], /* descriptor array */
                   int len,             /* length of descriptor array */
                   int proc,            /* remote process(or) ID */
                   armci_hdl_t* usr_hdl /* non-blocking request handle */
                 )
{
    armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl;
    int rc=0, i,direct=1;

    if (len < 1) return FAIL;

    for (i = 0; i < len; i++) {
        armci_giov_t *d = &darr[i];
        if (!d->src_ptr_array || !d->dst_ptr_array) return FAIL2;
        if (d->bytes < 1) return FAIL3;
        if (d->ptr_array_len < 1) return FAIL4;
    }

    if (!(proc >= 0 && proc < armci_nproc)) return FAIL5;

    UPDATE_FENCE_INFO(proc);
    direct=SAMECLUSNODE(proc);

    if (!nb_handle) {
        nb_handle = armci_set_implicit_handle(op, proc);
    } else {
        nb_handle->tag = GET_NEXT_NBTAG();
        nb_handle->op  = op;
        nb_handle->proc= proc;
        nb_handle->bufid=NB_NONE;
    }

#   if defined(ACC_COPY) && !defined(ACC_SMP)
    if(armci_me != proc) direct=0;
#   endif

    if (!direct) {
        DO_FENCE(proc,SERVER_NBPUT);
        rc = armci_pack_vector(op, scale, darr, len, proc,nb_handle);
    } else {
        rc = armci_acc_vector( op, scale, darr, len, proc);
    }

    return rc ? FAIL6 : 0;
}
603 /*****************************************************************************/
604