1 #if HAVE_CONFIG_H
2 # include "config.h"
3 #endif
4
5 /* $Id: vector.c,v 1.32.6.4 2007-08-29 17:32:32 manoj Exp $ */
6 #include "armcip.h"
7 #include "copy.h"
8 #include "acc.h"
9 #include "memlock.h"
10 #include <stdio.h>
11 #include <assert.h>
12
/*
 * Protocol codes recorded per target process in armci_prot_switch_fence[]
 * by DO_FENCE.  In this file the SERVER_* codes are passed on the
 * armci_pack_vector path and the DIRECT_* codes on the armci_copy_vector
 * path; the NB variants come from the non-blocking entry points.
 * DIRECT_NBGET and DIRECT_NBPUT are accepted by DO_FENCE but no caller in
 * this file passes them.
 */
#define SERVER_GET   1
#define SERVER_NBGET 2
#define DIRECT_GET   3
#define DIRECT_NBGET 4
#define SERVER_PUT   5
#define SERVER_NBPUT 6
#define DIRECT_PUT   7
#define DIRECT_NBPUT 8

/*
 * DO_FENCE(proc, prot): note that protocol `prot` is about to be used
 * towards process `proc`.
 *
 *   - If `prot` is one of the DIRECT_* codes and the protocol previously
 *     recorded for `proc` is SERVER_PUT, ARMCI_DoFence(proc) is called.
 *   - In every case `prot` is then stored as the recorded protocol.
 *
 * The macro is a single statement (do { } while (0)), so it is safe as the
 * body of an unbraced `if`/`else`; both arguments are evaluated exactly once.
 * The earlier form was an if/else chain followed by a separate assignment,
 * which meant `if (cond) DO_FENCE(p, x);` updated the recorded protocol even
 * when `cond` was false.
 *
 * NOTE(review): only SERVER_PUT triggers a fence; a preceding SERVER_NBPUT
 * does not.  That matches the previous behaviour -- confirm it is intended.
 */
#define DO_FENCE(fence_proc_, fence_prot_) do {                           \
        const int df_proc_ = (fence_proc_);                               \
        const int df_prot_ = (fence_prot_);                               \
        if ((df_prot_ == DIRECT_GET || df_prot_ == DIRECT_NBGET ||        \
             df_prot_ == DIRECT_PUT || df_prot_ == DIRECT_NBPUT) &&       \
            armci_prot_switch_fence[df_proc_] == SERVER_PUT) {            \
            ARMCI_DoFence(df_proc_);                                      \
        }                                                                 \
        armci_prot_switch_fence[df_proc_] = df_prot_;                     \
    } while (0)
35
36 /* defined in acc.h so don't redefine here
37 #ifndef ARMCI_COMPLEX_TYPES
38 typedef struct {
39 float real;
40 float imag;
41 } complex_t;
42
43 typedef struct {
44 double real;
45 double imag;
46 } dcomplex_t;
47 #endif
48 */
49
50 /*
51 void I_ACCUMULATE(void* scale, int elems, void*src, void* dst)
52 {
53 int j;
54 int *a=(int*)dst, *b=(int*)src;
55 int alpha = *(int*)scale;
56
57 for(j=0;j<elems;j++) a[j] += alpha*b[j];
58 }
59 */
60
61
/*
 * ACCUMULATE(DTYPE, scale, elems, src, dst)
 *
 * Scaled accumulate over `elems` items of arithmetic type DTYPE:
 *     dst[k] += (*(DTYPE *)scale) * src[k]      for k = 0 .. elems-1
 *
 * `scale`, `src` and `dst` are pointers of any type and are cast to DTYPE*.
 * Each argument is evaluated once.  The temporaries carry an acc_ prefix so
 * that caller variables with short names (a, b, j, alpha, ...) can be passed
 * without being captured by the expansion.
 *
 * The expansion is a brace block rather than do { } while (0) because the
 * call sites in this file invoke it without a trailing semicolon.
 */
#define ACCUMULATE( DTYPE, scale, elems, src, dst) {\
    DTYPE *acc_dst_ = (DTYPE *)(dst);\
    DTYPE *acc_src_ = (DTYPE *)(src);\
    const DTYPE acc_alpha_ = *(DTYPE *)(scale);\
    const int acc_count_ = (elems);\
    int acc_k_;\
    for (acc_k_ = 0; acc_k_ < acc_count_; acc_k_++)\
        acc_dst_[acc_k_] += acc_alpha_ * acc_src_[acc_k_];\
}
69
/*
 * ACCUMULATE_RA(DTYPE, elems, src, dst)
 *
 * Bitwise exclusive-or accumulate over `elems` items of integer type DTYPE:
 *     dst[k] ^= src[k]      for k = 0 .. elems-1
 *
 * Each argument is evaluated once; the acc_-prefixed temporaries keep caller
 * variables named a, b or j from being captured by the expansion.
 *
 * Expands to a brace block (call sites use it without a trailing semicolon).
 */
#define ACCUMULATE_RA( DTYPE, elems, src, dst) {\
    DTYPE *acc_dst_ = (DTYPE *)(dst);\
    DTYPE *acc_src_ = (DTYPE *)(src);\
    const int acc_count_ = (elems);\
    int acc_k_;\
    for (acc_k_ = 0; acc_k_ < acc_count_; acc_k_++)\
        acc_dst_[acc_k_] ^= acc_src_[acc_k_];\
}
76
/*
 * CPL_ACCUMULATE(DTYPE, scale, elems, src, dst)
 *
 * Complex scaled accumulate.  DTYPE must be a struct with `real` and `imag`
 * members; for k = 0 .. elems-1:
 *     dst[k].real += alpha.real*src[k].real - alpha.imag*src[k].imag
 *     dst[k].imag += alpha.imag*src[k].real + alpha.real*src[k].imag
 * where alpha = *(DTYPE *)scale.  The real part is stored before the
 * imaginary part is computed.
 *
 * Each argument is evaluated once; the acc_-prefixed temporaries keep caller
 * variables named a, b, j or alpha from being captured by the expansion.
 *
 * Expands to a brace block (call sites use it without a trailing semicolon).
 */
#define CPL_ACCUMULATE( DTYPE, scale, elems, src, dst) {\
    DTYPE *acc_dst_ = (DTYPE *)(dst);\
    DTYPE *acc_src_ = (DTYPE *)(src);\
    const DTYPE acc_alpha_ = *(DTYPE *)(scale);\
    const int acc_count_ = (elems);\
    int acc_k_;\
    for (acc_k_ = 0; acc_k_ < acc_count_; acc_k_++) {\
        acc_dst_[acc_k_].real += acc_alpha_.real*acc_src_[acc_k_].real - acc_alpha_.imag*acc_src_[acc_k_].imag;\
        acc_dst_[acc_k_].imag += acc_alpha_.imag*acc_src_[acc_k_].real + acc_alpha_.real*acc_src_[acc_k_].imag;\
    }\
}
87
/* Per-process record of the protocol code most recently passed to DO_FENCE;
 * indexed by process id.  Defined outside this file. */
extern int *armci_prot_switch_fence;

/* Declared here but not referenced anywhere else in this file. */
extern int armci_prot_switch_preproc;
extern int armci_prot_switch_preop;
91
92
/**
 * Lock, on process `proc`, the address range spanned by a set of segments.
 *
 * @param ptr_array  start address of each segment
 * @param len        number of entries in ptr_array; ptr_array[0] is read
 *                   unconditionally, so this must be at least 1
 * @param bytes      length of every segment in bytes
 * @param proc       process the range is locked on
 *
 * The range runs from the lowest segment start to the last byte of the
 * segment with the highest start (highest start + bytes - 1) and is passed
 * to ARMCI_LOCKMEM.  Nothing is unlocked here; callers in this file follow
 * up with ARMCI_UNLOCKMEM(proc).
 */
void armci_lockmem_scatter(void *ptr_array[], int len, int bytes, int proc)
{
    void *low = ptr_array[0];
    void *high = ptr_array[0];
    int seg;

    for (seg = 0; seg < len; seg++) {
        low = ARMCI_MIN(ptr_array[seg], low);
        high = ARMCI_MAX(ptr_array[seg], high);
    }

    /* move from the start of the top-most segment to its final byte */
    high = (char *)high + (bytes - 1);

    ARMCI_LOCKMEM(low, high, proc);
}
112
113
114
/*
 * Number of `elem_size`-byte items in a segment of `bytes` bytes.
 * A length that is not a whole multiple of the item size is reported
 * through armci_die (with `bytes` as the code).
 */
static int armci_vacc_elem_count(int bytes, int elem_size)
{
    if (bytes % elem_size)
        armci_die("ARMCI vector accumulate: bytes not consistent with datatype", bytes);
    return bytes / elem_size;
}

/**
 * Apply an accumulate operation to every segment pair of one descriptor.
 *
 * @param op      accumulate code (ARMCI_ACC_INT, _LNG, _DBL, _DCP, _CPL,
 *                _FLT or _RA); any other value is reported via armci_die
 * @param scale   pointer to the scale factor, read as the element type of
 *                `op`; not used by ARMCI_ACC_RA
 * @param dsc     descriptor (by value): dsc.ptr_array_len segments of
 *                dsc.bytes bytes each; src_ptr_array[k] is accumulated into
 *                dst_ptr_array[k]
 * @param proc    process whose memory is locked when `lockit` is set
 * @param lockit  non-zero: bracket the update with armci_lockmem_scatter on
 *                the destination segments and ARMCI_UNLOCKMEM(proc)
 *
 * ARMCI_ACC_RA uses ACCUMULATE_RA on `long` elements (no scale factor);
 * the complex variants treat each element as a (real, imag) pair.
 */
void armci_scatter_acc(int op, void *scale, armci_giov_t dsc,
                       int proc, int lockit)
{
    const int nseg = dsc.ptr_array_len;
    int seg, count;

    if (lockit) {
        armci_lockmem_scatter(dsc.dst_ptr_array, nseg, dsc.bytes, proc);
    }

    switch (op) {
    case ARMCI_ACC_INT:
        count = armci_vacc_elem_count(dsc.bytes, (int)sizeof(int));
        for (seg = 0; seg < nseg; seg++) {
            ACCUMULATE(int, scale, count, dsc.src_ptr_array[seg], dsc.dst_ptr_array[seg])
        }
        break;

    case ARMCI_ACC_LNG:
        count = armci_vacc_elem_count(dsc.bytes, (int)sizeof(long));
        for (seg = 0; seg < nseg; seg++) {
            ACCUMULATE(long, scale, count, dsc.src_ptr_array[seg], dsc.dst_ptr_array[seg])
        }
        break;

    case ARMCI_ACC_DBL:
        count = armci_vacc_elem_count(dsc.bytes, (int)sizeof(double));
        for (seg = 0; seg < nseg; seg++) {
            ACCUMULATE(double, scale, count, dsc.src_ptr_array[seg], dsc.dst_ptr_array[seg])
        }
        break;

    case ARMCI_ACC_DCP:
        count = armci_vacc_elem_count(dsc.bytes, (int)(2*sizeof(double)));
        for (seg = 0; seg < nseg; seg++) {
            CPL_ACCUMULATE(dcomplex_t, scale, count, dsc.src_ptr_array[seg], dsc.dst_ptr_array[seg])
        }
        break;

    case ARMCI_ACC_CPL:
        count = armci_vacc_elem_count(dsc.bytes, (int)(2*sizeof(float)));
        for (seg = 0; seg < nseg; seg++) {
            CPL_ACCUMULATE(complex_t, scale, count, dsc.src_ptr_array[seg], dsc.dst_ptr_array[seg])
        }
        break;

    case ARMCI_ACC_FLT:
        count = armci_vacc_elem_count(dsc.bytes, (int)sizeof(float));
        for (seg = 0; seg < nseg; seg++) {
            ACCUMULATE(float, scale, count, dsc.src_ptr_array[seg], dsc.dst_ptr_array[seg])
        }
        break;

    case ARMCI_ACC_RA:
        count = armci_vacc_elem_count(dsc.bytes, (int)sizeof(long));
        for (seg = 0; seg < nseg; seg++) {
            ACCUMULATE_RA(long, count, dsc.src_ptr_array[seg], dsc.dst_ptr_array[seg])
        }
        break;

    default:
        armci_die("ARMCI vector accumulate: operation not supported",op);
    }

    if(lockit) ARMCI_UNLOCKMEM(proc);
}
190
191
192 #ifdef ACC_COPY
193 # define PWORKLEN 2048
194 static void *pwork[PWORKLEN]; /* work array of pointers */
195 #endif
196
armci_acc_vector(int op,void * scale,armci_giov_t darr[],int len,int proc)197 int armci_acc_vector(int op, /* operation code */
198 void *scale, /* pointer to scale factor in accumulate */
199 armci_giov_t darr[], /* descriptor array */
200 int len, /* length of descriptor array */
201 int proc /* remote process(or) ID */
202 )
203 {
204 int i;
205
206 #if defined(ACC_COPY)
207 if(proc == armci_me ){
208 #endif
209 for(i = 0; i< len; i++) armci_scatter_acc(op, scale, darr[i], proc, 1);
210 #if defined(ACC_COPY)
211 }else{
212 for(i = 0; i< len; i++){
213 armci_giov_t dr = darr[i];
214 int j, rc, nb;
215 if(dr.bytes > BUFSIZE/2){
216 /* for large segments use strided implementation */
217 for(j=0; j< dr.ptr_array_len; j++){
218 rc = armci_acc_copy_strided(op, scale,proc,
219 dr.src_ptr_array[j], NULL, dr.dst_ptr_array[j],NULL,
220 &dr.bytes, 0);
221 if(rc)return(rc);
222 }
223 }else{
224 armci_giov_t dl;
225 /*lock memory:should optimize it to lock only a chunk at a time*/
226 armci_lockmem_scatter(dr.dst_ptr_array, dr.ptr_array_len, dr.bytes, proc);
227 /* copy as many blocks as possible into the local buffer */
228 dl.bytes = dr.bytes;
229 nb = ARMCI_MIN(PWORKLEN,BUFSIZE/dr.bytes);
230 for(j=0; j< dr.ptr_array_len; j+= nb){
231 int nblocks = ARMCI_MIN(nb, dr.ptr_array_len -j);
232 int k;
233 /* setup vector descriptor for remote memory copy
234 to bring data into buffer*/
235 dl.ptr_array_len = nblocks;
236 dl.src_ptr_array = dr.dst_ptr_array + j; /* GET destination becomes source for copy */
237 for(k=0; k< nblocks; k++) pwork[k] = k*dl.bytes + (char*)armci_internal_buffer;
238 dl.dst_ptr_array = pwork;
239 /* get data to the local buffer */
240 rc = armci_copy_vector(GET, &dl, 1, proc);
241 if(rc){ ARMCI_UNLOCKMEM(proc); return(rc);}
242 /* update source array for accumulate */
243 dl.src_ptr_array = dr.src_ptr_array +j;
244 /* do scatter accumulate updating copy of data in buffer */
245 armci_scatter_acc(op, scale, dl, armci_me, 0);
246 /* modify descriptor-now source becomes destination for PUT*/
247 dl.dst_ptr_array = dr.dst_ptr_array + j;
248 dl.src_ptr_array = pwork;
249 /* put data back */
250 rc = armci_copy_vector(PUT, &dl, 1, proc);
251 FENCE_NODE(proc);
252 if(rc){ ARMCI_UNLOCKMEM(proc); return(rc);}
253 }
254 ARMCI_UNLOCKMEM(proc);
255 }
256 }/*endfor*/
257 }
258 #endif
259
260 return 0;
261 }
262
263
264
265
/**
 * Copy every segment described by `darr`, one segment at a time.
 *
 * @param op    operation code: PUT or GET
 * @param darr  descriptor array
 * @param len   number of descriptors in darr
 * @param proc  remote process(or) ID
 * @return always 0
 *
 * When SAMECLUSNODE(proc) is true the segments are moved with armci_copy
 * and `op` is not consulted.  Otherwise PUT uses armci_put (after
 * UPDATE_FENCE_STATE for each descriptor) and GET uses armci_get; any other
 * `op` is reported through armci_die.
 */
int armci_copy_vector(int op, armci_giov_t darr[], int len, int proc)
{
    int d, seg;
    int shmem = SAMECLUSNODE(proc);
    /* NOTE(review): not referenced directly below; presumably picked up by
     * name inside one of the macros used here -- confirm before removing. */
    int armci_th_idx = ARMCI_THREAD_IDX;

    if (shmem) {
        /* local/shared memory copy */
        for (d = 0; d < len; d++) {
            const armci_giov_t *vec = &darr[d];
            for (seg = 0; seg < vec->ptr_array_len; seg++) {
                armci_copy(vec->src_ptr_array[seg], vec->dst_ptr_array[seg], vec->bytes);
            }
        }
        return 0;
    }

    switch (op) {
    case PUT:
        for (d = 0; d < len; d++) {
            const armci_giov_t *vec = &darr[d];

            UPDATE_FENCE_STATE(proc, PUT, vec->ptr_array_len);

            for (seg = 0; seg < vec->ptr_array_len; seg++) {
                armci_put(vec->src_ptr_array[seg], vec->dst_ptr_array[seg],
                          vec->bytes, proc);
            }
        }
        break;

    case GET:
        for (d = 0; d < len; d++) {
            const armci_giov_t *vec = &darr[d];
            for (seg = 0; seg < vec->ptr_array_len; seg++) {
                armci_get(vec->src_ptr_array[seg], vec->dst_ptr_array[seg],
                          vec->bytes, proc);
            }
        }
        break;

    default:
        armci_die("armci_copy_vector: wrong optype",op);
    }

    return 0;
}
312
313
/**
 * Pack the source segments of `darr` back-to-back into `buf`.
 *
 * @param darr  descriptor array; only src_ptr_array, ptr_array_len and
 *              bytes of each descriptor are read
 * @param len   number of descriptors in darr
 * @param buf   output buffer; receives ptr_array_len * bytes bytes per
 *              descriptor, in descriptor order then segment order.  No
 *              capacity is passed in, so the caller must size it.
 */
void armci_vector_to_buf(armci_giov_t darr[], int len, void* buf)
{
    char *cursor = (char *)buf;
    int d;

    for (d = 0; d < len; d++) {
        const armci_giov_t *vec = &darr[d];
        int seg;

        for (seg = 0; seg < vec->ptr_array_len; seg++) {
            armci_copy(vec->src_ptr_array[seg], cursor, vec->bytes);
            cursor += vec->bytes;
        }
    }
}
325
326
/**
 * Unpack a contiguous buffer into the destination segments of `darr`.
 *
 * @param darr  descriptor array; only dst_ptr_array, ptr_array_len and
 *              bytes of each descriptor are read
 * @param len   number of descriptors in darr
 * @param buf   input buffer holding the segments back-to-back, in
 *              descriptor order then segment order (the layout written by
 *              armci_vector_to_buf)
 */
void armci_vector_from_buf(armci_giov_t darr[], int len, void* buf)
{
    char *cursor = (char *)buf;
    int d;

    for (d = 0; d < len; d++) {
        const armci_giov_t *vec = &darr[d];
        int seg;

        for (seg = 0; seg < vec->ptr_array_len; seg++) {
            armci_copy(cursor, vec->dst_ptr_array[seg], vec->bytes);
            cursor += vec->bytes;
        }
    }
}
339
/**
 * Blocking vector put.
 *
 * @param darr  descriptor array
 * @param len   number of descriptors in darr
 * @param proc  remote process(or) ID
 * @return 0 on success, otherwise
 *         FAIL   len < 1
 *         FAIL2  a descriptor has a NULL src_ptr_array or dst_ptr_array
 *         FAIL3  a descriptor has bytes < 1
 *         FAIL4  a descriptor has ptr_array_len < 1
 *         FAIL5  proc is outside [0, armci_nproc)
 *         FAIL6  the transfer routine returned non-zero
 *
 * Descriptors are validated in order, then proc.  Targets for which
 * SAMECLUSNODE(proc) holds are served by armci_copy_vector; all others go
 * through armci_pack_vector after DO_FENCE(proc, SERVER_PUT).
 */
int PARMCI_PutV(armci_giov_t darr[], int len, int proc)
{
    int status = 0;
    int d, direct;

    if (len < 1) return FAIL;

    for (d = 0; d < len; d++) {
        const armci_giov_t *vec = &darr[d];

        if (vec->src_ptr_array == NULL || vec->dst_ptr_array == NULL) return FAIL2;
        if (vec->bytes < 1) return FAIL3;
        if (vec->ptr_array_len < 1) return FAIL4;
    }

    if (proc < 0 || proc >= armci_nproc) return FAIL5;

    ORDER(PUT,proc); /* ensure ordering */
    direct = SAMECLUSNODE(proc);

    if (!direct) {
        DO_FENCE(proc,SERVER_PUT);
        status = armci_pack_vector(PUT, NULL, darr, len, proc, NULL);
    } else {
        /* NOTE(review): kept exactly as written (no braces).  DO_FENCE is a
         * macro; if it expands to more than one statement, bracing this
         * call would change behaviour. */
        if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_PUT);
        status = armci_copy_vector(PUT, darr, len, proc);
    }

    return status ? FAIL6 : 0;
}
371
372
/**
 * Blocking vector get.
 *
 * @param darr  descriptor array
 * @param len   number of descriptors in darr
 * @param proc  remote process(or) ID
 * @return 0 on success, otherwise
 *         FAIL   len < 1
 *         FAIL2  a descriptor has a NULL src_ptr_array or dst_ptr_array
 *         FAIL3  a descriptor has bytes < 1
 *         FAIL4  a descriptor has ptr_array_len < 1
 *         FAIL5  proc is outside [0, armci_nproc)
 *         FAIL6  the transfer routine returned non-zero
 *
 * The direct path (armci_copy_vector) is taken when SAMECLUSNODE(proc)
 * holds, and unconditionally when QUADRICS is defined.  Otherwise the
 * request goes through armci_pack_vector after DO_FENCE(proc, SERVER_GET).
 */
int PARMCI_GetV(armci_giov_t darr[], int len, int proc)
{
    int status = 0;
    int d;
    int direct = 1;     /* stays 1 when QUADRICS is defined */

    if (len < 1) return FAIL;

    for (d = 0; d < len; d++) {
        const armci_giov_t *vec = &darr[d];

        if (vec->src_ptr_array == NULL || vec->dst_ptr_array == NULL) return FAIL2;
        if (vec->bytes < 1) return FAIL3;
        if (vec->ptr_array_len < 1) return FAIL4;
    }

    if (proc < 0 || proc >= armci_nproc) return FAIL5;

    ORDER(GET,proc); /* ensure ordering */
#ifndef QUADRICS
    direct = SAMECLUSNODE(proc);
#endif

    if (!direct) {
        DO_FENCE(proc,SERVER_GET);
        status = armci_pack_vector(GET, NULL, darr, len, proc, NULL);
    } else {
        /* NOTE(review): kept exactly as written (no braces).  DO_FENCE is a
         * macro; if it expands to more than one statement, bracing this
         * call would change behaviour. */
        if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_GET);
        status = armci_copy_vector(GET, darr, len, proc);
    }

    return status ? FAIL6 : 0;
}
406
407
408
409
/**
 * Blocking vector accumulate.
 *
 * @param op     operation code
 * @param scale  scaling factor for accumulate
 * @param darr   descriptor array
 * @param len    number of descriptors in darr
 * @param proc   remote process(or) ID
 * @return 0 on success, otherwise
 *         FAIL   len < 1
 *         FAIL2  a descriptor has a NULL src_ptr_array or dst_ptr_array
 *         FAIL3  a descriptor has bytes < 1
 *         FAIL4  a descriptor has ptr_array_len < 1
 *         FAIL5  proc is outside [0, armci_nproc)
 *         FAIL6  the accumulate routine returned non-zero
 *
 * Targets for which SAMECLUSNODE(proc) holds are updated through
 * armci_acc_vector; all others go through armci_pack_vector after
 * DO_FENCE(proc, SERVER_PUT).
 */
int PARMCI_AccV(int op, void *scale, armci_giov_t darr[], int len, int proc)
{
    int status = 0;
    int d, direct;

    if (len < 1) return FAIL;

    for (d = 0; d < len; d++) {
        const armci_giov_t *vec = &darr[d];

        if (vec->src_ptr_array == NULL || vec->dst_ptr_array == NULL) return FAIL2;
        if (vec->bytes < 1) return FAIL3;
        if (vec->ptr_array_len < 1) return FAIL4;
    }

    if (proc < 0 || proc >= armci_nproc) return FAIL5;

    ORDER(op,proc); /* ensure ordering */
    direct = SAMECLUSNODE(proc);

    /* NOTE(review): a build with ACC_COPY and without ACC_SMP is rejected at
     * compile time by the #error below, so the override above it never
     * takes effect.  PARMCI_NbAccV carries the same override without the
     * #error -- confirm which of the two is intended. */
#if defined(ACC_COPY) && !defined(ACC_SMP)
    if (armci_me != proc) direct = 0;
#   error "grrr"
#endif

    if (!direct) {
        DO_FENCE(proc,SERVER_PUT);
        status = armci_pack_vector(op, scale, darr, len, proc, NULL);
    } else {
        status = armci_acc_vector(op, scale, darr, len, proc);
    }

    return status ? FAIL6 : 0;
}
444
445
446 /*****************************************************************************/
447
448 /*\ Non-blocking vector API
449 \*/
/**
 * Non-blocking vector put.
 *
 * @param darr     descriptor array
 * @param len      number of descriptors in darr
 * @param proc     remote process(or) ID
 * @param usr_hdl  non-blocking request handle; may be NULL, in which case
 *                 an implicit handle is obtained from
 *                 armci_set_implicit_handle
 * @return 0 on success, otherwise
 *         FAIL   len < 1
 *         FAIL2  a descriptor has a NULL src_ptr_array or dst_ptr_array
 *         FAIL3  a descriptor has bytes < 1
 *         FAIL4  a descriptor has ptr_array_len < 1
 *         FAIL5  proc is outside [0, armci_nproc)
 *         FAIL6  the transfer routine returned non-zero
 *         For an aggregate handle with a target off this cluster node, the
 *         value of armci_agg_save_giov_descriptor is returned unchanged.
 *
 * Aggregate handle (agg_flag == SET): an off-node request is only recorded
 * via armci_agg_save_giov_descriptor; an on-node request falls through to
 * the direct copy without touching the handle.  Any other handle has its
 * tag, op, proc and bufid fields initialised after UPDATE_FENCE_INFO(proc).
 */
int PARMCI_NbPutV(armci_giov_t darr[], int len, int proc, armci_hdl_t* usr_hdl)
{
    armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl;
    int status = 0;
    int d, direct;

    if (len < 1) return FAIL;

    for (d = 0; d < len; d++) {
        const armci_giov_t *vec = &darr[d];

        if (vec->src_ptr_array == NULL || vec->dst_ptr_array == NULL) return FAIL2;
        if (vec->bytes < 1) return FAIL3;
        if (vec->ptr_array_len < 1) return FAIL4;
    }

    if (proc < 0 || proc >= armci_nproc) return FAIL5;

    direct = SAMECLUSNODE(proc);

    if (nb_handle && nb_handle->agg_flag == SET) {
        /* aggregate put: off-node requests are only recorded for later */
        if (!direct) {
            return armci_agg_save_giov_descriptor(darr, len, proc, PUT, nb_handle);
        }
    } else {
        /*ORDER(PUT,proc); ensure ordering */
        UPDATE_FENCE_INFO(proc);

        if (nb_handle == NULL) {
            nb_handle = armci_set_implicit_handle(PUT, proc);
        } else {
            /* set tag and op in the nb handle */
            nb_handle->tag = GET_NEXT_NBTAG();
            nb_handle->op = PUT;
            nb_handle->proc = proc;
            nb_handle->bufid = NB_NONE;
        }
    }

    if (!direct) {
        DO_FENCE(proc,SERVER_NBPUT);
        status = armci_pack_vector(PUT, NULL, darr, len, proc, nb_handle);
    } else {
        /* NOTE(review): kept exactly as written (no braces).  DO_FENCE is a
         * macro; if it expands to more than one statement, bracing this
         * call would change behaviour. */
        if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_PUT);
        status = armci_copy_vector(PUT, darr, len, proc);
    }

    return status ? FAIL6 : 0;
}
504
/**
 * Non-blocking vector get.
 *
 * @param darr     descriptor array
 * @param len      number of descriptors in darr
 * @param proc     remote process(or) ID
 * @param usr_hdl  non-blocking request handle; may be NULL, in which case
 *                 an implicit handle is obtained from
 *                 armci_set_implicit_handle
 * @return 0 on success, otherwise
 *         FAIL   len < 1
 *         FAIL2  a descriptor has a NULL src_ptr_array or dst_ptr_array
 *         FAIL3  a descriptor has bytes < 1
 *         FAIL4  a descriptor has ptr_array_len < 1
 *         FAIL5  proc is outside [0, armci_nproc)
 *         FAIL6  the transfer routine returned non-zero
 *         For an aggregate handle with a target off this cluster node, the
 *         value of armci_agg_save_giov_descriptor is returned unchanged.
 *
 * Aggregate handle (agg_flag == SET): an off-node request is only recorded
 * via armci_agg_save_giov_descriptor; an on-node request falls through to
 * the direct copy without touching the handle.  Any other handle has its
 * tag, op, proc and bufid fields initialised.
 */
int PARMCI_NbGetV(armci_giov_t darr[], int len, int proc, armci_hdl_t* usr_hdl)
{
    armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl;
    int status = 0;
    int d, direct;

    if (len < 1) return FAIL;

    for (d = 0; d < len; d++) {
        const armci_giov_t *vec = &darr[d];

        if (vec->src_ptr_array == NULL || vec->dst_ptr_array == NULL) return FAIL2;
        if (vec->bytes < 1) return FAIL3;
        if (vec->ptr_array_len < 1) return FAIL4;
    }

    if (proc < 0 || proc >= armci_nproc) return FAIL5;

    direct = SAMECLUSNODE(proc);

    if (nb_handle && nb_handle->agg_flag == SET) {
        /* aggregate get: off-node requests are only recorded for later */
        if (!direct) {
            return armci_agg_save_giov_descriptor(darr, len, proc, GET, nb_handle);
        }
    } else {
        /* ORDER(GET,proc); ensure ordering */
        if (nb_handle == NULL) {
            nb_handle = armci_set_implicit_handle(GET, proc);
        } else {
            nb_handle->tag = GET_NEXT_NBTAG();
            nb_handle->op = GET;
            nb_handle->proc = proc;
            nb_handle->bufid = NB_NONE;
        }
    }

    if (!direct) {
        DO_FENCE(proc,SERVER_NBGET);
        status = armci_pack_vector(GET, NULL, darr, len, proc, nb_handle);
    } else {
        /* NOTE(review): kept exactly as written (no braces).  DO_FENCE is a
         * macro; if it expands to more than one statement, bracing this
         * call would change behaviour. */
        if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_GET);
        status = armci_copy_vector(GET, darr, len, proc);
    }

    return status ? FAIL6 : 0;
}
556
557
/**
 * Non-blocking vector accumulate.
 *
 * @param op       operation code
 * @param scale    scaling factor for accumulate
 * @param darr     descriptor array
 * @param len      number of descriptors in darr
 * @param proc     remote process(or) ID
 * @param usr_hdl  non-blocking request handle; may be NULL, in which case
 *                 an implicit handle is obtained from
 *                 armci_set_implicit_handle
 * @return 0 on success, otherwise
 *         FAIL   len < 1
 *         FAIL2  a descriptor has a NULL src_ptr_array or dst_ptr_array
 *         FAIL3  a descriptor has bytes < 1
 *         FAIL4  a descriptor has ptr_array_len < 1
 *         FAIL5  proc is outside [0, armci_nproc)
 *         FAIL6  the accumulate routine returned non-zero
 *
 * The handle's tag, op, proc and bufid fields are always (re)initialised;
 * unlike the non-blocking put/get, agg_flag is not examined here.  Targets
 * on the same cluster node go through armci_acc_vector, except that with
 * ACC_COPY and without ACC_SMP only the calling process itself does; all
 * other targets go through armci_pack_vector after
 * DO_FENCE(proc, SERVER_NBPUT).
 */
int PARMCI_NbAccV(int op, void *scale, armci_giov_t darr[], int len, int proc,
                  armci_hdl_t* usr_hdl)
{
    armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl;
    int status = 0;
    int d, direct;

    if (len < 1) return FAIL;

    for (d = 0; d < len; d++) {
        const armci_giov_t *vec = &darr[d];

        if (vec->src_ptr_array == NULL || vec->dst_ptr_array == NULL) return FAIL2;
        if (vec->bytes < 1) return FAIL3;
        if (vec->ptr_array_len < 1) return FAIL4;
    }

    if (proc < 0 || proc >= armci_nproc) return FAIL5;

    UPDATE_FENCE_INFO(proc);
    direct = SAMECLUSNODE(proc);

    if (nb_handle == NULL) {
        nb_handle = armci_set_implicit_handle(op, proc);
    } else {
        nb_handle->tag = GET_NEXT_NBTAG();
        nb_handle->op = op;
        nb_handle->proc = proc;
        nb_handle->bufid = NB_NONE;
    }

#if defined(ACC_COPY) && !defined(ACC_SMP)
    if (armci_me != proc) direct = 0;
#endif

    if (!direct) {
        DO_FENCE(proc,SERVER_NBPUT);
        status = armci_pack_vector(op, scale, darr, len, proc, nb_handle);
    } else {
        status = armci_acc_vector(op, scale, darr, len, proc);
    }

    return status ? FAIL6 : 0;
}
605 /*****************************************************************************/
606