1 #if HAVE_CONFIG_H
2 #   include "config.h"
3 #endif
4 
5 /* $Id: vector.c,v 1.32.6.4 2007-08-29 17:32:32 manoj Exp $ */
6 #include "armcip.h"
7 #include "copy.h"
8 #include "acc.h"
9 #include "memlock.h"
10 #include <stdio.h>
11 #include <assert.h>
12 
/* Protocol codes: stored per target process in armci_prot_switch_fence and
 * passed as the second argument of DO_FENCE. */
#define SERVER_GET 1
#define SERVER_NBGET 2
#define DIRECT_GET 3
#define DIRECT_NBGET 4
#define SERVER_PUT 5
#define SERVER_NBPUT 6
#define DIRECT_PUT 7
#define DIRECT_NBPUT 8


/*
 * DO_FENCE(proc, prot): record that protocol `prot` is about to be used
 * towards process `proc`.
 *   - For a DIRECT_* protocol, ARMCI_DoFence(proc) is called first when the
 *     protocol last recorded for that process is SERVER_PUT.
 *   - For every protocol, `prot` is then stored in
 *     armci_prot_switch_fence[proc].
 * The body is wrapped in do{...}while(0) so the macro is exactly one
 * statement: it can be the body of an unbraced if/else, and a guard placed in
 * front of it covers the state update as well as the fence.
 * Each argument is evaluated once.
 */
#  define DO_FENCE(proc_,prot_) do{\
        const int do_fence_proc_ = (proc_);\
        const int do_fence_prot_ = (prot_);\
        if(do_fence_prot_==DIRECT_GET || do_fence_prot_==DIRECT_NBGET ||\
           do_fence_prot_==DIRECT_PUT || do_fence_prot_==DIRECT_NBPUT){\
          if(armci_prot_switch_fence[do_fence_proc_]==SERVER_PUT)\
            ARMCI_DoFence(do_fence_proc_);\
        }\
        armci_prot_switch_fence[do_fence_proc_]=do_fence_prot_;\
      }while(0)
35 
36 /* defined in acc.h so don't redefine here
37 #ifndef ARMCI_COMPLEX_TYPES
38 typedef struct {
39     float real;
40     float imag;
41 } complex_t;
42 
43 typedef struct {
44     double real;
45     double imag;
46 } dcomplex_t;
47 #endif
48 */
49 
50 /*
51 void I_ACCUMULATE(void* scale, int elems, void*src, void* dst)
52 {
53     int j;
54     int *a=(int*)dst, *b=(int*)src;
55     int alpha = *(int*)scale;
56 
57     for(j=0;j<elems;j++) a[j] += alpha*b[j];
58 }
59 */
60 
61 
/*
 * ACCUMULATE: dst[k] += (*scale) * src[k] for k in [0, elems), with all three
 * pointers reinterpreted as DTYPE.  Expands to a braced block, so call sites
 * write it without a trailing ';'.  `elems` is re-evaluated on every pass.
 */
#define ACCUMULATE( DTYPE, scale, elems, src, dst) {\
    DTYPE *acc_out_ =(DTYPE *)(dst);\
    DTYPE *acc_in_ =(DTYPE *)(src);\
    DTYPE acc_factor_ = *(DTYPE *)(scale);\
    int acc_k_ = 0;\
    while(acc_k_ < (elems)){\
        acc_out_[acc_k_] += acc_factor_*acc_in_[acc_k_];\
        acc_k_++;\
    }\
}

/*
 * ACCUMULATE_RA: dst[k] ^= src[k] for k in [0, elems); no scale factor.
 * DTYPE must be an integer type because of the bitwise XOR.
 */
#define ACCUMULATE_RA( DTYPE, elems, src, dst) {\
    DTYPE *acc_out_ =(DTYPE *)(dst);\
    DTYPE *acc_in_ =(DTYPE *)(src);\
    int acc_k_ = 0;\
    while(acc_k_ < (elems)){\
        acc_out_[acc_k_] ^= acc_in_[acc_k_];\
        acc_k_++;\
    }\
}

/*
 * CPL_ACCUMULATE: complex dst[k] += (*scale) * src[k] for k in [0, elems).
 * DTYPE is a struct with `real` and `imag` members.
 */
#define CPL_ACCUMULATE( DTYPE, scale, elems, src, dst) {\
    DTYPE *acc_out_ =(DTYPE *)(dst);\
    DTYPE *acc_in_ =(DTYPE *)(src);\
    DTYPE acc_factor_ = *(DTYPE *)(scale);\
    int acc_k_ = 0;\
    while(acc_k_ < (elems)){\
        acc_out_[acc_k_].real += acc_factor_.real*acc_in_[acc_k_].real - acc_factor_.imag*acc_in_[acc_k_].imag;\
        acc_out_[acc_k_].imag += acc_factor_.imag*acc_in_[acc_k_].real + acc_factor_.real*acc_in_[acc_k_].imag;\
        acc_k_++;\
    }\
}
87 
/* Last protocol code recorded for each process; read and written by DO_FENCE,
 * indexed by process id. */
extern int* armci_prot_switch_fence;
/* NOTE(review): the next two are not referenced in the visible part of this
 * file - presumably used by other protocol-switch code; confirm. */
extern int armci_prot_switch_preproc;
extern int armci_prot_switch_preop;
91 
92 
93 /*\ compute address range for memory to lock
94 \*/
/*
 * Lock, through ARMCI_LOCKMEM on process `proc`, a single address range that
 * spans every segment of a scatter operation: from the lowest pointer in
 * ptr_array up to the last byte of the segment starting at the highest one.
 *   ptr_array - segment start addresses (element 0 is read unconditionally)
 *   len       - number of entries in ptr_array
 *   bytes     - size of each segment in bytes
 *   proc      - process passed on to ARMCI_LOCKMEM
 * This function does not release the lock.
 */
void armci_lockmem_scatter(void *ptr_array[], int len, int bytes, int proc)
{
     void *lowest  = ptr_array[0];
     void *highest = ptr_array[0];
     int idx;

     /* entry 0 seeds both bounds, so the scan starts at entry 1 */
     for(idx = 1; idx < len; idx++){
         void *cur = ptr_array[idx];
         if(cur < lowest)  lowest  = cur;
         if(cur > highest) highest = cur;
     }

     /* extend the upper bound to the last byte of the highest segment */
     highest = (char*)highest + (bytes-1);
     ARMCI_LOCKMEM(lowest, highest, proc);
}
112 
113 
114 
/*
 * Number of `size`-byte elements held by a `bytes`-byte segment.  When
 * `bytes` is not a whole multiple of `size` the mismatch is reported through
 * armci_die before the quotient is returned.
 */
static int armci_vacc_elem_count(int bytes, int size)
{
    if(bytes%size) armci_die("ARMCI vector accumulate: bytes not consistent with datatype",bytes);
    return bytes/size;
}

/*
 * Accumulate every source segment of `dsc` into the matching destination
 * segment: dst[i] op= scale * src[i], element by element.
 *   op     - ARMCI_ACC_* code selecting the element type (ARMCI_ACC_RA XORs
 *            longs and ignores scale); other codes are reported via armci_die
 *   scale  - pointer to the scale factor, of the element type selected by op
 *   dsc    - vector descriptor (segment pointers, segment count, bytes/segment)
 *   proc   - process whose memory is locked when lockit is set
 *   lockit - nonzero: lock the destination range before, and unlock after
 *
 * No helper macro is left defined once this function ends (earlier versions
 * leaked a function-local ITERATOR macro into the rest of the file).
 */
void armci_scatter_acc(int op, void *scale, armci_giov_t dsc,
                                            int proc, int lockit)
{
    /* `seg` must not be renamed to j/a/b/alpha: the accumulate macros may
       declare locals with those names inside their expansion */
    int seg, elems;

    if(lockit)
        armci_lockmem_scatter(dsc.dst_ptr_array, dsc.ptr_array_len,
                              dsc.bytes, proc);

    switch (op){
    case ARMCI_ACC_INT:
        elems = armci_vacc_elem_count(dsc.bytes, (int)sizeof(int));
        for(seg = 0; seg < dsc.ptr_array_len; seg++){
            ACCUMULATE(int, scale, elems, dsc.src_ptr_array[seg], dsc.dst_ptr_array[seg])
        }
        break;

    case ARMCI_ACC_LNG:
        elems = armci_vacc_elem_count(dsc.bytes, (int)sizeof(long));
        for(seg = 0; seg < dsc.ptr_array_len; seg++){
            ACCUMULATE(long, scale, elems, dsc.src_ptr_array[seg], dsc.dst_ptr_array[seg])
        }
        break;

    case ARMCI_ACC_DBL:
        elems = armci_vacc_elem_count(dsc.bytes, (int)sizeof(double));
        for(seg = 0; seg < dsc.ptr_array_len; seg++){
            ACCUMULATE(double, scale, elems, dsc.src_ptr_array[seg], dsc.dst_ptr_array[seg])
        }
        break;

    case ARMCI_ACC_DCP:
        /* double complex: element size is two doubles */
        elems = armci_vacc_elem_count(dsc.bytes, (int)(2*sizeof(double)));
        for(seg = 0; seg < dsc.ptr_array_len; seg++){
            CPL_ACCUMULATE(dcomplex_t, scale, elems, dsc.src_ptr_array[seg], dsc.dst_ptr_array[seg])
        }
        break;

    case ARMCI_ACC_CPL:
        /* single complex: element size is two floats */
        elems = armci_vacc_elem_count(dsc.bytes, (int)(2*sizeof(float)));
        for(seg = 0; seg < dsc.ptr_array_len; seg++){
            CPL_ACCUMULATE(complex_t, scale, elems, dsc.src_ptr_array[seg], dsc.dst_ptr_array[seg])
        }
        break;

    case ARMCI_ACC_FLT:
        elems = armci_vacc_elem_count(dsc.bytes, (int)sizeof(float));
        for(seg = 0; seg < dsc.ptr_array_len; seg++){
            ACCUMULATE(float, scale, elems, dsc.src_ptr_array[seg], dsc.dst_ptr_array[seg])
        }
        break;

    case ARMCI_ACC_RA:
        elems = armci_vacc_elem_count(dsc.bytes, (int)sizeof(long));
        for(seg = 0; seg < dsc.ptr_array_len; seg++){
            ACCUMULATE_RA(long,elems, dsc.src_ptr_array[seg], dsc.dst_ptr_array[seg])
        }
        break;

    default: armci_die("ARMCI vector accumulate: operation not supported",op);
    }

    if(lockit) ARMCI_UNLOCKMEM(proc);
}
190 
191 
192 #ifdef ACC_COPY
193 #  define PWORKLEN 2048
194    static void *pwork[PWORKLEN];  /* work array of pointers */
195 #endif
196 
armci_acc_vector(int op,void * scale,armci_giov_t darr[],int len,int proc)197 int armci_acc_vector(int op,             /* operation code */
198                     void *scale,         /* pointer to scale factor in accumulate */
199                     armci_giov_t darr[], /* descriptor array */
200                     int len,             /* length of descriptor array */
201                     int proc             /* remote process(or) ID */
202               )
203 {
204     int i;
205 
206 #if defined(ACC_COPY)
207     if(proc == armci_me ){
208 #endif
209        for(i = 0; i< len; i++) armci_scatter_acc(op, scale, darr[i], proc, 1);
210 #if defined(ACC_COPY)
211     }else{
212        for(i = 0; i< len; i++){
213            armci_giov_t dr =  darr[i];
214            int j, rc, nb;
215            if(dr.bytes > BUFSIZE/2){
216                /* for large segments use strided implementation */
217                for(j=0; j< dr.ptr_array_len; j++){
218                    rc = armci_acc_copy_strided(op, scale,proc,
219                            dr.src_ptr_array[j], NULL, dr.dst_ptr_array[j],NULL,
220                            &dr.bytes, 0);
221                    if(rc)return(rc);
222                }
223            }else{
224                armci_giov_t dl;
225                /*lock memory:should optimize it to lock only a chunk at a time*/
226                armci_lockmem_scatter(dr.dst_ptr_array, dr.ptr_array_len, dr.bytes, proc);
227                /* copy as many blocks as possible into the local buffer */
228                dl.bytes = dr.bytes;
229                nb = ARMCI_MIN(PWORKLEN,BUFSIZE/dr.bytes);
230                for(j=0; j< dr.ptr_array_len; j+= nb){
231                    int nblocks = ARMCI_MIN(nb, dr.ptr_array_len -j);
232                    int k;
233                    /* setup vector descriptor for remote memory copy
234                       to bring data into buffer*/
235                    dl.ptr_array_len = nblocks;
236                    dl.src_ptr_array = dr.dst_ptr_array + j; /* GET destination becomes source for copy */
237                    for(k=0; k< nblocks; k++) pwork[k] = k*dl.bytes + (char*)armci_internal_buffer;
238                    dl.dst_ptr_array = pwork;
239                    /* get data to the local buffer */
240                    rc = armci_copy_vector(GET, &dl, 1, proc);
241                    if(rc){ ARMCI_UNLOCKMEM(proc); return(rc);}
242                    /* update source array for accumulate */
243                    dl.src_ptr_array = dr.src_ptr_array +j;
244                    /* do scatter accumulate updating copy of data in buffer */
245                    armci_scatter_acc(op, scale, dl, armci_me, 0);
246                    /* modify descriptor-now source becomes destination for PUT*/
247                    dl.dst_ptr_array = dr.dst_ptr_array + j;
248                    dl.src_ptr_array = pwork;
249                    /* put data back */
250                    rc = armci_copy_vector(PUT, &dl, 1, proc);
251                    FENCE_NODE(proc);
252                    if(rc){ ARMCI_UNLOCKMEM(proc); return(rc);}
253                }
254                ARMCI_UNLOCKMEM(proc);
255            }
256        }/*endfor*/
257     }
258 #endif
259 
260     return 0;
261 }
262 
263 
264 
265 
/*
 * Copy every segment described by darr, one segment at a time.
 *   op   - PUT or GET; anything else is reported through armci_die when the
 *          target is not on the same cluster node
 *   darr - descriptor array
 *   len  - number of descriptors in darr
 *   proc - target process id
 * When SAMECLUSNODE(proc) holds, segments are copied with armci_copy whatever
 * op is; otherwise armci_put / armci_get is issued per segment, and for PUT
 * the fence state is updated once per descriptor.  Always returns 0.
 */
int armci_copy_vector(int op,            /* operation code */
                    armci_giov_t darr[], /* descriptor array */
                    int len,             /* length of descriptor array */
                    int proc             /* remote process(or) ID */
              )
{
    int iv, seg, shmem= SAMECLUSNODE(proc);
    /* NOTE(review): not used directly below; presumably referenced by the
       fence/transfer macros in some configurations - keep the name as is */
    int armci_th_idx = ARMCI_THREAD_IDX;

    if(shmem){
        /* local/shared memory copy */
        for(iv = 0; iv < len; iv++){
            armci_giov_t *vec = &darr[iv];
            for(seg = 0; seg < vec->ptr_array_len; seg++){
                armci_copy(vec->src_ptr_array[seg],vec->dst_ptr_array[seg],vec->bytes);
            }
        }
        return 0;
    }

    switch(op){
    case PUT:
        for(iv = 0; iv < len; iv++){
            armci_giov_t *vec = &darr[iv];

            UPDATE_FENCE_STATE(proc, PUT, vec->ptr_array_len);

            for(seg = 0; seg < vec->ptr_array_len; seg++){
                armci_put(vec->src_ptr_array[seg],vec->dst_ptr_array[seg],
                          vec->bytes, proc);
            }
        }
        break;

    case GET:
        for(iv = 0; iv < len; iv++){
            armci_giov_t *vec = &darr[iv];
            for(seg = 0; seg < vec->ptr_array_len; seg++){
                armci_get(vec->src_ptr_array[seg],vec->dst_ptr_array[seg],
                          vec->bytes,proc);
            }
        }
        break;

    default:
        armci_die("armci_copy_vector: wrong optype",op);
    }

    return 0;
}
312 
313 
/*
 * Pack the source segments of all descriptors into `buf`, back to back, in
 * descriptor order and then segment order.
 *   darr - descriptor array
 *   len  - number of descriptors in darr
 *   buf  - destination buffer; the caller must provide room for the sum of
 *          ptr_array_len*bytes over all descriptors
 */
void armci_vector_to_buf(armci_giov_t darr[], int len, void* buf)
{
    char *out = (char*)buf;
    int iv;

    for(iv = 0; iv < len; iv++){
        armci_giov_t *vec = &darr[iv];
        int seg;

        for(seg = 0; seg < vec->ptr_array_len; seg++){
            armci_copy(vec->src_ptr_array[seg],out,vec->bytes);
            out += vec->bytes;   /* next free position in the packed buffer */
        }
    }
}
325 
326 
/*
 * Unpack `buf` into the destination segments of all descriptors; the buffer
 * is consumed back to back in descriptor order and then segment order.
 *   darr - descriptor array
 *   len  - number of descriptors in darr
 *   buf  - packed source buffer holding ptr_array_len*bytes per descriptor
 */
void armci_vector_from_buf(armci_giov_t darr[], int len, void* buf)
{
    char *in = (char*)buf;
    int iv;

    for(iv = 0; iv < len; iv++){
        armci_giov_t *vec = &darr[iv];
        int seg;

        for(seg = 0; seg < vec->ptr_array_len; seg++){
            armci_copy(in, vec->dst_ptr_array[seg],vec->bytes);
            in += vec->bytes;   /* advance past the segment just unpacked */
        }
    }
}
339 
/*
 * Blocking vector put to process `proc`.
 * Returns 0 on success, otherwise:
 *   FAIL  - len < 1
 *   FAIL2 - a descriptor has a NULL source or destination pointer array
 *   FAIL3 - a descriptor has bytes < 1
 *   FAIL4 - a descriptor has ptr_array_len < 1
 *   FAIL5 - proc is outside [0, armci_nproc)
 *   FAIL6 - the underlying transfer reported an error
 */
int PARMCI_PutV( armci_giov_t darr[], /* descriptor array */
                int len,  /* length of descriptor array */
                int proc  /* remote process(or) ID */
              )
{
    int status, idx, same_node;

    if(len<1) return FAIL;
    for(idx = 0; idx < len; idx++){
        const armci_giov_t *vec = &darr[idx];
        if(!vec->src_ptr_array || !vec->dst_ptr_array) return FAIL2;
        if(vec->bytes<1) return FAIL3;
        if(vec->ptr_array_len <1) return FAIL4;
    }

    if(proc<0 || proc >= armci_nproc) return FAIL5;

    ORDER(PUT,proc); /* ensure ordering */
    same_node = SAMECLUSNODE(proc);

    if(same_node){
        /* left unbraced on purpose: review the DO_FENCE expansion before
           adding braces around this statement */
        if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_PUT);
        status = armci_copy_vector(PUT, darr, len, proc);
    }else{
        DO_FENCE(proc,SERVER_PUT);
        status = armci_pack_vector(PUT, NULL, darr, len, proc,NULL);
    }

    return status ? FAIL6 : 0;
}
371 
372 
/*
 * Blocking vector get from process `proc`.
 * Returns 0 on success, otherwise:
 *   FAIL  - len < 1
 *   FAIL2 - a descriptor has a NULL source or destination pointer array
 *   FAIL3 - a descriptor has bytes < 1
 *   FAIL4 - a descriptor has ptr_array_len < 1
 *   FAIL5 - proc is outside [0, armci_nproc)
 *   FAIL6 - the underlying transfer reported an error
 */
int PARMCI_GetV( armci_giov_t darr[], /* descriptor array */
                int len,  /* length of descriptor array */
                int proc  /* remote process(or) ID */
              )
{
    int status, idx;
    int use_direct = 1;   /* stays 1 when QUADRICS is defined */

    if(len<1) return FAIL;
    for(idx = 0; idx < len; idx++){
        const armci_giov_t *vec = &darr[idx];
        if(!vec->src_ptr_array || !vec->dst_ptr_array) return FAIL2;
        if(vec->bytes<1) return FAIL3;
        if(vec->ptr_array_len <1) return FAIL4;
    }

    if(proc<0 || proc >= armci_nproc) return FAIL5;

    ORDER(GET,proc); /* ensure ordering */
#ifndef QUADRICS
    use_direct = SAMECLUSNODE(proc);
#endif

    if(use_direct){
        /* left unbraced on purpose: review the DO_FENCE expansion before
           adding braces around this statement */
        if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_GET);
        status = armci_copy_vector(GET, darr, len, proc);
    }else{
        DO_FENCE(proc,SERVER_GET);
        status = armci_pack_vector(GET, NULL, darr, len, proc,NULL);
    }

    return status ? FAIL6 : 0;
}
406 
407 
408 
409 
/*
 * Blocking vector accumulate to process `proc`.
 * Returns 0 on success, otherwise:
 *   FAIL  - len < 1
 *   FAIL2 - a descriptor has a NULL source or destination pointer array
 *   FAIL3 - a descriptor has bytes < 1
 *   FAIL4 - a descriptor has ptr_array_len < 1
 *   FAIL5 - proc is outside [0, armci_nproc)
 *   FAIL6 - the underlying accumulate reported an error
 */
int PARMCI_AccV( int op,              /* operation code */
                void *scale,         /* scaling factor for accumulate */
                armci_giov_t darr[], /* descriptor array */
                int len,             /* length of descriptor array */
                int proc             /* remote process(or) ID */
              )
{
    int status, idx, same_node;

    if(len<1) return FAIL;
    for(idx = 0; idx < len; idx++){
        const armci_giov_t *vec = &darr[idx];
        if(!vec->src_ptr_array || !vec->dst_ptr_array) return FAIL2;
        if(vec->bytes<1) return FAIL3;
        if(vec->ptr_array_len <1) return FAIL4;
    }

    if(proc<0 || proc >= armci_nproc) return FAIL5;

    ORDER(op,proc); /* ensure ordering */
    same_node = SAMECLUSNODE(proc);
    /* NOTE(review): the #error below stops any build that defines ACC_COPY
       without ACC_SMP - confirm this is still intended */
#   if defined(ACC_COPY) && !defined(ACC_SMP)
       if(armci_me != proc) same_node=0;
#      error "grrr"
#   endif

    if(same_node){
        status = armci_acc_vector( op, scale, darr, len, proc);
    }else{
        DO_FENCE(proc,SERVER_PUT);
        status = armci_pack_vector(op, scale, darr, len, proc,NULL);
    }

    return status ? FAIL6 : 0;
}
444 
445 
446 /*****************************************************************************/
447 
448 /*\ Non-blocking vector API
449 \*/
/*
 * Non-blocking vector put to process `proc`.
 *   usr_hdl - request handle; may be NULL, in which case an implicit handle
 *             is obtained from armci_set_implicit_handle
 * With an aggregate handle (agg_flag == SET) and a target on another cluster
 * node, the descriptors are only saved via armci_agg_save_giov_descriptor and
 * that call's result is returned unchanged.
 * Otherwise returns 0 on success, or:
 *   FAIL  - len < 1
 *   FAIL2 - a descriptor has a NULL source or destination pointer array
 *   FAIL3 - a descriptor has bytes < 1
 *   FAIL4 - a descriptor has ptr_array_len < 1
 *   FAIL5 - proc is outside [0, armci_nproc)
 *   FAIL6 - the underlying transfer reported an error
 */
int PARMCI_NbPutV( armci_giov_t darr[], /* descriptor array */
                int len,  /* length of descriptor array */
                int proc, /* remote process(or) ID */
                armci_hdl_t* usr_hdl  /*non-blocking request handle*/
              )
{
    armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl;
    int status, idx, same_node, aggregated;

    if(len<1) return FAIL;
    for(idx = 0; idx < len; idx++){
        const armci_giov_t *vec = &darr[idx];
        if(!vec->src_ptr_array || !vec->dst_ptr_array) return FAIL2;
        if(vec->bytes<1) return FAIL3;
        if(vec->ptr_array_len <1) return FAIL4;
    }

    if(proc<0 || proc >= armci_nproc) return FAIL5;

    same_node = SAMECLUSNODE(proc);
    aggregated = (nb_handle && nb_handle->agg_flag == SET);

    if(aggregated){
        /* aggregate put: off-node requests are only recorded here */
        if(!same_node)
            return armci_agg_save_giov_descriptor(darr, len, proc, PUT, nb_handle);
    }else{
        UPDATE_FENCE_INFO(proc);

        if(!nb_handle){
            nb_handle = armci_set_implicit_handle(PUT, proc);
        }else{
            /* set tag and op in the nb handle */
            nb_handle->tag  = GET_NEXT_NBTAG();
            nb_handle->op   = PUT;
            nb_handle->proc = proc;
            nb_handle->bufid= NB_NONE;
        }
    }

    if(same_node){
        /* left unbraced on purpose: review the DO_FENCE expansion before
           adding braces around this statement */
        if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_PUT);
        status = armci_copy_vector(PUT, darr, len, proc);
    }else{
        DO_FENCE(proc,SERVER_NBPUT);
        status = armci_pack_vector(PUT, NULL, darr, len, proc,nb_handle);
    }

    return status ? FAIL6 : 0;
}
504 
/*
 * Non-blocking vector get from process `proc`.
 *   usr_hdl - request handle; may be NULL, in which case an implicit handle
 *             is obtained from armci_set_implicit_handle
 * With an aggregate handle (agg_flag == SET) and a target on another cluster
 * node, the descriptors are only saved via armci_agg_save_giov_descriptor and
 * that call's result is returned unchanged.
 * Otherwise returns 0 on success, or:
 *   FAIL  - len < 1
 *   FAIL2 - a descriptor has a NULL source or destination pointer array
 *   FAIL3 - a descriptor has bytes < 1
 *   FAIL4 - a descriptor has ptr_array_len < 1
 *   FAIL5 - proc is outside [0, armci_nproc)
 *   FAIL6 - the underlying transfer reported an error
 */
int PARMCI_NbGetV( armci_giov_t darr[], /* descriptor array */
                int len,  /* length of descriptor array */
                int proc, /* remote process(or) ID */
                armci_hdl_t* usr_hdl  /*non-blocking request handle*/
              )
{
    armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl;
    int status, idx, same_node, aggregated;

    if(len<1) return FAIL;
    for(idx = 0; idx < len; idx++){
        const armci_giov_t *vec = &darr[idx];
        if(!vec->src_ptr_array || !vec->dst_ptr_array) return FAIL2;
        if(vec->bytes<1) return FAIL3;
        if(vec->ptr_array_len <1) return FAIL4;
    }

    if(proc<0 || proc >= armci_nproc) return FAIL5;

    same_node = SAMECLUSNODE(proc);
    aggregated = (nb_handle && nb_handle->agg_flag == SET);

    if(aggregated){
        /* aggregate get: off-node requests are only recorded here */
        if(!same_node)
            return armci_agg_save_giov_descriptor(darr, len, proc, GET, nb_handle);
    }else if(!nb_handle){
        nb_handle = armci_set_implicit_handle(GET, proc);
    }else{
        /* set tag and op in the nb handle */
        nb_handle->tag  = GET_NEXT_NBTAG();
        nb_handle->op   = GET;
        nb_handle->proc = proc;
        nb_handle->bufid= NB_NONE;
    }

    if(same_node){
        /* left unbraced on purpose: review the DO_FENCE expansion before
           adding braces around this statement */
        if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_GET);
        status = armci_copy_vector(GET, darr, len, proc);
    }else{
        DO_FENCE(proc,SERVER_NBGET);
        status = armci_pack_vector(GET, NULL, darr, len, proc,nb_handle);
    }

    return status ? FAIL6 : 0;
}
556 
557 
/*
 * Non-blocking vector accumulate to process `proc`.
 *   usr_hdl - request handle; may be NULL, in which case an implicit handle
 *             is obtained from armci_set_implicit_handle
 * Returns 0 on success, otherwise:
 *   FAIL  - len < 1
 *   FAIL2 - a descriptor has a NULL source or destination pointer array
 *   FAIL3 - a descriptor has bytes < 1
 *   FAIL4 - a descriptor has ptr_array_len < 1
 *   FAIL5 - proc is outside [0, armci_nproc)
 *   FAIL6 - the underlying accumulate reported an error
 */
int PARMCI_NbAccV( int op,              /* operation code */
                void *scale,         /* scaling factor for accumulate */
                armci_giov_t darr[], /* descriptor array */
                int len,             /* length of descriptor array */
                int proc,            /* remote process(or) ID */
                armci_hdl_t* usr_hdl  /*non-blocking request handle*/
              )
{
    armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl;
    int status, idx, same_node;

    if(len<1) return FAIL;
    for(idx = 0; idx < len; idx++){
        const armci_giov_t *vec = &darr[idx];
        if(!vec->src_ptr_array || !vec->dst_ptr_array) return FAIL2;
        if(vec->bytes<1) return FAIL3;
        if(vec->ptr_array_len <1) return FAIL4;
    }

    if(proc<0 || proc >= armci_nproc) return FAIL5;

    UPDATE_FENCE_INFO(proc);
    same_node = SAMECLUSNODE(proc);

    if(!nb_handle){
        nb_handle = armci_set_implicit_handle(op, proc);
    }else{
        /* set tag and op in the nb handle */
        nb_handle->tag  = GET_NEXT_NBTAG();
        nb_handle->op   = op;
        nb_handle->proc = proc;
        nb_handle->bufid= NB_NONE;
    }

    /* with ACC_COPY but no ACC_SMP only the calling process itself is
       accumulated directly */
#   if defined(ACC_COPY) && !defined(ACC_SMP)
       if(armci_me != proc) same_node=0;
#   endif

    if(same_node){
        status = armci_acc_vector( op, scale, darr, len, proc);
    }else{
        DO_FENCE(proc,SERVER_NBPUT);
        status = armci_pack_vector(op, scale, darr, len, proc,nb_handle);
    }

    return status ? FAIL6 : 0;
}
605 /*****************************************************************************/
606