1 #if HAVE_CONFIG_H
2 #   include "config.h"
3 #endif
4 
5 /* $Id: vector.c,v 1.32.6.4 2007-08-29 17:32:32 manoj Exp $ */
6 #include "armcip.h"
7 #include "copy.h"
8 #include "acc.h"
9 #include "memlock.h"
10 #include <stdio.h>
11 #include <assert.h>
12 
/* Protocol identifiers recorded per target process in armci_prot_switch_fence. */
#define SERVER_GET 1
#define SERVER_NBGET 2
#define DIRECT_GET 3
#define DIRECT_NBGET 4
#define SERVER_PUT 5
#define SERVER_NBPUT 6
#define DIRECT_PUT 7
#define DIRECT_NBPUT 8


/*
 * DO_FENCE(proc, prot): protocol-switch fence bookkeeping for one target.
 *
 * If the operation about to be issued uses a direct protocol (DIRECT_GET,
 * DIRECT_NBGET, DIRECT_PUT, DIRECT_NBPUT) and the protocol last recorded for
 * `proc` was SERVER_PUT, ARMCI_DoFence(proc) is called first.  In every case
 * `prot` is then recorded as the last protocol used for `proc`.
 *
 * The body is wrapped in do { ... } while (0) so the macro is one statement:
 * when it is the body of an unbraced `if`, the fence test AND the state
 * update are both governed by that condition, and it composes with `else`.
 * Each argument is evaluated exactly once.
 */
#define DO_FENCE(proc_, prot_) do {                                          \
        const int fence_proc_ = (proc_);                                     \
        const int fence_prot_ = (prot_);                                     \
        if ((fence_prot_ == DIRECT_GET || fence_prot_ == DIRECT_NBGET ||     \
             fence_prot_ == DIRECT_PUT || fence_prot_ == DIRECT_NBPUT) &&    \
            armci_prot_switch_fence[fence_proc_] == SERVER_PUT) {            \
            ARMCI_DoFence(fence_proc_);                                      \
        }                                                                    \
        armci_prot_switch_fence[fence_proc_] = fence_prot_;                  \
    } while (0)
35 
36 /*
37 typedef struct {
38     float real;
39     float imag;
40 } complex_t;
41 
42 typedef struct {
43     double real;
44     double imag;
45 } dcomplex_t;
46 */
47 
48 /*
49 void I_ACCUMULATE(void* scale, int elems, void*src, void* dst)
50 {
51     int j;
52     int *a=(int*)dst, *b=(int*)src;
53     int alpha = *(int*)scale;
54 
55     for(j=0;j<elems;j++) a[j] += alpha*b[j];
56 }
57 */
58 
59 
/*
 * ACCUMULATE(DTYPE, scale, elems, src, dst)
 * Scaled accumulate over `elems` items of type DTYPE:
 *     dst[k] += (*scale) * src[k]
 * Expands to a brace-enclosed block (call sites use it without a trailing
 * semicolon), so it must not be turned into a do/while form.
 */
#define ACCUMULATE( DTYPE, scale, elems, src, dst) {\
    DTYPE *acc_out_ = (DTYPE *)(dst);\
    DTYPE *acc_in_ = (DTYPE *)(src);\
    DTYPE acc_factor_ = *(DTYPE *)(scale);\
    int acc_k_ = 0;\
    while(acc_k_ < (elems)){\
        acc_out_[acc_k_] += acc_factor_*acc_in_[acc_k_];\
        acc_k_++;\
    }\
}
67 
/*
 * ACCUMULATE_RA(DTYPE, elems, src, dst)
 * Element-wise exclusive-or update over `elems` items of type DTYPE:
 *     dst[k] ^= src[k]
 * Expands to a brace-enclosed block; no scale factor is involved.
 */
#define ACCUMULATE_RA( DTYPE, elems, src, dst) {\
    DTYPE *ra_out_ = (DTYPE *)(dst);\
    DTYPE *ra_in_ = (DTYPE *)(src);\
    int ra_k_ = 0;\
    while(ra_k_ < (elems)){\
        ra_out_[ra_k_] ^= ra_in_[ra_k_];\
        ra_k_++;\
    }\
}
74 
/*
 * CPL_ACCUMULATE(DTYPE, scale, elems, src, dst)
 * Complex scaled accumulate over `elems` items.  DTYPE must be a struct with
 * `real` and `imag` members; each destination element is incremented by the
 * complex product (*scale) * src[k].
 * Expands to a brace-enclosed block (used without a trailing semicolon).
 */
#define CPL_ACCUMULATE( DTYPE, scale, elems, src, dst) {\
    DTYPE *cpl_out_ = (DTYPE *)(dst);\
    DTYPE *cpl_in_ = (DTYPE *)(src);\
    DTYPE cpl_factor_ = *(DTYPE *)(scale);\
    int cpl_k_ = 0;\
    while(cpl_k_ < (elems)){\
        cpl_out_[cpl_k_].real += cpl_factor_.real*cpl_in_[cpl_k_].real - cpl_factor_.imag*cpl_in_[cpl_k_].imag;\
        cpl_out_[cpl_k_].imag += cpl_factor_.imag*cpl_in_[cpl_k_].real + cpl_factor_.real*cpl_in_[cpl_k_].imag;\
        cpl_k_++;\
    }\
}
85 
86 extern int* armci_prot_switch_fence;
87 extern int armci_prot_switch_preproc;
88 extern int armci_prot_switch_preop;
89 
90 
/*\ Lock the memory range spanned by a list of scatter targets.
 *
 *  ptr_array  list of segment start addresses (element 0 is read
 *             unconditionally, so the list must not be empty)
 *  len        number of entries in ptr_array
 *  bytes      size in bytes of each segment
 *  proc       process whose memory is locked
 *
 *  The locked range runs from the lowest start address to the last byte of
 *  the segment with the highest start address.
\*/
void armci_lockmem_scatter(void *ptr_array[], int len, int bytes, int proc)
{
     void *lo = ptr_array[0];
     void *hi = ptr_array[0];
     int k;

     /* entry 0 already seeds both bounds, so scanning can start at 1 */
     for(k = 1; k < len; k++){
         lo = ARMCI_MIN(ptr_array[k],lo);
         hi = ARMCI_MAX(ptr_array[k],hi);
     }

     /* extend the upper bound to the last byte of the highest segment */
     hi = (char*)hi + (bytes-1);

     ARMCI_LOCKMEM(lo, hi, proc);
}
110 
111 
112 
/* Number of `unit`-byte elements in a `bytes`-byte segment; reports through
 * armci_die when the segment size is not a whole multiple of the unit. */
static int armci_vacc_elem_count(int bytes, int unit)
{
    int count = bytes/unit;
    if(bytes%unit) armci_die("ARMCI vector accumulate: bytes not consistent with datatype",bytes);
    return count;
}


/*
 * Apply accumulate operation `op` to every (source, destination) segment pair
 * of one vector descriptor.
 *
 *   op      ARMCI_ACC_* code selecting the element type / operation
 *   scale   pointer to the scale factor (not read for ARMCI_ACC_RA)
 *   dsc     descriptor: dsc.ptr_array_len segments of dsc.bytes bytes each
 *   proc    process owning the destination memory (used for locking only)
 *   lockit  when non-zero, the destination range is locked around the update
 *
 * An unsupported `op` is reported through armci_die.
 */
void armci_scatter_acc(int op, void *scale, armci_giov_t dsc,
                                            int proc, int lockit)
{
    const int nseg = dsc.ptr_array_len;
    int seg, count;

    if(lockit){
        armci_lockmem_scatter(dsc.dst_ptr_array, dsc.ptr_array_len,
                              dsc.bytes, proc);
    }

    switch (op){
    case ARMCI_ACC_INT:
        count = armci_vacc_elem_count(dsc.bytes, (int)sizeof(int));
        for(seg = 0; seg < nseg; seg++){
            ACCUMULATE(int, scale, count, dsc.src_ptr_array[seg], dsc.dst_ptr_array[seg])
        }
        break;

    case ARMCI_ACC_LNG:
        count = armci_vacc_elem_count(dsc.bytes, (int)sizeof(long));
        for(seg = 0; seg < nseg; seg++){
            ACCUMULATE(long, scale, count, dsc.src_ptr_array[seg], dsc.dst_ptr_array[seg])
        }
        break;

    case ARMCI_ACC_DBL:
        count = armci_vacc_elem_count(dsc.bytes, (int)sizeof(double));
        for(seg = 0; seg < nseg; seg++){
            ACCUMULATE(double, scale, count, dsc.src_ptr_array[seg], dsc.dst_ptr_array[seg])
        }
        break;

    case ARMCI_ACC_DCP:
        /* double-precision complex: two doubles per element */
        count = armci_vacc_elem_count(dsc.bytes, (int)(2*sizeof(double)));
        for(seg = 0; seg < nseg; seg++){
            CPL_ACCUMULATE(dcomplex_t, scale, count, dsc.src_ptr_array[seg], dsc.dst_ptr_array[seg])
        }
        break;

    case ARMCI_ACC_CPL:
        /* single-precision complex: two floats per element */
        count = armci_vacc_elem_count(dsc.bytes, (int)(2*sizeof(float)));
        for(seg = 0; seg < nseg; seg++){
            CPL_ACCUMULATE(complex_t, scale, count, dsc.src_ptr_array[seg], dsc.dst_ptr_array[seg])
        }
        break;

    case ARMCI_ACC_FLT:
        count = armci_vacc_elem_count(dsc.bytes, (int)sizeof(float));
        for(seg = 0; seg < nseg; seg++){
            ACCUMULATE(float, scale, count, dsc.src_ptr_array[seg], dsc.dst_ptr_array[seg])
        }
        break;

    case ARMCI_ACC_RA:
        /* exclusive-or update on longs; the scale factor is not used */
        count = armci_vacc_elem_count(dsc.bytes, (int)sizeof(long));
        for(seg = 0; seg < nseg; seg++){
            ACCUMULATE_RA(long, count, dsc.src_ptr_array[seg], dsc.dst_ptr_array[seg])
        }
        break;

    default: armci_die("ARMCI vector accumulate: operation not supported",op);
    }

    if(lockit) ARMCI_UNLOCKMEM(proc);
}
188 
189 
190 #ifdef ACC_COPY
191 #  define PWORKLEN 2048
192    static void *pwork[PWORKLEN];  /* work array of pointers */
193 #endif
194 
armci_acc_vector(int op,void * scale,armci_giov_t darr[],int len,int proc)195 int armci_acc_vector(int op,             /* operation code */
196                     void *scale,         /* pointer to scale factor in accumulate */
197                     armci_giov_t darr[], /* descriptor array */
198                     int len,             /* length of descriptor array */
199                     int proc             /* remote process(or) ID */
200               )
201 {
202     int i;
203 
204 #if defined(ACC_COPY)
205     if(proc == armci_me ){
206 #endif
207        for(i = 0; i< len; i++) armci_scatter_acc(op, scale, darr[i], proc, 1);
208 #if defined(ACC_COPY)
209     }else{
210        for(i = 0; i< len; i++){
211            armci_giov_t dr =  darr[i];
212            int j, rc, nb;
213            if(dr.bytes > BUFSIZE/2){
214                /* for large segments use strided implementation */
215                for(j=0; j< dr.ptr_array_len; j++){
216                    rc = armci_acc_copy_strided(op, scale,proc,
217                            dr.src_ptr_array[j], NULL, dr.dst_ptr_array[j],NULL,
218                            &dr.bytes, 0);
219                    if(rc)return(rc);
220                }
221            }else{
222                armci_giov_t dl;
223                /*lock memory:should optimize it to lock only a chunk at a time*/
224                armci_lockmem_scatter(dr.dst_ptr_array, dr.ptr_array_len, dr.bytes, proc);
225                /* copy as many blocks as possible into the local buffer */
226                dl.bytes = dr.bytes;
227                nb = ARMCI_MIN(PWORKLEN,BUFSIZE/dr.bytes);
228                for(j=0; j< dr.ptr_array_len; j+= nb){
229                    int nblocks = ARMCI_MIN(nb, dr.ptr_array_len -j);
230                    int k;
231                    /* setup vector descriptor for remote memory copy
232                       to bring data into buffer*/
233                    dl.ptr_array_len = nblocks;
234                    dl.src_ptr_array = dr.dst_ptr_array + j; /* GET destination becomes source for copy */
235                    for(k=0; k< nblocks; k++) pwork[k] = k*dl.bytes + (char*)armci_internal_buffer;
236                    dl.dst_ptr_array = pwork;
237                    /* get data to the local buffer */
238                    rc = armci_copy_vector(GET, &dl, 1, proc);
239                    if(rc){ ARMCI_UNLOCKMEM(proc); return(rc);}
240                    /* update source array for accumulate */
241                    dl.src_ptr_array = dr.src_ptr_array +j;
242                    /* do scatter accumulate updating copy of data in buffer */
243                    armci_scatter_acc(op, scale, dl, armci_me, 0);
244                    /* modify descriptor-now source becomes destination for PUT*/
245                    dl.dst_ptr_array = dr.dst_ptr_array + j;
246                    dl.src_ptr_array = pwork;
247                    /* put data back */
248                    rc = armci_copy_vector(PUT, &dl, 1, proc);
249                    FENCE_NODE(proc);
250                    if(rc){ ARMCI_UNLOCKMEM(proc); return(rc);}
251                }
252                ARMCI_UNLOCKMEM(proc);
253            }
254        }/*endfor*/
255     }
256 #endif
257 
258     return 0;
259 }
260 
261 
262 
263 
/*
 * Copy every segment described by a descriptor array.
 *
 *   op    PUT or GET (only examined when proc is not on the same cluster node)
 *   darr  descriptor array
 *   len   number of descriptors in darr
 *   proc  process the data is exchanged with
 *
 * When SAMECLUSNODE(proc) holds, segments are moved with armci_copy.
 * Otherwise each segment is transferred with armci_put or armci_get; for PUT
 * the fence state is updated once per descriptor before its segments are
 * issued.  Any other op is reported through armci_die.
 *
 * Always returns 0.
 */
int armci_copy_vector(int op,            /* operation code */
                    armci_giov_t darr[], /* descriptor array */
                    int len,             /* length of descriptor array */
                    int proc             /* remote process(or) ID */
              )
{
    int i,s,shmem= SAMECLUSNODE(proc);
    /* name kept as-is: NOTE(review) the fence macros below may refer to it */
    int armci_th_idx = ARMCI_THREAD_IDX;

    if(shmem){
      /* local/shared memory copy */
      for(i = 0; i < len; i++){
        armci_giov_t *vec = &darr[i];
        for(s = 0; s < vec->ptr_array_len; s++){
           armci_copy(vec->src_ptr_array[s],vec->dst_ptr_array[s],vec->bytes);
        }
      }
      return 0;
    }

    if(op == PUT){
      for(i = 0; i < len; i++){
        armci_giov_t *vec = &darr[i];

        UPDATE_FENCE_STATE(proc, PUT, vec->ptr_array_len);

        for(s = 0; s < vec->ptr_array_len; s++){
            armci_put(vec->src_ptr_array[s],vec->dst_ptr_array[s],
                      vec->bytes, proc);
        }
      }
    }else if(op == GET){
      for(i = 0; i < len; i++){
        armci_giov_t *vec = &darr[i];
        for(s = 0; s < vec->ptr_array_len; s++){
            armci_get(vec->src_ptr_array[s],vec->dst_ptr_array[s],
                      vec->bytes,proc);
        }
      }
    }else{
      armci_die("armci_copy_vector: wrong optype",op);
    }

    return 0;
}
310 
311 
/*
 * Pack the source segments of a descriptor array back to back into buf.
 * Segments are written in descriptor order, then segment order; the caller
 * must supply a buffer large enough for all of them.
 */
void armci_vector_to_buf(armci_giov_t darr[], int len, void* buf)
{
    char *out = (char*)buf;
    int d;

    for(d = 0; d < len; d++){
        int seg;
        for(seg = 0; seg < darr[d].ptr_array_len; seg++){
            armci_copy(darr[d].src_ptr_array[seg],out,darr[d].bytes);
            out += darr[d].bytes;   /* next free position in the buffer */
        }
    }
}
323 
324 
/*
 * Unpack a contiguous buffer into the destination segments of a descriptor
 * array.  Data is consumed in descriptor order, then segment order, i.e. the
 * layout produced by armci_vector_to_buf.
 */
void armci_vector_from_buf(armci_giov_t darr[], int len, void* buf)
{
    char *in = (char*)buf;
    int d;

    for(d = 0; d < len; d++){
        int seg;
        for(seg = 0; seg < darr[d].ptr_array_len; seg++){
            armci_copy(in, darr[d].dst_ptr_array[seg],darr[d].bytes);
            in += darr[d].bytes;   /* next unread position in the buffer */
        }
    }
}
337 
/*
 * Blocking vector put.
 *
 * Returns 0 on success, otherwise:
 *   FAIL   len < 1
 *   FAIL2  a descriptor has a NULL source or destination pointer list
 *   FAIL3  a descriptor has bytes < 1
 *   FAIL4  a descriptor has ptr_array_len < 1
 *   FAIL5  proc is outside [0, armci_nproc)
 *   FAIL6  the underlying transfer returned non-zero
 */
int PARMCI_PutV( armci_giov_t darr[], /* descriptor array */
                int len,  /* length of descriptor array */
                int proc  /* remote process(or) ID */
              )
{
    int status = 0, d, same_node = 1;

    /* argument validation */
    if(len < 1) return FAIL;
    for(d = 0; d < len; d++){
        const armci_giov_t *v = &darr[d];
        if(v->src_ptr_array == NULL || v->dst_ptr_array == NULL) return FAIL2;
        if(v->bytes < 1) return FAIL3;
        if(v->ptr_array_len < 1) return FAIL4;
    }
    if(proc < 0 || proc >= armci_nproc) return FAIL5;

    ORDER(PUT,proc); /* ensure ordering */
    same_node = SAMECLUSNODE(proc);

    if(!same_node){
        /* target on another node: pack the request and send it */
        DO_FENCE(proc,SERVER_PUT);
        status = armci_pack_vector(PUT, NULL, darr, len, proc,NULL);
    }else{
        /* Keep this exact unbraced statement form: how much of DO_FENCE the
           condition governs depends on the macro's definition. */
        if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_PUT);
        status = armci_copy_vector(PUT, darr, len, proc);
    }

    return status ? FAIL6 : 0;
}
369 
370 
/*
 * Blocking vector get.
 *
 * Unless QUADRICS is defined, the direct path is taken only when proc is on
 * the same cluster node; with QUADRICS it is always taken.
 *
 * Returns 0 on success, otherwise:
 *   FAIL   len < 1
 *   FAIL2  a descriptor has a NULL source or destination pointer list
 *   FAIL3  a descriptor has bytes < 1
 *   FAIL4  a descriptor has ptr_array_len < 1
 *   FAIL5  proc is outside [0, armci_nproc)
 *   FAIL6  the underlying transfer returned non-zero
 */
int PARMCI_GetV( armci_giov_t darr[], /* descriptor array */
                int len,  /* length of descriptor array */
                int proc  /* remote process(or) ID */
              )
{
    int status = 0, d, use_direct = 1;

    /* argument validation */
    if(len < 1) return FAIL;
    for(d = 0; d < len; d++){
        const armci_giov_t *v = &darr[d];
        if(v->src_ptr_array == NULL || v->dst_ptr_array == NULL) return FAIL2;
        if(v->bytes < 1) return FAIL3;
        if(v->ptr_array_len < 1) return FAIL4;
    }
    if(proc < 0 || proc >= armci_nproc) return FAIL5;

    ORDER(GET,proc); /* ensure ordering */
#ifndef QUADRICS
    use_direct = SAMECLUSNODE(proc);
#endif

    if(!use_direct){
        DO_FENCE(proc,SERVER_GET);
        status = armci_pack_vector(GET, NULL, darr, len, proc,NULL);
    }else{
        /* Keep this exact unbraced statement form: how much of DO_FENCE the
           condition governs depends on the macro's definition. */
        if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_GET);
        status = armci_copy_vector(GET, darr, len, proc);
    }

    return status ? FAIL6 : 0;
}
404 
405 
406 
407 
/*
 * Blocking vector accumulate.
 *
 * Targets on the same cluster node are handled by armci_acc_vector; all
 * others are packed and sent with armci_pack_vector.
 *
 * Returns 0 on success, otherwise:
 *   FAIL   len < 1
 *   FAIL2  a descriptor has a NULL source or destination pointer list
 *   FAIL3  a descriptor has bytes < 1
 *   FAIL4  a descriptor has ptr_array_len < 1
 *   FAIL5  proc is outside [0, armci_nproc)
 *   FAIL6  the underlying operation returned non-zero
 */
int PARMCI_AccV( int op,              /* operation code */
                void *scale,         /* scaling factor for accumulate */
                armci_giov_t darr[], /* descriptor array */
                int len,             /* length of descriptor array */
                int proc             /* remote process(or) ID */
              )
{
    int status = 0, d, same_node = 0;

    /* argument validation */
    if(len < 1) return FAIL;
    for(d = 0; d < len; d++){
        const armci_giov_t *v = &darr[d];
        if(v->src_ptr_array == NULL || v->dst_ptr_array == NULL) return FAIL2;
        if(v->bytes < 1) return FAIL3;
        if(v->ptr_array_len < 1) return FAIL4;
    }
    if(proc < 0 || proc >= armci_nproc) return FAIL5;

    ORDER(op,proc); /* ensure ordering */
    same_node = SAMECLUSNODE(proc);
    /* NOTE(review): this configuration is rejected at compile time by the
       #error below; presumably deliberate - confirm before removing. */
#   if defined(ACC_COPY) && !defined(ACC_SMP)
       if(armci_me != proc) same_node=0;
#      error "grrr"
#   endif

    if(!same_node){
        DO_FENCE(proc,SERVER_PUT);
        status = armci_pack_vector(op, scale, darr, len, proc,NULL);
    }else{
        status = armci_acc_vector( op, scale, darr, len, proc);
    }

    return status ? FAIL6 : 0;
}
442 
443 
444 /*****************************************************************************/
445 
446 /*\ Non-blocking vector API
447 \*/
/*
 * Non-blocking vector put.
 *
 * usr_hdl may be NULL, in which case an implicit handle is obtained.  If the
 * handle is flagged for aggregation and proc is not on the same cluster node,
 * the descriptors are only saved in the handle and the result of
 * armci_agg_save_giov_descriptor is returned directly.
 *
 * Otherwise returns 0 on success, or:
 *   FAIL   len < 1
 *   FAIL2  a descriptor has a NULL source or destination pointer list
 *   FAIL3  a descriptor has bytes < 1
 *   FAIL4  a descriptor has ptr_array_len < 1
 *   FAIL5  proc is outside [0, armci_nproc)
 *   FAIL6  the underlying transfer returned non-zero
 */
int PARMCI_NbPutV( armci_giov_t darr[], /* descriptor array */
                int len,  /* length of descriptor array */
                int proc, /* remote process(or) ID */
                armci_hdl_t* usr_hdl  /*non-blocking request handle*/
              )
{
    armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl;
    int status = 0, d, same_node = 1;

    /* argument validation */
    if(len < 1) return FAIL;
    for(d = 0; d < len; d++){
        const armci_giov_t *v = &darr[d];
        if(v->src_ptr_array == NULL || v->dst_ptr_array == NULL) return FAIL2;
        if(v->bytes < 1) return FAIL3;
        if(v->ptr_array_len < 1) return FAIL4;
    }
    if(proc < 0 || proc >= armci_nproc) return FAIL5;

    same_node = SAMECLUSNODE(proc);

    if(nb_handle && nb_handle->agg_flag == SET){
        /* aggregate put: off-node requests are only recorded for later */
        if(!same_node){
            return armci_agg_save_giov_descriptor(darr, len, proc, PUT, nb_handle);
        }
    }else{
        /*ORDER(PUT,proc);  ensure ordering */
        UPDATE_FENCE_INFO(proc);

        /* set tag and op in the nb handle, or fall back to an implicit one */
        if(nb_handle == NULL){
            nb_handle = armci_set_implicit_handle(PUT, proc);
        }else{
            nb_handle->tag   = GET_NEXT_NBTAG();
            nb_handle->op    = PUT;
            nb_handle->proc  = proc;
            nb_handle->bufid = NB_NONE;
        }
    }

    if(!same_node){
        DO_FENCE(proc,SERVER_NBPUT);
        status = armci_pack_vector(PUT, NULL, darr, len, proc,nb_handle);
    }else{
        /* Keep this exact unbraced statement form: how much of DO_FENCE the
           condition governs depends on the macro's definition. */
        if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_PUT);
        status = armci_copy_vector(PUT, darr, len, proc);
    }

    return status ? FAIL6 : 0;
}
502 
/*
 * Non-blocking vector get.
 *
 * usr_hdl may be NULL, in which case an implicit handle is obtained.  If the
 * handle is flagged for aggregation and proc is not on the same cluster node,
 * the descriptors are only saved in the handle and the result of
 * armci_agg_save_giov_descriptor is returned directly.
 *
 * Otherwise returns 0 on success, or:
 *   FAIL   len < 1
 *   FAIL2  a descriptor has a NULL source or destination pointer list
 *   FAIL3  a descriptor has bytes < 1
 *   FAIL4  a descriptor has ptr_array_len < 1
 *   FAIL5  proc is outside [0, armci_nproc)
 *   FAIL6  the underlying transfer returned non-zero
 */
int PARMCI_NbGetV( armci_giov_t darr[], /* descriptor array */
                int len,  /* length of descriptor array */
                int proc, /* remote process(or) ID */
                armci_hdl_t* usr_hdl  /*non-blocking request handle*/
              )
{
    armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl;
    int status = 0, d, same_node = 1;

    /* argument validation */
    if(len < 1) return FAIL;
    for(d = 0; d < len; d++){
        const armci_giov_t *v = &darr[d];
        if(v->src_ptr_array == NULL || v->dst_ptr_array == NULL) return FAIL2;
        if(v->bytes < 1) return FAIL3;
        if(v->ptr_array_len < 1) return FAIL4;
    }
    if(proc < 0 || proc >= armci_nproc) return FAIL5;

    same_node = SAMECLUSNODE(proc);

    if(nb_handle && nb_handle->agg_flag == SET){
        /* aggregate get: off-node requests are only recorded for later */
        if(!same_node){
            return armci_agg_save_giov_descriptor(darr, len, proc, GET, nb_handle);
        }
    }else{
        /* ORDER(GET,proc); ensure ordering */
        if(nb_handle == NULL){
            nb_handle = armci_set_implicit_handle(GET, proc);
        }else{
            nb_handle->tag   = GET_NEXT_NBTAG();
            nb_handle->op    = GET;
            nb_handle->proc  = proc;
            nb_handle->bufid = NB_NONE;
        }
    }

    if(!same_node){
        DO_FENCE(proc,SERVER_NBGET);
        status = armci_pack_vector(GET, NULL, darr, len, proc,nb_handle);
    }else{
        /* Keep this exact unbraced statement form: how much of DO_FENCE the
           condition governs depends on the macro's definition. */
        if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_GET);
        status = armci_copy_vector(GET, darr, len, proc);
    }

    return status ? FAIL6 : 0;
}
554 
555 
/*
 * Non-blocking vector accumulate.
 *
 * usr_hdl may be NULL, in which case an implicit handle is obtained.
 * Targets on the same cluster node are handled by armci_acc_vector; all
 * others are packed and sent with armci_pack_vector.  When ACC_COPY is
 * defined without ACC_SMP, only the calling process itself takes the
 * armci_acc_vector path.
 *
 * Returns 0 on success, otherwise:
 *   FAIL   len < 1
 *   FAIL2  a descriptor has a NULL source or destination pointer list
 *   FAIL3  a descriptor has bytes < 1
 *   FAIL4  a descriptor has ptr_array_len < 1
 *   FAIL5  proc is outside [0, armci_nproc)
 *   FAIL6  the underlying operation returned non-zero
 */
int PARMCI_NbAccV( int op,              /* operation code */
                void *scale,         /* scaling factor for accumulate */
                armci_giov_t darr[], /* descriptor array */
                int len,             /* length of descriptor array */
                int proc,            /* remote process(or) ID */
                armci_hdl_t* usr_hdl  /*non-blocking request handle*/
              )
{
    armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl;
    int status = 0, d, same_node = 1;

    /* argument validation */
    if(len < 1) return FAIL;
    for(d = 0; d < len; d++){
        const armci_giov_t *v = &darr[d];
        if(v->src_ptr_array == NULL || v->dst_ptr_array == NULL) return FAIL2;
        if(v->bytes < 1) return FAIL3;
        if(v->ptr_array_len < 1) return FAIL4;
    }
    if(proc < 0 || proc >= armci_nproc) return FAIL5;

    UPDATE_FENCE_INFO(proc);
    same_node = SAMECLUSNODE(proc);

    /* set tag and op in the nb handle, or fall back to an implicit one */
    if(nb_handle == NULL){
        nb_handle = armci_set_implicit_handle(op, proc);
    }else{
        nb_handle->tag   = GET_NEXT_NBTAG();
        nb_handle->op    = op;
        nb_handle->proc  = proc;
        nb_handle->bufid = NB_NONE;
    }

#   if defined(ACC_COPY) && !defined(ACC_SMP)
       if(armci_me != proc) same_node=0;
#   endif

    if(!same_node){
        DO_FENCE(proc,SERVER_NBPUT);
        status = armci_pack_vector(op, scale, darr, len, proc,nb_handle);
    }else{
        status = armci_acc_vector( op, scale, darr, len, proc);
    }

    return status ? FAIL6 : 0;
}
603 /*****************************************************************************/
604