1 #if HAVE_CONFIG_H
2 #   include "config.h"
3 #endif
4 
5 /* $Id$ */
6 
7 #if HAVE_STDIO_H
8 #   include <stdio.h>
9 #endif
10 #if HAVE_STDLIB_H
11 #   include <stdlib.h>
12 #endif
13 #if HAVE_UNISTD_H
14 #   include <unistd.h>
15 #elif HAVE_WINDOWS_H
16 #   include <windows.h>
17 #   define sleep(x) Sleep(1000*(x))
18 #endif
19 #if HAVE_ASSERT_H
20 #   include <assert.h>
21 #endif
22 
23 #include "armci.h"
24 #include "message.h"
25 
26 #define DIM1 5
27 #define DIM2 3
28 #ifdef __sun
29 /* Solaris has shared memory shortages in the default system configuration */
30 # define DIM3 6
31 # define DIM4 5
32 # define DIM5 4
33 #elif defined(__alpha__)
34 # define DIM3 8
35 # define DIM4 5
36 # define DIM5 6
37 #else
38 # define DIM3 8
39 # define DIM4 9
40 # define DIM5 7
41 #endif
42 #define DIM6 3
43 #define DIM7 2
44 
45 
46 #define OFF 1
47 #define EDIM1 (DIM1+OFF)
48 #define EDIM2 (DIM2+OFF)
49 #define EDIM3 (DIM3+OFF)
50 #define EDIM4 (DIM4+OFF)
51 #define EDIM5 (DIM5+OFF)
52 #define EDIM6 (DIM6+OFF)
53 #define EDIM7 (DIM7+OFF)
54 
55 #define DIMS 4
56 #define MAXDIMS 7
57 #define MAX_DIM_VAL 50
58 #define LOOP 200
59 
60 #define BASE 100.
61 #define MAXPROC 128
62 #define TIMES 100
63 
64 #ifdef CRAY
65 # define ELEMS 800
66 #else
67 # define ELEMS 200
68 #endif
69 
70 /***************************** macros ************************/
71 #define COPY(src, dst, bytes) memcpy((dst),(src),(bytes))
72 #define ARMCI_MAX(a,b) (((a) >= (b)) ? (a) : (b))
73 #define ARMCI_MIN(a,b) (((a) <= (b)) ? (a) : (b))
74 #define ARMCI_ABS(a) (((a) <0) ? -(a) : (a))
75 
76 #define ROW      65536
77 #define COL      ROW   /* square matrices only for the time being */
78 
79 /***************************** global data *******************/
80 int me, nproc;
81 short int fortran_indexing=0;
82 static int proc_row_list[MAXPROC];/*no of rows owned by each process - accumulated*/
83 static int proc_nz_list[MAXPROC]; /*no of non-zeros owned by each process */
84 
85 #ifdef MSG_COMMS_PVM
pvm_init(int argc,char * argv[])86 void pvm_init(int argc, char *argv[])
87 {
88     int mytid, mygid, ctid[MAXPROC];
89     int np, i;
90 
91     mytid = pvm_mytid();
92     if((argc != 2) && (argc != 1)) goto usage;
93     if(argc == 1) np = 1;
94     if(argc == 2)
95         if((np = atoi(argv[1])) < 1) goto usage;
96     if(np > MAXPROC) goto usage;
97 
98     mygid = pvm_joingroup(MPGROUP);
99 
100     if(np > 1)
101         if (mygid == 0)
102             i = pvm_spawn(argv[0], argv+1, 0, "", np-1, ctid);
103 
104     while(pvm_gsize(MPGROUP) < np) sleep(1);
105 
106     /* sync */
107     pvm_barrier(MPGROUP, np);
108 
109     printf("PVM initialization done!\n");
110 
111     return;
112 
113 usage:
114     fprintf(stderr, "usage: %s <nproc>\n", argv[0]);
115     pvm_exit();
116     exit(-1);
117 }
118 #endif
119 
create_array(void * a[],int elem_size,int ndim,int dims[])120 void create_array(void *a[], int elem_size, int ndim, int dims[])
121 {
122      int bytes=elem_size, i, rc;
123 
124      assert(ndim<=MAXDIMS);
125      for(i=0;i<ndim;i++)bytes*=dims[i];
126 
127      rc = ARMCI_Malloc(a, bytes);
128      assert(rc==0);
129 
130      assert(a[me]);
131 
132 }
133 
destroy_array(void * ptr[])134 void destroy_array(void *ptr[])
135 {
136     int check;
137 
138     armci_msg_barrier();
139     check = !ARMCI_Free(ptr[me]);
140     assert(check);
141 }
142 
verify_list(int * proc_row_list)143 static void verify_list(int *proc_row_list) {
144   int i;
145   printf("\nVERIFY: %d: No of rows = %d\n\n", 0, proc_row_list[0]);
146   for(i=1; i<nproc; i++)
147     printf("\nVERIFY: %d: No of rows = %d\n\n", i, proc_row_list[i]-proc_row_list[i-1]);
148   fflush(stdout);
149 }
150 
load_balance(int n,int non_zero,int * row_ind_tmp)151 static void load_balance(int n, int non_zero, int *row_ind_tmp) {
152 
153   int proc_id, i, local_nz, local_nz_acc, A, B;
154 
155   local_nz = local_nz_acc = non_zero/nproc;
156 
157   /* number of rows owned by each process is stored in proc_row_list. This
158      is supposed to be well load balanced, so that each process has almost
159      same number of non-zero elements */
160   proc_id = 0;
161   if(me==0) printf("local_nz = %d\n", local_nz);
162   for(i=0; i<n; i++) { /* as # of entries in row_ind_tmp = n+1 */
163     if(row_ind_tmp[i] < local_nz_acc && row_ind_tmp[i+1] >= local_nz_acc) {
164       proc_row_list[proc_id++] = i+1;
165       local_nz_acc = local_nz*(proc_id+1);
166       if(proc_id == nproc-1) local_nz_acc = non_zero;
167       if(me==0 && proc_id<nproc) printf("local_nz = %d\n", local_nz_acc);
168     }
169   }
170 
171   proc_row_list[nproc-1] = n;
172 
173   for(i=0; i<nproc; i++) {
174     A = (i==0) ? 0: proc_row_list[i-1];/* # of entries in row_ind_tmp is n+1*/
175     B = proc_row_list[i];
176     proc_nz_list[i] = row_ind_tmp[B]-row_ind_tmp[A];
177   }
178 
179   if(proc_id != nproc)
180     ARMCI_Error("Error while preparing Process Row list", proc_id-1);
181 
182 #if 1
183   if(me==0) verify_list(proc_row_list);
184 #endif
185 
186 }
187 
sparse_initialize(int * n,int * non_zero,int ** row_ind,int ** col_ind,double ** values,double ** vec,double ** svec)188 static int sparse_initialize(int *n, int *non_zero, int **row_ind,
189                  int **col_ind, double **values, double **vec,
190                  double **svec) {
191 
192   int i, j, rc, max, *row_ind_tmp=NULL, *tmp_indices=NULL;
193   double *tmp_values=NULL;
194   unsigned long len;
195   FILE *fp=NULL;
196 
197   /* Broadcast order of matrix */
198   if(me==0) {
199     if((fp=fopen("Sparse-MPI/av41092.rua.data", "r")) == NULL)
200       ARMCI_Error("Error: Input file not found", me);
201     fortran_indexing = 1; /* This is 1 for Harwell-Boeing format matrices */
202     fscanf(fp, "%d", n);
203     if(*n%nproc)
204       ARMCI_Error("# of rows is not divisible by # of processors", nproc);
205     if(*n > ROW)
206       ARMCI_Error("order is greater than defined variable ROW", ROW);
207   }
208   len = sizeof(int);
209   armci_msg_brdcst(n, len, 0);
210 
211   /* Broad cast number of non_zeros */
212   if(me==0) fscanf(fp, "%d", non_zero);
213   armci_msg_brdcst(non_zero, len, 0);
214 
215   /* Broadcast row indices */
216   len = (*n+1)*sizeof(int);
217   row_ind_tmp = (int *)malloc(len);
218   if(me==0)for(i=0; i<*n+1; i++) {
219     fscanf(fp, "%d", &row_ind_tmp[i]);
220     if(fortran_indexing) --row_ind_tmp[i];
221   }
222   armci_msg_brdcst(row_ind_tmp, len, 0);
223 
224   load_balance(*n, *non_zero, row_ind_tmp);
225 
226   /* find how much temporary storage is needed at the maximum */
227   if(me==0) {
228     for(max=-1,j=0;j<nproc;j++) if(max<proc_nz_list[j]) max=proc_nz_list[j];
229     if(max<0) ARMCI_Error(" max cannot be negative", max);
230   }
231 
232   /* Broadcast the maximum number of elements */
233   len = sizeof(int);
234   armci_msg_brdcst(&max, len, 0);
235 
236   /* create the Sparse MAtrix Array */
237   if(me==0) printf("  Creating ValueArray (CompressedSparseMatrix) ...\n\n");
238   create_array((void**)col_ind, sizeof(int), 1, &max);
239 
240   /* create the column subscript array */
241   if(me==0) printf("  Creating Column Subscript Array ... \n\n");
242   create_array((void**)values, sizeof(double), 1, &max);
243 
244   /* create the x-vector and the solution vector */
245   if(me==0) printf("  Creating Vectors ... \n\n");
246   create_array((void**)vec,  sizeof(double),1, &max);
247   create_array((void**)svec, sizeof(double),1, &max);
248   armci_msg_barrier();
249 
250 
251   /* Process 0 distributes the column indices and non_zero values to
252      respective processors*/
253   if(me == 0) {
254     tmp_indices = (int *)malloc(max*sizeof(int));
255     tmp_values  = (double *)malloc(max*sizeof(double));
256 
257     for(j=0; j<nproc; j++) {
258       for(i=0; i<proc_nz_list[j]; i++) {
259     fscanf(fp, "%d", &tmp_indices[i]);
260     if(fortran_indexing) --tmp_indices[i];
261       }
262       /* rc = fread(tmp_indices, sizeof(int), proc_nz_list[j], fp); */
263       if((rc=ARMCI_Put(tmp_indices, col_ind[j], proc_nz_list[j]*sizeof(int), j)))
264     ARMCI_Error("armci_nbput failed\n",rc);
265     }
266     for(j=0; j<nproc; j++) {
267       for(i=0; i<proc_nz_list[j]; i++) fscanf(fp, "%lf", &tmp_values[i]);
268       if((rc=ARMCI_Put(tmp_values, values[j], proc_nz_list[j]*sizeof(double), j)))
269     ARMCI_Error("armci_nbput failed\n",rc);
270     }
271   }
272   ARMCI_AllFence(); armci_msg_barrier();ARMCI_AllFence();
273 
274   /* initializing x-vector */
275   if(me==0) for(i=0;i<proc_nz_list[me]; i++) vec[me][i] = (i+1);
276   else for(i=0;i<proc_nz_list[me];i++) vec[me][i]=me*proc_nz_list[me-1]+(i+1);
277 
278 #if 0
279   if(me==0) {
280     printf("max = %d\n", max);
281     for(i=0; i<max; i++)  printf("%.1f ", values[me][i]);
282     printf("\n");
283   }
284 #endif
285 
286   *row_ind = row_ind_tmp;
287   if(me==0) {
288     free(tmp_indices);
289     free(tmp_values);
290     fclose(fp);
291   }
292   return 0;
293 }
294 
compare(const void * p1,const void * p2)295 static int compare(const void *p1, const void *p2) {
296   int i = *((int *)p1);
297   int j = *((int *)p2);
298 
299   if (i > j) return (1);
300   if (i < j) return (-1);
301   return (0);
302 }
303 
304 static int count = -1;
305 static armci_hdl_t gHandle[MAXPROC];
306 static int prev_proc = -1;
307 
get_data(int n,int start,int end,double * vec_local,double ** vec)308 static void get_data(int n, int start, int end, double *vec_local,
309             double **vec) {
310   int i, j, rc, bytes, offset;
311   int proc_start, proc_end, idx_start, idx_end;
312 
313   proc_start = proc_end = -1;
314   for(i=0; i<nproc; i++) {
315     if(proc_start<0 && proc_row_list[i]>start) proc_start = i;
316     if(proc_end<0 && proc_row_list[i]>end) proc_end = i;
317   }
318   if(proc_start<0 || proc_end<0) ARMCI_Error("Invalid Process Ids", -1);
319 
320   for(i=proc_start; i<=proc_end; i++) {
321     if(i==proc_start) idx_start = start;
322     else { if(i==0) idx_start=0; else idx_start = proc_row_list[i-1];}
323     if(i==proc_end) idx_end = end;
324     else idx_end = proc_row_list[i]-1;
325 
326     if(i!=prev_proc) {
327       ++count;   prev_proc = i;
328       ARMCI_INIT_HANDLE(&gHandle[count]);
329       ARMCI_SET_AGGREGATE_HANDLE(&gHandle[count]);
330     }
331 
332     if(i==0) offset=0; else offset = proc_row_list[i-1];
333     if(i==me) { /* local */
334       for(j=idx_start; j<=idx_end; j++) vec_local[j] = vec[me][j-offset];
335     }
336     else {     /* remote */
337       bytes = (idx_end-idx_start+1)*sizeof(double);
338       vec_local[idx_start] = -1;
339 #if 0
340       if((rc=ARMCI_Get(&vec[i][idx_start-offset], &vec_local[idx_start],
341                bytes, i)))
342 #else
343       if((rc=ARMCI_NbGet(&vec[i][idx_start-offset], &vec_local[idx_start],
344                bytes, i, &gHandle[count])))
345 #endif
346     ARMCI_Error("armci_nbget failed\n",rc);
347     }
348   }
349 }
350 
sparse_multiply(int n,int non_zero,int * row_ind,int ** col_ind,double ** values,double ** vec,double ** svec)351 static void sparse_multiply(int n, int non_zero, int *row_ind, int **col_ind,
352             double **values, double **vec, double **svec) {
353 
354   int i, j, k, num_elements, offset, *tmp_indices;
355   double start_time, comm_time, comp_time, v, vec_local[COL];
356   int start, end, prev, nrows, idx;
357 
358 #if 0
359   /* ---- Sequential Case ----  */
360    for(i=0; i<n; i++) {
361      svec[me][i] = 0;
362      for(k=row_ind[i]; k<row_ind[i+1]; k++) {
363        j = col_ind[me][k];
364        v = values[me][k];
365        svec[me][i] += v*vec[me][j];
366        printf("%.1f %.1f\n", v, vec[me][j]);
367      }
368    }
369    for(i=0; i<n; i++) printf("%.1f ", svec[me][i]);
370    printf("\n");
371 #else
372 
373   num_elements = proc_nz_list[me];
374   printf("num_elements = %d\n", num_elements);
375   tmp_indices = (int *)malloc(num_elements*sizeof(int));
376   for(i=0; i<num_elements; i++) tmp_indices[i] = col_ind[me][i];
377   qsort(tmp_indices, num_elements, sizeof(int), compare);
378 
379   start_time = armci_timer();
380 
381   /* get the required portion of vector you need to local array */
382   start = prev = tmp_indices[0];
383   for(i=1; i<num_elements; i++) {
384     if(tmp_indices[i]>prev+1) {
385       end = prev;
386       get_data(n, start, end, vec_local, vec);
387       start = prev = tmp_indices[i];
388     }
389     else prev = tmp_indices[i];
390   }
391   get_data(n, start, prev, vec_local, vec);
392 
393 #if 1
394   if(count>=0) for(i=0; i<=count; i++) ARMCI_Wait(&gHandle[i]);
395 #endif
396 
397   comm_time = armci_timer() - start_time;
398   start_time = armci_timer();
399 
400   /* Perform Matrix-Vector multiply and store the result in
401      solution vector - "svec[]" */
402 
403   if(me==0) { nrows = proc_row_list[me]; offset = row_ind[0]; }
404   else {
405     nrows = proc_row_list[me]-proc_row_list[me-1];
406     offset = row_ind[proc_row_list[me-1]];
407   }
408   /* printf("%d: My total Work = %d\n", me, nrows); */
409 
410   for(i=0; i<nrows; i++) { /* loop over rows owned by me */
411     svec[me][i] = 0;
412     if(me==0) idx = i; else idx = proc_row_list[me-1] + i;
413     for(k=row_ind[idx]; k<row_ind[idx+1]; k++) {
414       j = col_ind[me][k-offset];
415       v = values[me][k-offset];
416       svec[me][i] += v*vec_local[j];
417     }
418   }
419   comp_time = armci_timer()-start_time;
420   printf("%d: %f + %f = %f  (count = %d)\n", me, comm_time, comp_time,
421      comm_time+comp_time, count+1);
422 #endif
423 }
424 
gather_solution_vector(double ** svec)425 static void gather_solution_vector(double **svec) {
426 #if 0
427   double y[COL];
428   if((rc=ARMCI_Get(&vec[i][idx_start-offset], &vec_local[idx_start],
429            bytes, i)))
430     ARMCI_Error("armci_nbget failed\n",rc);
431 #endif
432 }
433 
test_sparse()434 static void test_sparse() {
435 
436     int *col_ind[MAXPROC];
437     double *values[MAXPROC], *vec[MAXPROC], *svec[MAXPROC]/*, start_time*/;
438     int n, non_zero, *row_ind;
439 
440     sparse_initialize(&n, &non_zero, &row_ind, col_ind, values, vec, svec);
441     armci_msg_barrier();
442 
443     /*start_time = armci_timer();*/
444     sparse_multiply(n, non_zero, row_ind, col_ind, values, vec, svec);
445     /* printf("%d: Timetaken = %f\n", me, armci_timer()-start_time); */
446     armci_msg_barrier();
447 
448     if(me==0) gather_solution_vector(svec);
449 
450     if(me==0){printf("O.K.\n"); fflush(stdout);}
451     destroy_array((void **)vec);
452 }
453 
454 
main(int argc,char * argv[])455 int main(int argc, char* argv[])
456 {
457 
458     armci_msg_init(&argc, &argv);
459     nproc = armci_msg_nproc();
460     me = armci_msg_me();
461 
462 /*    printf("nproc = %d, me = %d\n", nproc, me);*/
463 
464     if(nproc>MAXPROC && me==0)
465        ARMCI_Error("Test works for up to %d processors\n",MAXPROC);
466 
467     if(me==0){
468        printf("ARMCI test program (%d processes)\n",nproc);
469        fflush(stdout);
470        sleep(1);
471     }
472 
473     ARMCI_Init();
474 
475     if(me==0){
476       printf("\n  Performing Sparse Matrix-Vector Multiplication ...\n\n");
477       fflush(stdout);
478     }
479     test_sparse();
480 
481     ARMCI_AllFence();
482     armci_msg_barrier();
483     if(me==0){printf("\nSuccess!!\n"); fflush(stdout);}
484     sleep(2);
485 
486     armci_msg_barrier();
487     ARMCI_Finalize();
488     armci_msg_finalize();
489     return(0);
490 }
491