1 #if HAVE_CONFIG_H
2 # include "config.h"
3 #endif
4
5 /* $Id$ */
6
7 #if HAVE_STDIO_H
8 # include <stdio.h>
9 #endif
10 #if HAVE_STDLIB_H
11 # include <stdlib.h>
12 #endif
13 #if HAVE_UNISTD_H
14 # include <unistd.h>
15 #elif HAVE_WINDOWS_H
16 # include <windows.h>
17 # define sleep(x) Sleep(1000*(x))
18 #endif
19 #if HAVE_ASSERT_H
20 # include <assert.h>
21 #endif
22
23 #include "armci.h"
24 #include "message.h"
25
26 #define DIM1 5
27 #define DIM2 3
28 #ifdef __sun
29 /* Solaris has shared memory shortages in the default system configuration */
30 # define DIM3 6
31 # define DIM4 5
32 # define DIM5 4
33 #elif defined(__alpha__)
34 # define DIM3 8
35 # define DIM4 5
36 # define DIM5 6
37 #else
38 # define DIM3 8
39 # define DIM4 9
40 # define DIM5 7
41 #endif
42 #define DIM6 3
43 #define DIM7 2
44
45
46 #define OFF 1
47 #define EDIM1 (DIM1+OFF)
48 #define EDIM2 (DIM2+OFF)
49 #define EDIM3 (DIM3+OFF)
50 #define EDIM4 (DIM4+OFF)
51 #define EDIM5 (DIM5+OFF)
52 #define EDIM6 (DIM6+OFF)
53 #define EDIM7 (DIM7+OFF)
54
55 #define DIMS 4
56 #define MAXDIMS 7
57 #define MAX_DIM_VAL 50
58 #define LOOP 200
59
60 #define BASE 100.
61 #define MAXPROC 128
62 #define TIMES 100
63
64 #ifdef CRAY
65 # define ELEMS 800
66 #else
67 # define ELEMS 200
68 #endif
69
70 /***************************** macros ************************/
71 #define COPY(src, dst, bytes) memcpy((dst),(src),(bytes))
72 #define ARMCI_MAX(a,b) (((a) >= (b)) ? (a) : (b))
73 #define ARMCI_MIN(a,b) (((a) <= (b)) ? (a) : (b))
74 #define ARMCI_ABS(a) (((a) <0) ? -(a) : (a))
75
76 #define ROW 65536
77 #define COL ROW /* square matrices only for the time being */
78
79 /***************************** global data *******************/
80 int me, nproc;
81 short int fortran_indexing=0;
82 static int proc_row_list[MAXPROC];/*no of rows owned by each process - accumulated*/
83 static int proc_nz_list[MAXPROC]; /*no of non-zeros owned by each process */
84
85 #ifdef MSG_COMMS_PVM
pvm_init(int argc,char * argv[])86 void pvm_init(int argc, char *argv[])
87 {
88 int mytid, mygid, ctid[MAXPROC];
89 int np, i;
90
91 mytid = pvm_mytid();
92 if((argc != 2) && (argc != 1)) goto usage;
93 if(argc == 1) np = 1;
94 if(argc == 2)
95 if((np = atoi(argv[1])) < 1) goto usage;
96 if(np > MAXPROC) goto usage;
97
98 mygid = pvm_joingroup(MPGROUP);
99
100 if(np > 1)
101 if (mygid == 0)
102 i = pvm_spawn(argv[0], argv+1, 0, "", np-1, ctid);
103
104 while(pvm_gsize(MPGROUP) < np) sleep(1);
105
106 /* sync */
107 pvm_barrier(MPGROUP, np);
108
109 printf("PVM initialization done!\n");
110
111 return;
112
113 usage:
114 fprintf(stderr, "usage: %s <nproc>\n", argv[0]);
115 pvm_exit();
116 exit(-1);
117 }
118 #endif
119
create_array(void * a[],int elem_size,int ndim,int dims[])120 void create_array(void *a[], int elem_size, int ndim, int dims[])
121 {
122 int bytes=elem_size, i, rc;
123
124 assert(ndim<=MAXDIMS);
125 for(i=0;i<ndim;i++)bytes*=dims[i];
126
127 rc = ARMCI_Malloc(a, bytes);
128 assert(rc==0);
129
130 assert(a[me]);
131
132 }
133
destroy_array(void * ptr[])134 void destroy_array(void *ptr[])
135 {
136 int check;
137
138 armci_msg_barrier();
139 check = !ARMCI_Free(ptr[me]);
140 assert(check);
141 }
142
verify_list(int * proc_row_list)143 static void verify_list(int *proc_row_list) {
144 int i;
145 printf("\nVERIFY: %d: No of rows = %d\n\n", 0, proc_row_list[0]);
146 for(i=1; i<nproc; i++)
147 printf("\nVERIFY: %d: No of rows = %d\n\n", i, proc_row_list[i]-proc_row_list[i-1]);
148 fflush(stdout);
149 }
150
load_balance(int n,int non_zero,int * row_ind_tmp)151 static void load_balance(int n, int non_zero, int *row_ind_tmp) {
152
153 int proc_id, i, local_nz, local_nz_acc, A, B;
154
155 local_nz = local_nz_acc = non_zero/nproc;
156
157 /* number of rows owned by each process is stored in proc_row_list. This
158 is supposed to be well load balanced, so that each process has almost
159 same number of non-zero elements */
160 proc_id = 0;
161 if(me==0) printf("local_nz = %d\n", local_nz);
162 for(i=0; i<n; i++) { /* as # of entries in row_ind_tmp = n+1 */
163 if(row_ind_tmp[i] < local_nz_acc && row_ind_tmp[i+1] >= local_nz_acc) {
164 proc_row_list[proc_id++] = i+1;
165 local_nz_acc = local_nz*(proc_id+1);
166 if(proc_id == nproc-1) local_nz_acc = non_zero;
167 if(me==0 && proc_id<nproc) printf("local_nz = %d\n", local_nz_acc);
168 }
169 }
170
171 proc_row_list[nproc-1] = n;
172
173 for(i=0; i<nproc; i++) {
174 A = (i==0) ? 0: proc_row_list[i-1];/* # of entries in row_ind_tmp is n+1*/
175 B = proc_row_list[i];
176 proc_nz_list[i] = row_ind_tmp[B]-row_ind_tmp[A];
177 }
178
179 if(proc_id != nproc)
180 ARMCI_Error("Error while preparing Process Row list", proc_id-1);
181
182 #if 1
183 if(me==0) verify_list(proc_row_list);
184 #endif
185
186 }
187
sparse_initialize(int * n,int * non_zero,int ** row_ind,int ** col_ind,double ** values,double ** vec,double ** svec)188 static int sparse_initialize(int *n, int *non_zero, int **row_ind,
189 int **col_ind, double **values, double **vec,
190 double **svec) {
191
192 int i, j, rc, max, *row_ind_tmp=NULL, *tmp_indices=NULL;
193 double *tmp_values=NULL;
194 unsigned long len;
195 FILE *fp=NULL;
196
197 /* Broadcast order of matrix */
198 if(me==0) {
199 if((fp=fopen("Sparse-MPI/av41092.rua.data", "r")) == NULL)
200 ARMCI_Error("Error: Input file not found", me);
201 fortran_indexing = 1; /* This is 1 for Harwell-Boeing format matrices */
202 fscanf(fp, "%d", n);
203 if(*n%nproc)
204 ARMCI_Error("# of rows is not divisible by # of processors", nproc);
205 if(*n > ROW)
206 ARMCI_Error("order is greater than defined variable ROW", ROW);
207 }
208 len = sizeof(int);
209 armci_msg_brdcst(n, len, 0);
210
211 /* Broad cast number of non_zeros */
212 if(me==0) fscanf(fp, "%d", non_zero);
213 armci_msg_brdcst(non_zero, len, 0);
214
215 /* Broadcast row indices */
216 len = (*n+1)*sizeof(int);
217 row_ind_tmp = (int *)malloc(len);
218 if(me==0)for(i=0; i<*n+1; i++) {
219 fscanf(fp, "%d", &row_ind_tmp[i]);
220 if(fortran_indexing) --row_ind_tmp[i];
221 }
222 armci_msg_brdcst(row_ind_tmp, len, 0);
223
224 load_balance(*n, *non_zero, row_ind_tmp);
225
226 /* find how much temporary storage is needed at the maximum */
227 if(me==0) {
228 for(max=-1,j=0;j<nproc;j++) if(max<proc_nz_list[j]) max=proc_nz_list[j];
229 if(max<0) ARMCI_Error(" max cannot be negative", max);
230 }
231
232 /* Broadcast the maximum number of elements */
233 len = sizeof(int);
234 armci_msg_brdcst(&max, len, 0);
235
236 /* create the Sparse MAtrix Array */
237 if(me==0) printf(" Creating ValueArray (CompressedSparseMatrix) ...\n\n");
238 create_array((void**)col_ind, sizeof(int), 1, &max);
239
240 /* create the column subscript array */
241 if(me==0) printf(" Creating Column Subscript Array ... \n\n");
242 create_array((void**)values, sizeof(double), 1, &max);
243
244 /* create the x-vector and the solution vector */
245 if(me==0) printf(" Creating Vectors ... \n\n");
246 create_array((void**)vec, sizeof(double),1, &max);
247 create_array((void**)svec, sizeof(double),1, &max);
248 armci_msg_barrier();
249
250
251 /* Process 0 distributes the column indices and non_zero values to
252 respective processors*/
253 if(me == 0) {
254 tmp_indices = (int *)malloc(max*sizeof(int));
255 tmp_values = (double *)malloc(max*sizeof(double));
256
257 for(j=0; j<nproc; j++) {
258 for(i=0; i<proc_nz_list[j]; i++) {
259 fscanf(fp, "%d", &tmp_indices[i]);
260 if(fortran_indexing) --tmp_indices[i];
261 }
262 /* rc = fread(tmp_indices, sizeof(int), proc_nz_list[j], fp); */
263 if((rc=ARMCI_Put(tmp_indices, col_ind[j], proc_nz_list[j]*sizeof(int), j)))
264 ARMCI_Error("armci_nbput failed\n",rc);
265 }
266 for(j=0; j<nproc; j++) {
267 for(i=0; i<proc_nz_list[j]; i++) fscanf(fp, "%lf", &tmp_values[i]);
268 if((rc=ARMCI_Put(tmp_values, values[j], proc_nz_list[j]*sizeof(double), j)))
269 ARMCI_Error("armci_nbput failed\n",rc);
270 }
271 }
272 ARMCI_AllFence(); armci_msg_barrier();ARMCI_AllFence();
273
274 /* initializing x-vector */
275 if(me==0) for(i=0;i<proc_nz_list[me]; i++) vec[me][i] = (i+1);
276 else for(i=0;i<proc_nz_list[me];i++) vec[me][i]=me*proc_nz_list[me-1]+(i+1);
277
278 #if 0
279 if(me==0) {
280 printf("max = %d\n", max);
281 for(i=0; i<max; i++) printf("%.1f ", values[me][i]);
282 printf("\n");
283 }
284 #endif
285
286 *row_ind = row_ind_tmp;
287 if(me==0) {
288 free(tmp_indices);
289 free(tmp_values);
290 fclose(fp);
291 }
292 return 0;
293 }
294
compare(const void * p1,const void * p2)295 static int compare(const void *p1, const void *p2) {
296 int i = *((int *)p1);
297 int j = *((int *)p2);
298
299 if (i > j) return (1);
300 if (i < j) return (-1);
301 return (0);
302 }
303
304 static int count = -1;
305 static armci_hdl_t gHandle[MAXPROC];
306 static int prev_proc = -1;
307
get_data(int n,int start,int end,double * vec_local,double ** vec)308 static void get_data(int n, int start, int end, double *vec_local,
309 double **vec) {
310 int i, j, rc, bytes, offset;
311 int proc_start, proc_end, idx_start, idx_end;
312
313 proc_start = proc_end = -1;
314 for(i=0; i<nproc; i++) {
315 if(proc_start<0 && proc_row_list[i]>start) proc_start = i;
316 if(proc_end<0 && proc_row_list[i]>end) proc_end = i;
317 }
318 if(proc_start<0 || proc_end<0) ARMCI_Error("Invalid Process Ids", -1);
319
320 for(i=proc_start; i<=proc_end; i++) {
321 if(i==proc_start) idx_start = start;
322 else { if(i==0) idx_start=0; else idx_start = proc_row_list[i-1];}
323 if(i==proc_end) idx_end = end;
324 else idx_end = proc_row_list[i]-1;
325
326 if(i!=prev_proc) {
327 ++count; prev_proc = i;
328 ARMCI_INIT_HANDLE(&gHandle[count]);
329 ARMCI_SET_AGGREGATE_HANDLE(&gHandle[count]);
330 }
331
332 if(i==0) offset=0; else offset = proc_row_list[i-1];
333 if(i==me) { /* local */
334 for(j=idx_start; j<=idx_end; j++) vec_local[j] = vec[me][j-offset];
335 }
336 else { /* remote */
337 bytes = (idx_end-idx_start+1)*sizeof(double);
338 vec_local[idx_start] = -1;
339 #if 0
340 if((rc=ARMCI_Get(&vec[i][idx_start-offset], &vec_local[idx_start],
341 bytes, i)))
342 #else
343 if((rc=ARMCI_NbGet(&vec[i][idx_start-offset], &vec_local[idx_start],
344 bytes, i, &gHandle[count])))
345 #endif
346 ARMCI_Error("armci_nbget failed\n",rc);
347 }
348 }
349 }
350
sparse_multiply(int n,int non_zero,int * row_ind,int ** col_ind,double ** values,double ** vec,double ** svec)351 static void sparse_multiply(int n, int non_zero, int *row_ind, int **col_ind,
352 double **values, double **vec, double **svec) {
353
354 int i, j, k, num_elements, offset, *tmp_indices;
355 double start_time, comm_time, comp_time, v, vec_local[COL];
356 int start, end, prev, nrows, idx;
357
358 #if 0
359 /* ---- Sequential Case ---- */
360 for(i=0; i<n; i++) {
361 svec[me][i] = 0;
362 for(k=row_ind[i]; k<row_ind[i+1]; k++) {
363 j = col_ind[me][k];
364 v = values[me][k];
365 svec[me][i] += v*vec[me][j];
366 printf("%.1f %.1f\n", v, vec[me][j]);
367 }
368 }
369 for(i=0; i<n; i++) printf("%.1f ", svec[me][i]);
370 printf("\n");
371 #else
372
373 num_elements = proc_nz_list[me];
374 printf("num_elements = %d\n", num_elements);
375 tmp_indices = (int *)malloc(num_elements*sizeof(int));
376 for(i=0; i<num_elements; i++) tmp_indices[i] = col_ind[me][i];
377 qsort(tmp_indices, num_elements, sizeof(int), compare);
378
379 start_time = armci_timer();
380
381 /* get the required portion of vector you need to local array */
382 start = prev = tmp_indices[0];
383 for(i=1; i<num_elements; i++) {
384 if(tmp_indices[i]>prev+1) {
385 end = prev;
386 get_data(n, start, end, vec_local, vec);
387 start = prev = tmp_indices[i];
388 }
389 else prev = tmp_indices[i];
390 }
391 get_data(n, start, prev, vec_local, vec);
392
393 #if 1
394 if(count>=0) for(i=0; i<=count; i++) ARMCI_Wait(&gHandle[i]);
395 #endif
396
397 comm_time = armci_timer() - start_time;
398 start_time = armci_timer();
399
400 /* Perform Matrix-Vector multiply and store the result in
401 solution vector - "svec[]" */
402
403 if(me==0) { nrows = proc_row_list[me]; offset = row_ind[0]; }
404 else {
405 nrows = proc_row_list[me]-proc_row_list[me-1];
406 offset = row_ind[proc_row_list[me-1]];
407 }
408 /* printf("%d: My total Work = %d\n", me, nrows); */
409
410 for(i=0; i<nrows; i++) { /* loop over rows owned by me */
411 svec[me][i] = 0;
412 if(me==0) idx = i; else idx = proc_row_list[me-1] + i;
413 for(k=row_ind[idx]; k<row_ind[idx+1]; k++) {
414 j = col_ind[me][k-offset];
415 v = values[me][k-offset];
416 svec[me][i] += v*vec_local[j];
417 }
418 }
419 comp_time = armci_timer()-start_time;
420 printf("%d: %f + %f = %f (count = %d)\n", me, comm_time, comp_time,
421 comm_time+comp_time, count+1);
422 #endif
423 }
424
gather_solution_vector(double ** svec)425 static void gather_solution_vector(double **svec) {
426 #if 0
427 double y[COL];
428 if((rc=ARMCI_Get(&vec[i][idx_start-offset], &vec_local[idx_start],
429 bytes, i)))
430 ARMCI_Error("armci_nbget failed\n",rc);
431 #endif
432 }
433
test_sparse()434 static void test_sparse() {
435
436 int *col_ind[MAXPROC];
437 double *values[MAXPROC], *vec[MAXPROC], *svec[MAXPROC]/*, start_time*/;
438 int n, non_zero, *row_ind;
439
440 sparse_initialize(&n, &non_zero, &row_ind, col_ind, values, vec, svec);
441 armci_msg_barrier();
442
443 /*start_time = armci_timer();*/
444 sparse_multiply(n, non_zero, row_ind, col_ind, values, vec, svec);
445 /* printf("%d: Timetaken = %f\n", me, armci_timer()-start_time); */
446 armci_msg_barrier();
447
448 if(me==0) gather_solution_vector(svec);
449
450 if(me==0){printf("O.K.\n"); fflush(stdout);}
451 destroy_array((void **)vec);
452 }
453
454
main(int argc,char * argv[])455 int main(int argc, char* argv[])
456 {
457
458 armci_msg_init(&argc, &argv);
459 nproc = armci_msg_nproc();
460 me = armci_msg_me();
461
462 /* printf("nproc = %d, me = %d\n", nproc, me);*/
463
464 if(nproc>MAXPROC && me==0)
465 ARMCI_Error("Test works for up to %d processors\n",MAXPROC);
466
467 if(me==0){
468 printf("ARMCI test program (%d processes)\n",nproc);
469 fflush(stdout);
470 sleep(1);
471 }
472
473 ARMCI_Init();
474
475 if(me==0){
476 printf("\n Performing Sparse Matrix-Vector Multiplication ...\n\n");
477 fflush(stdout);
478 }
479 test_sparse();
480
481 ARMCI_AllFence();
482 armci_msg_barrier();
483 if(me==0){printf("\nSuccess!!\n"); fflush(stdout);}
484 sleep(2);
485
486 armci_msg_barrier();
487 ARMCI_Finalize();
488 armci_msg_finalize();
489 return(0);
490 }
491