1 /******************************************************************************
2 * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other
3 * HYPRE Project Developers. See the top-level COPYRIGHT file for details.
4 *
5 * SPDX-License-Identifier: (Apache-2.0 OR MIT)
6 ******************************************************************************/
7
8 #include "_hypre_parcsr_ls.h"
9
10 /*--------------------------------------------------------------------------
11  * hypre_BoomerAMGBuildMultipassHost
12 * This routine implements Stuben's direct interpolation with multiple passes.
13 *--------------------------------------------------------------------------*/
14
15 HYPRE_Int
16 hypre_BoomerAMGBuildMultipassHost( hypre_ParCSRMatrix  *A,
17 HYPRE_Int *CF_marker,
18 hypre_ParCSRMatrix *S,
19 HYPRE_BigInt *num_cpts_global,
20 HYPRE_Int num_functions,
21 HYPRE_Int *dof_func,
22 HYPRE_Int debug_flag,
23 HYPRE_Real trunc_factor,
24 HYPRE_Int P_max_elmts,
25 HYPRE_Int weight_option,
26 hypre_ParCSRMatrix **P_ptr )
27 {
28 #ifdef HYPRE_PROFILE
29 hypre_profile_times[HYPRE_TIMER_ID_MULTIPASS_INTERP] -= hypre_MPI_Wtime();
30 #endif
31
32 MPI_Comm comm = hypre_ParCSRMatrixComm(A);
33 hypre_ParCSRCommPkg *comm_pkg = hypre_ParCSRMatrixCommPkg(S);
34 hypre_ParCSRCommHandle *comm_handle;
35 hypre_ParCSRCommPkg *tmp_comm_pkg;
36
37 hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
38 HYPRE_Real *A_diag_data = hypre_CSRMatrixData(A_diag);
39 HYPRE_Int *A_diag_i = hypre_CSRMatrixI(A_diag);
40 HYPRE_Int *A_diag_j = hypre_CSRMatrixJ(A_diag);
41
42 hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A);
43 HYPRE_Real *A_offd_data = NULL;
44 HYPRE_Int *A_offd_i = hypre_CSRMatrixI(A_offd);
45 HYPRE_Int *A_offd_j = NULL;
46 //HYPRE_BigInt *col_map_offd_A = hypre_ParCSRMatrixColMapOffd(A);
47 HYPRE_Int num_cols_offd_A = hypre_CSRMatrixNumCols(A_offd);
48
49 hypre_CSRMatrix *S_diag = hypre_ParCSRMatrixDiag(S);
50 HYPRE_Int *S_diag_i = hypre_CSRMatrixI(S_diag);
51 HYPRE_Int *S_diag_j = hypre_CSRMatrixJ(S_diag);
52
53 hypre_CSRMatrix *S_offd = hypre_ParCSRMatrixOffd(S);
54 HYPRE_Int *S_offd_i = hypre_CSRMatrixI(S_offd);
55 HYPRE_Int *S_offd_j = NULL;
56 /*HYPRE_BigInt *col_map_offd_S = hypre_ParCSRMatrixColMapOffd(S);
57 HYPRE_Int num_cols_offd_S = hypre_CSRMatrixNumCols(S_offd);
58 HYPRE_BigInt *col_map_offd = NULL;*/
59 HYPRE_Int num_cols_offd;
60
61 hypre_ParCSRMatrix *P;
62 hypre_CSRMatrix *P_diag;
63 HYPRE_Real *P_diag_data;
64    HYPRE_Int       *P_diag_i; /* initially the number of nonzero cols in each row;
65                                  finally the pointer to the start of each row */
66 HYPRE_Int *P_diag_j;
67
68 hypre_CSRMatrix *P_offd;
69 HYPRE_Real *P_offd_data = NULL;
70    HYPRE_Int       *P_offd_i; /* initially the number of nonzero cols in each row;
71                                  finally the pointer to the start of each row */
72 HYPRE_Int *P_offd_j = NULL;
73
74 HYPRE_Int num_sends = 0;
75 HYPRE_Int *int_buf_data = NULL;
76 HYPRE_BigInt *big_buf_data = NULL;
77 HYPRE_Int *send_map_start;
78 HYPRE_Int *send_map_elmt;
79 HYPRE_Int *send_procs;
80 HYPRE_Int num_recvs = 0;
81 HYPRE_Int *recv_vec_start;
82 HYPRE_Int *recv_procs;
83 HYPRE_Int *new_recv_vec_start = NULL;
84 HYPRE_Int **Pext_send_map_start = NULL;
85 HYPRE_Int **Pext_recv_vec_start = NULL;
86 HYPRE_Int *Pext_start = NULL;
87 HYPRE_Int *P_ncols = NULL;
88
89 HYPRE_Int *CF_marker_offd = NULL;
90 HYPRE_Int *dof_func_offd = NULL;
91 HYPRE_Int *P_marker;
92 HYPRE_Int *P_marker_offd = NULL;
93 HYPRE_Int *C_array;
94 HYPRE_Int *C_array_offd = NULL;
95 HYPRE_Int *pass_array = NULL; /* contains points ordered according to pass */
96 HYPRE_Int *pass_pointer = NULL; /* pass_pointer[j] contains pointer to first
97 point of pass j contained in pass_array */
98 HYPRE_Int *P_diag_start;
99 HYPRE_Int *P_offd_start = NULL;
100 HYPRE_Int **P_diag_pass;
101 HYPRE_Int **P_offd_pass = NULL;
102 HYPRE_Int **Pext_pass = NULL;
103 HYPRE_BigInt *big_temp_pass = NULL;
104 HYPRE_BigInt **new_elmts = NULL; /* new neighbors generated in each pass */
105 HYPRE_Int *new_counter = NULL; /* contains no. of new neighbors for
106 each pass */
107    HYPRE_Int       *loc = NULL; /* contains locations of new neighbor
108                                    connections in the receive buffer to avoid searching */
109 HYPRE_Int *Pext_i = NULL; /*contains P_diag_i and P_offd_i info for nonzero
110 cols of off proc neighbors */
111 HYPRE_BigInt *Pext_send_buffer = NULL; /* used to collect global nonzero
112 col ids in P_diag for send_map_elmts */
113
114 HYPRE_Int *map_S_to_new = NULL;
115 HYPRE_BigInt *new_col_map_offd = NULL;
116 HYPRE_BigInt *col_map_offd_P = NULL;
117 HYPRE_Int *permute = NULL;
118 HYPRE_BigInt *big_permute = NULL;
119
120 HYPRE_Int cnt;
121 HYPRE_Int cnt_nz;
122 HYPRE_Int total_nz;
123 HYPRE_Int pass;
124 HYPRE_Int num_passes;
125 HYPRE_Int max_num_passes = 10;
126
127 HYPRE_Int n_fine;
128 HYPRE_Int n_coarse = 0;
129 HYPRE_Int n_coarse_offd = 0;
130 HYPRE_Int n_SF = 0;
131 HYPRE_Int n_SF_offd = 0;
132
133 HYPRE_Int *fine_to_coarse = NULL;
134 HYPRE_BigInt *fine_to_coarse_offd = NULL;
135
136 HYPRE_Int *assigned = NULL;
137 HYPRE_Int *assigned_offd = NULL;
138
139 HYPRE_Real *Pext_send_data = NULL;
140 HYPRE_Real *Pext_data = NULL;
141
142 HYPRE_Real sum_C, sum_N;
143 HYPRE_Real sum_C_pos, sum_C_neg;
144 HYPRE_Real sum_N_pos, sum_N_neg;
145 HYPRE_Real diagonal;
146 HYPRE_Real alfa = 1.0;
147 HYPRE_Real beta = 1.0;
148 HYPRE_Int j_start;
149 HYPRE_Int j_end;
150
151 HYPRE_Int i,i1;
152 HYPRE_Int j,j1;
153 HYPRE_Int k,k1,k2,k3;
154 HYPRE_BigInt big_k1;
155 HYPRE_Int pass_array_size;
156 HYPRE_BigInt global_pass_array_size;
157 HYPRE_BigInt local_pass_array_size;
158 HYPRE_Int my_id, num_procs;
159 HYPRE_Int index, start;
160 HYPRE_BigInt my_first_cpt;
161 HYPRE_BigInt total_global_cpts;
162 HYPRE_Int p_cnt;
163 HYPRE_Int total_nz_offd;
164 HYPRE_Int cnt_nz_offd;
165 HYPRE_Int cnt_offd, cnt_new;
166 HYPRE_Int no_break;
167 HYPRE_Int not_found;
168 HYPRE_Int Pext_send_size;
169 HYPRE_Int Pext_recv_size;
170 HYPRE_Int old_Pext_send_size;
171 HYPRE_Int old_Pext_recv_size;
172 HYPRE_Int P_offd_size = 0;
173 HYPRE_Int local_index = -1;
174 HYPRE_Int new_num_cols_offd = 0;
175 HYPRE_Int num_cols_offd_P;
176
177 /* Threading variables */
178 HYPRE_Int my_thread_num, num_threads, thread_start, thread_stop;
179 HYPRE_Int pass_length;
180 HYPRE_Int *tmp_marker, *tmp_marker_offd;
181 HYPRE_Int *tmp_array, *tmp_array_offd;
182 HYPRE_Int * max_num_threads = hypre_CTAlloc(HYPRE_Int, 1, HYPRE_MEMORY_HOST);
183 HYPRE_Int * cnt_nz_per_thread;
184 HYPRE_Int * cnt_nz_offd_per_thread;
185
186 /* HYPRE_Real wall_time;
187 wall_time = hypre_MPI_Wtime(); */
188
189 /* Initialize threading variables */
190 max_num_threads[0] = hypre_NumThreads();
191 cnt_nz_per_thread = hypre_CTAlloc(HYPRE_Int, max_num_threads[0], HYPRE_MEMORY_HOST);
192 cnt_nz_offd_per_thread = hypre_CTAlloc(HYPRE_Int, max_num_threads[0], HYPRE_MEMORY_HOST);
193 for(i=0; i < max_num_threads[0]; i++)
194 {
195 cnt_nz_offd_per_thread[i] = 0;
196 cnt_nz_per_thread[i] = 0;
197 }
198
199
200 /*-----------------------------------------------------------------------
201 * Access the CSR vectors for A and S. Also get size of fine grid.
202 *-----------------------------------------------------------------------*/
203
204 hypre_MPI_Comm_size(comm,&num_procs);
205 hypre_MPI_Comm_rank(comm,&my_id);
206
207 my_first_cpt = num_cpts_global[0];
208 /* total_global_cpts = 0; */
209 if (my_id == (num_procs -1)) total_global_cpts = num_cpts_global[1];
210 hypre_MPI_Bcast(&total_global_cpts, 1, HYPRE_MPI_BIG_INT, num_procs-1, comm);
211
212 if (!comm_pkg)
213 {
214 comm_pkg = hypre_ParCSRMatrixCommPkg(A);
215 if (!comm_pkg)
216 {
217 hypre_MatvecCommPkgCreate(A);
218
219 comm_pkg = hypre_ParCSRMatrixCommPkg(A);
220 }
221 }
222
223 //col_map_offd = col_map_offd_A;
224 num_cols_offd = num_cols_offd_A;
225
226 if (num_cols_offd_A)
227 {
228 A_offd_data = hypre_CSRMatrixData(A_offd);
229 A_offd_j = hypre_CSRMatrixJ(A_offd);
230 }
231
232 if (num_cols_offd)
233 S_offd_j = hypre_CSRMatrixJ(S_offd);
234
235 n_fine = hypre_CSRMatrixNumRows(A_diag);
236
237 /*-----------------------------------------------------------------------
238    * Initialize counters and allocate mapping vector.
239 *-----------------------------------------------------------------------*/
240
241 if (n_fine) fine_to_coarse = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_HOST);
242
243 n_coarse = 0;
244 n_SF = 0;
245 #ifdef HYPRE_USING_OPENMP
246 #pragma omp parallel for private(i) reduction(+:n_coarse,n_SF ) HYPRE_SMP_SCHEDULE
247 #endif
248 for (i=0; i < n_fine; i++)
249 if (CF_marker[i] == 1) n_coarse++;
250 else if (CF_marker[i] == -3) n_SF++;
251
252 pass_array_size = n_fine-n_coarse-n_SF;
253 if (pass_array_size) pass_array = hypre_CTAlloc(HYPRE_Int, pass_array_size, HYPRE_MEMORY_HOST);
254 pass_pointer = hypre_CTAlloc(HYPRE_Int, max_num_passes+1, HYPRE_MEMORY_HOST);
255 if (n_fine) assigned = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_HOST);
256 P_diag_i = hypre_CTAlloc(HYPRE_Int, n_fine+1, HYPRE_MEMORY_HOST);
257 P_offd_i = hypre_CTAlloc(HYPRE_Int, n_fine+1, HYPRE_MEMORY_HOST);
258 if (n_coarse) C_array = hypre_CTAlloc(HYPRE_Int, n_coarse, HYPRE_MEMORY_HOST);
259
260 if (num_cols_offd)
261 {
262 CF_marker_offd = hypre_CTAlloc(HYPRE_Int, num_cols_offd, HYPRE_MEMORY_HOST);
263 if (num_functions > 1) dof_func_offd = hypre_CTAlloc(HYPRE_Int, num_cols_offd, HYPRE_MEMORY_HOST);
264 }
265
266 if (num_procs > 1)
267 {
268 num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
269 send_procs = hypre_ParCSRCommPkgSendProcs(comm_pkg);
270 send_map_start = hypre_ParCSRCommPkgSendMapStarts(comm_pkg);
271 send_map_elmt = hypre_ParCSRCommPkgSendMapElmts(comm_pkg);
272 num_recvs = hypre_ParCSRCommPkgNumRecvs(comm_pkg);
273 recv_procs = hypre_ParCSRCommPkgRecvProcs(comm_pkg);
274 recv_vec_start = hypre_ParCSRCommPkgRecvVecStarts(comm_pkg);
275 if (send_map_start[num_sends])
276 {
277 int_buf_data = hypre_CTAlloc(HYPRE_Int, send_map_start[num_sends], HYPRE_MEMORY_HOST);
278 big_buf_data = hypre_CTAlloc(HYPRE_BigInt, send_map_start[num_sends], HYPRE_MEMORY_HOST);
279 }
280 }
281
282
283 index = 0;
284 for (i=0; i < num_sends; i++)
285 {
286 start = send_map_start[i];
287 for (j = start; j < send_map_start[i+1]; j++)
288 int_buf_data[index++] = CF_marker[send_map_elmt[j]];
289 }
290 if (num_procs > 1)
291 {
292 comm_handle = hypre_ParCSRCommHandleCreate(11, comm_pkg, int_buf_data,
293 CF_marker_offd);
294 hypre_ParCSRCommHandleDestroy(comm_handle);
295 }
296
297 if (num_functions > 1)
298 {
299 index = 0;
300 for (i=0; i < num_sends; i++)
301 {
302 start = send_map_start[i];
303 for (j = start; j < send_map_start[i+1]; j++)
304 int_buf_data[index++] = dof_func[send_map_elmt[j]];
305 }
306 if (num_procs > 1)
307 {
308 comm_handle = hypre_ParCSRCommHandleCreate(11, comm_pkg, int_buf_data,
309 dof_func_offd);
310 hypre_ParCSRCommHandleDestroy(comm_handle);
311 }
312 }
313
314 n_coarse_offd = 0;
315 n_SF_offd = 0;
316 #ifdef HYPRE_USING_OPENMP
317 #pragma omp parallel for private(i) reduction(+:n_coarse_offd,n_SF_offd) HYPRE_SMP_SCHEDULE
318 #endif
319 for (i=0; i < num_cols_offd; i++)
320 if (CF_marker_offd[i] == 1) n_coarse_offd++;
321 else if (CF_marker_offd[i] == -3) n_SF_offd++;
322
323 if (num_cols_offd)
324 {
325 assigned_offd = hypre_CTAlloc(HYPRE_Int, num_cols_offd, HYPRE_MEMORY_HOST);
326 map_S_to_new = hypre_CTAlloc(HYPRE_Int, num_cols_offd, HYPRE_MEMORY_HOST);
327 fine_to_coarse_offd = hypre_CTAlloc(HYPRE_BigInt, num_cols_offd, HYPRE_MEMORY_HOST);
328 new_col_map_offd = hypre_CTAlloc(HYPRE_BigInt, n_coarse_offd, HYPRE_MEMORY_HOST);
329 }
330
331 /*-----------------------------------------------------------------------
332    * First Pass: determine the maximal size of P and the number of elements per row.
333 *-----------------------------------------------------------------------*/
334
335 /*-----------------------------------------------------------------------
336 * Assigned points are points for which we know an interpolation
337 * formula already, and which are thus available to interpolate from.
338    * assigned[i]=0 for C points, and 1, 2, 3, ... for F points, depending
339    * on the pass in which their interpolation formula is determined.
340    *
341    * pass_array contains the points ordered according to their pass, i.e.
342    * | C-points | points of pass 1 | points of pass 2 | ....
343    * C-points are points 0 through pass_pointer[1]-1,
344    * points of pass k (0 < k < num_passes) are contained in points
345    * pass_pointer[k] through pass_pointer[k+1]-1 of pass_array.
346    *
347    * pass_array is also used to avoid going through all points for each pass,
348    * i.e., at the beginning it contains all points in descending order starting
349    * with n_fine-1. Then, starting from the last point, we evaluate whether
350    * it is a C-point (pass 0). If it is, the point is brought to the front
351    * and the length of the points to be searched is shortened. This is
352    * done until the parameter cnt (which determines the first point of
353    * pass_array to be searched) becomes n_fine. Then all points have been
354    * assigned a pass number.
355 *-----------------------------------------------------------------------*/
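
   /*-----------------------------------------------------------------------
    * Illustrative example (added for clarity, not part of the algorithm):
    * take n_fine = 5, CF_marker = {1,-1,-1,-1,1}, and let S connect each
    * point strongly to its immediate neighbors only.  Points 0 and 4 are
    * C points (assigned = 0); points 1 and 3 each have a strong C neighbor
    * and get assigned = 1 in the first sweep below; point 2 has only
    * pass-1 neighbors and gets assigned = 2 in the following sweep.
    *-----------------------------------------------------------------------*/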
356
357
358 cnt = 0;
359 p_cnt = pass_array_size-1;
360 P_diag_i[0] = 0;
361 P_offd_i[0] = 0;
362 for (i = 0; i < n_fine; i++)
363 {
364 if (CF_marker[i] == 1)
365 {
366          fine_to_coarse[i] = cnt; /* this C point is assigned index
367                                      cnt on the coarse grid,
368                                      and in the columns of P */
369 C_array[cnt++] = i;
370 assigned[i] = 0;
371          P_diag_i[i+1] = 1; /* one element in row i of P */
372 P_offd_i[i+1] = 0;
373 }
374 else if (CF_marker[i] == -1)
375 {
376 pass_array[p_cnt--] = i;
377 P_diag_i[i+1] = 0;
378 P_offd_i[i+1] = 0;
379 assigned[i] = -1;
380 fine_to_coarse[i] = -1;
381 }
382 else
383 {
384 P_diag_i[i+1] = 0;
385 P_offd_i[i+1] = 0;
386 assigned[i] = -1;
387 fine_to_coarse[i] = -1;
388 }
389 }
390
391 index = 0;
392 for (i=0; i < num_sends; i++)
393 {
394 start = send_map_start[i];
395 for (j = start; j < send_map_start[i+1]; j++)
396 {
397 big_buf_data[index] = (HYPRE_BigInt)fine_to_coarse[send_map_elmt[j]];
398 if (big_buf_data[index] > -1)
399 big_buf_data[index] += my_first_cpt;
400 index++;
401 }
402 }
403 if (num_procs > 1)
404 {
405 comm_handle = hypre_ParCSRCommHandleCreate(21, comm_pkg, big_buf_data,
406 fine_to_coarse_offd);
407 hypre_ParCSRCommHandleDestroy(comm_handle);
408 }
409
410 new_recv_vec_start = hypre_CTAlloc(HYPRE_Int, num_recvs+1, HYPRE_MEMORY_HOST);
411
412 if (n_coarse_offd)
413 C_array_offd = hypre_CTAlloc(HYPRE_Int, n_coarse_offd, HYPRE_MEMORY_HOST);
414
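
   /* Record the off-processor C points: give each a local index in map_S_to_new,
    * and store its global coarse index in new_col_map_offd (used later as
    * new_elmts[0]). */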
415 cnt = 0;
416 new_recv_vec_start[0] = 0;
417 for (j = 0; j < num_recvs; j++)
418 {
419 for (i = recv_vec_start[j]; i < recv_vec_start[j+1]; i++)
420 {
421 if (CF_marker_offd[i] == 1)
422 {
423 map_S_to_new[i] = cnt;
424 C_array_offd[cnt] = i;
425 new_col_map_offd[cnt++] = fine_to_coarse_offd[i];
426 assigned_offd[i] = 0;
427 }
428 else
429 {
430 assigned_offd[i] = -1;
431 map_S_to_new[i] = -1;
432 }
433 }
434 new_recv_vec_start[j+1] = cnt;
435 }
436
437 cnt = 0;
438 hypre_TFree(fine_to_coarse_offd, HYPRE_MEMORY_HOST);
439
440 /*-----------------------------------------------------------------------
441 * Mark all local neighbors of C points as 'assigned'.
442 *-----------------------------------------------------------------------*/
443
444 pass_pointer[0] = 0;
445 pass_pointer[1] = 0;
446 total_nz = n_coarse; /* accumulates total number of nonzeros in P_diag */
447 total_nz_offd = 0; /* accumulates total number of nonzeros in P_offd */
448
449 cnt = 0;
450 cnt_offd = 0;
451 cnt_nz = 0;
452 cnt_nz_offd = 0;
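   /* Pass-1 sweep: any unassigned point with at least one strong C neighbor
    * (local or off-processor) is assigned to pass 1; its nonzero counts are
    * accumulated in P_diag_i/P_offd_i, and the point is swapped into the
    * already-assigned front section of pass_array. */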
453 for (i = pass_array_size-1; i > cnt-1; i--)
454 {
455 i1 = pass_array[i];
456 for (j=S_diag_i[i1]; j < S_diag_i[i1+1]; j++)
457 {
458 j1 = S_diag_j[j];
459 if (CF_marker[j1] == 1)
460 {
461 P_diag_i[i1+1]++;
462 cnt_nz++;
463 assigned[i1] = 1;
464 }
465 }
466 for (j=S_offd_i[i1]; j < S_offd_i[i1+1]; j++)
467 {
468 j1 = S_offd_j[j];
469 if (CF_marker_offd[j1] == 1)
470 {
471 P_offd_i[i1+1]++;
472 cnt_nz_offd++;
473 assigned[i1] = 1;
474 }
475 }
476 if (assigned[i1] == 1)
477 {
478 pass_array[i++] = pass_array[cnt];
479 pass_array[cnt++] = i1;
480 }
481 }
482
483 pass_pointer[2] = cnt;
484
485 /*-----------------------------------------------------------------------
486 * All local neighbors are assigned, now need to exchange the boundary
487 * info for assigned strong neighbors.
488 *-----------------------------------------------------------------------*/
489
490 index = 0;
491 for (i=0; i < num_sends; i++)
492 {
493 start = send_map_start[i];
494 for (j = start; j < send_map_start[i+1]; j++)
495 { int_buf_data[index++] = assigned[send_map_elmt[j]]; }
496 }
497 if (num_procs > 1)
498 {
499 comm_handle = hypre_ParCSRCommHandleCreate(11, comm_pkg, int_buf_data,
500 assigned_offd);
501 hypre_ParCSRCommHandleDestroy(comm_handle);
502 }
503
504 /*-----------------------------------------------------------------------
505     * Now we need to determine the strong neighbors of the points of pass 1,
506     * pass 2, etc., and we need to update assigned_offd after each pass.
507 *-----------------------------------------------------------------------*/
508
509 pass = 2;
510 local_pass_array_size = (HYPRE_BigInt)(pass_array_size - cnt);
511 hypre_MPI_Allreduce(&local_pass_array_size, &global_pass_array_size, 1, HYPRE_MPI_BIG_INT,
512 hypre_MPI_SUM, comm);
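   /* Later passes: a point is assigned to pass p if it has a strong neighbor
    * (local or off-processor) that was assigned to pass p-1.  The loop ends
    * when all points are assigned globally or max_num_passes is reached. */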
513 while (global_pass_array_size && pass < max_num_passes)
514 {
515 for (i = pass_array_size-1; i > cnt-1; i--)
516 {
517 i1 = pass_array[i];
518 no_break = 1;
519 for (j=S_diag_i[i1]; j < S_diag_i[i1+1]; j++)
520 {
521 j1 = S_diag_j[j];
522 if (assigned[j1] == pass-1)
523 {
524 pass_array[i++] = pass_array[cnt];
525 pass_array[cnt++] = i1;
526 assigned[i1] = pass;
527 no_break = 0;
528 break;
529 }
530 }
531 if (no_break)
532 {
533 for (j=S_offd_i[i1]; j < S_offd_i[i1+1]; j++)
534 {
535 j1 = S_offd_j[j];
536 if (assigned_offd[j1] == pass-1)
537 {
538 pass_array[i++] = pass_array[cnt];
539 pass_array[cnt++] = i1;
540 assigned[i1] = pass;
541 break;
542 }
543 }
544 }
545 }
546 /*hypre_printf("pass %d remaining points %d \n", pass, local_pass_array_size);*/
547
548 pass++;
549 pass_pointer[pass] = cnt;
550
551 local_pass_array_size = (HYPRE_BigInt)(pass_array_size - cnt);
552 hypre_MPI_Allreduce(&local_pass_array_size, &global_pass_array_size, 1, HYPRE_MPI_BIG_INT,
553 hypre_MPI_SUM, comm);
554 index = 0;
555 for (i=0; i < num_sends; i++)
556 {
557 start = send_map_start[i];
558 for (j = start; j < send_map_start[i+1]; j++)
559 { int_buf_data[index++] = assigned[send_map_elmt[j]]; }
560 }
561 if (num_procs > 1)
562 {
563 comm_handle = hypre_ParCSRCommHandleCreate(11, comm_pkg, int_buf_data,
564 assigned_offd);
565 hypre_ParCSRCommHandleDestroy(comm_handle);
566 }
567 }
568
569 hypre_TFree(int_buf_data, HYPRE_MEMORY_HOST);
570 hypre_TFree(big_buf_data, HYPRE_MEMORY_HOST);
571
572 num_passes = pass;
573
574 P_diag_pass = hypre_CTAlloc(HYPRE_Int*, num_passes, HYPRE_MEMORY_HOST); /* P_diag_pass[i] will contain
575 all column numbers for points of pass i */
576
577 P_diag_pass[1] = hypre_CTAlloc(HYPRE_Int, cnt_nz, HYPRE_MEMORY_HOST);
578
579    P_diag_start = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_HOST); /* P_diag_start[i] contains a
580                                       pointer to the beginning of the column numbers in P_diag_pass for point i;
581                                       P_diag_i[i+1] contains the number of columns for point i */
582
583 P_offd_start = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_HOST);
584
585 if (num_procs > 1)
586 {
587 P_offd_pass = hypre_CTAlloc(HYPRE_Int*, num_passes, HYPRE_MEMORY_HOST);
588
589 if (cnt_nz_offd)
590 P_offd_pass[1] = hypre_CTAlloc(HYPRE_Int, cnt_nz_offd, HYPRE_MEMORY_HOST);
591 else
592 P_offd_pass[1] = NULL;
593
594 new_elmts = hypre_CTAlloc(HYPRE_BigInt*, num_passes, HYPRE_MEMORY_HOST);
595
596 new_counter = hypre_CTAlloc(HYPRE_Int, num_passes+1, HYPRE_MEMORY_HOST);
597
598 new_counter[0] = 0;
599 new_counter[1] = n_coarse_offd;
600 new_num_cols_offd = n_coarse_offd;
601
602 new_elmts[0] = new_col_map_offd;
603 }
604
605 /*-----------------------------------------------------------------------
606     * Pass 1: now we consider points of pass 1, which have strong C neighbors.
607 *-----------------------------------------------------------------------*/
608
609 cnt_nz = 0;
610 cnt_nz_offd = 0;
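   /* For each pass-1 point, collect the coarse column indices of its strong C
    * neighbors: local ones go into P_diag_pass[1] (via fine_to_coarse), off-processor
    * ones into P_offd_pass[1] (via map_S_to_new).  P_diag_start/P_offd_start record
    * where each point's entries begin. */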
611 /* JBS: Possible candidate for threading */
612 for (i=pass_pointer[1]; i < pass_pointer[2]; i++)
613 {
614 i1 = pass_array[i];
615 P_diag_start[i1] = cnt_nz;
616 P_offd_start[i1] = cnt_nz_offd;
617 for (j=S_diag_i[i1]; j < S_diag_i[i1+1]; j++)
618 {
619 j1 = S_diag_j[j];
620 if (CF_marker[j1] == 1)
621 { P_diag_pass[1][cnt_nz++] = fine_to_coarse[j1]; }
622 }
623 for (j=S_offd_i[i1]; j < S_offd_i[i1+1]; j++)
624 {
625 j1 = S_offd_j[j];
626 if (CF_marker_offd[j1] == 1)
627 { P_offd_pass[1][cnt_nz_offd++] = map_S_to_new[j1]; }
628 }
629 }
630
631
632 total_nz += cnt_nz;
633 total_nz_offd += cnt_nz_offd;
634
635 if (num_procs > 1)
636 {
637 tmp_comm_pkg = hypre_CTAlloc(hypre_ParCSRCommPkg, 1, HYPRE_MEMORY_HOST);
638 Pext_send_map_start = hypre_CTAlloc(HYPRE_Int*, num_passes, HYPRE_MEMORY_HOST);
639 Pext_recv_vec_start = hypre_CTAlloc(HYPRE_Int*, num_passes, HYPRE_MEMORY_HOST);
640 Pext_pass = hypre_CTAlloc(HYPRE_Int*, num_passes, HYPRE_MEMORY_HOST);
641 Pext_i = hypre_CTAlloc(HYPRE_Int, num_cols_offd+1, HYPRE_MEMORY_HOST);
642 if (num_cols_offd) Pext_start = hypre_CTAlloc(HYPRE_Int, num_cols_offd, HYPRE_MEMORY_HOST);
643 if (send_map_start[num_sends])
644 P_ncols = hypre_CTAlloc(HYPRE_Int, send_map_start[num_sends], HYPRE_MEMORY_HOST);
645 #ifdef HYPRE_USING_OPENMP
646 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
647 #endif
648 for (i=0; i < num_cols_offd+1; i++)
649 { Pext_i[i] = 0; }
650 #ifdef HYPRE_USING_OPENMP
651 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
652 #endif
653 for (i=0; i < send_map_start[num_sends]; i++)
654 { P_ncols[i] = 0; }
655 }
656
657 old_Pext_send_size = 0;
658 old_Pext_recv_size = 0;
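   /* Each pass >= 2 proceeds in three steps: (1) send the column lists of boundary
    * points assigned to the previous pass to the neighboring processors, (2) translate
    * the received global column ids into local ids, extending the local list of
    * off-processor coarse columns (new_elmts) where necessary, and (3) build the
    * sparsity pattern of the current pass from the rows of previous-pass neighbors. */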
659 for (pass=2; pass < num_passes; pass++)
660 {
661
662 if (num_procs > 1)
663 {
664 Pext_send_map_start[pass] = hypre_CTAlloc(HYPRE_Int, num_sends+1, HYPRE_MEMORY_HOST);
665 Pext_recv_vec_start[pass] = hypre_CTAlloc(HYPRE_Int, num_recvs+1, HYPRE_MEMORY_HOST);
666 Pext_send_size = 0;
667 Pext_send_map_start[pass][0] = 0;
668
669 for (i=0; i < num_sends; i++)
670 {
671 #ifdef HYPRE_USING_OPENMP
672 #pragma omp parallel for private(j,j1) reduction(+:Pext_send_size) HYPRE_SMP_SCHEDULE
673 #endif
674 for (j=send_map_start[i]; j < send_map_start[i+1]; j++)
675 {
676 j1 = send_map_elmt[j];
677 if (assigned[j1] == pass-1)
678 {
679 P_ncols[j] = P_diag_i[j1+1] + P_offd_i[j1+1];
680 Pext_send_size += P_ncols[j];
681 }
682 }
683 Pext_send_map_start[pass][i+1] = Pext_send_size;
684 }
685
686 comm_handle = hypre_ParCSRCommHandleCreate (11, comm_pkg,
687 P_ncols, &Pext_i[1]);
688 hypre_ParCSRCommHandleDestroy(comm_handle);
689
690 if (Pext_send_size > old_Pext_send_size)
691 {
692 hypre_TFree(Pext_send_buffer, HYPRE_MEMORY_HOST);
693 Pext_send_buffer = hypre_CTAlloc(HYPRE_BigInt, Pext_send_size, HYPRE_MEMORY_HOST);
694 }
695 old_Pext_send_size = Pext_send_size;
696 }
697
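      /* Pack the global column ids of the rows owned by boundary points of the
       * previous pass: diag columns become my_first_cpt + local coarse index, and
       * offd columns are translated back to global ids via new_elmts/new_counter. */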
698 cnt_offd = 0;
699 for (i=0; i < num_sends; i++)
700 {
701 for (j=send_map_start[i]; j < send_map_start[i+1]; j++)
702 {
703 j1 = send_map_elmt[j];
704 if (assigned[j1] == pass-1)
705 {
706 j_start = P_diag_start[j1];
707 j_end = j_start+P_diag_i[j1+1];
708 for (k=j_start; k < j_end; k++)
709 {
710 Pext_send_buffer[cnt_offd++] = my_first_cpt
711 + (HYPRE_BigInt) P_diag_pass[pass-1][k];
712 }
713 j_start = P_offd_start[j1];
714 j_end = j_start+P_offd_i[j1+1];
715 for (k=j_start; k < j_end; k++)
716 {
717 k1 = P_offd_pass[pass-1][k];
718 k3 = 0;
719 while (k3 < pass-1)
720 {
721 if (k1 < new_counter[k3+1])
722 {
723 k2 = k1-new_counter[k3];
724 Pext_send_buffer[cnt_offd++] = new_elmts[k3][k2];
725 break;
726 }
727 k3++;
728 }
729 }
730 }
731 }
732 }
733
734 if (num_procs > 1)
735 {
736 Pext_recv_size = 0;
737 Pext_recv_vec_start[pass][0] = 0;
738 cnt_offd = 0;
739 for (i=0; i < num_recvs; i++)
740 {
741 for (j=recv_vec_start[i]; j<recv_vec_start[i+1]; j++)
742 {
743 if (assigned_offd[j] == pass-1)
744 {
745 Pext_start[j] = cnt_offd;
746 cnt_offd += Pext_i[j+1];
747 }
748 }
749 Pext_recv_size = cnt_offd;
750 Pext_recv_vec_start[pass][i+1] = Pext_recv_size;
751 }
752
753 hypre_ParCSRCommPkgComm(tmp_comm_pkg) = comm;
754 hypre_ParCSRCommPkgNumSends(tmp_comm_pkg) = num_sends;
755 hypre_ParCSRCommPkgSendProcs(tmp_comm_pkg) = send_procs;
756 hypre_ParCSRCommPkgSendMapStarts(tmp_comm_pkg) =
757 Pext_send_map_start[pass];
758 hypre_ParCSRCommPkgNumRecvs(tmp_comm_pkg) = num_recvs;
759 hypre_ParCSRCommPkgRecvProcs(tmp_comm_pkg) = recv_procs;
760 hypre_ParCSRCommPkgRecvVecStarts(tmp_comm_pkg) =
761 Pext_recv_vec_start[pass];
762
763 if (Pext_recv_size)
764 {
765 Pext_pass[pass] = hypre_CTAlloc(HYPRE_Int, Pext_recv_size, HYPRE_MEMORY_HOST);
766 new_elmts[pass-1] = hypre_CTAlloc(HYPRE_BigInt, Pext_recv_size, HYPRE_MEMORY_HOST);
767 }
768 else
769 {
770 Pext_pass[pass] = NULL;
771 new_elmts[pass-1] = NULL;
772 }
773
774 if (Pext_recv_size > old_Pext_recv_size)
775 {
776 hypre_TFree(loc, HYPRE_MEMORY_HOST);
777 loc = hypre_CTAlloc(HYPRE_Int, Pext_recv_size, HYPRE_MEMORY_HOST);
778 hypre_TFree(big_temp_pass, HYPRE_MEMORY_HOST);
779 big_temp_pass = hypre_CTAlloc(HYPRE_BigInt, Pext_recv_size, HYPRE_MEMORY_HOST);
780 }
781 old_Pext_recv_size = Pext_recv_size;
782
783 comm_handle = hypre_ParCSRCommHandleCreate (21, tmp_comm_pkg,
784 Pext_send_buffer, big_temp_pass);
785 hypre_ParCSRCommHandleDestroy(comm_handle);
786 }
787
788 cnt_new = 0;
789 cnt_offd = 0;
790 /* JBS: Possible candidate for threading */
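      /* Translate received global column ids into local ids: ids owned by this
       * processor are encoded as negative values (-k2-1 refers to local coarse column
       * k2), ids already seen in an earlier pass are located in new_elmts by binary
       * search, and genuinely new ids are collected in new_elmts[pass-1] with their
       * positions stored in loc so they can be numbered after sorting. */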
791 for (i=0; i < num_recvs; i++)
792 {
793 for (j=recv_vec_start[i]; j < recv_vec_start[i+1]; j++)
794 {
795 if (assigned_offd[j] == pass-1)
796 {
797 for (j1 = cnt_offd; j1 < cnt_offd+Pext_i[j+1]; j1++)
798 {
799 big_k1 = big_temp_pass[j1];
800 k2 = (HYPRE_Int)(big_k1 - my_first_cpt);
801 if (k2 > -1 && k2 < n_coarse)
802 { Pext_pass[pass][j1] = -k2-1; }
803 else
804 {
805 not_found = 1;
806 k3 = 0;
807 while (k3 < pass-1 && not_found)
808 {
809 k2 = hypre_BigBinarySearch(new_elmts[k3], big_k1,
810 (new_counter[k3+1]-new_counter[k3]));
811 if (k2 > -1)
812 {
813 Pext_pass[pass][j1] = k2 + new_counter[k3];
814 not_found = 0;
815 }
816 else
817 {
818 k3++;
819 }
820 }
821 if (not_found)
822 {
823 new_elmts[pass-1][cnt_new] = big_k1;
824 loc[cnt_new++] = j1;
825 }
826 }
827 }
828 cnt_offd += Pext_i[j+1];
829 }
830 }
831 }
832
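      /* Sort the newly encountered global ids, remove duplicates, and assign them
       * consecutive local indices starting at new_counter[pass-1]; loc[] gives the
       * positions in Pext_pass[pass] that receive these new local indices. */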
833 if (cnt_new)
834 {
835 hypre_BigQsortbi(new_elmts[pass-1],loc,0,cnt_new-1);
836 cnt = 0;
837 local_index = new_counter[pass-1];
838 Pext_pass[pass][loc[0]] = local_index;
839
840 for (i=1; i < cnt_new; i++)
841 {
842 if (new_elmts[pass-1][i] > new_elmts[pass-1][cnt])
843 {
844 new_elmts[pass-1][++cnt] = new_elmts[pass-1][i];
845 local_index++;
846 }
847 Pext_pass[pass][loc[i]] = local_index;
848 }
849 new_counter[pass] = local_index+1;
850 }
851 else if (num_procs > 1)
852 new_counter[pass] = new_counter[pass-1];
853
854 if (new_num_cols_offd < local_index+1)
855 { new_num_cols_offd = local_index+1; }
856
857 pass_length = pass_pointer[pass+1] - pass_pointer[pass];
858 #ifdef HYPRE_USING_OPENMP
859 #pragma omp parallel private(i,my_thread_num,num_threads,thread_start,thread_stop,cnt_nz,cnt_nz_offd,i1,j,j1,j_start,j_end,k1,k,P_marker,P_marker_offd)
860 #endif
861 {
862 /* Thread by computing the sparsity structure for this pass only over
863 * each thread's range of rows. Rows are divided up evenly amongst
864 * the threads. The necessary thread-wise temporary arrays, like
865 * P_marker, are initialized and de-allocated internally to the
866 * parallel region. */
867
868 my_thread_num = hypre_GetThreadNum();
869 num_threads = hypre_NumActiveThreads();
870 thread_start = (pass_length/num_threads)*my_thread_num;
871 if (my_thread_num == num_threads-1)
872 { thread_stop = pass_length; }
873 else
874 { thread_stop = (pass_length/num_threads)*(my_thread_num+1); }
875 thread_start += pass_pointer[pass];
876 thread_stop += pass_pointer[pass];
877
878 /* Local initializations */
879 cnt_nz = 0;
880 cnt_nz_offd = 0;
881
882          /* Note: this allocation block could be moved to the top of a parallel
883           * region opened before the loop over num_passes. */
884 P_marker = hypre_CTAlloc(HYPRE_Int, n_coarse, HYPRE_MEMORY_HOST); /* marks points to see if they're counted */
885 for (i=0; i < n_coarse; i++)
886 { P_marker[i] = -1; }
887 if (new_num_cols_offd == local_index+1)
888 {
889 P_marker_offd = hypre_CTAlloc(HYPRE_Int, new_num_cols_offd, HYPRE_MEMORY_HOST);
890 for (i=0; i < new_num_cols_offd; i++)
891 { P_marker_offd[i] = -1; }
892 }
893 else if (n_coarse_offd)
894 {
895 P_marker_offd = hypre_CTAlloc(HYPRE_Int, n_coarse_offd, HYPRE_MEMORY_HOST);
896 for (i=0; i < n_coarse_offd; i++)
897 { P_marker_offd[i] = -1; }
898 }
899
900
901          /* We need variables to store each thread's cnt_nz and cnt_nz_offd, and
902           * then stitch things together as in par_interp.c.
903           * This loop writes
904           * P_diag_i, P_offd_i: data parallel here, requiring no special treatment
905           * P_diag_start, P_offd_start: not data parallel, requiring special treatment
906           */
907 for (i=thread_start; i < thread_stop; i++)
908 {
909 i1 = pass_array[i];
910 P_diag_start[i1] = cnt_nz;
911 P_offd_start[i1] = cnt_nz_offd;
912 for (j=S_diag_i[i1]; j < S_diag_i[i1+1]; j++)
913 {
914 j1 = S_diag_j[j];
915 if (assigned[j1] == pass-1)
916 {
917 j_start = P_diag_start[j1];
918 j_end = j_start+P_diag_i[j1+1];
919 for (k=j_start; k < j_end; k++)
920 {
921 k1 = P_diag_pass[pass-1][k];
922 if (P_marker[k1] != i1)
923 {
924 cnt_nz++;
925 P_diag_i[i1+1]++;
926 P_marker[k1] = i1;
927 }
928 }
929 j_start = P_offd_start[j1];
930 j_end = j_start+P_offd_i[j1+1];
931 for (k=j_start; k < j_end; k++)
932 {
933 k1 = P_offd_pass[pass-1][k];
934 if (P_marker_offd[k1] != i1)
935 {
936 cnt_nz_offd++;
937 P_offd_i[i1+1]++;
938 P_marker_offd[k1] = i1;
939 }
940 }
941 }
942 }
943 j_start = 0;
944 for (j=S_offd_i[i1]; j < S_offd_i[i1+1]; j++)
945 {
946 j1 = S_offd_j[j];
947 if (assigned_offd[j1] == pass-1)
948 {
949 j_start = Pext_start[j1];
950 j_end = j_start+Pext_i[j1+1];
951 for (k=j_start; k < j_end; k++)
952 {
953 k1 = Pext_pass[pass][k];
954 if (k1 < 0)
955 {
956 if (P_marker[-k1-1] != i1)
957 {
958 cnt_nz++;
959 P_diag_i[i1+1]++;
960 P_marker[-k1-1] = i1;
961 }
962 }
963 else if (P_marker_offd[k1] != i1)
964 {
965 cnt_nz_offd++;
966 P_offd_i[i1+1]++;
967 P_marker_offd[k1] = i1;
968 }
969 }
970 }
971 }
972 }
973
974 /* Update P_diag_start, P_offd_start with cumulative
975 * nonzero counts over all threads */
976 if(my_thread_num == 0)
977 { max_num_threads[0] = num_threads; }
978 cnt_nz_offd_per_thread[my_thread_num] = cnt_nz_offd;
979 cnt_nz_per_thread[my_thread_num] = cnt_nz;
980 #ifdef HYPRE_USING_OPENMP
981 #pragma omp barrier
982 #endif
983 if(my_thread_num == 0)
984 {
985 for(i = 1; i < max_num_threads[0]; i++)
986 {
987 cnt_nz_offd_per_thread[i] += cnt_nz_offd_per_thread[i-1];
988 cnt_nz_per_thread[i] += cnt_nz_per_thread[i-1];
989 }
990 }
991 #ifdef HYPRE_USING_OPENMP
992 #pragma omp barrier
993 #endif
994 if(my_thread_num > 0)
995 {
996 /* update this thread's section of P_diag_start and P_offd_start
997 * with the num of nz's counted by previous threads */
998 for (i=thread_start; i < thread_stop; i++)
999 {
1000 i1 = pass_array[i];
1001 P_diag_start[i1] += cnt_nz_per_thread[my_thread_num-1];
1002 P_offd_start[i1] += cnt_nz_offd_per_thread[my_thread_num-1];
1003 }
1004 }
1005 else /* if my_thread_num == 0 */
1006 {
1007 /* Grab the nz count for all threads */
1008 cnt_nz = cnt_nz_per_thread[max_num_threads[0]-1];
1009 cnt_nz_offd = cnt_nz_offd_per_thread[max_num_threads[0]-1];
1010
1011             /* Update the total nz count */
1012 total_nz += cnt_nz;
1013 total_nz_offd += cnt_nz_offd;
1014
1015 /* Allocate P_diag_pass and P_offd_pass for all threads */
1016 P_diag_pass[pass] = hypre_CTAlloc(HYPRE_Int, cnt_nz, HYPRE_MEMORY_HOST);
1017 if (cnt_nz_offd)
1018 P_offd_pass[pass] = hypre_CTAlloc(HYPRE_Int, cnt_nz_offd, HYPRE_MEMORY_HOST);
1019 else if (num_procs > 1)
1020 P_offd_pass[pass] = NULL;
1021 }
1022 #ifdef HYPRE_USING_OPENMP
1023 #pragma omp barrier
1024 #endif
1025
1026 /* offset cnt_nz and cnt_nz_offd to point to the starting
1027 * point in P_diag_pass and P_offd_pass for each thread */
1028 if(my_thread_num > 0)
1029 {
1030 cnt_nz = cnt_nz_per_thread[my_thread_num-1];
1031 cnt_nz_offd = cnt_nz_offd_per_thread[my_thread_num-1];
1032 }
1033 else
1034 {
1035 cnt_nz = 0;
1036 cnt_nz_offd = 0;
1037 }
1038
1039 /* Set P_diag_pass and P_offd_pass */
1040 for (i=thread_start; i < thread_stop; i++)
1041 {
1042 i1 = pass_array[i];
1043 for (j=S_diag_i[i1]; j < S_diag_i[i1+1]; j++)
1044 {
1045 j1 = S_diag_j[j];
1046 if (assigned[j1] == pass-1)
1047 {
1048 j_start = P_diag_start[j1];
1049 j_end = j_start+P_diag_i[j1+1];
1050 for (k=j_start; k < j_end; k++)
1051 {
1052 k1 = P_diag_pass[pass-1][k];
1053 if (P_marker[k1] != -i1-1)
1054 {
1055 P_diag_pass[pass][cnt_nz++] = k1;
1056 P_marker[k1] = -i1-1;
1057 }
1058 }
1059 j_start = P_offd_start[j1];
1060 j_end = j_start+P_offd_i[j1+1];
1061 for (k=j_start; k < j_end; k++)
1062 {
1063 k1 = P_offd_pass[pass-1][k];
1064 if (P_marker_offd[k1] != -i1-1)
1065 {
1066 P_offd_pass[pass][cnt_nz_offd++] = k1;
1067 P_marker_offd[k1] = -i1-1;
1068 }
1069 }
1070 }
1071 }
1072 for (j=S_offd_i[i1]; j < S_offd_i[i1+1]; j++)
1073 {
1074 j1 = S_offd_j[j];
1075 if (assigned_offd[j1] == pass-1)
1076 {
1077 j_start = Pext_start[j1];
1078 j_end = j_start+Pext_i[j1+1];
1079 for (k=j_start; k < j_end; k++)
1080 {
1081 k1 = Pext_pass[pass][k];
1082 if (k1 < 0)
1083 {
1084 if (P_marker[-k1-1] != -i1-1)
1085 {
1086 P_diag_pass[pass][cnt_nz++] = -k1-1;
1087 P_marker[-k1-1] = -i1-1;
1088 }
1089 }
1090 else if (P_marker_offd[k1] != -i1-1)
1091 {
1092 P_offd_pass[pass][cnt_nz_offd++] = k1;
1093 P_marker_offd[k1] = -i1-1;
1094 }
1095 }
1096 }
1097 }
1098 }
1099
1100 hypre_TFree(P_marker, HYPRE_MEMORY_HOST);
1101 if ( (n_coarse_offd) || (new_num_cols_offd == local_index+1) )
1102 { hypre_TFree(P_marker_offd, HYPRE_MEMORY_HOST); }
1103
1104 } /* End parallel region */
1105 }
1106
1107
1108 hypre_TFree(loc, HYPRE_MEMORY_HOST);
1109 hypre_TFree(P_ncols, HYPRE_MEMORY_HOST);
1110 hypre_TFree(Pext_send_buffer, HYPRE_MEMORY_HOST);
1111 hypre_TFree(big_temp_pass, HYPRE_MEMORY_HOST);
1112 hypre_TFree(new_recv_vec_start, HYPRE_MEMORY_HOST);
1113 hypre_TFree(cnt_nz_per_thread, HYPRE_MEMORY_HOST);
1114 hypre_TFree(cnt_nz_offd_per_thread, HYPRE_MEMORY_HOST);
1115 hypre_TFree(max_num_threads, HYPRE_MEMORY_HOST);
1116
1117 P_diag_j = hypre_CTAlloc(HYPRE_Int, total_nz, HYPRE_MEMORY_HOST);
1118 P_diag_data = hypre_CTAlloc(HYPRE_Real, total_nz, HYPRE_MEMORY_HOST);
1119
1120
1121 if (total_nz_offd)
1122 {
1123 P_offd_j = hypre_CTAlloc(HYPRE_Int, total_nz_offd, HYPRE_MEMORY_HOST);
1124 P_offd_data = hypre_CTAlloc(HYPRE_Real, total_nz_offd, HYPRE_MEMORY_HOST);
1125 }
1126
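   /* Convert the per-row nonzero counts accumulated in P_diag_i and P_offd_i into
    * CSR row pointers by a running sum. */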
1127 for (i=0; i < n_fine; i++)
1128 {
1129 P_diag_i[i+1] += P_diag_i[i];
1130 P_offd_i[i+1] += P_offd_i[i];
1131 }
1132
1133 /* determine P for coarse points */
1134
1135 #ifdef HYPRE_USING_OPENMP
1136 #pragma omp parallel for private(i,i1) HYPRE_SMP_SCHEDULE
1137 #endif
1138 for (i=0; i < n_coarse; i++)
1139 {
1140 i1 = C_array[i];
1141 P_diag_j[P_diag_i[i1]] = fine_to_coarse[i1];
1142 P_diag_data[P_diag_i[i1]] = 1.0;
1143 }
1144
1145
1146    if (weight_option) /* if this is set, weights are separated into
1147                          negative and positive off-diagonal entries and
1148                          accumulated accordingly */
1149 {
1150
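      /* Pass-1 weights follow direct interpolation with sign separation: the entries
       * a_ij of interpolatory C neighbors are copied into P and then scaled by
       *    alfa = -sum_N_neg / (sum_C_neg * a_ii)   for negative entries,
       *    beta = -sum_N_pos / (sum_C_pos * a_ii)   for positive entries,
       * where sum_N_* sums the corresponding off-diagonal entries of row i1 and
       * sum_C_* sums only those belonging to interpolatory C points. */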
1151 pass_length = pass_pointer[2]-pass_pointer[1];
1152 #ifdef HYPRE_USING_OPENMP
1153 #pragma omp parallel private(thread_start,thread_stop,my_thread_num,num_threads,P_marker,P_marker_offd,i,i1,sum_C_pos,sum_C_neg,sum_N_pos,sum_N_neg,j_start,j_end,j,k1,cnt,j1,cnt_offd,diagonal,alfa,beta)
1154 #endif
1155 {
1156 /* Sparsity structure is now finished. Next, calculate interpolation
1157 * weights for pass one. Thread by computing the interpolation
1158 * weights only over each thread's range of rows. Rows are divided
1159 * up evenly amongst the threads. */
1160
1161 P_marker = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_HOST);
1162 for (i=0; i < n_fine; i++)
1163 { P_marker[i] = -1; }
1164 if (num_cols_offd)
1165 {
1166 P_marker_offd = hypre_CTAlloc(HYPRE_Int, num_cols_offd, HYPRE_MEMORY_HOST);
1167 for (i=0; i < num_cols_offd; i++)
1168 P_marker_offd[i] = -1;
1169 }
1170
1171 /* Compute this thread's range of pass_length */
1172 my_thread_num = hypre_GetThreadNum();
1173 num_threads = hypre_NumActiveThreads();
1174 thread_start = pass_pointer[1] + (pass_length/num_threads)*my_thread_num;
1175 if (my_thread_num == num_threads-1)
1176 { thread_stop = pass_pointer[1] + pass_length; }
1177 else
1178 { thread_stop = pass_pointer[1] + (pass_length/num_threads)*(my_thread_num+1); }
1179
1180 /* determine P for points of pass 1, i.e. neighbors of coarse points */
1181 for (i=thread_start; i < thread_stop; i++)
1182 {
1183 i1 = pass_array[i];
1184 sum_C_pos = 0;
1185 sum_C_neg = 0;
1186 sum_N_pos = 0;
1187 sum_N_neg = 0;
1188 j_start = P_diag_start[i1];
1189 j_end = j_start+P_diag_i[i1+1]-P_diag_i[i1];
1190 for (j=j_start; j < j_end; j++)
1191 {
1192 k1 = P_diag_pass[1][j];
1193 P_marker[C_array[k1]] = i1;
1194 }
1195 cnt = P_diag_i[i1];
1196 for (j=A_diag_i[i1]+1; j < A_diag_i[i1+1]; j++)
1197 {
1198 j1 = A_diag_j[j];
1199 if (CF_marker[j1] != -3 &&
1200 (num_functions == 1 || dof_func[i1] == dof_func[j1]))
1201 {
1202 if (A_diag_data[j] < 0)
1203 sum_N_neg += A_diag_data[j];
1204 else
1205 sum_N_pos += A_diag_data[j];
1206 }
1207 if (j1 != -1 && P_marker[j1] == i1)
1208 {
1209 P_diag_data[cnt] = A_diag_data[j];
1210 P_diag_j[cnt++] = fine_to_coarse[j1];
1211 if (A_diag_data[j] < 0)
1212 sum_C_neg += A_diag_data[j];
1213 else
1214 sum_C_pos += A_diag_data[j];
1215 }
1216 }
1217 j_start = P_offd_start[i1];
1218 j_end = j_start+P_offd_i[i1+1]-P_offd_i[i1];
1219 for (j=j_start; j < j_end; j++)
1220 {
1221 k1 = P_offd_pass[1][j];
1222 P_marker_offd[C_array_offd[k1]] = i1;
1223 }
1224 cnt_offd = P_offd_i[i1];
1225 for (j=A_offd_i[i1]; j < A_offd_i[i1+1]; j++)
1226 {
1227 j1 = A_offd_j[j];
1228 if (CF_marker_offd[j1] != -3 &&
1229 (num_functions == 1 || dof_func[i1] == dof_func_offd[j1]))
1230 {
1231 if (A_offd_data[j] < 0)
1232 sum_N_neg += A_offd_data[j];
1233 else
1234 sum_N_pos += A_offd_data[j];
1235 }
1236 if (j1 != -1 && P_marker_offd[j1] == i1)
1237 {
1238 P_offd_data[cnt_offd] = A_offd_data[j];
1239 P_offd_j[cnt_offd++] = map_S_to_new[j1];
1240 if (A_offd_data[j] < 0)
1241 sum_C_neg += A_offd_data[j];
1242 else
1243 sum_C_pos += A_offd_data[j];
1244 }
1245 }
1246 diagonal = A_diag_data[A_diag_i[i1]];
1247 if (sum_C_neg*diagonal != 0) alfa = -sum_N_neg/(sum_C_neg*diagonal);
1248 if (sum_C_pos*diagonal != 0) beta = -sum_N_pos/(sum_C_pos*diagonal);
1249 for (j=P_diag_i[i1]; j < cnt; j++)
1250 if (P_diag_data[j] < 0)
1251 P_diag_data[j] *= alfa;
1252 else
1253 P_diag_data[j] *= beta;
1254 for (j=P_offd_i[i1]; j < cnt_offd; j++)
1255 if (P_offd_data[j] < 0)
1256 P_offd_data[j] *= alfa;
1257 else
1258 P_offd_data[j] *= beta;
1259 }
1260
1261 hypre_TFree(P_marker, HYPRE_MEMORY_HOST);
1262 if (num_cols_offd)
1263 { hypre_TFree(P_marker_offd, HYPRE_MEMORY_HOST); }
1264 } /* End Parallel Region */
1265
1266 old_Pext_send_size = 0;
1267 old_Pext_recv_size = 0;
1268
1269 if (n_coarse) hypre_TFree(C_array, HYPRE_MEMORY_HOST);
1270 hypre_TFree(C_array_offd, HYPRE_MEMORY_HOST);
1271 hypre_TFree(P_diag_pass[1], HYPRE_MEMORY_HOST);
1272 if (num_procs > 1) hypre_TFree(P_offd_pass[1], HYPRE_MEMORY_HOST);
1273
1274
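      /* Passes >= 2: weights are built indirectly.  For every strong neighbor j1
       * assigned to the previous pass, the products A(i1,j1) * P(j1,:) are accumulated
       * into row i1 of P (using either the local P rows or the received Pext rows),
       * and the row is then rescaled with the same sign-separated alfa/beta formula
       * as in pass 1. */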
1275 for (pass = 2; pass < num_passes; pass++)
1276 {
1277
1278 if (num_procs > 1)
1279 {
1280 Pext_send_size = Pext_send_map_start[pass][num_sends];
1281 if (Pext_send_size > old_Pext_send_size)
1282 {
1283 hypre_TFree(Pext_send_data, HYPRE_MEMORY_HOST);
1284 Pext_send_data = hypre_CTAlloc(HYPRE_Real, Pext_send_size, HYPRE_MEMORY_HOST);
1285 }
1286 old_Pext_send_size = Pext_send_size;
1287
1288 cnt_offd = 0;
1289 for (i=0; i < num_sends; i++)
1290 {
1291 for (j=send_map_start[i]; j < send_map_start[i+1]; j++)
1292 {
1293 j1 = send_map_elmt[j];
1294 if (assigned[j1] == pass-1)
1295 {
1296 j_start = P_diag_i[j1];
1297 j_end = P_diag_i[j1+1];
1298 for (k=j_start; k < j_end; k++)
1299 { Pext_send_data[cnt_offd++] = P_diag_data[k]; }
1300 j_start = P_offd_i[j1];
1301 j_end = P_offd_i[j1+1];
1302 for (k=j_start; k < j_end; k++)
1303 { Pext_send_data[cnt_offd++] = P_offd_data[k]; }
1304 }
1305 }
1306 }
1307
1308 hypre_ParCSRCommPkgNumSends(tmp_comm_pkg) = num_sends;
1309 hypre_ParCSRCommPkgSendMapStarts(tmp_comm_pkg) =
1310 Pext_send_map_start[pass];
1311 hypre_ParCSRCommPkgNumRecvs(tmp_comm_pkg) = num_recvs;
1312 hypre_ParCSRCommPkgRecvVecStarts(tmp_comm_pkg) =
1313 Pext_recv_vec_start[pass];
1314
1315 Pext_recv_size = Pext_recv_vec_start[pass][num_recvs];
1316
1317 if (Pext_recv_size > old_Pext_recv_size)
1318 {
1319 hypre_TFree(Pext_data, HYPRE_MEMORY_HOST);
1320 Pext_data = hypre_CTAlloc(HYPRE_Real, Pext_recv_size, HYPRE_MEMORY_HOST);
1321 }
1322 old_Pext_recv_size = Pext_recv_size;
1323
1324 comm_handle = hypre_ParCSRCommHandleCreate (1, tmp_comm_pkg,
1325 Pext_send_data, Pext_data);
1326 hypre_ParCSRCommHandleDestroy(comm_handle);
1327
1328 hypre_TFree(Pext_send_map_start[pass], HYPRE_MEMORY_HOST);
1329 hypre_TFree(Pext_recv_vec_start[pass], HYPRE_MEMORY_HOST);
1330 }
1331
1332 pass_length = pass_pointer[pass+1]-pass_pointer[pass];
1333 #ifdef HYPRE_USING_OPENMP
1334 #pragma omp parallel private(thread_start,thread_stop,my_thread_num,num_threads,P_marker,P_marker_offd,i,i1,sum_C_neg,sum_C_pos,sum_N_neg,sum_N_pos,j_start,j_end,cnt,j,k1,cnt_offd,j1,k,alfa,beta,diagonal,C_array,C_array_offd)
1335 #endif
1336 {
1337 /* Sparsity structure is now finished. Next, calculate interpolation
1338 * weights for passes >= 2. Thread by computing the interpolation
1339 * weights only over each thread's range of rows. Rows are divided
1340 * up evenly amongst the threads. */
1341
1342 P_marker = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_HOST);
1343 for (i=0; i < n_fine; i++)
1344 { P_marker[i] = -1; }
1345 if (num_cols_offd)
1346 {
1347 P_marker_offd = hypre_CTAlloc(HYPRE_Int, num_cols_offd, HYPRE_MEMORY_HOST);
1348 for (i=0; i < num_cols_offd; i++)
1349 P_marker_offd[i] = -1;
1350 }
1351
1352 C_array = NULL;
1353 C_array_offd = NULL;
1354 if (n_coarse)
1355 { C_array = hypre_CTAlloc(HYPRE_Int, n_coarse, HYPRE_MEMORY_HOST); }
1356 if (new_num_cols_offd > n_coarse_offd)
1357 { C_array_offd = hypre_CTAlloc(HYPRE_Int, new_num_cols_offd, HYPRE_MEMORY_HOST); }
1358 else if (n_coarse_offd)
1359 { C_array_offd = hypre_CTAlloc(HYPRE_Int, n_coarse_offd, HYPRE_MEMORY_HOST); }
1360
1361 /* Compute this thread's range of pass_length */
1362 my_thread_num = hypre_GetThreadNum();
1363 num_threads = hypre_NumActiveThreads();
1364 thread_start = pass_pointer[pass] + (pass_length/num_threads)*my_thread_num;
1365 if (my_thread_num == num_threads-1)
1366 { thread_stop = pass_pointer[pass] + pass_length; }
1367 else
1368 { thread_stop = pass_pointer[pass] + (pass_length/num_threads)*(my_thread_num+1); }
1369
1370 /* Loop over each thread's row-range */
1371 for (i=thread_start; i < thread_stop; i++)
1372 {
1373 i1 = pass_array[i];
1374 sum_C_neg = 0;
1375 sum_C_pos = 0;
1376 sum_N_neg = 0;
1377 sum_N_pos = 0;
1378 j_start = P_diag_start[i1];
1379 j_end = j_start+P_diag_i[i1+1]-P_diag_i[i1];
1380 cnt = P_diag_i[i1];
1381 for (j=j_start; j < j_end; j++)
1382 {
1383 k1 = P_diag_pass[pass][j];
1384 C_array[k1] = cnt;
1385 P_diag_data[cnt] = 0;
1386 P_diag_j[cnt++] = k1;
1387 }
1388 j_start = P_offd_start[i1];
1389 j_end = j_start+P_offd_i[i1+1]-P_offd_i[i1];
1390 cnt_offd = P_offd_i[i1];
1391 for (j=j_start; j < j_end; j++)
1392 {
1393 k1 = P_offd_pass[pass][j];
1394 C_array_offd[k1] = cnt_offd;
1395 P_offd_data[cnt_offd] = 0;
1396 P_offd_j[cnt_offd++] = k1;
1397 }
1398 for (j=S_diag_i[i1]; j < S_diag_i[i1+1]; j++)
1399 {
1400 j1 = S_diag_j[j];
1401 if (assigned[j1] == pass-1)
1402 P_marker[j1] = i1;
1403 }
1404 for (j=S_offd_i[i1]; j < S_offd_i[i1+1]; j++)
1405 {
1406 j1 = S_offd_j[j];
1407 if (assigned_offd[j1] == pass-1)
1408 P_marker_offd[j1] = i1;
1409 }
1410 for (j=A_diag_i[i1]+1; j < A_diag_i[i1+1]; j++)
1411 {
1412 j1 = A_diag_j[j];
1413 if (P_marker[j1] == i1)
1414 {
1415 for (k=P_diag_i[j1]; k < P_diag_i[j1+1]; k++)
1416 {
1417 k1 = P_diag_j[k];
1418 alfa = A_diag_data[j]*P_diag_data[k];
1419 P_diag_data[C_array[k1]] += alfa;
1420 if (alfa < 0)
1421 {
1422 sum_C_neg += alfa;
1423 sum_N_neg += alfa;
1424 }
1425 else
1426 {
1427 sum_C_pos += alfa;
1428 sum_N_pos += alfa;
1429 }
1430 }
1431 for (k=P_offd_i[j1]; k < P_offd_i[j1+1]; k++)
1432 {
1433 k1 = P_offd_j[k];
1434 alfa = A_diag_data[j]*P_offd_data[k];
1435 P_offd_data[C_array_offd[k1]] += alfa;
1436 if (alfa < 0)
1437 {
1438 sum_C_neg += alfa;
1439 sum_N_neg += alfa;
1440 }
1441 else
1442 {
1443 sum_C_pos += alfa;
1444 sum_N_pos += alfa;
1445 }
1446 }
1447 }
1448 else
1449 {
1450 if (CF_marker[j1] != -3 &&
1451 (num_functions == 1 || dof_func[i1] == dof_func[j1]))
1452 {
1453 if (A_diag_data[j] < 0)
1454 sum_N_neg += A_diag_data[j];
1455 else
1456 sum_N_pos += A_diag_data[j];
1457 }
1458 }
1459 }
1460 for (j=A_offd_i[i1]; j < A_offd_i[i1+1]; j++)
1461 {
1462 j1 = A_offd_j[j];
1463
1464 if (j1 > -1 && P_marker_offd[j1] == i1)
1465 {
1466 j_start = Pext_start[j1];
1467 j_end = j_start+Pext_i[j1+1];
1468 for (k=j_start; k < j_end; k++)
1469 {
1470 k1 = Pext_pass[pass][k];
1471 alfa = A_offd_data[j]*Pext_data[k];
1472 if (k1 < 0)
1473 P_diag_data[C_array[-k1-1]] += alfa;
1474 else
1475 P_offd_data[C_array_offd[k1]] += alfa;
1476 if (alfa < 0)
1477 {
1478 sum_C_neg += alfa;
1479 sum_N_neg += alfa;
1480 }
1481 else
1482 {
1483 sum_C_pos += alfa;
1484 sum_N_pos += alfa;
1485 }
1486 }
1487 }
1488 else
1489 {
1490 if (CF_marker_offd[j1] != -3 &&
1491 (num_functions == 1 || dof_func_offd[j1] == dof_func[i1]))
1492 {
1493 if ( A_offd_data[j] < 0)
1494 sum_N_neg += A_offd_data[j];
1495 else
1496 sum_N_pos += A_offd_data[j];
1497 }
1498 }
1499 }
1500 diagonal = A_diag_data[A_diag_i[i1]];
1501 if (sum_C_neg*diagonal != 0) alfa = -sum_N_neg/(sum_C_neg*diagonal);
1502 if (sum_C_pos*diagonal != 0) beta = -sum_N_pos/(sum_C_pos*diagonal);
1503
1504 for (j=P_diag_i[i1]; j < P_diag_i[i1+1]; j++)
1505 if (P_diag_data[j] < 0)
1506 P_diag_data[j] *= alfa;
1507 else
1508 P_diag_data[j] *= beta;
1509 for (j=P_offd_i[i1]; j < P_offd_i[i1+1]; j++)
1510 if (P_offd_data[j] < 0)
1511 P_offd_data[j] *= alfa;
1512 else
1513 P_offd_data[j] *= beta;
1514 }
1515
1516 hypre_TFree(C_array, HYPRE_MEMORY_HOST);
1517 hypre_TFree(C_array_offd, HYPRE_MEMORY_HOST);
1518 hypre_TFree(P_marker, HYPRE_MEMORY_HOST);
1519 if (num_cols_offd)
1520 { hypre_TFree(P_marker_offd, HYPRE_MEMORY_HOST); }
1521
1522 } /* End OMP Parallel Section */
1523
1524 hypre_TFree(P_diag_pass[pass], HYPRE_MEMORY_HOST);
1525 if (num_procs > 1)
1526 {
1527 hypre_TFree(P_offd_pass[pass], HYPRE_MEMORY_HOST);
1528 hypre_TFree(Pext_pass[pass], HYPRE_MEMORY_HOST);
1529 }
1530 } /* End num_passes for-loop */
1531 }
1532    else /* no distinction between positive and negative off-diagonal elements */
1533 {
1534
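      /* Same sweep structure as the weight_option branch above, but positive and
       * negative entries are not separated: each row is simply rescaled by
       * alfa = -sum_N / (sum_C * a_ii). */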
1535 pass_length = pass_pointer[2]-pass_pointer[1];
1536 #ifdef HYPRE_USING_OPENMP
1537 #pragma omp parallel private(thread_start,thread_stop,my_thread_num,num_threads,k,k1,i,i1,j,j1,sum_C,sum_N,j_start,j_end,cnt,tmp_marker,tmp_marker_offd,cnt_offd,diagonal,alfa)
1538 #endif
1539 {
1540 /* Sparsity structure is now finished. Next, calculate interpolation
1541 * weights for pass one. Thread by computing the interpolation
1542 * weights only over each thread's range of rows. Rows are divided
1543 * up evenly amongst the threads. */
1544
1545 /* Initialize thread-wise variables */
1546 tmp_marker = NULL;
1547 if (n_fine)
1548 { tmp_marker = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_HOST); }
1549 tmp_marker_offd = NULL;
1550 if (num_cols_offd)
1551 { tmp_marker_offd = hypre_CTAlloc(HYPRE_Int, num_cols_offd, HYPRE_MEMORY_HOST); }
1552 for (i=0; i < n_fine; i++)
1553 { tmp_marker[i] = -1; }
1554 for (i=0; i < num_cols_offd; i++)
1555 { tmp_marker_offd[i] = -1; }
1556
1557 /* Compute this thread's range of pass_length */
1558 my_thread_num = hypre_GetThreadNum();
1559 num_threads = hypre_NumActiveThreads();
1560 thread_start = pass_pointer[1] + (pass_length/num_threads)*my_thread_num;
1561 if (my_thread_num == num_threads-1)
1562 { thread_stop = pass_pointer[1] + pass_length; }
1563 else
1564 { thread_stop = pass_pointer[1] + (pass_length/num_threads)*(my_thread_num+1); }
1565
1566 /* determine P for points of pass 1, i.e. neighbors of coarse points */
1567 for (i=thread_start; i < thread_stop; i++)
1568 {
1569 i1 = pass_array[i];
1570 sum_C = 0;
1571 sum_N = 0;
1572 j_start = P_diag_start[i1];
1573 j_end = j_start+P_diag_i[i1+1]-P_diag_i[i1];
1574 for (j=j_start; j < j_end; j++)
1575 {
1576 k1 = P_diag_pass[1][j];
1577 tmp_marker[C_array[k1]] = i1;
1578 }
1579 cnt = P_diag_i[i1];
1580 for (j=A_diag_i[i1]+1; j < A_diag_i[i1+1]; j++)
1581 {
1582 j1 = A_diag_j[j];
1583 if (CF_marker[j1] != -3 &&
1584 (num_functions == 1 || dof_func[i1] == dof_func[j1]))
1585 sum_N += A_diag_data[j];
1586 if (j1 != -1 && tmp_marker[j1] == i1)
1587 {
1588 P_diag_data[cnt] = A_diag_data[j];
1589 P_diag_j[cnt++] = fine_to_coarse[j1];
1590 sum_C += A_diag_data[j];
1591 }
1592 }
1593 j_start = P_offd_start[i1];
1594 j_end = j_start+P_offd_i[i1+1]-P_offd_i[i1];
1595 for (j=j_start; j < j_end; j++)
1596 {
1597 k1 = P_offd_pass[1][j];
1598 tmp_marker_offd[C_array_offd[k1]] = i1;
1599 }
1600 cnt_offd = P_offd_i[i1];
1601 for (j=A_offd_i[i1]; j < A_offd_i[i1+1]; j++)
1602 {
1603 j1 = A_offd_j[j];
1604 if (CF_marker_offd[j1] != -3 &&
1605 (num_functions == 1 || dof_func[i1] == dof_func_offd[j1]))
1606 sum_N += A_offd_data[j];
1607 if (j1 != -1 && tmp_marker_offd[j1] == i1)
1608 {
1609 P_offd_data[cnt_offd] = A_offd_data[j];
1610 P_offd_j[cnt_offd++] = map_S_to_new[j1];
1611 sum_C += A_offd_data[j];
1612 }
1613 }
1614 diagonal = A_diag_data[A_diag_i[i1]];
1615 if (sum_C*diagonal != 0) alfa = -sum_N/(sum_C*diagonal);
1616 for (j=P_diag_i[i1]; j < cnt; j++)
1617 P_diag_data[j] *= alfa;
1618 for (j=P_offd_i[i1]; j < cnt_offd; j++)
1619 P_offd_data[j] *= alfa;
1620 }
1621 hypre_TFree(tmp_marker, HYPRE_MEMORY_HOST);
1622 hypre_TFree(tmp_marker_offd, HYPRE_MEMORY_HOST);
1623 } /* end OMP parallel region */
1624
1625 old_Pext_send_size = 0;
1626 old_Pext_recv_size = 0;
1627
1628 if (n_coarse) hypre_TFree(C_array, HYPRE_MEMORY_HOST);
1629 hypre_TFree(C_array_offd, HYPRE_MEMORY_HOST);
1630 hypre_TFree(P_diag_pass[1], HYPRE_MEMORY_HOST);
1631 if (num_procs > 1) hypre_TFree(P_offd_pass[1], HYPRE_MEMORY_HOST);
1632
1633 for (pass = 2; pass < num_passes; pass++)
1634 {
1635
1636 if (num_procs > 1)
1637 {
1638 Pext_send_size = Pext_send_map_start[pass][num_sends];
1639 if (Pext_send_size > old_Pext_send_size)
1640 {
1641 hypre_TFree(Pext_send_data, HYPRE_MEMORY_HOST);
1642 Pext_send_data = hypre_CTAlloc(HYPRE_Real, Pext_send_size, HYPRE_MEMORY_HOST);
1643 }
1644 old_Pext_send_size = Pext_send_size;
1645
1646 cnt_offd = 0;
1647 for (i=0; i < num_sends; i++)
1648 {
1649 for (j=send_map_start[i]; j < send_map_start[i+1]; j++)
1650 {
1651 j1 = send_map_elmt[j];
1652 if (assigned[j1] == pass-1)
1653 {
1654 j_start = P_diag_i[j1];
1655 j_end = P_diag_i[j1+1];
1656 for (k=j_start; k < j_end; k++)
1657 {
1658 Pext_send_data[cnt_offd++] = P_diag_data[k];
1659 }
1660 j_start = P_offd_i[j1];
1661 j_end = P_offd_i[j1+1];
1662 for (k=j_start; k < j_end; k++)
1663 {
1664 Pext_send_data[cnt_offd++] = P_offd_data[k];
1665 }
1666 }
1667 }
1668 }
1669
1670 hypre_ParCSRCommPkgNumSends(tmp_comm_pkg) = num_sends;
1671 hypre_ParCSRCommPkgSendMapStarts(tmp_comm_pkg) =
1672 Pext_send_map_start[pass];
1673 hypre_ParCSRCommPkgNumRecvs(tmp_comm_pkg) = num_recvs;
1674 hypre_ParCSRCommPkgRecvVecStarts(tmp_comm_pkg) =
1675 Pext_recv_vec_start[pass];
1676
1677 Pext_recv_size = Pext_recv_vec_start[pass][num_recvs];
1678
1679 if (Pext_recv_size > old_Pext_recv_size)
1680 {
1681 hypre_TFree(Pext_data, HYPRE_MEMORY_HOST);
1682 Pext_data = hypre_CTAlloc(HYPRE_Real, Pext_recv_size, HYPRE_MEMORY_HOST);
1683 }
1684 old_Pext_recv_size = Pext_recv_size;
1685
1686 comm_handle = hypre_ParCSRCommHandleCreate (1, tmp_comm_pkg,
1687 Pext_send_data, Pext_data);
1688 hypre_ParCSRCommHandleDestroy(comm_handle);
1689
1690 hypre_TFree(Pext_send_map_start[pass], HYPRE_MEMORY_HOST);
1691 hypre_TFree(Pext_recv_vec_start[pass], HYPRE_MEMORY_HOST);
1692 }
1693
1694 pass_length = pass_pointer[pass+1]-pass_pointer[pass];
1695 #ifdef HYPRE_USING_OPENMP
1696 #pragma omp parallel private(thread_start,thread_stop,my_thread_num,num_threads,k,k1,i,i1,j,j1,sum_C,sum_N,j_start,j_end,cnt,tmp_marker,tmp_marker_offd,cnt_offd,diagonal,alfa,tmp_array,tmp_array_offd)
1697 #endif
1698 {
1699 /* Sparsity structure is now finished. Next, calculate interpolation
1700 * weights for passes >= 2. Thread by computing the interpolation
1701 * weights only over each thread's range of rows. Rows are divided
1702 * up evenly amongst the threads. */
1703
1704 /* Initialize thread-wise variables */
1705 tmp_marker = NULL;
1706 if (n_fine)
1707 { tmp_marker = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_HOST); }
1708 tmp_marker_offd = NULL;
1709 if (num_cols_offd)
1710 { tmp_marker_offd = hypre_CTAlloc(HYPRE_Int, num_cols_offd, HYPRE_MEMORY_HOST); }
1711 tmp_array = NULL;
1712 if (n_coarse)
1713 { tmp_array = hypre_CTAlloc(HYPRE_Int, n_coarse, HYPRE_MEMORY_HOST); }
1714 tmp_array_offd = NULL;
1715 if (new_num_cols_offd > n_coarse_offd)
1716 { tmp_array_offd = hypre_CTAlloc(HYPRE_Int, new_num_cols_offd, HYPRE_MEMORY_HOST); }
1717 else
1718 { tmp_array_offd = hypre_CTAlloc(HYPRE_Int, n_coarse_offd, HYPRE_MEMORY_HOST);}
1719 for (i=0; i < n_fine; i++)
1720 { tmp_marker[i] = -1; }
1721 for (i=0; i < num_cols_offd; i++)
1722 { tmp_marker_offd[i] = -1; }
1723
1724 /* Compute this thread's range of pass_length */
1725 my_thread_num = hypre_GetThreadNum();
1726 num_threads = hypre_NumActiveThreads();
1727 thread_start = pass_pointer[pass] + (pass_length/num_threads)*my_thread_num;
1728 if (my_thread_num == num_threads-1)
1729 { thread_stop = pass_pointer[pass] + pass_length; }
1730 else
1731 { thread_stop = pass_pointer[pass] + (pass_length/num_threads)*(my_thread_num+1); }
1732
1733 for (i=thread_start; i < thread_stop; i++)
1734 {
1735 i1 = pass_array[i];
1736 sum_C = 0;
1737 sum_N = 0;
1738 j_start = P_diag_start[i1];
1739 j_end = j_start+P_diag_i[i1+1]-P_diag_i[i1];
1740 cnt = P_diag_i[i1];
1741 for (j=j_start; j < j_end; j++)
1742 {
1743 k1 = P_diag_pass[pass][j];
1744 tmp_array[k1] = cnt;
1745 P_diag_data[cnt] = 0;
1746 P_diag_j[cnt++] = k1;
1747 }
1748 j_start = P_offd_start[i1];
1749 j_end = j_start+P_offd_i[i1+1]-P_offd_i[i1];
1750 cnt_offd = P_offd_i[i1];
1751 for (j=j_start; j < j_end; j++)
1752 {
1753 k1 = P_offd_pass[pass][j];
1754 tmp_array_offd[k1] = cnt_offd;
1755 P_offd_data[cnt_offd] = 0;
1756 P_offd_j[cnt_offd++] = k1;
1757 }
1758 for (j=S_diag_i[i1]; j < S_diag_i[i1+1]; j++)
1759 {
1760 j1 = S_diag_j[j];
1761 if (assigned[j1] == pass-1)
1762 tmp_marker[j1] = i1;
1763 }
1764 for (j=S_offd_i[i1]; j < S_offd_i[i1+1]; j++)
1765 {
1766 j1 = S_offd_j[j];
1767 if (assigned_offd[j1] == pass-1)
1768 tmp_marker_offd[j1] = i1;
1769 }
1770 for (j=A_diag_i[i1]+1; j < A_diag_i[i1+1]; j++)
1771 {
1772 j1 = A_diag_j[j];
1773 if (tmp_marker[j1] == i1)
1774 {
1775 for (k=P_diag_i[j1]; k < P_diag_i[j1+1]; k++)
1776 {
1777 k1 = P_diag_j[k];
1778 alfa = A_diag_data[j]*P_diag_data[k];
1779 P_diag_data[tmp_array[k1]] += alfa;
1780 sum_C += alfa;
1781 sum_N += alfa;
1782 }
1783 for (k=P_offd_i[j1]; k < P_offd_i[j1+1]; k++)
1784 {
1785 k1 = P_offd_j[k];
1786 alfa = A_diag_data[j]*P_offd_data[k];
1787 P_offd_data[tmp_array_offd[k1]] += alfa;
1788 sum_C += alfa;
1789 sum_N += alfa;
1790 }
1791 }
1792 else
1793 {
1794 if (CF_marker[j1] != -3 &&
1795 (num_functions == 1 || dof_func[i1] == dof_func[j1]))
1796 sum_N += A_diag_data[j];
1797 }
1798 }
1799 for (j=A_offd_i[i1]; j < A_offd_i[i1+1]; j++)
1800 {
1801 j1 = A_offd_j[j];
1802
1803 if (j1 > -1 && tmp_marker_offd[j1] == i1)
1804 {
1805 j_start = Pext_start[j1];
1806 j_end = j_start+Pext_i[j1+1];
1807 for (k=j_start; k < j_end; k++)
1808 {
1809 k1 = Pext_pass[pass][k];
1810 alfa = A_offd_data[j]*Pext_data[k];
1811 if (k1 < 0)
1812 P_diag_data[tmp_array[-k1-1]] += alfa;
1813 else
1814 P_offd_data[tmp_array_offd[k1]] += alfa;
1815 sum_C += alfa;
1816 sum_N += alfa;
1817 }
1818 }
1819 else
1820 {
1821 if (CF_marker_offd[j1] != -3 &&
1822 (num_functions == 1 || dof_func_offd[j1] == dof_func[i1]))
1823 sum_N += A_offd_data[j];
1824 }
1825 }
1826 diagonal = A_diag_data[A_diag_i[i1]];
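            /* Scale the accumulated weights by alfa = -sum_N / (a_ii * sum_C):
               sum_C sums the contributions routed through the interpolatory points
               of row i1, while sum_N additionally includes its remaining couplings. */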
1827 if (sum_C*diagonal != 0.0) alfa = -sum_N/(sum_C*diagonal);
1828
1829 for (j=P_diag_i[i1]; j < P_diag_i[i1+1]; j++)
1830 P_diag_data[j] *= alfa;
1831 for (j=P_offd_i[i1]; j < P_offd_i[i1+1]; j++)
1832 P_offd_data[j] *= alfa;
1833 }
1834 hypre_TFree(tmp_marker, HYPRE_MEMORY_HOST);
1835 hypre_TFree(tmp_marker_offd, HYPRE_MEMORY_HOST);
1836 hypre_TFree(tmp_array, HYPRE_MEMORY_HOST);
1837 hypre_TFree(tmp_array_offd, HYPRE_MEMORY_HOST);
1838 } /* End OMP Parallel Section */
1839
1840 hypre_TFree(P_diag_pass[pass], HYPRE_MEMORY_HOST);
1841 if (num_procs > 1)
1842 {
1843 hypre_TFree(P_offd_pass[pass], HYPRE_MEMORY_HOST);
1844 hypre_TFree(Pext_pass[pass], HYPRE_MEMORY_HOST);
1845 }
1846 }
1847 }
1848
1849 hypre_TFree(CF_marker_offd, HYPRE_MEMORY_HOST);
1850 hypre_TFree(Pext_send_map_start, HYPRE_MEMORY_HOST);
1851 hypre_TFree(Pext_recv_vec_start, HYPRE_MEMORY_HOST);
1852 hypre_TFree(dof_func_offd, HYPRE_MEMORY_HOST);
1853 hypre_TFree(Pext_send_data, HYPRE_MEMORY_HOST);
1854 hypre_TFree(Pext_data, HYPRE_MEMORY_HOST);
1855 hypre_TFree(P_diag_pass, HYPRE_MEMORY_HOST);
1856 hypre_TFree(P_offd_pass, HYPRE_MEMORY_HOST);
1857 hypre_TFree(Pext_pass, HYPRE_MEMORY_HOST);
1858 hypre_TFree(P_diag_start, HYPRE_MEMORY_HOST);
1859 hypre_TFree(P_offd_start, HYPRE_MEMORY_HOST);
1860 hypre_TFree(Pext_start, HYPRE_MEMORY_HOST);
1861 hypre_TFree(Pext_i, HYPRE_MEMORY_HOST);
1862 hypre_TFree(fine_to_coarse, HYPRE_MEMORY_HOST);
1863 hypre_TFree(assigned, HYPRE_MEMORY_HOST);
1864 hypre_TFree(assigned_offd, HYPRE_MEMORY_HOST);
1865 hypre_TFree(pass_pointer, HYPRE_MEMORY_HOST);
1866 hypre_TFree(pass_array, HYPRE_MEMORY_HOST);
1867 hypre_TFree(map_S_to_new, HYPRE_MEMORY_HOST);
1868 if (num_procs > 1) hypre_TFree(tmp_comm_pkg, HYPRE_MEMORY_HOST);
1869
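   /* Wrap the assembled diag/offd arrays in the ParCSR interpolation matrix P */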
1870 P = hypre_ParCSRMatrixCreate(comm,
1871 hypre_ParCSRMatrixGlobalNumRows(A),
1872 total_global_cpts,
1873 hypre_ParCSRMatrixColStarts(A),
1874 num_cpts_global,
1875 0,
1876 P_diag_i[n_fine],
1877 P_offd_i[n_fine]);
1878 P_diag = hypre_ParCSRMatrixDiag(P);
1879 hypre_CSRMatrixData(P_diag) = P_diag_data;
1880 hypre_CSRMatrixI(P_diag) = P_diag_i;
1881 hypre_CSRMatrixJ(P_diag) = P_diag_j;
1882 P_offd = hypre_ParCSRMatrixOffd(P);
1883 hypre_CSRMatrixData(P_offd) = P_offd_data;
1884 hypre_CSRMatrixI(P_offd) = P_offd_i;
1885 hypre_CSRMatrixJ(P_offd) = P_offd_j;
1886
1887    /* Compress P, removing coefficients smaller than trunc_factor * max
1888       and/or keeping at most <P_max_elmts> coefficients of largest absolute value per row */
1889
1890 if (trunc_factor != 0.0 || P_max_elmts != 0)
1891 {
1892 hypre_BoomerAMGInterpTruncation(P, trunc_factor, P_max_elmts);
1893 P_diag_data = hypre_CSRMatrixData(P_diag);
1894 P_diag_i = hypre_CSRMatrixI(P_diag);
1895 P_diag_j = hypre_CSRMatrixJ(P_diag);
1896 P_offd_data = hypre_CSRMatrixData(P_offd);
1897 P_offd_i = hypre_CSRMatrixI(P_offd);
1898 P_offd_j = hypre_CSRMatrixJ(P_offd);
1899 }
1900 P_offd_size = P_offd_i[n_fine];
1901
1902 num_cols_offd_P = 0;
1903 if (P_offd_size)
1904 {
1905 if (new_num_cols_offd > num_cols_offd)
1906 { P_marker_offd = hypre_CTAlloc(HYPRE_Int, new_num_cols_offd, HYPRE_MEMORY_HOST); }
1907 else
1908 { P_marker_offd = hypre_CTAlloc(HYPRE_Int, num_cols_offd, HYPRE_MEMORY_HOST); }
1909 #ifdef HYPRE_USING_OPENMP
1910 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
1911 #endif
1912 for (i=0; i < new_num_cols_offd; i++)
1913 { P_marker_offd[i] = 0; }
1914
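      /* Count the distinct off-processor coarse columns actually referenced by
         P_offd after truncation */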
1915 num_cols_offd_P = 0;
1916 for (i=0; i < P_offd_size; i++)
1917 {
1918 index = P_offd_j[i];
1919 if (!P_marker_offd[index])
1920 {
1921 num_cols_offd_P++;
1922 P_marker_offd[index] = 1;
1923 }
1924 }
1925
1926 col_map_offd_P = hypre_CTAlloc(HYPRE_BigInt, num_cols_offd_P, HYPRE_MEMORY_HOST);
1927 permute = hypre_CTAlloc(HYPRE_Int, new_counter[num_passes-1], HYPRE_MEMORY_HOST);
1928 big_permute = hypre_CTAlloc(HYPRE_BigInt, new_counter[num_passes-1], HYPRE_MEMORY_HOST);
1929
1930 #ifdef HYPRE_USING_OPENMP
1931 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
1932 #endif
1933 for (i=0; i < new_counter[num_passes-1]; i++)
1934 big_permute[i] = -1;
1935
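      /* Gather, pass by pass, the global indices of the surviving off-processor
         columns into col_map_offd_P; big_permute records the global column for
         each expanded local index (-1 if the column was dropped). */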
1936 cnt = 0;
1937 for (i=0; i < num_passes-1; i++)
1938 {
1939 for (j=new_counter[i]; j < new_counter[i+1]; j++)
1940 {
1941 if (P_marker_offd[j])
1942 {
1943 col_map_offd_P[cnt] = new_elmts[i][j-(HYPRE_BigInt)new_counter[i]];
1944 big_permute[j] = col_map_offd_P[cnt++];
1945 }
1946 }
1947 }
1948
1949 hypre_BigQsort0(col_map_offd_P,0,num_cols_offd_P-1);
1950
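      /* With col_map_offd_P sorted, locate each surviving global column by binary
         search and renumber P_offd_j into the compressed local numbering. */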
1951 #ifdef HYPRE_USING_OPENMP
1952 #pragma omp parallel for private(i,big_k1) HYPRE_SMP_SCHEDULE
1953 #endif
1954 for (i=0; i < new_counter[num_passes-1]; i++)
1955 {
1956 big_k1 = big_permute[i];
1957 if (big_k1 != -1)
1958 permute[i] = hypre_BigBinarySearch(col_map_offd_P,big_k1,num_cols_offd_P);
1959 }
1960
1961 #ifdef HYPRE_USING_OPENMP
1962 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
1963 #endif
1964 for (i=0; i < P_offd_size; i++)
1965 { P_offd_j[i] = permute[P_offd_j[i]]; }
1966
1967 hypre_TFree(P_marker_offd, HYPRE_MEMORY_HOST);
1968 }
1969 if (num_procs > 1)
1970 {
1971 for (i=0; i < num_passes-1; i++)
1972 hypre_TFree(new_elmts[i], HYPRE_MEMORY_HOST);
1973 }
1974 hypre_TFree(permute, HYPRE_MEMORY_HOST);
1975 hypre_TFree(big_permute, HYPRE_MEMORY_HOST);
1976 hypre_TFree(new_elmts, HYPRE_MEMORY_HOST);
1977 hypre_TFree(new_counter, HYPRE_MEMORY_HOST);
1978
1979 if (num_cols_offd_P)
1980 {
1981 hypre_ParCSRMatrixColMapOffd(P) = col_map_offd_P;
1982 hypre_CSRMatrixNumCols(P_offd) = num_cols_offd_P;
1983 }
1984
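   /* F-points that were flagged with -3 (counted earlier in n_SF) are restored
      to ordinary F-points before returning */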
1985 if (n_SF)
1986 {
1987 #ifdef HYPRE_USING_OPENMP
1988 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
1989 #endif
1990 for (i=0; i < n_fine; i++)
1991 if (CF_marker[i] == -3) CF_marker[i] = -1;
1992 }
1993
1994 if (num_procs > 1)
1995 {
1996 hypre_MatvecCommPkgCreate(P);
1997 }
1998
1999 *P_ptr = P;
2000
2001 /* wall_time = hypre_MPI_Wtime() - wall_time;
2002 hypre_printf("TOTAL TIME %1.2e \n",wall_time); */
2003
2012 #ifdef HYPRE_PROFILE
2013 hypre_profile_times[HYPRE_TIMER_ID_MULTIPASS_INTERP] += hypre_MPI_Wtime();
2014 #endif
2015
2016 return(0);
2017 }
2018
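/*
 * Illustrative sketch (kept out of the build with #if 0): the marker/scatter
 * idiom used in the pass loop of hypre_BoomerAMGBuildMultipassHost above.  A
 * position array maps a column index to its slot in the row buffer so that
 * repeated contributions can be accumulated in place.  The function and
 * variable names below are hypothetical and exist only for this example.
 */
#if 0
static void
example_scatter_accumulate( HYPRE_Int   row_nnz,      /* nnz in the row pattern      */
                            HYPRE_Int  *row_cols,     /* columns of the row pattern  */
                            HYPRE_Int   ncontrib,     /* number of contributions     */
                            HYPRE_Int  *contrib_col,  /* column of each contribution */
                            HYPRE_Real *contrib_val,  /* value of each contribution  */
                            HYPRE_Int   num_cols,     /* size of the column space    */
                            HYPRE_Real *row_data )    /* output row values           */
{
   HYPRE_Int  i;
   HYPRE_Int *position = hypre_CTAlloc(HYPRE_Int, num_cols, HYPRE_MEMORY_HOST);

   /* record where each pattern column lives and clear the row values */
   for (i = 0; i < row_nnz; i++)
   {
      position[row_cols[i]] = i;
      row_data[i] = 0.0;
   }
   /* scatter-add every contribution into its slot */
   for (i = 0; i < ncontrib; i++)
   {
      row_data[position[contrib_col[i]]] += contrib_val[i];
   }
   hypre_TFree(position, HYPRE_MEMORY_HOST);
}
#endif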
2019 HYPRE_Int
2020 hypre_BoomerAMGBuildMultipass( hypre_ParCSRMatrix *A,
2021 HYPRE_Int *CF_marker,
2022 hypre_ParCSRMatrix *S,
2023 HYPRE_BigInt *num_cpts_global,
2024 HYPRE_Int num_functions,
2025 HYPRE_Int *dof_func,
2026 HYPRE_Int debug_flag,
2027 HYPRE_Real trunc_factor,
2028 HYPRE_Int P_max_elmts,
2029 HYPRE_Int weight_option,
2030 hypre_ParCSRMatrix **P_ptr )
2031 {
2032 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
2033 hypre_GpuProfilingPushRange("MultipassInterp");
2034 #endif
2035
2036 HYPRE_Int ierr = 0;
2037
2038 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
2039 HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy2( hypre_ParCSRMatrixMemoryLocation(A),
2040 hypre_ParCSRMatrixMemoryLocation(S) );
2041 if (exec == HYPRE_EXEC_DEVICE)
2042 {
2043 /* Notice: call the mod version on GPUs */
2044 ierr = hypre_BoomerAMGBuildModMultipassDevice( A, CF_marker, S, num_cpts_global,
2045 trunc_factor, P_max_elmts, 9,
2046 num_functions, dof_func,
2047 P_ptr );
2048 }
2049 else
2050 #endif
2051 {
2052 ierr = hypre_BoomerAMGBuildMultipassHost( A, CF_marker, S, num_cpts_global,
2053 num_functions, dof_func, debug_flag,
2054 trunc_factor, P_max_elmts, weight_option,
2055 P_ptr );
2056 }
2057
2058 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
2059 hypre_GpuProfilingPopRange();
2060 #endif
2061
2062 return ierr;
2063 }
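
/*
 * Usage sketch (kept out of the build with #if 0): how a caller might invoke
 * the multipass interpolation builder for a scalar problem.  A, S, CF_marker
 * and num_cpts_global are assumed to come from the standard BoomerAMG setup
 * (strength matrix + coarsening); the wrapper above dispatches to the device
 * or host implementation.  The function name and parameter choices here are
 * hypothetical.
 */
#if 0
static HYPRE_Int
example_build_multipass( hypre_ParCSRMatrix  *A,
                         hypre_ParCSRMatrix  *S,
                         HYPRE_Int           *CF_marker,
                         HYPRE_BigInt        *num_cpts_global,
                         hypre_ParCSRMatrix **P_ptr )
{
   /* scalar problem (num_functions = 1, dof_func = NULL), no debug output,
      truncate entries below 0.2 of the row maximum, keep at most 4 entries
      per row, default weight option */
   return hypre_BoomerAMGBuildMultipass(A, CF_marker, S, num_cpts_global,
                                        1, NULL, 0,
                                        0.2, 4, 0, P_ptr);
}
#endif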
2064
2065