1 /******************************************************************************
2  * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other
3  * HYPRE Project Developers. See the top-level COPYRIGHT file for details.
4  *
5  * SPDX-License-Identifier: (Apache-2.0 OR MIT)
6  ******************************************************************************/
7 
8 #include "_hypre_parcsr_ls.h"
9 
10 /*--------------------------------------------------------------------------
11  * hypre_ParAMGBuildMultipass
12  * This routine implements Stuben's direct interpolation with multiple passes.
13  *--------------------------------------------------------------------------*/
14 
15 HYPRE_Int
hypre_BoomerAMGBuildMultipassHost(hypre_ParCSRMatrix * A,HYPRE_Int * CF_marker,hypre_ParCSRMatrix * S,HYPRE_BigInt * num_cpts_global,HYPRE_Int num_functions,HYPRE_Int * dof_func,HYPRE_Int debug_flag,HYPRE_Real trunc_factor,HYPRE_Int P_max_elmts,HYPRE_Int weight_option,hypre_ParCSRMatrix ** P_ptr)16 hypre_BoomerAMGBuildMultipassHost( hypre_ParCSRMatrix  *A,
17                                    HYPRE_Int           *CF_marker,
18                                    hypre_ParCSRMatrix  *S,
19                                    HYPRE_BigInt        *num_cpts_global,
20                                    HYPRE_Int            num_functions,
21                                    HYPRE_Int           *dof_func,
22                                    HYPRE_Int            debug_flag,
23                                    HYPRE_Real           trunc_factor,
24                                    HYPRE_Int            P_max_elmts,
25                                    HYPRE_Int            weight_option,
26                                    hypre_ParCSRMatrix **P_ptr )
27 {
28 #ifdef HYPRE_PROFILE
29    hypre_profile_times[HYPRE_TIMER_ID_MULTIPASS_INTERP] -= hypre_MPI_Wtime();
30 #endif
31 
32    MPI_Comm                comm = hypre_ParCSRMatrixComm(A);
33    hypre_ParCSRCommPkg    *comm_pkg = hypre_ParCSRMatrixCommPkg(S);
34    hypre_ParCSRCommHandle *comm_handle;
35    hypre_ParCSRCommPkg    *tmp_comm_pkg;
36 
37    hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
38    HYPRE_Real      *A_diag_data = hypre_CSRMatrixData(A_diag);
39    HYPRE_Int       *A_diag_i = hypre_CSRMatrixI(A_diag);
40    HYPRE_Int       *A_diag_j = hypre_CSRMatrixJ(A_diag);
41 
42    hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A);
43    HYPRE_Real      *A_offd_data = NULL;
44    HYPRE_Int       *A_offd_i = hypre_CSRMatrixI(A_offd);
45    HYPRE_Int       *A_offd_j = NULL;
46    //HYPRE_BigInt    *col_map_offd_A = hypre_ParCSRMatrixColMapOffd(A);
47    HYPRE_Int        num_cols_offd_A = hypre_CSRMatrixNumCols(A_offd);
48 
49    hypre_CSRMatrix *S_diag = hypre_ParCSRMatrixDiag(S);
50    HYPRE_Int       *S_diag_i = hypre_CSRMatrixI(S_diag);
51    HYPRE_Int       *S_diag_j = hypre_CSRMatrixJ(S_diag);
52 
53    hypre_CSRMatrix *S_offd = hypre_ParCSRMatrixOffd(S);
54    HYPRE_Int       *S_offd_i = hypre_CSRMatrixI(S_offd);
55    HYPRE_Int       *S_offd_j = NULL;
56    /*HYPRE_BigInt    *col_map_offd_S = hypre_ParCSRMatrixColMapOffd(S);
57    HYPRE_Int        num_cols_offd_S = hypre_CSRMatrixNumCols(S_offd);
58    HYPRE_BigInt    *col_map_offd = NULL;*/
59    HYPRE_Int        num_cols_offd;
60 
61    hypre_ParCSRMatrix *P;
62    hypre_CSRMatrix *P_diag;
63    HYPRE_Real      *P_diag_data;
64    HYPRE_Int       *P_diag_i; /*at first counter of nonzero cols for each row,
65                                       finally will be pointer to start of row */
66    HYPRE_Int       *P_diag_j;
67 
68    hypre_CSRMatrix *P_offd;
69    HYPRE_Real      *P_offd_data = NULL;
70    HYPRE_Int       *P_offd_i; /*at first counter of nonzero cols for each row,
71                                       finally will be pointer to start of row */
72    HYPRE_Int       *P_offd_j = NULL;
73 
74    HYPRE_Int        num_sends = 0;
75    HYPRE_Int       *int_buf_data = NULL;
76    HYPRE_BigInt    *big_buf_data = NULL;
77    HYPRE_Int       *send_map_start;
78    HYPRE_Int       *send_map_elmt;
79    HYPRE_Int       *send_procs;
80    HYPRE_Int        num_recvs = 0;
81    HYPRE_Int       *recv_vec_start;
82    HYPRE_Int       *recv_procs;
83    HYPRE_Int       *new_recv_vec_start = NULL;
84    HYPRE_Int      **Pext_send_map_start = NULL;
85    HYPRE_Int      **Pext_recv_vec_start = NULL;
86    HYPRE_Int       *Pext_start = NULL;
87    HYPRE_Int       *P_ncols = NULL;
88 
89    HYPRE_Int       *CF_marker_offd = NULL;
90    HYPRE_Int       *dof_func_offd = NULL;
91    HYPRE_Int       *P_marker;
92    HYPRE_Int       *P_marker_offd = NULL;
93    HYPRE_Int       *C_array;
94    HYPRE_Int       *C_array_offd = NULL;
95    HYPRE_Int       *pass_array = NULL; /* contains points ordered according to pass */
96    HYPRE_Int       *pass_pointer = NULL; /* pass_pointer[j] contains pointer to first
97                                                   point of pass j contained in pass_array */
98    HYPRE_Int       *P_diag_start;
99    HYPRE_Int       *P_offd_start = NULL;
100    HYPRE_Int      **P_diag_pass;
101    HYPRE_Int      **P_offd_pass = NULL;
102    HYPRE_Int      **Pext_pass = NULL;
103    HYPRE_BigInt    *big_temp_pass = NULL;
104    HYPRE_BigInt   **new_elmts = NULL; /* new neighbors generated in each pass */
105    HYPRE_Int       *new_counter = NULL; /* contains no. of new neighbors for
106                                            each pass */
107    HYPRE_Int       *loc = NULL; /* contains locations for new neighbor
108                                    connections in int_o_buffer to avoid searching */
109    HYPRE_Int       *Pext_i = NULL; /*contains P_diag_i and P_offd_i info for nonzero
110                                      cols of off proc neighbors */
111    HYPRE_BigInt    *Pext_send_buffer = NULL; /* used to collect global nonzero
112                                                 col ids in P_diag for send_map_elmts */
113 
114    HYPRE_Int       *map_S_to_new = NULL;
115    HYPRE_BigInt    *new_col_map_offd = NULL;
116    HYPRE_BigInt    *col_map_offd_P = NULL;
117    HYPRE_Int       *permute = NULL;
118    HYPRE_BigInt    *big_permute = NULL;
119 
120    HYPRE_Int        cnt;
121    HYPRE_Int        cnt_nz;
122    HYPRE_Int        total_nz;
123    HYPRE_Int        pass;
124    HYPRE_Int        num_passes;
125    HYPRE_Int        max_num_passes = 10;
126 
127    HYPRE_Int        n_fine;
128    HYPRE_Int        n_coarse = 0;
129    HYPRE_Int        n_coarse_offd = 0;
130    HYPRE_Int        n_SF = 0;
131    HYPRE_Int        n_SF_offd = 0;
132 
133    HYPRE_Int       *fine_to_coarse = NULL;
134    HYPRE_BigInt    *fine_to_coarse_offd = NULL;
135 
136    HYPRE_Int       *assigned = NULL;
137    HYPRE_Int       *assigned_offd = NULL;
138 
139    HYPRE_Real      *Pext_send_data = NULL;
140    HYPRE_Real      *Pext_data = NULL;
141 
142    HYPRE_Real       sum_C, sum_N;
143    HYPRE_Real       sum_C_pos, sum_C_neg;
144    HYPRE_Real       sum_N_pos, sum_N_neg;
145    HYPRE_Real       diagonal;
146    HYPRE_Real       alfa = 1.0;
147    HYPRE_Real       beta = 1.0;
148    HYPRE_Int        j_start;
149    HYPRE_Int        j_end;
150 
151    HYPRE_Int        i,i1;
152    HYPRE_Int        j,j1;
153    HYPRE_Int        k,k1,k2,k3;
154    HYPRE_BigInt     big_k1;
155    HYPRE_Int        pass_array_size;
156    HYPRE_BigInt     global_pass_array_size;
157    HYPRE_BigInt     local_pass_array_size;
158    HYPRE_Int        my_id, num_procs;
159    HYPRE_Int        index, start;
160    HYPRE_BigInt     my_first_cpt;
161    HYPRE_BigInt     total_global_cpts;
162    HYPRE_Int        p_cnt;
163    HYPRE_Int        total_nz_offd;
164    HYPRE_Int        cnt_nz_offd;
165    HYPRE_Int        cnt_offd, cnt_new;
166    HYPRE_Int        no_break;
167    HYPRE_Int        not_found;
168    HYPRE_Int        Pext_send_size;
169    HYPRE_Int        Pext_recv_size;
170    HYPRE_Int        old_Pext_send_size;
171    HYPRE_Int        old_Pext_recv_size;
172    HYPRE_Int        P_offd_size = 0;
173    HYPRE_Int        local_index = -1;
174    HYPRE_Int        new_num_cols_offd = 0;
175    HYPRE_Int        num_cols_offd_P;
176 
177    /* Threading variables */
178    HYPRE_Int my_thread_num, num_threads, thread_start, thread_stop;
179    HYPRE_Int pass_length;
180    HYPRE_Int *tmp_marker, *tmp_marker_offd;
181    HYPRE_Int *tmp_array,  *tmp_array_offd;
182    HYPRE_Int * max_num_threads = hypre_CTAlloc(HYPRE_Int,  1, HYPRE_MEMORY_HOST);
183    HYPRE_Int * cnt_nz_per_thread;
184    HYPRE_Int * cnt_nz_offd_per_thread;
185 
186    /* HYPRE_Real wall_time;
187       wall_time = hypre_MPI_Wtime(); */
188 
189    /* Initialize threading variables */
190    max_num_threads[0] = hypre_NumThreads();
191    cnt_nz_per_thread = hypre_CTAlloc(HYPRE_Int,  max_num_threads[0], HYPRE_MEMORY_HOST);
192    cnt_nz_offd_per_thread = hypre_CTAlloc(HYPRE_Int,  max_num_threads[0], HYPRE_MEMORY_HOST);
193    for(i=0; i < max_num_threads[0]; i++)
194    {
195       cnt_nz_offd_per_thread[i] = 0;
196       cnt_nz_per_thread[i] = 0;
197    }
198 
199 
200    /*-----------------------------------------------------------------------
201     *  Access the CSR vectors for A and S. Also get size of fine grid.
202     *-----------------------------------------------------------------------*/
203 
204    hypre_MPI_Comm_size(comm,&num_procs);
205    hypre_MPI_Comm_rank(comm,&my_id);
206 
207    my_first_cpt = num_cpts_global[0];
208    /*   total_global_cpts = 0; */
209    if (my_id == (num_procs -1)) total_global_cpts = num_cpts_global[1];
210    hypre_MPI_Bcast(&total_global_cpts, 1, HYPRE_MPI_BIG_INT, num_procs-1, comm);
211 
212    if (!comm_pkg)
213    {
214       comm_pkg = hypre_ParCSRMatrixCommPkg(A);
215       if (!comm_pkg)
216       {
217          hypre_MatvecCommPkgCreate(A);
218 
219          comm_pkg = hypre_ParCSRMatrixCommPkg(A);
220       }
221    }
222 
223    //col_map_offd = col_map_offd_A;
224    num_cols_offd = num_cols_offd_A;
225 
226    if (num_cols_offd_A)
227    {
228       A_offd_data = hypre_CSRMatrixData(A_offd);
229       A_offd_j    = hypre_CSRMatrixJ(A_offd);
230    }
231 
232    if (num_cols_offd)
233       S_offd_j    = hypre_CSRMatrixJ(S_offd);
234 
235    n_fine = hypre_CSRMatrixNumRows(A_diag);
236 
237    /*-----------------------------------------------------------------------
238     *  Intialize counters and allocate mapping vector.
239     *-----------------------------------------------------------------------*/
240 
241    if (n_fine) fine_to_coarse = hypre_CTAlloc(HYPRE_Int,  n_fine, HYPRE_MEMORY_HOST);
242 
243    n_coarse = 0;
244    n_SF = 0;
245 #ifdef HYPRE_USING_OPENMP
246 #pragma omp parallel for private(i) reduction(+:n_coarse,n_SF ) HYPRE_SMP_SCHEDULE
247 #endif
248    for (i=0; i < n_fine; i++)
249       if (CF_marker[i] == 1) n_coarse++;
250       else if (CF_marker[i] == -3) n_SF++;
251 
252    pass_array_size = n_fine-n_coarse-n_SF;
253    if (pass_array_size) pass_array = hypre_CTAlloc(HYPRE_Int,  pass_array_size, HYPRE_MEMORY_HOST);
254    pass_pointer = hypre_CTAlloc(HYPRE_Int,  max_num_passes+1, HYPRE_MEMORY_HOST);
255    if (n_fine) assigned = hypre_CTAlloc(HYPRE_Int,  n_fine, HYPRE_MEMORY_HOST);
256    P_diag_i = hypre_CTAlloc(HYPRE_Int, n_fine+1, HYPRE_MEMORY_HOST);
257    P_offd_i = hypre_CTAlloc(HYPRE_Int, n_fine+1, HYPRE_MEMORY_HOST);
258    if (n_coarse) C_array = hypre_CTAlloc(HYPRE_Int,  n_coarse, HYPRE_MEMORY_HOST);
259 
260    if (num_cols_offd)
261    {
262       CF_marker_offd = hypre_CTAlloc(HYPRE_Int,  num_cols_offd, HYPRE_MEMORY_HOST);
263       if (num_functions > 1) dof_func_offd = hypre_CTAlloc(HYPRE_Int,  num_cols_offd, HYPRE_MEMORY_HOST);
264    }
265 
266    if (num_procs > 1)
267    {
268       num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
269       send_procs = hypre_ParCSRCommPkgSendProcs(comm_pkg);
270       send_map_start = hypre_ParCSRCommPkgSendMapStarts(comm_pkg);
271       send_map_elmt = hypre_ParCSRCommPkgSendMapElmts(comm_pkg);
272       num_recvs = hypre_ParCSRCommPkgNumRecvs(comm_pkg);
273       recv_procs = hypre_ParCSRCommPkgRecvProcs(comm_pkg);
274       recv_vec_start = hypre_ParCSRCommPkgRecvVecStarts(comm_pkg);
275       if (send_map_start[num_sends])
276       {
277          int_buf_data = hypre_CTAlloc(HYPRE_Int,  send_map_start[num_sends], HYPRE_MEMORY_HOST);
278          big_buf_data = hypre_CTAlloc(HYPRE_BigInt,  send_map_start[num_sends], HYPRE_MEMORY_HOST);
279       }
280    }
281 
282 
283    index = 0;
284    for (i=0; i < num_sends; i++)
285    {
286       start = send_map_start[i];
287       for (j = start; j < send_map_start[i+1]; j++)
288          int_buf_data[index++] = CF_marker[send_map_elmt[j]];
289    }
290    if (num_procs > 1)
291    {
292       comm_handle = hypre_ParCSRCommHandleCreate(11, comm_pkg, int_buf_data,
293                                                  CF_marker_offd);
294       hypre_ParCSRCommHandleDestroy(comm_handle);
295    }
296 
297    if (num_functions > 1)
298    {
299       index = 0;
300       for (i=0; i < num_sends; i++)
301       {
302          start = send_map_start[i];
303          for (j = start; j < send_map_start[i+1]; j++)
304             int_buf_data[index++] = dof_func[send_map_elmt[j]];
305       }
306       if (num_procs > 1)
307       {
308          comm_handle = hypre_ParCSRCommHandleCreate(11, comm_pkg, int_buf_data,
309                                                     dof_func_offd);
310          hypre_ParCSRCommHandleDestroy(comm_handle);
311       }
312    }
313 
314    n_coarse_offd = 0;
315    n_SF_offd = 0;
316 #ifdef HYPRE_USING_OPENMP
317 #pragma omp parallel for private(i) reduction(+:n_coarse_offd,n_SF_offd) HYPRE_SMP_SCHEDULE
318 #endif
319    for (i=0; i < num_cols_offd; i++)
320       if (CF_marker_offd[i] == 1) n_coarse_offd++;
321       else if (CF_marker_offd[i] == -3) n_SF_offd++;
322 
323    if (num_cols_offd)
324    {
325       assigned_offd = hypre_CTAlloc(HYPRE_Int,  num_cols_offd, HYPRE_MEMORY_HOST);
326       map_S_to_new = hypre_CTAlloc(HYPRE_Int,  num_cols_offd, HYPRE_MEMORY_HOST);
327       fine_to_coarse_offd = hypre_CTAlloc(HYPRE_BigInt,  num_cols_offd, HYPRE_MEMORY_HOST);
328       new_col_map_offd = hypre_CTAlloc(HYPRE_BigInt,  n_coarse_offd, HYPRE_MEMORY_HOST);
329    }
330 
331    /*-----------------------------------------------------------------------
332     *  First Pass: determine the maximal size of P, and elementsPerRow[i].
333     *-----------------------------------------------------------------------*/
334 
335    /*-----------------------------------------------------------------------
336     *  Assigned points are points for which we know an interpolation
337     *  formula already, and which are thus available to interpolate from.
338     *  assigned[i]=0 for C points, and 1, 2, 3, ... for F points, depending
339     *  in which pass their interpolation formula is determined.
340     *
341     *  pass_array contains the points ordered according to its pass, i.e.
342     *  |  C-points   |  points of pass 1 | points of pass 2 | ....
343     * C_points are points 0 through pass_pointer[1]-1,
344     * points of pass k  (0 < k < num_passes) are contained in points
345     * pass_pointer[k] through pass_pointer[k+1]-1 of pass_array .
346     *
347     * pass_array is also used to avoid going through all points for each pass,
348     * i,e. at the bginning it contains all points in descending order starting
349     * with n_fine-1. Then starting from the last point, we evaluate whether
350     * it is a C_point (pass 0). If it is the point is brought to the front
351     * and the length of the points to be searched is shortened.  This is
352     * done until the parameter cnt (which determines the first point of
353     * pass_array to be searched) becomes n_fine. Then all points have been
354     * assigned a pass number.
355     *-----------------------------------------------------------------------*/
356 
357 
358    cnt = 0;
359    p_cnt = pass_array_size-1;
360    P_diag_i[0] = 0;
361    P_offd_i[0] = 0;
362    for (i = 0; i < n_fine; i++)
363    {
364       if (CF_marker[i] == 1)
365       {
366          fine_to_coarse[i] = cnt; /* this C point is assigned index
367                                      coarse_counter on coarse grid,
368                                      and in column of P */
369          C_array[cnt++] = i;
370          assigned[i] = 0;
371          P_diag_i[i+1] = 1; /* one element in row i1 of P */
372          P_offd_i[i+1] = 0;
373       }
374       else if (CF_marker[i] == -1)
375       {
376          pass_array[p_cnt--] = i;
377          P_diag_i[i+1] = 0;
378          P_offd_i[i+1] = 0;
379          assigned[i] = -1;
380          fine_to_coarse[i] = -1;
381       }
382       else
383       {
384          P_diag_i[i+1] = 0;
385          P_offd_i[i+1] = 0;
386          assigned[i] = -1;
387          fine_to_coarse[i] = -1;
388       }
389    }
390 
391    index = 0;
392    for (i=0; i < num_sends; i++)
393    {
394       start = send_map_start[i];
395       for (j = start; j < send_map_start[i+1]; j++)
396       {
397          big_buf_data[index] = (HYPRE_BigInt)fine_to_coarse[send_map_elmt[j]];
398          if (big_buf_data[index] > -1)
399             big_buf_data[index] += my_first_cpt;
400          index++;
401       }
402    }
403    if (num_procs > 1)
404    {
405       comm_handle = hypre_ParCSRCommHandleCreate(21, comm_pkg, big_buf_data,
406                                                  fine_to_coarse_offd);
407       hypre_ParCSRCommHandleDestroy(comm_handle);
408    }
409 
410    new_recv_vec_start = hypre_CTAlloc(HYPRE_Int, num_recvs+1, HYPRE_MEMORY_HOST);
411 
412    if (n_coarse_offd)
413       C_array_offd = hypre_CTAlloc(HYPRE_Int, n_coarse_offd, HYPRE_MEMORY_HOST);
414 
415    cnt = 0;
416    new_recv_vec_start[0] = 0;
417    for (j = 0; j < num_recvs; j++)
418    {
419       for (i = recv_vec_start[j]; i < recv_vec_start[j+1]; i++)
420       {
421          if (CF_marker_offd[i] == 1)
422          {
423             map_S_to_new[i] = cnt;
424             C_array_offd[cnt] = i;
425             new_col_map_offd[cnt++] = fine_to_coarse_offd[i];
426             assigned_offd[i] = 0;
427          }
428          else
429          {
430             assigned_offd[i] = -1;
431             map_S_to_new[i] = -1;
432          }
433       }
434       new_recv_vec_start[j+1] = cnt;
435    }
436 
437    cnt = 0;
438    hypre_TFree(fine_to_coarse_offd, HYPRE_MEMORY_HOST);
439 
440    /*-----------------------------------------------------------------------
441     *  Mark all local neighbors of C points as 'assigned'.
442     *-----------------------------------------------------------------------*/
443 
444    pass_pointer[0] = 0;
445    pass_pointer[1] = 0;
446    total_nz = n_coarse;  /* accumulates total number of nonzeros in P_diag */
447    total_nz_offd = 0; /* accumulates total number of nonzeros in P_offd */
448 
449    cnt = 0;
450    cnt_offd = 0;
451    cnt_nz = 0;
452    cnt_nz_offd = 0;
453    for (i = pass_array_size-1; i > cnt-1; i--)
454    {
455       i1 = pass_array[i];
456       for (j=S_diag_i[i1]; j < S_diag_i[i1+1]; j++)
457       {
458          j1 = S_diag_j[j];
459          if (CF_marker[j1] == 1)
460          {
461             P_diag_i[i1+1]++;
462             cnt_nz++;
463             assigned[i1] = 1;
464          }
465       }
466       for (j=S_offd_i[i1]; j < S_offd_i[i1+1]; j++)
467       {
468          j1 = S_offd_j[j];
469          if (CF_marker_offd[j1] == 1)
470          {
471             P_offd_i[i1+1]++;
472             cnt_nz_offd++;
473             assigned[i1] = 1;
474          }
475       }
476       if (assigned[i1] == 1)
477       {
478          pass_array[i++] = pass_array[cnt];
479          pass_array[cnt++] = i1;
480       }
481    }
482 
483    pass_pointer[2] = cnt;
484 
485    /*-----------------------------------------------------------------------
486     *  All local neighbors are assigned, now need to exchange the boundary
487     *  info for assigned strong neighbors.
488     *-----------------------------------------------------------------------*/
489 
490    index = 0;
491    for (i=0; i < num_sends; i++)
492    {
493       start = send_map_start[i];
494       for (j = start; j < send_map_start[i+1]; j++)
495       {    int_buf_data[index++] = assigned[send_map_elmt[j]]; }
496    }
497    if (num_procs > 1)
498    {
499       comm_handle = hypre_ParCSRCommHandleCreate(11, comm_pkg, int_buf_data,
500                                                  assigned_offd);
501       hypre_ParCSRCommHandleDestroy(comm_handle);
502    }
503 
504    /*-----------------------------------------------------------------------
505     *  Now we need to determine strong neighbors of points of pass 1, etc.
506     *  we need to update assigned_offd after each pass
507     *-----------------------------------------------------------------------*/
508 
509    pass = 2;
510    local_pass_array_size = (HYPRE_BigInt)(pass_array_size - cnt);
511    hypre_MPI_Allreduce(&local_pass_array_size, &global_pass_array_size, 1, HYPRE_MPI_BIG_INT,
512                        hypre_MPI_SUM, comm);
513    while (global_pass_array_size && pass < max_num_passes)
514    {
515       for (i = pass_array_size-1; i > cnt-1; i--)
516       {
517          i1 = pass_array[i];
518          no_break = 1;
519          for (j=S_diag_i[i1]; j < S_diag_i[i1+1]; j++)
520          {
521             j1 = S_diag_j[j];
522             if (assigned[j1] == pass-1)
523             {
524                pass_array[i++] = pass_array[cnt];
525                pass_array[cnt++] = i1;
526                assigned[i1] = pass;
527                no_break = 0;
528                break;
529             }
530          }
531          if (no_break)
532          {
533             for (j=S_offd_i[i1]; j < S_offd_i[i1+1]; j++)
534             {
535                j1 = S_offd_j[j];
536                if (assigned_offd[j1] == pass-1)
537                {
538                   pass_array[i++] = pass_array[cnt];
539                   pass_array[cnt++] = i1;
540                   assigned[i1] = pass;
541                   break;
542                }
543             }
544          }
545       }
546       /*hypre_printf("pass %d  remaining points %d \n", pass, local_pass_array_size);*/
547 
548       pass++;
549       pass_pointer[pass] = cnt;
550 
551       local_pass_array_size = (HYPRE_BigInt)(pass_array_size - cnt);
552       hypre_MPI_Allreduce(&local_pass_array_size, &global_pass_array_size, 1, HYPRE_MPI_BIG_INT,
553                           hypre_MPI_SUM, comm);
554       index = 0;
555       for (i=0; i < num_sends; i++)
556       {
557          start = send_map_start[i];
558          for (j = start; j < send_map_start[i+1]; j++)
559          {   int_buf_data[index++] = assigned[send_map_elmt[j]]; }
560       }
561       if (num_procs > 1)
562       {
563          comm_handle = hypre_ParCSRCommHandleCreate(11, comm_pkg, int_buf_data,
564                                                     assigned_offd);
565          hypre_ParCSRCommHandleDestroy(comm_handle);
566       }
567    }
568 
569    hypre_TFree(int_buf_data, HYPRE_MEMORY_HOST);
570    hypre_TFree(big_buf_data, HYPRE_MEMORY_HOST);
571 
572    num_passes = pass;
573 
574    P_diag_pass = hypre_CTAlloc(HYPRE_Int*, num_passes, HYPRE_MEMORY_HOST); /* P_diag_pass[i] will contain
575                                                                               all column numbers for points of pass i */
576 
577    P_diag_pass[1] = hypre_CTAlloc(HYPRE_Int, cnt_nz, HYPRE_MEMORY_HOST);
578 
579    P_diag_start = hypre_CTAlloc(HYPRE_Int,  n_fine, HYPRE_MEMORY_HOST); /* P_diag_start[i] contains
580                                                                            pointer to begin of column numbers in P_pass for point i,
581                                                                            P_diag_i[i+1] contains number of columns for point i */
582 
583    P_offd_start = hypre_CTAlloc(HYPRE_Int,  n_fine, HYPRE_MEMORY_HOST);
584 
585    if (num_procs > 1)
586    {
587       P_offd_pass = hypre_CTAlloc(HYPRE_Int*, num_passes, HYPRE_MEMORY_HOST);
588 
589       if (cnt_nz_offd)
590          P_offd_pass[1] = hypre_CTAlloc(HYPRE_Int, cnt_nz_offd, HYPRE_MEMORY_HOST);
591       else
592          P_offd_pass[1] = NULL;
593 
594       new_elmts = hypre_CTAlloc(HYPRE_BigInt*, num_passes, HYPRE_MEMORY_HOST);
595 
596       new_counter = hypre_CTAlloc(HYPRE_Int,  num_passes+1, HYPRE_MEMORY_HOST);
597 
598       new_counter[0] = 0;
599       new_counter[1] = n_coarse_offd;
600       new_num_cols_offd = n_coarse_offd;
601 
602       new_elmts[0] = new_col_map_offd;
603    }
604 
605    /*-----------------------------------------------------------------------
606     *  Pass 1: now we consider points of pass 1, with strong C_neighbors,
607     *-----------------------------------------------------------------------*/
608 
609    cnt_nz = 0;
610    cnt_nz_offd = 0;
611    /* JBS: Possible candidate for threading */
612    for (i=pass_pointer[1]; i < pass_pointer[2]; i++)
613    {
614       i1 = pass_array[i];
615       P_diag_start[i1] = cnt_nz;
616       P_offd_start[i1] = cnt_nz_offd;
617       for (j=S_diag_i[i1]; j < S_diag_i[i1+1]; j++)
618       {
619          j1 = S_diag_j[j];
620          if (CF_marker[j1] == 1)
621          {   P_diag_pass[1][cnt_nz++] = fine_to_coarse[j1]; }
622       }
623       for (j=S_offd_i[i1]; j < S_offd_i[i1+1]; j++)
624       {
625          j1 = S_offd_j[j];
626          if (CF_marker_offd[j1] == 1)
627          {   P_offd_pass[1][cnt_nz_offd++] = map_S_to_new[j1]; }
628       }
629    }
630 
631 
632    total_nz += cnt_nz;
633    total_nz_offd += cnt_nz_offd;
634 
635    if (num_procs > 1)
636    {
637       tmp_comm_pkg = hypre_CTAlloc(hypre_ParCSRCommPkg, 1, HYPRE_MEMORY_HOST);
638       Pext_send_map_start = hypre_CTAlloc(HYPRE_Int*, num_passes, HYPRE_MEMORY_HOST);
639       Pext_recv_vec_start = hypre_CTAlloc(HYPRE_Int*, num_passes, HYPRE_MEMORY_HOST);
640       Pext_pass = hypre_CTAlloc(HYPRE_Int*, num_passes, HYPRE_MEMORY_HOST);
641       Pext_i = hypre_CTAlloc(HYPRE_Int,  num_cols_offd+1, HYPRE_MEMORY_HOST);
642       if (num_cols_offd) Pext_start = hypre_CTAlloc(HYPRE_Int,  num_cols_offd, HYPRE_MEMORY_HOST);
643       if (send_map_start[num_sends])
644          P_ncols = hypre_CTAlloc(HYPRE_Int, send_map_start[num_sends], HYPRE_MEMORY_HOST);
645 #ifdef HYPRE_USING_OPENMP
646 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
647 #endif
648       for (i=0; i < num_cols_offd+1; i++)
649       {   Pext_i[i] = 0; }
650 #ifdef HYPRE_USING_OPENMP
651 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
652 #endif
653       for (i=0; i < send_map_start[num_sends]; i++)
654       {   P_ncols[i] = 0; }
655    }
656 
657    old_Pext_send_size = 0;
658    old_Pext_recv_size = 0;
659    for (pass=2; pass < num_passes; pass++)
660    {
661 
662       if (num_procs > 1)
663       {
664          Pext_send_map_start[pass] = hypre_CTAlloc(HYPRE_Int,  num_sends+1, HYPRE_MEMORY_HOST);
665          Pext_recv_vec_start[pass] = hypre_CTAlloc(HYPRE_Int,  num_recvs+1, HYPRE_MEMORY_HOST);
666          Pext_send_size = 0;
667          Pext_send_map_start[pass][0] = 0;
668 
669          for (i=0; i < num_sends; i++)
670          {
671 #ifdef HYPRE_USING_OPENMP
672 #pragma omp parallel for private(j,j1) reduction(+:Pext_send_size) HYPRE_SMP_SCHEDULE
673 #endif
674             for (j=send_map_start[i]; j < send_map_start[i+1]; j++)
675             {
676                j1 = send_map_elmt[j];
677                if (assigned[j1] == pass-1)
678                {
679                   P_ncols[j] = P_diag_i[j1+1] + P_offd_i[j1+1];
680                   Pext_send_size += P_ncols[j];
681                }
682             }
683             Pext_send_map_start[pass][i+1] = Pext_send_size;
684          }
685 
686          comm_handle = hypre_ParCSRCommHandleCreate (11, comm_pkg,
687                                                      P_ncols, &Pext_i[1]);
688          hypre_ParCSRCommHandleDestroy(comm_handle);
689 
690          if (Pext_send_size > old_Pext_send_size)
691          {
692             hypre_TFree(Pext_send_buffer, HYPRE_MEMORY_HOST);
693             Pext_send_buffer = hypre_CTAlloc(HYPRE_BigInt,  Pext_send_size, HYPRE_MEMORY_HOST);
694          }
695          old_Pext_send_size = Pext_send_size;
696       }
697 
698       cnt_offd = 0;
699       for (i=0; i < num_sends; i++)
700       {
701          for (j=send_map_start[i]; j < send_map_start[i+1]; j++)
702          {
703             j1 = send_map_elmt[j];
704             if (assigned[j1] == pass-1)
705             {
706                j_start = P_diag_start[j1];
707                j_end = j_start+P_diag_i[j1+1];
708                for (k=j_start; k < j_end; k++)
709                {
710                   Pext_send_buffer[cnt_offd++] = my_first_cpt
711                      + (HYPRE_BigInt) P_diag_pass[pass-1][k];
712                }
713                j_start = P_offd_start[j1];
714                j_end = j_start+P_offd_i[j1+1];
715                for (k=j_start; k < j_end; k++)
716                {
717                   k1 = P_offd_pass[pass-1][k];
718                   k3 = 0;
719                   while (k3 < pass-1)
720                   {
721                      if (k1 < new_counter[k3+1])
722                      {
723                         k2 = k1-new_counter[k3];
724                         Pext_send_buffer[cnt_offd++] = new_elmts[k3][k2];
725                         break;
726                      }
727                      k3++;
728                   }
729                }
730             }
731          }
732       }
733 
734       if (num_procs > 1)
735       {
736          Pext_recv_size = 0;
737          Pext_recv_vec_start[pass][0] = 0;
738          cnt_offd = 0;
739          for (i=0; i < num_recvs; i++)
740          {
741             for (j=recv_vec_start[i]; j<recv_vec_start[i+1]; j++)
742             {
743                if (assigned_offd[j] == pass-1)
744                {
745                   Pext_start[j] = cnt_offd;
746                   cnt_offd += Pext_i[j+1];
747                }
748             }
749             Pext_recv_size = cnt_offd;
750             Pext_recv_vec_start[pass][i+1] = Pext_recv_size;
751          }
752 
753          hypre_ParCSRCommPkgComm(tmp_comm_pkg) = comm;
754          hypre_ParCSRCommPkgNumSends(tmp_comm_pkg) = num_sends;
755          hypre_ParCSRCommPkgSendProcs(tmp_comm_pkg) = send_procs;
756          hypre_ParCSRCommPkgSendMapStarts(tmp_comm_pkg) =
757             Pext_send_map_start[pass];
758          hypre_ParCSRCommPkgNumRecvs(tmp_comm_pkg) = num_recvs;
759          hypre_ParCSRCommPkgRecvProcs(tmp_comm_pkg) = recv_procs;
760          hypre_ParCSRCommPkgRecvVecStarts(tmp_comm_pkg) =
761             Pext_recv_vec_start[pass];
762 
763          if (Pext_recv_size)
764          {
765             Pext_pass[pass] = hypre_CTAlloc(HYPRE_Int,  Pext_recv_size, HYPRE_MEMORY_HOST);
766             new_elmts[pass-1] = hypre_CTAlloc(HYPRE_BigInt, Pext_recv_size, HYPRE_MEMORY_HOST);
767          }
768          else
769          {
770             Pext_pass[pass] = NULL;
771             new_elmts[pass-1] = NULL;
772          }
773 
774          if (Pext_recv_size > old_Pext_recv_size)
775          {
776             hypre_TFree(loc, HYPRE_MEMORY_HOST);
777             loc = hypre_CTAlloc(HYPRE_Int, Pext_recv_size, HYPRE_MEMORY_HOST);
778             hypre_TFree(big_temp_pass, HYPRE_MEMORY_HOST);
779             big_temp_pass = hypre_CTAlloc(HYPRE_BigInt, Pext_recv_size, HYPRE_MEMORY_HOST);
780          }
781          old_Pext_recv_size = Pext_recv_size;
782 
783          comm_handle = hypre_ParCSRCommHandleCreate (21, tmp_comm_pkg,
784                                                      Pext_send_buffer, big_temp_pass);
785          hypre_ParCSRCommHandleDestroy(comm_handle);
786       }
787 
788       cnt_new = 0;
789       cnt_offd = 0;
790       /* JBS: Possible candidate for threading */
791       for (i=0; i < num_recvs; i++)
792       {
793          for (j=recv_vec_start[i]; j < recv_vec_start[i+1]; j++)
794          {
795             if (assigned_offd[j] == pass-1)
796             {
797                for (j1 = cnt_offd; j1 < cnt_offd+Pext_i[j+1]; j1++)
798                {
799                   big_k1 = big_temp_pass[j1];
800                   k2 = (HYPRE_Int)(big_k1 - my_first_cpt);
801                   if (k2 > -1 && k2 < n_coarse)
802                   {  Pext_pass[pass][j1] = -k2-1; }
803                   else
804                   {
805                      not_found = 1;
806                      k3 = 0;
807                      while (k3 < pass-1 && not_found)
808                      {
809                         k2 = hypre_BigBinarySearch(new_elmts[k3], big_k1,
810                                                 (new_counter[k3+1]-new_counter[k3]));
811                         if (k2 > -1)
812                         {
813                            Pext_pass[pass][j1] = k2 + new_counter[k3];
814                            not_found = 0;
815                         }
816                         else
817                         {
818                            k3++;
819                         }
820                      }
821                      if (not_found)
822                      {
823                         new_elmts[pass-1][cnt_new] = big_k1;
824                         loc[cnt_new++] = j1;
825                      }
826                   }
827                }
828                cnt_offd += Pext_i[j+1];
829             }
830          }
831       }
832 
833       if (cnt_new)
834       {
835          hypre_BigQsortbi(new_elmts[pass-1],loc,0,cnt_new-1);
836          cnt = 0;
837          local_index = new_counter[pass-1];
838          Pext_pass[pass][loc[0]] = local_index;
839 
840          for (i=1; i < cnt_new; i++)
841          {
842             if (new_elmts[pass-1][i] > new_elmts[pass-1][cnt])
843             {
844                new_elmts[pass-1][++cnt] = new_elmts[pass-1][i];
845                local_index++;
846             }
847             Pext_pass[pass][loc[i]] = local_index;
848          }
849          new_counter[pass] = local_index+1;
850       }
851       else if (num_procs > 1)
852          new_counter[pass] = new_counter[pass-1];
853 
854       if (new_num_cols_offd < local_index+1)
855       {    new_num_cols_offd = local_index+1; }
856 
857       pass_length = pass_pointer[pass+1] - pass_pointer[pass];
858 #ifdef HYPRE_USING_OPENMP
859 #pragma omp parallel private(i,my_thread_num,num_threads,thread_start,thread_stop,cnt_nz,cnt_nz_offd,i1,j,j1,j_start,j_end,k1,k,P_marker,P_marker_offd)
860 #endif
861       {
862          /* Thread by computing the sparsity structure for this pass only over
863           * each thread's range of rows.  Rows are divided up evenly amongst
864           * the threads.  The necessary thread-wise temporary arrays, like
865           * P_marker, are initialized and de-allocated internally to the
866           * parallel region. */
867 
868          my_thread_num = hypre_GetThreadNum();
869          num_threads = hypre_NumActiveThreads();
870          thread_start = (pass_length/num_threads)*my_thread_num;
871          if (my_thread_num == num_threads-1)
872          {  thread_stop = pass_length; }
873          else
874          {  thread_stop = (pass_length/num_threads)*(my_thread_num+1); }
875          thread_start += pass_pointer[pass];
876          thread_stop += pass_pointer[pass];
877 
878          /* Local initializations */
879          cnt_nz = 0;
880          cnt_nz_offd = 0;
881 
882          /* This block of code is to go to the top of the parallel region starting before
883           * the loop over num_passes. */
884          P_marker = hypre_CTAlloc(HYPRE_Int,  n_coarse, HYPRE_MEMORY_HOST); /* marks points to see if they're counted */
885          for (i=0; i < n_coarse; i++)
886          {   P_marker[i] = -1; }
887          if (new_num_cols_offd == local_index+1)
888          {
889             P_marker_offd = hypre_CTAlloc(HYPRE_Int, new_num_cols_offd, HYPRE_MEMORY_HOST);
890             for (i=0; i < new_num_cols_offd; i++)
891             {   P_marker_offd[i] = -1; }
892          }
893          else if (n_coarse_offd)
894          {
895             P_marker_offd = hypre_CTAlloc(HYPRE_Int,  n_coarse_offd, HYPRE_MEMORY_HOST);
896             for (i=0; i < n_coarse_offd; i++)
897             {   P_marker_offd[i] = -1; }
898          }
899 
900 
901          /* Need some variables to store each threads cnt_nz and cnt_nz_offd, and
902           * then stitch things together as in par_interp.c
903           * This loop writes
904           * P_diag_i, P_offd_i: data parallel here, and require no special treatment
905           * P_diag_start, P_offd_start: are not data parallel, require special treatment
906           */
907          for (i=thread_start; i < thread_stop; i++)
908          {
909             i1 = pass_array[i];
910             P_diag_start[i1] = cnt_nz;
911             P_offd_start[i1] = cnt_nz_offd;
912             for (j=S_diag_i[i1]; j < S_diag_i[i1+1]; j++)
913             {
914                j1 = S_diag_j[j];
915                if (assigned[j1] == pass-1)
916                {
917                   j_start = P_diag_start[j1];
918                   j_end = j_start+P_diag_i[j1+1];
919                   for (k=j_start; k < j_end; k++)
920                   {
921                      k1 = P_diag_pass[pass-1][k];
922                      if (P_marker[k1] != i1)
923                      {
924                         cnt_nz++;
925                         P_diag_i[i1+1]++;
926                         P_marker[k1] = i1;
927                      }
928                   }
929                   j_start = P_offd_start[j1];
930                   j_end = j_start+P_offd_i[j1+1];
931                   for (k=j_start; k < j_end; k++)
932                   {
933                      k1 = P_offd_pass[pass-1][k];
934                      if (P_marker_offd[k1] != i1)
935                      {
936                         cnt_nz_offd++;
937                         P_offd_i[i1+1]++;
938                         P_marker_offd[k1] = i1;
939                      }
940                   }
941                }
942             }
943             j_start = 0;
944             for (j=S_offd_i[i1]; j < S_offd_i[i1+1]; j++)
945             {
946                j1 = S_offd_j[j];
947                if (assigned_offd[j1] == pass-1)
948                {
949                   j_start = Pext_start[j1];
950                   j_end = j_start+Pext_i[j1+1];
951                   for (k=j_start; k < j_end; k++)
952                   {
953                      k1 = Pext_pass[pass][k];
954                      if (k1 < 0)
955                      {
956                         if (P_marker[-k1-1] != i1)
957                         {
958                            cnt_nz++;
959                            P_diag_i[i1+1]++;
960                            P_marker[-k1-1] = i1;
961                         }
962                      }
963                      else if (P_marker_offd[k1] != i1)
964                      {
965                         cnt_nz_offd++;
966                         P_offd_i[i1+1]++;
967                         P_marker_offd[k1] = i1;
968                      }
969                   }
970                }
971             }
972          }
973 
974          /* Update P_diag_start, P_offd_start with cumulative
975           * nonzero counts over all threads */
976          if(my_thread_num == 0)
977          {   max_num_threads[0] = num_threads; }
978          cnt_nz_offd_per_thread[my_thread_num] = cnt_nz_offd;
979          cnt_nz_per_thread[my_thread_num] = cnt_nz;
980 #ifdef HYPRE_USING_OPENMP
981 #pragma omp barrier
982 #endif
983          if(my_thread_num == 0)
984          {
985             for(i = 1; i < max_num_threads[0]; i++)
986             {
987                cnt_nz_offd_per_thread[i] += cnt_nz_offd_per_thread[i-1];
988                cnt_nz_per_thread[i] += cnt_nz_per_thread[i-1];
989             }
990          }
991 #ifdef HYPRE_USING_OPENMP
992 #pragma omp barrier
993 #endif
994          if(my_thread_num > 0)
995          {
996             /* update this thread's section of P_diag_start and P_offd_start
997              * with the num of nz's counted by previous threads */
998             for (i=thread_start; i < thread_stop; i++)
999             {
1000                i1 = pass_array[i];
1001                P_diag_start[i1] += cnt_nz_per_thread[my_thread_num-1];
1002                P_offd_start[i1] += cnt_nz_offd_per_thread[my_thread_num-1];
1003             }
1004          }
1005          else /* if my_thread_num == 0 */
1006          {
1007             /* Grab the nz count for all threads */
1008             cnt_nz = cnt_nz_per_thread[max_num_threads[0]-1];
1009             cnt_nz_offd = cnt_nz_offd_per_thread[max_num_threads[0]-1];
1010 
1011             /* Updated total nz count */
1012             total_nz += cnt_nz;
1013             total_nz_offd += cnt_nz_offd;
1014 
1015             /* Allocate P_diag_pass and P_offd_pass for all threads */
1016             P_diag_pass[pass] = hypre_CTAlloc(HYPRE_Int,  cnt_nz, HYPRE_MEMORY_HOST);
1017             if (cnt_nz_offd)
1018                P_offd_pass[pass] = hypre_CTAlloc(HYPRE_Int,  cnt_nz_offd, HYPRE_MEMORY_HOST);
1019             else if (num_procs > 1)
1020                P_offd_pass[pass] = NULL;
1021          }
1022 #ifdef HYPRE_USING_OPENMP
1023 #pragma omp barrier
1024 #endif
1025 
1026          /* offset cnt_nz and cnt_nz_offd to point to the starting
1027           * point in P_diag_pass and P_offd_pass for each thread */
1028          if(my_thread_num > 0)
1029          {
1030             cnt_nz = cnt_nz_per_thread[my_thread_num-1];
1031             cnt_nz_offd = cnt_nz_offd_per_thread[my_thread_num-1];
1032          }
1033          else
1034          {
1035             cnt_nz = 0;
1036             cnt_nz_offd = 0;
1037          }
1038 
1039          /* Set P_diag_pass and P_offd_pass */
1040          for (i=thread_start; i < thread_stop; i++)
1041          {
1042             i1 = pass_array[i];
1043             for (j=S_diag_i[i1]; j < S_diag_i[i1+1]; j++)
1044             {
1045                j1 = S_diag_j[j];
1046                if (assigned[j1] == pass-1)
1047                {
1048                   j_start = P_diag_start[j1];
1049                   j_end = j_start+P_diag_i[j1+1];
1050                   for (k=j_start; k < j_end; k++)
1051                   {
1052                      k1 = P_diag_pass[pass-1][k];
1053                      if (P_marker[k1] != -i1-1)
1054                      {
1055                         P_diag_pass[pass][cnt_nz++] = k1;
1056                         P_marker[k1] = -i1-1;
1057                      }
1058                   }
1059                   j_start = P_offd_start[j1];
1060                   j_end = j_start+P_offd_i[j1+1];
1061                   for (k=j_start; k < j_end; k++)
1062                   {
1063                      k1 = P_offd_pass[pass-1][k];
1064                      if (P_marker_offd[k1] != -i1-1)
1065                      {
1066                         P_offd_pass[pass][cnt_nz_offd++] = k1;
1067                         P_marker_offd[k1] = -i1-1;
1068                      }
1069                   }
1070                }
1071             }
1072             for (j=S_offd_i[i1]; j < S_offd_i[i1+1]; j++)
1073             {
1074                j1 = S_offd_j[j];
1075                if (assigned_offd[j1] == pass-1)
1076                {
1077                   j_start = Pext_start[j1];
1078                   j_end = j_start+Pext_i[j1+1];
1079                   for (k=j_start; k < j_end; k++)
1080                   {
1081                      k1 = Pext_pass[pass][k];
1082                      if (k1 < 0)
1083                      {
1084                         if (P_marker[-k1-1] != -i1-1)
1085                         {
1086                            P_diag_pass[pass][cnt_nz++] = -k1-1;
1087                            P_marker[-k1-1] = -i1-1;
1088                         }
1089                      }
1090                      else if (P_marker_offd[k1] != -i1-1)
1091                      {
1092                         P_offd_pass[pass][cnt_nz_offd++] = k1;
1093                         P_marker_offd[k1] = -i1-1;
1094                      }
1095                   }
1096                }
1097             }
1098          }
1099 
1100          hypre_TFree(P_marker, HYPRE_MEMORY_HOST);
1101          if ( (n_coarse_offd) || (new_num_cols_offd  == local_index+1) )
1102          {    hypre_TFree(P_marker_offd, HYPRE_MEMORY_HOST); }
1103 
1104       } /* End parallel region */
1105    }
1106 
1107 
1108    hypre_TFree(loc, HYPRE_MEMORY_HOST);
1109    hypre_TFree(P_ncols, HYPRE_MEMORY_HOST);
1110    hypre_TFree(Pext_send_buffer, HYPRE_MEMORY_HOST);
1111    hypre_TFree(big_temp_pass, HYPRE_MEMORY_HOST);
1112    hypre_TFree(new_recv_vec_start, HYPRE_MEMORY_HOST);
1113    hypre_TFree(cnt_nz_per_thread, HYPRE_MEMORY_HOST);
1114    hypre_TFree(cnt_nz_offd_per_thread, HYPRE_MEMORY_HOST);
1115    hypre_TFree(max_num_threads, HYPRE_MEMORY_HOST);
1116 
1117    P_diag_j = hypre_CTAlloc(HYPRE_Int, total_nz, HYPRE_MEMORY_HOST);
1118    P_diag_data = hypre_CTAlloc(HYPRE_Real, total_nz, HYPRE_MEMORY_HOST);
1119 
1120 
1121    if (total_nz_offd)
1122    {
1123       P_offd_j = hypre_CTAlloc(HYPRE_Int, total_nz_offd, HYPRE_MEMORY_HOST);
1124       P_offd_data = hypre_CTAlloc(HYPRE_Real, total_nz_offd, HYPRE_MEMORY_HOST);
1125    }
1126 
1127    for (i=0; i < n_fine; i++)
1128    {
1129       P_diag_i[i+1] += P_diag_i[i];
1130       P_offd_i[i+1] += P_offd_i[i];
1131    }
1132 
1133 /* determine P for coarse points */
1134 
1135 #ifdef HYPRE_USING_OPENMP
1136 #pragma omp parallel for private(i,i1) HYPRE_SMP_SCHEDULE
1137 #endif
1138    for (i=0; i < n_coarse; i++)
1139    {
1140       i1 = C_array[i];
1141       P_diag_j[P_diag_i[i1]] = fine_to_coarse[i1];
1142       P_diag_data[P_diag_i[i1]] = 1.0;
1143    }
1144 
1145 
1146    if (weight_option) /*if this is set, weights are separated into
1147                         negative and positive offdiagonals and accumulated
1148                         accordingly */
1149    {
1150 
1151       pass_length = pass_pointer[2]-pass_pointer[1];
1152 #ifdef HYPRE_USING_OPENMP
1153 #pragma omp parallel private(thread_start,thread_stop,my_thread_num,num_threads,P_marker,P_marker_offd,i,i1,sum_C_pos,sum_C_neg,sum_N_pos,sum_N_neg,j_start,j_end,j,k1,cnt,j1,cnt_offd,diagonal,alfa,beta)
1154 #endif
1155       {
1156          /* Sparsity structure is now finished.  Next, calculate interpolation
1157           * weights for pass one.  Thread by computing the interpolation
1158           * weights only over each thread's range of rows.  Rows are divided
1159           * up evenly amongst the threads. */
1160 
1161          P_marker = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_HOST);
1162          for (i=0; i < n_fine; i++)
1163          {   P_marker[i] = -1; }
1164          if (num_cols_offd)
1165          {
1166             P_marker_offd = hypre_CTAlloc(HYPRE_Int, num_cols_offd, HYPRE_MEMORY_HOST);
1167             for (i=0; i < num_cols_offd; i++)
1168                P_marker_offd[i] = -1;
1169          }
1170 
1171          /* Compute this thread's range of pass_length */
1172          my_thread_num = hypre_GetThreadNum();
1173          num_threads = hypre_NumActiveThreads();
1174          thread_start = pass_pointer[1] + (pass_length/num_threads)*my_thread_num;
1175          if (my_thread_num == num_threads-1)
1176          {  thread_stop = pass_pointer[1] + pass_length; }
1177          else
1178          {  thread_stop = pass_pointer[1] + (pass_length/num_threads)*(my_thread_num+1); }
1179 
1180          /* determine P for points of pass 1, i.e. neighbors of coarse points */
1181          for (i=thread_start; i < thread_stop; i++)
1182          {
1183             i1 = pass_array[i];
1184             sum_C_pos = 0;
1185             sum_C_neg = 0;
1186             sum_N_pos = 0;
1187             sum_N_neg = 0;
1188             j_start = P_diag_start[i1];
1189             j_end = j_start+P_diag_i[i1+1]-P_diag_i[i1];
1190             for (j=j_start; j < j_end; j++)
1191             {
1192                k1 = P_diag_pass[1][j];
1193                P_marker[C_array[k1]] = i1;
1194             }
1195             cnt = P_diag_i[i1];
1196             for (j=A_diag_i[i1]+1; j < A_diag_i[i1+1]; j++)
1197             {
1198                j1 = A_diag_j[j];
1199                if (CF_marker[j1] != -3 &&
1200                    (num_functions == 1 || dof_func[i1] == dof_func[j1]))
1201                {
1202                   if (A_diag_data[j] < 0)
1203                      sum_N_neg += A_diag_data[j];
1204                   else
1205                      sum_N_pos += A_diag_data[j];
1206                }
1207                if (j1 != -1 && P_marker[j1] == i1)
1208                {
1209                   P_diag_data[cnt] = A_diag_data[j];
1210                   P_diag_j[cnt++] = fine_to_coarse[j1];
1211                   if (A_diag_data[j] < 0)
1212                      sum_C_neg += A_diag_data[j];
1213                   else
1214                      sum_C_pos += A_diag_data[j];
1215                }
1216             }
1217             j_start = P_offd_start[i1];
1218             j_end = j_start+P_offd_i[i1+1]-P_offd_i[i1];
1219             for (j=j_start; j < j_end; j++)
1220             {
1221                k1 = P_offd_pass[1][j];
1222                P_marker_offd[C_array_offd[k1]] = i1;
1223             }
1224             cnt_offd = P_offd_i[i1];
1225             for (j=A_offd_i[i1]; j < A_offd_i[i1+1]; j++)
1226             {
1227                j1 = A_offd_j[j];
1228                if (CF_marker_offd[j1] != -3 &&
1229                    (num_functions == 1 || dof_func[i1] == dof_func_offd[j1]))
1230                {
1231                   if (A_offd_data[j] < 0)
1232                      sum_N_neg += A_offd_data[j];
1233                   else
1234                      sum_N_pos += A_offd_data[j];
1235                }
1236                if (j1 != -1 && P_marker_offd[j1] == i1)
1237                {
1238                   P_offd_data[cnt_offd] = A_offd_data[j];
1239                   P_offd_j[cnt_offd++] = map_S_to_new[j1];
1240                   if (A_offd_data[j] < 0)
1241                      sum_C_neg += A_offd_data[j];
1242                   else
1243                      sum_C_pos += A_offd_data[j];
1244                }
1245             }
1246             diagonal = A_diag_data[A_diag_i[i1]];
1247             if (sum_C_neg*diagonal != 0) alfa = -sum_N_neg/(sum_C_neg*diagonal);
1248             if (sum_C_pos*diagonal != 0) beta = -sum_N_pos/(sum_C_pos*diagonal);
1249             for (j=P_diag_i[i1]; j < cnt; j++)
1250                if (P_diag_data[j] < 0)
1251                   P_diag_data[j] *= alfa;
1252                else
1253                   P_diag_data[j] *= beta;
1254             for (j=P_offd_i[i1]; j < cnt_offd; j++)
1255                if (P_offd_data[j] < 0)
1256                   P_offd_data[j] *= alfa;
1257                else
1258                   P_offd_data[j] *= beta;
1259          }
1260 
1261          hypre_TFree(P_marker, HYPRE_MEMORY_HOST);
1262          if (num_cols_offd)
1263          {    hypre_TFree(P_marker_offd, HYPRE_MEMORY_HOST); }
1264       } /* End Parallel Region */
1265 
1266       old_Pext_send_size = 0;
1267       old_Pext_recv_size = 0;
1268 
1269       if (n_coarse) hypre_TFree(C_array, HYPRE_MEMORY_HOST);
1270       hypre_TFree(C_array_offd, HYPRE_MEMORY_HOST);
1271       hypre_TFree(P_diag_pass[1], HYPRE_MEMORY_HOST);
1272       if (num_procs > 1) hypre_TFree(P_offd_pass[1], HYPRE_MEMORY_HOST);
1273 
1274 
1275       for (pass = 2; pass < num_passes; pass++)
1276       {
1277 
1278          if (num_procs > 1)
1279          {
1280             Pext_send_size = Pext_send_map_start[pass][num_sends];
1281             if (Pext_send_size > old_Pext_send_size)
1282             {
1283                hypre_TFree(Pext_send_data, HYPRE_MEMORY_HOST);
1284                Pext_send_data = hypre_CTAlloc(HYPRE_Real,  Pext_send_size, HYPRE_MEMORY_HOST);
1285             }
1286             old_Pext_send_size = Pext_send_size;
1287 
1288             cnt_offd = 0;
1289             for (i=0; i < num_sends; i++)
1290             {
1291                for (j=send_map_start[i]; j < send_map_start[i+1]; j++)
1292                {
1293                   j1 = send_map_elmt[j];
1294                   if (assigned[j1] == pass-1)
1295                   {
1296                      j_start = P_diag_i[j1];
1297                      j_end = P_diag_i[j1+1];
1298                      for (k=j_start; k < j_end; k++)
1299                      {   Pext_send_data[cnt_offd++] = P_diag_data[k]; }
1300                      j_start = P_offd_i[j1];
1301                      j_end = P_offd_i[j1+1];
1302                      for (k=j_start; k < j_end; k++)
1303                      {  Pext_send_data[cnt_offd++] = P_offd_data[k]; }
1304                   }
1305                }
1306             }
1307 
1308             hypre_ParCSRCommPkgNumSends(tmp_comm_pkg) = num_sends;
1309             hypre_ParCSRCommPkgSendMapStarts(tmp_comm_pkg) =
1310                Pext_send_map_start[pass];
1311             hypre_ParCSRCommPkgNumRecvs(tmp_comm_pkg) = num_recvs;
1312             hypre_ParCSRCommPkgRecvVecStarts(tmp_comm_pkg) =
1313                Pext_recv_vec_start[pass];
1314 
1315             Pext_recv_size = Pext_recv_vec_start[pass][num_recvs];
1316 
1317             if (Pext_recv_size > old_Pext_recv_size)
1318             {
1319                hypre_TFree(Pext_data, HYPRE_MEMORY_HOST);
1320                Pext_data = hypre_CTAlloc(HYPRE_Real,  Pext_recv_size, HYPRE_MEMORY_HOST);
1321             }
1322             old_Pext_recv_size = Pext_recv_size;
1323 
1324             comm_handle = hypre_ParCSRCommHandleCreate (1, tmp_comm_pkg,
1325                                                         Pext_send_data, Pext_data);
1326             hypre_ParCSRCommHandleDestroy(comm_handle);
1327 
1328             hypre_TFree(Pext_send_map_start[pass], HYPRE_MEMORY_HOST);
1329             hypre_TFree(Pext_recv_vec_start[pass], HYPRE_MEMORY_HOST);
1330          }
1331 
1332          pass_length = pass_pointer[pass+1]-pass_pointer[pass];
1333 #ifdef HYPRE_USING_OPENMP
1334 #pragma omp parallel private(thread_start,thread_stop,my_thread_num,num_threads,P_marker,P_marker_offd,i,i1,sum_C_neg,sum_C_pos,sum_N_neg,sum_N_pos,j_start,j_end,cnt,j,k1,cnt_offd,j1,k,alfa,beta,diagonal,C_array,C_array_offd)
1335 #endif
1336          {
1337             /* Sparsity structure is now finished.  Next, calculate interpolation
1338              * weights for passes >= 2.  Thread by computing the interpolation
1339              * weights only over each thread's range of rows.  Rows are divided
1340              * up evenly amongst the threads. */
1341 
1342             P_marker = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_HOST);
1343             for (i=0; i < n_fine; i++)
1344             {   P_marker[i] = -1; }
1345             if (num_cols_offd)
1346             {
1347                P_marker_offd = hypre_CTAlloc(HYPRE_Int, num_cols_offd, HYPRE_MEMORY_HOST);
1348                for (i=0; i < num_cols_offd; i++)
1349                   P_marker_offd[i] = -1;
1350             }
1351 
1352             C_array = NULL;
1353             C_array_offd = NULL;
1354             if (n_coarse)
1355             {   C_array = hypre_CTAlloc(HYPRE_Int,  n_coarse, HYPRE_MEMORY_HOST); }
1356             if (new_num_cols_offd > n_coarse_offd)
1357             {   C_array_offd = hypre_CTAlloc(HYPRE_Int,  new_num_cols_offd, HYPRE_MEMORY_HOST); }
1358             else if (n_coarse_offd)
1359             {   C_array_offd = hypre_CTAlloc(HYPRE_Int,  n_coarse_offd, HYPRE_MEMORY_HOST); }
1360 
1361             /* Compute this thread's range of pass_length */
1362             my_thread_num = hypre_GetThreadNum();
1363             num_threads = hypre_NumActiveThreads();
1364             thread_start = pass_pointer[pass] + (pass_length/num_threads)*my_thread_num;
1365             if (my_thread_num == num_threads-1)
1366             {  thread_stop = pass_pointer[pass] + pass_length; }
1367             else
1368             {  thread_stop = pass_pointer[pass] + (pass_length/num_threads)*(my_thread_num+1); }
1369 
1370             /* Loop over each thread's row-range */
1371             for (i=thread_start; i < thread_stop; i++)
1372             {
1373                i1 = pass_array[i];
1374                sum_C_neg = 0;
1375                sum_C_pos = 0;
1376                sum_N_neg = 0;
1377                sum_N_pos = 0;
1378                j_start = P_diag_start[i1];
1379                j_end = j_start+P_diag_i[i1+1]-P_diag_i[i1];
1380                cnt = P_diag_i[i1];
1381                for (j=j_start; j < j_end; j++)
1382                {
1383                   k1 = P_diag_pass[pass][j];
1384                   C_array[k1] = cnt;
1385                   P_diag_data[cnt] = 0;
1386                   P_diag_j[cnt++] = k1;
1387                }
1388                j_start = P_offd_start[i1];
1389                j_end = j_start+P_offd_i[i1+1]-P_offd_i[i1];
1390                cnt_offd = P_offd_i[i1];
1391                for (j=j_start; j < j_end; j++)
1392                {
1393                   k1 = P_offd_pass[pass][j];
1394                   C_array_offd[k1] = cnt_offd;
1395                   P_offd_data[cnt_offd] = 0;
1396                   P_offd_j[cnt_offd++] = k1;
1397                }
1398                for (j=S_diag_i[i1]; j < S_diag_i[i1+1]; j++)
1399                {
1400                   j1 = S_diag_j[j];
1401                   if (assigned[j1] == pass-1)
1402                      P_marker[j1] = i1;
1403                }
1404                for (j=S_offd_i[i1]; j < S_offd_i[i1+1]; j++)
1405                {
1406                   j1 = S_offd_j[j];
1407                   if (assigned_offd[j1] == pass-1)
1408                      P_marker_offd[j1] = i1;
1409                }
1410                for (j=A_diag_i[i1]+1; j < A_diag_i[i1+1]; j++)
1411                {
1412                   j1 = A_diag_j[j];
1413                   if (P_marker[j1] == i1)
1414                   {
1415                      for (k=P_diag_i[j1]; k < P_diag_i[j1+1]; k++)
1416                      {
1417                         k1 = P_diag_j[k];
1418                         alfa = A_diag_data[j]*P_diag_data[k];
1419                         P_diag_data[C_array[k1]] += alfa;
1420                         if (alfa < 0)
1421                         {
1422                            sum_C_neg += alfa;
1423                            sum_N_neg += alfa;
1424                         }
1425                         else
1426                         {
1427                            sum_C_pos += alfa;
1428                            sum_N_pos += alfa;
1429                         }
1430                      }
1431                      for (k=P_offd_i[j1]; k < P_offd_i[j1+1]; k++)
1432                      {
1433                         k1 = P_offd_j[k];
1434                         alfa = A_diag_data[j]*P_offd_data[k];
1435                         P_offd_data[C_array_offd[k1]] += alfa;
1436                         if (alfa < 0)
1437                         {
1438                            sum_C_neg += alfa;
1439                            sum_N_neg += alfa;
1440                         }
1441                         else
1442                         {
1443                            sum_C_pos += alfa;
1444                            sum_N_pos += alfa;
1445                         }
1446                      }
1447                   }
1448                   else
1449                   {
1450                      if (CF_marker[j1] != -3 &&
1451                          (num_functions == 1 || dof_func[i1] == dof_func[j1]))
1452                      {
1453                         if (A_diag_data[j] < 0)
1454                            sum_N_neg += A_diag_data[j];
1455                         else
1456                            sum_N_pos += A_diag_data[j];
1457                      }
1458                   }
1459                }
1460                for (j=A_offd_i[i1]; j < A_offd_i[i1+1]; j++)
1461                {
1462                   j1 = A_offd_j[j];
1463 
1464                   if (j1 > -1 && P_marker_offd[j1] == i1)
1465                   {
1466                      j_start = Pext_start[j1];
1467                      j_end = j_start+Pext_i[j1+1];
1468                      for (k=j_start; k < j_end; k++)
1469                      {
1470                         k1 = Pext_pass[pass][k];
1471                         alfa = A_offd_data[j]*Pext_data[k];
1472                         if (k1 < 0)
1473                            P_diag_data[C_array[-k1-1]] += alfa;
1474                         else
1475                            P_offd_data[C_array_offd[k1]] += alfa;
1476                         if (alfa < 0)
1477                         {
1478                            sum_C_neg += alfa;
1479                            sum_N_neg += alfa;
1480                         }
1481                         else
1482                         {
1483                            sum_C_pos += alfa;
1484                            sum_N_pos += alfa;
1485                         }
1486                      }
1487                   }
1488                   else
1489                   {
1490                      if (CF_marker_offd[j1] != -3 &&
1491                          (num_functions == 1 || dof_func_offd[j1] == dof_func[i1]))
1492                      {
1493                         if ( A_offd_data[j] < 0)
1494                            sum_N_neg += A_offd_data[j];
1495                         else
1496                            sum_N_pos += A_offd_data[j];
1497                      }
1498                   }
1499                }
1500                diagonal = A_diag_data[A_diag_i[i1]];
1501                if (sum_C_neg*diagonal != 0) alfa = -sum_N_neg/(sum_C_neg*diagonal);
1502                if (sum_C_pos*diagonal != 0) beta = -sum_N_pos/(sum_C_pos*diagonal);
1503 
1504                for (j=P_diag_i[i1]; j < P_diag_i[i1+1]; j++)
1505                   if (P_diag_data[j] < 0)
1506                      P_diag_data[j] *= alfa;
1507                   else
1508                      P_diag_data[j] *= beta;
1509                for (j=P_offd_i[i1]; j < P_offd_i[i1+1]; j++)
1510                   if (P_offd_data[j] < 0)
1511                      P_offd_data[j] *= alfa;
1512                   else
1513                      P_offd_data[j] *= beta;
1514             }
1515 
1516             hypre_TFree(C_array, HYPRE_MEMORY_HOST);
1517             hypre_TFree(C_array_offd, HYPRE_MEMORY_HOST);
1518             hypre_TFree(P_marker, HYPRE_MEMORY_HOST);
1519             if (num_cols_offd)
1520             {   hypre_TFree(P_marker_offd, HYPRE_MEMORY_HOST); }
1521 
1522          } /* End OMP Parallel Section */
1523 
1524          hypre_TFree(P_diag_pass[pass], HYPRE_MEMORY_HOST);
1525          if (num_procs > 1)
1526          {
1527             hypre_TFree(P_offd_pass[pass], HYPRE_MEMORY_HOST);
1528             hypre_TFree(Pext_pass[pass], HYPRE_MEMORY_HOST);
1529          }
1530       } /* End num_passes for-loop */
1531    }
1532    else /* no distinction between positive and negative offdiagonal element */
1533    {
1534 
1535       pass_length = pass_pointer[2]-pass_pointer[1];
1536 #ifdef HYPRE_USING_OPENMP
1537 #pragma omp parallel private(thread_start,thread_stop,my_thread_num,num_threads,k,k1,i,i1,j,j1,sum_C,sum_N,j_start,j_end,cnt,tmp_marker,tmp_marker_offd,cnt_offd,diagonal,alfa)
1538 #endif
1539       {
1540          /* Sparsity structure is now finished.  Next, calculate interpolation
1541           * weights for pass one.  Thread by computing the interpolation
1542           * weights only over each thread's range of rows.  Rows are divided
1543           * up evenly amongst the threads. */
1544 
1545          /* Initialize thread-wise variables */
1546          tmp_marker = NULL;
1547          if (n_fine)
1548          {   tmp_marker = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_HOST); }
1549          tmp_marker_offd = NULL;
1550          if (num_cols_offd)
1551          {   tmp_marker_offd = hypre_CTAlloc(HYPRE_Int, num_cols_offd, HYPRE_MEMORY_HOST); }
1552          for (i=0; i < n_fine; i++)
1553          {   tmp_marker[i] = -1; }
1554          for (i=0; i < num_cols_offd; i++)
1555          {   tmp_marker_offd[i] = -1; }
1556 
1557          /* Compute this thread's range of pass_length */
1558          my_thread_num = hypre_GetThreadNum();
1559          num_threads = hypre_NumActiveThreads();
1560          thread_start = pass_pointer[1] + (pass_length/num_threads)*my_thread_num;
1561          if (my_thread_num == num_threads-1)
1562          {  thread_stop = pass_pointer[1] + pass_length; }
1563          else
1564          {  thread_stop = pass_pointer[1] + (pass_length/num_threads)*(my_thread_num+1); }
1565 
1566          /* determine P for points of pass 1, i.e. neighbors of coarse points */
1567          for (i=thread_start; i < thread_stop; i++)
1568          {
1569             i1 = pass_array[i];
1570             sum_C = 0;
1571             sum_N = 0;
1572             j_start = P_diag_start[i1];
1573             j_end = j_start+P_diag_i[i1+1]-P_diag_i[i1];
1574             for (j=j_start; j < j_end; j++)
1575             {
1576                k1 = P_diag_pass[1][j];
1577                tmp_marker[C_array[k1]] = i1;
1578             }
1579             cnt = P_diag_i[i1];
1580             for (j=A_diag_i[i1]+1; j < A_diag_i[i1+1]; j++)
1581             {
1582                j1 = A_diag_j[j];
1583                if (CF_marker[j1] != -3 &&
1584                    (num_functions == 1 || dof_func[i1] == dof_func[j1]))
1585                   sum_N += A_diag_data[j];
1586                if (j1 != -1 && tmp_marker[j1] == i1)
1587                {
1588                   P_diag_data[cnt] = A_diag_data[j];
1589                   P_diag_j[cnt++] = fine_to_coarse[j1];
1590                   sum_C += A_diag_data[j];
1591                }
1592             }
1593             j_start = P_offd_start[i1];
1594             j_end = j_start+P_offd_i[i1+1]-P_offd_i[i1];
1595             for (j=j_start; j < j_end; j++)
1596             {
1597                k1 = P_offd_pass[1][j];
1598                tmp_marker_offd[C_array_offd[k1]] = i1;
1599             }
1600             cnt_offd = P_offd_i[i1];
1601             for (j=A_offd_i[i1]; j < A_offd_i[i1+1]; j++)
1602             {
1603                j1 = A_offd_j[j];
1604                if (CF_marker_offd[j1] != -3 &&
1605                    (num_functions == 1 || dof_func[i1] == dof_func_offd[j1]))
1606                   sum_N += A_offd_data[j];
1607                if (j1 != -1 && tmp_marker_offd[j1] == i1)
1608                {
1609                   P_offd_data[cnt_offd] = A_offd_data[j];
1610                   P_offd_j[cnt_offd++] = map_S_to_new[j1];
1611                   sum_C += A_offd_data[j];
1612                }
1613             }
1614             diagonal = A_diag_data[A_diag_i[i1]];
1615             if (sum_C*diagonal != 0) alfa = -sum_N/(sum_C*diagonal);
1616             for (j=P_diag_i[i1]; j < cnt; j++)
1617                P_diag_data[j] *= alfa;
1618             for (j=P_offd_i[i1]; j < cnt_offd; j++)
1619                P_offd_data[j] *= alfa;
1620          }
1621          hypre_TFree(tmp_marker, HYPRE_MEMORY_HOST);
1622          hypre_TFree(tmp_marker_offd, HYPRE_MEMORY_HOST);
1623       } /* end OMP parallel region */
1624 
1625       old_Pext_send_size = 0;
1626       old_Pext_recv_size = 0;
1627 
1628       if (n_coarse) hypre_TFree(C_array, HYPRE_MEMORY_HOST);
1629       hypre_TFree(C_array_offd, HYPRE_MEMORY_HOST);
1630       hypre_TFree(P_diag_pass[1], HYPRE_MEMORY_HOST);
1631       if (num_procs > 1) hypre_TFree(P_offd_pass[1], HYPRE_MEMORY_HOST);
1632 
1633       for (pass = 2; pass < num_passes; pass++)
1634       {
1635 
1636          if (num_procs > 1)
1637          {
1638             Pext_send_size = Pext_send_map_start[pass][num_sends];
1639             if (Pext_send_size > old_Pext_send_size)
1640             {
1641                hypre_TFree(Pext_send_data, HYPRE_MEMORY_HOST);
1642                Pext_send_data = hypre_CTAlloc(HYPRE_Real,  Pext_send_size, HYPRE_MEMORY_HOST);
1643             }
1644             old_Pext_send_size = Pext_send_size;
1645 
1646             cnt_offd = 0;
1647             for (i=0; i < num_sends; i++)
1648             {
1649                for (j=send_map_start[i]; j < send_map_start[i+1]; j++)
1650                {
1651                   j1 = send_map_elmt[j];
1652                   if (assigned[j1] == pass-1)
1653                   {
1654                      j_start = P_diag_i[j1];
1655                      j_end = P_diag_i[j1+1];
1656                      for (k=j_start; k < j_end; k++)
1657                      {
1658                         Pext_send_data[cnt_offd++] = P_diag_data[k];
1659                      }
1660                      j_start = P_offd_i[j1];
1661                      j_end = P_offd_i[j1+1];
1662                      for (k=j_start; k < j_end; k++)
1663                      {
1664                         Pext_send_data[cnt_offd++] = P_offd_data[k];
1665                      }
1666                   }
1667                }
1668             }
1669 
1670             hypre_ParCSRCommPkgNumSends(tmp_comm_pkg) = num_sends;
1671             hypre_ParCSRCommPkgSendMapStarts(tmp_comm_pkg) =
1672                Pext_send_map_start[pass];
1673             hypre_ParCSRCommPkgNumRecvs(tmp_comm_pkg) = num_recvs;
1674             hypre_ParCSRCommPkgRecvVecStarts(tmp_comm_pkg) =
1675                Pext_recv_vec_start[pass];
1676 
1677             Pext_recv_size = Pext_recv_vec_start[pass][num_recvs];
1678 
1679             if (Pext_recv_size > old_Pext_recv_size)
1680             {
1681                hypre_TFree(Pext_data, HYPRE_MEMORY_HOST);
1682                Pext_data = hypre_CTAlloc(HYPRE_Real,  Pext_recv_size, HYPRE_MEMORY_HOST);
1683             }
1684             old_Pext_recv_size = Pext_recv_size;
1685 
1686             comm_handle = hypre_ParCSRCommHandleCreate (1, tmp_comm_pkg,
1687                                                         Pext_send_data, Pext_data);
1688             hypre_ParCSRCommHandleDestroy(comm_handle);
1689 
1690             hypre_TFree(Pext_send_map_start[pass], HYPRE_MEMORY_HOST);
1691             hypre_TFree(Pext_recv_vec_start[pass], HYPRE_MEMORY_HOST);
1692          }
1693 
1694          pass_length = pass_pointer[pass+1]-pass_pointer[pass];
1695 #ifdef HYPRE_USING_OPENMP
1696 #pragma omp parallel private(thread_start,thread_stop,my_thread_num,num_threads,k,k1,i,i1,j,j1,sum_C,sum_N,j_start,j_end,cnt,tmp_marker,tmp_marker_offd,cnt_offd,diagonal,alfa,tmp_array,tmp_array_offd)
1697 #endif
1698          {
1699             /* Sparsity structure is now finished.  Next, calculate interpolation
1700              * weights for passes >= 2.  Thread by computing the interpolation
1701              * weights only over each thread's range of rows.  Rows are divided
1702              * up evenly amongst the threads. */
1703 
1704             /* Initialize thread-wise variables */
1705             tmp_marker = NULL;
1706             if (n_fine)
1707             {    tmp_marker = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_HOST); }
1708             tmp_marker_offd = NULL;
1709             if (num_cols_offd)
1710             {    tmp_marker_offd = hypre_CTAlloc(HYPRE_Int, num_cols_offd, HYPRE_MEMORY_HOST); }
1711             tmp_array = NULL;
1712             if (n_coarse)
1713             {    tmp_array = hypre_CTAlloc(HYPRE_Int, n_coarse, HYPRE_MEMORY_HOST); }
1714             tmp_array_offd = NULL;
1715             if (new_num_cols_offd > n_coarse_offd)
1716             {    tmp_array_offd = hypre_CTAlloc(HYPRE_Int, new_num_cols_offd, HYPRE_MEMORY_HOST); }
1717             else
1718             {    tmp_array_offd = hypre_CTAlloc(HYPRE_Int, n_coarse_offd, HYPRE_MEMORY_HOST);}
1719             for (i=0; i < n_fine; i++)
1720             {    tmp_marker[i] = -1; }
1721             for (i=0; i < num_cols_offd; i++)
1722             {    tmp_marker_offd[i] = -1; }
1723 
1724             /* Compute this thread's range of pass_length */
1725             my_thread_num = hypre_GetThreadNum();
1726             num_threads = hypre_NumActiveThreads();
1727             thread_start = pass_pointer[pass] + (pass_length/num_threads)*my_thread_num;
1728             if (my_thread_num == num_threads-1)
1729             {  thread_stop = pass_pointer[pass] + pass_length; }
1730             else
1731             {  thread_stop = pass_pointer[pass] + (pass_length/num_threads)*(my_thread_num+1); }
1732 
1733             for (i=thread_start; i < thread_stop; i++)
1734             {
1735                i1 = pass_array[i];
1736                sum_C = 0;
1737                sum_N = 0;
1738                j_start = P_diag_start[i1];
1739                j_end = j_start+P_diag_i[i1+1]-P_diag_i[i1];
1740                cnt = P_diag_i[i1];
1741                for (j=j_start; j < j_end; j++)
1742                {
1743                   k1 = P_diag_pass[pass][j];
1744                   tmp_array[k1] = cnt;
1745                   P_diag_data[cnt] = 0;
1746                   P_diag_j[cnt++] = k1;
1747                }
1748                j_start = P_offd_start[i1];
1749                j_end = j_start+P_offd_i[i1+1]-P_offd_i[i1];
1750                cnt_offd = P_offd_i[i1];
1751                for (j=j_start; j < j_end; j++)
1752                {
1753                   k1 = P_offd_pass[pass][j];
1754                   tmp_array_offd[k1] = cnt_offd;
1755                   P_offd_data[cnt_offd] = 0;
1756                   P_offd_j[cnt_offd++] = k1;
1757                }
1758                for (j=S_diag_i[i1]; j < S_diag_i[i1+1]; j++)
1759                {
1760                   j1 = S_diag_j[j];
1761                   if (assigned[j1] == pass-1)
1762                      tmp_marker[j1] = i1;
1763                }
1764                for (j=S_offd_i[i1]; j < S_offd_i[i1+1]; j++)
1765                {
1766                   j1 = S_offd_j[j];
1767                   if (assigned_offd[j1] == pass-1)
1768                      tmp_marker_offd[j1] = i1;
1769                }
1770                for (j=A_diag_i[i1]+1; j < A_diag_i[i1+1]; j++)
1771                {
1772                   j1 = A_diag_j[j];
1773                   if (tmp_marker[j1] == i1)
1774                   {
1775                      for (k=P_diag_i[j1]; k < P_diag_i[j1+1]; k++)
1776                      {
1777                         k1 = P_diag_j[k];
1778                         alfa = A_diag_data[j]*P_diag_data[k];
1779                         P_diag_data[tmp_array[k1]] += alfa;
1780                         sum_C += alfa;
1781                         sum_N += alfa;
1782                      }
1783                      for (k=P_offd_i[j1]; k < P_offd_i[j1+1]; k++)
1784                      {
1785                         k1 = P_offd_j[k];
1786                         alfa = A_diag_data[j]*P_offd_data[k];
1787                         P_offd_data[tmp_array_offd[k1]] += alfa;
1788                         sum_C += alfa;
1789                         sum_N += alfa;
1790                      }
1791                   }
1792                   else
1793                   {
1794                      if (CF_marker[j1] != -3 &&
1795                          (num_functions == 1 || dof_func[i1] == dof_func[j1]))
1796                         sum_N += A_diag_data[j];
1797                   }
1798                }
1799                for (j=A_offd_i[i1]; j < A_offd_i[i1+1]; j++)
1800                {
1801                   j1 = A_offd_j[j];
1802 
1803                   if (j1 > -1 && tmp_marker_offd[j1] == i1)
1804                   {
1805                      j_start = Pext_start[j1];
1806                      j_end = j_start+Pext_i[j1+1];
1807                      for (k=j_start; k < j_end; k++)
1808                      {
1809                         k1 = Pext_pass[pass][k];
1810                         alfa = A_offd_data[j]*Pext_data[k];
1811                         if (k1 < 0)
1812                            P_diag_data[tmp_array[-k1-1]] += alfa;
1813                         else
1814                            P_offd_data[tmp_array_offd[k1]] += alfa;
1815                         sum_C += alfa;
1816                         sum_N += alfa;
1817                      }
1818                   }
1819                   else
1820                   {
1821                      if (CF_marker_offd[j1] != -3 &&
1822                          (num_functions == 1 || dof_func_offd[j1] == dof_func[i1]))
1823                         sum_N += A_offd_data[j];
1824                   }
1825                }
1826                diagonal = A_diag_data[A_diag_i[i1]];
1827                if (sum_C*diagonal != 0.0) alfa = -sum_N/(sum_C*diagonal);
1828 
1829                for (j=P_diag_i[i1]; j < P_diag_i[i1+1]; j++)
1830                   P_diag_data[j] *= alfa;
1831                for (j=P_offd_i[i1]; j < P_offd_i[i1+1]; j++)
1832                   P_offd_data[j] *= alfa;
1833             }
1834             hypre_TFree(tmp_marker, HYPRE_MEMORY_HOST);
1835             hypre_TFree(tmp_marker_offd, HYPRE_MEMORY_HOST);
1836             hypre_TFree(tmp_array, HYPRE_MEMORY_HOST);
1837             hypre_TFree(tmp_array_offd, HYPRE_MEMORY_HOST);
1838          } /* End OMP Parallel Section */
1839 
1840          hypre_TFree(P_diag_pass[pass], HYPRE_MEMORY_HOST);
1841          if (num_procs > 1)
1842          {
1843             hypre_TFree(P_offd_pass[pass], HYPRE_MEMORY_HOST);
1844             hypre_TFree(Pext_pass[pass], HYPRE_MEMORY_HOST);
1845          }
1846       }
1847    }
1848 
1849    hypre_TFree(CF_marker_offd, HYPRE_MEMORY_HOST);
1850    hypre_TFree(Pext_send_map_start, HYPRE_MEMORY_HOST);
1851    hypre_TFree(Pext_recv_vec_start, HYPRE_MEMORY_HOST);
1852    hypre_TFree(dof_func_offd, HYPRE_MEMORY_HOST);
1853    hypre_TFree(Pext_send_data, HYPRE_MEMORY_HOST);
1854    hypre_TFree(Pext_data, HYPRE_MEMORY_HOST);
1855    hypre_TFree(P_diag_pass, HYPRE_MEMORY_HOST);
1856    hypre_TFree(P_offd_pass, HYPRE_MEMORY_HOST);
1857    hypre_TFree(Pext_pass, HYPRE_MEMORY_HOST);
1858    hypre_TFree(P_diag_start, HYPRE_MEMORY_HOST);
1859    hypre_TFree(P_offd_start, HYPRE_MEMORY_HOST);
1860    hypre_TFree(Pext_start, HYPRE_MEMORY_HOST);
1861    hypre_TFree(Pext_i, HYPRE_MEMORY_HOST);
1862    hypre_TFree(fine_to_coarse, HYPRE_MEMORY_HOST);
1863    hypre_TFree(assigned, HYPRE_MEMORY_HOST);
1864    hypre_TFree(assigned_offd, HYPRE_MEMORY_HOST);
1865    hypre_TFree(pass_pointer, HYPRE_MEMORY_HOST);
1866    hypre_TFree(pass_array, HYPRE_MEMORY_HOST);
1867    hypre_TFree(map_S_to_new, HYPRE_MEMORY_HOST);
1868    if (num_procs > 1) hypre_TFree(tmp_comm_pkg, HYPRE_MEMORY_HOST);
1869 
1870    P = hypre_ParCSRMatrixCreate(comm,
1871                                 hypre_ParCSRMatrixGlobalNumRows(A),
1872                                 total_global_cpts,
1873                                 hypre_ParCSRMatrixColStarts(A),
1874                                 num_cpts_global,
1875                                 0,
1876                                 P_diag_i[n_fine],
1877                                 P_offd_i[n_fine]);
1878    P_diag = hypre_ParCSRMatrixDiag(P);
1879    hypre_CSRMatrixData(P_diag) = P_diag_data;
1880    hypre_CSRMatrixI(P_diag) = P_diag_i;
1881    hypre_CSRMatrixJ(P_diag) = P_diag_j;
1882    P_offd = hypre_ParCSRMatrixOffd(P);
1883    hypre_CSRMatrixData(P_offd) = P_offd_data;
1884    hypre_CSRMatrixI(P_offd) = P_offd_i;
1885    hypre_CSRMatrixJ(P_offd) = P_offd_j;
1886 
1887    /* Compress P, removing coefficients smaller than trunc_factor * Max
1888       and/or keep yat most <P_max_elmts> per row absolutely maximal coefficients */
1889 
1890    if (trunc_factor != 0.0 || P_max_elmts != 0)
1891    {
1892       hypre_BoomerAMGInterpTruncation(P, trunc_factor, P_max_elmts);
1893       P_diag_data = hypre_CSRMatrixData(P_diag);
1894       P_diag_i = hypre_CSRMatrixI(P_diag);
1895       P_diag_j = hypre_CSRMatrixJ(P_diag);
1896       P_offd_data = hypre_CSRMatrixData(P_offd);
1897       P_offd_i = hypre_CSRMatrixI(P_offd);
1898       P_offd_j = hypre_CSRMatrixJ(P_offd);
1899    }
1900    P_offd_size = P_offd_i[n_fine];
1901 
1902    num_cols_offd_P = 0;
1903    if (P_offd_size)
1904    {
1905       if (new_num_cols_offd > num_cols_offd)
1906       {   P_marker_offd = hypre_CTAlloc(HYPRE_Int, new_num_cols_offd, HYPRE_MEMORY_HOST); }
1907       else
1908       {   P_marker_offd = hypre_CTAlloc(HYPRE_Int, num_cols_offd, HYPRE_MEMORY_HOST); }
1909 #ifdef HYPRE_USING_OPENMP
1910 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
1911 #endif
1912       for (i=0; i < new_num_cols_offd; i++)
1913       {   P_marker_offd[i] = 0; }
1914 
1915       num_cols_offd_P = 0;
1916       for (i=0; i < P_offd_size; i++)
1917       {
1918          index = P_offd_j[i];
1919          if (!P_marker_offd[index])
1920          {
1921             num_cols_offd_P++;
1922             P_marker_offd[index] = 1;
1923          }
1924       }
1925 
1926       col_map_offd_P = hypre_CTAlloc(HYPRE_BigInt, num_cols_offd_P, HYPRE_MEMORY_HOST);
1927       permute = hypre_CTAlloc(HYPRE_Int,  new_counter[num_passes-1], HYPRE_MEMORY_HOST);
1928       big_permute = hypre_CTAlloc(HYPRE_BigInt,  new_counter[num_passes-1], HYPRE_MEMORY_HOST);
1929 
1930 #ifdef HYPRE_USING_OPENMP
1931 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
1932 #endif
1933       for (i=0; i < new_counter[num_passes-1]; i++)
1934          big_permute[i] = -1;
1935 
1936       cnt = 0;
1937       for (i=0; i < num_passes-1; i++)
1938       {
1939          for (j=new_counter[i]; j < new_counter[i+1]; j++)
1940          {
1941             if (P_marker_offd[j])
1942             {
1943                col_map_offd_P[cnt] = new_elmts[i][j-(HYPRE_BigInt)new_counter[i]];
1944                big_permute[j] = col_map_offd_P[cnt++];
1945             }
1946          }
1947       }
1948 
1949       hypre_BigQsort0(col_map_offd_P,0,num_cols_offd_P-1);
1950 
1951 #ifdef HYPRE_USING_OPENMP
1952 #pragma omp parallel for private(i,big_k1) HYPRE_SMP_SCHEDULE
1953 #endif
1954       for (i=0; i < new_counter[num_passes-1]; i++)
1955       {
1956          big_k1 = big_permute[i];
1957          if (big_k1 != -1)
1958             permute[i] = hypre_BigBinarySearch(col_map_offd_P,big_k1,num_cols_offd_P);
1959       }
1960 
1961 #ifdef HYPRE_USING_OPENMP
1962 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
1963 #endif
1964       for (i=0; i < P_offd_size; i++)
1965       {   P_offd_j[i] = permute[P_offd_j[i]]; }
1966 
1967       hypre_TFree(P_marker_offd, HYPRE_MEMORY_HOST);
1968    }
1969    if (num_procs > 1)
1970    {
1971       for (i=0; i < num_passes-1; i++)
1972          hypre_TFree(new_elmts[i], HYPRE_MEMORY_HOST);
1973    }
1974    hypre_TFree(permute, HYPRE_MEMORY_HOST);
1975    hypre_TFree(big_permute, HYPRE_MEMORY_HOST);
1976    hypre_TFree(new_elmts, HYPRE_MEMORY_HOST);
1977    hypre_TFree(new_counter, HYPRE_MEMORY_HOST);
1978 
1979    if (num_cols_offd_P)
1980    {
1981       hypre_ParCSRMatrixColMapOffd(P) = col_map_offd_P;
1982       hypre_CSRMatrixNumCols(P_offd) = num_cols_offd_P;
1983    }
1984 
1985    if (n_SF)
1986    {
1987 #ifdef HYPRE_USING_OPENMP
1988 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
1989 #endif
1990       for (i=0; i < n_fine; i++)
1991          if (CF_marker[i] == -3) CF_marker[i] = -1;
1992    }
1993 
1994    if (num_procs > 1)
1995    {
1996       hypre_MatvecCommPkgCreate(P);
1997    }
1998 
1999    *P_ptr = P;
2000 
2001    /* wall_time = hypre_MPI_Wtime() - wall_time;
2002       hypre_printf("TOTAL TIME  %1.2e \n",wall_time); */
2003 
2004    /*-----------------------------------------------------------------------
2005     *  Build and return dof_func array for coarse grid.
2006     *-----------------------------------------------------------------------*/
2007 
2008    /*-----------------------------------------------------------------------
2009     *  Free mapping vector and marker array.
2010     *-----------------------------------------------------------------------*/
2011 
2012 #ifdef HYPRE_PROFILE
2013    hypre_profile_times[HYPRE_TIMER_ID_MULTIPASS_INTERP] += hypre_MPI_Wtime();
2014 #endif
2015 
2016    return(0);
2017 }
2018 
2019 HYPRE_Int
hypre_BoomerAMGBuildMultipass(hypre_ParCSRMatrix * A,HYPRE_Int * CF_marker,hypre_ParCSRMatrix * S,HYPRE_BigInt * num_cpts_global,HYPRE_Int num_functions,HYPRE_Int * dof_func,HYPRE_Int debug_flag,HYPRE_Real trunc_factor,HYPRE_Int P_max_elmts,HYPRE_Int weight_option,hypre_ParCSRMatrix ** P_ptr)2020 hypre_BoomerAMGBuildMultipass( hypre_ParCSRMatrix  *A,
2021                                HYPRE_Int           *CF_marker,
2022                                hypre_ParCSRMatrix  *S,
2023                                HYPRE_BigInt        *num_cpts_global,
2024                                HYPRE_Int            num_functions,
2025                                HYPRE_Int           *dof_func,
2026                                HYPRE_Int            debug_flag,
2027                                HYPRE_Real           trunc_factor,
2028                                HYPRE_Int            P_max_elmts,
2029                                HYPRE_Int            weight_option,
2030                                hypre_ParCSRMatrix **P_ptr )
2031 {
2032 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
2033    hypre_GpuProfilingPushRange("MultipassInterp");
2034 #endif
2035 
2036    HYPRE_Int ierr = 0;
2037 
2038 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
2039    HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy2( hypre_ParCSRMatrixMemoryLocation(A),
2040                                                       hypre_ParCSRMatrixMemoryLocation(S) );
2041    if (exec == HYPRE_EXEC_DEVICE)
2042    {
2043       /* Notice: call the mod version on GPUs */
2044       ierr = hypre_BoomerAMGBuildModMultipassDevice( A, CF_marker, S, num_cpts_global,
2045                                                      trunc_factor, P_max_elmts, 9,
2046                                                      num_functions, dof_func,
2047                                                      P_ptr );
2048    }
2049    else
2050 #endif
2051    {
2052       ierr = hypre_BoomerAMGBuildMultipassHost( A, CF_marker, S, num_cpts_global,
2053                                                 num_functions, dof_func, debug_flag,
2054                                                 trunc_factor, P_max_elmts, weight_option,
2055                                                 P_ptr );
2056    }
2057 
2058 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
2059    hypre_GpuProfilingPopRange();
2060 #endif
2061 
2062    return ierr;
2063 }
2064 
2065