1 /******************************************************************************
2  * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other
3  * HYPRE Project Developers. See the top-level COPYRIGHT file for details.
4  *
5  * SPDX-License-Identifier: (Apache-2.0 OR MIT)
6  ******************************************************************************/
7 
8 #include "_hypre_parcsr_ls.h"
9 
10 /*---------------------------------------------------------------------------
11  * hypre_BoomerAMGBuildInterp
12  *--------------------------------------------------------------------------*/
13 
14 HYPRE_Int
hypre_BoomerAMGBuildInterp(hypre_ParCSRMatrix * A,HYPRE_Int * CF_marker,hypre_ParCSRMatrix * S,HYPRE_BigInt * num_cpts_global,HYPRE_Int num_functions,HYPRE_Int * dof_func,HYPRE_Int debug_flag,HYPRE_Real trunc_factor,HYPRE_Int max_elmts,hypre_ParCSRMatrix ** P_ptr)15 hypre_BoomerAMGBuildInterp( hypre_ParCSRMatrix   *A,
16                          HYPRE_Int               *CF_marker,
17                          hypre_ParCSRMatrix      *S,
18                          HYPRE_BigInt            *num_cpts_global,
19                          HYPRE_Int                num_functions,
20                          HYPRE_Int               *dof_func,
21                          HYPRE_Int                debug_flag,
22                          HYPRE_Real               trunc_factor,
23                          HYPRE_Int                max_elmts,
24                          hypre_ParCSRMatrix     **P_ptr)
25 {
26    MPI_Comm           comm = hypre_ParCSRMatrixComm(A);
27    hypre_ParCSRCommPkg     *comm_pkg = hypre_ParCSRMatrixCommPkg(A);
28    hypre_ParCSRCommHandle  *comm_handle;
29 
30    hypre_CSRMatrix   *A_diag = hypre_ParCSRMatrixDiag(A);
31    HYPRE_Real        *A_diag_data = hypre_CSRMatrixData(A_diag);
32    HYPRE_Int         *A_diag_i = hypre_CSRMatrixI(A_diag);
33    HYPRE_Int         *A_diag_j = hypre_CSRMatrixJ(A_diag);
34 
35    hypre_CSRMatrix   *A_offd = hypre_ParCSRMatrixOffd(A);
36    HYPRE_Real        *A_offd_data = hypre_CSRMatrixData(A_offd);
37    HYPRE_Int         *A_offd_i = hypre_CSRMatrixI(A_offd);
38    HYPRE_Int         *A_offd_j = hypre_CSRMatrixJ(A_offd);
39    HYPRE_Int          num_cols_A_offd = hypre_CSRMatrixNumCols(A_offd);
40    HYPRE_BigInt      *col_map_offd = hypre_ParCSRMatrixColMapOffd(A);
41 
42    hypre_CSRMatrix   *S_diag = hypre_ParCSRMatrixDiag(S);
43    HYPRE_Int         *S_diag_i = hypre_CSRMatrixI(S_diag);
44    HYPRE_Int         *S_diag_j = hypre_CSRMatrixJ(S_diag);
45 
46    hypre_CSRMatrix   *S_offd = hypre_ParCSRMatrixOffd(S);
47    HYPRE_Int         *S_offd_i = hypre_CSRMatrixI(S_offd);
48    HYPRE_Int         *S_offd_j = hypre_CSRMatrixJ(S_offd);
49 
50    hypre_ParCSRMatrix *P;
51    HYPRE_BigInt       *col_map_offd_P;
52    HYPRE_Int          *tmp_map_offd = NULL;
53 
54    HYPRE_Int         *CF_marker_offd = NULL;
55    HYPRE_Int         *dof_func_offd = NULL;
56 
57    hypre_CSRMatrix   *A_ext;
58 
59    HYPRE_Real        *A_ext_data = NULL;
60    HYPRE_Int         *A_ext_i = NULL;
61    HYPRE_BigInt      *A_ext_j = NULL;
62 
63    hypre_CSRMatrix   *P_diag;
64    hypre_CSRMatrix   *P_offd;
65 
66    HYPRE_Real        *P_diag_data;
67    HYPRE_Int         *P_diag_i;
68    HYPRE_Int         *P_diag_j;
69    HYPRE_Real        *P_offd_data;
70    HYPRE_Int         *P_offd_i;
71    HYPRE_Int         *P_offd_j;
72 
73    HYPRE_Int          P_diag_size, P_offd_size;
74 
75    HYPRE_Int         *P_marker, *P_marker_offd;
76 
77    HYPRE_Int          jj_counter,jj_counter_offd;
78    HYPRE_Int         *jj_count, *jj_count_offd;
79    HYPRE_Int          jj_begin_row,jj_begin_row_offd;
80    HYPRE_Int          jj_end_row,jj_end_row_offd;
81 
82    HYPRE_Int          start_indexing = 0; /* start indexing for P_data at 0 */
83 
84    HYPRE_Int          n_fine = hypre_CSRMatrixNumRows(A_diag);
85 
86    HYPRE_Int          strong_f_marker;
87 
88    HYPRE_Int         *fine_to_coarse;
89    //HYPRE_Int         *fine_to_coarse_offd;
90    HYPRE_Int         *coarse_counter;
91    HYPRE_Int          coarse_shift;
92    HYPRE_BigInt       total_global_cpts;
93    //HYPRE_BigInt       my_first_cpt;
94    HYPRE_Int          num_cols_P_offd;
95 
96    HYPRE_Int          i,i1,i2;
97    HYPRE_Int          j,jl,jj,jj1;
98    HYPRE_Int          kc;
99    HYPRE_BigInt       big_k;
100    HYPRE_Int          start;
101    HYPRE_Int          sgn;
102    HYPRE_Int          c_num;
103 
104    HYPRE_Real         diagonal;
105    HYPRE_Real         sum;
106    HYPRE_Real         distribute;
107 
108    HYPRE_Real         zero = 0.0;
109    HYPRE_Real         one  = 1.0;
110 
111    HYPRE_Int          my_id;
112    HYPRE_Int          num_procs;
113    HYPRE_Int          num_threads;
114    HYPRE_Int          num_sends;
115    HYPRE_Int          index;
116    HYPRE_Int          ns, ne, size, rest;
117    HYPRE_Int          print_level = 0;
118    HYPRE_Int         *int_buf_data;
119 
120    HYPRE_BigInt col_1 = hypre_ParCSRMatrixFirstRowIndex(A);
121    HYPRE_Int local_numrows = hypre_CSRMatrixNumRows(A_diag);
122    HYPRE_BigInt col_n = col_1 + (HYPRE_BigInt)local_numrows;
123 
124    HYPRE_Real       wall_time;  /* for debugging instrumentation  */
125 
126    hypre_MPI_Comm_size(comm, &num_procs);
127    hypre_MPI_Comm_rank(comm,&my_id);
128    num_threads = hypre_NumThreads();
129 
130    //my_first_cpt = num_cpts_global[0];
131    if (my_id == (num_procs -1)) total_global_cpts = num_cpts_global[1];
132    hypre_MPI_Bcast(&total_global_cpts, 1, HYPRE_MPI_BIG_INT, num_procs-1, comm);
133 
134    /*-------------------------------------------------------------------
135     * Get the CF_marker data for the off-processor columns
136     *-------------------------------------------------------------------*/
137 
138    if (debug_flag < 0)
139    {
140       debug_flag = -debug_flag;
141       print_level = 1;
142    }
143 
144    if (debug_flag==4) wall_time = time_getWallclockSeconds();
145 
146    if (num_cols_A_offd) CF_marker_offd = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
147    if (num_functions > 1 && num_cols_A_offd)
148    {
149       dof_func_offd = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
150    }
151 
152    if (!comm_pkg)
153    {
154       hypre_MatvecCommPkgCreate(A);
155       comm_pkg = hypre_ParCSRMatrixCommPkg(A);
156    }
157 
158    num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
159    int_buf_data = hypre_CTAlloc(HYPRE_Int,  hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
160                                 HYPRE_MEMORY_HOST);
161 
162    index = 0;
163    for (i = 0; i < num_sends; i++)
164    {
165       start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
166       for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
167       {
168          int_buf_data[index++] = CF_marker[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
169       }
170    }
171 
172    comm_handle = hypre_ParCSRCommHandleCreate( 11, comm_pkg, int_buf_data, CF_marker_offd);
173 
174    hypre_ParCSRCommHandleDestroy(comm_handle);
175    if (num_functions > 1)
176    {
177       index = 0;
178       for (i = 0; i < num_sends; i++)
179       {
180          start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
181          for (j=start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
182             int_buf_data[index++] = dof_func[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
183       }
184 
185       comm_handle = hypre_ParCSRCommHandleCreate( 11, comm_pkg, int_buf_data, dof_func_offd);
186 
187       hypre_ParCSRCommHandleDestroy(comm_handle);
188    }
189 
190    if (debug_flag==4)
191    {
192       wall_time = time_getWallclockSeconds() - wall_time;
193       hypre_printf("Proc = %d     Interp: Comm 1 CF_marker =    %f\n",
194             my_id, wall_time);
195       fflush(NULL);
196    }
197 
198    /*----------------------------------------------------------------------
199     * Get the ghost rows of A
200     *---------------------------------------------------------------------*/
201 
202    if (debug_flag==4) wall_time = time_getWallclockSeconds();
203 
204    if (num_procs > 1)
205    {
206       A_ext      = hypre_ParCSRMatrixExtractBExt(A,A,1);
207       A_ext_i    = hypre_CSRMatrixI(A_ext);
208       A_ext_j    = hypre_CSRMatrixBigJ(A_ext);
209       A_ext_data = hypre_CSRMatrixData(A_ext);
210    }
211 
212    index = 0;
213    for (i=0; i < num_cols_A_offd; i++)
214    {
215       for (j=A_ext_i[i]; j < A_ext_i[i+1]; j++)
216       {
217          big_k = A_ext_j[j];
218          if (big_k >= col_1 && big_k < col_n)
219          {
220             A_ext_j[index] = big_k - col_1;
221             A_ext_data[index++] = A_ext_data[j];
222          }
223          else
224          {
225             kc = hypre_BigBinarySearch(col_map_offd,big_k,num_cols_A_offd);
226             if (kc > -1)
227             {
228                A_ext_j[index] = (HYPRE_BigInt)(-kc-1);
229                A_ext_data[index++] = A_ext_data[j];
230             }
231          }
232       }
233       A_ext_i[i] = index;
234    }
235    for (i = num_cols_A_offd; i > 0; i--)
236       A_ext_i[i] = A_ext_i[i-1];
237    if (num_procs > 1) A_ext_i[0] = 0;
238 
239    if (debug_flag==4)
240    {
241       wall_time = time_getWallclockSeconds() - wall_time;
242       hypre_printf("Proc = %d  Interp: Comm 2   Get A_ext =  %f\n",
243             my_id, wall_time);
244       fflush(NULL);
245    }
246 
247 
248    /*-----------------------------------------------------------------------
249     *  First Pass: Determine size of P and fill in fine_to_coarse mapping.
250     *-----------------------------------------------------------------------*/
251 
252    /*-----------------------------------------------------------------------
253     *  Intialize counters and allocate mapping vector.
254     *-----------------------------------------------------------------------*/
255 
256    coarse_counter = hypre_CTAlloc(HYPRE_Int,  num_threads, HYPRE_MEMORY_HOST);
257    jj_count = hypre_CTAlloc(HYPRE_Int,  num_threads, HYPRE_MEMORY_HOST);
258    jj_count_offd = hypre_CTAlloc(HYPRE_Int,  num_threads, HYPRE_MEMORY_HOST);
259 
260    fine_to_coarse = hypre_CTAlloc(HYPRE_Int,  n_fine, HYPRE_MEMORY_HOST);
261 #ifdef HYPRE_USING_OPENMP
262 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
263 #endif
264    for (i = 0; i < n_fine; i++) fine_to_coarse[i] = -1;
265 
266    jj_counter = start_indexing;
267    jj_counter_offd = start_indexing;
268 
269    /*-----------------------------------------------------------------------
270     *  Loop over fine grid.
271     *-----------------------------------------------------------------------*/
272 
273    /* RDF: this looks a little tricky, but doable */
274 #ifdef HYPRE_USING_OPENMP
275 #pragma omp parallel for private(i,j,i1,jj,ns,ne,size,rest) HYPRE_SMP_SCHEDULE
276 #endif
277    for (j = 0; j < num_threads; j++)
278    {
279       size = n_fine/num_threads;
280       rest = n_fine - size*num_threads;
281       if (j < rest)
282       {
283          ns = j*size+j;
284          ne = (j+1)*size+j+1;
285       }
286       else
287       {
288          ns = j*size+rest;
289          ne = (j+1)*size+rest;
290       }
291       for (i = ns; i < ne; i++)
292       {
293 
294          /*--------------------------------------------------------------------
295           *  If i is a C-point, interpolation is the identity. Also set up
296           *  mapping vector.
297           *--------------------------------------------------------------------*/
298 
299          if (CF_marker[i] >= 0)
300          {
301             jj_count[j]++;
302             fine_to_coarse[i] = coarse_counter[j];
303             coarse_counter[j]++;
304          }
305 
306          /*--------------------------------------------------------------------
307           *  If i is an F-point, interpolation is from the C-points that
308           *  strongly influence i.
309           *--------------------------------------------------------------------*/
310 
311          else
312          {
313             for (jj = S_diag_i[i]; jj < S_diag_i[i+1]; jj++)
314             {
315                i1 = S_diag_j[jj];
316                if (CF_marker[i1] >= 0)
317                {
318                   jj_count[j]++;
319                }
320             }
321 
322             if (num_procs > 1)
323             {
324                for (jj = S_offd_i[i]; jj < S_offd_i[i+1]; jj++)
325                {
326                   i1 = S_offd_j[jj];
327                   if (CF_marker_offd[i1] >= 0)
328                   {
329                      jj_count_offd[j]++;
330                   }
331                }
332             }
333          }
334       }
335    }
336 
337    /*-----------------------------------------------------------------------
338     *  Allocate  arrays.
339     *-----------------------------------------------------------------------*/
340 
341    for (i=0; i < num_threads-1; i++)
342    {
343       coarse_counter[i+1] += coarse_counter[i];
344       jj_count[i+1] += jj_count[i];
345       jj_count_offd[i+1] += jj_count_offd[i];
346    }
347    i = num_threads-1;
348    jj_counter = jj_count[i];
349    jj_counter_offd = jj_count_offd[i];
350 
351    P_diag_size = jj_counter;
352 
353    P_diag_i    = hypre_CTAlloc(HYPRE_Int,  n_fine+1,    HYPRE_MEMORY_DEVICE);
354    P_diag_j    = hypre_CTAlloc(HYPRE_Int,  P_diag_size, HYPRE_MEMORY_DEVICE);
355    P_diag_data = hypre_CTAlloc(HYPRE_Real, P_diag_size, HYPRE_MEMORY_DEVICE);
356 
357    P_diag_i[n_fine] = jj_counter;
358 
359 
360    P_offd_size = jj_counter_offd;
361 
362    P_offd_i    = hypre_CTAlloc(HYPRE_Int,  n_fine+1,    HYPRE_MEMORY_DEVICE);
363    P_offd_j    = hypre_CTAlloc(HYPRE_Int,  P_offd_size, HYPRE_MEMORY_DEVICE);
364    P_offd_data = hypre_CTAlloc(HYPRE_Real, P_offd_size, HYPRE_MEMORY_DEVICE);
365 
366    /*-----------------------------------------------------------------------
367     *  Intialize some stuff.
368     *-----------------------------------------------------------------------*/
369 
370    jj_counter = start_indexing;
371    jj_counter_offd = start_indexing;
372 
373    if (debug_flag==4)
374    {
375       wall_time = time_getWallclockSeconds() - wall_time;
376       hypre_printf("Proc = %d     Interp: Internal work 1 =     %f\n",
377             my_id, wall_time);
378       fflush(NULL);
379    }
380 
381    /*-----------------------------------------------------------------------
382     *  Send and receive fine_to_coarse info.
383     *-----------------------------------------------------------------------*/
384 
385    if (debug_flag==4) wall_time = time_getWallclockSeconds();
386 
387    //fine_to_coarse_offd = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
388 
389 #ifdef HYPRE_USING_OPENMP
390 #pragma omp parallel for private(i,j,ns,ne,size,rest,coarse_shift) HYPRE_SMP_SCHEDULE
391 #endif
392    for (j = 0; j < num_threads; j++)
393    {
394       coarse_shift = 0;
395       if (j > 0) coarse_shift = coarse_counter[j-1];
396       size = n_fine/num_threads;
397       rest = n_fine - size*num_threads;
398       if (j < rest)
399       {
400          ns = j*size+j;
401          ne = (j+1)*size+j+1;
402       }
403       else
404       {
405          ns = j*size+rest;
406          ne = (j+1)*size+rest;
407       }
408       for (i = ns; i < ne; i++)
409       {
410          fine_to_coarse[i] += coarse_shift;
411       }
412       //fine_to_coarse[i] += my_first_cpt+coarse_shift;
413    }
414    /*index = 0;
415      for (i = 0; i < num_sends; i++)
416      {
417      start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
418      for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
419      int_buf_data[index++]
420      = fine_to_coarse[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
421      }
422 
423      comm_handle = hypre_ParCSRCommHandleCreate( 11, comm_pkg, int_buf_data, fine_to_coarse_offd);
424 
425      hypre_ParCSRCommHandleDestroy(comm_handle);
426 
427      if (debug_flag==4)
428      {
429      wall_time = time_getWallclockSeconds() - wall_time;
430      hypre_printf("Proc = %d     Interp: Comm 4 FineToCoarse = %f\n",
431      my_id, wall_time);
432      fflush(NULL);
433      }*/
434 
435    if (debug_flag==4) wall_time = time_getWallclockSeconds();
436 
437    /*#ifdef HYPRE_USING_OPENMP
438 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
439 #endif
440 for (i = 0; i < n_fine; i++) fine_to_coarse[i] -= my_first_cpt; */
441 
442    /*-----------------------------------------------------------------------
443     *  Loop over fine grid points.
444     *-----------------------------------------------------------------------*/
445 
446 #ifdef HYPRE_USING_OPENMP
447 #pragma omp parallel for private(i,j,jl,i1,i2,jj,jj1,ns,ne,size,rest,sum,diagonal,distribute,P_marker,P_marker_offd,strong_f_marker,jj_counter,jj_counter_offd,sgn,c_num,jj_begin_row,jj_end_row,jj_begin_row_offd,jj_end_row_offd) HYPRE_SMP_SCHEDULE
448 #endif
449    for (jl = 0; jl < num_threads; jl++)
450    {
451       size = n_fine/num_threads;
452       rest = n_fine - size*num_threads;
453       if (jl < rest)
454       {
455          ns = jl*size+jl;
456          ne = (jl+1)*size+jl+1;
457       }
458       else
459       {
460          ns = jl*size+rest;
461          ne = (jl+1)*size+rest;
462       }
463       jj_counter = 0;
464       if (jl > 0) jj_counter = jj_count[jl-1];
465       jj_counter_offd = 0;
466       if (jl > 0) jj_counter_offd = jj_count_offd[jl-1];
467 
468       P_marker = hypre_CTAlloc(HYPRE_Int,  n_fine, HYPRE_MEMORY_HOST);
469       if (num_cols_A_offd)
470          P_marker_offd = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
471       else
472          P_marker_offd = NULL;
473 
474       for (i = 0; i < n_fine; i++)
475       {
476          P_marker[i] = -1;
477       }
478       for (i = 0; i < num_cols_A_offd; i++)
479       {
480          P_marker_offd[i] = -1;
481       }
482       strong_f_marker = -2;
483 
484       for (i = ns; i < ne; i++)
485       {
486 
487          /*--------------------------------------------------------------------
488           *  If i is a c-point, interpolation is the identity.
489           *--------------------------------------------------------------------*/
490 
491          if (CF_marker[i] >= 0)
492          {
493             P_diag_i[i] = jj_counter;
494             P_diag_j[jj_counter]    = fine_to_coarse[i];
495             P_diag_data[jj_counter] = one;
496             jj_counter++;
497          }
498 
499          /*--------------------------------------------------------------------
500           *  If i is an F-point, build interpolation.
501           *--------------------------------------------------------------------*/
502 
503          else
504          {
505             /* Diagonal part of P */
506             P_diag_i[i] = jj_counter;
507             jj_begin_row = jj_counter;
508 
509             for (jj = S_diag_i[i]; jj < S_diag_i[i+1]; jj++)
510             {
511                i1 = S_diag_j[jj];
512 
513                /*--------------------------------------------------------------
514                 * If neighbor i1 is a C-point, set column number in P_diag_j
515                 * and initialize interpolation weight to zero.
516                 *--------------------------------------------------------------*/
517 
518                if (CF_marker[i1] >= 0)
519                {
520                   P_marker[i1] = jj_counter;
521                   P_diag_j[jj_counter]    = fine_to_coarse[i1];
522                   P_diag_data[jj_counter] = zero;
523                   jj_counter++;
524                }
525 
526                /*--------------------------------------------------------------
527                 * If neighbor i1 is an F-point, mark it as a strong F-point
528                 * whose connection needs to be distributed.
529                 *--------------------------------------------------------------*/
530 
531                else if (CF_marker[i1] != -3)
532                {
533                   P_marker[i1] = strong_f_marker;
534                }
535             }
536             jj_end_row = jj_counter;
537 
538             /* Off-Diagonal part of P */
539             P_offd_i[i] = jj_counter_offd;
540             jj_begin_row_offd = jj_counter_offd;
541 
542 
543             if (num_procs > 1)
544             {
545                for (jj = S_offd_i[i]; jj < S_offd_i[i+1]; jj++)
546                {
547                   i1 = S_offd_j[jj];
548 
549                   /*-----------------------------------------------------------
550                    * If neighbor i1 is a C-point, set column number in P_offd_j
551                    * and initialize interpolation weight to zero.
552                    *-----------------------------------------------------------*/
553 
554                   if (CF_marker_offd[i1] >= 0)
555                   {
556                      P_marker_offd[i1] = jj_counter_offd;
557                      /*P_offd_j[jj_counter_offd]  = fine_to_coarse_offd[i1];*/
558                      P_offd_j[jj_counter_offd]  = i1;
559                      P_offd_data[jj_counter_offd] = zero;
560                      jj_counter_offd++;
561                   }
562 
563                   /*-----------------------------------------------------------
564                    * If neighbor i1 is an F-point, mark it as a strong F-point
565                    * whose connection needs to be distributed.
566                    *-----------------------------------------------------------*/
567 
568                   else if (CF_marker_offd[i1] != -3)
569                   {
570                      P_marker_offd[i1] = strong_f_marker;
571                   }
572                }
573             }
574 
575             jj_end_row_offd = jj_counter_offd;
576 
577             diagonal = A_diag_data[A_diag_i[i]];
578 
579 
580             /* Loop over ith row of A.  First, the diagonal part of A */
581 
582             for (jj = A_diag_i[i]+1; jj < A_diag_i[i+1]; jj++)
583             {
584                i1 = A_diag_j[jj];
585 
586                /*--------------------------------------------------------------
587                 * Case 1: neighbor i1 is a C-point and strongly influences i,
588                 * accumulate a_{i,i1} into the interpolation weight.
589                 *--------------------------------------------------------------*/
590 
591                if (P_marker[i1] >= jj_begin_row)
592                {
593                   P_diag_data[P_marker[i1]] += A_diag_data[jj];
594                }
595 
596                /*--------------------------------------------------------------
597                 * Case 2: neighbor i1 is an F-point and strongly influences i,
598                 * distribute a_{i,i1} to C-points that strongly infuence i.
599                 * Note: currently no distribution to the diagonal in this case.
600                 *--------------------------------------------------------------*/
601 
602                else if (P_marker[i1] == strong_f_marker)
603                {
604                   sum = zero;
605 
606                   /*-----------------------------------------------------------
607                    * Loop over row of A for point i1 and calculate the sum
608                    * of the connections to c-points that strongly influence i.
609                    *-----------------------------------------------------------*/
610                   sgn = 1;
611                   if (A_diag_data[A_diag_i[i1]] < 0) sgn = -1;
612                   /* Diagonal block part of row i1 */
613                   for (jj1 = A_diag_i[i1]; jj1 < A_diag_i[i1+1]; jj1++)
614                   {
615                      i2 = A_diag_j[jj1];
616                      if (P_marker[i2] >= jj_begin_row &&
617                            (sgn*A_diag_data[jj1]) < 0)
618                      {
619                         sum += A_diag_data[jj1];
620                      }
621                   }
622 
623                   /* Off-Diagonal block part of row i1 */
624                   if (num_procs > 1)
625                   {
626                      for (jj1 = A_offd_i[i1]; jj1 < A_offd_i[i1+1]; jj1++)
627                      {
628                         i2 = A_offd_j[jj1];
629                         if (P_marker_offd[i2] >= jj_begin_row_offd
630                               && (sgn*A_offd_data[jj1]) < 0)
631                         {
632                            sum += A_offd_data[jj1];
633                         }
634                      }
635                   }
636 
637                   if (sum != 0)
638                   {
639                      distribute = A_diag_data[jj] / sum;
640 
641                      /*-----------------------------------------------------------
642                       * Loop over row of A for point i1 and do the distribution.
643                       *-----------------------------------------------------------*/
644 
645                      /* Diagonal block part of row i1 */
646                      for (jj1 = A_diag_i[i1]; jj1 < A_diag_i[i1+1]; jj1++)
647                      {
648                         i2 = A_diag_j[jj1];
649                         if (P_marker[i2] >= jj_begin_row
650                               && (sgn*A_diag_data[jj1]) < 0)
651                         {
652                            P_diag_data[P_marker[i2]]
653                               += distribute * A_diag_data[jj1];
654                         }
655                      }
656 
657                      /* Off-Diagonal block part of row i1 */
658                      if (num_procs > 1)
659                      {
660                         for (jj1 = A_offd_i[i1]; jj1 < A_offd_i[i1+1]; jj1++)
661                         {
662                            i2 = A_offd_j[jj1];
663                            if (P_marker_offd[i2] >= jj_begin_row_offd
664                                  && (sgn*A_offd_data[jj1]) < 0)
665                            {
666                               P_offd_data[P_marker_offd[i2]]
667                                  += distribute * A_offd_data[jj1];
668                            }
669                         }
670                      }
671                   }
672                   else
673                   {
674                      if (num_functions == 1 || dof_func[i] == dof_func[i1])
675                      {
676                         diagonal += A_diag_data[jj];
677                      }
678                   }
679                }
680 
681                /*--------------------------------------------------------------
682                 * Case 3: neighbor i1 weakly influences i, accumulate a_{i,i1}
683                 * into the diagonal.
684                 *--------------------------------------------------------------*/
685 
686                else if (CF_marker[i1] != -3)
687                {
688                   if (num_functions == 1 || dof_func[i] == dof_func[i1])
689                   {
690                      diagonal += A_diag_data[jj];
691                   }
692                }
693 
694             }
695 
696 
697             /*----------------------------------------------------------------
698              * Still looping over ith row of A. Next, loop over the
699              * off-diagonal part of A
700              *---------------------------------------------------------------*/
701 
702             if (num_procs > 1)
703             {
704                for (jj = A_offd_i[i]; jj < A_offd_i[i+1]; jj++)
705                {
706                   i1 = A_offd_j[jj];
707 
708                   /*--------------------------------------------------------------
709                    * Case 1: neighbor i1 is a C-point and strongly influences i,
710                    * accumulate a_{i,i1} into the interpolation weight.
711                    *--------------------------------------------------------------*/
712 
713                   if (P_marker_offd[i1] >= jj_begin_row_offd)
714                   {
715                      P_offd_data[P_marker_offd[i1]] += A_offd_data[jj];
716                   }
717 
718                   /*------------------------------------------------------------
719                    * Case 2: neighbor i1 is an F-point and strongly influences i,
720                    * distribute a_{i,i1} to C-points that strongly infuence i.
721                    * Note: currently no distribution to the diagonal in this case.
722                    *-----------------------------------------------------------*/
723 
724                   else if (P_marker_offd[i1] == strong_f_marker)
725                   {
726                      sum = zero;
727 
728                      /*---------------------------------------------------------
729                       * Loop over row of A_ext for point i1 and calculate the sum
730                       * of the connections to c-points that strongly influence i.
731                       *---------------------------------------------------------*/
732 
733                      /* find row number */
734                      c_num = A_offd_j[jj];
735 
736                      sgn = 1;
737                      if (A_ext_data[A_ext_i[c_num]] < 0) sgn = -1;
738                      for (jj1 = A_ext_i[c_num]; jj1 < A_ext_i[c_num+1]; jj1++)
739                      {
740                         i2 = (HYPRE_Int)A_ext_j[jj1];
741 
742                         if (i2 > -1)
743                         {
744                            /* in the diagonal block */
745                            if (P_marker[i2] >= jj_begin_row
746                                  && (sgn*A_ext_data[jj1]) < 0)
747                            {
748                               sum += A_ext_data[jj1];
749                            }
750                         }
751                         else
752                         {
753                            /* in the off_diagonal block  */
754                            if (P_marker_offd[-i2-1] >= jj_begin_row_offd
755                                  && (sgn*A_ext_data[jj1]) < 0)
756                            {
757                               sum += A_ext_data[jj1];
758                            }
759 
760                         }
761 
762                      }
763 
764                      if (sum != 0)
765                      {
766                         distribute = A_offd_data[jj] / sum;
767                         /*---------------------------------------------------------
768                          * Loop over row of A_ext for point i1 and do
769                          * the distribution.
770                          *--------------------------------------------------------*/
771 
772                         /* Diagonal block part of row i1 */
773 
774                         for (jj1 = A_ext_i[c_num]; jj1 < A_ext_i[c_num+1]; jj1++)
775                         {
776                            i2 = (HYPRE_Int)A_ext_j[jj1];
777 
778                            if (i2 > -1) /* in the diagonal block */
779                            {
780                               if (P_marker[i2] >= jj_begin_row
781                                     && (sgn*A_ext_data[jj1]) < 0)
782                               {
783                                  P_diag_data[P_marker[i2]]
784                                     += distribute * A_ext_data[jj1];
785                               }
786                            }
787                            else
788                            {
789                               /* in the off_diagonal block  */
790                               if (P_marker_offd[-i2-1] >= jj_begin_row_offd
791                                     && (sgn*A_ext_data[jj1]) < 0)
792                                  P_offd_data[P_marker_offd[-i2-1]]
793                                     += distribute * A_ext_data[jj1];
794                            }
795                         }
796                      }
797                      else
798                      {
799                         if (num_functions == 1 || dof_func[i] == dof_func_offd[i1])
800                         {
801                            diagonal += A_offd_data[jj];
802                         }
803                      }
804                   }
805 
806                   /*-----------------------------------------------------------
807                    * Case 3: neighbor i1 weakly influences i, accumulate a_{i,i1}
808                    * into the diagonal.
809                    *-----------------------------------------------------------*/
810 
811                   else if (CF_marker_offd[i1] != -3)
812                   {
813                      if (num_functions == 1 || dof_func[i] == dof_func_offd[i1])
814                      {
815                         diagonal += A_offd_data[jj];
816                      }
817                   }
818 
819                }
820             }
821 
822             /*-----------------------------------------------------------------
823              * Set interpolation weight by dividing by the diagonal.
824              *-----------------------------------------------------------------*/
825 
826             if (diagonal == 0.0)
827             {
828                if (print_level)
829                {
830                   hypre_printf(" Warning! zero diagonal! Proc id %d row %d\n", my_id,i);
831                }
832                for (jj = jj_begin_row; jj < jj_end_row; jj++)
833                {
834                   P_diag_data[jj] = 0.0;
835                }
836                for (jj = jj_begin_row_offd; jj < jj_end_row_offd; jj++)
837                {
838                   P_offd_data[jj] = 0.0;
839                }
840             }
841             else
842             {
843                for (jj = jj_begin_row; jj < jj_end_row; jj++)
844                {
845                   P_diag_data[jj] /= -diagonal;
846                }
847                for (jj = jj_begin_row_offd; jj < jj_end_row_offd; jj++)
848                {
849                   P_offd_data[jj] /= -diagonal;
850                }
851             }
852 
853          }
854 
855          strong_f_marker--;
856 
857          P_offd_i[i+1] = jj_counter_offd;
858       }
859       hypre_TFree(P_marker, HYPRE_MEMORY_HOST);
860       hypre_TFree(P_marker_offd, HYPRE_MEMORY_HOST);
861    }
862 
863    P = hypre_ParCSRMatrixCreate(comm,
864          hypre_ParCSRMatrixGlobalNumRows(A),
865          total_global_cpts,
866          hypre_ParCSRMatrixColStarts(A),
867          num_cpts_global,
868          0,
869          P_diag_i[n_fine],
870          P_offd_i[n_fine]);
871 
872    P_diag = hypre_ParCSRMatrixDiag(P);
873    hypre_CSRMatrixData(P_diag) = P_diag_data;
874    hypre_CSRMatrixI(P_diag) = P_diag_i;
875    hypre_CSRMatrixJ(P_diag) = P_diag_j;
876    P_offd = hypre_ParCSRMatrixOffd(P);
877    hypre_CSRMatrixData(P_offd) = P_offd_data;
878    hypre_CSRMatrixI(P_offd) = P_offd_i;
879    hypre_CSRMatrixJ(P_offd) = P_offd_j;
880 
881    /* Compress P, removing coefficients smaller than trunc_factor * Max */
882 
883    if (trunc_factor != 0.0 || max_elmts > 0)
884    {
885       hypre_BoomerAMGInterpTruncation(P, trunc_factor, max_elmts);
886       P_diag_data = hypre_CSRMatrixData(P_diag);
887       P_diag_i = hypre_CSRMatrixI(P_diag);
888       P_diag_j = hypre_CSRMatrixJ(P_diag);
889       P_offd_data = hypre_CSRMatrixData(P_offd);
890       P_offd_i = hypre_CSRMatrixI(P_offd);
891       P_offd_j = hypre_CSRMatrixJ(P_offd);
892       P_diag_size = P_diag_i[n_fine];
893       P_offd_size = P_offd_i[n_fine];
894    }
895 
896    num_cols_P_offd = 0;
897    if (P_offd_size)
898    {
899       P_marker = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
900 
901 #ifdef HYPRE_USING_OPENMP
902 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
903 #endif
904       for (i=0; i < num_cols_A_offd; i++)
905       {
906          P_marker[i] = 0;
907       }
908 
909       num_cols_P_offd = 0;
910       for (i=0; i < P_offd_size; i++)
911       {
912          index = P_offd_j[i];
913          if (!P_marker[index])
914          {
915             num_cols_P_offd++;
916             P_marker[index] = 1;
917          }
918       }
919 
920       col_map_offd_P = hypre_CTAlloc(HYPRE_BigInt, num_cols_P_offd, HYPRE_MEMORY_HOST);
921       tmp_map_offd = hypre_CTAlloc(HYPRE_Int, num_cols_P_offd, HYPRE_MEMORY_HOST);
922 
923       index = 0;
924       for (i=0; i < num_cols_P_offd; i++)
925       {
926          while (P_marker[index]==0) index++;
927          tmp_map_offd[i] = index++;
928       }
929 
930 #ifdef HYPRE_USING_OPENMP
931 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
932 #endif
933       for (i=0; i < P_offd_size; i++)
934          P_offd_j[i] = hypre_BinarySearch(tmp_map_offd,
935                P_offd_j[i],
936                num_cols_P_offd);
937       hypre_TFree(P_marker, HYPRE_MEMORY_HOST);
938    }
939 
940    for (i=0; i < n_fine; i++)
941    {
942       if (CF_marker[i] == -3) CF_marker[i] = -1;
943    }
944 
945    if (num_cols_P_offd)
946    {
947       hypre_ParCSRMatrixColMapOffd(P) = col_map_offd_P;
948       hypre_CSRMatrixNumCols(P_offd) = num_cols_P_offd;
949    }
950 
951    hypre_GetCommPkgRTFromCommPkgA(P,A, fine_to_coarse, tmp_map_offd);
952 
953 
954    *P_ptr = P;
955 
956    hypre_TFree(tmp_map_offd, HYPRE_MEMORY_HOST);
957    hypre_TFree(CF_marker_offd, HYPRE_MEMORY_HOST);
958    hypre_TFree(dof_func_offd, HYPRE_MEMORY_HOST);
959    hypre_TFree(int_buf_data, HYPRE_MEMORY_HOST);
960    hypre_TFree(fine_to_coarse, HYPRE_MEMORY_HOST);
961    //hypre_TFree(fine_to_coarse_offd, HYPRE_MEMORY_HOST);
962    hypre_TFree(coarse_counter, HYPRE_MEMORY_HOST);
963    hypre_TFree(jj_count, HYPRE_MEMORY_HOST);
964    hypre_TFree(jj_count_offd, HYPRE_MEMORY_HOST);
965 
966    if (num_procs > 1) hypre_CSRMatrixDestroy(A_ext);
967 
968    return hypre_error_flag;
969 }
970 
971 
972 /*---------------------------------------------------------------------------
973  * hypre_BoomerAMGBuildInterpHE
974  * interpolation routine for hyperbolic PDEs
975  * treats weak fine connections  like strong fine connections
976  *--------------------------------------------------------------------------*/
977 
978 HYPRE_Int
hypre_BoomerAMGBuildInterpHE(hypre_ParCSRMatrix * A,HYPRE_Int * CF_marker,hypre_ParCSRMatrix * S,HYPRE_BigInt * num_cpts_global,HYPRE_Int num_functions,HYPRE_Int * dof_func,HYPRE_Int debug_flag,HYPRE_Real trunc_factor,HYPRE_Int max_elmts,hypre_ParCSRMatrix ** P_ptr)979 hypre_BoomerAMGBuildInterpHE( hypre_ParCSRMatrix   *A,
980                               HYPRE_Int            *CF_marker,
981                               hypre_ParCSRMatrix   *S,
982                               HYPRE_BigInt         *num_cpts_global,
983                               HYPRE_Int             num_functions,
984                               HYPRE_Int            *dof_func,
985                               HYPRE_Int             debug_flag,
986                               HYPRE_Real            trunc_factor,
987                               HYPRE_Int             max_elmts,
988                               hypre_ParCSRMatrix  **P_ptr)
989 {
990 
991    MPI_Comm      comm = hypre_ParCSRMatrixComm(A);
992    hypre_ParCSRCommPkg     *comm_pkg = hypre_ParCSRMatrixCommPkg(A);
993    hypre_ParCSRCommHandle  *comm_handle;
994 
995    hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
996    HYPRE_Real      *A_diag_data = hypre_CSRMatrixData(A_diag);
997    HYPRE_Int       *A_diag_i = hypre_CSRMatrixI(A_diag);
998    HYPRE_Int       *A_diag_j = hypre_CSRMatrixJ(A_diag);
999 
1000    hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A);
1001    HYPRE_Real      *A_offd_data = hypre_CSRMatrixData(A_offd);
1002    HYPRE_Int       *A_offd_i = hypre_CSRMatrixI(A_offd);
1003    HYPRE_Int       *A_offd_j = hypre_CSRMatrixJ(A_offd);
1004    HYPRE_Int        num_cols_A_offd = hypre_CSRMatrixNumCols(A_offd);
1005    HYPRE_BigInt    *col_map_offd = hypre_ParCSRMatrixColMapOffd(A);
1006 
1007    hypre_CSRMatrix *S_diag = hypre_ParCSRMatrixDiag(S);
1008    HYPRE_Int       *S_diag_i = hypre_CSRMatrixI(S_diag);
1009    HYPRE_Int       *S_diag_j = hypre_CSRMatrixJ(S_diag);
1010 
1011    hypre_CSRMatrix *S_offd = hypre_ParCSRMatrixOffd(S);
1012    HYPRE_Int       *S_offd_i = hypre_CSRMatrixI(S_offd);
1013    HYPRE_Int       *S_offd_j = hypre_CSRMatrixJ(S_offd);
1014 
1015    hypre_ParCSRMatrix *P;
1016    HYPRE_BigInt      *col_map_offd_P;
1017    HYPRE_Int      *tmp_map_offd = NULL;
1018 
1019    HYPRE_Int          *CF_marker_offd = NULL;
1020    HYPRE_Int          *dof_func_offd = NULL;
1021 
1022    hypre_CSRMatrix *A_ext;
1023 
1024    HYPRE_Real      *A_ext_data = NULL;
1025    HYPRE_Int       *A_ext_i = NULL;
1026    HYPRE_BigInt    *A_ext_j = NULL;
1027 
1028    hypre_CSRMatrix *P_diag;
1029    hypre_CSRMatrix *P_offd;
1030 
1031    HYPRE_Real      *P_diag_data;
1032    HYPRE_Int       *P_diag_i;
1033    HYPRE_Int       *P_diag_j;
1034    HYPRE_Real      *P_offd_data;
1035    HYPRE_Int       *P_offd_i;
1036    HYPRE_Int       *P_offd_j;
1037 
1038    HYPRE_Int        P_diag_size, P_offd_size;
1039 
1040    HYPRE_Int       *P_marker, *P_marker_offd;
1041 
1042    HYPRE_Int        jj_counter,jj_counter_offd;
1043    HYPRE_Int       *jj_count, *jj_count_offd;
1044    HYPRE_Int        jj_begin_row,jj_begin_row_offd;
1045    HYPRE_Int        jj_end_row,jj_end_row_offd;
1046 
1047    HYPRE_Int        start_indexing = 0; /* start indexing for P_data at 0 */
1048 
1049    HYPRE_Int        n_fine = hypre_CSRMatrixNumRows(A_diag);
1050 
1051    HYPRE_Int       *fine_to_coarse;
1052    //HYPRE_Int       *fine_to_coarse_offd;
1053    HYPRE_Int       *coarse_counter;
1054    HYPRE_Int        coarse_shift;
1055    HYPRE_BigInt     total_global_cpts;
1056    //HYPRE_BigInt     my_first_cpt;
1057    HYPRE_Int        num_cols_P_offd;
1058 
1059    HYPRE_Int        i,i1,i2;
1060    HYPRE_Int        j,jl,jj,jj1;
1061    HYPRE_Int        kc;
1062    HYPRE_BigInt     big_k;
1063    HYPRE_Int        start;
1064    HYPRE_Int        sgn;
1065    HYPRE_Int        c_num;
1066 
1067    HYPRE_Real       diagonal;
1068    HYPRE_Real       sum;
1069    HYPRE_Real       distribute;
1070 
1071    HYPRE_Real       zero = 0.0;
1072    HYPRE_Real       one  = 1.0;
1073 
1074    HYPRE_Int        my_id;
1075    HYPRE_Int        num_procs;
1076    HYPRE_Int        num_threads;
1077    HYPRE_Int        num_sends;
1078    HYPRE_Int        index;
1079    HYPRE_Int        ns, ne, size, rest;
1080    HYPRE_Int       *int_buf_data;
1081 
1082    HYPRE_BigInt col_1 = hypre_ParCSRMatrixFirstRowIndex(A);
1083    HYPRE_Int local_numrows = hypre_CSRMatrixNumRows(A_diag);
1084    HYPRE_BigInt col_n = col_1 + local_numrows;
1085 
1086    HYPRE_Real       wall_time;  /* for debugging instrumentation  */
1087 
1088    hypre_MPI_Comm_size(comm, &num_procs);
1089    hypre_MPI_Comm_rank(comm,&my_id);
1090    num_threads = hypre_NumThreads();
1091 
1092 
1093    //my_first_cpt = num_cpts_global[0];
1094    if (my_id == (num_procs -1)) total_global_cpts = num_cpts_global[1];
1095    hypre_MPI_Bcast(&total_global_cpts, 1, HYPRE_MPI_BIG_INT, num_procs-1, comm);
1096 
1097    /*-------------------------------------------------------------------
1098     * Get the CF_marker data for the off-processor columns
1099     *-------------------------------------------------------------------*/
1100 
1101    if (debug_flag==4) wall_time = time_getWallclockSeconds();
1102 
1103    if (num_cols_A_offd) CF_marker_offd = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
1104    if (num_functions > 1 && num_cols_A_offd)
1105       dof_func_offd = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
1106 
1107    if (!comm_pkg)
1108    {
1109       hypre_MatvecCommPkgCreate(A);
1110       comm_pkg = hypre_ParCSRMatrixCommPkg(A);
1111    }
1112 
1113    num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
1114    int_buf_data = hypre_CTAlloc(HYPRE_Int, hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
1115                                 HYPRE_MEMORY_HOST);
1116 
1117    index = 0;
1118    for (i = 0; i < num_sends; i++)
1119    {
1120       start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
1121       for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
1122          int_buf_data[index++]
1123             = CF_marker[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
1124    }
1125 
1126    comm_handle = hypre_ParCSRCommHandleCreate( 11, comm_pkg, int_buf_data, CF_marker_offd);
1127 
1128    hypre_ParCSRCommHandleDestroy(comm_handle);
1129    if (num_functions > 1)
1130    {
1131       index = 0;
1132       for (i = 0; i < num_sends; i++)
1133       {
1134          start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
1135          for (j=start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
1136             int_buf_data[index++]
1137                = dof_func[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
1138       }
1139 
1140       comm_handle = hypre_ParCSRCommHandleCreate( 11, comm_pkg, int_buf_data, dof_func_offd);
1141 
1142       hypre_ParCSRCommHandleDestroy(comm_handle);
1143    }
1144 
1145    if (debug_flag==4)
1146    {
1147       wall_time = time_getWallclockSeconds() - wall_time;
1148       hypre_printf("Proc = %d     Interp: Comm 1 CF_marker =    %f\n",
1149             my_id, wall_time);
1150       fflush(NULL);
1151    }
1152 
1153    /*----------------------------------------------------------------------
1154     * Get the ghost rows of A
1155     *---------------------------------------------------------------------*/
1156 
1157    if (debug_flag==4) wall_time = time_getWallclockSeconds();
1158 
1159    if (num_procs > 1)
1160    {
1161       A_ext      = hypre_ParCSRMatrixExtractBExt(A,A,1);
1162       A_ext_i    = hypre_CSRMatrixI(A_ext);
1163       A_ext_j    = hypre_CSRMatrixBigJ(A_ext);
1164       A_ext_data = hypre_CSRMatrixData(A_ext);
1165    }
1166 
1167    index = 0;
1168    for (i=0; i < num_cols_A_offd; i++)
1169    {
1170       for (j=A_ext_i[i]; j < A_ext_i[i+1]; j++)
1171       {
1172          big_k = A_ext_j[j];
1173          if (big_k >= col_1 && big_k < col_n)
1174          {
1175             A_ext_j[index] = big_k - col_1;
1176             A_ext_data[index++] = A_ext_data[j];
1177          }
1178          else
1179          {
1180             kc = hypre_BigBinarySearch(col_map_offd,big_k,num_cols_A_offd);
1181             if (kc > -1)
1182             {
1183                A_ext_j[index] = (HYPRE_BigInt)(-kc-1);
1184                A_ext_data[index++] = A_ext_data[j];
1185             }
1186          }
1187       }
1188       A_ext_i[i] = index;
1189    }
1190    for (i = num_cols_A_offd; i > 0; i--)
1191       A_ext_i[i] = A_ext_i[i-1];
1192    if (num_procs > 1) A_ext_i[0] = 0;
1193 
1194    if (debug_flag==4)
1195    {
1196       wall_time = time_getWallclockSeconds() - wall_time;
1197       hypre_printf("Proc = %d  Interp: Comm 2   Get A_ext =  %f\n",
1198             my_id, wall_time);
1199       fflush(NULL);
1200    }
1201 
1202    /*-----------------------------------------------------------------------
1203     *  First Pass: Determine size of P and fill in fine_to_coarse mapping.
1204     *-----------------------------------------------------------------------*/
1205 
1206    /*-----------------------------------------------------------------------
1207     *  Intialize counters and allocate mapping vector.
1208     *-----------------------------------------------------------------------*/
1209 
1210    coarse_counter = hypre_CTAlloc(HYPRE_Int,  num_threads, HYPRE_MEMORY_HOST);
1211    jj_count = hypre_CTAlloc(HYPRE_Int,  num_threads, HYPRE_MEMORY_HOST);
1212    jj_count_offd = hypre_CTAlloc(HYPRE_Int,  num_threads, HYPRE_MEMORY_HOST);
1213 
1214    fine_to_coarse = hypre_CTAlloc(HYPRE_Int,  n_fine, HYPRE_MEMORY_HOST);
1215 #ifdef HYPRE_USING_OPENMP
1216 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
1217 #endif
1218    for (i = 0; i < n_fine; i++) fine_to_coarse[i] = -1;
1219 
1220    jj_counter = start_indexing;
1221    jj_counter_offd = start_indexing;
1222 
1223    /*-----------------------------------------------------------------------
1224     *  Loop over fine grid.
1225     *-----------------------------------------------------------------------*/
1226 
1227    /* RDF: this looks a little tricky, but doable */
1228 #ifdef HYPRE_USING_OPENMP
1229 #pragma omp parallel for private(i,j,i1,jj,ns,ne,size,rest) HYPRE_SMP_SCHEDULE
1230 #endif
1231    for (j = 0; j < num_threads; j++)
1232    {
1233       size = n_fine/num_threads;
1234       rest = n_fine - size*num_threads;
1235       if (j < rest)
1236       {
1237          ns = j*size+j;
1238          ne = (j+1)*size+j+1;
1239       }
1240       else
1241       {
1242          ns = j*size+rest;
1243          ne = (j+1)*size+rest;
1244       }
1245       for (i = ns; i < ne; i++)
1246       {
1247 
1248          /*--------------------------------------------------------------------
1249           *  If i is a C-point, interpolation is the identity. Also set up
1250           *  mapping vector.
1251           *--------------------------------------------------------------------*/
1252 
1253          if (CF_marker[i] >= 0)
1254          {
1255             jj_count[j]++;
1256             fine_to_coarse[i] = coarse_counter[j];
1257             coarse_counter[j]++;
1258          }
1259 
1260          /*--------------------------------------------------------------------
1261           *  If i is an F-point, interpolation is from the C-points that
1262           *  strongly influence i.
1263           *--------------------------------------------------------------------*/
1264 
1265          else
1266          {
1267             for (jj = S_diag_i[i]; jj < S_diag_i[i+1]; jj++)
1268             {
1269                i1 = S_diag_j[jj];
1270                if (CF_marker[i1] >= 0)
1271                {
1272                   jj_count[j]++;
1273                }
1274             }
1275 
1276             if (num_procs > 1)
1277             {
1278                for (jj = S_offd_i[i]; jj < S_offd_i[i+1]; jj++)
1279                {
1280                   i1 = S_offd_j[jj];
1281                   if (CF_marker_offd[i1] >= 0)
1282                   {
1283                      jj_count_offd[j]++;
1284                   }
1285                }
1286             }
1287          }
1288       }
1289    }
1290 
1291    /*-----------------------------------------------------------------------
1292     *  Allocate  arrays.
1293     *-----------------------------------------------------------------------*/
1294 
1295    for (i=0; i < num_threads-1; i++)
1296    {
1297       coarse_counter[i+1] += coarse_counter[i];
1298       jj_count[i+1] += jj_count[i];
1299       jj_count_offd[i+1] += jj_count_offd[i];
1300    }
1301    i = num_threads-1;
1302    jj_counter = jj_count[i];
1303    jj_counter_offd = jj_count_offd[i];
1304 
1305    P_diag_size = jj_counter;
1306 
1307    P_diag_i    = hypre_CTAlloc(HYPRE_Int,  n_fine+1, HYPRE_MEMORY_HOST);
1308    P_diag_j    = hypre_CTAlloc(HYPRE_Int,  P_diag_size, HYPRE_MEMORY_HOST);
1309    P_diag_data = hypre_CTAlloc(HYPRE_Real,  P_diag_size, HYPRE_MEMORY_HOST);
1310 
1311    P_diag_i[n_fine] = jj_counter;
1312 
1313 
1314    P_offd_size = jj_counter_offd;
1315 
1316    P_offd_i    = hypre_CTAlloc(HYPRE_Int,  n_fine+1, HYPRE_MEMORY_HOST);
1317    P_offd_j    = hypre_CTAlloc(HYPRE_Int,  P_offd_size, HYPRE_MEMORY_HOST);
1318    P_offd_data = hypre_CTAlloc(HYPRE_Real,  P_offd_size, HYPRE_MEMORY_HOST);
1319 
1320    /*-----------------------------------------------------------------------
1321     *  Intialize some stuff.
1322     *-----------------------------------------------------------------------*/
1323 
1324    jj_counter = start_indexing;
1325    jj_counter_offd = start_indexing;
1326 
1327    if (debug_flag==4)
1328    {
1329       wall_time = time_getWallclockSeconds() - wall_time;
1330       hypre_printf("Proc = %d     Interp: Internal work 1 =     %f\n",
1331             my_id, wall_time);
1332       fflush(NULL);
1333    }
1334 
1335    /*-----------------------------------------------------------------------
1336     *  Send and receive fine_to_coarse info.
1337     *-----------------------------------------------------------------------*/
1338 
1339    if (debug_flag==4) wall_time = time_getWallclockSeconds();
1340 
1341    //fine_to_coarse_offd = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
1342 
1343 #ifdef HYPRE_USING_OPENMP
1344 #pragma omp parallel for private(i,j,ns,ne,size,rest,coarse_shift) HYPRE_SMP_SCHEDULE
1345 #endif
1346    for (j = 0; j < num_threads; j++)
1347    {
1348       coarse_shift = 0;
1349       if (j > 0) coarse_shift = coarse_counter[j-1];
1350       size = n_fine/num_threads;
1351       rest = n_fine - size*num_threads;
1352       if (j < rest)
1353       {
1354          ns = j*size+j;
1355          ne = (j+1)*size+j+1;
1356       }
1357       else
1358       {
1359          ns = j*size+rest;
1360          ne = (j+1)*size+rest;
1361       }
1362       for (i = ns; i < ne; i++)
1363          fine_to_coarse[i] += coarse_shift;
1364    }
1365    /*index = 0;
1366      for (i = 0; i < num_sends; i++)
1367      {
1368      start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
1369      for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
1370      int_buf_data[index++]
1371      = fine_to_coarse[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
1372      }
1373 
1374      comm_handle = hypre_ParCSRCommHandleCreate( 11, comm_pkg, int_buf_data,
1375      fine_to_coarse_offd);
1376 
1377      hypre_ParCSRCommHandleDestroy(comm_handle);
1378 
1379      if (debug_flag==4)
1380      {
1381      wall_time = time_getWallclockSeconds() - wall_time;
1382      hypre_printf("Proc = %d     Interp: Comm 4 FineToCoarse = %f\n",
1383      my_id, wall_time);
1384      fflush(NULL);
1385      }*/
1386 
1387    if (debug_flag==4) wall_time = time_getWallclockSeconds();
1388 
1389    /*#ifdef HYPRE_USING_OPENMP
1390 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
1391 #endif
1392 for (i = 0; i < n_fine; i++) fine_to_coarse[i] -= my_first_cpt;*/
1393 
1394    /*-----------------------------------------------------------------------
1395     *  Loop over fine grid points.
1396     *-----------------------------------------------------------------------*/
1397 
1398 #ifdef HYPRE_USING_OPENMP
1399 #pragma omp parallel for private(i,j,jl,i1,i2,jj,jj1,ns,ne,size,rest,sum,diagonal,distribute,P_marker,P_marker_offd,jj_counter,jj_counter_offd,sgn,c_num,jj_begin_row,jj_end_row,jj_begin_row_offd,jj_end_row_offd) HYPRE_SMP_SCHEDULE
1400 #endif
1401    for (jl = 0; jl < num_threads; jl++)
1402    {
1403       size = n_fine/num_threads;
1404       rest = n_fine - size*num_threads;
1405       if (jl < rest)
1406       {
1407          ns = jl*size+jl;
1408          ne = (jl+1)*size+jl+1;
1409       }
1410       else
1411       {
1412          ns = jl*size+rest;
1413          ne = (jl+1)*size+rest;
1414       }
1415       jj_counter = 0;
1416       if (jl > 0) jj_counter = jj_count[jl-1];
1417       jj_counter_offd = 0;
1418       if (jl > 0) jj_counter_offd = jj_count_offd[jl-1];
1419 
1420       P_marker = hypre_CTAlloc(HYPRE_Int,  n_fine, HYPRE_MEMORY_HOST);
1421       if (num_cols_A_offd)
1422          P_marker_offd = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
1423       else
1424          P_marker_offd = NULL;
1425 
1426       for (i = 0; i < n_fine; i++)
1427       {
1428          P_marker[i] = -1;
1429       }
1430       for (i = 0; i < num_cols_A_offd; i++)
1431       {
1432          P_marker_offd[i] = -1;
1433       }
1434 
1435       for (i = ns; i < ne; i++)
1436       {
1437 
1438          /*--------------------------------------------------------------------
1439           *  If i is a c-point, interpolation is the identity.
1440           *--------------------------------------------------------------------*/
1441 
1442          if (CF_marker[i] >= 0)
1443          {
1444             P_diag_i[i] = jj_counter;
1445             P_diag_j[jj_counter]    = fine_to_coarse[i];
1446             P_diag_data[jj_counter] = one;
1447             jj_counter++;
1448          }
1449 
1450          /*--------------------------------------------------------------------
1451           *  If i is an F-point, build interpolation.
1452           *--------------------------------------------------------------------*/
1453 
1454          else
1455          {
1456             /* Diagonal part of P */
1457             P_diag_i[i] = jj_counter;
1458             jj_begin_row = jj_counter;
1459 
1460             for (jj = S_diag_i[i]; jj < S_diag_i[i+1]; jj++)
1461             {
1462                i1 = S_diag_j[jj];
1463 
1464                /*--------------------------------------------------------------
1465                 * If neighbor i1 is a C-point, set column number in P_diag_j
1466                 * and initialize interpolation weight to zero.
1467                 *--------------------------------------------------------------*/
1468 
1469                if (CF_marker[i1] >= 0)
1470                {
1471                   P_marker[i1] = jj_counter;
1472                   P_diag_j[jj_counter]    = fine_to_coarse[i1];
1473                   P_diag_data[jj_counter] = zero;
1474                   jj_counter++;
1475                }
1476 
1477             }
1478             jj_end_row = jj_counter;
1479 
1480             /* Off-Diagonal part of P */
1481             P_offd_i[i] = jj_counter_offd;
1482             jj_begin_row_offd = jj_counter_offd;
1483 
1484 
1485             if (num_procs > 1)
1486             {
1487                for (jj = S_offd_i[i]; jj < S_offd_i[i+1]; jj++)
1488                {
1489                   i1 = S_offd_j[jj];
1490 
1491                   /*-----------------------------------------------------------
1492                    * If neighbor i1 is a C-point, set column number in P_offd_j
1493                    * and initialize interpolation weight to zero.
1494                    *-----------------------------------------------------------*/
1495 
1496                   if (CF_marker_offd[i1] >= 0)
1497                   {
1498                      P_marker_offd[i1] = jj_counter_offd;
1499                      P_offd_j[jj_counter_offd]  = i1;
1500                      P_offd_data[jj_counter_offd] = zero;
1501                      jj_counter_offd++;
1502                   }
1503                }
1504             }
1505 
1506             jj_end_row_offd = jj_counter_offd;
1507 
1508             diagonal = A_diag_data[A_diag_i[i]];
1509 
1510 
1511             /* Loop over ith row of A.  First, the diagonal part of A */
1512 
1513             for (jj = A_diag_i[i]+1; jj < A_diag_i[i+1]; jj++)
1514             {
1515                i1 = A_diag_j[jj];
1516 
1517                /*--------------------------------------------------------------
1518                 * Case 1: neighbor i1 is a C-point and strongly influences i,
1519                 * accumulate a_{i,i1} into the interpolation weight.
1520                 *--------------------------------------------------------------*/
1521 
1522                if (P_marker[i1] >= jj_begin_row)
1523                {
1524                   P_diag_data[P_marker[i1]] += A_diag_data[jj];
1525                }
1526 
1527                /*--------------------------------------------------------------
1528                 * Case 2: neighbor i1 is an F-point and influences i,
1529                 * distribute a_{i,i1} to C-points that strongly influence i.
1530                 * Note: currently no distribution to the diagonal in this case.
1531                 *--------------------------------------------------------------*/
1532 
1533                else
1534                {
1535                   sum = zero;
1536 
1537                   /*-----------------------------------------------------------
1538                    * Loop over row of A for point i1 and calculate the sum
1539                    * of the connections to c-points that strongly influence i.
1540                    *-----------------------------------------------------------*/
1541                   sgn = 1;
1542                   if (A_diag_data[A_diag_i[i1]] < 0) sgn = -1;
1543                   /* Diagonal block part of row i1 */
1544                   for (jj1 = A_diag_i[i1]; jj1 < A_diag_i[i1+1]; jj1++)
1545                   {
1546                      i2 = A_diag_j[jj1];
1547                      if (P_marker[i2] >= jj_begin_row &&
1548                            (sgn*A_diag_data[jj1]) < 0)
1549                      {
1550                         sum += A_diag_data[jj1];
1551                      }
1552                   }
1553 
1554                   /* Off-Diagonal block part of row i1 */
1555                   if (num_procs > 1)
1556                   {
1557                      for (jj1 = A_offd_i[i1]; jj1 < A_offd_i[i1+1]; jj1++)
1558                      {
1559                         i2 = A_offd_j[jj1];
1560                         if (P_marker_offd[i2] >= jj_begin_row_offd
1561                               && (sgn*A_offd_data[jj1]) < 0)
1562                         {
1563                            sum += A_offd_data[jj1];
1564                         }
1565                      }
1566                   }
1567 
1568                   if (sum != 0)
1569                   {
1570                      distribute = A_diag_data[jj] / sum;
1571 
1572                      /*-----------------------------------------------------------
1573                       * Loop over row of A for point i1 and do the distribution.
1574                       *-----------------------------------------------------------*/
1575 
1576                      /* Diagonal block part of row i1 */
1577                      for (jj1 = A_diag_i[i1]; jj1 < A_diag_i[i1+1]; jj1++)
1578                      {
1579                         i2 = A_diag_j[jj1];
1580                         if (P_marker[i2] >= jj_begin_row
1581                               && (sgn*A_diag_data[jj1]) < 0)
1582                         {
1583                            P_diag_data[P_marker[i2]]
1584                               += distribute * A_diag_data[jj1];
1585                         }
1586                      }
1587 
1588                      /* Off-Diagonal block part of row i1 */
1589                      if (num_procs > 1)
1590                      {
1591                         for (jj1 = A_offd_i[i1]; jj1 < A_offd_i[i1+1]; jj1++)
1592                         {
1593                            i2 = A_offd_j[jj1];
1594                            if (P_marker_offd[i2] >= jj_begin_row_offd
1595                                  && (sgn*A_offd_data[jj1]) < 0)
1596                            {
1597                               P_offd_data[P_marker_offd[i2]]
1598                                  += distribute * A_offd_data[jj1];
1599                            }
1600                         }
1601                      }
1602                   }
1603                   else
1604                   {
1605                      if (num_functions == 1 || dof_func[i] == dof_func[i1])
1606                         diagonal += A_diag_data[jj];
1607                   }
1608                }
1609 
1610             }
1611 
1612 
1613             /*----------------------------------------------------------------
1614              * Still looping over ith row of A. Next, loop over the
1615              * off-diagonal part of A
1616              *---------------------------------------------------------------*/
1617 
1618             if (num_procs > 1)
1619             {
1620                for (jj = A_offd_i[i]; jj < A_offd_i[i+1]; jj++)
1621                {
1622                   i1 = A_offd_j[jj];
1623 
1624                   /*--------------------------------------------------------------
1625                    * Case 1: neighbor i1 is a C-point and strongly influences i,
1626                    * accumulate a_{i,i1} into the interpolation weight.
1627                    *--------------------------------------------------------------*/
1628 
1629                   if (P_marker_offd[i1] >= jj_begin_row_offd)
1630                   {
1631                      P_offd_data[P_marker_offd[i1]] += A_offd_data[jj];
1632                   }
1633 
1634                   /*------------------------------------------------------------
1635                    * Case 2: neighbor i1 is an F-point and influences i,
1636                    * distribute a_{i,i1} to C-points that strongly infuence i.
1637                    * Note: currently no distribution to the diagonal in this case.
1638                    *-----------------------------------------------------------*/
1639 
1640                   else
1641                   {
1642                      sum = zero;
1643 
1644                      /*---------------------------------------------------------
1645                       * Loop over row of A_ext for point i1 and calculate the sum
1646                       * of the connections to c-points that strongly influence i.
1647                       *---------------------------------------------------------*/
1648 
1649                      /* find row number */
1650                      c_num = A_offd_j[jj];
1651 
1652                      sgn = 1;
1653                      if (A_ext_data[A_ext_i[c_num]] < 0) sgn = -1;
1654                      for (jj1 = A_ext_i[c_num]; jj1 < A_ext_i[c_num+1]; jj1++)
1655                      {
1656                         i2 = (HYPRE_Int)A_ext_j[jj1];
1657 
1658                         if (i2 > -1)
1659                         {
1660                            /* in the diagonal block */
1661                            if (P_marker[i2] >= jj_begin_row
1662                                  && (sgn*A_ext_data[jj1]) < 0)
1663                            {
1664                               sum += A_ext_data[jj1];
1665                            }
1666                         }
1667                         else
1668                         {
1669                            /* in the off_diagonal block  */
1670                            if (P_marker_offd[-i2-1] >= jj_begin_row_offd
1671                                  && (sgn*A_ext_data[jj1]) < 0)
1672                            {
1673                               sum += A_ext_data[jj1];
1674                            }
1675 
1676                         }
1677 
1678                      }
1679 
1680                      if (sum != 0)
1681                      {
1682                         distribute = A_offd_data[jj] / sum;
1683                         /*---------------------------------------------------------
1684                          * Loop over row of A_ext for point i1 and do
1685                          * the distribution.
1686                          *--------------------------------------------------------*/
1687 
1688                         /* Diagonal block part of row i1 */
1689                         for (jj1 = A_ext_i[c_num]; jj1 < A_ext_i[c_num+1]; jj1++)
1690                         {
1691                            i2 = (HYPRE_Int)A_ext_j[jj1];
1692 
1693                            if (i2 > -1) /* in the diagonal block */
1694                            {
1695                               if (P_marker[i2] >= jj_begin_row
1696                                     && (sgn*A_ext_data[jj1]) < 0)
1697                               {
1698                                  P_diag_data[P_marker[i2]]
1699                                     += distribute * A_ext_data[jj1];
1700                               }
1701                            }
1702                            else
1703                            {
1704                               /* in the off_diagonal block  */
1705                               if (P_marker_offd[-i2-1] >= jj_begin_row_offd
1706                                     && (sgn*A_ext_data[jj1]) < 0)
1707                                  P_offd_data[P_marker_offd[-i2-1]]
1708                                     += distribute * A_ext_data[jj1];
1709                            }
1710                         }
1711                      }
1712                      else
1713                      {
1714                         if (num_functions == 1 || dof_func[i] == dof_func_offd[i1])
1715                            diagonal += A_offd_data[jj];
1716                      }
1717                   }
1718                }
1719             }
1720 
1721             /*-----------------------------------------------------------------
1722              * Set interpolation weight by dividing by the diagonal.
1723              *-----------------------------------------------------------------*/
1724 
1725             for (jj = jj_begin_row; jj < jj_end_row; jj++)
1726             {
1727                P_diag_data[jj] /= -diagonal;
1728             }
1729 
1730             for (jj = jj_begin_row_offd; jj < jj_end_row_offd; jj++)
1731             {
1732                P_offd_data[jj] /= -diagonal;
1733             }
1734          }
1735 
1736          P_offd_i[i+1] = jj_counter_offd;
1737       }
1738       hypre_TFree(P_marker, HYPRE_MEMORY_HOST);
1739       hypre_TFree(P_marker_offd, HYPRE_MEMORY_HOST);
1740    }
1741 
1742    P = hypre_ParCSRMatrixCreate(comm,
1743          hypre_ParCSRMatrixGlobalNumRows(A),
1744          total_global_cpts,
1745          hypre_ParCSRMatrixColStarts(A),
1746          num_cpts_global,
1747          0,
1748          P_diag_i[n_fine],
1749          P_offd_i[n_fine]);
1750 
1751 
1752    P_diag = hypre_ParCSRMatrixDiag(P);
1753    hypre_CSRMatrixData(P_diag) = P_diag_data;
1754    hypre_CSRMatrixI(P_diag) = P_diag_i;
1755    hypre_CSRMatrixJ(P_diag) = P_diag_j;
1756    P_offd = hypre_ParCSRMatrixOffd(P);
1757    hypre_CSRMatrixData(P_offd) = P_offd_data;
1758    hypre_CSRMatrixI(P_offd) = P_offd_i;
1759    hypre_CSRMatrixJ(P_offd) = P_offd_j;
1760 
1761    /* Compress P, removing coefficients smaller than trunc_factor * Max */
1762 
1763    if (trunc_factor != 0.0 || max_elmts > 0)
1764    {
1765       hypre_BoomerAMGInterpTruncation(P, trunc_factor, max_elmts);
1766       P_diag_data = hypre_CSRMatrixData(P_diag);
1767       P_diag_i = hypre_CSRMatrixI(P_diag);
1768       P_diag_j = hypre_CSRMatrixJ(P_diag);
1769       P_offd_data = hypre_CSRMatrixData(P_offd);
1770       P_offd_i = hypre_CSRMatrixI(P_offd);
1771       P_offd_j = hypre_CSRMatrixJ(P_offd);
1772       P_diag_size = P_diag_i[n_fine];
1773       P_offd_size = P_offd_i[n_fine];
1774    }
1775 
1776    num_cols_P_offd = 0;
1777    if (P_offd_size)
1778    {
1779       P_marker = hypre_CTAlloc(HYPRE_Int, num_cols_A_offd, HYPRE_MEMORY_HOST);
1780 
1781 #ifdef HYPRE_USING_OPENMP
1782 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
1783 #endif
1784       for (i=0; i < num_cols_A_offd; i++)
1785          P_marker[i] = 0;
1786 
1787       num_cols_P_offd = 0;
1788       for (i=0; i < P_offd_size; i++)
1789       {
1790          index = P_offd_j[i];
1791          if (!P_marker[index])
1792          {
1793             num_cols_P_offd++;
1794             P_marker[index] = 1;
1795          }
1796       }
1797 
1798       col_map_offd_P = hypre_CTAlloc(HYPRE_BigInt, num_cols_P_offd, HYPRE_MEMORY_HOST);
1799       tmp_map_offd = hypre_CTAlloc(HYPRE_Int, num_cols_P_offd, HYPRE_MEMORY_HOST);
1800 
1801       index = 0;
1802       for (i=0; i < num_cols_P_offd; i++)
1803       {
1804          while (P_marker[index]==0) index++;
1805          tmp_map_offd[i] = index++;
1806       }
1807 
1808 #ifdef HYPRE_USING_OPENMP
1809 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
1810 #endif
1811       for (i=0; i < P_offd_size; i++)
1812          P_offd_j[i] = hypre_BinarySearch(tmp_map_offd,
1813                P_offd_j[i],
1814                num_cols_P_offd);
1815       hypre_TFree(P_marker, HYPRE_MEMORY_HOST);
1816    }
1817 
1818    for (i=0; i < n_fine; i++)
1819       if (CF_marker[i] == -3) CF_marker[i] = -1;
1820 
1821    if (num_cols_P_offd)
1822    {
1823       hypre_ParCSRMatrixColMapOffd(P) = col_map_offd_P;
1824       hypre_CSRMatrixNumCols(P_offd) = num_cols_P_offd;
1825    }
1826 
1827    hypre_GetCommPkgRTFromCommPkgA(P,A,fine_to_coarse, tmp_map_offd);
1828 
1829    *P_ptr = P;
1830 
1831    hypre_TFree(CF_marker_offd, HYPRE_MEMORY_HOST);
1832    hypre_TFree(dof_func_offd, HYPRE_MEMORY_HOST);
1833    hypre_TFree(int_buf_data, HYPRE_MEMORY_HOST);
1834    hypre_TFree(fine_to_coarse, HYPRE_MEMORY_HOST);
1835    hypre_TFree(tmp_map_offd, HYPRE_MEMORY_HOST);
1836    hypre_TFree(coarse_counter, HYPRE_MEMORY_HOST);
1837    hypre_TFree(jj_count, HYPRE_MEMORY_HOST);
1838    hypre_TFree(jj_count_offd, HYPRE_MEMORY_HOST);
1839 
1840    if (num_procs > 1)
1841    {
1842       hypre_CSRMatrixDestroy(A_ext);
1843    }
1844 
1845    return hypre_error_flag;
1846 }
1847 
1848 
1849 /*---------------------------------------------------------------------------
1850  * hypre_BoomerAMGBuildDirInterp
1851  *--------------------------------------------------------------------------*/
1852 
1853 HYPRE_Int
hypre_BoomerAMGBuildDirInterpHost(hypre_ParCSRMatrix * A,HYPRE_Int * CF_marker,hypre_ParCSRMatrix * S,HYPRE_BigInt * num_cpts_global,HYPRE_Int num_functions,HYPRE_Int * dof_func,HYPRE_Int debug_flag,HYPRE_Real trunc_factor,HYPRE_Int max_elmts,hypre_ParCSRMatrix ** P_ptr)1854 hypre_BoomerAMGBuildDirInterpHost( hypre_ParCSRMatrix   *A,
1855                                    HYPRE_Int            *CF_marker,
1856                                    hypre_ParCSRMatrix   *S,
1857                                    HYPRE_BigInt         *num_cpts_global,
1858                                    HYPRE_Int             num_functions,
1859                                    HYPRE_Int            *dof_func,
1860                                    HYPRE_Int             debug_flag,
1861                                    HYPRE_Real            trunc_factor,
1862                                    HYPRE_Int             max_elmts,
1863                                    hypre_ParCSRMatrix  **P_ptr)
1864 {
1865 
1866    MPI_Comm      comm = hypre_ParCSRMatrixComm(A);
1867    hypre_ParCSRCommPkg     *comm_pkg = hypre_ParCSRMatrixCommPkg(A);
1868    hypre_ParCSRCommHandle  *comm_handle;
1869 
1870    hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
1871    HYPRE_Real      *A_diag_data = hypre_CSRMatrixData(A_diag);
1872    HYPRE_Int       *A_diag_i = hypre_CSRMatrixI(A_diag);
1873    HYPRE_Int       *A_diag_j = hypre_CSRMatrixJ(A_diag);
1874 
1875    hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A);
1876    HYPRE_Real      *A_offd_data = hypre_CSRMatrixData(A_offd);
1877    HYPRE_Int       *A_offd_i = hypre_CSRMatrixI(A_offd);
1878    HYPRE_Int       *A_offd_j = hypre_CSRMatrixJ(A_offd);
1879    HYPRE_Int        num_cols_A_offd = hypre_CSRMatrixNumCols(A_offd);
1880 
1881    hypre_CSRMatrix *S_diag = hypre_ParCSRMatrixDiag(S);
1882    HYPRE_Int       *S_diag_i = hypre_CSRMatrixI(S_diag);
1883    HYPRE_Int       *S_diag_j = hypre_CSRMatrixJ(S_diag);
1884 
1885    hypre_CSRMatrix *S_offd = hypre_ParCSRMatrixOffd(S);
1886    HYPRE_Int       *S_offd_i = hypre_CSRMatrixI(S_offd);
1887    HYPRE_Int       *S_offd_j = hypre_CSRMatrixJ(S_offd);
1888 
1889    hypre_ParCSRMatrix *P;
1890    HYPRE_BigInt      *col_map_offd_P;
1891    HYPRE_Int         *tmp_map_offd = NULL;
1892 
1893    HYPRE_Int          *CF_marker_offd = NULL;
1894    HYPRE_Int          *dof_func_offd = NULL;
1895 
1896    hypre_CSRMatrix    *P_diag;
1897    hypre_CSRMatrix    *P_offd;
1898 
1899    HYPRE_Real      *P_diag_data;
1900    HYPRE_Int       *P_diag_i;
1901    HYPRE_Int       *P_diag_j;
1902    HYPRE_Real      *P_offd_data;
1903    HYPRE_Int       *P_offd_i;
1904    HYPRE_Int       *P_offd_j;
1905 
1906    HYPRE_Int        P_diag_size, P_offd_size;
1907 
1908    HYPRE_Int        jj_counter,jj_counter_offd;
1909    HYPRE_Int       *jj_count, *jj_count_offd;
1910    HYPRE_Int        jj_begin_row,jj_begin_row_offd;
1911    HYPRE_Int        jj_end_row,jj_end_row_offd;
1912 
1913    HYPRE_Int        start_indexing = 0; /* start indexing for P_data at 0 */
1914 
1915    HYPRE_Int        n_fine = hypre_CSRMatrixNumRows(A_diag);
1916 
1917    HYPRE_Int       *fine_to_coarse;
1918    HYPRE_Int       *coarse_counter;
1919    HYPRE_Int        coarse_shift;
1920    HYPRE_BigInt     total_global_cpts;
1921    HYPRE_Int        num_cols_P_offd;
1922    //HYPRE_BigInt     my_first_cpt;
1923 
1924    HYPRE_Int        i,i1;
1925    HYPRE_Int        j,jl,jj;
1926    HYPRE_Int        start;
1927 
1928    HYPRE_Real       diagonal;
1929    HYPRE_Real       sum_N_pos, sum_P_pos;
1930    HYPRE_Real       sum_N_neg, sum_P_neg;
1931    HYPRE_Real       alfa = 1.0;
1932    HYPRE_Real       beta = 1.0;
1933 
1934    HYPRE_Real       zero = 0.0;
1935    HYPRE_Real       one  = 1.0;
1936 
1937    HYPRE_Int        my_id;
1938    HYPRE_Int        num_procs;
1939    HYPRE_Int        num_threads;
1940    HYPRE_Int        num_sends;
1941    HYPRE_Int        index;
1942    HYPRE_Int        ns, ne, size, rest;
1943    HYPRE_Int       *int_buf_data;
1944 
1945    HYPRE_Real       wall_time;  /* for debugging instrumentation  */
1946 
1947    hypre_MPI_Comm_size(comm, &num_procs);
1948    hypre_MPI_Comm_rank(comm,&my_id);
1949    num_threads = hypre_NumThreads();
1950 
1951    //my_first_cpt = num_cpts_global[0];
1952    if (my_id == (num_procs -1)) total_global_cpts = num_cpts_global[1];
1953    hypre_MPI_Bcast(&total_global_cpts, 1, HYPRE_MPI_BIG_INT, num_procs-1, comm);
1954 
1955    /*-------------------------------------------------------------------
1956     * Get the CF_marker data for the off-processor columns
1957     *-------------------------------------------------------------------*/
1958 
1959    if (debug_flag==4) wall_time = time_getWallclockSeconds();
1960 
1961    if (num_cols_A_offd) CF_marker_offd = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
1962    if (num_functions > 1 && num_cols_A_offd)
1963       dof_func_offd = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
1964 
1965    if (!comm_pkg)
1966    {
1967       hypre_MatvecCommPkgCreate(A);
1968       comm_pkg = hypre_ParCSRMatrixCommPkg(A);
1969    }
1970 
1971    num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
1972    int_buf_data = hypre_CTAlloc(HYPRE_Int,  hypre_ParCSRCommPkgSendMapStart(comm_pkg,
1973             num_sends), HYPRE_MEMORY_HOST);
1974 
1975    index = 0;
1976    for (i = 0; i < num_sends; i++)
1977    {
1978       start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
1979       for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
1980          int_buf_data[index++]
1981             = CF_marker[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
1982    }
1983 
1984    comm_handle = hypre_ParCSRCommHandleCreate( 11, comm_pkg, int_buf_data,
1985          CF_marker_offd);
1986 
1987    hypre_ParCSRCommHandleDestroy(comm_handle);
1988    if (num_functions > 1)
1989    {
1990       index = 0;
1991       for (i = 0; i < num_sends; i++)
1992       {
1993          start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
1994          for (j=start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
1995             int_buf_data[index++]
1996                = dof_func[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
1997       }
1998 
1999       comm_handle = hypre_ParCSRCommHandleCreate( 11, comm_pkg, int_buf_data,
2000             dof_func_offd);
2001 
2002       hypre_ParCSRCommHandleDestroy(comm_handle);
2003    }
2004 
2005    if (debug_flag==4)
2006    {
2007       wall_time = time_getWallclockSeconds() - wall_time;
2008       hypre_printf("Proc = %d     Interp: Comm 1 CF_marker =    %f\n",
2009             my_id, wall_time);
2010       fflush(NULL);
2011    }
2012 
2013    /*-----------------------------------------------------------------------
2014     *  First Pass: Determine size of P and fill in fine_to_coarse mapping.
2015     *-----------------------------------------------------------------------*/
2016 
2017    /*-----------------------------------------------------------------------
2018     *  Intialize counters and allocate mapping vector.
2019     *-----------------------------------------------------------------------*/
2020 
2021    coarse_counter = hypre_CTAlloc(HYPRE_Int,  num_threads, HYPRE_MEMORY_HOST);
2022    jj_count = hypre_CTAlloc(HYPRE_Int,  num_threads, HYPRE_MEMORY_HOST);
2023    jj_count_offd = hypre_CTAlloc(HYPRE_Int,  num_threads, HYPRE_MEMORY_HOST);
2024 
2025    fine_to_coarse = hypre_CTAlloc(HYPRE_Int,  n_fine, HYPRE_MEMORY_HOST);
2026 #ifdef HYPRE_USING_OPENMP
2027 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
2028 #endif
2029    for (i = 0; i < n_fine; i++) fine_to_coarse[i] = -1;
2030 
2031    jj_counter = start_indexing;
2032    jj_counter_offd = start_indexing;
2033 
2034    /*-----------------------------------------------------------------------
2035     *  Loop over fine grid.
2036     *-----------------------------------------------------------------------*/
2037 
2038    /* RDF: this looks a little tricky, but doable */
2039 #ifdef HYPRE_USING_OPENMP
2040 #pragma omp parallel for private(i,j,i1,jj,ns,ne,size,rest) HYPRE_SMP_SCHEDULE
2041 #endif
2042    for (j = 0; j < num_threads; j++)
2043    {
2044       size = n_fine/num_threads;
2045       rest = n_fine - size*num_threads;
2046       if (j < rest)
2047       {
2048          ns = j*size+j;
2049          ne = (j+1)*size+j+1;
2050       }
2051       else
2052       {
2053          ns = j*size+rest;
2054          ne = (j+1)*size+rest;
2055       }
2056       for (i = ns; i < ne; i++)
2057       {
2058 
2059          /*--------------------------------------------------------------------
2060           *  If i is a C-point, interpolation is the identity. Also set up
2061           *  mapping vector.
2062           *--------------------------------------------------------------------*/
2063 
2064          if (CF_marker[i] >= 0)
2065          {
2066             jj_count[j]++;
2067             fine_to_coarse[i] = coarse_counter[j];
2068             coarse_counter[j]++;
2069          }
2070 
2071          /*--------------------------------------------------------------------
2072           *  If i is an F-point, interpolation is from the C-points that
2073           *  strongly influence i.
2074           *--------------------------------------------------------------------*/
2075 
2076          else
2077          {
2078             for (jj = S_diag_i[i]; jj < S_diag_i[i+1]; jj++)
2079             {
2080                i1 = S_diag_j[jj];
2081                if (CF_marker[i1] > 0)
2082                {
2083                   jj_count[j]++;
2084                }
2085             }
2086 
2087             if (num_procs > 1)
2088             {
2089                for (jj = S_offd_i[i]; jj < S_offd_i[i+1]; jj++)
2090                {
2091                   i1 = S_offd_j[jj];
2092                   if (CF_marker_offd[i1] > 0)
2093                   {
2094                      jj_count_offd[j]++;
2095                   }
2096                }
2097             }
2098          }
2099       }
2100    }
2101 
2102    /*-----------------------------------------------------------------------
2103     *  Allocate  arrays.
2104     *-----------------------------------------------------------------------*/
2105 
2106    for (i=0; i < num_threads-1; i++)
2107    {
2108       coarse_counter[i+1] += coarse_counter[i];
2109       jj_count[i+1] += jj_count[i];
2110       jj_count_offd[i+1] += jj_count_offd[i];
2111    }
2112    i = num_threads-1;
2113    jj_counter = jj_count[i];
2114    jj_counter_offd = jj_count_offd[i];
2115 
2116    P_diag_size = jj_counter;
2117 
2118    P_diag_i    = hypre_CTAlloc(HYPRE_Int,  n_fine+1,    HYPRE_MEMORY_DEVICE);
2119    P_diag_j    = hypre_CTAlloc(HYPRE_Int,  P_diag_size, HYPRE_MEMORY_DEVICE);
2120    P_diag_data = hypre_CTAlloc(HYPRE_Real, P_diag_size, HYPRE_MEMORY_DEVICE);
2121 
2122    P_diag_i[n_fine] = jj_counter;
2123 
2124 
2125    P_offd_size = jj_counter_offd;
2126 
2127    P_offd_i    = hypre_CTAlloc(HYPRE_Int,  n_fine+1,    HYPRE_MEMORY_DEVICE);
2128    P_offd_j    = hypre_CTAlloc(HYPRE_Int,  P_offd_size, HYPRE_MEMORY_DEVICE);
2129    P_offd_data = hypre_CTAlloc(HYPRE_Real, P_offd_size, HYPRE_MEMORY_DEVICE);
2130 
2131    /*-----------------------------------------------------------------------
2132     *  Intialize some stuff.
2133     *-----------------------------------------------------------------------*/
2134 
2135    jj_counter = start_indexing;
2136    jj_counter_offd = start_indexing;
2137 
2138    if (debug_flag==4)
2139    {
2140       wall_time = time_getWallclockSeconds() - wall_time;
2141       hypre_printf("Proc = %d     Interp: Internal work 1 =     %f\n",
2142             my_id, wall_time);
2143       fflush(NULL);
2144    }
2145 
2146    /*-----------------------------------------------------------------------
2147     *  Send and receive fine_to_coarse info.
2148     *-----------------------------------------------------------------------*/
2149 
2150    if (debug_flag==4) wall_time = time_getWallclockSeconds();
2151 
2152    //fine_to_coarse_offd = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
2153 
2154 #ifdef HYPRE_USING_OPENMP
2155 #pragma omp parallel for private(i,j,ns,ne,size,rest,coarse_shift) HYPRE_SMP_SCHEDULE
2156 #endif
2157    for (j = 0; j < num_threads; j++)
2158    {
2159       coarse_shift = 0;
2160       if (j > 0) coarse_shift = coarse_counter[j-1];
2161       size = n_fine/num_threads;
2162       rest = n_fine - size*num_threads;
2163       if (j < rest)
2164       {
2165          ns = j*size+j;
2166          ne = (j+1)*size+j+1;
2167       }
2168       else
2169       {
2170          ns = j*size+rest;
2171          ne = (j+1)*size+rest;
2172       }
2173       for (i = ns; i < ne; i++)
2174       {
2175          fine_to_coarse[i] += coarse_shift;
2176       }
2177    }
2178    /*index = 0;
2179      for (i = 0; i < num_sends; i++)
2180      {
2181      start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
2182      for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
2183      int_buf_data[index++]
2184      = fine_to_coarse[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
2185      }
2186 
2187      comm_handle = hypre_ParCSRCommHandleCreate( 11, comm_pkg, int_buf_data,
2188      fine_to_coarse_offd);
2189 
2190      hypre_ParCSRCommHandleDestroy(comm_handle);
2191 
2192      if (debug_flag==4)
2193      {
2194      wall_time = time_getWallclockSeconds() - wall_time;
2195      hypre_printf("Proc = %d     Interp: Comm 4 FineToCoarse = %f\n",
2196      my_id, wall_time);
2197      fflush(NULL);
2198      }*/
2199 
2200    if (debug_flag==4) wall_time = time_getWallclockSeconds();
2201 
2202    /*#ifdef HYPRE_USING_OPENMP
2203 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
2204 #endif
2205 for (i = 0; i < n_fine; i++) fine_to_coarse[i] -= my_first_cpt;*/
2206 
2207    /*-----------------------------------------------------------------------
2208     *  Loop over fine grid points.
2209     *-----------------------------------------------------------------------*/
2210 
2211 #ifdef HYPRE_USING_OPENMP
2212 #pragma omp parallel for private(i,j,jl,i1,jj,ns,ne,size,rest,diagonal,jj_counter,jj_counter_offd,jj_begin_row,jj_end_row,jj_begin_row_offd,jj_end_row_offd,sum_P_pos,sum_P_neg,sum_N_pos,sum_N_neg,alfa,beta) HYPRE_SMP_SCHEDULE
2213 #endif
2214    for (jl = 0; jl < num_threads; jl++)
2215    {
2216       HYPRE_Int       *P_marker, *P_marker_offd;
2217 
2218       size = n_fine/num_threads;
2219       rest = n_fine - size*num_threads;
2220       if (jl < rest)
2221       {
2222          ns = jl*size+jl;
2223          ne = (jl+1)*size+jl+1;
2224       }
2225       else
2226       {
2227          ns = jl*size+rest;
2228          ne = (jl+1)*size+rest;
2229       }
2230       jj_counter = 0;
2231       if (jl > 0) jj_counter = jj_count[jl-1];
2232       jj_counter_offd = 0;
2233       if (jl > 0) jj_counter_offd = jj_count_offd[jl-1];
2234 
2235       P_marker = hypre_CTAlloc(HYPRE_Int,  n_fine, HYPRE_MEMORY_HOST);
2236       if (num_cols_A_offd)
2237          P_marker_offd = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
2238       else
2239          P_marker_offd = NULL;
2240 
2241       for (i = 0; i < n_fine; i++)
2242       {
2243          P_marker[i] = -1;
2244       }
2245       for (i = 0; i < num_cols_A_offd; i++)
2246       {
2247          P_marker_offd[i] = -1;
2248       }
2249 
2250       for (i = ns; i < ne; i++)
2251       {
2252 
2253          /*--------------------------------------------------------------------
2254           *  If i is a c-point, interpolation is the identity.
2255           *--------------------------------------------------------------------*/
2256 
2257          if (CF_marker[i] >= 0)
2258          {
2259             P_diag_i[i] = jj_counter;
2260             P_diag_j[jj_counter]    = fine_to_coarse[i];
2261             P_diag_data[jj_counter] = one;
2262             jj_counter++;
2263          }
2264 
2265          /*--------------------------------------------------------------------
2266           *  If i is an F-point, build interpolation.
2267           *--------------------------------------------------------------------*/
2268 
2269          else
2270          {
2271             /* Diagonal part of P */
2272             P_diag_i[i] = jj_counter;
2273             jj_begin_row = jj_counter;
2274 
2275             for (jj = S_diag_i[i]; jj < S_diag_i[i+1]; jj++)
2276             {
2277                i1 = S_diag_j[jj];
2278 
2279                /*--------------------------------------------------------------
2280                 * If neighbor i1 is a C-point, set column number in P_diag_j
2281                 * and initialize interpolation weight to zero.
2282                 *--------------------------------------------------------------*/
2283 
2284                if (CF_marker[i1] >= 0)
2285                {
2286                   P_marker[i1] = jj_counter;
2287                   P_diag_j[jj_counter]    = fine_to_coarse[i1];
2288                   P_diag_data[jj_counter] = zero;
2289                   jj_counter++;
2290                }
2291 
2292             }
2293             jj_end_row = jj_counter;
2294 
2295             /* Off-Diagonal part of P */
2296             P_offd_i[i] = jj_counter_offd;
2297             jj_begin_row_offd = jj_counter_offd;
2298 
2299 
2300             if (num_procs > 1)
2301             {
2302                for (jj = S_offd_i[i]; jj < S_offd_i[i+1]; jj++)
2303                {
2304                   i1 = S_offd_j[jj];
2305 
2306                   /*-----------------------------------------------------------
2307                    * If neighbor i1 is a C-point, set column number in P_offd_j
2308                    * and initialize interpolation weight to zero.
2309                    *-----------------------------------------------------------*/
2310 
2311                   if (CF_marker_offd[i1] >= 0)
2312                   {
2313                      P_marker_offd[i1] = jj_counter_offd;
2314                      P_offd_j[jj_counter_offd]  = i1;
2315                      P_offd_data[jj_counter_offd] = zero;
2316                      jj_counter_offd++;
2317                   }
2318                }
2319             }
2320 
2321             jj_end_row_offd = jj_counter_offd;
2322 
2323             diagonal = A_diag_data[A_diag_i[i]];
2324 
2325 
2326             /* Loop over ith row of A.  First, the diagonal part of A */
2327             sum_N_pos = 0;
2328             sum_N_neg = 0;
2329             sum_P_pos = 0;
2330             sum_P_neg = 0;
2331 
2332             for (jj = A_diag_i[i]+1; jj < A_diag_i[i+1]; jj++)
2333             {
2334                i1 = A_diag_j[jj];
2335                if (num_functions == 1 || dof_func[i1] == dof_func[i])
2336                {
2337                   if (A_diag_data[jj] > 0)
2338                      sum_N_pos += A_diag_data[jj];
2339                   else
2340                      sum_N_neg += A_diag_data[jj];
2341                }
2342                /*--------------------------------------------------------------
2343                 * Case 1: neighbor i1 is a C-point and strongly influences i,
2344                 * accumulate a_{i,i1} into the interpolation weight.
2345                 *--------------------------------------------------------------*/
2346 
2347                if (P_marker[i1] >= jj_begin_row)
2348                {
2349                   P_diag_data[P_marker[i1]] += A_diag_data[jj];
2350                   if (A_diag_data[jj] > 0)
2351                      sum_P_pos += A_diag_data[jj];
2352                   else
2353                      sum_P_neg += A_diag_data[jj];
2354                }
2355             }
2356 
2357             /*----------------------------------------------------------------
2358              * Still looping over ith row of A. Next, loop over the
2359              * off-diagonal part of A
2360              *---------------------------------------------------------------*/
2361 
2362             if (num_procs > 1)
2363             {
2364                for (jj = A_offd_i[i]; jj < A_offd_i[i+1]; jj++)
2365                {
2366                   i1 = A_offd_j[jj];
2367                   if (num_functions == 1 || dof_func_offd[i1] == dof_func[i])
2368                   {
2369                      if (A_offd_data[jj] > 0)
2370                         sum_N_pos += A_offd_data[jj];
2371                      else
2372                         sum_N_neg += A_offd_data[jj];
2373                   }
2374 
2375                   /*--------------------------------------------------------------
2376                    * Case 1: neighbor i1 is a C-point and strongly influences i,
2377                    * accumulate a_{i,i1} into the interpolation weight.
2378                    *--------------------------------------------------------------*/
2379 
2380                   if (P_marker_offd[i1] >= jj_begin_row_offd)
2381                   {
2382                      P_offd_data[P_marker_offd[i1]] += A_offd_data[jj];
2383                      if (A_offd_data[jj] > 0)
2384                         sum_P_pos += A_offd_data[jj];
2385                      else
2386                         sum_P_neg += A_offd_data[jj];
2387                   }
2388 
2389                }
2390             }
2391             if (sum_P_neg) alfa = sum_N_neg/sum_P_neg/diagonal;
2392             if (sum_P_pos) beta = sum_N_pos/sum_P_pos/diagonal;
2393 
2394             /*-----------------------------------------------------------------
2395              * Set interpolation weight by dividing by the diagonal.
2396              *-----------------------------------------------------------------*/
2397 
2398             for (jj = jj_begin_row; jj < jj_end_row; jj++)
2399             {
2400                if (P_diag_data[jj]> 0)
2401                   P_diag_data[jj] *= -beta;
2402                else
2403                   P_diag_data[jj] *= -alfa;
2404             }
2405 
2406             for (jj = jj_begin_row_offd; jj < jj_end_row_offd; jj++)
2407             {
2408                if (P_offd_data[jj]> 0)
2409                   P_offd_data[jj] *= -beta;
2410                else
2411                   P_offd_data[jj] *= -alfa;
2412             }
2413 
2414          }
2415 
2416          P_offd_i[i+1] = jj_counter_offd;
2417       }
2418       hypre_TFree(P_marker, HYPRE_MEMORY_HOST);
2419       hypre_TFree(P_marker_offd, HYPRE_MEMORY_HOST);
2420    }
2421 
2422    P = hypre_ParCSRMatrixCreate(comm,
2423          hypre_ParCSRMatrixGlobalNumRows(A),
2424          total_global_cpts,
2425          hypre_ParCSRMatrixColStarts(A),
2426          num_cpts_global,
2427          0,
2428          P_diag_i[n_fine],
2429          P_offd_i[n_fine]);
2430 
2431 
2432    P_diag = hypre_ParCSRMatrixDiag(P);
2433    hypre_CSRMatrixData(P_diag) = P_diag_data;
2434    hypre_CSRMatrixI(P_diag) = P_diag_i;
2435    hypre_CSRMatrixJ(P_diag) = P_diag_j;
2436    P_offd = hypre_ParCSRMatrixOffd(P);
2437    hypre_CSRMatrixData(P_offd) = P_offd_data;
2438    hypre_CSRMatrixI(P_offd) = P_offd_i;
2439    hypre_CSRMatrixJ(P_offd) = P_offd_j;
2440 
2441    /* Compress P, removing coefficients smaller than trunc_factor * Max */
2442 
2443    if (trunc_factor != 0.0 || max_elmts > 0)
2444    {
2445       hypre_BoomerAMGInterpTruncation(P, trunc_factor, max_elmts);
2446       P_diag_data = hypre_CSRMatrixData(P_diag);
2447       P_diag_i = hypre_CSRMatrixI(P_diag);
2448       P_diag_j = hypre_CSRMatrixJ(P_diag);
2449       P_offd_data = hypre_CSRMatrixData(P_offd);
2450       P_offd_i = hypre_CSRMatrixI(P_offd);
2451       P_offd_j = hypre_CSRMatrixJ(P_offd);
2452       P_diag_size = P_diag_i[n_fine];
2453       P_offd_size = P_offd_i[n_fine];
2454    }
2455 
2456    num_cols_P_offd = 0;
2457    if (P_offd_size)
2458    {
2459       HYPRE_Int *P_marker = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
2460 
2461 #ifdef HYPRE_USING_OPENMP
2462 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
2463 #endif
2464       for (i=0; i < num_cols_A_offd; i++)
2465          P_marker[i] = 0;
2466 
2467       num_cols_P_offd = 0;
2468       for (i=0; i < P_offd_size; i++)
2469       {
2470          index = P_offd_j[i];
2471          if (!P_marker[index])
2472          {
2473             num_cols_P_offd++;
2474             P_marker[index] = 1;
2475          }
2476       }
2477 
2478       col_map_offd_P = hypre_CTAlloc(HYPRE_BigInt, num_cols_P_offd, HYPRE_MEMORY_HOST);
2479       tmp_map_offd = hypre_CTAlloc(HYPRE_Int, num_cols_P_offd, HYPRE_MEMORY_HOST);
2480 
2481       index = 0;
2482       for (i=0; i < num_cols_P_offd; i++)
2483       {
2484          while (P_marker[index]==0) index++;
2485          tmp_map_offd[i] = index++;
2486       }
2487 
2488 #ifdef HYPRE_USING_OPENMP
2489 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
2490 #endif
2491       for (i=0; i < P_offd_size; i++)
2492       {
2493          P_offd_j[i] = hypre_BinarySearch(tmp_map_offd,
2494                P_offd_j[i],
2495                num_cols_P_offd);
2496       }
2497       hypre_TFree(P_marker, HYPRE_MEMORY_HOST);
2498    }
2499 
2500    for (i=0; i < n_fine; i++)
2501       if (CF_marker[i] == -3) CF_marker[i] = -1;
2502 
2503    if (num_cols_P_offd)
2504    {
2505       hypre_ParCSRMatrixColMapOffd(P) = col_map_offd_P;
2506       hypre_CSRMatrixNumCols(P_offd) = num_cols_P_offd;
2507    }
2508 
2509    hypre_GetCommPkgRTFromCommPkgA(P, A, fine_to_coarse, tmp_map_offd);
2510 
2511    *P_ptr = P;
2512 
2513    hypre_TFree(CF_marker_offd, HYPRE_MEMORY_HOST);
2514    hypre_TFree(dof_func_offd, HYPRE_MEMORY_HOST);
2515    hypre_TFree(int_buf_data, HYPRE_MEMORY_HOST);
2516    hypre_TFree(fine_to_coarse, HYPRE_MEMORY_HOST);
2517    hypre_TFree(tmp_map_offd, HYPRE_MEMORY_HOST);
2518    hypre_TFree(coarse_counter, HYPRE_MEMORY_HOST);
2519    hypre_TFree(jj_count, HYPRE_MEMORY_HOST);
2520    hypre_TFree(jj_count_offd, HYPRE_MEMORY_HOST);
2521 
2522    return hypre_error_flag;
2523 }
2524 
2525 HYPRE_Int
hypre_BoomerAMGBuildDirInterp(hypre_ParCSRMatrix * A,HYPRE_Int * CF_marker,hypre_ParCSRMatrix * S,HYPRE_BigInt * num_cpts_global,HYPRE_Int num_functions,HYPRE_Int * dof_func,HYPRE_Int debug_flag,HYPRE_Real trunc_factor,HYPRE_Int max_elmts,HYPRE_Int interp_type,hypre_ParCSRMatrix ** P_ptr)2526 hypre_BoomerAMGBuildDirInterp( hypre_ParCSRMatrix   *A,
2527                                HYPRE_Int            *CF_marker,
2528                                hypre_ParCSRMatrix   *S,
2529                                HYPRE_BigInt         *num_cpts_global,
2530                                HYPRE_Int             num_functions,
2531                                HYPRE_Int            *dof_func,
2532                                HYPRE_Int             debug_flag,
2533                                HYPRE_Real            trunc_factor,
2534                                HYPRE_Int             max_elmts,
2535                                HYPRE_Int             interp_type,
2536                                hypre_ParCSRMatrix  **P_ptr)
2537 {
2538 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
2539    hypre_GpuProfilingPushRange("DirInterp");
2540 #endif
2541 
2542    HYPRE_Int ierr = 0;
2543 
2544 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
2545    HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) );
2546 
2547    if (exec == HYPRE_EXEC_DEVICE)
2548    {
2549       ierr = hypre_BoomerAMGBuildDirInterpDevice(A,CF_marker,S,num_cpts_global,num_functions,dof_func,
2550                                                  debug_flag,trunc_factor,max_elmts,
2551                                                  interp_type, P_ptr);
2552    }
2553    else
2554 #endif
2555    {
2556       ierr = hypre_BoomerAMGBuildDirInterpHost(A,CF_marker,S,num_cpts_global,num_functions,dof_func,
2557                                                debug_flag,trunc_factor,max_elmts, P_ptr);
2558    }
2559 
2560 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
2561    hypre_GpuProfilingPopRange();
2562 #endif
2563 
2564    return ierr;
2565 }
2566 
2567 /*------------------------------------------------
2568  * Drop entries in interpolation matrix P
2569  * max_elmts == 0 means no limit on rownnz
2570  *------------------------------------------------*/
2571 HYPRE_Int
hypre_BoomerAMGInterpTruncation(hypre_ParCSRMatrix * P,HYPRE_Real trunc_factor,HYPRE_Int max_elmts)2572 hypre_BoomerAMGInterpTruncation( hypre_ParCSRMatrix *P,
2573                                  HYPRE_Real          trunc_factor,
2574                                  HYPRE_Int           max_elmts)
2575 {
2576    if (trunc_factor <= 0.0 && max_elmts == 0)
2577    {
2578       return 0;
2579    }
2580 
2581 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
2582    HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(P) );
2583 
2584    if (exec == HYPRE_EXEC_DEVICE)
2585    {
2586       return hypre_BoomerAMGInterpTruncationDevice(P, trunc_factor, max_elmts);
2587    }
2588    else
2589 #endif
2590    {
2591       HYPRE_Int rescale = 1; // rescale P
2592       HYPRE_Int nrm_type = 0; // Use infty-norm of row to perform treshold dropping
2593       return hypre_ParCSRMatrixTruncate(P, trunc_factor, max_elmts, rescale, nrm_type);
2594    }
2595 }
2596 
2597 /*---------------------------------------------------------------------------
2598  * hypre_BoomerAMGBuildInterpModUnk - this is a modified interpolation for the unknown approach.
2599  * here we need to pass in a strength matrix built on the entire matrix.
2600  *
2601  *--------------------------------------------------------------------------*/
2602 
2603 HYPRE_Int
hypre_BoomerAMGBuildInterpModUnk(hypre_ParCSRMatrix * A,HYPRE_Int * CF_marker,hypre_ParCSRMatrix * S,HYPRE_BigInt * num_cpts_global,HYPRE_Int num_functions,HYPRE_Int * dof_func,HYPRE_Int debug_flag,HYPRE_Real trunc_factor,HYPRE_Int max_elmts,hypre_ParCSRMatrix ** P_ptr)2604 hypre_BoomerAMGBuildInterpModUnk( hypre_ParCSRMatrix   *A,
2605                                   HYPRE_Int            *CF_marker,
2606                                   hypre_ParCSRMatrix   *S,
2607                                   HYPRE_BigInt         *num_cpts_global,
2608                                   HYPRE_Int             num_functions,
2609                                   HYPRE_Int            *dof_func,
2610                                   HYPRE_Int             debug_flag,
2611                                   HYPRE_Real            trunc_factor,
2612                                   HYPRE_Int             max_elmts,
2613                                   hypre_ParCSRMatrix  **P_ptr)
2614 {
2615 
2616    MPI_Comm       comm = hypre_ParCSRMatrixComm(A);
2617    hypre_ParCSRCommPkg     *comm_pkg = hypre_ParCSRMatrixCommPkg(A);
2618    hypre_ParCSRCommHandle  *comm_handle;
2619 
2620    hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
2621    HYPRE_Real      *A_diag_data = hypre_CSRMatrixData(A_diag);
2622    HYPRE_Int       *A_diag_i = hypre_CSRMatrixI(A_diag);
2623    HYPRE_Int       *A_diag_j = hypre_CSRMatrixJ(A_diag);
2624 
2625    hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A);
2626    HYPRE_Real      *A_offd_data = hypre_CSRMatrixData(A_offd);
2627    HYPRE_Int       *A_offd_i = hypre_CSRMatrixI(A_offd);
2628    HYPRE_Int       *A_offd_j = hypre_CSRMatrixJ(A_offd);
2629    HYPRE_Int        num_cols_A_offd = hypre_CSRMatrixNumCols(A_offd);
2630    HYPRE_BigInt    *col_map_offd = hypre_ParCSRMatrixColMapOffd(A);
2631 
2632    hypre_CSRMatrix *S_diag = hypre_ParCSRMatrixDiag(S);
2633    HYPRE_Int       *S_diag_i = hypre_CSRMatrixI(S_diag);
2634    HYPRE_Int       *S_diag_j = hypre_CSRMatrixJ(S_diag);
2635 
2636    hypre_CSRMatrix *S_offd = hypre_ParCSRMatrixOffd(S);
2637    HYPRE_Int       *S_offd_i = hypre_CSRMatrixI(S_offd);
2638    HYPRE_Int       *S_offd_j = hypre_CSRMatrixJ(S_offd);
2639 
2640    hypre_ParCSRMatrix *P;
2641    HYPRE_BigInt       *col_map_offd_P;
2642    HYPRE_Int          *tmp_map_offd;
2643 
2644    HYPRE_Int          *CF_marker_offd = NULL;
2645    HYPRE_Int          *dof_func_offd = NULL;
2646 
2647    hypre_CSRMatrix *A_ext;
2648 
2649    HYPRE_Real      *A_ext_data = NULL;
2650    HYPRE_Int       *A_ext_i = NULL;
2651    HYPRE_BigInt    *A_ext_j = NULL;
2652 
2653    hypre_CSRMatrix *P_diag;
2654    hypre_CSRMatrix *P_offd;
2655 
2656    HYPRE_Real      *P_diag_data;
2657    HYPRE_Int       *P_diag_i;
2658    HYPRE_Int       *P_diag_j;
2659    HYPRE_Real      *P_offd_data;
2660    HYPRE_Int       *P_offd_i;
2661    HYPRE_Int       *P_offd_j;
2662 
2663    HYPRE_Int        P_diag_size, P_offd_size;
2664 
2665    HYPRE_Int       *P_marker, *P_marker_offd;
2666 
2667    HYPRE_Int        jj_counter,jj_counter_offd;
2668    HYPRE_Int       *jj_count, *jj_count_offd;
2669    HYPRE_Int        jj_begin_row,jj_begin_row_offd;
2670    HYPRE_Int        jj_end_row,jj_end_row_offd;
2671 
2672    HYPRE_Int        start_indexing = 0; /* start indexing for P_data at 0 */
2673 
2674    HYPRE_Int        n_fine = hypre_CSRMatrixNumRows(A_diag);
2675 
2676    HYPRE_Int        strong_f_marker;
2677 
2678    HYPRE_Int       *fine_to_coarse;
2679    //HYPRE_Int       *fine_to_coarse_offd;
2680    HYPRE_Int       *coarse_counter;
2681    HYPRE_Int        coarse_shift;
2682    HYPRE_BigInt     total_global_cpts;
2683    HYPRE_Int        num_cols_P_offd;
2684    //HYPRE_BigInt     my_first_cpt;
2685 
2686    HYPRE_Int        i,i1,i2;
2687    HYPRE_Int        j,jl,jj,jj1;
2688    HYPRE_Int        kc;
2689    HYPRE_BigInt     big_k;
2690    HYPRE_Int        start;
2691    HYPRE_Int        sgn;
2692    HYPRE_Int        c_num;
2693 
2694    HYPRE_Real       diagonal;
2695    HYPRE_Real       sum;
2696    HYPRE_Real       distribute;
2697 
2698    HYPRE_Real       zero = 0.0;
2699    HYPRE_Real       one  = 1.0;
2700 
2701    HYPRE_Int        my_id;
2702    HYPRE_Int        num_procs;
2703    HYPRE_Int        num_threads;
2704    HYPRE_Int        num_sends;
2705    HYPRE_Int        index;
2706    HYPRE_Int        ns, ne, size, rest;
2707    HYPRE_Int        print_level = 0;
2708    HYPRE_Int       *int_buf_data;
2709 
2710    HYPRE_BigInt col_1 = hypre_ParCSRMatrixFirstRowIndex(A);
2711    HYPRE_Int local_numrows = hypre_CSRMatrixNumRows(A_diag);
2712    HYPRE_BigInt col_n = col_1 + local_numrows;
2713 
2714    HYPRE_Real       wall_time;  /* for debugging instrumentation  */
2715 
2716    hypre_MPI_Comm_size(comm, &num_procs);
2717    hypre_MPI_Comm_rank(comm,&my_id);
2718    num_threads = hypre_NumThreads();
2719 
2720 
2721    //my_first_cpt = num_cpts_global[0];
2722    if (my_id == (num_procs -1)) total_global_cpts = num_cpts_global[1];
2723    hypre_MPI_Bcast(&total_global_cpts, 1, HYPRE_MPI_BIG_INT, num_procs-1, comm);
2724 
2725    /*-------------------------------------------------------------------
2726     * Get the CF_marker data for the off-processor columns
2727     *-------------------------------------------------------------------*/
2728 
2729    if (debug_flag < 0)
2730    {
2731       debug_flag = -debug_flag;
2732       print_level = 1;
2733    }
2734 
2735    if (debug_flag==4) wall_time = time_getWallclockSeconds();
2736 
2737    if (num_cols_A_offd) CF_marker_offd = hypre_CTAlloc(HYPRE_Int, num_cols_A_offd, HYPRE_MEMORY_HOST);
2738    if (num_functions > 1 && num_cols_A_offd)
2739       dof_func_offd = hypre_CTAlloc(HYPRE_Int, num_cols_A_offd, HYPRE_MEMORY_HOST);
2740 
2741    if (!comm_pkg)
2742    {
2743       hypre_MatvecCommPkgCreate(A);
2744       comm_pkg = hypre_ParCSRMatrixCommPkg(A);
2745    }
2746 
2747    num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
2748    int_buf_data = hypre_CTAlloc(HYPRE_Int,  hypre_ParCSRCommPkgSendMapStart(comm_pkg,
2749             num_sends), HYPRE_MEMORY_HOST);
2750 
2751    index = 0;
2752    for (i = 0; i < num_sends; i++)
2753    {
2754       start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
2755       for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
2756          int_buf_data[index++]
2757             = CF_marker[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
2758    }
2759 
2760    comm_handle = hypre_ParCSRCommHandleCreate( 11, comm_pkg, int_buf_data,
2761          CF_marker_offd);
2762 
2763    hypre_ParCSRCommHandleDestroy(comm_handle);
2764    if (num_functions > 1)
2765    {
2766       index = 0;
2767       for (i = 0; i < num_sends; i++)
2768       {
2769          start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
2770          for (j=start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
2771             int_buf_data[index++]
2772                = dof_func[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
2773       }
2774 
2775       comm_handle = hypre_ParCSRCommHandleCreate( 11, comm_pkg, int_buf_data,
2776             dof_func_offd);
2777 
2778       hypre_ParCSRCommHandleDestroy(comm_handle);
2779    }
2780 
2781    if (debug_flag==4)
2782    {
2783       wall_time = time_getWallclockSeconds() - wall_time;
2784       hypre_printf("Proc = %d     Interp: Comm 1 CF_marker =    %f\n",
2785             my_id, wall_time);
2786       fflush(NULL);
2787    }
2788 
2789    /*----------------------------------------------------------------------
2790     * Get the ghost rows of A
2791     *---------------------------------------------------------------------*/
2792 
2793    if (debug_flag==4) wall_time = time_getWallclockSeconds();
2794 
2795    if (num_procs > 1)
2796    {
2797       A_ext      = hypre_ParCSRMatrixExtractBExt(A,A,1);
2798       A_ext_i    = hypre_CSRMatrixI(A_ext);
2799       A_ext_j    = hypre_CSRMatrixBigJ(A_ext);
2800       A_ext_data = hypre_CSRMatrixData(A_ext);
2801    }
2802 
2803    index = 0;
2804    for (i=0; i < num_cols_A_offd; i++)
2805    {
2806       for (j=A_ext_i[i]; j < A_ext_i[i+1]; j++)
2807       {
2808          big_k = A_ext_j[j];
2809          if (big_k >= col_1 && big_k < col_n)
2810          {
2811             A_ext_j[index] = big_k - col_1;
2812             A_ext_data[index++] = A_ext_data[j];
2813          }
2814          else
2815          {
2816             kc = hypre_BigBinarySearch(col_map_offd,big_k,num_cols_A_offd);
2817             if (kc > -1)
2818             {
2819                A_ext_j[index] = (HYPRE_BigInt)(-kc-1);
2820                A_ext_data[index++] = A_ext_data[j];
2821             }
2822          }
2823       }
2824       A_ext_i[i] = index;
2825    }
2826    for (i = num_cols_A_offd; i > 0; i--)
2827       A_ext_i[i] = A_ext_i[i-1];
2828    if (num_procs > 1) A_ext_i[0] = 0;
2829 
2830    if (debug_flag==4)
2831    {
2832       wall_time = time_getWallclockSeconds() - wall_time;
2833       hypre_printf("Proc = %d  Interp: Comm 2   Get A_ext =  %f\n",
2834             my_id, wall_time);
2835       fflush(NULL);
2836    }
2837 
2838 
2839    /*-----------------------------------------------------------------------
2840     *  First Pass: Determine size of P and fill in fine_to_coarse mapping.
2841     *-----------------------------------------------------------------------*/
2842 
2843    /*-----------------------------------------------------------------------
2844     *  Intialize counters and allocate mapping vector.
2845     *-----------------------------------------------------------------------*/
2846 
2847    coarse_counter = hypre_CTAlloc(HYPRE_Int,  num_threads, HYPRE_MEMORY_HOST);
2848    jj_count = hypre_CTAlloc(HYPRE_Int,  num_threads, HYPRE_MEMORY_HOST);
2849    jj_count_offd = hypre_CTAlloc(HYPRE_Int,  num_threads, HYPRE_MEMORY_HOST);
2850 
2851    fine_to_coarse = hypre_CTAlloc(HYPRE_Int,  n_fine, HYPRE_MEMORY_HOST);
2852 #ifdef HYPRE_USING_OPENMP
2853 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
2854 #endif
2855    for (i = 0; i < n_fine; i++) fine_to_coarse[i] = -1;
2856 
2857    jj_counter = start_indexing;
2858    jj_counter_offd = start_indexing;
2859 
2860    /*-----------------------------------------------------------------------
2861     *  Loop over fine grid.
2862     *-----------------------------------------------------------------------*/
2863 
2864    /* RDF: this looks a little tricky, but doable */
2865 #ifdef HYPRE_USING_OPENMP
2866 #pragma omp parallel for private(i,j,i1,jj,ns,ne,size,rest) HYPRE_SMP_SCHEDULE
2867 #endif
2868    for (j = 0; j < num_threads; j++)
2869    {
2870       size = n_fine/num_threads;
2871       rest = n_fine - size*num_threads;
2872       if (j < rest)
2873       {
2874          ns = j*size+j;
2875          ne = (j+1)*size+j+1;
2876       }
2877       else
2878       {
2879          ns = j*size+rest;
2880          ne = (j+1)*size+rest;
2881       }
2882       for (i = ns; i < ne; i++)
2883       {
2884 
2885          /*--------------------------------------------------------------------
2886           *  If i is a C-point, interpolation is the identity. Also set up
2887           *  mapping vector.
2888           *--------------------------------------------------------------------*/
2889 
2890          if (CF_marker[i] >= 0)
2891          {
2892             jj_count[j]++;
2893             fine_to_coarse[i] = coarse_counter[j];
2894             coarse_counter[j]++;
2895          }
2896 
2897          /*--------------------------------------------------------------------
2898           *  If i is an F-point, interpolation is from the C-points that
2899           *  strongly influence i.
2900           *--------------------------------------------------------------------*/
2901 
2902          else
2903          {
2904             for (jj = S_diag_i[i]; jj < S_diag_i[i+1]; jj++)
2905             {
2906                i1 = S_diag_j[jj];
2907                if (CF_marker[i1] >= 0)
2908                {
2909                   jj_count[j]++;
2910                }
2911             }
2912 
2913             if (num_procs > 1)
2914             {
2915                for (jj = S_offd_i[i]; jj < S_offd_i[i+1]; jj++)
2916                {
2917                   i1 = S_offd_j[jj];
2918                   if (CF_marker_offd[i1] >= 0)
2919                   {
2920                      jj_count_offd[j]++;
2921                   }
2922                }
2923             }
2924          }
2925       }
2926    }
2927 
2928    /*-----------------------------------------------------------------------
2929     *  Allocate  arrays.
2930     *-----------------------------------------------------------------------*/
2931 
2932    for (i=0; i < num_threads-1; i++)
2933    {
2934       coarse_counter[i+1] += coarse_counter[i];
2935       jj_count[i+1] += jj_count[i];
2936       jj_count_offd[i+1] += jj_count_offd[i];
2937    }
2938    i = num_threads-1;
2939    jj_counter = jj_count[i];
2940    jj_counter_offd = jj_count_offd[i];
2941 
2942    P_diag_size = jj_counter;
2943 
2944    P_diag_i    = hypre_CTAlloc(HYPRE_Int,  n_fine+1, HYPRE_MEMORY_HOST);
2945    P_diag_j    = hypre_CTAlloc(HYPRE_Int,  P_diag_size, HYPRE_MEMORY_HOST);
2946    P_diag_data = hypre_CTAlloc(HYPRE_Real,  P_diag_size, HYPRE_MEMORY_HOST);
2947 
2948    P_diag_i[n_fine] = jj_counter;
2949 
2950    P_offd_size = jj_counter_offd;
2951 
2952    P_offd_i    = hypre_CTAlloc(HYPRE_Int,  n_fine+1, HYPRE_MEMORY_HOST);
2953    P_offd_j    = hypre_CTAlloc(HYPRE_Int,  P_offd_size, HYPRE_MEMORY_HOST);
2954    P_offd_data = hypre_CTAlloc(HYPRE_Real,  P_offd_size, HYPRE_MEMORY_HOST);
2955 
2956    /*-----------------------------------------------------------------------
2957     *  Intialize some stuff.
2958     *-----------------------------------------------------------------------*/
2959 
2960    jj_counter = start_indexing;
2961    jj_counter_offd = start_indexing;
2962 
2963    if (debug_flag==4)
2964    {
2965       wall_time = time_getWallclockSeconds() - wall_time;
2966       hypre_printf("Proc = %d     Interp: Internal work 1 =     %f\n",
2967             my_id, wall_time);
2968       fflush(NULL);
2969    }
2970 
2971    /*-----------------------------------------------------------------------
2972     *  Send and receive fine_to_coarse info.
2973     *-----------------------------------------------------------------------*/
2974 
2975    if (debug_flag==4) wall_time = time_getWallclockSeconds();
2976 
2977    //fine_to_coarse_offd = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
2978 
2979 #ifdef HYPRE_USING_OPENMP
2980 #pragma omp parallel for private(i,j,ns,ne,size,rest,coarse_shift) HYPRE_SMP_SCHEDULE
2981 #endif
2982    for (j = 0; j < num_threads; j++)
2983    {
2984       coarse_shift = 0;
2985       if (j > 0) coarse_shift = coarse_counter[j-1];
2986       size = n_fine/num_threads;
2987       rest = n_fine - size*num_threads;
2988       if (j < rest)
2989       {
2990          ns = j*size+j;
2991          ne = (j+1)*size+j+1;
2992       }
2993       else
2994       {
2995          ns = j*size+rest;
2996          ne = (j+1)*size+rest;
2997       }
2998       for (i = ns; i < ne; i++)
2999          fine_to_coarse[i] += coarse_shift;
3000    }
3001    /*index = 0;
3002      for (i = 0; i < num_sends; i++)
3003      {
3004      start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
3005      for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
3006      int_buf_data[index++]
3007      = fine_to_coarse[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
3008      }
3009 
3010      comm_handle = hypre_ParCSRCommHandleCreate( 11, comm_pkg, int_buf_data,
3011      fine_to_coarse_offd);
3012 
3013      hypre_ParCSRCommHandleDestroy(comm_handle);
3014 
3015      if (debug_flag==4)
3016      {
3017      wall_time = time_getWallclockSeconds() - wall_time;
3018      hypre_printf("Proc = %d     Interp: Comm 4 FineToCoarse = %f\n",
3019      my_id, wall_time);
3020      fflush(NULL);
3021      }*/
3022 
3023    if (debug_flag==4) wall_time = time_getWallclockSeconds();
3024 
3025    /*#ifdef HYPRE_USING_OPENMP
3026 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
3027 #endif
3028 for (i = 0; i < n_fine; i++) fine_to_coarse[i] -= my_first_cpt;*/
3029 
3030    /*-----------------------------------------------------------------------
3031     *  Loop over fine grid points.
3032     *-----------------------------------------------------------------------*/
3033 
3034 #ifdef HYPRE_USING_OPENMP
3035 #pragma omp parallel for private(i,j,jl,i1,i2,jj,jj1,ns,ne,size,rest,sum,diagonal,distribute,P_marker,P_marker_offd,strong_f_marker,jj_counter,jj_counter_offd,sgn,c_num,jj_begin_row,jj_end_row,jj_begin_row_offd,jj_end_row_offd) HYPRE_SMP_SCHEDULE
3036 #endif
3037    for (jl = 0; jl < num_threads; jl++)
3038    {
3039       size = n_fine/num_threads;
3040       rest = n_fine - size*num_threads;
3041       if (jl < rest)
3042       {
3043          ns = jl*size+jl;
3044          ne = (jl+1)*size+jl+1;
3045       }
3046       else
3047       {
3048          ns = jl*size+rest;
3049          ne = (jl+1)*size+rest;
3050       }
3051       jj_counter = 0;
3052       if (jl > 0) jj_counter = jj_count[jl-1];
3053       jj_counter_offd = 0;
3054       if (jl > 0) jj_counter_offd = jj_count_offd[jl-1];
3055 
3056       P_marker = hypre_CTAlloc(HYPRE_Int,  n_fine, HYPRE_MEMORY_HOST);
3057       if (num_cols_A_offd)
3058          P_marker_offd = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
3059       else
3060          P_marker_offd = NULL;
3061 
3062       for (i = 0; i < n_fine; i++)
3063       {
3064          P_marker[i] = -1;
3065       }
3066       for (i = 0; i < num_cols_A_offd; i++)
3067       {
3068          P_marker_offd[i] = -1;
3069       }
3070       strong_f_marker = -2;
3071 
3072       for (i = ns; i < ne; i++)
3073       {
3074 
3075          /*--------------------------------------------------------------------
3076           *  If i is a c-point, interpolation is the identity.
3077           *--------------------------------------------------------------------*/
3078 
3079          if (CF_marker[i] >= 0)
3080          {
3081             P_diag_i[i] = jj_counter;
3082             P_diag_j[jj_counter]    = fine_to_coarse[i];
3083             P_diag_data[jj_counter] = one;
3084             jj_counter++;
3085          }
3086 
3087          /*--------------------------------------------------------------------
3088           *  If i is an F-point, build interpolation.
3089           *--------------------------------------------------------------------*/
3090 
3091          else
3092          {
3093             /* Diagonal part of P */
3094             P_diag_i[i] = jj_counter;
3095             jj_begin_row = jj_counter;
3096 
3097             for (jj = S_diag_i[i]; jj < S_diag_i[i+1]; jj++)
3098             {
3099                i1 = S_diag_j[jj];
3100 
3101                /*--------------------------------------------------------------
3102                 * If neighbor i1 is a C-point, set column number in P_diag_j
3103                 * and initialize interpolation weight to zero.
3104                 *--------------------------------------------------------------*/
3105 
3106                if (CF_marker[i1] >= 0)
3107                {
3108                   P_marker[i1] = jj_counter;
3109                   P_diag_j[jj_counter]    = fine_to_coarse[i1];
3110                   P_diag_data[jj_counter] = zero;
3111                   jj_counter++;
3112                }
3113 
3114                /*--------------------------------------------------------------
3115                 * If neighbor i1 is an F-point, mark it as a strong F-point
3116                 * whose connection needs to be distributed.
3117                 *--------------------------------------------------------------*/
3118 
3119                else if (CF_marker[i1] != -3)
3120                {
3121                   P_marker[i1] = strong_f_marker;
3122                }
3123             }
3124             jj_end_row = jj_counter;
3125 
3126             /* Off-Diagonal part of P */
3127             P_offd_i[i] = jj_counter_offd;
3128             jj_begin_row_offd = jj_counter_offd;
3129 
3130 
3131             if (num_procs > 1)
3132             {
3133                for (jj = S_offd_i[i]; jj < S_offd_i[i+1]; jj++)
3134                {
3135                   i1 = S_offd_j[jj];
3136 
3137                   /*-----------------------------------------------------------
3138                    * If neighbor i1 is a C-point, set column number in P_offd_j
3139                    * and initialize interpolation weight to zero.
3140                    *-----------------------------------------------------------*/
3141 
3142                   if (CF_marker_offd[i1] >= 0)
3143                   {
3144                      P_marker_offd[i1] = jj_counter_offd;
3145                      /*P_offd_j[jj_counter_offd]  = fine_to_coarse_offd[i1];*/
3146                      P_offd_j[jj_counter_offd]  = i1;
3147                      P_offd_data[jj_counter_offd] = zero;
3148                      jj_counter_offd++;
3149                   }
3150 
3151                   /*-----------------------------------------------------------
3152                    * If neighbor i1 is an F-point, mark it as a strong F-point
3153                    * whose connection needs to be distributed.
3154                    *-----------------------------------------------------------*/
3155 
3156                   else if (CF_marker_offd[i1] != -3)
3157                   {
3158                      P_marker_offd[i1] = strong_f_marker;
3159                   }
3160                }
3161             }
3162 
3163             jj_end_row_offd = jj_counter_offd;
3164 
3165             diagonal = A_diag_data[A_diag_i[i]];
3166 
3167 
3168             /* Loop over ith row of A.  First, the diagonal part of A */
3169 
3170             for (jj = A_diag_i[i]+1; jj < A_diag_i[i+1]; jj++)
3171             {
3172                i1 = A_diag_j[jj];
3173 
3174                /*--------------------------------------------------------------
3175                 * Case 1: neighbor i1 is a C-point and strongly influences i,
3176                 * accumulate a_{i,i1} into the interpolation weight.
3177                 *--------------------------------------------------------------*/
3178 
3179                if (P_marker[i1] >= jj_begin_row)
3180                {
3181                   P_diag_data[P_marker[i1]] += A_diag_data[jj];
3182                }
3183 
3184                /*--------------------------------------------------------------
3185                 * Case 2: neighbor i1 is an F-point and strongly influences i,
3186                 * distribute a_{i,i1} to C-points that strongly infuence i.
3187                 * Note: currently no distribution to the diagonal in this case.
3188 
3189                 HERE, we only want to distribut to points of the SAME function type
3190 
3191                 *--------------------------------------------------------------*/
3192 
3193                else if (P_marker[i1] == strong_f_marker)
3194                {
3195                   sum = zero;
3196 
3197                   /*-----------------------------------------------------------
3198                    * Loop over row of A for point i1 and calculate the sum
3199                    * of the connections to c-points that strongly influence i.
3200                    *-----------------------------------------------------------*/
3201                   sgn = 1;
3202                   if (A_diag_data[A_diag_i[i1]] < 0) sgn = -1;
3203                   /* Diagonal block part of row i1 */
3204                   for (jj1 = A_diag_i[i1]; jj1 < A_diag_i[i1+1]; jj1++)
3205                   {
3206                      i2 = A_diag_j[jj1];
3207                      if (num_functions == 1 || dof_func[i1] == dof_func[i2])
3208                      {
3209 
3210                         if (P_marker[i2] >= jj_begin_row &&
3211                               (sgn*A_diag_data[jj1]) < 0 )
3212                         {
3213                            sum += A_diag_data[jj1];
3214                         }
3215                      }
3216 
3217                   }
3218 
3219                   /* Off-Diagonal block part of row i1 */
3220                   if (num_procs > 1)
3221                   {
3222                      for (jj1 = A_offd_i[i1]; jj1 < A_offd_i[i1+1]; jj1++)
3223                      {
3224                         i2 = A_offd_j[jj1];
3225                         if (num_functions == 1 || dof_func[i1] == dof_func[i2])
3226                         {
3227                            if (P_marker_offd[i2] >= jj_begin_row_offd
3228                                  && (sgn*A_offd_data[jj1]) < 0)
3229                            {
3230                               sum += A_offd_data[jj1];
3231                            }
3232                         }
3233                      }
3234                   }
3235 
3236                   if (sum != 0)
3237                   {
3238                      distribute = A_diag_data[jj] / sum;
3239 
3240                      /*-----------------------------------------------------------
3241                       * Loop over row of A for point i1 and do the distribution.
3242                       *-----------------------------------------------------------*/
3243 
3244                      /* Diagonal block part of row i1 */
3245                      for (jj1 = A_diag_i[i1]; jj1 < A_diag_i[i1+1]; jj1++)
3246                      {
3247                         i2 = A_diag_j[jj1];
3248                         if (num_functions == 1 || dof_func[i1] == dof_func[i2])
3249                         {
3250                            if (P_marker[i2] >= jj_begin_row
3251                                  && (sgn*A_diag_data[jj1]) < 0)
3252                            {
3253                               P_diag_data[P_marker[i2]]
3254                                  += distribute * A_diag_data[jj1];
3255                            }
3256                         }
3257 
3258                      }
3259 
3260                      /* Off-Diagonal block part of row i1 */
3261                      if (num_procs > 1)
3262                      {
3263                         for (jj1 = A_offd_i[i1]; jj1 < A_offd_i[i1+1]; jj1++)
3264                         {
3265                            i2 = A_offd_j[jj1];
3266                            if (num_functions == 1 || dof_func[i1] == dof_func[i2])
3267                            {
3268                               if (P_marker_offd[i2] >= jj_begin_row_offd
3269                                     && (sgn*A_offd_data[jj1]) < 0)
3270                               {
3271                                  P_offd_data[P_marker_offd[i2]]
3272                                     += distribute * A_offd_data[jj1];
3273                               }
3274                            }
3275                         }
3276 
3277                      }
3278                   }
3279                   else /* sum = 0 - only add to diag if the same function type */
3280                   {
3281                      if (num_functions == 1 || dof_func[i] == dof_func[i1])
3282                         diagonal += A_diag_data[jj];
3283                   }
3284                }
3285 
3286                /*--------------------------------------------------------------
3287                 * Case 3: neighbor i1 weakly influences i, accumulate a_{i,i1}
3288                 * into the diagonal. (only if the same function type)
3289                 *--------------------------------------------------------------*/
3290 
3291                else if (CF_marker[i1] != -3)
3292                {
3293                   if (num_functions == 1 || dof_func[i] == dof_func[i1])
3294                      diagonal += A_diag_data[jj];
3295                }
3296 
3297             }
3298 
3299 
3300             /*----------------------------------------------------------------
3301              * Still looping over ith row of A. Next, loop over the
3302              * off-diagonal part of A
3303              *---------------------------------------------------------------*/
3304 
3305             if (num_procs > 1)
3306             {
3307                for (jj = A_offd_i[i]; jj < A_offd_i[i+1]; jj++)
3308                {
3309                   i1 = A_offd_j[jj];
3310 
3311                   /*--------------------------------------------------------------
3312                    * Case 1: neighbor i1 is a C-point and strongly influences i,
3313                    * accumulate a_{i,i1} into the interpolation weight.
3314                    *--------------------------------------------------------------*/
3315 
3316                   if (P_marker_offd[i1] >= jj_begin_row_offd)
3317                   {
3318                      P_offd_data[P_marker_offd[i1]] += A_offd_data[jj];
3319                   }
3320 
3321                   /*------------------------------------------------------------
3322                    * Case 2: neighbor i1 is an F-point and strongly influences i,
3323                    * distribute a_{i,i1} to C-points that strongly infuence i.
3324                    * Note: currently no distribution to the diagonal in this case.
3325 
3326                    AGAIN, we only want to distribut to points of the SAME function type
3327 
3328                    *-----------------------------------------------------------*/
3329 
3330                   else if (P_marker_offd[i1] == strong_f_marker)
3331                   {
3332                      sum = zero;
3333 
3334                      /*---------------------------------------------------------
3335                       * Loop over row of A_ext for point i1 and calculate the sum
3336                       * of the connections to c-points that strongly influence i.
3337                       *---------------------------------------------------------*/
3338 
3339                      /* find row number */
3340                      c_num = A_offd_j[jj];
3341 
3342                      sgn = 1;
3343                      if (A_ext_data[A_ext_i[c_num]] < 0) sgn = -1;
3344                      for (jj1 = A_ext_i[c_num]; jj1 < A_ext_i[c_num+1]; jj1++)
3345                      {
3346                         i2 = (HYPRE_Int)A_ext_j[jj1];
3347                         if (num_functions == 1 || dof_func[i1] == dof_func[i2])
3348                         {
3349                            if (i2 > -1)
3350                            {
3351                               /* in the diagonal block */
3352                               if (P_marker[i2] >= jj_begin_row
3353                                     && (sgn*A_ext_data[jj1]) < 0)
3354                               {
3355                                  sum += A_ext_data[jj1];
3356                               }
3357                            }
3358                            else
3359                            {
3360                               /* in the off_diagonal block  */
3361                               if (P_marker_offd[-i2-1] >= jj_begin_row_offd
3362                                     && (sgn*A_ext_data[jj1]) < 0)
3363                               {
3364                                  sum += A_ext_data[jj1];
3365                               }
3366                            }
3367 
3368                         }
3369                      }
3370                      if (sum != 0)
3371                      {
3372                         distribute = A_offd_data[jj] / sum;
3373                         /*---------------------------------------------------------
3374                          * Loop over row of A_ext for point i1 and do
3375                          * the distribution.
3376                          *--------------------------------------------------------*/
3377 
3378                         /* Diagonal block part of row i1 */
3379 
3380                         for (jj1 = A_ext_i[c_num]; jj1 < A_ext_i[c_num+1]; jj1++)
3381                         {
3382                            i2 = (HYPRE_Int)A_ext_j[jj1];
3383                            if (num_functions == 1 || dof_func[i1] == dof_func[i2])
3384                            {
3385                               if (i2 > -1) /* in the diagonal block */
3386                               {
3387                                  if (P_marker[i2] >= jj_begin_row
3388                                        && (sgn*A_ext_data[jj1]) < 0)
3389                                  {
3390                                     P_diag_data[P_marker[i2]]
3391                                        += distribute * A_ext_data[jj1];
3392                                  }
3393                               }
3394                               else
3395                               {
3396                                  /* in the off_diagonal block  */
3397                                  if (P_marker_offd[-i2-1] >= jj_begin_row_offd
3398                                        && (sgn*A_ext_data[jj1]) < 0)
3399                                     P_offd_data[P_marker_offd[-i2-1]]
3400                                        += distribute * A_ext_data[jj1];
3401                               }
3402                            }
3403                         }
3404                      }
3405                      else /* sum = 0 */
3406                      {
3407                         if (num_functions == 1 || dof_func[i] == dof_func_offd[i1])
3408                            diagonal += A_offd_data[jj];
3409                      }
3410                   }
3411 
3412                   /*-----------------------------------------------------------
3413                    * Case 3: neighbor i1 weakly influences i, accumulate a_{i,i1}
3414                    * into the diagonal.
3415                    *-----------------------------------------------------------*/
3416 
3417                   else if (CF_marker_offd[i1] != -3)
3418                   {
3419                      if (num_functions == 1 || dof_func[i] == dof_func_offd[i1])
3420                         diagonal += A_offd_data[jj];
3421                   }
3422 
3423                }
3424             }
3425 
3426             /*-----------------------------------------------------------------
3427              * Set interpolation weight by dividing by the diagonal.
3428              *-----------------------------------------------------------------*/
3429 
3430             if (diagonal == 0.0)
3431             {
3432                if (print_level)
3433                   hypre_printf(" Warning! zero diagonal! Proc id %d row %d\n", my_id,i);
3434                for (jj = jj_begin_row; jj < jj_end_row; jj++)
3435                {
3436                   P_diag_data[jj] = 0.0;
3437                }
3438                for (jj = jj_begin_row_offd; jj < jj_end_row_offd; jj++)
3439                {
3440                   P_offd_data[jj] = 0.0;
3441                }
3442             }
3443             else
3444             {
3445                for (jj = jj_begin_row; jj < jj_end_row; jj++)
3446                {
3447                   P_diag_data[jj] /= -diagonal;
3448                }
3449                for (jj = jj_begin_row_offd; jj < jj_end_row_offd; jj++)
3450                {
3451                   P_offd_data[jj] /= -diagonal;
3452                }
3453             }
3454          }
3455 
3456          strong_f_marker--;
3457 
3458          P_offd_i[i+1] = jj_counter_offd;
3459       }
3460       hypre_TFree(P_marker, HYPRE_MEMORY_HOST);
3461       hypre_TFree(P_marker_offd, HYPRE_MEMORY_HOST);
3462    }
3463 
3464    P = hypre_ParCSRMatrixCreate(comm,
3465          hypre_ParCSRMatrixGlobalNumRows(A),
3466          total_global_cpts,
3467          hypre_ParCSRMatrixColStarts(A),
3468          num_cpts_global,
3469          0,
3470          P_diag_i[n_fine],
3471          P_offd_i[n_fine]);
3472 
3473    P_diag = hypre_ParCSRMatrixDiag(P);
3474    hypre_CSRMatrixData(P_diag) = P_diag_data;
3475    hypre_CSRMatrixI(P_diag) = P_diag_i;
3476    hypre_CSRMatrixJ(P_diag) = P_diag_j;
3477    P_offd = hypre_ParCSRMatrixOffd(P);
3478    hypre_CSRMatrixData(P_offd) = P_offd_data;
3479    hypre_CSRMatrixI(P_offd) = P_offd_i;
3480    hypre_CSRMatrixJ(P_offd) = P_offd_j;
3481 
3482    /* Compress P, removing coefficients smaller than trunc_factor * Max */
3483 
3484    if (trunc_factor != 0.0 || max_elmts > 0)
3485    {
3486       hypre_BoomerAMGInterpTruncation(P, trunc_factor, max_elmts);
3487       P_diag_data = hypre_CSRMatrixData(P_diag);
3488       P_diag_i = hypre_CSRMatrixI(P_diag);
3489       P_diag_j = hypre_CSRMatrixJ(P_diag);
3490       P_offd_data = hypre_CSRMatrixData(P_offd);
3491       P_offd_i = hypre_CSRMatrixI(P_offd);
3492       P_offd_j = hypre_CSRMatrixJ(P_offd);
3493       P_diag_size = P_diag_i[n_fine];
3494       P_offd_size = P_offd_i[n_fine];
3495    }
3496 
3497    num_cols_P_offd = 0;
3498    if (P_offd_size)
3499    {
3500       P_marker = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
3501 
3502 #ifdef HYPRE_USING_OPENMP
3503 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
3504 #endif
3505       for (i=0; i < num_cols_A_offd; i++)
3506          P_marker[i] = 0;
3507 
3508       num_cols_P_offd = 0;
3509       for (i=0; i < P_offd_size; i++)
3510       {
3511          index = P_offd_j[i];
3512          if (!P_marker[index])
3513          {
3514             num_cols_P_offd++;
3515             P_marker[index] = 1;
3516          }
3517       }
3518 
3519       col_map_offd_P = hypre_CTAlloc(HYPRE_BigInt, num_cols_P_offd, HYPRE_MEMORY_HOST);
3520       tmp_map_offd = hypre_CTAlloc(HYPRE_Int, num_cols_P_offd, HYPRE_MEMORY_HOST);
3521 
3522       index = 0;
3523       for (i=0; i < num_cols_P_offd; i++)
3524       {
3525          while (P_marker[index]==0) index++;
3526          tmp_map_offd[i] = index++;
3527       }
3528 
3529 #ifdef HYPRE_USING_OPENMP
3530 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
3531 #endif
3532       for (i=0; i < P_offd_size; i++)
3533          P_offd_j[i] = hypre_BinarySearch(tmp_map_offd,
3534                P_offd_j[i],
3535                num_cols_P_offd);
3536       hypre_TFree(P_marker, HYPRE_MEMORY_HOST);
3537    }
3538 
3539    for (i=0; i < n_fine; i++)
3540       if (CF_marker[i] == -3) CF_marker[i] = -1;
3541 
3542    if (num_cols_P_offd)
3543    {
3544       hypre_ParCSRMatrixColMapOffd(P) = col_map_offd_P;
3545       hypre_CSRMatrixNumCols(P_offd) = num_cols_P_offd;
3546    }
3547 
3548    hypre_GetCommPkgRTFromCommPkgA(P, A, fine_to_coarse, tmp_map_offd);
3549 
3550 
3551    *P_ptr = P;
3552 
3553    hypre_TFree(CF_marker_offd, HYPRE_MEMORY_HOST);
3554    hypre_TFree(dof_func_offd, HYPRE_MEMORY_HOST);
3555    hypre_TFree(int_buf_data, HYPRE_MEMORY_HOST);
3556    hypre_TFree(fine_to_coarse, HYPRE_MEMORY_HOST);
3557    hypre_TFree(tmp_map_offd, HYPRE_MEMORY_HOST);
3558    hypre_TFree(coarse_counter, HYPRE_MEMORY_HOST);
3559    hypre_TFree(jj_count, HYPRE_MEMORY_HOST);
3560    hypre_TFree(jj_count_offd, HYPRE_MEMORY_HOST);
3561 
3562    if (num_procs > 1) hypre_CSRMatrixDestroy(A_ext);
3563 
3564    return hypre_error_flag;
3565 
3566 }
3567 
3568 /*---------------------------------------------------------------------------
3569  * hypre_BoomerAMGTruncandBuild
3570  *--------------------------------------------------------------------------*/
3571 
3572 HYPRE_Int
hypre_BoomerAMGTruncandBuild(hypre_ParCSRMatrix * P,HYPRE_Real trunc_factor,HYPRE_Int max_elmts)3573 hypre_BoomerAMGTruncandBuild( hypre_ParCSRMatrix   *P,
3574                          HYPRE_Real                trunc_factor,
3575                          HYPRE_Int                 max_elmts)
3576 {
3577    hypre_CSRMatrix *P_offd = hypre_ParCSRMatrixOffd(P);
3578    hypre_ParCSRCommPkg   *commpkg_P = hypre_ParCSRMatrixCommPkg(P);
3579    HYPRE_BigInt          *col_map_offd = hypre_ParCSRMatrixColMapOffd(P);
3580    HYPRE_Int             *P_offd_i = hypre_CSRMatrixI(P_offd);
3581    HYPRE_Int             *P_offd_j = hypre_CSRMatrixJ(P_offd);
3582    HYPRE_Int              num_cols_offd = hypre_CSRMatrixNumCols(P_offd);
3583    HYPRE_Int              n_fine = hypre_CSRMatrixNumRows(P_offd);
3584 
3585    HYPRE_BigInt          *new_col_map_offd;
3586    HYPRE_Int             *tmp_map_offd = NULL;
3587 
3588    HYPRE_Int              P_offd_size=0, new_num_cols_offd;
3589 
3590    HYPRE_Int             *P_marker;
3591 
3592    HYPRE_Int              i;
3593 
3594    HYPRE_Int              index;
3595 
3596    /* Compress P, removing coefficients smaller than trunc_factor * Max */
3597 
3598    if (trunc_factor != 0.0 || max_elmts > 0)
3599    {
3600       hypre_BoomerAMGInterpTruncation(P, trunc_factor, max_elmts);
3601       P_offd_j = hypre_CSRMatrixJ(P_offd);
3602       P_offd_i = hypre_CSRMatrixI(P_offd);
3603       P_offd_size = P_offd_i[n_fine];
3604    }
3605 
3606    new_num_cols_offd = 0;
3607    if (P_offd_size)
3608    {
3609       P_marker = hypre_CTAlloc(HYPRE_Int,  num_cols_offd, HYPRE_MEMORY_HOST);
3610 
3611       /*#define HYPRE_SMP_PRIVATE i
3612 #include "../utilities/hypre_smp_forloop.h"*/
3613       for (i=0; i < num_cols_offd; i++)
3614          P_marker[i] = 0;
3615 
3616       for (i=0; i < P_offd_size; i++)
3617       {
3618          index = P_offd_j[i];
3619          if (!P_marker[index])
3620          {
3621             new_num_cols_offd++;
3622             P_marker[index] = 1;
3623          }
3624       }
3625 
3626       tmp_map_offd = hypre_CTAlloc(HYPRE_Int, new_num_cols_offd, HYPRE_MEMORY_HOST);
3627       new_col_map_offd = hypre_CTAlloc(HYPRE_BigInt, new_num_cols_offd, HYPRE_MEMORY_HOST);
3628 
3629       index = 0;
3630       for (i=0; i < new_num_cols_offd; i++)
3631       {
3632          while (P_marker[index]==0) index++;
3633          tmp_map_offd[i] = index++;
3634       }
3635 
3636       /*#define HYPRE_SMP_PRIVATE i
3637 #include "../utilities/hypre_smp_forloop.h"*/
3638       for (i=0; i < P_offd_size; i++)
3639          P_offd_j[i] = hypre_BinarySearch(tmp_map_offd,
3640                P_offd_j[i],
3641                new_num_cols_offd);
3642    }
3643 
3644    index = 0;
3645    for (i = 0; i < new_num_cols_offd; i++)
3646    {
3647       while (P_marker[index] == 0) index++;
3648 
3649       new_col_map_offd[i] = col_map_offd[index];
3650       index++;
3651    }
3652 
3653    if (P_offd_size) hypre_TFree(P_marker, HYPRE_MEMORY_HOST);
3654 
3655    if (new_num_cols_offd)
3656    {
3657       hypre_TFree(tmp_map_offd, HYPRE_MEMORY_HOST);
3658       hypre_TFree(col_map_offd, HYPRE_MEMORY_HOST);
3659       hypre_ParCSRMatrixColMapOffd(P) = new_col_map_offd;
3660       hypre_CSRMatrixNumCols(P_offd) = new_num_cols_offd;
3661    }
3662 
3663    if (commpkg_P != NULL) hypre_MatvecCommPkgDestroy(commpkg_P);
3664    hypre_MatvecCommPkgCreate(P);
3665 
3666    return hypre_error_flag;
3667 
3668 }
3669 
hypre_CreateC(hypre_ParCSRMatrix * A,HYPRE_Real w)3670 hypre_ParCSRMatrix *hypre_CreateC( hypre_ParCSRMatrix  *A,
3671                                    HYPRE_Real w)
3672 {
3673    MPI_Comm    comm = hypre_ParCSRMatrixComm(A);
3674 
3675    hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
3676 
3677    HYPRE_Real *A_diag_data = hypre_CSRMatrixData(A_diag);
3678    HYPRE_Int  *A_diag_i = hypre_CSRMatrixI(A_diag);
3679    HYPRE_Int  *A_diag_j = hypre_CSRMatrixJ(A_diag);
3680 
3681    hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A);
3682 
3683    HYPRE_Real *A_offd_data = hypre_CSRMatrixData(A_offd);
3684    HYPRE_Int  *A_offd_i = hypre_CSRMatrixI(A_offd);
3685    HYPRE_Int  *A_offd_j = hypre_CSRMatrixJ(A_offd);
3686 
3687    HYPRE_BigInt *row_starts = hypre_ParCSRMatrixRowStarts(A);
3688    HYPRE_BigInt *col_map_offd_A = hypre_ParCSRMatrixColMapOffd(A);
3689    HYPRE_Int    num_rows = hypre_CSRMatrixNumRows(A_diag);
3690    HYPRE_Int    num_cols_offd = hypre_CSRMatrixNumCols(A_offd);
3691    HYPRE_BigInt global_num_rows = hypre_ParCSRMatrixGlobalNumRows(A);
3692 
3693    hypre_ParCSRMatrix *C;
3694    hypre_CSRMatrix *C_diag;
3695    hypre_CSRMatrix *C_offd;
3696 
3697    HYPRE_Real      *C_diag_data;
3698    HYPRE_Int       *C_diag_i;
3699    HYPRE_Int       *C_diag_j;
3700 
3701    HYPRE_Real      *C_offd_data;
3702    HYPRE_Int       *C_offd_i;
3703    HYPRE_Int       *C_offd_j;
3704    HYPRE_BigInt    *col_map_offd_C;
3705 
3706    HYPRE_Int i, j, index;
3707    HYPRE_Real  invdiag;
3708    HYPRE_Real  w_local = w;
3709 
3710    C = hypre_ParCSRMatrixCreate(comm, global_num_rows, global_num_rows, row_starts,
3711          row_starts, num_cols_offd, A_diag_i[num_rows], A_offd_i[num_rows]);
3712 
3713    hypre_ParCSRMatrixInitialize(C);
3714 
3715    C_diag = hypre_ParCSRMatrixDiag(C);
3716    C_offd = hypre_ParCSRMatrixOffd(C);
3717 
3718    C_diag_i = hypre_CSRMatrixI(C_diag);
3719    C_diag_j = hypre_CSRMatrixJ(C_diag);
3720    C_diag_data = hypre_CSRMatrixData(C_diag);
3721 
3722    C_offd_i = hypre_CSRMatrixI(C_offd);
3723    C_offd_j = hypre_CSRMatrixJ(C_offd);
3724    C_offd_data = hypre_CSRMatrixData(C_offd);
3725 
3726    col_map_offd_C = hypre_ParCSRMatrixColMapOffd(C);
3727 
3728    for (i = 0; i < num_cols_offd; i++)
3729    {
3730       col_map_offd_C[i] = col_map_offd_A[i];
3731    }
3732 
3733    for (i = 0; i < num_rows; i++)
3734    {
3735       index = A_diag_i[i];
3736       invdiag = -w/A_diag_data[index];
3737       C_diag_data[index] = 1.0-w;
3738       C_diag_j[index] = A_diag_j[index];
3739       if (w == 0)
3740       {
3741          w_local = fabs(A_diag_data[index]);
3742          for (j = index+1; j < A_diag_i[i+1]; j++)
3743             w_local += fabs(A_diag_data[j]);
3744          for (j = A_offd_i[i]; j < A_offd_i[i+1]; j++)
3745             w_local += fabs(A_offd_data[j]);
3746          invdiag = -1/w_local;
3747          C_diag_data[index] = 1.0-A_diag_data[index]/w_local;
3748       }
3749       C_diag_i[i] = index;
3750       C_offd_i[i] = A_offd_i[i];
3751       for (j = index+1; j < A_diag_i[i+1]; j++)
3752       {
3753          C_diag_data[j] = A_diag_data[j]*invdiag;
3754          C_diag_j[j] = A_diag_j[j];
3755       }
3756       for (j = A_offd_i[i]; j < A_offd_i[i+1]; j++)
3757       {
3758          C_offd_data[j] = A_offd_data[j]*invdiag;
3759          C_offd_j[j] = A_offd_j[j];
3760       }
3761    }
3762    C_diag_i[num_rows] = A_diag_i[num_rows];
3763    C_offd_i[num_rows] = A_offd_i[num_rows];
3764 
3765    return C;
3766 }
3767 
3768 /* RL */
3769 HYPRE_Int
hypre_BoomerAMGBuildInterpOnePntHost(hypre_ParCSRMatrix * A,HYPRE_Int * CF_marker,hypre_ParCSRMatrix * S,HYPRE_BigInt * num_cpts_global,HYPRE_Int num_functions,HYPRE_Int * dof_func,HYPRE_Int debug_flag,hypre_ParCSRMatrix ** P_ptr)3770 hypre_BoomerAMGBuildInterpOnePntHost( hypre_ParCSRMatrix  *A,
3771                                       HYPRE_Int           *CF_marker,
3772                                       hypre_ParCSRMatrix  *S,
3773                                       HYPRE_BigInt        *num_cpts_global,
3774                                       HYPRE_Int            num_functions,
3775                                       HYPRE_Int           *dof_func,
3776                                       HYPRE_Int            debug_flag,
3777                                       hypre_ParCSRMatrix **P_ptr)
3778 {
3779    MPI_Comm                 comm     = hypre_ParCSRMatrixComm(A);
3780    hypre_ParCSRCommPkg     *comm_pkg = hypre_ParCSRMatrixCommPkg(A);
3781    hypre_ParCSRCommHandle  *comm_handle;
3782 
3783    hypre_CSRMatrix         *A_diag      = hypre_ParCSRMatrixDiag(A);
3784    HYPRE_Real              *A_diag_data = hypre_CSRMatrixData(A_diag);
3785    HYPRE_Int               *A_diag_i    = hypre_CSRMatrixI(A_diag);
3786    HYPRE_Int               *A_diag_j    = hypre_CSRMatrixJ(A_diag);
3787 
3788    hypre_CSRMatrix         *A_offd      = hypre_ParCSRMatrixOffd(A);
3789    HYPRE_Real              *A_offd_data = hypre_CSRMatrixData(A_offd);
3790    HYPRE_Int               *A_offd_i    = hypre_CSRMatrixI(A_offd);
3791    HYPRE_Int               *A_offd_j    = hypre_CSRMatrixJ(A_offd);
3792 
3793    HYPRE_Int                num_cols_A_offd = hypre_CSRMatrixNumCols(A_offd);
3794    //HYPRE_Int               *col_map_offd_A    = hypre_ParCSRMatrixColMapOffd(A);
3795 
3796    hypre_CSRMatrix         *S_diag   = hypre_ParCSRMatrixDiag(S);
3797    HYPRE_Int               *S_diag_i = hypre_CSRMatrixI(S_diag);
3798    HYPRE_Int               *S_diag_j = hypre_CSRMatrixJ(S_diag);
3799 
3800    hypre_CSRMatrix         *S_offd   = hypre_ParCSRMatrixOffd(S);
3801    HYPRE_Int               *S_offd_i = hypre_CSRMatrixI(S_offd);
3802    HYPRE_Int               *S_offd_j = hypre_CSRMatrixJ(S_offd);
3803 
3804    /* Interpolation matrix P */
3805    hypre_ParCSRMatrix      *P;
3806    /* csr's */
3807    hypre_CSRMatrix    *P_diag;
3808    hypre_CSRMatrix    *P_offd;
3809    /* arrays */
3810    HYPRE_Real         *P_diag_data;
3811    HYPRE_Int          *P_diag_i;
3812    HYPRE_Int          *P_diag_j;
3813    HYPRE_Real         *P_offd_data;
3814    HYPRE_Int          *P_offd_i;
3815    HYPRE_Int          *P_offd_j;
3816    HYPRE_Int           num_cols_offd_P;
3817    HYPRE_Int          *tmp_map_offd = NULL;
3818    HYPRE_BigInt       *col_map_offd_P = NULL;
3819    /* CF marker off-diag part */
3820    HYPRE_Int          *CF_marker_offd = NULL;
3821    /* func type off-diag part */
3822    HYPRE_Int          *dof_func_offd  = NULL;
3823    /* nnz */
3824    HYPRE_Int           nnz_diag, nnz_offd, cnt_diag, cnt_offd;
3825    HYPRE_Int          *marker_diag, *marker_offd = NULL;
3826    /* local size */
3827    HYPRE_Int           n_fine = hypre_CSRMatrixNumRows(A_diag);
3828    /* number of C-pts */
3829    HYPRE_Int           n_cpts = 0;
3830    /* fine to coarse mapping: diag part and offd part */
3831    HYPRE_Int          *fine_to_coarse;
3832    HYPRE_BigInt       *fine_to_coarse_offd = NULL;
3833    HYPRE_BigInt        total_global_cpts, my_first_cpt;
3834    HYPRE_Int           my_id, num_procs;
3835    HYPRE_Int           num_sends;
3836    HYPRE_Int          *int_buf_data = NULL;
3837    HYPRE_BigInt       *big_int_buf_data = NULL;
3838    //HYPRE_Int col_start = hypre_ParCSRMatrixFirstRowIndex(A);
3839    //HYPRE_Int col_end   = col_start + n_fine;
3840 
3841    HYPRE_Int           i, j, i1, j1, k1, index, start;
3842    HYPRE_Int          *max_abs_cij;
3843    char               *max_abs_diag_offd;
3844    HYPRE_Real          max_abs_aij, vv;
3845 
3846    hypre_MPI_Comm_size(comm, &num_procs);
3847    hypre_MPI_Comm_rank(comm,&my_id);
3848 
3849    my_first_cpt = num_cpts_global[0];
3850    if (my_id == (num_procs -1)) total_global_cpts = num_cpts_global[1];
3851    hypre_MPI_Bcast(&total_global_cpts, 1, HYPRE_MPI_BIG_INT, num_procs-1, comm);
3852 
3853    /*-------------------------------------------------------------------
3854     * Get the CF_marker data for the off-processor columns
3855     *-------------------------------------------------------------------*/
3856    /* CF marker for the off-diag columns */
3857    if (num_cols_A_offd)
3858    {
3859       CF_marker_offd = hypre_CTAlloc(HYPRE_Int, num_cols_A_offd,HYPRE_MEMORY_HOST);
3860    }
3861    /* function type indicator for the off-diag columns */
3862    if (num_functions > 1 && num_cols_A_offd)
3863    {
3864       dof_func_offd = hypre_CTAlloc(HYPRE_Int, num_cols_A_offd,HYPRE_MEMORY_HOST);
3865    }
3866    /* if CommPkg of A is not present, create it */
3867    if (!comm_pkg)
3868    {
3869       hypre_MatvecCommPkgCreate(A);
3870       comm_pkg = hypre_ParCSRMatrixCommPkg(A);
3871    }
3872    /* number of sends to do (number of procs) */
3873    num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
3874    /* send buffer, of size send_map_starts[num_sends]),
3875     * i.e., number of entries to send */
3876    int_buf_data = hypre_CTAlloc(HYPRE_Int, hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),HYPRE_MEMORY_HOST);
3877 
3878    /* copy CF markers of elements to send to buffer
3879     * RL: why copy them with two for loops? Why not just loop through all in one */
3880    index = 0;
3881    for (i = 0; i < num_sends; i++)
3882    {
3883       /* start pos of elements sent to send_proc[i] */
3884       start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
3885       /* loop through all elems to send_proc[i] */
3886       for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
3887       {
3888          /* CF marker of send_map_elemts[j] */
3889          int_buf_data[index++] = CF_marker[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
3890       }
3891    }
3892    /* create a handle to start communication. 11: for integer */
3893    comm_handle = hypre_ParCSRCommHandleCreate(11, comm_pkg, int_buf_data, CF_marker_offd);
3894    /* destroy the handle to finish communication */
3895    hypre_ParCSRCommHandleDestroy(comm_handle);
3896 
3897    /* do a similar communication for dof_func */
3898    if (num_functions > 1)
3899    {
3900       index = 0;
3901       for (i = 0; i < num_sends; i++)
3902       {
3903          start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
3904          for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
3905          {
3906             int_buf_data[index++] = dof_func[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
3907          }
3908       }
3909       comm_handle = hypre_ParCSRCommHandleCreate(11, comm_pkg, int_buf_data, dof_func_offd);
3910       hypre_ParCSRCommHandleDestroy(comm_handle);
3911    }
3912 
3913    hypre_TFree(int_buf_data,HYPRE_MEMORY_HOST);
3914    /*-----------------------------------------------------------------------
3915     *  First Pass: Determine size of P and fill in fine_to_coarse mapping,
3916     *  and find the most strongly influencing C-pt for each F-pt
3917     *-----------------------------------------------------------------------*/
3918    /* nnz in diag and offd parts */
3919    cnt_diag = 0;
3920    cnt_offd = 0;
3921    max_abs_cij       = hypre_CTAlloc(HYPRE_Int, n_fine,HYPRE_MEMORY_HOST);
3922    max_abs_diag_offd = hypre_CTAlloc(char, n_fine,HYPRE_MEMORY_HOST);
3923    fine_to_coarse    = hypre_CTAlloc(HYPRE_Int, n_fine,HYPRE_MEMORY_HOST);
3924 
3925    /* markers initialized as zeros */
3926    marker_diag = hypre_CTAlloc(HYPRE_Int, n_fine,HYPRE_MEMORY_HOST);
3927    marker_offd = hypre_CTAlloc(HYPRE_Int, num_cols_A_offd,HYPRE_MEMORY_HOST);
3928 
3929    for (i = 0; i < n_fine; i++)
3930    {
3931       /*--------------------------------------------------------------------
3932        *  If i is a C-point, interpolation is the identity. Also set up
3933        *  mapping vector.
3934        *--------------------------------------------------------------------*/
3935       if (CF_marker[i] >= 0)
3936       {
3937          //fine_to_coarse[i] = my_first_cpt + n_cpts;
3938          fine_to_coarse[i] = n_cpts;
3939          n_cpts++;
3940          continue;
3941       }
3942 
3943       /* mark all the strong connections: in S */
3944       HYPRE_Int MARK = i + 1;
3945       /* loop through row i of S, diag part  */
3946       for (j = S_diag_i[i]; j < S_diag_i[i+1]; j++)
3947       {
3948          marker_diag[S_diag_j[j]] = MARK;
3949       }
3950       /* loop through row i of S, offd part  */
3951       if (num_procs > 1)
3952       {
3953           for (j = S_offd_i[i]; j < S_offd_i[i+1]; j++)
3954           {
3955              j1 = S_offd_j[j];
3956              marker_offd[j1] = MARK;
3957           }
3958       }
3959 
3960       fine_to_coarse[i] = -1;
3961       /*---------------------------------------------------------------------------
3962        *  If i is an F-pt, interpolation is from the most strongly influencing C-pt
3963        *  Find this C-pt and save it
3964        *--------------------------------------------------------------------------*/
3965       /* if we failed to find any strong C-pt, mark this point as an 'n' */
3966       char marker = 'n';
3967       /* max abs val */
3968       max_abs_aij = -1.0;
3969       /* loop through row i of A, diag part  */
3970       for (j = A_diag_i[i]; j < A_diag_i[i+1]; j++)
3971       {
3972          i1 = A_diag_j[j];
3973          vv = fabs(A_diag_data[j]);
3974 #if 0
3975          /* !!! this is a hack just for code verification purpose !!!
3976             it basically says:
3977             1. if we see |a_ij| < 1e-14, force it to be 1e-14
3978             2. if we see |a_ij| == the max(|a_ij|) so far exactly,
3979                replace it if the j idx is smaller
3980             Reasons:
3981             1. numerical round-off for eps-level values
3982             2. entries in CSR rows may be listed in different orders
3983          */
3984          vv = vv < 1e-14 ? 1e-14 : vv;
3985          if (CF_marker[i1] >= 0 && marker_diag[i1] == MARK &&
3986              vv == max_abs_aij && i1 < max_abs_cij[i])
3987          {
3988             /* mark it as a 'd' */
3989             marker         = 'd';
3990             max_abs_cij[i] = i1;
3991             max_abs_aij    = vv;
3992             continue;
3993          }
3994 #endif
3995          /* it is a strong C-pt and has abs val larger than what have seen */
3996          if (CF_marker[i1] >= 0 && marker_diag[i1] == MARK && vv > max_abs_aij)
3997          {
3998             /* mark it as a 'd' */
3999             marker         = 'd';
4000             max_abs_cij[i] = i1;
4001             max_abs_aij    = vv;
4002          }
4003       }
4004       /* offd part */
4005       if (num_procs > 1)
4006       {
4007          for (j = A_offd_i[i]; j < A_offd_i[i+1]; j++)
4008          {
4009             i1 = A_offd_j[j];
4010             vv = fabs(A_offd_data[j]);
4011             if (CF_marker_offd[i1] >= 0 && marker_offd[i1] == MARK && vv > max_abs_aij)
4012             {
4013                /* mark it as an 'o' */
4014                marker         = 'o';
4015                max_abs_cij[i] = i1;
4016                max_abs_aij    = vv;
4017             }
4018          }
4019       }
4020 
4021       max_abs_diag_offd[i] = marker;
4022 
4023       if (marker == 'd')
4024       {
4025          cnt_diag ++;
4026       }
4027       else if (marker == 'o')
4028       {
4029          cnt_offd ++;
4030       }
4031    }
4032 
4033    nnz_diag = cnt_diag + n_cpts;
4034    nnz_offd = cnt_offd;
4035 
4036    /*------------- allocate arrays */
4037    P_diag_i    = hypre_CTAlloc(HYPRE_Int,  n_fine+1,HYPRE_MEMORY_DEVICE);
4038    P_diag_j    = hypre_CTAlloc(HYPRE_Int,  nnz_diag,HYPRE_MEMORY_DEVICE);
4039    P_diag_data = hypre_CTAlloc(HYPRE_Real, nnz_diag,HYPRE_MEMORY_DEVICE);
4040 
4041    /* not in ``if num_procs > 1'',
4042     * allocation needed even for empty CSR */
4043    P_offd_i    = hypre_CTAlloc(HYPRE_Int,  n_fine+1,HYPRE_MEMORY_DEVICE);
4044    P_offd_j    = hypre_CTAlloc(HYPRE_Int,  nnz_offd,HYPRE_MEMORY_DEVICE);
4045    P_offd_data = hypre_CTAlloc(HYPRE_Real, nnz_offd,HYPRE_MEMORY_DEVICE);
4046 
4047    /* redundant */
4048    P_diag_i[0] = 0;
4049    P_offd_i[0] = 0;
4050 
4051    /* reset counters */
4052    cnt_diag = 0;
4053    cnt_offd = 0;
4054 
4055    /*-----------------------------------------------------------------------
4056     *  Send and receive fine_to_coarse info.
4057     *-----------------------------------------------------------------------*/
4058    fine_to_coarse_offd = hypre_CTAlloc(HYPRE_BigInt, num_cols_A_offd,HYPRE_MEMORY_HOST);
4059    big_int_buf_data = hypre_CTAlloc(HYPRE_BigInt, hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),HYPRE_MEMORY_HOST);
4060    index = 0;
4061    for (i = 0; i < num_sends; i++)
4062    {
4063       start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
4064       for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
4065       {
4066          big_int_buf_data[index++] = my_first_cpt
4067                +(HYPRE_BigInt)fine_to_coarse[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
4068       }
4069    }
4070    comm_handle = hypre_ParCSRCommHandleCreate(21, comm_pkg, big_int_buf_data, fine_to_coarse_offd);
4071    hypre_ParCSRCommHandleDestroy(comm_handle);
4072 
4073    /*-----------------------------------------------------------------------
4074     *  Second Pass: Populate P
4075     *-----------------------------------------------------------------------*/
4076    for (i = 0; i < n_fine; i++)
4077    {
4078       if (CF_marker[i] >= 0)
4079       {
4080          /*--------------------------------------------------------------------
4081           *  If i is a C-point, interpolation is the identity.
4082           *--------------------------------------------------------------------*/
4083          //P_diag_j[cnt_diag] = fine_to_coarse[i] - my_first_cpt;
4084          P_diag_j[cnt_diag] = fine_to_coarse[i];
4085          P_diag_data[cnt_diag++] = 1.0;
4086       }
4087       else
4088       {
4089          /*---------------------------------------------------------------------------
4090           *  If i is an F-pt, interpolation is from the most strongly influencing C-pt
4091           *--------------------------------------------------------------------------*/
4092          if (max_abs_diag_offd[i] == 'd')
4093          {
4094             /* on diag part of P */
4095             j = max_abs_cij[i];
4096             //P_diag_j[cnt_diag] = fine_to_coarse[j] - my_first_cpt;
4097             P_diag_j[cnt_diag] = fine_to_coarse[j];
4098             P_diag_data[cnt_diag++] = 1.0;
4099          }
4100          else if (max_abs_diag_offd[i] == 'o')
4101          {
4102             /* on offd part of P */
4103             j = max_abs_cij[i];
4104             P_offd_j[cnt_offd] = j;
4105             P_offd_data[cnt_offd++] = 1.0;
4106          }
4107       }
4108 
4109       P_diag_i[i+1] = cnt_diag;
4110       P_offd_i[i+1] = cnt_offd;
4111    }
4112 
4113    hypre_assert(cnt_diag == nnz_diag);
4114    hypre_assert(cnt_offd == nnz_offd);
4115 
4116    /* num of cols in the offd part of P */
4117    num_cols_offd_P = 0;
4118 
4119    /* marker_offd: all -1 */
4120    for (i = 0; i < num_cols_A_offd; i++)
4121    {
4122       marker_offd[i] = -1;
4123    }
4124    for (i = 0; i < nnz_offd; i++)
4125    {
4126       i1 = P_offd_j[i];
4127       if (marker_offd[i1] == -1)
4128       {
4129          num_cols_offd_P++;
4130          marker_offd[i1] = 1;
4131       }
4132    }
4133 
4134    /* col_map_offd_P: the col indices of the offd of P
4135     * we first keep them be the offd-idx of A */
4136    col_map_offd_P = hypre_CTAlloc(HYPRE_BigInt, num_cols_offd_P,HYPRE_MEMORY_HOST);
4137    tmp_map_offd = hypre_CTAlloc(HYPRE_Int, num_cols_offd_P,HYPRE_MEMORY_HOST);
4138    for (i = 0, i1 = 0; i < num_cols_A_offd; i++)
4139    {
4140       if (marker_offd[i] == 1)
4141       {
4142          tmp_map_offd[i1++] = i;
4143       }
4144    }
4145    hypre_assert(i1 == num_cols_offd_P);
4146 
4147    /* now, adjust P_offd_j to local idx w.r.t col_map_offd_R
4148     * by searching */
4149    for (i = 0; i < nnz_offd; i++)
4150    {
4151       i1 = P_offd_j[i];
4152       k1 = hypre_BinarySearch(tmp_map_offd, i1, num_cols_offd_P);
4153       /* search must succeed */
4154       hypre_assert(k1 >= 0 && k1 < num_cols_offd_P);
4155       P_offd_j[i] = k1;
4156    }
4157 
4158    /* change col_map_offd_P to global coarse ids */
4159    for (i = 0; i < num_cols_offd_P; i++)
4160    {
4161       col_map_offd_P[i] = fine_to_coarse_offd[tmp_map_offd[i]];
4162    }
4163 
4164    /* Now, we should have everything of Parcsr matrix P */
4165    P = hypre_ParCSRMatrixCreate(comm,
4166                                 hypre_ParCSRMatrixGlobalNumCols(A), /* global num of rows */
4167                                 total_global_cpts, /* global num of cols */
4168                                 hypre_ParCSRMatrixColStarts(A), /* row_starts */
4169                                 num_cpts_global, /* col_starts */
4170                                 num_cols_offd_P, /* num cols offd */
4171                                 nnz_diag,
4172                                 nnz_offd);
4173 
4174    P_diag = hypre_ParCSRMatrixDiag(P);
4175    hypre_CSRMatrixData(P_diag) = P_diag_data;
4176    hypre_CSRMatrixI(P_diag)    = P_diag_i;
4177    hypre_CSRMatrixJ(P_diag)    = P_diag_j;
4178 
4179    P_offd = hypre_ParCSRMatrixOffd(P);
4180    hypre_CSRMatrixData(P_offd) = P_offd_data;
4181    hypre_CSRMatrixI(P_offd)    = P_offd_i;
4182    hypre_CSRMatrixJ(P_offd)    = P_offd_j;
4183 
4184    hypre_ParCSRMatrixColMapOffd(P) = col_map_offd_P;
4185 
4186    /* create CommPkg of P */
4187    hypre_MatvecCommPkgCreate(P);
4188 
4189    *P_ptr = P;
4190 
4191    /* free workspace */
4192    hypre_TFree(CF_marker_offd,HYPRE_MEMORY_HOST);
4193    hypre_TFree(dof_func_offd,HYPRE_MEMORY_HOST);
4194    hypre_TFree(tmp_map_offd,HYPRE_MEMORY_HOST);
4195    hypre_TFree(big_int_buf_data,HYPRE_MEMORY_HOST);
4196    hypre_TFree(fine_to_coarse,HYPRE_MEMORY_HOST);
4197    hypre_TFree(fine_to_coarse_offd,HYPRE_MEMORY_HOST);
4198    hypre_TFree(marker_diag,HYPRE_MEMORY_HOST);
4199    hypre_TFree(marker_offd,HYPRE_MEMORY_HOST);
4200    hypre_TFree(max_abs_cij,HYPRE_MEMORY_HOST);
4201    hypre_TFree(max_abs_diag_offd,HYPRE_MEMORY_HOST);
4202 
4203    return hypre_error_flag;
4204 }
4205 
4206 HYPRE_Int
hypre_BoomerAMGBuildInterpOnePnt(hypre_ParCSRMatrix * A,HYPRE_Int * CF_marker,hypre_ParCSRMatrix * S,HYPRE_BigInt * num_cpts_global,HYPRE_Int num_functions,HYPRE_Int * dof_func,HYPRE_Int debug_flag,hypre_ParCSRMatrix ** P_ptr)4207 hypre_BoomerAMGBuildInterpOnePnt( hypre_ParCSRMatrix  *A,
4208                                   HYPRE_Int           *CF_marker,
4209                                   hypre_ParCSRMatrix  *S,
4210                                   HYPRE_BigInt        *num_cpts_global,
4211                                   HYPRE_Int            num_functions,
4212                                   HYPRE_Int           *dof_func,
4213                                   HYPRE_Int            debug_flag,
4214                                   hypre_ParCSRMatrix **P_ptr)
4215 {
4216 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
4217    hypre_GpuProfilingPushRange("OnePntInterp");
4218 #endif
4219 
4220    HYPRE_Int ierr = 0;
4221 
4222 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
4223    HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) );
4224 
4225    if (exec == HYPRE_EXEC_DEVICE)
4226    {
4227       ierr = hypre_BoomerAMGBuildInterpOnePntDevice(A,CF_marker,S,num_cpts_global,num_functions,
4228                                                    dof_func,debug_flag,P_ptr);
4229    }
4230    else
4231 #endif
4232    {
4233       ierr = hypre_BoomerAMGBuildInterpOnePntHost(A,CF_marker,S,num_cpts_global,num_functions,
4234                                                    dof_func,debug_flag,P_ptr);
4235    }
4236 
4237 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
4238    hypre_GpuProfilingPopRange();
4239 #endif
4240 
4241    return ierr;
4242 }
4243