1 /******************************************************************************
2  * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other
3  * HYPRE Project Developers. See the top-level COPYRIGHT file for details.
4  *
5  * SPDX-License-Identifier: (Apache-2.0 OR MIT)
6  ******************************************************************************/
7 
8 #include "_hypre_parcsr_ls.h"
9 #include "aux_interp.h"
10 
11 /*---------------------------------------------------------------------------
12  * hypre_BoomerAMGBuildModExtInterp
13  *  Comment:
14  *--------------------------------------------------------------------------*/
15 HYPRE_Int
hypre_BoomerAMGBuildModExtInterpHost(hypre_ParCSRMatrix * A,HYPRE_Int * CF_marker,hypre_ParCSRMatrix * S,HYPRE_BigInt * num_cpts_global,HYPRE_Int num_functions,HYPRE_Int * dof_func,HYPRE_Int debug_flag,HYPRE_Real trunc_factor,HYPRE_Int max_elmts,hypre_ParCSRMatrix ** P_ptr)16 hypre_BoomerAMGBuildModExtInterpHost(hypre_ParCSRMatrix  *A,
17                                      HYPRE_Int           *CF_marker,
18                                      hypre_ParCSRMatrix  *S,
19                                      HYPRE_BigInt        *num_cpts_global,
20                                      HYPRE_Int            num_functions,
21                                      HYPRE_Int           *dof_func,
22                                      HYPRE_Int            debug_flag,
23                                      HYPRE_Real           trunc_factor,
24                                      HYPRE_Int            max_elmts,
25                                      hypre_ParCSRMatrix **P_ptr)
26 {
27    /* Communication Variables */
28    MPI_Comm              comm = hypre_ParCSRMatrixComm(A);
29    HYPRE_MemoryLocation  memory_location_P = hypre_ParCSRMatrixMemoryLocation(A);
30    hypre_ParCSRCommPkg     *comm_pkg = hypre_ParCSRMatrixCommPkg(A);
31    hypre_ParCSRCommHandle  *comm_handle = NULL;
32    HYPRE_Int             my_id, num_procs;
33 
34    /* Variables to store input variables */
35    hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
36    HYPRE_Real      *A_diag_data = hypre_CSRMatrixData(A_diag);
37    HYPRE_Int       *A_diag_j = hypre_CSRMatrixJ(A_diag);
38    HYPRE_Int       *A_diag_i = hypre_CSRMatrixI(A_diag);
39 
40    hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A);
41    HYPRE_Real      *A_offd_data = hypre_CSRMatrixData(A_offd);
42    HYPRE_Int       *A_offd_j = hypre_CSRMatrixJ(A_offd);
43    HYPRE_Int       *A_offd_i = hypre_CSRMatrixI(A_offd);
44 
45    hypre_CSRMatrix *S_diag = hypre_ParCSRMatrixDiag(S);
46    HYPRE_Int       *S_diag_j = hypre_CSRMatrixJ(S_diag);
47    HYPRE_Int       *S_diag_i = hypre_CSRMatrixI(S_diag);
48 
49    hypre_CSRMatrix *S_offd = hypre_ParCSRMatrixOffd(S);
50    HYPRE_Int       *S_offd_j = hypre_CSRMatrixJ(S_offd);
51    HYPRE_Int       *S_offd_i = hypre_CSRMatrixI(S_offd);
52 
53    HYPRE_Int        n_fine = hypre_CSRMatrixNumRows(A_diag);
54    HYPRE_BigInt     total_global_cpts;
55 
56    /* Interpolation matrix P */
57    hypre_ParCSRMatrix *P;
58    hypre_CSRMatrix    *P_diag;
59    hypre_CSRMatrix    *P_offd;
60 
61    HYPRE_Real      *P_diag_data = NULL;
62    HYPRE_Int       *P_diag_i, *P_diag_j = NULL;
63    HYPRE_Real      *P_offd_data = NULL;
64    HYPRE_Int       *P_offd_i, *P_offd_j = NULL;
65 
66    /* Intermediate matrices */
67    hypre_ParCSRMatrix *As_FF, *As_FC, *W;
68    HYPRE_Real *D_q, *D_w;
69    hypre_CSRMatrix *As_FF_diag;
70    hypre_CSRMatrix *As_FF_offd;
71    hypre_CSRMatrix *As_FC_diag;
72    hypre_CSRMatrix *As_FC_offd;
73    hypre_CSRMatrix *W_diag;
74    hypre_CSRMatrix *W_offd;
75 
76    HYPRE_Int *As_FF_diag_i;
77    HYPRE_Int *As_FF_offd_i;
78    HYPRE_Int *As_FC_diag_i;
79    HYPRE_Int *As_FC_offd_i;
80    HYPRE_Int *W_diag_i;
81    HYPRE_Int *W_offd_i;
82    HYPRE_Int *W_diag_j;
83    HYPRE_Int *W_offd_j;
84 
85    HYPRE_Real *As_FF_diag_data;
86    HYPRE_Real *As_FF_offd_data;
87    HYPRE_Real *As_FC_diag_data;
88    HYPRE_Real *As_FC_offd_data;
89    HYPRE_Real *W_diag_data;
90    HYPRE_Real *W_offd_data;
91 
92    HYPRE_BigInt    *col_map_offd_P = NULL;
93    HYPRE_BigInt    *new_col_map_offd = NULL;
94    HYPRE_Int        P_diag_size;
95    HYPRE_Int        P_offd_size;
96    HYPRE_Int        new_ncols_P_offd;
97    HYPRE_Int        num_cols_P_offd;
98    HYPRE_Int       *P_marker = NULL;
99    HYPRE_Int       *dof_func_offd = NULL;
100 
101    /* Loop variables */
102    HYPRE_Int        index;
103    HYPRE_Int        i, j;
104    HYPRE_Int       *cpt_array;
105    HYPRE_Int       *start_array;
106    HYPRE_Int       *startf_array;
107    HYPRE_Int start, stop, startf, stopf;
108    HYPRE_Int cnt_diag, cnt_offd, row, c_pt;
109 
110    /* Definitions */
111    //HYPRE_Real       wall_time;
112    HYPRE_Int n_Cpts, n_Fpts;
113    HYPRE_Int num_threads = hypre_NumThreads();
114 
115    //if (debug_flag==4) wall_time = time_getWallclockSeconds();
116 
117    /* BEGIN */
118    hypre_MPI_Comm_size(comm, &num_procs);
119    hypre_MPI_Comm_rank(comm,&my_id);
120 
121    if (my_id == (num_procs -1)) total_global_cpts = num_cpts_global[1];
122    hypre_MPI_Bcast(&total_global_cpts, 1, HYPRE_MPI_BIG_INT, num_procs-1, comm);
123    n_Cpts = num_cpts_global[1]-num_cpts_global[0];
124 
125    hypre_ParCSRMatrixGenerateFFFC(A, CF_marker, num_cpts_global, S, &As_FC, &As_FF);
126 
127    As_FC_diag = hypre_ParCSRMatrixDiag(As_FC);
128    As_FC_diag_i = hypre_CSRMatrixI(As_FC_diag);
129    As_FC_diag_data = hypre_CSRMatrixData(As_FC_diag);
130    As_FC_offd = hypre_ParCSRMatrixOffd(As_FC);
131    As_FC_offd_i = hypre_CSRMatrixI(As_FC_offd);
132    As_FC_offd_data = hypre_CSRMatrixData(As_FC_offd);
133    As_FF_diag = hypre_ParCSRMatrixDiag(As_FF);
134    As_FF_diag_i = hypre_CSRMatrixI(As_FF_diag);
135    As_FF_diag_data = hypre_CSRMatrixData(As_FF_diag);
136    As_FF_offd = hypre_ParCSRMatrixOffd(As_FF);
137    As_FF_offd_i = hypre_CSRMatrixI(As_FF_offd);
138    As_FF_offd_data = hypre_CSRMatrixData(As_FF_offd);
139    n_Fpts = hypre_CSRMatrixNumRows(As_FF_diag);
140 
141    D_q = hypre_CTAlloc(HYPRE_Real, n_Fpts, HYPRE_MEMORY_HOST);
142    D_w = hypre_CTAlloc(HYPRE_Real, n_Fpts, HYPRE_MEMORY_HOST);
143    cpt_array = hypre_CTAlloc(HYPRE_Int, num_threads, HYPRE_MEMORY_HOST);
144    start_array = hypre_CTAlloc(HYPRE_Int, num_threads+1, HYPRE_MEMORY_HOST);
145    startf_array = hypre_CTAlloc(HYPRE_Int, num_threads+1, HYPRE_MEMORY_HOST);
146 
147 #ifdef HYPRE_USING_OPENMP
148 #pragma omp parallel private(i,j,start,stop,startf,stopf,row)
149 #endif
150    {
151       HYPRE_Int my_thread_num = hypre_GetThreadNum();
152       HYPRE_Real beta, gamma;
153 
154       start = (n_fine/num_threads)*my_thread_num;
155       if (my_thread_num == num_threads-1)
156       {
157          stop = n_fine;
158       }
159       else
160       {
161          stop = (n_fine/num_threads)*(my_thread_num+1);
162       }
163       start_array[my_thread_num+1] = stop;
164       for (i=start; i < stop; i++)
165       {
166          if (CF_marker[i] > 0)
167          {
168             cpt_array[my_thread_num]++;
169          }
170       }
171 
172 #ifdef HYPRE_USING_OPENMP
173 #pragma omp barrier
174 #endif
175       if (my_thread_num == 0)
176       {
177          for (i=1; i < num_threads; i++)
178          {
179             cpt_array[i] += cpt_array[i-1];
180          }
181          if (num_functions > 1)
182          {
183             HYPRE_Int *int_buf_data = NULL;
184             HYPRE_Int num_sends, startc;
185             HYPRE_Int num_cols_A_offd = hypre_CSRMatrixNumCols(A_offd);
186             dof_func_offd = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
187             index = 0;
188             num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
189             int_buf_data = hypre_CTAlloc(HYPRE_Int, hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends), HYPRE_MEMORY_HOST);
190             for (i = 0; i < num_sends; i++)
191             {
192                startc = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
193                for (j = startc; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
194                {
195                   int_buf_data[index++] = dof_func[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
196                }
197             }
198             comm_handle = hypre_ParCSRCommHandleCreate( 11, comm_pkg, int_buf_data, dof_func_offd);
199             hypre_ParCSRCommHandleDestroy(comm_handle);
200             hypre_TFree(int_buf_data, HYPRE_MEMORY_HOST);
201          }
202       }
203 
204 #ifdef HYPRE_USING_OPENMP
205 #pragma omp barrier
206 #endif
207       if (my_thread_num > 0)
208          startf = start - cpt_array[my_thread_num-1];
209       else
210          startf = 0;
211 
212       if (my_thread_num < num_threads-1)
213          stopf = stop - cpt_array[my_thread_num];
214       else
215          stopf = n_Fpts;
216 
217       startf_array[my_thread_num+1] = stopf;
218 
219       /* Create D_q = D_beta */
220       for (i=startf; i < stopf; i++)
221       {
222          for (j=As_FC_diag_i[i]; j < As_FC_diag_i[i+1]; j++)
223          {
224             D_q[i] += As_FC_diag_data[j];
225          }
226          for (j=As_FC_offd_i[i]; j < As_FC_offd_i[i+1]; j++)
227          {
228             D_q[i] += As_FC_offd_data[j];
229          }
230       }
231 
232       /* Create D_w = D_alpha + D_gamma */
233       row = startf;
234       for (i=start; i < stop; i++)
235       {
236          if (CF_marker[i] < 0)
237          {
238             if (num_functions > 1)
239             {
240                HYPRE_Int jA, jS, jC;
241                jC = A_diag_i[i];
242                for (j=S_diag_i[i]; j < S_diag_i[i+1]; j++)
243                {
244                   jS = S_diag_j[j];
245                   jA = A_diag_j[jC];
246                   while (jA != jS)
247                   {
248                      if (dof_func[i] == dof_func[jA])
249                      {
250                         D_w[row] += A_diag_data[jC++];
251                      }
252                      else
253                         jC++;
254                      jA = A_diag_j[jC];
255                   }
256                   jC++;
257                }
258                for (j=jC; j < A_diag_i[i+1]; j++)
259                {
260                   if (dof_func[i] == dof_func[A_diag_j[j]])
261                         D_w[row] += A_diag_data[j];
262                }
263                jC = A_offd_i[i];
264                for (j=S_offd_i[i]; j < S_offd_i[i+1]; j++)
265                {
266                   jS = S_offd_j[j];
267                   jA = A_offd_j[jC];
268                   while (jA != jS)
269                   {
270                      if (dof_func[i] == dof_func_offd[jA])
271                      {
272                         D_w[row] += A_offd_data[jC++];
273                      }
274                      else
275                         jC++;
276                      jA = A_offd_j[jC];
277                   }
278                   jC++;
279                }
280                for (j=jC; j < A_offd_i[i+1]; j++)
281                {
282                   if (dof_func[i] == dof_func_offd[A_offd_j[j]])
283                         D_w[row] += A_offd_data[j];
284                }
285                row++;
286             }
287             else
288             {
289                for (j=A_diag_i[i]; j < A_diag_i[i+1]; j++)
290                {
291                   D_w[row] += A_diag_data[j];
292                }
293                for (j=A_offd_i[i]; j < A_offd_i[i+1]; j++)
294                {
295                   D_w[row] += A_offd_data[j];
296                }
297                for (j=As_FF_diag_i[row]+1; j < As_FF_diag_i[row+1]; j++)
298                {
299                   D_w[row] -= As_FF_diag_data[j];
300                }
301                for (j=As_FF_offd_i[row]; j < As_FF_offd_i[row+1]; j++)
302                {
303                   D_w[row] -= As_FF_offd_data[j];
304                }
305                D_w[row] -= D_q[row];
306                row++;
307             }
308          }
309       }
310 
311       for (i=startf; i<stopf; i++)
312       {
313          j = As_FF_diag_i[i];
314          if (D_w[i]) beta = 1.0/D_w[i];
315          else beta = 1.0;
316          As_FF_diag_data[j] = beta*D_q[i];
317          if (D_q[i]) gamma = -1.0/D_q[i];
318          else gamma = 1.0;
319          for (j=As_FF_diag_i[i]+1; j < As_FF_diag_i[i+1]; j++)
320             As_FF_diag_data[j] *= beta;
321          for (j=As_FF_offd_i[i]; j < As_FF_offd_i[i+1]; j++)
322             As_FF_offd_data[j] *= beta;
323          for (j=As_FC_diag_i[i]; j < As_FC_diag_i[i+1]; j++)
324             As_FC_diag_data[j] *= gamma;
325          for (j=As_FC_offd_i[i]; j < As_FC_offd_i[i+1]; j++)
326             As_FC_offd_data[j] *= gamma;
327       }
328 
329    }   /* end parallel region */
330 
331    W = hypre_ParMatmul(As_FF, As_FC);
332    W_diag = hypre_ParCSRMatrixDiag(W);
333    W_offd = hypre_ParCSRMatrixOffd(W);
334    W_diag_i = hypre_CSRMatrixI(W_diag);
335    W_diag_j = hypre_CSRMatrixJ(W_diag);
336    W_diag_data = hypre_CSRMatrixData(W_diag);
337    W_offd_i = hypre_CSRMatrixI(W_offd);
338    W_offd_j = hypre_CSRMatrixJ(W_offd);
339    W_offd_data = hypre_CSRMatrixData(W_offd);
340    num_cols_P_offd = hypre_CSRMatrixNumCols(W_offd);
341    /*-----------------------------------------------------------------------
342     *  Intialize data for P
343     *-----------------------------------------------------------------------*/
344    P_diag_i    = hypre_CTAlloc(HYPRE_Int,  n_fine+1, memory_location_P);
345    P_offd_i    = hypre_CTAlloc(HYPRE_Int,  n_fine+1, memory_location_P);
346 
347    P_diag_size = n_Cpts + hypre_CSRMatrixI(W_diag)[n_Fpts];
348    P_offd_size = hypre_CSRMatrixI(W_offd)[n_Fpts];
349 
350    if (P_diag_size)
351    {
352       P_diag_j    = hypre_CTAlloc(HYPRE_Int,  P_diag_size, memory_location_P);
353       P_diag_data = hypre_CTAlloc(HYPRE_Real,  P_diag_size, memory_location_P);
354    }
355 
356    if (P_offd_size)
357    {
358       P_offd_j    = hypre_CTAlloc(HYPRE_Int,  P_offd_size, memory_location_P);
359       P_offd_data = hypre_CTAlloc(HYPRE_Real,  P_offd_size, memory_location_P);
360    }
361 
362 #ifdef HYPRE_USING_OPENMP
363 #pragma omp parallel private(i,j,start,stop,startf,stopf,c_pt,row,cnt_diag,cnt_offd)
364 #endif
365    {
366       HYPRE_Int my_thread_num = hypre_GetThreadNum();
367       startf = startf_array[my_thread_num];
368       stopf = startf_array[my_thread_num+1];
369       start = start_array[my_thread_num];
370       stop = start_array[my_thread_num+1];
371 
372       if (my_thread_num > 0)
373          c_pt = cpt_array[my_thread_num-1];
374       else
375          c_pt = 0;
376       cnt_diag = W_diag_i[startf]+c_pt;
377       cnt_offd = W_offd_i[startf];
378       row = startf;
379       for (i=start; i < stop; i++)
380       {
381          if (CF_marker[i] > 0)
382          {
383             P_diag_j[cnt_diag] = c_pt++;
384             P_diag_data[cnt_diag++] = 1.0;
385          }
386          else
387          {
388             for (j=W_diag_i[row]; j < W_diag_i[row+1]; j++)
389             {
390                P_diag_j[cnt_diag] = W_diag_j[j];
391                P_diag_data[cnt_diag++] = W_diag_data[j];
392             }
393             for (j=W_offd_i[row]; j < W_offd_i[row+1]; j++)
394             {
395                P_offd_j[cnt_offd] = W_offd_j[j];
396                P_offd_data[cnt_offd++] = W_offd_data[j];
397             }
398             row++;
399          }
400          P_diag_i[i+1] = cnt_diag;
401          P_offd_i[i+1] = cnt_offd;
402       }
403 
404    }   /* end parallel region */
405 
406    /*-----------------------------------------------------------------------
407     *  Create matrix
408     *-----------------------------------------------------------------------*/
409 
410    P = hypre_ParCSRMatrixCreate(comm,
411          hypre_ParCSRMatrixGlobalNumRows(A),
412          total_global_cpts,
413          hypre_ParCSRMatrixColStarts(A),
414          num_cpts_global,
415          num_cols_P_offd,
416          P_diag_i[n_fine],
417          P_offd_i[n_fine]);
418 
419    P_diag = hypre_ParCSRMatrixDiag(P);
420    hypre_CSRMatrixData(P_diag) = P_diag_data;
421    hypre_CSRMatrixI(P_diag) = P_diag_i;
422    hypre_CSRMatrixJ(P_diag) = P_diag_j;
423    P_offd = hypre_ParCSRMatrixOffd(P);
424    hypre_CSRMatrixData(P_offd) = P_offd_data;
425    hypre_CSRMatrixI(P_offd) = P_offd_i;
426    hypre_CSRMatrixJ(P_offd) = P_offd_j;
427    hypre_ParCSRMatrixColMapOffd(P) = hypre_ParCSRMatrixColMapOffd(W);
428    hypre_ParCSRMatrixColMapOffd(W) = NULL;
429 
430    hypre_CSRMatrixMemoryLocation(P_diag) = memory_location_P;
431    hypre_CSRMatrixMemoryLocation(P_offd) = memory_location_P;
432 
433    /* Compress P, removing coefficients smaller than trunc_factor * Max */
434    if (trunc_factor != 0.0 || max_elmts > 0)
435    {
436       HYPRE_Int *map;
437       hypre_BoomerAMGInterpTruncation(P, trunc_factor, max_elmts);
438       P_diag_data = hypre_CSRMatrixData(P_diag);
439       P_diag_i = hypre_CSRMatrixI(P_diag);
440       P_diag_j = hypre_CSRMatrixJ(P_diag);
441       P_offd_data = hypre_CSRMatrixData(P_offd);
442       P_offd_i = hypre_CSRMatrixI(P_offd);
443       P_offd_j = hypre_CSRMatrixJ(P_offd);
444       P_diag_size = P_diag_i[n_fine];
445       P_offd_size = P_offd_i[n_fine];
446 
447       col_map_offd_P = hypre_ParCSRMatrixColMapOffd(P);
448       if (num_cols_P_offd)
449       {
450          P_marker = hypre_CTAlloc(HYPRE_Int, num_cols_P_offd, HYPRE_MEMORY_HOST);
451          for (i=0; i < P_offd_size; i++)
452          {
453             P_marker[P_offd_j[i]] = 1;
454          }
455 
456          new_ncols_P_offd = 0;
457          for (i=0; i < num_cols_P_offd; i++)
458          {
459             if (P_marker[i]) new_ncols_P_offd++;
460          }
461 
462          new_col_map_offd = hypre_CTAlloc(HYPRE_BigInt, new_ncols_P_offd, HYPRE_MEMORY_HOST);
463          map = hypre_CTAlloc(HYPRE_Int, new_ncols_P_offd, HYPRE_MEMORY_HOST);
464 
465          index = 0;
466          for (i=0; i < num_cols_P_offd; i++)
467             if (P_marker[i])
468             {
469                new_col_map_offd[index] = col_map_offd_P[i];
470                map[index++] = i;
471             }
472          hypre_TFree(P_marker, HYPRE_MEMORY_HOST);
473 
474 
475 #ifdef HYPRE_USING_OPENMP
476 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
477 #endif
478          for (i=0; i < P_offd_size; i++)
479          {
480             P_offd_j[i] = hypre_BinarySearch(map, P_offd_j[i],
481                new_ncols_P_offd);
482          }
483 
484          hypre_TFree(col_map_offd_P, HYPRE_MEMORY_HOST);
485          hypre_ParCSRMatrixColMapOffd(P) = new_col_map_offd;
486          hypre_CSRMatrixNumCols(P_offd) = new_ncols_P_offd;
487          hypre_TFree(map, HYPRE_MEMORY_HOST);
488       }
489    }
490 
491    hypre_MatvecCommPkgCreate(P);
492 
493    *P_ptr = P;
494 
495    /* Deallocate memory */
496    hypre_TFree(D_q, HYPRE_MEMORY_HOST);
497    hypre_TFree(D_w, HYPRE_MEMORY_HOST);
498    hypre_TFree(cpt_array, HYPRE_MEMORY_HOST);
499    hypre_TFree(start_array, HYPRE_MEMORY_HOST);
500    hypre_TFree(startf_array, HYPRE_MEMORY_HOST);
501    hypre_TFree(dof_func_offd, HYPRE_MEMORY_HOST);
502    hypre_ParCSRMatrixDestroy(As_FF);
503    hypre_ParCSRMatrixDestroy(As_FC);
504    hypre_ParCSRMatrixDestroy(W);
505 
506    return hypre_error_flag;
507 }
508 
509 /*-----------------------------------------------------------------------*
510  * Modularized Extended Interpolation
511  *-----------------------------------------------------------------------*/
512 HYPRE_Int
hypre_BoomerAMGBuildModExtInterp(hypre_ParCSRMatrix * A,HYPRE_Int * CF_marker,hypre_ParCSRMatrix * S,HYPRE_BigInt * num_cpts_global,HYPRE_Int num_functions,HYPRE_Int * dof_func,HYPRE_Int debug_flag,HYPRE_Real trunc_factor,HYPRE_Int max_elmts,hypre_ParCSRMatrix ** P_ptr)513 hypre_BoomerAMGBuildModExtInterp(hypre_ParCSRMatrix  *A,
514                                  HYPRE_Int           *CF_marker,
515                                  hypre_ParCSRMatrix  *S,
516                                  HYPRE_BigInt        *num_cpts_global,
517                                  HYPRE_Int            num_functions,
518                                  HYPRE_Int           *dof_func,
519                                  HYPRE_Int            debug_flag,
520                                  HYPRE_Real           trunc_factor,
521                                  HYPRE_Int            max_elmts,
522                                  hypre_ParCSRMatrix **P_ptr)
523 {
524 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
525    hypre_GpuProfilingPushRange("ModExtInterp");
526 #endif
527 
528    HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) );
529 
530    HYPRE_Int ierr = 0;
531 
532    if (exec == HYPRE_EXEC_HOST)
533    {
534       ierr = hypre_BoomerAMGBuildModExtInterpHost(A,CF_marker,S,num_cpts_global,num_functions,dof_func,
535                                                   debug_flag,trunc_factor,max_elmts,P_ptr);
536    }
537 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
538    else
539    {
540       ierr = hypre_BoomerAMGBuildExtInterpDevice(A,CF_marker,S,num_cpts_global,1,NULL,
541                                                  debug_flag,trunc_factor,max_elmts,P_ptr);
542    }
543 #endif
544 
545 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
546    hypre_GpuProfilingPopRange();
547 #endif
548 
549    return ierr;
550 }
551 
552 
553 /*---------------------------------------------------------------------------
554  * hypre_BoomerAMGBuildModExtPIInterp
555  *  Comment:
556  *--------------------------------------------------------------------------*/
557 HYPRE_Int
hypre_BoomerAMGBuildModExtPIInterpHost(hypre_ParCSRMatrix * A,HYPRE_Int * CF_marker,hypre_ParCSRMatrix * S,HYPRE_BigInt * num_cpts_global,HYPRE_Int debug_flag,HYPRE_Int num_functions,HYPRE_Int * dof_func,HYPRE_Real trunc_factor,HYPRE_Int max_elmts,hypre_ParCSRMatrix ** P_ptr)558 hypre_BoomerAMGBuildModExtPIInterpHost(hypre_ParCSRMatrix  *A,
559                                        HYPRE_Int           *CF_marker,
560                                        hypre_ParCSRMatrix  *S,
561                                        HYPRE_BigInt        *num_cpts_global,
562                                        HYPRE_Int            debug_flag,
563                                        HYPRE_Int            num_functions,
564                                        HYPRE_Int           *dof_func,
565                                        HYPRE_Real           trunc_factor,
566                                        HYPRE_Int            max_elmts,
567                                        hypre_ParCSRMatrix **P_ptr)
568 {
569    /* Communication Variables */
570    MPI_Comm                 comm = hypre_ParCSRMatrixComm(A);
571    hypre_ParCSRCommPkg     *comm_pkg = hypre_ParCSRMatrixCommPkg(A);
572    hypre_ParCSRCommHandle  *comm_handle = NULL;
573    HYPRE_MemoryLocation     memory_location_P = hypre_ParCSRMatrixMemoryLocation(A);
574 
575    HYPRE_Int              my_id, num_procs;
576 
577    /* Variables to store input variables */
578    hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
579    HYPRE_Real      *A_diag_data = hypre_CSRMatrixData(A_diag);
580    HYPRE_Int       *A_diag_i = hypre_CSRMatrixI(A_diag);
581    HYPRE_Int       *A_diag_j = hypre_CSRMatrixJ(A_diag);
582 
583    hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A);
584    HYPRE_Real      *A_offd_data = hypre_CSRMatrixData(A_offd);
585    HYPRE_Int       *A_offd_i = hypre_CSRMatrixI(A_offd);
586    HYPRE_Int       *A_offd_j = hypre_CSRMatrixJ(A_offd);
587 
588    hypre_CSRMatrix *S_diag = hypre_ParCSRMatrixDiag(S);
589    HYPRE_Int       *S_diag_j = hypre_CSRMatrixJ(S_diag);
590    HYPRE_Int       *S_diag_i = hypre_CSRMatrixI(S_diag);
591 
592    hypre_CSRMatrix *S_offd = hypre_ParCSRMatrixOffd(S);
593    HYPRE_Int       *S_offd_j = hypre_CSRMatrixJ(S_offd);
594    HYPRE_Int       *S_offd_i = hypre_CSRMatrixI(S_offd);
595 
596    HYPRE_Int        n_fine = hypre_CSRMatrixNumRows(A_diag);
597    HYPRE_BigInt     total_global_cpts;
598 
599    hypre_CSRMatrix *As_FF_ext = NULL;
600    HYPRE_Real      *As_FF_ext_data = NULL;
601    HYPRE_Int       *As_FF_ext_i = NULL;
602    HYPRE_BigInt    *As_FF_ext_j = NULL;
603 
604    /* Interpolation matrix P */
605    hypre_ParCSRMatrix *P;
606    hypre_CSRMatrix    *P_diag;
607    hypre_CSRMatrix    *P_offd;
608 
609    HYPRE_Real      *P_diag_data = NULL;
610    HYPRE_Int       *P_diag_i, *P_diag_j = NULL;
611    HYPRE_Real      *P_offd_data = NULL;
612    HYPRE_Int       *P_offd_i, *P_offd_j = NULL;
613 
614    /* Intermediate matrices */
615    hypre_ParCSRMatrix *As_FF, *As_FC, *W;
616    HYPRE_Real *D_q, *D_w, *D_theta, *D_q_offd = NULL;
617    hypre_CSRMatrix *As_FF_diag;
618    hypre_CSRMatrix *As_FF_offd;
619    hypre_CSRMatrix *As_FC_diag;
620    hypre_CSRMatrix *As_FC_offd;
621    hypre_CSRMatrix *W_diag;
622    hypre_CSRMatrix *W_offd;
623 
624    HYPRE_Int *As_FF_diag_i;
625    HYPRE_Int *As_FF_diag_j;
626    HYPRE_Int *As_FF_offd_i;
627    HYPRE_Int *As_FF_offd_j = NULL;
628    HYPRE_Int *As_FC_diag_i;
629    HYPRE_Int *As_FC_offd_i;
630    HYPRE_Int *W_diag_i;
631    HYPRE_Int *W_offd_i;
632    HYPRE_Int *W_diag_j;
633    HYPRE_Int *W_offd_j = NULL;
634 
635    HYPRE_Real *As_FF_diag_data;
636    HYPRE_Real *As_FF_offd_data = NULL;
637    HYPRE_Real *As_FC_diag_data;
638    HYPRE_Real *As_FC_offd_data = NULL;
639    HYPRE_Real *W_diag_data;
640    HYPRE_Real *W_offd_data = NULL;
641    HYPRE_Real *buf_data = NULL;
642    HYPRE_Real *tmp_FF_diag_data = NULL;
643 
644    HYPRE_BigInt    *col_map_offd_P = NULL;
645    HYPRE_BigInt    *new_col_map_offd = NULL;
646    HYPRE_BigInt     first_index;
647    HYPRE_Int        P_diag_size;
648    HYPRE_Int        P_offd_size;
649    HYPRE_Int        new_ncols_P_offd;
650    HYPRE_Int        num_cols_P_offd;
651    HYPRE_Int       *P_marker = NULL;
652    HYPRE_Int       *dof_func_offd = NULL;
653 
654    /* Loop variables */
655    HYPRE_Int        index, startc, num_sends;
656    HYPRE_Int        i, j, jj, k, kk;
657    HYPRE_Int       *cpt_array;
658    HYPRE_Int       *start_array;
659    HYPRE_Int       *startf_array;
660    HYPRE_Int start, stop, startf, stopf;
661    HYPRE_Int cnt_diag, cnt_offd, row, c_pt;
662    HYPRE_Int num_cols_A_FF_offd;
663    HYPRE_Real value, value1, theta;
664 
665    /* Definitions */
666    //HYPRE_Real       wall_time;
667    HYPRE_Int n_Cpts, n_Fpts;
668    HYPRE_Int num_threads = hypre_NumThreads();
669 
670    //if (debug_flag==4) wall_time = time_getWallclockSeconds();
671 
672    /* BEGIN */
673    hypre_MPI_Comm_size(comm, &num_procs);
674    hypre_MPI_Comm_rank(comm,&my_id);
675 
676    if (my_id == (num_procs -1)) total_global_cpts = num_cpts_global[1];
677    hypre_MPI_Bcast(&total_global_cpts, 1, HYPRE_MPI_BIG_INT, num_procs-1, comm);
678    n_Cpts = num_cpts_global[1]-num_cpts_global[0];
679 
680    hypre_ParCSRMatrixGenerateFFFC(A, CF_marker, num_cpts_global, S, &As_FC, &As_FF);
681 
682    if (num_procs > 1)
683    {
684       As_FF_ext = hypre_ParCSRMatrixExtractBExt(As_FF,As_FF,1);
685       As_FF_ext_i = hypre_CSRMatrixI(As_FF_ext);
686       As_FF_ext_j = hypre_CSRMatrixBigJ(As_FF_ext);
687       As_FF_ext_data = hypre_CSRMatrixData(As_FF_ext);
688    }
689 
690    As_FC_diag = hypre_ParCSRMatrixDiag(As_FC);
691    As_FC_diag_i = hypre_CSRMatrixI(As_FC_diag);
692    As_FC_diag_data = hypre_CSRMatrixData(As_FC_diag);
693    As_FC_offd = hypre_ParCSRMatrixOffd(As_FC);
694    As_FC_offd_i = hypre_CSRMatrixI(As_FC_offd);
695    As_FC_offd_data = hypre_CSRMatrixData(As_FC_offd);
696    As_FF_diag = hypre_ParCSRMatrixDiag(As_FF);
697    As_FF_diag_i = hypre_CSRMatrixI(As_FF_diag);
698    As_FF_diag_j = hypre_CSRMatrixJ(As_FF_diag);
699    As_FF_diag_data = hypre_CSRMatrixData(As_FF_diag);
700    As_FF_offd = hypre_ParCSRMatrixOffd(As_FF);
701    As_FF_offd_i = hypre_CSRMatrixI(As_FF_offd);
702    As_FF_offd_j = hypre_CSRMatrixJ(As_FF_offd);
703    As_FF_offd_data = hypre_CSRMatrixData(As_FF_offd);
704    n_Fpts = hypre_CSRMatrixNumRows(As_FF_diag);
705    num_cols_A_FF_offd = hypre_CSRMatrixNumCols(As_FF_offd);
706    first_index = hypre_ParCSRMatrixRowStarts(As_FF)[0];
707    tmp_FF_diag_data = hypre_CTAlloc(HYPRE_Real, As_FF_diag_i[n_Fpts], HYPRE_MEMORY_HOST);
708 
709    D_q = hypre_CTAlloc(HYPRE_Real, n_Fpts, HYPRE_MEMORY_HOST);
710    D_theta = hypre_CTAlloc(HYPRE_Real, n_Fpts, HYPRE_MEMORY_HOST);
711    D_w = hypre_CTAlloc(HYPRE_Real, n_Fpts, HYPRE_MEMORY_HOST);
712    cpt_array = hypre_CTAlloc(HYPRE_Int, num_threads, HYPRE_MEMORY_HOST);
713    start_array = hypre_CTAlloc(HYPRE_Int, num_threads+1, HYPRE_MEMORY_HOST);
714    startf_array = hypre_CTAlloc(HYPRE_Int, num_threads+1, HYPRE_MEMORY_HOST);
715 
716 #ifdef HYPRE_USING_OPENMP
717 #pragma omp parallel private(i,j,jj,k,kk,start,stop,startf,stopf,row,theta,value,value1)
718 #endif
719    {
720       HYPRE_Int my_thread_num = hypre_GetThreadNum();
721 
722       start = (n_fine/num_threads)*my_thread_num;
723       if (my_thread_num == num_threads-1)
724       {
725          stop = n_fine;
726       }
727       else
728       {
729          stop = (n_fine/num_threads)*(my_thread_num+1);
730       }
731       start_array[my_thread_num+1] = stop;
732       for (i=start; i < stop; i++)
733       {
734          if (CF_marker[i] > 0)
735          {
736             cpt_array[my_thread_num]++;
737          }
738       }
739 
740 #ifdef HYPRE_USING_OPENMP
741 #pragma omp barrier
742 #endif
743       if (my_thread_num == 0)
744       {
745          for (i=1; i < num_threads; i++)
746          {
747             cpt_array[i] += cpt_array[i-1];
748          }
749       }
750 #ifdef HYPRE_USING_OPENMP
751 #pragma omp barrier
752 #endif
753       if (my_thread_num > 0)
754          startf = start - cpt_array[my_thread_num-1];
755       else
756          startf = 0;
757 
758       if (my_thread_num < num_threads-1)
759          stopf = stop - cpt_array[my_thread_num];
760       else
761          stopf = n_Fpts;
762 
763       startf_array[my_thread_num+1] = stopf;
764 
765       for (i=startf; i < stopf; i++)
766       {
767          for (j=As_FC_diag_i[i]; j < As_FC_diag_i[i+1]; j++)
768          {
769             D_q[i] += As_FC_diag_data[j];
770          }
771          for (j=As_FC_offd_i[i]; j < As_FC_offd_i[i+1]; j++)
772          {
773             D_q[i] += As_FC_offd_data[j];
774          }
775       }
776 
777       for (j = As_FF_diag_i[startf]; j < As_FF_diag_i[stopf]; j++)
778       {
779          tmp_FF_diag_data[j] = As_FF_diag_data[j];
780       }
781 
782 
783 #ifdef HYPRE_USING_OPENMP
784 #pragma omp barrier
785 #endif
786       if (my_thread_num == 0)
787       {
788          if (num_cols_A_FF_offd)
789          {
790             D_q_offd = hypre_CTAlloc(HYPRE_Real,  num_cols_A_FF_offd, HYPRE_MEMORY_HOST);
791          }
792          index = 0;
793          comm_pkg = hypre_ParCSRMatrixCommPkg(As_FF);
794          if (!comm_pkg)
795          {
796             hypre_MatvecCommPkgCreate(As_FF);
797             comm_pkg = hypre_ParCSRMatrixCommPkg(As_FF);
798          }
799          num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
800          buf_data = hypre_CTAlloc(HYPRE_Real, hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends), HYPRE_MEMORY_HOST);
801          for (i = 0; i < num_sends; i++)
802          {
803             startc = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
804             for (j = startc; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
805             {
806                buf_data[index++] = D_q[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
807             }
808          }
809 
810          comm_handle = hypre_ParCSRCommHandleCreate( 1, comm_pkg, buf_data, D_q_offd);
811          hypre_ParCSRCommHandleDestroy(comm_handle);
812 
813          if (num_functions > 1)
814          {
815             HYPRE_Int *int_buf_data = NULL;
816             HYPRE_Int num_sends, startc;
817             HYPRE_Int num_cols_A_offd = hypre_CSRMatrixNumCols(A_offd);
818             dof_func_offd = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
819             index = 0;
820             num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
821             int_buf_data = hypre_CTAlloc(HYPRE_Int, hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends), HYPRE_MEMORY_HOST);
822             for (i = 0; i < num_sends; i++)
823             {
824                startc = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
825                for (j = startc; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
826                {
827                   int_buf_data[index++] = dof_func[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
828                }
829             }
830             comm_handle = hypre_ParCSRCommHandleCreate( 11, comm_pkg, int_buf_data, dof_func_offd);
831             hypre_ParCSRCommHandleDestroy(comm_handle);
832             hypre_TFree(int_buf_data, HYPRE_MEMORY_HOST);
833          }
834       }
835 
836 #ifdef HYPRE_USING_OPENMP
837 #pragma omp barrier
838 #endif
839 
840       row = startf;
841       for (i=start; i < stop; i++)
842       {
843          HYPRE_Int jA, jC, jS;
844          if (CF_marker[i] < 0)
845          {
846             if (num_functions > 1)
847             {
848                jC = A_diag_i[i];
849                for (j=S_diag_i[i]; j < S_diag_i[i+1]; j++)
850                {
851                   jS = S_diag_j[j];
852                   jA = A_diag_j[jC];
853                   while (jA != jS)
854                   {
855                      if (dof_func[i] == dof_func[jA])
856                      {
857                         D_w[row] += A_diag_data[jC++];
858                      }
859                      else
860                         jC++;
861                      jA = A_diag_j[jC];
862                   }
863                   jC++;
864                }
865                for (j=jC; j < A_diag_i[i+1]; j++)
866                {
867                   if (dof_func[i] == dof_func[A_diag_j[j]])
868                         D_w[row] += A_diag_data[j];
869                }
870                jC = A_offd_i[i];
871                for (j=S_offd_i[i]; j < S_offd_i[i+1]; j++)
872                {
873                   jS = S_offd_j[j];
874                   jA = A_offd_j[jC];
875                   while (jA != jS)
876                   {
877                      if (dof_func[i] == dof_func_offd[jA])
878                      {
879                         D_w[row] += A_offd_data[jC++];
880                      }
881                      else
882                         jC++;
883                      jA = A_offd_j[jC];
884                   }
885                   jC++;
886                }
887                for (j=jC; j < A_offd_i[i+1]; j++)
888                {
889                   if (dof_func[i] == dof_func_offd[A_offd_j[j]])
890                         D_w[row] += A_offd_data[j];
891                }
892                row++;
893             }
894             else
895             {
896                for (j=A_diag_i[i]; j < A_diag_i[i+1]; j++)
897                {
898                   D_w[row] += A_diag_data[j];
899                }
900                for (j=A_offd_i[i]; j < A_offd_i[i+1]; j++)
901                {
902                   D_w[row] += A_offd_data[j];
903                }
904                for (j=As_FF_diag_i[row]+1; j < As_FF_diag_i[row+1]; j++)
905                {
906                   D_w[row] -= As_FF_diag_data[j];
907                }
908                for (j=As_FF_offd_i[row]; j < As_FF_offd_i[row+1]; j++)
909                {
910                   D_w[row] -= As_FF_offd_data[j];
911                }
912                D_w[row] -= D_q[row];
913                row++;
914             }
915          }
916       }
917 
918       for (i=startf; i<stopf; i++)
919       {
920          for (j = As_FF_diag_i[i]+1; j < As_FF_diag_i[i+1]; j++)
921          {
922             jj = As_FF_diag_j[j];
923             value = D_q[jj];
924             for (k = As_FF_diag_i[jj]+1; k < As_FF_diag_i[jj+1]; k++)
925             {
926                kk = As_FF_diag_j[k];
927                if (kk == i)
928                {
929                   value1 = tmp_FF_diag_data[k];
930                   value += value1;
931                   D_theta[i] += As_FF_diag_data[j]*value1/value;
932                   break;
933                }
934             }
935             As_FF_diag_data[j] /= value;
936          }
937          for (j = As_FF_offd_i[i]; j < As_FF_offd_i[i+1]; j++)
938          {
939             jj = As_FF_offd_j[j];
940             value = D_q_offd[jj];
941             for (k = As_FF_ext_i[jj]; k < As_FF_ext_i[jj+1]; k++)
942             {
943                kk = (HYPRE_Int)(As_FF_ext_j[k] - first_index);
944                if (kk == i)
945                {
946                   value1 = As_FF_ext_data[k];
947                   value += value1;
948                   D_theta[i] += As_FF_offd_data[j]*value1/value;
949                   break;
950                }
951             }
952             As_FF_offd_data[j] /= value;
953          }
954          As_FF_diag_data[As_FF_diag_i[i]] = 1.0;
955       }
956 
957 #ifdef HYPRE_USING_OPENMP
958 #pragma omp barrier
959 #endif
960 
961       for (i=startf; i<stopf; i++)
962       {
963          theta = (D_theta[i]+D_w[i]);
964          if (theta)
965          {
966             theta = -1.0/theta;
967             for (j=As_FF_diag_i[i]; j < As_FF_diag_i[i+1]; j++)
968                As_FF_diag_data[j] *= theta;
969             for (j=As_FF_offd_i[i]; j < As_FF_offd_i[i+1]; j++)
970                As_FF_offd_data[j] *= theta;
971          }
972       }
973 
974    }   /* end parallel region */
975 
976    W = hypre_ParMatmul(As_FF, As_FC);
977    W_diag = hypre_ParCSRMatrixDiag(W);
978    W_offd = hypre_ParCSRMatrixOffd(W);
979    W_diag_i = hypre_CSRMatrixI(W_diag);
980    W_diag_j = hypre_CSRMatrixJ(W_diag);
981    W_diag_data = hypre_CSRMatrixData(W_diag);
982    W_offd_i = hypre_CSRMatrixI(W_offd);
983    W_offd_j = hypre_CSRMatrixJ(W_offd);
984    W_offd_data = hypre_CSRMatrixData(W_offd);
985    num_cols_P_offd = hypre_CSRMatrixNumCols(W_offd);
986    /*-----------------------------------------------------------------------
987     *  Intialize data for P
988     *-----------------------------------------------------------------------*/
989    P_diag_i    = hypre_CTAlloc(HYPRE_Int,  n_fine+1, memory_location_P);
990    P_offd_i    = hypre_CTAlloc(HYPRE_Int,  n_fine+1, memory_location_P);
991 
992    P_diag_size = n_Cpts + hypre_CSRMatrixI(W_diag)[n_Fpts];
993    P_offd_size = hypre_CSRMatrixI(W_offd)[n_Fpts];
994 
995    if (P_diag_size)
996    {
997       P_diag_j    = hypre_CTAlloc(HYPRE_Int,  P_diag_size, memory_location_P);
998       P_diag_data = hypre_CTAlloc(HYPRE_Real,  P_diag_size, memory_location_P);
999    }
1000 
1001    if (P_offd_size)
1002    {
1003       P_offd_j    = hypre_CTAlloc(HYPRE_Int,  P_offd_size, memory_location_P);
1004       P_offd_data = hypre_CTAlloc(HYPRE_Real,  P_offd_size, memory_location_P);
1005    }
1006 
1007 #ifdef HYPRE_USING_OPENMP
1008 #pragma omp parallel private(i,j,start,stop,startf,stopf,c_pt,row,cnt_diag,cnt_offd)
1009 #endif
1010    {
1011       HYPRE_Int my_thread_num = hypre_GetThreadNum();
1012       startf = startf_array[my_thread_num];
1013       stopf = startf_array[my_thread_num+1];
1014       start = start_array[my_thread_num];
1015       stop = start_array[my_thread_num+1];
1016 
1017       if (my_thread_num > 0)
1018          c_pt = cpt_array[my_thread_num-1];
1019       else
1020          c_pt = 0;
1021       cnt_diag = W_diag_i[startf]+c_pt;
1022       cnt_offd = W_offd_i[startf];
1023       row = startf;
1024       for (i=start; i < stop; i++)
1025       {
1026          if (CF_marker[i] > 0)
1027          {
1028             P_diag_j[cnt_diag] = c_pt++;
1029             P_diag_data[cnt_diag++] = 1.0;
1030          }
1031          else
1032          {
1033             for (j=W_diag_i[row]; j < W_diag_i[row+1]; j++)
1034             {
1035                P_diag_j[cnt_diag] = W_diag_j[j];
1036                P_diag_data[cnt_diag++] = W_diag_data[j];
1037             }
1038             for (j=W_offd_i[row]; j < W_offd_i[row+1]; j++)
1039             {
1040                P_offd_j[cnt_offd] = W_offd_j[j];
1041                P_offd_data[cnt_offd++] = W_offd_data[j];
1042             }
1043             row++;
1044          }
1045          P_diag_i[i+1] = cnt_diag;
1046          P_offd_i[i+1] = cnt_offd;
1047       }
1048 
1049    }   /* end parallel region */
1050 
1051    /*-----------------------------------------------------------------------
1052     *  Create matrix
1053     *-----------------------------------------------------------------------*/
1054 
1055    P = hypre_ParCSRMatrixCreate(comm,
1056          hypre_ParCSRMatrixGlobalNumRows(A),
1057          total_global_cpts,
1058          hypre_ParCSRMatrixColStarts(A),
1059          num_cpts_global,
1060          num_cols_P_offd,
1061          P_diag_i[n_fine],
1062          P_offd_i[n_fine]);
1063 
1064    P_diag = hypre_ParCSRMatrixDiag(P);
1065    hypre_CSRMatrixData(P_diag) = P_diag_data;
1066    hypre_CSRMatrixI(P_diag) = P_diag_i;
1067    hypre_CSRMatrixJ(P_diag) = P_diag_j;
1068    P_offd = hypre_ParCSRMatrixOffd(P);
1069    hypre_CSRMatrixData(P_offd) = P_offd_data;
1070    hypre_CSRMatrixI(P_offd) = P_offd_i;
1071    hypre_CSRMatrixJ(P_offd) = P_offd_j;
1072    hypre_ParCSRMatrixColMapOffd(P) = hypre_ParCSRMatrixColMapOffd(W);
1073    hypre_ParCSRMatrixColMapOffd(W) = NULL;
1074 
1075    hypre_CSRMatrixMemoryLocation(P_diag) = memory_location_P;
1076    hypre_CSRMatrixMemoryLocation(P_offd) = memory_location_P;
1077 
1078    /* Compress P, removing coefficients smaller than trunc_factor * Max */
1079    if (trunc_factor != 0.0 || max_elmts > 0)
1080    {
1081       HYPRE_Int *map;
1082       hypre_BoomerAMGInterpTruncation(P, trunc_factor, max_elmts);
1083       P_diag_data = hypre_CSRMatrixData(P_diag);
1084       P_diag_i = hypre_CSRMatrixI(P_diag);
1085       P_diag_j = hypre_CSRMatrixJ(P_diag);
1086       P_offd_data = hypre_CSRMatrixData(P_offd);
1087       P_offd_i = hypre_CSRMatrixI(P_offd);
1088       P_offd_j = hypre_CSRMatrixJ(P_offd);
1089       P_diag_size = P_diag_i[n_fine];
1090       P_offd_size = P_offd_i[n_fine];
1091 
1092       col_map_offd_P = hypre_ParCSRMatrixColMapOffd(P);
1093       if (num_cols_P_offd)
1094       {
1095          P_marker = hypre_CTAlloc(HYPRE_Int, num_cols_P_offd, HYPRE_MEMORY_HOST);
1096          for (i=0; i < P_offd_size; i++)
1097             P_marker[P_offd_j[i]] = 1;
1098 
1099          new_ncols_P_offd = 0;
1100          for (i=0; i < num_cols_P_offd; i++)
1101             if (P_marker[i]) new_ncols_P_offd++;
1102 
1103          new_col_map_offd = hypre_CTAlloc(HYPRE_BigInt, new_ncols_P_offd, HYPRE_MEMORY_HOST);
1104          map = hypre_CTAlloc(HYPRE_Int, new_ncols_P_offd, HYPRE_MEMORY_HOST);
1105 
1106          index = 0;
1107          for (i=0; i < num_cols_P_offd; i++)
1108             if (P_marker[i])
1109             {
1110                 new_col_map_offd[index] = col_map_offd_P[i];
1111                 map[index++] = i;
1112             }
1113          hypre_TFree(P_marker, HYPRE_MEMORY_HOST);
1114 
1115 
1116 #ifdef HYPRE_USING_OPENMP
1117 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
1118 #endif
1119          for (i=0; i < P_offd_size; i++)
1120          {
1121             P_offd_j[i] = hypre_BinarySearch(map, P_offd_j[i],
1122                new_ncols_P_offd);
1123          }
1124          hypre_TFree(col_map_offd_P, HYPRE_MEMORY_HOST);
1125          hypre_ParCSRMatrixColMapOffd(P) = new_col_map_offd;
1126          hypre_CSRMatrixNumCols(P_offd) = new_ncols_P_offd;
1127          hypre_TFree(map, HYPRE_MEMORY_HOST);
1128       }
1129    }
1130 
1131    hypre_MatvecCommPkgCreate(P);
1132 
1133    *P_ptr = P;
1134 
1135    /* Deallocate memory */
1136    hypre_TFree(D_q, HYPRE_MEMORY_HOST);
1137    hypre_TFree(D_q_offd, HYPRE_MEMORY_HOST);
1138    hypre_TFree(D_w, HYPRE_MEMORY_HOST);
1139    hypre_TFree(D_theta, HYPRE_MEMORY_HOST);
1140    hypre_TFree(dof_func_offd, HYPRE_MEMORY_HOST);
1141    hypre_TFree(cpt_array, HYPRE_MEMORY_HOST);
1142    hypre_TFree(start_array, HYPRE_MEMORY_HOST);
1143    hypre_TFree(startf_array, HYPRE_MEMORY_HOST);
1144    hypre_TFree(buf_data, HYPRE_MEMORY_HOST);
1145    hypre_TFree(tmp_FF_diag_data, HYPRE_MEMORY_HOST);
1146    hypre_ParCSRMatrixDestroy(As_FF);
1147    hypre_ParCSRMatrixDestroy(As_FC);
1148    hypre_ParCSRMatrixDestroy(W);
1149    hypre_CSRMatrixDestroy(As_FF_ext);
1150 
1151    return hypre_error_flag;
1152 }
1153 
1154 /*-----------------------------------------------------------------------*
1155  * Modularized Extended+i Interpolation
1156  *-----------------------------------------------------------------------*/
1157 HYPRE_Int
hypre_BoomerAMGBuildModExtPIInterp(hypre_ParCSRMatrix * A,HYPRE_Int * CF_marker,hypre_ParCSRMatrix * S,HYPRE_BigInt * num_cpts_global,HYPRE_Int num_functions,HYPRE_Int * dof_func,HYPRE_Int debug_flag,HYPRE_Real trunc_factor,HYPRE_Int max_elmts,hypre_ParCSRMatrix ** P_ptr)1158 hypre_BoomerAMGBuildModExtPIInterp(hypre_ParCSRMatrix  *A,
1159                                    HYPRE_Int           *CF_marker,
1160                                    hypre_ParCSRMatrix  *S,
1161                                    HYPRE_BigInt        *num_cpts_global,
1162                                    HYPRE_Int            num_functions,
1163                                    HYPRE_Int           *dof_func,
1164                                    HYPRE_Int            debug_flag,
1165                                    HYPRE_Real           trunc_factor,
1166                                    HYPRE_Int            max_elmts,
1167                                    hypre_ParCSRMatrix **P_ptr)
1168 {
1169 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
1170    hypre_GpuProfilingPushRange("ModExtPIInterp");
1171 #endif
1172 
1173    HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) );
1174 
1175    HYPRE_Int ierr = 0;
1176 
1177    if (exec == HYPRE_EXEC_HOST)
1178    {
1179       ierr = hypre_BoomerAMGBuildModExtPIInterpHost(A, CF_marker, S, num_cpts_global,
1180                                                     debug_flag, num_functions, dof_func,
1181                                                     trunc_factor, max_elmts, P_ptr);
1182    }
1183 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
1184    else
1185    {
1186       ierr = hypre_BoomerAMGBuildExtPIInterpDevice(A, CF_marker, S, num_cpts_global, 1, NULL,
1187                                                    debug_flag, trunc_factor, max_elmts, P_ptr);
1188    }
1189 #endif
1190 
1191 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
1192    hypre_GpuProfilingPopRange();
1193 #endif
1194 
1195    return ierr;
1196 }
1197 
1198 /*---------------------------------------------------------------------------
1199  * hypre_BoomerAMGBuildModExtPEInterp
1200  *  Comment:
1201  *--------------------------------------------------------------------------*/
1202 HYPRE_Int
hypre_BoomerAMGBuildModExtPEInterpHost(hypre_ParCSRMatrix * A,HYPRE_Int * CF_marker,hypre_ParCSRMatrix * S,HYPRE_BigInt * num_cpts_global,HYPRE_Int num_functions,HYPRE_Int * dof_func,HYPRE_Int debug_flag,HYPRE_Real trunc_factor,HYPRE_Int max_elmts,hypre_ParCSRMatrix ** P_ptr)1203 hypre_BoomerAMGBuildModExtPEInterpHost(hypre_ParCSRMatrix   *A,
1204                                        HYPRE_Int            *CF_marker,
1205                                        hypre_ParCSRMatrix   *S,
1206                                        HYPRE_BigInt         *num_cpts_global,
1207                                        HYPRE_Int             num_functions,
1208                                        HYPRE_Int            *dof_func,
1209                                        HYPRE_Int             debug_flag,
1210                                        HYPRE_Real            trunc_factor,
1211                                        HYPRE_Int             max_elmts,
1212                                        hypre_ParCSRMatrix  **P_ptr)
1213 {
1214    /* Communication Variables */
1215    MPI_Comm                 comm = hypre_ParCSRMatrixComm(A);
1216    HYPRE_MemoryLocation memory_location_P = hypre_ParCSRMatrixMemoryLocation(A);
1217    hypre_ParCSRCommPkg     *comm_pkg = hypre_ParCSRMatrixCommPkg(A);
1218    hypre_ParCSRCommHandle  *comm_handle = NULL;
1219 
1220    HYPRE_Int              my_id, num_procs;
1221 
1222    /* Variables to store input variables */
1223    hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
1224    HYPRE_Real      *A_diag_data = hypre_CSRMatrixData(A_diag);
1225    HYPRE_Int       *A_diag_i = hypre_CSRMatrixI(A_diag);
1226    HYPRE_Int       *A_diag_j = hypre_CSRMatrixJ(A_diag);
1227 
1228    hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A);
1229    HYPRE_Real      *A_offd_data = hypre_CSRMatrixData(A_offd);
1230    HYPRE_Int       *A_offd_i = hypre_CSRMatrixI(A_offd);
1231    HYPRE_Int       *A_offd_j = hypre_CSRMatrixJ(A_offd);
1232 
1233    hypre_CSRMatrix *S_diag = hypre_ParCSRMatrixDiag(S);
1234    HYPRE_Int       *S_diag_j = hypre_CSRMatrixJ(S_diag);
1235    HYPRE_Int       *S_diag_i = hypre_CSRMatrixI(S_diag);
1236 
1237    hypre_CSRMatrix *S_offd = hypre_ParCSRMatrixOffd(S);
1238    HYPRE_Int       *S_offd_j = hypre_CSRMatrixJ(S_offd);
1239    HYPRE_Int       *S_offd_i = hypre_CSRMatrixI(S_offd);
1240 
1241    HYPRE_Int        n_fine = hypre_CSRMatrixNumRows(A_diag);
1242    HYPRE_BigInt     total_global_cpts;
1243 
1244    /* Interpolation matrix P */
1245    hypre_ParCSRMatrix *P;
1246    hypre_CSRMatrix    *P_diag;
1247    hypre_CSRMatrix    *P_offd;
1248 
1249    HYPRE_Real      *P_diag_data = NULL;
1250    HYPRE_Int       *P_diag_i, *P_diag_j = NULL;
1251    HYPRE_Real      *P_offd_data = NULL;
1252    HYPRE_Int       *P_offd_i, *P_offd_j = NULL;
1253 
1254    /* Intermediate matrices */
1255    hypre_ParCSRMatrix *As_FF, *As_FC, *W;
1256    HYPRE_Real *D_beta, *D_w, *D_lambda, *D_tmp, *D_tau, *D_tmp_offd = NULL;
1257    hypre_CSRMatrix *As_FF_diag;
1258    hypre_CSRMatrix *As_FF_offd;
1259    hypre_CSRMatrix *As_FC_diag;
1260    hypre_CSRMatrix *As_FC_offd;
1261    hypre_CSRMatrix *W_diag;
1262    hypre_CSRMatrix *W_offd;
1263 
1264    HYPRE_Int *As_FF_diag_i;
1265    HYPRE_Int *As_FF_diag_j;
1266    HYPRE_Int *As_FF_offd_i;
1267    HYPRE_Int *As_FF_offd_j;
1268    HYPRE_Int *As_FC_diag_i;
1269    HYPRE_Int *As_FC_offd_i;
1270    HYPRE_Int *W_diag_i;
1271    HYPRE_Int *W_offd_i;
1272    HYPRE_Int *W_diag_j;
1273    HYPRE_Int *W_offd_j = NULL;
1274 
1275    HYPRE_Real *As_FF_diag_data;
1276    HYPRE_Real *As_FF_offd_data = NULL;
1277    HYPRE_Real *As_FC_diag_data;
1278    HYPRE_Real *As_FC_offd_data = NULL;
1279    HYPRE_Real *W_diag_data;
1280    HYPRE_Real *W_offd_data = NULL;
1281    HYPRE_Real *buf_data = NULL;
1282 
1283    HYPRE_BigInt    *col_map_offd_P = NULL;
1284    HYPRE_BigInt    *new_col_map_offd = NULL;
1285    HYPRE_Int        P_diag_size;
1286    HYPRE_Int        P_offd_size;
1287    HYPRE_Int        new_ncols_P_offd;
1288    HYPRE_Int        num_cols_P_offd;
1289    HYPRE_Int       *P_marker = NULL;
1290    HYPRE_Int       *dof_func_offd = NULL;
1291 
1292    /* Loop variables */
1293    HYPRE_Int        index, startc, num_sends;
1294    HYPRE_Int        i, j;
1295    HYPRE_Int       *cpt_array;
1296    HYPRE_Int       *start_array;
1297    HYPRE_Int       *startf_array;
1298    HYPRE_Int start, stop, startf, stopf;
1299    HYPRE_Int cnt_diag, cnt_offd, row, c_pt;
1300    HYPRE_Int num_cols_A_FF_offd;
1301    HYPRE_Real value, theta;
1302 
1303    /* Definitions */
1304    //HYPRE_Real       wall_time;
1305    HYPRE_Int n_Cpts, n_Fpts;
1306    HYPRE_Int num_threads = hypre_NumThreads();
1307 
1308    //if (debug_flag==4) wall_time = time_getWallclockSeconds();
1309 
1310    /* BEGIN */
1311    hypre_MPI_Comm_size(comm, &num_procs);
1312    hypre_MPI_Comm_rank(comm,&my_id);
1313 
1314    if (my_id == (num_procs -1)) total_global_cpts = num_cpts_global[1];
1315    hypre_MPI_Bcast(&total_global_cpts, 1, HYPRE_MPI_BIG_INT, num_procs-1, comm);
1316    n_Cpts = num_cpts_global[1]-num_cpts_global[0];
1317 
1318    hypre_ParCSRMatrixGenerateFFFC(A, CF_marker, num_cpts_global, S, &As_FC, &As_FF);
1319 
1320    As_FC_diag = hypre_ParCSRMatrixDiag(As_FC);
1321    As_FC_diag_i = hypre_CSRMatrixI(As_FC_diag);
1322    As_FC_diag_data = hypre_CSRMatrixData(As_FC_diag);
1323    As_FC_offd = hypre_ParCSRMatrixOffd(As_FC);
1324    As_FC_offd_i = hypre_CSRMatrixI(As_FC_offd);
1325    As_FC_offd_data = hypre_CSRMatrixData(As_FC_offd);
1326    As_FF_diag = hypre_ParCSRMatrixDiag(As_FF);
1327    As_FF_diag_i = hypre_CSRMatrixI(As_FF_diag);
1328    As_FF_diag_j = hypre_CSRMatrixJ(As_FF_diag);
1329    As_FF_diag_data = hypre_CSRMatrixData(As_FF_diag);
1330    As_FF_offd = hypre_ParCSRMatrixOffd(As_FF);
1331    As_FF_offd_i = hypre_CSRMatrixI(As_FF_offd);
1332    As_FF_offd_j = hypre_CSRMatrixJ(As_FF_offd);
1333    As_FF_offd_data = hypre_CSRMatrixData(As_FF_offd);
1334    n_Fpts = hypre_CSRMatrixNumRows(As_FF_diag);
1335    num_cols_A_FF_offd = hypre_CSRMatrixNumCols(As_FF_offd);
1336 
1337    D_beta = hypre_CTAlloc(HYPRE_Real, n_Fpts, HYPRE_MEMORY_HOST);
1338    D_lambda = hypre_CTAlloc(HYPRE_Real, n_Fpts, HYPRE_MEMORY_HOST);
1339    D_tmp = hypre_CTAlloc(HYPRE_Real, n_Fpts, HYPRE_MEMORY_HOST);
1340    D_tau = hypre_CTAlloc(HYPRE_Real, n_Fpts, HYPRE_MEMORY_HOST);
1341    D_w = hypre_CTAlloc(HYPRE_Real, n_Fpts, HYPRE_MEMORY_HOST);
1342    cpt_array = hypre_CTAlloc(HYPRE_Int, num_threads, HYPRE_MEMORY_HOST);
1343    start_array = hypre_CTAlloc(HYPRE_Int, num_threads+1, HYPRE_MEMORY_HOST);
1344    startf_array = hypre_CTAlloc(HYPRE_Int, num_threads+1, HYPRE_MEMORY_HOST);
1345 
1346 #ifdef HYPRE_USING_OPENMP
1347 #pragma omp parallel private(i,j,start,stop,startf,stopf,row,theta,value)
1348 #endif
1349    {
1350       HYPRE_Int my_thread_num = hypre_GetThreadNum();
1351 
1352       start = (n_fine/num_threads)*my_thread_num;
1353       if (my_thread_num == num_threads-1)
1354       {
1355          stop = n_fine;
1356       }
1357       else
1358       {
1359          stop = (n_fine/num_threads)*(my_thread_num+1);
1360       }
1361       start_array[my_thread_num+1] = stop;
1362       for (i=start; i < stop; i++)
1363       {
1364          if (CF_marker[i] > 0)
1365          {
1366             cpt_array[my_thread_num]++;
1367          }
1368       }
1369 
1370 #ifdef HYPRE_USING_OPENMP
1371 #pragma omp barrier
1372 #endif
1373       if (my_thread_num == 0)
1374       {
1375          for (i=1; i < num_threads; i++)
1376          {
1377             cpt_array[i] += cpt_array[i-1];
1378          }
1379          if (num_functions > 1)
1380          {
1381             HYPRE_Int *int_buf_data = NULL;
1382             HYPRE_Int num_sends, startc;
1383             HYPRE_Int num_cols_A_offd = hypre_CSRMatrixNumCols(A_offd);
1384             dof_func_offd = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
1385             index = 0;
1386             num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
1387             int_buf_data = hypre_CTAlloc(HYPRE_Int, hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends), HYPRE_MEMORY_HOST);
1388             for (i = 0; i < num_sends; i++)
1389             {
1390                startc = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
1391                for (j = startc; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
1392                {
1393                   int_buf_data[index++] = dof_func[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
1394                }
1395             }
1396             comm_handle = hypre_ParCSRCommHandleCreate( 11, comm_pkg, int_buf_data, dof_func_offd);
1397             hypre_ParCSRCommHandleDestroy(comm_handle);
1398             hypre_TFree(int_buf_data, HYPRE_MEMORY_HOST);
1399          }
1400       }
1401 #ifdef HYPRE_USING_OPENMP
1402 #pragma omp barrier
1403 #endif
1404       if (my_thread_num > 0)
1405          startf = start - cpt_array[my_thread_num-1];
1406       else
1407          startf = 0;
1408 
1409       if (my_thread_num < num_threads-1)
1410          stopf = stop - cpt_array[my_thread_num];
1411       else
1412          stopf = n_Fpts;
1413 
1414       startf_array[my_thread_num+1] = stopf;
1415 
1416       for (i=startf; i < stopf; i++)
1417       {
1418          HYPRE_Real number;
1419          for (j=As_FF_diag_i[i]+1; j < As_FF_diag_i[i+1]; j++)
1420          {
1421             D_lambda[i] += As_FF_diag_data[j];
1422          }
1423          for (j=As_FF_offd_i[i]; j < As_FF_offd_i[i+1]; j++)
1424          {
1425             D_lambda[i] += As_FF_offd_data[j];
1426          }
1427          number = (HYPRE_Real)(As_FF_diag_i[i+1]-As_FF_diag_i[i]-1+As_FF_offd_i[i+1]-As_FF_offd_i[i]);
1428          if (number) D_lambda[i] /= number;
1429          for (j=As_FC_diag_i[i]; j < As_FC_diag_i[i+1]; j++)
1430          {
1431             D_beta[i] += As_FC_diag_data[j];
1432          }
1433          for (j=As_FC_offd_i[i]; j < As_FC_offd_i[i+1]; j++)
1434          {
1435             D_beta[i] += As_FC_offd_data[j];
1436          }
1437          if (D_lambda[i]+D_beta[i]) D_tmp[i] = D_lambda[i]/(D_beta[i]+D_lambda[i]);
1438       }
1439 
1440 
1441 #ifdef HYPRE_USING_OPENMP
1442 #pragma omp barrier
1443 #endif
1444       if (my_thread_num == 0)
1445       {
1446          if (num_cols_A_FF_offd)
1447          {
1448             D_tmp_offd = hypre_CTAlloc(HYPRE_Real,  num_cols_A_FF_offd, HYPRE_MEMORY_HOST);
1449          }
1450          index = 0;
1451          comm_pkg = hypre_ParCSRMatrixCommPkg(As_FF);
1452          if (!comm_pkg)
1453          {
1454             hypre_MatvecCommPkgCreate(As_FF);
1455             comm_pkg = hypre_ParCSRMatrixCommPkg(As_FF);
1456          }
1457          num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
1458          buf_data = hypre_CTAlloc(HYPRE_Real, hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends), HYPRE_MEMORY_HOST);
1459          for (i = 0; i < num_sends; i++)
1460          {
1461             startc = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
1462             for (j = startc; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
1463             {
1464                buf_data[index++] = D_tmp[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
1465             }
1466          }
1467 
1468          comm_handle = hypre_ParCSRCommHandleCreate( 1, comm_pkg, buf_data, D_tmp_offd);
1469          hypre_ParCSRCommHandleDestroy(comm_handle);
1470       }
1471 
1472 #ifdef HYPRE_USING_OPENMP
1473 #pragma omp barrier
1474 #endif
1475 
1476       row = startf;
1477       for (i=start; i < stop; i++)
1478       {
1479          if (CF_marker[i] < 0)
1480          {
1481             if (num_functions > 1)
1482             {
1483                HYPRE_Int jA, jC, jS;
1484                jC = A_diag_i[i];
1485                for (j=S_diag_i[i]; j < S_diag_i[i+1]; j++)
1486                {
1487                   jS = S_diag_j[j];
1488                   jA = A_diag_j[jC];
1489                   while (jA != jS)
1490                   {
1491                      if (dof_func[i] == dof_func[jA])
1492                      {
1493                         D_w[row] += A_diag_data[jC++];
1494                      }
1495                      else
1496                         jC++;
1497                      jA = A_diag_j[jC];
1498                   }
1499                   jC++;
1500                }
1501                for (j=jC; j < A_diag_i[i+1]; j++)
1502                {
1503                   if (dof_func[i] == dof_func[A_diag_j[j]])
1504                         D_w[row] += A_diag_data[j];
1505                }
1506                jC = A_offd_i[i];
1507                for (j=S_offd_i[i]; j < S_offd_i[i+1]; j++)
1508                {
1509                   jS = S_offd_j[j];
1510                   jA = A_offd_j[jC];
1511                   while (jA != jS)
1512                   {
1513                      if (dof_func[i] == dof_func_offd[jA])
1514                      {
1515                         D_w[row] += A_offd_data[jC++];
1516                      }
1517                      else
1518                         jC++;
1519                      jA = A_offd_j[jC];
1520                   }
1521                   jC++;
1522                }
1523                for (j=jC; j < A_offd_i[i+1]; j++)
1524                {
1525                   if (dof_func[i] == dof_func_offd[A_offd_j[j]])
1526                         D_w[row] += A_offd_data[j];
1527                }
1528                row++;
1529             }
1530             else
1531             {
1532                for (j=A_diag_i[i]; j < A_diag_i[i+1]; j++)
1533                {
1534                   D_w[row] += A_diag_data[j];
1535                }
1536                for (j=A_offd_i[i]; j < A_offd_i[i+1]; j++)
1537                {
1538                   D_w[row] += A_offd_data[j];
1539                }
1540                for (j=As_FF_diag_i[row]+1; j < As_FF_diag_i[row+1]; j++)
1541                {
1542                   D_w[row] -= As_FF_diag_data[j];
1543                }
1544                for (j=As_FF_offd_i[row]; j < As_FF_offd_i[row+1]; j++)
1545                {
1546                   D_w[row] -= As_FF_offd_data[j];
1547                }
1548                D_w[row] -= D_beta[row];
1549                row++;
1550             }
1551          }
1552       }
1553 
1554       for (i=startf; i<stopf; i++)
1555       {
1556          for (j=As_FF_diag_i[i]+1; j < As_FF_diag_i[i+1]; j++)
1557          {
1558             index = As_FF_diag_j[j];
1559             D_tau[i] += As_FF_diag_data[j]*D_tmp[index];
1560          }
1561          for (j=As_FF_offd_i[i]; j < As_FF_offd_i[i+1]; j++)
1562          {
1563             index = As_FF_offd_j[j];
1564             D_tau[i] += As_FF_offd_data[j]*D_tmp_offd[index];
1565          }
1566       }
1567       for (i=startf; i<stopf; i++)
1568       {
1569          value = D_w[i]+D_tau[i];
1570          if (value) value = -1.0/value;
1571          theta = D_beta[i]+D_lambda[i];
1572          As_FF_diag_data[As_FF_diag_i[i]] = value*theta;
1573          if (theta) theta = 1.0/theta;
1574          for (j = As_FF_diag_i[i]+1; j < As_FF_diag_i[i+1]; j++)
1575          {
1576             As_FF_diag_data[j] *= value;
1577          }
1578          for (j = As_FF_offd_i[i]; j < As_FF_offd_i[i+1]; j++)
1579          {
1580             As_FF_offd_data[j] *= value;
1581          }
1582          for (j = As_FC_diag_i[i]; j < As_FC_diag_i[i+1]; j++)
1583          {
1584             As_FC_diag_data[j] *= theta;
1585          }
1586          for (j = As_FC_offd_i[i]; j < As_FC_offd_i[i+1]; j++)
1587          {
1588             As_FC_offd_data[j] *= theta;
1589          }
1590       }
1591 
1592    }   /* end parallel region */
1593 
1594    W = hypre_ParMatmul(As_FF, As_FC);
1595    W_diag = hypre_ParCSRMatrixDiag(W);
1596    W_offd = hypre_ParCSRMatrixOffd(W);
1597    W_diag_i = hypre_CSRMatrixI(W_diag);
1598    W_diag_j = hypre_CSRMatrixJ(W_diag);
1599    W_diag_data = hypre_CSRMatrixData(W_diag);
1600    W_offd_i = hypre_CSRMatrixI(W_offd);
1601    W_offd_j = hypre_CSRMatrixJ(W_offd);
1602    W_offd_data = hypre_CSRMatrixData(W_offd);
1603    num_cols_P_offd = hypre_CSRMatrixNumCols(W_offd);
1604    /*-----------------------------------------------------------------------
1605     *  Intialize data for P
1606     *-----------------------------------------------------------------------*/
1607    P_diag_i    = hypre_CTAlloc(HYPRE_Int,  n_fine+1, memory_location_P);
1608    P_offd_i    = hypre_CTAlloc(HYPRE_Int,  n_fine+1, memory_location_P);
1609 
1610    P_diag_size = n_Cpts + hypre_CSRMatrixI(W_diag)[n_Fpts];
1611    P_offd_size = hypre_CSRMatrixI(W_offd)[n_Fpts];
1612 
1613    if (P_diag_size)
1614    {
1615       P_diag_j    = hypre_CTAlloc(HYPRE_Int,  P_diag_size, memory_location_P);
1616       P_diag_data = hypre_CTAlloc(HYPRE_Real,  P_diag_size, memory_location_P);
1617    }
1618 
1619    if (P_offd_size)
1620    {
1621       P_offd_j    = hypre_CTAlloc(HYPRE_Int,  P_offd_size, memory_location_P);
1622       P_offd_data = hypre_CTAlloc(HYPRE_Real,  P_offd_size, memory_location_P);
1623    }
1624 
1625 #ifdef HYPRE_USING_OPENMP
1626 #pragma omp parallel private(i,j,start,stop,startf,stopf,c_pt,row,cnt_diag,cnt_offd)
1627 #endif
1628    {
1629       HYPRE_Int my_thread_num = hypre_GetThreadNum();
1630       startf = startf_array[my_thread_num];
1631       stopf = startf_array[my_thread_num+1];
1632       start = start_array[my_thread_num];
1633       stop = start_array[my_thread_num+1];
1634 
1635       if (my_thread_num > 0)
1636          c_pt = cpt_array[my_thread_num-1];
1637       else
1638          c_pt = 0;
1639       cnt_diag = W_diag_i[startf]+c_pt;
1640       cnt_offd = W_offd_i[startf];
1641       row = startf;
1642       for (i=start; i < stop; i++)
1643       {
1644          if (CF_marker[i] > 0)
1645          {
1646             P_diag_j[cnt_diag] = c_pt++;
1647             P_diag_data[cnt_diag++] = 1.0;
1648          }
1649          else
1650          {
1651             for (j=W_diag_i[row]; j < W_diag_i[row+1]; j++)
1652             {
1653                P_diag_j[cnt_diag] = W_diag_j[j];
1654                P_diag_data[cnt_diag++] = W_diag_data[j];
1655             }
1656             for (j=W_offd_i[row]; j < W_offd_i[row+1]; j++)
1657             {
1658                P_offd_j[cnt_offd] = W_offd_j[j];
1659                P_offd_data[cnt_offd++] = W_offd_data[j];
1660             }
1661             row++;
1662          }
1663          P_diag_i[i+1] = cnt_diag;
1664          P_offd_i[i+1] = cnt_offd;
1665       }
1666 
1667    }   /* end parallel region */
1668 
1669    /*-----------------------------------------------------------------------
1670     *  Create matrix
1671     *-----------------------------------------------------------------------*/
1672 
1673    P = hypre_ParCSRMatrixCreate(comm,
1674          hypre_ParCSRMatrixGlobalNumRows(A),
1675          total_global_cpts,
1676          hypre_ParCSRMatrixColStarts(A),
1677          num_cpts_global,
1678          num_cols_P_offd,
1679          P_diag_i[n_fine],
1680          P_offd_i[n_fine]);
1681 
1682    P_diag = hypre_ParCSRMatrixDiag(P);
1683    hypre_CSRMatrixData(P_diag) = P_diag_data;
1684    hypre_CSRMatrixI(P_diag) = P_diag_i;
1685    hypre_CSRMatrixJ(P_diag) = P_diag_j;
1686    P_offd = hypre_ParCSRMatrixOffd(P);
1687    hypre_CSRMatrixData(P_offd) = P_offd_data;
1688    hypre_CSRMatrixI(P_offd) = P_offd_i;
1689    hypre_CSRMatrixJ(P_offd) = P_offd_j;
1690    hypre_ParCSRMatrixColMapOffd(P) = hypre_ParCSRMatrixColMapOffd(W);
1691    hypre_ParCSRMatrixColMapOffd(W) = NULL;
1692 
1693    hypre_CSRMatrixMemoryLocation(P_diag) = memory_location_P;
1694    hypre_CSRMatrixMemoryLocation(P_offd) = memory_location_P;
1695 
1696    /* Compress P, removing coefficients smaller than trunc_factor * Max */
1697    if (trunc_factor != 0.0 || max_elmts > 0)
1698    {
1699       HYPRE_Int *map;
1700       hypre_BoomerAMGInterpTruncation(P, trunc_factor, max_elmts);
1701       P_diag_data = hypre_CSRMatrixData(P_diag);
1702       P_diag_i = hypre_CSRMatrixI(P_diag);
1703       P_diag_j = hypre_CSRMatrixJ(P_diag);
1704       P_offd_data = hypre_CSRMatrixData(P_offd);
1705       P_offd_i = hypre_CSRMatrixI(P_offd);
1706       P_offd_j = hypre_CSRMatrixJ(P_offd);
1707       P_diag_size = P_diag_i[n_fine];
1708       P_offd_size = P_offd_i[n_fine];
1709 
1710       col_map_offd_P = hypre_ParCSRMatrixColMapOffd(P);
1711       if (num_cols_P_offd)
1712       {
1713          P_marker = hypre_CTAlloc(HYPRE_Int, num_cols_P_offd, HYPRE_MEMORY_HOST);
1714          for (i=0; i < P_offd_size; i++)
1715             P_marker[P_offd_j[i]] = 1;
1716 
1717          new_ncols_P_offd = 0;
1718          for (i=0; i < num_cols_P_offd; i++)
1719             if (P_marker[i]) new_ncols_P_offd++;
1720 
1721          new_col_map_offd = hypre_CTAlloc(HYPRE_BigInt, new_ncols_P_offd, HYPRE_MEMORY_HOST);
1722          map = hypre_CTAlloc(HYPRE_Int, new_ncols_P_offd, HYPRE_MEMORY_HOST);
1723 
1724          index = 0;
1725          for (i=0; i < num_cols_P_offd; i++)
1726             if (P_marker[i])
1727             {
1728                 new_col_map_offd[index] = col_map_offd_P[i];
1729                 map[index++] = i;
1730             }
1731          hypre_TFree(P_marker, HYPRE_MEMORY_HOST);
1732 
1733 
1734 #ifdef HYPRE_USING_OPENMP
1735 #pragma omp parallel for private(i) HYPRE_SMP_SCHEDULE
1736 #endif
1737          for (i=0; i < P_offd_size; i++)
1738          {
1739             P_offd_j[i] = hypre_BinarySearch(map, P_offd_j[i],
1740                new_ncols_P_offd);
1741          }
1742          hypre_TFree(col_map_offd_P, HYPRE_MEMORY_HOST);
1743          hypre_ParCSRMatrixColMapOffd(P) = new_col_map_offd;
1744          hypre_CSRMatrixNumCols(P_offd) = new_ncols_P_offd;
1745          hypre_TFree(map, HYPRE_MEMORY_HOST);
1746       }
1747    }
1748 
1749    hypre_MatvecCommPkgCreate(P);
1750 
1751    *P_ptr = P;
1752 
1753    /* Deallocate memory */
1754    hypre_TFree(D_tmp, HYPRE_MEMORY_HOST);
1755    hypre_TFree(D_tmp_offd, HYPRE_MEMORY_HOST);
1756    hypre_TFree(D_w, HYPRE_MEMORY_HOST);
1757    hypre_TFree(D_tau, HYPRE_MEMORY_HOST);
1758    hypre_TFree(D_beta, HYPRE_MEMORY_HOST);
1759    hypre_TFree(D_lambda, HYPRE_MEMORY_HOST);
1760    hypre_TFree(cpt_array, HYPRE_MEMORY_HOST);
1761    hypre_TFree(start_array, HYPRE_MEMORY_HOST);
1762    hypre_TFree(startf_array, HYPRE_MEMORY_HOST);
1763    hypre_TFree(buf_data, HYPRE_MEMORY_HOST);
1764    hypre_ParCSRMatrixDestroy(As_FF);
1765    hypre_ParCSRMatrixDestroy(As_FC);
1766    hypre_ParCSRMatrixDestroy(W);
1767 
1768    return hypre_error_flag;
1769 }
1770 
1771 /*-----------------------------------------------------------------------*
1772  * Modularized Extended+e Interpolation
1773  *-----------------------------------------------------------------------*/
1774 HYPRE_Int
hypre_BoomerAMGBuildModExtPEInterp(hypre_ParCSRMatrix * A,HYPRE_Int * CF_marker,hypre_ParCSRMatrix * S,HYPRE_BigInt * num_cpts_global,HYPRE_Int num_functions,HYPRE_Int * dof_func,HYPRE_Int debug_flag,HYPRE_Real trunc_factor,HYPRE_Int max_elmts,hypre_ParCSRMatrix ** P_ptr)1775 hypre_BoomerAMGBuildModExtPEInterp(hypre_ParCSRMatrix  *A,
1776                                       HYPRE_Int           *CF_marker,
1777                                       hypre_ParCSRMatrix  *S,
1778                                       HYPRE_BigInt        *num_cpts_global,
1779                                       HYPRE_Int            num_functions,
1780                                       HYPRE_Int           *dof_func,
1781                                       HYPRE_Int            debug_flag,
1782                                       HYPRE_Real           trunc_factor,
1783                                       HYPRE_Int            max_elmts,
1784                                       hypre_ParCSRMatrix **P_ptr)
1785 {
1786 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
1787    hypre_GpuProfilingPushRange("ModExtPEInterp");
1788 #endif
1789 
1790    HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) );
1791 
1792    HYPRE_Int ierr = 0;
1793 
1794    if (exec == HYPRE_EXEC_HOST)
1795    {
1796       ierr = hypre_BoomerAMGBuildModExtPEInterpHost(A, CF_marker, S, num_cpts_global,
1797                                                     num_functions, dof_func,
1798                                                     debug_flag, trunc_factor, max_elmts, P_ptr);
1799    }
1800 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
1801    else
1802    {
1803       ierr = hypre_BoomerAMGBuildExtPEInterpDevice(A,CF_marker,S,num_cpts_global,1,NULL,
1804                                                  debug_flag,trunc_factor,max_elmts,P_ptr);
1805    }
1806 #endif
1807 
1808 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
1809    hypre_GpuProfilingPopRange();
1810 #endif
1811 
1812    return ierr;
1813 }
1814