1 /******************************************************************************
2  * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other
3  * HYPRE Project Developers. See the top-level COPYRIGHT file for details.
4  *
5  * SPDX-License-Identifier: (Apache-2.0 OR MIT)
6  ******************************************************************************/
7 
8 #include "_hypre_parcsr_block_mv.h"
9 
10 /*---------------------------------------------------------------------------
11  * hypre_BoomerAMGBlockBuildInterp
12 
13  This is the block version of classical R-S interpolation. We use the complete
14  blocks of A (not just the diagonals of these blocks).
15 
16  A and P are now Block matrices.  The Strength matrix S is not as it gives
17  nodal strengths.
18 
19  CF_marker is size number of nodes.
20 
21  add_weak_to_diag  0 = don't add weak connections to diag (distribute instead)
22  1 = do add
23 
24  *--------------------------------------------------------------------------*/
25 
26 HYPRE_Int
hypre_BoomerAMGBuildBlockInterp(hypre_ParCSRBlockMatrix * A,HYPRE_Int * CF_marker,hypre_ParCSRMatrix * S,HYPRE_BigInt * num_cpts_global,HYPRE_Int num_functions,HYPRE_Int * dof_func,HYPRE_Int debug_flag,HYPRE_Real trunc_factor,HYPRE_Int max_elmts,HYPRE_Int add_weak_to_diag,hypre_ParCSRBlockMatrix ** P_ptr)27 hypre_BoomerAMGBuildBlockInterp( hypre_ParCSRBlockMatrix *A,
28                                  HYPRE_Int               *CF_marker,
29                                  hypre_ParCSRMatrix   *S,
30                                  HYPRE_BigInt         *num_cpts_global,
31                                  HYPRE_Int             num_functions,
32                                  HYPRE_Int            *dof_func,
33                                  HYPRE_Int             debug_flag,
34                                  HYPRE_Real            trunc_factor,
35                                  HYPRE_Int             max_elmts,
36                                  HYPRE_Int             add_weak_to_diag,
37                                  hypre_ParCSRBlockMatrix  **P_ptr )
38 {
39 
40    MPI_Comm                 comm = hypre_ParCSRBlockMatrixComm(A);
41    hypre_ParCSRCommPkg     *comm_pkg = hypre_ParCSRBlockMatrixCommPkg(A);
42    hypre_ParCSRCommHandle  *comm_handle;
43 
44    hypre_CSRBlockMatrix *A_diag = hypre_ParCSRBlockMatrixDiag(A);
45    HYPRE_Real           *A_diag_data = hypre_CSRBlockMatrixData(A_diag);
46    HYPRE_Int            *A_diag_i = hypre_CSRBlockMatrixI(A_diag);
47    HYPRE_Int            *A_diag_j = hypre_CSRBlockMatrixJ(A_diag);
48 
49    HYPRE_Int             block_size = hypre_CSRBlockMatrixBlockSize(A_diag);
50    HYPRE_Int             bnnz = block_size*block_size;
51 
52    hypre_CSRBlockMatrix *A_offd = hypre_ParCSRBlockMatrixOffd(A);
53    HYPRE_Real           *A_offd_data = hypre_CSRBlockMatrixData(A_offd);
54    HYPRE_Int            *A_offd_i = hypre_CSRBlockMatrixI(A_offd);
55    HYPRE_Int            *A_offd_j = hypre_CSRBlockMatrixJ(A_offd);
56    HYPRE_Int             num_cols_A_offd = hypre_CSRBlockMatrixNumCols(A_offd);
57    HYPRE_BigInt         *col_map_offd = hypre_ParCSRBlockMatrixColMapOffd(A);
58 
59    hypre_CSRMatrix      *S_diag = hypre_ParCSRMatrixDiag(S);
60    HYPRE_Int            *S_diag_i = hypre_CSRMatrixI(S_diag);
61    HYPRE_Int            *S_diag_j = hypre_CSRMatrixJ(S_diag);
62 
63    hypre_CSRMatrix      *S_offd = hypre_ParCSRMatrixOffd(S);
64    HYPRE_Int            *S_offd_i = hypre_CSRMatrixI(S_offd);
65    HYPRE_Int            *S_offd_j = hypre_CSRMatrixJ(S_offd);
66 
67    hypre_ParCSRBlockMatrix *P;
68    HYPRE_BigInt          *col_map_offd_P;
69    HYPRE_Int             *tmp_map_offd = NULL;
70 
71    HYPRE_Int             *CF_marker_offd = NULL;
72 
73    hypre_CSRBlockMatrix  *A_ext;
74 
75    HYPRE_Real            *A_ext_data = NULL;
76    HYPRE_Int             *A_ext_i = NULL;
77    HYPRE_BigInt          *A_ext_j = NULL;
78 
79    hypre_CSRBlockMatrix  *P_diag;
80    hypre_CSRBlockMatrix  *P_offd;
81 
82    HYPRE_Real            *P_diag_data;
83    HYPRE_Int             *P_diag_i;
84    HYPRE_Int             *P_diag_j;
85    HYPRE_Real            *P_offd_data;
86    HYPRE_Int             *P_offd_i;
87    HYPRE_Int             *P_offd_j;
88 
89    HYPRE_Int              P_diag_size, P_offd_size;
90 
91    HYPRE_Int             *P_marker, *P_marker_offd;
92 
93    HYPRE_Int              jj_counter,jj_counter_offd;
94    HYPRE_Int             *jj_count, *jj_count_offd = NULL;
95    HYPRE_Int              jj_begin_row,jj_begin_row_offd;
96    HYPRE_Int              jj_end_row,jj_end_row_offd;
97 
98    HYPRE_Int              start_indexing = 0; /* start indexing for P_data at 0 */
99 
100    HYPRE_Int              n_fine = hypre_CSRBlockMatrixNumRows(A_diag);
101 
102    HYPRE_Int              strong_f_marker;
103 
104    HYPRE_Int             *fine_to_coarse;
105    HYPRE_BigInt          *fine_to_coarse_offd = NULL;
106    HYPRE_Int             *coarse_counter;
107    HYPRE_Int              coarse_shift;
108    HYPRE_BigInt           total_global_cpts, my_first_cpt;
109    HYPRE_Int              num_cols_P_offd;
110 
111    HYPRE_Int              bd;
112 
113    HYPRE_Int              i,i1,i2;
114    HYPRE_Int              j,jl,jj,jj1;
115    HYPRE_Int              kc;
116    HYPRE_BigInt           big_k;
117    HYPRE_Int              start;
118 
119    HYPRE_Int              c_num;
120 
121    HYPRE_Int              my_id;
122    HYPRE_Int              num_procs;
123    HYPRE_Int              num_threads;
124    HYPRE_Int              num_sends;
125    HYPRE_Int              index;
126    HYPRE_Int              ns, ne, size, rest;
127    HYPRE_Int             *int_buf_data = NULL;
128    HYPRE_BigInt          *big_buf_data = NULL;
129 
130    HYPRE_BigInt col_1 = hypre_ParCSRBlockMatrixFirstRowIndex(A);
131    HYPRE_Int local_numrows = hypre_CSRBlockMatrixNumRows(A_diag);
132    HYPRE_BigInt col_n = col_1 + (HYPRE_BigInt)local_numrows;
133 
134    HYPRE_Real       wall_time;  /* for debugging instrumentation  */
135 
136    HYPRE_Real       *identity_block;
137    HYPRE_Real       *zero_block;
138    HYPRE_Real       *diagonal_block;
139    HYPRE_Real       *sum_block;
140    HYPRE_Real       *distribute_block;
141 
142    hypre_MPI_Comm_size(comm, &num_procs);
143    hypre_MPI_Comm_rank(comm,&my_id);
144    /* num_threads = hypre_NumThreads(); */
145    num_threads = 1;
146 
147    my_first_cpt = num_cpts_global[0];
148    if (my_id == (num_procs -1)) total_global_cpts = num_cpts_global[1];
149    hypre_MPI_Bcast(&total_global_cpts, 1, HYPRE_MPI_BIG_INT, num_procs-1, comm);
150 
151    /*-------------------------------------------------------------------
152     * Get the CF_marker data for the off-processor columns
153     *-------------------------------------------------------------------*/
154 
155    if (debug_flag==4) wall_time = time_getWallclockSeconds();
156 
157    CF_marker_offd = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
158 
159 
160    if (!comm_pkg)
161    {
162       hypre_BlockMatvecCommPkgCreate(A);
163       comm_pkg = hypre_ParCSRBlockMatrixCommPkg(A);
164    }
165 
166    num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
167    int_buf_data = hypre_CTAlloc(HYPRE_Int,  hypre_ParCSRCommPkgSendMapStart(comm_pkg,
168                                                    num_sends), HYPRE_MEMORY_HOST);
169 
170    index = 0;
171    for (i = 0; i < num_sends; i++)
172    {
173       start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
174       for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
175       {
176          int_buf_data[index++]
177             = CF_marker[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
178       }
179 
180    }
181 
182    /* we do not need the block version of comm handle - because
183       CF_marker corresponds to the nodal matrix.  This call populates
184       CF_marker_offd */
185    comm_handle = hypre_ParCSRCommHandleCreate( 11, comm_pkg, int_buf_data,
186                                                CF_marker_offd);
187 
188    hypre_ParCSRCommHandleDestroy(comm_handle);
189 
190 
191    if (debug_flag==4)
192    {
193       wall_time = time_getWallclockSeconds() - wall_time;
194       hypre_printf("Proc = %d     Interp: Comm 1 CF_marker =    %f\n",
195                    my_id, wall_time);
196       fflush(NULL);
197    }
198 
199    /*----------------------------------------------------------------------
200     * Get the ghost rows of A
201     *---------------------------------------------------------------------*/
202 
203    if (debug_flag==4) wall_time = time_getWallclockSeconds();
204 
205    if (num_procs > 1)
206    {
207       A_ext      = hypre_ParCSRBlockMatrixExtractBExt(A, A, 1);
208       A_ext_i    = hypre_CSRBlockMatrixI(A_ext);
209       A_ext_j    = hypre_CSRBlockMatrixBigJ(A_ext);
210       A_ext_data = hypre_CSRBlockMatrixData(A_ext);
211    }
212 
213    index = 0;
214    for (i=0; i < num_cols_A_offd; i++)
215    {
216       for (j=A_ext_i[i]; j < A_ext_i[i+1]; j++)
217       {
218          big_k = A_ext_j[j];
219          if (big_k >= col_1 && big_k < col_n)
220          {
221             A_ext_j[index] = big_k - col_1;
222             /* for the data field we must get all of the block data */
223             for (bd = 0; bd < bnnz; bd++)
224             {
225                A_ext_data[index*bnnz + bd] = A_ext_data[j*bnnz + bd];
226             }
227             index++;
228          }
229          else
230          {
231             kc = hypre_BigBinarySearch(col_map_offd, big_k, num_cols_A_offd);
232             if (kc > -1)
233             {
234                A_ext_j[index] = (HYPRE_BigInt)(-kc-1);
235                for (bd = 0; bd < bnnz; bd++)
236                {
237                   A_ext_data[index*bnnz + bd] = A_ext_data[j*bnnz + bd];
238                }
239                index++;
240             }
241          }
242       }
243       A_ext_i[i] = index;
244    }
245    for (i = num_cols_A_offd; i > 0; i--)
246       A_ext_i[i] = A_ext_i[i-1];
247    if (num_procs > 1) A_ext_i[0] = 0;
248 
249    if (debug_flag==4)
250    {
251       wall_time = time_getWallclockSeconds() - wall_time;
252       hypre_printf("Proc = %d  Interp: Comm 2   Get A_ext =  %f\n",
253                    my_id, wall_time);
254       fflush(NULL);
255    }
256 
257 
258    /*-----------------------------------------------------------------------
259     *  First Pass: Determine size of P and fill in fine_to_coarse mapping.
260     *-----------------------------------------------------------------------*/
261 
262    /*-----------------------------------------------------------------------
263     *  Intialize counters and allocate mapping vector.
264     *-----------------------------------------------------------------------*/
265 
266    coarse_counter = hypre_CTAlloc(HYPRE_Int,  num_threads, HYPRE_MEMORY_HOST);
267    jj_count = hypre_CTAlloc(HYPRE_Int,  num_threads, HYPRE_MEMORY_HOST);
268    jj_count_offd = hypre_CTAlloc(HYPRE_Int,  num_threads, HYPRE_MEMORY_HOST);
269 
270    fine_to_coarse = hypre_CTAlloc(HYPRE_Int,  n_fine, HYPRE_MEMORY_HOST);
271 
272    for (i = 0; i < n_fine; i++) fine_to_coarse[i] = -1;
273 
274    jj_counter = start_indexing;
275    jj_counter_offd = start_indexing;
276 
277    /*-----------------------------------------------------------------------
278     *  Loop over fine grid.
279     *-----------------------------------------------------------------------*/
280 
281 
282    for (j = 0; j < num_threads; j++)
283    {
284       size = n_fine/num_threads;
285       rest = n_fine - size*num_threads;
286       if (j < rest)
287       {
288          ns = j*size+j;
289          ne = (j+1)*size+j+1;
290       }
291       else
292       {
293          ns = j*size+rest;
294          ne = (j+1)*size+rest;
295       }
296 
297 
298       /* loop over the fine grid points */
299       for (i = ns; i < ne; i++)
300       {
301 
302          /*--------------------------------------------------------------------
303           *  If i is a C-point, interpolation is the identity. Also set up
304           *  mapping vector (fine_to_coarse is the mapping vector).
305           *--------------------------------------------------------------------*/
306 
307          if (CF_marker[i] >= 0)
308          {
309             jj_count[j]++;
310             fine_to_coarse[i] = coarse_counter[j];
311             coarse_counter[j]++;
312          }
313 
314          /*--------------------------------------------------------------------
315           *  If i is an F-point, interpolation is from the C-points that
316           *  strongly influence i.
317           *--------------------------------------------------------------------*/
318 
319          else
320          {
321             for (jj = S_diag_i[i]; jj < S_diag_i[i+1]; jj++)
322             {
323                i1 = S_diag_j[jj];
324                if (CF_marker[i1] >= 0)
325                {
326                   jj_count[j]++;
327                }
328             }
329 
330             if (num_procs > 1)
331             {
332                for (jj = S_offd_i[i]; jj < S_offd_i[i+1]; jj++)
333                {
334                   i1 = S_offd_j[jj];
335                   if (CF_marker_offd[i1] >= 0)
336                   {
337                      jj_count_offd[j]++;
338                   }
339                }
340             }
341          }
342       }
343    }
344 
345    /*-----------------------------------------------------------------------
346     *  Allocate  arrays.
347     *-----------------------------------------------------------------------*/
348 
349    for (i=0; i < num_threads-1; i++)
350    {
351       coarse_counter[i+1] += coarse_counter[i];
352       jj_count[i+1] += jj_count[i];
353       jj_count_offd[i+1] += jj_count_offd[i];
354    }
355    i = num_threads-1;
356    jj_counter = jj_count[i];
357    jj_counter_offd = jj_count_offd[i];
358 
359    P_diag_size = jj_counter;
360 
361    P_diag_i    = hypre_CTAlloc(HYPRE_Int,  n_fine+1, HYPRE_MEMORY_HOST);
362    P_diag_j    = hypre_CTAlloc(HYPRE_Int,  P_diag_size, HYPRE_MEMORY_HOST);
363    /* we need to include the size of the blocks in the data size */
364    P_diag_data = hypre_CTAlloc(HYPRE_Real,  P_diag_size*bnnz, HYPRE_MEMORY_HOST);
365 
366    P_diag_i[n_fine] = jj_counter;
367 
368 
369    P_offd_size = jj_counter_offd;
370 
371    P_offd_i    = hypre_CTAlloc(HYPRE_Int,  n_fine+1, HYPRE_MEMORY_HOST);
372    P_offd_j    = hypre_CTAlloc(HYPRE_Int,  P_offd_size, HYPRE_MEMORY_HOST);
373    /* we need to include the size of the blocks in the data size */
374    P_offd_data = hypre_CTAlloc(HYPRE_Real,  P_offd_size*bnnz, HYPRE_MEMORY_HOST);
375 
376    /*-----------------------------------------------------------------------
377     *  Intialize some stuff.
378     *-----------------------------------------------------------------------*/
379 
380    jj_counter = start_indexing;
381    jj_counter_offd = start_indexing;
382 
383    if (debug_flag==4)
384    {
385       wall_time = time_getWallclockSeconds() - wall_time;
386       hypre_printf("Proc = %d     Interp: Internal work 1 =     %f\n",
387                    my_id, wall_time);
388       fflush(NULL);
389    }
390 
391    /* we need a block identity and a block of zeros*/
392    identity_block = hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
393    zero_block =  hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
394 
395    for(i = 0; i < block_size; i++)
396    {
397       identity_block[i*block_size + i] = 1.0;
398    }
399 
400 
401    /* we also need a block to keep track of the diagonal values and a sum */
402    diagonal_block =  hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
403    sum_block =  hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
404    distribute_block =  hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
405 
406    /*-----------------------------------------------------------------------
407     *  Send and receive fine_to_coarse info.
408     *-----------------------------------------------------------------------*/
409 
410    if (debug_flag==4) wall_time = time_getWallclockSeconds();
411 
412    fine_to_coarse_offd = hypre_CTAlloc(HYPRE_BigInt,  num_cols_A_offd, HYPRE_MEMORY_HOST);
413    big_buf_data = hypre_CTAlloc(HYPRE_BigInt,  hypre_ParCSRCommPkgSendMapStart(comm_pkg,
414                                                           num_sends), HYPRE_MEMORY_HOST);
415 
416    for (j = 0; j < num_threads; j++)
417    {
418       coarse_shift = 0;
419       if (j > 0) coarse_shift = coarse_counter[j-1];
420       size = n_fine/num_threads;
421       rest = n_fine - size*num_threads;
422       if (j < rest)
423       {
424          ns = j*size+j;
425          ne = (j+1)*size+j+1;
426       }
427       else
428       {
429          ns = j*size+rest;
430          ne = (j+1)*size+rest;
431       }
432       for (i = ns; i < ne; i++)
433          fine_to_coarse[i] += coarse_shift;
434    }
435    index = 0;
436    for (i = 0; i < num_sends; i++)
437    {
438       start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
439       for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
440          big_buf_data[index++]
441             = my_first_cpt+fine_to_coarse[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
442    }
443 
444    /* again, we do not need to use the block version of comm handle since
445       the fine to coarse mapping is size of the nodes */
446 
447    comm_handle = hypre_ParCSRCommHandleCreate( 21, comm_pkg, big_buf_data,
448                                                fine_to_coarse_offd);
449 
450    hypre_ParCSRCommHandleDestroy(comm_handle);
451 
452    if (debug_flag==4)
453    {
454       wall_time = time_getWallclockSeconds() - wall_time;
455       hypre_printf("Proc = %d     Interp: Comm 4 FineToCoarse = %f\n",
456                    my_id, wall_time);
457       fflush(NULL);
458    }
459 
460    if (debug_flag==4) wall_time = time_getWallclockSeconds();
461 
462    /*-----------------------------------------------------------------------
463     *  Loop over fine grid points.
464     *-----------------------------------------------------------------------*/
465 
466    for (jl = 0; jl < num_threads; jl++)
467    {
468       size = n_fine/num_threads;
469       rest = n_fine - size*num_threads;
470       if (jl < rest)
471       {
472          ns = jl*size+jl;
473          ne = (jl+1)*size+jl+1;
474       }
475       else
476       {
477          ns = jl*size+rest;
478          ne = (jl+1)*size+rest;
479       }
480       jj_counter = 0;
481       if (jl > 0) jj_counter = jj_count[jl-1];
482       jj_counter_offd = 0;
483       if (jl > 0) jj_counter_offd = jj_count_offd[jl-1];
484 
485       P_marker = hypre_CTAlloc(HYPRE_Int,  n_fine, HYPRE_MEMORY_HOST);
486       P_marker_offd = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
487 
488       for (i = 0; i < n_fine; i++)
489       {
490          P_marker[i] = -1;
491       }
492       for (i = 0; i < num_cols_A_offd; i++)
493       {
494          P_marker_offd[i] = -1;
495       }
496       strong_f_marker = -2;
497 
498       for (i = ns; i < ne; i++)
499       {
500 
501          /*--------------------------------------------------------------------
502           *  If i is a c-point, interpolation is the identity.
503           *--------------------------------------------------------------------*/
504 
505          if (CF_marker[i] >= 0)
506          {
507             P_diag_i[i] = jj_counter;
508             P_diag_j[jj_counter]    = fine_to_coarse[i];
509             /* P_diag_data[jj_counter] = one; */
510             hypre_CSRBlockMatrixBlockCopyData(identity_block,
511                                               &P_diag_data[jj_counter*bnnz],
512                                               1.0, block_size);
513             jj_counter++;
514          }
515 
516          /*--------------------------------------------------------------------
517           *  If i is an F-point, build interpolation.
518           *--------------------------------------------------------------------*/
519 
520          else
521          {
522             /* Diagonal part of P */
523             P_diag_i[i] = jj_counter;
524             jj_begin_row = jj_counter;
525 
526             for (jj = S_diag_i[i]; jj < S_diag_i[i+1]; jj++)
527             {
528                i1 = S_diag_j[jj];
529 
530                /*--------------------------------------------------------------
531                 * If neighbor i1 is a C-point, set column number in P_diag_j
532                 * and initialize interpolation weight to zero.
533                 *--------------------------------------------------------------*/
534 
535                if (CF_marker[i1] >= 0)
536                {
537                   P_marker[i1] = jj_counter;
538                   P_diag_j[jj_counter]    = fine_to_coarse[i1];
539                   /* P_diag_data[jj_counter] = zero; */
540                   hypre_CSRBlockMatrixBlockCopyData(zero_block,
541                                                     &P_diag_data[jj_counter*bnnz],
542                                                     1.0, block_size);
543                   jj_counter++;
544                }
545 
546                /*--------------------------------------------------------------
547                 * If neighbor i1 is an F-point, mark it as a strong F-point
548                 * whose connection needs to be distributed.
549                 *--------------------------------------------------------------*/
550 
551                else if (CF_marker[i1] != -3)
552                {
553                   P_marker[i1] = strong_f_marker;
554                }
555             }
556             jj_end_row = jj_counter;
557 
558             /* Off-Diagonal part of P */
559             P_offd_i[i] = jj_counter_offd;
560             jj_begin_row_offd = jj_counter_offd;
561 
562 
563             if (num_procs > 1)
564             {
565                for (jj = S_offd_i[i]; jj < S_offd_i[i+1]; jj++)
566                {
567                   i1 = S_offd_j[jj];
568 
569                   /*-----------------------------------------------------------
570                    * If neighbor i1 is a C-point, set column number in P_offd_j
571                    * and initialize interpolation weight to zero.
572                    *-----------------------------------------------------------*/
573 
574                   if (CF_marker_offd[i1] >= 0)
575                   {
576                      P_marker_offd[i1] = jj_counter_offd;
577                      P_offd_j[jj_counter_offd]  = i1;
578                      /* P_offd_data[jj_counter_offd] = zero; */
579                      hypre_CSRBlockMatrixBlockCopyData(zero_block,
580                                                        &P_offd_data[jj_counter_offd*bnnz],
581                                                        1.0, block_size);
582 
583                       jj_counter_offd++;
584                   }
585 
586                   /*-----------------------------------------------------------
587                    * If neighbor i1 is an F-point, mark it as a strong F-point
588                    * whose connection needs to be distributed.
589                    *-----------------------------------------------------------*/
590 
591                   else if (CF_marker_offd[i1] != -3)
592                   {
593                      P_marker_offd[i1] = strong_f_marker;
594                   }
595                }
596             }
597 
598             jj_end_row_offd = jj_counter_offd;
599 
600 
601             /* get the diagonal block */
602             /* diagonal = A_diag_data[A_diag_i[i]]; */
603             hypre_CSRBlockMatrixBlockCopyData(&A_diag_data[A_diag_i[i]*bnnz], diagonal_block,
604                                               1.0, block_size);
605 
606 
607 
608             /* Here we go through the neighborhood of this grid point */
609 
610             /* Loop over ith row of A.  First, the diagonal part of A */
611 
612             for (jj = A_diag_i[i]+1; jj < A_diag_i[i+1]; jj++)
613             {
614                i1 = A_diag_j[jj];
615 
616                /*--------------------------------------------------------------
617                 * Case 1: neighbor i1 is a C-point and strongly influences i,
618                 * accumulate a_{i,i1} into the interpolation weight.
619                 *--------------------------------------------------------------*/
620 
621                if (P_marker[i1] >= jj_begin_row)
622                {
623                   /*   P_diag_data[P_marker[i1]] += A_diag_data[jj]; */
624                   hypre_CSRBlockMatrixBlockAddAccumulate(&A_diag_data[jj*bnnz],
625                                                          &P_diag_data[P_marker[i1]*bnnz],
626                                                          block_size);
627 
628                }
629 
630                /*--------------------------------------------------------------
631                 * Case 2: neighbor i1 is an F-point and strongly influences i,
632                 * distribute a_{i,i1} to C-points that strongly infuence i.
633                 * Note: currently no distribution to the diagonal in this case.
634                 *--------------------------------------------------------------*/
635 
636                else if (P_marker[i1] == strong_f_marker || (!add_weak_to_diag  && CF_marker[i1] != -3))
637                {
638                   /* initialize sum to zero */
639                   /* sum = zero; */
640                   hypre_CSRBlockMatrixBlockCopyData(zero_block, sum_block, 1.0,
641                                                     block_size);
642 
643 
644                   /*-----------------------------------------------------------
645                    * Loop over row of A for point i1 and calculate the sum
646                    * of the connections to c-points that strongly influence i.
647                    *-----------------------------------------------------------*/
648 
649                   /* Diagonal block part of row i1 */
650                   for (jj1 = A_diag_i[i1]; jj1 < A_diag_i[i1+1]; jj1++)
651                   {
652                      i2 = A_diag_j[jj1];
653                      if (P_marker[i2] >= jj_begin_row)
654                      {
655                         /* add diag data to sum */
656                         /* sum += A_diag_data[jj1]; */
657                         hypre_CSRBlockMatrixBlockAddAccumulate(&A_diag_data[jj1*bnnz],
658                                                                sum_block, block_size);
659                      }
660                   }
661 
662                   /* Off-Diagonal block part of row i1 */
663                   if (num_procs > 1)
664                   {
665                      for (jj1 = A_offd_i[i1]; jj1 < A_offd_i[i1+1]; jj1++)
666                      {
667                         i2 = A_offd_j[jj1];
668                         if (P_marker_offd[i2] >= jj_begin_row_offd )
669                         {
670                            /* add off diag data to sum */
671                            /*sum += A_offd_data[jj1];*/
672                            hypre_CSRBlockMatrixBlockAddAccumulate(&A_offd_data[jj1*bnnz],
673                                                                   sum_block, block_size);
674 
675                         }
676                      }
677                   }
678                   /* check whether sum_block is singular */
679 
680                   /* distribute = A_diag_data[jj] / sum;*/
681                   /* here we want: A_diag_data * sum^(-1) */
682                   /* note that results are uneffected for most problems if
683                      we do sum^(-1) * A_diag_data - but it seems to matter
684                      a little for very non-sym */
685 
686                   if (hypre_CSRBlockMatrixBlockMultInv(sum_block, &A_diag_data[jj*bnnz],
687                                                        distribute_block, block_size) == 0)
688                   {
689 
690 
691                      /*-----------------------------------------------------------
692                       * Loop over row of A for point i1 and do the distribution.
693                       *-----------------------------------------------------------*/
694 
695                      /* Diagonal block part of row i1 */
696                      for (jj1 = A_diag_i[i1]; jj1 < A_diag_i[i1+1]; jj1++)
697                      {
698                         i2 = A_diag_j[jj1];
699                         if (P_marker[i2] >= jj_begin_row )
700                         {
701 
702                            /*  P_diag_data[P_marker[i2]]
703                                += distribute * A_diag_data[jj1];*/
704 
705                            /* multiply - result in sum_block */
706                            hypre_CSRBlockMatrixBlockMultAdd(distribute_block,
707                                                             &A_diag_data[jj1*bnnz], 0.0,
708                                                             sum_block, block_size);
709 
710 
711                            /* add result to p_diag_data */
712                            hypre_CSRBlockMatrixBlockAddAccumulate(sum_block,
713                                                                   &P_diag_data[P_marker[i2]*bnnz],
714                                                                   block_size);
715 
716                         }
717                      }
718 
719                      /* Off-Diagonal block part of row i1 */
720                      if (num_procs > 1)
721                      {
722                         for (jj1 = A_offd_i[i1]; jj1 < A_offd_i[i1+1]; jj1++)
723                         {
724                            i2 = A_offd_j[jj1];
725                            if (P_marker_offd[i2] >= jj_begin_row_offd)
726                            {
727                               /* P_offd_data[P_marker_offd[i2]]
728                                  += distribute * A_offd_data[jj1]; */
729 
730                               /* multiply - result in sum_block */
731                               hypre_CSRBlockMatrixBlockMultAdd(distribute_block,
732                                                                &A_offd_data[jj1*bnnz], 0.0,
733                                                                sum_block, block_size);
734 
735 
736                               /* add result to p_offd_data */
737                               hypre_CSRBlockMatrixBlockAddAccumulate(sum_block,
738                                                                      &P_offd_data[P_marker_offd[i2]*bnnz],
739                                                                      block_size);
740                            }
741                         }
742                      }
743                   }
744                   else /* sum block is all zeros (or almost singular) - just add to diagonal */
745                   {
746                      /* diagonal += A_diag_data[jj]; */
747                      if (add_weak_to_diag) hypre_CSRBlockMatrixBlockAddAccumulate(&A_diag_data[jj*bnnz],
748                                                                                   diagonal_block,
749                                                                                   block_size);
750 
751                   }
752                }
753 
754                /*--------------------------------------------------------------
755                 * Case 3: neighbor i1 weakly influences i, accumulate a_{i,i1}
756                 * into the diagonal.
757                 *--------------------------------------------------------------*/
758 
759                else if (CF_marker[i1] != -3 && add_weak_to_diag)
760                {
761                   /* diagonal += A_diag_data[jj];*/
762                   hypre_CSRBlockMatrixBlockAddAccumulate(&A_diag_data[jj*bnnz],
763                                                          diagonal_block,
764                                                          block_size);
765 
766                }
767 
768             }
769 
770 
771             /*----------------------------------------------------------------
772              * Still looping over ith row of A. Next, loop over the
773              * off-diagonal part of A
774              *---------------------------------------------------------------*/
775 
776             if (num_procs > 1)
777             {
778                for (jj = A_offd_i[i]; jj < A_offd_i[i+1]; jj++)
779                {
780                   i1 = A_offd_j[jj];
781 
782                   /*--------------------------------------------------------------
783                    * Case 1: neighbor i1 is a C-point and strongly influences i,
784                    * accumulate a_{i,i1} into the interpolation weight.
785                    *--------------------------------------------------------------*/
786 
787                   if (P_marker_offd[i1] >= jj_begin_row_offd)
788                   {
789                      /* P_offd_data[P_marker_offd[i1]] += A_offd_data[jj]; */
790                      hypre_CSRBlockMatrixBlockAddAccumulate( &A_offd_data[jj*bnnz],
791                                                              &P_offd_data[P_marker_offd[i1]*bnnz],
792                                                              block_size);
793                   }
794 
795                   /*------------------------------------------------------------
796                    * Case 2: neighbor i1 is an F-point and strongly influences i,
797                    * distribute a_{i,i1} to C-points that strongly infuence i.
798                    * Note: currently no distribution to the diagonal in this case.
799                    *-----------------------------------------------------------*/
800 
801                   else if (P_marker_offd[i1] == strong_f_marker || (!add_weak_to_diag  && CF_marker[i1] != -3))
802                   {
803 
804                      /* initialize sum to zero */
805                      hypre_CSRBlockMatrixBlockCopyData(zero_block, sum_block,
806                                                        1.0, block_size);
807 
808                      /*---------------------------------------------------------
809                       * Loop over row of A_ext for point i1 and calculate the sum
810                       * of the connections to c-points that strongly influence i.
811                       *---------------------------------------------------------*/
812 
813                      /* find row number */
814                      c_num = A_offd_j[jj];
815 
816                      for (jj1 = A_ext_i[c_num]; jj1 < A_ext_i[c_num+1]; jj1++)
817                      {
818                         i2 = (HYPRE_Int)A_ext_j[jj1];
819 
820                         if (i2 > -1)
821                         {
822                            /* in the diagonal block */
823                            if (P_marker[i2] >= jj_begin_row)
824                            {
825                               /* sum += A_ext_data[jj1]; */
826                               hypre_CSRBlockMatrixBlockAddAccumulate(&A_ext_data[jj1*bnnz],
827                                                                      sum_block, block_size);
828                            }
829                         }
830                         else
831                         {
832                            /* in the off_diagonal block  */
833                            if (P_marker_offd[-i2-1] >= jj_begin_row_offd)
834                            {
835                               /* sum += A_ext_data[jj1]; */
836                               hypre_CSRBlockMatrixBlockAddAccumulate(&A_ext_data[jj1*bnnz],
837                                                                      sum_block, block_size);
838 
839                            }
840                         }
841                      }
842 
843                      /* check whether sum_block is singular */
844 
845 
846                      /* distribute = A_offd_data[jj] / sum;  */
847                      /* here we want: A_offd_data * sum^(-1) */
848                      if (hypre_CSRBlockMatrixBlockMultInv(sum_block, &A_offd_data[jj*bnnz],
849                                                           distribute_block, block_size) == 0)
850                      {
851 
852                         /*---------------------------------------------------------
853                          * Loop over row of A_ext for point i1 and do
854                          * the distribution.
855                          *--------------------------------------------------------*/
856 
857                         /* Diagonal block part of row i1 */
858 
859                         for (jj1 = A_ext_i[c_num]; jj1 < A_ext_i[c_num+1]; jj1++)
860                         {
861                            i2 = (HYPRE_Int)A_ext_j[jj1];
862 
863                            if (i2 > -1) /* in the diagonal block */
864                            {
865                               if (P_marker[i2] >= jj_begin_row)
866                               {
867                                  /* P_diag_data[P_marker[i2]]
868                                     += distribute * A_ext_data[jj1]; */
869 
870                                  /* multiply - result in sum_block */
871                                  hypre_CSRBlockMatrixBlockMultAdd(distribute_block,
872                                                                   &A_ext_data[jj1*bnnz], 0.0,
873                                                                   sum_block, block_size);
874 
875 
876                                  /* add result to p_diag_data */
877                                  hypre_CSRBlockMatrixBlockAddAccumulate(sum_block,
878                                                                         &P_diag_data[P_marker[i2]*bnnz],
879                                                                         block_size);
880 
881                               }
882                            }
883                            else
884                            {
885                               /* in the off_diagonal block  */
886                               if (P_marker_offd[-i2-1] >= jj_begin_row_offd)
887 
888                                  /*P_offd_data[P_marker_offd[-i2-1]]
889                                    += distribute * A_ext_data[jj1];*/
890                               {
891 
892                                  /* multiply - result in sum_block */
893                                  hypre_CSRBlockMatrixBlockMultAdd(distribute_block,
894                                                                   &A_ext_data[jj1*bnnz], 0.0,
895                                                                   sum_block, block_size);
896 
897 
898                                  /* add result to p_offd_data */
899                                  hypre_CSRBlockMatrixBlockAddAccumulate(sum_block,
900                                                                         &P_offd_data[P_marker_offd[-i2-1]*bnnz],
901                                                                         block_size);
902                               }
903 
904 
905                            }
906                         }
907                      }
908                      else /* sum block is all zeros - just add to diagonal */
909                      {
910                         /* diagonal += A_offd_data[jj]; */
911                         if (add_weak_to_diag) hypre_CSRBlockMatrixBlockAddAccumulate(&A_offd_data[jj*bnnz],
912                                                                                      diagonal_block,
913                                                                                      block_size);
914 
915                      }
916                   }
917 
918                   /*-----------------------------------------------------------
919                    * Case 3: neighbor i1 weakly influences i, accumulate a_{i,i1}
920                    * into the diagonal.
921                    *-----------------------------------------------------------*/
922 
923                   else if (CF_marker_offd[i1] != -3 && add_weak_to_diag)
924                   {
925                      /* diagonal += A_offd_data[jj]; */
926                      hypre_CSRBlockMatrixBlockAddAccumulate(&A_offd_data[jj*bnnz],
927                                                             diagonal_block,
928                                                             block_size);
929 
930                   }
931                }
932             }
933 
934             /*-----------------------------------------------------------------
935              * Set interpolation weight by dividing by the diagonal.
936              *-----------------------------------------------------------------*/
937 
938             for (jj = jj_begin_row; jj < jj_end_row; jj++)
939             {
940 
941                /* P_diag_data[jj] /= -diagonal; */
942 
943                /* want diagonal^(-1)*P_diag_data */
944                /* do division - put in sum_block */
945                if ( hypre_CSRBlockMatrixBlockInvMult(diagonal_block, &P_diag_data[jj*bnnz],
946                                                      sum_block, block_size) == 0)
947                {
948                   /* now copy to  P_diag_data[jj] and make negative */
949                   hypre_CSRBlockMatrixBlockCopyData(sum_block, &P_diag_data[jj*bnnz],
950                                                     -1.0, block_size);
951                }
952                else
953                {
954                   /* hypre_printf(" Warning! singular diagonal block! Proc id %d row %d\n", my_id,i);  */
955                   /* just make P_diag_data negative since diagonal is singular) */
956                   hypre_CSRBlockMatrixBlockCopyData(&P_diag_data[jj*bnnz], &P_diag_data[jj*bnnz],
957                                                     -1.0, block_size);
958 
959                }
960             }
961 
962             for (jj = jj_begin_row_offd; jj < jj_end_row_offd; jj++)
963             {
964                /* P_offd_data[jj] /= -diagonal; */
965 
966                /* do division - put in sum_block */
967                hypre_CSRBlockMatrixBlockInvMult(diagonal_block, &P_offd_data[jj*bnnz],
968                                                 sum_block, block_size);
969 
970                /* now copy to  P_offd_data[jj] and make negative */
971                hypre_CSRBlockMatrixBlockCopyData(sum_block, &P_offd_data[jj*bnnz],
972                                                  -1.0, block_size);
973 
974 
975 
976             }
977 
978          }
979 
980          strong_f_marker--;
981 
982          P_offd_i[i+1] = jj_counter_offd;
983       }
984       hypre_TFree(P_marker, HYPRE_MEMORY_HOST);
985       hypre_TFree(P_marker_offd, HYPRE_MEMORY_HOST);
986    }
987 
988    /* Now create P - as a block matrix */
989    P = hypre_ParCSRBlockMatrixCreate(comm, block_size,
990                                      hypre_ParCSRBlockMatrixGlobalNumRows(A),
991                                      total_global_cpts,
992                                      hypre_ParCSRBlockMatrixColStarts(A),
993                                      num_cpts_global,
994                                      0,
995                                      P_diag_i[n_fine],
996                                      P_offd_i[n_fine]);
997 
998    P_diag = hypre_ParCSRBlockMatrixDiag(P);
999    hypre_CSRBlockMatrixData(P_diag) = P_diag_data;
1000    hypre_CSRBlockMatrixI(P_diag) = P_diag_i;
1001    hypre_CSRBlockMatrixJ(P_diag) = P_diag_j;
1002 
1003    P_offd = hypre_ParCSRBlockMatrixOffd(P);
1004    hypre_CSRBlockMatrixData(P_offd) = P_offd_data;
1005    hypre_CSRBlockMatrixI(P_offd) = P_offd_i;
1006    hypre_CSRBlockMatrixJ(P_offd) = P_offd_j;
1007 
1008    /* Compress P, removing coefficients smaller than trunc_factor * Max */
1009    if (trunc_factor != 0.0  || max_elmts > 0)
1010    {
1011       hypre_BoomerAMGBlockInterpTruncation(P, trunc_factor, max_elmts);
1012       P_diag_data = hypre_CSRBlockMatrixData(P_diag);
1013       P_diag_i = hypre_CSRBlockMatrixI(P_diag);
1014       P_diag_j = hypre_CSRBlockMatrixJ(P_diag);
1015       P_offd_data = hypre_CSRBlockMatrixData(P_offd);
1016       P_offd_i = hypre_CSRBlockMatrixI(P_offd);
1017       P_offd_j = hypre_CSRBlockMatrixJ(P_offd);
1018       P_diag_size = P_diag_i[n_fine];
1019       P_offd_size = P_offd_i[n_fine];
1020    }
1021 
1022    num_cols_P_offd = 0;
1023    if (P_offd_size)
1024    {
1025       P_marker = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
1026 
1027       for (i=0; i < num_cols_A_offd; i++)
1028          P_marker[i] = 0;
1029 
1030       num_cols_P_offd = 0;
1031       for (i=0; i < P_offd_size; i++)
1032       {
1033          index = P_offd_j[i];
1034          if (!P_marker[index])
1035          {
1036             num_cols_P_offd++;
1037             P_marker[index] = 1;
1038          }
1039       }
1040 
1041       col_map_offd_P = hypre_CTAlloc(HYPRE_BigInt, num_cols_P_offd, HYPRE_MEMORY_HOST);
1042       tmp_map_offd = hypre_CTAlloc(HYPRE_Int, num_cols_P_offd, HYPRE_MEMORY_HOST);
1043 
1044       index = 0;
1045       for (i=0; i < num_cols_P_offd; i++)
1046       {
1047          while (P_marker[index]==0) index++;
1048          tmp_map_offd[i] = index++;
1049       }
1050 
1051       for (i=0; i < P_offd_size; i++)
1052          P_offd_j[i] = hypre_BinarySearch(tmp_map_offd,
1053                                           P_offd_j[i],
1054                                           num_cols_P_offd);
1055       hypre_TFree(P_marker, HYPRE_MEMORY_HOST);
1056    }
1057 
1058    for (i=0; i < n_fine; i++)
1059       if (CF_marker[i] == -3) CF_marker[i] = -1;
1060 
1061    if (num_cols_P_offd)
1062    {
1063       hypre_ParCSRBlockMatrixColMapOffd(P) = col_map_offd_P;
1064       hypre_CSRBlockMatrixNumCols(P_offd) = num_cols_P_offd;
1065    }
1066 
1067    /* use block version */
1068    hypre_GetCommPkgBlockRTFromCommPkgBlockA(P,A, tmp_map_offd, fine_to_coarse_offd);
1069 
1070 
1071    *P_ptr = P;
1072 
1073 
1074    hypre_TFree(zero_block, HYPRE_MEMORY_HOST);
1075    hypre_TFree(identity_block, HYPRE_MEMORY_HOST);
1076    hypre_TFree(diagonal_block, HYPRE_MEMORY_HOST);
1077    hypre_TFree(sum_block, HYPRE_MEMORY_HOST);
1078    hypre_TFree(distribute_block, HYPRE_MEMORY_HOST);
1079 
1080    hypre_TFree(CF_marker_offd, HYPRE_MEMORY_HOST);
1081    hypre_TFree(tmp_map_offd, HYPRE_MEMORY_HOST);
1082    hypre_TFree(int_buf_data, HYPRE_MEMORY_HOST);
1083    hypre_TFree(big_buf_data, HYPRE_MEMORY_HOST);
1084    hypre_TFree(fine_to_coarse, HYPRE_MEMORY_HOST);
1085    hypre_TFree(fine_to_coarse_offd, HYPRE_MEMORY_HOST);
1086    hypre_TFree(coarse_counter, HYPRE_MEMORY_HOST);
1087    hypre_TFree(jj_count, HYPRE_MEMORY_HOST);
1088    hypre_TFree(jj_count_offd, HYPRE_MEMORY_HOST);
1089 
1090    if (num_procs > 1) hypre_CSRBlockMatrixDestroy(A_ext);
1091 
1092    return hypre_error_flag;
1093 }
1094 
1095 /* 8/07 - not sure that it is appropriate to scale by the blocks - for
1096    now it is commented out - may want to change this or do something
1097    different  */
1098 
1099 HYPRE_Int
hypre_BoomerAMGBlockInterpTruncation(hypre_ParCSRBlockMatrix * P,HYPRE_Real trunc_factor,HYPRE_Int max_elmts)1100 hypre_BoomerAMGBlockInterpTruncation( hypre_ParCSRBlockMatrix *P,
1101                                       HYPRE_Real    trunc_factor,
1102                                       HYPRE_Int max_elmts)
1103 {
1104    hypre_CSRBlockMatrix *P_diag = hypre_ParCSRBlockMatrixDiag(P);
1105    HYPRE_Int     *P_diag_i = hypre_CSRBlockMatrixI(P_diag);
1106    HYPRE_Int     *P_diag_j = hypre_CSRBlockMatrixJ(P_diag);
1107    HYPRE_Real    *P_diag_data = hypre_CSRBlockMatrixData(P_diag);
1108    HYPRE_Int     *P_diag_j_new;
1109    HYPRE_Real    *P_diag_data_new;
1110 
1111    hypre_CSRBlockMatrix *P_offd = hypre_ParCSRBlockMatrixOffd(P);
1112    HYPRE_Int     *P_offd_i = hypre_CSRBlockMatrixI(P_offd);
1113    HYPRE_Int     *P_offd_j = hypre_CSRBlockMatrixJ(P_offd);
1114    HYPRE_Real    *P_offd_data = hypre_CSRBlockMatrixData(P_offd);
1115    HYPRE_Int     *P_offd_j_new;
1116    HYPRE_Real    *P_offd_data_new;
1117 
1118    HYPRE_Int   block_size = hypre_CSRBlockMatrixBlockSize(P_diag);
1119    HYPRE_Int   bnnz = block_size*block_size;
1120 
1121    HYPRE_Int   n_fine = hypre_CSRBlockMatrixNumRows(P_diag);
1122    HYPRE_Int   num_cols = hypre_CSRBlockMatrixNumCols(P_diag);
1123    HYPRE_Int   i, j, start_j, k;
1124    HYPRE_Int   ierr = 0;
1125    HYPRE_Int   next_open = 0;
1126    HYPRE_Int   now_checking = 0;
1127    HYPRE_Int   num_lost = 0;
1128    HYPRE_Int   next_open_offd = 0;
1129    HYPRE_Int   now_checking_offd = 0;
1130    HYPRE_Int   num_lost_offd = 0;
1131    HYPRE_Int   P_diag_size;
1132    HYPRE_Int   P_offd_size;
1133    HYPRE_Real  max_coef, tmp;
1134    HYPRE_Real *row_sum;
1135    HYPRE_Real *scale;
1136    HYPRE_Real *out_block;
1137    HYPRE_Int   cnt, cnt_diag, cnt_offd;
1138    HYPRE_Int   num_elmts;
1139 
1140    /* for now we will use the frobenius norm to
1141       determine whether to keep a block or not  - so norm_type = 1*/
1142    row_sum  = hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
1143    scale = hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
1144    out_block = hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
1145 
1146    if (trunc_factor > 0)
1147    {
1148       /* go through each row */
1149       for (i = 0; i < n_fine; i++)
1150       {
1151          max_coef = 0.0;
1152 
1153          /* diag */
1154          for (j = P_diag_i[i]; j < P_diag_i[i+1]; j++)
1155          {
1156             hypre_CSRBlockMatrixBlockNorm(1, &P_diag_data[j*bnnz] , &tmp, block_size);
1157             max_coef = (max_coef < tmp) ?  tmp : max_coef;
1158          }
1159 
1160          /* off_diag */
1161          for (j = P_offd_i[i]; j < P_offd_i[i+1]; j++)
1162          {
1163             hypre_CSRBlockMatrixBlockNorm(1, &P_offd_data[j*bnnz], &tmp, block_size);
1164             max_coef = (max_coef < tmp) ?  tmp : max_coef;
1165          }
1166 
1167          max_coef *= trunc_factor;
1168 
1169          start_j = P_diag_i[i];
1170          P_diag_i[i] -= num_lost;
1171 
1172          /* set scale and row sum to zero */
1173          hypre_CSRBlockMatrixBlockSetScalar(scale, 0.0, block_size);
1174          hypre_CSRBlockMatrixBlockSetScalar(row_sum, 0.0, block_size);
1175 
1176          for (j = start_j; j < P_diag_i[i+1]; j++)
1177          {
1178             /* row_sum += P_diag_data[now_checking];*/
1179             hypre_CSRBlockMatrixBlockAddAccumulate(&P_diag_data[now_checking*bnnz], row_sum, block_size);
1180 
1181             hypre_CSRBlockMatrixBlockNorm(1, &P_diag_data[now_checking*bnnz] , &tmp, block_size);
1182 
1183             if ( tmp < max_coef)
1184             {
1185                num_lost++;
1186                now_checking++;
1187             }
1188             else
1189             {
1190                /* scale += P_diag_data[now_checking]; */
1191                hypre_CSRBlockMatrixBlockAddAccumulate(&P_diag_data[now_checking*bnnz], scale, block_size);
1192 
1193                /* P_diag_data[next_open] = P_diag_data[now_checking]; */
1194                hypre_CSRBlockMatrixBlockCopyData( &P_diag_data[now_checking*bnnz], &P_diag_data[next_open*bnnz],
1195                                                   1.0, block_size);
1196 
1197                P_diag_j[next_open] = P_diag_j[now_checking];
1198                now_checking++;
1199                next_open++;
1200             }
1201          }
1202 
1203          start_j = P_offd_i[i];
1204          P_offd_i[i] -= num_lost_offd;
1205 
1206          for (j = start_j; j < P_offd_i[i+1]; j++)
1207          {
1208             /* row_sum += P_offd_data[now_checking_offd]; */
1209             hypre_CSRBlockMatrixBlockAddAccumulate(&P_offd_data[now_checking_offd*bnnz], row_sum, block_size);
1210 
1211             hypre_CSRBlockMatrixBlockNorm(1, &P_offd_data[now_checking_offd*bnnz] , &tmp, block_size);
1212 
1213             if ( tmp < max_coef)
1214             {
1215                num_lost_offd++;
1216                now_checking_offd++;
1217             }
1218             else
1219             {
1220                /* scale += P_offd_data[now_checking_offd]; */
1221                hypre_CSRBlockMatrixBlockAddAccumulate(&P_offd_data[now_checking_offd*bnnz], scale, block_size);
1222 
1223                /* P_offd_data[next_open_offd] = P_offd_data[now_checking_offd];*/
1224                hypre_CSRBlockMatrixBlockCopyData( &P_offd_data[now_checking_offd*bnnz], &P_offd_data[next_open_offd*bnnz],
1225                                                   1.0, block_size);
1226 
1227 
1228                P_offd_j[next_open_offd] = P_offd_j[now_checking_offd];
1229                now_checking_offd++;
1230                next_open_offd++;
1231             }
1232          }
1233          /* normalize row of P */
1234 #if 0
1235          /* out_block = row_sum/scale; */
1236          if (hypre_CSRBlockMatrixBlockInvMult(scale, row_sum, out_block, block_size) == 0)
1237          {
1238 
1239             for (j = P_diag_i[i]; j < (P_diag_i[i+1]-num_lost); j++)
1240             {
1241                /* P_diag_data[j] *= out_block; */
1242 
1243                /* put mult result in row_sum */
1244                hypre_CSRBlockMatrixBlockMultAdd(out_block, &P_diag_data[j*bnnz], 0.0,
1245                                                 row_sum, block_size);
1246                /* add to P_diag_data */
1247                hypre_CSRBlockMatrixBlockAddAccumulate(row_sum, &P_diag_data[j*bnnz], block_size);
1248             }
1249 
1250             for (j = P_offd_i[i]; j < (P_offd_i[i+1]-num_lost_offd); j++)
1251             {
1252 
1253                /* P_offd_data[j] *= out_block; */
1254 
1255                /* put mult result in row_sum */
1256                hypre_CSRBlockMatrixBlockMultAdd(out_block, &P_offd_data[j*bnnz], 0.0,
1257                                                 row_sum, block_size);
1258                /* add to to P_offd_data */
1259                hypre_CSRBlockMatrixBlockAddAccumulate(row_sum, &P_offd_data[j*bnnz], block_size);
1260 
1261             }
1262 
1263          }
1264 #endif
1265       }
1266 
1267       P_diag_i[n_fine] -= num_lost;
1268       P_offd_i[n_fine] -= num_lost_offd;
1269    }
1270    if (max_elmts > 0)
1271    {
1272       HYPRE_Int   P_mxnum, cnt1, rowlength;
1273       HYPRE_Int  *P_aux_j;
1274       HYPRE_Real *P_aux_data;
1275       HYPRE_Real *norm_array;
1276 
1277       rowlength = 0;
1278       if (n_fine)
1279          rowlength = P_diag_i[1]+P_offd_i[1];
1280       P_mxnum = rowlength;
1281       for (i=1; i<n_fine; i++)
1282       {
1283          rowlength = P_diag_i[i+1]-P_diag_i[i]+P_offd_i[i+1]-P_offd_i[i];
1284          if (rowlength > P_mxnum) P_mxnum = rowlength;
1285       }
1286       if (P_mxnum > max_elmts)
1287       {
1288          P_aux_j = hypre_CTAlloc(HYPRE_Int,  P_mxnum, HYPRE_MEMORY_HOST);
1289          P_aux_data = hypre_CTAlloc(HYPRE_Real,  P_mxnum*bnnz, HYPRE_MEMORY_HOST);
1290          cnt_diag = 0;
1291          cnt_offd = 0;
1292 
1293          for (i = 0; i < n_fine; i++)
1294          {
1295             hypre_CSRBlockMatrixBlockSetScalar(row_sum, 0.0, block_size);
1296             /*row_sum = 0; */
1297 
1298             num_elmts = P_diag_i[i+1]-P_diag_i[i]+P_offd_i[i+1]-P_offd_i[i];
1299             if (max_elmts < num_elmts)
1300             {
1301                cnt = 0;
1302                for (j = P_diag_i[i]; j < P_diag_i[i+1]; j++)
1303                {
1304                   P_aux_j[cnt] = P_diag_j[j];
1305                   /*P_aux_data[cnt++] = P_diag_data[j];*/
1306                   hypre_CSRBlockMatrixBlockCopyData(&P_diag_data[j*bnnz],
1307                                                     &P_aux_data[cnt*bnnz],
1308                                                     1.0, block_size);
1309                   cnt++;
1310                   /*row_sum += P_diag_data[j];*/
1311                   hypre_CSRBlockMatrixBlockAddAccumulate(&P_diag_data[j*bnnz], row_sum, block_size);
1312 
1313 
1314                }
1315                num_lost += cnt;
1316                cnt1 = cnt;
1317                for (j = P_offd_i[i]; j < P_offd_i[i+1]; j++)
1318                {
1319                   P_aux_j[cnt] = P_offd_j[j]+num_cols;
1320                   /*P_aux_data[cnt++] = P_offd_data[j];*/
1321                   hypre_CSRBlockMatrixBlockCopyData(&P_offd_data[j*bnnz],
1322                                                     &P_aux_data[cnt*bnnz],
1323                                                     1.0, block_size);
1324                   cnt++;
1325 
1326                   /*row_sum += P_offd_data[j];*/
1327                   hypre_CSRBlockMatrixBlockAddAccumulate(&P_offd_data[j*bnnz], row_sum, block_size);
1328 
1329 
1330                }
1331                num_lost_offd += cnt-cnt1;
1332                /* sort data */
1333                norm_array = hypre_CTAlloc(HYPRE_Real,  cnt, HYPRE_MEMORY_HOST);
1334                for (j=0; j< cnt; j++)
1335                {
1336                   hypre_CSRBlockMatrixBlockNorm(1, &P_aux_data[j*bnnz] , &norm_array[j], block_size);
1337                }
1338 
1339                hypre_block_qsort(P_aux_j, norm_array, P_aux_data,block_size, 0,cnt-1);
1340 
1341                hypre_TFree(norm_array, HYPRE_MEMORY_HOST);
1342 
1343                /* scale = 0; */
1344                hypre_CSRBlockMatrixBlockSetScalar(scale, 0.0, block_size);
1345                P_diag_i[i] = cnt_diag;
1346                P_offd_i[i] = cnt_offd;
1347                for (j = 0; j < max_elmts; j++)
1348                {
1349                   /* scale += P_aux_data[j];*/
1350                   hypre_CSRBlockMatrixBlockAddAccumulate(&P_aux_data[j*bnnz],
1351                                                          scale, block_size);
1352 
1353 
1354                   if (P_aux_j[j] < num_cols)
1355                   {
1356                      P_diag_j[cnt_diag] = P_aux_j[j];
1357                      /*P_diag_data[cnt_diag++] = P_aux_data[j];*/
1358                      hypre_CSRBlockMatrixBlockCopyData(&P_aux_data[j*bnnz],
1359                                                        &P_diag_data[cnt_diag*bnnz],
1360                                                        1.0, block_size);
1361 
1362                      cnt_diag++;
1363 
1364 
1365                   }
1366                   else
1367                   {
1368                      P_offd_j[cnt_offd] = P_aux_j[j]-num_cols;
1369                      /*P_offd_data[cnt_offd++] = P_aux_data[j];*/
1370                      hypre_CSRBlockMatrixBlockCopyData(&P_aux_data[j*bnnz],
1371                                                        &P_offd_data[cnt_offd*bnnz],
1372                                                        1.0, block_size);
1373                      cnt_offd++;
1374 
1375                   }
1376                }
1377                num_lost -= cnt_diag-P_diag_i[i];
1378                num_lost_offd -= cnt_offd-P_offd_i[i];
1379                /* normalize row of P */
1380                /* out_block = row_sum/scale; */
1381                /*if (scale != 0.)*/
1382 #if 0
1383                if (hypre_CSRBlockMatrixBlockInvMult(scale, row_sum, out_block, block_size) == 0)
1384                {
1385 
1386                   for (j = P_diag_i[i]; j < cnt_diag; j++)
1387                   {
1388 
1389                      /* P_diag_data[j] *= out_block; */
1390 
1391                      /* put mult result in row_sum */
1392                      hypre_CSRBlockMatrixBlockMultAdd(out_block, &P_diag_data[j*bnnz], 0.0,
1393                                                       row_sum, block_size);
1394                      /* add to P_diag_data */
1395                      hypre_CSRBlockMatrixBlockAddAccumulate(row_sum, &P_diag_data[j*bnnz], block_size);
1396                   }
1397 
1398                   for (j = P_offd_i[i]; j < cnt_offd; j++)
1399                   {
1400 
1401                      /* P_offd_data[j] *= out_block; */
1402 
1403                      /* put mult result in row_sum */
1404                      hypre_CSRBlockMatrixBlockMultAdd(out_block, &P_offd_data[j*bnnz], 0.0,
1405                                                       row_sum, block_size);
1406                      /* add to to P_offd_data */
1407                      hypre_CSRBlockMatrixBlockAddAccumulate(row_sum, &P_offd_data[j*bnnz], block_size);
1408                   }
1409 
1410 
1411                }
1412 #endif
1413             }
1414             else
1415             {
1416                if (P_diag_i[i] != cnt_diag)
1417                {
1418                   start_j = P_diag_i[i];
1419                   P_diag_i[i] = cnt_diag;
1420                   for (j = start_j; j < P_diag_i[i+1]; j++)
1421                   {
1422                      P_diag_j[cnt_diag] = P_diag_j[j];
1423                      /*P_diag_data[cnt_diag++] = P_diag_data[j];*/
1424                      hypre_CSRBlockMatrixBlockCopyData(&P_diag_data[j*bnnz],
1425                                                        &P_diag_data[cnt_diag*bnnz],
1426                                                        1.0, block_size);
1427                      cnt_diag++;
1428 
1429 
1430                   }
1431                }
1432                else
1433                   cnt_diag += P_diag_i[i+1]-P_diag_i[i];
1434                if (P_offd_i[i] != cnt_offd)
1435                {
1436                   start_j = P_offd_i[i];
1437                   P_offd_i[i] = cnt_offd;
1438                   for (j = start_j; j < P_offd_i[i+1]; j++)
1439                   {
1440                      P_offd_j[cnt_offd] = P_offd_j[j];
1441                      /*P_offd_data[cnt_offd++] = P_offd_data[j];*/
1442 
1443                      hypre_CSRBlockMatrixBlockCopyData(&P_offd_data[j*bnnz],
1444                                                        &P_offd_data[cnt_offd*bnnz],
1445                                                        1.0, block_size);
1446                      cnt_offd++;
1447                   }
1448                }
1449                else
1450                   cnt_offd += P_offd_i[i+1]-P_offd_i[i];
1451             }
1452          }
1453          P_diag_i[n_fine] = cnt_diag;
1454          P_offd_i[n_fine] = cnt_offd;
1455          hypre_TFree(P_aux_j, HYPRE_MEMORY_HOST);
1456          hypre_TFree(P_aux_data, HYPRE_MEMORY_HOST);
1457       }
1458    }
1459 
1460 
1461 
1462 
1463    if (num_lost)
1464    {
1465       P_diag_size = P_diag_i[n_fine];
1466       P_diag_j_new = hypre_CTAlloc(HYPRE_Int,  P_diag_size, HYPRE_MEMORY_HOST);
1467       P_diag_data_new = hypre_CTAlloc(HYPRE_Real,  P_diag_size*bnnz, HYPRE_MEMORY_HOST);
1468       for (i=0; i < P_diag_size; i++)
1469       {
1470          P_diag_j_new[i] = P_diag_j[i];
1471          for (k=0; k < bnnz; k++)
1472          {
1473             P_diag_data_new[i*bnnz+k] = P_diag_data[i*bnnz+k];
1474          }
1475 
1476       }
1477       hypre_TFree(P_diag_j, HYPRE_MEMORY_HOST);
1478       hypre_TFree(P_diag_data, HYPRE_MEMORY_HOST);
1479       hypre_CSRMatrixJ(P_diag) = P_diag_j_new;
1480       hypre_CSRMatrixData(P_diag) = P_diag_data_new;
1481       hypre_CSRMatrixNumNonzeros(P_diag) = P_diag_size;
1482    }
1483    if (num_lost_offd)
1484    {
1485       P_offd_size = P_offd_i[n_fine];
1486       P_offd_j_new = hypre_CTAlloc(HYPRE_Int,  P_offd_size, HYPRE_MEMORY_HOST);
1487       P_offd_data_new = hypre_CTAlloc(HYPRE_Real,  P_offd_size*bnnz, HYPRE_MEMORY_HOST);
1488       for (i=0; i < P_offd_size; i++)
1489       {
1490          P_offd_j_new[i] = P_offd_j[i];
1491          for (k=0; k < bnnz; k++)
1492          {
1493             P_offd_data_new[i*bnnz + k] = P_offd_data[i*bnnz + k];
1494          }
1495 
1496       }
1497       hypre_TFree(P_offd_j, HYPRE_MEMORY_HOST);
1498       hypre_TFree(P_offd_data, HYPRE_MEMORY_HOST);
1499       hypre_CSRMatrixJ(P_offd) = P_offd_j_new;
1500       hypre_CSRMatrixData(P_offd) = P_offd_data_new;
1501       hypre_CSRMatrixNumNonzeros(P_offd) = P_offd_size;
1502    }
1503 
1504    hypre_TFree(row_sum, HYPRE_MEMORY_HOST);
1505    hypre_TFree(scale, HYPRE_MEMORY_HOST);
1506    hypre_TFree(out_block, HYPRE_MEMORY_HOST);
1507 
1508    return ierr;
1509 }
1510 
1511 /*-----------------------------------------------*/
1512 /* compare on w, move v and blk_array */
1513 
hypre_block_qsort(HYPRE_Int * v,HYPRE_Complex * w,HYPRE_Complex * blk_array,HYPRE_Int block_size,HYPRE_Int left,HYPRE_Int right)1514 void hypre_block_qsort( HYPRE_Int  *v,
1515                         HYPRE_Complex *w,
1516                         HYPRE_Complex *blk_array,
1517                         HYPRE_Int   block_size,
1518                         HYPRE_Int   left,
1519                         HYPRE_Int   right )
1520 {
1521    HYPRE_Int i, last;
1522 
1523    if (left >= right)
1524       return;
1525 
1526    hypre_swap2( v, w, left, (left+right)/2);
1527    hypre_swap_blk(blk_array, block_size, left, (left+right)/2);
1528    last = left;
1529    for (i = left+1; i <= right; i++)
1530       if (hypre_cabs(w[i]) > hypre_cabs(w[left]))
1531       {
1532          hypre_swap2(v, w, ++last, i);
1533          hypre_swap_blk(blk_array, block_size, last, i);
1534       }
1535    hypre_swap2(v, w, left, last);
1536    hypre_swap_blk(blk_array, block_size, left, last);
1537    hypre_block_qsort(v, w, blk_array, block_size, left, last-1);
1538    hypre_block_qsort(v, w, blk_array, block_size, last+1, right);
1539 }
1540 
hypre_swap_blk(HYPRE_Complex * v,HYPRE_Int block_size,HYPRE_Int i,HYPRE_Int j)1541 void hypre_swap_blk( HYPRE_Complex *v,
1542                HYPRE_Int   block_size,
1543                HYPRE_Int   i,
1544                HYPRE_Int   j )
1545 {
1546    HYPRE_Int bnnz = block_size*block_size;
1547    HYPRE_Real    *temp;
1548 
1549    temp = hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
1550 
1551    /*temp = v[i];*/
1552    hypre_CSRBlockMatrixBlockCopyData(&v[i*bnnz],temp, 1.0, block_size);
1553    /*v[i] = v[j];*/
1554    hypre_CSRBlockMatrixBlockCopyData(&v[j*bnnz],&v[i*bnnz], 1.0, block_size);
1555    /* v[j] = temp; */
1556    hypre_CSRBlockMatrixBlockCopyData(temp,&v[j*bnnz], 1.0, block_size);
1557 
1558    hypre_TFree(temp, HYPRE_MEMORY_HOST);
1559 }
1560 
1561 /*---------------------------------------------------------------------------
1562  * hypre_BoomerAMGBlockBuildInterpDiag
1563 
1564  This is the block version of classical R-S interpolation. We use just the
1565  diagonals of these blocks.
1566 
1567  A and P are now Block matrices.  The Strength matrix S is not as it gives
1568  nodal strengths.
1569 
1570  CF_marker is size number of nodes.
1571 
1572  add_weak_to_diag  0 = don't add weak connections to diag (distribute instead)
1573  1 = do add
1574  *--------------------------------------------------------------------------*/
1575 
1576 HYPRE_Int
hypre_BoomerAMGBuildBlockInterpDiag(hypre_ParCSRBlockMatrix * A,HYPRE_Int * CF_marker,hypre_ParCSRMatrix * S,HYPRE_BigInt * num_cpts_global,HYPRE_Int num_functions,HYPRE_Int * dof_func,HYPRE_Int debug_flag,HYPRE_Real trunc_factor,HYPRE_Int max_elmts,HYPRE_Int add_weak_to_diag,hypre_ParCSRBlockMatrix ** P_ptr)1577 hypre_BoomerAMGBuildBlockInterpDiag( hypre_ParCSRBlockMatrix *A,
1578                                      HYPRE_Int                *CF_marker,
1579                                      hypre_ParCSRMatrix       *S,
1580                                      HYPRE_BigInt             *num_cpts_global,
1581                                      HYPRE_Int                 num_functions,
1582                                      HYPRE_Int                *dof_func,
1583                                      HYPRE_Int                 debug_flag,
1584                                      HYPRE_Real                trunc_factor,
1585                                      HYPRE_Int                 max_elmts,
1586                                      HYPRE_Int                 add_weak_to_diag,
1587                                      hypre_ParCSRBlockMatrix  **P_ptr)
1588 {
1589 
1590    MPI_Comm                 comm = hypre_ParCSRBlockMatrixComm(A);
1591    hypre_ParCSRCommPkg     *comm_pkg = hypre_ParCSRBlockMatrixCommPkg(A);
1592    hypre_ParCSRCommHandle  *comm_handle;
1593 
1594    hypre_CSRBlockMatrix  *A_diag = hypre_ParCSRBlockMatrixDiag(A);
1595    HYPRE_Real            *A_diag_data = hypre_CSRBlockMatrixData(A_diag);
1596    HYPRE_Int             *A_diag_i = hypre_CSRBlockMatrixI(A_diag);
1597    HYPRE_Int             *A_diag_j = hypre_CSRBlockMatrixJ(A_diag);
1598 
1599    HYPRE_Int              block_size = hypre_CSRBlockMatrixBlockSize(A_diag);
1600    HYPRE_Int              bnnz = block_size*block_size;
1601 
1602    hypre_CSRBlockMatrix  *A_offd = hypre_ParCSRBlockMatrixOffd(A);
1603    HYPRE_Real            *A_offd_data = hypre_CSRBlockMatrixData(A_offd);
1604    HYPRE_Int             *A_offd_i = hypre_CSRBlockMatrixI(A_offd);
1605    HYPRE_Int             *A_offd_j = hypre_CSRBlockMatrixJ(A_offd);
1606    HYPRE_Int              num_cols_A_offd = hypre_CSRBlockMatrixNumCols(A_offd);
1607    HYPRE_BigInt          *col_map_offd = hypre_ParCSRBlockMatrixColMapOffd(A);
1608 
1609    hypre_CSRMatrix       *S_diag = hypre_ParCSRMatrixDiag(S);
1610    HYPRE_Int             *S_diag_i = hypre_CSRMatrixI(S_diag);
1611    HYPRE_Int             *S_diag_j = hypre_CSRMatrixJ(S_diag);
1612 
1613    hypre_CSRMatrix       *S_offd = hypre_ParCSRMatrixOffd(S);
1614    HYPRE_Int             *S_offd_i = hypre_CSRMatrixI(S_offd);
1615    HYPRE_Int             *S_offd_j = hypre_CSRMatrixJ(S_offd);
1616 
1617    hypre_ParCSRBlockMatrix *P;
1618    HYPRE_BigInt          *col_map_offd_P;
1619    HYPRE_Int             *tmp_map_offd = NULL;
1620 
1621    HYPRE_Int             *CF_marker_offd = NULL;
1622 
1623    hypre_CSRBlockMatrix  *A_ext;
1624 
1625    HYPRE_Real            *A_ext_data = NULL;
1626    HYPRE_Int             *A_ext_i = NULL;
1627    HYPRE_BigInt          *A_ext_j = NULL;
1628 
1629    hypre_CSRBlockMatrix  *P_diag;
1630    hypre_CSRBlockMatrix  *P_offd;
1631 
1632    HYPRE_Real            *P_diag_data;
1633    HYPRE_Int             *P_diag_i;
1634    HYPRE_Int             *P_diag_j;
1635    HYPRE_Real            *P_offd_data;
1636    HYPRE_Int             *P_offd_i;
1637    HYPRE_Int             *P_offd_j;
1638 
1639    HYPRE_Int              P_diag_size, P_offd_size;
1640 
1641    HYPRE_Int             *P_marker, *P_marker_offd = NULL;
1642 
1643    HYPRE_Int              jj_counter,jj_counter_offd;
1644    HYPRE_Int             *jj_count, *jj_count_offd = NULL;
1645    HYPRE_Int              jj_begin_row,jj_begin_row_offd;
1646    HYPRE_Int              jj_end_row,jj_end_row_offd;
1647 
1648    HYPRE_Int              start_indexing = 0; /* start indexing for P_data at 0 */
1649 
1650    HYPRE_Int              n_fine = hypre_CSRBlockMatrixNumRows(A_diag);
1651 
1652    HYPRE_Int              strong_f_marker;
1653 
1654    HYPRE_Int             *fine_to_coarse;
1655    HYPRE_BigInt          *fine_to_coarse_offd = NULL;
1656    HYPRE_Int             *coarse_counter;
1657    HYPRE_Int              coarse_shift;
1658    HYPRE_BigInt           total_global_cpts;
1659    HYPRE_Int              num_cols_P_offd, my_first_cpt;
1660 
1661    HYPRE_Int              bd;
1662 
1663    HYPRE_Int              i,i1,i2;
1664    HYPRE_Int              j,jl,jj,jj1;
1665    HYPRE_Int              kc;
1666    HYPRE_BigInt           big_k;
1667    HYPRE_Int              start;
1668 
1669    HYPRE_Int              c_num;
1670 
1671    HYPRE_Int              my_id;
1672    HYPRE_Int              num_procs;
1673    HYPRE_Int              num_threads;
1674    HYPRE_Int              num_sends;
1675    HYPRE_Int              index;
1676    HYPRE_Int              ns, ne, size, rest;
1677    HYPRE_Int             *int_buf_data = NULL;
1678    HYPRE_BigInt          *big_buf_data = NULL;
1679 
1680    HYPRE_BigInt col_1 = hypre_ParCSRBlockMatrixFirstRowIndex(A);
1681    HYPRE_Int local_numrows = hypre_CSRBlockMatrixNumRows(A_diag);
1682    HYPRE_BigInt col_n = col_1 + (HYPRE_BigInt)local_numrows;
1683 
1684    HYPRE_Real       wall_time;  /* for debugging instrumentation  */
1685 
1686 
1687    HYPRE_Real       *identity_block;
1688    HYPRE_Real       *zero_block;
1689    HYPRE_Real       *diagonal_block;
1690    HYPRE_Real       *sum_block;
1691    HYPRE_Real       *distribute_block;
1692 
1693    HYPRE_Real       *sign;
1694 
1695    hypre_MPI_Comm_size(comm, &num_procs);
1696    hypre_MPI_Comm_rank(comm,&my_id);
1697    num_threads = hypre_NumThreads();
1698 
1699 
1700    my_first_cpt = num_cpts_global[0];
1701    if (my_id == (num_procs -1)) total_global_cpts = num_cpts_global[1];
1702    hypre_MPI_Bcast(&total_global_cpts, 1, HYPRE_MPI_BIG_INT, num_procs-1, comm);
1703 
1704    /*-------------------------------------------------------------------
1705     * Get the CF_marker data for the off-processor columns
1706     *-------------------------------------------------------------------*/
1707 
1708    if (debug_flag==4) wall_time = time_getWallclockSeconds();
1709 
1710    CF_marker_offd = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
1711 
1712 
1713    if (!comm_pkg)
1714    {
1715       hypre_BlockMatvecCommPkgCreate(A);
1716       comm_pkg = hypre_ParCSRBlockMatrixCommPkg(A);
1717    }
1718 
1719    num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
1720    int_buf_data = hypre_CTAlloc(HYPRE_Int,  hypre_ParCSRCommPkgSendMapStart(comm_pkg,
1721                                                        num_sends), HYPRE_MEMORY_HOST);
1722 
1723    index = 0;
1724    for (i = 0; i < num_sends; i++)
1725    {
1726       start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
1727       for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
1728       {
1729          int_buf_data[index++]
1730             = CF_marker[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
1731       }
1732 
1733    }
1734 
1735    /* we do not need the block version of comm handle - because
1736       CF_marker corresponds to the nodal matrix.  This call populates
1737       CF_marker_offd */
1738    comm_handle = hypre_ParCSRCommHandleCreate( 11, comm_pkg, int_buf_data,
1739                                                CF_marker_offd);
1740 
1741    hypre_ParCSRCommHandleDestroy(comm_handle);
1742 
1743 
1744    if (debug_flag==4)
1745    {
1746       wall_time = time_getWallclockSeconds() - wall_time;
1747       hypre_printf("Proc = %d     Interp: Comm 1 CF_marker =    %f\n",
1748                    my_id, wall_time);
1749       fflush(NULL);
1750    }
1751 
1752    /*----------------------------------------------------------------------
1753     * Get the ghost rows of A
1754     *---------------------------------------------------------------------*/
1755 
1756    if (debug_flag==4) wall_time = time_getWallclockSeconds();
1757 
1758    if (num_procs > 1)
1759    {
1760       A_ext      = hypre_ParCSRBlockMatrixExtractBExt(A, A, 1);
1761       A_ext_i    = hypre_CSRBlockMatrixI(A_ext);
1762       A_ext_j    = hypre_CSRBlockMatrixBigJ(A_ext);
1763       A_ext_data = hypre_CSRBlockMatrixData(A_ext);
1764    }
1765 
1766    index = 0;
1767    for (i=0; i < num_cols_A_offd; i++)
1768    {
1769       for (j=A_ext_i[i]; j < A_ext_i[i+1]; j++)
1770       {
1771          big_k = A_ext_j[j];
1772          if (big_k >= col_1 && big_k < col_n)
1773          {
1774             A_ext_j[index] = big_k - col_1;
1775             /* for the data field we must get all of the block data */
1776             for (bd = 0; bd < bnnz; bd++)
1777             {
1778                A_ext_data[index*bnnz + bd] = A_ext_data[j*bnnz + bd];
1779             }
1780             index++;
1781          }
1782          else
1783          {
1784             kc = hypre_BigBinarySearch(col_map_offd, big_k, num_cols_A_offd);
1785             if (kc > -1)
1786             {
1787                A_ext_j[index] = (HYPRE_BigInt)(-kc-1);
1788                for (bd = 0; bd < bnnz; bd++)
1789                {
1790                   A_ext_data[index*bnnz + bd] = A_ext_data[j*bnnz + bd];
1791                }
1792                index++;
1793             }
1794          }
1795       }
1796       A_ext_i[i] = index;
1797    }
1798    for (i = num_cols_A_offd; i > 0; i--)
1799       A_ext_i[i] = A_ext_i[i-1];
1800    if (num_procs > 1) A_ext_i[0] = 0;
1801 
1802    if (debug_flag==4)
1803    {
1804       wall_time = time_getWallclockSeconds() - wall_time;
1805       hypre_printf("Proc = %d  Interp: Comm 2   Get A_ext =  %f\n",
1806                    my_id, wall_time);
1807       fflush(NULL);
1808    }
1809 
1810 
1811    /*-----------------------------------------------------------------------
1812     *  First Pass: Determine size of P and fill in fine_to_coarse mapping.
1813     *-----------------------------------------------------------------------*/
1814 
1815    /*-----------------------------------------------------------------------
1816     *  Intialize counters and allocate mapping vector.
1817     *-----------------------------------------------------------------------*/
1818 
1819    coarse_counter = hypre_CTAlloc(HYPRE_Int,  num_threads, HYPRE_MEMORY_HOST);
1820    jj_count = hypre_CTAlloc(HYPRE_Int,  num_threads, HYPRE_MEMORY_HOST);
1821    jj_count_offd = hypre_CTAlloc(HYPRE_Int,  num_threads, HYPRE_MEMORY_HOST);
1822 
1823    fine_to_coarse = hypre_CTAlloc(HYPRE_Int,  n_fine, HYPRE_MEMORY_HOST);
1824 
1825    for (i = 0; i < n_fine; i++) fine_to_coarse[i] = -1;
1826 
1827    jj_counter = start_indexing;
1828    jj_counter_offd = start_indexing;
1829 
1830    /*-----------------------------------------------------------------------
1831     *  Loop over fine grid.
1832     *-----------------------------------------------------------------------*/
1833 
1834 
1835    for (j = 0; j < num_threads; j++)
1836    {
1837       size = n_fine/num_threads;
1838       rest = n_fine - size*num_threads;
1839       if (j < rest)
1840       {
1841          ns = j*size+j;
1842          ne = (j+1)*size+j+1;
1843       }
1844       else
1845       {
1846          ns = j*size+rest;
1847          ne = (j+1)*size+rest;
1848       }
1849 
1850 
1851       /* loop over the fine grid points */
1852       for (i = ns; i < ne; i++)
1853       {
1854 
1855          /*--------------------------------------------------------------------
1856           *  If i is a C-point, interpolation is the identity. Also set up
1857           *  mapping vector (fine_to_coarse is the mapping vector).
1858           *--------------------------------------------------------------------*/
1859 
1860          if (CF_marker[i] >= 0)
1861          {
1862             jj_count[j]++;
1863             fine_to_coarse[i] = coarse_counter[j];
1864             coarse_counter[j]++;
1865          }
1866 
1867          /*--------------------------------------------------------------------
1868           *  If i is an F-point, interpolation is from the C-points that
1869           *  strongly influence i.
1870           *--------------------------------------------------------------------*/
1871 
1872          else
1873          {
1874             for (jj = S_diag_i[i]; jj < S_diag_i[i+1]; jj++)
1875             {
1876                i1 = S_diag_j[jj];
1877                if (CF_marker[i1] >= 0)
1878                {
1879                   jj_count[j]++;
1880                }
1881             }
1882 
1883             if (num_procs > 1)
1884             {
1885                for (jj = S_offd_i[i]; jj < S_offd_i[i+1]; jj++)
1886                {
1887                   i1 = S_offd_j[jj];
1888                   if (CF_marker_offd[i1] >= 0)
1889                   {
1890                      jj_count_offd[j]++;
1891                   }
1892                }
1893             }
1894          }
1895       }
1896    }
1897 
1898    /*-----------------------------------------------------------------------
1899     *  Allocate  arrays.
1900     *-----------------------------------------------------------------------*/
1901 
1902    for (i=0; i < num_threads-1; i++)
1903    {
1904       coarse_counter[i+1] += coarse_counter[i];
1905       jj_count[i+1] += jj_count[i];
1906       jj_count_offd[i+1] += jj_count_offd[i];
1907    }
1908    i = num_threads-1;
1909    jj_counter = jj_count[i];
1910    jj_counter_offd = jj_count_offd[i];
1911 
1912    P_diag_size = jj_counter;
1913 
1914    P_diag_i    = hypre_CTAlloc(HYPRE_Int,  n_fine+1, HYPRE_MEMORY_HOST);
1915    P_diag_j    = hypre_CTAlloc(HYPRE_Int,  P_diag_size, HYPRE_MEMORY_HOST);
1916    /* we need to include the size of the blocks in the data size */
1917    P_diag_data = hypre_CTAlloc(HYPRE_Real,  P_diag_size*bnnz, HYPRE_MEMORY_HOST);
1918 
1919    P_diag_i[n_fine] = jj_counter;
1920 
1921 
1922    P_offd_size = jj_counter_offd;
1923 
1924    P_offd_i    = hypre_CTAlloc(HYPRE_Int,  n_fine+1, HYPRE_MEMORY_HOST);
1925    P_offd_j    = hypre_CTAlloc(HYPRE_Int,  P_offd_size, HYPRE_MEMORY_HOST);
1926    /* we need to include the size of the blocks in the data size */
1927    P_offd_data = hypre_CTAlloc(HYPRE_Real,  P_offd_size*bnnz, HYPRE_MEMORY_HOST);
1928 
1929    /*-----------------------------------------------------------------------
1930     *  Intialize some stuff.
1931     *-----------------------------------------------------------------------*/
1932 
1933    jj_counter = start_indexing;
1934    jj_counter_offd = start_indexing;
1935 
1936    if (debug_flag==4)
1937    {
1938       wall_time = time_getWallclockSeconds() - wall_time;
1939       hypre_printf("Proc = %d     Interp: Internal work 1 =     %f\n",
1940                    my_id, wall_time);
1941       fflush(NULL);
1942    }
1943 
1944    /* we need a block identity and a block of zeros*/
1945    identity_block = hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
1946    zero_block =  hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
1947 
1948 
1949 
1950    for(i = 0; i < block_size; i++)
1951    {
1952       identity_block[i*block_size + i] = 1.0;
1953    }
1954 
1955 
1956    /* we also need a block to keep track of the diagonal values and a sum */
1957    diagonal_block =  hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
1958    sum_block =  hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
1959    distribute_block =  hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
1960 
1961 
1962    sign =  hypre_CTAlloc(HYPRE_Real,  block_size, HYPRE_MEMORY_HOST);
1963 
1964    /*-----------------------------------------------------------------------
1965     *  Send and receive fine_to_coarse info.
1966     *-----------------------------------------------------------------------*/
1967 
1968    if (debug_flag==4) wall_time = time_getWallclockSeconds();
1969 
1970    fine_to_coarse_offd = hypre_CTAlloc(HYPRE_BigInt,  num_cols_A_offd, HYPRE_MEMORY_HOST);
1971    big_buf_data = hypre_CTAlloc(HYPRE_BigInt,  hypre_ParCSRCommPkgSendMapStart(comm_pkg,
1972                                                    num_sends), HYPRE_MEMORY_HOST);
1973 
1974    for (j = 0; j < num_threads; j++)
1975    {
1976       coarse_shift = 0;
1977       if (j > 0) coarse_shift = coarse_counter[j-1];
1978       size = n_fine/num_threads;
1979       rest = n_fine - size*num_threads;
1980       if (j < rest)
1981       {
1982          ns = j*size+j;
1983          ne = (j+1)*size+j+1;
1984       }
1985       else
1986       {
1987          ns = j*size+rest;
1988          ne = (j+1)*size+rest;
1989       }
1990       for (i = ns; i < ne; i++)
1991          fine_to_coarse[i] += coarse_shift;
1992    }
1993    index = 0;
1994    for (i = 0; i < num_sends; i++)
1995    {
1996       start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
1997       for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
1998          big_buf_data[index++] = my_first_cpt
1999             + fine_to_coarse[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
2000    }
2001 
2002    /* again, we do not need to use the block version of comm handle since
2003       the fine to coarse mapping is size of the nodes */
2004 
2005    comm_handle = hypre_ParCSRCommHandleCreate( 21, comm_pkg, big_buf_data,
2006                                                fine_to_coarse_offd);
2007 
2008    hypre_ParCSRCommHandleDestroy(comm_handle);
2009 
2010    if (debug_flag==4)
2011    {
2012       wall_time = time_getWallclockSeconds() - wall_time;
2013       hypre_printf("Proc = %d     Interp: Comm 4 FineToCoarse = %f\n",
2014                    my_id, wall_time);
2015       fflush(NULL);
2016    }
2017 
2018    if (debug_flag==4) wall_time = time_getWallclockSeconds();
2019 
2020    /*-----------------------------------------------------------------------
2021     *  Loop over fine grid points.
2022     *-----------------------------------------------------------------------*/
2023 
2024    for (jl = 0; jl < num_threads; jl++)
2025    {
2026       size = n_fine/num_threads;
2027       rest = n_fine - size*num_threads;
2028       if (jl < rest)
2029       {
2030          ns = jl*size+jl;
2031          ne = (jl+1)*size+jl+1;
2032       }
2033       else
2034       {
2035          ns = jl*size+rest;
2036          ne = (jl+1)*size+rest;
2037       }
2038       jj_counter = 0;
2039       if (jl > 0) jj_counter = jj_count[jl-1];
2040       jj_counter_offd = 0;
2041       if (jl > 0) jj_counter_offd = jj_count_offd[jl-1];
2042 
2043       P_marker = hypre_CTAlloc(HYPRE_Int,  n_fine, HYPRE_MEMORY_HOST);
2044       P_marker_offd = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
2045 
2046       for (i = 0; i < n_fine; i++)
2047       {
2048          P_marker[i] = -1;
2049       }
2050       for (i = 0; i < num_cols_A_offd; i++)
2051       {
2052          P_marker_offd[i] = -1;
2053       }
2054       strong_f_marker = -2;
2055 
2056       for (i = ns; i < ne; i++)
2057       {
2058 
2059          /*--------------------------------------------------------------------
2060           *  If i is a c-point, interpolation is the identity.
2061           *--------------------------------------------------------------------*/
2062 
2063          if (CF_marker[i] >= 0)
2064          {
2065             P_diag_i[i] = jj_counter;
2066             P_diag_j[jj_counter]    = fine_to_coarse[i];
2067             /* P_diag_data[jj_counter] = one; */
2068             hypre_CSRBlockMatrixBlockCopyData(identity_block,
2069                                               &P_diag_data[jj_counter*bnnz],
2070                                               1.0, block_size);
2071             jj_counter++;
2072          }
2073 
2074          /*--------------------------------------------------------------------
2075           *  If i is an F-point, build interpolation.
2076           *--------------------------------------------------------------------*/
2077 
2078          else
2079          {
2080             /* Diagonal part of P */
2081             P_diag_i[i] = jj_counter;
2082             jj_begin_row = jj_counter;
2083 
2084             for (jj = S_diag_i[i]; jj < S_diag_i[i+1]; jj++)
2085             {
2086                i1 = S_diag_j[jj];
2087 
2088                /*--------------------------------------------------------------
2089                 * If neighbor i1 is a C-point, set column number in P_diag_j
2090                 * and initialize interpolation weight to zero.
2091                 *--------------------------------------------------------------*/
2092 
2093                if (CF_marker[i1] >= 0)
2094                {
2095                   P_marker[i1] = jj_counter;
2096                   P_diag_j[jj_counter]    = fine_to_coarse[i1];
2097                   /* P_diag_data[jj_counter] = zero; */
2098                   hypre_CSRBlockMatrixBlockCopyData(zero_block,
2099                                                     &P_diag_data[jj_counter*bnnz],
2100                                                     1.0, block_size);
2101                   jj_counter++;
2102                }
2103 
2104                /*--------------------------------------------------------------
2105                 * If neighbor i1 is an F-point, mark it as a strong F-point
2106                 * whose connection needs to be distributed.
2107                 *--------------------------------------------------------------*/
2108 
2109                else if (CF_marker[i1] != -3)
2110                {
2111                   P_marker[i1] = strong_f_marker;
2112                }
2113             }
2114             jj_end_row = jj_counter;
2115 
2116             /* Off-Diagonal part of P */
2117             P_offd_i[i] = jj_counter_offd;
2118             jj_begin_row_offd = jj_counter_offd;
2119 
2120 
2121             if (num_procs > 1)
2122             {
2123                for (jj = S_offd_i[i]; jj < S_offd_i[i+1]; jj++)
2124                {
2125                   i1 = S_offd_j[jj];
2126 
2127                   /*-----------------------------------------------------------
2128                    * If neighbor i1 is a C-point, set column number in P_offd_j
2129                    * and initialize interpolation weight to zero.
2130                    *-----------------------------------------------------------*/
2131 
2132                   if (CF_marker_offd[i1] >= 0)
2133                   {
2134                      P_marker_offd[i1] = jj_counter_offd;
2135                      P_offd_j[jj_counter_offd]  = i1;
2136                      /* P_offd_data[jj_counter_offd] = zero; */
2137                      hypre_CSRBlockMatrixBlockCopyData(zero_block,
2138                                                        &P_offd_data[jj_counter_offd*bnnz],
2139                                                        1.0, block_size);
2140 
2141                      jj_counter_offd++;
2142                   }
2143 
2144                   /*-----------------------------------------------------------
2145                    * If neighbor i1 is an F-point, mark it as a strong F-point
2146                    * whose connection needs to be distributed.
2147                    *-----------------------------------------------------------*/
2148 
2149                   else if (CF_marker_offd[i1] != -3)
2150                   {
2151                      P_marker_offd[i1] = strong_f_marker;
2152                   }
2153                }
2154             }
2155 
2156             jj_end_row_offd = jj_counter_offd;
2157 
2158 
2159             /* get the diagonal block */
2160             /* diagonal = A_diag_data[A_diag_i[i]]; */
2161             hypre_CSRBlockMatrixBlockCopyDataDiag(&A_diag_data[A_diag_i[i]*bnnz], diagonal_block,
2162                                                   1.0, block_size);
2163 
2164 
2165 
2166             /* Here we go through the neighborhood of this grid point */
2167 
2168             /* Loop over ith row of A.  First, the diagonal part of A */
2169 
2170             for (jj = A_diag_i[i]+1; jj < A_diag_i[i+1]; jj++)
2171             {
2172                i1 = A_diag_j[jj];
2173 
2174                /*--------------------------------------------------------------
2175                 * Case 1: neighbor i1 is a C-point and strongly influences i,
2176                 * accumulate a_{i,i1} into the interpolation weight.
2177                 *--------------------------------------------------------------*/
2178 
2179                if (P_marker[i1] >= jj_begin_row)
2180                {
2181                   /*   P_diag_data[P_marker[i1]] += A_diag_data[jj]; */
2182                   hypre_CSRBlockMatrixBlockAddAccumulateDiag(&A_diag_data[jj*bnnz],
2183                                                              &P_diag_data[P_marker[i1]*bnnz],
2184                                                              block_size);
2185 
2186                }
2187 
2188                /*--------------------------------------------------------------
2189                 * Case 2: neighbor i1 is an F-point and strongly influences i,
2190                 * distribute a_{i,i1} to C-points that strongly infuence i.
2191                 * Note: currently no distribution to the diagonal in this case.
2192                 *--------------------------------------------------------------*/
2193 
2194                else if (P_marker[i1] == strong_f_marker || (!add_weak_to_diag  && CF_marker[i1] != -3))
2195                {
2196                   /* initialize sum to zero */
2197                   /* sum = zero; */
2198                   hypre_CSRBlockMatrixBlockCopyData(zero_block, sum_block, 1.0,
2199                                                     block_size);
2200 
2201 
2202                   /*-----------------------------------------------------------
2203                    * Loop over row of A for point i1 and calculate the sum
2204                    * of the connections to c-points that strongly influence i.
2205                    *-----------------------------------------------------------*/
2206 
2207                   hypre_CSRBlockMatrixComputeSign(&A_diag_data[A_diag_i[i1]*bnnz], sign, block_size);
2208 
2209 
2210                   /* Diagonal block part of row i1 */
2211                   for (jj1 = A_diag_i[i1]; jj1 < A_diag_i[i1+1]; jj1++)
2212                   {
2213                      i2 = A_diag_j[jj1];
2214                      if (P_marker[i2] >= jj_begin_row)
2215                      {
2216                         /* add diag data to sum */
2217                         /* sum += A_diag_data[jj1]; */
2218                         /* hypre_CSRBlockMatrixBlockAddAccumulateDiag(&A_diag_data[jj1*bnnz],
2219                            sum_block, block_size);*/
2220 
2221                         hypre_CSRBlockMatrixBlockAddAccumulateDiagCheckSign(&A_diag_data[jj1*bnnz],
2222                                                                             sum_block, block_size, sign);
2223                      }
2224                   }
2225 
2226                   /* Off-Diagonal block part of row i1 */
2227                   if (num_procs > 1)
2228                   {
2229                      for (jj1 = A_offd_i[i1]; jj1 < A_offd_i[i1+1]; jj1++)
2230                      {
2231                         i2 = A_offd_j[jj1];
2232                         if (P_marker_offd[i2] >= jj_begin_row_offd )
2233                         {
2234                            /* add off diag data to sum */
2235                            /*sum += A_offd_data[jj1];*/
2236                            /* hypre_CSRBlockMatrixBlockAddAccumulateDiag(&A_offd_data[jj1*bnnz],
2237                               sum_block, block_size);*/
2238                            hypre_CSRBlockMatrixBlockAddAccumulateDiagCheckSign(&A_offd_data[jj1*bnnz],
2239                                                                                sum_block, block_size, sign);
2240                         }
2241                      }
2242                   }
2243                   /* check whether sum_block is singular */
2244 
2245                   /* distribute = A_diag_data[jj] / sum;*/
2246                   /* here we want: A_diag_data * sum^(-1) */
2247 
2248                   if (hypre_CSRBlockMatrixBlockInvMultDiag(sum_block, &A_diag_data[jj*bnnz],
2249                                                            distribute_block, block_size) == 0)
2250                   {
2251 
2252 
2253                      /*-----------------------------------------------------------
2254                       * Loop over row of A for point i1 and do the distribution.
2255                       *-----------------------------------------------------------*/
2256 
2257                      /* Diagonal block part of row i1 */
2258                      for (jj1 = A_diag_i[i1]; jj1 < A_diag_i[i1+1]; jj1++)
2259                      {
2260                         i2 = A_diag_j[jj1];
2261                         if (P_marker[i2] >= jj_begin_row )
2262                         {
2263 
2264                            /*  P_diag_data[P_marker[i2]]
2265                                += distribute * A_diag_data[jj1];*/
2266 
2267                            /* multiply - result in sum_block */
2268                            hypre_CSRBlockMatrixBlockCopyData(zero_block,
2269                                                              sum_block, 1.0, block_size);
2270 
2271 
2272                            /* hypre_CSRBlockMatrixBlockMultAddDiag(distribute_block,
2273                               &A_diag_data[jj1*bnnz], 0.0,
2274                               sum_block, block_size);*/
2275                            hypre_CSRBlockMatrixBlockMultAddDiagCheckSign(distribute_block,
2276                                                                          &A_diag_data[jj1*bnnz], 0.0,
2277                                                                          sum_block, block_size, sign);
2278 
2279 
2280                            /* add result to p_diag_data */
2281                            hypre_CSRBlockMatrixBlockAddAccumulateDiag(sum_block,
2282                                                                       &P_diag_data[P_marker[i2]*bnnz],
2283                                                                       block_size);
2284 
2285                         }
2286                      }
2287 
2288                      /* Off-Diagonal block part of row i1 */
2289                      if (num_procs > 1)
2290                      {
2291                         for (jj1 = A_offd_i[i1]; jj1 < A_offd_i[i1+1]; jj1++)
2292                         {
2293                            i2 = A_offd_j[jj1];
2294                            if (P_marker_offd[i2] >= jj_begin_row_offd)
2295                            {
2296                               /* P_offd_data[P_marker_offd[i2]]
2297                                  += distribute * A_offd_data[jj1]; */
2298 
2299                               /* multiply - result in sum_block */
2300 
2301                               hypre_CSRBlockMatrixBlockCopyData(zero_block,
2302                                                                 sum_block, 1.0, block_size);
2303                               /* hypre_CSRBlockMatrixBlockMultAddDiag(distribute_block,
2304                                  &A_offd_data[jj1*bnnz], 0.0,
2305                                  sum_block, block_size); */
2306 
2307                               hypre_CSRBlockMatrixBlockMultAddDiagCheckSign(distribute_block,
2308                                                                             &A_offd_data[jj1*bnnz], 0.0,
2309                                                                             sum_block, block_size, sign);
2310 
2311 
2312 
2313                               /* add result to p_offd_data */
2314                               hypre_CSRBlockMatrixBlockAddAccumulateDiag(sum_block,
2315                                                                          &P_offd_data[P_marker_offd[i2]*bnnz],
2316                                                                          block_size);
2317 
2318 
2319                            }
2320                         }
2321                      }
2322                   }
2323                   else /* sum block is all zeros (or almost singular) - just add to diagonal */
2324                   {
2325                      /* diagonal += A_diag_data[jj]; */
2326                      if (add_weak_to_diag) hypre_CSRBlockMatrixBlockAddAccumulateDiag(&A_diag_data[jj*bnnz],
2327                                                                                       diagonal_block,
2328                                                                                       block_size);
2329                   }
2330                }
2331 
2332                /*--------------------------------------------------------------
2333                 * Case 3: neighbor i1 weakly influences i, accumulate a_{i,i1}
2334                 * into the diagonal.
2335                 *--------------------------------------------------------------*/
2336 
2337                else if (CF_marker[i1] != -3 && add_weak_to_diag)
2338                {
2339                   /* diagonal += A_diag_data[jj];*/
2340                   hypre_CSRBlockMatrixBlockAddAccumulateDiag(&A_diag_data[jj*bnnz],
2341                                                              diagonal_block,
2342                                                              block_size);
2343                }
2344             }
2345 
2346 
2347             /*----------------------------------------------------------------
2348              * Still looping over ith row of A. Next, loop over the
2349              * off-diagonal part of A
2350              *---------------------------------------------------------------*/
2351 
2352             if (num_procs > 1)
2353             {
2354                for (jj = A_offd_i[i]; jj < A_offd_i[i+1]; jj++)
2355                {
2356                   i1 = A_offd_j[jj];
2357 
2358                   /*--------------------------------------------------------------
2359                    * Case 1: neighbor i1 is a C-point and strongly influences i,
2360                    * accumulate a_{i,i1} into the interpolation weight.
2361                    *--------------------------------------------------------------*/
2362 
2363                   if (P_marker_offd[i1] >= jj_begin_row_offd)
2364                   {
2365                      /* P_offd_data[P_marker_offd[i1]] += A_offd_data[jj]; */
2366                      hypre_CSRBlockMatrixBlockAddAccumulateDiag( &A_offd_data[jj*bnnz],
2367                                                                  &P_offd_data[P_marker_offd[i1]*bnnz],
2368                                                                  block_size);
2369                   }
2370 
2371                   /*------------------------------------------------------------
2372                    * Case 2: neighbor i1 is an F-point and strongly influences i,
2373                    * distribute a_{i,i1} to C-points that strongly infuence i.
2374                    * Note: currently no distribution to the diagonal in this case.
2375                    *-----------------------------------------------------------*/
2376 
2377                   else if (P_marker_offd[i1] == strong_f_marker || (!add_weak_to_diag  && CF_marker[i1] != -3))
2378                   {
2379 
2380                      /* initialize sum to zero */
2381                      hypre_CSRBlockMatrixBlockCopyData(zero_block, sum_block,
2382                                                        1.0, block_size);
2383 
2384                      /*---------------------------------------------------------
2385                       * Loop over row of A_ext for point i1 and calculate the sum
2386                       * of the connections to c-points that strongly influence i.
2387                       *---------------------------------------------------------*/
2388 
2389                      /* find row number */
2390                      c_num = A_offd_j[jj];
2391 
2392                      hypre_CSRBlockMatrixComputeSign(&A_ext_data[A_ext_i[c_num]*bnnz], sign, block_size);
2393 
2394 
2395                      for (jj1 = A_ext_i[c_num]; jj1 < A_ext_i[c_num+1]; jj1++)
2396                      {
2397                         i2 = (HYPRE_Int)A_ext_j[jj1];
2398 
2399                         if (i2 > -1)
2400                         {
2401                            /* in the diagonal block */
2402                            if (P_marker[i2] >= jj_begin_row)
2403                            {
2404                               /* sum += A_ext_data[jj1]; */
2405                               /*  hypre_CSRBlockMatrixBlockAddAccumulateDiag(&A_ext_data[jj1*bnnz],
2406                                   sum_block, block_size);*/
2407                               hypre_CSRBlockMatrixBlockAddAccumulateDiagCheckSign(&A_ext_data[jj1*bnnz],
2408                                                                                   sum_block, block_size, sign);
2409                            }
2410                         }
2411                         else
2412                         {
2413                            /* in the off_diagonal block  */
2414                            if (P_marker_offd[-i2-1] >= jj_begin_row_offd)
2415                            {
2416                               /* sum += A_ext_data[jj1]; */
2417                               /* hypre_CSRBlockMatrixBlockAddAccumulateDiag(&A_ext_data[jj1*bnnz],
2418                                  sum_block, block_size);*/
2419                               hypre_CSRBlockMatrixBlockAddAccumulateDiagCheckSign(&A_ext_data[jj1*bnnz],
2420                                                                                   sum_block, block_size, sign);
2421                            }
2422                         }
2423                      }
2424 
2425                      /* check whether sum_block is singular */
2426 
2427 
2428                      /* distribute = A_offd_data[jj] / sum;  */
2429                      /* here we want: A_offd_data * sum^(-1) */
2430                      if (hypre_CSRBlockMatrixBlockInvMultDiag(sum_block, &A_offd_data[jj*bnnz],
2431                                                               distribute_block, block_size) == 0)
2432                      {
2433 
2434                         /*---------------------------------------------------------
2435                          * Loop over row of A_ext for point i1 and do
2436                          * the distribution.
2437                          *--------------------------------------------------------*/
2438 
2439                         /* Diagonal block part of row i1 */
2440 
2441                         for (jj1 = A_ext_i[c_num]; jj1 < A_ext_i[c_num+1]; jj1++)
2442                         {
2443                            i2 = (HYPRE_Int)A_ext_j[jj1];
2444 
2445                            if (i2 > -1) /* in the diagonal block */
2446                            {
2447                               if (P_marker[i2] >= jj_begin_row)
2448                               {
2449                                  /* P_diag_data[P_marker[i2]]
2450                                     += distribute * A_ext_data[jj1]; */
2451 
2452                                  /* multiply - result in sum_block */
2453                                  hypre_CSRBlockMatrixBlockCopyData(zero_block, sum_block, 1.0, block_size);
2454 
2455                                  /* hypre_CSRBlockMatrixBlockMultAddDiag(distribute_block,
2456                                     &A_ext_data[jj1*bnnz], 0.0,
2457                                     sum_block, block_size); */
2458 
2459                                  hypre_CSRBlockMatrixBlockMultAddDiagCheckSign(distribute_block,
2460                                                                                &A_ext_data[jj1*bnnz], 0.0,
2461                                                                                sum_block, block_size, sign);
2462                                  /* add result to p_diag_data */
2463                                  hypre_CSRBlockMatrixBlockAddAccumulateDiag(sum_block,
2464                                                                             &P_diag_data[P_marker[i2]*bnnz],
2465                                                                             block_size);
2466                               }
2467                            }
2468                            else
2469                            {
2470                               /* in the off_diagonal block  */
2471                               if (P_marker_offd[-i2-1] >= jj_begin_row_offd)
2472 
2473                                  /*P_offd_data[P_marker_offd[-i2-1]]
2474                                    += distribute * A_ext_data[jj1];*/
2475                               {
2476 
2477                                  /* multiply - result in sum_block */
2478                                  hypre_CSRBlockMatrixBlockCopyData(zero_block, sum_block, 1.0, block_size);
2479 
2480                                  /* hypre_CSRBlockMatrixBlockMultAddDiag(distribute_block,
2481                                     &A_ext_data[jj1*bnnz], 0.0,
2482                                     sum_block, block_size);*/
2483 
2484                                  hypre_CSRBlockMatrixBlockMultAddDiagCheckSign(distribute_block,
2485                                                                                &A_ext_data[jj1*bnnz], 0.0,
2486                                                                                sum_block, block_size, sign);
2487                                  /* add result to p_offd_data */
2488                                  hypre_CSRBlockMatrixBlockAddAccumulateDiag(sum_block,
2489                                                                             &P_offd_data[P_marker_offd[-i2-1]*bnnz],
2490                                                                             block_size);
2491                               }
2492                            }
2493                         }
2494                      }
2495                      else /* sum block is all zeros - just add to diagonal */
2496                      {
2497                         /* diagonal += A_offd_data[jj]; */
2498                         if (add_weak_to_diag) hypre_CSRBlockMatrixBlockAddAccumulateDiag(&A_offd_data[jj*bnnz],
2499                                                                                          diagonal_block,
2500                                                                                          block_size);
2501                      }
2502                   }
2503 
2504                   /*-----------------------------------------------------------
2505                    * Case 3: neighbor i1 weakly influences i, accumulate a_{i,i1}
2506                    * into the diagonal.
2507                    *-----------------------------------------------------------*/
2508 
2509                   else if (CF_marker_offd[i1] != -3 && add_weak_to_diag)
2510                   {
2511                      /* diagonal += A_offd_data[jj]; */
2512                      hypre_CSRBlockMatrixBlockAddAccumulateDiag(&A_offd_data[jj*bnnz],
2513                                               diagonal_block, block_size);
2514                   }
2515                }
2516             }
2517 
2518             /*-----------------------------------------------------------------
2519              * Set interpolation weight by dividing by the diagonal.
2520              *-----------------------------------------------------------------*/
2521 
2522             for (jj = jj_begin_row; jj < jj_end_row; jj++)
2523             {
2524 
2525                /* P_diag_data[jj] /= -diagonal; */
2526 
2527                /* want diagonal^(-1)*P_diag_data */
2528                /* do division - put in sum_block */
2529                if ( hypre_CSRBlockMatrixBlockInvMultDiag(diagonal_block, &P_diag_data[jj*bnnz],
2530                                                          sum_block, block_size) == 0)
2531                {
2532                   /* now copy to  P_diag_data[jj] and make negative */
2533                   hypre_CSRBlockMatrixBlockCopyData(sum_block, &P_diag_data[jj*bnnz],
2534                                                    -1.0, block_size);
2535                }
2536                else
2537                {
2538                   /* hypre_printf(" Warning! singular diagonal block! Proc id %d row %d\n", my_id,i);  */
2539                   /* just make P_diag_data negative since diagonal is zero */
2540                   hypre_CSRBlockMatrixBlockCopyData(&P_diag_data[jj*bnnz], &P_diag_data[jj*bnnz],
2541                                                     -1.0, block_size);
2542                }
2543             }
2544 
2545             for (jj = jj_begin_row_offd; jj < jj_end_row_offd; jj++)
2546             {
2547                /* P_offd_data[jj] /= -diagonal; */
2548 
2549                /* do division - put in sum_block */
2550                hypre_CSRBlockMatrixBlockInvMultDiag(diagonal_block, &P_offd_data[jj*bnnz],
2551                                                     sum_block, block_size);
2552 
2553                /* now copy to  P_offd_data[jj] and make negative */
2554                hypre_CSRBlockMatrixBlockCopyData(sum_block, &P_offd_data[jj*bnnz],
2555                                                  -1.0, block_size);
2556             }
2557          }
2558 
2559          strong_f_marker--;
2560 
2561          P_offd_i[i+1] = jj_counter_offd;
2562       }
2563       hypre_TFree(P_marker, HYPRE_MEMORY_HOST);
2564       hypre_TFree(P_marker_offd, HYPRE_MEMORY_HOST);
2565    }
2566 
2567    /* Now create P - as a block matrix */
2568    P = hypre_ParCSRBlockMatrixCreate(comm, block_size,
2569                                      hypre_ParCSRBlockMatrixGlobalNumRows(A),
2570                                      total_global_cpts,
2571                                      hypre_ParCSRBlockMatrixColStarts(A),
2572                                      num_cpts_global,
2573                                      0,
2574                                      P_diag_i[n_fine],
2575                                      P_offd_i[n_fine]);
2576 
2577 
2578    P_diag = hypre_ParCSRBlockMatrixDiag(P);
2579    hypre_CSRBlockMatrixData(P_diag) = P_diag_data;
2580    hypre_CSRBlockMatrixI(P_diag) = P_diag_i;
2581    hypre_CSRBlockMatrixJ(P_diag) = P_diag_j;
2582 
2583    P_offd = hypre_ParCSRBlockMatrixOffd(P);
2584    hypre_CSRBlockMatrixData(P_offd) = P_offd_data;
2585    hypre_CSRBlockMatrixI(P_offd) = P_offd_i;
2586    hypre_CSRBlockMatrixJ(P_offd) = P_offd_j;
2587 
2588    /* Compress P, removing coefficients smaller than trunc_factor * Max */
2589    if (trunc_factor != 0.0 || max_elmts > 0)
2590    {
2591       hypre_BoomerAMGBlockInterpTruncation(P, trunc_factor, max_elmts);
2592       P_diag_data = hypre_CSRBlockMatrixData(P_diag);
2593       P_diag_i = hypre_CSRBlockMatrixI(P_diag);
2594       P_diag_j = hypre_CSRBlockMatrixJ(P_diag);
2595       P_offd_data = hypre_CSRBlockMatrixData(P_offd);
2596       P_offd_i = hypre_CSRBlockMatrixI(P_offd);
2597       P_offd_j = hypre_CSRBlockMatrixJ(P_offd);
2598       P_diag_size = P_diag_i[n_fine];
2599       P_offd_size = P_offd_i[n_fine];
2600    }
2601 
2602 
2603    num_cols_P_offd = 0;
2604    if (P_offd_size)
2605    {
2606       P_marker = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
2607 
2608       for (i=0; i < num_cols_A_offd; i++)
2609          P_marker[i] = 0;
2610 
2611       num_cols_P_offd = 0;
2612       for (i=0; i < P_offd_size; i++)
2613       {
2614          index = P_offd_j[i];
2615          if (!P_marker[index])
2616          {
2617             num_cols_P_offd++;
2618             P_marker[index] = 1;
2619          }
2620       }
2621 
2622       tmp_map_offd = hypre_CTAlloc(HYPRE_Int, num_cols_P_offd, HYPRE_MEMORY_HOST);
2623       col_map_offd_P = hypre_CTAlloc(HYPRE_BigInt, num_cols_P_offd, HYPRE_MEMORY_HOST);
2624 
2625       index = 0;
2626       for (i=0; i < num_cols_P_offd; i++)
2627       {
2628          while (P_marker[index]==0) index++;
2629          tmp_map_offd[i] = index++;
2630       }
2631 
2632       for (i=0; i < P_offd_size; i++)
2633          P_offd_j[i] = hypre_BinarySearch(tmp_map_offd,
2634                                           P_offd_j[i],
2635                                           num_cols_P_offd);
2636       hypre_TFree(P_marker, HYPRE_MEMORY_HOST);
2637    }
2638 
2639    for (i=0; i < n_fine; i++)
2640       if (CF_marker[i] == -3) CF_marker[i] = -1;
2641 
2642    if (num_cols_P_offd)
2643    {
2644       hypre_ParCSRBlockMatrixColMapOffd(P) = col_map_offd_P;
2645       hypre_CSRBlockMatrixNumCols(P_offd) = num_cols_P_offd;
2646    }
2647 
2648    /* use block version */
2649    hypre_GetCommPkgBlockRTFromCommPkgBlockA(P,A, tmp_map_offd, fine_to_coarse_offd);
2650 
2651 
2652    *P_ptr = P;
2653 
2654    hypre_TFree(sign, HYPRE_MEMORY_HOST);
2655 
2656 
2657    hypre_TFree(zero_block, HYPRE_MEMORY_HOST);
2658    hypre_TFree(identity_block, HYPRE_MEMORY_HOST);
2659    hypre_TFree(diagonal_block, HYPRE_MEMORY_HOST);
2660    hypre_TFree(sum_block, HYPRE_MEMORY_HOST);
2661    hypre_TFree(distribute_block, HYPRE_MEMORY_HOST);
2662 
2663    hypre_TFree(tmp_map_offd, HYPRE_MEMORY_HOST);
2664    hypre_TFree(CF_marker_offd, HYPRE_MEMORY_HOST);
2665    hypre_TFree(int_buf_data, HYPRE_MEMORY_HOST);
2666    hypre_TFree(big_buf_data, HYPRE_MEMORY_HOST);
2667    hypre_TFree(fine_to_coarse, HYPRE_MEMORY_HOST);
2668    hypre_TFree(fine_to_coarse_offd, HYPRE_MEMORY_HOST);
2669    hypre_TFree(coarse_counter, HYPRE_MEMORY_HOST);
2670    hypre_TFree(jj_count, HYPRE_MEMORY_HOST);
2671    hypre_TFree(jj_count_offd, HYPRE_MEMORY_HOST);
2672 
2673    if (num_procs > 1) hypre_CSRBlockMatrixDestroy(A_ext);
2674 
2675    return(0);
2676 
2677 }
2678 
2679 
2680 /*---------------------------------------------------------------------------
2681  * hypre_BoomerAMGBlockBuildInterpRV
2682 
2683  Here we are modifying the block interp like in Ruge's elasticity paper
2684  (applied math comp '86) - only we don't include the diagonal
2685  for dist. the f-connect
2686 
2687 
2688  - when we do the distribution of the f-connection, we only distribute the error
2689  to like unknowns - this has the effect of only using the diagonal of the
2690  matrix for the f-distributions.  In addition, we will not differentiate
2691  between the strength of the f-connections (so nothing is added to the diag)
2692 
2693  *--------------------------------------------------------------------------*/
2694 
2695 HYPRE_Int
hypre_BoomerAMGBuildBlockInterpRV(hypre_ParCSRBlockMatrix * A,HYPRE_Int * CF_marker,hypre_ParCSRMatrix * S,HYPRE_BigInt * num_cpts_global,HYPRE_Int num_functions,HYPRE_Int * dof_func,HYPRE_Int debug_flag,HYPRE_Real trunc_factor,HYPRE_Int max_elmts,hypre_ParCSRBlockMatrix ** P_ptr)2696 hypre_BoomerAMGBuildBlockInterpRV( hypre_ParCSRBlockMatrix    *A,
2697                                    HYPRE_Int                  *CF_marker,
2698                                    hypre_ParCSRMatrix         *S,
2699                                    HYPRE_BigInt               *num_cpts_global,
2700                                    HYPRE_Int                   num_functions,
2701                                    HYPRE_Int                  *dof_func,
2702                                    HYPRE_Int                   debug_flag,
2703                                    HYPRE_Real                  trunc_factor,
2704                                    HYPRE_Int                   max_elmts,
2705                                    hypre_ParCSRBlockMatrix   **P_ptr)
2706 {
2707 
2708    MPI_Comm                 comm = hypre_ParCSRBlockMatrixComm(A);
2709    hypre_ParCSRCommPkg     *comm_pkg = hypre_ParCSRBlockMatrixCommPkg(A);
2710    hypre_ParCSRCommHandle  *comm_handle;
2711 
2712    hypre_CSRBlockMatrix  *A_diag = hypre_ParCSRBlockMatrixDiag(A);
2713    HYPRE_Real            *A_diag_data = hypre_CSRBlockMatrixData(A_diag);
2714    HYPRE_Int             *A_diag_i = hypre_CSRBlockMatrixI(A_diag);
2715    HYPRE_Int             *A_diag_j = hypre_CSRBlockMatrixJ(A_diag);
2716 
2717    HYPRE_Int              block_size = hypre_CSRBlockMatrixBlockSize(A_diag);
2718    HYPRE_Int              bnnz = block_size*block_size;
2719 
2720    hypre_CSRBlockMatrix  *A_offd = hypre_ParCSRBlockMatrixOffd(A);
2721    HYPRE_Real            *A_offd_data = hypre_CSRBlockMatrixData(A_offd);
2722    HYPRE_Int             *A_offd_i = hypre_CSRBlockMatrixI(A_offd);
2723    HYPRE_Int             *A_offd_j = hypre_CSRBlockMatrixJ(A_offd);
2724    HYPRE_Int              num_cols_A_offd = hypre_CSRBlockMatrixNumCols(A_offd);
2725    HYPRE_BigInt          *col_map_offd = hypre_ParCSRBlockMatrixColMapOffd(A);
2726 
2727    hypre_CSRMatrix       *S_diag = hypre_ParCSRMatrixDiag(S);
2728    HYPRE_Int             *S_diag_i = hypre_CSRMatrixI(S_diag);
2729    HYPRE_Int             *S_diag_j = hypre_CSRMatrixJ(S_diag);
2730 
2731    hypre_CSRMatrix       *S_offd = hypre_ParCSRMatrixOffd(S);
2732    HYPRE_Int             *S_offd_i = hypre_CSRMatrixI(S_offd);
2733    HYPRE_Int             *S_offd_j = hypre_CSRMatrixJ(S_offd);
2734 
2735    hypre_ParCSRBlockMatrix *P;
2736    HYPRE_BigInt            *col_map_offd_P;
2737    HYPRE_Int               *tmp_map_offd = NULL;
2738 
2739    HYPRE_Int             *CF_marker_offd = NULL;
2740 
2741    hypre_CSRBlockMatrix  *A_ext;
2742 
2743    HYPRE_Real            *A_ext_data = NULL;
2744    HYPRE_Int             *A_ext_i = NULL;
2745    HYPRE_BigInt          *A_ext_j = NULL;
2746 
2747    hypre_CSRBlockMatrix  *P_diag;
2748    hypre_CSRBlockMatrix  *P_offd;
2749 
2750    HYPRE_Real            *P_diag_data;
2751    HYPRE_Int             *P_diag_i;
2752    HYPRE_Int             *P_diag_j;
2753    HYPRE_Real            *P_offd_data;
2754    HYPRE_Int             *P_offd_i;
2755    HYPRE_Int             *P_offd_j;
2756 
2757    HYPRE_Int              P_diag_size, P_offd_size;
2758 
2759    HYPRE_Int             *P_marker, *P_marker_offd = NULL;
2760 
2761    HYPRE_Int              jj_counter,jj_counter_offd;
2762    HYPRE_Int             *jj_count, *jj_count_offd = NULL;
2763    HYPRE_Int              jj_begin_row,jj_begin_row_offd;
2764    HYPRE_Int              jj_end_row,jj_end_row_offd;
2765 
2766    HYPRE_Int              start_indexing = 0; /* start indexing for P_data at 0 */
2767 
2768    HYPRE_Int              n_fine = hypre_CSRBlockMatrixNumRows(A_diag);
2769 
2770    HYPRE_Int              strong_f_marker;
2771 
2772    HYPRE_Int             *fine_to_coarse;
2773    HYPRE_BigInt          *fine_to_coarse_offd = NULL;
2774    HYPRE_Int             *coarse_counter;
2775    HYPRE_Int              coarse_shift;
2776    HYPRE_BigInt           total_global_cpts;
2777    HYPRE_Int              num_cols_P_offd;
2778    HYPRE_BigInt           my_first_cpt;
2779 
2780    HYPRE_Int              bd;
2781 
2782    HYPRE_Int              i,i1,i2;
2783    HYPRE_Int              j,jl,jj,jj1;
2784    HYPRE_Int              kc;
2785    HYPRE_BigInt           big_k;
2786    HYPRE_Int              start;
2787 
2788    HYPRE_Int              c_num;
2789 
2790    HYPRE_Int              my_id;
2791    HYPRE_Int              num_procs;
2792    HYPRE_Int              num_threads;
2793    HYPRE_Int              num_sends;
2794    HYPRE_Int              index;
2795    HYPRE_Int              ns, ne, size, rest;
2796    HYPRE_Int             *int_buf_data = NULL;
2797    HYPRE_BigInt          *big_buf_data = NULL;
2798 
2799    HYPRE_BigInt col_1 = hypre_ParCSRBlockMatrixFirstRowIndex(A);
2800    HYPRE_Int local_numrows = hypre_CSRBlockMatrixNumRows(A_diag);
2801    HYPRE_BigInt col_n = col_1 + (HYPRE_BigInt)local_numrows;
2802 
2803    HYPRE_Real       wall_time;  /* for debugging instrumentation  */
2804 
2805 
2806    HYPRE_Real       *identity_block;
2807    HYPRE_Real       *zero_block;
2808    HYPRE_Real       *diagonal_block;
2809    HYPRE_Real       *sum_block;
2810    HYPRE_Real       *distribute_block;
2811 
2812    hypre_MPI_Comm_size(comm, &num_procs);
2813    hypre_MPI_Comm_rank(comm,&my_id);
2814    num_threads = hypre_NumThreads();
2815 
2816    my_first_cpt = num_cpts_global[0];
2817    if (my_id == (num_procs -1)) total_global_cpts = num_cpts_global[1];
2818    hypre_MPI_Bcast(&total_global_cpts, 1, HYPRE_MPI_BIG_INT, num_procs-1, comm);
2819 
2820    /*-------------------------------------------------------------------
2821     * Get the CF_marker data for the off-processor columns
2822     *-------------------------------------------------------------------*/
2823 
2824    if (debug_flag==4) wall_time = time_getWallclockSeconds();
2825 
2826    CF_marker_offd = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
2827 
2828 
2829    if (!comm_pkg)
2830    {
2831       hypre_BlockMatvecCommPkgCreate(A);
2832       comm_pkg = hypre_ParCSRBlockMatrixCommPkg(A);
2833    }
2834 
2835    num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
2836    int_buf_data = hypre_CTAlloc(HYPRE_Int,  hypre_ParCSRCommPkgSendMapStart(comm_pkg,
2837                                                       num_sends), HYPRE_MEMORY_HOST);
2838 
2839    index = 0;
2840    for (i = 0; i < num_sends; i++)
2841    {
2842       start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
2843       for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
2844       {
2845          int_buf_data[index++]
2846             = CF_marker[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
2847       }
2848 
2849    }
2850 
2851    /* we do not need the block version of comm handle - because
2852       CF_marker corresponds to the nodal matrix.  This call populates
2853       CF_marker_offd */
2854    comm_handle = hypre_ParCSRCommHandleCreate( 11, comm_pkg, int_buf_data,
2855                                                CF_marker_offd);
2856 
2857    hypre_ParCSRCommHandleDestroy(comm_handle);
2858 
2859 
2860    if (debug_flag==4)
2861    {
2862       wall_time = time_getWallclockSeconds() - wall_time;
2863       hypre_printf("Proc = %d     Interp: Comm 1 CF_marker =    %f\n",
2864                    my_id, wall_time);
2865       fflush(NULL);
2866    }
2867 
2868    /*----------------------------------------------------------------------
2869     * Get the ghost rows of A
2870     *---------------------------------------------------------------------*/
2871 
2872    if (debug_flag==4) wall_time = time_getWallclockSeconds();
2873 
2874    if (num_procs > 1)
2875    {
2876       A_ext      = hypre_ParCSRBlockMatrixExtractBExt(A, A, 1);
2877       A_ext_i    = hypre_CSRBlockMatrixI(A_ext);
2878       A_ext_j    = hypre_CSRBlockMatrixBigJ(A_ext);
2879       A_ext_data = hypre_CSRBlockMatrixData(A_ext);
2880    }
2881 
2882    index = 0;
2883    for (i=0; i < num_cols_A_offd; i++)
2884    {
2885       for (j=A_ext_i[i]; j < A_ext_i[i+1]; j++)
2886       {
2887          big_k = A_ext_j[j];
2888          if (big_k >= col_1 && big_k < col_n)
2889          {
2890             A_ext_j[index] = big_k - col_1;
2891             /* for the data field we must get all of the blocbig_k data */
2892             for (bd = 0; bd < bnnz; bd++)
2893             {
2894                A_ext_data[index*bnnz + bd] = A_ext_data[j*bnnz + bd];
2895             }
2896             index++;
2897          }
2898          else
2899          {
2900             kc = hypre_BigBinarySearch(col_map_offd, big_k, num_cols_A_offd);
2901             if (kc > -1)
2902             {
2903                A_ext_j[index] = (HYPRE_BigInt)(-kc-1);
2904                for (bd = 0; bd < bnnz; bd++)
2905                {
2906                   A_ext_data[index*bnnz + bd] = A_ext_data[j*bnnz + bd];
2907                }
2908                index++;
2909             }
2910          }
2911       }
2912       A_ext_i[i] = index;
2913    }
2914    for (i = num_cols_A_offd; i > 0; i--)
2915       A_ext_i[i] = A_ext_i[i-1];
2916    if (num_procs > 1) A_ext_i[0] = 0;
2917 
2918    if (debug_flag==4)
2919    {
2920       wall_time = time_getWallclockSeconds() - wall_time;
2921       hypre_printf("Proc = %d  Interp: Comm 2   Get A_ext =  %f\n",
2922                    my_id, wall_time);
2923       fflush(NULL);
2924    }
2925 
2926 
2927    /*-----------------------------------------------------------------------
2928     *  First Pass: Determine size of P and fill in fine_to_coarse mapping.
2929     *-----------------------------------------------------------------------*/
2930 
2931    /*-----------------------------------------------------------------------
2932     *  Intialize counters and allocate mapping vector.
2933     *-----------------------------------------------------------------------*/
2934 
2935    coarse_counter = hypre_CTAlloc(HYPRE_Int,  num_threads, HYPRE_MEMORY_HOST);
2936    jj_count = hypre_CTAlloc(HYPRE_Int,  num_threads, HYPRE_MEMORY_HOST);
2937    jj_count_offd = hypre_CTAlloc(HYPRE_Int,  num_threads, HYPRE_MEMORY_HOST);
2938 
2939    fine_to_coarse = hypre_CTAlloc(HYPRE_Int,  n_fine, HYPRE_MEMORY_HOST);
2940 
2941    for (i = 0; i < n_fine; i++) fine_to_coarse[i] = -1;
2942 
2943    jj_counter = start_indexing;
2944    jj_counter_offd = start_indexing;
2945 
2946    /*-----------------------------------------------------------------------
2947     *  Loop over fine grid.
2948     *-----------------------------------------------------------------------*/
2949 
2950    for (j = 0; j < num_threads; j++)
2951    {
2952       size = n_fine/num_threads;
2953       rest = n_fine - size*num_threads;
2954       if (j < rest)
2955       {
2956          ns = j*size+j;
2957          ne = (j+1)*size+j+1;
2958       }
2959       else
2960       {
2961          ns = j*size+rest;
2962          ne = (j+1)*size+rest;
2963       }
2964 
2965 
2966       /* loop over the fine grid points */
2967       for (i = ns; i < ne; i++)
2968       {
2969 
2970          /*--------------------------------------------------------------------
2971           *  If i is a C-point, interpolation is the identity. Also set up
2972           *  mapping vector (fine_to_coarse is the mapping vector).
2973           *--------------------------------------------------------------------*/
2974 
2975          if (CF_marker[i] >= 0)
2976          {
2977             jj_count[j]++;
2978             fine_to_coarse[i] = coarse_counter[j];
2979             coarse_counter[j]++;
2980          }
2981 
2982          /*--------------------------------------------------------------------
2983           *  If i is an F-point, interpolation is from the C-points that
2984           *  strongly influence i.
2985           *--------------------------------------------------------------------*/
2986 
2987          else
2988          {
2989             for (jj = S_diag_i[i]; jj < S_diag_i[i+1]; jj++)
2990             {
2991                i1 = S_diag_j[jj];
2992                if (CF_marker[i1] >= 0)
2993                {
2994                   jj_count[j]++;
2995                }
2996             }
2997 
2998             if (num_procs > 1)
2999             {
3000                for (jj = S_offd_i[i]; jj < S_offd_i[i+1]; jj++)
3001                {
3002                   i1 = S_offd_j[jj];
3003                   if (CF_marker_offd[i1] >= 0)
3004                   {
3005                      jj_count_offd[j]++;
3006                   }
3007                }
3008             }
3009          }
3010       }
3011    }
3012 
3013    /*-----------------------------------------------------------------------
3014     *  Allocate  arrays.
3015     *-----------------------------------------------------------------------*/
3016 
3017    for (i=0; i < num_threads-1; i++)
3018    {
3019       coarse_counter[i+1] += coarse_counter[i];
3020       jj_count[i+1] += jj_count[i];
3021       jj_count_offd[i+1] += jj_count_offd[i];
3022    }
3023    i = num_threads-1;
3024    jj_counter = jj_count[i];
3025    jj_counter_offd = jj_count_offd[i];
3026 
3027    P_diag_size = jj_counter;
3028 
3029    P_diag_i    = hypre_CTAlloc(HYPRE_Int,  n_fine+1, HYPRE_MEMORY_HOST);
3030    P_diag_j    = hypre_CTAlloc(HYPRE_Int,  P_diag_size, HYPRE_MEMORY_HOST);
3031    /* we need to include the size of the blocks in the data size */
3032    P_diag_data = hypre_CTAlloc(HYPRE_Real,  P_diag_size*bnnz, HYPRE_MEMORY_HOST);
3033 
3034    P_diag_i[n_fine] = jj_counter;
3035 
3036 
3037    P_offd_size = jj_counter_offd;
3038 
3039    P_offd_i    = hypre_CTAlloc(HYPRE_Int,  n_fine+1, HYPRE_MEMORY_HOST);
3040    P_offd_j    = hypre_CTAlloc(HYPRE_Int,  P_offd_size, HYPRE_MEMORY_HOST);
3041    /* we need to include the size of the blocks in the data size */
3042    P_offd_data = hypre_CTAlloc(HYPRE_Real,  P_offd_size*bnnz, HYPRE_MEMORY_HOST);
3043 
3044    /*-----------------------------------------------------------------------
3045     *  Intialize some stuff.
3046     *-----------------------------------------------------------------------*/
3047 
3048    jj_counter = start_indexing;
3049    jj_counter_offd = start_indexing;
3050 
3051    if (debug_flag==4)
3052    {
3053       wall_time = time_getWallclockSeconds() - wall_time;
3054       hypre_printf("Proc = %d     Interp: Internal work 1 =     %f\n",
3055                    my_id, wall_time);
3056       fflush(NULL);
3057    }
3058 
3059    /* we need a block identity and a block of zeros*/
3060    identity_block = hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
3061    zero_block =  hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
3062 
3063    for(i = 0; i < block_size; i++)
3064    {
3065       identity_block[i*block_size + i] = 1.0;
3066    }
3067 
3068 
3069    /* we also need a block to keep track of the diagonal values and a sum */
3070    diagonal_block =  hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
3071    sum_block =  hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
3072    distribute_block =  hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
3073 
3074    /*-----------------------------------------------------------------------
3075     *  Send and receive fine_to_coarse info.
3076     *-----------------------------------------------------------------------*/
3077 
3078    if (debug_flag==4) wall_time = time_getWallclockSeconds();
3079 
3080    fine_to_coarse_offd = hypre_CTAlloc(HYPRE_BigInt,  num_cols_A_offd, HYPRE_MEMORY_HOST);
3081    big_buf_data = hypre_CTAlloc(HYPRE_BigInt,  hypre_ParCSRCommPkgSendMapStart(comm_pkg,
3082                                                       num_sends), HYPRE_MEMORY_HOST);
3083 
3084    for (j = 0; j < num_threads; j++)
3085    {
3086       coarse_shift = 0;
3087       if (j > 0) coarse_shift = coarse_counter[j-1];
3088       size = n_fine/num_threads;
3089       rest = n_fine - size*num_threads;
3090       if (j < rest)
3091       {
3092          ns = j*size+j;
3093          ne = (j+1)*size+j+1;
3094       }
3095       else
3096       {
3097          ns = j*size+rest;
3098          ne = (j+1)*size+rest;
3099       }
3100       for (i = ns; i < ne; i++)
3101          fine_to_coarse[i] += coarse_shift;
3102    }
3103    index = 0;
3104    for (i = 0; i < num_sends; i++)
3105    {
3106       start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
3107       for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
3108          big_buf_data[index++] = my_first_cpt
3109             + fine_to_coarse[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
3110    }
3111 
3112    /* again, we do not need to use the block version of comm handle since
3113       the fine to coarse mapping is size of the nodes */
3114 
3115    comm_handle = hypre_ParCSRCommHandleCreate( 21, comm_pkg, big_buf_data,
3116                                                fine_to_coarse_offd);
3117 
3118    hypre_ParCSRCommHandleDestroy(comm_handle);
3119 
3120    if (debug_flag==4)
3121    {
3122       wall_time = time_getWallclockSeconds() - wall_time;
3123       hypre_printf("Proc = %d     Interp: Comm 4 FineToCoarse = %f\n",
3124                    my_id, wall_time);
3125       fflush(NULL);
3126    }
3127 
3128    if (debug_flag==4) wall_time = time_getWallclockSeconds();
3129 
3130    for (i = 0; i < n_fine; i++) fine_to_coarse[i] -= my_first_cpt;
3131 
3132    /*-----------------------------------------------------------------------
3133     *  Loop over fine grid points.
3134     *-----------------------------------------------------------------------*/
3135 
3136    for (jl = 0; jl < num_threads; jl++)
3137    {
3138       size = n_fine/num_threads;
3139       rest = n_fine - size*num_threads;
3140       if (jl < rest)
3141       {
3142          ns = jl*size+jl;
3143          ne = (jl+1)*size+jl+1;
3144       }
3145       else
3146       {
3147          ns = jl*size+rest;
3148          ne = (jl+1)*size+rest;
3149       }
3150       jj_counter = 0;
3151       if (jl > 0) jj_counter = jj_count[jl-1];
3152       jj_counter_offd = 0;
3153       if (jl > 0) jj_counter_offd = jj_count_offd[jl-1];
3154 
3155       P_marker = hypre_CTAlloc(HYPRE_Int,  n_fine, HYPRE_MEMORY_HOST);
3156       P_marker_offd = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
3157 
3158       for (i = 0; i < n_fine; i++)
3159       {
3160          P_marker[i] = -1;
3161       }
3162       for (i = 0; i < num_cols_A_offd; i++)
3163       {
3164          P_marker_offd[i] = -1;
3165       }
3166       strong_f_marker = -2;
3167 
3168       for (i = ns; i < ne; i++)
3169       {
3170 
3171          /*--------------------------------------------------------------------
3172           *  If i is a c-point, interpolation is the identity.
3173           *--------------------------------------------------------------------*/
3174 
3175          if (CF_marker[i] >= 0)
3176          {
3177             P_diag_i[i] = jj_counter;
3178             P_diag_j[jj_counter]    = fine_to_coarse[i];
3179             /* P_diag_data[jj_counter] = one; */
3180             hypre_CSRBlockMatrixBlockCopyData(identity_block,
3181                                               &P_diag_data[jj_counter*bnnz],
3182                                               1.0, block_size);
3183             jj_counter++;
3184          }
3185 
3186          /*--------------------------------------------------------------------
3187           *  If i is an F-point, build interpolation.
3188           *--------------------------------------------------------------------*/
3189 
3190          else
3191          {
3192             /* Diagonal part of P */
3193             P_diag_i[i] = jj_counter;
3194             jj_begin_row = jj_counter;
3195 
3196             for (jj = S_diag_i[i]; jj < S_diag_i[i+1]; jj++)
3197             {
3198                i1 = S_diag_j[jj];
3199 
3200                /*--------------------------------------------------------------
3201                 * If neighbor i1 is a C-point, set column number in P_diag_j
3202                 * and initialize interpolation weight to zero.
3203                 *--------------------------------------------------------------*/
3204 
3205                if (CF_marker[i1] >= 0)
3206                {
3207                   P_marker[i1] = jj_counter;
3208                   P_diag_j[jj_counter]    = fine_to_coarse[i1];
3209                   /* P_diag_data[jj_counter] = zero; */
3210                   hypre_CSRBlockMatrixBlockCopyData(zero_block,
3211                                                     &P_diag_data[jj_counter*bnnz],
3212                                                     1.0, block_size);
3213                   jj_counter++;
3214                }
3215 
3216                /*--------------------------------------------------------------
3217                 * If neighbor i1 is an F-point, mark it as a strong F-point
3218                 * whose connection needs to be distributed.
3219                 *--------------------------------------------------------------*/
3220 
3221                else if (CF_marker[i1] != -3)
3222                {
3223                   P_marker[i1] = strong_f_marker;
3224                }
3225             }
3226             jj_end_row = jj_counter;
3227 
3228             /* Off-Diagonal part of P */
3229             P_offd_i[i] = jj_counter_offd;
3230             jj_begin_row_offd = jj_counter_offd;
3231 
3232 
3233             if (num_procs > 1)
3234             {
3235                for (jj = S_offd_i[i]; jj < S_offd_i[i+1]; jj++)
3236                {
3237                   i1 = S_offd_j[jj];
3238 
3239                   /*-----------------------------------------------------------
3240                    * If neighbor i1 is a C-point, set column number in P_offd_j
3241                    * and initialize interpolation weight to zero.
3242                    *-----------------------------------------------------------*/
3243 
3244                   if (CF_marker_offd[i1] >= 0)
3245                   {
3246                      P_marker_offd[i1] = jj_counter_offd;
3247                      P_offd_j[jj_counter_offd]  = i1;
3248                      /* P_offd_data[jj_counter_offd] = zero; */
3249                      hypre_CSRBlockMatrixBlockCopyData(zero_block,
3250                                                        &P_offd_data[jj_counter_offd*bnnz],
3251                                                        1.0, block_size);
3252 
3253                      jj_counter_offd++;
3254                   }
3255 
3256                   /*-----------------------------------------------------------
3257                    * If neighbor i1 is an F-point, mark it as a strong F-point
3258                    * whose connection needs to be distributed.
3259                    *-----------------------------------------------------------*/
3260 
3261                   else if (CF_marker_offd[i1] != -3)
3262                   {
3263                      P_marker_offd[i1] = strong_f_marker;
3264                   }
3265                }
3266             }
3267 
3268             jj_end_row_offd = jj_counter_offd;
3269 
3270 
3271             /* get the diagonal block */
3272             /* diagonal = A_diag_data[A_diag_i[i]]; */
3273             hypre_CSRBlockMatrixBlockCopyData(&A_diag_data[A_diag_i[i]*bnnz], diagonal_block,
3274                                               1.0, block_size);
3275 
3276 
3277 
3278             /* Here we go through the neighborhood of this grid point */
3279 
3280             /* Loop over ith row of A.  First, the diagonal part of A */
3281 
3282             for (jj = A_diag_i[i]+1; jj < A_diag_i[i+1]; jj++)
3283             {
3284                i1 = A_diag_j[jj];
3285 
3286                /*--------------------------------------------------------------
3287                 * Case 1: neighbor i1 is a C-point and strongly influences i,
3288                 * accumulate a_{i,i1} into the interpolation weight.
3289                 *--------------------------------------------------------------*/
3290 
3291                if (P_marker[i1] >= jj_begin_row)
3292                {
3293                   /*   P_diag_data[P_marker[i1]] += A_diag_data[jj]; */
3294                   hypre_CSRBlockMatrixBlockAddAccumulate(&A_diag_data[jj*bnnz],
3295                                                          &P_diag_data[P_marker[i1]*bnnz],
3296                                                          block_size);
3297 
3298                }
3299 
3300                /*--------------------------------------------------------------
3301                 * Case 2: neighbor i1 is an F-point (MAY or MAY NOT strongly influences i),
3302                 * distribute a_{i,i1} to C-points that strongly infuence i.
3303                 * Note: currently no distribution to the diagonal in this case.
3304                 *--------------------------------------------------------------*/
3305 
3306                else if (P_marker[i1] == strong_f_marker || CF_marker[i1] != -3)
3307                {
3308                   /* initialize sum to zero */
3309                   /* sum = zero; */
3310                   hypre_CSRBlockMatrixBlockCopyData(zero_block, sum_block, 1.0,
3311                                                     block_size);
3312 
3313 
3314                   /*-----------------------------------------------------------
3315                    * Loop over row of A for point i1 and calculate the sum
3316                    * of the connections to c-points that strongly influence i.-
3317 
3318                    HERE WE ONLY WANT THE DIAG CONTIRBUTIONS (intra-unknown)
3319 
3320                    *-----------------------------------------------------------*/
3321 
3322                   /* Diagonal block part of row i1 */
3323                   for (jj1 = A_diag_i[i1]; jj1 < A_diag_i[i1+1]; jj1++)
3324                   {
3325                      i2 = A_diag_j[jj1];
3326                      if (P_marker[i2] >= jj_begin_row)
3327                      {
3328                         /* add diag data to sum */
3329                         /* sum += A_diag_data[jj1]; */
3330                         hypre_CSRBlockMatrixBlockAddAccumulateDiag(&A_diag_data[jj1*bnnz],
3331                                                                    sum_block, block_size);
3332                      }
3333                   }
3334 
3335                   /* Off-Diagonal block part of row i1 */
3336                   if (num_procs > 1)
3337                   {
3338                      for (jj1 = A_offd_i[i1]; jj1 < A_offd_i[i1+1]; jj1++)
3339                      {
3340                         i2 = A_offd_j[jj1];
3341                         if (P_marker_offd[i2] >= jj_begin_row_offd )
3342                         {
3343                            /* add off diag data to sum */
3344                            /*sum += A_offd_data[jj1];*/
3345                            hypre_CSRBlockMatrixBlockAddAccumulateDiag(&A_offd_data[jj1*bnnz],
3346                                                                       sum_block, block_size);
3347 
3348                         }
3349                      }
3350                   }
3351                   /* check whether sum_block is singular (NOW SUM IS A DIAG MATRIX)*/
3352                   /* distribute = A_diag_data[jj] / sum;  (if a diag element is 0 then
3353                      that col is scaled by 1 instead of 1/diag) - doesn'treturn 0*/
3354                   if (hypre_CSRBlockMatrixBlockInvMultDiag2(&A_diag_data[jj*bnnz], sum_block,
3355                                                             distribute_block, block_size) == 0)
3356                   {
3357 
3358                      /*-----------------------------------------------------------
3359                       * Loop over row of A for point i1 and do the distribution.-
3360                       HERE AGAIN WE ONLY WANT TO DIST W/IN A LIKE UNKNOWN
3361 
3362                       *-----------------------------------------------------------*/
3363 
3364                      /* Diagonal block part of row i1 */
3365                      for (jj1 = A_diag_i[i1]; jj1 < A_diag_i[i1+1]; jj1++)
3366                      {
3367                         i2 = A_diag_j[jj1];
3368                         if (P_marker[i2] >= jj_begin_row )
3369                         {
3370 
3371                            /*  P_diag_data[P_marker[i2]]
3372                                += distribute * A_diag_data[jj1];*/
3373 
3374                            /* multiply - result in sum_block */
3375                            hypre_CSRBlockMatrixBlockMultAddDiag2(distribute_block,
3376                                                                  &A_diag_data[jj1*bnnz], 0.0,
3377                                                                  sum_block, block_size);
3378 
3379 
3380                            /* add result to p_diag_data */
3381                            hypre_CSRBlockMatrixBlockAddAccumulate(sum_block,
3382                                                                   &P_diag_data[P_marker[i2]*bnnz],
3383                                                                   block_size);
3384 
3385                         }
3386                      }
3387 
3388                      /* Off-Diagonal block part of row i1 */
3389                      if (num_procs > 1)
3390                      {
3391                         for (jj1 = A_offd_i[i1]; jj1 < A_offd_i[i1+1]; jj1++)
3392                         {
3393                            i2 = A_offd_j[jj1];
3394                            if (P_marker_offd[i2] >= jj_begin_row_offd)
3395                            {
3396                               /* P_offd_data[P_marker_offd[i2]]
3397                                  += distribute * A_offd_data[jj1]; */
3398 
3399                               /* multiply - result in sum_block */
3400                               hypre_CSRBlockMatrixBlockMultAddDiag2(distribute_block,
3401                                                                     &A_offd_data[jj1*bnnz], 0.0,
3402                                                                     sum_block, block_size);
3403 
3404 
3405                               /* add result to p_offd_data */
3406                               hypre_CSRBlockMatrixBlockAddAccumulate(sum_block,
3407                                                                      &P_offd_data[P_marker_offd[i2]*bnnz],
3408                                                                      block_size);
3409                            }
3410                         }
3411                      }
3412                   } /* end of if sum */
3413                }/* end of case 1 or case 2*/
3414 
3415             }/* end of loop of diag part */
3416 
3417 
3418             /*----------------------------------------------------------------
3419              * Still looping over ith row of A. Next, loop over the
3420              * off-diagonal part of A
3421              *---------------------------------------------------------------*/
3422 
3423             if (num_procs > 1)
3424             {
3425                for (jj = A_offd_i[i]; jj < A_offd_i[i+1]; jj++)
3426                {
3427                   i1 = A_offd_j[jj];
3428 
3429                   /*--------------------------------------------------------------
3430                    * Case 1: neighbor i1 is a C-point and strongly influences i,
3431                    * accumulate a_{i,i1} into the interpolation weight.
3432                    *--------------------------------------------------------------*/
3433 
3434                   if (P_marker_offd[i1] >= jj_begin_row_offd)
3435                   {
3436                      /* P_offd_data[P_marker_offd[i1]] += A_offd_data[jj]; */
3437                      hypre_CSRBlockMatrixBlockAddAccumulate( &A_offd_data[jj*bnnz],
3438                                                              &P_offd_data[P_marker_offd[i1]*bnnz],
3439                                                              block_size);
3440                   }
3441 
3442                   /*------------------------------------------------------------
3443                    * Case 2: neighbor i1 is an F-point and (MAY or MAY NOT strongly influences i),
3444                    * distribute a_{i,i1} to C-points that strongly infuence i.
3445                    * Note: currently no distribution to the diagonal in this case.
3446                    *-----------------------------------------------------------*/
3447 
3448                   else if (P_marker_offd[i1] == strong_f_marker || CF_marker[i1] != -3 )
3449                   {
3450 
3451                      /* initialize sum to zero */
3452                      hypre_CSRBlockMatrixBlockCopyData(zero_block, sum_block,
3453                                                        1.0, block_size);
3454 
3455                      /*---------------------------------------------------------
3456                       * Loop over row of A_ext for point i1 and calculate the sum
3457                       * of the connections to c-points that strongly influence i.
3458 
3459 
3460                       HERE WE ONLY WANT THE DIAG CONTIRBUTIONS (intra-unknown)
3461 
3462                       *---------------------------------------------------------*/
3463 
3464                      /* find row number */
3465                      c_num = A_offd_j[jj];
3466 
3467                      for (jj1 = A_ext_i[c_num]; jj1 < A_ext_i[c_num+1]; jj1++)
3468                      {
3469                         i2 = (HYPRE_Int)A_ext_j[jj1];
3470 
3471                         if (i2 > -1)
3472                         {
3473                            /* in the diagonal block */
3474                            if (P_marker[i2] >= jj_begin_row)
3475                            {
3476                               /* sum += A_ext_data[jj1]; */
3477                               hypre_CSRBlockMatrixBlockAddAccumulateDiag(&A_ext_data[jj1*bnnz],
3478                                                                          sum_block, block_size);
3479                            }
3480                         }
3481                         else
3482                         {
3483                            /* in the off_diagonal block  */
3484                            if (P_marker_offd[-i2-1] >= jj_begin_row_offd)
3485                            {
3486                               /* sum += A_ext_data[jj1]; */
3487                               hypre_CSRBlockMatrixBlockAddAccumulateDiag(&A_ext_data[jj1*bnnz],
3488                                                                          sum_block, block_size);
3489 
3490                            }
3491                         }
3492                      }
3493 
3494                      /* check whether sum_block is singular */
3495 
3496 
3497                      /* distribute = A_offd_data[jj] / sum;  */
3498                      /* here we want: A_offd_data * sum^(-1) */
3499                      if (hypre_CSRBlockMatrixBlockInvMultDiag2(&A_offd_data[jj*bnnz], sum_block,
3500                                                                distribute_block, block_size) == 0)
3501                      {
3502 
3503                         /*---------------------------------------------------------
3504                          * Loop over row of A_ext for point i1 and do
3505                          * the distribution.
3506 
3507                          HERE AGAIN WE ONLY WANT TO DIST W/IN A LIKE UNKNOWN
3508 
3509                          *--------------------------------------------------------*/
3510 
3511                         /* Diagonal block part of row i1 */
3512 
3513                         for (jj1 = A_ext_i[c_num]; jj1 < A_ext_i[c_num+1]; jj1++)
3514                         {
3515                            i2 = (HYPRE_Int)A_ext_j[jj1];
3516 
3517                            if (i2 > -1) /* in the diagonal block */
3518                            {
3519                               if (P_marker[i2] >= jj_begin_row)
3520                               {
3521                                  /* P_diag_data[P_marker[i2]]
3522                                     += distribute * A_ext_data[jj1]; */
3523 
3524                                  /* multiply - result in sum_block */
3525                                  hypre_CSRBlockMatrixBlockMultAddDiag2(distribute_block,
3526                                                                        &A_ext_data[jj1*bnnz], 0.0,
3527                                                                        sum_block, block_size);
3528 
3529 
3530                                  /* add result to p_diag_data */
3531                                  hypre_CSRBlockMatrixBlockAddAccumulate(sum_block,
3532                                                                         &P_diag_data[P_marker[i2]*bnnz],
3533                                                                         block_size);
3534 
3535                               }
3536                            }
3537                            else
3538                            {
3539                               /* in the off_diagonal block  */
3540                               if (P_marker_offd[-i2-1] >= jj_begin_row_offd)
3541 
3542                                  /*P_offd_data[P_marker_offd[-i2-1]]
3543                                    += distribute * A_ext_data[jj1];*/
3544                               {
3545 
3546                                  /* multiply - result in sum_block */
3547                                  hypre_CSRBlockMatrixBlockMultAddDiag2(distribute_block,
3548                                                                        &A_ext_data[jj1*bnnz], 0.0,
3549                                                                        sum_block, block_size);
3550 
3551 
3552                                  /* add result to p_offd_data */
3553                                  hypre_CSRBlockMatrixBlockAddAccumulate(sum_block,
3554                                                                         &P_offd_data[P_marker_offd[-i2-1]*bnnz],
3555                                                                         block_size);
3556                               }
3557                            }
3558                         }
3559                      }
3560                   }
3561                }
3562             }
3563 
3564             /*-----------------------------------------------------------------
3565              * Set interpolation weight by dividing by the diagonal.
3566              *-----------------------------------------------------------------*/
3567 
3568             for (jj = jj_begin_row; jj < jj_end_row; jj++)
3569             {
3570 
3571                /* P_diag_data[jj] /= -diagonal; */
3572 
3573                /* want diagonal^(-1)*P_diag_data */
3574                /* do division - put in sum_block */
3575                if ( hypre_CSRBlockMatrixBlockInvMult(diagonal_block, &P_diag_data[jj*bnnz],
3576                                                      sum_block, block_size) == 0)
3577                {
3578                   /* now copy to  P_diag_data[jj] and make negative */
3579                   hypre_CSRBlockMatrixBlockCopyData(sum_block, &P_diag_data[jj*bnnz],
3580                                                     -1.0, block_size);
3581                }
3582                else
3583                {
3584                   /* hypre_printf(" Warning! singular diagonal block! Proc id %d row %d\n", my_id,i);  */
3585                   /* just make P_diag_data negative since diagonal is singular) */
3586                   hypre_CSRBlockMatrixBlockCopyData(&P_diag_data[jj*bnnz], &P_diag_data[jj*bnnz],
3587                                                     -1.0, block_size);
3588 
3589                }
3590             }
3591 
3592             for (jj = jj_begin_row_offd; jj < jj_end_row_offd; jj++)
3593             {
3594                /* P_offd_data[jj] /= -diagonal; */
3595 
3596                /* do division - put in sum_block */
3597                hypre_CSRBlockMatrixBlockInvMult(diagonal_block, &P_offd_data[jj*bnnz],
3598                                                 sum_block, block_size);
3599 
3600                /* now copy to  P_offd_data[jj] and make negative */
3601                hypre_CSRBlockMatrixBlockCopyData(sum_block, &P_offd_data[jj*bnnz],
3602                                                  -1.0, block_size);
3603 
3604 
3605 
3606             }
3607 
3608          }
3609 
3610          strong_f_marker--;
3611 
3612          P_offd_i[i+1] = jj_counter_offd;
3613       }
3614       hypre_TFree(P_marker, HYPRE_MEMORY_HOST);
3615       hypre_TFree(P_marker_offd, HYPRE_MEMORY_HOST);
3616    }
3617 
3618    /* Now create P - as a block matrix */
3619    P = hypre_ParCSRBlockMatrixCreate(comm, block_size,
3620                                      hypre_ParCSRBlockMatrixGlobalNumRows(A),
3621                                      total_global_cpts,
3622                                      hypre_ParCSRBlockMatrixColStarts(A),
3623                                      num_cpts_global,
3624                                      0,
3625                                      P_diag_i[n_fine],
3626                                      P_offd_i[n_fine]);
3627 
3628    P_diag = hypre_ParCSRBlockMatrixDiag(P);
3629    hypre_CSRBlockMatrixData(P_diag) = P_diag_data;
3630    hypre_CSRBlockMatrixI(P_diag) = P_diag_i;
3631    hypre_CSRBlockMatrixJ(P_diag) = P_diag_j;
3632 
3633    P_offd = hypre_ParCSRBlockMatrixOffd(P);
3634    hypre_CSRBlockMatrixData(P_offd) = P_offd_data;
3635    hypre_CSRBlockMatrixI(P_offd) = P_offd_i;
3636    hypre_CSRBlockMatrixJ(P_offd) = P_offd_j;
3637 
3638    /* Compress P, removing coefficients smaller than trunc_factor * Max */
3639    if (trunc_factor != 0.0 || max_elmts > 0)
3640    {
3641       hypre_BoomerAMGBlockInterpTruncation(P, trunc_factor, max_elmts);
3642       P_diag_data = hypre_CSRBlockMatrixData(P_diag);
3643       P_diag_i = hypre_CSRBlockMatrixI(P_diag);
3644       P_diag_j = hypre_CSRBlockMatrixJ(P_diag);
3645       P_offd_data = hypre_CSRBlockMatrixData(P_offd);
3646       P_offd_i = hypre_CSRBlockMatrixI(P_offd);
3647       P_offd_j = hypre_CSRBlockMatrixJ(P_offd);
3648       P_diag_size = P_diag_i[n_fine];
3649       P_offd_size = P_offd_i[n_fine];
3650    }
3651 
3652 
3653    num_cols_P_offd = 0;
3654    if (P_offd_size)
3655    {
3656       P_marker = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
3657 
3658 
3659       for (i=0; i < num_cols_A_offd; i++)
3660          P_marker[i] = 0;
3661 
3662       num_cols_P_offd = 0;
3663       for (i=0; i < P_offd_size; i++)
3664       {
3665          index = P_offd_j[i];
3666          if (!P_marker[index])
3667          {
3668             num_cols_P_offd++;
3669             P_marker[index] = 1;
3670          }
3671       }
3672 
3673       col_map_offd_P = hypre_CTAlloc(HYPRE_BigInt, num_cols_P_offd, HYPRE_MEMORY_HOST);
3674       tmp_map_offd = hypre_CTAlloc(HYPRE_Int, num_cols_P_offd, HYPRE_MEMORY_HOST);
3675 
3676       index = 0;
3677       for (i=0; i < num_cols_P_offd; i++)
3678       {
3679          while (P_marker[index]==0) index++;
3680          tmp_map_offd[i] = index++;
3681       }
3682 
3683       for (i=0; i < P_offd_size; i++)
3684          P_offd_j[i] = hypre_BinarySearch(tmp_map_offd,
3685                                           P_offd_j[i],
3686                                           num_cols_P_offd);
3687       hypre_TFree(P_marker, HYPRE_MEMORY_HOST);
3688    }
3689 
3690    for (i=0; i < n_fine; i++)
3691       if (CF_marker[i] == -3) CF_marker[i] = -1;
3692 
3693    if (num_cols_P_offd)
3694    {
3695       hypre_ParCSRBlockMatrixColMapOffd(P) = col_map_offd_P;
3696       hypre_CSRBlockMatrixNumCols(P_offd) = num_cols_P_offd;
3697    }
3698 
3699    /* use block version */
3700    hypre_GetCommPkgBlockRTFromCommPkgBlockA(P,A, tmp_map_offd, fine_to_coarse_offd);
3701 
3702 
3703    *P_ptr = P;
3704 
3705 
3706    hypre_TFree(zero_block, HYPRE_MEMORY_HOST);
3707    hypre_TFree(identity_block, HYPRE_MEMORY_HOST);
3708    hypre_TFree(diagonal_block, HYPRE_MEMORY_HOST);
3709    hypre_TFree(sum_block, HYPRE_MEMORY_HOST);
3710    hypre_TFree(distribute_block, HYPRE_MEMORY_HOST);
3711 
3712    hypre_TFree(CF_marker_offd, HYPRE_MEMORY_HOST);
3713    hypre_TFree(tmp_map_offd, HYPRE_MEMORY_HOST);
3714    hypre_TFree(big_buf_data, HYPRE_MEMORY_HOST);
3715    hypre_TFree(int_buf_data, HYPRE_MEMORY_HOST);
3716    hypre_TFree(fine_to_coarse, HYPRE_MEMORY_HOST);
3717    hypre_TFree(fine_to_coarse_offd, HYPRE_MEMORY_HOST);
3718    hypre_TFree(coarse_counter, HYPRE_MEMORY_HOST);
3719    hypre_TFree(jj_count, HYPRE_MEMORY_HOST);
3720    hypre_TFree(jj_count_offd, HYPRE_MEMORY_HOST);
3721 
3722    if (num_procs > 1) hypre_CSRBlockMatrixDestroy(A_ext);
3723 
3724    return(0);
3725 
3726 }
3727 
3728 /*---------------------------------------------------------------------------
3729  * hypre_BoomerAMGBlockBuildInterpRV2
3730 
3731  Here we are modifying the block interp like in Ruge's elasticity paper as above
3732  (applied math comp '86), only instead of using just the diagonals of the
3733  scaling matrices (for the fine connections), we use a diagonal matrix
3734  whose diag entries are the row sumes (like suggested in Tanya Clees thesis
3735  for direct interp)
3736 
3737  -again there is no differentiation for weak/strong f-connections
3738 
3739  *--------------------------------------------------------------------------*/
3740 
3741 HYPRE_Int
hypre_BoomerAMGBuildBlockInterpRV2(hypre_ParCSRBlockMatrix * A,HYPRE_Int * CF_marker,hypre_ParCSRMatrix * S,HYPRE_BigInt * num_cpts_global,HYPRE_Int num_functions,HYPRE_Int * dof_func,HYPRE_Int debug_flag,HYPRE_Real trunc_factor,HYPRE_Int max_elmts,hypre_ParCSRBlockMatrix ** P_ptr)3742 hypre_BoomerAMGBuildBlockInterpRV2( hypre_ParCSRBlockMatrix   *A,
3743                                     HYPRE_Int                 *CF_marker,
3744                                     hypre_ParCSRMatrix        *S,
3745                                     HYPRE_BigInt              *num_cpts_global,
3746                                     HYPRE_Int                  num_functions,
3747                                     HYPRE_Int                 *dof_func,
3748                                     HYPRE_Int                  debug_flag,
3749                                     HYPRE_Real                 trunc_factor,
3750                                     HYPRE_Int                  max_elmts,
3751                                     hypre_ParCSRBlockMatrix  **P_ptr)
3752 {
3753 
3754    MPI_Comm           comm = hypre_ParCSRBlockMatrixComm(A);
3755    hypre_ParCSRCommPkg     *comm_pkg = hypre_ParCSRBlockMatrixCommPkg(A);
3756    hypre_ParCSRCommHandle  *comm_handle;
3757 
3758    hypre_CSRBlockMatrix *A_diag = hypre_ParCSRBlockMatrixDiag(A);
3759    HYPRE_Real           *A_diag_data = hypre_CSRBlockMatrixData(A_diag);
3760    HYPRE_Int            *A_diag_i = hypre_CSRBlockMatrixI(A_diag);
3761    HYPRE_Int            *A_diag_j = hypre_CSRBlockMatrixJ(A_diag);
3762 
3763    HYPRE_Int             block_size = hypre_CSRBlockMatrixBlockSize(A_diag);
3764    HYPRE_Int             bnnz = block_size*block_size;
3765 
3766    hypre_CSRBlockMatrix *A_offd = hypre_ParCSRBlockMatrixOffd(A);
3767    HYPRE_Real      *A_offd_data = hypre_CSRBlockMatrixData(A_offd);
3768    HYPRE_Int             *A_offd_i = hypre_CSRBlockMatrixI(A_offd);
3769    HYPRE_Int             *A_offd_j = hypre_CSRBlockMatrixJ(A_offd);
3770    HYPRE_Int              num_cols_A_offd = hypre_CSRBlockMatrixNumCols(A_offd);
3771    HYPRE_BigInt          *col_map_offd = hypre_ParCSRBlockMatrixColMapOffd(A);
3772 
3773    hypre_CSRMatrix *S_diag = hypre_ParCSRMatrixDiag(S);
3774    HYPRE_Int             *S_diag_i = hypre_CSRMatrixI(S_diag);
3775    HYPRE_Int             *S_diag_j = hypre_CSRMatrixJ(S_diag);
3776 
3777    hypre_CSRMatrix *S_offd = hypre_ParCSRMatrixOffd(S);
3778    HYPRE_Int             *S_offd_i = hypre_CSRMatrixI(S_offd);
3779    HYPRE_Int             *S_offd_j = hypre_CSRMatrixJ(S_offd);
3780 
3781    hypre_ParCSRBlockMatrix *P;
3782    HYPRE_BigInt          *col_map_offd_P;
3783    HYPRE_Int             *tmp_map_offd = NULL;
3784 
3785    HYPRE_Int             *CF_marker_offd = NULL;
3786 
3787    hypre_CSRBlockMatrix *A_ext;
3788 
3789    HYPRE_Real            *A_ext_data = NULL;
3790    HYPRE_Int             *A_ext_i = NULL;
3791    HYPRE_BigInt          *A_ext_j = NULL;
3792 
3793    hypre_CSRBlockMatrix    *P_diag;
3794    hypre_CSRBlockMatrix    *P_offd;
3795 
3796    HYPRE_Real      *P_diag_data;
3797    HYPRE_Int       *P_diag_i;
3798    HYPRE_Int       *P_diag_j;
3799    HYPRE_Real      *P_offd_data;
3800    HYPRE_Int       *P_offd_i;
3801    HYPRE_Int       *P_offd_j;
3802 
3803    HYPRE_Int        P_diag_size, P_offd_size;
3804 
3805    HYPRE_Int       *P_marker, *P_marker_offd = NULL;
3806 
3807    HYPRE_Int        jj_counter,jj_counter_offd;
3808    HYPRE_Int       *jj_count, *jj_count_offd = NULL;
3809    HYPRE_Int        jj_begin_row,jj_begin_row_offd;
3810    HYPRE_Int        jj_end_row,jj_end_row_offd;
3811 
3812    HYPRE_Int        start_indexing = 0; /* start indexing for P_data at 0 */
3813 
3814    HYPRE_Int        n_fine = hypre_CSRBlockMatrixNumRows(A_diag);
3815 
3816    HYPRE_Int        strong_f_marker;
3817 
3818    HYPRE_Int       *fine_to_coarse;
3819    HYPRE_BigInt    *fine_to_coarse_offd = NULL;
3820    HYPRE_Int       *coarse_counter;
3821    HYPRE_Int        coarse_shift;
3822    HYPRE_BigInt     total_global_cpts;
3823    HYPRE_Int        num_cols_P_offd;
3824    HYPRE_BigInt     my_first_cpt;
3825 
3826    HYPRE_Int        bd;
3827 
3828    HYPRE_Int        i,i1,i2;
3829    HYPRE_Int        j,jl,jj,jj1;
3830    HYPRE_Int        kc;
3831    HYPRE_BigInt     big_k;
3832    HYPRE_Int        start;
3833 
3834    HYPRE_Int        c_num;
3835 
3836    HYPRE_Int        my_id;
3837    HYPRE_Int        num_procs;
3838    HYPRE_Int        num_threads;
3839    HYPRE_Int        num_sends;
3840    HYPRE_Int        index;
3841    HYPRE_Int        ns, ne, size, rest;
3842    HYPRE_Int       *int_buf_data = NULL;
3843    HYPRE_BigInt    *big_buf_data = NULL;
3844 
3845    HYPRE_BigInt col_1 = hypre_ParCSRBlockMatrixFirstRowIndex(A);
3846    HYPRE_Int local_numrows = hypre_CSRBlockMatrixNumRows(A_diag);
3847    HYPRE_BigInt col_n = col_1 + (HYPRE_BigInt)local_numrows;
3848 
3849    HYPRE_Real       wall_time;  /* for debugging instrumentation  */
3850 
3851 
3852    HYPRE_Real       *identity_block;
3853    HYPRE_Real       *zero_block;
3854    HYPRE_Real       *diagonal_block;
3855    HYPRE_Real       *sum_block;
3856    HYPRE_Real       *distribute_block;
3857 
3858    hypre_MPI_Comm_size(comm, &num_procs);
3859    hypre_MPI_Comm_rank(comm,&my_id);
3860    num_threads = hypre_NumThreads();
3861 
3862    my_first_cpt = num_cpts_global[0];
3863    if (my_id == (num_procs -1)) total_global_cpts = num_cpts_global[1];
3864    hypre_MPI_Bcast(&total_global_cpts, 1, HYPRE_MPI_BIG_INT, num_procs-1, comm);
3865 
3866    /*-------------------------------------------------------------------
3867     * Get the CF_marker data for the off-processor columns
3868     *-------------------------------------------------------------------*/
3869 
3870    if (debug_flag==4) wall_time = time_getWallclockSeconds();
3871 
3872    CF_marker_offd = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
3873 
3874 
3875    if (!comm_pkg)
3876    {
3877       hypre_BlockMatvecCommPkgCreate(A);
3878       comm_pkg = hypre_ParCSRBlockMatrixCommPkg(A);
3879    }
3880 
3881    num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
3882    int_buf_data = hypre_CTAlloc(HYPRE_Int,  hypre_ParCSRCommPkgSendMapStart(comm_pkg,
3883                                                        num_sends), HYPRE_MEMORY_HOST);
3884 
3885    index = 0;
3886    for (i = 0; i < num_sends; i++)
3887    {
3888       start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
3889       for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
3890       {
3891          int_buf_data[index++]
3892             = CF_marker[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
3893       }
3894 
3895    }
3896 
3897    /* we do not need the block version of comm handle - because
3898       CF_marker corresponds to the nodal matrix.  This call populates
3899       CF_marker_offd */
3900    comm_handle = hypre_ParCSRCommHandleCreate( 11, comm_pkg, int_buf_data,
3901                                                CF_marker_offd);
3902 
3903    hypre_ParCSRCommHandleDestroy(comm_handle);
3904 
3905 
3906    if (debug_flag==4)
3907    {
3908       wall_time = time_getWallclockSeconds() - wall_time;
3909       hypre_printf("Proc = %d     Interp: Comm 1 CF_marker =    %f\n",
3910                    my_id, wall_time);
3911       fflush(NULL);
3912    }
3913 
3914    /*----------------------------------------------------------------------
3915     * Get the ghost rows of A
3916     *---------------------------------------------------------------------*/
3917 
3918    if (debug_flag==4) wall_time = time_getWallclockSeconds();
3919 
3920    if (num_procs > 1)
3921    {
3922       A_ext      = hypre_ParCSRBlockMatrixExtractBExt(A, A, 1);
3923       A_ext_i    = hypre_CSRBlockMatrixI(A_ext);
3924       A_ext_j    = hypre_CSRBlockMatrixBigJ(A_ext);
3925       A_ext_data = hypre_CSRBlockMatrixData(A_ext);
3926    }
3927 
3928    index = 0;
3929    for (i=0; i < num_cols_A_offd; i++)
3930    {
3931       for (j=A_ext_i[i]; j < A_ext_i[i+1]; j++)
3932       {
3933          big_k = A_ext_j[j];
3934          if (big_k >= col_1 && big_k < col_n)
3935          {
3936             A_ext_j[index] = big_k - col_1;
3937             /* for the data field we must get all of the blocbig_k data */
3938             for (bd = 0; bd < bnnz; bd++)
3939             {
3940                A_ext_data[index*bnnz + bd] = A_ext_data[j*bnnz + bd];
3941             }
3942             index++;
3943          }
3944          else
3945          {
3946             kc = hypre_BigBinarySearch(col_map_offd, big_k, num_cols_A_offd);
3947             if (kc > -1)
3948             {
3949                A_ext_j[index] = (HYPRE_BigInt)(-kc-1);
3950                for (bd = 0; bd < bnnz; bd++)
3951                {
3952                   A_ext_data[index*bnnz + bd] = A_ext_data[j*bnnz + bd];
3953                }
3954                index++;
3955             }
3956          }
3957       }
3958       A_ext_i[i] = index;
3959    }
3960    for (i = num_cols_A_offd; i > 0; i--)
3961       A_ext_i[i] = A_ext_i[i-1];
3962    if (num_procs > 1) A_ext_i[0] = 0;
3963 
3964    if (debug_flag==4)
3965    {
3966       wall_time = time_getWallclockSeconds() - wall_time;
3967       hypre_printf("Proc = %d  Interp: Comm 2   Get A_ext =  %f\n",
3968                    my_id, wall_time);
3969       fflush(NULL);
3970    }
3971 
3972 
3973    /*-----------------------------------------------------------------------
3974     *  First Pass: Determine size of P and fill in fine_to_coarse mapping.
3975     *-----------------------------------------------------------------------*/
3976 
3977    /*-----------------------------------------------------------------------
3978     *  Intialize counters and allocate mapping vector.
3979     *-----------------------------------------------------------------------*/
3980 
3981    coarse_counter = hypre_CTAlloc(HYPRE_Int,  num_threads, HYPRE_MEMORY_HOST);
3982    jj_count = hypre_CTAlloc(HYPRE_Int,  num_threads, HYPRE_MEMORY_HOST);
3983    jj_count_offd = hypre_CTAlloc(HYPRE_Int,  num_threads, HYPRE_MEMORY_HOST);
3984 
3985    fine_to_coarse = hypre_CTAlloc(HYPRE_Int,  n_fine, HYPRE_MEMORY_HOST);
3986 
3987    for (i = 0; i < n_fine; i++) fine_to_coarse[i] = -1;
3988 
3989    jj_counter = start_indexing;
3990    jj_counter_offd = start_indexing;
3991 
3992    /*-----------------------------------------------------------------------
3993     *  Loop over fine grid.
3994     *-----------------------------------------------------------------------*/
3995 
3996 
3997    for (j = 0; j < num_threads; j++)
3998    {
3999       size = n_fine/num_threads;
4000       rest = n_fine - size*num_threads;
4001       if (j < rest)
4002       {
4003          ns = j*size+j;
4004          ne = (j+1)*size+j+1;
4005       }
4006       else
4007       {
4008          ns = j*size+rest;
4009          ne = (j+1)*size+rest;
4010       }
4011 
4012 
4013       /* loop over the fine grid points */
4014       for (i = ns; i < ne; i++)
4015       {
4016 
4017          /*--------------------------------------------------------------------
4018           *  If i is a C-point, interpolation is the identity. Also set up
4019           *  mapping vector (fine_to_coarse is the mapping vector).
4020           *--------------------------------------------------------------------*/
4021 
4022          if (CF_marker[i] >= 0)
4023          {
4024             jj_count[j]++;
4025             fine_to_coarse[i] = coarse_counter[j];
4026             coarse_counter[j]++;
4027          }
4028 
4029          /*--------------------------------------------------------------------
4030           *  If i is an F-point, interpolation is from the C-points that
4031           *  strongly influence i.
4032           *--------------------------------------------------------------------*/
4033 
4034          else
4035          {
4036             for (jj = S_diag_i[i]; jj < S_diag_i[i+1]; jj++)
4037             {
4038                i1 = S_diag_j[jj];
4039                if (CF_marker[i1] >= 0)
4040                {
4041                   jj_count[j]++;
4042                }
4043             }
4044 
4045             if (num_procs > 1)
4046             {
4047                for (jj = S_offd_i[i]; jj < S_offd_i[i+1]; jj++)
4048                {
4049                   i1 = S_offd_j[jj];
4050                   if (CF_marker_offd[i1] >= 0)
4051                   {
4052                      jj_count_offd[j]++;
4053                   }
4054                }
4055             }
4056          }
4057       }
4058    }
4059 
4060    /*-----------------------------------------------------------------------
4061     *  Allocate  arrays.
4062     *-----------------------------------------------------------------------*/
4063 
4064    for (i=0; i < num_threads-1; i++)
4065    {
4066       coarse_counter[i+1] += coarse_counter[i];
4067       jj_count[i+1] += jj_count[i];
4068       jj_count_offd[i+1] += jj_count_offd[i];
4069    }
4070    i = num_threads-1;
4071    jj_counter = jj_count[i];
4072    jj_counter_offd = jj_count_offd[i];
4073 
4074    P_diag_size = jj_counter;
4075 
4076    P_diag_i    = hypre_CTAlloc(HYPRE_Int,  n_fine+1, HYPRE_MEMORY_HOST);
4077    P_diag_j    = hypre_CTAlloc(HYPRE_Int,  P_diag_size, HYPRE_MEMORY_HOST);
4078    /* we need to include the size of the blocks in the data size */
4079    P_diag_data = hypre_CTAlloc(HYPRE_Real,  P_diag_size*bnnz, HYPRE_MEMORY_HOST);
4080 
4081    P_diag_i[n_fine] = jj_counter;
4082 
4083 
4084    P_offd_size = jj_counter_offd;
4085 
4086    P_offd_i    = hypre_CTAlloc(HYPRE_Int,  n_fine+1, HYPRE_MEMORY_HOST);
4087    P_offd_j    = hypre_CTAlloc(HYPRE_Int,  P_offd_size, HYPRE_MEMORY_HOST);
4088    /* we need to include the size of the blocks in the data size */
4089    P_offd_data = hypre_CTAlloc(HYPRE_Real,  P_offd_size*bnnz, HYPRE_MEMORY_HOST);
4090 
4091    /*-----------------------------------------------------------------------
4092     *  Intialize some stuff.
4093     *-----------------------------------------------------------------------*/
4094 
4095    jj_counter = start_indexing;
4096    jj_counter_offd = start_indexing;
4097 
4098    if (debug_flag==4)
4099    {
4100       wall_time = time_getWallclockSeconds() - wall_time;
4101       hypre_printf("Proc = %d     Interp: Internal work 1 =     %f\n",
4102                    my_id, wall_time);
4103       fflush(NULL);
4104    }
4105 
4106    /* we need a block identity and a block of zeros*/
4107    identity_block = hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
4108    zero_block =  hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
4109 
4110    for(i = 0; i < block_size; i++)
4111    {
4112       identity_block[i*block_size + i] = 1.0;
4113    }
4114 
4115 
4116    /* we also need a block to keep track of the diagonal values and a sum */
4117    diagonal_block =  hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
4118    sum_block =  hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
4119    distribute_block =  hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
4120 
4121    /*-----------------------------------------------------------------------
4122     *  Send and receive fine_to_coarse info.
4123     *-----------------------------------------------------------------------*/
4124 
4125    if (debug_flag==4) wall_time = time_getWallclockSeconds();
4126 
4127    fine_to_coarse_offd = hypre_CTAlloc(HYPRE_BigInt,  num_cols_A_offd, HYPRE_MEMORY_HOST);
4128    big_buf_data = hypre_CTAlloc(HYPRE_BigInt,  hypre_ParCSRCommPkgSendMapStart(comm_pkg,
4129                                                        num_sends), HYPRE_MEMORY_HOST);
4130 
4131    for (j = 0; j < num_threads; j++)
4132    {
4133       coarse_shift = 0;
4134       if (j > 0) coarse_shift = coarse_counter[j-1];
4135       size = n_fine/num_threads;
4136       rest = n_fine - size*num_threads;
4137       if (j < rest)
4138       {
4139          ns = j*size+j;
4140          ne = (j+1)*size+j+1;
4141       }
4142       else
4143       {
4144          ns = j*size+rest;
4145          ne = (j+1)*size+rest;
4146       }
4147       for (i = ns; i < ne; i++)
4148          fine_to_coarse[i] += coarse_shift;
4149    }
4150    index = 0;
4151    for (i = 0; i < num_sends; i++)
4152    {
4153       start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
4154       for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
4155          big_buf_data[index++] = my_first_cpt
4156             + fine_to_coarse[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
4157    }
4158 
4159    /* again, we do not need to use the block version of comm handle since
4160       the fine to coarse mapping is size of the nodes */
4161 
4162    comm_handle = hypre_ParCSRCommHandleCreate( 21, comm_pkg, big_buf_data,
4163                                                fine_to_coarse_offd);
4164 
4165    hypre_ParCSRCommHandleDestroy(comm_handle);
4166 
4167    if (debug_flag==4)
4168    {
4169       wall_time = time_getWallclockSeconds() - wall_time;
4170       hypre_printf("Proc = %d     Interp: Comm 4 FineToCoarse = %f\n",
4171                    my_id, wall_time);
4172       fflush(NULL);
4173    }
4174 
4175    if (debug_flag==4) wall_time = time_getWallclockSeconds();
4176 
4177 
4178    /*-----------------------------------------------------------------------
4179     *  Loop over fine grid points.
4180     *-----------------------------------------------------------------------*/
4181 
4182 
4183    for (jl = 0; jl < num_threads; jl++)
4184    {
4185       size = n_fine/num_threads;
4186       rest = n_fine - size*num_threads;
4187       if (jl < rest)
4188       {
4189          ns = jl*size+jl;
4190          ne = (jl+1)*size+jl+1;
4191       }
4192       else
4193       {
4194          ns = jl*size+rest;
4195          ne = (jl+1)*size+rest;
4196       }
4197       jj_counter = 0;
4198       if (jl > 0) jj_counter = jj_count[jl-1];
4199       jj_counter_offd = 0;
4200       if (jl > 0) jj_counter_offd = jj_count_offd[jl-1];
4201 
4202       P_marker = hypre_CTAlloc(HYPRE_Int,  n_fine, HYPRE_MEMORY_HOST);
4203       P_marker_offd = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
4204 
4205       for (i = 0; i < n_fine; i++)
4206       {
4207          P_marker[i] = -1;
4208       }
4209       for (i = 0; i < num_cols_A_offd; i++)
4210       {
4211          P_marker_offd[i] = -1;
4212       }
4213       strong_f_marker = -2;
4214 
4215       for (i = ns; i < ne; i++)
4216       {
4217 
4218          /*--------------------------------------------------------------------
4219           *  If i is a c-point, interpolation is the identity.
4220           *--------------------------------------------------------------------*/
4221 
4222          if (CF_marker[i] >= 0)
4223          {
4224             P_diag_i[i] = jj_counter;
4225             P_diag_j[jj_counter]    = fine_to_coarse[i];
4226             /* P_diag_data[jj_counter] = one; */
4227             hypre_CSRBlockMatrixBlockCopyData(identity_block,
4228                                               &P_diag_data[jj_counter*bnnz],
4229                                               1.0, block_size);
4230             jj_counter++;
4231          }
4232 
4233          /*--------------------------------------------------------------------
4234           *  If i is an F-point, build interpolation.
4235           *--------------------------------------------------------------------*/
4236 
4237          else
4238          {
4239             /* Diagonal part of P */
4240             P_diag_i[i] = jj_counter;
4241             jj_begin_row = jj_counter;
4242 
4243             for (jj = S_diag_i[i]; jj < S_diag_i[i+1]; jj++)
4244             {
4245                i1 = S_diag_j[jj];
4246 
4247                /*--------------------------------------------------------------
4248                 * If neighbor i1 is a C-point, set column number in P_diag_j
4249                 * and initialize interpolation weight to zero.
4250                 *--------------------------------------------------------------*/
4251 
4252                if (CF_marker[i1] >= 0)
4253                {
4254                   P_marker[i1] = jj_counter;
4255                   P_diag_j[jj_counter]    = fine_to_coarse[i1];
4256                   /* P_diag_data[jj_counter] = zero; */
4257                   hypre_CSRBlockMatrixBlockCopyData(zero_block,
4258                                                     &P_diag_data[jj_counter*bnnz],
4259                                                     1.0, block_size);
4260                   jj_counter++;
4261                }
4262 
4263                /*--------------------------------------------------------------
4264                 * If neighbor i1 is an F-point, mark it as a strong F-point
4265                 * whose connection needs to be distributed.
4266                 *--------------------------------------------------------------*/
4267 
4268                else if (CF_marker[i1] != -3)
4269                {
4270                   P_marker[i1] = strong_f_marker;
4271                }
4272             }
4273             jj_end_row = jj_counter;
4274 
4275             /* Off-Diagonal part of P */
4276             P_offd_i[i] = jj_counter_offd;
4277             jj_begin_row_offd = jj_counter_offd;
4278 
4279 
4280             if (num_procs > 1)
4281             {
4282                for (jj = S_offd_i[i]; jj < S_offd_i[i+1]; jj++)
4283                {
4284                   i1 = S_offd_j[jj];
4285 
4286                   /*-----------------------------------------------------------
4287                    * If neighbor i1 is a C-point, set column number in P_offd_j
4288                    * and initialize interpolation weight to zero.
4289                    *-----------------------------------------------------------*/
4290 
4291                   if (CF_marker_offd[i1] >= 0)
4292                   {
4293                      P_marker_offd[i1] = jj_counter_offd;
4294                      P_offd_j[jj_counter_offd]  = i1;
4295                      /* P_offd_data[jj_counter_offd] = zero; */
4296                      hypre_CSRBlockMatrixBlockCopyData(zero_block,
4297                                                        &P_offd_data[jj_counter_offd*bnnz],
4298                                                        1.0, block_size);
4299 
4300                      jj_counter_offd++;
4301                   }
4302 
4303                   /*-----------------------------------------------------------
4304                    * If neighbor i1 is an F-point, mark it as a strong F-point
4305                    * whose connection needs to be distributed.
4306                    *-----------------------------------------------------------*/
4307 
4308                   else if (CF_marker_offd[i1] != -3)
4309                   {
4310                      P_marker_offd[i1] = strong_f_marker;
4311                   }
4312                }
4313             }
4314 
4315             jj_end_row_offd = jj_counter_offd;
4316 
4317 
4318             /* get the diagonal block */
4319             /* diagonal = A_diag_data[A_diag_i[i]]; */
4320             hypre_CSRBlockMatrixBlockCopyData(&A_diag_data[A_diag_i[i]*bnnz], diagonal_block,
4321                                               1.0, block_size);
4322 
4323 
4324 
4325             /* Here we go through the neighborhood of this grid point */
4326 
4327             /* Loop over ith row of A.  First, the diagonal part of A */
4328 
4329             for (jj = A_diag_i[i]+1; jj < A_diag_i[i+1]; jj++)
4330             {
4331                i1 = A_diag_j[jj];
4332 
4333                /*--------------------------------------------------------------
4334                 * Case 1: neighbor i1 is a C-point and strongly influences i,
4335                 * accumulate a_{i,i1} into the interpolation weight.
4336                 *--------------------------------------------------------------*/
4337 
4338                if (P_marker[i1] >= jj_begin_row)
4339                {
4340                   /*   P_diag_data[P_marker[i1]] += A_diag_data[jj]; */
4341                   hypre_CSRBlockMatrixBlockAddAccumulate(&A_diag_data[jj*bnnz],
4342                                                          &P_diag_data[P_marker[i1]*bnnz],
4343                                                          block_size);
4344 
4345                }
4346 
4347                /*--------------------------------------------------------------
4348                 * Case 2: neighbor i1 is an F-point (MAY or MAY NOT strongly influences i),
4349                 * distribute a_{i,i1} to C-points that strongly infuence i.
4350                 * Note: currently no distribution to the diagonal in this case.
4351                 *--------------------------------------------------------------*/
4352 
4353                else if (P_marker[i1] == strong_f_marker || CF_marker[i1] != -3)
4354                {
4355                   /* initialize sum to zero */
4356                   /* sum = zero; */
4357                   hypre_CSRBlockMatrixBlockCopyData(zero_block, sum_block, 1.0,
4358                                                     block_size);
4359 
4360                   /*-----------------------------------------------------------
4361                    * Loop over row of A for point i1 and calculate the sum
4362                    * of the connections to c-points that strongly influence i.-
4363 
4364                    HERE WE ONLY WANT THE DIAG CONTIRBUTIONS (intra-unknown)
4365 
4366                    *-----------------------------------------------------------*/
4367 
4368                   /* Diagonal block part of row i1 */
4369                   for (jj1 = A_diag_i[i1]; jj1 < A_diag_i[i1+1]; jj1++)
4370                   {
4371                      i2 = A_diag_j[jj1];
4372                      if (P_marker[i2] >= jj_begin_row)
4373                      {
4374                         /* add diag data to sum */
4375                         /* sum += A_diag_data[jj1]; */
4376                         hypre_CSRBlockMatrixBlockAddAccumulateDiag(&A_diag_data[jj1*bnnz],
4377                                                                    sum_block, block_size);
4378                      }
4379                   }
4380 
4381                   /* Off-Diagonal block part of row i1 */
4382                   if (num_procs > 1)
4383                   {
4384                      for (jj1 = A_offd_i[i1]; jj1 < A_offd_i[i1+1]; jj1++)
4385                      {
4386                         i2 = A_offd_j[jj1];
4387                         if (P_marker_offd[i2] >= jj_begin_row_offd )
4388                         {
4389                            /* add off diag data to sum */
4390                            /*sum += A_offd_data[jj1];*/
4391                            hypre_CSRBlockMatrixBlockAddAccumulateDiag(&A_offd_data[jj1*bnnz],
4392                                                                       sum_block, block_size);
4393                         }
4394                      }
4395                   }
4396                   /* check whether sum_block is singular (NOW SUM IS A DIAG MATRIX WHOSE
4397                      ENTRIES ARE THE ROW SUMS)*/
4398                   /* distribute = A_diag_data[jj] / sum;  (if a diag element is 0 then
4399                      that col is scaled by 1 instead of 1/diag) - doesn'treturn 0*/
4400                   if (hypre_CSRBlockMatrixBlockInvMultDiag3(&A_diag_data[jj*bnnz], sum_block,
4401                                                             distribute_block, block_size) == 0)
4402                   {
4403 
4404                      /*-----------------------------------------------------------
4405                       * Loop over row of A for point i1 and do the distribution.-
4406                       (here we we use row-sums for the nodes recv. the distribution)
4407                       *-----------------------------------------------------------*/
4408 
4409                      /* Diagonal block part of row i1 */
4410                      for (jj1 = A_diag_i[i1]; jj1 < A_diag_i[i1+1]; jj1++)
4411                      {
4412                         i2 = A_diag_j[jj1];
4413                         if (P_marker[i2] >= jj_begin_row )
4414                         {
4415 
4416                            /*  P_diag_data[P_marker[i2]]
4417                                += distribute * A_diag_data[jj1];*/
4418 
4419                            /* multiply - result in sum_block */
4420                            hypre_CSRBlockMatrixBlockMultAddDiag3(distribute_block,
4421                                                                  &A_diag_data[jj1*bnnz], 0.0,
4422                                                                  sum_block, block_size);
4423 
4424                            /* add result to p_diag_data */
4425                            hypre_CSRBlockMatrixBlockAddAccumulate(sum_block,
4426                                                                   &P_diag_data[P_marker[i2]*bnnz],
4427                                                                   block_size);
4428                         }
4429                      }
4430 
4431                      /* Off-Diagonal block part of row i1 */
4432                      if (num_procs > 1)
4433                      {
4434                         for (jj1 = A_offd_i[i1]; jj1 < A_offd_i[i1+1]; jj1++)
4435                         {
4436                            i2 = A_offd_j[jj1];
4437                            if (P_marker_offd[i2] >= jj_begin_row_offd)
4438                            {
4439                               /* P_offd_data[P_marker_offd[i2]]
4440                                  += distribute * A_offd_data[jj1]; */
4441 
4442                               /* multiply - result in sum_block */
4443                               hypre_CSRBlockMatrixBlockMultAddDiag3(distribute_block,
4444                                                                     &A_offd_data[jj1*bnnz], 0.0,
4445                                                                     sum_block, block_size);
4446 
4447 
4448                               /* add result to p_offd_data */
4449                               hypre_CSRBlockMatrixBlockAddAccumulate(sum_block,
4450                                                                      &P_offd_data[P_marker_offd[i2]*bnnz],
4451                                                                      block_size);
4452                            }
4453                         }
4454                      }
4455                   }
4456                }
4457             }
4458 
4459 
4460             /*----------------------------------------------------------------
4461              * Still looping over ith row of A. Next, loop over the
4462              * off-diagonal part of A
4463              *---------------------------------------------------------------*/
4464 
4465             if (num_procs > 1)
4466             {
4467                for (jj = A_offd_i[i]; jj < A_offd_i[i+1]; jj++)
4468                {
4469                   i1 = A_offd_j[jj];
4470 
4471                   /*--------------------------------------------------------------
4472                    * Case 1: neighbor i1 is a C-point and strongly influences i,
4473                    * accumulate a_{i,i1} into the interpolation weight.
4474                    *--------------------------------------------------------------*/
4475 
4476                   if (P_marker_offd[i1] >= jj_begin_row_offd)
4477                   {
4478                      /* P_offd_data[P_marker_offd[i1]] += A_offd_data[jj]; */
4479                      hypre_CSRBlockMatrixBlockAddAccumulate( &A_offd_data[jj*bnnz],
4480                                                              &P_offd_data[P_marker_offd[i1]*bnnz],
4481                                                              block_size);
4482                   }
4483 
4484                   /*------------------------------------------------------------
4485                    * Case 2: neighbor i1 is an F-point and (MAY or MAY NOT strongly influences i),
4486                    * distribute a_{i,i1} to C-points that strongly infuence i.
4487                    * Note: currently no distribution to the diagonal in this case.
4488                    *-----------------------------------------------------------*/
4489 
4490                   else if (P_marker_offd[i1] == strong_f_marker || CF_marker[i1] != -3 )
4491                   {
4492 
4493                      /* initialize sum to zero */
4494                      hypre_CSRBlockMatrixBlockCopyData(zero_block, sum_block,
4495                                                        1.0, block_size);
4496 
4497                      /*---------------------------------------------------------
4498                       * Loop over row of A_ext for point i1 and calculate the sum
4499                       * of the connections to c-points that strongly influence i.
4500 
4501 
4502                       HERE WE ONLY WANT THE DIAG CONTIRBUTIONS (intra-unknown)
4503 
4504                       *---------------------------------------------------------*/
4505 
4506                      /* find row number */
4507                      c_num = A_offd_j[jj];
4508 
4509                      for (jj1 = A_ext_i[c_num]; jj1 < A_ext_i[c_num+1]; jj1++)
4510                      {
4511                         i2 = (HYPRE_Int)A_ext_j[jj1];
4512 
4513                         if (i2 > -1)
4514                         {
4515                            /* in the diagonal block */
4516                            if (P_marker[i2] >= jj_begin_row)
4517                            {
4518                               /* sum += A_ext_data[jj1]; */
4519                               hypre_CSRBlockMatrixBlockAddAccumulateDiag(&A_ext_data[jj1*bnnz],
4520                                                                          sum_block, block_size);
4521                            }
4522                         }
4523                         else
4524                         {
4525                            /* in the off_diagonal block  */
4526                            if (P_marker_offd[-i2-1] >= jj_begin_row_offd)
4527                            {
4528                               /* sum += A_ext_data[jj1]; */
4529                               hypre_CSRBlockMatrixBlockAddAccumulateDiag(&A_ext_data[jj1*bnnz],
4530                                                                          sum_block, block_size);
4531 
4532                            }
4533                         }
4534                      }
4535 
4536                      /* check whether sum_block is singular */
4537 
4538 
4539                      /* distribute = A_offd_data[jj] / sum;  */
4540                      /* here we want: A_offd_data * sum^(-1)  - use the row sums as the
4541                         diag for sum*/
4542                      if (hypre_CSRBlockMatrixBlockInvMultDiag3(&A_offd_data[jj*bnnz], sum_block,
4543                                                                distribute_block, block_size) == 0)
4544                      {
4545 
4546                         /*---------------------------------------------------------
4547                          * Loop over row of A_ext for point i1 and do
4548                          * the distribution.
4549 
4550 
4551                          *--------------------------------------------------------*/
4552 
4553                         /* Diagonal block part of row i1 */
4554 
4555                         for (jj1 = A_ext_i[c_num]; jj1 < A_ext_i[c_num+1]; jj1++)
4556                         {
4557                            i2 = (HYPRE_Int)A_ext_j[jj1];
4558 
4559                            if (i2 > -1) /* in the diagonal block */
4560                            {
4561                               if (P_marker[i2] >= jj_begin_row)
4562                               {
4563                                  /* P_diag_data[P_marker[i2]]
4564                                     += distribute * A_ext_data[jj1]; */
4565 
4566                                  /* multiply - result in sum_block */
4567                                  hypre_CSRBlockMatrixBlockMultAddDiag3(distribute_block,
4568                                                                        &A_ext_data[jj1*bnnz], 0.0,
4569                                                                        sum_block, block_size);
4570 
4571 
4572                                  /* add result to p_diag_data */
4573                                  hypre_CSRBlockMatrixBlockAddAccumulate(sum_block,
4574                                                                         &P_diag_data[P_marker[i2]*bnnz],
4575                                                                         block_size);
4576                               }
4577                            }
4578                            else
4579                            {
4580                               /* in the off_diagonal block  */
4581                               if (P_marker_offd[-i2-1] >= jj_begin_row_offd)
4582 
4583                                  /*P_offd_data[P_marker_offd[-i2-1]]
4584                                    += distribute * A_ext_data[jj1];*/
4585                               {
4586 
4587                                  /* multiply - result in sum_block */
4588                                  hypre_CSRBlockMatrixBlockMultAddDiag3(distribute_block,
4589                                                                        &A_ext_data[jj1*bnnz], 0.0,
4590                                                                        sum_block, block_size);
4591 
4592                                  /* add result to p_offd_data */
4593                                  hypre_CSRBlockMatrixBlockAddAccumulate(sum_block,
4594                                                                         &P_offd_data[P_marker_offd[-i2-1]*bnnz],
4595                                                                         block_size);
4596                               }
4597                            }
4598                         }
4599                      }
4600                   }
4601                }
4602             }
4603 
4604             /*-----------------------------------------------------------------
4605              * Set interpolation weight by dividing by the diagonal.
4606              *-----------------------------------------------------------------*/
4607 
4608             for (jj = jj_begin_row; jj < jj_end_row; jj++)
4609             {
4610 
4611                /* P_diag_data[jj] /= -diagonal; */
4612 
4613                /* want diagonal^(-1)*P_diag_data */
4614                /* do division - put in sum_block */
4615                if ( hypre_CSRBlockMatrixBlockInvMult(diagonal_block, &P_diag_data[jj*bnnz],
4616                                                      sum_block, block_size) == 0)
4617                {
4618                   /* now copy to  P_diag_data[jj] and make negative */
4619                   hypre_CSRBlockMatrixBlockCopyData(sum_block, &P_diag_data[jj*bnnz],
4620                                                     -1.0, block_size);
4621                }
4622                else
4623                {
4624                   /* hypre_printf(" Warning! singular diagonal block! Proc id %d row %d\n", my_id,i);  */
4625                   /* just make P_diag_data negative since diagonal is singular) */
4626                   hypre_CSRBlockMatrixBlockCopyData(&P_diag_data[jj*bnnz], &P_diag_data[jj*bnnz],
4627                                                     -1.0, block_size);
4628                }
4629             }
4630 
4631             for (jj = jj_begin_row_offd; jj < jj_end_row_offd; jj++)
4632             {
4633                /* P_offd_data[jj] /= -diagonal; */
4634 
4635                /* do division - put in sum_block */
4636                hypre_CSRBlockMatrixBlockInvMult(diagonal_block, &P_offd_data[jj*bnnz],
4637                                                 sum_block, block_size);
4638 
4639                /* now copy to  P_offd_data[jj] and make negative */
4640                hypre_CSRBlockMatrixBlockCopyData(sum_block, &P_offd_data[jj*bnnz],
4641                                                  -1.0, block_size);
4642             }
4643          }
4644 
4645          strong_f_marker--;
4646 
4647          P_offd_i[i+1] = jj_counter_offd;
4648       }
4649       hypre_TFree(P_marker, HYPRE_MEMORY_HOST);
4650       hypre_TFree(P_marker_offd, HYPRE_MEMORY_HOST);
4651    }
4652 
4653    /* Now create P - as a block matrix */
4654    P = hypre_ParCSRBlockMatrixCreate(comm, block_size,
4655                                      hypre_ParCSRBlockMatrixGlobalNumRows(A),
4656                                      total_global_cpts,
4657                                      hypre_ParCSRBlockMatrixColStarts(A),
4658                                      num_cpts_global,
4659                                      0,
4660                                      P_diag_i[n_fine],
4661                                      P_offd_i[n_fine]);
4662 
4663 
4664    P_diag = hypre_ParCSRBlockMatrixDiag(P);
4665    hypre_CSRBlockMatrixData(P_diag) = P_diag_data;
4666    hypre_CSRBlockMatrixI(P_diag) = P_diag_i;
4667    hypre_CSRBlockMatrixJ(P_diag) = P_diag_j;
4668 
4669    P_offd = hypre_ParCSRBlockMatrixOffd(P);
4670    hypre_CSRBlockMatrixData(P_offd) = P_offd_data;
4671    hypre_CSRBlockMatrixI(P_offd) = P_offd_i;
4672    hypre_CSRBlockMatrixJ(P_offd) = P_offd_j;
4673 
4674    /* Compress P, removing coefficients smaller than trunc_factor * Max */
4675    if (trunc_factor != 0.0 || max_elmts > 0)
4676    {
4677       hypre_BoomerAMGBlockInterpTruncation(P, trunc_factor, max_elmts);
4678       P_diag_data = hypre_CSRBlockMatrixData(P_diag);
4679       P_diag_i = hypre_CSRBlockMatrixI(P_diag);
4680       P_diag_j = hypre_CSRBlockMatrixJ(P_diag);
4681       P_offd_data = hypre_CSRBlockMatrixData(P_offd);
4682       P_offd_i = hypre_CSRBlockMatrixI(P_offd);
4683       P_offd_j = hypre_CSRBlockMatrixJ(P_offd);
4684       P_diag_size = P_diag_i[n_fine];
4685       P_offd_size = P_offd_i[n_fine];
4686    }
4687 
4688    num_cols_P_offd = 0;
4689    if (P_offd_size)
4690    {
4691       P_marker = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
4692 
4693       for (i=0; i < num_cols_A_offd; i++)
4694          P_marker[i] = 0;
4695 
4696       num_cols_P_offd = 0;
4697       for (i=0; i < P_offd_size; i++)
4698       {
4699          index = P_offd_j[i];
4700          if (!P_marker[index])
4701          {
4702             num_cols_P_offd++;
4703             P_marker[index] = 1;
4704          }
4705       }
4706 
4707       tmp_map_offd = hypre_CTAlloc(HYPRE_Int, num_cols_P_offd, HYPRE_MEMORY_HOST);
4708       col_map_offd_P = hypre_CTAlloc(HYPRE_BigInt, num_cols_P_offd, HYPRE_MEMORY_HOST);
4709 
4710       index = 0;
4711       for (i=0; i < num_cols_P_offd; i++)
4712       {
4713          while (P_marker[index]==0) index++;
4714          tmp_map_offd[i] = index++;
4715       }
4716 
4717       for (i=0; i < P_offd_size; i++)
4718          P_offd_j[i] = hypre_BinarySearch(tmp_map_offd,
4719                                           P_offd_j[i],
4720                                           num_cols_P_offd);
4721       hypre_TFree(P_marker, HYPRE_MEMORY_HOST);
4722    }
4723 
4724    for (i=0; i < n_fine; i++)
4725       if (CF_marker[i] == -3) CF_marker[i] = -1;
4726 
4727    if (num_cols_P_offd)
4728    {
4729       hypre_ParCSRBlockMatrixColMapOffd(P) = col_map_offd_P;
4730       hypre_CSRBlockMatrixNumCols(P_offd) = num_cols_P_offd;
4731    }
4732 
4733    /* use block version */
4734    hypre_GetCommPkgBlockRTFromCommPkgBlockA(P,A, tmp_map_offd,fine_to_coarse_offd);
4735 
4736 
4737    *P_ptr = P;
4738 
4739 
4740    hypre_TFree(zero_block, HYPRE_MEMORY_HOST);
4741    hypre_TFree(identity_block, HYPRE_MEMORY_HOST);
4742    hypre_TFree(diagonal_block, HYPRE_MEMORY_HOST);
4743    hypre_TFree(sum_block, HYPRE_MEMORY_HOST);
4744    hypre_TFree(distribute_block, HYPRE_MEMORY_HOST);
4745 
4746    hypre_TFree(CF_marker_offd, HYPRE_MEMORY_HOST);
4747    hypre_TFree(tmp_map_offd, HYPRE_MEMORY_HOST);
4748    hypre_TFree(big_buf_data, HYPRE_MEMORY_HOST);
4749    hypre_TFree(int_buf_data, HYPRE_MEMORY_HOST);
4750    hypre_TFree(fine_to_coarse, HYPRE_MEMORY_HOST);
4751    hypre_TFree(fine_to_coarse_offd, HYPRE_MEMORY_HOST);
4752    hypre_TFree(coarse_counter, HYPRE_MEMORY_HOST);
4753    hypre_TFree(jj_count, HYPRE_MEMORY_HOST);
4754    hypre_TFree(jj_count_offd, HYPRE_MEMORY_HOST);
4755 
4756    if (num_procs > 1) hypre_CSRBlockMatrixDestroy(A_ext);
4757 
4758    return(0);
4759 
4760 }
4761 
4762 /*---------------------------------------------------------------------------
4763  * hypre_BoomerAMGBuildBlockDirInterp
4764  *--------------------------------------------------------------------------*/
4765 
4766 HYPRE_Int
hypre_BoomerAMGBuildBlockDirInterp(hypre_ParCSRBlockMatrix * A,HYPRE_Int * CF_marker,hypre_ParCSRMatrix * S,HYPRE_BigInt * num_cpts_global,HYPRE_Int num_functions,HYPRE_Int * dof_func,HYPRE_Int debug_flag,HYPRE_Real trunc_factor,HYPRE_Int max_elmts,hypre_ParCSRBlockMatrix ** P_ptr)4767 hypre_BoomerAMGBuildBlockDirInterp( hypre_ParCSRBlockMatrix    *A,
4768                                     HYPRE_Int                  *CF_marker,
4769                                     hypre_ParCSRMatrix         *S,
4770                                     HYPRE_BigInt               *num_cpts_global,
4771                                     HYPRE_Int                   num_functions,
4772                                     HYPRE_Int                  *dof_func,
4773                                     HYPRE_Int                   debug_flag,
4774                                     HYPRE_Real                  trunc_factor,
4775                                     HYPRE_Int                   max_elmts,
4776                                     hypre_ParCSRBlockMatrix   **P_ptr)
4777 {
4778 
4779    MPI_Comm           comm = hypre_ParCSRBlockMatrixComm(A);
4780    hypre_ParCSRCommPkg     *comm_pkg = hypre_ParCSRBlockMatrixCommPkg(A);
4781    hypre_ParCSRCommHandle  *comm_handle;
4782 
4783    hypre_CSRBlockMatrix *A_diag = hypre_ParCSRBlockMatrixDiag(A);
4784    HYPRE_Real      *A_diag_data = hypre_CSRBlockMatrixData(A_diag);
4785    HYPRE_Int            *A_diag_i = hypre_CSRBlockMatrixI(A_diag);
4786    HYPRE_Int            *A_diag_j = hypre_CSRBlockMatrixJ(A_diag);
4787 
4788    HYPRE_Int             block_size = hypre_CSRBlockMatrixBlockSize(A_diag);
4789    HYPRE_Int             bnnz = block_size*block_size;
4790 
4791 
4792    hypre_CSRBlockMatrix *A_offd = hypre_ParCSRBlockMatrixOffd(A);
4793    HYPRE_Real      *A_offd_data = hypre_CSRBlockMatrixData(A_offd);
4794    HYPRE_Int            *A_offd_i = hypre_CSRBlockMatrixI(A_offd);
4795    HYPRE_Int            *A_offd_j = hypre_CSRBlockMatrixJ(A_offd);
4796    HYPRE_Int             num_cols_A_offd = hypre_CSRBlockMatrixNumCols(A_offd);
4797 
4798    hypre_CSRMatrix *S_diag = hypre_ParCSRMatrixDiag(S);
4799    HYPRE_Int            *S_diag_i = hypre_CSRMatrixI(S_diag);
4800    HYPRE_Int            *S_diag_j = hypre_CSRMatrixJ(S_diag);
4801 
4802    hypre_CSRMatrix *S_offd = hypre_ParCSRMatrixOffd(S);
4803    HYPRE_Int            *S_offd_i = hypre_CSRMatrixI(S_offd);
4804    HYPRE_Int            *S_offd_j = hypre_CSRMatrixJ(S_offd);
4805 
4806    hypre_ParCSRBlockMatrix *P;
4807    HYPRE_BigInt         *col_map_offd_P;
4808    HYPRE_Int            *tmp_map_offd = NULL;
4809 
4810    HYPRE_Int            *CF_marker_offd = NULL;
4811    HYPRE_Int            *dof_func_offd = NULL;
4812 
4813    hypre_CSRBlockMatrix *P_diag;
4814    hypre_CSRBlockMatrix *P_offd;
4815 
4816    HYPRE_Real      *P_diag_data;
4817    HYPRE_Int            *P_diag_i;
4818    HYPRE_Int            *P_diag_j;
4819    HYPRE_Real      *P_offd_data;
4820    HYPRE_Int            *P_offd_i;
4821    HYPRE_Int            *P_offd_j;
4822 
4823    HYPRE_Int             P_diag_size, P_offd_size;
4824 
4825    HYPRE_Int            *P_marker, *P_marker_offd = NULL;
4826 
4827    HYPRE_Int             jj_counter,jj_counter_offd;
4828    HYPRE_Int            *jj_count, *jj_count_offd = NULL;
4829    HYPRE_Int             jj_begin_row,jj_begin_row_offd;
4830    HYPRE_Int             jj_end_row,jj_end_row_offd;
4831 
4832    HYPRE_Int             start_indexing = 0; /* start indexing for P_data at 0 */
4833 
4834    HYPRE_Int             n_fine = hypre_CSRBlockMatrixNumRows(A_diag);
4835 
4836    HYPRE_Int            *fine_to_coarse;
4837    HYPRE_BigInt         *fine_to_coarse_offd = NULL;
4838    HYPRE_Int            *coarse_counter;
4839    HYPRE_Int             coarse_shift;
4840    HYPRE_BigInt          total_global_cpts;
4841    HYPRE_Int             num_cols_P_offd;
4842    HYPRE_BigInt          my_first_cpt;
4843 
4844    HYPRE_Int             i,i1;
4845    HYPRE_Int             j,jl,jj;
4846    HYPRE_Int             start;
4847 
4848    HYPRE_Int             my_id;
4849    HYPRE_Int             num_procs;
4850    HYPRE_Int             num_threads;
4851    HYPRE_Int             num_sends;
4852    HYPRE_Int             index;
4853    HYPRE_Int             ns, ne, size, rest;
4854    HYPRE_Int            *int_buf_data = NULL;
4855    HYPRE_BigInt         *big_buf_data = NULL;
4856 
4857    HYPRE_Real       wall_time;  /* for debugging instrumentation  */
4858 
4859    HYPRE_Real       *identity_block;
4860    HYPRE_Real       *zero_block;
4861    HYPRE_Real       *diagonal_block;
4862    HYPRE_Real       *sum_block_p;
4863    HYPRE_Real       *sum_block_n;
4864    HYPRE_Real       *r_block;
4865 
4866    hypre_MPI_Comm_size(comm, &num_procs);
4867    hypre_MPI_Comm_rank(comm,&my_id);
4868    num_threads = hypre_NumThreads();
4869 
4870    my_first_cpt = num_cpts_global[0];
4871    if (my_id == (num_procs -1)) total_global_cpts = num_cpts_global[1];
4872    hypre_MPI_Bcast(&total_global_cpts, 1, HYPRE_MPI_BIG_INT, num_procs-1, comm);
4873 
4874    /*-------------------------------------------------------------------
4875     * Get the CF_marker data for the off-processor columns
4876     *-------------------------------------------------------------------*/
4877 
4878    if (debug_flag==4) wall_time = time_getWallclockSeconds();
4879 
4880    CF_marker_offd = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
4881 
4882 
4883    if (!comm_pkg)
4884    {
4885       hypre_BlockMatvecCommPkgCreate(A);
4886       comm_pkg = hypre_ParCSRBlockMatrixCommPkg(A);
4887    }
4888 
4889    num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
4890    int_buf_data = hypre_CTAlloc(HYPRE_Int,  hypre_ParCSRCommPkgSendMapStart(comm_pkg,
4891                                                        num_sends), HYPRE_MEMORY_HOST);
4892 
4893    index = 0;
4894    for (i = 0; i < num_sends; i++)
4895    {
4896       start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
4897       for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
4898          int_buf_data[index++]
4899             = CF_marker[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
4900    }
4901 
4902    comm_handle = hypre_ParCSRCommHandleCreate( 11, comm_pkg, int_buf_data,
4903                                                CF_marker_offd);
4904 
4905    hypre_ParCSRCommHandleDestroy(comm_handle);
4906 
4907    if (debug_flag==4)
4908    {
4909       wall_time = time_getWallclockSeconds() - wall_time;
4910       hypre_printf("Proc = %d     Interp: Comm 1 CF_marker =    %f\n",
4911                    my_id, wall_time);
4912       fflush(NULL);
4913    }
4914 
4915    /*-----------------------------------------------------------------------
4916     *  First Pass: Determine size of P and fill in fine_to_coarse mapping.
4917     *-----------------------------------------------------------------------*/
4918 
4919    /*-----------------------------------------------------------------------
4920     *  Intialize counters and allocate mapping vector.
4921     *-----------------------------------------------------------------------*/
4922 
4923    coarse_counter = hypre_CTAlloc(HYPRE_Int,  num_threads, HYPRE_MEMORY_HOST);
4924    jj_count = hypre_CTAlloc(HYPRE_Int,  num_threads, HYPRE_MEMORY_HOST);
4925    jj_count_offd = hypre_CTAlloc(HYPRE_Int,  num_threads, HYPRE_MEMORY_HOST);
4926 
4927    fine_to_coarse = hypre_CTAlloc(HYPRE_Int,  n_fine, HYPRE_MEMORY_HOST);
4928 
4929    for (i = 0; i < n_fine; i++) fine_to_coarse[i] = -1;
4930 
4931    jj_counter = start_indexing;
4932    jj_counter_offd = start_indexing;
4933 
4934    /*-----------------------------------------------------------------------
4935     *  Loop over fine grid.
4936     *-----------------------------------------------------------------------*/
4937 
4938 
4939    for (j = 0; j < num_threads; j++)
4940    {
4941       size = n_fine/num_threads;
4942       rest = n_fine - size*num_threads;
4943       if (j < rest)
4944       {
4945          ns = j*size+j;
4946          ne = (j+1)*size+j+1;
4947       }
4948       else
4949       {
4950          ns = j*size+rest;
4951          ne = (j+1)*size+rest;
4952       }
4953 
4954 
4955       /* loop over the fine grid points */
4956       for (i = ns; i < ne; i++)
4957       {
4958 
4959          /*--------------------------------------------------------------------
4960           *  If i is a C-point, interpolation is the identity. Also set up
4961           *  mapping vector (fine_to_coarse is the mapping vector).
4962           *--------------------------------------------------------------------*/
4963 
4964          if (CF_marker[i] >= 0)
4965          {
4966             jj_count[j]++;
4967             fine_to_coarse[i] = coarse_counter[j];
4968             coarse_counter[j]++;
4969          }
4970 
4971          /*--------------------------------------------------------------------
4972           *  If i is an F-point, interpolation is from the C-points that
4973           *  strongly influence i.
4974           *--------------------------------------------------------------------*/
4975 
4976          else
4977          {
4978             for (jj = S_diag_i[i]; jj < S_diag_i[i+1]; jj++)
4979             {
4980                i1 = S_diag_j[jj];
4981                if (CF_marker[i1] > 0)
4982                {
4983                   jj_count[j]++;
4984                }
4985             }
4986 
4987             if (num_procs > 1)
4988             {
4989                for (jj = S_offd_i[i]; jj < S_offd_i[i+1]; jj++)
4990                {
4991                   i1 = S_offd_j[jj];
4992                   if (CF_marker_offd[i1] > 0)
4993                   {
4994                      jj_count_offd[j]++;
4995                   }
4996                }
4997             }
4998          }
4999       }
5000    }
5001 
5002    /*-----------------------------------------------------------------------
5003     *  Allocate  arrays.
5004     *-----------------------------------------------------------------------*/
5005 
5006    for (i=0; i < num_threads-1; i++)
5007    {
5008       coarse_counter[i+1] += coarse_counter[i];
5009       jj_count[i+1] += jj_count[i];
5010       jj_count_offd[i+1] += jj_count_offd[i];
5011    }
5012    i = num_threads-1;
5013    jj_counter = jj_count[i];
5014    jj_counter_offd = jj_count_offd[i];
5015 
5016    P_diag_size = jj_counter;
5017 
5018    P_diag_i    = hypre_CTAlloc(HYPRE_Int,  n_fine+1, HYPRE_MEMORY_HOST);
5019    P_diag_j    = hypre_CTAlloc(HYPRE_Int,  P_diag_size, HYPRE_MEMORY_HOST);
5020    /* we need to include the size of the blocks in the data size */
5021    P_diag_data = hypre_CTAlloc(HYPRE_Real,  P_diag_size*bnnz, HYPRE_MEMORY_HOST);
5022 
5023    P_diag_i[n_fine] = jj_counter;
5024 
5025 
5026    P_offd_size = jj_counter_offd;
5027 
5028    P_offd_i    = hypre_CTAlloc(HYPRE_Int,  n_fine+1, HYPRE_MEMORY_HOST);
5029    P_offd_j    = hypre_CTAlloc(HYPRE_Int,  P_offd_size, HYPRE_MEMORY_HOST);
5030    /* we need to include the size of the blocks in the data size */
5031    P_offd_data = hypre_CTAlloc(HYPRE_Real,  P_offd_size*bnnz, HYPRE_MEMORY_HOST);
5032 
5033    /*-----------------------------------------------------------------------
5034     *  Intialize some stuff.
5035     *-----------------------------------------------------------------------*/
5036 
5037    jj_counter = start_indexing;
5038    jj_counter_offd = start_indexing;
5039 
5040    if (debug_flag==4)
5041    {
5042       wall_time = time_getWallclockSeconds() - wall_time;
5043       hypre_printf("Proc = %d     Interp: Internal work 1 =     %f\n",
5044                    my_id, wall_time);
5045       fflush(NULL);
5046    }
5047 
5048    /* we need a block identity and a block of zeros*/
5049    identity_block = hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
5050    zero_block =  hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
5051 
5052    for(i = 0; i < block_size; i++)
5053    {
5054       identity_block[i*block_size + i] = 1.0;
5055    }
5056    /* we also need a block to keep track of the diagonal values and a sum */
5057    diagonal_block =  hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
5058    sum_block_p =  hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
5059    sum_block_n =  hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
5060    r_block =  hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
5061 
5062    /*-----------------------------------------------------------------------
5063     *  Send and receive fine_to_coarse info.
5064     *-----------------------------------------------------------------------*/
5065 
5066    if (debug_flag==4) wall_time = time_getWallclockSeconds();
5067 
5068    fine_to_coarse_offd = hypre_CTAlloc(HYPRE_BigInt,  num_cols_A_offd, HYPRE_MEMORY_HOST);
5069    big_buf_data = hypre_CTAlloc(HYPRE_BigInt,  hypre_ParCSRCommPkgSendMapStart(comm_pkg,
5070                                                        num_sends), HYPRE_MEMORY_HOST);
5071 
5072    for (j = 0; j < num_threads; j++)
5073    {
5074       coarse_shift = 0;
5075       if (j > 0) coarse_shift = coarse_counter[j-1];
5076       size = n_fine/num_threads;
5077       rest = n_fine - size*num_threads;
5078       if (j < rest)
5079       {
5080          ns = j*size+j;
5081          ne = (j+1)*size+j+1;
5082       }
5083       else
5084       {
5085          ns = j*size+rest;
5086          ne = (j+1)*size+rest;
5087       }
5088       for (i = ns; i < ne; i++)
5089          fine_to_coarse[i] += coarse_shift;
5090    }
5091    index = 0;
5092    for (i = 0; i < num_sends; i++)
5093    {
5094       start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
5095       for (j = start; j < hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1); j++)
5096          big_buf_data[index++] = my_first_cpt
5097             + fine_to_coarse[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,j)];
5098    }
5099 
5100    comm_handle = hypre_ParCSRCommHandleCreate( 21, comm_pkg, big_buf_data,
5101                                                fine_to_coarse_offd);
5102 
5103    hypre_ParCSRCommHandleDestroy(comm_handle);
5104 
5105    if (debug_flag==4)
5106    {
5107       wall_time = time_getWallclockSeconds() - wall_time;
5108       hypre_printf("Proc = %d     Interp: Comm 4 FineToCoarse = %f\n",
5109                    my_id, wall_time);
5110       fflush(NULL);
5111    }
5112 
5113    if (debug_flag==4) wall_time = time_getWallclockSeconds();
5114 
5115    /*-----------------------------------------------------------------------
5116     *  Loop over fine grid points.
5117     *-----------------------------------------------------------------------*/
5118 
5119    for (jl = 0; jl < num_threads; jl++)
5120    {
5121       size = n_fine/num_threads;
5122       rest = n_fine - size*num_threads;
5123       if (jl < rest)
5124       {
5125          ns = jl*size+jl;
5126          ne = (jl+1)*size+jl+1;
5127       }
5128       else
5129       {
5130          ns = jl*size+rest;
5131          ne = (jl+1)*size+rest;
5132       }
5133       jj_counter = 0;
5134       if (jl > 0) jj_counter = jj_count[jl-1];
5135       jj_counter_offd = 0;
5136       if (jl > 0) jj_counter_offd = jj_count_offd[jl-1];
5137 
5138       P_marker = hypre_CTAlloc(HYPRE_Int,  n_fine, HYPRE_MEMORY_HOST);
5139       P_marker_offd = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
5140 
5141       for (i = 0; i < n_fine; i++)
5142       {
5143          P_marker[i] = -1;
5144       }
5145       for (i = 0; i < num_cols_A_offd; i++)
5146       {
5147          P_marker_offd[i] = -1;
5148       }
5149 
5150       for (i = ns; i < ne; i++)
5151       {
5152 
5153          /*--------------------------------------------------------------------
5154           *  If i is a c-point, interpolation is the identity.
5155           *--------------------------------------------------------------------*/
5156 
5157          if (CF_marker[i] >= 0)
5158          {
5159             P_diag_i[i] = jj_counter;
5160             P_diag_j[jj_counter]    = fine_to_coarse[i];
5161             /* P_diag_data[jj_counter] = one; */
5162             hypre_CSRBlockMatrixBlockCopyData(identity_block,
5163                                               &P_diag_data[jj_counter*bnnz],
5164                                               1.0, block_size);
5165             jj_counter++;
5166          }
5167 
5168          /*--------------------------------------------------------------------
5169           *  If i is an F-point, build interpolation.
5170           *--------------------------------------------------------------------*/
5171 
5172          else
5173          {
5174             /* Diagonal part of P */
5175             P_diag_i[i] = jj_counter;
5176             jj_begin_row = jj_counter;
5177 
5178             for (jj = S_diag_i[i]; jj < S_diag_i[i+1]; jj++)
5179             {
5180                i1 = S_diag_j[jj];
5181 
5182                /*--------------------------------------------------------------
5183                 * If neighbor i1 is a C-point, set column number in P_diag_j
5184                 * and initialize interpolation weight to zero.
5185                 *--------------------------------------------------------------*/
5186 
5187                if (CF_marker[i1] >= 0)
5188                {
5189                   P_marker[i1] = jj_counter;
5190                   P_diag_j[jj_counter]    = fine_to_coarse[i1];
5191                   /* P_diag_data[jj_counter] = zero; */
5192                   hypre_CSRBlockMatrixBlockCopyData(zero_block,
5193                                                     &P_diag_data[jj_counter*bnnz],
5194                                                     1.0, block_size);
5195                   jj_counter++;
5196                }
5197 
5198             }
5199             jj_end_row = jj_counter;
5200 
5201             /* Off-Diagonal part of P */
5202             P_offd_i[i] = jj_counter_offd;
5203             jj_begin_row_offd = jj_counter_offd;
5204 
5205 
5206             if (num_procs > 1)
5207             {
5208                for (jj = S_offd_i[i]; jj < S_offd_i[i+1]; jj++)
5209                {
5210                   i1 = S_offd_j[jj];
5211 
5212                   /*-----------------------------------------------------------
5213                    * If neighbor i1 is a C-point, set column number in P_offd_j
5214                    * and initialize interpolation weight to zero.
5215                    *-----------------------------------------------------------*/
5216 
5217                   if (CF_marker_offd[i1] >= 0)
5218                   {
5219                      P_marker_offd[i1] = jj_counter_offd;
5220                      P_offd_j[jj_counter_offd]  = i1;
5221                      /* P_offd_data[jj_counter_offd] = zero; */
5222                      hypre_CSRBlockMatrixBlockCopyData(zero_block,
5223                                                        &P_offd_data[jj_counter_offd*bnnz],
5224                                                        1.0, block_size);
5225                      jj_counter_offd++;
5226 
5227                   }
5228                }
5229             }
5230 
5231             jj_end_row_offd = jj_counter_offd;
5232             /* get the diagonal block */
5233             /* diagonal = A_diag_data[A_diag_i[i]];*/
5234             hypre_CSRBlockMatrixBlockCopyData(&A_diag_data[A_diag_i[i]*bnnz], diagonal_block,
5235                                               1.0, block_size);
5236 
5237 
5238             /* Loop over ith row of A.  First, the diagonal part of A */
5239             /*sum_N_pos = 0;
5240               sum_N_neg = 0;
5241               sum_P_pos = 0;
5242               sum_P_neg = 0;*/
5243             hypre_CSRBlockMatrixBlockCopyData(zero_block, sum_block_p, 1.0,
5244                                               block_size);
5245             hypre_CSRBlockMatrixBlockCopyData(zero_block, sum_block_n, 1.0,
5246                                               block_size);
5247 
5248 
5249             for (jj = A_diag_i[i]+1; jj < A_diag_i[i+1]; jj++)
5250             {
5251                i1 = A_diag_j[jj];
5252 
5253                /*if (A_diag_data[jj] > 0)
5254                  sum_N_pos += A_diag_data[jj];
5255                  else
5256                  sum_N_neg += A_diag_data[jj];*/
5257 
5258                hypre_CSRBlockMatrixBlockAddAccumulate(&A_diag_data[jj*bnnz],
5259                                                       sum_block_n, block_size);
5260 
5261                /*--------------------------------------------------------------
5262                 * Case 1: neighbor i1 is a C-point and strongly influences i,
5263                 * accumulate a_{i,i1} into the interpolation weight.
5264                 *--------------------------------------------------------------*/
5265 
5266                if (P_marker[i1] >= jj_begin_row)
5267                {
5268 
5269 
5270                   /* P_diag_data[P_marker[i1]] += A_diag_data[jj];*/
5271                   hypre_CSRBlockMatrixBlockAddAccumulate(&A_diag_data[jj*bnnz],
5272                                                          &P_diag_data[P_marker[i1]*bnnz],
5273                                                          block_size);
5274 
5275 
5276                   /*if (A_diag_data[jj] > 0)
5277                     sum_P_pos += A_diag_data[jj];
5278                     else
5279                     sum_P_neg += A_diag_data[jj];*/
5280                   hypre_CSRBlockMatrixBlockAddAccumulate(&A_diag_data[jj*bnnz],
5281                                                          sum_block_p, block_size);
5282 
5283                }
5284             }
5285 
5286             /*----------------------------------------------------------------
5287              * Still looping over ith row of A. Next, loop over the
5288              * off-diagonal part of A
5289              *---------------------------------------------------------------*/
5290 
5291             if (num_procs > 1)
5292             {
5293                for (jj = A_offd_i[i]; jj < A_offd_i[i+1]; jj++)
5294                {
5295                   i1 = A_offd_j[jj];
5296 
5297                   /*if (A_offd_data[jj] > 0)
5298                     sum_N_pos += A_offd_data[jj];
5299                     else
5300                     sum_N_neg += A_offd_data[jj];*/
5301                   hypre_CSRBlockMatrixBlockAddAccumulate(&A_offd_data[jj*bnnz],
5302                                                          sum_block_n, block_size);
5303 
5304                   /*--------------------------------------------------------------
5305                    * Case 1: neighbor i1 is a C-point and strongly influences i,
5306                    * accumulate a_{i,i1} into the interpolation weight.
5307                    *--------------------------------------------------------------*/
5308 
5309                   if (P_marker_offd[i1] >= jj_begin_row_offd)
5310                   {
5311                      /* P_offd_data[P_marker_offd[i1]] += A_offd_data[jj];*/
5312                      hypre_CSRBlockMatrixBlockAddAccumulate( &A_offd_data[jj*bnnz],
5313                                                              &P_offd_data[P_marker_offd[i1]*bnnz],
5314                                                              block_size);
5315                      /*if (A_offd_data[jj] > 0)
5316                        sum_P_pos += A_offd_data[jj];
5317                        else
5318                        sum_P_neg += A_offd_data[jj];*/
5319                      hypre_CSRBlockMatrixBlockAddAccumulate(&A_offd_data[jj*bnnz],
5320                                                             sum_block_p, block_size);
5321 
5322                   }
5323                }
5324             }
5325 
5326 
5327             /*if (sum_P_neg) alfa = sum_N_neg/sum_P_neg/diagonal;
5328               if (sum_P_pos) beta = sum_N_pos/sum_P_pos/diagonal;*/
5329 
5330             /*r_block = sum_block_n*sum_block_p^-1*/
5331             hypre_CSRBlockMatrixBlockMultInv(sum_block_p, sum_block_n,
5332                                              r_block, block_size);
5333 
5334             /* sum_block_n= diagonal^-1*r_block */
5335             hypre_CSRBlockMatrixBlockInvMult(diagonal_block, r_block,
5336                                              sum_block_n, block_size);
5337 
5338             /*-----------------------------------------------------------------
5339              * Set interpolation weight by dividing by the diagonal.
5340              *-----------------------------------------------------------------*/
5341 
5342             for (jj = jj_begin_row; jj < jj_end_row; jj++)
5343             {
5344                /*if (P_diag_data[jj]> 0)
5345                  P_diag_data[jj] *= -beta;
5346                  else
5347                  P_diag_data[jj] *= -alfa;*/
5348 
5349                hypre_CSRBlockMatrixBlockCopyData( &P_diag_data[jj*bnnz],
5350                                                   r_block, -1.0, block_size);
5351 
5352 
5353                hypre_CSRBlockMatrixBlockMultAdd(sum_block_n, r_block, 0.0,
5354                                                 &P_diag_data[jj*bnnz], block_size);
5355             }
5356 
5357             for (jj = jj_begin_row_offd; jj < jj_end_row_offd; jj++)
5358             {
5359                /*if (P_offd_data[jj]> 0)
5360                  P_offd_data[jj] *= -beta;
5361                  else
5362                  P_offd_data[jj] *= -alfa;*/
5363 
5364                hypre_CSRBlockMatrixBlockCopyData( &P_offd_data[jj*bnnz],
5365                                                   r_block, -1.0, block_size);
5366 
5367                hypre_CSRBlockMatrixBlockMultAdd(sum_block_n, r_block, 0.0,
5368                                                 &P_offd_data[jj*bnnz], block_size);
5369             }
5370          }
5371 
5372          P_offd_i[i+1] = jj_counter_offd;
5373       }
5374       hypre_TFree(P_marker, HYPRE_MEMORY_HOST);
5375       hypre_TFree(P_marker_offd, HYPRE_MEMORY_HOST);
5376    }
5377 
5378    /* Now create P - as a block matrix */
5379    P = hypre_ParCSRBlockMatrixCreate(comm,block_size,
5380                                      hypre_ParCSRBlockMatrixGlobalNumRows(A),
5381                                      total_global_cpts,
5382                                      hypre_ParCSRBlockMatrixColStarts(A),
5383                                      num_cpts_global,
5384                                      0,
5385                                      P_diag_i[n_fine],
5386                                      P_offd_i[n_fine]);
5387 
5388    P_diag = hypre_ParCSRBlockMatrixDiag(P);
5389    hypre_CSRBlockMatrixData(P_diag) = P_diag_data;
5390    hypre_CSRBlockMatrixI(P_diag) = P_diag_i;
5391    hypre_CSRBlockMatrixJ(P_diag) = P_diag_j;
5392 
5393    P_offd = hypre_ParCSRBlockMatrixOffd(P);
5394    hypre_CSRBlockMatrixData(P_offd) = P_offd_data;
5395    hypre_CSRBlockMatrixI(P_offd) = P_offd_i;
5396    hypre_CSRBlockMatrixJ(P_offd) = P_offd_j;
5397 
5398    /* Compress P, removing coefficients smaller than trunc_factor * Max */
5399 
5400    if (trunc_factor != 0.0 || max_elmts > 0)
5401    {
5402       hypre_BoomerAMGBlockInterpTruncation(P, trunc_factor, max_elmts);
5403       P_diag_data = hypre_CSRBlockMatrixData(P_diag);
5404       P_diag_i = hypre_CSRBlockMatrixI(P_diag);
5405       P_diag_j = hypre_CSRBlockMatrixJ(P_diag);
5406       P_offd_data = hypre_CSRBlockMatrixData(P_offd);
5407       P_offd_i = hypre_CSRBlockMatrixI(P_offd);
5408       P_offd_j = hypre_CSRBlockMatrixJ(P_offd);
5409       P_diag_size = P_diag_i[n_fine];
5410       P_offd_size = P_offd_i[n_fine];
5411    }
5412 
5413    num_cols_P_offd = 0;
5414    if (P_offd_size)
5415    {
5416       P_marker = hypre_CTAlloc(HYPRE_Int,  num_cols_A_offd, HYPRE_MEMORY_HOST);
5417 
5418       for (i=0; i < num_cols_A_offd; i++)
5419          P_marker[i] = 0;
5420 
5421       num_cols_P_offd = 0;
5422       for (i=0; i < P_offd_size; i++)
5423       {
5424          index = P_offd_j[i];
5425          if (!P_marker[index])
5426          {
5427             num_cols_P_offd++;
5428             P_marker[index] = 1;
5429          }
5430       }
5431 
5432       col_map_offd_P = hypre_CTAlloc(HYPRE_BigInt, num_cols_P_offd, HYPRE_MEMORY_HOST);
5433       tmp_map_offd = hypre_CTAlloc(HYPRE_Int, num_cols_P_offd, HYPRE_MEMORY_HOST);
5434 
5435       index = 0;
5436       for (i=0; i < num_cols_P_offd; i++)
5437       {
5438          while (P_marker[index]==0) index++;
5439          tmp_map_offd[i] = index++;
5440       }
5441 
5442       for (i=0; i < P_offd_size; i++)
5443          P_offd_j[i] = hypre_BinarySearch(tmp_map_offd,
5444                                           P_offd_j[i],
5445                                           num_cols_P_offd);
5446       hypre_TFree(P_marker, HYPRE_MEMORY_HOST);
5447    }
5448 
5449    for (i=0; i < n_fine; i++)
5450       if (CF_marker[i] == -3) CF_marker[i] = -1;
5451 
5452    if (num_cols_P_offd)
5453    {
5454       hypre_ParCSRBlockMatrixColMapOffd(P) = col_map_offd_P;
5455       hypre_CSRBlockMatrixNumCols(P_offd) = num_cols_P_offd;
5456    }
5457 
5458    hypre_GetCommPkgBlockRTFromCommPkgBlockA(P,A,tmp_map_offd,fine_to_coarse_offd);
5459 
5460    *P_ptr = P;
5461 
5462    hypre_TFree(zero_block, HYPRE_MEMORY_HOST);
5463    hypre_TFree(identity_block, HYPRE_MEMORY_HOST);
5464    hypre_TFree(diagonal_block, HYPRE_MEMORY_HOST);
5465    hypre_TFree(sum_block_n, HYPRE_MEMORY_HOST);
5466    hypre_TFree(sum_block_p, HYPRE_MEMORY_HOST);
5467    hypre_TFree(r_block, HYPRE_MEMORY_HOST);
5468 
5469 
5470    hypre_TFree(CF_marker_offd, HYPRE_MEMORY_HOST);
5471    hypre_TFree(tmp_map_offd, HYPRE_MEMORY_HOST);
5472    hypre_TFree(dof_func_offd, HYPRE_MEMORY_HOST);
5473    hypre_TFree(int_buf_data, HYPRE_MEMORY_HOST);
5474    hypre_TFree(big_buf_data, HYPRE_MEMORY_HOST);
5475    hypre_TFree(fine_to_coarse, HYPRE_MEMORY_HOST);
5476    hypre_TFree(fine_to_coarse_offd, HYPRE_MEMORY_HOST);
5477    hypre_TFree(coarse_counter, HYPRE_MEMORY_HOST);
5478    hypre_TFree(jj_count, HYPRE_MEMORY_HOST);
5479    hypre_TFree(jj_count_offd, HYPRE_MEMORY_HOST);
5480 
5481    return(0);
5482 
5483 }
5484 
5485 #if 0  /* not finished yet! */
5486 
5487 /*---------------------------------------------------------------------------
5488  * hypre_BoomerAMGBuildBlockStdInterp
5489  *  Comment: The interpolatory weighting can be changed with the sep_weight
5490  *           variable. This can enable not separating negative and positive
5491  *           off diagonals in the weight formula.
5492  *--------------------------------------------------------------------------*/
5493 
5494 HYPRE_Int
5495 hypre_BoomerAMGBuildBlockStdInterp(hypre_ParCSRBlockMatrix *A,
5496                                    HYPRE_Int *CF_marker,
5497                                    hypre_ParCSRMatrix   *S,
5498                                    HYPRE_Int *num_cpts_global,
5499                                    HYPRE_Int num_functions,
5500                                    HYPRE_Int *dof_func,
5501                                    HYPRE_Int debug_flag,
5502                                    HYPRE_Real    trunc_factor,
5503                                    HYPRE_Int max_elmts,
5504                                    HYPRE_Int sep_weight,
5505                                    hypre_ParCSRBlockMatrix  **P_ptr)
5506 {
5507    /* Communication Variables */
5508    MPI_Comm                 comm = hypre_ParCSRBlockMatrixComm(A);
5509    hypre_ParCSRCommPkg     *comm_pkg = hypre_ParCSRBlockMatrixCommPkg(A);
5510    HYPRE_Int              my_id, num_procs;
5511 
5512    /* Variables to store input variables */
5513    hypre_CSRBlockMatrix *A_diag = hypre_ParCSRBlockMatrixDiag(A);
5514    HYPRE_Real      *A_diag_data = hypre_CSRBlockMatrixData(A_diag);
5515    HYPRE_Int             *A_diag_i = hypre_CSRBlockMatrixI(A_diag);
5516    HYPRE_Int             *A_diag_j = hypre_CSRBlockMatrixJ(A_diag);
5517 
5518    hypre_CSRBlockMatrix *A_offd = hypre_ParCSRBlockMatrixOffd(A);
5519    HYPRE_Real      *A_offd_data = hypre_CSRBlockMatrixData(A_offd);
5520    HYPRE_Int             *A_offd_i = hypre_CSRBlockMatrixI(A_offd);
5521    HYPRE_Int             *A_offd_j = hypre_CSRBlockMatrixJ(A_offd);
5522 
5523    HYPRE_Int              num_cols_A_offd = hypre_CSRBlockMatrixNumCols(A_offd);
5524    HYPRE_Int             *col_map_offd = hypre_ParCSRBlockMatrixColMapOffd(A);
5525    HYPRE_Int              n_fine = hypre_CSRBlockMatrixNumRows(A_diag);
5526    HYPRE_Int              col_1 = hypre_ParCSRBlockMatrixFirstRowIndex(A);
5527    HYPRE_Int              local_numrows = hypre_CSRBlockMatrixNumRows(A_diag);
5528    HYPRE_Int              col_n = col_1 + local_numrows;
5529    HYPRE_Int              total_global_cpts, my_first_cpt;
5530 
5531    /* Variables to store strong connection matrix info */
5532    hypre_CSRMatrix *S_diag = hypre_ParCSRMatrixDiag(S);
5533    HYPRE_Int             *S_diag_i = hypre_CSRMatrixI(S_diag);
5534    HYPRE_Int             *S_diag_j = hypre_CSRMatrixJ(S_diag);
5535 
5536    hypre_CSRMatrix *S_offd = hypre_ParCSRMatrixOffd(S);
5537    HYPRE_Int             *S_offd_i = hypre_CSRMatrixI(S_offd);
5538    HYPRE_Int             *S_offd_j = hypre_CSRMatrixJ(S_offd);
5539 
5540    /* Interpolation matrix P */
5541    hypre_ParCSRBlockMatrix *P;
5542    hypre_CSRBlockMatrix    *P_diag;
5543    hypre_CSRBlockMatrix    *P_offd;
5544 
5545    HYPRE_Real      *P_diag_data;
5546    HYPRE_Int             *P_diag_i, *P_diag_j;
5547    HYPRE_Real      *P_offd_data;
5548    HYPRE_Int             *P_offd_i, *P_offd_j;
5549 
5550    HYPRE_Int               *col_map_offd_P;
5551    HYPRE_Int              P_diag_size;
5552    HYPRE_Int              P_offd_size;
5553    HYPRE_Int             *P_marker;
5554    HYPRE_Int             *P_marker_offd = NULL;
5555    HYPRE_Int             *CF_marker_offd = NULL;
5556    HYPRE_Int             *tmp_CF_marker_offd = NULL;
5557    HYPRE_Int             *dof_func_offd = NULL;
5558 
5559    /* Full row information for columns of A that are off diag*/
5560    hypre_CSRBlockMatrix *A_ext;
5561    HYPRE_Real      *A_ext_data;
5562    HYPRE_Int             *A_ext_i;
5563    HYPRE_Int             *A_ext_j;
5564 
5565    HYPRE_Int             *fine_to_coarse;
5566    HYPRE_Int             *fine_to_coarse_offd = NULL;
5567    HYPRE_Int             *found;
5568 
5569    HYPRE_Int              num_cols_P_offd;
5570    HYPRE_Int              newoff, loc_col;
5571    HYPRE_Int              A_ext_rows, full_off_procNodes;
5572 
5573    hypre_CSRMatrix *Sop;
5574    HYPRE_Int             *Sop_i;
5575    HYPRE_Int             *Sop_j;
5576 
5577    HYPRE_Int              Soprows;
5578 
5579    /* Variables to keep count of interpolatory points */
5580    HYPRE_Int              jj_counter, jj_counter_offd;
5581    HYPRE_Int              jj_begin_row, jj_end_row;
5582    HYPRE_Int              jj_begin_row_offd = 0;
5583    HYPRE_Int              jj_end_row_offd = 0;
5584    HYPRE_Int              coarse_counter, coarse_counter_offd;
5585    HYPRE_Int             *ihat, *ihat_offd = NULL;
5586    HYPRE_Int             *ipnt, *ipnt_offd = NULL;
5587    HYPRE_Int              strong_f_marker = -2;
5588 
5589    /* Interpolation weight variables */
5590    HYPRE_Real      *ahat, *ahat_offd = NULL;
5591    HYPRE_Real       sum_pos, sum_pos_C, sum_neg, sum_neg_C, sum, sum_C;
5592    HYPRE_Real       diagonal, distribute;
5593    HYPRE_Real       alfa, beta;
5594 
5595    /* Loop variables */
5596    HYPRE_Int              index;
5597    HYPRE_Int              start_indexing = 0;
5598    HYPRE_Int              i, i1, j, j1, jj, kk, k1;
5599    HYPRE_Int              cnt_c, cnt_f, cnt_c_offd, cnt_f_offd, indx;
5600 
5601    /* Definitions */
5602    HYPRE_Real       zero = 0.0;
5603    HYPRE_Real       one  = 1.0;
5604    HYPRE_Real       wall_time;
5605    HYPRE_Real       wall_1 = 0;
5606    HYPRE_Real       wall_2 = 0;
5607    HYPRE_Real       wall_3 = 0;
5608 
5609 
5610    hypre_ParCSRCommPkg   *extend_comm_pkg = NULL;
5611 
5612    HYPRE_Real       *identity_block;
5613    HYPRE_Real       *zero_block;
5614    HYPRE_Real       *diagonal_block;
5615    HYPRE_Real       *sum_block;
5616    HYPRE_Real       *distribute_block;
5617 
5618    HYPRE_Int                  block_size = hypre_CSRBlockMatrixBlockSize(A_diag);
5619    HYPRE_Int                  bnnz = block_size*block_size;
5620 
5621 
5622 
5623    if (debug_flag== 4) wall_time = time_getWallclockSeconds();
5624 
5625    /* BEGIN */
5626    hypre_MPI_Comm_size(comm, &num_procs);
5627    hypre_MPI_Comm_rank(comm,&my_id);
5628 
5629    my_first_cpt = num_cpts_global[0];
5630    if (my_id == (num_procs -1)) total_global_cpts = num_cpts_global[1];
5631    hypre_MPI_Bcast(&total_global_cpts, 1, HYPRE_MPI_INT, num_procs-1, comm);
5632 
5633    if (!comm_pkg)
5634    {
5635       hypre_BlockMatvecCommPkgCreate(A);
5636       comm_pkg = hypre_ParCSRBlockMatrixCommPkg(A);
5637    }
5638 
5639    /* Set up off processor information (specifically for neighbors of
5640     * neighbors */
5641    newoff = 0;
5642    full_off_procNodes = 0;
5643    if (num_procs > 1)
5644    {
5645       /*----------------------------------------------------------------------
5646        * Get the off processors rows for A and S, associated with columns in
5647        * A_offd and S_offd.
5648        *---------------------------------------------------------------------*/
5649       A_ext         = hypre_ParCSRBlockMatrixExtractBExt(A,A,1);
5650       A_ext_i       = hypre_CSRBlockMatrixI(A_ext);
5651       A_ext_j       = hypre_CSRBlockMatrixJ(A_ext);
5652       A_ext_data    = hypre_CSRBlockMatrixData(A_ext);
5653       A_ext_rows    = hypre_CSRBlockMatrixNumRows(A_ext);
5654 
5655 
5656 /* FIX THIS! - Sop - block or ???*/
5657 
5658       Sop           = hypre_ParCSRMatrixExtractBExt(S,A,0);
5659       Sop_i         = hypre_CSRMatrixI(Sop);
5660       Sop_j         = hypre_CSRMatrixJ(Sop);
5661       Soprows       = hypre_CSRMatrixNumRows(Sop);
5662 
5663       /* Find nodes that are neighbors of neighbors, not found in offd */
5664       newoff = new_offd_nodes(&found, A_ext_rows, A_ext_i, A_ext_j,
5665                               Soprows, col_map_offd, col_1, col_n,
5666                               Sop_i, Sop_j, CF_marker, comm_pkg);
5667       if(newoff >= 0)
5668          full_off_procNodes = newoff + num_cols_A_offd;
5669       else
5670          return(1);
5671 
5672       /* Possibly add new points and new processors to the comm_pkg, all
5673        * processors need new_comm_pkg */
5674 
5675       /* AHB - create a new comm package just for extended info -
5676          this will work better with the assumed partition*/
5677 
5678 /* FIX THIS: Block version of this? */
5679       hypre_ParCSRFindExtendCommPkg(A, newoff, found,
5680                                     &extend_comm_pkg);
5681 
5682       CF_marker_offd = hypre_CTAlloc(HYPRE_Int,  full_off_procNodes, HYPRE_MEMORY_HOST);
5683 
5684       if (num_functions > 1 && full_off_procNodes > 0)
5685          dof_func_offd = hypre_CTAlloc(HYPRE_Int,  full_off_procNodes, HYPRE_MEMORY_HOST);
5686 
5687       alt_insert_new_nodes(comm_pkg, extend_comm_pkg, CF_marker,
5688                            full_off_procNodes, CF_marker_offd);
5689 
5690       if(num_functions > 1)
5691          alt_insert_new_nodes(comm_pkg, extend_comm_pkg, dof_func,
5692                               full_off_procNodes, dof_func_offd);
5693    }
5694 
5695 
5696    /*-----------------------------------------------------------------------
5697     *  First Pass: Determine size of P and fill in fine_to_coarse mapping.
5698     *-----------------------------------------------------------------------*/
5699 
5700    /*-----------------------------------------------------------------------
5701     *  Intialize counters and allocate mapping vector.
5702     *-----------------------------------------------------------------------*/
5703    P_diag_i    = hypre_CTAlloc(HYPRE_Int,  n_fine+1, HYPRE_MEMORY_HOST);
5704    P_offd_i    = hypre_CTAlloc(HYPRE_Int,  n_fine+1, HYPRE_MEMORY_HOST);
5705 
5706    fine_to_coarse = hypre_CTAlloc(HYPRE_Int,  n_fine, HYPRE_MEMORY_HOST);
5707 
5708    P_marker = hypre_CTAlloc(HYPRE_Int,  n_fine, HYPRE_MEMORY_HOST);
5709 
5710 
5711 /* FIX THIS - figure out sizes - need bnnz? */
5712    if (full_off_procNodes)
5713    {
5714       P_marker_offd = hypre_CTAlloc(HYPRE_Int,  full_off_procNodes, HYPRE_MEMORY_HOST);
5715       fine_to_coarse_offd = hypre_CTAlloc(HYPRE_Int,  full_off_procNodes, HYPRE_MEMORY_HOST);
5716       tmp_CF_marker_offd = hypre_CTAlloc(HYPRE_Int,  full_off_procNodes, HYPRE_MEMORY_HOST);
5717    }
5718 
5719    initialize_vecs(n_fine, full_off_procNodes, fine_to_coarse,
5720                    fine_to_coarse_offd, P_marker, P_marker_offd,
5721                    tmp_CF_marker_offd);
5722 
5723 
5724    /* stuff for blocks */
5725    /* we need a block identity and a block of zeros*/
5726    identity_block = hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
5727    zero_block =  hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
5728 
5729    for(i = 0; i < block_size; i++)
5730    {
5731       identity_block[i*block_size + i] = 1.0;
5732    }
5733    /* we also need a block to keep track of the diagonal values and a sum */
5734    diagonal_block =  hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
5735    sum_block =  hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
5736    distribute_block =  hypre_CTAlloc(HYPRE_Real,  bnnz, HYPRE_MEMORY_HOST);
5737 
5738 
5739 
5740    jj_counter = start_indexing;
5741    jj_counter_offd = start_indexing;
5742    coarse_counter = 0;
5743    coarse_counter_offd = 0;
5744 
5745    /*-----------------------------------------------------------------------
5746     *  Loop over fine grid.
5747     *-----------------------------------------------------------------------*/
5748    for (i = 0; i < n_fine; i++)
5749    {
5750       P_diag_i[i] = jj_counter;
5751       if (num_procs > 1)
5752          P_offd_i[i] = jj_counter_offd;
5753 
5754       if (CF_marker[i] >= 0)
5755       {
5756          jj_counter++;
5757          fine_to_coarse[i] = coarse_counter;
5758          coarse_counter++;
5759       }
5760 
5761       /*--------------------------------------------------------------------
5762        *  If i is an F-point, interpolation is from the C-points that
5763        *  strongly influence i, or C-points that stronly influence F-points
5764        *  that strongly influence i.
5765        *--------------------------------------------------------------------*/
5766       else if (CF_marker[i] != -3)
5767       {
5768          for (jj = S_diag_i[i]; jj < S_diag_i[i+1]; jj++)
5769          {
5770             i1 = S_diag_j[jj];
5771             if (CF_marker[i1] >= 0)
5772             { /* i1 is a C point */
5773                if (P_marker[i1] < P_diag_i[i])
5774                {
5775                   P_marker[i1] = jj_counter;
5776                   jj_counter++;
5777                }
5778             }
5779             else if (CF_marker[i1] != -3)
5780             { /* i1 is a F point, loop through it's strong neighbors */
5781                for (kk = S_diag_i[i1]; kk < S_diag_i[i1+1]; kk++)
5782                {
5783                   k1 = S_diag_j[kk];
5784                   if (CF_marker[k1] >= 0)
5785                   {
5786                      if(P_marker[k1] < P_diag_i[i])
5787                      {
5788                         P_marker[k1] = jj_counter;
5789                         jj_counter++;
5790                      }
5791                   }
5792                }
5793                if(num_procs > 1)
5794                {
5795                   for (kk = S_offd_i[i1]; kk < S_offd_i[i1+1]; kk++)
5796                   {
5797                      k1 = S_offd_j[kk];
5798                      if (CF_marker_offd[k1] >= 0)
5799                      {
5800                         if(P_marker_offd[k1] < P_offd_i[i])
5801                         {
5802                            tmp_CF_marker_offd[k1] = 1;
5803                            P_marker_offd[k1] = jj_counter_offd;
5804                            jj_counter_offd++;
5805                         }
5806                      }
5807                   }
5808                }
5809             }
5810          }
5811          /* Look at off diag strong connections of i */
5812          if (num_procs > 1)
5813          {
5814             for (jj = S_offd_i[i]; jj < S_offd_i[i+1]; jj++)
5815             {
5816                i1 = S_offd_j[jj];
5817                if (CF_marker_offd[i1] >= 0)
5818                {
5819                   if(P_marker_offd[i1] < P_offd_i[i])
5820                   {
5821                      tmp_CF_marker_offd[i1] = 1;
5822                      P_marker_offd[i1] = jj_counter_offd;
5823                      jj_counter_offd++;
5824                   }
5825                }
5826                else if (CF_marker_offd[i1] != -3)
5827                { /* F point; look at neighbors of i1. Sop contains global col
5828                   * numbers and entries that could be in S_diag or S_offd or
5829                   * neither. */
5830                   for(kk = Sop_i[i1]; kk < Sop_i[i1+1]; kk++)
5831                   {
5832                      k1 = Sop_j[kk];
5833                      if(k1 >= col_1 && k1 < col_n)
5834                      { /* In S_diag */
5835                         loc_col = k1-col_1;
5836                         if(CF_marker[loc_col] >= 0)
5837                         {
5838                            if(P_marker[loc_col] < P_diag_i[i])
5839                            {
5840                               P_marker[loc_col] = jj_counter;
5841                               jj_counter++;
5842                            }
5843                         }
5844                      }
5845                      else
5846                      {
5847                         loc_col = -k1 - 1;
5848                         if(CF_marker_offd[loc_col] >= 0)
5849                         {
5850                            if(P_marker_offd[loc_col] < P_offd_i[i])
5851                            {
5852                               P_marker_offd[loc_col] = jj_counter_offd;
5853                               tmp_CF_marker_offd[loc_col] = 1;
5854                               jj_counter_offd++;
5855                            }
5856                         }
5857                      }
5858                   }
5859                }
5860             }
5861          }
5862       }
5863    }
5864 
5865    if (debug_flag==4)
5866    {
5867       wall_time = time_getWallclockSeconds() - wall_time;
5868       hypre_printf("Proc = %d     determine structure    %f\n",
5869                    my_id, wall_time);
5870       fflush(NULL);
5871    }
5872    /*-----------------------------------------------------------------------
5873     *  Allocate  arrays.
5874     *-----------------------------------------------------------------------*/
5875 
5876 
5877    P_diag_size = jj_counter;
5878    P_offd_size = jj_counter_offd;
5879 
5880    P_diag_j    = hypre_CTAlloc(HYPRE_Int,  P_diag_size, HYPRE_MEMORY_HOST);
5881    /* we need to include the size of the blocks in the data size */
5882    P_diag_data = hypre_CTAlloc(HYPRE_Real,  P_diag_size, HYPRE_MEMORY_HOST)*bnnz;
5883 
5884    P_offd_j    = hypre_CTAlloc(HYPRE_Int,  P_offd_size, HYPRE_MEMORY_HOST);
5885    /* we need to include the size of the blocks in the data size */
5886    P_offd_data = hypre_CTAlloc(HYPRE_Real,  P_offd_size*bnnz, HYPRE_MEMORY_HOST);
5887 
5888    P_diag_i[n_fine] = jj_counter;
5889    P_offd_i[n_fine] = jj_counter_offd;
5890 
5891    jj_counter = start_indexing;
5892    jj_counter_offd = start_indexing;
5893 
5894    /* Fine to coarse mapping */
5895    if(num_procs > 1)
5896    {
5897       for (i = 0; i < n_fine; i++)
5898          fine_to_coarse[i] += my_first_cpt;
5899 
5900       alt_insert_new_nodes(comm_pkg, extend_comm_pkg, fine_to_coarse,
5901                            full_off_procNodes,
5902                            fine_to_coarse_offd);
5903 
5904       for (i = 0; i < n_fine; i++)
5905          fine_to_coarse[i] -= my_first_cpt;
5906    }
5907 
5908    /* Initialize ahat, which is a modification to a, used in the standard
5909     * interpolation routine. */
5910    ahat = hypre_CTAlloc(HYPRE_Real,  n_fine*bnnz, HYPRE_MEMORY_HOST); /* this is data array */
5911    ihat = hypre_CTAlloc(HYPRE_Int,  n_fine, HYPRE_MEMORY_HOST);
5912    ipnt = hypre_CTAlloc(HYPRE_Int,  n_fine, HYPRE_MEMORY_HOST);
5913 
5914 
5915    if (full_off_procNodes)
5916    {
5917       ahat_offd = hypre_CTAlloc(HYPRE_Real,  full_off_procNodes*bnnz, HYPRE_MEMORY_HOST);  /* this is data array */
5918       ihat_offd = hypre_CTAlloc(HYPRE_Int,  full_off_procNodes, HYPRE_MEMORY_HOST);
5919       ipnt_offd = hypre_CTAlloc(HYPRE_Int,  full_off_procNodes, HYPRE_MEMORY_HOST);
5920    }
5921 
5922    for (i = 0; i < n_fine; i++)
5923    {
5924       P_marker[i] = -1;
5925       ahat[i] = 0;
5926       ihat[i] = -1;
5927    }
5928    for (i = 0; i < full_off_procNodes; i++)
5929    {
5930       P_marker_offd[i] = -1;
5931       ahat_offd[i] = 0;
5932       ihat_offd[i] = -1;
5933    }
5934 
5935    /*-----------------------------------------------------------------------
5936     *  Loop over fine grid points.
5937     *-----------------------------------------------------------------------*/
5938    for (i = 0; i < n_fine; i++)
5939    {
5940       jj_begin_row = jj_counter;
5941       if(num_procs > 1)
5942          jj_begin_row_offd = jj_counter_offd;
5943 
5944       /*--------------------------------------------------------------------
5945        *  If i is a c-point, interpolation is the identity.
5946        *--------------------------------------------------------------------*/
5947 
5948       if (CF_marker[i] >= 0)
5949       {
5950          P_diag_j[jj_counter]    = fine_to_coarse[i];
5951 
5952 
5953          /* P_diag_data[jj_counter] = one; */
5954          hypre_CSRBlockMatrixBlockCopyData(identity_block,
5955                                            &P_diag_data[jj_counter*bnnz],
5956                                            1.0, block_size);
5957 
5958          jj_counter++;
5959       }
5960 
5961       /*--------------------------------------------------------------------
5962        *  If i is an F-point, build interpolation.
5963        *--------------------------------------------------------------------*/
5964 
5965       else if (CF_marker[i] != -3)
5966       {
5967          if (debug_flag==4) wall_time = time_getWallclockSeconds();
5968          strong_f_marker--;
5969          for (jj = S_diag_i[i]; jj < S_diag_i[i+1]; jj++)
5970          {
5971             i1 = S_diag_j[jj];
5972 
5973             /*--------------------------------------------------------------
5974              * If neighbor i1 is a C-point, set column number in P_diag_j
5975              * and initialize interpolation weight to zero.
5976              *--------------------------------------------------------------*/
5977 
5978             if (CF_marker[i1] >= 0)
5979             {
5980                if (P_marker[i1] < jj_begin_row)
5981                {
5982                   P_marker[i1] = jj_counter;
5983                   P_diag_j[jj_counter]    = i1;
5984                   /* P_diag_data[jj_counter] = zero; */
5985                   hypre_CSRBlockMatrixBlockCopyData(zero_block,
5986                                                     &P_diag_data[jj_counter*bnnz],
5987                                                     1.0, block_size);
5988 
5989                   jj_counter++;
5990                }
5991             }
5992             else  if (CF_marker[i1] != -3)
5993             {
5994                P_marker[i1] = strong_f_marker;
5995                for (kk = S_diag_i[i1]; kk < S_diag_i[i1+1]; kk++)
5996                {
5997                   k1 = S_diag_j[kk];
5998                   if (CF_marker[k1] >= 0)
5999                   {
6000                      if(P_marker[k1] < jj_begin_row)
6001                      {
6002                         P_marker[k1] = jj_counter;
6003                         P_diag_j[jj_counter] = k1;
6004                         /* P_diag_data[jj_counter] = zero; */
6005                         hypre_CSRBlockMatrixBlockCopyData(zero_block,
6006                                                           &P_diag_data[jj_counter*bnnz],
6007                                                           1.0, block_size);
6008 
6009                         jj_counter++;
6010                      }
6011                   }
6012                }
6013                if(num_procs > 1)
6014                {
6015                   for (kk = S_offd_i[i1]; kk < S_offd_i[i1+1]; kk++)
6016                   {
6017                      k1 = S_offd_j[kk];
6018                      if(CF_marker_offd[k1] >= 0)
6019                      {
6020                         if(P_marker_offd[k1] < jj_begin_row_offd)
6021                         {
6022                            P_marker_offd[k1] = jj_counter_offd;
6023                            P_offd_j[jj_counter_offd] = k1;
6024                            /* P_offd_data[jj_counter_offd] = zero; */
6025                            hypre_CSRBlockMatrixBlockCopyData(zero_block,
6026                                                              &P_offd_data[jj_counter_offd*bnnz],
6027                                                              1.0, block_size);
6028 
6029                            jj_counter_offd++;
6030                         }
6031                      }
6032                   }
6033                }
6034             }
6035          }
6036 
6037          if ( num_procs > 1)
6038          {
6039             for (jj=S_offd_i[i]; jj < S_offd_i[i+1]; jj++)
6040             {
6041                i1 = S_offd_j[jj];
6042                if ( CF_marker_offd[i1] >= 0)
6043                {
6044                   if(P_marker_offd[i1] < jj_begin_row_offd)
6045                   {
6046                      P_marker_offd[i1] = jj_counter_offd;
6047                      P_offd_j[jj_counter_offd]=i1;
6048                      /* P_offd_data[jj_counter_offd] = zero;*/
6049                      hypre_CSRBlockMatrixBlockCopyData(zero_block,
6050                                                        &P_offd_data[jj_counter_offd*bnnz],
6051                                                        1.0, block_size);
6052 
6053                      jj_counter_offd++;
6054                   }
6055                }
6056                else if (CF_marker_offd[i1] != -3)
6057                {
6058                   P_marker_offd[i1] = strong_f_marker;
6059                   for(kk = Sop_i[i1]; kk < Sop_i[i1+1]; kk++)
6060                   {
6061                      k1 = Sop_j[kk];
6062                      if(k1 >= col_1 && k1 < col_n)
6063                      {
6064                         loc_col = k1-col_1;
6065                         if(CF_marker[loc_col] >= 0)
6066                         {
6067                            if(P_marker[loc_col] < jj_begin_row)
6068                            {
6069                               P_marker[loc_col] = jj_counter;
6070                               P_diag_j[jj_counter] = loc_col;
6071                               /* P_diag_data[jj_counter] = zero;*/
6072                               hypre_CSRBlockMatrixBlockCopyData(zero_block,
6073                                                                 &P_diag_data[jj_counter*bnnz],
6074                                                                 1.0, block_size);
6075                               jj_counter++;
6076                            }
6077                         }
6078                      }
6079                      else
6080                      {
6081                         loc_col = -k1 - 1;
6082                         if(CF_marker_offd[loc_col] >= 0)
6083                         {
6084                            if(P_marker_offd[loc_col] < jj_begin_row_offd)
6085                            {
6086                               P_marker_offd[loc_col] = jj_counter_offd;
6087                               P_offd_j[jj_counter_offd]=loc_col;
6088                               /* P_offd_data[jj_counter_offd] = zero;*/
6089                               hypre_CSRBlockMatrixBlockCopyData(zero_block,
6090                                                                 &P_offd_data[jj_counter_offd*bnnz],
6091                                                                 1.0, block_size);
6092                               jj_counter_offd++;
6093                            }
6094                         }
6095                      }
6096                   }
6097                }
6098             }
6099          }
6100 
6101          jj_end_row = jj_counter;
6102          jj_end_row_offd = jj_counter_offd;
6103 
6104          if (debug_flag==4)
6105          {
6106             wall_time = time_getWallclockSeconds() - wall_time;
6107             wall_1 += wall_time;
6108             fflush(NULL);
6109          }
6110 
6111 /* FIX THIS - is a_hat  - need to copy block data to ahat */
6112 
6113          if (debug_flag==4) wall_time = time_getWallclockSeconds();
6114          cnt_c = 0;
6115          cnt_f = jj_end_row-jj_begin_row;
6116          cnt_c_offd = 0;
6117          cnt_f_offd = jj_end_row_offd-jj_begin_row_offd;
6118          ihat[i] = cnt_f;
6119          ipnt[cnt_f] = i;
6120          ahat[cnt_f++] = A_diag_data[A_diag_i[i]];
6121          for (jj = A_diag_i[i]+1; jj < A_diag_i[i+1]; jj++)
6122          { /* i1 is direct neighbor */
6123             i1 = A_diag_j[jj];
6124             if (P_marker[i1] != strong_f_marker)
6125             {
6126                indx = ihat[i1];
6127                if (indx > -1)
6128                   ahat[indx] += A_diag_data[jj];
6129                else if (P_marker[i1] >= jj_begin_row)
6130                {
6131                   ihat[i1] = cnt_c;
6132                   ipnt[cnt_c] = i1;
6133                   ahat[cnt_c++] += A_diag_data[jj];
6134                }
6135                else if (CF_marker[i1] != -3)
6136                {
6137                   ihat[i1] = cnt_f;
6138                   ipnt[cnt_f] = i1;
6139                   ahat[cnt_f++] += A_diag_data[jj];
6140                }
6141             }
6142             else
6143             {
6144                if(num_functions == 1 || dof_func[i] == dof_func[i1])
6145                {
6146                   distribute = A_diag_data[jj]/A_diag_data[A_diag_i[i1]];
6147                   for (kk = A_diag_i[i1]+1; kk < A_diag_i[i1+1]; kk++)
6148                   {
6149                      k1 = A_diag_j[kk];
6150                      indx = ihat[k1];
6151                      if (indx > -1)
6152                         ahat[indx] -= A_diag_data[kk]*distribute;
6153                      else if (P_marker[k1] >= jj_begin_row)
6154                      {
6155                         ihat[k1] = cnt_c;
6156                         ipnt[cnt_c] = k1;
6157                         ahat[cnt_c++] -= A_diag_data[kk]*distribute;
6158                      }
6159                      else
6160                      {
6161                         ihat[k1] = cnt_f;
6162                         ipnt[cnt_f] = k1;
6163                         ahat[cnt_f++] -= A_diag_data[kk]*distribute;
6164                      }
6165                   }
6166                   if(num_procs > 1)
6167                   {
6168                      for (kk = A_offd_i[i1]; kk < A_offd_i[i1+1]; kk++)
6169                      {
6170                         k1 = A_offd_j[kk];
6171                         indx = ihat_offd[k1];
6172                         if(num_functions == 1 || dof_func[i1] == dof_func_offd[k1])
6173                         {
6174                            if (indx > -1)
6175                               ahat_offd[indx] -= A_offd_data[kk]*distribute;
6176                            else if (P_marker_offd[k1] >= jj_begin_row_offd)
6177                            {
6178                               ihat_offd[k1] = cnt_c_offd;
6179                               ipnt_offd[cnt_c_offd] = k1;
6180                               ahat_offd[cnt_c_offd++] -= A_offd_data[kk]*distribute;
6181                            }
6182                            else
6183                            {
6184                               ihat_offd[k1] = cnt_f_offd;
6185                               ipnt_offd[cnt_f_offd] = k1;
6186                               ahat_offd[cnt_f_offd++] -= A_offd_data[kk]*distribute;
6187                            }
6188                         }
6189                      }
6190                   }
6191                }
6192             }
6193          }
6194          if(num_procs > 1)
6195          {
6196             for(jj = A_offd_i[i]; jj < A_offd_i[i+1]; jj++)
6197             {
6198                i1 = A_offd_j[jj];
6199                if(P_marker_offd[i1] != strong_f_marker)
6200                {
6201                   indx = ihat_offd[i1];
6202                   if (indx > -1)
6203                      ahat_offd[indx] += A_offd_data[jj];
6204                   else if (P_marker_offd[i1] >= jj_begin_row_offd)
6205                   {
6206                      ihat_offd[i1] = cnt_c_offd;
6207                      ipnt_offd[cnt_c_offd] = i1;
6208                      ahat_offd[cnt_c_offd++] += A_offd_data[jj];
6209                   }
6210                   else if (CF_marker_offd[i1] != -3)
6211                   {
6212                      ihat_offd[i1] = cnt_f_offd;
6213                      ipnt_offd[cnt_f_offd] = i1;
6214                      ahat_offd[cnt_f_offd++] += A_offd_data[jj];
6215                   }
6216                }
6217                else
6218                {
6219                   if(num_functions == 1 || dof_func[i] == dof_func_offd[i1])
6220                   {
6221                      distribute = A_offd_data[jj]/A_ext_data[A_ext_i[i1]];
6222                      for (kk = A_ext_i[i1]+1; kk < A_ext_i[i1+1]; kk++)
6223                      {
6224                         k1 = A_ext_j[kk];
6225                         if(k1 >= col_1 && k1 < col_n)
6226                         { /*diag*/
6227                            loc_col = k1 - col_1;
6228                            indx = ihat[loc_col];
6229                            if (indx > -1)
6230                               ahat[indx] -= A_ext_data[kk]*distribute;
6231                            else if (P_marker[loc_col] >= jj_begin_row)
6232                            {
6233                               ihat[loc_col] = cnt_c;
6234                               ipnt[cnt_c] = loc_col;
6235                               ahat[cnt_c++] -= A_ext_data[kk]*distribute;
6236                            }
6237                            else
6238                            {
6239                               ihat[loc_col] = cnt_f;
6240                               ipnt[cnt_f] = loc_col;
6241                               ahat[cnt_f++] -= A_ext_data[kk]*distribute;
6242                            }
6243                         }
6244                         else
6245                         {
6246                            loc_col = -k1 - 1;
6247                            if(num_functions == 1 ||
6248                               dof_func_offd[loc_col] == dof_func_offd[i1])
6249                            {
6250                               indx = ihat_offd[loc_col];
6251                               if (indx > -1)
6252                                  ahat_offd[indx] -= A_ext_data[kk]*distribute;
6253                               else if(P_marker_offd[loc_col] >= jj_begin_row_offd)
6254                               {
6255                                  ihat_offd[loc_col] = cnt_c_offd;
6256                                  ipnt_offd[cnt_c_offd] = loc_col;
6257                                  ahat_offd[cnt_c_offd++] -= A_ext_data[kk]*distribute;
6258                               }
6259                               else
6260                               {
6261                                  ihat_offd[loc_col] = cnt_f_offd;
6262                                  ipnt_offd[cnt_f_offd] = loc_col;
6263                                  ahat_offd[cnt_f_offd++] -= A_ext_data[kk]*distribute;
6264                               }
6265                            }
6266                         }
6267                      }
6268                   }
6269                }
6270             }
6271          }
6272          if (debug_flag==4)
6273          {
6274             wall_time = time_getWallclockSeconds() - wall_time;
6275             wall_2 += wall_time;
6276             fflush(NULL);
6277          }
6278 
6279          if (debug_flag==4) wall_time = time_getWallclockSeconds();
6280          diagonal = ahat[cnt_c];
6281          ahat[cnt_c] = 0;
6282          sum_pos = 0;
6283          sum_pos_C = 0;
6284          sum_neg = 0;
6285          sum_neg_C = 0;
6286          sum = 0;
6287          sum_C = 0;
6288          if(sep_weight == 1)
6289          {
6290             for (jj=0; jj < cnt_c; jj++)
6291             {
6292                if (ahat[jj] > 0)
6293                {
6294                   sum_pos_C += ahat[jj];
6295                }
6296                else
6297                {
6298                   sum_neg_C += ahat[jj];
6299                }
6300             }
6301             if(num_procs > 1)
6302             {
6303                for (jj=0; jj < cnt_c_offd; jj++)
6304                {
6305                   if (ahat_offd[jj] > 0)
6306                   {
6307                      sum_pos_C += ahat_offd[jj];
6308                   }
6309                   else
6310                   {
6311                      sum_neg_C += ahat_offd[jj];
6312                   }
6313                }
6314             }
6315             sum_pos = sum_pos_C;
6316             sum_neg = sum_neg_C;
6317             for (jj=cnt_c+1; jj < cnt_f; jj++)
6318             {
6319                if (ahat[jj] > 0)
6320                {
6321                   sum_pos += ahat[jj];
6322                }
6323                else
6324                {
6325                   sum_neg += ahat[jj];
6326                }
6327                ahat[jj] = 0;
6328             }
6329             if(num_procs > 1)
6330             {
6331                for (jj=cnt_c_offd; jj < cnt_f_offd; jj++)
6332                {
6333                   if (ahat_offd[jj] > 0)
6334                   {
6335                      sum_pos += ahat_offd[jj];
6336                   }
6337                   else
6338                   {
6339                      sum_neg += ahat_offd[jj];
6340                   }
6341                   ahat_offd[jj] = 0;
6342                }
6343             }
6344             if (sum_neg_C) alfa = sum_neg/sum_neg_C/diagonal;
6345             if (sum_pos_C) beta = sum_pos/sum_pos_C/diagonal;
6346 
6347             /*-----------------------------------------------------------------
6348              * Set interpolation weight by dividing by the diagonal.
6349              *-----------------------------------------------------------------*/
6350 
6351             for (jj = jj_begin_row; jj < jj_end_row; jj++)
6352             {
6353                j1 = ihat[P_diag_j[jj]];
6354                if (ahat[j1] > 0)
6355                   P_diag_data[jj] = -beta*ahat[j1];
6356                else
6357                   P_diag_data[jj] = -alfa*ahat[j1];
6358 
6359                P_diag_j[jj] = fine_to_coarse[P_diag_j[jj]];
6360                ahat[j1] = 0;
6361             }
6362             for (jj=0; jj < cnt_f; jj++)
6363                ihat[ipnt[jj]] = -1;
6364             if(num_procs > 1)
6365             {
6366                for (jj = jj_begin_row_offd; jj < jj_end_row_offd; jj++)
6367                {
6368                   j1 = ihat_offd[P_offd_j[jj]];
6369                   if (ahat_offd[j1] > 0)
6370                      P_offd_data[jj] = -beta*ahat_offd[j1];
6371                   else
6372                      P_offd_data[jj] = -alfa*ahat_offd[j1];
6373 
6374                   ahat_offd[j1] = 0;
6375                }
6376                for (jj=0; jj < cnt_f_offd; jj++)
6377                   ihat_offd[ipnt_offd[jj]] = -1;
6378             }
6379          }
6380          else
6381          {
6382             for (jj=0; jj < cnt_c; jj++)
6383             {
6384                sum_C += ahat[jj];
6385             }
6386             if(num_procs > 1)
6387             {
6388                for (jj=0; jj < cnt_c_offd; jj++)
6389                {
6390                   sum_C += ahat_offd[jj];
6391                }
6392             }
6393             sum = sum_C;
6394             for (jj=cnt_c+1; jj < cnt_f; jj++)
6395             {
6396                sum += ahat[jj];
6397                ahat[jj] = 0;
6398             }
6399             if(num_procs > 1)
6400             {
6401                for (jj=cnt_c_offd; jj < cnt_f_offd; jj++)
6402                {
6403                   sum += ahat_offd[jj];
6404                   ahat_offd[jj] = 0;
6405                }
6406             }
6407             if (sum_C) alfa = sum/sum_C/diagonal;
6408 
6409             /*-----------------------------------------------------------------
6410              * Set interpolation weight by dividing by the diagonal.
6411              *-----------------------------------------------------------------*/
6412 
6413             for (jj = jj_begin_row; jj < jj_end_row; jj++)
6414             {
6415                j1 = ihat[P_diag_j[jj]];
6416                P_diag_data[jj] = -alfa*ahat[j1];
6417                P_diag_j[jj] = fine_to_coarse[P_diag_j[jj]];
6418                ahat[j1] = 0;
6419             }
6420             for (jj=0; jj < cnt_f; jj++)
6421                ihat[ipnt[jj]] = -1;
6422             if(num_procs > 1)
6423             {
6424                for (jj = jj_begin_row_offd; jj < jj_end_row_offd; jj++)
6425                {
6426                   j1 = ihat_offd[P_offd_j[jj]];
6427                   P_offd_data[jj] = -alfa*ahat_offd[j1];
6428                   ahat_offd[j1] = 0;
6429                }
6430                for (jj=0; jj < cnt_f_offd; jj++)
6431                   ihat_offd[ipnt_offd[jj]] = -1;
6432             }
6433          }
6434          if (debug_flag==4)
6435          {
6436             wall_time = time_getWallclockSeconds() - wall_time;
6437             wall_3 += wall_time;
6438             fflush(NULL);
6439          }
6440       }
6441    }
6442 
6443    if (debug_flag==4)
6444    {
6445       hypre_printf("Proc = %d fill part 1 %f part 2 %f  part 3 %f\n",
6446                    my_id, wall_1, wall_2, wall_3);
6447       fflush(NULL);
6448    }
6449    P = hypre_ParCSRMatrixCreate(comm,
6450                                 hypre_ParCSRMatrixGlobalNumRows(A),
6451                                 total_global_cpts,
6452                                 hypre_ParCSRMatrixColStarts(A),
6453                                 num_cpts_global,
6454                                 0,
6455                                 P_diag_i[n_fine],
6456                                 P_offd_i[n_fine]);
6457 
6458    P_diag = hypre_ParCSRMatrixDiag(P);
6459    hypre_CSRMatrixData(P_diag) = P_diag_data;
6460    hypre_CSRMatrixI(P_diag) = P_diag_i;
6461    hypre_CSRMatrixJ(P_diag) = P_diag_j;
6462    P_offd = hypre_ParCSRMatrixOffd(P);
6463    hypre_CSRMatrixData(P_offd) = P_offd_data;
6464    hypre_CSRMatrixI(P_offd) = P_offd_i;
6465    hypre_CSRMatrixJ(P_offd) = P_offd_j;
6466 
6467    /* Compress P, removing coefficients smaller than trunc_factor * Max */
6468    if (trunc_factor != 0.0 || max_elmts > 0)
6469    {
6470       hypre_BoomerAMGInterpTruncation(P, trunc_factor, max_elmts);
6471       P_diag_data = hypre_CSRMatrixData(P_diag);
6472       P_diag_i = hypre_CSRMatrixI(P_diag);
6473       P_diag_j = hypre_CSRMatrixJ(P_diag);
6474       P_offd_data = hypre_CSRMatrixData(P_offd);
6475       P_offd_i = hypre_CSRMatrixI(P_offd);
6476       P_offd_j = hypre_CSRMatrixJ(P_offd);
6477       P_diag_size = P_diag_i[n_fine];
6478       P_offd_size = P_offd_i[n_fine];
6479    }
6480 
6481    /* This builds col_map, col_map should be monotone increasing and contain
6482     * global numbers. */
6483    num_cols_P_offd = 0;
6484    if(P_offd_size)
6485    {
6486       hypre_TFree(P_marker, HYPRE_MEMORY_HOST);
6487       P_marker = hypre_CTAlloc(HYPRE_Int,  full_off_procNodes, HYPRE_MEMORY_HOST);
6488 
6489       for (i=0; i < full_off_procNodes; i++)
6490          P_marker[i] = 0;
6491 
6492       num_cols_P_offd = 0;
6493       for (i=0; i < P_offd_size; i++)
6494       {
6495          index = P_offd_j[i];
6496          if (!P_marker[index])
6497          {
6498             if(tmp_CF_marker_offd[index] >= 0)
6499             {
6500                num_cols_P_offd++;
6501                P_marker[index] = 1;
6502             }
6503          }
6504       }
6505 
6506       col_map_offd_P = hypre_CTAlloc(HYPRE_Int,  num_cols_P_offd, HYPRE_MEMORY_HOST);
6507 
6508       index = 0;
6509       for(i = 0; i < num_cols_P_offd; i++)
6510       {
6511          while( P_marker[index] == 0) index++;
6512          col_map_offd_P[i] = index++;
6513       }
6514       for(i = 0; i < P_offd_size; i++)
6515          P_offd_j[i] = hypre_BinarySearch(col_map_offd_P,
6516                                           P_offd_j[i],
6517                                           num_cols_P_offd);
6518 
6519       index = 0;
6520       for(i = 0; i < num_cols_P_offd; i++)
6521       {
6522          while (P_marker[index] == 0) index++;
6523 
6524          col_map_offd_P[i] = fine_to_coarse_offd[index];
6525          index++;
6526       }
6527 
6528       /* Sort the col_map_offd_P and P_offd_j correctly */
6529       for(i = 0; i < num_cols_P_offd; i++)
6530          P_marker[i] = col_map_offd_P[i];
6531 
6532       /* Check if sort actually changed anything */
6533       if(ssort(col_map_offd_P,num_cols_P_offd))
6534       {
6535          for(i = 0; i < P_offd_size; i++)
6536             for(j = 0; j < num_cols_P_offd; j++)
6537                if(P_marker[P_offd_j[i]] == col_map_offd_P[j])
6538                {
6539                   P_offd_j[i] = j;
6540                   j = num_cols_P_offd;
6541                }
6542       }
6543       hypre_TFree(P_marker, HYPRE_MEMORY_HOST);
6544    }
6545 
6546    if (num_cols_P_offd)
6547    {
6548       hypre_ParCSRMatrixColMapOffd(P) = col_map_offd_P;
6549       hypre_CSRMatrixNumCols(P_offd) = num_cols_P_offd;
6550    }
6551 
6552    hypre_MatvecCommPkgCreate(P);
6553 
6554    for (i=0; i < n_fine; i++)
6555       if (CF_marker[i] == -3) CF_marker[i] = -1;
6556 
6557    *P_ptr = P;
6558 
6559    /* Deallocate memory */
6560    hypre_TFree(fine_to_coarse, HYPRE_MEMORY_HOST);
6561    hypre_TFree(P_marker, HYPRE_MEMORY_HOST);
6562    hypre_TFree(ahat, HYPRE_MEMORY_HOST);
6563    hypre_TFree(ihat, HYPRE_MEMORY_HOST);
6564    hypre_TFree(ipnt, HYPRE_MEMORY_HOST);
6565 
6566    if (full_off_procNodes)
6567    {
6568       hypre_TFree(ahat_offd, HYPRE_MEMORY_HOST);
6569       hypre_TFree(ihat_offd, HYPRE_MEMORY_HOST);
6570       hypre_TFree(ipnt_offd, HYPRE_MEMORY_HOST);
6571    }
6572    if (num_procs > 1)
6573    {
6574       hypre_CSRMatrixDestroy(Sop);
6575       hypre_CSRMatrixDestroy(A_ext);
6576       hypre_TFree(fine_to_coarse_offd, HYPRE_MEMORY_HOST);
6577       hypre_TFree(P_marker_offd, HYPRE_MEMORY_HOST);
6578       hypre_TFree(CF_marker_offd, HYPRE_MEMORY_HOST);
6579       hypre_TFree(tmp_CF_marker_offd, HYPRE_MEMORY_HOST);
6580 
6581       if(num_functions > 1)
6582          hypre_TFree(dof_func_offd, HYPRE_MEMORY_HOST);
6583       hypre_TFree(found, HYPRE_MEMORY_HOST);
6584 
6585       hypre_MatvecCommPkgDestroy(extend_comm_pkg);
6586 
6587    }
6588 
6589 
6590    return hypre_error_flag;
6591 }
6592 
6593 #endif
6594