1 /******************************************************************************
2  * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other
3  * HYPRE Project Developers. See the top-level COPYRIGHT file for details.
4  *
5  * SPDX-License-Identifier: (Apache-2.0 OR MIT)
6  ******************************************************************************/
7 
8 #include "_hypre_utilities.h"
9 #include "_hypre_parcsr_mv.h"
10 #include "_hypre_lapack.h"
11 #include "_hypre_blas.h"
12 
13 /*--------------------------------------------------------------------------
14  * hypre_ParMatmul_RowSizes:
15  *
 * Computes the sizes of the rows of C. Formerly part of hypre_ParMatmul,
 * but factored out so that it can also be used for multiplication of
 * Boolean matrices.
18  *
19  * Arrays computed: C_diag_i, C_offd_i.
20  *
 * Arrays needed: (14, all HYPRE_Int*)
 *   rownnz_A,
 *   A_diag_i, A_diag_j,
 *   A_offd_i, A_offd_j,
 *   B_diag_i, B_diag_j,
 *   B_offd_i, B_offd_j,
 *   B_ext_diag_i, B_ext_diag_j,
 *   B_ext_offd_i, B_ext_offd_j,
 *   map_B_to_C.
 *
 * Scalars computed: C_diag_size, C_offd_size.
 *
 * Scalars needed:
 *   num_rownnz_A, num_rows_diag_A, num_cols_offd_A, allsquare,
 *   num_cols_diag_B, num_cols_offd_B, num_cols_offd_C
37  *--------------------------------------------------------------------------*/
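/*--------------------------------------------------------------------------
 * Illustrative sketch (not part of the library): on return, *C_diag_i and
 * *C_offd_i are CSR row-pointer arrays for the yet-to-be-filled diag and
 * offd parts of C, so that, e.g.,
 *
 *    HYPRE_Int nnz_diag_row_i = (*C_diag_i)[i+1] - (*C_diag_i)[i];
 *    HYPRE_Int nnz_offd_row_i = (*C_offd_i)[i+1] - (*C_offd_i)[i];
 *
 * and *C_diag_size = (*C_diag_i)[num_rows_diag_A],
 *     *C_offd_size = (*C_offd_i)[num_rows_diag_A].
 *--------------------------------------------------------------------------*/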
38 
39 void
hypre_ParMatmul_RowSizes( HYPRE_MemoryLocation memory_location,
41                           HYPRE_Int **C_diag_i,
42                           HYPRE_Int **C_offd_i,
43                           HYPRE_Int  *rownnz_A,
44                           HYPRE_Int  *A_diag_i,
45                           HYPRE_Int  *A_diag_j,
46                           HYPRE_Int  *A_offd_i,
47                           HYPRE_Int  *A_offd_j,
48                           HYPRE_Int  *B_diag_i,
49                           HYPRE_Int  *B_diag_j,
50                           HYPRE_Int  *B_offd_i,
51                           HYPRE_Int  *B_offd_j,
52                           HYPRE_Int  *B_ext_diag_i,
53                           HYPRE_Int  *B_ext_diag_j,
54                           HYPRE_Int  *B_ext_offd_i,
55                           HYPRE_Int  *B_ext_offd_j,
56                           HYPRE_Int  *map_B_to_C,
57                           HYPRE_Int  *C_diag_size,
58                           HYPRE_Int  *C_offd_size,
59                           HYPRE_Int   num_rownnz_A,
60                           HYPRE_Int   num_rows_diag_A,
61                           HYPRE_Int   num_cols_offd_A,
62                           HYPRE_Int   allsquare,
63                           HYPRE_Int   num_cols_diag_B,
64                           HYPRE_Int   num_cols_offd_B,
65                           HYPRE_Int   num_cols_offd_C )
66 {
67    HYPRE_Int *jj_count_diag_array;
68    HYPRE_Int *jj_count_offd_array;
69 
70    HYPRE_Int  start_indexing = 0; /* start indexing for C_data at 0 */
71    HYPRE_Int  num_threads = hypre_NumThreads();
72 
73    *C_diag_i = hypre_CTAlloc(HYPRE_Int, num_rows_diag_A+1, memory_location);
74    *C_offd_i = hypre_CTAlloc(HYPRE_Int, num_rows_diag_A+1, memory_location);
75 
76    jj_count_diag_array = hypre_CTAlloc(HYPRE_Int, num_threads, HYPRE_MEMORY_HOST);
77    jj_count_offd_array = hypre_CTAlloc(HYPRE_Int, num_threads, HYPRE_MEMORY_HOST);
78 
79    /*-----------------------------------------------------------------------
80     *  Loop over rows of A
81     *-----------------------------------------------------------------------*/
82 #ifdef HYPRE_USING_OPENMP
83 #pragma omp parallel
84 #endif
85    {
86       HYPRE_Int  *B_marker = NULL;
87       HYPRE_Int   jj_row_begin_diag, jj_count_diag;
88       HYPRE_Int   jj_row_begin_offd, jj_count_offd;
89       HYPRE_Int   i1, ii1, i2, i3, jj2, jj3;
90       HYPRE_Int   size, rest, num_threads;
91       HYPRE_Int   ii, ns, ne;
92 
93       num_threads = hypre_NumActiveThreads();
94       size = num_rownnz_A/num_threads;
95       rest = num_rownnz_A - size*num_threads;
96 
97       ii = hypre_GetThreadNum();
98       if (ii < rest)
99       {
100          ns = ii*size+ii;
101          ne = (ii+1)*size+ii+1;
102       }
103       else
104       {
105          ns = ii*size+rest;
106          ne = (ii+1)*size+rest;
107       }
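      /* Illustrative example (assumed values): with num_rownnz_A = 10 and
       * num_threads = 3, thread 0 gets the index range [0,4), thread 1 gets
       * [4,7), and thread 2 gets [7,10). */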
108       jj_count_diag = start_indexing;
109       jj_count_offd = start_indexing;
110 
111       if (num_cols_diag_B || num_cols_offd_C)
112       {
113          B_marker = hypre_CTAlloc(HYPRE_Int, num_cols_diag_B + num_cols_offd_C, HYPRE_MEMORY_HOST);
114       }
115 
116       for (i1 = 0; i1 < num_cols_diag_B + num_cols_offd_C; i1++)
117       {
118          B_marker[i1] = -1;
119       }
120 
121       for (i1 = ns; i1 < ne; i1++)
122       {
123          jj_row_begin_diag = jj_count_diag;
124          jj_row_begin_offd = jj_count_offd;
125          if (rownnz_A)
126          {
127             ii1 = rownnz_A[i1];
128          }
129          else
130          {
131             ii1 = i1;
132 
133             /*--------------------------------------------------------------------
134              *  Set marker for diagonal entry, C_{i1,i1} (for square matrices).
135              *--------------------------------------------------------------------*/
136 
137             if (allsquare)
138             {
139                B_marker[i1] = jj_count_diag;
140                jj_count_diag++;
141             }
142          }
143 
144          /*-----------------------------------------------------------------
145           *  Loop over entries in row ii1 of A_offd.
146           *-----------------------------------------------------------------*/
147 
148          if (num_cols_offd_A)
149          {
150             for (jj2 = A_offd_i[ii1]; jj2 < A_offd_i[ii1+1]; jj2++)
151             {
152                i2 = A_offd_j[jj2];
153 
154                /*-----------------------------------------------------------
155                 *  Loop over entries in row i2 of B_ext.
156                 *-----------------------------------------------------------*/
157 
158                for (jj3 = B_ext_offd_i[i2]; jj3 < B_ext_offd_i[i2+1]; jj3++)
159                {
160                   i3 = num_cols_diag_B+B_ext_offd_j[jj3];
161 
162                   /*--------------------------------------------------------
163                    *  Check B_marker to see that C_{ii1,i3} has not already
164                    *  been accounted for. If it has not, mark it and increment
165                    *  counter.
166                    *--------------------------------------------------------*/
167 
168                   if (B_marker[i3] < jj_row_begin_offd)
169                   {
170                      B_marker[i3] = jj_count_offd;
171                      jj_count_offd++;
172                   }
173                }
174 
175                for (jj3 = B_ext_diag_i[i2]; jj3 < B_ext_diag_i[i2+1]; jj3++)
176                {
177                   i3 = B_ext_diag_j[jj3];
178 
179                   if (B_marker[i3] < jj_row_begin_diag)
180                   {
181                      B_marker[i3] = jj_count_diag;
182                      jj_count_diag++;
183                   }
184                }
185             }
186          }
187 
188          /*-----------------------------------------------------------------
189           *  Loop over entries in row ii1 of A_diag.
190           *-----------------------------------------------------------------*/
191 
192          for (jj2 = A_diag_i[ii1]; jj2 < A_diag_i[ii1+1]; jj2++)
193          {
194             i2 = A_diag_j[jj2];
195 
196             /*-----------------------------------------------------------
197              *  Loop over entries in row i2 of B_diag.
198              *-----------------------------------------------------------*/
199 
200             for (jj3 = B_diag_i[i2]; jj3 < B_diag_i[i2+1]; jj3++)
201             {
202                i3 = B_diag_j[jj3];
203 
204                /*--------------------------------------------------------
205                 *  Check B_marker to see that C_{ii1,i3} has not already
206                 *  been accounted for. If it has not, mark it and increment
207                 *  counter.
208                 *--------------------------------------------------------*/
209 
210                if (B_marker[i3] < jj_row_begin_diag)
211                {
212                   B_marker[i3] = jj_count_diag;
213                   jj_count_diag++;
214                }
215             }
216 
217             /*-----------------------------------------------------------
218              *  Loop over entries in row i2 of B_offd.
219              *-----------------------------------------------------------*/
220 
221             if (num_cols_offd_B)
222             {
223                for (jj3 = B_offd_i[i2]; jj3 < B_offd_i[i2+1]; jj3++)
224                {
225                   i3 = num_cols_diag_B+map_B_to_C[B_offd_j[jj3]];
226 
227                   /*--------------------------------------------------------
228                    *  Check B_marker to see that C_{ii1,i3} has not already
229                    *  been accounted for. If it has not, mark it and increment
230                    *  counter.
231                    *--------------------------------------------------------*/
232 
233                   if (B_marker[i3] < jj_row_begin_offd)
234                   {
235                      B_marker[i3] = jj_count_offd;
236                      jj_count_offd++;
237                   }
238                }
239             }
240          }
241 
242          /*--------------------------------------------------------------------
243           * Set C_diag_i and C_offd_i for this row.
244           *--------------------------------------------------------------------*/
245 
246          (*C_diag_i)[ii1] = jj_row_begin_diag;
247          (*C_offd_i)[ii1] = jj_row_begin_offd;
248       }
249 
250       jj_count_diag_array[ii] = jj_count_diag;
251       jj_count_offd_array[ii] = jj_count_offd;
252 
253       hypre_TFree(B_marker, HYPRE_MEMORY_HOST);
254 #ifdef HYPRE_USING_OPENMP
255 #pragma omp barrier
256 #endif
257 
258       /* Correct diag_i and offd_i - phase 1 */
259       if (ii)
260       {
261          jj_count_diag = jj_count_diag_array[0];
262          jj_count_offd = jj_count_offd_array[0];
263          for (i1 = 1; i1 < ii; i1++)
264          {
265             jj_count_diag += jj_count_diag_array[i1];
266             jj_count_offd += jj_count_offd_array[i1];
267          }
268 
269          for (i1 = ns; i1 < ne; i1++)
270          {
271             ii1 = rownnz_A ? rownnz_A[i1] : i1;
272             (*C_diag_i)[ii1] += jj_count_diag;
273             (*C_offd_i)[ii1] += jj_count_offd;
274          }
275       }
276       else
277       {
278          (*C_diag_i)[num_rows_diag_A] = 0;
279          (*C_offd_i)[num_rows_diag_A] = 0;
280          for (i1 = 0; i1 < num_threads; i1++)
281          {
282             (*C_diag_i)[num_rows_diag_A] += jj_count_diag_array[i1];
283             (*C_offd_i)[num_rows_diag_A] += jj_count_offd_array[i1];
284          }
285       }
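      /* Illustrative example (assumed values): with three threads whose local
       * counts are jj_count_diag_array = {4, 2, 5}, thread 1 shifts its row
       * pointers by 4, thread 2 by 4 + 2 = 6, and thread 0 stores the total
       * 11 in (*C_diag_i)[num_rows_diag_A]. */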
286 
287       /* Correct diag_i and offd_i - phase 2 */
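      /* Illustrative example (assumed values): if rownnz_A = {1, 4, ...},
       * rows 2 and 3 of C are empty, so their row pointers are copied from
       * the next nonzero row: (*C_diag_i)[2] = (*C_diag_i)[3] =
       * (*C_diag_i)[4], and likewise for C_offd_i. */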
288       if (rownnz_A != NULL)
289       {
290 #ifdef HYPRE_USING_OPENMP
291 #pragma omp barrier
292 #endif
293          for (i1 = ns; i1 < (ne-1); i1++)
294          {
295             for (ii1 = rownnz_A[i1] + 1; ii1 < rownnz_A[i1+1]; ii1++)
296             {
297                (*C_diag_i)[ii1] = (*C_diag_i)[rownnz_A[i1+1]];
298                (*C_offd_i)[ii1] = (*C_offd_i)[rownnz_A[i1+1]];
299             }
300          }
301 
302          if (ii < (num_threads - 1))
303          {
304             for (ii1 = rownnz_A[ne-1] + 1; ii1 < rownnz_A[ne]; ii1++)
305             {
306                (*C_diag_i)[ii1] = (*C_diag_i)[rownnz_A[ne]];
307                (*C_offd_i)[ii1] = (*C_offd_i)[rownnz_A[ne]];
308             }
309          }
310          else
311          {
312             for (ii1 = rownnz_A[ne-1] + 1; ii1 < num_rows_diag_A; ii1++)
313             {
314                (*C_diag_i)[ii1] = (*C_diag_i)[num_rows_diag_A];
315                (*C_offd_i)[ii1] = (*C_offd_i)[num_rows_diag_A];
316             }
317          }
318       }
319    } /* end parallel loop */
320 
321    *C_diag_size = (*C_diag_i)[num_rows_diag_A];
322    *C_offd_size = (*C_offd_i)[num_rows_diag_A];
323 
324 #ifdef HYPRE_DEBUG
325    HYPRE_Int i;
326 
327    for (i = 0; i < num_rows_diag_A; i++)
328    {
329       hypre_assert((*C_diag_i)[i] <= (*C_diag_i)[i+1]);
330       hypre_assert((*C_offd_i)[i] <= (*C_offd_i)[i+1]);
331    }
332 #endif
333 
334    hypre_TFree(jj_count_diag_array, HYPRE_MEMORY_HOST);
335    hypre_TFree(jj_count_offd_array, HYPRE_MEMORY_HOST);
336 
337    /* End of First Pass */
338 }
339 
340 /*--------------------------------------------------------------------------
341  * hypre_ParMatmul:
342  *
343  * Multiplies two ParCSRMatrices A and B and returns the product in
344  * ParCSRMatrix C.
345  *--------------------------------------------------------------------------*/
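/*--------------------------------------------------------------------------
 * Illustrative usage sketch (not part of the library); A and B are assumed
 * to be fully assembled hypre_ParCSRMatrix objects with compatible
 * dimensions and distributions:
 *
 *    hypre_ParCSRMatrix *C = hypre_ParMatmul(A, B);   // C = A * B
 *    if (!C)
 *    {
 *       // incompatible dimensions; an error was recorded via hypre_error
 *    }
 *    ...
 *    hypre_ParCSRMatrixDestroy(C);
 *--------------------------------------------------------------------------*/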
346 
347 hypre_ParCSRMatrix*
hypre_ParMatmul( hypre_ParCSRMatrix  *A,
349                  hypre_ParCSRMatrix  *B )
350 {
351 #ifdef HYPRE_PROFILE
352    hypre_profile_times[HYPRE_TIMER_ID_MATMUL] -= hypre_MPI_Wtime();
353 #endif
354 
355    /* ParCSRMatrix A */
356    MPI_Comm            comm              = hypre_ParCSRMatrixComm(A);
357    HYPRE_BigInt        nrows_A           = hypre_ParCSRMatrixGlobalNumRows(A);
358    HYPRE_BigInt        ncols_A           = hypre_ParCSRMatrixGlobalNumCols(A);
359    HYPRE_BigInt       *row_starts_A      = hypre_ParCSRMatrixRowStarts(A);
360    HYPRE_Int           num_rownnz_A;
361    HYPRE_Int          *rownnz_A = NULL;
362 
363    /* ParCSRMatrix B */
364    HYPRE_BigInt        nrows_B           = hypre_ParCSRMatrixGlobalNumRows(B);
365    HYPRE_BigInt        ncols_B           = hypre_ParCSRMatrixGlobalNumCols(B);
366    HYPRE_BigInt        first_col_diag_B  = hypre_ParCSRMatrixFirstColDiag(B);
367    HYPRE_BigInt       *col_starts_B      = hypre_ParCSRMatrixColStarts(B);
368    HYPRE_BigInt        last_col_diag_B;
369 
370    /* A_diag */
371    hypre_CSRMatrix    *A_diag            = hypre_ParCSRMatrixDiag(A);
372    HYPRE_Complex      *A_diag_data       = hypre_CSRMatrixData(A_diag);
373    HYPRE_Int          *A_diag_i          = hypre_CSRMatrixI(A_diag);
374    HYPRE_Int          *A_diag_j          = hypre_CSRMatrixJ(A_diag);
375    HYPRE_Int          *A_diag_ir         = hypre_CSRMatrixRownnz(A_diag);
376    HYPRE_Int           num_rownnz_diag_A = hypre_CSRMatrixNumRownnz(A_diag);
377    HYPRE_Int           num_rows_diag_A   = hypre_CSRMatrixNumRows(A_diag);
378    HYPRE_Int           num_cols_diag_A   = hypre_CSRMatrixNumCols(A_diag);
379 
380    /* A_offd */
381    hypre_CSRMatrix    *A_offd            = hypre_ParCSRMatrixOffd(A);
382    HYPRE_Complex      *A_offd_data       = hypre_CSRMatrixData(A_offd);
383    HYPRE_Int          *A_offd_i          = hypre_CSRMatrixI(A_offd);
384    HYPRE_Int          *A_offd_j          = hypre_CSRMatrixJ(A_offd);
385    HYPRE_Int          *A_offd_ir         = hypre_CSRMatrixRownnz(A_offd);
386    HYPRE_Int           num_rownnz_offd_A = hypre_CSRMatrixNumRownnz(A_offd);
387    HYPRE_Int           num_rows_offd_A   = hypre_CSRMatrixNumRows(A_offd);
388    HYPRE_Int           num_cols_offd_A   = hypre_CSRMatrixNumCols(A_offd);
389 
390    /* B_diag */
391    hypre_CSRMatrix    *B_diag            = hypre_ParCSRMatrixDiag(B);
392    HYPRE_Complex      *B_diag_data       = hypre_CSRMatrixData(B_diag);
393    HYPRE_Int          *B_diag_i          = hypre_CSRMatrixI(B_diag);
394    HYPRE_Int          *B_diag_j          = hypre_CSRMatrixJ(B_diag);
395    HYPRE_Int           num_rows_diag_B   = hypre_CSRMatrixNumRows(B_diag);
396    HYPRE_Int           num_cols_diag_B   = hypre_CSRMatrixNumCols(B_diag);
397 
398    /* B_offd */
399    hypre_CSRMatrix    *B_offd            = hypre_ParCSRMatrixOffd(B);
400    HYPRE_BigInt       *col_map_offd_B    = hypre_ParCSRMatrixColMapOffd(B);
401    HYPRE_Complex      *B_offd_data       = hypre_CSRMatrixData(B_offd);
402    HYPRE_Int          *B_offd_i          = hypre_CSRMatrixI(B_offd);
403    HYPRE_Int          *B_offd_j          = hypre_CSRMatrixJ(B_offd);
404    HYPRE_Int           num_cols_offd_B   = hypre_CSRMatrixNumCols(B_offd);
405 
406    /* ParCSRMatrix C */
407    hypre_ParCSRMatrix *C;
408    HYPRE_BigInt       *col_map_offd_C;
409    HYPRE_Int          *map_B_to_C = NULL;
410 
411    /* C_diag */
412    hypre_CSRMatrix    *C_diag;
413    HYPRE_Complex      *C_diag_data;
414    HYPRE_Int          *C_diag_i;
415    HYPRE_Int          *C_diag_j;
   HYPRE_Int           C_diag_size;
417    HYPRE_Int           num_cols_offd_C = 0;
418 
419    /* C_offd */
420    hypre_CSRMatrix    *C_offd;
421    HYPRE_Complex      *C_offd_data = NULL;
422    HYPRE_Int          *C_offd_i = NULL;
423    HYPRE_Int          *C_offd_j = NULL;
   HYPRE_Int           C_offd_size;
425 
426    /* Bs_ext */
427    hypre_CSRMatrix    *Bs_ext;
428    HYPRE_Complex      *Bs_ext_data;
429    HYPRE_Int          *Bs_ext_i;
430    HYPRE_BigInt       *Bs_ext_j;
431    HYPRE_Complex      *B_ext_diag_data;
432    HYPRE_Int          *B_ext_diag_i;
433    HYPRE_Int          *B_ext_diag_j;
434    HYPRE_Int           B_ext_diag_size;
435    HYPRE_Complex      *B_ext_offd_data;
436    HYPRE_Int          *B_ext_offd_i;
437    HYPRE_Int          *B_ext_offd_j;
438    HYPRE_BigInt       *B_big_offd_j = NULL;
439    HYPRE_Int           B_ext_offd_size;
440 
441    HYPRE_Int           allsquare = 0;
442    HYPRE_Int           num_procs;
443    HYPRE_Int          *my_diag_array;
444    HYPRE_Int          *my_offd_array;
445    HYPRE_Int           max_num_threads;
446 
447    HYPRE_Complex       zero = 0.0;
448 
449    HYPRE_MemoryLocation memory_location_A = hypre_ParCSRMatrixMemoryLocation(A);
450    HYPRE_MemoryLocation memory_location_B = hypre_ParCSRMatrixMemoryLocation(B);
451 
452    /* RL: TODO cannot guarantee, maybe should never assert
453    hypre_assert(memory_location_A == memory_location_B);
454    */
455 
456    /* RL: in the case of A=H, B=D, or A=D, B=H, let C = D,
457     * not sure if this is the right thing to do.
458     * Also, need something like this in other places
459     * TODO */
460    HYPRE_MemoryLocation memory_location_C = hypre_max(memory_location_A, memory_location_B);
461 
462    HYPRE_ANNOTATE_FUNC_BEGIN;
463 
464    max_num_threads = hypre_NumThreads();
465    my_diag_array = hypre_CTAlloc(HYPRE_Int, max_num_threads, HYPRE_MEMORY_HOST);
466    my_offd_array = hypre_CTAlloc(HYPRE_Int, max_num_threads, HYPRE_MEMORY_HOST);
467 
468    if (ncols_A != nrows_B || num_cols_diag_A != num_rows_diag_B)
469    {
470       hypre_error_w_msg(HYPRE_ERROR_GENERIC," Error! Incompatible matrix dimensions!\n");
471 
472       HYPRE_ANNOTATE_FUNC_END;
473       return NULL;
474    }
475 
476    /* if C=A*B is square globally and locally, then C_diag should be square also */
477    if ( num_rows_diag_A == num_cols_diag_B && nrows_A == ncols_B )
478    {
479       allsquare = 1;
480    }
481 
482    /* Set rownnz of A */
483    if (num_rownnz_diag_A != num_rows_diag_A &&
484        num_rownnz_offd_A != num_rows_offd_A )
485    {
486       hypre_MergeOrderedArrays(num_rownnz_diag_A, A_diag_ir,
487                                num_rownnz_offd_A, A_offd_ir,
488                                &num_rownnz_A, &rownnz_A);
489    }
490    else
491    {
492       num_rownnz_A = hypre_max(num_rows_diag_A, num_rows_offd_A);
493    }
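   /* Illustrative example (assumed semantics of hypre_MergeOrderedArrays):
    * if A_diag has nonzero rows {0, 2, 5} and A_offd has nonzero rows
    * {2, 3}, the merged result is rownnz_A = {0, 2, 3, 5} with
    * num_rownnz_A = 4. */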
494 
495    /*-----------------------------------------------------------------------
496     *  Extract B_ext, i.e. portion of B that is stored on neighbor procs
497     *  and needed locally for matrix matrix product
498     *-----------------------------------------------------------------------*/
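   /* Illustrative sketch (not library code): row c of Bs_ext
    * (0 <= c < num_cols_offd_A) is the row of B whose global row index
    * equals the c-th off-processor column of A, stored with global (big)
    * column indices:
    *
    *    for (j = Bs_ext_i[c]; j < Bs_ext_i[c+1]; j++)
    *    {
    *       HYPRE_BigInt  global_col = Bs_ext_j[j];
    *       HYPRE_Complex value      = Bs_ext_data[j];
    *    }
    */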
499 
500    hypre_MPI_Comm_size(comm, &num_procs);
501 
502 #ifdef HYPRE_PROFILE
503    hypre_profile_times[HYPRE_TIMER_ID_RENUMBER_COLIDX] -= hypre_MPI_Wtime();
504 #endif
505 
506    if (num_procs > 1)
507    {
508       /*---------------------------------------------------------------------
509        * If there exists no CommPkg for A, a CommPkg is generated using
510        * equally load balanced partitionings within
511        * hypre_ParCSRMatrixExtractBExt
512        *--------------------------------------------------------------------*/
513       Bs_ext      = hypre_ParCSRMatrixExtractBExt(B,A,1);
514       Bs_ext_data = hypre_CSRMatrixData(Bs_ext);
515       Bs_ext_i    = hypre_CSRMatrixI(Bs_ext);
516       Bs_ext_j    = hypre_CSRMatrixBigJ(Bs_ext);
517    }
518    B_ext_diag_i = hypre_CTAlloc(HYPRE_Int, num_cols_offd_A+1, HYPRE_MEMORY_HOST);
519    B_ext_offd_i = hypre_CTAlloc(HYPRE_Int, num_cols_offd_A+1, HYPRE_MEMORY_HOST);
520    B_ext_diag_size = 0;
521    B_ext_offd_size = 0;
522    last_col_diag_B = first_col_diag_B + (HYPRE_BigInt) num_cols_diag_B - 1;
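   /* Illustrative split (sketch): a global column index bigj of Bs_ext is a
    * diag column of B iff it lies in B's owned column range, e.g.
    *
    *    if (bigj >= first_col_diag_B && bigj <= last_col_diag_B)
    *    {
    *       local_diag_col = (HYPRE_Int) (bigj - first_col_diag_B);
    *    }
    *    else
    *    {
    *       // bigj is collected for col_map_offd_C and later remapped to a
    *       // local offd column index of C
    *    }
    */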
523 
524 #ifdef HYPRE_CONCURRENT_HOPSCOTCH
525    hypre_UnorderedBigIntSet set;
526 
527 #pragma omp parallel
528    {
529       HYPRE_Int size, rest, ii;
530       HYPRE_Int ns, ne;
531       HYPRE_Int i1, i, j;
532       HYPRE_Int my_offd_size, my_diag_size;
533       HYPRE_Int cnt_offd, cnt_diag;
534       HYPRE_Int num_threads = hypre_NumActiveThreads();
535 
536       size = num_cols_offd_A/num_threads;
537       rest = num_cols_offd_A - size*num_threads;
538       ii = hypre_GetThreadNum();
539       if (ii < rest)
540       {
541          ns = ii*size+ii;
542          ne = (ii+1)*size+ii+1;
543       }
544       else
545       {
546          ns = ii*size+rest;
547          ne = (ii+1)*size+rest;
548       }
549 
550       my_diag_size = 0;
551       my_offd_size = 0;
552       for (i = ns; i < ne; i++)
553       {
554          B_ext_diag_i[i] = my_diag_size;
555          B_ext_offd_i[i] = my_offd_size;
556          for (j = Bs_ext_i[i]; j < Bs_ext_i[i+1]; j++)
557          {
558             if (Bs_ext_j[j] < first_col_diag_B ||
559                 Bs_ext_j[j] > last_col_diag_B)
560             {
561                my_offd_size++;
562             }
563             else
564             {
565                my_diag_size++;
566             }
567          }
568       }
569       my_diag_array[ii] = my_diag_size;
570       my_offd_array[ii] = my_offd_size;
571 
572 #pragma omp barrier
573 
574       if (ii)
575       {
576          my_diag_size = my_diag_array[0];
577          my_offd_size = my_offd_array[0];
578          for (i1 = 1; i1 < ii; i1++)
579          {
580             my_diag_size += my_diag_array[i1];
581             my_offd_size += my_offd_array[i1];
582          }
583 
584          for (i1 = ns; i1 < ne; i1++)
585          {
586             B_ext_diag_i[i1] += my_diag_size;
587             B_ext_offd_i[i1] += my_offd_size;
588          }
589       }
590       else
591       {
592          B_ext_diag_size = 0;
593          B_ext_offd_size = 0;
594          for (i1 = 0; i1 < num_threads; i1++)
595          {
596             B_ext_diag_size += my_diag_array[i1];
597             B_ext_offd_size += my_offd_array[i1];
598          }
599          B_ext_diag_i[num_cols_offd_A] = B_ext_diag_size;
600          B_ext_offd_i[num_cols_offd_A] = B_ext_offd_size;
601 
602          if (B_ext_diag_size)
603          {
604             B_ext_diag_j = hypre_CTAlloc(HYPRE_Int,  B_ext_diag_size, HYPRE_MEMORY_HOST);
605             B_ext_diag_data = hypre_CTAlloc(HYPRE_Complex, B_ext_diag_size, HYPRE_MEMORY_HOST);
606          }
607          if (B_ext_offd_size)
608          {
609             B_ext_offd_j = hypre_CTAlloc(HYPRE_Int, B_ext_offd_size, HYPRE_MEMORY_HOST);
610             B_big_offd_j = hypre_CTAlloc(HYPRE_BigInt, B_ext_offd_size, HYPRE_MEMORY_HOST);
611             B_ext_offd_data = hypre_CTAlloc(HYPRE_Complex, B_ext_offd_size, HYPRE_MEMORY_HOST);
612          }
613          hypre_UnorderedBigIntSetCreate(&set, B_ext_offd_size + num_cols_offd_B, 16*hypre_NumThreads());
614       }
615 
616 
617 #pragma omp barrier
618 
619       cnt_offd = B_ext_offd_i[ns];
620       cnt_diag = B_ext_diag_i[ns];
621       for (i = ns; i < ne; i++)
622       {
623          for (j = Bs_ext_i[i]; j < Bs_ext_i[i+1]; j++)
624          {
625             if (Bs_ext_j[j] < first_col_diag_B ||
626                 Bs_ext_j[j] > last_col_diag_B)
627             {
628                hypre_UnorderedBigIntSetPut(&set, Bs_ext_j[j]);
629                B_big_offd_j[cnt_offd] = Bs_ext_j[j];
630                //Bs_ext_j[cnt_offd] = Bs_ext_j[j];
631                B_ext_offd_data[cnt_offd++] = Bs_ext_data[j];
632             }
633             else
634             {
635                B_ext_diag_j[cnt_diag] = (HYPRE_Int)(Bs_ext_j[j] - first_col_diag_B);
636                B_ext_diag_data[cnt_diag++] = Bs_ext_data[j];
637             }
638          }
639       }
640 
641       HYPRE_Int i_begin, i_end;
642       hypre_GetSimpleThreadPartition(&i_begin, &i_end, num_cols_offd_B);
643       for (i = i_begin; i < i_end; i++)
644       {
645          hypre_UnorderedBigIntSetPut(&set, col_map_offd_B[i]);
646       }
647    } /* omp parallel */
648 
649    col_map_offd_C = hypre_UnorderedBigIntSetCopyToArray(&set, &num_cols_offd_C);
650    hypre_UnorderedBigIntSetDestroy(&set);
651    hypre_UnorderedBigIntMap col_map_offd_C_inverse;
652    hypre_big_sort_and_create_inverse_map(col_map_offd_C,
653                                          num_cols_offd_C,
654                                          &col_map_offd_C,
655                                          &col_map_offd_C_inverse);
656 
657    HYPRE_Int i, j;
658 #pragma omp parallel for private(j) HYPRE_SMP_SCHEDULE
659    for (i = 0; i < num_cols_offd_A; i++)
660    {
661       for (j = B_ext_offd_i[i]; j < B_ext_offd_i[i+1]; j++)
662       {
663          //B_ext_offd_j[j] = hypre_UnorderedIntMapGet(&col_map_offd_C_inverse, B_ext_offd_j[j]);
664          B_ext_offd_j[j] = hypre_UnorderedBigIntMapGet(&col_map_offd_C_inverse, B_big_offd_j[j]);
665       }
666    }
667 
668    if (num_cols_offd_C)
669    {
670       hypre_UnorderedBigIntMapDestroy(&col_map_offd_C_inverse);
671    }
672 
673    hypre_TFree(my_diag_array, HYPRE_MEMORY_HOST);
674    hypre_TFree(my_offd_array, HYPRE_MEMORY_HOST);
675 
676    if (num_cols_offd_B)
677    {
678       HYPRE_Int i;
679       map_B_to_C = hypre_CTAlloc(HYPRE_Int, num_cols_offd_B, HYPRE_MEMORY_HOST);
680 
681 #pragma omp parallel private(i)
682       {
683          HYPRE_Int i_begin, i_end;
684          hypre_GetSimpleThreadPartition(&i_begin, &i_end, num_cols_offd_C);
685 
686          HYPRE_Int cnt;
687          if (i_end > i_begin)
688          {
689             cnt = hypre_BigLowerBound(col_map_offd_B,
690                                       col_map_offd_B + (HYPRE_BigInt)num_cols_offd_B,
691                                       col_map_offd_C[i_begin]) - col_map_offd_B;
692          }
693 
694          for (i = i_begin; i < i_end && cnt < num_cols_offd_B; i++)
695          {
696             if (col_map_offd_C[i] == col_map_offd_B[cnt])
697             {
698                map_B_to_C[cnt++] = i;
699             }
700          }
701       }
702    }
703    if (num_procs > 1)
704    {
705       hypre_CSRMatrixDestroy(Bs_ext);
706       Bs_ext = NULL;
707    }
708 
709 #else /* !HYPRE_CONCURRENT_HOPSCOTCH */
710 
711    HYPRE_BigInt *temp;
712 #ifdef HYPRE_USING_OPENMP
713 #pragma omp parallel
714 #endif
715    {
716       HYPRE_Int size, rest, ii;
717       HYPRE_Int ns, ne;
718       HYPRE_Int i1, i, j;
719       HYPRE_Int my_offd_size, my_diag_size;
720       HYPRE_Int cnt_offd, cnt_diag;
721 
722       HYPRE_Int num_threads = hypre_NumActiveThreads();
723 
724       size = num_cols_offd_A/num_threads;
725       rest = num_cols_offd_A - size*num_threads;
726       ii = hypre_GetThreadNum();
727       if (ii < rest)
728       {
729          ns = ii*size+ii;
730          ne = (ii+1)*size+ii+1;
731       }
732       else
733       {
734          ns = ii*size+rest;
735          ne = (ii+1)*size+rest;
736       }
737 
738       my_diag_size = 0;
739       my_offd_size = 0;
740       for (i = ns; i < ne; i++)
741       {
742          B_ext_diag_i[i] = my_diag_size;
743          B_ext_offd_i[i] = my_offd_size;
744          for (j = Bs_ext_i[i]; j < Bs_ext_i[i+1]; j++)
745          {
746             if (Bs_ext_j[j] < first_col_diag_B ||
747                 Bs_ext_j[j] > last_col_diag_B)
748             {
749                my_offd_size++;
750             }
751             else
752             {
753                my_diag_size++;
754             }
755          }
756       }
757       my_diag_array[ii] = my_diag_size;
758       my_offd_array[ii] = my_offd_size;
759 
760 #ifdef HYPRE_USING_OPENMP
761 #pragma omp barrier
762 #endif
763 
764       if (ii)
765       {
766          my_diag_size = my_diag_array[0];
767          my_offd_size = my_offd_array[0];
768          for (i1 = 1; i1 < ii; i1++)
769          {
770             my_diag_size += my_diag_array[i1];
771             my_offd_size += my_offd_array[i1];
772          }
773 
774          for (i1 = ns; i1 < ne; i1++)
775          {
776             B_ext_diag_i[i1] += my_diag_size;
777             B_ext_offd_i[i1] += my_offd_size;
778          }
779       }
780       else
781       {
782          B_ext_diag_size = 0;
783          B_ext_offd_size = 0;
784          for (i1 = 0; i1 < num_threads; i1++)
785          {
786             B_ext_diag_size += my_diag_array[i1];
787             B_ext_offd_size += my_offd_array[i1];
788          }
789          B_ext_diag_i[num_cols_offd_A] = B_ext_diag_size;
790          B_ext_offd_i[num_cols_offd_A] = B_ext_offd_size;
791 
792          if (B_ext_diag_size)
793          {
794             B_ext_diag_j = hypre_CTAlloc(HYPRE_Int, B_ext_diag_size, HYPRE_MEMORY_HOST);
795             B_ext_diag_data = hypre_CTAlloc(HYPRE_Complex, B_ext_diag_size, HYPRE_MEMORY_HOST);
796          }
797 
798          if (B_ext_offd_size)
799          {
800             B_ext_offd_j = hypre_CTAlloc(HYPRE_Int, B_ext_offd_size, HYPRE_MEMORY_HOST);
801             B_big_offd_j = hypre_CTAlloc(HYPRE_BigInt, B_ext_offd_size, HYPRE_MEMORY_HOST);
802             B_ext_offd_data = hypre_CTAlloc(HYPRE_Complex, B_ext_offd_size, HYPRE_MEMORY_HOST);
803          }
804 
805          if (B_ext_offd_size || num_cols_offd_B)
806          {
807             temp = hypre_CTAlloc(HYPRE_BigInt, B_ext_offd_size+num_cols_offd_B, HYPRE_MEMORY_HOST);
808          }
809       }
810 
811 #ifdef HYPRE_USING_OPENMP
812 #pragma omp barrier
813 #endif
814 
815       cnt_offd = B_ext_offd_i[ns];
816       cnt_diag = B_ext_diag_i[ns];
817       for (i = ns; i < ne; i++)
818       {
819          for (j = Bs_ext_i[i]; j < Bs_ext_i[i+1]; j++)
820          {
821             if (Bs_ext_j[j] < first_col_diag_B ||
822                 Bs_ext_j[j] > last_col_diag_B)
823             {
824                temp[cnt_offd] = Bs_ext_j[j];
825                B_big_offd_j[cnt_offd] = Bs_ext_j[j];
826                //Bs_ext_j[cnt_offd] = Bs_ext_j[j];
827                B_ext_offd_data[cnt_offd++] = Bs_ext_data[j];
828             }
829             else
830             {
831                B_ext_diag_j[cnt_diag] = (HYPRE_Int)(Bs_ext_j[j] - first_col_diag_B);
832                B_ext_diag_data[cnt_diag++] = Bs_ext_data[j];
833             }
834          }
835       }
836 
837 #ifdef HYPRE_USING_OPENMP
838 #pragma omp barrier
839 #endif
840 
841       if (ii == 0)
842       {
843          HYPRE_Int cnt;
844 
845          if (num_procs > 1)
846          {
847             hypre_CSRMatrixDestroy(Bs_ext);
848             Bs_ext = NULL;
849          }
850 
851          cnt = 0;
852          if (B_ext_offd_size || num_cols_offd_B)
853          {
854             cnt = B_ext_offd_size;
855             for (i = 0; i < num_cols_offd_B; i++)
856             {
857                temp[cnt++] = col_map_offd_B[i];
858             }
859 
860             if (cnt)
861             {
862                HYPRE_BigInt value;
863 
864                hypre_BigQsort0(temp, 0, cnt-1);
865                num_cols_offd_C = 1;
866                value = temp[0];
867                for (i = 1; i < cnt; i++)
868                {
869                   if (temp[i] > value)
870                   {
871                      value = temp[i];
872                      temp[num_cols_offd_C++] = value;
873                   }
874                }
875             }
876 
877             if (num_cols_offd_C)
878             {
879                col_map_offd_C = hypre_CTAlloc(HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_HOST);
880             }
881 
882             for (i = 0; i < num_cols_offd_C; i++)
883             {
884                col_map_offd_C[i] = temp[i];
885             }
886 
887             hypre_TFree(temp, HYPRE_MEMORY_HOST);
888          }
889       }
890 
891 
892 #ifdef HYPRE_USING_OPENMP
893 #pragma omp barrier
894 #endif
895 
896       for (i = ns; i < ne; i++)
897       {
898          for (j = B_ext_offd_i[i]; j < B_ext_offd_i[i+1]; j++)
899          {
900             B_ext_offd_j[j] = hypre_BigBinarySearch(col_map_offd_C, B_big_offd_j[j],
901             //B_ext_offd_j[j] = hypre_BigBinarySearch(col_map_offd_C, Bs_ext_j[j],
902                                                  num_cols_offd_C);
903          }
904       }
905 
906    } /* end parallel region */
907    hypre_TFree(B_big_offd_j, HYPRE_MEMORY_HOST);
908 
909    hypre_TFree(my_diag_array, HYPRE_MEMORY_HOST);
910    hypre_TFree(my_offd_array, HYPRE_MEMORY_HOST);
911 
912    if (num_cols_offd_B)
913    {
914       HYPRE_Int i, cnt;
915       map_B_to_C = hypre_CTAlloc(HYPRE_Int, num_cols_offd_B, HYPRE_MEMORY_HOST);
916 
917       cnt = 0;
918       for (i = 0; i < num_cols_offd_C; i++)
919       {
920          if (col_map_offd_C[i] == col_map_offd_B[cnt])
921          {
922             map_B_to_C[cnt++] = i;
923             if (cnt == num_cols_offd_B) break;
924          }
925       }
926    }
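   /* Illustrative example (assumed values): if col_map_offd_B = {3, 17} and
    * col_map_offd_C = {3, 5, 17, 20}, the loop above yields
    * map_B_to_C = {0, 2}, i.e. the k-th offd column of B is the
    * map_B_to_C[k]-th offd column of C. */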
927 
928 #endif /* !HYPRE_CONCURRENT_HOPSCOTCH */
929 
930 #ifdef HYPRE_PROFILE
931    hypre_profile_times[HYPRE_TIMER_ID_RENUMBER_COLIDX] += hypre_MPI_Wtime();
932 #endif
933 
934    HYPRE_ANNOTATE_REGION_BEGIN("%s", "First pass");
935    hypre_ParMatmul_RowSizes(memory_location_C, &C_diag_i, &C_offd_i,
936                             rownnz_A, A_diag_i, A_diag_j,
937                             A_offd_i, A_offd_j,
938                             B_diag_i, B_diag_j,
939                             B_offd_i, B_offd_j,
940                             B_ext_diag_i, B_ext_diag_j,
941                             B_ext_offd_i, B_ext_offd_j, map_B_to_C,
942                             &C_diag_size, &C_offd_size,
943                             num_rownnz_A, num_rows_diag_A, num_cols_offd_A,
944                             allsquare, num_cols_diag_B, num_cols_offd_B,
945                             num_cols_offd_C);
946    HYPRE_ANNOTATE_REGION_END("%s", "First pass");
947 
948    /*-----------------------------------------------------------------------
949     *  Allocate C_diag_data and C_diag_j arrays.
950     *  Allocate C_offd_data and C_offd_j arrays.
951     *-----------------------------------------------------------------------*/
952 
953    last_col_diag_B = first_col_diag_B + (HYPRE_BigInt)num_cols_diag_B - 1;
954    C_diag_data = hypre_CTAlloc(HYPRE_Complex, C_diag_size, memory_location_C);
955    C_diag_j    = hypre_CTAlloc(HYPRE_Int, C_diag_size, memory_location_C);
956    if (C_offd_size)
957    {
958       C_offd_data = hypre_CTAlloc(HYPRE_Complex, C_offd_size, memory_location_C);
959       C_offd_j    = hypre_CTAlloc(HYPRE_Int, C_offd_size, memory_location_C);
960    }
961 
962    /*-----------------------------------------------------------------------
963     *  Second Pass: Fill in C_diag_data and C_diag_j.
964     *  Second Pass: Fill in C_offd_data and C_offd_j.
965     *-----------------------------------------------------------------------*/
966 
967    HYPRE_ANNOTATE_REGION_BEGIN("%s", "Second pass");
968 
969 #ifdef HYPRE_USING_OPENMP
970 #pragma omp parallel
971 #endif
972    {
973       HYPRE_Int     *B_marker = NULL;
974       HYPRE_Int      ns, ne, size, rest, ii;
975       HYPRE_Int      i1, ii1, i2, i3, jj2, jj3;
976       HYPRE_Int      jj_row_begin_diag, jj_count_diag;
977       HYPRE_Int      jj_row_begin_offd, jj_count_offd;
978       HYPRE_Int      num_threads;
979       HYPRE_Complex  a_entry; /*, a_b_product;*/
980 
981       num_threads = hypre_NumActiveThreads();
982       size = num_rownnz_A/num_threads;
983       rest = num_rownnz_A - size*num_threads;
984 
985       ii = hypre_GetThreadNum();
986       if (ii < rest)
987       {
988          ns = ii*size+ii;
989          ne = (ii+1)*size+ii+1;
990       }
991       else
992       {
993          ns = ii*size+rest;
994          ne = (ii+1)*size+rest;
995       }
996       jj_count_diag = C_diag_i[rownnz_A ? rownnz_A[ns] : ns];
997       jj_count_offd = C_offd_i[rownnz_A ? rownnz_A[ns] : ns];
998 
999       if (num_cols_diag_B || num_cols_offd_C)
1000       {
1001          B_marker = hypre_CTAlloc(HYPRE_Int, num_cols_diag_B + num_cols_offd_C,
1002                                   HYPRE_MEMORY_HOST);
1003          for (i1 = 0; i1 < num_cols_diag_B + num_cols_offd_C; i1++)
1004          {
1005             B_marker[i1] = -1;
1006          }
1007       }
1008 
1009       /*-----------------------------------------------------------------------
       *  Loop over rows of A (the nonzero rows when rownnz_A is used).
1011        *-----------------------------------------------------------------------*/
1012       for (i1 = ns; i1 < ne; i1++)
1013       {
1014          jj_row_begin_diag = jj_count_diag;
1015          jj_row_begin_offd = jj_count_offd;
1016          if (rownnz_A)
1017          {
1018             ii1 = rownnz_A[i1];
1019          }
1020          else
1021          {
1022             ii1 = i1;
1023 
1024             /*--------------------------------------------------------------------
1025              *  Create diagonal entry, C_{i1,i1}
1026              *--------------------------------------------------------------------*/
1027 
1028             if (allsquare)
1029             {
1030                B_marker[i1] = jj_count_diag;
1031                C_diag_data[jj_count_diag] = zero;
1032                C_diag_j[jj_count_diag] = i1;
1033                jj_count_diag++;
1034             }
1035          }
1036 
1037          /*-----------------------------------------------------------------
          *  Loop over entries in row ii1 of A_offd.
1039           *-----------------------------------------------------------------*/
1040 
1041          if (num_cols_offd_A)
1042          {
1043             for (jj2 = A_offd_i[ii1]; jj2 < A_offd_i[ii1+1]; jj2++)
1044             {
1045                i2 = A_offd_j[jj2];
1046                a_entry = A_offd_data[jj2];
1047 
1048                /*-----------------------------------------------------------
1049                 *  Loop over entries in row i2 of B_ext.
1050                 *-----------------------------------------------------------*/
1051 
1052                for (jj3 = B_ext_offd_i[i2]; jj3 < B_ext_offd_i[i2+1]; jj3++)
1053                {
1054                   i3 = num_cols_diag_B+B_ext_offd_j[jj3];
1055 
1056                   /*--------------------------------------------------------
1057                    *  Check B_marker to see that C_{ii1,i3} has not already
1058                    *  been accounted for. If it has not, create a new entry.
1059                    *  If it has, add new contribution.
1060                    *--------------------------------------------------------*/
1061 
1062                   if (B_marker[i3] < jj_row_begin_offd)
1063                   {
1064                      B_marker[i3] = jj_count_offd;
1065                      C_offd_data[jj_count_offd] = a_entry*B_ext_offd_data[jj3];
1066                      C_offd_j[jj_count_offd] = i3-num_cols_diag_B;
1067                      jj_count_offd++;
1068                   }
1069                   else
1070                   {
1071                      C_offd_data[B_marker[i3]] += a_entry*B_ext_offd_data[jj3];
1072                   }
1073                }
1074                for (jj3 = B_ext_diag_i[i2]; jj3 < B_ext_diag_i[i2+1]; jj3++)
1075                {
1076                   i3 = B_ext_diag_j[jj3];
1077                   if (B_marker[i3] < jj_row_begin_diag)
1078                   {
1079                      B_marker[i3] = jj_count_diag;
1080                      C_diag_data[jj_count_diag] = a_entry*B_ext_diag_data[jj3];
1081                      C_diag_j[jj_count_diag] = i3;
1082                      jj_count_diag++;
1083                   }
1084                   else
1085                   {
1086                      C_diag_data[B_marker[i3]] += a_entry*B_ext_diag_data[jj3];
1087                   }
1088                }
1089             }
1090          }
1091 
1092          /*-----------------------------------------------------------------
1093           *  Loop over entries in row ii1 of A_diag.
1094           *-----------------------------------------------------------------*/
1095 
1096          for (jj2 = A_diag_i[ii1]; jj2 < A_diag_i[ii1+1]; jj2++)
1097          {
1098             i2 = A_diag_j[jj2];
1099             a_entry = A_diag_data[jj2];
1100 
1101             /*-----------------------------------------------------------
1102              *  Loop over entries in row i2 of B_diag.
1103              *-----------------------------------------------------------*/
1104 
1105             for (jj3 = B_diag_i[i2]; jj3 < B_diag_i[i2+1]; jj3++)
1106             {
1107                i3 = B_diag_j[jj3];
1108 
1109                /*--------------------------------------------------------
1110                 *  Check B_marker to see that C_{ii1,i3} has not already
1111                 *  been accounted for. If it has not, create a new entry.
1112                 *  If it has, add new contribution.
1113                 *--------------------------------------------------------*/
1114 
1115                if (B_marker[i3] < jj_row_begin_diag)
1116                {
1117                   B_marker[i3] = jj_count_diag;
1118                   C_diag_data[jj_count_diag] = a_entry*B_diag_data[jj3];
1119                   C_diag_j[jj_count_diag] = i3;
1120                   jj_count_diag++;
1121                }
1122                else
1123                {
1124                   C_diag_data[B_marker[i3]] += a_entry*B_diag_data[jj3];
1125                }
1126             }
1127             if (num_cols_offd_B)
1128             {
1129                for (jj3 = B_offd_i[i2]; jj3 < B_offd_i[i2+1]; jj3++)
1130                {
1131                   i3 = num_cols_diag_B+map_B_to_C[B_offd_j[jj3]];
1132 
1133                   /*--------------------------------------------------------
1134                    *  Check B_marker to see that C_{ii1,i3} has not already
1135                    *  been accounted for. If it has not, create a new entry.
1136                    *  If it has, add new contribution.
1137                    *--------------------------------------------------------*/
1138 
1139                   if (B_marker[i3] < jj_row_begin_offd)
1140                   {
1141                      B_marker[i3] = jj_count_offd;
1142                      C_offd_data[jj_count_offd] = a_entry*B_offd_data[jj3];
1143                      C_offd_j[jj_count_offd] = i3-num_cols_diag_B;
1144                      jj_count_offd++;
1145                   }
1146                   else
1147                   {
1148                      C_offd_data[B_marker[i3]] += a_entry*B_offd_data[jj3];
1149                   }
1150                }
1151             }
1152          }
1153       }
1154 
1155       hypre_TFree(B_marker, HYPRE_MEMORY_HOST);
1156    } /*end parallel region */
1157    HYPRE_ANNOTATE_REGION_END("%s", "Second pass");
1158 
1159    C = hypre_ParCSRMatrixCreate(comm, nrows_A, ncols_B, row_starts_A,
1160                                 col_starts_B, num_cols_offd_C,
1161                                 C_diag_size, C_offd_size);
1162 
1163    C_diag = hypre_ParCSRMatrixDiag(C);
1164    hypre_CSRMatrixData(C_diag) = C_diag_data;
1165    hypre_CSRMatrixI(C_diag)    = C_diag_i;
1166    hypre_CSRMatrixJ(C_diag)    = C_diag_j;
1167    hypre_CSRMatrixSetRownnz(C_diag);
1168 
1169    C_offd = hypre_ParCSRMatrixOffd(C);
1170    hypre_CSRMatrixI(C_offd)  = C_offd_i;
1171    hypre_ParCSRMatrixOffd(C) = C_offd;
1172    if (num_cols_offd_C)
1173    {
1174       hypre_CSRMatrixData(C_offd)     = C_offd_data;
1175       hypre_CSRMatrixJ(C_offd)        = C_offd_j;
1176       hypre_ParCSRMatrixColMapOffd(C) = col_map_offd_C;
1177    }
1178    hypre_CSRMatrixSetRownnz(C_offd);
1179 
1180    hypre_CSRMatrixMemoryLocation(C_diag) = memory_location_C;
1181    hypre_CSRMatrixMemoryLocation(C_offd) = memory_location_C;
1182 
1183    /*-----------------------------------------------------------------------
1184     *  Free various arrays
1185     *-----------------------------------------------------------------------*/
1186    hypre_TFree(B_ext_diag_i, HYPRE_MEMORY_HOST);
1187    if (B_ext_diag_size)
1188    {
1189       hypre_TFree(B_ext_diag_j, HYPRE_MEMORY_HOST);
1190       hypre_TFree(B_ext_diag_data, HYPRE_MEMORY_HOST);
1191    }
1192    hypre_TFree(B_ext_offd_i, HYPRE_MEMORY_HOST);
1193    if (B_ext_offd_size)
1194    {
1195       hypre_TFree(B_ext_offd_j, HYPRE_MEMORY_HOST);
1196       hypre_TFree(B_ext_offd_data, HYPRE_MEMORY_HOST);
1197    }
1198    if (num_cols_offd_B)
1199    {
1200       hypre_TFree(map_B_to_C, HYPRE_MEMORY_HOST);
1201    }
1202    hypre_TFree(rownnz_A, HYPRE_MEMORY_HOST);
1203 
1204 #ifdef HYPRE_PROFILE
1205    hypre_profile_times[HYPRE_TIMER_ID_MATMUL] += hypre_MPI_Wtime();
1206 #endif
1207 
1208    HYPRE_ANNOTATE_FUNC_END;
1209 
1210    return C;
1211 }
1212 
/* The following function was formerly part of hypre_ParCSRMatrixExtractBExt,
   but was factored out so that it can also be used by the corresponding
   function for Boolean matrices.

   JSP: to allow communication overlapping, it returns comm_handle_idx and
   comm_handle_data. Before the extracted B_ext arrays are accessed, both
   handles should be destroyed (including the send_data contained in the
   comm_handle).
1220 */
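/* Illustrative overlap pattern (sketch, not verbatim library usage):

      hypre_ParCSRCommHandle *comm_handle_idx, *comm_handle_data;
      hypre_ParCSRMatrixExtractBExt_Arrays_Overlap(&B_ext_i, &B_ext_j,
                                                   &B_ext_data, ...,
                                                   &comm_handle_idx,
                                                   &comm_handle_data, ...);
      ... local work that does not need B_ext ...
      hypre_ParCSRCommHandleDestroy(comm_handle_idx);
      hypre_ParCSRCommHandleDestroy(comm_handle_data);
      ... free the send buffers referenced by the handles, as noted above ...
      ... now B_ext_i / B_ext_j / B_ext_data may be used ...
*/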
1221 
void hypre_ParCSRMatrixExtractBExt_Arrays_Overlap(
1223    HYPRE_Int ** pB_ext_i,
1224    HYPRE_BigInt ** pB_ext_j,
1225    HYPRE_Complex ** pB_ext_data,
1226    HYPRE_BigInt ** pB_ext_row_map,
1227    HYPRE_Int * num_nonzeros,
1228    HYPRE_Int data,
1229    HYPRE_Int find_row_map,
1230    MPI_Comm comm,
1231    hypre_ParCSRCommPkg * comm_pkg,
1232    HYPRE_Int num_cols_B,
1233    HYPRE_Int num_recvs,
1234    HYPRE_Int num_sends,
1235    HYPRE_BigInt first_col_diag,
1236    HYPRE_BigInt * row_starts,
1237    HYPRE_Int * recv_vec_starts,
1238    HYPRE_Int * send_map_starts,
1239    HYPRE_Int * send_map_elmts,
1240    HYPRE_Int * diag_i,
1241    HYPRE_Int * diag_j,
1242    HYPRE_Int * offd_i,
1243    HYPRE_Int * offd_j,
1244    HYPRE_BigInt * col_map_offd,
1245    HYPRE_Real * diag_data,
1246    HYPRE_Real * offd_data,
1247    hypre_ParCSRCommHandle **comm_handle_idx,
1248    hypre_ParCSRCommHandle **comm_handle_data,
1249    HYPRE_Int *CF_marker, HYPRE_Int *CF_marker_offd,
1250    HYPRE_Int skip_fine, /* 1 if only coarse points are needed */
1251    HYPRE_Int skip_same_sign /* 1 if only points that have the same sign are needed */
   // extended-based long-range interpolation: skip_fine = 1, skip_same_sign = 0 for the S matrix; skip_fine = 1, skip_same_sign = 1 for the A matrix
   // other interpolation: skip_fine = 0, skip_same_sign = 0
1254    )
1255 {
1256    hypre_ParCSRCommHandle *comm_handle, *row_map_comm_handle = NULL;
1257    hypre_ParCSRCommPkg *tmp_comm_pkg;
1258    HYPRE_Int *B_int_i;
1259    HYPRE_BigInt *B_int_j;
1260    HYPRE_Int *B_ext_i;
1261    HYPRE_BigInt * B_ext_j;
1262    HYPRE_Complex * B_ext_data;
1263    HYPRE_Complex * B_int_data;
1264    HYPRE_BigInt * B_int_row_map;
1265    HYPRE_BigInt * B_ext_row_map;
1266    HYPRE_Int num_procs, my_id;
1267    HYPRE_Int *jdata_recv_vec_starts;
1268    HYPRE_Int *jdata_send_map_starts;
1269 
1270    HYPRE_Int i, j, k;
1271    HYPRE_Int start_index;
1272    /*HYPRE_Int jrow;*/
1273    HYPRE_Int num_rows_B_ext;
1274    HYPRE_Int *prefix_sum_workspace;
1275 
1276    hypre_MPI_Comm_size(comm,&num_procs);
1277    hypre_MPI_Comm_rank(comm,&my_id);
1278 
1279    HYPRE_BigInt first_row_index = row_starts[0];
1280 
1281    num_rows_B_ext = recv_vec_starts[num_recvs];
1282    if ( num_rows_B_ext < 0 ) {  /* no B_ext, no communication */
1283       *pB_ext_i = NULL;
1284       *pB_ext_j = NULL;
1285       if ( data ) *pB_ext_data = NULL;
1286       if ( find_row_map ) *pB_ext_row_map = NULL;
1287       *num_nonzeros = 0;
1288       return;
1289    };
1290    B_int_i = hypre_CTAlloc(HYPRE_Int,  send_map_starts[num_sends]+1, HYPRE_MEMORY_HOST);
1291    B_ext_i = hypre_CTAlloc(HYPRE_Int,  num_rows_B_ext+1, HYPRE_MEMORY_HOST);
1292    *pB_ext_i = B_ext_i;
1293    if ( find_row_map ) {
1294       B_int_row_map = hypre_CTAlloc( HYPRE_BigInt,  send_map_starts[num_sends]+1 , HYPRE_MEMORY_HOST);
1295       B_ext_row_map = hypre_CTAlloc( HYPRE_BigInt,  num_rows_B_ext+1 , HYPRE_MEMORY_HOST);
1296       *pB_ext_row_map = B_ext_row_map;
1297    };
1298 
   /*--------------------------------------------------------------------------
    * Generate B_int_i by adding the numbers of row elements of diag and offd
    * for the corresponding rows. B_int_i[j+1] contains the number of elements
    * of row send_map_elmts[j].
    *--------------------------------------------------------------------------*/
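   /* Illustrative example (assumed values): if send_map_elmts[j] = 5 and
    * send_map_elmts[j+1] = 9, and local rows 5 and 9 hold 2 and 3 stored
    * entries (diag + offd together), then B_int_i[j+1] = 2 and
    * B_int_i[j+2] = 3; the per-send totals are prefix-summed into
    * jdata_send_map_starts below. */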
1304 
1305    jdata_send_map_starts = hypre_CTAlloc(HYPRE_Int,  num_sends+1, HYPRE_MEMORY_HOST);
1306    jdata_recv_vec_starts = hypre_CTAlloc(HYPRE_Int,  num_recvs+1, HYPRE_MEMORY_HOST);
1307    jdata_send_map_starts[0] = B_int_i[0] = 0;
1308 
1309    /*HYPRE_Int prefix_sum_workspace[(hypre_NumThreads() + 1)*num_sends];*/
1310    prefix_sum_workspace = hypre_TAlloc(HYPRE_Int,  (hypre_NumThreads() + 1)*num_sends, HYPRE_MEMORY_HOST);
1311 
1312 #ifdef HYPRE_USING_OPENMP
1313 #pragma omp parallel private(i,j,k)
1314 #endif
1315    {
1316       /*HYPRE_Int counts[num_sends];*/
1317       HYPRE_Int *counts;
1318       counts = hypre_TAlloc(HYPRE_Int,  num_sends, HYPRE_MEMORY_HOST);
1319       for (i=0; i < num_sends; i++)
1320       {
1321          HYPRE_Int j_begin, j_end;
1322          hypre_GetSimpleThreadPartition(&j_begin, &j_end, send_map_starts[i + 1] - send_map_starts[i]);
1323          j_begin += send_map_starts[i];
1324          j_end += send_map_starts[i];
1325 
1326          HYPRE_Int count = 0;
1327          if (skip_fine && skip_same_sign)
1328          {
1329             for (j = j_begin; j < j_end; j++)
1330             {
1331                HYPRE_Int jrow = send_map_elmts[j];
1332                HYPRE_Int len = 0;
1333 
1334                if (diag_data[diag_i[jrow]] >= 0)
1335                {
1336                   for (k = diag_i[jrow] + 1; k < diag_i[jrow + 1]; k++)
1337                   {
1338                      if (diag_data[k] < 0 && CF_marker[diag_j[k]] >= 0) len++;
1339                   }
1340                   for (k = offd_i[jrow]; k < offd_i[jrow + 1]; k++)
1341                   {
1342                      if (offd_data[k] < 0) len++;
1343                   }
1344                }
1345                else
1346                {
1347                   for (k = diag_i[jrow] + 1; k < diag_i[jrow + 1]; k++)
1348                   {
1349                      if (diag_data[k] > 0 && CF_marker[diag_j[k]] >= 0) len++;
1350                   }
1351                   for (k = offd_i[jrow]; k < offd_i[jrow + 1]; k++)
1352                   {
1353                      if (offd_data[k] > 0) len++;
1354                   }
1355                }
1356 
1357                B_int_i[j + 1] = len;
1358                count += len;
1359             }
1360          }
1361          else if (skip_fine)
1362          {
1363             for (j = j_begin; j < j_end; j++)
1364             {
1365                HYPRE_Int jrow = send_map_elmts[j];
1366                HYPRE_Int len = 0;
1367 
1368                for (k = diag_i[jrow]; k < diag_i[jrow + 1]; k++)
1369                {
1370                   if (CF_marker[diag_j[k]] >= 0) len++;
1371                }
1372                for (k = offd_i[jrow]; k < offd_i[jrow + 1]; k++)
1373                {
1374                   if (CF_marker_offd[offd_j[k]] >= 0) len++;
1375                }
1376 
1377                B_int_i[j + 1] = len;
1378                count += len;
1379             }
1380          }
1381          else
1382          {
1383             for (j = j_begin; j < j_end; j++)
1384             {
1385                HYPRE_Int jrow = send_map_elmts[j];
1386                HYPRE_Int len = diag_i[jrow + 1] - diag_i[jrow];
1387                len += offd_i[jrow + 1] - offd_i[jrow];
1388                B_int_i[j + 1] = len;
1389                count += len;
1390             }
1391          }
1392 
1393          if (find_row_map)
1394          {
1395             for (j = j_begin; j < j_end; j++)
1396             {
1397                HYPRE_Int jrow = send_map_elmts[j];
1398                B_int_row_map[j] = (HYPRE_BigInt)jrow + first_row_index;
1399             }
1400          }
1401 
1402          counts[i] = count;
1403       }
1404 
1405       hypre_prefix_sum_multiple(counts, jdata_send_map_starts + 1, num_sends, prefix_sum_workspace);
1406 
1407 #ifdef HYPRE_USING_OPENMP
1408 #pragma omp master
1409 #endif
1410       {
1411          for (i = 1; i < num_sends; i++)
1412          {
1413             jdata_send_map_starts[i + 1] += jdata_send_map_starts[i];
1414          }
1415 
1416          /*--------------------------------------------------------------------------
1417           * initialize communication
1418           *--------------------------------------------------------------------------*/
1419 
1420          comm_handle = hypre_ParCSRCommHandleCreate(11,comm_pkg,
1421                                                     &B_int_i[1],&(B_ext_i[1]) );
1422          if ( find_row_map )
1423          {
1424             /* scatter/gather B_int row numbers to form array of B_ext row numbers */
1425             row_map_comm_handle = hypre_ParCSRCommHandleCreate
1426                (21,comm_pkg, B_int_row_map, B_ext_row_map );
1427          }
1428 
1429          B_int_j = hypre_TAlloc(HYPRE_BigInt,  jdata_send_map_starts[num_sends], HYPRE_MEMORY_HOST);
1430          if (data) B_int_data = hypre_TAlloc(HYPRE_Complex,  jdata_send_map_starts[num_sends], HYPRE_MEMORY_HOST);
1431       }
1432 #ifdef HYPRE_USING_OPENMP
1433 #pragma omp barrier
1434 #endif
1435 
1436       for (i = 0; i < num_sends; i++)
1437       {
1438          HYPRE_Int j_begin, j_end;
1439          hypre_GetSimpleThreadPartition(&j_begin, &j_end, send_map_starts[i + 1] - send_map_starts[i]);
1440          j_begin += send_map_starts[i];
1441          j_end += send_map_starts[i];
1442 
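         /* starting write position of this thread within send block i of B_int_j/B_int_data */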
1443          HYPRE_Int count = counts[i] + jdata_send_map_starts[i];
1444 
1445          if (data)
1446          {
1447             if (skip_same_sign && skip_fine)
1448             {
1449                for (j = j_begin; j < j_end; j++)
1450                {
1451                   HYPRE_Int jrow = send_map_elmts[j];
1452                   /*HYPRE_Int count_begin = count;*/
1453 
1454                   if (diag_data[diag_i[jrow]] >= 0)
1455                   {
1456                      for (k = diag_i[jrow] + 1; k < diag_i[jrow + 1]; k++)
1457                      {
1458                         if (diag_data[k] < 0 && CF_marker[diag_j[k]] >= 0)
1459                         {
1460                            B_int_j[count] = (HYPRE_BigInt)diag_j[k]+first_col_diag;
1461                            B_int_data[count] = diag_data[k];
1462                            count++;
1463                         }
1464                      }
1465                      for (k = offd_i[jrow]; k < offd_i[jrow + 1]; k++)
1466                      {
1467                         HYPRE_Int c = offd_j[k];
1468                         HYPRE_BigInt c_global = col_map_offd[c];
1469                         if (offd_data[k] < 0)
1470                         {
1471                            B_int_j[count] = c_global;
1472                            B_int_data[count] = offd_data[k];
1473                            count++;
1474                         }
1475                      }
1476                   }
1477                   else
1478                   {
1479                      for (k = diag_i[jrow] + 1; k < diag_i[jrow + 1]; k++)
1480                      {
1481                         if (diag_data[k] > 0 && CF_marker[diag_j[k]] >= 0)
1482                         {
1483                            B_int_j[count] = (HYPRE_BigInt)diag_j[k]+first_col_diag;
1484                            B_int_data[count] = diag_data[k];
1485                            count++;
1486                         }
1487                      }
1488                      for (k = offd_i[jrow]; k < offd_i[jrow + 1]; k++)
1489                      {
1490                         HYPRE_Int c = offd_j[k];
1491                         HYPRE_BigInt c_global = col_map_offd[c];
1492                         if (offd_data[k] > 0)
1493                         {
1494                            B_int_j[count] = c_global;
1495                            B_int_data[count] = offd_data[k];
1496                            count++;
1497                         }
1498                      }
1499                   }
1500                }
1501             }
1502             else
1503             {
1504                for (j = j_begin; j < j_end; ++j) {
1505                   HYPRE_Int jrow = send_map_elmts[j];
1506                   for (k = diag_i[jrow]; k < diag_i[jrow+1]; k++)
1507                   {
1508                      B_int_j[count] = (HYPRE_BigInt)diag_j[k]+first_col_diag;
1509                      B_int_data[count] = diag_data[k];
1510                      count++;
1511                   }
1512                   for (k = offd_i[jrow]; k < offd_i[jrow+1]; k++)
1513                   {
1514                      B_int_j[count] = col_map_offd[offd_j[k]];
1515                      B_int_data[count] = offd_data[k];
1516                      count++;
1517                   }
1518                }
1519             }
1520          } // data
1521          else
1522          {
1523             if (skip_fine)
1524             {
1525                for (j = j_begin; j < j_end; j++)
1526                {
1527                   HYPRE_Int jrow = send_map_elmts[j];
1528                   for (k = diag_i[jrow]; k < diag_i[jrow + 1]; k++)
1529                   {
1530                      if (CF_marker[diag_j[k]] >= 0)
1531                      {
1532                         B_int_j[count] = (HYPRE_BigInt)diag_j[k] + first_col_diag;
1533                         count++;
1534                      }
1535                   }
1536                   for (k = offd_i[jrow]; k < offd_i[jrow + 1]; k++)
1537                   {
1538                      if (CF_marker_offd[offd_j[k]] >= 0)
1539                      {
1540                         B_int_j[count] = col_map_offd[offd_j[k]];
1541                         count++;
1542                      }
1543                   }
1544                }
1545             }
1546             else
1547             {
1548                for (j = j_begin; j < j_end; ++j) {
1549                   HYPRE_Int jrow = send_map_elmts[j];
1550                   for (k = diag_i[jrow]; k < diag_i[jrow+1]; k++)
1551                   {
1552                      B_int_j[count] = (HYPRE_BigInt)diag_j[k]+first_col_diag;
1553                      count++;
1554                   }
1555                   for (k = offd_i[jrow]; k < offd_i[jrow+1]; k++)
1556                   {
1557                      B_int_j[count] = col_map_offd[offd_j[k]];
1558                      count++;
1559                   }
1560                }
1561             }
1562          } // !data
1563       } /* for each send target */
1564       hypre_TFree(counts, HYPRE_MEMORY_HOST);
1565    } /* omp parallel. JSP: this takes most of the time in this function */
1566    hypre_TFree(prefix_sum_workspace, HYPRE_MEMORY_HOST);
1567 
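   /* Build a temporary communication package that reuses the send/recv process
    * lists of comm_pkg but carries the row-length based offsets
    * (jdata_send_map_starts and, below, jdata_recv_vec_starts), so that the
    * column indices and data of the selected entries can be exchanged. */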
1568    tmp_comm_pkg = hypre_CTAlloc(hypre_ParCSRCommPkg, 1, HYPRE_MEMORY_HOST);
1569    hypre_ParCSRCommPkgComm(tmp_comm_pkg) = comm;
1570    hypre_ParCSRCommPkgNumSends(tmp_comm_pkg) = num_sends;
1571    hypre_ParCSRCommPkgNumRecvs(tmp_comm_pkg) = num_recvs;
1572    hypre_ParCSRCommPkgSendProcs(tmp_comm_pkg) =
1573       hypre_ParCSRCommPkgSendProcs(comm_pkg);
1574    hypre_ParCSRCommPkgRecvProcs(tmp_comm_pkg) =
1575       hypre_ParCSRCommPkgRecvProcs(comm_pkg);
1576    hypre_ParCSRCommPkgSendMapStarts(tmp_comm_pkg) = jdata_send_map_starts;
1577 
1578    hypre_ParCSRCommHandleDestroy(comm_handle);
1579    comm_handle = NULL;
1580 
1581    /*--------------------------------------------------------------------------
1582     * After the communication exchange, B_ext_i[j+1] contains the number of
1583     * elements in row j of B_ext. Turn these counts into row offsets and
1584     * compute *num_nonzeros for B_ext.
1585     *--------------------------------------------------------------------------*/
1586 
1587    for (i = 0; i < num_recvs; i++)
1588    {
1589       for (j = recv_vec_starts[i]; j < recv_vec_starts[i+1]; j++)
1590       {
1591          B_ext_i[j+1] += B_ext_i[j];
1592       }
1593    }
1594 
1595    *num_nonzeros = B_ext_i[num_rows_B_ext];
1596 
1597    *pB_ext_j = hypre_TAlloc(HYPRE_BigInt,  *num_nonzeros, HYPRE_MEMORY_HOST);
1598    B_ext_j = *pB_ext_j;
1599    if (data)
1600    {
1601       *pB_ext_data = hypre_TAlloc(HYPRE_Complex,  *num_nonzeros, HYPRE_MEMORY_HOST);
1602       B_ext_data = *pB_ext_data;
1603    }
1604 
1605    for (i = 0; i < num_recvs; i++)
1606    {
1607       start_index = B_ext_i[recv_vec_starts[i]];
1608       *num_nonzeros = B_ext_i[recv_vec_starts[i+1]]-start_index;
1609       jdata_recv_vec_starts[i+1] = B_ext_i[recv_vec_starts[i+1]];
1610    }
1611 
1612    hypre_ParCSRCommPkgRecvVecStarts(tmp_comm_pkg) = jdata_recv_vec_starts;
1613 
1614    *comm_handle_idx = hypre_ParCSRCommHandleCreate(21,tmp_comm_pkg,B_int_j,B_ext_j);
1615    if (data)
1616    {
1617       *comm_handle_data = hypre_ParCSRCommHandleCreate(1,tmp_comm_pkg,B_int_data,
1618                                                        B_ext_data);
1619    }
1620 
1621    if (row_map_comm_handle)
1622    {
1623       hypre_ParCSRCommHandleDestroy(row_map_comm_handle);
1624       row_map_comm_handle = NULL;
1625    }
1626 
1627    hypre_TFree(jdata_send_map_starts, HYPRE_MEMORY_HOST);
1628    hypre_TFree(jdata_recv_vec_starts, HYPRE_MEMORY_HOST);
1629    hypre_TFree(tmp_comm_pkg, HYPRE_MEMORY_HOST);
1630    hypre_TFree(B_int_i, HYPRE_MEMORY_HOST);
1631    if ( find_row_map ) hypre_TFree(B_int_row_map, HYPRE_MEMORY_HOST);
1632 
1633    /* end generic part */
1634 }
1635 
1636 void hypre_ParCSRMatrixExtractBExt_Arrays(
1637    HYPRE_Int ** pB_ext_i,
1638    HYPRE_BigInt ** pB_ext_j,
1639    HYPRE_Complex ** pB_ext_data,
1640    HYPRE_BigInt ** pB_ext_row_map,
1641    HYPRE_Int * num_nonzeros,
1642    HYPRE_Int data,
1643    HYPRE_Int find_row_map,
1644    MPI_Comm comm,
1645    hypre_ParCSRCommPkg * comm_pkg,
1646    HYPRE_Int num_cols_B,
1647    HYPRE_Int num_recvs,
1648    HYPRE_Int num_sends,
1649    HYPRE_BigInt first_col_diag,
1650    HYPRE_BigInt * row_starts,
1651    HYPRE_Int * recv_vec_starts,
1652    HYPRE_Int * send_map_starts,
1653    HYPRE_Int * send_map_elmts,
1654    HYPRE_Int * diag_i,
1655    HYPRE_Int * diag_j,
1656    HYPRE_Int * offd_i,
1657    HYPRE_Int * offd_j,
1658    HYPRE_BigInt * col_map_offd,
1659    HYPRE_Real * diag_data,
1660    HYPRE_Real * offd_data
1661    )
1662 {
1663    hypre_ParCSRCommHandle *comm_handle_idx, *comm_handle_data;
1664 
1665    hypre_ParCSRMatrixExtractBExt_Arrays_Overlap(
1666       pB_ext_i, pB_ext_j, pB_ext_data, pB_ext_row_map, num_nonzeros,
1667       data, find_row_map, comm, comm_pkg, num_cols_B, num_recvs, num_sends,
1668       first_col_diag, row_starts, recv_vec_starts, send_map_starts, send_map_elmts,
1669       diag_i, diag_j, offd_i, offd_j, col_map_offd, diag_data, offd_data,
1670       &comm_handle_idx, &comm_handle_data,
1671       NULL, NULL,
1672       0, 0);
1673 
1674    HYPRE_Int *send_idx = (HYPRE_Int *)comm_handle_idx->send_data;
1675    hypre_ParCSRCommHandleDestroy(comm_handle_idx);
1676    hypre_TFree(send_idx, HYPRE_MEMORY_HOST);
1677 
1678    if (data)
1679    {
1680       HYPRE_Real *send_data = (HYPRE_Real *)comm_handle_data->send_data;
1681       hypre_ParCSRCommHandleDestroy(comm_handle_data);
1682       hypre_TFree(send_data, HYPRE_MEMORY_HOST);
1683    }
1684 }
1685 
1686 /*--------------------------------------------------------------------------
1687  * hypre_ParCSRMatrixExtractBExt : extracts the rows of B that are owned by
1688  * other processors and are needed for the local multiplication with A.
1689  * The rows are returned as a CSRMatrix.
1690  *--------------------------------------------------------------------------*/
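/*
 * Minimal usage sketch:
 *
 *    hypre_CSRMatrix *B_ext = hypre_ParCSRMatrixExtractBExt(B, A, 1);
 *    ... use the rows of B_ext (column indices are global) ...
 *    hypre_CSRMatrixDestroy(B_ext);
 *
 * The _Overlap variants below hand the outstanding communication handles back
 * to the caller, who must destroy comm_handle_idx/comm_handle_data (which
 * completes the exchange) and free their send buffers, as is done in
 * hypre_ParCSRMatrixExtractBExt_Arrays above.
 */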
1691 
1692 hypre_CSRMatrix *
1693 hypre_ParCSRMatrixExtractBExt_Overlap( hypre_ParCSRMatrix *B,
1694                                        hypre_ParCSRMatrix *A,
1695                                        HYPRE_Int data,
1696                                        hypre_ParCSRCommHandle **comm_handle_idx,
1697                                        hypre_ParCSRCommHandle **comm_handle_data,
1698                                        HYPRE_Int *CF_marker, HYPRE_Int *CF_marker_offd,
1699                                        HYPRE_Int skip_fine, HYPRE_Int skip_same_sign )
1700 {
1701    MPI_Comm  comm = hypre_ParCSRMatrixComm(B);
1702    HYPRE_BigInt first_col_diag = hypre_ParCSRMatrixFirstColDiag(B);
1703    /*HYPRE_Int first_row_index = hypre_ParCSRMatrixFirstRowIndex(B);*/
1704    HYPRE_BigInt *col_map_offd = hypre_ParCSRMatrixColMapOffd(B);
1705 
1706    hypre_ParCSRCommPkg *comm_pkg = hypre_ParCSRMatrixCommPkg(A);
1707    HYPRE_Int num_recvs;
1708    HYPRE_Int *recv_vec_starts;
1709    HYPRE_Int num_sends;
1710    HYPRE_Int *send_map_starts;
1711    HYPRE_Int *send_map_elmts;
1712 
1713    hypre_CSRMatrix *diag = hypre_ParCSRMatrixDiag(B);
1714 
1715    HYPRE_Int *diag_i = hypre_CSRMatrixI(diag);
1716    HYPRE_Int *diag_j = hypre_CSRMatrixJ(diag);
1717    HYPRE_Real *diag_data = hypre_CSRMatrixData(diag);
1718 
1719    hypre_CSRMatrix *offd = hypre_ParCSRMatrixOffd(B);
1720 
1721    HYPRE_Int *offd_i = hypre_CSRMatrixI(offd);
1722    HYPRE_Int *offd_j = hypre_CSRMatrixJ(offd);
1723    HYPRE_Real *offd_data = hypre_CSRMatrixData(offd);
1724 
1725    HYPRE_Int num_cols_B, num_nonzeros;
1726    HYPRE_Int num_rows_B_ext;
1727 
1728    hypre_CSRMatrix *B_ext;
1729 
1730    HYPRE_Int *B_ext_i;
1731    HYPRE_BigInt *B_ext_j;
1732    HYPRE_Complex *B_ext_data;
1733    HYPRE_BigInt *idummy;
1734 
1735    /*---------------------------------------------------------------------
1736     * If no CommPkg exists for A, generate one using equally load-balanced
1737     * partitionings
1738     *--------------------------------------------------------------------*/
1739 
1740    if (!hypre_ParCSRMatrixCommPkg(A))
1741    {
1742       hypre_MatvecCommPkgCreate(A);
1743    }
1744 
1745    comm_pkg = hypre_ParCSRMatrixCommPkg(A);
1746    num_recvs = hypre_ParCSRCommPkgNumRecvs(comm_pkg);
1747    recv_vec_starts = hypre_ParCSRCommPkgRecvVecStarts(comm_pkg);
1748    num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
1749    send_map_starts = hypre_ParCSRCommPkgSendMapStarts(comm_pkg);
1750    send_map_elmts = hypre_ParCSRCommPkgSendMapElmts(comm_pkg);
1751 
1752    num_cols_B = hypre_ParCSRMatrixGlobalNumCols(B);
1753    num_rows_B_ext = recv_vec_starts[num_recvs];
1754 
1755    hypre_ParCSRMatrixExtractBExt_Arrays_Overlap
1756       ( &B_ext_i, &B_ext_j, &B_ext_data, &idummy,
1757         &num_nonzeros,
1758         data, 0, comm, comm_pkg,
1759         num_cols_B, num_recvs, num_sends,
1760         first_col_diag, B->row_starts,
1761         recv_vec_starts, send_map_starts, send_map_elmts,
1762         diag_i, diag_j, offd_i, offd_j, col_map_offd,
1763         diag_data, offd_data,
1764         comm_handle_idx, comm_handle_data,
1765         CF_marker, CF_marker_offd,
1766         skip_fine, skip_same_sign
1767          );
1768 
1769    B_ext = hypre_CSRMatrixCreate(num_rows_B_ext,num_cols_B,num_nonzeros);
1770    hypre_CSRMatrixMemoryLocation(B_ext) = HYPRE_MEMORY_HOST;
1771    hypre_CSRMatrixI(B_ext) = B_ext_i;
1772    hypre_CSRMatrixBigJ(B_ext) = B_ext_j;
1773    if (data) hypre_CSRMatrixData(B_ext) = B_ext_data;
1774 
1775    return B_ext;
1776 }
1777 
1778 hypre_CSRMatrix *
1779 hypre_ParCSRMatrixExtractBExt( hypre_ParCSRMatrix *B,
1780                                hypre_ParCSRMatrix *A,
1781                                HYPRE_Int want_data )
1782 {
1783 #if 0
1784    hypre_ParCSRCommHandle *comm_handle_idx, *comm_handle_data;
1785 
1786    hypre_CSRMatrix *B_ext = hypre_ParCSRMatrixExtractBExt_Overlap(B, A, want_data, &comm_handle_idx, &comm_handle_data, NULL, NULL, 0, 0);
1787 
1788    HYPRE_Int *send_idx = (HYPRE_Int *)comm_handle_idx->send_data;
1789    hypre_ParCSRCommHandleDestroy(comm_handle_idx);
1790    hypre_TFree(send_idx, HYPRE_MEMORY_HOST);
1791 
1792    if (want_data)
1793    {
1794       HYPRE_Real *send_data = (HYPRE_Real *)comm_handle_data->send_data;
1795       hypre_ParCSRCommHandleDestroy(comm_handle_data);
1796       hypre_TFree(send_data, HYPRE_MEMORY_HOST);
1797    }
1798 #else
1799    hypre_assert( hypre_CSRMatrixMemoryLocation(hypre_ParCSRMatrixDiag(B)) ==
1800                  hypre_CSRMatrixMemoryLocation(hypre_ParCSRMatrixOffd(B)) );
1801 
1802    hypre_CSRMatrix *B_ext;
1803    void            *request;
1804 
1805    if (!hypre_ParCSRMatrixCommPkg(A))
1806    {
1807       hypre_MatvecCommPkgCreate(A);
1808    }
1809 
1810    hypre_ParcsrGetExternalRowsInit(B,
1811                                    hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(A)),
1812                                    hypre_ParCSRMatrixColMapOffd(A),
1813                                    hypre_ParCSRMatrixCommPkg(A),
1814                                    want_data,
1815                                    &request);
1816 
1817    B_ext = hypre_ParcsrGetExternalRowsWait(request);
1818 #endif
1819 
1820    return B_ext;
1821 }
1822 
1823 /*--------------------------------------------------------------------------
1824  * hypre_ParCSRMatrixTransposeHost
1825  *--------------------------------------------------------------------------*/
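/*
 * Outline: the diag block is transposed locally with hypre_CSRMatrixTranspose.
 * The offd block is also transposed locally; its rows correspond to rows of
 * A^T owned by other processors, so they are shipped to their owners by
 * reversing the communication pattern of A's CommPkg and then compressed into
 * the offd block and col_map_offd of A^T.
 */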
1826 
1827 HYPRE_Int
1828 hypre_ParCSRMatrixTransposeHost( hypre_ParCSRMatrix  *A,
1829                                  hypre_ParCSRMatrix **AT_ptr,
1830                                  HYPRE_Int            data )
1831 {
1832    MPI_Comm                 comm     = hypre_ParCSRMatrixComm(A);
1833    hypre_ParCSRCommPkg     *comm_pkg = hypre_ParCSRMatrixCommPkg(A);
1834    hypre_CSRMatrix         *A_diag   = hypre_ParCSRMatrixDiag(A);
1835    hypre_CSRMatrix         *A_offd   = hypre_ParCSRMatrixOffd(A);
1836    HYPRE_Int                num_cols = hypre_ParCSRMatrixNumCols(A);
1837    HYPRE_BigInt             first_row_index = hypre_ParCSRMatrixFirstRowIndex(A);
1838    HYPRE_BigInt            *row_starts = hypre_ParCSRMatrixRowStarts(A);
1839    HYPRE_BigInt            *col_starts = hypre_ParCSRMatrixColStarts(A);
1840 
1841    HYPRE_Int                num_cols_offd = hypre_CSRMatrixNumCols(A_offd);
1842    HYPRE_Int                num_sends, num_recvs, num_cols_offd_AT;
1843    HYPRE_Int                i, j, k, index, counter, j_row;
1844    HYPRE_BigInt             value;
1845 
1846    hypre_ParCSRMatrix      *AT;
1847    hypre_CSRMatrix         *AT_diag;
1848    hypre_CSRMatrix         *AT_offd;
1849    hypre_CSRMatrix         *AT_tmp;
1850 
1851    HYPRE_BigInt             first_row_index_AT, first_col_diag_AT;
1852    HYPRE_Int                local_num_rows_AT, local_num_cols_AT;
1853 
1854    HYPRE_Int               *AT_tmp_i;
1855    HYPRE_Int               *AT_tmp_j;
1856    HYPRE_BigInt            *AT_big_j = NULL;
1857    HYPRE_Complex           *AT_tmp_data;
1858 
1859    HYPRE_Int               *AT_buf_i;
1860    HYPRE_BigInt            *AT_buf_j;
1861    HYPRE_Complex           *AT_buf_data;
1862 
1863    HYPRE_Int               *AT_offd_i;
1864    HYPRE_Int               *AT_offd_j;
1865    HYPRE_Complex           *AT_offd_data;
1866    HYPRE_BigInt            *col_map_offd_AT;
1867    HYPRE_BigInt             row_starts_AT[2];
1868    HYPRE_BigInt             col_starts_AT[2];
1869 
1870    HYPRE_Int                num_procs, my_id;
1871 
1872    HYPRE_Int               *recv_procs, *send_procs;
1873    HYPRE_Int               *recv_vec_starts;
1874    HYPRE_Int               *send_map_starts;
1875    HYPRE_Int               *send_map_elmts;
1876    HYPRE_Int               *tmp_recv_vec_starts;
1877    HYPRE_Int               *tmp_send_map_starts;
1878    hypre_ParCSRCommPkg     *tmp_comm_pkg;
1879    hypre_ParCSRCommHandle  *comm_handle;
1880 
1881    hypre_MPI_Comm_size(comm,&num_procs);
1882    hypre_MPI_Comm_rank(comm,&my_id);
1883 
1884    num_cols_offd_AT = 0;
1885    counter = 0;
1886    AT_offd_j = NULL;
1887    AT_offd_data = NULL;
1888    col_map_offd_AT = NULL;
1889 
1890    HYPRE_MemoryLocation memory_location = hypre_ParCSRMatrixMemoryLocation(A);
1891 
1892    /*---------------------------------------------------------------------
1893     * If no CommPkg exists for A, generate one using equally load-balanced
1894     * partitionings
1895     *--------------------------------------------------------------------*/
1896 
1897    if (!comm_pkg)
1898    {
1899       hypre_MatvecCommPkgCreate(A);
1900       comm_pkg = hypre_ParCSRMatrixCommPkg(A);
1901    }
1902 
1903    if (num_procs > 1)
1904    {
1905       hypre_CSRMatrixTranspose (A_offd, &AT_tmp, data);
1906 
1907       AT_tmp_i = hypre_CSRMatrixI(AT_tmp);
1908       AT_tmp_j = hypre_CSRMatrixJ(AT_tmp);
1909       if (data)
1910       {
1911          AT_tmp_data = hypre_CSRMatrixData(AT_tmp);
1912       }
1913 
1914       num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
1915       num_recvs = hypre_ParCSRCommPkgNumRecvs(comm_pkg);
1916       recv_procs = hypre_ParCSRCommPkgRecvProcs(comm_pkg);
1917       send_procs = hypre_ParCSRCommPkgSendProcs(comm_pkg);
1918       recv_vec_starts = hypre_ParCSRCommPkgRecvVecStarts(comm_pkg);
1919       send_map_starts = hypre_ParCSRCommPkgSendMapStarts(comm_pkg);
1920       send_map_elmts = hypre_ParCSRCommPkgSendMapElmts(comm_pkg);
1921 
1922       AT_buf_i = hypre_CTAlloc(HYPRE_Int, send_map_starts[num_sends], HYPRE_MEMORY_HOST);
1923       if (AT_tmp_i[num_cols_offd])
1924       {
1925          AT_big_j = hypre_CTAlloc(HYPRE_BigInt, AT_tmp_i[num_cols_offd], HYPRE_MEMORY_HOST);
1926       }
1927 
1928       for (i = 0; i < AT_tmp_i[num_cols_offd]; i++)
1929       {
1930          //AT_tmp_j[i] += first_row_index;
1931          AT_big_j[i] = (HYPRE_BigInt)AT_tmp_j[i]+first_row_index;
1932       }
1933 
1934       for (i = 0; i < num_cols_offd; i++)
1935       {
1936          AT_tmp_i[i] = AT_tmp_i[i+1]-AT_tmp_i[i];
1937       }
1938 
1939       comm_handle = hypre_ParCSRCommHandleCreate(12, comm_pkg, AT_tmp_i, AT_buf_i);
1940    }
1941 
1942    hypre_CSRMatrixTranspose(A_diag, &AT_diag, data);
1943 
1944    AT_offd_i = hypre_CTAlloc(HYPRE_Int, num_cols+1, memory_location);
1945 
1946    if (num_procs > 1)
1947    {
1948       hypre_ParCSRCommHandleDestroy(comm_handle);
1949       comm_handle = NULL;
1950 
1951       tmp_send_map_starts = hypre_CTAlloc(HYPRE_Int, num_sends+1, HYPRE_MEMORY_HOST);
1952       tmp_recv_vec_starts = hypre_CTAlloc(HYPRE_Int, num_recvs+1, HYPRE_MEMORY_HOST);
1953 
1954       tmp_send_map_starts[0] = send_map_starts[0];
1955       for (i = 0; i < num_sends; i++)
1956       {
1957          tmp_send_map_starts[i+1] = tmp_send_map_starts[i];
1958          for (j = send_map_starts[i]; j < send_map_starts[i+1]; j++)
1959          {
1960             tmp_send_map_starts[i+1] += AT_buf_i[j];
1961             AT_offd_i[send_map_elmts[j]+1] += AT_buf_i[j];
1962          }
1963       }
1964       for (i = 0; i < num_cols; i++)
1965       {
1966          AT_offd_i[i+1] += AT_offd_i[i];
1967       }
1968 
1969       tmp_recv_vec_starts[0] = recv_vec_starts[0];
1970       for (i = 0; i < num_recvs; i++)
1971       {
1972          tmp_recv_vec_starts[i+1] = tmp_recv_vec_starts[i];
1973          for (j = recv_vec_starts[i]; j < recv_vec_starts[i+1]; j++)
1974          {
1975             tmp_recv_vec_starts[i+1] +=  AT_tmp_i[j];
1976          }
1977       }
1978 
1979       tmp_comm_pkg = hypre_CTAlloc(hypre_ParCSRCommPkg, 1, HYPRE_MEMORY_HOST);
1980       hypre_ParCSRCommPkgComm(tmp_comm_pkg) = comm;
1981       hypre_ParCSRCommPkgNumSends(tmp_comm_pkg) = num_sends;
1982       hypre_ParCSRCommPkgNumRecvs(tmp_comm_pkg) = num_recvs;
1983       hypre_ParCSRCommPkgRecvProcs(tmp_comm_pkg) = recv_procs;
1984       hypre_ParCSRCommPkgSendProcs(tmp_comm_pkg) = send_procs;
1985       hypre_ParCSRCommPkgRecvVecStarts(tmp_comm_pkg) = tmp_recv_vec_starts;
1986       hypre_ParCSRCommPkgSendMapStarts(tmp_comm_pkg) = tmp_send_map_starts;
1987 
1988       AT_buf_j = hypre_CTAlloc(HYPRE_BigInt, tmp_send_map_starts[num_sends], HYPRE_MEMORY_HOST);
1989       comm_handle = hypre_ParCSRCommHandleCreate(22, tmp_comm_pkg, AT_big_j,
1990                                                  AT_buf_j);
1991       hypre_ParCSRCommHandleDestroy(comm_handle);
1992       comm_handle = NULL;
1993       hypre_TFree(AT_big_j, HYPRE_MEMORY_HOST);
1994 
1995       if (data)
1996       {
1997          AT_buf_data = hypre_CTAlloc(HYPRE_Complex, tmp_send_map_starts[num_sends], HYPRE_MEMORY_HOST);
1998          comm_handle = hypre_ParCSRCommHandleCreate(2,tmp_comm_pkg,AT_tmp_data,
1999                                                     AT_buf_data);
2000          hypre_ParCSRCommHandleDestroy(comm_handle);
2001          comm_handle = NULL;
2002       }
2003 
2004       hypre_TFree(tmp_recv_vec_starts, HYPRE_MEMORY_HOST);
2005       hypre_TFree(tmp_send_map_starts, HYPRE_MEMORY_HOST);
2006       hypre_TFree(tmp_comm_pkg, HYPRE_MEMORY_HOST);
2007       hypre_CSRMatrixDestroy(AT_tmp);
2008 
2009       if (AT_offd_i[num_cols])
2010       {
2011          AT_offd_j = hypre_CTAlloc(HYPRE_Int, AT_offd_i[num_cols], memory_location);
2012          AT_big_j = hypre_CTAlloc(HYPRE_BigInt, AT_offd_i[num_cols], HYPRE_MEMORY_HOST);
2013          if (data)
2014          {
2015             AT_offd_data = hypre_CTAlloc(HYPRE_Complex,  AT_offd_i[num_cols], memory_location);
2016          }
2017       }
2018       else
2019       {
2020          AT_offd_j = NULL;
2021          AT_offd_data = NULL;
2022       }
2023 
2024       counter = 0;
2025       for (i = 0; i < num_sends; i++)
2026       {
2027          for (j = send_map_starts[i]; j < send_map_starts[i+1]; j++)
2028          {
2029             j_row = send_map_elmts[j];
2030             index = AT_offd_i[j_row];
2031             for (k = 0; k < AT_buf_i[j]; k++)
2032             {
2033                if (data)
2034                {
2035                   AT_offd_data[index] = AT_buf_data[counter];
2036                }
2037                AT_big_j[index++] = AT_buf_j[counter++];
2038             }
2039             AT_offd_i[j_row] = index;
2040          }
2041       }
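      /* the fill above advanced AT_offd_i[j_row] to the end of each row;
       * shift the entries back by one position to restore the row pointers */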
2042       for (i = num_cols; i > 0; i--)
2043       {
2044          AT_offd_i[i] = AT_offd_i[i-1];
2045       }
2046       AT_offd_i[0] = 0;
2047 
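      /* sort the received global column indices and compress out duplicates
       * to form the distinct off-processor column list of A^T */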
2048       if (counter)
2049       {
2050          hypre_BigQsort0(AT_buf_j,0,counter-1);
2051          num_cols_offd_AT = 1;
2052          value = AT_buf_j[0];
2053          for (i = 1; i < counter; i++)
2054          {
2055             if (value < AT_buf_j[i])
2056             {
2057                AT_buf_j[num_cols_offd_AT++] = AT_buf_j[i];
2058                value = AT_buf_j[i];
2059             }
2060          }
2061       }
2062 
2063       if (num_cols_offd_AT)
2064       {
2065          col_map_offd_AT = hypre_CTAlloc(HYPRE_BigInt, num_cols_offd_AT, HYPRE_MEMORY_HOST);
2066       }
2067       else
2068       {
2069          col_map_offd_AT = NULL;
2070       }
2071 
2072       for (i = 0; i < num_cols_offd_AT; i++)
2073       {
2074          col_map_offd_AT[i] = AT_buf_j[i];
2075       }
2076       hypre_TFree(AT_buf_i, HYPRE_MEMORY_HOST);
2077       hypre_TFree(AT_buf_j, HYPRE_MEMORY_HOST);
2078       if (data)
2079       {
2080          hypre_TFree(AT_buf_data, HYPRE_MEMORY_HOST);
2081       }
2082 
2083       for (i = 0; i < counter; i++)
2084       {
2085          AT_offd_j[i] = hypre_BigBinarySearch(col_map_offd_AT,AT_big_j[i],
2086                                               num_cols_offd_AT);
2087       }
2088       hypre_TFree(AT_big_j, HYPRE_MEMORY_HOST);
2089    }
2090 
2091    AT_offd = hypre_CSRMatrixCreate(num_cols, num_cols_offd_AT, counter);
2092    hypre_CSRMatrixMemoryLocation(AT_offd) = memory_location;
2093    hypre_CSRMatrixI(AT_offd) = AT_offd_i;
2094    hypre_CSRMatrixJ(AT_offd) = AT_offd_j;
2095    hypre_CSRMatrixData(AT_offd) = AT_offd_data;
2096 
2097    for (i = 0; i < 2; i++)
2098    {
2099       row_starts_AT[i] = col_starts[i];
2100       col_starts_AT[i] = row_starts[i];
2101    }
2102 
2103    first_row_index_AT = row_starts_AT[0];
2104    first_col_diag_AT  = col_starts_AT[0];
2105 
2106    local_num_rows_AT = (HYPRE_Int)(row_starts_AT[1]-first_row_index_AT );
2107    local_num_cols_AT = (HYPRE_Int)(col_starts_AT[1]-first_col_diag_AT);
2108 
2109    AT = hypre_CTAlloc(hypre_ParCSRMatrix, 1, HYPRE_MEMORY_HOST);
2110    hypre_ParCSRMatrixComm(AT) = comm;
2111    hypre_ParCSRMatrixDiag(AT) = AT_diag;
2112    hypre_ParCSRMatrixOffd(AT) = AT_offd;
2113    hypre_ParCSRMatrixGlobalNumRows(AT) = hypre_ParCSRMatrixGlobalNumCols(A);
2114    hypre_ParCSRMatrixGlobalNumCols(AT) = hypre_ParCSRMatrixGlobalNumRows(A);
2115    hypre_ParCSRMatrixRowStarts(AT)[0]  = row_starts_AT[0];
2116    hypre_ParCSRMatrixRowStarts(AT)[1]  = row_starts_AT[1];
2117    hypre_ParCSRMatrixColStarts(AT)[0]  = col_starts_AT[0];
2118    hypre_ParCSRMatrixColStarts(AT)[1]  = col_starts_AT[1];
2119    hypre_ParCSRMatrixColMapOffd(AT)    = col_map_offd_AT;
2120 
2121    hypre_ParCSRMatrixFirstRowIndex(AT) = first_row_index_AT;
2122    hypre_ParCSRMatrixFirstColDiag(AT)  = first_col_diag_AT;
2123 
2124    hypre_ParCSRMatrixLastRowIndex(AT) = first_row_index_AT + local_num_rows_AT - 1;
2125    hypre_ParCSRMatrixLastColDiag(AT)  = first_col_diag_AT + local_num_cols_AT - 1;
2126 
2127    hypre_ParCSRMatrixOwnsData(AT) = 1;
2128    hypre_ParCSRMatrixCommPkg(AT)  = NULL;
2129    hypre_ParCSRMatrixCommPkgT(AT) = NULL;
2130 
2131    hypre_ParCSRMatrixRowindices(AT) = NULL;
2132    hypre_ParCSRMatrixRowvalues(AT)  = NULL;
2133    hypre_ParCSRMatrixGetrowactive(AT) = 0;
2134 
2135    hypre_ParCSRMatrixOwnsAssumedPartition(AT) = 1;
2136 
2137    *AT_ptr = AT;
2138 
2139    return hypre_error_flag;
2140 }
2141 
2142 /*--------------------------------------------------------------------------
2143  * hypre_ParCSRMatrixTranspose
2144  *--------------------------------------------------------------------------*/
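/*
 * Minimal usage sketch (data = 1 requests the numerical values as well):
 *
 *    hypre_ParCSRMatrix *AT;
 *    hypre_ParCSRMatrixTranspose(A, &AT, 1);
 *    ...
 *    hypre_ParCSRMatrixDestroy(AT);
 */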
2145 
2146 HYPRE_Int
2147 hypre_ParCSRMatrixTranspose( hypre_ParCSRMatrix  *A,
2148                              hypre_ParCSRMatrix **AT_ptr,
2149                              HYPRE_Int            data )
2150 {
2151 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
2152    hypre_GpuProfilingPushRange("ParCSRMatrixTranspose");
2153 #endif
2154 
2155 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
2156    HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) );
2157 
2158    if (exec == HYPRE_EXEC_DEVICE)
2159    {
2160       hypre_ParCSRMatrixTransposeDevice(A, AT_ptr, data);
2161    }
2162    else
2163 #endif
2164    {
2165       hypre_ParCSRMatrixTransposeHost(A, AT_ptr, data);
2166    }
2167 
2168 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
2169    hypre_GpuProfilingPopRange();
2170 #endif
2171 
2172    return hypre_error_flag;
2173 }
2174 
2175 /* -----------------------------------------------------------------------------
2176  * generate a parallel spanning tree (for the Maxwell equations)
2177  * G_csr is the node-to-edge connectivity matrix
2178  * ----------------------------------------------------------------------------- */
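/*
 * On return, (*indices)[0] holds the number of tree edges and
 * (*indices)[1 .. (*indices)[0]] hold the local indices of the edges selected
 * for the spanning tree.
 */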
2179 
2180 void
2181 hypre_ParCSRMatrixGenSpanningTree( hypre_ParCSRMatrix *G_csr,
2182                                    HYPRE_Int         **indices,
2183                                    HYPRE_Int           G_type )
2184 {
2185    HYPRE_BigInt nrows_G, ncols_G;
2186    HYPRE_Int *G_diag_i, *G_diag_j, *GT_diag_mat, i, j, k, edge;
2187    HYPRE_Int *nodes_marked, *edges_marked, *queue, queue_tail, queue_head, node;
2188    HYPRE_Int mypid, nprocs, n_children, *children, nsends, *send_procs, *recv_cnts;
2189    HYPRE_Int nrecvs, *recv_procs, n_proc_array, *proc_array, *pgraph_i, *pgraph_j;
2190    HYPRE_Int parent, proc, proc2, node2, found, *t_indices, tree_size, *T_diag_i;
2191    HYPRE_Int *T_diag_j, *counts, offset;
2192    MPI_Comm            comm;
2193    hypre_ParCSRCommPkg *comm_pkg;
2194    hypre_CSRMatrix     *G_diag;
2195 
2196    /* fetch G matrix (G_type = 0 ==> node to edge) */
2197 
2198    if (G_type == 0)
2199    {
2200       nrows_G = hypre_ParCSRMatrixGlobalNumRows(G_csr);
2201       ncols_G = hypre_ParCSRMatrixGlobalNumCols(G_csr);
2202       G_diag = hypre_ParCSRMatrixDiag(G_csr);
2203       G_diag_i = hypre_CSRMatrixI(G_diag);
2204       G_diag_j = hypre_CSRMatrixJ(G_diag);
2205    }
2206    else
2207    {
2208       nrows_G = hypre_ParCSRMatrixGlobalNumCols(G_csr);
2209       ncols_G = hypre_ParCSRMatrixGlobalNumRows(G_csr);
2210       G_diag = hypre_ParCSRMatrixDiag(G_csr);
2211       T_diag_i = hypre_CSRMatrixI(G_diag);
2212       T_diag_j = hypre_CSRMatrixJ(G_diag);
2213       counts = hypre_TAlloc(HYPRE_Int, nrows_G , HYPRE_MEMORY_HOST);
2214       for (i = 0; i < nrows_G; i++) counts[i] = 0;
2215       for (i = 0; i < T_diag_i[ncols_G]; i++) counts[T_diag_j[i]]++;
2216       G_diag_i = hypre_TAlloc(HYPRE_Int, (nrows_G+1) , HYPRE_MEMORY_HOST);
2217       G_diag_j = hypre_TAlloc(HYPRE_Int, T_diag_i[ncols_G] , HYPRE_MEMORY_HOST);
2218       G_diag_i[0] = 0;
2219       for (i = 1; i <= nrows_G; i++) G_diag_i[i] = G_diag_i[i-1] + counts[i-1];
2220       for (i = 0; i < ncols_G; i++)
2221       {
2222          for (j = T_diag_i[i]; j < T_diag_i[i+1]; j++)
2223          {
2224             k = T_diag_j[j];
2225             offset = G_diag_i[k]++;
2226             G_diag_j[offset] = i;
2227          }
2228       }
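      /* the scatter above advanced G_diag_i[k]; rebuild the row pointers */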
2229       G_diag_i[0] = 0;
2230       for (i = 1; i <= nrows_G; i++)
2231       {
2232          G_diag_i[i] = G_diag_i[i-1] + counts[i-1];
2233       }
2234       hypre_TFree(counts, HYPRE_MEMORY_HOST);
2235    }
2236 
2237    /* form G transpose in special form (2 nodes per edge max) */
2238 
2239    GT_diag_mat = hypre_TAlloc(HYPRE_Int, 2 * ncols_G , HYPRE_MEMORY_HOST);
2240    for (i = 0; i < 2 * ncols_G; i++) GT_diag_mat[i] = -1;
2241    for (i = 0; i < nrows_G; i++)
2242    {
2243       for (j = G_diag_i[i]; j < G_diag_i[i+1]; j++)
2244       {
2245          edge = G_diag_j[j];
2246          if (GT_diag_mat[edge*2] == -1) GT_diag_mat[edge*2] = i;
2247          else                           GT_diag_mat[edge*2+1] = i;
2248       }
2249    }
2250 
2251    /* BFS on the local matrix graph to find tree */
2252 
2253    nodes_marked = hypre_TAlloc(HYPRE_Int, nrows_G , HYPRE_MEMORY_HOST);
2254    edges_marked = hypre_TAlloc(HYPRE_Int, ncols_G , HYPRE_MEMORY_HOST);
2255    for (i = 0; i < nrows_G; i++) nodes_marked[i] = 0;
2256    for (i = 0; i < ncols_G; i++) edges_marked[i] = 0;
2257    queue = hypre_TAlloc(HYPRE_Int, nrows_G , HYPRE_MEMORY_HOST);
2258    queue_head = 0;
2259    queue_tail = 1;
2260    queue[0] = 0;
2261    nodes_marked[0] = 1;
2262    while ((queue_tail-queue_head) > 0)
2263    {
2264       node = queue[queue_tail-1];
2265       queue_tail--;
2266       for (i = G_diag_i[node]; i < G_diag_i[node+1]; i++)
2267       {
2268          edge = G_diag_j[i];
2269          if (edges_marked[edge] == 0)
2270          {
2271             if (GT_diag_mat[2*edge+1] != -1)
2272             {
2273                node2 = GT_diag_mat[2*edge];
2274                if (node2 == node) node2 = GT_diag_mat[2*edge+1];
2275                if (nodes_marked[node2] == 0)
2276                {
2277                   nodes_marked[node2] = 1;
2278                   edges_marked[edge] = 1;
2279                   queue[queue_tail] = node2;
2280                   queue_tail++;
2281                }
2282             }
2283          }
2284       }
2285    }
2286    hypre_TFree(nodes_marked, HYPRE_MEMORY_HOST);
2287    hypre_TFree(queue, HYPRE_MEMORY_HOST);
2288    hypre_TFree(GT_diag_mat, HYPRE_MEMORY_HOST);
2289 
2290    /* fetch the communication information from G_csr */
2291 
2292    comm = hypre_ParCSRMatrixComm(G_csr);
2293    hypre_MPI_Comm_rank(comm, &mypid);
2294    hypre_MPI_Comm_size(comm, &nprocs);
2295    comm_pkg = hypre_ParCSRMatrixCommPkg(G_csr);
2296    if (nprocs == 1 && comm_pkg == NULL)
2297    {
2298 
2299       hypre_MatvecCommPkgCreate((hypre_ParCSRMatrix *) G_csr);
2300 
2301       comm_pkg = hypre_ParCSRMatrixCommPkg(G_csr);
2302    }
2303 
2304    /* construct processor graph based on node-edge connection */
2305    /* (local edges connected to neighbor processor nodes)     */
2306 
2307    n_children = 0;
2308    nrecvs = nsends = 0;
2309    if (nprocs > 1)
2310    {
2311       nsends     = hypre_ParCSRCommPkgNumSends(comm_pkg);
2312       send_procs = hypre_ParCSRCommPkgSendProcs(comm_pkg);
2313       nrecvs     = hypre_ParCSRCommPkgNumRecvs(comm_pkg);
2314       recv_procs = hypre_ParCSRCommPkgRecvProcs(comm_pkg);
2315       proc_array = NULL;
2316       if ((nsends+nrecvs) > 0)
2317       {
2318          n_proc_array = 0;
2319          proc_array = hypre_TAlloc(HYPRE_Int, (nsends+nrecvs) , HYPRE_MEMORY_HOST);
2320          for (i = 0; i < nsends; i++) proc_array[i] = send_procs[i];
2321          for (i = 0; i < nrecvs; i++) proc_array[nsends+i] = recv_procs[i];
2322          hypre_qsort0(proc_array, 0, nsends+nrecvs-1);
2323          n_proc_array = 1;
2324          for (i = 1; i < nrecvs+nsends; i++)
2325             if (proc_array[i] != proc_array[n_proc_array-1])
2326                proc_array[n_proc_array++] = proc_array[i];
2327       }
2328       pgraph_i = hypre_TAlloc(HYPRE_Int, (nprocs+1) , HYPRE_MEMORY_HOST);
2329       recv_cnts = hypre_TAlloc(HYPRE_Int, nprocs , HYPRE_MEMORY_HOST);
2330       hypre_MPI_Allgather(&n_proc_array, 1, HYPRE_MPI_INT, recv_cnts, 1,
2331                           HYPRE_MPI_INT, comm);
2332       pgraph_i[0] = 0;
2333       for (i = 1; i <= nprocs; i++)
2334          pgraph_i[i] = pgraph_i[i-1] + recv_cnts[i-1];
2335       pgraph_j = hypre_TAlloc(HYPRE_Int, pgraph_i[nprocs] , HYPRE_MEMORY_HOST);
2336       hypre_MPI_Allgatherv(proc_array, n_proc_array, HYPRE_MPI_INT, pgraph_j,
2337                            recv_cnts, pgraph_i, HYPRE_MPI_INT, comm);
2338       hypre_TFree(recv_cnts, HYPRE_MEMORY_HOST);
2339 
2340       /* BFS on the processor graph to determine parent and children */
2341 
2342       nodes_marked = hypre_TAlloc(HYPRE_Int, nprocs , HYPRE_MEMORY_HOST);
2343       for (i = 0; i < nprocs; i++) nodes_marked[i] = -1;
2344       queue = hypre_TAlloc(HYPRE_Int, nprocs , HYPRE_MEMORY_HOST);
2345       queue_head = 0;
2346       queue_tail = 1;
2347       node = 0;
2348       queue[0] = node;
2349       while ((queue_tail-queue_head) > 0)
2350       {
2351          proc = queue[queue_tail-1];
2352          queue_tail--;
2353          for (i = pgraph_i[proc]; i < pgraph_i[proc+1]; i++)
2354          {
2355             proc2 = pgraph_j[i];
2356             if (nodes_marked[proc2] < 0)
2357             {
2358                nodes_marked[proc2] = proc;
2359                queue[queue_tail] = proc2;
2360                queue_tail++;
2361             }
2362          }
2363       }
2364       parent = nodes_marked[mypid];
2365       n_children = 0;
2366       for (i = 0; i < nprocs; i++) if (nodes_marked[i] == mypid) n_children++;
2367       if (n_children == 0) {n_children = 0; children = NULL;}
2368       else
2369       {
2370          children = hypre_TAlloc(HYPRE_Int, n_children , HYPRE_MEMORY_HOST);
2371          n_children = 0;
2372          for (i = 0; i < nprocs; i++)
2373             if (nodes_marked[i] == mypid) children[n_children++] = i;
2374       }
2375       hypre_TFree(nodes_marked, HYPRE_MEMORY_HOST);
2376       hypre_TFree(queue, HYPRE_MEMORY_HOST);
2377       hypre_TFree(pgraph_i, HYPRE_MEMORY_HOST);
2378       hypre_TFree(pgraph_j, HYPRE_MEMORY_HOST);
2379    }
2380 
2381    /* first, the connection with my parent: if an edge owned by my parent *
2382     * is incident to one of my nodes, then my parent will mark it          */
2383 
2384    found = 0;
2385    for (i = 0; i < nrecvs; i++)
2386    {
2387       proc = hypre_ParCSRCommPkgRecvProc(comm_pkg, i);
2388       if (proc == parent)
2389       {
2390          found = 1;
2391          break;
2392       }
2393    }
2394 
2395    /* but if all the edges connected to my parent are on my side, *
2396     * then I will just pick one of them as the tree edge          */
2397 
2398    if (found == 0)
2399    {
2400       for (i = 0; i < nsends; i++)
2401       {
2402          proc = hypre_ParCSRCommPkgSendProc(comm_pkg, i);
2403          if (proc == parent)
2404          {
2405             k = hypre_ParCSRCommPkgSendMapStart(comm_pkg,i);
2406             edge = hypre_ParCSRCommPkgSendMapElmt(comm_pkg,k);
2407             edges_marked[edge] = 1;
2408             break;
2409          }
2410       }
2411    }
2412 
2413    /* next, if my processor has an edge incident on a node owned by my *
2414     * child, put this edge on the tree. But if there is no such edge,  *
2415     * then I will assume my child will pick up an edge                 */
2416 
2417    for (j = 0; j < n_children; j++)
2418    {
2419       proc = children[j];
2420       for (i = 0; i < nsends; i++)
2421       {
2422          proc2 = hypre_ParCSRCommPkgSendProc(comm_pkg, i);
2423          if (proc == proc2)
2424          {
2425             k = hypre_ParCSRCommPkgSendMapStart(comm_pkg,i);
2426             edge = hypre_ParCSRCommPkgSendMapElmt(comm_pkg,k);
2427             edges_marked[edge] = 1;
2428             break;
2429          }
2430       }
2431    }
2432    if (n_children > 0)
2433    {
2434       hypre_TFree(children, HYPRE_MEMORY_HOST);
2435    }
2436 
2437    /* count the size of the tree */
2438 
2439    tree_size = 0;
2440    for (i = 0; i < ncols_G; i++)
2441       if (edges_marked[i] == 1) tree_size++;
2442    t_indices = hypre_TAlloc(HYPRE_Int, (tree_size+1) , HYPRE_MEMORY_HOST);
2443    t_indices[0] = tree_size;
2444    tree_size = 1;
2445    for (i = 0; i < ncols_G; i++)
2446       if (edges_marked[i] == 1) t_indices[tree_size++] = i;
2447    (*indices) = t_indices;
2448    hypre_TFree(edges_marked, HYPRE_MEMORY_HOST);
2449    if (G_type != 0)
2450    {
2451       hypre_TFree(G_diag_i, HYPRE_MEMORY_HOST);
2452       hypre_TFree(G_diag_j, HYPRE_MEMORY_HOST);
2453    }
2454 }
2455 
2456 /* -----------------------------------------------------------------------------
2457  * extract submatrices based on given indices
2458  * ----------------------------------------------------------------------------- */
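/*
 * Calling convention (as used below): indices2[0] is the number of indices and
 * indices2[1..indices2[0]] are the (local) row/column indices that define the
 * "1" block; *submatrices must point to an array of at least 4 matrix pointers,
 * which receive A11, A12, A21 and A22 in that order.
 */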
2459 
2460 void hypre_ParCSRMatrixExtractSubmatrices( hypre_ParCSRMatrix *A_csr,
2461                                            HYPRE_Int *indices2,
2462                                            hypre_ParCSRMatrix ***submatrices )
2463 {
2464    HYPRE_Int    nrows_A, nindices, *indices, *A_diag_i, *A_diag_j, mypid, nprocs;
2465    HYPRE_Int    i, j, k, *proc_offsets1, *proc_offsets2, *exp_indices;
2466    HYPRE_BigInt *itmp_array;
2467    HYPRE_Int    nnz11, nnz12, nnz21, nnz22, col, ncols_offd, nnz_offd, nnz_diag;
2468    HYPRE_Int    nrows, nnz;
2469    HYPRE_BigInt global_nrows, global_ncols, *row_starts, *col_starts;
2470    HYPRE_Int    *diag_i, *diag_j, row, *offd_i;
2471    HYPRE_Complex *A_diag_a, *diag_a;
2472    hypre_ParCSRMatrix *A11_csr, *A12_csr, *A21_csr, *A22_csr;
2473    hypre_CSRMatrix    *A_diag, *diag, *offd;
2474    MPI_Comm           comm;
2475 
2476    /* -----------------------------------------------------
2477     * first make sure the incoming indices are in order
2478     * ----------------------------------------------------- */
2479 
2480    nindices = indices2[0];
2481    indices  = &(indices2[1]);
2482    hypre_qsort0(indices, 0, nindices-1);
2483 
2484    /* -----------------------------------------------------
2485     * fetch matrix information
2486     * ----------------------------------------------------- */
2487 
2488    nrows_A = (HYPRE_Int) hypre_ParCSRMatrixGlobalNumRows(A_csr);
2489    A_diag = hypre_ParCSRMatrixDiag(A_csr);
2490    A_diag_i = hypre_CSRMatrixI(A_diag);
2491    A_diag_j = hypre_CSRMatrixJ(A_diag);
2492    A_diag_a = hypre_CSRMatrixData(A_diag);
2493    comm = hypre_ParCSRMatrixComm(A_csr);
2494    hypre_MPI_Comm_rank(comm, &mypid);
2495    hypre_MPI_Comm_size(comm, &nprocs);
2496    if (nprocs > 1)
2497    {
2498       hypre_error_w_msg(HYPRE_ERROR_GENERIC,"ExtractSubmatrices: cannot handle nprocs > 1 yet.\n");
2499       exit(1);
2500    }
2501 
2502    /* -----------------------------------------------------
2503     * compute new matrix dimensions
2504     * ----------------------------------------------------- */
2505 
2506    proc_offsets1 = hypre_TAlloc(HYPRE_Int, (nprocs+1) , HYPRE_MEMORY_HOST);
2507    proc_offsets2 = hypre_TAlloc(HYPRE_Int, (nprocs+1) , HYPRE_MEMORY_HOST);
2508    hypre_MPI_Allgather(&nindices, 1, HYPRE_MPI_INT, proc_offsets1, 1,
2509                        HYPRE_MPI_INT, comm);
2510    k = 0;
2511    for (i = 0; i < nprocs; i++)
2512    {
2513       j = proc_offsets1[i];
2514       proc_offsets1[i] = k;
2515       k += j;
2516    }
2517    proc_offsets1[nprocs] = k;
2518    itmp_array = hypre_ParCSRMatrixRowStarts(A_csr);
2519    for (i = 0; i <= nprocs; i++)
2520    {
2521       proc_offsets2[i] = itmp_array[i] - proc_offsets1[i];
2522    }
2523 
2524    /* -----------------------------------------------------
2525     * assign id's to row and col for later processing
2526     * ----------------------------------------------------- */
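   /* exp_indices[i] >= 0 : i is a selected row/column with new index exp_indices[i]
    * exp_indices[i] <  0 : i is unselected, with new index -exp_indices[i] - 1   */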
2527 
2528    exp_indices = hypre_TAlloc(HYPRE_Int, nrows_A , HYPRE_MEMORY_HOST);
2529    for (i = 0; i < nrows_A; i++) exp_indices[i] = -1;
2530    for (i = 0; i < nindices; i++)
2531    {
2532       if (exp_indices[indices[i]] == -1) exp_indices[indices[i]] = i;
2533       else
2534       {
2535          hypre_error_w_msg(HYPRE_ERROR_GENERIC,"ExtractSubmatrices: duplicate index in indices2\n");
2536          exit(1);
2537       }
2538    }
2539    k = 0;
2540    for (i = 0; i < nrows_A; i++)
2541    {
2542       if (exp_indices[i] < 0)
2543       {
2544          exp_indices[i] = - k - 1;
2545          k++;
2546       }
2547    }
2548 
2549    /* -----------------------------------------------------
2550     * compute number of nonzeros for each block
2551     * ----------------------------------------------------- */
2552 
2553    nnz11 = nnz12 = nnz21 = nnz22 = 0;
2554    for (i = 0; i < nrows_A; i++)
2555    {
2556       if (exp_indices[i] >= 0)
2557       {
2558          for (j = A_diag_i[i]; j < A_diag_i[i+1]; j++)
2559          {
2560             col = A_diag_j[j];
2561             if (exp_indices[col] >= 0) nnz11++;
2562             else                       nnz12++;
2563          }
2564       }
2565       else
2566       {
2567          for (j = A_diag_i[i]; j < A_diag_i[i+1]; j++)
2568          {
2569             col = A_diag_j[j];
2570             if (exp_indices[col] >= 0) nnz21++;
2571             else                       nnz22++;
2572          }
2573       }
2574    }
2575 
2576    /* -----------------------------------------------------
2577     * create A11 matrix (assume sequential for the moment)
2578     * ----------------------------------------------------- */
2579 
2580    ncols_offd = 0;
2581    nnz_offd   = 0;
2582    nnz_diag   = nnz11;
2583    /* This case is not yet implemented! */
2584    global_nrows = 0;
2585    global_ncols = 0;
2586    row_starts = NULL;
2587    col_starts = NULL;
2588    A11_csr = hypre_ParCSRMatrixCreate(comm, global_nrows, global_ncols,
2589                                       row_starts, col_starts, ncols_offd,
2590                                       nnz_diag, nnz_offd);
2591    nrows = nindices;
2592    diag_i = hypre_CTAlloc(HYPRE_Int,  nrows+1, HYPRE_MEMORY_HOST);
2593    diag_j = hypre_CTAlloc(HYPRE_Int,  nnz_diag, HYPRE_MEMORY_HOST);
2594    diag_a = hypre_CTAlloc(HYPRE_Complex,  nnz_diag, HYPRE_MEMORY_HOST);
2595    nnz = 0;
2596    row = 0;
2597    diag_i[0] = 0;
2598    for (i = 0; i < nrows_A; i++)
2599    {
2600       if (exp_indices[i] >= 0)
2601       {
2602          for (j = A_diag_i[i]; j < A_diag_i[i+1]; j++)
2603          {
2604             col = A_diag_j[j];
2605             if (exp_indices[col] >= 0)
2606             {
2607                diag_j[nnz] = exp_indices[col];
2608                diag_a[nnz++] = A_diag_a[j];
2609             }
2610          }
2611          row++;
2612          diag_i[row] = nnz;
2613       }
2614    }
2615    diag = hypre_ParCSRMatrixDiag(A11_csr);
2616    hypre_CSRMatrixI(diag) = diag_i;
2617    hypre_CSRMatrixJ(diag) = diag_j;
2618    hypre_CSRMatrixData(diag) = diag_a;
2619 
2620    offd_i = hypre_CTAlloc(HYPRE_Int,  nrows+1, HYPRE_MEMORY_HOST);
2621    for (i = 0; i <= nrows; i++) offd_i[i] = 0;
2622    offd = hypre_ParCSRMatrixOffd(A11_csr);
2623    hypre_CSRMatrixI(offd) = offd_i;
2624    hypre_CSRMatrixJ(offd) = NULL;
2625    hypre_CSRMatrixData(offd) = NULL;
2626 
2627    /* -----------------------------------------------------
2628     * create A12 matrix (assume sequential for the moment)
2629     * ----------------------------------------------------- */
2630 
2631    ncols_offd = 0;
2632    nnz_offd   = 0;
2633    nnz_diag   = nnz12;
2634    global_nrows = (HYPRE_BigInt)proc_offsets1[nprocs];
2635    global_ncols = (HYPRE_BigInt)proc_offsets2[nprocs];
2636    row_starts = hypre_CTAlloc(HYPRE_BigInt,  nprocs+1, HYPRE_MEMORY_HOST);
2637    col_starts = hypre_CTAlloc(HYPRE_BigInt,  nprocs+1, HYPRE_MEMORY_HOST);
2638    for (i = 0; i <= nprocs; i++)
2639    {
2640       row_starts[i] = (HYPRE_BigInt)proc_offsets1[i];
2641       col_starts[i] = (HYPRE_BigInt)proc_offsets2[i];
2642    }
2643    A12_csr = hypre_ParCSRMatrixCreate(comm, global_nrows, global_ncols,
2644                                       row_starts, col_starts, ncols_offd,
2645                                       nnz_diag, nnz_offd);
2646    nrows = nindices;
2647    diag_i = hypre_CTAlloc(HYPRE_Int,  nrows+1, HYPRE_MEMORY_HOST);
2648    diag_j = hypre_CTAlloc(HYPRE_Int,  nnz_diag, HYPRE_MEMORY_HOST);
2649    diag_a = hypre_CTAlloc(HYPRE_Complex,  nnz_diag, HYPRE_MEMORY_HOST);
2650    nnz = 0;
2651    row = 0;
2652    diag_i[0] = 0;
2653    for (i = 0; i < nrows_A; i++)
2654    {
2655       if (exp_indices[i] >= 0)
2656       {
2657          for (j = A_diag_i[i]; j < A_diag_i[i+1]; j++)
2658          {
2659             col = A_diag_j[j];
2660             if (exp_indices[col] < 0)
2661             {
2662                diag_j[nnz] = - exp_indices[col] - 1;
2663                diag_a[nnz++] = A_diag_a[j];
2664             }
2665          }
2666          row++;
2667          diag_i[row] = nnz;
2668       }
2669    }
2670 
2671    if (nnz > nnz_diag)
2672    {
2673       hypre_assert(0);
2674       hypre_error(HYPRE_ERROR_GENERIC);
2675    }
2676 
2677    diag = hypre_ParCSRMatrixDiag(A12_csr);
2678    hypre_CSRMatrixI(diag) = diag_i;
2679    hypre_CSRMatrixJ(diag) = diag_j;
2680    hypre_CSRMatrixData(diag) = diag_a;
2681 
2682    offd_i = hypre_CTAlloc(HYPRE_Int,  nrows+1, HYPRE_MEMORY_HOST);
2683    for (i = 0; i <= nrows; i++) offd_i[i] = 0;
2684    offd = hypre_ParCSRMatrixOffd(A12_csr);
2685    hypre_CSRMatrixI(offd) = offd_i;
2686    hypre_CSRMatrixJ(offd) = NULL;
2687    hypre_CSRMatrixData(offd) = NULL;
2688    hypre_TFree(row_starts, HYPRE_MEMORY_HOST);
2689    hypre_TFree(col_starts, HYPRE_MEMORY_HOST);
2690 
2691    /* -----------------------------------------------------
2692     * create A21 matrix (assume sequential for the moment)
2693     * ----------------------------------------------------- */
2694 
2695    ncols_offd = 0;
2696    nnz_offd   = 0;
2697    nnz_diag   = nnz21;
2698    global_nrows = (HYPRE_BigInt)proc_offsets2[nprocs];
2699    global_ncols = (HYPRE_BigInt)proc_offsets1[nprocs];
2700    row_starts = hypre_CTAlloc(HYPRE_BigInt,  nprocs+1, HYPRE_MEMORY_HOST);
2701    col_starts = hypre_CTAlloc(HYPRE_BigInt,  nprocs+1, HYPRE_MEMORY_HOST);
2702    for (i = 0; i <= nprocs; i++)
2703    {
2704       row_starts[i] = (HYPRE_BigInt)proc_offsets2[i];
2705       col_starts[i] = (HYPRE_BigInt)proc_offsets1[i];
2706    }
2707    A21_csr = hypre_ParCSRMatrixCreate(comm, global_nrows, global_ncols,
2708                                       row_starts, col_starts, ncols_offd,
2709                                       nnz_diag, nnz_offd);
2710    nrows = nrows_A - nindices;
2711    diag_i = hypre_CTAlloc(HYPRE_Int,  nrows+1, HYPRE_MEMORY_HOST);
2712    diag_j = hypre_CTAlloc(HYPRE_Int,  nnz_diag, HYPRE_MEMORY_HOST);
2713    diag_a = hypre_CTAlloc(HYPRE_Complex,  nnz_diag, HYPRE_MEMORY_HOST);
2714    nnz = 0;
2715    row = 0;
2716    diag_i[0] = 0;
2717    for (i = 0; i < nrows_A; i++)
2718    {
2719       if (exp_indices[i] < 0)
2720       {
2721          for (j = A_diag_i[i]; j < A_diag_i[i+1]; j++)
2722          {
2723             col = A_diag_j[j];
2724             if (exp_indices[col] >= 0)
2725             {
2726                diag_j[nnz] = exp_indices[col];
2727                diag_a[nnz++] = A_diag_a[j];
2728             }
2729          }
2730          row++;
2731          diag_i[row] = nnz;
2732       }
2733    }
2734    diag = hypre_ParCSRMatrixDiag(A21_csr);
2735    hypre_CSRMatrixI(diag) = diag_i;
2736    hypre_CSRMatrixJ(diag) = diag_j;
2737    hypre_CSRMatrixData(diag) = diag_a;
2738 
2739    offd_i = hypre_CTAlloc(HYPRE_Int,  nrows+1, HYPRE_MEMORY_HOST);
2740    for (i = 0; i <= nrows; i++) offd_i[i] = 0;
2741    offd = hypre_ParCSRMatrixOffd(A21_csr);
2742    hypre_CSRMatrixI(offd) = offd_i;
2743    hypre_CSRMatrixJ(offd) = NULL;
2744    hypre_CSRMatrixData(offd) = NULL;
2745    hypre_TFree(row_starts, HYPRE_MEMORY_HOST);
2746    hypre_TFree(col_starts, HYPRE_MEMORY_HOST);
2747 
2748    /* -----------------------------------------------------
2749     * create A22 matrix (assume sequential for the moment)
2750     * ----------------------------------------------------- */
2751 
2752    ncols_offd = 0;
2753    nnz_offd   = 0;
2754    nnz_diag   = nnz22;
2755    global_nrows = (HYPRE_BigInt)proc_offsets2[nprocs];
2756    global_ncols = (HYPRE_BigInt)proc_offsets2[nprocs];
2757    row_starts = hypre_CTAlloc(HYPRE_BigInt,  nprocs+1, HYPRE_MEMORY_HOST);
2758    col_starts = hypre_CTAlloc(HYPRE_BigInt,  nprocs+1, HYPRE_MEMORY_HOST);
2759    for (i = 0; i <= nprocs; i++)
2760    {
2761       row_starts[i] = (HYPRE_BigInt)proc_offsets2[i];
2762       col_starts[i] = (HYPRE_BigInt)proc_offsets2[i];
2763    }
2764    A22_csr = hypre_ParCSRMatrixCreate(comm, global_nrows, global_ncols,
2765                                       row_starts, col_starts, ncols_offd,
2766                                       nnz_diag, nnz_offd);
2767    nrows = nrows_A - nindices;
2768    diag_i = hypre_CTAlloc(HYPRE_Int,  nrows+1, HYPRE_MEMORY_HOST);
2769    diag_j = hypre_CTAlloc(HYPRE_Int,  nnz_diag, HYPRE_MEMORY_HOST);
2770    diag_a = hypre_CTAlloc(HYPRE_Complex,  nnz_diag, HYPRE_MEMORY_HOST);
2771    nnz = 0;
2772    row = 0;
2773    diag_i[0] = 0;
2774    for (i = 0; i < nrows_A; i++)
2775    {
2776       if (exp_indices[i] < 0)
2777       {
2778          for (j = A_diag_i[i]; j < A_diag_i[i+1]; j++)
2779          {
2780             col = A_diag_j[j];
2781             if (exp_indices[col] < 0)
2782             {
2783                diag_j[nnz] = - exp_indices[col] - 1;
2784                diag_a[nnz++] = A_diag_a[j];
2785             }
2786          }
2787          row++;
2788          diag_i[row] = nnz;
2789       }
2790    }
2791    diag = hypre_ParCSRMatrixDiag(A22_csr);
2792    hypre_CSRMatrixI(diag) = diag_i;
2793    hypre_CSRMatrixJ(diag) = diag_j;
2794    hypre_CSRMatrixData(diag) = diag_a;
2795 
2796    offd_i = hypre_CTAlloc(HYPRE_Int,  nrows+1, HYPRE_MEMORY_HOST);
2797    for (i = 0; i <= nrows; i++) offd_i[i] = 0;
2798    offd = hypre_ParCSRMatrixOffd(A22_csr);
2799    hypre_CSRMatrixI(offd) = offd_i;
2800    hypre_CSRMatrixJ(offd) = NULL;
2801    hypre_CSRMatrixData(offd) = NULL;
2802    hypre_TFree(row_starts, HYPRE_MEMORY_HOST);
2803    hypre_TFree(col_starts, HYPRE_MEMORY_HOST);
2804 
2805    /* -----------------------------------------------------
2806     * hand the matrices back to the caller and clean up
2807     * ----------------------------------------------------- */
2808 
2809    (*submatrices)[0] = A11_csr;
2810    (*submatrices)[1] = A12_csr;
2811    (*submatrices)[2] = A21_csr;
2812    (*submatrices)[3] = A22_csr;
2813    hypre_TFree(proc_offsets1, HYPRE_MEMORY_HOST);
2814    hypre_TFree(proc_offsets2, HYPRE_MEMORY_HOST);
2815    hypre_TFree(exp_indices, HYPRE_MEMORY_HOST);
2816 }
2817 
2818 /* -----------------------------------------------------------------------------
2819  * extract submatrices of a rectangular matrix
2820  * ----------------------------------------------------------------------------- */
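/*
 * Calling convention (as used below): indices2[0] is the number of row indices
 * and indices2[1..indices2[0]] are the (local) rows that define the first block;
 * the routine splits the rows of A into the indexed block and its complement
 * (keeping both diag and offd parts) and returns the results through
 * *submatrices.
 */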
2821 
2822 void hypre_ParCSRMatrixExtractRowSubmatrices( hypre_ParCSRMatrix *A_csr,
2823                                               HYPRE_Int *indices2,
2824                                               hypre_ParCSRMatrix ***submatrices )
2825 {
2826    HYPRE_Int    nrows_A, nindices, *indices, *A_diag_i, *A_diag_j, mypid, nprocs;
2827    HYPRE_Int    i, j, k, *proc_offsets1, *proc_offsets2, *exp_indices;
2828    HYPRE_Int    nnz11, nnz21, col, ncols_offd, nnz_offd, nnz_diag;
2829    HYPRE_Int    *A_offd_i, *A_offd_j;
2830    HYPRE_Int    nrows, nnz;
2831    HYPRE_BigInt global_nrows, global_ncols, *row_starts, *col_starts, *itmp_array;
2832    HYPRE_Int    *diag_i, *diag_j, row, *offd_i, *offd_j, nnz11_offd, nnz21_offd;
2833    HYPRE_Complex *A_diag_a, *A_offd_a, *diag_a, *offd_a;
2834    hypre_ParCSRMatrix *A11_csr, *A21_csr;
2835    hypre_CSRMatrix    *A_diag, *diag, *A_offd, *offd;
2836    MPI_Comm           comm;
2837 
2838    /* -----------------------------------------------------
2839     * first make sure the incoming indices are in order
2840     * ----------------------------------------------------- */
2841 
2842    nindices = indices2[0];
2843    indices  = &(indices2[1]);
2844    hypre_qsort0(indices, 0, nindices-1);
2845 
2846    /* -----------------------------------------------------
2847     * fetch matrix information
2848     * ----------------------------------------------------- */
2849 
2850    nrows_A = (HYPRE_Int)hypre_ParCSRMatrixGlobalNumRows(A_csr);
2851    A_diag = hypre_ParCSRMatrixDiag(A_csr);
2852    A_diag_i = hypre_CSRMatrixI(A_diag);
2853    A_diag_j = hypre_CSRMatrixJ(A_diag);
2854    A_diag_a = hypre_CSRMatrixData(A_diag);
2855    A_offd = hypre_ParCSRMatrixOffd(A_csr);
2856    A_offd_i = hypre_CSRMatrixI(A_offd);
2857    A_offd_j = hypre_CSRMatrixJ(A_offd);
   A_offd_a = hypre_CSRMatrixData(A_offd);
2858    comm = hypre_ParCSRMatrixComm(A_csr);
2859    hypre_MPI_Comm_rank(comm, &mypid);
2860    hypre_MPI_Comm_size(comm, &nprocs);
2861 
2862    /* -----------------------------------------------------
2863     * compute new matrix dimensions
2864     * ----------------------------------------------------- */
2865 
2866    proc_offsets1 = hypre_TAlloc(HYPRE_Int, (nprocs+1) , HYPRE_MEMORY_HOST);
2867    proc_offsets2 = hypre_TAlloc(HYPRE_Int, (nprocs+1) , HYPRE_MEMORY_HOST);
2868    hypre_MPI_Allgather(&nindices, 1, HYPRE_MPI_INT, proc_offsets1, 1,
2869                        HYPRE_MPI_INT, comm);
2870    k = 0;
2871    for (i = 0; i < nprocs; i++)
2872    {
2873       j = proc_offsets1[i];
2874       proc_offsets1[i] = k;
2875       k += j;
2876    }
2877    proc_offsets1[nprocs] = k;
2878    itmp_array = hypre_ParCSRMatrixRowStarts(A_csr);
2879    for (i = 0; i <= nprocs; i++)
2880       proc_offsets2[i] = (HYPRE_Int)(itmp_array[i] - proc_offsets1[i]);
2881 
2882    /* -----------------------------------------------------
2883     * assign ids to rows and columns for later processing
2884     * ----------------------------------------------------- */
2885 
2886    exp_indices = hypre_TAlloc(HYPRE_Int, nrows_A , HYPRE_MEMORY_HOST);
2887    for (i = 0; i < nrows_A; i++) exp_indices[i] = -1;
2888    for (i = 0; i < nindices; i++)
2889    {
2890       if (exp_indices[indices[i]] == -1) exp_indices[indices[i]] = i;
2891       else
2892       {
2893          hypre_error_w_msg(HYPRE_ERROR_GENERIC, "ExtractRowSubmatrices: duplicate row index\n");
2894          exit(1);
2895       }
2896    }
2897    k = 0;
2898    for (i = 0; i < nrows_A; i++)
2899    {
2900       if (exp_indices[i] < 0)
2901       {
2902          exp_indices[i] = - k - 1;
2903          k++;
2904       }
2905    }
2906 
2907    /* -----------------------------------------------------
2908     * compute number of nonzeros for each block
2909     * ----------------------------------------------------- */
2910 
2911    nnz11 = nnz21 = nnz11_offd = nnz21_offd = 0;
2912    for (i = 0; i < nrows_A; i++)
2913    {
2914       if (exp_indices[i] >= 0)
2915       {
2916          for (j = A_diag_i[i]; j < A_diag_i[i+1]; j++)
2917          {
2918             col = A_diag_j[j];
2919             if (exp_indices[col] >= 0) nnz11++;
2920          }
2921          nnz11_offd += A_offd_i[i+1] - A_offd_i[i];
2922       }
2923       else
2924       {
2925          for (j = A_diag_i[i]; j < A_diag_i[i+1]; j++)
2926          {
2927             col = A_diag_j[j];
2928             if (exp_indices[col] < 0) nnz21++;
2929          }
2930          nnz21_offd += A_offd_i[i+1] - A_offd_i[i];
2931       }
2932    }
2933 
2934    /* -----------------------------------------------------
2935     * create A11 matrix (assume sequential for the moment)
2936     * ----------------------------------------------------- */
2937 
2938    ncols_offd = hypre_CSRMatrixNumCols(hypre_ParCSRMatrixDiag(A_csr));
2939    nnz_diag   = nnz11;
2940    nnz_offd   = nnz11_offd;
2941 
2942    global_nrows = (HYPRE_BigInt)proc_offsets1[nprocs];
2943    itmp_array   = hypre_ParCSRMatrixColStarts(A_csr);
2944    global_ncols = itmp_array[nprocs];
2945    row_starts = hypre_CTAlloc(HYPRE_BigInt,  nprocs+1, HYPRE_MEMORY_HOST);
2946    col_starts = hypre_CTAlloc(HYPRE_BigInt,  nprocs+1, HYPRE_MEMORY_HOST);
2947    for (i = 0; i <= nprocs; i++)
2948    {
2949       row_starts[i] = (HYPRE_BigInt)proc_offsets1[i];
2950       col_starts[i] = itmp_array[i];
2951    }
2952    A11_csr = hypre_ParCSRMatrixCreate(comm, global_nrows, global_ncols,
2953                                       row_starts, col_starts, ncols_offd,
2954                                       nnz_diag, nnz_offd);
2955    nrows = nindices;
2956    diag_i = hypre_CTAlloc(HYPRE_Int,  nrows+1, HYPRE_MEMORY_HOST);
2957    diag_j = hypre_CTAlloc(HYPRE_Int,  nnz_diag, HYPRE_MEMORY_HOST);
2958    diag_a = hypre_CTAlloc(HYPRE_Complex,  nnz_diag, HYPRE_MEMORY_HOST);
2959    nnz = 0;
2960    row = 0;
2961    diag_i[0] = 0;
2962    for (i = 0; i < nrows_A; i++)
2963    {
2964       if (exp_indices[i] >= 0)
2965       {
2966          for (j = A_diag_i[i]; j < A_diag_i[i+1]; j++)
2967          {
2968             col = A_diag_j[j];
2969             if (exp_indices[col] >= 0)
2970             {
2971                diag_j[nnz] = exp_indices[col];
2972                diag_a[nnz++] = A_diag_a[j];
2973             }
2974          }
2975          row++;
2976          diag_i[row] = nnz;
2977       }
2978    }
2979    diag = hypre_ParCSRMatrixDiag(A11_csr);
2980    hypre_CSRMatrixI(diag) = diag_i;
2981    hypre_CSRMatrixJ(diag) = diag_j;
2982    hypre_CSRMatrixData(diag) = diag_a;
2983 
2984    offd_i = hypre_CTAlloc(HYPRE_Int,  nrows+1, HYPRE_MEMORY_HOST);
2985    offd_j = hypre_CTAlloc(HYPRE_Int,  nnz_offd, HYPRE_MEMORY_HOST);
2986    offd_a = hypre_CTAlloc(HYPRE_Complex,  nnz_offd, HYPRE_MEMORY_HOST);
2987    nnz = 0;
2988    row = 0;
2989    offd_i[0] = 0;
2990    for (i = 0; i < nrows_A; i++)
2991    {
2992       if (exp_indices[i] >= 0)
2993       {
2994          for (j = A_offd_i[i]; j < A_offd_i[i+1]; j++)
2995          {
2996             offd_j[nnz] = A_offd_j[j];
2997             offd_a[nnz++] = A_offd_a[j];
2998          }
2999          row++;
3000          offd_i[row] = nnz;
3001       }
3002    }
3003    offd = hypre_ParCSRMatrixOffd(A11_csr);
3004    hypre_CSRMatrixI(offd) = offd_i;
3005    hypre_CSRMatrixJ(offd) = offd_j;
3006    hypre_CSRMatrixData(offd) = offd_a;
3007    hypre_TFree(row_starts, HYPRE_MEMORY_HOST);
3008    hypre_TFree(col_starts, HYPRE_MEMORY_HOST);
3009 
3010    /* -----------------------------------------------------
3011     * create A21 matrix
3012     * ----------------------------------------------------- */
3013 
3014    ncols_offd = hypre_CSRMatrixNumCols(hypre_ParCSRMatrixDiag(A_csr));
3015    nnz_offd   = nnz21_offd;
3016    nnz_diag   = nnz21;
3017    global_nrows = (HYPRE_BigInt)proc_offsets2[nprocs];
3018    itmp_array   = hypre_ParCSRMatrixColStarts(A_csr);
3019    global_ncols = itmp_array[nprocs];
3020    row_starts = hypre_CTAlloc(HYPRE_BigInt,  nprocs+1, HYPRE_MEMORY_HOST);
3021    col_starts = hypre_CTAlloc(HYPRE_BigInt,  nprocs+1, HYPRE_MEMORY_HOST);
3022    for (i = 0; i <= nprocs; i++)
3023    {
3024       row_starts[i] = (HYPRE_BigInt)proc_offsets2[i];
3025       col_starts[i] = itmp_array[i];
3026    }
3027    A21_csr = hypre_ParCSRMatrixCreate(comm, global_nrows, global_ncols,
3028                                       row_starts, col_starts, ncols_offd,
3029                                       nnz_diag, nnz_offd);
3030    nrows = nrows_A - nindices;
3031    diag_i = hypre_CTAlloc(HYPRE_Int,  nrows+1, HYPRE_MEMORY_HOST);
3032    diag_j = hypre_CTAlloc(HYPRE_Int,  nnz_diag, HYPRE_MEMORY_HOST);
3033    diag_a = hypre_CTAlloc(HYPRE_Complex,  nnz_diag, HYPRE_MEMORY_HOST);
3034    nnz = 0;
3035    row = 0;
3036    diag_i[0] = 0;
3037    for (i = 0; i < nrows_A; i++)
3038    {
3039       if (exp_indices[i] < 0)
3040       {
3041          for (j = A_diag_i[i]; j < A_diag_i[i+1]; j++)
3042          {
3043             diag_j[nnz] = A_diag_j[j];
3044             diag_a[nnz++] = A_diag_a[j];
3045          }
3046          row++;
3047          diag_i[row] = nnz;
3048       }
3049    }
3050    diag = hypre_ParCSRMatrixDiag(A21_csr);
3051    hypre_CSRMatrixI(diag) = diag_i;
3052    hypre_CSRMatrixJ(diag) = diag_j;
3053    hypre_CSRMatrixData(diag) = diag_a;
3054 
3055    offd_i = hypre_CTAlloc(HYPRE_Int,  nrows+1, HYPRE_MEMORY_HOST);
3056    offd_j = hypre_CTAlloc(HYPRE_Int,  nnz_offd, HYPRE_MEMORY_HOST);
3057    offd_a = hypre_CTAlloc(HYPRE_Complex,  nnz_offd, HYPRE_MEMORY_HOST);
3058    nnz = 0;
3059    row = 0;
3060    offd_i[0] = 0;
3061    for (i = 0; i < nrows_A; i++)
3062    {
3063       if (exp_indices[i] < 0)
3064       {
3065          for (j = A_offd_i[i]; j < A_offd_i[i+1]; j++)
3066          {
3067             offd_j[nnz] = A_offd_j[j];
3068             offd_a[nnz++] = A_offd_a[j];
3069          }
3070          row++;
3071          offd_i[row] = nnz;
3072       }
3073    }
3074    offd = hypre_ParCSRMatrixOffd(A21_csr);
3075    hypre_CSRMatrixI(offd) = offd_i;
3076    hypre_CSRMatrixJ(offd) = offd_j;
3077    hypre_CSRMatrixData(offd) = offd_a;
3078    hypre_TFree(row_starts, HYPRE_MEMORY_HOST);
3079    hypre_TFree(col_starts, HYPRE_MEMORY_HOST);
3080 
3081    /* -----------------------------------------------------
3082     * hand the matrices back to the caller and clean up
3083     * ----------------------------------------------------- */
3084 
3085    (*submatrices)[0] = A11_csr;
3086    (*submatrices)[1] = A21_csr;
3087    hypre_TFree(proc_offsets1, HYPRE_MEMORY_HOST);
3088    hypre_TFree(proc_offsets2, HYPRE_MEMORY_HOST);
3089    hypre_TFree(exp_indices, HYPRE_MEMORY_HOST);
3090 }
3091 
3092 /* -----------------------------------------------------------------------------
3093  * return the sum of all local elements of the matrix
3094  * ----------------------------------------------------------------------------- */
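/* A minimal usage sketch (not part of the function itself): the local sums
 * can be reduced to a global sum over the matrix communicator; HYPRE_MPI_COMPLEX
 * is assumed to match HYPRE_Complex here.
 *
 *    HYPRE_Complex local_sum  = hypre_ParCSRMatrixLocalSumElts(A);
 *    HYPRE_Complex global_sum = 0.0;
 *    hypre_MPI_Allreduce(&local_sum, &global_sum, 1, HYPRE_MPI_COMPLEX,
 *                        hypre_MPI_SUM, hypre_ParCSRMatrixComm(A));
 */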
3095 
3096 HYPRE_Complex hypre_ParCSRMatrixLocalSumElts( hypre_ParCSRMatrix * A )
3097 {
3098    hypre_CSRMatrix * A_diag = hypre_ParCSRMatrixDiag( A );
3099    hypre_CSRMatrix * A_offd = hypre_ParCSRMatrixOffd( A );
3100 
3101    return hypre_CSRMatrixSumElts(A_diag) + hypre_CSRMatrixSumElts(A_offd);
3102 }
3103 
3104 /*--------------------------------------------------------------------------
3105  * hypre_ParCSRMatrixAminvDB
3106  * computes C = A - inv(D)*B, where D is a diagonal matrix given by the vector d
3107  * Note: the sparsity structure of A is expected to be a subset of that of B!
3108  *--------------------------------------------------------------------------*/
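/* A minimal usage sketch (assumptions: d holds the diagonal of D, one entry
 * per local row of B, and the sparsity pattern of A is contained in that of
 * B, as the note above requires):
 *
 *    hypre_ParCSRMatrix *C = NULL;
 *    hypre_ParCSRMatrixAminvDB(A, B, d, &C);    // C = A - inv(D)*B
 *    hypre_ParCSRMatrixDestroy(C);              // caller owns C
 */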
3109 
3110 HYPRE_Int
3111 hypre_ParCSRMatrixAminvDB( hypre_ParCSRMatrix *A,
3112                            hypre_ParCSRMatrix *B,
3113                            HYPRE_Complex *d,
3114                            hypre_ParCSRMatrix **C_ptr)
3115 {
3116    MPI_Comm comm = hypre_ParCSRMatrixComm(B);
3117    hypre_CSRMatrix      *A_diag   = hypre_ParCSRMatrixDiag(A);
3118    hypre_CSRMatrix      *A_offd   = hypre_ParCSRMatrixOffd(A);
3119    hypre_ParCSRMatrix  *C = NULL;
3120    HYPRE_Int          num_cols_offd_A = hypre_CSRMatrixNumCols(A_offd);
3121 
3122    hypre_ParCSRCommPkg  *comm_pkg_B = hypre_ParCSRMatrixCommPkg(B);
3123    hypre_CSRMatrix      *B_diag   = hypre_ParCSRMatrixDiag(B);
3124    hypre_CSRMatrix      *B_offd   = hypre_ParCSRMatrixOffd(B);
3125    HYPRE_Int          num_cols_offd_B = hypre_CSRMatrixNumCols(B_offd);
3126    HYPRE_Int          num_sends_B, num_recvs_B;
3127    HYPRE_Int          i, j, cnt;
3128 
3129    HYPRE_Int *A_diag_i = hypre_CSRMatrixI(A_diag);
3130    HYPRE_Int *A_diag_j = hypre_CSRMatrixJ(A_diag);
3131    HYPRE_Complex *A_diag_data = hypre_CSRMatrixData(A_diag);
3132    HYPRE_Int *A_offd_i = hypre_CSRMatrixI(A_offd);
3133    HYPRE_Int *A_offd_j = hypre_CSRMatrixJ(A_offd);
3134    HYPRE_Complex *A_offd_data = hypre_CSRMatrixData(A_offd);
3135    HYPRE_BigInt *col_map_offd_A = hypre_ParCSRMatrixColMapOffd(A);
3136 
3137    HYPRE_Int num_rows = hypre_CSRMatrixNumRows(B_diag);
3138    HYPRE_Int *B_diag_i = hypre_CSRMatrixI(B_diag);
3139    HYPRE_Int *B_diag_j = hypre_CSRMatrixJ(B_diag);
3140    HYPRE_Complex *B_diag_data = hypre_CSRMatrixData(B_diag);
3141    HYPRE_Int *B_offd_i = hypre_CSRMatrixI(B_offd);
3142    HYPRE_Int *B_offd_j = hypre_CSRMatrixJ(B_offd);
3143    HYPRE_Complex *B_offd_data = hypre_CSRMatrixData(B_offd);
3144    HYPRE_BigInt *col_map_offd_B = hypre_ParCSRMatrixColMapOffd(B);
3145 
3146    hypre_CSRMatrix      *C_diag   = NULL;
3147    hypre_CSRMatrix      *C_offd   = NULL;
3148    HYPRE_Int *C_diag_i = NULL;
3149    HYPRE_Int *C_diag_j = NULL;
3150    HYPRE_Complex *C_diag_data = NULL;
3151    HYPRE_Int *C_offd_i = NULL;
3152    HYPRE_Int *C_offd_j = NULL;
3153    HYPRE_Complex *C_offd_data = NULL;
3154 
3155    HYPRE_Int num_procs, my_id;
3156 
3157    HYPRE_Int *recv_procs_B;
3158    HYPRE_Int *send_procs_B;
3159    HYPRE_Int *recv_vec_starts_B;
3160    HYPRE_Int *send_map_starts_B;
3161    HYPRE_Int *send_map_elmts_B;
3162    hypre_ParCSRCommPkg *comm_pkg_C;
3163    HYPRE_Int *recv_procs_C;
3164    HYPRE_Int *send_procs_C;
3165    HYPRE_Int *recv_vec_starts_C;
3166    HYPRE_Int *send_map_starts_C;
3167    HYPRE_Int *send_map_elmts_C;
3168    HYPRE_Int *map_to_B;
3169 
3170    /*HYPRE_Int *C_diag_array;
3171      HYPRE_Int *C_offd_array;*/
3172    HYPRE_Complex *D_tmp;
3173    HYPRE_Int size, rest, num_threads, ii;
3174 
3175    hypre_MPI_Comm_size(comm,&num_procs);
3176    hypre_MPI_Comm_rank(comm,&my_id);
3177 
3178    num_threads = hypre_NumThreads();
3179    /*C_diag_array = hypre_CTAlloc(HYPRE_Int, num_threads);
3180      C_offd_array = hypre_CTAlloc(HYPRE_Int,  num_threads, HYPRE_MEMORY_HOST);*/
3181 
3182    /*---------------------------------------------------------------------
3183     * If no CommPkg exists for B, one is generated here
3184     *--------------------------------------------------------------------*/
3185 
3186    if (!comm_pkg_B)
3187    {
3188       hypre_MatvecCommPkgCreate(B);
3189       comm_pkg_B = hypre_ParCSRMatrixCommPkg(B);
3190    }
3191 
3192    C = hypre_ParCSRMatrixClone(B, 0);
3193    /*hypre_ParCSRMatrixInitialize(C);*/
3194 
3195    C_diag = hypre_ParCSRMatrixDiag(C);
3196    C_diag_i = hypre_CSRMatrixI(C_diag);
3197    C_diag_j = hypre_CSRMatrixJ(C_diag);
3198    C_diag_data = hypre_CSRMatrixData(C_diag);
3199    C_offd = hypre_ParCSRMatrixOffd(C);
3200    C_offd_i = hypre_CSRMatrixI(C_offd);
3201    C_offd_j = hypre_CSRMatrixJ(C_offd);
3202    C_offd_data = hypre_CSRMatrixData(C_offd);
3203 
3204    size = num_rows/num_threads;
3205    rest = num_rows - size*num_threads;
3206 
3207    D_tmp = hypre_CTAlloc(HYPRE_Complex, num_rows, HYPRE_MEMORY_HOST);
3208 
3209    if (num_cols_offd_A)
3210    {
3211       map_to_B = hypre_CTAlloc(HYPRE_Int, num_cols_offd_A, HYPRE_MEMORY_HOST);
3212       cnt = 0;
3213       for (i = 0; i < num_cols_offd_A; i++)
3214       {
3215          while (col_map_offd_B[cnt] < col_map_offd_A[i])
3216          {
3217             cnt++;
3218          }
3219          map_to_B[i] = cnt;
3220          cnt++;
3221       }
3222    }
3223 
3224 #ifdef HYPRE_USING_OPENMP
3225 #pragma omp parallel for private(ii, i, j)
3226 #endif
3227    for (ii=0; ii < num_threads; ii++)
3228    {
3229       HYPRE_Int *A_marker = NULL;
3230       HYPRE_Int ns, ne, A_col, num_cols, nmax;
3231       if (ii < rest)
3232       {
3233          ns = ii*size+ii;
3234          ne = (ii+1)*size+ii+1;
3235       }
3236       else
3237       {
3238          ns = ii*size+rest;
3239          ne = (ii+1)*size+rest;
3240       }
3241       nmax = hypre_max(num_rows, num_cols_offd_B);
3242       A_marker = hypre_CTAlloc(HYPRE_Int,  nmax, HYPRE_MEMORY_HOST);
3243 
3244       for (i = 0; i < num_rows; i++)
3245       {
3246          A_marker[i] = -1;
3247       }
3248 
3249       for (i = ns; i < ne; i++)
3250       {
3251          D_tmp[i] = 1.0/d[i];
3252       }
3253 
3254       num_cols = C_diag_i[ns];
3255       for (i = ns; i < ne; i++)
3256       {
3257          for (j = A_diag_i[i]; j < A_diag_i[i+1]; j++)
3258          {
3259             A_col = A_diag_j[j];
3260             if (A_marker[A_col] < C_diag_i[i])
3261             {
3262                A_marker[A_col] = num_cols;
3263                C_diag_j[num_cols] = A_col;
3264                C_diag_data[num_cols] = A_diag_data[j];
3265                num_cols++;
3266             }
3267             else
3268             {
3269                C_diag_data[A_marker[A_col]] += A_diag_data[j];
3270             }
3271          }
3272          for (j = B_diag_i[i]; j < B_diag_i[i+1]; j++)
3273          {
3274             A_col = B_diag_j[j];
3275             if (A_marker[A_col] < C_diag_i[i])
3276             {
3277                A_marker[A_col] = num_cols;
3278                C_diag_j[num_cols] = A_col;
3279                C_diag_data[num_cols] = -D_tmp[i]*B_diag_data[j];
3280                num_cols++;
3281             }
3282             else
3283             {
3284                C_diag_data[A_marker[A_col]] -= D_tmp[i]*B_diag_data[j];
3285             }
3286          }
3287       }
3288 
3289       for (i = 0; i < num_cols_offd_B; i++)
3290       {
3291          A_marker[i] = -1;
3292       }
3293 
3294       num_cols = C_offd_i[ns];
3295       for (i = ns; i < ne; i++)
3296       {
3297          for (j = A_offd_i[i]; j < A_offd_i[i+1]; j++)
3298          {
3299             A_col = map_to_B[A_offd_j[j]];
3300             if (A_marker[A_col] < B_offd_i[i])
3301             {
3302                A_marker[A_col] = num_cols;
3303                C_offd_j[num_cols] = A_col;
3304                C_offd_data[num_cols] = A_offd_data[j];
3305                num_cols++;
3306             }
3307             else
3308             {
3309                C_offd_data[A_marker[A_col]] += A_offd_data[j];
3310             }
3311          }
3312          for (j = B_offd_i[i]; j < B_offd_i[i+1]; j++)
3313          {
3314             A_col = B_offd_j[j];
3315             if (A_marker[A_col] < B_offd_i[i])
3316             {
3317                A_marker[A_col] = num_cols;
3318                C_offd_j[num_cols] = A_col;
3319                C_offd_data[num_cols] = -D_tmp[i]*B_offd_data[j];
3320                num_cols++;
3321             }
3322             else
3323             {
3324                C_offd_data[A_marker[A_col]] -= D_tmp[i]*B_offd_data[j];
3325             }
3326          }
3327       }
3328       hypre_TFree(A_marker, HYPRE_MEMORY_HOST);
3329 
3330    } /* end parallel region */
3331 
3332    /*for (i=0; i < num_cols_offd_B; i++)
3333      col_map_offd_C[i] = col_map_offd_B[i]; */
3334 
3335    num_sends_B = hypre_ParCSRCommPkgNumSends(comm_pkg_B);
3336    num_recvs_B = hypre_ParCSRCommPkgNumRecvs(comm_pkg_B);
3337    recv_procs_B = hypre_ParCSRCommPkgRecvProcs(comm_pkg_B);
3338    recv_vec_starts_B = hypre_ParCSRCommPkgRecvVecStarts(comm_pkg_B);
3339    send_procs_B = hypre_ParCSRCommPkgSendProcs(comm_pkg_B);
3340    send_map_starts_B = hypre_ParCSRCommPkgSendMapStarts(comm_pkg_B);
3341    send_map_elmts_B = hypre_ParCSRCommPkgSendMapElmts(comm_pkg_B);
3342 
3343    recv_procs_C = hypre_CTAlloc(HYPRE_Int,  num_recvs_B, HYPRE_MEMORY_HOST);
3344    recv_vec_starts_C = hypre_CTAlloc(HYPRE_Int,  num_recvs_B+1, HYPRE_MEMORY_HOST);
3345    send_procs_C = hypre_CTAlloc(HYPRE_Int,  num_sends_B, HYPRE_MEMORY_HOST);
3346    send_map_starts_C = hypre_CTAlloc(HYPRE_Int,  num_sends_B+1, HYPRE_MEMORY_HOST);
3347    send_map_elmts_C = hypre_CTAlloc(HYPRE_Int,  send_map_starts_B[num_sends_B], HYPRE_MEMORY_HOST);
3348 
3349    for (i=0; i < num_recvs_B; i++)
3350       recv_procs_C[i] = recv_procs_B[i];
3351    for (i=0; i < num_recvs_B+1; i++)
3352       recv_vec_starts_C[i] = recv_vec_starts_B[i];
3353    for (i=0; i < num_sends_B; i++)
3354       send_procs_C[i] = send_procs_B[i];
3355    for (i=0; i < num_sends_B+1; i++)
3356       send_map_starts_C[i] = send_map_starts_B[i];
3357    for (i=0; i < send_map_starts_B[num_sends_B]; i++)
3358       send_map_elmts_C[i] = send_map_elmts_B[i];
3359 
3360    comm_pkg_C = hypre_CTAlloc(hypre_ParCSRCommPkg, 1, HYPRE_MEMORY_HOST);
3361    hypre_ParCSRCommPkgComm(comm_pkg_C) = comm;
3362    hypre_ParCSRCommPkgNumRecvs(comm_pkg_C) = num_recvs_B;
3363    hypre_ParCSRCommPkgRecvProcs(comm_pkg_C) = recv_procs_C;
3364    hypre_ParCSRCommPkgRecvVecStarts(comm_pkg_C) = recv_vec_starts_C;
3365    hypre_ParCSRCommPkgNumSends(comm_pkg_C) = num_sends_B;
3366    hypre_ParCSRCommPkgSendProcs(comm_pkg_C) = send_procs_C;
3367    hypre_ParCSRCommPkgSendMapStarts(comm_pkg_C) = send_map_starts_C;
3368    hypre_ParCSRCommPkgSendMapElmts(comm_pkg_C) = send_map_elmts_C;
3369 
3370    hypre_ParCSRMatrixCommPkg(C) = comm_pkg_C;
3371 
3372    hypre_TFree(D_tmp, HYPRE_MEMORY_HOST);
3373    if (num_cols_offd_A) hypre_TFree(map_to_B, HYPRE_MEMORY_HOST);
3374 
3375    *C_ptr = C;
3376 
3377    return (hypre_error_flag);
3378 }
3379 
3380 /*--------------------------------------------------------------------------
3381  * hypre_ParTMatmul:
3382  *
3383  * Multiplies two ParCSRMatrices transpose(A) and B and returns
3384  * the product in ParCSRMatrix C
3385  *
3386  * Note that C does not own the partitionings since its row_starts
3387  * is owned by A and col_starts by B.
3388  *--------------------------------------------------------------------------*/
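/* A minimal usage sketch (the returned matrix is owned by the caller; taking
 * B = A gives the normal-equations product A^T*A):
 *
 *    hypre_ParCSRMatrix *C = hypre_ParTMatmul(A, B);   // C = A^T * B
 *    hypre_ParCSRMatrixDestroy(C);
 */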
3389 
3390 hypre_ParCSRMatrix*
3391 hypre_ParTMatmul( hypre_ParCSRMatrix  *A,
3392                   hypre_ParCSRMatrix  *B)
3393 {
3394    MPI_Comm        comm = hypre_ParCSRMatrixComm(A);
3395    hypre_ParCSRCommPkg *comm_pkg_A = hypre_ParCSRMatrixCommPkg(A);
3396 
3397    hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
3398    hypre_CSRMatrix *AT_diag = NULL;
3399 
3400    hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A);
3401    hypre_CSRMatrix *AT_offd = NULL;
3402 
3403    HYPRE_Int    num_rows_diag_A = hypre_CSRMatrixNumRows(A_diag);
3404    HYPRE_Int    num_cols_diag_A = hypre_CSRMatrixNumCols(A_diag);
3405 
3406    hypre_CSRMatrix *B_diag = hypre_ParCSRMatrixDiag(B);
3407 
3408    hypre_CSRMatrix *B_offd = hypre_ParCSRMatrixOffd(B);
3409    HYPRE_BigInt    *col_map_offd_B = hypre_ParCSRMatrixColMapOffd(B);
3410 
3411    HYPRE_BigInt    first_col_diag_B = hypre_ParCSRMatrixFirstColDiag(B);
3412    HYPRE_BigInt *col_starts_A = hypre_ParCSRMatrixColStarts(A);
3413    HYPRE_BigInt *col_starts_B = hypre_ParCSRMatrixColStarts(B);
3414    HYPRE_Int    num_rows_diag_B = hypre_CSRMatrixNumRows(B_diag);
3415    HYPRE_Int    num_cols_diag_B = hypre_CSRMatrixNumCols(B_diag);
3416    HYPRE_Int    num_cols_offd_B = hypre_CSRMatrixNumCols(B_offd);
3417 
3418    hypre_ParCSRMatrix *C;
3419    HYPRE_BigInt       *col_map_offd_C = NULL;
3420    HYPRE_Int          *map_B_to_C;
3421 
3422    hypre_CSRMatrix *C_diag = NULL;
3423    hypre_CSRMatrix *C_tmp_diag = NULL;
3424 
3425    HYPRE_Complex   *C_diag_data = NULL;
3426    HYPRE_Int       *C_diag_i = NULL;
3427    HYPRE_Int       *C_diag_j = NULL;
3428    HYPRE_BigInt    first_col_diag_C;
3429    HYPRE_BigInt    last_col_diag_C;
3430 
3431    hypre_CSRMatrix *C_offd = NULL;
3432    hypre_CSRMatrix *C_tmp_offd = NULL;
3433    hypre_CSRMatrix *C_int = NULL;
3434    hypre_CSRMatrix *C_ext = NULL;
3435    HYPRE_Int   *C_ext_i;
3436    HYPRE_BigInt   *C_ext_j;
3437    HYPRE_Complex   *C_ext_data;
3438    HYPRE_Int   *C_ext_diag_i;
3439    HYPRE_Int   *C_ext_diag_j;
3440    HYPRE_Complex   *C_ext_diag_data;
3441    HYPRE_Int   *C_ext_offd_i;
3442    HYPRE_Int   *C_ext_offd_j;
3443    HYPRE_Complex   *C_ext_offd_data;
3444    HYPRE_Int    C_ext_size = 0;
3445    HYPRE_Int    C_ext_diag_size = 0;
3446    HYPRE_Int    C_ext_offd_size = 0;
3447 
3448    HYPRE_Int   *C_tmp_diag_i;
3449    HYPRE_Int   *C_tmp_diag_j;
3450    HYPRE_Complex   *C_tmp_diag_data;
3451    HYPRE_Int   *C_tmp_offd_i;
3452    HYPRE_Int   *C_tmp_offd_j;
3453    HYPRE_Complex   *C_tmp_offd_data;
3454 
3455    HYPRE_Complex   *C_offd_data=NULL;
3456    HYPRE_Int       *C_offd_i=NULL;
3457    HYPRE_Int       *C_offd_j=NULL;
3458 
3459    HYPRE_BigInt    *temp;
3460    HYPRE_Int       *send_map_starts_A;
3461    HYPRE_Int       *send_map_elmts_A;
3462    HYPRE_Int        num_sends_A;
3463 
3464    HYPRE_Int        num_cols_offd_C = 0;
3465 
3466    HYPRE_Int       *P_marker;
3467 
3468    HYPRE_Int        i, j;
3469    HYPRE_Int        i1, j_indx;
3470 
3471    HYPRE_BigInt     nrows_A, ncols_A;
3472    HYPRE_BigInt     nrows_B, ncols_B;
3473    /*HYPRE_Int              allsquare = 0;*/
3474    HYPRE_Int        cnt, cnt_offd, cnt_diag;
3475    HYPRE_BigInt     value;
3476    HYPRE_Int        num_procs, my_id;
3477    HYPRE_Int        max_num_threads;
3478    HYPRE_Int       *C_diag_array = NULL;
3479    HYPRE_Int       *C_offd_array = NULL;
3480 
3481    HYPRE_BigInt first_row_index, first_col_diag;
3482    HYPRE_Int local_num_rows, local_num_cols;
3483 
3484    nrows_A = hypre_ParCSRMatrixGlobalNumRows(A);
3485    ncols_A = hypre_ParCSRMatrixGlobalNumCols(A);
3486    nrows_B = hypre_ParCSRMatrixGlobalNumRows(B);
3487    ncols_B = hypre_ParCSRMatrixGlobalNumCols(B);
3488 
3489    hypre_MPI_Comm_size(comm,&num_procs);
3490    hypre_MPI_Comm_rank(comm, &my_id);
3491    max_num_threads = hypre_NumThreads();
3492 
3493    if (nrows_A != nrows_B || num_rows_diag_A != num_rows_diag_B)
3494    {
3495       hypre_error_w_msg(HYPRE_ERROR_GENERIC," Error! Incompatible matrix dimensions!\n");
3496       return NULL;
3497    }
3498 
3499    HYPRE_MemoryLocation memory_location_A = hypre_ParCSRMatrixMemoryLocation(A);
3500    HYPRE_MemoryLocation memory_location_B = hypre_ParCSRMatrixMemoryLocation(B);
3501 
3502    /* RL: TODO cannot guarantee, maybe should never assert
3503    hypre_assert(memory_location_A == memory_location_B);
3504    */
3505 
3506    /* RL: in the case of A=H, B=D, or A=D, B=H, let C = D,
3507     * not sure if this is the right thing to do.
3508     * Also, need something like this in other places
3509     * TODO */
3510    HYPRE_MemoryLocation memory_location_C = hypre_max(memory_location_A, memory_location_B);
3511 
3512    /*if (num_cols_diag_A == num_cols_diag_B) allsquare = 1;*/
3513 
3514    /*---------------------------------------------------------------------
3515     * If no CommPkg exists for A, one is generated using equally
3516     * load-balanced partitionings
3517     *--------------------------------------------------------------------*/
3518 
3519    HYPRE_ANNOTATE_FUNC_BEGIN;
3520 
3521    if (!comm_pkg_A)
3522    {
3523       hypre_MatvecCommPkgCreate(A);
3524       comm_pkg_A = hypre_ParCSRMatrixCommPkg(A);
3525    }
3526 
3527    hypre_CSRMatrixTranspose(A_diag, &AT_diag, 1);
3528    hypre_CSRMatrixTranspose(A_offd, &AT_offd, 1);
3529 
3530    C_tmp_diag = hypre_CSRMatrixMultiply(AT_diag, B_diag);
3531    C_ext_size = 0;
3532    if (num_procs > 1)
3533    {
3534       hypre_CSRMatrix *C_int_diag;
3535       hypre_CSRMatrix *C_int_offd;
3536       void            *request;
3537 
3538       C_tmp_offd = hypre_CSRMatrixMultiply(AT_diag, B_offd);
3539       C_int_diag = hypre_CSRMatrixMultiply(AT_offd, B_diag);
3540       C_int_offd = hypre_CSRMatrixMultiply(AT_offd, B_offd);
3541       hypre_ParCSRMatrixDiag(B) = C_int_diag;
3542       hypre_ParCSRMatrixOffd(B) = C_int_offd;
3543       C_int = hypre_MergeDiagAndOffd(B);
3544       hypre_ParCSRMatrixDiag(B) = B_diag;
3545       hypre_ParCSRMatrixOffd(B) = B_offd;
3546       hypre_ExchangeExternalRowsInit(C_int, comm_pkg_A, &request);
3547       C_ext = hypre_ExchangeExternalRowsWait(request);
3548       C_ext_i = hypre_CSRMatrixI(C_ext);
3549       C_ext_j = hypre_CSRMatrixBigJ(C_ext);
3550       C_ext_data = hypre_CSRMatrixData(C_ext);
3551       C_ext_size = C_ext_i[hypre_CSRMatrixNumRows(C_ext)];
3552 
3553       hypre_CSRMatrixDestroy(C_int);
3554       hypre_CSRMatrixDestroy(C_int_diag);
3555       hypre_CSRMatrixDestroy(C_int_offd);
3556    }
3557    else
3558    {
3559       C_tmp_offd = hypre_CSRMatrixCreate(num_cols_diag_A, 0, 0);
3560       hypre_CSRMatrixInitialize(C_tmp_offd);
3561       hypre_CSRMatrixNumRownnz(C_tmp_offd) = 0;
3562    }
3563    hypre_CSRMatrixDestroy(AT_diag);
3564    hypre_CSRMatrixDestroy(AT_offd);
3565 
3566    /*-----------------------------------------------------------------------
3567     *  Add contents of C_ext to C_tmp_diag and C_tmp_offd
3568     *  to obtain C_diag and C_offd
3569     *-----------------------------------------------------------------------*/
3570 
3571    /* check for new nonzero columns in C_offd generated through C_ext */
3572 
3573    first_col_diag_C = first_col_diag_B;
3574    last_col_diag_C = first_col_diag_B + (HYPRE_BigInt)num_cols_diag_B - 1;
3575 
3576    C_tmp_diag_i = hypre_CSRMatrixI(C_tmp_diag);
3577    if (C_ext_size || num_cols_offd_B)
3578    {
3579       HYPRE_Int C_ext_num_rows;
3580 
3581       num_sends_A = hypre_ParCSRCommPkgNumSends(comm_pkg_A);
3582       send_map_starts_A = hypre_ParCSRCommPkgSendMapStarts(comm_pkg_A);
3583       send_map_elmts_A = hypre_ParCSRCommPkgSendMapElmts(comm_pkg_A);
3584       C_ext_num_rows =  send_map_starts_A[num_sends_A];
3585 
3586       C_ext_diag_i = hypre_CTAlloc(HYPRE_Int,  C_ext_num_rows+1, HYPRE_MEMORY_HOST);
3587       C_ext_offd_i = hypre_CTAlloc(HYPRE_Int,  C_ext_num_rows+1, HYPRE_MEMORY_HOST);
3588       temp = hypre_CTAlloc(HYPRE_BigInt,  C_ext_size+num_cols_offd_B, HYPRE_MEMORY_HOST);
3589       C_ext_diag_size = 0;
3590       C_ext_offd_size = 0;
3591       for (i = 0; i < C_ext_num_rows; i++)
3592       {
3593          for (j = C_ext_i[i]; j < C_ext_i[i+1]; j++)
3594          {
3595             if (C_ext_j[j] < first_col_diag_C ||
3596                 C_ext_j[j] > last_col_diag_C)
3597             {
3598                temp[C_ext_offd_size++] = C_ext_j[j];
3599             }
3600             else
3601             {
3602                C_ext_diag_size++;
3603             }
3604          }
3605          C_ext_diag_i[i+1] = C_ext_diag_size;
3606          C_ext_offd_i[i+1] = C_ext_offd_size;
3607       }
3608       cnt = C_ext_offd_size;
3609       for (i = 0; i < num_cols_offd_B; i++)
3610       {
3611          temp[cnt++] = col_map_offd_B[i];
3612       }
3613 
3614       if (cnt)
3615       {
3616          hypre_BigQsort0(temp,0,cnt-1);
3617          value = temp[0];
3618          num_cols_offd_C = 1;
3619          for (i = 1; i < cnt; i++)
3620          {
3621             if (temp[i] > value)
3622             {
3623                value = temp[i];
3624                temp[num_cols_offd_C++] = value;
3625             }
3626          }
3627       }
3628 
3629       if (num_cols_offd_C)
3630       {
3631          col_map_offd_C = hypre_CTAlloc(HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_HOST);
3632       }
3633       for (i = 0; i < num_cols_offd_C; i++)
3634       {
3635          col_map_offd_C[i] = temp[i];
3636       }
3637 
3638       hypre_TFree(temp, HYPRE_MEMORY_HOST);
3639 
3640       if (C_ext_diag_size)
3641       {
3642          C_ext_diag_j = hypre_CTAlloc(HYPRE_Int,  C_ext_diag_size, HYPRE_MEMORY_HOST);
3643          C_ext_diag_data = hypre_CTAlloc(HYPRE_Complex,  C_ext_diag_size, HYPRE_MEMORY_HOST);
3644       }
3645       if (C_ext_offd_size)
3646       {
3647          C_ext_offd_j = hypre_CTAlloc(HYPRE_Int,  C_ext_offd_size, HYPRE_MEMORY_HOST);
3648          C_ext_offd_data = hypre_CTAlloc(HYPRE_Complex,  C_ext_offd_size, HYPRE_MEMORY_HOST);
3649       }
3650 
3651       C_tmp_diag_j = hypre_CSRMatrixJ(C_tmp_diag);
3652       C_tmp_diag_data = hypre_CSRMatrixData(C_tmp_diag);
3653 
3654       C_tmp_offd_i = hypre_CSRMatrixI(C_tmp_offd);
3655       C_tmp_offd_j = hypre_CSRMatrixJ(C_tmp_offd);
3656       C_tmp_offd_data = hypre_CSRMatrixData(C_tmp_offd);
3657 
3658       cnt_offd = 0;
3659       cnt_diag = 0;
3660       for (i = 0; i < C_ext_num_rows; i++)
3661       {
3662          for (j = C_ext_i[i]; j < C_ext_i[i+1]; j++)
3663          {
3664             if (C_ext_j[j] < first_col_diag_C ||
3665                 C_ext_j[j] > last_col_diag_C)
3666             {
3667                C_ext_offd_j[cnt_offd] = hypre_BigBinarySearch(col_map_offd_C,
3668                                                               C_ext_j[j],
3669                                                               num_cols_offd_C);
3670                C_ext_offd_data[cnt_offd++] = C_ext_data[j];
3671             }
3672             else
3673             {
3674                C_ext_diag_j[cnt_diag] = (HYPRE_Int)(C_ext_j[j] - first_col_diag_C);
3675                C_ext_diag_data[cnt_diag++] = C_ext_data[j];
3676             }
3677          }
3678       }
3679    }
3680 
3681    if (C_ext)
3682    {
3683       hypre_CSRMatrixDestroy(C_ext);
3684       C_ext = NULL;
3685    }
3686 
3687    if (num_cols_offd_B)
3688    {
3689       map_B_to_C = hypre_CTAlloc(HYPRE_Int, num_cols_offd_B, HYPRE_MEMORY_HOST);
3690 
3691       cnt = 0;
3692       for (i = 0; i < num_cols_offd_C; i++)
3693       {
3694          if (col_map_offd_C[i] == col_map_offd_B[cnt])
3695          {
3696             map_B_to_C[cnt++] = i;
3697             if (cnt == num_cols_offd_B) break;
3698          }
3699       }
3700       for (i = 0; i < hypre_CSRMatrixI(C_tmp_offd)[hypre_CSRMatrixNumRows(C_tmp_offd)]; i++)
3701       {
3702          j_indx = C_tmp_offd_j[i];
3703          C_tmp_offd_j[i] = map_B_to_C[j_indx];
3704       }
3705    }
3706 
3707    /*-----------------------------------------------------------------------
3708     *  Need to compute:
3709     *    C_diag = C_tmp_diag + C_ext_diag
3710     *    C_offd = C_tmp_offd + C_ext_offd
3711     *
3712     *  First generate structure
3713     *-----------------------------------------------------------------------*/
3714 
3715    if (C_ext_size || num_cols_offd_B)
3716    {
3717       C_diag_i = hypre_CTAlloc(HYPRE_Int, num_cols_diag_A+1, memory_location_C);
3718       C_offd_i = hypre_CTAlloc(HYPRE_Int, num_cols_diag_A+1, memory_location_C);
3719 
3720       C_diag_array = hypre_CTAlloc(HYPRE_Int,  max_num_threads, HYPRE_MEMORY_HOST);
3721       C_offd_array = hypre_CTAlloc(HYPRE_Int,  max_num_threads, HYPRE_MEMORY_HOST);
3722 
3723 #ifdef HYPRE_USING_OPENMP
3724 #pragma omp parallel
3725 #endif
3726       {
3727          HYPRE_Int *B_marker = NULL;
3728          HYPRE_Int *B_marker_offd = NULL;
3729          HYPRE_Int ik, jk, j1, j2, jcol;
3730          HYPRE_Int ns, ne, ii, nnz_d, nnz_o;
3731          HYPRE_Int rest, size;
3732          HYPRE_Int num_threads = hypre_NumActiveThreads();
3733 
3734          size = num_cols_diag_A/num_threads;
3735          rest = num_cols_diag_A - size*num_threads;
3736          ii = hypre_GetThreadNum();
3737          if (ii < rest)
3738          {
3739             ns = ii*size+ii;
3740             ne = (ii+1)*size+ii+1;
3741          }
3742          else
3743          {
3744             ns = ii*size+rest;
3745             ne = (ii+1)*size+rest;
3746          }
3747 
3748          B_marker = hypre_CTAlloc(HYPRE_Int,  num_cols_diag_B, HYPRE_MEMORY_HOST);
3749          B_marker_offd = hypre_CTAlloc(HYPRE_Int,  num_cols_offd_C, HYPRE_MEMORY_HOST);
3750 
3751          for (ik = 0; ik < num_cols_diag_B; ik++)
3752          {
3753             B_marker[ik] = -1;
3754          }
3755 
3756          for (ik = 0; ik < num_cols_offd_C; ik++)
3757          {
3758             B_marker_offd[ik] = -1;
3759          }
3760 
3761          nnz_d = 0;
3762          nnz_o = 0;
3763          for (ik = ns; ik < ne; ik++)
3764          {
3765             for (jk = C_tmp_diag_i[ik]; jk < C_tmp_diag_i[ik+1]; jk++)
3766             {
3767                jcol = C_tmp_diag_j[jk];
3768                B_marker[jcol] = ik;
3769                nnz_d++;
3770             }
3771 
3772             for (jk = C_tmp_offd_i[ik]; jk < C_tmp_offd_i[ik+1]; jk++)
3773             {
3774                jcol = C_tmp_offd_j[jk];
3775                B_marker_offd[jcol] = ik;
3776                nnz_o++;
3777             }
3778 
3779             for (jk = 0; jk < num_sends_A; jk++)
3780             {
3781                for (j1 = send_map_starts_A[jk]; j1 < send_map_starts_A[jk+1]; j1++)
3782                {
3783                   if (send_map_elmts_A[j1] == ik)
3784                   {
3785                      for (j2 = C_ext_diag_i[j1]; j2 < C_ext_diag_i[j1+1]; j2++)
3786                      {
3787                         jcol = C_ext_diag_j[j2];
3788                         if (B_marker[jcol] < ik)
3789                         {
3790                            B_marker[jcol] = ik;
3791                            nnz_d++;
3792                         }
3793                      }
3794                      for (j2 = C_ext_offd_i[j1]; j2 < C_ext_offd_i[j1+1]; j2++)
3795                      {
3796                         jcol = C_ext_offd_j[j2];
3797                         if (B_marker_offd[jcol] < ik)
3798                         {
3799                            B_marker_offd[jcol] = ik;
3800                            nnz_o++;
3801                         }
3802                      }
3803                      break;
3804                   }
3805                }
3806             }
3807             C_diag_array[ii] = nnz_d;
3808             C_offd_array[ii] = nnz_o;
3809          }
3810 #ifdef HYPRE_USING_OPENMP
3811 #pragma omp barrier
3812 #endif
3813 
3814          if (ii == 0)
3815          {
3816             nnz_d = 0;
3817             nnz_o = 0;
3818             for (ik = 0; ik < num_threads-1; ik++)
3819             {
3820                C_diag_array[ik+1] += C_diag_array[ik];
3821                C_offd_array[ik+1] += C_offd_array[ik];
3822             }
3823             nnz_d = C_diag_array[num_threads-1];
3824             nnz_o = C_offd_array[num_threads-1];
3825             C_diag_i[num_cols_diag_A] = nnz_d;
3826             C_offd_i[num_cols_diag_A] = nnz_o;
3827 
3828             C_diag = hypre_CSRMatrixCreate(num_cols_diag_A, num_cols_diag_A, nnz_d);
3829             C_offd = hypre_CSRMatrixCreate(num_cols_diag_A, num_cols_offd_C, nnz_o);
3830             hypre_CSRMatrixI(C_diag) = C_diag_i;
3831             hypre_CSRMatrixInitialize_v2(C_diag, 0, memory_location_C);
3832             C_diag_j = hypre_CSRMatrixJ(C_diag);
3833             C_diag_data = hypre_CSRMatrixData(C_diag);
3834             hypre_CSRMatrixI(C_offd) = C_offd_i;
3835             hypre_CSRMatrixInitialize_v2(C_offd, 0, memory_location_C);
3836             C_offd_j = hypre_CSRMatrixJ(C_offd);
3837             C_offd_data = hypre_CSRMatrixData(C_offd);
3838          }
3839 #ifdef HYPRE_USING_OPENMP
3840 #pragma omp barrier
3841 #endif
3842 
3843          /*-----------------------------------------------------------------------
3844           *  Need to compute C_diag = C_tmp_diag + C_ext_diag
3845           *  and  C_offd = C_tmp_offd + C_ext_offd   !!!!
3846           *  Now fill in values
3847           *-----------------------------------------------------------------------*/
3848 
3849          for (ik = 0; ik < num_cols_diag_B; ik++)
3850          {
3851             B_marker[ik] = -1;
3852          }
3853 
3854          for (ik = 0; ik < num_cols_offd_C; ik++)
3855          {
3856             B_marker_offd[ik] = -1;
3857          }
3858 
3859          /*-----------------------------------------------------------------------
3860           *  Populate matrices
3861           *-----------------------------------------------------------------------*/
3862 
3863          nnz_d = 0;
3864          nnz_o = 0;
3865          if (ii)
3866          {
3867             nnz_d = C_diag_array[ii-1];
3868             nnz_o = C_offd_array[ii-1];
3869          }
3870          for (ik = ns; ik < ne; ik++)
3871          {
3872             C_diag_i[ik] = nnz_d;
3873             C_offd_i[ik] = nnz_o;
3874             for (jk = C_tmp_diag_i[ik]; jk < C_tmp_diag_i[ik+1]; jk++)
3875             {
3876                jcol = C_tmp_diag_j[jk];
3877                C_diag_j[nnz_d] = jcol;
3878                C_diag_data[nnz_d] = C_tmp_diag_data[jk];
3879                B_marker[jcol] = nnz_d;
3880                nnz_d++;
3881             }
3882 
3883             for (jk = C_tmp_offd_i[ik]; jk < C_tmp_offd_i[ik+1]; jk++)
3884             {
3885                jcol = C_tmp_offd_j[jk];
3886                C_offd_j[nnz_o] = jcol;
3887                C_offd_data[nnz_o] = C_tmp_offd_data[jk];
3888                B_marker_offd[jcol] = nnz_o;
3889                nnz_o++;
3890             }
3891 
3892             for (jk = 0; jk < num_sends_A; jk++)
3893             {
3894                for (j1 = send_map_starts_A[jk]; j1 < send_map_starts_A[jk+1]; j1++)
3895                {
3896                   if (send_map_elmts_A[j1] == ik)
3897                   {
3898                      for (j2 = C_ext_diag_i[j1]; j2 < C_ext_diag_i[j1+1]; j2++)
3899                      {
3900                         jcol = C_ext_diag_j[j2];
3901                         if (B_marker[jcol] < C_diag_i[ik])
3902                         {
3903                            C_diag_j[nnz_d] = jcol;
3904                            C_diag_data[nnz_d] = C_ext_diag_data[j2];
3905                            B_marker[jcol] = nnz_d;
3906                            nnz_d++;
3907                         }
3908                         else
3909                         {
3910                            C_diag_data[B_marker[jcol]] += C_ext_diag_data[j2];
3911                         }
3912                      }
3913                      for (j2 = C_ext_offd_i[j1]; j2 < C_ext_offd_i[j1+1]; j2++)
3914                      {
3915                         jcol = C_ext_offd_j[j2];
3916                         if (B_marker_offd[jcol] < C_offd_i[ik])
3917                         {
3918                            C_offd_j[nnz_o] = jcol;
3919                            C_offd_data[nnz_o] = C_ext_offd_data[j2];
3920                            B_marker_offd[jcol] = nnz_o;
3921                            nnz_o++;
3922                         }
3923                         else
3924                         {
3925                            C_offd_data[B_marker_offd[jcol]] += C_ext_offd_data[j2];
3926                         }
3927                      }
3928                      break;
3929                   }
3930                }
3931             }
3932          }
3933          hypre_TFree(B_marker, HYPRE_MEMORY_HOST);
3934          hypre_TFree(B_marker_offd, HYPRE_MEMORY_HOST);
3935 
3936       } /*end parallel region */
3937 
3938       hypre_TFree(C_diag_array, HYPRE_MEMORY_HOST);
3939       hypre_TFree(C_offd_array, HYPRE_MEMORY_HOST);
3940    }
3941 
3942    /*C = hypre_ParCSRMatrixCreate(comm, ncols_A, ncols_B, col_starts_A,
3943      col_starts_B, num_cols_offd_C, nnz_diag, nnz_offd);
3944 
3945      hypre_CSRMatrixDestroy(hypre_ParCSRMatrixDiag(C));
3946      hypre_CSRMatrixDestroy(hypre_ParCSRMatrixOffd(C)); */
3947    /* row_starts[0] is start of local rows.  row_starts[1] is start of next
3948       processor's rows */
3949    first_row_index = col_starts_A[0];
3950    local_num_rows = (HYPRE_Int)(col_starts_A[1]-first_row_index );
3951    first_col_diag = col_starts_B[0];
3952    local_num_cols = (HYPRE_Int)(col_starts_B[1]-first_col_diag);
3953 
3954    C = hypre_CTAlloc(hypre_ParCSRMatrix, 1, HYPRE_MEMORY_HOST);
3955    hypre_ParCSRMatrixComm(C) = comm;
3956    hypre_ParCSRMatrixGlobalNumRows(C) = ncols_A;
3957    hypre_ParCSRMatrixGlobalNumCols(C) = ncols_B;
3958    hypre_ParCSRMatrixFirstRowIndex(C) = first_row_index;
3959    hypre_ParCSRMatrixFirstColDiag(C) = first_col_diag;
3960    hypre_ParCSRMatrixLastRowIndex(C) = first_row_index + (HYPRE_BigInt)local_num_rows - 1;
3961    hypre_ParCSRMatrixLastColDiag(C) = first_col_diag + (HYPRE_BigInt)local_num_cols - 1;
3962    hypre_ParCSRMatrixColMapOffd(C) = NULL;
3963    hypre_ParCSRMatrixAssumedPartition(C) = NULL;
3964    hypre_ParCSRMatrixCommPkg(C) = NULL;
3965    hypre_ParCSRMatrixCommPkgT(C) = NULL;
3966 
3967    /* C row/col starts*/
3968    hypre_ParCSRMatrixRowStarts(C)[0] = col_starts_A[0];
3969    hypre_ParCSRMatrixRowStarts(C)[1] = col_starts_A[1];
3970    hypre_ParCSRMatrixColStarts(C)[0] = col_starts_B[0];
3971    hypre_ParCSRMatrixColStarts(C)[1] = col_starts_B[1];
3972 
3973    /* set defaults */
3974    hypre_ParCSRMatrixOwnsData(C) = 1;
3975    hypre_ParCSRMatrixRowindices(C) = NULL;
3976    hypre_ParCSRMatrixRowvalues(C) = NULL;
3977    hypre_ParCSRMatrixGetrowactive(C) = 0;
3978 
3979    if (C_diag)
3980    {
3981       hypre_CSRMatrixSetRownnz(C_diag);
3982       hypre_ParCSRMatrixDiag(C) = C_diag;
3983    }
3984    else
3985    {
3986       hypre_ParCSRMatrixDiag(C) = C_tmp_diag;
3987    }
3988 
3989    if (C_offd)
3990    {
3991       hypre_CSRMatrixSetRownnz(C_offd);
3992       hypre_ParCSRMatrixOffd(C) = C_offd;
3993    }
3994    else
3995    {
3996       hypre_ParCSRMatrixOffd(C) = C_tmp_offd;
3997    }
3998 
3999    hypre_CSRMatrixMemoryLocation(hypre_ParCSRMatrixDiag(C)) = memory_location_C;
4000    hypre_CSRMatrixMemoryLocation(hypre_ParCSRMatrixOffd(C)) = memory_location_C;
4001 
4002    if (num_cols_offd_C)
4003    {
4004       HYPRE_Int jj_count_offd, nnz_offd;
4005       HYPRE_BigInt *new_col_map_offd_C = NULL;
4006 
4007       P_marker = hypre_CTAlloc(HYPRE_Int, num_cols_offd_C, HYPRE_MEMORY_HOST);
4008       for (i = 0; i < num_cols_offd_C; i++)
4009       {
4010          P_marker[i] = -1;
4011       }
4012 
4013       jj_count_offd = 0;
4014       nnz_offd = C_offd_i[num_cols_diag_A];
4015       for (i = 0; i < nnz_offd; i++)
4016       {
4017          i1 = C_offd_j[i];
4018          if (P_marker[i1])
4019          {
4020             P_marker[i1] = 0;
4021             jj_count_offd++;
4022          }
4023       }
4024 
4025       if (jj_count_offd < num_cols_offd_C)
4026       {
4027          new_col_map_offd_C = hypre_CTAlloc(HYPRE_BigInt, jj_count_offd, HYPRE_MEMORY_HOST);
4028          jj_count_offd = 0;
4029          for (i = 0; i < num_cols_offd_C; i++)
4030          {
4031             if (!P_marker[i])
4032             {
4033                P_marker[i] = jj_count_offd;
4034                new_col_map_offd_C[jj_count_offd++] = col_map_offd_C[i];
4035             }
4036          }
4037 
4038          for (i = 0; i < nnz_offd; i++)
4039          {
4040             i1 = C_offd_j[i];
4041             C_offd_j[i] = P_marker[i1];
4042          }
4043 
4044          num_cols_offd_C = jj_count_offd;
4045          hypre_TFree(col_map_offd_C, HYPRE_MEMORY_HOST);
4046          col_map_offd_C = new_col_map_offd_C;
4047          hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(C)) = num_cols_offd_C;
4048       }
4049       hypre_TFree(P_marker, HYPRE_MEMORY_HOST);
4050    }
4051 
4052    hypre_ParCSRMatrixColMapOffd(C) = col_map_offd_C;
4053 
4054    /*-----------------------------------------------------------------------
4055     *  Free various arrays
4056     *-----------------------------------------------------------------------*/
4057    if (C_ext_size || num_cols_offd_B)
4058    {
4059       hypre_TFree(C_ext_diag_i, HYPRE_MEMORY_HOST);
4060       hypre_TFree(C_ext_offd_i, HYPRE_MEMORY_HOST);
4061    }
4062 
4063    if (C_ext_diag_size)
4064    {
4065       hypre_TFree(C_ext_diag_j, HYPRE_MEMORY_HOST);
4066       hypre_TFree(C_ext_diag_data, HYPRE_MEMORY_HOST);
4067    }
4068 
4069    if (C_ext_offd_size)
4070    {
4071       hypre_TFree(C_ext_offd_j, HYPRE_MEMORY_HOST);
4072       hypre_TFree(C_ext_offd_data, HYPRE_MEMORY_HOST);
4073    }
4074 
4075    if (num_cols_offd_B)
4076    {
4077       hypre_TFree(map_B_to_C, HYPRE_MEMORY_HOST);
4078    }
4079 
4080    if (C_diag)
4081    {
4082       hypre_CSRMatrixDestroy(C_tmp_diag);
4083    }
4084 
4085    if (C_offd)
4086    {
4087      hypre_CSRMatrixDestroy(C_tmp_offd);
4088    }
4089 
4090 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
4091    if ( hypre_GetExecPolicy2(memory_location_A, memory_location_B) == HYPRE_EXEC_DEVICE )
4092    {
4093       hypre_CSRMatrixMoveDiagFirstDevice(hypre_ParCSRMatrixDiag(C));
4094       hypre_SyncCudaComputeStream(hypre_handle());
4095    }
4096 #endif
4097 
4098    HYPRE_ANNOTATE_FUNC_END;
4099 
4100    return C;
4101 }
4102 
4103 /*--------------------------------------------------------------------------
4104  * hypre_ParvecBdiagInvScal
4105  *--------------------------------------------------------------------------*/
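/* A minimal usage sketch (assumption: the inverted diagonal blocks have
 * already been computed and cached on A with the same blockSize, e.g. by a
 * prior call to hypre_ParcsrBdiagInvScal below, so that A->bdiaginv,
 * A->bdiag_size and A->bdiaginv_comm_pkg are set):
 *
 *    hypre_ParVector *bs = NULL;
 *    hypre_ParvecBdiagInvScal(b, blockSize, &bs, A);   // bs = inv(block-diag(A)) * b
 */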
4106 
4107 HYPRE_Int
4108 hypre_ParvecBdiagInvScal( hypre_ParVector     *b,
4109                           HYPRE_Int            blockSize,
4110                           hypre_ParVector    **bs,
4111                           hypre_ParCSRMatrix  *A)
4112 {
4113    MPI_Comm         comm     = hypre_ParCSRMatrixComm(b);
4114    HYPRE_Int        num_procs, my_id;
4115    hypre_MPI_Comm_rank(comm, &my_id);
4116    hypre_MPI_Comm_size(comm, &num_procs);
4117 
4118    HYPRE_Int i, j, s, block_start, block_end;
4119    HYPRE_BigInt nrow_global = hypre_ParVectorGlobalSize(b);
4120    HYPRE_BigInt first_row   = hypre_ParVectorFirstIndex(b);
4121    HYPRE_BigInt last_row    = hypre_ParVectorLastIndex(b);
4122    HYPRE_BigInt end_row     = last_row + 1; /* one past-the-last */
4123    HYPRE_BigInt first_row_block = first_row / (HYPRE_BigInt)(blockSize) * (HYPRE_BigInt)blockSize;
4124    HYPRE_BigInt end_row_block   = hypre_min( (last_row / (HYPRE_BigInt)blockSize + 1) * (HYPRE_BigInt)blockSize, nrow_global );
4125 
4126    hypre_assert(blockSize == A->bdiag_size);
4127    HYPRE_Complex *bdiaginv = A->bdiaginv;
4128    hypre_ParCSRCommPkg *comm_pkg = A->bdiaginv_comm_pkg;
4129 
4130    HYPRE_Complex *dense = bdiaginv;
4131 
4132    //for (i=first_row_block; i < end_row; i+=blockSize) ;
4133    //printf("===[%d %d), [ %d %d ) %d === \n", first_row, end_row, first_row_block, end_row_block, i);
4134 
4135    /* local vector of b */
4136    hypre_Vector    *b_local      = hypre_ParVectorLocalVector(b);
4137    HYPRE_Complex   *b_local_data = hypre_VectorData(b_local);
4138    /* number of sends (#procs) */
4139    HYPRE_Int num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
4140    /* number of rows to send */
4141    HYPRE_Int num_rows_send = hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends);
4142    /* number of recvs (#procs) */
4143    HYPRE_Int num_recvs = hypre_ParCSRCommPkgNumRecvs(comm_pkg);
4144    /* number of rows to recv */
4145    HYPRE_Int num_rows_recv = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs);
4146    hypre_ParCSRCommHandle  *comm_handle;
4147 
4148    HYPRE_BigInt *part = hypre_TAlloc(HYPRE_BigInt, 2, HYPRE_MEMORY_HOST);
4149    hypre_TMemcpy(part, hypre_ParVectorPartitioning(b), HYPRE_BigInt, 2,
4150                  HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
4151 
4152    hypre_ParVector *bnew = hypre_ParVectorCreate( hypre_ParVectorComm(b),
4153                                                   hypre_ParVectorGlobalSize(b), part );
4154    hypre_ParVectorInitialize(bnew);
4155    hypre_Vector    *bnew_local      = hypre_ParVectorLocalVector(bnew);
4156    HYPRE_Complex   *bnew_local_data = hypre_VectorData(bnew_local);
4157 
4158    /* send and recv b */
4159    HYPRE_Complex *send_b = hypre_TAlloc(HYPRE_Complex, num_rows_send, HYPRE_MEMORY_HOST);
4160    HYPRE_Complex *recv_b = hypre_TAlloc(HYPRE_Complex, num_rows_recv, HYPRE_MEMORY_HOST);
4161 
4162    for (i = 0; i < num_rows_send; i++)
4163    {
4164       j = hypre_ParCSRCommPkgSendMapElmt(comm_pkg, i);
4165       send_b[i] = b_local_data[j];
4166    }
4167    comm_handle = hypre_ParCSRCommHandleCreate(1, comm_pkg, send_b, recv_b);
4168    /* ... */
4169    hypre_ParCSRCommHandleDestroy(comm_handle);
4170 
4171    for (block_start = first_row_block; block_start < end_row_block; block_start += blockSize)
4172    {
4173       HYPRE_BigInt big_i;
4174       block_end = hypre_min(block_start + (HYPRE_BigInt)blockSize, nrow_global);
4175       s = (HYPRE_Int)(block_end - block_start);
4176       for (big_i = block_start; big_i < block_end; big_i++)
4177       {
4178          if (big_i < first_row || big_i >= end_row)
4179          {
4180             continue;
4181          }
4182 
4183          HYPRE_Int local_i = (HYPRE_Int)(big_i - first_row);
4184          HYPRE_Int block_i = (HYPRE_Int)(big_i - block_start);
4185 
4186          bnew_local_data[local_i] = 0.0;
4187 
4188          for (j = 0; j < s; j++)
4189          {
4190             HYPRE_BigInt global_rid = block_start + (HYPRE_BigInt)j;
4191             HYPRE_Complex val = dense[block_i + j*blockSize];
4192             if (val == 0.0)
4193             {
4194                continue;
4195             }
4196             if (global_rid >= first_row && global_rid < end_row)
4197             {
4198                HYPRE_Int rid = (HYPRE_Int)(global_rid - first_row);
4199                bnew_local_data[local_i] += val * b_local_data[rid];
4200             }
4201             else
4202             {
4203                HYPRE_Int rid;
4204 
4205                if (global_rid < first_row)
4206                {
4207                   rid = (HYPRE_Int)(global_rid - first_row_block);
4208                }
4209                else
4210                {
4211                   rid = (HYPRE_Int)(first_row - first_row_block + global_rid - end_row);
4212                }
4213                bnew_local_data[local_i] += val * recv_b[rid];
4214             }
4215          }
4216       }
4217       dense += blockSize * blockSize;
4218    }
4219 
4220    hypre_TFree(send_b, HYPRE_MEMORY_HOST);
4221    hypre_TFree(recv_b, HYPRE_MEMORY_HOST);
4222    *bs = bnew;
4223 
4224    return hypre_error_flag;
4225 }
4226 
4227 /**
4228  * @brief Compute As = B^{-1}*A, where B is the block diagonal of A
4229  * @param[in]  A        : input ParCSR matrix
4230  * @param[in]  blockSize: block size of the diagonal blocks
4231  * @param[out] As       : output matrix, As = B^{-1}*A
4232  * @return hypre_error_flag
4233  * @warning assumes A is globally square (N_ROW == N_COL)
4234  */
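/* A minimal usage sketch (As is created by the call and owned by the caller;
 * A is assumed to be globally square):
 *
 *    hypre_ParCSRMatrix *As = NULL;
 *    hypre_ParcsrBdiagInvScal(A, blockSize, &As);   // As = inv(block-diag(A)) * A
 *    hypre_ParCSRMatrixDestroy(As);
 */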
4235 HYPRE_Int
4236 hypre_ParcsrBdiagInvScal( hypre_ParCSRMatrix   *A,
4237                           HYPRE_Int             blockSize,
4238                           hypre_ParCSRMatrix  **As)
4239 {
4240    MPI_Comm         comm     = hypre_ParCSRMatrixComm(A);
4241    HYPRE_Int        num_procs, my_id;
4242    hypre_MPI_Comm_rank(comm, &my_id);
4243    hypre_MPI_Comm_size(comm, &num_procs);
4244 
4245    HYPRE_Int i, j, k, s;
4246    HYPRE_BigInt block_start, block_end;
4247    /* diag part of A */
4248    hypre_CSRMatrix *A_diag   = hypre_ParCSRMatrixDiag(A);
4249    HYPRE_Real      *A_diag_a = hypre_CSRMatrixData(A_diag);
4250    HYPRE_Int       *A_diag_i = hypre_CSRMatrixI(A_diag);
4251    HYPRE_Int       *A_diag_j = hypre_CSRMatrixJ(A_diag);
4252    /* off-diag part of A */
4253    hypre_CSRMatrix *A_offd   = hypre_ParCSRMatrixOffd(A);
4254    HYPRE_Real      *A_offd_a = hypre_CSRMatrixData(A_offd);
4255    HYPRE_Int       *A_offd_i = hypre_CSRMatrixI(A_offd);
4256    HYPRE_Int       *A_offd_j = hypre_CSRMatrixJ(A_offd);
4257 
4258    HYPRE_Int        num_cols_A_offd = hypre_CSRMatrixNumCols(A_offd);
4259    HYPRE_BigInt    *col_map_offd_A  = hypre_ParCSRMatrixColMapOffd(A);
4260 
4261 
4262    HYPRE_Int nrow_local = hypre_CSRMatrixNumRows(A_diag);
4263    HYPRE_BigInt first_row  = hypre_ParCSRMatrixFirstRowIndex(A);
4264    HYPRE_BigInt last_row   = hypre_ParCSRMatrixLastRowIndex(A);
4265    HYPRE_BigInt end_row    = first_row + (HYPRE_BigInt)nrow_local; /* one past-the-last */
4266 
4267    HYPRE_Int ncol_local = hypre_CSRMatrixNumCols(A_diag);
4268    HYPRE_BigInt first_col  = hypre_ParCSRMatrixFirstColDiag(A);
4269    /* HYPRE_Int last_col   = hypre_ParCSRMatrixLastColDiag(A); */
4270    HYPRE_BigInt end_col    = first_col + (HYPRE_BigInt)ncol_local;
4271 
4272    HYPRE_BigInt nrow_global = hypre_ParCSRMatrixGlobalNumRows(A);
4273    HYPRE_BigInt ncol_global = hypre_ParCSRMatrixGlobalNumCols(A);
4274    HYPRE_BigInt *row_starts = hypre_ParCSRMatrixRowStarts(A);
4275    void *request;
4276 
4277    /* if square globally and locally */
4278    HYPRE_Int square2 = (nrow_global == ncol_global) && (nrow_local == ncol_local) &&
4279       (first_row == first_col);
4280 
4281    if (nrow_global != ncol_global)
4282    {
4283       hypre_printf("hypre_ParcsrBdiagInvScal: only supports N_ROW == N_COL\n");
4284       return hypre_error_flag;
4285    }
4286 
4287    /* row range of the diagonal blocks that this proc spans */
4288    HYPRE_BigInt first_row_block = first_row / (HYPRE_BigInt)blockSize * (HYPRE_BigInt)blockSize;
4289    HYPRE_BigInt end_row_block   = hypre_min( (last_row / (HYPRE_BigInt)blockSize + 1) * (HYPRE_BigInt)blockSize, nrow_global );
4290    HYPRE_Int num_blocks = (HYPRE_Int)(last_row / (HYPRE_BigInt)blockSize + 1 - first_row / (HYPRE_BigInt)blockSize);
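   /*
    * Worked example of the block-range arithmetic above (hypothetical
    * numbers): with blockSize = 4, first_row = 6, last_row = 13,
    *    first_row_block = 6/4*4            = 4
    *    end_row_block   = (13/4 + 1)*4     = 16   (capped by nrow_global)
    *    num_blocks      = 13/4 + 1 - 6/4   = 3
    * i.e. this proc touches the three blocks covering global rows [4,16).
    */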
4291 
4292    //for (i=first_row_block; i < end_row; i+=blockSize) ;
4293    //printf("===[%d %d), [ %d %d ) %d === \n", first_row, end_row, first_row_block, end_row_block, i);
4294    //return 0;
4295 
4296    /* number of external rows */
4297    HYPRE_Int num_ext_rows = (HYPRE_Int)(end_row_block - first_row_block - (end_row - first_row));
4298    HYPRE_BigInt *ext_indices;
4299    HYPRE_Int A_ext_nnz;
4300 
4301    hypre_CSRMatrix *A_ext   = NULL;
4302    HYPRE_Complex   *A_ext_a = NULL;
4303    HYPRE_Int       *A_ext_i = NULL;
4304    HYPRE_BigInt    *A_ext_j = NULL;
4305 
4306    HYPRE_Real *dense_all = hypre_CTAlloc(HYPRE_Complex, num_blocks*blockSize*blockSize, HYPRE_MEMORY_HOST);
4307    HYPRE_Real *dense = dense_all;
4308    HYPRE_Int *IPIV  = hypre_TAlloc(HYPRE_Int, blockSize, HYPRE_MEMORY_HOST);
4309    HYPRE_Complex *dgetri_work = NULL;
4310    HYPRE_Int      dgetri_lwork = -1, lapack_info;
4311 
4312    HYPRE_Int  num_cols_A_offd_new;
4313    HYPRE_BigInt *col_map_offd_A_new;
4314    HYPRE_BigInt big_i;
4315    HYPRE_Int *offd2new = NULL;
4316    HYPRE_Int *marker_diag, *marker_newoffd;
4317 
4318    HYPRE_Int nnz_diag = A_diag_i[nrow_local];
4319    HYPRE_Int nnz_offd = A_offd_i[nrow_local];
4320    HYPRE_Int nnz_diag_new = 0, nnz_offd_new = 0;
4321    HYPRE_Int *A_diag_i_new, *A_diag_j_new, *A_offd_i_new, *A_offd_j_new;
4322    HYPRE_Complex *A_diag_a_new, *A_offd_a_new;
4323    /* heuristic */
4324    HYPRE_Int nnz_diag_alloc = 2 * nnz_diag;
4325    HYPRE_Int nnz_offd_alloc = 2 * nnz_offd;
4326 
4327    A_diag_i_new = hypre_CTAlloc(HYPRE_Int,     nrow_local + 1, HYPRE_MEMORY_HOST);
4328    A_diag_j_new = hypre_CTAlloc(HYPRE_Int,     nnz_diag_alloc, HYPRE_MEMORY_HOST);
4329    A_diag_a_new = hypre_CTAlloc(HYPRE_Complex, nnz_diag_alloc, HYPRE_MEMORY_HOST);
4330    A_offd_i_new = hypre_CTAlloc(HYPRE_Int,     nrow_local + 1, HYPRE_MEMORY_HOST);
4331    A_offd_j_new = hypre_CTAlloc(HYPRE_Int,     nnz_offd_alloc, HYPRE_MEMORY_HOST);
4332    A_offd_a_new = hypre_CTAlloc(HYPRE_Complex, nnz_offd_alloc, HYPRE_MEMORY_HOST);
4333 
4334    hypre_ParCSRMatrix *Anew;
4335    hypre_CSRMatrix    *Anew_diag;
4336    hypre_CSRMatrix    *Anew_offd;
4337 
4338    HYPRE_Real eps = 2.2e-16;
4339 
4340    /* Start with extracting the external rows */
4341    HYPRE_BigInt *ext_offd;
4342    ext_indices = hypre_CTAlloc(HYPRE_BigInt, num_ext_rows, HYPRE_MEMORY_HOST);
4343    j = 0;
4344    for (big_i = first_row_block; big_i < first_row; big_i++)
4345    {
4346       ext_indices[j++] = big_i;
4347    }
4348    for (big_i = end_row; big_i < end_row_block; big_i++)
4349    {
4350       ext_indices[j++] = big_i;
4351    }
4352 
4353    hypre_assert(j == num_ext_rows);
4354 
4355    /* create CommPkg for external rows */
4356    hypre_ParCSRFindExtendCommPkg(comm, nrow_global, first_row, nrow_local, row_starts,
4357                                  hypre_ParCSRMatrixAssumedPartition(A),
4358                                  num_ext_rows, ext_indices, &A->bdiaginv_comm_pkg);
4359 
4360    hypre_ParcsrGetExternalRowsInit(A, num_ext_rows, ext_indices, A->bdiaginv_comm_pkg, 1, &request);
4361    A_ext = hypre_ParcsrGetExternalRowsWait(request);
4362 
4363    hypre_TFree(ext_indices, HYPRE_MEMORY_HOST);
4364 
4365    A_ext_i = hypre_CSRMatrixI(A_ext);
4366    A_ext_j = hypre_CSRMatrixBigJ(A_ext);
4367    A_ext_a = hypre_CSRMatrixData(A_ext);
4368    A_ext_nnz = A_ext_i[num_ext_rows];
4369    ext_offd = hypre_CTAlloc(HYPRE_BigInt, A_ext_nnz, HYPRE_MEMORY_HOST);
4370 
4371    /* find the offd indices in A_ext */
4372    for (i = 0, j = 0; i < A_ext_nnz; i++)
4373    {
4374       /* global index */
4375       HYPRE_BigInt cid = A_ext_j[i];
4376       /* keep the offd indices */
4377       if (cid < first_col || cid >= end_col)
4378       {
4379          ext_offd[j++] = cid;
4380       }
4381    }
4382    /* remove duplicates after sorting (TODO better ways?) */
4383    hypre_BigQsort0(ext_offd, 0, j-1);
4384    for (i = 0, k = 0; i < j; i++)
4385    {
4386       if (i == 0 || ext_offd[i] != ext_offd[i-1])
4387       {
4388          ext_offd[k++] = ext_offd[i];
4389       }
4390    }
4391    /* union these `k' new indices into col_map_offd_A */
4392    col_map_offd_A_new = hypre_CTAlloc(HYPRE_BigInt, num_cols_A_offd + k, HYPRE_MEMORY_HOST);
4393    if (k)
4394    {
4395       /* map offd to offd_new */
4396       offd2new = hypre_CTAlloc(HYPRE_Int, num_cols_A_offd, HYPRE_MEMORY_HOST);
4397    }
4398    hypre_union2(num_cols_A_offd, col_map_offd_A, k, ext_offd,
4399                 &num_cols_A_offd_new, col_map_offd_A_new, offd2new, NULL);
4400    hypre_TFree(ext_offd, HYPRE_MEMORY_HOST);
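   /*
    * Illustration of the hypre_union2 call above (hypothetical data):
    * merging the sorted global-column lists {10, 40} (old offd map) and
    * {20, 40, 90} (new external columns) gives
    *    col_map_offd_A_new = {10, 20, 40, 90},   num_cols_A_offd_new = 4,
    * and offd2new maps the old offd positions {0, 1} to positions {0, 2}
    * in the merged map.
    */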
4401    /*
4402     *   adjust column indices in A_ext
4403     */
4404    for (i = 0; i < A_ext_nnz; i++)
4405    {
4406       HYPRE_BigInt cid = A_ext_j[i];
4407       if (cid < first_col || cid >= end_col)
4408       {
4409          j = hypre_BigBinarySearch(col_map_offd_A_new, cid, num_cols_A_offd_new);
4410          /* searching must succeed */
4411          hypre_assert(j >= 0 && j < num_cols_A_offd_new);
4412          /* trick: save ncol_local + j back */
4413          A_ext_j[i] = ncol_local + j;
4414       }
4415       else
4416       {
4417          /* save local index: [0, ncol_local-1] */
4418          A_ext_j[i] = cid - first_col;
4419       }
4420    }
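   /*
    * Small illustration of the index encoding above (hypothetical numbers,
    * assuming ncol_local = 100 and first_col = 200):
    *    global col 250  ->  stored as 250 - 200 = 50              (diag, in [0, ncol_local))
    *    global col 500  ->  stored as 100 + j, where j is its
    *                        position in col_map_offd_A_new        (offd, >= ncol_local)
    * The two ranges are disjoint, so a single integer later recovers both
    * the part (diag/offd) and the column.
    */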
4421 
4422    /* marker for diag */
4423    marker_diag = hypre_TAlloc(HYPRE_Int, ncol_local, HYPRE_MEMORY_HOST);
4424    for (i = 0; i < ncol_local; i++)
4425    {
4426       marker_diag[i] = -1;
4427    }
4428    /* marker for newoffd */
4429    marker_newoffd = hypre_TAlloc(HYPRE_Int, num_cols_A_offd_new, HYPRE_MEMORY_HOST);
4430    for (i = 0; i < num_cols_A_offd_new; i++)
4431    {
4432       marker_newoffd[i] = -1;
4433    }
4434 
4435    /* outermost loop over the blocks */
4436    for (block_start = first_row_block; block_start < end_row_block; block_start += (HYPRE_BigInt)blockSize)
4437    {
4438       HYPRE_BigInt big_i;
4439       block_end = hypre_min(block_start + (HYPRE_BigInt)blockSize, nrow_global);
4440       s = (HYPRE_Int)(block_end - block_start);
4441 
4442       /* 1. fill the dense block diag matrix */
4443       for (big_i = block_start; big_i < block_end; big_i++)
4444       {
4445          /* row index in this block */
4446          HYPRE_Int block_i = (HYPRE_Int)(big_i - block_start);
4447 
4448          /* row index big_i: it can be local or external */
4449          if (big_i >= first_row && big_i < end_row)
4450          {
4451             /* is a local row */
4452             j = (HYPRE_Int)(big_i - first_row);
4453             for (k = A_diag_i[j]; k < A_diag_i[j+1]; k++)
4454             {
4455                HYPRE_BigInt cid = (HYPRE_BigInt)A_diag_j[k] + first_col;
4456                if (cid >= block_start && cid < block_end)
4457                {
4458                   dense[block_i + (HYPRE_Int)(cid-block_start)*blockSize] = A_diag_a[k];
4459                }
4460             }
4461             if (num_cols_A_offd)
4462             {
4463                for (k = A_offd_i[j]; k < A_offd_i[j+1]; k++)
4464                {
4465                   HYPRE_BigInt cid = col_map_offd_A[A_offd_j[k]];
4466                   if (cid >= block_start && cid < block_end)
4467                   {
4468                      dense[block_i + (HYPRE_Int)(cid-block_start)*blockSize] = A_offd_a[k];
4469                   }
4470                }
4471             }
4472          }
4473          else
4474          {
4475             /* is an external row */
4476             if (big_i < first_row)
4477             {
4478                j = (HYPRE_Int)(big_i - first_row_block);
4479             }
4480             else
4481             {
4482                j = (HYPRE_Int)(first_row - first_row_block + big_i - end_row);
4483             }
4484             for (k = A_ext_i[j]; k < A_ext_i[j+1]; k++)
4485             {
4486                HYPRE_BigInt cid = A_ext_j[k];
4487                /* recover the global index */
4488                cid = cid < (HYPRE_BigInt)ncol_local ? cid + first_col : col_map_offd_A_new[cid-ncol_local];
4489                if (cid >= block_start && cid < block_end)
4490                {
4491                   dense[block_i + (HYPRE_Int)(cid-block_start)*blockSize] = A_ext_a[k];
4492                }
4493             }
4494          }
4495       }
4496 
4497       /* 2. invert the dense matrix */
4498       hypre_dgetrf(&s, &s, dense, &blockSize, IPIV, &lapack_info);
4499 
4500       hypre_assert(lapack_info == 0);
4501 
4502       if (lapack_info == 0)
4503       {
4504          HYPRE_Int query = -1;
4505          HYPRE_Real lwork_opt;
4506          /* query the optimal size of work */
4507          hypre_dgetri(&s, dense, &blockSize, IPIV, &lwork_opt, &query, &lapack_info);
4508 
4509          hypre_assert(lapack_info == 0);
4510 
4511          if (lwork_opt > dgetri_lwork)
4512          {
4513             dgetri_lwork = lwork_opt;
4514             dgetri_work = hypre_TReAlloc(dgetri_work, HYPRE_Complex, dgetri_lwork, HYPRE_MEMORY_HOST);
4515          }
4516 
4517          hypre_dgetri(&s, dense, &blockSize, IPIV, dgetri_work, &dgetri_lwork, &lapack_info);
4518 
4519          hypre_assert(lapack_info == 0);
4520       }
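      /*
       * Note on the LAPACK idiom above: the first hypre_dgetri call is a
       * standard workspace query (lwork = -1), which only writes the
       * optimal workspace size into its work argument (lwork_opt here);
       * the second call then performs the actual in-place inversion of
       * the LU factors produced by hypre_dgetrf.
       */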
4521 
4522       /* filter out entries that are zero up to round-off (relative to the block Frobenius norm) */
4523       HYPRE_Real Fnorm = 0.0;
4524       for (i = 0; i < s; i++)
4525       {
4526          for (j = 0; j < s; j++)
4527          {
4528             HYPRE_Complex t = dense[j+i*blockSize];
4529             Fnorm += t * t;
4530          }
4531       }
4532 
4533       Fnorm = sqrt(Fnorm);
4534 
4535       for (i = 0; i < s; i++)
4536       {
4537          for (j = 0; j < s; j++)
4538          {
4539             if ( hypre_abs(dense[j+i*blockSize]) < eps * Fnorm )
4540             {
4541                dense[j+i*blockSize] = 0.0;
4542             }
4543          }
4544       }
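      /*
       * Dropping rule used above: an entry m_{ij} of the inverted block M
       * is discarded when |m_{ij}| < eps * ||M||_F, with eps ~ machine
       * epsilon and ||M||_F = sqrt( sum_{i,j} m_{ij}^2 ), so only values
       * that are zero up to round-off from the inversion are removed.
       */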
4545 
4546       /* 3. premultiplication: one-pass dynamic allocation */
4547       for (big_i = block_start; big_i < block_end; big_i++)
4548       {
4549          /* starting points of this row in j */
4550          HYPRE_Int diag_i_start = nnz_diag_new;
4551          HYPRE_Int offd_i_start = nnz_offd_new;
4552 
4553          /* compute a new row with global index 'big_i' and local index 'local_i' */
4554          HYPRE_Int local_i = (HYPRE_Int)(big_i - first_row);
4555          /* row index in this block */
4556          HYPRE_Int block_i = (HYPRE_Int)(big_i - block_start);
4557 
4558          if (big_i < first_row || big_i >= end_row)
4559          {
4560             continue;
4561          }
4562 
4563          /* if square (square2): reserve the first slot of the diag part for the diagonal entry */
4564          if (square2)
4565          {
4566             marker_diag[local_i] = nnz_diag_new;
4567             if (nnz_diag_new == nnz_diag_alloc)
4568             {
4569                nnz_diag_alloc = nnz_diag_alloc * 2 + 1;
4570                A_diag_j_new = hypre_TReAlloc(A_diag_j_new, HYPRE_Int,     nnz_diag_alloc, HYPRE_MEMORY_HOST);
4571                A_diag_a_new = hypre_TReAlloc(A_diag_a_new, HYPRE_Complex, nnz_diag_alloc, HYPRE_MEMORY_HOST);
4572             }
4573             A_diag_j_new[nnz_diag_new] = local_i;
4574             A_diag_a_new[nnz_diag_new] = 0.0;
4575             nnz_diag_new ++;
4576          }
4577 
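         /*
          * The accumulation below uses the classic marker-based sparse row
          * merge: marker_diag[col] (resp. marker_newoffd[col]) stores the
          * position of column 'col' in the row currently being built; a
          * value below diag_i_start (resp. offd_i_start) means the column
          * has not appeared in this row yet, so a new entry is appended
          * (doubling the j/a arrays if needed), otherwise the existing
          * entry is updated in place.
          */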
4578          /* combine s rows */
4579          for (j = 0; j < s; j++)
4580          {
4581             /* row to combine: global row id */
4582             HYPRE_BigInt global_rid = block_start + (HYPRE_BigInt)j;
4583             /* the multiplier */
4584             HYPRE_Complex val = dense[block_i + j*blockSize];
4585 
4586             if (val == 0.0)
4587             {
4588                continue;
4589             }
4590 
4591             if (global_rid >= first_row && global_rid < end_row)
4592             {
4593                /* this row is local */
4594                HYPRE_Int rid = (HYPRE_Int)(global_rid - first_row);
4595                HYPRE_Int ii;
4596 
4597                for (ii = A_diag_i[rid]; ii < A_diag_i[rid+1]; ii++)
4598                {
4599                   HYPRE_Int col = A_diag_j[ii];
4600                   HYPRE_Complex vv = A_diag_a[ii];
4601 
4602                   if (marker_diag[col] < diag_i_start)
4603                   {
4604                      /* this col has not been seen before, create new entry */
4605                      marker_diag[col] = nnz_diag_new;
4606                      if (nnz_diag_new == nnz_diag_alloc)
4607                      {
4608                         nnz_diag_alloc = nnz_diag_alloc * 2 + 1;
4609                         A_diag_j_new = hypre_TReAlloc(A_diag_j_new, HYPRE_Int,     nnz_diag_alloc, HYPRE_MEMORY_HOST);
4610                         A_diag_a_new = hypre_TReAlloc(A_diag_a_new, HYPRE_Complex, nnz_diag_alloc, HYPRE_MEMORY_HOST);
4611                      }
4612                      A_diag_j_new[nnz_diag_new] = col;
4613                      A_diag_a_new[nnz_diag_new] = val * vv;
4614                      nnz_diag_new ++;
4615                   }
4616                   else
4617                   {
4618                      /* existing entry, update */
4619                      HYPRE_Int p = marker_diag[col];
4620 
4621                      hypre_assert(A_diag_j_new[p] == col);
4622 
4623                      A_diag_a_new[p] += val * vv;
4624                   }
4625                }
4626 
4627                for (ii = A_offd_i[rid]; ii < A_offd_i[rid+1]; ii++)
4628                {
4629                   HYPRE_Int col = A_offd_j[ii];
4630                   /* use the mapper to map to new offd */
4631                   HYPRE_Int col_new = offd2new ? offd2new[col] : col;
4632                   HYPRE_Complex vv = A_offd_a[ii];
4633 
4634                   if (marker_newoffd[col_new] < offd_i_start)
4635                   {
4636                      /* this col has not been seen before, create new entry */
4637                      marker_newoffd[col_new] = nnz_offd_new;
4638                      if (nnz_offd_new == nnz_offd_alloc)
4639                      {
4640                         nnz_offd_alloc = nnz_offd_alloc * 2 + 1;
4641                         A_offd_j_new = hypre_TReAlloc(A_offd_j_new, HYPRE_Int,     nnz_offd_alloc, HYPRE_MEMORY_HOST);
4642                         A_offd_a_new = hypre_TReAlloc(A_offd_a_new, HYPRE_Complex, nnz_offd_alloc, HYPRE_MEMORY_HOST);
4643                      }
4644                      A_offd_j_new[nnz_offd_new] = col_new;
4645                      A_offd_a_new[nnz_offd_new] = val * vv;
4646                      nnz_offd_new ++;
4647                   }
4648                   else
4649                   {
4650                      /* existing entry, update */
4651                      HYPRE_Int p = marker_newoffd[col_new];
4652 
4653                      hypre_assert(A_offd_j_new[p] == col_new);
4654 
4655                      A_offd_a_new[p] += val * vv;
4656                   }
4657                }
4658             }
4659             else
4660             {
4661                /* this is an external row: go to A_ext */
4662                HYPRE_Int rid, ii;
4663 
4664                if (global_rid < first_row)
4665                {
4666                   rid = (HYPRE_Int)(global_rid - first_row_block);
4667                }
4668                else
4669                {
4670                   rid = (HYPRE_Int)(first_row - first_row_block + global_rid - end_row);
4671                }
4672 
4673                for (ii = A_ext_i[rid]; ii < A_ext_i[rid+1]; ii++)
4674                {
4675                   HYPRE_Int col = (HYPRE_Int)A_ext_j[ii];
4676                   HYPRE_Complex vv = A_ext_a[ii];
4677 
4678                   if (col < ncol_local)
4679                   {
4680                      /* in diag part */
4681                      if (marker_diag[col] < diag_i_start)
4682                      {
4683                         /* this col has not been seen before, create new entry */
4684                         marker_diag[col] = nnz_diag_new;
4685                         if (nnz_diag_new == nnz_diag_alloc)
4686                         {
4687                            nnz_diag_alloc = nnz_diag_alloc * 2 + 1;
4688                            A_diag_j_new = hypre_TReAlloc(A_diag_j_new, HYPRE_Int,     nnz_diag_alloc, HYPRE_MEMORY_HOST);
4689                            A_diag_a_new = hypre_TReAlloc(A_diag_a_new, HYPRE_Complex, nnz_diag_alloc, HYPRE_MEMORY_HOST);
4690                         }
4691                         A_diag_j_new[nnz_diag_new] = col;
4692                         A_diag_a_new[nnz_diag_new] = val * vv;
4693                         nnz_diag_new ++;
4694                      }
4695                      else
4696                      {
4697                         /* existing entry, update */
4698                         HYPRE_Int p = marker_diag[col];
4699 
4700                         hypre_assert(A_diag_j_new[p] == col);
4701 
4702                         A_diag_a_new[p] += val * vv;
4703                      }
4704                   }
4705                   else
4706                   {
4707                      /* in offd part */
4708                      col -= ncol_local;
4709 
4710                      if (marker_newoffd[col] < offd_i_start)
4711                      {
4712                         /* this col has not been seen before, create new entry */
4713                         marker_newoffd[col] = nnz_offd_new;
4714                         if (nnz_offd_new == nnz_offd_alloc)
4715                         {
4716                            nnz_offd_alloc = nnz_offd_alloc * 2 + 1;
4717                            A_offd_j_new = hypre_TReAlloc(A_offd_j_new, HYPRE_Int,     nnz_offd_alloc, HYPRE_MEMORY_HOST);
4718                            A_offd_a_new = hypre_TReAlloc(A_offd_a_new, HYPRE_Complex, nnz_offd_alloc, HYPRE_MEMORY_HOST);
4719                         }
4720                         A_offd_j_new[nnz_offd_new] = col;
4721                         A_offd_a_new[nnz_offd_new] = val * vv;
4722                         nnz_offd_new ++;
4723                      }
4724                      else
4725                      {
4726                         /* existing entry, update */
4727                         HYPRE_Int p = marker_newoffd[col];
4728 
4729                         hypre_assert(A_offd_j_new[p] == col);
4730 
4731                         A_offd_a_new[p] += val * vv;
4732                      }
4733                   }
4734                }
4735             }
4736          }
4737 
4738          /* done for row local_i */
4739          A_diag_i_new[local_i + 1] = nnz_diag_new;
4740          A_offd_i_new[local_i + 1] = nnz_offd_new;
4741       } /* for i, each row */
4742 
4743       dense += blockSize * blockSize;
4744    } /* for each block */
4745 
4746    /* done with all rows */
4747    /* resize properly */
4748    A_diag_j_new = hypre_TReAlloc(A_diag_j_new, HYPRE_Int,     nnz_diag_new, HYPRE_MEMORY_HOST);
4749    A_diag_a_new = hypre_TReAlloc(A_diag_a_new, HYPRE_Complex, nnz_diag_new, HYPRE_MEMORY_HOST);
4750    A_offd_j_new = hypre_TReAlloc(A_offd_j_new, HYPRE_Int,     nnz_offd_new, HYPRE_MEMORY_HOST);
4751    A_offd_a_new = hypre_TReAlloc(A_offd_a_new, HYPRE_Complex, nnz_offd_new, HYPRE_MEMORY_HOST);
4752 
4753    /* readjust col_map_offd_new */
4754    for (i = 0; i < num_cols_A_offd_new; i++)
4755    {
4756       marker_newoffd[i] = -1;
4757    }
4758    for (i = 0; i < nnz_offd_new; i++)
4759    {
4760       j = A_offd_j_new[i];
4761       if (marker_newoffd[j] == -1)
4762       {
4763          marker_newoffd[j] = 1;
4764       }
4765    }
4766    for (i = 0, j = 0; i < num_cols_A_offd_new; i++)
4767    {
4768       if (marker_newoffd[i] == 1)
4769       {
4770          col_map_offd_A_new[j] = col_map_offd_A_new[i];
4771          marker_newoffd[i] = j++;
4772       }
4773    }
4774    num_cols_A_offd_new = j;
4775 
4776    for (i = 0; i < nnz_offd_new; i++)
4777    {
4778       j = marker_newoffd[A_offd_j_new[i]];
4779       hypre_assert(j >= 0 && j < num_cols_A_offd_new);
4780       A_offd_j_new[i] = j;
4781    }
4782 
4783    /* Now we have all the pieces needed to assemble the ParCSR matrix As */
4784    Anew = hypre_ParCSRMatrixCreate(comm,
4785                                    nrow_global,
4786                                    ncol_global,
4787                                    hypre_ParCSRMatrixRowStarts(A),
4788                                    hypre_ParCSRMatrixColStarts(A),
4789                                    num_cols_A_offd_new,
4790                                    nnz_diag_new,
4791                                    nnz_offd_new);
4792 
4793    Anew_diag = hypre_ParCSRMatrixDiag(Anew);
4794    hypre_CSRMatrixData(Anew_diag) = A_diag_a_new;
4795    hypre_CSRMatrixI(Anew_diag)    = A_diag_i_new;
4796    hypre_CSRMatrixJ(Anew_diag)    = A_diag_j_new;
4797 
4798    Anew_offd = hypre_ParCSRMatrixOffd(Anew);
4799    hypre_CSRMatrixData(Anew_offd) = A_offd_a_new;
4800    hypre_CSRMatrixI(Anew_offd)    = A_offd_i_new;
4801    hypre_CSRMatrixJ(Anew_offd)    = A_offd_j_new;
4802 
4803    hypre_ParCSRMatrixColMapOffd(Anew) = col_map_offd_A_new;
4804 
4805    hypre_ParCSRMatrixSetNumNonzeros(Anew);
4806    hypre_ParCSRMatrixDNumNonzeros(Anew) = (HYPRE_Real) hypre_ParCSRMatrixNumNonzeros(Anew);
4807    //printf("nnz_diag %d --> %d, nnz_offd %d --> %d\n", nnz_diag, nnz_diag_new, nnz_offd, nnz_offd_new);
4808 
4809    /* create CommPkg of Anew */
4810    hypre_MatvecCommPkgCreate(Anew);
4811 
4812    *As = Anew;
4813 
4814    /*
4815    if (bdiaginv)
4816    {
4817       *bdiaginv = dense_all;
4818    }
4819    else
4820    {
4821       hypre_TFree(dense_all, HYPRE_MEMORY_HOST);
4822    }
4823    */
4824    /* save the inverted diagonal blocks in A */
4825    A->bdiag_size = blockSize;
4826    A->bdiaginv = dense_all;
4827 
4828    /* free workspace */
4829    hypre_TFree(IPIV, HYPRE_MEMORY_HOST);
4830    hypre_TFree(dgetri_work, HYPRE_MEMORY_HOST);
4831    hypre_TFree(marker_diag, HYPRE_MEMORY_HOST);
4832    hypre_TFree(marker_newoffd, HYPRE_MEMORY_HOST);
4833    hypre_TFree(offd2new, HYPRE_MEMORY_HOST);
4834    hypre_CSRMatrixDestroy(A_ext);
4835 
4836    return hypre_error_flag;
4837 }
4838 
4839 /*--------------------------------------------------------------------------
4840  * hypre_ParcsrGetExternalRowsInit
4841  *--------------------------------------------------------------------------*/
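/*
 * Init/Wait usage sketch (illustrative; "A" is a hypre_ParCSRMatrix and
 * "comm_pkg" an extended comm package whose receive side matches the
 * requested "indices", as in hypre_ParcsrBdiagInvScal above):
 *
 *    void            *req;
 *    hypre_CSRMatrix *rows_ext;
 *    hypre_ParcsrGetExternalRowsInit(A, n_idx, indices, comm_pkg, 1, &req);
 *    ... do independent local work here to overlap communication ...
 *    rows_ext = hypre_ParcsrGetExternalRowsWait(req);
 *    ... use rows_ext ...
 *    hypre_CSRMatrixDestroy(rows_ext);
 */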
4842 
4843 HYPRE_Int
4844 hypre_ParcsrGetExternalRowsInit( hypre_ParCSRMatrix   *A,
4845                                  HYPRE_Int             indices_len,
4846                                  HYPRE_BigInt         *indices,
4847                                  hypre_ParCSRCommPkg  *comm_pkg,
4848                                  HYPRE_Int             want_data,
4849                                  void                **request_ptr)
4850 {
4851    HYPRE_Int i, j, k;
4852    HYPRE_Int num_sends, num_rows_send, num_nnz_send, *send_i,
4853              num_recvs, num_rows_recv, num_nnz_recv, *recv_i,
4854             *send_jstarts, *recv_jstarts, *send_i_offset;
4855    HYPRE_BigInt *send_j, *recv_j;
4856    HYPRE_Complex *send_a = NULL, *recv_a = NULL;
4857    hypre_ParCSRCommPkg     *comm_pkg_j;
4858    hypre_ParCSRCommHandle  *comm_handle, *comm_handle_j, *comm_handle_a;
4859    /* HYPRE_Int global_num_rows = hypre_ParCSRMatrixGlobalNumRows(A); */
4860    /* diag part of A */
4861    hypre_CSRMatrix *A_diag   = hypre_ParCSRMatrixDiag(A);
4862    HYPRE_Real      *A_diag_a = hypre_CSRMatrixData(A_diag);
4863    HYPRE_Int       *A_diag_i = hypre_CSRMatrixI(A_diag);
4864    HYPRE_Int       *A_diag_j = hypre_CSRMatrixJ(A_diag);
4865    /* HYPRE_Int local_num_rows  = hypre_CSRMatrixNumRows(A_diag); */
4866    /* off-diag part of A */
4867    hypre_CSRMatrix *A_offd   = hypre_ParCSRMatrixOffd(A);
4868    HYPRE_Real      *A_offd_a = hypre_CSRMatrixData(A_offd);
4869    HYPRE_Int       *A_offd_i = hypre_CSRMatrixI(A_offd);
4870    HYPRE_Int       *A_offd_j = hypre_CSRMatrixJ(A_offd);
4871    /* HYPRE_BigInt *row_starts      = hypre_ParCSRMatrixRowStarts(A); */
4872    /* HYPRE_BigInt  first_row       = hypre_ParCSRMatrixFirstRowIndex(A); */
4873    HYPRE_BigInt     first_col       = hypre_ParCSRMatrixFirstColDiag(A);
4874    HYPRE_BigInt    *col_map_offd_A  = hypre_ParCSRMatrixColMapOffd(A);
4875    MPI_Comm         comm = hypre_ParCSRMatrixComm(A);
4876    HYPRE_Int        num_procs;
4877    HYPRE_Int        my_id;
4878    void           **vrequest;
4879    hypre_CSRMatrix *A_ext;
4880 
4881    hypre_MPI_Comm_size(comm, &num_procs);
4882    hypre_MPI_Comm_rank(comm, &my_id);
4883 
4884    /* number of sends (#procs) */
4885    num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
4886    /* number of rows to send */
4887    num_rows_send = hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends);
4888    /* number of recvs (#procs) */
4889    num_recvs = hypre_ParCSRCommPkgNumRecvs(comm_pkg);
4890    /* number of rows to recv */
4891    num_rows_recv = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs);
4892 
4893    /* must be true if indices contains proper offd indices */
4894    hypre_assert(indices_len == num_rows_recv);
4895 
4896    /* send_i/recv_i:
4897     * the arrays to send and recv: we first send and recv the row lengths */
4898    send_i = hypre_TAlloc(HYPRE_Int, num_rows_send, HYPRE_MEMORY_HOST);
4899    recv_i = hypre_CTAlloc(HYPRE_Int, num_rows_recv + 1, HYPRE_MEMORY_HOST);
4900    /* fill the send array with row lengths */
4901    for (i = 0, num_nnz_send = 0; i < num_rows_send; i++)
4902    {
4903       /* j: row index to send */
4904       j = hypre_ParCSRCommPkgSendMapElmt(comm_pkg, i);
4905       send_i[i] = A_diag_i[j+1] - A_diag_i[j] + A_offd_i[j+1] - A_offd_i[j];
4906       num_nnz_send += send_i[i];
4907    }
4908 
4909    /* send this array out: note the shift in recv_i by one (async) */
4910    comm_handle = hypre_ParCSRCommHandleCreate(11, comm_pkg, send_i, recv_i+1);
4911 
4912    /* prepare data to send out; overlap with the above communication */
4913    send_j = hypre_TAlloc(HYPRE_BigInt, num_nnz_send, HYPRE_MEMORY_HOST);
4914    if (want_data)
4915    {
4916       send_a = hypre_TAlloc(HYPRE_Complex, num_nnz_send, HYPRE_MEMORY_HOST);
4917    }
4918 
4919    send_i_offset = hypre_TAlloc(HYPRE_Int, num_rows_send + 1, HYPRE_MEMORY_HOST);
4920    send_i_offset[0] = 0;
4921    hypre_TMemcpy(send_i_offset + 1, send_i, HYPRE_Int, num_rows_send,
4922                  HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
4923    /* prefix sum. TODO: OMP parallelization */
4924    for (i = 1; i <= num_rows_send; i++)
4925    {
4926       send_i_offset[i] += send_i_offset[i-1];
4927    }
4928    hypre_assert(send_i_offset[num_rows_send] == num_nnz_send);
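   /*
    * Worked example of the exclusive prefix sum above (hypothetical row
    * lengths): send_i = {3, 1, 2} gives send_i_offset = {0, 3, 4, 6}, so
    * row r's entries go into send_j/send_a starting at send_i_offset[r].
    */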
4929 
4930    /* pointers to each proc in send_j */
4931    send_jstarts = hypre_TAlloc(HYPRE_Int, num_sends + 1, HYPRE_MEMORY_HOST);
4932 #ifdef HYPRE_USING_OPENMP
4933 #pragma omp parallel for HYPRE_SMP_SCHEDULE
4934 #endif
4935    for (i = 0; i <= num_sends; i++)
4936    {
4937       send_jstarts[i] = send_i_offset[hypre_ParCSRCommPkgSendMapStart(comm_pkg, i)];
4938    }
4939    hypre_assert(send_jstarts[num_sends] == num_nnz_send);
4940 
4941    /* fill the CSR matrix: j and a */
4942 #ifdef HYPRE_USING_OPENMP
4943 #pragma omp parallel for HYPRE_SMP_SCHEDULE private(i,j,k)
4944 #endif
4945    for (i = 0; i < num_rows_send; i++)
4946    {
4947       HYPRE_Int i1 = send_i_offset[i];
4948       j = hypre_ParCSRCommPkgSendMapElmt(comm_pkg, i);
4949       /* open row j and fill ja and a to send */
4950       for (k = A_diag_i[j]; k < A_diag_i[j+1]; k++)
4951       {
4952          send_j[i1] = first_col + A_diag_j[k];
4953          if (want_data)
4954          {
4955             send_a[i1] = A_diag_a[k];
4956          }
4957          i1++;
4958       }
4959       if (num_procs > 1)
4960       {
4961          for (k = A_offd_i[j]; k < A_offd_i[j+1]; k++)
4962          {
4963             send_j[i1] = col_map_offd_A[A_offd_j[k]];
4964             if (want_data)
4965             {
4966                send_a[i1] = A_offd_a[k];
4967             }
4968             i1++;
4969          }
4970       }
4971       hypre_assert(send_i_offset[i+1] == i1);
4972    }
4973 
4974    /* finish the above communication: send_i/recv_i */
4975    hypre_ParCSRCommHandleDestroy(comm_handle);
4976 
4977    /* adjust recv_i to ptrs */
4978    for (i = 1; i <= num_rows_recv; i++)
4979    {
4980       recv_i[i] += recv_i[i-1];
4981    }
4982    num_nnz_recv = recv_i[num_rows_recv];
4983    recv_j = hypre_CTAlloc(HYPRE_BigInt, num_nnz_recv, HYPRE_MEMORY_HOST);
4984    if (want_data)
4985    {
4986       recv_a = hypre_CTAlloc(HYPRE_Complex, num_nnz_recv, HYPRE_MEMORY_HOST);
4987    }
4988    recv_jstarts = hypre_CTAlloc(HYPRE_Int, num_recvs + 1, HYPRE_MEMORY_HOST);
4989    for (i = 1; i <= num_recvs; i++)
4990    {
4991       j = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i);
4992       recv_jstarts[i] = recv_i[j];
4993    }
4994 
4995    /* ready to send and recv: create a communication package for data */
4996    comm_pkg_j = hypre_CTAlloc(hypre_ParCSRCommPkg, 1, HYPRE_MEMORY_HOST);
4997    hypre_ParCSRCommPkgComm         (comm_pkg_j) = comm;
4998    hypre_ParCSRCommPkgNumSends     (comm_pkg_j) = num_sends;
4999    hypre_ParCSRCommPkgSendProcs    (comm_pkg_j) = hypre_ParCSRCommPkgSendProcs(comm_pkg);
5000    hypre_ParCSRCommPkgSendMapStarts(comm_pkg_j) = send_jstarts;
5001    hypre_ParCSRCommPkgNumRecvs     (comm_pkg_j) = num_recvs;
5002    hypre_ParCSRCommPkgRecvProcs    (comm_pkg_j) = hypre_ParCSRCommPkgRecvProcs(comm_pkg);
5003    hypre_ParCSRCommPkgRecvVecStarts(comm_pkg_j) = recv_jstarts;
5004 
5005    /* init communication */
5006    /* ja */
5007    comm_handle_j = hypre_ParCSRCommHandleCreate(21, comm_pkg_j, send_j, recv_j);
5008    if (want_data)
5009    {
5010       /* a */
5011       comm_handle_a = hypre_ParCSRCommHandleCreate(1, comm_pkg_j, send_a, recv_a);
5012    }
5013    else
5014    {
5015       comm_handle_a = NULL;
5016    }
5017 
5018    /* create A_ext */
5019    A_ext = hypre_CSRMatrixCreate(num_rows_recv, hypre_ParCSRMatrixGlobalNumCols(A), num_nnz_recv);
5020    hypre_CSRMatrixMemoryLocation(A_ext) = HYPRE_MEMORY_HOST;
5021    hypre_CSRMatrixI   (A_ext) = recv_i;
5022    hypre_CSRMatrixBigJ(A_ext) = recv_j;
5023    hypre_CSRMatrixData(A_ext) = recv_a;
5024 
5025    /* output */
5026    vrequest = hypre_TAlloc(void *, 4, HYPRE_MEMORY_HOST);
5027    vrequest[0] = (void *) comm_handle_j;
5028    vrequest[1] = (void *) comm_handle_a;
5029    vrequest[2] = (void *) A_ext;
5030    vrequest[3] = (void *) comm_pkg_j;
5031 
5032    *request_ptr = (void *) vrequest;
5033 
5034    /* free */
5035    hypre_TFree(send_i, HYPRE_MEMORY_HOST);
5036    hypre_TFree(send_i_offset, HYPRE_MEMORY_HOST);
5037 
5038    return hypre_error_flag;
5039 }
5040 
5041 hypre_CSRMatrix*
5042 hypre_ParcsrGetExternalRowsWait(void *vrequest)
5043 {
5044    void **request = (void **) vrequest;
5045 
5046    hypre_ParCSRCommHandle *comm_handle_j = (hypre_ParCSRCommHandle *) request[0];
5047    hypre_ParCSRCommHandle *comm_handle_a = (hypre_ParCSRCommHandle *) request[1];
5048    hypre_CSRMatrix        *A_ext         = (hypre_CSRMatrix *)        request[2];
5049    hypre_ParCSRCommPkg    *comm_pkg_j    = (hypre_ParCSRCommPkg *)    request[3];
5050    HYPRE_BigInt           *send_j        = (HYPRE_BigInt *) hypre_ParCSRCommHandleSendData(comm_handle_j);
5051 
5052    if (comm_handle_a)
5053    {
5054       HYPRE_Complex *send_a = (HYPRE_Complex *) hypre_ParCSRCommHandleSendData(comm_handle_a);
5055       hypre_ParCSRCommHandleDestroy(comm_handle_a);
5056       hypre_TFree(send_a, HYPRE_MEMORY_HOST);
5057    }
5058 
5059    hypre_ParCSRCommHandleDestroy(comm_handle_j);
5060    hypre_TFree(send_j, HYPRE_MEMORY_HOST);
5061 
5062    hypre_TFree(hypre_ParCSRCommPkgSendMapStarts(comm_pkg_j), HYPRE_MEMORY_HOST);
5063    hypre_TFree(hypre_ParCSRCommPkgRecvVecStarts(comm_pkg_j), HYPRE_MEMORY_HOST);
5064    hypre_TFree(comm_pkg_j, HYPRE_MEMORY_HOST);
5065 
5066    hypre_TFree(request, HYPRE_MEMORY_HOST);
5067 
5068    return A_ext;
5069 }
5070 
5071 /*--------------------------------------------------------------------------
5072  * hypre_ParCSRMatrixAdd: performs C = alpha*A + beta*B
5073  *
5074  * A and B are assumed to have the same row and column partitionings
5075  *--------------------------------------------------------------------------*/
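/*
 * Usage sketch (illustrative; A and B must share row and column
 * partitionings, and hypre_ParCSRMatrixAdd further below dispatches to
 * this host version or to the device version):
 *
 *    hypre_ParCSRMatrix *C = NULL;
 *    hypre_ParCSRMatrixAdd(2.0, A, -1.0, B, &C);   C = 2*A - B
 *    ...
 *    hypre_ParCSRMatrixDestroy(C);
 */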
5076 HYPRE_Int
5077 hypre_ParCSRMatrixAddHost( HYPRE_Complex        alpha,
5078                            hypre_ParCSRMatrix  *A,
5079                            HYPRE_Complex        beta,
5080                            hypre_ParCSRMatrix  *B,
5081                            hypre_ParCSRMatrix **C_ptr )
5082 {
5083    /* ParCSRMatrix data */
5084    MPI_Comm          comm       = hypre_ParCSRMatrixComm(A);
5085    HYPRE_BigInt      num_rows_A = hypre_ParCSRMatrixGlobalNumRows(A);
5086    HYPRE_BigInt      num_cols_A = hypre_ParCSRMatrixGlobalNumCols(A);
5087    /* HYPRE_BigInt      num_rows_B = hypre_ParCSRMatrixGlobalNumRows(B); */
5088    /* HYPRE_BigInt      num_cols_B = hypre_ParCSRMatrixGlobalNumCols(B); */
5089 
5090    /* diag part of A */
5091    hypre_CSRMatrix    *A_diag   = hypre_ParCSRMatrixDiag(A);
5092    HYPRE_Int     *rownnz_diag_A = hypre_CSRMatrixRownnz(A_diag);
5093    HYPRE_Int  num_rownnz_diag_A = hypre_CSRMatrixNumRownnz(A_diag);
5094    HYPRE_Int    num_rows_diag_A = hypre_CSRMatrixNumRows(A_diag);
5095    HYPRE_Int    num_cols_diag_A = hypre_CSRMatrixNumCols(A_diag);
5096 
5097    /* off-diag part of A */
5098    hypre_CSRMatrix    *A_offd   = hypre_ParCSRMatrixOffd(A);
5099    HYPRE_Int     *rownnz_offd_A = hypre_CSRMatrixRownnz(A_offd);
5100    HYPRE_Int  num_rownnz_offd_A = hypre_CSRMatrixNumRownnz(A_offd);
5101    HYPRE_Int    num_rows_offd_A = hypre_CSRMatrixNumRows(A_offd);
5102    HYPRE_Int    num_cols_offd_A = hypre_CSRMatrixNumCols(A_offd);
5103    HYPRE_BigInt *col_map_offd_A = hypre_ParCSRMatrixColMapOffd(A);
5104    HYPRE_Int          *A2C_offd;
5105 
5106    /* diag part of B */
5107    hypre_CSRMatrix    *B_diag   = hypre_ParCSRMatrixDiag(B);
5108    HYPRE_Int     *rownnz_diag_B = hypre_CSRMatrixRownnz(B_diag);
5109    HYPRE_Int  num_rownnz_diag_B = hypre_CSRMatrixNumRownnz(B_diag);
5110    HYPRE_Int    num_rows_diag_B = hypre_CSRMatrixNumRows(B_diag);
5111    /* HYPRE_Int    num_cols_diag_B = hypre_CSRMatrixNumCols(B_diag); */
5112 
5113    /* off-diag part of B */
5114    hypre_CSRMatrix    *B_offd   = hypre_ParCSRMatrixOffd(B);
5115    HYPRE_Int     *rownnz_offd_B = hypre_CSRMatrixRownnz(B_offd);
5116    HYPRE_Int  num_rownnz_offd_B = hypre_CSRMatrixNumRownnz(B_offd);
5117    HYPRE_Int    num_rows_offd_B = hypre_CSRMatrixNumRows(B_offd);
5118    HYPRE_Int    num_cols_offd_B = hypre_CSRMatrixNumCols(B_offd);
5119    HYPRE_BigInt *col_map_offd_B = hypre_ParCSRMatrixColMapOffd(B);
5120    HYPRE_Int          *B2C_offd;
5121 
5122    /* C data */
5123    hypre_ParCSRMatrix   *C;
5124    hypre_CSRMatrix      *C_diag;
5125    hypre_CSRMatrix      *C_offd;
5126    HYPRE_BigInt         *col_map_offd_C;
5127    HYPRE_Int            *C_diag_i, *C_offd_i;
5128    HYPRE_Int            *rownnz_diag_C = NULL;
5129    HYPRE_Int            *rownnz_offd_C = NULL;
5130    HYPRE_Int             num_rownnz_diag_C;
5131    HYPRE_Int             num_rownnz_offd_C;
5132    HYPRE_Int             num_rows_diag_C = num_rows_diag_A;
5133    HYPRE_Int             num_cols_diag_C = num_cols_diag_A;
5134    HYPRE_Int             num_rows_offd_C = num_rows_offd_A;
5135    HYPRE_Int             num_cols_offd_C = num_cols_offd_A + num_cols_offd_B;
5136    HYPRE_Int            *twspace;
5137 
5138    HYPRE_MemoryLocation  memory_location_A = hypre_ParCSRMatrixMemoryLocation(A);
5139    HYPRE_MemoryLocation  memory_location_B = hypre_ParCSRMatrixMemoryLocation(B);
5140 
5141    /* RL: TODO cannot guarantee, maybe should never assert
5142    hypre_assert(memory_location_A == memory_location_B);
5143    */
5144 
5145    /* RL: in the case of A=H, B=D, or A=D, B=H, let C = D,
5146     * not sure if this is the right thing to do.
5147     * Also, need something like this in other places
5148     * TODO */
5149    HYPRE_MemoryLocation  memory_location_C = hypre_max(memory_location_A, memory_location_B);
5150 
5151    HYPRE_ANNOTATE_FUNC_BEGIN;
5152 
5153    /* Allocate memory */
5154    twspace  = hypre_TAlloc(HYPRE_Int, hypre_NumThreads(), HYPRE_MEMORY_HOST);
5155    C_diag_i = hypre_CTAlloc(HYPRE_Int, num_rows_diag_A + 1, memory_location_C);
5156    C_offd_i = hypre_CTAlloc(HYPRE_Int, num_rows_offd_A + 1, memory_location_C);
5157    col_map_offd_C = hypre_TAlloc(HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_HOST);
5158 
5159    /* Compute num_cols_offd_C, A2C_offd, and B2C_offd*/
5160    A2C_offd = hypre_TAlloc(HYPRE_Int, num_cols_offd_A, HYPRE_MEMORY_HOST);
5161    B2C_offd = hypre_TAlloc(HYPRE_Int, num_cols_offd_B, HYPRE_MEMORY_HOST);
5162    hypre_union2(num_cols_offd_A, col_map_offd_A,
5163                 num_cols_offd_B, col_map_offd_B,
5164                 &num_cols_offd_C, col_map_offd_C,
5165                 A2C_offd, B2C_offd);
5166 
5167    /* Set nonzero rows data of diag_C */
5168    num_rownnz_diag_C = num_rows_diag_A;
5169    if ((num_rownnz_diag_A < num_rows_diag_A) &&
5170        (num_rownnz_diag_B < num_rows_diag_B))
5171    {
5172       hypre_MergeOrderedArrays( num_rownnz_diag_A,  rownnz_diag_A,
5173                                 num_rownnz_diag_B,  rownnz_diag_B,
5174                                &num_rownnz_diag_C, &rownnz_diag_C);
5175    }
5176 
5177    /* Set nonzero rows data of offd_C */
5178    num_rownnz_offd_C = num_rows_offd_A;
5179    if ((num_rownnz_offd_A < num_rows_offd_A) &&
5180        (num_rownnz_offd_B < num_rows_offd_B))
5181    {
5182       hypre_MergeOrderedArrays( num_rownnz_offd_A,  rownnz_offd_A,
5183                                 num_rownnz_offd_B,  rownnz_offd_B,
5184                                &num_rownnz_offd_C, &rownnz_offd_C);
5185    }
5186 
5187    /* Set diag_C */
5188 #ifdef HYPRE_USING_OPENMP
5189 #pragma omp parallel
5190 #endif
5191    {
5192       HYPRE_Int   ii, num_threads;
5193       HYPRE_Int   size, rest, ns, ne;
5194       HYPRE_Int  *marker_diag;
5195       HYPRE_Int  *marker_offd;
5196 
5197       ii = hypre_GetThreadNum();
5198       num_threads = hypre_NumActiveThreads();
5199 
5200      /*-----------------------------------------------------------------------
5201       *  Compute C_diag = alpha*A_diag + beta*B_diag
5202       *-----------------------------------------------------------------------*/
5203 
5204       size = num_rownnz_diag_C/num_threads;
5205       rest = num_rownnz_diag_C - size*num_threads;
5206       if (ii < rest)
5207       {
5208          ns = ii*size+ii;
5209          ne = (ii+1)*size+ii+1;
5210       }
5211       else
5212       {
5213          ns = ii*size+rest;
5214          ne = (ii+1)*size+rest;
5215       }
5216 
5217       marker_diag = hypre_TAlloc(HYPRE_Int, num_cols_diag_A, HYPRE_MEMORY_HOST);
5218       hypre_CSRMatrixAddFirstPass(ns, ne, twspace, marker_diag,
5219                                   NULL, NULL, A_diag, B_diag,
5220                                   num_rows_diag_C, num_rownnz_diag_C,
5221                                   num_cols_diag_C, rownnz_diag_C,
5222                                   memory_location_C, C_diag_i, &C_diag);
5223       hypre_CSRMatrixAddSecondPass(ns, ne, twspace, marker_diag,
5224                                    NULL, NULL, rownnz_diag_C,
5225                                    alpha, beta, A_diag, B_diag, C_diag);
5226       hypre_TFree(marker_diag, HYPRE_MEMORY_HOST);
5227 
5228      /*-----------------------------------------------------------------------
5229       *  Compute C_offd = alpha*A_offd + beta*B_offd
5230       *-----------------------------------------------------------------------*/
5231 
5232       size = num_rownnz_offd_C/num_threads;
5233       rest = num_rownnz_offd_C - size*num_threads;
5234       if (ii < rest)
5235       {
5236          ns = ii*size+ii;
5237          ne = (ii+1)*size+ii+1;
5238       }
5239       else
5240       {
5241          ns = ii*size+rest;
5242          ne = (ii+1)*size+rest;
5243       }
5244 
5245       marker_offd = hypre_TAlloc(HYPRE_Int, num_cols_offd_C, HYPRE_MEMORY_HOST);
5246       hypre_CSRMatrixAddFirstPass(ns, ne, twspace, marker_offd,
5247                                   A2C_offd, B2C_offd, A_offd, B_offd,
5248                                   num_rows_offd_C, num_rownnz_offd_C,
5249                                   num_cols_offd_C, rownnz_offd_C,
5250                                   memory_location_C, C_offd_i, &C_offd);
5251       hypre_CSRMatrixAddSecondPass(ns, ne, twspace, marker_offd,
5252                                    A2C_offd, B2C_offd, rownnz_offd_C,
5253                                    alpha, beta, A_offd, B_offd, C_offd);
5254       hypre_TFree(marker_offd, HYPRE_MEMORY_HOST);
5255    } /* end of omp parallel region */
5256 
5257    /* Free memory */
5258    hypre_TFree(twspace, HYPRE_MEMORY_HOST);
5259    hypre_TFree(A2C_offd, HYPRE_MEMORY_HOST);
5260    hypre_TFree(B2C_offd, HYPRE_MEMORY_HOST);
5261 
5262    /* Create ParCSRMatrix C */
5263    C = hypre_ParCSRMatrixCreate(comm,
5264                                 num_rows_A,
5265                                 num_cols_A,
5266                                 hypre_ParCSRMatrixRowStarts(A),
5267                                 hypre_ParCSRMatrixColStarts(A),
5268                                 num_cols_offd_C,
5269                                 hypre_CSRMatrixNumNonzeros(C_diag),
5270                                 hypre_CSRMatrixNumNonzeros(C_offd));
5271 
5272    hypre_CSRMatrixDestroy(hypre_ParCSRMatrixDiag(C));
5273    hypre_CSRMatrixDestroy(hypre_ParCSRMatrixOffd(C));
5274    hypre_ParCSRMatrixDiag(C) = C_diag;
5275    hypre_ParCSRMatrixOffd(C) = C_offd;
5276    hypre_ParCSRMatrixColMapOffd(C) = col_map_offd_C;
5277    hypre_ParCSRMatrixSetNumNonzeros(C);
5278    hypre_ParCSRMatrixDNumNonzeros(C) = (HYPRE_Real) hypre_ParCSRMatrixNumNonzeros(C);
5279 
5280    /* create CommPkg of C */
5281    hypre_MatvecCommPkgCreate(C);
5282 
5283    *C_ptr = C;
5284 
5285    HYPRE_ANNOTATE_FUNC_END;
5286 
5287    return hypre_error_flag;
5288 }
5289 
5290 HYPRE_Int
5291 hypre_ParCSRMatrixAdd( HYPRE_Complex        alpha,
5292                        hypre_ParCSRMatrix  *A,
5293                        HYPRE_Complex        beta,
5294                        hypre_ParCSRMatrix  *B,
5295                        hypre_ParCSRMatrix **C_ptr )
5296 {
5297    hypre_assert(hypre_ParCSRMatrixGlobalNumRows(A) == hypre_ParCSRMatrixGlobalNumRows(B));
5298    hypre_assert(hypre_ParCSRMatrixGlobalNumCols(A) == hypre_ParCSRMatrixGlobalNumCols(B));
5299    hypre_assert(hypre_ParCSRMatrixNumRows(A) == hypre_ParCSRMatrixNumRows(B));
5300    hypre_assert(hypre_ParCSRMatrixNumCols(A) == hypre_ParCSRMatrixNumCols(B));
5301 
5302 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
5303    if ( hypre_GetExecPolicy2( hypre_ParCSRMatrixMemoryLocation(A),
5304                               hypre_ParCSRMatrixMemoryLocation(B) ) == HYPRE_EXEC_DEVICE )
5305    {
5306       hypre_ParCSRMatrixAddDevice(alpha, A, beta, B, C_ptr);
5307    }
5308    else
5309 #endif
5310    {
5311       hypre_ParCSRMatrixAddHost(alpha, A, beta, B, C_ptr);
5312    }
5313 
5314    return hypre_error_flag;
5315 }
5316 
5317 /*--------------------------------------------------------------------------
5318  * hypre_ParCSRMatrixFnorm
5319  *--------------------------------------------------------------------------*/
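/*
 * Computes the Frobenius norm
 *    ||A||_F = sqrt( sum_{i,j} |a_{ij}|^2 ),
 * assembled below as sqrt( Allreduce( ||A_diag||_F^2 + ||A_offd||_F^2 ) ).
 */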
5320 
5321 HYPRE_Real
5322 hypre_ParCSRMatrixFnorm( hypre_ParCSRMatrix *A )
5323 {
5324    MPI_Comm   comm = hypre_ParCSRMatrixComm(A);
5325    HYPRE_Real f_diag, f_offd, local_result, result;
5326 
5327    f_diag = hypre_CSRMatrixFnorm(hypre_ParCSRMatrixDiag(A));
5328    f_offd = hypre_CSRMatrixFnorm(hypre_ParCSRMatrixOffd(A));
5329    local_result = f_diag * f_diag + f_offd * f_offd;
5330 
5331    hypre_MPI_Allreduce(&local_result, &result, 1, HYPRE_MPI_REAL, hypre_MPI_SUM, comm);
5332 
5333    return sqrt(result);
5334 }
5335 
5336 /*--------------------------------------------------------------------------
5337  * hypre_ParCSRMatrixInfNorm
5338  *
5339  * Computes the infinity norm of A:
5340  *
5341  *       norm = max_{i} sum_{j} |A_{ij}|
5342  *--------------------------------------------------------------------------*/
5343 
5344 HYPRE_Int
5345 hypre_ParCSRMatrixInfNorm( hypre_ParCSRMatrix  *A,
5346                            HYPRE_Real          *norm )
5347 {
5348    MPI_Comm            comm     = hypre_ParCSRMatrixComm(A);
5349 
5350    /* diag part of A */
5351    hypre_CSRMatrix    *A_diag   = hypre_ParCSRMatrixDiag(A);
5352    HYPRE_Complex      *A_diag_a = hypre_CSRMatrixData(A_diag);
5353    HYPRE_Int          *A_diag_i = hypre_CSRMatrixI(A_diag);
5354    HYPRE_Int    num_rows_diag_A = hypre_CSRMatrixNumRows(A_diag);
5355 
5356    /* off-diag part of A */
5357    hypre_CSRMatrix    *A_offd   = hypre_ParCSRMatrixOffd(A);
5358    HYPRE_Complex      *A_offd_a = hypre_CSRMatrixData(A_offd);
5359    HYPRE_Int          *A_offd_i = hypre_CSRMatrixI(A_offd);
5360 
5361    /* Local variables */
5362    HYPRE_Int           i, j;
5363    HYPRE_Real          maxsum = 0.0;
5364    HYPRE_Real          rowsum;
5365 
5366 #ifdef _MSC_VER
5367 #ifdef HYPRE_USING_OPENMP
5368 #pragma omp parallel private(i,j,rowsum)
5369 #endif
5370    {
5371       HYPRE_Real maxsum_local;
5372 
5373       maxsum_local = 0.0;
5374 #ifdef HYPRE_USING_OPENMP
5375 #pragma omp for HYPRE_SMP_SCHEDULE
5376 #endif
5377       for (i = 0; i < num_rows_diag_A; i++)
5378       {
5379          rowsum = 0.0;
5380          for (j = A_diag_i[i]; j < A_diag_i[i+1]; j++)
5381          {
5382             rowsum += hypre_cabs(A_diag_a[j]);
5383          }
5384          for (j = A_offd_i[i]; j < A_offd_i[i+1]; j++)
5385          {
5386             rowsum += hypre_cabs(A_offd_a[j]);
5387          }
5388 
5389          maxsum_local = hypre_max(maxsum_local, rowsum);
5390       }
5391 
5392 #ifdef HYPRE_USING_OPENMP
5393 #pragma omp critical
5394 #endif
5395       {
5396          maxsum = hypre_max(maxsum, maxsum_local);
5397       }
5398    }
5399 #else
5400 #ifdef HYPRE_USING_OPENMP
5401 #pragma omp parallel for private(i,j,rowsum) reduction(max:maxsum) HYPRE_SMP_SCHEDULE
5402 #endif
5403    for (i = 0; i < num_rows_diag_A; i++)
5404    {
5405       rowsum = 0.0;
5406       for (j = A_diag_i[i]; j < A_diag_i[i+1]; j++)
5407       {
5408          rowsum += hypre_cabs(A_diag_a[j]);
5409       }
5410       for (j = A_offd_i[i]; j < A_offd_i[i+1]; j++)
5411       {
5412          rowsum += hypre_cabs(A_offd_a[j]);
5413       }
5414 
5415       maxsum = hypre_max(maxsum, rowsum);
5416    }
5417 #endif
5418 
5419    hypre_MPI_Allreduce(&maxsum, norm, 1, HYPRE_MPI_REAL, hypre_MPI_MAX, comm);
5420 
5421    return hypre_error_flag;
5422 }
5423 
5424 /*--------------------------------------------------------------------------
5425  * hypre_ExchangeExternalRowsInit
5426  *--------------------------------------------------------------------------*/
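/*
 * Init/Wait usage sketch (illustrative; "B_ext" is assumed to hold rows
 * previously gathered with comm_pkg_A, e.g. by hypre_ParCSRMatrixExtractBExt,
 * and the Wait call returns the rows exchanged back to the send side):
 *
 *    void            *req;
 *    hypre_CSRMatrix *B_int;
 *    hypre_ExchangeExternalRowsInit(B_ext, comm_pkg_A, &req);
 *    ... overlap independent local work here ...
 *    B_int = hypre_ExchangeExternalRowsWait(req);
 */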
5427 
5428 HYPRE_Int
5429 hypre_ExchangeExternalRowsInit( hypre_CSRMatrix      *B_ext,
5430                                 hypre_ParCSRCommPkg  *comm_pkg_A,
5431                                 void                **request_ptr)
5432 {
5433    MPI_Comm   comm             = hypre_ParCSRCommPkgComm(comm_pkg_A);
5434    HYPRE_Int  num_recvs        = hypre_ParCSRCommPkgNumRecvs(comm_pkg_A);
5435    HYPRE_Int *recv_procs       = hypre_ParCSRCommPkgRecvProcs(comm_pkg_A);
5436    HYPRE_Int *recv_vec_starts  = hypre_ParCSRCommPkgRecvVecStarts(comm_pkg_A);
5437    HYPRE_Int  num_sends        = hypre_ParCSRCommPkgNumSends(comm_pkg_A);
5438    HYPRE_Int *send_procs       = hypre_ParCSRCommPkgSendProcs(comm_pkg_A);
5439    HYPRE_Int *send_map_starts  = hypre_ParCSRCommPkgSendMapStarts(comm_pkg_A);
5440 
5441    HYPRE_Int  num_elmts_send   = send_map_starts[num_sends];
5442    HYPRE_Int  num_elmts_recv   = recv_vec_starts[num_recvs];
5443 
5444    HYPRE_Int     *B_ext_i      = B_ext ? hypre_CSRMatrixI(B_ext) : NULL;
5445    HYPRE_BigInt  *B_ext_j      = B_ext ? hypre_CSRMatrixBigJ(B_ext) : NULL;
5446    HYPRE_Complex *B_ext_data   = B_ext ? hypre_CSRMatrixData(B_ext) : NULL;
5447    HYPRE_Int      B_ext_ncols  = B_ext ? hypre_CSRMatrixNumCols(B_ext) : 0;
5448    HYPRE_Int      B_ext_nrows  = B_ext ? hypre_CSRMatrixNumRows(B_ext) : 0;
5449    HYPRE_Int     *B_ext_rownnz = hypre_CTAlloc(HYPRE_Int, B_ext_nrows, HYPRE_MEMORY_HOST);
5450 
5451    hypre_assert(num_elmts_recv == B_ext_nrows);
5452 
5453    /* output matrix */
5454    hypre_CSRMatrix *B_int;
5455    HYPRE_Int        B_int_nrows = num_elmts_send;
5456    HYPRE_Int        B_int_ncols = B_ext_ncols;
5457    HYPRE_Int       *B_int_i     = hypre_TAlloc(HYPRE_Int, B_int_nrows + 1, HYPRE_MEMORY_HOST);
5458    HYPRE_BigInt    *B_int_j     = NULL;
5459    HYPRE_Complex   *B_int_data  = NULL;
5460    HYPRE_Int        B_int_nnz;
5461 
5462    hypre_ParCSRCommHandle *comm_handle, *comm_handle_j, *comm_handle_a;
5463    hypre_ParCSRCommPkg    *comm_pkg_j;
5464 
5465    HYPRE_Int *jdata_recv_vec_starts;
5466    HYPRE_Int *jdata_send_map_starts;
5467 
5468    HYPRE_Int i;
5469    HYPRE_Int num_procs;
5470    void **vrequest;
5471 
5472    hypre_MPI_Comm_size(comm, &num_procs);
5473 
5474    jdata_send_map_starts = hypre_TAlloc(HYPRE_Int, num_sends+1, HYPRE_MEMORY_HOST);
5475 
5476    /*--------------------------------------------------------------------------
5477     * B_ext_rownnz contains the number of elements of row j
5478     * (to be determined through send_map_elmnts on the receiving end)
5479     *--------------------------------------------------------------------------*/
5480    for (i = 0; i < B_ext_nrows; i++)
5481    {
5482       B_ext_rownnz[i] = B_ext_i[i+1] - B_ext_i[i];
5483    }
5484 
5485    /*--------------------------------------------------------------------------
5486     * initialize communication: send/recv the row nnz
5487     * (note the use of comm_pkg_A, mode 12, as in a transpose matvec)
5488     *--------------------------------------------------------------------------*/
5489    comm_handle = hypre_ParCSRCommHandleCreate(12, comm_pkg_A, B_ext_rownnz, B_int_i + 1);
5490 
5491    jdata_recv_vec_starts = hypre_TAlloc(HYPRE_Int, num_recvs + 1, HYPRE_MEMORY_HOST);
5492    jdata_recv_vec_starts[0] = 0;
5493    for (i = 1; i <= num_recvs; i++)
5494    {
5495       jdata_recv_vec_starts[i] = B_ext_i[recv_vec_starts[i]];
5496    }
5497 
5498    comm_pkg_j = hypre_CTAlloc(hypre_ParCSRCommPkg, 1, HYPRE_MEMORY_HOST);
5499    hypre_ParCSRCommPkgComm(comm_pkg_j)      = comm;
5500    hypre_ParCSRCommPkgNumSends(comm_pkg_j)  = num_recvs;
5501    hypre_ParCSRCommPkgNumRecvs(comm_pkg_j)  = num_sends;
5502    hypre_ParCSRCommPkgSendProcs(comm_pkg_j) = recv_procs;
5503    hypre_ParCSRCommPkgRecvProcs(comm_pkg_j) = send_procs;
5504 
5505    hypre_ParCSRCommHandleDestroy(comm_handle);
5506 
5507    /*--------------------------------------------------------------------------
5508     * compute B_int: row nnz to row ptrs
5509     *--------------------------------------------------------------------------*/
5510    B_int_i[0] = 0;
5511    for (i = 1; i <= B_int_nrows; i++)
5512    {
5513       B_int_i[i] += B_int_i[i-1];
5514    }
5515 
5516    B_int_nnz = B_int_i[B_int_nrows];
5517 
5518    B_int_j    = hypre_TAlloc(HYPRE_BigInt,  B_int_nnz, HYPRE_MEMORY_HOST);
5519    B_int_data = hypre_TAlloc(HYPRE_Complex, B_int_nnz, HYPRE_MEMORY_HOST);
5520 
5521    for (i = 0; i <= num_sends; i++)
5522    {
5523       jdata_send_map_starts[i] = B_int_i[send_map_starts[i]];
5524    }
5525 
5526    /* note the order of send/recv is reversed */
5527    hypre_ParCSRCommPkgRecvVecStarts(comm_pkg_j) = jdata_send_map_starts;
5528    hypre_ParCSRCommPkgSendMapStarts(comm_pkg_j) = jdata_recv_vec_starts;
5529 
5530    /* send/recv CSR rows */
5531    comm_handle_a = hypre_ParCSRCommHandleCreate( 1, comm_pkg_j, B_ext_data, B_int_data);
5532    comm_handle_j = hypre_ParCSRCommHandleCreate(21, comm_pkg_j, B_ext_j, B_int_j);
5533 
5534    /* create CSR */
5535    B_int = hypre_CSRMatrixCreate(B_int_nrows, B_int_ncols, B_int_nnz);
5536    hypre_CSRMatrixMemoryLocation(B_int) = HYPRE_MEMORY_HOST;
5537    hypre_CSRMatrixI(B_int)    = B_int_i;
5538    hypre_CSRMatrixBigJ(B_int) = B_int_j;
5539    hypre_CSRMatrixData(B_int) = B_int_data;
5540 
5541    /* output */
5542    vrequest = hypre_TAlloc(void *, 4, HYPRE_MEMORY_HOST);
5543    vrequest[0] = (void *) comm_handle_j;
5544    vrequest[1] = (void *) comm_handle_a;
5545    vrequest[2] = (void *) B_int;
5546    vrequest[3] = (void *) comm_pkg_j;
5547 
5548    *request_ptr = (void *) vrequest;
5549 
5550    hypre_TFree(B_ext_rownnz, HYPRE_MEMORY_HOST);
5551 
5552    return hypre_error_flag;
5553 }
5554 
5555 /*--------------------------------------------------------------------------
5556  * hypre_ExchangeExternalRowsWait
5557  *--------------------------------------------------------------------------*/
5558 
5559 hypre_CSRMatrix*
5560 hypre_ExchangeExternalRowsWait(void *vrequest)
5561 {
5562    void **request = (void **) vrequest;
5563 
5564    hypre_ParCSRCommHandle *comm_handle_j = (hypre_ParCSRCommHandle *) request[0];
5565    hypre_ParCSRCommHandle *comm_handle_a = (hypre_ParCSRCommHandle *) request[1];
5566    hypre_CSRMatrix        *B_int         = (hypre_CSRMatrix *)        request[2];
5567    hypre_ParCSRCommPkg    *comm_pkg_j    = (hypre_ParCSRCommPkg *)    request[3];
5568 
5569    /* communication done */
5570    hypre_ParCSRCommHandleDestroy(comm_handle_a);
5571    hypre_ParCSRCommHandleDestroy(comm_handle_j);
5572 
5573    hypre_TFree(hypre_ParCSRCommPkgSendMapStarts(comm_pkg_j), HYPRE_MEMORY_HOST);
5574    hypre_TFree(hypre_ParCSRCommPkgRecvVecStarts(comm_pkg_j), HYPRE_MEMORY_HOST);
5575    hypre_TFree(comm_pkg_j, HYPRE_MEMORY_HOST);
5576 
5577    hypre_TFree(request, HYPRE_MEMORY_HOST);
5578 
5579    return B_int;
5580 }
5581 
5582 
5583 /*--------------------------------------------------------------------------
5584  * hypre_ParCSRMatrixExtractSubmatrixFC
5585  *
5586  * extract submatrix A_{FF}, A_{FC}, A_{CF} or A_{CC}
5587  * char job[2] = "FF", "FC", "CF" or "CC"
5588  *--------------------------------------------------------------------------*/
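/*
 * Usage sketch (illustrative; CF_marker marks F points with negative
 * values as assumed below, cpts_starts comes from the AMG coarsening,
 * and 0.0 is passed for strength_thresh as an assumed choice):
 *
 *    hypre_ParCSRMatrix *A_FF = NULL, *A_FC = NULL;
 *    hypre_ParCSRMatrixExtractSubmatrixFC(A, CF_marker, cpts_starts,
 *                                         "FF", &A_FF, 0.0);
 *    hypre_ParCSRMatrixExtractSubmatrixFC(A, CF_marker, cpts_starts,
 *                                         "FC", &A_FC, 0.0);
 */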
5589 
5590 HYPRE_Int
5591 hypre_ParCSRMatrixExtractSubmatrixFC( hypre_ParCSRMatrix  *A,
5592                                       HYPRE_Int           *CF_marker,
5593                                       HYPRE_BigInt        *cpts_starts_in,
5594                                       const char          *job,
5595                                       hypre_ParCSRMatrix **B_ptr,
5596                                       HYPRE_Real           strength_thresh)
5597 {
5598    MPI_Comm                 comm     = hypre_ParCSRMatrixComm(A);
5599    hypre_ParCSRCommPkg     *comm_pkg = hypre_ParCSRMatrixCommPkg(A);
5600    hypre_ParCSRCommHandle  *comm_handle;
5601 
5602    /* diag part of A */
5603    hypre_CSRMatrix    *A_diag   = hypre_ParCSRMatrixDiag(A);
5604    HYPRE_Complex      *A_diag_a = hypre_CSRMatrixData(A_diag);
5605    HYPRE_Int          *A_diag_i = hypre_CSRMatrixI(A_diag);
5606    HYPRE_Int          *A_diag_j = hypre_CSRMatrixJ(A_diag);
5607    /* off-diag part of A */
5608    hypre_CSRMatrix    *A_offd   = hypre_ParCSRMatrixOffd(A);
5609    HYPRE_Complex      *A_offd_a = hypre_CSRMatrixData(A_offd);
5610    HYPRE_Int          *A_offd_i = hypre_CSRMatrixI(A_offd);
5611    HYPRE_Int          *A_offd_j = hypre_CSRMatrixJ(A_offd);
5612 
5613    HYPRE_Int           num_cols_A_offd = hypre_CSRMatrixNumCols(A_offd);
5614    //HYPRE_Int          *col_map_offd_A  = hypre_ParCSRMatrixColMapOffd(A);
5615 
5616    hypre_ParCSRMatrix *B;
5617    hypre_CSRMatrix    *B_diag, *B_offd;
5618    HYPRE_Real         *B_maxel_row;
5619    HYPRE_Int          *B_diag_i, *B_diag_j, *B_offd_i, *B_offd_j;
5620    HYPRE_Complex      *B_diag_a, *B_offd_a;
5621    HYPRE_Int           num_cols_B_offd;
5622    HYPRE_BigInt       *col_map_offd_B;
5623 
5624    HYPRE_Int           i, j, k, k1, k2;
5625    HYPRE_BigInt        B_nrow_global, B_ncol_global;
5626    HYPRE_Int           A_nlocal, B_nrow_local, B_ncol_local,
5627                        B_nnz_diag, B_nnz_offd;
5628    HYPRE_BigInt        total_global_fpts, total_global_cpts, *fpts_starts, *cpts_starts;
5629    HYPRE_Int           nf_local, nc_local;
5630    HYPRE_BigInt        big_nf_local;
5631    HYPRE_Int           row_set, col_set;
5632    HYPRE_BigInt       *B_row_starts, *B_col_starts, B_first_col;
5633    HYPRE_Int           my_id, num_procs;
5634    HYPRE_Int          *sub_idx_diag;
5635    HYPRE_BigInt       *sub_idx_offd;
5636    HYPRE_Int           num_sends;
5637    HYPRE_BigInt       *send_buf_data;
5638 
5639    /* MPI size and rank*/
5640    hypre_MPI_Comm_size(comm, &num_procs);
5641    hypre_MPI_Comm_rank(comm, &my_id);
5642 
5643    row_set = job[0] == 'F' ? -1 : 1;
5644    col_set = job[1] == 'F' ? -1 : 1;
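   /* row_set/col_set encode the selected row/column sets with the CF_marker
    * sign convention: -1 selects the F points, +1 selects the C points. */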

   A_nlocal = hypre_CSRMatrixNumRows(A_diag);

   /*-------------- global number of C points and local C points
    *               assuming cpts_starts is given */
   if (row_set == 1 || col_set == 1)
   {
      /* copy cpts_starts first */
      HYPRE_Int len;
      len = 2;
      cpts_starts = hypre_TAlloc(HYPRE_BigInt, len, HYPRE_MEMORY_HOST);
      hypre_TMemcpy(cpts_starts, cpts_starts_in, HYPRE_BigInt, len,
                    HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);

      if (my_id == (num_procs - 1))
      {
         total_global_cpts = cpts_starts[1];
      }
      hypre_MPI_Bcast(&total_global_cpts, 1, HYPRE_MPI_BIG_INT, num_procs-1, comm);
      nc_local = (HYPRE_Int)(cpts_starts[1] - cpts_starts[0]);
   }

   /*-------------- global number of F points, local F points, and F starts */
   if (row_set == -1 || col_set == -1)
   {
      nf_local = 0;
      for (i = 0; i < A_nlocal; i++)
      {
         if (CF_marker[i] < 0)
         {
            nf_local++;
         }
      }
      big_nf_local = (HYPRE_BigInt) nf_local;
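      /* An MPI scan of the local F-point counts gives this rank's ending
       * (one past the last) global F index; subtracting nf_local gives
       * the starting index. */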
      fpts_starts = hypre_TAlloc(HYPRE_BigInt, 2, HYPRE_MEMORY_HOST);
      hypre_MPI_Scan(&big_nf_local, fpts_starts+1, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
      fpts_starts[0] = fpts_starts[1] - nf_local;
      if (my_id == num_procs - 1)
      {
         total_global_fpts = fpts_starts[1];
      }
      hypre_MPI_Bcast(&total_global_fpts, 1, HYPRE_MPI_BIG_INT, num_procs-1, comm);
   }

   if (row_set == -1 && col_set == -1)
   {
      /* FF */
      B_nrow_local = nf_local;
      B_ncol_local = nf_local;
      B_nrow_global = total_global_fpts;
      B_ncol_global = total_global_fpts;

      B_row_starts = B_col_starts = fpts_starts;
   }
   else if (row_set == -1 && col_set == 1)
   {
      /* FC */
      B_nrow_local = nf_local;
      B_ncol_local = nc_local;
      B_nrow_global = total_global_fpts;
      B_ncol_global = total_global_cpts;

      B_row_starts = fpts_starts;
      B_col_starts = cpts_starts;
   }
   else if (row_set == 1 && col_set == -1)
   {
      /* CF */
      B_nrow_local = nc_local;
      B_ncol_local = nf_local;
      B_nrow_global = total_global_cpts;
      B_ncol_global = total_global_fpts;

      B_row_starts = cpts_starts;
      B_col_starts = fpts_starts;
   }
   else
   {
      /* CC */
      B_nrow_local = nc_local;
      B_ncol_local = nc_local;
      B_nrow_global = total_global_cpts;
      B_ncol_global = total_global_cpts;

      B_row_starts = B_col_starts = cpts_starts;
   }

   /* global index of my first col */
   B_first_col = B_col_starts[0];

   /* sub_idx_diag: [local] mapping from all local points (F+C) to the selected
    * set (F or C); entries not in the selected set are set to -1 */
   sub_idx_diag = hypre_TAlloc(HYPRE_Int, A_nlocal, HYPRE_MEMORY_HOST);
   for (i = 0, k = 0; i < A_nlocal; i++)
   {
      HYPRE_Int CF_i = CF_marker[i] > 0 ? 1 : -1;
      if (CF_i == col_set)
      {
         sub_idx_diag[i] = k++;
      }
      else
      {
         sub_idx_diag[i] = -1;
      }
   }

   hypre_assert(k == B_ncol_local);

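   /* Fill the send buffer with the global B column index of every selected
    * column this rank sends to its neighbors; columns outside the selected
    * set are sent as -1. */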
   num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
   send_buf_data = hypre_TAlloc(HYPRE_BigInt, hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
                                HYPRE_MEMORY_HOST);
   k = 0;
   for (i = 0; i < num_sends; i++)
   {
      /* start pos of elements sent to send_proc[i] */
      HYPRE_Int si = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
      HYPRE_Int ei = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i+1);
      /* loop through all elems to send_proc[i] */
      for (j = si; j < ei; j++)
      {
         /* j1: local idx */
         HYPRE_BigInt j1 = sub_idx_diag[hypre_ParCSRCommPkgSendMapElmt(comm_pkg, j)];
         if (j1 != -1)
         {
            /* adjust j1 to B global idx */
            j1 += B_first_col;
         }
         send_buf_data[k++] = j1;
      }
   }

   hypre_assert(k == hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends));

   /* recv buffer */
   sub_idx_offd = hypre_TAlloc(HYPRE_BigInt, num_cols_A_offd, HYPRE_MEMORY_HOST);
   /* create a handle to start communication. 21: for HYPRE_BigInt */
   comm_handle = hypre_ParCSRCommHandleCreate(21, comm_pkg, send_buf_data, sub_idx_offd);
   /* destroy the handle to finish communication */
   hypre_ParCSRCommHandleDestroy(comm_handle);

   for (i = 0, num_cols_B_offd = 0; i < num_cols_A_offd; i++)
   {
      if (sub_idx_offd[i] != -1)
      {
         num_cols_B_offd++;
      }
   }
   col_map_offd_B = hypre_TAlloc(HYPRE_BigInt, num_cols_B_offd, HYPRE_MEMORY_HOST);
   for (i = 0, k = 0; i < num_cols_A_offd; i++)
   {
      if (sub_idx_offd[i] != -1)
      {
         col_map_offd_B[k] = sub_idx_offd[i];
         sub_idx_offd[i] = k++;
      }
   }

   hypre_assert(k == num_cols_B_offd);
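   /* col_map_offd_B now holds the global B index of each kept off-diagonal
    * column, and sub_idx_offd has been overwritten with the corresponding
    * local column index in B_offd (-1 for columns that are dropped). */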

   /* count nnz and set ia */
   B_nnz_diag = B_nnz_offd = 0;
   B_maxel_row = hypre_TAlloc(HYPRE_Real, B_nrow_local, HYPRE_MEMORY_HOST);
   B_diag_i = hypre_TAlloc(HYPRE_Int, B_nrow_local+1, HYPRE_MEMORY_HOST);
   B_offd_i = hypre_TAlloc(HYPRE_Int, B_nrow_local+1, HYPRE_MEMORY_HOST);
   B_diag_i[0] = B_offd_i[0] = 0;

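   /* First pass: for each selected row, record the largest magnitude among its
    * non-diagonal entries (used when strength_thresh > 0), count the entries
    * kept in the diag and offd parts, and fill the row pointers. */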
   for (i = 0, k = 0; i < A_nlocal; i++)
   {
      HYPRE_Int CF_i = CF_marker[i] > 0 ? 1 : -1;
      if (CF_i != row_set)
      {
         continue;
      }
      k++;

      // Get max abs-value element of this row
      HYPRE_Real temp_max = 0;
      if (strength_thresh > 0)
      {
         for (j = A_diag_i[i]+1; j < A_diag_i[i+1]; j++)
         {
            if (hypre_cabs(A_diag_a[j]) > temp_max)
            {
               temp_max = hypre_cabs(A_diag_a[j]);
            }
         }
         for (j = A_offd_i[i]; j < A_offd_i[i+1]; j++)
         {
            if (hypre_cabs(A_offd_a[j]) > temp_max)
            {
               temp_max = hypre_cabs(A_offd_a[j]);
            }
         }
      }
      B_maxel_row[k-1] = temp_max;

      // add one for diagonal element
      j = A_diag_i[i];
      if (sub_idx_diag[A_diag_j[j]] != -1)
      {
         B_nnz_diag++;
      }

      // Count nnzs larger than tolerance times max row element
      for (j = A_diag_i[i]+1; j < A_diag_i[i+1]; j++)
      {
         if ( (sub_idx_diag[A_diag_j[j]] != -1) &&
              (hypre_cabs(A_diag_a[j]) > (strength_thresh*temp_max)) )
         {
            B_nnz_diag++;
         }
      }
      for (j = A_offd_i[i]; j < A_offd_i[i+1]; j++)
      {
         if ( (sub_idx_offd[A_offd_j[j]] != -1) &&
              (hypre_cabs(A_offd_a[j]) > (strength_thresh*temp_max)) )
         {
            B_nnz_offd++;
         }
      }
      B_diag_i[k] = B_nnz_diag;
      B_offd_i[k] = B_nnz_offd;
   }

   hypre_assert(k == B_nrow_local);

   B_diag_j = hypre_TAlloc(HYPRE_Int,     B_nnz_diag, HYPRE_MEMORY_HOST);
   B_diag_a = hypre_TAlloc(HYPRE_Complex, B_nnz_diag, HYPRE_MEMORY_HOST);
   B_offd_j = hypre_TAlloc(HYPRE_Int,     B_nnz_offd, HYPRE_MEMORY_HOST);
   B_offd_a = hypre_TAlloc(HYPRE_Complex, B_nnz_offd, HYPRE_MEMORY_HOST);

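   /* Second pass: fill the column indices and values of the kept entries,
    * using the per-row maxima recorded above. */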
   for (i = 0, k = 0, k1 = 0, k2 = 0; i < A_nlocal; i++)
   {
      HYPRE_Int CF_i = CF_marker[i] > 0 ? 1 : -1;
      if (CF_i != row_set)
      {
         continue;
      }
      HYPRE_Real maxel = B_maxel_row[k];
      k++;

      for (j = A_diag_i[i]; j < A_diag_i[i+1]; j++)
      {
         HYPRE_Int j1 = sub_idx_diag[A_diag_j[j]];
         if ( (j1 != -1) && ( (hypre_cabs(A_diag_a[j]) > (strength_thresh*maxel)) || j == A_diag_i[i] ) )
         {
            B_diag_j[k1] = j1;
            B_diag_a[k1] = A_diag_a[j];
            k1++;
         }
      }
      for (j = A_offd_i[i]; j < A_offd_i[i+1]; j++)
      {
         HYPRE_Int j1 = sub_idx_offd[A_offd_j[j]];
         if ((j1 != -1) && (hypre_cabs(A_offd_a[j]) > (strength_thresh*maxel)))
         {
            hypre_assert(j1 >= 0 && j1 < num_cols_B_offd);
            B_offd_j[k2] = j1;
            B_offd_a[k2] = A_offd_a[j];
            k2++;
         }
      }
   }

   hypre_assert(k1 == B_nnz_diag && k2 == B_nnz_offd);

   /* ready to create B = A(rowset, colset) */
   B = hypre_ParCSRMatrixCreate(comm,
                                B_nrow_global,
                                B_ncol_global,
                                B_row_starts,
                                B_col_starts,
                                num_cols_B_offd,
                                B_nnz_diag,
                                B_nnz_offd);

   B_diag = hypre_ParCSRMatrixDiag(B);
   hypre_CSRMatrixMemoryLocation(B_diag) = HYPRE_MEMORY_HOST;
   hypre_CSRMatrixData(B_diag) = B_diag_a;
   hypre_CSRMatrixI(B_diag)    = B_diag_i;
   hypre_CSRMatrixJ(B_diag)    = B_diag_j;

   B_offd = hypre_ParCSRMatrixOffd(B);
   hypre_CSRMatrixMemoryLocation(B_offd) = HYPRE_MEMORY_HOST;
   hypre_CSRMatrixData(B_offd) = B_offd_a;
   hypre_CSRMatrixI(B_offd)    = B_offd_i;
   hypre_CSRMatrixJ(B_offd)    = B_offd_j;

   hypre_ParCSRMatrixColMapOffd(B) = col_map_offd_B;

   hypre_ParCSRMatrixSetNumNonzeros(B);
   hypre_ParCSRMatrixDNumNonzeros(B) = (HYPRE_Real) hypre_ParCSRMatrixNumNonzeros(B);

   hypre_MatvecCommPkgCreate(B);

   *B_ptr = B;

   if (B_row_starts == B_col_starts)
   {
      hypre_TFree(B_row_starts, HYPRE_MEMORY_HOST);
   }
   else
   {
      hypre_TFree(B_row_starts, HYPRE_MEMORY_HOST);
      hypre_TFree(B_col_starts, HYPRE_MEMORY_HOST);
   }
   hypre_TFree(B_maxel_row, HYPRE_MEMORY_HOST);
   hypre_TFree(send_buf_data, HYPRE_MEMORY_HOST);
   hypre_TFree(sub_idx_diag, HYPRE_MEMORY_HOST);
   hypre_TFree(sub_idx_offd, HYPRE_MEMORY_HOST);

   return hypre_error_flag;
}

/* drop the entries that are not on the diagonal and smaller than:
 *    type 0: tol (TODO)
 *    type 1: tol*(1-norm of row)
 *    type 2: tol*(2-norm of row)
 *    type -1: tol*(infinity norm of row) */
HYPRE_Int
hypre_ParCSRMatrixDropSmallEntriesHost( hypre_ParCSRMatrix *A,
                                        HYPRE_Real          tol,
                                        HYPRE_Int           type)
{
   HYPRE_Int i, j, k, nnz_diag, nnz_offd, A_diag_i_i, A_offd_i_i;

   MPI_Comm         comm     = hypre_ParCSRMatrixComm(A);
   /* diag part of A */
   hypre_CSRMatrix *A_diag   = hypre_ParCSRMatrixDiag(A);
   HYPRE_Real      *A_diag_a = hypre_CSRMatrixData(A_diag);
   HYPRE_Int       *A_diag_i = hypre_CSRMatrixI(A_diag);
   HYPRE_Int       *A_diag_j = hypre_CSRMatrixJ(A_diag);
   /* off-diag part of A */
   hypre_CSRMatrix *A_offd   = hypre_ParCSRMatrixOffd(A);
   HYPRE_Real      *A_offd_a = hypre_CSRMatrixData(A_offd);
   HYPRE_Int       *A_offd_i = hypre_CSRMatrixI(A_offd);
   HYPRE_Int       *A_offd_j = hypre_CSRMatrixJ(A_offd);

   HYPRE_Int  num_cols_A_offd = hypre_CSRMatrixNumCols(A_offd);
   HYPRE_BigInt *col_map_offd_A  = hypre_ParCSRMatrixColMapOffd(A);
   HYPRE_Int *marker_offd = NULL;

   HYPRE_BigInt first_row  = hypre_ParCSRMatrixFirstRowIndex(A);
   HYPRE_Int nrow_local = hypre_CSRMatrixNumRows(A_diag);
   HYPRE_Int my_id, num_procs;
   /* MPI size and rank */
   hypre_MPI_Comm_size(comm, &num_procs);
   hypre_MPI_Comm_rank(comm, &my_id);

   marker_offd = hypre_CTAlloc(HYPRE_Int, num_cols_A_offd, HYPRE_MEMORY_HOST);

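   /* Entries are dropped in place: nnz_diag and nnz_offd track the compacted
    * positions being written, while A_diag_i_i and A_offd_i_i remember where
    * the next original row starts before its row pointer is overwritten. */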
   nnz_diag = nnz_offd = A_diag_i_i = A_offd_i_i = 0;
   for (i = 0; i < nrow_local; i++)
   {
      /* compute row norm */
      HYPRE_Real row_nrm = 0.0;
      for (j = A_diag_i_i; j < A_diag_i[i+1]; j++)
      {
         HYPRE_Complex v = A_diag_a[j];
         if (type == 1)
         {
            row_nrm += fabs(v);
         }
         else if (type == 2)
         {
            row_nrm += v*v;
         }
         else
         {
            row_nrm = hypre_max(row_nrm, fabs(v));
         }
      }
      if (num_procs > 1)
      {
         for (j = A_offd_i_i; j < A_offd_i[i+1]; j++)
         {
            HYPRE_Complex v = A_offd_a[j];
            if (type == 1)
            {
               row_nrm += fabs(v);
            }
            else if (type == 2)
            {
               row_nrm += v*v;
            }
            else
            {
               row_nrm = hypre_max(row_nrm, fabs(v));
            }
         }
      }

      if (type == 2)
      {
         row_nrm = sqrt(row_nrm);
      }

      /* drop small entries based on tol and row norm */
      for (j = A_diag_i_i; j < A_diag_i[i+1]; j++)
      {
         HYPRE_Int     col = A_diag_j[j];
         HYPRE_Complex val = A_diag_a[j];
         if (i == col || fabs(val) >= tol * row_nrm)
         {
            A_diag_j[nnz_diag] = col;
            A_diag_a[nnz_diag] = val;
            nnz_diag++;
         }
      }
      if (num_procs > 1)
      {
         for (j = A_offd_i_i; j < A_offd_i[i+1]; j++)
         {
            HYPRE_Int     col = A_offd_j[j];
            HYPRE_Complex val = A_offd_a[j];
            /* in normal cases a diagonal entry should not appear in A_offd,
             * but it can still happen, so keep it if it does */
            if (i + first_row == col_map_offd_A[col] || fabs(val) >= tol * row_nrm)
            {
               if (0 == marker_offd[col])
               {
                  marker_offd[col] = 1;
               }
               A_offd_j[nnz_offd] = col;
               A_offd_a[nnz_offd] = val;
               nnz_offd++;
            }
         }
      }
      A_diag_i_i = A_diag_i[i+1];
      A_offd_i_i = A_offd_i[i+1];
      A_diag_i[i+1] = nnz_diag;
      A_offd_i[i+1] = nnz_offd;
   }

   hypre_CSRMatrixNumNonzeros(A_diag) = nnz_diag;
   hypre_CSRMatrixNumNonzeros(A_offd) = nnz_offd;
   hypre_ParCSRMatrixSetNumNonzeros(A);
   hypre_ParCSRMatrixDNumNonzeros(A) = (HYPRE_Real) hypre_ParCSRMatrixNumNonzeros(A);

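   /* Compress col_map_offd_A to the columns that remain and remap the offd
    * column indices; marker_offd now stores the new local index of each
    * kept column. */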
   for (i = 0, k = 0; i < num_cols_A_offd; i++)
   {
      if (marker_offd[i])
      {
         col_map_offd_A[k] = col_map_offd_A[i];
         marker_offd[i] = k++;
      }
   }
   /* num_cols_A_offd = k; */
   hypre_CSRMatrixNumCols(A_offd) = k;
   for (i = 0; i < nnz_offd; i++)
   {
      A_offd_j[i] = marker_offd[A_offd_j[i]];
   }

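   /* The off-diagonal sparsity pattern changed, so rebuild the matvec
    * communication package. */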
   if ( hypre_ParCSRMatrixCommPkg(A) )
   {
      hypre_MatvecCommPkgDestroy( hypre_ParCSRMatrixCommPkg(A) );
   }
   hypre_MatvecCommPkgCreate(A);

   hypre_TFree(marker_offd, HYPRE_MEMORY_HOST);

   return hypre_error_flag;
}

/* drop the entries that are not on the diagonal and smaller than
 *    type 0: tol
 *    type 1: tol*(1-norm of row)
 *    type 2: tol*(2-norm of row)
 *    type -1: tol*(infinity norm of row)
 *    NOTE: some of the type options above are unavailable on either the host
 *          or the device */
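/* A minimal usage sketch (the tolerance value is an arbitrary example): drop
 * every off-diagonal entry smaller than 1e-4 times the infinity norm of its
 * row:
 *
 *    hypre_ParCSRMatrixDropSmallEntries(A, 1e-4, -1);
 */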
HYPRE_Int
hypre_ParCSRMatrixDropSmallEntries( hypre_ParCSRMatrix *A,
                                    HYPRE_Real          tol,
                                    HYPRE_Int           type)
{
   if (tol <= 0.0)
   {
      return hypre_error_flag;
   }

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
   hypre_GpuProfilingPushRange("ParCSRMatrixDropSmallEntries");
#endif

   HYPRE_Int ierr = 0;

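   /* Dispatch to the device or host implementation based on where the matrix
    * data currently resides. */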
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
   HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) );

   if (exec == HYPRE_EXEC_DEVICE)
   {
      ierr = hypre_ParCSRMatrixDropSmallEntriesDevice(A, tol, type);
   }
   else
#endif
   {
      ierr = hypre_ParCSRMatrixDropSmallEntriesHost(A, tol, type);
   }

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
   hypre_GpuProfilingPopRange();
#endif

   return ierr;
}

/* Scale ParCSR matrix: A = scalar * A
 * A: the target ParCSR matrix
 * scalar: scaling factor
 */
HYPRE_Int
hypre_ParCSRMatrixScale(hypre_ParCSRMatrix *A,
                        HYPRE_Complex       scalar)
{
   hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
   hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A);

   hypre_CSRMatrixScale(A_diag, scalar);
   hypre_CSRMatrixScale(A_offd, scalar);

   return hypre_error_flag;
}


/*--------------------------------------------------------------------------
 * hypre_ParCSRMatrixReorder:
 *
 * Reorders the column and data arrays of the diagonal component of a square
 * ParCSR matrix, such that the first entry in each row is the diagonal one.
 *--------------------------------------------------------------------------*/

HYPRE_Int
hypre_ParCSRMatrixReorder(hypre_ParCSRMatrix *A)
{
   HYPRE_BigInt      nrows_A = hypre_ParCSRMatrixGlobalNumRows(A);
   HYPRE_BigInt      ncols_A = hypre_ParCSRMatrixGlobalNumCols(A);
   hypre_CSRMatrix  *A_diag  = hypre_ParCSRMatrixDiag(A);

   if (nrows_A != ncols_A)
   {
      hypre_error_w_msg(HYPRE_ERROR_GENERIC, " Error! Matrix should be square!\n");
      return hypre_error_flag;
   }

   hypre_CSRMatrixReorder(A_diag);

   return hypre_error_flag;
}