1 /******************************************************************************
2  * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other
3  * HYPRE Project Developers. See the top-level COPYRIGHT file for details.
4  *
5  * SPDX-License-Identifier: (Apache-2.0 OR MIT)
6  ******************************************************************************/
7 #include "_hypre_parcsr_ls.h"
8 #include "_hypre_utilities.hpp"
9 #include "par_ilu.h"
10 #include "seq_mv.hpp"
11 
12 /* Setup ILU data */
13 HYPRE_Int
hypre_ILUSetup(void * ilu_vdata,hypre_ParCSRMatrix * A,hypre_ParVector * f,hypre_ParVector * u)14 hypre_ILUSetup( void               *ilu_vdata,
15                 hypre_ParCSRMatrix *A,
16                 hypre_ParVector    *f,
17                 hypre_ParVector    *u )
18 {
19    MPI_Comm             comm                 = hypre_ParCSRMatrixComm(A);
20    hypre_ParILUData     *ilu_data            = (hypre_ParILUData*) ilu_vdata;
21    hypre_ParILUData     *schur_precond_ilu;
22    hypre_ParNSHData     *schur_solver_nsh;
23 
24    HYPRE_Int            i;
25    // HYPRE_Int            num_threads;
26    // HYPRE_Int            debug_flag           = 0;
27 
28    /* pointers to ilu data */
29    HYPRE_Int            logging              = hypre_ParILUDataLogging(ilu_data);
30    HYPRE_Int            print_level          = hypre_ParILUDataPrintLevel(ilu_data);
31    HYPRE_Int            ilu_type             = hypre_ParILUDataIluType(ilu_data);
32    HYPRE_Int            nLU                  = hypre_ParILUDataNLU(ilu_data);
33    HYPRE_Int            nI                   = hypre_ParILUDataNI(ilu_data);
34    HYPRE_Int            fill_level           = hypre_ParILUDataLfil(ilu_data);
35    HYPRE_Int            max_row_elmts        = hypre_ParILUDataMaxRowNnz(ilu_data);
36    HYPRE_Real           *droptol             = hypre_ParILUDataDroptol(ilu_data);
37    HYPRE_Int            *CF_marker_array     = hypre_ParILUDataCFMarkerArray(ilu_data);
38    HYPRE_Int            *perm                = hypre_ParILUDataPerm(ilu_data);
39    HYPRE_Int            *qperm               = hypre_ParILUDataQPerm(ilu_data);
40    HYPRE_Real           tol_ddPQ             = hypre_ParILUDataTolDDPQ(ilu_data);
41 
42 #ifdef HYPRE_USING_CUDA
43    /* pointers to cusparse data, note that they are not NULL only when needed */
44    cusparseMatDescr_t      matL_des          = hypre_ParILUDataMatLMatrixDescription(ilu_data);
45    cusparseMatDescr_t      matU_des          = hypre_ParILUDataMatUMatrixDescription(ilu_data);
46    void                    *ilu_solve_buffer = hypre_ParILUDataILUSolveBuffer(ilu_data);//device memory
47    cusparseSolvePolicy_t   ilu_solve_policy  = hypre_ParILUDataILUSolvePolicy(ilu_data);
48    hypre_ParCSRMatrix      *Aperm            = hypre_ParILUDataAperm(ilu_data);
49    hypre_ParCSRMatrix      *R                = hypre_ParILUDataR(ilu_data);
50    hypre_ParCSRMatrix      *P                = hypre_ParILUDataP(ilu_data);
51    hypre_CSRMatrix         *matALU_d         = hypre_ParILUDataMatAILUDevice(ilu_data);
52    hypre_CSRMatrix         *matBLU_d         = hypre_ParILUDataMatBILUDevice(ilu_data);
53    hypre_CSRMatrix         *matSLU_d         = hypre_ParILUDataMatSILUDevice(ilu_data);
54    hypre_CSRMatrix         *matE_d           = hypre_ParILUDataMatEDevice(ilu_data);
55    hypre_CSRMatrix         *matF_d           = hypre_ParILUDataMatFDevice(ilu_data);
56    csrsv2Info_t            matAL_info        = hypre_ParILUDataMatALILUSolveInfo(ilu_data);
57    csrsv2Info_t            matAU_info        = hypre_ParILUDataMatAUILUSolveInfo(ilu_data);
58    csrsv2Info_t            matBL_info        = hypre_ParILUDataMatBLILUSolveInfo(ilu_data);
59    csrsv2Info_t            matBU_info        = hypre_ParILUDataMatBUILUSolveInfo(ilu_data);
60    csrsv2Info_t            matSL_info        = hypre_ParILUDataMatSLILUSolveInfo(ilu_data);
61    csrsv2Info_t            matSU_info        = hypre_ParILUDataMatSUILUSolveInfo(ilu_data);
62    HYPRE_Int               *A_diag_fake      = hypre_ParILUDataMatAFakeDiagonal(ilu_data);
63    hypre_Vector            *Ftemp_upper      = NULL;
64    hypre_Vector            *Utemp_lower      = NULL;
65 #endif
66 
67    hypre_ParCSRMatrix   *matA                = hypre_ParILUDataMatA(ilu_data);
68    hypre_ParCSRMatrix   *matL                = hypre_ParILUDataMatL(ilu_data);
69    HYPRE_Real           *matD                = hypre_ParILUDataMatD(ilu_data);
70    hypre_ParCSRMatrix   *matU                = hypre_ParILUDataMatU(ilu_data);
71    hypre_ParCSRMatrix   *matmL               = hypre_ParILUDataMatLModified(ilu_data);
72    HYPRE_Real           *matmD               = hypre_ParILUDataMatDModified(ilu_data);
73    hypre_ParCSRMatrix   *matmU               = hypre_ParILUDataMatUModified(ilu_data);
74    hypre_ParCSRMatrix   *matS                = hypre_ParILUDataMatS(ilu_data);
75 //   hypre_ParCSRMatrix   *matM                = NULL;
76 //   HYPRE_Int            nnzG;/* g stands for global */
77    HYPRE_Real           nnzS;/* total nnz in S */
78    HYPRE_Int            nnzS_offd;
79    HYPRE_Int            size_C/* total size of coarse grid */;
80 
81    HYPRE_Int            n                    = hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(A));
82    //   HYPRE_Int            m;/* m = n-LU */
83    /* reordering option */
84    HYPRE_Int            reordering_type = hypre_ParILUDataReorderingType(ilu_data);
85    HYPRE_Int            num_procs,  my_id;
86 
87    hypre_ParVector      *Utemp               = NULL;
88    hypre_ParVector      *Ftemp               = NULL;
89    hypre_ParVector      *Xtemp               = NULL;
90    hypre_ParVector      *Ytemp               = NULL;
91    HYPRE_Real           *uext                = NULL;
92    HYPRE_Real           *fext                = NULL;
93    hypre_ParVector      *rhs                 = NULL;
94    hypre_ParVector      *x                   = NULL;
95    hypre_ParVector      *F_array             = hypre_ParILUDataF(ilu_data);
96    hypre_ParVector      *U_array             = hypre_ParILUDataU(ilu_data);
97    hypre_ParVector      *residual            = hypre_ParILUDataResidual(ilu_data);
98    HYPRE_Real           *rel_res_norms       = hypre_ParILUDataRelResNorms(ilu_data);
99 
100    /* might need for Schur Complement */
101    HYPRE_Int            *u_end               = NULL;
102    HYPRE_Solver         schur_solver         = NULL;
103    HYPRE_Solver         schur_precond        = NULL;
104    HYPRE_Solver         schur_precond_gotten = NULL;
105 
106    /* help to build external */
107    hypre_ParCSRCommPkg  *comm_pkg;
108    HYPRE_Int            buffer_size;
109    HYPRE_Int            send_size;
110    HYPRE_Int            recv_size;
111 #ifdef HYPRE_USING_CUDA
112    HYPRE_Int            test_opt;
113 #endif
114    /* ----- begin -----*/
115    HYPRE_ANNOTATE_FUNC_BEGIN;
116 
117    //num_threads = hypre_NumThreads();
118 
119    hypre_MPI_Comm_size(comm,&num_procs);
120    hypre_MPI_Comm_rank(comm,&my_id);
121 
122 #ifdef HYPRE_USING_CUDA
123    /* create cuda and cusparse information when needed */
124    /* Use most of them from global information */
125    /* set matrix L descriptor, L is a lower triangular matrix with unit diagonal entries */
126    if (!matL_des)
127    {
128       HYPRE_CUSPARSE_CALL(cusparseCreateMatDescr(&(hypre_ParILUDataMatLMatrixDescription(ilu_data))));
129       matL_des = hypre_ParILUDataMatLMatrixDescription(ilu_data);
130       HYPRE_CUSPARSE_CALL(cusparseSetMatIndexBase(matL_des, CUSPARSE_INDEX_BASE_ZERO));
131       HYPRE_CUSPARSE_CALL(cusparseSetMatType(matL_des, CUSPARSE_MATRIX_TYPE_GENERAL));
132       HYPRE_CUSPARSE_CALL(cusparseSetMatFillMode(matL_des, CUSPARSE_FILL_MODE_LOWER));
133       HYPRE_CUSPARSE_CALL(cusparseSetMatDiagType(matL_des, CUSPARSE_DIAG_TYPE_UNIT));
134    }
135    /* set matrix U descriptor, U is an upper triangular matrix with non-unit diagonal entries */
136    if (!matU_des)
137    {
138       HYPRE_CUSPARSE_CALL(cusparseCreateMatDescr(&(hypre_ParILUDataMatUMatrixDescription(ilu_data))));
139       matU_des = hypre_ParILUDataMatUMatrixDescription(ilu_data);
140       HYPRE_CUSPARSE_CALL(cusparseSetMatIndexBase(matU_des, CUSPARSE_INDEX_BASE_ZERO));
141       HYPRE_CUSPARSE_CALL(cusparseSetMatType(matU_des, CUSPARSE_MATRIX_TYPE_GENERAL));
142       HYPRE_CUSPARSE_CALL(cusparseSetMatFillMode(matU_des, CUSPARSE_FILL_MODE_UPPER));
143       HYPRE_CUSPARSE_CALL(cusparseSetMatDiagType(matU_des, CUSPARSE_DIAG_TYPE_NON_UNIT));
144    }
145    if (!matAL_info)
146    {
147       HYPRE_CUSPARSE_CALL( (cusparseDestroyCsrsv2Info(hypre_ParILUDataMatALILUSolveInfo(ilu_data))) );
148       matAL_info = NULL;
149    }
150    if (!matAU_info)
151    {
152       HYPRE_CUSPARSE_CALL( (cusparseDestroyCsrsv2Info(hypre_ParILUDataMatAUILUSolveInfo(ilu_data))) );
153       matAU_info = NULL;
154    }
155    if (!matBL_info)
156    {
157       HYPRE_CUSPARSE_CALL( (cusparseDestroyCsrsv2Info(hypre_ParILUDataMatBLILUSolveInfo(ilu_data))) );
158       matBL_info = NULL;
159    }
160    if (!matBU_info)
161    {
162       HYPRE_CUSPARSE_CALL( (cusparseDestroyCsrsv2Info(hypre_ParILUDataMatBUILUSolveInfo(ilu_data))) );
163       matBU_info = NULL;
164    }
165    if (!matSL_info)
166    {
167       HYPRE_CUSPARSE_CALL( (cusparseDestroyCsrsv2Info(hypre_ParILUDataMatSLILUSolveInfo(ilu_data))) );
168       matSL_info = NULL;
169    }
170    if (!matSU_info)
171    {
172       HYPRE_CUSPARSE_CALL( (cusparseDestroyCsrsv2Info(hypre_ParILUDataMatSUILUSolveInfo(ilu_data))) );
173       matSU_info = NULL;
174    }
175    if (ilu_solve_buffer)
176    {
177       hypre_TFree(ilu_solve_buffer, HYPRE_MEMORY_DEVICE);
178       ilu_solve_buffer = NULL;
179    }
180    if (matALU_d)
181    {
182       hypre_CSRMatrixDestroy( matALU_d );
183       matALU_d = NULL;
184    }
185    if (matSLU_d)
186    {
187       hypre_CSRMatrixDestroy( matSLU_d );
188       matSLU_d = NULL;
189    }
190    if (matBLU_d)
191    {
192       hypre_CSRMatrixDestroy( matBLU_d );
193       matBLU_d = NULL;
194    }
195    if (matE_d)
196    {
197       hypre_CSRMatrixDestroy( matE_d );
198       matE_d = NULL;
199    }
200    if (matF_d)
201    {
202       hypre_CSRMatrixDestroy( matF_d );
203       matF_d = NULL;
204    }
205    if (Aperm)
206    {
207       hypre_ParCSRMatrixDestroy( Aperm );
208       Aperm = NULL;
209    }
210    if (R)
211    {
212       hypre_ParCSRMatrixDestroy( R );
213       R = NULL;
214    }
215    if (P)
216    {
217       hypre_ParCSRMatrixDestroy( P );
218       P = NULL;
219    }
220    if (hypre_ParILUDataXTemp(ilu_data))
221    {
222       hypre_ParVectorDestroy(hypre_ParILUDataXTemp(ilu_data));
223       hypre_ParILUDataXTemp(ilu_data) = NULL;
224    }
225    if (hypre_ParILUDataYTemp(ilu_data))
226    {
227       hypre_ParVectorDestroy(hypre_ParILUDataYTemp(ilu_data));
228       hypre_ParILUDataYTemp(ilu_data) = NULL;
229    }
230    if (hypre_ParILUDataFTempUpper(ilu_data))
231    {
232       hypre_SeqVectorDestroy(hypre_ParILUDataFTempUpper(ilu_data));
233       hypre_ParILUDataFTempUpper(ilu_data) = NULL;
234    }
235    if (hypre_ParILUDataUTempLower(ilu_data))
236    {
237       hypre_SeqVectorDestroy(hypre_ParILUDataUTempLower(ilu_data));
238       hypre_ParILUDataUTempLower(ilu_data) = NULL;
239    }
240    if (hypre_ParILUDataMatAFakeDiagonal(ilu_data))
241    {
242       hypre_TFree(hypre_ParILUDataMatAFakeDiagonal(ilu_data), HYPRE_MEMORY_DEVICE);
243       hypre_ParILUDataMatAFakeDiagonal(ilu_data) = NULL;
244    }
245 #endif
246 
247    /* Free Previously allocated data, if any not destroyed */
248    if (matL)
249    {
250       hypre_ParCSRMatrixDestroy(matL);
251       matL = NULL;
252    }
253    if (matU)
254    {
255       hypre_ParCSRMatrixDestroy(matU);
256       matU = NULL;
257    }
258    if (matmL)
259    {
260        hypre_ParCSRMatrixDestroy(matmL);
261        matmL = NULL;
262    }
263    if (matmU)
264    {
265       hypre_ParCSRMatrixDestroy(matmU);
266       matmU = NULL;
267    }
268    if (matS)
269    {
270       hypre_ParCSRMatrixDestroy(matS);
271       matS = NULL;
272    }
273    if (matD)
274    {
275       hypre_TFree(matD, HYPRE_MEMORY_DEVICE);
276       matD = NULL;
277    }
278    if (matmD)
279    {
280       hypre_TFree(matmD, HYPRE_MEMORY_DEVICE);
281       matmD = NULL;
282    }
283    if (CF_marker_array)
284    {
285       hypre_TFree(CF_marker_array, HYPRE_MEMORY_HOST);
286       CF_marker_array = NULL;
287    }
288 
289 
290    /* clear old l1_norm data, if created */
291    if (hypre_ParILUDataL1Norms(ilu_data))
292    {
293       hypre_TFree(hypre_ParILUDataL1Norms(ilu_data), HYPRE_MEMORY_HOST);
294       hypre_ParILUDataL1Norms(ilu_data) = NULL;
295    }
296 
297    /* setup temporary storage
298     * first check whether they already exist
299     */
300    if (hypre_ParILUDataUTemp(ilu_data))
301    {
302       hypre_ParVectorDestroy(hypre_ParILUDataUTemp(ilu_data));
303       hypre_ParILUDataUTemp(ilu_data) = NULL;
304    }
305    if (hypre_ParILUDataFTemp(ilu_data))
306    {
307       hypre_ParVectorDestroy(hypre_ParILUDataFTemp(ilu_data));
308       hypre_ParILUDataFTemp(ilu_data) = NULL;
309    }
310    if (hypre_ParILUDataUExt(ilu_data))
311    {
312       hypre_TFree(hypre_ParILUDataUExt(ilu_data), HYPRE_MEMORY_HOST);
313       hypre_ParILUDataUExt(ilu_data) = NULL;
314    }
315    if ( hypre_ParILUDataFExt(ilu_data))
316    {
317       hypre_TFree(hypre_ParILUDataFExt(ilu_data), HYPRE_MEMORY_HOST);
318       hypre_ParILUDataFExt(ilu_data) = NULL;
319    }
320    if ( hypre_ParILUDataUEnd(ilu_data))
321    {
322       hypre_TFree(hypre_ParILUDataUEnd(ilu_data), HYPRE_MEMORY_HOST);
323       hypre_ParILUDataUEnd(ilu_data) = NULL;
324    }
325    if (hypre_ParILUDataRhs(ilu_data))
326    {
327       hypre_ParVectorDestroy(hypre_ParILUDataRhs(ilu_data));
328       hypre_ParILUDataRhs(ilu_data) = NULL;
329    }
330    if (hypre_ParILUDataX(ilu_data))
331    {
332       hypre_ParVectorDestroy(hypre_ParILUDataX(ilu_data));
333       hypre_ParILUDataX(ilu_data) = NULL;
334    }
335    if (hypre_ParILUDataResidual(ilu_data))
336    {
337       hypre_ParVectorDestroy(hypre_ParILUDataResidual(ilu_data));
338       hypre_ParILUDataResidual(ilu_data) = NULL;
339    }
340    if (hypre_ParILUDataRelResNorms(ilu_data))
341    {
342       hypre_TFree(hypre_ParILUDataRelResNorms(ilu_data), HYPRE_MEMORY_HOST);
343       hypre_ParILUDataRelResNorms(ilu_data) = NULL;
344    }
345    if (hypre_ParILUDataSchurSolver(ilu_data))
346    {
347       switch(ilu_type){
348          case 10: case 11: case 40: case 41: case 50:
349             HYPRE_ParCSRGMRESDestroy(hypre_ParILUDataSchurSolver(ilu_data)); //GMRES for Schur
350             break;
351          case 20: case 21:
352             hypre_NSHDestroy(hypre_ParILUDataSchurSolver(ilu_data)); //NSH for Schur
353             break;
354          default:
355             break;
356       }
357       (hypre_ParILUDataSchurSolver(ilu_data)) = NULL;
358    }
359    if (hypre_ParILUDataSchurPrecond(ilu_data))
360    {
361       switch(ilu_type){
362          case 10: case 11: case 40: case 41:
363 #ifdef HYPRE_USING_CUDA
364          if (hypre_ParILUDataIluType(ilu_data) != 10 &&
365             hypre_ParILUDataIluType(ilu_data) != 11)
366          {
367 #endif
368             HYPRE_ILUDestroy(hypre_ParILUDataSchurPrecond(ilu_data)); //ILU as precond for Schur
369 #ifdef HYPRE_USING_CUDA
370          }
371 #endif
372             break;
373          default:
374             break;
375       }
376       (hypre_ParILUDataSchurPrecond(ilu_data)) = NULL;
377    }
378    /* start to create working vectors */
379    Utemp = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A),
380          hypre_ParCSRMatrixGlobalNumRows(A),
381          hypre_ParCSRMatrixRowStarts(A));
382    hypre_ParVectorInitialize(Utemp);
383    hypre_ParILUDataUTemp(ilu_data) = Utemp;
384 
385    Ftemp = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A),
386          hypre_ParCSRMatrixGlobalNumRows(A),
387          hypre_ParCSRMatrixRowStarts(A));
388    hypre_ParVectorInitialize(Ftemp);
389    hypre_ParILUDataFTemp(ilu_data) = Ftemp;
390    /* set matrix, solution and rhs pointers */
391    matA = A;
392    F_array = f;
393    U_array = u;
394 
395    // create perm array if necessary
396    if (perm == NULL)
397    {
398       switch(ilu_type)
399       {
400          case 10: case 11: case 20: case 21: case 30: case 31: case 50:/* symmetric */
401             hypre_ILUGetInteriorExteriorPerm(matA, &perm, &nLU, reordering_type);
402             break;
403          case 40: case 41:/* ddPQ */
404             hypre_ILUGetPermddPQ(matA, &perm, &qperm, tol_ddPQ, &nLU, &nI, reordering_type);
405             break;
406          case 0: case 1:
407             hypre_ILUGetLocalPerm(matA, &perm, &nLU, reordering_type);
408             break;
409          default:
410             hypre_ILUGetLocalPerm(matA, &perm, &nLU, reordering_type);
411             break;
412       }
413    }
414    //   m = n - nLU;
415    /* factorization */
416    switch(ilu_type)
417    {
418       case 0:
419 #ifdef HYPRE_USING_CUDA
420                /* only apply the setup of ILU0 with cusparse */
421                if (fill_level == 0)
422                {
423                   hypre_ILUSetupILU0Device(matA, perm, perm, n, n, matL_des, matU_des, ilu_solve_policy, &ilu_solve_buffer,
424                                                          &matBL_info, &matBU_info, &matSL_info, &matSU_info, &matBLU_d, &matS,
425                                                          &matE_d, &matF_d, &A_diag_fake);//BJ + cusparse_ilu0()
426                }
427                else
428                {
429                   hypre_ILUSetupILUKDevice(matA, fill_level, perm, perm, n, n, matL_des, matU_des, ilu_solve_policy, &ilu_solve_buffer,
430                                                          &matBL_info, &matBU_info, &matSL_info, &matSU_info, &matBLU_d, &matS,
431                                                          &matE_d, &matF_d, &A_diag_fake);//BJ + hypre_iluk(), setup the device solve
432                }
433 #else
434                hypre_ILUSetupILUK(matA, fill_level, perm, perm, n, n, &matL, &matD, &matU, &matS, &u_end); //BJ + hypre_iluk()
435 #endif
436                break;
437       case 1:
438 #ifdef HYPRE_USING_CUDA
439                hypre_ILUSetupILUTDevice(matA, max_row_elmts, droptol, perm, perm, n, n, matL_des, matU_des, ilu_solve_policy, &ilu_solve_buffer,
440                                                          &matBL_info, &matBU_info, &matSL_info, &matSU_info, &matBLU_d, &matS,
441                                                          &matE_d, &matF_d, &A_diag_fake);//BJ + hypre_ilut(), setup the device solve
442 #else
443                hypre_ILUSetupILUT(matA, max_row_elmts, droptol, perm, perm, n, n, &matL, &matD, &matU, &matS, &u_end); //BJ + hypre_ilut()
444 #endif
445                break;
446       case 10:
447 #ifdef HYPRE_USING_CUDA
448                if (fill_level == 0)
449                {
450                   /* Only support ILU0 */
451                   hypre_ILUSetupILU0Device(matA, perm, perm, n, nLU, matL_des, matU_des, ilu_solve_policy, &ilu_solve_buffer,
452                                                          &matBL_info, &matBU_info, &matSL_info, &matSU_info, &matBLU_d, &matS,
453                                                          &matE_d, &matF_d, &A_diag_fake);//BJ + cusparse_ilu0()
454                }
455                else
456                {
457                   hypre_ILUSetupILUKDevice(matA, fill_level, perm, perm, n, nLU, matL_des, matU_des, ilu_solve_policy, &ilu_solve_buffer,
458                                                          &matBL_info, &matBU_info, &matSL_info, &matSU_info, &matBLU_d, &matS,
459                                                          &matE_d, &matF_d, &A_diag_fake);//BJ + cusparse_ilu0()
460                }
461 #else
462                hypre_ILUSetupILUK(matA, fill_level, perm, perm, nLU, nLU, &matL, &matD, &matU, &matS, &u_end); //GMRES + hypre_iluk()
463 #endif
464                break;
465       case 11:
466 #ifdef HYPRE_USING_CUDA
467                hypre_ILUSetupILUTDevice(matA, max_row_elmts, droptol, perm, perm, n, nLU, matL_des, matU_des, ilu_solve_policy, &ilu_solve_buffer,
468                                                          &matBL_info, &matBU_info, &matSL_info, &matSU_info, &matBLU_d, &matS,
469                                                          &matE_d, &matF_d, &A_diag_fake);//BJ + cusparse_ilu0()
470 #else
471                hypre_ILUSetupILUT(matA, max_row_elmts, droptol, perm, perm, nLU, nLU, &matL, &matD, &matU, &matS, &u_end); //GMRES + hypre_ilut()
472 #endif
473                break;
474       case 20: hypre_ILUSetupILUK(matA, fill_level, perm, perm, nLU, nLU, &matL, &matD, &matU, &matS, &u_end); //Newton Schulz Hotelling + hypre_iluk()
475                break;
476       case 21: hypre_ILUSetupILUT(matA, max_row_elmts, droptol, perm, perm, nLU, nLU, &matL, &matD, &matU, &matS, &u_end); //Newton Schulz Hotelling + hypre_ilut()
477                break;
478       case 30: hypre_ILUSetupILUKRAS(matA, fill_level, perm, nLU, &matL, &matD, &matU); //RAS + hypre_iluk()
479                break;
480       case 31: hypre_ILUSetupILUTRAS(matA, max_row_elmts, droptol, perm, nLU, &matL, &matD, &matU); //RAS + hypre_ilut()
481                break;
482       case 40: hypre_ILUSetupILUK(matA, fill_level, perm, qperm, nLU, nI, &matL, &matD, &matU, &matS, &u_end); //ddPQ + GMRES + hypre_iluk()
483                break;
484       case 41: hypre_ILUSetupILUT(matA, max_row_elmts, droptol, perm, qperm, nLU, nI, &matL, &matD, &matU, &matS, &u_end); //ddPQ + GMRES + hypre_ilut()
485                break;
486       case 50:
487 #ifdef HYPRE_USING_CUDA
488                test_opt = hypre_ParILUDataTestOption(ilu_data);
489                hypre_ILUSetupRAPILU0Device(matA, perm, n, nLU, matL_des, matU_des, ilu_solve_policy,
490                               &ilu_solve_buffer, &matAL_info, &matAU_info, &matBL_info, &matBU_info, &matSL_info, &matSU_info,
491                               &Aperm, &matS, &matALU_d, &matBLU_d, &matSLU_d, &matE_d, &matF_d, test_opt); //RAP + hypre_modified_ilu0
492 #else
493                hypre_ILUSetupRAPILU0(matA, perm, n, nLU, &matL, &matD, &matU, &matmL, &matmD, &matmU, &u_end); //RAP + hypre_modified_ilu0
494 #endif
495                break;
496       default:
497 #ifdef HYPRE_USING_CUDA
498                hypre_ILUSetupILU0Device(matA, perm, perm, n, n, matL_des, matU_des, ilu_solve_policy, &ilu_solve_buffer,
499                                                       &matBL_info, &matBU_info, &matSL_info, &matSU_info, &matBLU_d, &matS,
500                                                       &matE_d, &matF_d, &A_diag_fake);//BJ + cusparse_ilu0()
501 #else
502                hypre_ILUSetupILU0(matA, perm, perm, n, n, &matL, &matD, &matU, &matS, &u_end);//BJ + hypre_ilu0()
503 #endif
504                break;
505    }
506    /* setup Schur solver */
507    switch(ilu_type)
508    {
509       case 10: case 11:
510          if (matS)
511          {
512 #ifdef HYPRE_USING_CUDA
513             /* create working vectors */
514 
515             Xtemp = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(matS),
516                                    hypre_ParCSRMatrixGlobalNumRows(matS),
517                                    hypre_ParCSRMatrixRowStarts(matS));
518             hypre_ParVectorInitialize(Xtemp);
519 
520             Ytemp = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(matS),
521                                    hypre_ParCSRMatrixGlobalNumRows(matS),
522                                    hypre_ParCSRMatrixRowStarts(matS));
523             hypre_ParVectorInitialize(Ytemp);
524 
525             Ftemp_upper = hypre_SeqVectorCreate(nLU);
526             hypre_VectorOwnsData(Ftemp_upper)   = 0;
527             hypre_VectorData(Ftemp_upper)       = hypre_VectorData(hypre_ParVectorLocalVector(Ftemp));
528             hypre_SeqVectorInitialize(Ftemp_upper);
529 
530             Utemp_lower = hypre_SeqVectorCreate(n - nLU);
531             hypre_VectorOwnsData(Utemp_lower)   = 0;
532             hypre_VectorData(Utemp_lower)       = hypre_VectorData(hypre_ParVectorLocalVector(Utemp)) + nLU;
533             hypre_SeqVectorInitialize(Utemp_lower);
534 
535             /* create GMRES */
536 //            HYPRE_ParCSRGMRESCreate(comm, &schur_solver);
537 
538             hypre_GMRESFunctions * gmres_functions;
539 
540             gmres_functions =
541                hypre_GMRESFunctionsCreate(
542                   hypre_CAlloc,
543                   hypre_ParKrylovFree,
544                   hypre_ParILUCusparseSchurGMRESCommInfo, //parCSR A -> ilu_data
545                   hypre_ParKrylovCreateVector,
546                   hypre_ParKrylovCreateVectorArray,
547                   hypre_ParKrylovDestroyVector,
548                   hypre_ParILUCusparseSchurGMRESMatvecCreate, //parCSR A -- inactive
549                   hypre_ParILUCusparseSchurGMRESMatvec, //parCSR A -> ilu_data
550                   hypre_ParILUCusparseSchurGMRESMatvecDestroy, //parCSR A -- inactive
551                   hypre_ParKrylovInnerProd,
552                   hypre_ParKrylovCopyVector,
553                   hypre_ParKrylovClearVector,
554                   hypre_ParKrylovScaleVector,
555                   hypre_ParKrylovAxpy,
556                   hypre_ParKrylovIdentitySetup, //parCSR A -- inactive
557                   hypre_ParKrylovIdentity ); //parCSR A -- inactive
558             schur_solver = ( (HYPRE_Solver) hypre_GMRESCreate( gmres_functions ) );
559 
560             /* setup GMRES parameters */
561             HYPRE_GMRESSetKDim            (schur_solver, hypre_ParILUDataSchurGMRESKDim(ilu_data));
562             HYPRE_GMRESSetMaxIter         (schur_solver, hypre_ParILUDataSchurGMRESMaxIter(ilu_data));/* we don't need that many solves */
563             HYPRE_GMRESSetTol             (schur_solver, hypre_ParILUDataSchurGMRESTol(ilu_data));
564             HYPRE_GMRESSetAbsoluteTol     (schur_solver, hypre_ParILUDataSchurGMRESAbsoluteTol(ilu_data));
565             HYPRE_GMRESSetLogging         (schur_solver, hypre_ParILUDataSchurSolverLogging(ilu_data));
566             HYPRE_GMRESSetPrintLevel      (schur_solver, hypre_ParILUDataSchurSolverPrintLevel(ilu_data));/* set to zero now, don't print */
567             HYPRE_GMRESSetRelChange       (schur_solver, hypre_ParILUDataSchurGMRESRelChange(ilu_data));
568 
569             /* setup preconditioner parameters */
570             /* create Unit precond */
571             schur_precond = (HYPRE_Solver) ilu_vdata;
572             /* add preconditioner to solver */
573             HYPRE_GMRESSetPrecond(schur_solver,
574                      (HYPRE_PtrToSolverFcn) hypre_ParILUCusparseSchurGMRESDummySolve,
575                      (HYPRE_PtrToSolverFcn) hypre_ParILUCusparseSchurGMRESDummySetup,
576                                           schur_precond);
577             HYPRE_GMRESGetPrecond(schur_solver, &schur_precond_gotten);
578             if (schur_precond_gotten != (schur_precond))
579             {
580                hypre_printf("Schur complement got bad precond\n");
581                return(-1);
582             }
583 
584             /* need to create working vector rhs and x for Schur System */
585             rhs = hypre_ParVectorCreate(comm,
586                                     hypre_ParCSRMatrixGlobalNumRows(matS),
587                                     hypre_ParCSRMatrixRowStarts(matS));
588             hypre_ParVectorInitialize(rhs);
589             x = hypre_ParVectorCreate(comm,
590                                     hypre_ParCSRMatrixGlobalNumRows(matS),
591                                     hypre_ParCSRMatrixRowStarts(matS));
592             hypre_ParVectorInitialize(x);
593 
594             /* setup solver */
595             HYPRE_GMRESSetup(schur_solver,(HYPRE_Matrix)ilu_vdata,(HYPRE_Vector)rhs,(HYPRE_Vector)x);
596 
597             /* solve for right-hand-side consists of only 1 */
598             hypre_Vector      *rhs_local = hypre_ParVectorLocalVector(rhs);
599             //HYPRE_Real        *Xtemp_data  = hypre_VectorData(Xtemp_local);
600             hypre_SeqVectorSetConstantValues(rhs_local, 1.0);
601 
602             /* update ilu_data */
603             hypre_ParILUDataSchurSolver   (ilu_data) = schur_solver;
604             hypre_ParILUDataSchurPrecond  (ilu_data) = schur_precond;
605             hypre_ParILUDataRhs           (ilu_data) = rhs;
606             hypre_ParILUDataX             (ilu_data) = x;
607 #else
608             /* setup GMRES parameters */
609             HYPRE_ParCSRGMRESCreate(comm, &schur_solver);
610 
611             HYPRE_GMRESSetKDim            (schur_solver, hypre_ParILUDataSchurGMRESKDim(ilu_data));
612             HYPRE_GMRESSetMaxIter         (schur_solver, hypre_ParILUDataSchurGMRESMaxIter(ilu_data));/* we don't need that many solves */
613             HYPRE_GMRESSetTol             (schur_solver, hypre_ParILUDataSchurGMRESTol(ilu_data));
614             HYPRE_GMRESSetAbsoluteTol     (schur_solver, hypre_ParILUDataSchurGMRESAbsoluteTol(ilu_data));
615             HYPRE_GMRESSetLogging         (schur_solver, hypre_ParILUDataSchurSolverLogging(ilu_data));
616             HYPRE_GMRESSetPrintLevel      (schur_solver, hypre_ParILUDataSchurSolverPrintLevel(ilu_data));/* set to zero now, don't print */
617             HYPRE_GMRESSetRelChange       (schur_solver, hypre_ParILUDataSchurGMRESRelChange(ilu_data));
618 
619             /* setup preconditioner parameters */
620             /* create precond, the default is ILU0 */
621             HYPRE_ILUCreate               (&schur_precond);
622             HYPRE_ILUSetType              (schur_precond, hypre_ParILUDataSchurPrecondIluType(ilu_data));
623             HYPRE_ILUSetLevelOfFill       (schur_precond, hypre_ParILUDataSchurPrecondIluLfil(ilu_data));
624             HYPRE_ILUSetMaxNnzPerRow      (schur_precond, hypre_ParILUDataSchurPrecondIluMaxRowNnz(ilu_data));
625             HYPRE_ILUSetDropThresholdArray(schur_precond, hypre_ParILUDataSchurPrecondIluDroptol(ilu_data));
626             HYPRE_ILUSetPrintLevel        (schur_precond, hypre_ParILUDataSchurPrecondPrintLevel(ilu_data));
627             HYPRE_ILUSetMaxIter           (schur_precond, hypre_ParILUDataSchurPrecondMaxIter(ilu_data));
628             HYPRE_ILUSetTol               (schur_precond, hypre_ParILUDataSchurPrecondTol(ilu_data));
629 
630             /* add preconditioner to solver */
631             HYPRE_GMRESSetPrecond(schur_solver,
632                      (HYPRE_PtrToSolverFcn) HYPRE_ILUSolve,
633                      (HYPRE_PtrToSolverFcn) HYPRE_ILUSetup,
634                                           schur_precond);
635             HYPRE_GMRESGetPrecond(schur_solver, &schur_precond_gotten);
636             if (schur_precond_gotten != (schur_precond))
637             {
638                hypre_printf("Schur complement got bad precond\n");
639                HYPRE_ANNOTATE_FUNC_END;
640 
641                return(-1);
642             }
643 
644             /* need to create working vector rhs and x for Schur System */
645             rhs = hypre_ParVectorCreate(comm,
646                                     hypre_ParCSRMatrixGlobalNumRows(matS),
647                                     hypre_ParCSRMatrixRowStarts(matS));
648             hypre_ParVectorInitialize(rhs);
649             x = hypre_ParVectorCreate(comm,
650                                     hypre_ParCSRMatrixGlobalNumRows(matS),
651                                     hypre_ParCSRMatrixRowStarts(matS));
652             hypre_ParVectorInitialize(x);
653 
654             /* setup solver */
655             HYPRE_GMRESSetup(schur_solver,(HYPRE_Matrix)matS,(HYPRE_Vector)rhs,(HYPRE_Vector)x);
656 
657             /* update ilu_data */
658             hypre_ParILUDataSchurSolver   (ilu_data) = schur_solver;
659             hypre_ParILUDataSchurPrecond  (ilu_data) = schur_precond;
660             hypre_ParILUDataRhs           (ilu_data) = rhs;
661             hypre_ParILUDataX             (ilu_data) = x;
662 #endif
663          }
664          break;
665       case 20: case 21:
666          if (matS)
667          {
668             /* approximate inverse preconditioner */
669             schur_solver = (HYPRE_Solver)hypre_NSHCreate();
670 
671             /* set NSH parameters */
672             hypre_NSHSetMaxIter           (schur_solver, hypre_ParILUDataSchurNSHSolveMaxIter(ilu_data));
673             hypre_NSHSetTol               (schur_solver, hypre_ParILUDataSchurNSHSolveTol(ilu_data));
674             hypre_NSHSetLogging           (schur_solver, hypre_ParILUDataSchurSolverLogging(ilu_data));
675             hypre_NSHSetPrintLevel        (schur_solver, hypre_ParILUDataSchurSolverPrintLevel(ilu_data));
676             hypre_NSHSetDropThresholdArray(schur_solver, hypre_ParILUDataSchurNSHDroptol(ilu_data));
677 
678             hypre_NSHSetNSHMaxIter        (schur_solver, hypre_ParILUDataSchurNSHMaxNumIter(ilu_data));
679             hypre_NSHSetNSHMaxRowNnz      (schur_solver, hypre_ParILUDataSchurNSHMaxRowNnz(ilu_data));
680             hypre_NSHSetNSHTol            (schur_solver, hypre_ParILUDataSchurNSHTol(ilu_data));
681 
682             hypre_NSHSetMRMaxIter         (schur_solver, hypre_ParILUDataSchurMRMaxIter(ilu_data));
683             hypre_NSHSetMRMaxRowNnz       (schur_solver, hypre_ParILUDataSchurMRMaxRowNnz(ilu_data));
684             hypre_NSHSetMRTol             (schur_solver, hypre_ParILUDataSchurMRTol(ilu_data));
685             hypre_NSHSetColVersion        (schur_solver, hypre_ParILUDataSchurMRColVersion(ilu_data));
686 
687             /* need to create working vector rhs and x for Schur System */
688             rhs = hypre_ParVectorCreate(comm,
689                   hypre_ParCSRMatrixGlobalNumRows(matS),
690                   hypre_ParCSRMatrixRowStarts(matS));
691             hypre_ParVectorInitialize(rhs);
692             x = hypre_ParVectorCreate(comm,
693                   hypre_ParCSRMatrixGlobalNumRows(matS),
694                   hypre_ParCSRMatrixRowStarts(matS));
695             hypre_ParVectorInitialize(x);
696 
697             /* setup solver */
698             hypre_NSHSetup(schur_solver,matS,rhs,x);
699 
700             hypre_ParILUDataSchurSolver(ilu_data) = schur_solver;
701             hypre_ParILUDataRhs        (ilu_data) = rhs;
702             hypre_ParILUDataX          (ilu_data) = x;
703          }
704          break;
705       case 30 : case 31:
706          /* now check communication package */
707          comm_pkg = hypre_ParCSRMatrixCommPkg(matA);
708          /* create if not yet built */
709          if (!comm_pkg)
710          {
711             hypre_MatvecCommPkgCreate(matA);
712             comm_pkg = hypre_ParCSRMatrixCommPkg(matA);
713          }
714          /* create uext and fext */
715          send_size =  hypre_ParCSRCommPkgSendMapStart(comm_pkg,hypre_ParCSRCommPkgNumSends(comm_pkg))
716             - hypre_ParCSRCommPkgSendMapStart(comm_pkg,0);
717          recv_size = hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(matA));
718          buffer_size = send_size > recv_size ? send_size : recv_size;
719          fext = hypre_TAlloc(HYPRE_Real,buffer_size,HYPRE_MEMORY_HOST);
720          uext = hypre_TAlloc(HYPRE_Real,buffer_size,HYPRE_MEMORY_HOST);
721          break;
722       case 40: case 41:
723          if (matS)
724          {
725             /* setup GMRES parameters */
726             HYPRE_ParCSRGMRESCreate(comm, &schur_solver);
727 
728             HYPRE_GMRESSetKDim            (schur_solver, hypre_ParILUDataSchurGMRESKDim(ilu_data));
729             HYPRE_GMRESSetMaxIter         (schur_solver, hypre_ParILUDataSchurGMRESMaxIter(ilu_data));/* we don't need that many solves */
730             HYPRE_GMRESSetTol             (schur_solver, hypre_ParILUDataSchurGMRESTol(ilu_data));
731             HYPRE_GMRESSetAbsoluteTol     (schur_solver, hypre_ParILUDataSchurGMRESAbsoluteTol(ilu_data));
732             HYPRE_GMRESSetLogging         (schur_solver, hypre_ParILUDataSchurSolverLogging(ilu_data));
733             HYPRE_GMRESSetPrintLevel      (schur_solver, hypre_ParILUDataSchurSolverPrintLevel(ilu_data));/* set to zero now, don't print */
734             HYPRE_GMRESSetRelChange       (schur_solver, hypre_ParILUDataSchurGMRESRelChange(ilu_data));
735 
736             /* setup preconditioner parameters */
737             /* create precond, the default is ILU0 */
738             HYPRE_ILUCreate               (&schur_precond);
739             HYPRE_ILUSetType              (schur_precond, hypre_ParILUDataSchurPrecondIluType(ilu_data));
740             HYPRE_ILUSetLevelOfFill       (schur_precond, hypre_ParILUDataSchurPrecondIluLfil(ilu_data));
741             HYPRE_ILUSetMaxNnzPerRow      (schur_precond, hypre_ParILUDataSchurPrecondIluMaxRowNnz(ilu_data));
742             HYPRE_ILUSetDropThresholdArray(schur_precond, hypre_ParILUDataSchurPrecondIluDroptol(ilu_data));
743             HYPRE_ILUSetPrintLevel        (schur_precond, hypre_ParILUDataSchurPrecondPrintLevel(ilu_data));
744             HYPRE_ILUSetMaxIter           (schur_precond, hypre_ParILUDataSchurPrecondMaxIter(ilu_data));
745             HYPRE_ILUSetTol               (schur_precond, hypre_ParILUDataSchurPrecondTol(ilu_data));
746 
747             /* add preconditioner to solver */
748             HYPRE_GMRESSetPrecond(schur_solver,
749                      (HYPRE_PtrToSolverFcn) HYPRE_ILUSolve,
750                      (HYPRE_PtrToSolverFcn) HYPRE_ILUSetup,
751                                           schur_precond);
752             HYPRE_GMRESGetPrecond(schur_solver, &schur_precond_gotten);
753             if (schur_precond_gotten != (schur_precond))
754             {
755                hypre_printf("Schur complement got bad precond\n");
756                return(-1);
757             }
758 
759             /* need to create working vector rhs and x for Schur System */
760             rhs = hypre_ParVectorCreate(comm,
761                                     hypre_ParCSRMatrixGlobalNumRows(matS),
762                                     hypre_ParCSRMatrixRowStarts(matS));
763             hypre_ParVectorInitialize(rhs);
764             x = hypre_ParVectorCreate(comm,
765                                     hypre_ParCSRMatrixGlobalNumRows(matS),
766                                     hypre_ParCSRMatrixRowStarts(matS));
767             hypre_ParVectorInitialize(x);
768 
769             /* setup solver */
770             HYPRE_GMRESSetup(schur_solver,(HYPRE_Matrix)matS,(HYPRE_Vector)rhs,(HYPRE_Vector)x);
771 
772             /* update ilu_data */
773             hypre_ParILUDataSchurSolver   (ilu_data) = schur_solver;
774             hypre_ParILUDataSchurPrecond  (ilu_data) = schur_precond;
775             hypre_ParILUDataRhs           (ilu_data) = rhs;
776             hypre_ParILUDataX             (ilu_data) = x;
777          }
778          break;
779       case 50:
780       {
781 #ifdef HYPRE_USING_CUDA
782          if (matS)
783          {
784             /* create working vectors */
785             Xtemp = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(matA),
786                                    hypre_ParCSRMatrixGlobalNumRows(matA),
787                                    hypre_ParCSRMatrixRowStarts(matA));
788             hypre_ParVectorInitialize(Xtemp);
789 
790             Ytemp = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(matA),
791                                    hypre_ParCSRMatrixGlobalNumRows(matA),
792                                    hypre_ParCSRMatrixRowStarts(matA));
793             hypre_ParVectorInitialize(Ytemp);
794 
795             Ftemp_upper = hypre_SeqVectorCreate(nLU);
796             hypre_VectorOwnsData(Ftemp_upper)   = 0;
797             hypre_VectorData(Ftemp_upper)       = hypre_VectorData(hypre_ParVectorLocalVector(Ftemp));
798             hypre_SeqVectorInitialize(Ftemp_upper);
799 
800             Utemp_lower = hypre_SeqVectorCreate(n - nLU);
801             hypre_VectorOwnsData(Utemp_lower)   = 0;
802             hypre_VectorData(Utemp_lower)       = hypre_VectorData(hypre_ParVectorLocalVector(Utemp)) + nLU;
803             hypre_SeqVectorInitialize(Utemp_lower);
804 
805             /* create GMRES */
806 //            HYPRE_ParCSRGMRESCreate(comm, &schur_solver);
807 
808             hypre_GMRESFunctions * gmres_functions;
809 
810             gmres_functions =
811                hypre_GMRESFunctionsCreate(
812                   hypre_CAlloc,
813                   hypre_ParKrylovFree,
814                   hypre_ParILUCusparseSchurGMRESCommInfo, //parCSR A -> ilu_data
815                   hypre_ParKrylovCreateVector,
816                   hypre_ParKrylovCreateVectorArray,
817                   hypre_ParKrylovDestroyVector,
818                   hypre_ParILURAPSchurGMRESMatvecCreate, //parCSR A -- inactive
819                   hypre_ParILURAPSchurGMRESMatvec, //parCSR A -> ilu_data
820                   hypre_ParILURAPSchurGMRESMatvecDestroy, //parCSR A -- inactive
821                   hypre_ParKrylovInnerProd,
822                   hypre_ParKrylovCopyVector,
823                   hypre_ParKrylovClearVector,
824                   hypre_ParKrylovScaleVector,
825                   hypre_ParKrylovAxpy,
826                   hypre_ParKrylovIdentitySetup, //parCSR A -- inactive
827                   hypre_ParKrylovIdentity ); //parCSR A -- inactive
828             schur_solver = ( (HYPRE_Solver) hypre_GMRESCreate( gmres_functions ) );
829 
830             /* setup GMRES parameters */
831             /* at least should apply 1 solve */
832             if (hypre_ParILUDataSchurGMRESKDim(ilu_data) == 0)
833             {
834                hypre_ParILUDataSchurGMRESKDim(ilu_data) ++;
835             }
836             HYPRE_GMRESSetKDim            (schur_solver, hypre_ParILUDataSchurGMRESKDim(ilu_data));
837             HYPRE_GMRESSetMaxIter         (schur_solver, hypre_ParILUDataSchurGMRESMaxIter(ilu_data));/* we don't need that many solves */
838             HYPRE_GMRESSetTol             (schur_solver, hypre_ParILUDataSchurGMRESTol(ilu_data));
839             HYPRE_GMRESSetAbsoluteTol     (schur_solver, hypre_ParILUDataSchurGMRESAbsoluteTol(ilu_data));
840             HYPRE_GMRESSetLogging         (schur_solver, hypre_ParILUDataSchurSolverLogging(ilu_data));
841             HYPRE_GMRESSetPrintLevel      (schur_solver, hypre_ParILUDataSchurSolverPrintLevel(ilu_data));/* set to zero now, don't print */
842             HYPRE_GMRESSetRelChange       (schur_solver, hypre_ParILUDataSchurGMRESRelChange(ilu_data));
843 
844             /* setup preconditioner parameters */
845             /* create Schur precond */
846             schur_precond = (HYPRE_Solver) ilu_vdata;
847             /* add preconditioner to solver */
848             HYPRE_GMRESSetPrecond(schur_solver,
849                      (HYPRE_PtrToSolverFcn) hypre_ParILURAPSchurGMRESSolve,
850                      //(HYPRE_PtrToSolverFcn) hypre_ParILUCusparseSchurGMRESDummySolve,
851                      (HYPRE_PtrToSolverFcn) hypre_ParILURAPSchurGMRESDummySetup,
852                                           schur_precond);
853             HYPRE_GMRESGetPrecond(schur_solver, &schur_precond_gotten);
854             if (schur_precond_gotten != (schur_precond))
855             {
856                hypre_printf("Schur complement got bad precond\n");
857                return(-1);
858             }
859 
860             /* need to create working vector rhs and x for Schur System */
861             rhs = hypre_ParVectorCreate(comm,
862                                     hypre_ParCSRMatrixGlobalNumRows(matS),
863                                     hypre_ParCSRMatrixRowStarts(matS));
864             hypre_ParVectorInitialize(rhs);
865             x = hypre_ParVectorCreate(comm,
866                                     hypre_ParCSRMatrixGlobalNumRows(matS),
867                                     hypre_ParCSRMatrixRowStarts(matS));
868             hypre_ParVectorInitialize(x);
869 
870             /* setup solver */
871             HYPRE_GMRESSetup(schur_solver,(HYPRE_Matrix)ilu_vdata,(HYPRE_Vector)rhs,(HYPRE_Vector)x);
872 
873             /* solve for right-hand-side consists of only 1 */
874             //hypre_Vector      *rhs_local = hypre_ParVectorLocalVector(rhs);
875             //HYPRE_Real        *Xtemp_data  = hypre_VectorData(Xtemp_local);
876             //hypre_SeqVectorSetConstantValues(rhs_local, 1.0);
877 
878             /* update ilu_data */
879             hypre_ParILUDataSchurSolver   (ilu_data) = schur_solver;
880             hypre_ParILUDataSchurPrecond  (ilu_data) = schur_precond;
881             hypre_ParILUDataRhs           (ilu_data) = rhs;
882             hypre_ParILUDataX             (ilu_data) = x;
883          }
884 #else
885          /* need to create working vector rhs and x for Schur System */
886          HYPRE_Int      m = n - nLU;
887          HYPRE_BigInt   S_total_rows, S_row_starts[2];
888          HYPRE_BigInt   big_m = (HYPRE_BigInt)m;
889          hypre_MPI_Allreduce( &big_m, &S_total_rows, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
890 
891          if ( S_total_rows > 0 )
892          {
893             /* create working vectors */
894             Xtemp = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(matA),
895                                       hypre_ParCSRMatrixGlobalNumRows(matA),
896                                       hypre_ParCSRMatrixRowStarts(matA));
897             hypre_ParVectorInitialize(Xtemp);
898 
899             Ytemp = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(matA),
900                                       hypre_ParCSRMatrixGlobalNumRows(matA),
901                                       hypre_ParCSRMatrixRowStarts(matA));
902             hypre_ParVectorInitialize(Ytemp);
903 
904             /* only do so when we hae the Schur Complement */
905             {
906                HYPRE_BigInt global_start;
907                hypre_MPI_Scan( &big_m, &global_start, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
908                S_row_starts[0] = global_start - m;
909                S_row_starts[1] = global_start;
910             }
911 
912             rhs = hypre_ParVectorCreate(comm,
913                                     S_total_rows,
914                                     S_row_starts);
915             hypre_ParVectorInitialize(rhs);
916 
917             x = hypre_ParVectorCreate(comm,
918                                     S_total_rows,
919                                     S_row_starts);
920             hypre_ParVectorInitialize(x);
921 
922             /* add when necessary */
923             /* create GMRES */
924 //            HYPRE_ParCSRGMRESCreate(comm, &schur_solver);
925 
926             hypre_GMRESFunctions * gmres_functions;
927 
928             gmres_functions =
929                   hypre_GMRESFunctionsCreate(
930                      hypre_CAlloc,
931                      hypre_ParKrylovFree,
932                      hypre_ParILURAPSchurGMRESCommInfoH, //parCSR A -> ilu_data
933                      hypre_ParKrylovCreateVector,
934                      hypre_ParKrylovCreateVectorArray,
935                      hypre_ParKrylovDestroyVector,
936                      hypre_ParILURAPSchurGMRESMatvecCreateH, //parCSR A -- inactive
937                      hypre_ParILURAPSchurGMRESMatvecH, //parCSR A -> ilu_data
938                      hypre_ParILURAPSchurGMRESMatvecDestroyH, //parCSR A -- inactive
939                      hypre_ParKrylovInnerProd,
940                      hypre_ParKrylovCopyVector,
941                      hypre_ParKrylovClearVector,
942                      hypre_ParKrylovScaleVector,
943                      hypre_ParKrylovAxpy,
944                      hypre_ParKrylovIdentitySetup, //parCSR A -- inactive
945                      hypre_ParKrylovIdentity ); //parCSR A -- inactive
946             schur_solver = ( (HYPRE_Solver) hypre_GMRESCreate( gmres_functions ) );
947 
948             /* setup GMRES parameters */
949             /* at least should apply 1 solve */
950             if (hypre_ParILUDataSchurGMRESKDim(ilu_data) == 0)
951             {
952                hypre_ParILUDataSchurGMRESKDim(ilu_data) ++;
953             }
954             HYPRE_GMRESSetKDim            (schur_solver, hypre_ParILUDataSchurGMRESKDim(ilu_data));
955             HYPRE_GMRESSetMaxIter         (schur_solver, hypre_ParILUDataSchurGMRESMaxIter(ilu_data));/* we don't need that many solves */
956             HYPRE_GMRESSetTol             (schur_solver, hypre_ParILUDataSchurGMRESTol(ilu_data));
957             HYPRE_GMRESSetAbsoluteTol     (schur_solver, hypre_ParILUDataSchurGMRESAbsoluteTol(ilu_data));
958             HYPRE_GMRESSetLogging         (schur_solver, hypre_ParILUDataSchurSolverLogging(ilu_data));
959             HYPRE_GMRESSetPrintLevel      (schur_solver, hypre_ParILUDataSchurSolverPrintLevel(ilu_data));/* set to zero now, don't print */
960             HYPRE_GMRESSetRelChange       (schur_solver, hypre_ParILUDataSchurGMRESRelChange(ilu_data));
961 
962             /* setup preconditioner parameters */
963             /* create Schur precond */
964             schur_precond = (HYPRE_Solver) ilu_vdata;
965             /* add preconditioner to solver */
966             HYPRE_GMRESSetPrecond(schur_solver,
967                      (HYPRE_PtrToSolverFcn) hypre_ParILURAPSchurGMRESSolveH,
968                      //(HYPRE_PtrToSolverFcn) hypre_ParILUCusparseSchurGMRESDummySolve,
969                      (HYPRE_PtrToSolverFcn) hypre_ParILURAPSchurGMRESDummySetupH,
970                                           schur_precond);
971             HYPRE_GMRESGetPrecond(schur_solver, &schur_precond_gotten);
972             if (schur_precond_gotten != (schur_precond))
973             {
974                hypre_printf("Schur complement got bad precond\n");
975                return(-1);
976             }
977 
978             /* setup solver */
979             HYPRE_GMRESSetup(schur_solver,(HYPRE_Matrix)ilu_vdata,(HYPRE_Vector)rhs,(HYPRE_Vector)x);
980 
981             /* solve for right-hand-side consists of only 1 */
982             //hypre_Vector      *rhs_local = hypre_ParVectorLocalVector(rhs);
983             //HYPRE_Real        *Xtemp_data  = hypre_VectorData(Xtemp_local);
984             //hypre_SeqVectorSetConstantValues(rhs_local, 1.0);
985          }
986          /* update ilu_data */
987          hypre_ParILUDataSchurSolver   (ilu_data) = schur_solver;
988          hypre_ParILUDataSchurPrecond  (ilu_data) = schur_precond;
989          hypre_ParILUDataRhs           (ilu_data) = rhs;
990          hypre_ParILUDataX             (ilu_data) = x;
991 
992 #endif
993          break;
994       }
995       default:
996          break;
997    }
998    /* set pointers to ilu data */
999 #ifdef HYPRE_USING_CUDA
1000    /* set cusparse pointers */
1001    //hypre_ParILUDataILUSolveBuffer(ilu_data)  = ilu_solve_buffer;
1002    hypre_ParILUDataMatAILUDevice(ilu_data)      = matALU_d;
1003    hypre_ParILUDataMatBILUDevice(ilu_data)      = matBLU_d;
1004    hypre_ParILUDataMatSILUDevice(ilu_data)      = matSLU_d;
1005    hypre_ParILUDataMatEDevice(ilu_data)         = matE_d;
1006    hypre_ParILUDataMatFDevice(ilu_data)         = matF_d;
1007    hypre_ParILUDataILUSolveBuffer(ilu_data)     = ilu_solve_buffer;
1008    hypre_ParILUDataMatALILUSolveInfo(ilu_data)  = matAL_info;
1009    hypre_ParILUDataMatAUILUSolveInfo(ilu_data)  = matAU_info;
1010    hypre_ParILUDataMatBLILUSolveInfo(ilu_data)  = matBL_info;
1011    hypre_ParILUDataMatBUILUSolveInfo(ilu_data)  = matBU_info;
1012    hypre_ParILUDataMatSLILUSolveInfo(ilu_data)  = matSL_info;
1013    hypre_ParILUDataMatSUILUSolveInfo(ilu_data)  = matSU_info;
1014    hypre_ParILUDataAperm(ilu_data)              = Aperm;
1015    hypre_ParILUDataR(ilu_data)                  = R;
1016    hypre_ParILUDataP(ilu_data)                  = P;
1017    hypre_ParILUDataFTempUpper(ilu_data)         = Ftemp_upper;
1018    hypre_ParILUDataUTempLower(ilu_data)         = Utemp_lower;
1019    hypre_ParILUDataMatAFakeDiagonal(ilu_data)   = A_diag_fake;
1020 #endif
1021    hypre_ParILUDataMatA(ilu_data)               = matA;
1022    hypre_ParILUDataXTemp(ilu_data)              = Xtemp;
1023    hypre_ParILUDataYTemp(ilu_data)              = Ytemp;
1024    hypre_ParILUDataF(ilu_data)                  = F_array;
1025    hypre_ParILUDataU(ilu_data)                  = U_array;
1026    hypre_ParILUDataMatL(ilu_data)               = matL;
1027    hypre_ParILUDataMatD(ilu_data)               = matD;
1028    hypre_ParILUDataMatU(ilu_data)               = matU;
1029    hypre_ParILUDataMatLModified(ilu_data)       = matmL;
1030    hypre_ParILUDataMatDModified(ilu_data)       = matmD;
1031    hypre_ParILUDataMatUModified(ilu_data)       = matmU;
1032    hypre_ParILUDataMatS(ilu_data)               = matS;
1033    hypre_ParILUDataCFMarkerArray(ilu_data)      = CF_marker_array;
1034    hypre_ParILUDataPerm(ilu_data)               = perm;
1035    hypre_ParILUDataQPerm(ilu_data)              = qperm;
1036    hypre_ParILUDataNLU(ilu_data)                = nLU;
1037    hypre_ParILUDataNI(ilu_data)                 = nI;
1038    hypre_ParILUDataUEnd(ilu_data)               = u_end;
1039    hypre_ParILUDataUExt(ilu_data)               = uext;
1040    hypre_ParILUDataFExt(ilu_data)               = fext;
1041 
1042    /* compute operator complexity */
1043    hypre_ParCSRMatrixSetDNumNonzeros(matA);
1044    nnzS = 0.0;
1045    /* size_C is the size of global coarse grid, upper left part */
1046    size_C = hypre_ParCSRMatrixGlobalNumRows(matA);
1047    /* switch to compute complexity */
1048 
1049 #ifdef HYPRE_USING_CUDA
1050    HYPRE_Int nnzBEF = 0;
1051    HYPRE_Int nnzG;/* Global nnz */
1052    if (ilu_type == 0 && fill_level == 0)
1053    {
1054       /* The nnz is for sure 1.0 in this case */
1055       hypre_ParILUDataOperatorComplexity(ilu_data) =  1.0;
1056    }
1057    else if (ilu_type == 10 && fill_level == 0)
1058    {
1059       /* The nnz is the sum of different parts */
1060       if (matBLU_d)
1061       {
1062          nnzBEF  += hypre_CSRMatrixNumNonzeros(matBLU_d);
1063       }
1064       if (matE_d)
1065       {
1066          nnzBEF  += hypre_CSRMatrixNumNonzeros(matE_d);
1067       }
1068       if (matF_d)
1069       {
1070          nnzBEF  += hypre_CSRMatrixNumNonzeros(matF_d);
1071       }
1072       hypre_MPI_Allreduce(&nnzBEF, &nnzG, 1, HYPRE_MPI_INT, hypre_MPI_SUM, comm);
1073       if (matS)
1074       {
1075          hypre_ParCSRMatrixSetDNumNonzeros(matS);
1076          nnzS = hypre_ParCSRMatrixDNumNonzeros(matS);
1077          /* if we have Schur system need to reduce it from size_C */
1078       }
1079       hypre_ParILUDataOperatorComplexity(ilu_data) =  ((HYPRE_Real)nnzG + nnzS) /
1080                                            hypre_ParCSRMatrixDNumNonzeros(matA);
1081    }
1082    else if (ilu_type == 50)
1083    {
1084       hypre_ParILUDataOperatorComplexity(ilu_data) =  1.0;
1085    }
1086    else if (ilu_type == 0 || ilu_type == 1 || ilu_type == 10 || ilu_type == 11)
1087    {
1088       if (matBLU_d)
1089       {
1090          nnzBEF  += hypre_CSRMatrixNumNonzeros(matBLU_d);
1091       }
1092       if (matE_d)
1093       {
1094          nnzBEF  += hypre_CSRMatrixNumNonzeros(matE_d);
1095       }
1096       if (matF_d)
1097       {
1098          nnzBEF  += hypre_CSRMatrixNumNonzeros(matF_d);
1099       }
1100       hypre_MPI_Allreduce(&nnzBEF, &nnzG, 1, HYPRE_MPI_INT, hypre_MPI_SUM, comm);
1101       if (matS)
1102       {
1103          hypre_ParCSRMatrixSetDNumNonzeros(matS);
1104          nnzS = hypre_ParCSRMatrixDNumNonzeros(matS);
1105          /* if we have Schur system need to reduce it from size_C */
1106       }
1107       hypre_ParILUDataOperatorComplexity(ilu_data) =  ((HYPRE_Real)nnzG + nnzS) /
1108                                            hypre_ParCSRMatrixDNumNonzeros(matA);
1109    }
1110    else
1111    {
1112 #endif
1113       if (matS)
1114       {
1115          hypre_ParCSRMatrixSetDNumNonzeros(matS);
1116          nnzS = hypre_ParCSRMatrixDNumNonzeros(matS);
1117          /* if we have Schur system need to reduce it from size_C */
1118          size_C -= hypre_ParCSRMatrixGlobalNumRows(matS);
1119          switch(ilu_type)
1120          {
1121             case 10: case 11: case 40: case 41: case 50:
1122                /* now we need to compute the preconditioner */
1123                schur_precond_ilu = (hypre_ParILUData*) (hypre_ParILUDataSchurPrecond(ilu_data));
1124                /* borrow i for local nnz of S */
1125                i = hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixOffd(matS));
1126                hypre_MPI_Allreduce(&i, &nnzS_offd, 1, HYPRE_MPI_INT, hypre_MPI_SUM, comm);
1127                nnzS = nnzS * hypre_ParILUDataOperatorComplexity(schur_precond_ilu) +nnzS_offd;
1128                break;
1129             case 20: case 21:
1130                schur_solver_nsh = (hypre_ParNSHData*) hypre_ParILUDataSchurSolver(ilu_data);
1131                nnzS = nnzS * (hypre_ParNSHDataOperatorComplexity(schur_solver_nsh));
1132                break;
1133             default:
1134                break;
1135          }
1136       }
1137 
1138       hypre_ParILUDataOperatorComplexity(ilu_data) =  ((HYPRE_Real)size_C + nnzS +
1139                                           hypre_ParCSRMatrixDNumNonzeros(matL) +
1140                                           hypre_ParCSRMatrixDNumNonzeros(matU))/
1141                                           hypre_ParCSRMatrixDNumNonzeros(matA);
1142 #ifdef HYPRE_USING_CUDA
1143    }
1144 #endif
1145    if ((my_id == 0) && (print_level > 0))
1146    {
1147       hypre_printf("ILU SETUP: operator complexity = %f  \n", hypre_ParILUDataOperatorComplexity(ilu_data));
1148    }
1149 
1150    if ( logging > 1 ) {
1151       residual =
1152          hypre_ParVectorCreate(hypre_ParCSRMatrixComm(matA),
1153                hypre_ParCSRMatrixGlobalNumRows(matA),
1154                hypre_ParCSRMatrixRowStarts(matA) );
1155       hypre_ParVectorInitialize(residual);
1156       hypre_ParILUDataResidual(ilu_data) = residual;
1157    }
1158    else{
1159       hypre_ParILUDataResidual(ilu_data) = NULL;
1160    }
1161    rel_res_norms = hypre_CTAlloc(HYPRE_Real, hypre_ParILUDataMaxIter(ilu_data), HYPRE_MEMORY_HOST);
1162    hypre_ParILUDataRelResNorms(ilu_data) = rel_res_norms;
1163    HYPRE_ANNOTATE_FUNC_END;
1164 
1165    return hypre_error_flag;
1166 }
1167 
1168 #ifdef HYPRE_USING_CUDA
1169 
1170 /* Extract submatrix from diagonal part of A into a new CSRMatrix without sort rows
1171  * WARNING: We don't put diagonal to the first entry of each row since this function is now for cuSparse only
1172  * A = input matrix
1173  * perm = permutation array indicating ordering of rows. Perm could come from a
1174  *    CF_marker array or a reordering routine.
1175  * rqperm = reverse permutation array indicating ordering of columns
1176  * A_diagp = pointer to the output diagonal matrix.
1177  */
1178 HYPRE_Int
hypre_ParILUCusparseExtractDiagonalCSR(hypre_ParCSRMatrix * A,HYPRE_Int * perm,HYPRE_Int * rqperm,hypre_CSRMatrix ** A_diagp)1179 hypre_ParILUCusparseExtractDiagonalCSR( hypre_ParCSRMatrix *A,
1180                                         HYPRE_Int          *perm,
1181                                         HYPRE_Int          *rqperm,
1182                                         hypre_CSRMatrix   **A_diagp )
1183 {
1184    /* Get necessary slots */
1185    hypre_CSRMatrix     *A_diag         = hypre_ParCSRMatrixDiag(A);
1186    HYPRE_Int           *A_diag_i       = hypre_CSRMatrixI(A_diag);
1187    HYPRE_Int           *A_diag_j       = hypre_CSRMatrixJ(A_diag);
1188    HYPRE_Real          *A_diag_data    = hypre_CSRMatrixData(A_diag);
1189    HYPRE_Int            n              = hypre_CSRMatrixNumRows(A_diag);
1190    HYPRE_Int            nnz_A_diag     = A_diag_i[n];
1191 
1192    HYPRE_Int            i, j, current_idx;
1193 
1194    /* No schur complement makes everything easy :) */
1195    hypre_CSRMatrix  *B              = NULL;
1196    B                                = hypre_CSRMatrixCreate(n, n, nnz_A_diag);
1197    hypre_CSRMatrixInitialize(B);
1198    HYPRE_Int        *B_i            = hypre_CSRMatrixI(B);
1199    HYPRE_Int        *B_j            = hypre_CSRMatrixJ(B);
1200    HYPRE_Real       *B_data         = hypre_CSRMatrixData(B);
1201 
1202    /* Copy everything in with permutation */
1203    current_idx = 0;
1204    for ( i = 0; i < n; i++ )
1205    {
1206       B_i[i] = current_idx;
1207       for (j = A_diag_i[perm[i]] ; j < A_diag_i[perm[i]+1] ; j ++)
1208       {
1209          B_j[current_idx] = rqperm[A_diag_j[j]];
1210          B_data[current_idx++] = A_diag_data[j];
1211       }
1212    }
1213    B_i[n] = current_idx;
1214 
1215    hypre_assert(current_idx == nnz_A_diag);
1216    *A_diagp = B;
1217 
1218    return hypre_error_flag;
1219 }
1220 
/* Split the diagonal part of A into the 2x2 block structure
 * | B F |
 * | E C |
 * in order to do ILU with cuSparse.
 * WARNING: cuSparse requires each row to be sorted by column.
 *          This function only works when rows are sorted!
 * A_diag = input matrix (the diagonal part of A).
 * nLU = number of rows/columns of the leading block B; must satisfy
 *    0 <= nLU <= n, where n is the number of rows of A_diag.
 * Bp = pointer to the output B matrix.
 * Cp = pointer to the output C matrix.
 * Ep = pointer to the output E matrix.
 * Fp = pointer to the output F matrix.
 */
1236 HYPRE_Int
hypre_ParILUCusparseILUExtractEBFC(hypre_CSRMatrix *A_diag, HYPRE_Int nLU, hypre_CSRMatrix **Bp, hypre_CSRMatrix **Cp, hypre_CSRMatrix **Ep, hypre_CSRMatrix **Fp)
{
   /* Split the n x n matrix A_diag at index nLU into the 2 x 2 block form
    *
    *    | B  F |      B: nLU x nLU      F: nLU x m
    *    | E  C |      E: m   x nLU      C: m   x m        (m = n - nLU)
    *
    * and hand the four newly created matrices back through Bp, Cp, Ep, Fp.
    * Column indices of F and C are shifted down by nLU.
    *
    * The scan of every row stops feeding the "left" block at the first
    * column >= nLU and sends everything after it to the "right" block, so
    * the column indices inside each row are expected to be in ascending order.
    */
   HYPRE_Int        *src_i    = hypre_CSRMatrixI(A_diag);
   HYPRE_Int        *src_j    = hypre_CSRMatrixJ(A_diag);
   HYPRE_Real       *src_a    = hypre_CSRMatrixData(A_diag);
   HYPRE_Int         n        = hypre_CSRMatrixNumRows(A_diag);
   HYPRE_Int         src_nnz  = src_i[n];

   hypre_CSRMatrix  *B        = NULL;
   hypre_CSRMatrix  *C        = NULL;
   hypre_CSRMatrix  *E        = NULL;
   hypre_CSRMatrix  *F        = NULL;

   hypre_assert(nLU >= 0 && nLU <= n);

   /* ------------------------------------------------------------------
    * Degenerate splits: one diagonal block is the whole matrix and the
    * three remaining blocks are empty 0 x 0 matrices.
    * ------------------------------------------------------------------ */
   if (nLU == n || nLU == 0)
   {
      hypre_CSRMatrix *whole = hypre_CSRMatrixCreate(n, n, src_nnz);
      hypre_CSRMatrix *none  = NULL;

      hypre_CSRMatrixInitialize(whole);
      hypre_CSRMatrixCopy(A_diag, whole, 1);

      none = hypre_CSRMatrixCreate(0, 0, 0);
      hypre_CSRMatrixInitialize(none);

      E = hypre_CSRMatrixCreate(0, 0, 0);
      hypre_CSRMatrixInitialize(E);
      F = hypre_CSRMatrixCreate(0, 0, 0);
      hypre_CSRMatrixInitialize(F);

      if (nLU == n)
      {
         /* no Schur complement: everything belongs to B */
         B = whole;
         C = none;
      }
      else
      {
         /* nothing but Schur complement: everything belongs to C */
         B = none;
         C = whole;
      }

      *Bp = B;
      *Cp = C;
      *Ep = E;
      *Fp = F;

      return hypre_error_flag;
   }

   /* ------------------------------------------------------------------
    * Genuine split, 0 < nLU < n
    * ------------------------------------------------------------------ */
   {
      const HYPRE_Int   m       = n - nLU;
      HYPRE_Int         r, pos, row_end;

      /* number of entries stored so far in each block */
      HYPRE_Int         nB      = 0;
      HYPRE_Int         nC      = 0;
      HYPRE_Int         nE      = 0;
      HYPRE_Int         nF      = 0;

      /* currently allocated length of the J / data arrays of each block */
      HYPRE_Int         cap_B;
      HYPRE_Int         cap_C;
      HYPRE_Int         cap_E;
      HYPRE_Int         cap_F;

      HYPRE_Int        *B_i, *C_i, *E_i, *F_i;
      HYPRE_Int        *B_j, *C_j, *E_j, *F_j;
      HYPRE_Real       *B_a, *C_a, *E_a, *F_a;

      /* Matrices are created with zero nonzeros; Initialize provides the
       * row pointers, J and data are attached once they have been filled */
      B = hypre_CSRMatrixCreate(nLU, nLU, 0);
      hypre_CSRMatrixInitialize(B);
      C = hypre_CSRMatrixCreate(m, m, 0);
      hypre_CSRMatrixInitialize(C);
      E = hypre_CSRMatrixCreate(m, nLU, 0);
      hypre_CSRMatrixInitialize(E);
      F = hypre_CSRMatrixCreate(nLU, m, 0);
      hypre_CSRMatrixInitialize(F);

      /* Initial guess of the nonzero counts (proportional to the block
       * area); the arrays grow on demand while copying */
      cap_B = nLU + ceil(src_nnz * 1.0 * nLU / n * nLU / n);
      cap_C = m + ceil(src_nnz * 1.0 * m / n * m / n);
      cap_E = hypre_min(m, nLU) + ceil(src_nnz * 1.0 * nLU / n * m / n);
      cap_F = cap_E;

      B_i = hypre_CSRMatrixI(B);
      B_j = hypre_CTAlloc(HYPRE_Int, cap_B, HYPRE_MEMORY_DEVICE);
      B_a = hypre_CTAlloc(HYPRE_Real, cap_B, HYPRE_MEMORY_DEVICE);
      C_i = hypre_CSRMatrixI(C);
      C_j = hypre_CTAlloc(HYPRE_Int, cap_C, HYPRE_MEMORY_DEVICE);
      C_a = hypre_CTAlloc(HYPRE_Real, cap_C, HYPRE_MEMORY_DEVICE);
      E_i = hypre_CSRMatrixI(E);
      E_j = hypre_CTAlloc(HYPRE_Int, cap_E, HYPRE_MEMORY_DEVICE);
      E_a = hypre_CTAlloc(HYPRE_Real, cap_E, HYPRE_MEMORY_DEVICE);
      F_i = hypre_CSRMatrixI(F);
      F_j = hypre_CTAlloc(HYPRE_Int, cap_F, HYPRE_MEMORY_DEVICE);
      F_a = hypre_CTAlloc(HYPRE_Real, cap_F, HYPRE_MEMORY_DEVICE);

      /* Upper block row: rows [0, nLU) feed B (left part) and F (right part) */
      for (r = 0; r < nLU; r++)
      {
         pos      = src_i[r];
         row_end  = src_i[r + 1];
         B_i[r]   = nB;
         F_i[r]   = nF;

         /* leading entries with column < nLU go to B */
         while (pos < row_end && src_j[pos] < nLU)
         {
            B_j[nB] = src_j[pos];
            B_a[nB] = src_a[pos];
            nB++;
            pos++;
            /* grow as soon as the arrays are full so the next write fits */
            if (nB >= cap_B)
            {
               const HYPRE_Int old_cap = cap_B;
               cap_B = cap_B * EXPAND_FACT + 1;
               B_j = hypre_TReAlloc_v2(B_j, HYPRE_Int, old_cap, HYPRE_Int, cap_B, HYPRE_MEMORY_DEVICE);
               B_a = hypre_TReAlloc_v2(B_a, HYPRE_Real, old_cap, HYPRE_Real, cap_B, HYPRE_MEMORY_DEVICE);
            }
         }

         /* the rest of the row goes to F, with shifted column index */
         while (pos < row_end)
         {
            F_j[nF] = src_j[pos] - nLU;
            F_a[nF] = src_a[pos];
            nF++;
            pos++;
            if (nF >= cap_F)
            {
               const HYPRE_Int old_cap = cap_F;
               cap_F = cap_F * EXPAND_FACT + 1;
               F_j = hypre_TReAlloc_v2(F_j, HYPRE_Int, old_cap, HYPRE_Int, cap_F, HYPRE_MEMORY_DEVICE);
               F_a = hypre_TReAlloc_v2(F_a, HYPRE_Real, old_cap, HYPRE_Real, cap_F, HYPRE_MEMORY_DEVICE);
            }
         }
      }
      B_i[nLU] = nB;
      F_i[nLU] = nF;

      /* Lower block row: rows [nLU, n) feed E (left part) and C (right part) */
      for (r = nLU; r < n; r++)
      {
         const HYPRE_Int local_row = r - nLU;

         pos            = src_i[r];
         row_end        = src_i[r + 1];
         E_i[local_row] = nE;
         C_i[local_row] = nC;

         /* leading entries with column < nLU go to E */
         while (pos < row_end && src_j[pos] < nLU)
         {
            E_j[nE] = src_j[pos];
            E_a[nE] = src_a[pos];
            nE++;
            pos++;
            if (nE >= cap_E)
            {
               const HYPRE_Int old_cap = cap_E;
               cap_E = cap_E * EXPAND_FACT + 1;
               E_j = hypre_TReAlloc_v2(E_j, HYPRE_Int, old_cap, HYPRE_Int, cap_E, HYPRE_MEMORY_DEVICE);
               E_a = hypre_TReAlloc_v2(E_a, HYPRE_Real, old_cap, HYPRE_Real, cap_E, HYPRE_MEMORY_DEVICE);
            }
         }

         /* the rest of the row goes to C, with shifted column index */
         while (pos < row_end)
         {
            C_j[nC] = src_j[pos] - nLU;
            C_a[nC] = src_a[pos];
            nC++;
            pos++;
            if (nC >= cap_C)
            {
               const HYPRE_Int old_cap = cap_C;
               cap_C = cap_C * EXPAND_FACT + 1;
               C_j = hypre_TReAlloc_v2(C_j, HYPRE_Int, old_cap, HYPRE_Int, cap_C, HYPRE_MEMORY_DEVICE);
               C_a = hypre_TReAlloc_v2(C_a, HYPRE_Real, old_cap, HYPRE_Real, cap_C, HYPRE_MEMORY_DEVICE);
            }
         }
      }
      E_i[m] = nE;
      C_i[m] = nC;

      /* every entry of A_diag must have landed in exactly one block */
      hypre_assert((nB + nC + nE + nF) == src_nnz);

      /* Attach the filled arrays; each matrix owns its data */
      hypre_CSRMatrixJ(B)              = B_j;
      hypre_CSRMatrixData(B)           = B_a;
      hypre_CSRMatrixNumNonzeros(B)    = nB;
      hypre_CSRMatrixSetDataOwner(B, 1);

      hypre_CSRMatrixJ(C)              = C_j;
      hypre_CSRMatrixData(C)           = C_a;
      hypre_CSRMatrixNumNonzeros(C)    = nC;
      hypre_CSRMatrixSetDataOwner(C, 1);

      hypre_CSRMatrixJ(E)              = E_j;
      hypre_CSRMatrixData(E)           = E_a;
      hypre_CSRMatrixNumNonzeros(E)    = nE;
      hypre_CSRMatrixSetDataOwner(E, 1);

      hypre_CSRMatrixJ(F)              = F_j;
      hypre_CSRMatrixData(F)           = F_a;
      hypre_CSRMatrixNumNonzeros(F)    = nF;
      hypre_CSRMatrixSetDataOwner(F, 1);

      *Bp = B;
      *Cp = C;
      *Ep = E;
      *Fp = F;
   }

   return hypre_error_flag;
}
1474 
1475 /* Wrapper for ILU0 with cusparse on a matrix, csr sort was done in this function */
1476 HYPRE_Int
HYPRE_ILUSetupCusparseCSRILU0(hypre_CSRMatrix * A,cusparseSolvePolicy_t ilu_solve_policy)1477 HYPRE_ILUSetupCusparseCSRILU0(hypre_CSRMatrix *A, cusparseSolvePolicy_t ilu_solve_policy)
1478 {
1479 
1480    /* data objects for A */
1481    HYPRE_Int               n                    = hypre_CSRMatrixNumRows(A);
1482    HYPRE_Int               m                    = hypre_CSRMatrixNumCols(A);
1483 
1484    hypre_assert(n == m);
1485 
1486    HYPRE_Real              *A_data              = hypre_CSRMatrixData(A);
1487    HYPRE_Int               *A_i                 = hypre_CSRMatrixI(A);
1488    HYPRE_Int               *A_j                 = hypre_CSRMatrixJ(A);
1489    HYPRE_Int               nnz_A                = hypre_CSRMatrixNumNonzeros(A);
1490 
1491    /* pointers to cusparse data */
1492    csrilu02Info_t          matA_info            = NULL;
1493 
1494    /* variables and working arrays used during the ilu */
1495    HYPRE_Int               zero_pivot;
1496    HYPRE_Int               matA_buffersize;
1497    void                    *matA_buffer         = NULL;
1498 
1499    HYPRE_Int               isDoublePrecision    = sizeof(HYPRE_Complex) == sizeof(hypre_double);
1500    HYPRE_Int               isSinglePrecision    = sizeof(HYPRE_Complex) == sizeof(hypre_double) / 2;
1501 
1502    cusparseHandle_t handle = hypre_HandleCusparseHandle(hypre_handle());
1503    cusparseMatDescr_t descr = hypre_CSRMatrixGPUMatDescr(A);
1504 
1505    hypre_assert(isDoublePrecision || isSinglePrecision);
1506 
1507    /* 1. Sort columns inside each row first, we can't assume that's sorted */
1508    hypre_SortCSRCusparse(n, m, nnz_A, descr, A_i, A_j, A_data);
1509 
1510    /* 2. Create info for ilu setup and solve */
1511    HYPRE_CUSPARSE_CALL(cusparseCreateCsrilu02Info(&matA_info));
1512 
1513    /* 3. Get working array size */
1514    if (isDoublePrecision)
1515    {
1516       HYPRE_CUSPARSE_CALL(cusparseDcsrilu02_bufferSize(handle, n, nnz_A, descr,
1517                                                          (hypre_double *) A_data, A_i, A_j,
1518                                                          matA_info, &matA_buffersize));
1519    }
1520    else if (isSinglePrecision)
1521    {
1522       HYPRE_CUSPARSE_CALL(cusparseScsrilu02_bufferSize(handle, n, nnz_A, descr,
1523                                                          (float *) A_data, A_i, A_j,
1524                                                          matA_info, &matA_buffersize));
1525    }
1526    /* 4. Create working array, since they won't be visited by host, allocate on device */
1527    matA_buffer                                  = hypre_MAlloc(matA_buffersize, HYPRE_MEMORY_DEVICE);
1528 
1529    /* 5. Now perform the analysis */
1530    /* 5-1. Analysis */
1531    if (isDoublePrecision)
1532    {
1533       HYPRE_CUSPARSE_CALL(cusparseDcsrilu02_analysis(handle, n, nnz_A, descr,
1534                                                       (hypre_double *) A_data, A_i, A_j,
1535                                                       matA_info, ilu_solve_policy, matA_buffer));
1536    }
1537    else if (isSinglePrecision)
1538    {
1539       HYPRE_CUSPARSE_CALL(cusparseScsrilu02_analysis(handle, n, nnz_A, descr,
1540                                                       (float *) A_data, A_i, A_j,
1541                                                       matA_info, ilu_solve_policy, matA_buffer));
1542    }
1543    /* 5-2. Check for zero pivot */
1544    HYPRE_CUSPARSE_CALL(cusparseXcsrilu02_zeroPivot(handle, matA_info, &zero_pivot));
1545 
1546    /* 6. Apply the factorization */
1547    if (isDoublePrecision)
1548    {
1549       HYPRE_CUSPARSE_CALL(cusparseDcsrilu02(handle, n, nnz_A, descr,
1550                                              (hypre_double *) A_data, A_i, A_j,
1551                                              matA_info, ilu_solve_policy, matA_buffer));
1552    }
1553    else if (isSinglePrecision)
1554    {
1555       HYPRE_CUSPARSE_CALL(cusparseScsrilu02(handle, n, nnz_A, descr,
1556                                              (float *) A_data, A_i, A_j,
1557                                              matA_info, ilu_solve_policy, matA_buffer));
1558    }
1559 
1560    /* Check for zero pivot */
1561    HYPRE_CUSPARSE_CALL(cusparseXcsrilu02_zeroPivot(handle, matA_info, &zero_pivot));
1562 
1563    /* Done with factorization, finishing up */
1564    hypre_TFree(matA_buffer, HYPRE_MEMORY_DEVICE);
1565    HYPRE_CUSPARSE_CALL(cusparseDestroyCsrilu02Info(matA_info));
1566 
1567    return hypre_error_flag;
1568 }
1569 
1570 /* Wrapper for ILU0 solve analysis phase with cusparse on a matrix */
1571 HYPRE_Int
HYPRE_ILUSetupCusparseCSRILU0SetupSolve(hypre_CSRMatrix * A,cusparseMatDescr_t matL_des,cusparseMatDescr_t matU_des,cusparseSolvePolicy_t ilu_solve_policy,csrsv2Info_t * matL_infop,csrsv2Info_t * matU_infop,HYPRE_Int * buffer_sizep,void ** bufferp)1572 HYPRE_ILUSetupCusparseCSRILU0SetupSolve(hypre_CSRMatrix *A, cusparseMatDescr_t matL_des, cusparseMatDescr_t matU_des,
1573                               cusparseSolvePolicy_t ilu_solve_policy, csrsv2Info_t *matL_infop, csrsv2Info_t *matU_infop,
1574                               HYPRE_Int *buffer_sizep, void **bufferp)
1575 {
1576    if (!A)
1577    {
1578       /* return if A is NULL */
1579       *matL_infop    = NULL;
1580       *matU_infop    = NULL;
1581       *buffer_sizep  = 0;
1582       *bufferp       = NULL;
1583       return hypre_error_flag;
1584    }
1585 
1586    /* data objects for A */
1587    HYPRE_Int               n                    = hypre_CSRMatrixNumRows(A);
1588    HYPRE_Int               m                    = hypre_CSRMatrixNumCols(A);
1589 
1590    hypre_assert(n == m);
1591 
1592    if (n == 0)
1593    {
1594       /* return if A is 0 by 0 */
1595       *matL_infop    = NULL;
1596       *matU_infop    = NULL;
1597       *buffer_sizep  = 0;
1598       *bufferp       = NULL;
1599       return hypre_error_flag;
1600    }
1601 
1602    HYPRE_Real              *A_data              = hypre_CSRMatrixData(A);
1603    HYPRE_Int               *A_i                 = hypre_CSRMatrixI(A);
1604    HYPRE_Int               *A_j                 = hypre_CSRMatrixJ(A);
1605    HYPRE_Int               nnz_A                = A_i[n];
1606 
1607    /* pointers to cusparse data */
1608    csrsv2Info_t            matL_info            = *matL_infop;
1609    csrsv2Info_t            matU_info            = *matU_infop;
1610 
1611    /* clear data if already exists */
1612    if (matL_info)
1613    {
1614       HYPRE_CUSPARSE_CALL( cusparseDestroyCsrsv2Info(matL_info) );
1615       matL_info = NULL;
1616    }
1617    if (matU_info)
1618    {
1619       HYPRE_CUSPARSE_CALL( cusparseDestroyCsrsv2Info(matU_info) );
1620       matU_info = NULL;
1621    }
1622 
1623    /* variables and working arrays used during the ilu */
1624    HYPRE_Int               matL_buffersize;
1625    HYPRE_Int               matU_buffersize;
1626    HYPRE_Int               solve_buffersize;
1627    HYPRE_Int               solve_oldbuffersize  = *buffer_sizep;
1628    void                    *solve_buffer        = *bufferp;
1629 
1630    HYPRE_Int               isDoublePrecision    = sizeof(HYPRE_Complex) == sizeof(hypre_double);
1631    HYPRE_Int               isSinglePrecision    = sizeof(HYPRE_Complex) == sizeof(hypre_double) / 2;
1632 
1633    hypre_assert(isDoublePrecision || isSinglePrecision);
1634 
1635    cusparseHandle_t handle = hypre_HandleCusparseHandle(hypre_handle());
1636 
1637    /* 1. Create info for ilu setup and solve */
1638    HYPRE_CUSPARSE_CALL(cusparseCreateCsrsv2Info(&(matL_info)));
1639    HYPRE_CUSPARSE_CALL(cusparseCreateCsrsv2Info(&(matU_info)));
1640 
1641    /* 2. Get working array size */
1642    if (isDoublePrecision)
1643    {
1644 
1645       HYPRE_CUSPARSE_CALL(cusparseDcsrsv2_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n, nnz_A,
1646                                                       matL_des, (hypre_double *) A_data, A_i, A_j,
1647                                                       matL_info, &matL_buffersize));
1648 
1649       HYPRE_CUSPARSE_CALL(cusparseDcsrsv2_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n, nnz_A,
1650                                                       matU_des, (hypre_double *) A_data, A_i, A_j,
1651                                                       matU_info, &matU_buffersize));
1652    }
1653    else if (isSinglePrecision)
1654    {
1655 
1656       HYPRE_CUSPARSE_CALL(cusparseScsrsv2_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n, nnz_A,
1657                                                       matL_des, (float *) A_data, A_i, A_j,
1658                                                       matL_info, &matL_buffersize));
1659 
1660       HYPRE_CUSPARSE_CALL(cusparseScsrsv2_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n, nnz_A,
1661                                                       matU_des, (float *) A_data, A_i, A_j,
1662                                                       matU_info, &matU_buffersize));
1663    }
1664    solve_buffersize = hypre_max( matL_buffersize, matU_buffersize );
1665    /* 3. Create working array, since they won't be visited by host, allocate on device */
1666    if (solve_buffersize > solve_oldbuffersize)
1667    {
1668       if (solve_buffer)
1669       {
1670          solve_buffer                           = hypre_ReAlloc_v2(solve_buffer, solve_oldbuffersize, solve_buffersize, HYPRE_MEMORY_DEVICE);
1671       }
1672       else
1673       {
1674          solve_buffer                           = hypre_MAlloc(solve_buffersize, HYPRE_MEMORY_DEVICE);
1675       }
1676    }
1677 
1678    /* 4. Now perform the analysis */
1679    if (isDoublePrecision)
1680    {
1681 
1682       HYPRE_CUSPARSE_CALL(cusparseDcsrsv2_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1683                                                       n, nnz_A, matL_des,
1684                                                       (hypre_double *) A_data, A_i, A_j,
1685                                                       matL_info, ilu_solve_policy, solve_buffer));
1686 
1687       HYPRE_CUSPARSE_CALL(cusparseDcsrsv2_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1688                                                       n, nnz_A, matU_des,
1689                                                       (hypre_double *) A_data, A_i, A_j,
1690                                                       matU_info, ilu_solve_policy, solve_buffer));
1691    }
1692    else if (isSinglePrecision)
1693    {
1694 
1695       HYPRE_CUSPARSE_CALL(cusparseScsrsv2_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1696                                                       n, nnz_A, matL_des,
1697                                                       (float *) A_data, A_i, A_j,
1698                                                       matL_info, ilu_solve_policy, solve_buffer));
1699 
1700       HYPRE_CUSPARSE_CALL(cusparseScsrsv2_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1701                                                       n, nnz_A, matU_des,
1702                                                       (float *) A_data, A_i, A_j,
1703                                                       matU_info, ilu_solve_policy, solve_buffer));
1704    }
1705 
1706    /* Done with analysis, finishing up */
1707    /* Set return value */
1708    *matL_infop    = matL_info;
1709    *matU_infop    = matU_info;
1710    *buffer_sizep  = solve_buffersize;
1711    *bufferp       = solve_buffer;
1712 
1713    return hypre_error_flag;
1714 }
1715 
1716 /* ILU(0) (GPU)
1717  * A = input matrix
1718  * perm = permutation array indicating ordering of rows. Perm could come from a
1719  *    CF_marker array or a reordering routine.
1720  * qperm = permutation array indicating ordering of columns
1721  * nI = number of interial unknowns
1722  * nLU = size of incomplete factorization, nLU should obey nLU <= nI.
1723  *    Schur complement is formed if nLU < n
1724  * Lptr, Dptr, Uptr, Sptr = L, D, U, S factors. Note that with CUDA, Dptr and Uptr are unused
1725  * xtempp, ytempp = helper vector used in 2-level solve.
1726  * A_fake_diagp = fake diagonal for matvec
1727  * will form global Schur Matrix if nLU < n
1728  */
1729 HYPRE_Int
hypre_ILUSetupILU0Device(hypre_ParCSRMatrix * A,HYPRE_Int * perm,HYPRE_Int * qperm,HYPRE_Int n,HYPRE_Int nLU,cusparseMatDescr_t matL_des,cusparseMatDescr_t matU_des,cusparseSolvePolicy_t ilu_solve_policy,void ** bufferp,csrsv2Info_t * matBL_infop,csrsv2Info_t * matBU_infop,csrsv2Info_t * matSL_infop,csrsv2Info_t * matSU_infop,hypre_CSRMatrix ** BLUptr,hypre_ParCSRMatrix ** matSptr,hypre_CSRMatrix ** Eptr,hypre_CSRMatrix ** Fptr,HYPRE_Int ** A_fake_diag_ip)1730 hypre_ILUSetupILU0Device(hypre_ParCSRMatrix *A, HYPRE_Int *perm, HYPRE_Int *qperm, HYPRE_Int n, HYPRE_Int nLU,
1731                            cusparseMatDescr_t matL_des, cusparseMatDescr_t matU_des, cusparseSolvePolicy_t ilu_solve_policy,
1732                            void **bufferp, csrsv2Info_t *matBL_infop, csrsv2Info_t *matBU_infop,
1733                            csrsv2Info_t *matSL_infop, csrsv2Info_t *matSU_infop,
1734                            hypre_CSRMatrix **BLUptr, hypre_ParCSRMatrix **matSptr, hypre_CSRMatrix **Eptr, hypre_CSRMatrix **Fptr,
1735                            HYPRE_Int **A_fake_diag_ip)
1736 {
1737    /* GPU-accelerated ILU0 with cusparse */
1738    HYPRE_Int               i, j, k1, k2, k3, col;
1739 
1740    /* communication stuffs for S */
1741    MPI_Comm                comm                 = hypre_ParCSRMatrixComm(A);
1742 
1743    HYPRE_Int               my_id, num_procs;
1744    hypre_MPI_Comm_size(comm,&num_procs);
1745    hypre_MPI_Comm_rank(comm,&my_id);
1746 
1747    hypre_ParCSRCommPkg     *comm_pkg;
1748    hypre_ParCSRCommHandle  *comm_handle;
1749    HYPRE_Int               num_sends, begin, end;
1750    HYPRE_BigInt            *send_buf            = NULL;
1751    HYPRE_Int               *rperm               = NULL;
1752    HYPRE_Int               *rqperm              = NULL;
1753 
1754    hypre_ParCSRMatrix      *matS                = NULL;
1755    hypre_CSRMatrix         *A_diag              = NULL;
1756    HYPRE_Int               *A_fake_diag_i       = NULL;
1757    hypre_CSRMatrix         *A_offd              = NULL;
1758    HYPRE_Int               *A_offd_i            = NULL;
1759    HYPRE_Int               *A_offd_j            = NULL;
1760    HYPRE_Real              *A_offd_data         = NULL;
1761    hypre_CSRMatrix         *SLU                 = NULL;
1762    /* pointers to cusparse data */
1763    csrsv2Info_t            matBL_info           = NULL;
1764    csrsv2Info_t            matBU_info           = NULL;
1765    csrsv2Info_t            matSL_info           = NULL;
1766    csrsv2Info_t            matSU_info           = NULL;
1767 
1768    HYPRE_Int               buffer_size          = 0;
1769    void                    *buffer              = NULL;
1770 
1771    /* variables for matS */
1772    HYPRE_Int               m                    = n - nLU;
1773    HYPRE_Int               nI                   = nLU;//use default
1774    HYPRE_Int               e                    = 0;
1775    HYPRE_Int               m_e                  = m;
1776    HYPRE_BigInt            total_rows;
1777    HYPRE_BigInt            col_starts[2];
1778    HYPRE_Int               *S_diag_i            = NULL;
1779    HYPRE_Int               S_diag_nnz;
1780    hypre_CSRMatrix         *S_offd              = NULL;
1781    HYPRE_Int               *S_offd_i            = NULL;
1782    HYPRE_Int               *S_offd_j            = NULL;
1783    HYPRE_Real              *S_offd_data         = NULL;
1784    HYPRE_BigInt            *S_offd_colmap       = NULL;
1785    HYPRE_Int               S_offd_nnz;
1786    HYPRE_Int               S_offd_ncols;
1787 
1788    /* set data slots */
1789    A_offd                                       = hypre_ParCSRMatrixOffd(A);
1790    A_offd_i                                     = hypre_CSRMatrixI(A_offd);
1791    A_offd_j                                     = hypre_CSRMatrixJ(A_offd);
1792    A_offd_data                                  = hypre_CSRMatrixData(A_offd);
1793 
1794    /* unfortunately we need to build the reverse permutation array */
1795    rperm                                        = hypre_CTAlloc(HYPRE_Int, n, HYPRE_MEMORY_HOST);
1796    rqperm                                       = hypre_CTAlloc(HYPRE_Int, n, HYPRE_MEMORY_HOST);
1797    for (i = 0; i < n; i++)
1798    {
1799       rperm[perm[i]] = i;
1800       rqperm[qperm[i]] = i;
1801    }
1802 
1803    /* Only call ILU when we really have a matrix on this processor */
1804    if (n > 0)
1805    {
1806       /* Copy diagonal matrix into a new place with permutation
1807        * That is, A_diag = A_diag(perm,qperm);
1808        */
1809       hypre_ParILUCusparseExtractDiagonalCSR(A, perm, rqperm, &A_diag);
1810 
1811       /* Apply ILU factorization to the entile A_diag */
1812       HYPRE_ILUSetupCusparseCSRILU0(A_diag, ilu_solve_policy);
1813 
1814       /* | L \ U (B) L^{-1}F  |
1815        * | EU^{-1}   L \ U (S)|
1816        * Extract submatrix L_B U_B, L_S U_S, EU_B^{-1}, L_B^{-1}F
1817        * Note that in this function after ILU, all rows are sorted
1818        * in a way different than HYPRE. Diagonal is not listed in the front
1819        */
1820       hypre_ParILUCusparseILUExtractEBFC(A_diag, nLU, BLUptr, &SLU, Eptr, Fptr);
1821    }
1822    else
1823    {
1824       *BLUptr = NULL;
1825       *Eptr = NULL;
1826       *Fptr = NULL;
1827       SLU = NULL;
1828    }
1829 
1830    /* create B */
1831    /* only analyse when nacessary */
1832    if ( nLU > 0 )
1833    {
1834       /* Analysis of BILU */
1835       HYPRE_ILUSetupCusparseCSRILU0SetupSolve(*BLUptr, matL_des, matU_des,
1836                                  ilu_solve_policy, &matBL_info, &matBU_info,
1837                                  &buffer_size, &buffer);
1838    }
1839 
1840    HYPRE_BigInt big_m = (HYPRE_BigInt)m;
1841    hypre_MPI_Allreduce(&big_m, &total_rows, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
1842    /* only form when total_rows > 0 */
1843    if ( total_rows > 0 )
1844    {
1845       /* now create S */
1846       /* need to get new column start */
1847       {
1848          HYPRE_BigInt global_start;
1849          hypre_MPI_Scan( &big_m, &global_start, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
1850          col_starts[0] = global_start - m;
1851          col_starts[1] = global_start;
1852       }
1853 
1854       A_fake_diag_i = hypre_CTAlloc(HYPRE_Int, m + 1, HYPRE_MEMORY_DEVICE);
1855       if (SLU)
1856       {
1857          /* Analysis of SILU */
1858          HYPRE_ILUSetupCusparseCSRILU0SetupSolve(SLU, matL_des, matU_des,
1859                                        ilu_solve_policy, &matSL_info, &matSU_info,
1860                                        &buffer_size, &buffer);
1861       }
1862       else
1863       {
1864          SLU = hypre_CSRMatrixCreate(0,0,0);
1865          hypre_CSRMatrixInitialize(SLU);
1866       }
1867       S_diag_i = hypre_CSRMatrixI(SLU);
1868       S_diag_nnz = S_diag_i[m];
1869       /* Build ParCSRMatrix matS
1870        * For example when np == 3 the new matrix takes the following form
1871        * |IS_1 E_12 E_13|
1872        * |E_21 IS_2 E_22| = S
1873        * |E_31 E_32 IS_3|
1874        * In which IS_i is the cusparse ILU factorization of S_i in one matrix
1875        * */
1876 
1877       /* We did nothing to A_offd, so all the data kept, just reorder them
1878        * The create function takes comm, global num rows/cols,
1879        *    row/col start, num cols offd, nnz diag, nnz offd
1880        */
1881       S_offd_nnz = hypre_CSRMatrixNumNonzeros(A_offd);
1882       S_offd_ncols = hypre_CSRMatrixNumCols(A_offd);
1883 
1884       matS = hypre_ParCSRMatrixCreate( comm,
1885                            total_rows,
1886                            total_rows,
1887                            col_starts,
1888                            col_starts,
1889                            S_offd_ncols,
1890                            S_diag_nnz,
1891                            S_offd_nnz);
1892 
1893       /* first put diagonal data in */
1894       hypre_CSRMatrixDestroy(hypre_ParCSRMatrixDiag(matS));
1895       hypre_ParCSRMatrixDiag(matS) = SLU;
1896 
1897       /* now start to construct offdiag of S */
1898       S_offd = hypre_ParCSRMatrixOffd(matS);
1899       S_offd_i = hypre_TAlloc(HYPRE_Int, m+1, HYPRE_MEMORY_DEVICE);
1900       S_offd_j = hypre_TAlloc(HYPRE_Int, S_offd_nnz, HYPRE_MEMORY_DEVICE);
1901       S_offd_data = hypre_TAlloc(HYPRE_Real, S_offd_nnz, HYPRE_MEMORY_DEVICE);
1902       S_offd_colmap = hypre_CTAlloc(HYPRE_BigInt, S_offd_ncols, HYPRE_MEMORY_HOST);
1903 
1904       /* simply use a loop to copy data from A_offd */
1905       S_offd_i[0] = 0;
1906       k3 = 0;
1907       for (i = 1; i <= e; i++)
1908       {
1909          S_offd_i[i] = k3;
1910       }
1911       for (i = 0; i < m_e; i++)
1912       {
1913          col = perm[i + nI];
1914          k1 = A_offd_i[col];
1915          k2 = A_offd_i[col+1];
1916          for (j = k1; j < k2; j++)
1917          {
1918             S_offd_j[k3] = A_offd_j[j];
1919             S_offd_data[k3++] = A_offd_data[j];
1920          }
1921          S_offd_i[i+1+e] = k3;
1922       }
1923 
1924       /* give I, J, DATA to S_offd */
1925       hypre_CSRMatrixI(S_offd) = S_offd_i;
1926       hypre_CSRMatrixJ(S_offd) = S_offd_j;
1927       hypre_CSRMatrixData(S_offd) = S_offd_data;
1928 
1929       /* now we need to update S_offd_colmap */
1930       comm_pkg = hypre_ParCSRMatrixCommPkg(A);
1931       /* setup comm_pkg if not yet built */
1932       if (!comm_pkg)
1933       {
1934          hypre_MatvecCommPkgCreate(A);
1935          comm_pkg = hypre_ParCSRMatrixCommPkg(A);
1936       }
1937       /* get total num of send */
1938       num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
1939       begin = hypre_ParCSRCommPkgSendMapStart(comm_pkg,0);
1940       end = hypre_ParCSRCommPkgSendMapStart(comm_pkg,num_sends);
1941       send_buf = hypre_TAlloc(HYPRE_BigInt, end - begin, HYPRE_MEMORY_HOST);
1942       /* copy new index into send_buf */
1943       for (i = begin; i < end; i++)
1944       {
1945          send_buf[i-begin] = rperm[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,i)] - nLU + col_starts[0];
1946       }
1947 
1948       /* main communication */
1949       comm_handle = hypre_ParCSRCommHandleCreate(21, comm_pkg, send_buf, S_offd_colmap);
1950       hypre_ParCSRCommHandleDestroy(comm_handle);
1951 
1952       /* setup index */
1953       hypre_ParCSRMatrixColMapOffd(matS) = S_offd_colmap;
1954 
1955       hypre_ILUSortOffdColmap(matS);
1956 
1957       /* free */
1958       hypre_TFree(send_buf, HYPRE_MEMORY_HOST);
1959    } /* end of forming S */
1960 
1961    *matSptr       = matS;
1962    *bufferp       = buffer;
1963    *matBL_infop   = matBL_info;
1964    *matBU_infop   = matBU_info;
1965    *matSL_infop   = matSL_info;
1966    *matSU_infop   = matSU_info;
1967    *A_fake_diag_ip= A_fake_diag_i;
1968 
1969    /* Destroy the bridge after acrossing the river */
1970    hypre_CSRMatrixDestroy(A_diag);
1971    hypre_TFree(rperm, HYPRE_MEMORY_HOST);
1972    hypre_TFree(rqperm, HYPRE_MEMORY_HOST);
1973 
1974    return hypre_error_flag;
1975 }
1976 
1977 HYPRE_Int
hypre_ILUSetupILUKDevice(hypre_ParCSRMatrix * A,HYPRE_Int lfil,HYPRE_Int * perm,HYPRE_Int * qperm,HYPRE_Int n,HYPRE_Int nLU,cusparseMatDescr_t matL_des,cusparseMatDescr_t matU_des,cusparseSolvePolicy_t ilu_solve_policy,void ** bufferp,csrsv2Info_t * matBL_infop,csrsv2Info_t * matBU_infop,csrsv2Info_t * matSL_infop,csrsv2Info_t * matSU_infop,hypre_CSRMatrix ** BLUptr,hypre_ParCSRMatrix ** matSptr,hypre_CSRMatrix ** Eptr,hypre_CSRMatrix ** Fptr,HYPRE_Int ** A_fake_diag_ip)1978 hypre_ILUSetupILUKDevice(hypre_ParCSRMatrix *A, HYPRE_Int lfil, HYPRE_Int *perm, HYPRE_Int *qperm, HYPRE_Int n, HYPRE_Int nLU,
1979                            cusparseMatDescr_t matL_des, cusparseMatDescr_t matU_des, cusparseSolvePolicy_t ilu_solve_policy,
1980                            void **bufferp, csrsv2Info_t *matBL_infop, csrsv2Info_t *matBU_infop,
1981                            csrsv2Info_t *matSL_infop, csrsv2Info_t *matSU_infop,
1982                            hypre_CSRMatrix **BLUptr, hypre_ParCSRMatrix **matSptr, hypre_CSRMatrix **Eptr, hypre_CSRMatrix **Fptr,
1983                            HYPRE_Int **A_fake_diag_ip)
1984 {
1985    /* GPU-accelerated ILU0 with cusparse */
1986    HYPRE_Int               i, j, k1, k2, k3, col;
1987 
1988    /* communication stuffs for S */
1989    MPI_Comm                comm                 = hypre_ParCSRMatrixComm(A);
1990 
1991    HYPRE_Int               my_id, num_procs;
1992    hypre_MPI_Comm_size(comm,&num_procs);
1993    hypre_MPI_Comm_rank(comm,&my_id);
1994 
1995    hypre_ParCSRCommPkg     *comm_pkg;
1996    hypre_ParCSRCommHandle  *comm_handle;
1997    HYPRE_Int               num_sends, begin, end;
1998    HYPRE_BigInt            *send_buf            = NULL;
1999    HYPRE_Int               *rperm               = NULL;
2000    HYPRE_Int               *rqperm              = NULL;
2001 
2002    hypre_ParCSRMatrix      *Apq                 = NULL;
2003    hypre_ParCSRMatrix      *ALU                 = NULL;
2004 
2005    hypre_ParCSRMatrix      *matS                = NULL;
2006    hypre_CSRMatrix         *A_diag              = NULL;
2007    HYPRE_Int               *A_fake_diag_i       = NULL;
2008    hypre_CSRMatrix         *A_offd              = NULL;
2009    HYPRE_Int               *A_offd_i            = NULL;
2010    HYPRE_Int               *A_offd_j            = NULL;
2011    HYPRE_Real              *A_offd_data         = NULL;
2012    hypre_CSRMatrix         *SLU                 = NULL;
2013    /* pointers to cusparse data */
2014    csrsv2Info_t            matBL_info           = NULL;
2015    csrsv2Info_t            matBU_info           = NULL;
2016    csrsv2Info_t            matSL_info           = NULL;
2017    csrsv2Info_t            matSU_info           = NULL;
2018 
2019    HYPRE_Int               buffer_size          = 0;
2020    void                    *buffer              = NULL;
2021 
2022    /* variables for matS */
2023    HYPRE_Int               m                    = n - nLU;
2024    HYPRE_Int               nI                   = nLU;//use default
2025    HYPRE_Int               e                    = 0;
2026    HYPRE_Int               m_e                  = m;
2027    HYPRE_BigInt            total_rows;
2028    HYPRE_BigInt            col_starts[2];
2029    HYPRE_Int               *S_diag_i            = NULL;
2030    HYPRE_Int               S_diag_nnz;
2031    hypre_CSRMatrix         *S_offd              = NULL;
2032    HYPRE_Int               *S_offd_i            = NULL;
2033    HYPRE_Int               *S_offd_j            = NULL;
2034    HYPRE_Real              *S_offd_data         = NULL;
2035    HYPRE_BigInt            *S_offd_colmap       = NULL;
2036    HYPRE_Int               S_offd_nnz;
2037    HYPRE_Int               S_offd_ncols;
2038 
2039    /* set data slots */
2040    A_offd                                       = hypre_ParCSRMatrixOffd(A);
2041    A_offd_i                                     = hypre_CSRMatrixI(A_offd);
2042    A_offd_j                                     = hypre_CSRMatrixJ(A_offd);
2043    A_offd_data                                  = hypre_CSRMatrixData(A_offd);
2044 
2045    hypre_ParCSRMatrix      *parL = NULL;
2046    hypre_ParCSRMatrix      *parU = NULL;
2047    hypre_ParCSRMatrix      *parS = NULL;
2048    HYPRE_Real              *parD = NULL;
2049    HYPRE_Int               *uend = NULL;
2050 
2051    /* unfortunately we need to build the reverse permutation array */
2052    rperm                                        = hypre_CTAlloc(HYPRE_Int, n, HYPRE_MEMORY_HOST);
2053    rqperm                                       = hypre_CTAlloc(HYPRE_Int, n, HYPRE_MEMORY_HOST);
2054    for (i = 0; i < n; i++)
2055    {
2056       rperm[perm[i]] = i;
2057       rqperm[qperm[i]] = i;
2058    }
2059 
2060    /* Only call ILU when we really have a matrix on this processor */
2061    if (n > 0)
2062    {
2063       /* Copy diagonal matrix into a new place with permutation
2064        * That is, A_diag = A_diag(perm,qperm);
2065        */
2066       hypre_ParILURAPReorder( A, perm, rqperm, &Apq);
2067 
2068       /* Apply ILU factorization to the entile A_diag */
2069       hypre_ILUSetupILUK(Apq, lfil, NULL, NULL, n, n, &parL, &parD, &parU, &parS, &uend);
2070 
2071       if (uend)
2072       {
2073          hypre_TFree(uend, HYPRE_MEMORY_HOST);
2074       }
2075 
2076       if (parS)
2077       {
2078          hypre_ParCSRMatrixDestroy(parS);
2079       }
2080 
2081       /* | L \ U (B) L^{-1}F  |
2082        * | EU^{-1}   L \ U (S)|
2083        * Extract submatrix L_B U_B, L_S U_S, EU_B^{-1}, L_B^{-1}F
2084        * Note that in this function after ILU, all rows are sorted
2085        * in a way different than HYPRE. Diagonal is not listed in the front
2086        */
2087       hypre_ILUSetupLDUtoCusparse( parL, parD, parU, &ALU);
2088 
2089       if (parL)
2090       {
2091          hypre_ParCSRMatrixDestroy(parL);
2092       }
2093       if (parD)
2094       {
2095          hypre_TFree(parD, HYPRE_MEMORY_DEVICE);
2096       }
2097       if (parU)
2098       {
2099          hypre_ParCSRMatrixDestroy(parU);
2100       }
2101 
2102       A_diag = hypre_ParCSRMatrixDiag(ALU);
2103 
2104       hypre_ParILUCusparseILUExtractEBFC(A_diag, nLU, BLUptr, &SLU, Eptr, Fptr);
2105 
2106       if (Apq)
2107       {
2108          hypre_ParCSRMatrixDestroy(Apq);
2109       }
2110 
2111    }
2112    else
2113    {
2114       *BLUptr = NULL;
2115       *Eptr = NULL;
2116       *Fptr = NULL;
2117       SLU = NULL;
2118    }
2119 
2120    /* create B */
2121    /* only analyse when nacessary */
2122    if ( nLU > 0 )
2123    {
2124       /* Analysis of BILU */
2125       HYPRE_ILUSetupCusparseCSRILU0SetupSolve(*BLUptr, matL_des, matU_des,
2126                                  ilu_solve_policy, &matBL_info, &matBU_info,
2127                                  &buffer_size, &buffer);
2128    }
2129 
2130    HYPRE_BigInt big_m = (HYPRE_BigInt)m;
2131    hypre_MPI_Allreduce(&big_m, &total_rows, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
2132    /* only form when total_rows > 0 */
2133    if ( total_rows > 0 )
2134    {
2135       /* now create S */
2136       /* need to get new column start */
2137       {
2138          HYPRE_BigInt global_start;
2139          hypre_MPI_Scan( &big_m, &global_start, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
2140          col_starts[0] = global_start - m;
2141          col_starts[1] = global_start;
2142       }
2143 
2144       A_fake_diag_i = hypre_CTAlloc(HYPRE_Int, m + 1, HYPRE_MEMORY_DEVICE);
2145       if (SLU)
2146       {
2147          /* Analysis of SILU */
2148          HYPRE_ILUSetupCusparseCSRILU0SetupSolve(SLU, matL_des, matU_des,
2149                                        ilu_solve_policy, &matSL_info, &matSU_info,
2150                                        &buffer_size, &buffer);
2151       }
2152       else
2153       {
2154          SLU = hypre_CSRMatrixCreate(0,0,0);
2155          hypre_CSRMatrixInitialize(SLU);
2156       }
2157       S_diag_i = hypre_CSRMatrixI(SLU);
2158       S_diag_nnz = S_diag_i[m];
2159       /* Build ParCSRMatrix matS
2160        * For example when np == 3 the new matrix takes the following form
2161        * |IS_1 E_12 E_13|
2162        * |E_21 IS_2 E_22| = S
2163        * |E_31 E_32 IS_3|
2164        * In which IS_i is the cusparse ILU factorization of S_i in one matrix
2165        * */
2166 
2167       /* We did nothing to A_offd, so all the data kept, just reorder them
2168        * The create function takes comm, global num rows/cols,
2169        *    row/col start, num cols offd, nnz diag, nnz offd
2170        */
2171       S_offd_nnz = hypre_CSRMatrixNumNonzeros(A_offd);
2172       S_offd_ncols = hypre_CSRMatrixNumCols(A_offd);
2173 
2174       matS = hypre_ParCSRMatrixCreate( comm,
2175                            total_rows,
2176                            total_rows,
2177                            col_starts,
2178                            col_starts,
2179                            S_offd_ncols,
2180                            S_diag_nnz,
2181                            S_offd_nnz);
2182 
2183       /* first put diagonal data in */
2184       hypre_CSRMatrixDestroy(hypre_ParCSRMatrixDiag(matS));
2185       hypre_ParCSRMatrixDiag(matS) = SLU;
2186 
2187       /* now start to construct offdiag of S */
2188       S_offd = hypre_ParCSRMatrixOffd(matS);
2189       S_offd_i = hypre_TAlloc(HYPRE_Int, m+1, HYPRE_MEMORY_DEVICE);
2190       S_offd_j = hypre_TAlloc(HYPRE_Int, S_offd_nnz, HYPRE_MEMORY_DEVICE);
2191       S_offd_data = hypre_TAlloc(HYPRE_Real, S_offd_nnz, HYPRE_MEMORY_DEVICE);
2192       S_offd_colmap = hypre_CTAlloc(HYPRE_BigInt, S_offd_ncols, HYPRE_MEMORY_HOST);
2193 
2194       /* simply use a loop to copy data from A_offd */
2195       S_offd_i[0] = 0;
2196       k3 = 0;
2197       for (i = 1; i <= e; i++)
2198       {
2199          S_offd_i[i] = k3;
2200       }
2201       for (i = 0; i < m_e; i++)
2202       {
2203          col = perm[i + nI];
2204          k1 = A_offd_i[col];
2205          k2 = A_offd_i[col+1];
2206          for (j = k1; j < k2; j++)
2207          {
2208             S_offd_j[k3] = A_offd_j[j];
2209             S_offd_data[k3++] = A_offd_data[j];
2210          }
2211          S_offd_i[i+1+e] = k3;
2212       }
2213 
2214       /* give I, J, DATA to S_offd */
2215       hypre_CSRMatrixI(S_offd) = S_offd_i;
2216       hypre_CSRMatrixJ(S_offd) = S_offd_j;
2217       hypre_CSRMatrixData(S_offd) = S_offd_data;
2218 
2219       /* now we need to update S_offd_colmap */
2220       comm_pkg = hypre_ParCSRMatrixCommPkg(A);
2221       /* setup comm_pkg if not yet built */
2222       if (!comm_pkg)
2223       {
2224          hypre_MatvecCommPkgCreate(A);
2225          comm_pkg = hypre_ParCSRMatrixCommPkg(A);
2226       }
2227       /* get total num of send */
2228       num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
2229       begin = hypre_ParCSRCommPkgSendMapStart(comm_pkg,0);
2230       end = hypre_ParCSRCommPkgSendMapStart(comm_pkg,num_sends);
2231       send_buf = hypre_TAlloc(HYPRE_BigInt, end - begin, HYPRE_MEMORY_HOST);
2232       /* copy new index into send_buf */
2233       for (i = begin; i < end; i++)
2234       {
2235          send_buf[i-begin] = rperm[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,i)] - nLU + col_starts[0];
2236       }
2237 
2238       /* main communication */
2239       comm_handle = hypre_ParCSRCommHandleCreate(21, comm_pkg, send_buf, S_offd_colmap);
2240       hypre_ParCSRCommHandleDestroy(comm_handle);
2241 
2242       /* setup index */
2243       hypre_ParCSRMatrixColMapOffd(matS) = S_offd_colmap;
2244 
2245       hypre_ILUSortOffdColmap(matS);
2246 
2247       /* free */
2248       hypre_TFree(send_buf, HYPRE_MEMORY_HOST);
2249    } /* end of forming S */
2250 
2251    *matSptr       = matS;
2252    *bufferp       = buffer;
2253    *matBL_infop   = matBL_info;
2254    *matBU_infop   = matBU_info;
2255    *matSL_infop   = matSL_info;
2256    *matSU_infop   = matSU_info;
2257    *A_fake_diag_ip= A_fake_diag_i;
2258 
2259    /* Destroy the bridge after acrossing the river */
2260    hypre_CSRMatrixDestroy(A_diag);
2261    hypre_TFree(rperm, HYPRE_MEMORY_HOST);
2262    hypre_TFree(rqperm, HYPRE_MEMORY_HOST);
2263 
2264    return hypre_error_flag;
2265 }
2266 
2267 
2268 HYPRE_Int
hypre_ILUSetupILUTDevice(hypre_ParCSRMatrix * A,HYPRE_Int lfil,HYPRE_Real * tol,HYPRE_Int * perm,HYPRE_Int * qperm,HYPRE_Int n,HYPRE_Int nLU,cusparseMatDescr_t matL_des,cusparseMatDescr_t matU_des,cusparseSolvePolicy_t ilu_solve_policy,void ** bufferp,csrsv2Info_t * matBL_infop,csrsv2Info_t * matBU_infop,csrsv2Info_t * matSL_infop,csrsv2Info_t * matSU_infop,hypre_CSRMatrix ** BLUptr,hypre_ParCSRMatrix ** matSptr,hypre_CSRMatrix ** Eptr,hypre_CSRMatrix ** Fptr,HYPRE_Int ** A_fake_diag_ip)2269 hypre_ILUSetupILUTDevice(hypre_ParCSRMatrix *A, HYPRE_Int lfil, HYPRE_Real *tol, HYPRE_Int *perm, HYPRE_Int *qperm, HYPRE_Int n, HYPRE_Int nLU,
2270                            cusparseMatDescr_t matL_des, cusparseMatDescr_t matU_des, cusparseSolvePolicy_t ilu_solve_policy,
2271                            void **bufferp, csrsv2Info_t *matBL_infop, csrsv2Info_t *matBU_infop,
2272                            csrsv2Info_t *matSL_infop, csrsv2Info_t *matSU_infop,
2273                            hypre_CSRMatrix **BLUptr, hypre_ParCSRMatrix **matSptr, hypre_CSRMatrix **Eptr, hypre_CSRMatrix **Fptr,
2274                            HYPRE_Int **A_fake_diag_ip)
2275 {
2276    /* GPU-accelerated ILU0 with cusparse */
2277    HYPRE_Int               i, j, k1, k2, k3, col;
2278 
2279    /* communication stuffs for S */
2280    MPI_Comm                comm                 = hypre_ParCSRMatrixComm(A);
2281 
2282    HYPRE_Int               my_id, num_procs;
2283    hypre_MPI_Comm_size(comm,&num_procs);
2284    hypre_MPI_Comm_rank(comm,&my_id);
2285 
2286    hypre_ParCSRCommPkg     *comm_pkg;
2287    hypre_ParCSRCommHandle  *comm_handle;
2288    HYPRE_Int               num_sends, begin, end;
2289    HYPRE_BigInt            *send_buf            = NULL;
2290    HYPRE_Int               *rperm               = NULL;
2291    HYPRE_Int               *rqperm              = NULL;
2292 
2293    hypre_ParCSRMatrix      *Apq                 = NULL;
2294    hypre_ParCSRMatrix      *ALU                 = NULL;
2295 
2296    hypre_ParCSRMatrix      *matS                = NULL;
2297    hypre_CSRMatrix         *A_diag              = NULL;
2298    HYPRE_Int               *A_fake_diag_i       = NULL;
2299    hypre_CSRMatrix         *A_offd              = NULL;
2300    HYPRE_Int               *A_offd_i            = NULL;
2301    HYPRE_Int               *A_offd_j            = NULL;
2302    HYPRE_Real              *A_offd_data         = NULL;
2303    hypre_CSRMatrix         *SLU                 = NULL;
2304    /* pointers to cusparse data */
2305    csrsv2Info_t            matBL_info           = NULL;
2306    csrsv2Info_t            matBU_info           = NULL;
2307    csrsv2Info_t            matSL_info           = NULL;
2308    csrsv2Info_t            matSU_info           = NULL;
2309 
2310    HYPRE_Int               buffer_size          = 0;
2311    void                    *buffer              = NULL;
2312 
2313    /* variables for matS */
2314    HYPRE_Int               m                    = n - nLU;
2315    HYPRE_Int               nI                   = nLU;//use default
2316    HYPRE_Int               e                    = 0;
2317    HYPRE_Int               m_e                  = m;
2318    HYPRE_BigInt            total_rows;
2319    HYPRE_BigInt            col_starts[2];
2320    HYPRE_Int               *S_diag_i            = NULL;
2321    HYPRE_Int               S_diag_nnz;
2322    hypre_CSRMatrix         *S_offd              = NULL;
2323    HYPRE_Int               *S_offd_i            = NULL;
2324    HYPRE_Int               *S_offd_j            = NULL;
2325    HYPRE_Real              *S_offd_data         = NULL;
2326    HYPRE_BigInt            *S_offd_colmap       = NULL;
2327    HYPRE_Int               S_offd_nnz;
2328    HYPRE_Int               S_offd_ncols;
2329 
2330    /* set data slots */
2331    A_offd                                       = hypre_ParCSRMatrixOffd(A);
2332    A_offd_i                                     = hypre_CSRMatrixI(A_offd);
2333    A_offd_j                                     = hypre_CSRMatrixJ(A_offd);
2334    A_offd_data                                  = hypre_CSRMatrixData(A_offd);
2335 
2336    hypre_ParCSRMatrix      *parL = NULL;
2337    hypre_ParCSRMatrix      *parU = NULL;
2338    hypre_ParCSRMatrix      *parS = NULL;
2339    HYPRE_Real              *parD = NULL;
2340    HYPRE_Int               *uend = NULL;
2341 
2342    /* unfortunately we need to build the reverse permutation array */
2343    rperm                                        = hypre_CTAlloc(HYPRE_Int, n, HYPRE_MEMORY_HOST);
2344    rqperm                                       = hypre_CTAlloc(HYPRE_Int, n, HYPRE_MEMORY_HOST);
2345    for (i = 0; i < n; i++)
2346    {
2347       rperm[perm[i]] = i;
2348       rqperm[qperm[i]] = i;
2349    }
2350 
2351    /* Only call ILU when we really have a matrix on this processor */
2352    if (n > 0)
2353    {
2354       /* Copy diagonal matrix into a new place with permutation
2355        * That is, A_diag = A_diag(perm,qperm);
2356        */
2357       hypre_ParILURAPReorder( A, perm, rqperm, &Apq);
2358 
2359       /* Apply ILU factorization to the entile A_diag */
2360       hypre_ILUSetupILUT(Apq, lfil, tol, NULL, NULL, n, n, &parL, &parD, &parU, &parS, &uend);
2361 
2362       if (uend)
2363       {
2364          hypre_TFree(uend, HYPRE_MEMORY_HOST);
2365       }
2366 
2367       if (parS)
2368       {
2369          hypre_ParCSRMatrixDestroy(parS);
2370       }
2371 
2372       /* | L \ U (B) L^{-1}F  |
2373        * | EU^{-1}   L \ U (S)|
2374        * Extract submatrix L_B U_B, L_S U_S, EU_B^{-1}, L_B^{-1}F
2375        * Note that in this function after ILU, all rows are sorted
2376        * in a way different than HYPRE. Diagonal is not listed in the front
2377        */
2378       hypre_ILUSetupLDUtoCusparse( parL, parD, parU, &ALU);
2379 
2380       if (parL)
2381       {
2382          hypre_ParCSRMatrixDestroy(parL);
2383       }
2384       if (parD)
2385       {
2386          hypre_TFree(parD, HYPRE_MEMORY_DEVICE);
2387       }
2388       if (parU)
2389       {
2390          hypre_ParCSRMatrixDestroy(parU);
2391       }
2392 
2393       A_diag = hypre_ParCSRMatrixDiag(ALU);
2394 
2395       hypre_ParILUCusparseILUExtractEBFC(A_diag, nLU, BLUptr, &SLU, Eptr, Fptr);
2396 
2397       if (Apq)
2398       {
2399          hypre_ParCSRMatrixDestroy(Apq);
2400       }
2401 
2402    }
2403    else
2404    {
2405       *BLUptr = NULL;
2406       *Eptr = NULL;
2407       *Fptr = NULL;
2408       SLU = NULL;
2409    }
2410 
2411    /* create B */
2412    /* only analyse when nacessary */
2413    if ( nLU > 0 )
2414    {
2415       /* Analysis of BILU */
2416       HYPRE_ILUSetupCusparseCSRILU0SetupSolve(*BLUptr, matL_des, matU_des,
2417                                  ilu_solve_policy, &matBL_info, &matBU_info,
2418                                  &buffer_size, &buffer);
2419    }
2420 
2421    HYPRE_BigInt big_m = (HYPRE_BigInt)m;
2422    hypre_MPI_Allreduce(&big_m, &total_rows, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
2423    /* only form when total_rows > 0 */
2424    if ( total_rows > 0 )
2425    {
2426       /* now create S */
2427       /* need to get new column start */
2428       {
2429          HYPRE_BigInt global_start;
2430          hypre_MPI_Scan( &big_m, &global_start, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
2431          col_starts[0] = global_start - m;
2432          col_starts[1] = global_start;
2433       }
2434 
2435       A_fake_diag_i = hypre_CTAlloc(HYPRE_Int, m + 1, HYPRE_MEMORY_DEVICE);
2436       if (SLU)
2437       {
2438          /* Analysis of SILU */
2439          HYPRE_ILUSetupCusparseCSRILU0SetupSolve(SLU, matL_des, matU_des,
2440                                        ilu_solve_policy, &matSL_info, &matSU_info,
2441                                        &buffer_size, &buffer);
2442       }
2443       else
2444       {
2445          SLU = hypre_CSRMatrixCreate(0,0,0);
2446          hypre_CSRMatrixInitialize(SLU);
2447       }
2448       S_diag_i = hypre_CSRMatrixI(SLU);
2449       S_diag_nnz = S_diag_i[m];
2450       /* Build ParCSRMatrix matS
2451        * For example when np == 3 the new matrix takes the following form
2452        * |IS_1 E_12 E_13|
2453        * |E_21 IS_2 E_22| = S
2454        * |E_31 E_32 IS_3|
2455        * In which IS_i is the cusparse ILU factorization of S_i in one matrix
2456        * */
2457 
2458       /* We did nothing to A_offd, so all the data kept, just reorder them
2459        * The create function takes comm, global num rows/cols,
2460        *    row/col start, num cols offd, nnz diag, nnz offd
2461        */
2462       S_offd_nnz = hypre_CSRMatrixNumNonzeros(A_offd);
2463       S_offd_ncols = hypre_CSRMatrixNumCols(A_offd);
2464 
2465       matS = hypre_ParCSRMatrixCreate( comm,
2466                            total_rows,
2467                            total_rows,
2468                            col_starts,
2469                            col_starts,
2470                            S_offd_ncols,
2471                            S_diag_nnz,
2472                            S_offd_nnz);
2473 
2474       /* first put diagonal data in */
2475       hypre_CSRMatrixDestroy(hypre_ParCSRMatrixDiag(matS));
2476       hypre_ParCSRMatrixDiag(matS) = SLU;
2477 
2478       /* now start to construct offdiag of S */
2479       S_offd = hypre_ParCSRMatrixOffd(matS);
2480       S_offd_i = hypre_TAlloc(HYPRE_Int, m+1, HYPRE_MEMORY_DEVICE);
2481       S_offd_j = hypre_TAlloc(HYPRE_Int, S_offd_nnz, HYPRE_MEMORY_DEVICE);
2482       S_offd_data = hypre_TAlloc(HYPRE_Real, S_offd_nnz, HYPRE_MEMORY_DEVICE);
2483       S_offd_colmap = hypre_CTAlloc(HYPRE_BigInt, S_offd_ncols, HYPRE_MEMORY_HOST);
2484 
2485       /* simply use a loop to copy data from A_offd */
2486       S_offd_i[0] = 0;
2487       k3 = 0;
2488       for (i = 1; i <= e; i++)
2489       {
2490          S_offd_i[i] = k3;
2491       }
2492       for (i = 0; i < m_e; i++)
2493       {
2494          col = perm[i + nI];
2495          k1 = A_offd_i[col];
2496          k2 = A_offd_i[col+1];
2497          for (j = k1; j < k2; j++)
2498          {
2499             S_offd_j[k3] = A_offd_j[j];
2500             S_offd_data[k3++] = A_offd_data[j];
2501          }
2502          S_offd_i[i+1+e] = k3;
2503       }
2504 
2505       /* give I, J, DATA to S_offd */
2506       hypre_CSRMatrixI(S_offd) = S_offd_i;
2507       hypre_CSRMatrixJ(S_offd) = S_offd_j;
2508       hypre_CSRMatrixData(S_offd) = S_offd_data;
2509 
2510       /* now we need to update S_offd_colmap */
2511       comm_pkg = hypre_ParCSRMatrixCommPkg(A);
2512       /* setup comm_pkg if not yet built */
2513       if (!comm_pkg)
2514       {
2515          hypre_MatvecCommPkgCreate(A);
2516          comm_pkg = hypre_ParCSRMatrixCommPkg(A);
2517       }
2518       /* get total num of send */
2519       num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
2520       begin = hypre_ParCSRCommPkgSendMapStart(comm_pkg,0);
2521       end = hypre_ParCSRCommPkgSendMapStart(comm_pkg,num_sends);
2522       send_buf = hypre_TAlloc(HYPRE_BigInt, end - begin, HYPRE_MEMORY_HOST);
2523       /* copy new index into send_buf */
2524       for (i = begin; i < end; i++)
2525       {
2526          send_buf[i-begin] = rperm[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,i)] - nLU + col_starts[0];
2527       }
2528 
2529       /* main communication */
2530       comm_handle = hypre_ParCSRCommHandleCreate(21, comm_pkg, send_buf, S_offd_colmap);
2531       hypre_ParCSRCommHandleDestroy(comm_handle);
2532 
2533       /* setup index */
2534       hypre_ParCSRMatrixColMapOffd(matS) = S_offd_colmap;
2535 
2536       hypre_ILUSortOffdColmap(matS);
2537 
2538       /* free */
2539       hypre_TFree(send_buf, HYPRE_MEMORY_HOST);
2540    } /* end of forming S */
2541 
2542    *matSptr       = matS;
2543    *bufferp       = buffer;
2544    *matBL_infop   = matBL_info;
2545    *matBU_infop   = matBU_info;
2546    *matSL_infop   = matSL_info;
2547    *matSU_infop   = matSU_info;
2548    *A_fake_diag_ip= A_fake_diag_i;
2549 
2550    /* Destroy the bridge after acrossing the river */
2551    hypre_CSRMatrixDestroy(A_diag);
2552    hypre_TFree(rperm, HYPRE_MEMORY_HOST);
2553    hypre_TFree(rqperm, HYPRE_MEMORY_HOST);
2554 
2555    return hypre_error_flag;
2556 }
2557 
2558 /* Reorder matrix A based on local permutation (combine local permutation into global permutation)
2559  * WARNING: We don't put diagonal to the first entry of each row
2560  * A = input matrix
2561  * perm = permutation array indicating ordering of rows. Perm could come from a
2562  *    CF_marker array or a reordering routine.
2563  * rqperm = reverse permutation array indicating ordering of columns
2564  * A_pq = pointer to the output par CSR matrix.
2565  */
2566 HYPRE_Int
hypre_ParILURAPReorder(hypre_ParCSRMatrix * A,HYPRE_Int * perm,HYPRE_Int * rqperm,hypre_ParCSRMatrix ** A_pq)2567 hypre_ParILURAPReorder(hypre_ParCSRMatrix *A, HYPRE_Int *perm, HYPRE_Int *rqperm, hypre_ParCSRMatrix **A_pq)
2568 {
2569    /* Get necessary slots */
2570    hypre_CSRMatrix     *A_diag         = hypre_ParCSRMatrixDiag(A);
2571    //HYPRE_Int           *A_diag_i       = hypre_CSRMatrixI(A_diag);
2572    //HYPRE_Int           *A_diag_j       = hypre_CSRMatrixJ(A_diag);
2573    //HYPRE_Real          *A_diag_data    = hypre_CSRMatrixData(A_diag);
2574    HYPRE_Int            n              = hypre_CSRMatrixNumRows(A_diag);
2575    //HYPRE_Int            nnz_A_diag     = A_diag_i[n];
2576 
2577    //HYPRE_Int            i, j, current_idx;
2578    HYPRE_Int            i;
2579 
2580    /* MPI */
2581    MPI_Comm             comm                 = hypre_ParCSRMatrixComm(A);
2582    HYPRE_Int            num_procs,  my_id;
2583 
2584    hypre_MPI_Comm_size(comm,&num_procs);
2585    hypre_MPI_Comm_rank(comm,&my_id);
2586 
2587    /* Create permutation matrices P = I(perm,:) and Q(rqperm,:), such that Apq = PAQ */
2588    hypre_ParCSRMatrix *P, *Q, *PAQ, *PA;
2589 
2590    hypre_CSRMatrix *P_diag, *Q_diag;
2591    hypre_CSRMatrix *P_offd, *Q_offd;
2592 
2593    P = hypre_ParCSRMatrixCreate( comm,
2594                            hypre_ParCSRMatrixGlobalNumRows(A),
2595                            hypre_ParCSRMatrixGlobalNumRows(A),
2596                            hypre_ParCSRMatrixRowStarts(A),
2597                            hypre_ParCSRMatrixColStarts(A),
2598                            0,
2599                            n,
2600                            0);
2601 
2602    Q = hypre_ParCSRMatrixCreate( comm,
2603                            hypre_ParCSRMatrixGlobalNumRows(A),
2604                            hypre_ParCSRMatrixGlobalNumRows(A),
2605                            hypre_ParCSRMatrixRowStarts(A),
2606                            hypre_ParCSRMatrixColStarts(A),
2607                            0,
2608                            n,
2609                            0);
2610 
2611    P_diag = hypre_ParCSRMatrixDiag(P);
2612    Q_diag = hypre_ParCSRMatrixDiag(Q);
2613    P_offd = hypre_ParCSRMatrixOffd(P);
2614    Q_offd = hypre_ParCSRMatrixOffd(Q);
2615 
2616    HYPRE_Int   *P_diag_i, *P_diag_j, *Q_diag_i, *Q_diag_j;
2617    HYPRE_Real  *P_diag_data, *Q_diag_data;
2618    HYPRE_Int   *P_offd_i, *Q_offd_i;
2619 
2620    P_diag_i = hypre_TAlloc(HYPRE_Int, n+1, HYPRE_MEMORY_DEVICE);
2621    P_diag_j = hypre_TAlloc(HYPRE_Int, n, HYPRE_MEMORY_DEVICE);
2622    P_diag_data = hypre_TAlloc(HYPRE_Real, n, HYPRE_MEMORY_DEVICE);
2623 
2624    Q_diag_i = hypre_TAlloc(HYPRE_Int, n+1, HYPRE_MEMORY_DEVICE);
2625    Q_diag_j = hypre_TAlloc(HYPRE_Int, n, HYPRE_MEMORY_DEVICE);
2626    Q_diag_data = hypre_TAlloc(HYPRE_Real, n, HYPRE_MEMORY_DEVICE);
2627 
2628    /* fill data, openmp should be availiable here */
2629    for (i = 0; i < n; i++)
2630    {
2631       P_diag_i[i] = i;
2632       P_diag_j[i] = perm[i];
2633       P_diag_data[i] = 1.0;
2634 
2635       Q_diag_i[i] = i;
2636       Q_diag_j[i] = rqperm[i];
2637       Q_diag_data[i] = 1.0;
2638 
2639    }
2640    P_diag_i[n] = n;
2641    Q_diag_i[n] = n;
2642 
2643    /* give I, J, DATA */
2644    hypre_CSRMatrixI(P_diag) = P_diag_i;
2645    hypre_CSRMatrixJ(P_diag) = P_diag_j;
2646    hypre_CSRMatrixData(P_diag) = P_diag_data;
2647 
2648    hypre_CSRMatrixI(Q_diag) = Q_diag_i;
2649    hypre_CSRMatrixJ(Q_diag) = Q_diag_j;
2650    hypre_CSRMatrixData(Q_diag) = Q_diag_data;
2651 
2652    P_offd_i = hypre_CTAlloc(HYPRE_Int, n+1, HYPRE_MEMORY_DEVICE);
2653    Q_offd_i = hypre_CTAlloc(HYPRE_Int, n+1, HYPRE_MEMORY_DEVICE);
2654 
2655    hypre_CSRMatrixI(P_offd) = P_offd_i;
2656    hypre_CSRMatrixI(Q_offd) = Q_offd_i;
2657 
2658    /* Update A */
2659    PA = hypre_ParCSRMatMat(P, A);
2660    PAQ = hypre_ParCSRMatMat(PA, Q);
2661    //PAQ = hypre_ParCSRMatrixRAPKT(P, A, Q, 0);
2662 
2663    /* free and return */
2664    hypre_ParCSRMatrixDestroy(P);
2665    hypre_ParCSRMatrixDestroy(Q);
2666 
2667    *A_pq = PAQ;
2668 
2669    return hypre_error_flag;
2670 }
2671 
2672 /* Convert the L, D, U style to the cusparse style
2673  * Assume the diagonal of L and U are the ilu factorization, directly combine them
2674  */
2675 HYPRE_Int
hypre_ParILURAPBuildRP(hypre_ParCSRMatrix * A,hypre_ParCSRMatrix * BLUm,hypre_ParCSRMatrix * E,hypre_ParCSRMatrix * F,cusparseMatDescr_t matL_des,cusparseMatDescr_t matU_des,hypre_ParCSRMatrix ** Rp,hypre_ParCSRMatrix ** Pp)2676 hypre_ParILURAPBuildRP(hypre_ParCSRMatrix *A, hypre_ParCSRMatrix *BLUm, hypre_ParCSRMatrix* E, hypre_ParCSRMatrix *F,
2677                         cusparseMatDescr_t matL_des, cusparseMatDescr_t matU_des, hypre_ParCSRMatrix **Rp, hypre_ParCSRMatrix **Pp)
2678 {
2679    /* declare variables */
2680    HYPRE_Int            j, row, col;
2681    HYPRE_Real           val;
2682    hypre_ParCSRMatrix   *R, *P;
2683    hypre_CSRMatrix      *R_diag, *P_diag;
2684 
2685    hypre_CSRMatrix      *BLUm_diag           = hypre_ParCSRMatrixDiag(BLUm);
2686    HYPRE_Int            *BLUm_diag_i         = hypre_CSRMatrixI(BLUm_diag);
2687    HYPRE_Int            *BLUm_diag_j         = hypre_CSRMatrixJ(BLUm_diag);
2688    HYPRE_Real           *BLUm_diag_data      = hypre_CSRMatrixData(BLUm_diag);
2689 
2690    hypre_CSRMatrix      *E_diag              = hypre_ParCSRMatrixDiag(E);
2691    HYPRE_Int            *E_diag_i            = hypre_CSRMatrixI(E_diag);
2692    HYPRE_Int            *E_diag_j            = hypre_CSRMatrixJ(E_diag);
2693    HYPRE_Real           *E_diag_data         = hypre_CSRMatrixData(E_diag);
2694    hypre_CSRMatrix      *F_diag              = hypre_ParCSRMatrixDiag(F);
2695    HYPRE_Int            *F_diag_i            = hypre_CSRMatrixI(F_diag);
2696    HYPRE_Int            *F_diag_j            = hypre_CSRMatrixJ(F_diag);
2697    HYPRE_Real           *F_diag_data         = hypre_CSRMatrixData(F_diag);
2698 
2699    HYPRE_Int            n                    = hypre_CSRMatrixNumRows(F_diag);
2700    HYPRE_Int            m                    = hypre_CSRMatrixNumCols(F_diag);
2701 
2702    HYPRE_Int            nnz_BLUm             = BLUm_diag_i[n];
2703 
2704    /* MPI */
2705    MPI_Comm             comm                 = hypre_ParCSRMatrixComm(A);
2706    HYPRE_Int            num_procs,  my_id;
2707 
2708    hypre_MPI_Comm_size(comm,&num_procs);
2709    hypre_MPI_Comm_rank(comm,&my_id);
2710 
2711    /* cusparse */
2712    HYPRE_Int               isDoublePrecision    = sizeof(HYPRE_Complex) == sizeof(hypre_double);
2713    HYPRE_Int               isSinglePrecision    = sizeof(HYPRE_Complex) == sizeof(hypre_double) / 2;
2714 
2715    hypre_assert(isDoublePrecision || isSinglePrecision);
2716 
2717    cusparseHandle_t handle = hypre_HandleCusparseHandle(hypre_handle());
2718 
2719    /* compute P = -UB\(LB\F)
2720     * op(A) * op(X) = \alpha op(B)
2721     * first iLF = LB\F -> LB*iLF = F
2722     */
2723 
2724    HYPRE_Int               algo = 0;
2725    HYPRE_Real              alpha = 1.0;
2726    HYPRE_Real              *rhs;
2727    cusparseSolvePolicy_t   policy = CUSPARSE_SOLVE_POLICY_NO_LEVEL;
2728    size_t                  buffer_size, buffer_size_old;
2729    void                    *buffer;
2730    csrsm2Info_t            malL_info = NULL;
2731    HYPRE_CUSPARSE_CALL(cusparseCreateCsrsm2Info(&malL_info));
2732 
2733    rhs = hypre_CTAlloc(HYPRE_Real, m * n, HYPRE_MEMORY_DEVICE);
2734 
2735    /* fill data, note that rhs is in Fortan style (col first)
2736     * oprating by col is slow, but
2737     */
2738    for (row = 0; row < n; row++)
2739    {
2740       for (j = F_diag_i[row]; j < F_diag_i[row+1]; j++)
2741       {
2742          col = F_diag_j[j];
2743          *(rhs + col*n + row) = F_diag_data[j];
2744       }
2745    }
2746 
2747    /* check buffer size and create buffer */
2748 
2749    if (isDoublePrecision)
2750    {
2751       HYPRE_CUSPARSE_CALL(cusparseDcsrsm2_bufferSizeExt( handle, algo, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2752                                                          n, m, nnz_BLUm, (hypre_double *)&alpha, matL_des, (hypre_double *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (hypre_double *)rhs, n, malL_info, policy, &buffer_size));
2753    }
2754    else if (isSinglePrecision)
2755    {
2756       HYPRE_CUSPARSE_CALL(cusparseScsrsm2_bufferSizeExt( handle, algo, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2757                                                          n, m, nnz_BLUm, (float *)&alpha, matL_des, (float *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (float *)rhs, n, malL_info, policy, &buffer_size));
2758    }
2759 
2760    buffer = hypre_MAlloc(buffer_size, HYPRE_MEMORY_DEVICE);
2761 
2762    /* analysis */
2763 
2764    if (isDoublePrecision)
2765    {
2766       HYPRE_CUSPARSE_CALL(cusparseDcsrsm2_analysis( handle, algo, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2767                                                          n, m, nnz_BLUm, (hypre_double *)&alpha, matL_des, (hypre_double *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (hypre_double *)rhs, n, malL_info, policy, buffer));
2768    }
2769    else if (isSinglePrecision)
2770    {
2771       HYPRE_CUSPARSE_CALL(cusparseScsrsm2_analysis( handle, algo, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2772                                                          n, m, nnz_BLUm, (float *)&alpha, matL_des, (float *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (float *)rhs, n, malL_info, policy, buffer));
2773    }
2774 
2775    /* solve phase */
2776    if (isDoublePrecision)
2777    {
2778       HYPRE_CUSPARSE_CALL(cusparseDcsrsm2_solve( handle, algo, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2779                                                          n, m, nnz_BLUm, (hypre_double *)&alpha, matL_des, (hypre_double *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (hypre_double *)rhs, n, malL_info, policy, buffer));
2780    }
2781    else if (isSinglePrecision)
2782    {
2783       HYPRE_CUSPARSE_CALL(cusparseScsrsm2_solve( handle, algo, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2784                                                          n, m, nnz_BLUm, (float *)&alpha, matL_des, (float *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (float *)rhs, n, malL_info, policy, buffer));
2785    }
2786    /* now P = -UB\(LB\F) -> UB*P = -(LB\F)
2787     */
2788    alpha = -1.0;
2789    csrsm2Info_t            malU_info = NULL;
2790    HYPRE_CUSPARSE_CALL(cusparseCreateCsrsm2Info(&malU_info));
2791 
2792    buffer_size_old = buffer_size;
2793 
2794    /* check buffer size and create buffer */
2795 
2796    if (isDoublePrecision)
2797    {
2798       HYPRE_CUSPARSE_CALL(cusparseDcsrsm2_bufferSizeExt( handle, algo, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2799                                                          n, m, nnz_BLUm, (hypre_double *)&alpha, matU_des, (hypre_double *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (hypre_double *)rhs, n, malU_info, policy, &buffer_size));
2800    }
2801    else if (isSinglePrecision)
2802    {
2803       HYPRE_CUSPARSE_CALL(cusparseScsrsm2_bufferSizeExt( handle, algo, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2804                                                          n, m, nnz_BLUm, (float *)&alpha, matU_des, (float *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (float *)rhs, n, malU_info, policy, &buffer_size));
2805    }
2806 
2807    if (buffer_size > buffer_size_old)
2808    {
2809       buffer = hypre_ReAlloc_v2(buffer, buffer_size_old, buffer_size, HYPRE_MEMORY_DEVICE);
2810       buffer_size_old = buffer_size;
2811    }
2812 
2813    /* analysis */
2814 
2815    if (isDoublePrecision)
2816    {
2817       HYPRE_CUSPARSE_CALL(cusparseDcsrsm2_analysis( handle, algo, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2818                                                          n, m, nnz_BLUm, (hypre_double *)&alpha, matU_des, (hypre_double *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (hypre_double *)rhs, n, malU_info, policy, buffer));
2819    }
2820    else if (isSinglePrecision)
2821    {
2822       HYPRE_CUSPARSE_CALL(cusparseScsrsm2_analysis( handle, algo, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2823                                                          n, m, nnz_BLUm, (float *)&alpha, matU_des, (float *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (float *)rhs, n, malU_info, policy, buffer));
2824    }
2825 
2826    /* solve phase */
2827    if (isDoublePrecision)
2828    {
2829       HYPRE_CUSPARSE_CALL(cusparseDcsrsm2_solve( handle, algo, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2830                                                          n, m, nnz_BLUm, (hypre_double *)&alpha, matU_des, (hypre_double *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (hypre_double *)rhs, n, malU_info, policy, buffer));
2831    }
2832    else if (isSinglePrecision)
2833    {
2834       HYPRE_CUSPARSE_CALL(cusparseScsrsm2_solve( handle, algo, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2835                                                          n, m, nnz_BLUm, (float *)&alpha, matU_des, (float *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (float *)rhs, n, malU_info, policy, buffer));
2836    }
2837    /* wait till GPU done to copy data */
2838    cudaDeviceSynchronize();
2839    /* now form P, (n + m) * m */
2840    HYPRE_Real           drop_tol = 1e-06;
2841    HYPRE_Int            ctrP = 0;
2842    HYPRE_Int            *P_diag_i;
2843    HYPRE_Int            *P_offd_i;
2844    HYPRE_Int            *P_diag_j;
2845    HYPRE_Real           *P_diag_data;
2846 
2847    HYPRE_Int             capacity_P = nnz_BLUm + m;
2848 
2849    P_diag_i       = hypre_TAlloc(HYPRE_Int, n+m+1, HYPRE_MEMORY_DEVICE);
2850    P_offd_i       = hypre_CTAlloc(HYPRE_Int, n+m+1, HYPRE_MEMORY_DEVICE);
2851    P_diag_j       = hypre_TAlloc(HYPRE_Int, capacity_P, HYPRE_MEMORY_DEVICE);
2852    P_diag_data    = hypre_TAlloc(HYPRE_Real, capacity_P, HYPRE_MEMORY_DEVICE);
2853 
2854    for (row = 0; row < n; row++)
2855    {
2856       P_diag_i[row] = ctrP;
2857       for (col = 0; col < m; col++)
2858       {
2859          val = *(rhs + col*n + row);
2860          if (hypre_abs(val) > drop_tol)
2861          {
2862             if (ctrP >= capacity_P)
2863             {
2864                HYPRE_Int tmp;
2865                tmp = capacity_P;
2866                capacity_P = capacity_P * EXPAND_FACT;
2867                P_diag_j       = hypre_TReAlloc_v2(P_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_P, HYPRE_MEMORY_DEVICE);
2868                P_diag_data    = hypre_TReAlloc_v2(P_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_P, HYPRE_MEMORY_DEVICE);
2869             }
2870             P_diag_j[ctrP] = col;
2871             P_diag_data[ctrP++] = val;
2872          }
2873       }
2874    }
2875 
2876    if (ctrP + m >= capacity_P)
2877    {
2878       HYPRE_Int tmp;
2879       tmp = capacity_P;
2880       capacity_P = ctrP + m;
2881       P_diag_j       = hypre_TReAlloc_v2(P_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_P, HYPRE_MEMORY_DEVICE);
2882       P_diag_data    = hypre_TReAlloc_v2(P_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_P, HYPRE_MEMORY_DEVICE);
2883    }
2884 
2885    for (row = 0; row < m; row++)
2886    {
2887       P_diag_i[row+n] = ctrP;
2888       P_diag_j[ctrP] = row;
2889       P_diag_data[ctrP++] = 1.0;
2890    }
2891 
2892    P_diag_i[m+n] = ctrP;
2893 
2894    /* now start to form R = - (E / UB ) / LB
2895     * first EiUB = E / UB -> UB'*EiUB'=E'
2896     */
2897    alpha = 1.0;
2898    csrsm2Info_t            malU_info2 = NULL;
2899    HYPRE_CUSPARSE_CALL(cusparseCreateCsrsm2Info(&malU_info2));
2900 
2901    /* fill data, note that rhs is in Fortan style (col first)
2902     * oprating by col is slow, but
2903     */
2904 
2905    hypre_TFree(rhs, HYPRE_MEMORY_DEVICE);
2906    rhs = hypre_CTAlloc(HYPRE_Real, m * n, HYPRE_MEMORY_DEVICE);
2907 
2908    for (row = 0; row < m; row++)
2909    {
2910       for (j = E_diag_i[row]; j < E_diag_i[row+1]; j++)
2911       {
2912          col = E_diag_j[j];
2913          *(rhs + col*m + row) = E_diag_data[j];
2914       }
2915    }
2916 
2917    /* check buffer size and create buffer */
2918 
2919    if (isDoublePrecision)
2920    {
2921       HYPRE_CUSPARSE_CALL(cusparseDcsrsm2_bufferSizeExt( handle, algo, CUSPARSE_OPERATION_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE,
2922                                                          n, m, nnz_BLUm, (hypre_double *)&alpha, matU_des, (hypre_double *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (hypre_double *)rhs, m, malU_info2, policy, &buffer_size));
2923    }
2924    else if (isSinglePrecision)
2925    {
2926       HYPRE_CUSPARSE_CALL(cusparseScsrsm2_bufferSizeExt( handle, algo, CUSPARSE_OPERATION_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE,
2927                                                          n, m, nnz_BLUm, (float *)&alpha, matU_des, (float *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (float *)rhs, m, malU_info2, policy, &buffer_size));
2928    }
2929 
2930    if (buffer_size > buffer_size_old)
2931    {
2932       buffer = hypre_ReAlloc_v2(buffer, buffer_size_old, buffer_size, HYPRE_MEMORY_DEVICE);
2933       buffer_size_old = buffer_size;
2934    }
2935 
2936    /* analysis */
2937 
2938    if (isDoublePrecision)
2939    {
2940       HYPRE_CUSPARSE_CALL(cusparseDcsrsm2_analysis( handle, algo, CUSPARSE_OPERATION_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE,
2941                                                          n, m, nnz_BLUm, (hypre_double *)&alpha, matU_des, (hypre_double *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (hypre_double *)rhs, m, malU_info2, policy, buffer));
2942    }
2943    else if (isSinglePrecision)
2944    {
2945       HYPRE_CUSPARSE_CALL(cusparseScsrsm2_analysis( handle, algo, CUSPARSE_OPERATION_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE,
2946                                                       n, m, nnz_BLUm, (float *)&alpha, matU_des, (float *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (float *)rhs, m, malU_info2, policy, buffer));
2947    }
2948 
2949    /* solve phase */
2950    if (isDoublePrecision)
2951    {
2952       HYPRE_CUSPARSE_CALL(cusparseDcsrsm2_solve( handle, algo, CUSPARSE_OPERATION_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE,
2953                                                          n, m, nnz_BLUm, (hypre_double *)&alpha, matU_des, (hypre_double *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (hypre_double *)rhs, m, malU_info2, policy, buffer));
2954    }
2955    else if (isSinglePrecision)
2956    {
2957       HYPRE_CUSPARSE_CALL(cusparseScsrsm2_solve( handle, algo, CUSPARSE_OPERATION_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE,
2958                                                          n, m, nnz_BLUm, (float *)&alpha, matU_des, (float *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (float *)rhs, m, malU_info2, policy, buffer));
2959    }
2960 
2961    /* R = - (EiUB ) / LB -> LB'R' = -EiUB'
2962     */
2963    alpha = -1.0;
2964    csrsm2Info_t            malL_info2 = NULL;
2965    HYPRE_CUSPARSE_CALL(cusparseCreateCsrsm2Info(&malL_info2));
2966 
2967    /* check buffer size and create buffer */
2968 
2969    if (isDoublePrecision)
2970    {
2971       HYPRE_CUSPARSE_CALL(cusparseDcsrsm2_bufferSizeExt( handle, algo, CUSPARSE_OPERATION_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE,
2972                                                          n, m, nnz_BLUm, (hypre_double *)&alpha, matL_des, (hypre_double *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (hypre_double *)rhs, m, malL_info2, policy, &buffer_size));
2973    }
2974    else if (isSinglePrecision)
2975    {
2976       HYPRE_CUSPARSE_CALL(cusparseScsrsm2_bufferSizeExt( handle, algo, CUSPARSE_OPERATION_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE,
2977                                                          n, m, nnz_BLUm, (float *)&alpha, matL_des, (float *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (float *)rhs, m, malL_info2, policy, &buffer_size));
2978    }
2979 
2980    if (buffer_size > buffer_size_old)
2981    {
2982       buffer = hypre_ReAlloc_v2(buffer, buffer_size_old, buffer_size, HYPRE_MEMORY_DEVICE);
2983       buffer_size_old = buffer_size;
2984    }
2985 
2986    /* analysis */
2987 
2988    if (isDoublePrecision)
2989    {
2990       HYPRE_CUSPARSE_CALL(cusparseDcsrsm2_analysis( handle, algo, CUSPARSE_OPERATION_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE,
2991                                                          n, m, nnz_BLUm, (hypre_double *)&alpha, matL_des, (hypre_double *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (hypre_double *)rhs, m, malL_info2, policy, buffer));
2992    }
2993    else if (isSinglePrecision)
2994    {
2995       HYPRE_CUSPARSE_CALL(cusparseScsrsm2_analysis( handle, algo, CUSPARSE_OPERATION_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE,
2996                                                       n, m, nnz_BLUm, (float *)&alpha, matL_des, (float *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (float *)rhs, m, malL_info2, policy, buffer));
2997    }
2998 
2999    /* solve phase */
3000    if (isDoublePrecision)
3001    {
3002       HYPRE_CUSPARSE_CALL(cusparseDcsrsm2_solve( handle, algo, CUSPARSE_OPERATION_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE,
3003                                                          n, m, nnz_BLUm, (hypre_double *)&alpha, matL_des, (hypre_double *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (hypre_double *)rhs, m, malL_info2, policy, buffer));
3004    }
3005    else if (isSinglePrecision)
3006    {
3007       HYPRE_CUSPARSE_CALL(cusparseScsrsm2_solve( handle, algo, CUSPARSE_OPERATION_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE,
3008                                                          n, m, nnz_BLUm, (float *)&alpha, matL_des, (float *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (float *)rhs, m, malL_info2, policy, buffer));
3009    }
3010    cudaDeviceSynchronize();
3011    /* now form R, m * (n + m) */
3012    HYPRE_Int            ctrR = 0;
3013    HYPRE_Int            *R_diag_i;
3014    HYPRE_Int            *R_offd_i;
3015    HYPRE_Int            *R_diag_j;
3016    HYPRE_Real           *R_diag_data;
3017 
3018    HYPRE_Int       capacity_R = nnz_BLUm + m;
3019    R_diag_i       = hypre_TAlloc(HYPRE_Int, m+1, HYPRE_MEMORY_DEVICE);
3020    R_offd_i       = hypre_CTAlloc(HYPRE_Int, m+1, HYPRE_MEMORY_DEVICE);
3021    R_diag_j       = hypre_TAlloc(HYPRE_Int, capacity_R, HYPRE_MEMORY_DEVICE);
3022    R_diag_data    = hypre_TAlloc(HYPRE_Real, capacity_R, HYPRE_MEMORY_DEVICE);
3023 
3024    for (row = 0; row < m; row++)
3025    {
3026       R_diag_i[row] = ctrR;
3027       for (col = 0; col < n; col++)
3028       {
3029          val = *(rhs + col*m + row);
3030          if (hypre_abs(val) > drop_tol)
3031          {
3032             if (ctrR >= capacity_R)
3033             {
3034                HYPRE_Int tmp;
3035                tmp = capacity_R;
3036                capacity_R = capacity_R * EXPAND_FACT;
3037                R_diag_j       = hypre_TReAlloc_v2(R_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_R, HYPRE_MEMORY_DEVICE);
3038                R_diag_data    = hypre_TReAlloc_v2(R_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_R, HYPRE_MEMORY_DEVICE);
3039             }
3040             R_diag_j[ctrR] = col;
3041             R_diag_data[ctrR++] = val;
3042          }
3043       }
3044       if (ctrR >= capacity_R)
3045       {
3046          HYPRE_Int tmp;
3047          tmp = capacity_R;
3048          capacity_R = capacity_R * EXPAND_FACT;
3049          R_diag_j       = hypre_TReAlloc_v2(R_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_R, HYPRE_MEMORY_DEVICE);
3050          R_diag_data    = hypre_TReAlloc_v2(R_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_R, HYPRE_MEMORY_DEVICE);
3051       }
3052       R_diag_j[ctrR] = n + row;
3053       R_diag_data[ctrR++] = 1.0;
3054    }
3055 
3056    R_diag_i[m] = ctrR;
3057 
3058    hypre_TFree(buffer, HYPRE_MEMORY_DEVICE);
3059 
3060    /* create ParCSR matrices */
3061 
3062    R = hypre_ParCSRMatrixCreate( hypre_ParCSRMatrixComm(A),
3063                         hypre_ParCSRMatrixGlobalNumRows(E),
3064                         hypre_ParCSRMatrixGlobalNumCols(A),
3065                         hypre_ParCSRMatrixRowStarts(E),
3066                         hypre_ParCSRMatrixColStarts(A),
3067                         0,
3068                         ctrR,
3069                         0);
3070 
3071    P = hypre_ParCSRMatrixCreate( hypre_ParCSRMatrixComm(A),
3072                         hypre_ParCSRMatrixGlobalNumRows(A),
3073                         hypre_ParCSRMatrixGlobalNumCols(F),
3074                         hypre_ParCSRMatrixRowStarts(A),
3075                         hypre_ParCSRMatrixColStarts(F),
3076                         0,
3077                         ctrP,
3078                         0);
3079 
3080    /* Assign value to diagonal data */
3081 
3082    R_diag = hypre_ParCSRMatrixDiag(R);
3083    hypre_CSRMatrixI(R_diag) = R_diag_i;
3084    hypre_CSRMatrixJ(R_diag) = R_diag_j;
3085    hypre_CSRMatrixData(R_diag) = R_diag_data;
3086    hypre_CSRMatrixSetDataOwner(R_diag, 1);
3087 
3088    P_diag = hypre_ParCSRMatrixDiag(P);
3089    hypre_CSRMatrixI(P_diag) = P_diag_i;
3090    hypre_CSRMatrixJ(P_diag) = P_diag_j;
3091    hypre_CSRMatrixData(P_diag) = P_diag_data;
3092    hypre_CSRMatrixSetDataOwner(P_diag, 1);
3093 
3094    /* Assign value to off diagonal data */
3095 
3096    R_diag = hypre_ParCSRMatrixOffd(R);
3097    hypre_CSRMatrixI(R_diag) = R_offd_i;
3098    P_diag = hypre_ParCSRMatrixOffd(P);
3099    hypre_CSRMatrixI(P_diag) = P_offd_i;
3100 
3101    *Rp = R;
3102    *Pp = P;
3103 
3104    HYPRE_CUSPARSE_CALL(cusparseDestroyCsrsm2Info(malL_info));
3105    HYPRE_CUSPARSE_CALL(cusparseDestroyCsrsm2Info(malU_info));
3106    HYPRE_CUSPARSE_CALL(cusparseDestroyCsrsm2Info(malL_info2));
3107    HYPRE_CUSPARSE_CALL(cusparseDestroyCsrsm2Info(malU_info2));
3108 
3109    return hypre_error_flag;
3110 }
3111 
3112 /* Convert the L, D, U style to the cusparse style
3113  * Assume the diagonal of L and U are the ilu factorization, directly combine them
3114  */
3115 HYPRE_Int
hypre_ILUSetupLDUtoCusparse(hypre_ParCSRMatrix * L,HYPRE_Real * D,hypre_ParCSRMatrix * U,hypre_ParCSRMatrix ** LDUp)3116 hypre_ILUSetupLDUtoCusparse(hypre_ParCSRMatrix *L, HYPRE_Real *D, hypre_ParCSRMatrix *U, hypre_ParCSRMatrix **LDUp)
3117 {
3118    /* data slots */
3119    HYPRE_Int            i, j, pos;
3120 
3121    hypre_CSRMatrix      *L_diag        = hypre_ParCSRMatrixDiag(L);
3122    hypre_CSRMatrix      *U_diag        = hypre_ParCSRMatrixDiag(U);
3123    HYPRE_Int            *L_diag_i      = hypre_CSRMatrixI(L_diag);
3124    HYPRE_Int            *L_diag_j      = hypre_CSRMatrixJ(L_diag);
3125    HYPRE_Real           *L_diag_data   = hypre_CSRMatrixData(L_diag);
3126    HYPRE_Int            *U_diag_i      = hypre_CSRMatrixI(U_diag);
3127    HYPRE_Int            *U_diag_j      = hypre_CSRMatrixJ(U_diag);
3128    HYPRE_Real           *U_diag_data   = hypre_CSRMatrixData(U_diag);
3129    HYPRE_Int            n              = hypre_ParCSRMatrixNumRows(L);
3130    HYPRE_Int            nnz_L          = L_diag_i[n];
3131    HYPRE_Int            nnz_U          = U_diag_i[n];
3132    HYPRE_Int            nnz_LDU        = n + nnz_L + nnz_U;
3133 
3134    hypre_ParCSRMatrix   *LDU;
3135    hypre_CSRMatrix      *LDU_diag;
3136    HYPRE_Int            *LDU_diag_i;
3137    HYPRE_Int            *LDU_diag_j;
3138    HYPRE_Real           *LDU_diag_data;
3139 
3140    /* MPI */
3141    MPI_Comm             comm                 = hypre_ParCSRMatrixComm(L);
3142    HYPRE_Int            num_procs,  my_id;
3143 
3144    hypre_MPI_Comm_size(comm,&num_procs);
3145    hypre_MPI_Comm_rank(comm,&my_id);
3146 
3147 
3148    /* cuda data slot */
3149 
3150    /* create matrix */
3151 
3152    LDU = hypre_ParCSRMatrixCreate(  comm,
3153                                     hypre_ParCSRMatrixGlobalNumRows(L),
3154                                     hypre_ParCSRMatrixGlobalNumRows(L),
3155                                     hypre_ParCSRMatrixRowStarts(L),
3156                                     hypre_ParCSRMatrixColStarts(L),
3157                                     0,
3158                                     nnz_LDU,
3159                                     0);
3160 
3161    LDU_diag = hypre_ParCSRMatrixDiag(LDU);
3162    LDU_diag_i = hypre_TAlloc(HYPRE_Int, n+1, HYPRE_MEMORY_DEVICE);
3163    LDU_diag_j = hypre_TAlloc(HYPRE_Int, nnz_LDU, HYPRE_MEMORY_DEVICE);
3164    LDU_diag_data = hypre_TAlloc(HYPRE_Real, nnz_LDU, HYPRE_MEMORY_DEVICE);
3165 
3166    pos = 0;
3167 
3168    for (i = 1; i <= n; i++)
3169    {
3170       LDU_diag_i[i-1] = pos;
3171       for (j = L_diag_i[i-1]; j < L_diag_i[i]; j++)
3172       {
3173          LDU_diag_j[pos] = L_diag_j[j];
3174          LDU_diag_data[pos++] = L_diag_data[j];
3175       }
3176       LDU_diag_j[pos] = i-1;
3177       LDU_diag_data[pos++] = 1.0/D[i-1];
3178       for (j = U_diag_i[i-1]; j < U_diag_i[i]; j++)
3179       {
3180          LDU_diag_j[pos] = U_diag_j[j];
3181          LDU_diag_data[pos++] = U_diag_data[j];
3182       }
3183    }
3184    LDU_diag_i[n] = pos;
3185 
3186    hypre_CSRMatrixI(LDU_diag)    = LDU_diag_i;
3187    hypre_CSRMatrixJ(LDU_diag)    = LDU_diag_j;
3188    hypre_CSRMatrixData(LDU_diag) = LDU_diag_data;
3189 
3190    /* now sort */
3191    hypre_CSRMatrixSortRow(LDU_diag);
3192    hypre_ParCSRMatrixDiag(LDU) = LDU_diag;
3193 
3194    *LDUp = LDU;
3195 
3196    return hypre_error_flag;
3197 }
3198 
3199 /* Apply the (modified) ILU factorization to the diagonal block of A only.
3200  * A: matrix
3201  * ALUp: pointer to the result, factorization stroed on the diagonal
3202  * modified: set to 0 to use classical ILU0
3203  */
3204 HYPRE_Int
hypre_ILUSetupRAPMILU0(hypre_ParCSRMatrix * A,hypre_ParCSRMatrix ** ALUp,HYPRE_Int modified)3205 hypre_ILUSetupRAPMILU0(hypre_ParCSRMatrix *A, hypre_ParCSRMatrix **ALUp, HYPRE_Int modified)
3206 {
3207    HYPRE_Int            n              = hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(A));
3208    /* Get necessary slots */
3209    hypre_ParCSRMatrix   *L, *U, *S, *ALU;
3210    HYPRE_Real           *D;
3211    HYPRE_Int            *u_end;
3212 
3213    /* u_end is the end position of the upper triangular part (if we need E and F implicitly), not used here */
3214    hypre_ILUSetupMILU0( A, NULL, NULL, n, n, &L, &D, &U, &S, &u_end, modified);
3215    hypre_TFree(u_end, HYPRE_MEMORY_HOST);
3216 
3217    hypre_ILUSetupLDUtoCusparse(L, D, U, &ALU);
3218 
3219    if (L)
3220    {
3221       hypre_ParCSRMatrixDestroy(L);
3222    }
3223    if (D)
3224    {
3225       hypre_TFree(D, HYPRE_MEMORY_DEVICE);
3226    }
3227    if (U)
3228    {
3229       hypre_ParCSRMatrixDestroy(U);
3230    }
3231 
3232    *ALUp = ALU;
3233 
3234    return hypre_error_flag;
3235 }
3236 
3237 /* Modified ILU(0) with RAP like solve
3238  * A = input matrix
3239  * Not explicitly forming the matrix, the previous version was abondoned
3240  */
3241 HYPRE_Int
hypre_ILUSetupRAPILU0Device(hypre_ParCSRMatrix * A,HYPRE_Int * perm,HYPRE_Int n,HYPRE_Int nLU,cusparseMatDescr_t matL_des,cusparseMatDescr_t matU_des,cusparseSolvePolicy_t ilu_solve_policy,void ** bufferp,csrsv2Info_t * matAL_infop,csrsv2Info_t * matAU_infop,csrsv2Info_t * matBL_infop,csrsv2Info_t * matBU_infop,csrsv2Info_t * matSL_infop,csrsv2Info_t * matSU_infop,hypre_ParCSRMatrix ** Apermptr,hypre_ParCSRMatrix ** matSptr,hypre_CSRMatrix ** ALUptr,hypre_CSRMatrix ** BLUptr,hypre_CSRMatrix ** CLUptr,hypre_CSRMatrix ** Eptr,hypre_CSRMatrix ** Fptr,HYPRE_Int test_opt)3242 hypre_ILUSetupRAPILU0Device(hypre_ParCSRMatrix *A, HYPRE_Int *perm, HYPRE_Int n, HYPRE_Int nLU,
3243                            cusparseMatDescr_t matL_des, cusparseMatDescr_t matU_des, cusparseSolvePolicy_t ilu_solve_policy,
3244                            void **bufferp, csrsv2Info_t *matAL_infop, csrsv2Info_t *matAU_infop,
3245                            csrsv2Info_t *matBL_infop, csrsv2Info_t *matBU_infop,
3246                            csrsv2Info_t *matSL_infop, csrsv2Info_t *matSU_infop,
3247                            hypre_ParCSRMatrix **Apermptr, hypre_ParCSRMatrix **matSptr, hypre_CSRMatrix **ALUptr, hypre_CSRMatrix **BLUptr, hypre_CSRMatrix **CLUptr,
3248                            hypre_CSRMatrix **Eptr, hypre_CSRMatrix **Fptr, HYPRE_Int test_opt)
3249 {
3250 
3251    /* params */
3252    MPI_Comm             comm           = hypre_ParCSRMatrixComm(A);
3253    HYPRE_Int            *rperm         = NULL;
3254 
3255    csrsv2Info_t         matAL_info     = NULL;
3256    csrsv2Info_t         matAU_info     = NULL;
3257    csrsv2Info_t         matBL_info     = NULL;
3258    csrsv2Info_t         matBU_info     = NULL;
3259    csrsv2Info_t         matSL_info     = NULL;
3260    csrsv2Info_t         matSU_info     = NULL;
3261 
3262    HYPRE_Int            buffer_size    = 0;
3263    void                 *buffer        = NULL;
3264 
3265    //hypre_CSRMatrix      *A_diag        = hypre_ParCSRMatrixDiag(A);
3266    HYPRE_Int            m              = n - nLU;
3267 
3268    //printf("Size of local Schur: %d\n",m);
3269 
3270    HYPRE_Int            i;
3271 
3272    /* MPI */
3273    HYPRE_Int            num_procs,  my_id;
3274    hypre_MPI_Comm_size(comm,&num_procs);
3275    hypre_MPI_Comm_rank(comm,&my_id);
3276 
3277    /* Matrix Structure */
3278    hypre_ParCSRMatrix   *Apq, *ALU, *ALUm, *S;
3279    hypre_CSRMatrix      *Amd, *Ad, *SLU, *Apq_diag;
3280 
3281    rperm                               = hypre_CTAlloc(HYPRE_Int, n, HYPRE_MEMORY_HOST);
3282 
3283    for(i = 0; i < n; i++)
3284    {
3285       rperm[perm[i]] = i;
3286    }
3287 
3288    /* first we need to compute the ILU0 factorization of B */
3289 
3290    /* Copy diagonal matrix into a new place with permutation
3291     * That is, Apq = A(perm,qperm);
3292     */
3293    hypre_ParILURAPReorder(A, perm, rperm, &Apq);
3294 
3295    /* do the full ILU0 and modified ILU0 */
3296    hypre_ILUSetupRAPMILU0(Apq, &ALU, 0);
3297    hypre_ILUSetupRAPMILU0(Apq, &ALUm, 1);
3298 
3299    hypre_CSRMatrix *dB, *dS, *dE, *dF;
3300 
3301    /* get modified and extract LU factorization */
3302    Amd = hypre_ParCSRMatrixDiag(ALUm);
3303    Ad = hypre_ParCSRMatrixDiag(ALU);
3304    switch(test_opt)
3305    {
3306       case 1:
3307          {
3308             /* RAP where we save E and F */
3309             Apq_diag = hypre_ParCSRMatrixDiag(Apq);
3310             hypre_CSRMatrixSortRow(Apq_diag);
3311             hypre_ParILUCusparseILUExtractEBFC(Apq_diag, nLU, &dB, &dS, Eptr, Fptr);
3312             /* get modified ILU of B */
3313             hypre_ParILUCusparseILUExtractEBFC(Amd, nLU, BLUptr, &SLU, &dE, &dF);
3314             hypre_CSRMatrixDestroy(dB);
3315             hypre_CSRMatrixDestroy(dS);
3316             hypre_CSRMatrixDestroy(dE);
3317             hypre_CSRMatrixDestroy(dF);
3318          }
3319          break;
3320       case 2:
3321          {
3322             /* C-EB^{-1}F where we save EU^{-1}, L^{-1}F as sparse matrices */
3323             Apq_diag = hypre_ParCSRMatrixDiag(Apq);
3324             hypre_CSRMatrixSortRow(Apq_diag);
3325             hypre_ParILUCusparseILUExtractEBFC(Apq_diag, nLU, &dB, CLUptr, &dE, &dF);
3326             /* get modified ILU of B */
3327             hypre_ParILUCusparseILUExtractEBFC(Amd, nLU, BLUptr, &SLU, Eptr, Fptr);
3328             hypre_CSRMatrixDestroy(dB);
3329             hypre_CSRMatrixDestroy(dE);
3330             hypre_CSRMatrixDestroy(dF);
3331          }
3332          break;
3333       case 3:
3334          {
3335             /* C-EB^{-1}F where we save E and F */
3336             Apq_diag = hypre_ParCSRMatrixDiag(Apq);
3337             hypre_CSRMatrixSortRow(Apq_diag);
3338             hypre_ParILUCusparseILUExtractEBFC(Apq_diag, nLU, &dB, CLUptr, Eptr, Fptr);
3339             /* get modified ILU of B */
3340             hypre_ParILUCusparseILUExtractEBFC(Amd, nLU, BLUptr, &SLU, &dE, &dF);
3341             hypre_CSRMatrixDestroy(dB);
3342             hypre_CSRMatrixDestroy(dE);
3343             hypre_CSRMatrixDestroy(dF);
3344          }
3345          break;
3346       case 4:
3347          {
3348             /* RAP where we save EU^{-1}, L^{-1}F as sparse matrices */
3349             hypre_ParILUCusparseILUExtractEBFC(Ad, nLU, BLUptr, &SLU, Eptr, Fptr);
3350          }
3351          break;
3352       case 0: default:
3353          {
3354             /* RAP where we save EU^{-1}, L^{-1}F as sparse matrices */
3355             hypre_ParILUCusparseILUExtractEBFC(Amd, nLU, BLUptr, &SLU, Eptr, Fptr);
3356          }
3357          break;
3358    }
3359 
3360    *ALUptr = hypre_ParCSRMatrixDiag(ALU);
3361    /* Analysis of BILU */
3362    HYPRE_ILUSetupCusparseCSRILU0SetupSolve(*ALUptr, matL_des, matU_des,
3363                            ilu_solve_policy, &matAL_info, &matAU_info,
3364                            &buffer_size, &buffer);
3365 
3366    /* Analysis of BILU */
3367    HYPRE_ILUSetupCusparseCSRILU0SetupSolve(*BLUptr, matL_des, matU_des,
3368                            ilu_solve_policy, &matBL_info, &matBU_info,
3369                            &buffer_size, &buffer);
3370 
3371    /* Analysis of SILU */
3372    HYPRE_ILUSetupCusparseCSRILU0SetupSolve(SLU, matL_des, matU_des,
3373                            ilu_solve_policy, &matSL_info, &matSU_info,
3374                            &buffer_size, &buffer);
3375 
3376    /* start forming parCSR matrix S */
3377 
3378    HYPRE_BigInt   S_total_rows, *S_row_starts;
3379    HYPRE_BigInt   big_m = (HYPRE_BigInt)m;
3380    hypre_MPI_Allreduce( &big_m, &S_total_rows, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
3381 
3382    if (S_total_rows>0)
3383    {
3384       {
3385          HYPRE_BigInt global_start;
3386          S_row_starts = hypre_CTAlloc(HYPRE_BigInt,2,HYPRE_MEMORY_HOST);
3387          hypre_MPI_Scan( &big_m, &global_start, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
3388          S_row_starts[0] = global_start - m;
3389          S_row_starts[1] = global_start;
3390       }
3391 
3392       S_row_starts = hypre_CTAlloc(HYPRE_BigInt, 2, HYPRE_MEMORY_HOST);
3393       S_row_starts[1] = S_total_rows;
3394       S_row_starts[0] = S_total_rows - m;
3395       hypre_MPI_Allreduce(&m, &S_total_rows, 1, HYPRE_MPI_INT, hypre_MPI_SUM, comm);
3396       S = hypre_ParCSRMatrixCreate( hypre_ParCSRMatrixComm(A),
3397                            S_total_rows,
3398                            S_total_rows,
3399                            S_row_starts,
3400                            S_row_starts,
3401                            0,
3402                            0,
3403                            0);
3404 
3405       /* memory leak here */
3406       hypre_ParCSRMatrixDiag(S) = SLU;
3407 
3408       /* free memory */
3409       hypre_TFree(S_row_starts, HYPRE_MEMORY_HOST);
3410    }
3411 
3412    *matSptr       = S;
3413    *Apermptr      = Apq;
3414    *bufferp       = buffer;
3415    *matAL_infop   = matAL_info;
3416    *matAU_infop   = matAU_info;
3417    *matBL_infop   = matBL_info;
3418    *matBU_infop   = matBU_info;
3419    *matSL_infop   = matSL_info;
3420    *matSU_infop   = matSU_info;
3421 
3422    return hypre_error_flag;
3423 }
3424 
3425 #endif
3426 
3427 /* Modified ILU(0) with RAP like solve
3428  * A = input matrix
3429  * Not explicitly forming the matrix
3430  */
3431 HYPRE_Int
hypre_ILUSetupRAPILU0(hypre_ParCSRMatrix * A,HYPRE_Int * perm,HYPRE_Int n,HYPRE_Int nLU,hypre_ParCSRMatrix ** Lptr,HYPRE_Real ** Dptr,hypre_ParCSRMatrix ** Uptr,hypre_ParCSRMatrix ** mLptr,HYPRE_Real ** mDptr,hypre_ParCSRMatrix ** mUptr,HYPRE_Int ** u_end)3432 hypre_ILUSetupRAPILU0(hypre_ParCSRMatrix *A, HYPRE_Int *perm, HYPRE_Int n, HYPRE_Int nLU,
3433                            hypre_ParCSRMatrix **Lptr, HYPRE_Real **Dptr, hypre_ParCSRMatrix **Uptr,
3434                            hypre_ParCSRMatrix **mLptr, HYPRE_Real **mDptr, hypre_ParCSRMatrix **mUptr, HYPRE_Int **u_end)
3435 {
3436    HYPRE_Int            i;
3437    hypre_ParCSRMatrix   *S_temp = NULL;
3438    HYPRE_Int            *u_temp = NULL;
3439 
3440    /* standard ILU0 factorization */
3441    hypre_ILUSetupMILU0(A, perm, perm, n, n, Lptr, Dptr, Uptr, &S_temp, &u_temp, 0);
3442    if (S_temp)
3443    {
3444       hypre_ParCSRMatrixDestroy(S_temp);
3445    }
3446    if (u_temp)
3447    {
3448       hypre_Free( u_temp, HYPRE_MEMORY_HOST);
3449    }
3450    /* modified ILU0 factorization */
3451    hypre_ILUSetupMILU0(A, perm, perm, n, n, mLptr, mDptr, mUptr, &S_temp, &u_temp, 1);
3452    if (S_temp)
3453    {
3454       hypre_ParCSRMatrixDestroy(S_temp);
3455    }
3456    if (u_temp)
3457    {
3458       hypre_Free( u_temp, HYPRE_MEMORY_HOST);
3459    }
3460 
3461    /* pointer to the start location */
3462    HYPRE_Int *u_end_array;
3463    u_end_array = hypre_TAlloc(HYPRE_Int, n, HYPRE_MEMORY_HOST);
3464 
3465    hypre_CSRMatrix   *U_diag = hypre_ParCSRMatrixDiag(*Uptr);
3466    HYPRE_Int         *U_diag_i = hypre_CSRMatrixI(U_diag);
3467    HYPRE_Int         *U_diag_j = hypre_CSRMatrixJ(U_diag);
3468    HYPRE_Real        *U_diag_data = hypre_CSRMatrixData(U_diag);
3469    hypre_CSRMatrix   *mU_diag = hypre_ParCSRMatrixDiag(*mUptr);
3470    HYPRE_Int         *mU_diag_i = hypre_CSRMatrixI(mU_diag);
3471    HYPRE_Int         *mU_diag_j = hypre_CSRMatrixJ(mU_diag);
3472    HYPRE_Real        *mU_diag_data = hypre_CSRMatrixData(mU_diag);
3473 
3474    // first sort the Upper part U
3475    for (i = 0; i < nLU; i++)
3476    {
3477       hypre_qsort1(U_diag_j,U_diag_data,U_diag_i[i],U_diag_i[i+1]-1);
3478       hypre_qsort1(mU_diag_j,mU_diag_data,mU_diag_i[i],mU_diag_i[i+1]-1);
3479       hypre_BinarySearch2(U_diag_j,nLU,U_diag_i[i],U_diag_i[i+1]-1,u_end_array + i);
3480    }
3481 
3482    hypre_CSRMatrix   *L_diag = hypre_ParCSRMatrixDiag(*Lptr);
3483    HYPRE_Int         *L_diag_i = hypre_CSRMatrixI(L_diag);
3484    HYPRE_Int         *L_diag_j = hypre_CSRMatrixJ(L_diag);
3485    HYPRE_Real        *L_diag_data = hypre_CSRMatrixData(L_diag);
3486    hypre_CSRMatrix   *mL_diag = hypre_ParCSRMatrixDiag(*mLptr);
3487    HYPRE_Int         *mL_diag_i = hypre_CSRMatrixI(mL_diag);
3488    HYPRE_Int         *mL_diag_j = hypre_CSRMatrixJ(mL_diag);
3489    HYPRE_Real        *mL_diag_data = hypre_CSRMatrixData(mL_diag);
3490 
3491    // now sort the Lower part L
3492    for (i = nLU; i < n; i++)
3493    {
3494       hypre_qsort1(L_diag_j,L_diag_data,L_diag_i[i],L_diag_i[i+1]-1);
3495       hypre_qsort1(mL_diag_j,mL_diag_data,mL_diag_i[i],mL_diag_i[i+1]-1);
3496       hypre_BinarySearch2(L_diag_j, nLU, L_diag_i[i], L_diag_i[i+1]-1, u_end_array + i);
3497    }
3498 
3499    *u_end = u_end_array;
3500 
3501    return hypre_error_flag;
3502 }
3503 
3504 /* ILU(0)
3505  * A = input matrix
3506  * perm = permutation array indicating ordering of rows. Perm could come from a
3507  *    CF_marker array or a reordering routine. When set to NULL, indentity permutation is used.
3508  * qperm = permutation array indicating ordering of columns. When set to NULL, indentity permutation is used.
3509  * nI = number of interial unknowns
3510  * nLU = size of incomplete factorization, nLU should obey nLU <= nI.
3511  *    Schur complement is formed if nLU < n
3512  * Lptr, Dptr, Uptr, Sptr = L, D, U, S factors.
3513  * will form global Schur Matrix if nLU < n
3514  */
3515 HYPRE_Int
hypre_ILUSetupILU0(hypre_ParCSRMatrix * A,HYPRE_Int * perm,HYPRE_Int * qperm,HYPRE_Int nLU,HYPRE_Int nI,hypre_ParCSRMatrix ** Lptr,HYPRE_Real ** Dptr,hypre_ParCSRMatrix ** Uptr,hypre_ParCSRMatrix ** Sptr,HYPRE_Int ** u_end)3516 hypre_ILUSetupILU0(hypre_ParCSRMatrix *A, HYPRE_Int *perm, HYPRE_Int *qperm, HYPRE_Int nLU, HYPRE_Int nI,
3517       hypre_ParCSRMatrix **Lptr, HYPRE_Real** Dptr, hypre_ParCSRMatrix **Uptr, hypre_ParCSRMatrix **Sptr, HYPRE_Int **u_end)
3518 {
3519    return hypre_ILUSetupMILU0( A, perm, qperm, nLU, nI, Lptr, Dptr, Uptr, Sptr, u_end, 0);
3520 }
3521 
3522 /* (modified) ILU(0)
3523  * A = input matrix
3524  * perm = permutation array indicating ordering of rows. Perm could come from a
3525  *    CF_marker array or a reordering routine. When set to NULL, indentity permutation is used.
3526  * qperm = permutation array indicating ordering of columns When set to NULL, identity permutation is used.
3527  * nI = number of interior unknowns
3528  * nLU = size of incomplete factorization, nLU should obey nLU <= nI.
3529  *    Schur complement is formed if nLU < n
3530  * Lptr, Dptr, Uptr, Sptr = L, D, U, S factors.
3531  * modified set to 0 to use classical ILU
3532  * will form global Schur Matrix if nLU < n
3533  */
3534 HYPRE_Int
hypre_ILUSetupMILU0(hypre_ParCSRMatrix * A,HYPRE_Int * permp,HYPRE_Int * qpermp,HYPRE_Int nLU,HYPRE_Int nI,hypre_ParCSRMatrix ** Lptr,HYPRE_Real ** Dptr,hypre_ParCSRMatrix ** Uptr,hypre_ParCSRMatrix ** Sptr,HYPRE_Int ** u_end,HYPRE_Int modified)3535 hypre_ILUSetupMILU0(hypre_ParCSRMatrix *A, HYPRE_Int *permp, HYPRE_Int *qpermp, HYPRE_Int nLU, HYPRE_Int nI,
3536       hypre_ParCSRMatrix **Lptr, HYPRE_Real** Dptr, hypre_ParCSRMatrix **Uptr, hypre_ParCSRMatrix **Sptr, HYPRE_Int **u_end,
3537       HYPRE_Int modified)
3538 {
3539    HYPRE_Int                i, ii, j, k, k1, k2, k3, ctrU, ctrL, ctrS, lenl, lenu, jpiv, col, jpos;
3540    HYPRE_Int                *iw, *iL, *iU;
3541    HYPRE_Real               dd, t, dpiv, lxu, *wU, *wL;
3542    HYPRE_Real               drop;
3543 
3544    /* communication stuffs for S */
3545    MPI_Comm                 comm             = hypre_ParCSRMatrixComm(A);
3546    HYPRE_Int                S_offd_nnz, S_offd_ncols;
3547    hypre_ParCSRCommPkg      *comm_pkg;
3548    hypre_ParCSRCommHandle   *comm_handle;
3549    HYPRE_Int                num_sends, begin, end;
3550    HYPRE_BigInt                *send_buf        = NULL;
3551    HYPRE_Int                num_procs, my_id;
3552 
3553    /* data objects for A */
3554    hypre_CSRMatrix          *A_diag          = hypre_ParCSRMatrixDiag(A);
3555    hypre_CSRMatrix          *A_offd          = hypre_ParCSRMatrixOffd(A);
3556    HYPRE_Real               *A_diag_data     = hypre_CSRMatrixData(A_diag);
3557    HYPRE_Int                *A_diag_i        = hypre_CSRMatrixI(A_diag);
3558    HYPRE_Int                *A_diag_j        = hypre_CSRMatrixJ(A_diag);
3559    HYPRE_Real               *A_offd_data     = hypre_CSRMatrixData(A_offd);
3560    HYPRE_Int                *A_offd_i        = hypre_CSRMatrixI(A_offd);
3561    HYPRE_Int                *A_offd_j        = hypre_CSRMatrixJ(A_offd);
3562 
3563    /* size of problem and schur system */
3564    HYPRE_Int                n                = hypre_CSRMatrixNumRows(A_diag);
3565    HYPRE_Int                m                = n - nLU;
3566    HYPRE_Int                e                = nI - nLU;
3567    HYPRE_Int                m_e              = n - nI;
3568    HYPRE_Real               local_nnz, total_nnz;
3569    HYPRE_Int                *u_end_array;
3570 
3571    /* data objects for L, D, U */
3572    hypre_ParCSRMatrix       *matL;
3573    hypre_ParCSRMatrix       *matU;
3574    hypre_CSRMatrix          *L_diag;
3575    hypre_CSRMatrix          *U_diag;
3576    HYPRE_Real               *D_data;
3577    HYPRE_Real               *L_diag_data;
3578    HYPRE_Int                *L_diag_i;
3579    HYPRE_Int                *L_diag_j;
3580    HYPRE_Real               *U_diag_data;
3581    HYPRE_Int                *U_diag_i;
3582    HYPRE_Int                *U_diag_j;
3583 
3584    /* data objects for S */
3585    hypre_ParCSRMatrix       *matS = NULL;
3586    hypre_CSRMatrix          *S_diag;
3587    hypre_CSRMatrix          *S_offd;
3588    HYPRE_Real               *S_diag_data     = NULL;
3589    HYPRE_Int                *S_diag_i        = NULL;
3590    HYPRE_Int                *S_diag_j        = NULL;
3591    HYPRE_Int                *S_offd_i        = NULL;
3592    HYPRE_Int                *S_offd_j        = NULL;
3593    HYPRE_BigInt             *S_offd_colmap   = NULL;
3594    HYPRE_Real               *S_offd_data;
3595    HYPRE_BigInt             col_starts[2];
3596    HYPRE_BigInt             total_rows;
3597 
3598    /* memory management */
3599    HYPRE_Int                initial_alloc    = 0;
3600    HYPRE_Int                capacity_L;
3601    HYPRE_Int                capacity_U;
3602    HYPRE_Int                capacity_S       = 0;
3603    HYPRE_Int                nnz_A            = A_diag_i[n];
3604 
3605    /* reverse permutation array */
3606    HYPRE_Int                *rperm;
3607    HYPRE_Int                *perm, *qperm;
3608 
3609    /* start setup
3610     * get communication stuffs first
3611     */
3612    hypre_MPI_Comm_size(comm,&num_procs);
3613    hypre_MPI_Comm_rank(comm,&my_id);
3614    comm_pkg = hypre_ParCSRMatrixCommPkg(A);
3615    /* setup if not yet built */
3616    if (!comm_pkg)
3617    {
3618       hypre_MatvecCommPkgCreate(A);
3619       comm_pkg = hypre_ParCSRMatrixCommPkg(A);
3620    }
3621 
3622    /* check for correctness */
3623    if (nLU < 0 || nLU > n)
3624    {
3625       hypre_error_w_msg(HYPRE_ERROR_ARG,"WARNING: nLU out of range.\n");
3626    }
3627    if (e < 0)
3628    {
3629       hypre_error_w_msg(HYPRE_ERROR_ARG,"WARNING: nLU should not exceed nI.\n");
3630    }
3631 
3632    /* Allocate memory for u_end array */
3633    u_end_array    = hypre_TAlloc(HYPRE_Int, nLU, HYPRE_MEMORY_HOST);
3634 
3635    /* Allocate memory for L,D,U,S factors */
3636    if (n > 0)
3637    {
3638       initial_alloc  = nLU + ceil((nnz_A / 2.0)*nLU/n);
3639       capacity_S     = m + ceil((nnz_A / 2.0)*m/n);
3640    }
3641    capacity_L     = initial_alloc;
3642    capacity_U     = initial_alloc;
3643 
3644    D_data         = hypre_TAlloc(HYPRE_Real, n, HYPRE_MEMORY_DEVICE);
3645    L_diag_i       = hypre_TAlloc(HYPRE_Int, n+1, HYPRE_MEMORY_DEVICE);
3646    L_diag_j       = hypre_TAlloc(HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
3647    L_diag_data    = hypre_TAlloc(HYPRE_Real, capacity_L, HYPRE_MEMORY_DEVICE);
3648    U_diag_i       = hypre_TAlloc(HYPRE_Int, n+1, HYPRE_MEMORY_DEVICE);
3649    U_diag_j       = hypre_TAlloc(HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
3650    U_diag_data    = hypre_TAlloc(HYPRE_Real, capacity_U, HYPRE_MEMORY_DEVICE);
3651    S_diag_i       = hypre_TAlloc(HYPRE_Int, m+1, HYPRE_MEMORY_DEVICE);
3652    S_diag_j       = hypre_TAlloc(HYPRE_Int, capacity_S, HYPRE_MEMORY_DEVICE);
3653    S_diag_data    = hypre_TAlloc(HYPRE_Real, capacity_S, HYPRE_MEMORY_DEVICE);
3654 
3655    /* allocate working arrays */
3656    iw             = hypre_TAlloc(HYPRE_Int, 3*n, HYPRE_MEMORY_HOST);
3657    iL             = iw+n;
3658    rperm          = iw + 2*n;
3659    wL             = hypre_TAlloc(HYPRE_Real, n, HYPRE_MEMORY_HOST);
3660 
3661    ctrU        = ctrL        = ctrS        = 0;
3662    L_diag_i[0] = U_diag_i[0] = S_diag_i[0] = 0;
3663    /* set marker array iw to -1 */
3664    for (i = 0; i < n; i++)
3665    {
3666       iw[i] = -1;
3667    }
3668 
3669    /* get reverse permutation (rperm).
3670     * create permutation if they are null
3671     * rperm holds the reordered indexes.
3672     * rperm only used for column
3673     */
3674 
3675    if (!permp)
3676    {
3677       perm = hypre_TAlloc(HYPRE_Int, n, HYPRE_MEMORY_DEVICE);
3678       for (i = 0; i < n; i++)
3679       {
3680          perm[i] = i;
3681       }
3682    }
3683    else
3684    {
3685       perm = permp;
3686    }
3687 
3688    if (!qpermp)
3689    {
3690       qperm = hypre_TAlloc(HYPRE_Int, n, HYPRE_MEMORY_DEVICE);
3691       for (i = 0; i < n; i++)
3692       {
3693          qperm[i] = i;
3694       }
3695    }
3696    else
3697    {
3698       qperm = qpermp;
3699    }
3700 
3701    for (i = 0; i < n; i++)
3702    {
3703       rperm[qperm[i]] = i;
3704    }
3705 
3706    /*---------  Begin Factorization. Work in permuted space  ----*/
3707    for (ii = 0; ii < nLU; ii++)
3708    {
3709       // get row i
3710       i = perm[ii];
3711       // get extents of row i
3712       k1=A_diag_i[i];
3713       k2=A_diag_i[i+1];
3714       // track the drop
3715       drop = 0.0;
3716 
3717       /*-------------------- unpack L & U-parts of row of A in arrays w */
3718       iU = iL+ii;
3719       wU = wL+ii;
3720       /*--------------------  diagonal entry */
3721       dd = 0.0;
3722       lenl  = lenu = 0;
3723       iw[ii] = ii;
3724       /*-------------------- scan & unwrap column */
3725       for (j = k1; j < k2; j++)
3726       {
3727          col = rperm[A_diag_j[j]];
3728          t = A_diag_data[j];
3729          if ( col < ii )
3730          {
3731             iw[col] = lenl;
3732             iL[lenl] = col;
3733             wL[lenl++] = t;
3734          }
3735          else if (col > ii)
3736          {
3737             iw[col] = lenu;
3738             iU[lenu] = col;
3739             wU[lenu++] = t;
3740          }
3741          else
3742          {
3743             dd=t;
3744          }
3745       }
3746 
3747       /* eliminate row */
3748       /*-------------------------------------------------------------------------
3749        *  In order to do the elimination in the correct order we must select the
3750        *  smallest column index among iL[k], k = j, j+1, ..., lenl-1. For ILU(0),
3751        *  no new fill-ins are expect, so we can pre-sort iL and wL prior to the
3752        *  entering the elimination loop.
3753        *-----------------------------------------------------------------------*/
3754       //      hypre_quickSortIR(iL, wL, iw, 0, (lenl-1));
3755       hypre_qsort3ir(iL, wL, iw, 0, (lenl-1));
3756       for (j = 0; j < lenl; j++)
3757       {
3758          jpiv = iL[j];
3759          /* get factor/ pivot element */
3760          dpiv = wL[j] * D_data[jpiv];
3761          /* store entry in L */
3762          wL[j] = dpiv;
3763 
3764          /* zero out element - reset pivot */
3765          iw[jpiv] = -1;
3766          /* combine current row and pivot row */
3767          for (k = U_diag_i[jpiv]; k < U_diag_i[jpiv+1]; k++)
3768          {
3769             col = U_diag_j[k];
3770             jpos = iw[col];
3771 
3772             /* Only fill-in nonzero pattern (jpos != 0) */
3773             if (jpos < 0)
3774             {
3775                drop = drop - U_diag_data[k] * dpiv;
3776                continue;
3777             }
3778 
3779             lxu = - U_diag_data[k] * dpiv;
3780             if (col < ii)
3781             {
3782                /* dealing with L part */
3783                wL[jpos] += lxu;
3784             }
3785             else if (col > ii)
3786             {
3787                /* dealing with U part */
3788                wU[jpos] += lxu;
3789             }
3790             else
3791             {
3792                /* diagonal update */
3793                dd += lxu;
3794             }
3795          }
3796       }
3797       /* modify when necessary */
3798       if (modified)
3799       {
3800          dd = dd + drop;
3801       }
3802 
3803       /* restore iw (only need to restore diagonal and U part */
3804       iw[ii] = -1;
3805       for (j = 0; j < lenu; j++)
3806       {
3807          iw[iU[j]] = -1;
3808       }
3809 
3810       /* Update LDU factors */
3811       /* L part */
3812       /* Check that memory is sufficient */
3813       if (lenl > 0)
3814       {
3815          while ((ctrL+lenl) > capacity_L)
3816          {
3817             HYPRE_Int tmp = capacity_L;
3818             capacity_L = capacity_L * EXPAND_FACT + 1;
3819             L_diag_j = hypre_TReAlloc_v2(L_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
3820             L_diag_data = hypre_TReAlloc_v2(L_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_L, HYPRE_MEMORY_DEVICE);
3821          }
3822          //hypre_TMemcpy(&(L_diag_j)[ctrL], iL, HYPRE_Int, lenl, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
3823          //hypre_TMemcpy(&(L_diag_data)[ctrL], wL, HYPRE_Real, lenl, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
3824          hypre_TMemcpy(&L_diag_j[ctrL], iL, HYPRE_Int, lenl,
3825                        HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
3826          hypre_TMemcpy(&L_diag_data[ctrL], wL, HYPRE_Real, lenl,
3827                        HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
3828       }
3829       L_diag_i[ii+1] = (ctrL+=lenl);
3830 
3831       /* diagonal part (we store the inverse) */
3832       if (fabs(dd) < MAT_TOL)
3833       {
3834          dd = 1.0e-6;
3835       }
3836       D_data[ii] = 1./dd;
3837 
3838       /* U part */
3839       /* Check that memory is sufficient */
3840       if (lenu > 0)
3841       {
3842          while ((ctrU+lenu) > capacity_U)
3843          {
3844             HYPRE_Int tmp = capacity_U;
3845             capacity_U = capacity_U * EXPAND_FACT + 1;
3846             U_diag_j = hypre_TReAlloc_v2(U_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
3847             U_diag_data = hypre_TReAlloc_v2(U_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_U, HYPRE_MEMORY_DEVICE);
3848          }
3849          //hypre_TMemcpy(&(U_diag_j)[ctrU], iU, HYPRE_Int, lenu, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
3850          //hypre_TMemcpy(&(U_diag_data)[ctrU], wU, HYPRE_Real, lenu, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
3851          hypre_TMemcpy(&U_diag_j[ctrU], iU, HYPRE_Int, lenu,
3852                        HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
3853          hypre_TMemcpy(&U_diag_data[ctrU], wU, HYPRE_Real, lenu,
3854                        HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
3855       }
3856       U_diag_i[ii+1] = (ctrU+=lenu);
3857 
3858       /* check and build u_end array */
3859       if (m > 0)
3860       {
3861          hypre_qsort1(U_diag_j,U_diag_data,U_diag_i[ii],U_diag_i[ii+1]-1);
3862          hypre_BinarySearch2(U_diag_j,nLU,U_diag_i[ii],U_diag_i[ii+1]-1,u_end_array + ii);
3863       }
3864       else
3865       {
3866          /* Everything is in U */
3867          u_end_array[ii] = ctrU;
3868       }
3869 
3870    }
3871 
3872    /*---------  Begin Factorization in Schur Complement part  ----*/
3873    for (ii = nLU; ii < n; ii++)
3874    {
3875       // get row i
3876       i = perm[ii];
3877       // get extents of row i
3878       k1=A_diag_i[i];
3879       k2=A_diag_i[i+1];
3880       drop = 0.0;
3881 
3882       /*-------------------- unpack L & U-parts of row of A in arrays w */
3883       iU = iL+nLU + 1;
3884       wU = wL+nLU + 1;
3885       /*--------------------  diagonal entry */
3886       dd = 0.0;
3887       lenl  = lenu = 0;
3888       iw[ii] = nLU;
3889       /*-------------------- scan & unwrap column */
3890       for (j = k1; j < k2; j++)
3891       {
3892          col = rperm[A_diag_j[j]];
3893          t = A_diag_data[j];
3894          if ( col < nLU )
3895          {
3896             iw[col] = lenl;
3897             iL[lenl] = col;
3898             wL[lenl++] = t;
3899          }
3900          else if (col != ii)
3901          {
3902             iw[col] = lenu;
3903             iU[lenu] = col;
3904             wU[lenu++] = t;
3905          }
3906          else
3907          {
3908             dd=t;
3909          }
3910       }
3911 
3912       /* eliminate row */
3913       /*-------------------------------------------------------------------------
3914        *  In order to do the elimination in the correct order we must select the
3915        *  smallest column index among iL[k], k = j, j+1, ..., lenl-1. For ILU(0),
3916        *  no new fill-ins are expect, so we can pre-sort iL and wL prior to the
3917        *  entering the elimination loop.
3918        *-----------------------------------------------------------------------*/
3919       //      hypre_quickSortIR(iL, wL, iw, 0, (lenl-1));
3920       hypre_qsort3ir(iL, wL, iw, 0, (lenl-1));
3921       for (j = 0; j < lenl; j++)
3922       {
3923          jpiv = iL[j];
3924          /* get factor/ pivot element */
3925          dpiv = wL[j] * D_data[jpiv];
3926          /* store entry in L */
3927          wL[j] = dpiv;
3928 
3929          /* zero out element - reset pivot */
3930          iw[jpiv] = -1;
3931          /* combine current row and pivot row */
3932          for (k = U_diag_i[jpiv]; k < U_diag_i[jpiv+1]; k++)
3933          {
3934             col = U_diag_j[k];
3935             jpos = iw[col];
3936 
3937             /* Only fill-in nonzero pattern (jpos != 0) */
3938             if (jpos < 0)
3939             {
3940                drop = drop - U_diag_data[k] * dpiv;
3941                continue;
3942             }
3943 
3944             lxu = - U_diag_data[k] * dpiv;
3945             if (col < nLU)
3946             {
3947                /* dealing with L part */
3948                wL[jpos] += lxu;
3949             }
3950             else if (col != ii)
3951             {
3952                /* dealing with U part */
3953                wU[jpos] += lxu;
3954             }
3955             else
3956             {
3957                /* diagonal update */
3958                dd += lxu;
3959             }
3960          }
3961       }
3962       if (modified)
3963       {
3964          dd = dd + drop;
3965       }
3966       /* restore iw (only need to restore diagonal and U part */
3967       iw[ii] = -1;
3968       for (j = 0; j < lenu; j++)
3969       {
3970          iw[iU[j]] = -1;
3971       }
3972 
3973       /* Update LDU factors */
3974       /* L part */
3975       /* Check that memory is sufficient */
3976       if (lenl > 0)
3977       {
3978          while ((ctrL+lenl) > capacity_L)
3979          {
3980             HYPRE_Int tmp = capacity_L;
3981             capacity_L = capacity_L * EXPAND_FACT + 1;
3982             L_diag_j = hypre_TReAlloc_v2(L_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
3983             L_diag_data = hypre_TReAlloc_v2(L_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_L, HYPRE_MEMORY_DEVICE);
3984          }
3985          //hypre_TMemcpy(&(L_diag_j)[ctrL], iL, HYPRE_Int, lenl, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
3986          //hypre_TMemcpy(&(L_diag_data)[ctrL], wL, HYPRE_Real, lenl, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
3987          hypre_TMemcpy(&L_diag_j[ctrL], iL, HYPRE_Int, lenl,
3988                        HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
3989          hypre_TMemcpy(&L_diag_data[ctrL], wL, HYPRE_Real, lenl,
3990                        HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
3991       }
3992       L_diag_i[ii+1] = (ctrL+=lenl);
3993 
3994       /* S part */
3995       /* Check that memory is sufficient */
3996       while ((ctrS+lenu+1) > capacity_S)
3997       {
3998          HYPRE_Int tmp = capacity_S;
3999          capacity_S = capacity_S * EXPAND_FACT + 1;
4000          S_diag_j = hypre_TReAlloc_v2(S_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_S, HYPRE_MEMORY_DEVICE);
4001          S_diag_data = hypre_TReAlloc_v2(S_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_S, HYPRE_MEMORY_DEVICE);
4002       }
4003       /* remember S in under a new index system! */
4004       S_diag_j[ctrS] = ii - nLU;
4005       S_diag_data[ctrS] = dd;
4006       for (j = 0; j < lenu; j++)
4007       {
4008          S_diag_j[ctrS+1+j] = iU[j] - nLU;
4009       }
4010       //hypre_TMemcpy(S_diag_data+ctrS+1, wU, HYPRE_Real, lenu, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
4011       hypre_TMemcpy(S_diag_data+ctrS+1, wU, HYPRE_Real, lenu,
4012                     HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
4013       S_diag_i[ii-nLU+1] = ctrS+=(lenu+1);
4014    }
4015    /* Assemble LDUS matrices */
4016    /* zero out unfactored rows for U and D */
4017    for (k = nLU; k < n; k++)
4018    {
4019       U_diag_i[k+1] = ctrU;
4020       D_data[k] = 1.;
4021    }
4022 
4023    /* First create Schur complement if necessary
4024     * Check if we need to create Schur complement
4025     */
4026    HYPRE_BigInt big_m = (HYPRE_BigInt)m;
4027    hypre_MPI_Allreduce(&big_m, &total_rows, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
4028    /* only form when total_rows > 0 */
4029    if ( total_rows > 0 )
4030    {
4031       /* now create S */
4032       /* need to get new column start */
4033       {
4034          HYPRE_BigInt global_start;
4035          hypre_MPI_Scan( &big_m, &global_start, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
4036          col_starts[0] = global_start - m;
4037          col_starts[1] = global_start;
4038       }
4039 
4040       /* We did nothing to A_offd, so all the data kept, just reorder them
4041        * The create function takes comm, global num rows/cols,
4042        *    row/col start, num cols offd, nnz diag, nnz offd
4043        */
4044       S_offd_nnz = hypre_CSRMatrixNumNonzeros(A_offd);
4045       S_offd_ncols = hypre_CSRMatrixNumCols(A_offd);
4046 
4047       matS = hypre_ParCSRMatrixCreate( comm,
4048             total_rows,
4049             total_rows,
4050             col_starts,
4051             col_starts,
4052             S_offd_ncols,
4053             ctrS,
4054             S_offd_nnz);
4055 
4056       /* first put diagonal data in */
4057       S_diag = hypre_ParCSRMatrixDiag(matS);
4058 
4059       hypre_CSRMatrixI(S_diag) = S_diag_i;
4060       hypre_CSRMatrixData(S_diag) = S_diag_data;
4061       hypre_CSRMatrixJ(S_diag) = S_diag_j;
4062 
4063       /* now start to construct offdiag of S */
4064       S_offd = hypre_ParCSRMatrixOffd(matS);
4065       S_offd_i = hypre_TAlloc(HYPRE_Int, m+1, HYPRE_MEMORY_DEVICE);
4066       S_offd_j = hypre_TAlloc(HYPRE_Int, S_offd_nnz, HYPRE_MEMORY_DEVICE);
4067       S_offd_data = hypre_TAlloc(HYPRE_Real, S_offd_nnz, HYPRE_MEMORY_DEVICE);
4068       S_offd_colmap = hypre_CTAlloc(HYPRE_BigInt, S_offd_ncols, HYPRE_MEMORY_HOST);
4069 
4070       /* simply use a loop to copy data from A_offd */
4071       S_offd_i[0] = 0;
4072       k3 = 0;
4073       for (i = 1; i <= e; i++)
4074       {
4075          S_offd_i[i] = k3;
4076       }
4077       for (i = 0; i < m_e; i++)
4078       {
4079          col = perm[i + nI];
4080          k1 = A_offd_i[col];
4081          k2 = A_offd_i[col+1];
4082          for (j = k1; j < k2; j++)
4083          {
4084             S_offd_j[k3] = A_offd_j[j];
4085             S_offd_data[k3++] = A_offd_data[j];
4086          }
4087          S_offd_i[i+1+e] = k3;
4088       }
4089 
4090       /* give I, J, DATA to S_offd */
4091       hypre_CSRMatrixI(S_offd) = S_offd_i;
4092       hypre_CSRMatrixJ(S_offd) = S_offd_j;
4093       hypre_CSRMatrixData(S_offd) = S_offd_data;
4094 
4095       /* now we need to update S_offd_colmap */
4096 
4097       /* get total num of send */
4098       num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
4099       begin = hypre_ParCSRCommPkgSendMapStart(comm_pkg,0);
4100       end = hypre_ParCSRCommPkgSendMapStart(comm_pkg,num_sends);
4101       send_buf = hypre_TAlloc(HYPRE_BigInt, end - begin, HYPRE_MEMORY_HOST);
4102       /* copy new index into send_buf */
4103       for (i = begin; i < end; i++)
4104       {
4105          send_buf[i-begin] = rperm[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,i)] - nLU + col_starts[0];
4106       }
4107       /* main communication */
4108       comm_handle = hypre_ParCSRCommHandleCreate(21, comm_pkg, send_buf, S_offd_colmap);
4109       hypre_ParCSRCommHandleDestroy(comm_handle);
4110 
4111       /* setup index */
4112       hypre_ParCSRMatrixColMapOffd(matS) = S_offd_colmap;
4113 
4114       hypre_ILUSortOffdColmap(matS);
4115 
4116       /* free */
4117       hypre_TFree(send_buf, HYPRE_MEMORY_HOST);
4118    } /* end of forming S */
4119 
4120    /* create S finished */
4121 
4122    matL = hypre_ParCSRMatrixCreate( comm,
4123          hypre_ParCSRMatrixGlobalNumRows(A),
4124          hypre_ParCSRMatrixGlobalNumRows(A),
4125          hypre_ParCSRMatrixRowStarts(A),
4126          hypre_ParCSRMatrixColStarts(A),
4127          0,
4128          ctrL,
4129          0 );
4130 
4131    L_diag = hypre_ParCSRMatrixDiag(matL);
4132    hypre_CSRMatrixI(L_diag) = L_diag_i;
4133    if (ctrL)
4134    {
4135       hypre_CSRMatrixData(L_diag) = L_diag_data;
4136       hypre_CSRMatrixJ(L_diag) = L_diag_j;
4137    }
4138    else
4139    {
4140       /* we've allocated some memory, so free if not used */
4141       hypre_TFree(L_diag_j,HYPRE_MEMORY_DEVICE);
4142       hypre_TFree(L_diag_data,HYPRE_MEMORY_DEVICE);
4143    }
4144    /* store (global) total number of nonzeros */
4145    local_nnz = (HYPRE_Real) ctrL;
4146    hypre_MPI_Allreduce(&local_nnz, &total_nnz, 1, HYPRE_MPI_REAL, hypre_MPI_SUM, comm);
4147    hypre_ParCSRMatrixDNumNonzeros(matL) = total_nnz;
4148 
4149    matU = hypre_ParCSRMatrixCreate( comm,
4150          hypre_ParCSRMatrixGlobalNumRows(A),
4151          hypre_ParCSRMatrixGlobalNumRows(A),
4152          hypre_ParCSRMatrixRowStarts(A),
4153          hypre_ParCSRMatrixColStarts(A),
4154          0,
4155          ctrU,
4156          0 );
4157 
4158    U_diag = hypre_ParCSRMatrixDiag(matU);
4159    hypre_CSRMatrixI(U_diag) = U_diag_i;
4160    if (ctrU)
4161    {
4162       hypre_CSRMatrixData(U_diag) = U_diag_data;
4163       hypre_CSRMatrixJ(U_diag) = U_diag_j;
4164    }
4165    else
4166    {
4167       /* we've allocated some memory, so free if not used */
4168       hypre_TFree(U_diag_j,HYPRE_MEMORY_DEVICE);
4169       hypre_TFree(U_diag_data,HYPRE_MEMORY_DEVICE);
4170    }
4171    /* store (global) total number of nonzeros */
4172    local_nnz = (HYPRE_Real) ctrU;
4173    hypre_MPI_Allreduce(&local_nnz, &total_nnz, 1, HYPRE_MPI_REAL, hypre_MPI_SUM, comm);
4174    hypre_ParCSRMatrixDNumNonzeros(matU) = total_nnz;
4175    /* free memory */
4176    hypre_TFree(wL,HYPRE_MEMORY_HOST);
4177    hypre_TFree(iw,HYPRE_MEMORY_HOST);
4178    if (!matS)
4179    {
4180       /* we allocate some memory for S, need to free if unused */
4181       hypre_TFree(S_diag_i,HYPRE_MEMORY_DEVICE);
4182    }
4183 
4184    if (!permp)
4185    {
4186       hypre_TFree(perm, HYPRE_MEMORY_DEVICE);
4187    }
4188    if (!qpermp)
4189    {
4190       hypre_TFree(qperm, HYPRE_MEMORY_DEVICE);
4191    }
4192 
4193    /* set matrix pointers */
4194    *Lptr = matL;
4195    *Dptr = D_data;
4196    *Uptr = matU;
4197    *Sptr = matS;
4198    *u_end = u_end_array;
4199 
4200    return hypre_error_flag;
4201 }
4202 
/* ILU(k) symbolic factorization
 *
 * Computes the sparsity patterns (row pointers and column indices only, no
 * values) of the L and U factors for the first nLU rows of the permuted
 * matrix and, when nLU < n, of L and the Schur complement S for the
 * remaining rows. A fill-in entry is kept only if its level is <= lfil.
 * All column indices produced here are in the NEW ordering (after rperm).
 *
 * n = total rows of input
 * A_diag_i/j = the I and J slots of the (local) diagonal block of A;
 *    A_diag_i[n] is read as its number of nonzeros
 * lfil = level of fill-in, the k in ILU(k)
 * perm = permutation array indicating ordering of factorization. Perm could come from a
 *    CF_marker array or a reordering routine. perm[ii] is the original row
 *    processed as the ii-th row of the factorization.
 * rperm = reverse permutation array, used here to avoid duplicate memory allocation.
 *    rperm[c] is the new index of original column c.
 * iw = working array, used here to avoid duplicate memory allocation.
 *    At least 3*n entries are used: iw[0..n) is a column -> slot lookup
 *    (set to -1 on entry, and every entry set for a row is reset to -1
 *    before the next row), iw[n..2n) is used as iL and iw[2n..3n) as iLev.
 * nLU = size of computed LDU factorization.
 * L/U/S_diag_i = the I slot of L, U and S (output). Entries ii+1 are derived
 *    from entry ii while the J arrays are filled starting at offset 0, so the
 *    caller is expected to have set L_diag_i[0] = U_diag_i[0] = 0.
 *    S_diag_i[0] is not written here.
 * L/U/S_diag_j = the J slot of L, U and S (output). The arrays are allocated
 *    in this routine and handed to the caller; *S_diag_j stays NULL if
 *    n - nLU <= 0. L and U hold strictly lower / strictly upper entries (no
 *    diagonal); each S row starts with its diagonal entry. S column indices
 *    are NOT shifted by nLU here.
 * u_end = output array of length nLU. If nLU < n, u_end[ii] is the spot
 *    reported by hypre_BinarySearch2 when searching for nLU in the sorted
 *    row ii of U (presumably the first position whose column is >= nLU --
 *    TODO confirm against hypre_BinarySearch2); otherwise it is the end of
 *    row ii in U.
 * will form global Schur Matrix if nLU < n
 *
 * Returns hypre_error_flag.
 */
HYPRE_Int
hypre_ILUSetupILUKSymbolic(HYPRE_Int n, HYPRE_Int *A_diag_i, HYPRE_Int *A_diag_j, HYPRE_Int lfil, HYPRE_Int *perm,
      HYPRE_Int *rperm,   HYPRE_Int *iw,   HYPRE_Int nLU, HYPRE_Int *L_diag_i, HYPRE_Int *U_diag_i,
      HYPRE_Int *S_diag_i, HYPRE_Int **L_diag_j, HYPRE_Int **U_diag_j, HYPRE_Int **S_diag_j, HYPRE_Int **u_end)
{
   /*
    * 1: Setup and create buffers
    * temp_L/U/S_diag_j: growable column index buffers for L, U and S
    * u_levels: fill level of every entry stored in temp_U_diag_j
    * ii: outer loop index, the row number in the new ordering
    * i: the original row number (perm[ii]) inside the outer loop
    * iw: working array, maps a (new) column index to its slot in iL/iLev,
    *     or -1 if the column is not in the pattern of the current row
    * iL: working array storing the active (new) column indices
    * iLev: working array storing the level of each active column
    * lenh: current length of the min-heap held in iL[0..lenh)
    * lenl: number of finished L entries of the current row, kept at the
    *       slots just below ii (first loop) or just below nLU (second loop)
    * lenu: one past the last staged U (or S) entry in iL
    * ctrL/U/S: number of entries written so far to the J arrays
    */

   HYPRE_Int         *temp_L_diag_j, *temp_U_diag_j, *temp_S_diag_j = NULL, *u_levels;
   HYPRE_Int         *iL, *iLev;
   HYPRE_Int         ii, i, j, k, ku, lena, lenl, lenu, lenh, ilev, lev, col, icol;
   HYPRE_Int         m = n - nLU;
   HYPRE_Int         *u_end_array;

   /* memory management */
   HYPRE_Int         ctrL;
   HYPRE_Int         ctrU;
   HYPRE_Int         ctrS;
   HYPRE_Int         capacity_L;
   HYPRE_Int         capacity_U;
   /* capacity_S is only assigned (and only read) when m > 0 */
   HYPRE_Int         capacity_S;
   HYPRE_Int         initial_alloc = 0;
   HYPRE_Int         nnz_A;

   /* set iL and iLev to right place in iw array */
   iL                = iw + n;
   iLev              = iw + 2*n;

   /* setup initial memory used:
    * nLU plus half of the nonzeros of A scaled by the fraction nLU / n
    */
   nnz_A             = A_diag_i[n];
   if (n > 0)
   {
      initial_alloc     = nLU + ceil((nnz_A / 2.0) * nLU / n);
   }
   capacity_L        = initial_alloc;
   capacity_U        = initial_alloc;

   /* allocate other memory for L and U struct */
   temp_L_diag_j     = hypre_CTAlloc(HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
   temp_U_diag_j     = hypre_CTAlloc(HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);

   /* S is only needed when there are rows beyond nLU (m > 0 implies n > 0) */
   if (m > 0)
   {
      capacity_S     = m + ceil(nnz_A / 2.0 * m / n);
      temp_S_diag_j  = hypre_CTAlloc(HYPRE_Int, capacity_S, HYPRE_MEMORY_DEVICE);
   }

   u_end_array       = hypre_TAlloc(HYPRE_Int, nLU, HYPRE_MEMORY_HOST);
   /* u_levels always has the same capacity as temp_U_diag_j */
   u_levels          = hypre_CTAlloc(HYPRE_Int, capacity_U, HYPRE_MEMORY_HOST);
   ctrL = ctrU = ctrS = 0;

   /* set initial value for working array: no column is active */
   for (ii = 0 ; ii < n; ii++)
   {
      iw[ii] = -1;
   }

   /*
    * 2: Start of main loop
    * those in iL are NEW col index (after permutation)
    */
   for (ii = 0; ii < nLU; ii++)
   {
      i = perm[ii];
      lenl = 0;
      lenh = 0;/* this is the current length of heap */
      lenu = ii;/* U entries of this row are staged in iL[ii..lenu) */
      lena = A_diag_i[i+1];
      /* put those already inside original pattern, and set their level to 0 */
      for (j = A_diag_i[i]; j < lena; j++)
      {
         /* get the new order of that col */
         col = rperm[A_diag_j[j]];
         if (col < ii)
         {
            /*
             * this is an entry in L
             * we maintain a heap structure for L part
             */
            iL[lenh] = col;
            iLev[lenh] = 0;
            iw[col] = lenh++;
            /* now maintain the heap structure (heap length is already lenh) */
            hypre_ILUMinHeapAddIIIi(iL,iLev,iw,lenh);
         }
         else if (col > ii)
         {
            /* this is an entry in U */
            iL[lenu] = col;
            iLev[lenu] = 0;
            iw[col] = lenu++;
         }
         /* col == ii (the diagonal) is not stored in either pattern */
      }/* end of j loop for adding pattern in original matrix */

      /*
       * search lower part of current row and update pattern based on level
       */
      while (lenh > 0)
      {
         /*
          * k is now the new col index after permutation
          * the first element of the heap is the smallest
          */
         k = iL[0];
         ilev = iLev[0];
         /*
          * we now need to maintain the heap structure
          * NOTE(review): the swap below assumes the removed minimum is left
          * in the last heap slot (index lenh - 1 before the decrement) --
          * confirm against hypre_ILUMinHeapRemoveIIIi
          */
         hypre_ILUMinHeapRemoveIIIi(iL,iLev,iw,lenh);
         lenh--;
         /* copy to the end of array */
         lenl++;
         /* reset iw for that, not using anymore */
         iw[k]=-1;
         /* move the finished L entry from slot lenh to slot ii - lenl */
         hypre_swap2i(iL,iLev,ii-lenl,lenh);
         /*
          * now the elimination on current row could start.
          * eliminate row k (new index) from current row
          */
         ku = U_diag_i[k+1];
         for (j = U_diag_i[k]; j < ku; j++)
         {
            col = temp_U_diag_j[j];
            /* level of the candidate = level of U(k,col) + level of L(ii,k) + 1 */
            lev = u_levels[j] + ilev + 1;
            /* slot of col in the current row, or -1 if not in the pattern */
            icol = iw[col];
            /* skip large level */
            if (lev > lfil)
            {
               continue;
            }
            if (icol < 0)
            {
               /* not yet in */
               if (col < ii)
               {
                  /*
                   * if we add to the left L, we need to maintain the
                   *    heap structure
                   */
                  iL[lenh] = col;
                  iLev[lenh] = lev;
                  iw[col] = lenh++;

                  /* maintain the heap */
                  hypre_ILUMinHeapAddIIIi(iL,iLev,iw,lenh);
               }
               else if (col > ii)
               {
                  /* new fill-in in the U part */
                  iL[lenu] = col;
                  iLev[lenu] = lev;
                  iw[col] = lenu++;
               }
            }
            else
            {
               /* already in the pattern: keep the smaller level */
               iLev[icol] = hypre_min(lev, iLev[icol]);
            }
         }/* end of loop j for level update */
      }/* end of while loop for iith row */

      /* now update everything, indices, levels and so on */
      L_diag_i[ii+1] = L_diag_i[ii] + lenl;
      if (lenl > 0)
      {
         /* check if memory is enough, grow by EXPAND_FACT until it is */
         while (ctrL + lenl > capacity_L)
         {
            HYPRE_Int tmp = capacity_L;
            capacity_L = capacity_L * EXPAND_FACT + 1;
            temp_L_diag_j = hypre_TReAlloc_v2(temp_L_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
         }
         /* now copy L data, reverse order: the entry taken from the heap
          * first sits at slot ii - 1, so it is written first */
         for (j = 0; j < lenl; j++)
         {
            temp_L_diag_j[ctrL+j] = iL[ii-j-1];
         }
         ctrL += lenl;
      }
      /* k is now the number of U entries of this row */
      k = lenu - ii;
      U_diag_i[ii+1] = U_diag_i[ii] + k;
      if (k > 0)
      {
         /* check if memory is enough; u_levels grows together with U */
         while (ctrU + k > capacity_U)
         {
            HYPRE_Int tmp = capacity_U;
            capacity_U = capacity_U * EXPAND_FACT + 1;
            temp_U_diag_j = hypre_TReAlloc_v2(temp_U_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
            u_levels = hypre_TReAlloc_v2(u_levels, HYPRE_Int, tmp, HYPRE_Int, capacity_U, HYPRE_MEMORY_HOST);
         }
         /* NOTE(review): temp_U_diag_j is allocated with HYPRE_MEMORY_DEVICE
          * but the copy below is tagged HOST -> HOST (an earlier variant
          * tagged it DEVICE <- HOST) -- confirm this is intended */
         hypre_TMemcpy(temp_U_diag_j+ctrU, iL+ii, HYPRE_Int, k,
                       HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
         hypre_TMemcpy(u_levels+ctrU, iLev+ii, HYPRE_Int, k,
                       HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
         ctrU += k;
      }
      if (m > 0)
      {
         /* sort row ii of U (columns together with their levels), then
          * search for nLU in it and store the reported spot in u_end_array[ii] */
         hypre_qsort2i(temp_U_diag_j,u_levels,U_diag_i[ii],U_diag_i[ii+1]-1);
         hypre_BinarySearch2(temp_U_diag_j,nLU,U_diag_i[ii],U_diag_i[ii+1]-1,u_end_array + ii);
      }
      else
      {
         /* Everything is in U */
         u_end_array[ii] = ctrU;
      }

      /* reset iw for the U entries of this row (L entries were reset above) */
      for(j = ii; j < lenu; j++)
      {
         iw[iL[j]] = -1;
      }

   }/* end of main loop ii from 0 to nLU-1 */

   /* another loop to set EU^-1 and Schur complement */
   for (ii = nLU; ii < n; ii++)
   {
      i = perm[ii];
      lenl = 0;
      lenh = 0;/* this is the current length of heap */
      lenu = nLU;/* now this stores S, start from nLU */
      lena = A_diag_i[i+1];
      /* put those already inside original pattern, and set their level to 0 */
      for (j = A_diag_i[i]; j < lena; j++)
      {
         /* get the new order of that col */
         col = rperm[A_diag_j[j]];
         if (col < nLU)
         {
            /*
             * this is an entry in L
             * we maintain a heap structure for L part
             */
            iL[lenh] = col;
            iLev[lenh] = 0;
            iw[col] = lenh++;
            /* now maintain the heap structure (heap length is already lenh) */
            hypre_ILUMinHeapAddIIIi(iL,iLev,iw,lenh);
         }
         else if (col != ii) /* the diagonal ii is always added later, avoid duplicate */
         {
            /* this is an entry in S */
            iL[lenu] = col;
            iLev[lenu] = 0;
            iw[col] = lenu++;
         }
      }/* end of j loop for adding pattern in original matrix */

      /*
       * search lower part of current row and update pattern based on level
       */
      while (lenh > 0)
      {
         /*
          * k is now the new col index after permutation
          * the first element of the heap is the smallest
          */
         k = iL[0];
         ilev = iLev[0];
         /*
          * we now need to maintain the heap structure
          * NOTE(review): same assumption as in the first loop about where
          * hypre_ILUMinHeapRemoveIIIi leaves the removed minimum
          */
         hypre_ILUMinHeapRemoveIIIi(iL,iLev,iw,lenh);
         lenh--;
         /* copy to the end of array */
         lenl++;
         /* reset iw for that, not using anymore */
         iw[k]=-1;
         /* move the finished L entry from slot lenh to slot nLU - lenl */
         hypre_swap2i(iL,iLev,nLU-lenl,lenh);
         /*
          * now the elimination on current row could start.
          * eliminate row k (new index) from current row
          */
         ku = U_diag_i[k+1];
         for (j = U_diag_i[k]; j < ku; j++)
         {
            col = temp_U_diag_j[j];
            /* level of the candidate = level of U(k,col) + level of L(ii,k) + 1 */
            lev = u_levels[j] + ilev + 1;
            /* slot of col in the current row, or -1 if not in the pattern */
            icol = iw[col];
            /* skip large level */
            if (lev > lfil)
            {
               continue;
            }
            if (icol < 0)
            {
               /* not yet in */
               if (col < nLU)
               {
                  /*
                   * if we add to the left L, we need to maintain the
                   *    heap structure
                   */
                  iL[lenh] = col;
                  iLev[lenh] = lev;
                  iw[col] = lenh++;

                  /* maintain the heap */
                  hypre_ILUMinHeapAddIIIi(iL,iLev,iw,lenh);
               }
               else if (col != ii)
               {
                  /* S part */
                  iL[lenu] = col;
                  iLev[lenu] = lev;
                  iw[col] = lenu++;
               }
            }
            else
            {
               /* already in the pattern: keep the smaller level */
               iLev[icol] = hypre_min(lev, iLev[icol]);
            }
         }/* end of loop j for level update */
      }/* end of while loop for iith row */

      /* now update everything, indices, levels and so on */
      L_diag_i[ii+1] = L_diag_i[ii] + lenl;
      if (lenl > 0)
      {
         /* check if memory is enough, grow by EXPAND_FACT until it is */
         while (ctrL + lenl > capacity_L)
         {
            HYPRE_Int tmp = capacity_L;
            capacity_L = capacity_L * EXPAND_FACT + 1;
            temp_L_diag_j = hypre_TReAlloc_v2(temp_L_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
         }
         /* now copy L data, reverse order: the entry taken from the heap
          * first sits at slot nLU - 1, so it is written first */
         for (j = 0; j < lenl; j ++)
         {
            temp_L_diag_j[ctrL+j] = iL[nLU-j-1];
         }
         ctrL += lenl;
      }
      /* k is the length of this S row: staged entries plus the diagonal */
      k = lenu - nLU + 1;
      /* check if memory is enough */
      while (ctrS + k > capacity_S)
      {
         HYPRE_Int tmp = capacity_S;
         capacity_S = capacity_S * EXPAND_FACT + 1;
         temp_S_diag_j = hypre_TReAlloc_v2(temp_S_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_S, HYPRE_MEMORY_DEVICE);
      }
      temp_S_diag_j[ctrS] = ii;/* must have diagonal, stored first in the row */
      /* NOTE(review): temp_S_diag_j is allocated with HYPRE_MEMORY_DEVICE
       * but the copy below is tagged HOST -> HOST (an earlier variant
       * tagged it DEVICE <- HOST) -- confirm this is intended */
      hypre_TMemcpy(temp_S_diag_j+ctrS+1, iL+nLU, HYPRE_Int, k-1,
                    HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
      ctrS += k;
      S_diag_i[ii-nLU+1] = ctrS;

      /* reset iw for the S entries of this row (L entries were reset above) */
      for (j = nLU; j < lenu; j++)
      {
         iw[iL[j]] = -1;
      }

   }/* end of main loop ii from nLU to n-1 */

   /*
    * 3: Update the struct for L, U and S
    * rows nLU..n-1 have no U entries, so their row pointers repeat U_diag_i[nLU]
    */
   for (k = nLU; k < n; k++)
   {
      U_diag_i[k+1] = U_diag_i[nLU];
   }
   /*
    * 4: Finishing up and free memory
    * the levels are only needed during the symbolic phase
    */
   hypre_TFree(u_levels,HYPRE_MEMORY_HOST);

   /* hand the J arrays and u_end_array over to the caller */
   *L_diag_j = temp_L_diag_j;
   *U_diag_j = temp_U_diag_j;
   *S_diag_j = temp_S_diag_j;
   *u_end = u_end_array;

   return hypre_error_flag;
}
4604 
4605 /* ILU(k)
4606  * A: input matrix
4607  * lfil: level of fill-in, the k in ILU(k)
 * permp: permutation array indicating ordering of factorization. Perm could come from a
 *    CF_marker array or a reordering routine.
 * qpermp: column permutation array.
 * nLU: size of computed LDU factorization.
 * nI: number of interior unknowns, nI should obey nI >= nLU
 * Lptr, Dptr, Uptr: L, D, U factors.
 * Sptr: Schur Complement, if no Schur Complement is needed it will be set to NULL
4615  */
4616 HYPRE_Int
hypre_ILUSetupILUK(hypre_ParCSRMatrix * A,HYPRE_Int lfil,HYPRE_Int * permp,HYPRE_Int * qpermp,HYPRE_Int nLU,HYPRE_Int nI,hypre_ParCSRMatrix ** Lptr,HYPRE_Real ** Dptr,hypre_ParCSRMatrix ** Uptr,hypre_ParCSRMatrix ** Sptr,HYPRE_Int ** u_end)4617 hypre_ILUSetupILUK(hypre_ParCSRMatrix *A, HYPRE_Int lfil, HYPRE_Int *permp, HYPRE_Int *qpermp, HYPRE_Int nLU, HYPRE_Int nI,
4618       hypre_ParCSRMatrix **Lptr, HYPRE_Real** Dptr, hypre_ParCSRMatrix **Uptr, hypre_ParCSRMatrix **Sptr, HYPRE_Int **u_end)
4619 {
4620    /*
4621     * 1: Setup and create buffers
4622     * matL/U: the ParCSR matrix for L and U
4623     * L/U_diag: the diagonal csr matrix of matL/U
4624     * A_diag_*: tempory pointer for the diagonal matrix of A and its '*' slot
4625     * ii = outer loop from 0 to nLU - 1
4626     * i = the real col number in diag inside the outer loop
4627     * iw =  working array store the reverse of active col number
4628     * iL = working array store the active col number
4629     */
4630 
4631    /* call ILU0 if lfil is 0 */
4632    if (lfil == 0)
4633    {
4634       return hypre_ILUSetupILU0( A, permp, qpermp, nLU, nI, Lptr, Dptr, Uptr, Sptr, u_end);
4635    }
4636    HYPRE_Real              local_nnz, total_nnz;
4637    HYPRE_Int               i, ii, j, k, k1, k2, k3, kl, ku, jpiv, col, icol;
4638    HYPRE_Int               *iw;
4639    MPI_Comm                comm = hypre_ParCSRMatrixComm(A);
4640    HYPRE_Int            num_procs,  my_id;
4641 
4642    /* data objects for A */
4643    hypre_CSRMatrix         *A_diag        = hypre_ParCSRMatrixDiag(A);
4644    hypre_CSRMatrix         *A_offd        = hypre_ParCSRMatrixOffd(A);
4645    HYPRE_Real              *A_diag_data   = hypre_CSRMatrixData(A_diag);
4646    HYPRE_Int               *A_diag_i      = hypre_CSRMatrixI(A_diag);
4647    HYPRE_Int               *A_diag_j      = hypre_CSRMatrixJ(A_diag);
4648    HYPRE_Real              *A_offd_data   = hypre_CSRMatrixData(A_offd);
4649    HYPRE_Int               *A_offd_i      = hypre_CSRMatrixI(A_offd);
4650    HYPRE_Int               *A_offd_j      = hypre_CSRMatrixJ(A_offd);
4651 
4652    /* data objects for L, D, U */
4653    hypre_ParCSRMatrix      *matL;
4654    hypre_ParCSRMatrix      *matU;
4655    hypre_CSRMatrix         *L_diag;
4656    hypre_CSRMatrix         *U_diag;
4657    HYPRE_Real              *D_data;
4658    HYPRE_Real              *L_diag_data   = NULL;
4659    HYPRE_Int               *L_diag_i;
4660    HYPRE_Int               *L_diag_j      = NULL;
4661    HYPRE_Real              *U_diag_data   = NULL;
4662    HYPRE_Int               *U_diag_i;
4663    HYPRE_Int               *U_diag_j      = NULL;
4664 
4665    /* data objects for S */
4666    hypre_ParCSRMatrix      *matS          = NULL;
4667    hypre_CSRMatrix         *S_diag;
4668    hypre_CSRMatrix         *S_offd;
4669    HYPRE_Real              *S_diag_data   = NULL;
4670    HYPRE_Int               *S_diag_i      = NULL;
4671    HYPRE_Int               *S_diag_j      = NULL;
4672    HYPRE_Int               *S_offd_i      = NULL;
4673    HYPRE_Int               *S_offd_j      = NULL;
4674    HYPRE_BigInt            *S_offd_colmap = NULL;
4675    HYPRE_Real              *S_offd_data;
4676    HYPRE_Int               S_offd_nnz, S_offd_ncols;
4677    HYPRE_BigInt            col_starts[2];
4678    HYPRE_BigInt            total_rows;
4679 
4680    /* communication */
4681    hypre_ParCSRCommPkg     *comm_pkg;
4682    hypre_ParCSRCommHandle  *comm_handle;
4683    HYPRE_BigInt            *send_buf      = NULL;
4684 
4685    /* problem size */
4686    HYPRE_Int               n;
4687    HYPRE_Int               m;
4688    HYPRE_Int               e;
4689    HYPRE_Int               m_e;
4690 
4691    /* reverse permutation array */
4692    HYPRE_Int               *rperm;
4693    HYPRE_Int               *perm, *qperm;
4694 
4695    /* start setup */
4696    /* check input and get problem size */
4697    n =  hypre_CSRMatrixNumRows(A_diag);
4698    if (nLU < 0 || nLU > n)
4699    {
4700       hypre_error_w_msg(HYPRE_ERROR_ARG,"WARNING: nLU out of range.\n");
4701    }
4702    m = n - nLU;
4703    e = nI - nLU;
4704    m_e = n - nI;
4705    if (e < 0)
4706    {
4707       hypre_error_w_msg(HYPRE_ERROR_ARG,"WARNING: nLU should not exceed nI.\n");
4708    }
4709 
4710    /* Init I array anyway. S's might be freed later */
4711    D_data = hypre_CTAlloc(HYPRE_Real, n, HYPRE_MEMORY_DEVICE);
4712    L_diag_i = hypre_CTAlloc(HYPRE_Int, (n+1), HYPRE_MEMORY_DEVICE);
4713    U_diag_i = hypre_CTAlloc(HYPRE_Int, (n+1), HYPRE_MEMORY_DEVICE);
4714    S_diag_i = hypre_CTAlloc(HYPRE_Int, (m+1), HYPRE_MEMORY_DEVICE);
4715 
4716    /* set Comm_Pkg if not yet built */
4717    hypre_MPI_Comm_size(comm,&num_procs);
4718    hypre_MPI_Comm_rank(comm,&my_id);
4719    comm_pkg = hypre_ParCSRMatrixCommPkg(A);
4720    if (!comm_pkg)
4721    {
4722       hypre_MatvecCommPkgCreate(A);
4723       comm_pkg = hypre_ParCSRMatrixCommPkg(A);
4724    }
4725 
4726    /*
4727     * 2: Symbolic factorization
4728     * setup iw and rperm first
4729     */
4730    /* allocate work arrays */
4731    iw = hypre_CTAlloc(HYPRE_Int, 4*n, HYPRE_MEMORY_HOST);
4732    rperm = iw + 3*n;
4733    L_diag_i[0] = U_diag_i[0] = S_diag_i[0] = 0;
4734    /* get reverse permutation (rperm).
4735     * rperm holds the reordered indexes.
4736     */
4737 
4738    if (!permp)
4739    {
4740       perm = hypre_TAlloc(HYPRE_Int, n, HYPRE_MEMORY_DEVICE);
4741       for (i = 0; i < n; i++)
4742       {
4743          perm[i] = i;
4744       }
4745    }
4746    else
4747    {
4748       perm = permp;
4749    }
4750 
4751    if (!qpermp)
4752    {
4753       qperm = hypre_TAlloc(HYPRE_Int, n, HYPRE_MEMORY_DEVICE);
4754       for (i = 0; i < n; i++)
4755       {
4756          qperm[i] = i;
4757       }
4758    }
4759    else
4760    {
4761       qperm = qpermp;
4762    }
4763 
4764    for (i = 0; i < n; i++)
4765    {
4766       rperm[qperm[i]] = i;
4767    }
4768 
4769    /* do symbolic factorization */
4770    hypre_ILUSetupILUKSymbolic(n, A_diag_i, A_diag_j, lfil, perm, rperm, iw,
4771          nLU, L_diag_i, U_diag_i, S_diag_i, &L_diag_j, &U_diag_j, &S_diag_j, u_end);
4772 
4773    /*
4774     * after this, we have our I,J for L, U and S ready, and L sorted
4775     * iw are still -1 after symbolic factorization
4776     * now setup helper array here
4777     */
4778    if (L_diag_i[n])
4779    {
4780       L_diag_data = hypre_CTAlloc(HYPRE_Real, L_diag_i[n], HYPRE_MEMORY_DEVICE);
4781    }
4782    if (U_diag_i[n])
4783    {
4784       U_diag_data = hypre_CTAlloc(HYPRE_Real, U_diag_i[n], HYPRE_MEMORY_DEVICE);
4785    }
4786    if (S_diag_i[m])
4787    {
4788       S_diag_data = hypre_CTAlloc(HYPRE_Real, S_diag_i[m], HYPRE_MEMORY_DEVICE);
4789    }
4790 
4791    /*
4792     * 3: Begin real factorization
4793     * we already have L and U structure ready, so no extra working array needed
4794     */
4795    /* first loop for upper part */
4796    for (ii = 0; ii < nLU; ii++)
4797    {
4798       // get row i
4799       i = perm[ii];
4800       kl = L_diag_i[ii+1];
4801       ku = U_diag_i[ii+1];
4802       k1 = A_diag_i[i];
4803       k2 = A_diag_i[i+1];
4804       /* set up working arrays */
4805       for (j = L_diag_i[ii]; j < kl; j++)
4806       {
4807          col = L_diag_j[j];
4808          iw[col] = j;
4809       }
4810       D_data[ii] = 0.0;
4811       iw[ii] = ii;
4812       for (j = U_diag_i[ii]; j < ku; j++)
4813       {
4814          col = U_diag_j[j];
4815          iw[col] = j;
4816       }
4817       /* copy data from A into L, D and U */
4818       for (j = k1; j < k2; j++)
4819       {
4820          /* compute everything in new index */
4821          col = rperm[A_diag_j[j]];
4822          icol = iw[col];
4823          /* A for sure to be inside the pattern */
4824          if (col < ii)
4825          {
4826             L_diag_data[icol] = A_diag_data[j];
4827          }
4828          else if (col == ii)
4829          {
4830             D_data[ii] = A_diag_data[j];
4831          }
4832          else
4833          {
4834             U_diag_data[icol] = A_diag_data[j];
4835          }
4836       }
4837       /* elimination */
4838       for (j = L_diag_i[ii]; j < kl; j++)
4839       {
4840          jpiv = L_diag_j[j];
4841          L_diag_data[j] *= D_data[jpiv];
4842          ku = U_diag_i[jpiv+1];
4843 
4844          for (k = U_diag_i[jpiv]; k < ku; k++)
4845          {
4846             col = U_diag_j[k];
4847             icol = iw[col];
4848             if (icol < 0)
4849             {
4850                /* not in partern */
4851                continue;
4852             }
4853             if (col < ii)
4854             {
4855                /* L part */
4856                L_diag_data[icol] -= L_diag_data[j]*U_diag_data[k];
4857             }
4858             else if (col == ii)
4859             {
4860                /* diag part */
4861                D_data[icol] -= L_diag_data[j]*U_diag_data[k];
4862             }
4863             else
4864             {
4865                /* U part */
4866                U_diag_data[icol] -= L_diag_data[j]*U_diag_data[k];
4867             }
4868          }
4869       }
4870       /* reset working array */
4871       ku = U_diag_i[ii+1];
4872       for (j = L_diag_i[ii]; j < kl; j++)
4873       {
4874          col = L_diag_j[j];
4875          iw[col] = -1;
4876       }
4877       iw[ii] = -1;
4878       for (j = U_diag_i[ii]; j < ku ; j++)
4879       {
4880          col = U_diag_j[j];
4881          iw[col] = -1;
4882       }
4883 
4884       /* diagonal part (we store the inverse) */
4885       if (fabs(D_data[ii]) < MAT_TOL)
4886       {
4887          D_data[ii] = 1e-06;
4888       }
4889       D_data[ii] = 1./ D_data[ii];
4890    }
4891 
4892    /* Now lower part for Schur complement */
4893    for (ii = nLU; ii < n; ii++)
4894    {
4895       // get row i
4896       i = perm[ii];
4897       kl = L_diag_i[ii+1];
4898       ku = S_diag_i[ii - nLU +1];
4899       k1 = A_diag_i[i];
4900       k2 = A_diag_i[i+1];
4901       /* set up working arrays */
4902       for (j = L_diag_i[ii]; j < kl; j++)
4903       {
4904          col = L_diag_j[j];
4905          iw[col] = j;
4906       }
4907       for (j = S_diag_i[ii - nLU]; j < ku; j++)
4908       {
4909          col = S_diag_j[j];
4910          iw[col] = j;
4911       }
4912       /* copy data from A into L, and S */
4913       for (j = k1; j < k2; j++)
4914       {
4915          /* compute everything in new index */
4916          col = rperm[A_diag_j[j]];
4917          icol = iw[col];
4918          /* A for sure to be inside the pattern */
4919          if (col < nLU)
4920          {
4921             L_diag_data[icol] = A_diag_data[j];
4922          }
4923          else
4924          {
4925             S_diag_data[icol] = A_diag_data[j];
4926          }
4927       }
4928       /* elimination */
4929       for (j = L_diag_i[ii]; j < kl; j++)
4930       {
4931          jpiv = L_diag_j[j];
4932          L_diag_data[j] *= D_data[jpiv];
4933          ku = U_diag_i[jpiv+1];
4934          for (k = U_diag_i[jpiv]; k < ku; k++)
4935          {
4936             col = U_diag_j[k];
4937             icol = iw[col];
4938             if (icol < 0)
4939             {
4940                /* not in partern */
4941                continue;
4942             }
4943             if (col < nLU)
4944             {
4945                /* L part */
4946                L_diag_data[icol] -= L_diag_data[j]*U_diag_data[k];
4947             }
4948             else
4949             {
4950                /* S part */
4951                S_diag_data[icol] -= L_diag_data[j]*U_diag_data[k];
4952             }
4953          }
4954       }
4955       /* reset working array */
4956       for (j = L_diag_i[ii]; j < kl ; j++)
4957       {
4958          col = L_diag_j[j];
4959          iw[col] = -1;
4960       }
4961       ku = S_diag_i[ii-nLU+1];
4962       for (j = S_diag_i[ii-nLU]; j < ku; j++)
4963       {
4964          col = S_diag_j[j];
4965          iw[col] = -1;
4966          /* remember to update index, S is smaller! */
4967          S_diag_j[j]-=nLU;
4968       }
4969    }
4970 
4971    /*
4972     * 4: Finishing up and free
4973     */
4974 
4975    /* First create Schur complement if necessary
4976     * Check if we need to create Schur complement
4977     */
4978    HYPRE_BigInt big_m = (HYPRE_BigInt)m;
4979    hypre_MPI_Allreduce(&big_m, &total_rows, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
4980    /* only form when total_rows > 0 */
4981    if ( total_rows > 0 )
4982    {
4983       /* now create S */
4984       /* need to get new column start */
4985       {
4986          HYPRE_BigInt global_start;
4987          hypre_MPI_Scan( &big_m, &global_start, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
4988          col_starts[0] = global_start - m;
4989          col_starts[1] = global_start;
4990       }
4991 
4992       /* We did nothing to A_offd, so all the data kept, just reorder them
4993        * The create function takes comm, global num rows/cols,
4994        *    row/col start, num cols offd, nnz diag, nnz offd
4995        */
4996       S_offd_nnz = hypre_CSRMatrixNumNonzeros(A_offd);
4997       S_offd_ncols = hypre_CSRMatrixNumCols(A_offd);
4998 
4999       matS = hypre_ParCSRMatrixCreate( comm,
5000             total_rows,
5001             total_rows,
5002             col_starts,
5003             col_starts,
5004             S_offd_ncols,
5005             S_diag_i[m],
5006             S_offd_nnz);
5007 
5008       /* first put diagonal data in */
5009       S_diag = hypre_ParCSRMatrixDiag(matS);
5010 
5011       hypre_CSRMatrixI(S_diag) = S_diag_i;
5012       hypre_CSRMatrixData(S_diag) = S_diag_data;
5013       hypre_CSRMatrixJ(S_diag) = S_diag_j;
5014 
5015       /* now start to construct offdiag of S */
5016       S_offd = hypre_ParCSRMatrixOffd(matS);
5017       S_offd_i = hypre_TAlloc(HYPRE_Int, m+1, HYPRE_MEMORY_DEVICE);
5018       S_offd_j = hypre_TAlloc(HYPRE_Int, S_offd_nnz, HYPRE_MEMORY_DEVICE);
5019       S_offd_data = hypre_TAlloc(HYPRE_Real, S_offd_nnz, HYPRE_MEMORY_DEVICE);
5020       S_offd_colmap = hypre_CTAlloc(HYPRE_BigInt, S_offd_ncols, HYPRE_MEMORY_HOST);
5021 
5022       /* simply use a loop to copy data from A_offd */
5023       S_offd_i[0] = 0;
5024       k3 = 0;
5025       for (i = 1; i <= e; i++)
5026       {
5027          S_offd_i[i+1] = k3;
5028       }
5029       for (i = 0; i < m_e; i++)
5030       {
5031          col = perm[i + nI];
5032          k1 = A_offd_i[col];
5033          k2 = A_offd_i[col+1];
5034          for (j = k1; j < k2; j++)
5035          {
5036             S_offd_j[k3] = A_offd_j[j];
5037             S_offd_data[k3++] = A_offd_data[j];
5038          }
5039          S_offd_i[i+e+1] = k3;
5040       }
5041 
5042       /* give I, J, DATA to S_offd */
5043       hypre_CSRMatrixI(S_offd) = S_offd_i;
5044       hypre_CSRMatrixJ(S_offd) = S_offd_j;
5045       hypre_CSRMatrixData(S_offd) = S_offd_data;
5046 
5047       /* now we need to update S_offd_colmap */
5048 
5049       /* get total num of send */
5050       HYPRE_Int num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
5051       HYPRE_Int begin = hypre_ParCSRCommPkgSendMapStart(comm_pkg,0);
5052       HYPRE_Int end = hypre_ParCSRCommPkgSendMapStart(comm_pkg,num_sends);
5053       send_buf = hypre_TAlloc(HYPRE_BigInt, end - begin, HYPRE_MEMORY_HOST);
5054       /* copy new index into send_buf */
5055       for (i = begin; i < end; i++)
5056       {
5057          send_buf[i-begin] = rperm[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,i)] - nLU + col_starts[0];
5058       }
5059 
5060       /* main communication */
5061       comm_handle = hypre_ParCSRCommHandleCreate(21, comm_pkg, send_buf, S_offd_colmap);
5062       hypre_ParCSRCommHandleDestroy(comm_handle);
5063 
5064       /* setup index */
5065       hypre_ParCSRMatrixColMapOffd(matS) = S_offd_colmap;
5066 
5067       hypre_ILUSortOffdColmap(matS);
5068 
5069       /* free */
5070       hypre_TFree(send_buf, HYPRE_MEMORY_HOST);
5071    } /* end of forming S */
5072 
5073    /* Assemble LDU matrices */
5074    /* zero out unfactored rows */
5075    for (k = nLU; k < n; k++)
5076    {
5077       D_data[k] = 1.;
5078    }
5079 
5080    matL = hypre_ParCSRMatrixCreate( comm,
5081          hypre_ParCSRMatrixGlobalNumRows(A),
5082          hypre_ParCSRMatrixGlobalNumRows(A),
5083          hypre_ParCSRMatrixRowStarts(A),
5084          hypre_ParCSRMatrixColStarts(A),
5085          0 /* num_cols_offd */,
5086          L_diag_i[n],
5087          0 /* num_nonzeros_offd */);
5088 
5089    L_diag = hypre_ParCSRMatrixDiag(matL);
5090    hypre_CSRMatrixI(L_diag) = L_diag_i;
5091    if (L_diag_i[n]>0)
5092    {
5093       hypre_CSRMatrixData(L_diag) = L_diag_data;
5094       hypre_CSRMatrixJ(L_diag) = L_diag_j;
5095    }
5096    else
5097    {
5098       /* we allocated some initial length, so free them */
5099       hypre_TFree(L_diag_j, HYPRE_MEMORY_DEVICE);
5100    }
5101    /* store (global) total number of nonzeros */
5102    local_nnz = (HYPRE_Real) (L_diag_i[n]);
5103    hypre_MPI_Allreduce(&local_nnz, &total_nnz, 1, HYPRE_MPI_REAL, hypre_MPI_SUM, comm);
5104    hypre_ParCSRMatrixDNumNonzeros(matL) = total_nnz;
5105 
5106    matU = hypre_ParCSRMatrixCreate( comm,
5107          hypre_ParCSRMatrixGlobalNumRows(A),
5108          hypre_ParCSRMatrixGlobalNumRows(A),
5109          hypre_ParCSRMatrixRowStarts(A),
5110          hypre_ParCSRMatrixColStarts(A),
5111          0,
5112          U_diag_i[n],
5113          0 );
5114 
5115    U_diag = hypre_ParCSRMatrixDiag(matU);
5116    hypre_CSRMatrixI(U_diag) = U_diag_i;
5117    if (U_diag_i[n]>0)
5118    {
5119       hypre_CSRMatrixData(U_diag) = U_diag_data;
5120       hypre_CSRMatrixJ(U_diag) = U_diag_j;
5121    }
5122    else
5123    {
5124       /* we allocated some initial length, so free them */
5125       hypre_TFree(U_diag_j, HYPRE_MEMORY_DEVICE);
5126    }
5127    /* store (global) total number of nonzeros */
5128    local_nnz = (HYPRE_Real) (U_diag_i[n]);
5129    hypre_MPI_Allreduce(&local_nnz, &total_nnz, 1, HYPRE_MPI_REAL, hypre_MPI_SUM, comm);
5130    hypre_ParCSRMatrixDNumNonzeros(matU) = total_nnz;
5131 
5132    /* free */
5133    hypre_TFree(iw,HYPRE_MEMORY_HOST);
5134    if (!matS)
5135    {
5136       /* we allocate some memory for S, need to free if unused */
5137       hypre_TFree(S_diag_i,HYPRE_MEMORY_DEVICE);
5138    }
5139 
5140    if (!permp)
5141    {
5142       hypre_TFree(perm, HYPRE_MEMORY_DEVICE);
5143    }
5144 
5145    if (!qpermp)
5146    {
5147       hypre_TFree(qperm, HYPRE_MEMORY_DEVICE);
5148    }
5149 
5150    /* set matrix pointers */
5151    *Lptr = matL;
5152    *Dptr = D_data;
5153    *Uptr = matU;
5154    *Sptr = matS;
5155 
5156    return hypre_error_flag;
5157 }
5158 
5159 /* ILUT
5160  * A: input matrix
5161  * lfil: maximum nnz per row in L and U
5162  * tol: droptol array in ILUT
5163  *    tol[0]: matrix B
5164  *    tol[1]: matrix E and F
5165  *    tol[2]: matrix S
5166  * perm: permutation array indicating ordering of factorization. Perm could come from a
5167  *    CF_marker: array or a reordering routine.
5168  * qperm: permutation array for column
5169  * nLU: size of computed LDU factorization. If nLU < n, Schur compelemnt will be formed
5170  * nI: number of interial unknowns. nLU should obey nLU <= nI.
5171  * Lptr, Dptr, Uptr: L, D, U factors.
5172  * Sptr: Schur complement
5173  *
5174  * Keep the largest lfil entries that is greater than some tol relative
5175  *    to the input tol and the norm of that row in both L and U
5176  */
5177 HYPRE_Int
hypre_ILUSetupILUT(hypre_ParCSRMatrix * A,HYPRE_Int lfil,HYPRE_Real * tol,HYPRE_Int * permp,HYPRE_Int * qpermp,HYPRE_Int nLU,HYPRE_Int nI,hypre_ParCSRMatrix ** Lptr,HYPRE_Real ** Dptr,hypre_ParCSRMatrix ** Uptr,hypre_ParCSRMatrix ** Sptr,HYPRE_Int ** u_end)5178 hypre_ILUSetupILUT(hypre_ParCSRMatrix *A, HYPRE_Int lfil, HYPRE_Real *tol,
5179       HYPRE_Int *permp, HYPRE_Int *qpermp, HYPRE_Int nLU, HYPRE_Int nI, hypre_ParCSRMatrix **Lptr,
5180       HYPRE_Real** Dptr, hypre_ParCSRMatrix **Uptr, hypre_ParCSRMatrix **Sptr, HYPRE_Int **u_end)
5181 {
5182    /*
5183     * 1: Setup and create buffers
5184     * matL/U: the ParCSR matrix for L and U
5185     * L/U_diag: the diagonal csr matrix of matL/U
5186     * A_diag_*: tempory pointer for the diagonal matrix of A and its '*' slot
5187     * ii = outer loop from 0 to nLU - 1
5188     * i = the real col number in diag inside the outer loop
5189     * iw =  working array store the reverse of active col number
5190     * iL = working array store the active col number
5191     */
5192    HYPRE_Real               local_nnz, total_nnz;
5193    HYPRE_Int                i, ii, j, k, k1, k2, k3, kl, ku, col, icol, lenl, lenu, lenhu, lenhlr, lenhll, jpos, jrow;
5194    HYPRE_Real               inorm, itolb, itolef, itols, dpiv, lxu;
5195    HYPRE_Int                *iw,*iL;
5196    HYPRE_Real               *w;
5197 
5198    /* memory management */
5199    HYPRE_Int                ctrL;
5200    HYPRE_Int                ctrU;
5201    HYPRE_Int                initial_alloc = 0;
5202    HYPRE_Int                capacity_L;
5203    HYPRE_Int                capacity_U;
5204    HYPRE_Int                ctrS;
5205    HYPRE_Int                capacity_S;
5206    HYPRE_Int                nnz_A;
5207 
5208    /* communication stuffs for S */
5209    MPI_Comm                 comm             = hypre_ParCSRMatrixComm(A);
5210    HYPRE_Int                S_offd_nnz, S_offd_ncols;
5211    hypre_ParCSRCommPkg      *comm_pkg;
5212    hypre_ParCSRCommHandle   *comm_handle;
5213    HYPRE_Int                num_procs, my_id;
5214    HYPRE_BigInt             col_starts[2];
5215    HYPRE_BigInt             total_rows;
5216    HYPRE_Int                num_sends;
5217    HYPRE_Int                begin, end;
5218 
5219    /* data objects for A */
5220    hypre_CSRMatrix          *A_diag          = hypre_ParCSRMatrixDiag(A);
5221    hypre_CSRMatrix          *A_offd          = hypre_ParCSRMatrixOffd(A);
5222    HYPRE_Real               *A_diag_data     = hypre_CSRMatrixData(A_diag);
5223    HYPRE_Int                *A_diag_i        = hypre_CSRMatrixI(A_diag);
5224    HYPRE_Int                *A_diag_j        = hypre_CSRMatrixJ(A_diag);
5225    HYPRE_Int                *A_offd_i        = hypre_CSRMatrixI(A_offd);
5226    HYPRE_Int                *A_offd_j        = hypre_CSRMatrixJ(A_offd);
5227    HYPRE_Real               *A_offd_data     = hypre_CSRMatrixData(A_offd);
5228 
5229    /* data objects for L, D, U */
5230    hypre_ParCSRMatrix       *matL;
5231    hypre_ParCSRMatrix       *matU;
5232    hypre_CSRMatrix          *L_diag;
5233    hypre_CSRMatrix          *U_diag;
5234    HYPRE_Real               *D_data;
5235    HYPRE_Real               *L_diag_data     = NULL;
5236    HYPRE_Int                *L_diag_i;
5237    HYPRE_Int                *L_diag_j        = NULL;
5238    HYPRE_Real               *U_diag_data     = NULL;
5239    HYPRE_Int                *U_diag_i;
5240    HYPRE_Int                *U_diag_j        = NULL;
5241 
5242    /* data objects for S */
5243    hypre_ParCSRMatrix       *matS            = NULL;
5244    hypre_CSRMatrix          *S_diag;
5245    hypre_CSRMatrix          *S_offd;
5246    HYPRE_Real               *S_diag_data     = NULL;
5247    HYPRE_Int                *S_diag_i        = NULL;
5248    HYPRE_Int                *S_diag_j        = NULL;
5249    HYPRE_Int                *S_offd_i        = NULL;
5250    HYPRE_Int                *S_offd_j        = NULL;
5251    HYPRE_BigInt                *S_offd_colmap   = NULL;
5252    HYPRE_Real               *S_offd_data;
5253    HYPRE_BigInt                *send_buf        = NULL;
5254    HYPRE_Int                *u_end_array;
5255 
5256    /* reverse permutation */
5257    HYPRE_Int                *rperm;
5258    HYPRE_Int                *perm, *qperm;
5259 
5260    /* problem size
5261     * m is n - nLU, num of rows of local Schur system
5262     * m_e is the size of interface nodes
5263     * e is the number of interial rows in local Schur Complement
5264     */
5265    HYPRE_Int                n;
5266    HYPRE_Int                m;
5267    HYPRE_Int                e;
5268    HYPRE_Int                m_e;
5269 
5270    /* start setup
5271     * check input first
5272     */
5273    n = hypre_CSRMatrixNumRows(A_diag);
5274    if (nLU < 0 || nLU > n)
5275    {
5276       hypre_error_w_msg(HYPRE_ERROR_ARG,"WARNING: nLU out of range.\n");
5277    }
5278    m = n - nLU;
5279    e = nI - nLU;
5280    m_e = n - nI;
5281    if (e < 0)
5282    {
5283       hypre_error_w_msg(HYPRE_ERROR_ARG,"WARNING: nLU should not exceed nI.\n");
5284    }
5285 
5286    u_end_array = hypre_TAlloc(HYPRE_Int, nLU, HYPRE_MEMORY_HOST);
5287 
5288    /* start set up
5289     * setup communication stuffs first
5290     */
5291    hypre_MPI_Comm_size(comm,&num_procs);
5292    hypre_MPI_Comm_rank(comm,&my_id);
5293    comm_pkg = hypre_ParCSRMatrixCommPkg(A);
5294    /* create if not yet built */
5295    if (!comm_pkg)
5296    {
5297       hypre_MatvecCommPkgCreate(A);
5298       comm_pkg = hypre_ParCSRMatrixCommPkg(A);
5299    }
5300 
5301    /* setup initial memory, in ILUT, just guess with max nnz per row */
5302    nnz_A = A_diag_i[nLU];
5303    if (n > 0)
5304    {
5305       initial_alloc = hypre_min(nLU + ceil((nnz_A / 2.0) * nLU / n), nLU * lfil);
5306    }
5307    capacity_L = initial_alloc;
5308    capacity_U = initial_alloc;
5309 
5310    D_data = hypre_CTAlloc(HYPRE_Real, n, HYPRE_MEMORY_DEVICE);
5311    L_diag_i = hypre_CTAlloc(HYPRE_Int, (n+1), HYPRE_MEMORY_DEVICE);
5312    U_diag_i = hypre_CTAlloc(HYPRE_Int, (n+1), HYPRE_MEMORY_DEVICE);
5313 
5314    L_diag_j = hypre_CTAlloc(HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
5315    U_diag_j = hypre_CTAlloc(HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
5316    L_diag_data = hypre_CTAlloc(HYPRE_Real, capacity_L, HYPRE_MEMORY_DEVICE);
5317    U_diag_data = hypre_CTAlloc(HYPRE_Real, capacity_U, HYPRE_MEMORY_DEVICE);
5318 
5319    ctrL = ctrU = 0;
5320 
5321    ctrS = 0;
5322    S_diag_i = hypre_CTAlloc(HYPRE_Int, (m + 1), HYPRE_MEMORY_DEVICE);
5323    S_diag_i[0] = 0;
5324    /* only setup S part when n > nLU */
5325    if (m > 0)
5326    {
5327       capacity_S = hypre_min(m + ceil((nnz_A / 2.0) * m / n), m * lfil);
5328       S_diag_j = hypre_CTAlloc(HYPRE_Int, capacity_S, HYPRE_MEMORY_DEVICE);
5329       S_diag_data = hypre_CTAlloc(HYPRE_Real, capacity_S, HYPRE_MEMORY_DEVICE);
5330    }
5331 
5332    /* setting up working array */
5333    iw = hypre_CTAlloc(HYPRE_Int,3*n,HYPRE_MEMORY_HOST);
5334    iL = iw + n;
5335    w = hypre_CTAlloc(HYPRE_Real,n,HYPRE_MEMORY_HOST);
5336    for (i = 0; i < n; i++)
5337    {
5338       iw[i] = -1;
5339    }
5340    L_diag_i[0] = U_diag_i[0] = 0;
5341    /* get reverse permutation (rperm).
5342     * rperm holds the reordered indexes.
5343     * rperm[old] -> new
5344     * perm[new]  -> old
5345     */
5346    rperm = iw + 2*n;
5347 
5348    if (!permp)
5349    {
5350       perm = hypre_TAlloc(HYPRE_Int, n, HYPRE_MEMORY_DEVICE);
5351       for (i = 0; i < n; i++)
5352       {
5353          perm[i] = i;
5354       }
5355    }
5356    else
5357    {
5358       perm = permp;
5359    }
5360 
5361    if (!qpermp)
5362    {
5363       qperm = hypre_TAlloc(HYPRE_Int, n, HYPRE_MEMORY_DEVICE);
5364       for (i = 0; i < n; i++)
5365       {
5366          qperm[i] = i;
5367       }
5368    }
5369    else
5370    {
5371       qperm = qpermp;
5372    }
5373 
5374    for (i = 0; i < n; i++)
5375    {
5376       rperm[perm[i]] = i;
5377    }
5378    /*
5379     * 2: Main loop of elimination
5380     * maintain two heaps
5381     * |----->*********<-----|-----*********|
5382     * |col heap***value heap|value in U****|
5383     */
5384 
5385    /* main outer loop for upper part */
5386    for (ii = 0; ii < nLU; ii++)
5387    {
5388       /* get real row with perm */
5389       i = perm[ii];
5390       k1 = A_diag_i[i];
5391       k2 = A_diag_i[i+1];
5392       kl = ii-1;
5393       /* reset row norm of ith row */
5394       inorm = .0;
5395       for (j = k1; j < k2; j++)
5396       {
5397          inorm += fabs(A_diag_data[j]);
5398       }
5399       if (inorm == .0)
5400       {
5401          hypre_error_w_msg(HYPRE_ERROR_ARG,"WARNING: ILUT with zero row.\n");
5402       }
5403       inorm /= (HYPRE_Real)(k2-k1);
5404       /* set the scaled tol for that row */
5405       itolb = tol[0] * inorm;
5406       itolef = tol[1] * inorm;
5407 
5408       /* reset displacement */
5409       lenhll = lenhlr = lenu = 0;
5410       w[ii] = 0.0;
5411       iw[ii] = ii;
5412       /* copy in data from A */
5413       for (j = k1; j < k2; j++)
5414       {
5415          /* get now col number */
5416          col = rperm[A_diag_j[j]];
5417          if (col < ii)
5418          {
5419             /* L part of it */
5420             iL[lenhll] = col;
5421             w[lenhll] = A_diag_data[j];
5422             iw[col] = lenhll++;
5423             /* add to heap, by col number */
5424             hypre_ILUMinHeapAddIRIi(iL,w,iw,lenhll);
5425          }
5426          else if (col == ii)
5427          {
5428             w[ii] = A_diag_data[j];
5429          }
5430          else
5431          {
5432             lenu++;
5433             jpos = lenu + ii;
5434             iL[jpos] = col;
5435             w[jpos] = A_diag_data[j];
5436             iw[col] = jpos;
5437          }
5438       }
5439 
5440       /*
5441        * main elimination
5442        * need to maintain 2 heaps for L, one heap for col and one heaps for value
5443        * maintian an array for U, and do qsplit with quick sort after that
5444        * while the heap of col is greater than zero
5445        */
5446       while (lenhll > 0)
5447       {
5448 
5449          /* get the next row from top of the heap */
5450          jrow = iL[0];
5451          dpiv = w[0] * D_data[jrow];
5452          w[0] = dpiv;
5453          /* now remove it from the top of the heap */
5454          hypre_ILUMinHeapRemoveIRIi(iL,w,iw,lenhll);
5455          lenhll--;
5456          /*
5457           * reset the drop part to -1
5458           * we don't need this iw anymore
5459           */
5460          iw[jrow] = -1;
5461          /* need to keep this one, move to the end of the heap */
5462          /* no longer need to maintain iw */
5463          hypre_swap2(iL,w,lenhll,kl-lenhlr);
5464          lenhlr++;
5465          hypre_ILUMaxrHeapAddRabsI(w+kl,iL+kl,lenhlr);
5466          /* loop for elimination */
5467          ku = U_diag_i[jrow+1];
5468          for (j = U_diag_i[jrow]; j < ku; j++)
5469          {
5470             col = U_diag_j[j];
5471             icol = iw[col];
5472             lxu = - dpiv*U_diag_data[j];
5473             /* we don't want to fill small number to empty place */
5474             if ((icol == -1) &&
5475                 ((col < nLU && fabs(lxu) < itolb) || (col >= nLU && fabs(lxu) < itolef)))
5476             {
5477                continue;
5478             }
5479             if (icol == -1)
5480             {
5481                if (col < ii)
5482                {
5483                   /* L part
5484                    * not already in L part
5485                    * put it to the end of heap
5486                    * might overwrite some small entries, no issue
5487                    */
5488                   iL[lenhll] = col;
5489                   w[lenhll] = lxu;
5490                   iw[col] = lenhll++;
5491                   /* add to heap, by col number */
5492                   hypre_ILUMinHeapAddIRIi(iL,w,iw,lenhll);
5493                }
5494                else if (col == ii)
5495                {
5496                   w[ii] += lxu;
5497                }
5498                else
5499                {
5500                   /*
5501                    * not already in U part
5502                    * put is to the end of heap
5503                    */
5504                   lenu++;
5505                   jpos = lenu + ii;
5506                   iL[jpos] = col;
5507                   w[jpos] = lxu;
5508                   iw[col] = jpos;
5509                }
5510             }
5511             else
5512             {
5513                w[icol] += lxu;
5514             }
5515          }
5516       }/* while loop for the elimination of current row */
5517 
5518       if (fabs(w[ii]) < MAT_TOL)
5519       {
5520          w[ii]=1e-06;
5521       }
5522       D_data[ii] = 1./w[ii];
5523       iw[ii] = -1;
5524 
5525       /*
5526        * now pick up the largest lfil from L
5527        * L part is guarantee to be larger than itol
5528        */
5529 
5530       lenl = lenhlr < lfil ? lenhlr : lfil;
5531       L_diag_i[ii+1] = L_diag_i[ii] + lenl;
5532       if (lenl > 0)
5533       {
5534          /* test if memory is enough */
5535          while (ctrL + lenl > capacity_L)
5536          {
5537             HYPRE_Int tmp = capacity_L;
5538             capacity_L = capacity_L * EXPAND_FACT + 1;
5539             L_diag_j = hypre_TReAlloc_v2(L_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
5540             L_diag_data = hypre_TReAlloc_v2(L_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_L, HYPRE_MEMORY_DEVICE);
5541          }
5542          ctrL += lenl;
5543          /* copy large data in */
5544          for (j = L_diag_i[ii]; j < ctrL; j++)
5545          {
5546             L_diag_j[j] = iL[kl];
5547             L_diag_data[j] = w[kl];
5548             hypre_ILUMaxrHeapRemoveRabsI(w+kl,iL+kl,lenhlr);
5549             lenhlr--;
5550          }
5551       }
5552       /*
5553        * now reset working array
5554        * L part already reset when move out of heap, only U part
5555        */
5556       ku = lenu+ii;
5557       for (j = ii + 1; j <= ku; j++)
5558       {
5559          iw[iL[j]] = -1;
5560       }
5561 
5562       if (lenu < lfil)
5563       {
5564          /* we simply keep all of the data, no need to sort */
5565          lenhu = lenu;
5566       }
5567       else
5568       {
5569          /* need to sort the first small(hopefully) part of it */
5570          lenhu = lfil;
5571          /* quick split, only sort the first small part of the array */
5572          hypre_ILUMaxQSplitRabsI(w,iL,ii+1,ii+lenhu,ii+lenu);
5573       }
5574 
5575       U_diag_i[ii+1] = U_diag_i[ii] + lenhu;
5576       if (lenhu > 0)
5577       {
5578          /* test if memory is enough */
5579          while (ctrU + lenhu > capacity_U)
5580          {
5581             HYPRE_Int tmp = capacity_U;
5582             capacity_U = capacity_U * EXPAND_FACT + 1;
5583             U_diag_j = hypre_TReAlloc_v2(U_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
5584             U_diag_data = hypre_TReAlloc_v2(U_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_U, HYPRE_MEMORY_DEVICE);
5585          }
5586          ctrU += lenhu;
5587          /* copy large data in */
5588          for (j = U_diag_i[ii]; j < ctrU; j++)
5589          {
5590             jpos = ii+1+j-U_diag_i[ii];
5591             U_diag_j[j] = iL[jpos];
5592             U_diag_data[j] = w[jpos];
5593          }
5594       }
5595       /* check and build u_end array */
5596       if (m > 0)
5597       {
5598          hypre_qsort1(U_diag_j,U_diag_data,U_diag_i[ii],U_diag_i[ii+1]-1);
5599          hypre_BinarySearch2(U_diag_j,nLU,U_diag_i[ii],U_diag_i[ii+1]-1,u_end_array + ii);
5600       }
5601       else
5602       {
5603          /* Everything is in U */
5604          u_end_array[ii] = ctrU;
5605       }
5606    }/* end of ii loop from 0 to nLU-1 */
5607 
5608 
5609    /* now main loop for Schur comlement part */
5610    for (ii = nLU; ii < n; ii++)
5611    {
5612       /* get real row with perm */
5613       i = perm[ii];
5614       k1 = A_diag_i[i];
5615       k2 = A_diag_i[i+1];
5616       kl = nLU-1;
5617       /* reset row norm of ith row */
5618       inorm = .0;
5619       for (j = k1; j < k2; j++)
5620       {
5621          inorm += fabs(A_diag_data[j]);
5622       }
5623       if (inorm == .0)
5624       {
5625          hypre_error_w_msg(HYPRE_ERROR_ARG,"WARNING: ILUT with zero row.\n");
5626       }
5627       inorm /= (HYPRE_Real)(k2-k1);
5628       /* set the scaled tol for that row */
5629       itols = tol[2] * inorm;
5630       itolef = tol[1] * inorm;
5631 
5632       /* reset displacement */
5633       lenhll = lenhlr = lenu = 0;
5634       /* copy in data from A */
5635       for (j = k1; j < k2; j++)
5636       {
5637          /* get now col number */
5638          col = rperm[A_diag_j[j]];
5639          if (col < nLU)
5640          {
5641             /* L part of it */
5642             iL[lenhll] = col;
5643             w[lenhll] = A_diag_data[j];
5644             iw[col] = lenhll++;
5645             /* add to heap, by col number */
5646             hypre_ILUMinHeapAddIRIi(iL,w,iw,lenhll);
5647          }
5648          else if (col == ii)
5649          {
5650             /* the diagonla entry of S */
5651             iL[nLU] = col;
5652             w[nLU] = A_diag_data[j];
5653             iw[col] = nLU;
5654          }
5655          else
5656          {
5657             /* S part of it */
5658             lenu++;
5659             jpos = lenu + nLU;
5660             iL[jpos] = col;
5661             w[jpos] = A_diag_data[j];
5662             iw[col] = jpos;
5663          }
5664       }
5665 
5666       /*
5667        * main elimination
5668        * need to maintain 2 heaps for L, one heap for col and one heaps for value
5669        * maintian an array for S, and do qsplit with quick sort after that
5670        * while the heap of col is greater than zero
5671        */
5672       while (lenhll > 0)
5673       {
5674          /* get the next row from top of the heap */
5675          jrow = iL[0];
5676          dpiv = w[0] * D_data[jrow];
5677          w[0] = dpiv;
5678          /* now remove it from the top of the heap */
5679          hypre_ILUMinHeapRemoveIRIi(iL,w,iw,lenhll);
5680          lenhll--;
5681          /*
5682           * reset the drop part to -1
5683           * we don't need this iw anymore
5684           */
5685          iw[jrow] = -1;
5686          /* need to keep this one, move to the end of the heap */
5687          /* no longer need to maintain iw */
5688          hypre_swap2(iL,w,lenhll,kl-lenhlr);
5689          lenhlr++;
5690          hypre_ILUMaxrHeapAddRabsI(w+kl,iL+kl,lenhlr);
5691          /* loop for elimination */
5692          ku = U_diag_i[jrow+1];
5693          for (j = U_diag_i[jrow]; j < ku; j++)
5694          {
5695             col = U_diag_j[j];
5696             icol = iw[col];
5697             lxu = - dpiv*U_diag_data[j];
5698             /* we don't want to fill small number to empty place */
5699             if ((icol == -1) &&
5700                 ((col < nLU && fabs(lxu) < itolef) || ( col >= nLU && fabs(lxu) < itols )))
5701             {
5702                continue;
5703             }
5704             if (icol == -1)
5705             {
5706                if (col < nLU)
5707                {
5708                   /* L part
5709                    * not already in L part
5710                    * put it to the end of heap
5711                    * might overwrite some small entries, no issue
5712                    */
5713                   iL[lenhll] = col;
5714                   w[lenhll] = lxu;
5715                   iw[col] = lenhll++;
5716                   /* add to heap, by col number */
5717                   hypre_ILUMinHeapAddIRIi(iL,w,iw,lenhll);
5718                }
5719                else if (col == ii)
5720                {
5721                   /* the diagonla entry of S */
5722                   iL[nLU] = col;
5723                   w[nLU] = A_diag_data[j];
5724                   iw[col] = nLU;
5725                }
5726                else
5727                {
5728                   /*
5729                    * not already in S part
5730                    * put is to the end of heap
5731                    */
5732                   lenu++;
5733                   jpos = lenu + nLU;
5734                   iL[jpos] = col;
5735                   w[jpos] = lxu;
5736                   iw[col] = jpos;
5737                }
5738             }
5739             else
5740             {
5741                w[icol] += lxu;
5742             }
5743          }
5744       }/* while loop for the elimination of current row */
5745 
5746       /*
5747        * now pick up the largest lfil from L
5748        * L part is guarantee to be larger than itol
5749        */
5750 
5751       lenl = lenhlr < lfil ? lenhlr : lfil;
5752       L_diag_i[ii+1] = L_diag_i[ii] + lenl;
5753       if (lenl > 0)
5754       {
5755          /* test if memory is enough */
5756          while (ctrL + lenl > capacity_L)
5757          {
5758             HYPRE_Int tmp = capacity_L;
5759             capacity_L = capacity_L * EXPAND_FACT + 1;
5760             L_diag_j = hypre_TReAlloc_v2(L_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
5761             L_diag_data = hypre_TReAlloc_v2(L_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_L, HYPRE_MEMORY_DEVICE);
5762          }
5763          ctrL += lenl;
5764          /* copy large data in */
5765          for (j = L_diag_i[ii]; j < ctrL; j ++)
5766          {
5767             L_diag_j[j] = iL[kl];
5768             L_diag_data[j] = w[kl];
5769             hypre_ILUMaxrHeapRemoveRabsI(w+kl,iL+kl,lenhlr);
5770             lenhlr--;
5771          }
5772       }
5773       /*
5774        * now reset working array
5775        * L part already reset when move out of heap, only S part
5776        */
5777       ku = lenu+nLU;
5778       for (j = nLU; j <= ku; j++)
5779       {
5780          iw[iL[j]] = -1;
5781       }
5782 
5783       /* no dropping at this point of time for S */
5784       //lenhu = lenu < lfil ? lenu : lfil;
5785       lenhu = lenu;
5786       /* quick split, only sort the first small part of the array */
5787       hypre_ILUMaxQSplitRabsI(w,iL,nLU+1,nLU+lenhu,nLU+lenu);
5788       /* we have diagonal in S anyway */
5789       /* test if memory is enough */
5790       while (ctrS + lenhu + 1 > capacity_S)
5791       {
5792          HYPRE_Int tmp = capacity_S;
5793          capacity_S = capacity_S * EXPAND_FACT + 1;
5794          S_diag_j = hypre_TReAlloc_v2(S_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_S, HYPRE_MEMORY_DEVICE);
5795          S_diag_data = hypre_TReAlloc_v2(S_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_S, HYPRE_MEMORY_DEVICE);
5796       }
5797 
5798       ctrS += (lenhu+1);
5799       S_diag_i[ii-nLU+1] = ctrS;
5800 
5801       /* copy large data in, diagonal first */
5802       S_diag_j[S_diag_i[ii-nLU]] = iL[nLU]-nLU;
5803       S_diag_data[S_diag_i[ii-nLU]] = w[nLU];
5804       for (j = S_diag_i[ii-nLU] + 1; j < ctrS; j++)
5805       {
5806          jpos = nLU+j-S_diag_i[ii-nLU];
5807          S_diag_j[j] = iL[jpos]-nLU;
5808          S_diag_data[j] = w[jpos];
5809       }
5810    }/* end of ii loop from nLU to n-1 */
5811 
5812    /*
5813     * 3: Finishing up and free
5814     */
5815 
5816    /* First create Schur complement if necessary
5817     * Check if we need to create Schur complement
5818     */
5819    HYPRE_BigInt big_m = (HYPRE_BigInt)m;
5820    hypre_MPI_Allreduce(&big_m, &total_rows, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
5821    /* only form when total_rows > 0 */
5822    if ( total_rows > 0 )
5823    {
5824       /* now create S */
5825       /* need to get new column start */
5826       {
5827          HYPRE_BigInt global_start;
5828          hypre_MPI_Scan( &big_m, &global_start, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
5829          col_starts[0] = global_start - m;
5830          col_starts[1] = global_start;
5831       }
5832       /* We did nothing to A_offd, so all the data kept, just reorder them
5833        * The create function takes comm, global num rows/cols,
5834        *    row/col start, num cols offd, nnz diag, nnz offd
5835        */
5836       S_offd_nnz = hypre_CSRMatrixNumNonzeros(A_offd);
5837       S_offd_ncols = hypre_CSRMatrixNumCols(A_offd);
5838 
5839       matS = hypre_ParCSRMatrixCreate( comm,
5840             total_rows,
5841             total_rows,
5842             col_starts,
5843             col_starts,
5844             S_offd_ncols,
5845             S_diag_i[m],
5846             S_offd_nnz);
5847 
5848       /* first put diagonal data in */
5849       S_diag = hypre_ParCSRMatrixDiag(matS);
5850 
5851       hypre_CSRMatrixI(S_diag) = S_diag_i;
5852       hypre_CSRMatrixData(S_diag) = S_diag_data;
5853       hypre_CSRMatrixJ(S_diag) = S_diag_j;
5854 
5855       /* now start to construct offdiag of S */
5856       S_offd = hypre_ParCSRMatrixOffd(matS);
5857       S_offd_i = hypre_TAlloc(HYPRE_Int, m+1, HYPRE_MEMORY_DEVICE);
5858       S_offd_j = hypre_TAlloc(HYPRE_Int, S_offd_nnz, HYPRE_MEMORY_DEVICE);
5859       S_offd_data = hypre_TAlloc(HYPRE_Real, S_offd_nnz, HYPRE_MEMORY_DEVICE);
5860       S_offd_colmap = hypre_CTAlloc(HYPRE_BigInt, S_offd_ncols, HYPRE_MEMORY_HOST);
5861 
5862       /* simply use a loop to copy data from A_offd */
5863       S_offd_i[0] = 0;
5864       k3 = 0;
5865       for (i = 1; i <= e; i++)
5866       {
5867          S_offd_i[i] = k3;
5868       }
5869       for (i = 0; i < m_e; i++)
5870       {
5871          col = perm[i + nI];
5872          k1 = A_offd_i[col];
5873          k2 = A_offd_i[col+1];
5874          for (j = k1; j < k2; j++)
5875          {
5876             S_offd_j[k3] = A_offd_j[j];
5877             S_offd_data[k3++] = A_offd_data[j];
5878          }
5879          S_offd_i[i+e+1] = k3;
5880       }
5881 
5882       /* give I, J, DATA to S_offd */
5883       hypre_CSRMatrixI(S_offd) = S_offd_i;
5884       hypre_CSRMatrixJ(S_offd) = S_offd_j;
5885       hypre_CSRMatrixData(S_offd) = S_offd_data;
5886 
5887       /* now we need to update S_offd_colmap */
5888 
5889       /* get total num of send */
5890       num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
5891       begin = hypre_ParCSRCommPkgSendMapStart(comm_pkg,0);
5892       end = hypre_ParCSRCommPkgSendMapStart(comm_pkg,num_sends);
5893       send_buf = hypre_TAlloc(HYPRE_BigInt, end - begin, HYPRE_MEMORY_HOST);
5894       /* copy new index into send_buf */
5895       for (i = begin; i < end; i++)
5896       {
5897          send_buf[i-begin] = rperm[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,i)] - nLU + col_starts[0];
5898       }
5899 
5900       /* main communication */
5901       comm_handle = hypre_ParCSRCommHandleCreate(21, comm_pkg, send_buf, S_offd_colmap);
5902       /* need this to synchronize, Isend & Irecv used in above functions */
5903       hypre_ParCSRCommHandleDestroy(comm_handle);
5904 
5905       /* setup index */
5906       hypre_ParCSRMatrixColMapOffd(matS) = S_offd_colmap;
5907 
5908       hypre_ILUSortOffdColmap(matS);
5909 
5910       /* free */
5911       hypre_TFree(send_buf, HYPRE_MEMORY_HOST);
5912    } /* end of forming S */
5913 
5914    /* now start to construct L and U */
5915    for (k = nLU; k < n; k++)
5916    {
5917       /* set U after nLU to be 0, and diag to be one */
5918       U_diag_i[k+1] = U_diag_i[nLU];
5919       D_data[k] = 1.;
5920    }
5921 
5922    /* create parcsr matrix */
5923    matL = hypre_ParCSRMatrixCreate( comm,
5924          hypre_ParCSRMatrixGlobalNumRows(A),
5925          hypre_ParCSRMatrixGlobalNumRows(A),
5926          hypre_ParCSRMatrixRowStarts(A),
5927          hypre_ParCSRMatrixColStarts(A),
5928          0,
5929          L_diag_i[n],
5930          0 );
5931 
5932    L_diag = hypre_ParCSRMatrixDiag(matL);
5933    hypre_CSRMatrixI(L_diag) = L_diag_i;
5934    if (L_diag_i[n] > 0)
5935    {
5936       hypre_CSRMatrixData(L_diag) = L_diag_data;
5937       hypre_CSRMatrixJ(L_diag) = L_diag_j;
5938    }
5939    else
5940    {
5941       /* we initialized some anyway, so remove if unused */
5942       hypre_TFree(L_diag_j,HYPRE_MEMORY_DEVICE);
5943       hypre_TFree(L_diag_data,HYPRE_MEMORY_DEVICE);
5944    }
5945    /* store (global) total number of nonzeros */
5946    local_nnz = (HYPRE_Real) (L_diag_i[n]);
5947    hypre_MPI_Allreduce(&local_nnz, &total_nnz, 1, HYPRE_MPI_REAL, hypre_MPI_SUM, comm);
5948    hypre_ParCSRMatrixDNumNonzeros(matL) = total_nnz;
5949 
5950    matU = hypre_ParCSRMatrixCreate( comm,
5951          hypre_ParCSRMatrixGlobalNumRows(A),
5952          hypre_ParCSRMatrixGlobalNumRows(A),
5953          hypre_ParCSRMatrixRowStarts(A),
5954          hypre_ParCSRMatrixColStarts(A),
5955          0,
5956          U_diag_i[n],
5957          0 );
5958 
5959    U_diag = hypre_ParCSRMatrixDiag(matU);
5960    hypre_CSRMatrixI(U_diag) = U_diag_i;
5961    if (U_diag_i[n] > 0)
5962    {
5963       hypre_CSRMatrixData(U_diag) = U_diag_data;
5964       hypre_CSRMatrixJ(U_diag) = U_diag_j;
5965    }
5966    else
5967    {
5968       /* we initialized some anyway, so remove if unused */
5969       hypre_TFree(U_diag_j,HYPRE_MEMORY_DEVICE);
5970       hypre_TFree(U_diag_data,HYPRE_MEMORY_DEVICE);
5971    }
5972    /* store (global) total number of nonzeros */
5973    local_nnz = (HYPRE_Real) (U_diag_i[n]);
5974    hypre_MPI_Allreduce(&local_nnz, &total_nnz, 1, HYPRE_MPI_REAL, hypre_MPI_SUM, comm);
5975    hypre_ParCSRMatrixDNumNonzeros(matU) = total_nnz;
5976 
5977    /* free working array */
5978    hypre_TFree(iw,HYPRE_MEMORY_HOST);
5979    hypre_TFree(w,HYPRE_MEMORY_HOST);
5980 
5981    if (!matS)
5982    {
5983       hypre_TFree(S_diag_i,HYPRE_MEMORY_DEVICE);
5984    }
5985 
5986    if (!permp)
5987    {
5988       hypre_TFree(perm, HYPRE_MEMORY_DEVICE);
5989    }
5990 
5991    if (!qpermp)
5992    {
5993       hypre_TFree(qperm, HYPRE_MEMORY_DEVICE);
5994    }
5995 
5996    /* set matrix pointers */
5997    *Lptr = matL;
5998    *Dptr = D_data;
5999    *Uptr = matU;
6000    *Sptr = matS;
6001    *u_end = u_end_array;
6002 
6003    return hypre_error_flag;
6004 }
6005 
6006 
6007 /* NSH setup */
6008 /* Setup NSH data */
6009 HYPRE_Int
hypre_NSHSetup(void * nsh_vdata,hypre_ParCSRMatrix * A,hypre_ParVector * f,hypre_ParVector * u)6010 hypre_NSHSetup( void               *nsh_vdata,
6011                 hypre_ParCSRMatrix *A,
6012                 hypre_ParVector    *f,
6013                 hypre_ParVector    *u )
6014 {
6015    MPI_Comm             comm              = hypre_ParCSRMatrixComm(A);
6016    hypre_ParNSHData     *nsh_data         = (hypre_ParNSHData*) nsh_vdata;
6017 
6018    //   HYPRE_Int            i;
6019    // HYPRE_Int            num_threads;
6020    // HYPRE_Int            debug_flag = 0;
6021 
6022    /* pointers to NSH data */
6023    HYPRE_Int            logging           = hypre_ParNSHDataLogging(nsh_data);
6024    HYPRE_Int            print_level       = hypre_ParNSHDataPrintLevel(nsh_data);
6025 
6026    hypre_ParCSRMatrix   *matA             = hypre_ParNSHDataMatA(nsh_data);
6027    hypre_ParCSRMatrix   *matM             = hypre_ParNSHDataMatM(nsh_data);
6028 
6029    //   HYPRE_Int            n                 = hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(A));
6030    HYPRE_Int            num_procs,  my_id;
6031 
6032    hypre_ParVector      *Utemp;
6033    hypre_ParVector      *Ftemp;
6034    hypre_ParVector      *F_array          = hypre_ParNSHDataF(nsh_data);
6035    hypre_ParVector      *U_array          = hypre_ParNSHDataU(nsh_data);
6036    hypre_ParVector      *residual         = hypre_ParNSHDataResidual(nsh_data);
6037    HYPRE_Real           *rel_res_norms    = hypre_ParNSHDataRelResNorms(nsh_data);
6038 
6039    /* solver setting */
6040    HYPRE_Real           *droptol          = hypre_ParNSHDataDroptol(nsh_data);
6041    HYPRE_Real           mr_tol            = hypre_ParNSHDataMRTol(nsh_data);
6042    HYPRE_Int            mr_max_row_nnz    = hypre_ParNSHDataMRMaxRowNnz(nsh_data);
6043    HYPRE_Int            mr_max_iter       = hypre_ParNSHDataMRMaxIter(nsh_data);
6044    HYPRE_Int            mr_col_version    = hypre_ParNSHDataMRColVersion(nsh_data);
6045    HYPRE_Real           nsh_tol           = hypre_ParNSHDataNSHTol(nsh_data);
6046    HYPRE_Int            nsh_max_row_nnz   = hypre_ParNSHDataNSHMaxRowNnz(nsh_data);
6047    HYPRE_Int            nsh_max_iter      = hypre_ParNSHDataNSHMaxIter(nsh_data);
6048 
6049    /* ----- begin -----*/
6050 
6051    //num_threads = hypre_NumThreads();
6052 
6053    hypre_MPI_Comm_size(comm,&num_procs);
6054    hypre_MPI_Comm_rank(comm,&my_id);
6055 
6056    /* Free Previously allocated data, if any not destroyed */
6057    if (matM)
6058    {
6059       hypre_TFree(matM, HYPRE_MEMORY_HOST);
6060       matM = NULL;
6061    }
6062 
6063    /* clear old l1_norm data, if created */
6064    if (hypre_ParNSHDataL1Norms(nsh_data))
6065    {
6066       hypre_TFree(hypre_ParNSHDataL1Norms(nsh_data), HYPRE_MEMORY_HOST);
6067       hypre_ParNSHDataL1Norms(nsh_data) = NULL;
6068    }
6069 
6070    /* setup temporary storage
6071     * first check is they've already here
6072     */
6073    if (hypre_ParNSHDataUTemp(nsh_data))
6074    {
6075       hypre_ParVectorDestroy(hypre_ParNSHDataUTemp(nsh_data));
6076       hypre_ParNSHDataUTemp(nsh_data) = NULL;
6077    }
6078    if (hypre_ParNSHDataFTemp(nsh_data))
6079    {
6080       hypre_ParVectorDestroy(hypre_ParNSHDataFTemp(nsh_data));
6081       hypre_ParNSHDataFTemp(nsh_data) = NULL;
6082    }
6083    if (hypre_ParNSHDataResidual(nsh_data))
6084    {
6085       hypre_ParVectorDestroy(hypre_ParNSHDataResidual(nsh_data));
6086       hypre_ParNSHDataResidual(nsh_data) = NULL;
6087    }
6088    if (hypre_ParNSHDataRelResNorms(nsh_data))
6089    {
6090       hypre_TFree(hypre_ParNSHDataRelResNorms(nsh_data), HYPRE_MEMORY_HOST);
6091       hypre_ParNSHDataRelResNorms(nsh_data) = NULL;
6092    }
6093 
6094    /* start to create working vectors */
6095    Utemp = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A),
6096          hypre_ParCSRMatrixGlobalNumRows(A),
6097          hypre_ParCSRMatrixRowStarts(A));
6098    hypre_ParVectorInitialize(Utemp);
6099    hypre_ParNSHDataUTemp(nsh_data) = Utemp;
6100 
6101    Ftemp = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A),
6102          hypre_ParCSRMatrixGlobalNumRows(A),
6103          hypre_ParCSRMatrixRowStarts(A));
6104    hypre_ParVectorInitialize(Ftemp);
6105    hypre_ParNSHDataFTemp(nsh_data) = Ftemp;
6106    /* set matrix, solution and rhs pointers */
6107    matA = A;
6108    F_array = f;
6109    U_array = u;
6110 
6111    /* NSH compute approximate inverse, see par_ilu.c */
6112    hypre_ILUParCSRInverseNSH(matA, &matM, droptol, mr_tol, nsh_tol, DIVIDE_TOL, mr_max_row_nnz,
6113          nsh_max_row_nnz, mr_max_iter, nsh_max_iter, mr_col_version, print_level);
6114 
6115    /* set pointers to NSH data */
6116    hypre_ParNSHDataMatA(nsh_data) = matA;
6117    hypre_ParNSHDataF(nsh_data) = F_array;
6118    hypre_ParNSHDataU(nsh_data) = U_array;
6119    hypre_ParNSHDataMatM(nsh_data) = matM;
6120 
6121    /* compute operator complexity */
6122    hypre_ParCSRMatrixSetDNumNonzeros(matA);
6123    hypre_ParCSRMatrixSetDNumNonzeros(matM);
6124    /* compute complexity */
6125    hypre_ParNSHDataOperatorComplexity(nsh_data) =  hypre_ParCSRMatrixDNumNonzeros(matM)/hypre_ParCSRMatrixDNumNonzeros(matA);
6126    if (my_id == 0)
6127    {
6128       hypre_printf("NSH SETUP: operator complexity = %f  \n", hypre_ParNSHDataOperatorComplexity(nsh_data));
6129    }
6130 
6131    if ( logging > 1 ) {
6132       residual =
6133          hypre_ParVectorCreate(hypre_ParCSRMatrixComm(matA),
6134                hypre_ParCSRMatrixGlobalNumRows(matA),
6135                hypre_ParCSRMatrixRowStarts(matA) );
6136       hypre_ParVectorInitialize(residual);
6137       hypre_ParNSHDataResidual(nsh_data)= residual;
6138    }
6139    else{
6140       hypre_ParNSHDataResidual(nsh_data) = NULL;
6141    }
6142    rel_res_norms = hypre_CTAlloc(HYPRE_Real, hypre_ParNSHDataMaxIter(nsh_data), HYPRE_MEMORY_HOST);
6143    hypre_ParNSHDataRelResNorms(nsh_data) = rel_res_norms;
6144 
6145    return hypre_error_flag;
6146 }
6147 
6148 
6149 /* ILU(0) for RAS, has some external rows
6150  * A = input matrix
6151  * perm = permutation array indicating ordering of factorization. Perm could come from a
6152  *    CF_marker array or a reordering routine.
6153  * nLU = size of computed LDU factorization.
6154  * Lptr, Dptr, Uptr, Sptr = L, D, U, S factors.
6155  * will form global Schur Matrix if nLU < n
6156  */
6157 HYPRE_Int
hypre_ILUSetupILU0RAS(hypre_ParCSRMatrix * A,HYPRE_Int * perm,HYPRE_Int nLU,hypre_ParCSRMatrix ** Lptr,HYPRE_Real ** Dptr,hypre_ParCSRMatrix ** Uptr)6158 hypre_ILUSetupILU0RAS(hypre_ParCSRMatrix *A, HYPRE_Int *perm, HYPRE_Int nLU,
6159       hypre_ParCSRMatrix **Lptr, HYPRE_Real** Dptr, hypre_ParCSRMatrix **Uptr)
6160 {
6161    HYPRE_Int                i, ii, j, k, k1, k2, ctrU, ctrL, lenl, lenu, jpiv, col, jpos;
6162    HYPRE_Int                *iw, *iL, *iU;
6163    HYPRE_Real               dd, t, dpiv, lxu, *wU, *wL;
6164 
6165    /* communication stuffs for S */
6166    MPI_Comm                 comm          = hypre_ParCSRMatrixComm(A);
6167    HYPRE_Int                num_procs;
6168    //   HYPRE_Int                S_offd_nnz, S_offd_ncols;
6169    hypre_ParCSRCommPkg      *comm_pkg;
6170    //   hypre_ParCSRCommHandle   *comm_handle;
6171    //   HYPRE_Int                num_sends, begin, end;
6172    //   HYPRE_Int                *send_buf     = NULL;
6173 
6174    /* data objects for A */
6175    hypre_CSRMatrix          *A_diag       = hypre_ParCSRMatrixDiag(A);
6176    hypre_CSRMatrix          *A_offd       = hypre_ParCSRMatrixOffd(A);
6177    HYPRE_Real               *A_diag_data  = hypre_CSRMatrixData(A_diag);
6178    HYPRE_Int                *A_diag_i     = hypre_CSRMatrixI(A_diag);
6179    HYPRE_Int                *A_diag_j     = hypre_CSRMatrixJ(A_diag);
6180    HYPRE_Real               *A_offd_data  = hypre_CSRMatrixData(A_offd);
6181    HYPRE_Int                *A_offd_i     = hypre_CSRMatrixI(A_offd);
6182    HYPRE_Int                *A_offd_j     = hypre_CSRMatrixJ(A_offd);
6183 
6184    /* size of problem and external matrix */
6185    HYPRE_Int                n             =  hypre_CSRMatrixNumRows(A_diag);
6186    //   HYPRE_Int                m             = n - nLU;
6187    HYPRE_Int                ext           = hypre_CSRMatrixNumCols(A_offd);
6188    HYPRE_Int                total_rows    = n + ext;
6189    HYPRE_BigInt             col_starts[2];
6190    HYPRE_BigInt             global_num_rows;
6191    HYPRE_Real               local_nnz, total_nnz;
6192 
6193    /* data objects for L, D, U */
6194    hypre_ParCSRMatrix       *matL;
6195    hypre_ParCSRMatrix       *matU;
6196    hypre_CSRMatrix          *L_diag;
6197    hypre_CSRMatrix          *U_diag;
6198    HYPRE_Real               *D_data;
6199    HYPRE_Real               *L_diag_data;
6200    HYPRE_Int                *L_diag_i;
6201    HYPRE_Int                *L_diag_j;
6202    HYPRE_Real               *U_diag_data;
6203    HYPRE_Int                *U_diag_i;
6204    HYPRE_Int                *U_diag_j;
6205 
6206    /* data objects for E, external matrix */
6207    HYPRE_Int                *E_i;
6208    HYPRE_Int                *E_j;
6209    HYPRE_Real               *E_data;
6210 
6211    /* memory management */
6212    HYPRE_Int                initial_alloc = 0;
6213    HYPRE_Int                capacity_L;
6214    HYPRE_Int                capacity_U;
6215    HYPRE_Int                nnz_A = A_diag_i[n];
6216 
6217    /* reverse permutation array */
6218    HYPRE_Int                *rperm;
6219    /* the original permutation array */
6220    HYPRE_Int                *perm_old;
6221 
6222    /* start setup
6223     * get communication stuffs first
6224     */
6225    hypre_MPI_Comm_size(comm,&num_procs);
6226    comm_pkg = hypre_ParCSRMatrixCommPkg(A);
6227    /* setup if not yet built */
6228    if (!comm_pkg)
6229    {
6230       hypre_MatvecCommPkgCreate(A);
6231       comm_pkg = hypre_ParCSRMatrixCommPkg(A);
6232    }
6233 
6234    /* check for correctness */
6235    if (nLU < 0 || nLU > n)
6236    {
6237       hypre_error_w_msg(HYPRE_ERROR_ARG,"WARNING: nLU out of range.\n");
6238    }
6239 
6240    /* Allocate memory for L,D,U,S factors */
6241    if (n > 0)
6242    {
6243       initial_alloc = (n + ext) + ceil((nnz_A / 2.0)*total_rows/n);
6244    }
6245    capacity_L = initial_alloc;
6246    capacity_U = initial_alloc;
6247 
6248    D_data      = hypre_TAlloc(HYPRE_Real, total_rows, HYPRE_MEMORY_DEVICE);
6249    L_diag_i    = hypre_TAlloc(HYPRE_Int, total_rows+1, HYPRE_MEMORY_DEVICE);
6250    L_diag_j    = hypre_TAlloc(HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
6251    L_diag_data = hypre_TAlloc(HYPRE_Real, capacity_L, HYPRE_MEMORY_DEVICE);
6252    U_diag_i    = hypre_TAlloc(HYPRE_Int, total_rows+1, HYPRE_MEMORY_DEVICE);
6253    U_diag_j    = hypre_TAlloc(HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
6254    U_diag_data = hypre_TAlloc(HYPRE_Real, capacity_U, HYPRE_MEMORY_DEVICE);
6255 
6256    /* allocate working arrays */
6257    iw          = hypre_TAlloc(HYPRE_Int, 4*total_rows, HYPRE_MEMORY_HOST);
6258    iL          = iw+total_rows;
6259    rperm       = iw + 2 * total_rows;
6260    perm_old    = perm;
6261    perm        = iw + 3 * total_rows;
6262    wL          = hypre_TAlloc(HYPRE_Real, total_rows, HYPRE_MEMORY_HOST);
6263    ctrU = ctrL = 0;
6264    L_diag_i[0] = U_diag_i[0] = 0;
6265    /* set marker array iw to -1 */
6266    for (i = 0; i < total_rows; i++)
6267    {
6268       iw[i] = -1;
6269    }
6270 
6271    /* expand perm to suit extra data, remember to free */
6272    for (i = 0; i < n; i++)
6273    {
6274       perm[i] = perm_old[i];
6275    }
6276    for (i = n; i < total_rows; i++)
6277    {
6278       perm[i] = i;
6279    }
6280 
6281    /* get reverse permutation (rperm).
6282     * rperm holds the reordered indexes.
6283     */
6284    for (i = 0; i < total_rows; i++)
6285    {
6286       rperm[perm[i]] = i;
6287    }
6288 
6289    /* get external rows */
6290    hypre_ILUBuildRASExternalMatrix(A, rperm, &E_i, &E_j, &E_data);
6291 
6292    /*---------  Begin Factorization. Work in permuted space  ----
6293     * this is the first part, without offd
6294     */
6295    for (ii = 0; ii < nLU; ii++)
6296    {
6297       // get row i
6298       i = perm[ii];
6299       // get extents of row i
6300       k1=A_diag_i[i];
6301       k2=A_diag_i[i+1];
6302 
6303       /*-------------------- unpack L & U-parts of row of A in arrays w */
6304       iU = iL+ii;
6305       wU = wL+ii;
6306       /*--------------------  diagonal entry */
6307       dd = 0.0;
6308       lenl  = lenu = 0;
6309       iw[ii] = ii;
6310       /*-------------------- scan & unwrap column */
6311       for (j = k1; j < k2; j++)
6312       {
6313          col = rperm[A_diag_j[j]];
6314          t = A_diag_data[j];
6315          if ( col < ii )
6316          {
6317             iw[col] = lenl;
6318             iL[lenl] = col;
6319             wL[lenl++] = t;
6320          }
6321          else if (col > ii)
6322          {
6323             iw[col] = lenu;
6324             iU[lenu] = col;
6325             wU[lenu++] = t;
6326          }
6327          else
6328          {
6329             dd=t;
6330          }
6331       }
6332 
6333       /* eliminate row */
6334       /*-------------------------------------------------------------------------
6335        *  In order to do the elimination in the correct order we must select the
6336        *  smallest column index among iL[k], k = j, j+1, ..., lenl-1. For ILU(0),
6337        *  no new fill-ins are expect, so we can pre-sort iL and wL prior to the
6338        *  entering the elimination loop.
6339        *-----------------------------------------------------------------------*/
6340       //      hypre_quickSortIR(iL, wL, iw, 0, (lenl-1));
6341       hypre_qsort3ir(iL, wL, iw, 0, (lenl-1));
6342       for (j = 0; j < lenl; j++)
6343       {
6344          jpiv = iL[j];
6345          /* get factor/ pivot element */
6346          dpiv = wL[j] * D_data[jpiv];
6347          /* store entry in L */
6348          wL[j] = dpiv;
6349 
6350          /* zero out element - reset pivot */
6351          iw[jpiv] = -1;
6352          /* combine current row and pivot row */
6353          for (k = U_diag_i[jpiv]; k < U_diag_i[jpiv+1]; k++)
6354          {
6355             col = U_diag_j[k];
6356             jpos = iw[col];
6357 
6358             /* Only fill-in nonzero pattern (jpos != 0) */
6359             if (jpos < 0)
6360             {
6361                continue;
6362             }
6363 
6364             lxu = - U_diag_data[k] * dpiv;
6365             if (col < ii)
6366             {
6367                /* dealing with L part */
6368                wL[jpos] += lxu;
6369             }
6370             else if (col > ii)
6371             {
6372                /* dealing with U part */
6373                wU[jpos] += lxu;
6374             }
6375             else
6376             {
6377                /* diagonal update */
6378                dd += lxu;
6379             }
6380          }
6381       }
6382       /* restore iw (only need to restore diagonal and U part */
6383       iw[ii] = -1;
6384       for (j = 0; j < lenu; j++)
6385       {
6386          iw[iU[j]] = -1;
6387       }
6388 
6389       /* Update LDU factors */
6390       /* L part */
6391       /* Check that memory is sufficient */
6392       while ((ctrL+lenl) > capacity_L)
6393       {
6394          HYPRE_Int tmp = capacity_L;
6395          capacity_L = capacity_L * EXPAND_FACT + 1;
6396          L_diag_j = hypre_TReAlloc_v2(L_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
6397          L_diag_data = hypre_TReAlloc_v2(L_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_L, HYPRE_MEMORY_DEVICE);
6398       }
6399       //hypre_TMemcpy(&(L_diag_j)[ctrL], iL, HYPRE_Int, lenl, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
6400       //hypre_TMemcpy(&(L_diag_data)[ctrL], wL, HYPRE_Real, lenl, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
6401       hypre_TMemcpy(&(L_diag_j)[ctrL], iL, HYPRE_Int, lenl,
6402                     HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
6403       hypre_TMemcpy(&(L_diag_data)[ctrL], wL, HYPRE_Real, lenl,
6404                     HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
6405       L_diag_i[ii+1] = (ctrL+=lenl);
6406 
6407       /* diagonal part (we store the inverse) */
6408       if (fabs(dd) < MAT_TOL)
6409       {
6410          dd = 1.0e-6;
6411       }
6412       D_data[ii] = 1./dd;
6413 
6414       /* U part */
6415       /* Check that memory is sufficient */
6416       while ((ctrU+lenu) > capacity_U)
6417       {
6418          HYPRE_Int tmp = capacity_U;
6419          capacity_U = capacity_U * EXPAND_FACT + 1;
6420          U_diag_j = hypre_TReAlloc_v2(U_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
6421          U_diag_data = hypre_TReAlloc_v2(U_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_U, HYPRE_MEMORY_DEVICE);
6422       }
6423       //hypre_TMemcpy(&(U_diag_j)[ctrU], iU, HYPRE_Int, lenu, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
6424       //hypre_TMemcpy(&(U_diag_data)[ctrU], wU, HYPRE_Real, lenu, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
6425       hypre_TMemcpy(&(U_diag_j)[ctrU], iU, HYPRE_Int, lenu,
6426                     HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
6427       hypre_TMemcpy(&(U_diag_data)[ctrU], wU, HYPRE_Real, lenu,
6428                     HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
6429       U_diag_i[ii+1] = (ctrU+=lenu);
6430    }
6431 
6432    /*---------  Begin Factorization in lower part  ----
6433     * here we need to get off diagonals in
6434     */
6435    for (ii = nLU; ii < n; ii++)
6436    {
6437       // get row i
6438       i = perm[ii];
6439       // get extents of row i
6440       k1=A_diag_i[i];
6441       k2=A_diag_i[i+1];
6442 
6443       /*-------------------- unpack L & U-parts of row of A in arrays w */
6444       iU = iL+ii;
6445       wU = wL+ii;
6446       /*--------------------  diagonal entry */
6447       dd = 0.0;
6448       lenl  = lenu = 0;
6449       iw[ii] = ii;
6450       /*-------------------- scan & unwrap column */
6451       for (j = k1; j < k2; j++)
6452       {
6453          col = rperm[A_diag_j[j]];
6454          t = A_diag_data[j];
6455          if (col < ii)
6456          {
6457             iw[col] = lenl;
6458             iL[lenl] = col;
6459             wL[lenl++] = t;
6460          }
6461          else if (col > ii)
6462          {
6463             iw[col] = lenu;
6464             iU[lenu] = col;
6465             wU[lenu++] = t;
6466          }
6467          else
6468          {
6469             dd=t;
6470          }
6471       }
6472 
6473       /*------------------ sjcan offd*/
6474       k1=A_offd_i[i];
6475       k2=A_offd_i[i+1];
6476       for (j = k1; j < k2; j++)
6477       {
6478          /* add offd to U part, all offd are U for this part */
6479          col = A_offd_j[j] + n;
6480          t = A_offd_data[j];
6481          iw[col] = lenu;
6482          iU[lenu] = col;
6483          wU[lenu++] = t;
6484       }
6485 
6486       /* eliminate row */
6487       /*-------------------------------------------------------------------------
6488        *  In order to do the elimination in the correct order we must select the
6489        *  smallest column index among iL[k], k = j, j+1, ..., lenl-1. For ILU(0),
6490        *  no new fill-ins are expect, so we can pre-sort iL and wL prior to the
6491        *  entering the elimination loop.
6492        *-----------------------------------------------------------------------*/
6493       //      hypre_quickSortIR(iL, wL, iw, 0, (lenl-1));
6494       hypre_qsort3ir(iL, wL, iw, 0, (lenl-1));
6495       for (j = 0; j < lenl; j++)
6496       {
6497          jpiv = iL[j];
6498          /* get factor/ pivot element */
6499          dpiv = wL[j] * D_data[jpiv];
6500          /* store entry in L */
6501          wL[j] = dpiv;
6502 
6503          /* zero out element - reset pivot */
6504          iw[jpiv] = -1;
6505          /* combine current row and pivot row */
6506          for (k = U_diag_i[jpiv]; k < U_diag_i[jpiv+1]; k++)
6507          {
6508             col = U_diag_j[k];
6509             jpos = iw[col];
6510 
6511             /* Only fill-in nonzero pattern (jpos != 0) */
6512             if (jpos < 0)
6513             {
6514                continue;
6515             }
6516 
6517             lxu = - U_diag_data[k] * dpiv;
6518             if (col < ii)
6519             {
6520                /* dealing with L part */
6521                wL[jpos] += lxu;
6522             }
6523             else if (col > ii)
6524             {
6525                /* dealing with U part */
6526                wU[jpos] += lxu;
6527             }
6528             else
6529             {
6530                /* diagonal update */
6531                dd += lxu;
6532             }
6533          }
6534       }
6535       /* restore iw (only need to restore diagonal and U part */
6536       iw[ii] = -1;
6537       for (j = 0; j < lenu; j++)
6538       {
6539          iw[iU[j]] = -1;
6540       }
6541 
6542       /* Update LDU factors */
6543       /* L part */
6544       /* Check that memory is sufficient */
6545       while ((ctrL+lenl) > capacity_L)
6546       {
6547          HYPRE_Int tmp = capacity_L;
6548          capacity_L = capacity_L * EXPAND_FACT + 1;
6549          L_diag_j = hypre_TReAlloc_v2(L_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
6550          L_diag_data = hypre_TReAlloc_v2(L_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_L, HYPRE_MEMORY_DEVICE);
6551       }
6552       //hypre_TMemcpy(&(L_diag_j)[ctrL], iL, HYPRE_Int, lenl, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
6553       //hypre_TMemcpy(&(L_diag_data)[ctrL], wL, HYPRE_Real, lenl, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
6554       hypre_TMemcpy(&(L_diag_j)[ctrL], iL, HYPRE_Int, lenl,
6555                     HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
6556       hypre_TMemcpy(&(L_diag_data)[ctrL], wL, HYPRE_Real, lenl,
6557                     HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
6558       L_diag_i[ii+1] = (ctrL+=lenl);
6559 
6560       /* diagonal part (we store the inverse) */
6561       if (fabs(dd) < MAT_TOL)
6562       {
6563          dd = 1.0e-6;
6564       }
6565       D_data[ii] = 1./dd;
6566 
6567       /* U part */
6568       /* Check that memory is sufficient */
6569       while ((ctrU+lenu) > capacity_U)
6570       {
6571          HYPRE_Int tmp = capacity_U;
6572          capacity_U = capacity_U * EXPAND_FACT + 1;
6573          U_diag_j = hypre_TReAlloc_v2(U_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
6574          U_diag_data = hypre_TReAlloc_v2(U_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_U, HYPRE_MEMORY_DEVICE);
6575       }
6576       //hypre_TMemcpy(&(U_diag_j)[ctrU], iU, HYPRE_Int, lenu, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
6577       //hypre_TMemcpy(&(U_diag_data)[ctrU], wU, HYPRE_Real, lenu, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
6578       hypre_TMemcpy(&(U_diag_j)[ctrU], iU, HYPRE_Int, lenu,
6579                     HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
6580       hypre_TMemcpy(&(U_diag_data)[ctrU], wU, HYPRE_Real, lenu,
6581                     HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
6582       U_diag_i[ii+1] = (ctrU+=lenu);
6583    }
6584 
6585    /*---------  Begin Factorization in external part  ----
6586     * here we need to get off diagonals in
6587     */
6588    for (ii = n ; ii < total_rows ; ii++)
6589    {
6590       // get row i
6591       i = ii-n;
6592       // get extents of row i
6593       k1=E_i[i];
6594       k2=E_i[i+1];
6595 
6596       /*-------------------- unpack L & U-parts of row of A in arrays w */
6597       iU = iL+ii;
6598       wU = wL+ii;
6599       /*--------------------  diagonal entry */
6600       dd = 0.0;
6601       lenl  = lenu = 0;
6602       iw[ii] = ii;
6603       /*-------------------- scan & unwrap column */
6604       for (j = k1; j < k2; j++)
6605       {
6606          col = rperm[E_j[j]];
6607          t = E_data[j];
6608          if (col < ii)
6609          {
6610             iw[col] = lenl;
6611             iL[lenl] = col;
6612             wL[lenl++] = t;
6613          }
6614          else if (col > ii)
6615          {
6616             iw[col] = lenu;
6617             iU[lenu] = col;
6618             wU[lenu++] = t;
6619          }
6620          else
6621          {
6622             dd=t;
6623          }
6624       }
6625 
6626       /* eliminate row */
6627       /*-------------------------------------------------------------------------
6628        *  In order to do the elimination in the correct order we must select the
6629        *  smallest column index among iL[k], k = j, j+1, ..., lenl-1. For ILU(0),
6630        *  no new fill-ins are expect, so we can pre-sort iL and wL prior to the
6631        *  entering the elimination loop.
6632        *-----------------------------------------------------------------------*/
6633       //      hypre_quickSortIR(iL, wL, iw, 0, (lenl-1));
6634       hypre_qsort3ir(iL, wL, iw, 0, (lenl-1));
6635       for (j = 0; j < lenl; j++)
6636       {
6637          jpiv = iL[j];
6638          /* get factor/ pivot element */
6639          dpiv = wL[j] * D_data[jpiv];
6640          /* store entry in L */
6641          wL[j] = dpiv;
6642 
6643          /* zero out element - reset pivot */
6644          iw[jpiv] = -1;
6645          /* combine current row and pivot row */
6646          for (k = U_diag_i[jpiv]; k < U_diag_i[jpiv+1]; k++)
6647          {
6648             col = U_diag_j[k];
6649             jpos = iw[col];
6650 
6651             /* Only fill-in nonzero pattern (jpos != 0) */
6652             if (jpos < 0)
6653             {
6654                continue;
6655             }
6656 
6657             lxu = - U_diag_data[k] * dpiv;
6658             if (col < ii)
6659             {
6660                /* dealing with L part */
6661                wL[jpos] += lxu;
6662             }
6663             else if (col > ii)
6664             {
6665                /* dealing with U part */
6666                wU[jpos] += lxu;
6667             }
6668             else
6669             {
6670                /* diagonal update */
6671                dd += lxu;
6672             }
6673          }
6674       }
6675       /* restore iw (only need to restore diagonal and U part */
6676       iw[ii] = -1;
6677       for (j = 0; j < lenu; j++)
6678       {
6679          iw[iU[j]] = -1;
6680       }
6681 
6682       /* Update LDU factors */
6683       /* L part */
6684       /* Check that memory is sufficient */
6685       while ((ctrL+lenl) > capacity_L)
6686       {
6687          HYPRE_Int tmp = capacity_L;
6688          capacity_L = capacity_L * EXPAND_FACT + 1;
6689          L_diag_j = hypre_TReAlloc_v2(L_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
6690          L_diag_data = hypre_TReAlloc_v2(L_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_L, HYPRE_MEMORY_DEVICE);
6691       }
6692       //hypre_TMemcpy(&(L_diag_j)[ctrL], iL, HYPRE_Int, lenl, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
6693       //hypre_TMemcpy(&(L_diag_data)[ctrL], wL, HYPRE_Real, lenl, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
6694       hypre_TMemcpy(&(L_diag_j)[ctrL], iL, HYPRE_Int, lenl,
6695                     HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
6696       hypre_TMemcpy(&(L_diag_data)[ctrL], wL, HYPRE_Real, lenl,
6697                     HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
6698       L_diag_i[ii+1] = (ctrL+=lenl);
6699 
6700       /* diagonal part (we store the inverse) */
6701       if (fabs(dd) < MAT_TOL)
6702       {
6703          dd = 1.0e-6;
6704       }
6705       D_data[ii] = 1./dd;
6706 
6707       /* U part */
6708       /* Check that memory is sufficient */
6709       while ((ctrU+lenu) > capacity_U)
6710       {
6711          HYPRE_Int tmp = capacity_U;
6712          capacity_U = capacity_U * EXPAND_FACT + 1;
6713          U_diag_j = hypre_TReAlloc_v2(U_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
6714          U_diag_data = hypre_TReAlloc_v2(U_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_U, HYPRE_MEMORY_DEVICE);
6715       }
6716       //hypre_TMemcpy(&(U_diag_j)[ctrU], iU, HYPRE_Int, lenu, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
6717       //hypre_TMemcpy(&(U_diag_data)[ctrU], wU, HYPRE_Real, lenu, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
6718       hypre_TMemcpy(&(U_diag_j)[ctrU], iU, HYPRE_Int, lenu,
6719                     HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
6720       hypre_TMemcpy(&(U_diag_data)[ctrU], wU, HYPRE_Real, lenu,
6721                     HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
6722       U_diag_i[ii+1] = (ctrU+=lenu);
6723    }
6724 
6725    HYPRE_BigInt big_total_rows = (HYPRE_BigInt)total_rows;
6726    hypre_MPI_Allreduce(&big_total_rows, &global_num_rows, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
6727 
6728    /* need to get new column start */
6729    {
6730       HYPRE_BigInt global_start;
6731       hypre_MPI_Scan( &big_total_rows, &global_start, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
6732       col_starts[0] = global_start - total_rows;
6733       col_starts[1] = global_start;
6734    }
6735 
6736    matL = hypre_ParCSRMatrixCreate( comm,
6737          global_num_rows,
6738          global_num_rows,
6739          col_starts,
6740          col_starts,
6741          0,
6742          ctrL,
6743          0 );
6744 
6745    L_diag = hypre_ParCSRMatrixDiag(matL);
6746    hypre_CSRMatrixI(L_diag) = L_diag_i;
6747    if (ctrL)
6748    {
6749       hypre_CSRMatrixData(L_diag) = L_diag_data;
6750       hypre_CSRMatrixJ(L_diag) = L_diag_j;
6751    }
6752    else
6753    {
6754       /* we've allocated some memory, so free if not used */
6755       hypre_TFree(L_diag_j,HYPRE_MEMORY_DEVICE);
6756       hypre_TFree(L_diag_data,HYPRE_MEMORY_DEVICE);
6757    }
6758    /* store (global) total number of nonzeros */
6759    local_nnz = (HYPRE_Real) ctrL;
6760    hypre_MPI_Allreduce(&local_nnz, &total_nnz, 1, HYPRE_MPI_REAL, hypre_MPI_SUM, comm);
6761    hypre_ParCSRMatrixDNumNonzeros(matL) = total_nnz;
6762 
6763    matU = hypre_ParCSRMatrixCreate( comm,
6764          global_num_rows,
6765          global_num_rows,
6766          col_starts,
6767          col_starts,
6768          0,
6769          ctrU,
6770          0 );
6771 
6772    U_diag = hypre_ParCSRMatrixDiag(matU);
6773    hypre_CSRMatrixI(U_diag) = U_diag_i;
6774    if (ctrU)
6775    {
6776       hypre_CSRMatrixData(U_diag) = U_diag_data;
6777       hypre_CSRMatrixJ(U_diag) = U_diag_j;
6778    }
6779    else
6780    {
6781       /* we've allocated some memory, so free if not used */
6782       hypre_TFree(U_diag_j,HYPRE_MEMORY_DEVICE);
6783       hypre_TFree(U_diag_data,HYPRE_MEMORY_DEVICE);
6784    }
6785    /* store (global) total number of nonzeros */
6786    local_nnz = (HYPRE_Real) ctrU;
6787    hypre_MPI_Allreduce(&local_nnz, &total_nnz, 1, HYPRE_MPI_REAL, hypre_MPI_SUM, comm);
6788    hypre_ParCSRMatrixDNumNonzeros(matU) = total_nnz;
6789    /* free memory */
6790    hypre_TFree(wL,HYPRE_MEMORY_HOST);
6791    hypre_TFree(iw,HYPRE_MEMORY_HOST);
6792 
6793    /* free external data */
6794    if (E_i)
6795    {
6796       hypre_TFree(E_i, HYPRE_MEMORY_HOST);
6797    }
6798    if (E_j)
6799    {
6800       hypre_TFree(E_j, HYPRE_MEMORY_HOST);
6801       hypre_TFree(E_data, HYPRE_MEMORY_HOST);
6802    }
6803 
6804    /* set matrix pointers */
6805    *Lptr = matL;
6806    *Dptr = D_data;
6807    *Uptr = matU;
6808 
6809    return hypre_error_flag;
6810 }
6811 
6812 
6813 
6814 /* ILU(k) symbolic factorization for RAS
6815  * n = total rows of input
6816  * lfil = level of fill-in, the k in ILU(k)
6817  * perm = permutation array indicating ordering of factorization. Perm could come from a
6818  * rperm = reverse permutation array, used here to avoid duplicate memory allocation
6819  * iw = working array, used here to avoid duplicate memory allocation
6820  * nLU = size of computed LDU factorization.
6821  * A/L/U/E_i = the I slot of A, L, U and E
6822  * A/L/U/E_j = the J slot of A, L, U and E
6823  * will form global Schur Matrix if nLU < n
6824  */
6825 HYPRE_Int
hypre_ILUSetupILUKRASSymbolic(HYPRE_Int n,HYPRE_Int * A_diag_i,HYPRE_Int * A_diag_j,HYPRE_Int * A_offd_i,HYPRE_Int * A_offd_j,HYPRE_Int * E_i,HYPRE_Int * E_j,HYPRE_Int ext,HYPRE_Int lfil,HYPRE_Int * perm,HYPRE_Int * rperm,HYPRE_Int * iw,HYPRE_Int nLU,HYPRE_Int * L_diag_i,HYPRE_Int * U_diag_i,HYPRE_Int ** L_diag_j,HYPRE_Int ** U_diag_j)6826 hypre_ILUSetupILUKRASSymbolic(HYPRE_Int n, HYPRE_Int *A_diag_i, HYPRE_Int *A_diag_j, HYPRE_Int *A_offd_i, HYPRE_Int *A_offd_j,
6827                               HYPRE_Int *E_i, HYPRE_Int *E_j, HYPRE_Int ext,
6828                               HYPRE_Int lfil, HYPRE_Int *perm,
6829                               HYPRE_Int *rperm,   HYPRE_Int *iw,   HYPRE_Int nLU,
6830                               HYPRE_Int *L_diag_i, HYPRE_Int *U_diag_i,
6831                               HYPRE_Int **L_diag_j, HYPRE_Int **U_diag_j)
6832 {
6833    /*
6834     * 1: Setup and create buffers
6835     * A_diag_*: tempory pointer for the diagonal matrix of A and its '*' slot
6836     * ii: outer loop from 0 to nLU - 1
6837     * i: the real col number in diag inside the outer loop
6838     * iw:  working array store the reverse of active col number
6839     * iL: working array store the active col number
6840     * iLev: working array store the active level of current row
6841     * lenl/u: current position in iw and so
6842     * ctrL/U/S: global position in J
6843     */
6844 
6845    HYPRE_Int      *temp_L_diag_j, *temp_U_diag_j, *u_levels;
6846    HYPRE_Int      *iL, *iLev;
6847    HYPRE_Int      ii, i, j, k, ku, lena, lenl, lenu, lenh, ilev, lev, col, icol;
6848    //   HYPRE_Int      m = n - nLU;
6849    HYPRE_Int      total_rows = ext + n;
6850 
6851    /* memory management */
6852    HYPRE_Int      ctrL;
6853    HYPRE_Int      ctrU;
6854    HYPRE_Int      capacity_L;
6855    HYPRE_Int      capacity_U;
6856    HYPRE_Int      initial_alloc = 0;
6857    HYPRE_Int      nnz_A;
6858 
6859    /* set iL and iLev to right place in iw array */
6860    iL             = iw + total_rows;
6861    iLev           = iw + 2*total_rows;
6862 
6863    /* setup initial memory used */
6864    nnz_A          = A_diag_i[n];
6865    if (n > 0)
6866    {
6867       initial_alloc  = (n + ext) + ceil((nnz_A / 2.0) * total_rows / n);
6868    }
6869    capacity_L     = initial_alloc;
6870    capacity_U     = initial_alloc;
6871 
6872    /* allocate other memory for L and U struct */
6873    temp_L_diag_j  = hypre_CTAlloc(HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
6874    temp_U_diag_j  = hypre_CTAlloc(HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
6875 
6876    u_levels       = hypre_CTAlloc(HYPRE_Int, capacity_U, HYPRE_MEMORY_HOST);
6877    ctrL = ctrU = 0;
6878 
6879    /* set initial value for working array */
6880    for (ii = 0; ii < total_rows; ii++)
6881    {
6882       iw[ii] = -1;
6883    }
6884 
6885    /*
6886     * 2: Start of main loop
6887     * those in iL are NEW col index (after permutation)
6888     */
6889    for (ii = 0; ii < nLU; ii++)
6890    {
6891       i = perm[ii];
6892       lenl = 0;
6893       lenh = 0;/* this is the current length of heap */
6894       lenu = ii;
6895       lena = A_diag_i[i+1];
6896       /* put those already inside original pattern, and set their level to 0 */
6897       for (j = A_diag_i[i]; j < lena; j++)
6898       {
6899          /* get the neworder of that col */
6900          col = rperm[A_diag_j[j]];
6901          if (col < ii)
6902          {
6903             /*
6904              * this is an entry in L
6905              * we maintain a heap structure for L part
6906              */
6907             iL[lenh] = col;
6908             iLev[lenh] = 0;
6909             iw[col] = lenh++;
6910             /*now miantian a heap structure*/
6911             hypre_ILUMinHeapAddIIIi(iL,iLev,iw,lenh);
6912          }
6913          else if (col > ii)
6914          {
6915             /* this is an entry in U */
6916             iL[lenu] = col;
6917             iLev[lenu] = 0;
6918             iw[col] = lenu++;
6919          }
6920       }/* end of j loop for adding pattern in original matrix */
6921 
6922       /*
6923        * search lower part of current row and update pattern based on level
6924        */
6925       while (lenh > 0)
6926       {
6927          /*
6928           * k is now the new col index after permutation
6929           * the first element of the heap is the smallest
6930           */
6931          k = iL[0];
6932          ilev = iLev[0];
6933          /*
6934           * we now need to maintain the heap structure
6935           */
6936          hypre_ILUMinHeapRemoveIIIi(iL,iLev,iw,lenh);
6937          lenh--;
6938          /* copy to the end of array */
6939          lenl++;
6940          /* reset iw for that, not using anymore */
6941          iw[k]=-1;
6942          hypre_swap2i(iL,iLev,ii-lenl,lenh);
6943          /*
6944           * now the elimination on current row could start.
6945           * eliminate row k (new index) from current row
6946           */
6947          ku = U_diag_i[k+1];
6948          for (j = U_diag_i[k]; j < ku; j++)
6949          {
6950             col = temp_U_diag_j[j];
6951             lev = u_levels[j] + ilev + 1;
6952             /* ignore large level */
6953             icol = iw[col];
6954             /* skill large level */
6955             if (lev > lfil)
6956             {
6957                continue;
6958             }
6959             if (icol < 0)
6960             {
6961                /* not yet in */
6962                if (col < ii)
6963                {
6964                   /*
6965                    * if we add to the left L, we need to maintian the
6966                    *    heap structure
6967                    */
6968                   iL[lenh] = col;
6969                   iLev[lenh] = lev;
6970                   iw[col] = lenh++;
6971                   /*swap it with the element right after the heap*/
6972 
6973                   /* maintain the heap */
6974                   hypre_ILUMinHeapAddIIIi(iL,iLev,iw,lenh);
6975                }
6976                else if (col > ii)
6977                {
6978                   iL[lenu] = col;
6979                   iLev[lenu] = lev;
6980                   iw[col] = lenu++;
6981                }
6982             }
6983             else
6984             {
6985                iLev[icol] = hypre_min(lev, iLev[icol]);
6986             }
6987          }/* end of loop j for level update */
6988       }/* end of while loop for iith row */
6989 
6990       /* now update everything, indices, levels and so */
6991       L_diag_i[ii+1] = L_diag_i[ii] + lenl;
6992       if (lenl > 0)
6993       {
6994          /* check if memory is enough */
6995          while (ctrL + lenl > capacity_L)
6996          {
6997             HYPRE_Int tmp = capacity_L;
6998             capacity_L = capacity_L * EXPAND_FACT + 1;
6999             temp_L_diag_j = hypre_TReAlloc_v2(temp_L_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
7000          }
7001          /* now copy L data, reverse order */
7002          for (j = 0; j < lenl; j++)
7003          {
7004             temp_L_diag_j[ctrL+j] = iL[ii-j-1];
7005          }
7006          ctrL += lenl;
7007       }
7008       k = lenu - ii;
7009       U_diag_i[ii+1] = U_diag_i[ii] + k;
7010       if (k > 0)
7011       {
7012          /* check if memory is enough */
7013          while (ctrU + k > capacity_U)
7014          {
7015             HYPRE_Int tmp = capacity_U;
7016             capacity_U = capacity_U * EXPAND_FACT + 1;
7017             temp_U_diag_j = hypre_TReAlloc_v2(temp_U_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
7018             u_levels = hypre_TReAlloc_v2(u_levels, HYPRE_Int, tmp, HYPRE_Int, capacity_U, HYPRE_MEMORY_HOST);
7019          }
7020          //hypre_TMemcpy(temp_U_diag_j+ctrU,iL+ii,HYPRE_Int,k,HYPRE_MEMORY_DEVICE,HYPRE_MEMORY_HOST);
7021          hypre_TMemcpy(temp_U_diag_j+ctrU, iL+ii, HYPRE_Int, k,
7022                        HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
7023          hypre_TMemcpy(u_levels+ctrU, iLev+ii, HYPRE_Int, k,
7024                        HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
7025          ctrU += k;
7026       }
7027 
7028       /* reset iw */
7029       for (j = ii; j < lenu; j++)
7030       {
7031          iw[iL[j]] = -1;
7032       }
7033 
7034    }/* end of main loop ii from 0 to nLU-1 */
7035 
7036    /*
7037     * Offd part
7038     */
7039    for (ii = nLU; ii < n; ii++)
7040    {
7041       i = perm[ii];
7042       lenl = 0;
7043       lenh = 0;/* this is the current length of heap */
7044       lenu = ii;
7045       lena = A_diag_i[i+1];
7046       /* put those already inside original pattern, and set their level to 0 */
7047       for (j = A_diag_i[i]; j < lena; j++)
7048       {
7049          /* get the neworder of that col */
7050          col = rperm[A_diag_j[j]];
7051          if (col < ii)
7052          {
7053             /*
7054              * this is an entry in L
7055              * we maintain a heap structure for L part
7056              */
7057             iL[lenh] = col;
7058             iLev[lenh] = 0;
7059             iw[col] = lenh++;
7060             /*now miantian a heap structure*/
7061             hypre_ILUMinHeapAddIIIi(iL,iLev,iw,lenh);
7062          }
7063          else if (col > ii)
7064          {
7065             /* this is an entry in U */
7066             iL[lenu] = col;
7067             iLev[lenu] = 0;
7068             iw[col] = lenu++;
7069          }
7070       }/* end of j loop for adding pattern in original matrix */
7071 
7072       /* put those already inside offd pattern in, and set their level to 0 */
7073       lena = A_offd_i[i+1];
7074       for (j = A_offd_i[i]; j < lena; j++)
7075       {
7076          /* the offd cols are in order */
7077          col = A_offd_j[j] + n;
7078          /* col for sure to be greater than ii */
7079          iL[lenu] = col;
7080          iLev[lenu] = 0;
7081          iw[col] = lenu++;
7082       }
7083 
7084       /*
7085        * search lower part of current row and update pattern based on level
7086        */
7087       while (lenh > 0)
7088       {
7089          /*
7090           * k is now the new col index after permutation
7091           * the first element of the heap is the smallest
7092           */
7093          k = iL[0];
7094          ilev = iLev[0];
7095          /*
7096           * we now need to maintain the heap structure
7097           */
7098          hypre_ILUMinHeapRemoveIIIi(iL,iLev,iw,lenh);
7099          lenh--;
7100          /* copy to the end of array */
7101          lenl++;
7102          /* reset iw for that, not using anymore */
7103          iw[k]=-1;
7104          hypre_swap2i(iL,iLev,ii-lenl,lenh);
7105          /*
7106           * now the elimination on current row could start.
7107           * eliminate row k (new index) from current row
7108           */
7109          ku = U_diag_i[k+1];
7110          for (j = U_diag_i[k]; j < ku; j++)
7111          {
7112             col = temp_U_diag_j[j];
7113             lev = u_levels[j] + ilev + 1;
7114             /* ignore large level */
7115             icol = iw[col];
7116             /* skill large level */
7117             if (lev > lfil)
7118             {
7119                continue;
7120             }
7121             if (icol < 0)
7122             {
7123                /* not yet in */
7124                if (col < ii)
7125                {
7126                   /*
7127                    * if we add to the left L, we need to maintian the
7128                    *    heap structure
7129                    */
7130                   iL[lenh] = col;
7131                   iLev[lenh] = lev;
7132                   iw[col] = lenh++;
7133                   /*swap it with the element right after the heap*/
7134 
7135                   /* maintain the heap */
7136                   hypre_ILUMinHeapAddIIIi(iL,iLev,iw,lenh);
7137                }
7138                else if (col > ii)
7139                {
7140                   iL[lenu] = col;
7141                   iLev[lenu] = lev;
7142                   iw[col] = lenu++;
7143                }
7144             }
7145             else
7146             {
7147                iLev[icol] = hypre_min(lev, iLev[icol]);
7148             }
7149          }/* end of loop j for level update */
7150       }/* end of while loop for iith row */
7151 
7152       /* now update everything, indices, levels and so */
7153       L_diag_i[ii+1] = L_diag_i[ii] + lenl;
7154       if (lenl > 0)
7155       {
7156          /* check if memory is enough */
7157          while (ctrL + lenl > capacity_L)
7158          {
7159             HYPRE_Int tmp = capacity_L;
7160             capacity_L = capacity_L * EXPAND_FACT + 1;
7161             temp_L_diag_j = hypre_TReAlloc_v2(temp_L_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
7162          }
7163          /* now copy L data, reverse order */
7164          for (j = 0; j < lenl; j++)
7165          {
7166             temp_L_diag_j[ctrL+j] = iL[ii-j-1];
7167          }
7168          ctrL += lenl;
7169       }
7170       k = lenu - ii;
7171       U_diag_i[ii+1] = U_diag_i[ii] + k;
7172       if (k > 0)
7173       {
7174          /* check if memory is enough */
7175          while (ctrU + k > capacity_U)
7176          {
7177             HYPRE_Int tmp = capacity_U;
7178             capacity_U = capacity_U * EXPAND_FACT + 1;
7179             temp_U_diag_j = hypre_TReAlloc_v2(temp_U_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
7180             u_levels = hypre_TReAlloc_v2(u_levels, HYPRE_Int, tmp, HYPRE_Int, capacity_U, HYPRE_MEMORY_HOST);
7181          }
7182          //hypre_TMemcpy(temp_U_diag_j+ctrU,iL+ii,HYPRE_Int,k,HYPRE_MEMORY_DEVICE,HYPRE_MEMORY_HOST);
7183          hypre_TMemcpy(temp_U_diag_j+ctrU, iL+ii, HYPRE_Int, k,
7184                        HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
7185          hypre_TMemcpy(u_levels+ctrU, iLev+ii, HYPRE_Int, k,
7186                        HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
7187          ctrU += k;
7188       }
7189 
7190       /* reset iw */
7191       for (j = ii; j < lenu; j++)
7192       {
7193          iw[iL[j]] = -1;
7194       }
7195    } /* end of main loop ii from nLU to n */
7196 
7197    /* external part matrix */
7198    for (ii = n ; ii < total_rows ; ii ++)
7199    {
7200       i = ii - n;
7201       lenl = 0;
7202       lenh = 0;/* this is the current length of heap */
7203       lenu = ii;
7204       lena = E_i[i+1];
7205       /* put those already inside original pattern, and set their level to 0 */
7206       for (j = E_i[i]; j < lena; j++)
7207       {
7208          /* get the neworder of that col */
7209          col = E_j[j];
7210          if (col < ii)
7211          {
7212             /*
7213              * this is an entry in L
7214              * we maintain a heap structure for L part
7215              */
7216             iL[lenh] = col;
7217             iLev[lenh] = 0;
7218             iw[col] = lenh++;
7219             /*now miantian a heap structure*/
7220             hypre_ILUMinHeapAddIIIi(iL,iLev,iw,lenh);
7221          }
7222          else if (col > ii)
7223          {
7224             /* this is an entry in U */
7225             iL[lenu] = col;
7226             iLev[lenu] = 0;
7227             iw[col] = lenu++;
7228          }
7229       }/* end of j loop for adding pattern in original matrix */
7230 
7231       /*
7232        * search lower part of current row and update pattern based on level
7233        */
7234       while (lenh > 0)
7235       {
7236          /*
7237           * k is now the new col index after permutation
7238           * the first element of the heap is the smallest
7239           */
7240          k = iL[0];
7241          ilev = iLev[0];
7242          /*
7243           * we now need to maintain the heap structure
7244           */
7245          hypre_ILUMinHeapRemoveIIIi(iL,iLev,iw,lenh);
7246          lenh--;
7247          /* copy to the end of array */
7248          lenl++;
7249          /* reset iw for that, not using anymore */
7250          iw[k]=-1;
7251          hypre_swap2i(iL,iLev,ii-lenl,lenh);
7252          /*
7253           * now the elimination on current row could start.
7254           * eliminate row k (new index) from current row
7255           */
7256          ku = U_diag_i[k+1];
7257          for (j = U_diag_i[k]; j < ku; j++)
7258          {
7259             col = temp_U_diag_j[j];
7260             lev = u_levels[j] + ilev + 1;
7261             /* ignore large level */
7262             icol = iw[col];
7263             /* skill large level */
7264             if (lev > lfil)
7265             {
7266                continue;
7267             }
7268             if (icol < 0)
7269             {
7270                /* not yet in */
7271                if (col < ii)
7272                {
7273                   /*
7274                    * if we add to the left L, we need to maintian the
7275                    *    heap structure
7276                    */
7277                   iL[lenh] = col;
7278                   iLev[lenh] = lev;
7279                   iw[col] = lenh++;
7280                   /*swap it with the element right after the heap*/
7281 
7282                   /* maintain the heap */
7283                   hypre_ILUMinHeapAddIIIi(iL,iLev,iw,lenh);
7284                }
7285                else if (col > ii)
7286                {
7287                   iL[lenu] = col;
7288                   iLev[lenu] = lev;
7289                   iw[col] = lenu++;
7290                }
7291             }
7292             else
7293             {
7294                iLev[icol] = hypre_min(lev, iLev[icol]);
7295             }
7296          }/* end of loop j for level update */
7297       }/* end of while loop for iith row */
7298 
7299       /* now update everything, indices, levels and so */
7300       L_diag_i[ii+1] = L_diag_i[ii] + lenl;
7301       if (lenl > 0)
7302       {
7303          /* check if memory is enough */
7304          while (ctrL + lenl > capacity_L)
7305          {
7306             HYPRE_Int tmp = capacity_L;
7307             capacity_L = capacity_L * EXPAND_FACT + 1;
7308             temp_L_diag_j = hypre_TReAlloc_v2(temp_L_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
7309          }
7310          /* now copy L data, reverse order */
7311          for (j = 0; j < lenl; j++)
7312          {
7313             temp_L_diag_j[ctrL+j] = iL[ii-j-1];
7314          }
7315          ctrL += lenl;
7316       }
7317       k = lenu - ii;
7318       U_diag_i[ii+1] = U_diag_i[ii] + k;
7319       if (k > 0)
7320       {
7321          /* check if memory is enough */
7322          while (ctrU + k > capacity_U)
7323          {
7324             HYPRE_Int tmp = capacity_U;
7325             capacity_U = capacity_U * EXPAND_FACT + 1;
7326             temp_U_diag_j = hypre_TReAlloc_v2(temp_U_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
7327             u_levels = hypre_TReAlloc_v2(u_levels, HYPRE_Int, tmp, HYPRE_Int, capacity_U, HYPRE_MEMORY_HOST);
7328          }
7329          //hypre_TMemcpy(temp_U_diag_j+ctrU,iL+ii,HYPRE_Int,k,HYPRE_MEMORY_DEVICE,HYPRE_MEMORY_HOST);
7330          hypre_TMemcpy(temp_U_diag_j+ctrU, iL+ii, HYPRE_Int, k,
7331                        HYPRE_MEMORY_HOST,HYPRE_MEMORY_HOST);
7332          hypre_TMemcpy(u_levels+ctrU, iLev+ii, HYPRE_Int, k,
7333                        HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
7334          ctrU += k;
7335       }
7336 
7337       /* reset iw */
7338       for (j = ii; j < lenu; j++)
7339       {
7340          iw[iL[j]] = -1;
7341       }
7342 
7343    }/* end of main loop ii from n to total_rows */
7344 
7345    /*
7346     * 3: Finishing up and free memory
7347     */
7348    hypre_TFree(u_levels,HYPRE_MEMORY_HOST);
7349 
7350    *L_diag_j = temp_L_diag_j;
7351    *U_diag_j = temp_U_diag_j;
7352 
7353    return hypre_error_flag;
7354 }
7355 
7356 /* ILU(k) for RAS
7357  * A: input matrix
7358  * lfil: level of fill-in, the k in ILU(k)
 * perm: permutation array indicating ordering of factorization. Perm could come from a
 *       CF_marker array or a reordering routine.
7361  * nLU: size of computed LDU factorization.
7362  * Lptr, Dptr, Uptr: L, D, U factors.
7363  */
7364 HYPRE_Int
hypre_ILUSetupILUKRAS(hypre_ParCSRMatrix * A,HYPRE_Int lfil,HYPRE_Int * perm,HYPRE_Int nLU,hypre_ParCSRMatrix ** Lptr,HYPRE_Real ** Dptr,hypre_ParCSRMatrix ** Uptr)7365 hypre_ILUSetupILUKRAS(hypre_ParCSRMatrix *A, HYPRE_Int lfil, HYPRE_Int *perm, HYPRE_Int nLU,
7366       hypre_ParCSRMatrix **Lptr, HYPRE_Real** Dptr, hypre_ParCSRMatrix **Uptr)
7367 {
7368    /*
7369     * 1: Setup and create buffers
7370     * matL/U: the ParCSR matrix for L and U
7371     * L/U_diag: the diagonal csr matrix of matL/U
7372     * A_diag_*: tempory pointer for the diagonal matrix of A and its '*' slot
7373     * ii = outer loop from 0 to nLU - 1
7374     * i = the real col number in diag inside the outer loop
7375     * iw =  working array store the reverse of active col number
7376     * iL = working array store the active col number
7377     */
7378 
7379    /* call ILU0 if lfil is 0 */
7380    if (lfil == 0)
7381    {
7382       return hypre_ILUSetupILU0RAS(A,perm,nLU,Lptr,Dptr,Uptr);
7383    }
7384    HYPRE_Int               i, ii, j, k, k1, k2, kl, ku, jpiv, col, icol;
7385    HYPRE_Int               *iw;
7386    MPI_Comm                comm           = hypre_ParCSRMatrixComm(A);
7387    HYPRE_Int               num_procs;
7388 
7389    /* data objects for A */
7390    hypre_CSRMatrix         *A_diag        = hypre_ParCSRMatrixDiag(A);
7391    hypre_CSRMatrix         *A_offd        = hypre_ParCSRMatrixOffd(A);
7392    HYPRE_Real              *A_diag_data   = hypre_CSRMatrixData(A_diag);
7393    HYPRE_Int               *A_diag_i      = hypre_CSRMatrixI(A_diag);
7394    HYPRE_Int               *A_diag_j      = hypre_CSRMatrixJ(A_diag);
7395    HYPRE_Real              *A_offd_data   = hypre_CSRMatrixData(A_offd);
7396    HYPRE_Int               *A_offd_i      = hypre_CSRMatrixI(A_offd);
7397    HYPRE_Int               *A_offd_j      = hypre_CSRMatrixJ(A_offd);
7398 
7399    /* data objects for L, D, U */
7400    hypre_ParCSRMatrix      *matL;
7401    hypre_ParCSRMatrix      *matU;
7402    hypre_CSRMatrix         *L_diag;
7403    hypre_CSRMatrix         *U_diag;
7404    HYPRE_Real              *D_data;
7405    HYPRE_Real              *L_diag_data   = NULL;
7406    HYPRE_Int               *L_diag_i;
7407    HYPRE_Int               *L_diag_j      = NULL;
7408    HYPRE_Real              *U_diag_data   = NULL;
7409    HYPRE_Int               *U_diag_i;
7410    HYPRE_Int               *U_diag_j      = NULL;
7411 
7412    /* size of problem and external matrix */
7413    HYPRE_Int               n              = hypre_CSRMatrixNumRows(A_diag);
7414    //   HYPRE_Int               m              = n - nLU;
7415    HYPRE_Int               ext            = hypre_CSRMatrixNumCols(A_offd);
7416    HYPRE_Int               total_rows     = n + ext;
7417    HYPRE_BigInt            global_num_rows;
7418    HYPRE_BigInt            col_starts[2];
7419    HYPRE_Real              local_nnz, total_nnz;
7420 
7421    /* data objects for E, external matrix */
7422    HYPRE_Int               *E_i;
7423    HYPRE_Int               *E_j;
7424    HYPRE_Real              *E_data;
7425 
7426    /* communication */
7427    hypre_ParCSRCommPkg     *comm_pkg;
7428    hypre_MPI_Comm_size(comm, &num_procs);
7429    //   hypre_ParCSRCommHandle  *comm_handle;
7430    //   HYPRE_Int               *send_buf      = NULL;
7431 
7432    /* reverse permutation array */
7433    HYPRE_Int               *rperm;
7434    /* temp array for old permutation */
7435    HYPRE_Int               *perm_old;
7436 
7437    /* start setup */
7438    /* check input and get problem size */
7439    n =  hypre_CSRMatrixNumRows(A_diag);
7440    if (nLU < 0 || nLU > n)
7441    {
7442       hypre_error_w_msg(HYPRE_ERROR_ARG,"WARNING: nLU out of range.\n");
7443    }
7444 
7445    /* Init I array anyway. S's might be freed later */
7446    D_data   = hypre_CTAlloc(HYPRE_Real, total_rows, HYPRE_MEMORY_DEVICE);
7447    L_diag_i = hypre_CTAlloc(HYPRE_Int, (total_rows+1), HYPRE_MEMORY_DEVICE);
7448    U_diag_i = hypre_CTAlloc(HYPRE_Int, (total_rows+1), HYPRE_MEMORY_DEVICE);
7449 
7450    /* set Comm_Pkg if not yet built */
7451    comm_pkg = hypre_ParCSRMatrixCommPkg(A);
7452    if (!comm_pkg)
7453    {
7454       hypre_MatvecCommPkgCreate(A);
7455       comm_pkg = hypre_ParCSRMatrixCommPkg(A);
7456    }
7457 
7458    /*
7459     * 2: Symbolic factorization
7460     * setup iw and rperm first
7461     */
7462    /* allocate work arrays */
7463    iw          = hypre_CTAlloc(HYPRE_Int, 5*total_rows, HYPRE_MEMORY_HOST);
7464    rperm       = iw + 3*total_rows;
7465    perm_old    = perm;
7466    perm        = iw + 4*total_rows;
7467    L_diag_i[0] = U_diag_i[0] = 0;
7468    /* get reverse permutation (rperm).
7469     * rperm holds the reordered indexes.
7470     */
7471    for (i = 0; i < n; i++)
7472    {
7473       perm[i] = perm_old[i];
7474    }
7475    for (i = n; i < total_rows; i++)
7476    {
7477       perm[i] = i;
7478    }
7479    for (i = 0; i < total_rows; i++)
7480    {
7481       rperm[perm[i]] = i;
7482    }
7483 
7484    /* get external rows */
7485    hypre_ILUBuildRASExternalMatrix(A,rperm,&E_i,&E_j,&E_data);
7486    /* do symbolic factorization */
7487    hypre_ILUSetupILUKRASSymbolic(n, A_diag_i, A_diag_j, A_offd_i, A_offd_j, E_i, E_j, ext, lfil, perm, rperm, iw,
7488          nLU, L_diag_i, U_diag_i, &L_diag_j, &U_diag_j);
7489 
7490    /*
7491     * after this, we have our I,J for L, U and S ready, and L sorted
7492     * iw are still -1 after symbolic factorization
7493     * now setup helper array here
7494     */
7495    if (L_diag_i[total_rows])
7496    {
7497       L_diag_data = hypre_CTAlloc(HYPRE_Real, L_diag_i[total_rows], HYPRE_MEMORY_DEVICE);
7498    }
7499    if (U_diag_i[total_rows])
7500    {
7501       U_diag_data = hypre_CTAlloc(HYPRE_Real, U_diag_i[total_rows], HYPRE_MEMORY_DEVICE);
7502    }
7503 
7504    /*
7505     * 3: Begin real factorization
7506     * we already have L and U structure ready, so no extra working array needed
7507     */
7508    /* first loop for upper part */
7509    for (ii = 0; ii < nLU; ii++)
7510    {
7511       // get row i
7512       i = perm[ii];
7513       kl = L_diag_i[ii+1];
7514       ku = U_diag_i[ii+1];
7515       k1 = A_diag_i[i];
7516       k2 = A_diag_i[i+1];
7517       /* set up working arrays */
7518       for (j = L_diag_i[ii]; j < kl; j++)
7519       {
7520          col = L_diag_j[j];
7521          iw[col] = j;
7522       }
7523       D_data[ii] = 0.0;
7524       iw[ii] = ii;
7525       for (j = U_diag_i[ii]; j < ku; j++)
7526       {
7527          col = U_diag_j[j];
7528          iw[col] = j;
7529       }
7530       /* copy data from A into L, D and U */
7531       for (j = k1; j < k2; j++)
7532       {
7533          /* compute everything in new index */
7534          col = rperm[A_diag_j[j]];
7535          icol = iw[col];
7536          /* A for sure to be inside the pattern */
7537          if (col < ii)
7538          {
7539             L_diag_data[icol] = A_diag_data[j];
7540          }
7541          else if (col == ii)
7542          {
7543             D_data[ii] = A_diag_data[j];
7544          }
7545          else
7546          {
7547             U_diag_data[icol] = A_diag_data[j];
7548          }
7549       }
7550       /* elimination */
7551       for (j = L_diag_i[ii]; j < kl; j++)
7552       {
7553          jpiv = L_diag_j[j];
7554          L_diag_data[j] *= D_data[jpiv];
7555          ku = U_diag_i[jpiv+1];
7556 
7557          for (k = U_diag_i[jpiv]; k < ku; k++)
7558          {
7559             col = U_diag_j[k];
7560             icol = iw[col];
7561             if (icol < 0)
7562             {
7563                /* not in partern */
7564                continue;
7565             }
7566             if (col < ii)
7567             {
7568                /* L part */
7569                L_diag_data[icol] -= L_diag_data[j]*U_diag_data[k];
7570             }
7571             else if (col == ii)
7572             {
7573                /* diag part */
7574                D_data[icol] -= L_diag_data[j]*U_diag_data[k];
7575             }
7576             else
7577             {
7578                /* U part */
7579                U_diag_data[icol] -= L_diag_data[j]*U_diag_data[k];
7580             }
7581          }
7582       }
7583       /* reset working array */
7584       ku = U_diag_i[ii+1];
7585       for (j = L_diag_i[ii]; j < kl; j++)
7586       {
7587          col = L_diag_j[j];
7588          iw[col] = -1;
7589       }
7590       iw[ii] = -1;
7591       for (j = U_diag_i[ii]; j < ku; j++)
7592       {
7593          col = U_diag_j[j];
7594          iw[col] = -1;
7595       }
7596 
7597       /* diagonal part (we store the inverse) */
7598       if (fabs(D_data[ii]) < MAT_TOL)
7599       {
7600          D_data[ii] = 1e-06;
7601       }
7602       D_data[ii] = 1./ D_data[ii];
7603 
7604    }/* end of loop for upper part */
7605 
7606    /* first loop for upper part */
7607    for (ii = nLU; ii < n; ii++)
7608    {
7609       // get row i
7610       i = perm[ii];
7611       kl = L_diag_i[ii+1];
7612       ku = U_diag_i[ii+1];
7613       /* set up working arrays */
7614       for (j = L_diag_i[ii]; j < kl; j++)
7615       {
7616          col = L_diag_j[j];
7617          iw[col] = j;
7618       }
7619       D_data[ii] = 0.0;
7620       iw[ii] = ii;
7621       for (j = U_diag_i[ii]; j < ku; j++)
7622       {
7623          col = U_diag_j[j];
7624          iw[col] = j;
7625       }
7626       /* copy data from A into L, D and U */
7627       k1 = A_diag_i[i];
7628       k2 = A_diag_i[i+1];
7629       for (j = k1; j < k2; j++)
7630       {
7631          /* compute everything in new index */
7632          col = rperm[A_diag_j[j]];
7633          icol = iw[col];
7634          /* A for sure to be inside the pattern */
7635          if (col < ii)
7636          {
7637             L_diag_data[icol] = A_diag_data[j];
7638          }
7639          else if (col == ii)
7640          {
7641             D_data[ii] = A_diag_data[j];
7642          }
7643          else
7644          {
7645             U_diag_data[icol] = A_diag_data[j];
7646          }
7647       }
7648       /* copy data from A_offd into L, D and U */
7649       k1 = A_offd_i[i];
7650       k2 = A_offd_i[i+1];
7651       for (j = k1; j < k2; j++)
7652       {
7653          /* compute everything in new index */
7654          col = A_offd_j[j] + n;
7655          icol = iw[col];
7656          U_diag_data[icol] = A_offd_data[j];
7657       }
7658       /* elimination */
7659       for (j = L_diag_i[ii]; j < kl; j++)
7660       {
7661          jpiv = L_diag_j[j];
7662          L_diag_data[j] *= D_data[jpiv];
7663          ku = U_diag_i[jpiv+1];
7664 
7665          for (k = U_diag_i[jpiv]; k < ku; k++)
7666          {
7667             col = U_diag_j[k];
7668             icol = iw[col];
7669             if (icol < 0)
7670             {
7671                /* not in partern */
7672                continue;
7673             }
7674             if (col < ii)
7675             {
7676                /* L part */
7677                L_diag_data[icol] -= L_diag_data[j]*U_diag_data[k];
7678             }
7679             else if (col == ii)
7680             {
7681                /* diag part */
7682                D_data[icol] -= L_diag_data[j]*U_diag_data[k];
7683             }
7684             else
7685             {
7686                /* U part */
7687                U_diag_data[icol] -= L_diag_data[j]*U_diag_data[k];
7688             }
7689          }
7690       }
7691       /* reset working array */
7692       ku = U_diag_i[ii+1];
7693       for (j = L_diag_i[ii]; j < kl; j++)
7694       {
7695          col = L_diag_j[j];
7696          iw[col] = -1;
7697       }
7698       iw[ii] = -1;
7699       for (j = U_diag_i[ii]; j < ku; j++)
7700       {
7701          col = U_diag_j[j];
7702          iw[col] = -1;
7703       }
7704 
7705       /* diagonal part (we store the inverse) */
7706       if (fabs(D_data[ii]) < MAT_TOL)
7707       {
7708          D_data[ii] = 1e-06;
7709       }
7710       D_data[ii] = 1./ D_data[ii];
7711 
7712    }/* end of loop for lower part */
7713 
7714    /* last loop through external */
7715    for (ii = n; ii < total_rows; ii++)
7716    {
7717       // get row i
7718       i = ii - n;
7719       kl = L_diag_i[ii+1];
7720       ku = U_diag_i[ii+1];
7721       k1 = E_i[i];
7722       k2 = E_i[i+1];
7723       /* set up working arrays */
7724       for (j = L_diag_i[ii]; j < kl; j++)
7725       {
7726          col = L_diag_j[j];
7727          iw[col] = j;
7728       }
7729       D_data[ii] = 0.0;
7730       iw[ii] = ii;
7731       for (j = U_diag_i[ii]; j < ku; j++)
7732       {
7733          col = U_diag_j[j];
7734          iw[col] = j;
7735       }
7736       /* copy data from E into L, D and U */
7737       for (j = k1; j < k2; j++)
7738       {
7739          /* compute everything in new index */
7740          col = E_j[j];
7741          icol = iw[col];
7742          /* A for sure to be inside the pattern */
7743          if (col < ii)
7744          {
7745             L_diag_data[icol] = E_data[j];
7746          }
7747          else if (col == ii)
7748          {
7749             D_data[ii] = E_data[j];
7750          }
7751          else
7752          {
7753             U_diag_data[icol] = E_data[j];
7754          }
7755       }
7756       /* elimination */
7757       for (j = L_diag_i[ii]; j < kl; j++)
7758       {
7759          jpiv = L_diag_j[j];
7760          L_diag_data[j] *= D_data[jpiv];
7761          ku = U_diag_i[jpiv+1];
7762 
7763          for (k = U_diag_i[jpiv]; k < ku; k++)
7764          {
7765             col = U_diag_j[k];
7766             icol = iw[col];
7767             if (icol < 0)
7768             {
7769                /* not in partern */
7770                continue;
7771             }
7772             if (col < ii)
7773             {
7774                /* L part */
7775                L_diag_data[icol] -= L_diag_data[j]*U_diag_data[k];
7776             }
7777             else if (col == ii)
7778             {
7779                /* diag part */
7780                D_data[icol] -= L_diag_data[j]*U_diag_data[k];
7781             }
7782             else
7783             {
7784                /* U part */
7785                U_diag_data[icol] -= L_diag_data[j]*U_diag_data[k];
7786             }
7787          }
7788       }
7789       /* reset working array */
7790       ku = U_diag_i[ii+1];
7791       for (j = L_diag_i[ii]; j < kl; j++)
7792       {
7793          col = L_diag_j[j];
7794          iw[col] = -1;
7795       }
7796       iw[ii] = -1;
7797       for (j = U_diag_i[ii]; j < ku; j++)
7798       {
7799          col = U_diag_j[j];
7800          iw[col] = -1;
7801       }
7802 
7803       /* diagonal part (we store the inverse) */
7804       if (fabs(D_data[ii]) < MAT_TOL)
7805       {
7806          D_data[ii] = 1e-06;
7807       }
7808       D_data[ii] = 1./ D_data[ii];
7809 
7810    }/* end of loop for external loop */
7811 
7812    /*
7813     * 4: Finishing up and free
7814     */
7815    HYPRE_BigInt big_total_rows = (HYPRE_BigInt)total_rows;
7816    hypre_MPI_Allreduce( &big_total_rows, &global_num_rows, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
7817    /* need to get new column start */
7818    {
7819       HYPRE_BigInt global_start;
7820       hypre_MPI_Scan( &big_total_rows, &global_start, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
7821       col_starts[0] = global_start - total_rows;
7822       col_starts[1] = global_start;
7823    }
7824    /* Assemble LDU matrices */
7825    matL = hypre_ParCSRMatrixCreate( comm,
7826          global_num_rows,
7827          global_num_rows,
7828          col_starts,
7829          col_starts,
7830          0 /* num_cols_offd */,
7831          L_diag_i[total_rows],
7832          0 /* num_nonzeros_offd */);
7833 
7834    L_diag = hypre_ParCSRMatrixDiag(matL);
7835    hypre_CSRMatrixI(L_diag) = L_diag_i;
7836    if (L_diag_i[total_rows]>0)
7837    {
7838       hypre_CSRMatrixData(L_diag) = L_diag_data;
7839       hypre_CSRMatrixJ(L_diag) = L_diag_j;
7840    }
7841    else
7842    {
7843       /* we allocated some initial length, so free them */
7844       hypre_TFree(L_diag_j, HYPRE_MEMORY_DEVICE);
7845    }
7846    /* store (global) total number of nonzeros */
7847    local_nnz = (HYPRE_Real) (L_diag_i[total_rows]);
7848    hypre_MPI_Allreduce(&local_nnz, &total_nnz, 1, HYPRE_MPI_REAL, hypre_MPI_SUM, comm);
7849    hypre_ParCSRMatrixDNumNonzeros(matL) = total_nnz;
7850 
7851    matU = hypre_ParCSRMatrixCreate( comm,
7852          global_num_rows,
7853          global_num_rows,
7854          col_starts,
7855          col_starts,
7856          0,
7857          U_diag_i[total_rows],
7858          0 );
7859 
7860    U_diag = hypre_ParCSRMatrixDiag(matU);
7861    hypre_CSRMatrixI(U_diag) = U_diag_i;
7862    if (U_diag_i[n]>0)
7863    {
7864       hypre_CSRMatrixData(U_diag) = U_diag_data;
7865       hypre_CSRMatrixJ(U_diag) = U_diag_j;
7866    }
7867    else
7868    {
7869       /* we allocated some initial length, so free them */
7870       hypre_TFree(U_diag_j, HYPRE_MEMORY_DEVICE);
7871    }
7872    /* store (global) total number of nonzeros */
7873    local_nnz = (HYPRE_Real) (U_diag_i[total_rows]);
7874    hypre_MPI_Allreduce(&local_nnz, &total_nnz, 1, HYPRE_MPI_REAL, hypre_MPI_SUM, comm);
7875    hypre_ParCSRMatrixDNumNonzeros(matU) = total_nnz;
7876 
7877    /* free */
7878    hypre_TFree(iw,HYPRE_MEMORY_HOST);
7879 
7880    /* free external data */
7881    if (E_i)
7882    {
7883       hypre_TFree(E_i, HYPRE_MEMORY_HOST);
7884    }
7885    if (E_j)
7886    {
7887       hypre_TFree(E_j, HYPRE_MEMORY_HOST);
7888       hypre_TFree(E_data, HYPRE_MEMORY_HOST);
7889    }
7890 
7891    /* set matrix pointers */
7892    *Lptr = matL;
7893    *Dptr = D_data;
7894    *Uptr = matU;
7895 
7896    return hypre_error_flag;
7897 }
7898 
7899 /* ILUT for RAS
7900  * A: input matrix
7901  * lfil: maximum number of entries kept per row in each of L and U
7902  * tol: droptol array in ILUT
7903  *    tol[0]: matrix B
7904  *    tol[1]: matrix E and F
7905  *    tol[2]: matrix S
7906  * perm: permutation array indicating the ordering of the factorization. perm could come
7907  *    from a CF_marker array or a reordering routine.
7908  * nLU: size of computed LDU factorization. If nLU < n, rows nLU..n-1 are factored in a second pass
7909  * Lptr, Dptr, Uptr: L, D, U factors (D is stored as the inverse of the diagonal).
7910  * Sptr: Schur complement (not a parameter of this RAS variant)
7911  *
7912  * Keep at most the lfil largest entries of each row in both L and U; fill-in smaller
7913  *    than the input tol scaled by the average absolute value of that row is dropped
7914  */
7915 HYPRE_Int
hypre_ILUSetupILUTRAS(hypre_ParCSRMatrix * A,HYPRE_Int lfil,HYPRE_Real * tol,HYPRE_Int * perm,HYPRE_Int nLU,hypre_ParCSRMatrix ** Lptr,HYPRE_Real ** Dptr,hypre_ParCSRMatrix ** Uptr)7916 hypre_ILUSetupILUTRAS(hypre_ParCSRMatrix *A, HYPRE_Int lfil, HYPRE_Real *tol,
7917       HYPRE_Int *perm, HYPRE_Int nLU, hypre_ParCSRMatrix **Lptr,
7918       HYPRE_Real** Dptr, hypre_ParCSRMatrix **Uptr)
7919 {
7920    /*
7921     * 1: Setup and create buffers
7922     * matL/U: the ParCSR matrix for L and U
7923     * L/U_diag: the diagonal csr matrix of matL/U
7924     * A_diag_*: tempory pointer for the diagonal matrix of A and its '*' slot
7925     * ii = outer loop from 0 to nLU - 1
7926     * i = the real col number in diag inside the outer loop
7927     * iw =  working array store the reverse of active col number
7928     * iL = working array store the active col number
7929     */
7930    HYPRE_Real               local_nnz, total_nnz;
7931    HYPRE_Int                i, ii, j, k1, k2, k12, k22, kl, ku, col, icol, lenl, lenu, lenhu, lenhlr, lenhll, jpos, jrow;
7932    HYPRE_Real               inorm, itolb, itolef, dpiv, lxu;
7933    HYPRE_Int                *iw,*iL;
7934    HYPRE_Real               *w;
7935 
7936    /* memory management */
7937    HYPRE_Int                ctrL;
7938    HYPRE_Int                ctrU;
7939    HYPRE_Int                initial_alloc = 0;
7940    HYPRE_Int                capacity_L;
7941    HYPRE_Int                capacity_U;
7942    HYPRE_Int                nnz_A;
7943 
7944    /* communication stuffs for S */
7945    MPI_Comm                 comm          = hypre_ParCSRMatrixComm(A);
7946    HYPRE_Int                num_procs;
7947    hypre_ParCSRCommPkg      *comm_pkg;
7948    //   hypre_ParCSRCommHandle   *comm_handle;
7949    HYPRE_BigInt             col_starts[2];
7950    //   HYPRE_Int                num_sends;
7951    //   HYPRE_Int                begin, end;
7952 
7953    /* data objects for A */
7954    hypre_CSRMatrix          *A_diag       = hypre_ParCSRMatrixDiag(A);
7955    hypre_CSRMatrix          *A_offd       = hypre_ParCSRMatrixOffd(A);
7956    HYPRE_Real               *A_diag_data  = hypre_CSRMatrixData(A_diag);
7957    HYPRE_Int                *A_diag_i     = hypre_CSRMatrixI(A_diag);
7958    HYPRE_Int                *A_diag_j     = hypre_CSRMatrixJ(A_diag);
7959    HYPRE_Int                *A_offd_i     = hypre_CSRMatrixI(A_offd);
7960    HYPRE_Int                *A_offd_j     = hypre_CSRMatrixJ(A_offd);
7961    HYPRE_Real               *A_offd_data  = hypre_CSRMatrixData(A_offd);
7962 
7963    /* data objects for L, D, U */
7964    hypre_ParCSRMatrix       *matL;
7965    hypre_ParCSRMatrix       *matU;
7966    hypre_CSRMatrix          *L_diag;
7967    hypre_CSRMatrix          *U_diag;
7968    HYPRE_Real               *D_data;
7969    HYPRE_Real               *L_diag_data  = NULL;
7970    HYPRE_Int                *L_diag_i;
7971    HYPRE_Int                *L_diag_j     = NULL;
7972    HYPRE_Real               *U_diag_data  = NULL;
7973    HYPRE_Int                *U_diag_i;
7974    HYPRE_Int                *U_diag_j     = NULL;
7975 
7976    /* size of problem and external matrix */
7977    HYPRE_Int                n             = hypre_CSRMatrixNumRows(A_diag);
7978    //   HYPRE_Int                m             = n - nLU;
7979    HYPRE_Int                ext           = hypre_CSRMatrixNumCols(A_offd);
7980    HYPRE_Int                total_rows    = n + ext;
7981    HYPRE_BigInt              global_num_rows;
7982 
7983    /* data objects for E, external matrix */
7984    HYPRE_Int                *E_i;
7985    HYPRE_Int                *E_j;
7986    HYPRE_Real               *E_data;
7987 
7988    /* reverse permutation */
7989    HYPRE_Int                *rperm;
7990    /* old permutation */
7991    HYPRE_Int                *perm_old;
7992 
7993    /* start setup
7994     * check input first
7995     */
7996    n = hypre_CSRMatrixNumRows(A_diag);
7997    if (nLU < 0 || nLU > n)
7998    {
7999       hypre_error_w_msg(HYPRE_ERROR_ARG,"WARNING: nLU out of range.\n");
8000    }
8001 
8002    /* start set up
8003     * setup communication stuffs first
8004     */
8005    hypre_MPI_Comm_size(comm, &num_procs);
8006    comm_pkg = hypre_ParCSRMatrixCommPkg(A);
8007    /* create if not yet built */
8008    if (!comm_pkg)
8009    {
8010       hypre_MatvecCommPkgCreate(A);
8011       comm_pkg = hypre_ParCSRMatrixCommPkg(A);
8012    }
8013 
8014    /* setup initial memory */
8015    nnz_A = A_diag_i[nLU];
8016    if (n > 0)
8017    {
8018       initial_alloc = nLU + ceil(nnz_A / 2.0);
8019    }
8020    capacity_L = initial_alloc;
8021    capacity_U = initial_alloc;
8022 
8023    D_data = hypre_CTAlloc(HYPRE_Real, total_rows, HYPRE_MEMORY_DEVICE);
8024    L_diag_i = hypre_CTAlloc(HYPRE_Int, (total_rows+1), HYPRE_MEMORY_DEVICE);
8025    U_diag_i = hypre_CTAlloc(HYPRE_Int, (total_rows+1), HYPRE_MEMORY_DEVICE);
8026 
8027    L_diag_j = hypre_CTAlloc(HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
8028    U_diag_j = hypre_CTAlloc(HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
8029    L_diag_data = hypre_CTAlloc(HYPRE_Real, capacity_L, HYPRE_MEMORY_DEVICE);
8030    U_diag_data = hypre_CTAlloc(HYPRE_Real, capacity_U, HYPRE_MEMORY_DEVICE);
8031 
8032    ctrL = ctrU = 0;
8033 
8034    /* setting up working array */
8035    iw = hypre_CTAlloc(HYPRE_Int,4*total_rows,HYPRE_MEMORY_HOST);
8036    iL = iw + total_rows;
8037    w = hypre_CTAlloc(HYPRE_Real,total_rows,HYPRE_MEMORY_HOST);
8038    for (i = 0; i < total_rows; i++)
8039    {
8040       iw[i] = -1;
8041    }
8042    L_diag_i[0] = U_diag_i[0] = 0;
8043    /* get reverse permutation (rperm).
8044     * rperm holds the reordered indexes.
8045     * rperm[old] -> new
8046     * perm[new]  -> old
8047     */
8048    rperm = iw + 2*total_rows;
8049    perm_old = perm;
8050    perm = iw + 3*total_rows;
8051    for (i = 0; i < n; i++)
8052    {
8053       perm[i] = perm_old[i];
8054    }
8055    for (i = n; i < total_rows; i++)
8056    {
8057       perm[i] = i;
8058    }
8059    for (i = 0; i < total_rows; i++)
8060    {
8061       rperm[perm[i]] = i;
8062    }
8063    /* get external matrix */
8064    hypre_ILUBuildRASExternalMatrix(A,rperm,&E_i,&E_j,&E_data);
8065 
8066    /*
8067     * 2: Main loop of elimination
8068     * maintain two heaps
8069     * |----->*********<-----|-----*********|
8070     * |col heap***value heap|value in U****|
8071     */
8072 
8073    /* main outer loop for upper part */
8074    for (ii = 0 ; ii < nLU; ii++)
8075    {
8076       /* get real row with perm */
8077       i = perm[ii];
8078       k1 = A_diag_i[i];
8079       k2 = A_diag_i[i+1];
8080       kl = ii-1;
8081       /* reset row norm of ith row */
8082       inorm = .0;
8083       for (j = k1; j < k2; j++)
8084       {
8085          inorm += fabs(A_diag_data[j]);
8086       }
8087       if (inorm == .0)
8088       {
8089          hypre_error_w_msg(HYPRE_ERROR_ARG,"WARNING: ILUT with zero row.\n");
8090       }
8091       inorm /= (HYPRE_Real)(k2-k1);
8092       /* set the scaled tol for that row */
8093       itolb = tol[0] * inorm;
8094       itolef = tol[1] * inorm;
8095 
8096       /* reset displacement */
8097       lenhll = lenhlr = lenu = 0;
8098       w[ii] = 0.0;
8099       iw[ii] = ii;
8100       /* copy in data from A */
8101       for (j = k1; j < k2; j++)
8102       {
8103          /* get now col number */
8104          col = rperm[A_diag_j[j]];
8105          if (col < ii)
8106          {
8107             /* L part of it */
8108             iL[lenhll] = col;
8109             w[lenhll] = A_diag_data[j];
8110             iw[col] = lenhll++;
8111             /* add to heap, by col number */
8112             hypre_ILUMinHeapAddIRIi(iL,w,iw,lenhll);
8113          }
8114          else if (col == ii)
8115          {
8116             w[ii] = A_diag_data[j];
8117          }
8118          else
8119          {
8120             lenu++;
8121             jpos = lenu + ii;
8122             iL[jpos] = col;
8123             w[jpos] = A_diag_data[j];
8124             iw[col] = jpos;
8125          }
8126       }
8127 
8128       /*
8129        * main elimination
8130        * need to maintain 2 heaps for L, one heap for col and one heaps for value
8131        * maintian an array for U, and do qsplit with quick sort after that
8132        * while the heap of col is greater than zero
8133        */
8134       while (lenhll > 0)
8135       {
8136 
8137          /* get the next row from top of the heap */
8138          jrow = iL[0];
8139          dpiv = w[0] * D_data[jrow];
8140          w[0] = dpiv;
8141          /* now remove it from the top of the heap */
8142          hypre_ILUMinHeapRemoveIRIi(iL,w,iw,lenhll);
8143          lenhll--;
8144          /*
8145           * reset the drop part to -1
8146           * we don't need this iw anymore
8147           */
8148          iw[jrow] = -1;
8149          /* need to keep this one, move to the end of the heap */
8150          /* no longer need to maintain iw */
8151          hypre_swap2(iL,w,lenhll,kl-lenhlr);
8152          lenhlr++;
8153          hypre_ILUMaxrHeapAddRabsI(w+kl,iL+kl,lenhlr);
8154          /* loop for elimination */
8155          ku = U_diag_i[jrow+1];
8156          for (j = U_diag_i[jrow]; j < ku; j++)
8157          {
8158             col = U_diag_j[j];
8159             icol = iw[col];
8160             lxu = - dpiv*U_diag_data[j];
8161             /* we don't want to fill small number to empty place */
8162             if ((icol == -1) &&
8163                 ((col < nLU && fabs(lxu) < itolb) || (col >= nLU && fabs(lxu) < itolef)))
8164             {
8165                continue;
8166             }
8167             if (icol == -1)
8168             {
8169                if (col < ii)
8170                {
8171                   /* L part
8172                    * not already in L part
8173                    * put it to the end of heap
8174                    * might overwrite some small entries, no issue
8175                    */
8176                   iL[lenhll] = col;
8177                   w[lenhll] = lxu;
8178                   iw[col] = lenhll++;
8179                   /* add to heap, by col number */
8180                   hypre_ILUMinHeapAddIRIi(iL,w,iw,lenhll);
8181                }
8182                else if (col == ii)
8183                {
8184                   w[ii] += lxu;
8185                }
8186                else
8187                {
8188                   /*
8189                    * not already in U part
8190                    * put is to the end of heap
8191                    */
8192                   lenu++;
8193                   jpos = lenu + ii;
8194                   iL[jpos] = col;
8195                   w[jpos] = lxu;
8196                   iw[col] = jpos;
8197                }
8198             }
8199             else
8200             {
8201                w[icol] += lxu;
8202             }
8203          }
8204       }/* while loop for the elimination of current row */
8205 
8206       if (fabs(w[ii]) < MAT_TOL)
8207       {
8208          w[ii]=1e-06;
8209       }
8210       D_data[ii] = 1./w[ii];
8211       iw[ii] = -1;
8212 
8213       /*
8214        * now pick up the largest lfil from L
8215        * L part is guarantee to be larger than itol
8216        */
8217 
8218       lenl = lenhlr < lfil ? lenhlr : lfil;
8219       L_diag_i[ii+1] = L_diag_i[ii] + lenl;
8220       if (lenl > 0)
8221       {
8222          /* test if memory is enough */
8223          while (ctrL + lenl > capacity_L)
8224          {
8225             HYPRE_Int tmp = capacity_L;
8226             capacity_L = capacity_L * EXPAND_FACT + 1;
8227             L_diag_j = hypre_TReAlloc_v2(L_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
8228             L_diag_data = hypre_TReAlloc_v2(L_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_L, HYPRE_MEMORY_DEVICE);
8229          }
8230          ctrL += lenl;
8231          /* copy large data in */
8232          for (j = L_diag_i[ii]; j < ctrL; j++)
8233          {
8234             L_diag_j[j] = iL[kl];
8235             L_diag_data[j] = w[kl];
8236             hypre_ILUMaxrHeapRemoveRabsI(w+kl,iL+kl,lenhlr);
8237             lenhlr--;
8238          }
8239       }
8240       /*
8241        * now reset working array
8242        * L part already reset when move out of heap, only U part
8243        */
8244       ku = lenu+ii;
8245       for (j = ii + 1; j <= ku; j++)
8246       {
8247          iw[iL[j]] = -1;
8248       }
8249 
8250       if (lenu < lfil)
8251       {
8252          /* we simply keep all of the data, no need to sort */
8253          lenhu = lenu;
8254       }
8255       else
8256       {
8257          /* need to sort the first small(hopefully) part of it */
8258          lenhu = lfil;
8259          /* quick split, only sort the first small part of the array */
8260          hypre_ILUMaxQSplitRabsI(w,iL,ii+1,ii+lenhu,ii+lenu);
8261       }
8262 
8263       U_diag_i[ii+1] = U_diag_i[ii] + lenhu;
8264       if (lenhu > 0)
8265       {
8266          /* test if memory is enough */
8267          while (ctrU + lenhu > capacity_U)
8268          {
8269             HYPRE_Int tmp = capacity_U;
8270             capacity_U = capacity_U * EXPAND_FACT + 1;
8271             U_diag_j = hypre_TReAlloc_v2(U_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
8272             U_diag_data = hypre_TReAlloc_v2(U_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_U, HYPRE_MEMORY_DEVICE);
8273          }
8274          ctrU += lenhu;
8275          /* copy large data in */
8276          for (j = U_diag_i[ii]; j < ctrU; j++)
8277          {
8278             jpos = ii+1+j-U_diag_i[ii];
8279             U_diag_j[j] = iL[jpos];
8280             U_diag_data[j] = w[jpos];
8281          }
8282       }
8283    }/* end of ii loop from 0 to nLU-1 */
8284 
8285    /* second outer loop for lower part */
8286    for (ii = nLU; ii < n; ii++)
8287    {
8288       /* get real row with perm */
8289       i = perm[ii];
8290       k1 = A_diag_i[i];
8291       k2 = A_diag_i[i+1];
8292       k12 = A_offd_i[i];
8293       k22 = A_offd_i[i+1];
8294       kl = ii-1;
8295       /* reset row norm of ith row */
8296       inorm = .0;
8297       for (j = k1; j < k2; j++)
8298       {
8299          inorm += fabs(A_diag_data[j]);
8300       }
8301       for (j = k12; j < k22; j++)
8302       {
8303          inorm += fabs(A_offd_data[j]);
8304       }
8305       if (inorm == .0)
8306       {
8307          hypre_error_w_msg(HYPRE_ERROR_ARG,"WARNING: ILUT with zero row.\n");
8308       }
8309       inorm /= (HYPRE_Real)(k2+k22-k1-k12);
8310       /* set the scaled tol for that row */
8311       itolb = tol[0] * inorm;
8312       itolef = tol[1] * inorm;
8313 
8314       /* reset displacement */
8315       lenhll = lenhlr = lenu = 0;
8316       w[ii] = 0.0;
8317       iw[ii] = ii;
8318       /* copy in data from A_diag */
8319       for (j = k1; j < k2; j++)
8320       {
8321          /* get now col number */
8322          col = rperm[A_diag_j[j]];
8323          if (col < ii)
8324          {
8325             /* L part of it */
8326             iL[lenhll] = col;
8327             w[lenhll] = A_diag_data[j];
8328             iw[col] = lenhll++;
8329             /* add to heap, by col number */
8330             hypre_ILUMinHeapAddIRIi(iL,w,iw,lenhll);
8331          }
8332          else if (col == ii)
8333          {
8334             w[ii] = A_diag_data[j];
8335          }
8336          else
8337          {
8338             lenu++;
8339             jpos = lenu + ii;
8340             iL[jpos] = col;
8341             w[jpos] = A_diag_data[j];
8342             iw[col] = jpos;
8343          }
8344       }
8345       /* copy in data from A_offd */
8346       for (j = k12; j < k22; j++)
8347       {
8348          /* get now col number */
8349          col = A_offd_j[j] + n;
8350          /* all should greater than ii in lower part */
8351          lenu++;
8352          jpos = lenu + ii;
8353          iL[jpos] = col;
8354          w[jpos] = A_offd_data[j];
8355          iw[col] = jpos;
8356       }
8357 
8358       /*
8359        * main elimination
8360        * need to maintain 2 heaps for L, one heap for col and one heaps for value
8361        * maintian an array for U, and do qsplit with quick sort after that
8362        * while the heap of col is greater than zero
8363        */
8364       while (lenhll > 0)
8365       {
8366 
8367          /* get the next row from top of the heap */
8368          jrow = iL[0];
8369          dpiv = w[0] * D_data[jrow];
8370          w[0] = dpiv;
8371          /* now remove it from the top of the heap */
8372          hypre_ILUMinHeapRemoveIRIi(iL,w,iw,lenhll);
8373          lenhll--;
8374          /*
8375           * reset the drop part to -1
8376           * we don't need this iw anymore
8377           */
8378          iw[jrow] = -1;
8379          /* need to keep this one, move to the end of the heap */
8380          /* no longer need to maintain iw */
8381          hypre_swap2(iL,w,lenhll,kl-lenhlr);
8382          lenhlr++;
8383          hypre_ILUMaxrHeapAddRabsI(w+kl,iL+kl,lenhlr);
8384          /* loop for elimination */
8385          ku = U_diag_i[jrow+1];
8386          for (j = U_diag_i[jrow]; j < ku; j++)
8387          {
8388             col = U_diag_j[j];
8389             icol = iw[col];
8390             lxu = - dpiv*U_diag_data[j];
8391             /* we don't want to fill small number to empty place */
8392             if ((icol == -1) &&
8393                 ((col < nLU && fabs(lxu) < itolb) || (col >= nLU && fabs(lxu) < itolef)))
8394             {
8395                continue;
8396             }
8397             if (icol == -1)
8398             {
8399                if (col < ii)
8400                {
8401                   /* L part
8402                    * not already in L part
8403                    * put it to the end of heap
8404                    * might overwrite some small entries, no issue
8405                    */
8406                   iL[lenhll] = col;
8407                   w[lenhll] = lxu;
8408                   iw[col] = lenhll++;
8409                   /* add to heap, by col number */
8410                   hypre_ILUMinHeapAddIRIi(iL,w,iw,lenhll);
8411                }
8412                else if (col == ii)
8413                {
8414                   w[ii] += lxu;
8415                }
8416                else
8417                {
8418                   /*
8419                    * not already in U part
8420                    * put is to the end of heap
8421                    */
8422                   lenu++;
8423                   jpos = lenu + ii;
8424                   iL[jpos] = col;
8425                   w[jpos] = lxu;
8426                   iw[col] = jpos;
8427                }
8428             }
8429             else
8430             {
8431                w[icol] += lxu;
8432             }
8433          }
8434       }/* while loop for the elimination of current row */
8435 
8436       if (fabs(w[ii]) < MAT_TOL)
8437       {
8438          w[ii]=1e-06;
8439       }
8440       D_data[ii] = 1./w[ii];
8441       iw[ii] = -1;
8442 
8443       /*
8444        * now pick up the largest lfil from L
8445        * L part is guarantee to be larger than itol
8446        */
8447 
8448       lenl = lenhlr < lfil ? lenhlr : lfil;
8449       L_diag_i[ii+1] = L_diag_i[ii] + lenl;
8450       if (lenl > 0)
8451       {
8452          /* test if memory is enough */
8453          while (ctrL + lenl > capacity_L)
8454          {
8455             HYPRE_Int tmp = capacity_L;
8456             capacity_L = capacity_L * EXPAND_FACT + 1;
8457             L_diag_j = hypre_TReAlloc_v2(L_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
8458             L_diag_data = hypre_TReAlloc_v2(L_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_L, HYPRE_MEMORY_DEVICE);
8459          }
8460          ctrL += lenl;
8461          /* copy large data in */
8462          for (j = L_diag_i[ii]; j < ctrL; j++)
8463          {
8464             L_diag_j[j] = iL[kl];
8465             L_diag_data[j] = w[kl];
8466             hypre_ILUMaxrHeapRemoveRabsI(w+kl,iL+kl,lenhlr);
8467             lenhlr--;
8468          }
8469       }
8470       /*
8471        * now reset working array
8472        * L part already reset when move out of heap, only U part
8473        */
8474       ku = lenu+ii;
8475       for (j = ii + 1; j <= ku; j++)
8476       {
8477          iw[iL[j]] = -1;
8478       }
8479 
8480       if (lenu < lfil)
8481       {
8482          /* we simply keep all of the data, no need to sort */
8483          lenhu = lenu;
8484       }
8485       else
8486       {
8487          /* need to sort the first small(hopefully) part of it */
8488          lenhu = lfil;
8489          /* quick split, only sort the first small part of the array */
8490          hypre_ILUMaxQSplitRabsI(w,iL,ii+1,ii+lenhu,ii+lenu);
8491       }
8492 
8493       U_diag_i[ii+1] = U_diag_i[ii] + lenhu;
8494       if (lenhu > 0)
8495       {
8496          /* test if memory is enough */
8497          while (ctrU + lenhu > capacity_U)
8498          {
8499             HYPRE_Int tmp = capacity_U;
8500             capacity_U = capacity_U * EXPAND_FACT + 1;
8501             U_diag_j = hypre_TReAlloc_v2(U_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
8502             U_diag_data = hypre_TReAlloc_v2(U_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_U, HYPRE_MEMORY_DEVICE);
8503          }
8504          ctrU += lenhu;
8505          /* copy large data in */
8506          for (j = U_diag_i[ii]; j < ctrU; j++)
8507          {
8508             jpos = ii+1+j-U_diag_i[ii];
8509             U_diag_j[j] = iL[jpos];
8510             U_diag_data[j] = w[jpos];
8511          }
8512       }
8513    }/* end of ii loop from nLU to n */
8514 
8515 
8516    /* third outer loop, for the external rows (ii = n .. total_rows-1) */
8517    for (ii = n; ii < total_rows; ii++)
8518    {
8519       /* get real row with perm */
8520       i = ii-n;
8521       k1 = E_i[i];
8522       k2 = E_i[i+1];
8523       kl = ii-1;
8524       /* reset row norm of ith row */
8525       inorm = .0;
8526       for (j = k1; j < k2; j++)
8527       {
8528          inorm += fabs(E_data[j]);
8529       }
8530       if (inorm == .0)
8531       {
8532          hypre_error_w_msg(HYPRE_ERROR_ARG,"WARNING: ILUT with zero row.\n");
8533       }
8534       inorm /= (HYPRE_Real)(k2-k1);
8535       /* set the scaled tol for that row */
8536       itolb = tol[0] * inorm;
8537       itolef = tol[1] * inorm;
8538 
8539       /* reset displacement */
8540       lenhll = lenhlr = lenu = 0;
8541       w[ii] = 0.0;
8542       iw[ii] = ii;
8543       /* copy in data from E (external matrix) */
8544       for (j = k1; j < k2; j++)
8545       {
8546          /* get new col number */
8547          col = rperm[E_j[j]];
8548          if (col < ii)
8549          {
8550             /* L part of it */
8551             iL[lenhll] = col;
8552             w[lenhll] = E_data[j];
8553             iw[col] = lenhll++;
8554             /* add to heap, by col number */
8555             hypre_ILUMinHeapAddIRIi(iL,w,iw,lenhll);
8556          }
8557          else if (col == ii)
8558          {
8559             w[ii] = E_data[j];
8560          }
8561          else
8562          {
8563             lenu++;
8564             jpos = lenu + ii;
8565             iL[jpos] = col;
8566             w[jpos] = E_data[j];
8567             iw[col] = jpos;
8568          }
8569       }
8570 
8571       /*
8572        * main elimination
8573        * need to maintain 2 heaps for L, one heap for col and one heap for value
8574        * maintain an array for U, and do qsplit with quick sort after that
8575        * loop while the heap of col is non-empty
8576        */
8577       while (lenhll > 0)
8578       {
8579 
8580          /* get the next row from top of the heap */
8581          jrow = iL[0];
8582          dpiv = w[0] * D_data[jrow];
8583          w[0] = dpiv;
8584          /* now remove it from the top of the heap */
8585          hypre_ILUMinHeapRemoveIRIi(iL,w,iw,lenhll);
8586          lenhll--;
8587          /*
8588           * reset the drop part to -1
8589           * we don't need this iw anymore
8590           */
8591          iw[jrow] = -1;
8592          /* need to keep this one, move to the end of the heap */
8593          /* no longer need to maintain iw */
8594          hypre_swap2(iL,w,lenhll,kl-lenhlr);
8595          lenhlr++;
8596          hypre_ILUMaxrHeapAddRabsI(w+kl,iL+kl,lenhlr);
8597          /* loop for elimination */
8598          ku = U_diag_i[jrow+1];
8599          for (j = U_diag_i[jrow]; j < ku; j++)
8600          {
8601             col = U_diag_j[j];
8602             icol = iw[col];
8603             lxu = - dpiv*U_diag_data[j];
8604             /* we don't want to fill small number to empty place */
8605             if ((icol == -1) &&
8606                 ((col < nLU && fabs(lxu) < itolb) || (col >= nLU && fabs(lxu) < itolef)))
8607             {
8608                continue;
8609             }
8610             if (icol == -1)
8611             {
8612                if (col < ii)
8613                {
8614                   /* L part
8615                    * not already in L part
8616                    * put it to the end of heap
8617                    * might overwrite some small entries, no issue
8618                    */
8619                   iL[lenhll] = col;
8620                   w[lenhll] = lxu;
8621                   iw[col] = lenhll++;
8622                   /* add to heap, by col number */
8623                   hypre_ILUMinHeapAddIRIi(iL,w,iw,lenhll);
8624                }
8625                else if (col == ii)
8626                {
8627                   w[ii] += lxu;
8628                }
8629                else
8630                {
8631                   /*
8632                    * not already in U part
8633                    * put it at the end of the U array
8634                    */
8635                   lenu++;
8636                   jpos = lenu + ii;
8637                   iL[jpos] = col;
8638                   w[jpos] = lxu;
8639                   iw[col] = jpos;
8640                }
8641             }
8642             else
8643             {
8644                w[icol] += lxu;
8645             }
8646          }
8647       }/* while loop for the elimination of current row */
8648 
8649       if (fabs(w[ii]) < MAT_TOL)
8650       {
8651          w[ii]=1e-06;
8652       }
8653       D_data[ii] = 1./w[ii];
8654       iw[ii] = -1;
8655 
8656       /*
8657        * now pick up the largest lfil entries from L
8658        * L part is guaranteed to be larger than itol
8659        */
8660 
8661       lenl = lenhlr < lfil ? lenhlr : lfil;
8662       L_diag_i[ii+1] = L_diag_i[ii] + lenl;
8663       if (lenl > 0)
8664       {
8665          /* test if memory is enough */
8666          while (ctrL + lenl > capacity_L)
8667          {
8668             HYPRE_Int tmp = capacity_L;
8669             capacity_L = capacity_L * EXPAND_FACT + 1;
8670             L_diag_j = hypre_TReAlloc_v2(L_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
8671             L_diag_data = hypre_TReAlloc_v2(L_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_L, HYPRE_MEMORY_DEVICE);
8672          }
8673          ctrL += lenl;
8674          /* copy large data in */
8675          for (j = L_diag_i[ii]; j < ctrL; j++)
8676          {
8677             L_diag_j[j] = iL[kl];
8678             L_diag_data[j] = w[kl];
8679             hypre_ILUMaxrHeapRemoveRabsI(w+kl,iL+kl,lenhlr);
8680             lenhlr--;
8681          }
8682       }
8683       /*
8684        * now reset working array
8685        * L part was already reset when moved out of the heap, only U part remains
8686        */
8687       ku = lenu+ii;
8688       for (j = ii + 1; j <= ku; j++)
8689       {
8690          iw[iL[j]] = -1;
8691       }
8692 
8693       if (lenu < lfil)
8694       {
8695          /* we simply keep all of the data, no need to sort */
8696          lenhu = lenu;
8697       }
8698       else
8699       {
8700          /* need to sort the first small(hopefully) part of it */
8701          lenhu = lfil;
8702          /* quick split, only sort the first small part of the array */
8703          hypre_ILUMaxQSplitRabsI(w,iL,ii+1,ii+lenhu,ii+lenu);
8704       }
8705 
8706       U_diag_i[ii+1] = U_diag_i[ii] + lenhu;
8707       if (lenhu > 0)
8708       {
8709          /* test if memory is enough */
8710          while (ctrU + lenhu > capacity_U)
8711          {
8712             HYPRE_Int tmp = capacity_U;
8713             capacity_U = capacity_U * EXPAND_FACT + 1;
8714             U_diag_j = hypre_TReAlloc_v2(U_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
8715             U_diag_data = hypre_TReAlloc_v2(U_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_U, HYPRE_MEMORY_DEVICE);
8716          }
8717          ctrU += lenhu;
8718          /* copy large data in */
8719          for (j = U_diag_i[ii]; j < ctrU; j++)
8720          {
8721             jpos = ii+1+j-U_diag_i[ii];
8722             U_diag_j[j] = iL[jpos];
8723             U_diag_data[j] = w[jpos];
8724          }
8725       }
8726    }/* end of ii loop from nLU to total_rows */
8727 
8728    /*
8729     * 3: Finishing up and free
8730     */
8731    HYPRE_BigInt big_total_rows = (HYPRE_BigInt)total_rows;
8732    hypre_MPI_Allreduce( &big_total_rows, &global_num_rows, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
8733    /* need to get new column start */
8734    {
8735       HYPRE_BigInt global_start;
8736       hypre_MPI_Scan( &big_total_rows, &global_start, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
8737       col_starts[0] = global_start - total_rows;
8738       col_starts[1] = global_start;
8739    }
8740 
8741    /* create parcsr matrix */
8742    matL = hypre_ParCSRMatrixCreate( comm,
8743          global_num_rows,
8744          global_num_rows,
8745          col_starts,
8746          col_starts,
8747          0,
8748          L_diag_i[total_rows],
8749          0 );
8750 
8751    L_diag = hypre_ParCSRMatrixDiag(matL);
8752    hypre_CSRMatrixI(L_diag) = L_diag_i;
8753    if (L_diag_i[total_rows] > 0)
8754    {
8755       hypre_CSRMatrixData(L_diag) = L_diag_data;
8756       hypre_CSRMatrixJ(L_diag) = L_diag_j;
8757    }
8758    else
8759    {
8760       /* we initialized some anyway, so remove if unused */
8761       hypre_TFree(L_diag_j,HYPRE_MEMORY_DEVICE);
8762       hypre_TFree(L_diag_data,HYPRE_MEMORY_DEVICE);
8763    }
8764    /* store (global) total number of nonzeros */
8765    local_nnz = (HYPRE_Real) (L_diag_i[total_rows]);
8766    hypre_MPI_Allreduce(&local_nnz, &total_nnz, 1, HYPRE_MPI_REAL, hypre_MPI_SUM, comm);
8767    hypre_ParCSRMatrixDNumNonzeros(matL) = total_nnz;
8768 
8769    matU = hypre_ParCSRMatrixCreate( comm,
8770          global_num_rows,
8771          global_num_rows,
8772          col_starts,
8773          col_starts,
8774          0,
8775          U_diag_i[total_rows],
8776          0 );
8777 
8778    U_diag = hypre_ParCSRMatrixDiag(matU);
8779    hypre_CSRMatrixI(U_diag) = U_diag_i;
8780    if (U_diag_i[total_rows] > 0)
8781    {
8782       hypre_CSRMatrixData(U_diag) = U_diag_data;
8783       hypre_CSRMatrixJ(U_diag) = U_diag_j;
8784    }
8785    else
8786    {
8787       /* we initialized some anyway, so remove if unused */
8788       hypre_TFree(U_diag_j,HYPRE_MEMORY_DEVICE);
8789       hypre_TFree(U_diag_data,HYPRE_MEMORY_DEVICE);
8790    }
8791    /* store (global) total number of nonzeros */
8792    local_nnz = (HYPRE_Real) (U_diag_i[total_rows]);
8793    hypre_MPI_Allreduce(&local_nnz, &total_nnz, 1, HYPRE_MPI_REAL, hypre_MPI_SUM, comm);
8794    hypre_ParCSRMatrixDNumNonzeros(matU) = total_nnz;
8795 
8796    /* free working array */
8797    hypre_TFree(iw,HYPRE_MEMORY_HOST);
8798    hypre_TFree(w,HYPRE_MEMORY_HOST);
8799 
8800    /* free external data */
8801    if (E_i)
8802    {
8803       hypre_TFree(E_i, HYPRE_MEMORY_HOST);
8804    }
8805    if (E_j)
8806    {
8807       hypre_TFree(E_j, HYPRE_MEMORY_HOST);
8808       hypre_TFree(E_data, HYPRE_MEMORY_HOST);
8809    }
8810 
8811    /* set matrix pointers */
8812    *Lptr = matL;
8813    *Dptr = D_data;
8814    *Uptr = matU;
8815 
8816    return hypre_error_flag;
8817 }
8818