1 /******************************************************************************
2 * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other
3 * HYPRE Project Developers. See the top-level COPYRIGHT file for details.
4 *
5 * SPDX-License-Identifier: (Apache-2.0 OR MIT)
6 ******************************************************************************/
7 #include "_hypre_parcsr_ls.h"
8 #include "_hypre_utilities.hpp"
9 #include "par_ilu.h"
10 #include "seq_mv.hpp"
11
12 /* Setup ILU data */
13 HYPRE_Int
hypre_ILUSetup(void * ilu_vdata,hypre_ParCSRMatrix * A,hypre_ParVector * f,hypre_ParVector * u)14 hypre_ILUSetup( void *ilu_vdata,
15 hypre_ParCSRMatrix *A,
16 hypre_ParVector *f,
17 hypre_ParVector *u )
18 {
19 MPI_Comm comm = hypre_ParCSRMatrixComm(A);
20 hypre_ParILUData *ilu_data = (hypre_ParILUData*) ilu_vdata;
21 hypre_ParILUData *schur_precond_ilu;
22 hypre_ParNSHData *schur_solver_nsh;
23
24 HYPRE_Int i;
25 // HYPRE_Int num_threads;
26 // HYPRE_Int debug_flag = 0;
27
28 /* pointers to ilu data */
29 HYPRE_Int logging = hypre_ParILUDataLogging(ilu_data);
30 HYPRE_Int print_level = hypre_ParILUDataPrintLevel(ilu_data);
31 HYPRE_Int ilu_type = hypre_ParILUDataIluType(ilu_data);
32 HYPRE_Int nLU = hypre_ParILUDataNLU(ilu_data);
33 HYPRE_Int nI = hypre_ParILUDataNI(ilu_data);
34 HYPRE_Int fill_level = hypre_ParILUDataLfil(ilu_data);
35 HYPRE_Int max_row_elmts = hypre_ParILUDataMaxRowNnz(ilu_data);
36 HYPRE_Real *droptol = hypre_ParILUDataDroptol(ilu_data);
37 HYPRE_Int *CF_marker_array = hypre_ParILUDataCFMarkerArray(ilu_data);
38 HYPRE_Int *perm = hypre_ParILUDataPerm(ilu_data);
39 HYPRE_Int *qperm = hypre_ParILUDataQPerm(ilu_data);
40 HYPRE_Real tol_ddPQ = hypre_ParILUDataTolDDPQ(ilu_data);
41
42 #ifdef HYPRE_USING_CUDA
43 /* pointers to cusparse data, note that they are not NULL only when needed */
44 cusparseMatDescr_t matL_des = hypre_ParILUDataMatLMatrixDescription(ilu_data);
45 cusparseMatDescr_t matU_des = hypre_ParILUDataMatUMatrixDescription(ilu_data);
46 void *ilu_solve_buffer = hypre_ParILUDataILUSolveBuffer(ilu_data);//device memory
47 cusparseSolvePolicy_t ilu_solve_policy = hypre_ParILUDataILUSolvePolicy(ilu_data);
48 hypre_ParCSRMatrix *Aperm = hypre_ParILUDataAperm(ilu_data);
49 hypre_ParCSRMatrix *R = hypre_ParILUDataR(ilu_data);
50 hypre_ParCSRMatrix *P = hypre_ParILUDataP(ilu_data);
51 hypre_CSRMatrix *matALU_d = hypre_ParILUDataMatAILUDevice(ilu_data);
52 hypre_CSRMatrix *matBLU_d = hypre_ParILUDataMatBILUDevice(ilu_data);
53 hypre_CSRMatrix *matSLU_d = hypre_ParILUDataMatSILUDevice(ilu_data);
54 hypre_CSRMatrix *matE_d = hypre_ParILUDataMatEDevice(ilu_data);
55 hypre_CSRMatrix *matF_d = hypre_ParILUDataMatFDevice(ilu_data);
56 csrsv2Info_t matAL_info = hypre_ParILUDataMatALILUSolveInfo(ilu_data);
57 csrsv2Info_t matAU_info = hypre_ParILUDataMatAUILUSolveInfo(ilu_data);
58 csrsv2Info_t matBL_info = hypre_ParILUDataMatBLILUSolveInfo(ilu_data);
59 csrsv2Info_t matBU_info = hypre_ParILUDataMatBUILUSolveInfo(ilu_data);
60 csrsv2Info_t matSL_info = hypre_ParILUDataMatSLILUSolveInfo(ilu_data);
61 csrsv2Info_t matSU_info = hypre_ParILUDataMatSUILUSolveInfo(ilu_data);
62 HYPRE_Int *A_diag_fake = hypre_ParILUDataMatAFakeDiagonal(ilu_data);
63 hypre_Vector *Ftemp_upper = NULL;
64 hypre_Vector *Utemp_lower = NULL;
65 #endif
66
67 hypre_ParCSRMatrix *matA = hypre_ParILUDataMatA(ilu_data);
68 hypre_ParCSRMatrix *matL = hypre_ParILUDataMatL(ilu_data);
69 HYPRE_Real *matD = hypre_ParILUDataMatD(ilu_data);
70 hypre_ParCSRMatrix *matU = hypre_ParILUDataMatU(ilu_data);
71 hypre_ParCSRMatrix *matmL = hypre_ParILUDataMatLModified(ilu_data);
72 HYPRE_Real *matmD = hypre_ParILUDataMatDModified(ilu_data);
73 hypre_ParCSRMatrix *matmU = hypre_ParILUDataMatUModified(ilu_data);
74 hypre_ParCSRMatrix *matS = hypre_ParILUDataMatS(ilu_data);
75 // hypre_ParCSRMatrix *matM = NULL;
76 // HYPRE_Int nnzG;/* g stands for global */
77 HYPRE_Real nnzS;/* total nnz in S */
78 HYPRE_Int nnzS_offd;
79 HYPRE_Int size_C/* total size of coarse grid */;
80
81 HYPRE_Int n = hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(A));
82 // HYPRE_Int m;/* m = n-LU */
83 /* reordering option */
84 HYPRE_Int reordering_type = hypre_ParILUDataReorderingType(ilu_data);
85 HYPRE_Int num_procs, my_id;
86
87 hypre_ParVector *Utemp = NULL;
88 hypre_ParVector *Ftemp = NULL;
89 hypre_ParVector *Xtemp = NULL;
90 hypre_ParVector *Ytemp = NULL;
91 HYPRE_Real *uext = NULL;
92 HYPRE_Real *fext = NULL;
93 hypre_ParVector *rhs = NULL;
94 hypre_ParVector *x = NULL;
95 hypre_ParVector *F_array = hypre_ParILUDataF(ilu_data);
96 hypre_ParVector *U_array = hypre_ParILUDataU(ilu_data);
97 hypre_ParVector *residual = hypre_ParILUDataResidual(ilu_data);
98 HYPRE_Real *rel_res_norms = hypre_ParILUDataRelResNorms(ilu_data);
99
100 /* might need for Schur Complement */
101 HYPRE_Int *u_end = NULL;
102 HYPRE_Solver schur_solver = NULL;
103 HYPRE_Solver schur_precond = NULL;
104 HYPRE_Solver schur_precond_gotten = NULL;
105
106 /* help to build external */
107 hypre_ParCSRCommPkg *comm_pkg;
108 HYPRE_Int buffer_size;
109 HYPRE_Int send_size;
110 HYPRE_Int recv_size;
111 #ifdef HYPRE_USING_CUDA
112 HYPRE_Int test_opt;
113 #endif
114 /* ----- begin -----*/
115 HYPRE_ANNOTATE_FUNC_BEGIN;
116
117 //num_threads = hypre_NumThreads();
118
119 hypre_MPI_Comm_size(comm,&num_procs);
120 hypre_MPI_Comm_rank(comm,&my_id);
121
122 #ifdef HYPRE_USING_CUDA
123 /* create cuda and cusparse information when needed */
124 /* Use most of them from global information */
125 /* set matrix L descriptor, L is a lower triangular matrix with unit diagonal entries */
126 if (!matL_des)
127 {
128 HYPRE_CUSPARSE_CALL(cusparseCreateMatDescr(&(hypre_ParILUDataMatLMatrixDescription(ilu_data))));
129 matL_des = hypre_ParILUDataMatLMatrixDescription(ilu_data);
130 HYPRE_CUSPARSE_CALL(cusparseSetMatIndexBase(matL_des, CUSPARSE_INDEX_BASE_ZERO));
131 HYPRE_CUSPARSE_CALL(cusparseSetMatType(matL_des, CUSPARSE_MATRIX_TYPE_GENERAL));
132 HYPRE_CUSPARSE_CALL(cusparseSetMatFillMode(matL_des, CUSPARSE_FILL_MODE_LOWER));
133 HYPRE_CUSPARSE_CALL(cusparseSetMatDiagType(matL_des, CUSPARSE_DIAG_TYPE_UNIT));
134 }
135 /* set matrix U descriptor, U is an upper triangular matrix with non-unit diagonal entries */
136 if (!matU_des)
137 {
138 HYPRE_CUSPARSE_CALL(cusparseCreateMatDescr(&(hypre_ParILUDataMatUMatrixDescription(ilu_data))));
139 matU_des = hypre_ParILUDataMatUMatrixDescription(ilu_data);
140 HYPRE_CUSPARSE_CALL(cusparseSetMatIndexBase(matU_des, CUSPARSE_INDEX_BASE_ZERO));
141 HYPRE_CUSPARSE_CALL(cusparseSetMatType(matU_des, CUSPARSE_MATRIX_TYPE_GENERAL));
142 HYPRE_CUSPARSE_CALL(cusparseSetMatFillMode(matU_des, CUSPARSE_FILL_MODE_UPPER));
143 HYPRE_CUSPARSE_CALL(cusparseSetMatDiagType(matU_des, CUSPARSE_DIAG_TYPE_NON_UNIT));
144 }
145 if (!matAL_info)
146 {
147 HYPRE_CUSPARSE_CALL( (cusparseDestroyCsrsv2Info(hypre_ParILUDataMatALILUSolveInfo(ilu_data))) );
148 matAL_info = NULL;
149 }
150 if (!matAU_info)
151 {
152 HYPRE_CUSPARSE_CALL( (cusparseDestroyCsrsv2Info(hypre_ParILUDataMatAUILUSolveInfo(ilu_data))) );
153 matAU_info = NULL;
154 }
155 if (!matBL_info)
156 {
157 HYPRE_CUSPARSE_CALL( (cusparseDestroyCsrsv2Info(hypre_ParILUDataMatBLILUSolveInfo(ilu_data))) );
158 matBL_info = NULL;
159 }
160 if (!matBU_info)
161 {
162 HYPRE_CUSPARSE_CALL( (cusparseDestroyCsrsv2Info(hypre_ParILUDataMatBUILUSolveInfo(ilu_data))) );
163 matBU_info = NULL;
164 }
165 if (!matSL_info)
166 {
167 HYPRE_CUSPARSE_CALL( (cusparseDestroyCsrsv2Info(hypre_ParILUDataMatSLILUSolveInfo(ilu_data))) );
168 matSL_info = NULL;
169 }
170 if (!matSU_info)
171 {
172 HYPRE_CUSPARSE_CALL( (cusparseDestroyCsrsv2Info(hypre_ParILUDataMatSUILUSolveInfo(ilu_data))) );
173 matSU_info = NULL;
174 }
175 if (ilu_solve_buffer)
176 {
177 hypre_TFree(ilu_solve_buffer, HYPRE_MEMORY_DEVICE);
178 ilu_solve_buffer = NULL;
179 }
180 if (matALU_d)
181 {
182 hypre_CSRMatrixDestroy( matALU_d );
183 matALU_d = NULL;
184 }
185 if (matSLU_d)
186 {
187 hypre_CSRMatrixDestroy( matSLU_d );
188 matSLU_d = NULL;
189 }
190 if (matBLU_d)
191 {
192 hypre_CSRMatrixDestroy( matBLU_d );
193 matBLU_d = NULL;
194 }
195 if (matE_d)
196 {
197 hypre_CSRMatrixDestroy( matE_d );
198 matE_d = NULL;
199 }
200 if (matF_d)
201 {
202 hypre_CSRMatrixDestroy( matF_d );
203 matF_d = NULL;
204 }
205 if (Aperm)
206 {
207 hypre_ParCSRMatrixDestroy( Aperm );
208 Aperm = NULL;
209 }
210 if (R)
211 {
212 hypre_ParCSRMatrixDestroy( R );
213 R = NULL;
214 }
215 if (P)
216 {
217 hypre_ParCSRMatrixDestroy( P );
218 P = NULL;
219 }
220 if (hypre_ParILUDataXTemp(ilu_data))
221 {
222 hypre_ParVectorDestroy(hypre_ParILUDataXTemp(ilu_data));
223 hypre_ParILUDataXTemp(ilu_data) = NULL;
224 }
225 if (hypre_ParILUDataYTemp(ilu_data))
226 {
227 hypre_ParVectorDestroy(hypre_ParILUDataYTemp(ilu_data));
228 hypre_ParILUDataYTemp(ilu_data) = NULL;
229 }
230 if (hypre_ParILUDataFTempUpper(ilu_data))
231 {
232 hypre_SeqVectorDestroy(hypre_ParILUDataFTempUpper(ilu_data));
233 hypre_ParILUDataFTempUpper(ilu_data) = NULL;
234 }
235 if (hypre_ParILUDataUTempLower(ilu_data))
236 {
237 hypre_SeqVectorDestroy(hypre_ParILUDataUTempLower(ilu_data));
238 hypre_ParILUDataUTempLower(ilu_data) = NULL;
239 }
240 if (hypre_ParILUDataMatAFakeDiagonal(ilu_data))
241 {
242 hypre_TFree(hypre_ParILUDataMatAFakeDiagonal(ilu_data), HYPRE_MEMORY_DEVICE);
243 hypre_ParILUDataMatAFakeDiagonal(ilu_data) = NULL;
244 }
245 #endif
246
247 /* Free Previously allocated data, if any not destroyed */
248 if (matL)
249 {
250 hypre_ParCSRMatrixDestroy(matL);
251 matL = NULL;
252 }
253 if (matU)
254 {
255 hypre_ParCSRMatrixDestroy(matU);
256 matU = NULL;
257 }
258 if (matmL)
259 {
260 hypre_ParCSRMatrixDestroy(matmL);
261 matmL = NULL;
262 }
263 if (matmU)
264 {
265 hypre_ParCSRMatrixDestroy(matmU);
266 matmU = NULL;
267 }
268 if (matS)
269 {
270 hypre_ParCSRMatrixDestroy(matS);
271 matS = NULL;
272 }
273 if (matD)
274 {
275 hypre_TFree(matD, HYPRE_MEMORY_DEVICE);
276 matD = NULL;
277 }
278 if (matmD)
279 {
280 hypre_TFree(matmD, HYPRE_MEMORY_DEVICE);
281 matmD = NULL;
282 }
283 if (CF_marker_array)
284 {
285 hypre_TFree(CF_marker_array, HYPRE_MEMORY_HOST);
286 CF_marker_array = NULL;
287 }
288
289
290 /* clear old l1_norm data, if created */
291 if (hypre_ParILUDataL1Norms(ilu_data))
292 {
293 hypre_TFree(hypre_ParILUDataL1Norms(ilu_data), HYPRE_MEMORY_HOST);
294 hypre_ParILUDataL1Norms(ilu_data) = NULL;
295 }
296
297 /* setup temporary storage
298 * first check if they are already here
299 */
300 if (hypre_ParILUDataUTemp(ilu_data))
301 {
302 hypre_ParVectorDestroy(hypre_ParILUDataUTemp(ilu_data));
303 hypre_ParILUDataUTemp(ilu_data) = NULL;
304 }
305 if (hypre_ParILUDataFTemp(ilu_data))
306 {
307 hypre_ParVectorDestroy(hypre_ParILUDataFTemp(ilu_data));
308 hypre_ParILUDataFTemp(ilu_data) = NULL;
309 }
310 if (hypre_ParILUDataUExt(ilu_data))
311 {
312 hypre_TFree(hypre_ParILUDataUExt(ilu_data), HYPRE_MEMORY_HOST);
313 hypre_ParILUDataUExt(ilu_data) = NULL;
314 }
315 if ( hypre_ParILUDataFExt(ilu_data))
316 {
317 hypre_TFree(hypre_ParILUDataFExt(ilu_data), HYPRE_MEMORY_HOST);
318 hypre_ParILUDataFExt(ilu_data) = NULL;
319 }
320 if ( hypre_ParILUDataUEnd(ilu_data))
321 {
322 hypre_TFree(hypre_ParILUDataUEnd(ilu_data), HYPRE_MEMORY_HOST);
323 hypre_ParILUDataUEnd(ilu_data) = NULL;
324 }
325 if (hypre_ParILUDataRhs(ilu_data))
326 {
327 hypre_ParVectorDestroy(hypre_ParILUDataRhs(ilu_data));
328 hypre_ParILUDataRhs(ilu_data) = NULL;
329 }
330 if (hypre_ParILUDataX(ilu_data))
331 {
332 hypre_ParVectorDestroy(hypre_ParILUDataX(ilu_data));
333 hypre_ParILUDataX(ilu_data) = NULL;
334 }
335 if (hypre_ParILUDataResidual(ilu_data))
336 {
337 hypre_ParVectorDestroy(hypre_ParILUDataResidual(ilu_data));
338 hypre_ParILUDataResidual(ilu_data) = NULL;
339 }
340 if (hypre_ParILUDataRelResNorms(ilu_data))
341 {
342 hypre_TFree(hypre_ParILUDataRelResNorms(ilu_data), HYPRE_MEMORY_HOST);
343 hypre_ParILUDataRelResNorms(ilu_data) = NULL;
344 }
345 if (hypre_ParILUDataSchurSolver(ilu_data))
346 {
347 switch(ilu_type){
348 case 10: case 11: case 40: case 41: case 50:
349 HYPRE_ParCSRGMRESDestroy(hypre_ParILUDataSchurSolver(ilu_data)); //GMRES for Schur
350 break;
351 case 20: case 21:
352 hypre_NSHDestroy(hypre_ParILUDataSchurSolver(ilu_data)); //NSH for Schur
353 break;
354 default:
355 break;
356 }
357 (hypre_ParILUDataSchurSolver(ilu_data)) = NULL;
358 }
359 if (hypre_ParILUDataSchurPrecond(ilu_data))
360 {
361 switch(ilu_type){
362 case 10: case 11: case 40: case 41:
363 #ifdef HYPRE_USING_CUDA
364 if (hypre_ParILUDataIluType(ilu_data) != 10 &&
365 hypre_ParILUDataIluType(ilu_data) != 11)
366 {
367 #endif
368 HYPRE_ILUDestroy(hypre_ParILUDataSchurPrecond(ilu_data)); //ILU as precond for Schur
369 #ifdef HYPRE_USING_CUDA
370 }
371 #endif
372 break;
373 default:
374 break;
375 }
376 (hypre_ParILUDataSchurPrecond(ilu_data)) = NULL;
377 }
378 /* start to create working vectors */
379 Utemp = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A),
380 hypre_ParCSRMatrixGlobalNumRows(A),
381 hypre_ParCSRMatrixRowStarts(A));
382 hypre_ParVectorInitialize(Utemp);
383 hypre_ParILUDataUTemp(ilu_data) = Utemp;
384
385 Ftemp = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A),
386 hypre_ParCSRMatrixGlobalNumRows(A),
387 hypre_ParCSRMatrixRowStarts(A));
388 hypre_ParVectorInitialize(Ftemp);
389 hypre_ParILUDataFTemp(ilu_data) = Ftemp;
390 /* set matrix, solution and rhs pointers */
391 matA = A;
392 F_array = f;
393 U_array = u;
394
395 // create perm array if necessary
396 if (perm == NULL)
397 {
398 switch(ilu_type)
399 {
400 case 10: case 11: case 20: case 21: case 30: case 31: case 50:/* symmetric */
401 hypre_ILUGetInteriorExteriorPerm(matA, &perm, &nLU, reordering_type);
402 break;
403 case 40: case 41:/* ddPQ */
404 hypre_ILUGetPermddPQ(matA, &perm, &qperm, tol_ddPQ, &nLU, &nI, reordering_type);
405 break;
406 case 0: case 1:
407 hypre_ILUGetLocalPerm(matA, &perm, &nLU, reordering_type);
408 break;
409 default:
410 hypre_ILUGetLocalPerm(matA, &perm, &nLU, reordering_type);
411 break;
412 }
413 }
414 // m = n - nLU;
415 /* factorization */
416 switch(ilu_type)
417 {
418 case 0:
419 #ifdef HYPRE_USING_CUDA
420 /* only apply the setup of ILU0 with cusparse */
421 if (fill_level == 0)
422 {
423 hypre_ILUSetupILU0Device(matA, perm, perm, n, n, matL_des, matU_des, ilu_solve_policy, &ilu_solve_buffer,
424 &matBL_info, &matBU_info, &matSL_info, &matSU_info, &matBLU_d, &matS,
425 &matE_d, &matF_d, &A_diag_fake);//BJ + cusparse_ilu0()
426 }
427 else
428 {
429 hypre_ILUSetupILUKDevice(matA, fill_level, perm, perm, n, n, matL_des, matU_des, ilu_solve_policy, &ilu_solve_buffer,
430 &matBL_info, &matBU_info, &matSL_info, &matSU_info, &matBLU_d, &matS,
431 &matE_d, &matF_d, &A_diag_fake);//BJ + hypre_iluk(), setup the device solve
432 }
433 #else
434 hypre_ILUSetupILUK(matA, fill_level, perm, perm, n, n, &matL, &matD, &matU, &matS, &u_end); //BJ + hypre_iluk()
435 #endif
436 break;
437 case 1:
438 #ifdef HYPRE_USING_CUDA
439 hypre_ILUSetupILUTDevice(matA, max_row_elmts, droptol, perm, perm, n, n, matL_des, matU_des, ilu_solve_policy, &ilu_solve_buffer,
440 &matBL_info, &matBU_info, &matSL_info, &matSU_info, &matBLU_d, &matS,
441 &matE_d, &matF_d, &A_diag_fake);//BJ + hypre_ilut(), setup the device solve
442 #else
443 hypre_ILUSetupILUT(matA, max_row_elmts, droptol, perm, perm, n, n, &matL, &matD, &matU, &matS, &u_end); //BJ + hypre_ilut()
444 #endif
445 break;
446 case 10:
447 #ifdef HYPRE_USING_CUDA
448 if (fill_level == 0)
449 {
450 /* Only support ILU0 */
451 hypre_ILUSetupILU0Device(matA, perm, perm, n, nLU, matL_des, matU_des, ilu_solve_policy, &ilu_solve_buffer,
452 &matBL_info, &matBU_info, &matSL_info, &matSU_info, &matBLU_d, &matS,
453 &matE_d, &matF_d, &A_diag_fake);//BJ + cusparse_ilu0()
454 }
455 else
456 {
457 hypre_ILUSetupILUKDevice(matA, fill_level, perm, perm, n, nLU, matL_des, matU_des, ilu_solve_policy, &ilu_solve_buffer,
458 &matBL_info, &matBU_info, &matSL_info, &matSU_info, &matBLU_d, &matS,
459 &matE_d, &matF_d, &A_diag_fake);//BJ + cusparse_ilu0()
460 }
461 #else
462 hypre_ILUSetupILUK(matA, fill_level, perm, perm, nLU, nLU, &matL, &matD, &matU, &matS, &u_end); //GMRES + hypre_iluk()
463 #endif
464 break;
465 case 11:
466 #ifdef HYPRE_USING_CUDA
467 hypre_ILUSetupILUTDevice(matA, max_row_elmts, droptol, perm, perm, n, nLU, matL_des, matU_des, ilu_solve_policy, &ilu_solve_buffer,
468 &matBL_info, &matBU_info, &matSL_info, &matSU_info, &matBLU_d, &matS,
469 &matE_d, &matF_d, &A_diag_fake);//BJ + cusparse_ilu0()
470 #else
471 hypre_ILUSetupILUT(matA, max_row_elmts, droptol, perm, perm, nLU, nLU, &matL, &matD, &matU, &matS, &u_end); //GMRES + hypre_ilut()
472 #endif
473 break;
474 case 20: hypre_ILUSetupILUK(matA, fill_level, perm, perm, nLU, nLU, &matL, &matD, &matU, &matS, &u_end); //Newton Schulz Hotelling + hypre_iluk()
475 break;
476 case 21: hypre_ILUSetupILUT(matA, max_row_elmts, droptol, perm, perm, nLU, nLU, &matL, &matD, &matU, &matS, &u_end); //Newton Schulz Hotelling + hypre_ilut()
477 break;
478 case 30: hypre_ILUSetupILUKRAS(matA, fill_level, perm, nLU, &matL, &matD, &matU); //RAS + hypre_iluk()
479 break;
480 case 31: hypre_ILUSetupILUTRAS(matA, max_row_elmts, droptol, perm, nLU, &matL, &matD, &matU); //RAS + hypre_ilut()
481 break;
482 case 40: hypre_ILUSetupILUK(matA, fill_level, perm, qperm, nLU, nI, &matL, &matD, &matU, &matS, &u_end); //ddPQ + GMRES + hypre_iluk()
483 break;
484 case 41: hypre_ILUSetupILUT(matA, max_row_elmts, droptol, perm, qperm, nLU, nI, &matL, &matD, &matU, &matS, &u_end); //ddPQ + GMRES + hypre_ilut()
485 break;
486 case 50:
487 #ifdef HYPRE_USING_CUDA
488 test_opt = hypre_ParILUDataTestOption(ilu_data);
489 hypre_ILUSetupRAPILU0Device(matA, perm, n, nLU, matL_des, matU_des, ilu_solve_policy,
490 &ilu_solve_buffer, &matAL_info, &matAU_info, &matBL_info, &matBU_info, &matSL_info, &matSU_info,
491 &Aperm, &matS, &matALU_d, &matBLU_d, &matSLU_d, &matE_d, &matF_d, test_opt); //RAP + hypre_modified_ilu0
492 #else
493 hypre_ILUSetupRAPILU0(matA, perm, n, nLU, &matL, &matD, &matU, &matmL, &matmD, &matmU, &u_end); //RAP + hypre_modified_ilu0
494 #endif
495 break;
496 default:
497 #ifdef HYPRE_USING_CUDA
498 hypre_ILUSetupILU0Device(matA, perm, perm, n, n, matL_des, matU_des, ilu_solve_policy, &ilu_solve_buffer,
499 &matBL_info, &matBU_info, &matSL_info, &matSU_info, &matBLU_d, &matS,
500 &matE_d, &matF_d, &A_diag_fake);//BJ + cusparse_ilu0()
501 #else
502 hypre_ILUSetupILU0(matA, perm, perm, n, n, &matL, &matD, &matU, &matS, &u_end);//BJ + hypre_ilu0()
503 #endif
504 break;
505 }
506 /* setup Schur solver */
507 switch(ilu_type)
508 {
509 case 10: case 11:
510 if (matS)
511 {
512 #ifdef HYPRE_USING_CUDA
513 /* create working vectors */
514
515 Xtemp = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(matS),
516 hypre_ParCSRMatrixGlobalNumRows(matS),
517 hypre_ParCSRMatrixRowStarts(matS));
518 hypre_ParVectorInitialize(Xtemp);
519
520 Ytemp = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(matS),
521 hypre_ParCSRMatrixGlobalNumRows(matS),
522 hypre_ParCSRMatrixRowStarts(matS));
523 hypre_ParVectorInitialize(Ytemp);
524
525 Ftemp_upper = hypre_SeqVectorCreate(nLU);
526 hypre_VectorOwnsData(Ftemp_upper) = 0;
527 hypre_VectorData(Ftemp_upper) = hypre_VectorData(hypre_ParVectorLocalVector(Ftemp));
528 hypre_SeqVectorInitialize(Ftemp_upper);
529
530 Utemp_lower = hypre_SeqVectorCreate(n - nLU);
531 hypre_VectorOwnsData(Utemp_lower) = 0;
532 hypre_VectorData(Utemp_lower) = hypre_VectorData(hypre_ParVectorLocalVector(Utemp)) + nLU;
533 hypre_SeqVectorInitialize(Utemp_lower);
534
535 /* create GMRES */
536 // HYPRE_ParCSRGMRESCreate(comm, &schur_solver);
537
538 hypre_GMRESFunctions * gmres_functions;
539
540 gmres_functions =
541 hypre_GMRESFunctionsCreate(
542 hypre_CAlloc,
543 hypre_ParKrylovFree,
544 hypre_ParILUCusparseSchurGMRESCommInfo, //parCSR A -> ilu_data
545 hypre_ParKrylovCreateVector,
546 hypre_ParKrylovCreateVectorArray,
547 hypre_ParKrylovDestroyVector,
548 hypre_ParILUCusparseSchurGMRESMatvecCreate, //parCSR A -- inactive
549 hypre_ParILUCusparseSchurGMRESMatvec, //parCSR A -> ilu_data
550 hypre_ParILUCusparseSchurGMRESMatvecDestroy, //parCSR A -- inactive
551 hypre_ParKrylovInnerProd,
552 hypre_ParKrylovCopyVector,
553 hypre_ParKrylovClearVector,
554 hypre_ParKrylovScaleVector,
555 hypre_ParKrylovAxpy,
556 hypre_ParKrylovIdentitySetup, //parCSR A -- inactive
557 hypre_ParKrylovIdentity ); //parCSR A -- inactive
558 schur_solver = ( (HYPRE_Solver) hypre_GMRESCreate( gmres_functions ) );
559
560 /* setup GMRES parameters */
561 HYPRE_GMRESSetKDim (schur_solver, hypre_ParILUDataSchurGMRESKDim(ilu_data));
562 HYPRE_GMRESSetMaxIter (schur_solver, hypre_ParILUDataSchurGMRESMaxIter(ilu_data));/* we don't need that many solves */
563 HYPRE_GMRESSetTol (schur_solver, hypre_ParILUDataSchurGMRESTol(ilu_data));
564 HYPRE_GMRESSetAbsoluteTol (schur_solver, hypre_ParILUDataSchurGMRESAbsoluteTol(ilu_data));
565 HYPRE_GMRESSetLogging (schur_solver, hypre_ParILUDataSchurSolverLogging(ilu_data));
566 HYPRE_GMRESSetPrintLevel (schur_solver, hypre_ParILUDataSchurSolverPrintLevel(ilu_data));/* set to zero now, don't print */
567 HYPRE_GMRESSetRelChange (schur_solver, hypre_ParILUDataSchurGMRESRelChange(ilu_data));
568
569 /* setup preconditioner parameters */
570 /* create Unit precond */
571 schur_precond = (HYPRE_Solver) ilu_vdata;
572 /* add preconditioner to solver */
573 HYPRE_GMRESSetPrecond(schur_solver,
574 (HYPRE_PtrToSolverFcn) hypre_ParILUCusparseSchurGMRESDummySolve,
575 (HYPRE_PtrToSolverFcn) hypre_ParILUCusparseSchurGMRESDummySetup,
576 schur_precond);
577 HYPRE_GMRESGetPrecond(schur_solver, &schur_precond_gotten);
578 if (schur_precond_gotten != (schur_precond))
579 {
580 hypre_printf("Schur complement got bad precond\n");
581 return(-1);
582 }
583
584 /* need to create working vector rhs and x for Schur System */
585 rhs = hypre_ParVectorCreate(comm,
586 hypre_ParCSRMatrixGlobalNumRows(matS),
587 hypre_ParCSRMatrixRowStarts(matS));
588 hypre_ParVectorInitialize(rhs);
589 x = hypre_ParVectorCreate(comm,
590 hypre_ParCSRMatrixGlobalNumRows(matS),
591 hypre_ParCSRMatrixRowStarts(matS));
592 hypre_ParVectorInitialize(x);
593
594 /* setup solver */
595 HYPRE_GMRESSetup(schur_solver,(HYPRE_Matrix)ilu_vdata,(HYPRE_Vector)rhs,(HYPRE_Vector)x);
596
597 /* solve for a right-hand side consisting only of ones */
598 hypre_Vector *rhs_local = hypre_ParVectorLocalVector(rhs);
599 //HYPRE_Real *Xtemp_data = hypre_VectorData(Xtemp_local);
600 hypre_SeqVectorSetConstantValues(rhs_local, 1.0);
601
602 /* update ilu_data */
603 hypre_ParILUDataSchurSolver (ilu_data) = schur_solver;
604 hypre_ParILUDataSchurPrecond (ilu_data) = schur_precond;
605 hypre_ParILUDataRhs (ilu_data) = rhs;
606 hypre_ParILUDataX (ilu_data) = x;
607 #else
608 /* setup GMRES parameters */
609 HYPRE_ParCSRGMRESCreate(comm, &schur_solver);
610
611 HYPRE_GMRESSetKDim (schur_solver, hypre_ParILUDataSchurGMRESKDim(ilu_data));
612 HYPRE_GMRESSetMaxIter (schur_solver, hypre_ParILUDataSchurGMRESMaxIter(ilu_data));/* we don't need that many solves */
613 HYPRE_GMRESSetTol (schur_solver, hypre_ParILUDataSchurGMRESTol(ilu_data));
614 HYPRE_GMRESSetAbsoluteTol (schur_solver, hypre_ParILUDataSchurGMRESAbsoluteTol(ilu_data));
615 HYPRE_GMRESSetLogging (schur_solver, hypre_ParILUDataSchurSolverLogging(ilu_data));
616 HYPRE_GMRESSetPrintLevel (schur_solver, hypre_ParILUDataSchurSolverPrintLevel(ilu_data));/* set to zero now, don't print */
617 HYPRE_GMRESSetRelChange (schur_solver, hypre_ParILUDataSchurGMRESRelChange(ilu_data));
618
619 /* setup preconditioner parameters */
620 /* create precond, the default is ILU0 */
621 HYPRE_ILUCreate (&schur_precond);
622 HYPRE_ILUSetType (schur_precond, hypre_ParILUDataSchurPrecondIluType(ilu_data));
623 HYPRE_ILUSetLevelOfFill (schur_precond, hypre_ParILUDataSchurPrecondIluLfil(ilu_data));
624 HYPRE_ILUSetMaxNnzPerRow (schur_precond, hypre_ParILUDataSchurPrecondIluMaxRowNnz(ilu_data));
625 HYPRE_ILUSetDropThresholdArray(schur_precond, hypre_ParILUDataSchurPrecondIluDroptol(ilu_data));
626 HYPRE_ILUSetPrintLevel (schur_precond, hypre_ParILUDataSchurPrecondPrintLevel(ilu_data));
627 HYPRE_ILUSetMaxIter (schur_precond, hypre_ParILUDataSchurPrecondMaxIter(ilu_data));
628 HYPRE_ILUSetTol (schur_precond, hypre_ParILUDataSchurPrecondTol(ilu_data));
629
630 /* add preconditioner to solver */
631 HYPRE_GMRESSetPrecond(schur_solver,
632 (HYPRE_PtrToSolverFcn) HYPRE_ILUSolve,
633 (HYPRE_PtrToSolverFcn) HYPRE_ILUSetup,
634 schur_precond);
635 HYPRE_GMRESGetPrecond(schur_solver, &schur_precond_gotten);
636 if (schur_precond_gotten != (schur_precond))
637 {
638 hypre_printf("Schur complement got bad precond\n");
639 HYPRE_ANNOTATE_FUNC_END;
640
641 return(-1);
642 }
643
644 /* need to create working vector rhs and x for Schur System */
645 rhs = hypre_ParVectorCreate(comm,
646 hypre_ParCSRMatrixGlobalNumRows(matS),
647 hypre_ParCSRMatrixRowStarts(matS));
648 hypre_ParVectorInitialize(rhs);
649 x = hypre_ParVectorCreate(comm,
650 hypre_ParCSRMatrixGlobalNumRows(matS),
651 hypre_ParCSRMatrixRowStarts(matS));
652 hypre_ParVectorInitialize(x);
653
654 /* setup solver */
655 HYPRE_GMRESSetup(schur_solver,(HYPRE_Matrix)matS,(HYPRE_Vector)rhs,(HYPRE_Vector)x);
656
657 /* update ilu_data */
658 hypre_ParILUDataSchurSolver (ilu_data) = schur_solver;
659 hypre_ParILUDataSchurPrecond (ilu_data) = schur_precond;
660 hypre_ParILUDataRhs (ilu_data) = rhs;
661 hypre_ParILUDataX (ilu_data) = x;
662 #endif
663 }
664 break;
665 case 20: case 21:
666 if (matS)
667 {
668 /* approximate inverse preconditioner */
669 schur_solver = (HYPRE_Solver)hypre_NSHCreate();
670
671 /* set NSH parameters */
672 hypre_NSHSetMaxIter (schur_solver, hypre_ParILUDataSchurNSHSolveMaxIter(ilu_data));
673 hypre_NSHSetTol (schur_solver, hypre_ParILUDataSchurNSHSolveTol(ilu_data));
674 hypre_NSHSetLogging (schur_solver, hypre_ParILUDataSchurSolverLogging(ilu_data));
675 hypre_NSHSetPrintLevel (schur_solver, hypre_ParILUDataSchurSolverPrintLevel(ilu_data));
676 hypre_NSHSetDropThresholdArray(schur_solver, hypre_ParILUDataSchurNSHDroptol(ilu_data));
677
678 hypre_NSHSetNSHMaxIter (schur_solver, hypre_ParILUDataSchurNSHMaxNumIter(ilu_data));
679 hypre_NSHSetNSHMaxRowNnz (schur_solver, hypre_ParILUDataSchurNSHMaxRowNnz(ilu_data));
680 hypre_NSHSetNSHTol (schur_solver, hypre_ParILUDataSchurNSHTol(ilu_data));
681
682 hypre_NSHSetMRMaxIter (schur_solver, hypre_ParILUDataSchurMRMaxIter(ilu_data));
683 hypre_NSHSetMRMaxRowNnz (schur_solver, hypre_ParILUDataSchurMRMaxRowNnz(ilu_data));
684 hypre_NSHSetMRTol (schur_solver, hypre_ParILUDataSchurMRTol(ilu_data));
685 hypre_NSHSetColVersion (schur_solver, hypre_ParILUDataSchurMRColVersion(ilu_data));
686
687 /* need to create working vector rhs and x for Schur System */
688 rhs = hypre_ParVectorCreate(comm,
689 hypre_ParCSRMatrixGlobalNumRows(matS),
690 hypre_ParCSRMatrixRowStarts(matS));
691 hypre_ParVectorInitialize(rhs);
692 x = hypre_ParVectorCreate(comm,
693 hypre_ParCSRMatrixGlobalNumRows(matS),
694 hypre_ParCSRMatrixRowStarts(matS));
695 hypre_ParVectorInitialize(x);
696
697 /* setup solver */
698 hypre_NSHSetup(schur_solver,matS,rhs,x);
699
700 hypre_ParILUDataSchurSolver(ilu_data) = schur_solver;
701 hypre_ParILUDataRhs (ilu_data) = rhs;
702 hypre_ParILUDataX (ilu_data) = x;
703 }
704 break;
705 case 30 : case 31:
706 /* now check communication package */
707 comm_pkg = hypre_ParCSRMatrixCommPkg(matA);
708 /* create if not yet built */
709 if (!comm_pkg)
710 {
711 hypre_MatvecCommPkgCreate(matA);
712 comm_pkg = hypre_ParCSRMatrixCommPkg(matA);
713 }
714 /* create uext and fext */
715 send_size = hypre_ParCSRCommPkgSendMapStart(comm_pkg,hypre_ParCSRCommPkgNumSends(comm_pkg))
716 - hypre_ParCSRCommPkgSendMapStart(comm_pkg,0);
717 recv_size = hypre_CSRMatrixNumCols(hypre_ParCSRMatrixOffd(matA));
718 buffer_size = send_size > recv_size ? send_size : recv_size;
719 fext = hypre_TAlloc(HYPRE_Real,buffer_size,HYPRE_MEMORY_HOST);
720 uext = hypre_TAlloc(HYPRE_Real,buffer_size,HYPRE_MEMORY_HOST);
721 break;
722 case 40: case 41:
723 if (matS)
724 {
725 /* setup GMRES parameters */
726 HYPRE_ParCSRGMRESCreate(comm, &schur_solver);
727
728 HYPRE_GMRESSetKDim (schur_solver, hypre_ParILUDataSchurGMRESKDim(ilu_data));
729 HYPRE_GMRESSetMaxIter (schur_solver, hypre_ParILUDataSchurGMRESMaxIter(ilu_data));/* we don't need that many solves */
730 HYPRE_GMRESSetTol (schur_solver, hypre_ParILUDataSchurGMRESTol(ilu_data));
731 HYPRE_GMRESSetAbsoluteTol (schur_solver, hypre_ParILUDataSchurGMRESAbsoluteTol(ilu_data));
732 HYPRE_GMRESSetLogging (schur_solver, hypre_ParILUDataSchurSolverLogging(ilu_data));
733 HYPRE_GMRESSetPrintLevel (schur_solver, hypre_ParILUDataSchurSolverPrintLevel(ilu_data));/* set to zero now, don't print */
734 HYPRE_GMRESSetRelChange (schur_solver, hypre_ParILUDataSchurGMRESRelChange(ilu_data));
735
736 /* setup preconditioner parameters */
737 /* create precond, the default is ILU0 */
738 HYPRE_ILUCreate (&schur_precond);
739 HYPRE_ILUSetType (schur_precond, hypre_ParILUDataSchurPrecondIluType(ilu_data));
740 HYPRE_ILUSetLevelOfFill (schur_precond, hypre_ParILUDataSchurPrecondIluLfil(ilu_data));
741 HYPRE_ILUSetMaxNnzPerRow (schur_precond, hypre_ParILUDataSchurPrecondIluMaxRowNnz(ilu_data));
742 HYPRE_ILUSetDropThresholdArray(schur_precond, hypre_ParILUDataSchurPrecondIluDroptol(ilu_data));
743 HYPRE_ILUSetPrintLevel (schur_precond, hypre_ParILUDataSchurPrecondPrintLevel(ilu_data));
744 HYPRE_ILUSetMaxIter (schur_precond, hypre_ParILUDataSchurPrecondMaxIter(ilu_data));
745 HYPRE_ILUSetTol (schur_precond, hypre_ParILUDataSchurPrecondTol(ilu_data));
746
747 /* add preconditioner to solver */
748 HYPRE_GMRESSetPrecond(schur_solver,
749 (HYPRE_PtrToSolverFcn) HYPRE_ILUSolve,
750 (HYPRE_PtrToSolverFcn) HYPRE_ILUSetup,
751 schur_precond);
752 HYPRE_GMRESGetPrecond(schur_solver, &schur_precond_gotten);
753 if (schur_precond_gotten != (schur_precond))
754 {
755 hypre_printf("Schur complement got bad precond\n");
756 return(-1);
757 }
758
759 /* need to create working vector rhs and x for Schur System */
760 rhs = hypre_ParVectorCreate(comm,
761 hypre_ParCSRMatrixGlobalNumRows(matS),
762 hypre_ParCSRMatrixRowStarts(matS));
763 hypre_ParVectorInitialize(rhs);
764 x = hypre_ParVectorCreate(comm,
765 hypre_ParCSRMatrixGlobalNumRows(matS),
766 hypre_ParCSRMatrixRowStarts(matS));
767 hypre_ParVectorInitialize(x);
768
769 /* setup solver */
770 HYPRE_GMRESSetup(schur_solver,(HYPRE_Matrix)matS,(HYPRE_Vector)rhs,(HYPRE_Vector)x);
771
772 /* update ilu_data */
773 hypre_ParILUDataSchurSolver (ilu_data) = schur_solver;
774 hypre_ParILUDataSchurPrecond (ilu_data) = schur_precond;
775 hypre_ParILUDataRhs (ilu_data) = rhs;
776 hypre_ParILUDataX (ilu_data) = x;
777 }
778 break;
779 case 50:
780 {
781 #ifdef HYPRE_USING_CUDA
782 if (matS)
783 {
784 /* create working vectors */
785 Xtemp = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(matA),
786 hypre_ParCSRMatrixGlobalNumRows(matA),
787 hypre_ParCSRMatrixRowStarts(matA));
788 hypre_ParVectorInitialize(Xtemp);
789
790 Ytemp = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(matA),
791 hypre_ParCSRMatrixGlobalNumRows(matA),
792 hypre_ParCSRMatrixRowStarts(matA));
793 hypre_ParVectorInitialize(Ytemp);
794
795 Ftemp_upper = hypre_SeqVectorCreate(nLU);
796 hypre_VectorOwnsData(Ftemp_upper) = 0;
797 hypre_VectorData(Ftemp_upper) = hypre_VectorData(hypre_ParVectorLocalVector(Ftemp));
798 hypre_SeqVectorInitialize(Ftemp_upper);
799
800 Utemp_lower = hypre_SeqVectorCreate(n - nLU);
801 hypre_VectorOwnsData(Utemp_lower) = 0;
802 hypre_VectorData(Utemp_lower) = hypre_VectorData(hypre_ParVectorLocalVector(Utemp)) + nLU;
803 hypre_SeqVectorInitialize(Utemp_lower);
804
805 /* create GMRES */
806 // HYPRE_ParCSRGMRESCreate(comm, &schur_solver);
807
808 hypre_GMRESFunctions * gmres_functions;
809
810 gmres_functions =
811 hypre_GMRESFunctionsCreate(
812 hypre_CAlloc,
813 hypre_ParKrylovFree,
814 hypre_ParILUCusparseSchurGMRESCommInfo, //parCSR A -> ilu_data
815 hypre_ParKrylovCreateVector,
816 hypre_ParKrylovCreateVectorArray,
817 hypre_ParKrylovDestroyVector,
818 hypre_ParILURAPSchurGMRESMatvecCreate, //parCSR A -- inactive
819 hypre_ParILURAPSchurGMRESMatvec, //parCSR A -> ilu_data
820 hypre_ParILURAPSchurGMRESMatvecDestroy, //parCSR A -- inactive
821 hypre_ParKrylovInnerProd,
822 hypre_ParKrylovCopyVector,
823 hypre_ParKrylovClearVector,
824 hypre_ParKrylovScaleVector,
825 hypre_ParKrylovAxpy,
826 hypre_ParKrylovIdentitySetup, //parCSR A -- inactive
827 hypre_ParKrylovIdentity ); //parCSR A -- inactive
828 schur_solver = ( (HYPRE_Solver) hypre_GMRESCreate( gmres_functions ) );
829
830 /* setup GMRES parameters */
831 /* at least should apply 1 solve */
832 if (hypre_ParILUDataSchurGMRESKDim(ilu_data) == 0)
833 {
834 hypre_ParILUDataSchurGMRESKDim(ilu_data) ++;
835 }
836 HYPRE_GMRESSetKDim (schur_solver, hypre_ParILUDataSchurGMRESKDim(ilu_data));
837 HYPRE_GMRESSetMaxIter (schur_solver, hypre_ParILUDataSchurGMRESMaxIter(ilu_data));/* we don't need that many solves */
838 HYPRE_GMRESSetTol (schur_solver, hypre_ParILUDataSchurGMRESTol(ilu_data));
839 HYPRE_GMRESSetAbsoluteTol (schur_solver, hypre_ParILUDataSchurGMRESAbsoluteTol(ilu_data));
840 HYPRE_GMRESSetLogging (schur_solver, hypre_ParILUDataSchurSolverLogging(ilu_data));
841 HYPRE_GMRESSetPrintLevel (schur_solver, hypre_ParILUDataSchurSolverPrintLevel(ilu_data));/* set to zero now, don't print */
842 HYPRE_GMRESSetRelChange (schur_solver, hypre_ParILUDataSchurGMRESRelChange(ilu_data));
843
844 /* setup preconditioner parameters */
845 /* create Schur precond */
846 schur_precond = (HYPRE_Solver) ilu_vdata;
847 /* add preconditioner to solver */
848 HYPRE_GMRESSetPrecond(schur_solver,
849 (HYPRE_PtrToSolverFcn) hypre_ParILURAPSchurGMRESSolve,
850 //(HYPRE_PtrToSolverFcn) hypre_ParILUCusparseSchurGMRESDummySolve,
851 (HYPRE_PtrToSolverFcn) hypre_ParILURAPSchurGMRESDummySetup,
852 schur_precond);
853 HYPRE_GMRESGetPrecond(schur_solver, &schur_precond_gotten);
854 if (schur_precond_gotten != (schur_precond))
855 {
856 hypre_printf("Schur complement got bad precond\n");
857 return(-1);
858 }
859
860 /* need to create working vector rhs and x for Schur System */
861 rhs = hypre_ParVectorCreate(comm,
862 hypre_ParCSRMatrixGlobalNumRows(matS),
863 hypre_ParCSRMatrixRowStarts(matS));
864 hypre_ParVectorInitialize(rhs);
865 x = hypre_ParVectorCreate(comm,
866 hypre_ParCSRMatrixGlobalNumRows(matS),
867 hypre_ParCSRMatrixRowStarts(matS));
868 hypre_ParVectorInitialize(x);
869
870 /* setup solver */
871 HYPRE_GMRESSetup(schur_solver,(HYPRE_Matrix)ilu_vdata,(HYPRE_Vector)rhs,(HYPRE_Vector)x);
872
873 /* solve for right-hand-side consists of only 1 */
874 //hypre_Vector *rhs_local = hypre_ParVectorLocalVector(rhs);
875 //HYPRE_Real *Xtemp_data = hypre_VectorData(Xtemp_local);
876 //hypre_SeqVectorSetConstantValues(rhs_local, 1.0);
877
878 /* update ilu_data */
879 hypre_ParILUDataSchurSolver (ilu_data) = schur_solver;
880 hypre_ParILUDataSchurPrecond (ilu_data) = schur_precond;
881 hypre_ParILUDataRhs (ilu_data) = rhs;
882 hypre_ParILUDataX (ilu_data) = x;
883 }
884 #else
885 /* need to create working vector rhs and x for Schur System */
886 HYPRE_Int m = n - nLU;
887 HYPRE_BigInt S_total_rows, S_row_starts[2];
888 HYPRE_BigInt big_m = (HYPRE_BigInt)m;
889 hypre_MPI_Allreduce( &big_m, &S_total_rows, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
890
891 if ( S_total_rows > 0 )
892 {
893 /* create working vectors */
894 Xtemp = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(matA),
895 hypre_ParCSRMatrixGlobalNumRows(matA),
896 hypre_ParCSRMatrixRowStarts(matA));
897 hypre_ParVectorInitialize(Xtemp);
898
899 Ytemp = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(matA),
900 hypre_ParCSRMatrixGlobalNumRows(matA),
901 hypre_ParCSRMatrixRowStarts(matA));
902 hypre_ParVectorInitialize(Ytemp);
903
904 /* only do so when we hae the Schur Complement */
905 {
906 HYPRE_BigInt global_start;
907 hypre_MPI_Scan( &big_m, &global_start, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
908 S_row_starts[0] = global_start - m;
909 S_row_starts[1] = global_start;
910 }
911
912 rhs = hypre_ParVectorCreate(comm,
913 S_total_rows,
914 S_row_starts);
915 hypre_ParVectorInitialize(rhs);
916
917 x = hypre_ParVectorCreate(comm,
918 S_total_rows,
919 S_row_starts);
920 hypre_ParVectorInitialize(x);
921
922 /* add when necessary */
923 /* create GMRES */
924 // HYPRE_ParCSRGMRESCreate(comm, &schur_solver);
925
926 hypre_GMRESFunctions * gmres_functions;
927
928 gmres_functions =
929 hypre_GMRESFunctionsCreate(
930 hypre_CAlloc,
931 hypre_ParKrylovFree,
932 hypre_ParILURAPSchurGMRESCommInfoH, //parCSR A -> ilu_data
933 hypre_ParKrylovCreateVector,
934 hypre_ParKrylovCreateVectorArray,
935 hypre_ParKrylovDestroyVector,
936 hypre_ParILURAPSchurGMRESMatvecCreateH, //parCSR A -- inactive
937 hypre_ParILURAPSchurGMRESMatvecH, //parCSR A -> ilu_data
938 hypre_ParILURAPSchurGMRESMatvecDestroyH, //parCSR A -- inactive
939 hypre_ParKrylovInnerProd,
940 hypre_ParKrylovCopyVector,
941 hypre_ParKrylovClearVector,
942 hypre_ParKrylovScaleVector,
943 hypre_ParKrylovAxpy,
944 hypre_ParKrylovIdentitySetup, //parCSR A -- inactive
945 hypre_ParKrylovIdentity ); //parCSR A -- inactive
946 schur_solver = ( (HYPRE_Solver) hypre_GMRESCreate( gmres_functions ) );
947
948 /* setup GMRES parameters */
949 /* at least should apply 1 solve */
950 if (hypre_ParILUDataSchurGMRESKDim(ilu_data) == 0)
951 {
952 hypre_ParILUDataSchurGMRESKDim(ilu_data) ++;
953 }
954 HYPRE_GMRESSetKDim (schur_solver, hypre_ParILUDataSchurGMRESKDim(ilu_data));
955 HYPRE_GMRESSetMaxIter (schur_solver, hypre_ParILUDataSchurGMRESMaxIter(ilu_data));/* we don't need that many solves */
956 HYPRE_GMRESSetTol (schur_solver, hypre_ParILUDataSchurGMRESTol(ilu_data));
957 HYPRE_GMRESSetAbsoluteTol (schur_solver, hypre_ParILUDataSchurGMRESAbsoluteTol(ilu_data));
958 HYPRE_GMRESSetLogging (schur_solver, hypre_ParILUDataSchurSolverLogging(ilu_data));
959 HYPRE_GMRESSetPrintLevel (schur_solver, hypre_ParILUDataSchurSolverPrintLevel(ilu_data));/* set to zero now, don't print */
960 HYPRE_GMRESSetRelChange (schur_solver, hypre_ParILUDataSchurGMRESRelChange(ilu_data));
961
962 /* setup preconditioner parameters */
963 /* create Schur precond */
964 schur_precond = (HYPRE_Solver) ilu_vdata;
965 /* add preconditioner to solver */
966 HYPRE_GMRESSetPrecond(schur_solver,
967 (HYPRE_PtrToSolverFcn) hypre_ParILURAPSchurGMRESSolveH,
968 //(HYPRE_PtrToSolverFcn) hypre_ParILUCusparseSchurGMRESDummySolve,
969 (HYPRE_PtrToSolverFcn) hypre_ParILURAPSchurGMRESDummySetupH,
970 schur_precond);
971 HYPRE_GMRESGetPrecond(schur_solver, &schur_precond_gotten);
972 if (schur_precond_gotten != (schur_precond))
973 {
974 hypre_printf("Schur complement got bad precond\n");
975 return(-1);
976 }
977
978 /* setup solver */
979 HYPRE_GMRESSetup(schur_solver,(HYPRE_Matrix)ilu_vdata,(HYPRE_Vector)rhs,(HYPRE_Vector)x);
980
981 /* solve for right-hand-side consists of only 1 */
982 //hypre_Vector *rhs_local = hypre_ParVectorLocalVector(rhs);
983 //HYPRE_Real *Xtemp_data = hypre_VectorData(Xtemp_local);
984 //hypre_SeqVectorSetConstantValues(rhs_local, 1.0);
985 }
986 /* update ilu_data */
987 hypre_ParILUDataSchurSolver (ilu_data) = schur_solver;
988 hypre_ParILUDataSchurPrecond (ilu_data) = schur_precond;
989 hypre_ParILUDataRhs (ilu_data) = rhs;
990 hypre_ParILUDataX (ilu_data) = x;
991
992 #endif
993 break;
994 }
995 default:
996 break;
997 }
998 /* set pointers to ilu data */
999 #ifdef HYPRE_USING_CUDA
1000 /* set cusparse pointers */
1001 //hypre_ParILUDataILUSolveBuffer(ilu_data) = ilu_solve_buffer;
1002 hypre_ParILUDataMatAILUDevice(ilu_data) = matALU_d;
1003 hypre_ParILUDataMatBILUDevice(ilu_data) = matBLU_d;
1004 hypre_ParILUDataMatSILUDevice(ilu_data) = matSLU_d;
1005 hypre_ParILUDataMatEDevice(ilu_data) = matE_d;
1006 hypre_ParILUDataMatFDevice(ilu_data) = matF_d;
1007 hypre_ParILUDataILUSolveBuffer(ilu_data) = ilu_solve_buffer;
1008 hypre_ParILUDataMatALILUSolveInfo(ilu_data) = matAL_info;
1009 hypre_ParILUDataMatAUILUSolveInfo(ilu_data) = matAU_info;
1010 hypre_ParILUDataMatBLILUSolveInfo(ilu_data) = matBL_info;
1011 hypre_ParILUDataMatBUILUSolveInfo(ilu_data) = matBU_info;
1012 hypre_ParILUDataMatSLILUSolveInfo(ilu_data) = matSL_info;
1013 hypre_ParILUDataMatSUILUSolveInfo(ilu_data) = matSU_info;
1014 hypre_ParILUDataAperm(ilu_data) = Aperm;
1015 hypre_ParILUDataR(ilu_data) = R;
1016 hypre_ParILUDataP(ilu_data) = P;
1017 hypre_ParILUDataFTempUpper(ilu_data) = Ftemp_upper;
1018 hypre_ParILUDataUTempLower(ilu_data) = Utemp_lower;
1019 hypre_ParILUDataMatAFakeDiagonal(ilu_data) = A_diag_fake;
1020 #endif
1021 hypre_ParILUDataMatA(ilu_data) = matA;
1022 hypre_ParILUDataXTemp(ilu_data) = Xtemp;
1023 hypre_ParILUDataYTemp(ilu_data) = Ytemp;
1024 hypre_ParILUDataF(ilu_data) = F_array;
1025 hypre_ParILUDataU(ilu_data) = U_array;
1026 hypre_ParILUDataMatL(ilu_data) = matL;
1027 hypre_ParILUDataMatD(ilu_data) = matD;
1028 hypre_ParILUDataMatU(ilu_data) = matU;
1029 hypre_ParILUDataMatLModified(ilu_data) = matmL;
1030 hypre_ParILUDataMatDModified(ilu_data) = matmD;
1031 hypre_ParILUDataMatUModified(ilu_data) = matmU;
1032 hypre_ParILUDataMatS(ilu_data) = matS;
1033 hypre_ParILUDataCFMarkerArray(ilu_data) = CF_marker_array;
1034 hypre_ParILUDataPerm(ilu_data) = perm;
1035 hypre_ParILUDataQPerm(ilu_data) = qperm;
1036 hypre_ParILUDataNLU(ilu_data) = nLU;
1037 hypre_ParILUDataNI(ilu_data) = nI;
1038 hypre_ParILUDataUEnd(ilu_data) = u_end;
1039 hypre_ParILUDataUExt(ilu_data) = uext;
1040 hypre_ParILUDataFExt(ilu_data) = fext;
1041
1042 /* compute operator complexity */
1043 hypre_ParCSRMatrixSetDNumNonzeros(matA);
1044 nnzS = 0.0;
1045 /* size_C is the size of global coarse grid, upper left part */
1046 size_C = hypre_ParCSRMatrixGlobalNumRows(matA);
1047 /* switch to compute complexity */
1048
1049 #ifdef HYPRE_USING_CUDA
1050 HYPRE_Int nnzBEF = 0;
1051 HYPRE_Int nnzG;/* Global nnz */
1052 if (ilu_type == 0 && fill_level == 0)
1053 {
1054 /* The nnz is for sure 1.0 in this case */
1055 hypre_ParILUDataOperatorComplexity(ilu_data) = 1.0;
1056 }
1057 else if (ilu_type == 10 && fill_level == 0)
1058 {
1059 /* The nnz is the sum of different parts */
1060 if (matBLU_d)
1061 {
1062 nnzBEF += hypre_CSRMatrixNumNonzeros(matBLU_d);
1063 }
1064 if (matE_d)
1065 {
1066 nnzBEF += hypre_CSRMatrixNumNonzeros(matE_d);
1067 }
1068 if (matF_d)
1069 {
1070 nnzBEF += hypre_CSRMatrixNumNonzeros(matF_d);
1071 }
1072 hypre_MPI_Allreduce(&nnzBEF, &nnzG, 1, HYPRE_MPI_INT, hypre_MPI_SUM, comm);
1073 if (matS)
1074 {
1075 hypre_ParCSRMatrixSetDNumNonzeros(matS);
1076 nnzS = hypre_ParCSRMatrixDNumNonzeros(matS);
1077 /* if we have Schur system need to reduce it from size_C */
1078 }
1079 hypre_ParILUDataOperatorComplexity(ilu_data) = ((HYPRE_Real)nnzG + nnzS) /
1080 hypre_ParCSRMatrixDNumNonzeros(matA);
1081 }
1082 else if (ilu_type == 50)
1083 {
1084 hypre_ParILUDataOperatorComplexity(ilu_data) = 1.0;
1085 }
1086 else if (ilu_type == 0 || ilu_type == 1 || ilu_type == 10 || ilu_type == 11)
1087 {
1088 if (matBLU_d)
1089 {
1090 nnzBEF += hypre_CSRMatrixNumNonzeros(matBLU_d);
1091 }
1092 if (matE_d)
1093 {
1094 nnzBEF += hypre_CSRMatrixNumNonzeros(matE_d);
1095 }
1096 if (matF_d)
1097 {
1098 nnzBEF += hypre_CSRMatrixNumNonzeros(matF_d);
1099 }
1100 hypre_MPI_Allreduce(&nnzBEF, &nnzG, 1, HYPRE_MPI_INT, hypre_MPI_SUM, comm);
1101 if (matS)
1102 {
1103 hypre_ParCSRMatrixSetDNumNonzeros(matS);
1104 nnzS = hypre_ParCSRMatrixDNumNonzeros(matS);
1105 /* if we have Schur system need to reduce it from size_C */
1106 }
1107 hypre_ParILUDataOperatorComplexity(ilu_data) = ((HYPRE_Real)nnzG + nnzS) /
1108 hypre_ParCSRMatrixDNumNonzeros(matA);
1109 }
1110 else
1111 {
1112 #endif
1113 if (matS)
1114 {
1115 hypre_ParCSRMatrixSetDNumNonzeros(matS);
1116 nnzS = hypre_ParCSRMatrixDNumNonzeros(matS);
1117 /* if we have Schur system need to reduce it from size_C */
1118 size_C -= hypre_ParCSRMatrixGlobalNumRows(matS);
1119 switch(ilu_type)
1120 {
1121 case 10: case 11: case 40: case 41: case 50:
1122 /* now we need to compute the preconditioner */
1123 schur_precond_ilu = (hypre_ParILUData*) (hypre_ParILUDataSchurPrecond(ilu_data));
1124 /* borrow i for local nnz of S */
1125 i = hypre_CSRMatrixNumNonzeros(hypre_ParCSRMatrixOffd(matS));
1126 hypre_MPI_Allreduce(&i, &nnzS_offd, 1, HYPRE_MPI_INT, hypre_MPI_SUM, comm);
1127 nnzS = nnzS * hypre_ParILUDataOperatorComplexity(schur_precond_ilu) +nnzS_offd;
1128 break;
1129 case 20: case 21:
1130 schur_solver_nsh = (hypre_ParNSHData*) hypre_ParILUDataSchurSolver(ilu_data);
1131 nnzS = nnzS * (hypre_ParNSHDataOperatorComplexity(schur_solver_nsh));
1132 break;
1133 default:
1134 break;
1135 }
1136 }
1137
1138 hypre_ParILUDataOperatorComplexity(ilu_data) = ((HYPRE_Real)size_C + nnzS +
1139 hypre_ParCSRMatrixDNumNonzeros(matL) +
1140 hypre_ParCSRMatrixDNumNonzeros(matU))/
1141 hypre_ParCSRMatrixDNumNonzeros(matA);
1142 #ifdef HYPRE_USING_CUDA
1143 }
1144 #endif
1145 if ((my_id == 0) && (print_level > 0))
1146 {
1147 hypre_printf("ILU SETUP: operator complexity = %f \n", hypre_ParILUDataOperatorComplexity(ilu_data));
1148 }
1149
1150 if ( logging > 1 ) {
1151 residual =
1152 hypre_ParVectorCreate(hypre_ParCSRMatrixComm(matA),
1153 hypre_ParCSRMatrixGlobalNumRows(matA),
1154 hypre_ParCSRMatrixRowStarts(matA) );
1155 hypre_ParVectorInitialize(residual);
1156 hypre_ParILUDataResidual(ilu_data) = residual;
1157 }
1158 else{
1159 hypre_ParILUDataResidual(ilu_data) = NULL;
1160 }
1161 rel_res_norms = hypre_CTAlloc(HYPRE_Real, hypre_ParILUDataMaxIter(ilu_data), HYPRE_MEMORY_HOST);
1162 hypre_ParILUDataRelResNorms(ilu_data) = rel_res_norms;
1163 HYPRE_ANNOTATE_FUNC_END;
1164
1165 return hypre_error_flag;
1166 }
1167
1168 #ifdef HYPRE_USING_CUDA
1169
1170 /* Extract submatrix from diagonal part of A into a new CSRMatrix without sort rows
1171 * WARNING: We don't put diagonal to the first entry of each row since this function is now for cuSparse only
1172 * A = input matrix
1173 * perm = permutation array indicating ordering of rows. Perm could come from a
1174 * CF_marker array or a reordering routine.
1175 * rqperm = reverse permutation array indicating ordering of columns
1176 * A_diagp = pointer to the output diagonal matrix.
1177 */
1178 HYPRE_Int
hypre_ParILUCusparseExtractDiagonalCSR(hypre_ParCSRMatrix * A,HYPRE_Int * perm,HYPRE_Int * rqperm,hypre_CSRMatrix ** A_diagp)1179 hypre_ParILUCusparseExtractDiagonalCSR( hypre_ParCSRMatrix *A,
1180 HYPRE_Int *perm,
1181 HYPRE_Int *rqperm,
1182 hypre_CSRMatrix **A_diagp )
1183 {
1184 /* Get necessary slots */
1185 hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
1186 HYPRE_Int *A_diag_i = hypre_CSRMatrixI(A_diag);
1187 HYPRE_Int *A_diag_j = hypre_CSRMatrixJ(A_diag);
1188 HYPRE_Real *A_diag_data = hypre_CSRMatrixData(A_diag);
1189 HYPRE_Int n = hypre_CSRMatrixNumRows(A_diag);
1190 HYPRE_Int nnz_A_diag = A_diag_i[n];
1191
1192 HYPRE_Int i, j, current_idx;
1193
1194 /* No schur complement makes everything easy :) */
1195 hypre_CSRMatrix *B = NULL;
1196 B = hypre_CSRMatrixCreate(n, n, nnz_A_diag);
1197 hypre_CSRMatrixInitialize(B);
1198 HYPRE_Int *B_i = hypre_CSRMatrixI(B);
1199 HYPRE_Int *B_j = hypre_CSRMatrixJ(B);
1200 HYPRE_Real *B_data = hypre_CSRMatrixData(B);
1201
1202 /* Copy everything in with permutation */
1203 current_idx = 0;
1204 for ( i = 0; i < n; i++ )
1205 {
1206 B_i[i] = current_idx;
1207 for (j = A_diag_i[perm[i]] ; j < A_diag_i[perm[i]+1] ; j ++)
1208 {
1209 B_j[current_idx] = rqperm[A_diag_j[j]];
1210 B_data[current_idx++] = A_diag_data[j];
1211 }
1212 }
1213 B_i[n] = current_idx;
1214
1215 hypre_assert(current_idx == nnz_A_diag);
1216 *A_diagp = B;
1217
1218 return hypre_error_flag;
1219 }
1220
1221 /* Extract submatrix from diagonal part of A into a
1222 * | B F |
1223 * | E C |
1224 * Struct in order to do ILU with cusparse.
1225 * WARNING: Cusparse requires each row been sorted by column
1226 * This function only works when rows are sorted!.
1227 * A = input matrix
1228 * perm = permutation array indicating ordering of rows. Perm could come from a
1229 * CF_marker array or a reordering routine.
1230 * qperm = permutation array indicating ordering of columns
1231 * Bp = pointer to the output B matrix.
1232 * Cp = pointer to the output C matrix.
1233 * Ep = pointer to the output E matrix.
1234 * Fp = pointer to the output F matrix.
1235 */
1236 HYPRE_Int
hypre_ParILUCusparseILUExtractEBFC(hypre_CSRMatrix * A_diag,HYPRE_Int nLU,hypre_CSRMatrix ** Bp,hypre_CSRMatrix ** Cp,hypre_CSRMatrix ** Ep,hypre_CSRMatrix ** Fp)1237 hypre_ParILUCusparseILUExtractEBFC(hypre_CSRMatrix *A_diag, HYPRE_Int nLU, hypre_CSRMatrix **Bp, hypre_CSRMatrix **Cp, hypre_CSRMatrix **Ep, hypre_CSRMatrix **Fp)
1238 {
1239 /* Get necessary slots */
1240 HYPRE_Int *A_diag_i = hypre_CSRMatrixI(A_diag);
1241 HYPRE_Int *A_diag_j = hypre_CSRMatrixJ(A_diag);
1242 HYPRE_Real *A_diag_data = hypre_CSRMatrixData(A_diag);
1243 HYPRE_Int n = hypre_CSRMatrixNumRows(A_diag);
1244 HYPRE_Int nnz_A_diag = A_diag_i[n];
1245
1246 HYPRE_Int i, j, row, col;
1247
1248 hypre_assert(nLU >= 0 && nLU <= n);
1249
1250 if (nLU == n)
1251 {
1252 /* No schur complement makes everything easy :) */
1253 hypre_CSRMatrix *B = NULL;
1254 hypre_CSRMatrix *C = NULL;
1255 hypre_CSRMatrix *E = NULL;
1256 hypre_CSRMatrix *F = NULL;
1257 B = hypre_CSRMatrixCreate(n, n, nnz_A_diag);
1258 hypre_CSRMatrixInitialize(B);
1259 hypre_CSRMatrixCopy(A_diag, B, 1);
1260 C = hypre_CSRMatrixCreate(0, 0, 0);
1261 hypre_CSRMatrixInitialize(C);
1262 E = hypre_CSRMatrixCreate(0, 0, 0);
1263 hypre_CSRMatrixInitialize(E);
1264 F = hypre_CSRMatrixCreate(0, 0, 0);
1265 hypre_CSRMatrixInitialize(F);
1266 *Bp = B;
1267 *Cp = C;
1268 *Ep = E;
1269 *Fp = F;
1270 }
1271 else if (nLU ==0)
1272 {
1273 /* All schur complement also makes everything easy :) */
1274 hypre_CSRMatrix *B = NULL;
1275 hypre_CSRMatrix *C = NULL;
1276 hypre_CSRMatrix *E = NULL;
1277 hypre_CSRMatrix *F = NULL;
1278 C = hypre_CSRMatrixCreate(n, n, nnz_A_diag);
1279 hypre_CSRMatrixInitialize(C);
1280 hypre_CSRMatrixCopy(A_diag, C, 1);
1281 B = hypre_CSRMatrixCreate(0, 0, 0);
1282 hypre_CSRMatrixInitialize(B);
1283 E = hypre_CSRMatrixCreate(0, 0, 0);
1284 hypre_CSRMatrixInitialize(E);
1285 F = hypre_CSRMatrixCreate(0, 0, 0);
1286 hypre_CSRMatrixInitialize(F);
1287 *Bp = B;
1288 *Cp = C;
1289 *Ep = E;
1290 *Fp = F;
1291 }
1292 else
1293 {
1294 /* Has schur complement :( */
1295 HYPRE_Int m = n - nLU;
1296 hypre_CSRMatrix *B = NULL;
1297 hypre_CSRMatrix *C = NULL;
1298 hypre_CSRMatrix *E = NULL;
1299 hypre_CSRMatrix *F = NULL;
1300 HYPRE_Int capacity_B;
1301 HYPRE_Int capacity_E;
1302 HYPRE_Int capacity_F;
1303 HYPRE_Int capacity_C;
1304 HYPRE_Int ctrB;
1305 HYPRE_Int ctrC;
1306 HYPRE_Int ctrE;
1307 HYPRE_Int ctrF;
1308
1309 HYPRE_Int *B_i = NULL;
1310 HYPRE_Int *C_i = NULL;
1311 HYPRE_Int *E_i = NULL;
1312 HYPRE_Int *F_i = NULL;
1313 HYPRE_Int *B_j = NULL;
1314 HYPRE_Int *C_j = NULL;
1315 HYPRE_Int *E_j = NULL;
1316 HYPRE_Int *F_j = NULL;
1317 HYPRE_Real *B_data = NULL;
1318 HYPRE_Real *C_data = NULL;
1319 HYPRE_Real *E_data = NULL;
1320 HYPRE_Real *F_data = NULL;
1321
1322 /* Create CSRMatrices */
1323 B = hypre_CSRMatrixCreate(nLU, nLU, 0);
1324 hypre_CSRMatrixInitialize(B);
1325 C = hypre_CSRMatrixCreate(m, m, 0);
1326 hypre_CSRMatrixInitialize(C);
1327 E = hypre_CSRMatrixCreate(m, nLU, 0);
1328 hypre_CSRMatrixInitialize(E);
1329 F = hypre_CSRMatrixCreate(nLU, m, 0);
1330 hypre_CSRMatrixInitialize(F);
1331
1332 /* Estimate # of nonzeros */
1333 capacity_B = nLU + ceil(nnz_A_diag * 1.0 * nLU / n * nLU / n);
1334 capacity_C = m + ceil(nnz_A_diag * 1.0 * m / n * m / n);
1335 capacity_E = hypre_min(m, nLU) + ceil(nnz_A_diag * 1.0 * nLU / n * m / n);
1336 capacity_F = capacity_E;
1337
1338 /* Allocate memory */
1339 B_i = hypre_CSRMatrixI(B);
1340 B_j = hypre_CTAlloc(HYPRE_Int, capacity_B, HYPRE_MEMORY_DEVICE);
1341 B_data = hypre_CTAlloc(HYPRE_Real, capacity_B, HYPRE_MEMORY_DEVICE);
1342 C_i = hypre_CSRMatrixI(C);
1343 C_j = hypre_CTAlloc(HYPRE_Int, capacity_C, HYPRE_MEMORY_DEVICE);
1344 C_data = hypre_CTAlloc(HYPRE_Real, capacity_C, HYPRE_MEMORY_DEVICE);
1345 E_i = hypre_CSRMatrixI(E);
1346 E_j = hypre_CTAlloc(HYPRE_Int, capacity_E, HYPRE_MEMORY_DEVICE);
1347 E_data = hypre_CTAlloc(HYPRE_Real, capacity_E, HYPRE_MEMORY_DEVICE);
1348 F_i = hypre_CSRMatrixI(F);
1349 F_j = hypre_CTAlloc(HYPRE_Int, capacity_F, HYPRE_MEMORY_DEVICE);
1350 F_data = hypre_CTAlloc(HYPRE_Real, capacity_F, HYPRE_MEMORY_DEVICE);
1351 ctrB = 0;
1352 ctrC = 0;
1353 ctrE = 0;
1354 ctrF = 0;
1355
1356 /* Loop to copy data */
1357 /* B and F first */
1358 for (i = 0; i < nLU; i++)
1359 {
1360 B_i[i] = ctrB;
1361 F_i[i] = ctrF;
1362 for (j = A_diag_i[i]; j < A_diag_i[i+1]; j++)
1363 {
1364 col = A_diag_j[j];
1365 if (col >= nLU)
1366 {
1367 break;
1368 }
1369 B_j[ctrB] = col;
1370 B_data[ctrB++] = A_diag_data[j];
1371 /* check capacity */
1372 if (ctrB >= capacity_B)
1373 {
1374 HYPRE_Int tmp;
1375 tmp = capacity_B;
1376 capacity_B = capacity_B * EXPAND_FACT + 1;
1377 B_j = hypre_TReAlloc_v2(B_j, HYPRE_Int, tmp, HYPRE_Int, capacity_B, HYPRE_MEMORY_DEVICE);
1378 B_data = hypre_TReAlloc_v2(B_data, HYPRE_Real, tmp, HYPRE_Real, capacity_B, HYPRE_MEMORY_DEVICE);
1379 }
1380 }
1381 for (; j < A_diag_i[i+1]; j++)
1382 {
1383 col = A_diag_j[j];
1384 col = col - nLU;
1385 F_j[ctrF] = col;
1386 F_data[ctrF++] = A_diag_data[j];
1387 if (ctrF >= capacity_F)
1388 {
1389 HYPRE_Int tmp;
1390 tmp = capacity_F;
1391 capacity_F = capacity_F * EXPAND_FACT + 1;
1392 F_j = hypre_TReAlloc_v2(F_j, HYPRE_Int, tmp, HYPRE_Int, capacity_F, HYPRE_MEMORY_DEVICE);
1393 F_data = hypre_TReAlloc_v2(F_data, HYPRE_Real, tmp, HYPRE_Real, capacity_F, HYPRE_MEMORY_DEVICE);
1394 }
1395 }
1396 }
1397 B_i[nLU] = ctrB;
1398 F_i[nLU] = ctrF;
1399
1400 /* E and C afterward */
1401 for (i = nLU; i < n; i++)
1402 {
1403 row = i - nLU;
1404 E_i[row] = ctrE;
1405 C_i[row] = ctrC;
1406 for (j = A_diag_i[i]; j < A_diag_i[i+1]; j++)
1407 {
1408 col = A_diag_j[j];
1409 if (col >= nLU)
1410 {
1411 break;
1412 }
1413 E_j[ctrE] = col;
1414 E_data[ctrE++] = A_diag_data[j];
1415 /* check capacity */
1416 if (ctrE >= capacity_E)
1417 {
1418 HYPRE_Int tmp;
1419 tmp = capacity_E;
1420 capacity_E = capacity_E * EXPAND_FACT + 1;
1421 E_j = hypre_TReAlloc_v2(E_j, HYPRE_Int, tmp, HYPRE_Int, capacity_E, HYPRE_MEMORY_DEVICE);
1422 E_data = hypre_TReAlloc_v2(E_data, HYPRE_Real, tmp, HYPRE_Real, capacity_E, HYPRE_MEMORY_DEVICE);
1423 }
1424 }
1425 for (; j < A_diag_i[i+1]; j++)
1426 {
1427 col = A_diag_j[j];
1428 col = col - nLU;
1429 C_j[ctrC] = col;
1430 C_data[ctrC++] = A_diag_data[j];
1431 if (ctrC >= capacity_C)
1432 {
1433 HYPRE_Int tmp;
1434 tmp = capacity_C;
1435 capacity_C = capacity_C * EXPAND_FACT + 1;
1436 C_j = hypre_TReAlloc_v2(C_j, HYPRE_Int, tmp, HYPRE_Int, capacity_C, HYPRE_MEMORY_DEVICE);
1437 C_data = hypre_TReAlloc_v2(C_data, HYPRE_Real, tmp, HYPRE_Real, capacity_C, HYPRE_MEMORY_DEVICE);
1438 }
1439 }
1440 }
1441 E_i[m] = ctrE;
1442 C_i[m] = ctrC;
1443
1444 hypre_assert((ctrB+ctrC+ctrE+ctrF) == nnz_A_diag);
1445
1446 /* Create CSRMatrices */
1447 hypre_CSRMatrixJ(B) = B_j;
1448 hypre_CSRMatrixData(B) = B_data;
1449 hypre_CSRMatrixNumNonzeros(B) = ctrB;
1450 hypre_CSRMatrixSetDataOwner(B, 1);
1451 *Bp = B;
1452
1453 hypre_CSRMatrixJ(C) = C_j;
1454 hypre_CSRMatrixData(C) = C_data;
1455 hypre_CSRMatrixNumNonzeros(C) = ctrC;
1456 hypre_CSRMatrixSetDataOwner(C, 1);
1457 *Cp = C;
1458
1459 hypre_CSRMatrixJ(E) = E_j;
1460 hypre_CSRMatrixData(E) = E_data;
1461 hypre_CSRMatrixNumNonzeros(E) = ctrE;
1462 hypre_CSRMatrixSetDataOwner(E, 1);
1463 *Ep = E;
1464
1465 hypre_CSRMatrixJ(F) = F_j;
1466 hypre_CSRMatrixData(F) = F_data;
1467 hypre_CSRMatrixNumNonzeros(F) = ctrF;
1468 hypre_CSRMatrixSetDataOwner(F, 1);
1469 *Fp = F;
1470 }
1471
1472 return hypre_error_flag;
1473 }
1474
1475 /* Wrapper for ILU0 with cusparse on a matrix, csr sort was done in this function */
1476 HYPRE_Int
HYPRE_ILUSetupCusparseCSRILU0(hypre_CSRMatrix * A,cusparseSolvePolicy_t ilu_solve_policy)1477 HYPRE_ILUSetupCusparseCSRILU0(hypre_CSRMatrix *A, cusparseSolvePolicy_t ilu_solve_policy)
1478 {
1479
1480 /* data objects for A */
1481 HYPRE_Int n = hypre_CSRMatrixNumRows(A);
1482 HYPRE_Int m = hypre_CSRMatrixNumCols(A);
1483
1484 hypre_assert(n == m);
1485
1486 HYPRE_Real *A_data = hypre_CSRMatrixData(A);
1487 HYPRE_Int *A_i = hypre_CSRMatrixI(A);
1488 HYPRE_Int *A_j = hypre_CSRMatrixJ(A);
1489 HYPRE_Int nnz_A = hypre_CSRMatrixNumNonzeros(A);
1490
1491 /* pointers to cusparse data */
1492 csrilu02Info_t matA_info = NULL;
1493
1494 /* variables and working arrays used during the ilu */
1495 HYPRE_Int zero_pivot;
1496 HYPRE_Int matA_buffersize;
1497 void *matA_buffer = NULL;
1498
1499 HYPRE_Int isDoublePrecision = sizeof(HYPRE_Complex) == sizeof(hypre_double);
1500 HYPRE_Int isSinglePrecision = sizeof(HYPRE_Complex) == sizeof(hypre_double) / 2;
1501
1502 cusparseHandle_t handle = hypre_HandleCusparseHandle(hypre_handle());
1503 cusparseMatDescr_t descr = hypre_CSRMatrixGPUMatDescr(A);
1504
1505 hypre_assert(isDoublePrecision || isSinglePrecision);
1506
1507 /* 1. Sort columns inside each row first, we can't assume that's sorted */
1508 hypre_SortCSRCusparse(n, m, nnz_A, descr, A_i, A_j, A_data);
1509
1510 /* 2. Create info for ilu setup and solve */
1511 HYPRE_CUSPARSE_CALL(cusparseCreateCsrilu02Info(&matA_info));
1512
1513 /* 3. Get working array size */
1514 if (isDoublePrecision)
1515 {
1516 HYPRE_CUSPARSE_CALL(cusparseDcsrilu02_bufferSize(handle, n, nnz_A, descr,
1517 (hypre_double *) A_data, A_i, A_j,
1518 matA_info, &matA_buffersize));
1519 }
1520 else if (isSinglePrecision)
1521 {
1522 HYPRE_CUSPARSE_CALL(cusparseScsrilu02_bufferSize(handle, n, nnz_A, descr,
1523 (float *) A_data, A_i, A_j,
1524 matA_info, &matA_buffersize));
1525 }
1526 /* 4. Create working array, since they won't be visited by host, allocate on device */
1527 matA_buffer = hypre_MAlloc(matA_buffersize, HYPRE_MEMORY_DEVICE);
1528
1529 /* 5. Now perform the analysis */
1530 /* 5-1. Analysis */
1531 if (isDoublePrecision)
1532 {
1533 HYPRE_CUSPARSE_CALL(cusparseDcsrilu02_analysis(handle, n, nnz_A, descr,
1534 (hypre_double *) A_data, A_i, A_j,
1535 matA_info, ilu_solve_policy, matA_buffer));
1536 }
1537 else if (isSinglePrecision)
1538 {
1539 HYPRE_CUSPARSE_CALL(cusparseScsrilu02_analysis(handle, n, nnz_A, descr,
1540 (float *) A_data, A_i, A_j,
1541 matA_info, ilu_solve_policy, matA_buffer));
1542 }
1543 /* 5-2. Check for zero pivot */
1544 HYPRE_CUSPARSE_CALL(cusparseXcsrilu02_zeroPivot(handle, matA_info, &zero_pivot));
1545
1546 /* 6. Apply the factorization */
1547 if (isDoublePrecision)
1548 {
1549 HYPRE_CUSPARSE_CALL(cusparseDcsrilu02(handle, n, nnz_A, descr,
1550 (hypre_double *) A_data, A_i, A_j,
1551 matA_info, ilu_solve_policy, matA_buffer));
1552 }
1553 else if (isSinglePrecision)
1554 {
1555 HYPRE_CUSPARSE_CALL(cusparseScsrilu02(handle, n, nnz_A, descr,
1556 (float *) A_data, A_i, A_j,
1557 matA_info, ilu_solve_policy, matA_buffer));
1558 }
1559
1560 /* Check for zero pivot */
1561 HYPRE_CUSPARSE_CALL(cusparseXcsrilu02_zeroPivot(handle, matA_info, &zero_pivot));
1562
1563 /* Done with factorization, finishing up */
1564 hypre_TFree(matA_buffer, HYPRE_MEMORY_DEVICE);
1565 HYPRE_CUSPARSE_CALL(cusparseDestroyCsrilu02Info(matA_info));
1566
1567 return hypre_error_flag;
1568 }
1569
1570 /* Wrapper for ILU0 solve analysis phase with cusparse on a matrix */
1571 HYPRE_Int
HYPRE_ILUSetupCusparseCSRILU0SetupSolve(hypre_CSRMatrix * A,cusparseMatDescr_t matL_des,cusparseMatDescr_t matU_des,cusparseSolvePolicy_t ilu_solve_policy,csrsv2Info_t * matL_infop,csrsv2Info_t * matU_infop,HYPRE_Int * buffer_sizep,void ** bufferp)1572 HYPRE_ILUSetupCusparseCSRILU0SetupSolve(hypre_CSRMatrix *A, cusparseMatDescr_t matL_des, cusparseMatDescr_t matU_des,
1573 cusparseSolvePolicy_t ilu_solve_policy, csrsv2Info_t *matL_infop, csrsv2Info_t *matU_infop,
1574 HYPRE_Int *buffer_sizep, void **bufferp)
1575 {
1576 if (!A)
1577 {
1578 /* return if A is NULL */
1579 *matL_infop = NULL;
1580 *matU_infop = NULL;
1581 *buffer_sizep = 0;
1582 *bufferp = NULL;
1583 return hypre_error_flag;
1584 }
1585
1586 /* data objects for A */
1587 HYPRE_Int n = hypre_CSRMatrixNumRows(A);
1588 HYPRE_Int m = hypre_CSRMatrixNumCols(A);
1589
1590 hypre_assert(n == m);
1591
1592 if (n == 0)
1593 {
1594 /* return if A is 0 by 0 */
1595 *matL_infop = NULL;
1596 *matU_infop = NULL;
1597 *buffer_sizep = 0;
1598 *bufferp = NULL;
1599 return hypre_error_flag;
1600 }
1601
1602 HYPRE_Real *A_data = hypre_CSRMatrixData(A);
1603 HYPRE_Int *A_i = hypre_CSRMatrixI(A);
1604 HYPRE_Int *A_j = hypre_CSRMatrixJ(A);
1605 HYPRE_Int nnz_A = A_i[n];
1606
1607 /* pointers to cusparse data */
1608 csrsv2Info_t matL_info = *matL_infop;
1609 csrsv2Info_t matU_info = *matU_infop;
1610
1611 /* clear data if already exists */
1612 if (matL_info)
1613 {
1614 HYPRE_CUSPARSE_CALL( cusparseDestroyCsrsv2Info(matL_info) );
1615 matL_info = NULL;
1616 }
1617 if (matU_info)
1618 {
1619 HYPRE_CUSPARSE_CALL( cusparseDestroyCsrsv2Info(matU_info) );
1620 matU_info = NULL;
1621 }
1622
1623 /* variables and working arrays used during the ilu */
1624 HYPRE_Int matL_buffersize;
1625 HYPRE_Int matU_buffersize;
1626 HYPRE_Int solve_buffersize;
1627 HYPRE_Int solve_oldbuffersize = *buffer_sizep;
1628 void *solve_buffer = *bufferp;
1629
1630 HYPRE_Int isDoublePrecision = sizeof(HYPRE_Complex) == sizeof(hypre_double);
1631 HYPRE_Int isSinglePrecision = sizeof(HYPRE_Complex) == sizeof(hypre_double) / 2;
1632
1633 hypre_assert(isDoublePrecision || isSinglePrecision);
1634
1635 cusparseHandle_t handle = hypre_HandleCusparseHandle(hypre_handle());
1636
1637 /* 1. Create info for ilu setup and solve */
1638 HYPRE_CUSPARSE_CALL(cusparseCreateCsrsv2Info(&(matL_info)));
1639 HYPRE_CUSPARSE_CALL(cusparseCreateCsrsv2Info(&(matU_info)));
1640
1641 /* 2. Get working array size */
1642 if (isDoublePrecision)
1643 {
1644
1645 HYPRE_CUSPARSE_CALL(cusparseDcsrsv2_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n, nnz_A,
1646 matL_des, (hypre_double *) A_data, A_i, A_j,
1647 matL_info, &matL_buffersize));
1648
1649 HYPRE_CUSPARSE_CALL(cusparseDcsrsv2_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n, nnz_A,
1650 matU_des, (hypre_double *) A_data, A_i, A_j,
1651 matU_info, &matU_buffersize));
1652 }
1653 else if (isSinglePrecision)
1654 {
1655
1656 HYPRE_CUSPARSE_CALL(cusparseScsrsv2_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n, nnz_A,
1657 matL_des, (float *) A_data, A_i, A_j,
1658 matL_info, &matL_buffersize));
1659
1660 HYPRE_CUSPARSE_CALL(cusparseScsrsv2_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, n, nnz_A,
1661 matU_des, (float *) A_data, A_i, A_j,
1662 matU_info, &matU_buffersize));
1663 }
1664 solve_buffersize = hypre_max( matL_buffersize, matU_buffersize );
1665 /* 3. Create working array, since they won't be visited by host, allocate on device */
1666 if (solve_buffersize > solve_oldbuffersize)
1667 {
1668 if (solve_buffer)
1669 {
1670 solve_buffer = hypre_ReAlloc_v2(solve_buffer, solve_oldbuffersize, solve_buffersize, HYPRE_MEMORY_DEVICE);
1671 }
1672 else
1673 {
1674 solve_buffer = hypre_MAlloc(solve_buffersize, HYPRE_MEMORY_DEVICE);
1675 }
1676 }
1677
1678 /* 4. Now perform the analysis */
1679 if (isDoublePrecision)
1680 {
1681
1682 HYPRE_CUSPARSE_CALL(cusparseDcsrsv2_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1683 n, nnz_A, matL_des,
1684 (hypre_double *) A_data, A_i, A_j,
1685 matL_info, ilu_solve_policy, solve_buffer));
1686
1687 HYPRE_CUSPARSE_CALL(cusparseDcsrsv2_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1688 n, nnz_A, matU_des,
1689 (hypre_double *) A_data, A_i, A_j,
1690 matU_info, ilu_solve_policy, solve_buffer));
1691 }
1692 else if (isSinglePrecision)
1693 {
1694
1695 HYPRE_CUSPARSE_CALL(cusparseScsrsv2_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1696 n, nnz_A, matL_des,
1697 (float *) A_data, A_i, A_j,
1698 matL_info, ilu_solve_policy, solve_buffer));
1699
1700 HYPRE_CUSPARSE_CALL(cusparseScsrsv2_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
1701 n, nnz_A, matU_des,
1702 (float *) A_data, A_i, A_j,
1703 matU_info, ilu_solve_policy, solve_buffer));
1704 }
1705
1706 /* Done with analysis, finishing up */
1707 /* Set return value */
1708 *matL_infop = matL_info;
1709 *matU_infop = matU_info;
1710 *buffer_sizep = solve_buffersize;
1711 *bufferp = solve_buffer;
1712
1713 return hypre_error_flag;
1714 }
1715
1716 /* ILU(0) (GPU)
1717 * A = input matrix
1718 * perm = permutation array indicating ordering of rows. Perm could come from a
1719 * CF_marker array or a reordering routine.
1720 * qperm = permutation array indicating ordering of columns
1721 * nI = number of interial unknowns
1722 * nLU = size of incomplete factorization, nLU should obey nLU <= nI.
1723 * Schur complement is formed if nLU < n
1724 * Lptr, Dptr, Uptr, Sptr = L, D, U, S factors. Note that with CUDA, Dptr and Uptr are unused
1725 * xtempp, ytempp = helper vector used in 2-level solve.
1726 * A_fake_diagp = fake diagonal for matvec
1727 * will form global Schur Matrix if nLU < n
1728 */
1729 HYPRE_Int
hypre_ILUSetupILU0Device(hypre_ParCSRMatrix * A,HYPRE_Int * perm,HYPRE_Int * qperm,HYPRE_Int n,HYPRE_Int nLU,cusparseMatDescr_t matL_des,cusparseMatDescr_t matU_des,cusparseSolvePolicy_t ilu_solve_policy,void ** bufferp,csrsv2Info_t * matBL_infop,csrsv2Info_t * matBU_infop,csrsv2Info_t * matSL_infop,csrsv2Info_t * matSU_infop,hypre_CSRMatrix ** BLUptr,hypre_ParCSRMatrix ** matSptr,hypre_CSRMatrix ** Eptr,hypre_CSRMatrix ** Fptr,HYPRE_Int ** A_fake_diag_ip)1730 hypre_ILUSetupILU0Device(hypre_ParCSRMatrix *A, HYPRE_Int *perm, HYPRE_Int *qperm, HYPRE_Int n, HYPRE_Int nLU,
1731 cusparseMatDescr_t matL_des, cusparseMatDescr_t matU_des, cusparseSolvePolicy_t ilu_solve_policy,
1732 void **bufferp, csrsv2Info_t *matBL_infop, csrsv2Info_t *matBU_infop,
1733 csrsv2Info_t *matSL_infop, csrsv2Info_t *matSU_infop,
1734 hypre_CSRMatrix **BLUptr, hypre_ParCSRMatrix **matSptr, hypre_CSRMatrix **Eptr, hypre_CSRMatrix **Fptr,
1735 HYPRE_Int **A_fake_diag_ip)
1736 {
1737 /* GPU-accelerated ILU0 with cusparse */
1738 HYPRE_Int i, j, k1, k2, k3, col;
1739
1740 /* communication stuffs for S */
1741 MPI_Comm comm = hypre_ParCSRMatrixComm(A);
1742
1743 HYPRE_Int my_id, num_procs;
1744 hypre_MPI_Comm_size(comm,&num_procs);
1745 hypre_MPI_Comm_rank(comm,&my_id);
1746
1747 hypre_ParCSRCommPkg *comm_pkg;
1748 hypre_ParCSRCommHandle *comm_handle;
1749 HYPRE_Int num_sends, begin, end;
1750 HYPRE_BigInt *send_buf = NULL;
1751 HYPRE_Int *rperm = NULL;
1752 HYPRE_Int *rqperm = NULL;
1753
1754 hypre_ParCSRMatrix *matS = NULL;
1755 hypre_CSRMatrix *A_diag = NULL;
1756 HYPRE_Int *A_fake_diag_i = NULL;
1757 hypre_CSRMatrix *A_offd = NULL;
1758 HYPRE_Int *A_offd_i = NULL;
1759 HYPRE_Int *A_offd_j = NULL;
1760 HYPRE_Real *A_offd_data = NULL;
1761 hypre_CSRMatrix *SLU = NULL;
1762 /* pointers to cusparse data */
1763 csrsv2Info_t matBL_info = NULL;
1764 csrsv2Info_t matBU_info = NULL;
1765 csrsv2Info_t matSL_info = NULL;
1766 csrsv2Info_t matSU_info = NULL;
1767
1768 HYPRE_Int buffer_size = 0;
1769 void *buffer = NULL;
1770
1771 /* variables for matS */
1772 HYPRE_Int m = n - nLU;
1773 HYPRE_Int nI = nLU;//use default
1774 HYPRE_Int e = 0;
1775 HYPRE_Int m_e = m;
1776 HYPRE_BigInt total_rows;
1777 HYPRE_BigInt col_starts[2];
1778 HYPRE_Int *S_diag_i = NULL;
1779 HYPRE_Int S_diag_nnz;
1780 hypre_CSRMatrix *S_offd = NULL;
1781 HYPRE_Int *S_offd_i = NULL;
1782 HYPRE_Int *S_offd_j = NULL;
1783 HYPRE_Real *S_offd_data = NULL;
1784 HYPRE_BigInt *S_offd_colmap = NULL;
1785 HYPRE_Int S_offd_nnz;
1786 HYPRE_Int S_offd_ncols;
1787
1788 /* set data slots */
1789 A_offd = hypre_ParCSRMatrixOffd(A);
1790 A_offd_i = hypre_CSRMatrixI(A_offd);
1791 A_offd_j = hypre_CSRMatrixJ(A_offd);
1792 A_offd_data = hypre_CSRMatrixData(A_offd);
1793
1794 /* unfortunately we need to build the reverse permutation array */
1795 rperm = hypre_CTAlloc(HYPRE_Int, n, HYPRE_MEMORY_HOST);
1796 rqperm = hypre_CTAlloc(HYPRE_Int, n, HYPRE_MEMORY_HOST);
1797 for (i = 0; i < n; i++)
1798 {
1799 rperm[perm[i]] = i;
1800 rqperm[qperm[i]] = i;
1801 }
1802
1803 /* Only call ILU when we really have a matrix on this processor */
1804 if (n > 0)
1805 {
1806 /* Copy diagonal matrix into a new place with permutation
1807 * That is, A_diag = A_diag(perm,qperm);
1808 */
1809 hypre_ParILUCusparseExtractDiagonalCSR(A, perm, rqperm, &A_diag);
1810
1811 /* Apply ILU factorization to the entile A_diag */
1812 HYPRE_ILUSetupCusparseCSRILU0(A_diag, ilu_solve_policy);
1813
1814 /* | L \ U (B) L^{-1}F |
1815 * | EU^{-1} L \ U (S)|
1816 * Extract submatrix L_B U_B, L_S U_S, EU_B^{-1}, L_B^{-1}F
1817 * Note that in this function after ILU, all rows are sorted
1818 * in a way different than HYPRE. Diagonal is not listed in the front
1819 */
1820 hypre_ParILUCusparseILUExtractEBFC(A_diag, nLU, BLUptr, &SLU, Eptr, Fptr);
1821 }
1822 else
1823 {
1824 *BLUptr = NULL;
1825 *Eptr = NULL;
1826 *Fptr = NULL;
1827 SLU = NULL;
1828 }
1829
1830 /* create B */
1831 /* only analyse when nacessary */
1832 if ( nLU > 0 )
1833 {
1834 /* Analysis of BILU */
1835 HYPRE_ILUSetupCusparseCSRILU0SetupSolve(*BLUptr, matL_des, matU_des,
1836 ilu_solve_policy, &matBL_info, &matBU_info,
1837 &buffer_size, &buffer);
1838 }
1839
1840 HYPRE_BigInt big_m = (HYPRE_BigInt)m;
1841 hypre_MPI_Allreduce(&big_m, &total_rows, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
1842 /* only form when total_rows > 0 */
1843 if ( total_rows > 0 )
1844 {
1845 /* now create S */
1846 /* need to get new column start */
1847 {
1848 HYPRE_BigInt global_start;
1849 hypre_MPI_Scan( &big_m, &global_start, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
1850 col_starts[0] = global_start - m;
1851 col_starts[1] = global_start;
1852 }
1853
1854 A_fake_diag_i = hypre_CTAlloc(HYPRE_Int, m + 1, HYPRE_MEMORY_DEVICE);
1855 if (SLU)
1856 {
1857 /* Analysis of SILU */
1858 HYPRE_ILUSetupCusparseCSRILU0SetupSolve(SLU, matL_des, matU_des,
1859 ilu_solve_policy, &matSL_info, &matSU_info,
1860 &buffer_size, &buffer);
1861 }
1862 else
1863 {
1864 SLU = hypre_CSRMatrixCreate(0,0,0);
1865 hypre_CSRMatrixInitialize(SLU);
1866 }
1867 S_diag_i = hypre_CSRMatrixI(SLU);
1868 S_diag_nnz = S_diag_i[m];
1869 /* Build ParCSRMatrix matS
1870 * For example when np == 3 the new matrix takes the following form
1871 * |IS_1 E_12 E_13|
1872 * |E_21 IS_2 E_22| = S
1873 * |E_31 E_32 IS_3|
1874 * In which IS_i is the cusparse ILU factorization of S_i in one matrix
1875 * */
1876
1877 /* We did nothing to A_offd, so all the data kept, just reorder them
1878 * The create function takes comm, global num rows/cols,
1879 * row/col start, num cols offd, nnz diag, nnz offd
1880 */
1881 S_offd_nnz = hypre_CSRMatrixNumNonzeros(A_offd);
1882 S_offd_ncols = hypre_CSRMatrixNumCols(A_offd);
1883
1884 matS = hypre_ParCSRMatrixCreate( comm,
1885 total_rows,
1886 total_rows,
1887 col_starts,
1888 col_starts,
1889 S_offd_ncols,
1890 S_diag_nnz,
1891 S_offd_nnz);
1892
1893 /* first put diagonal data in */
1894 hypre_CSRMatrixDestroy(hypre_ParCSRMatrixDiag(matS));
1895 hypre_ParCSRMatrixDiag(matS) = SLU;
1896
1897 /* now start to construct offdiag of S */
1898 S_offd = hypre_ParCSRMatrixOffd(matS);
1899 S_offd_i = hypre_TAlloc(HYPRE_Int, m+1, HYPRE_MEMORY_DEVICE);
1900 S_offd_j = hypre_TAlloc(HYPRE_Int, S_offd_nnz, HYPRE_MEMORY_DEVICE);
1901 S_offd_data = hypre_TAlloc(HYPRE_Real, S_offd_nnz, HYPRE_MEMORY_DEVICE);
1902 S_offd_colmap = hypre_CTAlloc(HYPRE_BigInt, S_offd_ncols, HYPRE_MEMORY_HOST);
1903
1904 /* simply use a loop to copy data from A_offd */
1905 S_offd_i[0] = 0;
1906 k3 = 0;
1907 for (i = 1; i <= e; i++)
1908 {
1909 S_offd_i[i] = k3;
1910 }
1911 for (i = 0; i < m_e; i++)
1912 {
1913 col = perm[i + nI];
1914 k1 = A_offd_i[col];
1915 k2 = A_offd_i[col+1];
1916 for (j = k1; j < k2; j++)
1917 {
1918 S_offd_j[k3] = A_offd_j[j];
1919 S_offd_data[k3++] = A_offd_data[j];
1920 }
1921 S_offd_i[i+1+e] = k3;
1922 }
1923
1924 /* give I, J, DATA to S_offd */
1925 hypre_CSRMatrixI(S_offd) = S_offd_i;
1926 hypre_CSRMatrixJ(S_offd) = S_offd_j;
1927 hypre_CSRMatrixData(S_offd) = S_offd_data;
1928
1929 /* now we need to update S_offd_colmap */
1930 comm_pkg = hypre_ParCSRMatrixCommPkg(A);
1931 /* setup comm_pkg if not yet built */
1932 if (!comm_pkg)
1933 {
1934 hypre_MatvecCommPkgCreate(A);
1935 comm_pkg = hypre_ParCSRMatrixCommPkg(A);
1936 }
1937 /* get total num of send */
1938 num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
1939 begin = hypre_ParCSRCommPkgSendMapStart(comm_pkg,0);
1940 end = hypre_ParCSRCommPkgSendMapStart(comm_pkg,num_sends);
1941 send_buf = hypre_TAlloc(HYPRE_BigInt, end - begin, HYPRE_MEMORY_HOST);
1942 /* copy new index into send_buf */
1943 for (i = begin; i < end; i++)
1944 {
1945 send_buf[i-begin] = rperm[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,i)] - nLU + col_starts[0];
1946 }
1947
1948 /* main communication */
1949 comm_handle = hypre_ParCSRCommHandleCreate(21, comm_pkg, send_buf, S_offd_colmap);
1950 hypre_ParCSRCommHandleDestroy(comm_handle);
1951
1952 /* setup index */
1953 hypre_ParCSRMatrixColMapOffd(matS) = S_offd_colmap;
1954
1955 hypre_ILUSortOffdColmap(matS);
1956
1957 /* free */
1958 hypre_TFree(send_buf, HYPRE_MEMORY_HOST);
1959 } /* end of forming S */
1960
1961 *matSptr = matS;
1962 *bufferp = buffer;
1963 *matBL_infop = matBL_info;
1964 *matBU_infop = matBU_info;
1965 *matSL_infop = matSL_info;
1966 *matSU_infop = matSU_info;
1967 *A_fake_diag_ip= A_fake_diag_i;
1968
1969 /* Destroy the bridge after acrossing the river */
1970 hypre_CSRMatrixDestroy(A_diag);
1971 hypre_TFree(rperm, HYPRE_MEMORY_HOST);
1972 hypre_TFree(rqperm, HYPRE_MEMORY_HOST);
1973
1974 return hypre_error_flag;
1975 }
1976
1977 HYPRE_Int
hypre_ILUSetupILUKDevice(hypre_ParCSRMatrix * A,HYPRE_Int lfil,HYPRE_Int * perm,HYPRE_Int * qperm,HYPRE_Int n,HYPRE_Int nLU,cusparseMatDescr_t matL_des,cusparseMatDescr_t matU_des,cusparseSolvePolicy_t ilu_solve_policy,void ** bufferp,csrsv2Info_t * matBL_infop,csrsv2Info_t * matBU_infop,csrsv2Info_t * matSL_infop,csrsv2Info_t * matSU_infop,hypre_CSRMatrix ** BLUptr,hypre_ParCSRMatrix ** matSptr,hypre_CSRMatrix ** Eptr,hypre_CSRMatrix ** Fptr,HYPRE_Int ** A_fake_diag_ip)1978 hypre_ILUSetupILUKDevice(hypre_ParCSRMatrix *A, HYPRE_Int lfil, HYPRE_Int *perm, HYPRE_Int *qperm, HYPRE_Int n, HYPRE_Int nLU,
1979 cusparseMatDescr_t matL_des, cusparseMatDescr_t matU_des, cusparseSolvePolicy_t ilu_solve_policy,
1980 void **bufferp, csrsv2Info_t *matBL_infop, csrsv2Info_t *matBU_infop,
1981 csrsv2Info_t *matSL_infop, csrsv2Info_t *matSU_infop,
1982 hypre_CSRMatrix **BLUptr, hypre_ParCSRMatrix **matSptr, hypre_CSRMatrix **Eptr, hypre_CSRMatrix **Fptr,
1983 HYPRE_Int **A_fake_diag_ip)
1984 {
1985 /* GPU-accelerated ILU0 with cusparse */
1986 HYPRE_Int i, j, k1, k2, k3, col;
1987
1988 /* communication stuffs for S */
1989 MPI_Comm comm = hypre_ParCSRMatrixComm(A);
1990
1991 HYPRE_Int my_id, num_procs;
1992 hypre_MPI_Comm_size(comm,&num_procs);
1993 hypre_MPI_Comm_rank(comm,&my_id);
1994
1995 hypre_ParCSRCommPkg *comm_pkg;
1996 hypre_ParCSRCommHandle *comm_handle;
1997 HYPRE_Int num_sends, begin, end;
1998 HYPRE_BigInt *send_buf = NULL;
1999 HYPRE_Int *rperm = NULL;
2000 HYPRE_Int *rqperm = NULL;
2001
2002 hypre_ParCSRMatrix *Apq = NULL;
2003 hypre_ParCSRMatrix *ALU = NULL;
2004
2005 hypre_ParCSRMatrix *matS = NULL;
2006 hypre_CSRMatrix *A_diag = NULL;
2007 HYPRE_Int *A_fake_diag_i = NULL;
2008 hypre_CSRMatrix *A_offd = NULL;
2009 HYPRE_Int *A_offd_i = NULL;
2010 HYPRE_Int *A_offd_j = NULL;
2011 HYPRE_Real *A_offd_data = NULL;
2012 hypre_CSRMatrix *SLU = NULL;
2013 /* pointers to cusparse data */
2014 csrsv2Info_t matBL_info = NULL;
2015 csrsv2Info_t matBU_info = NULL;
2016 csrsv2Info_t matSL_info = NULL;
2017 csrsv2Info_t matSU_info = NULL;
2018
2019 HYPRE_Int buffer_size = 0;
2020 void *buffer = NULL;
2021
2022 /* variables for matS */
2023 HYPRE_Int m = n - nLU;
2024 HYPRE_Int nI = nLU;//use default
2025 HYPRE_Int e = 0;
2026 HYPRE_Int m_e = m;
2027 HYPRE_BigInt total_rows;
2028 HYPRE_BigInt col_starts[2];
2029 HYPRE_Int *S_diag_i = NULL;
2030 HYPRE_Int S_diag_nnz;
2031 hypre_CSRMatrix *S_offd = NULL;
2032 HYPRE_Int *S_offd_i = NULL;
2033 HYPRE_Int *S_offd_j = NULL;
2034 HYPRE_Real *S_offd_data = NULL;
2035 HYPRE_BigInt *S_offd_colmap = NULL;
2036 HYPRE_Int S_offd_nnz;
2037 HYPRE_Int S_offd_ncols;
2038
2039 /* set data slots */
2040 A_offd = hypre_ParCSRMatrixOffd(A);
2041 A_offd_i = hypre_CSRMatrixI(A_offd);
2042 A_offd_j = hypre_CSRMatrixJ(A_offd);
2043 A_offd_data = hypre_CSRMatrixData(A_offd);
2044
2045 hypre_ParCSRMatrix *parL = NULL;
2046 hypre_ParCSRMatrix *parU = NULL;
2047 hypre_ParCSRMatrix *parS = NULL;
2048 HYPRE_Real *parD = NULL;
2049 HYPRE_Int *uend = NULL;
2050
2051 /* unfortunately we need to build the reverse permutation array */
2052 rperm = hypre_CTAlloc(HYPRE_Int, n, HYPRE_MEMORY_HOST);
2053 rqperm = hypre_CTAlloc(HYPRE_Int, n, HYPRE_MEMORY_HOST);
2054 for (i = 0; i < n; i++)
2055 {
2056 rperm[perm[i]] = i;
2057 rqperm[qperm[i]] = i;
2058 }
2059
2060 /* Only call ILU when we really have a matrix on this processor */
2061 if (n > 0)
2062 {
2063 /* Copy diagonal matrix into a new place with permutation
2064 * That is, A_diag = A_diag(perm,qperm);
2065 */
2066 hypre_ParILURAPReorder( A, perm, rqperm, &Apq);
2067
2068 /* Apply ILU factorization to the entile A_diag */
2069 hypre_ILUSetupILUK(Apq, lfil, NULL, NULL, n, n, &parL, &parD, &parU, &parS, &uend);
2070
2071 if (uend)
2072 {
2073 hypre_TFree(uend, HYPRE_MEMORY_HOST);
2074 }
2075
2076 if (parS)
2077 {
2078 hypre_ParCSRMatrixDestroy(parS);
2079 }
2080
2081 /* | L \ U (B) L^{-1}F |
2082 * | EU^{-1} L \ U (S)|
2083 * Extract submatrix L_B U_B, L_S U_S, EU_B^{-1}, L_B^{-1}F
2084 * Note that in this function after ILU, all rows are sorted
2085 * in a way different than HYPRE. Diagonal is not listed in the front
2086 */
2087 hypre_ILUSetupLDUtoCusparse( parL, parD, parU, &ALU);
2088
2089 if (parL)
2090 {
2091 hypre_ParCSRMatrixDestroy(parL);
2092 }
2093 if (parD)
2094 {
2095 hypre_TFree(parD, HYPRE_MEMORY_DEVICE);
2096 }
2097 if (parU)
2098 {
2099 hypre_ParCSRMatrixDestroy(parU);
2100 }
2101
2102 A_diag = hypre_ParCSRMatrixDiag(ALU);
2103
2104 hypre_ParILUCusparseILUExtractEBFC(A_diag, nLU, BLUptr, &SLU, Eptr, Fptr);
2105
2106 if (Apq)
2107 {
2108 hypre_ParCSRMatrixDestroy(Apq);
2109 }
2110
2111 }
2112 else
2113 {
2114 *BLUptr = NULL;
2115 *Eptr = NULL;
2116 *Fptr = NULL;
2117 SLU = NULL;
2118 }
2119
2120 /* create B */
2121 /* only analyse when nacessary */
2122 if ( nLU > 0 )
2123 {
2124 /* Analysis of BILU */
2125 HYPRE_ILUSetupCusparseCSRILU0SetupSolve(*BLUptr, matL_des, matU_des,
2126 ilu_solve_policy, &matBL_info, &matBU_info,
2127 &buffer_size, &buffer);
2128 }
2129
2130 HYPRE_BigInt big_m = (HYPRE_BigInt)m;
2131 hypre_MPI_Allreduce(&big_m, &total_rows, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
2132 /* only form when total_rows > 0 */
2133 if ( total_rows > 0 )
2134 {
2135 /* now create S */
2136 /* need to get new column start */
2137 {
2138 HYPRE_BigInt global_start;
2139 hypre_MPI_Scan( &big_m, &global_start, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
2140 col_starts[0] = global_start - m;
2141 col_starts[1] = global_start;
2142 }
2143
2144 A_fake_diag_i = hypre_CTAlloc(HYPRE_Int, m + 1, HYPRE_MEMORY_DEVICE);
2145 if (SLU)
2146 {
2147 /* Analysis of SILU */
2148 HYPRE_ILUSetupCusparseCSRILU0SetupSolve(SLU, matL_des, matU_des,
2149 ilu_solve_policy, &matSL_info, &matSU_info,
2150 &buffer_size, &buffer);
2151 }
2152 else
2153 {
2154 SLU = hypre_CSRMatrixCreate(0,0,0);
2155 hypre_CSRMatrixInitialize(SLU);
2156 }
2157 S_diag_i = hypre_CSRMatrixI(SLU);
2158 S_diag_nnz = S_diag_i[m];
2159 /* Build ParCSRMatrix matS
2160 * For example when np == 3 the new matrix takes the following form
2161 * |IS_1 E_12 E_13|
2162 * |E_21 IS_2 E_22| = S
2163 * |E_31 E_32 IS_3|
2164 * In which IS_i is the cusparse ILU factorization of S_i in one matrix
2165 * */
2166
2167 /* We did nothing to A_offd, so all the data kept, just reorder them
2168 * The create function takes comm, global num rows/cols,
2169 * row/col start, num cols offd, nnz diag, nnz offd
2170 */
2171 S_offd_nnz = hypre_CSRMatrixNumNonzeros(A_offd);
2172 S_offd_ncols = hypre_CSRMatrixNumCols(A_offd);
2173
2174 matS = hypre_ParCSRMatrixCreate( comm,
2175 total_rows,
2176 total_rows,
2177 col_starts,
2178 col_starts,
2179 S_offd_ncols,
2180 S_diag_nnz,
2181 S_offd_nnz);
2182
2183 /* first put diagonal data in */
2184 hypre_CSRMatrixDestroy(hypre_ParCSRMatrixDiag(matS));
2185 hypre_ParCSRMatrixDiag(matS) = SLU;
2186
2187 /* now start to construct offdiag of S */
2188 S_offd = hypre_ParCSRMatrixOffd(matS);
2189 S_offd_i = hypre_TAlloc(HYPRE_Int, m+1, HYPRE_MEMORY_DEVICE);
2190 S_offd_j = hypre_TAlloc(HYPRE_Int, S_offd_nnz, HYPRE_MEMORY_DEVICE);
2191 S_offd_data = hypre_TAlloc(HYPRE_Real, S_offd_nnz, HYPRE_MEMORY_DEVICE);
2192 S_offd_colmap = hypre_CTAlloc(HYPRE_BigInt, S_offd_ncols, HYPRE_MEMORY_HOST);
2193
2194 /* simply use a loop to copy data from A_offd */
2195 S_offd_i[0] = 0;
2196 k3 = 0;
2197 for (i = 1; i <= e; i++)
2198 {
2199 S_offd_i[i] = k3;
2200 }
2201 for (i = 0; i < m_e; i++)
2202 {
2203 col = perm[i + nI];
2204 k1 = A_offd_i[col];
2205 k2 = A_offd_i[col+1];
2206 for (j = k1; j < k2; j++)
2207 {
2208 S_offd_j[k3] = A_offd_j[j];
2209 S_offd_data[k3++] = A_offd_data[j];
2210 }
2211 S_offd_i[i+1+e] = k3;
2212 }
2213
2214 /* give I, J, DATA to S_offd */
2215 hypre_CSRMatrixI(S_offd) = S_offd_i;
2216 hypre_CSRMatrixJ(S_offd) = S_offd_j;
2217 hypre_CSRMatrixData(S_offd) = S_offd_data;
2218
2219 /* now we need to update S_offd_colmap */
2220 comm_pkg = hypre_ParCSRMatrixCommPkg(A);
2221 /* setup comm_pkg if not yet built */
2222 if (!comm_pkg)
2223 {
2224 hypre_MatvecCommPkgCreate(A);
2225 comm_pkg = hypre_ParCSRMatrixCommPkg(A);
2226 }
2227 /* get total num of send */
2228 num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
2229 begin = hypre_ParCSRCommPkgSendMapStart(comm_pkg,0);
2230 end = hypre_ParCSRCommPkgSendMapStart(comm_pkg,num_sends);
2231 send_buf = hypre_TAlloc(HYPRE_BigInt, end - begin, HYPRE_MEMORY_HOST);
2232 /* copy new index into send_buf */
2233 for (i = begin; i < end; i++)
2234 {
2235 send_buf[i-begin] = rperm[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,i)] - nLU + col_starts[0];
2236 }
2237
2238 /* main communication */
2239 comm_handle = hypre_ParCSRCommHandleCreate(21, comm_pkg, send_buf, S_offd_colmap);
2240 hypre_ParCSRCommHandleDestroy(comm_handle);
2241
2242 /* setup index */
2243 hypre_ParCSRMatrixColMapOffd(matS) = S_offd_colmap;
2244
2245 hypre_ILUSortOffdColmap(matS);
2246
2247 /* free */
2248 hypre_TFree(send_buf, HYPRE_MEMORY_HOST);
2249 } /* end of forming S */
2250
2251 *matSptr = matS;
2252 *bufferp = buffer;
2253 *matBL_infop = matBL_info;
2254 *matBU_infop = matBU_info;
2255 *matSL_infop = matSL_info;
2256 *matSU_infop = matSU_info;
2257 *A_fake_diag_ip= A_fake_diag_i;
2258
2259 /* Destroy the bridge after acrossing the river */
2260 hypre_CSRMatrixDestroy(A_diag);
2261 hypre_TFree(rperm, HYPRE_MEMORY_HOST);
2262 hypre_TFree(rqperm, HYPRE_MEMORY_HOST);
2263
2264 return hypre_error_flag;
2265 }
2266
2267
2268 HYPRE_Int
hypre_ILUSetupILUTDevice(hypre_ParCSRMatrix * A,HYPRE_Int lfil,HYPRE_Real * tol,HYPRE_Int * perm,HYPRE_Int * qperm,HYPRE_Int n,HYPRE_Int nLU,cusparseMatDescr_t matL_des,cusparseMatDescr_t matU_des,cusparseSolvePolicy_t ilu_solve_policy,void ** bufferp,csrsv2Info_t * matBL_infop,csrsv2Info_t * matBU_infop,csrsv2Info_t * matSL_infop,csrsv2Info_t * matSU_infop,hypre_CSRMatrix ** BLUptr,hypre_ParCSRMatrix ** matSptr,hypre_CSRMatrix ** Eptr,hypre_CSRMatrix ** Fptr,HYPRE_Int ** A_fake_diag_ip)2269 hypre_ILUSetupILUTDevice(hypre_ParCSRMatrix *A, HYPRE_Int lfil, HYPRE_Real *tol, HYPRE_Int *perm, HYPRE_Int *qperm, HYPRE_Int n, HYPRE_Int nLU,
2270 cusparseMatDescr_t matL_des, cusparseMatDescr_t matU_des, cusparseSolvePolicy_t ilu_solve_policy,
2271 void **bufferp, csrsv2Info_t *matBL_infop, csrsv2Info_t *matBU_infop,
2272 csrsv2Info_t *matSL_infop, csrsv2Info_t *matSU_infop,
2273 hypre_CSRMatrix **BLUptr, hypre_ParCSRMatrix **matSptr, hypre_CSRMatrix **Eptr, hypre_CSRMatrix **Fptr,
2274 HYPRE_Int **A_fake_diag_ip)
2275 {
2276 /* GPU-accelerated ILU0 with cusparse */
2277 HYPRE_Int i, j, k1, k2, k3, col;
2278
2279 /* communication stuffs for S */
2280 MPI_Comm comm = hypre_ParCSRMatrixComm(A);
2281
2282 HYPRE_Int my_id, num_procs;
2283 hypre_MPI_Comm_size(comm,&num_procs);
2284 hypre_MPI_Comm_rank(comm,&my_id);
2285
2286 hypre_ParCSRCommPkg *comm_pkg;
2287 hypre_ParCSRCommHandle *comm_handle;
2288 HYPRE_Int num_sends, begin, end;
2289 HYPRE_BigInt *send_buf = NULL;
2290 HYPRE_Int *rperm = NULL;
2291 HYPRE_Int *rqperm = NULL;
2292
2293 hypre_ParCSRMatrix *Apq = NULL;
2294 hypre_ParCSRMatrix *ALU = NULL;
2295
2296 hypre_ParCSRMatrix *matS = NULL;
2297 hypre_CSRMatrix *A_diag = NULL;
2298 HYPRE_Int *A_fake_diag_i = NULL;
2299 hypre_CSRMatrix *A_offd = NULL;
2300 HYPRE_Int *A_offd_i = NULL;
2301 HYPRE_Int *A_offd_j = NULL;
2302 HYPRE_Real *A_offd_data = NULL;
2303 hypre_CSRMatrix *SLU = NULL;
2304 /* pointers to cusparse data */
2305 csrsv2Info_t matBL_info = NULL;
2306 csrsv2Info_t matBU_info = NULL;
2307 csrsv2Info_t matSL_info = NULL;
2308 csrsv2Info_t matSU_info = NULL;
2309
2310 HYPRE_Int buffer_size = 0;
2311 void *buffer = NULL;
2312
2313 /* variables for matS */
2314 HYPRE_Int m = n - nLU;
2315 HYPRE_Int nI = nLU;//use default
2316 HYPRE_Int e = 0;
2317 HYPRE_Int m_e = m;
2318 HYPRE_BigInt total_rows;
2319 HYPRE_BigInt col_starts[2];
2320 HYPRE_Int *S_diag_i = NULL;
2321 HYPRE_Int S_diag_nnz;
2322 hypre_CSRMatrix *S_offd = NULL;
2323 HYPRE_Int *S_offd_i = NULL;
2324 HYPRE_Int *S_offd_j = NULL;
2325 HYPRE_Real *S_offd_data = NULL;
2326 HYPRE_BigInt *S_offd_colmap = NULL;
2327 HYPRE_Int S_offd_nnz;
2328 HYPRE_Int S_offd_ncols;
2329
2330 /* set data slots */
2331 A_offd = hypre_ParCSRMatrixOffd(A);
2332 A_offd_i = hypre_CSRMatrixI(A_offd);
2333 A_offd_j = hypre_CSRMatrixJ(A_offd);
2334 A_offd_data = hypre_CSRMatrixData(A_offd);
2335
2336 hypre_ParCSRMatrix *parL = NULL;
2337 hypre_ParCSRMatrix *parU = NULL;
2338 hypre_ParCSRMatrix *parS = NULL;
2339 HYPRE_Real *parD = NULL;
2340 HYPRE_Int *uend = NULL;
2341
2342 /* unfortunately we need to build the reverse permutation array */
2343 rperm = hypre_CTAlloc(HYPRE_Int, n, HYPRE_MEMORY_HOST);
2344 rqperm = hypre_CTAlloc(HYPRE_Int, n, HYPRE_MEMORY_HOST);
2345 for (i = 0; i < n; i++)
2346 {
2347 rperm[perm[i]] = i;
2348 rqperm[qperm[i]] = i;
2349 }
2350
2351 /* Only call ILU when we really have a matrix on this processor */
2352 if (n > 0)
2353 {
2354 /* Copy diagonal matrix into a new place with permutation
2355 * That is, A_diag = A_diag(perm,qperm);
2356 */
2357 hypre_ParILURAPReorder( A, perm, rqperm, &Apq);
2358
2359 /* Apply ILU factorization to the entile A_diag */
2360 hypre_ILUSetupILUT(Apq, lfil, tol, NULL, NULL, n, n, &parL, &parD, &parU, &parS, &uend);
2361
2362 if (uend)
2363 {
2364 hypre_TFree(uend, HYPRE_MEMORY_HOST);
2365 }
2366
2367 if (parS)
2368 {
2369 hypre_ParCSRMatrixDestroy(parS);
2370 }
2371
2372 /* | L \ U (B) L^{-1}F |
2373 * | EU^{-1} L \ U (S)|
2374 * Extract submatrix L_B U_B, L_S U_S, EU_B^{-1}, L_B^{-1}F
2375 * Note that in this function after ILU, all rows are sorted
2376 * in a way different than HYPRE. Diagonal is not listed in the front
2377 */
2378 hypre_ILUSetupLDUtoCusparse( parL, parD, parU, &ALU);
2379
2380 if (parL)
2381 {
2382 hypre_ParCSRMatrixDestroy(parL);
2383 }
2384 if (parD)
2385 {
2386 hypre_TFree(parD, HYPRE_MEMORY_DEVICE);
2387 }
2388 if (parU)
2389 {
2390 hypre_ParCSRMatrixDestroy(parU);
2391 }
2392
2393 A_diag = hypre_ParCSRMatrixDiag(ALU);
2394
2395 hypre_ParILUCusparseILUExtractEBFC(A_diag, nLU, BLUptr, &SLU, Eptr, Fptr);
2396
2397 if (Apq)
2398 {
2399 hypre_ParCSRMatrixDestroy(Apq);
2400 }
2401
2402 }
2403 else
2404 {
2405 *BLUptr = NULL;
2406 *Eptr = NULL;
2407 *Fptr = NULL;
2408 SLU = NULL;
2409 }
2410
2411 /* create B */
2412 /* only analyse when nacessary */
2413 if ( nLU > 0 )
2414 {
2415 /* Analysis of BILU */
2416 HYPRE_ILUSetupCusparseCSRILU0SetupSolve(*BLUptr, matL_des, matU_des,
2417 ilu_solve_policy, &matBL_info, &matBU_info,
2418 &buffer_size, &buffer);
2419 }
2420
2421 HYPRE_BigInt big_m = (HYPRE_BigInt)m;
2422 hypre_MPI_Allreduce(&big_m, &total_rows, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
2423 /* only form when total_rows > 0 */
2424 if ( total_rows > 0 )
2425 {
2426 /* now create S */
2427 /* need to get new column start */
2428 {
2429 HYPRE_BigInt global_start;
2430 hypre_MPI_Scan( &big_m, &global_start, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
2431 col_starts[0] = global_start - m;
2432 col_starts[1] = global_start;
2433 }
2434
2435 A_fake_diag_i = hypre_CTAlloc(HYPRE_Int, m + 1, HYPRE_MEMORY_DEVICE);
2436 if (SLU)
2437 {
2438 /* Analysis of SILU */
2439 HYPRE_ILUSetupCusparseCSRILU0SetupSolve(SLU, matL_des, matU_des,
2440 ilu_solve_policy, &matSL_info, &matSU_info,
2441 &buffer_size, &buffer);
2442 }
2443 else
2444 {
2445 SLU = hypre_CSRMatrixCreate(0,0,0);
2446 hypre_CSRMatrixInitialize(SLU);
2447 }
2448 S_diag_i = hypre_CSRMatrixI(SLU);
2449 S_diag_nnz = S_diag_i[m];
2450 /* Build ParCSRMatrix matS
2451 * For example when np == 3 the new matrix takes the following form
2452 * |IS_1 E_12 E_13|
2453 * |E_21 IS_2 E_22| = S
2454 * |E_31 E_32 IS_3|
2455 * In which IS_i is the cusparse ILU factorization of S_i in one matrix
2456 * */
2457
2458 /* We did nothing to A_offd, so all the data kept, just reorder them
2459 * The create function takes comm, global num rows/cols,
2460 * row/col start, num cols offd, nnz diag, nnz offd
2461 */
2462 S_offd_nnz = hypre_CSRMatrixNumNonzeros(A_offd);
2463 S_offd_ncols = hypre_CSRMatrixNumCols(A_offd);
2464
2465 matS = hypre_ParCSRMatrixCreate( comm,
2466 total_rows,
2467 total_rows,
2468 col_starts,
2469 col_starts,
2470 S_offd_ncols,
2471 S_diag_nnz,
2472 S_offd_nnz);
2473
2474 /* first put diagonal data in */
2475 hypre_CSRMatrixDestroy(hypre_ParCSRMatrixDiag(matS));
2476 hypre_ParCSRMatrixDiag(matS) = SLU;
2477
2478 /* now start to construct offdiag of S */
2479 S_offd = hypre_ParCSRMatrixOffd(matS);
2480 S_offd_i = hypre_TAlloc(HYPRE_Int, m+1, HYPRE_MEMORY_DEVICE);
2481 S_offd_j = hypre_TAlloc(HYPRE_Int, S_offd_nnz, HYPRE_MEMORY_DEVICE);
2482 S_offd_data = hypre_TAlloc(HYPRE_Real, S_offd_nnz, HYPRE_MEMORY_DEVICE);
2483 S_offd_colmap = hypre_CTAlloc(HYPRE_BigInt, S_offd_ncols, HYPRE_MEMORY_HOST);
2484
2485 /* simply use a loop to copy data from A_offd */
2486 S_offd_i[0] = 0;
2487 k3 = 0;
2488 for (i = 1; i <= e; i++)
2489 {
2490 S_offd_i[i] = k3;
2491 }
2492 for (i = 0; i < m_e; i++)
2493 {
2494 col = perm[i + nI];
2495 k1 = A_offd_i[col];
2496 k2 = A_offd_i[col+1];
2497 for (j = k1; j < k2; j++)
2498 {
2499 S_offd_j[k3] = A_offd_j[j];
2500 S_offd_data[k3++] = A_offd_data[j];
2501 }
2502 S_offd_i[i+1+e] = k3;
2503 }
2504
2505 /* give I, J, DATA to S_offd */
2506 hypre_CSRMatrixI(S_offd) = S_offd_i;
2507 hypre_CSRMatrixJ(S_offd) = S_offd_j;
2508 hypre_CSRMatrixData(S_offd) = S_offd_data;
2509
2510 /* now we need to update S_offd_colmap */
2511 comm_pkg = hypre_ParCSRMatrixCommPkg(A);
2512 /* setup comm_pkg if not yet built */
2513 if (!comm_pkg)
2514 {
2515 hypre_MatvecCommPkgCreate(A);
2516 comm_pkg = hypre_ParCSRMatrixCommPkg(A);
2517 }
2518 /* get total num of send */
2519 num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
2520 begin = hypre_ParCSRCommPkgSendMapStart(comm_pkg,0);
2521 end = hypre_ParCSRCommPkgSendMapStart(comm_pkg,num_sends);
2522 send_buf = hypre_TAlloc(HYPRE_BigInt, end - begin, HYPRE_MEMORY_HOST);
2523 /* copy new index into send_buf */
2524 for (i = begin; i < end; i++)
2525 {
2526 send_buf[i-begin] = rperm[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,i)] - nLU + col_starts[0];
2527 }
2528
2529 /* main communication */
2530 comm_handle = hypre_ParCSRCommHandleCreate(21, comm_pkg, send_buf, S_offd_colmap);
2531 hypre_ParCSRCommHandleDestroy(comm_handle);
2532
2533 /* setup index */
2534 hypre_ParCSRMatrixColMapOffd(matS) = S_offd_colmap;
2535
2536 hypre_ILUSortOffdColmap(matS);
2537
2538 /* free */
2539 hypre_TFree(send_buf, HYPRE_MEMORY_HOST);
2540 } /* end of forming S */
2541
2542 *matSptr = matS;
2543 *bufferp = buffer;
2544 *matBL_infop = matBL_info;
2545 *matBU_infop = matBU_info;
2546 *matSL_infop = matSL_info;
2547 *matSU_infop = matSU_info;
2548 *A_fake_diag_ip= A_fake_diag_i;
2549
2550 /* Destroy the bridge after acrossing the river */
2551 hypre_CSRMatrixDestroy(A_diag);
2552 hypre_TFree(rperm, HYPRE_MEMORY_HOST);
2553 hypre_TFree(rqperm, HYPRE_MEMORY_HOST);
2554
2555 return hypre_error_flag;
2556 }
2557
2558 /* Reorder matrix A based on local permutation (combine local permutation into global permutation)
2559 * WARNING: We don't put diagonal to the first entry of each row
2560 * A = input matrix
2561 * perm = permutation array indicating ordering of rows. Perm could come from a
2562 * CF_marker array or a reordering routine.
2563 * rqperm = reverse permutation array indicating ordering of columns
2564 * A_pq = pointer to the output par CSR matrix.
2565 */
2566 HYPRE_Int
hypre_ParILURAPReorder(hypre_ParCSRMatrix * A,HYPRE_Int * perm,HYPRE_Int * rqperm,hypre_ParCSRMatrix ** A_pq)2567 hypre_ParILURAPReorder(hypre_ParCSRMatrix *A, HYPRE_Int *perm, HYPRE_Int *rqperm, hypre_ParCSRMatrix **A_pq)
2568 {
2569 /* Get necessary slots */
2570 hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
2571 //HYPRE_Int *A_diag_i = hypre_CSRMatrixI(A_diag);
2572 //HYPRE_Int *A_diag_j = hypre_CSRMatrixJ(A_diag);
2573 //HYPRE_Real *A_diag_data = hypre_CSRMatrixData(A_diag);
2574 HYPRE_Int n = hypre_CSRMatrixNumRows(A_diag);
2575 //HYPRE_Int nnz_A_diag = A_diag_i[n];
2576
2577 //HYPRE_Int i, j, current_idx;
2578 HYPRE_Int i;
2579
2580 /* MPI */
2581 MPI_Comm comm = hypre_ParCSRMatrixComm(A);
2582 HYPRE_Int num_procs, my_id;
2583
2584 hypre_MPI_Comm_size(comm,&num_procs);
2585 hypre_MPI_Comm_rank(comm,&my_id);
2586
2587 /* Create permutation matrices P = I(perm,:) and Q(rqperm,:), such that Apq = PAQ */
2588 hypre_ParCSRMatrix *P, *Q, *PAQ, *PA;
2589
2590 hypre_CSRMatrix *P_diag, *Q_diag;
2591 hypre_CSRMatrix *P_offd, *Q_offd;
2592
2593 P = hypre_ParCSRMatrixCreate( comm,
2594 hypre_ParCSRMatrixGlobalNumRows(A),
2595 hypre_ParCSRMatrixGlobalNumRows(A),
2596 hypre_ParCSRMatrixRowStarts(A),
2597 hypre_ParCSRMatrixColStarts(A),
2598 0,
2599 n,
2600 0);
2601
2602 Q = hypre_ParCSRMatrixCreate( comm,
2603 hypre_ParCSRMatrixGlobalNumRows(A),
2604 hypre_ParCSRMatrixGlobalNumRows(A),
2605 hypre_ParCSRMatrixRowStarts(A),
2606 hypre_ParCSRMatrixColStarts(A),
2607 0,
2608 n,
2609 0);
2610
2611 P_diag = hypre_ParCSRMatrixDiag(P);
2612 Q_diag = hypre_ParCSRMatrixDiag(Q);
2613 P_offd = hypre_ParCSRMatrixOffd(P);
2614 Q_offd = hypre_ParCSRMatrixOffd(Q);
2615
2616 HYPRE_Int *P_diag_i, *P_diag_j, *Q_diag_i, *Q_diag_j;
2617 HYPRE_Real *P_diag_data, *Q_diag_data;
2618 HYPRE_Int *P_offd_i, *Q_offd_i;
2619
2620 P_diag_i = hypre_TAlloc(HYPRE_Int, n+1, HYPRE_MEMORY_DEVICE);
2621 P_diag_j = hypre_TAlloc(HYPRE_Int, n, HYPRE_MEMORY_DEVICE);
2622 P_diag_data = hypre_TAlloc(HYPRE_Real, n, HYPRE_MEMORY_DEVICE);
2623
2624 Q_diag_i = hypre_TAlloc(HYPRE_Int, n+1, HYPRE_MEMORY_DEVICE);
2625 Q_diag_j = hypre_TAlloc(HYPRE_Int, n, HYPRE_MEMORY_DEVICE);
2626 Q_diag_data = hypre_TAlloc(HYPRE_Real, n, HYPRE_MEMORY_DEVICE);
2627
2628 /* fill data, openmp should be availiable here */
2629 for (i = 0; i < n; i++)
2630 {
2631 P_diag_i[i] = i;
2632 P_diag_j[i] = perm[i];
2633 P_diag_data[i] = 1.0;
2634
2635 Q_diag_i[i] = i;
2636 Q_diag_j[i] = rqperm[i];
2637 Q_diag_data[i] = 1.0;
2638
2639 }
2640 P_diag_i[n] = n;
2641 Q_diag_i[n] = n;
2642
2643 /* give I, J, DATA */
2644 hypre_CSRMatrixI(P_diag) = P_diag_i;
2645 hypre_CSRMatrixJ(P_diag) = P_diag_j;
2646 hypre_CSRMatrixData(P_diag) = P_diag_data;
2647
2648 hypre_CSRMatrixI(Q_diag) = Q_diag_i;
2649 hypre_CSRMatrixJ(Q_diag) = Q_diag_j;
2650 hypre_CSRMatrixData(Q_diag) = Q_diag_data;
2651
2652 P_offd_i = hypre_CTAlloc(HYPRE_Int, n+1, HYPRE_MEMORY_DEVICE);
2653 Q_offd_i = hypre_CTAlloc(HYPRE_Int, n+1, HYPRE_MEMORY_DEVICE);
2654
2655 hypre_CSRMatrixI(P_offd) = P_offd_i;
2656 hypre_CSRMatrixI(Q_offd) = Q_offd_i;
2657
2658 /* Update A */
2659 PA = hypre_ParCSRMatMat(P, A);
2660 PAQ = hypre_ParCSRMatMat(PA, Q);
2661 //PAQ = hypre_ParCSRMatrixRAPKT(P, A, Q, 0);
2662
2663 /* free and return */
2664 hypre_ParCSRMatrixDestroy(P);
2665 hypre_ParCSRMatrixDestroy(Q);
2666
2667 *A_pq = PAQ;
2668
2669 return hypre_error_flag;
2670 }
2671
2672 /* Convert the L, D, U style to the cusparse style
2673 * Assume the diagonal of L and U are the ilu factorization, directly combine them
2674 */
2675 HYPRE_Int
hypre_ParILURAPBuildRP(hypre_ParCSRMatrix * A,hypre_ParCSRMatrix * BLUm,hypre_ParCSRMatrix * E,hypre_ParCSRMatrix * F,cusparseMatDescr_t matL_des,cusparseMatDescr_t matU_des,hypre_ParCSRMatrix ** Rp,hypre_ParCSRMatrix ** Pp)2676 hypre_ParILURAPBuildRP(hypre_ParCSRMatrix *A, hypre_ParCSRMatrix *BLUm, hypre_ParCSRMatrix* E, hypre_ParCSRMatrix *F,
2677 cusparseMatDescr_t matL_des, cusparseMatDescr_t matU_des, hypre_ParCSRMatrix **Rp, hypre_ParCSRMatrix **Pp)
2678 {
2679 /* declare variables */
2680 HYPRE_Int j, row, col;
2681 HYPRE_Real val;
2682 hypre_ParCSRMatrix *R, *P;
2683 hypre_CSRMatrix *R_diag, *P_diag;
2684
2685 hypre_CSRMatrix *BLUm_diag = hypre_ParCSRMatrixDiag(BLUm);
2686 HYPRE_Int *BLUm_diag_i = hypre_CSRMatrixI(BLUm_diag);
2687 HYPRE_Int *BLUm_diag_j = hypre_CSRMatrixJ(BLUm_diag);
2688 HYPRE_Real *BLUm_diag_data = hypre_CSRMatrixData(BLUm_diag);
2689
2690 hypre_CSRMatrix *E_diag = hypre_ParCSRMatrixDiag(E);
2691 HYPRE_Int *E_diag_i = hypre_CSRMatrixI(E_diag);
2692 HYPRE_Int *E_diag_j = hypre_CSRMatrixJ(E_diag);
2693 HYPRE_Real *E_diag_data = hypre_CSRMatrixData(E_diag);
2694 hypre_CSRMatrix *F_diag = hypre_ParCSRMatrixDiag(F);
2695 HYPRE_Int *F_diag_i = hypre_CSRMatrixI(F_diag);
2696 HYPRE_Int *F_diag_j = hypre_CSRMatrixJ(F_diag);
2697 HYPRE_Real *F_diag_data = hypre_CSRMatrixData(F_diag);
2698
2699 HYPRE_Int n = hypre_CSRMatrixNumRows(F_diag);
2700 HYPRE_Int m = hypre_CSRMatrixNumCols(F_diag);
2701
2702 HYPRE_Int nnz_BLUm = BLUm_diag_i[n];
2703
2704 /* MPI */
2705 MPI_Comm comm = hypre_ParCSRMatrixComm(A);
2706 HYPRE_Int num_procs, my_id;
2707
2708 hypre_MPI_Comm_size(comm,&num_procs);
2709 hypre_MPI_Comm_rank(comm,&my_id);
2710
2711 /* cusparse */
2712 HYPRE_Int isDoublePrecision = sizeof(HYPRE_Complex) == sizeof(hypre_double);
2713 HYPRE_Int isSinglePrecision = sizeof(HYPRE_Complex) == sizeof(hypre_double) / 2;
2714
2715 hypre_assert(isDoublePrecision || isSinglePrecision);
2716
2717 cusparseHandle_t handle = hypre_HandleCusparseHandle(hypre_handle());
2718
2719 /* compute P = -UB\(LB\F)
2720 * op(A) * op(X) = \alpha op(B)
2721 * first iLF = LB\F -> LB*iLF = F
2722 */
2723
2724 HYPRE_Int algo = 0;
2725 HYPRE_Real alpha = 1.0;
2726 HYPRE_Real *rhs;
2727 cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_NO_LEVEL;
2728 size_t buffer_size, buffer_size_old;
2729 void *buffer;
2730 csrsm2Info_t malL_info = NULL;
2731 HYPRE_CUSPARSE_CALL(cusparseCreateCsrsm2Info(&malL_info));
2732
2733 rhs = hypre_CTAlloc(HYPRE_Real, m * n, HYPRE_MEMORY_DEVICE);
2734
2735 /* fill data, note that rhs is in Fortan style (col first)
2736 * oprating by col is slow, but
2737 */
2738 for (row = 0; row < n; row++)
2739 {
2740 for (j = F_diag_i[row]; j < F_diag_i[row+1]; j++)
2741 {
2742 col = F_diag_j[j];
2743 *(rhs + col*n + row) = F_diag_data[j];
2744 }
2745 }
2746
2747 /* check buffer size and create buffer */
2748
2749 if (isDoublePrecision)
2750 {
2751 HYPRE_CUSPARSE_CALL(cusparseDcsrsm2_bufferSizeExt( handle, algo, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2752 n, m, nnz_BLUm, (hypre_double *)&alpha, matL_des, (hypre_double *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (hypre_double *)rhs, n, malL_info, policy, &buffer_size));
2753 }
2754 else if (isSinglePrecision)
2755 {
2756 HYPRE_CUSPARSE_CALL(cusparseScsrsm2_bufferSizeExt( handle, algo, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2757 n, m, nnz_BLUm, (float *)&alpha, matL_des, (float *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (float *)rhs, n, malL_info, policy, &buffer_size));
2758 }
2759
2760 buffer = hypre_MAlloc(buffer_size, HYPRE_MEMORY_DEVICE);
2761
2762 /* analysis */
2763
2764 if (isDoublePrecision)
2765 {
2766 HYPRE_CUSPARSE_CALL(cusparseDcsrsm2_analysis( handle, algo, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2767 n, m, nnz_BLUm, (hypre_double *)&alpha, matL_des, (hypre_double *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (hypre_double *)rhs, n, malL_info, policy, buffer));
2768 }
2769 else if (isSinglePrecision)
2770 {
2771 HYPRE_CUSPARSE_CALL(cusparseScsrsm2_analysis( handle, algo, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2772 n, m, nnz_BLUm, (float *)&alpha, matL_des, (float *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (float *)rhs, n, malL_info, policy, buffer));
2773 }
2774
2775 /* solve phase */
2776 if (isDoublePrecision)
2777 {
2778 HYPRE_CUSPARSE_CALL(cusparseDcsrsm2_solve( handle, algo, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2779 n, m, nnz_BLUm, (hypre_double *)&alpha, matL_des, (hypre_double *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (hypre_double *)rhs, n, malL_info, policy, buffer));
2780 }
2781 else if (isSinglePrecision)
2782 {
2783 HYPRE_CUSPARSE_CALL(cusparseScsrsm2_solve( handle, algo, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2784 n, m, nnz_BLUm, (float *)&alpha, matL_des, (float *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (float *)rhs, n, malL_info, policy, buffer));
2785 }
2786 /* now P = -UB\(LB\F) -> UB*P = -(LB\F)
2787 */
2788 alpha = -1.0;
2789 csrsm2Info_t malU_info = NULL;
2790 HYPRE_CUSPARSE_CALL(cusparseCreateCsrsm2Info(&malU_info));
2791
2792 buffer_size_old = buffer_size;
2793
2794 /* check buffer size and create buffer */
2795
2796 if (isDoublePrecision)
2797 {
2798 HYPRE_CUSPARSE_CALL(cusparseDcsrsm2_bufferSizeExt( handle, algo, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2799 n, m, nnz_BLUm, (hypre_double *)&alpha, matU_des, (hypre_double *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (hypre_double *)rhs, n, malU_info, policy, &buffer_size));
2800 }
2801 else if (isSinglePrecision)
2802 {
2803 HYPRE_CUSPARSE_CALL(cusparseScsrsm2_bufferSizeExt( handle, algo, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2804 n, m, nnz_BLUm, (float *)&alpha, matU_des, (float *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (float *)rhs, n, malU_info, policy, &buffer_size));
2805 }
2806
2807 if (buffer_size > buffer_size_old)
2808 {
2809 buffer = hypre_ReAlloc_v2(buffer, buffer_size_old, buffer_size, HYPRE_MEMORY_DEVICE);
2810 buffer_size_old = buffer_size;
2811 }
2812
2813 /* analysis */
2814
2815 if (isDoublePrecision)
2816 {
2817 HYPRE_CUSPARSE_CALL(cusparseDcsrsm2_analysis( handle, algo, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2818 n, m, nnz_BLUm, (hypre_double *)&alpha, matU_des, (hypre_double *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (hypre_double *)rhs, n, malU_info, policy, buffer));
2819 }
2820 else if (isSinglePrecision)
2821 {
2822 HYPRE_CUSPARSE_CALL(cusparseScsrsm2_analysis( handle, algo, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2823 n, m, nnz_BLUm, (float *)&alpha, matU_des, (float *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (float *)rhs, n, malU_info, policy, buffer));
2824 }
2825
2826 /* solve phase */
2827 if (isDoublePrecision)
2828 {
2829 HYPRE_CUSPARSE_CALL(cusparseDcsrsm2_solve( handle, algo, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2830 n, m, nnz_BLUm, (hypre_double *)&alpha, matU_des, (hypre_double *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (hypre_double *)rhs, n, malU_info, policy, buffer));
2831 }
2832 else if (isSinglePrecision)
2833 {
2834 HYPRE_CUSPARSE_CALL(cusparseScsrsm2_solve( handle, algo, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2835 n, m, nnz_BLUm, (float *)&alpha, matU_des, (float *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (float *)rhs, n, malU_info, policy, buffer));
2836 }
2837 /* wait till GPU done to copy data */
2838 cudaDeviceSynchronize();
2839 /* now form P, (n + m) * m */
2840 HYPRE_Real drop_tol = 1e-06;
2841 HYPRE_Int ctrP = 0;
2842 HYPRE_Int *P_diag_i;
2843 HYPRE_Int *P_offd_i;
2844 HYPRE_Int *P_diag_j;
2845 HYPRE_Real *P_diag_data;
2846
2847 HYPRE_Int capacity_P = nnz_BLUm + m;
2848
2849 P_diag_i = hypre_TAlloc(HYPRE_Int, n+m+1, HYPRE_MEMORY_DEVICE);
2850 P_offd_i = hypre_CTAlloc(HYPRE_Int, n+m+1, HYPRE_MEMORY_DEVICE);
2851 P_diag_j = hypre_TAlloc(HYPRE_Int, capacity_P, HYPRE_MEMORY_DEVICE);
2852 P_diag_data = hypre_TAlloc(HYPRE_Real, capacity_P, HYPRE_MEMORY_DEVICE);
2853
2854 for (row = 0; row < n; row++)
2855 {
2856 P_diag_i[row] = ctrP;
2857 for (col = 0; col < m; col++)
2858 {
2859 val = *(rhs + col*n + row);
2860 if (hypre_abs(val) > drop_tol)
2861 {
2862 if (ctrP >= capacity_P)
2863 {
2864 HYPRE_Int tmp;
2865 tmp = capacity_P;
2866 capacity_P = capacity_P * EXPAND_FACT;
2867 P_diag_j = hypre_TReAlloc_v2(P_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_P, HYPRE_MEMORY_DEVICE);
2868 P_diag_data = hypre_TReAlloc_v2(P_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_P, HYPRE_MEMORY_DEVICE);
2869 }
2870 P_diag_j[ctrP] = col;
2871 P_diag_data[ctrP++] = val;
2872 }
2873 }
2874 }
2875
2876 if (ctrP + m >= capacity_P)
2877 {
2878 HYPRE_Int tmp;
2879 tmp = capacity_P;
2880 capacity_P = ctrP + m;
2881 P_diag_j = hypre_TReAlloc_v2(P_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_P, HYPRE_MEMORY_DEVICE);
2882 P_diag_data = hypre_TReAlloc_v2(P_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_P, HYPRE_MEMORY_DEVICE);
2883 }
2884
2885 for (row = 0; row < m; row++)
2886 {
2887 P_diag_i[row+n] = ctrP;
2888 P_diag_j[ctrP] = row;
2889 P_diag_data[ctrP++] = 1.0;
2890 }
2891
2892 P_diag_i[m+n] = ctrP;
2893
2894 /* now start to form R = - (E / UB ) / LB
2895 * first EiUB = E / UB -> UB'*EiUB'=E'
2896 */
2897 alpha = 1.0;
2898 csrsm2Info_t malU_info2 = NULL;
2899 HYPRE_CUSPARSE_CALL(cusparseCreateCsrsm2Info(&malU_info2));
2900
2901 /* fill data, note that rhs is in Fortan style (col first)
2902 * oprating by col is slow, but
2903 */
2904
2905 hypre_TFree(rhs, HYPRE_MEMORY_DEVICE);
2906 rhs = hypre_CTAlloc(HYPRE_Real, m * n, HYPRE_MEMORY_DEVICE);
2907
2908 for (row = 0; row < m; row++)
2909 {
2910 for (j = E_diag_i[row]; j < E_diag_i[row+1]; j++)
2911 {
2912 col = E_diag_j[j];
2913 *(rhs + col*m + row) = E_diag_data[j];
2914 }
2915 }
2916
2917 /* check buffer size and create buffer */
2918
2919 if (isDoublePrecision)
2920 {
2921 HYPRE_CUSPARSE_CALL(cusparseDcsrsm2_bufferSizeExt( handle, algo, CUSPARSE_OPERATION_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE,
2922 n, m, nnz_BLUm, (hypre_double *)&alpha, matU_des, (hypre_double *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (hypre_double *)rhs, m, malU_info2, policy, &buffer_size));
2923 }
2924 else if (isSinglePrecision)
2925 {
2926 HYPRE_CUSPARSE_CALL(cusparseScsrsm2_bufferSizeExt( handle, algo, CUSPARSE_OPERATION_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE,
2927 n, m, nnz_BLUm, (float *)&alpha, matU_des, (float *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (float *)rhs, m, malU_info2, policy, &buffer_size));
2928 }
2929
2930 if (buffer_size > buffer_size_old)
2931 {
2932 buffer = hypre_ReAlloc_v2(buffer, buffer_size_old, buffer_size, HYPRE_MEMORY_DEVICE);
2933 buffer_size_old = buffer_size;
2934 }
2935
2936 /* analysis */
2937
2938 if (isDoublePrecision)
2939 {
2940 HYPRE_CUSPARSE_CALL(cusparseDcsrsm2_analysis( handle, algo, CUSPARSE_OPERATION_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE,
2941 n, m, nnz_BLUm, (hypre_double *)&alpha, matU_des, (hypre_double *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (hypre_double *)rhs, m, malU_info2, policy, buffer));
2942 }
2943 else if (isSinglePrecision)
2944 {
2945 HYPRE_CUSPARSE_CALL(cusparseScsrsm2_analysis( handle, algo, CUSPARSE_OPERATION_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE,
2946 n, m, nnz_BLUm, (float *)&alpha, matU_des, (float *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (float *)rhs, m, malU_info2, policy, buffer));
2947 }
2948
2949 /* solve phase */
2950 if (isDoublePrecision)
2951 {
2952 HYPRE_CUSPARSE_CALL(cusparseDcsrsm2_solve( handle, algo, CUSPARSE_OPERATION_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE,
2953 n, m, nnz_BLUm, (hypre_double *)&alpha, matU_des, (hypre_double *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (hypre_double *)rhs, m, malU_info2, policy, buffer));
2954 }
2955 else if (isSinglePrecision)
2956 {
2957 HYPRE_CUSPARSE_CALL(cusparseScsrsm2_solve( handle, algo, CUSPARSE_OPERATION_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE,
2958 n, m, nnz_BLUm, (float *)&alpha, matU_des, (float *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (float *)rhs, m, malU_info2, policy, buffer));
2959 }
2960
2961 /* R = - (EiUB ) / LB -> LB'R' = -EiUB'
2962 */
2963 alpha = -1.0;
2964 csrsm2Info_t malL_info2 = NULL;
2965 HYPRE_CUSPARSE_CALL(cusparseCreateCsrsm2Info(&malL_info2));
2966
2967 /* check buffer size and create buffer */
2968
2969 if (isDoublePrecision)
2970 {
2971 HYPRE_CUSPARSE_CALL(cusparseDcsrsm2_bufferSizeExt( handle, algo, CUSPARSE_OPERATION_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE,
2972 n, m, nnz_BLUm, (hypre_double *)&alpha, matL_des, (hypre_double *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (hypre_double *)rhs, m, malL_info2, policy, &buffer_size));
2973 }
2974 else if (isSinglePrecision)
2975 {
2976 HYPRE_CUSPARSE_CALL(cusparseScsrsm2_bufferSizeExt( handle, algo, CUSPARSE_OPERATION_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE,
2977 n, m, nnz_BLUm, (float *)&alpha, matL_des, (float *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (float *)rhs, m, malL_info2, policy, &buffer_size));
2978 }
2979
2980 if (buffer_size > buffer_size_old)
2981 {
2982 buffer = hypre_ReAlloc_v2(buffer, buffer_size_old, buffer_size, HYPRE_MEMORY_DEVICE);
2983 buffer_size_old = buffer_size;
2984 }
2985
2986 /* analysis */
2987
2988 if (isDoublePrecision)
2989 {
2990 HYPRE_CUSPARSE_CALL(cusparseDcsrsm2_analysis( handle, algo, CUSPARSE_OPERATION_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE,
2991 n, m, nnz_BLUm, (hypre_double *)&alpha, matL_des, (hypre_double *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (hypre_double *)rhs, m, malL_info2, policy, buffer));
2992 }
2993 else if (isSinglePrecision)
2994 {
2995 HYPRE_CUSPARSE_CALL(cusparseScsrsm2_analysis( handle, algo, CUSPARSE_OPERATION_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE,
2996 n, m, nnz_BLUm, (float *)&alpha, matL_des, (float *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (float *)rhs, m, malL_info2, policy, buffer));
2997 }
2998
2999 /* solve phase */
3000 if (isDoublePrecision)
3001 {
3002 HYPRE_CUSPARSE_CALL(cusparseDcsrsm2_solve( handle, algo, CUSPARSE_OPERATION_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE,
3003 n, m, nnz_BLUm, (hypre_double *)&alpha, matL_des, (hypre_double *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (hypre_double *)rhs, m, malL_info2, policy, buffer));
3004 }
3005 else if (isSinglePrecision)
3006 {
3007 HYPRE_CUSPARSE_CALL(cusparseScsrsm2_solve( handle, algo, CUSPARSE_OPERATION_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE,
3008 n, m, nnz_BLUm, (float *)&alpha, matL_des, (float *)BLUm_diag_data, BLUm_diag_i, BLUm_diag_j, (float *)rhs, m, malL_info2, policy, buffer));
3009 }
3010 cudaDeviceSynchronize();
3011 /* now form R, m * (n + m) */
3012 HYPRE_Int ctrR = 0;
3013 HYPRE_Int *R_diag_i;
3014 HYPRE_Int *R_offd_i;
3015 HYPRE_Int *R_diag_j;
3016 HYPRE_Real *R_diag_data;
3017
3018 HYPRE_Int capacity_R = nnz_BLUm + m;
3019 R_diag_i = hypre_TAlloc(HYPRE_Int, m+1, HYPRE_MEMORY_DEVICE);
3020 R_offd_i = hypre_CTAlloc(HYPRE_Int, m+1, HYPRE_MEMORY_DEVICE);
3021 R_diag_j = hypre_TAlloc(HYPRE_Int, capacity_R, HYPRE_MEMORY_DEVICE);
3022 R_diag_data = hypre_TAlloc(HYPRE_Real, capacity_R, HYPRE_MEMORY_DEVICE);
3023
3024 for (row = 0; row < m; row++)
3025 {
3026 R_diag_i[row] = ctrR;
3027 for (col = 0; col < n; col++)
3028 {
3029 val = *(rhs + col*m + row);
3030 if (hypre_abs(val) > drop_tol)
3031 {
3032 if (ctrR >= capacity_R)
3033 {
3034 HYPRE_Int tmp;
3035 tmp = capacity_R;
3036 capacity_R = capacity_R * EXPAND_FACT;
3037 R_diag_j = hypre_TReAlloc_v2(R_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_R, HYPRE_MEMORY_DEVICE);
3038 R_diag_data = hypre_TReAlloc_v2(R_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_R, HYPRE_MEMORY_DEVICE);
3039 }
3040 R_diag_j[ctrR] = col;
3041 R_diag_data[ctrR++] = val;
3042 }
3043 }
3044 if (ctrR >= capacity_R)
3045 {
3046 HYPRE_Int tmp;
3047 tmp = capacity_R;
3048 capacity_R = capacity_R * EXPAND_FACT;
3049 R_diag_j = hypre_TReAlloc_v2(R_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_R, HYPRE_MEMORY_DEVICE);
3050 R_diag_data = hypre_TReAlloc_v2(R_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_R, HYPRE_MEMORY_DEVICE);
3051 }
3052 R_diag_j[ctrR] = n + row;
3053 R_diag_data[ctrR++] = 1.0;
3054 }
3055
3056 R_diag_i[m] = ctrR;
3057
3058 hypre_TFree(buffer, HYPRE_MEMORY_DEVICE);
3059
3060 /* create ParCSR matrices */
3061
3062 R = hypre_ParCSRMatrixCreate( hypre_ParCSRMatrixComm(A),
3063 hypre_ParCSRMatrixGlobalNumRows(E),
3064 hypre_ParCSRMatrixGlobalNumCols(A),
3065 hypre_ParCSRMatrixRowStarts(E),
3066 hypre_ParCSRMatrixColStarts(A),
3067 0,
3068 ctrR,
3069 0);
3070
3071 P = hypre_ParCSRMatrixCreate( hypre_ParCSRMatrixComm(A),
3072 hypre_ParCSRMatrixGlobalNumRows(A),
3073 hypre_ParCSRMatrixGlobalNumCols(F),
3074 hypre_ParCSRMatrixRowStarts(A),
3075 hypre_ParCSRMatrixColStarts(F),
3076 0,
3077 ctrP,
3078 0);
3079
3080 /* Assign value to diagonal data */
3081
3082 R_diag = hypre_ParCSRMatrixDiag(R);
3083 hypre_CSRMatrixI(R_diag) = R_diag_i;
3084 hypre_CSRMatrixJ(R_diag) = R_diag_j;
3085 hypre_CSRMatrixData(R_diag) = R_diag_data;
3086 hypre_CSRMatrixSetDataOwner(R_diag, 1);
3087
3088 P_diag = hypre_ParCSRMatrixDiag(P);
3089 hypre_CSRMatrixI(P_diag) = P_diag_i;
3090 hypre_CSRMatrixJ(P_diag) = P_diag_j;
3091 hypre_CSRMatrixData(P_diag) = P_diag_data;
3092 hypre_CSRMatrixSetDataOwner(P_diag, 1);
3093
3094 /* Assign value to off diagonal data */
3095
3096 R_diag = hypre_ParCSRMatrixOffd(R);
3097 hypre_CSRMatrixI(R_diag) = R_offd_i;
3098 P_diag = hypre_ParCSRMatrixOffd(P);
3099 hypre_CSRMatrixI(P_diag) = P_offd_i;
3100
3101 *Rp = R;
3102 *Pp = P;
3103
3104 HYPRE_CUSPARSE_CALL(cusparseDestroyCsrsm2Info(malL_info));
3105 HYPRE_CUSPARSE_CALL(cusparseDestroyCsrsm2Info(malU_info));
3106 HYPRE_CUSPARSE_CALL(cusparseDestroyCsrsm2Info(malL_info2));
3107 HYPRE_CUSPARSE_CALL(cusparseDestroyCsrsm2Info(malU_info2));
3108
3109 return hypre_error_flag;
3110 }
3111
3112 /* Convert the L, D, U style to the cusparse style
3113 * Assume the diagonal of L and U are the ilu factorization, directly combine them
3114 */
3115 HYPRE_Int
hypre_ILUSetupLDUtoCusparse(hypre_ParCSRMatrix * L,HYPRE_Real * D,hypre_ParCSRMatrix * U,hypre_ParCSRMatrix ** LDUp)3116 hypre_ILUSetupLDUtoCusparse(hypre_ParCSRMatrix *L, HYPRE_Real *D, hypre_ParCSRMatrix *U, hypre_ParCSRMatrix **LDUp)
3117 {
3118 /* data slots */
3119 HYPRE_Int i, j, pos;
3120
3121 hypre_CSRMatrix *L_diag = hypre_ParCSRMatrixDiag(L);
3122 hypre_CSRMatrix *U_diag = hypre_ParCSRMatrixDiag(U);
3123 HYPRE_Int *L_diag_i = hypre_CSRMatrixI(L_diag);
3124 HYPRE_Int *L_diag_j = hypre_CSRMatrixJ(L_diag);
3125 HYPRE_Real *L_diag_data = hypre_CSRMatrixData(L_diag);
3126 HYPRE_Int *U_diag_i = hypre_CSRMatrixI(U_diag);
3127 HYPRE_Int *U_diag_j = hypre_CSRMatrixJ(U_diag);
3128 HYPRE_Real *U_diag_data = hypre_CSRMatrixData(U_diag);
3129 HYPRE_Int n = hypre_ParCSRMatrixNumRows(L);
3130 HYPRE_Int nnz_L = L_diag_i[n];
3131 HYPRE_Int nnz_U = U_diag_i[n];
3132 HYPRE_Int nnz_LDU = n + nnz_L + nnz_U;
3133
3134 hypre_ParCSRMatrix *LDU;
3135 hypre_CSRMatrix *LDU_diag;
3136 HYPRE_Int *LDU_diag_i;
3137 HYPRE_Int *LDU_diag_j;
3138 HYPRE_Real *LDU_diag_data;
3139
3140 /* MPI */
3141 MPI_Comm comm = hypre_ParCSRMatrixComm(L);
3142 HYPRE_Int num_procs, my_id;
3143
3144 hypre_MPI_Comm_size(comm,&num_procs);
3145 hypre_MPI_Comm_rank(comm,&my_id);
3146
3147
3148 /* cuda data slot */
3149
3150 /* create matrix */
3151
3152 LDU = hypre_ParCSRMatrixCreate( comm,
3153 hypre_ParCSRMatrixGlobalNumRows(L),
3154 hypre_ParCSRMatrixGlobalNumRows(L),
3155 hypre_ParCSRMatrixRowStarts(L),
3156 hypre_ParCSRMatrixColStarts(L),
3157 0,
3158 nnz_LDU,
3159 0);
3160
3161 LDU_diag = hypre_ParCSRMatrixDiag(LDU);
3162 LDU_diag_i = hypre_TAlloc(HYPRE_Int, n+1, HYPRE_MEMORY_DEVICE);
3163 LDU_diag_j = hypre_TAlloc(HYPRE_Int, nnz_LDU, HYPRE_MEMORY_DEVICE);
3164 LDU_diag_data = hypre_TAlloc(HYPRE_Real, nnz_LDU, HYPRE_MEMORY_DEVICE);
3165
3166 pos = 0;
3167
3168 for (i = 1; i <= n; i++)
3169 {
3170 LDU_diag_i[i-1] = pos;
3171 for (j = L_diag_i[i-1]; j < L_diag_i[i]; j++)
3172 {
3173 LDU_diag_j[pos] = L_diag_j[j];
3174 LDU_diag_data[pos++] = L_diag_data[j];
3175 }
3176 LDU_diag_j[pos] = i-1;
3177 LDU_diag_data[pos++] = 1.0/D[i-1];
3178 for (j = U_diag_i[i-1]; j < U_diag_i[i]; j++)
3179 {
3180 LDU_diag_j[pos] = U_diag_j[j];
3181 LDU_diag_data[pos++] = U_diag_data[j];
3182 }
3183 }
3184 LDU_diag_i[n] = pos;
3185
3186 hypre_CSRMatrixI(LDU_diag) = LDU_diag_i;
3187 hypre_CSRMatrixJ(LDU_diag) = LDU_diag_j;
3188 hypre_CSRMatrixData(LDU_diag) = LDU_diag_data;
3189
3190 /* now sort */
3191 hypre_CSRMatrixSortRow(LDU_diag);
3192 hypre_ParCSRMatrixDiag(LDU) = LDU_diag;
3193
3194 *LDUp = LDU;
3195
3196 return hypre_error_flag;
3197 }
3198
3199 /* Apply the (modified) ILU factorization to the diagonal block of A only.
3200 * A: matrix
3201 * ALUp: pointer to the result, factorization stroed on the diagonal
3202 * modified: set to 0 to use classical ILU0
3203 */
3204 HYPRE_Int
hypre_ILUSetupRAPMILU0(hypre_ParCSRMatrix * A,hypre_ParCSRMatrix ** ALUp,HYPRE_Int modified)3205 hypre_ILUSetupRAPMILU0(hypre_ParCSRMatrix *A, hypre_ParCSRMatrix **ALUp, HYPRE_Int modified)
3206 {
3207 HYPRE_Int n = hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(A));
3208 /* Get necessary slots */
3209 hypre_ParCSRMatrix *L, *U, *S, *ALU;
3210 HYPRE_Real *D;
3211 HYPRE_Int *u_end;
3212
3213 /* u_end is the end position of the upper triangular part (if we need E and F implicitly), not used here */
3214 hypre_ILUSetupMILU0( A, NULL, NULL, n, n, &L, &D, &U, &S, &u_end, modified);
3215 hypre_TFree(u_end, HYPRE_MEMORY_HOST);
3216
3217 hypre_ILUSetupLDUtoCusparse(L, D, U, &ALU);
3218
3219 if (L)
3220 {
3221 hypre_ParCSRMatrixDestroy(L);
3222 }
3223 if (D)
3224 {
3225 hypre_TFree(D, HYPRE_MEMORY_DEVICE);
3226 }
3227 if (U)
3228 {
3229 hypre_ParCSRMatrixDestroy(U);
3230 }
3231
3232 *ALUp = ALU;
3233
3234 return hypre_error_flag;
3235 }
3236
3237 /* Modified ILU(0) with RAP like solve
3238 * A = input matrix
3239 * Not explicitly forming the matrix, the previous version was abondoned
3240 */
3241 HYPRE_Int
hypre_ILUSetupRAPILU0Device(hypre_ParCSRMatrix * A,HYPRE_Int * perm,HYPRE_Int n,HYPRE_Int nLU,cusparseMatDescr_t matL_des,cusparseMatDescr_t matU_des,cusparseSolvePolicy_t ilu_solve_policy,void ** bufferp,csrsv2Info_t * matAL_infop,csrsv2Info_t * matAU_infop,csrsv2Info_t * matBL_infop,csrsv2Info_t * matBU_infop,csrsv2Info_t * matSL_infop,csrsv2Info_t * matSU_infop,hypre_ParCSRMatrix ** Apermptr,hypre_ParCSRMatrix ** matSptr,hypre_CSRMatrix ** ALUptr,hypre_CSRMatrix ** BLUptr,hypre_CSRMatrix ** CLUptr,hypre_CSRMatrix ** Eptr,hypre_CSRMatrix ** Fptr,HYPRE_Int test_opt)3242 hypre_ILUSetupRAPILU0Device(hypre_ParCSRMatrix *A, HYPRE_Int *perm, HYPRE_Int n, HYPRE_Int nLU,
3243 cusparseMatDescr_t matL_des, cusparseMatDescr_t matU_des, cusparseSolvePolicy_t ilu_solve_policy,
3244 void **bufferp, csrsv2Info_t *matAL_infop, csrsv2Info_t *matAU_infop,
3245 csrsv2Info_t *matBL_infop, csrsv2Info_t *matBU_infop,
3246 csrsv2Info_t *matSL_infop, csrsv2Info_t *matSU_infop,
3247 hypre_ParCSRMatrix **Apermptr, hypre_ParCSRMatrix **matSptr, hypre_CSRMatrix **ALUptr, hypre_CSRMatrix **BLUptr, hypre_CSRMatrix **CLUptr,
3248 hypre_CSRMatrix **Eptr, hypre_CSRMatrix **Fptr, HYPRE_Int test_opt)
3249 {
3250
3251 /* params */
3252 MPI_Comm comm = hypre_ParCSRMatrixComm(A);
3253 HYPRE_Int *rperm = NULL;
3254
3255 csrsv2Info_t matAL_info = NULL;
3256 csrsv2Info_t matAU_info = NULL;
3257 csrsv2Info_t matBL_info = NULL;
3258 csrsv2Info_t matBU_info = NULL;
3259 csrsv2Info_t matSL_info = NULL;
3260 csrsv2Info_t matSU_info = NULL;
3261
3262 HYPRE_Int buffer_size = 0;
3263 void *buffer = NULL;
3264
3265 //hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
3266 HYPRE_Int m = n - nLU;
3267
3268 //printf("Size of local Schur: %d\n",m);
3269
3270 HYPRE_Int i;
3271
3272 /* MPI */
3273 HYPRE_Int num_procs, my_id;
3274 hypre_MPI_Comm_size(comm,&num_procs);
3275 hypre_MPI_Comm_rank(comm,&my_id);
3276
3277 /* Matrix Structure */
3278 hypre_ParCSRMatrix *Apq, *ALU, *ALUm, *S;
3279 hypre_CSRMatrix *Amd, *Ad, *SLU, *Apq_diag;
3280
3281 rperm = hypre_CTAlloc(HYPRE_Int, n, HYPRE_MEMORY_HOST);
3282
3283 for(i = 0; i < n; i++)
3284 {
3285 rperm[perm[i]] = i;
3286 }
3287
3288 /* first we need to compute the ILU0 factorization of B */
3289
3290 /* Copy diagonal matrix into a new place with permutation
3291 * That is, Apq = A(perm,qperm);
3292 */
3293 hypre_ParILURAPReorder(A, perm, rperm, &Apq);
3294
3295 /* do the full ILU0 and modified ILU0 */
3296 hypre_ILUSetupRAPMILU0(Apq, &ALU, 0);
3297 hypre_ILUSetupRAPMILU0(Apq, &ALUm, 1);
3298
3299 hypre_CSRMatrix *dB, *dS, *dE, *dF;
3300
3301 /* get modified and extract LU factorization */
3302 Amd = hypre_ParCSRMatrixDiag(ALUm);
3303 Ad = hypre_ParCSRMatrixDiag(ALU);
3304 switch(test_opt)
3305 {
3306 case 1:
3307 {
3308 /* RAP where we save E and F */
3309 Apq_diag = hypre_ParCSRMatrixDiag(Apq);
3310 hypre_CSRMatrixSortRow(Apq_diag);
3311 hypre_ParILUCusparseILUExtractEBFC(Apq_diag, nLU, &dB, &dS, Eptr, Fptr);
3312 /* get modified ILU of B */
3313 hypre_ParILUCusparseILUExtractEBFC(Amd, nLU, BLUptr, &SLU, &dE, &dF);
3314 hypre_CSRMatrixDestroy(dB);
3315 hypre_CSRMatrixDestroy(dS);
3316 hypre_CSRMatrixDestroy(dE);
3317 hypre_CSRMatrixDestroy(dF);
3318 }
3319 break;
3320 case 2:
3321 {
3322 /* C-EB^{-1}F where we save EU^{-1}, L^{-1}F as sparse matrices */
3323 Apq_diag = hypre_ParCSRMatrixDiag(Apq);
3324 hypre_CSRMatrixSortRow(Apq_diag);
3325 hypre_ParILUCusparseILUExtractEBFC(Apq_diag, nLU, &dB, CLUptr, &dE, &dF);
3326 /* get modified ILU of B */
3327 hypre_ParILUCusparseILUExtractEBFC(Amd, nLU, BLUptr, &SLU, Eptr, Fptr);
3328 hypre_CSRMatrixDestroy(dB);
3329 hypre_CSRMatrixDestroy(dE);
3330 hypre_CSRMatrixDestroy(dF);
3331 }
3332 break;
3333 case 3:
3334 {
3335 /* C-EB^{-1}F where we save E and F */
3336 Apq_diag = hypre_ParCSRMatrixDiag(Apq);
3337 hypre_CSRMatrixSortRow(Apq_diag);
3338 hypre_ParILUCusparseILUExtractEBFC(Apq_diag, nLU, &dB, CLUptr, Eptr, Fptr);
3339 /* get modified ILU of B */
3340 hypre_ParILUCusparseILUExtractEBFC(Amd, nLU, BLUptr, &SLU, &dE, &dF);
3341 hypre_CSRMatrixDestroy(dB);
3342 hypre_CSRMatrixDestroy(dE);
3343 hypre_CSRMatrixDestroy(dF);
3344 }
3345 break;
3346 case 4:
3347 {
3348 /* RAP where we save EU^{-1}, L^{-1}F as sparse matrices */
3349 hypre_ParILUCusparseILUExtractEBFC(Ad, nLU, BLUptr, &SLU, Eptr, Fptr);
3350 }
3351 break;
3352 case 0: default:
3353 {
3354 /* RAP where we save EU^{-1}, L^{-1}F as sparse matrices */
3355 hypre_ParILUCusparseILUExtractEBFC(Amd, nLU, BLUptr, &SLU, Eptr, Fptr);
3356 }
3357 break;
3358 }
3359
3360 *ALUptr = hypre_ParCSRMatrixDiag(ALU);
3361 /* Analysis of BILU */
3362 HYPRE_ILUSetupCusparseCSRILU0SetupSolve(*ALUptr, matL_des, matU_des,
3363 ilu_solve_policy, &matAL_info, &matAU_info,
3364 &buffer_size, &buffer);
3365
3366 /* Analysis of BILU */
3367 HYPRE_ILUSetupCusparseCSRILU0SetupSolve(*BLUptr, matL_des, matU_des,
3368 ilu_solve_policy, &matBL_info, &matBU_info,
3369 &buffer_size, &buffer);
3370
3371 /* Analysis of SILU */
3372 HYPRE_ILUSetupCusparseCSRILU0SetupSolve(SLU, matL_des, matU_des,
3373 ilu_solve_policy, &matSL_info, &matSU_info,
3374 &buffer_size, &buffer);
3375
3376 /* start forming parCSR matrix S */
3377
3378 HYPRE_BigInt S_total_rows, *S_row_starts;
3379 HYPRE_BigInt big_m = (HYPRE_BigInt)m;
3380 hypre_MPI_Allreduce( &big_m, &S_total_rows, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
3381
3382 if (S_total_rows>0)
3383 {
3384 {
3385 HYPRE_BigInt global_start;
3386 S_row_starts = hypre_CTAlloc(HYPRE_BigInt,2,HYPRE_MEMORY_HOST);
3387 hypre_MPI_Scan( &big_m, &global_start, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
3388 S_row_starts[0] = global_start - m;
3389 S_row_starts[1] = global_start;
3390 }
3391
3392 S_row_starts = hypre_CTAlloc(HYPRE_BigInt, 2, HYPRE_MEMORY_HOST);
3393 S_row_starts[1] = S_total_rows;
3394 S_row_starts[0] = S_total_rows - m;
3395 hypre_MPI_Allreduce(&m, &S_total_rows, 1, HYPRE_MPI_INT, hypre_MPI_SUM, comm);
3396 S = hypre_ParCSRMatrixCreate( hypre_ParCSRMatrixComm(A),
3397 S_total_rows,
3398 S_total_rows,
3399 S_row_starts,
3400 S_row_starts,
3401 0,
3402 0,
3403 0);
3404
3405 /* memroy leak here */
3406 hypre_ParCSRMatrixDiag(S) = SLU;
3407
3408 /* free memory */
3409 hypre_TFree(S_row_starts, HYPRE_MEMORY_HOST);
3410 }
3411
3412 *matSptr = S;
3413 *Apermptr = Apq;
3414 *bufferp = buffer;
3415 *matAL_infop = matAL_info;
3416 *matAU_infop = matAU_info;
3417 *matBL_infop = matBL_info;
3418 *matBU_infop = matBU_info;
3419 *matSL_infop = matSL_info;
3420 *matSU_infop = matSU_info;
3421
3422 return hypre_error_flag;
3423 }
3424
3425 #endif
3426
3427 /* Modified ILU(0) with RAP like solve
3428 * A = input matrix
3429 * Not explicitly forming the matrix
3430 */
3431 HYPRE_Int
hypre_ILUSetupRAPILU0(hypre_ParCSRMatrix * A,HYPRE_Int * perm,HYPRE_Int n,HYPRE_Int nLU,hypre_ParCSRMatrix ** Lptr,HYPRE_Real ** Dptr,hypre_ParCSRMatrix ** Uptr,hypre_ParCSRMatrix ** mLptr,HYPRE_Real ** mDptr,hypre_ParCSRMatrix ** mUptr,HYPRE_Int ** u_end)3432 hypre_ILUSetupRAPILU0(hypre_ParCSRMatrix *A, HYPRE_Int *perm, HYPRE_Int n, HYPRE_Int nLU,
3433 hypre_ParCSRMatrix **Lptr, HYPRE_Real **Dptr, hypre_ParCSRMatrix **Uptr,
3434 hypre_ParCSRMatrix **mLptr, HYPRE_Real **mDptr, hypre_ParCSRMatrix **mUptr, HYPRE_Int **u_end)
3435 {
3436 HYPRE_Int i;
3437 hypre_ParCSRMatrix *S_temp = NULL;
3438 HYPRE_Int *u_temp = NULL;
3439
3440 /* standard ILU0 factorization */
3441 hypre_ILUSetupMILU0(A, perm, perm, n, n, Lptr, Dptr, Uptr, &S_temp, &u_temp, 0);
3442 if (S_temp)
3443 {
3444 hypre_ParCSRMatrixDestroy(S_temp);
3445 }
3446 if (u_temp)
3447 {
3448 hypre_Free( u_temp, HYPRE_MEMORY_HOST);
3449 }
3450 /* modified ILU0 factorization */
3451 hypre_ILUSetupMILU0(A, perm, perm, n, n, mLptr, mDptr, mUptr, &S_temp, &u_temp, 1);
3452 if (S_temp)
3453 {
3454 hypre_ParCSRMatrixDestroy(S_temp);
3455 }
3456 if (u_temp)
3457 {
3458 hypre_Free( u_temp, HYPRE_MEMORY_HOST);
3459 }
3460
3461 /* pointer to the start location */
3462 HYPRE_Int *u_end_array;
3463 u_end_array = hypre_TAlloc(HYPRE_Int, n, HYPRE_MEMORY_HOST);
3464
3465 hypre_CSRMatrix *U_diag = hypre_ParCSRMatrixDiag(*Uptr);
3466 HYPRE_Int *U_diag_i = hypre_CSRMatrixI(U_diag);
3467 HYPRE_Int *U_diag_j = hypre_CSRMatrixJ(U_diag);
3468 HYPRE_Real *U_diag_data = hypre_CSRMatrixData(U_diag);
3469 hypre_CSRMatrix *mU_diag = hypre_ParCSRMatrixDiag(*mUptr);
3470 HYPRE_Int *mU_diag_i = hypre_CSRMatrixI(mU_diag);
3471 HYPRE_Int *mU_diag_j = hypre_CSRMatrixJ(mU_diag);
3472 HYPRE_Real *mU_diag_data = hypre_CSRMatrixData(mU_diag);
3473
3474 // first sort the Upper part U
3475 for (i = 0; i < nLU; i++)
3476 {
3477 hypre_qsort1(U_diag_j,U_diag_data,U_diag_i[i],U_diag_i[i+1]-1);
3478 hypre_qsort1(mU_diag_j,mU_diag_data,mU_diag_i[i],mU_diag_i[i+1]-1);
3479 hypre_BinarySearch2(U_diag_j,nLU,U_diag_i[i],U_diag_i[i+1]-1,u_end_array + i);
3480 }
3481
3482 hypre_CSRMatrix *L_diag = hypre_ParCSRMatrixDiag(*Lptr);
3483 HYPRE_Int *L_diag_i = hypre_CSRMatrixI(L_diag);
3484 HYPRE_Int *L_diag_j = hypre_CSRMatrixJ(L_diag);
3485 HYPRE_Real *L_diag_data = hypre_CSRMatrixData(L_diag);
3486 hypre_CSRMatrix *mL_diag = hypre_ParCSRMatrixDiag(*mLptr);
3487 HYPRE_Int *mL_diag_i = hypre_CSRMatrixI(mL_diag);
3488 HYPRE_Int *mL_diag_j = hypre_CSRMatrixJ(mL_diag);
3489 HYPRE_Real *mL_diag_data = hypre_CSRMatrixData(mL_diag);
3490
3491 // now sort the Lower part L
3492 for (i = nLU; i < n; i++)
3493 {
3494 hypre_qsort1(L_diag_j,L_diag_data,L_diag_i[i],L_diag_i[i+1]-1);
3495 hypre_qsort1(mL_diag_j,mL_diag_data,mL_diag_i[i],mL_diag_i[i+1]-1);
3496 hypre_BinarySearch2(L_diag_j, nLU, L_diag_i[i], L_diag_i[i+1]-1, u_end_array + i);
3497 }
3498
3499 *u_end = u_end_array;
3500
3501 return hypre_error_flag;
3502 }
3503
3504 /* ILU(0)
3505 * A = input matrix
3506 * perm = permutation array indicating ordering of rows. Perm could come from a
3507 * CF_marker array or a reordering routine. When set to NULL, indentity permutation is used.
3508 * qperm = permutation array indicating ordering of columns. When set to NULL, indentity permutation is used.
3509 * nI = number of interial unknowns
3510 * nLU = size of incomplete factorization, nLU should obey nLU <= nI.
3511 * Schur complement is formed if nLU < n
3512 * Lptr, Dptr, Uptr, Sptr = L, D, U, S factors.
3513 * will form global Schur Matrix if nLU < n
3514 */
3515 HYPRE_Int
hypre_ILUSetupILU0(hypre_ParCSRMatrix * A,HYPRE_Int * perm,HYPRE_Int * qperm,HYPRE_Int nLU,HYPRE_Int nI,hypre_ParCSRMatrix ** Lptr,HYPRE_Real ** Dptr,hypre_ParCSRMatrix ** Uptr,hypre_ParCSRMatrix ** Sptr,HYPRE_Int ** u_end)3516 hypre_ILUSetupILU0(hypre_ParCSRMatrix *A, HYPRE_Int *perm, HYPRE_Int *qperm, HYPRE_Int nLU, HYPRE_Int nI,
3517 hypre_ParCSRMatrix **Lptr, HYPRE_Real** Dptr, hypre_ParCSRMatrix **Uptr, hypre_ParCSRMatrix **Sptr, HYPRE_Int **u_end)
3518 {
3519 return hypre_ILUSetupMILU0( A, perm, qperm, nLU, nI, Lptr, Dptr, Uptr, Sptr, u_end, 0);
3520 }
3521
/* (modified) ILU(0)
 *
 * Factors the local diagonal block of A on the sparsity pattern of A (no
 * fill-in). Rows are visited in the order given by the row permutation and
 * columns are renumbered through the inverse of the column permutation.
 * The first nLU permuted rows produce the strictly lower part L, the inverse
 * diagonal D and the strictly upper part U. The remaining n - nLU rows are
 * eliminated against those pivots: their multipliers are appended to L and
 * what is left of each row becomes a row of the Schur complement S.
 *
 * A = input matrix
 * permp = permutation array indicating ordering of rows. Perm could come from a
 *    CF_marker array or a reordering routine. When set to NULL, identity permutation is used.
 * qpermp = permutation array indicating ordering of columns. When set to NULL, identity permutation is used.
 * nI = number of interior unknowns
 * nLU = size of incomplete factorization, nLU should obey nLU <= nI.
 *    Schur complement is formed if nLU < n
 * Lptr, Dptr, Uptr, Sptr = L, D, U, S factors (created here).
 *    D has n entries: the inverse pivot for rows < nLU and 1.0 for the rest.
 *    *Sptr is NULL when the global number of Schur rows is zero.
 * u_end = on return, array of nLU entries. When nLU == n entry ii is the end
 *    offset of row ii in U; otherwise it is whatever hypre_BinarySearch2
 *    reports for the sorted row (presumably the position of the first
 *    column >= nLU -- confirm against hypre_BinarySearch2).
 * modified = set to 0 to use classical ILU; when nonzero the dropped fill
 *    contributions of each row are added to that row's diagonal.
 * will form global Schur Matrix if nLU < n
 *
 * Returns hypre_error_flag.
 */
HYPRE_Int
hypre_ILUSetupMILU0(hypre_ParCSRMatrix *A, HYPRE_Int *permp, HYPRE_Int *qpermp, HYPRE_Int nLU, HYPRE_Int nI,
      hypre_ParCSRMatrix **Lptr, HYPRE_Real** Dptr, hypre_ParCSRMatrix **Uptr, hypre_ParCSRMatrix **Sptr, HYPRE_Int **u_end,
      HYPRE_Int modified)
{
   HYPRE_Int                i, ii, j, k, k1, k2, k3, ctrU, ctrL, ctrS, lenl, lenu, jpiv, col, jpos;
   HYPRE_Int                *iw, *iL, *iU;
   HYPRE_Real               dd, t, dpiv, lxu, *wU, *wL;
   /* accumulated fill that fell outside the pattern of the current row */
   HYPRE_Real               drop;

   /* communication data used to build S */
   MPI_Comm                 comm = hypre_ParCSRMatrixComm(A);
   HYPRE_Int                S_offd_nnz, S_offd_ncols;
   hypre_ParCSRCommPkg      *comm_pkg;
   hypre_ParCSRCommHandle   *comm_handle;
   HYPRE_Int                num_sends, begin, end;
   HYPRE_BigInt             *send_buf = NULL;
   HYPRE_Int                num_procs, my_id;

   /* data objects for A */
   hypre_CSRMatrix          *A_diag = hypre_ParCSRMatrixDiag(A);
   hypre_CSRMatrix          *A_offd = hypre_ParCSRMatrixOffd(A);
   HYPRE_Real               *A_diag_data = hypre_CSRMatrixData(A_diag);
   HYPRE_Int                *A_diag_i = hypre_CSRMatrixI(A_diag);
   HYPRE_Int                *A_diag_j = hypre_CSRMatrixJ(A_diag);
   HYPRE_Real               *A_offd_data = hypre_CSRMatrixData(A_offd);
   HYPRE_Int                *A_offd_i = hypre_CSRMatrixI(A_offd);
   HYPRE_Int                *A_offd_j = hypre_CSRMatrixJ(A_offd);

   /* size of problem and Schur system:
    *   n   = local rows
    *   m   = local rows of S (rows not factored)
    *   e   = interior rows that are not factored (nLU <= row < nI)
    *   m_e = rows at or beyond nI
    */
   HYPRE_Int                n = hypre_CSRMatrixNumRows(A_diag);
   HYPRE_Int                m = n - nLU;
   HYPRE_Int                e = nI - nLU;
   HYPRE_Int                m_e = n - nI;
   HYPRE_Real               local_nnz, total_nnz;
   HYPRE_Int                *u_end_array;

   /* data objects for L, D, U */
   hypre_ParCSRMatrix       *matL;
   hypre_ParCSRMatrix       *matU;
   hypre_CSRMatrix          *L_diag;
   hypre_CSRMatrix          *U_diag;
   HYPRE_Real               *D_data;
   HYPRE_Real               *L_diag_data;
   HYPRE_Int                *L_diag_i;
   HYPRE_Int                *L_diag_j;
   HYPRE_Real               *U_diag_data;
   HYPRE_Int                *U_diag_i;
   HYPRE_Int                *U_diag_j;

   /* data objects for S */
   hypre_ParCSRMatrix       *matS = NULL;
   hypre_CSRMatrix          *S_diag;
   hypre_CSRMatrix          *S_offd;
   HYPRE_Real               *S_diag_data = NULL;
   HYPRE_Int                *S_diag_i = NULL;
   HYPRE_Int                *S_diag_j = NULL;
   HYPRE_Int                *S_offd_i = NULL;
   HYPRE_Int                *S_offd_j = NULL;
   HYPRE_BigInt             *S_offd_colmap = NULL;
   HYPRE_Real               *S_offd_data;
   HYPRE_BigInt             col_starts[2];
   HYPRE_BigInt             total_rows;

   /* memory management: current capacities of the growable j/data arrays */
   HYPRE_Int                initial_alloc = 0;
   HYPRE_Int                capacity_L;
   HYPRE_Int                capacity_U;
   HYPRE_Int                capacity_S = 0;
   HYPRE_Int                nnz_A = A_diag_i[n];

   /* reverse permutation array (inverse of qperm, lives inside iw) */
   HYPRE_Int                *rperm;
   HYPRE_Int                *perm, *qperm;

   /* start setup
    * get communication data first
    */
   hypre_MPI_Comm_size(comm,&num_procs);
   hypre_MPI_Comm_rank(comm,&my_id);
   comm_pkg = hypre_ParCSRMatrixCommPkg(A);
   /* build the communication package if A does not have one yet */
   if (!comm_pkg)
   {
      hypre_MatvecCommPkgCreate(A);
      comm_pkg = hypre_ParCSRMatrixCommPkg(A);
   }

   /* check for correctness
    * NOTE(review): both checks only record an error; execution continues
    * with the out-of-range sizes.
    */
   if (nLU < 0 || nLU > n)
   {
      hypre_error_w_msg(HYPRE_ERROR_ARG,"WARNING: nLU out of range.\n");
   }
   if (e < 0)
   {
      hypre_error_w_msg(HYPRE_ERROR_ARG,"WARNING: nLU should not exceed nI.\n");
   }

   /* Allocate memory for u_end array */
   u_end_array = hypre_TAlloc(HYPRE_Int, nLU, HYPRE_MEMORY_HOST);

   /* Allocate memory for L,D,U,S factors.
    * Initial guess: half of nnz(A), scaled by the fraction of rows that goes
    * into each factor, plus one slot per row. The arrays grow on demand.
    */
   if (n > 0)
   {
      initial_alloc = nLU + ceil((nnz_A / 2.0)*nLU/n);
      capacity_S = m + ceil((nnz_A / 2.0)*m/n);
   }
   capacity_L = initial_alloc;
   capacity_U = initial_alloc;

   D_data = hypre_TAlloc(HYPRE_Real, n, HYPRE_MEMORY_DEVICE);
   L_diag_i = hypre_TAlloc(HYPRE_Int, n+1, HYPRE_MEMORY_DEVICE);
   L_diag_j = hypre_TAlloc(HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
   L_diag_data = hypre_TAlloc(HYPRE_Real, capacity_L, HYPRE_MEMORY_DEVICE);
   U_diag_i = hypre_TAlloc(HYPRE_Int, n+1, HYPRE_MEMORY_DEVICE);
   U_diag_j = hypre_TAlloc(HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
   U_diag_data = hypre_TAlloc(HYPRE_Real, capacity_U, HYPRE_MEMORY_DEVICE);
   S_diag_i = hypre_TAlloc(HYPRE_Int, m+1, HYPRE_MEMORY_DEVICE);
   S_diag_j = hypre_TAlloc(HYPRE_Int, capacity_S, HYPRE_MEMORY_DEVICE);
   S_diag_data = hypre_TAlloc(HYPRE_Real, capacity_S, HYPRE_MEMORY_DEVICE);

   /* allocate working arrays.
    * One 3*n integer buffer is split into three length-n pieces:
    *   iw    = position marker per (permuted) column, -1 when not in the row
    *   iL    = column indices of the row being processed
    *   rperm = inverse column permutation
    * wL holds the values that go with iL.
    */
   iw = hypre_TAlloc(HYPRE_Int, 3*n, HYPRE_MEMORY_HOST);
   iL = iw+n;
   rperm = iw + 2*n;
   wL = hypre_TAlloc(HYPRE_Real, n, HYPRE_MEMORY_HOST);

   ctrU = ctrL = ctrS = 0;
   L_diag_i[0] = U_diag_i[0] = S_diag_i[0] = 0;
   /* set marker array iw to -1 */
   for (i = 0; i < n; i++)
   {
      iw[i] = -1;
   }

   /* get reverse permutation (rperm).
    * create identity permutations if the inputs are NULL
    * rperm holds the reordered indexes.
    * rperm is only used for columns
    */

   if (!permp)
   {
      perm = hypre_TAlloc(HYPRE_Int, n, HYPRE_MEMORY_DEVICE);
      for (i = 0; i < n; i++)
      {
         perm[i] = i;
      }
   }
   else
   {
      perm = permp;
   }

   if (!qpermp)
   {
      qperm = hypre_TAlloc(HYPRE_Int, n, HYPRE_MEMORY_DEVICE);
      for (i = 0; i < n; i++)
      {
         qperm[i] = i;
      }
   }
   else
   {
      qperm = qpermp;
   }

   for (i = 0; i < n; i++)
   {
      rperm[qperm[i]] = i;
   }

   /*---------  Begin Factorization. Work in permuted space  ----*/
   for (ii = 0; ii < nLU; ii++)
   {
      // get row i
      i = perm[ii];
      // get extents of row i
      k1=A_diag_i[i];
      k2=A_diag_i[i+1];
      // track the drop
      drop = 0.0;

      /*-------------------- unpack L & U-parts of row of A in arrays w.
       * The U part shares the work buffers with the L part and starts at
       * offset ii. */
      iU = iL+ii;
      wU = wL+ii;
      /*-------------------- diagonal entry */
      dd = 0.0;
      lenl = lenu = 0;
      /* any non-negative marker flags the diagonal as part of the pattern */
      iw[ii] = ii;
      /*-------------------- scan & unwrap column */
      for (j = k1; j < k2; j++)
      {
         col = rperm[A_diag_j[j]];
         t = A_diag_data[j];
         if ( col < ii )
         {
            iw[col] = lenl;
            iL[lenl] = col;
            wL[lenl++] = t;
         }
         else if (col > ii)
         {
            iw[col] = lenu;
            iU[lenu] = col;
            wU[lenu++] = t;
         }
         else
         {
            dd=t;
         }
      }

      /* eliminate row */
      /*-------------------------------------------------------------------------
       *  In order to do the elimination in the correct order we must select the
       *  smallest column index among iL[k], k = j, j+1, ..., lenl-1. For ILU(0),
       *  no new fill-ins are expected, so we can pre-sort iL and wL prior to
       *  entering the elimination loop.
       *  (presumably hypre_qsort3ir also keeps the markers in iw consistent
       *  with the new positions -- confirm against its definition)
       *-----------------------------------------------------------------------*/
      hypre_qsort3ir(iL, wL, iw, 0, (lenl-1));
      for (j = 0; j < lenl; j++)
      {
         jpiv = iL[j];
         /* get factor/ pivot element (D_data stores the inverse pivot) */
         dpiv = wL[j] * D_data[jpiv];
         /* store entry in L */
         wL[j] = dpiv;

         /* zero out element - reset pivot */
         iw[jpiv] = -1;
         /* combine current row and pivot row */
         for (k = U_diag_i[jpiv]; k < U_diag_i[jpiv+1]; k++)
         {
            col = U_diag_j[k];
            jpos = iw[col];

            /* Only update entries already in the pattern (jpos >= 0);
             * anything else is fill, which is dropped and accumulated */
            if (jpos < 0)
            {
               drop = drop - U_diag_data[k] * dpiv;
               continue;
            }

            lxu = - U_diag_data[k] * dpiv;
            if (col < ii)
            {
               /* dealing with L part */
               wL[jpos] += lxu;
            }
            else if (col > ii)
            {
               /* dealing with U part */
               wU[jpos] += lxu;
            }
            else
            {
               /* diagonal update */
               dd += lxu;
            }
         }
      }
      /* modify when necessary: add the dropped fill to the diagonal */
      if (modified)
      {
         dd = dd + drop;
      }

      /* restore iw (only need to restore diagonal and U part; the L part was
       * reset during the elimination) */
      iw[ii] = -1;
      for (j = 0; j < lenu; j++)
      {
         iw[iU[j]] = -1;
      }

      /* Update LDU factors */
      /* L part */
      /* Check that memory is sufficient */
      if (lenl > 0)
      {
         while ((ctrL+lenl) > capacity_L)
         {
            HYPRE_Int tmp = capacity_L;
            capacity_L = capacity_L * EXPAND_FACT + 1;
            L_diag_j = hypre_TReAlloc_v2(L_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
            L_diag_data = hypre_TReAlloc_v2(L_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_L, HYPRE_MEMORY_DEVICE);
         }
         /* NOTE(review): the destination was allocated with
          * HYPRE_MEMORY_DEVICE but is copied as host memory here; this
          * relies on that allocation being host-accessible -- confirm. */
         hypre_TMemcpy(&L_diag_j[ctrL], iL, HYPRE_Int, lenl,
                       HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
         hypre_TMemcpy(&L_diag_data[ctrL], wL, HYPRE_Real, lenl,
                       HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
      }
      L_diag_i[ii+1] = (ctrL+=lenl);

      /* diagonal part (we store the inverse).
       * A pivot smaller in magnitude than MAT_TOL is replaced by 1.0e-6. */
      if (fabs(dd) < MAT_TOL)
      {
         dd = 1.0e-6;
      }
      D_data[ii] = 1./dd;

      /* U part */
      /* Check that memory is sufficient */
      if (lenu > 0)
      {
         while ((ctrU+lenu) > capacity_U)
         {
            HYPRE_Int tmp = capacity_U;
            capacity_U = capacity_U * EXPAND_FACT + 1;
            U_diag_j = hypre_TReAlloc_v2(U_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
            U_diag_data = hypre_TReAlloc_v2(U_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_U, HYPRE_MEMORY_DEVICE);
         }
         /* NOTE(review): same host/device remark as for the L copy above */
         hypre_TMemcpy(&U_diag_j[ctrU], iU, HYPRE_Int, lenu,
                       HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
         hypre_TMemcpy(&U_diag_data[ctrU], wU, HYPRE_Real, lenu,
                       HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
      }
      U_diag_i[ii+1] = (ctrU+=lenu);

      /* check and build u_end array */
      if (m > 0)
      {
         /* sort the new U row by column, then let hypre_BinarySearch2 fill
          * u_end_array[ii] (presumably the split at column nLU -- confirm) */
         hypre_qsort1(U_diag_j,U_diag_data,U_diag_i[ii],U_diag_i[ii+1]-1);
         hypre_BinarySearch2(U_diag_j,nLU,U_diag_i[ii],U_diag_i[ii+1]-1,u_end_array + ii);
      }
      else
      {
         /* Everything is in U */
         u_end_array[ii] = ctrU;
      }

   }

   /*---------  Begin Factorization in Schur Complement part  ----*/
   for (ii = nLU; ii < n; ii++)
   {
      // get row i
      i = perm[ii];
      // get extents of row i
      k1=A_diag_i[i];
      k2=A_diag_i[i+1];
      drop = 0.0;

      /*-------------------- unpack L & U-parts of row of A in arrays w.
       * Columns < nLU go to the L part; the remaining off-diagonal columns
       * (the S part) are stored starting at offset nLU + 1. */
      iU = iL+nLU + 1;
      wU = wL+nLU + 1;
      /*-------------------- diagonal entry */
      dd = 0.0;
      lenl = lenu = 0;
      /* any non-negative marker flags the diagonal as part of the pattern */
      iw[ii] = nLU;
      /*-------------------- scan & unwrap column */
      for (j = k1; j < k2; j++)
      {
         col = rperm[A_diag_j[j]];
         t = A_diag_data[j];
         if ( col < nLU )
         {
            iw[col] = lenl;
            iL[lenl] = col;
            wL[lenl++] = t;
         }
         else if (col != ii)
         {
            iw[col] = lenu;
            iU[lenu] = col;
            wU[lenu++] = t;
         }
         else
         {
            dd=t;
         }
      }

      /* eliminate row */
      /*-------------------------------------------------------------------------
       *  In order to do the elimination in the correct order we must select the
       *  smallest column index among iL[k], k = j, j+1, ..., lenl-1. For ILU(0),
       *  no new fill-ins are expected, so we can pre-sort iL and wL prior to
       *  entering the elimination loop.
       *-----------------------------------------------------------------------*/
      hypre_qsort3ir(iL, wL, iw, 0, (lenl-1));
      for (j = 0; j < lenl; j++)
      {
         jpiv = iL[j];
         /* get factor/ pivot element (D_data stores the inverse pivot) */
         dpiv = wL[j] * D_data[jpiv];
         /* store entry in L */
         wL[j] = dpiv;

         /* zero out element - reset pivot */
         iw[jpiv] = -1;
         /* combine current row and pivot row */
         for (k = U_diag_i[jpiv]; k < U_diag_i[jpiv+1]; k++)
         {
            col = U_diag_j[k];
            jpos = iw[col];

            /* Only update entries already in the pattern (jpos >= 0);
             * anything else is fill, which is dropped and accumulated */
            if (jpos < 0)
            {
               drop = drop - U_diag_data[k] * dpiv;
               continue;
            }

            lxu = - U_diag_data[k] * dpiv;
            if (col < nLU)
            {
               /* dealing with L part */
               wL[jpos] += lxu;
            }
            else if (col != ii)
            {
               /* dealing with U part */
               wU[jpos] += lxu;
            }
            else
            {
               /* diagonal update */
               dd += lxu;
            }
         }
      }
      /* modify when necessary: add the dropped fill to the diagonal */
      if (modified)
      {
         dd = dd + drop;
      }
      /* restore iw (only need to restore diagonal and U part) */
      iw[ii] = -1;
      for (j = 0; j < lenu; j++)
      {
         iw[iU[j]] = -1;
      }

      /* Update LDU factors */
      /* L part */
      /* Check that memory is sufficient */
      if (lenl > 0)
      {
         while ((ctrL+lenl) > capacity_L)
         {
            HYPRE_Int tmp = capacity_L;
            capacity_L = capacity_L * EXPAND_FACT + 1;
            L_diag_j = hypre_TReAlloc_v2(L_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
            L_diag_data = hypre_TReAlloc_v2(L_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_L, HYPRE_MEMORY_DEVICE);
         }
         /* NOTE(review): destination allocated with HYPRE_MEMORY_DEVICE,
          * copied as host memory -- confirm it is host-accessible. */
         hypre_TMemcpy(&L_diag_j[ctrL], iL, HYPRE_Int, lenl,
                       HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
         hypre_TMemcpy(&L_diag_data[ctrL], wL, HYPRE_Real, lenl,
                       HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
      }
      L_diag_i[ii+1] = (ctrL+=lenl);

      /* S part */
      /* Check that memory is sufficient (lenu off-diagonals + the diagonal) */
      while ((ctrS+lenu+1) > capacity_S)
      {
         HYPRE_Int tmp = capacity_S;
         capacity_S = capacity_S * EXPAND_FACT + 1;
         S_diag_j = hypre_TReAlloc_v2(S_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_S, HYPRE_MEMORY_DEVICE);
         S_diag_data = hypre_TReAlloc_v2(S_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_S, HYPRE_MEMORY_DEVICE);
      }
      /* remember S uses a new index system: local indices are shifted by nLU.
       * The diagonal entry is stored first in each row. */
      S_diag_j[ctrS] = ii - nLU;
      S_diag_data[ctrS] = dd;
      for (j = 0; j < lenu; j++)
      {
         S_diag_j[ctrS+1+j] = iU[j] - nLU;
      }
      /* NOTE(review): same host/device remark as for the L copy above */
      hypre_TMemcpy(S_diag_data+ctrS+1, wU, HYPRE_Real, lenu,
                    HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
      S_diag_i[ii-nLU+1] = ctrS+=(lenu+1);
   }
   /* Assemble LDUS matrices */
   /* unfactored rows: empty rows in U and 1.0 in D */
   for (k = nLU; k < n; k++)
   {
      U_diag_i[k+1] = ctrU;
      D_data[k] = 1.;
   }

   /* First create Schur complement if necessary
    * Check if we need to create Schur complement
    */
   HYPRE_BigInt big_m = (HYPRE_BigInt)m;
   hypre_MPI_Allreduce(&big_m, &total_rows, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
   /* only form when total_rows > 0 */
   if ( total_rows > 0 )
   {
      /* now create S */
      /* need to get new column start: the inclusive scan gives the end of
       * this rank's range, so the start is that value minus m */
      {
         HYPRE_BigInt global_start;
         hypre_MPI_Scan( &big_m, &global_start, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
         col_starts[0] = global_start - m;
         col_starts[1] = global_start;
      }

      /* We did nothing to A_offd, so all the data kept, just reorder them
       * The create function takes comm, global num rows/cols,
       *    row/col start, num cols offd, nnz diag, nnz offd
       */
      S_offd_nnz = hypre_CSRMatrixNumNonzeros(A_offd);
      S_offd_ncols = hypre_CSRMatrixNumCols(A_offd);

      matS = hypre_ParCSRMatrixCreate( comm,
                                       total_rows,
                                       total_rows,
                                       col_starts,
                                       col_starts,
                                       S_offd_ncols,
                                       ctrS,
                                       S_offd_nnz);

      /* first put diagonal data in */
      S_diag = hypre_ParCSRMatrixDiag(matS);

      hypre_CSRMatrixI(S_diag) = S_diag_i;
      hypre_CSRMatrixData(S_diag) = S_diag_data;
      hypre_CSRMatrixJ(S_diag) = S_diag_j;

      /* now start to construct offdiag of S */
      S_offd = hypre_ParCSRMatrixOffd(matS);
      S_offd_i = hypre_TAlloc(HYPRE_Int, m+1, HYPRE_MEMORY_DEVICE);
      S_offd_j = hypre_TAlloc(HYPRE_Int, S_offd_nnz, HYPRE_MEMORY_DEVICE);
      S_offd_data = hypre_TAlloc(HYPRE_Real, S_offd_nnz, HYPRE_MEMORY_DEVICE);
      S_offd_colmap = hypre_CTAlloc(HYPRE_BigInt, S_offd_ncols, HYPRE_MEMORY_HOST);

      /* simply use a loop to copy data from A_offd.
       * The first e rows of S (permuted rows nLU..nI-1) get no off-diagonal
       * entries; the remaining rows copy the matching row of A_offd. */
      S_offd_i[0] = 0;
      k3 = 0;
      for (i = 1; i <= e; i++)
      {
         S_offd_i[i] = k3;
      }
      for (i = 0; i < m_e; i++)
      {
         /* here col is the original row index of the (i + nI)-th permuted row */
         col = perm[i + nI];
         k1 = A_offd_i[col];
         k2 = A_offd_i[col+1];
         for (j = k1; j < k2; j++)
         {
            S_offd_j[k3] = A_offd_j[j];
            S_offd_data[k3++] = A_offd_data[j];
         }
         S_offd_i[i+1+e] = k3;
      }

      /* give I, J, DATA to S_offd */
      hypre_CSRMatrixI(S_offd) = S_offd_i;
      hypre_CSRMatrixJ(S_offd) = S_offd_j;
      hypre_CSRMatrixData(S_offd) = S_offd_data;

      /* now we need to update S_offd_colmap */

      /* get total num of send */
      num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
      begin = hypre_ParCSRCommPkgSendMapStart(comm_pkg,0);
      end = hypre_ParCSRCommPkgSendMapStart(comm_pkg,num_sends);
      send_buf = hypre_TAlloc(HYPRE_BigInt, end - begin, HYPRE_MEMORY_HOST);
      /* copy new index into send_buf: permuted local index, shifted into
       * the numbering of S, plus this rank's first global column of S */
      for (i = begin; i < end; i++)
      {
         send_buf[i-begin] = rperm[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,i)] - nLU + col_starts[0];
      }
      /* main communication: exchange the new global indices; the received
       * values become the off-diagonal column map of S */
      comm_handle = hypre_ParCSRCommHandleCreate(21, comm_pkg, send_buf, S_offd_colmap);
      hypre_ParCSRCommHandleDestroy(comm_handle);

      /* setup index */
      hypre_ParCSRMatrixColMapOffd(matS) = S_offd_colmap;

      hypre_ILUSortOffdColmap(matS);

      /* free */
      hypre_TFree(send_buf, HYPRE_MEMORY_HOST);
   } /* end of forming S */

   /* create S finished */

   matL = hypre_ParCSRMatrixCreate( comm,
                                    hypre_ParCSRMatrixGlobalNumRows(A),
                                    hypre_ParCSRMatrixGlobalNumRows(A),
                                    hypre_ParCSRMatrixRowStarts(A),
                                    hypre_ParCSRMatrixColStarts(A),
                                    0,
                                    ctrL,
                                    0 );

   L_diag = hypre_ParCSRMatrixDiag(matL);
   hypre_CSRMatrixI(L_diag) = L_diag_i;
   if (ctrL)
   {
      hypre_CSRMatrixData(L_diag) = L_diag_data;
      hypre_CSRMatrixJ(L_diag) = L_diag_j;
   }
   else
   {
      /* we've allocated some memory, so free if not used */
      hypre_TFree(L_diag_j,HYPRE_MEMORY_DEVICE);
      hypre_TFree(L_diag_data,HYPRE_MEMORY_DEVICE);
   }
   /* store (global) total number of nonzeros */
   local_nnz = (HYPRE_Real) ctrL;
   hypre_MPI_Allreduce(&local_nnz, &total_nnz, 1, HYPRE_MPI_REAL, hypre_MPI_SUM, comm);
   hypre_ParCSRMatrixDNumNonzeros(matL) = total_nnz;

   matU = hypre_ParCSRMatrixCreate( comm,
                                    hypre_ParCSRMatrixGlobalNumRows(A),
                                    hypre_ParCSRMatrixGlobalNumRows(A),
                                    hypre_ParCSRMatrixRowStarts(A),
                                    hypre_ParCSRMatrixColStarts(A),
                                    0,
                                    ctrU,
                                    0 );

   U_diag = hypre_ParCSRMatrixDiag(matU);
   hypre_CSRMatrixI(U_diag) = U_diag_i;
   if (ctrU)
   {
      hypre_CSRMatrixData(U_diag) = U_diag_data;
      hypre_CSRMatrixJ(U_diag) = U_diag_j;
   }
   else
   {
      /* we've allocated some memory, so free if not used */
      hypre_TFree(U_diag_j,HYPRE_MEMORY_DEVICE);
      hypre_TFree(U_diag_data,HYPRE_MEMORY_DEVICE);
   }
   /* store (global) total number of nonzeros */
   local_nnz = (HYPRE_Real) ctrU;
   hypre_MPI_Allreduce(&local_nnz, &total_nnz, 1, HYPRE_MPI_REAL, hypre_MPI_SUM, comm);
   hypre_ParCSRMatrixDNumNonzeros(matU) = total_nnz;
   /* free memory (iw also backs iL and rperm) */
   hypre_TFree(wL,HYPRE_MEMORY_HOST);
   hypre_TFree(iw,HYPRE_MEMORY_HOST);
   if (!matS)
   {
      /* we allocate some memory for S, need to free if unused */
      hypre_TFree(S_diag_i,HYPRE_MEMORY_DEVICE);
   }

   /* release the identity permutations created above, if any */
   if (!permp)
   {
      hypre_TFree(perm, HYPRE_MEMORY_DEVICE);
   }
   if (!qpermp)
   {
      hypre_TFree(qperm, HYPRE_MEMORY_DEVICE);
   }

   /* set matrix pointers */
   *Lptr = matL;
   *Dptr = D_data;
   *Uptr = matU;
   *Sptr = matS;
   *u_end = u_end_array;

   return hypre_error_flag;
}
4202
/* ILU(k) symbolic factorization
 * n = total rows of input
 * lfil = level of fill-in, the k in ILU(k)
 * perm = permutation array indicating ordering of factorization. Perm could come from a
 *    CF_marker array or a reordering routine.
 * rperm = reverse permutation array, used here to avoid duplicate memory allocation
 * iw = working array, used here to avoid duplicate memory allocation
 * nLU = size of computed LDU factorization.
 * A/L/U/S_diag_i = the I slot of A, L, U and S
 * A/L/U/S_diag_j = the J slot of A, L, U and S
 * will form global Schur Matrix if nLU < n
 */
4214 HYPRE_Int
hypre_ILUSetupILUKSymbolic(HYPRE_Int n,HYPRE_Int * A_diag_i,HYPRE_Int * A_diag_j,HYPRE_Int lfil,HYPRE_Int * perm,HYPRE_Int * rperm,HYPRE_Int * iw,HYPRE_Int nLU,HYPRE_Int * L_diag_i,HYPRE_Int * U_diag_i,HYPRE_Int * S_diag_i,HYPRE_Int ** L_diag_j,HYPRE_Int ** U_diag_j,HYPRE_Int ** S_diag_j,HYPRE_Int ** u_end)4215 hypre_ILUSetupILUKSymbolic(HYPRE_Int n, HYPRE_Int *A_diag_i, HYPRE_Int *A_diag_j, HYPRE_Int lfil, HYPRE_Int *perm,
4216 HYPRE_Int *rperm, HYPRE_Int *iw, HYPRE_Int nLU, HYPRE_Int *L_diag_i, HYPRE_Int *U_diag_i,
4217 HYPRE_Int *S_diag_i, HYPRE_Int **L_diag_j, HYPRE_Int **U_diag_j, HYPRE_Int **S_diag_j, HYPRE_Int **u_end)
4218 {
4219 /*
4220 * 1: Setup and create buffers
4221 * A_diag_*: tempory pointer for the diagonal matrix of A and its '*' slot
4222 * ii: outer loop from 0 to nLU - 1
4223 * i: the real col number in diag inside the outer loop
4224 * iw: working array store the reverse of active col number
4225 * iL: working array store the active col number
4226 * iLev: working array store the active level of current row
4227 * lenl/u: current position in iw and so
4228 * ctrL/U/S: global position in J
4229 */
4230
4231 HYPRE_Int *temp_L_diag_j, *temp_U_diag_j, *temp_S_diag_j = NULL, *u_levels;
4232 HYPRE_Int *iL, *iLev;
4233 HYPRE_Int ii, i, j, k, ku, lena, lenl, lenu, lenh, ilev, lev, col, icol;
4234 HYPRE_Int m = n - nLU;
4235 HYPRE_Int *u_end_array;
4236
4237 /* memory management */
4238 HYPRE_Int ctrL;
4239 HYPRE_Int ctrU;
4240 HYPRE_Int ctrS;
4241 HYPRE_Int capacity_L;
4242 HYPRE_Int capacity_U;
4243 HYPRE_Int capacity_S;
4244 HYPRE_Int initial_alloc = 0;
4245 HYPRE_Int nnz_A;
4246
4247 /* set iL and iLev to right place in iw array */
4248 iL = iw + n;
4249 iLev = iw + 2*n;
4250
4251 /* setup initial memory used */
4252 nnz_A = A_diag_i[n];
4253 if (n > 0)
4254 {
4255 initial_alloc = nLU + ceil((nnz_A / 2.0) * nLU / n);
4256 }
4257 capacity_L = initial_alloc;
4258 capacity_U = initial_alloc;
4259
4260 /* allocate other memory for L and U struct */
4261 temp_L_diag_j = hypre_CTAlloc(HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
4262 temp_U_diag_j = hypre_CTAlloc(HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
4263
4264 if (m > 0)
4265 {
4266 capacity_S = m + ceil(nnz_A / 2.0 * m / n);
4267 temp_S_diag_j = hypre_CTAlloc(HYPRE_Int, capacity_S, HYPRE_MEMORY_DEVICE);
4268 }
4269
4270 u_end_array = hypre_TAlloc(HYPRE_Int, nLU, HYPRE_MEMORY_HOST);
4271 u_levels = hypre_CTAlloc(HYPRE_Int, capacity_U, HYPRE_MEMORY_HOST);
4272 ctrL = ctrU = ctrS = 0;
4273
4274 /* set initial value for working array */
4275 for (ii = 0 ; ii < n; ii++)
4276 {
4277 iw[ii] = -1;
4278 }
4279
4280 /*
4281 * 2: Start of main loop
4282 * those in iL are NEW col index (after permutation)
4283 */
4284 for (ii = 0; ii < nLU; ii++)
4285 {
4286 i = perm[ii];
4287 lenl = 0;
4288 lenh = 0;/* this is the current length of heap */
4289 lenu = ii;
4290 lena = A_diag_i[i+1];
4291 /* put those already inside original pattern, and set their level to 0 */
4292 for (j = A_diag_i[i]; j < lena; j++)
4293 {
4294 /* get the neworder of that col */
4295 col = rperm[A_diag_j[j]];
4296 if (col < ii)
4297 {
4298 /*
4299 * this is an entry in L
4300 * we maintain a heap structure for L part
4301 */
4302 iL[lenh] = col;
4303 iLev[lenh] = 0;
4304 iw[col] = lenh++;
4305 /*now miantian a heap structure*/
4306 hypre_ILUMinHeapAddIIIi(iL,iLev,iw,lenh);
4307 }
4308 else if (col > ii)
4309 {
4310 /* this is an entry in U */
4311 iL[lenu] = col;
4312 iLev[lenu] = 0;
4313 iw[col] = lenu++;
4314 }
4315 }/* end of j loop for adding pattern in original matrix */
4316
4317 /*
4318 * search lower part of current row and update pattern based on level
4319 */
4320 while (lenh > 0)
4321 {
4322 /*
4323 * k is now the new col index after permutation
4324 * the first element of the heap is the smallest
4325 */
4326 k = iL[0];
4327 ilev = iLev[0];
4328 /*
4329 * we now need to maintain the heap structure
4330 */
4331 hypre_ILUMinHeapRemoveIIIi(iL,iLev,iw,lenh);
4332 lenh--;
4333 /* copy to the end of array */
4334 lenl++;
4335 /* reset iw for that, not using anymore */
4336 iw[k]=-1;
4337 hypre_swap2i(iL,iLev,ii-lenl,lenh);
4338 /*
4339 * now the elimination on current row could start.
4340 * eliminate row k (new index) from current row
4341 */
4342 ku = U_diag_i[k+1];
4343 for (j = U_diag_i[k]; j < ku; j++)
4344 {
4345 col = temp_U_diag_j[j];
4346 lev = u_levels[j] + ilev + 1;
4347 /* ignore large level */
4348 icol = iw[col];
4349 /* skill large level */
4350 if (lev > lfil)
4351 {
4352 continue;
4353 }
4354 if (icol < 0)
4355 {
4356 /* not yet in */
4357 if (col < ii)
4358 {
4359 /*
4360 * if we add to the left L, we need to maintian the
4361 * heap structure
4362 */
4363 iL[lenh] = col;
4364 iLev[lenh] = lev;
4365 iw[col] = lenh++;
4366 /*swap it with the element right after the heap*/
4367
4368 /* maintain the heap */
4369 hypre_ILUMinHeapAddIIIi(iL,iLev,iw,lenh);
4370 }
4371 else if (col > ii)
4372 {
4373 iL[lenu] = col;
4374 iLev[lenu] = lev;
4375 iw[col] = lenu++;
4376 }
4377 }
4378 else
4379 {
4380 iLev[icol] = hypre_min(lev, iLev[icol]);
4381 }
4382 }/* end of loop j for level update */
4383 }/* end of while loop for iith row */
4384
4385 /* now update everything, indices, levels and so */
4386 L_diag_i[ii+1] = L_diag_i[ii] + lenl;
4387 if (lenl > 0)
4388 {
4389 /* check if memory is enough */
4390 while (ctrL + lenl > capacity_L)
4391 {
4392 HYPRE_Int tmp = capacity_L;
4393 capacity_L = capacity_L * EXPAND_FACT + 1;
4394 temp_L_diag_j = hypre_TReAlloc_v2(temp_L_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
4395 }
4396 /* now copy L data, reverse order */
4397 for (j = 0; j < lenl; j++)
4398 {
4399 temp_L_diag_j[ctrL+j] = iL[ii-j-1];
4400 }
4401 ctrL += lenl;
4402 }
4403 k = lenu - ii;
4404 U_diag_i[ii+1] = U_diag_i[ii] + k;
4405 if (k > 0)
4406 {
4407 /* check if memory is enough */
4408 while (ctrU + k > capacity_U)
4409 {
4410 HYPRE_Int tmp = capacity_U;
4411 capacity_U = capacity_U * EXPAND_FACT + 1;
4412 temp_U_diag_j = hypre_TReAlloc_v2(temp_U_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
4413 u_levels = hypre_TReAlloc_v2(u_levels, HYPRE_Int, tmp, HYPRE_Int, capacity_U, HYPRE_MEMORY_HOST);
4414 }
4415 //hypre_TMemcpy(temp_U_diag_j+ctrU,iL+ii,HYPRE_Int,k,HYPRE_MEMORY_DEVICE,HYPRE_MEMORY_HOST);
4416 hypre_TMemcpy(temp_U_diag_j+ctrU, iL+ii, HYPRE_Int, k,
4417 HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
4418 hypre_TMemcpy(u_levels+ctrU, iLev+ii, HYPRE_Int, k,
4419 HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
4420 ctrU += k;
4421 }
4422 if (m > 0)
4423 {
4424 hypre_qsort2i(temp_U_diag_j,u_levels,U_diag_i[ii],U_diag_i[ii+1]-1);
4425 hypre_BinarySearch2(temp_U_diag_j,nLU,U_diag_i[ii],U_diag_i[ii+1]-1,u_end_array + ii);
4426 }
4427 else
4428 {
4429 /* Everything is in U */
4430 u_end_array[ii] = ctrU;
4431 }
4432
4433 /* reset iw */
4434 for(j = ii; j < lenu; j++)
4435 {
4436 iw[iL[j]] = -1;
4437 }
4438
4439 }/* end of main loop ii from 0 to nLU-1 */
4440
4441 /* another loop to set EU^-1 and Schur complement */
4442 for (ii = nLU; ii < n; ii++)
4443 {
4444 i = perm[ii];
4445 lenl = 0;
4446 lenh = 0;/* this is the current length of heap */
4447 lenu = nLU;/* now this stores S, start from nLU */
4448 lena = A_diag_i[i+1];
4449 /* put those already inside original pattern, and set their level to 0 */
4450 for (j = A_diag_i[i]; j < lena; j++)
4451 {
4452 /* get the neworder of that col */
4453 col = rperm[A_diag_j[j]];
4454 if (col < nLU)
4455 {
4456 /*
4457 * this is an entry in L
4458 * we maintain a heap structure for L part
4459 */
4460 iL[lenh] = col;
4461 iLev[lenh] = 0;
4462 iw[col] = lenh++;
4463 /*now miantian a heap structure*/
4464 hypre_ILUMinHeapAddIIIi(iL,iLev,iw,lenh);
4465 }
4466 else if (col != ii) /* we for sure to add ii, avoid duplicate */
4467 {
4468 /* this is an entry in S */
4469 iL[lenu] = col;
4470 iLev[lenu] = 0;
4471 iw[col] = lenu++;
4472 }
4473 }/* end of j loop for adding pattern in original matrix */
4474
4475 /*
4476 * search lower part of current row and update pattern based on level
4477 */
4478 while (lenh > 0)
4479 {
4480 /*
4481 * k is now the new col index after permutation
4482 * the first element of the heap is the smallest
4483 */
4484 k = iL[0];
4485 ilev = iLev[0];
4486 /*
4487 * we now need to maintain the heap structure
4488 */
4489 hypre_ILUMinHeapRemoveIIIi(iL,iLev,iw,lenh);
4490 lenh--;
4491 /* copy to the end of array */
4492 lenl++;
4493 /* reset iw for that, not using anymore */
4494 iw[k]=-1;
4495 hypre_swap2i(iL,iLev,nLU-lenl,lenh);
4496 /*
4497 * now the elimination on current row could start.
4498 * eliminate row k (new index) from current row
4499 */
4500 ku = U_diag_i[k+1];
4501 for (j = U_diag_i[k]; j < ku; j++)
4502 {
4503 col = temp_U_diag_j[j];
4504 lev = u_levels[j] + ilev + 1;
4505 /* ignore large level */
4506 icol = iw[col];
4507 /* skill large level */
4508 if (lev > lfil)
4509 {
4510 continue;
4511 }
4512 if (icol < 0)
4513 {
4514 /* not yet in */
4515 if (col < nLU)
4516 {
4517 /*
4518 * if we add to the left L, we need to maintian the
4519 * heap structure
4520 */
4521 iL[lenh] = col;
4522 iLev[lenh] = lev;
4523 iw[col] = lenh++;
4524 /*swap it with the element right after the heap*/
4525
4526 /* maintain the heap */
4527 hypre_ILUMinHeapAddIIIi(iL,iLev,iw,lenh);
4528 }
4529 else if (col != ii)
4530 {
4531 /* S part */
4532 iL[lenu] = col;
4533 iLev[lenu] = lev;
4534 iw[col] = lenu++;
4535 }
4536 }
4537 else
4538 {
4539 iLev[icol] = hypre_min(lev, iLev[icol]);
4540 }
4541 }/* end of loop j for level update */
4542 }/* end of while loop for iith row */
4543
4544 /* now update everything, indices, levels and so */
4545 L_diag_i[ii+1] = L_diag_i[ii] + lenl;
4546 if (lenl > 0)
4547 {
4548 /* check if memory is enough */
4549 while (ctrL + lenl > capacity_L)
4550 {
4551 HYPRE_Int tmp = capacity_L;
4552 capacity_L = capacity_L * EXPAND_FACT + 1;
4553 temp_L_diag_j = hypre_TReAlloc_v2(temp_L_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
4554 }
4555 /* now copy L data, reverse order */
4556 for (j = 0; j < lenl; j ++)
4557 {
4558 temp_L_diag_j[ctrL+j] = iL[nLU-j-1];
4559 }
4560 ctrL += lenl;
4561 }
4562 k = lenu - nLU + 1;
4563 /* check if memory is enough */
4564 while (ctrS + k > capacity_S)
4565 {
4566 HYPRE_Int tmp = capacity_S;
4567 capacity_S = capacity_S * EXPAND_FACT + 1;
4568 temp_S_diag_j = hypre_TReAlloc_v2(temp_S_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_S, HYPRE_MEMORY_DEVICE);
4569 }
4570 temp_S_diag_j[ctrS] = ii;/* must have diagonal */
4571 //hypre_TMemcpy(temp_S_diag_j+ctrS+1,iL+nLU,HYPRE_Int,k-1,HYPRE_MEMORY_DEVICE,HYPRE_MEMORY_HOST);
4572 hypre_TMemcpy(temp_S_diag_j+ctrS+1, iL+nLU, HYPRE_Int, k-1,
4573 HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
4574 ctrS += k;
4575 S_diag_i[ii-nLU+1] = ctrS;
4576
4577 /* reset iw */
4578 for (j = nLU; j < lenu; j++)
4579 {
4580 iw[iL[j]] = -1;
4581 }
4582
4583 }/* end of main loop ii from nLU to n-1 */
4584
4585 /*
4586 * 3: Update the struct for L, U and S
4587 */
4588 for (k = nLU; k < n; k++)
4589 {
4590 U_diag_i[k+1] = U_diag_i[nLU];
4591 }
4592 /*
4593 * 4: Finishing up and free memory
4594 */
4595 hypre_TFree(u_levels,HYPRE_MEMORY_HOST);
4596
4597 *L_diag_j = temp_L_diag_j;
4598 *U_diag_j = temp_U_diag_j;
4599 *S_diag_j = temp_S_diag_j;
4600 *u_end = u_end_array;
4601
4602 return hypre_error_flag;
4603 }
4604
4605 /* ILU(k)
4606 * A: input matrix
4607 * lfil: level of fill-in, the k in ILU(k)
4608 * permp: permutation array indicating ordering of factorization. Perm could come from a
4609 * CF_marker: array or a reordering routine.
4610 * qpermp: column permutation array.
4611 * nLU: size of computed LDU factorization.
4612 * nI: number of interial unknowns, nI should obey nI >= nLU
4613 * Lptr, Dptr, Uptr: L, D, U factors.
4614 * Sprt: Schur Complement, if no Schur Complement is needed it will be set to NULL
4615 */
4616 HYPRE_Int
hypre_ILUSetupILUK(hypre_ParCSRMatrix * A,HYPRE_Int lfil,HYPRE_Int * permp,HYPRE_Int * qpermp,HYPRE_Int nLU,HYPRE_Int nI,hypre_ParCSRMatrix ** Lptr,HYPRE_Real ** Dptr,hypre_ParCSRMatrix ** Uptr,hypre_ParCSRMatrix ** Sptr,HYPRE_Int ** u_end)4617 hypre_ILUSetupILUK(hypre_ParCSRMatrix *A, HYPRE_Int lfil, HYPRE_Int *permp, HYPRE_Int *qpermp, HYPRE_Int nLU, HYPRE_Int nI,
4618 hypre_ParCSRMatrix **Lptr, HYPRE_Real** Dptr, hypre_ParCSRMatrix **Uptr, hypre_ParCSRMatrix **Sptr, HYPRE_Int **u_end)
4619 {
4620 /*
4621 * 1: Setup and create buffers
4622 * matL/U: the ParCSR matrix for L and U
4623 * L/U_diag: the diagonal csr matrix of matL/U
4624 * A_diag_*: tempory pointer for the diagonal matrix of A and its '*' slot
4625 * ii = outer loop from 0 to nLU - 1
4626 * i = the real col number in diag inside the outer loop
4627 * iw = working array store the reverse of active col number
4628 * iL = working array store the active col number
4629 */
4630
4631 /* call ILU0 if lfil is 0 */
4632 if (lfil == 0)
4633 {
4634 return hypre_ILUSetupILU0( A, permp, qpermp, nLU, nI, Lptr, Dptr, Uptr, Sptr, u_end);
4635 }
4636 HYPRE_Real local_nnz, total_nnz;
4637 HYPRE_Int i, ii, j, k, k1, k2, k3, kl, ku, jpiv, col, icol;
4638 HYPRE_Int *iw;
4639 MPI_Comm comm = hypre_ParCSRMatrixComm(A);
4640 HYPRE_Int num_procs, my_id;
4641
4642 /* data objects for A */
4643 hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
4644 hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A);
4645 HYPRE_Real *A_diag_data = hypre_CSRMatrixData(A_diag);
4646 HYPRE_Int *A_diag_i = hypre_CSRMatrixI(A_diag);
4647 HYPRE_Int *A_diag_j = hypre_CSRMatrixJ(A_diag);
4648 HYPRE_Real *A_offd_data = hypre_CSRMatrixData(A_offd);
4649 HYPRE_Int *A_offd_i = hypre_CSRMatrixI(A_offd);
4650 HYPRE_Int *A_offd_j = hypre_CSRMatrixJ(A_offd);
4651
4652 /* data objects for L, D, U */
4653 hypre_ParCSRMatrix *matL;
4654 hypre_ParCSRMatrix *matU;
4655 hypre_CSRMatrix *L_diag;
4656 hypre_CSRMatrix *U_diag;
4657 HYPRE_Real *D_data;
4658 HYPRE_Real *L_diag_data = NULL;
4659 HYPRE_Int *L_diag_i;
4660 HYPRE_Int *L_diag_j = NULL;
4661 HYPRE_Real *U_diag_data = NULL;
4662 HYPRE_Int *U_diag_i;
4663 HYPRE_Int *U_diag_j = NULL;
4664
4665 /* data objects for S */
4666 hypre_ParCSRMatrix *matS = NULL;
4667 hypre_CSRMatrix *S_diag;
4668 hypre_CSRMatrix *S_offd;
4669 HYPRE_Real *S_diag_data = NULL;
4670 HYPRE_Int *S_diag_i = NULL;
4671 HYPRE_Int *S_diag_j = NULL;
4672 HYPRE_Int *S_offd_i = NULL;
4673 HYPRE_Int *S_offd_j = NULL;
4674 HYPRE_BigInt *S_offd_colmap = NULL;
4675 HYPRE_Real *S_offd_data;
4676 HYPRE_Int S_offd_nnz, S_offd_ncols;
4677 HYPRE_BigInt col_starts[2];
4678 HYPRE_BigInt total_rows;
4679
4680 /* communication */
4681 hypre_ParCSRCommPkg *comm_pkg;
4682 hypre_ParCSRCommHandle *comm_handle;
4683 HYPRE_BigInt *send_buf = NULL;
4684
4685 /* problem size */
4686 HYPRE_Int n;
4687 HYPRE_Int m;
4688 HYPRE_Int e;
4689 HYPRE_Int m_e;
4690
4691 /* reverse permutation array */
4692 HYPRE_Int *rperm;
4693 HYPRE_Int *perm, *qperm;
4694
4695 /* start setup */
4696 /* check input and get problem size */
4697 n = hypre_CSRMatrixNumRows(A_diag);
4698 if (nLU < 0 || nLU > n)
4699 {
4700 hypre_error_w_msg(HYPRE_ERROR_ARG,"WARNING: nLU out of range.\n");
4701 }
4702 m = n - nLU;
4703 e = nI - nLU;
4704 m_e = n - nI;
4705 if (e < 0)
4706 {
4707 hypre_error_w_msg(HYPRE_ERROR_ARG,"WARNING: nLU should not exceed nI.\n");
4708 }
4709
4710 /* Init I array anyway. S's might be freed later */
4711 D_data = hypre_CTAlloc(HYPRE_Real, n, HYPRE_MEMORY_DEVICE);
4712 L_diag_i = hypre_CTAlloc(HYPRE_Int, (n+1), HYPRE_MEMORY_DEVICE);
4713 U_diag_i = hypre_CTAlloc(HYPRE_Int, (n+1), HYPRE_MEMORY_DEVICE);
4714 S_diag_i = hypre_CTAlloc(HYPRE_Int, (m+1), HYPRE_MEMORY_DEVICE);
4715
4716 /* set Comm_Pkg if not yet built */
4717 hypre_MPI_Comm_size(comm,&num_procs);
4718 hypre_MPI_Comm_rank(comm,&my_id);
4719 comm_pkg = hypre_ParCSRMatrixCommPkg(A);
4720 if (!comm_pkg)
4721 {
4722 hypre_MatvecCommPkgCreate(A);
4723 comm_pkg = hypre_ParCSRMatrixCommPkg(A);
4724 }
4725
4726 /*
4727 * 2: Symbolic factorization
4728 * setup iw and rperm first
4729 */
4730 /* allocate work arrays */
4731 iw = hypre_CTAlloc(HYPRE_Int, 4*n, HYPRE_MEMORY_HOST);
4732 rperm = iw + 3*n;
4733 L_diag_i[0] = U_diag_i[0] = S_diag_i[0] = 0;
4734 /* get reverse permutation (rperm).
4735 * rperm holds the reordered indexes.
4736 */
4737
4738 if (!permp)
4739 {
4740 perm = hypre_TAlloc(HYPRE_Int, n, HYPRE_MEMORY_DEVICE);
4741 for (i = 0; i < n; i++)
4742 {
4743 perm[i] = i;
4744 }
4745 }
4746 else
4747 {
4748 perm = permp;
4749 }
4750
4751 if (!qpermp)
4752 {
4753 qperm = hypre_TAlloc(HYPRE_Int, n, HYPRE_MEMORY_DEVICE);
4754 for (i = 0; i < n; i++)
4755 {
4756 qperm[i] = i;
4757 }
4758 }
4759 else
4760 {
4761 qperm = qpermp;
4762 }
4763
4764 for (i = 0; i < n; i++)
4765 {
4766 rperm[qperm[i]] = i;
4767 }
4768
4769 /* do symbolic factorization */
4770 hypre_ILUSetupILUKSymbolic(n, A_diag_i, A_diag_j, lfil, perm, rperm, iw,
4771 nLU, L_diag_i, U_diag_i, S_diag_i, &L_diag_j, &U_diag_j, &S_diag_j, u_end);
4772
4773 /*
4774 * after this, we have our I,J for L, U and S ready, and L sorted
4775 * iw are still -1 after symbolic factorization
4776 * now setup helper array here
4777 */
4778 if (L_diag_i[n])
4779 {
4780 L_diag_data = hypre_CTAlloc(HYPRE_Real, L_diag_i[n], HYPRE_MEMORY_DEVICE);
4781 }
4782 if (U_diag_i[n])
4783 {
4784 U_diag_data = hypre_CTAlloc(HYPRE_Real, U_diag_i[n], HYPRE_MEMORY_DEVICE);
4785 }
4786 if (S_diag_i[m])
4787 {
4788 S_diag_data = hypre_CTAlloc(HYPRE_Real, S_diag_i[m], HYPRE_MEMORY_DEVICE);
4789 }
4790
4791 /*
4792 * 3: Begin real factorization
4793 * we already have L and U structure ready, so no extra working array needed
4794 */
4795 /* first loop for upper part */
4796 for (ii = 0; ii < nLU; ii++)
4797 {
4798 // get row i
4799 i = perm[ii];
4800 kl = L_diag_i[ii+1];
4801 ku = U_diag_i[ii+1];
4802 k1 = A_diag_i[i];
4803 k2 = A_diag_i[i+1];
4804 /* set up working arrays */
4805 for (j = L_diag_i[ii]; j < kl; j++)
4806 {
4807 col = L_diag_j[j];
4808 iw[col] = j;
4809 }
4810 D_data[ii] = 0.0;
4811 iw[ii] = ii;
4812 for (j = U_diag_i[ii]; j < ku; j++)
4813 {
4814 col = U_diag_j[j];
4815 iw[col] = j;
4816 }
4817 /* copy data from A into L, D and U */
4818 for (j = k1; j < k2; j++)
4819 {
4820 /* compute everything in new index */
4821 col = rperm[A_diag_j[j]];
4822 icol = iw[col];
4823 /* A for sure to be inside the pattern */
4824 if (col < ii)
4825 {
4826 L_diag_data[icol] = A_diag_data[j];
4827 }
4828 else if (col == ii)
4829 {
4830 D_data[ii] = A_diag_data[j];
4831 }
4832 else
4833 {
4834 U_diag_data[icol] = A_diag_data[j];
4835 }
4836 }
4837 /* elimination */
4838 for (j = L_diag_i[ii]; j < kl; j++)
4839 {
4840 jpiv = L_diag_j[j];
4841 L_diag_data[j] *= D_data[jpiv];
4842 ku = U_diag_i[jpiv+1];
4843
4844 for (k = U_diag_i[jpiv]; k < ku; k++)
4845 {
4846 col = U_diag_j[k];
4847 icol = iw[col];
4848 if (icol < 0)
4849 {
4850 /* not in partern */
4851 continue;
4852 }
4853 if (col < ii)
4854 {
4855 /* L part */
4856 L_diag_data[icol] -= L_diag_data[j]*U_diag_data[k];
4857 }
4858 else if (col == ii)
4859 {
4860 /* diag part */
4861 D_data[icol] -= L_diag_data[j]*U_diag_data[k];
4862 }
4863 else
4864 {
4865 /* U part */
4866 U_diag_data[icol] -= L_diag_data[j]*U_diag_data[k];
4867 }
4868 }
4869 }
4870 /* reset working array */
4871 ku = U_diag_i[ii+1];
4872 for (j = L_diag_i[ii]; j < kl; j++)
4873 {
4874 col = L_diag_j[j];
4875 iw[col] = -1;
4876 }
4877 iw[ii] = -1;
4878 for (j = U_diag_i[ii]; j < ku ; j++)
4879 {
4880 col = U_diag_j[j];
4881 iw[col] = -1;
4882 }
4883
4884 /* diagonal part (we store the inverse) */
4885 if (fabs(D_data[ii]) < MAT_TOL)
4886 {
4887 D_data[ii] = 1e-06;
4888 }
4889 D_data[ii] = 1./ D_data[ii];
4890 }
4891
4892 /* Now lower part for Schur complement */
4893 for (ii = nLU; ii < n; ii++)
4894 {
4895 // get row i
4896 i = perm[ii];
4897 kl = L_diag_i[ii+1];
4898 ku = S_diag_i[ii - nLU +1];
4899 k1 = A_diag_i[i];
4900 k2 = A_diag_i[i+1];
4901 /* set up working arrays */
4902 for (j = L_diag_i[ii]; j < kl; j++)
4903 {
4904 col = L_diag_j[j];
4905 iw[col] = j;
4906 }
4907 for (j = S_diag_i[ii - nLU]; j < ku; j++)
4908 {
4909 col = S_diag_j[j];
4910 iw[col] = j;
4911 }
4912 /* copy data from A into L, and S */
4913 for (j = k1; j < k2; j++)
4914 {
4915 /* compute everything in new index */
4916 col = rperm[A_diag_j[j]];
4917 icol = iw[col];
4918 /* A for sure to be inside the pattern */
4919 if (col < nLU)
4920 {
4921 L_diag_data[icol] = A_diag_data[j];
4922 }
4923 else
4924 {
4925 S_diag_data[icol] = A_diag_data[j];
4926 }
4927 }
4928 /* elimination */
4929 for (j = L_diag_i[ii]; j < kl; j++)
4930 {
4931 jpiv = L_diag_j[j];
4932 L_diag_data[j] *= D_data[jpiv];
4933 ku = U_diag_i[jpiv+1];
4934 for (k = U_diag_i[jpiv]; k < ku; k++)
4935 {
4936 col = U_diag_j[k];
4937 icol = iw[col];
4938 if (icol < 0)
4939 {
4940 /* not in partern */
4941 continue;
4942 }
4943 if (col < nLU)
4944 {
4945 /* L part */
4946 L_diag_data[icol] -= L_diag_data[j]*U_diag_data[k];
4947 }
4948 else
4949 {
4950 /* S part */
4951 S_diag_data[icol] -= L_diag_data[j]*U_diag_data[k];
4952 }
4953 }
4954 }
4955 /* reset working array */
4956 for (j = L_diag_i[ii]; j < kl ; j++)
4957 {
4958 col = L_diag_j[j];
4959 iw[col] = -1;
4960 }
4961 ku = S_diag_i[ii-nLU+1];
4962 for (j = S_diag_i[ii-nLU]; j < ku; j++)
4963 {
4964 col = S_diag_j[j];
4965 iw[col] = -1;
4966 /* remember to update index, S is smaller! */
4967 S_diag_j[j]-=nLU;
4968 }
4969 }
4970
4971 /*
4972 * 4: Finishing up and free
4973 */
4974
4975 /* First create Schur complement if necessary
4976 * Check if we need to create Schur complement
4977 */
4978 HYPRE_BigInt big_m = (HYPRE_BigInt)m;
4979 hypre_MPI_Allreduce(&big_m, &total_rows, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
4980 /* only form when total_rows > 0 */
4981 if ( total_rows > 0 )
4982 {
4983 /* now create S */
4984 /* need to get new column start */
4985 {
4986 HYPRE_BigInt global_start;
4987 hypre_MPI_Scan( &big_m, &global_start, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
4988 col_starts[0] = global_start - m;
4989 col_starts[1] = global_start;
4990 }
4991
4992 /* We did nothing to A_offd, so all the data kept, just reorder them
4993 * The create function takes comm, global num rows/cols,
4994 * row/col start, num cols offd, nnz diag, nnz offd
4995 */
4996 S_offd_nnz = hypre_CSRMatrixNumNonzeros(A_offd);
4997 S_offd_ncols = hypre_CSRMatrixNumCols(A_offd);
4998
4999 matS = hypre_ParCSRMatrixCreate( comm,
5000 total_rows,
5001 total_rows,
5002 col_starts,
5003 col_starts,
5004 S_offd_ncols,
5005 S_diag_i[m],
5006 S_offd_nnz);
5007
5008 /* first put diagonal data in */
5009 S_diag = hypre_ParCSRMatrixDiag(matS);
5010
5011 hypre_CSRMatrixI(S_diag) = S_diag_i;
5012 hypre_CSRMatrixData(S_diag) = S_diag_data;
5013 hypre_CSRMatrixJ(S_diag) = S_diag_j;
5014
5015 /* now start to construct offdiag of S */
5016 S_offd = hypre_ParCSRMatrixOffd(matS);
5017 S_offd_i = hypre_TAlloc(HYPRE_Int, m+1, HYPRE_MEMORY_DEVICE);
5018 S_offd_j = hypre_TAlloc(HYPRE_Int, S_offd_nnz, HYPRE_MEMORY_DEVICE);
5019 S_offd_data = hypre_TAlloc(HYPRE_Real, S_offd_nnz, HYPRE_MEMORY_DEVICE);
5020 S_offd_colmap = hypre_CTAlloc(HYPRE_BigInt, S_offd_ncols, HYPRE_MEMORY_HOST);
5021
5022 /* simply use a loop to copy data from A_offd */
5023 S_offd_i[0] = 0;
5024 k3 = 0;
5025 for (i = 1; i <= e; i++)
5026 {
5027 S_offd_i[i+1] = k3;
5028 }
5029 for (i = 0; i < m_e; i++)
5030 {
5031 col = perm[i + nI];
5032 k1 = A_offd_i[col];
5033 k2 = A_offd_i[col+1];
5034 for (j = k1; j < k2; j++)
5035 {
5036 S_offd_j[k3] = A_offd_j[j];
5037 S_offd_data[k3++] = A_offd_data[j];
5038 }
5039 S_offd_i[i+e+1] = k3;
5040 }
5041
5042 /* give I, J, DATA to S_offd */
5043 hypre_CSRMatrixI(S_offd) = S_offd_i;
5044 hypre_CSRMatrixJ(S_offd) = S_offd_j;
5045 hypre_CSRMatrixData(S_offd) = S_offd_data;
5046
5047 /* now we need to update S_offd_colmap */
5048
5049 /* get total num of send */
5050 HYPRE_Int num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
5051 HYPRE_Int begin = hypre_ParCSRCommPkgSendMapStart(comm_pkg,0);
5052 HYPRE_Int end = hypre_ParCSRCommPkgSendMapStart(comm_pkg,num_sends);
5053 send_buf = hypre_TAlloc(HYPRE_BigInt, end - begin, HYPRE_MEMORY_HOST);
5054 /* copy new index into send_buf */
5055 for (i = begin; i < end; i++)
5056 {
5057 send_buf[i-begin] = rperm[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,i)] - nLU + col_starts[0];
5058 }
5059
5060 /* main communication */
5061 comm_handle = hypre_ParCSRCommHandleCreate(21, comm_pkg, send_buf, S_offd_colmap);
5062 hypre_ParCSRCommHandleDestroy(comm_handle);
5063
5064 /* setup index */
5065 hypre_ParCSRMatrixColMapOffd(matS) = S_offd_colmap;
5066
5067 hypre_ILUSortOffdColmap(matS);
5068
5069 /* free */
5070 hypre_TFree(send_buf, HYPRE_MEMORY_HOST);
5071 } /* end of forming S */
5072
5073 /* Assemble LDU matrices */
5074 /* zero out unfactored rows */
5075 for (k = nLU; k < n; k++)
5076 {
5077 D_data[k] = 1.;
5078 }
5079
5080 matL = hypre_ParCSRMatrixCreate( comm,
5081 hypre_ParCSRMatrixGlobalNumRows(A),
5082 hypre_ParCSRMatrixGlobalNumRows(A),
5083 hypre_ParCSRMatrixRowStarts(A),
5084 hypre_ParCSRMatrixColStarts(A),
5085 0 /* num_cols_offd */,
5086 L_diag_i[n],
5087 0 /* num_nonzeros_offd */);
5088
5089 L_diag = hypre_ParCSRMatrixDiag(matL);
5090 hypre_CSRMatrixI(L_diag) = L_diag_i;
5091 if (L_diag_i[n]>0)
5092 {
5093 hypre_CSRMatrixData(L_diag) = L_diag_data;
5094 hypre_CSRMatrixJ(L_diag) = L_diag_j;
5095 }
5096 else
5097 {
5098 /* we allocated some initial length, so free them */
5099 hypre_TFree(L_diag_j, HYPRE_MEMORY_DEVICE);
5100 }
5101 /* store (global) total number of nonzeros */
5102 local_nnz = (HYPRE_Real) (L_diag_i[n]);
5103 hypre_MPI_Allreduce(&local_nnz, &total_nnz, 1, HYPRE_MPI_REAL, hypre_MPI_SUM, comm);
5104 hypre_ParCSRMatrixDNumNonzeros(matL) = total_nnz;
5105
5106 matU = hypre_ParCSRMatrixCreate( comm,
5107 hypre_ParCSRMatrixGlobalNumRows(A),
5108 hypre_ParCSRMatrixGlobalNumRows(A),
5109 hypre_ParCSRMatrixRowStarts(A),
5110 hypre_ParCSRMatrixColStarts(A),
5111 0,
5112 U_diag_i[n],
5113 0 );
5114
5115 U_diag = hypre_ParCSRMatrixDiag(matU);
5116 hypre_CSRMatrixI(U_diag) = U_diag_i;
5117 if (U_diag_i[n]>0)
5118 {
5119 hypre_CSRMatrixData(U_diag) = U_diag_data;
5120 hypre_CSRMatrixJ(U_diag) = U_diag_j;
5121 }
5122 else
5123 {
5124 /* we allocated some initial length, so free them */
5125 hypre_TFree(U_diag_j, HYPRE_MEMORY_DEVICE);
5126 }
5127 /* store (global) total number of nonzeros */
5128 local_nnz = (HYPRE_Real) (U_diag_i[n]);
5129 hypre_MPI_Allreduce(&local_nnz, &total_nnz, 1, HYPRE_MPI_REAL, hypre_MPI_SUM, comm);
5130 hypre_ParCSRMatrixDNumNonzeros(matU) = total_nnz;
5131
5132 /* free */
5133 hypre_TFree(iw,HYPRE_MEMORY_HOST);
5134 if (!matS)
5135 {
5136 /* we allocate some memory for S, need to free if unused */
5137 hypre_TFree(S_diag_i,HYPRE_MEMORY_DEVICE);
5138 }
5139
5140 if (!permp)
5141 {
5142 hypre_TFree(perm, HYPRE_MEMORY_DEVICE);
5143 }
5144
5145 if (!qpermp)
5146 {
5147 hypre_TFree(qperm, HYPRE_MEMORY_DEVICE);
5148 }
5149
5150 /* set matrix pointers */
5151 *Lptr = matL;
5152 *Dptr = D_data;
5153 *Uptr = matU;
5154 *Sptr = matS;
5155
5156 return hypre_error_flag;
5157 }
5158
/* ILUT
 * A: input matrix
 * lfil: maximum nnz per row in L and U
 * tol: droptol array in ILUT
 *    tol[0]: matrix B
 *    tol[1]: matrix E and F
 *    tol[2]: matrix S
 * perm: permutation array indicating ordering of factorization. Perm could come from a
 *    CF_marker array or a reordering routine.
 * qperm: permutation array for column
 * nLU: size of computed LDU factorization. If nLU < n, Schur complement will be formed
 * nI: number of interior unknowns. nLU should obey nLU <= nI.
 * Lptr, Dptr, Uptr: L, D, U factors.
 * Sptr: Schur complement
 *
 * Keep the largest lfil entries that are greater than some tol relative
 * to the input tol and the norm of that row in both L and U
 */
5177 HYPRE_Int
hypre_ILUSetupILUT(hypre_ParCSRMatrix * A,HYPRE_Int lfil,HYPRE_Real * tol,HYPRE_Int * permp,HYPRE_Int * qpermp,HYPRE_Int nLU,HYPRE_Int nI,hypre_ParCSRMatrix ** Lptr,HYPRE_Real ** Dptr,hypre_ParCSRMatrix ** Uptr,hypre_ParCSRMatrix ** Sptr,HYPRE_Int ** u_end)5178 hypre_ILUSetupILUT(hypre_ParCSRMatrix *A, HYPRE_Int lfil, HYPRE_Real *tol,
5179 HYPRE_Int *permp, HYPRE_Int *qpermp, HYPRE_Int nLU, HYPRE_Int nI, hypre_ParCSRMatrix **Lptr,
5180 HYPRE_Real** Dptr, hypre_ParCSRMatrix **Uptr, hypre_ParCSRMatrix **Sptr, HYPRE_Int **u_end)
5181 {
5182 /*
5183 * 1: Setup and create buffers
5184 * matL/U: the ParCSR matrix for L and U
5185 * L/U_diag: the diagonal csr matrix of matL/U
5186 * A_diag_*: tempory pointer for the diagonal matrix of A and its '*' slot
5187 * ii = outer loop from 0 to nLU - 1
5188 * i = the real col number in diag inside the outer loop
5189 * iw = working array store the reverse of active col number
5190 * iL = working array store the active col number
5191 */
5192 HYPRE_Real local_nnz, total_nnz;
5193 HYPRE_Int i, ii, j, k, k1, k2, k3, kl, ku, col, icol, lenl, lenu, lenhu, lenhlr, lenhll, jpos, jrow;
5194 HYPRE_Real inorm, itolb, itolef, itols, dpiv, lxu;
5195 HYPRE_Int *iw,*iL;
5196 HYPRE_Real *w;
5197
5198 /* memory management */
5199 HYPRE_Int ctrL;
5200 HYPRE_Int ctrU;
5201 HYPRE_Int initial_alloc = 0;
5202 HYPRE_Int capacity_L;
5203 HYPRE_Int capacity_U;
5204 HYPRE_Int ctrS;
5205 HYPRE_Int capacity_S;
5206 HYPRE_Int nnz_A;
5207
5208 /* communication stuffs for S */
5209 MPI_Comm comm = hypre_ParCSRMatrixComm(A);
5210 HYPRE_Int S_offd_nnz, S_offd_ncols;
5211 hypre_ParCSRCommPkg *comm_pkg;
5212 hypre_ParCSRCommHandle *comm_handle;
5213 HYPRE_Int num_procs, my_id;
5214 HYPRE_BigInt col_starts[2];
5215 HYPRE_BigInt total_rows;
5216 HYPRE_Int num_sends;
5217 HYPRE_Int begin, end;
5218
5219 /* data objects for A */
5220 hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
5221 hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A);
5222 HYPRE_Real *A_diag_data = hypre_CSRMatrixData(A_diag);
5223 HYPRE_Int *A_diag_i = hypre_CSRMatrixI(A_diag);
5224 HYPRE_Int *A_diag_j = hypre_CSRMatrixJ(A_diag);
5225 HYPRE_Int *A_offd_i = hypre_CSRMatrixI(A_offd);
5226 HYPRE_Int *A_offd_j = hypre_CSRMatrixJ(A_offd);
5227 HYPRE_Real *A_offd_data = hypre_CSRMatrixData(A_offd);
5228
5229 /* data objects for L, D, U */
5230 hypre_ParCSRMatrix *matL;
5231 hypre_ParCSRMatrix *matU;
5232 hypre_CSRMatrix *L_diag;
5233 hypre_CSRMatrix *U_diag;
5234 HYPRE_Real *D_data;
5235 HYPRE_Real *L_diag_data = NULL;
5236 HYPRE_Int *L_diag_i;
5237 HYPRE_Int *L_diag_j = NULL;
5238 HYPRE_Real *U_diag_data = NULL;
5239 HYPRE_Int *U_diag_i;
5240 HYPRE_Int *U_diag_j = NULL;
5241
5242 /* data objects for S */
5243 hypre_ParCSRMatrix *matS = NULL;
5244 hypre_CSRMatrix *S_diag;
5245 hypre_CSRMatrix *S_offd;
5246 HYPRE_Real *S_diag_data = NULL;
5247 HYPRE_Int *S_diag_i = NULL;
5248 HYPRE_Int *S_diag_j = NULL;
5249 HYPRE_Int *S_offd_i = NULL;
5250 HYPRE_Int *S_offd_j = NULL;
5251 HYPRE_BigInt *S_offd_colmap = NULL;
5252 HYPRE_Real *S_offd_data;
5253 HYPRE_BigInt *send_buf = NULL;
5254 HYPRE_Int *u_end_array;
5255
5256 /* reverse permutation */
5257 HYPRE_Int *rperm;
5258 HYPRE_Int *perm, *qperm;
5259
5260 /* problem size
5261 * m is n - nLU, num of rows of local Schur system
5262 * m_e is the size of interface nodes
5263 * e is the number of interial rows in local Schur Complement
5264 */
5265 HYPRE_Int n;
5266 HYPRE_Int m;
5267 HYPRE_Int e;
5268 HYPRE_Int m_e;
5269
5270 /* start setup
5271 * check input first
5272 */
5273 n = hypre_CSRMatrixNumRows(A_diag);
5274 if (nLU < 0 || nLU > n)
5275 {
5276 hypre_error_w_msg(HYPRE_ERROR_ARG,"WARNING: nLU out of range.\n");
5277 }
5278 m = n - nLU;
5279 e = nI - nLU;
5280 m_e = n - nI;
5281 if (e < 0)
5282 {
5283 hypre_error_w_msg(HYPRE_ERROR_ARG,"WARNING: nLU should not exceed nI.\n");
5284 }
5285
5286 u_end_array = hypre_TAlloc(HYPRE_Int, nLU, HYPRE_MEMORY_HOST);
5287
5288 /* start set up
5289 * setup communication stuffs first
5290 */
5291 hypre_MPI_Comm_size(comm,&num_procs);
5292 hypre_MPI_Comm_rank(comm,&my_id);
5293 comm_pkg = hypre_ParCSRMatrixCommPkg(A);
5294 /* create if not yet built */
5295 if (!comm_pkg)
5296 {
5297 hypre_MatvecCommPkgCreate(A);
5298 comm_pkg = hypre_ParCSRMatrixCommPkg(A);
5299 }
5300
5301 /* setup initial memory, in ILUT, just guess with max nnz per row */
5302 nnz_A = A_diag_i[nLU];
5303 if (n > 0)
5304 {
5305 initial_alloc = hypre_min(nLU + ceil((nnz_A / 2.0) * nLU / n), nLU * lfil);
5306 }
5307 capacity_L = initial_alloc;
5308 capacity_U = initial_alloc;
5309
5310 D_data = hypre_CTAlloc(HYPRE_Real, n, HYPRE_MEMORY_DEVICE);
5311 L_diag_i = hypre_CTAlloc(HYPRE_Int, (n+1), HYPRE_MEMORY_DEVICE);
5312 U_diag_i = hypre_CTAlloc(HYPRE_Int, (n+1), HYPRE_MEMORY_DEVICE);
5313
5314 L_diag_j = hypre_CTAlloc(HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
5315 U_diag_j = hypre_CTAlloc(HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
5316 L_diag_data = hypre_CTAlloc(HYPRE_Real, capacity_L, HYPRE_MEMORY_DEVICE);
5317 U_diag_data = hypre_CTAlloc(HYPRE_Real, capacity_U, HYPRE_MEMORY_DEVICE);
5318
5319 ctrL = ctrU = 0;
5320
5321 ctrS = 0;
5322 S_diag_i = hypre_CTAlloc(HYPRE_Int, (m + 1), HYPRE_MEMORY_DEVICE);
5323 S_diag_i[0] = 0;
5324 /* only setup S part when n > nLU */
5325 if (m > 0)
5326 {
5327 capacity_S = hypre_min(m + ceil((nnz_A / 2.0) * m / n), m * lfil);
5328 S_diag_j = hypre_CTAlloc(HYPRE_Int, capacity_S, HYPRE_MEMORY_DEVICE);
5329 S_diag_data = hypre_CTAlloc(HYPRE_Real, capacity_S, HYPRE_MEMORY_DEVICE);
5330 }
5331
5332 /* setting up working array */
5333 iw = hypre_CTAlloc(HYPRE_Int,3*n,HYPRE_MEMORY_HOST);
5334 iL = iw + n;
5335 w = hypre_CTAlloc(HYPRE_Real,n,HYPRE_MEMORY_HOST);
5336 for (i = 0; i < n; i++)
5337 {
5338 iw[i] = -1;
5339 }
5340 L_diag_i[0] = U_diag_i[0] = 0;
5341 /* get reverse permutation (rperm).
5342 * rperm holds the reordered indexes.
5343 * rperm[old] -> new
5344 * perm[new] -> old
5345 */
5346 rperm = iw + 2*n;
5347
5348 if (!permp)
5349 {
5350 perm = hypre_TAlloc(HYPRE_Int, n, HYPRE_MEMORY_DEVICE);
5351 for (i = 0; i < n; i++)
5352 {
5353 perm[i] = i;
5354 }
5355 }
5356 else
5357 {
5358 perm = permp;
5359 }
5360
5361 if (!qpermp)
5362 {
5363 qperm = hypre_TAlloc(HYPRE_Int, n, HYPRE_MEMORY_DEVICE);
5364 for (i = 0; i < n; i++)
5365 {
5366 qperm[i] = i;
5367 }
5368 }
5369 else
5370 {
5371 qperm = qpermp;
5372 }
5373
5374 for (i = 0; i < n; i++)
5375 {
5376 rperm[perm[i]] = i;
5377 }
5378 /*
5379 * 2: Main loop of elimination
5380 * maintain two heaps
5381 * |----->*********<-----|-----*********|
5382 * |col heap***value heap|value in U****|
5383 */
5384
5385 /* main outer loop for upper part */
5386 for (ii = 0; ii < nLU; ii++)
5387 {
5388 /* get real row with perm */
5389 i = perm[ii];
5390 k1 = A_diag_i[i];
5391 k2 = A_diag_i[i+1];
5392 kl = ii-1;
5393 /* reset row norm of ith row */
5394 inorm = .0;
5395 for (j = k1; j < k2; j++)
5396 {
5397 inorm += fabs(A_diag_data[j]);
5398 }
5399 if (inorm == .0)
5400 {
5401 hypre_error_w_msg(HYPRE_ERROR_ARG,"WARNING: ILUT with zero row.\n");
5402 }
5403 inorm /= (HYPRE_Real)(k2-k1);
5404 /* set the scaled tol for that row */
5405 itolb = tol[0] * inorm;
5406 itolef = tol[1] * inorm;
5407
5408 /* reset displacement */
5409 lenhll = lenhlr = lenu = 0;
5410 w[ii] = 0.0;
5411 iw[ii] = ii;
5412 /* copy in data from A */
5413 for (j = k1; j < k2; j++)
5414 {
5415 /* get now col number */
5416 col = rperm[A_diag_j[j]];
5417 if (col < ii)
5418 {
5419 /* L part of it */
5420 iL[lenhll] = col;
5421 w[lenhll] = A_diag_data[j];
5422 iw[col] = lenhll++;
5423 /* add to heap, by col number */
5424 hypre_ILUMinHeapAddIRIi(iL,w,iw,lenhll);
5425 }
5426 else if (col == ii)
5427 {
5428 w[ii] = A_diag_data[j];
5429 }
5430 else
5431 {
5432 lenu++;
5433 jpos = lenu + ii;
5434 iL[jpos] = col;
5435 w[jpos] = A_diag_data[j];
5436 iw[col] = jpos;
5437 }
5438 }
5439
5440 /*
5441 * main elimination
5442 * need to maintain 2 heaps for L, one heap for col and one heaps for value
5443 * maintian an array for U, and do qsplit with quick sort after that
5444 * while the heap of col is greater than zero
5445 */
5446 while (lenhll > 0)
5447 {
5448
5449 /* get the next row from top of the heap */
5450 jrow = iL[0];
5451 dpiv = w[0] * D_data[jrow];
5452 w[0] = dpiv;
5453 /* now remove it from the top of the heap */
5454 hypre_ILUMinHeapRemoveIRIi(iL,w,iw,lenhll);
5455 lenhll--;
5456 /*
5457 * reset the drop part to -1
5458 * we don't need this iw anymore
5459 */
5460 iw[jrow] = -1;
5461 /* need to keep this one, move to the end of the heap */
5462 /* no longer need to maintain iw */
5463 hypre_swap2(iL,w,lenhll,kl-lenhlr);
5464 lenhlr++;
5465 hypre_ILUMaxrHeapAddRabsI(w+kl,iL+kl,lenhlr);
5466 /* loop for elimination */
5467 ku = U_diag_i[jrow+1];
5468 for (j = U_diag_i[jrow]; j < ku; j++)
5469 {
5470 col = U_diag_j[j];
5471 icol = iw[col];
5472 lxu = - dpiv*U_diag_data[j];
5473 /* we don't want to fill small number to empty place */
5474 if ((icol == -1) &&
5475 ((col < nLU && fabs(lxu) < itolb) || (col >= nLU && fabs(lxu) < itolef)))
5476 {
5477 continue;
5478 }
5479 if (icol == -1)
5480 {
5481 if (col < ii)
5482 {
5483 /* L part
5484 * not already in L part
5485 * put it to the end of heap
5486 * might overwrite some small entries, no issue
5487 */
5488 iL[lenhll] = col;
5489 w[lenhll] = lxu;
5490 iw[col] = lenhll++;
5491 /* add to heap, by col number */
5492 hypre_ILUMinHeapAddIRIi(iL,w,iw,lenhll);
5493 }
5494 else if (col == ii)
5495 {
5496 w[ii] += lxu;
5497 }
5498 else
5499 {
5500 /*
5501 * not already in U part
5502 * put is to the end of heap
5503 */
5504 lenu++;
5505 jpos = lenu + ii;
5506 iL[jpos] = col;
5507 w[jpos] = lxu;
5508 iw[col] = jpos;
5509 }
5510 }
5511 else
5512 {
5513 w[icol] += lxu;
5514 }
5515 }
5516 }/* while loop for the elimination of current row */
5517
5518 if (fabs(w[ii]) < MAT_TOL)
5519 {
5520 w[ii]=1e-06;
5521 }
5522 D_data[ii] = 1./w[ii];
5523 iw[ii] = -1;
5524
5525 /*
5526 * now pick up the largest lfil from L
5527 * L part is guarantee to be larger than itol
5528 */
5529
5530 lenl = lenhlr < lfil ? lenhlr : lfil;
5531 L_diag_i[ii+1] = L_diag_i[ii] + lenl;
5532 if (lenl > 0)
5533 {
5534 /* test if memory is enough */
5535 while (ctrL + lenl > capacity_L)
5536 {
5537 HYPRE_Int tmp = capacity_L;
5538 capacity_L = capacity_L * EXPAND_FACT + 1;
5539 L_diag_j = hypre_TReAlloc_v2(L_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
5540 L_diag_data = hypre_TReAlloc_v2(L_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_L, HYPRE_MEMORY_DEVICE);
5541 }
5542 ctrL += lenl;
5543 /* copy large data in */
5544 for (j = L_diag_i[ii]; j < ctrL; j++)
5545 {
5546 L_diag_j[j] = iL[kl];
5547 L_diag_data[j] = w[kl];
5548 hypre_ILUMaxrHeapRemoveRabsI(w+kl,iL+kl,lenhlr);
5549 lenhlr--;
5550 }
5551 }
5552 /*
5553 * now reset working array
5554 * L part already reset when move out of heap, only U part
5555 */
5556 ku = lenu+ii;
5557 for (j = ii + 1; j <= ku; j++)
5558 {
5559 iw[iL[j]] = -1;
5560 }
5561
5562 if (lenu < lfil)
5563 {
5564 /* we simply keep all of the data, no need to sort */
5565 lenhu = lenu;
5566 }
5567 else
5568 {
5569 /* need to sort the first small(hopefully) part of it */
5570 lenhu = lfil;
5571 /* quick split, only sort the first small part of the array */
5572 hypre_ILUMaxQSplitRabsI(w,iL,ii+1,ii+lenhu,ii+lenu);
5573 }
5574
5575 U_diag_i[ii+1] = U_diag_i[ii] + lenhu;
5576 if (lenhu > 0)
5577 {
5578 /* test if memory is enough */
5579 while (ctrU + lenhu > capacity_U)
5580 {
5581 HYPRE_Int tmp = capacity_U;
5582 capacity_U = capacity_U * EXPAND_FACT + 1;
5583 U_diag_j = hypre_TReAlloc_v2(U_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
5584 U_diag_data = hypre_TReAlloc_v2(U_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_U, HYPRE_MEMORY_DEVICE);
5585 }
5586 ctrU += lenhu;
5587 /* copy large data in */
5588 for (j = U_diag_i[ii]; j < ctrU; j++)
5589 {
5590 jpos = ii+1+j-U_diag_i[ii];
5591 U_diag_j[j] = iL[jpos];
5592 U_diag_data[j] = w[jpos];
5593 }
5594 }
5595 /* check and build u_end array */
5596 if (m > 0)
5597 {
5598 hypre_qsort1(U_diag_j,U_diag_data,U_diag_i[ii],U_diag_i[ii+1]-1);
5599 hypre_BinarySearch2(U_diag_j,nLU,U_diag_i[ii],U_diag_i[ii+1]-1,u_end_array + ii);
5600 }
5601 else
5602 {
5603 /* Everything is in U */
5604 u_end_array[ii] = ctrU;
5605 }
5606 }/* end of ii loop from 0 to nLU-1 */
5607
5608
5609 /* now main loop for Schur comlement part */
5610 for (ii = nLU; ii < n; ii++)
5611 {
5612 /* get real row with perm */
5613 i = perm[ii];
5614 k1 = A_diag_i[i];
5615 k2 = A_diag_i[i+1];
5616 kl = nLU-1;
5617 /* reset row norm of ith row */
5618 inorm = .0;
5619 for (j = k1; j < k2; j++)
5620 {
5621 inorm += fabs(A_diag_data[j]);
5622 }
5623 if (inorm == .0)
5624 {
5625 hypre_error_w_msg(HYPRE_ERROR_ARG,"WARNING: ILUT with zero row.\n");
5626 }
5627 inorm /= (HYPRE_Real)(k2-k1);
5628 /* set the scaled tol for that row */
5629 itols = tol[2] * inorm;
5630 itolef = tol[1] * inorm;
5631
5632 /* reset displacement */
5633 lenhll = lenhlr = lenu = 0;
5634 /* copy in data from A */
5635 for (j = k1; j < k2; j++)
5636 {
5637 /* get now col number */
5638 col = rperm[A_diag_j[j]];
5639 if (col < nLU)
5640 {
5641 /* L part of it */
5642 iL[lenhll] = col;
5643 w[lenhll] = A_diag_data[j];
5644 iw[col] = lenhll++;
5645 /* add to heap, by col number */
5646 hypre_ILUMinHeapAddIRIi(iL,w,iw,lenhll);
5647 }
5648 else if (col == ii)
5649 {
5650 /* the diagonla entry of S */
5651 iL[nLU] = col;
5652 w[nLU] = A_diag_data[j];
5653 iw[col] = nLU;
5654 }
5655 else
5656 {
5657 /* S part of it */
5658 lenu++;
5659 jpos = lenu + nLU;
5660 iL[jpos] = col;
5661 w[jpos] = A_diag_data[j];
5662 iw[col] = jpos;
5663 }
5664 }
5665
5666 /*
5667 * main elimination
5668 * need to maintain 2 heaps for L, one heap for col and one heaps for value
5669 * maintian an array for S, and do qsplit with quick sort after that
5670 * while the heap of col is greater than zero
5671 */
5672 while (lenhll > 0)
5673 {
5674 /* get the next row from top of the heap */
5675 jrow = iL[0];
5676 dpiv = w[0] * D_data[jrow];
5677 w[0] = dpiv;
5678 /* now remove it from the top of the heap */
5679 hypre_ILUMinHeapRemoveIRIi(iL,w,iw,lenhll);
5680 lenhll--;
5681 /*
5682 * reset the drop part to -1
5683 * we don't need this iw anymore
5684 */
5685 iw[jrow] = -1;
5686 /* need to keep this one, move to the end of the heap */
5687 /* no longer need to maintain iw */
5688 hypre_swap2(iL,w,lenhll,kl-lenhlr);
5689 lenhlr++;
5690 hypre_ILUMaxrHeapAddRabsI(w+kl,iL+kl,lenhlr);
5691 /* loop for elimination */
5692 ku = U_diag_i[jrow+1];
5693 for (j = U_diag_i[jrow]; j < ku; j++)
5694 {
5695 col = U_diag_j[j];
5696 icol = iw[col];
5697 lxu = - dpiv*U_diag_data[j];
5698 /* we don't want to fill small number to empty place */
5699 if ((icol == -1) &&
5700 ((col < nLU && fabs(lxu) < itolef) || ( col >= nLU && fabs(lxu) < itols )))
5701 {
5702 continue;
5703 }
5704 if (icol == -1)
5705 {
5706 if (col < nLU)
5707 {
5708 /* L part
5709 * not already in L part
5710 * put it to the end of heap
5711 * might overwrite some small entries, no issue
5712 */
5713 iL[lenhll] = col;
5714 w[lenhll] = lxu;
5715 iw[col] = lenhll++;
5716 /* add to heap, by col number */
5717 hypre_ILUMinHeapAddIRIi(iL,w,iw,lenhll);
5718 }
5719 else if (col == ii)
5720 {
5721 /* the diagonla entry of S */
5722 iL[nLU] = col;
5723 w[nLU] = A_diag_data[j];
5724 iw[col] = nLU;
5725 }
5726 else
5727 {
5728 /*
5729 * not already in S part
5730 * put is to the end of heap
5731 */
5732 lenu++;
5733 jpos = lenu + nLU;
5734 iL[jpos] = col;
5735 w[jpos] = lxu;
5736 iw[col] = jpos;
5737 }
5738 }
5739 else
5740 {
5741 w[icol] += lxu;
5742 }
5743 }
5744 }/* while loop for the elimination of current row */
5745
5746 /*
5747 * now pick up the largest lfil from L
5748 * L part is guarantee to be larger than itol
5749 */
5750
5751 lenl = lenhlr < lfil ? lenhlr : lfil;
5752 L_diag_i[ii+1] = L_diag_i[ii] + lenl;
5753 if (lenl > 0)
5754 {
5755 /* test if memory is enough */
5756 while (ctrL + lenl > capacity_L)
5757 {
5758 HYPRE_Int tmp = capacity_L;
5759 capacity_L = capacity_L * EXPAND_FACT + 1;
5760 L_diag_j = hypre_TReAlloc_v2(L_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
5761 L_diag_data = hypre_TReAlloc_v2(L_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_L, HYPRE_MEMORY_DEVICE);
5762 }
5763 ctrL += lenl;
5764 /* copy large data in */
5765 for (j = L_diag_i[ii]; j < ctrL; j ++)
5766 {
5767 L_diag_j[j] = iL[kl];
5768 L_diag_data[j] = w[kl];
5769 hypre_ILUMaxrHeapRemoveRabsI(w+kl,iL+kl,lenhlr);
5770 lenhlr--;
5771 }
5772 }
5773 /*
5774 * now reset working array
5775 * L part already reset when move out of heap, only S part
5776 */
5777 ku = lenu+nLU;
5778 for (j = nLU; j <= ku; j++)
5779 {
5780 iw[iL[j]] = -1;
5781 }
5782
5783 /* no dropping at this point of time for S */
5784 //lenhu = lenu < lfil ? lenu : lfil;
5785 lenhu = lenu;
5786 /* quick split, only sort the first small part of the array */
5787 hypre_ILUMaxQSplitRabsI(w,iL,nLU+1,nLU+lenhu,nLU+lenu);
5788 /* we have diagonal in S anyway */
5789 /* test if memory is enough */
5790 while (ctrS + lenhu + 1 > capacity_S)
5791 {
5792 HYPRE_Int tmp = capacity_S;
5793 capacity_S = capacity_S * EXPAND_FACT + 1;
5794 S_diag_j = hypre_TReAlloc_v2(S_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_S, HYPRE_MEMORY_DEVICE);
5795 S_diag_data = hypre_TReAlloc_v2(S_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_S, HYPRE_MEMORY_DEVICE);
5796 }
5797
5798 ctrS += (lenhu+1);
5799 S_diag_i[ii-nLU+1] = ctrS;
5800
5801 /* copy large data in, diagonal first */
5802 S_diag_j[S_diag_i[ii-nLU]] = iL[nLU]-nLU;
5803 S_diag_data[S_diag_i[ii-nLU]] = w[nLU];
5804 for (j = S_diag_i[ii-nLU] + 1; j < ctrS; j++)
5805 {
5806 jpos = nLU+j-S_diag_i[ii-nLU];
5807 S_diag_j[j] = iL[jpos]-nLU;
5808 S_diag_data[j] = w[jpos];
5809 }
5810 }/* end of ii loop from nLU to n-1 */
5811
5812 /*
5813 * 3: Finishing up and free
5814 */
5815
5816 /* First create Schur complement if necessary
5817 * Check if we need to create Schur complement
5818 */
5819 HYPRE_BigInt big_m = (HYPRE_BigInt)m;
5820 hypre_MPI_Allreduce(&big_m, &total_rows, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
5821 /* only form when total_rows > 0 */
5822 if ( total_rows > 0 )
5823 {
5824 /* now create S */
5825 /* need to get new column start */
5826 {
5827 HYPRE_BigInt global_start;
5828 hypre_MPI_Scan( &big_m, &global_start, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
5829 col_starts[0] = global_start - m;
5830 col_starts[1] = global_start;
5831 }
5832 /* We did nothing to A_offd, so all the data kept, just reorder them
5833 * The create function takes comm, global num rows/cols,
5834 * row/col start, num cols offd, nnz diag, nnz offd
5835 */
5836 S_offd_nnz = hypre_CSRMatrixNumNonzeros(A_offd);
5837 S_offd_ncols = hypre_CSRMatrixNumCols(A_offd);
5838
5839 matS = hypre_ParCSRMatrixCreate( comm,
5840 total_rows,
5841 total_rows,
5842 col_starts,
5843 col_starts,
5844 S_offd_ncols,
5845 S_diag_i[m],
5846 S_offd_nnz);
5847
5848 /* first put diagonal data in */
5849 S_diag = hypre_ParCSRMatrixDiag(matS);
5850
5851 hypre_CSRMatrixI(S_diag) = S_diag_i;
5852 hypre_CSRMatrixData(S_diag) = S_diag_data;
5853 hypre_CSRMatrixJ(S_diag) = S_diag_j;
5854
5855 /* now start to construct offdiag of S */
5856 S_offd = hypre_ParCSRMatrixOffd(matS);
5857 S_offd_i = hypre_TAlloc(HYPRE_Int, m+1, HYPRE_MEMORY_DEVICE);
5858 S_offd_j = hypre_TAlloc(HYPRE_Int, S_offd_nnz, HYPRE_MEMORY_DEVICE);
5859 S_offd_data = hypre_TAlloc(HYPRE_Real, S_offd_nnz, HYPRE_MEMORY_DEVICE);
5860 S_offd_colmap = hypre_CTAlloc(HYPRE_BigInt, S_offd_ncols, HYPRE_MEMORY_HOST);
5861
5862 /* simply use a loop to copy data from A_offd */
5863 S_offd_i[0] = 0;
5864 k3 = 0;
5865 for (i = 1; i <= e; i++)
5866 {
5867 S_offd_i[i] = k3;
5868 }
5869 for (i = 0; i < m_e; i++)
5870 {
5871 col = perm[i + nI];
5872 k1 = A_offd_i[col];
5873 k2 = A_offd_i[col+1];
5874 for (j = k1; j < k2; j++)
5875 {
5876 S_offd_j[k3] = A_offd_j[j];
5877 S_offd_data[k3++] = A_offd_data[j];
5878 }
5879 S_offd_i[i+e+1] = k3;
5880 }
5881
5882 /* give I, J, DATA to S_offd */
5883 hypre_CSRMatrixI(S_offd) = S_offd_i;
5884 hypre_CSRMatrixJ(S_offd) = S_offd_j;
5885 hypre_CSRMatrixData(S_offd) = S_offd_data;
5886
5887 /* now we need to update S_offd_colmap */
5888
5889 /* get total num of send */
5890 num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
5891 begin = hypre_ParCSRCommPkgSendMapStart(comm_pkg,0);
5892 end = hypre_ParCSRCommPkgSendMapStart(comm_pkg,num_sends);
5893 send_buf = hypre_TAlloc(HYPRE_BigInt, end - begin, HYPRE_MEMORY_HOST);
5894 /* copy new index into send_buf */
5895 for (i = begin; i < end; i++)
5896 {
5897 send_buf[i-begin] = rperm[hypre_ParCSRCommPkgSendMapElmt(comm_pkg,i)] - nLU + col_starts[0];
5898 }
5899
5900 /* main communication */
5901 comm_handle = hypre_ParCSRCommHandleCreate(21, comm_pkg, send_buf, S_offd_colmap);
5902 /* need this to synchronize, Isend & Irecv used in above functions */
5903 hypre_ParCSRCommHandleDestroy(comm_handle);
5904
5905 /* setup index */
5906 hypre_ParCSRMatrixColMapOffd(matS) = S_offd_colmap;
5907
5908 hypre_ILUSortOffdColmap(matS);
5909
5910 /* free */
5911 hypre_TFree(send_buf, HYPRE_MEMORY_HOST);
5912 } /* end of forming S */
5913
5914 /* now start to construct L and U */
5915 for (k = nLU; k < n; k++)
5916 {
5917 /* set U after nLU to be 0, and diag to be one */
5918 U_diag_i[k+1] = U_diag_i[nLU];
5919 D_data[k] = 1.;
5920 }
5921
5922 /* create parcsr matrix */
5923 matL = hypre_ParCSRMatrixCreate( comm,
5924 hypre_ParCSRMatrixGlobalNumRows(A),
5925 hypre_ParCSRMatrixGlobalNumRows(A),
5926 hypre_ParCSRMatrixRowStarts(A),
5927 hypre_ParCSRMatrixColStarts(A),
5928 0,
5929 L_diag_i[n],
5930 0 );
5931
5932 L_diag = hypre_ParCSRMatrixDiag(matL);
5933 hypre_CSRMatrixI(L_diag) = L_diag_i;
5934 if (L_diag_i[n] > 0)
5935 {
5936 hypre_CSRMatrixData(L_diag) = L_diag_data;
5937 hypre_CSRMatrixJ(L_diag) = L_diag_j;
5938 }
5939 else
5940 {
5941 /* we initialized some anyway, so remove if unused */
5942 hypre_TFree(L_diag_j,HYPRE_MEMORY_DEVICE);
5943 hypre_TFree(L_diag_data,HYPRE_MEMORY_DEVICE);
5944 }
5945 /* store (global) total number of nonzeros */
5946 local_nnz = (HYPRE_Real) (L_diag_i[n]);
5947 hypre_MPI_Allreduce(&local_nnz, &total_nnz, 1, HYPRE_MPI_REAL, hypre_MPI_SUM, comm);
5948 hypre_ParCSRMatrixDNumNonzeros(matL) = total_nnz;
5949
5950 matU = hypre_ParCSRMatrixCreate( comm,
5951 hypre_ParCSRMatrixGlobalNumRows(A),
5952 hypre_ParCSRMatrixGlobalNumRows(A),
5953 hypre_ParCSRMatrixRowStarts(A),
5954 hypre_ParCSRMatrixColStarts(A),
5955 0,
5956 U_diag_i[n],
5957 0 );
5958
5959 U_diag = hypre_ParCSRMatrixDiag(matU);
5960 hypre_CSRMatrixI(U_diag) = U_diag_i;
5961 if (U_diag_i[n] > 0)
5962 {
5963 hypre_CSRMatrixData(U_diag) = U_diag_data;
5964 hypre_CSRMatrixJ(U_diag) = U_diag_j;
5965 }
5966 else
5967 {
5968 /* we initialized some anyway, so remove if unused */
5969 hypre_TFree(U_diag_j,HYPRE_MEMORY_DEVICE);
5970 hypre_TFree(U_diag_data,HYPRE_MEMORY_DEVICE);
5971 }
5972 /* store (global) total number of nonzeros */
5973 local_nnz = (HYPRE_Real) (U_diag_i[n]);
5974 hypre_MPI_Allreduce(&local_nnz, &total_nnz, 1, HYPRE_MPI_REAL, hypre_MPI_SUM, comm);
5975 hypre_ParCSRMatrixDNumNonzeros(matU) = total_nnz;
5976
5977 /* free working array */
5978 hypre_TFree(iw,HYPRE_MEMORY_HOST);
5979 hypre_TFree(w,HYPRE_MEMORY_HOST);
5980
5981 if (!matS)
5982 {
5983 hypre_TFree(S_diag_i,HYPRE_MEMORY_DEVICE);
5984 }
5985
5986 if (!permp)
5987 {
5988 hypre_TFree(perm, HYPRE_MEMORY_DEVICE);
5989 }
5990
5991 if (!qpermp)
5992 {
5993 hypre_TFree(qperm, HYPRE_MEMORY_DEVICE);
5994 }
5995
5996 /* set matrix pointers */
5997 *Lptr = matL;
5998 *Dptr = D_data;
5999 *Uptr = matU;
6000 *Sptr = matS;
6001 *u_end = u_end_array;
6002
6003 return hypre_error_flag;
6004 }
6005
6006
6007 /* NSH setup */
6008 /* Setup NSH data */
6009 HYPRE_Int
hypre_NSHSetup(void * nsh_vdata,hypre_ParCSRMatrix * A,hypre_ParVector * f,hypre_ParVector * u)6010 hypre_NSHSetup( void *nsh_vdata,
6011 hypre_ParCSRMatrix *A,
6012 hypre_ParVector *f,
6013 hypre_ParVector *u )
6014 {
6015 MPI_Comm comm = hypre_ParCSRMatrixComm(A);
6016 hypre_ParNSHData *nsh_data = (hypre_ParNSHData*) nsh_vdata;
6017
6018 // HYPRE_Int i;
6019 // HYPRE_Int num_threads;
6020 // HYPRE_Int debug_flag = 0;
6021
6022 /* pointers to NSH data */
6023 HYPRE_Int logging = hypre_ParNSHDataLogging(nsh_data);
6024 HYPRE_Int print_level = hypre_ParNSHDataPrintLevel(nsh_data);
6025
6026 hypre_ParCSRMatrix *matA = hypre_ParNSHDataMatA(nsh_data);
6027 hypre_ParCSRMatrix *matM = hypre_ParNSHDataMatM(nsh_data);
6028
6029 // HYPRE_Int n = hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(A));
6030 HYPRE_Int num_procs, my_id;
6031
6032 hypre_ParVector *Utemp;
6033 hypre_ParVector *Ftemp;
6034 hypre_ParVector *F_array = hypre_ParNSHDataF(nsh_data);
6035 hypre_ParVector *U_array = hypre_ParNSHDataU(nsh_data);
6036 hypre_ParVector *residual = hypre_ParNSHDataResidual(nsh_data);
6037 HYPRE_Real *rel_res_norms = hypre_ParNSHDataRelResNorms(nsh_data);
6038
6039 /* solver setting */
6040 HYPRE_Real *droptol = hypre_ParNSHDataDroptol(nsh_data);
6041 HYPRE_Real mr_tol = hypre_ParNSHDataMRTol(nsh_data);
6042 HYPRE_Int mr_max_row_nnz = hypre_ParNSHDataMRMaxRowNnz(nsh_data);
6043 HYPRE_Int mr_max_iter = hypre_ParNSHDataMRMaxIter(nsh_data);
6044 HYPRE_Int mr_col_version = hypre_ParNSHDataMRColVersion(nsh_data);
6045 HYPRE_Real nsh_tol = hypre_ParNSHDataNSHTol(nsh_data);
6046 HYPRE_Int nsh_max_row_nnz = hypre_ParNSHDataNSHMaxRowNnz(nsh_data);
6047 HYPRE_Int nsh_max_iter = hypre_ParNSHDataNSHMaxIter(nsh_data);
6048
6049 /* ----- begin -----*/
6050
6051 //num_threads = hypre_NumThreads();
6052
6053 hypre_MPI_Comm_size(comm,&num_procs);
6054 hypre_MPI_Comm_rank(comm,&my_id);
6055
6056 /* Free Previously allocated data, if any not destroyed */
6057 if (matM)
6058 {
6059 hypre_TFree(matM, HYPRE_MEMORY_HOST);
6060 matM = NULL;
6061 }
6062
6063 /* clear old l1_norm data, if created */
6064 if (hypre_ParNSHDataL1Norms(nsh_data))
6065 {
6066 hypre_TFree(hypre_ParNSHDataL1Norms(nsh_data), HYPRE_MEMORY_HOST);
6067 hypre_ParNSHDataL1Norms(nsh_data) = NULL;
6068 }
6069
6070 /* setup temporary storage
6071 * first check is they've already here
6072 */
6073 if (hypre_ParNSHDataUTemp(nsh_data))
6074 {
6075 hypre_ParVectorDestroy(hypre_ParNSHDataUTemp(nsh_data));
6076 hypre_ParNSHDataUTemp(nsh_data) = NULL;
6077 }
6078 if (hypre_ParNSHDataFTemp(nsh_data))
6079 {
6080 hypre_ParVectorDestroy(hypre_ParNSHDataFTemp(nsh_data));
6081 hypre_ParNSHDataFTemp(nsh_data) = NULL;
6082 }
6083 if (hypre_ParNSHDataResidual(nsh_data))
6084 {
6085 hypre_ParVectorDestroy(hypre_ParNSHDataResidual(nsh_data));
6086 hypre_ParNSHDataResidual(nsh_data) = NULL;
6087 }
6088 if (hypre_ParNSHDataRelResNorms(nsh_data))
6089 {
6090 hypre_TFree(hypre_ParNSHDataRelResNorms(nsh_data), HYPRE_MEMORY_HOST);
6091 hypre_ParNSHDataRelResNorms(nsh_data) = NULL;
6092 }
6093
6094 /* start to create working vectors */
6095 Utemp = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A),
6096 hypre_ParCSRMatrixGlobalNumRows(A),
6097 hypre_ParCSRMatrixRowStarts(A));
6098 hypre_ParVectorInitialize(Utemp);
6099 hypre_ParNSHDataUTemp(nsh_data) = Utemp;
6100
6101 Ftemp = hypre_ParVectorCreate(hypre_ParCSRMatrixComm(A),
6102 hypre_ParCSRMatrixGlobalNumRows(A),
6103 hypre_ParCSRMatrixRowStarts(A));
6104 hypre_ParVectorInitialize(Ftemp);
6105 hypre_ParNSHDataFTemp(nsh_data) = Ftemp;
6106 /* set matrix, solution and rhs pointers */
6107 matA = A;
6108 F_array = f;
6109 U_array = u;
6110
6111 /* NSH compute approximate inverse, see par_ilu.c */
6112 hypre_ILUParCSRInverseNSH(matA, &matM, droptol, mr_tol, nsh_tol, DIVIDE_TOL, mr_max_row_nnz,
6113 nsh_max_row_nnz, mr_max_iter, nsh_max_iter, mr_col_version, print_level);
6114
6115 /* set pointers to NSH data */
6116 hypre_ParNSHDataMatA(nsh_data) = matA;
6117 hypre_ParNSHDataF(nsh_data) = F_array;
6118 hypre_ParNSHDataU(nsh_data) = U_array;
6119 hypre_ParNSHDataMatM(nsh_data) = matM;
6120
6121 /* compute operator complexity */
6122 hypre_ParCSRMatrixSetDNumNonzeros(matA);
6123 hypre_ParCSRMatrixSetDNumNonzeros(matM);
6124 /* compute complexity */
6125 hypre_ParNSHDataOperatorComplexity(nsh_data) = hypre_ParCSRMatrixDNumNonzeros(matM)/hypre_ParCSRMatrixDNumNonzeros(matA);
6126 if (my_id == 0)
6127 {
6128 hypre_printf("NSH SETUP: operator complexity = %f \n", hypre_ParNSHDataOperatorComplexity(nsh_data));
6129 }
6130
6131 if ( logging > 1 ) {
6132 residual =
6133 hypre_ParVectorCreate(hypre_ParCSRMatrixComm(matA),
6134 hypre_ParCSRMatrixGlobalNumRows(matA),
6135 hypre_ParCSRMatrixRowStarts(matA) );
6136 hypre_ParVectorInitialize(residual);
6137 hypre_ParNSHDataResidual(nsh_data)= residual;
6138 }
6139 else{
6140 hypre_ParNSHDataResidual(nsh_data) = NULL;
6141 }
6142 rel_res_norms = hypre_CTAlloc(HYPRE_Real, hypre_ParNSHDataMaxIter(nsh_data), HYPRE_MEMORY_HOST);
6143 hypre_ParNSHDataRelResNorms(nsh_data) = rel_res_norms;
6144
6145 return hypre_error_flag;
6146 }
6147
6148
6149 /* ILU(0) for RAS, has some external rows
6150 * A = input matrix
6151 * perm = permutation array indicating ordering of factorization. Perm could come from a
6152 * CF_marker array or a reordering routine.
6153 * nLU = size of computed LDU factorization.
6154 * Lptr, Dptr, Uptr, Sptr = L, D, U, S factors.
6155 * will form global Schur Matrix if nLU < n
6156 */
6157 HYPRE_Int
hypre_ILUSetupILU0RAS(hypre_ParCSRMatrix * A,HYPRE_Int * perm,HYPRE_Int nLU,hypre_ParCSRMatrix ** Lptr,HYPRE_Real ** Dptr,hypre_ParCSRMatrix ** Uptr)6158 hypre_ILUSetupILU0RAS(hypre_ParCSRMatrix *A, HYPRE_Int *perm, HYPRE_Int nLU,
6159 hypre_ParCSRMatrix **Lptr, HYPRE_Real** Dptr, hypre_ParCSRMatrix **Uptr)
6160 {
6161 HYPRE_Int i, ii, j, k, k1, k2, ctrU, ctrL, lenl, lenu, jpiv, col, jpos;
6162 HYPRE_Int *iw, *iL, *iU;
6163 HYPRE_Real dd, t, dpiv, lxu, *wU, *wL;
6164
6165 /* communication stuffs for S */
6166 MPI_Comm comm = hypre_ParCSRMatrixComm(A);
6167 HYPRE_Int num_procs;
6168 // HYPRE_Int S_offd_nnz, S_offd_ncols;
6169 hypre_ParCSRCommPkg *comm_pkg;
6170 // hypre_ParCSRCommHandle *comm_handle;
6171 // HYPRE_Int num_sends, begin, end;
6172 // HYPRE_Int *send_buf = NULL;
6173
6174 /* data objects for A */
6175 hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
6176 hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A);
6177 HYPRE_Real *A_diag_data = hypre_CSRMatrixData(A_diag);
6178 HYPRE_Int *A_diag_i = hypre_CSRMatrixI(A_diag);
6179 HYPRE_Int *A_diag_j = hypre_CSRMatrixJ(A_diag);
6180 HYPRE_Real *A_offd_data = hypre_CSRMatrixData(A_offd);
6181 HYPRE_Int *A_offd_i = hypre_CSRMatrixI(A_offd);
6182 HYPRE_Int *A_offd_j = hypre_CSRMatrixJ(A_offd);
6183
6184 /* size of problem and external matrix */
6185 HYPRE_Int n = hypre_CSRMatrixNumRows(A_diag);
6186 // HYPRE_Int m = n - nLU;
6187 HYPRE_Int ext = hypre_CSRMatrixNumCols(A_offd);
6188 HYPRE_Int total_rows = n + ext;
6189 HYPRE_BigInt col_starts[2];
6190 HYPRE_BigInt global_num_rows;
6191 HYPRE_Real local_nnz, total_nnz;
6192
6193 /* data objects for L, D, U */
6194 hypre_ParCSRMatrix *matL;
6195 hypre_ParCSRMatrix *matU;
6196 hypre_CSRMatrix *L_diag;
6197 hypre_CSRMatrix *U_diag;
6198 HYPRE_Real *D_data;
6199 HYPRE_Real *L_diag_data;
6200 HYPRE_Int *L_diag_i;
6201 HYPRE_Int *L_diag_j;
6202 HYPRE_Real *U_diag_data;
6203 HYPRE_Int *U_diag_i;
6204 HYPRE_Int *U_diag_j;
6205
6206 /* data objects for E, external matrix */
6207 HYPRE_Int *E_i;
6208 HYPRE_Int *E_j;
6209 HYPRE_Real *E_data;
6210
6211 /* memory management */
6212 HYPRE_Int initial_alloc = 0;
6213 HYPRE_Int capacity_L;
6214 HYPRE_Int capacity_U;
6215 HYPRE_Int nnz_A = A_diag_i[n];
6216
6217 /* reverse permutation array */
6218 HYPRE_Int *rperm;
6219 /* the original permutation array */
6220 HYPRE_Int *perm_old;
6221
6222 /* start setup
6223 * get communication stuffs first
6224 */
6225 hypre_MPI_Comm_size(comm,&num_procs);
6226 comm_pkg = hypre_ParCSRMatrixCommPkg(A);
6227 /* setup if not yet built */
6228 if (!comm_pkg)
6229 {
6230 hypre_MatvecCommPkgCreate(A);
6231 comm_pkg = hypre_ParCSRMatrixCommPkg(A);
6232 }
6233
6234 /* check for correctness */
6235 if (nLU < 0 || nLU > n)
6236 {
6237 hypre_error_w_msg(HYPRE_ERROR_ARG,"WARNING: nLU out of range.\n");
6238 }
6239
6240 /* Allocate memory for L,D,U,S factors */
6241 if (n > 0)
6242 {
6243 initial_alloc = (n + ext) + ceil((nnz_A / 2.0)*total_rows/n);
6244 }
6245 capacity_L = initial_alloc;
6246 capacity_U = initial_alloc;
6247
6248 D_data = hypre_TAlloc(HYPRE_Real, total_rows, HYPRE_MEMORY_DEVICE);
6249 L_diag_i = hypre_TAlloc(HYPRE_Int, total_rows+1, HYPRE_MEMORY_DEVICE);
6250 L_diag_j = hypre_TAlloc(HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
6251 L_diag_data = hypre_TAlloc(HYPRE_Real, capacity_L, HYPRE_MEMORY_DEVICE);
6252 U_diag_i = hypre_TAlloc(HYPRE_Int, total_rows+1, HYPRE_MEMORY_DEVICE);
6253 U_diag_j = hypre_TAlloc(HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
6254 U_diag_data = hypre_TAlloc(HYPRE_Real, capacity_U, HYPRE_MEMORY_DEVICE);
6255
6256 /* allocate working arrays */
6257 iw = hypre_TAlloc(HYPRE_Int, 4*total_rows, HYPRE_MEMORY_HOST);
6258 iL = iw+total_rows;
6259 rperm = iw + 2 * total_rows;
6260 perm_old = perm;
6261 perm = iw + 3 * total_rows;
6262 wL = hypre_TAlloc(HYPRE_Real, total_rows, HYPRE_MEMORY_HOST);
6263 ctrU = ctrL = 0;
6264 L_diag_i[0] = U_diag_i[0] = 0;
6265 /* set marker array iw to -1 */
6266 for (i = 0; i < total_rows; i++)
6267 {
6268 iw[i] = -1;
6269 }
6270
6271 /* expand perm to suit extra data, remember to free */
6272 for (i = 0; i < n; i++)
6273 {
6274 perm[i] = perm_old[i];
6275 }
6276 for (i = n; i < total_rows; i++)
6277 {
6278 perm[i] = i;
6279 }
6280
6281 /* get reverse permutation (rperm).
6282 * rperm holds the reordered indexes.
6283 */
6284 for (i = 0; i < total_rows; i++)
6285 {
6286 rperm[perm[i]] = i;
6287 }
6288
6289 /* get external rows */
6290 hypre_ILUBuildRASExternalMatrix(A, rperm, &E_i, &E_j, &E_data);
6291
6292 /*--------- Begin Factorization. Work in permuted space ----
6293 * this is the first part, without offd
6294 */
6295 for (ii = 0; ii < nLU; ii++)
6296 {
6297 // get row i
6298 i = perm[ii];
6299 // get extents of row i
6300 k1=A_diag_i[i];
6301 k2=A_diag_i[i+1];
6302
6303 /*-------------------- unpack L & U-parts of row of A in arrays w */
6304 iU = iL+ii;
6305 wU = wL+ii;
6306 /*-------------------- diagonal entry */
6307 dd = 0.0;
6308 lenl = lenu = 0;
6309 iw[ii] = ii;
6310 /*-------------------- scan & unwrap column */
6311 for (j = k1; j < k2; j++)
6312 {
6313 col = rperm[A_diag_j[j]];
6314 t = A_diag_data[j];
6315 if ( col < ii )
6316 {
6317 iw[col] = lenl;
6318 iL[lenl] = col;
6319 wL[lenl++] = t;
6320 }
6321 else if (col > ii)
6322 {
6323 iw[col] = lenu;
6324 iU[lenu] = col;
6325 wU[lenu++] = t;
6326 }
6327 else
6328 {
6329 dd=t;
6330 }
6331 }
6332
6333 /* eliminate row */
6334 /*-------------------------------------------------------------------------
6335 * In order to do the elimination in the correct order we must select the
6336 * smallest column index among iL[k], k = j, j+1, ..., lenl-1. For ILU(0),
6337 * no new fill-ins are expect, so we can pre-sort iL and wL prior to the
6338 * entering the elimination loop.
6339 *-----------------------------------------------------------------------*/
6340 // hypre_quickSortIR(iL, wL, iw, 0, (lenl-1));
6341 hypre_qsort3ir(iL, wL, iw, 0, (lenl-1));
6342 for (j = 0; j < lenl; j++)
6343 {
6344 jpiv = iL[j];
6345 /* get factor/ pivot element */
6346 dpiv = wL[j] * D_data[jpiv];
6347 /* store entry in L */
6348 wL[j] = dpiv;
6349
6350 /* zero out element - reset pivot */
6351 iw[jpiv] = -1;
6352 /* combine current row and pivot row */
6353 for (k = U_diag_i[jpiv]; k < U_diag_i[jpiv+1]; k++)
6354 {
6355 col = U_diag_j[k];
6356 jpos = iw[col];
6357
6358 /* Only fill-in nonzero pattern (jpos != 0) */
6359 if (jpos < 0)
6360 {
6361 continue;
6362 }
6363
6364 lxu = - U_diag_data[k] * dpiv;
6365 if (col < ii)
6366 {
6367 /* dealing with L part */
6368 wL[jpos] += lxu;
6369 }
6370 else if (col > ii)
6371 {
6372 /* dealing with U part */
6373 wU[jpos] += lxu;
6374 }
6375 else
6376 {
6377 /* diagonal update */
6378 dd += lxu;
6379 }
6380 }
6381 }
6382 /* restore iw (only need to restore diagonal and U part */
6383 iw[ii] = -1;
6384 for (j = 0; j < lenu; j++)
6385 {
6386 iw[iU[j]] = -1;
6387 }
6388
6389 /* Update LDU factors */
6390 /* L part */
6391 /* Check that memory is sufficient */
6392 while ((ctrL+lenl) > capacity_L)
6393 {
6394 HYPRE_Int tmp = capacity_L;
6395 capacity_L = capacity_L * EXPAND_FACT + 1;
6396 L_diag_j = hypre_TReAlloc_v2(L_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
6397 L_diag_data = hypre_TReAlloc_v2(L_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_L, HYPRE_MEMORY_DEVICE);
6398 }
6399 //hypre_TMemcpy(&(L_diag_j)[ctrL], iL, HYPRE_Int, lenl, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
6400 //hypre_TMemcpy(&(L_diag_data)[ctrL], wL, HYPRE_Real, lenl, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
6401 hypre_TMemcpy(&(L_diag_j)[ctrL], iL, HYPRE_Int, lenl,
6402 HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
6403 hypre_TMemcpy(&(L_diag_data)[ctrL], wL, HYPRE_Real, lenl,
6404 HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
6405 L_diag_i[ii+1] = (ctrL+=lenl);
6406
6407 /* diagonal part (we store the inverse) */
6408 if (fabs(dd) < MAT_TOL)
6409 {
6410 dd = 1.0e-6;
6411 }
6412 D_data[ii] = 1./dd;
6413
6414 /* U part */
6415 /* Check that memory is sufficient */
6416 while ((ctrU+lenu) > capacity_U)
6417 {
6418 HYPRE_Int tmp = capacity_U;
6419 capacity_U = capacity_U * EXPAND_FACT + 1;
6420 U_diag_j = hypre_TReAlloc_v2(U_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
6421 U_diag_data = hypre_TReAlloc_v2(U_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_U, HYPRE_MEMORY_DEVICE);
6422 }
6423 //hypre_TMemcpy(&(U_diag_j)[ctrU], iU, HYPRE_Int, lenu, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
6424 //hypre_TMemcpy(&(U_diag_data)[ctrU], wU, HYPRE_Real, lenu, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
6425 hypre_TMemcpy(&(U_diag_j)[ctrU], iU, HYPRE_Int, lenu,
6426 HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
6427 hypre_TMemcpy(&(U_diag_data)[ctrU], wU, HYPRE_Real, lenu,
6428 HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
6429 U_diag_i[ii+1] = (ctrU+=lenu);
6430 }
6431
6432 /*--------- Begin Factorization in lower part ----
6433 * here we need to get off diagonals in
6434 */
6435 for (ii = nLU; ii < n; ii++)
6436 {
6437 // get row i
6438 i = perm[ii];
6439 // get extents of row i
6440 k1=A_diag_i[i];
6441 k2=A_diag_i[i+1];
6442
6443 /*-------------------- unpack L & U-parts of row of A in arrays w */
6444 iU = iL+ii;
6445 wU = wL+ii;
6446 /*-------------------- diagonal entry */
6447 dd = 0.0;
6448 lenl = lenu = 0;
6449 iw[ii] = ii;
6450 /*-------------------- scan & unwrap column */
6451 for (j = k1; j < k2; j++)
6452 {
6453 col = rperm[A_diag_j[j]];
6454 t = A_diag_data[j];
6455 if (col < ii)
6456 {
6457 iw[col] = lenl;
6458 iL[lenl] = col;
6459 wL[lenl++] = t;
6460 }
6461 else if (col > ii)
6462 {
6463 iw[col] = lenu;
6464 iU[lenu] = col;
6465 wU[lenu++] = t;
6466 }
6467 else
6468 {
6469 dd=t;
6470 }
6471 }
6472
6473 /*------------------ sjcan offd*/
6474 k1=A_offd_i[i];
6475 k2=A_offd_i[i+1];
6476 for (j = k1; j < k2; j++)
6477 {
6478 /* add offd to U part, all offd are U for this part */
6479 col = A_offd_j[j] + n;
6480 t = A_offd_data[j];
6481 iw[col] = lenu;
6482 iU[lenu] = col;
6483 wU[lenu++] = t;
6484 }
6485
6486 /* eliminate row */
6487 /*-------------------------------------------------------------------------
6488 * In order to do the elimination in the correct order we must select the
6489 * smallest column index among iL[k], k = j, j+1, ..., lenl-1. For ILU(0),
6490 * no new fill-ins are expect, so we can pre-sort iL and wL prior to the
6491 * entering the elimination loop.
6492 *-----------------------------------------------------------------------*/
6493 // hypre_quickSortIR(iL, wL, iw, 0, (lenl-1));
6494 hypre_qsort3ir(iL, wL, iw, 0, (lenl-1));
6495 for (j = 0; j < lenl; j++)
6496 {
6497 jpiv = iL[j];
6498 /* get factor/ pivot element */
6499 dpiv = wL[j] * D_data[jpiv];
6500 /* store entry in L */
6501 wL[j] = dpiv;
6502
6503 /* zero out element - reset pivot */
6504 iw[jpiv] = -1;
6505 /* combine current row and pivot row */
6506 for (k = U_diag_i[jpiv]; k < U_diag_i[jpiv+1]; k++)
6507 {
6508 col = U_diag_j[k];
6509 jpos = iw[col];
6510
6511 /* Only fill-in nonzero pattern (jpos != 0) */
6512 if (jpos < 0)
6513 {
6514 continue;
6515 }
6516
6517 lxu = - U_diag_data[k] * dpiv;
6518 if (col < ii)
6519 {
6520 /* dealing with L part */
6521 wL[jpos] += lxu;
6522 }
6523 else if (col > ii)
6524 {
6525 /* dealing with U part */
6526 wU[jpos] += lxu;
6527 }
6528 else
6529 {
6530 /* diagonal update */
6531 dd += lxu;
6532 }
6533 }
6534 }
6535 /* restore iw (only need to restore diagonal and U part */
6536 iw[ii] = -1;
6537 for (j = 0; j < lenu; j++)
6538 {
6539 iw[iU[j]] = -1;
6540 }
6541
6542 /* Update LDU factors */
6543 /* L part */
6544 /* Check that memory is sufficient */
6545 while ((ctrL+lenl) > capacity_L)
6546 {
6547 HYPRE_Int tmp = capacity_L;
6548 capacity_L = capacity_L * EXPAND_FACT + 1;
6549 L_diag_j = hypre_TReAlloc_v2(L_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
6550 L_diag_data = hypre_TReAlloc_v2(L_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_L, HYPRE_MEMORY_DEVICE);
6551 }
6552 //hypre_TMemcpy(&(L_diag_j)[ctrL], iL, HYPRE_Int, lenl, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
6553 //hypre_TMemcpy(&(L_diag_data)[ctrL], wL, HYPRE_Real, lenl, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
6554 hypre_TMemcpy(&(L_diag_j)[ctrL], iL, HYPRE_Int, lenl,
6555 HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
6556 hypre_TMemcpy(&(L_diag_data)[ctrL], wL, HYPRE_Real, lenl,
6557 HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
6558 L_diag_i[ii+1] = (ctrL+=lenl);
6559
6560 /* diagonal part (we store the inverse) */
6561 if (fabs(dd) < MAT_TOL)
6562 {
6563 dd = 1.0e-6;
6564 }
6565 D_data[ii] = 1./dd;
6566
6567 /* U part */
6568 /* Check that memory is sufficient */
6569 while ((ctrU+lenu) > capacity_U)
6570 {
6571 HYPRE_Int tmp = capacity_U;
6572 capacity_U = capacity_U * EXPAND_FACT + 1;
6573 U_diag_j = hypre_TReAlloc_v2(U_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
6574 U_diag_data = hypre_TReAlloc_v2(U_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_U, HYPRE_MEMORY_DEVICE);
6575 }
6576 //hypre_TMemcpy(&(U_diag_j)[ctrU], iU, HYPRE_Int, lenu, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
6577 //hypre_TMemcpy(&(U_diag_data)[ctrU], wU, HYPRE_Real, lenu, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
6578 hypre_TMemcpy(&(U_diag_j)[ctrU], iU, HYPRE_Int, lenu,
6579 HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
6580 hypre_TMemcpy(&(U_diag_data)[ctrU], wU, HYPRE_Real, lenu,
6581 HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
6582 U_diag_i[ii+1] = (ctrU+=lenu);
6583 }
6584
6585 /*--------- Begin Factorization in external part ----
6586 * here we need to get off diagonals in
6587 */
6588 for (ii = n ; ii < total_rows ; ii++)
6589 {
6590 // get row i
6591 i = ii-n;
6592 // get extents of row i
6593 k1=E_i[i];
6594 k2=E_i[i+1];
6595
6596 /*-------------------- unpack L & U-parts of row of A in arrays w */
6597 iU = iL+ii;
6598 wU = wL+ii;
6599 /*-------------------- diagonal entry */
6600 dd = 0.0;
6601 lenl = lenu = 0;
6602 iw[ii] = ii;
6603 /*-------------------- scan & unwrap column */
6604 for (j = k1; j < k2; j++)
6605 {
6606 col = rperm[E_j[j]];
6607 t = E_data[j];
6608 if (col < ii)
6609 {
6610 iw[col] = lenl;
6611 iL[lenl] = col;
6612 wL[lenl++] = t;
6613 }
6614 else if (col > ii)
6615 {
6616 iw[col] = lenu;
6617 iU[lenu] = col;
6618 wU[lenu++] = t;
6619 }
6620 else
6621 {
6622 dd=t;
6623 }
6624 }
6625
6626 /* eliminate row */
6627 /*-------------------------------------------------------------------------
6628 * In order to do the elimination in the correct order we must select the
6629 * smallest column index among iL[k], k = j, j+1, ..., lenl-1. For ILU(0),
6630 * no new fill-ins are expect, so we can pre-sort iL and wL prior to the
6631 * entering the elimination loop.
6632 *-----------------------------------------------------------------------*/
6633 // hypre_quickSortIR(iL, wL, iw, 0, (lenl-1));
6634 hypre_qsort3ir(iL, wL, iw, 0, (lenl-1));
6635 for (j = 0; j < lenl; j++)
6636 {
6637 jpiv = iL[j];
6638 /* get factor/ pivot element */
6639 dpiv = wL[j] * D_data[jpiv];
6640 /* store entry in L */
6641 wL[j] = dpiv;
6642
6643 /* zero out element - reset pivot */
6644 iw[jpiv] = -1;
6645 /* combine current row and pivot row */
6646 for (k = U_diag_i[jpiv]; k < U_diag_i[jpiv+1]; k++)
6647 {
6648 col = U_diag_j[k];
6649 jpos = iw[col];
6650
6651 /* Only fill-in nonzero pattern (jpos != 0) */
6652 if (jpos < 0)
6653 {
6654 continue;
6655 }
6656
6657 lxu = - U_diag_data[k] * dpiv;
6658 if (col < ii)
6659 {
6660 /* dealing with L part */
6661 wL[jpos] += lxu;
6662 }
6663 else if (col > ii)
6664 {
6665 /* dealing with U part */
6666 wU[jpos] += lxu;
6667 }
6668 else
6669 {
6670 /* diagonal update */
6671 dd += lxu;
6672 }
6673 }
6674 }
6675 /* restore iw (only need to restore diagonal and U part */
6676 iw[ii] = -1;
6677 for (j = 0; j < lenu; j++)
6678 {
6679 iw[iU[j]] = -1;
6680 }
6681
6682 /* Update LDU factors */
6683 /* L part */
6684 /* Check that memory is sufficient */
6685 while ((ctrL+lenl) > capacity_L)
6686 {
6687 HYPRE_Int tmp = capacity_L;
6688 capacity_L = capacity_L * EXPAND_FACT + 1;
6689 L_diag_j = hypre_TReAlloc_v2(L_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
6690 L_diag_data = hypre_TReAlloc_v2(L_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_L, HYPRE_MEMORY_DEVICE);
6691 }
6692 //hypre_TMemcpy(&(L_diag_j)[ctrL], iL, HYPRE_Int, lenl, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
6693 //hypre_TMemcpy(&(L_diag_data)[ctrL], wL, HYPRE_Real, lenl, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
6694 hypre_TMemcpy(&(L_diag_j)[ctrL], iL, HYPRE_Int, lenl,
6695 HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
6696 hypre_TMemcpy(&(L_diag_data)[ctrL], wL, HYPRE_Real, lenl,
6697 HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
6698 L_diag_i[ii+1] = (ctrL+=lenl);
6699
6700 /* diagonal part (we store the inverse) */
6701 if (fabs(dd) < MAT_TOL)
6702 {
6703 dd = 1.0e-6;
6704 }
6705 D_data[ii] = 1./dd;
6706
6707 /* U part */
6708 /* Check that memory is sufficient */
6709 while ((ctrU+lenu) > capacity_U)
6710 {
6711 HYPRE_Int tmp = capacity_U;
6712 capacity_U = capacity_U * EXPAND_FACT + 1;
6713 U_diag_j = hypre_TReAlloc_v2(U_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
6714 U_diag_data = hypre_TReAlloc_v2(U_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_U, HYPRE_MEMORY_DEVICE);
6715 }
6716 //hypre_TMemcpy(&(U_diag_j)[ctrU], iU, HYPRE_Int, lenu, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
6717 //hypre_TMemcpy(&(U_diag_data)[ctrU], wU, HYPRE_Real, lenu, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);
6718 hypre_TMemcpy(&(U_diag_j)[ctrU], iU, HYPRE_Int, lenu,
6719 HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
6720 hypre_TMemcpy(&(U_diag_data)[ctrU], wU, HYPRE_Real, lenu,
6721 HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
6722 U_diag_i[ii+1] = (ctrU+=lenu);
6723 }
6724
6725 HYPRE_BigInt big_total_rows = (HYPRE_BigInt)total_rows;
6726 hypre_MPI_Allreduce(&big_total_rows, &global_num_rows, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
6727
6728 /* need to get new column start */
6729 {
6730 HYPRE_BigInt global_start;
6731 hypre_MPI_Scan( &big_total_rows, &global_start, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
6732 col_starts[0] = global_start - total_rows;
6733 col_starts[1] = global_start;
6734 }
6735
6736 matL = hypre_ParCSRMatrixCreate( comm,
6737 global_num_rows,
6738 global_num_rows,
6739 col_starts,
6740 col_starts,
6741 0,
6742 ctrL,
6743 0 );
6744
6745 L_diag = hypre_ParCSRMatrixDiag(matL);
6746 hypre_CSRMatrixI(L_diag) = L_diag_i;
6747 if (ctrL)
6748 {
6749 hypre_CSRMatrixData(L_diag) = L_diag_data;
6750 hypre_CSRMatrixJ(L_diag) = L_diag_j;
6751 }
6752 else
6753 {
6754 /* we've allocated some memory, so free if not used */
6755 hypre_TFree(L_diag_j,HYPRE_MEMORY_DEVICE);
6756 hypre_TFree(L_diag_data,HYPRE_MEMORY_DEVICE);
6757 }
6758 /* store (global) total number of nonzeros */
6759 local_nnz = (HYPRE_Real) ctrL;
6760 hypre_MPI_Allreduce(&local_nnz, &total_nnz, 1, HYPRE_MPI_REAL, hypre_MPI_SUM, comm);
6761 hypre_ParCSRMatrixDNumNonzeros(matL) = total_nnz;
6762
6763 matU = hypre_ParCSRMatrixCreate( comm,
6764 global_num_rows,
6765 global_num_rows,
6766 col_starts,
6767 col_starts,
6768 0,
6769 ctrU,
6770 0 );
6771
6772 U_diag = hypre_ParCSRMatrixDiag(matU);
6773 hypre_CSRMatrixI(U_diag) = U_diag_i;
6774 if (ctrU)
6775 {
6776 hypre_CSRMatrixData(U_diag) = U_diag_data;
6777 hypre_CSRMatrixJ(U_diag) = U_diag_j;
6778 }
6779 else
6780 {
6781 /* we've allocated some memory, so free if not used */
6782 hypre_TFree(U_diag_j,HYPRE_MEMORY_DEVICE);
6783 hypre_TFree(U_diag_data,HYPRE_MEMORY_DEVICE);
6784 }
6785 /* store (global) total number of nonzeros */
6786 local_nnz = (HYPRE_Real) ctrU;
6787 hypre_MPI_Allreduce(&local_nnz, &total_nnz, 1, HYPRE_MPI_REAL, hypre_MPI_SUM, comm);
6788 hypre_ParCSRMatrixDNumNonzeros(matU) = total_nnz;
6789 /* free memory */
6790 hypre_TFree(wL,HYPRE_MEMORY_HOST);
6791 hypre_TFree(iw,HYPRE_MEMORY_HOST);
6792
6793 /* free external data */
6794 if (E_i)
6795 {
6796 hypre_TFree(E_i, HYPRE_MEMORY_HOST);
6797 }
6798 if (E_j)
6799 {
6800 hypre_TFree(E_j, HYPRE_MEMORY_HOST);
6801 hypre_TFree(E_data, HYPRE_MEMORY_HOST);
6802 }
6803
6804 /* set matrix pointers */
6805 *Lptr = matL;
6806 *Dptr = D_data;
6807 *Uptr = matU;
6808
6809 return hypre_error_flag;
6810 }
6811
6812
6813
6814 /* ILU(k) symbolic factorization for RAS
6815 * n = total rows of input
6816 * lfil = level of fill-in, the k in ILU(k)
6817 * perm = permutation array indicating ordering of factorization. Perm could come from a
6818 * rperm = reverse permutation array, used here to avoid duplicate memory allocation
6819 * iw = working array, used here to avoid duplicate memory allocation
6820 * nLU = size of computed LDU factorization.
6821 * A/L/U/E_i = the I slot of A, L, U and E
6822 * A/L/U/E_j = the J slot of A, L, U and E
6823 * will form global Schur Matrix if nLU < n
6824 */
6825 HYPRE_Int
hypre_ILUSetupILUKRASSymbolic(HYPRE_Int n,HYPRE_Int * A_diag_i,HYPRE_Int * A_diag_j,HYPRE_Int * A_offd_i,HYPRE_Int * A_offd_j,HYPRE_Int * E_i,HYPRE_Int * E_j,HYPRE_Int ext,HYPRE_Int lfil,HYPRE_Int * perm,HYPRE_Int * rperm,HYPRE_Int * iw,HYPRE_Int nLU,HYPRE_Int * L_diag_i,HYPRE_Int * U_diag_i,HYPRE_Int ** L_diag_j,HYPRE_Int ** U_diag_j)6826 hypre_ILUSetupILUKRASSymbolic(HYPRE_Int n, HYPRE_Int *A_diag_i, HYPRE_Int *A_diag_j, HYPRE_Int *A_offd_i, HYPRE_Int *A_offd_j,
6827 HYPRE_Int *E_i, HYPRE_Int *E_j, HYPRE_Int ext,
6828 HYPRE_Int lfil, HYPRE_Int *perm,
6829 HYPRE_Int *rperm, HYPRE_Int *iw, HYPRE_Int nLU,
6830 HYPRE_Int *L_diag_i, HYPRE_Int *U_diag_i,
6831 HYPRE_Int **L_diag_j, HYPRE_Int **U_diag_j)
6832 {
6833 /*
6834 * 1: Setup and create buffers
6835 * A_diag_*: tempory pointer for the diagonal matrix of A and its '*' slot
6836 * ii: outer loop from 0 to nLU - 1
6837 * i: the real col number in diag inside the outer loop
6838 * iw: working array store the reverse of active col number
6839 * iL: working array store the active col number
6840 * iLev: working array store the active level of current row
6841 * lenl/u: current position in iw and so
6842 * ctrL/U/S: global position in J
6843 */
6844
6845 HYPRE_Int *temp_L_diag_j, *temp_U_diag_j, *u_levels;
6846 HYPRE_Int *iL, *iLev;
6847 HYPRE_Int ii, i, j, k, ku, lena, lenl, lenu, lenh, ilev, lev, col, icol;
6848 // HYPRE_Int m = n - nLU;
6849 HYPRE_Int total_rows = ext + n;
6850
6851 /* memory management */
6852 HYPRE_Int ctrL;
6853 HYPRE_Int ctrU;
6854 HYPRE_Int capacity_L;
6855 HYPRE_Int capacity_U;
6856 HYPRE_Int initial_alloc = 0;
6857 HYPRE_Int nnz_A;
6858
6859 /* set iL and iLev to right place in iw array */
6860 iL = iw + total_rows;
6861 iLev = iw + 2*total_rows;
6862
6863 /* setup initial memory used */
6864 nnz_A = A_diag_i[n];
6865 if (n > 0)
6866 {
6867 initial_alloc = (n + ext) + ceil((nnz_A / 2.0) * total_rows / n);
6868 }
6869 capacity_L = initial_alloc;
6870 capacity_U = initial_alloc;
6871
6872 /* allocate other memory for L and U struct */
6873 temp_L_diag_j = hypre_CTAlloc(HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
6874 temp_U_diag_j = hypre_CTAlloc(HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
6875
6876 u_levels = hypre_CTAlloc(HYPRE_Int, capacity_U, HYPRE_MEMORY_HOST);
6877 ctrL = ctrU = 0;
6878
6879 /* set initial value for working array */
6880 for (ii = 0; ii < total_rows; ii++)
6881 {
6882 iw[ii] = -1;
6883 }
6884
6885 /*
6886 * 2: Start of main loop
6887 * those in iL are NEW col index (after permutation)
6888 */
6889 for (ii = 0; ii < nLU; ii++)
6890 {
6891 i = perm[ii];
6892 lenl = 0;
6893 lenh = 0;/* this is the current length of heap */
6894 lenu = ii;
6895 lena = A_diag_i[i+1];
6896 /* put those already inside original pattern, and set their level to 0 */
6897 for (j = A_diag_i[i]; j < lena; j++)
6898 {
6899 /* get the neworder of that col */
6900 col = rperm[A_diag_j[j]];
6901 if (col < ii)
6902 {
6903 /*
6904 * this is an entry in L
6905 * we maintain a heap structure for L part
6906 */
6907 iL[lenh] = col;
6908 iLev[lenh] = 0;
6909 iw[col] = lenh++;
6910 /*now miantian a heap structure*/
6911 hypre_ILUMinHeapAddIIIi(iL,iLev,iw,lenh);
6912 }
6913 else if (col > ii)
6914 {
6915 /* this is an entry in U */
6916 iL[lenu] = col;
6917 iLev[lenu] = 0;
6918 iw[col] = lenu++;
6919 }
6920 }/* end of j loop for adding pattern in original matrix */
6921
6922 /*
6923 * search lower part of current row and update pattern based on level
6924 */
6925 while (lenh > 0)
6926 {
6927 /*
6928 * k is now the new col index after permutation
6929 * the first element of the heap is the smallest
6930 */
6931 k = iL[0];
6932 ilev = iLev[0];
6933 /*
6934 * we now need to maintain the heap structure
6935 */
6936 hypre_ILUMinHeapRemoveIIIi(iL,iLev,iw,lenh);
6937 lenh--;
6938 /* copy to the end of array */
6939 lenl++;
6940 /* reset iw for that, not using anymore */
6941 iw[k]=-1;
6942 hypre_swap2i(iL,iLev,ii-lenl,lenh);
6943 /*
6944 * now the elimination on current row could start.
6945 * eliminate row k (new index) from current row
6946 */
6947 ku = U_diag_i[k+1];
6948 for (j = U_diag_i[k]; j < ku; j++)
6949 {
6950 col = temp_U_diag_j[j];
6951 lev = u_levels[j] + ilev + 1;
6952 /* ignore large level */
6953 icol = iw[col];
6954 /* skill large level */
6955 if (lev > lfil)
6956 {
6957 continue;
6958 }
6959 if (icol < 0)
6960 {
6961 /* not yet in */
6962 if (col < ii)
6963 {
6964 /*
6965 * if we add to the left L, we need to maintian the
6966 * heap structure
6967 */
6968 iL[lenh] = col;
6969 iLev[lenh] = lev;
6970 iw[col] = lenh++;
6971 /*swap it with the element right after the heap*/
6972
6973 /* maintain the heap */
6974 hypre_ILUMinHeapAddIIIi(iL,iLev,iw,lenh);
6975 }
6976 else if (col > ii)
6977 {
6978 iL[lenu] = col;
6979 iLev[lenu] = lev;
6980 iw[col] = lenu++;
6981 }
6982 }
6983 else
6984 {
6985 iLev[icol] = hypre_min(lev, iLev[icol]);
6986 }
6987 }/* end of loop j for level update */
6988 }/* end of while loop for iith row */
6989
6990 /* now update everything, indices, levels and so */
6991 L_diag_i[ii+1] = L_diag_i[ii] + lenl;
6992 if (lenl > 0)
6993 {
6994 /* check if memory is enough */
6995 while (ctrL + lenl > capacity_L)
6996 {
6997 HYPRE_Int tmp = capacity_L;
6998 capacity_L = capacity_L * EXPAND_FACT + 1;
6999 temp_L_diag_j = hypre_TReAlloc_v2(temp_L_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
7000 }
7001 /* now copy L data, reverse order */
7002 for (j = 0; j < lenl; j++)
7003 {
7004 temp_L_diag_j[ctrL+j] = iL[ii-j-1];
7005 }
7006 ctrL += lenl;
7007 }
7008 k = lenu - ii;
7009 U_diag_i[ii+1] = U_diag_i[ii] + k;
7010 if (k > 0)
7011 {
7012 /* check if memory is enough */
7013 while (ctrU + k > capacity_U)
7014 {
7015 HYPRE_Int tmp = capacity_U;
7016 capacity_U = capacity_U * EXPAND_FACT + 1;
7017 temp_U_diag_j = hypre_TReAlloc_v2(temp_U_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
7018 u_levels = hypre_TReAlloc_v2(u_levels, HYPRE_Int, tmp, HYPRE_Int, capacity_U, HYPRE_MEMORY_HOST);
7019 }
7020 //hypre_TMemcpy(temp_U_diag_j+ctrU,iL+ii,HYPRE_Int,k,HYPRE_MEMORY_DEVICE,HYPRE_MEMORY_HOST);
7021 hypre_TMemcpy(temp_U_diag_j+ctrU, iL+ii, HYPRE_Int, k,
7022 HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
7023 hypre_TMemcpy(u_levels+ctrU, iLev+ii, HYPRE_Int, k,
7024 HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
7025 ctrU += k;
7026 }
7027
7028 /* reset iw */
7029 for (j = ii; j < lenu; j++)
7030 {
7031 iw[iL[j]] = -1;
7032 }
7033
7034 }/* end of main loop ii from 0 to nLU-1 */
7035
7036 /*
7037 * Offd part
7038 */
7039 for (ii = nLU; ii < n; ii++)
7040 {
7041 i = perm[ii];
7042 lenl = 0;
7043 lenh = 0;/* this is the current length of heap */
7044 lenu = ii;
7045 lena = A_diag_i[i+1];
7046 /* put those already inside original pattern, and set their level to 0 */
7047 for (j = A_diag_i[i]; j < lena; j++)
7048 {
7049 /* get the neworder of that col */
7050 col = rperm[A_diag_j[j]];
7051 if (col < ii)
7052 {
7053 /*
7054 * this is an entry in L
7055 * we maintain a heap structure for L part
7056 */
7057 iL[lenh] = col;
7058 iLev[lenh] = 0;
7059 iw[col] = lenh++;
7060 /*now miantian a heap structure*/
7061 hypre_ILUMinHeapAddIIIi(iL,iLev,iw,lenh);
7062 }
7063 else if (col > ii)
7064 {
7065 /* this is an entry in U */
7066 iL[lenu] = col;
7067 iLev[lenu] = 0;
7068 iw[col] = lenu++;
7069 }
7070 }/* end of j loop for adding pattern in original matrix */
7071
7072 /* put those already inside offd pattern in, and set their level to 0 */
7073 lena = A_offd_i[i+1];
7074 for (j = A_offd_i[i]; j < lena; j++)
7075 {
7076 /* the offd cols are in order */
7077 col = A_offd_j[j] + n;
7078 /* col for sure to be greater than ii */
7079 iL[lenu] = col;
7080 iLev[lenu] = 0;
7081 iw[col] = lenu++;
7082 }
7083
7084 /*
7085 * search lower part of current row and update pattern based on level
7086 */
7087 while (lenh > 0)
7088 {
7089 /*
7090 * k is now the new col index after permutation
7091 * the first element of the heap is the smallest
7092 */
7093 k = iL[0];
7094 ilev = iLev[0];
7095 /*
7096 * we now need to maintain the heap structure
7097 */
7098 hypre_ILUMinHeapRemoveIIIi(iL,iLev,iw,lenh);
7099 lenh--;
7100 /* copy to the end of array */
7101 lenl++;
7102 /* reset iw for that, not using anymore */
7103 iw[k]=-1;
7104 hypre_swap2i(iL,iLev,ii-lenl,lenh);
7105 /*
7106 * now the elimination on current row could start.
7107 * eliminate row k (new index) from current row
7108 */
7109 ku = U_diag_i[k+1];
7110 for (j = U_diag_i[k]; j < ku; j++)
7111 {
7112 col = temp_U_diag_j[j];
7113 lev = u_levels[j] + ilev + 1;
7114 /* ignore large level */
7115 icol = iw[col];
7116 /* skill large level */
7117 if (lev > lfil)
7118 {
7119 continue;
7120 }
7121 if (icol < 0)
7122 {
7123 /* not yet in */
7124 if (col < ii)
7125 {
7126 /*
7127 * if we add to the left L, we need to maintian the
7128 * heap structure
7129 */
7130 iL[lenh] = col;
7131 iLev[lenh] = lev;
7132 iw[col] = lenh++;
7133 /*swap it with the element right after the heap*/
7134
7135 /* maintain the heap */
7136 hypre_ILUMinHeapAddIIIi(iL,iLev,iw,lenh);
7137 }
7138 else if (col > ii)
7139 {
7140 iL[lenu] = col;
7141 iLev[lenu] = lev;
7142 iw[col] = lenu++;
7143 }
7144 }
7145 else
7146 {
7147 iLev[icol] = hypre_min(lev, iLev[icol]);
7148 }
7149 }/* end of loop j for level update */
7150 }/* end of while loop for iith row */
7151
7152 /* now update everything, indices, levels and so */
7153 L_diag_i[ii+1] = L_diag_i[ii] + lenl;
7154 if (lenl > 0)
7155 {
7156 /* check if memory is enough */
7157 while (ctrL + lenl > capacity_L)
7158 {
7159 HYPRE_Int tmp = capacity_L;
7160 capacity_L = capacity_L * EXPAND_FACT + 1;
7161 temp_L_diag_j = hypre_TReAlloc_v2(temp_L_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
7162 }
7163 /* now copy L data, reverse order */
7164 for (j = 0; j < lenl; j++)
7165 {
7166 temp_L_diag_j[ctrL+j] = iL[ii-j-1];
7167 }
7168 ctrL += lenl;
7169 }
7170 k = lenu - ii;
7171 U_diag_i[ii+1] = U_diag_i[ii] + k;
7172 if (k > 0)
7173 {
7174 /* check if memory is enough */
7175 while (ctrU + k > capacity_U)
7176 {
7177 HYPRE_Int tmp = capacity_U;
7178 capacity_U = capacity_U * EXPAND_FACT + 1;
7179 temp_U_diag_j = hypre_TReAlloc_v2(temp_U_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
7180 u_levels = hypre_TReAlloc_v2(u_levels, HYPRE_Int, tmp, HYPRE_Int, capacity_U, HYPRE_MEMORY_HOST);
7181 }
7182 //hypre_TMemcpy(temp_U_diag_j+ctrU,iL+ii,HYPRE_Int,k,HYPRE_MEMORY_DEVICE,HYPRE_MEMORY_HOST);
7183 hypre_TMemcpy(temp_U_diag_j+ctrU, iL+ii, HYPRE_Int, k,
7184 HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
7185 hypre_TMemcpy(u_levels+ctrU, iLev+ii, HYPRE_Int, k,
7186 HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
7187 ctrU += k;
7188 }
7189
7190 /* reset iw */
7191 for (j = ii; j < lenu; j++)
7192 {
7193 iw[iL[j]] = -1;
7194 }
7195 } /* end of main loop ii from nLU to n */
7196
7197 /* external part matrix */
7198 for (ii = n ; ii < total_rows ; ii ++)
7199 {
7200 i = ii - n;
7201 lenl = 0;
7202 lenh = 0;/* this is the current length of heap */
7203 lenu = ii;
7204 lena = E_i[i+1];
7205 /* put those already inside original pattern, and set their level to 0 */
7206 for (j = E_i[i]; j < lena; j++)
7207 {
7208 /* get the neworder of that col */
7209 col = E_j[j];
7210 if (col < ii)
7211 {
7212 /*
7213 * this is an entry in L
7214 * we maintain a heap structure for L part
7215 */
7216 iL[lenh] = col;
7217 iLev[lenh] = 0;
7218 iw[col] = lenh++;
7219 /*now miantian a heap structure*/
7220 hypre_ILUMinHeapAddIIIi(iL,iLev,iw,lenh);
7221 }
7222 else if (col > ii)
7223 {
7224 /* this is an entry in U */
7225 iL[lenu] = col;
7226 iLev[lenu] = 0;
7227 iw[col] = lenu++;
7228 }
7229 }/* end of j loop for adding pattern in original matrix */
7230
7231 /*
7232 * search lower part of current row and update pattern based on level
7233 */
7234 while (lenh > 0)
7235 {
7236 /*
7237 * k is now the new col index after permutation
7238 * the first element of the heap is the smallest
7239 */
7240 k = iL[0];
7241 ilev = iLev[0];
7242 /*
7243 * we now need to maintain the heap structure
7244 */
7245 hypre_ILUMinHeapRemoveIIIi(iL,iLev,iw,lenh);
7246 lenh--;
7247 /* copy to the end of array */
7248 lenl++;
7249 /* reset iw for that, not using anymore */
7250 iw[k]=-1;
7251 hypre_swap2i(iL,iLev,ii-lenl,lenh);
7252 /*
7253 * now the elimination on current row could start.
7254 * eliminate row k (new index) from current row
7255 */
7256 ku = U_diag_i[k+1];
7257 for (j = U_diag_i[k]; j < ku; j++)
7258 {
7259 col = temp_U_diag_j[j];
7260 lev = u_levels[j] + ilev + 1;
7261 /* ignore large level */
7262 icol = iw[col];
7263 /* skill large level */
7264 if (lev > lfil)
7265 {
7266 continue;
7267 }
7268 if (icol < 0)
7269 {
7270 /* not yet in */
7271 if (col < ii)
7272 {
7273 /*
7274 * if we add to the left L, we need to maintian the
7275 * heap structure
7276 */
7277 iL[lenh] = col;
7278 iLev[lenh] = lev;
7279 iw[col] = lenh++;
7280 /*swap it with the element right after the heap*/
7281
7282 /* maintain the heap */
7283 hypre_ILUMinHeapAddIIIi(iL,iLev,iw,lenh);
7284 }
7285 else if (col > ii)
7286 {
7287 iL[lenu] = col;
7288 iLev[lenu] = lev;
7289 iw[col] = lenu++;
7290 }
7291 }
7292 else
7293 {
7294 iLev[icol] = hypre_min(lev, iLev[icol]);
7295 }
7296 }/* end of loop j for level update */
7297 }/* end of while loop for iith row */
7298
7299 /* now update everything, indices, levels and so */
7300 L_diag_i[ii+1] = L_diag_i[ii] + lenl;
7301 if (lenl > 0)
7302 {
7303 /* check if memory is enough */
7304 while (ctrL + lenl > capacity_L)
7305 {
7306 HYPRE_Int tmp = capacity_L;
7307 capacity_L = capacity_L * EXPAND_FACT + 1;
7308 temp_L_diag_j = hypre_TReAlloc_v2(temp_L_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
7309 }
7310 /* now copy L data, reverse order */
7311 for (j = 0; j < lenl; j++)
7312 {
7313 temp_L_diag_j[ctrL+j] = iL[ii-j-1];
7314 }
7315 ctrL += lenl;
7316 }
7317 k = lenu - ii;
7318 U_diag_i[ii+1] = U_diag_i[ii] + k;
7319 if (k > 0)
7320 {
7321 /* check if memory is enough */
7322 while (ctrU + k > capacity_U)
7323 {
7324 HYPRE_Int tmp = capacity_U;
7325 capacity_U = capacity_U * EXPAND_FACT + 1;
7326 temp_U_diag_j = hypre_TReAlloc_v2(temp_U_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
7327 u_levels = hypre_TReAlloc_v2(u_levels, HYPRE_Int, tmp, HYPRE_Int, capacity_U, HYPRE_MEMORY_HOST);
7328 }
7329 //hypre_TMemcpy(temp_U_diag_j+ctrU,iL+ii,HYPRE_Int,k,HYPRE_MEMORY_DEVICE,HYPRE_MEMORY_HOST);
7330 hypre_TMemcpy(temp_U_diag_j+ctrU, iL+ii, HYPRE_Int, k,
7331 HYPRE_MEMORY_HOST,HYPRE_MEMORY_HOST);
7332 hypre_TMemcpy(u_levels+ctrU, iLev+ii, HYPRE_Int, k,
7333 HYPRE_MEMORY_HOST, HYPRE_MEMORY_HOST);
7334 ctrU += k;
7335 }
7336
7337 /* reset iw */
7338 for (j = ii; j < lenu; j++)
7339 {
7340 iw[iL[j]] = -1;
7341 }
7342
7343 }/* end of main loop ii from n to total_rows */
7344
7345 /*
7346 * 3: Finishing up and free memory
7347 */
7348 hypre_TFree(u_levels,HYPRE_MEMORY_HOST);
7349
7350 *L_diag_j = temp_L_diag_j;
7351 *U_diag_j = temp_U_diag_j;
7352
7353 return hypre_error_flag;
7354 }
7355
7356 /* ILU(k) for RAS
7357 * A: input matrix
7358 * lfil: level of fill-in, the k in ILU(k)
 * perm: permutation array indicating ordering of factorization. Perm could come from a
 *       CF_marker array or a reordering routine.
7361 * nLU: size of computed LDU factorization.
7362 * Lptr, Dptr, Uptr: L, D, U factors.
7363 */
7364 HYPRE_Int
hypre_ILUSetupILUKRAS(hypre_ParCSRMatrix * A,HYPRE_Int lfil,HYPRE_Int * perm,HYPRE_Int nLU,hypre_ParCSRMatrix ** Lptr,HYPRE_Real ** Dptr,hypre_ParCSRMatrix ** Uptr)7365 hypre_ILUSetupILUKRAS(hypre_ParCSRMatrix *A, HYPRE_Int lfil, HYPRE_Int *perm, HYPRE_Int nLU,
7366 hypre_ParCSRMatrix **Lptr, HYPRE_Real** Dptr, hypre_ParCSRMatrix **Uptr)
7367 {
7368 /*
7369 * 1: Setup and create buffers
7370 * matL/U: the ParCSR matrix for L and U
7371 * L/U_diag: the diagonal csr matrix of matL/U
7372 * A_diag_*: tempory pointer for the diagonal matrix of A and its '*' slot
7373 * ii = outer loop from 0 to nLU - 1
7374 * i = the real col number in diag inside the outer loop
7375 * iw = working array store the reverse of active col number
7376 * iL = working array store the active col number
7377 */
7378
7379 /* call ILU0 if lfil is 0 */
7380 if (lfil == 0)
7381 {
7382 return hypre_ILUSetupILU0RAS(A,perm,nLU,Lptr,Dptr,Uptr);
7383 }
7384 HYPRE_Int i, ii, j, k, k1, k2, kl, ku, jpiv, col, icol;
7385 HYPRE_Int *iw;
7386 MPI_Comm comm = hypre_ParCSRMatrixComm(A);
7387 HYPRE_Int num_procs;
7388
7389 /* data objects for A */
7390 hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
7391 hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A);
7392 HYPRE_Real *A_diag_data = hypre_CSRMatrixData(A_diag);
7393 HYPRE_Int *A_diag_i = hypre_CSRMatrixI(A_diag);
7394 HYPRE_Int *A_diag_j = hypre_CSRMatrixJ(A_diag);
7395 HYPRE_Real *A_offd_data = hypre_CSRMatrixData(A_offd);
7396 HYPRE_Int *A_offd_i = hypre_CSRMatrixI(A_offd);
7397 HYPRE_Int *A_offd_j = hypre_CSRMatrixJ(A_offd);
7398
7399 /* data objects for L, D, U */
7400 hypre_ParCSRMatrix *matL;
7401 hypre_ParCSRMatrix *matU;
7402 hypre_CSRMatrix *L_diag;
7403 hypre_CSRMatrix *U_diag;
7404 HYPRE_Real *D_data;
7405 HYPRE_Real *L_diag_data = NULL;
7406 HYPRE_Int *L_diag_i;
7407 HYPRE_Int *L_diag_j = NULL;
7408 HYPRE_Real *U_diag_data = NULL;
7409 HYPRE_Int *U_diag_i;
7410 HYPRE_Int *U_diag_j = NULL;
7411
7412 /* size of problem and external matrix */
7413 HYPRE_Int n = hypre_CSRMatrixNumRows(A_diag);
7414 // HYPRE_Int m = n - nLU;
7415 HYPRE_Int ext = hypre_CSRMatrixNumCols(A_offd);
7416 HYPRE_Int total_rows = n + ext;
7417 HYPRE_BigInt global_num_rows;
7418 HYPRE_BigInt col_starts[2];
7419 HYPRE_Real local_nnz, total_nnz;
7420
7421 /* data objects for E, external matrix */
7422 HYPRE_Int *E_i;
7423 HYPRE_Int *E_j;
7424 HYPRE_Real *E_data;
7425
7426 /* communication */
7427 hypre_ParCSRCommPkg *comm_pkg;
7428 hypre_MPI_Comm_size(comm, &num_procs);
7429 // hypre_ParCSRCommHandle *comm_handle;
7430 // HYPRE_Int *send_buf = NULL;
7431
7432 /* reverse permutation array */
7433 HYPRE_Int *rperm;
7434 /* temp array for old permutation */
7435 HYPRE_Int *perm_old;
7436
7437 /* start setup */
7438 /* check input and get problem size */
7439 n = hypre_CSRMatrixNumRows(A_diag);
7440 if (nLU < 0 || nLU > n)
7441 {
7442 hypre_error_w_msg(HYPRE_ERROR_ARG,"WARNING: nLU out of range.\n");
7443 }
7444
7445 /* Init I array anyway. S's might be freed later */
7446 D_data = hypre_CTAlloc(HYPRE_Real, total_rows, HYPRE_MEMORY_DEVICE);
7447 L_diag_i = hypre_CTAlloc(HYPRE_Int, (total_rows+1), HYPRE_MEMORY_DEVICE);
7448 U_diag_i = hypre_CTAlloc(HYPRE_Int, (total_rows+1), HYPRE_MEMORY_DEVICE);
7449
7450 /* set Comm_Pkg if not yet built */
7451 comm_pkg = hypre_ParCSRMatrixCommPkg(A);
7452 if (!comm_pkg)
7453 {
7454 hypre_MatvecCommPkgCreate(A);
7455 comm_pkg = hypre_ParCSRMatrixCommPkg(A);
7456 }
7457
7458 /*
7459 * 2: Symbolic factorization
7460 * setup iw and rperm first
7461 */
7462 /* allocate work arrays */
7463 iw = hypre_CTAlloc(HYPRE_Int, 5*total_rows, HYPRE_MEMORY_HOST);
7464 rperm = iw + 3*total_rows;
7465 perm_old = perm;
7466 perm = iw + 4*total_rows;
7467 L_diag_i[0] = U_diag_i[0] = 0;
7468 /* get reverse permutation (rperm).
7469 * rperm holds the reordered indexes.
7470 */
7471 for (i = 0; i < n; i++)
7472 {
7473 perm[i] = perm_old[i];
7474 }
7475 for (i = n; i < total_rows; i++)
7476 {
7477 perm[i] = i;
7478 }
7479 for (i = 0; i < total_rows; i++)
7480 {
7481 rperm[perm[i]] = i;
7482 }
7483
7484 /* get external rows */
7485 hypre_ILUBuildRASExternalMatrix(A,rperm,&E_i,&E_j,&E_data);
7486 /* do symbolic factorization */
7487 hypre_ILUSetupILUKRASSymbolic(n, A_diag_i, A_diag_j, A_offd_i, A_offd_j, E_i, E_j, ext, lfil, perm, rperm, iw,
7488 nLU, L_diag_i, U_diag_i, &L_diag_j, &U_diag_j);
7489
7490 /*
7491 * after this, we have our I,J for L, U and S ready, and L sorted
7492 * iw are still -1 after symbolic factorization
7493 * now setup helper array here
7494 */
7495 if (L_diag_i[total_rows])
7496 {
7497 L_diag_data = hypre_CTAlloc(HYPRE_Real, L_diag_i[total_rows], HYPRE_MEMORY_DEVICE);
7498 }
7499 if (U_diag_i[total_rows])
7500 {
7501 U_diag_data = hypre_CTAlloc(HYPRE_Real, U_diag_i[total_rows], HYPRE_MEMORY_DEVICE);
7502 }
7503
7504 /*
7505 * 3: Begin real factorization
7506 * we already have L and U structure ready, so no extra working array needed
7507 */
7508 /* first loop for upper part */
7509 for (ii = 0; ii < nLU; ii++)
7510 {
7511 // get row i
7512 i = perm[ii];
7513 kl = L_diag_i[ii+1];
7514 ku = U_diag_i[ii+1];
7515 k1 = A_diag_i[i];
7516 k2 = A_diag_i[i+1];
7517 /* set up working arrays */
7518 for (j = L_diag_i[ii]; j < kl; j++)
7519 {
7520 col = L_diag_j[j];
7521 iw[col] = j;
7522 }
7523 D_data[ii] = 0.0;
7524 iw[ii] = ii;
7525 for (j = U_diag_i[ii]; j < ku; j++)
7526 {
7527 col = U_diag_j[j];
7528 iw[col] = j;
7529 }
7530 /* copy data from A into L, D and U */
7531 for (j = k1; j < k2; j++)
7532 {
7533 /* compute everything in new index */
7534 col = rperm[A_diag_j[j]];
7535 icol = iw[col];
7536 /* A for sure to be inside the pattern */
7537 if (col < ii)
7538 {
7539 L_diag_data[icol] = A_diag_data[j];
7540 }
7541 else if (col == ii)
7542 {
7543 D_data[ii] = A_diag_data[j];
7544 }
7545 else
7546 {
7547 U_diag_data[icol] = A_diag_data[j];
7548 }
7549 }
7550 /* elimination */
7551 for (j = L_diag_i[ii]; j < kl; j++)
7552 {
7553 jpiv = L_diag_j[j];
7554 L_diag_data[j] *= D_data[jpiv];
7555 ku = U_diag_i[jpiv+1];
7556
7557 for (k = U_diag_i[jpiv]; k < ku; k++)
7558 {
7559 col = U_diag_j[k];
7560 icol = iw[col];
7561 if (icol < 0)
7562 {
7563 /* not in partern */
7564 continue;
7565 }
7566 if (col < ii)
7567 {
7568 /* L part */
7569 L_diag_data[icol] -= L_diag_data[j]*U_diag_data[k];
7570 }
7571 else if (col == ii)
7572 {
7573 /* diag part */
7574 D_data[icol] -= L_diag_data[j]*U_diag_data[k];
7575 }
7576 else
7577 {
7578 /* U part */
7579 U_diag_data[icol] -= L_diag_data[j]*U_diag_data[k];
7580 }
7581 }
7582 }
7583 /* reset working array */
7584 ku = U_diag_i[ii+1];
7585 for (j = L_diag_i[ii]; j < kl; j++)
7586 {
7587 col = L_diag_j[j];
7588 iw[col] = -1;
7589 }
7590 iw[ii] = -1;
7591 for (j = U_diag_i[ii]; j < ku; j++)
7592 {
7593 col = U_diag_j[j];
7594 iw[col] = -1;
7595 }
7596
7597 /* diagonal part (we store the inverse) */
7598 if (fabs(D_data[ii]) < MAT_TOL)
7599 {
7600 D_data[ii] = 1e-06;
7601 }
7602 D_data[ii] = 1./ D_data[ii];
7603
7604 }/* end of loop for upper part */
7605
7606 /* first loop for upper part */
7607 for (ii = nLU; ii < n; ii++)
7608 {
7609 // get row i
7610 i = perm[ii];
7611 kl = L_diag_i[ii+1];
7612 ku = U_diag_i[ii+1];
7613 /* set up working arrays */
7614 for (j = L_diag_i[ii]; j < kl; j++)
7615 {
7616 col = L_diag_j[j];
7617 iw[col] = j;
7618 }
7619 D_data[ii] = 0.0;
7620 iw[ii] = ii;
7621 for (j = U_diag_i[ii]; j < ku; j++)
7622 {
7623 col = U_diag_j[j];
7624 iw[col] = j;
7625 }
7626 /* copy data from A into L, D and U */
7627 k1 = A_diag_i[i];
7628 k2 = A_diag_i[i+1];
7629 for (j = k1; j < k2; j++)
7630 {
7631 /* compute everything in new index */
7632 col = rperm[A_diag_j[j]];
7633 icol = iw[col];
7634 /* A for sure to be inside the pattern */
7635 if (col < ii)
7636 {
7637 L_diag_data[icol] = A_diag_data[j];
7638 }
7639 else if (col == ii)
7640 {
7641 D_data[ii] = A_diag_data[j];
7642 }
7643 else
7644 {
7645 U_diag_data[icol] = A_diag_data[j];
7646 }
7647 }
7648 /* copy data from A_offd into L, D and U */
7649 k1 = A_offd_i[i];
7650 k2 = A_offd_i[i+1];
7651 for (j = k1; j < k2; j++)
7652 {
7653 /* compute everything in new index */
7654 col = A_offd_j[j] + n;
7655 icol = iw[col];
7656 U_diag_data[icol] = A_offd_data[j];
7657 }
7658 /* elimination */
7659 for (j = L_diag_i[ii]; j < kl; j++)
7660 {
7661 jpiv = L_diag_j[j];
7662 L_diag_data[j] *= D_data[jpiv];
7663 ku = U_diag_i[jpiv+1];
7664
7665 for (k = U_diag_i[jpiv]; k < ku; k++)
7666 {
7667 col = U_diag_j[k];
7668 icol = iw[col];
7669 if (icol < 0)
7670 {
7671 /* not in partern */
7672 continue;
7673 }
7674 if (col < ii)
7675 {
7676 /* L part */
7677 L_diag_data[icol] -= L_diag_data[j]*U_diag_data[k];
7678 }
7679 else if (col == ii)
7680 {
7681 /* diag part */
7682 D_data[icol] -= L_diag_data[j]*U_diag_data[k];
7683 }
7684 else
7685 {
7686 /* U part */
7687 U_diag_data[icol] -= L_diag_data[j]*U_diag_data[k];
7688 }
7689 }
7690 }
7691 /* reset working array */
7692 ku = U_diag_i[ii+1];
7693 for (j = L_diag_i[ii]; j < kl; j++)
7694 {
7695 col = L_diag_j[j];
7696 iw[col] = -1;
7697 }
7698 iw[ii] = -1;
7699 for (j = U_diag_i[ii]; j < ku; j++)
7700 {
7701 col = U_diag_j[j];
7702 iw[col] = -1;
7703 }
7704
7705 /* diagonal part (we store the inverse) */
7706 if (fabs(D_data[ii]) < MAT_TOL)
7707 {
7708 D_data[ii] = 1e-06;
7709 }
7710 D_data[ii] = 1./ D_data[ii];
7711
7712 }/* end of loop for lower part */
7713
7714 /* last loop through external */
7715 for (ii = n; ii < total_rows; ii++)
7716 {
7717 // get row i
7718 i = ii - n;
7719 kl = L_diag_i[ii+1];
7720 ku = U_diag_i[ii+1];
7721 k1 = E_i[i];
7722 k2 = E_i[i+1];
7723 /* set up working arrays */
7724 for (j = L_diag_i[ii]; j < kl; j++)
7725 {
7726 col = L_diag_j[j];
7727 iw[col] = j;
7728 }
7729 D_data[ii] = 0.0;
7730 iw[ii] = ii;
7731 for (j = U_diag_i[ii]; j < ku; j++)
7732 {
7733 col = U_diag_j[j];
7734 iw[col] = j;
7735 }
7736 /* copy data from E into L, D and U */
7737 for (j = k1; j < k2; j++)
7738 {
7739 /* compute everything in new index */
7740 col = E_j[j];
7741 icol = iw[col];
7742 /* A for sure to be inside the pattern */
7743 if (col < ii)
7744 {
7745 L_diag_data[icol] = E_data[j];
7746 }
7747 else if (col == ii)
7748 {
7749 D_data[ii] = E_data[j];
7750 }
7751 else
7752 {
7753 U_diag_data[icol] = E_data[j];
7754 }
7755 }
7756 /* elimination */
7757 for (j = L_diag_i[ii]; j < kl; j++)
7758 {
7759 jpiv = L_diag_j[j];
7760 L_diag_data[j] *= D_data[jpiv];
7761 ku = U_diag_i[jpiv+1];
7762
7763 for (k = U_diag_i[jpiv]; k < ku; k++)
7764 {
7765 col = U_diag_j[k];
7766 icol = iw[col];
7767 if (icol < 0)
7768 {
7769 /* not in partern */
7770 continue;
7771 }
7772 if (col < ii)
7773 {
7774 /* L part */
7775 L_diag_data[icol] -= L_diag_data[j]*U_diag_data[k];
7776 }
7777 else if (col == ii)
7778 {
7779 /* diag part */
7780 D_data[icol] -= L_diag_data[j]*U_diag_data[k];
7781 }
7782 else
7783 {
7784 /* U part */
7785 U_diag_data[icol] -= L_diag_data[j]*U_diag_data[k];
7786 }
7787 }
7788 }
7789 /* reset working array */
7790 ku = U_diag_i[ii+1];
7791 for (j = L_diag_i[ii]; j < kl; j++)
7792 {
7793 col = L_diag_j[j];
7794 iw[col] = -1;
7795 }
7796 iw[ii] = -1;
7797 for (j = U_diag_i[ii]; j < ku; j++)
7798 {
7799 col = U_diag_j[j];
7800 iw[col] = -1;
7801 }
7802
7803 /* diagonal part (we store the inverse) */
7804 if (fabs(D_data[ii]) < MAT_TOL)
7805 {
7806 D_data[ii] = 1e-06;
7807 }
7808 D_data[ii] = 1./ D_data[ii];
7809
7810 }/* end of loop for external loop */
7811
7812 /*
7813 * 4: Finishing up and free
7814 */
7815 HYPRE_BigInt big_total_rows = (HYPRE_BigInt)total_rows;
7816 hypre_MPI_Allreduce( &big_total_rows, &global_num_rows, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
7817 /* need to get new column start */
7818 {
7819 HYPRE_BigInt global_start;
7820 hypre_MPI_Scan( &big_total_rows, &global_start, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
7821 col_starts[0] = global_start - total_rows;
7822 col_starts[1] = global_start;
7823 }
7824 /* Assemble LDU matrices */
7825 matL = hypre_ParCSRMatrixCreate( comm,
7826 global_num_rows,
7827 global_num_rows,
7828 col_starts,
7829 col_starts,
7830 0 /* num_cols_offd */,
7831 L_diag_i[total_rows],
7832 0 /* num_nonzeros_offd */);
7833
7834 L_diag = hypre_ParCSRMatrixDiag(matL);
7835 hypre_CSRMatrixI(L_diag) = L_diag_i;
7836 if (L_diag_i[total_rows]>0)
7837 {
7838 hypre_CSRMatrixData(L_diag) = L_diag_data;
7839 hypre_CSRMatrixJ(L_diag) = L_diag_j;
7840 }
7841 else
7842 {
7843 /* we allocated some initial length, so free them */
7844 hypre_TFree(L_diag_j, HYPRE_MEMORY_DEVICE);
7845 }
7846 /* store (global) total number of nonzeros */
7847 local_nnz = (HYPRE_Real) (L_diag_i[total_rows]);
7848 hypre_MPI_Allreduce(&local_nnz, &total_nnz, 1, HYPRE_MPI_REAL, hypre_MPI_SUM, comm);
7849 hypre_ParCSRMatrixDNumNonzeros(matL) = total_nnz;
7850
7851 matU = hypre_ParCSRMatrixCreate( comm,
7852 global_num_rows,
7853 global_num_rows,
7854 col_starts,
7855 col_starts,
7856 0,
7857 U_diag_i[total_rows],
7858 0 );
7859
7860 U_diag = hypre_ParCSRMatrixDiag(matU);
7861 hypre_CSRMatrixI(U_diag) = U_diag_i;
7862 if (U_diag_i[n]>0)
7863 {
7864 hypre_CSRMatrixData(U_diag) = U_diag_data;
7865 hypre_CSRMatrixJ(U_diag) = U_diag_j;
7866 }
7867 else
7868 {
7869 /* we allocated some initial length, so free them */
7870 hypre_TFree(U_diag_j, HYPRE_MEMORY_DEVICE);
7871 }
7872 /* store (global) total number of nonzeros */
7873 local_nnz = (HYPRE_Real) (U_diag_i[total_rows]);
7874 hypre_MPI_Allreduce(&local_nnz, &total_nnz, 1, HYPRE_MPI_REAL, hypre_MPI_SUM, comm);
7875 hypre_ParCSRMatrixDNumNonzeros(matU) = total_nnz;
7876
7877 /* free */
7878 hypre_TFree(iw,HYPRE_MEMORY_HOST);
7879
7880 /* free external data */
7881 if (E_i)
7882 {
7883 hypre_TFree(E_i, HYPRE_MEMORY_HOST);
7884 }
7885 if (E_j)
7886 {
7887 hypre_TFree(E_j, HYPRE_MEMORY_HOST);
7888 hypre_TFree(E_data, HYPRE_MEMORY_HOST);
7889 }
7890
7891 /* set matrix pointers */
7892 *Lptr = matL;
7893 *Dptr = D_data;
7894 *Uptr = matU;
7895
7896 return hypre_error_flag;
7897 }
7898
7899 /* ILUT for RAS
7900 * A: input matrix
7901 * lfil: level of fill-in, the k in ILU(k)
7902 * tol: droptol array in ILUT
7903 * tol[0]: matrix B
7904 * tol[1]: matrix E and F
7905 * tol[2]: matrix S
7906 * perm: permutation array indicating ordering of factorization. Perm could come from a
7907 * CF_marker: array or a reordering routine.
7908 * nLU: size of computed LDU factorization. If nLU < n, Schur compelemnt will be formed
7909 * Lptr, Dptr, Uptr: L, D, U factors.
7910 * Sptr: Schur complement
7911 *
7912 * Keep the largest lfil entries that is greater than some tol relative
7913 * to the input tol and the norm of that row in both L and U
7914 */
7915 HYPRE_Int
hypre_ILUSetupILUTRAS(hypre_ParCSRMatrix * A,HYPRE_Int lfil,HYPRE_Real * tol,HYPRE_Int * perm,HYPRE_Int nLU,hypre_ParCSRMatrix ** Lptr,HYPRE_Real ** Dptr,hypre_ParCSRMatrix ** Uptr)7916 hypre_ILUSetupILUTRAS(hypre_ParCSRMatrix *A, HYPRE_Int lfil, HYPRE_Real *tol,
7917 HYPRE_Int *perm, HYPRE_Int nLU, hypre_ParCSRMatrix **Lptr,
7918 HYPRE_Real** Dptr, hypre_ParCSRMatrix **Uptr)
7919 {
7920 /*
7921 * 1: Setup and create buffers
7922 * matL/U: the ParCSR matrix for L and U
7923 * L/U_diag: the diagonal csr matrix of matL/U
7924 * A_diag_*: tempory pointer for the diagonal matrix of A and its '*' slot
7925 * ii = outer loop from 0 to nLU - 1
7926 * i = the real col number in diag inside the outer loop
7927 * iw = working array store the reverse of active col number
7928 * iL = working array store the active col number
7929 */
7930 HYPRE_Real local_nnz, total_nnz;
7931 HYPRE_Int i, ii, j, k1, k2, k12, k22, kl, ku, col, icol, lenl, lenu, lenhu, lenhlr, lenhll, jpos, jrow;
7932 HYPRE_Real inorm, itolb, itolef, dpiv, lxu;
7933 HYPRE_Int *iw,*iL;
7934 HYPRE_Real *w;
7935
7936 /* memory management */
7937 HYPRE_Int ctrL;
7938 HYPRE_Int ctrU;
7939 HYPRE_Int initial_alloc = 0;
7940 HYPRE_Int capacity_L;
7941 HYPRE_Int capacity_U;
7942 HYPRE_Int nnz_A;
7943
7944 /* communication stuffs for S */
7945 MPI_Comm comm = hypre_ParCSRMatrixComm(A);
7946 HYPRE_Int num_procs;
7947 hypre_ParCSRCommPkg *comm_pkg;
7948 // hypre_ParCSRCommHandle *comm_handle;
7949 HYPRE_BigInt col_starts[2];
7950 // HYPRE_Int num_sends;
7951 // HYPRE_Int begin, end;
7952
7953 /* data objects for A */
7954 hypre_CSRMatrix *A_diag = hypre_ParCSRMatrixDiag(A);
7955 hypre_CSRMatrix *A_offd = hypre_ParCSRMatrixOffd(A);
7956 HYPRE_Real *A_diag_data = hypre_CSRMatrixData(A_diag);
7957 HYPRE_Int *A_diag_i = hypre_CSRMatrixI(A_diag);
7958 HYPRE_Int *A_diag_j = hypre_CSRMatrixJ(A_diag);
7959 HYPRE_Int *A_offd_i = hypre_CSRMatrixI(A_offd);
7960 HYPRE_Int *A_offd_j = hypre_CSRMatrixJ(A_offd);
7961 HYPRE_Real *A_offd_data = hypre_CSRMatrixData(A_offd);
7962
7963 /* data objects for L, D, U */
7964 hypre_ParCSRMatrix *matL;
7965 hypre_ParCSRMatrix *matU;
7966 hypre_CSRMatrix *L_diag;
7967 hypre_CSRMatrix *U_diag;
7968 HYPRE_Real *D_data;
7969 HYPRE_Real *L_diag_data = NULL;
7970 HYPRE_Int *L_diag_i;
7971 HYPRE_Int *L_diag_j = NULL;
7972 HYPRE_Real *U_diag_data = NULL;
7973 HYPRE_Int *U_diag_i;
7974 HYPRE_Int *U_diag_j = NULL;
7975
7976 /* size of problem and external matrix */
7977 HYPRE_Int n = hypre_CSRMatrixNumRows(A_diag);
7978 // HYPRE_Int m = n - nLU;
7979 HYPRE_Int ext = hypre_CSRMatrixNumCols(A_offd);
7980 HYPRE_Int total_rows = n + ext;
7981 HYPRE_BigInt global_num_rows;
7982
7983 /* data objects for E, external matrix */
7984 HYPRE_Int *E_i;
7985 HYPRE_Int *E_j;
7986 HYPRE_Real *E_data;
7987
7988 /* reverse permutation */
7989 HYPRE_Int *rperm;
7990 /* old permutation */
7991 HYPRE_Int *perm_old;
7992
7993 /* start setup
7994 * check input first
7995 */
7996 n = hypre_CSRMatrixNumRows(A_diag);
7997 if (nLU < 0 || nLU > n)
7998 {
7999 hypre_error_w_msg(HYPRE_ERROR_ARG,"WARNING: nLU out of range.\n");
8000 }
8001
8002 /* start set up
8003 * setup communication stuffs first
8004 */
8005 hypre_MPI_Comm_size(comm, &num_procs);
8006 comm_pkg = hypre_ParCSRMatrixCommPkg(A);
8007 /* create if not yet built */
8008 if (!comm_pkg)
8009 {
8010 hypre_MatvecCommPkgCreate(A);
8011 comm_pkg = hypre_ParCSRMatrixCommPkg(A);
8012 }
8013
8014 /* setup initial memory */
8015 nnz_A = A_diag_i[nLU];
8016 if (n > 0)
8017 {
8018 initial_alloc = nLU + ceil(nnz_A / 2.0);
8019 }
8020 capacity_L = initial_alloc;
8021 capacity_U = initial_alloc;
8022
8023 D_data = hypre_CTAlloc(HYPRE_Real, total_rows, HYPRE_MEMORY_DEVICE);
8024 L_diag_i = hypre_CTAlloc(HYPRE_Int, (total_rows+1), HYPRE_MEMORY_DEVICE);
8025 U_diag_i = hypre_CTAlloc(HYPRE_Int, (total_rows+1), HYPRE_MEMORY_DEVICE);
8026
8027 L_diag_j = hypre_CTAlloc(HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
8028 U_diag_j = hypre_CTAlloc(HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
8029 L_diag_data = hypre_CTAlloc(HYPRE_Real, capacity_L, HYPRE_MEMORY_DEVICE);
8030 U_diag_data = hypre_CTAlloc(HYPRE_Real, capacity_U, HYPRE_MEMORY_DEVICE);
8031
8032 ctrL = ctrU = 0;
8033
8034 /* setting up working array */
8035 iw = hypre_CTAlloc(HYPRE_Int,4*total_rows,HYPRE_MEMORY_HOST);
8036 iL = iw + total_rows;
8037 w = hypre_CTAlloc(HYPRE_Real,total_rows,HYPRE_MEMORY_HOST);
8038 for (i = 0; i < total_rows; i++)
8039 {
8040 iw[i] = -1;
8041 }
8042 L_diag_i[0] = U_diag_i[0] = 0;
8043 /* get reverse permutation (rperm).
8044 * rperm holds the reordered indexes.
8045 * rperm[old] -> new
8046 * perm[new] -> old
8047 */
8048 rperm = iw + 2*total_rows;
8049 perm_old = perm;
8050 perm = iw + 3*total_rows;
8051 for (i = 0; i < n; i++)
8052 {
8053 perm[i] = perm_old[i];
8054 }
8055 for (i = n; i < total_rows; i++)
8056 {
8057 perm[i] = i;
8058 }
8059 for (i = 0; i < total_rows; i++)
8060 {
8061 rperm[perm[i]] = i;
8062 }
8063 /* get external matrix */
8064 hypre_ILUBuildRASExternalMatrix(A,rperm,&E_i,&E_j,&E_data);
8065
8066 /*
8067 * 2: Main loop of elimination
8068 * maintain two heaps
8069 * |----->*********<-----|-----*********|
8070 * |col heap***value heap|value in U****|
8071 */
8072
8073 /* main outer loop for upper part */
8074 for (ii = 0 ; ii < nLU; ii++)
8075 {
8076 /* get real row with perm */
8077 i = perm[ii];
8078 k1 = A_diag_i[i];
8079 k2 = A_diag_i[i+1];
8080 kl = ii-1;
8081 /* reset row norm of ith row */
8082 inorm = .0;
8083 for (j = k1; j < k2; j++)
8084 {
8085 inorm += fabs(A_diag_data[j]);
8086 }
8087 if (inorm == .0)
8088 {
8089 hypre_error_w_msg(HYPRE_ERROR_ARG,"WARNING: ILUT with zero row.\n");
8090 }
8091 inorm /= (HYPRE_Real)(k2-k1);
8092 /* set the scaled tol for that row */
8093 itolb = tol[0] * inorm;
8094 itolef = tol[1] * inorm;
8095
8096 /* reset displacement */
8097 lenhll = lenhlr = lenu = 0;
8098 w[ii] = 0.0;
8099 iw[ii] = ii;
8100 /* copy in data from A */
8101 for (j = k1; j < k2; j++)
8102 {
8103 /* get now col number */
8104 col = rperm[A_diag_j[j]];
8105 if (col < ii)
8106 {
8107 /* L part of it */
8108 iL[lenhll] = col;
8109 w[lenhll] = A_diag_data[j];
8110 iw[col] = lenhll++;
8111 /* add to heap, by col number */
8112 hypre_ILUMinHeapAddIRIi(iL,w,iw,lenhll);
8113 }
8114 else if (col == ii)
8115 {
8116 w[ii] = A_diag_data[j];
8117 }
8118 else
8119 {
8120 lenu++;
8121 jpos = lenu + ii;
8122 iL[jpos] = col;
8123 w[jpos] = A_diag_data[j];
8124 iw[col] = jpos;
8125 }
8126 }
8127
8128 /*
8129 * main elimination
8130 * need to maintain 2 heaps for L, one heap for col and one heaps for value
8131 * maintian an array for U, and do qsplit with quick sort after that
8132 * while the heap of col is greater than zero
8133 */
8134 while (lenhll > 0)
8135 {
8136
8137 /* get the next row from top of the heap */
8138 jrow = iL[0];
8139 dpiv = w[0] * D_data[jrow];
8140 w[0] = dpiv;
8141 /* now remove it from the top of the heap */
8142 hypre_ILUMinHeapRemoveIRIi(iL,w,iw,lenhll);
8143 lenhll--;
8144 /*
8145 * reset the drop part to -1
8146 * we don't need this iw anymore
8147 */
8148 iw[jrow] = -1;
8149 /* need to keep this one, move to the end of the heap */
8150 /* no longer need to maintain iw */
8151 hypre_swap2(iL,w,lenhll,kl-lenhlr);
8152 lenhlr++;
8153 hypre_ILUMaxrHeapAddRabsI(w+kl,iL+kl,lenhlr);
8154 /* loop for elimination */
8155 ku = U_diag_i[jrow+1];
8156 for (j = U_diag_i[jrow]; j < ku; j++)
8157 {
8158 col = U_diag_j[j];
8159 icol = iw[col];
8160 lxu = - dpiv*U_diag_data[j];
8161 /* we don't want to fill small number to empty place */
8162 if ((icol == -1) &&
8163 ((col < nLU && fabs(lxu) < itolb) || (col >= nLU && fabs(lxu) < itolef)))
8164 {
8165 continue;
8166 }
8167 if (icol == -1)
8168 {
8169 if (col < ii)
8170 {
8171 /* L part
8172 * not already in L part
8173 * put it to the end of heap
8174 * might overwrite some small entries, no issue
8175 */
8176 iL[lenhll] = col;
8177 w[lenhll] = lxu;
8178 iw[col] = lenhll++;
8179 /* add to heap, by col number */
8180 hypre_ILUMinHeapAddIRIi(iL,w,iw,lenhll);
8181 }
8182 else if (col == ii)
8183 {
8184 w[ii] += lxu;
8185 }
8186 else
8187 {
8188 /*
8189 * not already in U part
8190 * put is to the end of heap
8191 */
8192 lenu++;
8193 jpos = lenu + ii;
8194 iL[jpos] = col;
8195 w[jpos] = lxu;
8196 iw[col] = jpos;
8197 }
8198 }
8199 else
8200 {
8201 w[icol] += lxu;
8202 }
8203 }
8204 }/* while loop for the elimination of current row */
8205
8206 if (fabs(w[ii]) < MAT_TOL)
8207 {
8208 w[ii]=1e-06;
8209 }
8210 D_data[ii] = 1./w[ii];
8211 iw[ii] = -1;
8212
8213 /*
8214 * now pick up the largest lfil from L
8215 * L part is guarantee to be larger than itol
8216 */
8217
8218 lenl = lenhlr < lfil ? lenhlr : lfil;
8219 L_diag_i[ii+1] = L_diag_i[ii] + lenl;
8220 if (lenl > 0)
8221 {
8222 /* test if memory is enough */
8223 while (ctrL + lenl > capacity_L)
8224 {
8225 HYPRE_Int tmp = capacity_L;
8226 capacity_L = capacity_L * EXPAND_FACT + 1;
8227 L_diag_j = hypre_TReAlloc_v2(L_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
8228 L_diag_data = hypre_TReAlloc_v2(L_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_L, HYPRE_MEMORY_DEVICE);
8229 }
8230 ctrL += lenl;
8231 /* copy large data in */
8232 for (j = L_diag_i[ii]; j < ctrL; j++)
8233 {
8234 L_diag_j[j] = iL[kl];
8235 L_diag_data[j] = w[kl];
8236 hypre_ILUMaxrHeapRemoveRabsI(w+kl,iL+kl,lenhlr);
8237 lenhlr--;
8238 }
8239 }
8240 /*
8241 * now reset working array
8242 * L part already reset when move out of heap, only U part
8243 */
8244 ku = lenu+ii;
8245 for (j = ii + 1; j <= ku; j++)
8246 {
8247 iw[iL[j]] = -1;
8248 }
8249
8250 if (lenu < lfil)
8251 {
8252 /* we simply keep all of the data, no need to sort */
8253 lenhu = lenu;
8254 }
8255 else
8256 {
8257 /* need to sort the first small(hopefully) part of it */
8258 lenhu = lfil;
8259 /* quick split, only sort the first small part of the array */
8260 hypre_ILUMaxQSplitRabsI(w,iL,ii+1,ii+lenhu,ii+lenu);
8261 }
8262
8263 U_diag_i[ii+1] = U_diag_i[ii] + lenhu;
8264 if (lenhu > 0)
8265 {
8266 /* test if memory is enough */
8267 while (ctrU + lenhu > capacity_U)
8268 {
8269 HYPRE_Int tmp = capacity_U;
8270 capacity_U = capacity_U * EXPAND_FACT + 1;
8271 U_diag_j = hypre_TReAlloc_v2(U_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
8272 U_diag_data = hypre_TReAlloc_v2(U_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_U, HYPRE_MEMORY_DEVICE);
8273 }
8274 ctrU += lenhu;
8275 /* copy large data in */
8276 for (j = U_diag_i[ii]; j < ctrU; j++)
8277 {
8278 jpos = ii+1+j-U_diag_i[ii];
8279 U_diag_j[j] = iL[jpos];
8280 U_diag_data[j] = w[jpos];
8281 }
8282 }
8283 }/* end of ii loop from 0 to nLU-1 */
8284
8285 /* second outer loop for lower part */
8286 for (ii = nLU; ii < n; ii++)
8287 {
8288 /* get real row with perm */
8289 i = perm[ii];
8290 k1 = A_diag_i[i];
8291 k2 = A_diag_i[i+1];
8292 k12 = A_offd_i[i];
8293 k22 = A_offd_i[i+1];
8294 kl = ii-1;
8295 /* reset row norm of ith row */
8296 inorm = .0;
8297 for (j = k1; j < k2; j++)
8298 {
8299 inorm += fabs(A_diag_data[j]);
8300 }
8301 for (j = k12; j < k22; j++)
8302 {
8303 inorm += fabs(A_offd_data[j]);
8304 }
8305 if (inorm == .0)
8306 {
8307 hypre_error_w_msg(HYPRE_ERROR_ARG,"WARNING: ILUT with zero row.\n");
8308 }
8309 inorm /= (HYPRE_Real)(k2+k22-k1-k12);
8310 /* set the scaled tol for that row */
8311 itolb = tol[0] * inorm;
8312 itolef = tol[1] * inorm;
8313
8314 /* reset displacement */
8315 lenhll = lenhlr = lenu = 0;
8316 w[ii] = 0.0;
8317 iw[ii] = ii;
8318 /* copy in data from A_diag */
8319 for (j = k1; j < k2; j++)
8320 {
8321 /* get now col number */
8322 col = rperm[A_diag_j[j]];
8323 if (col < ii)
8324 {
8325 /* L part of it */
8326 iL[lenhll] = col;
8327 w[lenhll] = A_diag_data[j];
8328 iw[col] = lenhll++;
8329 /* add to heap, by col number */
8330 hypre_ILUMinHeapAddIRIi(iL,w,iw,lenhll);
8331 }
8332 else if (col == ii)
8333 {
8334 w[ii] = A_diag_data[j];
8335 }
8336 else
8337 {
8338 lenu++;
8339 jpos = lenu + ii;
8340 iL[jpos] = col;
8341 w[jpos] = A_diag_data[j];
8342 iw[col] = jpos;
8343 }
8344 }
8345 /* copy in data from A_offd */
8346 for (j = k12; j < k22; j++)
8347 {
8348 /* get now col number */
8349 col = A_offd_j[j] + n;
8350 /* all should greater than ii in lower part */
8351 lenu++;
8352 jpos = lenu + ii;
8353 iL[jpos] = col;
8354 w[jpos] = A_offd_data[j];
8355 iw[col] = jpos;
8356 }
8357
8358 /*
8359 * main elimination
8360 * need to maintain 2 heaps for L, one heap for col and one heaps for value
8361 * maintian an array for U, and do qsplit with quick sort after that
8362 * while the heap of col is greater than zero
8363 */
8364 while (lenhll > 0)
8365 {
8366
8367 /* get the next row from top of the heap */
8368 jrow = iL[0];
8369 dpiv = w[0] * D_data[jrow];
8370 w[0] = dpiv;
8371 /* now remove it from the top of the heap */
8372 hypre_ILUMinHeapRemoveIRIi(iL,w,iw,lenhll);
8373 lenhll--;
8374 /*
8375 * reset the drop part to -1
8376 * we don't need this iw anymore
8377 */
8378 iw[jrow] = -1;
8379 /* need to keep this one, move to the end of the heap */
8380 /* no longer need to maintain iw */
8381 hypre_swap2(iL,w,lenhll,kl-lenhlr);
8382 lenhlr++;
8383 hypre_ILUMaxrHeapAddRabsI(w+kl,iL+kl,lenhlr);
8384 /* loop for elimination */
8385 ku = U_diag_i[jrow+1];
8386 for (j = U_diag_i[jrow]; j < ku; j++)
8387 {
8388 col = U_diag_j[j];
8389 icol = iw[col];
8390 lxu = - dpiv*U_diag_data[j];
8391 /* we don't want to fill small number to empty place */
8392 if ((icol == -1) &&
8393 ((col < nLU && fabs(lxu) < itolb) || (col >= nLU && fabs(lxu) < itolef)))
8394 {
8395 continue;
8396 }
8397 if (icol == -1)
8398 {
8399 if (col < ii)
8400 {
8401 /* L part
8402 * not already in L part
8403 * put it to the end of heap
8404 * might overwrite some small entries, no issue
8405 */
8406 iL[lenhll] = col;
8407 w[lenhll] = lxu;
8408 iw[col] = lenhll++;
8409 /* add to heap, by col number */
8410 hypre_ILUMinHeapAddIRIi(iL,w,iw,lenhll);
8411 }
8412 else if (col == ii)
8413 {
8414 w[ii] += lxu;
8415 }
8416 else
8417 {
8418 /*
8419 * not already in U part
8420 * put is to the end of heap
8421 */
8422 lenu++;
8423 jpos = lenu + ii;
8424 iL[jpos] = col;
8425 w[jpos] = lxu;
8426 iw[col] = jpos;
8427 }
8428 }
8429 else
8430 {
8431 w[icol] += lxu;
8432 }
8433 }
8434 }/* while loop for the elimination of current row */
8435
8436 if (fabs(w[ii]) < MAT_TOL)
8437 {
8438 w[ii]=1e-06;
8439 }
8440 D_data[ii] = 1./w[ii];
8441 iw[ii] = -1;
8442
8443 /*
8444 * now pick up the largest lfil from L
8445 * L part is guarantee to be larger than itol
8446 */
8447
8448 lenl = lenhlr < lfil ? lenhlr : lfil;
8449 L_diag_i[ii+1] = L_diag_i[ii] + lenl;
8450 if (lenl > 0)
8451 {
8452 /* test if memory is enough */
8453 while (ctrL + lenl > capacity_L)
8454 {
8455 HYPRE_Int tmp = capacity_L;
8456 capacity_L = capacity_L * EXPAND_FACT + 1;
8457 L_diag_j = hypre_TReAlloc_v2(L_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
8458 L_diag_data = hypre_TReAlloc_v2(L_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_L, HYPRE_MEMORY_DEVICE);
8459 }
8460 ctrL += lenl;
8461 /* copy large data in */
8462 for (j = L_diag_i[ii]; j < ctrL; j++)
8463 {
8464 L_diag_j[j] = iL[kl];
8465 L_diag_data[j] = w[kl];
8466 hypre_ILUMaxrHeapRemoveRabsI(w+kl,iL+kl,lenhlr);
8467 lenhlr--;
8468 }
8469 }
8470 /*
8471 * now reset working array
8472 * L part already reset when move out of heap, only U part
8473 */
8474 ku = lenu+ii;
8475 for (j = ii + 1; j <= ku; j++)
8476 {
8477 iw[iL[j]] = -1;
8478 }
8479
8480 if (lenu < lfil)
8481 {
8482 /* we simply keep all of the data, no need to sort */
8483 lenhu = lenu;
8484 }
8485 else
8486 {
8487 /* need to sort the first small(hopefully) part of it */
8488 lenhu = lfil;
8489 /* quick split, only sort the first small part of the array */
8490 hypre_ILUMaxQSplitRabsI(w,iL,ii+1,ii+lenhu,ii+lenu);
8491 }
8492
8493 U_diag_i[ii+1] = U_diag_i[ii] + lenhu;
8494 if (lenhu > 0)
8495 {
8496 /* test if memory is enough */
8497 while (ctrU + lenhu > capacity_U)
8498 {
8499 HYPRE_Int tmp = capacity_U;
8500 capacity_U = capacity_U * EXPAND_FACT + 1;
8501 U_diag_j = hypre_TReAlloc_v2(U_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
8502 U_diag_data = hypre_TReAlloc_v2(U_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_U, HYPRE_MEMORY_DEVICE);
8503 }
8504 ctrU += lenhu;
8505 /* copy large data in */
8506 for (j = U_diag_i[ii]; j < ctrU; j++)
8507 {
8508 jpos = ii+1+j-U_diag_i[ii];
8509 U_diag_j[j] = iL[jpos];
8510 U_diag_data[j] = w[jpos];
8511 }
8512 }
8513 }/* end of ii loop from nLU to n */
8514
8515
8516 /* main outer loop for upper part */
8517 for (ii = n; ii < total_rows; ii++)
8518 {
8519 /* get real row with perm */
8520 i = ii-n;
8521 k1 = E_i[i];
8522 k2 = E_i[i+1];
8523 kl = ii-1;
8524 /* reset row norm of ith row */
8525 inorm = .0;
8526 for (j = k1; j < k2; j++)
8527 {
8528 inorm += fabs(E_data[j]);
8529 }
8530 if (inorm == .0)
8531 {
8532 hypre_error_w_msg(HYPRE_ERROR_ARG,"WARNING: ILUT with zero row.\n");
8533 }
8534 inorm /= (HYPRE_Real)(k2-k1);
8535 /* set the scaled tol for that row */
8536 itolb = tol[0] * inorm;
8537 itolef = tol[1] * inorm;
8538
8539 /* reset displacement */
8540 lenhll = lenhlr = lenu = 0;
8541 w[ii] = 0.0;
8542 iw[ii] = ii;
8543 /* copy in data from A */
8544 for (j = k1; j < k2; j++)
8545 {
8546 /* get now col number */
8547 col = rperm[E_j[j]];
8548 if (col < ii)
8549 {
8550 /* L part of it */
8551 iL[lenhll] = col;
8552 w[lenhll] = E_data[j];
8553 iw[col] = lenhll++;
8554 /* add to heap, by col number */
8555 hypre_ILUMinHeapAddIRIi(iL,w,iw,lenhll);
8556 }
8557 else if (col == ii)
8558 {
8559 w[ii] = E_data[j];
8560 }
8561 else
8562 {
8563 lenu++;
8564 jpos = lenu + ii;
8565 iL[jpos] = col;
8566 w[jpos] = E_data[j];
8567 iw[col] = jpos;
8568 }
8569 }
8570
8571 /*
8572 * main elimination
8573 * need to maintain 2 heaps for L, one heap for col and one heaps for value
8574 * maintian an array for U, and do qsplit with quick sort after that
8575 * while the heap of col is greater than zero
8576 */
8577 while (lenhll > 0)
8578 {
8579
8580 /* get the next row from top of the heap */
8581 jrow = iL[0];
8582 dpiv = w[0] * D_data[jrow];
8583 w[0] = dpiv;
8584 /* now remove it from the top of the heap */
8585 hypre_ILUMinHeapRemoveIRIi(iL,w,iw,lenhll);
8586 lenhll--;
8587 /*
8588 * reset the drop part to -1
8589 * we don't need this iw anymore
8590 */
8591 iw[jrow] = -1;
8592 /* need to keep this one, move to the end of the heap */
8593 /* no longer need to maintain iw */
8594 hypre_swap2(iL,w,lenhll,kl-lenhlr);
8595 lenhlr++;
8596 hypre_ILUMaxrHeapAddRabsI(w+kl,iL+kl,lenhlr);
8597 /* loop for elimination */
8598 ku = U_diag_i[jrow+1];
8599 for (j = U_diag_i[jrow]; j < ku; j++)
8600 {
8601 col = U_diag_j[j];
8602 icol = iw[col];
8603 lxu = - dpiv*U_diag_data[j];
8604 /* we don't want to fill small number to empty place */
8605 if ((icol == -1) &&
8606 ((col < nLU && fabs(lxu) < itolb) || (col >= nLU && fabs(lxu) < itolef)))
8607 {
8608 continue;
8609 }
8610 if (icol == -1)
8611 {
8612 if (col < ii)
8613 {
8614 /* L part
8615 * not already in L part
8616 * put it to the end of heap
8617 * might overwrite some small entries, no issue
8618 */
8619 iL[lenhll] = col;
8620 w[lenhll] = lxu;
8621 iw[col] = lenhll++;
8622 /* add to heap, by col number */
8623 hypre_ILUMinHeapAddIRIi(iL,w,iw,lenhll);
8624 }
8625 else if (col == ii)
8626 {
8627 w[ii] += lxu;
8628 }
8629 else
8630 {
8631 /*
8632 * not already in U part
8633 * put is to the end of heap
8634 */
8635 lenu++;
8636 jpos = lenu + ii;
8637 iL[jpos] = col;
8638 w[jpos] = lxu;
8639 iw[col] = jpos;
8640 }
8641 }
8642 else
8643 {
8644 w[icol] += lxu;
8645 }
8646 }
8647 }/* while loop for the elimination of current row */
8648
8649 if (fabs(w[ii]) < MAT_TOL)
8650 {
8651 w[ii]=1e-06;
8652 }
8653 D_data[ii] = 1./w[ii];
8654 iw[ii] = -1;
8655
8656 /*
8657 * now pick up the largest lfil from L
8658 * L part is guarantee to be larger than itol
8659 */
8660
8661 lenl = lenhlr < lfil ? lenhlr : lfil;
8662 L_diag_i[ii+1] = L_diag_i[ii] + lenl;
8663 if (lenl > 0)
8664 {
8665 /* test if memory is enough */
8666 while (ctrL + lenl > capacity_L)
8667 {
8668 HYPRE_Int tmp = capacity_L;
8669 capacity_L = capacity_L * EXPAND_FACT + 1;
8670 L_diag_j = hypre_TReAlloc_v2(L_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_L, HYPRE_MEMORY_DEVICE);
8671 L_diag_data = hypre_TReAlloc_v2(L_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_L, HYPRE_MEMORY_DEVICE);
8672 }
8673 ctrL += lenl;
8674 /* copy large data in */
8675 for (j = L_diag_i[ii]; j < ctrL; j++)
8676 {
8677 L_diag_j[j] = iL[kl];
8678 L_diag_data[j] = w[kl];
8679 hypre_ILUMaxrHeapRemoveRabsI(w+kl,iL+kl,lenhlr);
8680 lenhlr--;
8681 }
8682 }
8683 /*
8684 * now reset working array
8685 * L part already reset when move out of heap, only U part
8686 */
8687 ku = lenu+ii;
8688 for (j = ii + 1; j <= ku; j++)
8689 {
8690 iw[iL[j]] = -1;
8691 }
8692
8693 if (lenu < lfil)
8694 {
8695 /* we simply keep all of the data, no need to sort */
8696 lenhu = lenu;
8697 }
8698 else
8699 {
8700 /* need to sort the first small(hopefully) part of it */
8701 lenhu = lfil;
8702 /* quick split, only sort the first small part of the array */
8703 hypre_ILUMaxQSplitRabsI(w,iL,ii+1,ii+lenhu,ii+lenu);
8704 }
8705
8706 U_diag_i[ii+1] = U_diag_i[ii] + lenhu;
8707 if (lenhu > 0)
8708 {
8709 /* test if memory is enough */
8710 while (ctrU + lenhu > capacity_U)
8711 {
8712 HYPRE_Int tmp = capacity_U;
8713 capacity_U = capacity_U * EXPAND_FACT + 1;
8714 U_diag_j = hypre_TReAlloc_v2(U_diag_j, HYPRE_Int, tmp, HYPRE_Int, capacity_U, HYPRE_MEMORY_DEVICE);
8715 U_diag_data = hypre_TReAlloc_v2(U_diag_data, HYPRE_Real, tmp, HYPRE_Real, capacity_U, HYPRE_MEMORY_DEVICE);
8716 }
8717 ctrU += lenhu;
8718 /* copy large data in */
8719 for (j = U_diag_i[ii]; j < ctrU; j++)
8720 {
8721 jpos = ii+1+j-U_diag_i[ii];
8722 U_diag_j[j] = iL[jpos];
8723 U_diag_data[j] = w[jpos];
8724 }
8725 }
8726 }/* end of ii loop from nLU to total_rows */
8727
8728 /*
8729 * 3: Finishing up and free
8730 */
8731 HYPRE_BigInt big_total_rows = (HYPRE_BigInt)total_rows;
8732 hypre_MPI_Allreduce( &big_total_rows, &global_num_rows, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
8733 /* need to get new column start */
8734 {
8735 HYPRE_BigInt global_start;
8736 hypre_MPI_Scan( &big_total_rows, &global_start, 1, HYPRE_MPI_BIG_INT, hypre_MPI_SUM, comm);
8737 col_starts[0] = global_start - total_rows;
8738 col_starts[1] = global_start;
8739 }
8740
8741 /* create parcsr matrix */
8742 matL = hypre_ParCSRMatrixCreate( comm,
8743 global_num_rows,
8744 global_num_rows,
8745 col_starts,
8746 col_starts,
8747 0,
8748 L_diag_i[total_rows],
8749 0 );
8750
8751 L_diag = hypre_ParCSRMatrixDiag(matL);
8752 hypre_CSRMatrixI(L_diag) = L_diag_i;
8753 if (L_diag_i[total_rows] > 0)
8754 {
8755 hypre_CSRMatrixData(L_diag) = L_diag_data;
8756 hypre_CSRMatrixJ(L_diag) = L_diag_j;
8757 }
8758 else
8759 {
8760 /* we initialized some anyway, so remove if unused */
8761 hypre_TFree(L_diag_j,HYPRE_MEMORY_DEVICE);
8762 hypre_TFree(L_diag_data,HYPRE_MEMORY_DEVICE);
8763 }
8764 /* store (global) total number of nonzeros */
8765 local_nnz = (HYPRE_Real) (L_diag_i[total_rows]);
8766 hypre_MPI_Allreduce(&local_nnz, &total_nnz, 1, HYPRE_MPI_REAL, hypre_MPI_SUM, comm);
8767 hypre_ParCSRMatrixDNumNonzeros(matL) = total_nnz;
8768
8769 matU = hypre_ParCSRMatrixCreate( comm,
8770 global_num_rows,
8771 global_num_rows,
8772 col_starts,
8773 col_starts,
8774 0,
8775 U_diag_i[total_rows],
8776 0 );
8777
8778 U_diag = hypre_ParCSRMatrixDiag(matU);
8779 hypre_CSRMatrixI(U_diag) = U_diag_i;
8780 if (U_diag_i[total_rows] > 0)
8781 {
8782 hypre_CSRMatrixData(U_diag) = U_diag_data;
8783 hypre_CSRMatrixJ(U_diag) = U_diag_j;
8784 }
8785 else
8786 {
8787 /* we initialized some anyway, so remove if unused */
8788 hypre_TFree(U_diag_j,HYPRE_MEMORY_DEVICE);
8789 hypre_TFree(U_diag_data,HYPRE_MEMORY_DEVICE);
8790 }
8791 /* store (global) total number of nonzeros */
8792 local_nnz = (HYPRE_Real) (U_diag_i[total_rows]);
8793 hypre_MPI_Allreduce(&local_nnz, &total_nnz, 1, HYPRE_MPI_REAL, hypre_MPI_SUM, comm);
8794 hypre_ParCSRMatrixDNumNonzeros(matU) = total_nnz;
8795
8796 /* free working array */
8797 hypre_TFree(iw,HYPRE_MEMORY_HOST);
8798 hypre_TFree(w,HYPRE_MEMORY_HOST);
8799
8800 /* free external data */
8801 if (E_i)
8802 {
8803 hypre_TFree(E_i, HYPRE_MEMORY_HOST);
8804 }
8805 if (E_j)
8806 {
8807 hypre_TFree(E_j, HYPRE_MEMORY_HOST);
8808 hypre_TFree(E_data, HYPRE_MEMORY_HOST);
8809 }
8810
8811 /* set matrix pointers */
8812 *Lptr = matL;
8813 *Dptr = D_data;
8814 *Uptr = matU;
8815
8816 return hypre_error_flag;
8817 }
8818