1 #include <hara.h>
2 #include <distributed/distributed_hara_handle.h>
3 #include <distributed/distributed_hmatrix.h>
4 #include <distributed/distributed_geometric_construction.h>
5 #include <distributed/distributed_hgemv.h>
6 #include <petsc/private/matimpl.h>
7 #include <petscsf.h>
8 
9 #define MatHaraGetThrustPointer(v) thrust::raw_pointer_cast((v).data())
10 
11 // TODO HARA:
12 // MatDuplicate buggy with commsize > 1
13 // kernel needs (global?) id of points (issues with Chebyshev points and coupling matrix computation)
// unsymmetric DistributedHMatrix (transpose for distributed_hgemv?)
15 // Unify interface for sequential and parallel?
16 // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled)
17 // Fix includes:
18 // - everything under hara/ dir (except for hara.h)
19 // - fix kblas includes
20 // - namespaces?
21 // Fix naming convention DistributedHMatrix_GPU vs GPU_HMatrix
22 // Diagnostics? FLOPS, MEMORY USAGE IN PARALLEL
23 // Why do we need to template the admissibility condition in the hmatrix construction?
24 //
25 template<typename T, int D>
26 struct PetscPointCloud
27 {
28   static const int Dim = D;
29   typedef T ElementType;
30 
31   struct Point
32   {
33     T x[D];
PointPetscPointCloud::Point34     Point() {
35       for (int i = 0; i < D; i++) x[i] = 0;
36     }
PointPetscPointCloud::Point37     Point(const T xx[]) {
38       for (int i = 0; i < D; i++) x[i] = xx[i];
39     }
40   };
41 
42   std::vector<Point> pts;
43 
44   // Must return the number of data points
kdtree_get_point_countPetscPointCloud45   inline size_t kdtree_get_point_count() const { return pts.size(); }
46 
47   // Returns the dim'th component of the idx'th point in the class:
kdtree_get_ptPetscPointCloud48   inline T kdtree_get_pt(const size_t idx, int dim) const
49   {
50     return pts[idx].x[dim];
51   }
52 
get_ptPetscPointCloud53   inline T get_pt(const size_t idx, int dim) const
54   {
55     return kdtree_get_pt(idx, dim);
56   }
57 
kdtree_ignore_pointPetscPointCloud58   inline bool kdtree_ignore_point(const size_t idx) const { return false; }
59 
60   // Optional bounding-box computation: return false to default to a standard bbox computation loop.
61   //   Return true if the BBOX was already computed by the class and returned in "bb" so it can be avoided to redo it again.
62   //   Look at bb.size() to find out the expected dimensionality (e.g. 2 or 3 for point clouds)
63   template <class BBOX>
kdtree_get_bboxPetscPointCloud64   bool kdtree_get_bbox(BBOX& /* bb */) const { return false; }
65 
66   PetscPointCloud(PetscInt,const T[]);
67   PetscPointCloud(PetscPointCloud&);
68 };
69 
70 template<typename T, int D>
PetscPointCloud(PetscInt n,const T coords[])71 PetscPointCloud<T,D>::PetscPointCloud(PetscInt n, const T coords[])
72 {
73   this->pts.resize(n);
74   for (PetscInt i = 0; i < n; i++) {
75     Point p(coords);
76     this->pts[i] = p;
77     coords += D;
78   }
79 }
80 
81 template<typename T, int D>
PetscPointCloud(PetscPointCloud<T,D> & other)82 PetscPointCloud<T,D>::PetscPointCloud(PetscPointCloud<T,D>& other)
83 {
84   this->pts = other.pts;
85 }
86 
87 template <typename T>
88 using PetscPointCloud3D = PetscPointCloud<T,3>;
89 template <typename T>
90 using PetscPointCloud2D = PetscPointCloud<T,2>;
91 template <typename T>
92 using PetscPointCloud1D = PetscPointCloud<T,1>;
93 
94 template<typename T, int Dim>
95 struct PetscFunctionGenerator
96 {
97 private:
98   MatHaraKernel k;
99   void          *ctx;
100 
101 public:
PetscFunctionGeneratorPetscFunctionGenerator102     PetscFunctionGenerator(MatHaraKernel k, void* ctx) { this->k = k; this->ctx = ctx; }
PetscFunctionGeneratorPetscFunctionGenerator103     PetscFunctionGenerator(PetscFunctionGenerator& other) { this->k = other.k; this->ctx = other.ctx; }
operator ()PetscFunctionGenerator104     T operator()(PetscReal pt1[Dim], PetscReal pt2[Dim])
105     {
106         return (T)(this->k ? (*this->k)(Dim,pt1,pt2,this->ctx) : 0);
107     }
108 };
109 
110 template <typename T>
111 using PetscFunctionGenerator3D = PetscFunctionGenerator<T,3>;
112 template <typename T>
113 using PetscFunctionGenerator2D = PetscFunctionGenerator<T,2>;
114 template <typename T>
115 using PetscFunctionGenerator1D = PetscFunctionGenerator<T,1>;
116 
117 #include <../src/mat/impls/hara/matharasampler.hpp>
118 
/* Private context for MATHARA matrices (hierarchical low-rank approximation
   via the HARA library). */
typedef struct {
  distributedHaraHandle_t handle; /* HARA library handle (distributed variant) */

  /* two different classes at the moment: sequential and MPI-distributed */
  HMatrix *hmatrix;
  DistributedHMatrix *dist_hmatrix;

  /* May use permutations: sf scatters between PETSc and HARA orderings,
     xx/yy are work buffers in HARA ordering, xxs/yys their current column
     capacity, multsetup flags that the scatter has been built */
  PetscSF sf;
  PetscLayout hara_rmap;
  thrust::host_vector<PetscScalar> *xx,*yy;
  PetscInt xxs,yys;
  PetscBool multsetup;

  /* GPU mirrors of the above */
#if defined(HARA_USE_GPU)
  GPU_HMatrix *hmatrix_gpu;
  DistributedHMatrix_GPU *dist_hmatrix_gpu;
  thrust::device_vector<PetscScalar> *xx_gpu,*yy_gpu;
  PetscInt xxs_gpu,yys_gpu;
#endif

  /* construction from matvecs: wraps another Mat as a sampling oracle */
  PetscMatrixSampler* sampler;

  /* Admissibility (eta) and leaf size of the hierarchical clustering */
  PetscReal eta;
  PetscInt  leafsize;

  /* for dof reordering: only the cloud matching spacedim is non-NULL */
  PetscInt spacedim;
  PetscPointCloud1D<PetscReal> *ptcloud1;
  PetscPointCloud2D<PetscReal> *ptcloud2;
  PetscPointCloud3D<PetscReal> *ptcloud3;

  /* kernel for generating matrix entries (dimension-specific functors) */
  PetscFunctionGenerator1D<PetscScalar> *kernel1;
  PetscFunctionGenerator2D<PetscScalar> *kernel2;
  PetscFunctionGenerator3D<PetscScalar> *kernel3;

  /* customization (see MatSetFromOptions_HARA for the matching options) */
  PetscInt  basisord;           /* basis order for off-diagonal sampling */
  PetscInt  max_rank;           /* maximum rank for matvec construction */
  PetscInt  bs;                 /* number of concurrent samples */
  PetscReal rtol;               /* relative tolerance for sampling */
  PetscInt  norm_max_samples;   /* iterations for approximate MatNorm */
  PetscBool check_construction; /* verify construction error at assembly */

  /* keeps track of MatScale values (applied lazily during MatMult) */
  PetscScalar s;
} Mat_HARA;
170 
/* Destroys the MATHARA context: frees the HARA handle, the CPU and GPU
   H-matrix representations, permutation/work data, point clouds and kernels,
   then detaches the composed MatProduct hooks and the type name. */
static PetscErrorCode MatDestroy_HARA(Mat A)
{
  Mat_HARA       *a = (Mat_HARA*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  haraDestroyDistributedHandle(a->handle);
  delete a->hmatrix;
  delete a->dist_hmatrix;
  ierr = PetscSFDestroy(&a->sf);CHKERRQ(ierr);
  ierr = PetscLayoutDestroy(&a->hara_rmap);CHKERRQ(ierr);
  delete a->xx;
  delete a->yy;
#if defined(HARA_USE_GPU)
  delete a->hmatrix_gpu;
  delete a->dist_hmatrix_gpu;
  delete a->xx_gpu;
  delete a->yy_gpu;
#endif
  delete a->sampler;
  delete a->ptcloud1;
  delete a->ptcloud2;
  delete a->ptcloud3;
  delete a->kernel1;
  delete a->kernel2;
  delete a->kernel3;
  /* remove the MatProduct callbacks composed at creation time */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_hara_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_hara_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_hara_mpidense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_hara_mpidensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)A,NULL);CHKERRQ(ierr);
  ierr = PetscFree(A->data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
205 
/* Builds a star forest that communicates nv "columns" at once by replicating
   the graph of sf: entry i of column v maps to index i + v*ld, where ld is the
   leading dimension on the corresponding side.
     ldr - root-side leading dimension (PETSC_DECIDE -> number of local roots)
     ldl - leaf-side leading dimension (PETSC_DECIDE -> max local leaf index + 1)
   For nv == 1 the input sf itself is returned (with an added reference). */
static PetscErrorCode PetscSFGetVectorSF(PetscSF sf, PetscInt nv, PetscInt ldr, PetscInt ldl, PetscSF *vsf)
{
  PetscSF           rankssf;
  const PetscSFNode *iremote;
  PetscSFNode       *viremote,*rremotes;
  const PetscInt    *ilocal;
  PetscInt          *vilocal = NULL,*ldrs;
  const PetscMPIInt *ranks;
  PetscMPIInt       *sranks;
  PetscInt          nranks,nr,nl,vnr,vnl,i,v,j,maxl;
  MPI_Comm          comm;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  if (nv == 1) {
    /* nothing to replicate: hand back the original SF */
    ierr = PetscObjectReference((PetscObject)sf);CHKERRQ(ierr);
    *vsf = sf;
    PetscFunctionReturn(0);
  }
  ierr = PetscObjectGetComm((PetscObject)sf,&comm);CHKERRQ(ierr);
  ierr = PetscSFGetGraph(sf,&nr,&nl,&ilocal,&iremote);CHKERRQ(ierr);
  ierr = PetscSFGetLeafRange(sf,NULL,&maxl);CHKERRQ(ierr);
  maxl += 1;
  if (ldl == PETSC_DECIDE) ldl = maxl;
  if (ldr == PETSC_DECIDE) ldr = nr;
  if (ldr < nr) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Invalid leading dimension %D < %D",ldr,nr);
  if (ldl < maxl) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Invalid leading dimension %D < %D",ldl,maxl);
  vnr  = nr*nv;
  vnl  = nl*nv;
  ierr = PetscMalloc1(vnl,&viremote);CHKERRQ(ierr);
  if (ilocal) {
    ierr = PetscMalloc1(vnl,&vilocal);CHKERRQ(ierr);
  }

  /* TODO: Should this special SF be available, e.g.
     PetscSFGetRanksSF or similar? */
  /* Each root rank broadcasts its own ldr to its neighbors (ldrs[]), since
     remote indices must be scaled by the OWNER's leading dimension. */
  ierr = PetscSFGetRootRanks(sf,&nranks,&ranks,NULL,NULL,NULL);CHKERRQ(ierr);
  ierr = PetscMalloc1(nranks,&sranks);CHKERRQ(ierr);
  ierr = PetscArraycpy(sranks,ranks,nranks);CHKERRQ(ierr);
  ierr = PetscSortMPIInt(nranks,sranks);CHKERRQ(ierr);
  ierr = PetscMalloc1(nranks,&rremotes);CHKERRQ(ierr);
  for (i=0;i<nranks;i++) {
    rremotes[i].rank  = sranks[i];
    rremotes[i].index = 0;
  }
  ierr = PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,&rankssf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(rankssf,1,nranks,NULL,PETSC_OWN_POINTER,rremotes,PETSC_OWN_POINTER);CHKERRQ(ierr);
  ierr = PetscMalloc1(nranks,&ldrs);CHKERRQ(ierr);
  ierr = PetscSFBcastBegin(rankssf,MPIU_INT,&ldr,ldrs);CHKERRQ(ierr);
  ierr = PetscSFBcastEnd(rankssf,MPIU_INT,&ldr,ldrs);CHKERRQ(ierr);
  ierr = PetscSFDestroy(&rankssf);CHKERRQ(ierr);

  /* replicate each leaf->root edge for all nv columns, offsetting remote
     indices by the owner's ldrs[j] and local indices by ldl */
  j = -1;
  for (i=0;i<nl;i++) {
    const PetscInt r  = iremote[i].rank;
    const PetscInt ii = iremote[i].index;

    if (j < 0 || sranks[j] != r) { /* cache the last rank lookup */
      ierr = PetscFindMPIInt(r,nranks,sranks,&j);CHKERRQ(ierr);
    }
    if (j < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unable to locate neighbor rank %D",r);
    for (v=0;v<nv;v++) {
      viremote[v*nl + i].rank  = r;
      viremote[v*nl + i].index = v*ldrs[j] + ii;
      if (ilocal) vilocal[v*nl + i] = v*ldl + ilocal[i];
    }
  }
  ierr = PetscFree(sranks);CHKERRQ(ierr);
  ierr = PetscFree(ldrs);CHKERRQ(ierr);
  ierr = PetscSFCreate(comm,vsf);CHKERRQ(ierr);
  ierr = PetscSFSetGraph(*vsf,vnr,vnl,vilocal,PETSC_OWN_POINTER,viremote,PETSC_OWN_POINTER);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
279 
VecSign(Vec v,Vec s)280 static PetscErrorCode VecSign(Vec v, Vec s)
281 {
282   const PetscScalar *av;
283   PetscScalar       *as;
284   PetscInt          i,n;
285   PetscErrorCode    ierr;
286 
287   PetscFunctionBegin;
288   PetscValidHeaderSpecific(v,VEC_CLASSID,1);
289   PetscValidHeaderSpecific(s,VEC_CLASSID,2);
290   ierr = VecGetArrayRead(v,&av);CHKERRQ(ierr);
291   ierr = VecGetArrayWrite(s,&as);CHKERRQ(ierr);
292   ierr = VecGetLocalSize(s,&n);CHKERRQ(ierr);
293   ierr = VecGetLocalSize(v,&i);CHKERRQ(ierr);
294   if (i != n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Invalid local sizes %D != %D",i,n);
295   for (i=0;i<n;i++) as[i] = PetscAbsScalar(av[i]) < 0 ? -1. : 1.;
296   ierr = VecRestoreArrayWrite(s,&as);CHKERRQ(ierr);
297   ierr = VecRestoreArrayRead(v,&av);CHKERRQ(ierr);
298   PetscFunctionReturn(0);
299 }
300 
/* these are approximate norms */
/* NORM_2: Estimating the matrix p-norm Nicholas J. Higham
   NORM_1/NORM_INFINITY: A block algorithm for matrix 1-norm estimation, with an application to 1-norm pseudospectra Higham, Nicholas J. and Tisseur, Francoise */
/* Estimates ||A|| using only MatMult/MatMultTranspose, so it works for
   matrix-free operators. normsamples < 0 selects a default iteration count. */
static PetscErrorCode MatApproximateNorm_Private(Mat A, NormType normtype, PetscInt normsamples, PetscReal* n)
{
  Vec            x,y,w,z;
  PetscReal      normz,adot;
  PetscScalar    dot;
  PetscInt       i,j,N,jold = -1;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  switch (normtype) {
  case NORM_INFINITY:
  case NORM_1:
    if (normsamples < 0) normsamples = 10; /* pure guess */
    if (normtype == NORM_INFINITY) {
      /* ||A||_inf = ||A^T||_1: run the 1-norm estimator on the transpose */
      Mat B;
      ierr = MatCreateTranspose(A,&B);CHKERRQ(ierr);
      A = B;
    } else {
      /* take a reference so the unconditional MatDestroy below is safe */
      ierr = PetscObjectReference((PetscObject)A);CHKERRQ(ierr);
    }
    ierr = MatCreateVecs(A,&x,&y);CHKERRQ(ierr);
    ierr = MatCreateVecs(A,&z,&w);CHKERRQ(ierr);
    ierr = VecGetSize(x,&N);CHKERRQ(ierr);
    ierr = VecSet(x,1./N);CHKERRQ(ierr); /* start from the uniform vector */
    ierr = VecSetOption(x,VEC_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE);CHKERRQ(ierr);
    *n   = 0.0;
    for (i = 0; i < normsamples; i++) {
      ierr = MatMult(A,x,y);CHKERRQ(ierr);
      ierr = VecSign(y,w);CHKERRQ(ierr);
      ierr = MatMultTranspose(A,w,z);CHKERRQ(ierr);
      ierr = VecNorm(z,NORM_INFINITY,&normz);CHKERRQ(ierr);
      ierr = VecDot(x,z,&dot);CHKERRQ(ierr);
      adot = PetscAbsScalar(dot);
      ierr = PetscInfo4(A,"%s norm it %D -> (%g %g)\n",NormTypes[normtype],i,(double)normz,(double)adot);CHKERRQ(ierr);
      if (normz <= adot && i > 0) { /* converged: ||y||_1 is the estimate */
        ierr = VecNorm(y,NORM_1,n);CHKERRQ(ierr);
        break;
      }
      /* next iterate is e_j, the unit vector at the largest entry of z */
      ierr = VecSet(x,0.);CHKERRQ(ierr);
      ierr = VecMax(z,&j,&normz);CHKERRQ(ierr);
      if (j == jold) { /* cycling: accept the current estimate */
        ierr = VecNorm(y,NORM_1,n);CHKERRQ(ierr);
        ierr = PetscInfo2(A,"%s norm it %D -> breakdown (j==jold)\n",NormTypes[normtype],i);CHKERRQ(ierr);
        break;
      }
      jold = j;
      ierr = VecSetValue(x,j,1.0,INSERT_VALUES);CHKERRQ(ierr);
      ierr = VecAssemblyBegin(x);CHKERRQ(ierr);
      ierr = VecAssemblyEnd(x);CHKERRQ(ierr);
    }
    ierr = MatDestroy(&A);CHKERRQ(ierr); /* drops the reference (or the transpose wrapper) */
    ierr = VecDestroy(&x);CHKERRQ(ierr);
    ierr = VecDestroy(&w);CHKERRQ(ierr);
    ierr = VecDestroy(&y);CHKERRQ(ierr);
    ierr = VecDestroy(&z);CHKERRQ(ierr);
    break;
  case NORM_2:
    /* power iteration on A^T A; *n converges to the largest singular value */
    if (normsamples < 0) normsamples = 20; /* pure guess */
    ierr = MatCreateVecs(A,&x,&y);CHKERRQ(ierr);
    ierr = MatCreateVecs(A,&z,NULL);CHKERRQ(ierr);
    ierr = VecSetRandom(x,NULL);CHKERRQ(ierr);
    ierr = VecNormalize(x,NULL);CHKERRQ(ierr);
    *n   = 0.0;
    for (i = 0; i < normsamples; i++) {
      ierr = MatMult(A,x,y);CHKERRQ(ierr);
      ierr = VecNormalize(y,n);CHKERRQ(ierr);
      ierr = MatMultTranspose(A,y,z);CHKERRQ(ierr);
      ierr = VecNorm(z,NORM_2,&normz);CHKERRQ(ierr);
      ierr = VecDot(x,z,&dot);CHKERRQ(ierr);
      adot = PetscAbsScalar(dot);
      ierr = PetscInfo5(A,"%s norm it %D -> %g (%g %g)\n",NormTypes[normtype],i,(double)*n,(double)normz,(double)adot);CHKERRQ(ierr);
      if (normz <= adot) break; /* converged */
      if (i < normsamples - 1) {
        Vec t;

        /* swap x <-> z so the normalized z becomes the next iterate */
        ierr = VecNormalize(z,NULL);CHKERRQ(ierr);
        t = x;
        x = z;
        z = t;
      }
    }
    ierr = VecDestroy(&x);CHKERRQ(ierr);
    ierr = VecDestroy(&y);CHKERRQ(ierr);
    ierr = VecDestroy(&z);CHKERRQ(ierr);
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"%s norm not supported",NormTypes[normtype]);
  }
  ierr = PetscInfo3(A,"%s norm %g computed in %D iterations\n",NormTypes[normtype],(double)*n,i);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
395 
MatNorm_HARA(Mat A,NormType normtype,PetscReal * n)396 PETSC_EXTERN PetscErrorCode MatNorm_HARA(Mat A, NormType normtype, PetscReal* n)
397 {
398   PetscErrorCode ierr;
399   PetscBool      ishara;
400   PetscInt       nmax = PETSC_DECIDE;
401 
402   PetscFunctionBegin;
403   ierr = PetscObjectTypeCompare((PetscObject)A,MATHARA,&ishara);CHKERRQ(ierr);
404   if (ishara) {
405     Mat_HARA *a = (Mat_HARA*)A->data;
406 
407     nmax = a->norm_max_samples;
408   }
409   ierr = MatApproximateNorm_Private(A,normtype,nmax,n);CHKERRQ(ierr);
410   PetscFunctionReturn(0);
411 }
412 
/* Computes C = hara->s * op(A) * B for dense B and C (op = transpose when
   transA; transpose in parallel is only allowed for symmetric A, where it is
   a no-op). When a permutation SF is present, columns of B/C are scattered
   to/from HARA ordering through "vector" SFs (one SF moving all N columns at
   once), cached on B and C under "_mathara_vectorsf". Dispatches to the GPU
   H-matrix when A is not bound to the CPU. */
static PetscErrorCode MatMultNKernel_HARA(Mat A, PetscBool transA, Mat B, Mat C)
{
  Mat_HARA       *hara = (Mat_HARA*)A->data;
  haraHandle_t   handle = hara->handle->handle;
  PetscBool      boundtocpu = PETSC_TRUE;
  PetscScalar    *xx,*yy,*uxx,*uyy;
  PetscInt       blda,clda;
  PetscSF        bsf,csf;
  PetscErrorCode ierr;

  PetscFunctionBegin;
#if defined(HARA_USE_GPU)
  boundtocpu = A->boundtocpu;
#endif
  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  if (hara->sf) {
    PetscInt n;

    /* build (or fetch cached) vector SFs that scatter all columns at once */
    ierr = PetscSFGetGraph(hara->sf,NULL,&n,NULL,NULL);CHKERRQ(ierr);
    ierr = PetscObjectQuery((PetscObject)B,"_mathara_vectorsf",(PetscObject*)&bsf);CHKERRQ(ierr);
    if (!bsf) {
      ierr = PetscSFGetVectorSF(hara->sf,B->cmap->N,blda,PETSC_DECIDE,&bsf);CHKERRQ(ierr);
      ierr = PetscObjectCompose((PetscObject)B,"_mathara_vectorsf",(PetscObject)bsf);CHKERRQ(ierr);
      ierr = PetscObjectDereference((PetscObject)bsf);CHKERRQ(ierr);
    }
    ierr = PetscObjectQuery((PetscObject)C,"_mathara_vectorsf",(PetscObject*)&csf);CHKERRQ(ierr);
    if (!csf) {
      ierr = PetscSFGetVectorSF(hara->sf,B->cmap->N,clda,PETSC_DECIDE,&csf);CHKERRQ(ierr);
      ierr = PetscObjectCompose((PetscObject)C,"_mathara_vectorsf",(PetscObject)csf);CHKERRQ(ierr);
      ierr = PetscObjectDereference((PetscObject)csf);CHKERRQ(ierr);
    }
    /* permuted work buffers are packed with leading dimension n */
    blda = n;
    clda = n;
  }
  if (!boundtocpu) {
#if defined(HARA_USE_GPU)
    PetscBool ciscuda,biscuda;

    if (hara->sf) {
      PetscInt n;

      /* grow the device work buffers to hold all columns in HARA ordering */
      /* NOTE(review): the guard compares the cached column count against the
         local B->cmap->n but stores the global B->cmap->N — confirm intended */
      ierr = PetscSFGetGraph(hara->sf,NULL,&n,NULL,NULL);CHKERRQ(ierr);
      if (hara->xxs_gpu < B->cmap->n) { hara->xx_gpu->resize(n*B->cmap->N); hara->xxs_gpu = B->cmap->N; }
      if (hara->yys_gpu < B->cmap->n) { hara->yy_gpu->resize(n*B->cmap->N); hara->yys_gpu = B->cmap->N; }
    }
    /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */
    ierr = PetscObjectTypeCompareAny((PetscObject)B,&biscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,"");CHKERRQ(ierr);
    if (!biscuda) {
      ierr = MatConvert(B,MATDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
    }
    ierr = PetscObjectTypeCompareAny((PetscObject)C,&ciscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,"");CHKERRQ(ierr);
    if (!ciscuda) {
      C->assembled = PETSC_TRUE; /* allow in-place conversion of the unassembled result */
      ierr = MatConvert(C,MATDENSECUDA,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
    }
    ierr = MatDenseCUDAGetArrayRead(B,(const PetscScalar**)&xx);CHKERRQ(ierr);
    ierr = MatDenseCUDAGetArrayWrite(C,&yy);CHKERRQ(ierr);
    if (hara->sf) {
      /* scatter B into HARA ordering */
      uxx  = MatHaraGetThrustPointer(*hara->xx_gpu);
      uyy  = MatHaraGetThrustPointer(*hara->yy_gpu);
      ierr = PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx);CHKERRQ(ierr);
      ierr = PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx);CHKERRQ(ierr);
    } else {
      uxx = xx;
      uyy = yy;
    }
    if (hara->dist_hmatrix_gpu) {
      if (transA && !A->symmetric) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
      distributed_hgemv(/* transA ? HARA_Trans : HARA_NoTrans, */hara->s, *hara->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, hara->handle);
    } else {
      hgemv(transA ? HARA_Trans : HARA_NoTrans, hara->s, *hara->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
    }
    ierr = MatDenseCUDARestoreArrayRead(B,(const PetscScalar**)&xx);CHKERRQ(ierr);
    if (hara->sf) {
      /* gather the result back into PETSc ordering */
      ierr = PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPIU_REPLACE);CHKERRQ(ierr);
      ierr = PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPIU_REPLACE);CHKERRQ(ierr);
    }
    ierr = MatDenseCUDARestoreArrayWrite(C,&yy);CHKERRQ(ierr);
    if (!biscuda) { /* undo the on-the-fly conversions */
      ierr = MatConvert(B,MATDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
    }
    if (!ciscuda) {
      ierr = MatConvert(C,MATDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
    }
#endif
  } else {
    if (hara->sf) {
      PetscInt n;

      /* grow the host work buffers (same NOTE(review) as the GPU path) */
      ierr = PetscSFGetGraph(hara->sf,NULL,&n,NULL,NULL);CHKERRQ(ierr);
      if (hara->xxs < B->cmap->n) { hara->xx->resize(n*B->cmap->N); hara->xxs = B->cmap->N; }
      if (hara->yys < B->cmap->n) { hara->yy->resize(n*B->cmap->N); hara->yys = B->cmap->N; }
    }
    ierr = MatDenseGetArrayRead(B,(const PetscScalar**)&xx);CHKERRQ(ierr);
    ierr = MatDenseGetArrayWrite(C,&yy);CHKERRQ(ierr);
    if (hara->sf) {
      uxx  = MatHaraGetThrustPointer(*hara->xx);
      uyy  = MatHaraGetThrustPointer(*hara->yy);
      ierr = PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx);CHKERRQ(ierr);
      ierr = PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx);CHKERRQ(ierr);
    } else {
      uxx = xx;
      uyy = yy;
    }
    if (hara->dist_hmatrix) {
      if (transA && !A->symmetric) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
      distributed_hgemv(/*transA ? HARA_Trans : HARA_NoTrans, */hara->s, *hara->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, hara->handle);
    } else {
      hgemv(transA ? HARA_Trans : HARA_NoTrans, hara->s, *hara->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
    }
    ierr = MatDenseRestoreArrayRead(B,(const PetscScalar**)&xx);CHKERRQ(ierr);
    if (hara->sf) {
      ierr = PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPIU_REPLACE);CHKERRQ(ierr);
      ierr = PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPIU_REPLACE);CHKERRQ(ierr);
    }
    ierr = MatDenseRestoreArrayWrite(C,&yy);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
533 
MatProductNumeric_HARA(Mat C)534 static PetscErrorCode MatProductNumeric_HARA(Mat C)
535 {
536   Mat_Product    *product = C->product;
537   PetscErrorCode ierr;
538 
539   PetscFunctionBegin;
540   MatCheckProduct(C,1);
541   switch (product->type) {
542   case MATPRODUCT_AB:
543     ierr = MatMultNKernel_HARA(product->A,PETSC_FALSE,product->B,C);CHKERRQ(ierr);
544     break;
545   case MATPRODUCT_AtB:
546     ierr = MatMultNKernel_HARA(product->A,PETSC_TRUE,product->B,C);CHKERRQ(ierr);
547     break;
548   default:
549     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]);
550   }
551   PetscFunctionReturn(0);
552 }
553 
MatProductSymbolic_HARA(Mat C)554 static PetscErrorCode MatProductSymbolic_HARA(Mat C)
555 {
556   PetscErrorCode ierr;
557   Mat_Product    *product = C->product;
558   PetscBool      cisdense;
559   Mat            A,B;
560 
561   PetscFunctionBegin;
562   MatCheckProduct(C,1);
563   A = product->A;
564   B = product->B;
565   switch (product->type) {
566   case MATPRODUCT_AB:
567     ierr = MatSetSizes(C,A->rmap->n,B->cmap->n,A->rmap->N,B->cmap->N);CHKERRQ(ierr);
568     ierr = MatSetBlockSizesFromMats(C,product->A,product->B);CHKERRQ(ierr);
569     ierr = PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,"");CHKERRQ(ierr);
570     if (!cisdense) { ierr = MatSetType(C,((PetscObject)product->B)->type_name);CHKERRQ(ierr); }
571     ierr = MatSetUp(C);CHKERRQ(ierr);
572     break;
573   case MATPRODUCT_AtB:
574     ierr = MatSetSizes(C,A->cmap->n,B->cmap->n,A->cmap->N,B->cmap->N);CHKERRQ(ierr);
575     ierr = MatSetBlockSizesFromMats(C,product->A,product->B);CHKERRQ(ierr);
576     ierr = PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,"");CHKERRQ(ierr);
577     if (!cisdense) { ierr = MatSetType(C,((PetscObject)product->B)->type_name);CHKERRQ(ierr); }
578     ierr = MatSetUp(C);CHKERRQ(ierr);
579     break;
580   default:
581     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]);
582   }
583   C->ops->productsymbolic = NULL;
584   C->ops->productnumeric = MatProductNumeric_HARA;
585   PetscFunctionReturn(0);
586 }
587 
MatProductSetFromOptions_HARA(Mat C)588 static PetscErrorCode MatProductSetFromOptions_HARA(Mat C)
589 {
590   PetscFunctionBegin;
591   MatCheckProduct(C,1);
592   if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) {
593     C->ops->productsymbolic = MatProductSymbolic_HARA;
594   }
595   PetscFunctionReturn(0);
596 }
597 
/* Computes y = sy*y + hara->s * op(A) * x (op = transpose when trans;
   transpose in parallel is only allowed for symmetric A, where it is a
   no-op). Single-vector analogue of MatMultNKernel_HARA: scatters x (and y
   when sy != 0) into HARA ordering through hara->sf, runs hgemv on the CPU
   or GPU H-matrix depending on A->boundtocpu, and reduces the result back. */
static PetscErrorCode MatMultKernel_HARA(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans)
{
  Mat_HARA       *hara = (Mat_HARA*)A->data;
  haraHandle_t   handle = hara->handle->handle;
  PetscBool      boundtocpu = PETSC_TRUE;
  PetscInt       n;
  PetscScalar    *xx,*yy,*uxx,*uyy;
  PetscErrorCode ierr;

  PetscFunctionBegin;
#if defined(HARA_USE_GPU)
  boundtocpu = A->boundtocpu;
#endif
  /* n = length of the work buffers in HARA ordering */
  if (hara->sf) {
    ierr = PetscSFGetGraph(hara->sf,NULL,&n,NULL,NULL);CHKERRQ(ierr);
  } else n = A->rmap->n;
  if (!boundtocpu) {
#if defined(HARA_USE_GPU)
    ierr = VecCUDAGetArrayRead(x,(const PetscScalar**)&xx);CHKERRQ(ierr);
    if (sy == 0.0) { /* y is overwritten: no need to read its old values */
      ierr = VecCUDAGetArrayWrite(y,&yy);CHKERRQ(ierr);
    } else {
      ierr = VecCUDAGetArray(y,&yy);CHKERRQ(ierr);
    }
    if (hara->sf) {
      uxx = MatHaraGetThrustPointer(*hara->xx_gpu);
      uyy = MatHaraGetThrustPointer(*hara->yy_gpu);

      /* scatter inputs to HARA ordering; y only if it contributes (sy != 0) */
      ierr = PetscSFBcastBegin(hara->sf,MPIU_SCALAR,xx,uxx);CHKERRQ(ierr);
      ierr = PetscSFBcastEnd(hara->sf,MPIU_SCALAR,xx,uxx);CHKERRQ(ierr);
      if (sy != 0.0) {
        ierr = PetscSFBcastBegin(hara->sf,MPIU_SCALAR,yy,uyy);CHKERRQ(ierr);
        ierr = PetscSFBcastEnd(hara->sf,MPIU_SCALAR,yy,uyy);CHKERRQ(ierr);
      }
    } else {
      uxx = xx;
      uyy = yy;
    }
    if (hara->dist_hmatrix_gpu) {
      if (trans && !A->symmetric) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
      distributed_hgemv(/*trans ? HARA_Trans : HARA_NoTrans, */hara->s, *hara->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, hara->handle);
    } else {
      hgemv(trans ? HARA_Trans : HARA_NoTrans, hara->s, *hara->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle);
    }
    ierr = VecCUDARestoreArrayRead(x,(const PetscScalar**)&xx);CHKERRQ(ierr);
    if (hara->sf) {
      /* gather the result back into PETSc ordering */
      ierr = PetscSFReduceBegin(hara->sf,MPIU_SCALAR,uyy,yy,MPIU_REPLACE);CHKERRQ(ierr);
      ierr = PetscSFReduceEnd(hara->sf,MPIU_SCALAR,uyy,yy,MPIU_REPLACE);CHKERRQ(ierr);
    }
    if (sy == 0.0) {
      ierr = VecCUDARestoreArrayWrite(y,&yy);CHKERRQ(ierr);
    } else {
      ierr = VecCUDARestoreArray(y,&yy);CHKERRQ(ierr);
    }
#endif
  } else {
    ierr = VecGetArrayRead(x,(const PetscScalar**)&xx);CHKERRQ(ierr);
    if (sy == 0.0) { /* y is overwritten: no need to read its old values */
      ierr = VecGetArrayWrite(y,&yy);CHKERRQ(ierr);
    } else {
      ierr = VecGetArray(y,&yy);CHKERRQ(ierr);
    }
    if (hara->sf) {
      uxx = MatHaraGetThrustPointer(*hara->xx);
      uyy = MatHaraGetThrustPointer(*hara->yy);

      ierr = PetscSFBcastBegin(hara->sf,MPIU_SCALAR,xx,uxx);CHKERRQ(ierr);
      ierr = PetscSFBcastEnd(hara->sf,MPIU_SCALAR,xx,uxx);CHKERRQ(ierr);
      if (sy != 0.0) {
        ierr = PetscSFBcastBegin(hara->sf,MPIU_SCALAR,yy,uyy);CHKERRQ(ierr);
        ierr = PetscSFBcastEnd(hara->sf,MPIU_SCALAR,yy,uyy);CHKERRQ(ierr);
      }
    } else {
      uxx = xx;
      uyy = yy;
    }
    if (hara->dist_hmatrix) {
      if (trans && !A->symmetric) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
      distributed_hgemv(/*trans ? HARA_Trans : HARA_NoTrans, */hara->s, *hara->dist_hmatrix, uxx, n, sy, uyy, n, 1, hara->handle);
    } else {
      hgemv(trans ? HARA_Trans : HARA_NoTrans, hara->s, *hara->hmatrix, uxx, n, sy, uyy, n, 1, handle);
    }
    ierr = VecRestoreArrayRead(x,(const PetscScalar**)&xx);CHKERRQ(ierr);
    if (hara->sf) {
      ierr = PetscSFReduceBegin(hara->sf,MPIU_SCALAR,uyy,yy,MPIU_REPLACE);CHKERRQ(ierr);
      ierr = PetscSFReduceEnd(hara->sf,MPIU_SCALAR,uyy,yy,MPIU_REPLACE);CHKERRQ(ierr);
    }
    if (sy == 0.0) {
      ierr = VecRestoreArrayWrite(y,&yy);CHKERRQ(ierr);
    } else {
      ierr = VecRestoreArray(y,&yy);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
693 
MatMultTranspose_HARA(Mat A,Vec x,Vec y)694 static PetscErrorCode MatMultTranspose_HARA(Mat A, Vec x, Vec y)
695 {
696   PetscErrorCode ierr;
697 
698   PetscFunctionBegin;
699   ierr = MatMultKernel_HARA(A,x,0.0,y,PETSC_TRUE);CHKERRQ(ierr);
700   PetscFunctionReturn(0);
701 }
702 
MatMult_HARA(Mat A,Vec x,Vec y)703 static PetscErrorCode MatMult_HARA(Mat A, Vec x, Vec y)
704 {
705   PetscErrorCode ierr;
706 
707   PetscFunctionBegin;
708   ierr = MatMultKernel_HARA(A,x,0.0,y,PETSC_FALSE);CHKERRQ(ierr);
709   PetscFunctionReturn(0);
710 }
711 
MatMultTransposeAdd_HARA(Mat A,Vec x,Vec y,Vec z)712 static PetscErrorCode MatMultTransposeAdd_HARA(Mat A, Vec x, Vec y, Vec z)
713 {
714   PetscErrorCode ierr;
715 
716   PetscFunctionBegin;
717   ierr = VecCopy(y,z);CHKERRQ(ierr);
718   ierr = MatMultKernel_HARA(A,x,1.0,z,PETSC_TRUE);CHKERRQ(ierr);
719   PetscFunctionReturn(0);
720 }
721 
MatMultAdd_HARA(Mat A,Vec x,Vec y,Vec z)722 static PetscErrorCode MatMultAdd_HARA(Mat A, Vec x, Vec y, Vec z)
723 {
724   PetscErrorCode ierr;
725 
726   PetscFunctionBegin;
727   ierr = VecCopy(y,z);CHKERRQ(ierr);
728   ierr = MatMultKernel_HARA(A,x,1.0,z,PETSC_FALSE);CHKERRQ(ierr);
729   PetscFunctionReturn(0);
730 }
731 
MatScale_HARA(Mat A,PetscScalar s)732 static PetscErrorCode MatScale_HARA(Mat A, PetscScalar s)
733 {
734   Mat_HARA *a = (Mat_HARA*)A->data;
735 
736   PetscFunctionBegin;
737   a->s *= s;
738   PetscFunctionReturn(0);
739 }
740 
/* Processes the -mat_hara_* options database entries into the Mat_HARA
   customization fields (see the struct definition for their meaning). */
static PetscErrorCode MatSetFromOptions_HARA(PetscOptionItems *PetscOptionsObject,Mat A)
{
  Mat_HARA       *a = (Mat_HARA*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"HARA options");CHKERRQ(ierr);
  ierr = PetscOptionsInt("-mat_hara_leafsize","Leaf size when constructed from kernel",NULL,a->leafsize,&a->leafsize,NULL);CHKERRQ(ierr);
  ierr = PetscOptionsReal("-mat_hara_eta","Admissibility condition tolerance",NULL,a->eta,&a->eta,NULL);CHKERRQ(ierr);
  ierr = PetscOptionsInt("-mat_hara_order","Basis order for off-diagonal sampling when constructed from kernel",NULL,a->basisord,&a->basisord,NULL);CHKERRQ(ierr);
  ierr = PetscOptionsInt("-mat_hara_maxrank","Maximum rank when constructed from matvecs",NULL,a->max_rank,&a->max_rank,NULL);CHKERRQ(ierr);
  ierr = PetscOptionsInt("-mat_hara_samples","Number of samples to be taken concurrently when constructing from matvecs",NULL,a->bs,&a->bs,NULL);CHKERRQ(ierr);
  ierr = PetscOptionsReal("-mat_hara_rtol","Relative tolerance for construction from sampling",NULL,a->rtol,&a->rtol,NULL);CHKERRQ(ierr);
  ierr = PetscOptionsBool("-mat_hara_check","Check error when constructing from sampling during MatAssemblyEnd()",NULL,a->check_construction,&a->check_construction,NULL);CHKERRQ(ierr);
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
758 
759 static PetscErrorCode MatHaraSetCoords_HARA(Mat,PetscInt,const PetscReal[],MatHaraKernel,void*);
760 
/* Infers point coordinates for A when none were set explicitly: looks for a
   Vec composed as "__mathara_coords" on A, or on the sampler's matrix; if the
   sampler wraps another MATHARA, its point cloud is copied directly.
   No-op if coordinates were already configured (a->spacedim set). */
static PetscErrorCode MatHaraInferCoordinates_Private(Mat A)
{
  Mat_HARA          *a = (Mat_HARA*)A->data;
  Vec               c;
  PetscInt          spacedim;
  const PetscScalar *coords;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  if (a->spacedim) PetscFunctionReturn(0);
  ierr = PetscObjectQuery((PetscObject)A,"__mathara_coords",(PetscObject*)&c);CHKERRQ(ierr);
  if (!c && a->sampler) {
    Mat S = a->sampler->GetSamplingMat();

    ierr = PetscObjectQuery((PetscObject)S,"__mathara_coords",(PetscObject*)&c);CHKERRQ(ierr);
    if (!c) {
      PetscBool ishara;

      /* sampler wraps a MATHARA: reuse (copy) its point cloud directly */
      ierr = PetscObjectTypeCompare((PetscObject)S,MATHARA,&ishara);CHKERRQ(ierr);
      if (ishara) {
        Mat_HARA *s = (Mat_HARA*)S->data;

        a->spacedim = s->spacedim;
        if (s->ptcloud1) {
          a->ptcloud1 = new PetscPointCloud1D<PetscReal>(*s->ptcloud1);
        } else if (s->ptcloud2) {
          a->ptcloud2 = new PetscPointCloud2D<PetscReal>(*s->ptcloud2);
        } else if (s->ptcloud3) {
          a->ptcloud3 = new PetscPointCloud3D<PetscReal>(*s->ptcloud3);
        }
      }
    }
  }
  /* NOTE(review): when the point cloud was copied from the sampler above, c
     is still NULL and this still errors — confirm whether an early return is
     intended on that path */
  if (!c) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Missing coordinates");
  ierr = VecGetArrayRead(c,&coords);CHKERRQ(ierr);
  ierr = VecGetBlockSize(c,&spacedim);CHKERRQ(ierr); /* block size encodes the space dimension */
  ierr = MatHaraSetCoords_HARA(A,spacedim,coords,NULL,NULL);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(c,&coords);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
801 
/* Set up the communication/permutation needed by MatMult: the H-matrix basis
   tree stores entries in its own (clustered) ordering given by index_map.
   If that ordering is not the identity, create a PetscSF to permute PETSc
   vectors into H-matrix ordering and allocate permutation work buffers
   (host, and device when GPU support is enabled). Idempotent via a->multsetup. */
static PetscErrorCode MatSetUpMultiply_HARA(Mat A)
{
  MPI_Comm       comm;
  PetscMPIInt    size;
  PetscErrorCode ierr;
  Mat_HARA       *a = (Mat_HARA*)A->data;
  IS             is;
  PetscInt       n,*idx;
  int            *iidx;
  PetscCopyMode  own;
  PetscBool      rid;

  PetscFunctionBegin;
  if (a->multsetup) PetscFunctionReturn(0);
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    iidx = MatHaraGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map);
    n    = a->dist_hmatrix->basis_tree.basis_branch.index_map.size();
  } else {
    n    = a->hmatrix->u_basis_tree.index_map.size();
    iidx = MatHaraGetThrustPointer(a->hmatrix->u_basis_tree.index_map);
  }
  if (PetscDefined(USE_64BIT_INDICES)) {
    /* index_map holds 32-bit ints; widen to PetscInt (the IS takes ownership) */
    PetscInt i;

    own  = PETSC_OWN_POINTER;
    ierr = PetscMalloc1(n,&idx);CHKERRQ(ierr);
    for (i=0;i<n;i++) idx[i] = iidx[i];
  } else {
    /* same width: alias the thrust storage, do not take ownership */
    own  = PETSC_USE_POINTER;
    idx  = iidx;
  }
  ierr = ISCreateGeneral(comm,n,idx,own,&is);CHKERRQ(ierr);
  ierr = ISIdentity(is,&rid);CHKERRQ(ierr);
  if (!rid) {
    /* non-identity ordering: need an SF and scratch vectors for permutation */
    ierr = PetscSFCreate(comm,&a->sf);CHKERRQ(ierr);
    ierr = PetscSFSetGraphLayout(a->sf,A->rmap,n,NULL,PETSC_OWN_POINTER,idx);CHKERRQ(ierr);
#if defined(HARA_USE_GPU)
    a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
    a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
    a->xxs_gpu = 1;
    a->yys_gpu = 1;
#endif
    a->xx  = new thrust::host_vector<PetscScalar>(n);
    a->yy  = new thrust::host_vector<PetscScalar>(n);
    a->xxs = 1;
    a->yys = 1;
  }
  ierr = ISDestroy(&is);CHKERRQ(ierr);
  a->multsetup = PETSC_TRUE;
  PetscFunctionReturn(0);
}
855 
MatAssemblyEnd_HARA(Mat A,MatAssemblyType asstype)856 static PetscErrorCode MatAssemblyEnd_HARA(Mat A, MatAssemblyType asstype)
857 {
858   Mat_HARA       *a = (Mat_HARA*)A->data;
859   haraHandle_t   handle = a->handle->handle;
860   PetscBool      kernel = PETSC_FALSE;
861   PetscBool      boundtocpu = PETSC_TRUE;
862   MPI_Comm       comm;
863   PetscMPIInt    size;
864   PetscErrorCode ierr;
865 
866   PetscFunctionBegin;
867   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
868   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
869   /* TODO REUSABILITY of geometric construction */
870   delete a->hmatrix;
871   delete a->dist_hmatrix;
872 #if defined(HARA_USE_GPU)
873   delete a->hmatrix_gpu;
874   delete a->dist_hmatrix_gpu;
875 #endif
876   /* TODO: other? */
877   BoxCenterAdmissibility<Hara_Real,1> adm1(a->eta,a->leafsize);
878   BoxCenterAdmissibility<Hara_Real,2> adm2(a->eta,a->leafsize);
879   BoxCenterAdmissibility<Hara_Real,3> adm3(a->eta,a->leafsize);
880   if (size > 1) {
881     a->dist_hmatrix = new DistributedHMatrix(A->rmap->n/*,A->symmetric*/);
882   } else {
883     a->hmatrix = new HMatrix(A->rmap->n,A->symmetric);
884   }
885   ierr = MatHaraInferCoordinates_Private(A);CHKERRQ(ierr);
886   switch (a->spacedim) {
887   case 1:
888     if (!a->ptcloud1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing pointcloud");
889     if (a->kernel1) {
890       kernel = PETSC_TRUE;
891       if (size > 1) {
892         buildDistributedHMatrix(*a->dist_hmatrix,*a->ptcloud1,adm1,*a->kernel1,a->leafsize,a->basisord/*,a->basisord*/,a->handle);
893       } else {
894         buildHMatrix(*a->hmatrix,*a->ptcloud1,adm1,*a->kernel1,a->leafsize,a->basisord,a->basisord);
895       }
896     } else {
897       if (size > 1) SETERRQ(comm,PETSC_ERR_SUP,"Construction from sampling not supported in parallel");
898       buildHMatrixStructure(*a->hmatrix,*a->ptcloud1,adm1,a->leafsize,0,0);
899     }
900     break;
901   case 2:
902     if (!a->ptcloud2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing pointcloud");
903     if (a->kernel2) {
904       kernel = PETSC_TRUE;
905       if (size > 1) {
906         buildDistributedHMatrix(*a->dist_hmatrix,*a->ptcloud2,adm2,*a->kernel2,a->leafsize,a->basisord/*,a->basisord*/,a->handle);
907       } else {
908         buildHMatrix(*a->hmatrix,*a->ptcloud2,adm2,*a->kernel2,a->leafsize,a->basisord,a->basisord);
909       }
910     } else {
911       if (size > 1) SETERRQ(comm,PETSC_ERR_SUP,"Construction from sampling not supported in parallel");
912       buildHMatrixStructure(*a->hmatrix,*a->ptcloud2,adm2,a->leafsize,0,0);
913     }
914     break;
915   case 3:
916     if (!a->ptcloud3) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing pointcloud");
917     if (a->kernel3) {
918       kernel = PETSC_TRUE;
919       if (size > 1) {
920         buildDistributedHMatrix(*a->dist_hmatrix,*a->ptcloud3,adm3,*a->kernel3,a->leafsize,a->basisord/*,a->basisord*/,a->handle);
921       } else {
922         buildHMatrix(*a->hmatrix,*a->ptcloud3,adm3,*a->kernel3,a->leafsize,a->basisord,a->basisord);
923       }
924     } else {
925       if (size > 1) SETERRQ(comm,PETSC_ERR_SUP,"Construction from sampling not supported in parallel");
926       buildHMatrixStructure(*a->hmatrix,*a->ptcloud3,adm3,a->leafsize,0,0);
927     }
928     break;
929   default:
930     SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Unhandled dimension %D",a->spacedim);
931   }
932 
933   ierr = MatSetUpMultiply_HARA(A);CHKERRQ(ierr);
934 
935 #if defined(HARA_USE_GPU)
936   boundtocpu = A->boundtocpu;
937   if (!boundtocpu) {
938     if (size > 1) {
939       a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
940     } else {
941       a->hmatrix_gpu = new GPU_HMatrix(*a->hmatrix);
942     }
943   }
944 #endif
945   if (size == 1) {
946     if (!kernel && a->sampler) {
947       PetscReal Anorm;
948       bool      verbose = false;
949 
950       ierr = MatApproximateNorm_Private(a->sampler->GetSamplingMat(),NORM_2,PETSC_DECIDE,&Anorm);CHKERRQ(ierr);
951       if (boundtocpu) {
952         a->sampler->SetGPUSampling(false);
953         hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose);
954 #if defined(HARA_USE_GPU)
955       } else {
956         a->sampler->SetGPUSampling(true);
957         hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose);
958 #endif
959       }
960     }
961   }
962 #if defined(HARA_USE_GPU)
963   if (kernel) A->offloadmask = PETSC_OFFLOAD_BOTH;
964   else A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
965 #endif
966 
967   if (!a->s) a->s = 1.0;
968   A->assembled = PETSC_TRUE;
969 
970   if (a->sampler) {
971     PetscBool check = a->check_construction;
972 
973     ierr = PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_hara_check",&check,NULL);CHKERRQ(ierr);
974     if (check) {
975       Mat       E,Ae;
976       PetscReal n1,ni,n2;
977       PetscReal n1A,niA,n2A;
978       void      (*normfunc)(void);
979 
980       Ae   = a->sampler->GetSamplingMat();
981       ierr = MatConvert(A,MATSHELL,MAT_INITIAL_MATRIX,&E);CHKERRQ(ierr);
982       ierr = MatShellSetOperation(E,MATOP_NORM,(void (*)(void))MatNorm_HARA);CHKERRQ(ierr);
983       ierr = MatAXPY(E,-1.0,Ae,DIFFERENT_NONZERO_PATTERN);CHKERRQ(ierr);
984       ierr = MatNorm(E,NORM_1,&n1);CHKERRQ(ierr);
985       ierr = MatNorm(E,NORM_INFINITY,&ni);CHKERRQ(ierr);
986       ierr = MatNorm(E,NORM_2,&n2);CHKERRQ(ierr);
987 
988       ierr = MatGetOperation(Ae,MATOP_NORM,&normfunc);CHKERRQ(ierr);
989       ierr = MatSetOperation(Ae,MATOP_NORM,(void (*)(void))MatNorm_HARA);CHKERRQ(ierr);
990       ierr = MatNorm(Ae,NORM_1,&n1A);CHKERRQ(ierr);
991       ierr = MatNorm(Ae,NORM_INFINITY,&niA);CHKERRQ(ierr);
992       ierr = MatNorm(Ae,NORM_2,&n2A);CHKERRQ(ierr);
993       n1A  = PetscMax(n1A,PETSC_SMALL);
994       n2A  = PetscMax(n2A,PETSC_SMALL);
995       niA  = PetscMax(niA,PETSC_SMALL);
996       ierr = MatSetOperation(Ae,MATOP_NORM,normfunc);CHKERRQ(ierr);
997       ierr = PetscPrintf(PetscObjectComm((PetscObject)A),"MATHARA construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n",(double)n1,(double)ni,(double)n2,(double)(n1/n1A),(double)(ni/niA),(double)(n2/n2A));
998       ierr = MatDestroy(&E);CHKERRQ(ierr);
999     }
1000   }
1001   PetscFunctionReturn(0);
1002 }
1003 
MatZeroEntries_HARA(Mat A)1004 static PetscErrorCode MatZeroEntries_HARA(Mat A)
1005 {
1006   PetscErrorCode ierr;
1007   PetscMPIInt    size;
1008   Mat_HARA       *a = (Mat_HARA*)A->data;
1009 
1010   PetscFunctionBegin;
1011   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
1012   if (size > 1) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not yet supported");
1013   else {
1014     a->hmatrix->clearData();
1015 #if defined(HARA_USE_GPU)
1016     if (a->hmatrix_gpu) a->hmatrix_gpu->clearData();
1017 #endif
1018   }
1019   PetscFunctionReturn(0);
1020 }
1021 
/* Duplicate a MATHARA: copies scaling, point cloud, and (for MAT_COPY_VALUES)
   kernels; deep-copies the host/device H-matrices, clearing their data when
   values are not requested.
   NOTE(review): the file-level TODO marks MatDuplicate as buggy with
   commsize > 1 — the distributed copy path below should be treated as
   unverified. */
static PetscErrorCode MatDuplicate_HARA(Mat B, MatDuplicateOption op, Mat *nA)
{
  Mat            A;
  Mat_HARA       *a, *b = (Mat_HARA*)B->data;
#if defined(PETSC_HAVE_CUDA)
  PetscBool      iscpu = PETSC_FALSE;
#else
  PetscBool      iscpu = PETSC_TRUE;
#endif
  PetscErrorCode ierr;
  MPI_Comm       comm;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)B,&comm);CHKERRQ(ierr);
  ierr = MatCreate(comm,&A);CHKERRQ(ierr);
  ierr = MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N);CHKERRQ(ierr);
  ierr = MatSetType(A,MATHARA);CHKERRQ(ierr);
  ierr = MatPropagateSymmetryOptions(B,A);CHKERRQ(ierr);

  a = (Mat_HARA*)A->data;
  a->s = b->s;
  a->spacedim = b->spacedim;
  /* deep-copy whichever point cloud dimension is present */
  if (b->ptcloud1) {
    a->ptcloud1 = new PetscPointCloud1D<PetscReal>(*b->ptcloud1);
  } else if (b->ptcloud2) {
    a->ptcloud2 = new PetscPointCloud2D<PetscReal>(*b->ptcloud2);
  } else if (b->ptcloud3) {
    a->ptcloud3 = new PetscPointCloud3D<PetscReal>(*b->ptcloud3);
  }
  if (op == MAT_COPY_VALUES) {
    /* kernels only matter when values are copied */
    if (b->kernel1) {
      a->kernel1 = new PetscFunctionGenerator1D<PetscScalar>(*b->kernel1);
    } else if (b->kernel2) {
      a->kernel2 = new PetscFunctionGenerator2D<PetscScalar>(*b->kernel2);
    } else if (b->kernel3) {
      a->kernel3 = new PetscFunctionGenerator3D<PetscScalar>(*b->kernel3);
    }
  }

  if (b->dist_hmatrix) { a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix); }
#if defined(HARA_USE_GPU)
  if (b->dist_hmatrix_gpu) { a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu); }
#endif
  if (b->hmatrix) {
    a->hmatrix = new HMatrix(*b->hmatrix);
    if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData();
  }
#if defined(HARA_USE_GPU)
  if (b->hmatrix_gpu) {
    a->hmatrix_gpu = new GPU_HMatrix(*b->hmatrix_gpu);
    if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData();
  }
#endif

  ierr = MatSetUp(A);CHKERRQ(ierr);
  ierr = MatSetUpMultiply_HARA(A);CHKERRQ(ierr);
  if (op == MAT_COPY_VALUES) {
    A->assembled = PETSC_TRUE;
#if defined(PETSC_HAVE_CUDA)
    iscpu = B->boundtocpu;
#endif
  }
  ierr = MatBindToCPU(A,iscpu);CHKERRQ(ierr);

  *nA = A;
  PetscFunctionReturn(0);
}
1089 
/* MatView for MATHARA: ASCII viewer only. Prints how the matrix was
   constructed, the clustering/admissibility parameters, and (sequentially)
   the memory consumption of the dense and low-rank blocks on CPU and GPU. */
static PetscErrorCode MatView_HARA(Mat A, PetscViewer view)
{
  Mat_HARA          *hara = (Mat_HARA*)A->data;
  PetscBool         isascii;
  PetscErrorCode    ierr;
  PetscMPIInt       size;
  PetscViewerFormat format;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)view,PETSCVIEWERASCII,&isascii);CHKERRQ(ierr);
  ierr = PetscViewerGetFormat(view,&format);CHKERRQ(ierr);
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
  if (isascii) {
    ierr = PetscViewerASCIIPrintf(view,"  H-Matrix constructed from %s\n",hara->sampler ? "Mat" : (hara->spacedim ? "Kernel" : "None"));CHKERRQ(ierr);
    ierr = PetscViewerASCIIPrintf(view,"  PointCloud dim %D\n",hara->spacedim);CHKERRQ(ierr);
    ierr = PetscViewerASCIIPrintf(view,"  Admissibility parameters: leaf size %D, eta %g\n",hara->leafsize,(double)hara->eta);CHKERRQ(ierr);
    if (hara->sampler) {
      ierr = PetscViewerASCIIPrintf(view,"  Sampling parameters: max_rank %D, samples %D, tolerance %g\n",hara->max_rank,hara->bs,(double)hara->rtol);CHKERRQ(ierr);
    } else {
      ierr = PetscViewerASCIIPrintf(view,"  Offdiagonal blocks approximation order %D\n",hara->basisord);CHKERRQ(ierr);
    }
    if (size == 1) {
      /* memory statistics are only available from the sequential H-matrix classes */
      double dense_mem_cpu = hara->hmatrix ? hara->hmatrix->getDenseMemoryUsage() : 0;
      double low_rank_cpu = hara->hmatrix ? hara->hmatrix->getLowRankMemoryUsage() : 0;
#if defined(HARA_USE_GPU)
      double dense_mem_gpu = hara->hmatrix_gpu ? hara->hmatrix_gpu->getDenseMemoryUsage() : 0;
      double low_rank_gpu = hara->hmatrix_gpu ? hara->hmatrix_gpu->getLowRankMemoryUsage() : 0;
#endif
      ierr = PetscViewerASCIIPrintf(view,"  Memory consumption (CPU): %g (dense) %g (low rank) %g GB (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu);CHKERRQ(ierr);
#if defined(HARA_USE_GPU)
      ierr = PetscViewerASCIIPrintf(view,"  Memory consumption (GPU): %g (dense) %g (low rank) %g GB (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu);CHKERRQ(ierr);
#endif
    }
  }
  PetscFunctionReturn(0);
}
1126 
MatHaraSetCoords_HARA(Mat A,PetscInt spacedim,const PetscReal coords[],MatHaraKernel kernel,void * kernelctx)1127 static PetscErrorCode MatHaraSetCoords_HARA(Mat A, PetscInt spacedim, const PetscReal coords[], MatHaraKernel kernel, void *kernelctx)
1128 {
1129   Mat_HARA       *hara = (Mat_HARA*)A->data;
1130   PetscReal      *gcoords;
1131   PetscInt       N;
1132   MPI_Comm       comm;
1133   PetscMPIInt    size;
1134   PetscBool      cong;
1135   PetscErrorCode ierr;
1136 
1137   PetscFunctionBegin;
1138   ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
1139   ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
1140   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1141   ierr = MatHasCongruentLayouts(A,&cong);CHKERRQ(ierr);
1142   if (!cong) SETERRQ(comm,PETSC_ERR_SUP,"Only for square matrices with congruent layouts");
1143   N    = A->rmap->N;
1144   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
1145   if (size > 1) {
1146     PetscSF      sf;
1147     MPI_Datatype dtype;
1148 
1149     ierr = MPI_Type_contiguous(spacedim,MPIU_REAL,&dtype);CHKERRQ(ierr);
1150     ierr = MPI_Type_commit(&dtype);CHKERRQ(ierr);
1151 
1152     ierr = PetscSFCreate(comm,&sf);CHKERRQ(ierr);
1153     ierr = PetscSFSetGraphWithPattern(sf,A->rmap,PETSCSF_PATTERN_ALLGATHER);CHKERRQ(ierr);
1154     ierr = PetscMalloc1(spacedim*N,&gcoords);CHKERRQ(ierr);
1155     ierr = PetscSFBcastBegin(sf,dtype,coords,gcoords);CHKERRQ(ierr);
1156     ierr = PetscSFBcastEnd(sf,dtype,coords,gcoords);CHKERRQ(ierr);
1157     ierr = PetscSFDestroy(&sf);CHKERRQ(ierr);
1158     ierr = MPI_Type_free(&dtype);CHKERRQ(ierr);
1159   } else gcoords = (PetscReal*)coords;
1160 
1161   delete hara->ptcloud1;
1162   delete hara->ptcloud2;
1163   delete hara->ptcloud3;
1164   delete hara->kernel1;
1165   delete hara->kernel2;
1166   delete hara->kernel3;
1167   hara->spacedim = spacedim;
1168   switch (spacedim) {
1169   case 1:
1170     hara->ptcloud1 = new PetscPointCloud1D<PetscReal>(N,gcoords);
1171     if (kernel) hara->kernel1 = new PetscFunctionGenerator1D<PetscScalar>(kernel,kernelctx);
1172     break;
1173   case 2:
1174     hara->ptcloud2 = new PetscPointCloud2D<PetscReal>(N,gcoords);
1175     if (kernel) hara->kernel2 = new PetscFunctionGenerator2D<PetscScalar>(kernel,kernelctx);
1176     break;
1177   case 3:
1178     hara->ptcloud3 = new PetscPointCloud3D<PetscReal>(N,gcoords);
1179     if (kernel) hara->kernel3 = new PetscFunctionGenerator3D<PetscScalar>(kernel,kernelctx);
1180     break;
1181   default:
1182     SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Unhandled dimension %D",hara->spacedim);
1183   }
1184   if (gcoords != coords) { ierr = PetscFree(gcoords);CHKERRQ(ierr); }
1185   A->preallocated = PETSC_TRUE;
1186   PetscFunctionReturn(0);
1187 }
1188 
MatCreate_HARA(Mat A)1189 PETSC_EXTERN PetscErrorCode MatCreate_HARA(Mat A)
1190 {
1191   Mat_HARA       *a;
1192   PetscErrorCode ierr;
1193   PetscMPIInt    size;
1194 
1195   PetscFunctionBegin;
1196   ierr = PetscNewLog(A,&a);CHKERRQ(ierr);
1197   A->data = (void*)a;
1198 
1199   a->eta              = 0.9;
1200   a->leafsize         = 32;
1201   a->basisord         = 4;
1202   a->max_rank         = 64;
1203   a->bs               = 32;
1204   a->rtol             = 1.e-4;
1205   a->s                = 1.0;
1206   a->norm_max_samples = 10;
1207   haraCreateDistributedHandleComm(&a->handle,PetscObjectComm((PetscObject)A));
1208 
1209   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRQ(ierr);
1210   ierr = PetscObjectChangeTypeName((PetscObject)A,MATHARA);CHKERRQ(ierr);
1211   ierr = PetscMemzero(A->ops,sizeof(struct _MatOps));CHKERRQ(ierr);
1212 
1213   A->ops->destroy          = MatDestroy_HARA;
1214   A->ops->view             = MatView_HARA;
1215   A->ops->assemblyend      = MatAssemblyEnd_HARA;
1216   A->ops->mult             = MatMult_HARA;
1217   A->ops->multtranspose    = MatMultTranspose_HARA;
1218   A->ops->multadd          = MatMultAdd_HARA;
1219   A->ops->multtransposeadd = MatMultTransposeAdd_HARA;
1220   A->ops->scale            = MatScale_HARA;
1221   A->ops->duplicate        = MatDuplicate_HARA;
1222   A->ops->setfromoptions   = MatSetFromOptions_HARA;
1223   A->ops->norm             = MatNorm_HARA;
1224   A->ops->zeroentries      = MatZeroEntries_HARA;
1225 
1226   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_hara_seqdense_C",MatProductSetFromOptions_HARA);CHKERRQ(ierr);
1227   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_hara_seqdensecuda_C",MatProductSetFromOptions_HARA);CHKERRQ(ierr);
1228   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_hara_mpidense_C",MatProductSetFromOptions_HARA);CHKERRQ(ierr);
1229   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_hara_mpidensecuda_C",MatProductSetFromOptions_HARA);CHKERRQ(ierr);
1230 #if defined(PETSC_HAVE_CUDA)
1231   ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
1232   ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);
1233 #endif
1234   PetscFunctionReturn(0);
1235 }
1236 
MatHaraSetSamplingMat(Mat A,Mat B,PetscInt bs,PetscReal tol)1237 PetscErrorCode MatHaraSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol)
1238 {
1239   PetscBool      ishara;
1240   PetscErrorCode ierr;
1241 
1242   PetscFunctionBegin;
1243   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
1244   PetscValidType(A,1);
1245   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
1246   ierr = PetscObjectTypeCompare((PetscObject)A,MATHARA,&ishara);CHKERRQ(ierr);
1247   if (ishara) {
1248     Mat_HARA *a = (Mat_HARA*)A->data;
1249 
1250 
1251     if (!a->sampler) a->sampler = new PetscMatrixSampler();
1252     a->sampler->SetSamplingMat(B);
1253     if (bs > 0) a->bs = bs;
1254     if (tol > 0.) a->rtol = tol;
1255     delete a->kernel1;
1256     delete a->kernel2;
1257     delete a->kernel3;
1258   }
1259   PetscFunctionReturn(0);
1260 }
1261 
MatCreateHaraFromKernel(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt spacedim,const PetscReal coords[],MatHaraKernel kernel,void * kernelctx,PetscReal eta,PetscInt leafsize,PetscInt basisord,Mat * nA)1262 PetscErrorCode MatCreateHaraFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], MatHaraKernel kernel,void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat* nA)
1263 {
1264   Mat            A;
1265   Mat_HARA       *hara;
1266 #if defined(PETSC_HAVE_CUDA)
1267   PetscBool      iscpu = PETSC_FALSE;
1268 #else
1269   PetscBool      iscpu = PETSC_TRUE;
1270 #endif
1271   PetscErrorCode ierr;
1272 
1273   PetscFunctionBegin;
1274   ierr = MatCreate(comm,&A);CHKERRQ(ierr);
1275   ierr = MatSetSizes(A,m,n,M,N);CHKERRQ(ierr);
1276   ierr = MatSetType(A,MATHARA);CHKERRQ(ierr);
1277   ierr = MatBindToCPU(A,iscpu);CHKERRQ(ierr);
1278   ierr = MatHaraSetCoords_HARA(A,spacedim,coords,kernel,kernelctx);CHKERRQ(ierr);
1279 
1280   hara = (Mat_HARA*)A->data;
1281   if (eta > 0.) hara->eta = eta;
1282   if (leafsize > 0) hara->leafsize = leafsize;
1283   if (basisord > 0) hara->basisord = basisord;
1284 
1285   *nA = A;
1286   PetscFunctionReturn(0);
1287 }
1288 
MatCreateHaraFromMat(Mat B,PetscInt spacedim,const PetscReal coords[],PetscReal eta,PetscInt leafsize,PetscInt maxrank,PetscInt bs,PetscReal rtol,Mat * nA)1289 PetscErrorCode MatCreateHaraFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA)
1290 {
1291   Mat            A;
1292   Mat_HARA       *hara;
1293   MPI_Comm       comm;
1294 #if defined(PETSC_HAVE_CUDA)
1295   PetscBool      iscpu = PETSC_FALSE;
1296 #else
1297   PetscBool      iscpu = PETSC_TRUE;
1298 #endif
1299   PetscErrorCode ierr;
1300 
1301   PetscFunctionBegin;
1302   PetscValidHeaderSpecific(B,MAT_CLASSID,1);
1303   PetscValidLogicalCollectiveInt(B,spacedim,2);
1304   PetscValidPointer(nA,4);
1305   ierr = PetscObjectGetComm((PetscObject)B,&comm);CHKERRQ(ierr);
1306   ierr = MatCreate(comm,&A);CHKERRQ(ierr);
1307   ierr = MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N);CHKERRQ(ierr);
1308   ierr = MatSetType(A,MATHARA);CHKERRQ(ierr);
1309   ierr = MatBindToCPU(A,iscpu);CHKERRQ(ierr);
1310   if (spacedim) {
1311     ierr = MatHaraSetCoords_HARA(A,spacedim,coords,NULL,NULL);CHKERRQ(ierr);
1312   }
1313   ierr = MatPropagateSymmetryOptions(B,A);CHKERRQ(ierr);
1314   /* if (!A->symmetric) SETERRQ(comm,PETSC_ERR_SUP,"Unsymmetric sampling does not work"); */
1315 
1316   hara = (Mat_HARA*)A->data;
1317   hara->sampler = new PetscMatrixSampler(B);
1318   if (eta > 0.) hara->eta = eta;
1319   if (leafsize > 0) hara->leafsize = leafsize;
1320   if (maxrank > 0) hara->max_rank = maxrank;
1321   if (bs > 0) hara->bs = bs;
1322   if (rtol > 0.) hara->rtol = rtol;
1323 
1324   *nA = A;
1325   A->preallocated = PETSC_TRUE;
1326   PetscFunctionReturn(0);
1327 }
1328