#if !defined(PETSCSFIMPL_H)
#define PETSCSFIMPL_H

#include <petscvec.h>
#include <petscsf.h>
#include <petsc/private/petscimpl.h>
#include <petscviewer.h>

#if defined(PETSC_HAVE_CUDA)
  #include <cuda_runtime.h>
#endif

#if defined(PETSC_HAVE_HIP)
  #include <hip/hip_runtime.h>
#endif

PETSC_EXTERN PetscLogEvent PETSCSF_SetGraph;
PETSC_EXTERN PetscLogEvent PETSCSF_SetUp;
PETSC_EXTERN PetscLogEvent PETSCSF_BcastBegin;
PETSC_EXTERN PetscLogEvent PETSCSF_BcastEnd;
PETSC_EXTERN PetscLogEvent PETSCSF_BcastAndOpBegin;
PETSC_EXTERN PetscLogEvent PETSCSF_BcastAndOpEnd;
PETSC_EXTERN PetscLogEvent PETSCSF_ReduceBegin;
PETSC_EXTERN PetscLogEvent PETSCSF_ReduceEnd;
PETSC_EXTERN PetscLogEvent PETSCSF_FetchAndOpBegin;
PETSC_EXTERN PetscLogEvent PETSCSF_FetchAndOpEnd;
PETSC_EXTERN PetscLogEvent PETSCSF_EmbedSF;
PETSC_EXTERN PetscLogEvent PETSCSF_DistSect;
PETSC_EXTERN PetscLogEvent PETSCSF_SectSF;
PETSC_EXTERN PetscLogEvent PETSCSF_RemoteOff;
PETSC_EXTERN PetscLogEvent PETSCSF_Pack;
PETSC_EXTERN PetscLogEvent PETSCSF_Unpack;

typedef enum {PETSCSF_ROOT2LEAF=0, PETSCSF_LEAF2ROOT} PetscSFDirection;
typedef enum {PETSCSF_BCAST=0, PETSCSF_REDUCE, PETSCSF_FETCH} PetscSFOperation;
/* When doing device-aware MPI, a backend refers to the SF/device interface */
typedef enum {PETSCSF_BACKEND_INVALID=0,PETSCSF_BACKEND_CUDA,PETSCSF_BACKEND_KOKKOS} PetscSFBackend;

struct _PetscSFOps {
  PetscErrorCode (*Reset)(PetscSF);
  PetscErrorCode (*Destroy)(PetscSF);
  PetscErrorCode (*SetUp)(PetscSF);
  PetscErrorCode (*SetFromOptions)(PetscOptionItems*,PetscSF);
  PetscErrorCode (*View)(PetscSF,PetscViewer);
  PetscErrorCode (*Duplicate)(PetscSF,PetscSFDuplicateOption,PetscSF);
  PetscErrorCode (*BcastAndOpBegin)(PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,void*,MPI_Op);
  PetscErrorCode (*BcastAndOpEnd)  (PetscSF,MPI_Datatype,const void*,void*,MPI_Op);
  PetscErrorCode (*ReduceBegin)    (PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,void*,MPI_Op);
  PetscErrorCode (*ReduceEnd)      (PetscSF,MPI_Datatype,const void*,void*,MPI_Op);
  PetscErrorCode (*FetchAndOpBegin)(PetscSF,MPI_Datatype,PetscMemType,void*,PetscMemType,const void*,void*,MPI_Op);
  PetscErrorCode (*FetchAndOpEnd)  (PetscSF,MPI_Datatype,void*,const void*,void*,MPI_Op);
  PetscErrorCode (*BcastToZero)    (PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,      void*); /* For internal use only */
  PetscErrorCode (*GetRootRanks)(PetscSF,PetscInt*,const PetscMPIInt**,const PetscInt**,const PetscInt**,const PetscInt**);
  PetscErrorCode (*GetLeafRanks)(PetscSF,PetscInt*,const PetscMPIInt**,const PetscInt**,const PetscInt**);
  PetscErrorCode (*CreateLocalSF)(PetscSF,PetscSF*);
  PetscErrorCode (*GetGraph)(PetscSF,PetscInt*,PetscInt*,const PetscInt**,const PetscSFNode**);
  PetscErrorCode (*CreateEmbeddedSF)(PetscSF,PetscInt,const PetscInt*,PetscSF*);
  PetscErrorCode (*CreateEmbeddedLeafSF)(PetscSF,PetscInt,const PetscInt*,PetscSF*);

  PetscErrorCode (*Malloc)(PetscMemType,size_t,void**);
  PetscErrorCode (*Free)(PetscMemType,void*);
};
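
/* Sketch (not taken from PETSc source): an SF implementation is expected to fill the ops table
   above from its type-specific creation routine. The type suffix and helper names below are
   hypothetical; an implementation only assigns the members it actually supports.

     static PetscErrorCode PetscSFSetUp_MyImpl(PetscSF);
     static PetscErrorCode PetscSFBcastAndOpBegin_MyImpl(PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,void*,MPI_Op);

     PetscErrorCode PetscSFCreate_MyImpl(PetscSF sf)
     {
       PetscFunctionBegin;
       sf->ops->SetUp           = PetscSFSetUp_MyImpl;
       sf->ops->BcastAndOpBegin = PetscSFBcastAndOpBegin_MyImpl;
       ... assign the remaining _PetscSFOps members this implementation provides ...
       PetscFunctionReturn(0);
     }
 */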

typedef struct _n_PetscSFPackOpt *PetscSFPackOpt;

struct _p_PetscSF {
  PETSCHEADER(struct _PetscSFOps);
  PetscInt        nroots;          /* Number of root vertices on current process (candidates for incoming edges) */
  PetscInt        nleaves;         /* Number of leaf vertices on current process (this process specifies a root for each leaf) */
  PetscInt        *mine;           /* Location of leaves in leafdata arrays provided to the communication routines */
  PetscInt        *mine_alloc;
  PetscInt        minleaf,maxleaf;
  PetscSFNode     *remote;         /* Remote references to roots for each local leaf */
  PetscSFNode     *remote_alloc;
  PetscInt        nranks;          /* Number of ranks owning roots connected to my leaves */
  PetscInt        ndranks;         /* Number of ranks in distinguished group holding roots connected to my leaves */
  PetscMPIInt     *ranks;          /* List of ranks referenced by "remote" */
  PetscInt        *roffset;        /* Array of length nranks+1, offset in rmine/rremote for each rank */
  PetscInt        *rmine;          /* Concatenated array holding local indices referencing each remote rank */
  PetscInt        *rmine_d[2];     /* A copy of rmine[local/remote] in device memory if needed */

  /* Some results obtained by analyzing rmine[] that are useful in packing */
  PetscInt        leafbuflen[2];   /* Length (in units) of leaf buffers, in layout of [PETSCSF_LOCAL/REMOTE] */
  PetscBool       leafcontig[2];   /* True means indices in rmine[self part] or rmine[remote part] are contiguous, and they start from ... */
  PetscInt        leafstart[2];    /* ... leafstart[0] and leafstart[1] respectively */
  PetscSFPackOpt  leafpackopt[2];  /* Optimization plans to (un)pack leaves connected to remote roots, based on index patterns in rmine[]. NULL for no optimization */
  PetscSFPackOpt  leafpackopt_d[2];/* Copy of leafpackopt[] on device if needed */
  PetscBool       leafdups[2];     /* Do indices in rmine[] for self(0)/remote(1) communication have duplicates? TRUE implies threads working on them in parallel may have data races. */

  PetscInt        nleafreqs;       /* Number of MPI requests for leaves */
  PetscInt        *rremote;        /* Concatenated array holding remote indices referenced for each remote rank */
  PetscBool       degreeknown;     /* The degree is currently known, so it does not have to be recomputed */
  PetscInt        *degree;         /* Degree of each of my root vertices */
  PetscInt        *degreetmp;      /* Temporary local array for computing degree */
  PetscBool       rankorder;       /* Sort ranks for gather and scatter operations */
  MPI_Group       ingroup;         /* Group of processes connected to my roots */
  MPI_Group       outgroup;        /* Group of processes connected to my leaves */
  PetscSF         multi;           /* Internal graph used to implement gather and scatter operations */
  PetscBool       graphset;        /* Flag indicating that the graph has been set, required before calling communication routines */
  PetscBool       setupcalled;     /* Type and communication structures have been set up */
  PetscSFPattern  pattern;         /* Pattern of the graph */
  PetscBool       persistent;      /* Does this SF use MPI persistent requests for communication */
  PetscLayout     map;             /* Layout of leaves over all processes when building a patterned graph */
  PetscBool       use_default_stream;  /* If true, SF assumes root/leafdata is on the default stream upon input and will also leave them there upon output */
  PetscBool       use_gpu_aware_mpi;   /* If true, SF assumes it can pass GPU pointers to MPI */
  PetscBool       use_stream_aware_mpi;/* If true, SF assumes the underlying MPI is CUDA-stream aware and we won't sync streams for send/recv buffers passed to MPI */
#if defined(PETSC_HAVE_CUDA)
  PetscInt        maxResidentThreadsPerGPU;
#endif
  PetscSFBackend  backend;         /* The device backend (if any) SF will use */
  void *data;                      /* Pointer to implementation */
};
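
/* Sketch of how the graph fields above relate to the public API (illustrative values only):
   PetscSFSetGraph() records nroots/nleaves and stores ilocal/iremote into mine/remote, while
   PetscSFSetUp() derives nranks, ranks, roffset, rmine, rremote and the packing data.

     PetscSF     sf;
     PetscInt    ilocal[2]  = {3,5};            becomes sf->mine
     PetscSFNode iremote[2] = {{0,1},{1,0}};    becomes sf->remote ((rank,index) pairs)
     PetscSFCreate(PETSC_COMM_WORLD,&sf);
     PetscSFSetGraph(sf,4,2,ilocal,PETSC_COPY_VALUES,iremote,PETSC_COPY_VALUES);   nroots=4, nleaves=2
     PetscSFSetUp(sf);
     PetscSFDestroy(&sf);
 */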

PETSC_EXTERN PetscBool PetscSFRegisterAllCalled;
PETSC_EXTERN PetscErrorCode PetscSFRegisterAll(void);

PETSC_INTERN PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF,PetscSF*);
PETSC_INTERN PetscErrorCode PetscSFBcastToZero_Private(PetscSF,MPI_Datatype,const void*,void*);

PETSC_EXTERN PetscErrorCode MPIPetsc_Type_unwrap(MPI_Datatype,MPI_Datatype*,PetscBool*);
PETSC_EXTERN PetscErrorCode MPIPetsc_Type_compare(MPI_Datatype,MPI_Datatype,PetscBool*);
PETSC_EXTERN PetscErrorCode MPIPetsc_Type_compare_contig(MPI_Datatype,MPI_Datatype,PetscInt*);
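
/* Hedged sketch of typical usage of the helpers above (semantics inferred from the names and
   signatures, not documented here): MPIPetsc_Type_unwrap() obtains the underlying datatype of a
   possibly duplicated datatype, and MPIPetsc_Type_compare() reports whether two datatypes match.

     MPI_Datatype base;
     PetscBool    unwrapped,match;
     ierr = MPIPetsc_Type_unwrap(unit,&base,&unwrapped);CHKERRQ(ierr);
     ierr = MPIPetsc_Type_compare(base,MPIU_SCALAR,&match);CHKERRQ(ierr);
 */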

#if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
#define MPIU_Iscatter(a,b,c,d,e,f,g,h,req)     MPI_Iscatter(a,b,c,d,e,f,g,h,req)
#define MPIU_Iscatterv(a,b,c,d,e,f,g,h,i,req)  MPI_Iscatterv(a,b,c,d,e,f,g,h,i,req)
#define MPIU_Igather(a,b,c,d,e,f,g,h,req)      MPI_Igather(a,b,c,d,e,f,g,h,req)
#define MPIU_Igatherv(a,b,c,d,e,f,g,h,i,req)   MPI_Igatherv(a,b,c,d,e,f,g,h,i,req)
#define MPIU_Iallgather(a,b,c,d,e,f,g,req)     MPI_Iallgather(a,b,c,d,e,f,g,req)
#define MPIU_Iallgatherv(a,b,c,d,e,f,g,h,req)  MPI_Iallgatherv(a,b,c,d,e,f,g,h,req)
#define MPIU_Ialltoall(a,b,c,d,e,f,g,req)      MPI_Ialltoall(a,b,c,d,e,f,g,req)
#else
/* Ignore req, the MPI_Request argument, and use MPI blocking collectives. Callers should initialize
   req to MPI_REQUEST_NULL so that MPI_Wait(req,status) is valid regardless of whether the call was
   blocking or not.
 */
#define MPIU_Iscatter(a,b,c,d,e,f,g,h,req)     MPI_Scatter(a,b,c,d,e,f,g,h)
#define MPIU_Iscatterv(a,b,c,d,e,f,g,h,i,req)  MPI_Scatterv(a,b,c,d,e,f,g,h,i)
#define MPIU_Igather(a,b,c,d,e,f,g,h,req)      MPI_Gather(a,b,c,d,e,f,g,h)
#define MPIU_Igatherv(a,b,c,d,e,f,g,h,i,req)   MPI_Gatherv(a,b,c,d,e,f,g,h,i)
#define MPIU_Iallgather(a,b,c,d,e,f,g,req)     MPI_Allgather(a,b,c,d,e,f,g)
#define MPIU_Iallgatherv(a,b,c,d,e,f,g,h,req)  MPI_Allgatherv(a,b,c,d,e,f,g,h)
#define MPIU_Ialltoall(a,b,c,d,e,f,g,req)      MPI_Alltoall(a,b,c,d,e,f,g)
#endif
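
/* Sketch of the usage pattern the wrappers above are designed for: initialize the request to
   MPI_REQUEST_NULL so that the MPI_Wait() below is valid whether the collective was nonblocking
   (request set by MPI) or blocking (request left as MPI_REQUEST_NULL). Buffer names are made up.

     MPI_Request req = MPI_REQUEST_NULL;
     ierr = MPIU_Iallgather(sendbuf,1,MPIU_INT,recvbuf,1,MPIU_INT,comm,&req);CHKERRQ(ierr);
     ... independent local work can overlap here when nonblocking collectives are available ...
     ierr = MPI_Wait(&req,MPI_STATUS_IGNORE);CHKERRQ(ierr);
 */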

#if defined(PETSC_HAVE_CUDA)
PETSC_EXTERN PetscErrorCode PetscSFMalloc_Cuda(PetscMemType,size_t,void**);
PETSC_EXTERN PetscErrorCode PetscSFFree_Cuda(PetscMemType,void*);
#endif

#if defined(PETSC_HAVE_KOKKOS)
PETSC_EXTERN PetscErrorCode PetscSFMalloc_Kokkos(PetscMemType,size_t,void**);
PETSC_EXTERN PetscErrorCode PetscSFFree_Kokkos(PetscMemType,void*);
#endif

/* SF only supports CUDA and Kokkos devices. Although VECVIENNACL is also a device vector type, its
   device pointers are invisible to SF: through VecGetArray(), data of VECVIENNACL is copied from
   device to host and host pointers are passed to SF.
 */
#if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_KOKKOS)
  #define PetscSFMalloc(sf,mtype,sz,ptr)  ((*(sf)->ops->Malloc)(mtype,sz,ptr))
  /* Free memory and set ptr to NULL on success */
  #define PetscSFFree(sf,mtype,ptr)       ((ptr) && ((*(sf)->ops->Free)(mtype,ptr) || ((ptr)=NULL,0)))
#else
  /* In pure host builds, avoid the extra indirection */
  #define PetscSFMalloc(sf,mtype,sz,ptr)  PetscMalloc(sz,ptr)
  #define PetscSFFree(sf,mtype,ptr)       PetscFree(ptr)
#endif
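
/* Sketch of how the macros above are meant to be used inside SF code: the PetscMemType argument
   selects host or device allocation when a device-capable implementation supplies Malloc/Free;
   otherwise it is ignored and plain PetscMalloc()/PetscFree() are used. The buffer and size are
   made up.

     void *buf = NULL;
     ierr = PetscSFMalloc(sf,PETSC_MEMTYPE_HOST,100*sizeof(PetscScalar),&buf);CHKERRQ(ierr);
     ierr = PetscSFFree(sf,PETSC_MEMTYPE_HOST,buf);CHKERRQ(ierr);
 */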

#endif