Actual source code: sfimpl.h
petsc-3.14.2 2020-12-03
1: #if !defined(PETSCSFIMPL_H)
2: #define PETSCSFIMPL_H
4: #include <petscvec.h>
5: #include <petscsf.h>
6: #include <petsc/private/petscimpl.h>
7: #include <petscviewer.h>
9: #if defined(PETSC_HAVE_CUDA)
10: #include <cuda_runtime.h>
11: #endif
13: #if defined(PETSC_HAVE_HIP)
14: #include <hip/hip_runtime.h>
15: #endif
17: PETSC_EXTERN PetscLogEvent PETSCSF_SetGraph;
18: PETSC_EXTERN PetscLogEvent PETSCSF_SetUp;
19: PETSC_EXTERN PetscLogEvent PETSCSF_BcastBegin;
20: PETSC_EXTERN PetscLogEvent PETSCSF_BcastEnd;
21: PETSC_EXTERN PetscLogEvent PETSCSF_BcastAndOpBegin;
22: PETSC_EXTERN PetscLogEvent PETSCSF_BcastAndOpEnd;
23: PETSC_EXTERN PetscLogEvent PETSCSF_ReduceBegin;
24: PETSC_EXTERN PetscLogEvent PETSCSF_ReduceEnd;
25: PETSC_EXTERN PetscLogEvent PETSCSF_FetchAndOpBegin;
26: PETSC_EXTERN PetscLogEvent PETSCSF_FetchAndOpEnd;
27: PETSC_EXTERN PetscLogEvent PETSCSF_EmbedSF;
28: PETSC_EXTERN PetscLogEvent PETSCSF_DistSect;
29: PETSC_EXTERN PetscLogEvent PETSCSF_SectSF;
30: PETSC_EXTERN PetscLogEvent PETSCSF_RemoteOff;
31: PETSC_EXTERN PetscLogEvent PETSCSF_Pack;
32: PETSC_EXTERN PetscLogEvent PETSCSF_Unpack;
34: typedef enum {PETSCSF_ROOT2LEAF=0, PETSCSF_LEAF2ROOT} PetscSFDirection;
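/* PETSCSF_ROOT2LEAF is the direction in which root values are pushed to leaves (e.g. broadcast);
   PETSCSF_LEAF2ROOT is the direction in which leaf values are combined at roots (e.g. reduce) */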
35: typedef enum {PETSCSF_BCAST=0, PETSCSF_REDUCE, PETSCSF_FETCH} PetscSFOperation;
36: /* When doing device-aware MPI, a backend refers to the SF/device interface */
37: typedef enum {PETSCSF_BACKEND_INVALID=0,PETSCSF_BACKEND_CUDA,PETSCSF_BACKEND_KOKKOS} PetscSFBackend;
39: struct _PetscSFOps {
40: PetscErrorCode (*Reset)(PetscSF);
41: PetscErrorCode (*Destroy)(PetscSF);
42: PetscErrorCode (*SetUp)(PetscSF);
43: PetscErrorCode (*SetFromOptions)(PetscOptionItems*,PetscSF);
44: PetscErrorCode (*View)(PetscSF,PetscViewer);
45: PetscErrorCode (*Duplicate)(PetscSF,PetscSFDuplicateOption,PetscSF);
46: PetscErrorCode (*BcastAndOpBegin)(PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,void*,MPI_Op);
47: PetscErrorCode (*BcastAndOpEnd) (PetscSF,MPI_Datatype,const void*,void*,MPI_Op);
48: PetscErrorCode (*ReduceBegin) (PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,void*,MPI_Op);
49: PetscErrorCode (*ReduceEnd) (PetscSF,MPI_Datatype,const void*,void*,MPI_Op);
50: PetscErrorCode (*FetchAndOpBegin)(PetscSF,MPI_Datatype,PetscMemType,void*,PetscMemType,const void*,void*,MPI_Op);
51: PetscErrorCode (*FetchAndOpEnd) (PetscSF,MPI_Datatype,void*,const void*,void*,MPI_Op);
52: PetscErrorCode (*BcastToZero) (PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,void*); /* For internal use only */
53: PetscErrorCode (*GetRootRanks)(PetscSF,PetscInt*,const PetscMPIInt**,const PetscInt**,const PetscInt**,const PetscInt**);
54: PetscErrorCode (*GetLeafRanks)(PetscSF,PetscInt*,const PetscMPIInt**,const PetscInt**,const PetscInt**);
55: PetscErrorCode (*CreateLocalSF)(PetscSF,PetscSF*);
56: PetscErrorCode (*GetGraph)(PetscSF,PetscInt*,PetscInt*,const PetscInt**,const PetscSFNode**);
57: PetscErrorCode (*CreateEmbeddedSF)(PetscSF,PetscInt,const PetscInt*,PetscSF*);
58: PetscErrorCode (*CreateEmbeddedLeafSF)(PetscSF,PetscInt,const PetscInt*,PetscSF*);
60: PetscErrorCode (*Malloc)(PetscMemType,size_t,void**);
61: PetscErrorCode (*Free)(PetscMemType,void*);
62: };
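/* Illustrative sketch (the type name "Foo" and the routines below are hypothetical placeholders,
   not part of this header): an SF implementation fills in this function table from its create
   routine, e.g.

     static PetscErrorCode PetscSFCreate_Foo(PetscSF sf)
     {
       PetscFunctionBegin;
       sf->ops->SetUp           = PetscSFSetUp_Foo;
       sf->ops->Reset           = PetscSFReset_Foo;
       sf->ops->Destroy         = PetscSFDestroy_Foo;
       sf->ops->BcastAndOpBegin = PetscSFBcastAndOpBegin_Foo;
       sf->ops->BcastAndOpEnd   = PetscSFBcastAndOpEnd_Foo;
       PetscFunctionReturn(0);
     }
*/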
64: typedef struct _n_PetscSFPackOpt *PetscSFPackOpt;
66: struct _p_PetscSF {
67: PETSCHEADER(struct _PetscSFOps);
68: PetscInt nroots; /* Number of root vertices on current process (candidates for incoming edges) */
69: PetscInt nleaves; /* Number of leaf vertices on current process (this process specifies a root for each leaf) */
70: PetscInt *mine; /* Location of leaves in leafdata arrays provided to the communication routines */
71: PetscInt *mine_alloc;
72: PetscInt minleaf,maxleaf;
73: PetscSFNode *remote; /* Remote references to roots for each local leaf */
74: PetscSFNode *remote_alloc;
75: PetscInt nranks; /* Number of ranks owning roots connected to my leaves */
76: PetscInt ndranks; /* Number of ranks in distinguished group holding roots connected to my leaves */
77: PetscMPIInt *ranks; /* List of ranks referenced by "remote" */
78: PetscInt *roffset; /* Array of length nranks+1, offset in rmine/rremote for each rank */
79: PetscInt *rmine; /* Concatenated array holding local indices referencing each remote rank */
80: PetscInt *rmine_d[2]; /* A copy of rmine[local/remote] in device memory if needed */
82: /* Some results useful in packing by analyzing rmine[] */
83: PetscInt leafbuflen[2]; /* Length (in units) of leaf buffers, in layout of [PETSCSF_LOCAL/REMOTE] */
84: PetscBool leafcontig[2]; /* True means indices in rmine[self part] or rmine[remote part] are contiguous, and they start from ... */
85: PetscInt leafstart[2]; /* ... leafstart[0] and leafstart[1] respectively */
86: PetscSFPackOpt leafpackopt[2]; /* Optimization plans to (un)pack leaves connected to remote roots, based on index patterns in rmine[]. NULL for no optimization */
87: PetscSFPackOpt leafpackopt_d[2];/* Copy of leafpackopt[] on device if needed */
88: PetscBool leafdups[2]; /* Indices in rmine[] for self(0)/remote(1) communication have duplicates? TRUE implies threads working on them in parallel may have data races. */
90: PetscInt nleafreqs; /* Number of MPI requests for leaves */
91: PetscInt *rremote; /* Concatenated array holding remote indices referenced for each remote rank */
92: PetscBool degreeknown; /* The degree is currently known, do not have to recompute */
93: PetscInt *degree; /* Degree of each of my root vertices */
94: PetscInt *degreetmp; /* Temporary local array for computing degree */
95: PetscBool rankorder; /* Sort ranks for gather and scatter operations */
96: MPI_Group ingroup; /* Group of processes connected to my roots */
97: MPI_Group outgroup; /* Group of processes connected to my leaves */
98: PetscSF multi; /* Internal graph used to implement gather and scatter operations */
99: PetscBool graphset; /* Flag indicating that the graph has been set, required before calling communication routines */
100: PetscBool setupcalled; /* Type and communication structures have been set up */
101: PetscSFPattern pattern; /* Pattern of the graph */
102: PetscBool persistent; /* Does this SF use MPI persistent requests for communication */
103: PetscLayout map; /* Layout of leaves over all processes when building a patterned graph */
104: PetscBool use_default_stream; /* If true, SF assumes root/leafdata are on the default stream upon input and will also leave them there upon output */
105: PetscBool use_gpu_aware_mpi; /* If true, SF assumes it can pass GPU pointers to MPI */
106: PetscBool use_stream_aware_mpi; /* If true, SF assumes the underlying MPI is CUDA-stream aware and we won't sync streams for send/recv buffers passed to MPI */
107: #if defined(PETSC_HAVE_CUDA)
108: PetscInt maxResidentThreadsPerGPU;
109: #endif
110: PetscSFBackend backend; /* The device backend (if any) SF will use */
111: void *data; /* Pointer to implementation */
112: };
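/* Illustrative sketch (an assumption drawn from the field comments above, not code in this header):
   ranks/roffset/rmine/rremote form a CSR-like layout, so the leaves connected to each remote rank can
   be traversed as follows, where leaf is the local leaf index on this process and root is the root
   index on rank sf->ranks[r]:

     PetscInt r,i,leaf,root;
     for (r=0; r<sf->nranks; r++) {
       for (i=sf->roffset[r]; i<sf->roffset[r+1]; i++) {
         leaf = sf->rmine[i];
         root = sf->rremote[i];
       }
     }
*/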
114: PETSC_EXTERN PetscBool PetscSFRegisterAllCalled;
115: PETSC_EXTERN PetscErrorCode PetscSFRegisterAll(void);
117: PETSC_INTERN PetscErrorCode PetscSFCreateLocalSF_Private(PetscSF,PetscSF*);
118: PETSC_INTERN PetscErrorCode PetscSFBcastToZero_Private(PetscSF,MPI_Datatype,const void*,void*);
120: PETSC_EXTERN PetscErrorCode MPIPetsc_Type_unwrap(MPI_Datatype,MPI_Datatype*,PetscBool*);
121: PETSC_EXTERN PetscErrorCode MPIPetsc_Type_compare(MPI_Datatype,MPI_Datatype,PetscBool*);
122: PETSC_EXTERN PetscErrorCode MPIPetsc_Type_compare_contig(MPI_Datatype,MPI_Datatype,PetscInt*);
124: #if defined(PETSC_HAVE_MPI_NONBLOCKING_COLLECTIVES)
125: #define MPIU_Iscatter(a,b,c,d,e,f,g,h,req) MPI_Iscatter(a,b,c,d,e,f,g,h,req)
126: #define MPIU_Iscatterv(a,b,c,d,e,f,g,h,i,req) MPI_Iscatterv(a,b,c,d,e,f,g,h,i,req)
127: #define MPIU_Igather(a,b,c,d,e,f,g,h,req) MPI_Igather(a,b,c,d,e,f,g,h,req)
128: #define MPIU_Igatherv(a,b,c,d,e,f,g,h,i,req) MPI_Igatherv(a,b,c,d,e,f,g,h,i,req)
129: #define MPIU_Iallgather(a,b,c,d,e,f,g,req) MPI_Iallgather(a,b,c,d,e,f,g,req)
130: #define MPIU_Iallgatherv(a,b,c,d,e,f,g,h,req) MPI_Iallgatherv(a,b,c,d,e,f,g,h,req)
131: #define MPIU_Ialltoall(a,b,c,d,e,f,g,req) MPI_Ialltoall(a,b,c,d,e,f,g,req)
132: #else
133: /* Ignore req, the MPI_Request argument, and use MPI blocking collectives. One should initialize req
134:    to MPI_REQUEST_NULL so that MPI_Wait(req,status) can be called regardless of whether the underlying
135:    call was blocking or not (see the usage sketch after this block). */
136: #define MPIU_Iscatter(a,b,c,d,e,f,g,h,req) MPI_Scatter(a,b,c,d,e,f,g,h)
137: #define MPIU_Iscatterv(a,b,c,d,e,f,g,h,i,req) MPI_Scatterv(a,b,c,d,e,f,g,h,i)
138: #define MPIU_Igather(a,b,c,d,e,f,g,h,req) MPI_Gather(a,b,c,d,e,f,g,h)
139: #define MPIU_Igatherv(a,b,c,d,e,f,g,h,i,req) MPI_Gatherv(a,b,c,d,e,f,g,h,i)
140: #define MPIU_Iallgather(a,b,c,d,e,f,g,req) MPI_Allgather(a,b,c,d,e,f,g)
141: #define MPIU_Iallgatherv(a,b,c,d,e,f,g,h,req) MPI_Allgatherv(a,b,c,d,e,f,g,h)
142: #define MPIU_Ialltoall(a,b,c,d,e,f,g,req) MPI_Alltoall(a,b,c,d,e,f,g)
143: #endif
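/* Usage sketch (illustrative, not part of this header): initializing the request to MPI_REQUEST_NULL
   makes the MPI_Wait below a no-op when the blocking fallbacks above are used, so the same calling code
   works with either branch. The receive buffer size is hypothetical and assumes a communicator with at
   most 64 ranks.

     PetscInt    sendval = 0,recvvals[64];
     MPI_Request req = MPI_REQUEST_NULL;
     MPIU_Iallgather(&sendval,1,MPIU_INT,recvvals,1,MPIU_INT,PETSC_COMM_WORLD,&req);
     MPI_Wait(&req,MPI_STATUS_IGNORE);
*/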
145: #if defined(PETSC_HAVE_CUDA)
146: PETSC_EXTERN PetscErrorCode PetscSFMalloc_Cuda(PetscMemType,size_t,void**);
147: PETSC_EXTERN PetscErrorCode PetscSFFree_Cuda(PetscMemType,void*);
148: #endif
150: #if defined(PETSC_HAVE_KOKKOS)
151: PETSC_EXTERN PetscErrorCode PetscSFMalloc_Kokkos(PetscMemType,size_t,void**);
152: PETSC_EXTERN PetscErrorCode PetscSFFree_Kokkos(PetscMemType,void*);
153: #endif
155: /* SF only supports CUDA and Kokkos devices. Even though VIENNACL is a device type, its device pointers are
156:    invisible to SF. Through VecGetArray(), we copy data of a VECVIENNACL from device to host and pass host pointers to SF.
157: */
158: #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_KOKKOS)
159: #define PetscSFMalloc(sf,mtype,sz,ptr) ((*(sf)->ops->Malloc)(mtype,sz,ptr))
160: /* Free memory and set ptr to NULL on success; see the usage sketch after this block */
161: #define PetscSFFree(sf,mtype,ptr) ((ptr) && ((*(sf)->ops->Free)(mtype,ptr) || ((ptr)=NULL,0)))
162: #else
163: /* For pure host code, use less indirection */
164: #define PetscSFMalloc(sf,mtype,sz,ptr) PetscMalloc(sz,ptr)
165: #define PetscSFFree(sf,mtype,ptr) PetscFree(ptr)
166: #endif
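/* Usage sketch (illustrative, not part of this header; sf is assumed to be an already set-up PetscSF):
   allocate and release a buffer whose location matches the given memory type. On device builds this
   dispatches through sf->ops; on host-only builds it reduces to PetscMalloc/PetscFree. PetscSFFree also
   resets the pointer to NULL on success.

     void           *buf = NULL;
     PetscErrorCode  ierr;
     ierr = PetscSFMalloc(sf,PETSC_MEMTYPE_HOST,100*sizeof(PetscScalar),&buf);CHKERRQ(ierr);
     ierr = PetscSFFree(sf,PETSC_MEMTYPE_HOST,buf);CHKERRQ(ierr);
*/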
168: #endif