Actual source code: cudavecimpl.h


  4: #include <petscvec.h>
  5: #include <petsc/private/deviceimpl.h>
  6: #include <petsc/private/vecimpl.h>

  /*
     Vec_CUDA - CUDA-side bookkeeping for a vector: the device array currently in use, the array
     PETSc itself allocated (if any), a stream for transfers, and whether the allocation is in NVSHMEM.

     NOTE(review): presumably this struct is hung off the Vec's GPU-specific data pointer; confirm
     where it is attached in the implementation files.
  */
  8: typedef struct {
  9:   PetscScalar  *GPUarray;           /* the array in use; this always holds the GPU data */
 10:   PetscScalar  *GPUarray_allocated; /* pointer to the array when it was allocated by PETSc; presumably it can differ from
                                            GPUarray while a caller-provided array is placed (cf. VecPlaceArray_SeqCUDA) - confirm */
 11:   cudaStream_t stream;              /* a stream for doing asynchronous data transfers */
 12:   PetscBool    nvshmem;             /* is GPUarray_allocated allocated in NVSHMEM? Used to allocate Mvctx->lvec in NVSHMEM */
 13: } Vec_CUDA;

 /*
    Prototypes for the CUDA implementations of the Vec operations: the sequential (_SeqCUDA) kernels and
    array accessors, plus the _SeqCUDA/_MPICUDA/_CUDA constructors and destructors. Every routine reports
    failure through its PetscErrorCode return value.

    PETSC_INTERN and PETSC_EXTERN are PETSc's linkage/visibility macros (defined outside this file).
    NOTE(review): the few PETSC_EXTERN routines (VecSet, VecCopy, VecAXPY, VecCreate for SeqCUDA) are
    presumably the ones that must be callable from outside the Vec library - confirm before changing them.
 */
 15: PETSC_INTERN PetscErrorCode VecCUDAGetArrays_Private(Vec,const PetscScalar**,const PetscScalar**,PetscOffloadMask*);
 16: PETSC_INTERN PetscErrorCode VecDotNorm2_SeqCUDA(Vec,Vec,PetscScalar*, PetscScalar*);
 17: PETSC_INTERN PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec,Vec,Vec);
 18: PETSC_INTERN PetscErrorCode VecWAXPY_SeqCUDA(Vec,PetscScalar,Vec,Vec);
 19: PETSC_INTERN PetscErrorCode VecMDot_SeqCUDA(Vec,PetscInt,const Vec[],PetscScalar*);
 20: PETSC_EXTERN PetscErrorCode VecSet_SeqCUDA(Vec,PetscScalar);
 21: PETSC_INTERN PetscErrorCode VecMAXPY_SeqCUDA(Vec,PetscInt,const PetscScalar*,Vec*);
 22: PETSC_INTERN PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec,PetscScalar,PetscScalar,PetscScalar,Vec,Vec);
 23: PETSC_INTERN PetscErrorCode VecPointwiseMult_SeqCUDA(Vec,Vec,Vec);
 24: PETSC_INTERN PetscErrorCode VecPlaceArray_SeqCUDA(Vec,const PetscScalar*);
 25: PETSC_INTERN PetscErrorCode VecResetArray_SeqCUDA(Vec);
 26: PETSC_INTERN PetscErrorCode VecReplaceArray_SeqCUDA(Vec,const PetscScalar*);
 27: PETSC_INTERN PetscErrorCode VecDot_SeqCUDA(Vec,Vec,PetscScalar*);
 28: PETSC_INTERN PetscErrorCode VecTDot_SeqCUDA(Vec,Vec,PetscScalar*);
 29: PETSC_INTERN PetscErrorCode VecScale_SeqCUDA(Vec,PetscScalar);
 30: PETSC_EXTERN PetscErrorCode VecCopy_SeqCUDA(Vec,Vec);
 31: PETSC_INTERN PetscErrorCode VecSwap_SeqCUDA(Vec,Vec);
 32: PETSC_EXTERN PetscErrorCode VecAXPY_SeqCUDA(Vec,PetscScalar,Vec);
 33: PETSC_INTERN PetscErrorCode VecAXPBY_SeqCUDA(Vec,PetscScalar,PetscScalar,Vec);
 34: PETSC_INTERN PetscErrorCode VecDuplicate_SeqCUDA(Vec,Vec*);
 35: PETSC_INTERN PetscErrorCode VecConjugate_SeqCUDA(Vec xin);
 36: PETSC_INTERN PetscErrorCode VecNorm_SeqCUDA(Vec,NormType,PetscReal*);
 37: PETSC_INTERN PetscErrorCode VecCUDACopyToGPU(Vec);
 38: PETSC_INTERN PetscErrorCode VecCUDAAllocateCheck(Vec);
 39: PETSC_EXTERN PetscErrorCode VecCreate_SeqCUDA(Vec);
 40: PETSC_INTERN PetscErrorCode VecCreate_SeqCUDA_Private(Vec,const PetscScalar*);
 41: PETSC_INTERN PetscErrorCode VecCreate_MPICUDA(Vec);
 42: PETSC_INTERN PetscErrorCode VecCreate_MPICUDA_Private(Vec,PetscBool,PetscInt,const PetscScalar*);
 43: PETSC_INTERN PetscErrorCode VecCreate_CUDA(Vec);
 44: PETSC_INTERN PetscErrorCode VecDestroy_SeqCUDA(Vec);
 45: PETSC_INTERN PetscErrorCode VecDestroy_MPICUDA(Vec);
 46: PETSC_INTERN PetscErrorCode VecAYPX_SeqCUDA(Vec,PetscScalar,Vec);
 47: PETSC_INTERN PetscErrorCode VecSetRandom_SeqCUDA(Vec,PetscRandom);
 48: PETSC_INTERN PetscErrorCode VecGetLocalVector_SeqCUDA(Vec,Vec);
 49: PETSC_INTERN PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec,Vec);
 50: PETSC_INTERN PetscErrorCode VecGetLocalVectorRead_SeqCUDA(Vec,Vec);
 51: PETSC_INTERN PetscErrorCode VecRestoreLocalVectorRead_SeqCUDA(Vec,Vec);
 52: PETSC_INTERN PetscErrorCode VecGetArrayWrite_SeqCUDA(Vec,PetscScalar**);
 53: PETSC_INTERN PetscErrorCode VecGetArray_SeqCUDA(Vec,PetscScalar**);
 54: PETSC_INTERN PetscErrorCode VecRestoreArray_SeqCUDA(Vec,PetscScalar**);
 55: PETSC_INTERN PetscErrorCode VecGetArrayAndMemType_SeqCUDA(Vec,PetscScalar**,PetscMemType*);
 56: PETSC_INTERN PetscErrorCode VecRestoreArrayAndMemType_SeqCUDA(Vec,PetscScalar**);
 57: PETSC_INTERN PetscErrorCode VecGetArrayWriteAndMemType_SeqCUDA(Vec,PetscScalar**,PetscMemType*);
 58: PETSC_INTERN PetscErrorCode VecCopy_SeqCUDA_Private(Vec,Vec);
 59: PETSC_INTERN PetscErrorCode VecDestroy_SeqCUDA_Private(Vec);
 60: PETSC_INTERN PetscErrorCode VecResetArray_SeqCUDA_Private(Vec);
 61: PETSC_INTERN PetscErrorCode VecMax_SeqCUDA(Vec,PetscInt*,PetscReal*);
 62: PETSC_INTERN PetscErrorCode VecMin_SeqCUDA(Vec,PetscInt*,PetscReal*);
 63: PETSC_INTERN PetscErrorCode VecReciprocal_SeqCUDA(Vec);
 64: PETSC_INTERN PetscErrorCode VecSum_SeqCUDA(Vec,PetscScalar*);
 65: PETSC_INTERN PetscErrorCode VecShift_SeqCUDA(Vec,PetscScalar);

 /*
    NVSHMEM helpers; declared only when PETSc is configured with NVSHMEM (PETSC_HAVE_NVSHMEM).
    Every routine reports failure through its PetscErrorCode return value.
 */
 67: #if defined(PETSC_HAVE_NVSHMEM)
 68: PETSC_EXTERN PetscErrorCode PetscNvshmemInitializeCheck(void);
 69: PETSC_EXTERN PetscErrorCode PetscNvshmemMalloc(size_t,void**);
 70: PETSC_EXTERN PetscErrorCode PetscNvshmemCalloc(size_t,void**);
 71: PETSC_EXTERN PetscErrorCode PetscNvshmemFree_Private(void*);
 /*
    PetscNvshmemFree - frees ptr through PetscNvshmemFree_Private() and then sets ptr to NULL.

    Evaluates to 0 when ptr is already NULL (nothing is freed) or when the free succeeds, and to a
    nonzero value when PetscNvshmemFree_Private() reports an error, so the result can be error-checked
    by the caller. As with the `call || (ptr = NULL,0)` idiom in general, the nonzero value is the
    result of `||` (i.e. 1), not the original error code; on failure ptr is left untouched.

    The previous form used a comma expression whose value was always 0, silently discarding the
    error code of PetscNvshmemFree_Private().

    ptr must be a modifiable lvalue and is evaluated more than once; do not pass an expression with
    side effects.
 */
 72: #define      PetscNvshmemFree(ptr)      ((ptr) && (PetscNvshmemFree_Private(ptr) || ((ptr)=NULL,0)))
 73: PETSC_INTERN PetscErrorCode PetscNvshmemSum(PetscInt,PetscScalar*,const PetscScalar*);
 74: PETSC_INTERN PetscErrorCode PetscNvshmemMax(PetscInt,PetscReal*,const PetscReal*);
 75: PETSC_INTERN PetscErrorCode VecNormAsync_NVSHMEM(Vec,NormType,PetscReal*);
 76: PETSC_INTERN PetscErrorCode VecAllocateNVSHMEM_SeqCUDA(Vec);
 77: #endif /* PETSC_HAVE_NVSHMEM */

 /*
    cublasX* - precision-agnostic names for the cuBLAS routines, selected at compile time from
    PETSC_USE_COMPLEX and PETSC_USE_REAL_SINGLE:

      complex + single -> cublasC* / cublasSc* / cublasIc*   (pointer arguments cast to cuComplex*)
      complex + double -> cublasZ* / cublasDz* / cublasIz*   (pointer arguments cast to cuDoubleComplex*)
      real    + single -> cublasS* / cublasIs*               (plain aliases, no casts)
      real    + double -> cublasD* / cublasId*               (plain aliases, no casts)

    In the complex branches cublasXdot maps to the conjugated dot product (dotc) and cublasXdotu to the
    unconjugated one (dotu); in the real branches both map to the same routine.

    The complex variants are function-like macros with a fixed argument count, so they must be invoked
    with exactly the number of arguments listed; the C-style casts also drop any const qualifier on the
    pointer arguments.
 */
 79: /* complex scalars, single precision */
 80: #if defined(PETSC_USE_COMPLEX)
 81: #if defined(PETSC_USE_REAL_SINGLE)
 82: #define cublasXaxpy(a,b,c,d,e,f,g)               cublasCaxpy((a),(b),(cuComplex*)(c),(cuComplex*)(d),(e),(cuComplex*)(f),(g))
 83: #define cublasXscal(a,b,c,d,e)                   cublasCscal((a),(b),(cuComplex*)(c),(cuComplex*)(d),(e))
 84: #define cublasXdotu(a,b,c,d,e,f,g)               cublasCdotu((a),(b),(cuComplex*)(c),(d),(cuComplex*)(e),(f),(cuComplex*)(g))
 85: #define cublasXdot(a,b,c,d,e,f,g)                cublasCdotc((a),(b),(cuComplex*)(c),(d),(cuComplex*)(e),(f),(cuComplex*)(g))
 86: #define cublasXswap(a,b,c,d,e,f)                 cublasCswap((a),(b),(cuComplex*)(c),(d),(cuComplex*)(e),(f))
 87: #define cublasXnrm2(a,b,c,d,e)                   cublasScnrm2((a),(b),(cuComplex*)(c),(d),(e))
 88: #define cublasIXamax(a,b,c,d,e)                  cublasIcamax((a),(b),(cuComplex*)(c),(d),(e))
 89: #define cublasXasum(a,b,c,d,e)                   cublasScasum((a),(b),(cuComplex*)(c),(d),(e))
 90: #define cublasXgemv(a,b,c,d,e,f,g,h,i,j,k,l)     cublasCgemv((a),(b),(c),(d),(cuComplex*)(e),(cuComplex*)(f),(g),(cuComplex*)(h),(i),(cuComplex*)(j),(cuComplex*)(k),(l))
 91: #define cublasXgemm(a,b,c,d,e,f,g,h,i,j,k,l,m,n) cublasCgemm((a),(b),(c),(d),(e),(f),(cuComplex*)(g),(cuComplex*)(h),(i),(cuComplex*)(j),(k),(cuComplex*)(l),(cuComplex*)(m),(n))
 92: #define cublasXgeam(a,b,c,d,e,f,g,h,i,j,k,l,m)   cublasCgeam((a),(b),(c),(d),(e),(cuComplex*)(f),(cuComplex*)(g),(h),(cuComplex*)(i),(cuComplex*)(j),(k),(cuComplex*)(l),(m))
 93: #else /* complex scalars, double precision */
 94: #define cublasXaxpy(a,b,c,d,e,f,g)               cublasZaxpy((a),(b),(cuDoubleComplex*)(c),(cuDoubleComplex*)(d),(e),(cuDoubleComplex*)(f),(g))
 95: #define cublasXscal(a,b,c,d,e)                   cublasZscal((a),(b),(cuDoubleComplex*)(c),(cuDoubleComplex*)(d),(e))
 96: #define cublasXdotu(a,b,c,d,e,f,g)               cublasZdotu((a),(b),(cuDoubleComplex*)(c),(d),(cuDoubleComplex*)(e),(f),(cuDoubleComplex*)(g))
 97: #define cublasXdot(a,b,c,d,e,f,g)                cublasZdotc((a),(b),(cuDoubleComplex*)(c),(d),(cuDoubleComplex*)(e),(f),(cuDoubleComplex*)(g))
 98: #define cublasXswap(a,b,c,d,e,f)                 cublasZswap((a),(b),(cuDoubleComplex*)(c),(d),(cuDoubleComplex*)(e),(f))
 99: #define cublasXnrm2(a,b,c,d,e)                   cublasDznrm2((a),(b),(cuDoubleComplex*)(c),(d),(e))
100: #define cublasIXamax(a,b,c,d,e)                  cublasIzamax((a),(b),(cuDoubleComplex*)(c),(d),(e))
101: #define cublasXasum(a,b,c,d,e)                   cublasDzasum((a),(b),(cuDoubleComplex*)(c),(d),(e))
102: #define cublasXgemv(a,b,c,d,e,f,g,h,i,j,k,l)     cublasZgemv((a),(b),(c),(d),(cuDoubleComplex*)(e),(cuDoubleComplex*)(f),(g),(cuDoubleComplex*)(h),(i),(cuDoubleComplex*)(j),(cuDoubleComplex*)(k),(l))
103: #define cublasXgemm(a,b,c,d,e,f,g,h,i,j,k,l,m,n) cublasZgemm((a),(b),(c),(d),(e),(f),(cuDoubleComplex*)(g),(cuDoubleComplex*)(h),(i),(cuDoubleComplex*)(j),(k),(cuDoubleComplex*)(l),(cuDoubleComplex*)(m),(n))
104: #define cublasXgeam(a,b,c,d,e,f,g,h,i,j,k,l,m)   cublasZgeam((a),(b),(c),(d),(e),(cuDoubleComplex*)(f),(cuDoubleComplex*)(g),(h),(cuDoubleComplex*)(i),(cuDoubleComplex*)(j),(k),(cuDoubleComplex*)(l),(m))
105: #endif /* PETSC_USE_REAL_SINGLE (complex branch) */
106: #else /* real scalars, single precision */
107: #if defined(PETSC_USE_REAL_SINGLE)
108: #define cublasXaxpy  cublasSaxpy
109: #define cublasXscal  cublasSscal
110: #define cublasXdotu  cublasSdot
111: #define cublasXdot   cublasSdot
112: #define cublasXswap  cublasSswap
113: #define cublasXnrm2  cublasSnrm2
114: #define cublasIXamax cublasIsamax
115: #define cublasXasum  cublasSasum
116: #define cublasXgemv  cublasSgemv
117: #define cublasXgemm  cublasSgemm
118: #define cublasXgeam  cublasSgeam
119: #else /* real scalars, double precision */
120: #define cublasXaxpy  cublasDaxpy
121: #define cublasXscal  cublasDscal
122: #define cublasXdotu  cublasDdot
123: #define cublasXdot   cublasDdot
124: #define cublasXswap  cublasDswap
125: #define cublasXnrm2  cublasDnrm2
126: #define cublasIXamax cublasIdamax
127: #define cublasXasum  cublasDasum
128: #define cublasXgemv  cublasDgemv
129: #define cublasXgemm  cublasDgemm
130: #define cublasXgeam  cublasDgeam
131: #endif /* PETSC_USE_REAL_SINGLE (real branch) */
132: #endif /* PETSC_USE_COMPLEX */

134: #endif