Actual source code: mpicuda.cu
2: /*
3: This file contains routines for Parallel vector operations.
4: */
5: #define PETSC_SKIP_SPINLOCK
7: #include <petscconf.h>
8: #include <../src/vec/vec/impls/mpi/pvecimpl.h>
9: #include <petsc/private/cudavecimpl.h>
11: /*MC
12: VECCUDA - VECCUDA = "cuda" - A VECSEQCUDA on a single-process communicator, and VECMPICUDA otherwise.
14: Options Database Keys:
15: . -vec_type cuda - sets the vector type to VECCUDA during a call to VecSetFromOptions()
17: Level: beginner
19: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateMPIWithArray(), VECSEQCUDA, VECMPICUDA, VECSTANDARD, VecType, VecCreateMPI(), VecSetPinnedMemoryMin()
20: M*/
22: PetscErrorCode VecDestroy_MPICUDA(Vec v)
23: {
24: Vec_MPI *vecmpi = (Vec_MPI*)v->data;
25: Vec_CUDA *veccuda;
27: if (v->spptr) {
28: veccuda = (Vec_CUDA*)v->spptr;
29: if (veccuda->GPUarray_allocated) {
30: cudaFree(((Vec_CUDA*)v->spptr)->GPUarray_allocated);
31: veccuda->GPUarray_allocated = NULL;
32: }
33: if (veccuda->stream) {
34: cudaStreamDestroy(((Vec_CUDA*)v->spptr)->stream);
35: }
36: if (v->pinned_memory) {
37: PetscMallocSetCUDAHost();
38: PetscFree(vecmpi->array_allocated);
39: PetscMallocResetCUDAHost();
40: v->pinned_memory = PETSC_FALSE;
41: }
42: PetscFree(v->spptr);
43: }
44: VecDestroy_MPI(v);
45: return 0;
46: }
48: PetscErrorCode VecNorm_MPICUDA(Vec xin,NormType type,PetscReal *z)
49: {
50: PetscReal sum,work = 0.0;
52: if (type == NORM_2 || type == NORM_FROBENIUS) {
53: VecNorm_SeqCUDA(xin,NORM_2,&work);
54: work *= work;
55: MPIU_Allreduce(&work,&sum,1,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)xin));
56: *z = PetscSqrtReal(sum);
57: } else if (type == NORM_1) {
58: /* Find the local part */
59: VecNorm_SeqCUDA(xin,NORM_1,&work);
60: /* Find the global max */
61: MPIU_Allreduce(&work,z,1,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)xin));
62: } else if (type == NORM_INFINITY) {
63: /* Find the local max */
64: VecNorm_SeqCUDA(xin,NORM_INFINITY,&work);
65: /* Find the global max */
66: MPIU_Allreduce(&work,z,1,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)xin));
67: } else if (type == NORM_1_AND_2) {
68: PetscReal temp[2];
69: VecNorm_SeqCUDA(xin,NORM_1,temp);
70: VecNorm_SeqCUDA(xin,NORM_2,temp+1);
71: temp[1] = temp[1]*temp[1];
72: MPIU_Allreduce(temp,z,2,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)xin));
73: z[1] = PetscSqrtReal(z[1]);
74: }
75: return 0;
76: }
78: PetscErrorCode VecDot_MPICUDA(Vec xin,Vec yin,PetscScalar *z)
79: {
80: PetscScalar sum,work;
82: VecDot_SeqCUDA(xin,yin,&work);
83: MPIU_Allreduce(&work,&sum,1,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)xin));
84: *z = sum;
85: return 0;
86: }
88: PetscErrorCode VecTDot_MPICUDA(Vec xin,Vec yin,PetscScalar *z)
89: {
90: PetscScalar sum,work;
92: VecTDot_SeqCUDA(xin,yin,&work);
93: MPIU_Allreduce(&work,&sum,1,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)xin));
94: *z = sum;
95: return 0;
96: }
98: PetscErrorCode VecMDot_MPICUDA(Vec xin,PetscInt nv,const Vec y[],PetscScalar *z)
99: {
100: PetscScalar awork[128],*work = awork;
102: if (nv > 128) {
103: PetscMalloc1(nv,&work);
104: }
105: VecMDot_SeqCUDA(xin,nv,y,work);
106: MPIU_Allreduce(work,z,nv,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)xin));
107: if (nv > 128) {
108: PetscFree(work);
109: }
110: return 0;
111: }
113: /*MC
114: VECMPICUDA - VECMPICUDA = "mpicuda" - The basic parallel vector, modified to use CUDA
116: Options Database Keys:
117: . -vec_type mpicuda - sets the vector type to VECMPICUDA during a call to VecSetFromOptions()
119: Level: beginner
121: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateMPIWithArray(), VECMPI, VecType, VecCreateMPI(), VecSetPinnedMemoryMin()
122: M*/
124: PetscErrorCode VecDuplicate_MPICUDA(Vec win,Vec *v)
125: {
126: Vec_MPI *vw,*w = (Vec_MPI*)win->data;
127: PetscScalar *array;
129: VecCreate(PetscObjectComm((PetscObject)win),v);
130: PetscLayoutReference(win->map,&(*v)->map);
132: VecCreate_MPICUDA_Private(*v,PETSC_TRUE,w->nghost,0);
133: vw = (Vec_MPI*)(*v)->data;
134: PetscMemcpy((*v)->ops,win->ops,sizeof(struct _VecOps));
136: /* save local representation of the parallel vector (and scatter) if it exists */
137: if (w->localrep) {
138: VecGetArray(*v,&array);
139: VecCreateSeqWithArray(PETSC_COMM_SELF,1,win->map->n+w->nghost,array,&vw->localrep);
140: PetscMemcpy(vw->localrep->ops,w->localrep->ops,sizeof(struct _VecOps));
141: VecRestoreArray(*v,&array);
142: PetscLogObjectParent((PetscObject)*v,(PetscObject)vw->localrep);
143: vw->localupdate = w->localupdate;
144: if (vw->localupdate) {
145: PetscObjectReference((PetscObject)vw->localupdate);
146: }
147: }
149: /* New vector should inherit stashing property of parent */
150: (*v)->stash.donotstash = win->stash.donotstash;
151: (*v)->stash.ignorenegidx = win->stash.ignorenegidx;
153: /* change type_name appropriately */
154: VecCUDAAllocateCheck(*v);
155: PetscObjectChangeTypeName((PetscObject)(*v),VECMPICUDA);
157: PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*v))->olist);
158: PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*v))->qlist);
159: (*v)->map->bs = PetscAbs(win->map->bs);
160: (*v)->bstash.bs = win->bstash.bs;
161: return 0;
162: }
164: PetscErrorCode VecDotNorm2_MPICUDA(Vec s,Vec t,PetscScalar *dp,PetscScalar *nm)
165: {
166: PetscScalar work[2],sum[2];
168: VecDotNorm2_SeqCUDA(s,t,work,work+1);
169: MPIU_Allreduce(&work,&sum,2,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)s));
170: *dp = sum[0];
171: *nm = sum[1];
172: return 0;
173: }
175: PetscErrorCode VecCreate_MPICUDA(Vec vv)
176: {
177: PetscDeviceInitialize(PETSC_DEVICE_CUDA);
178: PetscLayoutSetUp(vv->map);
179: VecCUDAAllocateCheck(vv);
180: VecCreate_MPICUDA_Private(vv,PETSC_FALSE,0,((Vec_CUDA*)vv->spptr)->GPUarray_allocated);
181: VecCUDAAllocateCheckHost(vv);
182: VecSet(vv,0.0);
183: VecSet_Seq(vv,0.0);
184: vv->offloadmask = PETSC_OFFLOAD_BOTH;
185: return 0;
186: }
188: PetscErrorCode VecCreate_CUDA(Vec v)
189: {
190: PetscMPIInt size;
192: MPI_Comm_size(PetscObjectComm((PetscObject)v),&size);
193: if (size == 1) {
194: VecSetType(v,VECSEQCUDA);
195: } else {
196: VecSetType(v,VECMPICUDA);
197: }
198: return 0;
199: }
201: /*@
202: VecCreateMPICUDA - Creates a standard, parallel array-style vector for CUDA devices.
204: Collective
206: Input Parameters:
207: + comm - the MPI communicator to use
208: . n - local vector length (or PETSC_DECIDE to have calculated if N is given)
209: - N - global vector length (or PETSC_DETERMINE to have calculated if n is given)
211: Output Parameter:
212: . v - the vector
214: Notes:
215: Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
216: same type as an existing vector.
218: Level: intermediate
220: .seealso: VecCreateMPICUDAWithArray(), VecCreateMPICUDAWithArrays(), VecCreateSeqCUDA(), VecCreateSeq(),
221: VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost(),
222: VecCreateMPIWithArray(), VecCreateGhostWithArray(), VecMPISetGhost()
224: @*/
225: PetscErrorCode VecCreateMPICUDA(MPI_Comm comm,PetscInt n,PetscInt N,Vec *v)
226: {
227: VecCreate(comm,v);
228: VecSetSizes(*v,n,N);
229: VecSetType(*v,VECMPICUDA);
230: return 0;
231: }
233: /*@C
234: VecCreateMPICUDAWithArray - Creates a parallel, array-style vector,
235: where the user provides the GPU array space to store the vector values.
237: Collective
239: Input Parameters:
240: + comm - the MPI communicator to use
241: . bs - block size, same meaning as VecSetBlockSize()
242: . n - local vector length, cannot be PETSC_DECIDE
243: . N - global vector length (or PETSC_DECIDE to have calculated)
244: - array - the user provided GPU array to store the vector values
246: Output Parameter:
247: . vv - the vector
249: Notes:
250: Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
251: same type as an existing vector.
253: If the user-provided array is NULL, then VecCUDAPlaceArray() can be used
254: at a later stage to SET the array for storing the vector values.
256: PETSc does NOT free the array when the vector is destroyed via VecDestroy().
257: The user should not free the array until the vector is destroyed.
259: Level: intermediate
261: .seealso: VecCreateMPICUDA(), VecCreateSeqCUDAWithArray(), VecCreateMPIWithArray(), VecCreateSeqWithArray(),
262: VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost(),
263: VecCreateMPI(), VecCreateGhostWithArray(), VecPlaceArray()
265: @*/
266: PetscErrorCode VecCreateMPICUDAWithArray(MPI_Comm comm,PetscInt bs,PetscInt n,PetscInt N,const PetscScalar array[],Vec *vv)
267: {
269: PetscDeviceInitialize(PETSC_DEVICE_CUDA);
270: VecCreate(comm,vv);
271: VecSetSizes(*vv,n,N);
272: VecSetBlockSize(*vv,bs);
273: VecCreate_MPICUDA_Private(*vv,PETSC_FALSE,0,array);
274: return 0;
275: }
277: /*@C
278: VecCreateMPICUDAWithArrays - Creates a parallel, array-style vector,
279: where the user provides the GPU array space to store the vector values.
281: Collective
283: Input Parameters:
284: + comm - the MPI communicator to use
285: . bs - block size, same meaning as VecSetBlockSize()
286: . n - local vector length, cannot be PETSC_DECIDE
287: . N - global vector length (or PETSC_DECIDE to have calculated)
288: - cpuarray - the user provided CPU array to store the vector values
289: - gpuarray - the user provided GPU array to store the vector values
291: Output Parameter:
292: . vv - the vector
294: Notes:
295: If both cpuarray and gpuarray are provided, the caller must ensure that
296: the provided arrays have identical values.
298: Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
299: same type as an existing vector.
301: PETSc does NOT free the provided arrays when the vector is destroyed via
302: VecDestroy(). The user should not free the array until the vector is
303: destroyed.
305: Level: intermediate
307: .seealso: VecCreateSeqCUDAWithArrays(), VecCreateMPIWithArray(), VecCreateSeqWithArray(),
308: VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost(),
309: VecCreateMPI(), VecCreateGhostWithArray(), VecCUDAPlaceArray(), VecPlaceArray(),
310: VecCUDAAllocateCheckHost()
311: @*/
312: PetscErrorCode VecCreateMPICUDAWithArrays(MPI_Comm comm,PetscInt bs,PetscInt n,PetscInt N,const PetscScalar cpuarray[],const PetscScalar gpuarray[],Vec *vv)
313: {
314: VecCreateMPICUDAWithArray(comm,bs,n,N,gpuarray,vv);
316: if (cpuarray && gpuarray) {
317: Vec_MPI *s = (Vec_MPI*)((*vv)->data);
318: s->array = (PetscScalar*)cpuarray;
319: (*vv)->offloadmask = PETSC_OFFLOAD_BOTH;
320: } else if (cpuarray) {
321: Vec_MPI *s = (Vec_MPI*)((*vv)->data);
322: s->array = (PetscScalar*)cpuarray;
323: (*vv)->offloadmask = PETSC_OFFLOAD_CPU;
324: } else if (gpuarray) {
325: (*vv)->offloadmask = PETSC_OFFLOAD_GPU;
326: } else {
327: (*vv)->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
328: }
330: return 0;
331: }
333: PetscErrorCode VecMax_MPICUDA(Vec xin,PetscInt *idx,PetscReal *z)
334: {
335: PetscReal work;
337: VecMax_SeqCUDA(xin,idx,&work);
338: #if defined(PETSC_HAVE_MPIUNI)
339: *z = work;
340: #else
341: if (!idx) {
342: MPIU_Allreduce(&work,z,1,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)xin));
343: } else {
344: struct { PetscReal v; PetscInt i; } in,out;
346: in.v = work;
347: in.i = *idx + xin->map->rstart;
348: MPIU_Allreduce(&in,&out,1,MPIU_REAL_INT,MPIU_MAXLOC,PetscObjectComm((PetscObject)xin));
349: *z = out.v;
350: *idx = out.i;
351: }
352: #endif
353: return 0;
354: }
356: PetscErrorCode VecMin_MPICUDA(Vec xin,PetscInt *idx,PetscReal *z)
357: {
358: PetscReal work;
360: VecMin_SeqCUDA(xin,idx,&work);
361: #if defined(PETSC_HAVE_MPIUNI)
362: *z = work;
363: #else
364: if (!idx) {
365: MPIU_Allreduce(&work,z,1,MPIU_REAL,MPIU_MIN,PetscObjectComm((PetscObject)xin));
366: } else {
367: struct { PetscReal v; PetscInt i; } in,out;
369: in.v = work;
370: in.i = *idx + xin->map->rstart;
371: MPIU_Allreduce(&in,&out,1,MPIU_REAL_INT,MPIU_MINLOC,PetscObjectComm((PetscObject)xin));
372: *z = out.v;
373: *idx = out.i;
374: }
375: #endif
376: return 0;
377: }
379: PetscErrorCode VecBindToCPU_MPICUDA(Vec V,PetscBool bind)
380: {
381: V->boundtocpu = bind;
382: if (bind) {
383: VecCUDACopyFromGPU(V);
384: V->offloadmask = PETSC_OFFLOAD_CPU; /* since the CPU code will likely change values in the vector */
385: V->ops->dotnorm2 = NULL;
386: V->ops->waxpy = VecWAXPY_Seq;
387: V->ops->dot = VecDot_MPI;
388: V->ops->mdot = VecMDot_MPI;
389: V->ops->tdot = VecTDot_MPI;
390: V->ops->norm = VecNorm_MPI;
391: V->ops->scale = VecScale_Seq;
392: V->ops->copy = VecCopy_Seq;
393: V->ops->set = VecSet_Seq;
394: V->ops->swap = VecSwap_Seq;
395: V->ops->axpy = VecAXPY_Seq;
396: V->ops->axpby = VecAXPBY_Seq;
397: V->ops->maxpy = VecMAXPY_Seq;
398: V->ops->aypx = VecAYPX_Seq;
399: V->ops->axpbypcz = VecAXPBYPCZ_Seq;
400: V->ops->pointwisemult = VecPointwiseMult_Seq;
401: V->ops->setrandom = VecSetRandom_Seq;
402: V->ops->placearray = VecPlaceArray_Seq;
403: V->ops->replacearray = VecReplaceArray_SeqCUDA;
404: V->ops->resetarray = VecResetArray_Seq;
405: V->ops->dot_local = VecDot_Seq;
406: V->ops->tdot_local = VecTDot_Seq;
407: V->ops->norm_local = VecNorm_Seq;
408: V->ops->mdot_local = VecMDot_Seq;
409: V->ops->pointwisedivide = VecPointwiseDivide_Seq;
410: V->ops->getlocalvector = NULL;
411: V->ops->restorelocalvector = NULL;
412: V->ops->getlocalvectorread = NULL;
413: V->ops->restorelocalvectorread = NULL;
414: V->ops->getarraywrite = NULL;
415: V->ops->getarrayandmemtype = NULL;
416: V->ops->restorearrayandmemtype = NULL;
417: V->ops->getarraywriteandmemtype= NULL;
418: V->ops->max = VecMax_MPI;
419: V->ops->min = VecMin_MPI;
420: V->ops->reciprocal = VecReciprocal_Default;
421: V->ops->sum = NULL;
422: V->ops->shift = NULL;
423: /* default random number generator */
424: PetscFree(V->defaultrandtype);
425: PetscStrallocpy(PETSCRANDER48,&V->defaultrandtype);
426: } else {
427: V->ops->dotnorm2 = VecDotNorm2_MPICUDA;
428: V->ops->waxpy = VecWAXPY_SeqCUDA;
429: V->ops->duplicate = VecDuplicate_MPICUDA;
430: V->ops->dot = VecDot_MPICUDA;
431: V->ops->mdot = VecMDot_MPICUDA;
432: V->ops->tdot = VecTDot_MPICUDA;
433: V->ops->norm = VecNorm_MPICUDA;
434: V->ops->scale = VecScale_SeqCUDA;
435: V->ops->copy = VecCopy_SeqCUDA;
436: V->ops->set = VecSet_SeqCUDA;
437: V->ops->swap = VecSwap_SeqCUDA;
438: V->ops->axpy = VecAXPY_SeqCUDA;
439: V->ops->axpby = VecAXPBY_SeqCUDA;
440: V->ops->maxpy = VecMAXPY_SeqCUDA;
441: V->ops->aypx = VecAYPX_SeqCUDA;
442: V->ops->axpbypcz = VecAXPBYPCZ_SeqCUDA;
443: V->ops->pointwisemult = VecPointwiseMult_SeqCUDA;
444: V->ops->setrandom = VecSetRandom_SeqCUDA;
445: V->ops->placearray = VecPlaceArray_SeqCUDA;
446: V->ops->replacearray = VecReplaceArray_SeqCUDA;
447: V->ops->resetarray = VecResetArray_SeqCUDA;
448: V->ops->dot_local = VecDot_SeqCUDA;
449: V->ops->tdot_local = VecTDot_SeqCUDA;
450: V->ops->norm_local = VecNorm_SeqCUDA;
451: V->ops->mdot_local = VecMDot_SeqCUDA;
452: V->ops->destroy = VecDestroy_MPICUDA;
453: V->ops->pointwisedivide = VecPointwiseDivide_SeqCUDA;
454: V->ops->getlocalvector = VecGetLocalVector_SeqCUDA;
455: V->ops->restorelocalvector = VecRestoreLocalVector_SeqCUDA;
456: V->ops->getlocalvectorread = VecGetLocalVectorRead_SeqCUDA;
457: V->ops->restorelocalvectorread = VecRestoreLocalVectorRead_SeqCUDA;
458: V->ops->getarraywrite = VecGetArrayWrite_SeqCUDA;
459: V->ops->getarray = VecGetArray_SeqCUDA;
460: V->ops->restorearray = VecRestoreArray_SeqCUDA;
461: V->ops->getarrayandmemtype = VecGetArrayAndMemType_SeqCUDA;
462: V->ops->restorearrayandmemtype = VecRestoreArrayAndMemType_SeqCUDA;
463: V->ops->getarraywriteandmemtype= VecGetArrayWriteAndMemType_SeqCUDA;
464: V->ops->max = VecMax_MPICUDA;
465: V->ops->min = VecMin_MPICUDA;
466: V->ops->reciprocal = VecReciprocal_SeqCUDA;
467: V->ops->sum = VecSum_SeqCUDA;
468: V->ops->shift = VecShift_SeqCUDA;
469: /* default random number generator */
470: PetscFree(V->defaultrandtype);
471: PetscStrallocpy(PETSCCURAND,&V->defaultrandtype);
472: }
473: return 0;
474: }
476: PetscErrorCode VecCreate_MPICUDA_Private(Vec vv,PetscBool alloc,PetscInt nghost,const PetscScalar array[])
477: {
479: Vec_CUDA *veccuda;
481: VecCreate_MPI_Private(vv,PETSC_FALSE,0,0);
482: PetscObjectChangeTypeName((PetscObject)vv,VECMPICUDA);
484: VecBindToCPU_MPICUDA(vv,PETSC_FALSE);
485: vv->ops->bindtocpu = VecBindToCPU_MPICUDA;
487: /* Later, functions check for the Vec_CUDA structure existence, so do not create it without array */
488: if (alloc && !array) {
489: VecCUDAAllocateCheck(vv);
490: VecCUDAAllocateCheckHost(vv);
491: VecSet(vv,0.0);
492: VecSet_Seq(vv,0.0);
493: vv->offloadmask = PETSC_OFFLOAD_BOTH;
494: }
495: if (array) {
496: if (!vv->spptr) {
497: PetscReal pinned_memory_min;
498: PetscBool flag;
499: /* Cannot use PetscNew() here because spptr is void* */
500: PetscCalloc(sizeof(Vec_CUDA),&vv->spptr);
501: veccuda = (Vec_CUDA*)vv->spptr;
502: vv->minimum_bytes_pinned_memory = 0;
504: /* Need to parse command line for minimum size to use for pinned memory allocations on host here.
505: Note: This same code duplicated in VecCreate_SeqCUDA_Private() and VecCUDAAllocateCheck(). Is there a good way to avoid this? */
506: PetscOptionsBegin(PetscObjectComm((PetscObject)vv),((PetscObject)vv)->prefix,"VECCUDA Options","Vec");
507: pinned_memory_min = vv->minimum_bytes_pinned_memory;
508: PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&flag);
509: if (flag) vv->minimum_bytes_pinned_memory = pinned_memory_min;
510: PetscOptionsEnd();
511: }
512: veccuda = (Vec_CUDA*)vv->spptr;
513: veccuda->GPUarray = (PetscScalar*)array;
514: vv->offloadmask = PETSC_OFFLOAD_GPU;
515: }
516: return 0;
517: }