Actual source code: veccuda.c

  1: /*
  2:  Implementation of the sequential cuda vectors.

  4:  This file contains the code that can be compiled with a C
  5:  compiler.  The companion file veccuda2.cu contains the code that
  6:  must be compiled with nvcc or a C++ compiler.
  7:  */

  9: #define PETSC_SKIP_SPINLOCK

 11: #include <petscconf.h>
 12: #include <petsc/private/vecimpl.h>
 13: #include <../src/vec/vec/impls/dvecimpl.h>
 14: #include <petsc/private/cudavecimpl.h>

 16: PetscErrorCode VecCUDAGetArrays_Private(Vec v,const PetscScalar** x,const PetscScalar** x_d,PetscOffloadMask* flg)
 17: {
 19:   if (x) {
 20:     Vec_Seq *h = (Vec_Seq*)v->data;

 22:     *x = h->array;
 23:   }
 24:   if (x_d) {
 25:     Vec_CUDA *d = (Vec_CUDA*)v->spptr;

 27:     *x_d = d ? d->GPUarray : NULL;
 28:   }
 29:   if (flg) *flg = v->offloadmask;
 30:   return 0;
 31: }

 33: /*
 34:     Allocates space for the vector array on the Host if it does not exist.
 35:     Does NOT change the PetscCUDAFlag for the vector
 36:     Does NOT zero the CUDA array
 37:  */
 38: PetscErrorCode VecCUDAAllocateCheckHost(Vec v)
 39: {
 40:   PetscScalar    *array;
 41:   Vec_Seq        *s = (Vec_Seq*)v->data;
 42:   PetscInt       n = v->map->n;

 44:   if (!s) {
 45:     PetscNewLog((PetscObject)v,&s);
 46:     v->data = s;
 47:   }
 48:   if (!s->array) {
 49:     if (n*sizeof(PetscScalar) > v->minimum_bytes_pinned_memory) {
 50:       PetscMallocSetCUDAHost();
 51:       v->pinned_memory = PETSC_TRUE;
 52:     }
 53:     PetscMalloc1(n,&array);
 54:     PetscLogObjectMemory((PetscObject)v,n*sizeof(PetscScalar));
 55:     s->array           = array;
 56:     s->array_allocated = array;
 57:     if (n*sizeof(PetscScalar) > v->minimum_bytes_pinned_memory) {
 58:       PetscMallocResetCUDAHost();
 59:     }
 60:     if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
 61:       v->offloadmask = PETSC_OFFLOAD_CPU;
 62:     }
 63:   }
 64:   return 0;
 65: }

 67: PetscErrorCode VecCopy_SeqCUDA_Private(Vec xin,Vec yin)
 68: {
 69:   PetscScalar       *ya;
 70:   const PetscScalar *xa;

 72:   VecCUDAAllocateCheckHost(xin);
 73:   VecCUDAAllocateCheckHost(yin);
 74:   if (xin != yin) {
 75:     VecGetArrayRead(xin,&xa);
 76:     VecGetArray(yin,&ya);
 77:     PetscArraycpy(ya,xa,xin->map->n);
 78:     VecRestoreArrayRead(xin,&xa);
 79:     VecRestoreArray(yin,&ya);
 80:   }
 81:   return 0;
 82: }

 84: PetscErrorCode VecSetRandom_SeqCUDA(Vec xin,PetscRandom r)
 85: {
 86:   PetscInt       n = xin->map->n;
 87:   PetscBool      iscurand;
 88:   PetscScalar    *xx;

 90:   PetscObjectTypeCompare((PetscObject)r,PETSCCURAND,&iscurand);
 91:   if (iscurand) {
 92:     VecCUDAGetArrayWrite(xin,&xx);
 93:   } else {
 94:     VecGetArrayWrite(xin,&xx);
 95:   }
 96:   PetscRandomGetValues(r,n,xx);
 97:   if (iscurand) {
 98:     VecCUDARestoreArrayWrite(xin,&xx);
 99:   } else {
100:     VecRestoreArrayWrite(xin,&xx);
101:   }
102:   return 0;
103: }

105: PetscErrorCode VecDestroy_SeqCUDA_Private(Vec v)
106: {
107:   Vec_Seq        *vs = (Vec_Seq*)v->data;

109:   PetscObjectSAWsViewOff(v);
110: #if defined(PETSC_USE_LOG)
111:   PetscLogObjectState((PetscObject)v,"Length=%" PetscInt_FMT,v->map->n);
112: #endif
113:   if (vs) {
114:     if (vs->array_allocated) {
115:       if (v->pinned_memory) {
116:         PetscMallocSetCUDAHost();
117:       }
118:       PetscFree(vs->array_allocated);
119:       if (v->pinned_memory) {
120:         PetscMallocResetCUDAHost();
121:         v->pinned_memory = PETSC_FALSE;
122:       }
123:     }
124:     PetscFree(vs);
125:   }
126:   return 0;
127: }

129: PetscErrorCode VecResetArray_SeqCUDA_Private(Vec vin)
130: {
131:   Vec_Seq *v = (Vec_Seq*)vin->data;

133:   v->array         = v->unplacedarray;
134:   v->unplacedarray = 0;
135:   return 0;
136: }

138: PetscErrorCode VecResetArray_SeqCUDA(Vec vin)
139: {
140:   VecCUDACopyFromGPU(vin);
141:   VecResetArray_SeqCUDA_Private(vin);
142:   vin->offloadmask = PETSC_OFFLOAD_CPU;
143:   return 0;
144: }

146: PetscErrorCode VecPlaceArray_SeqCUDA(Vec vin,const PetscScalar *a)
147: {
148:   VecCUDACopyFromGPU(vin);
149:   VecPlaceArray_Seq(vin,a);
150:   vin->offloadmask = PETSC_OFFLOAD_CPU;
151:   return 0;
152: }

154: PetscErrorCode VecReplaceArray_SeqCUDA(Vec vin,const PetscScalar *a)
155: {
156:   Vec_Seq        *vs = (Vec_Seq*)vin->data;

158:   if (vs->array != vs->array_allocated) {
159:     /* make sure the users array has the latest values */
160:     VecCUDACopyFromGPU(vin);
161:   }
162:   if (vs->array_allocated) {
163:     if (vin->pinned_memory) {
164:       PetscMallocSetCUDAHost();
165:     }
166:     PetscFree(vs->array_allocated);
167:     if (vin->pinned_memory) {
168:       PetscMallocResetCUDAHost();
169:     }
170:   }
171:   vin->pinned_memory = PETSC_FALSE;
172:   vs->array_allocated = vs->array = (PetscScalar*)a;
173:   vin->offloadmask = PETSC_OFFLOAD_CPU;
174:   return 0;
175: }

177: /*@
178:  VecCreateSeqCUDA - Creates a standard, sequential array-style vector.

180:  Collective

 182:  Input Parameters:
183:  +  comm - the communicator, should be PETSC_COMM_SELF
184:  -  n - the vector length

186:  Output Parameter:
187:  .  v - the vector

189:  Notes:
190:  Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
191:  same type as an existing vector.

193:  Level: intermediate

195:  .seealso: VecCreateMPICUDA(), VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
196:  @*/
197: PetscErrorCode VecCreateSeqCUDA(MPI_Comm comm,PetscInt n,Vec *v)
198: {
199:   VecCreate(comm,v);
200:   VecSetSizes(*v,n,n);
201:   VecSetType(*v,VECSEQCUDA);
202:   return 0;
203: }

205: PetscErrorCode VecDuplicate_SeqCUDA(Vec win,Vec *V)
206: {
207:   VecCreateSeqCUDA(PetscObjectComm((PetscObject)win),win->map->n,V);
208:   PetscLayoutReference(win->map,&(*V)->map);
209:   PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*V))->olist);
210:   PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*V))->qlist);
211:   (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
212:   return 0;
213: }

215: PetscErrorCode VecCreate_SeqCUDA(Vec V)
216: {
217:   PetscDeviceInitialize(PETSC_DEVICE_CUDA);
218:   PetscLayoutSetUp(V->map);
219:   VecCUDAAllocateCheck(V);
220:   VecCreate_SeqCUDA_Private(V,((Vec_CUDA*)V->spptr)->GPUarray_allocated);
221:   VecSet_SeqCUDA(V,0.0);
222:   return 0;
223: }

225: /*@C
226:    VecCreateSeqCUDAWithArray - Creates a CUDA sequential array-style vector,
227:    where the user provides the array space to store the vector values. The array
228:    provided must be a GPU array.

230:    Collective

232:    Input Parameters:
233: +  comm - the communicator, should be PETSC_COMM_SELF
234: .  bs - the block size
235: .  n - the vector length
236: -  array - GPU memory where the vector elements are to be stored.

238:    Output Parameter:
239: .  V - the vector

241:    Notes:
242:    Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
243:    same type as an existing vector.

245:    If the user-provided array is NULL, then VecCUDAPlaceArray() can be used
246:    at a later stage to SET the array for storing the vector values.

248:    PETSc does NOT free the array when the vector is destroyed via VecDestroy().
249:    The user should not free the array until the vector is destroyed.

251:    Level: intermediate

253: .seealso: VecCreateMPICUDAWithArray(), VecCreate(), VecDuplicate(), VecDuplicateVecs(),
254:           VecCreateGhost(), VecCreateSeq(), VecCUDAPlaceArray(), VecCreateSeqWithArray(),
255:           VecCreateMPIWithArray()
256: @*/
257: PetscErrorCode  VecCreateSeqCUDAWithArray(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar array[],Vec *V)
258: {
259:   PetscDeviceInitialize(PETSC_DEVICE_CUDA);
260:   VecCreate(comm,V);
261:   VecSetSizes(*V,n,n);
262:   VecSetBlockSize(*V,bs);
263:   VecCreate_SeqCUDA_Private(*V,array);
264:   return 0;
265: }

267: /*@C
268:    VecCreateSeqCUDAWithArrays - Creates a CUDA sequential array-style vector,
269:    where the user provides the array space to store the vector values.

271:    Collective

273:    Input Parameters:
274: +  comm - the communicator, should be PETSC_COMM_SELF
275: .  bs - the block size
276: .  n - the vector length
 277: .  cpuarray - CPU memory where the vector elements are to be stored.
278: -  gpuarray - GPU memory where the vector elements are to be stored.

280:    Output Parameter:
281: .  V - the vector

283:    Notes:
284:    If both cpuarray and gpuarray are provided, the caller must ensure that
285:    the provided arrays have identical values.

287:    PETSc does NOT free the provided arrays when the vector is destroyed via
288:    VecDestroy(). The user should not free the array until the vector is
289:    destroyed.

291:    Level: intermediate

293: .seealso: VecCreateMPICUDAWithArrays(), VecCreate(), VecCreateSeqWithArray(),
294:           VecCUDAPlaceArray(), VecCreateSeqCUDAWithArray(),
295:           VecCUDAAllocateCheckHost()
296: @*/
297: PetscErrorCode  VecCreateSeqCUDAWithArrays(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar cpuarray[],const PetscScalar gpuarray[],Vec *V)
298: {
299:   // set V's gpuarray to be gpuarray, do not allocate memory on host yet.
300:   VecCreateSeqCUDAWithArray(comm,bs,n,gpuarray,V);

302:   if (cpuarray && gpuarray) {
303:     Vec_Seq *s = (Vec_Seq*)((*V)->data);
304:     s->array = (PetscScalar*)cpuarray;
305:     (*V)->offloadmask = PETSC_OFFLOAD_BOTH;
306:   } else if (cpuarray) {
307:     Vec_Seq *s = (Vec_Seq*)((*V)->data);
308:     s->array = (PetscScalar*)cpuarray;
309:     (*V)->offloadmask = PETSC_OFFLOAD_CPU;
310:   } else if (gpuarray) {
311:     (*V)->offloadmask = PETSC_OFFLOAD_GPU;
312:   } else {
313:     (*V)->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
314:   }

316:   return 0;
317: }

319: PetscErrorCode VecGetArray_SeqCUDA(Vec v,PetscScalar **a)
320: {
321:   VecCUDACopyFromGPU(v);
322:   *a = *((PetscScalar**)v->data);
323:   return 0;
324: }

326: PetscErrorCode VecRestoreArray_SeqCUDA(Vec v,PetscScalar **a)
327: {
328:   v->offloadmask = PETSC_OFFLOAD_CPU;
329:   return 0;
330: }

332: PetscErrorCode VecGetArrayWrite_SeqCUDA(Vec v,PetscScalar **a)
333: {
334:   VecCUDAAllocateCheckHost(v);
335:   *a   = *((PetscScalar**)v->data);
336:   return 0;
337: }

339: PetscErrorCode VecGetArrayAndMemType_SeqCUDA(Vec v,PetscScalar** a,PetscMemType *mtype)
340: {
341:   VecCUDACopyToGPU(v);
342:   *a   = ((Vec_CUDA*)v->spptr)->GPUarray;
343:   if (mtype) *mtype = ((Vec_CUDA*)v->spptr)->nvshmem ? PETSC_MEMTYPE_NVSHMEM : PETSC_MEMTYPE_CUDA;
344:   return 0;
345: }

347: PetscErrorCode VecRestoreArrayAndMemType_SeqCUDA(Vec v,PetscScalar** a)
348: {
349:   v->offloadmask = PETSC_OFFLOAD_GPU;
350:   return 0;
351: }

353: PetscErrorCode VecGetArrayWriteAndMemType_SeqCUDA(Vec v,PetscScalar** a,PetscMemType *mtype)
354: {
355:   /* Allocate memory (not zeroed) on device if not yet, but no need to sync data from host to device */
356:   VecCUDAAllocateCheck(v);
357:   *a   = ((Vec_CUDA*)v->spptr)->GPUarray;
358:   if (mtype) *mtype = ((Vec_CUDA*)v->spptr)->nvshmem ? PETSC_MEMTYPE_NVSHMEM : PETSC_MEMTYPE_CUDA;
359:   return 0;
360: }

362: PetscErrorCode VecBindToCPU_SeqCUDA(Vec V,PetscBool bind)
363: {
364:   V->boundtocpu = bind;
365:   if (bind) {
366:     VecCUDACopyFromGPU(V);
367:     V->offloadmask                 = PETSC_OFFLOAD_CPU; /* since the CPU code will likely change values in the vector */
368:     V->ops->dot                    = VecDot_Seq;
369:     V->ops->norm                   = VecNorm_Seq;
370:     V->ops->tdot                   = VecTDot_Seq;
371:     V->ops->scale                  = VecScale_Seq;
372:     V->ops->copy                   = VecCopy_Seq;
373:     V->ops->set                    = VecSet_Seq;
374:     V->ops->swap                   = VecSwap_Seq;
375:     V->ops->axpy                   = VecAXPY_Seq;
376:     V->ops->axpby                  = VecAXPBY_Seq;
377:     V->ops->axpbypcz               = VecAXPBYPCZ_Seq;
378:     V->ops->pointwisemult          = VecPointwiseMult_Seq;
379:     V->ops->pointwisedivide        = VecPointwiseDivide_Seq;
380:     V->ops->setrandom              = VecSetRandom_Seq;
381:     V->ops->dot_local              = VecDot_Seq;
382:     V->ops->tdot_local             = VecTDot_Seq;
383:     V->ops->norm_local             = VecNorm_Seq;
384:     V->ops->mdot_local             = VecMDot_Seq;
385:     V->ops->mtdot_local            = VecMTDot_Seq;
386:     V->ops->maxpy                  = VecMAXPY_Seq;
387:     V->ops->mdot                   = VecMDot_Seq;
388:     V->ops->mtdot                  = VecMTDot_Seq;
389:     V->ops->aypx                   = VecAYPX_Seq;
390:     V->ops->waxpy                  = VecWAXPY_Seq;
391:     V->ops->dotnorm2               = NULL;
392:     V->ops->placearray             = VecPlaceArray_Seq;
393:     V->ops->replacearray           = VecReplaceArray_SeqCUDA;
394:     V->ops->resetarray             = VecResetArray_Seq;
395:     V->ops->duplicate              = VecDuplicate_Seq;
396:     V->ops->conjugate              = VecConjugate_Seq;
397:     V->ops->getlocalvector         = NULL;
398:     V->ops->restorelocalvector     = NULL;
399:     V->ops->getlocalvectorread     = NULL;
400:     V->ops->restorelocalvectorread = NULL;
401:     V->ops->getarraywrite          = NULL;
402:     V->ops->getarrayandmemtype     = NULL;
403:     V->ops->getarraywriteandmemtype= NULL;
404:     V->ops->restorearrayandmemtype = NULL;
405:     V->ops->max                    = VecMax_Seq;
406:     V->ops->min                    = VecMin_Seq;
407:     V->ops->reciprocal             = VecReciprocal_Default;
408:     V->ops->sum                    = NULL;
409:     V->ops->shift                  = NULL;
410:     /* default random number generator */
411:     PetscFree(V->defaultrandtype);
412:     PetscStrallocpy(PETSCRANDER48,&V->defaultrandtype);
413:   } else {
414:     V->ops->dot                    = VecDot_SeqCUDA;
415:     V->ops->norm                   = VecNorm_SeqCUDA;
416:     V->ops->tdot                   = VecTDot_SeqCUDA;
417:     V->ops->scale                  = VecScale_SeqCUDA;
418:     V->ops->copy                   = VecCopy_SeqCUDA;
419:     V->ops->set                    = VecSet_SeqCUDA;
420:     V->ops->swap                   = VecSwap_SeqCUDA;
421:     V->ops->axpy                   = VecAXPY_SeqCUDA;
422:     V->ops->axpby                  = VecAXPBY_SeqCUDA;
423:     V->ops->axpbypcz               = VecAXPBYPCZ_SeqCUDA;
424:     V->ops->pointwisemult          = VecPointwiseMult_SeqCUDA;
425:     V->ops->pointwisedivide        = VecPointwiseDivide_SeqCUDA;
426:     V->ops->setrandom              = VecSetRandom_SeqCUDA;
427:     V->ops->dot_local              = VecDot_SeqCUDA;
428:     V->ops->tdot_local             = VecTDot_SeqCUDA;
429:     V->ops->norm_local             = VecNorm_SeqCUDA;
430:     V->ops->mdot_local             = VecMDot_SeqCUDA;
431:     V->ops->maxpy                  = VecMAXPY_SeqCUDA;
432:     V->ops->mdot                   = VecMDot_SeqCUDA;
433:     V->ops->aypx                   = VecAYPX_SeqCUDA;
434:     V->ops->waxpy                  = VecWAXPY_SeqCUDA;
435:     V->ops->dotnorm2               = VecDotNorm2_SeqCUDA;
436:     V->ops->placearray             = VecPlaceArray_SeqCUDA;
437:     V->ops->replacearray           = VecReplaceArray_SeqCUDA;
438:     V->ops->resetarray             = VecResetArray_SeqCUDA;
439:     V->ops->destroy                = VecDestroy_SeqCUDA;
440:     V->ops->duplicate              = VecDuplicate_SeqCUDA;
441:     V->ops->conjugate              = VecConjugate_SeqCUDA;
442:     V->ops->getlocalvector         = VecGetLocalVector_SeqCUDA;
443:     V->ops->restorelocalvector     = VecRestoreLocalVector_SeqCUDA;
444:     V->ops->getlocalvectorread     = VecGetLocalVectorRead_SeqCUDA;
445:     V->ops->restorelocalvectorread = VecRestoreLocalVectorRead_SeqCUDA;
446:     V->ops->getarraywrite          = VecGetArrayWrite_SeqCUDA;
447:     V->ops->getarray               = VecGetArray_SeqCUDA;
448:     V->ops->restorearray           = VecRestoreArray_SeqCUDA;
449:     V->ops->getarrayandmemtype     = VecGetArrayAndMemType_SeqCUDA;
450:     V->ops->getarraywriteandmemtype= VecGetArrayWriteAndMemType_SeqCUDA;
451:     V->ops->restorearrayandmemtype = VecRestoreArrayAndMemType_SeqCUDA;
452:     V->ops->max                    = VecMax_SeqCUDA;
453:     V->ops->min                    = VecMin_SeqCUDA;
454:     V->ops->reciprocal             = VecReciprocal_SeqCUDA;
455:     V->ops->sum                    = VecSum_SeqCUDA;
456:     V->ops->shift                  = VecShift_SeqCUDA;

458:     /* default random number generator */
459:     PetscFree(V->defaultrandtype);
460:     PetscStrallocpy(PETSCCURAND,&V->defaultrandtype);
461:   }
462:   return 0;
463: }

465: PetscErrorCode VecCreate_SeqCUDA_Private(Vec V,const PetscScalar *array)
466: {
468:   Vec_CUDA       *veccuda;
469:   PetscMPIInt    size;
470:   PetscBool      option_set;

472:   MPI_Comm_size(PetscObjectComm((PetscObject)V),&size);
474:   VecCreate_Seq_Private(V,0);
475:   PetscObjectChangeTypeName((PetscObject)V,VECSEQCUDA);
476:   VecBindToCPU_SeqCUDA(V,PETSC_FALSE);
477:   V->ops->bindtocpu = VecBindToCPU_SeqCUDA;

479:   /* Later, functions check for the Vec_CUDA structure existence, so do not create it without array */
480:   if (array) {
481:     if (!V->spptr) {
482:       PetscReal pinned_memory_min;
483:       PetscCalloc(sizeof(Vec_CUDA),&V->spptr);
484:       veccuda = (Vec_CUDA*)V->spptr;
485:       V->offloadmask = PETSC_OFFLOAD_UNALLOCATED;

487:       pinned_memory_min = 0;
488:       /* Need to parse command line for minimum size to use for pinned memory allocations on host here.
489:          Note: This same code duplicated in VecCUDAAllocateCheck() and VecCreate_MPICUDA_Private(). Is there a good way to avoid this? */
490:       PetscOptionsBegin(PetscObjectComm((PetscObject)V),((PetscObject)V)->prefix,"VECCUDA Options","Vec");
491:       PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&option_set);
492:       if (option_set) V->minimum_bytes_pinned_memory = pinned_memory_min;
493:       PetscOptionsEnd();
494:     }
495:     veccuda = (Vec_CUDA*)V->spptr;
496:     veccuda->GPUarray = (PetscScalar*)array;
497:     V->offloadmask = PETSC_OFFLOAD_GPU;
498:   }
499:   return 0;
500: }