Actual source code: matstash.c


  2: #include <petsc/private/matimpl.h>

  4: #define DEFAULT_STASH_SIZE   10000

  6: static PetscErrorCode MatStashScatterBegin_Ref(Mat,MatStash*,PetscInt*);
  7: PETSC_INTERN PetscErrorCode MatStashScatterGetMesg_Ref(MatStash*,PetscMPIInt*,PetscInt**,PetscInt**,PetscScalar**,PetscInt*);
  8: PETSC_INTERN PetscErrorCode MatStashScatterEnd_Ref(MatStash*);
  9: #if !defined(PETSC_HAVE_MPIUNI)
 10: static PetscErrorCode MatStashScatterBegin_BTS(Mat,MatStash*,PetscInt*);
 11: static PetscErrorCode MatStashScatterGetMesg_BTS(MatStash*,PetscMPIInt*,PetscInt**,PetscInt**,PetscScalar**,PetscInt*);
 12: static PetscErrorCode MatStashScatterEnd_BTS(MatStash*);
 13: #endif

 15: /*
 16:   MatStashCreate_Private - Creates a stash, currently used for all the parallel
 17:   matrix implementations. The stash is where elements of a matrix destined
 18:   to be stored on other processors are kept until matrix assembly is done.

 20:   This is a simple-minded stash: it simply adds entries to the end of the stash.

 22:   Input Parameters:
 23:   comm - communicator, required for scatters.
 24:   bs   - stash block size, used when stashing blocks of values

 26:   Output Parameters:
 27:   stash    - the newly created stash
 28: */
 29: PetscErrorCode MatStashCreate_Private(MPI_Comm comm,PetscInt bs,MatStash *stash)
 30: {
 31:   PetscInt       max,*opt,nopt,i;
 32:   PetscBool      flg;

 34:   /* Require 2 tags, get the second using PetscCommGetNewTag() */
 35:   stash->comm = comm;

 37:   PetscCommGetNewTag(stash->comm,&stash->tag1);
 38:   PetscCommGetNewTag(stash->comm,&stash->tag2);
 39:   MPI_Comm_size(stash->comm,&stash->size);
 40:   MPI_Comm_rank(stash->comm,&stash->rank);
 41:   PetscMalloc1(2*stash->size,&stash->flg_v);
 42:   for (i=0; i<2*stash->size; i++) stash->flg_v[i] = -1;

 44:   nopt = stash->size;
 45:   PetscMalloc1(nopt,&opt);
 46:   PetscOptionsGetIntArray(NULL,NULL,"-matstash_initial_size",opt,&nopt,&flg);
 47:   if (flg) {
 48:     if (nopt == 1)                max = opt[0];
 49:     else if (nopt == stash->size) max = opt[stash->rank];
 50:     else if (stash->rank < nopt)  max = opt[stash->rank];
 51:     else                          max = 0; /* Use default */
 52:     stash->umax = max;
 53:   } else {
 54:     stash->umax = 0;
 55:   }
 56:   PetscFree(opt);
 57:   if (bs <= 0) bs = 1;

 59:   stash->bs         = bs;
 60:   stash->nmax       = 0;
 61:   stash->oldnmax    = 0;
 62:   stash->n          = 0;
 63:   stash->reallocs   = -1;
 64:   stash->space_head = NULL;
 65:   stash->space      = NULL;

 67:   stash->send_waits  = NULL;
 68:   stash->recv_waits  = NULL;
 69:   stash->send_status = NULL;
 70:   stash->nsends      = 0;
 71:   stash->nrecvs      = 0;
 72:   stash->svalues     = NULL;
 73:   stash->rvalues     = NULL;
 74:   stash->rindices    = NULL;
 75:   stash->nprocessed  = 0;
 76:   stash->reproduce   = PETSC_FALSE;
 77:   stash->blocktype   = MPI_DATATYPE_NULL;

 79:   PetscOptionsGetBool(NULL,NULL,"-matstash_reproduce",&stash->reproduce,NULL);
 80: #if !defined(PETSC_HAVE_MPIUNI)
 81:   flg  = PETSC_FALSE;
 82:   PetscOptionsGetBool(NULL,NULL,"-matstash_legacy",&flg,NULL);
 83:   if (!flg) {
 84:     stash->ScatterBegin   = MatStashScatterBegin_BTS;
 85:     stash->ScatterGetMesg = MatStashScatterGetMesg_BTS;
 86:     stash->ScatterEnd     = MatStashScatterEnd_BTS;
 87:     stash->ScatterDestroy = MatStashScatterDestroy_BTS;
 88:   } else {
 89: #endif
 90:     stash->ScatterBegin   = MatStashScatterBegin_Ref;
 91:     stash->ScatterGetMesg = MatStashScatterGetMesg_Ref;
 92:     stash->ScatterEnd     = MatStashScatterEnd_Ref;
 93:     stash->ScatterDestroy = NULL;
 94: #if !defined(PETSC_HAVE_MPIUNI)
 95:   }
 96: #endif
 97:   return 0;
 98: }
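
/*
   Usage sketch (illustrative only, not part of matstash.c): the stash created above is
   filled indirectly whenever MatSetValues() is called with rows owned by another process,
   and drained again during assembly.  The option names are the ones read in
   MatStashCreate_Private(); the driver below is only a plausible example built from
   standard PETSc calls.

     Run-time options read above:
       -matstash_initial_size <n>          same initial stash size on every rank
       -matstash_initial_size <n0,n1,...>  one value per rank
       -matstash_reproduce                 process incoming messages in a fixed order
       -matstash_legacy                    use the Ref (two-message) scatter instead of BTS

     Example driver:
       #include <petscmat.h>
       int main(int argc,char **argv)
       {
         Mat         A;
         PetscInt    i = 0,j = 0;
         PetscScalar v = 1.0;

         PetscCall(PetscInitialize(&argc,&argv,NULL,NULL));
         PetscCall(MatCreate(PETSC_COMM_WORLD,&A));
         PetscCall(MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,100,100));
         PetscCall(MatSetFromOptions(A));
         PetscCall(MatSetUp(A));
         PetscCall(MatSetValues(A,1,&i,1,&j,&v,ADD_VALUES)); // off-process on all ranks but 0: goes to the stash
         PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));  // scatters stashed values to their owners
         PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
         PetscCall(MatDestroy(&A));
         PetscCall(PetscFinalize());
         return 0;
       }
*/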

100: /*
101:    MatStashDestroy_Private - Destroy the stash
102: */
103: PetscErrorCode MatStashDestroy_Private(MatStash *stash)
104: {
105:   PetscMatStashSpaceDestroy(&stash->space_head);
106:   if (stash->ScatterDestroy) (*stash->ScatterDestroy)(stash);

108:   stash->space = NULL;

110:   PetscFree(stash->flg_v);
111:   return 0;
112: }

114: /*
115:    MatStashScatterEnd_Private - This is called as the final stage of the
116:    scatter. The final stages of message passing are completed here, and
117:    all the memory used for message passing is cleaned up. This
118:    routine also resets the stash and deallocates the memory used
119:    for the stash. It also keeps track of the current memory usage
120:    so that the same value can be used the next time through.
121: */
122: PetscErrorCode MatStashScatterEnd_Private(MatStash *stash)
123: {
124:   (*stash->ScatterEnd)(stash);
125:   return 0;
126: }

128: PETSC_INTERN PetscErrorCode MatStashScatterEnd_Ref(MatStash *stash)
129: {
130:   PetscInt       nsends=stash->nsends,bs2,oldnmax,i;
131:   MPI_Status     *send_status;

133:   for (i=0; i<2*stash->size; i++) stash->flg_v[i] = -1;
134:   /* wait on sends */
135:   if (nsends) {
136:     PetscMalloc1(2*nsends,&send_status);
137:     MPI_Waitall(2*nsends,stash->send_waits,send_status);
138:     PetscFree(send_status);
139:   }

141:   /* Now update oldnmax to be approximately 10% more than the maximum n used,
142:      so that the wastage of space is reduced the next time this stash is used.
143:      Only update oldnmax if it increases */
144:   if (stash->n) {
145:     bs2     = stash->bs*stash->bs;
146:     oldnmax = ((int)(stash->n * 1.1) + 5)*bs2;
147:     if (oldnmax > stash->oldnmax) stash->oldnmax = oldnmax;
148:   }

150:   stash->nmax       = 0;
151:   stash->n          = 0;
152:   stash->reallocs   = -1;
153:   stash->nprocessed = 0;

155:   PetscMatStashSpaceDestroy(&stash->space_head);

157:   stash->space = NULL;

159:   PetscFree(stash->send_waits);
160:   PetscFree(stash->recv_waits);
161:   PetscFree2(stash->svalues,stash->sindices);
162:   PetscFree(stash->rvalues[0]);
163:   PetscFree(stash->rvalues);
164:   PetscFree(stash->rindices[0]);
165:   PetscFree(stash->rindices);
166:   return 0;
167: }

169: /*
170:    MatStashGetInfo_Private - Gets the relevant statistics of the stash

172:    Input Parameter:
173:    stash    - the stash
174:    Output Parameters:
175:    nstash   - the size of the stash, i.e. the number of values stored
176:    reallocs - the number of additional mallocs incurred
177: */
178: PetscErrorCode MatStashGetInfo_Private(MatStash *stash,PetscInt *nstash,PetscInt *reallocs)
179: {
180:   PetscInt bs2 = stash->bs*stash->bs;

182:   if (nstash) *nstash = stash->n*bs2;
183:   if (reallocs) {
184:     if (stash->reallocs < 0) *reallocs = 0;
185:     else                     *reallocs = stash->reallocs;
186:   }
187:   return 0;
188: }

190: /*
191:    MatStashSetInitialSize_Private - Sets the initial size of the stash

193:    Input Parameters:
194:    stash  - the stash
195:    max    - the value that is used as the max size of the stash;
196:             this value is used while allocating memory.
197: */
198: PetscErrorCode MatStashSetInitialSize_Private(MatStash *stash,PetscInt max)
199: {
200:   stash->umax = max;
201:   return 0;
202: }
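
/*
   For reference (a sketch, not part of matstash.c): user code normally reaches this routine
   through the public MatStashSetInitialSize() interface, which records the requested sizes
   for both the scalar stash and the block stash before the MatSetValues() loop begins:

     PetscCall(MatStashSetInitialSize(A,10000,1000)); // scalar-stash size, block-stash size

   The scalar-stash size can equivalently be supplied with the -matstash_initial_size option
   handled in MatStashCreate_Private() above.
*/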

204: /* MatStashExpand_Private - Expand the stash. This function is called
205:    when the space in the stash is not sufficient to add the new values
206:    being inserted into the stash.

208:    Input Parameters:
209:    stash - the stash
210:    incr  - the minimum increase requested

212:    Notes:
213:    This routine doubles the currently used memory.
214:  */
215: static PetscErrorCode MatStashExpand_Private(MatStash *stash,PetscInt incr)
216: {
217:   PetscInt       newnmax,bs2= stash->bs*stash->bs;

219:   /* allocate a larger stash */
220:   if (!stash->oldnmax && !stash->nmax) { /* new stash */
221:     if (stash->umax)                  newnmax = stash->umax/bs2;
222:     else                              newnmax = DEFAULT_STASH_SIZE/bs2;
223:   } else if (!stash->nmax) { /* reusing stash */
224:     if (stash->umax > stash->oldnmax) newnmax = stash->umax/bs2;
225:     else                              newnmax = stash->oldnmax/bs2;
226:   } else                              newnmax = stash->nmax*2;
227:   if (newnmax  < (stash->nmax + incr)) newnmax += 2*incr;

229:   /* Get a MatStashSpace and attach it to stash */
230:   PetscMatStashSpaceGet(bs2,newnmax,&stash->space);
231:   if (!stash->space_head) { /* new stash or reusing stash->oldnmax */
232:     stash->space_head = stash->space;
233:   }

235:   stash->reallocs++;
236:   stash->nmax = newnmax;
237:   return 0;
238: }
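
/*
   Worked example of the sizing logic above (using the defaults): for a brand-new stash with
   bs = 2 (so bs2 = 4) and no -matstash_initial_size given, newnmax starts at
   DEFAULT_STASH_SIZE/bs2 = 10000/4 = 2500 entries.  If that still cannot hold
   nmax + incr entries, 2*incr more entries are added.  Subsequent expansions of an active
   stash simply double the capacity: 2500 -> 5000 -> 10000 -> ...
*/
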
239: /*
240:   MatStashValuesRow_Private - inserts values into the stash. This function
241:   expects the values to be row-oriented. Multiple columns belonging to the same
242:   row can be inserted with a single call to this function.

244:   Input Parameters:
245:   stash  - the stash
246:   row    - the global row corresponding to the values
247:   n      - the number of elements inserted. All elements belong to the above row.
248:   idxn   - the global column indices corresponding to each of the values.
249:   values - the values inserted
250: */
251: PetscErrorCode MatStashValuesRow_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscBool ignorezeroentries)
252: {
253:   PetscInt           i,k,cnt = 0;
254:   PetscMatStashSpace space=stash->space;

256:   /* Check and see if we have sufficient memory */
257:   if (!space || space->local_remaining < n) {
258:     MatStashExpand_Private(stash,n);
259:   }
260:   space = stash->space;
261:   k     = space->local_used;
262:   for (i=0; i<n; i++) {
263:     if (ignorezeroentries && values && values[i] == 0.0) continue;
264:     space->idx[k] = row;
265:     space->idy[k] = idxn[i];
266:     space->val[k] = values ? values[i] : 0.0;
267:     k++;
268:     cnt++;
269:   }
270:   stash->n               += cnt;
271:   space->local_used      += cnt;
272:   space->local_remaining -= cnt;
273:   return 0;
274: }
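
/*
   Typical caller (an illustrative sketch only): a parallel MatSetValues() implementation
   stashes entries whose global row lies outside this process's ownership range.  Here
   rstart/rend stand for that range and mat->stash for the stash owned by the Mat:

     if (row < rstart || row >= rend) { // off-process row: stash it until assembly
       PetscCall(MatStashValuesRow_Private(&mat->stash,row,n,cols,vals,PETSC_FALSE));
     } else {
       // insert directly into the locally owned part of the matrix
     }
*/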

276: /*
277:   MatStashValuesCol_Private - inserts values into the stash. This function
278:   expects the values to be column-oriented. Multiple columns belonging to the same
279:   row can be inserted with a single call to this function.

281:   Input Parameters:
282:   stash   - the stash
283:   row     - the global row corresponding to the values
284:   n       - the number of elements inserted. All elements belong to the above row.
285:   idxn    - the global column indices corresponding to each of the values.
286:   values  - the values inserted
287:   stepval - consecutive values are separated by a distance of stepval;
288:             this happens because the input is column-oriented.
289: */
290: PetscErrorCode MatStashValuesCol_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscInt stepval,PetscBool ignorezeroentries)
291: {
292:   PetscInt           i,k,cnt = 0;
293:   PetscMatStashSpace space=stash->space;

295:   /* Check and see if we have sufficient memory */
296:   if (!space || space->local_remaining < n) {
297:     MatStashExpand_Private(stash,n);
298:   }
299:   space = stash->space;
300:   k     = space->local_used;
301:   for (i=0; i<n; i++) {
302:     if (ignorezeroentries && values && values[i*stepval] == 0.0) continue;
303:     space->idx[k] = row;
304:     space->idy[k] = idxn[i];
305:     space->val[k] = values ? values[i*stepval] : 0.0;
306:     k++;
307:     cnt++;
308:   }
309:   stash->n               += cnt;
310:   space->local_used      += cnt;
311:   space->local_remaining -= cnt;
312:   return 0;
313: }

315: /*
316:   MatStashValuesRowBlocked_Private - inserts blocks of values into the stash.
317:   This function expects the values to be row-oriented. Multiple columns belonging
318:   to the same block-row can be inserted with a single call to this function.
319:   This function extracts the sub-block of values based on the dimensions of
320:   the original input block, and the row,col values corresponding to the blocks.

322:   Input Parameters:
323:   stash  - the stash
324:   row    - the global block-row corresponding to the values
325:   n      - the number of elements inserted. All elements belong to the above row.
326:   idxn   - the global block-column indices corresponding to each of the blocks of
327:            values. Each block is of size bs*bs.
328:   values - the values inserted
329:   rmax   - the number of block-rows in the original block.
330:   cmax   - the number of block-columns in the original block.
331:   idx    - the index of the current block-row in the original block.
332: */
333: PetscErrorCode MatStashValuesRowBlocked_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscInt rmax,PetscInt cmax,PetscInt idx)
334: {
335:   PetscInt           i,j,k,bs2,bs=stash->bs,l;
336:   const PetscScalar  *vals;
337:   PetscScalar        *array;
338:   PetscMatStashSpace space=stash->space;

340:   if (!space || space->local_remaining < n) {
341:     MatStashExpand_Private(stash,n);
342:   }
343:   space = stash->space;
344:   l     = space->local_used;
345:   bs2   = bs*bs;
346:   for (i=0; i<n; i++) {
347:     space->idx[l] = row;
348:     space->idy[l] = idxn[i];
349:     /* Now copy over the block of values. Store the values column-oriented.
350:        This enables inserting multiple blocks belonging to a row with a single
351:        function call */
352:     array = space->val + bs2*l;
353:     vals  = values + idx*bs2*n + bs*i;
354:     for (j=0; j<bs; j++) {
355:       for (k=0; k<bs; k++) array[k*bs] = values ? vals[k] : 0.0;
356:       array++;
357:       vals += cmax*bs;
358:     }
359:     l++;
360:   }
361:   stash->n               += n;
362:   space->local_used      += n;
363:   space->local_remaining -= n;
364:   return 0;
365: }

367: /*
368:   MatStashValuesColBlocked_Private - inserts blocks of values into the stash.
369:   This function expects the values to be column-oriented. Multiple columns belonging
370:   to the same block-row can be inserted with a single call to this function.
371:   This function extracts the sub-block of values based on the dimensions of
372:   the original input block, and the row,col values corresponding to the blocks.

374:   Input Parameters:
375:   stash  - the stash
376:   row    - the global block-row corresponding to the values
377:   n      - the number of elements inserted. All elements belong to the above row.
378:   idxn   - the global block-column indices corresponding to each of the blocks of
379:            values. Each block is of size bs*bs.
380:   values - the values inserted
381:   rmax   - the number of block-rows in the original block.
382:   cmax   - the number of block-columns in the original block.
383:   idx    - the index of the current block-row in the original block.
384: */
385: PetscErrorCode MatStashValuesColBlocked_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscInt rmax,PetscInt cmax,PetscInt idx)
386: {
387:   PetscInt           i,j,k,bs2,bs=stash->bs,l;
388:   const PetscScalar  *vals;
389:   PetscScalar        *array;
390:   PetscMatStashSpace space=stash->space;

392:   if (!space || space->local_remaining < n) {
393:     MatStashExpand_Private(stash,n);
394:   }
395:   space = stash->space;
396:   l     = space->local_used;
397:   bs2   = bs*bs;
398:   for (i=0; i<n; i++) {
399:     space->idx[l] = row;
400:     space->idy[l] = idxn[i];
401:     /* Now copy over the block of values. Store the values column-oriented.
402:      This enables inserting multiple blocks belonging to a row with a single
403:      function call */
404:     array = space->val + bs2*l;
405:     vals  = values + idx*bs2*n + bs*i;
406:     for (j=0; j<bs; j++) {
407:       for (k=0; k<bs; k++) array[k] = values ? vals[k] : 0.0;
408:       array += bs;
409:       vals  += rmax*bs;
410:     }
411:     l++;
412:   }
413:   stash->n               += n;
414:   space->local_used      += n;
415:   space->local_remaining -= n;
416:   return 0;
417: }
418: /*
419:   MatStashScatterBegin_Private - Initiates the transfer of values to the
420:   correct owners. This function goes through the stash, checks the
421:   owner of each stashed value, and sends the values off to the owner
422:   processors.

424:   Input Parameters:
425:   stash  - the stash
426:   owners - an array of size 'no-of-procs' which gives the ownership range
427:            for each node.

429:   Notes:
430:     In the case of the blocked stash the 'owners' array specifies the
431:   ranges in blocked global indices, while for the regular stash it uses
432:   regular global indices.
433: */
434: PetscErrorCode MatStashScatterBegin_Private(Mat mat,MatStash *stash,PetscInt *owners)
435: {
436:   (*stash->ScatterBegin)(mat,stash,owners);
437:   return 0;
438: }

440: static PetscErrorCode MatStashScatterBegin_Ref(Mat mat,MatStash *stash,PetscInt *owners)
441: {
442:   PetscInt           *owner,*startv,*starti,tag1=stash->tag1,tag2=stash->tag2,bs2;
443:   PetscInt           size=stash->size,nsends;
444:   PetscInt           count,*sindices,**rindices,i,j,idx,lastidx,l;
445:   PetscScalar        **rvalues,*svalues;
446:   MPI_Comm           comm = stash->comm;
447:   MPI_Request        *send_waits,*recv_waits,*recv_waits1,*recv_waits2;
448:   PetscMPIInt        *sizes,*nlengths,nreceives;
449:   PetscInt           *sp_idx,*sp_idy;
450:   PetscScalar        *sp_val;
451:   PetscMatStashSpace space,space_next;

453:   {                             /* make sure all processors are either in INSERTMODE or ADDMODE */
454:     InsertMode addv;
455:     MPIU_Allreduce((PetscEnum*)&mat->insertmode,(PetscEnum*)&addv,1,MPIU_ENUM,MPI_BOR,PetscObjectComm((PetscObject)mat));
457:     mat->insertmode = addv; /* in case this processor had no cache */
458:   }

460:   bs2 = stash->bs*stash->bs;

462:   /*  first count number of contributors to each processor */
463:   PetscCalloc1(size,&nlengths);
464:   PetscMalloc1(stash->n+1,&owner);

466:   i       = j    = 0;
467:   lastidx = -1;
468:   space   = stash->space_head;
469:   while (space) {
470:     space_next = space->next;
471:     sp_idx     = space->idx;
472:     for (l=0; l<space->local_used; l++) {
473:       /* if indices are NOT locally sorted, need to start search at the beginning */
474:       if (lastidx > (idx = sp_idx[l])) j = 0;
475:       lastidx = idx;
476:       for (; j<size; j++) {
477:         if (idx >= owners[j] && idx < owners[j+1]) {
478:           nlengths[j]++; owner[i] = j; break;
479:         }
480:       }
481:       i++;
482:     }
483:     space = space_next;
484:   }

486:   /* Now check what procs get messages - and compute nsends. */
487:   PetscCalloc1(size,&sizes);
488:   for (i=0, nsends=0; i<size; i++) {
489:     if (nlengths[i]) {
490:       sizes[i] = 1; nsends++;
491:     }
492:   }

494:   {PetscMPIInt *onodes,*olengths;
495:    /* Determine the number of messages to expect, their lengths, and the from-ids */
496:    PetscGatherNumberOfMessages(comm,sizes,nlengths,&nreceives);
497:    PetscGatherMessageLengths(comm,nsends,nreceives,nlengths,&onodes,&olengths);
498:    /* since row and col are clubbed together, lengths are multiplied by 2 */
499:    for (i=0; i<nreceives; i++) olengths[i] *=2;
500:    PetscPostIrecvInt(comm,tag1,nreceives,onodes,olengths,&rindices,&recv_waits1);
501:    /* values are of length 'bs2' per entry (and remove the earlier factor of 2) */
502:    for (i=0; i<nreceives; i++) olengths[i] = olengths[i]*bs2/2;
503:    PetscPostIrecvScalar(comm,tag2,nreceives,onodes,olengths,&rvalues,&recv_waits2);
504:    PetscFree(onodes);
505:    PetscFree(olengths);}

507:   /* do sends:
508:       1) startv[i] (resp. starti[i]) gives the starting index in svalues
509:          (resp. sindices) for the entries going to the ith processor
510:   */
511:   PetscMalloc2(bs2*stash->n,&svalues,2*(stash->n+1),&sindices);
512:   PetscMalloc1(2*nsends,&send_waits);
513:   PetscMalloc2(size,&startv,size,&starti);
514:   /* use 2 sends per destination: one with all_i and all_j, the other with all_a */
515:   startv[0] = 0; starti[0] = 0;
516:   for (i=1; i<size; i++) {
517:     startv[i] = startv[i-1] + nlengths[i-1];
518:     starti[i] = starti[i-1] + 2*nlengths[i-1];
519:   }

521:   i     = 0;
522:   space = stash->space_head;
523:   while (space) {
524:     space_next = space->next;
525:     sp_idx     = space->idx;
526:     sp_idy     = space->idy;
527:     sp_val     = space->val;
528:     for (l=0; l<space->local_used; l++) {
529:       j = owner[i];
530:       if (bs2 == 1) {
531:         svalues[startv[j]] = sp_val[l];
532:       } else {
533:         PetscInt    k;
534:         PetscScalar *buf1,*buf2;
535:         buf1 = svalues+bs2*startv[j];
536:         buf2 = space->val + bs2*l;
537:         for (k=0; k<bs2; k++) buf1[k] = buf2[k];
538:       }
539:       sindices[starti[j]]             = sp_idx[l];
540:       sindices[starti[j]+nlengths[j]] = sp_idy[l];
541:       startv[j]++;
542:       starti[j]++;
543:       i++;
544:     }
545:     space = space_next;
546:   }
547:   startv[0] = 0;
548:   for (i=1; i<size; i++) startv[i] = startv[i-1] + nlengths[i-1];

550:   for (i=0,count=0; i<size; i++) {
551:     if (sizes[i]) {
552:       MPI_Isend(sindices+2*startv[i],2*nlengths[i],MPIU_INT,i,tag1,comm,send_waits+count++);
553:       MPI_Isend(svalues+bs2*startv[i],bs2*nlengths[i],MPIU_SCALAR,i,tag2,comm,send_waits+count++);
554:     }
555:   }
556: #if defined(PETSC_USE_INFO)
557:   PetscInfo(NULL,"No of messages: %" PetscInt_FMT " \n",nsends);
558:   for (i=0; i<size; i++) {
559:     if (sizes[i]) {
560:       PetscInfo(NULL,"Mesg_to: %" PetscInt_FMT ": size: %zu bytes\n",i,(size_t)(nlengths[i]*(bs2*sizeof(PetscScalar)+2*sizeof(PetscInt))));
561:     }
562:   }
563: #endif
564:   PetscFree(nlengths);
565:   PetscFree(owner);
566:   PetscFree2(startv,starti);
567:   PetscFree(sizes);

569:   /* recv_waits need to be contiguous for MatStashScatterGetMesg_Private() */
570:   PetscMalloc1(2*nreceives,&recv_waits);

572:   for (i=0; i<nreceives; i++) {
573:     recv_waits[2*i]   = recv_waits1[i];
574:     recv_waits[2*i+1] = recv_waits2[i];
575:   }
576:   stash->recv_waits = recv_waits;

578:   PetscFree(recv_waits1);
579:   PetscFree(recv_waits2);

581:   stash->svalues         = svalues;
582:   stash->sindices        = sindices;
583:   stash->rvalues         = rvalues;
584:   stash->rindices        = rindices;
585:   stash->send_waits      = send_waits;
586:   stash->nsends          = nsends;
587:   stash->nrecvs          = nreceives;
588:   stash->reproduce_count = 0;
589:   return 0;
590: }
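
/*
   Shape of the two messages built above (a schematic for one destination rank j holding
   nlengths[j] stashed entries, with bs2 = bs*bs):

     tag1  (MPIU_INT,    count 2*nlengths[j]):   [ row_0 ... row_{n-1} | col_0 ... col_{n-1} ]
     tag2  (MPIU_SCALAR, count bs2*nlengths[j]): [ bs2 values of entry 0 | bs2 values of entry 1 | ... ]

   All row indices travel first, then all column indices, while the values go in a separate
   message; MatStashScatterGetMesg_Ref() below pairs the two messages from each source rank
   back together.
*/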

592: /*
593:    MatStashScatterGetMesg_Private - This function waits on the receives posted
594:    in the function MatStashScatterBegin_Private() and returns one message at
595:    a time to the calling function. If no messages are left, it indicates this
596:    by setting flg = 0, else it sets flg = 1.

598:    Input Parameters:
599:    stash - the stash

601:    Output Parameters:
602:    nvals - the number of entries in the current message.
603:    rows  - an array of row indices (or blocked indices) corresponding to the values
604:    cols  - an array of column indices (or blocked indices) corresponding to the values
605:    vals  - the values
606:    flg   - 0 indicates no more messages are left, and the current call has no values associated with it.
607:            1 indicates that the current call successfully received a message, and the
608:              other output parameters nvals,rows,cols,vals are set appropriately.
609: */
610: PetscErrorCode MatStashScatterGetMesg_Private(MatStash *stash,PetscMPIInt *nvals,PetscInt **rows,PetscInt **cols,PetscScalar **vals,PetscInt *flg)
611: {
612:   (*stash->ScatterGetMesg)(stash,nvals,rows,cols,vals,flg);
613:   return 0;
614: }
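
/*
   Typical consumer loop (a simplified sketch assuming bs = 1, not the literal MatAssemblyEnd()
   code): during MatAssemblyEnd() the owning process drains the stash one message at a time
   and applies the received entries locally with the communicated InsertMode.

     PetscMPIInt n;
     PetscInt    *row,*col,flg,i;
     PetscScalar *val;

     PetscCall(MatStashScatterGetMesg_Private(&mat->stash,&n,&row,&col,&val,&flg));
     while (flg) {
       for (i=0; i<n; i++) {
         PetscCall(MatSetValues(mat,1,&row[i],1,&col[i],&val[i],mat->insertmode));
       }
       PetscCall(MatStashScatterGetMesg_Private(&mat->stash,&n,&row,&col,&val,&flg));
     }
     PetscCall(MatStashScatterEnd_Private(&mat->stash));
*/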

616: PETSC_INTERN PetscErrorCode MatStashScatterGetMesg_Ref(MatStash *stash,PetscMPIInt *nvals,PetscInt **rows,PetscInt **cols,PetscScalar **vals,PetscInt *flg)
617: {
618:   PetscMPIInt    i,*flg_v = stash->flg_v,i1,i2;
619:   PetscInt       bs2;
620:   MPI_Status     recv_status;
621:   PetscBool      match_found = PETSC_FALSE;

623:   *flg = 0; /* When a message is discovered this is reset to 1 */
624:   /* Return if no more messages to process */
625:   if (stash->nprocessed == stash->nrecvs) return 0;

627:   bs2 = stash->bs*stash->bs;
628:   /* If a matching pair of receives is found, process them and return the data to
629:      the calling function. Until then, keep receiving messages */
630:   while (!match_found) {
631:     if (stash->reproduce) {
632:       i    = stash->reproduce_count++;
633:       MPI_Wait(stash->recv_waits+i,&recv_status);
634:     } else {
635:       MPI_Waitany(2*stash->nrecvs,stash->recv_waits,&i,&recv_status);
636:     }

639:     /* Now pack the received message into a structure which is usable by others */
640:     if (i % 2) {
641:       MPI_Get_count(&recv_status,MPIU_SCALAR,nvals);

643:       flg_v[2*recv_status.MPI_SOURCE] = i/2;

645:       *nvals = *nvals/bs2;
646:     } else {
647:       MPI_Get_count(&recv_status,MPIU_INT,nvals);

649:       flg_v[2*recv_status.MPI_SOURCE+1] = i/2;

651:       *nvals = *nvals/2; /* This message has both row indices and col indices */
652:     }

654:     /* Check if we have both messages from this proc */
655:     i1 = flg_v[2*recv_status.MPI_SOURCE];
656:     i2 = flg_v[2*recv_status.MPI_SOURCE+1];
657:     if (i1 != -1 && i2 != -1) {
658:       *rows = stash->rindices[i2];
659:       *cols = *rows + *nvals;
660:       *vals = stash->rvalues[i1];
661:       *flg  = 1;
662:       stash->nprocessed++;
663:       match_found = PETSC_TRUE;
664:     }
665:   }
666:   return 0;
667: }

669: #if !defined(PETSC_HAVE_MPIUNI)
670: typedef struct {
671:   PetscInt row;
672:   PetscInt col;
673:   PetscScalar vals[1];          /* Actually an array of length bs2 */
674: } MatStashBlock;

676: static PetscErrorCode MatStashSortCompress_Private(MatStash *stash,InsertMode insertmode)
677: {
678:   PetscMatStashSpace space;
679:   PetscInt n = stash->n,bs = stash->bs,bs2 = bs*bs,cnt,*row,*col,*perm,rowstart,i;
680:   PetscScalar **valptr;

682:   PetscMalloc4(n,&row,n,&col,n,&valptr,n,&perm);
683:   for (space=stash->space_head,cnt=0; space; space=space->next) {
684:     for (i=0; i<space->local_used; i++) {
685:       row[cnt] = space->idx[i];
686:       col[cnt] = space->idy[i];
687:       valptr[cnt] = &space->val[i*bs2];
688:       perm[cnt] = cnt;          /* Will tell us where to find valptr after sorting row[] and col[] */
689:       cnt++;
690:     }
691:   }
693:   PetscSortIntWithArrayPair(n,row,col,perm);
694:   /* Scan through the rows, sorting each one, combining duplicates, and packing send buffers */
695:   for (rowstart=0,cnt=0,i=1; i<=n; i++) {
696:     if (i == n || row[i] != row[rowstart]) {         /* Sort the last row. */
697:       PetscInt colstart;
698:       PetscSortIntWithArray(i-rowstart,&col[rowstart],&perm[rowstart]);
699:       for (colstart=rowstart; colstart<i;) { /* Compress multiple insertions to the same location */
700:         PetscInt j,l;
701:         MatStashBlock *block;
702:         PetscSegBufferGet(stash->segsendblocks,1,&block);
703:         block->row = row[rowstart];
704:         block->col = col[colstart];
705:         PetscArraycpy(block->vals,valptr[perm[colstart]],bs2);
706:         for (j=colstart+1; j<i && col[j] == col[colstart]; j++) { /* Add any extra stashed blocks at the same (row,col) */
707:           if (insertmode == ADD_VALUES) {
708:             for (l=0; l<bs2; l++) block->vals[l] += valptr[perm[j]][l];
709:           } else {
710:             PetscArraycpy(block->vals,valptr[perm[j]],bs2);
711:           }
712:         }
713:         colstart = j;
714:       }
715:       rowstart = i;
716:     }
717:   }
718:   PetscFree4(row,col,valptr,perm);
719:   return 0;
720: }
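
/*
   Worked example of the sort/compress step above (bs = 1): suppose the stash holds the
   entries (row,col,val) = (5,3,1.0), (2,7,4.0), (5,3,2.0).  After sorting by row and then
   by column within each row, the two insertions at (5,3) become adjacent and are combined
   into a single send block: their sum 3.0 when the InsertMode is ADD_VALUES, or whichever
   duplicate ends up last after sorting when it is INSERT_VALUES.  The send buffer then
   contains one MatStashBlock per distinct (row,col) location: (2,7) and (5,3).
*/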

722: static PetscErrorCode MatStashBlockTypeSetUp(MatStash *stash)
723: {
724:   if (stash->blocktype == MPI_DATATYPE_NULL) {
725:     PetscInt     bs2 = PetscSqr(stash->bs);
726:     PetscMPIInt  blocklens[2];
727:     MPI_Aint     displs[2];
728:     MPI_Datatype types[2],stype;
729:     /* Note that DummyBlock is a type having standard layout, even when PetscScalar is C++ std::complex.
730:        std::complex itself has standard layout, so does DummyBlock, recursively.
731:        To be compatible with C++ std::complex, complex implementations on GPUs must also have standard layout,
732:        though they can have different alignment, e.g., 16 bytes for double complex, instead of 8 bytes as in GCC libstdc++.
733:        offsetof(type, member) only requires type to have standard layout. Ref. https://en.cppreference.com/w/cpp/types/offsetof.

735:        We can test if std::complex has standard layout with the following code:
736:        #include <iostream>
737:        #include <type_traits>
738:        #include <complex>
739:        int main() {
740:          std::cout << std::boolalpha;
741:          std::cout << std::is_standard_layout<std::complex<double> >::value << '\n';
742:        }
743:        Output: true
744:      */
745:     struct DummyBlock {PetscInt row,col; PetscScalar vals;};

747:     stash->blocktype_size = offsetof(struct DummyBlock,vals) + bs2*sizeof(PetscScalar);
748:     if (stash->blocktype_size % sizeof(PetscInt)) { /* Implies that PetscInt is larger and does not satisfy alignment without padding */
749:       stash->blocktype_size += sizeof(PetscInt) - stash->blocktype_size % sizeof(PetscInt);
750:     }
751:     PetscSegBufferCreate(stash->blocktype_size,1,&stash->segsendblocks);
752:     PetscSegBufferCreate(stash->blocktype_size,1,&stash->segrecvblocks);
753:     PetscSegBufferCreate(sizeof(MatStashFrame),1,&stash->segrecvframe);
754:     blocklens[0] = 2;
755:     blocklens[1] = bs2;
756:     displs[0] = offsetof(struct DummyBlock,row);
757:     displs[1] = offsetof(struct DummyBlock,vals);
758:     types[0] = MPIU_INT;
759:     types[1] = MPIU_SCALAR;
760:     MPI_Type_create_struct(2,blocklens,displs,types,&stype);
761:     MPI_Type_commit(&stype);
762:     MPI_Type_create_resized(stype,0,stash->blocktype_size,&stash->blocktype);
763:     MPI_Type_commit(&stash->blocktype);
764:     MPI_Type_free(&stype);
765:   }
766:   return 0;
767: }
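
/*
   The resized datatype above matters because blocks are later sent and received with
   count > 1: MPI_Type_create_resized() sets the extent of stash->blocktype to
   blocktype_size, so a call such as

     MPI_Isend(frame_buffer,count,stash->blocktype,rank,tag,comm,&req);

   steps through the packed buffer in strides of blocktype_size (two PetscInts, bs2
   PetscScalars, plus any padding), matching the layout produced by PetscSegBufferGet()
   on segsendblocks/segrecvblocks.
*/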

769: /* Callback invoked after the target rank has initiated its receive of the rendezvous message.
770:  * Here we post the main sends.
771:  */
772: static PetscErrorCode MatStashBTSSend_Private(MPI_Comm comm,const PetscMPIInt tag[],PetscMPIInt rankid,PetscMPIInt rank,void *sdata,MPI_Request req[],void *ctx)
773: {
774:   MatStash *stash = (MatStash*)ctx;
775:   MatStashHeader *hdr = (MatStashHeader*)sdata;

778:   MPI_Isend(stash->sendframes[rankid].buffer,hdr->count,stash->blocktype,rank,tag[0],comm,&req[0]);
779:   stash->sendframes[rankid].count = hdr->count;
780:   stash->sendframes[rankid].pending = 1;
781:   return 0;
782: }

784: /* Callback invoked by target after receiving rendezvous message.
785:  * Here we post the main recvs.
786:  */
787: static PetscErrorCode MatStashBTSRecv_Private(MPI_Comm comm,const PetscMPIInt tag[],PetscMPIInt rank,void *rdata,MPI_Request req[],void *ctx)
788: {
789:   MatStash *stash = (MatStash*)ctx;
790:   MatStashHeader *hdr = (MatStashHeader*)rdata;
791:   MatStashFrame *frame;

793:   PetscSegBufferGet(stash->segrecvframe,1,&frame);
794:   PetscSegBufferGet(stash->segrecvblocks,hdr->count,&frame->buffer);
795:   MPI_Irecv(frame->buffer,hdr->count,stash->blocktype,rank,tag[0],comm,&req[0]);
796:   frame->count = hdr->count;
797:   frame->pending = 1;
798:   return 0;
799: }

801: /*
802:  * owners[] contains the ownership ranges; may be indexed by either blocks or scalars
803:  */
804: static PetscErrorCode MatStashScatterBegin_BTS(Mat mat,MatStash *stash,PetscInt owners[])
805: {
807:   size_t nblocks;
808:   char *sendblocks;

810:   if (PetscDefined(USE_DEBUG)) { /* make sure all processors are either in INSERTMODE or ADDMODE */
811:     InsertMode addv;
812:     MPIU_Allreduce((PetscEnum*)&mat->insertmode,(PetscEnum*)&addv,1,MPIU_ENUM,MPI_BOR,PetscObjectComm((PetscObject)mat));
814:   }

816:   MatStashBlockTypeSetUp(stash);
817:   MatStashSortCompress_Private(stash,mat->insertmode);
818:   PetscSegBufferGetSize(stash->segsendblocks,&nblocks);
819:   PetscSegBufferExtractInPlace(stash->segsendblocks,&sendblocks);
820:   if (stash->first_assembly_done) { /* Set up sendhdrs and sendframes for each rank that we sent before */
821:     PetscInt i;
822:     size_t b;
823:     for (i=0,b=0; i<stash->nsendranks; i++) {
824:       stash->sendframes[i].buffer = &sendblocks[b*stash->blocktype_size];
825:       /* sendhdr is never actually sent, but the count is used by MatStashBTSSend_Private */
826:       stash->sendhdr[i].count = 0; /* Might remain empty (in which case we send a zero-sized message) if no values are communicated to that process */
827:       for (; b<nblocks; b++) {
828:         MatStashBlock *sendblock_b = (MatStashBlock*)&sendblocks[b*stash->blocktype_size];
830:         if (sendblock_b->row >= owners[stash->sendranks[i]+1]) break;
831:         stash->sendhdr[i].count++;
832:       }
833:     }
834:   } else {                      /* Dynamically count and pack (first time) */
835:     PetscInt sendno;
836:     size_t i,rowstart;

838:     /* Count number of send ranks and allocate for sends */
839:     stash->nsendranks = 0;
840:     for (rowstart=0; rowstart<nblocks;) {
841:       PetscInt owner;
842:       MatStashBlock *sendblock_rowstart = (MatStashBlock*)&sendblocks[rowstart*stash->blocktype_size];
843:       PetscFindInt(sendblock_rowstart->row,stash->size+1,owners,&owner);
844:       if (owner < 0) owner = -(owner+2);
845:       for (i=rowstart+1; i<nblocks; i++) { /* Move forward through a run of blocks with the same owner */
846:         MatStashBlock *sendblock_i = (MatStashBlock*)&sendblocks[i*stash->blocktype_size];
847:         if (sendblock_i->row >= owners[owner+1]) break;
848:       }
849:       stash->nsendranks++;
850:       rowstart = i;
851:     }
852:     PetscMalloc3(stash->nsendranks,&stash->sendranks,stash->nsendranks,&stash->sendhdr,stash->nsendranks,&stash->sendframes);

854:     /* Set up sendhdrs and sendframes */
855:     sendno = 0;
856:     for (rowstart=0; rowstart<nblocks;) {
857:       PetscInt owner;
858:       MatStashBlock *sendblock_rowstart = (MatStashBlock*)&sendblocks[rowstart*stash->blocktype_size];
859:       PetscFindInt(sendblock_rowstart->row,stash->size+1,owners,&owner);
860:       if (owner < 0) owner = -(owner+2);
861:       stash->sendranks[sendno] = owner;
862:       for (i=rowstart+1; i<nblocks; i++) { /* Move forward through a run of blocks with the same owner */
863:         MatStashBlock *sendblock_i = (MatStashBlock*)&sendblocks[i*stash->blocktype_size];
864:         if (sendblock_i->row >= owners[owner+1]) break;
865:       }
866:       stash->sendframes[sendno].buffer = sendblock_rowstart;
867:       stash->sendframes[sendno].pending = 0;
868:       stash->sendhdr[sendno].count = i - rowstart;
869:       sendno++;
870:       rowstart = i;
871:     }
873:   }

875:   /* Encode insertmode on the outgoing messages. If we want to support more than two options, we would need a new
876:    * message or a dummy entry of some sort. */
877:   if (mat->insertmode == INSERT_VALUES) {
878:     size_t i;
879:     for (i=0; i<nblocks; i++) {
880:       MatStashBlock *sendblock_i = (MatStashBlock*)&sendblocks[i*stash->blocktype_size];
881:       sendblock_i->row = -(sendblock_i->row+1);
882:     }
883:   }

885:   if (stash->first_assembly_done) {
886:     PetscMPIInt i,tag;
887:     PetscCommGetNewTag(stash->comm,&tag);
888:     for (i=0; i<stash->nrecvranks; i++) {
889:       MatStashBTSRecv_Private(stash->comm,&tag,stash->recvranks[i],&stash->recvhdr[i],&stash->recvreqs[i],stash);
890:     }
891:     for (i=0; i<stash->nsendranks; i++) {
892:       MatStashBTSSend_Private(stash->comm,&tag,i,stash->sendranks[i],&stash->sendhdr[i],&stash->sendreqs[i],stash);
893:     }
894:     stash->use_status = PETSC_TRUE; /* Use count from message status. */
895:   } else {
896:     PetscCommBuildTwoSidedFReq(stash->comm,1,MPIU_INT,stash->nsendranks,stash->sendranks,(PetscInt*)stash->sendhdr,
897:                                       &stash->nrecvranks,&stash->recvranks,(PetscInt*)&stash->recvhdr,1,&stash->sendreqs,&stash->recvreqs,
898:                                       MatStashBTSSend_Private,MatStashBTSRecv_Private,stash);
899:     PetscMalloc2(stash->nrecvranks,&stash->some_indices,stash->nrecvranks,&stash->some_statuses);
900:     stash->use_status = PETSC_FALSE; /* Use count from header instead of from message. */
901:   }

903:   PetscSegBufferExtractInPlace(stash->segrecvframe,&stash->recvframes);
904:   stash->recvframe_active     = NULL;
905:   stash->recvframe_i          = 0;
906:   stash->some_i               = 0;
907:   stash->some_count           = 0;
908:   stash->recvcount            = 0;
909:   stash->first_assembly_done  = mat->assembly_subset; /* See the same logic in VecAssemblyBegin_MPI_BTS */
910:   stash->insertmode           = &mat->insertmode;
911:   return 0;
912: }
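
/*
   Worked example of the InsertMode encoding above: with INSERT_VALUES a block destined for
   global row 7 is sent with row = -(7+1) = -8.  The receiver in MatStashScatterGetMesg_BTS()
   sees the negative row, concludes the sender used INSERT_VALUES (a non-negative row would
   mean ADD_VALUES), and restores the index as -(-8+1) = 7 before handing the block back to
   the caller.
*/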

914: static PetscErrorCode MatStashScatterGetMesg_BTS(MatStash *stash,PetscMPIInt *n,PetscInt **row,PetscInt **col,PetscScalar **val,PetscInt *flg)
915: {
916:   MatStashBlock *block;

918:   *flg = 0;
919:   while (!stash->recvframe_active || stash->recvframe_i == stash->recvframe_count) {
920:     if (stash->some_i == stash->some_count) {
921:       if (stash->recvcount == stash->nrecvranks) return 0; /* Done */
922:       MPI_Waitsome(stash->nrecvranks,stash->recvreqs,&stash->some_count,stash->some_indices,stash->use_status?stash->some_statuses:MPI_STATUSES_IGNORE);
923:       stash->some_i = 0;
924:     }
925:     stash->recvframe_active = &stash->recvframes[stash->some_indices[stash->some_i]];
926:     stash->recvframe_count = stash->recvframe_active->count; /* From header; maximum count */
927:     if (stash->use_status) { /* Count what was actually sent */
928:       MPI_Get_count(&stash->some_statuses[stash->some_i],stash->blocktype,&stash->recvframe_count);
929:     }
930:     if (stash->recvframe_count > 0) { /* Check for InsertMode consistency */
931:       block = (MatStashBlock*)&((char*)stash->recvframe_active->buffer)[0];
932:       if (PetscUnlikely(*stash->insertmode == NOT_SET_VALUES)) *stash->insertmode = block->row < 0 ? INSERT_VALUES : ADD_VALUES;
935:     }
936:     stash->some_i++;
937:     stash->recvcount++;
938:     stash->recvframe_i = 0;
939:   }
940:   *n = 1;
941:   block = (MatStashBlock*)&((char*)stash->recvframe_active->buffer)[stash->recvframe_i*stash->blocktype_size];
942:   if (block->row < 0) block->row = -(block->row + 1);
943:   *row = &block->row;
944:   *col = &block->col;
945:   *val = block->vals;
946:   stash->recvframe_i++;
947:   *flg = 1;
948:   return 0;
949: }

951: static PetscErrorCode MatStashScatterEnd_BTS(MatStash *stash)
952: {
953:   MPI_Waitall(stash->nsendranks,stash->sendreqs,MPI_STATUSES_IGNORE);
954:   if (stash->first_assembly_done) { /* Reuse the communication contexts, so consolidate and reset segrecvblocks  */
955:     void *dummy;
956:     PetscSegBufferExtractInPlace(stash->segrecvblocks,&dummy);
957:   } else {                      /* No reuse, so collect everything. */
958:     MatStashScatterDestroy_BTS(stash);
959:   }

961:   /* Now update oldnmax to be approximately 10% more than the maximum n used,
962:      so that the wastage of space is reduced the next time this stash is used.
963:      Only update oldnmax if it increases */
964:   if (stash->n) {
965:     PetscInt bs2     = stash->bs*stash->bs;
966:     PetscInt oldnmax = ((int)(stash->n * 1.1) + 5)*bs2;
967:     if (oldnmax > stash->oldnmax) stash->oldnmax = oldnmax;
968:   }

970:   stash->nmax       = 0;
971:   stash->n          = 0;
972:   stash->reallocs   = -1;
973:   stash->nprocessed = 0;

975:   PetscMatStashSpaceDestroy(&stash->space_head);

977:   stash->space = NULL;

979:   return 0;
980: }

982: PetscErrorCode MatStashScatterDestroy_BTS(MatStash *stash)
983: {
984:   PetscSegBufferDestroy(&stash->segsendblocks);
985:   PetscSegBufferDestroy(&stash->segrecvframe);
986:   stash->recvframes = NULL;
987:   PetscSegBufferDestroy(&stash->segrecvblocks);
988:   if (stash->blocktype != MPI_DATATYPE_NULL) {
989:     MPI_Type_free(&stash->blocktype);
990:   }
991:   stash->nsendranks = 0;
992:   stash->nrecvranks = 0;
993:   PetscFree3(stash->sendranks,stash->sendhdr,stash->sendframes);
994:   PetscFree(stash->sendreqs);
995:   PetscFree(stash->recvreqs);
996:   PetscFree(stash->recvranks);
997:   PetscFree(stash->recvhdr);
998:   PetscFree2(stash->some_indices,stash->some_statuses);
999:   return 0;
1000: }
1001: #endif