Actual source code: mpiptap.c


  2: /*
  3:   Defines projective product routines where A is an MPIAIJ matrix
  4:           C = P^T * A * P
  5: */

  7: #include <../src/mat/impls/aij/seq/aij.h>
  8: #include <../src/mat/utils/freespace.h>
  9: #include <../src/mat/impls/aij/mpi/mpiaij.h>
 10: #include <petscbt.h>
 11: #include <petsctime.h>
 12: #include <petsc/private/hashmapiv.h>
 13: #include <petsc/private/hashseti.h>
 14: #include <petscsf.h>

 16: PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer)
 17: {
 18:   PetscBool         iascii;
 19:   PetscViewerFormat format;
 20:   Mat_APMPI         *ptap;

 22:   MatCheckProduct(A,1);
 23:   ptap = (Mat_APMPI*)A->product->data;
 24:   PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);
 25:   if (iascii) {
 26:     PetscViewerGetFormat(viewer,&format);
 27:     if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
 28:       if (ptap->algType == 0) {
 29:         PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n");
 30:       } else if (ptap->algType == 1) {
 31:         PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n");
 32:       } else if (ptap->algType == 2) {
 33:         PetscViewerASCIIPrintf(viewer,"using allatonce MatPtAP() implementation\n");
 34:       } else if (ptap->algType == 3) {
 35:         PetscViewerASCIIPrintf(viewer,"using merged allatonce MatPtAP() implementation\n");
 36:       }
 37:     }
 38:   }
 39:   return 0;
 40: }
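
/*
   Illustrative usage sketch (not part of the original mpiptap.c; the helper name is
   hypothetical): assuming A and P are assembled MPIAIJ matrices on the same communicator,
   MatPtAP() attaches the Mat_APMPI product data to C, and viewing C with an INFO format
   can report, via the callback above, which PtAP implementation was selected.
*/
static PetscErrorCode PtAPExampleViewInfo(Mat A,Mat P)
{
  Mat C;

  MatPtAP(A,P,MAT_INITIAL_MATRIX,PETSC_DEFAULT,&C);                         /* C = P^T*A*P */
  PetscViewerPushFormat(PETSC_VIEWER_STDOUT_WORLD,PETSC_VIEWER_ASCII_INFO);
  MatView(C,PETSC_VIEWER_STDOUT_WORLD);
  PetscViewerPopFormat(PETSC_VIEWER_STDOUT_WORLD);
  MatDestroy(&C);
  return 0;
}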

 42: PetscErrorCode MatDestroy_MPIAIJ_PtAP(void *data)
 43: {
 44:   Mat_APMPI           *ptap = (Mat_APMPI*)data;
 45:   Mat_Merge_SeqsToMPI *merge;

 47:   PetscFree2(ptap->startsj_s,ptap->startsj_r);
 48:   PetscFree(ptap->bufa);
 49:   MatDestroy(&ptap->P_loc);
 50:   MatDestroy(&ptap->P_oth);
 51:   MatDestroy(&ptap->A_loc); /* used by MatTransposeMatMult() */
 52:   MatDestroy(&ptap->Rd);
 53:   MatDestroy(&ptap->Ro);
 54:   if (ptap->AP_loc) { /* used by alg_rap */
 55:     Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data;
 56:     PetscFree(ap->i);
 57:     PetscFree2(ap->j,ap->a);
 58:     MatDestroy(&ptap->AP_loc);
 59:   } else { /* used by alg_ptap */
 60:     PetscFree(ptap->api);
 61:     PetscFree(ptap->apj);
 62:   }
 63:   MatDestroy(&ptap->C_loc);
 64:   MatDestroy(&ptap->C_oth);
 65:   if (ptap->apa) PetscFree(ptap->apa);

 67:   MatDestroy(&ptap->Pt);

 69:   merge = ptap->merge;
 70:   if (merge) { /* used by alg_ptap */
 71:     PetscFree(merge->id_r);
 72:     PetscFree(merge->len_s);
 73:     PetscFree(merge->len_r);
 74:     PetscFree(merge->bi);
 75:     PetscFree(merge->bj);
 76:     PetscFree(merge->buf_ri[0]);
 77:     PetscFree(merge->buf_ri);
 78:     PetscFree(merge->buf_rj[0]);
 79:     PetscFree(merge->buf_rj);
 80:     PetscFree(merge->coi);
 81:     PetscFree(merge->coj);
 82:     PetscFree(merge->owners_co);
 83:     PetscLayoutDestroy(&merge->rowmap);
 84:     PetscFree(ptap->merge);
 85:   }
 86:   ISLocalToGlobalMappingDestroy(&ptap->ltog);

 88:   PetscSFDestroy(&ptap->sf);
 89:   PetscFree(ptap->c_othi);
 90:   PetscFree(ptap->c_rmti);
 91:   PetscFree(ptap);
 92:   return 0;
 93: }

 95: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C)
 96: {
 97:   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
 98:   Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
 99:   Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
100:   Mat_APMPI         *ptap;
101:   Mat               AP_loc,C_loc,C_oth;
102:   PetscInt          i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz,nout;
103:   PetscScalar       *apa;
104:   const PetscInt    *cols;
105:   const PetscScalar *vals;

107:   MatCheckProduct(C,3);
108:   ptap = (Mat_APMPI*)C->product->data;

112:   MatZeroEntries(C);

114:   /* 1) get R = Pd^T,Ro = Po^T */
115:   if (ptap->reuse == MAT_REUSE_MATRIX) {
116:     MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
117:     MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
118:   }

120:   /* 2) get AP_loc */
121:   AP_loc = ptap->AP_loc;
122:   ap = (Mat_SeqAIJ*)AP_loc->data;

124:   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
125:   /*-----------------------------------------------------*/
126:   if (ptap->reuse == MAT_REUSE_MATRIX) {
127:     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
128:     MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
129:     MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
130:   }

132:   /* 2-2) compute numeric A_loc*P - dominating part */
133:   /* ---------------------------------------------- */
134:   /* get data from symbolic products */
135:   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
136:   if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;

138:   api   = ap->i;
139:   apj   = ap->j;
140:   ISLocalToGlobalMappingApply(ptap->ltog,api[AP_loc->rmap->n],apj,apj);
141:   for (i=0; i<am; i++) {
142:     /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
143:     apnz = api[i+1] - api[i];
144:     apa = ap->a + api[i];
145:     PetscArrayzero(apa,apnz);
146:     AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa);
147:   }
148:   ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,api[AP_loc->rmap->n],apj,&nout,apj);

151:   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
152:   /* Always use the scalable version since we are in the MPI scalable algorithm */
153:   MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc);
154:   MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth);

156:   C_loc = ptap->C_loc;
157:   C_oth = ptap->C_oth;

159:   /* add C_loc and C_oth to C */
160:   MatGetOwnershipRange(C,&rstart,&rend);

162:   /* C_loc -> C */
163:   cm    = C_loc->rmap->N;
164:   c_seq = (Mat_SeqAIJ*)C_loc->data;
165:   cols = c_seq->j;
166:   vals = c_seq->a;
167:   ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_loc->rmap->n],c_seq->j,c_seq->j);

169:   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
170:   /* when there are no off-processor parts. */
171:   /* If was_assembled is true, then the assignment aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
172:   /* is no longer valid. In that case the more general MatSetValues_MPIAIJ() has to be used, where the column index is looked up in */
173:   /* a table and additional bookkeeping is performed. */
174:   if (C->assembled) {
175:     C->was_assembled = PETSC_TRUE;
176:     C->assembled     = PETSC_FALSE;
177:   }
178:   if (C->was_assembled) {
179:     for (i=0; i<cm; i++) {
180:       ncols = c_seq->i[i+1] - c_seq->i[i];
181:       row = rstart + i;
182:       MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);
183:       cols += ncols; vals += ncols;
184:     }
185:   } else {
186:     MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);
187:   }
188:   ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_loc->rmap->n],c_seq->j,&nout,c_seq->j);

191:   /* Co -> C, off-processor part */
192:   cm = C_oth->rmap->N;
193:   c_seq = (Mat_SeqAIJ*)C_oth->data;
194:   cols = c_seq->j;
195:   vals = c_seq->a;
196:   ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_oth->rmap->n],c_seq->j,c_seq->j);
197:   for (i=0; i<cm; i++) {
198:     ncols = c_seq->i[i+1] - c_seq->i[i];
199:     row = p->garray[i];
200:     MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
201:     cols += ncols; vals += ncols;
202:   }
203:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
204:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

206:   ptap->reuse = MAT_REUSE_MATRIX;

208:   ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_oth->rmap->n],c_seq->j,&nout,c_seq->j);
210:   return 0;
211: }
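
/*
   Minimal reuse sketch (illustrative; the helper name is hypothetical): the first MatPtAP()
   call runs the symbolic phase and stores the Mat_APMPI data on C; a later call with
   MAT_REUSE_MATRIX re-enters the numeric routine above with ptap->reuse == MAT_REUSE_MATRIX,
   so P_loc, P_oth, Rd and Ro are refreshed rather than rebuilt.  Assumes A keeps the same
   nonzero pattern between calls.
*/
static PetscErrorCode PtAPExampleReuse(Mat A,Mat P,Mat *C)
{
  MatPtAP(A,P,MAT_INITIAL_MATRIX,PETSC_DEFAULT,C); /* symbolic + numeric */
  /* ... update the numerical values of A, keeping the same pattern ... */
  MatPtAP(A,P,MAT_REUSE_MATRIX,PETSC_DEFAULT,C);   /* numeric phase only */
  return 0;
}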

213: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat Cmpi)
214: {
215:   PetscErrorCode      ierr;
216:   Mat_APMPI           *ptap;
217:   Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
218:   MPI_Comm            comm;
219:   PetscMPIInt         size,rank;
220:   Mat                 P_loc,P_oth;
221:   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
222:   PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
223:   PetscInt            *lnk,i,k,pnz,row,nsend;
224:   PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,nrecv;
225:   PETSC_UNUSED PetscMPIInt icompleted=0;
226:   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
227:   const PetscInt      *owners;
228:   PetscInt            len,proc,*dnz,*onz,nzi,nspacedouble;
229:   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
230:   MPI_Request         *swaits,*rwaits;
231:   MPI_Status          *sstatus,rstatus;
232:   PetscLayout         rowmap;
233:   PetscInt            *owners_co,*coi,*coj;    /* i and j array of (p->B)^T*A*P - used in the communication */
234:   PetscMPIInt         *len_r,*id_r;    /* array of length of comm->size, store send/recv matrix values */
235:   PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi,nout;
236:   Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
237:   PetscScalar         *apv;
238:   PetscTable          ta;
239:   MatType             mtype;
240:   const char          *prefix;
241: #if defined(PETSC_USE_INFO)
242:   PetscReal           apfill;
243: #endif

245:   MatCheckProduct(Cmpi,4);
247:   PetscObjectGetComm((PetscObject)A,&comm);
248:   MPI_Comm_size(comm,&size);
249:   MPI_Comm_rank(comm,&rank);

251:   if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;

253:   /* create symbolic parallel matrix Cmpi */
254:   MatGetType(A,&mtype);
255:   MatSetType(Cmpi,mtype);

257:   /* create struct Mat_APMPI and attach it to C later */
258:   PetscNew(&ptap);
259:   ptap->reuse = MAT_INITIAL_MATRIX;
260:   ptap->algType = 0;

262:   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
263:   MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth);
264:   /* get P_loc by taking all local rows of P */
265:   MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc);

267:   ptap->P_loc = P_loc;
268:   ptap->P_oth = P_oth;

270:   /* (0) compute Rd = Pd^T, Ro = Po^T  */
271:   /* --------------------------------- */
272:   MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
273:   MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);

275:   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
276:   /* ----------------------------------------------------------------- */
277:   p_loc  = (Mat_SeqAIJ*)P_loc->data;
278:   if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data;

280:   /* create and initialize a linked list */
281:   PetscTableCreate(pn,pN,&ta); /* for computing AP_loc and Cmpi */
282:   MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta);
283:   MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta);
284:   PetscTableGetCount(ta,&Crmax); /* Crmax = nnz(sum of Prows) */

286:   PetscLLCondensedCreate_Scalable(Crmax,&lnk);

288:   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
289:   if (ao) {
290:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);
291:   } else {
292:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);
293:   }
294:   current_space = free_space;
295:   nspacedouble  = 0;

297:   PetscMalloc1(am+1,&api);
298:   api[0] = 0;
299:   for (i=0; i<am; i++) {
300:     /* diagonal portion: Ad[i,:]*P */
301:     ai = ad->i; pi = p_loc->i;
302:     nzi = ai[i+1] - ai[i];
303:     aj  = ad->j + ai[i];
304:     for (j=0; j<nzi; j++) {
305:       row  = aj[j];
306:       pnz  = pi[row+1] - pi[row];
307:       Jptr = p_loc->j + pi[row];
308:       /* add non-zero cols of P into the sorted linked list lnk */
309:       PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
310:     }
311:     /* off-diagonal portion: Ao[i,:]*P */
312:     if (ao) {
313:       ai = ao->i; pi = p_oth->i;
314:       nzi = ai[i+1] - ai[i];
315:       aj  = ao->j + ai[i];
316:       for (j=0; j<nzi; j++) {
317:         row  = aj[j];
318:         pnz  = pi[row+1] - pi[row];
319:         Jptr = p_oth->j + pi[row];
320:         PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
321:       }
322:     }
323:     apnz     = lnk[0];
324:     api[i+1] = api[i] + apnz;

326:     /* if free space is not available, double the total space in the list */
327:     if (current_space->local_remaining<apnz) {
328:       PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
329:       nspacedouble++;
330:     }

332:     /* Copy data into free space, then initialize lnk */
333:     PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);

335:     current_space->array           += apnz;
336:     current_space->local_used      += apnz;
337:     current_space->local_remaining -= apnz;
338:   }
339:   /* Allocate space for apj and apv, initialize apj, and */
340:   /* destroy list of free space and other temporary array(s) */
341:   PetscCalloc2(api[am],&apj,api[am],&apv);
342:   PetscFreeSpaceContiguous(&free_space,apj);
343:   PetscLLCondensedDestroy_Scalable(lnk);

345:   /* Create AP_loc for reuse */
346:   MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);
347:   MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog);

349: #if defined(PETSC_USE_INFO)
350:   if (ao) {
351:     apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
352:   } else {
353:     apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
354:   }
355:   ptap->AP_loc->info.mallocs           = nspacedouble;
356:   ptap->AP_loc->info.fill_ratio_given  = fill;
357:   ptap->AP_loc->info.fill_ratio_needed = apfill;

359:   if (api[am]) {
360:     PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);
361:     PetscInfo(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);
362:   } else {
363:     PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty.\n");
364:   }
365: #endif
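
/*
   Note: the PetscInfo() messages above appear only when PETSc is built with PETSC_USE_INFO
   and the program is run with the -info option; they report the AP_loc fill ratio actually
   needed versus the fill that was given, which is useful for tuning the fill argument of
   MatPtAP().
*/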

367:   /* (2-1) compute symbolic Co = Ro*AP_loc  */
368:   /* -------------------------------------- */
369:   MatProductCreate(ptap->Ro,ptap->AP_loc,NULL,&ptap->C_oth);
370:   MatGetOptionsPrefix(A,&prefix);
371:   MatSetOptionsPrefix(ptap->C_oth,prefix);
372:   MatAppendOptionsPrefix(ptap->C_oth,"inner_offdiag_");

374:   MatProductSetType(ptap->C_oth,MATPRODUCT_AB);
375:   MatProductSetAlgorithm(ptap->C_oth,"sorted");
376:   MatProductSetFill(ptap->C_oth,fill);
377:   MatProductSetFromOptions(ptap->C_oth);
378:   MatProductSymbolic(ptap->C_oth);

380:   /* (3) send coj of C_oth to other processors  */
381:   /* ------------------------------------------ */
382:   /* determine row ownership */
383:   PetscLayoutCreate(comm,&rowmap);
384:   PetscLayoutSetLocalSize(rowmap, pn);
385:   PetscLayoutSetBlockSize(rowmap, 1);
386:   PetscLayoutSetUp(rowmap);
387:   PetscLayoutGetRanges(rowmap,&owners);

389:   /* determine the number of messages to send and their lengths */
390:   PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
391:   PetscArrayzero(len_s,size);
392:   PetscArrayzero(len_si,size);

394:   c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
395:   coi   = c_oth->i; coj = c_oth->j;
396:   con   = ptap->C_oth->rmap->n;
397:   proc  = 0;
398:   ISLocalToGlobalMappingApply(ptap->ltog,coi[con],coj,coj);
399:   for (i=0; i<con; i++) {
400:     while (prmap[i] >= owners[proc+1]) proc++;
401:     len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
402:     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
403:   }

405:   len          = 0; /* max length of buf_si[], see (4) */
406:   owners_co[0] = 0;
407:   nsend        = 0;
408:   for (proc=0; proc<size; proc++) {
409:     owners_co[proc+1] = owners_co[proc] + len_si[proc];
410:     if (len_s[proc]) {
411:       nsend++;
412:       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
413:       len         += len_si[proc];
414:     }
415:   }

417:   /* determine the number and length of messages to receive for coi and coj  */
418:   PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
419:   PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);

421:   /* post the Irecv and Isend of coj */
422:   PetscCommGetNewTag(comm,&tagj);
423:   PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
424:   PetscMalloc1(nsend+1,&swaits);
425:   for (proc=0, k=0; proc<size; proc++) {
426:     if (!len_s[proc]) continue;
427:     i    = owners_co[proc];
428:     MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
429:     k++;
430:   }

432:   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
433:   /* ---------------------------------------- */
434:   MatProductCreate(ptap->Rd,ptap->AP_loc,NULL,&ptap->C_loc);
435:   MatProductSetType(ptap->C_loc,MATPRODUCT_AB);
436:   MatProductSetAlgorithm(ptap->C_loc,"default");
437:   MatProductSetFill(ptap->C_loc,fill);

439:   MatSetOptionsPrefix(ptap->C_loc,prefix);
440:   MatAppendOptionsPrefix(ptap->C_loc,"inner_diag_");

442:   MatProductSetFromOptions(ptap->C_loc);
443:   MatProductSymbolic(ptap->C_loc);

445:   c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
446:   ISLocalToGlobalMappingApply(ptap->ltog,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,c_loc->j);

448:   /* receives of coj are complete */
449:   for (i=0; i<nrecv; i++) {
450:     MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
451:   }
452:   PetscFree(rwaits);
453:   if (nsend) MPI_Waitall(nsend,swaits,sstatus);

455:   /* add received column indices into ta to update Crmax */
456:   for (k=0; k<nrecv; k++) {/* k-th received message */
457:     Jptr = buf_rj[k];
458:     for (j=0; j<len_r[k]; j++) {
459:       PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
460:     }
461:   }
462:   PetscTableGetCount(ta,&Crmax);
463:   PetscTableDestroy(&ta);

465:   /* (4) send and recv coi */
466:   /*-----------------------*/
467:   PetscCommGetNewTag(comm,&tagi);
468:   PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
469:   PetscMalloc1(len+1,&buf_s);
470:   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
471:   for (proc=0,k=0; proc<size; proc++) {
472:     if (!len_s[proc]) continue;
473:     /* form outgoing message for i-structure:
474:          buf_si[0]:                 nrows to be sent
475:                [1:nrows]:           row index (global)
476:                [nrows+1:2*nrows+1]: i-structure index
477:     */
478:     /*-------------------------------------------*/
479:     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
480:     buf_si_i    = buf_si + nrows+1;
481:     buf_si[0]   = nrows;
482:     buf_si_i[0] = 0;
483:     nrows       = 0;
484:     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
485:       nzi = coi[i+1] - coi[i];
486:       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
487:       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
488:       nrows++;
489:     }
490:     MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
491:     k++;
492:     buf_si += len_si[proc];
493:   }
494:   for (i=0; i<nrecv; i++) {
495:     MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
496:   }
497:   PetscFree(rwaits);
498:   if (nsend) MPI_Waitall(nsend,swaits,sstatus);

500:   PetscFree4(len_s,len_si,sstatus,owners_co);
501:   PetscFree(len_ri);
502:   PetscFree(swaits);
503:   PetscFree(buf_s);

505:   /* (5) compute the local portion of Cmpi      */
506:   /* ------------------------------------------ */
507:   /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
508:   PetscFreeSpaceGet(Crmax,&free_space);
509:   current_space = free_space;

511:   PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
512:   for (k=0; k<nrecv; k++) {
513:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
514:     nrows       = *buf_ri_k[k];
515:     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th recved i-structure */
516:     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure  */
517:   }

519:   MatPreallocateInitialize(comm,pn,pn,dnz,onz);
520:   PetscLLCondensedCreate_Scalable(Crmax,&lnk);
521:   for (i=0; i<pn; i++) {
522:     /* add C_loc into Cmpi */
523:     nzi  = c_loc->i[i+1] - c_loc->i[i];
524:     Jptr = c_loc->j + c_loc->i[i];
525:     PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);

527:     /* add received col data into lnk */
528:     for (k=0; k<nrecv; k++) { /* k-th received message */
529:       if (i == *nextrow[k]) { /* i-th row */
530:         nzi  = *(nextci[k]+1) - *nextci[k];
531:         Jptr = buf_rj[k] + *nextci[k];
532:         PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
533:         nextrow[k]++; nextci[k]++;
534:       }
535:     }
536:     nzi = lnk[0];

538:     /* copy data into free space, then initialize lnk */
539:     PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk);
540:     MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
541:   }
542:   PetscFree3(buf_ri_k,nextrow,nextci);
543:   PetscLLCondensedDestroy_Scalable(lnk);
544:   PetscFreeSpaceDestroy(free_space);

546:   /* local sizes and preallocation */
547:   MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
548:   if (P->cmap->bs > 0) {
549:     PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs);
550:     PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs);
551:   }
552:   MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
553:   MatPreallocateFinalize(dnz,onz);

555:   /* members in merge */
556:   PetscFree(id_r);
557:   PetscFree(len_r);
558:   PetscFree(buf_ri[0]);
559:   PetscFree(buf_ri);
560:   PetscFree(buf_rj[0]);
561:   PetscFree(buf_rj);
562:   PetscLayoutDestroy(&rowmap);

564:   nout = 0;
565:   ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_oth->i[ptap->C_oth->rmap->n],c_oth->j,&nout,c_oth->j);
567:   ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,&nout,c_loc->j);

570:   /* attach the supporting struct to Cmpi for reuse */
571:   Cmpi->product->data    = ptap;
572:   Cmpi->product->view    = MatView_MPIAIJ_PtAP;
573:   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;

575:   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
576:   Cmpi->assembled        = PETSC_FALSE;
577:   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
578:   return 0;
579: }
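
/*
   Sketch of driving the symbolic/numeric pair above through the MatProduct interface
   (illustrative; the helper name is hypothetical and the algorithm string "scalable" is
   assumed to be the name under which this routine is registered).  This mirrors what the
   routine itself does internally for the C_loc and C_oth inner products.
*/
static PetscErrorCode PtAPExampleProductScalable(Mat A,Mat P,Mat *C)
{
  MatProductCreate(A,P,NULL,C);
  MatProductSetType(*C,MATPRODUCT_PtAP);
  MatProductSetAlgorithm(*C,"scalable"); /* assumed algorithm name for this file's routines */
  MatProductSetFill(*C,PETSC_DEFAULT);
  MatProductSetFromOptions(*C);
  MatProductSymbolic(*C);  /* expected to dispatch to MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable() */
  MatProductNumeric(*C);   /* expected to dispatch to MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable() */
  return 0;
}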

581: static inline PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHSetI dht,PetscHSetI oht)
582: {
583:   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
584:   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
585:   PetscInt             *ai,nzi,j,*aj,row,col,*pi,*pj,pnz,nzpi,*p_othcols,k;
586:   PetscInt             pcstart,pcend,column,offset;

588:   pcstart = P->cmap->rstart;
589:   pcstart *= dof;
590:   pcend   = P->cmap->rend;
591:   pcend   *= dof;
592:   /* diagonal portion: Ad[i,:]*P */
593:   ai = ad->i;
594:   nzi = ai[i+1] - ai[i];
595:   aj  = ad->j + ai[i];
596:   for (j=0; j<nzi; j++) {
597:     row  = aj[j];
598:     offset = row%dof;
599:     row   /= dof;
600:     nzpi = pd->i[row+1] - pd->i[row];
601:     pj  = pd->j + pd->i[row];
602:     for (k=0; k<nzpi; k++) {
603:       PetscHSetIAdd(dht,pj[k]*dof+offset+pcstart);
604:     }
605:   }
606:   /* off-diagonal portion of P: Ad[i,:]*Po */
607:   for (j=0; j<nzi; j++) {
608:     row  = aj[j];
609:     offset = row%dof;
610:     row   /= dof;
611:     nzpi = po->i[row+1] - po->i[row];
612:     pj  = po->j + po->i[row];
613:     for (k=0; k<nzpi; k++) {
614:       PetscHSetIAdd(oht,p->garray[pj[k]]*dof+offset);
615:     }
616:   }

618:   /* off diagonal part: Ao[i, :]*P_oth */
619:   if (ao) {
620:     ai = ao->i;
621:     pi = p_oth->i;
622:     nzi = ai[i+1] - ai[i];
623:     aj  = ao->j + ai[i];
624:     for (j=0; j<nzi; j++) {
625:       row  = aj[j];
626:       offset = a->garray[row]%dof;
627:       row  = map[row];
628:       pnz  = pi[row+1] - pi[row];
629:       p_othcols = p_oth->j + pi[row];
630:       for (col=0; col<pnz; col++) {
631:         column = p_othcols[col] * dof + offset;
632:         if (column>=pcstart && column<pcend) {
633:           PetscHSetIAdd(dht,column);
634:         } else {
635:           PetscHSetIAdd(oht,column);
636:         }
637:       }
638:     }
639:   } /* end if (ao) */
640:   return 0;
641: }
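
/*
   Illustrative sketch of the dof-interlaced indexing used above (helper name hypothetical):
   an interlaced row i of A splits into the P row ii = i/dof and the component offset = i%dof;
   a diagonal-block column pj of P then contributes the interlaced column
   pj*dof + offset + pcstart to the diagonal hash set dht.
*/
static inline PetscInt PtAPExampleDiagColumn(PetscInt i,PetscInt dof,PetscInt pj,PetscInt pcstart)
{
  PetscInt offset = i%dof;          /* component within the dof block */
  return pj*dof + offset + pcstart; /* matches the PetscHSetIAdd(dht,...) call above */
}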

643: static inline PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHMapIV hmap)
644: {
645:   Mat_MPIAIJ           *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
646:   Mat_SeqAIJ           *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
647:   PetscInt             *ai,nzi,j,*aj,row,col,*pi,pnz,*p_othcols,pcstart,*pj,k,nzpi,offset;
648:   PetscScalar          ra,*aa,*pa;

650:   pcstart = P->cmap->rstart;
651:   pcstart *= dof;

653:   /* diagonal portion: Ad[i,:]*P */
654:   ai  = ad->i;
655:   nzi = ai[i+1] - ai[i];
656:   aj  = ad->j + ai[i];
657:   aa  = ad->a + ai[i];
658:   for (j=0; j<nzi; j++) {
659:     ra   = aa[j];
660:     row  = aj[j];
661:     offset = row%dof;
662:     row    /= dof;
663:     nzpi = pd->i[row+1] - pd->i[row];
664:     pj = pd->j + pd->i[row];
665:     pa = pd->a + pd->i[row];
666:     for (k=0; k<nzpi; k++) {
667:       PetscHMapIVAddValue(hmap,pj[k]*dof+offset+pcstart,ra*pa[k]);
668:     }
669:     PetscLogFlops(2.0*nzpi);
670:   }
671:   for (j=0; j<nzi; j++) {
672:     ra   = aa[j];
673:     row  = aj[j];
674:     offset = row%dof;
675:     row   /= dof;
676:     nzpi = po->i[row+1] - po->i[row];
677:     pj = po->j + po->i[row];
678:     pa = po->a + po->i[row];
679:     for (k=0; k<nzpi; k++) {
680:       PetscHMapIVAddValue(hmap,p->garray[pj[k]]*dof+offset,ra*pa[k]);
681:     }
682:     PetscLogFlops(2.0*nzpi);
683:   }

685:   /* off diagonal part: Ao[i, :]*P_oth */
686:   if (ao) {
687:     ai = ao->i;
688:     pi = p_oth->i;
689:     nzi = ai[i+1] - ai[i];
690:     aj  = ao->j + ai[i];
691:     aa  = ao->a + ai[i];
692:     for (j=0; j<nzi; j++) {
693:       row  = aj[j];
694:       offset = a->garray[row]%dof;
695:       row    = map[row];
696:       ra   = aa[j];
697:       pnz  = pi[row+1] - pi[row];
698:       p_othcols = p_oth->j + pi[row];
699:       pa   = p_oth->a + pi[row];
700:       for (col=0; col<pnz; col++) {
701:         PetscHMapIVAddValue(hmap,p_othcols[col]*dof+offset,ra*pa[col]);
702:       }
703:       PetscLogFlops(2.0*pnz);
704:     }
705:   } /* end if (ao) */

707:   return 0;
708: }
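
/*
   Sketch of the hash-map accumulation pattern used by the numeric routine above
   (illustrative, with made-up indices and values; the PetscHMapIV calls are the same
   ones used elsewhere in this file).
*/
static PetscErrorCode PtAPExampleHMapAccumulate(void)
{
  PetscHMapIV hmap;
  PetscInt    n = 0,cols[8];
  PetscScalar vals[8];

  PetscHMapIVCreate(&hmap);
  PetscHMapIVAddValue(hmap,4,1.0);        /* AP(i,4) += 1.0             */
  PetscHMapIVAddValue(hmap,2,0.5);        /* AP(i,2) += 0.5             */
  PetscHMapIVAddValue(hmap,4,2.0);        /* AP(i,4) += 2.0, now 3.0    */
  PetscHMapIVGetPairs(hmap,&n,cols,vals); /* n = 2, pairs in hash order */
  PetscHMapIVDestroy(&hmap);
  return 0;
}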

710: PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat,Mat,PetscInt dof,MatReuse,Mat*);

712: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,Mat C)
713: {
714:   Mat_MPIAIJ        *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
715:   Mat_SeqAIJ        *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
716:   Mat_APMPI         *ptap;
717:   PetscHMapIV       hmap;
718:   PetscInt          i,j,jj,kk,nzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,ccstart,ccend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,*dcc,*occ,loc;
719:   PetscScalar       *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
720:   PetscInt          offset,ii,pocol;
721:   const PetscInt    *mappingindices;
722:   IS                map;

724:   MatCheckProduct(C,4);
725:   ptap = (Mat_APMPI*)C->product->data;

729:   MatZeroEntries(C);

731:   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
732:   /*-----------------------------------------------------*/
733:   if (ptap->reuse == MAT_REUSE_MATRIX) {
734:     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
735:     MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth);
736:   }
737:   PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);

739:   MatGetLocalSize(p->B,NULL,&pon);
740:   pon *= dof;
741:   PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);
742:   MatGetLocalSize(A,&am,NULL);
743:   cmaxr = 0;
744:   for (i=0; i<pon; i++) {
745:     cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
746:   }
747:   PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);
748:   PetscHMapIVCreate(&hmap);
749:   PetscHMapIVResize(hmap,cmaxr);
750:   ISGetIndices(map,&mappingindices);
751:   for (i=0; i<am && pon; i++) {
752:     PetscHMapIVClear(hmap);
753:     offset = i%dof;
754:     ii     = i/dof;
755:     nzi = po->i[ii+1] - po->i[ii];
756:     if (!nzi) continue;
757:     MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);
758:     voff = 0;
759:     PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);
760:     if (!voff) continue;

762:     /* Form C(ii, :) */
763:     poj = po->j + po->i[ii];
764:     poa = po->a + po->i[ii];
765:     for (j=0; j<nzi; j++) {
766:       pocol = poj[j]*dof+offset;
767:       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
768:       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
769:       for (jj=0; jj<voff; jj++) {
770:         apvaluestmp[jj] = apvalues[jj]*poa[j];
771:         /* If the row is empty */
772:         if (!c_rmtc[pocol]) {
773:           c_rmtjj[jj] = apindices[jj];
774:           c_rmtaa[jj] = apvaluestmp[jj];
775:           c_rmtc[pocol]++;
776:         } else {
777:           PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc);
778:           if (loc>=0){ /* hit */
779:             c_rmtaa[loc] += apvaluestmp[jj];
780:             PetscLogFlops(1.0);
781:           } else { /* new element */
782:             loc = -(loc+1);
783:             /* Shift existing entries to make room for the new element */
784:             for (kk=c_rmtc[pocol]; kk>loc; kk--) {
785:               c_rmtjj[kk] = c_rmtjj[kk-1];
786:               c_rmtaa[kk] = c_rmtaa[kk-1];
787:             }/* End kk */
788:             c_rmtjj[loc] = apindices[jj];
789:             c_rmtaa[loc] = apvaluestmp[jj];
790:             c_rmtc[pocol]++;
791:           }
792:         }
793:         PetscLogFlops(voff);
794:       } /* End jj */
795:     } /* End j */
796:   } /* End i */

798:   PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);
799:   PetscHMapIVDestroy(&hmap);

801:   MatGetLocalSize(P,NULL,&pn);
802:   pn *= dof;
803:   PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);

805:   PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);
806:   PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE);
807:   MatGetOwnershipRangeColumn(P,&pcstart,&pcend);
808:   pcstart = pcstart*dof;
809:   pcend   = pcend*dof;
810:   cd = (Mat_SeqAIJ*)(c->A)->data;
811:   co = (Mat_SeqAIJ*)(c->B)->data;

813:   cmaxr = 0;
814:   for (i=0; i<pn; i++) {
815:     cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
816:   }
817:   PetscCalloc5(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pn,&dcc,pn,&occ);
818:   PetscHMapIVCreate(&hmap);
819:   PetscHMapIVResize(hmap,cmaxr);
820:   for (i=0; i<am && pn; i++) {
821:     PetscHMapIVClear(hmap);
822:     offset = i%dof;
823:     ii     = i/dof;
824:     nzi = pd->i[ii+1] - pd->i[ii];
825:     if (!nzi) continue;
826:     MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);
827:     voff = 0;
828:     PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);
829:     if (!voff) continue;
830:     /* Form C(ii, :) */
831:     pdj = pd->j + pd->i[ii];
832:     pda = pd->a + pd->i[ii];
833:     for (j=0; j<nzi; j++) {
834:       row = pcstart + pdj[j] * dof + offset;
835:       for (jj=0; jj<voff; jj++) {
836:         apvaluestmp[jj] = apvalues[jj]*pda[j];
837:       }
838:       PetscLogFlops(voff);
839:       MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);
840:     }
841:   }
842:   ISRestoreIndices(map,&mappingindices);
843:   MatGetOwnershipRangeColumn(C,&ccstart,&ccend);
844:   PetscFree5(apindices,apvalues,apvaluestmp,dcc,occ);
845:   PetscHMapIVDestroy(&hmap);
846:   PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);
847:   PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE);
848:   PetscFree2(c_rmtj,c_rmta);

850:   /* Add contributions from remote */
851:   for (i = 0; i < pn; i++) {
852:     row = i + pcstart;
853:     MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);
854:   }
855:   PetscFree2(c_othj,c_otha);

857:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
858:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

860:   ptap->reuse = MAT_REUSE_MATRIX;
861:   return 0;
862: }

864: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,Mat C)
865: {

867:   MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A,P,1,C);
868:   return 0;
869: }

871: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,Mat C)
872: {
873:   Mat_MPIAIJ        *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
874:   Mat_SeqAIJ        *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
875:   Mat_APMPI         *ptap;
876:   PetscHMapIV       hmap;
877:   PetscInt          i,j,jj,kk,nzi,dnzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,loc;
878:   PetscScalar       *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
879:   PetscInt          offset,ii,pocol;
880:   const PetscInt    *mappingindices;
881:   IS                map;

883:   MatCheckProduct(C,4);
884:   ptap = (Mat_APMPI*)C->product->data;

888:   MatZeroEntries(C);

890:   /* Get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
891:   /*-----------------------------------------------------*/
892:   if (ptap->reuse == MAT_REUSE_MATRIX) {
893:     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
894:     MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth);
895:   }
896:   PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);
897:   MatGetLocalSize(p->B,NULL,&pon);
898:   pon *= dof;
899:   MatGetLocalSize(P,NULL,&pn);
900:   pn  *= dof;

902:   PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);
903:   MatGetLocalSize(A,&am,NULL);
904:   MatGetOwnershipRangeColumn(P,&pcstart,&pcend);
905:   pcstart *= dof;
906:   pcend   *= dof;
907:   cmaxr = 0;
908:   for (i=0; i<pon; i++) {
909:     cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
910:   }
911:   cd = (Mat_SeqAIJ*)(c->A)->data;
912:   co = (Mat_SeqAIJ*)(c->B)->data;
913:   for (i=0; i<pn; i++) {
914:     cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
915:   }
916:   PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);
917:   PetscHMapIVCreate(&hmap);
918:   PetscHMapIVResize(hmap,cmaxr);
919:   ISGetIndices(map,&mappingindices);
920:   for (i=0; i<am && (pon || pn); i++) {
921:     PetscHMapIVClear(hmap);
922:     offset = i%dof;
923:     ii     = i/dof;
924:     nzi  = po->i[ii+1] - po->i[ii];
925:     dnzi = pd->i[ii+1] - pd->i[ii];
926:     if (!nzi && !dnzi) continue;
927:     MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);
928:     voff = 0;
929:     PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);
930:     if (!voff) continue;

932:     /* Form remote C(ii, :) */
933:     poj = po->j + po->i[ii];
934:     poa = po->a + po->i[ii];
935:     for (j=0; j<nzi; j++) {
936:       pocol = poj[j]*dof+offset;
937:       c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
938:       c_rmtaa = c_rmta + ptap->c_rmti[pocol];
939:       for (jj=0; jj<voff; jj++) {
940:         apvaluestmp[jj] = apvalues[jj]*poa[j];
941:         /* If the row is empty */
942:         if (!c_rmtc[pocol]) {
943:           c_rmtjj[jj] = apindices[jj];
944:           c_rmtaa[jj] = apvaluestmp[jj];
945:           c_rmtc[pocol]++;
946:         } else {
947:           PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc);
948:           if (loc>=0){ /* hit */
949:             c_rmtaa[loc] += apvaluestmp[jj];
950:             PetscLogFlops(1.0);
951:           } else { /* new element */
952:             loc = -(loc+1);
953:             /* Shift existing entries to make room for the new element */
954:             for (kk=c_rmtc[pocol]; kk>loc; kk--) {
955:               c_rmtjj[kk] = c_rmtjj[kk-1];
956:               c_rmtaa[kk] = c_rmtaa[kk-1];
957:             }/* End kk */
958:             c_rmtjj[loc] = apindices[jj];
959:             c_rmtaa[loc] = apvaluestmp[jj];
960:             c_rmtc[pocol]++;
961:           }
962:         }
963:       } /* End jj */
964:       PetscLogFlops(voff);
965:     } /* End j */

967:     /* Form local C(ii, :) */
968:     pdj = pd->j + pd->i[ii];
969:     pda = pd->a + pd->i[ii];
970:     for (j=0; j<dnzi; j++) {
971:       row = pcstart + pdj[j] * dof + offset;
972:       for (jj=0; jj<voff; jj++) {
973:         apvaluestmp[jj] = apvalues[jj]*pda[j];
974:       }/* End kk */
975:       PetscLogFlops(voff);
976:       MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);
977:     }/* End j */
978:   } /* End i */

980:   ISRestoreIndices(map,&mappingindices);
981:   PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);
982:   PetscHMapIVDestroy(&hmap);
983:   PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);

985:   PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);
986:   PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE);
987:   PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);
988:   PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE);
989:   PetscFree2(c_rmtj,c_rmta);

991:   /* Add contributions from remote */
992:   for (i = 0; i < pn; i++) {
993:     row = i + pcstart;
994:     MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);
995:   }
996:   PetscFree2(c_othj,c_otha);

998:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
999:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

1001:   ptap->reuse = MAT_REUSE_MATRIX;
1002:   return 0;
1003: }

1005: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,Mat C)
1006: {

1008:   MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,C);
1009:   return 0;
1010: }

1012: /* TODO: move algorithm selection to MatProductSetFromOptions */
1013: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat Cmpi)
1014: {
1015:   Mat_APMPI           *ptap;
1016:   Mat_MPIAIJ          *p=(Mat_MPIAIJ*)P->data;
1017:   MPI_Comm            comm;
1018:   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
1019:   MatType             mtype;
1020:   PetscSF             sf;
1021:   PetscSFNode         *iremote;
1022:   PetscInt            rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
1023:   const PetscInt      *rootdegrees;
1024:   PetscHSetI          ht,oht,*hta,*hto;
1025:   PetscInt            pn,pon,*c_rmtc,i,j,nzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
1026:   PetscInt            lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
1027:   PetscInt            nalg=2,alg=0,offset,ii;
1028:   PetscMPIInt         owner;
1029:   const PetscInt      *mappingindices;
1030:   PetscBool           flg;
1031:   const char          *algTypes[2] = {"overlapping","merged"};
1032:   IS                  map;
1033:   PetscErrorCode      ierr;

1035:   MatCheckProduct(Cmpi,5);
1037:   PetscObjectGetComm((PetscObject)A,&comm);

1039:   /* Create symbolic parallel matrix Cmpi */
1040:   MatGetLocalSize(P,NULL,&pn);
1041:   pn *= dof;
1042:   MatGetType(A,&mtype);
1043:   MatSetType(Cmpi,mtype);
1044:   MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);

1046:   PetscNew(&ptap);
1047:   ptap->reuse = MAT_INITIAL_MATRIX;
1048:   ptap->algType = 2;

1050:   /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1051:   MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth);
1052:   PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);
1053:   /* This equals the number of off-diagonal columns of P */
1054:   MatGetLocalSize(p->B,NULL,&pon);
1055:   pon *= dof;
1056:   /* offsets */
1057:   PetscMalloc1(pon+1,&ptap->c_rmti);
1058:   /* The number of columns we will send to remote ranks */
1059:   PetscMalloc1(pon,&c_rmtc);
1060:   PetscMalloc1(pon,&hta);
1061:   for (i=0; i<pon; i++) {
1062:     PetscHSetICreate(&hta[i]);
1063:   }
1064:   MatGetLocalSize(A,&am,NULL);
1065:   MatGetOwnershipRange(A,&arstart,&arend);
1066:   /* Create hash table to merge all columns for C(i, :) */
1067:   PetscHSetICreate(&ht);

1069:   ISGetIndices(map,&mappingindices);
1070:   ptap->c_rmti[0] = 0;
1071:   /* 2) Pass 1: calculate the size of C_rmt (a matrix that needs to be sent to other processors) */
1072:   for (i=0; i<am && pon; i++) {
1073:     /* Form one row of AP */
1074:     PetscHSetIClear(ht);
1075:     offset = i%dof;
1076:     ii     = i/dof;
1077:     /* If the off-diagonal part is empty, no calculation is needed */
1078:     nzi = po->i[ii+1] - po->i[ii];
1079:     if (!nzi) continue;

1081:     MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,ht);
1082:     PetscHSetIGetSize(ht,&htsize);
1083:     /* If AP is empty, just continue */
1084:     if (!htsize) continue;
1085:     /* Form C(ii, :) */
1086:     poj = po->j + po->i[ii];
1087:     for (j=0; j<nzi; j++) {
1088:       PetscHSetIUpdate(hta[poj[j]*dof+offset],ht);
1089:     }
1090:   }

1092:   for (i=0; i<pon; i++) {
1093:     PetscHSetIGetSize(hta[i],&htsize);
1094:     ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
1095:     c_rmtc[i] = htsize;
1096:   }

1098:   PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);

1100:   for (i=0; i<pon; i++) {
1101:     off = 0;
1102:     PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);
1103:     PetscHSetIDestroy(&hta[i]);
1104:   }
1105:   PetscFree(hta);

1107:   PetscMalloc1(pon,&iremote);
1108:   for (i=0; i<pon; i++) {
1109:     owner = 0; lidx = 0;
1110:     offset = i%dof;
1111:     ii     = i/dof;
1112:     PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx);
1113:     iremote[i].index = lidx*dof + offset;
1114:     iremote[i].rank  = owner;
1115:   }

1117:   PetscSFCreate(comm,&sf);
1118:   PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
1119:   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1120:   PetscSFSetRankOrder(sf,PETSC_TRUE);
1121:   PetscSFSetFromOptions(sf);
1122:   PetscSFSetUp(sf);
1123:   /* How many neighbors have contributions to my rows? */
1124:   PetscSFComputeDegreeBegin(sf,&rootdegrees);
1125:   PetscSFComputeDegreeEnd(sf,&rootdegrees);
1126:   rootspacesize = 0;
1127:   for (i = 0; i < pn; i++) {
1128:     rootspacesize += rootdegrees[i];
1129:   }
1130:   PetscMalloc1(rootspacesize,&rootspace);
1131:   PetscMalloc1(rootspacesize+1,&rootspaceoffsets);
1132:   /* Get information from leaves:
1133:    * the number of columns other processes contribute to my rows
1134:    */
1135:   PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);
1136:   PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);
1137:   PetscFree(c_rmtc);
1138:   PetscCalloc1(pn+1,&ptap->c_othi);
1139:   /* The number of columns received for each row */
1140:   ptap->c_othi[0] = 0;
1141:   rootspacesize = 0;
1142:   rootspaceoffsets[0] = 0;
1143:   for (i = 0; i < pn; i++) {
1144:     rcvncols = 0;
1145:     for (j = 0; j<rootdegrees[i]; j++) {
1146:       rcvncols += rootspace[rootspacesize];
1147:       rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1148:       rootspacesize++;
1149:     }
1150:     ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
1151:   }
1152:   PetscFree(rootspace);

1154:   PetscMalloc1(pon,&c_rmtoffsets);
1155:   PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);
1156:   PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);
1157:   PetscSFDestroy(&sf);
1158:   PetscFree(rootspaceoffsets);

1160:   PetscCalloc1(ptap->c_rmti[pon],&iremote);
1161:   nleaves = 0;
1162:   for (i = 0; i<pon; i++) {
1163:     owner = 0;
1164:     ii = i/dof;
1165:     PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL);
1166:     sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
1167:     for (j=0; j<sendncols; j++) {
1168:       iremote[nleaves].rank = owner;
1169:       iremote[nleaves++].index = c_rmtoffsets[i] + j;
1170:     }
1171:   }
1172:   PetscFree(c_rmtoffsets);
1173:   PetscCalloc1(ptap->c_othi[pn],&c_othj);

1175:   PetscSFCreate(comm,&ptap->sf);
1176:   PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
1177:   PetscSFSetFromOptions(ptap->sf);
1178:   /* One to one map */
1179:   PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);

1181:   PetscMalloc2(pn,&dnz,pn,&onz);
1182:   PetscHSetICreate(&oht);
1183:   MatGetOwnershipRangeColumn(P,&pcstart,&pcend);
1184:   pcstart *= dof;
1185:   pcend   *= dof;
1186:   PetscMalloc2(pn,&hta,pn,&hto);
1187:   for (i=0; i<pn; i++) {
1188:     PetscHSetICreate(&hta[i]);
1189:     PetscHSetICreate(&hto[i]);
1190:   }
1191:   /* Work on local part */
1192:   /* 4) Pass 1: Estimate memory for C_loc */
1193:   for (i=0; i<am && pn; i++) {
1194:     PetscHSetIClear(ht);
1195:     PetscHSetIClear(oht);
1196:     offset = i%dof;
1197:     ii     = i/dof;
1198:     nzi = pd->i[ii+1] - pd->i[ii];
1199:     if (!nzi) continue;

1201:     MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht);
1202:     PetscHSetIGetSize(ht,&htsize);
1203:     PetscHSetIGetSize(oht,&htosize);
1204:     if (!(htsize+htosize)) continue;
1205:     /* Form C(ii, :) */
1206:     pdj = pd->j + pd->i[ii];
1207:     for (j=0; j<nzi; j++) {
1208:       PetscHSetIUpdate(hta[pdj[j]*dof+offset],ht);
1209:       PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht);
1210:     }
1211:   }

1213:   ISRestoreIndices(map,&mappingindices);

1215:   PetscHSetIDestroy(&ht);
1216:   PetscHSetIDestroy(&oht);

1218:   /* Get remote data */
1219:   PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);
1220:   PetscFree(c_rmtj);

1222:   for (i = 0; i < pn; i++) {
1223:     nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
1224:     rdj = c_othj + ptap->c_othi[i];
1225:     for (j = 0; j < nzi; j++) {
1226:       col = rdj[j];
1227:       /* diag part */
1228:       if (col>=pcstart && col<pcend) {
1229:         PetscHSetIAdd(hta[i],col);
1230:       } else { /* off diag */
1231:         PetscHSetIAdd(hto[i],col);
1232:       }
1233:     }
1234:     PetscHSetIGetSize(hta[i],&htsize);
1235:     dnz[i] = htsize;
1236:     PetscHSetIDestroy(&hta[i]);
1237:     PetscHSetIGetSize(hto[i],&htsize);
1238:     onz[i] = htsize;
1239:     PetscHSetIDestroy(&hto[i]);
1240:   }

1242:   PetscFree2(hta,hto);
1243:   PetscFree(c_othj);

1245:   /* local sizes and preallocation */
1246:   MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1247:   MatSetBlockSizes(Cmpi,dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs);
1248:   MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1249:   MatSetUp(Cmpi);
1250:   PetscFree2(dnz,onz);

1252:   /* attach the supporting struct to Cmpi for reuse */
1253:   Cmpi->product->data    = ptap;
1254:   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1255:   Cmpi->product->view    = MatView_MPIAIJ_PtAP;

1257:   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1258:   Cmpi->assembled = PETSC_FALSE;
1259:   /* pick an algorithm */
1260:   PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");
1261:   alg = 0;
1262:   PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);
1263:   PetscOptionsEnd();
1264:   switch (alg) {
1265:     case 0:
1266:       Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1267:       break;
1268:     case 1:
1269:       Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1270:       break;
1271:     default:
1272:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG," Unsupported allatonce numerical algorithm ");
1273:   }
1274:   return 0;
1275: }
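
/*
   The numeric kernel chosen by the switch above can be selected at runtime through the
   option registered a few lines earlier, e.g. (illustrative command-line values)
     -matptap_allatonce_via overlapping   -> MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce (default here)
     -matptap_allatonce_via merged        -> MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged
*/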

1277: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,PetscReal fill,Mat C)
1278: {
1279:   MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A,P,1,fill,C);
1280:   return 0;
1281: }

1283: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat Cmpi)
1284: {
1285:   Mat_APMPI           *ptap;
1286:   Mat_MPIAIJ          *p=(Mat_MPIAIJ*)P->data;
1287:   MPI_Comm            comm;
1288:   Mat_SeqAIJ          *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
1289:   MatType             mtype;
1290:   PetscSF             sf;
1291:   PetscSFNode         *iremote;
1292:   PetscInt            rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
1293:   const PetscInt      *rootdegrees;
1294:   PetscHSetI          ht,oht,*hta,*hto,*htd;
1295:   PetscInt            pn,pon,*c_rmtc,i,j,nzi,dnzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
1296:   PetscInt            lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
1297:   PetscInt            nalg=2,alg=0,offset,ii;
1298:   PetscMPIInt         owner;
1299:   PetscBool           flg;
1300:   const char          *algTypes[2] = {"merged","overlapping"};
1301:   const PetscInt      *mappingindices;
1302:   IS                  map;
1303:   PetscErrorCode      ierr;

1305:   MatCheckProduct(Cmpi,5);
1307:   PetscObjectGetComm((PetscObject)A,&comm);

1309:   /* Create symbolic parallel matrix Cmpi */
1310:   MatGetLocalSize(P,NULL,&pn);
1311:   pn *= dof;
1312:   MatGetType(A,&mtype);
1313:   MatSetType(Cmpi,mtype);
1314:   MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);

1316:   PetscNew(&ptap);
1317:   ptap->reuse = MAT_INITIAL_MATRIX;
1318:   ptap->algType = 3;

1320:   /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1321:   MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth);
1322:   PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);

1324:   /* This equals the number of off-diagonal columns of P */
1325:   MatGetLocalSize(p->B,NULL,&pon);
1326:   pon *= dof;
1327:   /* offsets */
1328:   PetscMalloc1(pon+1,&ptap->c_rmti);
1329:   /* The number of columns we will send to remote ranks */
1330:   PetscMalloc1(pon,&c_rmtc);
1331:   PetscMalloc1(pon,&hta);
1332:   for (i=0; i<pon; i++) {
1333:     PetscHSetICreate(&hta[i]);
1334:   }
1335:   MatGetLocalSize(A,&am,NULL);
1336:   MatGetOwnershipRange(A,&arstart,&arend);
1337:   /* Create hash table to merge all columns for C(i, :) */
1338:   PetscHSetICreate(&ht);
1339:   PetscHSetICreate(&oht);
1340:   PetscMalloc2(pn,&htd,pn,&hto);
1341:   for (i=0; i<pn; i++) {
1342:     PetscHSetICreate(&htd[i]);
1343:     PetscHSetICreate(&hto[i]);
1344:   }

1346:   ISGetIndices(map,&mappingindices);
1347:   ptap->c_rmti[0] = 0;
1348:   /* 2) Pass 1: calculate the size of C_rmt (a matrix that needs to be sent to other processors) */
1349:   for (i=0; i<am && (pon || pn); i++) {
1350:     /* Form one row of AP */
1351:     PetscHSetIClear(ht);
1352:     PetscHSetIClear(oht);
1353:     offset = i%dof;
1354:     ii     = i/dof;
1355:     /* If the off-diagonal part is empty, no calculation is needed */
1356:     nzi = po->i[ii+1] - po->i[ii];
1357:     dnzi = pd->i[ii+1] - pd->i[ii];
1358:     if (!nzi && !dnzi) continue;

1360:     MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht);
1361:     PetscHSetIGetSize(ht,&htsize);
1362:     PetscHSetIGetSize(oht,&htosize);
1363:     /* If AP is empty, just continue */
1364:     if (!(htsize+htosize)) continue;

1366:     /* Form remote C(ii, :) */
1367:     poj = po->j + po->i[ii];
1368:     for (j=0; j<nzi; j++) {
1369:       PetscHSetIUpdate(hta[poj[j]*dof+offset],ht);
1370:       PetscHSetIUpdate(hta[poj[j]*dof+offset],oht);
1371:     }

1373:     /* Form local C(ii, :) */
1374:     pdj = pd->j + pd->i[ii];
1375:     for (j=0; j<dnzi; j++) {
1376:       PetscHSetIUpdate(htd[pdj[j]*dof+offset],ht);
1377:       PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht);
1378:     }
1379:   }

1381:   ISRestoreIndices(map,&mappingindices);

1383:   PetscHSetIDestroy(&ht);
1384:   PetscHSetIDestroy(&oht);

1386:   for (i=0; i<pon; i++) {
1387:     PetscHSetIGetSize(hta[i],&htsize);
1388:     ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
1389:     c_rmtc[i] = htsize;
1390:   }

1392:   PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);

1394:   for (i=0; i<pon; i++) {
1395:     off = 0;
1396:     PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);
1397:     PetscHSetIDestroy(&hta[i]);
1398:   }
1399:   PetscFree(hta);

1401:   PetscMalloc1(pon,&iremote);
1402:   for (i=0; i<pon; i++) {
1403:     owner = 0; lidx = 0;
1404:     offset = i%dof;
1405:     ii     = i/dof;
1406:     PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx);
1407:     iremote[i].index = lidx*dof+offset;
1408:     iremote[i].rank  = owner;
1409:   }

1411:   PetscSFCreate(comm,&sf);
1412:   PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
1413:   /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1414:   PetscSFSetRankOrder(sf,PETSC_TRUE);
1415:   PetscSFSetFromOptions(sf);
1416:   PetscSFSetUp(sf);
1417:   /* How many neighbors have contributions to my rows? */
1418:   PetscSFComputeDegreeBegin(sf,&rootdegrees);
1419:   PetscSFComputeDegreeEnd(sf,&rootdegrees);
1420:   rootspacesize = 0;
1421:   for (i = 0; i < pn; i++) {
1422:     rootspacesize += rootdegrees[i];
1423:   }
1424:   PetscMalloc1(rootspacesize,&rootspace);
1425:   PetscMalloc1(rootspacesize+1,&rootspaceoffsets);
1426:   /* Get information from leaves:
1427:    * the number of columns other processes contribute to my rows
1428:    */
1429:   PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);
1430:   PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);
1431:   PetscFree(c_rmtc);
1432:   PetscMalloc1(pn+1,&ptap->c_othi);
1433:   /* The number of columns received for each row */
1434:   ptap->c_othi[0]     = 0;
1435:   rootspacesize       = 0;
1436:   rootspaceoffsets[0] = 0;
1437:   for (i = 0; i < pn; i++) {
1438:     rcvncols = 0;
1439:     for (j = 0; j<rootdegrees[i]; j++) {
1440:       rcvncols += rootspace[rootspacesize];
1441:       rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1442:       rootspacesize++;
1443:     }
1444:     ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
1445:   }
1446:   PetscFree(rootspace);

1448:   PetscMalloc1(pon,&c_rmtoffsets);
1449:   PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);
1450:   PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);
1451:   PetscSFDestroy(&sf);
1452:   PetscFree(rootspaceoffsets);

1454:   PetscCalloc1(ptap->c_rmti[pon],&iremote);
1455:   nleaves = 0;
1456:   for (i = 0; i<pon; i++) {
1457:     owner = 0;
1458:     ii    = i/dof;
1459:     PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL);
1460:     sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
1461:     for (j=0; j<sendncols; j++) {
1462:       iremote[nleaves].rank    = owner;
1463:       iremote[nleaves++].index = c_rmtoffsets[i] + j;
1464:     }
1465:   }
1466:   PetscFree(c_rmtoffsets);
1467:   PetscCalloc1(ptap->c_othi[pn],&c_othj);

1469:   PetscSFCreate(comm,&ptap->sf);
1470:   PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
1471:   PetscSFSetFromOptions(ptap->sf);
1472:   /* One to one map */
1473:   PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);
1474:   /* Get remote data */
1475:   PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);
1476:   PetscFree(c_rmtj);
1477:   PetscMalloc2(pn,&dnz,pn,&onz);
1478:   MatGetOwnershipRangeColumn(P,&pcstart,&pcend);
1479:   pcstart *= dof;
1480:   pcend   *= dof;
1481:   for (i = 0; i < pn; i++) {
1482:     nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
1483:     rdj = c_othj + ptap->c_othi[i];
1484:     for (j = 0; j < nzi; j++) {
1485:       col =  rdj[j];
1486:       /* diag part */
1487:       if (col>=pcstart && col<pcend) {
1488:         PetscHSetIAdd(htd[i],col);
1489:       } else { /* off diag */
1490:         PetscHSetIAdd(hto[i],col);
1491:       }
1492:     }
1493:     PetscHSetIGetSize(htd[i],&htsize);
1494:     dnz[i] = htsize;
1495:     PetscHSetIDestroy(&htd[i]);
1496:     PetscHSetIGetSize(hto[i],&htsize);
1497:     onz[i] = htsize;
1498:     PetscHSetIDestroy(&hto[i]);
1499:   }
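  /* dnz[i]/onz[i] now count the distinct received columns of row i that fall inside/outside the
     local diagonal block [pcstart,pcend); the hash sets ensure a column reported by several
     processes is counted only once.  Illustrative values: with pcstart=4, pcend=8 and received
     columns {1,5,5,9}, the row gets dnz=1 (column 5) and onz=2 (columns 1 and 9), assuming the
     sets held nothing else. */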

1501:   PetscFree2(htd,hto);
1502:   PetscFree(c_othj);

1504:   /* local sizes and preallocation */
1505:   MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1506:   MatSetBlockSizes(Cmpi, dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs);
1507:   MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1508:   PetscFree2(dnz,onz);
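  /* Since the per-row arrays dnz/onz are supplied, the scalar d_nz/o_nz arguments (the two zeros
     above) are ignored by MatMPIAIJSetPreallocation(). */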

1510:   /* attach the supporting struct to Cmpi for reuse */
1511:   Cmpi->product->data    = ptap;
1512:   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1513:   Cmpi->product->view    = MatView_MPIAIJ_PtAP;

1515:   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1516:   Cmpi->assembled = PETSC_FALSE;
1517:   /* pick an algorithm */
1518:   PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");
1519:   alg = 0;
1520:   PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);
1521:   PetscOptionsEnd();
1522:   switch (alg) {
1523:     case 0:
1524:       Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1525:       break;
1526:     case 1:
1527:       Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1528:       break;
1529:     default:
1530:       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Unsupported allatonce numerical algorithm");
1531:   }
1532:   return 0;
1533: }
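/* The numeric kernel selected by the switch above can be chosen at run time through the option
   parsed by PetscOptionsEList(); the valid names are the entries of the algTypes[] array defined
   earlier in this file.  For example (assuming A carries no options prefix), either on the
   command line

     -matptap_allatonce_via <algorithm_name>

   or programmatically before the product is set up:

     PetscOptionsSetValue(NULL,"-matptap_allatonce_via","<algorithm_name>");
*/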

1535: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,PetscReal fill,Mat C)
1536: {
1537:   MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,fill,C);
1538:   return 0;
1539: }

1541: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat Cmpi)
1542: {
1544:   Mat_APMPI           *ptap;
1545:   Mat_MPIAIJ          *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
1546:   MPI_Comm            comm;
1547:   PetscMPIInt         size,rank;
1548:   PetscFreeSpaceList  free_space=NULL,current_space=NULL;
1549:   PetscInt            am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
1550:   PetscInt            *lnk,i,k,pnz,row,nsend;
1551:   PetscBT             lnkbt;
1552:   PetscMPIInt         tagi,tagj,*len_si,*len_s,*len_ri,nrecv;
1553:   PETSC_UNUSED PetscMPIInt icompleted=0;
1554:   PetscInt            **buf_rj,**buf_ri,**buf_ri_k;
1555:   PetscInt            len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
1556:   PetscInt            nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1557:   MPI_Request         *swaits,*rwaits;
1558:   MPI_Status          *sstatus,rstatus;
1559:   PetscLayout         rowmap;
1560:   PetscInt            *owners_co,*coi,*coj;    /* i and j array of (p->B)^T*A*P - used in the communication */
1561:   PetscMPIInt         *len_r,*id_r;    /* lengths and source ranks of the messages to be received */
1562:   PetscInt            *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi;
1563:   Mat_SeqAIJ          *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
1564:   PetscScalar         *apv;
1565:   PetscTable          ta;
1566:   MatType             mtype;
1567:   const char          *prefix;
1568: #if defined(PETSC_USE_INFO)
1569:   PetscReal           apfill;
1570: #endif

1572:   MatCheckProduct(Cmpi,4);
1574:   PetscObjectGetComm((PetscObject)A,&comm);
1575:   MPI_Comm_size(comm,&size);
1576:   MPI_Comm_rank(comm,&rank);

1578:   if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;

1580:   /* create symbolic parallel matrix Cmpi */
1581:   MatGetType(A,&mtype);
1582:   MatSetType(Cmpi,mtype);

1584:   /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
1585:   Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;

1587:   /* create struct Mat_APMPI and attach it to Cmpi later */
1588:   PetscNew(&ptap);
1589:   ptap->reuse = MAT_INITIAL_MATRIX;
1590:   ptap->algType = 1;

1592:   /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1593:   MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
1594:   /* get P_loc by taking all local rows of P */
1595:   MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);

1597:   /* (0) compute Rd = Pd^T, Ro = Po^T  */
1598:   /* --------------------------------- */
1599:   MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
1600:   MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);

1602:   /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
1603:   /* ----------------------------------------------------------------- */
1604:   p_loc  = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1605:   if (ptap->P_oth) p_oth  = (Mat_SeqAIJ*)(ptap->P_oth)->data;

1607:   /* create and initialize a linked list */
1608:   PetscTableCreate(pn,pN,&ta); /* for computing AP_loc and Cmpi */
1609:   MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
1610:   MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
1611:   PetscTableGetCount(ta,&Crmax); /* Crmax = nnz(sum of Prows) */
1612:   /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */

1614:   PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);
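  /* lnk is a condensed, sorted linked list over the global column space [0,pN), sized by Crmax
     (an upper bound on the length of any merged row).  It merges the sorted column indices of
     several P rows into one sorted AP row; lnk[0] always holds the current number of entries,
     which is read back below as apnz. */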

1616:   /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
1617:   if (ao) {
1618:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);
1619:   } else {
1620:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);
1621:   }
1622:   current_space = free_space;
1623:   nspacedouble  = 0;

1625:   PetscMalloc1(am+1,&api);
1626:   api[0] = 0;
1627:   for (i=0; i<am; i++) {
1628:     /* diagonal portion: Ad[i,:]*P */
1629:     ai = ad->i; pi = p_loc->i;
1630:     nzi = ai[i+1] - ai[i];
1631:     aj  = ad->j + ai[i];
1632:     for (j=0; j<nzi; j++) {
1633:       row  = aj[j];
1634:       pnz  = pi[row+1] - pi[row];
1635:       Jptr = p_loc->j + pi[row];
1636:       /* add non-zero cols of P into the sorted linked list lnk */
1637:       PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
1638:     }
1639:     /* off-diagonal portion: Ao[i,:]*P */
1640:     if (ao) {
1641:       ai = ao->i; pi = p_oth->i;
1642:       nzi = ai[i+1] - ai[i];
1643:       aj  = ao->j + ai[i];
1644:       for (j=0; j<nzi; j++) {
1645:         row  = aj[j];
1646:         pnz  = pi[row+1] - pi[row];
1647:         Jptr = p_oth->j + pi[row];
1648:         PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
1649:       }
1650:     }
1651:     apnz     = lnk[0];
1652:     api[i+1] = api[i] + apnz;
1653:     if (ap_rmax < apnz) ap_rmax = apnz;

1655:     /* if free space is not available, double the total space in the list */
1656:     if (current_space->local_remaining<apnz) {
1657:       PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
1658:       nspacedouble++;
1659:     }

1661:     /* Copy data into free space, then initialize lnk */
1662:     PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);

1664:     current_space->array           += apnz;
1665:     current_space->local_used      += apnz;
1666:     current_space->local_remaining -= apnz;
1667:   }
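  /* The loop above builds only the nonzero structure of AP_loc = A_loc*P.  As an illustration
     with made-up indices: if row i of Ad has columns {0,3}, row 0 of P_loc has columns {1,4} and
     row 3 has columns {2,4}, then row i of AP gets the merged pattern {1,2,4} and apnz = 3. */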
1668:   /* Allocate space for apj and apv, initialize apj, and */
1669:   /* destroy list of free space and other temporary array(s) */
1670:   PetscMalloc2(api[am],&apj,api[am],&apv);
1671:   PetscFreeSpaceContiguous(&free_space,apj);
1672:   PetscLLDestroy(lnk,lnkbt);

1674:   /* Create AP_loc for reuse */
1675:   MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);
1676:   MatSetType(ptap->AP_loc,((PetscObject)p->A)->type_name);
1677: #if defined(PETSC_USE_INFO)
1678:   if (ao) {
1679:     apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
1680:   } else {
1681:     apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
1682:   }
1683:   ptap->AP_loc->info.mallocs           = nspacedouble;
1684:   ptap->AP_loc->info.fill_ratio_given  = fill;
1685:   ptap->AP_loc->info.fill_ratio_needed = apfill;

1687:   if (api[am]) {
1688:     PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);
1689:     PetscInfo(ptap->AP_loc,"Use MatPtAP(A,P,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);
1690:   } else {
1691:     PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty\n");
1692:   }
1693: #endif
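  /* apfill is (roughly) nnz(AP_loc)/(nnz(A_loc)+nnz(P_loc)), the fill ratio that was actually
     needed; passing it as the 'fill' argument of a later symbolic call avoids the nspacedouble
     reallocations reported above. */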

1695:   /* (2-1) compute symbolic Co = Ro*AP_loc  */
1696:   /* ------------------------------------ */
1697:   MatGetOptionsPrefix(A,&prefix);
1698:   MatSetOptionsPrefix(ptap->Ro,prefix);
1699:   MatAppendOptionsPrefix(ptap->Ro,"inner_offdiag_");
1700:   MatProductCreate(ptap->Ro,ptap->AP_loc,NULL,&ptap->C_oth);
1701:   MatGetOptionsPrefix(Cmpi,&prefix);
1702:   MatSetOptionsPrefix(ptap->C_oth,prefix);
1703:   MatAppendOptionsPrefix(ptap->C_oth,"inner_C_oth_");
1704:   MatProductSetType(ptap->C_oth,MATPRODUCT_AB);
1705:   MatProductSetAlgorithm(ptap->C_oth,"default");
1706:   MatProductSetFill(ptap->C_oth,fill);
1707:   MatProductSetFromOptions(ptap->C_oth);
1708:   MatProductSymbolic(ptap->C_oth);

1710:   /* (3) send coj of C_oth to other processors  */
1711:   /* ------------------------------------------ */
1712:   /* determine row ownership */
1713:   PetscLayoutCreate(comm,&rowmap);
1714:   rowmap->n  = pn;
1715:   rowmap->bs = 1;
1716:   PetscLayoutSetUp(rowmap);
1717:   owners = rowmap->range;

1719:   /* determine the number of messages to send, their lengths */
1720:   PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
1721:   PetscArrayzero(len_s,size);
1722:   PetscArrayzero(len_si,size);

1724:   c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
1725:   coi   = c_oth->i; coj = c_oth->j;
1726:   con   = ptap->C_oth->rmap->n;
1727:   proc  = 0;
1728:   for (i=0; i<con; i++) {
1729:     while (prmap[i] >= owners[proc+1]) proc++;
1730:     len_si[proc]++;               /* num of rows in Co(=Pt*AP) to be sent to [proc] */
1731:     len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
1732:   }

1734:   len          = 0; /* max length of buf_si[], see (4) */
1735:   owners_co[0] = 0;
1736:   nsend        = 0;
1737:   for (proc=0; proc<size; proc++) {
1738:     owners_co[proc+1] = owners_co[proc] + len_si[proc];
1739:     if (len_s[proc]) {
1740:       nsend++;
1741:       len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
1742:       len         += len_si[proc];
1743:     }
1744:   }
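  /* Each destination process receives two messages: the column indices of its part of Co (coj,
     len_s[proc] entries) and an i-structure describing them.  The i-structure carries
     1 + nrows + (nrows+1) = 2*(nrows+1) integers (row count, local row indices, row offsets),
     which is exactly the len_si[proc] computed above. */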

1746:   /* determine the number and length of messages to receive for coi and coj  */
1747:   PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
1748:   PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);

1750:   /* post the Irecv and Isend of coj */
1751:   PetscCommGetNewTag(comm,&tagj);
1752:   PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
1753:   PetscMalloc1(nsend+1,&swaits);
1754:   for (proc=0, k=0; proc<size; proc++) {
1755:     if (!len_s[proc]) continue;
1756:     i    = owners_co[proc];
1757:     MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
1758:     k++;
1759:   }

1761:   /* (2-2) compute symbolic C_loc = Rd*AP_loc */
1762:   /* ---------------------------------------- */
1763:   MatSetOptionsPrefix(ptap->Rd,prefix);
1764:   MatAppendOptionsPrefix(ptap->Rd,"inner_diag_");
1765:   MatProductCreate(ptap->Rd,ptap->AP_loc,NULL,&ptap->C_loc);
1766:   MatGetOptionsPrefix(Cmpi,&prefix);
1767:   MatSetOptionsPrefix(ptap->C_loc,prefix);
1768:   MatAppendOptionsPrefix(ptap->C_loc,"inner_C_loc_");
1769:   MatProductSetType(ptap->C_loc,MATPRODUCT_AB);
1770:   MatProductSetAlgorithm(ptap->C_loc,"default");
1771:   MatProductSetFill(ptap->C_loc,fill);
1772:   MatProductSetFromOptions(ptap->C_loc);
1773:   MatProductSymbolic(ptap->C_loc);

1775:   c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;

1777:   /* wait until the receives of coj are complete */
1778:   for (i=0; i<nrecv; i++) {
1779:     MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
1780:   }
1781:   PetscFree(rwaits);
1782:   if (nsend) MPI_Waitall(nsend,swaits,sstatus);

1784:   /* add received column indices into ta to update Crmax */
1785:   for (k=0; k<nrecv; k++) {/* k-th received message */
1786:     Jptr = buf_rj[k];
1787:     for (j=0; j<len_r[k]; j++) {
1788:       PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
1789:     }
1790:   }
1791:   PetscTableGetCount(ta,&Crmax);
1792:   PetscTableDestroy(&ta);

1794:   /* (4) send and recv coi */
1795:   /*-----------------------*/
1796:   PetscCommGetNewTag(comm,&tagi);
1797:   PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
1798:   PetscMalloc1(len+1,&buf_s);
1799:   buf_si = buf_s;  /* points to the beginning of k-th msg to be sent */
1800:   for (proc=0,k=0; proc<size; proc++) {
1801:     if (!len_s[proc]) continue;
1802:     /* form outgoing message for i-structure:
1803:          buf_si[0]:                 nrows to be sent
1804:                [1:nrows]:           row indices (local to the receiving process)
1805:                [nrows+1:2*nrows+1]: i-structure (running offsets into the coj message)
1806:     */
1807:     /*-------------------------------------------*/
1808:     nrows       = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
1809:     buf_si_i    = buf_si + nrows+1;
1810:     buf_si[0]   = nrows;
1811:     buf_si_i[0] = 0;
1812:     nrows       = 0;
1813:     for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1814:       nzi = coi[i+1] - coi[i];
1815:       buf_si_i[nrows+1] = buf_si_i[nrows] + nzi;  /* i-structure */
1816:       buf_si[nrows+1]   = prmap[i] -owners[proc]; /* local row index */
1817:       nrows++;
1818:     }
1819:     MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
1820:     k++;
1821:     buf_si += len_si[proc];
1822:   }
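  /* Illustrative layout of one packed buf_si message (made-up numbers): sending nrows = 2 rows
     with local row indices 5 and 9 holding 3 and 2 nonzeros gives buf_si = {2, 5, 9, 0, 3, 5},
     i.e. the count, the two row indices, then the three running offsets. */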
1823:   for (i=0; i<nrecv; i++) {
1824:     MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
1825:   }
1826:   PetscFree(rwaits);
1827:   if (nsend) MPI_Waitall(nsend,swaits,sstatus);

1829:   PetscFree4(len_s,len_si,sstatus,owners_co);
1830:   PetscFree(len_ri);
1831:   PetscFree(swaits);
1832:   PetscFree(buf_s);

1834:   /* (5) compute the local portion of Cmpi      */
1835:   /* ------------------------------------------ */
1836:   /* set initial free space to be Crmax, sufficient for holding the nonzeros in each row of Cmpi */
1837:   PetscFreeSpaceGet(Crmax,&free_space);
1838:   current_space = free_space;

1840:   PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
1841:   for (k=0; k<nrecv; k++) {
1842:     buf_ri_k[k] = buf_ri[k]; /* beginning of k-th received i-structure */
1843:     nrows       = *buf_ri_k[k];
1844:     nextrow[k]  = buf_ri_k[k] + 1;  /* next row number of k-th received i-structure */
1845:     nextci[k]   = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th received i-structure */
1846:   }

1848:   MatPreallocateInitialize(comm,pn,pn,dnz,onz);
1849:   PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);
1850:   for (i=0; i<pn; i++) {
1851:     /* add C_loc into Cmpi */
1852:     nzi  = c_loc->i[i+1] - c_loc->i[i];
1853:     Jptr = c_loc->j + c_loc->i[i];
1854:     PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);

1856:     /* add received col data into lnk */
1857:     for (k=0; k<nrecv; k++) { /* k-th received message */
1858:       if (i == *nextrow[k]) { /* i-th row */
1859:         nzi  = *(nextci[k]+1) - *nextci[k];
1860:         Jptr = buf_rj[k] + *nextci[k];
1861:         PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
1862:         nextrow[k]++; nextci[k]++;
1863:       }
1864:     }
1865:     nzi = lnk[0];

1867:     /* copy data into free space, then initialize lnk */
1868:     PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt);
1869:     MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
1870:   }
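  /* For each locally owned row of Cmpi the loop merges the corresponding row of C_loc (computed
     locally) with the matching rows received from other processes (pieces of their C_oth);
     MatPreallocateSet() then splits the merged column set into diagonal-block (dnz) and
     off-diagonal-block (onz) counts. */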
1871:   PetscFree3(buf_ri_k,nextrow,nextci);
1872:   PetscLLDestroy(lnk,lnkbt);
1873:   PetscFreeSpaceDestroy(free_space);

1875:   /* local sizes and preallocation */
1876:   MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1877:   if (P->cmap->bs > 0) {
1878:     PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs);
1879:     PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs);
1880:   }
1881:   MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1882:   MatPreallocateFinalize(dnz,onz);

1884:   /* free the intermediate communication arrays */
1885:   PetscFree(id_r);
1886:   PetscFree(len_r);
1887:   PetscFree(buf_ri[0]);
1888:   PetscFree(buf_ri);
1889:   PetscFree(buf_rj[0]);
1890:   PetscFree(buf_rj);
1891:   PetscLayoutDestroy(&rowmap);

1893:   PetscCalloc1(pN,&ptap->apa);

1895:   /* attach the supporting struct to Cmpi for reuse */
1896:   Cmpi->product->data    = ptap;
1897:   Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1898:   Cmpi->product->view    = MatView_MPIAIJ_PtAP;

1900:   /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1901:   Cmpi->assembled = PETSC_FALSE;
1902:   return 0;
1903: }

1905: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
1906: {
1907:   Mat_MPIAIJ        *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
1908:   Mat_SeqAIJ        *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
1909:   Mat_SeqAIJ        *ap,*p_loc,*p_oth=NULL,*c_seq;
1910:   Mat_APMPI         *ptap;
1911:   Mat               AP_loc,C_loc,C_oth;
1912:   PetscInt          i,rstart,rend,cm,ncols,row;
1913:   PetscInt          *api,*apj,am = A->rmap->n,j,col,apnz;
1914:   PetscScalar       *apa;
1915:   const PetscInt    *cols;
1916:   const PetscScalar *vals;

1918:   MatCheckProduct(C,3);
1919:   ptap = (Mat_APMPI*)C->product->data;

1923:   MatZeroEntries(C);
1924:   /* 1) get Rd = Pd^T, Ro = Po^T */
1925:   if (ptap->reuse == MAT_REUSE_MATRIX) {
1926:     MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
1927:     MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
1928:   }

1930:   /* 2) get AP_loc */
1931:   AP_loc = ptap->AP_loc;
1932:   ap = (Mat_SeqAIJ*)AP_loc->data;

1934:   /* 2-1) get P_oth = ptap->P_oth  and P_loc = ptap->P_loc */
1935:   /*-----------------------------------------------------*/
1936:   if (ptap->reuse == MAT_REUSE_MATRIX) {
1937:     /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
1938:     MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
1939:     MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
1940:   }

1942:   /* 2-2) compute numeric A_loc*P - dominating part */
1943:   /* ---------------------------------------------- */
1944:   /* get data from symbolic products */
1945:   p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1946:   if (ptap->P_oth) {
1947:     p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1948:   }
1949:   apa   = ptap->apa;
1950:   api   = ap->i;
1951:   apj   = ap->j;
1952:   for (i=0; i<am; i++) {
1953:     /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
1954:     AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
1955:     apnz = api[i+1] - api[i];
1956:     for (j=0; j<apnz; j++) {
1957:       col = apj[j+api[i]];
1958:       ap->a[j+ap->i[i]] = apa[col];
1959:       apa[col] = 0.0;
1960:     }
1961:   }
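  /* apa is a dense work array of length pN (the global number of columns of P): AProw_nonscalable()
     accumulates each row product into it, and the loop above compresses it back out through the
     symbolic pattern apj, zeroing the touched entries for the next row.  This O(pN)-per-process
     work array is what makes this variant the "nonscalable" algorithm. */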
1962:   /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. */
1963:   PetscObjectStateIncrease((PetscObject)AP_loc);

1965:   /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
1966:   MatProductNumeric(ptap->C_loc);
1967:   MatProductNumeric(ptap->C_oth);
1968:   C_loc = ptap->C_loc;
1969:   C_oth = ptap->C_oth;

1971:   /* add C_loc and C_oth to C */
1972:   MatGetOwnershipRange(C,&rstart,&rend);

1974:   /* C_loc -> C */
1975:   cm    = C_loc->rmap->N;
1976:   c_seq = (Mat_SeqAIJ*)C_loc->data;
1977:   cols = c_seq->j;
1978:   vals = c_seq->a;

1980:   /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
1981:   /* when there are no off-processor parts. */
1982:   /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
1983:   /* is no longer valid. In that case the slower, more general MatSetValues_MPIAIJ() must be used, which looks up each column */
1984:   /* index in a table and performs the additional bookkeeping that requires. */
1985:   if (C->assembled) {
1986:     C->was_assembled = PETSC_TRUE;
1987:     C->assembled     = PETSC_FALSE;
1988:   }
1989:   if (C->was_assembled) {
1990:     for (i=0; i<cm; i++) {
1991:       ncols = c_seq->i[i+1] - c_seq->i[i];
1992:       row = rstart + i;
1993:       MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);
1994:       cols += ncols; vals += ncols;
1995:     }
1996:   } else {
1997:     MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);
1998:   }

2000:   /* Co -> C, off-processor part */
2001:   cm = C_oth->rmap->N;
2002:   c_seq = (Mat_SeqAIJ*)C_oth->data;
2003:   cols = c_seq->j;
2004:   vals = c_seq->a;
2005:   for (i=0; i<cm; i++) {
2006:     ncols = c_seq->i[i+1] - c_seq->i[i];
2007:     row = p->garray[i];
2008:     MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
2009:     cols += ncols; vals += ncols;
2010:   }

2012:   MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
2013:   MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);

2015:   ptap->reuse = MAT_REUSE_MATRIX;
2016:   return 0;
2017: }
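A minimal usage sketch (not part of the source file; the helper name ComputePtAPTwice is made up
for illustration): the symbolic and numeric routines above are normally reached through MatPtAP(),
and a second call with MAT_REUSE_MATRIX reuses the Mat_APMPI data stashed in C->product->data by
the symbolic phase.

  #include <petscmat.h>

  /* Compute C = P^T*A*P, then recompute it after A's values change (same nonzero pattern).
     fill is the caller's estimate of nnz(C)/(nnz(A)+nnz(P)). */
  static PetscErrorCode ComputePtAPTwice(Mat A,Mat P,PetscReal fill,Mat *C)
  {
    MatPtAP(A,P,MAT_INITIAL_MATRIX,fill,C); /* symbolic + numeric */
    /* ... update the numerical values of A here ... */
    MatPtAP(A,P,MAT_REUSE_MATRIX,fill,C);   /* numeric phase only */
    return 0;
  }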

2019: PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C)
2020: {
2021:   Mat_Product         *product = C->product;
2022:   Mat                 A=product->A,P=product->B;
2023:   MatProductAlgorithm alg=product->alg;
2024:   PetscReal           fill=product->fill;
2025:   PetscBool           flg;

2027:   /* scalable: do R=P^T locally, then C=R*A*P */
2028:   PetscStrcmp(alg,"scalable",&flg);
2029:   if (flg) {
2030:     MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,product->fill,C);
2031:     C->ops->productnumeric = MatProductNumeric_PtAP;
2032:     goto next;
2033:   }

2035:   /* nonscalable: do R=P^T locally, then C=R*A*P */
2036:   PetscStrcmp(alg,"nonscalable",&flg);
2037:   if (flg) {
2038:     MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);
2039:     goto next;
2040:   }

2042:   /* allatonce */
2043:   PetscStrcmp(alg,"allatonce",&flg);
2044:   if (flg) {
2045:     MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A,P,fill,C);
2046:     goto next;
2047:   }

2049:   /* allatonce_merged */
2050:   PetscStrcmp(alg,"allatonce_merged",&flg);
2051:   if (flg) {
2052:     MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A,P,fill,C);
2053:     goto next;
2054:   }

2056:   /* backend general code */
2057:   PetscStrcmp(alg,"backend",&flg);
2058:   if (flg) {
2059:     MatProductSymbolic_MPIAIJBACKEND(C);
2060:     return 0;
2061:   }

2063:   /* hypre */
2064: #if defined(PETSC_HAVE_HYPRE)
2065:   PetscStrcmp(alg,"hypre",&flg);
2066:   if (flg) {
2067:     MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C);
2068:     C->ops->productnumeric = MatProductNumeric_PtAP;
2069:     return 0;
2070:   }
2071: #endif
2072:   SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_SUP,"Mat Product Algorithm is not supported");

2074: next:
2075:   C->ops->productnumeric = MatProductNumeric_PtAP;
2076:   return 0;
2077: }
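A sketch of how the dispatcher above is reached through the MatProduct interface (illustrative;
the helper name PtAPWithAlgorithm is made up, and the algorithm strings are the ones compared
above):

  #include <petscmat.h>

  /* Build C = P^T*A*P for MPIAIJ A and P with an explicitly chosen PtAP algorithm,
     e.g. "scalable", "nonscalable", "allatonce", "allatonce_merged" or "backend". */
  static PetscErrorCode PtAPWithAlgorithm(Mat A,Mat P,const char *alg,Mat *C)
  {
    MatProductCreate(A,P,NULL,C);
    MatProductSetType(*C,MATPRODUCT_PtAP);
    MatProductSetAlgorithm(*C,alg);
    MatProductSetFill(*C,PETSC_DEFAULT);
    MatProductSetFromOptions(*C); /* may still honor run-time option overrides */
    MatProductSymbolic(*C);       /* for MPIAIJ operands this ends up in MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ() */
    MatProductNumeric(*C);
    return 0;
  }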