Actual source code: mpiptap.c
2: /*
3: Defines projective product routines where A is an MPIAIJ matrix
4: C = P^T * A * P
5: */
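/*
   Minimal usage sketch, assuming MATMPIAIJ A and P: a user-level MatPtAP() call
   dispatches to one of the symbolic/numeric pairs defined in this file.  The fill
   value 2.0 below is only an illustrative choice.

      Mat A,P,C;
      // ... assemble A (n x n) and P (n x m) in MATMPIAIJ format ...
      MatPtAP(A,P,MAT_INITIAL_MATRIX,2.0,&C);  // symbolic + numeric C = P^T*A*P
      // ... change the numerical values of A, keeping its nonzero pattern ...
      MatPtAP(A,P,MAT_REUSE_MATRIX,2.0,&C);    // reuses the Mat_APMPI data attached to C
      MatDestroy(&C);

   The all-at-once kernels additionally expose -matptap_allatonce_via <overlapping|merged>
   (see the symbolic routines near the end of this file).
*/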
7: #include <../src/mat/impls/aij/seq/aij.h>
8: #include <../src/mat/utils/freespace.h>
9: #include <../src/mat/impls/aij/mpi/mpiaij.h>
10: #include <petscbt.h>
11: #include <petsctime.h>
12: #include <petsc/private/hashmapiv.h>
13: #include <petsc/private/hashseti.h>
14: #include <petscsf.h>
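/* Viewer hook for PtAP product matrices: with an ASCII "info" format it reports which
   MatPtAP() implementation (scalable, nonscalable, allatonce, merged allatonce) built
   the product. */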
16: PetscErrorCode MatView_MPIAIJ_PtAP(Mat A,PetscViewer viewer)
17: {
18: PetscBool iascii;
19: PetscViewerFormat format;
20: Mat_APMPI *ptap;
22: MatCheckProduct(A,1);
23: ptap = (Mat_APMPI*)A->product->data;
24: PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&iascii);
25: if (iascii) {
26: PetscViewerGetFormat(viewer,&format);
27: if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
28: if (ptap->algType == 0) {
29: PetscViewerASCIIPrintf(viewer,"using scalable MatPtAP() implementation\n");
30: } else if (ptap->algType == 1) {
31: PetscViewerASCIIPrintf(viewer,"using nonscalable MatPtAP() implementation\n");
32: } else if (ptap->algType == 2) {
33: PetscViewerASCIIPrintf(viewer,"using allatonce MatPtAP() implementation\n");
34: } else if (ptap->algType == 3) {
35: PetscViewerASCIIPrintf(viewer,"using merged allatonce MatPtAP() implementation\n");
36: }
37: }
38: }
39: return 0;
40: }
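/* Destructor for the Mat_APMPI scratch data attached to a PtAP product: releases the
   communication buffers, the intermediate matrices (P_loc, P_oth, A_loc, Rd, Ro, AP_loc,
   C_loc, C_oth, Pt) and, for the alg_ptap path, the Mat_Merge_SeqsToMPI helper. */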
42: PetscErrorCode MatDestroy_MPIAIJ_PtAP(void *data)
43: {
44: Mat_APMPI *ptap = (Mat_APMPI*)data;
45: Mat_Merge_SeqsToMPI *merge;
47: PetscFree2(ptap->startsj_s,ptap->startsj_r);
48: PetscFree(ptap->bufa);
49: MatDestroy(&ptap->P_loc);
50: MatDestroy(&ptap->P_oth);
51: MatDestroy(&ptap->A_loc); /* used by MatTransposeMatMult() */
52: MatDestroy(&ptap->Rd);
53: MatDestroy(&ptap->Ro);
54: if (ptap->AP_loc) { /* used by alg_rap */
55: Mat_SeqAIJ *ap = (Mat_SeqAIJ*)(ptap->AP_loc)->data;
56: PetscFree(ap->i);
57: PetscFree2(ap->j,ap->a);
58: MatDestroy(&ptap->AP_loc);
59: } else { /* used by alg_ptap */
60: PetscFree(ptap->api);
61: PetscFree(ptap->apj);
62: }
63: MatDestroy(&ptap->C_loc);
64: MatDestroy(&ptap->C_oth);
65: if (ptap->apa) PetscFree(ptap->apa);
67: MatDestroy(&ptap->Pt);
69: merge = ptap->merge;
70: if (merge) { /* used by alg_ptap */
71: PetscFree(merge->id_r);
72: PetscFree(merge->len_s);
73: PetscFree(merge->len_r);
74: PetscFree(merge->bi);
75: PetscFree(merge->bj);
76: PetscFree(merge->buf_ri[0]);
77: PetscFree(merge->buf_ri);
78: PetscFree(merge->buf_rj[0]);
79: PetscFree(merge->buf_rj);
80: PetscFree(merge->coi);
81: PetscFree(merge->coj);
82: PetscFree(merge->owners_co);
83: PetscLayoutDestroy(&merge->rowmap);
84: PetscFree(ptap->merge);
85: }
86: ISLocalToGlobalMappingDestroy(&ptap->ltog);
88: PetscSFDestroy(&ptap->sf);
89: PetscFree(ptap->c_othi);
90: PetscFree(ptap->c_rmti);
91: PetscFree(ptap);
92: return 0;
93: }
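/* Numeric phase of the scalable PtAP: reuses the symbolic data in ptap to
   (1) refresh Rd = Pd^T and Ro = Po^T, (2) recompute AP_loc = A_loc*P row by row, and
   (3) form C_loc = Rd*AP_loc and C_oth = Ro*AP_loc, adding both into C. */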
95: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,Mat C)
96: {
97: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
98: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
99: Mat_SeqAIJ *ap,*p_loc,*p_oth=NULL,*c_seq;
100: Mat_APMPI *ptap;
101: Mat AP_loc,C_loc,C_oth;
102: PetscInt i,rstart,rend,cm,ncols,row,*api,*apj,am = A->rmap->n,apnz,nout;
103: PetscScalar *apa;
104: const PetscInt *cols;
105: const PetscScalar *vals;
107: MatCheckProduct(C,3);
108: ptap = (Mat_APMPI*)C->product->data;
112: MatZeroEntries(C);
114: /* 1) get R = Pd^T, Ro = Po^T */
115: if (ptap->reuse == MAT_REUSE_MATRIX) {
116: MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
117: MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
118: }
120: /* 2) get AP_loc */
121: AP_loc = ptap->AP_loc;
122: ap = (Mat_SeqAIJ*)AP_loc->data;
124: /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
125: /*-----------------------------------------------------*/
126: if (ptap->reuse == MAT_REUSE_MATRIX) {
127: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
128: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
129: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
130: }
132: /* 2-2) compute numeric A_loc*P - dominating part */
133: /* ---------------------------------------------- */
134: /* get data from symbolic products */
135: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
136: if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
138: api = ap->i;
139: apj = ap->j;
140: ISLocalToGlobalMappingApply(ptap->ltog,api[AP_loc->rmap->n],apj,apj);
141: for (i=0; i<am; i++) {
142: /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
143: apnz = api[i+1] - api[i];
144: apa = ap->a + api[i];
145: PetscArrayzero(apa,apnz);
146: AProw_scalable(i,ad,ao,p_loc,p_oth,api,apj,apa);
147: }
148: ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,api[AP_loc->rmap->n],apj,&nout,apj);
151: /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
152: /* Always use the scalable sequential kernel since this is the scalable MPI algorithm */
153: MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Rd,AP_loc,ptap->C_loc);
154: MatMatMultNumeric_SeqAIJ_SeqAIJ_Scalable(ptap->Ro,AP_loc,ptap->C_oth);
156: C_loc = ptap->C_loc;
157: C_oth = ptap->C_oth;
159: /* add C_loc and C_oth to C */
160: MatGetOwnershipRange(C,&rstart,&rend);
162: /* C_loc -> C */
163: cm = C_loc->rmap->N;
164: c_seq = (Mat_SeqAIJ*)C_loc->data;
165: cols = c_seq->j;
166: vals = c_seq->a;
167: ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_loc->rmap->n],c_seq->j,c_seq->j);
169: /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
170: /* when there are no off-processor parts. */
171: /* If was_assembled is true, the assignment aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
172: /* is no longer valid. Then the more general MatSetValues_MPIAIJ() has to be used, which looks up the column index in */
173: /* a table and performs the additional bookkeeping that this requires. */
174: if (C->assembled) {
175: C->was_assembled = PETSC_TRUE;
176: C->assembled = PETSC_FALSE;
177: }
178: if (C->was_assembled) {
179: for (i=0; i<cm; i++) {
180: ncols = c_seq->i[i+1] - c_seq->i[i];
181: row = rstart + i;
182: MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);
183: cols += ncols; vals += ncols;
184: }
185: } else {
186: MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);
187: }
188: ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_loc->rmap->n],c_seq->j,&nout,c_seq->j);
191: /* Co -> C, off-processor part */
192: cm = C_oth->rmap->N;
193: c_seq = (Mat_SeqAIJ*)C_oth->data;
194: cols = c_seq->j;
195: vals = c_seq->a;
196: ISLocalToGlobalMappingApply(ptap->ltog,c_seq->i[C_oth->rmap->n],c_seq->j,c_seq->j);
197: for (i=0; i<cm; i++) {
198: ncols = c_seq->i[i+1] - c_seq->i[i];
199: row = p->garray[i];
200: MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
201: cols += ncols; vals += ncols;
202: }
203: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
204: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
206: ptap->reuse = MAT_REUSE_MATRIX;
208: ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_seq->i[C_oth->rmap->n],c_seq->j,&nout,c_seq->j);
210: return 0;
211: }
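/* Symbolic phase of the scalable PtAP: builds P_loc/P_oth and the transposes Rd/Ro,
   computes the sparsity pattern of AP_loc = A_loc*P with a condensed linked list and
   FreeSpace buffers, forms the symbolic products C_loc = Rd*AP_loc and C_oth = Ro*AP_loc,
   sends the off-process rows of C_oth to their owners, and preallocates Cmpi.  The
   numeric work is done by MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable(). */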
213: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(Mat A,Mat P,PetscReal fill,Mat Cmpi)
214: {
215: PetscErrorCode ierr;
216: Mat_APMPI *ptap;
217: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
218: MPI_Comm comm;
219: PetscMPIInt size,rank;
220: Mat P_loc,P_oth;
221: PetscFreeSpaceList free_space=NULL,current_space=NULL;
222: PetscInt am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
223: PetscInt *lnk,i,k,pnz,row,nsend;
224: PetscMPIInt tagi,tagj,*len_si,*len_s,*len_ri,nrecv;
225: PETSC_UNUSED PetscMPIInt icompleted=0;
226: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
227: const PetscInt *owners;
228: PetscInt len,proc,*dnz,*onz,nzi,nspacedouble;
229: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
230: MPI_Request *swaits,*rwaits;
231: MPI_Status *sstatus,rstatus;
232: PetscLayout rowmap;
233: PetscInt *owners_co,*coi,*coj; /* i and j arrays of (p->B)^T*A*P - used in the communication */
234: PetscMPIInt *len_r,*id_r; /* arrays of length comm->size, store send/recv matrix values */
235: PetscInt *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,Crmax,*aj,*ai,*pi,nout;
236: Mat_SeqAIJ *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
237: PetscScalar *apv;
238: PetscTable ta;
239: MatType mtype;
240: const char *prefix;
241: #if defined(PETSC_USE_INFO)
242: PetscReal apfill;
243: #endif
245: MatCheckProduct(Cmpi,4);
247: PetscObjectGetComm((PetscObject)A,&comm);
248: MPI_Comm_size(comm,&size);
249: MPI_Comm_rank(comm,&rank);
251: if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;
253: /* create symbolic parallel matrix Cmpi */
254: MatGetType(A,&mtype);
255: MatSetType(Cmpi,mtype);
257: /* create struct Mat_APMPI and attach it to C later */
258: PetscNew(&ptap);
259: ptap->reuse = MAT_INITIAL_MATRIX;
260: ptap->algType = 0;
262: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
263: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&P_oth);
264: /* get P_loc by taking all local rows of P */
265: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&P_loc);
267: ptap->P_loc = P_loc;
268: ptap->P_oth = P_oth;
270: /* (0) compute Rd = Pd^T, Ro = Po^T */
271: /* --------------------------------- */
272: MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
273: MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);
275: /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
276: /* ----------------------------------------------------------------- */
277: p_loc = (Mat_SeqAIJ*)P_loc->data;
278: if (P_oth) p_oth = (Mat_SeqAIJ*)P_oth->data;
280: /* create and initialize a linked list */
281: PetscTableCreate(pn,pN,&ta); /* for computing AP_loc and Cmpi */
282: MatRowMergeMax_SeqAIJ(p_loc,P_loc->rmap->N,ta);
283: MatRowMergeMax_SeqAIJ(p_oth,P_oth->rmap->N,ta);
284: PetscTableGetCount(ta,&Crmax); /* Crmax = nnz(sum of Prows) */
286: PetscLLCondensedCreate_Scalable(Crmax,&lnk);
288: /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
289: if (ao) {
290: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);
291: } else {
292: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);
293: }
294: current_space = free_space;
295: nspacedouble = 0;
297: PetscMalloc1(am+1,&api);
298: api[0] = 0;
299: for (i=0; i<am; i++) {
300: /* diagonal portion: Ad[i,:]*P */
301: ai = ad->i; pi = p_loc->i;
302: nzi = ai[i+1] - ai[i];
303: aj = ad->j + ai[i];
304: for (j=0; j<nzi; j++) {
305: row = aj[j];
306: pnz = pi[row+1] - pi[row];
307: Jptr = p_loc->j + pi[row];
308: /* add non-zero cols of P into the sorted linked list lnk */
309: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
310: }
311: /* off-diagonal portion: Ao[i,:]*P */
312: if (ao) {
313: ai = ao->i; pi = p_oth->i;
314: nzi = ai[i+1] - ai[i];
315: aj = ao->j + ai[i];
316: for (j=0; j<nzi; j++) {
317: row = aj[j];
318: pnz = pi[row+1] - pi[row];
319: Jptr = p_oth->j + pi[row];
320: PetscLLCondensedAddSorted_Scalable(pnz,Jptr,lnk);
321: }
322: }
323: apnz = lnk[0];
324: api[i+1] = api[i] + apnz;
326: /* if free space is not available, double the total space in the list */
327: if (current_space->local_remaining<apnz) {
328: PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
329: nspacedouble++;
330: }
332: /* Copy data into free space, then initialize lnk */
333: PetscLLCondensedClean_Scalable(apnz,current_space->array,lnk);
335: current_space->array += apnz;
336: current_space->local_used += apnz;
337: current_space->local_remaining -= apnz;
338: }
339: /* Allocate space for apj and apv, initialize apj, and */
340: /* destroy list of free space and other temporary array(s) */
341: PetscCalloc2(api[am],&apj,api[am],&apv);
342: PetscFreeSpaceContiguous(&free_space,apj);
343: PetscLLCondensedDestroy_Scalable(lnk);
345: /* Create AP_loc for reuse */
346: MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);
347: MatSeqAIJCompactOutExtraColumns_SeqAIJ(ptap->AP_loc, &ptap->ltog);
349: #if defined(PETSC_USE_INFO)
350: if (ao) {
351: apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
352: } else {
353: apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
354: }
355: ptap->AP_loc->info.mallocs = nspacedouble;
356: ptap->AP_loc->info.fill_ratio_given = fill;
357: ptap->AP_loc->info.fill_ratio_needed = apfill;
359: if (api[am]) {
360: PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);
361: PetscInfo(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.\n",(double)apfill);
362: } else {
363: PetscInfo(ptap->AP_loc,"Scalable algorithm, AP_loc is empty \n");
364: }
365: #endif
367: /* (2-1) compute symbolic Co = Ro*AP_loc */
368: /* -------------------------------------- */
369: MatProductCreate(ptap->Ro,ptap->AP_loc,NULL,&ptap->C_oth);
370: MatGetOptionsPrefix(A,&prefix);
371: MatSetOptionsPrefix(ptap->C_oth,prefix);
372: MatAppendOptionsPrefix(ptap->C_oth,"inner_offdiag_");
374: MatProductSetType(ptap->C_oth,MATPRODUCT_AB);
375: MatProductSetAlgorithm(ptap->C_oth,"sorted");
376: MatProductSetFill(ptap->C_oth,fill);
377: MatProductSetFromOptions(ptap->C_oth);
378: MatProductSymbolic(ptap->C_oth);
380: /* (3) send coj of C_oth to other processors */
381: /* ------------------------------------------ */
382: /* determine row ownership */
383: PetscLayoutCreate(comm,&rowmap);
384: PetscLayoutSetLocalSize(rowmap, pn);
385: PetscLayoutSetBlockSize(rowmap, 1);
386: PetscLayoutSetUp(rowmap);
387: PetscLayoutGetRanges(rowmap,&owners);
389: /* determine the number of messages to send, their lengths */
390: PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
391: PetscArrayzero(len_s,size);
392: PetscArrayzero(len_si,size);
394: c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
395: coi = c_oth->i; coj = c_oth->j;
396: con = ptap->C_oth->rmap->n;
397: proc = 0;
398: ISLocalToGlobalMappingApply(ptap->ltog,coi[con],coj,coj);
399: for (i=0; i<con; i++) {
400: while (prmap[i] >= owners[proc+1]) proc++;
401: len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */
402: len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
403: }
405: len = 0; /* max length of buf_si[], see (4) */
406: owners_co[0] = 0;
407: nsend = 0;
408: for (proc=0; proc<size; proc++) {
409: owners_co[proc+1] = owners_co[proc] + len_si[proc];
410: if (len_s[proc]) {
411: nsend++;
412: len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
413: len += len_si[proc];
414: }
415: }
417: /* determine the number and length of messages to receive for coi and coj */
418: PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
419: PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);
421: /* post the Irecv and Isend of coj */
422: PetscCommGetNewTag(comm,&tagj);
423: PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
424: PetscMalloc1(nsend+1,&swaits);
425: for (proc=0, k=0; proc<size; proc++) {
426: if (!len_s[proc]) continue;
427: i = owners_co[proc];
428: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
429: k++;
430: }
432: /* (2-2) compute symbolic C_loc = Rd*AP_loc */
433: /* ---------------------------------------- */
434: MatProductCreate(ptap->Rd,ptap->AP_loc,NULL,&ptap->C_loc);
435: MatProductSetType(ptap->C_loc,MATPRODUCT_AB);
436: MatProductSetAlgorithm(ptap->C_loc,"default");
437: MatProductSetFill(ptap->C_loc,fill);
439: MatSetOptionsPrefix(ptap->C_loc,prefix);
440: MatAppendOptionsPrefix(ptap->C_loc,"inner_diag_");
442: MatProductSetFromOptions(ptap->C_loc);
443: MatProductSymbolic(ptap->C_loc);
445: c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
446: ISLocalToGlobalMappingApply(ptap->ltog,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,c_loc->j);
448: /* receives of coj are complete */
449: for (i=0; i<nrecv; i++) {
450: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
451: }
452: PetscFree(rwaits);
453: if (nsend) MPI_Waitall(nsend,swaits,sstatus);
455: /* add received column indices into ta to update Crmax */
456: for (k=0; k<nrecv; k++) {/* k-th received message */
457: Jptr = buf_rj[k];
458: for (j=0; j<len_r[k]; j++) {
459: PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
460: }
461: }
462: PetscTableGetCount(ta,&Crmax);
463: PetscTableDestroy(&ta);
465: /* (4) send and recv coi */
466: /*-----------------------*/
467: PetscCommGetNewTag(comm,&tagi);
468: PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
469: PetscMalloc1(len+1,&buf_s);
470: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
471: for (proc=0,k=0; proc<size; proc++) {
472: if (!len_s[proc]) continue;
473: /* form outgoing message for i-structure:
474: buf_si[0]: nrows to be sent
475: [1:nrows]: row index (global)
476: [nrows+1:2*nrows+1]: i-structure index
477: */
478: /*-------------------------------------------*/
479: nrows = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
480: buf_si_i = buf_si + nrows+1;
481: buf_si[0] = nrows;
482: buf_si_i[0] = 0;
483: nrows = 0;
484: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
485: nzi = coi[i+1] - coi[i];
486: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
487: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
488: nrows++;
489: }
490: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
491: k++;
492: buf_si += len_si[proc];
493: }
494: for (i=0; i<nrecv; i++) {
495: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
496: }
497: PetscFree(rwaits);
498: if (nsend) MPI_Waitall(nsend,swaits,sstatus);
500: PetscFree4(len_s,len_si,sstatus,owners_co);
501: PetscFree(len_ri);
502: PetscFree(swaits);
503: PetscFree(buf_s);
505: /* (5) compute the local portion of Cmpi */
506: /* ------------------------------------------ */
507: /* set initial free space to be Crmax, sufficient for holding nonzeros in each row of Cmpi */
508: PetscFreeSpaceGet(Crmax,&free_space);
509: current_space = free_space;
511: PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
512: for (k=0; k<nrecv; k++) {
513: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
514: nrows = *buf_ri_k[k];
515: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
516: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
517: }
519: MatPreallocateInitialize(comm,pn,pn,dnz,onz);
520: PetscLLCondensedCreate_Scalable(Crmax,&lnk);
521: for (i=0; i<pn; i++) {
522: /* add C_loc into Cmpi */
523: nzi = c_loc->i[i+1] - c_loc->i[i];
524: Jptr = c_loc->j + c_loc->i[i];
525: PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
527: /* add received col data into lnk */
528: for (k=0; k<nrecv; k++) { /* k-th received message */
529: if (i == *nextrow[k]) { /* i-th row */
530: nzi = *(nextci[k]+1) - *nextci[k];
531: Jptr = buf_rj[k] + *nextci[k];
532: PetscLLCondensedAddSorted_Scalable(nzi,Jptr,lnk);
533: nextrow[k]++; nextci[k]++;
534: }
535: }
536: nzi = lnk[0];
538: /* copy data into free space, then initialize lnk */
539: PetscLLCondensedClean_Scalable(nzi,current_space->array,lnk);
540: MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
541: }
542: PetscFree3(buf_ri_k,nextrow,nextci);
543: PetscLLCondensedDestroy_Scalable(lnk);
544: PetscFreeSpaceDestroy(free_space);
546: /* local sizes and preallocation */
547: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
548: if (P->cmap->bs > 0) {
549: PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs);
550: PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs);
551: }
552: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
553: MatPreallocateFinalize(dnz,onz);
555: /* members in merge */
556: PetscFree(id_r);
557: PetscFree(len_r);
558: PetscFree(buf_ri[0]);
559: PetscFree(buf_ri);
560: PetscFree(buf_rj[0]);
561: PetscFree(buf_rj);
562: PetscLayoutDestroy(&rowmap);
564: nout = 0;
565: ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_oth->i[ptap->C_oth->rmap->n],c_oth->j,&nout,c_oth->j);
567: ISGlobalToLocalMappingApply(ptap->ltog,IS_GTOLM_DROP,c_loc->i[ptap->C_loc->rmap->n],c_loc->j,&nout,c_loc->j);
570: /* attach the supporting struct to Cmpi for reuse */
571: Cmpi->product->data = ptap;
572: Cmpi->product->view = MatView_MPIAIJ_PtAP;
573: Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
575: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
576: Cmpi->assembled = PETSC_FALSE;
577: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_scalable;
578: return 0;
579: }
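/* Symbolic kernel shared by the all-at-once algorithms: for row i of A it collects the
   global column indices of row i of A*P into the hash sets dht (columns owned by this
   rank) and oht (all other columns).  With dof > 1 an interlaced numbering is used: a
   global P column col combined with offset = i%dof yields the global C column
   col*dof + offset (e.g. dof = 2, col = 7, offset = 1 gives column 15). */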
581: static inline PetscErrorCode MatPtAPSymbolicComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHSetI dht,PetscHSetI oht)
582: {
583: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
584: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
585: PetscInt *ai,nzi,j,*aj,row,col,*pi,*pj,pnz,nzpi,*p_othcols,k;
586: PetscInt pcstart,pcend,column,offset;
588: pcstart = P->cmap->rstart;
589: pcstart *= dof;
590: pcend = P->cmap->rend;
591: pcend *= dof;
592: /* diagonal portion: Ad[i,:]*P */
593: ai = ad->i;
594: nzi = ai[i+1] - ai[i];
595: aj = ad->j + ai[i];
596: for (j=0; j<nzi; j++) {
597: row = aj[j];
598: offset = row%dof;
599: row /= dof;
600: nzpi = pd->i[row+1] - pd->i[row];
601: pj = pd->j + pd->i[row];
602: for (k=0; k<nzpi; k++) {
603: PetscHSetIAdd(dht,pj[k]*dof+offset+pcstart);
604: }
605: }
606: /* off diag P */
607: for (j=0; j<nzi; j++) {
608: row = aj[j];
609: offset = row%dof;
610: row /= dof;
611: nzpi = po->i[row+1] - po->i[row];
612: pj = po->j + po->i[row];
613: for (k=0; k<nzpi; k++) {
614: PetscHSetIAdd(oht,p->garray[pj[k]]*dof+offset);
615: }
616: }
618: /* off diagonal part: Ao[i, :]*P_oth */
619: if (ao) {
620: ai = ao->i;
621: pi = p_oth->i;
622: nzi = ai[i+1] - ai[i];
623: aj = ao->j + ai[i];
624: for (j=0; j<nzi; j++) {
625: row = aj[j];
626: offset = a->garray[row]%dof;
627: row = map[row];
628: pnz = pi[row+1] - pi[row];
629: p_othcols = p_oth->j + pi[row];
630: for (col=0; col<pnz; col++) {
631: column = p_othcols[col] * dof + offset;
632: if (column>=pcstart && column<pcend) {
633: PetscHSetIAdd(dht,column);
634: } else {
635: PetscHSetIAdd(oht,column);
636: }
637: }
638: }
639: } /* end if (ao) */
640: return 0;
641: }
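/* Numeric counterpart of the routine above: accumulates the values of row i of A*P into
   the hash map hmap, keyed by the same interlaced global column indices, gathering the
   contributions of Ad against the diagonal and off-diagonal blocks of P and of Ao
   against P_oth. */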
643: static inline PetscErrorCode MatPtAPNumericComputeOneRowOfAP_private(Mat A,Mat P,Mat P_oth,const PetscInt *map,PetscInt dof,PetscInt i,PetscHMapIV hmap)
644: {
645: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
646: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data,*p_oth=(Mat_SeqAIJ*)P_oth->data,*pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
647: PetscInt *ai,nzi,j,*aj,row,col,*pi,pnz,*p_othcols,pcstart,*pj,k,nzpi,offset;
648: PetscScalar ra,*aa,*pa;
650: pcstart = P->cmap->rstart;
651: pcstart *= dof;
653: /* diagonal portion: Ad[i,:]*P */
654: ai = ad->i;
655: nzi = ai[i+1] - ai[i];
656: aj = ad->j + ai[i];
657: aa = ad->a + ai[i];
658: for (j=0; j<nzi; j++) {
659: ra = aa[j];
660: row = aj[j];
661: offset = row%dof;
662: row /= dof;
663: nzpi = pd->i[row+1] - pd->i[row];
664: pj = pd->j + pd->i[row];
665: pa = pd->a + pd->i[row];
666: for (k=0; k<nzpi; k++) {
667: PetscHMapIVAddValue(hmap,pj[k]*dof+offset+pcstart,ra*pa[k]);
668: }
669: PetscLogFlops(2.0*nzpi);
670: }
671: for (j=0; j<nzi; j++) {
672: ra = aa[j];
673: row = aj[j];
674: offset = row%dof;
675: row /= dof;
676: nzpi = po->i[row+1] - po->i[row];
677: pj = po->j + po->i[row];
678: pa = po->a + po->i[row];
679: for (k=0; k<nzpi; k++) {
680: PetscHMapIVAddValue(hmap,p->garray[pj[k]]*dof+offset,ra*pa[k]);
681: }
682: PetscLogFlops(2.0*nzpi);
683: }
685: /* off diagonal part: Ao[i, :]*P_oth */
686: if (ao) {
687: ai = ao->i;
688: pi = p_oth->i;
689: nzi = ai[i+1] - ai[i];
690: aj = ao->j + ai[i];
691: aa = ao->a + ai[i];
692: for (j=0; j<nzi; j++) {
693: row = aj[j];
694: offset = a->garray[row]%dof;
695: row = map[row];
696: ra = aa[j];
697: pnz = pi[row+1] - pi[row];
698: p_othcols = p_oth->j + pi[row];
699: pa = p_oth->a + pi[row];
700: for (col=0; col<pnz; col++) {
701: PetscHMapIVAddValue(hmap,p_othcols[col]*dof+offset,ra*pa[col]);
702: }
703: PetscLogFlops(2.0*pnz);
704: }
705: } /* end if (ao) */
707: return 0;
708: }
710: PetscErrorCode MatGetBrowsOfAcols_MPIXAIJ(Mat,Mat,PetscInt dof,MatReuse,Mat*);
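/* All-at-once numeric PtAP (overlapping variant): a first sweep over the rows of A builds
   each needed row of A*P in a hash map, scales it by the matching entries of Po, and
   accumulates the results into the send buffers c_rmtj/c_rmta, which are delivered to
   their owning ranks through ptap->sf; a second sweep scales the rows of A*P by Pd and
   adds them directly into C with MatSetValues(), overlapping with the SF reduction. */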
712: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,Mat C)
713: {
714: Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
715: Mat_SeqAIJ *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
716: Mat_APMPI *ptap;
717: PetscHMapIV hmap;
718: PetscInt i,j,jj,kk,nzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,ccstart,ccend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,*dcc,*occ,loc;
719: PetscScalar *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
720: PetscInt offset,ii,pocol;
721: const PetscInt *mappingindices;
722: IS map;
724: MatCheckProduct(C,4);
725: ptap = (Mat_APMPI*)C->product->data;
729: MatZeroEntries(C);
731: /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
732: /*-----------------------------------------------------*/
733: if (ptap->reuse == MAT_REUSE_MATRIX) {
734: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
735: MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth);
736: }
737: PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);
739: MatGetLocalSize(p->B,NULL,&pon);
740: pon *= dof;
741: PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);
742: MatGetLocalSize(A,&am,NULL);
743: cmaxr = 0;
744: for (i=0; i<pon; i++) {
745: cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
746: }
747: PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);
748: PetscHMapIVCreate(&hmap);
749: PetscHMapIVResize(hmap,cmaxr);
750: ISGetIndices(map,&mappingindices);
751: for (i=0; i<am && pon; i++) {
752: PetscHMapIVClear(hmap);
753: offset = i%dof;
754: ii = i/dof;
755: nzi = po->i[ii+1] - po->i[ii];
756: if (!nzi) continue;
757: MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);
758: voff = 0;
759: PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);
760: if (!voff) continue;
762: /* Form C(ii, :) */
763: poj = po->j + po->i[ii];
764: poa = po->a + po->i[ii];
765: for (j=0; j<nzi; j++) {
766: pocol = poj[j]*dof+offset;
767: c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
768: c_rmtaa = c_rmta + ptap->c_rmti[pocol];
769: for (jj=0; jj<voff; jj++) {
770: apvaluestmp[jj] = apvalues[jj]*poa[j];
771: /* If the row is empty */
772: if (!c_rmtc[pocol]) {
773: c_rmtjj[jj] = apindices[jj];
774: c_rmtaa[jj] = apvaluestmp[jj];
775: c_rmtc[pocol]++;
776: } else {
777: PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc);
778: if (loc>=0){ /* hit */
779: c_rmtaa[loc] += apvaluestmp[jj];
780: PetscLogFlops(1.0);
781: } else { /* new element */
782: loc = -(loc+1);
783: /* Move data backward */
784: for (kk=c_rmtc[pocol]; kk>loc; kk--) {
785: c_rmtjj[kk] = c_rmtjj[kk-1];
786: c_rmtaa[kk] = c_rmtaa[kk-1];
787: }/* End kk */
788: c_rmtjj[loc] = apindices[jj];
789: c_rmtaa[loc] = apvaluestmp[jj];
790: c_rmtc[pocol]++;
791: }
792: }
793: PetscLogFlops(voff);
794: } /* End jj */
795: } /* End j */
796: } /* End i */
798: PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);
799: PetscHMapIVDestroy(&hmap);
801: MatGetLocalSize(P,NULL,&pn);
802: pn *= dof;
803: PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);
805: PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);
806: PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE);
807: MatGetOwnershipRangeColumn(P,&pcstart,&pcend);
808: pcstart = pcstart*dof;
809: pcend = pcend*dof;
810: cd = (Mat_SeqAIJ*)(c->A)->data;
811: co = (Mat_SeqAIJ*)(c->B)->data;
813: cmaxr = 0;
814: for (i=0; i<pn; i++) {
815: cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
816: }
817: PetscCalloc5(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pn,&dcc,pn,&occ);
818: PetscHMapIVCreate(&hmap);
819: PetscHMapIVResize(hmap,cmaxr);
820: for (i=0; i<am && pn; i++) {
821: PetscHMapIVClear(hmap);
822: offset = i%dof;
823: ii = i/dof;
824: nzi = pd->i[ii+1] - pd->i[ii];
825: if (!nzi) continue;
826: MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);
827: voff = 0;
828: PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);
829: if (!voff) continue;
830: /* Form C(ii, :) */
831: pdj = pd->j + pd->i[ii];
832: pda = pd->a + pd->i[ii];
833: for (j=0; j<nzi; j++) {
834: row = pcstart + pdj[j] * dof + offset;
835: for (jj=0; jj<voff; jj++) {
836: apvaluestmp[jj] = apvalues[jj]*pda[j];
837: }
838: PetscLogFlops(voff);
839: MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);
840: }
841: }
842: ISRestoreIndices(map,&mappingindices);
843: MatGetOwnershipRangeColumn(C,&ccstart,&ccend);
844: PetscFree5(apindices,apvalues,apvaluestmp,dcc,occ);
845: PetscHMapIVDestroy(&hmap);
846: PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);
847: PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE);
848: PetscFree2(c_rmtj,c_rmta);
850: /* Add contributions from remote */
851: for (i = 0; i < pn; i++) {
852: row = i + pcstart;
853: MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);
854: }
855: PetscFree2(c_othj,c_otha);
857: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
858: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
860: ptap->reuse = MAT_REUSE_MATRIX;
861: return 0;
862: }
864: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,Mat C)
865: {
867: MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce(A,P,1,C);
868: return 0;
869: }
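/* Merged variant of the all-at-once numeric PtAP: the remote (Po) and local (Pd)
   contributions are formed in a single sweep over the rows of A, so each row of A*P is
   built only once; the SF reduction is then completed before the remote contributions
   are added into C. */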
871: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,Mat C)
872: {
873: Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data,*c=(Mat_MPIAIJ*)C->data;
874: Mat_SeqAIJ *cd,*co,*po=(Mat_SeqAIJ*)p->B->data,*pd=(Mat_SeqAIJ*)p->A->data;
875: Mat_APMPI *ptap;
876: PetscHMapIV hmap;
877: PetscInt i,j,jj,kk,nzi,dnzi,*c_rmtj,voff,*c_othj,pn,pon,pcstart,pcend,row,am,*poj,*pdj,*apindices,cmaxr,*c_rmtc,*c_rmtjj,loc;
878: PetscScalar *c_rmta,*c_otha,*poa,*pda,*apvalues,*apvaluestmp,*c_rmtaa;
879: PetscInt offset,ii,pocol;
880: const PetscInt *mappingindices;
881: IS map;
883: MatCheckProduct(C,4);
884: ptap = (Mat_APMPI*)C->product->data;
888: MatZeroEntries(C);
890: /* Get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
891: /*-----------------------------------------------------*/
892: if (ptap->reuse == MAT_REUSE_MATRIX) {
893: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
894: MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_REUSE_MATRIX,&ptap->P_oth);
895: }
896: PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);
897: MatGetLocalSize(p->B,NULL,&pon);
898: pon *= dof;
899: MatGetLocalSize(P,NULL,&pn);
900: pn *= dof;
902: PetscCalloc2(ptap->c_rmti[pon],&c_rmtj,ptap->c_rmti[pon],&c_rmta);
903: MatGetLocalSize(A,&am,NULL);
904: MatGetOwnershipRangeColumn(P,&pcstart,&pcend);
905: pcstart *= dof;
906: pcend *= dof;
907: cmaxr = 0;
908: for (i=0; i<pon; i++) {
909: cmaxr = PetscMax(cmaxr,ptap->c_rmti[i+1]-ptap->c_rmti[i]);
910: }
911: cd = (Mat_SeqAIJ*)(c->A)->data;
912: co = (Mat_SeqAIJ*)(c->B)->data;
913: for (i=0; i<pn; i++) {
914: cmaxr = PetscMax(cmaxr,(cd->i[i+1]-cd->i[i])+(co->i[i+1]-co->i[i]));
915: }
916: PetscCalloc4(cmaxr,&apindices,cmaxr,&apvalues,cmaxr,&apvaluestmp,pon,&c_rmtc);
917: PetscHMapIVCreate(&hmap);
918: PetscHMapIVResize(hmap,cmaxr);
919: ISGetIndices(map,&mappingindices);
920: for (i=0; i<am && (pon || pn); i++) {
921: PetscHMapIVClear(hmap);
922: offset = i%dof;
923: ii = i/dof;
924: nzi = po->i[ii+1] - po->i[ii];
925: dnzi = pd->i[ii+1] - pd->i[ii];
926: if (!nzi && !dnzi) continue;
927: MatPtAPNumericComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,hmap);
928: voff = 0;
929: PetscHMapIVGetPairs(hmap,&voff,apindices,apvalues);
930: if (!voff) continue;
932: /* Form remote C(ii, :) */
933: poj = po->j + po->i[ii];
934: poa = po->a + po->i[ii];
935: for (j=0; j<nzi; j++) {
936: pocol = poj[j]*dof+offset;
937: c_rmtjj = c_rmtj + ptap->c_rmti[pocol];
938: c_rmtaa = c_rmta + ptap->c_rmti[pocol];
939: for (jj=0; jj<voff; jj++) {
940: apvaluestmp[jj] = apvalues[jj]*poa[j];
941: /* If the row is empty */
942: if (!c_rmtc[pocol]) {
943: c_rmtjj[jj] = apindices[jj];
944: c_rmtaa[jj] = apvaluestmp[jj];
945: c_rmtc[pocol]++;
946: } else {
947: PetscFindInt(apindices[jj],c_rmtc[pocol],c_rmtjj,&loc);
948: if (loc>=0){ /* hit */
949: c_rmtaa[loc] += apvaluestmp[jj];
950: PetscLogFlops(1.0);
951: } else { /* new element */
952: loc = -(loc+1);
953: /* Move data backward */
954: for (kk=c_rmtc[pocol]; kk>loc; kk--) {
955: c_rmtjj[kk] = c_rmtjj[kk-1];
956: c_rmtaa[kk] = c_rmtaa[kk-1];
957: }/* End kk */
958: c_rmtjj[loc] = apindices[jj];
959: c_rmtaa[loc] = apvaluestmp[jj];
960: c_rmtc[pocol]++;
961: }
962: }
963: } /* End jj */
964: PetscLogFlops(voff);
965: } /* End j */
967: /* Form local C(ii, :) */
968: pdj = pd->j + pd->i[ii];
969: pda = pd->a + pd->i[ii];
970: for (j=0; j<dnzi; j++) {
971: row = pcstart + pdj[j] * dof + offset;
972: for (jj=0; jj<voff; jj++) {
973: apvaluestmp[jj] = apvalues[jj]*pda[j];
974: }/* End jj */
975: PetscLogFlops(voff);
976: MatSetValues(C,1,&row,voff,apindices,apvaluestmp,ADD_VALUES);
977: }/* End j */
978: } /* End i */
980: ISRestoreIndices(map,&mappingindices);
981: PetscFree4(apindices,apvalues,apvaluestmp,c_rmtc);
982: PetscHMapIVDestroy(&hmap);
983: PetscCalloc2(ptap->c_othi[pn],&c_othj,ptap->c_othi[pn],&c_otha);
985: PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);
986: PetscSFReduceBegin(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE);
987: PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);
988: PetscSFReduceEnd(ptap->sf,MPIU_SCALAR,c_rmta,c_otha,MPI_REPLACE);
989: PetscFree2(c_rmtj,c_rmta);
991: /* Add contributions from remote */
992: for (i = 0; i < pn; i++) {
993: row = i + pcstart;
994: MatSetValues(C,1,&row,ptap->c_othi[i+1]-ptap->c_othi[i],c_othj+ptap->c_othi[i],c_otha+ptap->c_othi[i],ADD_VALUES);
995: }
996: PetscFree2(c_othj,c_otha);
998: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
999: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
1001: ptap->reuse = MAT_REUSE_MATRIX;
1002: return 0;
1003: }
1005: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,Mat C)
1006: {
1008: MatPtAPNumeric_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,C);
1009: return 0;
1010: }
1012: /* TODO: move algorithm selection to MatProductSetFromOptions */
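/* Symbolic phase of the all-at-once PtAP (overlapping variant, algType 2): hash sets are
   used to estimate, row by row, the pattern of the C rows that must be sent to other
   ranks (ptap->c_rmti) and of the locally owned rows (dnz/onz); a PetscSF is built to
   deliver the remote contributions, Cmpi is preallocated, and the numeric kernel is
   selected through -matptap_allatonce_via. */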
1013: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat Cmpi)
1014: {
1015: Mat_APMPI *ptap;
1016: Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data;
1017: MPI_Comm comm;
1018: Mat_SeqAIJ *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
1019: MatType mtype;
1020: PetscSF sf;
1021: PetscSFNode *iremote;
1022: PetscInt rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
1023: const PetscInt *rootdegrees;
1024: PetscHSetI ht,oht,*hta,*hto;
1025: PetscInt pn,pon,*c_rmtc,i,j,nzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
1026: PetscInt lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
1027: PetscInt nalg=2,alg=0,offset,ii;
1028: PetscMPIInt owner;
1029: const PetscInt *mappingindices;
1030: PetscBool flg;
1031: const char *algTypes[2] = {"overlapping","merged"};
1032: IS map;
1033: PetscErrorCode ierr;
1035: MatCheckProduct(Cmpi,5);
1037: PetscObjectGetComm((PetscObject)A,&comm);
1039: /* Create symbolic parallel matrix Cmpi */
1040: MatGetLocalSize(P,NULL,&pn);
1041: pn *= dof;
1042: MatGetType(A,&mtype);
1043: MatSetType(Cmpi,mtype);
1044: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1046: PetscNew(&ptap);
1047: ptap->reuse = MAT_INITIAL_MATRIX;
1048: ptap->algType = 2;
1050: /* Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1051: MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth);
1052: PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);
1053: /* This equals the number of off-diagonal columns of P */
1054: MatGetLocalSize(p->B,NULL,&pon);
1055: pon *= dof;
1056: /* offsets */
1057: PetscMalloc1(pon+1,&ptap->c_rmti);
1058: /* The number of columns we will send to remote ranks */
1059: PetscMalloc1(pon,&c_rmtc);
1060: PetscMalloc1(pon,&hta);
1061: for (i=0; i<pon; i++) {
1062: PetscHSetICreate(&hta[i]);
1063: }
1064: MatGetLocalSize(A,&am,NULL);
1065: MatGetOwnershipRange(A,&arstart,&arend);
1066: /* Create hash table to merge all columns for C(i, :) */
1067: PetscHSetICreate(&ht);
1069: ISGetIndices(map,&mappingindices);
1070: ptap->c_rmti[0] = 0;
1071: /* 2) Pass 1: calculate the size for C_rmt (a matrix that needs to be sent to other processors) */
1072: for (i=0; i<am && pon; i++) {
1073: /* Form one row of AP */
1074: PetscHSetIClear(ht);
1075: offset = i%dof;
1076: ii = i/dof;
1077: /* If the off-diagonal part is empty, we should not do any calculation */
1078: nzi = po->i[ii+1] - po->i[ii];
1079: if (!nzi) continue;
1081: MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,ht);
1082: PetscHSetIGetSize(ht,&htsize);
1083: /* If AP is empty, just continue */
1084: if (!htsize) continue;
1085: /* Form C(ii, :) */
1086: poj = po->j + po->i[ii];
1087: for (j=0; j<nzi; j++) {
1088: PetscHSetIUpdate(hta[poj[j]*dof+offset],ht);
1089: }
1090: }
1092: for (i=0; i<pon; i++) {
1093: PetscHSetIGetSize(hta[i],&htsize);
1094: ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
1095: c_rmtc[i] = htsize;
1096: }
1098: PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);
1100: for (i=0; i<pon; i++) {
1101: off = 0;
1102: PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);
1103: PetscHSetIDestroy(&hta[i]);
1104: }
1105: PetscFree(hta);
1107: PetscMalloc1(pon,&iremote);
1108: for (i=0; i<pon; i++) {
1109: owner = 0; lidx = 0;
1110: offset = i%dof;
1111: ii = i/dof;
1112: PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx);
1113: iremote[i].index = lidx*dof + offset;
1114: iremote[i].rank = owner;
1115: }
1117: PetscSFCreate(comm,&sf);
1118: PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
1119: /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1120: PetscSFSetRankOrder(sf,PETSC_TRUE);
1121: PetscSFSetFromOptions(sf);
1122: PetscSFSetUp(sf);
1123: /* How many neighbors have contributions to my rows? */
1124: PetscSFComputeDegreeBegin(sf,&rootdegrees);
1125: PetscSFComputeDegreeEnd(sf,&rootdegrees);
1126: rootspacesize = 0;
1127: for (i = 0; i < pn; i++) {
1128: rootspacesize += rootdegrees[i];
1129: }
1130: PetscMalloc1(rootspacesize,&rootspace);
1131: PetscMalloc1(rootspacesize+1,&rootspaceoffsets);
1132: /* Get information from leaves:
1133: * the number of columns other processes contribute to my rows
1134: */
1135: PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);
1136: PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);
1137: PetscFree(c_rmtc);
1138: PetscCalloc1(pn+1,&ptap->c_othi);
1139: /* The number of columns received for each row */
1140: ptap->c_othi[0] = 0;
1141: rootspacesize = 0;
1142: rootspaceoffsets[0] = 0;
1143: for (i = 0; i < pn; i++) {
1144: rcvncols = 0;
1145: for (j = 0; j<rootdegrees[i]; j++) {
1146: rcvncols += rootspace[rootspacesize];
1147: rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1148: rootspacesize++;
1149: }
1150: ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
1151: }
1152: PetscFree(rootspace);
1154: PetscMalloc1(pon,&c_rmtoffsets);
1155: PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);
1156: PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);
1157: PetscSFDestroy(&sf);
1158: PetscFree(rootspaceoffsets);
1160: PetscCalloc1(ptap->c_rmti[pon],&iremote);
1161: nleaves = 0;
1162: for (i = 0; i<pon; i++) {
1163: owner = 0;
1164: ii = i/dof;
1165: PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL);
1166: sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
1167: for (j=0; j<sendncols; j++) {
1168: iremote[nleaves].rank = owner;
1169: iremote[nleaves++].index = c_rmtoffsets[i] + j;
1170: }
1171: }
1172: PetscFree(c_rmtoffsets);
1173: PetscCalloc1(ptap->c_othi[pn],&c_othj);
1175: PetscSFCreate(comm,&ptap->sf);
1176: PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
1177: PetscSFSetFromOptions(ptap->sf);
1178: /* One to one map */
1179: PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);
1181: PetscMalloc2(pn,&dnz,pn,&onz);
1182: PetscHSetICreate(&oht);
1183: MatGetOwnershipRangeColumn(P,&pcstart,&pcend);
1184: pcstart *= dof;
1185: pcend *= dof;
1186: PetscMalloc2(pn,&hta,pn,&hto);
1187: for (i=0; i<pn; i++) {
1188: PetscHSetICreate(&hta[i]);
1189: PetscHSetICreate(&hto[i]);
1190: }
1191: /* Work on local part */
1192: /* 4) Pass 1: Estimate memory for C_loc */
1193: for (i=0; i<am && pn; i++) {
1194: PetscHSetIClear(ht);
1195: PetscHSetIClear(oht);
1196: offset = i%dof;
1197: ii = i/dof;
1198: nzi = pd->i[ii+1] - pd->i[ii];
1199: if (!nzi) continue;
1201: MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht);
1202: PetscHSetIGetSize(ht,&htsize);
1203: PetscHSetIGetSize(oht,&htosize);
1204: if (!(htsize+htosize)) continue;
1205: /* Form C(ii, :) */
1206: pdj = pd->j + pd->i[ii];
1207: for (j=0; j<nzi; j++) {
1208: PetscHSetIUpdate(hta[pdj[j]*dof+offset],ht);
1209: PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht);
1210: }
1211: }
1213: ISRestoreIndices(map,&mappingindices);
1215: PetscHSetIDestroy(&ht);
1216: PetscHSetIDestroy(&oht);
1218: /* Get remote data */
1219: PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);
1220: PetscFree(c_rmtj);
1222: for (i = 0; i < pn; i++) {
1223: nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
1224: rdj = c_othj + ptap->c_othi[i];
1225: for (j = 0; j < nzi; j++) {
1226: col = rdj[j];
1227: /* diag part */
1228: if (col>=pcstart && col<pcend) {
1229: PetscHSetIAdd(hta[i],col);
1230: } else { /* off diag */
1231: PetscHSetIAdd(hto[i],col);
1232: }
1233: }
1234: PetscHSetIGetSize(hta[i],&htsize);
1235: dnz[i] = htsize;
1236: PetscHSetIDestroy(&hta[i]);
1237: PetscHSetIGetSize(hto[i],&htsize);
1238: onz[i] = htsize;
1239: PetscHSetIDestroy(&hto[i]);
1240: }
1242: PetscFree2(hta,hto);
1243: PetscFree(c_othj);
1245: /* local sizes and preallocation */
1246: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1247: MatSetBlockSizes(Cmpi,dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs);
1248: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1249: MatSetUp(Cmpi);
1250: PetscFree2(dnz,onz);
1252: /* attach the supporting struct to Cmpi for reuse */
1253: Cmpi->product->data = ptap;
1254: Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1255: Cmpi->product->view = MatView_MPIAIJ_PtAP;
1257: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1258: Cmpi->assembled = PETSC_FALSE;
1259: /* pick an algorithm */
1260: PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");
1261: alg = 0;
1262: PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);
1263: PetscOptionsEnd();
1264: switch (alg) {
1265: case 0:
1266: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1267: break;
1268: case 1:
1269: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1270: break;
1271: default:
1272: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG," Unsupported allatonce numerical algorithm ");
1273: }
1274: return 0;
1275: }
1277: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(Mat A,Mat P,PetscReal fill,Mat C)
1278: {
1279: MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce(A,P,1,fill,C);
1280: return 0;
1281: }
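/* Merged variant of the all-at-once symbolic phase (algType 3): the remote (Po) and
   local (Pd) patterns are estimated in a single sweep over the rows of A, and the merged
   numeric kernel is the default choice for -matptap_allatonce_via. */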
1283: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(Mat A,Mat P,PetscInt dof,PetscReal fill,Mat Cmpi)
1284: {
1285: Mat_APMPI *ptap;
1286: Mat_MPIAIJ *p=(Mat_MPIAIJ*)P->data;
1287: MPI_Comm comm;
1288: Mat_SeqAIJ *pd=(Mat_SeqAIJ*)p->A->data,*po=(Mat_SeqAIJ*)p->B->data;
1289: MatType mtype;
1290: PetscSF sf;
1291: PetscSFNode *iremote;
1292: PetscInt rootspacesize,*rootspace,*rootspaceoffsets,nleaves;
1293: const PetscInt *rootdegrees;
1294: PetscHSetI ht,oht,*hta,*hto,*htd;
1295: PetscInt pn,pon,*c_rmtc,i,j,nzi,dnzi,htsize,htosize,*c_rmtj,off,*c_othj,rcvncols,sendncols,*c_rmtoffsets;
1296: PetscInt lidx,*rdj,col,pcstart,pcend,*dnz,*onz,am,arstart,arend,*poj,*pdj;
1297: PetscInt nalg=2,alg=0,offset,ii;
1298: PetscMPIInt owner;
1299: PetscBool flg;
1300: const char *algTypes[2] = {"merged","overlapping"};
1301: const PetscInt *mappingindices;
1302: IS map;
1303: PetscErrorCode ierr;
1305: MatCheckProduct(Cmpi,5);
1307: PetscObjectGetComm((PetscObject)A,&comm);
1309: /* Create symbolic parallel matrix Cmpi */
1310: MatGetLocalSize(P,NULL,&pn);
1311: pn *= dof;
1312: MatGetType(A,&mtype);
1313: MatSetType(Cmpi,mtype);
1314: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1316: PetscNew(&ptap);
1317: ptap->reuse = MAT_INITIAL_MATRIX;
1318: ptap->algType = 3;
1320: /* 0) Get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1321: MatGetBrowsOfAcols_MPIXAIJ(A,P,dof,MAT_INITIAL_MATRIX,&ptap->P_oth);
1322: PetscObjectQuery((PetscObject)ptap->P_oth,"aoffdiagtopothmapping",(PetscObject*)&map);
1324: /* This equals the number of off-diagonal columns of P */
1325: MatGetLocalSize(p->B,NULL,&pon);
1326: pon *= dof;
1327: /* offsets */
1328: PetscMalloc1(pon+1,&ptap->c_rmti);
1329: /* The number of columns we will send to remote ranks */
1330: PetscMalloc1(pon,&c_rmtc);
1331: PetscMalloc1(pon,&hta);
1332: for (i=0; i<pon; i++) {
1333: PetscHSetICreate(&hta[i]);
1334: }
1335: MatGetLocalSize(A,&am,NULL);
1336: MatGetOwnershipRange(A,&arstart,&arend);
1337: /* Create hash table to merge all columns for C(i, :) */
1338: PetscHSetICreate(&ht);
1339: PetscHSetICreate(&oht);
1340: PetscMalloc2(pn,&htd,pn,&hto);
1341: for (i=0; i<pn; i++) {
1342: PetscHSetICreate(&htd[i]);
1343: PetscHSetICreate(&hto[i]);
1344: }
1346: ISGetIndices(map,&mappingindices);
1347: ptap->c_rmti[0] = 0;
1348: /* 2) Pass 1: calculate the size for C_rmt (a matrix that needs to be sent to other processors) */
1349: for (i=0; i<am && (pon || pn); i++) {
1350: /* Form one row of AP */
1351: PetscHSetIClear(ht);
1352: PetscHSetIClear(oht);
1353: offset = i%dof;
1354: ii = i/dof;
1355: /* If the off-diagonal part is empty, we should not do any calculation */
1356: nzi = po->i[ii+1] - po->i[ii];
1357: dnzi = pd->i[ii+1] - pd->i[ii];
1358: if (!nzi && !dnzi) continue;
1360: MatPtAPSymbolicComputeOneRowOfAP_private(A,P,ptap->P_oth,mappingindices,dof,i,ht,oht);
1361: PetscHSetIGetSize(ht,&htsize);
1362: PetscHSetIGetSize(oht,&htosize);
1363: /* If AP is empty, just continue */
1364: if (!(htsize+htosize)) continue;
1366: /* Form remote C(ii, :) */
1367: poj = po->j + po->i[ii];
1368: for (j=0; j<nzi; j++) {
1369: PetscHSetIUpdate(hta[poj[j]*dof+offset],ht);
1370: PetscHSetIUpdate(hta[poj[j]*dof+offset],oht);
1371: }
1373: /* Form local C(ii, :) */
1374: pdj = pd->j + pd->i[ii];
1375: for (j=0; j<dnzi; j++) {
1376: PetscHSetIUpdate(htd[pdj[j]*dof+offset],ht);
1377: PetscHSetIUpdate(hto[pdj[j]*dof+offset],oht);
1378: }
1379: }
1381: ISRestoreIndices(map,&mappingindices);
1383: PetscHSetIDestroy(&ht);
1384: PetscHSetIDestroy(&oht);
1386: for (i=0; i<pon; i++) {
1387: PetscHSetIGetSize(hta[i],&htsize);
1388: ptap->c_rmti[i+1] = ptap->c_rmti[i] + htsize;
1389: c_rmtc[i] = htsize;
1390: }
1392: PetscMalloc1(ptap->c_rmti[pon],&c_rmtj);
1394: for (i=0; i<pon; i++) {
1395: off = 0;
1396: PetscHSetIGetElems(hta[i],&off,c_rmtj+ptap->c_rmti[i]);
1397: PetscHSetIDestroy(&hta[i]);
1398: }
1399: PetscFree(hta);
1401: PetscMalloc1(pon,&iremote);
1402: for (i=0; i<pon; i++) {
1403: owner = 0; lidx = 0;
1404: offset = i%dof;
1405: ii = i/dof;
1406: PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,&lidx);
1407: iremote[i].index = lidx*dof+offset;
1408: iremote[i].rank = owner;
1409: }
1411: PetscSFCreate(comm,&sf);
1412: PetscSFSetGraph(sf,pn,pon,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
1413: /* Reorder ranks properly so that the data handled by gather and scatter have the same order */
1414: PetscSFSetRankOrder(sf,PETSC_TRUE);
1415: PetscSFSetFromOptions(sf);
1416: PetscSFSetUp(sf);
1417: /* How many neighbors have contributions to my rows? */
1418: PetscSFComputeDegreeBegin(sf,&rootdegrees);
1419: PetscSFComputeDegreeEnd(sf,&rootdegrees);
1420: rootspacesize = 0;
1421: for (i = 0; i < pn; i++) {
1422: rootspacesize += rootdegrees[i];
1423: }
1424: PetscMalloc1(rootspacesize,&rootspace);
1425: PetscMalloc1(rootspacesize+1,&rootspaceoffsets);
1426: /* Get information from leaves:
1427: * the number of columns other processes contribute to my rows
1428: */
1429: PetscSFGatherBegin(sf,MPIU_INT,c_rmtc,rootspace);
1430: PetscSFGatherEnd(sf,MPIU_INT,c_rmtc,rootspace);
1431: PetscFree(c_rmtc);
1432: PetscMalloc1(pn+1,&ptap->c_othi);
1434: /* The number of columns received for each row */
1434: ptap->c_othi[0] = 0;
1435: rootspacesize = 0;
1436: rootspaceoffsets[0] = 0;
1437: for (i = 0; i < pn; i++) {
1438: rcvncols = 0;
1439: for (j = 0; j<rootdegrees[i]; j++) {
1440: rcvncols += rootspace[rootspacesize];
1441: rootspaceoffsets[rootspacesize+1] = rootspaceoffsets[rootspacesize] + rootspace[rootspacesize];
1442: rootspacesize++;
1443: }
1444: ptap->c_othi[i+1] = ptap->c_othi[i] + rcvncols;
1445: }
1446: PetscFree(rootspace);
1448: PetscMalloc1(pon,&c_rmtoffsets);
1449: PetscSFScatterBegin(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);
1450: PetscSFScatterEnd(sf,MPIU_INT,rootspaceoffsets,c_rmtoffsets);
1451: PetscSFDestroy(&sf);
1452: PetscFree(rootspaceoffsets);
1454: PetscCalloc1(ptap->c_rmti[pon],&iremote);
1455: nleaves = 0;
1456: for (i = 0; i<pon; i++) {
1457: owner = 0;
1458: ii = i/dof;
1459: PetscLayoutFindOwnerIndex(P->cmap,p->garray[ii],&owner,NULL);
1460: sendncols = ptap->c_rmti[i+1] - ptap->c_rmti[i];
1461: for (j=0; j<sendncols; j++) {
1462: iremote[nleaves].rank = owner;
1463: iremote[nleaves++].index = c_rmtoffsets[i] + j;
1464: }
1465: }
1466: PetscFree(c_rmtoffsets);
1467: PetscCalloc1(ptap->c_othi[pn],&c_othj);
1469: PetscSFCreate(comm,&ptap->sf);
1470: PetscSFSetGraph(ptap->sf,ptap->c_othi[pn],nleaves,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER);
1471: PetscSFSetFromOptions(ptap->sf);
1472: /* One to one map */
1473: PetscSFReduceBegin(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);
1474: /* Get remote data */
1475: PetscSFReduceEnd(ptap->sf,MPIU_INT,c_rmtj,c_othj,MPI_REPLACE);
1476: PetscFree(c_rmtj);
1477: PetscMalloc2(pn,&dnz,pn,&onz);
1478: MatGetOwnershipRangeColumn(P,&pcstart,&pcend);
1479: pcstart *= dof;
1480: pcend *= dof;
1481: for (i = 0; i < pn; i++) {
1482: nzi = ptap->c_othi[i+1] - ptap->c_othi[i];
1483: rdj = c_othj + ptap->c_othi[i];
1484: for (j = 0; j < nzi; j++) {
1485: col = rdj[j];
1486: /* diag part */
1487: if (col>=pcstart && col<pcend) {
1488: PetscHSetIAdd(htd[i],col);
1489: } else { /* off diag */
1490: PetscHSetIAdd(hto[i],col);
1491: }
1492: }
1493: PetscHSetIGetSize(htd[i],&htsize);
1494: dnz[i] = htsize;
1495: PetscHSetIDestroy(&htd[i]);
1496: PetscHSetIGetSize(hto[i],&htsize);
1497: onz[i] = htsize;
1498: PetscHSetIDestroy(&hto[i]);
1499: }
1501: PetscFree2(htd,hto);
1502: PetscFree(c_othj);
1504: /* local sizes and preallocation */
1505: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1506: MatSetBlockSizes(Cmpi, dof>1? dof: P->cmap->bs,dof>1? dof: P->cmap->bs);
1507: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1508: PetscFree2(dnz,onz);
1510: /* attach the supporting struct to Cmpi for reuse */
1511: Cmpi->product->data = ptap;
1512: Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1513: Cmpi->product->view = MatView_MPIAIJ_PtAP;
1515: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1516: Cmpi->assembled = PETSC_FALSE;
1517: /* pick an algorithm */
1518: PetscOptionsBegin(PetscObjectComm((PetscObject)A),((PetscObject)A)->prefix,"MatPtAP","Mat");
1519: alg = 0;
1520: PetscOptionsEList("-matptap_allatonce_via","PtAP allatonce numeric approach","MatPtAP",algTypes,nalg,algTypes[alg],&alg,&flg);
1521: PetscOptionsEnd();
1522: switch (alg) {
1523: case 0:
1524: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce_merged;
1525: break;
1526: case 1:
1527: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ_allatonce;
1528: break;
1529: default:
1530: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG," Unsupported allatonce numerical algorithm ");
1531: }
1532: return 0;
1533: }
1535: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(Mat A,Mat P,PetscReal fill,Mat C)
1536: {
1537: MatPtAPSymbolic_MPIAIJ_MPIXAIJ_allatonce_merged(A,P,1,fill,C);
1538: return 0;
1539: }
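/* Symbolic phase of the nonscalable PtAP (algType 1): same overall structure as the
   scalable routine above, but the condensed linked list is backed by a PetscBT of length
   pN and the numeric phase uses dense accumulation (see the ptapnumeric comment below),
   which keeps O(pN) work space per process - hence "nonscalable". */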
1541: PetscErrorCode MatPtAPSymbolic_MPIAIJ_MPIAIJ(Mat A,Mat P,PetscReal fill,Mat Cmpi)
1542: {
1543: PetscErrorCode ierr;
1544: Mat_APMPI *ptap;
1545: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
1546: MPI_Comm comm;
1547: PetscMPIInt size,rank;
1548: PetscFreeSpaceList free_space=NULL,current_space=NULL;
1549: PetscInt am=A->rmap->n,pm=P->rmap->n,pN=P->cmap->N,pn=P->cmap->n;
1550: PetscInt *lnk,i,k,pnz,row,nsend;
1551: PetscBT lnkbt;
1552: PetscMPIInt tagi,tagj,*len_si,*len_s,*len_ri,nrecv;
1553: PETSC_UNUSED PetscMPIInt icompleted=0;
1554: PetscInt **buf_rj,**buf_ri,**buf_ri_k;
1555: PetscInt len,proc,*dnz,*onz,*owners,nzi,nspacedouble;
1556: PetscInt nrows,*buf_s,*buf_si,*buf_si_i,**nextrow,**nextci;
1557: MPI_Request *swaits,*rwaits;
1558: MPI_Status *sstatus,rstatus;
1559: PetscLayout rowmap;
1560: PetscInt *owners_co,*coi,*coj; /* i and j arrays of (p->B)^T*A*P - used in the communication */
1561: PetscMPIInt *len_r,*id_r; /* arrays of length comm->size, store send/recv matrix values */
1562: PetscInt *api,*apj,*Jptr,apnz,*prmap=p->garray,con,j,ap_rmax=0,Crmax,*aj,*ai,*pi;
1563: Mat_SeqAIJ *p_loc,*p_oth=NULL,*ad=(Mat_SeqAIJ*)(a->A)->data,*ao=NULL,*c_loc,*c_oth;
1564: PetscScalar *apv;
1565: PetscTable ta;
1566: MatType mtype;
1567: const char *prefix;
1568: #if defined(PETSC_USE_INFO)
1569: PetscReal apfill;
1570: #endif
1572: MatCheckProduct(Cmpi,4);
1574: PetscObjectGetComm((PetscObject)A,&comm);
1575: MPI_Comm_size(comm,&size);
1576: MPI_Comm_rank(comm,&rank);
1578: if (size > 1) ao = (Mat_SeqAIJ*)(a->B)->data;
1580: /* create symbolic parallel matrix Cmpi */
1581: MatGetType(A,&mtype);
1582: MatSetType(Cmpi,mtype);
1584: /* Do dense axpy in MatPtAPNumeric_MPIAIJ_MPIAIJ() */
1585: Cmpi->ops->ptapnumeric = MatPtAPNumeric_MPIAIJ_MPIAIJ;
1587: /* create struct Mat_APMPI and attach it to C later */
1588: PetscNew(&ptap);
1589: ptap->reuse = MAT_INITIAL_MATRIX;
1590: ptap->algType = 1;
1592: /* get P_oth by taking rows of P (= non-zero cols of local A) from other processors */
1593: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
1594: /* get P_loc by taking all local rows of P */
1595: MatMPIAIJGetLocalMat(P,MAT_INITIAL_MATRIX,&ptap->P_loc);
1597: /* (0) compute Rd = Pd^T, Ro = Po^T */
1598: /* --------------------------------- */
1599: MatTranspose(p->A,MAT_INITIAL_MATRIX,&ptap->Rd);
1600: MatTranspose(p->B,MAT_INITIAL_MATRIX,&ptap->Ro);
1602: /* (1) compute symbolic AP = A_loc*P = Ad*P_loc + Ao*P_oth (api,apj) */
1603: /* ----------------------------------------------------------------- */
1604: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1605: if (ptap->P_oth) p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1607: /* create and initialize a linked list */
1608: PetscTableCreate(pn,pN,&ta); /* for computing AP_loc and Cmpi */
1609: MatRowMergeMax_SeqAIJ(p_loc,ptap->P_loc->rmap->N,ta);
1610: MatRowMergeMax_SeqAIJ(p_oth,ptap->P_oth->rmap->N,ta);
1611: PetscTableGetCount(ta,&Crmax); /* Crmax = nnz(sum of Prows) */
1612: /* printf("[%d] est %d, Crmax %d; pN %d\n",rank,5*(p_loc->rmax+p_oth->rmax + (PetscInt)(1.e-2*pN)),Crmax,pN); */
1614: PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);
1616: /* Initial FreeSpace size is fill*(nnz(A) + nnz(P)) */
1617: if (ao) {
1618: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],PetscIntSumTruncate(ao->i[am],p_loc->i[pm]))),&free_space);
1619: } else {
1620: PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,PetscIntSumTruncate(ad->i[am],p_loc->i[pm])),&free_space);
1621: }
1622: current_space = free_space;
1623: nspacedouble = 0;
1625: PetscMalloc1(am+1,&api);
1626: api[0] = 0;
1627: for (i=0; i<am; i++) {
1628: /* diagonal portion: Ad[i,:]*P */
1629: ai = ad->i; pi = p_loc->i;
1630: nzi = ai[i+1] - ai[i];
1631: aj = ad->j + ai[i];
1632: for (j=0; j<nzi; j++) {
1633: row = aj[j];
1634: pnz = pi[row+1] - pi[row];
1635: Jptr = p_loc->j + pi[row];
1636: /* add non-zero cols of P into the sorted linked list lnk */
1637: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
1638: }
1639: /* off-diagonal portion: Ao[i,:]*P */
1640: if (ao) {
1641: ai = ao->i; pi = p_oth->i;
1642: nzi = ai[i+1] - ai[i];
1643: aj = ao->j + ai[i];
1644: for (j=0; j<nzi; j++) {
1645: row = aj[j];
1646: pnz = pi[row+1] - pi[row];
1647: Jptr = p_oth->j + pi[row];
1648: PetscLLCondensedAddSorted(pnz,Jptr,lnk,lnkbt);
1649: }
1650: }
1651: apnz = lnk[0];
1652: api[i+1] = api[i] + apnz;
1653: if (ap_rmax < apnz) ap_rmax = apnz;
1655: /* if free space is not available, double the total space in the list */
1656: if (current_space->local_remaining<apnz) {
1657: PetscFreeSpaceGet(PetscIntSumTruncate(apnz,current_space->total_array_size),&current_space);
1658: nspacedouble++;
1659: }
1661: /* Copy data into free space, then initialize lnk */
1662: PetscLLCondensedClean(pN,apnz,current_space->array,lnk,lnkbt);
1664: current_space->array += apnz;
1665: current_space->local_used += apnz;
1666: current_space->local_remaining -= apnz;
1667: }
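  /* api[] now holds the row offsets of the symbolic product AP = A_loc*P; the merged column
     indices sit in the free-space list and are compacted into apj[] right below */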
1668: /* Allocate space for apj and apv, initialize apj, and */
1669: /* destroy list of free space and other temporary array(s) */
1670: PetscMalloc2(api[am],&apj,api[am],&apv);
1671: PetscFreeSpaceContiguous(&free_space,apj);
1672: PetscLLDestroy(lnk,lnkbt);
1674: /* Create AP_loc for reuse */
1675: MatCreateSeqAIJWithArrays(PETSC_COMM_SELF,am,pN,api,apj,apv,&ptap->AP_loc);
1676: MatSetType(ptap->AP_loc,((PetscObject)p->A)->type_name);
1677: #if defined(PETSC_USE_INFO)
1678: if (ao) {
1679: apfill = (PetscReal)api[am]/(ad->i[am]+ao->i[am]+p_loc->i[pm]+1);
1680: } else {
1681: apfill = (PetscReal)api[am]/(ad->i[am]+p_loc->i[pm]+1);
1682: }
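  /* fill_ratio_needed = nnz(AP_loc)/(nnz(A_loc)+nnz(P_loc)); reported below so the caller can pass
     a tighter fill estimate on later symbolic calls */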
1683: ptap->AP_loc->info.mallocs = nspacedouble;
1684: ptap->AP_loc->info.fill_ratio_given = fill;
1685: ptap->AP_loc->info.fill_ratio_needed = apfill;
1687: if (api[am]) {
1688: PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc reallocs %" PetscInt_FMT "; Fill ratio: given %g needed %g.\n",nspacedouble,(double)fill,(double)apfill);
1689: PetscInfo(ptap->AP_loc,"Use MatPtAP(A,B,MatReuse,%g,&C) for best AP_loc performance.;\n",(double)apfill);
1690: } else {
1691: PetscInfo(ptap->AP_loc,"Nonscalable algorithm, AP_loc is empty \n");
1692: }
1693: #endif
1695: /* (2-1) compute symbolic Co = Ro*AP_loc */
1696: /* ------------------------------------ */
1697: MatGetOptionsPrefix(A,&prefix);
1698: MatSetOptionsPrefix(ptap->Ro,prefix);
1699: MatAppendOptionsPrefix(ptap->Ro,"inner_offdiag_");
1700: MatProductCreate(ptap->Ro,ptap->AP_loc,NULL,&ptap->C_oth);
1701: MatGetOptionsPrefix(Cmpi,&prefix);
1702: MatSetOptionsPrefix(ptap->C_oth,prefix);
1703: MatAppendOptionsPrefix(ptap->C_oth,"inner_C_oth_");
1704: MatProductSetType(ptap->C_oth,MATPRODUCT_AB);
1705: MatProductSetAlgorithm(ptap->C_oth,"default");
1706: MatProductSetFill(ptap->C_oth,fill);
1707: MatProductSetFromOptions(ptap->C_oth);
1708: MatProductSymbolic(ptap->C_oth);
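  /* C_oth = Ro*AP_loc holds the rows of P^T*A*P that belong to other processes (their global row
     numbers are given by p->garray); its pattern is shipped to the owners in steps (3) and (4) */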
1710: /* (3) send coj of C_oth to other processors */
1711: /* ------------------------------------------ */
1712: /* determine row ownership */
1713: PetscLayoutCreate(comm,&rowmap);
1714: rowmap->n = pn;
1715: rowmap->bs = 1;
1716: PetscLayoutSetUp(rowmap);
1717: owners = rowmap->range;
1719: /* determine the number of messages to send, their lengths */
1720: PetscMalloc4(size,&len_s,size,&len_si,size,&sstatus,size+2,&owners_co);
1721: PetscArrayzero(len_s,size);
1722: PetscArrayzero(len_si,size);
1724: c_oth = (Mat_SeqAIJ*)ptap->C_oth->data;
1725: coi = c_oth->i; coj = c_oth->j;
1726: con = ptap->C_oth->rmap->n;
1727: proc = 0;
1728: for (i=0; i<con; i++) {
1729: while (prmap[i] >= owners[proc+1]) proc++;
1730: len_si[proc]++; /* num of rows in Co(=Pt*AP) to be sent to [proc] */
1731: len_s[proc] += coi[i+1] - coi[i]; /* num of nonzeros in Co to be sent to [proc] */
1732: }
1734: len = 0; /* total length of buf_s[] (all buf_si messages concatenated), see (4) */
1735: owners_co[0] = 0;
1736: nsend = 0;
1737: for (proc=0; proc<size; proc++) {
1738: owners_co[proc+1] = owners_co[proc] + len_si[proc];
1739: if (len_s[proc]) {
1740: nsend++;
1741: len_si[proc] = 2*(len_si[proc] + 1); /* length of buf_si to be sent to [proc] */
1742: len += len_si[proc];
1743: }
1744: }
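  /* each i-structure message has length 2*(nrows+1): one row count, nrows global row indices, and
     the nrows+1 entries of the row-offset array, matching the layout built in step (4) */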
1746: /* determine the number and length of messages to receive for coi and coj */
1747: PetscGatherNumberOfMessages(comm,NULL,len_s,&nrecv);
1748: PetscGatherMessageLengths2(comm,nsend,nrecv,len_s,len_si,&id_r,&len_r,&len_ri);
1750: /* post the Irecv and Isend of coj */
1751: PetscCommGetNewTag(comm,&tagj);
1752: PetscPostIrecvInt(comm,tagj,nrecv,id_r,len_r,&buf_rj,&rwaits);
1753: PetscMalloc1(nsend+1,&swaits);
1754: for (proc=0, k=0; proc<size; proc++) {
1755: if (!len_s[proc]) continue;
1756: i = owners_co[proc];
1757: MPI_Isend(coj+coi[i],len_s[proc],MPIU_INT,proc,tagj,comm,swaits+k);
1758: k++;
1759: }
1761: /* (2-2) compute symbolic C_loc = Rd*AP_loc */
1762: /* ---------------------------------------- */
1763: MatSetOptionsPrefix(ptap->Rd,prefix);
1764: MatAppendOptionsPrefix(ptap->Rd,"inner_diag_");
1765: MatProductCreate(ptap->Rd,ptap->AP_loc,NULL,&ptap->C_loc);
1766: MatGetOptionsPrefix(Cmpi,&prefix);
1767: MatSetOptionsPrefix(ptap->C_loc,prefix);
1768: MatAppendOptionsPrefix(ptap->C_loc,"inner_C_loc_");
1769: MatProductSetType(ptap->C_loc,MATPRODUCT_AB);
1770: MatProductSetAlgorithm(ptap->C_loc,"default");
1771: MatProductSetFill(ptap->C_loc,fill);
1772: MatProductSetFromOptions(ptap->C_loc);
1773: MatProductSymbolic(ptap->C_loc);
1775: c_loc = (Mat_SeqAIJ*)ptap->C_loc->data;
1777: /* wait until receives of coj are complete */
1778: for (i=0; i<nrecv; i++) {
1779: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
1780: }
1781: PetscFree(rwaits);
1782: if (nsend) MPI_Waitall(nsend,swaits,sstatus);
1784: /* add received column indices into ta to update Crmax */
1785: for (k=0; k<nrecv; k++) {/* k-th received message */
1786: Jptr = buf_rj[k];
1787: for (j=0; j<len_r[k]; j++) {
1788: PetscTableAdd(ta,*(Jptr+j)+1,1,INSERT_VALUES);
1789: }
1790: }
1791: PetscTableGetCount(ta,&Crmax);
1792: PetscTableDestroy(&ta);
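  /* Crmax now also counts the column indices received from other processes, so it bounds the row
     length of Cmpi and is used to size lnk and the free space in step (5) */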
1794: /* (4) send and recv coi */
1795: /*-----------------------*/
1796: PetscCommGetNewTag(comm,&tagi);
1797: PetscPostIrecvInt(comm,tagi,nrecv,id_r,len_ri,&buf_ri,&rwaits);
1798: PetscMalloc1(len+1,&buf_s);
1799: buf_si = buf_s; /* points to the beginning of k-th msg to be sent */
1800: for (proc=0,k=0; proc<size; proc++) {
1801: if (!len_s[proc]) continue;
1802: /* form outgoing message for i-structure:
1803: buf_si[0]: nrows to be sent
1804: [1:nrows]: row index (global)
1805: [nrows+1:2*nrows+1]: i-structure index
1806: */
1807: /*-------------------------------------------*/
1808: nrows = len_si[proc]/2 - 1; /* num of rows in Co to be sent to [proc] */
1809: buf_si_i = buf_si + nrows+1;
1810: buf_si[0] = nrows;
1811: buf_si_i[0] = 0;
1812: nrows = 0;
1813: for (i=owners_co[proc]; i<owners_co[proc+1]; i++) {
1814: nzi = coi[i+1] - coi[i];
1815: buf_si_i[nrows+1] = buf_si_i[nrows] + nzi; /* i-structure */
1816: buf_si[nrows+1] = prmap[i] -owners[proc]; /* local row index */
1817: nrows++;
1818: }
1819: MPI_Isend(buf_si,len_si[proc],MPIU_INT,proc,tagi,comm,swaits+k);
1820: k++;
1821: buf_si += len_si[proc];
1822: }
1823: for (i=0; i<nrecv; i++) {
1824: MPI_Waitany(nrecv,rwaits,&icompleted,&rstatus);
1825: }
1826: PetscFree(rwaits);
1827: if (nsend) MPI_Waitall(nsend,swaits,sstatus);
1829: PetscFree4(len_s,len_si,sstatus,owners_co);
1830: PetscFree(len_ri);
1831: PetscFree(swaits);
1832: PetscFree(buf_s);
1834: /* (5) compute the local portion of Cmpi */
1835: /* ------------------------------------------ */
1836: /* set initial free space to be Crmax, sufficient for holding the nonzeros in each row of Cmpi */
1837: PetscFreeSpaceGet(Crmax,&free_space);
1838: current_space = free_space;
1840: PetscMalloc3(nrecv,&buf_ri_k,nrecv,&nextrow,nrecv,&nextci);
1841: for (k=0; k<nrecv; k++) {
1842: buf_ri_k[k] = buf_ri[k]; /* beginning of k-th recved i-structure */
1843: nrows = *buf_ri_k[k];
1844: nextrow[k] = buf_ri_k[k] + 1; /* next row number of k-th recved i-structure */
1845: nextci[k] = buf_ri_k[k] + (nrows + 1); /* points to the next i-structure of k-th recved i-structure */
1846: }
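  /* nextrow[k]/nextci[k] walk the k-th received i-structure in step with the local row loop below:
     whenever local row i matches *nextrow[k], that message contributes a row segment of buf_rj[k] */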
1848: MatPreallocateInitialize(comm,pn,pn,dnz,onz);
1849: PetscLLCondensedCreate(Crmax,pN,&lnk,&lnkbt);
1850: for (i=0; i<pn; i++) {
1851: /* add C_loc into Cmpi */
1852: nzi = c_loc->i[i+1] - c_loc->i[i];
1853: Jptr = c_loc->j + c_loc->i[i];
1854: PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
1856: /* add received col data into lnk */
1857: for (k=0; k<nrecv; k++) { /* k-th received message */
1858: if (i == *nextrow[k]) { /* i-th row */
1859: nzi = *(nextci[k]+1) - *nextci[k];
1860: Jptr = buf_rj[k] + *nextci[k];
1861: PetscLLCondensedAddSorted(nzi,Jptr,lnk,lnkbt);
1862: nextrow[k]++; nextci[k]++;
1863: }
1864: }
1865: nzi = lnk[0];
1867: /* copy data into free space, then initialize lnk */
1868: PetscLLCondensedClean(pN,nzi,current_space->array,lnk,lnkbt);
1869: MatPreallocateSet(i+owners[rank],nzi,current_space->array,dnz,onz);
1870: }
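  /* dnz[]/onz[] now hold the per-row counts of diagonal- and off-diagonal-block nonzeros of Cmpi,
     consumed by MatMPIAIJSetPreallocation() below */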
1871: PetscFree3(buf_ri_k,nextrow,nextci);
1872: PetscLLDestroy(lnk,lnkbt);
1873: PetscFreeSpaceDestroy(free_space);
1875: /* local sizes and preallocation */
1876: MatSetSizes(Cmpi,pn,pn,PETSC_DETERMINE,PETSC_DETERMINE);
1877: if (P->cmap->bs > 0) {
1878: PetscLayoutSetBlockSize(Cmpi->rmap,P->cmap->bs);
1879: PetscLayoutSetBlockSize(Cmpi->cmap,P->cmap->bs);
1880: }
1881: MatMPIAIJSetPreallocation(Cmpi,0,dnz,0,onz);
1882: MatPreallocateFinalize(dnz,onz);
1884: /* members in merge */
1885: PetscFree(id_r);
1886: PetscFree(len_r);
1887: PetscFree(buf_ri[0]);
1888: PetscFree(buf_ri);
1889: PetscFree(buf_rj[0]);
1890: PetscFree(buf_rj);
1891: PetscLayoutDestroy(&rowmap);
1893: PetscCalloc1(pN,&ptap->apa);
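  /* apa is a dense work array of global length pN used by AProw_nonscalable() in the numeric phase;
     keeping a buffer of the global column dimension on every process is why this variant is
     labelled nonscalable */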
1895: /* attach the supporting struct to Cmpi for reuse */
1896: Cmpi->product->data = ptap;
1897: Cmpi->product->destroy = MatDestroy_MPIAIJ_PtAP;
1898: Cmpi->product->view = MatView_MPIAIJ_PtAP;
1900: /* Cmpi is not ready for use - assembly will be done by MatPtAPNumeric() */
1901: Cmpi->assembled = PETSC_FALSE;
1902: return 0;
1903: }
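/* An illustrative sketch (not part of this file) of how a caller could reach the symbolic and
   numeric kernels above through the public MatProduct interface. The helper name and the explicit
   choice of the "nonscalable" algorithm string are assumptions made only for this example. */
static PetscErrorCode ExamplePtAP_Product(Mat A,Mat P,PetscReal fill,Mat *C)
{
  MatProductCreate(A,P,NULL,C);             /* product object for C = P^T*A*P with product->A = A, product->B = P */
  MatProductSetType(*C,MATPRODUCT_PtAP);
  MatProductSetAlgorithm(*C,"nonscalable"); /* one of the strings dispatched below */
  MatProductSetFill(*C,fill);
  MatProductSetFromOptions(*C);             /* may override the algorithm from the options database */
  MatProductSymbolic(*C);                   /* reaches MatPtAPSymbolic_MPIAIJ_MPIAIJ() via the dispatcher below */
  MatProductNumeric(*C);                    /* fills the values via MatPtAPNumeric_MPIAIJ_MPIAIJ() */
  return 0;
}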
1905: PetscErrorCode MatPtAPNumeric_MPIAIJ_MPIAIJ(Mat A,Mat P,Mat C)
1906: {
1907: Mat_MPIAIJ *a=(Mat_MPIAIJ*)A->data,*p=(Mat_MPIAIJ*)P->data;
1908: Mat_SeqAIJ *ad=(Mat_SeqAIJ*)(a->A)->data,*ao=(Mat_SeqAIJ*)(a->B)->data;
1909: Mat_SeqAIJ *ap,*p_loc,*p_oth=NULL,*c_seq;
1910: Mat_APMPI *ptap;
1911: Mat AP_loc,C_loc,C_oth;
1912: PetscInt i,rstart,rend,cm,ncols,row;
1913: PetscInt *api,*apj,am = A->rmap->n,j,col,apnz;
1914: PetscScalar *apa;
1915: const PetscInt *cols;
1916: const PetscScalar *vals;
1918: MatCheckProduct(C,3);
1919: ptap = (Mat_APMPI*)C->product->data;
1923: MatZeroEntries(C);
1924: /* 1) get R = Pd^T,Ro = Po^T */
1925: if (ptap->reuse == MAT_REUSE_MATRIX) {
1926: MatTranspose(p->A,MAT_REUSE_MATRIX,&ptap->Rd);
1927: MatTranspose(p->B,MAT_REUSE_MATRIX,&ptap->Ro);
1928: }
1930: /* 2) get AP_loc */
1931: AP_loc = ptap->AP_loc;
1932: ap = (Mat_SeqAIJ*)AP_loc->data;
1934: /* 2-1) get P_oth = ptap->P_oth and P_loc = ptap->P_loc */
1935: /*-----------------------------------------------------*/
1936: if (ptap->reuse == MAT_REUSE_MATRIX) {
1937: /* P_oth and P_loc are obtained in MatPtAPSymbolic() when reuse == MAT_INITIAL_MATRIX */
1938: MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_REUSE_MATRIX,&ptap->startsj_s,&ptap->startsj_r,&ptap->bufa,&ptap->P_oth);
1939: MatMPIAIJGetLocalMat(P,MAT_REUSE_MATRIX,&ptap->P_loc);
1940: }
1942: /* 2-2) compute numeric A_loc*P - dominating part */
1943: /* ---------------------------------------------- */
1944: /* get data from symbolic products */
1945: p_loc = (Mat_SeqAIJ*)(ptap->P_loc)->data;
1946: if (ptap->P_oth) {
1947: p_oth = (Mat_SeqAIJ*)(ptap->P_oth)->data;
1948: }
1949: apa = ptap->apa;
1950: api = ap->i;
1951: apj = ap->j;
1952: for (i=0; i<am; i++) {
1953: /* AP[i,:] = A[i,:]*P = Ad*P_loc + Ao*P_oth */
1954: AProw_nonscalable(i,ad,ao,p_loc,p_oth,apa);
1955: apnz = api[i+1] - api[i];
1956: for (j=0; j<apnz; j++) {
1957: col = apj[j+api[i]];
1958: ap->a[j+ap->i[i]] = apa[col];
1959: apa[col] = 0.0;
1960: }
1961: }
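  /* each row of AP has been accumulated into the dense work array apa and scattered back into
     ap->a following the pattern fixed in the symbolic phase; apa entries are zeroed as they are
     consumed so the array is clean for the next row */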
1962: /* We have modified the contents of local matrix AP_loc and must increase its ObjectState, since we are not doing AssemblyBegin/End on it. */
1963: PetscObjectStateIncrease((PetscObject)AP_loc);
1965: /* 3) C_loc = Rd*AP_loc, C_oth = Ro*AP_loc */
1966: MatProductNumeric(ptap->C_loc);
1967: MatProductNumeric(ptap->C_oth);
1968: C_loc = ptap->C_loc;
1969: C_oth = ptap->C_oth;
1971: /* add C_loc and Co to C */
1972: MatGetOwnershipRange(C,&rstart,&rend);
1974: /* C_loc -> C */
1975: cm = C_loc->rmap->N;
1976: c_seq = (Mat_SeqAIJ*)C_loc->data;
1977: cols = c_seq->j;
1978: vals = c_seq->a;
1980: /* The (fast) MatSetValues_MPIAIJ_CopyFromCSRFormat function can only be used when C->was_assembled is PETSC_FALSE and */
1981: /* when there are no off-processor parts. */
1982: /* If was_assembled is true, then the statement aj[rowstart_diag+dnz_row] = mat_j[col] - cstart; in MatSetValues_MPIAIJ_CopyFromCSRFormat */
1983: /* is no longer true. Then the more complex function MatSetValues_MPIAIJ() has to be used, where the column index is looked up from */
1984: /* a table, and other, more complex stuff has to be done. */
1985: if (C->assembled) {
1986: C->was_assembled = PETSC_TRUE;
1987: C->assembled = PETSC_FALSE;
1988: }
1989: if (C->was_assembled) {
1990: for (i=0; i<cm; i++) {
1991: ncols = c_seq->i[i+1] - c_seq->i[i];
1992: row = rstart + i;
1993: MatSetValues_MPIAIJ(C,1,&row,ncols,cols,vals,ADD_VALUES);
1994: cols += ncols; vals += ncols;
1995: }
1996: } else {
1997: MatSetValues_MPIAIJ_CopyFromCSRFormat(C,c_seq->j,c_seq->i,c_seq->a);
1998: }
2000: /* Co -> C, off-processor part */
2001: cm = C_oth->rmap->N;
2002: c_seq = (Mat_SeqAIJ*)C_oth->data;
2003: cols = c_seq->j;
2004: vals = c_seq->a;
2005: for (i=0; i<cm; i++) {
2006: ncols = c_seq->i[i+1] - c_seq->i[i];
2007: row = p->garray[i];
2008: MatSetValues(C,1,&row,ncols,cols,vals,ADD_VALUES);
2009: cols += ncols; vals += ncols;
2010: }
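  /* these rows belong to other processes: p->garray maps the compressed local row index of C_oth
     to the global row of C, and MatSetValues() stashes the entries for exchange during assembly */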
2012: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
2013: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
2015: ptap->reuse = MAT_REUSE_MATRIX;
2016: return 0;
2017: }
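/* A minimal usage sketch (illustration only, not part of this file) of the reuse path driven by
   ptap->reuse above: the first MatPtAP() call runs the symbolic and numeric phases, and a later
   call with MAT_REUSE_MATRIX redoes only the numeric phase after the entries of A have changed. */
static PetscErrorCode ExamplePtAP_Reuse(Mat A,Mat P,PetscReal fill,Mat *C)
{
  MatPtAP(A,P,MAT_INITIAL_MATRIX,fill,C);  /* symbolic + numeric; attaches the Mat_APMPI struct to *C */
  /* ... change the numerical values of A, keeping its nonzero pattern ... */
  MatPtAP(A,P,MAT_REUSE_MATRIX,fill,C);    /* numeric phase only, reusing AP_loc, C_loc and C_oth */
  return 0;
}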
2019: PETSC_INTERN PetscErrorCode MatProductSymbolic_PtAP_MPIAIJ_MPIAIJ(Mat C)
2020: {
2021: Mat_Product *product = C->product;
2022: Mat A=product->A,P=product->B;
2023: MatProductAlgorithm alg=product->alg;
2024: PetscReal fill=product->fill;
2025: PetscBool flg;
2027: /* scalable: do R=P^T locally, then C=R*A*P */
2028: PetscStrcmp(alg,"scalable",&flg);
2029: if (flg) {
2030: MatPtAPSymbolic_MPIAIJ_MPIAIJ_scalable(A,P,product->fill,C);
2031: C->ops->productnumeric = MatProductNumeric_PtAP;
2032: goto next;
2033: }
2035: /* nonscalable: do R=P^T locally, then C=R*A*P */
2036: PetscStrcmp(alg,"nonscalable",&flg);
2037: if (flg) {
2038: MatPtAPSymbolic_MPIAIJ_MPIAIJ(A,P,fill,C);
2039: goto next;
2040: }
2042: /* allatonce */
2043: PetscStrcmp(alg,"allatonce",&flg);
2044: if (flg) {
2045: MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce(A,P,fill,C);
2046: goto next;
2047: }
2049: /* allatonce_merged */
2050: PetscStrcmp(alg,"allatonce_merged",&flg);
2051: if (flg) {
2052: MatPtAPSymbolic_MPIAIJ_MPIAIJ_allatonce_merged(A,P,fill,C);
2053: goto next;
2054: }
2056: /* backend general code */
2057: PetscStrcmp(alg,"backend",&flg);
2058: if (flg) {
2059: MatProductSymbolic_MPIAIJBACKEND(C);
2060: return 0;
2061: }
2063: /* hypre */
2064: #if defined(PETSC_HAVE_HYPRE)
2065: PetscStrcmp(alg,"hypre",&flg);
2066: if (flg) {
2067: MatPtAPSymbolic_AIJ_AIJ_wHYPRE(A,P,fill,C);
2068: C->ops->productnumeric = MatProductNumeric_PtAP;
2069: return 0;
2070: }
2071: #endif
2072: SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_SUP,"Mat Product Algorithm is not supported");
2074: next:
2075: C->ops->productnumeric = MatProductNumeric_PtAP;
2076: return 0;
2077: }
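/* The algorithm strings recognized above ("scalable", "nonscalable", "allatonce",
   "allatonce_merged", "backend" and, when PETSc is configured with hypre, "hypre") can be selected
   with MatProductSetAlgorithm() before MatProductSetFromOptions(); every branch except "backend"
   installs MatProductNumeric_PtAP as the numeric routine. */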