Actual source code: baijfact2.c
2: /*
3: Factorization code for BAIJ format.
4: */
6: #include <../src/mat/impls/baij/seq/baij.h>
7: #include <petsc/private/kernels/blockinvert.h>
8: #include <petscbt.h>
9: #include <../src/mat/utils/freespace.h>
11: /* ----------------------------------------------------------------*/
12: extern PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscBool);
14: /*
15: This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes
16: */
17: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
18: {
19: Mat C =B;
20: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
21: PetscInt i,j,k,ipvt[15];
22: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj;
23: PetscInt nz,nzL,row;
24: MatScalar *rtmp,*pc,*mwork,*pv,*vv,work[225];
25: const MatScalar *v,*aa=a->a;
26: PetscInt bs2 = a->bs2,bs=A->rmap->bs,flg;
27: PetscInt sol_ver;
28: PetscBool allowzeropivot,zeropivotdetected;
30: allowzeropivot = PetscNot(A->erroriffailure);
31: PetscOptionsGetInt(NULL,((PetscObject)A)->prefix,"-sol_ver",&sol_ver,NULL);
33: /* generate work space needed by the factorization */
34: PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);
35: PetscArrayzero(rtmp,bs2*n);
37: for (i=0; i<n; i++) {
38: /* zero rtmp */
39: /* L part */
40: nz = bi[i+1] - bi[i];
41: bjtmp = bj + bi[i];
42: for (j=0; j<nz; j++) {
43: PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);
44: }
46: /* U part */
47: nz = bdiag[i] - bdiag[i+1];
48: bjtmp = bj + bdiag[i+1]+1;
49: for (j=0; j<nz; j++) {
50: PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);
51: }
53: /* load in initial (unfactored row) */
54: nz = ai[i+1] - ai[i];
55: ajtmp = aj + ai[i];
56: v = aa + bs2*ai[i];
57: for (j=0; j<nz; j++) {
58: PetscArraycpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2);
59: }
61: /* elimination */
62: bjtmp = bj + bi[i];
63: nzL = bi[i+1] - bi[i];
64: for (k=0; k < nzL; k++) {
65: row = bjtmp[k];
66: pc = rtmp + bs2*row;
67: for (flg=0,j=0; j<bs2; j++) {
68: if (pc[j]!=0.0) {
69: flg = 1;
70: break;
71: }
72: }
73: if (flg) {
74: pv = b->a + bs2*bdiag[row];
75: PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork);
76: /* PetscKernel_A_gets_A_times_B_15(pc,pv,mwork); */
77: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
78: pv = b->a + bs2*(bdiag[row+1]+1);
79: nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
80: for (j=0; j<nz; j++) {
81: vv = rtmp + bs2*pj[j];
82: PetscKernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv);
83: /* PetscKernel_A_gets_A_minus_B_times_C_15(vv,pc,pv); */
84: pv += bs2;
85: }
86: PetscLogFlops(2.0*bs2*bs*(nz+1)-bs2); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
87: }
88: }
90: /* finished row so stick it into b->a */
91: /* L part */
92: pv = b->a + bs2*bi[i];
93: pj = b->j + bi[i];
94: nz = bi[i+1] - bi[i];
95: for (j=0; j<nz; j++) {
96: PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);
97: }
99: /* Mark diagonal and invert diagonal for simpler triangular solves */
100: pv = b->a + bs2*bdiag[i];
101: pj = b->j + bdiag[i];
102: PetscArraycpy(pv,rtmp+bs2*pj[0],bs2);
103: PetscKernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftamount,allowzeropivot,&zeropivotdetected);
104: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
106: /* U part */
107: pv = b->a + bs2*(bdiag[i+1]+1);
108: pj = b->j + bdiag[i+1]+1;
109: nz = bdiag[i] - bdiag[i+1] - 1;
110: for (j=0; j<nz; j++) {
111: PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);
112: }
113: }
115: PetscFree2(rtmp,mwork);
117: C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1;
118: C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering;
119: C->assembled = PETSC_TRUE;
121: PetscLogFlops(1.333333333333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
122: return 0;
123: }
125: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info)
126: {
127: Mat C =B;
128: Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
129: IS isrow = b->row,isicol = b->icol;
130: const PetscInt *r,*ic;
131: PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
132: PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
133: MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
134: PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
135: MatScalar *v_work;
136: PetscBool col_identity,row_identity,both_identity;
137: PetscBool allowzeropivot,zeropivotdetected;
139: ISGetIndices(isrow,&r);
140: ISGetIndices(isicol,&ic);
141: allowzeropivot = PetscNot(A->erroriffailure);
143: PetscCalloc1(bs2*n,&rtmp);
145: /* generate work space needed by dense LU factorization */
146: PetscMalloc3(bs,&v_work,bs2,&mwork,bs,&v_pivots);
148: for (i=0; i<n; i++) {
149: /* zero rtmp */
150: /* L part */
151: nz = bi[i+1] - bi[i];
152: bjtmp = bj + bi[i];
153: for (j=0; j<nz; j++) {
154: PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);
155: }
157: /* U part */
158: nz = bdiag[i] - bdiag[i+1];
159: bjtmp = bj + bdiag[i+1]+1;
160: for (j=0; j<nz; j++) {
161: PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);
162: }
164: /* load in initial (unfactored row) */
165: nz = ai[r[i]+1] - ai[r[i]];
166: ajtmp = aj + ai[r[i]];
167: v = aa + bs2*ai[r[i]];
168: for (j=0; j<nz; j++) {
169: PetscArraycpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2);
170: }
172: /* elimination */
173: bjtmp = bj + bi[i];
174: nzL = bi[i+1] - bi[i];
175: for (k=0; k < nzL; k++) {
176: row = bjtmp[k];
177: pc = rtmp + bs2*row;
178: for (flg=0,j=0; j<bs2; j++) {
179: if (pc[j]!=0.0) {
180: flg = 1;
181: break;
182: }
183: }
184: if (flg) {
185: pv = b->a + bs2*bdiag[row];
186: PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
187: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
188: pv = b->a + bs2*(bdiag[row+1]+1);
189: nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
190: for (j=0; j<nz; j++) {
191: PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
192: }
193: PetscLogFlops(2.0*bs2*bs*(nz+1)-bs2); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
194: }
195: }
197: /* finished row so stick it into b->a */
198: /* L part */
199: pv = b->a + bs2*bi[i];
200: pj = b->j + bi[i];
201: nz = bi[i+1] - bi[i];
202: for (j=0; j<nz; j++) {
203: PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);
204: }
206: /* Mark diagonal and invert diagonal for simpler triangular solves */
207: pv = b->a + bs2*bdiag[i];
208: pj = b->j + bdiag[i];
209: PetscArraycpy(pv,rtmp+bs2*pj[0],bs2);
211: PetscKernel_A_gets_inverse_A(bs,pv,v_pivots,v_work,allowzeropivot,&zeropivotdetected);
212: if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
214: /* U part */
215: pv = b->a + bs2*(bdiag[i+1]+1);
216: pj = b->j + bdiag[i+1]+1;
217: nz = bdiag[i] - bdiag[i+1] - 1;
218: for (j=0; j<nz; j++) {
219: PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);
220: }
221: }
223: PetscFree(rtmp);
224: PetscFree3(v_work,mwork,v_pivots);
225: ISRestoreIndices(isicol,&ic);
226: ISRestoreIndices(isrow,&r);
228: ISIdentity(isrow,&row_identity);
229: ISIdentity(isicol,&col_identity);
231: both_identity = (PetscBool) (row_identity && col_identity);
232: if (both_identity) {
233: switch (bs) {
234: case 9:
235: #if defined(PETSC_HAVE_IMMINTRIN_H) && defined(__AVX2__) && defined(__FMA__) && defined(PETSC_USE_REAL_DOUBLE) && !defined(PETSC_USE_COMPLEX) && !defined(PETSC_USE_64BIT_INDICES)
236: C->ops->solve = MatSolve_SeqBAIJ_9_NaturalOrdering;
237: #else
238: C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
239: #endif
240: break;
241: case 11:
242: C->ops->solve = MatSolve_SeqBAIJ_11_NaturalOrdering;
243: break;
244: case 12:
245: C->ops->solve = MatSolve_SeqBAIJ_12_NaturalOrdering;
246: break;
247: case 13:
248: C->ops->solve = MatSolve_SeqBAIJ_13_NaturalOrdering;
249: break;
250: case 14:
251: C->ops->solve = MatSolve_SeqBAIJ_14_NaturalOrdering;
252: break;
253: default:
254: C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
255: break;
256: }
257: } else {
258: C->ops->solve = MatSolve_SeqBAIJ_N;
259: }
260: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N;
262: C->assembled = PETSC_TRUE;
264: PetscLogFlops(1.333333333333*bs*bs2*b->mbs); /* from inverting diagonal blocks */
265: return 0;
266: }
268: /*
269: ilu(0) with natural ordering under new data structure.
270: See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description
271: because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace().
272: */
274: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
275: {
277: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
278: PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
279: PetscInt i,j,nz,*bi,*bj,*bdiag,bi_temp;
281: MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);
282: b = (Mat_SeqBAIJ*)(fact)->data;
284: /* allocate matrix arrays for new data structure */
285: PetscMalloc3(bs2*ai[n]+1,&b->a,ai[n]+1,&b->j,n+1,&b->i);
286: PetscLogObjectMemory((PetscObject)fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));
288: b->singlemalloc = PETSC_TRUE;
289: b->free_a = PETSC_TRUE;
290: b->free_ij = PETSC_TRUE;
291: fact->preallocated = PETSC_TRUE;
292: fact->assembled = PETSC_TRUE;
293: if (!b->diag) {
294: PetscMalloc1(n+1,&b->diag);
295: PetscLogObjectMemory((PetscObject)fact,(n+1)*sizeof(PetscInt));
296: }
297: bdiag = b->diag;
299: if (n > 0) {
300: PetscArrayzero(b->a,bs2*ai[n]);
301: }
303: /* set bi and bj with new data structure */
304: bi = b->i;
305: bj = b->j;
307: /* L part */
308: bi[0] = 0;
309: for (i=0; i<n; i++) {
310: nz = adiag[i] - ai[i];
311: bi[i+1] = bi[i] + nz;
312: aj = a->j + ai[i];
313: for (j=0; j<nz; j++) {
314: *bj = aj[j]; bj++;
315: }
316: }
318: /* U part */
319: bi_temp = bi[n];
320: bdiag[n] = bi[n]-1;
321: for (i=n-1; i>=0; i--) {
322: nz = ai[i+1] - adiag[i] - 1;
323: bi_temp = bi_temp + nz + 1;
324: aj = a->j + adiag[i] + 1;
325: for (j=0; j<nz; j++) {
326: *bj = aj[j]; bj++;
327: }
328: /* diag[i] */
329: *bj = i; bj++;
330: bdiag[i] = bi_temp - 1;
331: }
332: return 0;
333: }
335: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
336: {
337: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
338: IS isicol;
339: const PetscInt *r,*ic;
340: PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d;
341: PetscInt *bi,*cols,nnz,*cols_lvl;
342: PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
343: PetscInt i,levels,diagonal_fill;
344: PetscBool col_identity,row_identity,both_identity;
345: PetscReal f;
346: PetscInt nlnk,*lnk,*lnk_lvl=NULL;
347: PetscBT lnkbt;
348: PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr;
349: PetscFreeSpaceList free_space =NULL,current_space=NULL;
350: PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
351: PetscBool missing;
352: PetscInt bs=A->rmap->bs,bs2=a->bs2;
355: if (bs>1) { /* check shifttype */
357: }
359: MatMissingDiagonal(A,&missing,&d);
362: f = info->fill;
363: levels = (PetscInt)info->levels;
364: diagonal_fill = (PetscInt)info->diagonal_fill;
366: ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
368: ISIdentity(isrow,&row_identity);
369: ISIdentity(iscol,&col_identity);
371: both_identity = (PetscBool) (row_identity && col_identity);
373: if (!levels && both_identity) {
374: /* special case: ilu(0) with natural ordering */
375: MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);
376: MatSeqBAIJSetNumericFactorization(fact,both_identity);
378: fact->factortype = MAT_FACTOR_ILU;
379: (fact)->info.factor_mallocs = 0;
380: (fact)->info.fill_ratio_given = info->fill;
381: (fact)->info.fill_ratio_needed = 1.0;
383: b = (Mat_SeqBAIJ*)(fact)->data;
384: b->row = isrow;
385: b->col = iscol;
386: b->icol = isicol;
387: PetscObjectReference((PetscObject)isrow);
388: PetscObjectReference((PetscObject)iscol);
389: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
391: PetscMalloc1((n+1)*bs,&b->solve_work);
392: return 0;
393: }
395: ISGetIndices(isrow,&r);
396: ISGetIndices(isicol,&ic);
398: /* get new row pointers */
399: PetscMalloc1(n+1,&bi);
400: bi[0] = 0;
401: /* bdiag is location of diagonal in factor */
402: PetscMalloc1(n+1,&bdiag);
403: bdiag[0] = 0;
405: PetscMalloc2(n,&bj_ptr,n,&bjlvl_ptr);
407: /* create a linked list for storing column indices of the active row */
408: nlnk = n + 1;
409: PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);
411: /* initial FreeSpace size is f*(ai[n]+1) */
412: PetscFreeSpaceGet(PetscRealIntMultTruncate(f,ai[n]+1),&free_space);
413: current_space = free_space;
414: PetscFreeSpaceGet(PetscRealIntMultTruncate(f,ai[n]+1),&free_space_lvl);
415: current_space_lvl = free_space_lvl;
417: for (i=0; i<n; i++) {
418: nzi = 0;
419: /* copy current row into linked list */
420: nnz = ai[r[i]+1] - ai[r[i]];
422: cols = aj + ai[r[i]];
423: lnk[i] = -1; /* marker to indicate if diagonal exists */
424: PetscIncompleteLLInit(nnz,cols,n,ic,&nlnk,lnk,lnk_lvl,lnkbt);
425: nzi += nlnk;
427: /* make sure diagonal entry is included */
428: if (diagonal_fill && lnk[i] == -1) {
429: fm = n;
430: while (lnk[fm] < i) fm = lnk[fm];
431: lnk[i] = lnk[fm]; /* insert diagonal into linked list */
432: lnk[fm] = i;
433: lnk_lvl[i] = 0;
434: nzi++; dcount++;
435: }
437: /* add pivot rows into the active row */
438: nzbd = 0;
439: prow = lnk[n];
440: while (prow < i) {
441: nnz = bdiag[prow];
442: cols = bj_ptr[prow] + nnz + 1;
443: cols_lvl = bjlvl_ptr[prow] + nnz + 1;
444: nnz = bi[prow+1] - bi[prow] - nnz - 1;
446: PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,&nlnk,lnk,lnk_lvl,lnkbt,prow);
447: nzi += nlnk;
448: prow = lnk[prow];
449: nzbd++;
450: }
451: bdiag[i] = nzbd;
452: bi[i+1] = bi[i] + nzi;
454: /* if free space is not available, make more free space */
455: if (current_space->local_remaining<nzi) {
456: nnz = PetscIntMultTruncate(2,PetscIntMultTruncate(nzi,(n - i))); /* estimated and max additional space needed */
457: PetscFreeSpaceGet(nnz,¤t_space);
458: PetscFreeSpaceGet(nnz,¤t_space_lvl);
459: reallocs++;
460: }
462: /* copy data into free_space and free_space_lvl, then initialize lnk */
463: PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);
465: bj_ptr[i] = current_space->array;
466: bjlvl_ptr[i] = current_space_lvl->array;
468: /* make sure the active row i has diagonal entry */
471: current_space->array += nzi;
472: current_space->local_used += nzi;
473: current_space->local_remaining -= nzi;
475: current_space_lvl->array += nzi;
476: current_space_lvl->local_used += nzi;
477: current_space_lvl->local_remaining -= nzi;
478: }
480: ISRestoreIndices(isrow,&r);
481: ISRestoreIndices(isicol,&ic);
483: /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
484: PetscMalloc1(bi[n]+1,&bj);
485: PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);
487: PetscIncompleteLLDestroy(lnk,lnkbt);
488: PetscFreeSpaceDestroy(free_space_lvl);
489: PetscFree2(bj_ptr,bjlvl_ptr);
491: #if defined(PETSC_USE_INFO)
492: {
493: PetscReal af = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
494: PetscInfo(A,"Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n",reallocs,(double)f,(double)af);
495: PetscInfo(A,"Run with -[sub_]pc_factor_fill %g or use \n",(double)af);
496: PetscInfo(A,"PCFactorSetFill([sub]pc,%g);\n",(double)af);
497: PetscInfo(A,"for best performance.\n");
498: if (diagonal_fill) {
499: PetscInfo(A,"Detected and replaced %" PetscInt_FMT " missing diagonals\n",dcount);
500: }
501: }
502: #endif
504: /* put together the new matrix */
505: MatSeqBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);
506: PetscLogObjectParent((PetscObject)fact,(PetscObject)isicol);
508: b = (Mat_SeqBAIJ*)(fact)->data;
509: b->free_a = PETSC_TRUE;
510: b->free_ij = PETSC_TRUE;
511: b->singlemalloc = PETSC_FALSE;
513: PetscMalloc1(bs2*(bdiag[0]+1),&b->a);
515: b->j = bj;
516: b->i = bi;
517: b->diag = bdiag;
518: b->free_diag = PETSC_TRUE;
519: b->ilen = NULL;
520: b->imax = NULL;
521: b->row = isrow;
522: b->col = iscol;
523: PetscObjectReference((PetscObject)isrow);
524: PetscObjectReference((PetscObject)iscol);
525: b->icol = isicol;
527: PetscMalloc1(bs*n+bs,&b->solve_work);
528: /* In b structure: Free imax, ilen, old a, old j.
529: Allocate bdiag, solve_work, new a, new j */
530: PetscLogObjectMemory((PetscObject)fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));
531: b->maxnz = b->nz = bdiag[0]+1;
533: fact->info.factor_mallocs = reallocs;
534: fact->info.fill_ratio_given = f;
535: fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
537: MatSeqBAIJSetNumericFactorization(fact,both_identity);
538: return 0;
539: }
541: /*
542: This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
543: except that the data structure of Mat_SeqAIJ is slightly different.
544: Not a good example of code reuse.
545: */
546: PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
547: {
548: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
549: IS isicol;
550: const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
551: PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
552: PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
553: PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
554: PetscBool col_identity,row_identity,both_identity,flg;
555: PetscReal f;
557: MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);
560: f = info->fill;
561: levels = (PetscInt)info->levels;
562: diagonal_fill = (PetscInt)info->diagonal_fill;
564: ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);
566: ISIdentity(isrow,&row_identity);
567: ISIdentity(iscol,&col_identity);
568: both_identity = (PetscBool) (row_identity && col_identity);
570: if (!levels && both_identity) { /* special case copy the nonzero structure */
571: MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);
572: MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);
574: fact->factortype = MAT_FACTOR_ILU;
575: b = (Mat_SeqBAIJ*)fact->data;
576: b->row = isrow;
577: b->col = iscol;
578: PetscObjectReference((PetscObject)isrow);
579: PetscObjectReference((PetscObject)iscol);
580: b->icol = isicol;
581: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
583: PetscMalloc1((n+1)*bs,&b->solve_work);
584: return 0;
585: }
587: /* general case perform the symbolic factorization */
588: ISGetIndices(isrow,&r);
589: ISGetIndices(isicol,&ic);
591: /* get new row pointers */
592: PetscMalloc1(n+1,&ainew);
593: ainew[0] = 0;
594: /* don't know how many column pointers are needed so estimate */
595: jmax = (PetscInt)(f*ai[n] + 1);
596: PetscMalloc1(jmax,&ajnew);
597: /* ajfill is level of fill for each fill entry */
598: PetscMalloc1(jmax,&ajfill);
599: /* fill is a linked list of nonzeros in active row */
600: PetscMalloc1(n+1,&fill);
601: /* im is level for each filled value */
602: PetscMalloc1(n+1,&im);
603: /* dloc is location of diagonal in factor */
604: PetscMalloc1(n+1,&dloc);
605: dloc[0] = 0;
606: for (prow=0; prow<n; prow++) {
608: /* copy prow into linked list */
609: nzf = nz = ai[r[prow]+1] - ai[r[prow]];
611: xi = aj + ai[r[prow]];
612: fill[n] = n;
613: fill[prow] = -1; /* marker for diagonal entry */
614: while (nz--) {
615: fm = n;
616: idx = ic[*xi++];
617: do {
618: m = fm;
619: fm = fill[m];
620: } while (fm < idx);
621: fill[m] = idx;
622: fill[idx] = fm;
623: im[idx] = 0;
624: }
626: /* make sure diagonal entry is included */
627: if (diagonal_fill && fill[prow] == -1) {
628: fm = n;
629: while (fill[fm] < prow) fm = fill[fm];
630: fill[prow] = fill[fm]; /* insert diagonal into linked list */
631: fill[fm] = prow;
632: im[prow] = 0;
633: nzf++;
634: dcount++;
635: }
637: nzi = 0;
638: row = fill[n];
639: while (row < prow) {
640: incrlev = im[row] + 1;
641: nz = dloc[row];
642: xi = ajnew + ainew[row] + nz + 1;
643: flev = ajfill + ainew[row] + nz + 1;
644: nnz = ainew[row+1] - ainew[row] - nz - 1;
645: fm = row;
646: while (nnz-- > 0) {
647: idx = *xi++;
648: if (*flev + incrlev > levels) {
649: flev++;
650: continue;
651: }
652: do {
653: m = fm;
654: fm = fill[m];
655: } while (fm < idx);
656: if (fm != idx) {
657: im[idx] = *flev + incrlev;
658: fill[m] = idx;
659: fill[idx] = fm;
660: fm = idx;
661: nzf++;
662: } else if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
663: flev++;
664: }
665: row = fill[row];
666: nzi++;
667: }
668: /* copy new filled row into permanent storage */
669: ainew[prow+1] = ainew[prow] + nzf;
670: if (ainew[prow+1] > jmax) {
672: /* estimate how much additional space we will need */
673: /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
674: /* just double the memory each time */
675: PetscInt maxadd = jmax;
676: /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
677: if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
678: jmax += maxadd;
680: /* allocate a longer ajnew and ajfill */
681: PetscMalloc1(jmax,&xitmp);
682: PetscArraycpy(xitmp,ajnew,ainew[prow]);
683: PetscFree(ajnew);
684: ajnew = xitmp;
685: PetscMalloc1(jmax,&xitmp);
686: PetscArraycpy(xitmp,ajfill,ainew[prow]);
687: PetscFree(ajfill);
688: ajfill = xitmp;
689: reallocate++; /* count how many reallocations are needed */
690: }
691: xitmp = ajnew + ainew[prow];
692: flev = ajfill + ainew[prow];
693: dloc[prow] = nzi;
694: fm = fill[n];
695: while (nzf--) {
696: *xitmp++ = fm;
697: *flev++ = im[fm];
698: fm = fill[fm];
699: }
700: /* make sure row has diagonal entry */
702: try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
703: }
704: PetscFree(ajfill);
705: ISRestoreIndices(isrow,&r);
706: ISRestoreIndices(isicol,&ic);
707: PetscFree(fill);
708: PetscFree(im);
710: #if defined(PETSC_USE_INFO)
711: {
712: PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
713: PetscInfo(A,"Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n",reallocate,(double)f,(double)af);
714: PetscInfo(A,"Run with -pc_factor_fill %g or use \n",(double)af);
715: PetscInfo(A,"PCFactorSetFill(pc,%g);\n",(double)af);
716: PetscInfo(A,"for best performance.\n");
717: if (diagonal_fill) {
718: PetscInfo(A,"Detected and replaced %" PetscInt_FMT " missing diagonals\n",dcount);
719: }
720: }
721: #endif
723: /* put together the new matrix */
724: MatSeqBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);
725: PetscLogObjectParent((PetscObject)fact,(PetscObject)isicol);
726: b = (Mat_SeqBAIJ*)fact->data;
728: b->free_a = PETSC_TRUE;
729: b->free_ij = PETSC_TRUE;
730: b->singlemalloc = PETSC_FALSE;
732: PetscMalloc1(bs2*ainew[n],&b->a);
734: b->j = ajnew;
735: b->i = ainew;
736: for (i=0; i<n; i++) dloc[i] += ainew[i];
737: b->diag = dloc;
738: b->free_diag = PETSC_TRUE;
739: b->ilen = NULL;
740: b->imax = NULL;
741: b->row = isrow;
742: b->col = iscol;
743: b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
745: PetscObjectReference((PetscObject)isrow);
746: PetscObjectReference((PetscObject)iscol);
747: b->icol = isicol;
748: PetscMalloc1(bs*n+bs,&b->solve_work);
749: /* In b structure: Free imax, ilen, old a, old j.
750: Allocate dloc, solve_work, new a, new j */
751: PetscLogObjectMemory((PetscObject)fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));
752: b->maxnz = b->nz = ainew[n];
754: fact->info.factor_mallocs = reallocate;
755: fact->info.fill_ratio_given = f;
756: fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
758: MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);
759: return 0;
760: }
762: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
763: {
764: /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; */
765: /* int i,*AJ=a->j,nz=a->nz; */
767: /* Undo Column scaling */
768: /* while (nz--) { */
769: /* AJ[i] = AJ[i]/4; */
770: /* } */
771: /* This should really invoke a push/pop logic, but we don't have that yet. */
772: A->ops->setunfactored = NULL;
773: return 0;
774: }
776: PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
777: {
778: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
779: PetscInt *AJ=a->j,nz=a->nz;
780: unsigned short *aj=(unsigned short*)AJ;
782: /* Is this really necessary? */
783: while (nz--) {
784: AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
785: }
786: A->ops->setunfactored = NULL;
787: return 0;
788: }