Actual source code: sbaijfact2.c


  2: /*
  3:     Factorization code for SBAIJ format.
  4: */

  6: #include <../src/mat/impls/sbaij/seq/sbaij.h>
  7: #include <../src/mat/impls/baij/seq/baij.h>
  8: #include <petsc/private/kernels/blockinvert.h>

 10: PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
 11: {
 12:   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
 13:   IS                isrow=a->row;
 14:   PetscInt          mbs  =a->mbs,*ai=a->i,*aj=a->j;
 15:   const PetscInt    *r;
 16:   PetscInt          nz,*vj,k,idx,k1;
 17:   PetscInt          bs =A->rmap->bs,bs2 = a->bs2;
 18:   const MatScalar   *aa=a->a,*v,*diag;
 19:   PetscScalar       *x,*xk,*xj,*xk_tmp,*t;
 20:   const PetscScalar *b;

 22:   VecGetArrayRead(bb,&b);
 23:   VecGetArray(xx,&x);
 24:   t    = a->solve_work;
 25:   ISGetIndices(isrow,&r);
 26:   PetscMalloc1(bs,&xk_tmp);

 28:   /* solve U^T * D * y = b by forward substitution */
 29:   xk = t;
 30:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
 31:     idx = bs*r[k];
 32:     for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
 33:   }
 34:   for (k=0; k<mbs; k++) {
 35:     v    = aa + bs2*ai[k];
 36:     xk   = t + k*bs;    /* Dk*xk = k-th block of x */
 37:     PetscArraycpy(xk_tmp,xk,bs); /* xk_tmp <- xk */
 38:     nz   = ai[k+1] - ai[k];
 39:     vj   = aj + ai[k];
 40:     xj   = t + (*vj)*bs; /* *vj-th block of x, *vj>k */
 41:     while (nz--) {
 42:       /* x(:) += U(k,:)^T*(Dk*xk) */
 43:       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
 44:       vj++; xj = t + (*vj)*bs;
 45:       v       += bs2;
 46:     }
 47:     /* xk = inv(Dk)*(Dk*xk) */
 48:     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
 49:     PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
 50:   }

 52:   /* solve U*x = y by back substitution */
 53:   for (k=mbs-1; k>=0; k--) {
 54:     v  = aa + bs2*ai[k];
 55:     xk = t + k*bs;        /* xk */
 56:     nz = ai[k+1] - ai[k];
 57:     vj = aj + ai[k];
 58:     xj = t + (*vj)*bs;
 59:     while (nz--) {
 60:       /* xk += U(k,:)*x(:) */
 61:       PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
 62:       vj++;
 63:       v += bs2; xj = t + (*vj)*bs;
 64:     }
 65:     idx = bs*r[k];
 66:     for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
 67:   }

 69:   PetscFree(xk_tmp);
 70:   ISRestoreIndices(isrow,&r);
 71:   VecRestoreArrayRead(bb,&b);
 72:   VecRestoreArray(xx,&x);
 73:   PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);
 74:   return 0;
 75: }

 77: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
 78: {
 79:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"not implemented yet");
 80: }

 82: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
 83: {
 84:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not yet implemented");
 85: }

 87: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
 88: {
 89:   PetscInt        nz,k;
 90:   const PetscInt  *vj,bs2 = bs*bs;
 91:   const MatScalar *v,*diag;
 92:   PetscScalar     *xk,*xj,*xk_tmp;

 94:   PetscMalloc1(bs,&xk_tmp);
 95:   for (k=0; k<mbs; k++) {
 96:     v    = aa + bs2*ai[k];
 97:     xk   = x + k*bs;    /* Dk*xk = k-th block of x */
 98:     PetscArraycpy(xk_tmp,xk,bs); /* xk_tmp <- xk */
 99:     nz   = ai[k+1] - ai[k];
100:     vj   = aj + ai[k];
101:     xj   = x + (*vj)*bs; /* *vj-th block of x, *vj>k */
102:     while (nz--) {
103:       /* x(:) += U(k,:)^T*(Dk*xk) */
104:       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
105:       vj++; xj = x + (*vj)*bs;
106:       v       += bs2;
107:     }
108:     /* xk = inv(Dk)*(Dk*xk) */
109:     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
110:     PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
111:   }
112:   PetscFree(xk_tmp);
113:   return 0;
114: }

116: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
117: {
118:   PetscInt        nz,k;
119:   const PetscInt  *vj,bs2 = bs*bs;
120:   const MatScalar *v;
121:   PetscScalar     *xk,*xj;

123:   for (k=mbs-1; k>=0; k--) {
124:     v  = aa + bs2*ai[k];
125:     xk = x + k*bs;        /* xk */
126:     nz = ai[k+1] - ai[k];
127:     vj = aj + ai[k];
128:     xj = x + (*vj)*bs;
129:     while (nz--) {
130:       /* xk += U(k,:)*x(:) */
131:       PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
132:       vj++;
133:       v += bs2; xj = x + (*vj)*bs;
134:     }
135:   }
136:   return 0;
137: }

139: PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
140: {
141:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
142:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
143:   PetscInt          bs =A->rmap->bs;
144:   const MatScalar   *aa=a->a;
145:   PetscScalar       *x;
146:   const PetscScalar *b;

148:   VecGetArrayRead(bb,&b);
149:   VecGetArray(xx,&x);

151:   /* solve U^T * D * y = b by forward substitution */
152:   PetscArraycpy(x,b,bs*mbs); /* x <- b */
153:   MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);

155:   /* solve U*x = y by back substitution */
156:   MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);

158:   VecRestoreArrayRead(bb,&b);
159:   VecRestoreArray(xx,&x);
160:   PetscLogFlops(4.0*a->bs2*a->nz - (bs+2.0*a->bs2)*mbs);
161:   return 0;
162: }

164: PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
165: {
166:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
167:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
168:   PetscInt          bs =A->rmap->bs;
169:   const MatScalar   *aa=a->a;
170:   const PetscScalar *b;
171:   PetscScalar       *x;

173:   VecGetArrayRead(bb,&b);
174:   VecGetArray(xx,&x);
175:   PetscArraycpy(x,b,bs*mbs); /* x <- b */
176:   MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
177:   VecRestoreArrayRead(bb,&b);
178:   VecRestoreArray(xx,&x);
179:   PetscLogFlops(2.0*a->bs2*a->nz - bs*mbs);
180:   return 0;
181: }

183: PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
184: {
185:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
186:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
187:   PetscInt          bs =A->rmap->bs;
188:   const MatScalar   *aa=a->a;
189:   const PetscScalar *b;
190:   PetscScalar       *x;

192:   VecGetArrayRead(bb,&b);
193:   VecGetArray(xx,&x);
194:   PetscArraycpy(x,b,bs*mbs);
195:   MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);
196:   VecRestoreArrayRead(bb,&b);
197:   VecRestoreArray(xx,&x);
198:   PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
199:   return 0;
200: }

202: PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
203: {
204:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
205:   IS                isrow=a->row;
206:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j,*r,*vj;
207:   PetscInt          nz,k,idx;
208:   const MatScalar   *aa=a->a,*v,*d;
209:   PetscScalar       *x,x0,x1,x2,x3,x4,x5,x6,*t,*tp;
210:   const PetscScalar *b;

212:   VecGetArrayRead(bb,&b);
213:   VecGetArray(xx,&x);
214:   t    = a->solve_work;
215:   ISGetIndices(isrow,&r);

217:   /* solve U^T * D * y = b by forward substitution */
218:   tp = t;
219:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
220:     idx   = 7*r[k];
221:     tp[0] = b[idx];
222:     tp[1] = b[idx+1];
223:     tp[2] = b[idx+2];
224:     tp[3] = b[idx+3];
225:     tp[4] = b[idx+4];
226:     tp[5] = b[idx+5];
227:     tp[6] = b[idx+6];
228:     tp   += 7;
229:   }

231:   for (k=0; k<mbs; k++) {
232:     v  = aa + 49*ai[k];
233:     vj = aj + ai[k];
234:     tp = t + k*7;
235:     x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
236:     nz = ai[k+1] - ai[k];
237:     tp = t + (*vj)*7;
238:     while (nz--) {
239:       tp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
240:       tp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
241:       tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
242:       tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
243:       tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
244:       tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
245:       tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
246:       vj++;
247:       tp = t + (*vj)*7;
248:       v += 49;
249:     }

251:     /* xk = inv(Dk)*(Dk*xk) */
252:     d     = aa+k*49;       /* ptr to inv(Dk) */
253:     tp    = t + k*7;
254:     tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
255:     tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
256:     tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
257:     tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
258:     tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
259:     tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
260:     tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
261:   }

263:   /* solve U*x = y by back substitution */
264:   for (k=mbs-1; k>=0; k--) {
265:     v  = aa + 49*ai[k];
266:     vj = aj + ai[k];
267:     tp = t + k*7;
268:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  x6=tp[6]; /* xk */
269:     nz = ai[k+1] - ai[k];

271:     tp = t + (*vj)*7;
272:     while (nz--) {
273:       /* xk += U(k,:)*x(:) */
274:       x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
275:       x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
276:       x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
277:       x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
278:       x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
279:       x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
280:       x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
281:       vj++;
282:       tp = t + (*vj)*7;
283:       v += 49;
284:     }
285:     tp       = t + k*7;
286:     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
287:     idx      = 7*r[k];
288:     x[idx]   = x0;
289:     x[idx+1] = x1;
290:     x[idx+2] = x2;
291:     x[idx+3] = x3;
292:     x[idx+4] = x4;
293:     x[idx+5] = x5;
294:     x[idx+6] = x6;
295:   }

297:   ISRestoreIndices(isrow,&r);
298:   VecRestoreArrayRead(bb,&b);
299:   VecRestoreArray(xx,&x);
300:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
301:   return 0;
302: }

304: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
305: {
306:   const MatScalar *v,*d;
307:   PetscScalar     *xp,x0,x1,x2,x3,x4,x5,x6;
308:   PetscInt        nz,k;
309:   const PetscInt  *vj;

311:   for (k=0; k<mbs; k++) {
312:     v  = aa + 49*ai[k];
313:     xp = x + k*7;
314:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
315:     nz = ai[k+1] - ai[k];
316:     vj = aj + ai[k];
317:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
318:     PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
319:     while (nz--) {
320:       xp = x + (*vj)*7;
321:       /* x(:) += U(k,:)^T*(Dk*xk) */
322:       xp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
323:       xp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
324:       xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
325:       xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
326:       xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
327:       xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
328:       xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
329:       vj++;
330:       v += 49;
331:     }
332:     /* xk = inv(Dk)*(Dk*xk) */
333:     d     = aa+k*49;       /* ptr to inv(Dk) */
334:     xp    = x + k*7;
335:     xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
336:     xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
337:     xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
338:     xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
339:     xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
340:     xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
341:     xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
342:   }
343:   return 0;
344: }

346: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
347: {
348:   const MatScalar *v;
349:   PetscScalar     *xp,x0,x1,x2,x3,x4,x5,x6;
350:   PetscInt        nz,k;
351:   const PetscInt  *vj;

353:   for (k=mbs-1; k>=0; k--) {
354:     v  = aa + 49*ai[k];
355:     xp = x + k*7;
356:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
357:     nz = ai[k+1] - ai[k];
358:     vj = aj + ai[k];
359:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
360:     PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
361:     while (nz--) {
362:       xp = x + (*vj)*7;
363:       /* xk += U(k,:)*x(:) */
364:       x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
365:       x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
366:       x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
367:       x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
368:       x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
369:       x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
370:       x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
371:       vj++;
372:       v += 49;
373:     }
374:     xp = x + k*7;
375:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
376:   }
377:   return 0;
378: }

380: PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
381: {
382:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
383:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
384:   const MatScalar   *aa=a->a;
385:   PetscScalar       *x;
386:   const PetscScalar *b;

388:   VecGetArrayRead(bb,&b);
389:   VecGetArray(xx,&x);

391:   /* solve U^T * D * y = b by forward substitution */
392:   PetscArraycpy(x,b,7*mbs); /* x <- b */
393:   MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);

395:   /* solve U*x = y by back substitution */
396:   MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);

398:   VecRestoreArrayRead(bb,&b);
399:   VecRestoreArray(xx,&x);
400:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
401:   return 0;
402: }

404: PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
405: {
406:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
407:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
408:   const MatScalar   *aa=a->a;
409:   PetscScalar       *x;
410:   const PetscScalar *b;

412:   VecGetArrayRead(bb,&b);
413:   VecGetArray(xx,&x);
414:   PetscArraycpy(x,b,7*mbs);
415:   MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
416:   VecRestoreArrayRead(bb,&b);
417:   VecRestoreArray(xx,&x);
418:   PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
419:   return 0;
420: }

422: PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
423: {
424:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
425:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
426:   const MatScalar   *aa=a->a;
427:   PetscScalar       *x;
428:   const PetscScalar *b;

430:   VecGetArrayRead(bb,&b);
431:   VecGetArray(xx,&x);
432:   PetscArraycpy(x,b,7*mbs);
433:   MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);
434:   VecRestoreArrayRead(bb,&b);
435:   VecRestoreArray(xx,&x);
436:   PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
437:   return 0;
438: }

440: PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
441: {
442:   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
443:   IS                isrow=a->row;
444:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j,*r,*vj;
445:   PetscInt          nz,k,idx;
446:   const MatScalar   *aa=a->a,*v,*d;
447:   PetscScalar       *x,x0,x1,x2,x3,x4,x5,*t,*tp;
448:   const PetscScalar *b;

450:   VecGetArrayRead(bb,&b);
451:   VecGetArray(xx,&x);
452:   t    = a->solve_work;
453:   ISGetIndices(isrow,&r);

455:   /* solve U^T * D * y = b by forward substitution */
456:   tp = t;
457:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
458:     idx   = 6*r[k];
459:     tp[0] = b[idx];
460:     tp[1] = b[idx+1];
461:     tp[2] = b[idx+2];
462:     tp[3] = b[idx+3];
463:     tp[4] = b[idx+4];
464:     tp[5] = b[idx+5];
465:     tp   += 6;
466:   }

468:   for (k=0; k<mbs; k++) {
469:     v  = aa + 36*ai[k];
470:     vj = aj + ai[k];
471:     tp = t + k*6;
472:     x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
473:     nz = ai[k+1] - ai[k];
474:     tp = t + (*vj)*6;
475:     while (nz--) {
476:       tp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
477:       tp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
478:       tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
479:       tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
480:       tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
481:       tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
482:       vj++;
483:       tp = t + (*vj)*6;
484:       v += 36;
485:     }

487:     /* xk = inv(Dk)*(Dk*xk) */
488:     d     = aa+k*36;       /* ptr to inv(Dk) */
489:     tp    = t + k*6;
490:     tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
491:     tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
492:     tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
493:     tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
494:     tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
495:     tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
496:   }

498:   /* solve U*x = y by back substitution */
499:   for (k=mbs-1; k>=0; k--) {
500:     v  = aa + 36*ai[k];
501:     vj = aj + ai[k];
502:     tp = t + k*6;
503:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */
504:     nz = ai[k+1] - ai[k];

506:     tp = t + (*vj)*6;
507:     while (nz--) {
508:       /* xk += U(k,:)*x(:) */
509:       x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
510:       x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
511:       x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
512:       x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
513:       x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
514:       x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
515:       vj++;
516:       tp = t + (*vj)*6;
517:       v += 36;
518:     }
519:     tp       = t + k*6;
520:     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
521:     idx      = 6*r[k];
522:     x[idx]   = x0;
523:     x[idx+1] = x1;
524:     x[idx+2] = x2;
525:     x[idx+3] = x3;
526:     x[idx+4] = x4;
527:     x[idx+5] = x5;
528:   }

530:   ISRestoreIndices(isrow,&r);
531:   VecRestoreArrayRead(bb,&b);
532:   VecRestoreArray(xx,&x);
533:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
534:   return 0;
535: }

537: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
538: {
539:   const MatScalar *v,*d;
540:   PetscScalar     *xp,x0,x1,x2,x3,x4,x5;
541:   PetscInt        nz,k;
542:   const PetscInt  *vj;

544:   for (k=0; k<mbs; k++) {
545:     v  = aa + 36*ai[k];
546:     xp = x + k*6;
547:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
548:     nz = ai[k+1] - ai[k];
549:     vj = aj + ai[k];
550:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
551:     PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
552:     while (nz--) {
553:       xp = x + (*vj)*6;
554:       /* x(:) += U(k,:)^T*(Dk*xk) */
555:       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
556:       xp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
557:       xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
558:       xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
559:       xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
560:       xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
561:       vj++;
562:       v += 36;
563:     }
564:     /* xk = inv(Dk)*(Dk*xk) */
565:     d     = aa+k*36;       /* ptr to inv(Dk) */
566:     xp    = x + k*6;
567:     xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
568:     xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
569:     xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
570:     xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
571:     xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
572:     xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
573:   }
574:   return 0;
575: }
576: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
577: {
578:   const MatScalar   *v;
579:   PetscScalar       *xp,x0,x1,x2,x3,x4,x5;
580:   PetscInt          nz,k;
581:   const PetscInt    *vj;

583:   for (k=mbs-1; k>=0; k--) {
584:     v  = aa + 36*ai[k];
585:     xp = x + k*6;
586:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
587:     nz = ai[k+1] - ai[k];
588:     vj = aj + ai[k];
589:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
590:     PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
591:     while (nz--) {
592:       xp = x + (*vj)*6;
593:       /* xk += U(k,:)*x(:) */
594:       x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
595:       x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
596:       x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
597:       x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
598:       x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
599:       x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
600:       vj++;
601:       v += 36;
602:     }
603:     xp   = x + k*6;
604:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
605:   }
606:   return 0;
607: }

609: PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
610: {
611:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
612:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
613:   const MatScalar   *aa=a->a;
614:   PetscScalar       *x;
615:   const PetscScalar *b;

617:   VecGetArrayRead(bb,&b);
618:   VecGetArray(xx,&x);

620:   /* solve U^T * D * y = b by forward substitution */
621:   PetscArraycpy(x,b,6*mbs); /* x <- b */
622:   MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);

624:   /* solve U*x = y by back substitution */
625:   MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);

627:   VecRestoreArrayRead(bb,&b);
628:   VecRestoreArray(xx,&x);
629:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
630:   return 0;
631: }

633: PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
634: {
635:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
636:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
637:   const MatScalar   *aa=a->a;
638:   PetscScalar       *x;
639:   const PetscScalar *b;

641:   VecGetArrayRead(bb,&b);
642:   VecGetArray(xx,&x);
643:   PetscArraycpy(x,b,6*mbs); /* x <- b */
644:   MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
645:   VecRestoreArrayRead(bb,&b);
646:   VecRestoreArray(xx,&x);
647:   PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
648:   return 0;
649: }

651: PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
652: {
653:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
654:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
655:   const MatScalar   *aa=a->a;
656:   PetscScalar       *x;
657:   const PetscScalar *b;

659:   VecGetArrayRead(bb,&b);
660:   VecGetArray(xx,&x);
661:   PetscArraycpy(x,b,6*mbs); /* x <- b */
662:   MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);
663:   VecRestoreArrayRead(bb,&b);
664:   VecRestoreArray(xx,&x);
665:   PetscLogFlops(2.0*a->bs2*(a->nz - mbs));
666:   return 0;
667: }

669: PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
670: {
671:   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
672:   IS                isrow=a->row;
673:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
674:   const PetscInt    *r,*vj;
675:   PetscInt          nz,k,idx;
676:   const MatScalar   *aa=a->a,*v,*diag;
677:   PetscScalar       *x,x0,x1,x2,x3,x4,*t,*tp;
678:   const PetscScalar *b;

680:   VecGetArrayRead(bb,&b);
681:   VecGetArray(xx,&x);
682:   t    = a->solve_work;
683:   ISGetIndices(isrow,&r);

685:   /* solve U^T * D * y = b by forward substitution */
686:   tp = t;
687:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
688:     idx   = 5*r[k];
689:     tp[0] = b[idx];
690:     tp[1] = b[idx+1];
691:     tp[2] = b[idx+2];
692:     tp[3] = b[idx+3];
693:     tp[4] = b[idx+4];
694:     tp   += 5;
695:   }

697:   for (k=0; k<mbs; k++) {
698:     v  = aa + 25*ai[k];
699:     vj = aj + ai[k];
700:     tp = t + k*5;
701:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
702:     nz = ai[k+1] - ai[k];

704:     tp = t + (*vj)*5;
705:     while (nz--) {
706:       tp[0] +=  v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
707:       tp[1] +=  v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
708:       tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
709:       tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
710:       tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
711:       vj++;
712:       tp = t + (*vj)*5;
713:       v += 25;
714:     }

716:     /* xk = inv(Dk)*(Dk*xk) */
717:     diag  = aa+k*25;          /* ptr to inv(Dk) */
718:     tp    = t + k*5;
719:     tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
720:     tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
721:     tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
722:     tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
723:     tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
724:   }

726:   /* solve U*x = y by back substitution */
727:   for (k=mbs-1; k>=0; k--) {
728:     v  = aa + 25*ai[k];
729:     vj = aj + ai[k];
730:     tp = t + k*5;
731:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; /* xk */
732:     nz = ai[k+1] - ai[k];

734:     tp = t + (*vj)*5;
735:     while (nz--) {
736:       /* xk += U(k,:)*x(:) */
737:       x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
738:       x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
739:       x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
740:       x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
741:       x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
742:       vj++;
743:       tp = t + (*vj)*5;
744:       v += 25;
745:     }
746:     tp       = t + k*5;
747:     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
748:     idx      = 5*r[k];
749:     x[idx]   = x0;
750:     x[idx+1] = x1;
751:     x[idx+2] = x2;
752:     x[idx+3] = x3;
753:     x[idx+4] = x4;
754:   }

756:   ISRestoreIndices(isrow,&r);
757:   VecRestoreArrayRead(bb,&b);
758:   VecRestoreArray(xx,&x);
759:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
760:   return 0;
761: }

763: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
764: {
765:   const MatScalar *v,*diag;
766:   PetscScalar     *xp,x0,x1,x2,x3,x4;
767:   PetscInt        nz,k;
768:   const PetscInt  *vj;

770:   for (k=0; k<mbs; k++) {
771:     v  = aa + 25*ai[k];
772:     xp = x + k*5;
773:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
774:     nz = ai[k+1] - ai[k];
775:     vj = aj + ai[k];
776:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
777:     PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
778:     while (nz--) {
779:       xp = x + (*vj)*5;
780:       /* x(:) += U(k,:)^T*(Dk*xk) */
781:       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4;
782:       xp[1] +=  v[5]*x0 +  v[6]*x1 +  v[7]*x2 + v[8]*x3 + v[9]*x4;
783:       xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
784:       xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
785:       xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
786:       vj++;
787:       v += 25;
788:     }
789:     /* xk = inv(Dk)*(Dk*xk) */
790:     diag  = aa+k*25;         /* ptr to inv(Dk) */
791:     xp    = x + k*5;
792:     xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
793:     xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
794:     xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
795:     xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
796:     xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
797:   }
798:   return 0;
799: }

801: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
802: {
803:   const MatScalar *v;
804:   PetscScalar     *xp,x0,x1,x2,x3,x4;
805:   PetscInt        nz,k;
806:   const PetscInt  *vj;

808:   for (k=mbs-1; k>=0; k--) {
809:     v  = aa + 25*ai[k];
810:     xp = x + k*5;
811:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; /* xk */
812:     nz = ai[k+1] - ai[k];
813:     vj = aj + ai[k];
814:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
815:     PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
816:     while (nz--) {
817:       xp = x + (*vj)*5;
818:       /* xk += U(k,:)*x(:) */
819:       x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
820:       x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
821:       x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
822:       x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
823:       x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
824:       vj++;
825:       v += 25;
826:     }
827:     xp   = x + k*5;
828:     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
829:   }
830:   return 0;
831: }

833: PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
834: {
835:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
836:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
837:   const MatScalar   *aa=a->a;
838:   PetscScalar       *x;
839:   const PetscScalar *b;

841:   VecGetArrayRead(bb,&b);
842:   VecGetArray(xx,&x);

844:   /* solve U^T * D * y = b by forward substitution */
845:   PetscArraycpy(x,b,5*mbs); /* x <- b */
846:   MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);

848:   /* solve U*x = y by back substitution */
849:   MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);

851:   VecRestoreArrayRead(bb,&b);
852:   VecRestoreArray(xx,&x);
853:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
854:   return 0;
855: }

857: PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
858: {
859:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
860:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
861:   const MatScalar   *aa=a->a;
862:   PetscScalar       *x;
863:   const PetscScalar *b;

865:   VecGetArrayRead(bb,&b);
866:   VecGetArray(xx,&x);
867:   PetscArraycpy(x,b,5*mbs); /* x <- b */
868:   MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
869:   VecRestoreArrayRead(bb,&b);
870:   VecRestoreArray(xx,&x);
871:   PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
872:   return 0;
873: }

875: PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
876: {
877:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
878:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
879:   const MatScalar   *aa=a->a;
880:   PetscScalar       *x;
881:   const PetscScalar *b;

883:   VecGetArrayRead(bb,&b);
884:   VecGetArray(xx,&x);
885:   PetscArraycpy(x,b,5*mbs);
886:   MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);
887:   VecRestoreArrayRead(bb,&b);
888:   VecRestoreArray(xx,&x);
889:   PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
890:   return 0;
891: }

893: PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
894: {
895:   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
896:   IS                isrow=a->row;
897:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
898:   const PetscInt    *r,*vj;
899:   PetscInt          nz,k,idx;
900:   const MatScalar   *aa=a->a,*v,*diag;
901:   PetscScalar       *x,x0,x1,x2,x3,*t,*tp;
902:   const PetscScalar *b;

904:   VecGetArrayRead(bb,&b);
905:   VecGetArray(xx,&x);
906:   t    = a->solve_work;
907:   ISGetIndices(isrow,&r);

909:   /* solve U^T * D * y = b by forward substitution */
910:   tp = t;
911:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
912:     idx   = 4*r[k];
913:     tp[0] = b[idx];
914:     tp[1] = b[idx+1];
915:     tp[2] = b[idx+2];
916:     tp[3] = b[idx+3];
917:     tp   += 4;
918:   }

920:   for (k=0; k<mbs; k++) {
921:     v  = aa + 16*ai[k];
922:     vj = aj + ai[k];
923:     tp = t + k*4;
924:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
925:     nz = ai[k+1] - ai[k];

927:     tp = t + (*vj)*4;
928:     while (nz--) {
929:       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
930:       tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
931:       tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
932:       tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
933:       vj++;
934:       tp = t + (*vj)*4;
935:       v += 16;
936:     }

938:     /* xk = inv(Dk)*(Dk*xk) */
939:     diag  = aa+k*16;          /* ptr to inv(Dk) */
940:     tp    = t + k*4;
941:     tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
942:     tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
943:     tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
944:     tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
945:   }

947:   /* solve U*x = y by back substitution */
948:   for (k=mbs-1; k>=0; k--) {
949:     v  = aa + 16*ai[k];
950:     vj = aj + ai[k];
951:     tp = t + k*4;
952:     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
953:     nz = ai[k+1] - ai[k];

955:     tp = t + (*vj)*4;
956:     while (nz--) {
957:       /* xk += U(k,:)*x(:) */
958:       x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
959:       x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
960:       x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
961:       x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
962:       vj++;
963:       tp = t + (*vj)*4;
964:       v += 16;
965:     }
966:     tp       = t + k*4;
967:     tp[0]    =x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
968:     idx      = 4*r[k];
969:     x[idx]   = x0;
970:     x[idx+1] = x1;
971:     x[idx+2] = x2;
972:     x[idx+3] = x3;
973:   }

975:   ISRestoreIndices(isrow,&r);
976:   VecRestoreArrayRead(bb,&b);
977:   VecRestoreArray(xx,&x);
978:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
979:   return 0;
980: }

982: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
983: {
984:   const MatScalar *v,*diag;
985:   PetscScalar     *xp,x0,x1,x2,x3;
986:   PetscInt        nz,k;
987:   const PetscInt  *vj;

989:   for (k=0; k<mbs; k++) {
990:     v  = aa + 16*ai[k];
991:     xp = x + k*4;
992:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
993:     nz = ai[k+1] - ai[k];
994:     vj = aj + ai[k];
995:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
996:     PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
997:     while (nz--) {
998:       xp = x + (*vj)*4;
999:       /* x(:) += U(k,:)^T*(Dk*xk) */
1000:       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1001:       xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1002:       xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1003:       xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1004:       vj++;
1005:       v += 16;
1006:     }
1007:     /* xk = inv(Dk)*(Dk*xk) */
1008:     diag  = aa+k*16;         /* ptr to inv(Dk) */
1009:     xp    = x + k*4;
1010:     xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1011:     xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1012:     xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1013:     xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1014:   }
1015:   return 0;
1016: }

1018: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1019: {
1020:   const MatScalar *v;
1021:   PetscScalar     *xp,x0,x1,x2,x3;
1022:   PetscInt        nz,k;
1023:   const PetscInt  *vj;

1025:   for (k=mbs-1; k>=0; k--) {
1026:     v  = aa + 16*ai[k];
1027:     xp = x + k*4;
1028:     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
1029:     nz = ai[k+1] - ai[k];
1030:     vj = aj + ai[k];
1031:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
1032:     PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1033:     while (nz--) {
1034:       xp = x + (*vj)*4;
1035:       /* xk += U(k,:)*x(:) */
1036:       x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
1037:       x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
1038:       x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
1039:       x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
1040:       vj++;
1041:       v += 16;
1042:     }
1043:     xp    = x + k*4;
1044:     xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
1045:   }
1046:   return 0;
1047: }

1049: PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1050: {
1051:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1052:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1053:   const MatScalar   *aa=a->a;
1054:   PetscScalar       *x;
1055:   const PetscScalar *b;

1057:   VecGetArrayRead(bb,&b);
1058:   VecGetArray(xx,&x);

1060:   /* solve U^T * D * y = b by forward substitution */
1061:   PetscArraycpy(x,b,4*mbs); /* x <- b */
1062:   MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);

1064:   /* solve U*x = y by back substitution */
1065:   MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1066:   VecRestoreArrayRead(bb,&b);
1067:   VecRestoreArray(xx,&x);
1068:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1069:   return 0;
1070: }

1072: PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1073: {
1074:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1075:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1076:   const MatScalar   *aa=a->a;
1077:   PetscScalar       *x;
1078:   const PetscScalar *b;

1080:   VecGetArrayRead(bb,&b);
1081:   VecGetArray(xx,&x);
1082:   PetscArraycpy(x,b,4*mbs); /* x <- b */
1083:   MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1084:   VecRestoreArrayRead(bb,&b);
1085:   VecRestoreArray(xx,&x);
1086:   PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1087:   return 0;
1088: }

1090: PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1091: {
1092:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1093:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1094:   const MatScalar   *aa=a->a;
1095:   PetscScalar       *x;
1096:   const PetscScalar *b;

1098:   VecGetArrayRead(bb,&b);
1099:   VecGetArray(xx,&x);
1100:   PetscArraycpy(x,b,4*mbs);
1101:   MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);
1102:   VecRestoreArrayRead(bb,&b);
1103:   VecRestoreArray(xx,&x);
1104:   PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
1105:   return 0;
1106: }

1108: PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1109: {
1110:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1111:   IS                isrow=a->row;
1112:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
1113:   const PetscInt    *r;
1114:   PetscInt          nz,k,idx;
1115:   const PetscInt    *vj;
1116:   const MatScalar   *aa=a->a,*v,*diag;
1117:   PetscScalar       *x,x0,x1,x2,*t,*tp;
1118:   const PetscScalar *b;

1120:   VecGetArrayRead(bb,&b);
1121:   VecGetArray(xx,&x);
1122:   t    = a->solve_work;
1123:   ISGetIndices(isrow,&r);

1125:   /* solve U^T * D * y = b by forward substitution */
1126:   tp = t;
1127:   for (k=0; k<mbs; k++) { /* t <- perm(b) */
1128:     idx   = 3*r[k];
1129:     tp[0] = b[idx];
1130:     tp[1] = b[idx+1];
1131:     tp[2] = b[idx+2];
1132:     tp   += 3;
1133:   }

1135:   for (k=0; k<mbs; k++) {
1136:     v  = aa + 9*ai[k];
1137:     vj = aj + ai[k];
1138:     tp = t + k*3;
1139:     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
1140:     nz = ai[k+1] - ai[k];

1142:     tp = t + (*vj)*3;
1143:     while (nz--) {
1144:       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1145:       tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1146:       tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1147:       vj++;
1148:       tp = t + (*vj)*3;
1149:       v += 9;
1150:     }

1152:     /* xk = inv(Dk)*(Dk*xk) */
1153:     diag  = aa+k*9;          /* ptr to inv(Dk) */
1154:     tp    = t + k*3;
1155:     tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1156:     tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1157:     tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1158:   }

1160:   /* solve U*x = y by back substitution */
1161:   for (k=mbs-1; k>=0; k--) {
1162:     v  = aa + 9*ai[k];
1163:     vj = aj + ai[k];
1164:     tp = t + k*3;
1165:     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];  /* xk */
1166:     nz = ai[k+1] - ai[k];

1168:     tp = t + (*vj)*3;
1169:     while (nz--) {
1170:       /* xk += U(k,:)*x(:) */
1171:       x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
1172:       x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
1173:       x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
1174:       vj++;
1175:       tp = t + (*vj)*3;
1176:       v += 9;
1177:     }
1178:     tp       = t + k*3;
1179:     tp[0]    = x0; tp[1] = x1; tp[2] = x2;
1180:     idx      = 3*r[k];
1181:     x[idx]   = x0;
1182:     x[idx+1] = x1;
1183:     x[idx+2] = x2;
1184:   }

1186:   ISRestoreIndices(isrow,&r);
1187:   VecRestoreArrayRead(bb,&b);
1188:   VecRestoreArray(xx,&x);
1189:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1190:   return 0;
1191: }

1193: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1194: {
1195:   const MatScalar *v,*diag;
1196:   PetscScalar     *xp,x0,x1,x2;
1197:   PetscInt        nz,k;
1198:   const PetscInt  *vj;

1200:   for (k=0; k<mbs; k++) {
1201:     v  = aa + 9*ai[k];
1202:     xp = x + k*3;
1203:     x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1204:     nz = ai[k+1] - ai[k];
1205:     vj = aj + ai[k];
1206:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1207:     PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1208:     while (nz--) {
1209:       xp = x + (*vj)*3;
1210:       /* x(:) += U(k,:)^T*(Dk*xk) */
1211:       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1212:       xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1213:       xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1214:       vj++;
1215:       v += 9;
1216:     }
1217:     /* xk = inv(Dk)*(Dk*xk) */
1218:     diag  = aa+k*9;         /* ptr to inv(Dk) */
1219:     xp    = x + k*3;
1220:     xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1221:     xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1222:     xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1223:   }
1224:   return 0;
1225: }

1227: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1228: {
1229:   const MatScalar *v;
1230:   PetscScalar     *xp,x0,x1,x2;
1231:   PetscInt        nz,k;
1232:   const PetscInt  *vj;

1234:   for (k=mbs-1; k>=0; k--) {
1235:     v  = aa + 9*ai[k];
1236:     xp = x + k*3;
1237:     x0 = xp[0]; x1 = xp[1]; x2 = xp[2];  /* xk */
1238:     nz = ai[k+1] - ai[k];
1239:     vj = aj + ai[k];
1240:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1241:     PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1242:     while (nz--) {
1243:       xp = x + (*vj)*3;
1244:       /* xk += U(k,:)*x(:) */
1245:       x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1246:       x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1247:       x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1248:       vj++;
1249:       v += 9;
1250:     }
1251:     xp    = x + k*3;
1252:     xp[0] = x0; xp[1] = x1; xp[2] = x2;
1253:   }
1254:   return 0;
1255: }

1257: PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1258: {
1259:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1260:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1261:   const MatScalar   *aa=a->a;
1262:   PetscScalar       *x;
1263:   const PetscScalar *b;

1265:   VecGetArrayRead(bb,&b);
1266:   VecGetArray(xx,&x);

1268:   /* solve U^T * D * y = b by forward substitution */
1269:   PetscArraycpy(x,b,3*mbs);
1270:   MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);

1272:   /* solve U*x = y by back substitution */
1273:   MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);

1275:   VecRestoreArrayRead(bb,&b);
1276:   VecRestoreArray(xx,&x);
1277:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1278:   return 0;
1279: }

1281: PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1282: {
1283:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1284:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1285:   const MatScalar   *aa=a->a;
1286:   PetscScalar       *x;
1287:   const PetscScalar *b;

1289:   VecGetArrayRead(bb,&b);
1290:   VecGetArray(xx,&x);
1291:   PetscArraycpy(x,b,3*mbs);
1292:   MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1293:   VecRestoreArrayRead(bb,&b);
1294:   VecRestoreArray(xx,&x);
1295:   PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1296:   return 0;
1297: }

1299: PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1300: {
1301:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1302:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1303:   const MatScalar   *aa=a->a;
1304:   PetscScalar       *x;
1305:   const PetscScalar *b;

1307:   VecGetArrayRead(bb,&b);
1308:   VecGetArray(xx,&x);
1309:   PetscArraycpy(x,b,3*mbs);
1310:   MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);
1311:   VecRestoreArrayRead(bb,&b);
1312:   VecRestoreArray(xx,&x);
1313:   PetscLogFlops(2.0*a->bs2*(a->nz-mbs));
1314:   return 0;
1315: }

1317: PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1318: {
1319:   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
1320:   IS                isrow=a->row;
1321:   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
1322:   const PetscInt    *r,*vj;
1323:   PetscInt          nz,k,k2,idx;
1324:   const MatScalar   *aa=a->a,*v,*diag;
1325:   PetscScalar       *x,x0,x1,*t;
1326:   const PetscScalar *b;

1328:   VecGetArrayRead(bb,&b);
1329:   VecGetArray(xx,&x);
1330:   t    = a->solve_work;
1331:   ISGetIndices(isrow,&r);

1333:   /* solve U^T * D * y = perm(b) by forward substitution */
1334:   for (k=0; k<mbs; k++) {  /* t <- perm(b) */
1335:     idx      = 2*r[k];
1336:     t[k*2]   = b[idx];
1337:     t[k*2+1] = b[idx+1];
1338:   }
1339:   for (k=0; k<mbs; k++) {
1340:     v  = aa + 4*ai[k];
1341:     vj = aj + ai[k];
1342:     k2 = k*2;
1343:     x0 = t[k2]; x1 = t[k2+1];
1344:     nz = ai[k+1] - ai[k];
1345:     while (nz--) {
1346:       t[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1347:       t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1348:       vj++; v      += 4;
1349:     }
1350:     diag    = aa+k*4; /* ptr to inv(Dk) */
1351:     t[k2]   = diag[0]*x0 + diag[2]*x1;
1352:     t[k2+1] = diag[1]*x0 + diag[3]*x1;
1353:   }

1355:   /* solve U*x = y by back substitution */
1356:   for (k=mbs-1; k>=0; k--) {
1357:     v  = aa + 4*ai[k];
1358:     vj = aj + ai[k];
1359:     k2 = k*2;
1360:     x0 = t[k2]; x1 = t[k2+1];
1361:     nz = ai[k+1] - ai[k];
1362:     while (nz--) {
1363:       x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1364:       x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1365:       vj++;
1366:       v += 4;
1367:     }
1368:     t[k2]    = x0;
1369:     t[k2+1]  = x1;
1370:     idx      = 2*r[k];
1371:     x[idx]   = x0;
1372:     x[idx+1] = x1;
1373:   }

1375:   ISRestoreIndices(isrow,&r);
1376:   VecRestoreArrayRead(bb,&b);
1377:   VecRestoreArray(xx,&x);
1378:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1379:   return 0;
1380: }

1382: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1383: {
1384:   const MatScalar *v,*diag;
1385:   PetscScalar     x0,x1;
1386:   PetscInt        nz,k,k2;
1387:   const PetscInt  *vj;

1389:   for (k=0; k<mbs; k++) {
1390:     v  = aa + 4*ai[k];
1391:     vj = aj + ai[k];
1392:     k2 = k*2;
1393:     x0 = x[k2]; x1 = x[k2+1];  /* Dk*xk = k-th block of x */
1394:     nz = ai[k+1] - ai[k];
1395:     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1396:     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1397:     while (nz--) {
1398:       /* x(:) += U(k,:)^T*(Dk*xk) */
1399:       x[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1400:       x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1401:       vj++; v      += 4;
1402:     }
1403:     /* xk = inv(Dk)*(Dk*xk) */
1404:     diag    = aa+k*4;       /* ptr to inv(Dk) */
1405:     x[k2]   = diag[0]*x0 + diag[2]*x1;
1406:     x[k2+1] = diag[1]*x0 + diag[3]*x1;
1407:   }
1408:   return 0;
1409: }

1411: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1412: {
1413:   const MatScalar *v;
1414:   PetscScalar     x0,x1;
1415:   PetscInt        nz,k,k2;
1416:   const PetscInt  *vj;

1418:   for (k=mbs-1; k>=0; k--) {
1419:     v  = aa + 4*ai[k];
1420:     vj = aj + ai[k];
1421:     k2 = k*2;
1422:     x0 = x[k2]; x1 = x[k2+1];  /* xk */
1423:     nz = ai[k+1] - ai[k];
1424:     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1425:     PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1426:     while (nz--) {
1427:       /* xk += U(k,:)*x(:) */
1428:       x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1429:       x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1430:       vj++;
1431:       v += 4;
1432:     }
1433:     x[k2]   = x0;
1434:     x[k2+1] = x1;
1435:   }
1436:   return 0;
1437: }

1439: PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1440: {
1441:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1442:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1443:   const MatScalar   *aa=a->a;
1444:   PetscScalar       *x;
1445:   const PetscScalar *b;

1447:   VecGetArrayRead(bb,&b);
1448:   VecGetArray(xx,&x);

1450:   /* solve U^T * D * y = b by forward substitution */
1451:   PetscArraycpy(x,b,2*mbs);
1452:   MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);

1454:   /* solve U*x = y by back substitution */
1455:   MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);

1457:   VecRestoreArrayRead(bb,&b);
1458:   VecRestoreArray(xx,&x);
1459:   PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);
1460:   return 0;
1461: }

1463: PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1464: {
1465:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1466:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1467:   const MatScalar   *aa=a->a;
1468:   PetscScalar       *x;
1469:   const PetscScalar *b;

1471:   VecGetArrayRead(bb,&b);
1472:   VecGetArray(xx,&x);
1473:   PetscArraycpy(x,b,2*mbs);
1474:   MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1475:   VecRestoreArrayRead(bb,&b);
1476:   VecRestoreArray(xx,&x);
1477:   PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);
1478:   return 0;
1479: }

1481: PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1482: {
1483:   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1484:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1485:   const MatScalar   *aa=a->a;
1486:   PetscScalar       *x;
1487:   const PetscScalar *b;

1489:   VecGetArrayRead(bb,&b);
1490:   VecGetArray(xx,&x);
1491:   PetscArraycpy(x,b,2*mbs);
1492:   MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);
1493:   VecRestoreArrayRead(bb,&b);
1494:   VecRestoreArray(xx,&x);
1495:   PetscLogFlops(2.0*a->bs2*(a->nz - mbs));
1496:   return 0;
1497: }

1499: PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1500: {
1501:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1502:   IS                isrow=a->row;
1503:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1504:   const MatScalar   *aa=a->a,*v;
1505:   const PetscScalar *b;
1506:   PetscScalar       *x,xk,*t;
1507:   PetscInt          nz,k,j;

1509:   VecGetArrayRead(bb,&b);
1510:   VecGetArray(xx,&x);
1511:   t    = a->solve_work;
1512:   ISGetIndices(isrow,&rp);

1514:   /* solve U^T*D*y = perm(b) by forward substitution */
1515:   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1516:   for (k=0; k<mbs; k++) {
1517:     v  = aa + ai[k];
1518:     vj = aj + ai[k];
1519:     xk = t[k];
1520:     nz = ai[k+1] - ai[k] - 1;
1521:     for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk;
1522:     t[k] = xk*v[nz];   /* v[nz] = 1/D(k) */
1523:   }

1525:   /* solve U*perm(x) = y by back substitution */
1526:   for (k=mbs-1; k>=0; k--) {
1527:     v  = aa + adiag[k] - 1;
1528:     vj = aj + adiag[k] - 1;
1529:     nz = ai[k+1] - ai[k] - 1;
1530:     for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]];
1531:     x[rp[k]] = t[k];
1532:   }

1534:   ISRestoreIndices(isrow,&rp);
1535:   VecRestoreArrayRead(bb,&b);
1536:   VecRestoreArray(xx,&x);
1537:   PetscLogFlops(4.0*a->nz - 3.0*mbs);
1538:   return 0;
1539: }

1541: PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1542: {
1543:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1544:   IS                isrow=a->row;
1545:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1546:   const MatScalar   *aa=a->a,*v;
1547:   PetscScalar       *x,xk,*t;
1548:   const PetscScalar *b;
1549:   PetscInt          nz,k;

1551:   VecGetArrayRead(bb,&b);
1552:   VecGetArray(xx,&x);
1553:   t    = a->solve_work;
1554:   ISGetIndices(isrow,&rp);

1556:   /* solve U^T*D*y = perm(b) by forward substitution */
1557:   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1558:   for (k=0; k<mbs; k++) {
1559:     v  = aa + ai[k] + 1;
1560:     vj = aj + ai[k] + 1;
1561:     xk = t[k];
1562:     nz = ai[k+1] - ai[k] - 1;
1563:     while (nz--) t[*vj++] += (*v++) * xk;
1564:     t[k] = xk*aa[ai[k]];  /* aa[k] = 1/D(k) */
1565:   }

1567:   /* solve U*perm(x) = y by back substitution */
1568:   for (k=mbs-1; k>=0; k--) {
1569:     v  = aa + ai[k] + 1;
1570:     vj = aj + ai[k] + 1;
1571:     nz = ai[k+1] - ai[k] - 1;
1572:     while (nz--) t[k] += (*v++) * t[*vj++];
1573:     x[rp[k]] = t[k];
1574:   }

1576:   ISRestoreIndices(isrow,&rp);
1577:   VecRestoreArrayRead(bb,&b);
1578:   VecRestoreArray(xx,&x);
1579:   PetscLogFlops(4.0*a->nz - 3*mbs);
1580:   return 0;
1581: }

1583: PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1584: {
1585:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1586:   IS                isrow=a->row;
1587:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1588:   const MatScalar   *aa=a->a,*v;
1589:   PetscReal         diagk;
1590:   PetscScalar       *x,xk;
1591:   const PetscScalar *b;
1592:   PetscInt          nz,k;

1594:   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1595:   VecGetArrayRead(bb,&b);
1596:   VecGetArray(xx,&x);
1597:   ISGetIndices(isrow,&rp);

1599:   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1600:   for (k=0; k<mbs; k++) {
1601:     v  = aa + ai[k];
1602:     vj = aj + ai[k];
1603:     xk = x[k];
1604:     nz = ai[k+1] - ai[k] - 1;
1605:     while (nz--) x[*vj++] += (*v++) * xk;

1607:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1609:     x[k] = xk*PetscSqrtReal(diagk);
1610:   }
1611:   ISRestoreIndices(isrow,&rp);
1612:   VecRestoreArrayRead(bb,&b);
1613:   VecRestoreArray(xx,&x);
1614:   PetscLogFlops(2.0*a->nz - mbs);
1615:   return 0;
1616: }

1618: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1619: {
1620:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1621:   IS                isrow=a->row;
1622:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1623:   const MatScalar   *aa=a->a,*v;
1624:   PetscReal         diagk;
1625:   PetscScalar       *x,xk;
1626:   const PetscScalar *b;
1627:   PetscInt          nz,k;

1629:   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1630:   VecGetArrayRead(bb,&b);
1631:   VecGetArray(xx,&x);
1632:   ISGetIndices(isrow,&rp);

1634:   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1635:   for (k=0; k<mbs; k++) {
1636:     v  = aa + ai[k] + 1;
1637:     vj = aj + ai[k] + 1;
1638:     xk = x[k];
1639:     nz = ai[k+1] - ai[k] - 1;
1640:     while (nz--) x[*vj++] += (*v++) * xk;

1642:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1644:     x[k] = xk*PetscSqrtReal(diagk);
1645:   }
1646:   ISRestoreIndices(isrow,&rp);
1647:   VecRestoreArrayRead(bb,&b);
1648:   VecRestoreArray(xx,&x);
1649:   PetscLogFlops(2.0*a->nz - mbs);
1650:   return 0;
1651: }

1653: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1654: {
1655:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1656:   IS                isrow=a->row;
1657:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1658:   const MatScalar   *aa=a->a,*v;
1659:   PetscReal         diagk;
1660:   PetscScalar       *x,*t;
1661:   const PetscScalar *b;
1662:   PetscInt          nz,k;

1664:   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1665:   VecGetArrayRead(bb,&b);
1666:   VecGetArray(xx,&x);
1667:   t    = a->solve_work;
1668:   ISGetIndices(isrow,&rp);

1670:   for (k=mbs-1; k>=0; k--) {
1671:     v     = aa + ai[k];
1672:     vj    = aj + ai[k];
1673:     diagk = PetscRealPart(aa[adiag[k]]);
1675:     t[k] = b[k] * PetscSqrtReal(diagk);
1676:     nz   = ai[k+1] - ai[k] - 1;
1677:     while (nz--) t[k] += (*v++) * t[*vj++];
1678:     x[rp[k]] = t[k];
1679:   }
1680:   ISRestoreIndices(isrow,&rp);
1681:   VecRestoreArrayRead(bb,&b);
1682:   VecRestoreArray(xx,&x);
1683:   PetscLogFlops(2.0*a->nz - mbs);
1684:   return 0;
1685: }

1687: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1688: {
1689:   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1690:   IS                isrow=a->row;
1691:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1692:   const MatScalar   *aa=a->a,*v;
1693:   PetscReal         diagk;
1694:   PetscScalar       *x,*t;
1695:   const PetscScalar *b;
1696:   PetscInt          nz,k;

1698:   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1699:   VecGetArrayRead(bb,&b);
1700:   VecGetArray(xx,&x);
1701:   t    = a->solve_work;
1702:   ISGetIndices(isrow,&rp);

1704:   for (k=mbs-1; k>=0; k--) {
1705:     v     = aa + ai[k] + 1;
1706:     vj    = aj + ai[k] + 1;
1707:     diagk = PetscRealPart(aa[ai[k]]);
1709:     t[k] = b[k] * PetscSqrtReal(diagk);
1710:     nz   = ai[k+1] - ai[k] - 1;
1711:     while (nz--) t[k] += (*v++) * t[*vj++];
1712:     x[rp[k]] = t[k];
1713:   }
1714:   ISRestoreIndices(isrow,&rp);
1715:   VecRestoreArrayRead(bb,&b);
1716:   VecRestoreArray(xx,&x);
1717:   PetscLogFlops(2.0*a->nz - mbs);
1718:   return 0;
1719: }

1721: PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1722: {
1723:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;

1725:   if (A->rmap->bs == 1) {
1726:     MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);
1727:   } else {
1728:     IS                isrow=a->row;
1729:     const PetscInt    *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1730:     const MatScalar   *aa=a->a,*v;
1731:     PetscScalar       *x,*t;
1732:     const PetscScalar *b;
1733:     PetscInt          nz,k,n,i,j;

1735:     if (bb->n > a->solves_work_n) {
1736:       PetscFree(a->solves_work);
1737:       PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);
1738:       a->solves_work_n = bb->n;
1739:     }
1740:     n    = bb->n;
1741:     VecGetArrayRead(bb->v,&b);
1742:     VecGetArray(xx->v,&x);
1743:     t    = a->solves_work;

1745:     ISGetIndices(isrow,&rp);

1747:     /* solve U^T*D*y = perm(b) by forward substitution */
1748:     for (k=0; k<mbs; k++) {
1749:       for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */
1750:     }
1751:     for (k=0; k<mbs; k++) {
1752:       v  = aa + ai[k];
1753:       vj = aj + ai[k];
1754:       nz = ai[k+1] - ai[k] - 1;
1755:       for (j=0; j<nz; j++) {
1756:         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1757:         v++;vj++;
1758:       }
1759:       for (i=0; i<n; i++) t[n*k+i] *= aa[nz];  /* note: aa[nz] = 1/D(k) */
1760:     }

1762:     /* solve U*perm(x) = y by back substitution */
1763:     for (k=mbs-1; k>=0; k--) {
1764:       v  = aa + ai[k] - 1;
1765:       vj = aj + ai[k] - 1;
1766:       nz = ai[k+1] - ai[k] - 1;
1767:       for (j=0; j<nz; j++) {
1768:         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1769:         v++;vj++;
1770:       }
1771:       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1772:     }

1774:     ISRestoreIndices(isrow,&rp);
1775:     VecRestoreArrayRead(bb->v,&b);
1776:     VecRestoreArray(xx->v,&x);
1777:     PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1778:   }
1779:   return 0;
1780: }

1782: PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx)
1783: {
1784:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;

1786:   if (A->rmap->bs == 1) {
1787:     MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);
1788:   } else {
1789:     IS                isrow=a->row;
1790:     const PetscInt    *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1791:     const MatScalar   *aa=a->a,*v;
1792:     PetscScalar       *x,*t;
1793:     const PetscScalar *b;
1794:     PetscInt          nz,k,n,i;

1796:     if (bb->n > a->solves_work_n) {
1797:       PetscFree(a->solves_work);
1798:       PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);
1799:       a->solves_work_n = bb->n;
1800:     }
1801:     n    = bb->n;
1802:     VecGetArrayRead(bb->v,&b);
1803:     VecGetArray(xx->v,&x);
1804:     t    = a->solves_work;

1806:     ISGetIndices(isrow,&rp);

1808:     /* solve U^T*D*y = perm(b) by forward substitution */
1809:     for (k=0; k<mbs; k++) {
1810:       for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];  /* values are stored interlaced in t */
1811:     }
1812:     for (k=0; k<mbs; k++) {
1813:       v  = aa + ai[k];
1814:       vj = aj + ai[k];
1815:       nz = ai[k+1] - ai[k];
1816:       while (nz--) {
1817:         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1818:         v++;vj++;
1819:       }
1820:       for (i=0; i<n; i++) t[n*k+i] *= aa[k];  /* note: aa[k] = 1/D(k) */
1821:     }

1823:     /* solve U*perm(x) = y by back substitution */
1824:     for (k=mbs-1; k>=0; k--) {
1825:       v  = aa + ai[k];
1826:       vj = aj + ai[k];
1827:       nz = ai[k+1] - ai[k];
1828:       while (nz--) {
1829:         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1830:         v++;vj++;
1831:       }
1832:       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1833:     }

1835:     ISRestoreIndices(isrow,&rp);
1836:     VecRestoreArrayRead(bb->v,&b);
1837:     VecRestoreArray(xx->v,&x);
1838:     PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));
1839:   }
1840:   return 0;
1841: }

1843: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1844: {
1845:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1846:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag;
1847:   const MatScalar   *aa=a->a,*v;
1848:   const PetscScalar *b;
1849:   PetscScalar       *x,xi;
1850:   PetscInt          nz,i,j;

1852:   VecGetArrayRead(bb,&b);
1853:   VecGetArray(xx,&x);
1854:   /* solve U^T*D*y = b by forward substitution */
1855:   PetscArraycpy(x,b,mbs);
1856:   for (i=0; i<mbs; i++) {
1857:     v  = aa + ai[i];
1858:     vj = aj + ai[i];
1859:     xi = x[i];
1860:     nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
1861:     for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi;
1862:     x[i] = xi*v[nz];  /* v[nz] = aa[diag[i]] = 1/D(i) */
1863:   }
1864:   /* solve U*x = y by backward substitution */
1865:   for (i=mbs-2; i>=0; i--) {
1866:     xi = x[i];
1867:     v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
1868:     vj = aj + adiag[i] - 1;
1869:     nz = ai[i+1] - ai[i] - 1;
1870:     for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
1871:     x[i] = xi;
1872:   }
1873:   VecRestoreArrayRead(bb,&b);
1874:   VecRestoreArray(xx,&x);
1875:   PetscLogFlops(4.0*a->nz - 3*mbs);
1876:   return 0;
1877: }

1879: PetscErrorCode MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat B,Mat X)
1880: {
1881:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1882:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag;
1883:   const MatScalar   *aa=a->a,*v;
1884:   const PetscScalar *b;
1885:   PetscScalar       *x,xi;
1886:   PetscInt          nz,i,j,neq,ldb,ldx;
1887:   PetscBool         isdense;

1889:   if (!mbs) return 0;
1890:   PetscObjectTypeCompare((PetscObject)B,MATSEQDENSE,&isdense);
1892:   if (X != B) {
1893:     PetscObjectTypeCompare((PetscObject)X,MATSEQDENSE,&isdense);
1895:   }
1896:   MatDenseGetArrayRead(B,&b);
1897:   MatDenseGetLDA(B,&ldb);
1898:   MatDenseGetArray(X,&x);
1899:   MatDenseGetLDA(X,&ldx);
1900:   for (neq=0; neq<B->cmap->n; neq++) {
1901:     /* solve U^T*D*y = b by forward substitution */
1902:     PetscArraycpy(x,b,mbs);
1903:     for (i=0; i<mbs; i++) {
1904:       v  = aa + ai[i];
1905:       vj = aj + ai[i];
1906:       xi = x[i];
1907:       nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
1908:       for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi;
1909:       x[i] = xi*v[nz];  /* v[nz] = aa[diag[i]] = 1/D(i) */
1910:     }
1911:     /* solve U*x = y by backward substitution */
1912:     for (i=mbs-2; i>=0; i--) {
1913:       xi = x[i];
1914:       v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
1915:       vj = aj + adiag[i] - 1;
1916:       nz = ai[i+1] - ai[i] - 1;
1917:       for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
1918:       x[i] = xi;
1919:     }
1920:     b += ldb;
1921:     x += ldx;
1922:   }
1923:   MatDenseRestoreArrayRead(B,&b);
1924:   MatDenseRestoreArray(X,&x);
1925:   PetscLogFlops(B->cmap->n*(4.0*a->nz - 3*mbs));
1926:   return 0;
1927: }

1929: PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1930: {
1931:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1932:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
1933:   const MatScalar   *aa=a->a,*v;
1934:   PetscScalar       *x,xk;
1935:   const PetscScalar *b;
1936:   PetscInt          nz,k;

1938:   VecGetArrayRead(bb,&b);
1939:   VecGetArray(xx,&x);

1941:   /* solve U^T*D*y = b by forward substitution */
1942:   PetscArraycpy(x,b,mbs);
1943:   for (k=0; k<mbs; k++) {
1944:     v  = aa + ai[k] + 1;
1945:     vj = aj + ai[k] + 1;
1946:     xk = x[k];
1947:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
1948:     while (nz--) x[*vj++] += (*v++) * xk;
1949:     x[k] = xk*aa[ai[k]];  /* note: aa[diag[k]] = 1/D(k) */
1950:   }

1952:   /* solve U*x = y by back substitution */
1953:   for (k=mbs-2; k>=0; k--) {
1954:     v  = aa + ai[k] + 1;
1955:     vj = aj + ai[k] + 1;
1956:     xk = x[k];
1957:     nz = ai[k+1] - ai[k] - 1;
1958:     while (nz--) {
1959:       xk += (*v++) * x[*vj++];
1960:     }
1961:     x[k] = xk;
1962:   }

1964:   VecRestoreArrayRead(bb,&b);
1965:   VecRestoreArray(xx,&x);
1966:   PetscLogFlops(4.0*a->nz - 3*mbs);
1967:   return 0;
1968: }

1970: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1971: {
1972:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1973:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj;
1974:   const MatScalar   *aa=a->a,*v;
1975:   PetscReal         diagk;
1976:   PetscScalar       *x;
1977:   const PetscScalar *b;
1978:   PetscInt          nz,k;

1980:   /* solve U^T*D^(1/2)*x = b by forward substitution */
1981:   VecGetArrayRead(bb,&b);
1982:   VecGetArray(xx,&x);
1983:   PetscArraycpy(x,b,mbs);
1984:   for (k=0; k<mbs; k++) {
1985:     v  = aa + ai[k];
1986:     vj = aj + ai[k];
1987:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
1988:     while (nz--) x[*vj++] += (*v++) * x[k];
1989:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
1991:     x[k] *= PetscSqrtReal(diagk);
1992:   }
1993:   VecRestoreArrayRead(bb,&b);
1994:   VecRestoreArray(xx,&x);
1995:   PetscLogFlops(2.0*a->nz - mbs);
1996:   return 0;
1997: }

1999: PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2000: {
2001:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2002:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2003:   const MatScalar   *aa=a->a,*v;
2004:   PetscReal         diagk;
2005:   PetscScalar       *x;
2006:   const PetscScalar *b;
2007:   PetscInt          nz,k;

2009:   /* solve U^T*D^(1/2)*x = b by forward substitution */
2010:   VecGetArrayRead(bb,&b);
2011:   VecGetArray(xx,&x);
2012:   PetscArraycpy(x,b,mbs);
2013:   for (k=0; k<mbs; k++) {
2014:     v  = aa + ai[k] + 1;
2015:     vj = aj + ai[k] + 1;
2016:     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2017:     while (nz--) x[*vj++] += (*v++) * x[k];
2018:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2020:     x[k] *= PetscSqrtReal(diagk);
2021:   }
2022:   VecRestoreArrayRead(bb,&b);
2023:   VecRestoreArray(xx,&x);
2024:   PetscLogFlops(2.0*a->nz - mbs);
2025:   return 0;
2026: }

2028: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2029: {
2030:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2031:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj;
2032:   const MatScalar   *aa=a->a,*v;
2033:   PetscReal         diagk;
2034:   PetscScalar       *x;
2035:   const PetscScalar *b;
2036:   PetscInt          nz,k;

2038:   /* solve D^(1/2)*U*x = b by back substitution */
2039:   VecGetArrayRead(bb,&b);
2040:   VecGetArray(xx,&x);

2042:   for (k=mbs-1; k>=0; k--) {
2043:     v     = aa + ai[k];
2044:     vj    = aj + ai[k];
2045:     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2047:     x[k] = PetscSqrtReal(diagk)*b[k];
2048:     nz   = ai[k+1] - ai[k] - 1;
2049:     while (nz--) x[k] += (*v++) * x[*vj++];
2050:   }
2051:   VecRestoreArrayRead(bb,&b);
2052:   VecRestoreArray(xx,&x);
2053:   PetscLogFlops(2.0*a->nz - mbs);
2054:   return 0;
2055: }

2057: PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2058: {
2059:   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2060:   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2061:   const MatScalar   *aa=a->a,*v;
2062:   PetscReal         diagk;
2063:   PetscScalar       *x;
2064:   const PetscScalar *b;
2065:   PetscInt          nz,k;

2067:   /* solve D^(1/2)*U*x = b by back substitution */
2068:   VecGetArrayRead(bb,&b);
2069:   VecGetArray(xx,&x);

2071:   for (k=mbs-1; k>=0; k--) {
2072:     v     = aa + ai[k] + 1;
2073:     vj    = aj + ai[k] + 1;
2074:     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2076:     x[k] = PetscSqrtReal(diagk)*b[k];
2077:     nz   = ai[k+1] - ai[k] - 1;
2078:     while (nz--) x[k] += (*v++) * x[*vj++];
2079:   }
2080:   VecRestoreArrayRead(bb,&b);
2081:   VecRestoreArray(xx,&x);
2082:   PetscLogFlops(2.0*a->nz - mbs);
2083:   return 0;
2084: }

2086: /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2087: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info)
2088: {
2089:   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
2090:   const PetscInt *rip,mbs = a->mbs,*ai,*aj;
2091:   PetscInt       *jutmp,bs = A->rmap->bs,i;
2092:   PetscInt       m,reallocs = 0,*levtmp;
2093:   PetscInt       *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
2094:   PetscInt       incrlev,*lev,shift,prow,nz;
2095:   PetscReal      f = info->fill,levels = info->levels;
2096:   PetscBool      perm_identity;

2098:   /* check whether perm is the identity mapping */
2099:   ISIdentity(perm,&perm_identity);

2101:   if (perm_identity) {
2102:     a->permute = PETSC_FALSE;
2103:     ai         = a->i; aj = a->j;
2104:   } else { /*  non-trivial permutation */
2105:     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2106:   }

2108:   /* initialization */
2109:   ISGetIndices(perm,&rip);
2110:   umax  = (PetscInt)(f*ai[mbs] + 1);
2111:   PetscMalloc1(umax,&lev);
2112:   umax += mbs + 1;
2113:   shift = mbs + 1;
2114:   PetscMalloc1(mbs+1,&iu);
2115:   PetscMalloc1(umax,&ju);
2116:   iu[0] = mbs + 1;
2117:   juidx = mbs + 1;
2118:   /* prowl: linked list for pivot row */
2119:   PetscMalloc3(mbs,&prowl,mbs,&q,mbs,&levtmp);
2120:   /* q: linked list for col index */

2122:   for (i=0; i<mbs; i++) {
2123:     prowl[i] = mbs;
2124:     q[i]     = 0;
2125:   }

2127:   /* for each row k */
2128:   for (k=0; k<mbs; k++) {
2129:     nzk  = 0;
2130:     q[k] = mbs;
2131:     /* copy current row into linked list */
2132:     nz = ai[rip[k]+1] - ai[rip[k]];
2133:     j  = ai[rip[k]];
2134:     while (nz--) {
2135:       vj = rip[aj[j++]];
2136:       if (vj > k) {
2137:         qm = k;
2138:         do {
2139:           m = qm; qm = q[m];
2140:         } while (qm < vj);
2142:         nzk++;
2143:         q[m]       = vj;
2144:         q[vj]      = qm;
2145:         levtmp[vj] = 0;   /* initialize lev for nonzero element */
2146:       }
2147:     }

2149:     /* modify nonzero structure of k-th row by computing fill-in
2150:        for each row prow to be merged in */
2151:     prow = k;
2152:     prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */

2154:     while (prow < k) {
2155:       /* merge row prow into k-th row */
2156:       jmin = iu[prow] + 1;
2157:       jmax = iu[prow+1];
2158:       qm   = k;
2159:       for (j=jmin; j<jmax; j++) {
2160:         incrlev = lev[j-shift] + 1;
2161:         if (incrlev > levels) continue;
2162:         vj = ju[j];
2163:         do {
2164:           m = qm; qm = q[m];
2165:         } while (qm < vj);
2166:         if (qm != vj) {      /* a fill */
2167:           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
2168:           levtmp[vj] = incrlev;
2169:         } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2170:       }
2171:       prow = prowl[prow]; /* next pivot row */
2172:     }

2174:     /* add k to row list for first nonzero element in k-th row */
2175:     if (nzk > 1) {
2176:       i        = q[k]; /* col value of first nonzero element in k_th row of U */
2177:       prowl[k] = prowl[i]; prowl[i] = k;
2178:     }
2179:     iu[k+1] = iu[k] + nzk;

2181:     /* allocate more space to ju and lev if needed */
2182:     if (iu[k+1] > umax) {
2183:       /* estimate how much additional space we will need */
2184:       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2185:       /* just double the memory each time */
2186:       maxadd = umax;
2187:       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
2188:       umax += maxadd;

2190:       /* allocate a longer ju */
2191:       PetscMalloc1(umax,&jutmp);
2192:       PetscArraycpy(jutmp,ju,iu[k]);
2193:       PetscFree(ju);
2194:       ju   = jutmp;

2196:       PetscMalloc1(umax,&jutmp);
2197:       PetscArraycpy(jutmp,lev,iu[k]-shift);
2198:       PetscFree(lev);
2199:       lev       = jutmp;
2200:       reallocs += 2; /* count how many times we realloc */
2201:     }

2203:     /* save nonzero structure of k-th row in ju */
2204:     i=k;
2205:     while (nzk--) {
2206:       i                = q[i];
2207:       ju[juidx]        = i;
2208:       lev[juidx-shift] = levtmp[i];
2209:       juidx++;
2210:     }
2211:   }

2213: #if defined(PETSC_USE_INFO)
2214:   if (ai[mbs] != 0) {
2215:     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2216:     PetscInfo(A,"Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n",reallocs,(double)f,(double)af);
2217:     PetscInfo(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2218:     PetscInfo(A,"PCFactorSetFill(pc,%g);\n",(double)af);
2219:     PetscInfo(A,"for best performance.\n");
2220:   } else {
2221:     PetscInfo(A,"Empty matrix\n");
2222:   }
2223: #endif

2225:   ISRestoreIndices(perm,&rip);
2226:   PetscFree3(prowl,q,levtmp);
2227:   PetscFree(lev);

2229:   /* put together the new matrix */
2230:   MatSeqSBAIJSetPreallocation(B,bs,0,NULL);

2232:   /* PetscLogObjectParent((PetscObject)B,(PetscObject)iperm); */
2233:   b    = (Mat_SeqSBAIJ*)(B)->data;
2234:   PetscFree2(b->imax,b->ilen);

2236:   b->singlemalloc = PETSC_FALSE;
2237:   b->free_a       = PETSC_TRUE;
2238:   b->free_ij      = PETSC_TRUE;
2239:   /* the next line frees the default space generated by the Create() */
2240:   PetscFree3(b->a,b->j,b->i);
2241:   PetscMalloc1((iu[mbs]+1)*a->bs2,&b->a);
2242:   b->j    = ju;
2243:   b->i    = iu;
2244:   b->diag = NULL;
2245:   b->ilen = NULL;
2246:   b->imax = NULL;

2248:   ISDestroy(&b->row);
2249:   ISDestroy(&b->icol);
2250:   b->row  = perm;
2251:   b->icol = perm;
2252:   PetscObjectReference((PetscObject)perm);
2253:   PetscObjectReference((PetscObject)perm);
2254:   PetscMalloc1(bs*mbs+bs,&b->solve_work);
2255:   /* In b structure:  Free imax, ilen, old a, old j.
2256:      Allocate idnew, solve_work, new a, new j */
2257:   PetscLogObjectMemory((PetscObject)B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));
2258:   b->maxnz = b->nz = iu[mbs];

2260:   (B)->info.factor_mallocs   = reallocs;
2261:   (B)->info.fill_ratio_given = f;
2262:   if (ai[mbs] != 0) {
2263:     (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2264:   } else {
2265:     (B)->info.fill_ratio_needed = 0.0;
2266:   }
2267:   MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);
2268:   return 0;
2269: }

2271: /*
2272:   See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2273: */
2274: #include <petscbt.h>
2275: #include <../src/mat/utils/freespace.h>
2276: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2277: {
2278:   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b;
2279:   PetscBool          perm_identity,free_ij = PETSC_TRUE,missing;
2280:   PetscInt           bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j;
2281:   const PetscInt     *rip;
2282:   PetscInt           reallocs=0,i,*ui,*udiag,*cols;
2283:   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2284:   PetscInt           nlnk,*lnk,*lnk_lvl=NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr;
2285:   PetscReal          fill          =info->fill,levels=info->levels;
2286:   PetscFreeSpaceList free_space    =NULL,current_space=NULL;
2287:   PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2288:   PetscBT            lnkbt;

2291:   MatMissingDiagonal(A,&missing,&d);
2293:   if (bs > 1) {
2294:     MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);
2295:     return 0;
2296:   }

2298:   /* check whether perm is the identity mapping */
2299:   ISIdentity(perm,&perm_identity);
2301:   a->permute = PETSC_FALSE;

2303:   PetscMalloc1(am+1,&ui);
2304:   PetscMalloc1(am+1,&udiag);
2305:   ui[0] = 0;

2307:   /* ICC(0) without matrix ordering: simply rearrange column indices */
2308:   if (!levels) {
2309:     /* reuse the column pointers and row offsets for memory savings */
2310:     for (i=0; i<am; i++) {
2311:       ncols    = ai[i+1] - ai[i];
2312:       ui[i+1]  = ui[i] + ncols;
2313:       udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */
2314:     }
2315:     PetscMalloc1(ui[am]+1,&uj);
2316:     cols = uj;
2317:     for (i=0; i<am; i++) {
2318:       aj    = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2319:       ncols = ai[i+1] - ai[i] -1;
2320:       for (j=0; j<ncols; j++) *cols++ = aj[j];
2321:       *cols++ = i; /* diagonal is located as the last entry of U(i,:) */
2322:     }
2323:   } else { /* case: levels>0 */
2324:     ISGetIndices(perm,&rip);

2326:     /* initialization */
2327:     /* jl: linked list for storing indices of the pivot rows
2328:        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2329:     PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);
2330:     for (i=0; i<am; i++) {
2331:       jl[i] = am; il[i] = 0;
2332:     }

2334:     /* create and initialize a linked list for storing column indices of the active row k */
2335:     nlnk = am + 1;
2336:     PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);

2338:     /* initial FreeSpace size is fill*(ai[am]+1) */
2339:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);

2341:     current_space = free_space;

2343:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);

2345:     current_space_lvl = free_space_lvl;

2347:     for (k=0; k<am; k++) {  /* for each active row k */
2348:       /* initialize lnk by the column indices of row k */
2349:       nzk   = 0;
2350:       ncols = ai[k+1] - ai[k];
2352:       cols = aj+ai[k];
2353:       PetscIncompleteLLInit(ncols,cols,am,rip,&nlnk,lnk,lnk_lvl,lnkbt);
2354:       nzk += nlnk;

2356:       /* update lnk by computing fill-in for each pivot row to be merged in */
2357:       prow = jl[k]; /* 1st pivot row */

2359:       while (prow < k) {
2360:         nextprow = jl[prow];

2362:         /* merge prow into k-th row */
2363:         jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2364:         jmax  = ui[prow+1];
2365:         ncols = jmax-jmin;
2366:         i     = jmin - ui[prow];

2368:         cols = uj_ptr[prow] + i;  /* points to the 2nd nzero entry in U(prow,k:am-1) */
2369:         uj   = uj_lvl_ptr[prow] + i;  /* levels of cols */
2370:         j    = *(uj - 1);
2371:         PetscICCLLAddSorted(ncols,cols,levels,uj,am,&nlnk,lnk,lnk_lvl,lnkbt,j);
2372:         nzk += nlnk;

2374:         /* update il and jl for prow */
2375:         if (jmin < jmax) {
2376:           il[prow] = jmin;

2378:           j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2379:         }
2380:         prow = nextprow;
2381:       }

2383:       /* if free space is not available, make more free space */
2384:       if (current_space->local_remaining<nzk) {
2385:         i    = am - k + 1; /* num of unfactored rows */
2386:         i    = PetscIntMultTruncate(i,PetscMin(nzk, i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2387:         PetscFreeSpaceGet(i,&current_space);
2388:         PetscFreeSpaceGet(i,&current_space_lvl);
2389:         reallocs++;
2390:       }

2392:       /* copy data into free_space and free_space_lvl, then initialize lnk */
2394:       PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);

2396:       /* add the k-th row into il and jl */
2397:       if (nzk > 1) {
2398:         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2399:         jl[k] = jl[i]; jl[i] = k;
2400:         il[k] = ui[k] + 1;
2401:       }
2402:       uj_ptr[k]     = current_space->array;
2403:       uj_lvl_ptr[k] = current_space_lvl->array;

2405:       current_space->array               += nzk;
2406:       current_space->local_used          += nzk;
2407:       current_space->local_remaining     -= nzk;
2408:       current_space_lvl->array           += nzk;
2409:       current_space_lvl->local_used      += nzk;
2410:       current_space_lvl->local_remaining -= nzk;

2412:       ui[k+1] = ui[k] + nzk;
2413:     }

2415:     ISRestoreIndices(perm,&rip);
2416:     PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);

2418:     /* destroy list of free space and other temporary array(s) */
2419:     PetscMalloc1(ui[am]+1,&uj);
2420:     PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag); /* store matrix factor  */
2421:     PetscIncompleteLLDestroy(lnk,lnkbt);
2422:     PetscFreeSpaceDestroy(free_space_lvl);

2424:   } /* end of case: levels>0 || (levels=0 && !perm_identity) */

2426:   /* put together the new matrix in MATSEQSBAIJ format */
2427:   MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);

2429:   b    = (Mat_SeqSBAIJ*)(fact)->data;
2430:   PetscFree2(b->imax,b->ilen);

2432:   b->singlemalloc = PETSC_FALSE;
2433:   b->free_a       = PETSC_TRUE;
2434:   b->free_ij      = free_ij;

2436:   PetscMalloc1(ui[am]+1,&b->a);

2438:   b->j         = uj;
2439:   b->i         = ui;
2440:   b->diag      = udiag;
2441:   b->free_diag = PETSC_TRUE;
2442:   b->ilen      = NULL;
2443:   b->imax      = NULL;
2444:   b->row       = perm;
2445:   b->col       = perm;

2447:   PetscObjectReference((PetscObject)perm);
2448:   PetscObjectReference((PetscObject)perm);

2450:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

2452:   PetscMalloc1(am+1,&b->solve_work);
2453:   PetscLogObjectMemory((PetscObject)fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));

2455:   b->maxnz = b->nz = ui[am];

2457:   fact->info.factor_mallocs   = reallocs;
2458:   fact->info.fill_ratio_given = fill;
2459:   if (ai[am] != 0) {
2460:     fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am];
2461:   } else {
2462:     fact->info.fill_ratio_needed = 0.0;
2463:   }
2464: #if defined(PETSC_USE_INFO)
2465:   if (ai[am] != 0) {
2466:     PetscReal af = fact->info.fill_ratio_needed;
2467:     PetscInfo(A,"Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);
2468:     PetscInfo(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2469:     PetscInfo(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);
2470:   } else {
2471:     PetscInfo(A,"Empty matrix\n");
2472:   }
2473: #endif
2474:   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2475:   return 0;
2476: }

2478: PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2479: {
2480:   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
2481:   Mat_SeqSBAIJ       *b;
2482:   PetscBool          perm_identity,free_ij = PETSC_TRUE;
2483:   PetscInt           bs=A->rmap->bs,am=a->mbs;
2484:   const PetscInt     *cols,*rip,*ai=a->i,*aj=a->j;
2485:   PetscInt           reallocs=0,i,*ui;
2486:   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2487:   PetscInt           nlnk,*lnk,*lnk_lvl=NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
2488:   PetscReal          fill          =info->fill,levels=info->levels,ratio_needed;
2489:   PetscFreeSpaceList free_space    =NULL,current_space=NULL;
2490:   PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2491:   PetscBT            lnkbt;

2493:   /*
2494:    This code originally uses Modified Sparse Row (MSR) storage
2495:    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2496:    Then it is rewritten so the factor B takes seqsbaij format. However the associated
2497:    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2498:    thus the original code in MSR format is still used for these cases.
2499:    The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2500:    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2501:   */
2502:   if (bs > 1) {
2503:     MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);
2504:     return 0;
2505:   }

2507:   /* check whether perm is the identity mapping */
2508:   ISIdentity(perm,&perm_identity);
2510:   a->permute = PETSC_FALSE;

2512:   /* special case that simply copies fill pattern */
2513:   if (!levels) {
2514:     /* reuse the column pointers and row offsets for memory savings */
2515:     ui           = a->i;
2516:     uj           = a->j;
2517:     free_ij      = PETSC_FALSE;
2518:     ratio_needed = 1.0;
2519:   } else { /* case: levels>0 */
2520:     ISGetIndices(perm,&rip);

2522:     /* initialization */
2523:     PetscMalloc1(am+1,&ui);
2524:     ui[0] = 0;

2526:     /* jl: linked list for storing indices of the pivot rows
2527:        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2528:     PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);
2529:     for (i=0; i<am; i++) {
2530:       jl[i] = am; il[i] = 0;
2531:     }

2533:     /* create and initialize a linked list for storing column indices of the active row k */
2534:     nlnk = am + 1;
2535:     PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);

2537:     /* initial FreeSpace size is fill*(ai[am]+1) */
2538:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);

2540:     current_space = free_space;

2542:     PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);

2544:     current_space_lvl = free_space_lvl;

2546:     for (k=0; k<am; k++) {  /* for each active row k */
2547:       /* initialize lnk by the column indices of row rip[k] */
2548:       nzk   = 0;
2549:       ncols = ai[rip[k]+1] - ai[rip[k]];
2550:       cols  = aj+ai[rip[k]];
2551:       PetscIncompleteLLInit(ncols,cols,am,rip,&nlnk,lnk,lnk_lvl,lnkbt);
2552:       nzk  += nlnk;

2554:       /* update lnk by computing fill-in for each pivot row to be merged in */
2555:       prow = jl[k]; /* 1st pivot row */

2557:       while (prow < k) {
2558:         nextprow = jl[prow];

2560:         /* merge prow into k-th row */
2561:         jmin     = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2562:         jmax     = ui[prow+1];
2563:         ncols    = jmax-jmin;
2564:         i        = jmin - ui[prow];
2565:         cols     = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2566:         j        = *(uj_lvl_ptr[prow] + i - 1);
2567:         cols_lvl = uj_lvl_ptr[prow]+i;
2568:         PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,&nlnk,lnk,lnk_lvl,lnkbt,j);
2569:         nzk     += nlnk;

2571:         /* update il and jl for prow */
2572:         if (jmin < jmax) {
2573:           il[prow] = jmin;
2574:           j        = *cols;
2575:           jl[prow] = jl[j];
2576:           jl[j]    = prow;
2577:         }
2578:         prow = nextprow;
2579:       }

2581:       /* if free space is not available, make more free space */
2582:       if (current_space->local_remaining<nzk) {
2583:         i    = am - k + 1; /* num of unfactored rows */
2584:         i    = PetscMin(PetscIntMultTruncate(i,nzk), PetscIntMultTruncate(i,i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2585:         PetscFreeSpaceGet(i,&current_space);
2586:         PetscFreeSpaceGet(i,&current_space_lvl);
2587:         reallocs++;
2588:       }

2590:       /* copy data into free_space and free_space_lvl, then initialize lnk */
2591:       PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);

2593:       /* add the k-th row into il and jl */
2594:       if (nzk-1 > 0) {
2595:         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2596:         jl[k] = jl[i]; jl[i] = k;
2597:         il[k] = ui[k] + 1;
2598:       }
2599:       uj_ptr[k]     = current_space->array;
2600:       uj_lvl_ptr[k] = current_space_lvl->array;

2602:       current_space->array               += nzk;
2603:       current_space->local_used          += nzk;
2604:       current_space->local_remaining     -= nzk;
2605:       current_space_lvl->array           += nzk;
2606:       current_space_lvl->local_used      += nzk;
2607:       current_space_lvl->local_remaining -= nzk;

2609:       ui[k+1] = ui[k] + nzk;
2610:     }

2612:     ISRestoreIndices(perm,&rip);
2613:     PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);

2615:     /* destroy list of free space and other temporary array(s) */
2616:     PetscMalloc1(ui[am]+1,&uj);
2617:     PetscFreeSpaceContiguous(&free_space,uj);
2618:     PetscIncompleteLLDestroy(lnk,lnkbt);
2619:     PetscFreeSpaceDestroy(free_space_lvl);
2620:     if (ai[am] != 0) {
2621:       ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2622:     } else {
2623:       ratio_needed = 0.0;
2624:     }
2625:   } /* end of case: levels>0 || (levels=0 && !perm_identity) */

2627:   /* put together the new matrix in MATSEQSBAIJ format */
2628:   MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);

2630:   b = (Mat_SeqSBAIJ*)(fact)->data;

2632:   PetscFree2(b->imax,b->ilen);

2634:   b->singlemalloc = PETSC_FALSE;
2635:   b->free_a       = PETSC_TRUE;
2636:   b->free_ij      = free_ij;

2638:   PetscMalloc1(ui[am]+1,&b->a);

2640:   b->j             = uj;
2641:   b->i             = ui;
2642:   b->diag          = NULL;
2643:   b->ilen          = NULL;
2644:   b->imax          = NULL;
2645:   b->row           = perm;
2646:   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */

2648:   PetscObjectReference((PetscObject)perm);
2649:   b->icol = perm;
2650:   PetscObjectReference((PetscObject)perm);
2651:   PetscMalloc1(am+1,&b->solve_work);

2653:   b->maxnz = b->nz = ui[am];

2655:   fact->info.factor_mallocs    = reallocs;
2656:   fact->info.fill_ratio_given  = fill;
2657:   fact->info.fill_ratio_needed = ratio_needed;
2658: #if defined(PETSC_USE_INFO)
2659:   if (ai[am] != 0) {
2660:     PetscReal af = fact->info.fill_ratio_needed;
2661:     PetscInfo(A,"Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);
2662:     PetscInfo(A,"Run with -pc_factor_fill %g or use \n",(double)af);
2663:     PetscInfo(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);
2664:   } else {
2665:     PetscInfo(A,"Empty matrix\n");
2666:   }
2667: #endif
2668:   if (perm_identity) {
2669:     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2670:   } else {
2671:     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2672:   }
2673:   return 0;
2674: }