Actual source code: blockinvert.h

  1: /*
  2:     Kernels used in sparse ILU (and LU) and in the resulting triangular
  3:  solves. These are for block algorithms where the block sizes are on
  4:  the order of 2-6+.

  6:     There are TWO versions of the macros below.
  7:     1) standard for MatScalar == PetscScalar use the standard BLAS for
  8:        block size larger than 7 and
  9:     2) handcoded Fortran single precision for the matrices, since BLAS
 10:        does not have some arguments in single and some in double.

 12: */
 15: #include <petscblaslapack.h>

 17: /*
 18:       These are C kernels,they are contained in
 19:    src/mat/impls/baij/seq
 20: */

 22: PETSC_INTERN PetscErrorCode PetscLINPACKgefa(MatScalar*,PetscInt,PetscInt*,PetscBool,PetscBool*);
 23: PETSC_INTERN PetscErrorCode PetscLINPACKgedi(MatScalar*,PetscInt,PetscInt*,MatScalar*);
 24: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_2(MatScalar*,PetscReal,PetscBool,PetscBool*);
 25: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_3(MatScalar*,PetscReal,PetscBool,PetscBool*);

 27: #define PetscKernel_A_gets_inverse_A_4_nopivot(mat) PetscMacroReturnStandard(\
 28:     MatScalar d, di = mat[0];                                   \
 29:     mat[0]   = d = 1.0 / di;                                    \
 30:     mat[4]  *= -d;                                              \
 31:     mat[8]  *= -d;                                              \
 32:     mat[12] *= -d;                                              \
 33:     mat[1]  *= d;                                               \
 34:     mat[2]  *= d;                                               \
 35:     mat[3]  *= d;                                               \
 36:     mat[5]  += mat[4] * mat[1] * di;                            \
 37:     mat[6]  += mat[4] * mat[2] * di;                            \
 38:     mat[7]  += mat[4] * mat[3] * di;                            \
 39:     mat[9]  += mat[8] * mat[1] * di;                            \
 40:     mat[10] += mat[8] * mat[2] * di;                            \
 41:     mat[11] += mat[8] * mat[3] * di;                            \
 42:     mat[13] += mat[12] * mat[1] * di;                           \
 43:     mat[14] += mat[12] * mat[2] * di;                           \
 44:     mat[15] += mat[12] * mat[3] * di;                           \
 45:     di       = mat[5];                                          \
 46:     mat[5]   = d = 1.0 / di;                                    \
 47:     mat[1]  *= -d;                                              \
 48:     mat[9]  *= -d;                                              \
 49:     mat[13] *= -d;                                              \
 50:     mat[4]  *= d;                                               \
 51:     mat[6]  *= d;                                               \
 52:     mat[7]  *= d;                                               \
 53:     mat[0]  += mat[1] * mat[4] * di;                            \
 54:     mat[2]  += mat[1] * mat[6] * di;                            \
 55:     mat[3]  += mat[1] * mat[7] * di;                            \
 56:     mat[8]  += mat[9] * mat[4] * di;                            \
 57:     mat[10] += mat[9] * mat[6] * di;                            \
 58:     mat[11] += mat[9] * mat[7] * di;                            \
 59:     mat[12] += mat[13] * mat[4] * di;                           \
 60:     mat[14] += mat[13] * mat[6] * di;                           \
 61:     mat[15] += mat[13] * mat[7] * di;                           \
 62:     di       = mat[10];                                         \
 63:     mat[10]  = d = 1.0 / di;                                    \
 64:     mat[2]  *= -d;                                              \
 65:     mat[6]  *= -d;                                              \
 66:     mat[14] *= -d;                                              \
 67:     mat[8]  *= d;                                               \
 68:     mat[9]  *= d;                                               \
 69:     mat[11] *= d;                                               \
 70:     mat[0]  += mat[2] * mat[8] * di;                            \
 71:     mat[1]  += mat[2] * mat[9] * di;                            \
 72:     mat[3]  += mat[2] * mat[11] * di;                           \
 73:     mat[4]  += mat[6] * mat[8] * di;                            \
 74:     mat[5]  += mat[6] * mat[9] * di;                            \
 75:     mat[7]  += mat[6] * mat[11] * di;                           \
 76:     mat[12] += mat[14] * mat[8] * di;                           \
 77:     mat[13] += mat[14] * mat[9] * di;                           \
 78:     mat[15] += mat[14] * mat[11] * di;                          \
 79:     di       = mat[15];                                         \
 80:     mat[15]  = d = 1.0 / di;                                    \
 81:     mat[3]  *= -d;                                              \
 82:     mat[7]  *= -d;                                              \
 83:     mat[11] *= -d;                                              \
 84:     mat[12] *= d;                                               \
 85:     mat[13] *= d;                                               \
 86:     mat[14] *= d;                                               \
 87:     mat[0]  += mat[3] * mat[12] * di;                           \
 88:     mat[1]  += mat[3] * mat[13] * di;                           \
 89:     mat[2]  += mat[3] * mat[14] * di;                           \
 90:     mat[4]  += mat[7] * mat[12] * di;                           \
 91:     mat[5]  += mat[7] * mat[13] * di;                           \
 92:     mat[6]  += mat[7] * mat[14] * di;                           \
 93:     mat[8]  += mat[11] * mat[12] * di;                          \
 94:     mat[9]  += mat[11] * mat[13] * di;                          \
 95:     mat[10] += mat[11] * mat[14] * di;                          \
 96:   )

 98: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_4(MatScalar*,PetscReal,PetscBool,PetscBool*);
 99: # if defined(PETSC_HAVE_SSE)
100: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_4_SSE(MatScalar*);
101: # endif
102: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_5(MatScalar*,PetscInt*,MatScalar*,PetscReal,PetscBool,PetscBool*);
103: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_6(MatScalar*,PetscReal,PetscBool,PetscBool*);
104: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_7(MatScalar*,PetscReal,PetscBool,PetscBool*);
105: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_9(MatScalar*,PetscReal,PetscBool,PetscBool*);
106: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_15(MatScalar*,PetscInt*,MatScalar*,PetscReal,PetscBool,PetscBool*);

108: /*
109:     A = inv(A)    A_gets_inverse_A

111:    A      - square bs by bs array stored in column major order
112:    pivots - integer work array of length bs
113:    W      -  bs by 1 work array
114: */
115: #define PetscKernel_A_gets_inverse_A(bs,A,pivots,W,allowzeropivot,zeropivotdetected) (PetscLINPACKgefa((A),(bs),(pivots),(allowzeropivot),(zeropivotdetected)) || PetscLINPACKgedi((A),(bs),(pivots),(W)))

117: /* -----------------------------------------------------------------------*/

119: #if !defined(PETSC_USE_REAL_MAT_SINGLE)
120: /*
121:         Version that calls the BLAS directly
122: */
123: /*
124:       A = A * B   A_gets_A_times_B

126:    A, B - square bs by bs arrays stored in column major order
127:    W    - square bs by bs work array

129: */
130: #define PetscKernel_A_gets_A_times_B(bs,A,B,W) do {                                            \
131:     PetscBLASInt _bbs;                                                                         \
132:     PetscScalar  _one = 1.0,_zero = 0.0;                                                       \
133:     PetscBLASIntCast(bs,&_bbs);                                                       \
134:     PetscArraycpy((W),(A),(bs)*(bs));                                                 \
138:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&(_bbs),&(_bbs),&(_bbs),&_one,(W),&(_bbs),(B),&(_bbs),&_zero,(A),&(_bbs))); \
139:   } while (0)

141: /*

143:     A = A - B * C  A_gets_A_minus_B_times_C

145:    A, B, C - square bs by bs arrays stored in column major order
146: */
147: #define PetscKernel_A_gets_A_minus_B_times_C(bs,A,B,C) do {                                    \
148:     PetscScalar    _mone = -1.0,_one = 1.0;                                                    \
149:     PetscBLASInt   _bbs;                                                                       \
150:     PetscBLASIntCast(bs,&_bbs);                                                       \
154:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&(_bbs),&(_bbs),&(_bbs),&_mone,(B),&(_bbs),(C),&(_bbs),&_one,(A),&(_bbs))); \
155:   } while (0)

157: /*

159:     A = A + B^T * C  A_gets_A_plus_Btranspose_times_C

161:    A, B, C - square bs by bs arrays stored in column major order
162: */
163: #define PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,A,B,C) do {                            \
164:     PetscScalar    _one = 1.0;                                                                 \
165:     PetscBLASInt   _bbs;                                                                       \
166:     PetscBLASIntCast(bs,&_bbs);                                                       \
170:     PetscStackCallBLAS("BLASgemm",BLASgemm_("T","N",&(_bbs),&(_bbs),&(_bbs),&_one,(B),&(_bbs),(C),&(_bbs),&_one,(A),&(_bbs))); \
171:   } while (0)

173: /*
174:     v = v + A^T w  v_gets_v_plus_Atranspose_times_w

176:    v - array of length bs
177:    A - square bs by bs array
178:    w - array of length bs
179: */
180: #define  PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,v,A,w) do {                           \
181:     PetscScalar    _one  = 1.0;                                                                \
182:     PetscBLASInt   _ione = 1, _bbs;                                                            \
183:     PetscBLASIntCast(bs,&_bbs);                                                       \
187:     PetscStackCallBLAS("BLASgemv",BLASgemv_("T",&(_bbs),&(_bbs),&_one,A,&(_bbs),w,&_ione,&_one,v,&_ione)); \
188:   } while (0)

190: /*
191:     v = v - A w  v_gets_v_minus_A_times_w

193:    v - array of length bs
194:    A - square bs by bs array
195:    w - array of length bs
196: */
197: #define  PetscKernel_v_gets_v_minus_A_times_w(bs,v,A,w) do {                                   \
198:     PetscScalar    _mone = -1.0,_one = 1.0;                                                    \
199:     PetscBLASInt   _ione = 1,_bbs;                                                             \
200:     PetscBLASIntCast(bs,&_bbs);                                                       \
204:     PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bbs),&_mone,A,&(_bbs),w,&_ione,&_one,v,&_ione)); \
205:   } while (0)

207: /*
208:     v = v - A w  v_gets_v_minus_transA_times_w

210:    v - array of length bs
211:    A - square bs by bs array
212:    w - array of length bs
213: */
214: #define  PetscKernel_v_gets_v_minus_transA_times_w(bs,v,A,w) do {                              \
215:     PetscScalar    _mone = -1.0,_one = 1.0;                                                    \
216:     PetscBLASInt   _ione = 1,_bbs;                                                             \
217:     PetscBLASIntCast(bs,&_bbs);                                                       \
221:     PetscStackCallBLAS("BLASgemv",BLASgemv_("T",&(_bbs),&(_bbs),&_mone,A,&(_bbs),w,&_ione,&_one,v,&_ione)); \
222:   } while (0)

224: /*
225:     v = v + A w  v_gets_v_plus_A_times_w

227:    v - array of length bs
228:    A - square bs by bs array
229:    w - array of length bs
230: */
231: #define  PetscKernel_v_gets_v_plus_A_times_w(bs,v,A,w) do {                                    \
232:     PetscScalar    _one  = 1.0;                                                                \
233:     PetscBLASInt   _ione = 1,_bbs;                                                             \
234:     PetscBLASIntCast(bs,&_bbs);                                                       \
238:     PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bbs),&_one,A,&(_bbs),w,&_ione,&_one,v,&_ione)); \
239:   } while (0)

241: /*
242:     v = v + A w  v_gets_v_plus_Ar_times_w

244:    v - array of length bs
245:    A - square bs by bs array
246:    w - array of length bs
247: */
248: #define  PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,v,A,w) do {                             \
249:     PetscScalar    _one  = 1.0;                                                                \
250:     PetscBLASInt   _ione = 1,_bbs,_bncols;                                                     \
251:     PetscBLASIntCast(bs,&_bbs);                                                       \
252:     PetscBLASIntCast(ncols,&_bncols);                                                 \
256:     PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bncols),&_one,A,&(_bbs),v,&_ione,&_one,w,&_ione)); \
257:   } while (0)

259: /*
260:     w <- w - A v  w_gets_w_minus_Ar_times_v

262:    v - array of length ncol
263:    A(bs,ncols)
264:    w - array of length bs
265: */
266: #define  PetscKernel_w_gets_w_minus_Ar_times_v(bs,ncols,w,A,v) do {                            \
267:     PetscScalar    _one  = 1.0,_mone = -1.0;                                                   \
268:     PetscBLASInt   _ione = 1,_bbs,_bncols;                                                     \
269:     PetscBLASIntCast(bs,&_bbs);                                                       \
270: PetscBLASIntCast(ncols,&_bncols);                                                     \
274:     PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bncols),&_mone,A,&(_bbs),v,&_ione,&_one,w,&_ione)); \
275:   } while (0)

277: /*
278:     w = A v   w_gets_A_times_v

280:    v - array of length bs
281:    A - square bs by bs array
282:    w - array of length bs
283:  */
284: #define PetscKernel_w_gets_A_times_v(bs,v,A,w) do {                                            \
285:     PetscScalar    _zero = 0.0,_one = 1.0;                                                     \
286:     PetscBLASInt   _ione = 1,_bbs;                                                             \
287:     PetscBLASIntCast(bs,&_bbs);                                                       \
291:     PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bbs),&_one,A,&(_bbs),v,&_ione,&_zero,w,&_ione)); \
292:   } while (0)

294: /*
295:     w = A' v   w_gets_transA_times_v

297:    v - array of length bs
298:    A - square bs by bs array
299:    w - array of length bs
300: */
301: #define PetscKernel_w_gets_transA_times_v(bs,v,A,w) do {                                       \
302:     PetscScalar    _zero = 0.0,_one = 1.0;                                                     \
303:     PetscBLASInt   _ione = 1,_bbs;                                                             \
304:     PetscBLASIntCast(bs,&_bbs);                                                       \
308:     PetscStackCallBLAS("BLASgemv",BLASgemv_("T",&(_bbs),&(_bbs),&_one,A,&(_bbs),v,&_ione,&_zero,w,&_ione)); \
309:   } while (0)

311: /*
312:         z = A*x
313: */
314: #define PetscKernel_w_gets_Ar_times_v(bs,ncols,x,A,z) do {                                     \
315:     PetscScalar    _one  = 1.0,_zero = 0.0;                                                    \
316:     PetscBLASInt   _ione = 1,_bbs,_bncols;                                                     \
317:     PetscBLASIntCast(bs,&_bbs);                                                       \
318:     PetscBLASIntCast(ncols,&_bncols);                                                 \
322:     PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&_bncols,&_one,A,&(_bbs),x,&_ione,&_zero,z,&_ione)); \
323:   } while (0)

325: /*
326:         z = trans(A)*x
327: */
328: #define PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,A,z) do {                        \
329:     PetscScalar    _one  = 1.0;                                                                \
330:     PetscBLASInt   _ione = 1,_bbs,_bncols;                                                     \
331:     PetscBLASIntCast(bs,&_bbs);                                                       \
332:     PetscBLASIntCast(ncols,&_bncols);                                                 \
336:     PetscStackCallBLAS("BLASgemv",BLASgemv_("T",&_bbs,&_bncols,&_one,A,&_bbs,x,&_ione,&_one,z,&_ione)); \
337:   } while (0)

339: #else /* !defined(PETSC_USE_REAL_MAT_SINGLE) */
340: /*
341:        Version that calls Fortran routines; can handle different precision
342:    of matrix (array) and vectors
343: */
344: /*
345:      These are Fortran kernels: They replace certain BLAS routines but
346:    have some arguments that may be single precision,rather than double
347:    These routines are provided in src/fortran/kernels/sgemv.F
348:    They are pretty pitiful but get the job done. The intention is
349:    that for important block sizes (currently 1,2,3,4,5,6,7) custom inlined
350:    code is used.
351: */

353: #if defined(PETSC_HAVE_FORTRAN_CAPS)
354: #define msgemv_  MSGEMV
355: #define msgemvp_ MSGEMVP
356: #define msgemvm_ MSGEMVM
357: #define msgemvt_ MSGEMVT
358: #define msgemmi_ MSGEMMI
359: #define msgemm_  MSGEMM
360: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
361: #define msgemv_  msgemv
362: #define msgemvp_ msgemvp
363: #define msgemvm_ msgemvm
364: #define msgemvt_ msgemvt
365: #define msgemmi_ msgemmi
366: #define msgemm_  msgemm
367: #endif

369: PETSC_EXTERN void msgemv_(PetscInt*,PetscInt*,MatScalar*,PetscScalar*,PetscScalar*);
370: PETSC_EXTERN void msgemvp_(PetscInt*,PetscInt*,MatScalar*,PetscScalar*,PetscScalar*);
371: PETSC_EXTERN void msgemvm_(PetscInt*,PetscInt*,MatScalar*,PetscScalar*,PetscScalar*);
372: PETSC_EXTERN void msgemvt_(PetscInt*,PetscInt*,MatScalar*,PetscScalar*,PetscScalar*);
373: PETSC_EXTERN void msgemmi_(PetscInt*,MatScalar*,MatScalar*,MatScalar*);
374: PETSC_EXTERN void msgemm_(PetscInt*,MatScalar*,MatScalar*,MatScalar*);

376: /*
377:       A = A * B   A_gets_A_times_B

379:    A, B - square bs by bs arrays stored in column major order
380:    W    - square bs by bs work array

382: */
383: #define PetscKernel_A_gets_A_times_B(bs,A,B,W) do {                     \
384:     PetscArraycpy((W),(A),(bs)*(bs));                          \
385:     msgemmi_(&bs,A,B,W);                                                \
386:   } while (0)

388: /*

390:     A = A - B * C  A_gets_A_minus_B_times_C

392:    A, B, C - square bs by bs arrays stored in column major order
393: */
394: #define PetscKernel_A_gets_A_minus_B_times_C(bs,A,B,C) msgemm_(&(bs),(A),(B),(C))

396: /*
397:     v = v - A w  v_gets_v_minus_A_times_w

399:    v - array of length bs
400:    A - square bs by bs array
401:    w - array of length bs
402: */
403: #define  PetscKernel_v_gets_v_minus_A_times_w(bs,v,A,w) msgemvm_(&(bs),&(bs),(A),(w),(v))

405: /*
406:     v = v + A w  v_gets_v_plus_A_times_w

408:    v - array of length bs
409:    A - square bs by bs array
410:    w - array of length bs
411: */
412: #define  PetscKernel_v_gets_v_plus_A_times_w(bs,v,A,w) msgemvp_(&(bs),&(bs),(A),(w),(v))

414: /*
415:     v = v + A w  v_gets_v_plus_Ar_times_w

417:    v - array of length bs
418:    A - square bs by bs array
419:    w - array of length bs
420: */
421: #define  PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncol,v,A,w) msgemvp_(&(bs),&(ncol),(A),(v),(w))

423: /*
424:     w = A v   w_gets_A_times_v

426:    v - array of length bs
427:    A - square bs by bs array
428:    w - array of length bs
429: */
430: #define PetscKernel_w_gets_A_times_v(bs,v,A,w) msgemv_(&(bs),&(bs),(A),(v),(w))

432: /*
433:         z = A*x
434: */
435: #define PetscKernel_w_gets_Ar_times_v(bs,ncols,x,A,z) msgemv_(&(bs),&(ncols),(A),(x),(z))

437: /*
438:         z = trans(A)*x
439: */
440: #define PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,A,z) msgemvt_(&(bs),&(ncols),(A),(x),(z))

442: /* These do not work yet */
443: #define PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,A,B,C)
444: #define PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,v,A,w)

446: #endif /* !defined(PETSC_USE_REAL_MAT_SINGLE) */

448: #endif /* __ILU_H */