Actual source code: baijfact11.c
2: /*
3: Factorization code for BAIJ format.
4: */
5: #include <../src/mat/impls/baij/seq/baij.h>
6: #include <petsc/private/kernels/blockinvert.h>
8: /* ------------------------------------------------------------*/
9: /*
10: Version for when blocks are 4 by 4
11: */
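/*
   Added annotation: the routine below performs the numeric phase of a block LU
   factorization one block row at a time.  For each block row i (in the ordering given
   by isrow/isicol) it

     1. scatters the 4x4 blocks of row r[i] of A into the dense work row rtmp at the
        column-permuted positions ic[],
     2. for each already-factored row `row' < i whose block is nonzero, forms the
        multiplier L(i,row) = A(i,row)*inv(U(row,row)) -- the diagonal blocks of the
        factor are kept already inverted -- and subtracts L(i,row)*U(row,:) from rtmp,
     3. gathers rtmp back into the factor and inverts the new diagonal block.

   Judging from the unrolled arithmetic, each 4x4 block is stored as 16 contiguous
   MatScalars in column-major order.
*/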
12: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info)
13: {
14: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
15: IS isrow = b->row,isicol = b->icol;
16: const PetscInt *r,*ic;
17: PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j;
18: PetscInt *ajtmpold,*ajtmp,nz,row;
19: PetscInt *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
20: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
21: MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
22: MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
23: MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
24: MatScalar m13,m14,m15,m16;
25: MatScalar *ba = b->a,*aa = a->a;
26: PetscBool pivotinblocks = b->pivotinblocks;
27: PetscReal shift = info->shiftamount;
28: PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE;
30: ISGetIndices(isrow,&r);
31: ISGetIndices(isicol,&ic);
32: PetscMalloc1(16*(n+1),&rtmp);
33: allowzeropivot = PetscNot(A->erroriffailure);
35: for (i=0; i<n; i++) {
36: nz = bi[i+1] - bi[i];
37: ajtmp = bj + bi[i];
38: for (j=0; j<nz; j++) {
39: x = rtmp+16*ajtmp[j];
40: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
41: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
42: }
43: /* load in initial (unfactored row) */
44: idx = r[i];
45: nz = ai[idx+1] - ai[idx];
46: ajtmpold = aj + ai[idx];
47: v = aa + 16*ai[idx];
48: for (j=0; j<nz; j++) {
49: x = rtmp+16*ic[ajtmpold[j]];
50: x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
51: x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
52: x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
53: x[14] = v[14]; x[15] = v[15];
54: v += 16;
55: }
56: row = *ajtmp++;
57: while (row < i) {
58: pc = rtmp + 16*row;
59: p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
60: p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
61: p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
62: p15 = pc[14]; p16 = pc[15];
63: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
64: p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
65: p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
66: || p16 != 0.0) {
67: pv = ba + 16*diag_offset[row];
68: pj = bj + diag_offset[row] + 1;
69: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
70: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
71: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
72: x15 = pv[14]; x16 = pv[15];
73: pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4;
74: pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
75: pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
76: pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;
78: pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8;
79: pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
80: pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
81: pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;
83: pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12;
84: pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
85: pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
86: pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;
88: pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16;
89: pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
90: pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
91: pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;
93: nz = bi[row+1] - diag_offset[row] - 1;
94: pv += 16;
95: for (j=0; j<nz; j++) {
96: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
97: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
98: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
99: x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
100: x = rtmp + 16*pj[j];
101: x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4;
102: x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
103: x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
104: x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;
106: x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8;
107: x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
108: x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
109: x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;
111: x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12;
112: x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
113: x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
114: x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
116: x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16;
117: x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
118: x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
119: x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;
121: pv += 16;
122: }
123: PetscLogFlops(128.0*nz+112.0);
124: }
125: row = *ajtmp++;
126: }
127: /* finished row so stick it into b->a */
128: pv = ba + 16*bi[i];
129: pj = bj + bi[i];
130: nz = bi[i+1] - bi[i];
131: for (j=0; j<nz; j++) {
132: x = rtmp+16*pj[j];
133: pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
134: pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
135: pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
136: pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
137: pv += 16;
138: }
139: /* invert diagonal block */
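    /* Added note: b->pivotinblocks selects between the pivoting 4x4 inversion kernel,
       which can detect and report a zero pivot, and the cheaper no-pivot variant. */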
140: w = ba + 16*diag_offset[i];
141: if (pivotinblocks) {
142: PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);
143: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
144: } else {
145: PetscKernel_A_gets_inverse_A_4_nopivot(w);
146: }
147: }
149: PetscFree(rtmp);
150: ISRestoreIndices(isicol,&ic);
151: ISRestoreIndices(isrow,&r);
153: C->ops->solve = MatSolve_SeqBAIJ_4_inplace;
154: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
155: C->assembled = PETSC_TRUE;
157: PetscLogFlops(1.333333333333*4*4*4*b->mbs); /* from inverting diagonal blocks */
158: return 0;
159: }
161: /* MatLUFactorNumeric_SeqBAIJ_4 -
162: copied from MatLUFactorNumeric_SeqBAIJ_N_inplace(), with hand-written 4x4 versions of
163: PetscKernel_A_gets_A_times_B()
164: PetscKernel_A_gets_A_minus_B_times_C()
165: PetscKernel_A_gets_inverse_A()
166: */
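/*
   Added annotation: a rough scalar sketch (not the actual PETSc kernels, just their
   apparent effect) of the hardwired 4x4 block kernels used below, assuming blocks are
   stored column-major so that entry (r,c) of a block lives at index c*4+r:

     PetscKernel_A_gets_A_times_B_4(A,B,W)         : W <- A;  A(r,c) = sum_k W(r,k)*B(k,c)
     PetscKernel_A_gets_A_minus_B_times_C_4(A,B,C) : A(r,c) -= sum_k B(r,k)*C(k,c)
     PetscKernel_A_gets_inverse_A_4(A,shift,...)   : A <- inv(A), with `shift' available to
                                                     perturb the diagonal and a flag that
                                                     reports a detected zero pivot

   For example, the "A gets A minus B times C" update written as plain loops:

     for (c=0; c<4; c++)
       for (r=0; r<4; r++)
         for (k=0; k<4; k++) A[c*4+r] -= B[k*4+r]*C[c*4+k];
*/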
168: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info)
169: {
170: Mat C = B;
171: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
172: IS isrow = b->row,isicol = b->icol;
173: const PetscInt *r,*ic;
174: PetscInt i,j,k,nz,nzL,row;
175: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
176: const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
177: MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
178: PetscInt flg;
179: PetscReal shift;
180: PetscBool allowzeropivot,zeropivotdetected;
182: allowzeropivot = PetscNot(A->erroriffailure);
183: ISGetIndices(isrow,&r);
184: ISGetIndices(isicol,&ic);
186: if (info->shifttype == MAT_SHIFT_NONE) {
187: shift = 0;
188: } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
189: shift = info->shiftamount;
190: }
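  /* Added note: the shift chosen above is handed to PetscKernel_A_gets_inverse_A_4(),
     which (for MAT_SHIFT_INBLOCKS) uses it to perturb a block's diagonal so the 4x4
     inversion does not fail on a zero pivot; with MAT_SHIFT_NONE no perturbation is
     applied. */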
192: /* generate work space needed by the factorization */
193: PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);
194: PetscArrayzero(rtmp,bs2*n);
196: for (i=0; i<n; i++) {
197: /* zero rtmp */
198: /* L part */
199: nz = bi[i+1] - bi[i];
200: bjtmp = bj + bi[i];
201: for (j=0; j<nz; j++) {
202: PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);
203: }
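    /* Added note: in this (non-inplace) factor layout the U blocks of row i occupy
       positions bdiag[i+1]+1 .. bdiag[i] of b->j/b->a, with the diagonal block stored
       at bdiag[i]; bdiag therefore decreases with i, and bdiag[i]-bdiag[i+1]-1 counts
       the strictly upper blocks of the row. */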
205: /* U part */
206: nz = bdiag[i]-bdiag[i+1];
207: bjtmp = bj + bdiag[i+1]+1;
208: for (j=0; j<nz; j++) {
209: PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);
210: }
212: /* load in initial (unfactored row) */
213: nz = ai[r[i]+1] - ai[r[i]];
214: ajtmp = aj + ai[r[i]];
215: v = aa + bs2*ai[r[i]];
216: for (j=0; j<nz; j++) {
217: PetscArraycpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2);
218: }
220: /* elimination */
221: bjtmp = bj + bi[i];
222: nzL = bi[i+1] - bi[i];
223: for (k=0; k < nzL; k++) {
224: row = bjtmp[k];
225: pc = rtmp + bs2*row;
226: for (flg=0,j=0; j<bs2; j++) {
227: if (pc[j]!=0.0) {
228: flg = 1;
229: break;
230: }
231: }
232: if (flg) {
233: pv = b->a + bs2*bdiag[row];
234: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
235: PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);
237: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
238: pv = b->a + bs2*(bdiag[row+1]+1);
239: nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
240: for (j=0; j<nz; j++) {
241: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
242: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
243: v = rtmp + bs2*pj[j];
244: PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);
245: pv += bs2;
246: }
247: PetscLogFlops(128.0*nz+112.0); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
248: }
249: }
251: /* finished row so stick it into b->a */
252: /* L part */
253: pv = b->a + bs2*bi[i];
254: pj = b->j + bi[i];
255: nz = bi[i+1] - bi[i];
256: for (j=0; j<nz; j++) {
257: PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);
258: }
260: /* Mark diagonal and invert diagonal for simpler triangular solves */
261: pv = b->a + bs2*bdiag[i];
262: pj = b->j + bdiag[i];
263: PetscArraycpy(pv,rtmp+bs2*pj[0],bs2);
264: PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);
265: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
267: /* U part */
268: pv = b->a + bs2*(bdiag[i+1]+1);
269: pj = b->j + bdiag[i+1]+1;
270: nz = bdiag[i] - bdiag[i+1] - 1;
271: for (j=0; j<nz; j++) {
272: PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);
273: }
274: }
276: PetscFree2(rtmp,mwork);
277: ISRestoreIndices(isicol,&ic);
278: ISRestoreIndices(isrow,&r);
280: C->ops->solve = MatSolve_SeqBAIJ_4;
281: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
282: C->assembled = PETSC_TRUE;
284: PetscLogFlops(1.333333333333*4*4*4*n); /* from inverting diagonal blocks */
285: return 0;
286: }
288: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
289: {
290: /*
291: Default Version for when blocks are 4 by 4 Using natural ordering
292: */
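  /* Added note: identical to MatLUFactorNumeric_SeqBAIJ_4_inplace() above except that
     the natural ordering makes the row/column permutations the identity, so the
     isrow/isicol index sets and the r[]/ic[] indexing are dropped. */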
293: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
294: PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j;
295: PetscInt *ajtmpold,*ajtmp,nz,row;
296: PetscInt *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
297: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
298: MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
299: MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
300: MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
301: MatScalar m13,m14,m15,m16;
302: MatScalar *ba = b->a,*aa = a->a;
303: PetscBool pivotinblocks = b->pivotinblocks;
304: PetscReal shift = info->shiftamount;
305: PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE;
307: allowzeropivot = PetscNot(A->erroriffailure);
308: PetscMalloc1(16*(n+1),&rtmp);
310: for (i=0; i<n; i++) {
311: nz = bi[i+1] - bi[i];
312: ajtmp = bj + bi[i];
313: for (j=0; j<nz; j++) {
314: x = rtmp+16*ajtmp[j];
315: x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
316: x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
317: }
318: /* load in initial (unfactored row) */
319: nz = ai[i+1] - ai[i];
320: ajtmpold = aj + ai[i];
321: v = aa + 16*ai[i];
322: for (j=0; j<nz; j++) {
323: x = rtmp+16*ajtmpold[j];
324: x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
325: x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
326: x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
327: x[14] = v[14]; x[15] = v[15];
328: v += 16;
329: }
330: row = *ajtmp++;
331: while (row < i) {
332: pc = rtmp + 16*row;
333: p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
334: p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
335: p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
336: p15 = pc[14]; p16 = pc[15];
337: if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
338: p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
339: p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
340: || p16 != 0.0) {
341: pv = ba + 16*diag_offset[row];
342: pj = bj + diag_offset[row] + 1;
343: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
344: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
345: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
346: x15 = pv[14]; x16 = pv[15];
347: pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4;
348: pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
349: pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
350: pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;
352: pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8;
353: pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
354: pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
355: pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;
357: pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12;
358: pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
359: pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
360: pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;
362: pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16;
363: pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
364: pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
365: pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;
366: nz = bi[row+1] - diag_offset[row] - 1;
367: pv += 16;
368: for (j=0; j<nz; j++) {
369: x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
370: x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
371: x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
372: x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
373: x = rtmp + 16*pj[j];
374: x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4;
375: x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
376: x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
377: x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;
379: x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8;
380: x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
381: x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
382: x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;
384: x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12;
385: x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
386: x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
387: x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;
389: x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16;
390: x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
391: x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
392: x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;
394: pv += 16;
395: }
396: PetscLogFlops(128.0*nz+112.0);
397: }
398: row = *ajtmp++;
399: }
400: /* finished row so stick it into b->a */
401: pv = ba + 16*bi[i];
402: pj = bj + bi[i];
403: nz = bi[i+1] - bi[i];
404: for (j=0; j<nz; j++) {
405: x = rtmp+16*pj[j];
406: pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
407: pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
408: pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
409: pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
410: pv += 16;
411: }
412: /* invert diagonal block */
413: w = ba + 16*diag_offset[i];
414: if (pivotinblocks) {
415: PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);
416: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
417: } else {
418: PetscKernel_A_gets_inverse_A_4_nopivot(w);
419: }
420: }
422: PetscFree(rtmp);
424: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
425: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
426: C->assembled = PETSC_TRUE;
428: PetscLogFlops(1.333333333333*4*4*4*b->mbs); /* from inverting diagonal blocks */
429: return 0;
430: }
432: /*
433: MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
434: copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
435: */
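/* Added note: same elimination as MatLUFactorNumeric_SeqBAIJ_4() above, specialized to
   the natural ordering: rows are loaded straight from ai[i]/aj[] with no r[]/ic[]
   permutation arrays. */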
436: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
437: {
438: Mat C = B;
439: Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
440: PetscInt i,j,k,nz,nzL,row;
441: const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
442: const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
443: MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
444: PetscInt flg;
445: PetscReal shift;
446: PetscBool allowzeropivot,zeropivotdetected;
448: allowzeropivot = PetscNot(A->erroriffailure);
450: /* generate work space needed by the factorization */
451: PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);
452: PetscArrayzero(rtmp,bs2*n);
454: if (info->shifttype == MAT_SHIFT_NONE) {
455: shift = 0;
456: } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
457: shift = info->shiftamount;
458: }
460: for (i=0; i<n; i++) {
461: /* zero rtmp */
462: /* L part */
463: nz = bi[i+1] - bi[i];
464: bjtmp = bj + bi[i];
465: for (j=0; j<nz; j++) {
466: PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);
467: }
469: /* U part */
470: nz = bdiag[i] - bdiag[i+1];
471: bjtmp = bj + bdiag[i+1]+1;
472: for (j=0; j<nz; j++) {
473: PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);
474: }
476: /* load in initial (unfactored row) */
477: nz = ai[i+1] - ai[i];
478: ajtmp = aj + ai[i];
479: v = aa + bs2*ai[i];
480: for (j=0; j<nz; j++) {
481: PetscArraycpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2);
482: }
484: /* elimination */
485: bjtmp = bj + bi[i];
486: nzL = bi[i+1] - bi[i];
487: for (k=0; k < nzL; k++) {
488: row = bjtmp[k];
489: pc = rtmp + bs2*row;
490: for (flg=0,j=0; j<bs2; j++) {
491: if (pc[j]!=0.0) {
492: flg = 1;
493: break;
494: }
495: }
496: if (flg) {
497: pv = b->a + bs2*bdiag[row];
498: /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
499: PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);
501: pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
502: pv = b->a + bs2*(bdiag[row+1]+1);
503: nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
504: for (j=0; j<nz; j++) {
505: /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
506: /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
507: v = rtmp + bs2*pj[j];
508: PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);
509: pv += bs2;
510: }
511: PetscLogFlops(128.0*nz+112.0); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
512: }
513: }
515: /* finished row so stick it into b->a */
516: /* L part */
517: pv = b->a + bs2*bi[i];
518: pj = b->j + bi[i];
519: nz = bi[i+1] - bi[i];
520: for (j=0; j<nz; j++) {
521: PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);
522: }
524: /* Mark diagonal and invert diagonal for simpler triangular solves */
525: pv = b->a + bs2*bdiag[i];
526: pj = b->j + bdiag[i];
527: PetscArraycpy(pv,rtmp+bs2*pj[0],bs2);
528: PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);
529: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
531: /* U part */
532: pv = b->a + bs2*(bdiag[i+1]+1);
533: pj = b->j + bdiag[i+1]+1;
534: nz = bdiag[i] - bdiag[i+1] - 1;
535: for (j=0; j<nz; j++) {
536: PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);
537: }
538: }
539: PetscFree2(rtmp,mwork);
541: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering;
542: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
543: C->assembled = PETSC_TRUE;
545: PetscLogFlops(1.333333333333*4*4*4*n); /* from inverting diagonal blocks */
546: return 0;
547: }
549: #if defined(PETSC_HAVE_SSE)
551: #include PETSC_HAVE_SSE
553: /* SSE Version for when blocks are 4 by 4 Using natural ordering */
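/* Added note: this path assumes MatScalar is single-precision real -- every *_PS
   operation acts on 4 packed floats, so one 128-bit XMM register holds one 4x1 column
   of a block and a full 4x4 block is moved with four LOADL/LOADH (or STOREL/STOREH)
   pairs. */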
554: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat C,Mat A,const MatFactorInfo *info)
555: {
556: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
557: int i,j,n = a->mbs;
558: int *bj = b->j,*bjtmp,*pj;
559: int row;
560: int *ajtmpold,nz,*bi=b->i;
561: int *diag_offset = b->diag,*ai=a->i,*aj=a->j;
562: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
563: MatScalar *ba = b->a,*aa = a->a;
564: int nonzero=0;
565: PetscBool pivotinblocks = b->pivotinblocks;
566: PetscReal shift = info->shiftamount;
567: PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE;
569: allowzeropivot = PetscNot(A->erroriffailure);
570: SSE_SCOPE_BEGIN;
574: PetscMalloc1(16*(n+1),&rtmp);
576: /* if ((unsigned long)bj==(unsigned long)aj) { */
577: /* colscale = 4; */
578: /* } */
579: for (i=0; i<n; i++) {
580: nz = bi[i+1] - bi[i];
581: bjtmp = bj + bi[i];
582: /* zero out the 4x4 block accumulators */
583: /* zero out one register */
584: XOR_PS(XMM7,XMM7);
585: for (j=0; j<nz; j++) {
586: x = rtmp+16*bjtmp[j];
587: /* x = rtmp+4*bjtmp[j]; */
588: SSE_INLINE_BEGIN_1(x)
589: /* Copy zero register to memory locations */
590: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
591: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
592: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
593: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
594: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
595: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
596: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
597: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
598: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
599: SSE_INLINE_END_1;
600: }
601: /* load in initial (unfactored row) */
602: nz = ai[i+1] - ai[i];
603: ajtmpold = aj + ai[i];
604: v = aa + 16*ai[i];
605: for (j=0; j<nz; j++) {
606: x = rtmp+16*ajtmpold[j];
607: /* x = rtmp+colscale*ajtmpold[j]; */
608: /* Copy v block into x block */
609: SSE_INLINE_BEGIN_2(v,x)
610: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
611: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
612: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
614: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
615: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
617: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
618: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
620: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
621: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
623: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
624: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
626: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
627: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
629: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
630: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
632: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
633: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
634: SSE_INLINE_END_2;
636: v += 16;
637: }
638: /* row = (*bjtmp++)/4; */
639: row = *bjtmp++;
640: while (row < i) {
641: pc = rtmp + 16*row;
642: SSE_INLINE_BEGIN_1(pc)
643: /* Load block from lower triangle */
644: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
645: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
646: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
648: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
649: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
651: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
652: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
654: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
655: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
657: /* Compare block to zero block */
659: SSE_COPY_PS(XMM4,XMM7)
660: SSE_CMPNEQ_PS(XMM4,XMM0)
662: SSE_COPY_PS(XMM5,XMM7)
663: SSE_CMPNEQ_PS(XMM5,XMM1)
665: SSE_COPY_PS(XMM6,XMM7)
666: SSE_CMPNEQ_PS(XMM6,XMM2)
668: SSE_CMPNEQ_PS(XMM7,XMM3)
670: /* Reduce the comparisons to one SSE register */
671: SSE_OR_PS(XMM6,XMM7)
672: SSE_OR_PS(XMM5,XMM4)
673: SSE_OR_PS(XMM5,XMM6)
674: SSE_INLINE_END_1;
676: /* Reduce the one SSE register to an integer register for branching */
677: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
678: MOVEMASK(nonzero,XMM5);
680: /* If block is nonzero ... */
681: if (nonzero) {
682: pv = ba + 16*diag_offset[row];
683: PREFETCH_L1(&pv[16]);
684: pj = bj + diag_offset[row] + 1;
686: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
687: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
688: /* but the diagonal was inverted already */
689: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
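          /* Added note: the product is accumulated one column at a time in XMM4, XMM5,
             XMM6 and XMM0, written back over pc, and then copied into XMM0-XMM3 so the
             row update below can reuse the multiplier columnwise. */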
691: SSE_INLINE_BEGIN_2(pv,pc)
692: /* Column 0, product is accumulated in XMM4 */
693: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
694: SSE_SHUFFLE(XMM4,XMM4,0x00)
695: SSE_MULT_PS(XMM4,XMM0)
697: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
698: SSE_SHUFFLE(XMM5,XMM5,0x00)
699: SSE_MULT_PS(XMM5,XMM1)
700: SSE_ADD_PS(XMM4,XMM5)
702: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
703: SSE_SHUFFLE(XMM6,XMM6,0x00)
704: SSE_MULT_PS(XMM6,XMM2)
705: SSE_ADD_PS(XMM4,XMM6)
707: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
708: SSE_SHUFFLE(XMM7,XMM7,0x00)
709: SSE_MULT_PS(XMM7,XMM3)
710: SSE_ADD_PS(XMM4,XMM7)
712: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
713: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
715: /* Column 1, product is accumulated in XMM5 */
716: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
717: SSE_SHUFFLE(XMM5,XMM5,0x00)
718: SSE_MULT_PS(XMM5,XMM0)
720: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
721: SSE_SHUFFLE(XMM6,XMM6,0x00)
722: SSE_MULT_PS(XMM6,XMM1)
723: SSE_ADD_PS(XMM5,XMM6)
725: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
726: SSE_SHUFFLE(XMM7,XMM7,0x00)
727: SSE_MULT_PS(XMM7,XMM2)
728: SSE_ADD_PS(XMM5,XMM7)
730: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
731: SSE_SHUFFLE(XMM6,XMM6,0x00)
732: SSE_MULT_PS(XMM6,XMM3)
733: SSE_ADD_PS(XMM5,XMM6)
735: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
736: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
738: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
740: /* Column 2, product is accumulated in XMM6 */
741: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
742: SSE_SHUFFLE(XMM6,XMM6,0x00)
743: SSE_MULT_PS(XMM6,XMM0)
745: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
746: SSE_SHUFFLE(XMM7,XMM7,0x00)
747: SSE_MULT_PS(XMM7,XMM1)
748: SSE_ADD_PS(XMM6,XMM7)
750: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
751: SSE_SHUFFLE(XMM7,XMM7,0x00)
752: SSE_MULT_PS(XMM7,XMM2)
753: SSE_ADD_PS(XMM6,XMM7)
755: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
756: SSE_SHUFFLE(XMM7,XMM7,0x00)
757: SSE_MULT_PS(XMM7,XMM3)
758: SSE_ADD_PS(XMM6,XMM7)
760: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
761: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
763: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
764: /* Column 3, product is accumulated in XMM0 */
765: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
766: SSE_SHUFFLE(XMM7,XMM7,0x00)
767: SSE_MULT_PS(XMM0,XMM7)
769: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
770: SSE_SHUFFLE(XMM7,XMM7,0x00)
771: SSE_MULT_PS(XMM1,XMM7)
772: SSE_ADD_PS(XMM0,XMM1)
774: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
775: SSE_SHUFFLE(XMM1,XMM1,0x00)
776: SSE_MULT_PS(XMM1,XMM2)
777: SSE_ADD_PS(XMM0,XMM1)
779: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
780: SSE_SHUFFLE(XMM7,XMM7,0x00)
781: SSE_MULT_PS(XMM3,XMM7)
782: SSE_ADD_PS(XMM0,XMM3)
784: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
785: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
787: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
788: /* This is code to be maintained and read by humans after all. */
789: /* Copy Multiplier Col 3 into XMM3 */
790: SSE_COPY_PS(XMM3,XMM0)
791: /* Copy Multiplier Col 2 into XMM2 */
792: SSE_COPY_PS(XMM2,XMM6)
793: /* Copy Multiplier Col 1 into XMM1 */
794: SSE_COPY_PS(XMM1,XMM5)
795: /* Copy Multiplier Col 0 into XMM0 */
796: SSE_COPY_PS(XMM0,XMM4)
797: SSE_INLINE_END_2;
799: /* Update the row: */
800: nz = bi[row+1] - diag_offset[row] - 1;
801: pv += 16;
802: for (j=0; j<nz; j++) {
803: PREFETCH_L1(&pv[16]);
804: x = rtmp + 16*pj[j];
805: /* x = rtmp + 4*pj[j]; */
807: /* X:=X-M*PV, One column at a time */
808: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
809: SSE_INLINE_BEGIN_2(x,pv)
810: /* Load First Column of X*/
811: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
812: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
814: /* Matrix-Vector Product: */
815: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
816: SSE_SHUFFLE(XMM5,XMM5,0x00)
817: SSE_MULT_PS(XMM5,XMM0)
818: SSE_SUB_PS(XMM4,XMM5)
820: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
821: SSE_SHUFFLE(XMM6,XMM6,0x00)
822: SSE_MULT_PS(XMM6,XMM1)
823: SSE_SUB_PS(XMM4,XMM6)
825: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
826: SSE_SHUFFLE(XMM7,XMM7,0x00)
827: SSE_MULT_PS(XMM7,XMM2)
828: SSE_SUB_PS(XMM4,XMM7)
830: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
831: SSE_SHUFFLE(XMM5,XMM5,0x00)
832: SSE_MULT_PS(XMM5,XMM3)
833: SSE_SUB_PS(XMM4,XMM5)
835: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
836: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
838: /* Second Column */
839: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
840: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
842: /* Matrix-Vector Product: */
843: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
844: SSE_SHUFFLE(XMM6,XMM6,0x00)
845: SSE_MULT_PS(XMM6,XMM0)
846: SSE_SUB_PS(XMM5,XMM6)
848: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
849: SSE_SHUFFLE(XMM7,XMM7,0x00)
850: SSE_MULT_PS(XMM7,XMM1)
851: SSE_SUB_PS(XMM5,XMM7)
853: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
854: SSE_SHUFFLE(XMM4,XMM4,0x00)
855: SSE_MULT_PS(XMM4,XMM2)
856: SSE_SUB_PS(XMM5,XMM4)
858: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
859: SSE_SHUFFLE(XMM6,XMM6,0x00)
860: SSE_MULT_PS(XMM6,XMM3)
861: SSE_SUB_PS(XMM5,XMM6)
863: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
864: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
866: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
868: /* Third Column */
869: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
870: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
872: /* Matrix-Vector Product: */
873: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
874: SSE_SHUFFLE(XMM7,XMM7,0x00)
875: SSE_MULT_PS(XMM7,XMM0)
876: SSE_SUB_PS(XMM6,XMM7)
878: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
879: SSE_SHUFFLE(XMM4,XMM4,0x00)
880: SSE_MULT_PS(XMM4,XMM1)
881: SSE_SUB_PS(XMM6,XMM4)
883: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
884: SSE_SHUFFLE(XMM5,XMM5,0x00)
885: SSE_MULT_PS(XMM5,XMM2)
886: SSE_SUB_PS(XMM6,XMM5)
888: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
889: SSE_SHUFFLE(XMM7,XMM7,0x00)
890: SSE_MULT_PS(XMM7,XMM3)
891: SSE_SUB_PS(XMM6,XMM7)
893: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
894: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
896: /* Fourth Column */
897: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
898: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
900: /* Matrix-Vector Product: */
901: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
902: SSE_SHUFFLE(XMM5,XMM5,0x00)
903: SSE_MULT_PS(XMM5,XMM0)
904: SSE_SUB_PS(XMM4,XMM5)
906: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
907: SSE_SHUFFLE(XMM6,XMM6,0x00)
908: SSE_MULT_PS(XMM6,XMM1)
909: SSE_SUB_PS(XMM4,XMM6)
911: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
912: SSE_SHUFFLE(XMM7,XMM7,0x00)
913: SSE_MULT_PS(XMM7,XMM2)
914: SSE_SUB_PS(XMM4,XMM7)
916: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
917: SSE_SHUFFLE(XMM5,XMM5,0x00)
918: SSE_MULT_PS(XMM5,XMM3)
919: SSE_SUB_PS(XMM4,XMM5)
921: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
922: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
923: SSE_INLINE_END_2;
924: pv += 16;
925: }
926: PetscLogFlops(128.0*nz+112.0);
927: }
928: row = *bjtmp++;
929: /* row = (*bjtmp++)/4; */
930: }
931: /* finished row so stick it into b->a */
932: pv = ba + 16*bi[i];
933: pj = bj + bi[i];
934: nz = bi[i+1] - bi[i];
936: /* Copy x block back into pv block */
937: for (j=0; j<nz; j++) {
938: x = rtmp+16*pj[j];
939: /* x = rtmp+4*pj[j]; */
941: SSE_INLINE_BEGIN_2(x,pv)
942: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
943: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
944: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
946: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
947: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
949: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
950: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
952: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
953: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
955: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
956: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
958: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
959: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
961: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
962: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
964: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
965: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
966: SSE_INLINE_END_2;
967: pv += 16;
968: }
969: /* invert diagonal block */
970: w = ba + 16*diag_offset[i];
971: if (pivotinblocks) {
972: PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);
973: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
974: } else {
975: PetscKernel_A_gets_inverse_A_4_nopivot(w);
976: }
977: /* PetscKernel_A_gets_inverse_A_4_SSE(w); */
978: /* Note: since Cramer's rule is used, the flop count below may be unfairly high or low */
979: }
981: PetscFree(rtmp);
983: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
984: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
985: C->assembled = PETSC_TRUE;
987: PetscLogFlops(1.333333333333*4*4*4*b->mbs);
988: /* Flop Count from inverting diagonal blocks */
989: SSE_SCOPE_END;
990: return 0;
991: }
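/* The _usj variants below are the same SSE kernel except that the factor's block
   column indices are read through an unsigned short pointer ("usj" presumably stands
   for "unsigned short j"), which halves index memory traffic provided b->j really has
   been stored in 16-bit form by the preceding symbolic factorization. */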
993: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C,const MatFactorInfo *info)
994: {
995: Mat A = C;
996: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
997: int i,j,n = a->mbs;
998: unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
999: unsigned short *aj = (unsigned short*)(a->j),*ajtmp;
1000: unsigned int row;
1001: int nz,*bi=b->i;
1002: int *diag_offset = b->diag,*ai=a->i;
1003: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
1004: MatScalar *ba = b->a,*aa = a->a;
1005: int nonzero=0;
1006: PetscBool pivotinblocks = b->pivotinblocks;
1007: PetscReal shift = info->shiftamount;
1008: PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE;
1010: allowzeropivot = PetscNot(A->erroriffailure);
1011: SSE_SCOPE_BEGIN;
1015: PetscMalloc1(16*(n+1),&rtmp);
1017: /* if ((unsigned long)bj==(unsigned long)aj) { */
1018: /* colscale = 4; */
1019: /* } */
1021: for (i=0; i<n; i++) {
1022: nz = bi[i+1] - bi[i];
1023: bjtmp = bj + bi[i];
1024: /* zero out the 4x4 block accumulators */
1025: /* zero out one register */
1026: XOR_PS(XMM7,XMM7);
1027: for (j=0; j<nz; j++) {
1028: x = rtmp+16*((unsigned int)bjtmp[j]);
1029: /* x = rtmp+4*bjtmp[j]; */
1030: SSE_INLINE_BEGIN_1(x)
1031: /* Copy zero register to memory locations */
1032: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1033: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1034: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1035: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1036: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1037: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1038: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1039: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1040: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1041: SSE_INLINE_END_1;
1042: }
1043: /* load in initial (unfactored row) */
1044: nz = ai[i+1] - ai[i];
1045: ajtmp = aj + ai[i];
1046: v = aa + 16*ai[i];
1047: for (j=0; j<nz; j++) {
1048: x = rtmp+16*((unsigned int)ajtmp[j]);
1049: /* x = rtmp+colscale*ajtmp[j]; */
1050: /* Copy v block into x block */
1051: SSE_INLINE_BEGIN_2(v,x)
1052: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1053: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1054: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1056: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1057: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1059: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1060: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1062: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1063: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1065: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1066: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1068: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1069: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1071: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1072: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1074: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1075: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1076: SSE_INLINE_END_2;
1078: v += 16;
1079: }
1080: /* row = (*bjtmp++)/4; */
1081: row = (unsigned int)(*bjtmp++);
1082: while (row < i) {
1083: pc = rtmp + 16*row;
1084: SSE_INLINE_BEGIN_1(pc)
1085: /* Load block from lower triangle */
1086: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1087: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1088: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1090: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1091: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1093: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1094: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1096: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1097: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1099: /* Compare block to zero block */
1101: SSE_COPY_PS(XMM4,XMM7)
1102: SSE_CMPNEQ_PS(XMM4,XMM0)
1104: SSE_COPY_PS(XMM5,XMM7)
1105: SSE_CMPNEQ_PS(XMM5,XMM1)
1107: SSE_COPY_PS(XMM6,XMM7)
1108: SSE_CMPNEQ_PS(XMM6,XMM2)
1110: SSE_CMPNEQ_PS(XMM7,XMM3)
1112: /* Reduce the comparisons to one SSE register */
1113: SSE_OR_PS(XMM6,XMM7)
1114: SSE_OR_PS(XMM5,XMM4)
1115: SSE_OR_PS(XMM5,XMM6)
1116: SSE_INLINE_END_1;
1118: /* Reduce the one SSE register to an integer register for branching */
1119: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1120: MOVEMASK(nonzero,XMM5);
1122: /* If block is nonzero ... */
1123: if (nonzero) {
1124: pv = ba + 16*diag_offset[row];
1125: PREFETCH_L1(&pv[16]);
1126: pj = bj + diag_offset[row] + 1;
1128: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1129: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1130: /* but the diagonal was inverted already */
1131: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1133: SSE_INLINE_BEGIN_2(pv,pc)
1134: /* Column 0, product is accumulated in XMM4 */
1135: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1136: SSE_SHUFFLE(XMM4,XMM4,0x00)
1137: SSE_MULT_PS(XMM4,XMM0)
1139: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1140: SSE_SHUFFLE(XMM5,XMM5,0x00)
1141: SSE_MULT_PS(XMM5,XMM1)
1142: SSE_ADD_PS(XMM4,XMM5)
1144: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1145: SSE_SHUFFLE(XMM6,XMM6,0x00)
1146: SSE_MULT_PS(XMM6,XMM2)
1147: SSE_ADD_PS(XMM4,XMM6)
1149: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1150: SSE_SHUFFLE(XMM7,XMM7,0x00)
1151: SSE_MULT_PS(XMM7,XMM3)
1152: SSE_ADD_PS(XMM4,XMM7)
1154: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1155: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1157: /* Column 1, product is accumulated in XMM5 */
1158: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1159: SSE_SHUFFLE(XMM5,XMM5,0x00)
1160: SSE_MULT_PS(XMM5,XMM0)
1162: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1163: SSE_SHUFFLE(XMM6,XMM6,0x00)
1164: SSE_MULT_PS(XMM6,XMM1)
1165: SSE_ADD_PS(XMM5,XMM6)
1167: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1168: SSE_SHUFFLE(XMM7,XMM7,0x00)
1169: SSE_MULT_PS(XMM7,XMM2)
1170: SSE_ADD_PS(XMM5,XMM7)
1172: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1173: SSE_SHUFFLE(XMM6,XMM6,0x00)
1174: SSE_MULT_PS(XMM6,XMM3)
1175: SSE_ADD_PS(XMM5,XMM6)
1177: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1178: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1180: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1182: /* Column 2, product is accumulated in XMM6 */
1183: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1184: SSE_SHUFFLE(XMM6,XMM6,0x00)
1185: SSE_MULT_PS(XMM6,XMM0)
1187: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1188: SSE_SHUFFLE(XMM7,XMM7,0x00)
1189: SSE_MULT_PS(XMM7,XMM1)
1190: SSE_ADD_PS(XMM6,XMM7)
1192: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1193: SSE_SHUFFLE(XMM7,XMM7,0x00)
1194: SSE_MULT_PS(XMM7,XMM2)
1195: SSE_ADD_PS(XMM6,XMM7)
1197: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1198: SSE_SHUFFLE(XMM7,XMM7,0x00)
1199: SSE_MULT_PS(XMM7,XMM3)
1200: SSE_ADD_PS(XMM6,XMM7)
1202: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1203: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1205: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1206: /* Column 3, product is accumulated in XMM0 */
1207: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1208: SSE_SHUFFLE(XMM7,XMM7,0x00)
1209: SSE_MULT_PS(XMM0,XMM7)
1211: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1212: SSE_SHUFFLE(XMM7,XMM7,0x00)
1213: SSE_MULT_PS(XMM1,XMM7)
1214: SSE_ADD_PS(XMM0,XMM1)
1216: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1217: SSE_SHUFFLE(XMM1,XMM1,0x00)
1218: SSE_MULT_PS(XMM1,XMM2)
1219: SSE_ADD_PS(XMM0,XMM1)
1221: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1222: SSE_SHUFFLE(XMM7,XMM7,0x00)
1223: SSE_MULT_PS(XMM3,XMM7)
1224: SSE_ADD_PS(XMM0,XMM3)
1226: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1227: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1229: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1230: /* This is code to be maintained and read by humans after all. */
1231: /* Copy Multiplier Col 3 into XMM3 */
1232: SSE_COPY_PS(XMM3,XMM0)
1233: /* Copy Multiplier Col 2 into XMM2 */
1234: SSE_COPY_PS(XMM2,XMM6)
1235: /* Copy Multiplier Col 1 into XMM1 */
1236: SSE_COPY_PS(XMM1,XMM5)
1237: /* Copy Multiplier Col 0 into XMM0 */
1238: SSE_COPY_PS(XMM0,XMM4)
1239: SSE_INLINE_END_2;
1241: /* Update the row: */
1242: nz = bi[row+1] - diag_offset[row] - 1;
1243: pv += 16;
1244: for (j=0; j<nz; j++) {
1245: PREFETCH_L1(&pv[16]);
1246: x = rtmp + 16*((unsigned int)pj[j]);
1247: /* x = rtmp + 4*pj[j]; */
1249: /* X:=X-M*PV, One column at a time */
1250: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1251: SSE_INLINE_BEGIN_2(x,pv)
1252: /* Load First Column of X*/
1253: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1254: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1256: /* Matrix-Vector Product: */
1257: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1258: SSE_SHUFFLE(XMM5,XMM5,0x00)
1259: SSE_MULT_PS(XMM5,XMM0)
1260: SSE_SUB_PS(XMM4,XMM5)
1262: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1263: SSE_SHUFFLE(XMM6,XMM6,0x00)
1264: SSE_MULT_PS(XMM6,XMM1)
1265: SSE_SUB_PS(XMM4,XMM6)
1267: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1268: SSE_SHUFFLE(XMM7,XMM7,0x00)
1269: SSE_MULT_PS(XMM7,XMM2)
1270: SSE_SUB_PS(XMM4,XMM7)
1272: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1273: SSE_SHUFFLE(XMM5,XMM5,0x00)
1274: SSE_MULT_PS(XMM5,XMM3)
1275: SSE_SUB_PS(XMM4,XMM5)
1277: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1278: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1280: /* Second Column */
1281: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1282: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1284: /* Matrix-Vector Product: */
1285: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1286: SSE_SHUFFLE(XMM6,XMM6,0x00)
1287: SSE_MULT_PS(XMM6,XMM0)
1288: SSE_SUB_PS(XMM5,XMM6)
1290: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1291: SSE_SHUFFLE(XMM7,XMM7,0x00)
1292: SSE_MULT_PS(XMM7,XMM1)
1293: SSE_SUB_PS(XMM5,XMM7)
1295: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1296: SSE_SHUFFLE(XMM4,XMM4,0x00)
1297: SSE_MULT_PS(XMM4,XMM2)
1298: SSE_SUB_PS(XMM5,XMM4)
1300: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1301: SSE_SHUFFLE(XMM6,XMM6,0x00)
1302: SSE_MULT_PS(XMM6,XMM3)
1303: SSE_SUB_PS(XMM5,XMM6)
1305: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1306: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1308: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1310: /* Third Column */
1311: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1312: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1314: /* Matrix-Vector Product: */
1315: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1316: SSE_SHUFFLE(XMM7,XMM7,0x00)
1317: SSE_MULT_PS(XMM7,XMM0)
1318: SSE_SUB_PS(XMM6,XMM7)
1320: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1321: SSE_SHUFFLE(XMM4,XMM4,0x00)
1322: SSE_MULT_PS(XMM4,XMM1)
1323: SSE_SUB_PS(XMM6,XMM4)
1325: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1326: SSE_SHUFFLE(XMM5,XMM5,0x00)
1327: SSE_MULT_PS(XMM5,XMM2)
1328: SSE_SUB_PS(XMM6,XMM5)
1330: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1331: SSE_SHUFFLE(XMM7,XMM7,0x00)
1332: SSE_MULT_PS(XMM7,XMM3)
1333: SSE_SUB_PS(XMM6,XMM7)
1335: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1336: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1338: /* Fourth Column */
1339: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1340: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1342: /* Matrix-Vector Product: */
1343: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1344: SSE_SHUFFLE(XMM5,XMM5,0x00)
1345: SSE_MULT_PS(XMM5,XMM0)
1346: SSE_SUB_PS(XMM4,XMM5)
1348: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1349: SSE_SHUFFLE(XMM6,XMM6,0x00)
1350: SSE_MULT_PS(XMM6,XMM1)
1351: SSE_SUB_PS(XMM4,XMM6)
1353: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1354: SSE_SHUFFLE(XMM7,XMM7,0x00)
1355: SSE_MULT_PS(XMM7,XMM2)
1356: SSE_SUB_PS(XMM4,XMM7)
1358: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1359: SSE_SHUFFLE(XMM5,XMM5,0x00)
1360: SSE_MULT_PS(XMM5,XMM3)
1361: SSE_SUB_PS(XMM4,XMM5)
1363: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1364: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1365: SSE_INLINE_END_2;
1366: pv += 16;
1367: }
1368: PetscLogFlops(128.0*nz+112.0);
1369: }
1370: row = (unsigned int)(*bjtmp++);
1371: /* row = (*bjtmp++)/4; */
1372: /* bjtmp++; */
1373: }
1374: /* finished row so stick it into b->a */
1375: pv = ba + 16*bi[i];
1376: pj = bj + bi[i];
1377: nz = bi[i+1] - bi[i];
1379: /* Copy x block back into pv block */
1380: for (j=0; j<nz; j++) {
1381: x = rtmp+16*((unsigned int)pj[j]);
1382: /* x = rtmp+4*pj[j]; */
1384: SSE_INLINE_BEGIN_2(x,pv)
1385: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1386: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1387: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1389: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1390: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1392: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1393: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1395: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1396: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1398: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1399: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1401: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1402: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1404: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1405: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1407: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1408: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1409: SSE_INLINE_END_2;
1410: pv += 16;
1411: }
1412: /* invert diagonal block */
1413: w = ba + 16*diag_offset[i];
1414: if (pivotinblocks) {
1415: PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);
1416: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1417: } else {
1418: PetscKernel_A_gets_inverse_A_4_nopivot(w);
1419: }
1420: /* Note: since Cramer's rule is used, the flop count below may be unfairly high or low */
1421: }
1423: PetscFree(rtmp);
1425: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1426: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1427: C->assembled = PETSC_TRUE;
1429: PetscLogFlops(1.333333333333*4*4*4*b->mbs);
1430: /* Flop Count from inverting diagonal blocks */
1431: SSE_SCOPE_END;
1432: return 0;
1433: }
1435: PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
1436: {
1437: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
1438: int i,j,n = a->mbs;
1439: unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
1440: unsigned int row;
1441: int *ajtmpold,nz,*bi=b->i;
1442: int *diag_offset = b->diag,*ai=a->i,*aj=a->j;
1443: MatScalar *pv,*v,*rtmp,*pc,*w,*x;
1444: MatScalar *ba = b->a,*aa = a->a;
1445: int nonzero=0;
1446: PetscBool pivotinblocks = b->pivotinblocks;
1447: PetscReal shift = info->shiftamount;
1448: PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE;
1450: allowzeropivot = PetscNot(A->erroriffailure);
1451: SSE_SCOPE_BEGIN;
1455: PetscMalloc1(16*(n+1),&rtmp);
1457: /* if ((unsigned long)bj==(unsigned long)aj) { */
1458: /* colscale = 4; */
1459: /* } */
1460: if ((unsigned long)bj==(unsigned long)aj) {
1461: return MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C,info);
1462: }
1464: for (i=0; i<n; i++) {
1465: nz = bi[i+1] - bi[i];
1466: bjtmp = bj + bi[i];
1467: /* zero out the 4x4 block accumulators */
1468: /* zero out one register */
1469: XOR_PS(XMM7,XMM7);
1470: for (j=0; j<nz; j++) {
1471: x = rtmp+16*((unsigned int)bjtmp[j]);
1472: /* x = rtmp+4*bjtmp[j]; */
1473: SSE_INLINE_BEGIN_1(x)
1474: /* Copy zero register to memory locations */
1475: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1476: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
1477: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
1478: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
1479: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
1480: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
1481: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
1482: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1483: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
1484: SSE_INLINE_END_1;
1485: }
1486: /* load in initial (unfactored row) */
1487: nz = ai[i+1] - ai[i];
1488: ajtmpold = aj + ai[i];
1489: v = aa + 16*ai[i];
1490: for (j=0; j<nz; j++) {
1491: x = rtmp+16*ajtmpold[j];
1492: /* x = rtmp+colscale*ajtmpold[j]; */
1493: /* Copy v block into x block */
1494: SSE_INLINE_BEGIN_2(v,x)
1495: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1496: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1497: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)
1499: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
1500: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)
1502: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
1503: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)
1505: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
1506: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)
1508: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
1509: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)
1511: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
1512: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)
1514: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
1515: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)
1517: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1518: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1519: SSE_INLINE_END_2;
1521: v += 16;
1522: }
1523: /* row = (*bjtmp++)/4; */
1524: row = (unsigned int)(*bjtmp++);
1525: while (row < i) {
1526: pc = rtmp + 16*row;
1527: SSE_INLINE_BEGIN_1(pc)
1528: /* Load block from lower triangle */
1529: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1530: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
1531: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)
1533: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
1534: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)
1536: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
1537: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)
1539: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
1540: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)
1542: /* Compare block to zero block */
1544: SSE_COPY_PS(XMM4,XMM7)
1545: SSE_CMPNEQ_PS(XMM4,XMM0)
1547: SSE_COPY_PS(XMM5,XMM7)
1548: SSE_CMPNEQ_PS(XMM5,XMM1)
1550: SSE_COPY_PS(XMM6,XMM7)
1551: SSE_CMPNEQ_PS(XMM6,XMM2)
1553: SSE_CMPNEQ_PS(XMM7,XMM3)
1555: /* Reduce the comparisons to one SSE register */
1556: SSE_OR_PS(XMM6,XMM7)
1557: SSE_OR_PS(XMM5,XMM4)
1558: SSE_OR_PS(XMM5,XMM6)
1559: SSE_INLINE_END_1;
1561: /* Reduce the one SSE register to an integer register for branching */
1562: /* Note: Since nonzero is an int, there is no INLINE block version of this call */
1563: MOVEMASK(nonzero,XMM5);
1565: /* If block is nonzero ... */
1566: if (nonzero) {
1567: pv = ba + 16*diag_offset[row];
1568: PREFETCH_L1(&pv[16]);
1569: pj = bj + diag_offset[row] + 1;
1571: /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
1572: /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
1573: /* but the diagonal was inverted already */
1574: /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */
1576: SSE_INLINE_BEGIN_2(pv,pc)
1577: /* Column 0, product is accumulated in XMM4 */
1578: SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
1579: SSE_SHUFFLE(XMM4,XMM4,0x00)
1580: SSE_MULT_PS(XMM4,XMM0)
1582: SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
1583: SSE_SHUFFLE(XMM5,XMM5,0x00)
1584: SSE_MULT_PS(XMM5,XMM1)
1585: SSE_ADD_PS(XMM4,XMM5)
1587: SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
1588: SSE_SHUFFLE(XMM6,XMM6,0x00)
1589: SSE_MULT_PS(XMM6,XMM2)
1590: SSE_ADD_PS(XMM4,XMM6)
1592: SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
1593: SSE_SHUFFLE(XMM7,XMM7,0x00)
1594: SSE_MULT_PS(XMM7,XMM3)
1595: SSE_ADD_PS(XMM4,XMM7)
1597: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
1598: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)
1600: /* Column 1, product is accumulated in XMM5 */
1601: SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
1602: SSE_SHUFFLE(XMM5,XMM5,0x00)
1603: SSE_MULT_PS(XMM5,XMM0)
1605: SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
1606: SSE_SHUFFLE(XMM6,XMM6,0x00)
1607: SSE_MULT_PS(XMM6,XMM1)
1608: SSE_ADD_PS(XMM5,XMM6)
1610: SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
1611: SSE_SHUFFLE(XMM7,XMM7,0x00)
1612: SSE_MULT_PS(XMM7,XMM2)
1613: SSE_ADD_PS(XMM5,XMM7)
1615: SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
1616: SSE_SHUFFLE(XMM6,XMM6,0x00)
1617: SSE_MULT_PS(XMM6,XMM3)
1618: SSE_ADD_PS(XMM5,XMM6)
1620: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
1621: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)
1623: SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)
1625: /* Column 2, product is accumulated in XMM6 */
1626: SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
1627: SSE_SHUFFLE(XMM6,XMM6,0x00)
1628: SSE_MULT_PS(XMM6,XMM0)
1630: SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
1631: SSE_SHUFFLE(XMM7,XMM7,0x00)
1632: SSE_MULT_PS(XMM7,XMM1)
1633: SSE_ADD_PS(XMM6,XMM7)
1635: SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
1636: SSE_SHUFFLE(XMM7,XMM7,0x00)
1637: SSE_MULT_PS(XMM7,XMM2)
1638: SSE_ADD_PS(XMM6,XMM7)
1640: SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
1641: SSE_SHUFFLE(XMM7,XMM7,0x00)
1642: SSE_MULT_PS(XMM7,XMM3)
1643: SSE_ADD_PS(XMM6,XMM7)
1645: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
1646: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1648: /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
1649: /* Column 3, product is accumulated in XMM0 */
1650: SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
1651: SSE_SHUFFLE(XMM7,XMM7,0x00)
1652: SSE_MULT_PS(XMM0,XMM7)
1654: SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
1655: SSE_SHUFFLE(XMM7,XMM7,0x00)
1656: SSE_MULT_PS(XMM1,XMM7)
1657: SSE_ADD_PS(XMM0,XMM1)
1659: SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
1660: SSE_SHUFFLE(XMM1,XMM1,0x00)
1661: SSE_MULT_PS(XMM1,XMM2)
1662: SSE_ADD_PS(XMM0,XMM1)
1664: SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
1665: SSE_SHUFFLE(XMM7,XMM7,0x00)
1666: SSE_MULT_PS(XMM3,XMM7)
1667: SSE_ADD_PS(XMM0,XMM3)
1669: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
1670: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1672: /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
1673: /* This is code to be maintained and read by humans after all. */
1674: /* Copy Multiplier Col 3 into XMM3 */
1675: SSE_COPY_PS(XMM3,XMM0)
1676: /* Copy Multiplier Col 2 into XMM2 */
1677: SSE_COPY_PS(XMM2,XMM6)
1678: /* Copy Multiplier Col 1 into XMM1 */
1679: SSE_COPY_PS(XMM1,XMM5)
1680: /* Copy Multiplier Col 0 into XMM0 */
1681: SSE_COPY_PS(XMM0,XMM4)
1682: SSE_INLINE_END_2;
1684: /* Update the row: */
1685: nz = bi[row+1] - diag_offset[row] - 1;
1686: pv += 16;
1687: for (j=0; j<nz; j++) {
1688: PREFETCH_L1(&pv[16]);
1689: x = rtmp + 16*((unsigned int)pj[j]);
1690: /* x = rtmp + 4*pj[j]; */
1692: /* X:=X-M*PV, One column at a time */
1693: /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
1694: SSE_INLINE_BEGIN_2(x,pv)
1695: /* Load First Column of X*/
1696: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1697: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1699: /* Matrix-Vector Product: */
1700: SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
1701: SSE_SHUFFLE(XMM5,XMM5,0x00)
1702: SSE_MULT_PS(XMM5,XMM0)
1703: SSE_SUB_PS(XMM4,XMM5)
1705: SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
1706: SSE_SHUFFLE(XMM6,XMM6,0x00)
1707: SSE_MULT_PS(XMM6,XMM1)
1708: SSE_SUB_PS(XMM4,XMM6)
1710: SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
1711: SSE_SHUFFLE(XMM7,XMM7,0x00)
1712: SSE_MULT_PS(XMM7,XMM2)
1713: SSE_SUB_PS(XMM4,XMM7)
1715: SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
1716: SSE_SHUFFLE(XMM5,XMM5,0x00)
1717: SSE_MULT_PS(XMM5,XMM3)
1718: SSE_SUB_PS(XMM4,XMM5)
1720: SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
1721: SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)
1723: /* Second Column */
1724: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1725: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1727: /* Matrix-Vector Product: */
1728: SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
1729: SSE_SHUFFLE(XMM6,XMM6,0x00)
1730: SSE_MULT_PS(XMM6,XMM0)
1731: SSE_SUB_PS(XMM5,XMM6)
1733: SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
1734: SSE_SHUFFLE(XMM7,XMM7,0x00)
1735: SSE_MULT_PS(XMM7,XMM1)
1736: SSE_SUB_PS(XMM5,XMM7)
1738: SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
1739: SSE_SHUFFLE(XMM4,XMM4,0x00)
1740: SSE_MULT_PS(XMM4,XMM2)
1741: SSE_SUB_PS(XMM5,XMM4)
1743: SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
1744: SSE_SHUFFLE(XMM6,XMM6,0x00)
1745: SSE_MULT_PS(XMM6,XMM3)
1746: SSE_SUB_PS(XMM5,XMM6)
1748: SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
1749: SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)
1751: SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)
1753: /* Third Column */
1754: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1755: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1757: /* Matrix-Vector Product: */
1758: SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
1759: SSE_SHUFFLE(XMM7,XMM7,0x00)
1760: SSE_MULT_PS(XMM7,XMM0)
1761: SSE_SUB_PS(XMM6,XMM7)
1763: SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
1764: SSE_SHUFFLE(XMM4,XMM4,0x00)
1765: SSE_MULT_PS(XMM4,XMM1)
1766: SSE_SUB_PS(XMM6,XMM4)
1768: SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
1769: SSE_SHUFFLE(XMM5,XMM5,0x00)
1770: SSE_MULT_PS(XMM5,XMM2)
1771: SSE_SUB_PS(XMM6,XMM5)
1773: SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
1774: SSE_SHUFFLE(XMM7,XMM7,0x00)
1775: SSE_MULT_PS(XMM7,XMM3)
1776: SSE_SUB_PS(XMM6,XMM7)
1778: SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
1779: SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1781: /* Fourth Column */
1782: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1783: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1785: /* Matrix-Vector Product: */
1786: SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
1787: SSE_SHUFFLE(XMM5,XMM5,0x00)
1788: SSE_MULT_PS(XMM5,XMM0)
1789: SSE_SUB_PS(XMM4,XMM5)
1791: SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
1792: SSE_SHUFFLE(XMM6,XMM6,0x00)
1793: SSE_MULT_PS(XMM6,XMM1)
1794: SSE_SUB_PS(XMM4,XMM6)
1796: SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
1797: SSE_SHUFFLE(XMM7,XMM7,0x00)
1798: SSE_MULT_PS(XMM7,XMM2)
1799: SSE_SUB_PS(XMM4,XMM7)
1801: SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
1802: SSE_SHUFFLE(XMM5,XMM5,0x00)
1803: SSE_MULT_PS(XMM5,XMM3)
1804: SSE_SUB_PS(XMM4,XMM5)
1806: SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
1807: SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
1808: SSE_INLINE_END_2;
1809: pv += 16;
1810: }
1811: PetscLogFlops(128.0*nz+112.0);
1812: }
1813: row = (unsigned int)(*bjtmp++);
1814: /* row = (*bjtmp++)/4; */
1815: /* bjtmp++; */
1816: }
1817: /* finished row so stick it into b->a */
1818: pv = ba + 16*bi[i];
1819: pj = bj + bi[i];
1820: nz = bi[i+1] - bi[i];
1822: /* Copy x block back into pv block */
1823: for (j=0; j<nz; j++) {
1824: x = rtmp+16*((unsigned int)pj[j]);
1825: /* x = rtmp+4*pj[j]; */
1827: SSE_INLINE_BEGIN_2(x,pv)
1828: /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
1829: SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
1830: SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)
1832: SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
1833: SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)
1835: SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
1836: SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)
1838: SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
1839: SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)
1841: SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
1842: SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)
1844: SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
1845: SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)
1847: SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
1848: SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)
1850: SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
1851: SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
1852: SSE_INLINE_END_2;
1853: pv += 16;
1854: }
1855: /* invert diagonal block */
1856: w = ba + 16*diag_offset[i];
1857: if (pivotinblocks) {
1858: PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);
1859: if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1860: } else {
1861: PetscKernel_A_gets_inverse_A_4_nopivot(w);
1862: }
1863: /* PetscKernel_A_gets_inverse_A_4_SSE(w); */
1864: /* Note: since Cramer's rule is used, the flop count below may be unfairly high or low */
1865: }
1867: PetscFree(rtmp);
1869: C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
1870: C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
1871: C->assembled = PETSC_TRUE;
1873: PetscLogFlops(1.333333333333*4*4*4*b->mbs);
1874: /* Flop Count from inverting diagonal blocks */
1875: SSE_SCOPE_END;
1876: return 0;
1877: }
1879: #endif