Actual source code: agmresorthog.c

#define PETSCKSP_DLL

#include <../src/ksp/ksp/impls/gmres/agmres/agmresimpl.h>
/*
 *  This file implements the RODDEC algorithm: its purpose is to orthogonalize a set of vectors distributed across several processes. These processes are organized in a virtual ring.
 *  Reference: [1] Sidje, Roger B. Alternatives for parallel Krylov subspace basis computation. Numer. Linear Algebra Appl. 4 (1997), no. 4, 305-331.
 *
 *  Initial author: R. B. SIDJE
 *  Modified by: G.-A. Atenekeng-Kahou, 2008
 *  Modified by: D. NUENTSA WAKAM, 2011
 */
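
/*
 * Outline of the routines below: KSPAGMRESRoddecInitNeighboor arranges the processes that share the basis
 * vectors on a virtual ring (Ileft/Iright, with wrap-around between the First and Last ranks). KSPAGMRESRoddec
 * computes a Householder QR factorization of each local block and then reduces the stacked local triangular
 * factors to a single R on the last process with Givens rotations passed along the ring. KSPAGMRESRodvec
 * applies the implicitly stored Q to a vector of coefficients.
 */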

static PetscErrorCode KSPAGMRESRoddecGivens(PetscReal*, PetscReal*, PetscReal*, PetscInt);

/*
 * Take the processes that own the vectors and organize them on a virtual ring.
 */
PetscErrorCode KSPAGMRESRoddecInitNeighboor(KSP ksp)
{
  MPI_Comm       comm;
  KSP_AGMRES     *agmres = (KSP_AGMRES*)(ksp->data);
  PetscMPIInt    First, Last, rank, size;

  PetscObjectGetComm((PetscObject)agmres->vecs[0], &comm);
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);
  MPIU_Allreduce(&rank, &First, 1, MPI_INT, MPI_MIN, comm);
  MPIU_Allreduce(&rank, &Last, 1, MPI_INT, MPI_MAX, comm);

  if ((rank != Last) && (rank == 0)) {
    agmres->Ileft  = Last;
    agmres->Iright = rank + 1;
  } else {
    if (rank == Last) {
      agmres->Ileft  = rank - 1;
      agmres->Iright = First;
    } else {
      agmres->Ileft  = rank - 1;
      agmres->Iright = rank + 1;
    }
  }
  agmres->rank  = rank;
  agmres->size  = size;
  agmres->First = First;
  agmres->Last  = Last;
  return 0;
}

static PetscErrorCode KSPAGMRESRoddecGivens(PetscReal *c, PetscReal *s, PetscReal *r, PetscInt make_r)
{
  PetscReal a, b, t;

  if (make_r == 1) {
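    /* Build the rotation: from a = *c and b = *s, compute (c, s) such that s*a + c*b = 0, then pack it into
       the single scalar *r (*r == 1 flags c == 0; otherwise *r holds either s/2 or 2/c, so the magnitude of
       *r tells the decoding step below which of the two was stored) */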
    a = *c;
    b = *s;
    if (b == 0.e0) {
      *c = 1.e0;
      *s = 0.e0;
    } else {
      if (PetscAbsReal(b) > PetscAbsReal(a)) {
        t  = -a / b;
        *s = 1.e0 / PetscSqrtReal(1.e0 + t * t);
        *c = (*s) * t;
      } else {
        t  = -b / a;
        *c = 1.e0 / PetscSqrtReal(1.e0 + t * t);
        *s = (*c) * t;
      }
    }
    if (*c == 0.e0) {
      *r = 1.e0;
    } else {
      if (PetscAbsReal(*s) < PetscAbsReal(*c)) {
        *r = PetscSign(*c) * (*s) / 2.e0;
      } else {
        *r = PetscSign(*s) * 2.e0 / (*c);
      }
    }
  }
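
  /* Recover (c, s) from the packed scalar *r: *r == 1 gives (c, s) = (0, 1); |*r| < 1 means *r holds s/2;
     otherwise *r holds 2/c. The packing may flip the signs of c and s simultaneously, which is harmless
     since the same convention is used wherever the rotation is regenerated. */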

  if (*r == 1.e0) {
    *c = 0.e0;
    *s = 1.e0;
  } else {
    if (PetscAbsReal(*r) < 1.e0) {
      *s = 2.e0 * (*r);
      *c = PetscSqrtReal(1.e0 - (*s) * (*s));
    } else {
      *c = 2.e0 / (*r);
      *s = PetscSqrtReal(1.e0 - (*c) * (*c));
    }
  }
  return 0;
}

/*
 * Compute the QR factorization of the Krylov basis vectors
 * Input :
 *  - the vectors are available in agmres->vecs (alias VEC_V)
 *  - nvec : the number of vectors
 * Output :
 *  - agmres->Qloc : product of elementary reflectors for the QR factorization of the local part of the vectors
 *  - agmres->sgn : sign of the rotations
 *  - agmres->tloc : scalar factors of the elementary reflectors
 */
PetscErrorCode KSPAGMRESRoddec(KSP ksp, PetscInt nvec)
{
  KSP_AGMRES     *agmres = (KSP_AGMRES*) ksp->data;
  MPI_Comm       comm;
  PetscScalar    *Qloc   = agmres->Qloc;
  PetscScalar    *sgn    = agmres->sgn;
  PetscScalar    *tloc   = agmres->tloc;
  PetscReal      *wbufptr = agmres->wbufptr;
  PetscMPIInt    rank     = agmres->rank;
  PetscMPIInt    First    = agmres->First;
  PetscMPIInt    Last     = agmres->Last;
  PetscBLASInt   pas, len, bnloc, bpos;
  PetscInt       nloc, d, i, j, k;
  PetscInt       pos;
  PetscReal      c, s, rho, Ajj, val, tt, old;
  PetscScalar    *col;
  MPI_Status     status;
  PetscBLASInt   N = MAXKSPSIZE + 1;

  PetscObjectGetComm((PetscObject)ksp,&comm);
  PetscLogEventBegin(KSP_AGMRESRoddec,ksp,0,0,0);
  PetscArrayzero(agmres->Rloc, N*N);
  /* check input arguments */
  VecGetLocalSize(VEC_V(0), &nloc);
  PetscBLASIntCast(nloc,&bnloc);
  pas = 1;
  /* Copy the vectors of the basis */
  for (j = 0; j < nvec; j++) {
    VecGetArray(VEC_V(j), &col);
    PetscStackCallBLAS("BLAScopy",BLAScopy_(&bnloc, col, &pas, &Qloc[j*nloc], &pas));
    VecRestoreArray(VEC_V(j), &col);
  }
  /* Each process performs a local QR on its own block */
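  /* (Householder QR with compact storage: for column j, the entries of the elementary reflector are kept
     below the diagonal of Qloc, its scalar factor in tloc[j], and the local triangular factor overwrites
     the upper triangle of Qloc.) */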
  for (j = 0; j < nvec; j++) {
    len = nloc - j;
    Ajj = Qloc[j*nloc+j];
    PetscStackCallBLAS("BLASnrm2",rho = -PetscSign(Ajj) * BLASnrm2_(&len, &(Qloc[j*nloc+j]), &pas));
    if (rho == 0.0) tloc[j] = 0.0;
    else {
      tloc[j] = (Ajj - rho) / rho;
      len     = len - 1;
      val     = 1.0 / (Ajj - rho);
      PetscStackCallBLAS("BLASscal",BLASscal_(&len, &val, &(Qloc[j*nloc+j+1]), &pas));
      Qloc[j*nloc+j] = 1.0;
      len            = len + 1;
      for (k = j + 1; k < nvec; k++) {
        PetscStackCallBLAS("BLASdot",tt = tloc[j] * BLASdot_(&len, &(Qloc[j*nloc+j]), &pas, &(Qloc[k*nloc+j]), &pas));
        PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&len, &tt, &(Qloc[j*nloc+j]), &pas, &(Qloc[k*nloc+j]), &pas));
      }
      Qloc[j*nloc+j] = rho;
    }
  }
  /* annihilate undesirable Rloc, diagonal by diagonal */
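  /* Ring reduction of the stacked local triangular factors: the first process sends row d of its local R;
     each following process eliminates the corresponding entries of its own block with Givens rotations
     (kept in Qloc in their packed form), forwards the updated row to the next process, and the last
     process accumulates the global R in agmres->Rloc. */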
  for (d = 0; d < nvec; d++) {
    len = nvec - d;
    if (rank == First) {
      PetscStackCallBLAS("BLAScopy",BLAScopy_(&len, &(Qloc[d*nloc+d]), &bnloc, &(wbufptr[d]), &pas));
      MPI_Send(&(wbufptr[d]), len, MPIU_SCALAR, rank + 1, agmres->tag, comm);
    } else {
      MPI_Recv(&(wbufptr[d]), len, MPIU_SCALAR, rank - 1, agmres->tag, comm, &status);
      /* Elimination of Rloc(1,d) */
      c    = wbufptr[d];
      s    = Qloc[d*nloc];
      KSPAGMRESRoddecGivens(&c, &s, &rho, 1);
      /* Apply the Givens rotation */
      for (k = d; k < nvec; k++) {
        old          = wbufptr[k];
        wbufptr[k]   = c * old - s * Qloc[k*nloc];
        Qloc[k*nloc] = s * old + c * Qloc[k*nloc];
      }
      Qloc[d*nloc] = rho;
      if (rank != Last) {
        MPI_Send(&(wbufptr[d]), len, MPIU_SCALAR, rank + 1, agmres->tag, comm);
      }
      /* Zero out the d-th diagonal of Rloc ... */
      for (j = d + 1; j < nvec; j++) {
        /* Elimination of Rloc(i,j) */
        i    = j - d;
        c    = Qloc[j*nloc+i-1];
        s    = Qloc[j*nloc+i];
        KSPAGMRESRoddecGivens(&c, &s, &rho, 1);
        for (k = j; k < nvec; k++) {
          old              = Qloc[k*nloc+i-1];
          Qloc[k*nloc+i-1] = c * old - s * Qloc[k*nloc+i];
          Qloc[k*nloc+i]   = s * old + c * Qloc[k*nloc+i];
        }
        Qloc[j*nloc+i] = rho;
      }
      if (rank == Last) {
        PetscStackCallBLAS("BLAScopy",BLAScopy_(&len, &(wbufptr[d]), &pas, RLOC(d,d), &N));
        for (k = d + 1; k < nvec; k++) *RLOC(k,d) = 0.0;
      }
    }
  }

  if (rank == Last) {
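    /* Make the diagonal of R nonnegative and record the applied signs in sgn, so that KSPAGMRESRodvec
       can apply the same signs when reconstructing Q * In */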
    for (d = 0; d < nvec; d++) {
      pos    = nvec - d;
      PetscBLASIntCast(pos,&bpos);
      sgn[d] = PetscSign(*RLOC(d,d));
      PetscStackCallBLAS("BLASscal",BLASscal_(&bpos, &(sgn[d]), RLOC(d,d), &N));
    }
  }
  /* Broadcast Rloc to all other processes
   * NWD: should not be needed
   */
  MPI_Bcast(agmres->Rloc,N*N,MPIU_SCALAR,Last,comm);
  PetscLogEventEnd(KSP_AGMRESRoddec,ksp,0,0,0);
  return 0;
}

/*
 *  Computes Out <-- Q * In, where Q is the orthogonal matrix built by KSPAGMRESRoddec
 *  Input :
 *   - Qloc, sgn, tloc, nvec (see KSPAGMRESRoddec above)
 *   - In : input vector (of size nvec)
 *  Output :
 *   - Out : PETSc vector (distributed like the basis vectors)
 */
PetscErrorCode KSPAGMRESRodvec(KSP ksp, PetscInt nvec, PetscScalar *In, Vec Out)
{
  KSP_AGMRES     *agmres  = (KSP_AGMRES*) ksp->data;
  MPI_Comm       comm;
  PetscScalar    *Qloc    = agmres->Qloc;
  PetscScalar    *sgn     = agmres->sgn;
  PetscScalar    *tloc    = agmres->tloc;
  PetscMPIInt    rank     = agmres->rank;
  PetscMPIInt    First    = agmres->First, Last = agmres->Last;
  PetscMPIInt    Iright   = agmres->Iright, Ileft = agmres->Ileft;
  PetscScalar    *y, *zloc;
  PetscInt       nloc, d, len, i, j;
  PetscBLASInt   bnvec, pas, blen;
  PetscInt       dpt;
  PetscReal      c, s, rho, zp, zq, yd=0.0, tt;
  MPI_Status     status;

  PetscBLASIntCast(nvec,&bnvec);
  PetscObjectGetComm((PetscObject)ksp,&comm);
  pas  = 1;
  VecGetLocalSize(VEC_V(0), &nloc);
  PetscMalloc1(nvec, &y);
  PetscArraycpy(y, In, nvec);
  VecGetArray(Out, &zloc);

  if (rank == Last) {
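    /* Apply the sign changes recorded in sgn by KSPAGMRESRoddec when it made the diagonal of R nonnegative */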
    for (i = 0; i < nvec; i++) y[i] = sgn[i] * y[i];
  }
  for (i = 0; i < nloc; i++) zloc[i] = 0.0;
  if (agmres->size == 1) PetscStackCallBLAS("BLAScopy",BLAScopy_(&bnvec, y, &pas, &(zloc[0]), &pas));
  else {
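    /* Apply the Givens rotations of the ring reduction in reverse order: the coefficients travel from the
       last process back to the first one; each process unpacks its rotations from Qloc (make_r == 0) and
       applies them to the coefficient being passed along and to its local entries of zloc. */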
    for (d = nvec - 1; d >= 0; d--) {
      if (rank == First) {
        MPI_Recv(&(zloc[d]), 1, MPIU_SCALAR, Iright, agmres->tag, comm, &status);
      } else {
        for (j = nvec - 1; j >= d + 1; j--) {
          i         = j - d;
          KSPAGMRESRoddecGivens(&c, &s, &(Qloc[j * nloc + i]), 0);
          zp        = zloc[i-1];
          zq        = zloc[i];
          zloc[i-1] =  c * zp + s * zq;
          zloc[i]   = -s * zp + c * zq;
        }
        KSPAGMRESRoddecGivens(&c, &s, &(Qloc[d * nloc]), 0);
        if (rank == Last) {
          zp      = y[d];
          zq      = zloc[0];
          y[d]    =  c * zp + s * zq;
          zloc[0] = -s * zp + c * zq;
          MPI_Send(&(y[d]), 1, MPIU_SCALAR, Ileft, agmres->tag, comm);
        } else {
          MPI_Recv(&yd, 1, MPIU_SCALAR, Iright, agmres->tag, comm, &status);
          zp      = yd;
          zq      = zloc[0];
          yd      =  c * zp + s * zq;
          zloc[0] = -s * zp + c * zq;
          MPI_Send(&yd, 1, MPIU_SCALAR, Ileft, agmres->tag, comm);
        }
      }
    }
  }
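  /* Finally, apply the local elementary reflectors stored in Qloc and tloc by KSPAGMRESRoddec, from the last one to the first */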
  for (j = nvec - 1; j >= 0; j--) {
    dpt = j * nloc + j;
    if (tloc[j] != 0.0) {
      len       = nloc - j;
      PetscBLASIntCast(len,&blen);
      rho       = Qloc[dpt];
      Qloc[dpt] = 1.0;
      tt        = tloc[j] * (BLASdot_(&blen, &(Qloc[dpt]), &pas, &(zloc[j]), &pas));
      PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&blen, &tt, &(Qloc[dpt]), &pas, &(zloc[j]), &pas));
      Qloc[dpt] = rho;
    }
  }
  VecRestoreArray(Out, &zloc);
  PetscFree(y);
  return 0;
}