Actual source code: ex20.c
static const char help[] = "PetscSF Ping-pong test to measure MPI latency\n\n";

/*
  This is a simple test to measure the latency of MPI communication.
  The test is run with two processes. The first process sends a message
  to the second process, and after having received the message, the second
  process sends a message back to the first process. This is repeated
  a number of times. The latency is defined as half the round-trip time.
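  Equivalently, with niter round trips timed in a single interval,
  latency = (t_end - t_start) / (2*niter), which the code reports in microseconds.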

  It mimics osu_latency from the OSU microbenchmarks (https://mvapich.cse.ohio-state.edu/benchmarks/).

  Usage: mpirun -n 2 ./ex20 -mtype <type>
  Other arguments have default values matching those used in osu_latency.

  Examples:

  On Summit at OLCF:
    jsrun --smpiargs "-gpu" -n 2 -a 1 -c 7 -g 1 -r 2 -l GPU-GPU -d packed -b packed:7 ./ex20 -mtype cuda

  On Crusher at OLCF:
    srun -n2 -c32 --cpu-bind=map_cpu:0,1 --gpus-per-node=8 --gpu-bind=map_gpu:0,1 ./ex20 -mtype hip
*/

#include <petscsf.h>
#include <petscdevice.h>
#if defined(PETSC_HAVE_UNISTD_H)
#include <unistd.h>
#endif

/* Same values as OSU microbenchmarks */
#define LAT_LOOP_SMALL     10000
#define LAT_SKIP_SMALL     100
#define LAT_LOOP_LARGE     1000
#define LAT_SKIP_LARGE     10
#define LARGE_MESSAGE_SIZE 8192
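
/* Allocate size bytes of memory of the given type: page-aligned host memory
   when getpagesize() is available, otherwise CUDA or HIP device memory */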
static inline PetscErrorCode PetscMallocWithMemType(PetscMemType mtype, size_t size, void **ptr)
{
  if (PetscMemTypeHost(mtype)) {
#if defined(PETSC_HAVE_GETPAGESIZE)
    posix_memalign(ptr, getpagesize(), size);
#else
    PetscMalloc(size, ptr);
#endif
  }
#if defined(PETSC_HAVE_CUDA)
  else if (PetscMemTypeCUDA(mtype)) cudaMalloc(ptr, size);
#elif defined(PETSC_HAVE_HIP)
  else if (PetscMemTypeHIP(mtype)) hipMalloc(ptr, size);
#endif
  return 0;
}

static inline PetscErrorCode PetscFreeWithMemType_Private(PetscMemType mtype, void *ptr)
{
  if (PetscMemTypeHost(mtype)) free(ptr);
#if defined(PETSC_HAVE_CUDA)
  else if (PetscMemTypeCUDA(mtype)) cudaFree(ptr);
#elif defined(PETSC_HAVE_HIP)
  else if (PetscMemTypeHIP(mtype)) hipFree(ptr);
#endif
  return 0;
}

/* Free memory and set ptr to NULL when the free succeeds */
#define PetscFreeWithMemType(t, p) ((p) && (PetscFreeWithMemType_Private((t), (p)) || ((p) = NULL, 0)))
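/* The macro skips NULL pointers; on success the comma expression ((p) = NULL, 0)
   clears the pointer and evaluates to 0, while a failure short-circuits and
   yields the nonzero error code from the private free routine */

/* Copy n bytes from the host buffer src into dst, which lives in memory of type mtype */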
static inline PetscErrorCode PetscMemcpyFromHostWithMemType(PetscMemType mtype, void *dst, const void *src, size_t n)
{
  if (PetscMemTypeHost(mtype)) PetscMemcpy(dst, src, n);
#if defined(PETSC_HAVE_CUDA)
  else if (PetscMemTypeCUDA(mtype)) cudaMemcpy(dst, src, n, cudaMemcpyHostToDevice);
#elif defined(PETSC_HAVE_HIP)
  else if (PetscMemTypeHIP(mtype)) hipMemcpy(dst, src, n, hipMemcpyHostToDevice);
#endif
  return 0;
}

int main(int argc, char **argv)
{
  PetscSF        sf[64];
  PetscLogDouble t_start = 0, t_end = 0, time[64];
  PetscInt       i, j, n, nroots, nleaves, niter = 100, nskip = 10;
  PetscInt       maxn = 512 * 1024; /* max 4MB messages, assuming 8-byte PetscScalar */
  PetscSFNode   *iremote;
  PetscMPIInt    rank, size;
  PetscScalar   *rootdata = NULL, *leafdata = NULL, *pbuf, *ebuf;
  size_t         msgsize;
  PetscMemType   mtype = PETSC_MEMTYPE_HOST;
  char           mstring[16] = {0};
  PetscBool      isCuda, isHip, isHost, set;
  PetscInt       skipSmall = -1, loopSmall = -1;
  MPI_Op         op = MPI_REPLACE;

  PetscInitialize(&argc, &argv, NULL, help);
  /* Must init the device first if one wants to call PetscGetMemType() without creating PETSc device objects */
#if defined(PETSC_HAVE_CUDA)
  PetscDeviceInitialize(PETSC_DEVICE_CUDA);
#elif defined(PETSC_HAVE_HIP)
  PetscDeviceInitialize(PETSC_DEVICE_HIP);
#endif
  MPI_Comm_size(PETSC_COMM_WORLD, &size);
  MPI_Comm_rank(PETSC_COMM_WORLD, &rank);

  PetscOptionsGetInt(NULL, NULL, "-maxn", &maxn, NULL); /* maxn PetscScalars */
  PetscOptionsGetInt(NULL, NULL, "-skipSmall", &skipSmall, NULL);
  PetscOptionsGetInt(NULL, NULL, "-loopSmall", &loopSmall, NULL);

  PetscMalloc1(maxn, &iremote);
  PetscOptionsGetString(NULL, NULL, "-mtype", mstring, 16, &set);
  if (set) {
    PetscStrcasecmp(mstring, "cuda", &isCuda);
    PetscStrcasecmp(mstring, "hip", &isHip);
    PetscStrcasecmp(mstring, "host", &isHost);

    if (isHost) mtype = PETSC_MEMTYPE_HOST;
    else if (isCuda) mtype = PETSC_MEMTYPE_CUDA;
    else if (isHip) mtype = PETSC_MEMTYPE_HIP;
    else SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Unknown memory type: %s", mstring);
  }

  PetscMallocWithMemType(mtype, sizeof(PetscScalar) * maxn, (void **)&rootdata);
  PetscMallocWithMemType(mtype, sizeof(PetscScalar) * maxn, (void **)&leafdata);

  PetscMalloc2(maxn, &pbuf, maxn, &ebuf);
  for (i = 0; i < maxn; i++) {
    pbuf[i] = 123.0;
    ebuf[i] = 456.0;
  }
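
  /* Create one SF per message size: rank 0 owns n roots, rank 1 has n leaves,
     and leaf j points at root j on rank 0, so each Bcast/Reduce below is a
     single point-to-point message of n scalars */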
  for (n = 1, i = 0; n <= maxn; n *= 2, i++) {
    PetscSFCreate(PETSC_COMM_WORLD, &sf[i]);
    PetscSFSetFromOptions(sf[i]);
    if (!rank) {
      nroots  = n;
      nleaves = 0;
    } else {
      nroots  = 0;
      nleaves = n;
      for (j = 0; j < nleaves; j++) {
        iremote[j].rank  = 0;
        iremote[j].index = j;
      }
    }
    PetscSFSetGraph(sf[i], nroots, nleaves, NULL, PETSC_COPY_VALUES, iremote, PETSC_COPY_VALUES);
    PetscSFSetUp(sf[i]);
  }

  if (loopSmall > 0) {
    nskip = skipSmall;
    niter = loopSmall;
  } else {
    nskip = LAT_SKIP_SMALL;
    niter = LAT_LOOP_SMALL;
  }
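
  /* For each message size, run nskip warm-up iterations and then time niter
     ping-pongs: the Bcast (root to leaf) is the ping and the Reduce (leaf to
     root) is the pong, so dividing the elapsed time by 2*niter gives the
     one-way latency, reported below in microseconds */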
  for (n = 1, j = 0; n <= maxn; n *= 2, j++) {
    msgsize = sizeof(PetscScalar) * n;
    PetscMemcpyFromHostWithMemType(mtype, rootdata, pbuf, msgsize);
    PetscMemcpyFromHostWithMemType(mtype, leafdata, ebuf, msgsize);

    if (msgsize > LARGE_MESSAGE_SIZE) {
      nskip = LAT_SKIP_LARGE;
      niter = LAT_LOOP_LARGE;
    }
    MPI_Barrier(PETSC_COMM_WORLD);

    for (i = 0; i < niter + nskip; i++) {
      if (i == nskip) {
#if defined(PETSC_HAVE_CUDA)
        cudaDeviceSynchronize();
#elif defined(PETSC_HAVE_HIP)
        hipDeviceSynchronize();
#endif
        MPI_Barrier(PETSC_COMM_WORLD);
        t_start = MPI_Wtime();
      }
      PetscSFBcastWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, rootdata, mtype, leafdata, op);
      PetscSFBcastEnd(sf[j], MPIU_SCALAR, rootdata, leafdata, op);
      PetscSFReduceWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, leafdata, mtype, rootdata, op);
      PetscSFReduceEnd(sf[j], MPIU_SCALAR, leafdata, rootdata, op);
    }
#if defined(PETSC_HAVE_CUDA)
    cudaDeviceSynchronize();
#elif defined(PETSC_HAVE_HIP)
    hipDeviceSynchronize();
#endif
    MPI_Barrier(PETSC_COMM_WORLD);
    t_end   = MPI_Wtime();
    time[j] = (t_end - t_start) * 1e6 / (niter * 2);
  }

  PetscPrintf(PETSC_COMM_WORLD, "\t## PetscSF Ping-pong test on %s ##\n  Message(Bytes) \t\tLatency(us)\n", mtype == PETSC_MEMTYPE_HOST ? "Host" : "Device");
  for (n = 1, j = 0; n <= maxn; n *= 2, j++) {
    PetscSFDestroy(&sf[j]);
    PetscPrintf(PETSC_COMM_WORLD, "%16D \t %16.4f\n", (PetscInt)(sizeof(PetscScalar) * n), time[j]);
  }

  PetscFree2(pbuf, ebuf);
  PetscFreeWithMemType(mtype, rootdata);
  PetscFreeWithMemType(mtype, leafdata);
  PetscFree(iremote);
  PetscFinalize();
  return 0;
}

/*TEST
  testset:
    # use small numbers to make the test cheap
    args: -maxn 4 -skipSmall 1 -loopSmall 1
    filter: grep "DOES_NOT_EXIST"
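    # latency numbers are machine-dependent, so the filter greps for a string
    # that never appears; the emptied output is compared against empty.out,
    # checking only that the program ran successfully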
    output_file: output/empty.out
    nsize: 2

    test:
      args: -mtype host

    test:
      requires: cuda
      args: -mtype cuda

    test:
      requires: hip
      args: -mtype hip
TEST*/