Actual source code: ex1cu.cu
/* Help text for this example; passed as the help argument to PetscInitialize() in main() */
static char help[] = "Benchmarking CUDA kernel launch time\n";
2: /*
3: Running example on Summit at OLCF:
4: # run with total 1 resource set (RS) (-n1), 1 RS per node (-r1), 1 MPI rank (-a1), 7 cores (-c7) and 1 GPU (-g1) per RS
5: $ jsrun -n1 -a1 -c7 -g1 -r1 ./ex1cu
6: Average asynchronous CUDA kernel launch time = 9.48 microseconds
7: Average synchronous CUDA kernel launch time = 12.83 microseconds
8: */
9: #include <petscsys.h>
10: #include <petscdevice.h>
/*
  NullKernel - Empty CUDA kernel.

  Takes no arguments and has an empty body, so launching it performs no device
  work. main() launches it with a <<<1,1>>> configuration to time the launch itself.
*/
__global__ void NullKernel(void)
{
}
/*
  main - Benchmark the average host-side time to launch an empty CUDA kernel.

  Two measurements are taken on the default (NULL) stream, each over n launches
  of NullKernel (n defaults to 100000 and can be changed with -n <count>):
    1) asynchronous: launches are issued back to back; the stream is synchronized
       only after the timer is stopped;
    2) synchronous: the stream is synchronized after every launch, inside the
       timed region.

  Returns 0 on success, or a nonzero PETSc error code if any PETSc or CUDA call
  fails or if -n is not positive.
*/
int main(int argc,char **argv)
{
  PetscErrorCode ierr;
  cudaError_t    cerr;
  PetscInt       i,n=100000;
  PetscLogDouble tstart,tend,avg;

  ierr = PetscInitialize(&argc,&argv,(char*)0,help);if (ierr) return ierr;
  ierr = PetscOptionsGetInt(NULL,NULL,"-n",&n,NULL);CHKERRQ(ierr);
  /* The averages below divide by n, so reject zero or negative launch counts */
  if (n <= 0) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_ARG_OUTOFRANGE,"Number of kernel launches (-n) must be positive");

  /* Warm up: do one untimed launch and wait for it, so that any one-time CUDA
     initialization cost is not attributed to the first timed launch */
  NullKernel<<<1,1,0,NULL>>>();
  cerr = cudaGetLastError();CHKERRCUDA(cerr);
  cerr = cudaStreamSynchronize(NULL);CHKERRCUDA(cerr);

  /* Launch a sequence of kernels asynchronously. Previous launched kernels do not need to be completed before launching a new one */
  ierr = PetscTime(&tstart);CHKERRQ(ierr);
  for (i=0; i<n; i++) {NullKernel<<<1,1,0,NULL>>>();}
  ierr = PetscTime(&tend);CHKERRQ(ierr);
  /* Query launch errors only after the timer is stopped to keep the timed loop unchanged */
  cerr = cudaGetLastError();CHKERRCUDA(cerr);
  cerr = cudaStreamSynchronize(NULL);CHKERRCUDA(cerr); /* Sync after tend since we don't want to count kernel execution time */
  avg  = (tend-tstart)*1e6/n; /* seconds -> microseconds per launch */
  ierr = PetscPrintf(PETSC_COMM_WORLD,"Average asynchronous CUDA kernel launch time = %.2f microseconds\n",avg);CHKERRQ(ierr);

  /* Launch a sequence of kernels synchronously. Only launch a new kernel after the one before it has been completed */
  ierr = PetscTime(&tstart);CHKERRQ(ierr);
  for (i=0; i<n; i++) {
    NullKernel<<<1,1,0,NULL>>>();
    cerr = cudaStreamSynchronize(NULL);CHKERRCUDA(cerr);
  }
  ierr = PetscTime(&tend);CHKERRQ(ierr);
  cerr = cudaGetLastError();CHKERRCUDA(cerr);
  avg  = (tend-tstart)*1e6/n; /* seconds -> microseconds per launch+sync */
  ierr = PetscPrintf(PETSC_COMM_WORLD,"Average synchronous CUDA kernel launch time = %.2f microseconds\n",avg);CHKERRQ(ierr);

  ierr = PetscFinalize();
  return ierr;
}
44: /*TEST
45: build:
46: requires: cuda
48: test:
49: requires: cuda
50: args: -n 2
51: output_file: output/empty.out
52: filter: grep "DOES_NOT_EXIST"
54: TEST*/