diff --git a/L26/gpu_harmonic.cu b/L26/gpu_harmonic.cu index 146c1c29419dfa8787bad928a0975a0743f15f64..6a2363e4762e6da8d190c1b40bba38fef9ec4e5d 100644 --- a/L26/gpu_harmonic.cu +++ b/L26/gpu_harmonic.cu @@ -30,25 +30,29 @@ __global__ void sumKernel(uint64 N) { if (thread_num == 0) { printf ("sum = %.10f\n",sum); } - + } int main (int argc, char** argv) { - // get N and num_threads from the command line + // get N and B from the command line + // B is the number of threads per block + // we typically choose B to be a multiple of 32 + // the maximum value of B is 1024 if (argc < 3) { - printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads"); + printf ("Command usage : %s %s %s\n",argv[0],"N","B"); return 1; } uint64 N = atoll(argv[1]); - int num_threads = atoi(argv[2]); - printf ("number of threads = %d\n",num_threads); + int B = atoi(argv[2]); + printf ("N = %llu\n",N); + printf ("number of threads = %d\n",B); // start the timer clock_t start = clock(); // launch kernel - sumKernel <<< 1, num_threads >>> (N); + sumKernel <<< 1, B >>> (N); cudaDeviceSynchronize(); // stop the timer diff --git a/L26/key/gpu_harmonic_v1.cu b/L26/key/gpu_harmonic_v1.cu index 146c1c29419dfa8787bad928a0975a0743f15f64..6a2363e4762e6da8d190c1b40bba38fef9ec4e5d 100644 --- a/L26/key/gpu_harmonic_v1.cu +++ b/L26/key/gpu_harmonic_v1.cu @@ -30,25 +30,29 @@ __global__ void sumKernel(uint64 N) { if (thread_num == 0) { printf ("sum = %.10f\n",sum); } - + } int main (int argc, char** argv) { - // get N and num_threads from the command line + // get N and B from the command line + // B is the number of threads per block + // we typically choose B to be a multiple of 32 + // the maximum value of B is 1024 if (argc < 3) { - printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads"); + printf ("Command usage : %s %s %s\n",argv[0],"N","B"); return 1; } uint64 N = atoll(argv[1]); - int num_threads = atoi(argv[2]); - printf ("number of threads = %d\n",num_threads); + int B = atoi(argv[2]); + 
printf ("N = %llu\n",N); + printf ("number of threads = %d\n",B); // start the timer clock_t start = clock(); // launch kernel - sumKernel <<< 1, num_threads >>> (N); + sumKernel <<< 1, B >>> (N); cudaDeviceSynchronize(); // stop the timer diff --git a/L26/key/gpu_harmonic_v2.cu b/L26/key/gpu_harmonic_v2.cu index 7ddc60dcb3254733567d5e5bdb1dbf10e5a5e927..80a6026b0e64995383061dc2912c4ad09e7e3a37 100644 --- a/L26/key/gpu_harmonic_v2.cu +++ b/L26/key/gpu_harmonic_v2.cu @@ -9,10 +9,10 @@ __global__ void sumKernel(uint64 N, double* sum) { uint64 thread_num = (uint64)blockIdx.x*blockDim.x + threadIdx.x; if (thread_num < N) { - double term = 1.0/(thread_num+1); - atomicAdd(sum,term); + double term = 1.0/(thread_num+1); + atomicAdd(sum,term); } - + } int main (int argc, char** argv) { @@ -36,7 +36,7 @@ int main (int argc, char** argv) { printf ("threads per block B = %d\n",B); printf ("number of thread blocks G = %d\n",G); printf ("number of threads G*B = %llu\n",(uint64)G*B); - + // the computed sum in device memory double* d_sum; cudaMalloc (&d_sum,sizeof(double)); diff --git a/L26/key/gpu_harmonic_v3.cu b/L26/key/gpu_harmonic_v3.cu index d787319e5629959d330836053f24514e7e92be9f..c7c93efe2c21f076bcaaafe53ff3e54c351bb60c 100644 --- a/L26/key/gpu_harmonic_v3.cu +++ b/L26/key/gpu_harmonic_v3.cu @@ -12,10 +12,10 @@ __global__ void sumKernel(uint64 N, uint64 T, double* sum) { uint64 start = thread_num*T; uint64 end = start+T; if (end > N) { - end = N; + end = N; } for (uint64 i = start; i<end;i++) { - thread_sum += 1.0/(i+1); + thread_sum += 1.0/(i+1); } atomicAdd(sum,thread_sum); } @@ -45,7 +45,7 @@ int main (int argc, char** argv) { printf ("threads per block B = %d\n",B); printf ("number of thread blocks G = %d\n",G); printf ("number of threads G*B = %llu\n",(uint64)G*B); - + // the computed sum in device memory double* d_sum; cudaMalloc (&d_sum,sizeof(double)); diff --git a/L26/results.out b/L26/results.out new file mode 100644 index 
0000000000000000000000000000000000000000..7b89e63cdd0b7d3a33c52700fc8f5494fe1adcd1 --- /dev/null +++ b/L26/results.out @@ -0,0 +1,54 @@ +# C Sequential (compiled with -O3 and run on tinkercliffs) +N = 1000000000 (1 billion) +sum = 21.3004815023 +elapsed time = 2.06 seconds + +# CUDA Version 1 (one thread block) +N = 1000000000 (1 billion) +number of threads = 256 +sum = 21.3004815023 +elapsed time = 0.43 seconds + +# CUDA Version 2 (first full GPU attempt) +N = 1000000000 (1 billion) +threads per block B = 256 +number of thread blocks G = 3906250 +number of threads G*B = 1000000000 +harmonic sum = 21.3004815023 +elapsed time = 2.36 seconds + +# CUDA Version 3 (second full GPU attempt) +N = 1000000000 (1 billion) +terms per thread T = 1000 +threads per block B = 256 +number of thread blocks G = 3907 +number of threads G*B = 1000192 +sum = 21.3004815023 +elapsed time = 0.01 seconds + +# CUDA Version 3 (second full GPU attempt) +N = 1000000000000 (1 trillion) +terms per thread T = 10000 +threads per block B = 256 +number of thread blocks G = 390625 +number of threads G*B = 100000000 +sum = 28.2082367808 +elapsed time = 4.21 seconds + +# Note that the speedup (from sequential, with the 1-billion-term time scaled by 1000 to estimate 1 trillion terms) is (2.06*1000)/4.21 = 489.3 +# For this problem, an Nvidia P100 GPU (launched on June 20th, 2016) +# is nearly 500 times faster than the C sequential version compiled with -O3. + +# CUDA Version 3 (second full GPU attempt) +# Run on a V100 (80 SMs) instead of a P100 (56 SMs)! +N = 1000000000000 (1 trillion) +terms per thread T = 10000 +threads per block B = 256 +number of thread blocks G = 390625 +number of threads G*B = 100000000 +sum = 28.2082367808 +elapsed time = 2.02 seconds + +# Note that the speedup (from sequential, with the 1-billion-term time scaled by 1000 to estimate 1 trillion terms) is (2.06*1000)/2.02 = 1020 +# For this problem, an Nvidia V100 GPU (launched on June 21st, 2017) +# is over 1000 times faster than the C sequential version compiled with -O3.