updated files for lecture 26

efe9e054 · Jason R Wilson · 7b4bc253 · efe9e054 · efe9e054 · efe9e054
Commit efe9e054 authored 10 months ago by Jason R Wilson
--- a/L26/gpu_harmonic.cu
+++ b/L26/gpu_harmonic.cu
@@ -30,25 +30,29 @@ __global__ void sumKernel(uint64 N) {
    if (thread_num == 0) {
        printf ("sum = %.10f\n",sum);
    }
-    
+
 }

 int main (int argc, char** argv) {

-    // get N and num_threads from the command line
+    // get N and B from the command line
+    // B is the number of threads per block
+    // we typically choose B to be a multiple of 32
+    // the maximum value of B is 1024
    if (argc < 3) {
-        printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
+        printf ("Command usage : %s %s %s\n",argv[0],"N","B");
        return 1;
    }
    uint64 N = atoll(argv[1]);
-    int num_threads = atoi(argv[2]);
-    printf ("number of threads = %d\n",num_threads);
+    int B = atoi(argv[2]);
+    printf ("N = %llu\n",N);
+    printf ("number of threads = %d\n",B);

    // start the timer
    clock_t start = clock();

    // launch kernel
-    sumKernel <<< 1, num_threads >>> (N);
+    sumKernel <<< 1, B >>> (N);
    cudaDeviceSynchronize();

    // stop the timer

--- a/L26/key/gpu_harmonic_v1.cu
+++ b/L26/key/gpu_harmonic_v1.cu
@@ -30,25 +30,29 @@ __global__ void sumKernel(uint64 N) {
    if (thread_num == 0) {
        printf ("sum = %.10f\n",sum);
    }
-    
+
 }

 int main (int argc, char** argv) {

-    // get N and num_threads from the command line
+    // get N and B from the command line
+    // B is the number of threads per block
+    // we typically choose B to be a multiple of 32
+    // the maximum value of B is 1024
    if (argc < 3) {
-        printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
+        printf ("Command usage : %s %s %s\n",argv[0],"N","B");
        return 1;
    }
    uint64 N = atoll(argv[1]);
-    int num_threads = atoi(argv[2]);
-    printf ("number of threads = %d\n",num_threads);
+    int B = atoi(argv[2]);
+    printf ("N = %llu\n",N);
+    printf ("number of threads = %d\n",B);

    // start the timer
    clock_t start = clock();

    // launch kernel
-    sumKernel <<< 1, num_threads >>> (N);
+    sumKernel <<< 1, B >>> (N);
    cudaDeviceSynchronize();

    // stop the timer

--- a/L26/key/gpu_harmonic_v2.cu
+++ b/L26/key/gpu_harmonic_v2.cu
@@ -9,10 +9,10 @@ __global__ void sumKernel(uint64 N, double* sum) {

    uint64 thread_num = (uint64)blockIdx.x*blockDim.x + threadIdx.x;
    if (thread_num < N) {
-	double term = 1.0/(thread_num+1);
-	atomicAdd(sum,term);
+        double term = 1.0/(thread_num+1);
+        atomicAdd(sum,term);
    }
-    
+
 }

 int main (int argc, char** argv) {
@@ -36,7 +36,7 @@ int main (int argc, char** argv) {
    printf ("threads per block B = %d\n",B);
    printf ("number of thread blocks G = %d\n",G);
    printf ("number of threads G*B = %llu\n",(uint64)G*B);
-    
+
    // the computed sum in device memory
    double* d_sum;
    cudaMalloc (&d_sum,sizeof(double));

--- a/L26/key/gpu_harmonic_v3.cu
+++ b/L26/key/gpu_harmonic_v3.cu
@@ -12,10 +12,10 @@ __global__ void sumKernel(uint64 N, uint64 T, double* sum) {
    uint64 start = thread_num*T;
    uint64 end = start+T;
    if (end > N) {
-	end = N;
+        end = N;
    }
    for (uint64 i = start; i<end;i++) {
-	thread_sum += 1.0/(i+1);
+        thread_sum += 1.0/(i+1);
    }
    atomicAdd(sum,thread_sum);
 }
@@ -45,7 +45,7 @@ int main (int argc, char** argv) {
    printf ("threads per block B = %d\n",B);
    printf ("number of thread blocks G = %d\n",G);
    printf ("number of threads G*B = %llu\n",(uint64)G*B);
-    
+
    // the computed sum in device memory
    double* d_sum;
    cudaMalloc (&d_sum,sizeof(double));

--- a/L26/results.out
+++ b/L26/results.out
+# C Sequential (compiled with -O3 and run on tinkercliffs)
+N = 1000000000 (1 billion)
+sum = 21.3004815023
+elapsed time = 2.06 seconds
+
+# CUDA Version 1 (one thread block)
+N = 1000000000 (1 billion)
+number of threads = 256
+sum = 21.3004815023
+elapsed time = 0.43 seconds
+
+# CUDA Version 2 (first full GPU attempt)
+N = 1000000000 (1 billion)
+threads per block B = 256
+number of thread blocks G = 3906250
+number of threads G*B = 1000000000
+harmonic sum = 21.3004815023
+elapsed time = 2.36 seconds
+
+# CUDA Version 3 (second full GPU attempt)
+N = 1000000000 (1 billion)
+terms per thread T = 1000
+threads per block B = 256
+number of thread blocks G = 3907
+number of threads G*B = 1000192
+sum = 21.3004815023
+elapsed time = 0.01 seconds
+
+# CUDA Version 3 (second full GPU attempt)
+N = 1000000000000 (1 trillion)
+terms per thread T = 10000
+threads per block B = 256
+number of thread blocks G = 390625
+number of threads G*B = 100000000
+sum = 28.2082367808
+elapsed time = 4.21 seconds
+
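+# Note that the speedup (from sequential) is (2.06*1000)/4.21 = 489.3,
+#  where the sequential time for N = 1 trillion is estimated as 1000 times
+#  the 2.06 seconds measured for N = 1 billion.
+# For this problem, an Nvidia P100 GPU launched on June 20th, 2016
+#  is nearly 500 times faster than the C sequential version compiled with -O3.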
+
+# CUDA Version 3 (second full GPU attempt)
+# Run on a V100 (80 SMs) instead of a P100 (56 SMs)!
+N = 1000000000000 (1 trillion)
+terms per thread T = 10000
+threads per block B = 256
+number of thread blocks G = 390625
+number of threads G*B = 100000000
+sum = 28.2082367808
+elapsed time = 2.02 seconds
+
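+# Note that the speedup (from sequential) is (2.06*1000)/2.02 ≈ 1020,
+#  where the sequential time for N = 1 trillion is estimated as 1000 times
+#  the 2.06 seconds measured for N = 1 billion.
+# For this problem, an Nvidia V100 GPU launched on June 21st, 2017
+#  is over 1000 times faster than the C sequential version compiled with -O3.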