diff --git a/L26/gpu_harmonic.cu b/L26/gpu_harmonic.cu
index 146c1c29419dfa8787bad928a0975a0743f15f64..6a2363e4762e6da8d190c1b40bba38fef9ec4e5d 100644
--- a/L26/gpu_harmonic.cu
+++ b/L26/gpu_harmonic.cu
@@ -30,25 +30,29 @@ __global__ void sumKernel(uint64 N) {
     if (thread_num == 0) {
         printf ("sum = %.10f\n",sum);
     }
-    
+
 }
 
 int main (int argc, char** argv) {
 
-    // get N and num_threads from the command line
+    // get N and B from the command line
+    // B is the number of threads per block
+    // we typically choose B to be a multiple of 32
+    // the maximum value of B is 1024
     if (argc < 3) {
-        printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
+        printf ("Command usage : %s %s %s\n",argv[0],"N","B");
         return 1;
     }
     uint64 N = atoll(argv[1]);
-    int num_threads = atoi(argv[2]);
-    printf ("number of threads = %d\n",num_threads);
+    int B = atoi(argv[2]);
+    printf ("N = %llu\n",N);
+    printf ("number of threads = %d\n",B);
 
     // start the timer
     clock_t start = clock();
 
     // launch kernel
-    sumKernel <<< 1, num_threads >>> (N);
+    sumKernel <<< 1, B >>> (N);
     cudaDeviceSynchronize();
 
     // stop the timer
diff --git a/L26/key/gpu_harmonic_v1.cu b/L26/key/gpu_harmonic_v1.cu
index 146c1c29419dfa8787bad928a0975a0743f15f64..6a2363e4762e6da8d190c1b40bba38fef9ec4e5d 100644
--- a/L26/key/gpu_harmonic_v1.cu
+++ b/L26/key/gpu_harmonic_v1.cu
@@ -30,25 +30,29 @@ __global__ void sumKernel(uint64 N) {
     if (thread_num == 0) {
         printf ("sum = %.10f\n",sum);
     }
-    
+
 }
 
 int main (int argc, char** argv) {
 
-    // get N and num_threads from the command line
+    // get N and B from the command line
+    // B is the number of threads per block
+    // we typically choose B to be a multiple of 32
+    // the maximum value of B is 1024
     if (argc < 3) {
-        printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
+        printf ("Command usage : %s %s %s\n",argv[0],"N","B");
         return 1;
     }
     uint64 N = atoll(argv[1]);
-    int num_threads = atoi(argv[2]);
-    printf ("number of threads = %d\n",num_threads);
+    int B = atoi(argv[2]);
+    printf ("N = %llu\n",N);
+    printf ("number of threads = %d\n",B);
 
     // start the timer
     clock_t start = clock();
 
     // launch kernel
-    sumKernel <<< 1, num_threads >>> (N);
+    sumKernel <<< 1, B >>> (N);
     cudaDeviceSynchronize();
 
     // stop the timer
diff --git a/L26/key/gpu_harmonic_v2.cu b/L26/key/gpu_harmonic_v2.cu
index 7ddc60dcb3254733567d5e5bdb1dbf10e5a5e927..80a6026b0e64995383061dc2912c4ad09e7e3a37 100644
--- a/L26/key/gpu_harmonic_v2.cu
+++ b/L26/key/gpu_harmonic_v2.cu
@@ -9,10 +9,10 @@ __global__ void sumKernel(uint64 N, double* sum) {
 
     uint64 thread_num = (uint64)blockIdx.x*blockDim.x + threadIdx.x;
     if (thread_num < N) {
-	double term = 1.0/(thread_num+1);
-	atomicAdd(sum,term);
+        double term = 1.0/(thread_num+1);
+        atomicAdd(sum,term);
     }
-    
+
 }
 
 int main (int argc, char** argv) {
@@ -36,7 +36,7 @@ int main (int argc, char** argv) {
     printf ("threads per block B = %d\n",B);
     printf ("number of thread blocks G = %d\n",G);
     printf ("number of threads G*B = %llu\n",(uint64)G*B);
-    
+
     // the computed sum in device memory
     double* d_sum;
     cudaMalloc (&d_sum,sizeof(double));
diff --git a/L26/key/gpu_harmonic_v3.cu b/L26/key/gpu_harmonic_v3.cu
index d787319e5629959d330836053f24514e7e92be9f..c7c93efe2c21f076bcaaafe53ff3e54c351bb60c 100644
--- a/L26/key/gpu_harmonic_v3.cu
+++ b/L26/key/gpu_harmonic_v3.cu
@@ -12,10 +12,10 @@ __global__ void sumKernel(uint64 N, uint64 T, double* sum) {
     uint64 start = thread_num*T;
     uint64 end = start+T;
     if (end > N) {
-	end = N;
+        end = N;
     }
     for (uint64 i = start; i<end;i++) {
-	thread_sum += 1.0/(i+1);
+        thread_sum += 1.0/(i+1);
     }
     atomicAdd(sum,thread_sum);
 }
@@ -45,7 +45,7 @@ int main (int argc, char** argv) {
     printf ("threads per block B = %d\n",B);
     printf ("number of thread blocks G = %d\n",G);
     printf ("number of threads G*B = %llu\n",(uint64)G*B);
-    
+
     // the computed sum in device memory
     double* d_sum;
     cudaMalloc (&d_sum,sizeof(double));
diff --git a/L26/results.out b/L26/results.out
new file mode 100644
index 0000000000000000000000000000000000000000..7b89e63cdd0b7d3a33c52700fc8f5494fe1adcd1
--- /dev/null
+++ b/L26/results.out
@@ -0,0 +1,54 @@
+# C Sequential (compiled with -O3 and run on tinkercliffs)
+N = 1000000000 (1 billion)
+sum = 21.3004815023
+elapsed time = 2.06 seconds
+
+# CUDA Version 1 (one thread block)
+N = 1000000000 (1 billion)
+number of threads = 256
+sum = 21.3004815023
+elapsed time = 0.43 seconds
+
+# CUDA Version 2 (first full GPU attempt)
+N = 1000000000 (1 billion)
+threads per block B = 256
+number of thread blocks G = 3906250
+number of threads G*B = 1000000000
+harmonic sum = 21.3004815023
+elapsed time = 2.36 seconds
+
+# CUDA Version 3 (second full GPU attempt)
+N = 1000000000 (1 billion)
+terms per thread T = 1000
+threads per block B = 256
+number of thread blocks G = 3907
+number of threads G*B = 1000192
+sum = 21.3004815023
+elapsed time = 0.01 seconds
+
+# CUDA Version 3 (second full GPU attempt)
+N = 1000000000000 (1 trillion)
+terms per thread T = 10000
+threads per block B = 256
+number of thread blocks G = 390625
+number of threads G*B = 100000000
+sum = 28.2082367808
+elapsed time = 4.21 seconds
+
+# Note that the speedup (from sequential, extrapolating the 1 billion term time of 2.06 s to 1 trillion terms) is (2.06*1000)/4.21 = 489.3
+# For this problem, an Nvidia P100 GPU (released on June 20th, 2016)
+#  is nearly 500 times faster than the C sequential version compiled with -O3.
+
+# CUDA Version 3 (second full GPU attempt)
+# Run on a V100 (80 SMs) instead of a P100 (56 SMs)!
+N = 1000000000000 (1 trillion)
+terms per thread T = 10000
+threads per block B = 256
+number of thread blocks G = 390625
+number of threads G*B = 100000000
+sum = 28.2082367808
+elapsed time = 2.02 seconds
+
+# Note that the speedup (from sequential, extrapolating the 1 billion term time of 2.06 s to 1 trillion terms) is (2.06*1000)/2.02 = 1020
+# For this problem, an Nvidia V100 GPU (released on June 21st, 2017)
+#  is over 1000 times faster than the C sequential version compiled with -O3.