Skip to content
Snippets Groups Projects
Commit efe9e054 authored by Jason R Wilson's avatar Jason R Wilson
Browse files

updated files for lecture 26

parent 7b4bc253
Branches main
No related merge requests found
......@@ -30,25 +30,29 @@ __global__ void sumKernel(uint64 N) {
if (thread_num == 0) {
printf ("sum = %.10f\n",sum);
}
}
int main (int argc, char** argv) {
// get N and num_threads from the command line
// get N and B from the command line
// B is the number of threads per block
// we typically choose B to be a multiple of 32
// the maximum value of B is 1024
if (argc < 3) {
printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
printf ("Command usage : %s %s %s\n",argv[0],"N","B");
return 1;
}
uint64 N = atoll(argv[1]);
int num_threads = atoi(argv[2]);
printf ("number of threads = %d\n",num_threads);
int B = atoi(argv[2]);
printf ("N = %llu\n",N);
printf ("number of threads = %d\n",B);
// start the timer
clock_t start = clock();
// launch kernel
sumKernel <<< 1, num_threads >>> (N);
sumKernel <<< 1, B >>> (N);
cudaDeviceSynchronize();
// stop the timer
......
......@@ -30,25 +30,29 @@ __global__ void sumKernel(uint64 N) {
if (thread_num == 0) {
printf ("sum = %.10f\n",sum);
}
}
int main (int argc, char** argv) {
// get N and num_threads from the command line
// get N and B from the command line
// B is the number of threads per block
// we typically choose B to be a multiple of 32
// the maximum value of B is 1024
if (argc < 3) {
printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
printf ("Command usage : %s %s %s\n",argv[0],"N","B");
return 1;
}
uint64 N = atoll(argv[1]);
int num_threads = atoi(argv[2]);
printf ("number of threads = %d\n",num_threads);
int B = atoi(argv[2]);
printf ("N = %llu\n",N);
printf ("number of threads = %d\n",B);
// start the timer
clock_t start = clock();
// launch kernel
sumKernel <<< 1, num_threads >>> (N);
sumKernel <<< 1, B >>> (N);
cudaDeviceSynchronize();
// stop the timer
......
......@@ -9,10 +9,10 @@ __global__ void sumKernel(uint64 N, double* sum) {
uint64 thread_num = (uint64)blockIdx.x*blockDim.x + threadIdx.x;
if (thread_num < N) {
double term = 1.0/(thread_num+1);
atomicAdd(sum,term);
double term = 1.0/(thread_num+1);
atomicAdd(sum,term);
}
}
int main (int argc, char** argv) {
......@@ -36,7 +36,7 @@ int main (int argc, char** argv) {
printf ("threads per block B = %d\n",B);
printf ("number of thread blocks G = %d\n",G);
printf ("number of threads G*B = %llu\n",(uint64)G*B);
// the computed sum in device memory
double* d_sum;
cudaMalloc (&d_sum,sizeof(double));
......
......@@ -12,10 +12,10 @@ __global__ void sumKernel(uint64 N, uint64 T, double* sum) {
uint64 start = thread_num*T;
uint64 end = start+T;
if (end > N) {
end = N;
end = N;
}
for (uint64 i = start; i<end;i++) {
thread_sum += 1.0/(i+1);
thread_sum += 1.0/(i+1);
}
atomicAdd(sum,thread_sum);
}
......@@ -45,7 +45,7 @@ int main (int argc, char** argv) {
printf ("threads per block B = %d\n",B);
printf ("number of thread blocks G = %d\n",G);
printf ("number of threads G*B = %llu\n",(uint64)G*B);
// the computed sum in device memory
double* d_sum;
cudaMalloc (&d_sum,sizeof(double));
......
# C Sequential (compiled with -O3 and run on tinkercliffs)
N = 1000000000 (1 billion)
sum = 21.3004815023
elapsed time = 2.06 seconds
# CUDA Version 1 (one thread block)
N = 1000000000 (1 billion)
number of threads = 256
sum = 21.3004815023
elapsed time = 0.43 seconds
# CUDA Version 2 (first full GPU attempt)
N = 1000000000 (1 billion)
threads per block B = 256
number of thread blocks G = 3906250
number of threads G*B = 1000000000
harmonic sum = 21.3004815023
elapsed time = 2.36 seconds
# CUDA Version 3 (second full GPU attempt)
N = 1000000000 (1 billion)
terms per thread T = 1000
threads per block B = 256
number of thread blocks G = 3907
number of threads G*B = 1000192
sum = 21.3004815023
elapsed time = 0.01 seconds
# CUDA Version 3 (second full GPU attempt)
N = 1000000000000 (1 trillion)
terms per thread T = 10000
threads per block B = 256
number of thread blocks G = 390625
number of threads G*B = 100000000
sum = 28.2082367808
elapsed time = 4.21 seconds
# Note that the speedup (from sequential) is (2.06*1000)/4.21 = 489.3,
# where 2.06*1000 extrapolates the sequential time for 1 billion terms to 1 trillion terms.
# For this problem, an Nvidia P100 GPU launched on June 20th, 2016
# is nearly 500 times faster than the C sequential version compiled with -O3.
# CUDA Version 3 (second full GPU attempt)
# Run on a V100 (80 SMs) instead of a P100 (56 SMs)!
N = 1000000000000 (1 trillion)
terms per thread T = 10000
threads per block B = 256
number of thread blocks G = 390625
number of threads G*B = 100000000
sum = 28.2082367808
elapsed time = 2.02 seconds
# Note that the speedup (from sequential) is (2.06*1000)/2.02 = 1020,
# where 2.06*1000 extrapolates the sequential time for 1 billion terms to 1 trillion terms.
# For this problem, an Nvidia V100 GPU launched on June 21st, 2017
# is over 1000 times faster than the C sequential version compiled with -O3.
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment