diff --git a/L23/gpu_hello.cu b/L23/gpu_hello.cu
new file mode 100644
index 0000000000000000000000000000000000000000..73d3f1b7deb75c889ffd1733922f412d754933c9
--- /dev/null
+++ b/L23/gpu_hello.cu
@@ -0,0 +1,42 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+
+/* Placeholder kernel: the body is left to be filled in at the marker below. */
+__global__ void helloKernel() {
+
+    /*****************/
+    /* Add Code Here */
+    /*****************/
+
+}
+
+/* Reads num_threads from the command line and launches helloKernel on a */
+/* single block of num_threads threads. Returns 1 on a usage or CUDA error. */
+int main(int argc, char **argv) {
+
+    /* get num_threads from the command line */
+    if (argc < 2) {
+        printf ("Command usage : %s %s\n",argv[0],"num_threads");
+        return 1;
+    }
+
+    int num_threads = atoi(argv[1]);
+
+    printf ("num_threads = %d\n",num_threads);
+
+    helloKernel <<< 1, num_threads >>> ();
+
+    /* a bad launch configuration is only visible through cudaGetLastError; */
+    /* faults raised while the kernel runs surface at the synchronize call */
+    cudaError_t err = cudaGetLastError();
+    if (err == cudaSuccess) {
+        err = cudaDeviceSynchronize();
+    }
+    if (err != cudaSuccess) {
+        fprintf (stderr,"CUDA error : %s\n",cudaGetErrorString(err));
+        return 1;
+    }
+
+    return 0;
+}
diff --git a/L23/gpu_hello.sh b/L23/gpu_hello.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b5ec0fe8de2adcb655631eadacd90b07e3f6b5ed
--- /dev/null
+++ b/L23/gpu_hello.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+#SBATCH -A cmda3634_rjh
+#SBATCH -p p100_normal_q
+#SBATCH -t 5
+#SBATCH --gres=gpu:1
+#SBATCH -o gpu_hello.out
+
+# Go to the directory where the job was submitted
+cd $SLURM_SUBMIT_DIR
+
+# Load CUDA toolkit module
+module load cuda11.6/toolkit/11.6.2
+
+# compile
+nvcc -w -arch=sm_60 -o gpu_hello gpu_hello.cu
+
+# run hello
+./gpu_hello $1
+
diff --git a/L23/gpu_sum.cu b/L23/gpu_sum.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4914ba7ac58bdd5b5ec98adbd49a1798305e52ee
--- /dev/null
+++ b/L23/gpu_sum.cu
@@ -0,0 +1,57 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+
+typedef unsigned long long int uint64;
+
+/* Every thread independently adds up 1..N and prints its own result, */
+/* so all num_threads threads repeat the same work. */
+__global__ void sumKernel(uint64 N) {
+
+    int thread_num = threadIdx.x;
+    int num_threads = blockDim.x;
+
+    uint64 sum = 0;
+    for (uint64 i = 1; i <= N;i++) {
+        sum += i;
+    }
+
+    printf (" on thread %d of %d, sum = %llu\n",thread_num,num_threads,sum);
+}
+
+/* Reads N and num_threads from the command line, prints the closed-form */
+/* value of 1 + 2 + ... + N for comparison, and runs sumKernel on a single */
+/* block of num_threads threads. Returns 1 on a usage or CUDA error. */
+int main(int argc, char **argv) {
+
+    /* get N and num_threads from the command line */
+    if (argc < 3) {
+        printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
+        return 1;
+    }
+
+    uint64 N = strtoull(argv[1],NULL,10);
+    int num_threads = atoi(argv[2]);
+
+    printf ("num_threads = %d\n",num_threads);
+
+    /* divide whichever of N and N+1 is even, so the integer division */
+    /* is exact for odd N as well */
+    uint64 expected = (N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2);
+    printf ("N*(N+1)/2 = %llu\n",expected);
+
+    sumKernel <<< 1, num_threads >>> (N);
+
+    /* a bad launch configuration is only visible through cudaGetLastError; */
+    /* faults raised while the kernel runs surface at the synchronize call */
+    cudaError_t err = cudaGetLastError();
+    if (err == cudaSuccess) {
+        err = cudaDeviceSynchronize();
+    }
+    if (err != cudaSuccess) {
+        fprintf (stderr,"CUDA error : %s\n",cudaGetErrorString(err));
+        return 1;
+    }
+
+    return 0;
+}
diff --git a/L23/gpu_sum.sh b/L23/gpu_sum.sh
new file mode 100644
index 0000000000000000000000000000000000000000..90e40346d0ee4edc3de12acaded01040b26fa769
--- /dev/null
+++ b/L23/gpu_sum.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+#SBATCH -A cmda3634_rjh
+#SBATCH -p p100_normal_q
+#SBATCH -t 5
+#SBATCH --gres=gpu:1
+#SBATCH -o gpu_sum.out
+
+# Go to the directory where the job was submitted
+cd $SLURM_SUBMIT_DIR
+
+# Load CUDA toolkit module
+module load cuda11.6/toolkit/11.6.2
+
+# compile
+nvcc -w -arch=sm_60 -o gpu_sum gpu_sum.cu
+
+# run sum
+./gpu_sum $1 $2
+
diff --git a/L23/key/gpu_hello.cu b/L23/key/gpu_hello.cu
new file mode 100644
index 0000000000000000000000000000000000000000..92258d87d6142a89ff87dfa536a2ee9e5c587f06
--- /dev/null
+++ b/L23/key/gpu_hello.cu
@@ -0,0 +1,42 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+
+/* Each thread prints a greeting with its index and the block size. */
+__global__ void helloKernel() {
+
+    int thread_num = threadIdx.x;
+    int num_threads = blockDim.x;
+
+    printf (" Hello World! from thread %d of %d\n",thread_num,num_threads);
+}
+
+/* Reads num_threads from the command line and launches helloKernel on a */
+/* single block of num_threads threads. Returns 1 on a usage or CUDA error. */
+int main(int argc, char **argv) {
+
+    /* get num_threads from the command line */
+    if (argc < 2) {
+        printf ("Command usage : %s %s\n",argv[0],"num_threads");
+        return 1;
+    }
+
+    int num_threads = atoi(argv[1]);
+
+    printf ("num_threads = %d\n",num_threads);
+
+    helloKernel <<< 1, num_threads >>> ();
+
+    /* a bad launch configuration is only visible through cudaGetLastError; */
+    /* faults raised while the kernel runs surface at the synchronize call */
+    cudaError_t err = cudaGetLastError();
+    if (err == cudaSuccess) {
+        err = cudaDeviceSynchronize();
+    }
+    if (err != cudaSuccess) {
+        fprintf (stderr,"CUDA error : %s\n",cudaGetErrorString(err));
+        return 1;
+    }
+
+    return 0;
+}
diff --git a/L23/key/gpu_sum_v1.cu b/L23/key/gpu_sum_v1.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4914ba7ac58bdd5b5ec98adbd49a1798305e52ee
--- /dev/null
+++ b/L23/key/gpu_sum_v1.cu
@@ -0,0 +1,57 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+
+typedef unsigned long long int uint64;
+
+/* Every thread independently adds up 1..N and prints its own result, */
+/* so all num_threads threads repeat the same work. */
+__global__ void sumKernel(uint64 N) {
+
+    int thread_num = threadIdx.x;
+    int num_threads = blockDim.x;
+
+    uint64 sum = 0;
+    for (uint64 i = 1; i <= N;i++) {
+        sum += i;
+    }
+
+    printf (" on thread %d of %d, sum = %llu\n",thread_num,num_threads,sum);
+}
+
+/* Reads N and num_threads from the command line, prints the closed-form */
+/* value of 1 + 2 + ... + N for comparison, and runs sumKernel on a single */
+/* block of num_threads threads. Returns 1 on a usage or CUDA error. */
+int main(int argc, char **argv) {
+
+    /* get N and num_threads from the command line */
+    if (argc < 3) {
+        printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
+        return 1;
+    }
+
+    uint64 N = strtoull(argv[1],NULL,10);
+    int num_threads = atoi(argv[2]);
+
+    printf ("num_threads = %d\n",num_threads);
+
+    /* divide whichever of N and N+1 is even, so the integer division */
+    /* is exact for odd N as well */
+    uint64 expected = (N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2);
+    printf ("N*(N+1)/2 = %llu\n",expected);
+
+    sumKernel <<< 1, num_threads >>> (N);
+
+    /* a bad launch configuration is only visible through cudaGetLastError; */
+    /* faults raised while the kernel runs surface at the synchronize call */
+    cudaError_t err = cudaGetLastError();
+    if (err == cudaSuccess) {
+        err = cudaDeviceSynchronize();
+    }
+    if (err != cudaSuccess) {
+        fprintf (stderr,"CUDA error : %s\n",cudaGetErrorString(err));
+        return 1;
+    }
+
+    return 0;
+}
diff --git a/L23/key/gpu_sum_v2.cu b/L23/key/gpu_sum_v2.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8a4e890b4fd73c5b0ad754a54d8f01508958ffdd
--- /dev/null
+++ b/L23/key/gpu_sum_v2.cu
@@ -0,0 +1,57 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+
+typedef unsigned long long int uint64;
+
+/* Thread t adds up t+1, t+1+num_threads, t+1+2*num_threads, ... up to N */
+/* and prints its own partial sum; the partial sums are not combined. */
+__global__ void sumKernel(uint64 N) {
+
+    int thread_num = threadIdx.x;
+    int num_threads = blockDim.x;
+
+    uint64 sum = 0;
+    for (uint64 i = 1+thread_num; i <= N;i+=num_threads) {
+        sum += i;
+    }
+
+    printf (" on thread %d of %d, sum = %llu\n",thread_num,num_threads,sum);
+}
+
+/* Reads N and num_threads from the command line, prints the closed-form */
+/* value of 1 + 2 + ... + N for comparison, and runs sumKernel on a single */
+/* block of num_threads threads. Returns 1 on a usage or CUDA error. */
+int main(int argc, char **argv) {
+
+    /* get N and num_threads from the command line */
+    if (argc < 3) {
+        printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
+        return 1;
+    }
+
+    uint64 N = strtoull(argv[1],NULL,10);
+    int num_threads = atoi(argv[2]);
+
+    printf ("num_threads = %d\n",num_threads);
+
+    /* divide whichever of N and N+1 is even, so the integer division */
+    /* is exact for odd N as well */
+    uint64 expected = (N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2);
+    printf ("N*(N+1)/2 = %llu\n",expected);
+
+    sumKernel <<< 1, num_threads >>> (N);
+
+    /* a bad launch configuration is only visible through cudaGetLastError; */
+    /* faults raised while the kernel runs surface at the synchronize call */
+    cudaError_t err = cudaGetLastError();
+    if (err == cudaSuccess) {
+        err = cudaDeviceSynchronize();
+    }
+    if (err != cudaSuccess) {
+        fprintf (stderr,"CUDA error : %s\n",cudaGetErrorString(err));
+        return 1;
+    }
+
+    return 0;
+}
diff --git a/L23/key/gpu_sum_v3.cu b/L23/key/gpu_sum_v3.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a427c7daff67e147367a4edcf20425eaaf1b731b
--- /dev/null
+++ b/L23/key/gpu_sum_v3.cu
@@ -0,0 +1,71 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+
+typedef unsigned long long int uint64;
+
+/* All threads of the block accumulate their strided share of 1..N into one */
+/* __shared__ variable, and thread 0 prints it. */
+/* NOTE(review): "sum += i" is an unsynchronized read-modify-write on shared */
+/* memory and there is no barrier after the initialization or before the */
+/* print, so the printed value can be wrong. Left as is because the later */
+/* versions (v4-v6) appear to address exactly these points - confirm intent. */
+__global__ void sumKernel(uint64 N) {
+
+    __shared__ uint64 sum;
+
+    int thread_num = threadIdx.x;
+    int num_threads = blockDim.x;
+
+    /* thread 0 initializes sum to 0 */
+    if (thread_num == 0) {
+        sum = 0;
+    }
+
+    /* calculate the sum */
+    for (uint64 i = 1+thread_num; i <= N;i+=num_threads) {
+        sum += i;
+    }
+
+    /* thread 0 prints the sum */
+    if (thread_num == 0) {
+        printf (" sum = %llu\n",sum);
+    }
+}
+
+/* Reads N and num_threads from the command line, prints the closed-form */
+/* value of 1 + 2 + ... + N for comparison, and runs sumKernel on a single */
+/* block of num_threads threads. Returns 1 on a usage or CUDA error. */
+int main(int argc, char **argv) {
+
+    /* get N and num_threads from the command line */
+    if (argc < 3) {
+        printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
+        return 1;
+    }
+
+    uint64 N = strtoull(argv[1],NULL,10);
+    int num_threads = atoi(argv[2]);
+
+    printf ("num_threads = %d\n",num_threads);
+
+    /* divide whichever of N and N+1 is even, so the integer division */
+    /* is exact for odd N as well */
+    uint64 expected = (N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2);
+    printf ("N*(N+1)/2 = %llu\n",expected);
+
+    sumKernel <<< 1, num_threads >>> (N);
+
+    /* a bad launch configuration is only visible through cudaGetLastError; */
+    /* faults raised while the kernel runs surface at the synchronize call */
+    cudaError_t err = cudaGetLastError();
+    if (err == cudaSuccess) {
+        err = cudaDeviceSynchronize();
+    }
+    if (err != cudaSuccess) {
+        fprintf (stderr,"CUDA error : %s\n",cudaGetErrorString(err));
+        return 1;
+    }
+
+    return 0;
+}
diff --git a/L23/key/gpu_sum_v4.cu b/L23/key/gpu_sum_v4.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9c51068a9c18ee4ad14db76340f4b8ae32935c9d
--- /dev/null
+++ b/L23/key/gpu_sum_v4.cu
@@ -0,0 +1,70 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+
+typedef unsigned long long int uint64;
+
+/* All threads of the block add their strided share of 1..N, one element at */
+/* a time, to one __shared__ sum with atomicAdd; thread 0 prints the sum. */
+/* NOTE(review): there is still no barrier after thread 0 zeroes sum or */
+/* before it prints, so other threads' updates can be overwritten or missed; */
+/* left as is since v6 appears to add the barriers - confirm intent. */
+__global__ void sumKernel(uint64 N) {
+
+    __shared__ uint64 sum;
+
+    int thread_num = threadIdx.x;
+    int num_threads = blockDim.x;
+
+    /* initialize sum to 0 */
+    if (thread_num == 0) {
+        sum = 0;
+    }
+
+    /* calculate the sum */
+    for (uint64 i = 1+thread_num; i <= N;i+=num_threads) {
+        atomicAdd(&sum,i);
+    }
+
+    /* thread 0 prints the sum */
+    if (thread_num == 0) {
+        printf (" sum = %llu\n",sum);
+    }
+}
+
+/* Reads N and num_threads from the command line, prints the closed-form */
+/* value of 1 + 2 + ... + N for comparison, and runs sumKernel on a single */
+/* block of num_threads threads. Returns 1 on a usage or CUDA error. */
+int main(int argc, char **argv) {
+
+    /* get N and num_threads from the command line */
+    if (argc < 3) {
+        printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
+        return 1;
+    }
+
+    uint64 N = strtoull(argv[1],NULL,10);
+    int num_threads = atoi(argv[2]);
+
+    printf ("num_threads = %d\n",num_threads);
+
+    /* divide whichever of N and N+1 is even, so the integer division */
+    /* is exact for odd N as well */
+    uint64 expected = (N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2);
+    printf ("N*(N+1)/2 = %llu\n",expected);
+
+    sumKernel <<< 1, num_threads >>> (N);
+
+    /* a bad launch configuration is only visible through cudaGetLastError; */
+    /* faults raised while the kernel runs surface at the synchronize call */
+    cudaError_t err = cudaGetLastError();
+    if (err == cudaSuccess) {
+        err = cudaDeviceSynchronize();
+    }
+    if (err != cudaSuccess) {
+        fprintf (stderr,"CUDA error : %s\n",cudaGetErrorString(err));
+        return 1;
+    }
+
+    return 0;
+}
diff --git a/L23/key/gpu_sum_v5.cu b/L23/key/gpu_sum_v5.cu
new file mode 100644
index 0000000000000000000000000000000000000000..643a59f52a72d3499c428b39bb39522494122584
--- /dev/null
+++ b/L23/key/gpu_sum_v5.cu
@@ -0,0 +1,72 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+
+typedef unsigned long long int uint64;
+
+/* Each thread accumulates its strided share of 1..N in a local variable and */
+/* adds it to the __shared__ sum with a single atomicAdd; thread 0 prints. */
+/* NOTE(review): no barrier follows the zeroing of sum or precedes the print, */
+/* so the printed value can still be incomplete; left as is since v6 appears */
+/* to add the missing __syncthreads() calls - confirm intent. */
+__global__ void sumKernel(uint64 N) {
+
+    __shared__ uint64 sum;
+
+    int thread_num = threadIdx.x;
+    int num_threads = blockDim.x;
+
+    /* initialize sum to 0 */
+    if (thread_num == 0) {
+        sum = 0;
+    }
+
+    /* calculate the sum */
+    uint64 thread_sum = 0;
+    for (uint64 i = 1+thread_num; i <= N;i+=num_threads) {
+        thread_sum += i;
+    }
+    atomicAdd(&sum,thread_sum);
+
+    /* thread 0 prints the sum */
+    if (thread_num == 0) {
+        printf (" sum = %llu\n",sum);
+    }
+}
+
+/* Reads N and num_threads from the command line, prints the closed-form */
+/* value of 1 + 2 + ... + N for comparison, and runs sumKernel on a single */
+/* block of num_threads threads. Returns 1 on a usage or CUDA error. */
+int main(int argc, char **argv) {
+
+    /* get N and num_threads from the command line */
+    if (argc < 3) {
+        printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
+        return 1;
+    }
+
+    uint64 N = strtoull(argv[1],NULL,10);
+    int num_threads = atoi(argv[2]);
+
+    printf ("num_threads = %d\n",num_threads);
+
+    /* divide whichever of N and N+1 is even, so the integer division */
+    /* is exact for odd N as well */
+    uint64 expected = (N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2);
+    printf ("N*(N+1)/2 = %llu\n",expected);
+
+    sumKernel <<< 1, num_threads >>> (N);
+
+    /* a bad launch configuration is only visible through cudaGetLastError; */
+    /* faults raised while the kernel runs surface at the synchronize call */
+    cudaError_t err = cudaGetLastError();
+    if (err == cudaSuccess) {
+        err = cudaDeviceSynchronize();
+    }
+    if (err != cudaSuccess) {
+        fprintf (stderr,"CUDA error : %s\n",cudaGetErrorString(err));
+        return 1;
+    }
+
+    return 0;
+}
diff --git a/L23/key/gpu_sum_v6.cu b/L23/key/gpu_sum_v6.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0199b224c1b75d7efa4deb78a2e3e4a8ba0f2718
--- /dev/null
+++ b/L23/key/gpu_sum_v6.cu
@@ -0,0 +1,72 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+
+typedef unsigned long long int uint64;
+
+/* Each thread accumulates its strided share of 1..N in a local variable and */
+/* adds it to the __shared__ sum with a single atomicAdd. Barriers separate */
+/* the initialization, the accumulation and the final print by thread 0. */
+__global__ void sumKernel(uint64 N) {
+
+    __shared__ uint64 sum;
+
+    int thread_num = threadIdx.x;
+    int num_threads = blockDim.x;
+
+    /* initialize sum to 0 */
+    if (thread_num == 0) {
+        sum = 0;
+    }
+    __syncthreads();
+
+    /* calculate the sum */
+    uint64 thread_sum = 0;
+    for (uint64 i = 1+thread_num; i <= N;i+=num_threads) {
+        thread_sum += i;
+    }
+    atomicAdd(&sum,thread_sum);
+    __syncthreads();
+
+    /* thread 0 prints the sum */
+    if (thread_num == 0) {
+        printf (" sum = %llu\n",sum);
+    }
+}
+
+/* Reads N and num_threads from the command line, prints the closed-form */
+/* value of 1 + 2 + ... + N for comparison, and runs sumKernel on a single */
+/* block of num_threads threads. Returns 1 on a usage or CUDA error. */
+int main(int argc, char **argv) {
+
+    /* get N and num_threads from the command line */
+    if (argc < 3) {
+        printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
+        return 1;
+    }
+
+    uint64 N = strtoull(argv[1],NULL,10);
+    int num_threads = atoi(argv[2]);
+
+    printf ("num_threads = %d\n",num_threads);
+
+    /* divide whichever of N and N+1 is even, so the integer division */
+    /* is exact for odd N as well */
+    uint64 expected = (N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2);
+    printf ("N*(N+1)/2 = %llu\n",expected);
+
+    sumKernel <<< 1, num_threads >>> (N);
+
+    /* a bad launch configuration is only visible through cudaGetLastError; */
+    /* faults raised while the kernel runs surface at the synchronize call */
+    cudaError_t err = cudaGetLastError();
+    if (err == cudaSuccess) {
+        err = cudaDeviceSynchronize();
+    }
+    if (err != cudaSuccess) {
+        fprintf (stderr,"CUDA error : %s\n",cudaGetErrorString(err));
+        return 1;
+    }
+
+    return 0;
+}