files for lecture23

99385e85 · Jason R Wilson · cc9b938b · 99385e85 · 99385e85 · 99385e85
Commit 99385e85 authored 10 months ago by Jason R Wilson
--- a/L23/gpu_hello.cu
+++ b/L23/gpu_hello.cu
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+
+__global__ void helloKernel() {
+    /* Exercise stub: main() launches this kernel with one block of num_threads
+       threads, but the body below is still empty, so it does nothing yet. */
+    /*****************/
+    /* Add Code Here */
+    /*****************/
+
+}
+
+/*
+ * Host driver: reads num_threads from the command line, launches helloKernel
+ * with a single block of num_threads threads, and waits for it to finish.
+ * Returns 1 on a usage error or a CUDA launch/execution error, 0 otherwise.
+ */
+int main(int argc, char **argv) {
+
+    /* get num_threads from the command line */
+    if (argc < 2) {
+        printf ("Command usage : %s %s\n",argv[0],"num_threads");
+        return 1;
+    }
+
+    int num_threads = atoi(argv[1]);
+
+    printf ("num_threads = %d\n",num_threads);
+
+    helloKernel <<< 1, num_threads >>> ();
+
+    /* a kernel launch returns no status; an invalid configuration (for example
+       num_threads of 0) is only reported through cudaGetLastError() */
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        fprintf (stderr,"kernel launch failed : %s\n",cudaGetErrorString(err));
+        return 1;
+    }
+
+    /* wait for the kernel to finish; errors raised while it runs surface here */
+    err = cudaDeviceSynchronize();
+    if (err != cudaSuccess) {
+        fprintf (stderr,"kernel execution failed : %s\n",cudaGetErrorString(err));
+        return 1;
+    }
+
+    return 0;
+}
--- a/L23/gpu_hello.sh
+++ b/L23/gpu_hello.sh
+#!/bin/bash
+#SBATCH -A cmda3634_rjh
+#SBATCH -p p100_normal_q
+#SBATCH -t 5
+#SBATCH --gres=gpu:1
+#SBATCH -o gpu_hello.out
+
+# Go to the directory where the job was submitted
+cd $SLURM_SUBMIT_DIR
+
+# Load CUDA toolkit module
+module load cuda11.6/toolkit/11.6.2
+
+# compile
+nvcc -w -arch=sm_60 -o gpu_hello gpu_hello.cu
+
+# run hello
+./gpu_hello $1
+
--- a/L23/gpu_sum.cu
+++ b/L23/gpu_sum.cu
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+
+typedef unsigned long long int uint64;
+
+/*
+ * sumKernel : every thread adds up the integers 1..N on its own and prints
+ * its result together with its thread index and the block size.
+ */
+__global__ void sumKernel(uint64 N) {
+
+    /* this thread's index and the number of threads in the block */
+    const int tid = threadIdx.x;
+    const int block_size = blockDim.x;
+
+    /* walk the whole range 1..N; the work is not divided between threads */
+    uint64 total = 0;
+    uint64 k = 1;
+    while (k <= N) {
+        total += k;
+        k++;
+    }
+
+    printf (" on thread %d of %d, sum = %llu\n",tid,block_size,total);
+}
+
+/*
+ * Host driver: reads N and num_threads from the command line, prints the
+ * closed-form value of 1 + 2 + ... + N for comparison, then launches
+ * sumKernel with a single block of num_threads threads and waits for it.
+ * Returns 1 on a usage error or a CUDA launch/execution error, 0 otherwise.
+ */
+int main(int argc, char **argv) {
+
+    /* get N and num_threads from the command line */
+    if (argc < 3) {
+        printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
+        return 1;
+    }
+
+    /* N is unsigned 64-bit, so parse it with the unsigned 64-bit converter */
+    uint64 N = strtoull(argv[1],NULL,10);
+    int num_threads = atoi(argv[2]);
+
+    /* halve whichever of N and N+1 is even; (N/2)*(N+1) truncates for odd N
+       (N = 5 would give 12 instead of 15) */
+    uint64 expected = (N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2);
+
+    printf ("num_threads = %d\n",num_threads);
+    printf ("N*(N+1)/2 = %llu\n",expected);
+
+    sumKernel <<< 1, num_threads >>> (N);
+
+    /* a kernel launch returns no status; an invalid configuration (for example
+       num_threads of 0) is only reported through cudaGetLastError() */
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        fprintf (stderr,"kernel launch failed : %s\n",cudaGetErrorString(err));
+        return 1;
+    }
+
+    /* wait for the kernel to finish; errors raised while it runs surface here */
+    err = cudaDeviceSynchronize();
+    if (err != cudaSuccess) {
+        fprintf (stderr,"kernel execution failed : %s\n",cudaGetErrorString(err));
+        return 1;
+    }
+
+    return 0;
+}
--- a/L23/gpu_sum.sh
+++ b/L23/gpu_sum.sh
+#!/bin/bash
+#SBATCH -A cmda3634_rjh
+#SBATCH -p p100_normal_q
+#SBATCH -t 5
+#SBATCH --gres=gpu:1
+#SBATCH -o gpu_sum.out
+
+# Go to the directory where the job was submitted
+cd $SLURM_SUBMIT_DIR
+
+# Load CUDA toolkit module
+module load cuda11.6/toolkit/11.6.2
+
+# compile
+nvcc -w -arch=sm_60 -o gpu_sum gpu_sum.cu
+
+# run sum
+./gpu_sum $1 $2
+
--- a/L23/key/gpu_hello.cu
+++ b/L23/key/gpu_hello.cu
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+
+/*
+ * helloKernel : each thread prints a greeting that includes its index
+ * within the block and the number of threads in the block.
+ */
+__global__ void helloKernel() {
+    const int tid = threadIdx.x;
+    const int block_size = blockDim.x;
+    printf (" Hello World! from thread %d of %d\n",tid,block_size);
+}
+
+/*
+ * Host driver: reads num_threads from the command line, launches helloKernel
+ * with a single block of num_threads threads, and waits for it to finish.
+ * Returns 1 on a usage error or a CUDA launch/execution error, 0 otherwise.
+ */
+int main(int argc, char **argv) {
+
+    /* get num_threads from the command line */
+    if (argc < 2) {
+        printf ("Command usage : %s %s\n",argv[0],"num_threads");
+        return 1;
+    }
+
+    int num_threads = atoi(argv[1]);
+
+    printf ("num_threads = %d\n",num_threads);
+
+    helloKernel <<< 1, num_threads >>> ();
+
+    /* a kernel launch returns no status; an invalid configuration (for example
+       num_threads of 0) is only reported through cudaGetLastError() */
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        fprintf (stderr,"kernel launch failed : %s\n",cudaGetErrorString(err));
+        return 1;
+    }
+
+    /* wait for the kernel to finish; errors raised while it runs surface here */
+    err = cudaDeviceSynchronize();
+    if (err != cudaSuccess) {
+        fprintf (stderr,"kernel execution failed : %s\n",cudaGetErrorString(err));
+        return 1;
+    }
+
+    return 0;
+}
--- a/L23/key/gpu_sum_v1.cu
+++ b/L23/key/gpu_sum_v1.cu
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+
+typedef unsigned long long int uint64;
+
+/*
+ * sumKernel (v1) : every thread adds up the integers 1..N on its own and
+ * prints its result together with its thread index and the block size.
+ */
+__global__ void sumKernel(uint64 N) {
+
+    /* this thread's index and the number of threads in the block */
+    const int tid = threadIdx.x;
+    const int block_size = blockDim.x;
+
+    /* walk the whole range 1..N; the work is not divided between threads */
+    uint64 total = 0;
+    uint64 k = 1;
+    while (k <= N) {
+        total += k;
+        k++;
+    }
+
+    printf (" on thread %d of %d, sum = %llu\n",tid,block_size,total);
+}
+
+/*
+ * Host driver: reads N and num_threads from the command line, prints the
+ * closed-form value of 1 + 2 + ... + N for comparison, then launches
+ * sumKernel with a single block of num_threads threads and waits for it.
+ * Returns 1 on a usage error or a CUDA launch/execution error, 0 otherwise.
+ */
+int main(int argc, char **argv) {
+
+    /* get N and num_threads from the command line */
+    if (argc < 3) {
+        printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
+        return 1;
+    }
+
+    /* N is unsigned 64-bit, so parse it with the unsigned 64-bit converter */
+    uint64 N = strtoull(argv[1],NULL,10);
+    int num_threads = atoi(argv[2]);
+
+    /* halve whichever of N and N+1 is even; (N/2)*(N+1) truncates for odd N
+       (N = 5 would give 12 instead of 15) */
+    uint64 expected = (N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2);
+
+    printf ("num_threads = %d\n",num_threads);
+    printf ("N*(N+1)/2 = %llu\n",expected);
+
+    sumKernel <<< 1, num_threads >>> (N);
+
+    /* a kernel launch returns no status; an invalid configuration (for example
+       num_threads of 0) is only reported through cudaGetLastError() */
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        fprintf (stderr,"kernel launch failed : %s\n",cudaGetErrorString(err));
+        return 1;
+    }
+
+    /* wait for the kernel to finish; errors raised while it runs surface here */
+    err = cudaDeviceSynchronize();
+    if (err != cudaSuccess) {
+        fprintf (stderr,"kernel execution failed : %s\n",cudaGetErrorString(err));
+        return 1;
+    }
+
+    return 0;
+}
--- a/L23/key/gpu_sum_v2.cu
+++ b/L23/key/gpu_sum_v2.cu
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+
+typedef unsigned long long int uint64;
+
+/*
+ * sumKernel (v2) : the integers 1..N are divided between the threads of the
+ * block. Thread t adds t+1, t+1+blockDim.x, t+1+2*blockDim.x, ... and prints
+ * its own partial result; the partial results are not combined.
+ */
+__global__ void sumKernel(uint64 N) {
+
+    /* this thread's index and the number of threads in the block */
+    const int tid = threadIdx.x;
+    const int block_size = blockDim.x;
+
+    /* start at this thread's first value and step by the block size */
+    uint64 partial = 0;
+    uint64 k = 1+tid;
+    while (k <= N) {
+        partial += k;
+        k += block_size;
+    }
+
+    printf (" on thread %d of %d, sum = %llu\n",tid,block_size,partial);
+}
+
+/*
+ * Host driver: reads N and num_threads from the command line, prints the
+ * closed-form value of 1 + 2 + ... + N for comparison, then launches
+ * sumKernel with a single block of num_threads threads and waits for it.
+ * Returns 1 on a usage error or a CUDA launch/execution error, 0 otherwise.
+ */
+int main(int argc, char **argv) {
+
+    /* get N and num_threads from the command line */
+    if (argc < 3) {
+        printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
+        return 1;
+    }
+
+    /* N is unsigned 64-bit, so parse it with the unsigned 64-bit converter */
+    uint64 N = strtoull(argv[1],NULL,10);
+    int num_threads = atoi(argv[2]);
+
+    /* halve whichever of N and N+1 is even; (N/2)*(N+1) truncates for odd N
+       (N = 5 would give 12 instead of 15) */
+    uint64 expected = (N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2);
+
+    printf ("num_threads = %d\n",num_threads);
+    printf ("N*(N+1)/2 = %llu\n",expected);
+
+    sumKernel <<< 1, num_threads >>> (N);
+
+    /* a kernel launch returns no status; an invalid configuration (for example
+       num_threads of 0) is only reported through cudaGetLastError() */
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        fprintf (stderr,"kernel launch failed : %s\n",cudaGetErrorString(err));
+        return 1;
+    }
+
+    /* wait for the kernel to finish; errors raised while it runs surface here */
+    err = cudaDeviceSynchronize();
+    if (err != cudaSuccess) {
+        fprintf (stderr,"kernel execution failed : %s\n",cudaGetErrorString(err));
+        return 1;
+    }
+
+    return 0;
+}
--- a/L23/key/gpu_sum_v3.cu
+++ b/L23/key/gpu_sum_v3.cu
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+
+typedef unsigned long long int uint64;
+
+__global__ void sumKernel(uint64 N) {
+    /* Adds the integers 1..N into a single block-shared total, with each thread
+       handling every num_threads-th value; thread 0 prints the total.
+       NOTE(review): the hazards flagged below look like a deliberate teaching
+       step in the v1..v6 series - confirm before "fixing" this file. */
+    __shared__ uint64 sum;
+
+    int thread_num = threadIdx.x;
+    int num_threads = blockDim.x;
+
+    /* thread 0 initializes sum to 0
+       NOTE(review): no __syncthreads() follows, so nothing orders this store
+       before the other threads' updates in the loop below */
+    if (thread_num == 0) {
+        sum = 0;
+    }
+
+    /* calculate the sum
+       NOTE(review): every thread does a plain (non-atomic) read-modify-write on
+       the same shared variable, so concurrent updates can be lost */
+    for (uint64 i = 1+thread_num; i <= N;i+=num_threads) {
+        sum += i;
+    }
+
+    /* thread 0 prints the sum
+       NOTE(review): no barrier precedes this read, so thread 0 may print before
+       the other threads have finished adding their values */
+    if (thread_num == 0) {
+        printf (" sum = %llu\n",sum);
+    }
+}
+
+/*
+ * Host driver: reads N and num_threads from the command line, prints the
+ * closed-form value of 1 + 2 + ... + N for comparison, then launches
+ * sumKernel with a single block of num_threads threads and waits for it.
+ * Returns 1 on a usage error or a CUDA launch/execution error, 0 otherwise.
+ */
+int main(int argc, char **argv) {
+
+    /* get N and num_threads from the command line */
+    if (argc < 3) {
+        printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
+        return 1;
+    }
+
+    /* N is unsigned 64-bit, so parse it with the unsigned 64-bit converter */
+    uint64 N = strtoull(argv[1],NULL,10);
+    int num_threads = atoi(argv[2]);
+
+    /* halve whichever of N and N+1 is even; (N/2)*(N+1) truncates for odd N
+       (N = 5 would give 12 instead of 15) */
+    uint64 expected = (N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2);
+
+    printf ("num_threads = %d\n",num_threads);
+    printf ("N*(N+1)/2 = %llu\n",expected);
+
+    sumKernel <<< 1, num_threads >>> (N);
+
+    /* a kernel launch returns no status; an invalid configuration (for example
+       num_threads of 0) is only reported through cudaGetLastError() */
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        fprintf (stderr,"kernel launch failed : %s\n",cudaGetErrorString(err));
+        return 1;
+    }
+
+    /* wait for the kernel to finish; errors raised while it runs surface here */
+    err = cudaDeviceSynchronize();
+    if (err != cudaSuccess) {
+        fprintf (stderr,"kernel execution failed : %s\n",cudaGetErrorString(err));
+        return 1;
+    }
+
+    return 0;
+}
--- a/L23/key/gpu_sum_v4.cu
+++ b/L23/key/gpu_sum_v4.cu
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+
+typedef unsigned long long int uint64;
+
+__global__ void sumKernel(uint64 N) {
+    /* Adds the integers 1..N into a single block-shared total using one
+       atomicAdd per value, with each thread handling every num_threads-th
+       value; thread 0 prints the total. */
+    __shared__ uint64 sum;
+
+    int thread_num = threadIdx.x;
+    int num_threads = blockDim.x;
+
+    /* initialize sum to 0 (done by thread 0 only)
+       NOTE(review): no __syncthreads() follows, so nothing orders this store
+       before the other threads' atomicAdd calls below */
+    if (thread_num == 0) {
+        sum = 0;
+    }
+
+    /* calculate the sum; atomicAdd makes each update of the shared total an
+       indivisible read-modify-write, and it is issued once per loop iteration */
+    for (uint64 i = 1+thread_num; i <= N;i+=num_threads) {
+        atomicAdd(&sum,i);
+    }
+
+    /* thread 0 prints the sum
+       NOTE(review): no barrier precedes this read, so thread 0 may print before
+       the other threads have finished adding their values */
+    if (thread_num == 0) {
+        printf (" sum = %llu\n",sum);
+    }
+}
+
+/*
+ * Host driver: reads N and num_threads from the command line, prints the
+ * closed-form value of 1 + 2 + ... + N for comparison, then launches
+ * sumKernel with a single block of num_threads threads and waits for it.
+ * Returns 1 on a usage error or a CUDA launch/execution error, 0 otherwise.
+ */
+int main(int argc, char **argv) {
+
+    /* get N and num_threads from the command line */
+    if (argc < 3) {
+        printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
+        return 1;
+    }
+
+    /* N is unsigned 64-bit, so parse it with the unsigned 64-bit converter */
+    uint64 N = strtoull(argv[1],NULL,10);
+    int num_threads = atoi(argv[2]);
+
+    /* halve whichever of N and N+1 is even; (N/2)*(N+1) truncates for odd N
+       (N = 5 would give 12 instead of 15) */
+    uint64 expected = (N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2);
+
+    printf ("num_threads = %d\n",num_threads);
+    printf ("N*(N+1)/2 = %llu\n",expected);
+
+    sumKernel <<< 1, num_threads >>> (N);
+
+    /* a kernel launch returns no status; an invalid configuration (for example
+       num_threads of 0) is only reported through cudaGetLastError() */
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        fprintf (stderr,"kernel launch failed : %s\n",cudaGetErrorString(err));
+        return 1;
+    }
+
+    /* wait for the kernel to finish; errors raised while it runs surface here */
+    err = cudaDeviceSynchronize();
+    if (err != cudaSuccess) {
+        fprintf (stderr,"kernel execution failed : %s\n",cudaGetErrorString(err));
+        return 1;
+    }
+
+    return 0;
+}
--- a/L23/key/gpu_sum_v5.cu
+++ b/L23/key/gpu_sum_v5.cu
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+
+typedef unsigned long long int uint64;
+
+__global__ void sumKernel(uint64 N) {
+    /* Adds the integers 1..N into a single block-shared total: each thread sums
+       every num_threads-th value locally and contributes it with one
+       atomicAdd; thread 0 prints the total. */
+    __shared__ uint64 sum;
+
+    int thread_num = threadIdx.x;
+    int num_threads = blockDim.x;
+
+    /* initialize sum to 0 (done by thread 0 only)
+       NOTE(review): no __syncthreads() follows, so nothing orders this store
+       before the other threads' atomicAdd calls below */
+    if (thread_num == 0) {
+        sum = 0;
+    }
+
+    /* calculate the sum: each thread accumulates its share in the thread-local
+       thread_sum and then issues a single atomicAdd on the shared total */
+    uint64 thread_sum = 0;
+    for (uint64 i = 1+thread_num; i <= N;i+=num_threads) {
+        thread_sum += i;
+    }
+    atomicAdd(&sum,thread_sum);
+
+    /* thread 0 prints the sum
+       NOTE(review): no barrier precedes this read, so thread 0 may print before
+       the other threads have added their partial sums */
+    if (thread_num == 0) {
+        printf (" sum = %llu\n",sum);
+    }
+}
+
+/*
+ * Host driver: reads N and num_threads from the command line, prints the
+ * closed-form value of 1 + 2 + ... + N for comparison, then launches
+ * sumKernel with a single block of num_threads threads and waits for it.
+ * Returns 1 on a usage error or a CUDA launch/execution error, 0 otherwise.
+ */
+int main(int argc, char **argv) {
+
+    /* get N and num_threads from the command line */
+    if (argc < 3) {
+        printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
+        return 1;
+    }
+
+    /* N is unsigned 64-bit, so parse it with the unsigned 64-bit converter */
+    uint64 N = strtoull(argv[1],NULL,10);
+    int num_threads = atoi(argv[2]);
+
+    /* halve whichever of N and N+1 is even; (N/2)*(N+1) truncates for odd N
+       (N = 5 would give 12 instead of 15) */
+    uint64 expected = (N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2);
+
+    printf ("num_threads = %d\n",num_threads);
+    printf ("N*(N+1)/2 = %llu\n",expected);
+
+    sumKernel <<< 1, num_threads >>> (N);
+
+    /* a kernel launch returns no status; an invalid configuration (for example
+       num_threads of 0) is only reported through cudaGetLastError() */
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        fprintf (stderr,"kernel launch failed : %s\n",cudaGetErrorString(err));
+        return 1;
+    }
+
+    /* wait for the kernel to finish; errors raised while it runs surface here */
+    err = cudaDeviceSynchronize();
+    if (err != cudaSuccess) {
+        fprintf (stderr,"kernel execution failed : %s\n",cudaGetErrorString(err));
+        return 1;
+    }
+
+    return 0;
+}
--- a/L23/key/gpu_sum_v6.cu
+++ b/L23/key/gpu_sum_v6.cu
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+
+typedef unsigned long long int uint64;
+
+/*
+ * sumKernel (v6) : block-wide sum of the integers 1..N.
+ * Each thread adds every blockDim.x-th value into a private partial sum,
+ * contributes it to a shared total with one atomicAdd, and thread 0 prints
+ * the total. Barriers separate the zeroing, the accumulation and the print.
+ */
+__global__ void sumKernel(uint64 N) {
+
+    __shared__ uint64 block_total;
+
+    const int tid = threadIdx.x;
+    const int block_size = blockDim.x;
+    const bool is_first = (tid == 0);
+
+    /* thread 0 zeroes the shared total; the barrier keeps every thread from
+       adding to it before that has happened */
+    if (is_first) {
+        block_total = 0;
+    }
+    __syncthreads();
+
+    /* private partial sum over this thread's share of 1..N */
+    uint64 partial = 0;
+    uint64 k = 1+tid;
+    while (k <= N) {
+        partial += k;
+        k += block_size;
+    }
+
+    /* one atomic update per thread, then wait until every thread has added */
+    atomicAdd(&block_total,partial);
+    __syncthreads();
+
+    /* thread 0 reports the finished total */
+    if (is_first) {
+        printf (" sum = %llu\n",block_total);
+    }
+}
+
+/*
+ * Host driver: reads N and num_threads from the command line, prints the
+ * closed-form value of 1 + 2 + ... + N for comparison, then launches
+ * sumKernel with a single block of num_threads threads and waits for it.
+ * Returns 1 on a usage error or a CUDA launch/execution error, 0 otherwise.
+ */
+int main(int argc, char **argv) {
+
+    /* get N and num_threads from the command line */
+    if (argc < 3) {
+        printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
+        return 1;
+    }
+
+    /* N is unsigned 64-bit, so parse it with the unsigned 64-bit converter */
+    uint64 N = strtoull(argv[1],NULL,10);
+    int num_threads = atoi(argv[2]);
+
+    /* halve whichever of N and N+1 is even; (N/2)*(N+1) truncates for odd N
+       (N = 5 would give 12 instead of 15) */
+    uint64 expected = (N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2);
+
+    printf ("num_threads = %d\n",num_threads);
+    printf ("N*(N+1)/2 = %llu\n",expected);
+
+    sumKernel <<< 1, num_threads >>> (N);
+
+    /* a kernel launch returns no status; an invalid configuration (for example
+       num_threads of 0) is only reported through cudaGetLastError() */
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        fprintf (stderr,"kernel launch failed : %s\n",cudaGetErrorString(err));
+        return 1;
+    }
+
+    /* wait for the kernel to finish; errors raised while it runs surface here */
+    err = cudaDeviceSynchronize();
+    if (err != cudaSuccess) {
+        fprintf (stderr,"kernel execution failed : %s\n",cudaGetErrorString(err));
+        return 1;
+    }
+
+    return 0;
+}