diff --git a/L23/gpu_hello.cu b/L23/gpu_hello.cu
new file mode 100644
index 0000000000000000000000000000000000000000..73d3f1b7deb75c889ffd1733922f412d754933c9
--- /dev/null
+++ b/L23/gpu_hello.cu
@@ -0,0 +1,27 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+
+/* Kernel that main launches as a single block of num_threads threads; the body is a placeholder to be filled in. */
+__global__ void helloKernel() {
+    /*****************/
+    /* Add Code Here */
+    /*****************/
+
+}
+
+/* Host driver: launch helloKernel on one block of num_threads threads and report any CUDA error. */
+int main(int argc, char **argv) {
+    /* get num_threads from the command line */
+    if (argc < 2) {
+        printf ("Command usage : %s %s\n",argv[0],"num_threads");
+        return 1;
+    }
+    int num_threads = atoi(argv[1]);
+    printf ("num_threads = %d\n",num_threads);
+    helloKernel <<< 1, num_threads >>> ();
+    cudaError_t err = cudaGetLastError(); /* launch errors (e.g. an invalid block size) surface here */
+    if (err == cudaSuccess) err = cudaDeviceSynchronize(); /* wait for the kernel to finish */
+    if (err != cudaSuccess) fprintf (stderr,"CUDA error : %s\n",cudaGetErrorString(err));
+    return (err == cudaSuccess) ? 0 : 1;
+}
diff --git a/L23/gpu_hello.sh b/L23/gpu_hello.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b5ec0fe8de2adcb655631eadacd90b07e3f6b5ed
--- /dev/null
+++ b/L23/gpu_hello.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+#SBATCH -A cmda3634_rjh
+#SBATCH -p p100_normal_q
+#SBATCH -t 5
+#SBATCH --gres=gpu:1
+#SBATCH -o gpu_hello.out
+
+# Go to the directory where the job was submitted
+cd $SLURM_SUBMIT_DIR
+
+# Load CUDA toolkit module
+module load cuda11.6/toolkit/11.6.2
+
+# compile
+nvcc -w -arch=sm_60 -o gpu_hello gpu_hello.cu
+
+# run hello
+./gpu_hello $1
+
diff --git a/L23/gpu_sum.cu b/L23/gpu_sum.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4914ba7ac58bdd5b5ec98adbd49a1798305e52ee
--- /dev/null
+++ b/L23/gpu_sum.cu
@@ -0,0 +1,37 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+
+typedef unsigned long long int uint64;
+
+/* Every thread of the block independently adds up 1..N and prints its own total. */
+__global__ void sumKernel(uint64 N) {
+    const int tid = threadIdx.x;        /* this thread's index within the block */
+    const int block_size = blockDim.x;  /* number of threads in the block */
+
+    uint64 total = 0;
+    uint64 term = 1;
+    while (term <= N) {
+        total += term++;
+    }
+    printf (" on thread %d of %d, sum = %llu\n",tid,block_size,total);
+}
+
+/* Host driver: print the closed-form value of 1+...+N, then launch sumKernel on one block of num_threads threads. */
+int main(int argc, char **argv) {
+    /* get N and num_threads from the command line */
+    if (argc < 3) {
+        printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
+        return 1;
+    }
+    uint64 N = strtoull(argv[1],NULL,10);
+    int num_threads = atoi(argv[2]);
+    printf ("num_threads = %d\n",num_threads);
+    /* halve whichever factor is even so integer division does not truncate for odd N */
+    printf ("N*(N+1)/2 = %llu\n",(N%2 == 0) ? (N/2)*(N+1) : N*((N+1)/2));
+    sumKernel <<< 1, num_threads >>> (N);
+    cudaError_t err = cudaGetLastError(); /* launch errors (e.g. an invalid block size) surface here */
+    if (err == cudaSuccess) err = cudaDeviceSynchronize(); /* wait for the kernel to finish */
+    if (err != cudaSuccess) fprintf (stderr,"CUDA error : %s\n",cudaGetErrorString(err));
+    return (err == cudaSuccess) ? 0 : 1;
+}
diff --git a/L23/gpu_sum.sh b/L23/gpu_sum.sh
new file mode 100644
index 0000000000000000000000000000000000000000..90e40346d0ee4edc3de12acaded01040b26fa769
--- /dev/null
+++ b/L23/gpu_sum.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+#SBATCH -A cmda3634_rjh
+#SBATCH -p p100_normal_q
+#SBATCH -t 5
+#SBATCH --gres=gpu:1
+#SBATCH -o gpu_sum.out
+
+# Go to the directory where the job was submitted
+cd $SLURM_SUBMIT_DIR
+
+# Load CUDA toolkit module
+module load cuda11.6/toolkit/11.6.2
+
+# compile
+nvcc -w -arch=sm_60 -o gpu_sum gpu_sum.cu
+
+# run sum
+./gpu_sum $1 $2
+
diff --git a/L23/key/gpu_hello.cu b/L23/key/gpu_hello.cu
new file mode 100644
index 0000000000000000000000000000000000000000..92258d87d6142a89ff87dfa536a2ee9e5c587f06
--- /dev/null
+++ b/L23/key/gpu_hello.cu
@@ -0,0 +1,27 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+
+/* Each thread prints a greeting containing its index within the block and the block size. */
+__global__ void helloKernel() {
+    /* threadIdx.x and blockDim.x are converted to int to match the %d conversions */
+    printf (" Hello World! from thread %d of %d\n",
+            (int)threadIdx.x,
+            (int)blockDim.x);
+}
+
+/* Host driver: launch helloKernel on one block of num_threads threads and report any CUDA error. */
+int main(int argc, char **argv) {
+    /* get num_threads from the command line */
+    if (argc < 2) {
+        printf ("Command usage : %s %s\n",argv[0],"num_threads");
+        return 1;
+    }
+    int num_threads = atoi(argv[1]);
+    printf ("num_threads = %d\n",num_threads);
+    helloKernel <<< 1, num_threads >>> ();
+    cudaError_t err = cudaGetLastError(); /* launch errors (e.g. an invalid block size) surface here */
+    if (err == cudaSuccess) err = cudaDeviceSynchronize(); /* wait for the kernel to finish */
+    if (err != cudaSuccess) fprintf (stderr,"CUDA error : %s\n",cudaGetErrorString(err));
+    return (err == cudaSuccess) ? 0 : 1;
+}
diff --git a/L23/key/gpu_sum_v1.cu b/L23/key/gpu_sum_v1.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4914ba7ac58bdd5b5ec98adbd49a1798305e52ee
--- /dev/null
+++ b/L23/key/gpu_sum_v1.cu
@@ -0,0 +1,37 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+
+typedef unsigned long long int uint64;
+
+/* Every thread of the block independently adds up 1..N and prints its own total. */
+__global__ void sumKernel(uint64 N) {
+    const int tid = threadIdx.x;        /* this thread's index within the block */
+    const int block_size = blockDim.x;  /* number of threads in the block */
+
+    uint64 total = 0;
+    uint64 term = 1;
+    while (term <= N) {
+        total += term++;
+    }
+    printf (" on thread %d of %d, sum = %llu\n",tid,block_size,total);
+}
+
+/* Host driver: print the closed-form value of 1+...+N, then launch sumKernel on one block of num_threads threads. */
+int main(int argc, char **argv) {
+    /* get N and num_threads from the command line */
+    if (argc < 3) {
+        printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
+        return 1;
+    }
+    uint64 N = strtoull(argv[1],NULL,10);
+    int num_threads = atoi(argv[2]);
+    printf ("num_threads = %d\n",num_threads);
+    /* halve whichever factor is even so integer division does not truncate for odd N */
+    printf ("N*(N+1)/2 = %llu\n",(N%2 == 0) ? (N/2)*(N+1) : N*((N+1)/2));
+    sumKernel <<< 1, num_threads >>> (N);
+    cudaError_t err = cudaGetLastError(); /* launch errors (e.g. an invalid block size) surface here */
+    if (err == cudaSuccess) err = cudaDeviceSynchronize(); /* wait for the kernel to finish */
+    if (err != cudaSuccess) fprintf (stderr,"CUDA error : %s\n",cudaGetErrorString(err));
+    return (err == cudaSuccess) ? 0 : 1;
+}
diff --git a/L23/key/gpu_sum_v2.cu b/L23/key/gpu_sum_v2.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8a4e890b4fd73c5b0ad754a54d8f01508958ffdd
--- /dev/null
+++ b/L23/key/gpu_sum_v2.cu
@@ -0,0 +1,37 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+
+typedef unsigned long long int uint64;
+
+/* Thread t adds the terms 1+t, 1+t+T, 1+t+2T, ... up to N (T = threads in the block) and prints its partial sum. */
+__global__ void sumKernel(uint64 N) {
+    const int tid = threadIdx.x;        /* this thread's index within the block */
+    const int block_size = blockDim.x;  /* number of threads in the block */
+    uint64 partial = 0;
+    uint64 term = 1+tid;
+    while (term <= N) {
+        partial += term;
+        term += block_size;
+    }
+    printf (" on thread %d of %d, sum = %llu\n",tid,block_size,partial);
+}
+
+/* Host driver: print the closed-form value of 1+...+N, then launch sumKernel on one block of num_threads threads. */
+int main(int argc, char **argv) {
+    /* get N and num_threads from the command line */
+    if (argc < 3) {
+        printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
+        return 1;
+    }
+    uint64 N = strtoull(argv[1],NULL,10);
+    int num_threads = atoi(argv[2]);
+    printf ("num_threads = %d\n",num_threads);
+    /* halve whichever factor is even so integer division does not truncate for odd N */
+    printf ("N*(N+1)/2 = %llu\n",(N%2 == 0) ? (N/2)*(N+1) : N*((N+1)/2));
+    sumKernel <<< 1, num_threads >>> (N);
+    cudaError_t err = cudaGetLastError(); /* launch errors (e.g. an invalid block size) surface here */
+    if (err == cudaSuccess) err = cudaDeviceSynchronize(); /* wait for the kernel to finish */
+    if (err != cudaSuccess) fprintf (stderr,"CUDA error : %s\n",cudaGetErrorString(err));
+    return (err == cudaSuccess) ? 0 : 1;
+}
diff --git a/L23/key/gpu_sum_v3.cu b/L23/key/gpu_sum_v3.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a427c7daff67e147367a4edcf20425eaaf1b731b
--- /dev/null
+++ b/L23/key/gpu_sum_v3.cu
@@ -0,0 +1,47 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+
+typedef unsigned long long int uint64;
+
+/* Adds 1..N into a single __shared__ accumulator using all threads of the block; thread 0 prints it. */
+__global__ void sumKernel(uint64 N) {
+    /* one accumulator shared by every thread in the block */
+    __shared__ uint64 sum;
+
+    int thread_num = threadIdx.x;
+    int num_threads = blockDim.x;
+
+    /* thread 0 initializes sum to 0; NOTE(review): no barrier follows, so other threads may already be adding */
+    if (thread_num == 0) {
+        sum = 0;
+    }
+    /* calculate the sum: thread t handles 1+t, 1+t+num_threads, ... */
+    /* NOTE(review): "sum += i" is a plain read-modify-write on shared data, so concurrent updates can be lost */
+    for (uint64 i = 1+thread_num; i <= N;i+=num_threads) {
+        sum += i;
+    }
+    /* thread 0 prints the sum; NOTE(review): nothing here waits for the other threads to finish first */
+    if (thread_num == 0) {
+        printf (" sum = %llu\n",sum);
+    }
+}
+
+/* Host driver: print the closed-form value of 1+...+N, then launch sumKernel on one block of num_threads threads. */
+int main(int argc, char **argv) {
+    /* get N and num_threads from the command line */
+    if (argc < 3) {
+        printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
+        return 1;
+    }
+    uint64 N = strtoull(argv[1],NULL,10);
+    int num_threads = atoi(argv[2]);
+    printf ("num_threads = %d\n",num_threads);
+    /* halve whichever factor is even so integer division does not truncate for odd N */
+    printf ("N*(N+1)/2 = %llu\n",(N%2 == 0) ? (N/2)*(N+1) : N*((N+1)/2));
+    sumKernel <<< 1, num_threads >>> (N);
+    cudaError_t err = cudaGetLastError(); /* launch errors (e.g. an invalid block size) surface here */
+    if (err == cudaSuccess) err = cudaDeviceSynchronize(); /* wait for the kernel to finish */
+    if (err != cudaSuccess) fprintf (stderr,"CUDA error : %s\n",cudaGetErrorString(err));
+    return (err == cudaSuccess) ? 0 : 1;
+}
diff --git a/L23/key/gpu_sum_v4.cu b/L23/key/gpu_sum_v4.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9c51068a9c18ee4ad14db76340f4b8ae32935c9d
--- /dev/null
+++ b/L23/key/gpu_sum_v4.cu
@@ -0,0 +1,47 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+
+typedef unsigned long long int uint64;
+
+/* Adds 1..N into a single __shared__ accumulator with one atomicAdd per term; thread 0 prints the result. */
+__global__ void sumKernel(uint64 N) {
+    /* one accumulator shared by every thread in the block */
+    __shared__ uint64 sum;
+
+    int thread_num = threadIdx.x;
+    int num_threads = blockDim.x;
+
+    /* initialize sum to 0 (thread 0 only); NOTE(review): no barrier follows, so other threads may already be adding */
+    if (thread_num == 0) {
+        sum = 0;
+    }
+    /* calculate the sum: thread t atomically adds 1+t, 1+t+num_threads, ... to the shared total */
+    for (uint64 i = 1+thread_num; i <= N;i+=num_threads) {
+        atomicAdd(&sum,i);
+    }
+
+    /* thread 0 prints the sum; NOTE(review): nothing here waits for the other threads to finish first */
+    if (thread_num == 0) {
+        printf (" sum = %llu\n",sum);
+    }
+}
+
+/* Host driver: print the closed-form value of 1+...+N, then launch sumKernel on one block of num_threads threads. */
+int main(int argc, char **argv) {
+    /* get N and num_threads from the command line */
+    if (argc < 3) {
+        printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
+        return 1;
+    }
+    uint64 N = strtoull(argv[1],NULL,10);
+    int num_threads = atoi(argv[2]);
+    printf ("num_threads = %d\n",num_threads);
+    /* halve whichever factor is even so integer division does not truncate for odd N */
+    printf ("N*(N+1)/2 = %llu\n",(N%2 == 0) ? (N/2)*(N+1) : N*((N+1)/2));
+    sumKernel <<< 1, num_threads >>> (N);
+    cudaError_t err = cudaGetLastError(); /* launch errors (e.g. an invalid block size) surface here */
+    if (err == cudaSuccess) err = cudaDeviceSynchronize(); /* wait for the kernel to finish */
+    if (err != cudaSuccess) fprintf (stderr,"CUDA error : %s\n",cudaGetErrorString(err));
+    return (err == cudaSuccess) ? 0 : 1;
+}
diff --git a/L23/key/gpu_sum_v5.cu b/L23/key/gpu_sum_v5.cu
new file mode 100644
index 0000000000000000000000000000000000000000..643a59f52a72d3499c428b39bb39522494122584
--- /dev/null
+++ b/L23/key/gpu_sum_v5.cu
@@ -0,0 +1,49 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+
+typedef unsigned long long int uint64;
+
+/* Each thread sums its share of 1..N locally, then adds it once to a __shared__ total; thread 0 prints the total. */
+__global__ void sumKernel(uint64 N) {
+    /* one accumulator shared by every thread in the block */
+    __shared__ uint64 sum;
+
+    int thread_num = threadIdx.x;
+    int num_threads = blockDim.x;
+
+    /* initialize sum to 0 (thread 0 only); NOTE(review): no barrier follows, so other threads may already be adding */
+    if (thread_num == 0) {
+        sum = 0;
+    }
+
+    /* calculate the sum: thread t accumulates 1+t, 1+t+num_threads, ... in thread_sum */
+    uint64 thread_sum = 0;
+    for (uint64 i = 1+thread_num; i <= N;i+=num_threads) {
+        thread_sum += i;
+    }
+    atomicAdd(&sum,thread_sum); /* a single atomic update per thread */
+    /* thread 0 prints the sum; NOTE(review): nothing here waits for the other threads to finish first */
+    if (thread_num == 0) {
+        printf (" sum = %llu\n",sum);
+    }
+}
+
+/* Host driver: print the closed-form value of 1+...+N, then launch sumKernel on one block of num_threads threads. */
+int main(int argc, char **argv) {
+    /* get N and num_threads from the command line */
+    if (argc < 3) {
+        printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
+        return 1;
+    }
+    uint64 N = strtoull(argv[1],NULL,10);
+    int num_threads = atoi(argv[2]);
+    printf ("num_threads = %d\n",num_threads);
+    /* halve whichever factor is even so integer division does not truncate for odd N */
+    printf ("N*(N+1)/2 = %llu\n",(N%2 == 0) ? (N/2)*(N+1) : N*((N+1)/2));
+    sumKernel <<< 1, num_threads >>> (N);
+    cudaError_t err = cudaGetLastError(); /* launch errors (e.g. an invalid block size) surface here */
+    if (err == cudaSuccess) err = cudaDeviceSynchronize(); /* wait for the kernel to finish */
+    if (err != cudaSuccess) fprintf (stderr,"CUDA error : %s\n",cudaGetErrorString(err));
+    return (err == cudaSuccess) ? 0 : 1;
+}
diff --git a/L23/key/gpu_sum_v6.cu b/L23/key/gpu_sum_v6.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0199b224c1b75d7efa4deb78a2e3e4a8ba0f2718
--- /dev/null
+++ b/L23/key/gpu_sum_v6.cu
@@ -0,0 +1,51 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <cuda.h>
+
+typedef unsigned long long int uint64;
+
+/* Block-wide sum of 1..N: per-thread partial sums are combined into one shared total, which thread 0 prints. */
+__global__ void sumKernel(uint64 N) {
+    __shared__ uint64 block_total;
+    const int tid = threadIdx.x;
+    const int stride = blockDim.x;
+    const bool is_leader = (tid == 0);
+
+    /* the leader zeroes the shared total; the barrier keeps the others from adding before that */
+    if (is_leader) {
+        block_total = 0;
+    }
+    __syncthreads();
+
+    /* accumulate this thread's share (1+tid, 1+tid+stride, ...) in a local variable */
+    uint64 partial = 0;
+    uint64 term = 1+tid;
+    while (term <= N) {
+        partial += term;
+        term += stride;
+    }
+
+    /* one atomic add per thread, then wait until every thread has contributed */
+    atomicAdd(&block_total,partial);
+    __syncthreads();
+    if (is_leader) printf (" sum = %llu\n",block_total);
+}
+
+/* Host driver: print the closed-form value of 1+...+N, then launch sumKernel on one block of num_threads threads. */
+int main(int argc, char **argv) {
+    /* get N and num_threads from the command line */
+    if (argc < 3) {
+        printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
+        return 1;
+    }
+    uint64 N = strtoull(argv[1],NULL,10);
+    int num_threads = atoi(argv[2]);
+    printf ("num_threads = %d\n",num_threads);
+    /* halve whichever factor is even so integer division does not truncate for odd N */
+    printf ("N*(N+1)/2 = %llu\n",(N%2 == 0) ? (N/2)*(N+1) : N*((N+1)/2));
+    sumKernel <<< 1, num_threads >>> (N);
+    cudaError_t err = cudaGetLastError(); /* launch errors (e.g. an invalid block size) surface here */
+    if (err == cudaSuccess) err = cudaDeviceSynchronize(); /* wait for the kernel to finish */
+    if (err != cudaSuccess) fprintf (stderr,"CUDA error : %s\n",cudaGetErrorString(err));
+    return (err == cudaSuccess) ? 0 : 1;
+}