diff --git a/L23/lecture23_hello_sum.ipynb b/L23/lecture23_hello_sum.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..7de813ae4b904bf294ec4972bc9206d21e35a27c
--- /dev/null
+++ b/L23/lecture23_hello_sum.ipynb
@@ -0,0 +1,768 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+  "colab": {
+   "provenance": [],
+   "gpuType": "T4"
+  },
+  "kernelspec": {
+   "name": "python3",
+   "display_name": "Python 3"
+  },
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "source": [
+    "# Lecture 23 : GPU Hello World and Sum"
+   ],
+   "metadata": {
+    "id": "E65Z9gKvJMoo"
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "## We will learn to program Nvidia GPUs using CUDA (Compute Unified Device Architecture). \n",
+    "\n",
+    "## Google Colab gives free access (be responsible!) to an Nvidia T4 GPU (Turing Class). \n",
+    "\n",
+    "## Here is a picture of a Turing Class GPU (not T4).\n",
+    "\n",
+    "## Such a GPU is capable of performing thousands of calculations simultaneously!\n",
+    "\n",
+    "## The TU102 shown here has 72 SMs (streaming multiprocessors). "
+   ],
+   "metadata": {
+    "id": "lRDJGEu4aBxN"
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    ""
+   ],
+   "metadata": {
+    "id": "CuJT9I7jGaZL"
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "## The Nvidia T4 is a version of the TU104 GPU (shown below) that has 40 SMs. "
+   ],
+   "metadata": {
+    "id": "jpTq4ik_KWpS"
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    ""
+   ],
+   "metadata": {
+    "id": "ICNsRt6AJ5Te"
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "## One of the 40 SMs on the T4 is shown below. "
+   ],
+   "metadata": {
+    "id": "3tkTfKCSLWrJ"
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    ""
+   ],
+   "metadata": {
+    "id": "lEEY1-yZKq6p"
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "## Here is our first CUDA program : Hello World!\n",
+    "\n",
+    "## Note that a CUDA source file ends with *.cu* and we must include *cuda.h*.\n",
+    "\n",
+    "## A CUDA kernel such as the helloKernel shown below is executed by each thread. \n",
+    "\n",
+    "## A CUDA kernel is similar to an OpenMP parallel region, but there are some differences.\n",
+    "\n",
+    "## We use the command given in line 26 to launch the kernel.\n",
+    "\n",
+    "## The parameters between the <<< and >>> are called *launch parameters*.\n",
+    "\n",
+    "## The first launch parameter is the number of thread blocks. \n",
+    "\n",
+    "## The second launch parameter is the number of threads per thread block. \n",
+    "\n",
+    "## Note that for the examples in this lecture we are specifying only a single thread block. That means that all of our threads will run on a single SM. This is a big limitation as it means we will only use 1/40 of the full computational power of the T4 GPU. \n",
+    "\n",
+    "## Later we will learn how to use multiple thread blocks to unleash the full power of the GPU.\n",
+    "\n",
+    "## Uncomment the writefile magic command to write the source code to the file gpu_hello.cu"
+   ],
+   "metadata": {
+    "id": "3kmpjsMxLonG"
+   }
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "#%%writefile gpu_hello.cu\n",
+    "#include <stdio.h>\n",
+    "#include <stdlib.h>\n",
+    "#include <cuda.h>\n",
+    "\n",
+    "__global__ void helloKernel() {\n",
+    "\n",
+    "  int thread_num = threadIdx.x;\n",
+    "  int num_threads = blockDim.x;\n",
+    "\n",
+    "  printf (\" Hello World! from thread %d of %d\\n\",thread_num,num_threads);\n",
+    "}\n",
+    "\n",
+    "int main(int argc, char **argv) {\n",
+    "\n",
+    "  /* get num_threads from the command line */\n",
+    "  if (argc < 2) {\n",
+    "    printf (\"Command usage : %s %s\\n\",argv[0],\"num_threads\");\n",
+    "    return 1;\n",
+    "  }\n",
+    "\n",
+    "  int num_threads = atoi(argv[1]);\n",
+    "\n",
+    "  printf (\"num_threads = %d\\n\",num_threads);\n",
+    "\n",
+    "  helloKernel <<< 1, num_threads >>> ();\n",
+    "  cudaDeviceSynchronize();\n",
+    "}"
+   ],
+   "metadata": {
+    "id": "p896Aw2JAXvn"
+   },
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "## To compile a CUDA source file we use *nvcc* instead of our normal *gcc*."
+   ],
+   "metadata": {
+    "id": "vLmIVNmk0U6o"
+   }
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "!nvcc -arch=sm_75 -o gpu_hello gpu_hello.cu"
+   ],
+   "metadata": {
+    "id": "3TCNJa5FBIMg"
+   },
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "## Run the program with 32 threads and with 128 threads. What do you observe?\n",
+    "\n",
+    "## Threads are grouped into warps of size 32. \n",
+    "\n",
+    "## Threads in a particular warp execute the same instruction simultaneously but on different data. \n",
+    "\n",
+    "## This type of parallelism is called SIMD (single instruction, multiple data)."
+   ],
+   "metadata": {
+    "id": "Pef7RVqj0aXu"
+   }
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "!./gpu_hello 32"
+   ],
+   "metadata": {
+    "id": "1PgJqgnoB893"
+   },
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "## Next let's write a CUDA program for computing the sum of the first $N$ integers.\n",
+    "\n",
+    "## Recall that Gauss showed that\n",
+    "$$\\displaystyle\\sum_{i=1}^{N} i = \\displaystyle\\frac{N(N+1)}{2}$$\n",
+    "\n",
+    "## For our first version we will just have each thread compute the entire sum and print out the result. Note that the kernel function now has an argument.\n",
+    "\n",
+    "## As in OpenMP, variables defined in a CUDA kernel (including arguments) are private variables (one for each thread) by default. \n",
+    "\n",
+    "## CUDA kernels always have a *void* return type, so outputs must be returned through pointers (see the sketch at the end of this cell). \n",
+    "\n",
+    "## Discussion: How would you change the kernel so that each thread calculates only an approximately equal share of the sum?\n",
+    "\n",
+    "## Note: Unlike OpenMP, neither CUDA nor MPI has built-in support for scheduling for-loop iterations across threads."
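+    ,
+    "\n",
+    "\n",
+    "## The following is a minimal sketch (it is not one of the numbered versions in this lecture) of how a kernel can return a result to the host through a pointer. A single thread is used just to keep the sketch short; *cudaMalloc*, *cudaMemcpy*, and *cudaFree* are standard CUDA runtime functions.\n",
+    "\n",
+    "```c\n",
+    "/* sketch : return the sum to the host through a device pointer */\n",
+    "#include <stdio.h>\n",
+    "#include <cuda.h>\n",
+    "\n",
+    "typedef unsigned long long int uint64;\n",
+    "\n",
+    "__global__ void sumKernel(uint64 N, uint64 *result) {\n",
+    "  uint64 sum = 0;\n",
+    "  for (uint64 i = 1; i <= N; i++) {\n",
+    "    sum += i;\n",
+    "  }\n",
+    "  *result = sum;  /* output goes through the pointer argument */\n",
+    "}\n",
+    "\n",
+    "int main(void) {\n",
+    "  uint64 N = 1000;\n",
+    "  uint64 *d_result, h_result;\n",
+    "  cudaMalloc(&d_result, sizeof(uint64));   /* allocate device memory */\n",
+    "  sumKernel <<< 1, 1 >>> (N, d_result);    /* one block, one thread */\n",
+    "  cudaMemcpy(&h_result, d_result, sizeof(uint64), cudaMemcpyDeviceToHost);\n",
+    "  printf (\"sum = %llu\\n\", h_result);\n",
+    "  cudaFree(d_result);\n",
+    "}\n",
+    "```"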
+ ], + "metadata": { + "id": "1sw0RMMTOyyZ" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_gvrpuHbZ71v" + }, + "outputs": [], + "source": [ + "#%%writefile gpu_sum_v1.cu\n", + "#include <stdio.h>\n", + "#include <stdlib.h>\n", + "#include <cuda.h>\n", + "\n", + "typedef unsigned long long int uint64;\n", + "\n", + "__global__ void sumKernel(uint64 N) {\n", + "\n", + " int thread_num = threadIdx.x;\n", + " int num_threads = blockDim.x;\n", + "\n", + " uint64 sum = 0;\n", + " for (uint64 i = 1; i <= N;i++) {\n", + " sum += i;\n", + " }\n", + "\n", + " printf (\" on thread %d of %d, sum = %llu\\n\",thread_num,num_threads,sum);\n", + "}\n", + "\n", + "int main(int argc, char **argv) {\n", + "\n", + " /* get N and num_threads from the command line */\n", + " if (argc < 3) {\n", + " printf (\"Command usage : %s %s %s\\n\",argv[0],\"N\",\"num_threads\");\n", + " return 1;\n", + " }\n", + "\n", + " uint64 N = atol(argv[1]);\n", + " int num_threads = atoi(argv[2]);\n", + "\n", + " printf (\"num_threads = %d\\n\",num_threads);\n", + " printf (\"N*(N+1)/2 = %llu\\n\",(N/2)*(N+1));\n", + "\n", + " sumKernel <<< 1, num_threads >>> (N);\n", + " cudaDeviceSynchronize();\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "source": [ + "!nvcc -arch=sm_75 -o gpu_sum_v1 gpu_sum_v1.cu" + ], + "metadata": { + "id": "32NaKTasacUy" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!./gpu_sum_v1 1000 2" + ], + "metadata": { + "id": "xRyLt_oAb3UR" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Here is version 2 of the kernel where each thread calculates an approximately equal share of the sum and prints the partial result.\n", + "\n", + "## Discussion: In order for the threads to communicate partial results we will need to use what type of variable?" 
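+    ,
+    "\n",
+    "\n",
+    "## As a small worked example of the cyclic split used in the kernel below, take $N = 10$ and 2 threads: thread 0 handles $i = 1, 3, 5, 7, 9$ and thread 1 handles $i = 2, 4, 6, 8, 10$, so the partial sums are\n",
+    "$$(1+3+5+7+9) + (2+4+6+8+10) = 25 + 30 = 55 = \\displaystyle\\frac{10 \\cdot 11}{2}$$"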
+ ], + "metadata": { + "id": "jgvsnWNLQ-Lg" + } + }, + { + "cell_type": "code", + "source": [ + "#%%writefile gpu_sum_v2.cu\n", + "#include <stdio.h>\n", + "#include <stdlib.h>\n", + "#include <cuda.h>\n", + "\n", + "typedef unsigned long long int uint64;\n", + "\n", + "__global__ void sumKernel(uint64 N) {\n", + "\n", + " int thread_num = threadIdx.x;\n", + " int num_threads = blockDim.x;\n", + "\n", + " uint64 sum = 0;\n", + " for (uint64 i = 1+thread_num; i <= N;i+=num_threads) {\n", + " sum += i;\n", + " }\n", + "\n", + " printf (\" on thread %d of %d, sum = %llu\\n\",thread_num,num_threads,sum);\n", + "}\n", + "\n", + "int main(int argc, char **argv) {\n", + "\n", + " /* get N and num_threads from the command line */\n", + " if (argc < 3) {\n", + " printf (\"Command usage : %s %s %s\\n\",argv[0],\"N\",\"num_threads\");\n", + " return 1;\n", + " }\n", + "\n", + " uint64 N = atol(argv[1]);\n", + " int num_threads = atoi(argv[2]);\n", + "\n", + " printf (\"num_threads = %d\\n\",num_threads);\n", + " printf (\"N*(N+1)/2 = %llu\\n\",(N/2)*(N+1));\n", + "\n", + " sumKernel <<< 1, num_threads >>> (N);\n", + " cudaDeviceSynchronize();\n", + "\n", + "}" + ], + "metadata": { + "id": "Yzl5EWkUcdRG" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!nvcc -arch=sm_75 -o gpu_sum_v2 gpu_sum_v2.cu" + ], + "metadata": { + "id": "xGI6hvjodJwU" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!./gpu_sum_v2 1000 2" + ], + "metadata": { + "id": "-f8yucN8dS85" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Here is version 3 of the kernel where we use a shared variable sum. Note as we frequently do in MPI, we designate the thread with thread_num equal to 0 as a special thread. Note that only the thread 0 initializes the shared variable and prints the final result. \n", + "\n", + "## When we run version 3 we get the incorrect answer. \n", + "\n", + "## Discussion : Describe the problem with the kernel and a potential solution. 
" + ], + "metadata": { + "id": "EKsvLXTw3Rqy" + } + }, + { + "cell_type": "code", + "source": [ + "#%%writefile gpu_sum_v3.cu\n", + "#include <stdio.h>\n", + "#include <stdlib.h>\n", + "#include <cuda.h>\n", + "\n", + "typedef unsigned long long int uint64;\n", + "\n", + "__global__ void sumKernel(uint64 N) {\n", + "\n", + " __shared__ uint64 sum;\n", + "\n", + " int thread_num = threadIdx.x;\n", + " int num_threads = blockDim.x;\n", + "\n", + " /* thread 0 initializes sum to 0 */\n", + " if (thread_num == 0) {\n", + " sum = 0;\n", + " }\n", + "\n", + " /* calculate the sum */\n", + " for (uint64 i = 1+thread_num; i <= N;i+=num_threads) {\n", + " sum += i;\n", + " }\n", + "\n", + " /* thread 0 prints the sum */\n", + " if (thread_num == 0) {\n", + " printf (\" sum = %llu\\n\",sum);\n", + " }\n", + "}\n", + "\n", + "int main(int argc, char **argv) {\n", + "\n", + " /* get N and num_threads from the command line */\n", + " if (argc < 3) {\n", + " printf (\"Command usage : %s %s %s\\n\",argv[0],\"N\",\"num_threads\");\n", + " return 1;\n", + " }\n", + "\n", + " uint64 N = atol(argv[1]);\n", + " int num_threads = atoi(argv[2]);\n", + "\n", + " printf (\"num_threads = %d\\n\",num_threads);\n", + " printf (\"N*(N+1)/2 = %llu\\n\",(N/2)*(N+1));\n", + "\n", + " sumKernel <<< 1, num_threads >>> (N);\n", + " cudaDeviceSynchronize();\n", + "\n", + "}" + ], + "metadata": { + "id": "1Z5WSV0VdhhO" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!nvcc -arch=sm_75 -o gpu_sum_v3 gpu_sum_v3.cu" + ], + "metadata": { + "id": "OXovPeM8d_5a" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!./gpu_sum_v3 1000 4" + ], + "metadata": { + "id": "WeKSCCqKeDut" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Here is version 4 of the kernel that uses an atomic add instruction to avoid the read-write race condition present in the previous version.\n", + "\n", + "## Note that this version of the kernel takes a while to run on a modest N of 100 million with 32 threads. Even though we are only using a small part of the GPU this seems very slow. \n", + "\n", + "## Discussion : Describe the problem with the kernel and a potential solution. \n", + "\n", + "## Hint: the atomic add instruction is serving the same purpose as what OpenMP construct? What do we know about that OpenMP construct?" 
+ ], + "metadata": { + "id": "LRzumKK-4Mdr" + } + }, + { + "cell_type": "code", + "source": [ + "#%%writefile gpu_sum_v4.cu\n", + "#include <stdio.h>\n", + "#include <stdlib.h>\n", + "#include <cuda.h>\n", + "\n", + "typedef unsigned long long int uint64;\n", + "\n", + "__global__ void sumKernel(uint64 N) {\n", + "\n", + " __shared__ uint64 sum;\n", + "\n", + " int thread_num = threadIdx.x;\n", + " int num_threads = blockDim.x;\n", + "\n", + " /* initialize sum to 0 */\n", + " if (thread_num == 0) {\n", + " sum = 0;\n", + " }\n", + "\n", + " /* calculate the sum */\n", + " for (uint64 i = 1+thread_num; i <= N;i+=num_threads) {\n", + " atomicAdd(&sum,i);\n", + " }\n", + "\n", + " /* thread 0 prints the sum */\n", + " if (thread_num == 0) {\n", + " printf (\" sum = %llu\\n\",sum);\n", + " }\n", + "}\n", + "\n", + "int main(int argc, char **argv) {\n", + "\n", + " /* get N and num_threads from the command line */\n", + " if (argc < 3) {\n", + " printf (\"Command usage : %s %s %s\\n\",argv[0],\"N\",\"num_threads\");\n", + " return 1;\n", + " }\n", + "\n", + " uint64 N = atol(argv[1]);\n", + " int num_threads = atoi(argv[2]);\n", + "\n", + " printf (\"num_threads = %d\\n\",num_threads);\n", + " printf (\"N*(N+1)/2 = %llu\\n\",(N/2)*(N+1));\n", + "\n", + " sumKernel <<< 1, num_threads >>> (N);\n", + " cudaDeviceSynchronize();\n", + "\n", + "}" + ], + "metadata": { + "id": "hdHi3sEUeKze" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!nvcc -arch=sm_75 -o gpu_sum_v4 gpu_sum_v4.cu" + ], + "metadata": { + "id": "Hbm8nLE3en-e" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!time ./gpu_sum_v4 100000000 32" + ], + "metadata": { + "id": "KbQmVA0tepY9" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Here is version 5 of the kernel that uses a thread version of the sum variable to pull the atomicAdd outside of the for loop. Note in particular that each thread only executes the atomicAdd one time. \n", + "\n", + "## Note that this version of the kernel is much faster than the previous. In fact it can now calculate a larger sum (N equal to a billion) in around one second.\n", + "\n", + "## Try running the kernel with 64 threads instead of 32 threads. What do you observe?\n", + "\n", + "## Discussion : Describe the problem with the kernel and a potential solution.\n", + "\n", + "## Hints : What would happen if thread 32 finishes updating the shared sum variable with its partial sum before thread 0 initializes sum to 0? What would happen if thread 0 prints the final result before thread 32 finishes updating the shared sum variable with its partial sum?" 
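+    ,
+    "\n",
+    "\n",
+    "## A minimal sketch (not part of the lecture code) that may help with the hints: threads 0-31 form one warp and threads 32-63 form another, and different warps are scheduled independently, so without a barrier there is no guaranteed ordering between thread 0 and thread 32. The built-in device variable *warpSize* is 32 on current Nvidia GPUs.\n",
+    "\n",
+    "```c\n",
+    "/* sketch : print the warp that each thread belongs to */\n",
+    "#include <stdio.h>\n",
+    "#include <cuda.h>\n",
+    "\n",
+    "__global__ void warpKernel() {\n",
+    "  int thread_num = threadIdx.x;\n",
+    "  int warp_num = thread_num / warpSize;\n",
+    "  printf (\" thread %d is in warp %d\\n\", thread_num, warp_num);\n",
+    "}\n",
+    "\n",
+    "int main(void) {\n",
+    "  warpKernel <<< 1, 64 >>> ();\n",
+    "  cudaDeviceSynchronize();\n",
+    "}\n",
+    "```"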
+ ], + "metadata": { + "id": "msfZibpb5-0v" + } + }, + { + "cell_type": "code", + "source": [ + "#%%writefile gpu_sum_v5.cu\n", + "#include <stdio.h>\n", + "#include <stdlib.h>\n", + "#include <cuda.h>\n", + "\n", + "typedef unsigned long long int uint64;\n", + "\n", + "__global__ void sumKernel(uint64 N) {\n", + "\n", + " __shared__ uint64 sum;\n", + "\n", + " int thread_num = threadIdx.x;\n", + " int num_threads = blockDim.x;\n", + "\n", + " /* initialize sum to 0 */\n", + " if (thread_num == 0) {\n", + " sum = 0;\n", + " }\n", + "\n", + " /* calculate the sum */\n", + " uint64 thread_sum = 0;\n", + " for (uint64 i = 1+thread_num; i <= N;i+=num_threads) {\n", + " thread_sum += i;\n", + " }\n", + " atomicAdd(&sum,thread_sum);\n", + "\n", + " /* thread 0 prints the sum */\n", + " if (thread_num == 0) {\n", + " printf (\" sum = %llu\\n\",sum);\n", + " }\n", + "}\n", + "\n", + "int main(int argc, char **argv) {\n", + "\n", + " /* get N and num_threads from the command line */\n", + " if (argc < 3) {\n", + " printf (\"Command usage : %s %s %s\\n\",argv[0],\"N\",\"num_threads\");\n", + " return 1;\n", + " }\n", + "\n", + " uint64 N = atol(argv[1]);\n", + " int num_threads = atoi(argv[2]);\n", + "\n", + " printf (\"num_threads = %d\\n\",num_threads);\n", + " printf (\"N*(N+1)/2 = %llu\\n\",(N/2)*(N+1));\n", + "\n", + " sumKernel <<< 1, num_threads >>> (N);\n", + " cudaDeviceSynchronize();\n", + "\n", + "}" + ], + "metadata": { + "id": "LX27xY56e1LG" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!nvcc -arch=sm_75 -o gpu_sum_v5 gpu_sum_v5.cu" + ], + "metadata": { + "id": "oe-EXddihA22" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!time ./gpu_sum_v5 1000000000 32" + ], + "metadata": { + "id": "tgRlAXswhEVT" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Here is version 6 of the kernel that uses two barriers to fix the bugs in kernel 5. \n", + "## A barrier is a line of code that all threads (in a particular thread block) must get to before any thread is allowed to continue. \n", + "## The first barrier ensures that thread 0 initializes the sum variable to 0 before other threads are allowed to start updating the shared sum variable with the partial sums. \n", + "## The second barrier ensures that all threads have finished adding their partial sums to the shared sum variable before thread 0 prints the final result. 
" + ], + "metadata": { + "id": "JehyAdqP9GKu" + } + }, + { + "cell_type": "code", + "source": [ + "#%%writefile gpu_sum_v6.cu\n", + "#include <stdio.h>\n", + "#include <stdlib.h>\n", + "#include <cuda.h>\n", + "\n", + "typedef unsigned long long int uint64;\n", + "\n", + "__global__ void sumKernel(uint64 N) {\n", + "\n", + " __shared__ uint64 sum;\n", + "\n", + " int thread_num = threadIdx.x;\n", + " int num_threads = blockDim.x;\n", + "\n", + " /* initialize sum to 0 */\n", + " if (thread_num == 0) {\n", + " sum = 0;\n", + " }\n", + " __syncthreads();\n", + "\n", + " /* calculate the sum */\n", + " uint64 thread_sum = 0;\n", + " for (uint64 i = 1+thread_num; i <= N;i+=num_threads) {\n", + " thread_sum += i;\n", + " }\n", + " atomicAdd(&sum,thread_sum);\n", + " __syncthreads();\n", + "\n", + " /* thread 0 prints the sum */\n", + " if (thread_num == 0) {\n", + " printf (\" sum = %llu\\n\",sum);\n", + " }\n", + "}\n", + "\n", + "int main(int argc, char **argv) {\n", + "\n", + " /* get N and num_threads from the command line */\n", + " if (argc < 3) {\n", + " printf (\"Command usage : %s %s %s\\n\",argv[0],\"N\",\"num_threads\");\n", + " return 1;\n", + " }\n", + "\n", + " uint64 N = atol(argv[1]);\n", + " int num_threads = atoi(argv[2]);\n", + "\n", + " printf (\"num_threads = %d\\n\",num_threads);\n", + " printf (\"N*(N+1)/2 = %llu\\n\",(N/2)*(N+1));\n", + "\n", + " sumKernel <<< 1, num_threads >>> (N);\n", + " cudaDeviceSynchronize();\n", + "\n", + "}" + ], + "metadata": { + "id": "VAlmylq0hHGV" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!nvcc -arch=sm_75 -o gpu_sum_v6 gpu_sum_v6.cu" + ], + "metadata": { + "id": "Qqaxd_bgiHSX" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!time ./gpu_sum_v6 1000000000 64" + ], + "metadata": { + "id": "OlXnfwrGiI2H" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "K0teo_chiMsT" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file