From be0644bf088044ad272181641cd9d3db026815f7 Mon Sep 17 00:00:00 2001
From: Jason R Wilson <jasonwil@vt.edu>
Date: Sun, 14 Apr 2024 23:55:19 +0000
Subject: [PATCH] Upload New File

---
 L23/lecture23_hello_sum_key.ipynb | 1171 +++++++++++++++++++++++++++++
 1 file changed, 1171 insertions(+)
 create mode 100644 L23/lecture23_hello_sum_key.ipynb

diff --git a/L23/lecture23_hello_sum_key.ipynb b/L23/lecture23_hello_sum_key.ipynb
new file mode 100644
index 0000000..3ffa29d
--- /dev/null
+++ b/L23/lecture23_hello_sum_key.ipynb
@@ -0,0 +1,1171 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": []
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Lecture 23 : GPU Hello World and Sum"
+ ],
+ "metadata": {
+ "id": "E65Z9gKvJMoo"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## We will learn to program Nvidia GPUs using CUDA (Compute Unified Device Architecture). \n",
+ "\n",
+ "## Google Colab gives free access (be responsible!) to an Nvidia T4 GPU (Turing class). \n",
+ "\n",
+ "## Here is a picture of a Turing-class GPU (not the T4).\n",
+ "\n",
+ "## Such a GPU is capable of performing thousands of calculations simultaneously!\n",
+ "\n",
+ "## The TU102 shown here has 72 SMs (streaming multiprocessors). "
+ ],
+ "metadata": {
+ "id": "lRDJGEu4aBxN"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ ""
+ ],
+ "metadata": {
+ "id": "CuJT9I7jGaZL"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## The Nvidia T4 is a version of the TU104 GPU (shown below) that has 40 SMs. "
+ ],
+ "metadata": {
+ "id": "jpTq4ik_KWpS"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ ""
+ ],
+ "metadata": {
+ "id": "ICNsRt6AJ5Te"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## One of the 40 SMs on the T4 is shown below. "
+ ],
+ "metadata": {
+ "id": "3tkTfKCSLWrJ"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ ""
+ ],
+ "metadata": {
+ "id": "lEEY1-yZKq6p"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Here is our first CUDA program : Hello World!\n",
+ "\n",
+ "## Note that a CUDA source file ends with *.cu* and we must include *cuda.h*.\n",
+ "\n",
+ "## A CUDA kernel such as the helloKernel shown below is executed by each thread. \n",
+ "\n",
+ "## A CUDA kernel is similar to an OpenMP parallel region, but there are some differences.\n",
+ "\n",
+ "## We launch the kernel with the statement near the end of *main* below.\n",
+ "\n",
+ "## The parameters between the <<< and >>> are called *launch parameters*.\n",
+ "\n",
+ "## The first launch parameter is the number of thread blocks. \n",
+ "\n",
+ "## The second launch parameter is the number of threads per thread block. \n",
+ "\n",
+ "## Note that for the examples in this lecture we are specifying only a single thread block. That means that all of our threads will run on a single SM. This is a big limitation, as it means we will use only 1/40 of the full computational power of the T4 GPU. \n",
+ "\n",
+ "## Later we will learn how to use multiple thread blocks to unleash the full power of the GPU.\n",
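+ "\n",
+ "## As a sketch, a one-dimensional kernel launch has the general form shown below (*kernelName*, *num_blocks*, and *threads_per_block* are placeholder names, not part of the CUDA API):\n",
+ "\n",
+ "```c\n",
+ "/* launch num_blocks thread blocks, each with threads_per_block threads */\n",
+ "kernelName <<< num_blocks, threads_per_block >>> (arguments);\n",
+ "```"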
+ ],
+ "metadata": {
+ "id": "3kmpjsMxLonG"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "%%writefile gpu_hello.cu\n",
+ "#include <stdio.h>\n",
+ "#include <stdlib.h>\n",
+ "#include <cuda.h>\n",
+ "\n",
+ "__global__ void helloKernel() {\n",
+ "\n",
+ " int thread_num = threadIdx.x;\n",
+ " int num_threads = blockDim.x;\n",
+ "\n",
+ " printf (\" Hello World! from thread %d of %d\\n\",thread_num,num_threads);\n",
+ "}\n",
+ "\n",
+ "int main(int argc, char **argv) {\n",
+ "\n",
+ " /* get num_threads from the command line */\n",
+ " if (argc < 2) {\n",
+ " printf (\"Command usage : %s %s\\n\",argv[0],\"num_threads\");\n",
+ " return 1;\n",
+ " }\n",
+ "\n",
+ " int num_threads = atoi(argv[1]);\n",
+ "\n",
+ " printf (\"num_threads = %d\\n\",num_threads);\n",
+ "\n",
+ " helloKernel <<< 1, num_threads >>> ();\n",
+ " cudaDeviceSynchronize();\n",
+ "}"
+ ],
+ "metadata": {
+ "id": "p896Aw2JAXvn",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "352cd5c6-7e59-4683-c534-88914b5bab3d"
+ },
+ "execution_count": 1,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Overwriting gpu_hello.cu\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## To compile a CUDA source file we use *nvcc* instead of our normal *gcc*."
+ ],
+ "metadata": {
+ "id": "vLmIVNmk0U6o"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!nvcc -arch=sm_75 -o gpu_hello gpu_hello.cu"
+ ],
+ "metadata": {
+ "id": "3TCNJa5FBIMg"
+ },
+ "execution_count": 2,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Run the program with 32 threads and with 128 threads. What do you observe?\n",
+ "\n",
+ "## Threads are grouped into warps of size 32. \n",
+ "\n",
+ "## Threads in a particular warp execute the same instruction simultaneously but on different data. \n",
+ "\n",
+ "## This type of parallelism is called SIMD (single instruction, multiple data)."
+ ],
+ "metadata": {
+ "id": "Pef7RVqj0aXu"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!./gpu_hello 32"
+ ],
+ "metadata": {
+ "id": "1PgJqgnoB893",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "c12778f1-5f7b-448c-fed3-e0349875cd25"
+ },
+ "execution_count": 3,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "num_threads = 32\n",
+ " Hello World! from thread 0 of 32\n",
+ " Hello World! from thread 1 of 32\n",
+ " Hello World! from thread 2 of 32\n",
+ " Hello World! from thread 3 of 32\n",
+ " Hello World! from thread 4 of 32\n",
+ " Hello World! from thread 5 of 32\n",
+ " Hello World! from thread 6 of 32\n",
+ " Hello World! from thread 7 of 32\n",
+ " Hello World! from thread 8 of 32\n",
+ " Hello World! from thread 9 of 32\n",
+ " Hello World! from thread 10 of 32\n",
+ " Hello World! from thread 11 of 32\n",
+ " Hello World! from thread 12 of 32\n",
+ " Hello World! from thread 13 of 32\n",
+ " Hello World! from thread 14 of 32\n",
+ " Hello World! from thread 15 of 32\n",
+ " Hello World! from thread 16 of 32\n",
+ " Hello World! from thread 17 of 32\n",
+ " Hello World! from thread 18 of 32\n",
+ " Hello World! from thread 19 of 32\n",
+ " Hello World! from thread 20 of 32\n",
+ " Hello World! from thread 21 of 32\n",
+ " Hello World! from thread 22 of 32\n",
+ " Hello World! from thread 23 of 32\n",
+ " Hello World! from thread 24 of 32\n",
+ " Hello World! from thread 25 of 32\n",
+ " Hello World! from thread 26 of 32\n",
+ " Hello World! from thread 27 of 32\n",
+ " Hello World! 
from thread 28 of 32\n", + " Hello World! from thread 29 of 32\n", + " Hello World! from thread 30 of 32\n", + " Hello World! from thread 31 of 32\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "!./gpu_hello 128" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "C1i_9TOe9dGt", + "outputId": "24ca2962-fa95-4e02-e214-599721e8c534" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "num_threads = 128\n", + " Hello World! from thread 64 of 128\n", + " Hello World! from thread 65 of 128\n", + " Hello World! from thread 66 of 128\n", + " Hello World! from thread 67 of 128\n", + " Hello World! from thread 68 of 128\n", + " Hello World! from thread 69 of 128\n", + " Hello World! from thread 70 of 128\n", + " Hello World! from thread 71 of 128\n", + " Hello World! from thread 72 of 128\n", + " Hello World! from thread 73 of 128\n", + " Hello World! from thread 74 of 128\n", + " Hello World! from thread 75 of 128\n", + " Hello World! from thread 76 of 128\n", + " Hello World! from thread 77 of 128\n", + " Hello World! from thread 78 of 128\n", + " Hello World! from thread 79 of 128\n", + " Hello World! from thread 80 of 128\n", + " Hello World! from thread 81 of 128\n", + " Hello World! from thread 82 of 128\n", + " Hello World! from thread 83 of 128\n", + " Hello World! from thread 84 of 128\n", + " Hello World! from thread 85 of 128\n", + " Hello World! from thread 86 of 128\n", + " Hello World! from thread 87 of 128\n", + " Hello World! from thread 88 of 128\n", + " Hello World! from thread 89 of 128\n", + " Hello World! from thread 90 of 128\n", + " Hello World! from thread 91 of 128\n", + " Hello World! from thread 92 of 128\n", + " Hello World! from thread 93 of 128\n", + " Hello World! from thread 94 of 128\n", + " Hello World! from thread 95 of 128\n", + " Hello World! from thread 96 of 128\n", + " Hello World! from thread 97 of 128\n", + " Hello World! from thread 98 of 128\n", + " Hello World! from thread 99 of 128\n", + " Hello World! from thread 100 of 128\n", + " Hello World! from thread 101 of 128\n", + " Hello World! from thread 102 of 128\n", + " Hello World! from thread 103 of 128\n", + " Hello World! from thread 104 of 128\n", + " Hello World! from thread 105 of 128\n", + " Hello World! from thread 106 of 128\n", + " Hello World! from thread 107 of 128\n", + " Hello World! from thread 108 of 128\n", + " Hello World! from thread 109 of 128\n", + " Hello World! from thread 110 of 128\n", + " Hello World! from thread 111 of 128\n", + " Hello World! from thread 112 of 128\n", + " Hello World! from thread 113 of 128\n", + " Hello World! from thread 114 of 128\n", + " Hello World! from thread 115 of 128\n", + " Hello World! from thread 116 of 128\n", + " Hello World! from thread 117 of 128\n", + " Hello World! from thread 118 of 128\n", + " Hello World! from thread 119 of 128\n", + " Hello World! from thread 120 of 128\n", + " Hello World! from thread 121 of 128\n", + " Hello World! from thread 122 of 128\n", + " Hello World! from thread 123 of 128\n", + " Hello World! from thread 124 of 128\n", + " Hello World! from thread 125 of 128\n", + " Hello World! from thread 126 of 128\n", + " Hello World! from thread 127 of 128\n", + " Hello World! from thread 0 of 128\n", + " Hello World! from thread 1 of 128\n", + " Hello World! from thread 2 of 128\n", + " Hello World! from thread 3 of 128\n", + " Hello World! from thread 4 of 128\n", + " Hello World! 
from thread 5 of 128\n",
+ " Hello World! from thread 6 of 128\n",
+ " Hello World! from thread 7 of 128\n",
+ " Hello World! from thread 8 of 128\n",
+ " Hello World! from thread 9 of 128\n",
+ " Hello World! from thread 10 of 128\n",
+ " Hello World! from thread 11 of 128\n",
+ " Hello World! from thread 12 of 128\n",
+ " Hello World! from thread 13 of 128\n",
+ " Hello World! from thread 14 of 128\n",
+ " Hello World! from thread 15 of 128\n",
+ " Hello World! from thread 16 of 128\n",
+ " Hello World! from thread 17 of 128\n",
+ " Hello World! from thread 18 of 128\n",
+ " Hello World! from thread 19 of 128\n",
+ " Hello World! from thread 20 of 128\n",
+ " Hello World! from thread 21 of 128\n",
+ " Hello World! from thread 22 of 128\n",
+ " Hello World! from thread 23 of 128\n",
+ " Hello World! from thread 24 of 128\n",
+ " Hello World! from thread 25 of 128\n",
+ " Hello World! from thread 26 of 128\n",
+ " Hello World! from thread 27 of 128\n",
+ " Hello World! from thread 28 of 128\n",
+ " Hello World! from thread 29 of 128\n",
+ " Hello World! from thread 30 of 128\n",
+ " Hello World! from thread 31 of 128\n",
+ " Hello World! from thread 32 of 128\n",
+ " Hello World! from thread 33 of 128\n",
+ " Hello World! from thread 34 of 128\n",
+ " Hello World! from thread 35 of 128\n",
+ " Hello World! from thread 36 of 128\n",
+ " Hello World! from thread 37 of 128\n",
+ " Hello World! from thread 38 of 128\n",
+ " Hello World! from thread 39 of 128\n",
+ " Hello World! from thread 40 of 128\n",
+ " Hello World! from thread 41 of 128\n",
+ " Hello World! from thread 42 of 128\n",
+ " Hello World! from thread 43 of 128\n",
+ " Hello World! from thread 44 of 128\n",
+ " Hello World! from thread 45 of 128\n",
+ " Hello World! from thread 46 of 128\n",
+ " Hello World! from thread 47 of 128\n",
+ " Hello World! from thread 48 of 128\n",
+ " Hello World! from thread 49 of 128\n",
+ " Hello World! from thread 50 of 128\n",
+ " Hello World! from thread 51 of 128\n",
+ " Hello World! from thread 52 of 128\n",
+ " Hello World! from thread 53 of 128\n",
+ " Hello World! from thread 54 of 128\n",
+ " Hello World! from thread 55 of 128\n",
+ " Hello World! from thread 56 of 128\n",
+ " Hello World! from thread 57 of 128\n",
+ " Hello World! from thread 58 of 128\n",
+ " Hello World! from thread 59 of 128\n",
+ " Hello World! from thread 60 of 128\n",
+ " Hello World! from thread 61 of 128\n",
+ " Hello World! from thread 62 of 128\n",
+ " Hello World! from thread 63 of 128\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Next let's write a CUDA program for computing the sum of the first $N$ integers.\n",
+ "\n",
+ "## Recall that Gauss showed that\n",
+ "$$\displaystyle\sum_{i=1}^{N} i = \displaystyle\frac{N(N+1)}{2}$$\n",
+ "\n",
+ "## For our first version we will just have each thread compute the entire sum and print out the result. Note that the kernel function now has an argument.\n",
+ "\n",
+ "## As in OpenMP, variables defined in a CUDA kernel (including arguments) are private variables (one for each thread) by default. \n",
+ "\n",
+ "## CUDA kernels always have a *void* return type, so outputs must be returned through pointers. \n",
+ "\n",
+ "## Discussion: How would you change the kernel so that each thread calculates only an approximately equal share of the sum?\n",
+ "\n",
+ "## Note: Unlike OpenMP, neither CUDA nor MPI has built-in support for scheduling for-loop iterations across threads.\n",
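+ "\n",
+ "## As a sketch of that pointer pattern (not used in this lecture's versions, which print from inside the kernel): the host allocates device memory, the kernel writes its result there, and the host copies the result back. Here *result_dev* and the extra kernel parameter are illustrative assumptions, not part of the code below.\n",
+ "\n",
+ "```c\n",
+ "/* sketch: assumes a hypothetical kernel of the form sumKernel(uint64 N, uint64 *result) */\n",
+ "uint64 *result_dev;\n",
+ "cudaMalloc((void**)&result_dev,sizeof(uint64));\n",
+ "sumKernel <<< 1, num_threads >>> (N,result_dev);\n",
+ "uint64 result;\n",
+ "cudaMemcpy(&result,result_dev,sizeof(uint64),cudaMemcpyDeviceToHost);\n",
+ "cudaFree(result_dev);\n",
+ "printf (\"sum = %llu\\n\",result);\n",
+ "```"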
+ ],
+ "metadata": {
+ "id": "1sw0RMMTOyyZ"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "id": "_gvrpuHbZ71v",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "8351bbe6-f62a-4471-9926-d72a7aba2ce4"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Overwriting gpu_sum_v1.cu\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%writefile gpu_sum_v1.cu\n",
+ "#include <stdio.h>\n",
+ "#include <stdlib.h>\n",
+ "#include <cuda.h>\n",
+ "\n",
+ "typedef unsigned long long int uint64;\n",
+ "\n",
+ "__global__ void sumKernel(uint64 N) {\n",
+ "\n",
+ " int thread_num = threadIdx.x;\n",
+ " int num_threads = blockDim.x;\n",
+ "\n",
+ " uint64 sum = 0;\n",
+ " for (uint64 i = 1; i <= N;i++) {\n",
+ " sum += i;\n",
+ " }\n",
+ "\n",
+ " printf (\" on thread %d of %d, sum = %llu\\n\",thread_num,num_threads,sum);\n",
+ "}\n",
+ "\n",
+ "int main(int argc, char **argv) {\n",
+ "\n",
+ " /* get N and num_threads from the command line */\n",
+ " if (argc < 3) {\n",
+ " printf (\"Command usage : %s %s %s\\n\",argv[0],\"N\",\"num_threads\");\n",
+ " return 1;\n",
+ " }\n",
+ "\n",
+ " uint64 N = atol(argv[1]);\n",
+ " int num_threads = atoi(argv[2]);\n",
+ "\n",
+ " printf (\"num_threads = %d\\n\",num_threads);\n",
+ " printf (\"N*(N+1)/2 = %llu\\n\",(N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2));\n",
+ "\n",
+ " sumKernel <<< 1, num_threads >>> (N);\n",
+ " cudaDeviceSynchronize();\n",
+ "\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!nvcc -arch=sm_75 -o gpu_sum_v1 gpu_sum_v1.cu"
+ ],
+ "metadata": {
+ "id": "32NaKTasacUy"
+ },
+ "execution_count": 6,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!./gpu_sum_v1 1000 2"
+ ],
+ "metadata": {
+ "id": "xRyLt_oAb3UR",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "32c8cd9b-61cd-456d-da42-3fa6f1c88baf"
+ },
+ "execution_count": 7,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "num_threads = 2\n",
+ "N*(N+1)/2 = 500500\n",
+ " on thread 0 of 2, sum = 500500\n",
+ " on thread 1 of 2, sum = 500500\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Here is version 2 of the kernel where each thread calculates an approximately equal share of the sum and prints the partial result.\n",
+ "\n",
+ "## Discussion: In order for the threads to communicate partial results we will need to use what type of variable?\n",
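+ "\n",
+ "## As a worked example of the cyclic split used below: with N = 10 and 4 threads, thread 0 sums 1+5+9 = 15, thread 1 sums 2+6+10 = 18, thread 2 sums 3+7 = 10, and thread 3 sums 4+8 = 12, giving 15+18+10+12 = 55 = 10*11/2."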
+ ],
+ "metadata": {
+ "id": "jgvsnWNLQ-Lg"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "%%writefile gpu_sum_v2.cu\n",
+ "#include <stdio.h>\n",
+ "#include <stdlib.h>\n",
+ "#include <cuda.h>\n",
+ "\n",
+ "typedef unsigned long long int uint64;\n",
+ "\n",
+ "__global__ void sumKernel(uint64 N) {\n",
+ "\n",
+ " int thread_num = threadIdx.x;\n",
+ " int num_threads = blockDim.x;\n",
+ "\n",
+ " uint64 sum = 0;\n",
+ " for (uint64 i = 1+thread_num; i <= N;i+=num_threads) {\n",
+ " sum += i;\n",
+ " }\n",
+ "\n",
+ " printf (\" on thread %d of %d, sum = %llu\\n\",thread_num,num_threads,sum);\n",
+ "}\n",
+ "\n",
+ "int main(int argc, char **argv) {\n",
+ "\n",
+ " /* get N and num_threads from the command line */\n",
+ " if (argc < 3) {\n",
+ " printf (\"Command usage : %s %s %s\\n\",argv[0],\"N\",\"num_threads\");\n",
+ " return 1;\n",
+ " }\n",
+ "\n",
+ " uint64 N = atol(argv[1]);\n",
+ " int num_threads = atoi(argv[2]);\n",
+ "\n",
+ " printf (\"num_threads = %d\\n\",num_threads);\n",
+ " printf (\"N*(N+1)/2 = %llu\\n\",(N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2));\n",
+ "\n",
+ " sumKernel <<< 1, num_threads >>> (N);\n",
+ " cudaDeviceSynchronize();\n",
+ "\n",
+ "}"
+ ],
+ "metadata": {
+ "id": "Yzl5EWkUcdRG",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "447681fb-ba91-4c76-b0d1-09bd4cfd514d"
+ },
+ "execution_count": 8,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Overwriting gpu_sum_v2.cu\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!nvcc -arch=sm_75 -o gpu_sum_v2 gpu_sum_v2.cu"
+ ],
+ "metadata": {
+ "id": "xGI6hvjodJwU"
+ },
+ "execution_count": 9,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!./gpu_sum_v2 1000 2"
+ ],
+ "metadata": {
+ "id": "-f8yucN8dS85",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "52201f35-1de8-4afa-a387-970b0f3f9588"
+ },
+ "execution_count": 10,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "num_threads = 2\n",
+ "N*(N+1)/2 = 500500\n",
+ " on thread 0 of 2, sum = 250000\n",
+ " on thread 1 of 2, sum = 250500\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Here is version 3 of the kernel, where we use a shared variable sum. As we frequently do in MPI, we designate the thread with thread_num equal to 0 as a special thread. Note that only thread 0 initializes the shared variable and prints the final result. \n",
+ "\n",
+ "## When we run version 3 we get an incorrect answer. \n",
+ "\n",
+ "## Discussion : Describe the problem with the kernel and a potential solution."
+ ],
+ "metadata": {
+ "id": "EKsvLXTw3Rqy"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "%%writefile gpu_sum_v3.cu\n",
+ "#include <stdio.h>\n",
+ "#include <stdlib.h>\n",
+ "#include <cuda.h>\n",
+ "\n",
+ "typedef unsigned long long int uint64;\n",
+ "\n",
+ "__global__ void sumKernel(uint64 N) {\n",
+ "\n",
+ " __shared__ uint64 sum;\n",
+ "\n",
+ " int thread_num = threadIdx.x;\n",
+ " int num_threads = blockDim.x;\n",
+ "\n",
+ " /* thread 0 initializes sum to 0 */\n",
+ " if (thread_num == 0) {\n",
+ " sum = 0;\n",
+ " }\n",
+ "\n",
+ " /* calculate the sum */\n",
+ " for (uint64 i = 1+thread_num; i <= N;i+=num_threads) {\n",
+ " sum += i;\n",
+ " }\n",
+ "\n",
+ " /* thread 0 prints the sum */\n",
+ " if (thread_num == 0) {\n",
+ " printf (\" sum = %llu\\n\",sum);\n",
+ " }\n",
+ "}\n",
+ "\n",
+ "int main(int argc, char **argv) {\n",
+ "\n",
+ " /* get N and num_threads from the command line */\n",
+ " if (argc < 3) {\n",
+ " printf (\"Command usage : %s %s %s\\n\",argv[0],\"N\",\"num_threads\");\n",
+ " return 1;\n",
+ " }\n",
+ "\n",
+ " uint64 N = atol(argv[1]);\n",
+ " int num_threads = atoi(argv[2]);\n",
+ "\n",
+ " printf (\"num_threads = %d\\n\",num_threads);\n",
+ " printf (\"N*(N+1)/2 = %llu\\n\",(N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2));\n",
+ "\n",
+ " sumKernel <<< 1, num_threads >>> (N);\n",
+ " cudaDeviceSynchronize();\n",
+ "\n",
+ "}"
+ ],
+ "metadata": {
+ "id": "1Z5WSV0VdhhO",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "e83d10f1-8055-4ad5-dc10-949d7b8f72e2"
+ },
+ "execution_count": 11,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Overwriting gpu_sum_v3.cu\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!nvcc -arch=sm_75 -o gpu_sum_v3 gpu_sum_v3.cu"
+ ],
+ "metadata": {
+ "id": "OXovPeM8d_5a"
+ },
+ "execution_count": 12,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!./gpu_sum_v3 1000 4"
+ ],
+ "metadata": {
+ "id": "WeKSCCqKeDut",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "4dea4c3e-4f64-41da-f597-5f62cdd32863"
+ },
+ "execution_count": 13,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "num_threads = 4\n",
+ "N*(N+1)/2 = 500500\n",
+ " sum = 124750\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Here is version 4 of the kernel that uses an atomic add instruction to avoid the read-write race condition present in the previous version.\n",
+ "\n",
+ "## Note that this version of the kernel takes a while to run on a modest N of 100 million with 32 threads. Even though we are using only a small part of the GPU, this seems very slow. \n",
+ "\n",
+ "## Discussion : Describe the problem with the kernel and a potential solution. \n",
+ "\n",
+ "## Hint: the atomic add instruction is serving the same purpose as what OpenMP construct? What do we know about that OpenMP construct?\n",
+ ],
+ "metadata": {
+ "id": "LRzumKK-4Mdr"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "%%writefile gpu_sum_v4.cu\n",
+ "#include <stdio.h>\n",
+ "#include <stdlib.h>\n",
+ "#include <cuda.h>\n",
+ "\n",
+ "typedef unsigned long long int uint64;\n",
+ "\n",
+ "__global__ void sumKernel(uint64 N) {\n",
+ "\n",
+ " __shared__ uint64 sum;\n",
+ "\n",
+ " int thread_num = threadIdx.x;\n",
+ " int num_threads = blockDim.x;\n",
+ "\n",
+ " /* initialize sum to 0 */\n",
+ " if (thread_num == 0) {\n",
+ " sum = 0;\n",
+ " }\n",
+ "\n",
+ " /* calculate the sum */\n",
+ " for (uint64 i = 1+thread_num; i <= N;i+=num_threads) {\n",
+ " atomicAdd(&sum,i);\n",
+ " }\n",
+ "\n",
+ " /* thread 0 prints the sum */\n",
+ " if (thread_num == 0) {\n",
+ " printf (\" sum = %llu\\n\",sum);\n",
+ " }\n",
+ "}\n",
+ "\n",
+ "int main(int argc, char **argv) {\n",
+ "\n",
+ " /* get N and num_threads from the command line */\n",
+ " if (argc < 3) {\n",
+ " printf (\"Command usage : %s %s %s\\n\",argv[0],\"N\",\"num_threads\");\n",
+ " return 1;\n",
+ " }\n",
+ "\n",
+ " uint64 N = atol(argv[1]);\n",
+ " int num_threads = atoi(argv[2]);\n",
+ "\n",
+ " printf (\"num_threads = %d\\n\",num_threads);\n",
+ " printf (\"N*(N+1)/2 = %llu\\n\",(N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2));\n",
+ "\n",
+ " sumKernel <<< 1, num_threads >>> (N);\n",
+ " cudaDeviceSynchronize();\n",
+ "\n",
+ "}"
+ ],
+ "metadata": {
+ "id": "hdHi3sEUeKze",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "8086fe10-fe58-4adf-a601-0267ffa232e4"
+ },
+ "execution_count": 14,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Overwriting gpu_sum_v4.cu\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!nvcc -arch=sm_75 -o gpu_sum_v4 gpu_sum_v4.cu"
+ ],
+ "metadata": {
+ "id": "Hbm8nLE3en-e"
+ },
+ "execution_count": 15,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!time ./gpu_sum_v4 100000000 32"
+ ],
+ "metadata": {
+ "id": "KbQmVA0tepY9",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "2b7b09f9-c94a-4232-9664-247f85b82bbd"
+ },
+ "execution_count": 16,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "num_threads = 32\n",
+ "N*(N+1)/2 = 5000000050000000\n",
+ " sum = 5000000050000000\n",
+ "\n",
+ "real\t0m10.015s\n",
+ "user\t0m9.767s\n",
+ "sys\t0m0.143s\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Here is version 5 of the kernel that uses a private per-thread sum variable to pull the atomicAdd outside of the for loop. Note in particular that each thread executes the atomicAdd only one time. \n",
+ "\n",
+ "## Note that this version of the kernel is much faster than the previous version. In fact, it can now calculate a larger sum (N equal to a billion) in around one second.\n",
+ "\n",
+ "## Try running the kernel with 64 threads instead of 32 threads. What do you observe?\n",
+ "\n",
+ "## Discussion : Describe the problem with the kernel and a potential solution.\n",
+ "\n",
+ "## Hints : What would happen if thread 32 finishes updating the shared sum variable with its partial sum before thread 0 initializes sum to 0? What would happen if thread 0 prints the final result before thread 32 finishes updating the shared sum variable with its partial sum?\n",
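+ "\n",
+ "## A quick operation count explains the speedup: version 4 executes one atomicAdd per term (about 100 million serialized updates of sum when N is 100 million), while version 5 executes one atomicAdd per thread (only 32 updates), no matter how large N is."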
+ ],
+ "metadata": {
+ "id": "msfZibpb5-0v"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "%%writefile gpu_sum_v5.cu\n",
+ "#include <stdio.h>\n",
+ "#include <stdlib.h>\n",
+ "#include <cuda.h>\n",
+ "\n",
+ "typedef unsigned long long int uint64;\n",
+ "\n",
+ "__global__ void sumKernel(uint64 N) {\n",
+ "\n",
+ " __shared__ uint64 sum;\n",
+ "\n",
+ " int thread_num = threadIdx.x;\n",
+ " int num_threads = blockDim.x;\n",
+ "\n",
+ " /* initialize sum to 0 */\n",
+ " if (thread_num == 0) {\n",
+ " sum = 0;\n",
+ " }\n",
+ "\n",
+ " /* calculate the sum */\n",
+ " uint64 thread_sum = 0;\n",
+ " for (uint64 i = 1+thread_num; i <= N;i+=num_threads) {\n",
+ " thread_sum += i;\n",
+ " }\n",
+ " atomicAdd(&sum,thread_sum);\n",
+ "\n",
+ " /* thread 0 prints the sum */\n",
+ " if (thread_num == 0) {\n",
+ " printf (\" sum = %llu\\n\",sum);\n",
+ " }\n",
+ "}\n",
+ "\n",
+ "int main(int argc, char **argv) {\n",
+ "\n",
+ " /* get N and num_threads from the command line */\n",
+ " if (argc < 3) {\n",
+ " printf (\"Command usage : %s %s %s\\n\",argv[0],\"N\",\"num_threads\");\n",
+ " return 1;\n",
+ " }\n",
+ "\n",
+ " uint64 N = atol(argv[1]);\n",
+ " int num_threads = atoi(argv[2]);\n",
+ "\n",
+ " printf (\"num_threads = %d\\n\",num_threads);\n",
+ " printf (\"N*(N+1)/2 = %llu\\n\",(N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2));\n",
+ "\n",
+ " sumKernel <<< 1, num_threads >>> (N);\n",
+ " cudaDeviceSynchronize();\n",
+ "\n",
+ "}"
+ ],
+ "metadata": {
+ "id": "LX27xY56e1LG",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "8a155e7c-01c6-4ab6-a7b3-a757e8821977"
+ },
+ "execution_count": 17,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Overwriting gpu_sum_v5.cu\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!nvcc -arch=sm_75 -o gpu_sum_v5 gpu_sum_v5.cu"
+ ],
+ "metadata": {
+ "id": "oe-EXddihA22"
+ },
+ "execution_count": 18,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!time ./gpu_sum_v5 1000000000 32"
+ ],
+ "metadata": {
+ "id": "tgRlAXswhEVT",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "6ea7d97b-2539-4962-a0c9-1b8ee8e4932d"
+ },
+ "execution_count": 19,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "num_threads = 32\n",
+ "N*(N+1)/2 = 500000000500000000\n",
+ " sum = 500000000500000000\n",
+ "\n",
+ "real\t0m0.798s\n",
+ "user\t0m0.674s\n",
+ "sys\t0m0.114s\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!time ./gpu_sum_v5 1000000000 64"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "mOdnYNT7-IfZ",
+ "outputId": "7cef1d8b-b320-4185-ecc2-2264082656b5"
+ },
+ "execution_count": 20,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "num_threads = 64\n",
+ "N*(N+1)/2 = 500000000500000000\n",
+ " sum = 492187500000000000\n",
+ "\n",
+ "real\t0m0.459s\n",
+ "user\t0m0.346s\n",
+ "sys\t0m0.101s\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Here is version 6 of the kernel that uses two barriers to fix the bugs in version 5. \n",
+ "## A barrier is a line of code that all threads (in a particular thread block) must reach before any thread is allowed to continue. \n",
+ "## The first barrier ensures that thread 0 initializes the sum variable to 0 before other threads are allowed to start updating the shared sum variable with the partial sums. \n",
+ "## The second barrier ensures that all threads have finished adding their partial sums to the shared sum variable before thread 0 prints the final result. "
+ ],
+ "metadata": {
+ "id": "JehyAdqP9GKu"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "%%writefile gpu_sum_v6.cu\n",
+ "#include <stdio.h>\n",
+ "#include <stdlib.h>\n",
+ "#include <cuda.h>\n",
+ "\n",
+ "typedef unsigned long long int uint64;\n",
+ "\n",
+ "__global__ void sumKernel(uint64 N) {\n",
+ "\n",
+ " __shared__ uint64 sum;\n",
+ "\n",
+ " int thread_num = threadIdx.x;\n",
+ " int num_threads = blockDim.x;\n",
+ "\n",
+ " /* initialize sum to 0 */\n",
+ " if (thread_num == 0) {\n",
+ " sum = 0;\n",
+ " }\n",
+ " __syncthreads();\n",
+ "\n",
+ " /* calculate the sum */\n",
+ " uint64 thread_sum = 0;\n",
+ " for (uint64 i = 1+thread_num; i <= N;i+=num_threads) {\n",
+ " thread_sum += i;\n",
+ " }\n",
+ " atomicAdd(&sum,thread_sum);\n",
+ " __syncthreads();\n",
+ "\n",
+ " /* thread 0 prints the sum */\n",
+ " if (thread_num == 0) {\n",
+ " printf (\" sum = %llu\\n\",sum);\n",
+ " }\n",
+ "}\n",
+ "\n",
+ "int main(int argc, char **argv) {\n",
+ "\n",
+ " /* get N and num_threads from the command line */\n",
+ " if (argc < 3) {\n",
+ " printf (\"Command usage : %s %s %s\\n\",argv[0],\"N\",\"num_threads\");\n",
+ " return 1;\n",
+ " }\n",
+ "\n",
+ " uint64 N = atol(argv[1]);\n",
+ " int num_threads = atoi(argv[2]);\n",
+ "\n",
+ " printf (\"num_threads = %d\\n\",num_threads);\n",
+ " printf (\"N*(N+1)/2 = %llu\\n\",(N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2));\n",
+ "\n",
+ " sumKernel <<< 1, num_threads >>> (N);\n",
+ " cudaDeviceSynchronize();\n",
+ "\n",
+ "}"
+ ],
+ "metadata": {
+ "id": "VAlmylq0hHGV",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "3006fb61-d79c-43d0-b1a7-2402c1aa616c"
+ },
+ "execution_count": 21,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Overwriting gpu_sum_v6.cu\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!nvcc -arch=sm_75 -o gpu_sum_v6 gpu_sum_v6.cu"
+ ],
+ "metadata": {
+ "id": "Qqaxd_bgiHSX"
+ },
+ "execution_count": 22,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!time ./gpu_sum_v6 1000000000 64"
+ ],
+ "metadata": {
+ "id": "OlXnfwrGiI2H",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "78b728dd-a857-441a-9e3b-c3cd2bb79d31"
+ },
+ "execution_count": 23,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "num_threads = 64\n",
+ "N*(N+1)/2 = 500000000500000000\n",
+ " sum = 500000000500000000\n",
+ "\n",
+ "real\t0m0.463s\n",
+ "user\t0m0.325s\n",
+ "sys\t0m0.117s\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [],
+ "metadata": {
+ "id": "K0teo_chiMsT"
+ },
+ "execution_count": 23,
+ "outputs": []
+ }
+ ]
+}
\ No newline at end of file
--
GitLab