diff --git a/PSA05/05_psa_gpu_std_dev.ipynb b/PSA05/05_psa_gpu_std_dev.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..d2f9af7346c15b693ee842b928e9d5ac74066a32 --- /dev/null +++ b/PSA05/05_psa_gpu_std_dev.ipynb @@ -0,0 +1,618 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "30fNyHXCuepC" + }, + "source": [ + "# CMDA 3634 SP2024 Parallel Programming Skills Activity\n", + "# GPU Standard Deviation\n", + "# 50 points\n", + "# Instructions\n", + "* Complete this Jupyter notebook by writing and testing the requested CUDA code.\n", + "* Also, answer the given questions in the markdown cells provided. \n", + "* **Upload your completed .ipynb file to your cmda3634_arc repo on code.vt.edu under the directory PSA05.**\n", + "* **Also submit a printed version of your .ipynb file as a .pdf file to Canvas.**\n", + "\n", + "# Academic Integrity\n", + "\n", + "* The use of code from prior sections of the class (or similar classes at other institutions, **Chegg, Course Hero, GitHub,\n", + "Stack Overflow, ChatGPT, rent-a-coder sites, etc.**) is **strictly prohibited**, regardless of how they are obtained.\n", + "\n", + "# Honor Code\n", + "* By submitting this assignment, you acknowledge that you have adhered to the Virginia Tech Honor Code and attest to the following:\n", + " \n", + "*I have neither given nor received unauthorized assistance on this assignment. The work I am presenting is ultimately my own.*\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VsBv8XTDwKjt" + }, + "source": [ + "# Standard Deviation\n", + "\n", + "* The standard deviation of the numbers $1 ... N$ is given by\n", + "$$\\sigma = \\sqrt{\\frac{N^2-1}{12}}$$\n", + "\n", + "* A sequential code to compute the standard deviation of the numbers\n", + "$1 ... N$ is given below. \n", + "\n", + "* To test the sequential version, configure Google Colab to use a CPU runtime (not GPU!). \n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "eBWW9vgft1Uy", + "outputId": "94b07a57-9ef5-4956-92e7-33c73e608556" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Writing std_dev.c\n" + ] + } + ], + "source": [ + "%%writefile std_dev.c\n", + "#include <stdio.h>\n", + "#include <stdlib.h>\n", + "#include <math.h>\n", + "\n", + "typedef unsigned long long int uint64;\n", + "\n", + "int main (int argc, char** argv) {\n", + "\n", + " /* get N from the command line */\n", + " if (argc < 2) {\n", + " printf (\"Command usage : %s %s\\n\",argv[0],\"N\");\n", + " return 1;\n", + " }\n", + " uint64 N = atol(argv[1]);\n", + "\n", + " /* compute the mean */\n", + " uint64 sum = 0;\n", + " for (uint64 i=1;i<=N;i++) {\n", + " sum += i;\n", + " }\n", + " double mean = 1.0*sum/N;\n", + "\n", + " /* compute the sum of differences squared */\n", + " double sum_diff_sq = 0;\n", + " for (uint64 i=1;i<=N;i++) {\n", + " sum_diff_sq += (i-mean)*(i-mean);\n", + " }\n", + "\n", + " /* compute the standard deviation */\n", + " double std_dev = sqrt(sum_diff_sq/N);\n", + "\n", + " /* print the results */\n", + " printf (\"computed std dev is %.1lf\",std_dev);\n", + " printf (\", sqrt((N^2-1)/12) is %.1lf\\n\",sqrt((N*N-1)/12.0));\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jrzCTmYWw22j" + }, + "outputs": [], + "source": [ + "!gcc -o std_dev std_dev.c -lm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "DM9uys2ww6hP", + "outputId": "325662b3-7fd5-4673-a8e4-58ba7423e416" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "computed std dev is 28867513.5, sqrt((N^2-1)/12) is 28867513.5\n", + "\n", + "real\t0m0.642s\n", + "user\t0m0.636s\n", + "sys\t0m0.002s\n" + ] + } + ], + "source": [ + "!time ./std_dev 100000000" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6cVGIqMJyeSU" + }, + "source": [ + "# Part 1 : Complete a CUDA standard deviation version that uses a single thread block.\n", + "\n", + "# 25 points\n", + "\n", + "## For this part the main function is already complete so you just need to write the kernel. \n", + "\n", + "## You will need to have one thread print the standard deviation from inside the kernel.\n", + "\n", + "### Use a GPU runtime on Google Colab to test your code. \n", + "\n", + "### Be careful to only use the GPU runtime when you are actively running CUDA code.\n", + "\n", + "### It is possible to temporarily lose your free access to a GPU on Google!\n", + "\n", + "### In particular, when writing code or answering questions you should have your GPU runtime disconnected. \n", + "\n", + "### You can disconnect your GPU runtime by selecting *Disconnect and delete runtime* from the *Runtime* menu. \n", + "\n", + "# Question: Explain how you made your kernel code thread safe and parallel efficient.\n", + "\n", + "## Answer:\n", + "\n", + "### Hints: Your kernel should use 3 barriers. How many times does each thread execute an atomic instruction?\n", + "\n", + "# Question: What is the primary limitation of using a single thread block? Explain in terms of the GPU hardware (i.e. the SMs--symmetric multiprocessors)\n", + "\n", + "## Answer:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "uGWGpLuJw99u", + "outputId": "f6f5776a-b9f2-425b-a348-a0bb128efcd7" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Writing gpu_std_dev_v1.cu\n" + ] + } + ], + "source": [ + "%%writefile gpu_std_dev_v1.cu\n", + "#include <stdio.h>\n", + "#include <stdlib.h>\n", + "#include <cuda.h>\n", + "\n", + "typedef unsigned long long int uint64;\n", + "\n", + "__global__ void stdevKernel(uint64 N) {\n", + "\n", + " /*****************************/\n", + " /* add your kernel code here */\n", + " /*****************************/\n", + "\n", + "}\n", + "\n", + "int main(int argc, char **argv) {\n", + "\n", + " /* get N and num_threads from the command line */\n", + " if (argc < 3) {\n", + " printf (\"Command usage : %s %s %s\\n\",argv[0],\"N\",\"num_threads\");\n", + " return 1;\n", + " }\n", + "\n", + " uint64 N = atol(argv[1]);\n", + " int num_threads = atoi(argv[2]);\n", + "\n", + " printf (\"num_threads = %d\\n\",num_threads);\n", + "\n", + " stdevKernel <<< 1, num_threads >>> (N);\n", + " cudaDeviceSynchronize();\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "F1yKNW0WycNt" + }, + "outputs": [], + "source": [ + "!nvcc -arch=sm_75 -o gpu_std_dev_v1 gpu_std_dev_v1.cu" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LD0RJxQoBd1U" + }, + "source": [ + "# Use $N$ equal to 1 billion to test your code for accuracy.\n", + "\n", + "### Note: Here we are using 128 threads." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "iwHRLj7D0OAr", + "outputId": "058bb72d-c77d-4c8b-a0ef-ce08c3b53bbe" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "num_threads = 128\n", + "computed std dev is 288675134.6, sqrt((N^2-1)/12) is 288675134.6\n", + "\n", + "real\t0m1.688s\n", + "user\t0m1.339s\n", + "sys\t0m0.249s\n" + ] + } + ], + "source": [ + "!time ./gpu_std_dev_v1 1000000000 128" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uKoooXBkBVN5" + }, + "source": [ + "# Use N equal to 10 billion to test your code for speed.\n", + "\n", + "### Note: In order to load the GPU we need to use a large value of $N$. Unfortunately, the intermediate steps in the standard deviation calculation overflow the double precision data type and so the answers output are not correct!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-XWSIUFaBsmg", + "outputId": "40b1e95e-0d66-4631-e4cd-5a8c4f594536" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "num_threads = 128\n", + "computed std dev is 4684509367.1, sqrt((N^2-1)/12) is 804481180.2\n", + "\n", + "real\t0m12.049s\n", + "user\t0m11.415s\n", + "sys\t0m0.246s\n" + ] + } + ], + "source": [ + "!time ./gpu_std_dev_v1 10000000000 128" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m73X-1sj6R8l" + }, + "source": [ + "# Part 2 : Complete a CUDA Standard Deviation Version that Uses multiple thread blocks.\n", + "\n", + "# 25 points\n", + "\n", + "## For this part you will need to write two kernels. The interfaces to the kernels are provided below.\n", + "\n", + "## Note: In each kernel, each thread calculates the sum of $T$ terms. \n", + "\n", + "## In addition you will have to finish writing the main function which is partially provided below. \n", + "\n", + "# Question: Explain how you made your kernel code thread safe and parallel efficient.\n", + "\n", + "## Answer:\n", + "\n", + "# Question: About how many times faster is your verion 2 kernel (with $T$ equal to 1000) than your version 1 kernel for $N$ equal to 10 billion?\n", + "\n", + "## Answer:\n", + "\n", + "# Question: When you run your version 2 with $N$ equal to 1 billion and $T$ equal to 1, the runtime is actually longer than the version 1 runtime with the same value of $N$! Explain why version 2 is slower than version 1 when $T$ is equal to 1 despite the fact that version 2 is using every SM and version 1 is only using 1 SM. \n", + "\n", + "## Answer:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "iaxwR4Pu3LVh", + "outputId": "d0e918ac-dc46-48b8-e4f6-12df36283f83" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Writing gpu_std_dev_v2.cu\n" + ] + } + ], + "source": [ + "%%writefile gpu_std_dev_v2.cu\n", + "#include <stdio.h>\n", + "#include <stdlib.h>\n", + "#include <cuda.h>\n", + "\n", + "typedef unsigned long long int uint64;\n", + "\n", + "__global__ void sumKernel(uint64 N, uint64 T, uint64* sum) {\n", + "\n", + " /*****************************/\n", + " /* add your kernel code here */\n", + " /*****************************/\n", + "\n", + "}\n", + "\n", + "__global__ void sumdiffsqKernel(uint64 N, uint64 T, double mean, double* sum_diff_sq) {\n", + "\n", + " /*****************************/\n", + " /* add your kernel code here */\n", + " /*****************************/\n", + "\n", + "}\n", + "\n", + "int main (int argc, char** argv) {\n", + "\n", + " /* get N, T, and B from the command line */\n", + " /* T is the number of terms per thread */\n", + " /* B is the number of threads per block */\n", + " /* we typically choose B to be a multiple of 32 */\n", + " /* the maximum value of B is 1024 */\n", + " if (argc < 4) {\n", + " printf (\"Command usage : %s %s %s %s\\n\",argv[0],\"N\",\"T\",\"B\");\n", + " return 1;\n", + " }\n", + " uint64 N = atol(argv[1]);\n", + " uint64 T = atol(argv[2]);\n", + " int B = atoi(argv[3]);\n", + "\n", + " /* G is the number of thread blocks */\n", + " /* the maximum number of thread blocks G is 2^31 - 1 = 2147483647 */\n", + " /* We choose G to be the minimum number of thread blocks to have at least N/T threads */\n", + "\n", + " /***********************************/\n", + " /* add your code to compute G here */\n", + " int G;\n", + " /***********************************/\n", + "\n", + " printf (\"N = %lld\\n\",N);\n", + " printf (\"terms per thread T = %lld\\n\",T);\n", + " printf (\"threads per block B = %d\\n\",B);\n", + " printf (\"number of thread blocks G = %d\\n\",G);\n", + " printf (\"number of threads G*B = %d\\n\",G*B);\n", + "\n", + " /***************************/\n", + " /* add your host code here */\n", + " double std_dev;\n", + " /***************************/\n", + "\n", + " /* output the results */\n", + " printf (\"computed std dev is %.1lf\",std_dev);\n", + " printf (\", sqrt((N^2-1)/12) is %.1lf\\n\",sqrt((N*N-1)/12.0));\n", + "\n", + " /*************************************/\n", + " /* add your code to free memory here */\n", + " /*************************************/\n", + "\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5wZcLFpk-Jfq" + }, + "outputs": [], + "source": [ + "!nvcc -arch=sm_75 -o gpu_std_dev_v2 gpu_std_dev_v2.cu" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PoUaY29fAUgJ" + }, + "source": [ + "# Use $N$ equal to 1 billion to test your code for accuracy.\n", + "\n", + "### Note: Here we are using T=1000 and B=128" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "2_dWDUBr-TF9", + "outputId": "b448cbe2-4dd4-4c56-8c8e-15ba86668fc9" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "N = 1000000000\n", + "terms per thread T = 1000\n", + "threads per block B = 128\n", + "number of thread blocks G = 7813\n", + "number of threads G*B = 1000064\n", + "standard deviation (using formula) = 288675134.59\n", + "computed std dev is 288675134.6, sqrt((N^2-1)/12) is 288675134.6\n", + "\n", + "real\t0m0.320s\n", + "user\t0m0.090s\n", + "sys\t0m0.216s\n" + ] + } + ], + "source": [ + "!time ./gpu_std_dev_v2 1000000000 1000 128" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "F-KyZEI3Ao0B" + }, + "source": [ + "# Use N equal to 10 billion to test your code for speed.\n", + "\n", + "### Note: In order to load the GPU we need to use a large value of $N$. Unfortunately the intermediate steps in the standard deviation calculation overflow the double precision data type and so the answers output are not correct!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "W8hIWHx--Vf0", + "outputId": "8a0a8c69-17a0-4e10-e955-53d5dbc50a6f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "N = 10000000000\n", + "terms per thread T = 1000\n", + "threads per block B = 128\n", + "number of thread blocks G = 78125\n", + "number of threads G*B = 10000000\n", + "standard deviation (using formula) = 804481180.19\n", + "computed std dev is 4684509367.1, sqrt((N^2-1)/12) is 804481180.2\n", + "\n", + "real\t0m0.685s\n", + "user\t0m0.461s\n", + "sys\t0m0.212s\n" + ] + } + ], + "source": [ + "!time ./gpu_std_dev_v2 10000000000 1000 128" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LpMsyYGuMsbs" + }, + "source": [ + "# Use $N$ equal to 1 billion and $T$ equal to 1. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "MXCLYeX2BImo", + "outputId": "c0807d2d-2d47-413e-8cca-578bfcae7888" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "N = 1000000000\n", + "terms per thread T = 1\n", + "threads per block B = 128\n", + "number of thread blocks G = 7812500\n", + "number of threads G*B = 1000000000\n", + "standard deviation (using formula) = 288675134.59\n", + "computed std dev is 288675134.6, sqrt((N^2-1)/12) is 288675134.6\n", + "\n", + "real\t0m3.126s\n", + "user\t0m2.935s\n", + "sys\t0m0.122s\n" + ] + } + ], + "source": [ + "!time ./gpu_std_dev_v2 1000000000 1 128" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "CmjI2XEqM0ix" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}