Skip to content
Snippets Groups Projects
Commit 99385e85 authored by Jason R Wilson
Browse files

files for lecture23

parent cc9b938b
Branches
No related merge requests found
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
/*
 * helloKernel: exercise stub.
 *
 * The body currently contains only the placeholder comment below, so a launch
 * of this kernel performs no work and produces no output. main in this file
 * launches it as <<< 1, num_threads >>>.
 */
__global__ void helloKernel() {
/*****************/
/* Add Code Here */
/*****************/
}
/*
 * Launch helloKernel on a single block of num_threads threads.
 *
 * Usage: <program> num_threads
 *
 * Returns 0 on success; 1 on a usage error, a non-positive num_threads, or a
 * CUDA launch/execution error.
 */
int main(int argc, char **argv) {
    /* get num_threads from the command line */
    if (argc < 2) {
        printf ("Command usage : %s %s\n",argv[0],"num_threads");
        return 1;
    }
    int num_threads = atoi(argv[1]);
    printf ("num_threads = %d\n",num_threads);

    /* atoi yields 0 for non-numeric text; a block needs at least one thread */
    if (num_threads < 1) {
        fprintf (stderr,"num_threads must be a positive integer\n");
        return 1;
    }

    helloKernel <<< 1, num_threads >>> ();

    /* a kernel launch returns no status of its own: a rejected configuration
       (for example a block size above the device limit) is only reported
       through cudaGetLastError */
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf (stderr,"kernel launch failed: %s\n",cudaGetErrorString(err));
        return 1;
    }

    /* wait for the kernel to finish and pick up any error raised while it ran */
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf (stderr,"kernel execution failed: %s\n",cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
#!/bin/bash
#SBATCH -A cmda3634_rjh
#SBATCH -p p100_normal_q
#SBATCH -t 5
#SBATCH --gres=gpu:1
#SBATCH -o gpu_hello.out
# Slurm batch job: compile gpu_hello.cu and run the resulting binary.
# Arguments given to this script are forwarded to gpu_hello (num_threads).
# Go to the directory where the job was submitted
# (quoted so a path containing spaces works; abort if the cd fails)
cd "$SLURM_SUBMIT_DIR" || exit 1
# Load CUDA toolkit module
module load cuda11.6/toolkit/11.6.2
# compile; stop on failure so a stale binary from an earlier job is not run
nvcc -w -arch=sm_60 -o gpu_hello gpu_hello.cu || exit 1
# run hello
./gpu_hello "$@"
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
typedef unsigned long long int uint64;
/*
 * sumKernel: every thread independently adds up 1 + 2 + ... + N and prints
 * its own result together with its index in the block and the block size.
 *
 * Only threadIdx.x and blockDim.x are consulted, so no work is divided
 * between threads: each one repeats the full summation.
 */
__global__ void sumKernel(uint64 N) {
    const int tid = threadIdx.x;
    const int block_size = blockDim.x;

    uint64 total = 0;
    uint64 term = 1;
    while (term <= N) {
        total += term;
        term++;
    }

    printf (" on thread %d of %d, sum = %llu\n",tid,block_size,total);
}
/*
 * Print the closed-form value of 1 + 2 + ... + N, then launch sumKernel on a
 * single block of num_threads threads.
 *
 * Usage: <program> N num_threads
 *
 * Returns 0 on success; 1 on a usage error, a non-positive num_threads, or a
 * CUDA launch/execution error.
 */
int main(int argc, char **argv) {
    /* get N and num_threads from the command line */
    if (argc < 3) {
        printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
        return 1;
    }
    /* parse N at full unsigned 64-bit width (atol is limited to long) */
    uint64 N = strtoull(argv[1],NULL,10);
    int num_threads = atoi(argv[2]);
    printf ("num_threads = %d\n",num_threads);

    /* halve whichever of N and N+1 is even before multiplying:
       (N/2)*(N+1) truncates for odd N (N = 5 gave 12 instead of 15) */
    uint64 expected = (N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2);
    printf ("N*(N+1)/2 = %llu\n",expected);

    /* atoi yields 0 for non-numeric text; a block needs at least one thread */
    if (num_threads < 1) {
        fprintf (stderr,"num_threads must be a positive integer\n");
        return 1;
    }

    sumKernel <<< 1, num_threads >>> (N);

    /* a kernel launch returns no status of its own: a rejected configuration
       is only reported through cudaGetLastError */
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf (stderr,"kernel launch failed: %s\n",cudaGetErrorString(err));
        return 1;
    }

    /* wait for the kernel to finish and pick up any error raised while it ran */
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf (stderr,"kernel execution failed: %s\n",cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
#!/bin/bash
#SBATCH -A cmda3634_rjh
#SBATCH -p p100_normal_q
#SBATCH -t 5
#SBATCH --gres=gpu:1
#SBATCH -o gpu_sum.out
# Slurm batch job: compile gpu_sum.cu and run the resulting binary.
# Arguments given to this script are forwarded to gpu_sum (N num_threads).
# Go to the directory where the job was submitted
# (quoted so a path containing spaces works; abort if the cd fails)
cd "$SLURM_SUBMIT_DIR" || exit 1
# Load CUDA toolkit module
module load cuda11.6/toolkit/11.6.2
# compile; stop on failure so a stale binary from an earlier job is not run
nvcc -w -arch=sm_60 -o gpu_sum gpu_sum.cu || exit 1
# run sum
./gpu_sum "$@"
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
/*
 * helloKernel: each thread prints one greeting line containing its index
 * within the block (threadIdx.x) and the block size (blockDim.x).
 */
__global__ void helloKernel() {
    const int block_size = static_cast<int>(blockDim.x);
    const int me = static_cast<int>(threadIdx.x);

    printf (" Hello World! from thread %d of %d\n",me,block_size);
}
/*
 * Launch helloKernel on a single block of num_threads threads.
 *
 * Usage: <program> num_threads
 *
 * Returns 0 on success; 1 on a usage error, a non-positive num_threads, or a
 * CUDA launch/execution error.
 */
int main(int argc, char **argv) {
    /* get num_threads from the command line */
    if (argc < 2) {
        printf ("Command usage : %s %s\n",argv[0],"num_threads");
        return 1;
    }
    int num_threads = atoi(argv[1]);
    printf ("num_threads = %d\n",num_threads);

    /* atoi yields 0 for non-numeric text; a block needs at least one thread */
    if (num_threads < 1) {
        fprintf (stderr,"num_threads must be a positive integer\n");
        return 1;
    }

    helloKernel <<< 1, num_threads >>> ();

    /* a kernel launch returns no status of its own: a rejected configuration
       (for example a block size above the device limit) is only reported
       through cudaGetLastError; without this check no greeting is printed
       and nothing explains why */
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf (stderr,"kernel launch failed: %s\n",cudaGetErrorString(err));
        return 1;
    }

    /* wait for the kernel to finish and pick up any error raised while it ran */
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf (stderr,"kernel execution failed: %s\n",cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
typedef unsigned long long int uint64;
/*
 * sumKernel: each thread computes the complete sum 1 + 2 + ... + N on its own
 * and reports it alongside its thread index and the number of threads in the
 * block. The threads do not share any of the work.
 */
__global__ void sumKernel(uint64 N) {
    const int my_index = threadIdx.x;
    const int threads_in_block = blockDim.x;

    uint64 running = 0;
    uint64 next = 1;
    for (;;) {
        if (next > N) {
            break;
        }
        running += next;
        next++;
    }

    printf (" on thread %d of %d, sum = %llu\n",my_index,threads_in_block,running);
}
/*
 * Print the closed-form value of 1 + 2 + ... + N, then launch sumKernel on a
 * single block of num_threads threads.
 *
 * Usage: <program> N num_threads
 *
 * Returns 0 on success; 1 on a usage error, a non-positive num_threads, or a
 * CUDA launch/execution error.
 */
int main(int argc, char **argv) {
    /* get N and num_threads from the command line */
    if (argc < 3) {
        printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
        return 1;
    }
    /* parse N at full unsigned 64-bit width (atol is limited to long) */
    uint64 N = strtoull(argv[1],NULL,10);
    int num_threads = atoi(argv[2]);
    printf ("num_threads = %d\n",num_threads);

    /* halve whichever of N and N+1 is even before multiplying:
       (N/2)*(N+1) truncates for odd N (N = 5 gave 12 instead of 15) */
    uint64 expected = (N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2);
    printf ("N*(N+1)/2 = %llu\n",expected);

    /* atoi yields 0 for non-numeric text; a block needs at least one thread */
    if (num_threads < 1) {
        fprintf (stderr,"num_threads must be a positive integer\n");
        return 1;
    }

    sumKernel <<< 1, num_threads >>> (N);

    /* a kernel launch returns no status of its own: a rejected configuration
       is only reported through cudaGetLastError */
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf (stderr,"kernel launch failed: %s\n",cudaGetErrorString(err));
        return 1;
    }

    /* wait for the kernel to finish and pick up any error raised while it ran */
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf (stderr,"kernel execution failed: %s\n",cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
typedef unsigned long long int uint64;
/*
 * sumKernel: splits the terms 1..N across the threads of a block.
 *
 * Thread t starts at term 1+t and advances by blockDim.x each step, so it
 * adds 1+t, 1+t+blockDim.x, 1+t+2*blockDim.x, ... up to N. Each thread
 * prints only its own partial sum; the partial sums are not combined here.
 */
__global__ void sumKernel(uint64 N) {
    const int tid = threadIdx.x;
    const int stride = blockDim.x;

    uint64 partial = 0;
    uint64 term = 1+tid;
    while (term <= N) {
        partial += term;
        term += stride;
    }

    printf (" on thread %d of %d, sum = %llu\n",tid,stride,partial);
}
/*
 * Print the closed-form value of 1 + 2 + ... + N, then launch sumKernel on a
 * single block of num_threads threads.
 *
 * Usage: <program> N num_threads
 *
 * Returns 0 on success; 1 on a usage error, a non-positive num_threads, or a
 * CUDA launch/execution error.
 */
int main(int argc, char **argv) {
    /* get N and num_threads from the command line */
    if (argc < 3) {
        printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
        return 1;
    }
    /* parse N at full unsigned 64-bit width (atol is limited to long) */
    uint64 N = strtoull(argv[1],NULL,10);
    int num_threads = atoi(argv[2]);
    printf ("num_threads = %d\n",num_threads);

    /* halve whichever of N and N+1 is even before multiplying:
       (N/2)*(N+1) truncates for odd N (N = 5 gave 12 instead of 15) */
    uint64 expected = (N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2);
    printf ("N*(N+1)/2 = %llu\n",expected);

    /* atoi yields 0 for non-numeric text; a block needs at least one thread */
    if (num_threads < 1) {
        fprintf (stderr,"num_threads must be a positive integer\n");
        return 1;
    }

    sumKernel <<< 1, num_threads >>> (N);

    /* a kernel launch returns no status of its own: a rejected configuration
       is only reported through cudaGetLastError */
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf (stderr,"kernel launch failed: %s\n",cudaGetErrorString(err));
        return 1;
    }

    /* wait for the kernel to finish and pick up any error raised while it ran */
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf (stderr,"kernel execution failed: %s\n",cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
typedef unsigned long long int uint64;
/*
 * sumKernel: the threads of a block cooperatively add 1 + 2 + ... + N into a
 * single __shared__ accumulator, and thread 0 prints the total.
 *
 * Thread t contributes the terms 1+t, 1+t+blockDim.x, ... up to N.
 * Only threadIdx.x / blockDim.x are used, so the result is per block.
 */
__global__ void sumKernel(uint64 N) {
    __shared__ uint64 sum;
    int thread_num = threadIdx.x;
    int num_threads = blockDim.x;
    /* thread 0 initializes sum to 0 */
    if (thread_num == 0) {
        sum = 0;
    }
    /* no thread may add to sum before thread 0 has zeroed it */
    __syncthreads();
    /* calculate the sum */
    for (uint64 i = 1+thread_num; i <= N;i+=num_threads) {
        /* a plain "sum += i" is a non-atomic read-modify-write on a variable
           shared by every thread in the block, so concurrent updates could
           overwrite each other; atomicAdd makes each update indivisible */
        atomicAdd(&sum,i);
    }
    /* every thread's contributions must be in before thread 0 reads sum */
    __syncthreads();
    /* thread 0 prints the sum */
    if (thread_num == 0) {
        printf (" sum = %llu\n",sum);
    }
}
/*
 * Print the closed-form value of 1 + 2 + ... + N, then launch sumKernel on a
 * single block of num_threads threads.
 *
 * Usage: <program> N num_threads
 *
 * Returns 0 on success; 1 on a usage error, a non-positive num_threads, or a
 * CUDA launch/execution error.
 */
int main(int argc, char **argv) {
    /* get N and num_threads from the command line */
    if (argc < 3) {
        printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
        return 1;
    }
    /* parse N at full unsigned 64-bit width (atol is limited to long) */
    uint64 N = strtoull(argv[1],NULL,10);
    int num_threads = atoi(argv[2]);
    printf ("num_threads = %d\n",num_threads);

    /* halve whichever of N and N+1 is even before multiplying:
       (N/2)*(N+1) truncates for odd N (N = 5 gave 12 instead of 15) */
    uint64 expected = (N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2);
    printf ("N*(N+1)/2 = %llu\n",expected);

    /* atoi yields 0 for non-numeric text; a block needs at least one thread */
    if (num_threads < 1) {
        fprintf (stderr,"num_threads must be a positive integer\n");
        return 1;
    }

    sumKernel <<< 1, num_threads >>> (N);

    /* a kernel launch returns no status of its own: a rejected configuration
       is only reported through cudaGetLastError */
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf (stderr,"kernel launch failed: %s\n",cudaGetErrorString(err));
        return 1;
    }

    /* wait for the kernel to finish and pick up any error raised while it ran */
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf (stderr,"kernel execution failed: %s\n",cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
typedef unsigned long long int uint64;
/*
 * sumKernel: the threads of a block cooperatively add 1 + 2 + ... + N into a
 * single __shared__ accumulator using one atomicAdd per term, and thread 0
 * prints the total.
 *
 * Thread t contributes the terms 1+t, 1+t+blockDim.x, ... up to N.
 * Only threadIdx.x / blockDim.x are used, so the result is per block.
 */
__global__ void sumKernel(uint64 N) {
    __shared__ uint64 sum;
    int thread_num = threadIdx.x;
    int num_threads = blockDim.x;
    /* initialize sum to 0 */
    if (thread_num == 0) {
        sum = 0;
    }
    /* without this barrier another thread could atomicAdd into sum before
       thread 0 has zeroed it, and that contribution would be wiped out */
    __syncthreads();
    /* calculate the sum */
    for (uint64 i = 1+thread_num; i <= N;i+=num_threads) {
        atomicAdd(&sum,i);
    }
    /* without this barrier thread 0 could print while other threads are
       still adding their terms */
    __syncthreads();
    /* thread 0 prints the sum */
    if (thread_num == 0) {
        printf (" sum = %llu\n",sum);
    }
}
/*
 * Print the closed-form value of 1 + 2 + ... + N, then launch sumKernel on a
 * single block of num_threads threads.
 *
 * Usage: <program> N num_threads
 *
 * Returns 0 on success; 1 on a usage error, a non-positive num_threads, or a
 * CUDA launch/execution error.
 */
int main(int argc, char **argv) {
    /* get N and num_threads from the command line */
    if (argc < 3) {
        printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
        return 1;
    }
    /* parse N at full unsigned 64-bit width (atol is limited to long) */
    uint64 N = strtoull(argv[1],NULL,10);
    int num_threads = atoi(argv[2]);
    printf ("num_threads = %d\n",num_threads);

    /* halve whichever of N and N+1 is even before multiplying:
       (N/2)*(N+1) truncates for odd N (N = 5 gave 12 instead of 15) */
    uint64 expected = (N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2);
    printf ("N*(N+1)/2 = %llu\n",expected);

    /* atoi yields 0 for non-numeric text; a block needs at least one thread */
    if (num_threads < 1) {
        fprintf (stderr,"num_threads must be a positive integer\n");
        return 1;
    }

    sumKernel <<< 1, num_threads >>> (N);

    /* a kernel launch returns no status of its own: a rejected configuration
       is only reported through cudaGetLastError */
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf (stderr,"kernel launch failed: %s\n",cudaGetErrorString(err));
        return 1;
    }

    /* wait for the kernel to finish and pick up any error raised while it ran */
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf (stderr,"kernel execution failed: %s\n",cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
typedef unsigned long long int uint64;
/*
 * sumKernel: the threads of a block cooperatively compute 1 + 2 + ... + N.
 *
 * Each thread first accumulates its own terms (1+t, 1+t+blockDim.x, ... up to
 * N) in a private variable, then folds that partial sum into a __shared__
 * accumulator with a single atomicAdd. Thread 0 prints the total.
 * Only threadIdx.x / blockDim.x are used, so the result is per block.
 */
__global__ void sumKernel(uint64 N) {
    __shared__ uint64 sum;
    int thread_num = threadIdx.x;
    int num_threads = blockDim.x;
    /* initialize sum to 0 */
    if (thread_num == 0) {
        sum = 0;
    }
    /* without this barrier another thread could atomicAdd into sum before
       thread 0 has zeroed it, and that contribution would be wiped out */
    __syncthreads();
    /* calculate the sum */
    uint64 thread_sum = 0;
    for (uint64 i = 1+thread_num; i <= N;i+=num_threads) {
        thread_sum += i;
    }
    /* one atomic update per thread instead of one per term */
    atomicAdd(&sum,thread_sum);
    /* without this barrier thread 0 could print before the other threads
       have added their partial sums */
    __syncthreads();
    /* thread 0 prints the sum */
    if (thread_num == 0) {
        printf (" sum = %llu\n",sum);
    }
}
/*
 * Print the closed-form value of 1 + 2 + ... + N, then launch sumKernel on a
 * single block of num_threads threads.
 *
 * Usage: <program> N num_threads
 *
 * Returns 0 on success; 1 on a usage error, a non-positive num_threads, or a
 * CUDA launch/execution error.
 */
int main(int argc, char **argv) {
    /* get N and num_threads from the command line */
    if (argc < 3) {
        printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
        return 1;
    }
    /* parse N at full unsigned 64-bit width (atol is limited to long) */
    uint64 N = strtoull(argv[1],NULL,10);
    int num_threads = atoi(argv[2]);
    printf ("num_threads = %d\n",num_threads);

    /* halve whichever of N and N+1 is even before multiplying:
       (N/2)*(N+1) truncates for odd N (N = 5 gave 12 instead of 15) */
    uint64 expected = (N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2);
    printf ("N*(N+1)/2 = %llu\n",expected);

    /* atoi yields 0 for non-numeric text; a block needs at least one thread */
    if (num_threads < 1) {
        fprintf (stderr,"num_threads must be a positive integer\n");
        return 1;
    }

    sumKernel <<< 1, num_threads >>> (N);

    /* a kernel launch returns no status of its own: a rejected configuration
       is only reported through cudaGetLastError */
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf (stderr,"kernel launch failed: %s\n",cudaGetErrorString(err));
        return 1;
    }

    /* wait for the kernel to finish and pick up any error raised while it ran */
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf (stderr,"kernel execution failed: %s\n",cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
typedef unsigned long long int uint64;
/*
 * sumKernel: block-wide computation of 1 + 2 + ... + N.
 *
 * Steps, separated by block barriers:
 *   1. thread 0 zeroes the __shared__ accumulator;
 *   2. every thread adds up its own terms (1+t, 1+t+blockDim.x, ... up to N)
 *      privately and folds the result into the accumulator with one atomicAdd;
 *   3. thread 0 prints the accumulator.
 */
__global__ void sumKernel(uint64 N) {
    __shared__ uint64 sum;
    const int tid = threadIdx.x;
    const int stride = blockDim.x;
    const bool is_leader = (tid == 0);

    /* step 1: the leader clears the shared total */
    if (is_leader) {
        sum = 0;
    }
    __syncthreads();

    /* step 2: private partial sum, then a single atomic contribution */
    uint64 partial = 0;
    uint64 term = 1+tid;
    while (term <= N) {
        partial += term;
        term += stride;
    }
    atomicAdd(&sum,partial);
    __syncthreads();

    /* step 3: the leader reports the result */
    if (is_leader) {
        printf (" sum = %llu\n",sum);
    }
}
/*
 * Print the closed-form value of 1 + 2 + ... + N, then launch sumKernel on a
 * single block of num_threads threads.
 *
 * Usage: <program> N num_threads
 *
 * Returns 0 on success; 1 on a usage error, a non-positive num_threads, or a
 * CUDA launch/execution error.
 */
int main(int argc, char **argv) {
    /* get N and num_threads from the command line */
    if (argc < 3) {
        printf ("Command usage : %s %s %s\n",argv[0],"N","num_threads");
        return 1;
    }
    /* parse N at full unsigned 64-bit width (atol is limited to long) */
    uint64 N = strtoull(argv[1],NULL,10);
    int num_threads = atoi(argv[2]);
    printf ("num_threads = %d\n",num_threads);

    /* halve whichever of N and N+1 is even before multiplying:
       (N/2)*(N+1) truncates for odd N (N = 5 gave 12 instead of 15) */
    uint64 expected = (N % 2 == 0) ? (N/2)*(N+1) : N*((N+1)/2);
    printf ("N*(N+1)/2 = %llu\n",expected);

    /* atoi yields 0 for non-numeric text; a block needs at least one thread */
    if (num_threads < 1) {
        fprintf (stderr,"num_threads must be a positive integer\n");
        return 1;
    }

    sumKernel <<< 1, num_threads >>> (N);

    /* a kernel launch returns no status of its own: a rejected configuration
       is only reported through cudaGetLastError */
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf (stderr,"kernel launch failed: %s\n",cudaGetErrorString(err));
        return 1;
    }

    /* wait for the kernel to finish and pick up any error raised while it ran */
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf (stderr,"kernel execution failed: %s\n",cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment