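// Sums the series 1/1 + 1/2 + ... + 1/N on the GPU and reports the elapsed time.
// Usage: <program> N B   (B is the number of threads per block)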
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>
// Unsigned integer type (at least 64 bits wide) used for the term count N and
// the summation index; it is printed with the %llu format.
using uint64 = unsigned long long int;
// Computes the partial harmonic sum 1/1 + 1/2 + ... + 1/N inside one thread
// block and prints the result from thread 0.
//
// Launch layout: 1D, a single block (<<<1, B>>>). Only threadIdx.x and
// blockDim.x are consulted, so each block of a multi-block launch would redo
// the entire sum and print its own copy.
// Shared memory: one statically allocated double (the block-wide accumulator).
// Requires SM60+ because atomicAdd on a double is used.
// Precondition: N + blockDim.x must not overflow uint64, otherwise the loop
// index wraps around and the loop does not terminate.
__global__ void sumKernel(uint64 N) {
    __shared__ double sum;
    int thread_num = threadIdx.x;
    int num_threads = blockDim.x;

    // initialize sum to 0
    if (thread_num == 0) {
        sum = 0;
    }
    // every thread must observe sum == 0 before it starts accumulating
    __syncthreads();

    // calculate the sum: this thread takes terms thread_num+1,
    // thread_num+1+num_threads, thread_num+1+2*num_threads, ...
    double thread_sum = 0;
    for (uint64 i = 1 + thread_num; i <= N; i += num_threads) {
        thread_sum += 1.0 / i;
    }
    // one atomic per thread folds the private partial into the shared total
    atomicAdd(&sum, thread_sum);
    // wait until every partial has been added before reading the total
    __syncthreads();

    // thread 0 prints the sum
    if (thread_num == 0) {
        printf("sum = %.10f\n", sum);
    }
}

// Host entry point: parses N and B, launches sumKernel on a single block of
// B threads, and reports the processor time spent around the launch.
//
// Returns 0 on success and 1 on a usage error, an invalid argument, or a
// CUDA failure.
int main(int argc, char **argv) {
    // get N and B from the command line
    // B is the number of threads per block
    // we typically choose B to be a multiple of 32
    // the maximum value of B is 1024
    if (argc < 3) {
        printf("Command usage : %s %s %s\n", argv[0], "N", "B");
        return 1;
    }

    // strtoull (rather than atoi/atol) keeps the full 64-bit range and lets us
    // reject input that is not a complete decimal number
    char *end = NULL;
    uint64 N = strtoull(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0') {
        fprintf(stderr, "invalid N: %s\n", argv[1]);
        return 1;
    }

    int B = atoi(argv[2]);
    if (B < 1 || B > 1024) {
        fprintf(stderr, "invalid B: %s (expected 1..1024)\n", argv[2]);
        return 1;
    }

    printf("N = %llu\n", N);
    printf("number of threads = %d\n", B);

    // start the timer
    clock_t start = clock();

    // launch kernel: one block of B threads
    sumKernel<<<1, B>>>(N);
    // launch-configuration errors are only visible through cudaGetLastError;
    // faults during execution surface at the synchronizing call
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) {
        err = cudaDeviceSynchronize();
    }

    // stop the timer
    clock_t stop = clock();

    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }

    double elapsed = (double)(stop - start) / CLOCKS_PER_SEC;
    printf("elapsed time = %.2f seconds\n", elapsed);
    return 0;
}