Newer
Older
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>
typedef unsigned long long int uint64;
__global__ void sumKernel(uint64 N, double* sum) {
uint64 thread_num = (uint64)blockIdx.x*blockDim.x + threadIdx.x;
if (thread_num < N) {
double term = 1.0/(thread_num+1);
atomicAdd(sum,term);
}
int main (int argc, char** argv) {
// get N and B from the command line
// B is the number of threads per block
// we typically choose B to be a multiple of 32
// the maximum value of B is 1024
if (argc < 3) {
printf ("Command usage : %s %s %s\n",argv[0],"N","B");
return 1;
}
uint64 N = atoll(argv[1]);
int B = atoi(argv[2]);
// G is the number of thread blocks
// the maximum number of thread blocks G is 2^31 - 1 = 2147483647
// We choose G to be the minimum number of thread blocks to have at least N threads
int G = (N+B-1)/B;
printf ("N = %llu\n",N);
printf ("threads per block B = %d\n",B);
printf ("number of thread blocks G = %d\n",G);
printf ("number of threads G*B = %llu\n",(uint64)G*B);
// the computed sum in device memory
double* d_sum;
cudaMalloc (&d_sum,sizeof(double));
// initialize the device harmonic sum to 0
cudaMemset (d_sum,0,sizeof(double));
// start the timer
clock_t start = clock();
// launch kernel
sumKernel <<< G, B >>> (N,d_sum);
// copy device sum from device to host
double sum;
cudaMemcpy (&sum, d_sum, sizeof(double),cudaMemcpyDeviceToHost);
// stop the timer
clock_t stop = clock();
double elapsed = (double)(stop-start)/CLOCKS_PER_SEC;
// output the results
printf ("harmonic sum = %.10f\n",sum);
printf ("elapsed time = %.2f seconds\n",elapsed);
// free the memory on the device
cudaFree (d_sum);
}