Newer
Older
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>
typedef unsigned long long int uint64;
__global__ void sumKernel(uint64 N, uint64 T, double* sum) {
uint64 thread_num = (uint64)blockIdx.x*blockDim.x + threadIdx.x;
double thread_sum = 0;
uint64 start = thread_num*T;
uint64 end = start+T;
if (end > N) {
}
atomicAdd(sum,thread_sum);
}
int main (int argc, char** argv) {
// get N, T, and B from the command line
// T is the number of terms per thread
// B is the number of threads per block
// we typically choose B to be a multiple of 32
// the maximum value of B is 1024
if (argc < 4) {
printf ("Command usage : %s %s %s %s\n",argv[0],"N","T","B");
return 1;
}
uint64 N = atoll(argv[1]);
uint64 T = atoll(argv[2]);
int B = atoi(argv[3]);
// G is the number of thread blocks
// the maximum number of thread blocks G is 2^31 - 1 = 2147483647
// We choose G to be the minimum number of thread blocks to have
// at least N/T threads
int G = (N+B*T-1)/(B*T);
printf ("N = %llu\n",N);
printf ("terms per thread T = %llu\n",T);
printf ("threads per block B = %d\n",B);
printf ("number of thread blocks G = %d\n",G);
printf ("number of threads G*B = %llu\n",(uint64)G*B);
// the computed sum in device memory
double* d_sum;
cudaMalloc (&d_sum,sizeof(double));
// initialize the device harmonic sum to 0
cudaMemset (d_sum,0,sizeof(double));
// start the timer
clock_t start = clock();
// launch kernel
sumKernel <<< G, B >>> (N,T,d_sum);
// copy device sum from device to host
double sum;
cudaMemcpy (&sum, d_sum, sizeof(double),cudaMemcpyDeviceToHost);
// stop the timer
clock_t stop = clock();
double elapsed = (double)(stop-start)/CLOCKS_PER_SEC;
// output the results
printf ("sum = %.10f\n",sum);
printf ("elapsed time = %.2f seconds\n",elapsed);
// free the memory on the device
cudaFree (d_sum);
}