gpu_harmonic_v3.cu

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>

typedef unsigned long long int uint64;

__global__ void sumKernel(uint64 N, uint64 T, double* sum) {

    uint64 thread_num = (uint64)blockIdx.x*blockDim.x + threadIdx.x;
    double thread_sum = 0;
    uint64 start = thread_num*T;
    uint64 end = start+T;
    if (end > N) {
        end = N;
    }
    for (uint64 i = start; i<end;i++) {
        thread_sum += 1.0/(i+1);
    }
    atomicAdd(sum,thread_sum);
}

int main (int argc, char** argv) {

    // get N, T, and B from the command line
    // T is the number of terms per thread
    // B is the number of threads per block
    // we typically choose B to be a multiple of 32
    // the maximum value of B is 1024
    if (argc < 4) {
        printf ("Command usage : %s %s %s %s\n",argv[0],"N","T","B");
        return 1;
    }
    uint64 N = atoll(argv[1]);
    uint64 T = atoll(argv[2]);
    int B = atoi(argv[3]);

    // G is the number of thread blocks
    // the maximum number of thread blocks G is 2^31 - 1 = 2147483647
    // We choose G to be the minimum number of thread blocks to have 
    // at least N/T threads
    int G = (N+B*T-1)/(B*T);
    printf ("N = %llu\n",N);
    printf ("terms per thread T = %llu\n",T);
    printf ("threads per block B = %d\n",B);
    printf ("number of thread blocks G = %d\n",G);
    printf ("number of threads G*B = %llu\n",(uint64)G*B);

    // the computed sum in device memory
    double* d_sum;
    cudaMalloc (&d_sum,sizeof(double));

    // initialize the device harmonic sum to 0
    cudaMemset (d_sum,0,sizeof(double));

    // start the timer
    clock_t start = clock();

    // launch kernel
    sumKernel <<< G, B >>> (N,T,d_sum);

    // copy device sum from device to host
    double sum;
    cudaMemcpy (&sum, d_sum, sizeof(double),cudaMemcpyDeviceToHost);

    // stop the timer
    clock_t stop = clock();
    double elapsed = (double)(stop-start)/CLOCKS_PER_SEC;

    // output the results
    printf ("sum = %.10f\n",sum);
    printf ("elapsed time = %.2f seconds\n",elapsed);

    // free the memory on the device
    cudaFree (d_sum);

}