gpu_harmonic_v2.cu

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>

typedef unsigned long long int uint64;

__global__ void sumKernel(uint64 N, double* sum) {

    uint64 thread_num = (uint64)blockIdx.x*blockDim.x + threadIdx.x;
    if (thread_num < N) {
        double term = 1.0/(thread_num+1);
        atomicAdd(sum,term);
    }

}

int main (int argc, char** argv) {

    // get N and B from the command line
    // B is the number of threads per block
    // we typically choose B to be a multiple of 32
    // the maximum value of B is 1024
    if (argc < 3) {
        printf ("Command usage : %s %s %s\n",argv[0],"N","B");
        return 1;
    }
    uint64 N = atoll(argv[1]);
    int B = atoi(argv[2]);

    // G is the number of thread blocks
    // the maximum number of thread blocks G is 2^31 - 1 = 2147483647
    // We choose G to be the minimum number of thread blocks to have at least N threads
    int G = (N+B-1)/B;
    printf ("N = %llu\n",N);
    printf ("threads per block B = %d\n",B);
    printf ("number of thread blocks G = %d\n",G);
    printf ("number of threads G*B = %llu\n",(uint64)G*B);

    // the computed sum in device memory
    double* d_sum;
    cudaMalloc (&d_sum,sizeof(double));

    // initialize the device harmonic sum to 0
    cudaMemset (d_sum,0,sizeof(double));

    // start the timer
    clock_t start = clock();

    // launch kernel
    sumKernel <<< G, B >>> (N,d_sum);

    // copy device sum from device to host
    double sum;
    cudaMemcpy (&sum, d_sum, sizeof(double),cudaMemcpyDeviceToHost);

    // stop the timer
    clock_t stop = clock();
    double elapsed = (double)(stop-start)/CLOCKS_PER_SEC;

    // output the results
    printf ("harmonic sum = %.10f\n",sum);
    printf ("elapsed time = %.2f seconds\n",elapsed);

    // free the memory on the device
    cudaFree (d_sum);

}