// gpu_harmonic_v1.cu

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>

// Unsigned integer type used for the summation limit N and the loop index.
typedef unsigned long long uint64;

/*
 * sumKernel: computes 1/1 + 1/2 + ... + 1/N and prints the result.
 *
 * The terms are divided among the threads of a block: thread t handles
 * i = t+1, t+1+blockDim.x, t+1+2*blockDim.x, ... up to N.  Each thread
 * accumulates its own partial sum and then adds it into a single
 * __shared__ double with atomicAdd; thread 0 prints the total.
 *
 * Only threadIdx.x and blockDim.x are consulted (blockIdx is not used),
 * so every block of a launch computes and prints the full sum.
 *
 * Note: atomicAdd on a double requires compute capability 6.0 or newer.
 */
__global__ void sumKernel(uint64 N) {

    // accumulator shared by all threads of the block
    __shared__ double block_total;

    const int tid = threadIdx.x;
    const int stride = blockDim.x;

    // thread 0 clears the accumulator; the barrier keeps every other
    // thread from adding into it before it has been zeroed
    if (tid == 0) {
        block_total = 0;
    }
    __syncthreads();

    // per-thread partial sum over this thread's share of the terms
    double partial = 0;
    uint64 i = 1 + tid;
    while (i <= N) {
        partial += 1.0 / i;
        i += stride;
    }

    // fold the partial sum into the block total, then wait until
    // every thread has contributed
    atomicAdd(&block_total, partial);
    __syncthreads();

    // only thread 0 reports the result
    if (tid != 0) {
        return;
    }
    printf ("sum = %.10f\n",block_total);

}

/*
 * Host entry point.
 *
 * Usage: <program> N B
 *   N  upper limit of the sum (non-negative decimal integer)
 *   B  number of threads per block, 1..1024 (typically a multiple of 32)
 *
 * Launches sumKernel with one block of B threads, waits for it to finish,
 * and prints the elapsed time.  Returns 0 on success and 1 on a usage
 * error, an invalid argument, or any CUDA failure.
 */
int main (int argc, char** argv) {

    // largest block size accepted on the command line
    const int max_threads_per_block = 1024;

    // get N and B from the command line
    // B is the number of threads per block
    // we typically choose B to be a multiple of 32
    if (argc < 3) {
        printf ("Command usage : %s %s %s\n",argv[0],"N","B");
        return 1;
    }

    // parse N strictly: strtoull accepts a leading '-' and wraps the value
    // to a huge unsigned number, so a minus sign is rejected explicitly,
    // as is any trailing non-numeric text
    char* n_end = NULL;
    uint64 N = strtoull(argv[1],&n_end,10);
    bool n_has_minus = false;
    for (const char* p = argv[1]; p != n_end; ++p) {
        if (*p == '-') {
            n_has_minus = true;
        }
    }
    if (n_end == argv[1] || *n_end != '\0' || n_has_minus) {
        fprintf (stderr,"N must be a non-negative integer (got \"%s\")\n",argv[1]);
        return 1;
    }

    // a block needs at least one thread and at most max_threads_per_block;
    // non-numeric input makes atoi return 0 and is rejected here as well
    int B = atoi(argv[2]);
    if (B < 1 || B > max_threads_per_block) {
        fprintf (stderr,"B must be between 1 and %d (got \"%s\")\n",
                 max_threads_per_block,argv[2]);
        return 1;
    }

    printf ("N = %llu\n",N);
    printf ("number of threads = %d\n",B);

    // start the timer
    // NOTE: clock() reports processor time used by this process, not
    // wall-clock time
    clock_t start = clock();

    // launch kernel; a launch does not return a status itself, so a bad
    // configuration is only visible through cudaGetLastError()
    sumKernel <<< 1, B >>> (N);
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        fprintf (stderr,"kernel launch failed: %s\n",cudaGetErrorString(status));
        return 1;
    }

    // wait for the kernel; errors raised while it runs are reported here
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
        fprintf (stderr,"kernel execution failed: %s\n",cudaGetErrorString(status));
        return 1;
    }

    // stop the timer
    clock_t stop = clock();
    double elapsed = (double)(stop-start)/CLOCKS_PER_SEC;
    printf ("elapsed time = %.2f seconds\n",elapsed);

    return 0;
}