#include <ctime>
#include <iostream>

#include <curand.h>
#include <curand_kernel.h>

/**
 * Monte Carlo sampling kernel for estimating pi.
 *
 * Every launched thread draws one point (x, y) with curand_uniform and tests
 * whether it lies inside the unit quarter circle (x^2 + y^2 <= 1). The number
 * of hits across the whole grid is added to *count.
 *
 * Launch layout: the sample index is derived from the x dimension only, so a
 * 1D grid of 1D blocks is expected. There is no bounds guard: the number of
 * samples drawn equals the total number of launched threads, and the caller
 * must divide by that number (not by a rounded-down request) when forming the
 * estimate.
 *
 * Shared memory: one statically allocated int per block.
 *
 * @param count  Device pointer to the hit accumulator. It is only ever added
 *               to, so the caller must initialise it (normally to 0) before
 *               the launch.
 * @param seed   cuRAND seed shared by all threads; each thread selects its own
 *               subsequence from its global thread index.
 */
__global__ void calculate_pi(int *count, unsigned long seed) {
    // Hits are first tallied per block so that only one atomic per block
    // reaches the single global counter.
    __shared__ int blockHits;

    // Exactly one thread per block resets and later publishes the tally,
    // whatever the block shape.
    const bool isBlockLeader =
        (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0);
    if (isBlockLeader) {
        blockHits = 0;
    }
    __syncthreads();  // the reset must be visible before any thread adds to it

    // Widen before multiplying: the 32-bit product blockIdx.x * blockDim.x
    // wraps on very large grids, which would hand several threads the same
    // subsequence id.
    const unsigned long long idx =
        static_cast<unsigned long long>(blockIdx.x) * blockDim.x + threadIdx.x;

    curandState state;
    curand_init(seed, idx, 0, &state);

    const float x = curand_uniform(&state);
    const float y = curand_uniform(&state);
    if (x * x + y * y <= 1.0f) {
        atomicAdd(&blockHits, 1);
    }
    __syncthreads();  // every thread of the block has contributed by now

    if (isBlockLeader && blockHits > 0) {
        atomicAdd(count, blockHits);
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error during " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Estimates pi by Monte Carlo sampling on the GPU and prints the result.
// Returns 0 on success and 1 if any CUDA call or the kernel launch fails.
int main() {
    const int N = 1000000;  // minimum number of samples requested

    // Launch configuration: 256 threads per block is a multiple of the warp
    // size; the block count is rounded up so that at least N threads run.
    const int threadsPerBlock = 256;
    const int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;

    // The kernel has no bounds guard, so every launched thread draws a sample.
    // The estimate must be normalised by this number, not by N, or it is
    // biased upwards by the rounding of the grid size.
    const long long totalSamples =
        static_cast<long long>(blocks) * threadsPerBlock;

    int *d_count = NULL;
    int h_count = 0;

    // Allocate and zero the hit counter on the device
    bool ok = cudaSucceeded(cudaMalloc((void**)&d_count, sizeof(int)),
                            "cudaMalloc");
    ok = ok && cudaSucceeded(cudaMemset(d_count, 0, sizeof(int)),
                             "cudaMemset");

    // Launch kernel
    if (ok) {
        const unsigned long seed = static_cast<unsigned long>(std::time(NULL));
        calculate_pi<<<blocks, threadsPerBlock>>>(d_count, seed);
        // Launch-configuration errors are only visible through cudaGetLastError.
        ok = cudaSucceeded(cudaGetLastError(), "calculate_pi launch");
    }

    // Copy result back to host. The blocking copy waits for the kernel, so
    // errors raised while the kernel runs are reported here.
    ok = ok && cudaSucceeded(cudaMemcpy(&h_count, d_count, sizeof(int),
                                        cudaMemcpyDeviceToHost),
                             "cudaMemcpy (device to host)");

    // Free device memory on every path; cudaFree on a null pointer is a no-op.
    cudaFree(d_count);

    if (!ok) {
        return 1;
    }

    // Calculate pi: hits / samples approximates pi / 4.
    const double pi = 4.0 * h_count / static_cast<double>(totalSamples);
    std::cout << "Estimated Pi = " << pi << std::endl;

    return 0;
}
