pi.cu
                        
                             · 1.1 KiB · Text
                        
                    
                    
                      
                        Ham
                      
                    
                      
                    
                        
                          
                        
                    
                    
                
                
                
#include <ctime>
#include <iostream>

#include <curand.h>
#include <curand_kernel.h>
/**
 * Monte Carlo sampling kernel: every thread draws one random point and
 * tallies it if it lies inside the unit circle.
 *
 * Launch layout: indexed as a 1D grid of 1D blocks. There is no bounds
 * guard, so each launched thread contributes exactly one sample; the
 * caller must normalise by the total number of launched threads.
 *
 * @param count  Counter incremented atomically once per point that
 *               satisfies x*x + y*y <= 1. Must be readable and writable
 *               from device code and zeroed by the caller before launch.
 * @param seed   cuRAND seed shared by all threads; the global thread
 *               index is passed as the per-thread sequence number.
 */
__global__ void calculate_pi(int *count, unsigned long seed) {
    // Flat global thread index, reused as the cuRAND sequence number so
    // each thread initialises its generator differently.
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    curandState rng;
    curand_init(seed, tid, 0, &rng);

    // Draw the x coordinate first, then y, from the same generator state.
    const float px = curand_uniform(&rng);
    const float py = curand_uniform(&rng);
    const float distSq = px * px + py * py;

    // A hit is a point on or inside the unit circle.
    if (distSq <= 1.0f) {
        atomicAdd(count, 1);
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error during " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

/**
 * Estimates pi by Monte Carlo sampling on the GPU and prints the result.
 *
 * Launches calculate_pi with enough 1024-thread blocks to cover at least N
 * samples, copies the hit counter back, and prints 4 * hits / samples.
 *
 * @return 0 on success, 1 if any CUDA call or the kernel launch fails.
 */
int main() {
    const int N = 1000000;  // minimum number of samples requested
    const int threadsPerBlock = 1024;
    const int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;

    // The grid is rounded up to whole blocks and the kernel has no bounds
    // guard, so every launched thread adds a sample. Normalise by the real
    // sample count rather than N, otherwise the estimate is biased upward.
    const long long samples = static_cast<long long>(blocks) * threadsPerBlock;

    int *d_count = NULL;
    int h_count = 0;

    // Allocate the device-side hit counter.
    if (!cudaOk(cudaMalloc((void**)&d_count, sizeof(int)), "cudaMalloc")) {
        return 1;
    }

    // Zero the counter on the device.
    bool ok = cudaOk(
        cudaMemcpy(d_count, &h_count, sizeof(int), cudaMemcpyHostToDevice),
        "cudaMemcpy (host to device)");

    // Launch kernel
    if (ok) {
        const unsigned long seed = static_cast<unsigned long>(std::time(NULL));
        calculate_pi<<<blocks, threadsPerBlock>>>(d_count, seed);
        // A launch does not return a status; query it explicitly.
        ok = cudaOk(cudaGetLastError(), "calculate_pi launch");
    }

    // Copy result back to host. This blocking copy also waits for the
    // kernel, so in-kernel faults are reported here.
    if (ok) {
        ok = cudaOk(
            cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost),
            "cudaMemcpy (device to host)");
    }

    // Free device memory on every path, including failures above.
    cudaFree(d_count);
    if (!ok) {
        return 1;
    }

    // Calculate pi: hits / samples approximates pi / 4.
    const double pi = 4.0 * h_count / static_cast<double>(samples);
    std::cout << "Estimated Pi = " << pi << std::endl;

    return 0;
}
                | 1 | #include <iostream> | 
| 2 | #include <curand.h> | 
| 3 | #include <curand_kernel.h> | 
| 4 | |
| 5 | __global__ void calculate_pi(int *count, unsigned long seed) { | 
| 6 | int idx = threadIdx.x + blockIdx.x * blockDim.x; | 
| 7 | curandState state; | 
| 8 | curand_init(seed, idx, 0, &state); | 
| 9 | |
| 10 | float x = curand_uniform(&state); | 
| 11 | float y = curand_uniform(&state); | 
| 12 | if (x * x + y * y <= 1.0f) { | 
| 13 | atomicAdd(count, 1); | 
| 14 | } | 
| 15 | } | 
| 16 | |
| 17 | int main() { | 
| 18 | int N = 1000000; | 
| 19 | int *d_count, h_count = 0; | 
| 20 | |
| 21 | // Allocate memory on the device | 
| 22 | cudaMalloc((void**)&d_count, sizeof(int)); | 
| 23 | cudaMemcpy(d_count, &h_count, sizeof(int), cudaMemcpyHostToDevice); | 
| 24 | |
| 25 | // Launch kernel | 
| 26 | int threadsPerBlock = 1024; | 
| 27 | int blocks = (N + threadsPerBlock - 1) / threadsPerBlock; | 
| 28 | calculate_pi<<<blocks, threadsPerBlock>>>(d_count, time(NULL)); | 
| 29 | |
| 30 | // Copy result back to host | 
| 31 | cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost); | 
| 32 | |
| 33 | // Calculate pi | 
| 34 | float pi = 4.0f * h_count / N; | 
| 35 | std::cout << "Estimated Pi = " << pi << std::endl; | 
| 36 | |
| 37 | // Free device memory | 
| 38 | cudaFree(d_count); | 
| 39 | |
| 40 | return 0; | 
| 41 | } | 
| 42 |