/*
 * voxel_benchmark.cu
 *
 * CUDA Voxel Benchmarks for PixelToVoxelProjector
 *
 * Benchmarks:
 * - Ray-casting performance
 * - Voxel update throughput
 * - CUDA kernel performance
 * - Memory access patterns
 * - GPU memory bandwidth
 */

// NOTE(review): the header names on the #include directives were missing
// (bare `#include` lines). The list below covers every name this file uses;
// confirm it against the project's build before merging.
#include <cuda_runtime.h>
#include <float.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// Error checking macro: evaluates a CUDA runtime call exactly once and exits
// with file/line context when it does not return cudaSuccess.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t cuda_check_err_ = (call);                                \
        if (cuda_check_err_ != cudaSuccess) {                                \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(cuda_check_err_));                    \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

// Capacity of the static shared-memory buffer in voxel_reduction_kernel.
#define VOXEL_REDUCTION_MAX_THREADS 256

// Benchmark result structure
typedef struct {
    const char* name;
    double duration_ms;
    double throughput_gops;
    double memory_bandwidth_gbps;
    double kernel_time_ms;
    int blocks;
    int threads_per_block;
} BenchmarkResult;

// 3D vector for ray operations
typedef struct {
    float x, y, z;
} float3_t;

// Timing utilities

// Returns the CLOCK_MONOTONIC time in milliseconds.
double get_time_ms() {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec * 1000.0 + ts.tv_nsec / 1000000.0;
}

// ==============================================================================
// Host helpers
// ==============================================================================

// Number of voxels in an N^3 grid, computed in size_t so that N > 1290 does
// not overflow the int multiplication N * N * N.
__host__ __device__ static size_t voxel_count(int N) {
    return (size_t)N * (size_t)N * (size_t)N;
}

// Exits with `message` on stderr when `ok` is false.
static void require(bool ok, const char* message) {
    if (!ok) {
        fprintf(stderr, "%s\n", message);
        exit(EXIT_FAILURE);
    }
}

// malloc that never returns NULL: exits on allocation failure. A zero-byte
// request is rounded up to one byte so the NULL check stays meaningful.
static void* checked_malloc(size_t bytes) {
    void* ptr = malloc(bytes > 0 ? bytes : 1);
    if (ptr == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
        exit(EXIT_FAILURE);
    }
    return ptr;
}

// Ceil-divides `work_items` by `threads` to get a 1D grid size. Returns at
// least 1 (a zero-block launch is an invalid configuration) and exits if the
// block count does not fit in an int.
static int blocks_for(size_t work_items, int threads) {
    size_t blocks = (work_items + (size_t)threads - 1) / (size_t)threads;
    if (blocks == 0) {
        blocks = 1;
    }
    if (blocks > (size_t)INT_MAX) {
        fprintf(stderr, "Launch needs %zu blocks, which exceeds the grid limit\n", blocks);
        exit(EXIT_FAILURE);
    }
    return (int)blocks;
}

// ==============================================================================
// KERNEL 1: Voxel Ray Casting (DDA Algorithm)
// ==============================================================================

// Builds a float3_t from its components.
__host__ __device__ float3_t make_float3_dev(float x, float y, float z) {
    float3_t v;
    v.x = x;
    v.y = y;
    v.z = z;
    return v;
}

// Scales v to unit length. Vectors shorter than 1e-6 are returned unchanged
// to avoid dividing by (almost) zero.
__host__ __device__ float3_t normalize_dev(float3_t v) {
    float len = sqrtf(v.x * v.x + v.y * v.y + v.z * v.z);
    if (len > 1e-6f) {
        v.x /= len;
        v.y /= len;
        v.z /= len;
    }
    return v;
}

// Clips the ray interval [*t_min, *t_max] against one axis-aligned slab
// [bmin, bmax]. `o` and `d` are the ray origin and direction on that axis.
// Returns false when the ray misses the slab.
__host__ __device__ static bool clip_to_slab(float o, float d, float bmin, float bmax,
                                             float* t_min, float* t_max) {
    if (fabsf(d) > 1e-6f) {
        float t1 = (bmin - o) / d;
        float t2 = (bmax - o) / d;
        *t_min = fmaxf(*t_min, fminf(t1, t2));
        *t_max = fminf(*t_max, fmaxf(t1, t2));
        return *t_min <= *t_max;
    }
    // Ray is parallel to this slab: it hits only if the origin lies inside it.
    return !(o < bmin || o > bmax);
}

// Index of the voxel containing `pos` along one axis, clamped to [0, N-1].
// The clamp matters for rays that enter through a max face: there
// (pos - axis_min) / voxel_size evaluates to exactly N, which would otherwise
// be treated as outside the grid and the ray would be dropped.
__host__ __device__ static int entry_cell(float pos, float axis_min, float voxel_size, int N) {
    int cell = (int)floorf((pos - axis_min) / voxel_size);
    if (cell < 0) {
        cell = 0;
    }
    if (cell > N - 1) {
        cell = N - 1;
    }
    return cell;
}

// Per-axis DDA setup: step direction, ray-parameter distance between
// successive voxel boundaries (*t_delta), and the ray parameter of the first
// boundary crossing (*t_next). A zero direction component never crosses a
// boundary, so both are set to FLT_MAX instead of dividing by zero (which
// could produce NaN and corrupt the axis selection).
__host__ __device__ static void dda_axis_setup(float o, float d, float axis_min,
                                               float voxel_size, int cell,
                                               int* step, float* t_delta, float* t_next) {
    *step = (d >= 0.0f) ? 1 : -1;
    if (d != 0.0f) {
        float boundary = (cell + (*step > 0 ? 1 : 0)) * voxel_size + axis_min;
        *t_delta = fabsf(voxel_size / d);
        *t_next = fabsf((boundary - o) / d);
    } else {
        *t_delta = FLT_MAX;
        *t_next = FLT_MAX;
    }
}

// Casts one ray through an N^3 voxel grid and returns the sum of the values
// of every voxel it visits (0 when the ray misses the grid).
//
//   voxel_grid  - N*N*N floats, indexed as (ix * N + iy) * N + iz
//   N           - voxels per axis
//   voxel_size  - edge length of one voxel
//   grid_center - center of the cubic grid
//   origin      - ray origin
//   direction   - ray direction (normalized here; need not be unit length)
//
// Callable from host and device so the traversal can be checked against a
// CPU reference.
__host__ __device__ static float trace_ray_dda(const float* voxel_grid, int N, float voxel_size,
                                               float3_t grid_center, float3_t origin,
                                               float3_t direction) {
    if (N <= 0 || !(voxel_size > 0.0f)) {
        return 0.0f;
    }

    float3_t dir = normalize_dev(direction);

    // Compute grid bounds
    float half_size = 0.5f * (N * voxel_size);
    float3_t grid_min = make_float3_dev(grid_center.x - half_size,
                                        grid_center.y - half_size,
                                        grid_center.z - half_size);
    float3_t grid_max = make_float3_dev(grid_center.x + half_size,
                                        grid_center.y + half_size,
                                        grid_center.z + half_size);

    // Ray-box intersection (slab method). t_min starts at 0 so only the part
    // of the ray in front of the origin is considered.
    float t_min = 0.0f;
    float t_max = 1e10f;
    if (!clip_to_slab(origin.x, dir.x, grid_min.x, grid_max.x, &t_min, &t_max) ||
        !clip_to_slab(origin.y, dir.y, grid_min.y, grid_max.y, &t_min, &t_max) ||
        !clip_to_slab(origin.z, dir.z, grid_min.z, grid_max.z, &t_min, &t_max)) {
        return 0.0f;
    }

    // DDA traversal: start at the point where the ray enters the grid.
    float3_t start_pos = make_float3_dev(origin.x + t_min * dir.x,
                                         origin.y + t_min * dir.y,
                                         origin.z + t_min * dir.z);

    int ix = entry_cell(start_pos.x, grid_min.x, voxel_size, N);
    int iy = entry_cell(start_pos.y, grid_min.y, voxel_size, N);
    int iz = entry_cell(start_pos.z, grid_min.z, voxel_size, N);

    int step_x, step_y, step_z;
    float t_delta_x, t_delta_y, t_delta_z;
    float t_max_x, t_max_y, t_max_z;
    dda_axis_setup(origin.x, dir.x, grid_min.x, voxel_size, ix, &step_x, &t_delta_x, &t_max_x);
    dda_axis_setup(origin.y, dir.y, grid_min.y, voxel_size, iy, &step_y, &t_delta_y, &t_max_y);
    dda_axis_setup(origin.z, dir.z, grid_min.z, voxel_size, iz, &step_z, &t_delta_z, &t_max_z);

    float accumulated = 0.0f;
    // A ray visits at most 3N - 2 voxels, so this bound only guards against
    // runaway loops; it never truncates a valid traversal.
    const int max_steps = N * 3;

    for (int steps = 0; steps < max_steps; steps++) {
        // Access voxel (size_t index: ix * N * N overflows int for large N)
        size_t idx = ((size_t)ix * (size_t)N + (size_t)iy) * (size_t)N + (size_t)iz;
        accumulated += voxel_grid[idx];

        // Step to next voxel along the axis whose boundary is crossed first
        if (t_max_x < t_max_y && t_max_x < t_max_z) {
            ix += step_x;
            t_max_x += t_delta_x;
        } else if (t_max_y < t_max_z) {
            iy += step_y;
            t_max_y += t_delta_y;
        } else {
            iz += step_z;
            t_max_z += t_delta_z;
        }

        if (ix < 0 || ix >= N || iy < 0 || iy >= N || iz < 0 || iz >= N) {
            break;
        }
    }

    return accumulated;
}

// One thread per ray: thread `ray_idx` traces ray `ray_idx` and writes the
// accumulated voxel sum to ray_results[ray_idx].
// Launch: 1D grid with at least num_rays threads in total; no shared memory.
__global__ void raycast_voxel_kernel(
    const float* __restrict__ voxel_grid,        // Grid data (N^3)
    int N,                                       // Grid size
    float voxel_size,                            // Voxel size
    float3_t grid_center,                        // Grid center
    const float3_t* __restrict__ ray_origins,    // Ray origins
    const float3_t* __restrict__ ray_directions, // Ray directions
    float* __restrict__ ray_results,             // Output: accumulated values
    int num_rays
) {
    int ray_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (ray_idx >= num_rays) return;

    ray_results[ray_idx] = trace_ray_dda(voxel_grid, N, voxel_size, grid_center,
                                         ray_origins[ray_idx], ray_directions[ray_idx]);
}

// ==============================================================================
// KERNEL 2: Voxel Update (Atomic Operations)
// ==============================================================================

// One thread per update: atomically adds update_values[i] to the voxel at
// flat index update_indices[i]. Indices outside [0, N^3) are skipped instead
// of writing out of bounds.
// Launch: 1D grid with at least num_updates threads in total.
__global__ void voxel_update_kernel(
    float* voxel_grid,
    int N,
    const int* __restrict__ update_indices, // Flat indices
    const float* __restrict__ update_values,
    int num_updates
) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= num_updates) return;

    int voxel_idx = update_indices[idx];
    float value = update_values[idx];

    if (voxel_idx < 0 || (size_t)voxel_idx >= voxel_count(N)) return;

    // Atomic add to voxel grid
    atomicAdd(&voxel_grid[voxel_idx], value);
}

// ==============================================================================
// KERNEL 3: Memory Bandwidth Test (Coalesced Access)
// ==============================================================================

// One thread per element: output[i] = input[i] * 2 + 1. Consecutive threads
// touch consecutive elements.
// Launch: 1D grid with at least num_elements threads in total.
__global__ void memory_bandwidth_kernel(
    const float* __restrict__ input,
    float* __restrict__ output,
    size_t num_elements
) {
    // Widen before multiplying so large element counts do not wrap.
    size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= num_elements) return;

    // Coalesced memory access pattern
    output[idx] = input[idx] * 2.0f + 1.0f;
}

// ==============================================================================
// KERNEL 4: Voxel Reduction (Sum all voxels)
// ==============================================================================

// Each block sums blockDim.x consecutive voxels of an N^3 grid and writes the
// block total to partial_sums[blockIdx.x].
// Launch: 1D grid; blockDim.x must not exceed VOXEL_REDUCTION_MAX_THREADS
// (blocks that violate this write nothing). Any block size up to that limit
// is handled, not only powers of two.
// NOTE: nothing in this file launches this kernel.
__global__ void voxel_reduction_kernel(
    const float* __restrict__ voxel_grid,
    float* partial_sums,
    int N
) {
    __shared__ float shared_sum[VOXEL_REDUCTION_MAX_THREADS];

    // Uniform across the block, so returning here cannot split a barrier.
    if (blockDim.x > VOXEL_REDUCTION_MAX_THREADS) return;

    unsigned int tid = threadIdx.x;
    size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    size_t total = voxel_count(N);

    // Load data into shared memory
    shared_sum[tid] = (idx < total) ? voxel_grid[idx] : 0.0f;
    __syncthreads();

    // Reduction in shared memory. Each round folds the upper part of the
    // active range onto the lower part; rounding the stride up keeps the
    // middle element when the active count is odd.
    for (unsigned int active = blockDim.x; active > 1;) {
        unsigned int stride = (active + 1) / 2;
        if (tid < active / 2) {
            shared_sum[tid] += shared_sum[tid + stride];
        }
        __syncthreads();
        active = stride;
    }

    // Write result
    if (tid == 0) {
        partial_sums[blockIdx.x] = shared_sum[0];
    }
}

// ==============================================================================
// Benchmark Functions
// ==============================================================================

// Prints one benchmark result to stdout.
void print_result(BenchmarkResult res) {
    printf("\n");
    printf("========================================\n");
    printf("Benchmark: %s\n", res.name);
    printf("========================================\n");
    printf("Duration: %.2f ms\n", res.duration_ms);
    printf("Throughput: %.2f GOPS\n", res.throughput_gops);
    printf("Memory BW: %.2f GB/s\n", res.memory_bandwidth_gbps);
    printf("Kernel Time: %.2f ms\n", res.kernel_time_ms);
    printf("Blocks: %d\n", res.blocks);
    printf("Threads/Block: %d\n", res.threads_per_block);
    printf("========================================\n");
}

// Times raycast_voxel_kernel on a random N^3 grid with num_rays random rays.
// Returns wall-clock and event-timed durations plus rough throughput and
// bandwidth estimates. Exits the process on any CUDA or allocation failure.
BenchmarkResult benchmark_raycast(int N, int num_rays) {
    printf("\nBenchmarking Ray Casting (%d^3 grid, %d rays)...\n", N, num_rays);
    require(N > 0, "benchmark_raycast: grid size N must be positive");
    require(num_rays >= 0, "benchmark_raycast: num_rays must not be negative");

    BenchmarkResult res;
    res.name = "Voxel Ray Casting (DDA)";

    // Allocate grid
    const size_t num_voxels = voxel_count(N);
    const size_t grid_size = num_voxels * sizeof(float);
    float* h_grid = (float*)checked_malloc(grid_size);
    float* d_grid;

    // Initialize grid with random values
    for (size_t i = 0; i < num_voxels; i++) {
        h_grid[i] = (float)rand() / RAND_MAX;
    }

    CUDA_CHECK(cudaMalloc(&d_grid, grid_size));
    CUDA_CHECK(cudaMemcpy(d_grid, h_grid, grid_size, cudaMemcpyHostToDevice));

    // Allocate rays
    const size_t ray_bytes = (size_t)num_rays * sizeof(float3_t);
    const size_t result_bytes = (size_t)num_rays * sizeof(float);
    float3_t* h_origins = (float3_t*)checked_malloc(ray_bytes);
    float3_t* h_directions = (float3_t*)checked_malloc(ray_bytes);

    float3_t* d_origins;
    float3_t* d_directions;
    float* d_results;
    CUDA_CHECK(cudaMalloc(&d_origins, ray_bytes));
    CUDA_CHECK(cudaMalloc(&d_directions, ray_bytes));
    CUDA_CHECK(cudaMalloc(&d_results, result_bytes));

    // Generate random rays
    for (int i = 0; i < num_rays; i++) {
        h_origins[i].x = ((float)rand() / RAND_MAX - 0.5f) * 2000.0f;
        h_origins[i].y = ((float)rand() / RAND_MAX - 0.5f) * 2000.0f;
        h_origins[i].z = ((float)rand() / RAND_MAX - 0.5f) * 2000.0f;

        h_directions[i].x = (float)rand() / RAND_MAX - 0.5f;
        h_directions[i].y = (float)rand() / RAND_MAX - 0.5f;
        h_directions[i].z = (float)rand() / RAND_MAX - 0.5f;
    }

    CUDA_CHECK(cudaMemcpy(d_origins, h_origins, ray_bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_directions, h_directions, ray_bytes, cudaMemcpyHostToDevice));

    // Launch configuration
    int threads = 256;
    int blocks = blocks_for((size_t)num_rays, threads);
    res.blocks = blocks;
    res.threads_per_block = threads;

    float3_t grid_center;
    grid_center.x = 0.0f;
    grid_center.y = 0.0f;
    grid_center.z = 500.0f;

    // Warmup
    raycast_voxel_kernel<<<blocks, threads>>>(
        d_grid, N, 6.0f, grid_center,
        d_origins, d_directions, d_results, num_rays
    );
    CUDA_CHECK(cudaGetLastError());      // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize()); // errors raised while the kernel ran

    // Benchmark
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    double cpu_start = get_time_ms();
    CUDA_CHECK(cudaEventRecord(start));

    raycast_voxel_kernel<<<blocks, threads>>>(
        d_grid, N, 6.0f, grid_center,
        d_origins, d_directions, d_results, num_rays
    );
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    double cpu_end = get_time_ms();

    float kernel_time;
    CUDA_CHECK(cudaEventElapsedTime(&kernel_time, start, stop));

    res.duration_ms = cpu_end - cpu_start;
    res.kernel_time_ms = kernel_time;

    // Estimate operations (rays * steps_per_ray)
    long long ops = (long long)num_rays * N; // Approximation
    res.throughput_gops = (ops / 1e9) / (kernel_time / 1000.0);

    // Memory bandwidth (rough estimate)
    double bytes = (double)grid_size + 2.0 * (double)ray_bytes;
    res.memory_bandwidth_gbps = (bytes / 1e9) / (kernel_time / 1000.0);

    // Cleanup
    CUDA_CHECK(cudaFree(d_grid));
    CUDA_CHECK(cudaFree(d_origins));
    CUDA_CHECK(cudaFree(d_directions));
    CUDA_CHECK(cudaFree(d_results));
    free(h_grid);
    free(h_origins);
    free(h_directions);
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return res;
}

// Times voxel_update_kernel applying num_updates random atomic adds to a
// zeroed N^3 grid. Exits the process on any CUDA or allocation failure.
BenchmarkResult benchmark_voxel_updates(int N, int num_updates) {
    printf("\nBenchmarking Voxel Updates (%d^3 grid, %d updates)...\n", N, num_updates);
    require(N > 0, "benchmark_voxel_updates: grid size N must be positive");
    require(num_updates >= 0, "benchmark_voxel_updates: num_updates must not be negative");

    BenchmarkResult res;
    res.name = "Voxel Updates (Atomic)";

    const size_t num_voxels = voxel_count(N);
    // Update indices are stored as int, so the grid must be int-addressable.
    require(num_voxels <= (size_t)INT_MAX,
            "benchmark_voxel_updates: grid has more voxels than an int index can address");
    const int total_voxels = (int)num_voxels;

    const size_t grid_size = num_voxels * sizeof(float);
    float* d_grid;
    CUDA_CHECK(cudaMalloc(&d_grid, grid_size));
    CUDA_CHECK(cudaMemset(d_grid, 0, grid_size));

    // Generate random updates
    const size_t index_bytes = (size_t)num_updates * sizeof(int);
    const size_t value_bytes = (size_t)num_updates * sizeof(float);
    int* h_indices = (int*)checked_malloc(index_bytes);
    float* h_values = (float*)checked_malloc(value_bytes);

    for (int i = 0; i < num_updates; i++) {
        h_indices[i] = rand() % total_voxels;
        h_values[i] = (float)rand() / RAND_MAX;
    }

    int* d_indices;
    float* d_values;
    CUDA_CHECK(cudaMalloc(&d_indices, index_bytes));
    CUDA_CHECK(cudaMalloc(&d_values, value_bytes));
    CUDA_CHECK(cudaMemcpy(d_indices, h_indices, index_bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_values, h_values, value_bytes, cudaMemcpyHostToDevice));

    // Launch configuration
    int threads = 256;
    int blocks = blocks_for((size_t)num_updates, threads);
    res.blocks = blocks;
    res.threads_per_block = threads;

    // Warmup
    voxel_update_kernel<<<blocks, threads>>>(d_grid, N, d_indices, d_values, num_updates);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Benchmark
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    double cpu_start = get_time_ms();
    CUDA_CHECK(cudaEventRecord(start));

    voxel_update_kernel<<<blocks, threads>>>(d_grid, N, d_indices, d_values, num_updates);
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));
    double cpu_end = get_time_ms();

    float kernel_time;
    CUDA_CHECK(cudaEventElapsedTime(&kernel_time, start, stop));

    res.duration_ms = cpu_end - cpu_start;
    res.kernel_time_ms = kernel_time;
    res.throughput_gops = (num_updates / 1e9) / (kernel_time / 1000.0);

    double bytes = (double)index_bytes + (double)value_bytes;
    res.memory_bandwidth_gbps = (bytes / 1e9) / (kernel_time / 1000.0);

    // Cleanup
    CUDA_CHECK(cudaFree(d_grid));
    CUDA_CHECK(cudaFree(d_indices));
    CUDA_CHECK(cudaFree(d_values));
    free(h_indices);
    free(h_values);
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return res;
}

// Times memory_bandwidth_kernel over num_elements floats (one read and one
// write per element). Exits the process on any CUDA failure.
BenchmarkResult benchmark_memory_bandwidth(size_t num_elements) {
    printf("\nBenchmarking Memory Bandwidth (%zu elements)...\n", num_elements);

    BenchmarkResult res;
    res.name = "Memory Bandwidth (Coalesced)";

    size_t size = num_elements * sizeof(float);
    float* d_input;
    float* d_output;
    CUDA_CHECK(cudaMalloc(&d_input, size));
    CUDA_CHECK(cudaMalloc(&d_output, size));
    CUDA_CHECK(cudaMemset(d_input, 0, size));

    int threads = 256;
    int blocks = blocks_for(num_elements, threads);
    res.blocks = blocks;
    res.threads_per_block = threads;

    // Warmup
    memory_bandwidth_kernel<<<blocks, threads>>>(d_input, d_output, num_elements);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Benchmark
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    CUDA_CHECK(cudaEventRecord(start));
    memory_bandwidth_kernel<<<blocks, threads>>>(d_input, d_output, num_elements);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    float kernel_time;
    CUDA_CHECK(cudaEventElapsedTime(&kernel_time, start, stop));

    res.duration_ms = kernel_time;
    res.kernel_time_ms = kernel_time;

    // Read + Write
    double bytes = 2.0 * (double)size;
    res.memory_bandwidth_gbps = (bytes / 1e9) / (kernel_time / 1000.0);
    res.throughput_gops = (num_elements / 1e9) / (kernel_time / 1000.0);

    // Cleanup
    CUDA_CHECK(cudaFree(d_input));
    CUDA_CHECK(cudaFree(d_output));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    return res;
}

// ==============================================================================
// Main
// ==============================================================================

// Prints the properties of CUDA device 0, runs the three benchmarks and
// prints a summary. Returns 1 when no CUDA device is present.
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;

    printf("========================================\n");
    printf("CUDA Voxel Benchmark Suite\n");
    printf("========================================\n");

    // Check CUDA device
    int device_count;
    CUDA_CHECK(cudaGetDeviceCount(&device_count));

    if (device_count == 0) {
        fprintf(stderr, "No CUDA devices found!\n");
        return 1;
    }

    cudaDeviceProp prop;
    CUDA_CHECK(cudaGetDeviceProperties(&prop, 0));

    printf("\nGPU: %s\n", prop.name);
    printf("Compute Capability: %d.%d\n", prop.major, prop.minor);
    printf("Global Memory: %.2f GB\n", prop.totalGlobalMem / 1e9);
    printf("Multiprocessors: %d\n", prop.multiProcessorCount);
    printf("Max Threads/Block: %d\n", prop.maxThreadsPerBlock);

    // Run benchmarks
    enum { NUM_BENCHMARKS = 3 };
    BenchmarkResult results[NUM_BENCHMARKS];

    results[0] = benchmark_raycast(500, 100000);
    results[1] = benchmark_voxel_updates(500, 1000000);
    results[2] = benchmark_memory_bandwidth(500 * 500 * 500);

    // Print all results
    printf("\n\n");
    printf("========================================\n");
    printf("BENCHMARK SUMMARY\n");
    printf("========================================\n");

    for (int i = 0; i < NUM_BENCHMARKS; i++) {
        print_result(results[i]);
    }

    printf("\nBenchmark suite completed!\n");

    return 0;
}