#include #include #include #include "voxel_cuda.h" namespace py = pybind11; // ============================================================================ // Python Wrapper Classes // ============================================================================ class VoxelGridGPU { private: float* d_voxel_grid; VoxelGridParams params; bool allocated; public: VoxelGridGPU(int N, float voxel_size, py::array_t grid_center) { auto center = grid_center.unchecked<1>(); if (center.shape(0) != 3) { throw std::runtime_error("grid_center must have 3 elements"); } params.N = N; params.voxel_size = voxel_size; params.grid_center.x = center(0); params.grid_center.y = center(1); params.grid_center.z = center(2); allocateVoxelGrid(N, &d_voxel_grid); params.data = d_voxel_grid; allocated = true; } ~VoxelGridGPU() { if (allocated) { freeVoxelGrid(d_voxel_grid); allocated = false; } } void clear(int stream_id = 0) { cudaStream_t stream = 0; clearVoxelGrid(d_voxel_grid, params.N, stream); } py::array_t toHost() { size_t total_size = (size_t)params.N * params.N * params.N; py::array_t result({params.N, params.N, params.N}); auto buf = result.request(); float* ptr = static_cast(buf.ptr); copyVoxelGridToHost(d_voxel_grid, ptr, params.N); return result; } VoxelGridParams getParams() const { return params; } int getN() const { return params.N; } float getVoxelSize() const { return params.voxel_size; } }; // ============================================================================ // Camera Manager with Multi-Stream Support // ============================================================================ class CameraStreamManager { private: int num_streams; cudaStream_t* streams; std::vector cameras; public: CameraStreamManager(int num_cameras) : num_streams(num_cameras) { initCudaStreams(num_streams, &streams); cameras.resize(num_cameras); } ~CameraStreamManager() { cleanupCudaStreams(num_streams, streams); } void setCamera(int cam_id, py::array_t position, py::array_t rotation_matrix, float fov_rad, int width, int height) { if (cam_id < 0 || cam_id >= num_streams) { throw std::runtime_error("Invalid camera ID"); } auto pos = position.unchecked<1>(); auto rot = rotation_matrix.unchecked<1>(); if (pos.shape(0) != 3) { throw std::runtime_error("position must have 3 elements"); } if (rot.shape(0) != 9) { throw std::runtime_error("rotation_matrix must have 9 elements (flattened 3x3)"); } cameras[cam_id].position.x = pos(0); cameras[cam_id].position.y = pos(1); cameras[cam_id].position.z = pos(2); for (int i = 0; i < 9; i++) { cameras[cam_id].rotation.m[i] = rot(i); } cameras[cam_id].fov_rad = fov_rad; cameras[cam_id].width = width; cameras[cam_id].height = height; cameras[cam_id].camera_id = cam_id; } void processFrames( py::array_t prev_frames, py::array_t curr_frames, VoxelGridGPU& voxel_grid, float motion_threshold = 2.0f) { // Validate input shapes if (prev_frames.ndim() != 3 || curr_frames.ndim() != 3) { throw std::runtime_error("Frame arrays must be 3D (num_cameras, height, width)"); } auto prev_buf = prev_frames.request(); auto curr_buf = curr_frames.request(); int num_cams = prev_buf.shape[0]; if (num_cams != num_streams) { throw std::runtime_error("Number of frames doesn't match number of cameras"); } // Prepare host pointers std::vector h_prev_frames(num_cams); std::vector h_curr_frames(num_cams); float* prev_data = static_cast(prev_buf.ptr); float* curr_data = static_cast(curr_buf.ptr); for (int i = 0; i < num_cams; i++) { int frame_size = cameras[i].width * cameras[i].height; h_prev_frames[i] = prev_data + i * frame_size; h_curr_frames[i] = curr_data + i * frame_size; } // Process on GPU processMultipleCameras( h_prev_frames, h_curr_frames, cameras, voxel_grid.getParams(), num_cams, motion_threshold, streams ); } void processSingleFrame( int cam_id, py::array_t frame, VoxelGridGPU& voxel_grid, float min_threshold = 1e-3f) { if (cam_id < 0 || cam_id >= num_streams) { throw std::runtime_error("Invalid camera ID"); } // Allocate device memory auto frame_buf = frame.request(); int frame_size = cameras[cam_id].width * cameras[cam_id].height; float* d_frame; cudaMalloc(&d_frame, frame_size * sizeof(float)); cudaMemcpyAsync(d_frame, frame_buf.ptr, frame_size * sizeof(float), cudaMemcpyHostToDevice, streams[cam_id]); // Cast rays castRaysFullFrameGPU( d_frame, cameras[cam_id], voxel_grid.getParams(), streams[cam_id] ); cudaStreamSynchronize(streams[cam_id]); cudaFree(d_frame); } int getNumStreams() const { return num_streams; } }; // ============================================================================ // Standalone Utility Functions // ============================================================================ void py_printDeviceInfo(int device_id = 0) { printCudaDeviceInfo(device_id); } bool py_checkComputeCapability(int major, int minor, int device_id = 0) { return checkComputeCapability(major, minor, device_id); } void py_optimizeFor8K() { optimizeFor8K(); } py::array_t py_detectMotion( py::array_t prev_frame, py::array_t curr_frame, float threshold = 2.0f) { auto prev_buf = prev_frame.request(); auto curr_buf = curr_frame.request(); if (prev_buf.ndim != 2 || curr_buf.ndim != 2) { throw std::runtime_error("Frames must be 2D arrays"); } int height = prev_buf.shape[0]; int width = prev_buf.shape[1]; int frame_size = width * height; // Allocate device memory float *d_prev, *d_curr, *d_diff; bool *d_mask; cudaMalloc(&d_prev, frame_size * sizeof(float)); cudaMalloc(&d_curr, frame_size * sizeof(float)); cudaMalloc(&d_diff, frame_size * sizeof(float)); cudaMalloc(&d_mask, frame_size * sizeof(bool)); cudaMemcpy(d_prev, prev_buf.ptr, frame_size * sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(d_curr, curr_buf.ptr, frame_size * sizeof(float), cudaMemcpyHostToDevice); // Detect motion detectMotionGPU(d_prev, d_curr, d_mask, d_diff, width, height, threshold, 0); // Copy result back py::array_t result({height, width}); auto result_buf = result.request(); cudaMemcpy(result_buf.ptr, d_diff, frame_size * sizeof(float), cudaMemcpyDeviceToHost); // Cleanup cudaFree(d_prev); cudaFree(d_curr); cudaFree(d_diff); cudaFree(d_mask); return result; } void py_benchmark(int width = 7680, int height = 4320, int num_cameras = 10, int voxel_size = 500, int iterations = 100) { benchmarkRayCasting(width, height, num_cameras, voxel_size, iterations); } py::array_t py_applyGaussianBlur( py::array_t voxel_grid, float sigma = 1.0f) { auto buf = voxel_grid.request(); if (buf.ndim != 3) { throw std::runtime_error("Voxel grid must be 3D"); } int N = buf.shape[0]; if (buf.shape[1] != N || buf.shape[2] != N) { throw std::runtime_error("Voxel grid must be cubic (NxNxN)"); } size_t size = (size_t)N * N * N * sizeof(float); // Allocate device memory float *d_input, *d_output; cudaMalloc(&d_input, size); cudaMalloc(&d_output, size); cudaMemcpy(d_input, buf.ptr, size, cudaMemcpyHostToDevice); // Apply blur applyGaussianBlurGPU(d_input, d_output, N, sigma, 0); // Copy result back py::array_t result({N, N, N}); auto result_buf = result.request(); cudaMemcpy(result_buf.ptr, d_output, size, cudaMemcpyDeviceToHost); // Cleanup cudaFree(d_input); cudaFree(d_output); return result; } // ============================================================================ // Pybind11 Module Definition // ============================================================================ PYBIND11_MODULE(voxel_cuda, m) { m.doc() = "CUDA-accelerated voxel processing for multi-camera systems"; // VoxelGridGPU class py::class_(m, "VoxelGridGPU") .def(py::init>(), py::arg("N"), py::arg("voxel_size"), py::arg("grid_center"), "Initialize GPU voxel grid\n\n" "Parameters:\n" " N: Grid size (creates NxNxN grid)\n" " voxel_size: Size of each voxel in world units\n" " grid_center: 3D array [x, y, z] for grid center position") .def("clear", &VoxelGridGPU::clear, py::arg("stream_id") = 0, "Clear voxel grid to zeros") .def("to_host", &VoxelGridGPU::toHost, "Copy voxel grid from GPU to CPU as numpy array") .def("get_N", &VoxelGridGPU::getN, "Get grid size") .def("get_voxel_size", &VoxelGridGPU::getVoxelSize, "Get voxel size") .def("__repr__", [](const VoxelGridGPU &grid) { return ""; }); // CameraStreamManager class py::class_(m, "CameraStreamManager") .def(py::init(), py::arg("num_cameras"), "Initialize camera stream manager for parallel processing\n\n" "Parameters:\n" " num_cameras: Number of cameras (and CUDA streams)") .def("set_camera", &CameraStreamManager::setCamera, py::arg("cam_id"), py::arg("position"), py::arg("rotation_matrix"), py::arg("fov_rad"), py::arg("width"), py::arg("height"), "Set camera parameters\n\n" "Parameters:\n" " cam_id: Camera ID (0 to num_cameras-1)\n" " position: 3D array [x, y, z]\n" " rotation_matrix: Flattened 3x3 rotation matrix (9 elements)\n" " fov_rad: Field of view in radians\n" " width: Frame width in pixels\n" " height: Frame height in pixels") .def("process_frames", &CameraStreamManager::processFrames, py::arg("prev_frames"), py::arg("curr_frames"), py::arg("voxel_grid"), py::arg("motion_threshold") = 2.0f, "Process multiple camera frames with motion detection\n\n" "Parameters:\n" " prev_frames: 3D array (num_cameras, height, width)\n" " curr_frames: 3D array (num_cameras, height, width)\n" " voxel_grid: VoxelGridGPU instance\n" " motion_threshold: Pixel difference threshold for motion") .def("process_single_frame", &CameraStreamManager::processSingleFrame, py::arg("cam_id"), py::arg("frame"), py::arg("voxel_grid"), py::arg("min_threshold") = 1e-3f, "Process single frame without motion detection\n\n" "Parameters:\n" " cam_id: Camera ID\n" " frame: 2D array (height, width)\n" " voxel_grid: VoxelGridGPU instance\n" " min_threshold: Minimum pixel value to process") .def("get_num_streams", &CameraStreamManager::getNumStreams, "Get number of CUDA streams") .def("__repr__", [](const CameraStreamManager &mgr) { return ""; }); // Utility functions m.def("print_device_info", &py_printDeviceInfo, py::arg("device_id") = 0, "Print CUDA device information"); m.def("check_compute_capability", &py_checkComputeCapability, py::arg("major"), py::arg("minor"), py::arg("device_id") = 0, "Check if device supports required compute capability"); m.def("optimize_for_8k", &py_optimizeFor8K, "Optimize CUDA settings for 8K video processing"); m.def("detect_motion", &py_detectMotion, py::arg("prev_frame"), py::arg("curr_frame"), py::arg("threshold") = 2.0f, "GPU-accelerated motion detection between two frames\n\n" "Returns: 2D array of absolute differences"); m.def("benchmark", &py_benchmark, py::arg("width") = 7680, py::arg("height") = 4320, py::arg("num_cameras") = 10, py::arg("voxel_size") = 500, py::arg("iterations") = 100, "Run performance benchmark"); m.def("apply_gaussian_blur", &py_applyGaussianBlur, py::arg("voxel_grid"), py::arg("sigma") = 1.0f, "Apply 3D Gaussian blur to voxel grid on GPU\n\n" "Returns: Blurred voxel grid"); // Version info m.attr("__version__") = "0.1.0"; m.attr("CUDA_ENABLED") = true; }