#!/usr/bin/env python3
"""
Example: CUDA-Accelerated Multi-Camera Voxel Processing

This script demonstrates how to use the CUDA voxel processing module
for real-time multi-camera voxel grid reconstruction.

Features demonstrated:
1. GPU device info and capability checking
2. Multi-camera setup with CUDA streams
3. Motion detection on GPU
4. Ray-casting with voxel accumulation
5. Post-processing (Gaussian blur)
6. Performance benchmarking

Usage:
    python example_cuda_usage.py [--num-cameras 10] [--8k]
"""

import numpy as np
import argparse
import time
from pathlib import Path

try:
    import voxel_cuda
    CUDA_AVAILABLE = True
except ImportError as e:
    # Every step of this example needs the compiled extension, so explain how
    # to build it and stop with a failing exit status.
    print(f"CUDA module not available: {e}")
    print("Please compile the CUDA extension first:")
    print("  python setup.py build_ext --inplace")
    CUDA_AVAILABLE = False
    # Raise SystemExit directly: the bare exit() helper is injected by the
    # `site` module for interactive sessions and is not available when the
    # interpreter runs without it (e.g. `python -S`).
    raise SystemExit(1)


def check_gpu_requirements():
    """Print the GPU report and decide whether the device is usable.

    The device info is printed through ``voxel_cuda.print_device_info()``,
    then ``voxel_cuda.check_compute_capability`` is probed for 8.6 and, only
    if that fails, for 7.5.

    Returns:
        True when either probe succeeds, False otherwise.
    """
    rule = "=" * 70
    print(rule)
    print("GPU Requirements Check")
    print(rule)

    voxel_cuda.print_device_info()

    # Preferred tier (RTX 3090: 8.6, RTX 4090: 8.9).
    if voxel_cuda.check_compute_capability(8, 6):
        print("✓ GPU supports RTX 3090/4090 features (Compute 8.6+)")
        return True

    # Fallback tier: usable, but reported as slower.
    if voxel_cuda.check_compute_capability(7, 5):
        print("⚠ GPU is older generation (Compute 7.5+)")
        print("  Performance may be reduced, but should work")
        return True

    # Neither probe passed.
    print("✗ GPU too old (Compute < 7.5)")
    print("  Please use RTX 2080 or newer")
    return False


def create_rotation_matrix(yaw_deg, pitch_deg, roll_deg):
    """Build a 3x3 float32 rotation matrix from Euler angles in degrees.

    The result is the product ``Rz(yaw) @ Ry(roll) @ Rx(pitch)``. Note the
    axis assignment: roll drives the rotation about Y and pitch the rotation
    about X (kept as-is to match the C++ code this mirrors).

    Args:
        yaw_deg: Rotation about the Z axis, in degrees.
        pitch_deg: Rotation about the X axis, in degrees.
        roll_deg: Rotation about the Y axis, in degrees.

    Returns:
        numpy array of shape (3, 3) and dtype float32.
    """
    import math

    yaw_rad, pitch_rad, roll_rad = (
        math.radians(angle) for angle in (yaw_deg, pitch_deg, roll_deg)
    )

    cos_yaw, sin_yaw = math.cos(yaw_rad), math.sin(yaw_rad)
    cos_pitch, sin_pitch = math.cos(pitch_rad), math.sin(pitch_rad)
    cos_roll, sin_roll = math.cos(roll_rad), math.sin(roll_rad)

    # Rotation about Z, driven by yaw.
    about_z = np.array(
        [
            [cos_yaw, -sin_yaw, 0.0],
            [sin_yaw, cos_yaw, 0.0],
            [0.0, 0.0, 1.0],
        ],
        dtype=np.float32,
    )

    # Rotation about Y, driven by roll.
    about_y = np.array(
        [
            [cos_roll, 0.0, sin_roll],
            [0.0, 1.0, 0.0],
            [-sin_roll, 0.0, cos_roll],
        ],
        dtype=np.float32,
    )

    # Rotation about X, driven by pitch.
    about_x = np.array(
        [
            [1.0, 0.0, 0.0],
            [0.0, cos_pitch, -sin_pitch],
            [0.0, sin_pitch, cos_pitch],
        ],
        dtype=np.float32,
    )

    # Multiply left to right: (Rz @ Ry) @ Rx.
    combined = np.matmul(np.matmul(about_z, about_y), about_x)
    return combined.astype(np.float32)


def setup_circular_camera_array(num_cameras, radius=1000.0, height=0.0, fov_deg=60.0):
    """
    Place cameras evenly on a circle in the XY plane, yawed to face inward

    Camera ``i`` sits at angle ``2*pi*i/num_cameras`` on the circle and gets
    a yaw of that angle plus 180 degrees, with zero pitch and roll.

    Args:
        num_cameras: How many cameras to place
        radius: Circle radius (distance of each camera from the Z axis)
        height: Z coordinate shared by all cameras
        fov_deg: Field of view in degrees, converted to radians in the output

    Returns:
        List with one (position, rotation_matrix, fov_rad) tuple per camera;
        position is a float32 array of length 3
    """
    rig = []

    for index in range(num_cameras):
        # Evenly spaced angle for this camera.
        theta = 2.0 * np.pi * index / num_cameras

        location = np.array(
            [radius * np.cos(theta), radius * np.sin(theta), height],
            dtype=np.float32,
        )

        # Turn the yaw half a revolution from the placement angle so the
        # camera looks back toward the circle's centre; no pitch or roll.
        orientation = create_rotation_matrix(np.degrees(theta) + 180.0, 0.0, 0.0)

        rig.append((location, orientation, np.radians(fov_deg)))

    return rig


def generate_synthetic_frames(num_cameras, width, height, frame_idx, add_motion=True):
    """
    Generate synthetic test frames with optional motion

    Every camera gets the same diagonal brightness ramp (0 at the top-left
    corner, 255 at the bottom-right), independent Gaussian noise drawn from
    NumPy's global RNG, and optionally a Gaussian bright spot whose column
    advances with ``frame_idx`` (wrapping every 100 frames).

    Args:
        num_cameras: Number of cameras
        width: Frame width
        height: Frame height
        frame_idx: Current frame index
        add_motion: Whether to add synthetic motion

    Returns:
        frames: float32 numpy array (num_cameras, height, width), values
            clamped to [0, 255]
    """
    frames = np.zeros((num_cameras, height, width), dtype=np.float32)

    # The ramp and the spot depend only on the frame geometry and frame_idx,
    # never on the camera, so build them once instead of once per camera.
    x = np.linspace(0, 1, width)
    y = np.linspace(0, 1, height)
    X, Y = np.meshgrid(x, y)

    # Linear diagonal gradient
    base = (X + Y) / 2.0 * 255.0

    spot = None
    if add_motion:
        # Moving bright spot: fixed row, column sweeps right with frame_idx
        spot_x = int(width * 0.3 + (frame_idx % 100) * width * 0.004)
        spot_y = int(height * 0.5)
        spot_radius = 50

        y_coords, x_coords = np.ogrid[:height, :width]
        dist = np.sqrt((x_coords - spot_x)**2 + (y_coords - spot_y)**2)
        spot = np.exp(-(dist**2) / (2 * spot_radius**2)) * 200.0

    for cam_id in range(num_cameras):
        # Only the noise differs between cameras. The addition order
        # (ramp + noise, then + spot) matches the per-camera computation
        # this replaces, so seeded output is unchanged.
        frame = base + np.random.randn(height, width) * 5.0

        if spot is not None:
            frame += spot

        # Clamp to valid range; the assignment casts to float32
        frames[cam_id] = np.clip(frame, 0, 255)

    return frames


def _print_section(title):
    """Print a blank line followed by *title* framed by 70-column rules."""
    print("\n" + "=" * 70)
    print(title)
    print("=" * 70)


def main():
    """Run the end-to-end CUDA voxel example from the command line.

    Steps, in order: parse arguments, check the GPU, create the voxel grid
    and camera stream manager, feed consecutive pairs of synthetic frames to
    ``process_frames``, print timing statistics, copy the grid back to the
    host, blur it when it contains any positive value, and optionally save
    the results and run the extension's built-in benchmark.

    Returns early (without raising) when the GPU check fails. Exits through
    ``parser.error`` when ``--num-cameras`` or ``--voxel-size`` is not
    positive.
    """
    parser = argparse.ArgumentParser(description='CUDA Voxel Processing Example')
    parser.add_argument('--num-cameras', type=int, default=5,
                        help='Number of cameras (default: 5)')
    # '8k' is not a valid attribute name, so give the flag an explicit dest
    # instead of reaching into args.__dict__.
    parser.add_argument('--8k', dest='use_8k', action='store_true',
                        help='Use 8K resolution (7680x4320), default is 1080p')
    parser.add_argument('--frames', type=int, default=10,
                        help='Number of frames to process (default: 10)')
    parser.add_argument('--voxel-size', type=int, default=500,
                        help='Voxel grid size (NxNxN, default: 500)')
    parser.add_argument('--benchmark', action='store_true',
                        help='Run performance benchmark')
    parser.add_argument('--save-output', action='store_true',
                        help='Save voxel grid to file')

    args = parser.parse_args()

    # Reject sizes that cannot describe a real setup before touching the GPU.
    if args.num_cameras < 1:
        parser.error("--num-cameras must be at least 1")
    if args.voxel_size < 1:
        parser.error("--voxel-size must be at least 1")

    # Check GPU
    if not check_gpu_requirements():
        print("\nGPU requirements not met!")
        return

    _print_section("Configuration")

    # Resolution
    if args.use_8k:
        width, height = 7680, 4320
        print("Resolution: 8K (7680x4320)")
        voxel_cuda.optimize_for_8k()
    else:
        width, height = 1920, 1080
        print("Resolution: 1080p (1920x1080)")

    print(f"Number of cameras: {args.num_cameras}")
    print(f"Voxel grid size: {args.voxel_size}³")
    print(f"Frames to process: {args.frames}")

    # Create voxel grid on GPU
    _print_section("Initializing Voxel Grid")

    # Edge length of one voxel; used for the grid and again in the header of
    # the binary output, so keep a single definition.
    voxel_edge = 6.0

    grid_center = np.array([0.0, 0.0, 500.0], dtype=np.float32)
    voxel_grid = voxel_cuda.VoxelGridGPU(
        N=args.voxel_size,
        voxel_size=voxel_edge,
        grid_center=grid_center
    )

    print(f"Created: {voxel_grid}")
    # Estimate assumes 4 bytes per voxel (float32).
    print(f"Memory: ~{(args.voxel_size**3 * 4) / (1024**2):.1f} MB")

    # Setup camera manager
    _print_section("Initializing Camera Streams")

    camera_mgr = voxel_cuda.CameraStreamManager(num_cameras=args.num_cameras)
    print(f"Created: {camera_mgr}")

    # Configure cameras in circular array
    camera_configs = setup_circular_camera_array(
        num_cameras=args.num_cameras,
        radius=1000.0,
        height=0.0,
        fov_deg=60.0
    )

    for cam_id, (position, rotation, fov_rad) in enumerate(camera_configs):
        camera_mgr.set_camera(
            cam_id=cam_id,
            position=position,
            rotation_matrix=rotation.flatten(),
            fov_rad=fov_rad,
            width=width,
            height=height
        )
        print(f"Camera {cam_id}: pos={position}, fov={np.degrees(fov_rad):.1f}°")

    # Process frames
    _print_section("Processing Frames")

    prev_frames = None
    total_time = 0.0
    megapixels = (width * height * args.num_cameras) / 1e6

    for frame_idx in range(args.frames):
        # Generate synthetic frames
        curr_frames = generate_synthetic_frames(
            args.num_cameras, width, height, frame_idx, add_motion=True
        )

        # The first frame has no predecessor to diff against; it only seeds
        # prev_frames.
        if prev_frames is not None:
            # perf_counter is monotonic and high-resolution, unlike time.time.
            # NOTE(review): this measures the wall time of the Python call;
            # if process_frames returns before the GPU work has finished the
            # figures are optimistic - confirm against the extension.
            start_time = time.perf_counter()

            camera_mgr.process_frames(
                prev_frames=prev_frames,
                curr_frames=curr_frames,
                voxel_grid=voxel_grid,
                motion_threshold=2.0
            )

            elapsed = time.perf_counter() - start_time
            total_time += elapsed

            # Calculate metrics
            fps = 1.0 / elapsed if elapsed > 0 else 0
            throughput = megapixels / elapsed if elapsed > 0 else 0

            print(f"Frame {frame_idx:3d}: {elapsed*1000:6.2f} ms "
                  f"({fps:5.1f} FPS, {throughput:6.1f} MP/s)")

        # A fresh array is generated every iteration and never mutated, so
        # rebinding is enough; copying would duplicate every frame.
        prev_frames = curr_frames

    # Statistics (the first frame is never timed, hence frames - 1)
    avg_time = total_time / (args.frames - 1) if args.frames > 1 else 0
    avg_fps = 1.0 / avg_time if avg_time > 0 else 0
    avg_throughput = megapixels / avg_time if avg_time > 0 else 0

    _print_section("Performance Summary")
    print(f"Average time per frame: {avg_time*1000:.2f} ms")
    print(f"Average FPS: {avg_fps:.1f}")
    print(f"Average throughput: {avg_throughput:.1f} MP/s")
    print(f"Total processing time: {total_time:.2f} s")

    # Get results from GPU
    _print_section("Retrieving Results")

    start_time = time.perf_counter()
    voxel_data = voxel_grid.to_host()
    copy_time = time.perf_counter() - start_time

    # Reduce the (potentially very large) grid once per statistic.
    max_value = voxel_data.max()
    nonzero = np.count_nonzero(voxel_data)

    print(f"Copy to host: {copy_time*1000:.2f} ms")
    print(f"Voxel grid shape: {voxel_data.shape}")
    print(f"Voxel grid dtype: {voxel_data.dtype}")
    print(f"Min value: {voxel_data.min():.2f}")
    print(f"Max value: {max_value:.2f}")
    print(f"Mean value: {voxel_data.mean():.2f}")
    print(f"Non-zero voxels: {nonzero} "
          f"({100 * nonzero / voxel_data.size:.3f}%)")

    # Optional: Apply Gaussian blur (skipped when the grid has no positive value)
    blurred = None
    if max_value > 0:
        _print_section("Applying 3D Gaussian Blur")

        start_time = time.perf_counter()
        blurred = voxel_cuda.apply_gaussian_blur(voxel_data, sigma=1.5)
        blur_time = time.perf_counter() - start_time

        print(f"Blur time: {blur_time*1000:.2f} ms")
        print(f"Blurred max value: {blurred.max():.2f}")

    # Save output
    if args.save_output:
        _print_section("Saving Output")

        output_dir = Path("output_cuda")
        output_dir.mkdir(exist_ok=True)

        # Save raw voxel grid
        raw_path = output_dir / "voxel_grid_raw.npy"
        np.save(raw_path, voxel_data)
        print(f"Saved raw voxel grid to: {raw_path}")

        # Save blurred voxel grid (only exists when the blur step ran)
        if blurred is not None:
            blurred_path = output_dir / "voxel_grid_blurred.npy"
            np.save(blurred_path, blurred)
            print(f"Saved blurred voxel grid to: {blurred_path}")

        # Save as binary (matching C++ output format):
        # int32 N, float32 voxel edge length, then the float32 voxel values.
        bin_path = output_dir / "voxel_grid.bin"
        with open(bin_path, 'wb') as f:
            # Write metadata
            f.write(np.array([args.voxel_size], dtype=np.int32).tobytes())
            f.write(np.array([voxel_edge], dtype=np.float32).tobytes())
            # Write data; copy=False avoids duplicating a grid that is
            # already float32.
            f.write(voxel_data.astype(np.float32, copy=False).tobytes())
        print(f"Saved binary voxel grid to: {bin_path}")

    # Run benchmark if requested
    if args.benchmark:
        _print_section("Running Benchmark")

        voxel_cuda.benchmark(
            width=width,
            height=height,
            num_cameras=args.num_cameras,
            voxel_size=args.voxel_size,
            iterations=100
        )

    _print_section("Complete!")


# Run the example only when this file is executed as a script, so importing
# the module (e.g. to reuse the helper functions) does not start processing.
if __name__ == '__main__':
    main()