ConsistentlyInconsistentYT-.../tests/benchmarks/compare_benchmarks.py

#!/usr/bin/env python3
"""
Benchmark Comparison Tool

Compare benchmark results to detect performance regressions.
Compares current benchmark results against a baseline and reports
any significant performance degradation.

Usage:
    python compare_benchmarks.py --baseline baseline.json --current latest.json
"""

import json
import sys
import argparse
from pathlib import Path
from typing import Dict, List, Tuple
from dataclasses import dataclass


@dataclass
class BenchmarkComparison:
    """Comparison result for a single benchmark"""
    name: str
    baseline_fps: float
    current_fps: float
    change_percent: float
    is_regression: bool


class BenchmarkComparator:
    """Compare benchmark results and detect regressions"""

    def __init__(self, threshold_percent: float = 10.0):
        """
        Initialize comparator

        Args:
            threshold_percent: Performance degradation threshold as percentage
        """
        self.threshold = threshold_percent
        self.comparisons: List[BenchmarkComparison] = []

    def load_results(self, filepath: Path) -> Dict:
        """Load benchmark results from JSON file"""
        if not filepath.exists():
            raise FileNotFoundError(f"Benchmark file not found: {filepath}")

        with open(filepath, 'r') as f:
            return json.load(f)

    def compare(self, baseline_file: Path, current_file: Path) -> bool:
        """
        Compare two benchmark result files

        Args:
            baseline_file: Path to baseline results
            current_file: Path to current results

        Returns:
            True if no regressions detected, False otherwise
        """
        print("="*70)
        print(" Benchmark Comparison")
        print("="*70)
        print()

        # Load results
        baseline = self.load_results(baseline_file)
        current = self.load_results(current_file)

        print(f"Baseline: {baseline_file}")
        print(f"Current:  {current_file}")
        print(f"Threshold: {self.threshold}%")
        print()

        # Extract benchmark data
        baseline_benchmarks = self._extract_benchmarks(baseline)
        current_benchmarks = self._extract_benchmarks(current)

        # Compare common benchmarks
        common_names = set(baseline_benchmarks.keys()) & set(current_benchmarks.keys())

        if not common_names:
            print("Warning: No common benchmarks found")
            return True

        print(f"Comparing {len(common_names)} benchmarks...")
        print()

        # Perform comparisons
        has_regression = False
        for name in sorted(common_names):
            baseline_fps = baseline_benchmarks[name]
            current_fps = current_benchmarks[name]

            # Calculate percentage change (negative means worse performance)
            change_percent = ((current_fps - baseline_fps) / baseline_fps) * 100
            is_regression = change_percent < -self.threshold

            comparison = BenchmarkComparison(
                name=name,
                baseline_fps=baseline_fps,
                current_fps=current_fps,
                change_percent=change_percent,
                is_regression=is_regression
            )

            self.comparisons.append(comparison)

            if is_regression:
                has_regression = True

        # Print results
        self._print_results()

        return not has_regression

    def _extract_benchmarks(self, results: Dict) -> Dict[str, float]:
        """Extract benchmark names and FPS values from results"""
        benchmarks = {}

        # Handle different result formats
        if 'suites' in results:
            # Combined results format
            for suite_name, suite_data in results['suites'].items():
                if suite_data.get('completed') and 'results' in suite_data:
                    for result in suite_data['results']:
                        name = result.get('name', 'unknown')
                        fps = result.get('throughput_fps', 0.0)
                        benchmarks[name] = fps

        elif 'results' in results:
            # Single suite format
            for result in results['results']:
                name = result.get('name', 'unknown')
                fps = result.get('throughput_fps', 0.0)
                benchmarks[name] = fps

        elif 'benchmarks' in results:
            # Direct benchmark format
            for name, data in results['benchmarks'].items():
                fps = data.get('fps', data.get('throughput_fps', 0.0))
                benchmarks[name] = fps

        return benchmarks

    def _print_results(self):
        """Print comparison results in a formatted table"""
        print("="*70)
        print(" Results")
        print("="*70)
        print()

        # Print header
        print(f"{'Benchmark':<40} {'Baseline':>10} {'Current':>10} {'Change':>10}")
        print("-"*70)

        # Print each comparison
        for comp in sorted(self.comparisons, key=lambda x: x.change_percent):
            status = "REGRESSION" if comp.is_regression else "OK"
            status_symbol = "✗" if comp.is_regression else "✓"

            color_code = ""
            if comp.is_regression:
                color_code = "\033[91m"  # Red
            elif comp.change_percent > 5:
                color_code = "\033[92m"  # Green (improvement)
            reset_code = "\033[0m" if color_code else ""

            print(f"{status_symbol} {comp.name:<38} "
                  f"{comp.baseline_fps:>9.2f} "
                  f"{comp.current_fps:>9.2f} "
                  f"{color_code}{comp.change_percent:>+9.1f}%{reset_code}")

        print()

        # Print summary
        regressions = [c for c in self.comparisons if c.is_regression]
        improvements = [c for c in self.comparisons if c.change_percent > 5]

        print("="*70)
        print(" Summary")
        print("="*70)
        print()
        print(f"Total benchmarks: {len(self.comparisons)}")
        print(f"Regressions:      {len(regressions)}")
        print(f"Improvements:     {len(improvements)}")
        print()

        if regressions:
            print("\033[91m✗ Performance regressions detected!\033[0m")
            print()
            print("Benchmarks with regressions:")
            for comp in regressions:
                print(f"  - {comp.name}: {comp.change_percent:+.1f}%")
            print()
        else:
            print("\033[92m✓ No performance regressions detected\033[0m")
            print()

    def export_report(self, output_file: Path):
        """Export comparison report to file"""
        report = {
            'threshold_percent': self.threshold,
            'total_benchmarks': len(self.comparisons),
            'regressions_count': len([c for c in self.comparisons if c.is_regression]),
            'comparisons': [
                {
                    'name': c.name,
                    'baseline_fps': c.baseline_fps,
                    'current_fps': c.current_fps,
                    'change_percent': c.change_percent,
                    'is_regression': c.is_regression,
                }
                for c in self.comparisons
            ]
        }

        with open(output_file, 'w') as f:
            json.dump(report, f, indent=2)

        print(f"Report exported to: {output_file}")


def main():
    """Main entry point"""
    parser = argparse.ArgumentParser(
        description='Compare benchmark results and detect performance regressions'
    )

    parser.add_argument(
        '--baseline',
        type=Path,
        required=True,
        help='Path to baseline benchmark results (JSON)'
    )

    parser.add_argument(
        '--current',
        type=Path,
        required=True,
        help='Path to current benchmark results (JSON)'
    )

    parser.add_argument(
        '--threshold',
        type=float,
        default=10.0,
        help='Performance degradation threshold percentage (default: 10.0)'
    )

    parser.add_argument(
        '--fail-on-regression',
        action='store_true',
        help='Exit with error code if regressions detected'
    )

    parser.add_argument(
        '--export',
        type=Path,
        help='Export comparison report to file'
    )

    args = parser.parse_args()

    # Create comparator
    comparator = BenchmarkComparator(threshold_percent=args.threshold)

    try:
        # Perform comparison
        no_regression = comparator.compare(args.baseline, args.current)

        # Export report if requested
        if args.export:
            comparator.export_report(args.export)

        # Exit with appropriate code
        if args.fail_on_regression and not no_regression:
            print("Exiting with error code due to performance regressions")
            sys.exit(1)
        else:
            sys.exit(0)

    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(2)
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON: {e}", file=sys.stderr)
        sys.exit(2)
    except Exception as e:
        print(f"Unexpected error: {e}", file=sys.stderr)
        sys.exit(2)


if __name__ == '__main__':
    main()