# sprint-econtai/analysis/generators/estimate_ratio_distribution.py

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
import tempfile
import logging

def _compute_log_ratios(processed_df: pd.DataFrame) -> pd.Series:
    """Return log10(upper / lower) for every row with a usable estimate pair."""
    # Coerce to float so object/string-typed numeric columns work; values that
    # cannot be parsed become NaN and are excluded by the comparisons below.
    lower = pd.to_numeric(processed_df['lb_estimate_in_minutes'], errors='coerce').astype(float)
    upper = pd.to_numeric(processed_df['ub_estimate_in_minutes'], errors='coerce').astype(float)

    # A ratio is only meaningful on a log scale when both bounds are strictly
    # positive: a zero lower bound is a division by zero, and a zero/negative
    # upper bound would make log10 return -inf/NaN. Comparisons against NaN
    # are False, so rows with missing bounds drop out here as well.
    usable = (lower > 0) & (upper > 0)
    ratio = upper[usable] / lower[usable]

    # Guard against overflow to inf (huge upper / tiny lower) and underflow
    # to 0 (tiny upper / huge lower), neither of which has a finite log.
    ratio = ratio[np.isfinite(ratio) & (ratio > 0)]
    return np.log10(ratio)


def generate(processed_df: pd.DataFrame):
    """
    Generates a histogram of the log-ratio of upper to lower time estimates.

    This corresponds to 'cell4' from the original analysis notebook. It shows
    the distribution of how many times larger the upper estimate is compared
    to the lower estimate.

    Rows are skipped when either bound is missing, non-numeric, or not
    strictly positive, or when the resulting ratio is not finite. The input
    frame is never modified.

    Args:
        processed_df (pd.DataFrame): The preprocessed data. Expected columns:
                                     'lb_estimate_in_minutes',
                                     'ub_estimate_in_minutes'.

    Returns:
        Path: The path to the generated temporary image file, or None on failure
              (missing columns, no usable rows, or an error while plotting).
    """
    logging.info("Generating distribution plot of estimate ratios...")

    # --- Data Validation and Preparation ---
    required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes']
    if not all(col in processed_df.columns for col in required_cols):
        logging.error("Missing one or more required columns: %s. Cannot generate plot.", required_cols)
        return None

    # We plot the log10 of the ratio to better visualize the wide distribution.
    log_ratio = _compute_log_ratios(processed_df)

    if log_ratio.empty:
        logging.warning("No valid data available to plot the estimate ratio distribution.")
        return None

    # --- Plotting ---
    # Keep a handle on the figure so that cleanup closes exactly the figure
    # created here, never some other figure that happens to be current.
    fig = None
    try:
        fig, ax = plt.subplots(figsize=(10, 6))

        sns.histplot(log_ratio, bins=60, kde=True, ax=ax)

        # Add vertical lines for reference points
        # log10(1) = 0, which is where upper bound equals lower bound
        ax.axvline(x=0, color='black', linestyle='-', linewidth=1.5, label='1x (Upper = Lower)')
        # A small ratio, e.g., 5% difference
        ax.axvline(x=np.log10(1.05), color='orange', linestyle='--', linewidth=1, label='1.05x ratio')
        # A 10x ratio
        ax.axvline(x=np.log10(10), color='red', linestyle='--', linewidth=1, label='10x ratio')

        ax.set_xlabel('log₁₀(Upper Estimate / Lower Estimate)', fontsize=12)
        ax.set_ylabel('Number of Tasks', fontsize=12)
        ax.set_title('Distribution of Time Estimate Ratios', fontsize=16)
        ax.legend()
        ax.grid(axis='y', linestyle='--', alpha=0.7)
        fig.tight_layout()

        # --- File Saving ---
        # The file name is fixed, so each call overwrites the previous image.
        temp_path = Path(tempfile.gettempdir()) / "estimate_ratio_distribution.png"
        fig.savefig(temp_path, dpi=300)
        logging.info("Successfully saved plot to temporary file: %s", temp_path)

        return temp_path

    except Exception as e:
        # Best-effort generator: report the failure and signal it with None
        # rather than propagating plotting/IO errors to the caller.
        logging.error(f"An error occurred while generating the plot: {e}", exc_info=True)
        return None
    finally:
        if fig is not None:
            plt.close(fig)