sprint-econtai/analysis/generators/estimate_ratio_distribution.py
Félix Dorn 43076bcbb1 old
2025-07-15 00:41:05 +02:00

86 lines
3.4 KiB
Python

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
import tempfile
import logging
def generate(processed_df: pd.DataFrame):
    """
    Generates a histogram of the log-ratio of upper to lower time estimates.

    This corresponds to 'cell4' from the original analysis notebook. It shows
    the distribution of how many times larger the upper estimate is compared
    to the lower estimate, plotted on a log10 scale.

    Rows are excluded from the plot when the ratio cannot be represented on a
    log scale: a lower bound that is not strictly positive, a missing or
    infinite ratio, or a ratio that is not strictly positive (e.g. an upper
    estimate of zero). The input frame is never modified.

    Args:
        processed_df (pd.DataFrame): The preprocessed data. Expected columns:
            'lb_estimate_in_minutes',
            'ub_estimate_in_minutes'.

    Returns:
        Path: The path to the generated temporary image file, or None on
        failure (missing columns, no plottable rows, or a plotting error).
        The file name is fixed, so each successful call overwrites the
        image written by the previous one.
    """
    logging.info("Generating distribution plot of estimate ratios...")

    # --- Data Validation and Preparation ---
    required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes']
    if not all(col in processed_df.columns for col in required_cols):
        logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
        return None

    lower = processed_df['lb_estimate_in_minutes']
    upper = processed_df['ub_estimate_in_minutes']

    # A ratio with a zero (or negative) denominator is undefined for this
    # plot, so those rows are filtered out rather than patched with an epsilon.
    has_positive_lower = lower > 0
    estimate_ratio = upper[has_positive_lower] / lower[has_positive_lower]

    # Working on a standalone Series (instead of assigning a new column onto a
    # filtered frame) avoids pandas' SettingWithCopyWarning and leaves the
    # caller's data untouched.
    estimate_ratio = estimate_ratio.replace([np.inf, -np.inf], np.nan).dropna()

    # log10 is only defined for strictly positive values; a single zero or
    # negative ratio would otherwise inject -inf/NaN and can make the
    # histogram computation fail for the entire data set.
    estimate_ratio = estimate_ratio[estimate_ratio > 0]

    if estimate_ratio.empty:
        logging.warning("No valid data available to plot the estimate ratio distribution.")
        return None

    # --- Plotting ---
    fig = None
    try:
        fig, ax = plt.subplots(figsize=(10, 6))

        # We plot the log10 of the ratio to better visualize the wide distribution
        log_ratio = np.log10(estimate_ratio)
        sns.histplot(log_ratio, bins=60, kde=True, ax=ax)

        # Add vertical lines for reference points
        # log10(1) = 0, which is where upper bound equals lower bound
        ax.axvline(x=0, color='black', linestyle='-', linewidth=1.5, label='1x (Upper = Lower)')
        # A small ratio, e.g., 5% difference
        ax.axvline(x=np.log10(1.05), color='orange', linestyle='--', linewidth=1, label='1.05x ratio')
        # A 10x ratio
        ax.axvline(x=np.log10(10), color='red', linestyle='--', linewidth=1, label='10x ratio')

        ax.set_xlabel('log₁₀(Upper Estimate / Lower Estimate)', fontsize=12)
        ax.set_ylabel('Number of Tasks', fontsize=12)
        ax.set_title('Distribution of Time Estimate Ratios', fontsize=16)
        ax.legend()
        ax.grid(axis='y', linestyle='--', alpha=0.7)
        fig.tight_layout()

        # --- File Saving ---
        temp_dir = tempfile.gettempdir()
        temp_path = Path(temp_dir) / "estimate_ratio_distribution.png"
        fig.savefig(temp_path, dpi=300)
        logging.info(f"Successfully saved plot to temporary file: {temp_path}")
        return temp_path
    except Exception as e:
        # Best-effort generator: report the failure and signal it with None
        # instead of propagating, matching the other failure paths above.
        logging.error(f"An error occurred while generating the plot: {e}", exc_info=True)
        return None
    finally:
        # Close only the figure created here; a bare plt.close() would close
        # whatever figure happens to be current, which may belong to a caller
        # if figure creation itself failed.
        if fig is not None:
            plt.close(fig)