86 lines
3.4 KiB
Python
86 lines
3.4 KiB
Python
import seaborn as sns
|
|
import matplotlib.pyplot as plt
|
|
import numpy as np
|
|
import pandas as pd
|
|
from pathlib import Path
|
|
import tempfile
|
|
import logging
|
|
|
|
def _log10_estimate_ratios(processed_df: pd.DataFrame) -> pd.Series:
    """Return log10(upper / lower) for every row where that value is defined.

    Rows are discarded when the lower bound is not strictly positive (the
    ratio is undefined), when the ratio is NaN or infinite, or when the ratio
    is not strictly positive (log10 is undefined there).
    """
    lower = processed_df['lb_estimate_in_minutes']
    upper = processed_df['ub_estimate_in_minutes']

    # Work on the two Series directly instead of assigning a new column onto
    # a filtered frame; that avoids pandas' chained-assignment warning and
    # never touches the caller's DataFrame.
    has_positive_lower = lower > 0
    ratio = upper[has_positive_lower] / lower[has_positive_lower]

    # A huge upper bound over a tiny lower bound can overflow to infinity.
    ratio = ratio.replace([np.inf, -np.inf], np.nan).dropna()

    # An upper bound of zero or below yields a ratio <= 0, for which log10
    # would produce -inf or NaN and corrupt the histogram.
    ratio = ratio[ratio > 0]

    return np.log10(ratio)


def generate(processed_df: pd.DataFrame):
    """
    Generates a histogram of the log-ratio of upper to lower time estimates.

    This corresponds to 'cell4' from the original analysis notebook. It shows
    the distribution of how many times larger the upper estimate is compared
    to the lower estimate, on a log10 scale, with reference lines at 1x,
    1.05x and 10x.

    Args:
        processed_df (pd.DataFrame): The preprocessed data. Expected columns:
            'lb_estimate_in_minutes',
            'ub_estimate_in_minutes'.
            The frame is not modified.

    Returns:
        Path: The path to the generated temporary image file, or None when a
        required column is missing, no row yields a usable ratio, or plotting
        or saving fails. The file name is fixed
        ("estimate_ratio_distribution.png" in the system temp directory), so
        each successful call overwrites the previous image.
    """
    logging.info("Generating distribution plot of estimate ratios...")

    # --- Data Validation and Preparation ---
    required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes']
    if not all(col in processed_df.columns for col in required_cols):
        logging.error(
            "Missing one or more required columns: %s. Cannot generate plot.",
            required_cols,
        )
        return None

    log_ratio = _log10_estimate_ratios(processed_df)
    if log_ratio.empty:
        logging.warning("No valid data available to plot the estimate ratio distribution.")
        return None

    # --- Plotting ---
    # Keep an explicit handle on the figure so cleanup closes exactly this
    # figure and never some other figure that happens to be "current".
    fig = None
    try:
        fig, ax = plt.subplots(figsize=(10, 6))

        # The log10 of the ratio is plotted to better visualize the wide
        # distribution.
        sns.histplot(log_ratio, bins=60, kde=True, ax=ax)

        # Vertical reference lines.
        # log10(1) = 0, which is where upper bound equals lower bound.
        ax.axvline(x=0, color='black', linestyle='-', linewidth=1.5, label='1x (Upper = Lower)')
        # A small ratio, e.g., 5% difference.
        ax.axvline(x=np.log10(1.05), color='orange', linestyle='--', linewidth=1, label='1.05x ratio')
        # A 10x ratio.
        ax.axvline(x=np.log10(10), color='red', linestyle='--', linewidth=1, label='10x ratio')

        ax.set_xlabel('log₁₀(Upper Estimate / Lower Estimate)', fontsize=12)
        ax.set_ylabel('Number of Tasks', fontsize=12)
        ax.set_title('Distribution of Time Estimate Ratios', fontsize=16)
        ax.legend()
        ax.grid(axis='y', linestyle='--', alpha=0.7)
        fig.tight_layout()

        # --- File Saving ---
        temp_path = Path(tempfile.gettempdir()) / "estimate_ratio_distribution.png"
        fig.savefig(temp_path, dpi=300)
        logging.info("Successfully saved plot to temporary file: %s", temp_path)

        return temp_path

    except Exception as e:
        # Boundary handler: callers rely on None (not an exception) to signal
        # that no image was produced; the traceback is preserved in the log.
        logging.error("An error occurred while generating the plot: %s", e, exc_info=True)
        return None

    finally:
        if fig is not None:
            plt.close(fig)