import logging
import tempfile
from pathlib import Path
from typing import Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


def _compute_estimate_ratios(processed_df: pd.DataFrame) -> pd.Series:
    """Return the finite, strictly positive upper/lower estimate ratios.

    Rows are discarded when the lower bound is not strictly positive (the
    ratio would be undefined), when the ratio is NaN or infinite, or when the
    ratio is not strictly positive (its log10 would be -inf or NaN).
    """
    lower = processed_df['lb_estimate_in_minutes']
    upper = processed_df['ub_estimate_in_minutes']

    # Work on the two Series directly instead of filtering a DataFrame and
    # then assigning a new column to the slice, which is the pattern that
    # triggers pandas' SettingWithCopyWarning.
    has_positive_lower = lower > 0
    ratio = upper[has_positive_lower] / lower[has_positive_lower]

    # A huge upper bound over a tiny lower bound can overflow to infinity.
    ratio = ratio.replace([np.inf, -np.inf], np.nan).dropna()

    # log10 is only defined for strictly positive values; an upper bound of
    # zero or below must not reach the plot.
    return ratio[ratio > 0].rename('estimate_ratio')


def generate(processed_df: pd.DataFrame) -> Optional[Path]:
    """
    Generates a histogram of the log-ratio of upper to lower time estimates.

    This corresponds to 'cell4' from the original analysis notebook. It shows
    the distribution of how many times larger the upper estimate is compared
    to the lower estimate.

    Args:
        processed_df (pd.DataFrame): The preprocessed data. Expected columns:
            'lb_estimate_in_minutes', 'ub_estimate_in_minutes'. The frame is
            not modified.

    Returns:
        Path: The path to the generated temporary image file, or None when a
        required column is missing, no row yields a plottable ratio, or
        plotting/saving fails.
    """
    logging.info("Generating distribution plot of estimate ratios...")

    # --- Data Validation and Preparation ---
    required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes']
    if not all(col in processed_df.columns for col in required_cols):
        logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
        return None

    estimate_ratio = _compute_estimate_ratios(processed_df)
    if estimate_ratio.empty:
        logging.warning("No valid data available to plot the estimate ratio distribution.")
        return None

    # --- Plotting ---
    # Track the figure explicitly so the cleanup below closes exactly the
    # figure created here and never an unrelated "current" figure.
    fig = None
    try:
        fig, ax = plt.subplots(figsize=(10, 6))

        # We plot the log10 of the ratio to better visualize the wide distribution.
        log_ratio = np.log10(estimate_ratio)
        sns.histplot(log_ratio, bins=60, kde=True, ax=ax)

        # Reference lines. log10(1) = 0 is where the upper bound equals the lower bound.
        ax.axvline(x=0, color='black', linestyle='-', linewidth=1.5, label='1x (Upper = Lower)')
        # A small ratio, e.g. a 5% difference.
        ax.axvline(x=np.log10(1.05), color='orange', linestyle='--', linewidth=1, label='1.05x ratio')
        # A 10x ratio.
        ax.axvline(x=np.log10(10), color='red', linestyle='--', linewidth=1, label='10x ratio')

        ax.set_xlabel('log₁₀(Upper Estimate / Lower Estimate)', fontsize=12)
        ax.set_ylabel('Number of Tasks', fontsize=12)
        ax.set_title('Distribution of Time Estimate Ratios', fontsize=16)
        ax.legend()
        ax.grid(axis='y', linestyle='--', alpha=0.7)
        fig.tight_layout()

        # --- File Saving ---
        # The file name is fixed, so each call overwrites the previous plot.
        temp_path = Path(tempfile.gettempdir()) / "estimate_ratio_distribution.png"
        fig.savefig(temp_path, dpi=300)

        logging.info(f"Successfully saved plot to temporary file: {temp_path}")
        return temp_path
    except Exception as e:
        # Boundary handler: callers rely on a None return instead of an
        # exception when plotting or saving fails.
        logging.error(f"An error occurred while generating the plot: {e}", exc_info=True)
        return None
    finally:
        if fig is not None:
            plt.close(fig)