import logging
import tempfile
from pathlib import Path
from typing import Optional

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


def generate(processed_df: pd.DataFrame) -> Optional[Path]:
    """Generate a histogram of the task time estimate midpoints.

    This generator corresponds to 'cell1' from the original analysis
    notebook. For every row it computes the midpoint of
    'lb_estimate_in_minutes' and 'ub_estimate_in_minutes' and plots the
    distribution of those midpoints on a logarithmic x-axis, which copes
    with the wide range of values.

    Rows whose midpoint is missing, non-positive, or infinite are left out,
    because they cannot be placed on a log-scaled axis.

    Args:
        processed_df: The preprocessed data, expected to contain the
            'lb_estimate_in_minutes' and 'ub_estimate_in_minutes' columns.
            The frame is not modified.

    Returns:
        The path of the PNG written to the system temporary directory, or
        None if the required columns are missing, no plottable midpoints
        remain, or any error occurs while preparing the data or rendering
        the plot. Errors are logged rather than raised.
    """
    logging.info("Generating task estimate distribution plot...")

    # --- Data Validation ---
    required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes']
    if not all(col in processed_df.columns for col in required_cols):
        logging.error(
            "Required columns %s not found in the DataFrame. "
            "Cannot generate plot.",
            required_cols,
        )
        return None

    # Keep an explicit handle so cleanup closes only the figure created here,
    # never some other "current" pyplot figure.
    fig = None
    try:
        # --- Data Preparation ---
        # Done inside the guarded region so that malformed (e.g. non-numeric)
        # estimate columns yield the documented None result instead of an
        # exception. Only the two needed columns are read, so the caller's
        # frame is neither copied wholesale nor mutated.
        midpoints = (
            processed_df['lb_estimate_in_minutes']
            + processed_df['ub_estimate_in_minutes']
        ) / 2

        # Log scaling needs strictly positive, finite values. NaN fails the
        # `> 0` comparison on its own; +inf has to be excluded explicitly.
        plottable = midpoints[(midpoints > 0) & (midpoints != float("inf"))]
        if plottable.empty:
            logging.warning(
                "No data with positive estimate midpoints available to plot."
            )
            return None

        # --- Plotting ---
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.histplot(
            data=plottable.to_frame('estimate_midpoint'),
            x='estimate_midpoint',
            log_scale=True,
            ax=ax,
        )
        ax.set_title('Distribution of Task Time Estimate Midpoints', fontsize=16)
        ax.set_xlabel('Estimate Midpoint (minutes, log scale)', fontsize=12)
        ax.set_ylabel('Number of Tasks', fontsize=12)
        fig.tight_layout()

        # --- File Saving ---
        # Save into the temporary directory. The orchestrator (`generate.py`)
        # will move this to the final 'dist/' directory.
        temp_path = Path(tempfile.gettempdir()) / "task_estimate_distribution.png"
        fig.savefig(temp_path, dpi=300)

        logging.info("Successfully saved plot to temporary file: %s", temp_path)
        return temp_path

    except Exception as e:
        # Top-level boundary for this generator: log with traceback and signal
        # failure through the return value.
        logging.error(
            "An error occurred while generating the plot: %s", e, exc_info=True
        )
        return None

    finally:
        # Close the figure to free up memory, which is crucial when running
        # many generators.
        if fig is not None:
            plt.close(fig)