# sprint-econtai/analysis/generators/task_estimate_distribution.py

import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import tempfile
import logging
import pandas as pd

def generate(processed_df: pd.DataFrame) -> "Path | None":
    """
    Generate a histogram of the task time estimate midpoints.

    This generator corresponds to 'cell1' from the original analysis notebook.
    It visualizes the distribution of the calculated midpoint of time estimates
    for all tasks on a logarithmic scale to handle the wide range of values.

    The input DataFrame is never modified: the midpoint is computed as a
    standalone Series rather than as a new column on a copy of the frame.

    Args:
        processed_df (pd.DataFrame): The preprocessed data, expected to contain
                                     'lb_estimate_in_minutes' and
                                     'ub_estimate_in_minutes' columns.

    Returns:
        Path: The path to the generated temporary image file
              ("task_estimate_distribution.png" inside the system temp
              directory), or None if the required columns are missing, no
              positive finite midpoints exist, or plotting/saving fails.
    """
    logging.info("Generating task estimate distribution plot...")

    # --- Data Validation and Preparation ---
    required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes']
    if not all(col in processed_df.columns for col in required_cols):
        logging.error(
            "Required columns %s not found in the DataFrame. "
            "Cannot generate plot.",
            required_cols,
        )
        return None

    # Calculate the midpoint from lower and upper bounds, as was done in the
    # notebook. Only the two needed columns are read, so there is no need to
    # copy the whole DataFrame to keep the caller's data untouched.
    midpoints = (
        processed_df['lb_estimate_in_minutes'] + processed_df['ub_estimate_in_minutes']
    ) / 2
    midpoints = midpoints.rename('estimate_midpoint')

    # A log axis can only place positive, finite values. NaN fails both
    # comparisons and is dropped; +inf is dropped by the upper bound.
    midpoints = midpoints[(midpoints > 0) & (midpoints < float('inf'))]
    if midpoints.empty:
        logging.warning("No data with positive estimate midpoints available to plot.")
        return None

    # --- Plotting ---
    # Keep an explicit handle on the figure so cleanup closes exactly the
    # figure created here and never some other "current" figure.
    fig = None
    try:
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.histplot(x=midpoints, log_scale=True, ax=ax)

        ax.set_title('Distribution of Task Time Estimate Midpoints', fontsize=16)
        ax.set_xlabel('Estimate Midpoint (minutes, log scale)', fontsize=12)
        ax.set_ylabel('Number of Tasks', fontsize=12)
        fig.tight_layout()

        # --- File Saving ---
        # Save the plot to a file in the system temp directory. The
        # orchestrator (`generate.py`) will move this to the final 'dist/'
        # directory.
        temp_path = Path(tempfile.gettempdir()) / "task_estimate_distribution.png"

        fig.savefig(temp_path, dpi=300)
        logging.info("Successfully saved plot to temporary file: %s", temp_path)

        return temp_path

    except Exception as e:
        # Boundary handler: a failing generator must not abort the whole run,
        # so log the traceback and signal failure with None.
        logging.error("An error occurred while generating the plot: %s", e, exc_info=True)
        return None
    finally:
        # Close the figure to free up memory, which is crucial when running
        # many generators. Skip if figure creation itself failed.
        if fig is not None:
            plt.close(fig)