# sprint-econtai/analysis/generators/temporal_coherence_cdf.py

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from pathlib import Path
import tempfile
import logging

# Colour constants carried over from the original notebook so that the
# generated figure keeps the same look. Keys are shade labels (strings),
# values are hex colours.
# NOTE(review): the gray values match Tailwind CSS gray-100/gray-300, but the
# lime hex values look like Tailwind's lime-200/400/700 rather than the
# 300/600/900 labels used as keys here — confirm which was intended before
# "correcting" either side, since changing a value changes the plot.

# Neutral grays: '100' draws the horizontal percentage guides, '300' draws the
# vertical duration guides and the visible axis spines.
GRAY_PALETTE = {'100': '#F3F4F6', '300': '#D1D5DB'}

# Lime shades, one per CDF curve drawn by ``generate``.
LIME_PALETTE = {
    '300': '#D9F99D',  # lightest: lower-bound estimate curve
    '600': '#A3E635',  # mid-tone: mid-point estimate curve
    '900': '#4D7C0F',  # darkest: upper-bound estimate curve
}


def _calculate_cdf(series: pd.Series):
    """
    Build the empirical CDF of ``series`` as two parallel, plot-ready sequences.

    Missing values are discarded and the remaining observations are sorted in
    ascending order. The k-th smallest observation (counting from 1) is paired
    with the cumulative percentage ``k / n * 100``, where ``n`` is the number
    of non-missing observations, so the last point is always at 100.

    Args:
        series (pd.Series): The observations to summarise.

    Returns:
        tuple: ``(values, percentages)`` - the sorted observations as returned
        by ``Series.values`` and a pandas Index of cumulative percentages of
        the same length.
    """
    observed = series.dropna()
    ordered = observed.sort_values()
    total = len(ordered)

    # 1-based ranks 1..n; dividing by n turns each rank into the fraction of
    # observations that are <= the value at that rank.
    ranks = pd.RangeIndex(start=1, stop=total + 1)
    percentages = (ranks / total) * 100

    return ordered.values, percentages


def generate(processed_df: pd.DataFrame):
    """
    Generates a Cumulative Distribution Function (CDF) plot for task time estimates.

    This corresponds to the second 'cell11' from the original notebook. It plots
    the CDF for the lower-bound, upper-bound, and mid-point of time estimates,
    showing the percentage of tasks that can be completed within a certain time.
    The x-axis is logarithmic, so only rows where both estimates are strictly
    positive are used (rows with a missing estimate are dropped as well, since
    a NaN comparison is false).

    The figure is written to a fixed file name inside the system temporary
    directory, so each call overwrites the image produced by the previous one.

    Args:
        processed_df (pd.DataFrame): The preprocessed data. Expected columns:
                                     'lb_estimate_in_minutes',
                                     'ub_estimate_in_minutes'.
                                     The frame is only read, never modified.

    Returns:
        Path: The path to the generated temporary image file, or None when the
        required columns are missing, no row has positive estimates, or an
        error occurs while drawing/saving the figure.
    """
    logging.info("Generating temporal coherence CDF plot...")

    # --- Data Validation and Preparation ---
    required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes']
    if not all(col in processed_df.columns for col in required_cols):
        logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
        return None

    lower = processed_df['lb_estimate_in_minutes']
    upper = processed_df['ub_estimate_in_minutes']

    # Log scale requires positive values. Working on the two Series directly
    # avoids copying the whole input frame just to read two columns.
    positive = (lower > 0) & (upper > 0)
    if not positive.any():
        logging.warning("No data with positive estimates available to generate CDF plot.")
        return None

    lower = lower[positive]
    upper = upper[positive]
    midpoint = (lower + upper) / 2

    # Prepare data for CDF plots
    x_lb, y_lb = _calculate_cdf(lower)
    x_ub, y_ub = _calculate_cdf(upper)
    x_mid, y_mid = _calculate_cdf(midpoint)

    # Human-friendly durations (in minutes) paired with their axis labels.
    # Keeping each position next to its label prevents the two from drifting
    # out of sync when a tick is added or removed.
    duration_ticks = [
        (1, '1 min'), (5, '5 min'), (10, '10 min'), (30, '30 min'),
        (60, '1 hr'), (120, '2 hrs'), (240, '4 hrs'), (480, '8 hrs'),
        (1440, '1 day'), (2880, '2 days'), (10080, '1 week'), (43200, '30 days'),
    ]
    ticks = [minutes for minutes, _ in duration_ticks]
    ticklabels = [label for _, label in duration_ticks]

    # --- Plotting ---
    # Track the figure explicitly so that cleanup closes exactly the figure
    # created here. A bare plt.close() closes whichever figure is currently
    # active, which could be a caller's figure if plt.subplots() itself fails.
    fig = None
    try:
        fig, ax = plt.subplots(figsize=(12, 8))

        # --- Grid and Reference Lines ---
        # Horizontal reference lines for percentages
        for y_val in range(0, 101, 10):
            ax.axhline(y_val, color=GRAY_PALETTE['100'], linewidth=0.8, zorder=1)

        # Vertical reference lines for human-friendly durations
        for tick in ticks:
            ax.axvline(tick, color=GRAY_PALETTE['300'], linewidth=0.8, linestyle='--', zorder=1)

        # --- CDF Plots ---
        # where='post' holds each percentage until the next observed value,
        # giving the usual right-continuous staircase. The mid-point curve is
        # drawn last (highest zorder) and thicker so it reads as the headline.
        ax.step(x_lb, y_lb, where='post', color=LIME_PALETTE['300'], linewidth=1.8,
                linestyle='--', zorder=2, label='Lower-bound Estimate (CDF)')
        ax.step(x_ub, y_ub, where='post', color=LIME_PALETTE['900'], linewidth=1.8,
                linestyle=':', zorder=3, label='Upper-bound Estimate (CDF)')
        ax.step(x_mid, y_mid, where='post', color=LIME_PALETTE['600'], linewidth=2.2,
                zorder=4, label='Mid-point Estimate (CDF)')

        # --- Axes Configuration ---
        ax.set_ylim(0, 100)
        ax.set_xscale('log')

        # Custom x-ticks for durations
        ax.set_xticks(ticks)
        ax.set_xticklabels(ticklabels, rotation=45, ha='right')
        ax.minorticks_off()  # Turn off minor ticks for clarity with custom grid

        # Format y-axis as percentages
        ax.yaxis.set_major_formatter(mpl.ticker.PercentFormatter(decimals=0))

        # --- Spines and Labels ---
        for spine in ['top', 'right']:
            ax.spines[spine].set_visible(False)
        for spine in ['left', 'bottom']:
            ax.spines[spine].set_edgecolor(GRAY_PALETTE['300'])

        # Use ax.text for more control over label placement than ax.set_ylabel/xlabel
        ax.text(-0.07, 1.02, "% of tasks with duration ≤ X", transform=ax.transAxes,
                fontsize=12, fontweight='semibold', va='bottom')
        ax.text(0.5, -0.25, 'Task Duration (X)', transform=ax.transAxes,
                fontsize=12, fontweight='semibold', ha='center')

        ax.legend(frameon=False, loc='lower right')
        fig.suptitle('Cumulative Distribution of Task Time Estimates', fontsize=16, y=0.96)
        fig.tight_layout(rect=[0, 0, 1, 0.95])  # Adjust layout to make space for suptitle

        # --- File Saving ---
        temp_path = Path(tempfile.gettempdir()) / "temporal_coherence_cdf.png"
        fig.savefig(temp_path, dpi=300, bbox_inches='tight')
        logging.info(f"Successfully saved plot to temporary file: {temp_path}")

        return temp_path

    except Exception as e:
        # Boundary handler: callers get None instead of an exception, with
        # the traceback preserved in the log.
        logging.error(f"An error occurred while generating the CDF plot: {e}", exc_info=True)
        return None
    finally:
        if fig is not None:
            plt.close(fig)