sprint-econtai/analysis/generators/temporal_coherence_cdf.py
Félix Dorn 43076bcbb1 old
2025-07-15 00:41:05 +02:00

134 lines
5.4 KiB
Python

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from pathlib import Path
import tempfile
import logging
# Colour constants shared by the plot below. They replicate the palette used in
# the original notebook so that generated figures stay visually consistent.
# The hex values look like Tailwind CSS colours.
# NOTE(review): the lime keys ('300', '600', '900') do not appear to line up with
# the Tailwind shade numbers for these hex values -- treat the keys as local
# labels (light / mid / dark) and confirm before cross-referencing Tailwind.

# Neutral greys: used for reference lines and axis spines.
GRAY_PALETTE = {
    '100': '#F3F4F6',
    '300': '#D1D5DB',
}

# Limes: used for the three CDF curves, from lightest to darkest.
LIME_PALETTE = {
    '300': '#D9F99D',  # light lime
    '600': '#A3E635',  # mid-tone lime
    '900': '#4D7C0F',  # dark lime/green
}
def _calculate_cdf(series: pd.Series):
    """
    Calculates the empirical Cumulative Distribution Function (CDF) for a series.

    Missing values are dropped before the CDF is computed, so they count
    neither towards the sorted values nor towards the total.

    Args:
        series (pd.Series): The observations to summarise.

    Returns:
        tuple: ``(x, y)`` as two NumPy arrays of equal length. ``x`` holds the
        non-missing values in ascending order; ``y[i]`` is the percentage
        (0-100] of observations that are ``<= x[i]`` by rank, i.e.
        ``(i + 1) / n * 100``. Both arrays are empty when the series has no
        non-missing values.
    """
    sorted_values = series.dropna().sort_values().to_numpy()
    count = len(sorted_values)

    # Handle the empty case explicitly instead of relying on dividing an
    # empty array by zero.
    if count == 0:
        return sorted_values, np.array([], dtype=float)

    # Rank-based cumulative percentage. Built with NumPy so that both returned
    # objects are plain ndarrays (not a mix of ndarray and pandas Index).
    cumulative_pct = (np.arange(1, count + 1) / count) * 100
    return sorted_values, cumulative_pct
def generate(processed_df: pd.DataFrame):
    """
    Generates a Cumulative Distribution Function (CDF) plot for task time estimates.

    This corresponds to the second 'cell11' from the original notebook. It plots
    the CDF for the lower-bound, upper-bound, and mid-point of time estimates,
    showing the percentage of tasks that can be completed within a certain time.

    Rows are only plotted when both estimates are numeric, finite and strictly
    positive (the x-axis is logarithmic). The input frame is never modified.

    Args:
        processed_df (pd.DataFrame): The preprocessed data. Expected columns:
                                     'lb_estimate_in_minutes',
                                     'ub_estimate_in_minutes'.

    Returns:
        Path: The path to the generated temporary image file, or None when a
        required column is missing, no usable rows remain, or plotting/saving
        fails (the failure is logged).
    """
    # Imported explicitly: `import matplotlib as mpl` alone does not guarantee
    # that the `ticker` submodule is reachable as `mpl.ticker`.
    from matplotlib.ticker import PercentFormatter

    logging.info("Generating temporal coherence CDF plot...")

    # --- Data Validation and Preparation ---
    required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes']
    if any(col not in processed_df.columns for col in required_cols):
        logging.error(
            "Missing one or more required columns: %s. Cannot generate plot.",
            required_cols,
        )
        return None

    # Only the two estimate columns are needed, so avoid copying the whole
    # frame. Non-numeric entries become NaN and are filtered out below rather
    # than raising during the comparison.
    estimates = processed_df[required_cols].apply(pd.to_numeric, errors='coerce')
    lb_all = estimates['lb_estimate_in_minutes']
    ub_all = estimates['ub_estimate_in_minutes']

    # Log scale requires positive values; infinities cannot be placed on the
    # axis either. NaN fails every comparison and is therefore dropped too.
    usable = (lb_all > 0) & (ub_all > 0) & (lb_all < np.inf) & (ub_all < np.inf)
    lower = lb_all[usable]
    upper = ub_all[usable]
    if lower.empty:
        logging.warning("No data with positive estimates available to generate CDF plot.")
        return None

    # Mid-point estimate between the two bounds.
    midpoint = (lower + upper) / 2

    # Prepare data for CDF plots
    x_lb, y_lb = _calculate_cdf(lower)
    x_ub, y_ub = _calculate_cdf(upper)
    x_mid, y_mid = _calculate_cdf(midpoint)

    # Human-friendly durations (in minutes) and their labels, kept as pairs so
    # positions and labels cannot drift out of sync.
    duration_ticks = [
        (1, '1 min'),
        (5, '5 min'),
        (10, '10 min'),
        (30, '30 min'),
        (60, '1 hr'),
        (120, '2 hrs'),
        (240, '4 hrs'),
        (480, '8 hrs'),
        (1440, '1 day'),
        (2880, '2 days'),
        (10080, '1 week'),
        (43200, '30 days'),
    ]
    ticks = [minutes for minutes, _ in duration_ticks]
    ticklabels = [label for _, label in duration_ticks]

    # --- Plotting ---
    fig = None  # so the cleanup below is safe even if figure creation fails
    try:
        fig, ax = plt.subplots(figsize=(12, 8))

        # --- Grid and Reference Lines ---
        # Horizontal reference lines for percentages
        for y_val in range(0, 101, 10):
            ax.axhline(y_val, color=GRAY_PALETTE['100'], linewidth=0.8, zorder=1)
        # Vertical reference lines for human-friendly durations
        for tick in ticks:
            ax.axvline(tick, color=GRAY_PALETTE['300'], linewidth=0.8, linestyle='--', zorder=1)

        # --- CDF Plots ---
        ax.step(x_lb, y_lb, where='post', color=LIME_PALETTE['300'], linewidth=1.8,
                linestyle='--', zorder=2, label='Lower-bound Estimate (CDF)')
        ax.step(x_ub, y_ub, where='post', color=LIME_PALETTE['900'], linewidth=1.8,
                linestyle=':', zorder=3, label='Upper-bound Estimate (CDF)')
        ax.step(x_mid, y_mid, where='post', color=LIME_PALETTE['600'], linewidth=2.2,
                zorder=4, label='Mid-point Estimate (CDF)')

        # --- Axes Configuration ---
        ax.set_ylim(0, 100)
        ax.set_xscale('log')
        # Custom x-ticks for durations
        ax.set_xticks(ticks)
        ax.set_xticklabels(ticklabels, rotation=45, ha='right')
        ax.minorticks_off()  # Turn off minor ticks for clarity with custom grid
        # Format y-axis as percentages
        ax.yaxis.set_major_formatter(PercentFormatter(decimals=0))

        # --- Spines and Labels ---
        for spine in ['top', 'right']:
            ax.spines[spine].set_visible(False)
        for spine in ['left', 'bottom']:
            ax.spines[spine].set_edgecolor(GRAY_PALETTE['300'])

        # Use ax.text for more control over label placement than ax.set_ylabel/xlabel
        ax.text(-0.07, 1.02, "% of tasks with duration ≤ X", transform=ax.transAxes,
                fontsize=12, fontweight='semibold', va='bottom')
        ax.text(0.5, -0.25, 'Task Duration (X)', transform=ax.transAxes,
                fontsize=12, fontweight='semibold', ha='center')

        ax.legend(frameon=False, loc='lower right')
        fig.suptitle('Cumulative Distribution of Task Time Estimates', fontsize=16, y=0.96)
        # Adjust layout to make space for suptitle. Operate on `fig` directly
        # rather than on pyplot's implicit "current figure".
        fig.tight_layout(rect=[0, 0, 1, 0.95])

        # --- File Saving ---
        temp_path = Path(tempfile.gettempdir()) / "temporal_coherence_cdf.png"
        fig.savefig(temp_path, dpi=300, bbox_inches='tight')
        logging.info("Successfully saved plot to temporary file: %s", temp_path)
        return temp_path
    except Exception as e:
        # Boundary handler: callers expect None on failure, with the traceback logged.
        logging.error("An error occurred while generating the CDF plot: %s", e, exc_info=True)
        return None
    finally:
        # Close exactly the figure created here; a bare plt.close() would close
        # whatever figure happens to be current.
        if fig is not None:
            plt.close(fig)