134 lines
5.4 KiB
Python
134 lines
5.4 KiB
Python
import pandas as pd
|
|
import numpy as np
|
|
import matplotlib.pyplot as plt
|
|
import matplotlib as mpl
|
|
from pathlib import Path
|
|
import tempfile
|
|
import logging
|
|
|
|
# Color constants copied from the original notebook so this figure stays
# visually consistent with it. The values appear to be inspired by the
# Tailwind CSS color scales.
# NOTE(review): the keys are used here as local labels only; they do not all
# seem to match the Tailwind shade number of the same hex value (e.g. #A3E635
# looks like Tailwind lime-400, not lime-600) - confirm before "correcting".

# Neutral grays used for reference/grid lines and axis spines.
GRAY_PALETTE = {
    "100": "#F3F4F6",
    "300": "#D1D5DB",
}

# Lime/green tones used for the three CDF curves.
LIME_PALETTE = {
    "300": "#D9F99D",
    "600": "#A3E635",  # A mid-tone lime
    "900": "#4D7C0F",  # A dark lime/green
}
|
|
|
|
|
|
def _calculate_cdf(series: pd.Series):
    """
    Builds the empirical Cumulative Distribution Function (CDF) of a series.

    Missing values are discarded and the remaining values are sorted in
    ascending order. The k-th smallest value (1-based) is paired with
    ``k / n * 100``, where ``n`` is the number of remaining values.

    Returns a tuple ``(values, percentages)``: the sorted values (as given by
    ``.values``) and the matching cumulative percentages (a pandas Index).
    """
    ordered = series.dropna().sort_values().reset_index(drop=True)
    total = len(ordered)

    # After reset_index the index runs 0..n-1, so index + 1 is the 1-based rank.
    ranks = ordered.index + 1
    percentages = (ranks / total) * 100

    return ordered.values, percentages
|
|
|
|
|
|
def generate(processed_df: pd.DataFrame) -> "Path | None":
    """
    Generates a Cumulative Distribution Function (CDF) plot for task time estimates.

    This corresponds to the second 'cell11' from the original notebook. It plots
    the CDF for the lower-bound, upper-bound, and mid-point of time estimates,
    showing the percentage of tasks that can be completed within a certain time.
    The duration axis is logarithmic, so rows where either estimate is missing
    or not strictly positive are dropped before plotting.

    Args:
        processed_df (pd.DataFrame): The preprocessed data. Expected columns:
            'lb_estimate_in_minutes',
            'ub_estimate_in_minutes'.
            The frame is only read, never modified.

    Returns:
        Path: The path to the generated temporary image file, or None on failure
        (input is None, a required column is missing, no row has two positive
        estimates, or plotting/saving raised an exception). The file name is
        fixed, so each successful call overwrites the previous image.
    """
    logging.info("Generating temporal coherence CDF plot...")

    # --- Data Validation and Preparation ---
    if processed_df is None:
        # Honour the "None on failure" contract instead of raising AttributeError.
        logging.error("No input DataFrame provided. Cannot generate plot.")
        return None

    required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes']
    if not all(col in processed_df.columns for col in required_cols):
        logging.error("Missing one or more required columns: %s. Cannot generate plot.", required_cols)
        return None

    lower = processed_df['lb_estimate_in_minutes']
    upper = processed_df['ub_estimate_in_minutes']

    # Log scale requires positive values. Working on the two Series directly
    # avoids copying the whole frame and avoids assigning a new column onto a
    # filtered slice (which can trigger pandas' SettingWithCopyWarning).
    positive = (lower > 0) & (upper > 0)
    if not positive.any():
        logging.warning("No data with positive estimates available to generate CDF plot.")
        return None

    lower = lower[positive]
    upper = upper[positive]

    # Calculate mid-point estimate
    midpoint = (lower + upper) / 2

    # Prepare data for CDF plots
    x_lb, y_lb = _calculate_cdf(lower)
    x_ub, y_ub = _calculate_cdf(upper)
    x_mid, y_mid = _calculate_cdf(midpoint)

    # Human-friendly durations in minutes. Positions and labels are kept as
    # pairs so the two can never drift out of sync.
    duration_ticks = [
        (1, '1 min'), (5, '5 min'), (10, '10 min'), (30, '30 min'),
        (60, '1 hr'), (120, '2 hrs'), (240, '4 hrs'), (480, '8 hrs'),
        (1440, '1 day'), (2880, '2 days'), (10080, '1 week'), (43200, '30 days'),
    ]
    ticks = [minutes for minutes, _ in duration_ticks]
    ticklabels = [label for _, label in duration_ticks]

    # --- Plotting ---
    # Track the figure explicitly so cleanup only ever closes the figure created
    # here, never whatever happens to be pyplot's "current" figure.
    fig = None
    try:
        fig, ax = plt.subplots(figsize=(12, 8))

        # --- Grid and Reference Lines ---
        # Horizontal reference lines for percentages
        for y_val in range(0, 101, 10):
            ax.axhline(y_val, color=GRAY_PALETTE['100'], linewidth=0.8, zorder=1)

        # Vertical reference lines for human-friendly durations
        for tick in ticks:
            ax.axvline(tick, color=GRAY_PALETTE['300'], linewidth=0.8, linestyle='--', zorder=1)

        # --- CDF Plots ---
        ax.step(x_lb, y_lb, where='post', color=LIME_PALETTE['300'], linewidth=1.8,
                linestyle='--', zorder=2, label='Lower-bound Estimate (CDF)')
        ax.step(x_ub, y_ub, where='post', color=LIME_PALETTE['900'], linewidth=1.8,
                linestyle=':', zorder=3, label='Upper-bound Estimate (CDF)')
        ax.step(x_mid, y_mid, where='post', color=LIME_PALETTE['600'], linewidth=2.2,
                zorder=4, label='Mid-point Estimate (CDF)')

        # --- Axes Configuration ---
        ax.set_ylim(0, 100)
        ax.set_xscale('log')

        # Custom x-ticks for durations
        ax.set_xticks(ticks)
        ax.set_xticklabels(ticklabels, rotation=45, ha='right')
        ax.minorticks_off()  # Turn off minor ticks for clarity with custom grid

        # Format y-axis as percentages
        ax.yaxis.set_major_formatter(mpl.ticker.PercentFormatter(decimals=0))

        # --- Spines and Labels ---
        for spine in ['top', 'right']:
            ax.spines[spine].set_visible(False)
        for spine in ['left', 'bottom']:
            ax.spines[spine].set_edgecolor(GRAY_PALETTE['300'])

        # Use ax.text for more control over label placement than ax.set_ylabel/xlabel
        ax.text(-0.07, 1.02, "% of tasks with duration ≤ X", transform=ax.transAxes,
                fontsize=12, fontweight='semibold', va='bottom')
        ax.text(0.5, -0.25, 'Task Duration (X)', transform=ax.transAxes,
                fontsize=12, fontweight='semibold', ha='center')

        ax.legend(frameon=False, loc='lower right')
        fig.suptitle('Cumulative Distribution of Task Time Estimates', fontsize=16, y=0.96)
        fig.tight_layout(rect=[0, 0, 1, 0.95])  # Adjust layout to make space for suptitle

        # --- File Saving ---
        # NOTE: fixed file name in the shared temp directory; repeated or
        # concurrent calls write to the same path.
        temp_path = Path(tempfile.gettempdir()) / "temporal_coherence_cdf.png"
        fig.savefig(temp_path, dpi=300, bbox_inches='tight')
        logging.info("Successfully saved plot to temporary file: %s", temp_path)

        return temp_path

    except Exception as e:
        # Top-level boundary for plotting/saving: log with traceback, report failure.
        logging.exception("An error occurred while generating the CDF plot: %s", e)
        return None
    finally:
        if fig is not None:
            plt.close(fig)
|