import logging
import tempfile
from pathlib import Path
from typing import Optional, Tuple

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker  # noqa: F401  (guarantees `mpl.ticker` is loaded)
import numpy as np
import pandas as pd

# Replicating the color palette from the original notebook for consistency.
# These appear to be inspired by Tailwind CSS colors.
# NOTE(review): the lime hex values do not seem to line up with Tailwind's
# shade numbers of the same name (e.g. '#A3E635' looks like Tailwind lime-400);
# treat the keys as local labels only -- confirm before "correcting" them.
GRAY_PALETTE = {
    '100': '#F3F4F6',
    '300': '#D1D5DB',
}
LIME_PALETTE = {
    '300': '#D9F99D',
    '600': '#A3E635',  # A mid-tone lime
    '900': '#4D7C0F',  # A dark lime/green
}

# Human-friendly duration marks (in minutes) paired with their axis labels.
# Kept as pairs so a tick position can never drift apart from its label.
_DURATION_TICKS = (
    (1, '1 min'),
    (5, '5 min'),
    (10, '10 min'),
    (30, '30 min'),
    (60, '1 hr'),
    (120, '2 hrs'),
    (240, '4 hrs'),
    (480, '8 hrs'),
    (1440, '1 day'),
    (2880, '2 days'),
    (10080, '1 week'),
    (43200, '30 days'),
)

_REQUIRED_COLUMNS = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes']


def _calculate_cdf(series: pd.Series) -> Tuple[np.ndarray, np.ndarray]:
    """
    Calculate the empirical Cumulative Distribution Function (CDF) of a series.

    NA values are dropped before the computation.

    Args:
        series (pd.Series): The observations.

    Returns:
        A pair ``(values, percentages)`` of equal-length arrays: the
        observations sorted ascending, and for each one the cumulative
        percentage ``(rank / count) * 100`` where rank starts at 1. Both
        arrays are empty when the series holds no non-NA values.
    """
    values = series.dropna().sort_values().to_numpy()
    count = len(values)
    if count == 0:
        # Avoid a 0-division on empty input; there is nothing to rank.
        return values, np.empty(0, dtype=float)
    percentages = np.arange(1, count + 1) / count * 100
    return values, percentages


def _draw_reference_grid(ax, ticks) -> None:
    """Draw the light horizontal (every 10%) and dashed vertical (duration) guides."""
    for y_val in range(0, 101, 10):
        ax.axhline(y_val, color=GRAY_PALETTE['100'], linewidth=0.8, zorder=1)
    for tick in ticks:
        ax.axvline(tick, color=GRAY_PALETTE['300'], linewidth=0.8,
                   linestyle='--', zorder=1)


def _configure_axes(ax, ticks, ticklabels) -> None:
    """Apply the log x-scale, duration ticks, percent y-axis, spines and labels."""
    ax.set_ylim(0, 100)
    ax.set_xscale('log')

    # Custom x-ticks for durations
    ax.set_xticks(ticks)
    ax.set_xticklabels(ticklabels, rotation=45, ha='right')
    ax.minorticks_off()  # Turn off minor ticks for clarity with custom grid

    # Format y-axis as percentages
    ax.yaxis.set_major_formatter(mpl.ticker.PercentFormatter(decimals=0))

    for spine in ['top', 'right']:
        ax.spines[spine].set_visible(False)
    for spine in ['left', 'bottom']:
        ax.spines[spine].set_edgecolor(GRAY_PALETTE['300'])

    # Use ax.text for more control over label placement than ax.set_ylabel/xlabel
    ax.text(-0.07, 1.02, "% of tasks with duration ≤ X", transform=ax.transAxes,
            fontsize=12, fontweight='semibold', va='bottom')
    ax.text(0.5, -0.25, 'Task Duration (X)', transform=ax.transAxes,
            fontsize=12, fontweight='semibold', ha='center')


def generate(processed_df: pd.DataFrame) -> Optional[Path]:
    """
    Generates a Cumulative Distribution Function (CDF) plot for task time estimates.

    This corresponds to the second 'cell11' from the original notebook. It plots
    the CDF for the lower-bound, upper-bound, and mid-point of time estimates,
    showing the percentage of tasks that can be completed within a certain time.

    Only rows where both estimates are strictly positive are plotted, because
    the x-axis is logarithmic. Rows with a missing estimate fail that
    comparison and are therefore excluded as well.

    Args:
        processed_df (pd.DataFrame): The preprocessed data. Expected columns:
            'lb_estimate_in_minutes', 'ub_estimate_in_minutes'. The frame is
            not modified.

    Returns:
        Path: The path to the generated image file
        (``<system temp dir>/temporal_coherence_cdf.png``), or None when a
        required column is missing, no row has positive estimates, or
        plotting/saving fails (the error is logged).
    """
    logging.info("Generating temporal coherence CDF plot...")

    # --- Data Validation and Preparation ---
    required_cols = _REQUIRED_COLUMNS
    if not all(col in processed_df.columns for col in required_cols):
        logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
        return None

    lower = processed_df['lb_estimate_in_minutes']
    upper = processed_df['ub_estimate_in_minutes']

    # Log scale requires positive values.
    positive = (lower > 0) & (upper > 0)
    if not positive.any():
        logging.warning("No data with positive estimates available to generate CDF plot.")
        return None

    # Work on standalone Series instead of assigning a new column into a
    # filtered frame: no full-frame copy and no SettingWithCopyWarning.
    lower = lower[positive]
    upper = upper[positive]
    midpoint = (lower + upper) / 2

    # Prepare data for CDF plots
    x_lb, y_lb = _calculate_cdf(lower)
    x_ub, y_ub = _calculate_cdf(upper)
    x_mid, y_mid = _calculate_cdf(midpoint)

    ticks = [minutes for minutes, _ in _DURATION_TICKS]
    ticklabels = [label for _, label in _DURATION_TICKS]

    # --- Plotting ---
    fig = None  # so `finally` only ever closes a figure created here
    try:
        fig, ax = plt.subplots(figsize=(12, 8))

        _draw_reference_grid(ax, ticks)

        # --- CDF Plots ---
        ax.step(x_lb, y_lb, where='post', color=LIME_PALETTE['300'], linewidth=1.8,
                linestyle='--', zorder=2, label='Lower-bound Estimate (CDF)')
        ax.step(x_ub, y_ub, where='post', color=LIME_PALETTE['900'], linewidth=1.8,
                linestyle=':', zorder=3, label='Upper-bound Estimate (CDF)')
        ax.step(x_mid, y_mid, where='post', color=LIME_PALETTE['600'], linewidth=2.2,
                zorder=4, label='Mid-point Estimate (CDF)')

        _configure_axes(ax, ticks, ticklabels)

        ax.legend(frameon=False, loc='lower right')
        fig.suptitle('Cumulative Distribution of Task Time Estimates', fontsize=16, y=0.96)
        fig.tight_layout(rect=[0, 0, 1, 0.95])  # Adjust layout to make space for suptitle

        # --- File Saving ---
        temp_dir = tempfile.gettempdir()
        temp_path = Path(temp_dir) / "temporal_coherence_cdf.png"
        fig.savefig(temp_path, dpi=300, bbox_inches='tight')

        logging.info(f"Successfully saved plot to temporary file: {temp_path}")
        return temp_path
    except Exception as e:
        # Boundary handler: callers rely on "None on failure" rather than an
        # exception, so log with traceback and report failure.
        logging.error(f"An error occurred while generating the CDF plot: {e}", exc_info=True)
        return None
    finally:
        # Close only our own figure; a bare plt.close() would close whatever
        # figure happens to be current, which may belong to the caller.
        if fig is not None:
            plt.close(fig)