wip

2025-07-15 00:34:54 +02:00 · 2025-07-15 00:34:54 +02:00 · 65dc648797
commit 65dc648797
parent 62296e1b69
37 changed files with 1413 additions and 2433 deletions
--- a/pipeline/generators/init.py
+++ b/pipeline/generators/init.py
@ -1,5 +1,15 @@
 from .estimate_histplot import generate_estimate_histplot
+from .estimates_spread_per_occupation import generate_estimate_spread_per_occupation
+from .estimates_lower_vs_upper_scatter import generate_estimates_lower_vs_upper_scatter
+from .sequential_coherence_cdf import plot_sequential_coherence_cdf
+from .projected_automatable_wage_bill import generate_projected_automatable_wage_bill
+from .projected_task_automation import generate_projected_task_automation_plot

 GENERATORS = [
-    generate_estimate_histplot
+    # Plot generator callables imported at the top of this module.
+    # NOTE(review): presumably the pipeline runner iterates this list and calls
+    # each entry with keyword arguments such as output_dir/df — the caller is
+    # not visible in this change, confirm.
+    generate_estimate_histplot,
+    generate_estimate_spread_per_occupation,
+    generate_estimates_lower_vs_upper_scatter,
+    # NOTE(review): the CDF plot is imported above but left disabled here; the
+    # reason is not recorded in this commit.
+    #plot_sequential_coherence_cdf,
+    generate_projected_automatable_wage_bill,
+    generate_projected_task_automation_plot,
 ]
--- a/pipeline/generators/estimate_histplot.py
+++ b/pipeline/generators/estimate_histplot.py
@ -1,6 +1,32 @@
-from ..run import Run
 from pathlib import Path
 from typing import Generator
+import matplotlib.pyplot as plt
+import seaborn as sns
+import pandas as pd
+from ..utils import style_plot

-def generate_estimate_histplot(run: Run) -> Generator[Path]:
-    raise NotImplementedError
+def generate_estimate_histplot(output_dir: Path, df: pd.DataFrame, **kwargs) -> Generator[Path, None, None]:
+    """
+    Generates a styled histogram of the distribution of midpoint time estimates.
+
+    Args:
+        output_dir: Directory the PNG is written into.
+        df: Task DataFrame; its 'estimate_midpoint' column is plotted.
+        **kwargs: Accepted and ignored.
+
+    Yields:
+        Path of the saved "estimate_distribution_histplot.png".
+    """
+    # NOTE: the return annotation spells out all three Generator parameters;
+    # the one-argument form only evaluates on Python 3.13+.
+    style_plot()
+    output_path = output_dir / "estimate_distribution_histplot.png"
+
+    fig, ax = plt.subplots()
+
+    # log_scale=True makes seaborn bin and draw the x axis logarithmically.
+    sns.histplot(
+        data=df,
+        x='estimate_midpoint',
+        log_scale=True,
+        ax=ax
+    )
+
+    ax.set_xlabel("Task Time (minutes, log scale)")
+    ax.set_ylabel("Number of Tasks")
+    ax.set_title("Distribution of Time Estimates for Atomic Tasks")
+
+    plt.tight_layout()
+    plt.savefig(output_path)
+    # Close explicitly so figures do not accumulate across generators.
+    plt.close(fig)
+
+    yield output_path
--- a/pipeline/generators/estimates_lower_vs_upper_scatter.py
+++ b/pipeline/generators/estimates_lower_vs_upper_scatter.py
@ -0,0 +1,56 @@
+from pathlib import Path
+from typing import Generator
+import matplotlib.pyplot as plt
+import seaborn as sns
+import pandas as pd
+from ..utils import OCCUPATION_MAJOR_CODES, style_plot
+
+
+def generate_estimates_lower_vs_upper_scatter(output_dir: Path, df: pd.DataFrame, **kwargs) -> Generator[Path, None, None]:
+    """
+    Generates a styled scatter plot of lower-bound vs upper-bound time estimates for tasks.
+
+    Args:
+        output_dir: Directory the PNG is written into.
+        df: Task DataFrame; reads the 'lb_estimate_in_minutes',
+            'ub_estimate_in_minutes' and 'onetsoc_major' columns.
+        **kwargs: Accepted and ignored.
+
+    Yields:
+        Path of the saved "estimates_lower_vs_upper_scatter.png".
+    """
+    # NOTE: the return annotation spells out all three Generator parameters;
+    # the one-argument form only evaluates on Python 3.13+.
+    style_plot()
+    output_path = output_dir / "estimates_lower_vs_upper_scatter.png"
+
+    # Work on a copy so the caller's DataFrame keeps its raw codes.
+    plot_df = df.copy()
+    # Replace onetsoc_major codes with their corresponding labels for the plot legend.
+    # Codes missing from OCCUPATION_MAJOR_CODES become NaN under Series.map.
+    plot_df['onetsoc_major'] = plot_df['onetsoc_major'].map(OCCUPATION_MAJOR_CODES)
+
+    fig, ax = plt.subplots(figsize=(12, 10))
+    sns.scatterplot(
+            data=plot_df,
+            x='lb_estimate_in_minutes',
+            y='ub_estimate_in_minutes',
+            alpha=0.3,
+            edgecolor=None,
+            hue="onetsoc_major",
+            ax=ax
+        )
+
+    # 45° reference line (y=x), spanning the overall data range padded by 10%.
+    lims = (
+        min(df['lb_estimate_in_minutes'].min(), df['ub_estimate_in_minutes'].min()),
+        max(df['lb_estimate_in_minutes'].max(), df['ub_estimate_in_minutes'].max())
+    )
+    lims = (lims[0] * 0.9, lims[1] * 1.1)
+    ax.plot(lims, lims, color='black', linestyle='--', linewidth=1, zorder=0)
+
+    # Helper lines where the upper bound is 2x, 10x and 100x the lower bound.
+    for ratio in [2, 10, 100]:
+        ax.plot(lims, [ratio * bound for bound in lims],
+                linestyle=':', color='grey', linewidth=1, zorder=0)
+
+    ax.set_xscale('log')
+    ax.set_yscale('log')
+    ax.set_xlabel('Lower-bound (min, log scale)')
+    ax.set_ylabel('Upper-bound (min, log scale)')
+    ax.set_title('Lower vs Upper Estimates for All Tasks')
+
+    # Legend is anchored outside the axes; bbox_inches='tight' below keeps it in the image.
+    ax.legend(title="Occupation Major Group", bbox_to_anchor=(1.02, 1), loc='upper left')
+
+    plt.tight_layout()
+    plt.savefig(output_path, bbox_inches='tight')
+    plt.close(fig)
+
+    yield output_path
--- a/pipeline/generators/estimates_spread_per_occupation.py
+++ b/pipeline/generators/estimates_spread_per_occupation.py
@ -0,0 +1,39 @@
+from pathlib import Path
+from typing import Generator
+import matplotlib.pyplot as plt
+import seaborn as sns
+import pandas as pd
+from ..utils import OCCUPATION_MAJOR_CODES, style_plot
+
+
+def generate_estimate_spread_per_occupation(output_dir: Path, df: pd.DataFrame, **kwargs) -> Generator[Path, None, None]:
+    """
+    Generates a styled boxplot of the estimate range spread per major occupation group.
+
+    Args:
+        output_dir: Directory the PNG is written into.
+        df: Task DataFrame; reads the 'onetsoc_major' and 'estimate_range' columns.
+        **kwargs: Accepted and ignored.
+
+    Yields:
+        Path of the saved "estimates_spread_per_occupation.png".
+    """
+    # NOTE: the return annotation spells out all three Generator parameters;
+    # the one-argument form only evaluates on Python 3.13+.
+    style_plot()
+    output_path = output_dir / "estimates_spread_per_occupation.png"
+
+    fig, ax = plt.subplots(figsize=(10, 12))
+
+    # Outliers are hidden (showfliers=False) so only the boxes and whiskers are drawn.
+    sns.boxplot(
+        data=df,
+        x='onetsoc_major',
+        y='estimate_range',
+        showfliers=False,
+        ax=ax
+    )
+
+    ax.set_yscale('log')
+    ax.set_xlabel('Occupation')
+    ax.set_ylabel('Range (upper-lower, minutes)')
+    ax.set_title('Spread of time-range estimates per occupation')
+
+    # Get occupation labels from codes for x-axis ticks; unknown codes keep their raw text.
+    labels = [
+        OCCUPATION_MAJOR_CODES.get(tick.get_text(), tick.get_text())
+        for tick in ax.get_xticklabels()
+    ]
+    ax.set_xticklabels(labels, rotation=60, ha='right')
+
+    plt.tight_layout()
+    plt.savefig(output_path)
+    plt.close(fig)
+
+    yield output_path
--- a/pipeline/generators/helpers.py
+++ b/pipeline/generators/helpers.py
@ -1,6 +0,0 @@
-import pandas as pd
-from typings import List
-
-def must_have_columns(df: pd.DataFrame, columns: List[str]):
-    if not all(col in df.columns for col in columns):
-        raise ValueError(f"DataFrame is missing required columns: {columns}")
--- a/pipeline/generators/projected_automatable_wage_bill.py
+++ b/pipeline/generators/projected_automatable_wage_bill.py
@ -0,0 +1,229 @@
+from pathlib import Path
+from typing import Generator, Dict, Tuple, Optional
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.ticker as mticker
+from scipy.stats import linregress
+from datetime import datetime
+from ..utils import style_plot, LIME
+
+def _generate_wage_projection_data(
+    metr_results: Dict,
+    df_with_wages: pd.DataFrame,
+    percentile_key: str,
+    doubling_time_modifier: float,
+) -> Optional[Tuple[pd.DataFrame, pd.DataFrame, float]]:
+    """
+    Builds the wage-bill projection for one AI-progress scenario.
+
+    A log-linear trend is fitted to the METR horizon lengths over release date,
+    its doubling time is scaled by ``doubling_time_modifier``, and the trend is
+    extrapolated at month ends up to 2035-01-01. For every projected horizon the
+    'wage_per_task' of all tasks whose time estimate is at or below that horizon
+    is summed.
+
+    Args:
+        metr_results: The METR benchmark data; read as
+            results -> model -> {"release_date", "agents" -> agent -> percentile_key -> "estimate"}.
+        df_with_wages: Task DataFrame with 'lb_estimate_in_minutes', 'estimate_midpoint',
+            'ub_estimate_in_minutes' and 'wage_per_task' columns.
+        percentile_key: The percentile to use from METR data (e.g., 'p50_horizon_length').
+        doubling_time_modifier: Multiplier for the fitted doubling time (e.g., 1.0 for
+            baseline, 0.5 for optimistic, 2.0 for pessimistic).
+
+    Returns:
+        (metr_df, projection_df, doubling_time_days), or None when fewer than two
+        usable METR data points (positive horizon, release date present) exist.
+    """
+    # One record per (model, agent) pair that has both a release date and an estimate.
+    records = []
+    for model_entry in metr_results.get("results", {}).values():
+        released = model_entry.get("release_date")
+        for agent_entry in model_entry.get("agents", {}).values():
+            horizon = agent_entry.get(percentile_key, {}).get("estimate")
+            if released and horizon is not None:
+                records.append({
+                    "release_date": released,
+                    "horizon_minutes": horizon,
+                })
+
+    if not records:
+        return None
+
+    metr_df = pd.DataFrame(records).sort_values("release_date").reset_index(drop=True)
+    metr_df['release_date'] = pd.to_datetime(metr_df['release_date'])
+    # The fit is done on log(horizon), so non-positive horizons are discarded.
+    metr_df = metr_df[metr_df['horizon_minutes'] > 0].copy()
+
+    if len(metr_df) < 2:
+        return None
+
+    first_release = metr_df['release_date'].min()
+    metr_df['days_since_start'] = (metr_df['release_date'] - first_release).dt.days
+    fit = linregress(metr_df['days_since_start'], np.log(metr_df['horizon_minutes']))
+
+    # Apply the scenario modifier to the doubling time, then convert back to a slope.
+    base_doubling_time_days = np.log(2) / fit.slope
+    modified_doubling_time_days = base_doubling_time_days * doubling_time_modifier
+    modified_slope = np.log(2) / modified_doubling_time_days
+
+    future_dates = pd.to_datetime(pd.date_range(start=first_release, end="2035-01-01", freq="ME"))
+    future_days = (future_dates - first_release).days.to_numpy()
+
+    # The intercept is kept from the unmodified fit; only the growth rate changes.
+    projection_df = pd.DataFrame({
+        "date": future_dates,
+        "projected_coherence_minutes": np.exp(fit.intercept + modified_slope * future_days),
+    })
+
+    def _wage_bill_up_to(column, horizon):
+        """Sum wage_per_task over tasks whose `column` estimate is <= horizon."""
+        return df_with_wages.loc[df_with_wages[column] <= horizon, 'wage_per_task'].sum()
+
+    # Calculate the total wage bill of tasks automated over time, per estimate bound.
+    estimate_columns = {
+        "lb": 'lb_estimate_in_minutes',
+        "mid": 'estimate_midpoint',
+        "ub": 'ub_estimate_in_minutes',
+    }
+    for bound, column in estimate_columns.items():
+        projection_df[f"automatable_wage_bill_{bound}"] = projection_df["projected_coherence_minutes"].apply(
+            lambda h, column=column: _wage_bill_up_to(column, h)
+        )
+
+    # Also calculate for the actual METR data points for plotting.
+    metr_df["automatable_wage_bill_mid"] = metr_df["horizon_minutes"].apply(
+        lambda h: _wage_bill_up_to('estimate_midpoint', h)
+    )
+
+    return metr_df, projection_df, modified_doubling_time_days
+
+
+def _plot_scenario(ax, projection_df, metr_df, label, color, line_style='-'):
+    """
+    Draws one projection scenario on ``ax``.
+
+    Three artists are added in this order: the mid-point wage-bill line
+    (labelled ``label``), a translucent band between the lower- and upper-bound
+    wage bills, and the METR data points as outlined markers.
+    """
+    # Projected wage bill (mid-point estimate).
+    mid_line_style = dict(color=color, linewidth=2.5, linestyle=line_style, zorder=3)
+    ax.plot(
+        projection_df["date"],
+        projection_df["automatable_wage_bill_mid"],
+        label=label,
+        **mid_line_style
+    )
+
+    # Shaded range between the lower- and upper-bound projections, drawn below the line.
+    band_style = dict(color=color, alpha=0.15, zorder=2)
+    ax.fill_between(
+        projection_df["date"],
+        projection_df["automatable_wage_bill_lb"],
+        projection_df["automatable_wage_bill_ub"],
+        **band_style
+    )
+
+    # Actual METR data points against the wage bill, drawn on top.
+    marker_style = dict(color=color, edgecolor='black', s=60, zorder=4)
+    ax.scatter(
+        metr_df['release_date'],
+        metr_df['automatable_wage_bill_mid'],
+        label="Model Capabilities (P50)",
+        **marker_style
+    )
+
+
+def generate_projected_automatable_wage_bill(
+    output_dir: Path,
+    df: pd.DataFrame,
+    task_summary_by_occupation_df: pd.DataFrame,
+    metr_results: Dict,
+    **kwargs,
+) -> Generator[Path, None, None]:
+    """
+    Generates a plot projecting the automatable wage bill under different
+    AI progress scenarios (optimistic, baseline, pessimistic).
+
+    Each occupation's 'wage_bill' is split evenly over its 'total_tasks' to get
+    a per-task wage, which is joined onto the task DataFrame by 'onetsoc_code'.
+    The scenarios differ only in the multiplier applied to the fitted doubling
+    time (0.5, 1.0 and 2.0).
+
+    Args:
+        output_dir: Directory the PNG is written into.
+        df: Task DataFrame with 'onetsoc_code' and the time-estimate columns.
+        task_summary_by_occupation_df: Per-occupation summary with
+            'onetsoc_code', 'wage_bill' and 'total_tasks' columns.
+        metr_results: The METR benchmark data.
+        **kwargs: Accepted and ignored.
+
+    Yields:
+        Path of the saved figure. Nothing is yielded (a warning is printed)
+        when no scenario could be projected.
+    """
+    style_plot()
+    output_path = output_dir / "projected_automatable_wage_bill_sensitivity.png"
+
+    # 1. Calculate wage_per_task for each occupation
+    wage_bill_info = task_summary_by_occupation_df[['onetsoc_code', 'wage_bill', 'total_tasks']].copy()
+    wage_bill_info['wage_per_task'] = wage_bill_info['wage_bill'] / wage_bill_info['total_tasks']
+    # A non-zero wage bill over zero tasks divides to +/-inf; count it as 0.
+    wage_bill_info = wage_bill_info.replace([np.inf, -np.inf], 0)
+    wage_bill_info = wage_bill_info.drop(columns=['wage_bill', 'total_tasks'])
+
+    # 2. Merge wage_per_task into the main task dataframe
+    df_with_wages = pd.merge(df, wage_bill_info, on='onetsoc_code', how='left')
+    # Assign the result back: an in-place fillna on the selected column acts on
+    # a temporary under pandas copy-on-write and would leave the NaNs in place.
+    df_with_wages['wage_per_task'] = df_with_wages['wage_per_task'].fillna(0)
+
+    # 3. Generate data for all three scenarios
+    scenarios = {
+        "Optimistic": {"modifier": 0.5, "color": "tab:green", "style": "--"},
+        "Baseline": {"modifier": 1.0, "color": LIME['600'], "style": "-"},
+        "Pessimistic": {"modifier": 2.0, "color": "tab:red", "style": ":"},
+    }
+
+    projection_results = {}
+    for name, config in scenarios.items():
+        result = _generate_wage_projection_data(metr_results, df_with_wages, 'p50_horizon_length', config['modifier'])
+        if result:
+            projection_results[name] = result
+
+    if not projection_results:
+        print("Warning: Could not generate any projection data. Skipping wage bill plot.")
+        return
+
+    # 4. Create the plot
+    fig, ax = plt.subplots(figsize=(14, 9))
+
+    # We only need to plot the scatter points once, let's use the baseline ones.
+    if "Baseline" in projection_results:
+        metr_df, _, _ = projection_results["Baseline"]
+        ax.scatter(
+            metr_df['release_date'],
+            metr_df['automatable_wage_bill_mid'],
+            color='black',
+            s=80,
+            zorder=5,
+            label="Model Capabilities (P50)"
+        )
+
+    legend_lines = []
+    for name, (_, proj_df, doubling_time) in projection_results.items():
+        config = scenarios[name]
+        ax.plot(
+            proj_df["date"],
+            proj_df["automatable_wage_bill_mid"],
+            color=config['color'],
+            linestyle=config['style'],
+            linewidth=2.5,
+            zorder=3
+        )
+        ax.fill_between(
+            proj_df["date"],
+            proj_df["automatable_wage_bill_lb"],
+            proj_df["automatable_wage_bill_ub"],
+            color=config['color'],
+            alpha=0.15,
+            zorder=2
+        )
+        # Create a custom line for the legend so the doubling time can be shown
+        # in the entry; the plotted lines themselves carry no label.
+        line = plt.Line2D([0], [0], color=config['color'], linestyle=config['style'], lw=2.5,
+                          label=f'{name} (Doubling Time: {doubling_time:.0f} days)')
+        legend_lines.append(line)
+
+    # 5. Styling and annotations
+    ax.set_title("Projected Automatable Wage Bill (P50 Coherence)", fontsize=18, pad=20)
+    ax.set_xlabel("Year", fontsize=12)
+    ax.set_ylabel("Automatable Annual Wage Bill (Trillions of USD)", fontsize=12)
+
+    # Format Y-axis to show trillions
+    def trillions_formatter(x, pos):
+        return f'${x / 1e12:.1f}T'
+    ax.yaxis.set_major_formatter(mticker.FuncFormatter(trillions_formatter))
+
+    # The y axis tops out just above the wage bill summed over every task.
+    total_wage_bill = df_with_wages['wage_per_task'].sum()
+    ax.set_ylim(0, total_wage_bill * 1.05)
+
+    if "Baseline" in projection_results:
+        _, proj_df, _ = projection_results["Baseline"]
+        ax.set_xlim(datetime(2022, 1, 1), proj_df["date"].max())
+
+    # Create the legend from the custom lines and the scatter plot
+    scatter_legend = ax.get_legend_handles_labels()[0]
+    ax.legend(handles=legend_lines + scatter_legend, loc="upper left", fontsize=11)
+
+    ax.grid(True, which="both", linestyle="--", linewidth=0.5)
+    plt.tight_layout()
+    plt.savefig(output_path)
+    plt.close(fig)
+
+    print(f"Generated sensitivity analysis plot: {output_path}")
+    yield output_path
--- a/pipeline/generators/projected_task_automation.py
+++ b/pipeline/generators/projected_task_automation.py
@ -0,0 +1,168 @@
+from pathlib import Path
+from typing import Generator, Dict, Tuple
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from scipy.stats import linregress
+from datetime import datetime
+from ..utils import style_plot, LIME
+
+def _generate_projection_data(
+    metr_results: Dict,
+    df: pd.DataFrame,
+    percentile_key: str,
+) -> Tuple[pd.DataFrame, pd.DataFrame] | None:
+    """
+    Generates projection data for a given percentile key (e.g., 'p50_horizon_length').
+
+    A log-linear trend is fitted to the METR horizon lengths over release date
+    and extrapolated at month ends up to 2035-01-01. For each projected horizon
+    the share of tasks in ``df`` whose time estimate is at or below it is computed.
+
+    Args:
+        metr_results: The METR benchmark data; read as
+            results -> model -> {"release_date", "agents" -> agent -> percentile_key -> "estimate"}.
+        df: Task DataFrame with 'lb_estimate_in_minutes', 'estimate_midpoint'
+            and 'ub_estimate_in_minutes' columns.
+        percentile_key: Which METR horizon estimate to use.
+
+    Returns:
+        A tuple of (metr_df_with_pct, projection_df), or None if data is
+        insufficient (no tasks, no matching models, or fewer than two
+        positive-horizon data points). A warning is printed in each None case.
+    """
+    # Fail fast: percentages are undefined without tasks, so skip the fit entirely.
+    total_tasks = len(df)
+    if total_tasks == 0:
+        print(f"Warning: No tasks to project for {percentile_key}. Skipping.")
+        return None
+
+    # 1. Process METR data to get all model performance over time for the given percentile
+    all_model_data = []
+    for model_name, data in metr_results.get("results", {}).items():
+        release_date_str = data.get("release_date")
+        for agent_name, agent_data in data.get("agents", {}).items():
+            horizon = agent_data.get(percentile_key, {}).get("estimate")
+
+            if release_date_str and horizon is not None:
+                all_model_data.append({
+                    "model": f"{model_name}-{agent_name}",
+                    "release_date": release_date_str,
+                    "horizon_minutes": horizon,
+                })
+
+    if not all_model_data:
+        print(f"Warning: No models with {percentile_key} found in METR data. Skipping.")
+        return None
+
+    metr_df = pd.DataFrame(all_model_data).sort_values("release_date").reset_index(drop=True)
+    metr_df['release_date'] = pd.to_datetime(metr_df['release_date'])
+
+    # 2. Perform log-linear regression on coherence over time
+    # (non-positive horizons cannot be log-transformed and are dropped)
+    metr_df = metr_df[metr_df['horizon_minutes'] > 0].copy()
+    if len(metr_df) < 2:
+        print(f"Warning: Not enough data points for regression for {percentile_key}. Skipping.")
+        return None
+
+    start_date = metr_df['release_date'].min()
+    metr_df['days_since_start'] = (metr_df['release_date'] - start_date).dt.days
+    log_y = np.log(metr_df['horizon_minutes'])
+    x = metr_df['days_since_start']
+
+    slope, intercept, r_value, _, _ = linregress(x, log_y)
+    doubling_time_days = np.log(2) / slope
+    print(f"METR all models {percentile_key} trend: R^2 = {r_value**2:.2f}, Doubling time = {doubling_time_days:.1f} days")
+
+    # 3. Project coherence into the future
+    future_dates = pd.to_datetime(pd.date_range(start=start_date, end="2035-01-01", freq="ME"))
+    future_days = (future_dates - start_date).days.to_numpy()
+
+    projected_log_horizon = intercept + slope * future_days
+    projected_horizon_minutes = np.exp(projected_log_horizon)
+
+    projection_df = pd.DataFrame({
+        "date": future_dates,
+        "projected_coherence_minutes": projected_horizon_minutes,
+    })
+
+    # 4. Calculate the percentage of tasks automated over time based on our estimates
+    for bound in ["lb", "mid", "ub"]:
+        col_name = 'estimate_midpoint' if bound == 'mid' else f'{bound}_estimate_in_minutes'
+        # col_name is bound as a default so each lambda keeps its own column.
+        projection_df[f"pct_automatable_{bound}"] = projection_df["projected_coherence_minutes"].apply(
+            lambda h, col_name=col_name: (df[col_name] <= h).sum() / total_tasks * 100
+        )
+
+    # Same share, evaluated at the observed METR horizons (mid-point estimate only).
+    metr_df["pct_automatable_mid"] = metr_df["horizon_minutes"].apply(
+        lambda h: (df['estimate_midpoint'] <= h).sum() / total_tasks * 100
+    )
+
+    return metr_df, projection_df
+
+
+def _plot_projection(ax, projection_df, metr_df, label, color, line_style='-'):
+    """
+    Draws a single percentile projection on ``ax``.
+
+    Adds, in order: the mid-point automation curve, a translucent band between
+    the lower- and upper-bound curves, and the METR data points. ``label`` is
+    expected to look like "P50"; its first character is dropped to build the
+    "...50% success rate" legend entry.
+    """
+    # Projected automation percentage (mid-point estimate).
+    mid_line_style = dict(color=color, linewidth=2.5, linestyle=line_style, zorder=3)
+    ax.plot(
+        projection_df["date"],
+        projection_df["pct_automatable_mid"],
+        label="Mid-point",
+        **mid_line_style
+    )
+
+    # Band spanned by the lower- and upper-bound estimates, drawn below the line.
+    band_style = dict(color=color, alpha=0.15, zorder=2)
+    ax.fill_between(
+        projection_df["date"],
+        projection_df["pct_automatable_lb"],
+        projection_df["pct_automatable_ub"],
+        label="Lower/upper bound range",
+        **band_style
+    )
+
+    # Actual METR data points, drawn on top.
+    success_rate = label[1:]
+    marker_style = dict(color=color, edgecolor='black', s=60, zorder=4)
+    ax.scatter(
+        metr_df['release_date'],
+        metr_df['pct_automatable_mid'],
+        label=f"Model with {success_rate}% success rate",
+        **marker_style
+    )
+
+
+def _save_projection_figure(output_path, metr_df, projection_df, label, color, title, ylabel):
+    """Draws one percentile projection on a fresh 12x8 figure and saves it to output_path."""
+    fig, ax = plt.subplots(figsize=(12, 8))
+    _plot_projection(ax, projection_df, metr_df, label, color)
+    ax.set_title(title, fontsize=16, pad=20)
+    ax.set_xlabel("Year")
+    ax.set_ylabel(ylabel)
+    # Slight headroom above 100% so a saturated curve is not clipped by the frame.
+    ax.set_ylim(0, 100.5)
+    ax.set_xlim(datetime(2022, 1, 1), projection_df["date"].max())
+    ax.grid(True, which="both", linestyle="--", linewidth=0.5)
+    ax.legend(loc="upper left")
+    plt.tight_layout()
+    plt.savefig(output_path)
+    plt.close(fig)
+
+
+def generate_projected_task_automation_plot(
+    output_dir: Path,
+    metr_results: Dict,
+    df: pd.DataFrame,
+    **kwargs,
+) -> Generator[Path, None, None]:
+    """
+    Generates plots projecting task automation based on METR's p50 and p80
+    coherence data.
+
+    Args:
+        output_dir: Directory the PNGs are written into.
+        metr_results: The METR benchmark data.
+        df: Task DataFrame with the time-estimate columns.
+        **kwargs: Accepted and ignored.
+
+    Yields:
+        "projected_task_automation_p50.png" and then
+        "projected_task_automation_p80.png"; a percentile whose projection
+        data could not be generated is skipped.
+    """
+    style_plot()
+
+    figure_specs = [
+        {
+            "label": "P50",
+            "percentile_key": 'p50_horizon_length',
+            "color": LIME['600'],
+            "title": "How long before sequential coherence stops being a bottleneck?",
+            "ylabel": "% of task automatable (50% success rate)",
+            "filename": "projected_task_automation_p50.png",
+        },
+        {
+            "label": "P80",
+            "percentile_key": 'p80_horizon_length',
+            "color": 'tab:cyan',
+            "title": "Projected Task Automation (P80 AI Coherence)",
+            "ylabel": "% of Estimable Economic Tasks Automatable",
+            "filename": "projected_task_automation_p80.png",
+        },
+    ]
+
+    # Both projections are computed before any figure is drawn, so their
+    # diagnostics are printed together ahead of the plotting.
+    projections = [
+        _generate_projection_data(metr_results, df, spec["percentile_key"])
+        for spec in figure_specs
+    ]
+
+    for spec, projection in zip(figure_specs, projections):
+        if not projection:
+            continue
+        metr_df, projection_df = projection
+        output_path = output_dir / spec["filename"]
+        _save_projection_figure(
+            output_path,
+            metr_df,
+            projection_df,
+            spec["label"],
+            spec["color"],
+            spec["title"],
+            spec["ylabel"],
+        )
+        yield output_path
--- a/pipeline/generators/sequential_coherence_cdf.py
+++ b/pipeline/generators/sequential_coherence_cdf.py
@ -0,0 +1,54 @@
+from pathlib import Path
+import pandas as pd
+import matplotlib.pyplot as plt
+import matplotlib.ticker as mtick
+from ..utils import LIME, style_plot
+
+def plot_sequential_coherence_cdf(output_dir: Path, df: pd.DataFrame, **kwargs):
+    """
+    Plots the empirical CDFs of the lower-bound, upper-bound and mid-point
+    time estimates on a log-scaled x axis and saves the figure.
+
+    Reads the 'lb_estimate_in_minutes', 'ub_estimate_in_minutes' and
+    'estimate_midpoint' columns of ``df``. Yields the path of the saved
+    "sequential_coherence_cdf.png"; extra keyword arguments are ignored.
+    """
+    style_plot()
+    output_path = output_dir / "sequential_coherence_cdf.png"
+
+    def _ecdf(values):
+        """Return (sorted values, cumulative percentage reached at each value)."""
+        ordered = values.sort_values().reset_index(drop=True)
+        cumulative_pct = ((ordered.index + 1) / len(ordered)) * 100
+        return ordered.values, cumulative_pct
+
+    # One entry per curve: source column and the step-plot styling.
+    curve_specs = [
+        ('lb_estimate_in_minutes',
+         dict(color=LIME['300'], linewidth=1.8, linestyle='--', zorder=2, label='Lower bound estimate')),
+        ('ub_estimate_in_minutes',
+         dict(color=LIME['900'], linewidth=1.8, linestyle=':', zorder=3, label='Upper bound estimate')),
+        ('estimate_midpoint',
+         dict(color=LIME['600'], linewidth=2.2, zorder=4, label='Mid-point')),
+    ]
+
+    # All CDFs are computed before the figure is created.
+    curves = [(_ecdf(df[column]), style) for column, style in curve_specs]
+
+    fig, ax = plt.subplots(figsize=(12, 7))
+    for (xs, ys), style in curves:
+        ax.step(xs, ys, where='post', **style)
+
+    # --- Styling and Annotations ---
+    ax.set_xscale('log')
+    ax.set_ylim(0, 100)
+    ax.yaxis.set_major_formatter(mtick.PercentFormatter(decimals=0))
+
+    ax.set_title("% of Tasks With Sequential Coherence ≤ X")
+    ax.set_xlabel("Sequential Coherence (X)")
+    ax.set_ylabel("Cumulative Percentage of Tasks")
+
+    # Custom x-axis ticks: positions in minutes paired with readable labels.
+    tick_marks = [
+        (1, '1 min'), (5, '5 min'), (10, '10 min'), (30, '30 min'),
+        (60, '1 hr'), (120, '2 hr'), (240, '4 hr'), (480, '8 hr'),
+        (1440, '1 day'), (2880, '2 days'), (10080, '1 wk'), (43200, '30 days'),
+        (129600, '90 days'), (259200, '180 days'), (525600, '1 yr'),
+    ]
+    ax.set_xticks([position for position, _ in tick_marks])
+    ax.set_xticklabels([text for _, text in tick_marks], rotation=45, ha='right')
+
+    ax.legend(loc='lower right')
+
+    # --- Save and close ---
+    plt.tight_layout()
+    plt.savefig(output_path, bbox_inches='tight')
+    plt.close(fig)
+
+    yield output_path