wip

2025-07-15 00:34:54 +02:00 · 2025-07-15 00:34:54 +02:00 · 65dc648797
commit 65dc648797
parent 62296e1b69
37 changed files with 1413 additions and 2433 deletions
--- a/pipeline/generators/projected_task_automation.py
+++ b/pipeline/generators/projected_task_automation.py
@ -0,0 +1,168 @@
+from pathlib import Path
+from typing import Generator, Dict, Tuple
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from scipy.stats import linregress
+from datetime import datetime
+from ..utils import style_plot, LIME
+
+def _generate_projection_data(
+    metr_results: Dict,
+    df: pd.DataFrame,
+    percentile_key: str,
+    projection_end: str = "2035-01-01",
+) -> Tuple[pd.DataFrame, pd.DataFrame] | None:
+    """
+    Generates projection data for a given percentile key (e.g., 'p50_horizon_length').
+
+    A log-linear trend (log horizon vs. days since the earliest release date) is fitted
+    to the METR horizon estimates and extrapolated on a month-end grid up to
+    ``projection_end``. Every horizon is then converted into the percentage of rows in
+    ``df`` whose time estimate is at or below it.
+
+    Args:
+        metr_results: Mapping whose "results" entry maps model name ->
+            {"release_date": ..., "agents": {agent name -> {percentile_key: {"estimate": ...}}}}.
+        df: Task table providing the 'lb_estimate_in_minutes', 'estimate_midpoint'
+            and 'ub_estimate_in_minutes' columns.
+        percentile_key: Key looked up in each agent entry.
+        projection_end: Upper bound of the month-end projection grid.
+
+    Returns:
+        A tuple of (metr_df_with_pct, projection_df), or None if data is insufficient:
+        no tasks, fewer than two usable points, all points on the same release date,
+        a non-finite fit, or an empty projection window.
+    """
+    # Every percentage below divides by the task count, so check it before doing any work.
+    total_tasks = len(df)
+    if total_tasks == 0:
+        print(f"Warning: No tasks available for {percentile_key} projection. Skipping.")
+        return None
+
+    # 1. Process METR data to get all model performance over time for the given percentile
+    all_model_data = []
+    for model_name, data in metr_results.get("results", {}).items():
+        release_date_str = data.get("release_date")
+        if not release_date_str:
+            continue
+        for agent_name, agent_data in data.get("agents", {}).items():
+            # `or {}` also covers a key that is present but explicitly null.
+            horizon = (agent_data.get(percentile_key) or {}).get("estimate")
+            if horizon is not None:
+                all_model_data.append({
+                    "model": f"{model_name}-{agent_name}",
+                    "release_date": release_date_str,
+                    "horizon_minutes": horizon,
+                })
+
+    if not all_model_data:
+        print(f"Warning: No models with {percentile_key} found in METR data. Skipping.")
+        return None
+
+    # Parse before sorting so the order is chronological rather than lexicographic.
+    metr_df = pd.DataFrame(all_model_data)
+    metr_df["release_date"] = pd.to_datetime(metr_df["release_date"])
+    metr_df = metr_df.sort_values("release_date").reset_index(drop=True)
+
+    # 2. Perform log-linear regression on coherence over time
+    # The log transform needs strictly positive horizons.
+    metr_df = metr_df[metr_df["horizon_minutes"] > 0].copy()
+    if len(metr_df) < 2:
+        print(f"Warning: Not enough data points for regression for {percentile_key}. Skipping.")
+        return None
+
+    start_date = metr_df["release_date"].min()
+    metr_df["days_since_start"] = (metr_df["release_date"] - start_date).dt.days
+
+    # Several agents of one model share a release date; a line cannot be fitted
+    # through points that all have the same x value.
+    if metr_df["days_since_start"].nunique() < 2:
+        print(f"Warning: All {percentile_key} data points share one release date. Skipping.")
+        return None
+
+    fit = linregress(metr_df["days_since_start"], np.log(metr_df["horizon_minutes"]))
+    slope, intercept = fit.slope, fit.intercept
+    if not (np.isfinite(slope) and np.isfinite(intercept)):
+        print(f"Warning: Regression for {percentile_key} did not produce a finite fit. Skipping.")
+        return None
+
+    doubling_time_days = np.log(2) / slope if slope != 0 else np.inf
+    print(f"METR all models {percentile_key} trend: R^2 = {fit.rvalue**2:.2f}, Doubling time = {doubling_time_days:.1f} days")
+
+    # 3. Project coherence into the future
+    future_dates = pd.date_range(start=start_date, end=projection_end, freq="ME")
+    if len(future_dates) == 0:
+        print(f"Warning: Projection window for {percentile_key} is empty. Skipping.")
+        return None
+    future_days = (future_dates - start_date).days.to_numpy()
+
+    projection_df = pd.DataFrame({
+        "date": future_dates,
+        "projected_coherence_minutes": np.exp(intercept + slope * future_days),
+    })
+
+    # 4. Calculate the percentage of tasks automated over time based on our estimates
+    def _pct_at_or_below(column: str, horizons: np.ndarray) -> np.ndarray:
+        # Sorting once and binary-searching counts "estimate <= horizon" for every
+        # horizon at once. Missing estimates are dropped from the count but still
+        # included in total_tasks, i.e. they are treated as never automatable.
+        estimates = np.sort(df[column].dropna().to_numpy(dtype=float))
+        return np.searchsorted(estimates, horizons, side="right") / total_tasks * 100
+
+    projected_horizons = projection_df["projected_coherence_minutes"].to_numpy()
+    for bound in ["lb", "mid", "ub"]:
+        col_name = 'estimate_midpoint' if bound == 'mid' else f'{bound}_estimate_in_minutes'
+        projection_df[f"pct_automatable_{bound}"] = _pct_at_or_below(col_name, projected_horizons)
+
+    metr_df["pct_automatable_mid"] = _pct_at_or_below(
+        'estimate_midpoint', metr_df["horizon_minutes"].to_numpy(dtype=float)
+    )
+
+    return metr_df, projection_df
+
+
+def _plot_projection(ax, projection_df, metr_df, label, color, line_style='-'):
+    """Draw one projection on ``ax``: trend line, bound band and observed points.
+
+    ``projection_df`` supplies the 'date' and 'pct_automatable_{lb,mid,ub}' columns,
+    ``metr_df`` supplies 'release_date' and 'pct_automatable_mid'. ``label`` is only
+    used for the scatter legend entry, with its first character dropped
+    (e.g. "P50" -> "50"). All three artists share ``color``.
+    """
+    timeline = projection_df["date"]
+
+    # Mid-point trend line, drawn above the shaded band.
+    line_style_kwargs = {
+        "label": "Mid-point",
+        "color": color,
+        "linewidth": 2.5,
+        "linestyle": line_style,
+        "zorder": 3,
+    }
+    ax.plot(timeline, projection_df["pct_automatable_mid"], **line_style_kwargs)
+
+    # Translucent band spanning the lower/upper estimate curves.
+    band_kwargs = {
+        "color": color,
+        "alpha": 0.15,
+        "label": "Lower/upper bound range",
+        "zorder": 2,
+    }
+    ax.fill_between(
+        timeline,
+        projection_df["pct_automatable_lb"],
+        projection_df["pct_automatable_ub"],
+        **band_kwargs,
+    )
+
+    # Observed METR data points go on top of everything else.
+    success_rate = label[1:]
+    marker_kwargs = {
+        "color": color,
+        "edgecolor": 'black',
+        "s": 60,
+        "zorder": 4,
+        "label": f"Model with {success_rate}% success rate",
+    }
+    ax.scatter(metr_df['release_date'], metr_df['pct_automatable_mid'], **marker_kwargs)
+
+
+def _render_projection_figure(output_path, metr_df, projection_df, label, color, title, ylabel):
+    """Render a single projection chart, save it to ``output_path`` and return the path."""
+    fig, ax = plt.subplots(figsize=(12, 8))
+    try:
+        _plot_projection(ax, projection_df, metr_df, label, color)
+        ax.set_title(title, fontsize=16, pad=20)
+        ax.set_xlabel("Year")
+        ax.set_ylabel(ylabel)
+        ax.set_ylim(0, 100.5)
+        ax.set_xlim(datetime(2022, 1, 1), projection_df["date"].max())
+        ax.grid(True, which="both", linestyle="--", linewidth=0.5)
+        ax.legend(loc="upper left")
+        # Use the figure's own methods rather than pyplot's implicit "current figure".
+        fig.tight_layout()
+        fig.savefig(output_path)
+    finally:
+        # Release the figure even when plotting or saving fails.
+        plt.close(fig)
+    return output_path
+
+
+def generate_projected_task_automation_plot(
+    output_dir: Path,
+    metr_results: Dict,
+    df: pd.DataFrame,
+    **kwargs,
+) -> Generator[Path, None, None]:
+    """
+    Generates plots projecting task automation based on METR's p50 and p80
+    coherence data.
+
+    Args:
+        output_dir: Directory the PNG files are written to; created if missing.
+        metr_results: METR results mapping, passed to `_generate_projection_data`.
+        df: Task estimates table, passed to `_generate_projection_data`.
+        **kwargs: Accepted and ignored.
+
+    Yields:
+        The path of each written chart: "projected_task_automation_p50.png" and then
+        "projected_task_automation_p80.png". A chart is skipped when its projection
+        data could not be generated.
+    """
+    style_plot()
+
+    # Both projections are computed before anything is drawn or yielded.
+    charts = [
+        (
+            _generate_projection_data(metr_results, df, 'p50_horizon_length'),
+            "P50",
+            LIME['600'],
+            "How long before sequential coherence stops being a bottleneck?",
+            "% of task automatable (50% success rate)",
+            "projected_task_automation_p50.png",
+        ),
+        (
+            _generate_projection_data(metr_results, df, 'p80_horizon_length'),
+            "P80",
+            'tab:cyan',
+            "Projected Task Automation (P80 AI Coherence)",
+            "% of Estimable Economic Tasks Automatable",
+            "projected_task_automation_p80.png",
+        ),
+    ]
+
+    for data, label, color, title, ylabel, filename in charts:
+        if data is None:
+            continue
+        metr_df, projection_df = data
+        # savefig does not create missing directories.
+        output_dir.mkdir(parents=True, exist_ok=True)
+        yield _render_projection_figure(
+            output_dir / filename, metr_df, projection_df, label, color, title, ylabel
+        )