This commit is contained in:
Félix Dorn 2025-07-15 00:34:54 +02:00
parent 62296e1b69
commit 65dc648797
37 changed files with 1413 additions and 2433 deletions

View file

@ -1,5 +1,15 @@
from .estimate_histplot import generate_estimate_histplot
from .estimates_spread_per_occupation import generate_estimate_spread_per_occupation
from .estimates_lower_vs_upper_scatter import generate_estimates_lower_vs_upper_scatter
from .sequential_coherence_cdf import plot_sequential_coherence_cdf
from .projected_automatable_wage_bill import generate_projected_automatable_wage_bill
from .projected_task_automation import generate_projected_task_automation_plot
# Plot generator callables exposed by this package.
GENERATORS = [
    generate_estimate_histplot,
    generate_estimate_spread_per_occupation,
    generate_estimates_lower_vs_upper_scatter,
    # plot_sequential_coherence_cdf,  # disabled for now (still imported above)
    generate_projected_automatable_wage_bill,
    generate_projected_task_automation_plot,
]

View file

@ -1,6 +1,32 @@
from ..run import Run
from pathlib import Path
from typing import Generator
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from ..utils import style_plot
def generate_estimate_histplot(output_dir: Path, df: pd.DataFrame, **kwargs) -> Generator[Path, None, None]:
    """
    Generates a styled histogram of the distribution of midpoint time estimates.

    Args:
        output_dir: Directory the PNG is written into.
        df: Task estimates; must provide an ``estimate_midpoint`` column.
        **kwargs: Accepted but unused by this generator.

    Yields:
        The path of the saved figure (``estimate_distribution_histplot.png``).
    """
    style_plot()

    OUTPUT_PATH = output_dir / "estimate_distribution_histplot.png"

    fig, ax = plt.subplots()
    sns.histplot(
        data=df,
        x='estimate_midpoint',
        log_scale=True,
        ax=ax,
    )

    ax.set_xlabel("Task Time (minutes, log scale)")
    ax.set_ylabel("Number of Tasks")
    ax.set_title("Distribution of Time Estimates for Atomic Tasks")

    fig.tight_layout()
    fig.savefig(OUTPUT_PATH)
    # Close explicitly so figures do not accumulate in pyplot's registry.
    plt.close(fig)

    yield OUTPUT_PATH

View file

@ -0,0 +1,56 @@
from pathlib import Path
from typing import Generator
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from ..utils import OCCUPATION_MAJOR_CODES, style_plot
def generate_estimates_lower_vs_upper_scatter(output_dir: Path, df: pd.DataFrame, **kwargs) -> Generator[Path, None, None]:
    """
    Generates a styled scatter plot of lower-bound vs upper-bound time estimates for tasks.

    Args:
        output_dir: Directory the PNG is written into.
        df: Task estimates; must provide ``lb_estimate_in_minutes``,
            ``ub_estimate_in_minutes`` and ``onetsoc_major`` columns.
        **kwargs: Accepted but unused by this generator.

    Yields:
        The path of the saved figure (``estimates_lower_vs_upper_scatter.png``).
    """
    style_plot()

    OUTPUT_PATH = output_dir / "estimates_lower_vs_upper_scatter.png"

    plot_df = df.copy()
    # Replace onetsoc_major codes with their labels for the plot legend. Codes without
    # a known label keep their raw value instead of turning into NaN hue values.
    plot_df['onetsoc_major'] = plot_df['onetsoc_major'].map(
        lambda code: OCCUPATION_MAJOR_CODES.get(code, code)
    )

    fig, ax = plt.subplots(figsize=(12, 10))
    sns.scatterplot(
        data=plot_df,
        x='lb_estimate_in_minutes',
        y='ub_estimate_in_minutes',
        alpha=0.3,
        edgecolor=None,
        hue="onetsoc_major",
        ax=ax,
    )

    # 45° reference line (y=x), padded by 10% beyond the observed data range
    lowest = min(plot_df['lb_estimate_in_minutes'].min(), plot_df['ub_estimate_in_minutes'].min())
    highest = max(plot_df['lb_estimate_in_minutes'].max(), plot_df['ub_estimate_in_minutes'].max())
    lims = (lowest * 0.9, highest * 1.1)
    ax.plot(lims, lims, color='black', linestyle='--', linewidth=1, zorder=0)

    # Helper lines where the upper bound is k times the lower bound
    for k in [2, 10, 100]:
        ax.plot(lims, [k * bound for bound in lims],
                linestyle=':', color='grey', linewidth=1, zorder=0)

    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlabel('Lower-bound (min, log scale)')
    ax.set_ylabel('Upper-bound (min, log scale)')
    ax.set_title('Lower vs Upper Estimates for All Tasks')
    # The legend is anchored outside the axes; bbox_inches='tight' keeps it in the saved image.
    ax.legend(title="Occupation Major Group", bbox_to_anchor=(1.02, 1), loc='upper left')

    fig.tight_layout()
    fig.savefig(OUTPUT_PATH, bbox_inches='tight')
    plt.close(fig)

    yield OUTPUT_PATH

View file

@ -0,0 +1,39 @@
from pathlib import Path
from typing import Generator
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from ..utils import OCCUPATION_MAJOR_CODES, style_plot
def generate_estimate_spread_per_occupation(output_dir: Path, df: pd.DataFrame, **kwargs) -> Generator[Path, None, None]:
    """
    Generates a styled boxplot of the estimate range spread per major occupation group.

    Args:
        output_dir: Directory the PNG is written into.
        df: Task estimates; must provide ``onetsoc_major`` and ``estimate_range`` columns.
        **kwargs: Accepted but unused by this generator.

    Yields:
        The path of the saved figure (``estimates_spread_per_occupation.png``).
    """
    style_plot()

    OUTPUT_PATH = output_dir / "estimates_spread_per_occupation.png"

    fig, ax = plt.subplots(figsize=(10, 12))
    sns.boxplot(
        data=df,
        x='onetsoc_major',
        y='estimate_range',
        showfliers=False,
        ax=ax,
    )

    ax.set_yscale('log')
    ax.set_xlabel('Occupation')
    ax.set_ylabel('Range (upper-lower, minutes)')
    ax.set_title('Spread of time-range estimates per occupation')

    # Swap the code tick labels for occupation labels; unknown codes are shown as-is
    labels = [
        OCCUPATION_MAJOR_CODES.get(tick.get_text(), tick.get_text())
        for tick in ax.get_xticklabels()
    ]
    ax.set_xticklabels(labels, rotation=60, ha='right')

    fig.tight_layout()
    fig.savefig(OUTPUT_PATH)
    plt.close(fig)

    yield OUTPUT_PATH

View file

@ -1,6 +0,0 @@
import pandas as pd
from typings import List
def must_have_columns(df: pd.DataFrame, columns: List[str]):
    """
    Ensure every name in ``columns`` is a column of ``df``.

    Args:
        df: DataFrame to check.
        columns: Column names that must be present.

    Raises:
        ValueError: If at least one column is absent. The message lists only
            the absent columns, in the order they were requested.
    """
    missing = [col for col in columns if col not in df.columns]
    if missing:
        raise ValueError(f"DataFrame is missing required columns: {missing}")

View file

@ -0,0 +1,229 @@
from pathlib import Path
from typing import Generator, Dict, Tuple, Optional
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
from scipy.stats import linregress
from datetime import datetime
from ..utils import style_plot, LIME
def _generate_wage_projection_data(
metr_results: Dict,
df_with_wages: pd.DataFrame,
percentile_key: str,
doubling_time_modifier: float,
) -> Optional[Tuple[pd.DataFrame, pd.DataFrame, float]]:
"""
Generates wage projection data for different AI progress scenarios.
Args:
metr_results: The METR benchmark data.
df_with_wages: DataFrame containing tasks with their estimated wage value.
percentile_key: The percentile to use from METR data (e.g., 'p50_horizon_length').
doubling_time_modifier: Multiplier for the doubling time (e.g., 1.0 for baseline,
0.5 for optimistic, 2.0 for pessimistic).
Returns:
A tuple of (metr_df, projection_df, doubling_time_days), or None if data is insufficient.
"""
all_model_data = []
for model_name, data in metr_results.get("results", {}).items():
for agent_name, agent_data in data.get("agents", {}).items():
release_date_str = data.get("release_date")
horizon = agent_data.get(percentile_key, {}).get("estimate")
if release_date_str and horizon is not None:
all_model_data.append({
"release_date": release_date_str,
"horizon_minutes": horizon,
})
if not all_model_data:
return None
metr_df = pd.DataFrame(all_model_data).sort_values("release_date").reset_index(drop=True)
metr_df['release_date'] = pd.to_datetime(metr_df['release_date'])
metr_df = metr_df[metr_df['horizon_minutes'] > 0].copy()
if len(metr_df) < 2:
return None
metr_df['days_since_start'] = (metr_df['release_date'] - metr_df['release_date'].min()).dt.days
log_y = np.log(metr_df['horizon_minutes'])
slope, intercept, r_value, _, _ = linregress(metr_df['days_since_start'], log_y)
# Apply the scenario modifier to the doubling time
base_doubling_time_days = np.log(2) / slope
modified_doubling_time_days = base_doubling_time_days * doubling_time_modifier
modified_slope = np.log(2) / modified_doubling_time_days
start_date = metr_df['release_date'].min()
future_dates = pd.to_datetime(pd.date_range(start=start_date, end="2035-01-01", freq="ME"))
future_days = (future_dates - start_date).days.to_numpy()
projected_log_horizon = intercept + modified_slope * future_days
projected_horizon_minutes = np.exp(projected_log_horizon)
projection_df = pd.DataFrame({
"date": future_dates,
"projected_coherence_minutes": projected_horizon_minutes,
})
# Calculate the total wage bill of tasks automated over time
for bound in ["lb", "mid", "ub"]:
col_name = 'estimate_midpoint' if bound == 'mid' else f'{bound}_estimate_in_minutes'
projection_df[f"automatable_wage_bill_{bound}"] = projection_df["projected_coherence_minutes"].apply(
lambda h: df_with_wages.loc[df_with_wages[col_name] <= h, 'wage_per_task'].sum()
)
# Also calculate for the actual METR data points for plotting
metr_df["automatable_wage_bill_mid"] = metr_df["horizon_minutes"].apply(
lambda h: df_with_wages.loc[df_with_wages['estimate_midpoint'] <= h, 'wage_per_task'].sum()
)
return metr_df, projection_df, modified_doubling_time_days
def _plot_scenario(ax, projection_df, metr_df, label, color, line_style='-'):
"""Helper function to draw a single projection scenario on a given axis."""
# Plot the projected wage bill
ax.plot(
projection_df["date"],
projection_df["automatable_wage_bill_mid"],
label=label,
color=color,
linewidth=2.5,
linestyle=line_style,
zorder=3
)
# Plot the shaded range for lower/upper bounds
ax.fill_between(
projection_df["date"],
projection_df["automatable_wage_bill_lb"],
projection_df["automatable_wage_bill_ub"],
color=color,
alpha=0.15,
zorder=2
)
# Plot the actual METR data points against the wage bill
ax.scatter(
metr_df['release_date'],
metr_df['automatable_wage_bill_mid'],
color=color,
edgecolor='black',
s=60,
zorder=4,
label=f"Model Capabilities (P50)"
)
def generate_projected_automatable_wage_bill(
    output_dir: Path,
    df: pd.DataFrame,
    task_summary_by_occupation_df: pd.DataFrame,
    metr_results: Dict,
    **kwargs,
) -> Generator[Path, None, None]:
    """
    Generates a plot projecting the automatable wage bill under different
    AI progress scenarios (optimistic, baseline, pessimistic).

    Args:
        output_dir: Directory the PNG is written into.
        df: Task estimates; must provide ``onetsoc_code`` plus the estimate columns
            read by ``_generate_wage_projection_data``.
        task_summary_by_occupation_df: Per-occupation summary with ``onetsoc_code``,
            ``wage_bill`` and ``total_tasks`` columns.
        metr_results: The METR benchmark data.
        **kwargs: Accepted but unused by this generator.

    Yields:
        The path of the saved figure. Nothing is yielded when no scenario could be computed.
    """
    style_plot()
    OUTPUT_PATH = output_dir / "projected_automatable_wage_bill_sensitivity.png"

    # 1. Calculate wage_per_task for each occupation
    wage_bill_info = task_summary_by_occupation_df[['onetsoc_code', 'wage_bill', 'total_tasks']].copy()
    wage_per_task = wage_bill_info['wage_bill'] / wage_bill_info['total_tasks']
    # A zero task count yields +/-inf; count such occupations as contributing nothing
    wage_bill_info['wage_per_task'] = wage_per_task.replace([np.inf, -np.inf], 0)
    wage_bill_info = wage_bill_info[['onetsoc_code', 'wage_per_task']]

    # 2. Merge wage_per_task into the main task dataframe
    df_with_wages = pd.merge(df, wage_bill_info, on='onetsoc_code', how='left')
    # Assign the result back: an in-place fillna on a column selection is not
    # guaranteed to update the parent frame (pandas Copy-on-Write).
    df_with_wages['wage_per_task'] = df_with_wages['wage_per_task'].fillna(0)

    # 3. Generate data for all three scenarios
    scenarios = {
        "Optimistic": {"modifier": 0.5, "color": "tab:green", "style": "--"},
        "Baseline": {"modifier": 1.0, "color": LIME['600'], "style": "-"},
        "Pessimistic": {"modifier": 2.0, "color": "tab:red", "style": ":"},
    }

    projection_results = {}
    for name, config in scenarios.items():
        result = _generate_wage_projection_data(metr_results, df_with_wages, 'p50_horizon_length', config['modifier'])
        if result:
            projection_results[name] = result

    if not projection_results:
        print("Warning: Could not generate any projection data. Skipping wage bill plot.")
        return

    # 4. Create the plot
    fig, ax = plt.subplots(figsize=(14, 9))

    # We only need to plot the scatter points once, let's use the baseline ones.
    if "Baseline" in projection_results:
        baseline_metr_df, _, _ = projection_results["Baseline"]
        ax.scatter(
            baseline_metr_df['release_date'],
            baseline_metr_df['automatable_wage_bill_mid'],
            color='black',
            s=80,
            zorder=5,
            label="Model Capabilities (P50)",
        )

    legend_lines = []
    for name, (_, proj_df, doubling_time) in projection_results.items():
        config = scenarios[name]
        ax.plot(
            proj_df["date"],
            proj_df["automatable_wage_bill_mid"],
            color=config['color'],
            linestyle=config['style'],
            linewidth=2.5,
            zorder=3,
        )
        ax.fill_between(
            proj_df["date"],
            proj_df["automatable_wage_bill_lb"],
            proj_df["automatable_wage_bill_ub"],
            color=config['color'],
            alpha=0.15,
            zorder=2,
        )
        # Create a custom line for the legend
        line = plt.Line2D([0], [0], color=config['color'], linestyle=config['style'], lw=2.5,
                          label=f'{name} (Doubling Time: {doubling_time:.0f} days)')
        legend_lines.append(line)

    # 5. Styling and annotations
    ax.set_title("Projected Automatable Wage Bill (P50 Coherence)", fontsize=18, pad=20)
    ax.set_xlabel("Year", fontsize=12)
    ax.set_ylabel("Automatable Annual Wage Bill (Trillions of USD)", fontsize=12)

    # Format Y-axis to show trillions
    def trillions_formatter(x, pos):
        return f'${x / 1e12:.1f}T'
    ax.yaxis.set_major_formatter(mticker.FuncFormatter(trillions_formatter))

    # Cap the axis just above the total wage bill across all tasks
    total_wage_bill = df_with_wages['wage_per_task'].sum()
    ax.set_ylim(0, total_wage_bill * 1.05)

    if "Baseline" in projection_results:
        _, baseline_proj_df, _ = projection_results["Baseline"]
        ax.set_xlim(datetime(2022, 1, 1), baseline_proj_df["date"].max())

    # Create the legend from the custom lines and the scatter plot
    scatter_legend = ax.get_legend_handles_labels()[0]
    ax.legend(handles=legend_lines + scatter_legend, loc="upper left", fontsize=11)
    ax.grid(True, which="both", linestyle="--", linewidth=0.5)

    plt.tight_layout()
    plt.savefig(OUTPUT_PATH)
    plt.close(fig)

    print(f"Generated sensitivity analysis plot: {OUTPUT_PATH}")
    yield OUTPUT_PATH

View file

@ -0,0 +1,168 @@
from pathlib import Path
from typing import Generator, Dict, Tuple
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import linregress
from datetime import datetime
from ..utils import style_plot, LIME
def _generate_projection_data(
metr_results: Dict,
df: pd.DataFrame,
percentile_key: str,
) -> Tuple[pd.DataFrame, pd.DataFrame] | None:
"""
Generates projection data for a given percentile key (e.g., 'p50_horizon_length').
Returns a tuple of (metr_df_with_pct, projection_df), or None if data is insufficient.
"""
# 1. Process METR data to get all model performance over time for the given percentile
all_model_data = []
for model_name, data in metr_results.get("results", {}).items():
for agent_name, agent_data in data.get("agents", {}).items():
release_date_str = data.get("release_date")
horizon = agent_data.get(percentile_key, {}).get("estimate")
if release_date_str and horizon is not None:
unique_model_name = f"{model_name}-{agent_name}"
all_model_data.append({
"model": unique_model_name,
"release_date": release_date_str,
"horizon_minutes": horizon,
})
if not all_model_data:
print(f"Warning: No models with {percentile_key} found in METR data. Skipping.")
return None
metr_df = pd.DataFrame(all_model_data).sort_values("release_date").reset_index(drop=True)
metr_df['release_date'] = pd.to_datetime(metr_df['release_date'])
# 2. Perform log-linear regression on coherence over time
metr_df = metr_df[metr_df['horizon_minutes'] > 0].copy()
if len(metr_df) < 2:
print(f"Warning: Not enough data points for regression for {percentile_key}. Skipping.")
return None
metr_df['days_since_start'] = (metr_df['release_date'] - metr_df['release_date'].min()).dt.days
log_y = np.log(metr_df['horizon_minutes'])
x = metr_df['days_since_start']
slope, intercept, r_value, _, _ = linregress(x, log_y)
doubling_time_days = np.log(2) / slope
print(f"METR all models {percentile_key} trend: R^2 = {r_value**2:.2f}, Doubling time = {doubling_time_days:.1f} days")
# 3. Project coherence into the future
start_date = metr_df['release_date'].min()
future_dates = pd.to_datetime(pd.date_range(start=start_date, end="2035-01-01", freq="ME"))
future_days = (future_dates - start_date).days.to_numpy()
projected_log_horizon = intercept + slope * future_days
projected_horizon_minutes = np.exp(projected_log_horizon)
projection_df = pd.DataFrame({
"date": future_dates,
"projected_coherence_minutes": projected_horizon_minutes,
})
# 4. Calculate the percentage of tasks automated over time based on our estimates
total_tasks = len(df)
if total_tasks == 0:
return None
for bound in ["lb", "mid", "ub"]:
col_name = 'estimate_midpoint' if bound == 'mid' else f'{bound}_estimate_in_minutes'
projection_df[f"pct_automatable_{bound}"] = projection_df["projected_coherence_minutes"].apply(
lambda h: (df[col_name] <= h).sum() / total_tasks * 100
)
metr_df["pct_automatable_mid"] = metr_df["horizon_minutes"].apply(
lambda h: (df['estimate_midpoint'] <= h).sum() / total_tasks * 100
)
return metr_df, projection_df
def _plot_projection(ax, projection_df, metr_df, label, color, line_style='-'):
"""Helper function to draw a single projection on a given axis."""
# Plot the projected automation percentage
ax.plot(
projection_df["date"],
projection_df["pct_automatable_mid"],
label=f"Mid-point",
color=color,
linewidth=2.5,
linestyle=line_style,
zorder=3
)
ax.fill_between(
projection_df["date"],
projection_df["pct_automatable_lb"],
projection_df["pct_automatable_ub"],
color=color,
alpha=0.15,
label=f"Lower/upper bound range",
zorder=2
)
# Plot the actual METR data points
ax.scatter(
metr_df['release_date'],
metr_df['pct_automatable_mid'],
color=color,
edgecolor='black',
s=60,
zorder=4,
label=f"Model with {label[1:]}% success rate"
)
def generate_projected_task_automation_plot(
    output_dir: Path,
    metr_results: Dict,
    df: pd.DataFrame,
    **kwargs,
) -> Generator[Path, None, None]:
    """
    Generates plots projecting task automation based on METR's p50 and p80
    coherence data.

    One figure is written per percentile whose projection data is available;
    the path of each written figure is yielded (P50 first, then P80).
    """
    style_plot()

    def render(data, tag, color, title, ylabel, filename):
        """Draw a single-percentile figure, save it under output_dir and return its path."""
        metr_points, projection = data
        fig, ax = plt.subplots(figsize=(12, 8))
        _plot_projection(ax, projection, metr_points, tag, color)
        ax.set_title(title, fontsize=16, pad=20)
        ax.set_xlabel("Year")
        ax.set_ylabel(ylabel)
        ax.set_ylim(0, 100.5)
        ax.set_xlim(datetime(2022, 1, 1), projection["date"].max())
        ax.grid(True, which="both", linestyle="--", linewidth=0.5)
        ax.legend(loc="upper left")
        plt.tight_layout()
        destination = output_dir / filename
        plt.savefig(destination)
        plt.close(fig)
        return destination

    # Both datasets are computed up front, before any figure is drawn.
    p50_data = _generate_projection_data(metr_results, df, 'p50_horizon_length')
    p80_data = _generate_projection_data(metr_results, df, 'p80_horizon_length')

    # P50 figure
    if p50_data:
        yield render(
            p50_data,
            "P50",
            LIME['600'],
            "How long before sequential coherence stops being a bottleneck?",
            "% of task automatable (50% success rate)",
            "projected_task_automation_p50.png",
        )

    # P80 figure
    if p80_data:
        yield render(
            p80_data,
            "P80",
            'tab:cyan',
            "Projected Task Automation (P80 AI Coherence)",
            "% of Estimable Economic Tasks Automatable",
            "projected_task_automation_p80.png",
        )

View file

@ -0,0 +1,54 @@
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from ..utils import LIME, style_plot
def plot_sequential_coherence_cdf(output_dir: Path, df: pd.DataFrame, **kwargs):
    """
    Plot the cumulative distribution of task time estimates and yield the saved path.

    Three empirical CDFs are drawn as step curves on a log-scaled x axis: the
    lower-bound, upper-bound and mid-point estimates taken from ``df``.
    """
    style_plot()

    output_path = output_dir / "sequential_coherence_cdf.png"

    def cdf(series):
        """Return (sorted values, cumulative percentage of observations) for a series."""
        ordered = series.sort_values().reset_index(drop=True)
        cumulative_pct = ((ordered.index + 1) / len(ordered)) * 100
        return ordered.values, cumulative_pct

    # CDF data for each estimate column, computed before any figure exists
    lower_curve = cdf(df['lb_estimate_in_minutes'])
    upper_curve = cdf(df['ub_estimate_in_minutes'])
    mid_curve = cdf(df['estimate_midpoint'])

    fig, ax = plt.subplots(figsize=(12, 7))

    # Step curves; the mid-point is drawn last and thickest so it sits on top
    (x_vals, y_vals) = lower_curve
    ax.step(x_vals, y_vals, where='post', color=LIME['300'], linewidth=1.8, linestyle='--', zorder=2, label='Lower bound estimate')
    (x_vals, y_vals) = upper_curve
    ax.step(x_vals, y_vals, where='post', color=LIME['900'], linewidth=1.8, linestyle=':', zorder=3, label='Upper bound estimate')
    (x_vals, y_vals) = mid_curve
    ax.step(x_vals, y_vals, where='post', color=LIME['600'], linewidth=2.2, zorder=4, label='Mid-point')

    # Axis scaling and formatting
    ax.set_xscale('log')
    ax.set_ylim(0, 100)
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(decimals=0))

    # Titles and labels
    ax.set_title("% of Tasks With Sequential Coherence ≤ X")
    ax.set_xlabel("Sequential Coherence (X)")
    ax.set_ylabel("Cumulative Percentage of Tasks")

    # Human-readable x ticks: (position in minutes, label)
    tick_table = [
        (1, '1 min'), (5, '5 min'), (10, '10 min'), (30, '30 min'),
        (60, '1 hr'), (120, '2 hr'), (240, '4 hr'), (480, '8 hr'),
        (1440, '1 day'), (2880, '2 days'), (10080, '1 wk'), (43200, '30 days'),
        (129600, '90 days'), (259200, '180 days'), (525600, '1 yr'),
    ]
    ax.set_xticks([position for position, _ in tick_table])
    ax.set_xticklabels([text for _, text in tick_table], rotation=45, ha='right')

    ax.legend(loc='lower right')

    # Save and close
    plt.tight_layout()
    plt.savefig(output_path, bbox_inches='tight')
    plt.close(fig)

    yield output_path