sprint-econtai/analysis.py
Félix Dorn 43076bcbb1 old
2025-07-15 00:41:05 +02:00

563 lines
23 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import litellm
import sqlite3
import numpy as np
import pandas as pd
from google.colab import userdata, files
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')
os.environ['GEMINI_API_KEY'] = userdata.get('GEMINI_API_KEY')
occupation_major_codes = {
'11': 'Management',
'13': 'Business and Financial Operations',
'15': 'Computer and Mathematical Occupations',
'17': 'Architecture and Engineering',
'19': 'Life, Physical, and Social Science',
'21': 'Community and Social Services',
'23': 'Legal',
'25': 'Education, Training, and Library',
'27': 'Arts, Design, Entertainment, Sports, and Media',
'29': 'Healthcare Practitioners and Technical',
'31': 'Healthcare Support',
'33': 'Protective Service',
'35': 'Food Preparation and Serving Related',
'37': 'Building and Grounds Cleaning and Maintenance',
'39': 'Personal Care and Service',
'41': 'Sales and Related',
'43': 'Office and Administrative Support',
'45': 'Farming, Fishing, and Forestry',
'47': 'Construction and Extraction',
'49': 'Installation, Maintenance, and Repair',
'51': 'Production',
'53': 'Transportation and Material Moving',
'55': 'Military Specific'
}
gray = {'50':'#f8fafc','100':'#f1f5f9','200':'#e2e8f0',
'300':'#cbd5e1','400':'#94a3b8','500':'#64748b',
'600':'#475569','700':'#334155','800':'#1e293b',
'900':'#0f172a','950':'#020617'}
lime = {'50': '#f7fee7','100': '#ecfcca','200': '#d8f999',
'300': '#bbf451','400': '#9ae600','500': '#83cd00',
'600': '#64a400','700': '#497d00','800': '#3c6300',
'900': '#35530e','950': '#192e03'}
mpl.rcParams.update({
'figure.facecolor' : gray['50'],
'axes.facecolor' : gray['50'],
'axes.edgecolor' : gray['100'],
'axes.labelcolor' : gray['700'],
'xtick.color' : gray['700'],
'ytick.color' : gray['700'],
'font.family' : 'Inter', # falls back to DejaVu if Inter not present
'font.size' : 11,
})
sns.set_style("white") # keep minimal axes, we will remove default grid
sns.set_context("notebook")
def prepare_tasks():
# This dataset comes from https://epoch.ai/gradient-updates/consequences-of-automating-remote-work
# It contains labels for a O*NET task can be done remotely or not (labeled by GPT-4o)
# You can download it here: https://drive.google.com/file/d/1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r/view?usp=sharing
df_remote_status = pd.read_csv("epoch_task_data.csv")
# BLS OEWS: https://www.bls.gov/oes/special-requests/oesm23nat.zip
df_oesm = pd.read_excel("oesm23national.xlsx")
# Run uv run ./enrich_task_ratings.py
df_tasks = pd.read_json("task_ratings_enriched.json")
# Run uv run classify_estimateability_of_tasks.py
df_task_estimateable = pd.read_csv("tasks_estimateable.csv").rename(columns={"task_estimateable": "estimateable"}).drop_duplicates(subset=['task'], keep='first')
# df_tasks now has a remote_status column which contains either "remote" or "not remote"
df_tasks = pd.merge(df_tasks, df_remote_status[['Task', 'Remote']], left_on='task', right_on='Task', how='left')
df_tasks = df_tasks.drop('Task', axis=1).rename(columns={'Remote': 'remote_status'})
# df_tasks now has a estimateable column which contains either "ATOMIC" or "ONGOING-CONSTRAINT"
df_tasks = pd.merge(df_tasks, df_task_estimateable[['task', 'estimateable']], on='task', how='left')
df_tasks = df_tasks[df_tasks['importance_average'] < 3].copy()
df_tasks['onetsoc_major'] = df_tasks['onetsoc_code'].str[:2]
df_remote_tasks = df_tasks[df_tasks['remote_status'] == 'remote'].copy()
# Call create_task_estimates() from add_task_estimates? which creates tasks_with_estimates.csv
def preprocessing_time_estimates():
df = pd.read_csv("tasks_with_estimates.csv")
df = df[df['importance_average'] > 3].copy()
# The embeddings comes from running `uv run ./embed_task_description.py`
# Columns: ['embedding_id', 'task', 'embedding_vector']
# These contain embedding for UNIQUE tasks
df_task_embeddings = pd.read_parquet("tasks_with_embeddings.parquet").drop_duplicates(subset=['task'])[['task', 'task_embedding']].rename(columns={"task_embedding": "embedding_vector"}).copy()
df = pd.merge(df, df_task_embeddings[['task', 'embedding_vector']], on='task', how='left')
df = pd.merge(df, df_task_estimateable[['task', 'estimateable']], on='task', how='left')
df['onetsoc_major'] = df['onetsoc_code'].str[:2]
def convert_to_minutes(qty, unit):
"""Converts a quantity in a given unit to minutes."""
return qty * {
"minute": 1,
"hour": 60,
"day": 60 * 24,
"week": 60 * 24 * 7,
"month": 60 * 24 * 30,
"trimester": 60 * 24 * 90,
"semester": 60 * 24 * 180,
"year": 60 * 24 * 365,
}[unit]
df['lb_estimate_in_minutes'] = df.apply(
lambda row: convert_to_minutes(row['lb_estimate_qty'], row['lb_estimate_unit']), axis=1
)
df['ub_estimate_in_minutes'] = df.apply(
lambda row: convert_to_minutes(row['ub_estimate_qty'], row['ub_estimate_unit']), axis=1
)
df['estimate_range'] = df.ub_estimate_in_minutes - df.lb_estimate_in_minutes
df['estimate_ratio'] = df.ub_estimate_in_minutes / df.lb_estimate_in_minutes
df['estimate_midpoint'] = (df.lb_estimate_in_minutes + df.ub_estimate_in_minutes)/2
atomic_tasks = df[df['estimateable'] == 'ATOMIC']
ongoing_tasks = df[df['estimateable'] == 'ONGOING-CONSTRAINT']
with pd.option_context('display.max_columns', None):
display(df)
# Check for empty estimates
if atomic_tasks['lb_estimate_in_minutes'].isnull().sum() > 0:
print("Missing values in 'lb_estimate_in_minutes':", atomic_tasks['lb_estimate_in_minutes'].isnull().sum())
if atomic_tasks['ub_estimate_in_minutes'].isnull().sum() > 0:
print("Missing values in 'ub_estimate_in_minutes':", atomic_tasks['ub_estimate_in_minutes'].isnull().sum())
# Check for impossible bounds
impossible_bounds = atomic_tasks[
(atomic_tasks['lb_estimate_in_minutes'] <= 0) |
(atomic_tasks['ub_estimate_in_minutes'] <= 0) |
(atomic_tasks['lb_estimate_in_minutes'] > atomic_tasks['ub_estimate_in_minutes'])
]
if not impossible_bounds.empty:
print(f"Error: Found rows with impossible bounds.")
with pd.option_context('display.max_colwidth', None):
display(impossible_bounds[['task', 'lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'dwas']])
#with pd.option_context('display.max_colwidth', None):
#display(atomic_tasks.nlargest(20, 'ub_estimate_in_minutes')[['task', 'lb_estimate_qty', 'lb_estimate_unit', 'lb_estimate_in_minutes', 'ub_estimate_qty', 'ub_estimate_unit', 'ub_estimate_in_minutes', 'estimate_ratio']])
def cell1():
sns.histplot(atomic_tasks.estimate_midpoint, log_scale=True)
def cell2():
plt.figure(figsize=(14,10))
sns.boxplot(
data=atomic_tasks,
x='onetsoc_major', # 11 = Management, 15 = Computer/Math, …
y='estimate_range',
showfliers=False
)
plt.yscale('log') # long tail => log scale
plt.xlabel('Occupation')
plt.ylabel('Range (upper-lower, minutes)')
plt.title('Spread of time-range estimates per occupation')
ax = plt.gca()
ax.set_xticklabels([occupation_major_codes[code.get_text()] for code in ax.get_xticklabels()], rotation=60, ha='right')
def cell3():
plt.figure(figsize=(10, 10))
ax = sns.scatterplot(
data=atomic_tasks.replace({'onetsoc_major': occupation_major_codes}), # Replace codes with labels
x='lb_estimate_in_minutes', y='ub_estimate_in_minutes',
alpha=0.2, edgecolor=None, hue="onetsoc_major" # Use the labeled column for hue
)
# 45° reference
lims = (1, atomic_tasks[['lb_estimate_in_minutes','ub_estimate_in_minutes']].max().max())
ax.plot(lims, lims, color='black', linestyle='--', linewidth=1)
# optional helper lines: 2× and 10×, 100× ratios
for k in [2,10, 100]:
ax.plot(lims, [k*l for l in lims],
linestyle=':', color='grey', linewidth=1)
ax.set(xscale='log', yscale='log')
ax.set_xlabel('Lower-bound (min, log scale)')
ax.set_ylabel('Upper-bound (min, log scale)')
ax.set_title('Lower vs upper estimates for all tasks')
# Place the legend outside the plot
ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
def cell4():
plt.figure(figsize=(8,4))
sns.histplot(np.log10(atomic_tasks['estimate_ratio'].replace([np.inf, -np.inf], np.nan).dropna()),
bins=60, kde=True)
plt.axvline(np.log10(10), color='red', ls='--', lw=1, label='10×')
plt.axvline(np.log10(1.05), color='orange', ls='--', lw=1, label='1.05×')
plt.axvline(0, color='black', ls='-', lw=1) # ub = lb
plt.xlabel('log₁₀(upper / lower)')
plt.ylabel('Count')
plt.title('Distribution of upper:lower ratio')
plt.legend()
plt.tight_layout()
def cell5():
# 1. Bin lower bounds into quartiles (Q1Q4)
atomic_tasks['lb_q'] = pd.qcut(atomic_tasks.lb_estimate_in_minutes,
q=4, labels=['Q1 shortest','Q2','Q3','Q4 longest'])
# 3. Aggregate: median (or mean) ratio per cell
pivot = atomic_tasks.pivot_table(index='onetsoc_major', columns='lb_q',
values='estimate_ratio', aggfunc='median')
# Map the index (onetsoc_major codes) to their corresponding labels
pivot.index = pivot.index.map(occupation_major_codes)
# 4. Visualise
plt.figure(figsize=(10,8))
sns.heatmap(pivot, cmap='RdYlGn_r', center=2, annot=True, fmt='.1f',
cbar_kws={'label':'Median upper/lower ratio'})
plt.xlabel('Lower-bound quartile')
plt.ylabel('Occupation (major group)')
plt.title('Typical range width by occupation and task length')
plt.tight_layout()
def cell6():
"""
from scipy.stats import median_abs_deviation
def mad_z(series):
med = series.median()
mad = median_abs_deviation(series, scale='normal') # ⇒ comparable to σ
return (series - med) / mad
df['robust_z'] = df.groupby('onetsoc_code')['estimate_midpoint'].transform(mad_z)
"""
agg = (atomic_tasks
.groupby('onetsoc_code')['estimate_midpoint']
.agg(median='median',
q1=lambda x: x.quantile(.25),
q3=lambda x: x.quantile(.75),
mean='mean',
std='std')
.reset_index())
agg['IQR'] = agg.q3 - agg.q1
agg['CV'] = agg['std'] / agg['mean'] # coefficient of variation
# merge back the group mean and std so each row can be scored
atomic_tasks = atomic_tasks.merge(agg[['onetsoc_code','mean','std']], on='onetsoc_code')
atomic_tasks['z'] = (atomic_tasks.estimate_midpoint - atomic_tasks['mean']) / atomic_tasks['std']
outliers = atomic_tasks.loc[atomic_tasks.z.abs() > 3]
outliers
def cell7():
from scipy.stats import median_abs_deviation
def mad_z(series):
med = series.median()
mad = median_abs_deviation(series, scale='normal') # ⇒ comparable to σ
return (series - med) / mad
atomic_tasks['robust_z'] = atomic_tasks.groupby('onetsoc_code')['estimate_midpoint'].transform(mad_z)
def cell8():
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
# unit-normalise embeddings
X = np.vstack(atomic_tasks.embedding_vector.values)
X = X / np.linalg.norm(X, axis=1, keepdims=True)
k = 5
nn = NearestNeighbors(n_neighbors=k+1, metric='cosine').fit(X)
dist, idx = nn.kneighbors(X) # idx[:,0] is the task itself
pairs = pd.DataFrame({
'i': np.repeat(np.arange(len(X)), k),
'j': idx[:,1:].ravel(), # drop self-match
'sim': 1 - dist[:,1:].ravel() # cosine similarity
})
# join time estimates
pairs = (pairs
.merge(atomic_tasks[['estimate_midpoint']], left_on='i', right_index=True)
.merge(atomic_tasks[['estimate_midpoint']], left_on='j', right_index=True,
suffixes=('_i','_j')))
pairs['ratio'] = pairs[['estimate_midpoint_i','estimate_midpoint_j']].max(1) / \
pairs[['estimate_midpoint_i','estimate_midpoint_j']].min(1)
pairs
def cell9():
sns.scatterplot(data=pairs.sample(20_000), x='sim', y=np.log10(pairs['ratio']),
alpha=.1)
plt.axhline(np.log10(2), ls=':', color='red'); # e.g. >2× difference
plt.xlabel('Cosine similarity'); plt.ylabel('log₁₀ time-ratio');
plt.title('Are similar tasks given similar time estimates?');
def cell10():
import matplotlib.ticker as mtick # For percentage formatting
import matplotlib.colors as mcolors # For color conversion
summary_data = []
for code, label in occupation_major_codes.items():
occ_df = df_tasks[df_tasks['onetsoc_major'] == code]
total_tasks_in_occ = len(occ_df)
if total_tasks_in_occ == 0:
continue # Skip if no tasks for this occupation
# Stack 1: % that isn't equal to "remote"
not_remote_count = len(occ_df[occ_df['remote_status'] != 'remote'])
# For the remaining remote tasks:
remote_df = occ_df[occ_df['remote_status'] == 'remote']
# Stack 2: % of remote + ATOMIC
remote_atomic_count = len(remote_df[remote_df['estimateable'] == 'ATOMIC'])
# Stack 3: % of remote + ONGOING-CONSTRAINT
remote_ongoing_count = len(remote_df[remote_df['estimateable'] == 'ONGOING-CONSTRAINT'])
summary_data.append({
'onetsoc_major_code': code,
'occupation_label': label,
'count_not_remote': not_remote_count,
'count_remote_atomic': remote_atomic_count,
'count_remote_ongoing': remote_ongoing_count,
'total_tasks': total_tasks_in_occ
})
summary_df = pd.DataFrame(summary_data)
# --- 3. Calculate Percentages ---
# Ensure total_tasks is not zero to avoid division by zero errors if an occupation had no tasks
summary_df = summary_df[summary_df['total_tasks'] > 0].copy() # Use .copy() to avoid SettingWithCopyWarning
summary_df['pct_not_remote'] = (summary_df['count_not_remote'] / summary_df['total_tasks']) * 100
summary_df['pct_remote_atomic'] = (summary_df['count_remote_atomic'] / summary_df['total_tasks']) * 100
summary_df['pct_remote_ongoing'] = (summary_df['count_remote_ongoing'] / summary_df['total_tasks']) * 100
# Select columns for plotting and set index to occupation label
plot_df = summary_df.set_index('occupation_label')[
['pct_not_remote', 'pct_remote_atomic', 'pct_remote_ongoing']
]
# Rename columns for a clearer legend
plot_df.columns = ['Not Remote', 'Remote + Estimable', 'Remote + Not estimable']
plot_df = plot_df.sort_values(by='Not Remote', ascending=False)
# --- 4. Plotting (Modified) ---
# Define the custom colors based on your requirements
# The order must match the column order in plot_df:
# 1. 'Not Remote'
# 2. 'Remote & ATOMIC'
# 3. 'Remote & ONGOING-CONSTRAINT'
bar_colors = [gray["300"], lime["500"], lime["200"]]
fig, ax = plt.subplots(figsize=(14, 10)) # Adjusted figsize for better readability
plot_df.plot(kind='barh', stacked=True, ax=ax, color=bar_colors)
ax.set_xlabel("Percentage of Tasks (%)", fontsize=12)
ax.set_ylabel("Occupation Major Group", fontsize=12)
ax.set_title("Task Breakdown by Occupation, Remote Status, and Estimateability", fontsize=14, pad=20)
# Format x-axis as percentages
ax.xaxis.set_major_formatter(mtick.PercentFormatter())
plt.xlim(0, 100) # Ensure x-axis goes from 0 to 100%
# Remove right and top spines
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
# Function to get contrasting text color
def get_contrasting_text_color(bg_color_hex_or_rgba):
"""
Determines if black or white text provides better contrast against a given background color.
bg_color_hex_or_rgba: A hex string (e.g., '#RRGGBB') or an RGBA tuple (values in [0, 1]).
Returns: 'black' or 'white'.
"""
# Convert to RGBA if it's a hex string or name
if isinstance(bg_color_hex_or_rgba, str):
rgba = mcolors.to_rgba(bg_color_hex_or_rgba)
else:
rgba = bg_color_hex_or_rgba
r, g, b, _ = rgba # Ignore alpha for luminance calculation
# Calculate luminance (standard formula for sRGB)
# Values r, g, b should be in [0, 1] for this formula
luminance = 0.2126 * r + 0.7152 * g + 0.0722 * b
# Threshold for deciding text color
return 'black' if luminance > 0.55 else 'white' # Adjusted threshold slightly for better visual
# Add percentages inside each bar segment
# Iterate through each "category" of bars (Not Remote, Remote & ATOMIC, etc.)
for i, container in enumerate(ax.containers):
# Get the color for this container/category
segment_color = bar_colors[i]
text_color = get_contrasting_text_color(segment_color)
for patch in container.patches: # Iterate through each bar segment in the category
width = patch.get_width()
if width > 3: # Only add text if segment is wide enough (e.g., >3%)
x = patch.get_x() + width / 2
y = patch.get_y() + patch.get_height() / 2
ax.text(x, y,
f"{width:.1f}%",
ha='center',
va='center',
fontsize=8, # Adjust font size as needed
color=text_color,
fontweight='medium') # Bolder text can help
plt.legend(title="Task Category", bbox_to_anchor=(1.02, 1), loc='upper left', frameon=False)
def cell11():
df_oesm['onetsoc_major'] = df_oesm['OCC_CODE'].str[:2]
# Calculate wage bill per occupation
# Wage bill = Total Employment * Annual Mean Wage
# Ensure columns are numeric, converting non-numeric values to NaN first
df_oesm['TOT_EMP'] = pd.to_numeric(df_oesm['TOT_EMP'], errors='coerce')
df_oesm['A_MEAN'] = pd.to_numeric(df_oesm['A_MEAN'], errors='coerce')
# Drop rows with NaN in necessary columns after coercion
df_oesm.dropna(subset=['TOT_EMP', 'A_MEAN', 'onetsoc_major'], inplace=True)
df_oesm['wage_bill'] = df_oesm['TOT_EMP'] * df_oesm['A_MEAN']
# Aggregate wage bill by onetsoc_major
df_wage_bill_major = df_oesm.groupby('onetsoc_major')['wage_bill'].sum().reset_index()
# Map major codes to titles for better plotting
df_wage_bill_major['OCC_TITLE_MAJOR'] = df_wage_bill_major['onetsoc_major'].map(occupation_major_codes)
# Sort by wage bill for better visualization
df_wage_bill_major = df_wage_bill_major.sort_values('wage_bill', ascending=False)
# Plotting
plt.figure(figsize=(12, 8))
sns.barplot(x='wage_bill', y='OCC_TITLE_MAJOR', data=df_wage_bill_major, palette="viridis")
plt.title('Total Wage Bill per Major Occupation Group')
plt.xlabel('Total Wage Bill (in billions)')
plt.ylabel('Major Occupation Group')
plt.grid(axis='x', linestyle='--', alpha=0.7)
def cell11():
# ───────────────────────────────────────────────────────────────
# 1. CUMULATIVE-DISTRIBUTION-FUNCTION (CDF) PREP
# ───────────────────────────────────────────────────────────────
def cdf(series):
s = series.sort_values().reset_index(drop=True)
return s.values, ((s.index + 1) / len(s)) * 100
x_lb , y_lb = cdf(atomic_tasks['lb_estimate_in_minutes'])
x_ub , y_ub = cdf(atomic_tasks['ub_estimate_in_minutes'])
x_mid, y_mid = cdf((atomic_tasks['ub_estimate_in_minutes'] + atomic_tasks['lb_estimate_in_minutes']) / 2)
# ───────────────────────────────────────────────────────────────
# 2. PLOTTING
# ───────────────────────────────────────────────────────────────
fig, ax = plt.subplots(figsize=(10, 6))
# horizontal reference lines every 10 %
for y_val in range(0, 101, 10):
ax.axhline(y_val, color=gray['100'], linewidth=.8, zorder=1)
# Plot Lower Bound CDF
ax.step(x_lb, y_lb,
where='post',
color=lime['300'], # Example: light blue for lower bound
linewidth=1.8,
linestyle='--',
zorder=2,
label='Lower bound estimate (CDF)')
# Plot Upper Bound CDF
ax.step(x_ub, y_ub,
where='post',
color=lime['900'], # Example: light orange/red for upper bound
linewidth=1.8,
linestyle=':',
zorder=3,
label='Upper bound estimate (CDF)')
# Plot Midpoint CDF (plotted last to be on top, or adjust zorder)
ax.step(x_mid, y_mid,
where='post',
color=lime['600'],
linewidth=2.2,
zorder=4, # Ensure it's on top of other lines if they overlap significantly
label='Mid-point estimate (CDF)')
# axes limits / scales
ax.set_ylim(0, 100)
ax.set_xscale('log')
# y-axis ➝ percent labels
ax.yaxis.set_major_formatter(mpl.ticker.PercentFormatter(decimals=0))
# move y-label to top-left (just inside plotting area)
ax.text(-0.06, 1.03,
"% of tasks with temporal coherence ≤ X",
ha='left', va='bottom',
transform=ax.transAxes,
fontsize=12, fontweight='semibold')
# custom x-ticks at human-friendly durations
ticks = [1, 5, 10, 30, 60, 120, 240, 480,
1440, 2880, 10080, 43200, 129600,
259200, 525600]
ticklabels = ['1 min', '5 min', '10 min', '30 min', '1 hour', '2 hours', '4 hours', '8 hours',
'1 day', '2 days', '1 week', '30 days',
'90 days', '180 days', '1 year']
# Vertical reference lines for x-ticks
for tick in ticks:
ax.axvline(tick, color=gray['300'], linewidth=.8, linestyle='--', zorder=1)
ax.set_xticks(ticks)
ax.set_xticklabels(ticklabels, rotation=45, ha='right')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_edgecolor(gray['300'])
ax.spines['bottom'].set_edgecolor(gray['300'])
# legend
ax.legend(frameon=False, loc='lower right') # Keep 'lower right' or adjust as needed
ax.text(0.5, -0.3,
'Temporal coherence (X)',
ha='center', va='center',
transform=ax.transAxes,
fontsize=12, fontweight='semibold')