This commit is contained in:
Félix Dorn 2025-07-15 00:34:54 +02:00
parent 62296e1b69
commit 65dc648797
37 changed files with 1413 additions and 2433 deletions

View file

@ -1,2 +1,3 @@
- I use Nix. To run a command, prefix it with `nix develop .#impure -c`
- I use uv. To add a package, use: uv add. To run a script use: uv run path/to/script
- To run the pipeline: `uv run -m pipeline.runner`

BIN
dist/estimate_distribution_histplot.png vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 75 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 295 KiB

BIN
dist/estimates_spread_per_occupation.png vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 279 KiB

BIN
dist/intermediate/df_tasks.parquet vendored Normal file

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 239 KiB

BIN
dist/projected_task_automation_p50.png vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 145 KiB

BIN
dist/projected_task_automation_p80.png vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 136 KiB

BIN
dist/sequential_coherence_cdf.png vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 145 KiB

View file

@ -1,507 +0,0 @@
import pandas as pd
import litellm
import dotenv
import os
import time
import json
import math
import numpy as np
# --- Configuration ---
# Model name handed to litellm; it must support json_schema / structured output.
MODEL = "gpt-4.1-mini"
RATE_LIMIT = 5000  # Requests per minute
CHUNK_SIZE = 300  # Number of message sets sent per batch_completion call
SECONDS_PER_MINUTE = 60
# CSV that holds the tasks to process and receives the estimates.
FILENAME = "tasks_with_estimates.csv"

# --- Prompts and Schema ---
SYSTEM_PROMPT = """
You are an expert assistant evaluating the time to completion required for job tasks. Your goal is to estimate the time range needed for a skilled human to complete the following job task remotely, without supervision.
Provide a lower and upper bound estimate for the time to completion time. These bounds should capture the time within which approximately 80% of instances of performing this specific task are typically completed by a qualified individual.
Base your estimate on the provided task description, its associated activities, and the occupational context. Your estimate must be in one the allowed units: minute, hour, day, week, month, trimester, semester, year.
""".strip()

USER_MESSAGE_TEMPLATE = """
Please estimate the time range for the following remote task:
**Task Description:** {task}
**Relevant activies for the task:**
{dwas}
**Occupation Category:** {occupation_title}
**Occupation Description:** {occupation_description}
Consider the complexity and the typical steps involved.
""".strip()

# Units the model may answer in; also used as the JSON-schema enum below.
ALLOWED_UNITS = [
    "minute",
    "hour",
    "day",
    "week",
    "month",
    "trimester",
    "semester",
    "year",
]


def _bound_schema(which):
    """Return the JSON-schema fragment for one bound (``which`` is "lower" or "upper")."""
    quantity = {
        "type": "number",
        "description": f"The numerical value for the {which} bound of the estimate.",
    }
    unit = {
        "type": "string",
        "enum": ALLOWED_UNITS,
        "description": f"The unit of time for the {which} bound.",
    }
    return {
        "type": "object",
        "properties": {"quantity": quantity, "unit": unit},
        "required": ["quantity", "unit"],
        "additionalProperties": False,
    }


# Structured-output schema passed as response_format["json_schema"].
SCHEMA_FOR_VALIDATION = {
    "name": "estimate_time",
    "strict": True,  # Enforce schema adherence
    "schema": {
        "type": "object",
        "properties": {
            "lower_bound_estimate": _bound_schema("lower"),
            "upper_bound_estimate": _bound_schema("upper"),
        },
        "required": ["lower_bound_estimate", "upper_bound_estimate"],
        "additionalProperties": False,
    },
}
def save_dataframe(df_to_save, filename):
    """Save ``df_to_save`` to ``filename`` as CSV using a write-then-rename.

    The frame is first written to ``<filename>.tmp`` and then moved over the
    target with ``os.replace``, so an interrupted write does not leave a
    truncated target file behind.

    Args:
        df_to_save: Object exposing ``to_csv`` (a pandas DataFrame).
        filename: Destination path, as ``str`` or any ``os.PathLike``.

    Saving is best-effort: failures while writing or renaming are printed,
    the temporary file is removed if it exists, and no exception is raised.
    """
    # Resolve the paths before entering the try block so the cleanup code can
    # always refer to temp_filename; os.fspath also accepts pathlib.Path.
    target = os.fspath(filename)
    temp_filename = target + ".tmp"
    try:
        df_to_save.to_csv(temp_filename, encoding="utf-8-sig", index=False)
        os.replace(temp_filename, target)
    except Exception as e:
        print(f"--- Error saving DataFrame to {filename}: {e} ---")
        if os.path.exists(temp_filename):
            try:
                os.remove(temp_filename)
            except OSError as remove_err:
                print(
                    f"--- Error removing temporary save file {temp_filename}: {remove_err} ---"
                )
# Estimate columns written by this script and the dtype each should have.
_ESTIMATE_COLUMNS = {
    "lb_estimate_qty": float,
    "lb_estimate_unit": object,
    "ub_estimate_qty": float,
    "ub_estimate_unit": object,
}


def _is_valid_quantity(value):
    """Return True for a finite, non-negative real number (bools are rejected)."""
    return (
        isinstance(value, (int, float))
        and not isinstance(value, bool)  # bool is an int subclass; never a quantity
        and math.isfinite(value)
        and value >= 0
    )


def _load_tasks_frame():
    """Read FILENAME and make sure the estimate columns exist with usable dtypes.

    Returns the loaded DataFrame. Terminates the process with status 1 when
    the file is missing or cannot be read/initialised.
    """
    if not os.path.exists(FILENAME):
        print(
            f"Error: {FILENAME} not found. Please ensure the file exists and contains task data."
        )
        raise SystemExit(1)

    try:
        df = pd.read_csv(FILENAME, encoding="utf-8-sig")
        print(f"Successfully read {len(df)} rows from {FILENAME}.")

        save_needed = False
        for col_name, target_dtype in _ESTIMATE_COLUMNS.items():
            if col_name not in df.columns:
                # Initialise with a type-compatible missing value.
                df[col_name] = np.nan if target_dtype is float else pd.NA
                df[col_name] = df[col_name].astype(target_dtype)
                print(f"Added '{col_name}' column as {df[col_name].dtype}.")
                save_needed = True
            else:
                current_pd_dtype = df[col_name].dtype
                expected_pd_dtype = pd.Series(dtype=target_dtype).dtype
                if current_pd_dtype != expected_pd_dtype:
                    try:
                        if target_dtype is float:
                            df[col_name] = pd.to_numeric(df[col_name], errors="coerce")
                        else:
                            df[col_name] = df[col_name].astype(object)
                        print(
                            f"Corrected dtype of '{col_name}' to {df[col_name].dtype}."
                        )
                        save_needed = True
                    except Exception as e:
                        print(
                            f"Warning: Could not convert column '{col_name}' to {target_dtype}: {e}. Current dtype: {current_pd_dtype}"
                        )

            # Standardise missing-value placeholders. The result is assigned
            # back: an inplace replace on df[col_name] is chained assignment
            # and does not reach df when pandas Copy-on-Write is active.
            df[col_name] = df[col_name].replace(["", None], pd.NA)
            if target_dtype is float:
                df[col_name] = pd.to_numeric(df[col_name], errors="coerce")

        if save_needed:
            print(f"Saving {FILENAME} after adding/adjusting estimate columns.")
            save_dataframe(df, FILENAME)
    except FileNotFoundError:
        print(
            f"Error: {FILENAME} not found. Please ensure the file exists and contains task data."
        )
        raise SystemExit(1)
    except Exception as e:
        print(f"Error reading or initializing {FILENAME}: {e}")
        raise SystemExit(1)
    return df


def _build_messages(df_to_process):
    """Build one [system, user] message pair per row that has all required data.

    Returns ``(messages_list, valid_original_indices)``; the two lists are
    aligned, and the indices are labels of the original DataFrame.
    """
    required_cols = ["task", "occupation_title", "occupation_description", "dwas"]
    first_index = df_to_process.index[0] if len(df_to_process.index) > 0 else "N/A"
    print(
        f"Preparing messages for up to {len(df_to_process)} rows starting from original index {first_index}..."
    )
    print(f"Checking for required columns: {required_cols}")

    messages_list = []
    valid_original_indices = []
    skipped = 0
    for index, row in df_to_process.iterrows():
        missing_or_empty = [
            col
            for col in required_cols
            if col not in row or pd.isna(row[col]) or str(row[col]).strip() == ""
        ]
        if missing_or_empty:
            print(
                f"Warning: Skipping row original index {index} due to missing/empty required data in columns: {', '.join(missing_or_empty)}."
            )
            skipped += 1
            continue
        try:
            user_message = USER_MESSAGE_TEMPLATE.format(
                task=row["task"],
                occupation_title=row["occupation_title"],
                occupation_description=row["occupation_description"],
                dwas=row["dwas"],
            )
        except KeyError as e:
            print(
                f"Error: Skipping row original index {index} due to formatting error - missing key: {e}. Check USER_MESSAGE_TEMPLATE and CSV columns."
            )
            skipped += 1
            continue
        messages_list.append(
            [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_message},
            ]
        )
        valid_original_indices.append(index)

    print(
        f"Prepared {len(messages_list)} valid message sets for batch completion (skipped {skipped} rows)."
    )
    return messages_list, valid_original_indices


def _parse_estimate(response, original_df_index):
    """Turn one completion response into the four estimate values.

    Returns a dict keyed by the estimate column names, or None (after
    printing a warning) when the response is missing, malformed or invalid.
    """
    if response is None:
        print(
            f"Skipping processing for original index {original_df_index} due to API call failure for this item (response is None)."
        )
        return None

    content_str = None
    try:
        choices = response.choices
        if choices and choices[0].message and choices[0].message.content:
            content_str = choices[0].message.content
        if not content_str:
            finish_reason = (
                choices[0].finish_reason
                if (choices and choices[0].finish_reason)
                else "unknown"
            )
            print(
                f"Warning: Received non-standard or empty response content for original index {original_df_index}. "
                f"Finish Reason: '{finish_reason}'. Message: 'No content in message.'. Raw Choices: {choices}"
            )
            return None

        estimate_data = json.loads(content_str)
        lower_bound_dict = estimate_data.get("lower_bound_estimate")
        upper_bound_dict = estimate_data.get("upper_bound_estimate")
        if not (isinstance(lower_bound_dict, dict) and isinstance(upper_bound_dict, dict)):
            print(
                f"Warning: Missing or malformed estimate dicts in JSON for original index {original_df_index}. Content: '{content_str}'"
            )
            return None

        estimate = {}
        is_valid_item = True
        for prefix, bound in (("lb", lower_bound_dict), ("ub", upper_bound_dict)):
            qty_raw = bound.get("quantity")
            unit_raw = bound.get("unit")
            if _is_valid_quantity(qty_raw):
                estimate[f"{prefix}_estimate_qty"] = float(qty_raw)
            else:
                print(
                    f"Warning: Invalid {prefix}_quantity for original index {original_df_index}: {qty_raw}"
                )
                is_valid_item = False
            if unit_raw in ALLOWED_UNITS:
                estimate[f"{prefix}_estimate_unit"] = unit_raw
            else:
                print(
                    f"Warning: Invalid {prefix}_unit for original index {original_df_index}: '{unit_raw}'"
                )
                is_valid_item = False
        return estimate if is_valid_item else None
    except json.JSONDecodeError:
        print(
            f"Warning: Could not decode JSON for original index {original_df_index}. Content received: '{content_str}'"
        )
    except AttributeError as ae:
        # Also reached when the batch call returned a non-response object for this item.
        print(
            f"Warning: Missing expected attribute processing response for original index {original_df_index}: {ae}. Response: {response}"
        )
    except Exception as e:
        print(
            f"Warning: An unexpected error occurred processing response for original index {original_df_index}: {type(e).__name__} - {e}. Response: {response}"
        )
    return None


def create_task_estimates():
    """Fill in missing time estimates in FILENAME by querying MODEL in batches.

    Rows whose ``lb_estimate_qty`` is missing are sent to
    ``litellm.batch_completion`` in chunks of CHUNK_SIZE. Valid answers are
    written into the four estimate columns and the CSV is saved after every
    chunk that produced at least one estimate, so an interrupted run can be
    resumed. Between chunks the function sleeps as needed to stay under
    RATE_LIMIT requests per minute.

    Raises:
        SystemExit: status 1 when FILENAME is missing or unreadable; status 0
            when there is nothing (valid) left to process.
    """
    df = _load_tasks_frame()

    # --- Identify rows to process (missing lower-bound quantity) ---
    unprocessed_mask = df["lb_estimate_qty"].isna()
    if not unprocessed_mask.any():
        print(
            "All rows seem to have estimates already (based on 'lb_estimate_qty'). Exiting."
        )
        raise SystemExit(0)
    print(
        f"Resuming processing. First unprocessed row found at index {unprocessed_mask.idxmax()}."
    )

    messages_list, valid_original_indices = _build_messages(df.loc[unprocessed_mask])
    if not messages_list:
        print("No valid rows found to process after checking required data. Exiting.")
        raise SystemExit(0)

    # --- Call batch_completion in chunks with rate limiting and periodic saving ---
    total_messages_to_send = len(messages_list)
    num_chunks = math.ceil(total_messages_to_send / CHUNK_SIZE)
    print(
        f"\nStarting batch completion for {total_messages_to_send} items in {num_chunks} chunks..."
    )
    # Seconds one request "costs" under the rate limit (loop-invariant).
    time_per_request = SECONDS_PER_MINUTE / RATE_LIMIT if RATE_LIMIT > 0 else 0
    overall_start_time = time.time()
    processed_count_total = 0

    for i in range(num_chunks):
        start = i * CHUNK_SIZE
        end = min(start + CHUNK_SIZE, total_messages_to_send)
        message_chunk = messages_list[start:end]
        chunk_original_indices = valid_original_indices[start:end]
        print(
            f"\nProcessing chunk {i + 1}/{num_chunks} (Messages {start + 1}-{end} of this run)..."
            f" Corresponding to original indices: {min(chunk_original_indices)} - {max(chunk_original_indices)}"
        )

        chunk_start_time = time.time()
        try:
            print(f"Sending {len(message_chunk)} requests for chunk {i + 1}...")
            responses = litellm.batch_completion(
                model=MODEL,
                messages=message_chunk,
                response_format={
                    "type": "json_schema",
                    "json_schema": SCHEMA_FOR_VALIDATION,
                },
                num_retries=3,
            )
            print(f"Chunk {i + 1} API call completed.")
        except Exception as e:
            print(f"Error during litellm.batch_completion for chunk {i + 1}: {e}")
            # Keep the list aligned with message_chunk so every item is reported as failed.
            responses = [None] * len(message_chunk)

        # --- Process responses for the current chunk ---
        chunk_updates = {}  # {original_df_index: {column: value}}
        if responses and len(responses) == len(message_chunk):
            for original_df_index, response in zip(chunk_original_indices, responses):
                estimate = _parse_estimate(response, original_df_index)
                if estimate is not None:
                    chunk_updates[original_df_index] = estimate
        else:
            print(
                f"Warning: Mismatch between number of responses ({len(responses) if responses else 0}) "
                f"and messages sent ({len(message_chunk)}) for chunk {i + 1}, or no responses. Marking all as failed."
            )
        successful_in_chunk = len(chunk_updates)
        failed_in_chunk = len(message_chunk) - successful_in_chunk
        print(
            f"Chunk {i + 1} processing summary: Success={successful_in_chunk}, Failed/Skipped={failed_in_chunk}"
        )
        processed_count_total += successful_in_chunk

        # --- Update main DataFrame and save progress ---
        if chunk_updates:
            print(
                f"Updating main DataFrame with {len(chunk_updates)} new estimates for chunk {i + 1}..."
            )
            for idx, estimates in chunk_updates.items():
                for column, value in estimates.items():
                    df.loc[idx, column] = value
            print(f"Saving progress to {FILENAME}...")
            save_dataframe(df, FILENAME)
        else:
            print(f"No successful estimates obtained in chunk {i + 1} to save.")

        # --- Rate limiting pause (not needed after the last chunk) ---
        chunk_duration = time.time() - chunk_start_time
        print(f"Chunk {i + 1} took {chunk_duration:.2f} seconds.")
        if i < num_chunks - 1:
            # Minimum time this chunk should have taken to respect the limit.
            min_chunk_duration_for_rate = len(message_chunk) * time_per_request
            pause_needed = max(0, min_chunk_duration_for_rate - chunk_duration)
            if pause_needed > 0:
                print(
                    f"Pausing for {pause_needed:.2f} seconds to respect rate limit ({RATE_LIMIT}/min)..."
                )
                time.sleep(pause_needed)

    total_duration_minutes = (time.time() - overall_start_time) / 60
    print(
        f"\nBatch completion finished."
        f" Processed {processed_count_total} new estimates in this run in {total_duration_minutes:.2f} minutes."
    )
    print(f"Performing final save to {FILENAME}...")
    save_dataframe(df, FILENAME)
    print("\nScript finished.")

View file

@ -1,528 +0,0 @@
import os
import litellm
import sqlite3
import numpy as np
import pandas as pd
from google.colab import userdata, files
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
# Copy the API keys from the Colab secret store into the process environment.
for _secret_name in ('OPENAI_API_KEY', 'GEMINI_API_KEY'):
    os.environ[_secret_name] = userdata.get(_secret_name)

# Two-digit major occupation group code -> display label.
occupation_major_codes = {
    '11': 'Management',
    '13': 'Business and Financial Operations',
    '15': 'Computer and Mathematical Occupations',
    '17': 'Architecture and Engineering',
    '19': 'Life, Physical, and Social Science',
    '21': 'Community and Social Services',
    '23': 'Legal',
    '25': 'Education, Training, and Library',
    '27': 'Arts, Design, Entertainment, Sports, and Media',
    '29': 'Healthcare Practitioners and Technical',
    '31': 'Healthcare Support',
    '33': 'Protective Service',
    '35': 'Food Preparation and Serving Related',
    '37': 'Building and Grounds Cleaning and Maintenance',
    '39': 'Personal Care and Service',
    '41': 'Sales and Related',
    '43': 'Office and Administrative Support',
    '45': 'Farming, Fishing, and Forestry',
    '47': 'Construction and Extraction',
    '49': 'Installation, Maintenance, and Repair',
    '51': 'Production',
    '53': 'Transportation and Material Moving',
    '55': 'Military Specific',
}

# Colour palettes keyed by shade name (as strings), lightest to darkest.
_SHADE_KEYS = ('50', '100', '200', '300', '400', '500',
               '600', '700', '800', '900', '950')
gray = dict(zip(_SHADE_KEYS, (
    '#f8fafc', '#f1f5f9', '#e2e8f0', '#cbd5e1', '#94a3b8', '#64748b',
    '#475569', '#334155', '#1e293b', '#0f172a', '#020617',
)))
lime = dict(zip(_SHADE_KEYS, (
    '#f7fee7', '#ecfcca', '#d8f999', '#bbf451', '#9ae600', '#83cd00',
    '#64a400', '#497d00', '#3c6300', '#35530e', '#192e03',
)))

# Global matplotlib theme shared by every plot below.
_plot_theme = {
    'figure.facecolor': gray['50'],
    'axes.facecolor': gray['50'],
    'axes.edgecolor': gray['100'],
    'axes.labelcolor': gray['700'],
    'xtick.color': gray['700'],
    'ytick.color': gray['700'],
    'font.family': 'Inter',  # falls back to DejaVu if Inter not present
    'font.size': 11,
}
mpl.rcParams.update(_plot_theme)

sns.set_style("white")  # keep minimal axes, we will remove default grid
sns.set_context("notebook")
def prepare_tasks():
    """Load the raw inputs and build the task tables used by the analysis cells.

    Inputs read from the working directory:
      * ``epoch_task_data.csv`` - remote / not-remote label per O*NET task, from
        https://epoch.ai/gradient-updates/consequences-of-automating-remote-work
        (labelled by GPT-4o; download:
        https://drive.google.com/file/d/1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r/view?usp=sharing)
      * ``oesm23national.xlsx`` - BLS OEWS, https://www.bls.gov/oes/special-requests/oesm23nat.zip
      * ``task_ratings_enriched.json`` - produced by ``uv run ./enrich_task_ratings.py``
      * ``tasks_estimateable.csv`` - produced by ``uv run classify_estimateability_of_tasks.py``

    Returns:
        dict with the frames the later cells refer to by name:
        ``df_tasks``, ``df_remote_tasks``, ``df_oesm`` and
        ``df_task_estimateable``. (Previously these were discarded locals.)

    ``df_remote_tasks`` is the input for ``create_task_estimates()``, which
    creates ``tasks_with_estimates.csv``.
    """
    df_remote_status = pd.read_csv("epoch_task_data.csv")
    df_oesm = pd.read_excel("oesm23national.xlsx")
    df_tasks = pd.read_json("task_ratings_enriched.json")
    df_task_estimateable = (
        pd.read_csv("tasks_estimateable.csv")
        .rename(columns={"task_estimateable": "estimateable"})
        .drop_duplicates(subset=['task'], keep='first')
    )

    # Add remote_status ("remote" or "not remote"). The lookup is reduced to
    # one row per task text first, like the estimateable table above, so the
    # left merge cannot multiply task rows.
    remote_lookup = df_remote_status[['Task', 'Remote']].drop_duplicates(
        subset=['Task'], keep='first'
    )
    df_tasks = pd.merge(df_tasks, remote_lookup, left_on='task', right_on='Task', how='left')
    df_tasks = df_tasks.drop('Task', axis=1).rename(columns={'Remote': 'remote_status'})

    # Add estimateable ("ATOMIC" or "ONGOING-CONSTRAINT").
    df_tasks = pd.merge(
        df_tasks, df_task_estimateable[['task', 'estimateable']], on='task', how='left'
    )

    # NOTE(review): this keeps tasks with importance_average < 3, whereas
    # preprocessing_time_estimates() keeps > 3 - confirm which is intended.
    df_tasks = df_tasks[df_tasks['importance_average'] < 3].copy()
    df_tasks['onetsoc_major'] = df_tasks['onetsoc_code'].str[:2]
    df_remote_tasks = df_tasks[df_tasks['remote_status'] == 'remote'].copy()

    return {
        'df_tasks': df_tasks,
        'df_remote_tasks': df_remote_tasks,
        'df_oesm': df_oesm,
        'df_task_estimateable': df_task_estimateable,
    }
# Minutes in one of each unit the estimates may be expressed in.
_MINUTES_PER_UNIT = {
    "minute": 1,
    "hour": 60,
    "day": 60 * 24,
    "week": 60 * 24 * 7,
    "month": 60 * 24 * 30,
    "trimester": 60 * 24 * 90,
    "semester": 60 * 24 * 180,
    "year": 60 * 24 * 365,
}


def convert_to_minutes(qty, unit):
    """Convert ``qty`` expressed in ``unit`` to minutes.

    Returns NaN when ``unit`` is missing (None/NaN) so rows without an
    estimate flow through to the missing-value checks instead of raising.
    An unknown, non-missing unit still raises ``KeyError``.
    """
    if pd.isna(unit):
        return float("nan")
    return qty * _MINUTES_PER_UNIT[unit]


def preprocessing_time_estimates():
    """Load the task estimates, derive minute-based columns and sanity-check them.

    Reads ``tasks_with_estimates.csv``, ``tasks_with_embeddings.parquet``
    (produced by ``uv run ./embed_task_description.py``; one embedding per
    unique task) and ``tasks_estimateable.csv``. Keeps tasks with
    ``importance_average > 3`` and adds the lower/upper bounds in minutes
    plus their range, ratio and midpoint.

    Returns:
        ``(df, atomic_tasks, ongoing_tasks)`` - the full frame and its
        ``ATOMIC`` / ``ONGOING-CONSTRAINT`` subsets (as copies).
    """
    import builtins

    # IPython puts display() into builtins; fall back to print elsewhere.
    show = getattr(builtins, "display", print)

    df = pd.read_csv("tasks_with_estimates.csv")
    df = df[df['importance_average'] > 3].copy()

    # The parquet column is 'task_embedding'; it is renamed to 'embedding_vector'.
    df_task_embeddings = (
        pd.read_parquet("tasks_with_embeddings.parquet")
        .drop_duplicates(subset=['task'])[['task', 'task_embedding']]
        .rename(columns={"task_embedding": "embedding_vector"})
        .copy()
    )
    df = pd.merge(df, df_task_embeddings[['task', 'embedding_vector']], on='task', how='left')

    # Loaded here, the same way prepare_tasks() does, because that function's
    # frame is not visible in this scope.
    df_task_estimateable = (
        pd.read_csv("tasks_estimateable.csv")
        .rename(columns={"task_estimateable": "estimateable"})
        .drop_duplicates(subset=['task'], keep='first')
    )
    df = pd.merge(df, df_task_estimateable[['task', 'estimateable']], on='task', how='left')
    df['onetsoc_major'] = df['onetsoc_code'].str[:2]

    df['lb_estimate_in_minutes'] = df.apply(
        lambda row: convert_to_minutes(row['lb_estimate_qty'], row['lb_estimate_unit']), axis=1
    )
    df['ub_estimate_in_minutes'] = df.apply(
        lambda row: convert_to_minutes(row['ub_estimate_qty'], row['ub_estimate_unit']), axis=1
    )
    df['estimate_range'] = df.ub_estimate_in_minutes - df.lb_estimate_in_minutes
    df['estimate_ratio'] = df.ub_estimate_in_minutes / df.lb_estimate_in_minutes
    df['estimate_midpoint'] = (df.lb_estimate_in_minutes + df.ub_estimate_in_minutes) / 2

    # Copies, so later cells can add columns without chained-assignment warnings.
    atomic_tasks = df[df['estimateable'] == 'ATOMIC'].copy()
    ongoing_tasks = df[df['estimateable'] == 'ONGOING-CONSTRAINT'].copy()

    with pd.option_context('display.max_columns', None):
        show(df)

    # Check for empty estimates
    for column in ('lb_estimate_in_minutes', 'ub_estimate_in_minutes'):
        missing = atomic_tasks[column].isnull().sum()
        if missing > 0:
            print(f"Missing values in '{column}':", missing)

    # Check for impossible bounds
    impossible_bounds = atomic_tasks[
        (atomic_tasks['lb_estimate_in_minutes'] <= 0)
        | (atomic_tasks['ub_estimate_in_minutes'] <= 0)
        | (atomic_tasks['lb_estimate_in_minutes'] > atomic_tasks['ub_estimate_in_minutes'])
    ]
    if not impossible_bounds.empty:
        print("Error: Found rows with impossible bounds.")
        with pd.option_context('display.max_colwidth', None):
            show(impossible_bounds[['task', 'lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'dwas']])

    return df, atomic_tasks, ongoing_tasks
def cell1():
    """Plot a histogram of the atomic tasks' midpoint estimates on a log scale."""
    midpoints = atomic_tasks['estimate_midpoint']
    sns.histplot(midpoints, log_scale=True)
def cell2():
    """Box plot of the estimate range (upper - lower, minutes) per major occupation group."""
    plt.figure(figsize=(14, 10))
    sns.boxplot(
        x='onetsoc_major',  # 11 = Management, 15 = Computer/Math, ...
        y='estimate_range',
        data=atomic_tasks,
        showfliers=False,
    )
    plt.yscale('log')  # long tail => log scale
    plt.title('Spread of time-range estimates per occupation')
    plt.xlabel('Occupation')
    plt.ylabel('Range (upper-lower, minutes)')

    # Swap the two-digit codes on the x axis for their readable labels.
    axis = plt.gca()
    readable = []
    for tick in axis.get_xticklabels():
        readable.append(occupation_major_codes[tick.get_text()])
    axis.set_xticklabels(readable, rotation=60, ha='right')
def cell3():
    """Log-log scatter of lower vs upper bound estimates, coloured by occupation group."""
    plt.figure(figsize=(10, 10))

    # Replace the occupation codes with labels so the legend is readable.
    labelled = atomic_tasks.replace({'onetsoc_major': occupation_major_codes})
    ax = sns.scatterplot(
        x='lb_estimate_in_minutes',
        y='ub_estimate_in_minutes',
        hue="onetsoc_major",
        data=labelled,
        alpha=0.2,
        edgecolor=None,
    )

    # 45-degree reference line (upper == lower)
    largest = atomic_tasks[['lb_estimate_in_minutes', 'ub_estimate_in_minutes']].max().max()
    lims = (1, largest)
    ax.plot(lims, lims, color='black', linestyle='--', linewidth=1)

    # Helper lines at 2x, 10x and 100x upper/lower ratios
    for factor in (2, 10, 100):
        scaled = [factor * value for value in lims]
        ax.plot(lims, scaled, linestyle=':', color='grey', linewidth=1)

    ax.set(xscale='log', yscale='log')
    ax.set_title('Lower vs upper estimates for all tasks')
    ax.set_xlabel('Lower-bound (min, log scale)')
    ax.set_ylabel('Upper-bound (min, log scale)')

    # Place the legend outside the plot
    ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
def cell4():
    """Histogram of log10(upper / lower) with reference lines at 1x, 1.05x and 10x."""
    plt.figure(figsize=(8, 4))

    # Infinite ratios are treated as missing and dropped.
    ratios = atomic_tasks['estimate_ratio'].replace([np.inf, -np.inf], np.nan).dropna()
    sns.histplot(np.log10(ratios), bins=60, kde=True)

    plt.axvline(np.log10(10), color='red', ls='--', lw=1, label='10×')
    plt.axvline(np.log10(1.05), color='orange', ls='--', lw=1, label='1.05×')
    plt.axvline(0, color='black', ls='-', lw=1)  # ub = lb

    plt.title('Distribution of upper:lower ratio')
    plt.xlabel('log₁₀(upper / lower)')
    plt.ylabel('Count')
    plt.legend()
    plt.tight_layout()
def cell5():
    """Heatmap of the median upper/lower ratio by occupation group and lower-bound quartile.

    Adds an ``lb_q`` column to ``atomic_tasks`` as a side effect.
    """
    # Bin lower bounds into quartiles (Q1-Q4)
    quartile_names = ['Q1 shortest', 'Q2', 'Q3', 'Q4 longest']
    atomic_tasks['lb_q'] = pd.qcut(
        atomic_tasks.lb_estimate_in_minutes, q=4, labels=quartile_names
    )

    # Median ratio per (occupation group, quartile) cell
    pivot = atomic_tasks.pivot_table(
        values='estimate_ratio',
        index='onetsoc_major',
        columns='lb_q',
        aggfunc='median',
    )
    # Show occupation labels instead of the two-digit codes
    pivot.index = pivot.index.map(occupation_major_codes)

    plt.figure(figsize=(10, 8))
    colorbar_options = {'label': 'Median upper/lower ratio'}
    sns.heatmap(pivot, cmap='RdYlGn_r', center=2, annot=True, fmt='.1f',
                cbar_kws=colorbar_options)
    plt.title('Typical range width by occupation and task length')
    plt.xlabel('Lower-bound quartile')
    plt.ylabel('Occupation (major group)')
    plt.tight_layout()
def cell6():
    """Find tasks whose midpoint estimate is a z-score outlier within their occupation.

    For each ``onetsoc_code`` the mean and standard deviation of
    ``estimate_midpoint`` are computed, every task is scored with
    ``z = (midpoint - mean) / std``, and rows with ``|z| > 3`` are returned.

    ``atomic_tasks`` is only read: the scored frame is a local, because
    rebinding the name inside the function would make it local and fail on
    the first read.

    Returns:
        DataFrame of the outlier rows, including the ``mean``, ``std`` and
        ``z`` columns.
    """
    agg = (
        atomic_tasks
        .groupby('onetsoc_code')['estimate_midpoint']
        .agg(
            median='median',
            q1=lambda x: x.quantile(.25),
            q3=lambda x: x.quantile(.75),
            mean='mean',
            std='std',
        )
        .reset_index()
    )
    agg['IQR'] = agg.q3 - agg.q1
    agg['CV'] = agg['std'] / agg['mean']  # coefficient of variation

    # Merge the group mean and std back so each row can be scored.
    scored = atomic_tasks.merge(agg[['onetsoc_code', 'mean', 'std']], on='onetsoc_code')
    scored['z'] = (scored['estimate_midpoint'] - scored['mean']) / scored['std']
    return scored.loc[scored['z'].abs() > 3]
def cell7():
    """Add a ``robust_z`` column to ``atomic_tasks``: a MAD-based z-score per occupation."""
    from scipy.stats import median_abs_deviation

    def mad_z(series):
        """Score a series as (value - median) / MAD."""
        center = series.median()
        spread = median_abs_deviation(series, scale='normal')  # comparable to sigma
        return (series - center) / spread

    midpoints_by_occupation = atomic_tasks.groupby('onetsoc_code')['estimate_midpoint']
    atomic_tasks['robust_z'] = midpoints_by_occupation.transform(mad_z)
def cell10():
    """Stacked horizontal bars: share of tasks per occupation group that are
    not remote, remote + ATOMIC, or remote + ONGOING-CONSTRAINT."""
    import matplotlib.colors as mcolors  # For color conversion
    import matplotlib.ticker as mtick  # For percentage formatting

    def get_contrasting_text_color(bg_color_hex_or_rgba):
        """
        Pick the text color ('black' or 'white') that contrasts with a background.
        bg_color_hex_or_rgba: a color string (e.g. '#RRGGBB') or an RGBA tuple with values in [0, 1].
        """
        rgba = bg_color_hex_or_rgba
        if isinstance(rgba, str):
            rgba = mcolors.to_rgba(rgba)
        red, green, blue, _ = rgba  # alpha is ignored
        luminance = 0.2126 * red + 0.7152 * green + 0.0722 * blue
        if luminance > 0.55:
            return 'black'
        return 'white'

    # --- Count tasks per occupation group and category ---
    summary_data = []
    for code, label in occupation_major_codes.items():
        in_group = df_tasks[df_tasks['onetsoc_major'] == code]
        group_size = len(in_group)
        if group_size == 0:
            continue  # no tasks for this occupation group

        non_remote = in_group[in_group['remote_status'] != 'remote']
        remote_only = in_group[in_group['remote_status'] == 'remote']
        atomic_remote = remote_only[remote_only['estimateable'] == 'ATOMIC']
        ongoing_remote = remote_only[remote_only['estimateable'] == 'ONGOING-CONSTRAINT']

        summary_data.append({
            'onetsoc_major_code': code,
            'occupation_label': label,
            'count_not_remote': len(non_remote),
            'count_remote_atomic': len(atomic_remote),
            'count_remote_ongoing': len(ongoing_remote),
            'total_tasks': group_size,
        })
    summary_df = pd.DataFrame(summary_data)

    # --- Convert counts to percentages of each group's tasks ---
    summary_df = summary_df[summary_df['total_tasks'] > 0].copy()
    for category in ('not_remote', 'remote_atomic', 'remote_ongoing'):
        share = summary_df[f'count_{category}'] / summary_df['total_tasks']
        summary_df[f'pct_{category}'] = share * 100

    pct_columns = ['pct_not_remote', 'pct_remote_atomic', 'pct_remote_ongoing']
    plot_df = summary_df.set_index('occupation_label')[pct_columns]
    # Clearer legend names
    plot_df.columns = ['Not Remote', 'Remote + Estimable', 'Remote + Not estimable']
    plot_df = plot_df.sort_values(by='Not Remote', ascending=False)

    # --- Plot ---
    # Color order matches the column order of plot_df.
    bar_colors = [gray["300"], lime["500"], lime["200"]]
    fig, ax = plt.subplots(figsize=(14, 10))
    plot_df.plot(kind='barh', stacked=True, ax=ax, color=bar_colors)

    ax.set_title("Task Breakdown by Occupation, Remote Status, and Estimateability", fontsize=14, pad=20)
    ax.set_xlabel("Percentage of Tasks (%)", fontsize=12)
    ax.set_ylabel("Occupation Major Group", fontsize=12)
    ax.xaxis.set_major_formatter(mtick.PercentFormatter())
    plt.xlim(0, 100)  # x axis spans 0 to 100%
    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)

    # Write the percentage inside every segment wider than 3%.
    for segment_color, container in zip(bar_colors, ax.containers):
        text_color = get_contrasting_text_color(segment_color)
        for patch in container.patches:
            width = patch.get_width()
            if width <= 3:
                continue
            ax.text(
                patch.get_x() + width / 2,
                patch.get_y() + patch.get_height() / 2,
                f"{width:.1f}%",
                ha='center',
                va='center',
                fontsize=8,
                color=text_color,
                fontweight='medium',
            )

    plt.legend(title="Task Category", bbox_to_anchor=(1.02, 1), loc='upper left', frameon=False)
def cell11():
    """Bar chart of the total wage bill per major occupation group, in billions.

    Wage bill = total employment (``TOT_EMP``) * annual mean wage (``A_MEAN``).
    ``df_oesm`` is modified in place: the two columns are coerced to numeric,
    rows missing either value are dropped, and ``onetsoc_major`` and
    ``wage_bill`` columns are added.

    NOTE(review): another ``cell11`` is defined later in this file and
    replaces this one at import time.
    """
    df_oesm['onetsoc_major'] = df_oesm['OCC_CODE'].str[:2]

    # Non-numeric placeholders become NaN and are dropped below.
    df_oesm['TOT_EMP'] = pd.to_numeric(df_oesm['TOT_EMP'], errors='coerce')
    df_oesm['A_MEAN'] = pd.to_numeric(df_oesm['A_MEAN'], errors='coerce')
    df_oesm.dropna(subset=['TOT_EMP', 'A_MEAN', 'onetsoc_major'], inplace=True)
    df_oesm['wage_bill'] = df_oesm['TOT_EMP'] * df_oesm['A_MEAN']

    # The OEWS file is expected to carry aggregate rows (total/major/minor/
    # broad) next to the detailed occupations, flagged in O_GROUP; summing
    # every row sharing a two-digit prefix would count the same employment
    # several times. Assumes the value 'detailed' - verify against the file.
    rows = df_oesm
    if 'O_GROUP' in df_oesm.columns:
        rows = df_oesm[df_oesm['O_GROUP'] == 'detailed']

    df_wage_bill_major = rows.groupby('onetsoc_major')['wage_bill'].sum().reset_index()
    df_wage_bill_major['OCC_TITLE_MAJOR'] = df_wage_bill_major['onetsoc_major'].map(occupation_major_codes)
    # The axis is labelled in billions, so plot the scaled value.
    df_wage_bill_major['wage_bill_billions'] = df_wage_bill_major['wage_bill'] / 1e9
    df_wage_bill_major = df_wage_bill_major.sort_values('wage_bill', ascending=False)

    plt.figure(figsize=(12, 8))
    sns.barplot(x='wage_bill_billions', y='OCC_TITLE_MAJOR', data=df_wage_bill_major, palette="viridis")
    plt.title('Total Wage Bill per Major Occupation Group')
    plt.xlabel('Total Wage Bill (in billions)')
    plt.ylabel('Major Occupation Group')
    plt.grid(axis='x', linestyle='--', alpha=0.7)
def cell11():
    """Draw the empirical CDFs of the task time estimates on a log time axis.

    Three step curves are drawn from the module-level ``atomic_tasks`` frame:
    the lower-bound estimate, the upper-bound estimate and their mid-point,
    all read from the ``*_estimate_in_minutes`` columns. Colours come from the
    module-level ``lime`` and ``gray`` palettes.
    """

    def cdf(series):
        # Sorted values paired with the cumulative share of observations, in percent.
        ordered = series.sort_values().reset_index(drop=True)
        return ordered.values, ((ordered.index + 1) / len(ordered)) * 100

    lower_bound = atomic_tasks['lb_estimate_in_minutes']
    upper_bound = atomic_tasks['ub_estimate_in_minutes']
    x_lb, y_lb = cdf(lower_bound)
    x_ub, y_ub = cdf(upper_bound)
    x_mid, y_mid = cdf((upper_bound + lower_bound) / 2)

    fig, ax = plt.subplots(figsize=(10, 6))

    # Horizontal reference lines every 10 %.
    for percent in range(0, 101, 10):
        ax.axhline(percent, color=gray['100'], linewidth=.8, zorder=1)

    # One entry per curve; the mid-point has the highest zorder so it stays
    # visible where the curves overlap.
    curves = [
        (x_lb, y_lb, dict(color=lime['300'], linewidth=1.8, linestyle='--', zorder=2,
                          label='Lower bound estimate (CDF)')),
        (x_ub, y_ub, dict(color=lime['900'], linewidth=1.8, linestyle=':', zorder=3,
                          label='Upper bound estimate (CDF)')),
        (x_mid, y_mid, dict(color=lime['600'], linewidth=2.2, zorder=4,
                            label='Mid-point estimate (CDF)')),
    ]
    for xs, ys, style in curves:
        ax.step(xs, ys, where='post', **style)

    # Axes limits / scales; the y axis is rendered as percentages.
    ax.set_ylim(0, 100)
    ax.set_xscale('log')
    ax.yaxis.set_major_formatter(mpl.ticker.PercentFormatter(decimals=0))

    # The y label sits above the top-left corner of the plotting area.
    ax.text(-0.06, 1.03,
            "% of tasks with temporal coherence ≤ X",
            ha='left', va='bottom',
            transform=ax.transAxes,
            fontsize=12, fontweight='semibold')

    # Human-friendly durations (in minutes) used as x ticks.
    durations = [
        (1, '1 min'), (5, '5 min'), (10, '10 min'), (30, '30 min'),
        (60, '1 hour'), (120, '2 hours'), (240, '4 hours'), (480, '8 hours'),
        (1440, '1 day'), (2880, '2 days'), (10080, '1 week'), (43200, '30 days'),
        (129600, '90 days'), (259200, '180 days'), (525600, '1 year'),
    ]
    ticks = [minutes for minutes, _ in durations]
    ticklabels = [text for _, text in durations]

    # Vertical reference line at every tick.
    for minutes in ticks:
        ax.axvline(minutes, color=gray['300'], linewidth=.8, linestyle='--', zorder=1)
    ax.set_xticks(ticks)
    ax.set_xticklabels(ticklabels, rotation=45, ha='right')

    # Hide the top/right frame and soften the remaining spines.
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    for side in ('left', 'bottom'):
        ax.spines[side].set_edgecolor(gray['300'])

    ax.legend(frameon=False, loc='lower right')

    # The x label is placed manually below the rotated tick labels.
    ax.text(0.5, -0.3,
            'Temporal coherence (X)',
            ha='center', va='center',
            transform=ax.transAxes,
            fontsize=12, fontweight='semibold')

View file

@ -1,411 +0,0 @@
import pandas as pd
import litellm
import dotenv
import os
import time
import json
import math
# Load environment variables (override=True is passed to dotenv explicitly).
dotenv.load_dotenv(override=True)
# litellm._turn_on_debug() # Optional debugging
# --- Configuration ---
MODEL = "gpt-4.1-mini" # Make sure this model supports json_schema or structured output
# RATE_LIMIT and SECONDS_PER_MINUTE feed the pause computed after each chunk.
RATE_LIMIT = 5000 # Requests per minute
# Each chunk of unique tasks is sent in one litellm.batch_completion call.
CHUNK_SIZE = 300 # Number of unique tasks per API call
SECONDS_PER_MINUTE = 60
# File configuration
# CLASSIFICATION_FILENAME is both read (to resume) and rewritten after every chunk.
CLASSIFICATION_FILENAME = "tasks_estimateable.csv" # Output file with classifications
# Source CSV used only to create CLASSIFICATION_FILENAME when it does not exist yet.
TASK_SOURCE_FOR_INIT_FILENAME = "tasks_with_estimates.csv"
# Column that receives the category; rows where it is missing are treated as unprocessed.
OUTPUT_COLUMN_NAME = "task_estimateable"
# Only source rows whose SOURCE_FILTER_COLUMN equals SOURCE_FILTER_VALUE are copied over.
SOURCE_FILTER_COLUMN = "remote_status"
SOURCE_FILTER_VALUE = "remote"
# --- Prompts and Schema ---
# System prompt sent with every task; the text is part of the request, edit with care.
SYSTEM_PROMPT_CLASSIFY = """
Classify the provided O*NET task into one of these categories:
- ATOMIC (schedulable): A single, clearly-bounded activity, typically lasting minutes, hours, or a few days.
- ONGOING-CONSTRAINT (background role/ethical rule): A continuous responsibility or behavioural norm with no schedulable duration (e.g., follow confidentiality rules, serve as department head).
""".strip()
# User message template; {task} is replaced by the task label.
USER_MESSAGE_TEMPLATE_CLASSIFY = "Task: {task}"
# Accepted categories: used as the schema enum and to validate each response.
CLASSIFICATION_CATEGORIES = ["ATOMIC", "ONGOING-CONSTRAINT"]
# JSON schema passed as response_format so each reply is an object with one "task_category" field.
SCHEMA_FOR_CLASSIFICATION = {
"name": "classify_task_type",
"strict": True,
"schema": {
"type": "object",
"properties": {
"task_category": {
"type": "string",
"enum": CLASSIFICATION_CATEGORIES,
"description": "The classification of the task (ATOMIC or ONGOING-CONSTRAINT).",
}
},
"required": ["task_category"],
"additionalProperties": False,
},
}
def save_dataframe(df_to_save, filename):
    """Save a DataFrame to a CSV file through a temporary file and a rename.

    The frame is first written to ``<filename>.tmp`` and then moved over the
    target with ``os.replace``, so a crash mid-write does not leave a
    truncated CSV at ``filename``.

    Saving is best-effort: any error is reported on stdout and swallowed, and
    a leftover temporary file is removed when possible.

    Args:
        df_to_save: The pandas DataFrame to persist.
        filename: Destination path, as a ``str`` or ``os.PathLike`` object.
    """
    # Resolve the paths before the try block so the error handler can always
    # refer to temp_filename, and so path-like objects are accepted.
    target = os.fspath(filename)
    temp_filename = target + ".tmp"
    try:
        df_to_save.to_csv(temp_filename, encoding="utf-8-sig", index=False)
        os.replace(temp_filename, target)
    except Exception as e:
        print(f"--- Error saving DataFrame to {filename}: {e} ---")
        if os.path.exists(temp_filename):
            try:
                os.remove(temp_filename)
            except Exception as remove_err:
                print(
                    f"--- Error removing temporary save file {temp_filename}: {remove_err} ---"
                )
# --- Load or Initialize DataFrame ---
# Resume from CLASSIFICATION_FILENAME when it exists; otherwise build it from
# the rows of TASK_SOURCE_FOR_INIT_FILENAME that match the source filter.
# Errors terminate the script with a non-zero status via SystemExit rather
# than the interactive-only exit() helper.
try:
    if os.path.exists(CLASSIFICATION_FILENAME):
        df = pd.read_csv(CLASSIFICATION_FILENAME, encoding="utf-8-sig")
        print(f"Successfully read {len(df)} rows from {CLASSIFICATION_FILENAME}.")

        save_needed_after_load = False
        if OUTPUT_COLUMN_NAME not in df.columns:
            df[OUTPUT_COLUMN_NAME] = pd.NA
            print(f"Added '{OUTPUT_COLUMN_NAME}' column.")
            save_needed_after_load = True

        # Normalise blank markers to NA. The result is assigned back because
        # an in-place replace on a column selection is chained assignment and
        # is not guaranteed to modify df.
        df[OUTPUT_COLUMN_NAME] = df[OUTPUT_COLUMN_NAME].replace(["", None], pd.NA)

        # The column later receives category strings, so it must be able to
        # hold Python objects.
        if df[OUTPUT_COLUMN_NAME].dtype != object and not isinstance(
            df[OUTPUT_COLUMN_NAME].dtype, pd.StringDtype
        ):
            try:
                df[OUTPUT_COLUMN_NAME] = df[OUTPUT_COLUMN_NAME].astype(object)
                print(
                    f"Corrected dtype of '{OUTPUT_COLUMN_NAME}' to {df[OUTPUT_COLUMN_NAME].dtype}."
                )
                save_needed_after_load = True
            except Exception as e:
                print(
                    f"Warning: Could not convert column '{OUTPUT_COLUMN_NAME}' to object: {e}."
                )

        if "task" not in df.columns:
            print(
                f"Error: {CLASSIFICATION_FILENAME} must contain a 'task' column for processing."
            )
            raise SystemExit(1)

        if save_needed_after_load:
            print(f"Saving {CLASSIFICATION_FILENAME} after adding/adjusting column.")
            save_dataframe(df, CLASSIFICATION_FILENAME)
    else:
        print(
            f"{CLASSIFICATION_FILENAME} not found. Attempting to create it from {TASK_SOURCE_FOR_INIT_FILENAME}."
        )
        if not os.path.exists(TASK_SOURCE_FOR_INIT_FILENAME):
            print(
                f"Error: Source file {TASK_SOURCE_FOR_INIT_FILENAME} not found. Cannot create {CLASSIFICATION_FILENAME}."
            )
            raise SystemExit(1)

        df_source = pd.read_csv(TASK_SOURCE_FOR_INIT_FILENAME, encoding="utf-8-sig")
        required_source_cols_for_init = ["task", SOURCE_FILTER_COLUMN]
        missing_source_cols = [
            col for col in required_source_cols_for_init if col not in df_source.columns
        ]
        if missing_source_cols:
            print(
                f"Error: Source file {TASK_SOURCE_FOR_INIT_FILENAME} is missing required columns for initialization: {', '.join(missing_source_cols)}."
            )
            raise SystemExit(1)

        df_source_filtered = df_source[
            df_source[SOURCE_FILTER_COLUMN] == SOURCE_FILTER_VALUE
        ].copy()
        if df_source_filtered.empty:
            print(
                f"Warning: No tasks with '{SOURCE_FILTER_COLUMN}' == '{SOURCE_FILTER_VALUE}' found in {TASK_SOURCE_FOR_INIT_FILENAME}. "
                f"{CLASSIFICATION_FILENAME} will be created with schema but no tasks to classify initially."
            )

        # Only the task text is carried over; the output column starts empty.
        df = df_source_filtered[["task"]].copy()
        df[OUTPUT_COLUMN_NAME] = pd.NA
        df[OUTPUT_COLUMN_NAME] = df[OUTPUT_COLUMN_NAME].astype(object)
        print(
            f"Created {CLASSIFICATION_FILENAME} using tasks from {TASK_SOURCE_FOR_INIT_FILENAME} "
            f"(where {SOURCE_FILTER_COLUMN}='{SOURCE_FILTER_VALUE}'). New file has {len(df)} tasks."
        )
        save_dataframe(df, CLASSIFICATION_FILENAME)
# SystemExit derives from BaseException, so the handlers below do not
# intercept the deliberate exits raised above.
except FileNotFoundError:
    print("Error: A required file was not found. Please check paths.")
    raise SystemExit(1)
except Exception as e:
    print(f"Error during DataFrame loading or initialization: {e}")
    raise SystemExit(1)
# --- Identify Unique Tasks to Process ---
# "Nothing to do" outcomes end the script with status 0 via SystemExit rather
# than the interactive-only exit() helper.
if df.empty:
    print(f"{CLASSIFICATION_FILENAME} is empty. Nothing to process. Exiting.")
    raise SystemExit(0)

# A row is unprocessed while its output column is still missing.
initial_unprocessed_mask = df[OUTPUT_COLUMN_NAME].isna()
if not initial_unprocessed_mask.any():
    print(
        f"All tasks in {CLASSIFICATION_FILENAME} seem to have been classified already. Exiting."
    )
    raise SystemExit(0)

# Keep rows that are unprocessed AND carry a non-blank task string. The check
# is done per value so a column without any strings (e.g. all NaN) is handled
# instead of failing on the .str accessor.
has_task_text = (
    df["task"].map(lambda t: isinstance(t, str) and t.strip() != "").astype(bool)
)
valid_tasks_to_consider_df = df[initial_unprocessed_mask & has_task_text]
if valid_tasks_to_consider_df.empty:
    print(
        "No valid, unclassified tasks found to process (after filtering out empty/NaN task descriptions). Exiting."
    )
    raise SystemExit(0)

# Identical task texts are sent to the API only once.
unique_task_labels_for_api = (
    valid_tasks_to_consider_df["task"].drop_duplicates().tolist()
)
# Counts every row that is still NA, including rows with unusable task text.
total_rows_to_update_potentially = int(initial_unprocessed_mask.sum())
print(
    f"Found {total_rows_to_update_potentially} total rows in {CLASSIFICATION_FILENAME} needing classification."
)
print(
    f"Identified {len(unique_task_labels_for_api)} unique, valid task labels to send to the API."
)
# --- Prepare messages for batch completion (only for unique task labels) ---
print(f"Preparing messages for {len(unique_task_labels_for_api)} unique task labels...")
# One [system, user] conversation per unique task label, in the same order as
# unique_task_labels_for_api so responses can be matched back by position.
messages_list = [
    [
        {"role": "system", "content": SYSTEM_PROMPT_CLASSIFY},
        {
            "role": "user",
            "content": USER_MESSAGE_TEMPLATE_CLASSIFY.format(task=task_label),
        },
    ]
    for task_label in unique_task_labels_for_api
]
print(f"Prepared {len(messages_list)} message sets for batch completion.")
if not messages_list:
    # Only possible if unique_task_labels_for_api is empty, which the earlier
    # checks are meant to rule out; treated as an error, hence status 1.
    print(
        "No messages prepared, though unique tasks were identified. This is unexpected. Exiting."
    )
    raise SystemExit(1)
# --- Call batch_completion in chunks with rate limiting and periodic saving ---
# Overview: the prepared conversations are sent CHUNK_SIZE at a time. After each
# chunk the successful classifications are written into every still-unclassified
# row of df that has the same task text, the CSV is saved, and the loop sleeps
# if the chunk finished faster than RATE_LIMIT allows.
total_unique_tasks_to_send = len(
    messages_list
)  # Same as len(unique_task_labels_for_api)
num_chunks = math.ceil(total_unique_tasks_to_send / CHUNK_SIZE)
print(
    f"\nStarting batch classification for {total_unique_tasks_to_send} unique task labels in {num_chunks} chunks..."
)
overall_start_time = time.time()
processed_rows_count_total = 0  # Counts actual rows updated in the DataFrame
for i in range(num_chunks):
    # messages_list and unique_task_labels_for_api are index-aligned, so the
    # same slice bounds select the messages and their task labels.
    chunk_start_message_index = i * CHUNK_SIZE
    chunk_end_message_index = min((i + 1) * CHUNK_SIZE, total_unique_tasks_to_send)
    message_chunk = messages_list[chunk_start_message_index:chunk_end_message_index]
    # Get corresponding unique task labels for this chunk
    chunk_task_labels = unique_task_labels_for_api[
        chunk_start_message_index:chunk_end_message_index
    ]
    if not message_chunk:  # Should not happen if loop range is correct
        continue
    print(
        f"\nProcessing chunk {i + 1}/{num_chunks} (Unique Task Labels {chunk_start_message_index + 1}-{chunk_end_message_index} of this run)..."
    )
    chunk_start_time = time.time()
    responses = []
    try:
        print(
            f"Sending {len(message_chunk)} requests (for unique tasks) for chunk {i + 1}..."
        )
        responses = litellm.batch_completion(
            model=MODEL,
            messages=message_chunk,
            response_format={
                "type": "json_schema",
                "json_schema": SCHEMA_FOR_CLASSIFICATION,
            },
            num_retries=3,
        )
        print(f"Chunk {i + 1} API call completed.")
    except Exception as e:
        # A failure of the whole call is recorded as one None per message so
        # the per-response loop below counts each of them as failed.
        print(f"Error during litellm.batch_completion for chunk {i + 1}: {e}")
        responses = [None] * len(message_chunk)
    # --- Process responses for the current chunk ---
    # chunk_task_classifications maps {task_label: classification_category}
    chunk_task_classifications = {}
    successful_api_calls_in_chunk = 0
    failed_api_calls_in_chunk = 0
    # Responses are matched to task labels by position, which is only valid
    # when exactly one response came back per message.
    if responses and len(responses) == len(message_chunk):
        for j, response in enumerate(responses):
            current_task_label = chunk_task_labels[
                j
            ]  # The unique task label for this response
            content_str = None
            if response is None:
                print(
                    f"API call failed for task label '{current_task_label}' (response is None)."
                )
                failed_api_calls_in_chunk += 1
                continue
            try:
                if (
                    response.choices
                    and response.choices[0].message
                    and response.choices[0].message.content
                ):
                    content_str = response.choices[0].message.content
                    classification_data = json.loads(content_str)
                    category_raw = classification_data.get("task_category")
                    # Only categories from CLASSIFICATION_CATEGORIES are accepted.
                    if category_raw in CLASSIFICATION_CATEGORIES:
                        successful_api_calls_in_chunk += 1
                        chunk_task_classifications[current_task_label] = category_raw
                    else:
                        print(
                            f"Warning: Invalid or missing task_category for task label '{current_task_label}': '{category_raw}'. Content: '{content_str}'"
                        )
                        failed_api_calls_in_chunk += 1
                else:
                    # No usable content: report whatever diagnostics the
                    # response object exposes.
                    finish_reason = (
                        response.choices[0].finish_reason
                        if (response.choices and response.choices[0].finish_reason)
                        else "unknown"
                    )
                    error_message = (
                        response.choices[0].message.content
                        if (response.choices and response.choices[0].message)
                        else "No content in message."
                    )
                    print(
                        f"Warning: Received non-standard or empty response content for task label '{current_task_label}'. "
                        f"Finish Reason: '{finish_reason}'. Message: '{error_message}'. Raw Choices: {response.choices}"
                    )
                    failed_api_calls_in_chunk += 1
            except json.JSONDecodeError:
                print(
                    f"Warning: Could not decode JSON for task label '{current_task_label}'. Content received: '{content_str}'"
                )
                failed_api_calls_in_chunk += 1
            except AttributeError as ae:
                # Raised when the item lacks the attributes read above
                # (for example when it is not a regular response object).
                print(
                    f"Warning: Missing attribute processing response for task label '{current_task_label}': {ae}. Response: {response}"
                )
                failed_api_calls_in_chunk += 1
            except Exception as e:
                print(
                    f"Warning: Unexpected error processing response for task label '{current_task_label}': {type(e).__name__} - {e}. Response: {response}"
                )
                failed_api_calls_in_chunk += 1
    else:
        print(
            f"Warning: Mismatch between #responses ({len(responses) if responses else 0}) "
            f"and #messages sent ({len(message_chunk)}) for chunk {i + 1}, or no responses. Marking all API calls in chunk as failed."
        )
        failed_api_calls_in_chunk = len(message_chunk)
    # --- Update Main DataFrame and Save Periodically ---
    rows_updated_this_chunk = 0
    if chunk_task_classifications:
        print(
            f"Updating main DataFrame with classifications for {len(chunk_task_classifications)} unique tasks from chunk {i + 1}..."
        )
        for task_label, category in chunk_task_classifications.items():
            # Update all rows in the main df that match this task_label AND are still NA in the output column
            update_condition = (df["task"] == task_label) & (
                df[OUTPUT_COLUMN_NAME].isna()
            )
            num_rows_for_this_task_label = df[update_condition].shape[0]
            if num_rows_for_this_task_label > 0:
                df.loc[update_condition, OUTPUT_COLUMN_NAME] = category
                rows_updated_this_chunk += num_rows_for_this_task_label
        print(
            f"Updated {rows_updated_this_chunk} rows in the DataFrame based on this chunk's API responses."
        )
        # Saving after every productive chunk keeps completed work on disk
        # for a later run to resume from.
        print(f"Saving progress to {CLASSIFICATION_FILENAME}...")
        save_dataframe(df, CLASSIFICATION_FILENAME)
    else:
        print(
            f"No successful API classifications obtained in chunk {i + 1} to update DataFrame or save."
        )
    print(
        f"Chunk {i + 1} API summary: Successful Calls={successful_api_calls_in_chunk}, Failed/Skipped Calls={failed_api_calls_in_chunk}. "
        f"Rows updated in DataFrame this chunk: {rows_updated_this_chunk}"
    )
    processed_rows_count_total += rows_updated_this_chunk
    # --- Rate Limiting Pause ---
    chunk_end_time = time.time()
    chunk_duration = chunk_end_time - chunk_start_time
    print(f"Chunk {i + 1} (API calls and DF update) took {chunk_duration:.2f} seconds.")
    # No pause after the last chunk. Otherwise sleep for whatever remains of
    # the minimum time this many requests should take at RATE_LIMIT per minute.
    if i < num_chunks - 1:
        time_per_request = SECONDS_PER_MINUTE / RATE_LIMIT if RATE_LIMIT > 0 else 0
        min_chunk_duration_for_rate = (
            len(message_chunk) * time_per_request
        )  # Based on API calls made
        pause_needed = max(0, min_chunk_duration_for_rate - chunk_duration)
        if pause_needed > 0:
            print(
                f"Pausing for {pause_needed:.2f} seconds to respect rate limit ({RATE_LIMIT}/min)..."
            )
            time.sleep(pause_needed)
overall_end_time = time.time()
total_duration_minutes = (overall_end_time - overall_start_time) / 60
print(
    f"\nBatch classification finished."
    f" Updated {processed_rows_count_total} rows in '{CLASSIFICATION_FILENAME}' with new classifications in this run."
    f" Total duration: {total_duration_minutes:.2f} minutes."
)
# Final save, performed even when the last chunk produced no updates.
print(f"Performing final save to {CLASSIFICATION_FILENAME}...")
save_dataframe(df, CLASSIFICATION_FILENAME)
print("\nScript finished.")

View file

@ -1,85 +0,0 @@
#!/usr/bin/env bash
# Build a local SQLite copy of the O*NET database:
#   1. download the O*NET SQL dump (skipped when the zip is already present),
#   2. extract it (skipped when the extraction directory already exists),
#   3. recreate the SQLite file and import every *.sql file in one transaction,
#   4. switch the finished database to WAL mode, optimize and VACUUM it.

# Set database name and directories
ONET_DB_NAME="onet.database"
ONET_ZIP_URL="https://www.onetcenter.org/dl_files/database/db_29_1_mysql.zip"
ONET_ZIP_FILE="db_29_1_mysql.zip"
ONET_EXTRACT_DIR="db_29_1_mysql"

# Download O*NET database only if not already downloaded
if [ ! -f "$ONET_ZIP_FILE" ]; then
    echo "Downloading O*NET database from $ONET_ZIP_URL"
    # -f makes curl fail on HTTP errors instead of saving the server's error
    # page under the zip's name; wget is the fallback downloader.
    if ! { curl -fL -o "$ONET_ZIP_FILE" "$ONET_ZIP_URL" || wget -O "$ONET_ZIP_FILE" "$ONET_ZIP_URL"; }; then
        echo "Failed to download O*NET database"
        # Remove any partial file so the next run retries the download
        # instead of reusing a broken zip.
        rm -f "$ONET_ZIP_FILE"
        exit 1
    fi
else
    echo "Using existing O*NET database zip file"
fi

# Extract downloaded zip file only if extraction directory doesn't exist
if [ ! -d "$ONET_EXTRACT_DIR" ]; then
    echo "Extracting O*NET database files"
    if ! unzip -o "$ONET_ZIP_FILE"; then
        echo "Failed to extract O*NET database files"
        exit 1
    fi
else
    echo "Using existing extracted O*NET database files"
fi

# Remove existing database if it exists
if [ -f "$ONET_DB_NAME" ]; then
    echo "Removing existing database"
    rm "$ONET_DB_NAME"
fi

# Create the database and import all SQL files in one sqlite3 session.
# The tuning PRAGMAs only affect the connection that issues them, so they are
# sent on the same connection as the import, ahead of the transaction.
# foreign_keys is left at the connection default during the import, which is
# what the import effectively ran with when the PRAGMAs were issued from a
# separate sqlite3 process.
echo "Creating new SQLite database: $ONET_DB_NAME with performance settings"
echo "Executing SQL files in alphabetical order (single transaction mode)"
{
    cat << 'EOF'
PRAGMA journal_mode = OFF;
PRAGMA synchronous = 0;
PRAGMA cache_size = 1000000;
PRAGMA locking_mode = EXCLUSIVE;
PRAGMA temp_store = MEMORY;
BEGIN TRANSACTION;
EOF
    # NUL-delimited so file names containing spaces are handled; the SQL is
    # streamed to sqlite3 instead of being expanded into a heredoc.
    find "$ONET_EXTRACT_DIR" -name "*.sql" -print0 | sort -z | xargs -0 cat
    echo "COMMIT;"
} | sqlite3 "$ONET_DB_NAME"
import_status=$?

# Check if the execution was successful
if [ "$import_status" -ne 0 ]; then
    echo "Error executing SQL files in batch transaction"
    exit 1
fi

echo "Database populated successfully. Restoring reliability settings..."
# Restore reliability-focused settings after import.
# Of these, journal_mode = WAL is stored in the database file; the other
# PRAGMAs apply to this connection only, and optimize/VACUUM act on the file.
sqlite3 "$ONET_DB_NAME" << EOF
PRAGMA journal_mode = WAL;
PRAGMA synchronous = NORMAL;
PRAGMA locking_mode = NORMAL;
PRAGMA temp_store = DEFAULT;
PRAGMA foreign_keys = ON;
PRAGMA optimize;
VACUUM;
EOF
if [ $? -ne 0 ]; then
    echo "Warning: Failed to restore reliability settings, but database is populated"
else
    echo "Reliability settings restored successfully"
fi
echo "O*NET database created and optimized successfully!"

View file

@ -1,392 +0,0 @@
import sqlite3
import pandas as pd
import json
import os
from collections import defaultdict
import numpy as np
# --- Configuration ---
# Path of the SQLite database passed to fetch_data_from_db.
DB_FILE = "onet.database"
# Path of the JSON file passed to write_to_json.
OUTPUT_FILE = "task_ratings_enriched.json"
# --- Database Interaction ---
def fetch_data_from_db(db_path):
    """
    Load task ratings and their Detailed Work Activities (DWAs) from SQLite.

    A single query joins task ratings with task statements and occupation
    data, and left-joins the task-to-DWA mapping so tasks without DWAs are
    kept. The joined rows are then split into two de-duplicated frames.

    Args:
        db_path (str): Path to the SQLite database file.

    Returns:
        tuple(pandas.DataFrame, pandas.DataFrame):
            - ratings: one row per distinct rating record.
            - DWAs: one row per distinct (onetsoc_code, task_id, dwa_title).
            Both frames are empty (with their expected columns) when the
            query returns no rows. ``(None, None)`` is returned when the
            file does not exist or any error occurs.
    """
    if not os.path.exists(db_path):
        print(f"Error: Database file not found at {db_path}")
        return None, None

    rating_columns = [
        "onetsoc_code",
        "task_id",
        "task",
        "occupation_title",
        "occupation_description",
        "scale_id",
        "category",
        "data_value",
    ]
    dwa_columns = ["onetsoc_code", "task_id", "dwa_title"]

    conn = None
    try:
        conn = sqlite3.connect(db_path)
        # LEFT JOINs keep tasks that have no DWA; such rows get a NULL dwa_title.
        query = """
            SELECT
            tr.onetsoc_code,
            tr.task_id,
            ts.task,
            od.title AS occupation_title,
            od.description AS occupation_description,
            tr.scale_id,
            tr.category,
            tr.data_value,
            dr.dwa_title -- Added DWA title
            FROM
            task_ratings tr
            JOIN
            task_statements ts ON tr.task_id = ts.task_id
            JOIN
            occupation_data od ON tr.onetsoc_code = od.onetsoc_code
            LEFT JOIN
            tasks_to_dwas td ON tr.onetsoc_code = td.onetsoc_code AND tr.task_id = td.task_id --
            LEFT JOIN
            dwa_reference dr ON td.dwa_id = dr.dwa_id; --
            """
        df = pd.read_sql_query(query, conn)
        conn.close()
        print(
            f"Successfully fetched {len(df)} records (including DWA info) from the database."
        )

        if df.empty:
            print("Warning: Fetched DataFrame is empty.")
            return pd.DataFrame(columns=rating_columns), pd.DataFrame(columns=dwa_columns)

        absent = [name for name in rating_columns if name not in df.columns]
        if absent:
            print(f"Error: Missing core columns in fetched data: {absent}")
            return None, None

        # The DWA join repeats each rating once per DWA; collapse the repeats.
        ratings_df = df[rating_columns].drop_duplicates().reset_index(drop=True)

        if not all(name in df.columns for name in dwa_columns):
            print("Warning: DWA related columns missing, creating empty DWA DataFrame.")
            return ratings_df, pd.DataFrame(columns=dwa_columns)

        # Rows without a DWA title come from tasks that have no DWA.
        dwa_rows = df[dwa_columns].dropna(subset=["dwa_title"])
        dwas_df = dwa_rows.drop_duplicates().reset_index(drop=True)
        return ratings_df, dwas_df
    except sqlite3.Error as e:
        print(f"SQLite error: {e}")
    except Exception as e:
        print(f"An error occurred during data fetching: {e}")

    # Reached only after an error was reported: release the connection if one
    # was opened, then signal failure.
    if conn:
        conn.close()
    return None, None
# --- Data Processing ---
def process_task_ratings_with_dwas(ratings_df, dwas_df):
    """
    Build one enriched record per (occupation, task) from the rating rows.

    Frequency ratings (scale ``FT``) are pivoted into one
    ``frequency_category_<n>`` column per category, importance (``IM``/``IJ``)
    and relevance (``RT``) ratings are averaged, and the DWA titles of each
    task are collected into a list.

    Args:
        ratings_df (pandas.DataFrame): Rating rows with the task and
            occupation columns.
        dwas_df (pandas.DataFrame): Task-to-DWA rows; may be None or empty.

    Returns:
        list: One dictionary per task, with missing values as None and
        ``dwas`` always a list. None when ``ratings_df`` is not a DataFrame
        or lacks a base column.
    """
    task_key = ["onetsoc_code", "task_id"]

    def _mean_per_task(selection, value_name):
        # Mean data_value per (occupation, task), exposed under value_name.
        averaged = selection.groupby(task_key)["data_value"].mean().reset_index()
        return averaged.rename(columns={"data_value": value_name})

    def _empty_task_index():
        # Empty two-level index so empty frames can still take part in merges.
        return pd.MultiIndex(levels=[[], []], codes=[[], []], names=task_key)

    if not isinstance(ratings_df, pd.DataFrame):
        print("Error: Input ratings DataFrame is invalid.")
        return None
    if ratings_df.empty:
        print(
            "Warning: Input ratings DataFrame is empty. Processing will yield empty result."
        )
    if not isinstance(dwas_df, pd.DataFrame):
        print("Warning: Invalid or missing DWA DataFrame. Proceeding without DWA data.")
        dwas_df = pd.DataFrame(columns=["onetsoc_code", "task_id", "dwa_title"])

    print("Starting data processing...")
    scale = ratings_df["scale_id"]

    # --- 1. Frequency (FT): one column per category ---
    frequency_rows = ratings_df[scale == "FT"]
    if frequency_rows.empty:
        print("No Frequency (FT) data found.")
        freq_pivot = pd.DataFrame(index=_empty_task_index())
    else:
        freq_pivot = frequency_rows.pivot_table(
            index=task_key,
            columns="category",
            values="data_value",
            fill_value=0,
        )
        freq_pivot.columns = [
            f"frequency_category_{int(col)}" for col in freq_pivot.columns
        ]
        print(f"Processed Frequency data. Shape: {freq_pivot.shape}")

    # --- 2. Importance (IM, IJ): averaged per task ---
    importance_rows = ratings_df[scale.isin(["IM", "IJ"])]
    if importance_rows.empty:
        print("No Importance (IM, IJ) data found.")
        imp_avg = pd.DataFrame(
            columns=["onetsoc_code", "task_id", "importance_average"]
        )
    else:
        imp_avg = _mean_per_task(importance_rows, "importance_average")
        print(f"Processed Importance data. Shape: {imp_avg.shape}")

    # --- 3. Relevance (RT): averaged per task ---
    relevance_rows = ratings_df[scale == "RT"]
    if relevance_rows.empty:
        print("No Relevance (RT) data found.")
        rel_avg = pd.DataFrame(columns=["onetsoc_code", "task_id", "relevance_average"])
    else:
        rel_avg = _mean_per_task(relevance_rows, "relevance_average")
        print(f"Processed Relevance data. Shape: {rel_avg.shape}")

    # --- 4. DWAs: titles gathered into one list per task ---
    if dwas_df.empty or "dwa_title" not in dwas_df.columns:
        print("No valid DWA data found or provided for processing.")
        dwas_grouped = None
    else:
        print("Processing DWA data...")
        titles_per_task = dwas_df.groupby(task_key)["dwa_title"].apply(list)
        dwas_grouped = titles_per_task.reset_index().rename(
            columns={"dwa_title": "dwas"}
        )
        print(f"Processed DWA data. Shape: {dwas_grouped.shape}")

    # --- 5. Base task / occupation info ---
    base_cols = [
        "onetsoc_code",
        "task_id",
        "task",
        "occupation_title",
        "occupation_description",
    ]
    missing_base_cols = [col for col in base_cols if col not in ratings_df.columns]
    if missing_base_cols:
        print(
            f"Error: Missing base info columns in ratings_df: {missing_base_cols}. Cannot proceed."
        )
        return None

    if ratings_df.empty:
        print("Cannot extract base info from empty ratings DataFrame.")
        base_info = pd.DataFrame(
            index=_empty_task_index(),
            columns=[col for col in base_cols if col not in task_key],
        )
    else:
        base_info = ratings_df[base_cols].drop_duplicates().set_index(task_key)
        print(f"Extracted base info. Shape: {base_info.shape}")

    # --- 6. Merge everything onto the base rows ---
    print("Merging processed data...")
    # Frequencies are index-aligned with base_info; the averages and DWAs are
    # keyed by columns, hence the reset_index before those merges.
    final_df = base_info.merge(
        freq_pivot, left_index=True, right_index=True, how="left"
    ).reset_index()

    for averaged, column in (
        (imp_avg, "importance_average"),
        (rel_avg, "relevance_average"),
    ):
        if averaged.empty:
            # Keep the column present even when no such ratings exist.
            final_df[column] = np.nan
        else:
            final_df = final_df.merge(averaged, on=task_key, how="left")

    if dwas_grouped is None or dwas_grouped.empty:
        final_df["dwas"] = [[] for _ in range(len(final_df))]
    else:
        final_df = final_df.merge(dwas_grouped, on=task_key, how="left")
        if "dwas" in final_df.columns:
            # Tasks without DWAs come out of the left merge as NaN; give them [].
            final_df["dwas"] = final_df["dwas"].apply(
                lambda x: x if isinstance(x, list) else []
            )
        else:
            print("Warning: 'dwas' column not created during merge.")
            final_df["dwas"] = [[] for _ in range(len(final_df))]

    print(f"Final merged data shape: {final_df.shape}")

    # NaN is replaced with None before the rows are turned into dictionaries.
    final_df = final_df.replace({np.nan: None})
    return final_df.to_dict(orient="records")
# --- Output ---
def write_to_json(data, output_path):
    """
    Serialise the enriched records to a UTF-8, 4-space indented JSON file.

    Nothing is written when ``data`` is None or not a list. The parent
    directory of ``output_path`` is created when it does not exist. Problems
    are reported on stdout; no exception is raised to the caller.

    Args:
        data (list): The list of dictionaries to write.
        output_path (str): Path to the output JSON file.
    """
    if data is None:
        print("No data to write to JSON.")
        return
    if not isinstance(data, list):
        print(
            f"Error: Data to write is not a list (type: {type(data)}). Cannot write to JSON."
        )
        return

    # Make sure the destination directory exists before opening the file.
    output_dir = os.path.dirname(output_path)
    if output_dir and not os.path.exists(output_dir):
        try:
            os.makedirs(output_dir)
        except OSError as e:
            print(f"Error creating output directory {output_dir}: {e}")
            return
        else:
            print(f"Created output directory: {output_dir}")

    try:
        with open(output_path, "w", encoding="utf-8") as f:
            # ensure_ascii=False keeps non-ASCII characters readable in the file.
            json.dump(data, f, indent=4, ensure_ascii=False)
    except IOError as e:
        print(f"Error writing JSON file to {output_path}: {e}")
    except TypeError as e:
        print(f"Error during JSON serialization: {e}. Check data types.")
    except Exception as e:
        print(f"An unexpected error occurred during JSON writing: {e}")
    else:
        print(f"Successfully wrote enriched data to {output_path}")
# --- Main Execution ---
if __name__ == "__main__":
    # Pipeline: fetch from SQLite -> enrich -> write JSON. Each stage reports
    # its own errors; this block only decides whether to continue.
    print("Starting O*NET Task Ratings & DWAs Enrichment Script...")

    # 1. Fetch both datasets. A failed fetch yields None instead of a frame.
    ratings_data_df, dwas_data_df = fetch_data_from_db(DB_FILE)

    if not isinstance(ratings_data_df, pd.DataFrame):
        print(
            "Data fetching failed or returned invalid type for ratings data. Script terminated."
        )
    else:
        # 2. Process. An empty ratings frame is still valid input, and the
        #    DWA frame may be None or empty.
        enriched_data = process_task_ratings_with_dwas(ratings_data_df, dwas_data_df)

        # 3. Write output. An empty list is a valid result; only None means failure.
        if enriched_data is None:
            print("Data processing failed or returned None. No output file generated.")
        else:
            write_to_json(enriched_data, OUTPUT_FILE)

    print("Script finished.")

81
pipeline/aggregate.py Normal file
View file

@ -0,0 +1,81 @@
from .utils import OCCUPATION_MAJOR_CODES
import pandas as pd
def create_task_summary_by_occupation_df(df_tasks: pd.DataFrame, oesm_df: pd.DataFrame) -> pd.DataFrame:
    """Summarise tasks per O*NET occupation and attach the OESM wage bill.

    For every distinct ``onetsoc_code`` in ``df_tasks`` (in order of first
    appearance) the result holds the number of tasks that are not remote,
    remote and estimable, remote and not estimable, and the total. The wage
    bill (``TOT_EMP * A_MEAN``) and occupation title are looked up in
    ``oesm_df`` using the O*NET code with its ``.xx`` suffix removed.

    Args:
        df_tasks: Task rows with the columns ``onetsoc_code``,
            ``remote_status`` and ``estimable``. Rows without an
            ``onetsoc_code`` are ignored; a missing ``estimable`` value
            counts as not estimable.
        oesm_df: OESM rows with the columns ``OCC_CODE``, ``OCC_TITLE``,
            ``TOT_EMP`` and ``A_MEAN``. Not modified. When a code occurs
            more than once, the first usable row is used.

    Returns:
        A DataFrame with the columns ``onetsoc_code``, ``occupation_label``,
        ``wage_bill``, ``count_not_remote``, ``count_remote_estimable``,
        ``count_remote_not_estimable`` and ``total_tasks``. The columns are
        present even when ``df_tasks`` is empty. Occupations without usable
        OESM data get the label ``"Unknown"`` and a wage bill of 0.

    NOTE(review): every O*NET code sharing a SOC prefix (e.g. 11-1011.00 and
    11-1011.03) receives the full wage bill of that SOC code, so summing
    ``wage_bill`` over the result may count it more than once — confirm this
    is intended downstream.
    """
    summary_columns = [
        'onetsoc_code',
        'occupation_label',
        'wage_bill',
        'count_not_remote',
        'count_remote_estimable',
        'count_remote_not_estimable',
        'total_tasks',
    ]

    # --- OESM Wage Bill Calculation ---
    df_oesm_with_bill = oesm_df.rename(columns={'OCC_CODE': 'onetsoc_code'}).copy()
    # Non-numeric placeholders become NaN and are dropped below.
    for column in ('TOT_EMP', 'A_MEAN'):
        df_oesm_with_bill[column] = pd.to_numeric(df_oesm_with_bill[column], errors='coerce')
    df_oesm_with_bill['wage_bill'] = df_oesm_with_bill['TOT_EMP'] * df_oesm_with_bill['A_MEAN']
    df_oesm_with_bill = df_oesm_with_bill.dropna(subset=['TOT_EMP', 'A_MEAN', 'onetsoc_code'])

    # One row per SOC code, so each lookup yields a scalar.
    oesm_lookup = (
        df_oesm_with_bill
        .drop_duplicates(subset='onetsoc_code', keep='first')
        .set_index('onetsoc_code')
    )
    wage_by_soc = oesm_lookup['wage_bill'].to_dict()
    # Without a title column no lookup can succeed (label "Unknown", bill 0).
    title_by_soc = oesm_lookup['OCC_TITLE'].to_dict() if 'OCC_TITLE' in oesm_lookup.columns else {}

    # --- Task counts per occupation, computed in a single grouped pass ---
    is_remote = (df_tasks['remote_status'] == 'remote').to_numpy()
    # The nullable dtype lets missing values be mapped to False before masking.
    is_estimable = df_tasks['estimable'].astype('boolean').fillna(False).astype(bool).to_numpy()
    flags = pd.DataFrame({
        'onetsoc_code': df_tasks['onetsoc_code'].to_numpy(),
        'count_not_remote': ~is_remote,
        'count_remote_estimable': is_remote & is_estimable,
        'count_remote_not_estimable': is_remote & ~is_estimable,
    })
    flags['total_tasks'] = 1
    # sort=False keeps occupations in order of first appearance.
    counts = flags.groupby('onetsoc_code', sort=False).sum()

    summary_data = []
    for code, tally in zip(counts.index, counts.to_dict('records')):
        # O*NET codes (e.g., 11-1011.03) are more specific than OESM SOC codes (e.g., 11-1011).
        # We strip the suffix from the O*NET code to find the corresponding wage data.
        soc_code_for_lookup = code.split('.')[0]
        if soc_code_for_lookup in wage_by_soc and soc_code_for_lookup in title_by_soc:
            wage_bill = wage_by_soc[soc_code_for_lookup]
            label = title_by_soc[soc_code_for_lookup]
        else:
            wage_bill = 0
            label = "Unknown"
        summary_data.append({
            'onetsoc_code': code,
            'occupation_label': label,
            'wage_bill': wage_bill,
            **tally,
        })

    return pd.DataFrame(summary_data, columns=summary_columns)
def aggregate_task_summary_by_major_code(summary_df: pd.DataFrame) -> pd.DataFrame:
    """
    Roll the per-occupation summary up to the two-character major occupation code.

    The major code is the first two characters of ``onetsoc_code``. The wage bill
    and every task counter are summed within each major code, and the display
    label is looked up in ``OCCUPATION_MAJOR_CODES``.
    """
    summed_columns = [
        'wage_bill',
        'count_not_remote',
        'count_remote_estimable',
        'count_remote_not_estimable',
        'total_tasks',
    ]

    # assign() returns a new frame, so the caller's DataFrame is left untouched
    with_major = summary_df.assign(onetsoc_major_code=summary_df['onetsoc_code'].str[:2])
    grouped = with_major.groupby('onetsoc_major_code')
    major_summary = grouped.agg({column: 'sum' for column in summed_columns}).reset_index()
    major_summary['occupation_label'] = major_summary['onetsoc_major_code'].map(OCCUPATION_MAJOR_CODES)

    # Reorder columns to match original output format
    output_order = ['onetsoc_major_code', 'occupation_label'] + summed_columns
    return major_summary[output_order]

225
pipeline/classification.py Normal file
View file

@ -0,0 +1,225 @@
from pathlib import Path
import pandas as pd
from .logger import logger
from .utils import enrich
import json
# Time units accepted for an estimate; used below as the JSON-schema `enum`
# for the unit of both the lower and the upper bound.
ALLOWED_UNITS = [
    "minute",
    "hour",
    "day",
    "week",
    "month",
    "trimester",
    "semester",
    "year",
]

# Version tags interpolated into the parquet cache file names below; changing a
# value makes the corresponding function look for (and write) a different cache file.
ESTIMABLE_CLASSIFICATION_VERSION = "old_version"
TIME_ESTIMATES_GENERATION_VERSION = "old_version"
def classify_tasks_as_estimable(cache_dir: Path, df_to_process: pd.DataFrame, bust: bool = False) -> pd.DataFrame:
    """
    Classify each unique task description as estimable (ATOMIC) or not.

    Results are cached as a parquet file in ``cache_dir`` whose name includes
    ``ESTIMABLE_CLASSIFICATION_VERSION``. When that file exists and ``bust`` is
    False it is returned as-is, without looking at ``df_to_process``.

    Args:
        cache_dir: Directory holding the parquet cache.
        df_to_process: Frame with a ``task`` column; duplicates of ``task`` are
            classified only once.
        bust: When True, ignore any existing cache file and re-run the classification.

    Returns:
        DataFrame with one row per unique task and the columns ``task`` and
        ``estimable`` (bool). A task whose API call or response parsing failed is
        recorded with ``estimable=False``.

    Raises:
        ValueError: If there are no tasks to classify, or if ``enrich`` returns
            nothing or a different number of results than tasks submitted.
    """
    CACHE_PATH = cache_dir / f"task_estimability.{ESTIMABLE_CLASSIFICATION_VERSION}.parquet"
    if CACHE_PATH.exists() and not bust:
        logger.info(f"Loading cached task estimability from {CACHE_PATH}")
        return pd.read_parquet(CACHE_PATH)

    logger.info("Enriching tasks with estimability classification.")
    df_unique_tasks = df_to_process.drop_duplicates(subset=['task']).copy()
    logger.info(f"Found {len(df_unique_tasks)} unique remote tasks to classify.")
    if df_unique_tasks.empty:
        raise ValueError("No unique tasks to classify.")

    results = enrich(
        model="gpt-4.1-mini",
        rpm=5000,
        messages_to_process=[
            [
                {"role": "system", "content": """
Classify the provided O*NET task into one of these categories:
- ATOMIC (schedulable): A single, clearly-bounded activity, typically lasting minutes, hours, or a few days.
- ONGOING-CONSTRAINT (background role/ethical rule): A continuous responsibility or behavioural norm with no schedulable duration (e.g., follow confidentiality rules, serve as department head).
""".strip()},
                {"role": "user", "content": f"Task: {row.task}"},
            ]
            for row in df_unique_tasks.itertuples()
        ],
        schema={
            "name": "estimability_classification",
            "schema": {
                "type": "object",
                "properties": {"task_category": {"type": "string", "enum": ["ATOMIC", "ONGOING-CONSTRAINT"]}},
                "required": ["task_category"],
                "additionalProperties": False
            }
        },
        chunk_size=300,
    )

    if not results or len(results) != len(df_unique_tasks):
        raise ValueError(f"Task estimability classification failed or returned mismatched number of results. Expected {len(df_unique_tasks)}, got {len(results) if results else 0}.")

    classifications = []
    # 'estimable' is always a bool (failures become False), so successes have to be
    # counted explicitly instead of being derived from non-null values afterwards.
    successful_count = 0
    # Responses are matched to tasks by position; the length check above guards the pairing.
    for task_label, response in zip(df_unique_tasks['task'], results):
        task_category_flag = None
        if response is None:
            logger.warning(f"API call failed for task (enrich returned None): '{task_label}'")
        else:
            try:
                content_str = response.choices[0].message.content
                if not content_str:
                    raise ValueError("No content found in the response message")
                data = json.loads(content_str)
                if 'task_category' in data and isinstance(data['task_category'], str):
                    task_category_flag = data['task_category']
                else:
                    logger.warning(f"Invalid or missing 'task_category' payload for task '{task_label}'. Data: '{data}'")
            except (json.JSONDecodeError, AttributeError, KeyError, IndexError, ValueError) as e:
                logger.warning(f"Could not parse response for task '{task_label}'. Error: {e}. Response: {response}")

        if task_category_flag is not None:
            successful_count += 1
        classifications.append({
            'task': task_label,
            'estimable': task_category_flag == 'ATOMIC'
        })

    classification_df = pd.DataFrame(classifications)
    logger.info(f"Finished classification. Got {successful_count} successful classifications out of {len(df_unique_tasks)} unique tasks.")
    logger.info(f"Saving task estimability classifications to {CACHE_PATH}")
    classification_df.to_parquet(CACHE_PATH)
    return classification_df
def generate_time_estimates_for_tasks(cache_dir: Path, df_to_process: pd.DataFrame, bust: bool = False) -> pd.DataFrame:
    """
    Ask the model for a lower/upper 'effective time' estimate for every task row.

    The result is cached as a parquet file in ``cache_dir`` whose name includes
    ``TIME_ESTIMATES_GENERATION_VERSION``; an existing file is returned directly
    unless ``bust`` is True.

    Returns a DataFrame with one row per input row and the columns
    ``onetsoc_code``, ``task_id``, ``lb_estimate_qty``, ``lb_estimate_unit``,
    ``ub_estimate_qty`` and ``ub_estimate_unit``. All four estimate fields are
    None for a row whose API call or response parsing failed.

    Raises ValueError when ``df_to_process`` is empty, or when ``enrich`` returns
    nothing or a different number of results than rows submitted.
    """
    CACHE_PATH = cache_dir / f"task_estimates.{TIME_ESTIMATES_GENERATION_VERSION}.parquet"
    if CACHE_PATH.exists() and not bust:
        logger.info(f"Loading cached task estimates from {CACHE_PATH}")
        return pd.read_parquet(CACHE_PATH)

    logger.info("Enriching tasks with time estimates.")
    if df_to_process.empty:
        raise ValueError("No tasks to process for estimates.")

    system_prompt = """
You are an expert assistant evaluating the time required for job tasks. Your goal is to estimate the 'effective time' range needed for a skilled human to complete the following job task **remotely**, without supervision
'Effective time' is the active, focused work duration required to complete the task. Crucially, **exclude all waiting periods, delays, or time spent on other unrelated activities**. Think of it as the continuous, productive time investment needed if the worker could pause and resume instantly without cost.
Provide a lower and upper bound estimate for the 'effective time'. These bounds should capture the time within which approximately 80% of instances of performing this specific task are typically completed by a qualified individual.
Base your estimate on the provided task and the associated occupation and occupation description. Your estimate must be in one the allowed units: minute, hour, day, week, month, trimester, semester, year.""".strip()

    # One two-message conversation per task row
    conversations = []
    for row in df_to_process.itertuples():
        conversations.append([
            {
                "role": "system",
                "content": system_prompt
            },
            {
                "role": "user",
                "content": f"{row.task} done by {row.occupation_title} ({row.occupation_description})"
            }
        ])

    # Strict structured-output schema: two {quantity, unit} objects
    response_schema = {
        "name": "estimate_time",
        "strict": True,
        "schema": {
            "type": "object",
            "properties": {
                "lower_bound_estimate": {
                    "type": "object",
                    "properties": {
                        "quantity": {
                            "type": "number",
                            "description": "The numerical value for the lower bound of the estimate.",
                        },
                        "unit": {
                            "type": "string",
                            "enum": ALLOWED_UNITS,
                            "description": "The unit of time for the lower bound.",
                        },
                    },
                    "required": ["quantity", "unit"],
                    "additionalProperties": False,
                },
                "upper_bound_estimate": {
                    "type": "object",
                    "properties": {
                        "quantity": {
                            "type": "number",
                            "description": "The numerical value for the upper bound of the estimate.",
                        },
                        "unit": {
                            "type": "string",
                            "enum": ALLOWED_UNITS,
                            "description": "The unit of time for the upper bound.",
                        },
                    },
                    "required": ["quantity", "unit"],
                    "additionalProperties": False,
                },
            },
            "required": ["lower_bound_estimate", "upper_bound_estimate"],
            "additionalProperties": False,
        },
    }

    results = enrich(
        model="gpt-4.1-mini",
        rpm=5000,
        messages_to_process=conversations,
        schema=response_schema,
        chunk_size=200,
    )

    if not results or len(results) != len(df_to_process):
        raise ValueError(f"API call for task estimates failed or returned mismatched number of results. "
                         f"Expected {len(df_to_process)}, got {len(results) if results else 0}.")

    estimates = []
    # Responses are matched to input rows by position
    for position, response in enumerate(results):
        row = df_to_process.iloc[position]
        task_info = f"O*NET: {row.onetsoc_code}, Task ID: {row.task_id}"
        # Stays all-None unless every field is read successfully
        parsed_bounds = (None, None, None, None)

        if response is None:
            logger.warning(f"API call failed for task (enrich returned None): {task_info}")
        else:
            try:
                content_str = response.choices[0].message.content
                if not content_str:
                    raise ValueError("No content found in the response message")
                data = json.loads(content_str)
                lower_qty = data['lower_bound_estimate']['quantity']
                lower_unit = data['lower_bound_estimate']['unit']
                upper_qty = data['upper_bound_estimate']['quantity']
                upper_unit = data['upper_bound_estimate']['unit']
                parsed_bounds = (lower_qty, lower_unit, upper_qty, upper_unit)
            except Exception as e:
                logger.warning(f"Could not parse valid estimate for task {task_info}. Error: {e}. Response: {response}")

        lb_qty, lb_unit, ub_qty, ub_unit = parsed_bounds
        estimates.append({
            'onetsoc_code': row.onetsoc_code,
            'task_id': row.task_id,
            'lb_estimate_qty': lb_qty,
            'lb_estimate_unit': lb_unit,
            'ub_estimate_qty': ub_qty,
            'ub_estimate_unit': ub_unit
        })

    estimates_df = pd.DataFrame(estimates)
    successful = estimates_df['lb_estimate_qty'].notna().sum()
    logger.info(f"Finished estimates. Got {successful} successful estimates out of {len(df_to_process)} tasks.")
    logger.info(f"Saving task estimates to {CACHE_PATH}")
    estimates_df.to_parquet(CACHE_PATH)
    return estimates_df

View file

@ -1,35 +0,0 @@
# Two-character occupation major-group prefix -> short display label.
OCCUPATION_MAJOR_CODES = {
    '11': 'Management',
    '13': 'Business & Financial',
    '15': 'Computer & Mathematical',
    '17': 'Architecture & Engineering',
    '19': 'Life, Physical, & Social Science',
    '21': 'Community & Social Service',
    '23': 'Legal',
    '25': 'Education, Training, & Library',
    '27': 'Arts, Design, & Media',
    '29': 'Healthcare Practitioners',
    '31': 'Healthcare Support',
    '33': 'Protective Service',
    '35': 'Food Preparation & Serving',
    '37': 'Building & Grounds Maintenance',
    '39': 'Personal Care & Service',
    '41': 'Sales & Related',
    '43': 'Office & Admin Support',
    '45': 'Farming, Fishing, & Forestry',
    '47': 'Construction & Extraction',
    '49': 'Installation, Maintenance, & Repair',
    '51': 'Production',
    '53': 'Transportation & Material Moving',
    '55': 'Military Specific',
}

# Gray colour ramp as hex strings, keyed by shade level ('50' lightest ... '950' darkest).
GRAY = {'50':'#f8fafc','100':'#f1f5f9','200':'#e2e8f0',
        '300':'#cbd5e1','400':'#94a3b8','500':'#64748b',
        '600':'#475569','700':'#334155','800':'#1e293b',
        '900':'#0f172a','950':'#020617'}

# Lime colour ramp as hex strings, keyed by the same shade levels as GRAY.
LIME = {'50': '#f7fee7','100': '#ecfcca','200': '#d8f999',
        '300': '#bbf451','400': '#9ae600','500': '#83cd00',
        '600': '#64a400','700': '#497d00','800': '#3c6300',
        '900': '#35530e','950': '#192e03'}

View file

@ -1,97 +0,0 @@
"""
This module contains enrichment steps. They take time to run and are usually expensive (API calls...),
so they should manage their own state and only be re-run if the data's version differs from
the version they last saved.
"""
from .run import Run
import pandas as pd
from typing import Any, List, Dict
import litellm
def enrich(
    model: str,
    rpm: int,
    messages_to_process: List[List[Dict[str, str]]],
    schema: Dict[str, Any],
    chunk_size: int = 100,
):
    """
    Placeholder for the shared LLM enrichment helper.

    Not implemented yet: the body is a bare ``pass``, so every call returns None
    and none of the arguments are used.

    Args:
        model: Model identifier to call.
        rpm: Presumably a requests-per-minute limit; unused so far, confirm once implemented.
        messages_to_process: One chat-message list per item to enrich.
        schema: JSON schema describing the expected structured response.
        chunk_size: Presumably the number of conversations per batch; unused so far.
    """
    # Use litellm.batch_completion
    pass
def enrich_with_task_estimateability(run: Run) -> pd.DataFrame:
    """
    Classify remote tasks by whether they are suitable for a time estimate.

    Returns the cached parquet file from ``run.cache_dir`` when it exists.
    Otherwise the unique remote task descriptions are sent to ``enrich``.

    NOTE: work in progress. The ``enrich`` results are not yet turned into a
    DataFrame or saved, so a cache miss currently returns an empty DataFrame.
    """
    output_path = run.cache_dir / "computed_task_estimateability.parquet"
    if output_path.exists():
        print(f"Loading cached task estimateability from {output_path}")
        return pd.read_parquet(output_path)

    df_remote_tasks = run.df_tasks[run.df_tasks['remote_status'] == 'remote'].copy()
    # In the old script, we only passed unique tasks to the API
    df_unique_tasks = df_remote_tasks.drop_duplicates(subset=['task'])

    results = enrich(
        model="gpt-4.1-mini",
        rpm=5000,
        messages_to_process=[
            [
                {"role": "system", "content": """
Judge whether the provided O*NET task is suitable for a time estimate. If it is a single, clearly-bounded activity, typically lasting minutes, hours, or a few days, then clearly yes. If it is a continuous responsibility or behavioural norm with no schedulable duration (e.g., follow confidentiality rules, serve as department head), then clearly no.
"""},
                {"role": "user", "content": f"Task: {row.task}"},
            ]
            for row in df_unique_tasks.itertuples()
        ],
        schema={
            "type": "object",
            # JSON Schema names this primitive type "boolean"; "bool" is not a valid type
            "properties": {"estimateable": {"type": "boolean"}},
            "required": ["estimateable"]
        },
        chunk_size=300,
    )

    # Create a new dataframe with just enough information to identify the task uniquely + estimateability classification, save it, return it. Careful: the "task" column in itself is not unique.
    return pd.DataFrame()
def enrich_with_task_estimates(run: Run) -> pd.DataFrame:
    """
    Return time estimates for tasks, using the parquet cache in ``run.cache_dir`` when present.

    NOTE: unfinished. On a cache miss the input frame is still the placeholder
    ``...``, so building the messages fails at ``df.itertuples()`` before
    ``enrich`` is called; the trailing ``raise NotImplementedError`` marks the
    missing result handling.
    """
    output_path = run.cache_dir / "computed_task_estimates.parquet"
    if output_path.exists():
        print(f"Loading cached task estimates from {output_path}")
        return pd.read_parquet(output_path)

    # Placeholder: the frame of tasks to estimate has not been selected yet
    df = ... # todo

    results = enrich(
        model="gpt-4.1-mini",
        rpm=5000,
        messages_to_process=[
            [
                {"role": "system", "content": "Estimate the time required to complete the following O*NET task. Your estimate should be a plausible range for how long it might take a typical, qualified worker to perform this task once. Provide your answer as a time range (lower and upper bounds). Do not provide explanations or apologies. If the task is not suitable for a time estimate (e.g., it is an ongoing responsibility), interpret it as a single, schedulable action."},
                {"role": "user", "content": f"""
Task: {row.task}
For Occupation: {row.occupation_title}
Occupation Description: {row.occupation_description}"""}
            ]
            for row in df.itertuples()
        ],
        # Expected response: a lower and an upper bound, each a {quantity, unit} object
        schema={
            "type": "object",
            "properties": {
                "lower_bound_estimate": {
                    "type": "object",
                    "properties": {"quantity": {"type": "number"}, "unit": {"type": "string", "enum": ["minutes", "hours", "days"]}},
                    "required": ["quantity", "unit"],
                },
                "upper_bound_estimate": {
                    "type": "object",
                    "properties": {"quantity": {"type": "number"}, "unit": {"type": "string", "enum": ["minutes", "hours", "days"]}},
                    "required": ["quantity", "unit"],
                },
            },
            "required": ["lower_bound_estimate", "upper_bound_estimate"],
        },
        chunk_size=200,
    )

    # Create a new dataframe with just enough information to identify the task uniquely + the estimates classification, save it, return it. Careful: the "task" column in itself is not unique.
    raise NotImplementedError

View file

@ -1,50 +1,30 @@
"""
Fetchers retrieve remote data and return it in a format suitable for further processing, they also return its version, which should be considered opaque, though it is usually a checksum.
"""
import sqlite3
from typing import Tuple
import pandas as pd
import requests
import io
import zipfile
from pipeline.run import Run
from pipeline.logger import logger
import yaml
from pathlib import Path
from .logger import logger
from typing import Tuple, Dict
def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
"""
Downloads the O*NET database, creates a local SQLite file from it, and returns a connection.
"""
version = "29_1"
url = f"https://www.onetcenter.org/dl_files/database/db_{version}_mysql.zip"
db_path = run.cache_dir / f"onet_{version}.db"
run.meta.fetchers['onet'] = {
'url': url,
'version': version,
'db_path': str(db_path),
}
ONET_VERSION = "29_1"
ONET_URL = f"https://www.onetcenter.org/dl_files/database/db_{ONET_VERSION}_mysql.zip"
if db_path.exists():
logger.info(f"Using cached O*NET database: {db_path}")
conn = sqlite3.connect(db_path)
return conn, version
def fetch_onet_database(cache_dir: Path) -> sqlite3.Connection:
DB_PATH = cache_dir / f"onet_{ONET_VERSION}.db"
logger.info(f"Downloading O*NET database from {url}")
response = requests.get(url, stream=True, headers={
if DB_PATH.exists():
logger.info(f"Using cached O*NET database: {DB_PATH}")
return sqlite3.connect(DB_PATH)
logger.info(f"Downloading O*NET database from {ONET_URL}")
response = requests.get(ONET_URL, stream=True, headers={
"User-Agent": "econ-agent/1.0"
})
response.raise_for_status()
# Read content into memory
zip_content = response.content
db_path = run.cache_dir / f"onet_{version}.db"
logger.info(f"Creating new O*NET database: {db_path}")
conn = sqlite3.connect(db_path)
# Set performance PRAGMAs for fast import
logger.info("Creating new SQLite database with performance settings")
conn = sqlite3.connect(DB_PATH)
conn.executescript("""
PRAGMA journal_mode = OFF;
PRAGMA synchronous = 0;
@ -54,6 +34,7 @@ def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
PRAGMA foreign_keys = ON;
""")
zip_content = response.content
with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
sql_scripts = []
for filename in sorted(z.namelist()):
@ -63,14 +44,10 @@ def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
if not sql_scripts:
raise RuntimeError("No SQL files found in the O*NET zip archive.")
# Combine and execute all SQL files in one transaction
full_script = "BEGIN TRANSACTION;\n" + "\n".join(sql_scripts) + "\nCOMMIT;"
logger.info("Executing SQL files in alphabetical order (single transaction mode)")
full_script = "BEGIN TRANSACTION;\n" + "\n".join(sql_scripts) + "\nCOMMIT;"
conn.executescript(full_script)
logger.info("Database populated successfully. Restoring reliability settings...")
# Restore reliability-focused settings after import
conn.executescript("""
PRAGMA journal_mode = WAL;
PRAGMA synchronous = NORMAL;
@ -81,87 +58,75 @@ def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
""")
conn.execute("VACUUM;")
conn.commit()
logger.info("Reliability settings restored and database optimized successfully!")
return conn, version
return conn
def fetch_oesm_data(run: Run) -> Tuple[pd.DataFrame, str]:
"""
Downloads the OESM national data from the BLS website.
"""
version = "23"
url = f"https://www.bls.gov/oes/special-requests/oesm{version}nat.zip"
parquet_path = run.cache_dir / "oesm.parquet"
run.meta.fetchers['oesm'] = {
'url': url,
'version': version,
'parquet_path': str(parquet_path),
}
def fetch_oesm_data(cache_dir: Path) -> pd.DataFrame:
VERSION = "23"
URL = f"https://www.bls.gov/oes/special-requests/oesm{VERSION}nat.zip"
DATA_PATH = cache_dir / "oesm.parquet"
if parquet_path.exists():
logger.info(f"Using cached OESM data: {parquet_path}")
return pd.read_parquet(parquet_path), version
if DATA_PATH.exists():
logger.info(f"Using cached OESM data: {DATA_PATH}")
return pd.read_parquet(DATA_PATH)
logger.info(f"Downloading OESM data from {url}")
logger.info(f"Downloading OESM data from {URL}")
headers = {'User-Agent': 'econ-agent/1.0'}
response = requests.get(url, headers=headers)
response = requests.get(URL, headers=headers)
response.raise_for_status()
zip_content = response.content
logger.info(f"OESM data version: {version}")
logger.info(f"Creating new OESM data cache: {parquet_path}")
logger.info(f"Creating new OESM data cache: {DATA_PATH}")
with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
# Find the excel file in the zip
excel_filename = None
for filename in z.namelist():
logger.debug(f"Found file in OESM zip: {filename}")
if filename.lower().endswith(".xlsx"):
excel_filename = filename
break
if excel_filename is None:
raise FileNotFoundError("Could not find the Excel file in the OESM zip archive.")
logger.info(f"Reading {excel_filename} from zip archive.")
with z.open(excel_filename) as f:
with z.open(f"oesm{VERSION}national.xlsx") as f:
df = pd.read_excel(f, engine='openpyxl', na_values=['*', '#'])
df.to_parquet(parquet_path)
logger.info(f"Saved OESM data to cache: {parquet_path}")
return df, version
df.to_parquet(DATA_PATH)
logger.info(f"Saved OESM data to cache: {DATA_PATH}")
return df
def fetch_epoch_remote_data(run: Run) -> Tuple[pd.DataFrame, str]:
"""
Downloads the EPOCH AI remote work task data.
"""
# This is the direct download link constructed from the Google Drive share link
version = "latest"
url = "https://drive.google.com/uc?export=download&id=1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r"
parquet_path = run.cache_dir / f"epoch_remote_{version}.parquet"
run.meta.fetchers['epoch_remote'] = {
'url': url,
'version': version,
'parquet_path': str(parquet_path),
}
def fetch_epoch_remote_data(cache_dir: Path) -> pd.DataFrame:
URL = "https://drive.google.com/uc?export=download&id=1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r"
DATA_PATH = cache_dir / f"epoch_remote_latest.parquet"
if parquet_path.exists():
logger.info(f"Using cached EPOCH remote data: {parquet_path}")
return pd.read_parquet(parquet_path), version
if DATA_PATH.exists():
logger.info(f"Using cached EPOCH remote data: {DATA_PATH}")
return pd.read_parquet(DATA_PATH)
logger.info(f"Downloading EPOCH remote data from Google Drive: {url}")
logger.info(f"Downloading EPOCH remote data from Google Drive: {URL}")
# Need to handle potential cookies/redirects from Google Drive
session = requests.Session()
session.headers.update({"User-Agent": "econ-agent/1.0"})
response = session.get(url, stream=True)
response = session.get(URL, stream=True)
response.raise_for_status()
csv_content = response.content
logger.info(f"Creating new EPOCH remote data cache: {parquet_path}")
logger.info(f"Creating new EPOCH remote data cache: {DATA_PATH}")
df = pd.read_csv(io.BytesIO(csv_content))
df.to_parquet(parquet_path)
logger.info(f"Saved EPOCH remote data to cache: {parquet_path}")
df.to_parquet(DATA_PATH)
return df, version
return df
def fetch_metr_data(cache_dir: Path) -> Dict:
    """
    Return the METR benchmark results, downloading and caching the YAML file if needed.

    Args:
        cache_dir: Directory in which ``metr_benchmark_results.yaml`` is cached.

    Returns:
        The parsed YAML document.

    Raises:
        requests.HTTPError: If the download returns an error status.
        yaml.YAMLError: If the cached or downloaded content is not valid YAML.
    """
    URL = "https://metr.org/assets/benchmark_results.yaml"
    DATA_PATH = cache_dir / "metr_benchmark_results.yaml"

    if DATA_PATH.exists():
        logger.info(f"Using cached METR data: {DATA_PATH}")
        # Read bytes (the cache is written as bytes below) so the YAML parser
        # handles decoding instead of the platform's default text encoding.
        with open(DATA_PATH, "rb") as f:
            return yaml.safe_load(f)

    logger.info(f"Downloading METR data from {URL}")
    headers = {"User-Agent": "econ-agent/1.0"}
    response = requests.get(URL, headers=headers)
    response.raise_for_status()
    yaml_content = response.content

    # Parse before writing so a malformed download is never stored in the cache,
    # where it would make every later run fail.
    parsed = yaml.safe_load(yaml_content)

    logger.info(f"Creating new METR data cache: {DATA_PATH}")
    with open(DATA_PATH, "wb") as f:
        f.write(yaml_content)
    return parsed

View file

@ -1,5 +1,15 @@
from .estimate_histplot import generate_estimate_histplot
from .estimates_spread_per_occupation import generate_estimate_spread_per_occupation
from .estimates_lower_vs_upper_scatter import generate_estimates_lower_vs_upper_scatter
from .sequential_coherence_cdf import plot_sequential_coherence_cdf
from .projected_automatable_wage_bill import generate_projected_automatable_wage_bill
from .projected_task_automation import generate_projected_task_automation_plot
GENERATORS = [
generate_estimate_histplot
generate_estimate_histplot,
generate_estimate_spread_per_occupation,
generate_estimates_lower_vs_upper_scatter,
#plot_sequential_coherence_cdf,
generate_projected_automatable_wage_bill,
generate_projected_task_automation_plot,
]

View file

@ -1,6 +1,32 @@
from ..run import Run
from pathlib import Path
from typing import Generator
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from ..utils import style_plot
def generate_estimate_histplot(run: Run) -> Generator[Path]:
    """Placeholder for the estimate histogram generator; raises NotImplementedError when called."""
    raise NotImplementedError
def generate_estimate_histplot(output_dir: Path, df: pd.DataFrame, **kwargs) -> Generator[Path, None, None]:
    """
    Generates a styled histogram of the distribution of midpoint time estimates.

    Args:
        output_dir: Directory the PNG is written to.
        df: Frame with an ``estimate_midpoint`` column (plotted on a log-scaled x axis).
        **kwargs: Accepted and ignored, so every generator can be called with the same arguments.

    Yields:
        The path of the written image, ``estimate_distribution_histplot.png``.
    """
    style_plot()
    OUTPUT_PATH = output_dir / "estimate_distribution_histplot.png"

    fig, ax = plt.subplots()
    sns.histplot(
        data=df,
        x='estimate_midpoint',
        log_scale=True,
        ax=ax
    )
    ax.set_xlabel("Task Time (minutes, log scale)")
    ax.set_ylabel("Number of Tasks")
    ax.set_title("Distribution of Time Estimates for Atomic Tasks")

    fig.tight_layout()
    fig.savefig(OUTPUT_PATH)
    plt.close(fig)
    yield OUTPUT_PATH

View file

@ -0,0 +1,56 @@
from pathlib import Path
from typing import Generator
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from ..utils import OCCUPATION_MAJOR_CODES, style_plot
def generate_estimates_lower_vs_upper_scatter(output_dir: Path, df: pd.DataFrame, **kwargs) -> Generator[Path, None, None]:
    """
    Generates a styled scatter plot of lower-bound vs upper-bound time estimates for tasks.

    Args:
        output_dir: Directory the PNG is written to.
        df: Frame with ``lb_estimate_in_minutes``, ``ub_estimate_in_minutes`` and
            ``onetsoc_major`` columns.
        **kwargs: Accepted and ignored, so every generator can be called with the same arguments.

    Yields:
        The path of the written image, ``estimates_lower_vs_upper_scatter.png``.
    """
    style_plot()
    OUTPUT_PATH = output_dir / "estimates_lower_vs_upper_scatter.png"

    plot_df = df.copy()
    # Replace onetsoc_major codes with their corresponding labels for the plot legend
    plot_df['onetsoc_major'] = plot_df['onetsoc_major'].map(OCCUPATION_MAJOR_CODES)

    fig, ax = plt.subplots(figsize=(12, 10))
    sns.scatterplot(
        data=plot_df,
        x='lb_estimate_in_minutes',
        y='ub_estimate_in_minutes',
        alpha=0.3,
        edgecolor=None,
        hue="onetsoc_major",
        ax=ax
    )

    # 45° reference line (y=x), spanning the full data range with a 10% margin
    lims = (
        min(df['lb_estimate_in_minutes'].min(), df['ub_estimate_in_minutes'].min()),
        max(df['lb_estimate_in_minutes'].max(), df['ub_estimate_in_minutes'].max())
    )
    lims = (lims[0] * 0.9, lims[1] * 1.1)
    ax.plot(lims, lims, color='black', linestyle='--', linewidth=1, zorder=0)

    # Optional helper lines for ratios (upper bound = k x lower bound)
    for k in [2, 10, 100]:
        ax.plot(lims, [k * limit for limit in lims],
                linestyle=':', color='grey', linewidth=1, zorder=0)

    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlabel('Lower-bound (min, log scale)')
    ax.set_ylabel('Upper-bound (min, log scale)')
    ax.set_title('Lower vs Upper Estimates for All Tasks')
    ax.legend(title="Occupation Major Group", bbox_to_anchor=(1.02, 1), loc='upper left')

    fig.tight_layout()
    # bbox_inches='tight' keeps the legend, which sits outside the axes, in the image
    fig.savefig(OUTPUT_PATH, bbox_inches='tight')
    plt.close(fig)
    yield OUTPUT_PATH

View file

@ -0,0 +1,39 @@
from pathlib import Path
from typing import Generator
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from ..utils import OCCUPATION_MAJOR_CODES, style_plot
def generate_estimate_spread_per_occupation(output_dir: Path, df: pd.DataFrame, **kwargs) -> Generator[Path, None, None]:
    """
    Generates a styled boxplot of the estimate range spread per major occupation group.

    Args:
        output_dir: Directory the PNG is written to.
        df: Frame with ``onetsoc_major`` and ``estimate_range`` columns.
        **kwargs: Accepted and ignored, so every generator can be called with the same arguments.

    Yields:
        The path of the written image, ``estimates_spread_per_occupation.png``.
    """
    style_plot()
    OUTPUT_PATH = output_dir / "estimates_spread_per_occupation.png"

    fig, ax = plt.subplots(figsize=(10, 12))
    sns.boxplot(
        data=df,
        x='onetsoc_major',
        y='estimate_range',
        showfliers=False,
        ax=ax
    )
    ax.set_yscale('log')
    ax.set_xlabel('Occupation')
    ax.set_ylabel('Range (upper-lower, minutes)')
    ax.set_title('Spread of time-range estimates per occupation')

    # Get occupation labels from codes for x-axis ticks; unknown codes keep their raw text
    labels = [OCCUPATION_MAJOR_CODES.get(code.get_text(), code.get_text()) for code in ax.get_xticklabels()]
    ax.set_xticklabels(labels, rotation=60, ha='right')

    fig.tight_layout()
    fig.savefig(OUTPUT_PATH)
    plt.close(fig)
    yield OUTPUT_PATH

View file

@ -1,6 +0,0 @@
import pandas as pd
from typing import List


def must_have_columns(df: pd.DataFrame, columns: List[str]):
    """
    Raise if ``df`` lacks any of the required ``columns``.

    Args:
        df: DataFrame to validate.
        columns: Column names that must all be present in ``df``.

    Raises:
        ValueError: If at least one column is absent; the message lists only the
            columns that are actually missing.
    """
    missing = [col for col in columns if col not in df.columns]
    if missing:
        raise ValueError(f"DataFrame is missing required columns: {missing}")

View file

@ -0,0 +1,229 @@
from pathlib import Path
from typing import Generator, Dict, Tuple, Optional
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
from scipy.stats import linregress
from datetime import datetime
from ..utils import style_plot, LIME
def _generate_wage_projection_data(
metr_results: Dict,
df_with_wages: pd.DataFrame,
percentile_key: str,
doubling_time_modifier: float,
) -> Optional[Tuple[pd.DataFrame, pd.DataFrame, float]]:
"""
Generates wage projection data for different AI progress scenarios.
Args:
metr_results: The METR benchmark data.
df_with_wages: DataFrame containing tasks with their estimated wage value.
percentile_key: The percentile to use from METR data (e.g., 'p50_horizon_length').
doubling_time_modifier: Multiplier for the doubling time (e.g., 1.0 for baseline,
0.5 for optimistic, 2.0 for pessimistic).
Returns:
A tuple of (metr_df, projection_df, doubling_time_days), or None if data is insufficient.
"""
all_model_data = []
for model_name, data in metr_results.get("results", {}).items():
for agent_name, agent_data in data.get("agents", {}).items():
release_date_str = data.get("release_date")
horizon = agent_data.get(percentile_key, {}).get("estimate")
if release_date_str and horizon is not None:
all_model_data.append({
"release_date": release_date_str,
"horizon_minutes": horizon,
})
if not all_model_data:
return None
metr_df = pd.DataFrame(all_model_data).sort_values("release_date").reset_index(drop=True)
metr_df['release_date'] = pd.to_datetime(metr_df['release_date'])
metr_df = metr_df[metr_df['horizon_minutes'] > 0].copy()
if len(metr_df) < 2:
return None
metr_df['days_since_start'] = (metr_df['release_date'] - metr_df['release_date'].min()).dt.days
log_y = np.log(metr_df['horizon_minutes'])
slope, intercept, r_value, _, _ = linregress(metr_df['days_since_start'], log_y)
# Apply the scenario modifier to the doubling time
base_doubling_time_days = np.log(2) / slope
modified_doubling_time_days = base_doubling_time_days * doubling_time_modifier
modified_slope = np.log(2) / modified_doubling_time_days
start_date = metr_df['release_date'].min()
future_dates = pd.to_datetime(pd.date_range(start=start_date, end="2035-01-01", freq="ME"))
future_days = (future_dates - start_date).days.to_numpy()
projected_log_horizon = intercept + modified_slope * future_days
projected_horizon_minutes = np.exp(projected_log_horizon)
projection_df = pd.DataFrame({
"date": future_dates,
"projected_coherence_minutes": projected_horizon_minutes,
})
# Calculate the total wage bill of tasks automated over time
for bound in ["lb", "mid", "ub"]:
col_name = 'estimate_midpoint' if bound == 'mid' else f'{bound}_estimate_in_minutes'
projection_df[f"automatable_wage_bill_{bound}"] = projection_df["projected_coherence_minutes"].apply(
lambda h: df_with_wages.loc[df_with_wages[col_name] <= h, 'wage_per_task'].sum()
)
# Also calculate for the actual METR data points for plotting
metr_df["automatable_wage_bill_mid"] = metr_df["horizon_minutes"].apply(
lambda h: df_with_wages.loc[df_with_wages['estimate_midpoint'] <= h, 'wage_per_task'].sum()
)
return metr_df, projection_df, modified_doubling_time_days
def _plot_scenario(ax, projection_df, metr_df, label, color, line_style='-'):
    """Draw one projection scenario on ``ax``: midpoint line, lb/ub band, and observed METR points."""
    projection_dates = projection_df["date"]

    # Midpoint projection of the automatable wage bill
    line_options = {
        "label": label,
        "color": color,
        "linewidth": 2.5,
        "linestyle": line_style,
        "zorder": 3,
    }
    ax.plot(projection_dates, projection_df["automatable_wage_bill_mid"], **line_options)

    # Band between the lower-bound and upper-bound projections
    lower_curve = projection_df["automatable_wage_bill_lb"]
    upper_curve = projection_df["automatable_wage_bill_ub"]
    ax.fill_between(projection_dates, lower_curve, upper_curve, color=color, alpha=0.15, zorder=2)

    # Observed METR models, placed at the wage bill their horizon covers
    ax.scatter(
        metr_df['release_date'],
        metr_df['automatable_wage_bill_mid'],
        color=color,
        edgecolor='black',
        s=60,
        zorder=4,
        label="Model Capabilities (P50)",
    )
def generate_projected_automatable_wage_bill(
    output_dir: Path,
    df: pd.DataFrame,
    task_summary_by_occupation_df: pd.DataFrame,
    metr_results: Dict,
    **kwargs,
) -> Generator[Path, None, None]:
    """
    Generates a plot projecting the automatable wage bill under different
    AI progress scenarios (optimistic, baseline, pessimistic).

    Each occupation's wage bill is split evenly across its tasks; a task counts
    as automatable once the projected horizon reaches its time estimate.

    Args:
        output_dir: Directory the PNG is written to.
        df: Task-level frame with ``onetsoc_code`` and the estimate columns used
            by ``_generate_wage_projection_data``.
        task_summary_by_occupation_df: Per-occupation frame with ``onetsoc_code``,
            ``wage_bill`` and ``total_tasks``.
        metr_results: The METR benchmark data.
        **kwargs: Accepted and ignored, so every generator can be called with the same arguments.

    Yields:
        The path of the written image. Nothing is yielded when no scenario could
        be projected.
    """
    style_plot()
    OUTPUT_PATH = output_dir / "projected_automatable_wage_bill_sensitivity.png"

    # 1. Calculate wage_per_task for each occupation
    wage_bill_info = task_summary_by_occupation_df[['onetsoc_code', 'wage_bill', 'total_tasks']].copy()
    wage_bill_info['wage_per_task'] = wage_bill_info['wage_bill'] / wage_bill_info['total_tasks']
    wage_bill_info = wage_bill_info.replace([np.inf, -np.inf], 0)  # Avoid division by zero issues
    wage_bill_info = wage_bill_info.drop(columns=['wage_bill', 'total_tasks'])

    # 2. Merge wage_per_task into the main task dataframe
    df_with_wages = pd.merge(df, wage_bill_info, on='onetsoc_code', how='left')
    # Assign the filled column back: an in-place fillna on a selected column is a
    # chained assignment that does not update the frame under pandas Copy-on-Write.
    df_with_wages['wage_per_task'] = df_with_wages['wage_per_task'].fillna(0)

    # 3. Generate data for all three scenarios
    scenarios = {
        "Optimistic": {"modifier": 0.5, "color": "tab:green", "style": "--"},
        "Baseline": {"modifier": 1.0, "color": LIME['600'], "style": "-"},
        "Pessimistic": {"modifier": 2.0, "color": "tab:red", "style": ":"},
    }

    projection_results = {}
    for name, config in scenarios.items():
        result = _generate_wage_projection_data(metr_results, df_with_wages, 'p50_horizon_length', config['modifier'])
        if result:
            projection_results[name] = result

    if not projection_results:
        print("Warning: Could not generate any projection data. Skipping wage bill plot.")
        return

    # 4. Create the plot
    fig, ax = plt.subplots(figsize=(14, 9))

    # We only need to plot the scatter points once, let's use the baseline ones.
    if "Baseline" in projection_results:
        metr_df, _, _ = projection_results["Baseline"]
        ax.scatter(
            metr_df['release_date'],
            metr_df['automatable_wage_bill_mid'],
            color='black',
            s=80,
            zorder=5,
            label="Model Capabilities (P50)"
        )

    legend_lines = []
    for name, (_, proj_df, doubling_time) in projection_results.items():
        config = scenarios[name]
        ax.plot(
            proj_df["date"],
            proj_df["automatable_wage_bill_mid"],
            color=config['color'],
            linestyle=config['style'],
            linewidth=2.5,
            zorder=3
        )
        ax.fill_between(
            proj_df["date"],
            proj_df["automatable_wage_bill_lb"],
            proj_df["automatable_wage_bill_ub"],
            color=config['color'],
            alpha=0.15,
            zorder=2
        )
        # Create a custom line for the legend
        line = plt.Line2D([0], [0], color=config['color'], linestyle=config['style'], lw=2.5,
                          label=f'{name} (Doubling Time: {doubling_time:.0f} days)')
        legend_lines.append(line)

    # 5. Styling and annotations
    ax.set_title("Projected Automatable Wage Bill (P50 Coherence)", fontsize=18, pad=20)
    ax.set_xlabel("Year", fontsize=12)
    ax.set_ylabel("Automatable Annual Wage Bill (Trillions of USD)", fontsize=12)

    # Format Y-axis to show trillions
    def trillions_formatter(x, pos):
        return f'${x / 1e12:.1f}T'
    ax.yaxis.set_major_formatter(mticker.FuncFormatter(trillions_formatter))

    # The y axis tops out just above the wage bill of all tasks combined
    total_wage_bill = df_with_wages['wage_per_task'].sum()
    ax.set_ylim(0, total_wage_bill * 1.05)

    if "Baseline" in projection_results:
        _, proj_df, _ = projection_results["Baseline"]
        ax.set_xlim(datetime(2022, 1, 1), proj_df["date"].max())

    # Create the legend from the custom lines and the scatter plot
    scatter_legend = ax.get_legend_handles_labels()[0]
    ax.legend(handles=legend_lines + scatter_legend, loc="upper left", fontsize=11)
    ax.grid(True, which="both", linestyle="--", linewidth=0.5)

    plt.tight_layout()
    plt.savefig(OUTPUT_PATH)
    plt.close(fig)

    print(f"Generated sensitivity analysis plot: {OUTPUT_PATH}")
    yield OUTPUT_PATH

View file

@ -0,0 +1,168 @@
from pathlib import Path
from typing import Generator, Dict, Tuple
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import linregress
from datetime import datetime
from ..utils import style_plot, LIME
def _generate_projection_data(
metr_results: Dict,
df: pd.DataFrame,
percentile_key: str,
) -> Tuple[pd.DataFrame, pd.DataFrame] | None:
"""
Generates projection data for a given percentile key (e.g., 'p50_horizon_length').
Returns a tuple of (metr_df_with_pct, projection_df), or None if data is insufficient.
"""
# 1. Process METR data to get all model performance over time for the given percentile
all_model_data = []
for model_name, data in metr_results.get("results", {}).items():
for agent_name, agent_data in data.get("agents", {}).items():
release_date_str = data.get("release_date")
horizon = agent_data.get(percentile_key, {}).get("estimate")
if release_date_str and horizon is not None:
unique_model_name = f"{model_name}-{agent_name}"
all_model_data.append({
"model": unique_model_name,
"release_date": release_date_str,
"horizon_minutes": horizon,
})
if not all_model_data:
print(f"Warning: No models with {percentile_key} found in METR data. Skipping.")
return None
metr_df = pd.DataFrame(all_model_data).sort_values("release_date").reset_index(drop=True)
metr_df['release_date'] = pd.to_datetime(metr_df['release_date'])
# 2. Perform log-linear regression on coherence over time
metr_df = metr_df[metr_df['horizon_minutes'] > 0].copy()
if len(metr_df) < 2:
print(f"Warning: Not enough data points for regression for {percentile_key}. Skipping.")
return None
metr_df['days_since_start'] = (metr_df['release_date'] - metr_df['release_date'].min()).dt.days
log_y = np.log(metr_df['horizon_minutes'])
x = metr_df['days_since_start']
slope, intercept, r_value, _, _ = linregress(x, log_y)
doubling_time_days = np.log(2) / slope
print(f"METR all models {percentile_key} trend: R^2 = {r_value**2:.2f}, Doubling time = {doubling_time_days:.1f} days")
# 3. Project coherence into the future
start_date = metr_df['release_date'].min()
future_dates = pd.to_datetime(pd.date_range(start=start_date, end="2035-01-01", freq="ME"))
future_days = (future_dates - start_date).days.to_numpy()
projected_log_horizon = intercept + slope * future_days
projected_horizon_minutes = np.exp(projected_log_horizon)
projection_df = pd.DataFrame({
"date": future_dates,
"projected_coherence_minutes": projected_horizon_minutes,
})
# 4. Calculate the percentage of tasks automated over time based on our estimates
total_tasks = len(df)
if total_tasks == 0:
return None
for bound in ["lb", "mid", "ub"]:
col_name = 'estimate_midpoint' if bound == 'mid' else f'{bound}_estimate_in_minutes'
projection_df[f"pct_automatable_{bound}"] = projection_df["projected_coherence_minutes"].apply(
lambda h: (df[col_name] <= h).sum() / total_tasks * 100
)
metr_df["pct_automatable_mid"] = metr_df["horizon_minutes"].apply(
lambda h: (df['estimate_midpoint'] <= h).sum() / total_tasks * 100
)
return metr_df, projection_df
def _plot_projection(ax, projection_df, metr_df, label, color, line_style='-'):
"""Helper function to draw a single projection on a given axis."""
# Plot the projected automation percentage
ax.plot(
projection_df["date"],
projection_df["pct_automatable_mid"],
label=f"Mid-point",
color=color,
linewidth=2.5,
linestyle=line_style,
zorder=3
)
ax.fill_between(
projection_df["date"],
projection_df["pct_automatable_lb"],
projection_df["pct_automatable_ub"],
color=color,
alpha=0.15,
label=f"Lower/upper bound range",
zorder=2
)
# Plot the actual METR data points
ax.scatter(
metr_df['release_date'],
metr_df['pct_automatable_mid'],
color=color,
edgecolor='black',
s=60,
zorder=4,
label=f"Model with {label[1:]}% success rate"
)
def generate_projected_task_automation_plot(
    output_dir: Path,
    metr_results: Dict,
    df: pd.DataFrame,
    **kwargs,
) -> Generator[Path, None, None]:
    """
    Render one task-automation projection chart per METR horizon (P50, then P80).

    Both projections are computed up front. A chart is only drawn, saved under
    ``output_dir`` and yielded when its projection data is available.
    """
    style_plot()

    projections = (
        _generate_projection_data(metr_results, df, 'p50_horizon_length'),
        _generate_projection_data(metr_results, df, 'p80_horizon_length'),
    )

    # (legend label, colour, title, y-axis label, output file name) per chart
    chart_specs = (
        (
            "P50",
            LIME['600'],
            "How long before sequential coherence stops being a bottleneck?",
            "% of task automatable (50% success rate)",
            "projected_task_automation_p50.png",
        ),
        (
            "P80",
            'tab:cyan',
            "Projected Task Automation (P80 AI Coherence)",
            "% of Estimable Economic Tasks Automatable",
            "projected_task_automation_p80.png",
        ),
    )

    for data, (label, color, title, y_label, file_name) in zip(projections, chart_specs):
        if not data:
            continue
        metr_df, proj_df = data

        fig, ax = plt.subplots(figsize=(12, 8))
        _plot_projection(ax, proj_df, metr_df, label, color)

        ax.set_title(title, fontsize=16, pad=20)
        ax.set_xlabel("Year")
        ax.set_ylabel(y_label)
        ax.set_ylim(0, 100.5)
        ax.set_xlim(datetime(2022, 1, 1), proj_df["date"].max())
        ax.grid(True, which="both", linestyle="--", linewidth=0.5)
        ax.legend(loc="upper left")
        plt.tight_layout()

        output_path = output_dir / file_name
        plt.savefig(output_path)
        plt.close(fig)
        yield output_path

View file

@ -0,0 +1,54 @@
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from ..utils import LIME, style_plot
def plot_sequential_coherence_cdf(output_dir: Path, df: pd.DataFrame, **kwargs):
    """
    Plot the cumulative share of tasks whose time estimate is at most X minutes,
    for the lower-bound, upper-bound and mid-point estimates, on a log x-axis.

    Saves ``sequential_coherence_cdf.png`` under ``output_dir`` and yields its path.
    """
    style_plot()
    output_path = output_dir / "sequential_coherence_cdf.png"

    def cdf(series):
        """Return (sorted values, cumulative percentage) for ``series``."""
        ordered = series.sort_values().reset_index(drop=True)
        cumulative_pct = ((ordered.index + 1) / len(ordered)) * 100
        return ordered.values, cumulative_pct

    # All three curves are computed before the figure is created.
    lower_curve = cdf(df['lb_estimate_in_minutes'])
    upper_curve = cdf(df['ub_estimate_in_minutes'])
    mid_curve = cdf(df['estimate_midpoint'])

    fig, ax = plt.subplots(figsize=(12, 7))

    # Step plots, drawn in lower / upper / mid order
    step_specs = (
        (lower_curve, dict(color=LIME['300'], linewidth=1.8, linestyle='--', zorder=2, label='Lower bound estimate')),
        (upper_curve, dict(color=LIME['900'], linewidth=1.8, linestyle=':', zorder=3, label='Upper bound estimate')),
        (mid_curve, dict(color=LIME['600'], linewidth=2.2, zorder=4, label='Mid-point')),
    )
    for (xs, ys), step_kwargs in step_specs:
        ax.step(xs, ys, where='post', **step_kwargs)

    # --- Axes, titles and labels ---
    ax.set_xscale('log')
    ax.set_ylim(0, 100)
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(decimals=0))
    ax.set_title("% of Tasks With Sequential Coherence ≤ X")
    ax.set_xlabel("Sequential Coherence (X)")
    ax.set_ylabel("Cumulative Percentage of Tasks")

    # Human-readable tick positions, expressed in minutes
    tick_marks = (
        (1, '1 min'), (5, '5 min'), (10, '10 min'), (30, '30 min'),
        (60, '1 hr'), (120, '2 hr'), (240, '4 hr'), (480, '8 hr'),
        (1440, '1 day'), (2880, '2 days'), (10080, '1 wk'),
        (43200, '30 days'), (129600, '90 days'), (259200, '180 days'),
        (525600, '1 yr'),
    )
    ax.set_xticks([minutes for minutes, _ in tick_marks])
    ax.set_xticklabels([text for _, text in tick_marks], rotation=45, ha='right')
    ax.legend(loc='lower right')

    # --- Save and close ---
    plt.tight_layout()
    plt.savefig(output_path, bbox_inches='tight')
    plt.close(fig)
    yield output_path

View file

@ -1,41 +0,0 @@
"""
This module defines the Metadata model for the pipeline.
"""
from datetime import datetime
from pydantic import BaseModel, Field
from typing import Dict, Any
class Metadata(BaseModel):
    """
    A Pydantic model for storing pipeline metadata.

    This class is intended to be instantiated once and passed through the
    pipeline. Each step in the pipeline can then add its own metadata.
    This provides a centralized and structured way to track data provenance,
    versions, and other important information.
    """
    # Metadata recorded by fetchers: name -> free-form dict. Starts empty.
    fetchers: Dict[str, Dict[str, Any]] = Field(default_factory=dict)
    # Metadata recorded by enrichment steps: name -> free-form dict. Starts empty.
    enrichments: Dict[str, Dict[str, Any]] = Field(default_factory=dict)
    # Creation time formatted as "YYYY-MM-DD HH:MM:SS", taken from datetime.now()
    # (no timezone is passed).
    ts: str = Field(default_factory=lambda: datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    # Git commit hash, or the "errored"/"unknown" markers returned by the helper.
    # The lambda defers the name lookup: _get_current_commit is defined below this class.
    commit: str = Field(default_factory=lambda: _get_current_commit())
def _get_current_commit() -> str:
"""
Returns the current git commit hash, "unknown", or "errored" depending on why the commit could not be retrieved.
"""
import subprocess
try:
# Get the current commit hash
commit_hash = subprocess.check_output(
["git", "rev-parse", "HEAD"], stderr=subprocess.PIPE, text=True
).strip()
return commit_hash
except subprocess.CalledProcessError:
# If git command fails (e.g., not a git repository)
return "errored"
except FileNotFoundError:
# If git is not installed
return "unknown"

View file

@ -1,140 +0,0 @@
from .run import Run
from .logger import logger
import pandas as pd
import numpy as np
def check_for_insanity(run: Run) -> Run:
    """Placeholder for run-level sanity checks; not implemented yet, so it always raises."""
    raise NotImplementedError()
def create_df_tasks(run: Run) -> Run:
    """
    Creates a dataframe of tasks from the O*NET database, and merges it with remote status data.
    This replicates the logic from old/enrich_task_ratings.py and parts of old/analysis.py

    The resulting dataframe, `run.df_tasks` will be used by the enrichment steps.

    The result is cached as a parquet file under `run.cache_dir` (file name keyed by
    `run.onet_version`); when that file exists it is loaded and no query is run.

    Returns:
        The same `run` object, with `run.df_tasks` set.
    """
    logger.info("Creating tasks dataframe")
    cache_path = run.cache_dir / f"onet_{run.onet_version}_tasks_with_remote_status.parquet"
    # Fast path: reuse the cached dataframe if one was written by an earlier run.
    if cache_path.exists():
        logger.info(f"Loading cached tasks dataframe from {cache_path}")
        run.df_tasks = pd.read_parquet(cache_path)
        return run
    # One row per (task rating, DWA) combination; the LEFT JOINs keep ratings
    # for tasks that have no DWA attached.
    query = """
SELECT
tr.onetsoc_code,
tr.task_id,
ts.task,
od.title AS occupation_title,
od.description AS occupation_description,
tr.scale_id,
tr.category,
tr.data_value,
dr.dwa_title
FROM
task_ratings tr
JOIN
task_statements ts ON tr.task_id = ts.task_id
JOIN
occupation_data od ON tr.onetsoc_code = od.onetsoc_code
LEFT JOIN
tasks_to_dwas td ON tr.onetsoc_code = td.onetsoc_code AND tr.task_id = td.task_id
LEFT JOIN
dwa_reference dr ON td.dwa_id = dr.dwa_id;
"""
    df = pd.read_sql_query(query, run.onet_conn)
    logger.info(f"Fetched {len(df)} records (including DWA info) from the database.")
    # Separate ratings from DWAs
    # The DWA join repeats each rating once per DWA, hence the drop_duplicates().
    core_cols = [
        "onetsoc_code", "task_id", "task", "occupation_title",
        "occupation_description", "scale_id", "category", "data_value"
    ]
    ratings_df = df[core_cols].drop_duplicates().reset_index(drop=True)
    dwa_cols = ["onetsoc_code", "task_id", "dwa_title"]
    dwas_df = df[dwa_cols].dropna(subset=["dwa_title"]).drop_duplicates().reset_index(drop=True)
    # 1. Handle Frequency (FT)
    # Pivot the FT rows so each category becomes its own frequency_category_<n> column.
    logger.info("Processing Frequency data")
    freq_df = ratings_df[ratings_df["scale_id"] == "FT"].copy()
    if not freq_df.empty:
        freq_pivot = freq_df.pivot_table(
            index=["onetsoc_code", "task_id"],
            columns="category",
            values="data_value",
            fill_value=0,
        )
        freq_pivot.columns = [f"frequency_category_{int(col)}" for col in freq_pivot.columns]
    else:
        # No FT rows: build an empty frame with the same two-level index so the
        # index-based merge below still works.
        idx = pd.MultiIndex(levels=[[], []], codes=[[], []], names=["onetsoc_code", "task_id"])
        freq_pivot = pd.DataFrame(index=idx)
    # 2. Handle Importance (IM, IJ)
    # Mean of the IM and IJ values per (occupation, task).
    logger.info("Processing Importance data")
    imp_df = ratings_df[ratings_df["scale_id"].isin(["IM", "IJ"])].copy()
    if not imp_df.empty:
        imp_avg = imp_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index()
        imp_avg.rename(columns={"data_value": "importance_average"}, inplace=True)
    else:
        imp_avg = pd.DataFrame(columns=["onetsoc_code", "task_id", "importance_average"])
    # 3. Handle Relevance (RT)
    # Mean of the RT values per (occupation, task).
    logger.info("Processing Relevance data")
    rel_df = ratings_df[ratings_df["scale_id"] == "RT"].copy()
    if not rel_df.empty:
        rel_avg = rel_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index()
        rel_avg.rename(columns={"data_value": "relevance_average"}, inplace=True)
    else:
        rel_avg = pd.DataFrame(columns=["onetsoc_code", "task_id", "relevance_average"])
    # 4. Process DWAs
    # Collect all DWA titles of a task into one list-valued "dwas" cell.
    logger.info("Processing DWA data")
    if not dwas_df.empty:
        dwas_grouped = dwas_df.groupby(["onetsoc_code", "task_id"])["dwa_title"].apply(list).reset_index()
        dwas_grouped.rename(columns={"dwa_title": "dwas"}, inplace=True)
    else:
        dwas_grouped = None
    # 5. Get Base Task/Occupation Info
    logger.info("Extracting base task/occupation info")
    base_cols = ["onetsoc_code", "task_id", "task", "occupation_title", "occupation_description"]
    base_info = ratings_df[base_cols].drop_duplicates().set_index(["onetsoc_code", "task_id"])
    # 6. Merge Processed ONET Data
    # Left merges throughout: every base task is kept even without a matching rating.
    logger.info("Merging processed ONET data")
    final_df = base_info.merge(freq_pivot, left_index=True, right_index=True, how="left")
    final_df = final_df.reset_index()
    if not imp_avg.empty:
        final_df = final_df.merge(imp_avg, on=["onetsoc_code", "task_id"], how="left")
    else:
        final_df["importance_average"] = np.nan
    if not rel_avg.empty:
        final_df = final_df.merge(rel_avg, on=["onetsoc_code", "task_id"], how="left")
    else:
        final_df["relevance_average"] = np.nan
    if dwas_grouped is not None and not dwas_grouped.empty:
        final_df = final_df.merge(dwas_grouped, on=["onetsoc_code", "task_id"], how="left")
        if "dwas" in final_df.columns:
            # Tasks without DWAs come out of the left merge as non-lists; normalise to [].
            final_df["dwas"] = final_df["dwas"].apply(lambda x: x if isinstance(x, list) else [])
    else:
        final_df["dwas"] = [[] for _ in range(len(final_df))]
    # Replace NaN markers with None.
    # NOTE(review): presumably this turns numeric columns that held NaN into object
    # dtype until the frame is reloaded from parquet; confirm downstream consumers.
    final_df = final_df.replace({np.nan: None})
    # 7. Merge with EPOCH remote data
    # Joined on the task text itself.
    # NOTE(review): assumes 'Task' is unique in epoch_df; duplicates would multiply rows. Verify.
    logger.info("Merging with EPOCH remote data")
    final_df = pd.merge(final_df, run.epoch_df[['Task', 'Remote']], left_on='task', right_on='Task', how='left')
    final_df = final_df.drop('Task', axis=1).rename(columns={'Remote': 'remote_status'})
    logger.info(f"Created tasks dataframe with shape {final_df.shape}")
    # Persist for the cache fast path above.
    final_df.to_parquet(cache_path)
    run.df_tasks = final_df
    return run

View file

@ -1,27 +0,0 @@
from pydantic import BaseModel, Field
import sqlite3
import pandas as pd
from pathlib import Path
from typing import Optional
from .metadata import Metadata
class Run(BaseModel):
    """
    State container for one pipeline execution.

    Fetchers, postprocessors and enrichment steps store their outputs on the
    matching fields; every data field starts as ``None`` until its step has run.
    ``cache_dir`` and ``output_dir`` are required at construction time.
    """
    # sqlite3.Connection and pandas.DataFrame are not pydantic-native types.
    model_config = {"arbitrary_types_allowed": True}

    # === FETCHERS ===
    onet_conn: Optional[sqlite3.Connection] = None
    onet_version: Optional[str] = None
    oesm_df: Optional[pd.DataFrame] = None
    oesm_version: Optional[str] = None
    epoch_df: Optional[pd.DataFrame] = None
    epoch_version: Optional[str] = None

    # === POSTPROCESSORS ===
    # Task dataframe assigned by create_df_tasks(). It must be declared here:
    # pydantic models reject assignment to attributes that are not fields.
    df_tasks: Optional[pd.DataFrame] = None

    # === ENRICHMENTS ===
    task_estimateability_df: Optional[pd.DataFrame] = None
    task_estimates_df: Optional[pd.DataFrame] = None

    # Provenance information, created fresh for each run.
    meta: Metadata = Field(default_factory=Metadata)

    # Directory for cached intermediate data.
    cache_dir: Path
    # Directory for generated outputs.
    output_dir: Path

View file

@ -1,74 +1,215 @@
import sqlite3
import os
from .logger import logger
import pandas as pd
from dotenv import load_dotenv
from .fetchers import fetch_oesm_data, fetch_epoch_remote_data, fetch_onet_database
from .enrichments import enrich_with_task_estimateability, enrich_with_task_estimates
from .postprocessors import check_for_insanity, create_df_tasks
from .fetchers import fetch_onet_database, fetch_oesm_data, fetch_epoch_remote_data, ONET_VERSION, fetch_metr_data
from .classification import classify_tasks_as_estimable, generate_time_estimates_for_tasks
from .generators import GENERATORS
from .run import Run
from .constants import GRAY
from .aggregate import create_task_summary_by_occupation_df, aggregate_task_summary_by_major_code
from .utils import convert_to_minutes
import argparse
import platformdirs
import seaborn as sns
import matplotlib as mpl
import numpy as np
from pathlib import Path
from typing import Optional
# Per-user cache directory for the "econtai" application, as resolved by platformdirs.
CACHE_DIR = platformdirs.user_cache_dir("econtai")
def run(output_dir: Path | Optional[str] = None):
    """
    Run the pipeline end to end: fetch data, build the task dataframe, enrich it,
    sanity-check it and invoke every generator.

    Args:
        output_dir: Where outputs are written. ``None`` falls back to ``dist/``;
            a string is converted to a resolved ``Path``.
    """
    load_dotenv()
    _setup_graph_rendering()

    if isinstance(output_dir, str):
        output_dir = Path(output_dir).resolve()
    elif output_dir is None:
        output_dir = Path("dist/")
    output_dir.mkdir(parents=True, exist_ok=True)

    cache_dir = Path(CACHE_DIR).resolve()
    current_run = Run(output_dir=output_dir, cache_dir=cache_dir)
    current_run.cache_dir.mkdir(parents=True, exist_ok=True)

    # Fetchers (fetchers.py): each returns a (data, version) pair
    onet_conn, onet_version = fetch_onet_database(current_run)
    current_run.onet_conn, current_run.onet_version = onet_conn, onet_version
    oesm_df, oesm_version = fetch_oesm_data(current_run)
    current_run.oesm_df, current_run.oesm_version = oesm_df, oesm_version
    epoch_df, epoch_version = fetch_epoch_remote_data(current_run)
    current_run.epoch_df, current_run.epoch_version = epoch_df, epoch_version

    current_run = create_df_tasks(current_run)

    # Enrichments (enrichments.py)
    current_run.task_estimateability_df = enrich_with_task_estimateability(current_run)
    current_run.task_estimates_df = enrich_with_task_estimates(current_run)

    # Postprocessors (postprocessors.py)
    check_for_insanity(current_run)

    # Generators (generators/)
    for generator in GENERATORS:
        generator(current_run)
def _setup_graph_rendering():
    """Install the shared matplotlib defaults: gray surfaces, gray labels/ticks, Inter at size 11."""
    surface = GRAY['50']
    ink = GRAY['700']
    overrides = {
        'figure.facecolor': surface,
        'axes.facecolor': surface,
        'axes.edgecolor': GRAY['100'],
        'axes.labelcolor': ink,
        'xtick.color': ink,
        'ytick.color': ink,
        'font.family': 'Inter',
        'font.size': 11,
    }
    mpl.rcParams.update(overrides)
class Runner:
    """Holds the configuration and loaded data for one pipeline execution."""
    # Data sources; only annotated here, they are assigned in run().
    onet_conn: sqlite3.Connection
    oesm_df: pd.DataFrame
    epoch_df: pd.DataFrame
    metr_results: dict

    def __init__(self, output_dir: Path | str, debug: bool, bust_estimability: bool, bust_estimates: bool):
        # A string path is converted to a resolved Path; a Path is used as given.
        if isinstance(output_dir, str):
            output_dir = Path(output_dir).resolve()
        sns.set_style("white")
        # Outputs go to output_dir; intermediate files go to its "intermediate" subfolder.
        output_dir.mkdir(parents=True, exist_ok=True)
        self.output_dir = output_dir
        self.intermediate_dir = self.output_dir / "intermediate"
        self.intermediate_dir.mkdir(parents=True, exist_ok=True)
        # Per-user cache location for the "econtai" application.
        self.cache_dir = platformdirs.user_cache_path("econtai")
        self.debug = debug
        # Passed as `bust=` to the classification / estimation steps in run().
        self.bust_estimability = bust_estimability
        self.bust_estimates = bust_estimates
def main():
    """Command-line entry point: read ``--output-dir`` and start the pipeline."""
    cli = argparse.ArgumentParser(description="Run the econtai pipeline.")
    cli.add_argument("--output-dir", type=str, help="The directory to write output files to.")
    options = cli.parse_args()
    run(output_dir=options.output_dir)
# NOTE(review): `debug` is a parameter of Runner.__init__, so these two lines
# presumably belong at the end of that constructor; the diff rendering shows
# them after the removed main(). Confirm against the real file.
# In debug mode, keep an existing LITELLM_LOG value and default it to "INFO" otherwise.
if debug:
    os.environ["LITELLM_LOG"] = os.environ.get("LITELLM_LOG", "INFO")
def run(self):
    """
    Execute the whole pipeline.

    Steps: load environment variables, fetch the raw inputs, build the task
    dataframe, classify the important remote tasks as estimable or not,
    generate time estimates for the estimable ones, derive the minute-based
    estimate columns, write the intermediate parquet files, aggregate by
    occupation, sanity-check the estimates and finally run every generator,
    logging each asset it yields.

    Raises:
        ValueError: propagated from ``_check_for_insanity`` when estimates
            are missing or have impossible bounds.
    """
    load_dotenv()

    # --- Fetch raw inputs ---
    self.onet_conn = fetch_onet_database(self.cache_dir)
    self.oesm_df = fetch_oesm_data(self.cache_dir)
    self.epoch_df = fetch_epoch_remote_data(self.cache_dir)
    self.metr_results = fetch_metr_data(self.cache_dir)

    # --- Task dataframe ---
    self.df_tasks = self._create_df_tasks()
    # The first two characters of the SOC code identify the major occupation group.
    self.df_tasks['onetsoc_major'] = self.df_tasks['onetsoc_code'].str[:2]

    # Only important (average importance > 3) remote tasks are classified.
    df_to_process = self.df_tasks[
        (self.df_tasks['importance_average'] > 3) &
        (self.df_tasks['remote_status'] == 'remote')
    ].copy()
    if self.debug:
        df_to_process = df_to_process.head(10)

    # --- Estimability classification ---
    task_estimability_df = classify_tasks_as_estimable(self.cache_dir, df_to_process, bust=self.bust_estimability)

    # NOTE(review): both merges below join on the task text only; this assumes
    # task_estimability_df has one row per distinct task. Verify.
    self.df_tasks = pd.merge(self.df_tasks, task_estimability_df, on='task', how='left')
    # Tasks that were never classified count as not estimable.
    self.df_tasks['estimable'] = self.df_tasks['estimable'].fillna(False)
    self.df_tasks.to_parquet(self.intermediate_dir / "df_tasks.parquet")

    df_to_process = pd.merge(df_to_process, task_estimability_df, on='task', how='left')
    # Fill this frame's OWN column. Both frames carry a fresh 0..n-1 index after
    # their merges, so assigning self.df_tasks['estimable'] here would align on
    # unrelated row labels and give each task another task's flag.
    df_to_process['estimable'] = df_to_process['estimable'].fillna(False)
    df_to_process = df_to_process[df_to_process['estimable']].copy()

    # --- Time estimates ---
    task_estimates_df = generate_time_estimates_for_tasks(self.cache_dir, df_to_process, bust=self.bust_estimates)
    df = pd.merge(df_to_process, task_estimates_df, on=['onetsoc_code', 'task_id'], how='left')
    df['lb_estimate_in_minutes'] = df.apply(lambda row: convert_to_minutes(row['lb_estimate_qty'], row['lb_estimate_unit']), axis=1)
    df['ub_estimate_in_minutes'] = df.apply(lambda row: convert_to_minutes(row['ub_estimate_qty'], row['ub_estimate_unit']), axis=1)
    df['estimate_range'] = df.ub_estimate_in_minutes - df.lb_estimate_in_minutes
    df['estimate_ratio'] = np.divide(df.ub_estimate_in_minutes, df.lb_estimate_in_minutes).replace([np.inf, -np.inf], None)
    df['estimate_midpoint'] = (df.lb_estimate_in_minutes + df.ub_estimate_in_minutes) / 2
    df.to_parquet(self.intermediate_dir / "estimable_tasks_with_estimates.parquet")

    # --- Aggregations ---
    self.task_summary_by_occupation_df = create_task_summary_by_occupation_df(self.df_tasks, self.oesm_df)
    self.task_summary_by_occupation_df.to_parquet(self.intermediate_dir / "task_summary_by_occupation.parquet")
    self.task_summary_by_major_occupation_df = aggregate_task_summary_by_major_code(self.task_summary_by_occupation_df)
    self.task_summary_by_major_occupation_df.to_parquet(self.intermediate_dir / "task_summary_by_major_occupation.parquet")

    self._check_for_insanity(df)

    # --- Generators ---
    # Every generator receives the same keyword arguments and yields asset paths.
    generator_inputs = {
        "output_dir": self.output_dir,
        "runner": self,
        "df": df,
        "task_summary_by_occupation_df": self.task_summary_by_occupation_df,
        "task_summary_by_major_occupation_df": self.task_summary_by_major_occupation_df,
        "df_tasks": self.df_tasks,
        "oesm_df": self.oesm_df,
        "metr_results": self.metr_results,
    }
    for gen in GENERATORS:
        for asset in gen(**generator_inputs):
            logger.info(f"New asset: {asset}")
def _create_df_tasks(self) -> pd.DataFrame:
    """
    Build (or load from cache) the task dataframe: one row per (occupation, task)
    with frequency columns, average importance, average relevance and the
    EPOCH remote status.

    The result is cached as a parquet file under ``self.cache_dir`` (file name
    keyed by ONET_VERSION); when that file exists it is returned directly.

    Raises:
        ValueError: when the query yields no frequency, importance or relevance rows.
    """
    DATA_PATH = self.cache_dir / f"onet_{ONET_VERSION}_tasks_with_remote_status.parquet"
    # Fast path: reuse the dataframe written by an earlier run.
    if DATA_PATH.exists():
        logger.info(f"Loading cached tasks dataframe from {DATA_PATH}")
        return pd.read_parquet(DATA_PATH)
    logger.info("Creating tasks dataframe")
    # One row per task rating, with the task text and occupation info attached.
    query = """
SELECT
tr.onetsoc_code,
tr.task_id,
ts.task,
od.title AS occupation_title,
od.description AS occupation_description,
tr.scale_id,
tr.category,
tr.data_value
FROM
task_ratings tr
JOIN
task_statements ts ON tr.task_id = ts.task_id
JOIN
occupation_data od ON tr.onetsoc_code = od.onetsoc_code;
"""
    ratings_df = pd.read_sql_query(query, self.onet_conn)
    logger.info(f"Fetched {len(ratings_df)} task rating records from the database.")
    # 1. Handle Frequency (FT)
    # Pivot the FT rows so each category becomes its own frequency_category_<n> column.
    logger.info("Processing Frequency data")
    freq_df = ratings_df[ratings_df["scale_id"] == "FT"].copy()
    if not freq_df.empty:
        freq_pivot = freq_df.pivot_table(
            index=["onetsoc_code", "task_id"],
            columns="category",
            values="data_value",
            fill_value=0,
        )
        freq_pivot.columns = [f"frequency_category_{int(col)}" for col in freq_pivot.columns]
    else:
        raise ValueError("No frequency data.")
    # 2. Handle Importance (IM, IJ)
    # Mean of the IM and IJ values per (occupation, task).
    logger.info("Processing Importance data")
    imp_df = ratings_df[ratings_df["scale_id"].isin(["IM", "IJ"])].copy()
    if not imp_df.empty:
        imp_avg = imp_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index()
        imp_avg.rename(columns={"data_value": "importance_average"}, inplace=True)
    else:
        raise ValueError("No importance data.")
    # 3. Handle Relevance (RT)
    # Mean of the RT values per (occupation, task).
    logger.info("Processing Relevance data")
    rel_df = ratings_df[ratings_df["scale_id"] == "RT"].copy()
    if not rel_df.empty:
        rel_avg = rel_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index()
        rel_avg.rename(columns={"data_value": "relevance_average"}, inplace=True)
    else:
        raise ValueError("No relevance data.")
    # 5. Get Base Task/Occupation Info
    # (There is no step 4 in this function; the numbering jumps from 3 to 5.)
    logger.info("Extracting base task/occupation info")
    base_cols = ["onetsoc_code", "task_id", "task", "occupation_title", "occupation_description"]
    base_info = ratings_df[base_cols].drop_duplicates().set_index(["onetsoc_code", "task_id"])
    # 6. Merge Processed ONET Data
    # Left merges throughout: every base task is kept even without a matching rating.
    logger.info("Merging processed ONET data")
    final_df = base_info.merge(freq_pivot, left_index=True, right_index=True, how="left")
    final_df = final_df.reset_index()
    # The else branches below only run if a groupby result came out empty even
    # though its source rows were not (the empty-source case already raised above).
    if not imp_avg.empty:
        final_df = final_df.merge(imp_avg, on=["onetsoc_code", "task_id"], how="left")
    else:
        final_df["importance_average"] = np.nan
    if not rel_avg.empty:
        final_df = final_df.merge(rel_avg, on=["onetsoc_code", "task_id"], how="left")
    else:
        final_df["relevance_average"] = np.nan
    # Replace NaN markers with None.
    # NOTE(review): presumably this turns numeric columns that held NaN into object
    # dtype on a cache miss, while the cached parquet path returns whatever dtypes
    # parquet stored; confirm both paths behave the same for callers.
    final_df = final_df.replace({np.nan: None})
    # 7. Merge with EPOCH remote data
    # Joined on the task text itself.
    # NOTE(review): assumes 'Task' is unique in epoch_df; duplicates would multiply rows. Verify.
    logger.info("Merging with EPOCH remote data")
    final_df = pd.merge(final_df, self.epoch_df[['Task', 'Remote']], left_on='task', right_on='Task', how='left')
    final_df = final_df.drop('Task', axis=1).rename(columns={'Remote': 'remote_status'})
    logger.info(f"Created tasks dataframe with shape {final_df.shape}")
    # Persist for the cache fast path above.
    final_df.to_parquet(DATA_PATH)
    return final_df
def _check_for_insanity(self, df: pd.DataFrame):
if df['lb_estimate_in_minutes'].isnull().any():
missing_count = df['lb_estimate_in_minutes'].isnull().sum()
raise ValueError(f"Found {missing_count} atomic tasks with missing 'lb_estimate_in_minutes'.")
if df['ub_estimate_in_minutes'].isnull().any():
missing_count = df['ub_estimate_in_minutes'].isnull().sum()
raise ValueError(f"Found {missing_count} atomic tasks with missing 'ub_estimate_in_minutes'.")
valid_estimates = df.dropna(subset=['lb_estimate_in_minutes', 'ub_estimate_in_minutes'])
impossible_bounds = valid_estimates[
(valid_estimates['lb_estimate_in_minutes'] <= 0) |
(valid_estimates['ub_estimate_in_minutes'] <= 0) |
(valid_estimates['lb_estimate_in_minutes'] > valid_estimates['ub_estimate_in_minutes'])
]
if not impossible_bounds.empty:
raise ValueError(f"Found {len(impossible_bounds)} rows with impossible bounds (e.g., lb > ub or value <= 0).")
# Script entry point.
if __name__ == "__main__":
    # NOTE(review): this diff rendering interleaves the removed `main()` call with
    # the added argument parsing below; presumably only one of them exists in
    # either version of the file. Confirm against the real file.
    main()
    parser = argparse.ArgumentParser(description="Run the econtai pipeline.")
    parser.add_argument("--output-dir", type=str, default="dist/", help="The directory to write output files to.")
    # The two --bust-* flags are forwarded to Runner as bust_estimability / bust_estimates.
    parser.add_argument("--bust-estimability", action="store_true", help="Bust the saved task estimability classification (EXPENSIVE)")
    parser.add_argument("--bust-estimates", action="store_true", help="Bust the tasks estimates (EXPENSIVE)")
    parser.add_argument("--debug", action="store_true", help="Enable debug mode (e.g., process fewer tasks).")
    args = parser.parse_args()
    Runner(output_dir=args.output_dir, debug=args.debug, bust_estimability=args.bust_estimability, bust_estimates=args.bust_estimates).run()

222
pipeline/utils.py Normal file
View file

@ -0,0 +1,222 @@
import subprocess
import matplotlib.colors as mcolors
import matplotlib as mpl
import seaborn as sns
import tempfile
import litellm
import time
import math
from tqdm import tqdm
from typing import Any, List, Dict
from .logger import logger
# Display names for occupation major groups, keyed by two-character code string.
OCCUPATION_MAJOR_CODES = {
    '11': 'Management',
    '13': 'Business & Financial',
    '15': 'Computer & Mathematical',
    '17': 'Architecture & Engineering',
    '19': 'Life, Physical, & Social Science',
    '21': 'Community & Social Service',
    '23': 'Legal',
    '25': 'Education, Training, & Library',
    '27': 'Arts, Design, & Media',
    '29': 'Healthcare Practitioners',
    '31': 'Healthcare Support',
    '33': 'Protective Service',
    '35': 'Food Preparation & Serving',
    '37': 'Building & Grounds Maintenance',
    '39': 'Personal Care & Service',
    '41': 'Sales & Related',
    '43': 'Office & Admin Support',
    '45': 'Farming, Fishing, & Forestry',
    '47': 'Construction & Extraction',
    '49': 'Installation, Maintenance, & Repair',
    '51': 'Production',
    '53': 'Transportation & Material Moving',
    '55': 'Military Specific',
}
# Gray colour ramp as hex strings, keyed by shade ('50' lightest ... '950' darkest).
GRAY = {'50':'#f8fafc','100':'#f1f5f9','200':'#e2e8f0',
        '300':'#cbd5e1','400':'#94a3b8','500':'#64748b',
        '600':'#475569','700':'#334155','800':'#1e293b',
        '900':'#0f172a','950':'#020617'}
# Lime colour ramp as hex strings, keyed the same way as GRAY.
LIME = {'50': '#f7fee7','100': '#ecfcca','200': '#d8f999',
        '300': '#bbf451','400': '#9ae600','500': '#83cd00',
        '600': '#64a400','700': '#497d00','800': '#3c6300',
        '900': '#35530e','950': '#192e03'}
def convert_to_minutes(qty, unit):
    """
    Convert a quantity expressed in ``unit`` to minutes.

    Args:
        qty: Numeric amount, or ``None``/NaN when the estimate is missing.
        unit: One of "minute", "hour", "day", "week", "month" (30 days),
            "trimester" (90 days), "semester" (180 days) or "year" (365 days);
            ``None``/NaN when the estimate is missing.

    Returns:
        ``qty`` converted to minutes, or NaN when ``qty`` or ``unit`` is missing.

    Raises:
        KeyError: if ``unit`` is present but not one of the supported names.
    """
    # Rows without an estimate carry None/NaN in both columns (e.g. after a left
    # merge). Returning NaN lets downstream null checks report them, instead of
    # failing here with an opaque KeyError/TypeError.
    if qty is None or unit is None:
        return math.nan
    if isinstance(unit, float) and math.isnan(unit):
        return math.nan
    return qty * {
        "minute": 1,
        "hour": 60,
        "day": 60 * 24,
        "week": 60 * 24 * 7,
        "month": 60 * 24 * 30,
        "trimester": 60 * 24 * 90,
        "semester": 60 * 24 * 180,
        "year": 60 * 24 * 365,
    }[unit]
def pretty_display(df):
    """
    Print ``df`` to standard output.

    The previous body carried unreachable code after its unconditional
    ``return`` (an HTML preview opened through a hard-coded, machine-specific
    browser path); it never ran and has been removed.
    """
    print(df)
def enrich(
    model: str,
    rpm: int,  # Requests per minute
    messages_to_process: List[List[Dict[str, str]]],
    schema: Dict[str, Any],
    chunk_size: int = 100,
):
    """
    Send every conversation in ``messages_to_process`` to ``model`` in chunks,
    throttled so that the overall pace does not exceed ``rpm`` requests per minute.

    Returns:
        A list with one entry per input conversation, in input order: the
        response, or ``None`` where the request (or its whole chunk) failed.
        An empty input returns an empty list without calling the API.
    """
    total_messages = len(messages_to_process)
    if total_messages == 0:
        return []

    chunk_count = math.ceil(total_messages / chunk_size)
    logger.info(f"Starting enrichment for {total_messages} messages, in {chunk_count} chunks of up to {chunk_size} each.")

    # Time budget per request implied by the rate limit (0 disables throttling).
    seconds_per_request = 60.0 / rpm if rpm > 0 else 0

    collected = []
    for chunk_index in tqdm(range(chunk_count), desc="Enriching data in chunks"):
        started_at = time.time()
        offset = chunk_index * chunk_size
        batch = messages_to_process[offset:offset + chunk_size]
        if not batch:
            continue

        try:
            # One batched call per chunk.
            responses = litellm.batch_completion(
                model=model,
                messages=batch,
                response_format={
                    "type": "json_schema",
                    "json_schema": schema,
                },
            )
            # Each item is either a response or an exception object; failures become None.
            for response in responses:
                if isinstance(response, Exception):
                    logger.error(f"API call within batch failed: {response}")
                    collected.append(None)
                    continue
                collected.append(response)
        except Exception as e:
            # The batch call itself failed: mark the whole chunk as failed.
            logger.error(f"litellm.batch_completion call failed for chunk {chunk_index+1}/{chunk_count}: {e}")
            collected.extend([None] * len(batch))

        elapsed = time.time() - started_at

        # Sleep off whatever is left of the chunk's time budget.
        if seconds_per_request > 0:
            minimum_duration = len(batch) * seconds_per_request
            if elapsed < minimum_duration:
                pause = minimum_duration - elapsed
                logger.debug(f"Chunk processed in {elapsed:.2f}s. Sleeping for {pause:.2f}s to respect RPM.")
                time.sleep(pause)

    return collected
def get_contrasting_text_color(bg_color_hex_or_rgba):
    """
    Pick a readable text colour for the given background.

    Args:
        bg_color_hex_or_rgba: A colour string (converted with
            ``matplotlib.colors.to_rgba``) or a sequence of channel values
            ``(r, g, b)`` or ``(r, g, b, a)``.

    Returns:
        ``'black'`` when the weighted luminance of the background exceeds 0.55,
        otherwise ``'white'``.
    """
    if isinstance(bg_color_hex_or_rgba, str):
        rgba = mcolors.to_rgba(bg_color_hex_or_rgba)
    else:
        rgba = bg_color_hex_or_rgba
    # Alpha plays no part in the luminance; taking the first three channels
    # also accepts plain RGB triples, which 4-way unpacking rejected.
    r, g, b = tuple(rgba)[:3]
    luminance = 0.2126 * r + 0.7152 * g + 0.0722 * b
    return 'black' if luminance > 0.55 else 'white'
def style_plot():
    """
    Apply the shared plot styling globally.

    Updates matplotlib's rcParams, then sets the seaborn colour palette to the
    mid-range LIME shades and the seaborn style to "whitegrid".
    """
    figure_rc = {
        'figure.facecolor': GRAY['50'],
        'figure.edgecolor': 'none',
        'figure.figsize': (12, 8),
        'figure.dpi': 150,
    }
    axes_rc = {
        'axes.facecolor': GRAY['50'],
        'axes.edgecolor': GRAY['300'],
        'axes.grid': True,
        'axes.labelcolor': GRAY['800'],
        'axes.titlecolor': GRAY['900'],
        'axes.titlesize': 18,
        'axes.titleweight': 'bold',
        'axes.titlepad': 20,
        'axes.labelsize': 14,
        'axes.labelweight': 'semibold',
        'axes.labelpad': 10,
        'axes.spines.top': False,
        'axes.spines.right': False,
        'axes.spines.left': True,
        'axes.spines.bottom': True,
    }
    text_and_tick_rc = {
        'text.color': GRAY['700'],
        'xtick.color': GRAY['600'],
        'ytick.color': GRAY['600'],
        'xtick.labelsize': 12,
        'ytick.labelsize': 12,
        'xtick.major.size': 0,
        'ytick.major.size': 0,
        'xtick.minor.size': 0,
        'ytick.minor.size': 0,
        'xtick.major.pad': 8,
        'ytick.major.pad': 8,
    }
    grid_and_legend_rc = {
        'grid.color': GRAY['200'],
        'grid.linestyle': '--',
        'grid.linewidth': 1,
        'legend.frameon': False,
        'legend.fontsize': 12,
        'legend.title_fontsize': 14,
        'legend.facecolor': 'inherit',
    }
    font_and_line_rc = {
        'font.family': 'sans-serif',
        'font.sans-serif': ['Inter'],
        'font.weight': 'normal',
        'lines.linewidth': 2,
        'lines.markersize': 6,
    }
    mpl.rcParams.update({
        **figure_rc,
        **axes_rc,
        **text_and_tick_rc,
        **grid_and_legend_rc,
        **font_and_line_rc,
    })

    # Seaborn palette: LIME shades ordered from darkest to lightest, leaving out
    # both the two lightest shades and the four darkest ones.
    skipped_shades = {'50', '100', '700', '800', '900', '950'}
    kept_shades = sorted(
        (shade for shade in LIME if shade not in skipped_shades),
        key=int,
        reverse=True,
    )
    lime_palette = [LIME[shade] for shade in kept_shades]
    sns.set_palette(lime_palette)

    whitegrid_overrides = {
        'axes.edgecolor': GRAY['300'],
        'grid.color': GRAY['200'],
        'grid.linestyle': '--',
    }
    sns.set_style("whitegrid", whitegrid_overrides)

View file

@ -16,6 +16,7 @@ dependencies = [
"python-dotenv>=1.1.1",
"requests>=2.32.4",
"rich>=14.0.0",
"scipy>=1.16.0",
"seaborn>=0.13.2",
]

31
uv.lock generated
View file

@ -1120,6 +1120,35 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/75/04/5302cea1aa26d886d34cadbf2dc77d90d7737e576c0065f357b96dc7a1a6/rpds_py-0.26.0-cp314-cp314t-win_amd64.whl", hash = "sha256:f14440b9573a6f76b4ee4770c13f0b5921f71dde3b6fcb8dabbefd13b7fe05d7", size = 232821, upload_time = "2025-07-01T15:55:55.167Z" },
]
[[package]]
name = "scipy"
version = "1.16.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "numpy" },
]
sdist = { url = "https://files.pythonhosted.org/packages/81/18/b06a83f0c5ee8cddbde5e3f3d0bb9b702abfa5136ef6d4620ff67df7eee5/scipy-1.16.0.tar.gz", hash = "sha256:b5ef54021e832869c8cfb03bc3bf20366cbcd426e02a58e8a58d7584dfbb8f62", size = 30581216, upload_time = "2025-06-22T16:27:55.782Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/46/95/0746417bc24be0c2a7b7563946d61f670a3b491b76adede420e9d173841f/scipy-1.16.0-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:e9f414cbe9ca289a73e0cc92e33a6a791469b6619c240aa32ee18abdce8ab451", size = 36418162, upload_time = "2025-06-22T16:19:56.3Z" },
{ url = "https://files.pythonhosted.org/packages/19/5a/914355a74481b8e4bbccf67259bbde171348a3f160b67b4945fbc5f5c1e5/scipy-1.16.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:bbba55fb97ba3cdef9b1ee973f06b09d518c0c7c66a009c729c7d1592be1935e", size = 28465985, upload_time = "2025-06-22T16:20:01.238Z" },
{ url = "https://files.pythonhosted.org/packages/58/46/63477fc1246063855969cbefdcee8c648ba4b17f67370bd542ba56368d0b/scipy-1.16.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:58e0d4354eacb6004e7aa1cd350e5514bd0270acaa8d5b36c0627bb3bb486974", size = 20737961, upload_time = "2025-06-22T16:20:05.913Z" },
{ url = "https://files.pythonhosted.org/packages/93/86/0fbb5588b73555e40f9d3d6dde24ee6fac7d8e301a27f6f0cab9d8f66ff2/scipy-1.16.0-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:75b2094ec975c80efc273567436e16bb794660509c12c6a31eb5c195cbf4b6dc", size = 23377941, upload_time = "2025-06-22T16:20:10.668Z" },
{ url = "https://files.pythonhosted.org/packages/ca/80/a561f2bf4c2da89fa631b3cbf31d120e21ea95db71fd9ec00cb0247c7a93/scipy-1.16.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6b65d232157a380fdd11a560e7e21cde34fdb69d65c09cb87f6cc024ee376351", size = 33196703, upload_time = "2025-06-22T16:20:16.097Z" },
{ url = "https://files.pythonhosted.org/packages/11/6b/3443abcd0707d52e48eb315e33cc669a95e29fc102229919646f5a501171/scipy-1.16.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1d8747f7736accd39289943f7fe53a8333be7f15a82eea08e4afe47d79568c32", size = 35083410, upload_time = "2025-06-22T16:20:21.734Z" },
{ url = "https://files.pythonhosted.org/packages/20/ab/eb0fc00e1e48961f1bd69b7ad7e7266896fe5bad4ead91b5fc6b3561bba4/scipy-1.16.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:eb9f147a1b8529bb7fec2a85cf4cf42bdfadf9e83535c309a11fdae598c88e8b", size = 35387829, upload_time = "2025-06-22T16:20:27.548Z" },
{ url = "https://files.pythonhosted.org/packages/57/9e/d6fc64e41fad5d481c029ee5a49eefc17f0b8071d636a02ceee44d4a0de2/scipy-1.16.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d2b83c37edbfa837a8923d19c749c1935ad3d41cf196006a24ed44dba2ec4358", size = 37841356, upload_time = "2025-06-22T16:20:35.112Z" },
{ url = "https://files.pythonhosted.org/packages/7c/a7/4c94bbe91f12126b8bf6709b2471900577b7373a4fd1f431f28ba6f81115/scipy-1.16.0-cp313-cp313-win_amd64.whl", hash = "sha256:79a3c13d43c95aa80b87328a46031cf52508cf5f4df2767602c984ed1d3c6bbe", size = 38403710, upload_time = "2025-06-22T16:21:54.473Z" },
{ url = "https://files.pythonhosted.org/packages/47/20/965da8497f6226e8fa90ad3447b82ed0e28d942532e92dd8b91b43f100d4/scipy-1.16.0-cp313-cp313t-macosx_10_14_x86_64.whl", hash = "sha256:f91b87e1689f0370690e8470916fe1b2308e5b2061317ff76977c8f836452a47", size = 36813833, upload_time = "2025-06-22T16:20:43.925Z" },
{ url = "https://files.pythonhosted.org/packages/28/f4/197580c3dac2d234e948806e164601c2df6f0078ed9f5ad4a62685b7c331/scipy-1.16.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:88a6ca658fb94640079e7a50b2ad3b67e33ef0f40e70bdb7dc22017dae73ac08", size = 28974431, upload_time = "2025-06-22T16:20:51.302Z" },
{ url = "https://files.pythonhosted.org/packages/8a/fc/e18b8550048d9224426e76906694c60028dbdb65d28b1372b5503914b89d/scipy-1.16.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:ae902626972f1bd7e4e86f58fd72322d7f4ec7b0cfc17b15d4b7006efc385176", size = 21246454, upload_time = "2025-06-22T16:20:57.276Z" },
{ url = "https://files.pythonhosted.org/packages/8c/48/07b97d167e0d6a324bfd7484cd0c209cc27338b67e5deadae578cf48e809/scipy-1.16.0-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:8cb824c1fc75ef29893bc32b3ddd7b11cf9ab13c1127fe26413a05953b8c32ed", size = 23772979, upload_time = "2025-06-22T16:21:03.363Z" },
{ url = "https://files.pythonhosted.org/packages/4c/4f/9efbd3f70baf9582edf271db3002b7882c875ddd37dc97f0f675ad68679f/scipy-1.16.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:de2db7250ff6514366a9709c2cba35cb6d08498e961cba20d7cff98a7ee88938", size = 33341972, upload_time = "2025-06-22T16:21:11.14Z" },
{ url = "https://files.pythonhosted.org/packages/3f/dc/9e496a3c5dbe24e76ee24525155ab7f659c20180bab058ef2c5fa7d9119c/scipy-1.16.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e85800274edf4db8dd2e4e93034f92d1b05c9421220e7ded9988b16976f849c1", size = 35185476, upload_time = "2025-06-22T16:21:19.156Z" },
{ url = "https://files.pythonhosted.org/packages/ce/b3/21001cff985a122ba434c33f2c9d7d1dc3b669827e94f4fc4e1fe8b9dfd8/scipy-1.16.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4f720300a3024c237ace1cb11f9a84c38beb19616ba7c4cdcd771047a10a1706", size = 35570990, upload_time = "2025-06-22T16:21:27.797Z" },
{ url = "https://files.pythonhosted.org/packages/e5/d3/7ba42647d6709251cdf97043d0c107e0317e152fa2f76873b656b509ff55/scipy-1.16.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:aad603e9339ddb676409b104c48a027e9916ce0d2838830691f39552b38a352e", size = 37950262, upload_time = "2025-06-22T16:21:36.976Z" },
{ url = "https://files.pythonhosted.org/packages/eb/c4/231cac7a8385394ebbbb4f1ca662203e9d8c332825ab4f36ffc3ead09a42/scipy-1.16.0-cp313-cp313t-win_amd64.whl", hash = "sha256:f56296fefca67ba605fd74d12f7bd23636267731a72cb3947963e76b8c0a25db", size = 38515076, upload_time = "2025-06-22T16:21:45.694Z" },
]
[[package]]
name = "seaborn"
version = "0.13.2"
@ -1168,6 +1197,7 @@ dependencies = [
{ name = "python-dotenv" },
{ name = "requests" },
{ name = "rich" },
{ name = "scipy" },
{ name = "seaborn" },
]
@ -1184,6 +1214,7 @@ requires-dist = [
{ name = "python-dotenv", specifier = ">=1.1.1" },
{ name = "requests", specifier = ">=2.32.4" },
{ name = "rich", specifier = ">=14.0.0" },
{ name = "scipy", specifier = ">=1.16.0" },
{ name = "seaborn", specifier = ">=0.13.2" },
]