wip
This commit is contained in:
parent
62296e1b69
commit
65dc648797
37 changed files with 1413 additions and 2433 deletions
81
pipeline/aggregate.py
Normal file
81
pipeline/aggregate.py
Normal file
|
@ -0,0 +1,81 @@
|
|||
from .utils import OCCUPATION_MAJOR_CODES
|
||||
import pandas as pd
|
||||
|
||||
def create_task_summary_by_occupation_df(df_tasks: pd.DataFrame, oesm_df: pd.DataFrame) -> pd.DataFrame:
|
||||
# --- OESM Wage Bill Calculation ---
|
||||
df_oesm_with_bill = oesm_df.copy()
|
||||
df_oesm_with_bill.rename(columns={'OCC_CODE': 'onetsoc_code'}, inplace=True)
|
||||
|
||||
# Convert key columns to numeric, handling potential errors
|
||||
df_oesm_with_bill['TOT_EMP'] = pd.to_numeric(df_oesm_with_bill['TOT_EMP'], errors='coerce')
|
||||
df_oesm_with_bill['A_MEAN'] = pd.to_numeric(df_oesm_with_bill['A_MEAN'], errors='coerce')
|
||||
df_oesm_with_bill.dropna(subset=['TOT_EMP', 'A_MEAN', 'onetsoc_code'], inplace=True)
|
||||
|
||||
# Calculate the wage bill for each occupation
|
||||
df_oesm_with_bill['wage_bill'] = df_oesm_with_bill['TOT_EMP'] * df_oesm_with_bill['A_MEAN']
|
||||
oesm_lookup = df_oesm_with_bill.set_index('onetsoc_code')
|
||||
|
||||
summary_data = []
|
||||
|
||||
# Assuming df_tasks has an 'onetsoc_code' column with the full SOC code
|
||||
unique_soc_codes = df_tasks['onetsoc_code'].unique()
|
||||
|
||||
for code in unique_soc_codes:
|
||||
occ_df = df_tasks[df_tasks['onetsoc_code'] == code]
|
||||
total_tasks_in_occ = len(occ_df)
|
||||
|
||||
not_remote_count = len(occ_df[occ_df['remote_status'] != 'remote'])
|
||||
remote_df = occ_df[occ_df['remote_status'] == 'remote']
|
||||
remote_estimable_count = len(remote_df[remote_df['estimable']])
|
||||
remote_not_estimable_count = len(remote_df[~remote_df['estimable']])
|
||||
|
||||
try:
|
||||
# O*NET codes (e.g., 11-1011.03) are more specific than OESM SOC codes (e.g., 11-1011).
|
||||
# We strip the suffix from the O*NET code to find the corresponding wage data.
|
||||
soc_code_for_lookup = code.split('.')[0]
|
||||
wage_bill = oesm_lookup.loc[soc_code_for_lookup, 'wage_bill']
|
||||
label = oesm_lookup.loc[soc_code_for_lookup, 'OCC_TITLE']
|
||||
except KeyError:
|
||||
wage_bill = 0
|
||||
label = "Unknown"
|
||||
|
||||
summary_data.append({
|
||||
'onetsoc_code': code,
|
||||
'occupation_label': label,
|
||||
'wage_bill': wage_bill,
|
||||
'count_not_remote': not_remote_count,
|
||||
'count_remote_estimable': remote_estimable_count,
|
||||
'count_remote_not_estimable': remote_not_estimable_count,
|
||||
'total_tasks': total_tasks_in_occ
|
||||
})
|
||||
|
||||
return pd.DataFrame(summary_data)
|
||||
|
||||
|
||||
def aggregate_task_summary_by_major_code(summary_df: pd.DataFrame) -> pd.DataFrame:
|
||||
df_agg = summary_df.copy()
|
||||
df_agg['onetsoc_major_code'] = df_agg['onetsoc_code'].str[:2]
|
||||
|
||||
aggregation = {
|
||||
'wage_bill': 'sum',
|
||||
'count_not_remote': 'sum',
|
||||
'count_remote_estimable': 'sum',
|
||||
'count_remote_not_estimable': 'sum',
|
||||
'total_tasks': 'sum'
|
||||
}
|
||||
major_summary = df_agg.groupby('onetsoc_major_code').agg(aggregation).reset_index()
|
||||
|
||||
major_summary['occupation_label'] = major_summary['onetsoc_major_code'].map(OCCUPATION_MAJOR_CODES)
|
||||
|
||||
# Reorder columns to match original output format
|
||||
major_summary = major_summary[[
|
||||
'onetsoc_major_code',
|
||||
'occupation_label',
|
||||
'wage_bill',
|
||||
'count_not_remote',
|
||||
'count_remote_estimable',
|
||||
'count_remote_not_estimable',
|
||||
'total_tasks'
|
||||
]]
|
||||
|
||||
return major_summary
|
225
pipeline/classification.py
Normal file
225
pipeline/classification.py
Normal file
|
@ -0,0 +1,225 @@
|
|||
from pathlib import Path
|
||||
import pandas as pd
|
||||
from .logger import logger
|
||||
from .utils import enrich
|
||||
import json
|
||||
|
||||
ALLOWED_UNITS = [
|
||||
"minute",
|
||||
"hour",
|
||||
"day",
|
||||
"week",
|
||||
"month",
|
||||
"trimester",
|
||||
"semester",
|
||||
"year",
|
||||
]
|
||||
|
||||
ESTIMABLE_CLASSIFICATION_VERSION = "old_version"
|
||||
TIME_ESTIMATES_GENERATION_VERSION = "old_version"
|
||||
|
||||
def classify_tasks_as_estimable(cache_dir: Path, df_to_process: pd.DataFrame, bust: bool = False) -> pd.DataFrame:
|
||||
CACHE_PATH = cache_dir / f"task_estimability.{ESTIMABLE_CLASSIFICATION_VERSION}.parquet"
|
||||
if CACHE_PATH.exists() and not bust:
|
||||
logger.info(f"Loading cached task estimability from {CACHE_PATH}")
|
||||
return pd.read_parquet(CACHE_PATH)
|
||||
|
||||
logger.info("Enriching tasks with estimability classification.")
|
||||
|
||||
df_unique_tasks = df_to_process.drop_duplicates(subset=['task']).copy()
|
||||
|
||||
logger.info(f"Found {len(df_unique_tasks)} unique remote tasks to classify.")
|
||||
|
||||
if df_unique_tasks.empty:
|
||||
raise ValueError("No unique tasks to classify.")
|
||||
|
||||
results = enrich(
|
||||
model="gpt-4.1-mini",
|
||||
rpm=5000,
|
||||
messages_to_process=[
|
||||
[
|
||||
{"role": "system", "content": """
|
||||
Classify the provided O*NET task into one of these categories:
|
||||
- ATOMIC (schedulable): A single, clearly-bounded activity, typically lasting minutes, hours, or a few days.
|
||||
- ONGOING-CONSTRAINT (background role/ethical rule): A continuous responsibility or behavioural norm with no schedulable duration (e.g., “follow confidentiality rules,” “serve as department head”).
|
||||
""".strip()},
|
||||
{"role": "user", "content": f"Task: {row.task}"},
|
||||
]
|
||||
for row in df_unique_tasks.itertuples()
|
||||
],
|
||||
schema={
|
||||
"name": "estimability_classification",
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {"task_category": {"type": "string", "enum": ["ATOMIC", "ONGOING-CONSTRAINT"]}},
|
||||
"required": ["task_category"],
|
||||
"additionalProperties": False
|
||||
}
|
||||
},
|
||||
chunk_size=300,
|
||||
)
|
||||
|
||||
if not results or len(results) != len(df_unique_tasks):
|
||||
raise ValueError(f"Task estimability classification failed or returned mismatched number of results. Expected {len(df_unique_tasks)}, got {len(results) if results else 0}.")
|
||||
|
||||
classifications = []
|
||||
for index, response in enumerate(results):
|
||||
task_label = df_unique_tasks.iloc[index]['task']
|
||||
task_category_flag = None
|
||||
|
||||
if response is None:
|
||||
logger.warning(f"API call failed for task (enrich returned None): '{task_label}'")
|
||||
else:
|
||||
try:
|
||||
content_str = response.choices[0].message.content
|
||||
if not content_str:
|
||||
raise ValueError("No content found in the response message")
|
||||
|
||||
data = json.loads(content_str)
|
||||
|
||||
if 'task_category' in data and isinstance(data['task_category'], str):
|
||||
task_category_flag = data['task_category']
|
||||
else:
|
||||
logger.warning(f"Invalid or missing 'task_category' payload for task '{task_label}'. Data: '{data}'")
|
||||
except (json.JSONDecodeError, AttributeError, KeyError, IndexError, ValueError) as e:
|
||||
logger.warning(f"Could not parse response for task '{task_label}'. Error: {e}. Response: {response}")
|
||||
|
||||
classifications.append({
|
||||
'task': task_label,
|
||||
'estimable': task_category_flag == 'ATOMIC'
|
||||
})
|
||||
|
||||
classification_df = pd.DataFrame(classifications)
|
||||
|
||||
logger.info(f"Finished classification. Got {classification_df['estimable'].notna().sum()} successful classifications out of {len(df_unique_tasks)} unique tasks.")
|
||||
|
||||
logger.info(f"Saving task estimability classifications to {CACHE_PATH}")
|
||||
classification_df.to_parquet(CACHE_PATH)
|
||||
|
||||
return classification_df
|
||||
|
||||
|
||||
def generate_time_estimates_for_tasks(cache_dir: Path, df_to_process: pd.DataFrame, bust: bool = False) -> pd.DataFrame:
|
||||
CACHE_PATH = cache_dir / f"task_estimates.{TIME_ESTIMATES_GENERATION_VERSION}.parquet"
|
||||
if CACHE_PATH.exists() and not bust:
|
||||
logger.info(f"Loading cached task estimates from {CACHE_PATH}")
|
||||
return pd.read_parquet(CACHE_PATH)
|
||||
|
||||
logger.info("Enriching tasks with time estimates.")
|
||||
|
||||
if df_to_process.empty:
|
||||
raise ValueError("No tasks to process for estimates.")
|
||||
|
||||
results = enrich(
|
||||
model="gpt-4.1-mini",
|
||||
rpm=5000,
|
||||
messages_to_process=[
|
||||
[
|
||||
{
|
||||
"role": "system",
|
||||
"content": """
|
||||
You are an expert assistant evaluating the time required for job tasks. Your goal is to estimate the 'effective time' range needed for a skilled human to complete the following job task **remotely**, without supervision
|
||||
|
||||
'Effective time' is the active, focused work duration required to complete the task. Crucially, **exclude all waiting periods, delays, or time spent on other unrelated activities**. Think of it as the continuous, productive time investment needed if the worker could pause and resume instantly without cost.
|
||||
|
||||
Provide a lower and upper bound estimate for the 'effective time'. These bounds should capture the time within which approximately 80% of instances of performing this specific task are typically completed by a qualified individual.
|
||||
|
||||
Base your estimate on the provided task and the associated occupation and occupation description. Your estimate must be in one the allowed units: minute, hour, day, week, month, trimester, semester, year.""".strip()
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": f"{row.task} done by {row.occupation_title} ({row.occupation_description})"
|
||||
}
|
||||
]
|
||||
for row in df_to_process.itertuples()
|
||||
],
|
||||
schema= {
|
||||
"name": "estimate_time",
|
||||
"strict": True,
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"lower_bound_estimate": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"quantity": {
|
||||
"type": "number",
|
||||
"description": "The numerical value for the lower bound of the estimate.",
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"enum": ALLOWED_UNITS,
|
||||
"description": "The unit of time for the lower bound.",
|
||||
},
|
||||
},
|
||||
"required": ["quantity", "unit"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
"upper_bound_estimate": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"quantity": {
|
||||
"type": "number",
|
||||
"description": "The numerical value for the upper bound of the estimate.",
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"enum": ALLOWED_UNITS,
|
||||
"description": "The unit of time for the upper bound.",
|
||||
},
|
||||
},
|
||||
"required": ["quantity", "unit"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
},
|
||||
"required": ["lower_bound_estimate", "upper_bound_estimate"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
},
|
||||
chunk_size=200,
|
||||
)
|
||||
|
||||
if not results or len(results) != len(df_to_process):
|
||||
raise ValueError(f"API call for task estimates failed or returned mismatched number of results. "
|
||||
f"Expected {len(df_to_process)}, got {len(results) if results else 0}.")
|
||||
|
||||
estimates = []
|
||||
for index, response in enumerate(results):
|
||||
row = df_to_process.iloc[index]
|
||||
task_info = f"O*NET: {row.onetsoc_code}, Task ID: {row.task_id}"
|
||||
lb_qty, lb_unit, ub_qty, ub_unit = None, None, None, None
|
||||
|
||||
if response is None:
|
||||
logger.warning(f"API call failed for task (enrich returned None): {task_info}")
|
||||
else:
|
||||
try:
|
||||
content_str = response.choices[0].message.content
|
||||
if not content_str:
|
||||
raise ValueError("No content found in the response message")
|
||||
|
||||
data = json.loads(content_str)
|
||||
|
||||
lb_qty = data['lower_bound_estimate']['quantity']
|
||||
lb_unit = data['lower_bound_estimate']['unit']
|
||||
ub_qty = data['upper_bound_estimate']['quantity']
|
||||
ub_unit = data['upper_bound_estimate']['unit']
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not parse valid estimate for task {task_info}. Error: {e}. Response: {response}")
|
||||
lb_qty, lb_unit, ub_qty, ub_unit = None, None, None, None # Reset on failure
|
||||
|
||||
estimates.append({
|
||||
'onetsoc_code': row.onetsoc_code,
|
||||
'task_id': row.task_id,
|
||||
'lb_estimate_qty': lb_qty,
|
||||
'lb_estimate_unit': lb_unit,
|
||||
'ub_estimate_qty': ub_qty,
|
||||
'ub_estimate_unit': ub_unit
|
||||
})
|
||||
|
||||
estimates_df = pd.DataFrame(estimates)
|
||||
logger.info(f"Finished estimates. Got {estimates_df['lb_estimate_qty'].notna().sum()} successful estimates out of {len(df_to_process)} tasks.")
|
||||
|
||||
logger.info(f"Saving task estimates to {CACHE_PATH}")
|
||||
estimates_df.to_parquet(CACHE_PATH)
|
||||
|
||||
return estimates_df
|
|
@ -1,35 +0,0 @@
|
|||
OCCUPATION_MAJOR_CODES = {
|
||||
'11': 'Management',
|
||||
'13': 'Business & Financial',
|
||||
'15': 'Computer & Mathematical',
|
||||
'17': 'Architecture & Engineering',
|
||||
'19': 'Life, Physical, & Social Science',
|
||||
'21': 'Community & Social Service',
|
||||
'23': 'Legal',
|
||||
'25': 'Education, Training, & Library',
|
||||
'27': 'Arts, Design, & Media',
|
||||
'29': 'Healthcare Practitioners',
|
||||
'31': 'Healthcare Support',
|
||||
'33': 'Protective Service',
|
||||
'35': 'Food Preparation & Serving',
|
||||
'37': 'Building & Grounds Maintenance',
|
||||
'39': 'Personal Care & Service',
|
||||
'41': 'Sales & Related',
|
||||
'43': 'Office & Admin Support',
|
||||
'45': 'Farming, Fishing, & Forestry',
|
||||
'47': 'Construction & Extraction',
|
||||
'49': 'Installation, Maintenance, & Repair',
|
||||
'51': 'Production',
|
||||
'53': 'Transportation & Material Moving',
|
||||
'55': 'Military Specific',
|
||||
}
|
||||
|
||||
GRAY = {'50':'#f8fafc','100':'#f1f5f9','200':'#e2e8f0',
|
||||
'300':'#cbd5e1','400':'#94a3b8','500':'#64748b',
|
||||
'600':'#475569','700':'#334155','800':'#1e293b',
|
||||
'900':'#0f172a','950':'#020617'}
|
||||
|
||||
LIME = {'50': '#f7fee7','100': '#ecfcca','200': '#d8f999',
|
||||
'300': '#bbf451','400': '#9ae600','500': '#83cd00',
|
||||
'600': '#64a400','700': '#497d00','800': '#3c6300',
|
||||
'900': '#35530e','950': '#192e03'}
|
|
@ -1,97 +0,0 @@
|
|||
"""
|
||||
This module enriches data, they take time to run, and are usually expensive (API calls...),
|
||||
they should manage their own state, and only be run if the data's version is different than
|
||||
their save.
|
||||
"""
|
||||
from .run import Run
|
||||
import pandas as pd
|
||||
from typing import Any, List, Dict
|
||||
import litellm
|
||||
|
||||
def enrich(
|
||||
model: str,
|
||||
rpm: int,
|
||||
messages_to_process: List[List[Dict[str, str]]],
|
||||
schema: Dict[str, Any],
|
||||
chunk_size: int = 100,
|
||||
):
|
||||
# Use litellm.batch_completion
|
||||
pass
|
||||
|
||||
def enrich_with_task_estimateability(run: Run) -> pd.DataFrame:
|
||||
output_path = run.cache_dir / "computed_task_estimateability.parquet"
|
||||
if output_path.exists():
|
||||
print(f"Loading cached task estimateability from {output_path}")
|
||||
return pd.read_parquet(output_path)
|
||||
|
||||
df_remote_tasks = run.df_tasks[run.df_tasks['remote_status'] == 'remote'].copy()
|
||||
|
||||
# In the old script, we only passed unique tasks to the API
|
||||
df_unique_tasks = df_remote_tasks.drop_duplicates(subset=['task'])
|
||||
|
||||
|
||||
results = enrich(
|
||||
model="gpt-4.1-mini",
|
||||
rpm=5000,
|
||||
messages_to_process=[
|
||||
[
|
||||
{"role": "system", "content": """
|
||||
Judge whether the provided O*NET task is suitable for a time estimate. If it is a single, clearly-bounded activity, typically lasting minutes, hours, or a few days, then clearly yes. If it is a continuous responsibility or behavioural norm with no schedulable duration (e.g., “follow confidentiality rules,” “serve as department head”), then clearly no.
|
||||
"""},
|
||||
{"role": "user", "content": f"Task: {row.task}"},
|
||||
]
|
||||
for row in df_unique_tasks.itertuples()
|
||||
],
|
||||
schema={
|
||||
"type": "object",
|
||||
"properties": {"estimateable": {"type": "bool"}},
|
||||
"required": ["estimateable"]
|
||||
},
|
||||
chunk_size=300,
|
||||
)
|
||||
|
||||
# Create a new dataframe with just enough information to identify the task uniquely + estimateability classification, save it, return it. Careful: the "task" column in itself is not unique.
|
||||
return pd.DataFrame()
|
||||
|
||||
def enrich_with_task_estimates(run: Run) -> pd.DataFrame:
|
||||
output_path = run.cache_dir / "computed_task_estimates.parquet"
|
||||
if output_path.exists():
|
||||
print(f"Loading cached task estimates from {output_path}")
|
||||
return pd.read_parquet(output_path)
|
||||
|
||||
df = ... # todo
|
||||
|
||||
results = enrich(
|
||||
model="gpt-4.1-mini",
|
||||
rpm=5000,
|
||||
messages_to_process=[
|
||||
[
|
||||
{"role": "system", "content": "Estimate the time required to complete the following O*NET task. Your estimate should be a plausible range for how long it might take a typical, qualified worker to perform this task once. Provide your answer as a time range (lower and upper bounds). Do not provide explanations or apologies. If the task is not suitable for a time estimate (e.g., it is an ongoing responsibility), interpret it as a single, schedulable action."},
|
||||
{"role": "user", "content": f"""
|
||||
Task: {row.task}
|
||||
For Occupation: {row.occupation_title}
|
||||
Occupation Description: {row.occupation_description}"""}
|
||||
]
|
||||
for row in df.itertuples()
|
||||
],
|
||||
schema={
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"lower_bound_estimate": {
|
||||
"type": "object",
|
||||
"properties": {"quantity": {"type": "number"}, "unit": {"type": "string", "enum": ["minutes", "hours", "days"]}},
|
||||
"required": ["quantity", "unit"],
|
||||
},
|
||||
"upper_bound_estimate": {
|
||||
"type": "object",
|
||||
"properties": {"quantity": {"type": "number"}, "unit": {"type": "string", "enum": ["minutes", "hours", "days"]}},
|
||||
"required": ["quantity", "unit"],
|
||||
},
|
||||
},
|
||||
"required": ["lower_bound_estimate", "upper_bound_estimate"],
|
||||
},
|
||||
chunk_size=200,
|
||||
)
|
||||
|
||||
# Create a new dataframe with just enough information to identify the task uniquely + the estimates classification, save it, return it. Careful: the "task" column in itself is not unique.
|
||||
raise NotImplementedError
|
|
@ -1,50 +1,30 @@
|
|||
"""
|
||||
Fetchers retrieve remote data and return it in a format suitable for further processing, they also return its version, which should be considered opaque, though it is usually a checksum.
|
||||
"""
|
||||
|
||||
import sqlite3
|
||||
from typing import Tuple
|
||||
import pandas as pd
|
||||
import requests
|
||||
import io
|
||||
import zipfile
|
||||
from pipeline.run import Run
|
||||
from pipeline.logger import logger
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
from .logger import logger
|
||||
from typing import Tuple, Dict
|
||||
|
||||
def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
|
||||
"""
|
||||
Downloads the O*NET database, creates a local SQLite file from it, and returns a connection.
|
||||
"""
|
||||
version = "29_1"
|
||||
url = f"https://www.onetcenter.org/dl_files/database/db_{version}_mysql.zip"
|
||||
db_path = run.cache_dir / f"onet_{version}.db"
|
||||
run.meta.fetchers['onet'] = {
|
||||
'url': url,
|
||||
'version': version,
|
||||
'db_path': str(db_path),
|
||||
}
|
||||
ONET_VERSION = "29_1"
|
||||
ONET_URL = f"https://www.onetcenter.org/dl_files/database/db_{ONET_VERSION}_mysql.zip"
|
||||
|
||||
if db_path.exists():
|
||||
logger.info(f"Using cached O*NET database: {db_path}")
|
||||
conn = sqlite3.connect(db_path)
|
||||
return conn, version
|
||||
def fetch_onet_database(cache_dir: Path) -> sqlite3.Connection:
|
||||
DB_PATH = cache_dir / f"onet_{ONET_VERSION}.db"
|
||||
|
||||
logger.info(f"Downloading O*NET database from {url}")
|
||||
response = requests.get(url, stream=True, headers={
|
||||
if DB_PATH.exists():
|
||||
logger.info(f"Using cached O*NET database: {DB_PATH}")
|
||||
return sqlite3.connect(DB_PATH)
|
||||
|
||||
logger.info(f"Downloading O*NET database from {ONET_URL}")
|
||||
response = requests.get(ONET_URL, stream=True, headers={
|
||||
"User-Agent": "econ-agent/1.0"
|
||||
})
|
||||
response.raise_for_status()
|
||||
|
||||
# Read content into memory
|
||||
zip_content = response.content
|
||||
|
||||
db_path = run.cache_dir / f"onet_{version}.db"
|
||||
|
||||
logger.info(f"Creating new O*NET database: {db_path}")
|
||||
conn = sqlite3.connect(db_path)
|
||||
|
||||
# Set performance PRAGMAs for fast import
|
||||
logger.info("Creating new SQLite database with performance settings")
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
conn.executescript("""
|
||||
PRAGMA journal_mode = OFF;
|
||||
PRAGMA synchronous = 0;
|
||||
|
@ -54,6 +34,7 @@ def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
|
|||
PRAGMA foreign_keys = ON;
|
||||
""")
|
||||
|
||||
zip_content = response.content
|
||||
with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
|
||||
sql_scripts = []
|
||||
for filename in sorted(z.namelist()):
|
||||
|
@ -63,14 +44,10 @@ def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
|
|||
if not sql_scripts:
|
||||
raise RuntimeError("No SQL files found in the O*NET zip archive.")
|
||||
|
||||
# Combine and execute all SQL files in one transaction
|
||||
full_script = "BEGIN TRANSACTION;\n" + "\n".join(sql_scripts) + "\nCOMMIT;"
|
||||
|
||||
logger.info("Executing SQL files in alphabetical order (single transaction mode)")
|
||||
full_script = "BEGIN TRANSACTION;\n" + "\n".join(sql_scripts) + "\nCOMMIT;"
|
||||
conn.executescript(full_script)
|
||||
logger.info("Database populated successfully. Restoring reliability settings...")
|
||||
|
||||
# Restore reliability-focused settings after import
|
||||
conn.executescript("""
|
||||
PRAGMA journal_mode = WAL;
|
||||
PRAGMA synchronous = NORMAL;
|
||||
|
@ -81,87 +58,75 @@ def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
|
|||
""")
|
||||
conn.execute("VACUUM;")
|
||||
conn.commit()
|
||||
logger.info("Reliability settings restored and database optimized successfully!")
|
||||
|
||||
return conn, version
|
||||
return conn
|
||||
|
||||
def fetch_oesm_data(run: Run) -> Tuple[pd.DataFrame, str]:
|
||||
"""
|
||||
Downloads the OESM national data from the BLS website.
|
||||
"""
|
||||
version = "23"
|
||||
url = f"https://www.bls.gov/oes/special-requests/oesm{version}nat.zip"
|
||||
parquet_path = run.cache_dir / "oesm.parquet"
|
||||
run.meta.fetchers['oesm'] = {
|
||||
'url': url,
|
||||
'version': version,
|
||||
'parquet_path': str(parquet_path),
|
||||
}
|
||||
def fetch_oesm_data(cache_dir: Path) -> pd.DataFrame:
|
||||
VERSION = "23"
|
||||
URL = f"https://www.bls.gov/oes/special-requests/oesm{VERSION}nat.zip"
|
||||
DATA_PATH = cache_dir / "oesm.parquet"
|
||||
|
||||
if parquet_path.exists():
|
||||
logger.info(f"Using cached OESM data: {parquet_path}")
|
||||
return pd.read_parquet(parquet_path), version
|
||||
if DATA_PATH.exists():
|
||||
logger.info(f"Using cached OESM data: {DATA_PATH}")
|
||||
return pd.read_parquet(DATA_PATH)
|
||||
|
||||
logger.info(f"Downloading OESM data from {url}")
|
||||
logger.info(f"Downloading OESM data from {URL}")
|
||||
headers = {'User-Agent': 'econ-agent/1.0'}
|
||||
response = requests.get(url, headers=headers)
|
||||
response = requests.get(URL, headers=headers)
|
||||
response.raise_for_status()
|
||||
|
||||
zip_content = response.content
|
||||
logger.info(f"OESM data version: {version}")
|
||||
|
||||
logger.info(f"Creating new OESM data cache: {parquet_path}")
|
||||
logger.info(f"Creating new OESM data cache: {DATA_PATH}")
|
||||
with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
|
||||
# Find the excel file in the zip
|
||||
excel_filename = None
|
||||
for filename in z.namelist():
|
||||
logger.debug(f"Found file in OESM zip: {filename}")
|
||||
if filename.lower().endswith(".xlsx"):
|
||||
excel_filename = filename
|
||||
break
|
||||
|
||||
if excel_filename is None:
|
||||
raise FileNotFoundError("Could not find the Excel file in the OESM zip archive.")
|
||||
|
||||
logger.info(f"Reading {excel_filename} from zip archive.")
|
||||
with z.open(excel_filename) as f:
|
||||
with z.open(f"oesm{VERSION}national.xlsx") as f:
|
||||
df = pd.read_excel(f, engine='openpyxl', na_values=['*', '#'])
|
||||
|
||||
df.to_parquet(parquet_path)
|
||||
logger.info(f"Saved OESM data to cache: {parquet_path}")
|
||||
return df, version
|
||||
df.to_parquet(DATA_PATH)
|
||||
logger.info(f"Saved OESM data to cache: {DATA_PATH}")
|
||||
return df
|
||||
|
||||
def fetch_epoch_remote_data(run: Run) -> Tuple[pd.DataFrame, str]:
|
||||
"""
|
||||
Downloads the EPOCH AI remote work task data.
|
||||
"""
|
||||
# This is the direct download link constructed from the Google Drive share link
|
||||
version = "latest"
|
||||
url = "https://drive.google.com/uc?export=download&id=1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r"
|
||||
parquet_path = run.cache_dir / f"epoch_remote_{version}.parquet"
|
||||
run.meta.fetchers['epoch_remote'] = {
|
||||
'url': url,
|
||||
'version': version,
|
||||
'parquet_path': str(parquet_path),
|
||||
}
|
||||
def fetch_epoch_remote_data(cache_dir: Path) -> pd.DataFrame:
|
||||
URL = "https://drive.google.com/uc?export=download&id=1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r"
|
||||
DATA_PATH = cache_dir / f"epoch_remote_latest.parquet"
|
||||
|
||||
if parquet_path.exists():
|
||||
logger.info(f"Using cached EPOCH remote data: {parquet_path}")
|
||||
return pd.read_parquet(parquet_path), version
|
||||
if DATA_PATH.exists():
|
||||
logger.info(f"Using cached EPOCH remote data: {DATA_PATH}")
|
||||
return pd.read_parquet(DATA_PATH)
|
||||
|
||||
logger.info(f"Downloading EPOCH remote data from Google Drive: {url}")
|
||||
logger.info(f"Downloading EPOCH remote data from Google Drive: {URL}")
|
||||
|
||||
# Need to handle potential cookies/redirects from Google Drive
|
||||
session = requests.Session()
|
||||
session.headers.update({"User-Agent": "econ-agent/1.0"})
|
||||
response = session.get(url, stream=True)
|
||||
response = session.get(URL, stream=True)
|
||||
response.raise_for_status()
|
||||
|
||||
csv_content = response.content
|
||||
|
||||
logger.info(f"Creating new EPOCH remote data cache: {parquet_path}")
|
||||
logger.info(f"Creating new EPOCH remote data cache: {DATA_PATH}")
|
||||
df = pd.read_csv(io.BytesIO(csv_content))
|
||||
df.to_parquet(parquet_path)
|
||||
logger.info(f"Saved EPOCH remote data to cache: {parquet_path}")
|
||||
df.to_parquet(DATA_PATH)
|
||||
|
||||
return df, version
|
||||
return df
|
||||
|
||||
def fetch_metr_data(cache_dir: Path) -> Dict:
|
||||
URL = "https://metr.org/assets/benchmark_results.yaml"
|
||||
DATA_PATH = cache_dir / "metr_benchmark_results.yaml"
|
||||
|
||||
if DATA_PATH.exists():
|
||||
logger.info(f"Using cached METR data: {DATA_PATH}")
|
||||
with open(DATA_PATH, "r") as f:
|
||||
return yaml.safe_load(f)
|
||||
|
||||
logger.info(f"Downloading METR data from {URL}")
|
||||
headers = {"User-Agent": "econ-agent/1.0"}
|
||||
response = requests.get(URL, headers=headers)
|
||||
response.raise_for_status()
|
||||
|
||||
yaml_content = response.content
|
||||
|
||||
logger.info(f"Creating new METR data cache: {DATA_PATH}")
|
||||
with open(DATA_PATH, "wb") as f:
|
||||
f.write(yaml_content)
|
||||
|
||||
return yaml.safe_load(yaml_content)
|
||||
|
|
|
@ -1,5 +1,15 @@
|
|||
from .estimate_histplot import generate_estimate_histplot
|
||||
from .estimates_spread_per_occupation import generate_estimate_spread_per_occupation
|
||||
from .estimates_lower_vs_upper_scatter import generate_estimates_lower_vs_upper_scatter
|
||||
from .sequential_coherence_cdf import plot_sequential_coherence_cdf
|
||||
from .projected_automatable_wage_bill import generate_projected_automatable_wage_bill
|
||||
from .projected_task_automation import generate_projected_task_automation_plot
|
||||
|
||||
GENERATORS = [
|
||||
generate_estimate_histplot
|
||||
generate_estimate_histplot,
|
||||
generate_estimate_spread_per_occupation,
|
||||
generate_estimates_lower_vs_upper_scatter,
|
||||
#plot_sequential_coherence_cdf,
|
||||
generate_projected_automatable_wage_bill,
|
||||
generate_projected_task_automation_plot,
|
||||
]
|
||||
|
|
|
@ -1,6 +1,32 @@
|
|||
from ..run import Run
|
||||
from pathlib import Path
|
||||
from typing import Generator
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
import pandas as pd
|
||||
from ..utils import style_plot
|
||||
|
||||
def generate_estimate_histplot(run: Run) -> Generator[Path]:
|
||||
raise NotImplementedError
|
||||
def generate_estimate_histplot(output_dir: Path, df: pd.DataFrame, **kwargs) -> Generator[Path]:
|
||||
"""
|
||||
Generates a styled histogram of the distribution of midpoint time estimates.
|
||||
"""
|
||||
style_plot()
|
||||
OUTPUT_PATH = output_dir / "estimate_distribution_histplot.png"
|
||||
|
||||
fig, ax = plt.subplots()
|
||||
|
||||
sns.histplot(
|
||||
data=df,
|
||||
x='estimate_midpoint',
|
||||
log_scale=True,
|
||||
ax=ax
|
||||
)
|
||||
|
||||
ax.set_xlabel("Task Time (minutes, log scale)")
|
||||
ax.set_ylabel("Number of Tasks")
|
||||
ax.set_title("Distribution of Time Estimates for Atomic Tasks")
|
||||
|
||||
plt.tight_layout()
|
||||
plt.savefig(OUTPUT_PATH)
|
||||
plt.close(fig)
|
||||
|
||||
yield OUTPUT_PATH
|
||||
|
|
56
pipeline/generators/estimates_lower_vs_upper_scatter.py
Normal file
56
pipeline/generators/estimates_lower_vs_upper_scatter.py
Normal file
|
@ -0,0 +1,56 @@
|
|||
from pathlib import Path
|
||||
from typing import Generator
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
import pandas as pd
|
||||
from ..utils import OCCUPATION_MAJOR_CODES, style_plot
|
||||
|
||||
|
||||
def generate_estimates_lower_vs_upper_scatter(output_dir: Path, df: pd.DataFrame, **kwargs) -> Generator[Path]:
|
||||
"""
|
||||
Generates a styled scatter plot of lower-bound vs upper-bound time estimates for tasks.
|
||||
"""
|
||||
style_plot()
|
||||
OUTPUT_PATH = output_dir / "estimates_lower_vs_upper_scatter.png"
|
||||
|
||||
plot_df = df.copy()
|
||||
# Replace onetsoc_major codes with their corresponding labels for the plot legend
|
||||
plot_df['onetsoc_major'] = plot_df['onetsoc_major'].map(OCCUPATION_MAJOR_CODES)
|
||||
|
||||
fig, ax = plt.subplots(figsize=(12, 10))
|
||||
sns.scatterplot(
|
||||
data=plot_df,
|
||||
x='lb_estimate_in_minutes',
|
||||
y='ub_estimate_in_minutes',
|
||||
alpha=0.3,
|
||||
edgecolor=None,
|
||||
hue="onetsoc_major",
|
||||
ax=ax
|
||||
)
|
||||
|
||||
# 45° reference line (y=x)
|
||||
lims = (
|
||||
min(df['lb_estimate_in_minutes'].min(), df['ub_estimate_in_minutes'].min()),
|
||||
max(df['lb_estimate_in_minutes'].max(), df['ub_estimate_in_minutes'].max())
|
||||
)
|
||||
lims = (lims[0] * 0.9, lims[1] * 1.1)
|
||||
ax.plot(lims, lims, color='black', linestyle='--', linewidth=1, zorder=0)
|
||||
|
||||
# Optional helper lines for ratios
|
||||
for k in [2, 10, 100]:
|
||||
ax.plot(lims, [k*l for l in lims],
|
||||
linestyle=':', color='grey', linewidth=1, zorder=0)
|
||||
|
||||
ax.set_xscale('log')
|
||||
ax.set_yscale('log')
|
||||
ax.set_xlabel('Lower-bound (min, log scale)')
|
||||
ax.set_ylabel('Upper-bound (min, log scale)')
|
||||
ax.set_title('Lower vs Upper Estimates for All Tasks')
|
||||
|
||||
ax.legend(title="Occupation Major Group", bbox_to_anchor=(1.02, 1), loc='upper left')
|
||||
|
||||
plt.tight_layout()
|
||||
plt.savefig(OUTPUT_PATH, bbox_inches='tight')
|
||||
plt.close(fig)
|
||||
|
||||
yield OUTPUT_PATH
|
39
pipeline/generators/estimates_spread_per_occupation.py
Normal file
39
pipeline/generators/estimates_spread_per_occupation.py
Normal file
|
@ -0,0 +1,39 @@
|
|||
from pathlib import Path
|
||||
from typing import Generator
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
import pandas as pd
|
||||
from ..utils import OCCUPATION_MAJOR_CODES, style_plot
|
||||
|
||||
|
||||
def generate_estimate_spread_per_occupation(output_dir: Path, df: pd.DataFrame, **kwargs) -> Generator[Path]:
|
||||
"""
|
||||
Generates a styled boxplot of the estimate range spread per major occupation group.
|
||||
"""
|
||||
style_plot()
|
||||
OUTPUT_PATH = output_dir / "estimates_spread_per_occupation.png"
|
||||
|
||||
fig, ax = plt.subplots(figsize=(10, 12))
|
||||
|
||||
sns.boxplot(
|
||||
data=df,
|
||||
x='onetsoc_major',
|
||||
y='estimate_range',
|
||||
showfliers=False,
|
||||
ax=ax
|
||||
)
|
||||
|
||||
ax.set_yscale('log')
|
||||
ax.set_xlabel('Occupation')
|
||||
ax.set_ylabel('Range (upper-lower, minutes)')
|
||||
ax.set_title('Spread of time-range estimates per occupation')
|
||||
|
||||
# Get occupation labels from codes for x-axis ticks
|
||||
labels = [OCCUPATION_MAJOR_CODES.get(code.get_text(), code.get_text()) for code in ax.get_xticklabels()]
|
||||
ax.set_xticklabels(labels, rotation=60, ha='right')
|
||||
|
||||
plt.tight_layout()
|
||||
plt.savefig(OUTPUT_PATH)
|
||||
plt.close(fig)
|
||||
|
||||
yield OUTPUT_PATH
|
|
@ -1,6 +0,0 @@
|
|||
import pandas as pd
|
||||
from typings import List
|
||||
|
||||
def must_have_columns(df: pd.DataFrame, columns: List[str]):
|
||||
if not all(col in df.columns for col in columns):
|
||||
raise ValueError(f"DataFrame is missing required columns: {columns}")
|
229
pipeline/generators/projected_automatable_wage_bill.py
Normal file
229
pipeline/generators/projected_automatable_wage_bill.py
Normal file
|
@ -0,0 +1,229 @@
|
|||
from pathlib import Path
|
||||
from typing import Generator, Dict, Tuple, Optional
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib.ticker as mticker
|
||||
from scipy.stats import linregress
|
||||
from datetime import datetime
|
||||
from ..utils import style_plot, LIME
|
||||
|
||||
def _generate_wage_projection_data(
|
||||
metr_results: Dict,
|
||||
df_with_wages: pd.DataFrame,
|
||||
percentile_key: str,
|
||||
doubling_time_modifier: float,
|
||||
) -> Optional[Tuple[pd.DataFrame, pd.DataFrame, float]]:
|
||||
"""
|
||||
Generates wage projection data for different AI progress scenarios.
|
||||
|
||||
Args:
|
||||
metr_results: The METR benchmark data.
|
||||
df_with_wages: DataFrame containing tasks with their estimated wage value.
|
||||
percentile_key: The percentile to use from METR data (e.g., 'p50_horizon_length').
|
||||
doubling_time_modifier: Multiplier for the doubling time (e.g., 1.0 for baseline,
|
||||
0.5 for optimistic, 2.0 for pessimistic).
|
||||
|
||||
Returns:
|
||||
A tuple of (metr_df, projection_df, doubling_time_days), or None if data is insufficient.
|
||||
"""
|
||||
all_model_data = []
|
||||
for model_name, data in metr_results.get("results", {}).items():
|
||||
for agent_name, agent_data in data.get("agents", {}).items():
|
||||
release_date_str = data.get("release_date")
|
||||
horizon = agent_data.get(percentile_key, {}).get("estimate")
|
||||
if release_date_str and horizon is not None:
|
||||
all_model_data.append({
|
||||
"release_date": release_date_str,
|
||||
"horizon_minutes": horizon,
|
||||
})
|
||||
|
||||
if not all_model_data:
|
||||
return None
|
||||
|
||||
metr_df = pd.DataFrame(all_model_data).sort_values("release_date").reset_index(drop=True)
|
||||
metr_df['release_date'] = pd.to_datetime(metr_df['release_date'])
|
||||
metr_df = metr_df[metr_df['horizon_minutes'] > 0].copy()
|
||||
|
||||
if len(metr_df) < 2:
|
||||
return None
|
||||
|
||||
metr_df['days_since_start'] = (metr_df['release_date'] - metr_df['release_date'].min()).dt.days
|
||||
log_y = np.log(metr_df['horizon_minutes'])
|
||||
slope, intercept, r_value, _, _ = linregress(metr_df['days_since_start'], log_y)
|
||||
|
||||
# Apply the scenario modifier to the doubling time
|
||||
base_doubling_time_days = np.log(2) / slope
|
||||
modified_doubling_time_days = base_doubling_time_days * doubling_time_modifier
|
||||
modified_slope = np.log(2) / modified_doubling_time_days
|
||||
|
||||
start_date = metr_df['release_date'].min()
|
||||
future_dates = pd.to_datetime(pd.date_range(start=start_date, end="2035-01-01", freq="ME"))
|
||||
future_days = (future_dates - start_date).days.to_numpy()
|
||||
|
||||
projected_log_horizon = intercept + modified_slope * future_days
|
||||
projected_horizon_minutes = np.exp(projected_log_horizon)
|
||||
|
||||
projection_df = pd.DataFrame({
|
||||
"date": future_dates,
|
||||
"projected_coherence_minutes": projected_horizon_minutes,
|
||||
})
|
||||
|
||||
# Calculate the total wage bill of tasks automated over time
|
||||
for bound in ["lb", "mid", "ub"]:
|
||||
col_name = 'estimate_midpoint' if bound == 'mid' else f'{bound}_estimate_in_minutes'
|
||||
projection_df[f"automatable_wage_bill_{bound}"] = projection_df["projected_coherence_minutes"].apply(
|
||||
lambda h: df_with_wages.loc[df_with_wages[col_name] <= h, 'wage_per_task'].sum()
|
||||
)
|
||||
|
||||
# Also calculate for the actual METR data points for plotting
|
||||
metr_df["automatable_wage_bill_mid"] = metr_df["horizon_minutes"].apply(
|
||||
lambda h: df_with_wages.loc[df_with_wages['estimate_midpoint'] <= h, 'wage_per_task'].sum()
|
||||
)
|
||||
|
||||
return metr_df, projection_df, modified_doubling_time_days
|
||||
|
||||
|
||||
def _plot_scenario(ax, projection_df, metr_df, label, color, line_style='-'):
|
||||
"""Helper function to draw a single projection scenario on a given axis."""
|
||||
# Plot the projected wage bill
|
||||
ax.plot(
|
||||
projection_df["date"],
|
||||
projection_df["automatable_wage_bill_mid"],
|
||||
label=label,
|
||||
color=color,
|
||||
linewidth=2.5,
|
||||
linestyle=line_style,
|
||||
zorder=3
|
||||
)
|
||||
# Plot the shaded range for lower/upper bounds
|
||||
ax.fill_between(
|
||||
projection_df["date"],
|
||||
projection_df["automatable_wage_bill_lb"],
|
||||
projection_df["automatable_wage_bill_ub"],
|
||||
color=color,
|
||||
alpha=0.15,
|
||||
zorder=2
|
||||
)
|
||||
# Plot the actual METR data points against the wage bill
|
||||
ax.scatter(
|
||||
metr_df['release_date'],
|
||||
metr_df['automatable_wage_bill_mid'],
|
||||
color=color,
|
||||
edgecolor='black',
|
||||
s=60,
|
||||
zorder=4,
|
||||
label=f"Model Capabilities (P50)"
|
||||
)
|
||||
|
||||
|
||||
def generate_projected_automatable_wage_bill(
|
||||
output_dir: Path,
|
||||
df: pd.DataFrame,
|
||||
task_summary_by_occupation_df: pd.DataFrame,
|
||||
metr_results: Dict,
|
||||
**kwargs,
|
||||
) -> Generator[Path, None, None]:
|
||||
"""
|
||||
Generates a plot projecting the automatable wage bill under different
|
||||
AI progress scenarios (optimistic, baseline, pessimistic).
|
||||
"""
|
||||
style_plot()
|
||||
OUTPUT_PATH = output_dir / "projected_automatable_wage_bill_sensitivity.png"
|
||||
|
||||
# 1. Calculate wage_per_task for each occupation
|
||||
wage_bill_info = task_summary_by_occupation_df[['onetsoc_code', 'wage_bill', 'total_tasks']].copy()
|
||||
wage_bill_info['wage_per_task'] = wage_bill_info['wage_bill'] / wage_bill_info['total_tasks']
|
||||
wage_bill_info.replace([np.inf, -np.inf], 0, inplace=True) # Avoid division by zero issues
|
||||
wage_bill_info.drop(columns=['wage_bill', 'total_tasks'], inplace=True)
|
||||
|
||||
# 2. Merge wage_per_task into the main task dataframe
|
||||
df_with_wages = pd.merge(df, wage_bill_info, on='onetsoc_code', how='left')
|
||||
df_with_wages['wage_per_task'].fillna(0, inplace=True)
|
||||
|
||||
# 3. Generate data for all three scenarios
|
||||
scenarios = {
|
||||
"Optimistic": {"modifier": 0.5, "color": "tab:green", "style": "--"},
|
||||
"Baseline": {"modifier": 1.0, "color": LIME['600'], "style": "-"},
|
||||
"Pessimistic": {"modifier": 2.0, "color": "tab:red", "style": ":"},
|
||||
}
|
||||
|
||||
projection_results = {}
|
||||
for name, config in scenarios.items():
|
||||
result = _generate_wage_projection_data(metr_results, df_with_wages, 'p50_horizon_length', config['modifier'])
|
||||
if result:
|
||||
projection_results[name] = result
|
||||
|
||||
if not projection_results:
|
||||
print("Warning: Could not generate any projection data. Skipping wage bill plot.")
|
||||
return
|
||||
|
||||
# 4. Create the plot
|
||||
fig, ax = plt.subplots(figsize=(14, 9))
|
||||
|
||||
# We only need to plot the scatter points once, let's use the baseline ones.
|
||||
if "Baseline" in projection_results:
|
||||
metr_df, _, _ = projection_results["Baseline"]
|
||||
ax.scatter(
|
||||
metr_df['release_date'],
|
||||
metr_df['automatable_wage_bill_mid'],
|
||||
color='black',
|
||||
s=80,
|
||||
zorder=5,
|
||||
label=f"Model Capabilities (P50)"
|
||||
)
|
||||
|
||||
|
||||
legend_lines = []
|
||||
for name, (metr_df, proj_df, doubling_time) in projection_results.items():
|
||||
config = scenarios[name]
|
||||
ax.plot(
|
||||
proj_df["date"],
|
||||
proj_df["automatable_wage_bill_mid"],
|
||||
color=config['color'],
|
||||
linestyle=config['style'],
|
||||
linewidth=2.5,
|
||||
zorder=3
|
||||
)
|
||||
ax.fill_between(
|
||||
proj_df["date"],
|
||||
proj_df["automatable_wage_bill_lb"],
|
||||
proj_df["automatable_wage_bill_ub"],
|
||||
color=config['color'],
|
||||
alpha=0.15,
|
||||
zorder=2
|
||||
)
|
||||
# Create a custom line for the legend
|
||||
line = plt.Line2D([0], [0], color=config['color'], linestyle=config['style'], lw=2.5,
|
||||
label=f'{name} (Doubling Time: {doubling_time:.0f} days)')
|
||||
legend_lines.append(line)
|
||||
|
||||
|
||||
# 5. Styling and annotations
|
||||
ax.set_title("Projected Automatable Wage Bill (P50 Coherence)", fontsize=18, pad=20)
|
||||
ax.set_xlabel("Year", fontsize=12)
|
||||
ax.set_ylabel("Automatable Annual Wage Bill (Trillions of USD)", fontsize=12)
|
||||
|
||||
# Format Y-axis to show trillions
|
||||
def trillions_formatter(x, pos):
|
||||
return f'${x / 1e12:.1f}T'
|
||||
ax.yaxis.set_major_formatter(mticker.FuncFormatter(trillions_formatter))
|
||||
|
||||
total_wage_bill = df_with_wages['wage_per_task'].sum()
|
||||
ax.set_ylim(0, total_wage_bill * 1.05)
|
||||
|
||||
if "Baseline" in projection_results:
|
||||
_, proj_df, _ = projection_results["Baseline"]
|
||||
ax.set_xlim(datetime(2022, 1, 1), proj_df["date"].max())
|
||||
|
||||
# Create the legend from the custom lines and the scatter plot
|
||||
scatter_legend = ax.get_legend_handles_labels()[0]
|
||||
ax.legend(handles=legend_lines + scatter_legend, loc="upper left", fontsize=11)
|
||||
|
||||
ax.grid(True, which="both", linestyle="--", linewidth=0.5)
|
||||
plt.tight_layout()
|
||||
plt.savefig(OUTPUT_PATH)
|
||||
plt.close(fig)
|
||||
|
||||
print(f"Generated sensitivity analysis plot: {OUTPUT_PATH}")
|
||||
yield OUTPUT_PATH
|
168
pipeline/generators/projected_task_automation.py
Normal file
168
pipeline/generators/projected_task_automation.py
Normal file
|
@ -0,0 +1,168 @@
|
|||
from pathlib import Path
|
||||
from typing import Generator, Dict, Tuple
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from scipy.stats import linregress
|
||||
from datetime import datetime
|
||||
from ..utils import style_plot, LIME
|
||||
|
||||
def _generate_projection_data(
|
||||
metr_results: Dict,
|
||||
df: pd.DataFrame,
|
||||
percentile_key: str,
|
||||
) -> Tuple[pd.DataFrame, pd.DataFrame] | None:
|
||||
"""
|
||||
Generates projection data for a given percentile key (e.g., 'p50_horizon_length').
|
||||
Returns a tuple of (metr_df_with_pct, projection_df), or None if data is insufficient.
|
||||
"""
|
||||
# 1. Process METR data to get all model performance over time for the given percentile
|
||||
all_model_data = []
|
||||
for model_name, data in metr_results.get("results", {}).items():
|
||||
for agent_name, agent_data in data.get("agents", {}).items():
|
||||
release_date_str = data.get("release_date")
|
||||
horizon = agent_data.get(percentile_key, {}).get("estimate")
|
||||
|
||||
if release_date_str and horizon is not None:
|
||||
unique_model_name = f"{model_name}-{agent_name}"
|
||||
all_model_data.append({
|
||||
"model": unique_model_name,
|
||||
"release_date": release_date_str,
|
||||
"horizon_minutes": horizon,
|
||||
})
|
||||
|
||||
if not all_model_data:
|
||||
print(f"Warning: No models with {percentile_key} found in METR data. Skipping.")
|
||||
return None
|
||||
|
||||
metr_df = pd.DataFrame(all_model_data).sort_values("release_date").reset_index(drop=True)
|
||||
metr_df['release_date'] = pd.to_datetime(metr_df['release_date'])
|
||||
|
||||
# 2. Perform log-linear regression on coherence over time
|
||||
metr_df = metr_df[metr_df['horizon_minutes'] > 0].copy()
|
||||
if len(metr_df) < 2:
|
||||
print(f"Warning: Not enough data points for regression for {percentile_key}. Skipping.")
|
||||
return None
|
||||
|
||||
metr_df['days_since_start'] = (metr_df['release_date'] - metr_df['release_date'].min()).dt.days
|
||||
log_y = np.log(metr_df['horizon_minutes'])
|
||||
x = metr_df['days_since_start']
|
||||
|
||||
slope, intercept, r_value, _, _ = linregress(x, log_y)
|
||||
doubling_time_days = np.log(2) / slope
|
||||
print(f"METR all models {percentile_key} trend: R^2 = {r_value**2:.2f}, Doubling time = {doubling_time_days:.1f} days")
|
||||
|
||||
# 3. Project coherence into the future
|
||||
start_date = metr_df['release_date'].min()
|
||||
future_dates = pd.to_datetime(pd.date_range(start=start_date, end="2035-01-01", freq="ME"))
|
||||
future_days = (future_dates - start_date).days.to_numpy()
|
||||
|
||||
projected_log_horizon = intercept + slope * future_days
|
||||
projected_horizon_minutes = np.exp(projected_log_horizon)
|
||||
|
||||
projection_df = pd.DataFrame({
|
||||
"date": future_dates,
|
||||
"projected_coherence_minutes": projected_horizon_minutes,
|
||||
})
|
||||
|
||||
# 4. Calculate the percentage of tasks automated over time based on our estimates
|
||||
total_tasks = len(df)
|
||||
if total_tasks == 0:
|
||||
return None
|
||||
|
||||
for bound in ["lb", "mid", "ub"]:
|
||||
col_name = 'estimate_midpoint' if bound == 'mid' else f'{bound}_estimate_in_minutes'
|
||||
projection_df[f"pct_automatable_{bound}"] = projection_df["projected_coherence_minutes"].apply(
|
||||
lambda h: (df[col_name] <= h).sum() / total_tasks * 100
|
||||
)
|
||||
|
||||
metr_df["pct_automatable_mid"] = metr_df["horizon_minutes"].apply(
|
||||
lambda h: (df['estimate_midpoint'] <= h).sum() / total_tasks * 100
|
||||
)
|
||||
|
||||
return metr_df, projection_df
|
||||
|
||||
|
||||
def _plot_projection(ax, projection_df, metr_df, label, color, line_style='-'):
|
||||
"""Helper function to draw a single projection on a given axis."""
|
||||
# Plot the projected automation percentage
|
||||
ax.plot(
|
||||
projection_df["date"],
|
||||
projection_df["pct_automatable_mid"],
|
||||
label=f"Mid-point",
|
||||
color=color,
|
||||
linewidth=2.5,
|
||||
linestyle=line_style,
|
||||
zorder=3
|
||||
)
|
||||
ax.fill_between(
|
||||
projection_df["date"],
|
||||
projection_df["pct_automatable_lb"],
|
||||
projection_df["pct_automatable_ub"],
|
||||
color=color,
|
||||
alpha=0.15,
|
||||
label=f"Lower/upper bound range",
|
||||
zorder=2
|
||||
)
|
||||
# Plot the actual METR data points
|
||||
ax.scatter(
|
||||
metr_df['release_date'],
|
||||
metr_df['pct_automatable_mid'],
|
||||
color=color,
|
||||
edgecolor='black',
|
||||
s=60,
|
||||
zorder=4,
|
||||
label=f"Model with {label[1:]}% success rate"
|
||||
)
|
||||
|
||||
|
||||
def generate_projected_task_automation_plot(
|
||||
output_dir: Path,
|
||||
metr_results: Dict,
|
||||
df: pd.DataFrame,
|
||||
**kwargs,
|
||||
) -> Generator[Path, None, None]:
|
||||
"""
|
||||
Generates plots projecting task automation based on METR's p50 and p80
|
||||
coherence data.
|
||||
"""
|
||||
style_plot()
|
||||
|
||||
p50_data = _generate_projection_data(metr_results, df, 'p50_horizon_length')
|
||||
p80_data = _generate_projection_data(metr_results, df, 'p80_horizon_length')
|
||||
|
||||
# Plot P50 alone
|
||||
if p50_data:
|
||||
p50_metr_df, p50_proj_df = p50_data
|
||||
fig, ax = plt.subplots(figsize=(12, 8))
|
||||
_plot_projection(ax, p50_proj_df, p50_metr_df, "P50", LIME['600'])
|
||||
ax.set_title("How long before sequential coherence stops being a bottleneck?", fontsize=16, pad=20)
|
||||
ax.set_xlabel("Year")
|
||||
ax.set_ylabel("% of task automatable (50% success rate)")
|
||||
ax.set_ylim(0, 100.5)
|
||||
ax.set_xlim(datetime(2022, 1, 1), p50_proj_df["date"].max())
|
||||
ax.grid(True, which="both", linestyle="--", linewidth=0.5)
|
||||
ax.legend(loc="upper left")
|
||||
plt.tight_layout()
|
||||
output_path = output_dir / "projected_task_automation_p50.png"
|
||||
plt.savefig(output_path)
|
||||
plt.close(fig)
|
||||
yield output_path
|
||||
|
||||
# Plot P80 alone
|
||||
if p80_data:
|
||||
p80_metr_df, p80_proj_df = p80_data
|
||||
fig, ax = plt.subplots(figsize=(12, 8))
|
||||
_plot_projection(ax, p80_proj_df, p80_metr_df, "P80", 'tab:cyan')
|
||||
ax.set_title("Projected Task Automation (P80 AI Coherence)", fontsize=16, pad=20)
|
||||
ax.set_xlabel("Year")
|
||||
ax.set_ylabel("% of Estimable Economic Tasks Automatable")
|
||||
ax.set_ylim(0, 100.5)
|
||||
ax.set_xlim(datetime(2022, 1, 1), p80_proj_df["date"].max())
|
||||
ax.grid(True, which="both", linestyle="--", linewidth=0.5)
|
||||
ax.legend(loc="upper left")
|
||||
plt.tight_layout()
|
||||
output_path = output_dir / "projected_task_automation_p80.png"
|
||||
plt.savefig(output_path)
|
||||
plt.close(fig)
|
||||
yield output_path
|
54
pipeline/generators/sequential_coherence_cdf.py
Normal file
54
pipeline/generators/sequential_coherence_cdf.py
Normal file
|
@ -0,0 +1,54 @@
|
|||
from pathlib import Path
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib.ticker as mtick
|
||||
from ..utils import LIME, style_plot
|
||||
|
||||
def plot_sequential_coherence_cdf(output_dir: Path, df: pd.DataFrame, **kwargs):
|
||||
style_plot()
|
||||
output_path = output_dir / "sequential_coherence_cdf.png"
|
||||
|
||||
def cdf(series):
|
||||
"""Helper function to calculate CDF data."""
|
||||
s = series.sort_values().reset_index(drop=True)
|
||||
# Calculate cumulative percentage
|
||||
return s.values, ((s.index + 1) / len(s)) * 100
|
||||
|
||||
# Calculate CDF for lower, upper, and midpoint estimates
|
||||
x_lb, y_lb = cdf(df['lb_estimate_in_minutes'])
|
||||
x_ub, y_ub = cdf(df['ub_estimate_in_minutes'])
|
||||
x_mid, y_mid = cdf(df['estimate_midpoint'])
|
||||
|
||||
# Create the plot
|
||||
fig, ax = plt.subplots(figsize=(12, 7))
|
||||
|
||||
# Plot the CDFs as step plots
|
||||
ax.step(x_lb, y_lb, where='post', color=LIME['300'], linewidth=1.8, linestyle='--', zorder=2, label='Lower bound estimate')
|
||||
ax.step(x_ub, y_ub, where='post', color=LIME['900'], linewidth=1.8, linestyle=':', zorder=3, label='Upper bound estimate')
|
||||
ax.step(x_mid, y_mid, where='post', color=LIME['600'], linewidth=2.2, zorder=4, label='Mid-point')
|
||||
|
||||
# --- Styling and Annotations ---
|
||||
ax.set_xscale('log')
|
||||
ax.set_ylim(0, 100)
|
||||
ax.yaxis.set_major_formatter(mtick.PercentFormatter(decimals=0))
|
||||
|
||||
# Set titles and labels using the standard axes methods
|
||||
ax.set_title("% of Tasks With Sequential Coherence ≤ X")
|
||||
ax.set_xlabel("Sequential Coherence (X)")
|
||||
ax.set_ylabel("Cumulative Percentage of Tasks")
|
||||
|
||||
# Define custom x-axis ticks and labels for better readability
|
||||
ticks = [1, 5, 10, 30, 60, 120, 240, 480, 1440, 2880, 10080, 43200, 129600, 259200, 525600]
|
||||
ticklabels = ['1 min', '5 min', '10 min', '30 min', '1 hr', '2 hr', '4 hr', '8 hr', '1 day', '2 days',
|
||||
'1 wk', '30 days', '90 days', '180 days', '1 yr']
|
||||
ax.set_xticks(ticks)
|
||||
ax.set_xticklabels(ticklabels, rotation=45, ha='right')
|
||||
|
||||
ax.legend(loc='lower right')
|
||||
|
||||
# --- Save and close ---
|
||||
plt.tight_layout()
|
||||
plt.savefig(output_path, bbox_inches='tight')
|
||||
plt.close(fig)
|
||||
|
||||
yield output_path
|
|
@ -1,41 +0,0 @@
|
|||
"""
|
||||
This module defines the Metadata model for the pipeline.
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import Dict, Any
|
||||
|
||||
class Metadata(BaseModel):
|
||||
"""
|
||||
A Pydantic model for storing pipeline metadata.
|
||||
|
||||
This class is intended to be instantiated once and passed through the
|
||||
pipeline. Each step in the pipeline can then add its own metadata.
|
||||
This provides a centralized and structured way to track data provenance,
|
||||
versions, and other important information.
|
||||
"""
|
||||
fetchers: Dict[str, Dict[str, Any]] = Field(default_factory=dict)
|
||||
enrichments: Dict[str, Dict[str, Any]] = Field(default_factory=dict)
|
||||
|
||||
ts: str = Field(default_factory=lambda: datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
|
||||
commit: str = Field(default_factory=lambda: _get_current_commit())
|
||||
|
||||
|
||||
def _get_current_commit() -> str:
|
||||
"""
|
||||
Returns the current git commit hash, "unknown", or "errored" depending on why the commit could not be retrieved.
|
||||
"""
|
||||
import subprocess
|
||||
try:
|
||||
# Get the current commit hash
|
||||
commit_hash = subprocess.check_output(
|
||||
["git", "rev-parse", "HEAD"], stderr=subprocess.PIPE, text=True
|
||||
).strip()
|
||||
return commit_hash
|
||||
except subprocess.CalledProcessError:
|
||||
# If git command fails (e.g., not a git repository)
|
||||
return "errored"
|
||||
except FileNotFoundError:
|
||||
# If git is not installed
|
||||
return "unknown"
|
|
@ -1,140 +0,0 @@
|
|||
from .run import Run
|
||||
from .logger import logger
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
|
||||
def check_for_insanity(run: Run) -> Run:
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
def create_df_tasks(run: Run) -> Run:
|
||||
"""
|
||||
Creates a dataframe of tasks from the O*NET database, and merges it with remote status data.
|
||||
This replicates the logic from old/enrich_task_ratings.py and parts of old/analysis.py
|
||||
|
||||
The resulting dataframe, `run.df_tasks` will be used by the enrichment steps.
|
||||
"""
|
||||
logger.info("Creating tasks dataframe")
|
||||
cache_path = run.cache_dir / f"onet_{run.onet_version}_tasks_with_remote_status.parquet"
|
||||
if cache_path.exists():
|
||||
logger.info(f"Loading cached tasks dataframe from {cache_path}")
|
||||
run.df_tasks = pd.read_parquet(cache_path)
|
||||
return run
|
||||
|
||||
query = """
|
||||
SELECT
|
||||
tr.onetsoc_code,
|
||||
tr.task_id,
|
||||
ts.task,
|
||||
od.title AS occupation_title,
|
||||
od.description AS occupation_description,
|
||||
tr.scale_id,
|
||||
tr.category,
|
||||
tr.data_value,
|
||||
dr.dwa_title
|
||||
FROM
|
||||
task_ratings tr
|
||||
JOIN
|
||||
task_statements ts ON tr.task_id = ts.task_id
|
||||
JOIN
|
||||
occupation_data od ON tr.onetsoc_code = od.onetsoc_code
|
||||
LEFT JOIN
|
||||
tasks_to_dwas td ON tr.onetsoc_code = td.onetsoc_code AND tr.task_id = td.task_id
|
||||
LEFT JOIN
|
||||
dwa_reference dr ON td.dwa_id = dr.dwa_id;
|
||||
"""
|
||||
df = pd.read_sql_query(query, run.onet_conn)
|
||||
logger.info(f"Fetched {len(df)} records (including DWA info) from the database.")
|
||||
|
||||
# Separate ratings from DWAs
|
||||
core_cols = [
|
||||
"onetsoc_code", "task_id", "task", "occupation_title",
|
||||
"occupation_description", "scale_id", "category", "data_value"
|
||||
]
|
||||
ratings_df = df[core_cols].drop_duplicates().reset_index(drop=True)
|
||||
|
||||
dwa_cols = ["onetsoc_code", "task_id", "dwa_title"]
|
||||
dwas_df = df[dwa_cols].dropna(subset=["dwa_title"]).drop_duplicates().reset_index(drop=True)
|
||||
|
||||
# 1. Handle Frequency (FT)
|
||||
logger.info("Processing Frequency data")
|
||||
freq_df = ratings_df[ratings_df["scale_id"] == "FT"].copy()
|
||||
if not freq_df.empty:
|
||||
freq_pivot = freq_df.pivot_table(
|
||||
index=["onetsoc_code", "task_id"],
|
||||
columns="category",
|
||||
values="data_value",
|
||||
fill_value=0,
|
||||
)
|
||||
freq_pivot.columns = [f"frequency_category_{int(col)}" for col in freq_pivot.columns]
|
||||
else:
|
||||
idx = pd.MultiIndex(levels=[[], []], codes=[[], []], names=["onetsoc_code", "task_id"])
|
||||
freq_pivot = pd.DataFrame(index=idx)
|
||||
|
||||
# 2. Handle Importance (IM, IJ)
|
||||
logger.info("Processing Importance data")
|
||||
imp_df = ratings_df[ratings_df["scale_id"].isin(["IM", "IJ"])].copy()
|
||||
if not imp_df.empty:
|
||||
imp_avg = imp_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index()
|
||||
imp_avg.rename(columns={"data_value": "importance_average"}, inplace=True)
|
||||
else:
|
||||
imp_avg = pd.DataFrame(columns=["onetsoc_code", "task_id", "importance_average"])
|
||||
|
||||
# 3. Handle Relevance (RT)
|
||||
logger.info("Processing Relevance data")
|
||||
rel_df = ratings_df[ratings_df["scale_id"] == "RT"].copy()
|
||||
if not rel_df.empty:
|
||||
rel_avg = rel_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index()
|
||||
rel_avg.rename(columns={"data_value": "relevance_average"}, inplace=True)
|
||||
else:
|
||||
rel_avg = pd.DataFrame(columns=["onetsoc_code", "task_id", "relevance_average"])
|
||||
|
||||
# 4. Process DWAs
|
||||
logger.info("Processing DWA data")
|
||||
if not dwas_df.empty:
|
||||
dwas_grouped = dwas_df.groupby(["onetsoc_code", "task_id"])["dwa_title"].apply(list).reset_index()
|
||||
dwas_grouped.rename(columns={"dwa_title": "dwas"}, inplace=True)
|
||||
else:
|
||||
dwas_grouped = None
|
||||
|
||||
# 5. Get Base Task/Occupation Info
|
||||
logger.info("Extracting base task/occupation info")
|
||||
base_cols = ["onetsoc_code", "task_id", "task", "occupation_title", "occupation_description"]
|
||||
base_info = ratings_df[base_cols].drop_duplicates().set_index(["onetsoc_code", "task_id"])
|
||||
|
||||
# 6. Merge Processed ONET Data
|
||||
logger.info("Merging processed ONET data")
|
||||
final_df = base_info.merge(freq_pivot, left_index=True, right_index=True, how="left")
|
||||
final_df = final_df.reset_index()
|
||||
|
||||
if not imp_avg.empty:
|
||||
final_df = final_df.merge(imp_avg, on=["onetsoc_code", "task_id"], how="left")
|
||||
else:
|
||||
final_df["importance_average"] = np.nan
|
||||
|
||||
if not rel_avg.empty:
|
||||
final_df = final_df.merge(rel_avg, on=["onetsoc_code", "task_id"], how="left")
|
||||
else:
|
||||
final_df["relevance_average"] = np.nan
|
||||
|
||||
if dwas_grouped is not None and not dwas_grouped.empty:
|
||||
final_df = final_df.merge(dwas_grouped, on=["onetsoc_code", "task_id"], how="left")
|
||||
if "dwas" in final_df.columns:
|
||||
final_df["dwas"] = final_df["dwas"].apply(lambda x: x if isinstance(x, list) else [])
|
||||
else:
|
||||
final_df["dwas"] = [[] for _ in range(len(final_df))]
|
||||
|
||||
final_df = final_df.replace({np.nan: None})
|
||||
|
||||
# 7. Merge with EPOCH remote data
|
||||
logger.info("Merging with EPOCH remote data")
|
||||
final_df = pd.merge(final_df, run.epoch_df[['Task', 'Remote']], left_on='task', right_on='Task', how='left')
|
||||
final_df = final_df.drop('Task', axis=1).rename(columns={'Remote': 'remote_status'})
|
||||
|
||||
|
||||
logger.info(f"Created tasks dataframe with shape {final_df.shape}")
|
||||
final_df.to_parquet(cache_path)
|
||||
|
||||
run.df_tasks = final_df
|
||||
return run
|
|
@ -1,27 +0,0 @@
|
|||
from pydantic import BaseModel, Field
|
||||
import sqlite3
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from .metadata import Metadata
|
||||
|
||||
class Run(BaseModel):
|
||||
model_config = {"arbitrary_types_allowed": True}
|
||||
# === FETCHERS ===
|
||||
onet_conn: Optional[sqlite3.Connection] = None
|
||||
onet_version: Optional[str] = None
|
||||
|
||||
oesm_df: Optional[pd.DataFrame] = None
|
||||
oesm_version: Optional[str] = None
|
||||
|
||||
epoch_df: Optional[pd.DataFrame] = None
|
||||
epoch_version: Optional[str] = None
|
||||
|
||||
# === ENRICHMENTS ===
|
||||
task_estimateability_df: Optional[pd.DataFrame] = None
|
||||
task_estimates_df: Optional[pd.DataFrame] = None
|
||||
|
||||
meta: Metadata = Field(default_factory=Metadata)
|
||||
|
||||
cache_dir: Path
|
||||
output_dir: Path
|
|
@ -1,74 +1,215 @@
|
|||
import sqlite3
|
||||
import os
|
||||
from .logger import logger
|
||||
import pandas as pd
|
||||
from dotenv import load_dotenv
|
||||
from .fetchers import fetch_oesm_data, fetch_epoch_remote_data, fetch_onet_database
|
||||
from .enrichments import enrich_with_task_estimateability, enrich_with_task_estimates
|
||||
from .postprocessors import check_for_insanity, create_df_tasks
|
||||
from .fetchers import fetch_onet_database, fetch_oesm_data, fetch_epoch_remote_data, ONET_VERSION, fetch_metr_data
|
||||
from .classification import classify_tasks_as_estimable, generate_time_estimates_for_tasks
|
||||
from .generators import GENERATORS
|
||||
from .run import Run
|
||||
from .constants import GRAY
|
||||
from .aggregate import create_task_summary_by_occupation_df, aggregate_task_summary_by_major_code
|
||||
from .utils import convert_to_minutes
|
||||
import argparse
|
||||
import platformdirs
|
||||
import seaborn as sns
|
||||
import matplotlib as mpl
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
CACHE_DIR = platformdirs.user_cache_dir("econtai")
|
||||
|
||||
def run(output_dir: Path | Optional[str] = None):
|
||||
load_dotenv()
|
||||
_setup_graph_rendering()
|
||||
|
||||
if output_dir is None:
|
||||
output_dir = Path("dist/")
|
||||
elif isinstance(output_dir, str):
|
||||
output_dir = Path(output_dir).resolve()
|
||||
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
current_run = Run(output_dir=output_dir, cache_dir=Path(CACHE_DIR).resolve())
|
||||
current_run.cache_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Fetchers (fetchers.py)
|
||||
current_run.onet_conn, current_run.onet_version = fetch_onet_database(current_run)
|
||||
current_run.oesm_df, current_run.oesm_version = fetch_oesm_data(current_run)
|
||||
current_run.epoch_df, current_run.epoch_version = fetch_epoch_remote_data(current_run)
|
||||
|
||||
current_run = create_df_tasks(current_run)
|
||||
|
||||
# Enrichments (enrichments.py)
|
||||
current_run.task_estimateability_df = enrich_with_task_estimateability(current_run)
|
||||
current_run.task_estimates_df = enrich_with_task_estimates(current_run)
|
||||
|
||||
# Postprocessors (postprocessors.py)
|
||||
check_for_insanity(current_run)
|
||||
|
||||
# Generators (generators/)
|
||||
for gen in GENERATORS:
|
||||
gen(current_run)
|
||||
|
||||
|
||||
def _setup_graph_rendering():
|
||||
mpl.rcParams.update({
|
||||
'figure.facecolor' : GRAY['50'],
|
||||
'axes.facecolor' : GRAY['50'],
|
||||
'axes.edgecolor' : GRAY['100'],
|
||||
'axes.labelcolor' : GRAY['700'],
|
||||
'xtick.color' : GRAY['700'],
|
||||
'ytick.color' : GRAY['700'],
|
||||
'font.family' : 'Inter',
|
||||
'font.size' : 11,
|
||||
})
|
||||
class Runner:
|
||||
onet_conn: sqlite3.Connection
|
||||
oesm_df: pd.DataFrame
|
||||
epoch_df: pd.DataFrame
|
||||
metr_results: dict
|
||||
|
||||
def __init__(self, output_dir: Path | str, debug: bool, bust_estimability: bool, bust_estimates: bool):
|
||||
if isinstance(output_dir, str):
|
||||
output_dir = Path(output_dir).resolve()
|
||||
|
||||
sns.set_style("white")
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
self.output_dir = output_dir
|
||||
self.intermediate_dir = self.output_dir / "intermediate"
|
||||
self.intermediate_dir.mkdir(parents=True, exist_ok=True)
|
||||
self.cache_dir = platformdirs.user_cache_path("econtai")
|
||||
self.debug = debug
|
||||
self.bust_estimability = bust_estimability
|
||||
self.bust_estimates = bust_estimates
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Run the econtai pipeline.")
|
||||
parser.add_argument("--output-dir", type=str, help="The directory to write output files to.")
|
||||
args = parser.parse_args()
|
||||
run(output_dir=args.output_dir)
|
||||
if debug:
|
||||
os.environ["LITELLM_LOG"] = os.environ.get("LITELLM_LOG", "INFO")
|
||||
|
||||
def run(self):
|
||||
load_dotenv()
|
||||
|
||||
self.onet_conn = fetch_onet_database(self.cache_dir)
|
||||
self.oesm_df = fetch_oesm_data(self.cache_dir)
|
||||
self.epoch_df = fetch_epoch_remote_data(self.cache_dir)
|
||||
self.metr_results = fetch_metr_data(self.cache_dir)
|
||||
|
||||
self.df_tasks = self._create_df_tasks()
|
||||
self.df_tasks['onetsoc_major'] = self.df_tasks['onetsoc_code'].str[:2]
|
||||
|
||||
df_to_process = self.df_tasks[
|
||||
(self.df_tasks['importance_average'] > 3) &
|
||||
(self.df_tasks['remote_status'] == 'remote')
|
||||
].copy()
|
||||
|
||||
if self.debug:
|
||||
df_to_process = df_to_process.head(10)
|
||||
|
||||
task_estimability_df = classify_tasks_as_estimable(self.cache_dir, df_to_process, bust=self.bust_estimability)
|
||||
self.df_tasks = pd.merge(self.df_tasks, task_estimability_df, on='task', how='left')
|
||||
self.df_tasks['estimable'] = self.df_tasks['estimable'].fillna(False)
|
||||
self.df_tasks.to_parquet(self.intermediate_dir / "df_tasks.parquet")
|
||||
df_to_process = pd.merge(df_to_process, task_estimability_df, on='task', how='left')
|
||||
df_to_process['estimable'] = self.df_tasks['estimable'].fillna(False)
|
||||
|
||||
df_to_process = df_to_process[df_to_process['estimable']].copy()
|
||||
|
||||
task_estimates_df = generate_time_estimates_for_tasks(self.cache_dir, df_to_process, bust=self.bust_estimates)
|
||||
df = pd.merge(df_to_process, task_estimates_df, on=['onetsoc_code', 'task_id'], how='left')
|
||||
df['lb_estimate_in_minutes'] = df.apply(lambda row: convert_to_minutes(row['lb_estimate_qty'], row['lb_estimate_unit']), axis=1)
|
||||
df['ub_estimate_in_minutes'] = df.apply(lambda row: convert_to_minutes(row['ub_estimate_qty'], row['ub_estimate_unit']), axis=1)
|
||||
df['estimate_range'] = df.ub_estimate_in_minutes - df.lb_estimate_in_minutes
|
||||
df['estimate_ratio'] = np.divide(df.ub_estimate_in_minutes, df.lb_estimate_in_minutes).replace([np.inf, -np.inf], None)
|
||||
df['estimate_midpoint'] = (df.lb_estimate_in_minutes + df.ub_estimate_in_minutes) / 2
|
||||
|
||||
df.to_parquet(self.intermediate_dir / "estimable_tasks_with_estimates.parquet")
|
||||
|
||||
self.task_summary_by_occupation_df = create_task_summary_by_occupation_df(self.df_tasks, self.oesm_df)
|
||||
self.task_summary_by_occupation_df.to_parquet(self.intermediate_dir / "task_summary_by_occupation.parquet")
|
||||
self.task_summary_by_major_occupation_df = aggregate_task_summary_by_major_code(self.task_summary_by_occupation_df)
|
||||
self.task_summary_by_major_occupation_df.to_parquet(self.intermediate_dir / "task_summary_by_major_occupation.parquet")
|
||||
|
||||
self._check_for_insanity(df)
|
||||
|
||||
for gen in GENERATORS:
|
||||
for asset in gen(**{
|
||||
"output_dir": self.output_dir,
|
||||
"runner": self,
|
||||
"df": df,
|
||||
"task_summary_by_occupation_df": self.task_summary_by_occupation_df,
|
||||
"task_summary_by_major_occupation_df": self.task_summary_by_major_occupation_df,
|
||||
"df_tasks": self.df_tasks,
|
||||
"oesm_df": self.oesm_df,
|
||||
"metr_results": self.metr_results,
|
||||
}):
|
||||
logger.info(f"New asset: {asset}")
|
||||
|
||||
def _create_df_tasks(self) -> pd.DataFrame:
|
||||
DATA_PATH = self.cache_dir / f"onet_{ONET_VERSION}_tasks_with_remote_status.parquet"
|
||||
if DATA_PATH.exists():
|
||||
logger.info(f"Loading cached tasks dataframe from {DATA_PATH}")
|
||||
return pd.read_parquet(DATA_PATH)
|
||||
|
||||
logger.info("Creating tasks dataframe")
|
||||
query = """
|
||||
SELECT
|
||||
tr.onetsoc_code,
|
||||
tr.task_id,
|
||||
ts.task,
|
||||
od.title AS occupation_title,
|
||||
od.description AS occupation_description,
|
||||
tr.scale_id,
|
||||
tr.category,
|
||||
tr.data_value
|
||||
FROM
|
||||
task_ratings tr
|
||||
JOIN
|
||||
task_statements ts ON tr.task_id = ts.task_id
|
||||
JOIN
|
||||
occupation_data od ON tr.onetsoc_code = od.onetsoc_code;
|
||||
"""
|
||||
ratings_df = pd.read_sql_query(query, self.onet_conn)
|
||||
logger.info(f"Fetched {len(ratings_df)} task rating records from the database.")
|
||||
|
||||
# 1. Handle Frequency (FT)
|
||||
logger.info("Processing Frequency data")
|
||||
freq_df = ratings_df[ratings_df["scale_id"] == "FT"].copy()
|
||||
if not freq_df.empty:
|
||||
freq_pivot = freq_df.pivot_table(
|
||||
index=["onetsoc_code", "task_id"],
|
||||
columns="category",
|
||||
values="data_value",
|
||||
fill_value=0,
|
||||
)
|
||||
freq_pivot.columns = [f"frequency_category_{int(col)}" for col in freq_pivot.columns]
|
||||
else:
|
||||
raise ValueError("No frequency data.")
|
||||
|
||||
# 2. Handle Importance (IM, IJ)
|
||||
logger.info("Processing Importance data")
|
||||
imp_df = ratings_df[ratings_df["scale_id"].isin(["IM", "IJ"])].copy()
|
||||
if not imp_df.empty:
|
||||
imp_avg = imp_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index()
|
||||
imp_avg.rename(columns={"data_value": "importance_average"}, inplace=True)
|
||||
else:
|
||||
raise ValueError("No importance data.")
|
||||
|
||||
# 3. Handle Relevance (RT)
|
||||
logger.info("Processing Relevance data")
|
||||
rel_df = ratings_df[ratings_df["scale_id"] == "RT"].copy()
|
||||
if not rel_df.empty:
|
||||
rel_avg = rel_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index()
|
||||
rel_avg.rename(columns={"data_value": "relevance_average"}, inplace=True)
|
||||
else:
|
||||
raise ValueError("No relevance data.")
|
||||
|
||||
# 5. Get Base Task/Occupation Info
|
||||
logger.info("Extracting base task/occupation info")
|
||||
base_cols = ["onetsoc_code", "task_id", "task", "occupation_title", "occupation_description"]
|
||||
base_info = ratings_df[base_cols].drop_duplicates().set_index(["onetsoc_code", "task_id"])
|
||||
|
||||
# 6. Merge Processed ONET Data
|
||||
logger.info("Merging processed ONET data")
|
||||
final_df = base_info.merge(freq_pivot, left_index=True, right_index=True, how="left")
|
||||
final_df = final_df.reset_index()
|
||||
|
||||
if not imp_avg.empty:
|
||||
final_df = final_df.merge(imp_avg, on=["onetsoc_code", "task_id"], how="left")
|
||||
else:
|
||||
final_df["importance_average"] = np.nan
|
||||
|
||||
if not rel_avg.empty:
|
||||
final_df = final_df.merge(rel_avg, on=["onetsoc_code", "task_id"], how="left")
|
||||
else:
|
||||
final_df["relevance_average"] = np.nan
|
||||
|
||||
final_df = final_df.replace({np.nan: None})
|
||||
|
||||
# 7. Merge with EPOCH remote data
|
||||
logger.info("Merging with EPOCH remote data")
|
||||
final_df = pd.merge(final_df, self.epoch_df[['Task', 'Remote']], left_on='task', right_on='Task', how='left')
|
||||
final_df = final_df.drop('Task', axis=1).rename(columns={'Remote': 'remote_status'})
|
||||
|
||||
logger.info(f"Created tasks dataframe with shape {final_df.shape}")
|
||||
final_df.to_parquet(DATA_PATH)
|
||||
|
||||
return final_df
|
||||
|
||||
def _check_for_insanity(self, df: pd.DataFrame):
|
||||
if df['lb_estimate_in_minutes'].isnull().any():
|
||||
missing_count = df['lb_estimate_in_minutes'].isnull().sum()
|
||||
raise ValueError(f"Found {missing_count} atomic tasks with missing 'lb_estimate_in_minutes'.")
|
||||
|
||||
if df['ub_estimate_in_minutes'].isnull().any():
|
||||
missing_count = df['ub_estimate_in_minutes'].isnull().sum()
|
||||
raise ValueError(f"Found {missing_count} atomic tasks with missing 'ub_estimate_in_minutes'.")
|
||||
|
||||
valid_estimates = df.dropna(subset=['lb_estimate_in_minutes', 'ub_estimate_in_minutes'])
|
||||
impossible_bounds = valid_estimates[
|
||||
(valid_estimates['lb_estimate_in_minutes'] <= 0) |
|
||||
(valid_estimates['ub_estimate_in_minutes'] <= 0) |
|
||||
(valid_estimates['lb_estimate_in_minutes'] > valid_estimates['ub_estimate_in_minutes'])
|
||||
]
|
||||
if not impossible_bounds.empty:
|
||||
raise ValueError(f"Found {len(impossible_bounds)} rows with impossible bounds (e.g., lb > ub or value <= 0).")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
parser = argparse.ArgumentParser(description="Run the econtai pipeline.")
|
||||
parser.add_argument("--output-dir", type=str, default="dist/", help="The directory to write output files to.")
|
||||
parser.add_argument("--bust-estimability", action="store_true", help="Bust the saved task estimability classification (EXPENSIVE)")
|
||||
parser.add_argument("--bust-estimates", action="store_true", help="Bust the tasks estimates (EXPENSIVE)")
|
||||
parser.add_argument("--debug", action="store_true", help="Enable debug mode (e.g., process fewer tasks).")
|
||||
|
||||
args = parser.parse_args()
|
||||
Runner(output_dir=args.output_dir, debug=args.debug, bust_estimability=args.bust_estimability, bust_estimates=args.bust_estimates).run()
|
||||
|
|
222
pipeline/utils.py
Normal file
222
pipeline/utils.py
Normal file
|
@ -0,0 +1,222 @@
|
|||
import subprocess
|
||||
import matplotlib.colors as mcolors
|
||||
import matplotlib as mpl
|
||||
import seaborn as sns
|
||||
import tempfile
|
||||
import litellm
|
||||
import time
|
||||
import math
|
||||
from tqdm import tqdm
|
||||
from typing import Any, List, Dict
|
||||
from .logger import logger
|
||||
|
||||
OCCUPATION_MAJOR_CODES = {
|
||||
'11': 'Management',
|
||||
'13': 'Business & Financial',
|
||||
'15': 'Computer & Mathematical',
|
||||
'17': 'Architecture & Engineering',
|
||||
'19': 'Life, Physical, & Social Science',
|
||||
'21': 'Community & Social Service',
|
||||
'23': 'Legal',
|
||||
'25': 'Education, Training, & Library',
|
||||
'27': 'Arts, Design, & Media',
|
||||
'29': 'Healthcare Practitioners',
|
||||
'31': 'Healthcare Support',
|
||||
'33': 'Protective Service',
|
||||
'35': 'Food Preparation & Serving',
|
||||
'37': 'Building & Grounds Maintenance',
|
||||
'39': 'Personal Care & Service',
|
||||
'41': 'Sales & Related',
|
||||
'43': 'Office & Admin Support',
|
||||
'45': 'Farming, Fishing, & Forestry',
|
||||
'47': 'Construction & Extraction',
|
||||
'49': 'Installation, Maintenance, & Repair',
|
||||
'51': 'Production',
|
||||
'53': 'Transportation & Material Moving',
|
||||
'55': 'Military Specific',
|
||||
}
|
||||
|
||||
GRAY = {'50':'#f8fafc','100':'#f1f5f9','200':'#e2e8f0',
|
||||
'300':'#cbd5e1','400':'#94a3b8','500':'#64748b',
|
||||
'600':'#475569','700':'#334155','800':'#1e293b',
|
||||
'900':'#0f172a','950':'#020617'}
|
||||
|
||||
LIME = {'50': '#f7fee7','100': '#ecfcca','200': '#d8f999',
|
||||
'300': '#bbf451','400': '#9ae600','500': '#83cd00',
|
||||
'600': '#64a400','700': '#497d00','800': '#3c6300',
|
||||
'900': '#35530e','950': '#192e03'}
|
||||
|
||||
|
||||
def convert_to_minutes(qty, unit):
|
||||
"""Converts a quantity in a given unit to minutes."""
|
||||
return qty * {
|
||||
"minute": 1,
|
||||
"hour": 60,
|
||||
"day": 60 * 24,
|
||||
"week": 60 * 24 * 7,
|
||||
"month": 60 * 24 * 30,
|
||||
"trimester": 60 * 24 * 90,
|
||||
"semester": 60 * 24 * 180,
|
||||
"year": 60 * 24 * 365,
|
||||
}[unit]
|
||||
|
||||
|
||||
def pretty_display(df):
|
||||
print(df)
|
||||
return
|
||||
html_output = df.to_html(index=False)
|
||||
|
||||
# Create a temporary HTML file
|
||||
with tempfile.NamedTemporaryFile(mode='w', suffix=".html", encoding="utf-8") as temp_file:
|
||||
temp_file.write(html_output)
|
||||
temp_file_path = temp_file.name
|
||||
subprocess.run(["/home/felix/.nix-profile/bin/firefox-devedition", "-p", "Work (YouthAI)", temp_file_path], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
||||
input("Press Enter to continue after reviewing the HTML output...")
|
||||
|
||||
|
||||
def enrich(
|
||||
model: str,
|
||||
rpm: int, # Requests per minute
|
||||
messages_to_process: List[List[Dict[str, str]]],
|
||||
schema: Dict[str, Any],
|
||||
chunk_size: int = 100,
|
||||
):
|
||||
all_results = []
|
||||
num_messages = len(messages_to_process)
|
||||
if num_messages == 0:
|
||||
return all_results
|
||||
|
||||
num_chunks = math.ceil(num_messages / chunk_size)
|
||||
logger.info(f"Starting enrichment for {num_messages} messages, in {num_chunks} chunks of up to {chunk_size} each.")
|
||||
|
||||
# Calculate the time that should be allocated per request to respect the RPM limit.
|
||||
time_per_request = 60.0 / rpm if rpm > 0 else 0
|
||||
|
||||
for i in tqdm(range(num_chunks), desc="Enriching data in chunks"):
|
||||
chunk_start_time = time.time()
|
||||
|
||||
start_index = i * chunk_size
|
||||
end_index = start_index + chunk_size
|
||||
message_chunk = messages_to_process[start_index:end_index]
|
||||
|
||||
if not message_chunk:
|
||||
continue
|
||||
|
||||
try:
|
||||
# Send requests for the entire chunk in a batch for better performance.
|
||||
responses = litellm.batch_completion(
|
||||
model=model,
|
||||
messages=message_chunk,
|
||||
response_format={
|
||||
"type": "json_schema",
|
||||
"json_schema": schema,
|
||||
},
|
||||
)
|
||||
|
||||
# batch_completion returns the response or an exception object for each message.
|
||||
# We'll replace exceptions with None as expected by the calling functions.
|
||||
for response in responses:
|
||||
if isinstance(response, Exception):
|
||||
logger.error(f"API call within batch failed: {response}")
|
||||
all_results.append(None)
|
||||
else:
|
||||
all_results.append(response)
|
||||
|
||||
except Exception as e:
|
||||
# This catches catastrophic failures in batch_completion itself (e.g., auth)
|
||||
logger.error(f"litellm.batch_completion call failed for chunk {i+1}/{num_chunks}: {e}")
|
||||
all_results.extend([None] * len(message_chunk))
|
||||
|
||||
chunk_end_time = time.time()
|
||||
elapsed_time = chunk_end_time - chunk_start_time
|
||||
|
||||
# To enforce the rate limit, we calculate how long the chunk *should* have taken
|
||||
# and sleep for the remainder of that time.
|
||||
if time_per_request > 0:
|
||||
expected_duration_for_chunk = len(message_chunk) * time_per_request
|
||||
if elapsed_time < expected_duration_for_chunk:
|
||||
sleep_duration = expected_duration_for_chunk - elapsed_time
|
||||
logger.debug(f"Chunk processed in {elapsed_time:.2f}s. Sleeping for {sleep_duration:.2f}s to respect RPM.")
|
||||
time.sleep(sleep_duration)
|
||||
|
||||
return all_results
|
||||
|
||||
def get_contrasting_text_color(bg_color_hex_or_rgba):
|
||||
if isinstance(bg_color_hex_or_rgba, str):
|
||||
rgba = mcolors.to_rgba(bg_color_hex_or_rgba)
|
||||
else:
|
||||
rgba = bg_color_hex_or_rgba
|
||||
r, g, b, _ = rgba
|
||||
luminance = 0.2126 * r + 0.7152 * g + 0.0722 * b
|
||||
return 'black' if luminance > 0.55 else 'white'
|
||||
|
||||
|
||||
def style_plot():
|
||||
"""
|
||||
Applies a consistent and professional style to all plots.
|
||||
This function sets matplotlib's rcParams for a global effect.
|
||||
"""
|
||||
mpl.rcParams.update({
|
||||
'figure.facecolor': GRAY['50'],
|
||||
'figure.edgecolor': 'none',
|
||||
'figure.figsize': (12, 8),
|
||||
'figure.dpi': 150,
|
||||
|
||||
'axes.facecolor': GRAY['50'],
|
||||
'axes.edgecolor': GRAY['300'],
|
||||
'axes.grid': True,
|
||||
'axes.labelcolor': GRAY['800'],
|
||||
'axes.titlecolor': GRAY['900'],
|
||||
'axes.titlesize': 18,
|
||||
'axes.titleweight': 'bold',
|
||||
'axes.titlepad': 20,
|
||||
'axes.labelsize': 14,
|
||||
'axes.labelweight': 'semibold',
|
||||
'axes.labelpad': 10,
|
||||
'axes.spines.top': False,
|
||||
'axes.spines.right': False,
|
||||
'axes.spines.left': True,
|
||||
'axes.spines.bottom': True,
|
||||
|
||||
'text.color': GRAY['700'],
|
||||
|
||||
'xtick.color': GRAY['600'],
|
||||
'ytick.color': GRAY['600'],
|
||||
'xtick.labelsize': 12,
|
||||
'ytick.labelsize': 12,
|
||||
'xtick.major.size': 0,
|
||||
'ytick.major.size': 0,
|
||||
'xtick.minor.size': 0,
|
||||
'ytick.minor.size': 0,
|
||||
'xtick.major.pad': 8,
|
||||
'ytick.major.pad': 8,
|
||||
|
||||
'grid.color': GRAY['200'],
|
||||
'grid.linestyle': '--',
|
||||
'grid.linewidth': 1,
|
||||
|
||||
'legend.frameon': False,
|
||||
'legend.fontsize': 12,
|
||||
'legend.title_fontsize': 14,
|
||||
'legend.facecolor': 'inherit',
|
||||
|
||||
'font.family': 'sans-serif',
|
||||
'font.sans-serif': ['Inter'],
|
||||
'font.weight': 'normal',
|
||||
|
||||
'lines.linewidth': 2,
|
||||
'lines.markersize': 6,
|
||||
})
|
||||
|
||||
# Seaborn specific styles
|
||||
# Use shades of LIME as the primary color palette.
|
||||
# Sorting by integer value of keys, and reversed to have darker shades first.
|
||||
# Excluding very light colors that won't be visible on a light background.
|
||||
lime_palette = [LIME[k] for k in sorted(LIME.keys(), key=int, reverse=True) if k not in ['50', '100', '700', '800', '900', '950',]]
|
||||
|
||||
sns.set_palette(lime_palette)
|
||||
sns.set_style("whitegrid", {
|
||||
'axes.edgecolor': GRAY['300'],
|
||||
'grid.color': GRAY['200'],
|
||||
'grid.linestyle': '--',
|
||||
})
|
Loading…
Add table
Add a link
Reference in a new issue