wip

@@ -1,2 +1,3 @@

- I use Nix. To run a command, prefix it with `nix develop .#impure -c`
- I use uv. To add a package, use `uv add`. To run a script, use `uv run path/to/script`
- To run the pipeline: `uv run -m pipeline.runner`
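Putting these together, the full pipeline invocation from a clean shell is presumably `nix develop .#impure -c uv run -m pipeline.runner`.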
BIN  dist/estimate_distribution_histplot.png (vendored, 75 KiB)
BIN  dist/estimates_lower_vs_upper_scatter.png (vendored, 295 KiB)
BIN  dist/estimates_spread_per_occupation.png (vendored, 279 KiB)
BIN  dist/intermediate/df_tasks.parquet (vendored)
BIN  dist/intermediate/estimable_tasks_with_estimates.parquet (vendored)
BIN  dist/intermediate/task_summary_by_major_occupation.parquet (vendored)
BIN  dist/intermediate/task_summary_by_occupation.parquet (vendored)
BIN  dist/projected_automatable_wage_bill_sensitivity.png (vendored, 239 KiB)
BIN  dist/projected_task_automation_p50.png (vendored, 145 KiB)
BIN  dist/projected_task_automation_p80.png (vendored, 136 KiB)
BIN  dist/sequential_coherence_cdf.png (vendored, 145 KiB)
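The Parquet files under `dist/intermediate/` are the pipeline's intermediate outputs. As a minimal sketch (assuming they are ordinary pandas-readable Parquet; their column schema is not shown in this diff), they can be inspected like this:

```python
import pandas as pd

# Inspect one of the intermediate outputs produced by `uv run -m pipeline.runner`.
# The schema is not documented here, so we only look at shape, dtypes, and a few rows.
df_tasks = pd.read_parquet("dist/intermediate/df_tasks.parquet")
print(df_tasks.shape)
print(df_tasks.dtypes)
print(df_tasks.head())
```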
@@ -1,507 +0,0 @@
import pandas as pd
import litellm
import dotenv
import os
import time
import json
import math
import numpy as np

# Load environment variables (API keys) from .env, as in the classification script
dotenv.load_dotenv(override=True)

# --- Configuration ---
MODEL = "gpt-4.1-mini"  # Make sure this model supports json_schema or structured output
RATE_LIMIT = 5000  # Requests per minute
CHUNK_SIZE = 300
SECONDS_PER_MINUTE = 60
FILENAME = (
    "tasks_with_estimates.csv"  # This CSV should contain the tasks to be processed
)

# --- Prompts and Schema ---
SYSTEM_PROMPT = """
You are an expert assistant evaluating the time required to complete job tasks. Your goal is to estimate the time range needed for a skilled human to complete the following job task remotely, without supervision.

Provide a lower and upper bound estimate for the time to completion. These bounds should capture the time within which approximately 80% of instances of performing this specific task are typically completed by a qualified individual.

Base your estimate on the provided task description, its associated activities, and the occupational context. Your estimate must be in one of the allowed units: minute, hour, day, week, month, trimester, semester, year.
""".strip()

USER_MESSAGE_TEMPLATE = """
Please estimate the time range for the following remote task:

**Task Description:** {task}
**Relevant activities for the task:**
{dwas}

**Occupation Category:** {occupation_title}
**Occupation Description:** {occupation_description}

Consider the complexity and the typical steps involved.
""".strip()
|
||||
|
||||
ALLOWED_UNITS = [
|
||||
"minute",
|
||||
"hour",
|
||||
"day",
|
||||
"week",
|
||||
"month",
|
||||
"trimester",
|
||||
"semester",
|
||||
"year",
|
||||
]
|
||||
|
||||
SCHEMA_FOR_VALIDATION = {
|
||||
"name": "estimate_time",
|
||||
"strict": True, # Enforce schema adherence
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"lower_bound_estimate": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"quantity": {
|
||||
"type": "number",
|
||||
"description": "The numerical value for the lower bound of the estimate.",
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"enum": ALLOWED_UNITS,
|
||||
"description": "The unit of time for the lower bound.",
|
||||
},
|
||||
},
|
||||
"required": ["quantity", "unit"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
"upper_bound_estimate": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"quantity": {
|
||||
"type": "number",
|
||||
"description": "The numerical value for the upper bound of the estimate.",
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"enum": ALLOWED_UNITS,
|
||||
"description": "The unit of time for the upper bound.",
|
||||
},
|
||||
},
|
||||
"required": ["quantity", "unit"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
},
|
||||
"required": ["lower_bound_estimate", "upper_bound_estimate"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
}
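# For reference, a model response that validates against SCHEMA_FOR_VALIDATION
# would look like the following sketch (illustrative values only):
example_estimate = {
    "lower_bound_estimate": {"quantity": 30, "unit": "minute"},
    "upper_bound_estimate": {"quantity": 2, "unit": "hour"},
}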
|
||||
|
||||
|
||||
def save_dataframe(df_to_save, filename):
|
||||
|
||||
"""Saves the DataFrame to the specified CSV file using atomic write."""
|
||||
try:
|
||||
temp_filename = filename + ".tmp"
|
||||
df_to_save.to_csv(temp_filename, encoding="utf-8-sig", index=False)
|
||||
os.replace(temp_filename, filename)
|
||||
except Exception as e:
|
||||
print(f"--- Error saving DataFrame to {filename}: {e} ---")
|
||||
if os.path.exists(temp_filename):
|
||||
try:
|
||||
os.remove(temp_filename)
|
||||
except Exception as remove_err:
|
||||
print(
|
||||
f"--- Error removing temporary save file {temp_filename}: {remove_err} ---"
|
||||
)
|
||||
|
||||
def create_task_estimates():
|
||||
try:
|
||||
# Read the CSV
|
||||
if os.path.exists(FILENAME):
|
||||
df = pd.read_csv(FILENAME, encoding="utf-8-sig")
|
||||
print(f"Successfully read {len(df)} rows from {FILENAME}.")
|
||||
|
||||
estimate_columns_spec = {
|
||||
"lb_estimate_qty": float,
|
||||
"lb_estimate_unit": object,
|
||||
"ub_estimate_qty": float,
|
||||
"ub_estimate_unit": object,
|
||||
}
|
||||
save_needed = False
|
||||
|
||||
for col_name, target_dtype in estimate_columns_spec.items():
|
||||
if col_name not in df.columns:
|
||||
# Initialize with a type-compatible missing value
|
||||
if target_dtype == float:
|
||||
df[col_name] = np.nan
|
||||
else: # object
|
||||
df[col_name] = pd.NA
|
||||
df[col_name] = df[col_name].astype(target_dtype) # Enforce dtype
|
||||
print(f"Added '{col_name}' column as {df[col_name].dtype}.")
|
||||
save_needed = True
|
||||
else:
|
||||
# Column exists, ensure correct dtype
|
||||
current_pd_dtype = df[col_name].dtype
|
||||
expected_pd_dtype = pd.Series(dtype=target_dtype).dtype
|
||||
|
||||
if current_pd_dtype != expected_pd_dtype:
|
||||
try:
|
||||
if target_dtype == float:
|
||||
df[col_name] = pd.to_numeric(df[col_name], errors="coerce")
|
||||
else: # object
|
||||
df[col_name] = df[col_name].astype(object)
|
||||
print(
|
||||
f"Corrected dtype of '{col_name}' to {df[col_name].dtype}."
|
||||
)
|
||||
save_needed = True
|
||||
except Exception as e:
|
||||
print(
|
||||
f"Warning: Could not convert column '{col_name}' to {target_dtype}: {e}. Current dtype: {current_pd_dtype}"
|
||||
)
|
||||
|
||||
# Standardize missing values (e.g., empty strings to NA/NaN)
|
||||
# Replace common missing placeholders with pd.NA first
|
||||
df[col_name] = df[col_name].replace(["", None], pd.NA)
|
||||
if target_dtype == float:
|
||||
# For float columns, ensure they are numeric and use np.nan after replacement
|
||||
df[col_name] = pd.to_numeric(df[col_name], errors="coerce")
|
||||
|
||||
if save_needed:
|
||||
print(f"Saving {FILENAME} after adding/adjusting estimate columns.")
|
||||
save_dataframe(df, FILENAME)
|
||||
else:
|
||||
print(
|
||||
f"Error: {FILENAME} not found. Please ensure the file exists and contains task data."
|
||||
)
|
||||
exit()
|
||||
except FileNotFoundError:
|
||||
print(
|
||||
f"Error: {FILENAME} not found. Please ensure the file exists and contains task data."
|
||||
)
|
||||
exit()
|
||||
except Exception as e:
|
||||
print(f"Error reading or initializing {FILENAME}: {e}")
|
||||
exit()
|
||||
|
||||
# --- Identify Rows to Process ---
|
||||
# We'll check for NaN in one of the primary quantity columns.
|
||||
unprocessed_mask = df["lb_estimate_qty"].isna()
|
||||
if unprocessed_mask.any():
|
||||
start_index = unprocessed_mask.idxmax() # Finds the index of the first True value
|
||||
print(f"Resuming processing. First unprocessed row found at index {start_index}.")
|
||||
df_to_process = df.loc[unprocessed_mask].copy()
|
||||
original_indices = df_to_process.index # Keep track of original indices
|
||||
else:
|
||||
print(
|
||||
"All rows seem to have estimates already (based on 'lb_estimate_qty'). Exiting."
|
||||
)
|
||||
exit()
|
||||
|
||||
|
||||
# --- Prepare messages for batch completion (only for rows needing processing) ---
|
||||
messages_list = []
|
||||
skipped_rows_indices = []
|
||||
valid_original_indices = []
|
||||
|
||||
if not df_to_process.empty:
|
||||
required_cols = ["task", "occupation_title", "occupation_description", "dwas"]
|
||||
print(
|
||||
f"Preparing messages for up to {len(df_to_process)} rows starting from original index {original_indices[0] if len(original_indices) > 0 else 'N/A'}..."
|
||||
)
|
||||
print(f"Checking for required columns: {required_cols}")
|
||||
|
||||
for index, row in df_to_process.iterrows():
|
||||
missing_or_empty = []
|
||||
for col in required_cols:
|
||||
if col not in row or pd.isna(row[col]) or str(row[col]).strip() == "":
|
||||
missing_or_empty.append(col)
|
||||
|
||||
if missing_or_empty:
|
||||
print(
|
||||
f"Warning: Skipping row original index {index} due to missing/empty required data in columns: {', '.join(missing_or_empty)}."
|
||||
)
|
||||
skipped_rows_indices.append(index)
|
||||
continue
|
||||
|
||||
try:
|
||||
user_message = USER_MESSAGE_TEMPLATE.format(
|
||||
task=row["task"],
|
||||
occupation_title=row["occupation_title"],
|
||||
occupation_description=row["occupation_description"],
|
||||
dwas=row["dwas"],
|
||||
)
|
||||
except KeyError as e:
|
||||
print(
|
||||
f"Error: Skipping row original index {index} due to formatting error - missing key: {e}. Check USER_MESSAGE_TEMPLATE and CSV columns."
|
||||
)
|
||||
skipped_rows_indices.append(index)
|
||||
continue
|
||||
|
||||
messages_for_row = [
|
||||
{"role": "system", "content": SYSTEM_PROMPT},
|
||||
{"role": "user", "content": user_message},
|
||||
]
|
||||
messages_list.append(messages_for_row)
|
||||
valid_original_indices.append(index) # This is the original DataFrame index
|
||||
|
||||
print(
|
||||
f"Prepared {len(messages_list)} valid message sets for batch completion (skipped {len(skipped_rows_indices)} rows)."
|
||||
)
|
||||
if not messages_list:
|
||||
print("No valid rows found to process after checking required data. Exiting.")
|
||||
exit()
|
||||
else:
|
||||
print(
|
||||
"No rows found needing processing (df_to_process is empty)."
|
||||
) # Should have been caught by earlier check
|
||||
exit()
|
||||
|
||||
|
||||
# --- Call batch_completion in chunks with rate limiting and periodic saving ---
|
||||
total_messages_to_send = len(messages_list)
|
||||
num_chunks = math.ceil(total_messages_to_send / CHUNK_SIZE)
|
||||
|
||||
print(
|
||||
f"\nStarting batch completion for {total_messages_to_send} items in {num_chunks} chunks..."
|
||||
)
|
||||
|
||||
overall_start_time = time.time()
|
||||
processed_count_total = 0
|
||||
|
||||
for i in range(num_chunks):
|
||||
chunk_start_message_index = i * CHUNK_SIZE
|
||||
chunk_end_message_index = min((i + 1) * CHUNK_SIZE, total_messages_to_send)
|
||||
message_chunk = messages_list[chunk_start_message_index:chunk_end_message_index]
|
||||
# Get corresponding original DataFrame indices for this chunk
|
||||
chunk_original_indices = valid_original_indices[
|
||||
chunk_start_message_index:chunk_end_message_index
|
||||
]
|
||||
|
||||
if not message_chunk:
|
||||
continue
|
||||
|
||||
min_idx_disp = min(chunk_original_indices) if chunk_original_indices else "N/A"
|
||||
max_idx_disp = max(chunk_original_indices) if chunk_original_indices else "N/A"
|
||||
print(
|
||||
f"\nProcessing chunk {i + 1}/{num_chunks} (Messages {chunk_start_message_index + 1}-{chunk_end_message_index} of this run)..."
|
||||
f" Corresponding to original indices: {min_idx_disp} - {max_idx_disp}"
|
||||
)
|
||||
chunk_start_time = time.time()
|
||||
responses = []
|
||||
try:
|
||||
print(f"Sending {len(message_chunk)} requests for chunk {i + 1}...")
|
||||
responses = litellm.batch_completion(
|
||||
model=MODEL,
|
||||
messages=message_chunk,
|
||||
response_format={
|
||||
"type": "json_schema",
|
||||
"json_schema": SCHEMA_FOR_VALIDATION,
|
||||
},
|
||||
num_retries=3,
|
||||
# request_timeout=60 # Optional: uncomment if needed
|
||||
)
|
||||
print(f"Chunk {i + 1} API call completed.")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error during litellm.batch_completion for chunk {i + 1}: {e}")
|
||||
responses = [None] * len(
|
||||
message_chunk
|
||||
) # Ensure responses list matches message_chunk length for processing loop
|
||||
|
||||
# --- Process responses for the current chunk ---
|
||||
chunk_updates = {} # To store {original_df_index: {qty/unit data}}
|
||||
successful_in_chunk = 0
|
||||
failed_in_chunk = 0
|
||||
|
||||
if responses and len(responses) == len(message_chunk):
|
||||
for j, response in enumerate(responses):
|
||||
original_df_index = chunk_original_indices[j]
|
||||
|
||||
# Initialize values for this item
|
||||
lb_qty_val, lb_unit_val, ub_qty_val, ub_unit_val = None, None, None, None
|
||||
content_str = None
|
||||
|
||||
if response is None:
|
||||
print(
|
||||
f"Skipping processing for original index {original_df_index} due to API call failure for this item (response is None)."
|
||||
)
|
||||
failed_in_chunk += 1
|
||||
continue
|
||||
|
||||
try:
|
||||
if (
|
||||
response.choices
|
||||
and response.choices[0].message
|
||||
and response.choices[0].message.content
|
||||
):
|
||||
content_str = response.choices[0].message.content
|
||||
estimate_data = json.loads(content_str) # Can raise JSONDecodeError
|
||||
|
||||
lower_bound_dict = estimate_data.get("lower_bound_estimate")
|
||||
upper_bound_dict = estimate_data.get("upper_bound_estimate")
|
||||
|
||||
valid_response_structure = isinstance(
|
||||
lower_bound_dict, dict
|
||||
) and isinstance(upper_bound_dict, dict)
|
||||
|
||||
if valid_response_structure:
|
||||
lb_qty_raw = lower_bound_dict.get("quantity")
|
||||
lb_unit_raw = lower_bound_dict.get("unit")
|
||||
ub_qty_raw = upper_bound_dict.get("quantity")
|
||||
ub_unit_raw = upper_bound_dict.get("unit")
|
||||
|
||||
is_valid_item = True
|
||||
# Validate LB Qty
|
||||
if (
|
||||
not isinstance(lb_qty_raw, (int, float))
|
||||
or math.isnan(float(lb_qty_raw))
|
||||
or float(lb_qty_raw) < 0
|
||||
):
|
||||
print(
|
||||
f"Warning: Invalid lb_quantity for original index {original_df_index}: {lb_qty_raw}"
|
||||
)
|
||||
is_valid_item = False
|
||||
else:
|
||||
lb_qty_val = float(lb_qty_raw)
|
||||
|
||||
# Validate UB Qty
|
||||
if (
|
||||
not isinstance(ub_qty_raw, (int, float))
|
||||
or math.isnan(float(ub_qty_raw))
|
||||
or float(ub_qty_raw) < 0
|
||||
):
|
||||
print(
|
||||
f"Warning: Invalid ub_quantity for original index {original_df_index}: {ub_qty_raw}"
|
||||
)
|
||||
is_valid_item = False
|
||||
else:
|
||||
ub_qty_val = float(ub_qty_raw)
|
||||
|
||||
# Validate Units
|
||||
if lb_unit_raw not in ALLOWED_UNITS:
|
||||
print(
|
||||
f"Warning: Invalid lb_unit for original index {original_df_index}: '{lb_unit_raw}'"
|
||||
)
|
||||
is_valid_item = False
|
||||
else:
|
||||
lb_unit_val = lb_unit_raw
|
||||
|
||||
if ub_unit_raw not in ALLOWED_UNITS:
|
||||
print(
|
||||
f"Warning: Invalid ub_unit for original index {original_df_index}: '{ub_unit_raw}'"
|
||||
)
|
||||
is_valid_item = False
|
||||
else:
|
||||
ub_unit_val = ub_unit_raw
|
||||
|
||||
if is_valid_item:
|
||||
successful_in_chunk += 1
|
||||
chunk_updates[original_df_index] = {
|
||||
"lb_estimate_qty": lb_qty_val,
|
||||
"lb_estimate_unit": lb_unit_val,
|
||||
"ub_estimate_qty": ub_qty_val,
|
||||
"ub_estimate_unit": ub_unit_val,
|
||||
}
|
||||
else:
|
||||
failed_in_chunk += (
|
||||
1 # Values remain None if not fully valid
|
||||
)
|
||||
else:
|
||||
print(
|
||||
f"Warning: Missing or malformed estimate dicts in JSON for original index {original_df_index}. Content: '{content_str}'"
|
||||
)
|
||||
failed_in_chunk += 1
|
||||
else:
|
||||
finish_reason = (
|
||||
response.choices[0].finish_reason
|
||||
if (response.choices and response.choices[0].finish_reason)
|
||||
else "unknown"
|
||||
)
|
||||
error_message = (
|
||||
response.choices[0].message.content
|
||||
if (
|
||||
response.choices
|
||||
and response.choices[0].message
|
||||
and response.choices[0].message.content
|
||||
)
|
||||
else "No content in message."
|
||||
)
|
||||
print(
|
||||
f"Warning: Received non-standard or empty response content for original index {original_df_index}. "
|
||||
f"Finish Reason: '{finish_reason}'. Message: '{error_message}'. Raw Choices: {response.choices}"
|
||||
)
|
||||
failed_in_chunk += 1
|
||||
|
||||
except json.JSONDecodeError:
|
||||
print(
|
||||
f"Warning: Could not decode JSON for original index {original_df_index}. Content received: '{content_str}'"
|
||||
)
|
||||
failed_in_chunk += 1
|
||||
except AttributeError as ae:
|
||||
print(
|
||||
f"Warning: Missing expected attribute processing response for original index {original_df_index}: {ae}. Response: {response}"
|
||||
)
|
||||
failed_in_chunk += 1
|
||||
except Exception as e:
|
||||
print(
|
||||
f"Warning: An unexpected error occurred processing response for original index {original_df_index}: {type(e).__name__} - {e}. Response: {response}"
|
||||
)
|
||||
failed_in_chunk += 1
|
||||
else:
|
||||
print(
|
||||
f"Warning: Mismatch between number of responses ({len(responses) if responses else 0}) "
|
||||
f"and messages sent ({len(message_chunk)}) for chunk {i + 1}, or no responses. Marking all as failed."
|
||||
)
|
||||
failed_in_chunk = len(
|
||||
message_chunk
|
||||
) # All items in this chunk are considered failed if response array is problematic
|
||||
|
||||
print(
|
||||
f"Chunk {i + 1} processing summary: Success={successful_in_chunk}, Failed/Skipped={failed_in_chunk}"
|
||||
)
|
||||
processed_count_total += successful_in_chunk
|
||||
|
||||
# --- Update Main DataFrame and Save Periodically ---
|
||||
if chunk_updates:
|
||||
print(
|
||||
f"Updating main DataFrame with {len(chunk_updates)} new estimates for chunk {i + 1}..."
|
||||
)
|
||||
for idx, estimates in chunk_updates.items():
|
||||
if idx in df.index:
|
||||
df.loc[idx, "lb_estimate_qty"] = estimates["lb_estimate_qty"]
|
||||
df.loc[idx, "lb_estimate_unit"] = estimates["lb_estimate_unit"]
|
||||
df.loc[idx, "ub_estimate_qty"] = estimates["ub_estimate_qty"]
|
||||
df.loc[idx, "ub_estimate_unit"] = estimates["ub_estimate_unit"]
|
||||
|
||||
print(f"Saving progress to {FILENAME}...")
|
||||
save_dataframe(df, FILENAME)
|
||||
else:
|
||||
print(f"No successful estimates obtained in chunk {i + 1} to save.")
|
||||
|
||||
# --- Rate Limiting Pause ---
|
||||
chunk_end_time = time.time()
|
||||
chunk_duration = chunk_end_time - chunk_start_time
|
||||
print(f"Chunk {i + 1} took {chunk_duration:.2f} seconds.")
|
||||
|
||||
if i < num_chunks - 1: # No pause after the last chunk
|
||||
# Calculate ideal time per request based on rate limit
|
||||
time_per_request = SECONDS_PER_MINUTE / RATE_LIMIT if RATE_LIMIT > 0 else 0
|
||||
# Calculate minimum duration this chunk should have taken to respect rate limit
|
||||
min_chunk_duration_for_rate = len(message_chunk) * time_per_request
|
||||
# Calculate pause needed
|
||||
pause_needed = max(0, min_chunk_duration_for_rate - chunk_duration)
|
||||
|
||||
if pause_needed > 0:
|
||||
print(
|
||||
f"Pausing for {pause_needed:.2f} seconds to respect rate limit ({RATE_LIMIT}/min)..."
|
||||
)
|
||||
time.sleep(pause_needed)
|
||||
|
||||
overall_end_time = time.time()
|
||||
total_duration_minutes = (overall_end_time - overall_start_time) / 60
|
||||
print(
|
||||
f"\nBatch completion finished."
|
||||
f" Processed {processed_count_total} new estimates in this run in {total_duration_minutes:.2f} minutes."
|
||||
)
|
||||
|
||||
print(f"Performing final save to {FILENAME}...")
|
||||
save_dataframe(df, FILENAME)
|
||||
|
||||
print("\nScript finished.")
|
old/analysis.py (528 lines deleted)

@@ -1,528 +0,0 @@
import os
|
||||
import litellm
|
||||
import sqlite3
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from google.colab import userdata, files
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib as mpl
|
||||
|
||||
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')
|
||||
os.environ['GEMINI_API_KEY'] = userdata.get('GEMINI_API_KEY')
|
||||
|
||||
occupation_major_codes = {
|
||||
'11': 'Management',
|
||||
'13': 'Business and Financial Operations',
|
||||
'15': 'Computer and Mathematical Occupations',
|
||||
'17': 'Architecture and Engineering',
|
||||
'19': 'Life, Physical, and Social Science',
|
||||
'21': 'Community and Social Services',
|
||||
'23': 'Legal',
|
||||
'25': 'Education, Training, and Library',
|
||||
'27': 'Arts, Design, Entertainment, Sports, and Media',
|
||||
'29': 'Healthcare Practitioners and Technical',
|
||||
'31': 'Healthcare Support',
|
||||
'33': 'Protective Service',
|
||||
'35': 'Food Preparation and Serving Related',
|
||||
'37': 'Building and Grounds Cleaning and Maintenance',
|
||||
'39': 'Personal Care and Service',
|
||||
'41': 'Sales and Related',
|
||||
'43': 'Office and Administrative Support',
|
||||
'45': 'Farming, Fishing, and Forestry',
|
||||
'47': 'Construction and Extraction',
|
||||
'49': 'Installation, Maintenance, and Repair',
|
||||
'51': 'Production',
|
||||
'53': 'Transportation and Material Moving',
|
||||
'55': 'Military Specific'
|
||||
}
|
||||
|
||||
gray = {'50':'#f8fafc','100':'#f1f5f9','200':'#e2e8f0',
|
||||
'300':'#cbd5e1','400':'#94a3b8','500':'#64748b',
|
||||
'600':'#475569','700':'#334155','800':'#1e293b',
|
||||
'900':'#0f172a','950':'#020617'}
|
||||
lime = {'50': '#f7fee7','100': '#ecfcca','200': '#d8f999',
|
||||
'300': '#bbf451','400': '#9ae600','500': '#83cd00',
|
||||
'600': '#64a400','700': '#497d00','800': '#3c6300',
|
||||
'900': '#35530e','950': '#192e03'}
|
||||
|
||||
mpl.rcParams.update({
|
||||
'figure.facecolor' : gray['50'],
|
||||
'axes.facecolor' : gray['50'],
|
||||
'axes.edgecolor' : gray['100'],
|
||||
'axes.labelcolor' : gray['700'],
|
||||
'xtick.color' : gray['700'],
|
||||
'ytick.color' : gray['700'],
|
||||
'font.family' : 'Inter', # falls back to DejaVu if Inter not present
|
||||
'font.size' : 11,
|
||||
})
|
||||
|
||||
sns.set_style("white") # keep minimal axes, we will remove default grid
|
||||
sns.set_context("notebook")
|
||||
|
||||
def prepare_tasks():
|
||||
# This dataset comes from https://epoch.ai/gradient-updates/consequences-of-automating-remote-work
|
||||
# It contains labels for a O*NET task can be done remotely or not (labeled by GPT-4o)
|
||||
# You can download it here: https://drive.google.com/file/d/1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r/view?usp=sharing
|
||||
df_remote_status = pd.read_csv("epoch_task_data.csv")
|
||||
|
||||
# BLS OEWS: https://www.bls.gov/oes/special-requests/oesm23nat.zip
|
||||
df_oesm = pd.read_excel("oesm23national.xlsx")
|
||||
|
||||
# Run uv run ./enrich_task_ratings.py
|
||||
df_tasks = pd.read_json("task_ratings_enriched.json")
|
||||
|
||||
# Run uv run classify_estimateability_of_tasks.py
|
||||
df_task_estimateable = pd.read_csv("tasks_estimateable.csv").rename(columns={"task_estimateable": "estimateable"}).drop_duplicates(subset=['task'], keep='first')
|
||||
|
||||
# df_tasks now has a remote_status column which contains either "remote" or "not remote"
|
||||
df_tasks = pd.merge(df_tasks, df_remote_status[['Task', 'Remote']], left_on='task', right_on='Task', how='left')
|
||||
df_tasks = df_tasks.drop('Task', axis=1).rename(columns={'Remote': 'remote_status'})
|
||||
|
||||
# df_tasks now has a estimateable column which contains either "ATOMIC" or "ONGOING-CONSTRAINT"
|
||||
df_tasks = pd.merge(df_tasks, df_task_estimateable[['task', 'estimateable']], on='task', how='left')
|
||||
|
||||
df_tasks = df_tasks[df_tasks['importance_average'] < 3].copy()
|
||||
|
||||
df_tasks['onetsoc_major'] = df_tasks['onetsoc_code'].str[:2]
|
||||
|
||||
df_remote_tasks = df_tasks[df_tasks['remote_status'] == 'remote'].copy()
|
||||
|
||||
# Call create_task_estimates() from add_task_estimates? which creates tasks_with_estimates.csv
|
||||
|
||||
def preprocessing_time_estimates():
|
||||
df = pd.read_csv("tasks_with_estimates.csv")
|
||||
|
||||
df = df[df['importance_average'] > 3].copy()
|
||||
|
||||
# The embeddings comes from running `uv run ./embed_task_description.py`
|
||||
# Columns: ['embedding_id', 'task', 'embedding_vector']
|
||||
# These contain embedding for UNIQUE tasks
|
||||
df_task_embeddings = pd.read_parquet("tasks_with_embeddings.parquet").drop_duplicates(subset=['task'])[['task', 'task_embedding']].rename(columns={"task_embedding": "embedding_vector"}).copy()
|
||||
|
||||
df = pd.merge(df, df_task_embeddings[['task', 'embedding_vector']], on='task', how='left')
|
||||
# Re-load the estimateability classification (also loaded in prepare_tasks) so this function is self-contained
df_task_estimateable = pd.read_csv("tasks_estimateable.csv").rename(columns={"task_estimateable": "estimateable"}).drop_duplicates(subset=['task'], keep='first')
df = pd.merge(df, df_task_estimateable[['task', 'estimateable']], on='task', how='left')
|
||||
|
||||
df['onetsoc_major'] = df['onetsoc_code'].str[:2]
|
||||
|
||||
def convert_to_minutes(qty, unit):
|
||||
"""Converts a quantity in a given unit to minutes."""
|
||||
return qty * {
|
||||
"minute": 1,
|
||||
"hour": 60,
|
||||
"day": 60 * 24,
|
||||
"week": 60 * 24 * 7,
|
||||
"month": 60 * 24 * 30,
|
||||
"trimester": 60 * 24 * 90,
|
||||
"semester": 60 * 24 * 180,
|
||||
"year": 60 * 24 * 365,
|
||||
}[unit]
|
||||
|
||||
df['lb_estimate_in_minutes'] = df.apply(
|
||||
lambda row: convert_to_minutes(row['lb_estimate_qty'], row['lb_estimate_unit']), axis=1
|
||||
)
|
||||
df['ub_estimate_in_minutes'] = df.apply(
|
||||
lambda row: convert_to_minutes(row['ub_estimate_qty'], row['ub_estimate_unit']), axis=1
|
||||
)
|
||||
|
||||
df['estimate_range'] = df.ub_estimate_in_minutes - df.lb_estimate_in_minutes
|
||||
df['estimate_ratio'] = df.ub_estimate_in_minutes / df.lb_estimate_in_minutes
|
||||
df['estimate_midpoint'] = (df.lb_estimate_in_minutes + df.ub_estimate_in_minutes)/2
|
||||
|
||||
atomic_tasks = df[df['estimateable'] == 'ATOMIC']
|
||||
ongoing_tasks = df[df['estimateable'] == 'ONGOING-CONSTRAINT']
|
||||
|
||||
with pd.option_context('display.max_columns', None):
|
||||
display(df)
|
||||
|
||||
# Check for empty estimates
|
||||
if atomic_tasks['lb_estimate_in_minutes'].isnull().sum() > 0:
|
||||
print("Missing values in 'lb_estimate_in_minutes':", atomic_tasks['lb_estimate_in_minutes'].isnull().sum())
|
||||
|
||||
if atomic_tasks['ub_estimate_in_minutes'].isnull().sum() > 0:
|
||||
print("Missing values in 'ub_estimate_in_minutes':", atomic_tasks['ub_estimate_in_minutes'].isnull().sum())
|
||||
|
||||
# Check for impossible bounds
|
||||
impossible_bounds = atomic_tasks[
|
||||
(atomic_tasks['lb_estimate_in_minutes'] <= 0) |
|
||||
(atomic_tasks['ub_estimate_in_minutes'] <= 0) |
|
||||
(atomic_tasks['lb_estimate_in_minutes'] > atomic_tasks['ub_estimate_in_minutes'])
|
||||
]
|
||||
if not impossible_bounds.empty:
|
||||
print(f"Error: Found rows with impossible bounds.")
|
||||
with pd.option_context('display.max_colwidth', None):
|
||||
display(impossible_bounds[['task', 'lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'dwas']])
|
||||
|
||||
#with pd.option_context('display.max_colwidth', None):
|
||||
#display(atomic_tasks.nlargest(20, 'ub_estimate_in_minutes')[['task', 'lb_estimate_qty', 'lb_estimate_unit', 'lb_estimate_in_minutes', 'ub_estimate_qty', 'ub_estimate_unit', 'ub_estimate_in_minutes', 'estimate_ratio']])
|
||||
|
||||
def cell1():
|
||||
sns.histplot(atomic_tasks.estimate_midpoint, log_scale=True)
|
||||
|
||||
def cell2():
|
||||
plt.figure(figsize=(14,10))
|
||||
sns.boxplot(
|
||||
data=atomic_tasks,
|
||||
x='onetsoc_major', # 11 = Management, 15 = Computer/Math, …
|
||||
y='estimate_range',
|
||||
showfliers=False
|
||||
)
|
||||
plt.yscale('log') # long tail => log scale
|
||||
plt.xlabel('Occupation')
|
||||
plt.ylabel('Range (upper-lower, minutes)')
|
||||
plt.title('Spread of time-range estimates per occupation')
|
||||
|
||||
ax = plt.gca()
|
||||
ax.set_xticklabels([occupation_major_codes[code.get_text()] for code in ax.get_xticklabels()], rotation=60, ha='right')
|
||||
|
||||
def cell3():
|
||||
plt.figure(figsize=(10, 10))
|
||||
ax = sns.scatterplot(
|
||||
data=atomic_tasks.replace({'onetsoc_major': occupation_major_codes}), # Replace codes with labels
|
||||
x='lb_estimate_in_minutes', y='ub_estimate_in_minutes',
|
||||
alpha=0.2, edgecolor=None, hue="onetsoc_major" # Use the labeled column for hue
|
||||
)
|
||||
|
||||
# 45° reference
|
||||
lims = (1, atomic_tasks[['lb_estimate_in_minutes','ub_estimate_in_minutes']].max().max())
|
||||
ax.plot(lims, lims, color='black', linestyle='--', linewidth=1)
|
||||
|
||||
# optional helper lines: 2× and 10×, 100× ratios
|
||||
for k in [2,10, 100]:
|
||||
ax.plot(lims, [k*l for l in lims],
|
||||
linestyle=':', color='grey', linewidth=1)
|
||||
|
||||
ax.set(xscale='log', yscale='log')
|
||||
ax.set_xlabel('Lower-bound (min, log scale)')
|
||||
ax.set_ylabel('Upper-bound (min, log scale)')
|
||||
ax.set_title('Lower vs upper estimates for all tasks')
|
||||
|
||||
# Place the legend outside the plot
|
||||
ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
|
||||
|
||||
def cell4():
|
||||
plt.figure(figsize=(8,4))
|
||||
sns.histplot(np.log10(atomic_tasks['estimate_ratio'].replace([np.inf, -np.inf], np.nan).dropna()),
|
||||
bins=60, kde=True)
|
||||
plt.axvline(np.log10(10), color='red', ls='--', lw=1, label='10×')
|
||||
plt.axvline(np.log10(1.05), color='orange', ls='--', lw=1, label='1.05×')
|
||||
plt.axvline(0, color='black', ls='-', lw=1) # ub = lb
|
||||
plt.xlabel('log₁₀(upper / lower)')
|
||||
plt.ylabel('Count')
|
||||
plt.title('Distribution of upper:lower ratio')
|
||||
plt.legend()
|
||||
plt.tight_layout()
|
||||
|
||||
|
||||
def cell5():
|
||||
# 1. Bin lower bounds into quartiles (Q1–Q4)
|
||||
atomic_tasks['lb_q'] = pd.qcut(atomic_tasks.lb_estimate_in_minutes,
|
||||
q=4, labels=['Q1 shortest','Q2','Q3','Q4 longest'])
|
||||
|
||||
|
||||
# 3. Aggregate: median (or mean) ratio per cell
|
||||
pivot = atomic_tasks.pivot_table(index='onetsoc_major', columns='lb_q',
|
||||
values='estimate_ratio', aggfunc='median')
|
||||
|
||||
# Map the index (onetsoc_major codes) to their corresponding labels
|
||||
pivot.index = pivot.index.map(occupation_major_codes)
|
||||
|
||||
|
||||
# 4. Visualise
|
||||
plt.figure(figsize=(10,8))
|
||||
sns.heatmap(pivot, cmap='RdYlGn_r', center=2, annot=True, fmt='.1f',
|
||||
cbar_kws={'label':'Median upper/lower ratio'})
|
||||
plt.xlabel('Lower-bound quartile')
|
||||
plt.ylabel('Occupation (major group)')
|
||||
plt.title('Typical range width by occupation and task length')
|
||||
plt.tight_layout()
|
||||
|
||||
|
||||
|
||||
def cell6():
|
||||
"""
|
||||
from scipy.stats import median_abs_deviation
|
||||
|
||||
def mad_z(series):
|
||||
med = series.median()
|
||||
mad = median_abs_deviation(series, scale='normal') # ⇒ comparable to σ
|
||||
return (series - med) / mad
|
||||
|
||||
df['robust_z'] = df.groupby('onetsoc_code')['estimate_midpoint'].transform(mad_z)
|
||||
"""
|
||||
|
||||
agg = (atomic_tasks
|
||||
.groupby('onetsoc_code')['estimate_midpoint']
|
||||
.agg(median='median',
|
||||
q1=lambda x: x.quantile(.25),
|
||||
q3=lambda x: x.quantile(.75),
|
||||
mean='mean',
|
||||
std='std')
|
||||
.reset_index())
|
||||
agg['IQR'] = agg.q3 - agg.q1
|
||||
agg['CV'] = agg['std'] / agg['mean'] # coefficient of variation
|
||||
|
||||
# merge back the group mean and std so each row can be scored
|
||||
atomic_tasks = atomic_tasks.merge(agg[['onetsoc_code','mean','std']], on='onetsoc_code')
|
||||
|
||||
|
||||
atomic_tasks['z'] = (atomic_tasks.estimate_midpoint - atomic_tasks['mean']) / atomic_tasks['std']
|
||||
outliers = atomic_tasks.loc[atomic_tasks.z.abs() > 3]
|
||||
outliers
|
||||
|
||||
def cell7():
|
||||
from scipy.stats import median_abs_deviation
|
||||
|
||||
def mad_z(series):
|
||||
med = series.median()
|
||||
mad = median_abs_deviation(series, scale='normal') # ⇒ comparable to σ
|
||||
return (series - med) / mad
|
||||
|
||||
atomic_tasks['robust_z'] = atomic_tasks.groupby('onetsoc_code')['estimate_midpoint'].transform(mad_z)
|
||||
|
||||
def cell10():
|
||||
import matplotlib.ticker as mtick # For percentage formatting
|
||||
import matplotlib.colors as mcolors # For color conversion
|
||||
|
||||
summary_data = []
|
||||
|
||||
for code, label in occupation_major_codes.items():
|
||||
occ_df = df_tasks[df_tasks['onetsoc_major'] == code]
|
||||
total_tasks_in_occ = len(occ_df)
|
||||
|
||||
if total_tasks_in_occ == 0:
|
||||
continue # Skip if no tasks for this occupation
|
||||
|
||||
# Stack 1: % that isn't equal to "remote"
|
||||
not_remote_count = len(occ_df[occ_df['remote_status'] != 'remote'])
|
||||
|
||||
# For the remaining remote tasks:
|
||||
remote_df = occ_df[occ_df['remote_status'] == 'remote']
|
||||
|
||||
# Stack 2: % of remote + ATOMIC
|
||||
remote_atomic_count = len(remote_df[remote_df['estimateable'] == 'ATOMIC'])
|
||||
|
||||
# Stack 3: % of remote + ONGOING-CONSTRAINT
|
||||
remote_ongoing_count = len(remote_df[remote_df['estimateable'] == 'ONGOING-CONSTRAINT'])
|
||||
|
||||
summary_data.append({
|
||||
'onetsoc_major_code': code,
|
||||
'occupation_label': label,
|
||||
'count_not_remote': not_remote_count,
|
||||
'count_remote_atomic': remote_atomic_count,
|
||||
'count_remote_ongoing': remote_ongoing_count,
|
||||
'total_tasks': total_tasks_in_occ
|
||||
})
|
||||
|
||||
summary_df = pd.DataFrame(summary_data)
|
||||
|
||||
# --- 3. Calculate Percentages ---
|
||||
# Ensure total_tasks is not zero to avoid division by zero errors if an occupation had no tasks
|
||||
summary_df = summary_df[summary_df['total_tasks'] > 0].copy() # Use .copy() to avoid SettingWithCopyWarning
|
||||
|
||||
summary_df['pct_not_remote'] = (summary_df['count_not_remote'] / summary_df['total_tasks']) * 100
|
||||
summary_df['pct_remote_atomic'] = (summary_df['count_remote_atomic'] / summary_df['total_tasks']) * 100
|
||||
summary_df['pct_remote_ongoing'] = (summary_df['count_remote_ongoing'] / summary_df['total_tasks']) * 100
|
||||
|
||||
# Select columns for plotting and set index to occupation label
|
||||
plot_df = summary_df.set_index('occupation_label')[
|
||||
['pct_not_remote', 'pct_remote_atomic', 'pct_remote_ongoing']
|
||||
]
|
||||
|
||||
# Rename columns for a clearer legend
|
||||
plot_df.columns = ['Not Remote', 'Remote + Estimable', 'Remote + Not estimable']
|
||||
|
||||
plot_df = plot_df.sort_values(by='Not Remote', ascending=False)
|
||||
|
||||
|
||||
# --- 4. Plotting (Modified) ---
|
||||
|
||||
# Define the custom colors based on your requirements
|
||||
# The order must match the column order in plot_df:
|
||||
# 1. 'Not Remote'
|
||||
# 2. 'Remote & ATOMIC'
|
||||
# 3. 'Remote & ONGOING-CONSTRAINT'
|
||||
bar_colors = [gray["300"], lime["500"], lime["200"]]
|
||||
|
||||
fig, ax = plt.subplots(figsize=(14, 10)) # Adjusted figsize for better readability
|
||||
|
||||
plot_df.plot(kind='barh', stacked=True, ax=ax, color=bar_colors)
|
||||
|
||||
ax.set_xlabel("Percentage of Tasks (%)", fontsize=12)
|
||||
ax.set_ylabel("Occupation Major Group", fontsize=12)
|
||||
ax.set_title("Task Breakdown by Occupation, Remote Status, and Estimateability", fontsize=14, pad=20)
|
||||
|
||||
# Format x-axis as percentages
|
||||
ax.xaxis.set_major_formatter(mtick.PercentFormatter())
|
||||
plt.xlim(0, 100) # Ensure x-axis goes from 0 to 100%
|
||||
|
||||
# Remove right and top spines
|
||||
ax.spines['right'].set_visible(False)
|
||||
ax.spines['top'].set_visible(False)
|
||||
|
||||
# Function to get contrasting text color
|
||||
def get_contrasting_text_color(bg_color_hex_or_rgba):
|
||||
"""
|
||||
Determines if black or white text provides better contrast against a given background color.
|
||||
bg_color_hex_or_rgba: A hex string (e.g., '#RRGGBB') or an RGBA tuple (values in [0, 1]).
|
||||
Returns: 'black' or 'white'.
|
||||
"""
|
||||
# Convert to RGBA if it's a hex string or name
|
||||
if isinstance(bg_color_hex_or_rgba, str):
|
||||
rgba = mcolors.to_rgba(bg_color_hex_or_rgba)
|
||||
else:
|
||||
rgba = bg_color_hex_or_rgba
|
||||
|
||||
r, g, b, _ = rgba # Ignore alpha for luminance calculation
|
||||
# Calculate luminance (standard formula for sRGB)
|
||||
# Values r, g, b should be in [0, 1] for this formula
|
||||
luminance = 0.2126 * r + 0.7152 * g + 0.0722 * b
|
||||
# Threshold for deciding text color
|
||||
return 'black' if luminance > 0.55 else 'white' # Adjusted threshold slightly for better visual
|
||||
|
||||
# Add percentages inside each bar segment
|
||||
# Iterate through each "category" of bars (Not Remote, Remote & ATOMIC, etc.)
|
||||
for i, container in enumerate(ax.containers):
|
||||
# Get the color for this container/category
|
||||
segment_color = bar_colors[i]
|
||||
text_color = get_contrasting_text_color(segment_color)
|
||||
|
||||
for patch in container.patches: # Iterate through each bar segment in the category
|
||||
width = patch.get_width()
|
||||
if width > 3: # Only add text if segment is wide enough (e.g., >3%)
|
||||
x = patch.get_x() + width / 2
|
||||
y = patch.get_y() + patch.get_height() / 2
|
||||
ax.text(x, y,
|
||||
f"{width:.1f}%",
|
||||
ha='center',
|
||||
va='center',
|
||||
fontsize=8, # Adjust font size as needed
|
||||
color=text_color,
|
||||
fontweight='medium') # Bolder text can help
|
||||
|
||||
|
||||
plt.legend(title="Task Category", bbox_to_anchor=(1.02, 1), loc='upper left', frameon=False)
|
||||
|
||||
def cell11():
|
||||
df_oesm['onetsoc_major'] = df_oesm['OCC_CODE'].str[:2]
|
||||
|
||||
# Calculate wage bill per occupation
|
||||
# Wage bill = Total Employment * Annual Mean Wage
|
||||
# Ensure columns are numeric, converting non-numeric values to NaN first
|
||||
df_oesm['TOT_EMP'] = pd.to_numeric(df_oesm['TOT_EMP'], errors='coerce')
|
||||
df_oesm['A_MEAN'] = pd.to_numeric(df_oesm['A_MEAN'], errors='coerce')
|
||||
|
||||
# Drop rows with NaN in necessary columns after coercion
|
||||
df_oesm.dropna(subset=['TOT_EMP', 'A_MEAN', 'onetsoc_major'], inplace=True)
|
||||
|
||||
df_oesm['wage_bill'] = df_oesm['TOT_EMP'] * df_oesm['A_MEAN']
|
||||
|
||||
# Aggregate wage bill by onetsoc_major
|
||||
df_wage_bill_major = df_oesm.groupby('onetsoc_major')['wage_bill'].sum().reset_index()
|
||||
|
||||
# Map major codes to titles for better plotting
|
||||
df_wage_bill_major['OCC_TITLE_MAJOR'] = df_wage_bill_major['onetsoc_major'].map(occupation_major_codes)
|
||||
|
||||
# Sort by wage bill for better visualization
|
||||
df_wage_bill_major = df_wage_bill_major.sort_values('wage_bill', ascending=False)
|
||||
|
||||
# Plotting
|
||||
plt.figure(figsize=(12, 8))
|
||||
sns.barplot(x='wage_bill', y='OCC_TITLE_MAJOR', data=df_wage_bill_major, palette="viridis")
|
||||
plt.title('Total Wage Bill per Major Occupation Group')
|
||||
plt.xlabel('Total Wage Bill (in billions)')
|
||||
plt.ylabel('Major Occupation Group')
|
||||
plt.grid(axis='x', linestyle='--', alpha=0.7)
|
||||
|
||||
def cell12():
|
||||
# ───────────────────────────────────────────────────────────────
|
||||
# 1. CUMULATIVE-DISTRIBUTION-FUNCTION (CDF) PREP
|
||||
# ───────────────────────────────────────────────────────────────
|
||||
def cdf(series):
|
||||
s = series.sort_values().reset_index(drop=True)
|
||||
return s.values, ((s.index + 1) / len(s)) * 100
|
||||
|
||||
x_lb , y_lb = cdf(atomic_tasks['lb_estimate_in_minutes'])
|
||||
x_ub , y_ub = cdf(atomic_tasks['ub_estimate_in_minutes'])
|
||||
x_mid, y_mid = cdf((atomic_tasks['ub_estimate_in_minutes'] + atomic_tasks['lb_estimate_in_minutes']) / 2)
|
||||
|
||||
# ───────────────────────────────────────────────────────────────
|
||||
# 2. PLOTTING
|
||||
# ───────────────────────────────────────────────────────────────
|
||||
fig, ax = plt.subplots(figsize=(10, 6))
|
||||
|
||||
# horizontal reference lines every 10 %
|
||||
for y_val in range(0, 101, 10):
|
||||
ax.axhline(y_val, color=gray['100'], linewidth=.8, zorder=1)
|
||||
|
||||
# Plot Lower Bound CDF
|
||||
ax.step(x_lb, y_lb,
|
||||
where='post',
|
||||
color=lime['300'], # Example: light blue for lower bound
|
||||
linewidth=1.8,
|
||||
linestyle='--',
|
||||
zorder=2,
|
||||
label='Lower bound estimate (CDF)')
|
||||
|
||||
# Plot Upper Bound CDF
|
||||
ax.step(x_ub, y_ub,
|
||||
where='post',
|
||||
color=lime['900'], # Example: light orange/red for upper bound
|
||||
linewidth=1.8,
|
||||
linestyle=':',
|
||||
zorder=3,
|
||||
label='Upper bound estimate (CDF)')
|
||||
|
||||
# Plot Midpoint CDF (plotted last to be on top, or adjust zorder)
|
||||
ax.step(x_mid, y_mid,
|
||||
where='post',
|
||||
color=lime['600'],
|
||||
linewidth=2.2,
|
||||
zorder=4, # Ensure it's on top of other lines if they overlap significantly
|
||||
label='Mid-point estimate (CDF)')
|
||||
|
||||
|
||||
# axes limits / scales
|
||||
ax.set_ylim(0, 100)
|
||||
ax.set_xscale('log')
|
||||
|
||||
# y-axis ➝ percent labels
|
||||
ax.yaxis.set_major_formatter(mpl.ticker.PercentFormatter(decimals=0))
|
||||
|
||||
|
||||
# move y-label to top-left (just inside plotting area)
|
||||
ax.text(-0.06, 1.03,
|
||||
"% of tasks with temporal coherence ≤ X",
|
||||
ha='left', va='bottom',
|
||||
transform=ax.transAxes,
|
||||
fontsize=12, fontweight='semibold')
|
||||
|
||||
# custom x-ticks at human-friendly durations
|
||||
ticks = [1, 5, 10, 30, 60, 120, 240, 480,
|
||||
1440, 2880, 10080, 43200, 129600,
|
||||
259200, 525600]
|
||||
ticklabels = ['1 min', '5 min', '10 min', '30 min', '1 hour', '2 hours', '4 hours', '8 hours',
|
||||
'1 day', '2 days', '1 week', '30 days',
|
||||
'90 days', '180 days', '1 year']
|
||||
|
||||
# Vertical reference lines for x-ticks
|
||||
for tick in ticks:
|
||||
ax.axvline(tick, color=gray['300'], linewidth=.8, linestyle='--', zorder=1)
|
||||
|
||||
ax.set_xticks(ticks)
|
||||
ax.set_xticklabels(ticklabels, rotation=45, ha='right')
|
||||
|
||||
ax.spines['top'].set_visible(False)
|
||||
ax.spines['right'].set_visible(False)
|
||||
ax.spines['left'].set_edgecolor(gray['300'])
|
||||
ax.spines['bottom'].set_edgecolor(gray['300'])
|
||||
|
||||
|
||||
# legend
|
||||
ax.legend(frameon=False, loc='lower right') # Keep 'lower right' or adjust as needed
|
||||
|
||||
ax.text(0.5, -0.3,
|
||||
'Temporal coherence (X)',
|
||||
ha='center', va='center',
|
||||
transform=ax.transAxes,
|
||||
fontsize=12, fontweight='semibold')
@@ -1,411 +0,0 @@
import pandas as pd
|
||||
import litellm
|
||||
import dotenv
|
||||
import os
|
||||
import time
|
||||
import json
|
||||
import math
|
||||
|
||||
# Load environment variables
|
||||
dotenv.load_dotenv(override=True)
|
||||
|
||||
# litellm._turn_on_debug() # Optional debugging
|
||||
|
||||
# --- Configuration ---
|
||||
MODEL = "gpt-4.1-mini" # Make sure this model supports json_schema or structured output
|
||||
RATE_LIMIT = 5000 # Requests per minute
|
||||
CHUNK_SIZE = 300 # Number of unique tasks per API call
|
||||
SECONDS_PER_MINUTE = 60
|
||||
|
||||
# File configuration
|
||||
CLASSIFICATION_FILENAME = "tasks_estimateable.csv" # Output file with classifications
|
||||
TASK_SOURCE_FOR_INIT_FILENAME = "tasks_with_estimates.csv"
|
||||
OUTPUT_COLUMN_NAME = "task_estimateable"
|
||||
SOURCE_FILTER_COLUMN = "remote_status"
|
||||
SOURCE_FILTER_VALUE = "remote"
|
||||
|
||||
# --- Prompts and Schema ---
|
||||
SYSTEM_PROMPT_CLASSIFY = """
|
||||
Classify the provided O*NET task into one of these categories:
|
||||
- ATOMIC (schedulable): A single, clearly-bounded activity, typically lasting minutes, hours, or a few days.
|
||||
- ONGOING-CONSTRAINT (background role/ethical rule): A continuous responsibility or behavioural norm with no schedulable duration (e.g., “follow confidentiality rules,” “serve as department head”).
|
||||
""".strip()
|
||||
|
||||
USER_MESSAGE_TEMPLATE_CLASSIFY = "Task: {task}"
|
||||
|
||||
CLASSIFICATION_CATEGORIES = ["ATOMIC", "ONGOING-CONSTRAINT"]
|
||||
|
||||
SCHEMA_FOR_CLASSIFICATION = {
|
||||
"name": "classify_task_type",
|
||||
"strict": True,
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"task_category": {
|
||||
"type": "string",
|
||||
"enum": CLASSIFICATION_CATEGORIES,
|
||||
"description": "The classification of the task (ATOMIC or ONGOING-CONSTRAINT).",
|
||||
}
|
||||
},
|
||||
"required": ["task_category"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
}
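# For reference, a response that validates against SCHEMA_FOR_CLASSIFICATION is a
# single-field object (illustrative example):
example_classification = {"task_category": "ATOMIC"}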
|
||||
|
||||
|
||||
def save_dataframe(df_to_save, filename):
|
||||
"""Saves the DataFrame to the specified CSV file using atomic write."""
|
||||
try:
|
||||
temp_filename = filename + ".tmp"
|
||||
df_to_save.to_csv(temp_filename, encoding="utf-8-sig", index=False)
|
||||
os.replace(temp_filename, filename)
|
||||
except Exception as e:
|
||||
print(f"--- Error saving DataFrame to {filename}: {e} ---")
|
||||
if os.path.exists(temp_filename):
|
||||
try:
|
||||
os.remove(temp_filename)
|
||||
except Exception as remove_err:
|
||||
print(
|
||||
f"--- Error removing temporary save file {temp_filename}: {remove_err} ---"
|
||||
)
|
||||
|
||||
|
||||
# --- Load or Initialize DataFrame ---
|
||||
try:
|
||||
if os.path.exists(CLASSIFICATION_FILENAME):
|
||||
df = pd.read_csv(CLASSIFICATION_FILENAME, encoding="utf-8-sig")
|
||||
print(f"Successfully read {len(df)} rows from {CLASSIFICATION_FILENAME}.")
|
||||
|
||||
save_needed_after_load = False
|
||||
if OUTPUT_COLUMN_NAME not in df.columns:
|
||||
df[OUTPUT_COLUMN_NAME] = pd.NA
|
||||
print(f"Added '{OUTPUT_COLUMN_NAME}' column.")
|
||||
save_needed_after_load = True
|
||||
|
||||
df[OUTPUT_COLUMN_NAME] = df[OUTPUT_COLUMN_NAME].replace(["", None], pd.NA)
|
||||
|
||||
if df[OUTPUT_COLUMN_NAME].dtype != object and not isinstance(
|
||||
df[OUTPUT_COLUMN_NAME].dtype, pd.StringDtype
|
||||
):
|
||||
try:
|
||||
df[OUTPUT_COLUMN_NAME] = df[OUTPUT_COLUMN_NAME].astype(object)
|
||||
print(
|
||||
f"Corrected dtype of '{OUTPUT_COLUMN_NAME}' to {df[OUTPUT_COLUMN_NAME].dtype}."
|
||||
)
|
||||
save_needed_after_load = True
|
||||
except Exception as e:
|
||||
print(
|
||||
f"Warning: Could not convert column '{OUTPUT_COLUMN_NAME}' to object: {e}."
|
||||
)
|
||||
|
||||
if "task" not in df.columns:
|
||||
print(
|
||||
f"Error: {CLASSIFICATION_FILENAME} must contain a 'task' column for processing."
|
||||
)
|
||||
exit()
|
||||
|
||||
if save_needed_after_load:
|
||||
print(f"Saving {CLASSIFICATION_FILENAME} after adding/adjusting column.")
|
||||
save_dataframe(df, CLASSIFICATION_FILENAME)
|
||||
else:
|
||||
print(
|
||||
f"{CLASSIFICATION_FILENAME} not found. Attempting to create it from {TASK_SOURCE_FOR_INIT_FILENAME}."
|
||||
)
|
||||
if not os.path.exists(TASK_SOURCE_FOR_INIT_FILENAME):
|
||||
print(
|
||||
f"Error: Source file {TASK_SOURCE_FOR_INIT_FILENAME} not found. Cannot create {CLASSIFICATION_FILENAME}."
|
||||
)
|
||||
exit()
|
||||
|
||||
df_source = pd.read_csv(TASK_SOURCE_FOR_INIT_FILENAME, encoding="utf-8-sig")
|
||||
|
||||
required_source_cols_for_init = ["task", SOURCE_FILTER_COLUMN]
|
||||
missing_source_cols = [
|
||||
col for col in required_source_cols_for_init if col not in df_source.columns
|
||||
]
|
||||
if missing_source_cols:
|
||||
print(
|
||||
f"Error: Source file {TASK_SOURCE_FOR_INIT_FILENAME} is missing required columns for initialization: {', '.join(missing_source_cols)}."
|
||||
)
|
||||
exit()
|
||||
|
||||
df_source_filtered = df_source[
|
||||
df_source[SOURCE_FILTER_COLUMN] == SOURCE_FILTER_VALUE
|
||||
].copy()
|
||||
|
||||
if df_source_filtered.empty:
|
||||
print(
|
||||
f"Warning: No tasks with '{SOURCE_FILTER_COLUMN}' == '{SOURCE_FILTER_VALUE}' found in {TASK_SOURCE_FOR_INIT_FILENAME}. "
|
||||
f"{CLASSIFICATION_FILENAME} will be created with schema but no tasks to classify initially."
|
||||
)
|
||||
|
||||
df = df_source_filtered[["task"]].copy()
|
||||
df[OUTPUT_COLUMN_NAME] = pd.NA
|
||||
df[OUTPUT_COLUMN_NAME] = df[OUTPUT_COLUMN_NAME].astype(object)
|
||||
|
||||
print(
|
||||
f"Created {CLASSIFICATION_FILENAME} using tasks from {TASK_SOURCE_FOR_INIT_FILENAME} "
|
||||
f"(where {SOURCE_FILTER_COLUMN}='{SOURCE_FILTER_VALUE}'). New file has {len(df)} tasks."
|
||||
)
|
||||
save_dataframe(df, CLASSIFICATION_FILENAME)
|
||||
|
||||
except FileNotFoundError:
|
||||
print(f"Error: A required file was not found. Please check paths.")
|
||||
exit()
|
||||
except Exception as e:
|
||||
print(f"Error during DataFrame loading or initialization: {e}")
|
||||
exit()
|
||||
|
||||
|
||||
# --- Identify Unique Tasks to Process ---
|
||||
if df.empty:
|
||||
print(f"{CLASSIFICATION_FILENAME} is empty. Nothing to process. Exiting.")
|
||||
exit()
|
||||
|
||||
initial_unprocessed_mask = df[OUTPUT_COLUMN_NAME].isna()
|
||||
|
||||
if not initial_unprocessed_mask.any():
|
||||
print(
|
||||
f"All tasks in {CLASSIFICATION_FILENAME} seem to have been classified already. Exiting."
|
||||
)
|
||||
exit()
|
||||
|
||||
# Filter for rows that are unprocessed AND have a valid 'task' string
|
||||
valid_tasks_to_consider_df = df[
|
||||
initial_unprocessed_mask & df["task"].notna() & (df["task"].str.strip() != "")
|
||||
]
|
||||
|
||||
if valid_tasks_to_consider_df.empty:
|
||||
print(
|
||||
f"No valid, unclassified tasks found to process (after filtering out empty/NaN task descriptions). Exiting."
|
||||
)
|
||||
exit()
|
||||
|
||||
unique_task_labels_for_api = (
|
||||
valid_tasks_to_consider_df["task"].drop_duplicates().tolist()
|
||||
)
|
||||
total_rows_to_update_potentially = len(
|
||||
df[initial_unprocessed_mask]
|
||||
) # Count all rows that are NA
|
||||
|
||||
print(
|
||||
f"Found {total_rows_to_update_potentially} total rows in {CLASSIFICATION_FILENAME} needing classification."
|
||||
)
|
||||
print(
|
||||
f"Identified {len(unique_task_labels_for_api)} unique, valid task labels to send to the API."
|
||||
)
|
||||
|
||||
|
||||
# --- Prepare messages for batch completion (only for unique task labels) ---
|
||||
messages_list = []
|
||||
print(f"Preparing messages for {len(unique_task_labels_for_api)} unique task labels...")
|
||||
|
||||
for task_label in unique_task_labels_for_api:
|
||||
# task_label is already guaranteed to be non-empty and not NaN from the filtering above
|
||||
user_message = USER_MESSAGE_TEMPLATE_CLASSIFY.format(task=task_label)
|
||||
messages_for_task = [
|
||||
{"role": "system", "content": SYSTEM_PROMPT_CLASSIFY},
|
||||
{"role": "user", "content": user_message},
|
||||
]
|
||||
messages_list.append(messages_for_task)
|
||||
|
||||
print(f"Prepared {len(messages_list)} message sets for batch completion.")
|
||||
if (
|
||||
not messages_list
|
||||
): # Should only happen if unique_task_labels_for_api was empty, caught above
|
||||
print(
|
||||
"No messages prepared, though unique tasks were identified. This is unexpected. Exiting."
|
||||
)
|
||||
exit()
|
||||
|
||||
|
||||
# --- Call batch_completion in chunks with rate limiting and periodic saving ---
|
||||
total_unique_tasks_to_send = len(
|
||||
messages_list
|
||||
) # Same as len(unique_task_labels_for_api)
|
||||
num_chunks = math.ceil(total_unique_tasks_to_send / CHUNK_SIZE)
|
||||
|
||||
print(
|
||||
f"\nStarting batch classification for {total_unique_tasks_to_send} unique task labels in {num_chunks} chunks..."
|
||||
)
|
||||
|
||||
overall_start_time = time.time()
|
||||
processed_rows_count_total = 0 # Counts actual rows updated in the DataFrame
|
||||
|
||||
for i in range(num_chunks):
|
||||
chunk_start_message_index = i * CHUNK_SIZE
|
||||
chunk_end_message_index = min((i + 1) * CHUNK_SIZE, total_unique_tasks_to_send)
|
||||
|
||||
message_chunk = messages_list[chunk_start_message_index:chunk_end_message_index]
|
||||
# Get corresponding unique task labels for this chunk
|
||||
chunk_task_labels = unique_task_labels_for_api[
|
||||
chunk_start_message_index:chunk_end_message_index
|
||||
]
|
||||
|
||||
if not message_chunk: # Should not happen if loop range is correct
|
||||
continue
|
||||
|
||||
print(
|
||||
f"\nProcessing chunk {i + 1}/{num_chunks} (Unique Task Labels {chunk_start_message_index + 1}-{chunk_end_message_index} of this run)..."
|
||||
)
|
||||
chunk_start_time = time.time()
|
||||
responses = []
|
||||
try:
|
||||
print(
|
||||
f"Sending {len(message_chunk)} requests (for unique tasks) for chunk {i + 1}..."
|
||||
)
|
||||
responses = litellm.batch_completion(
|
||||
model=MODEL,
|
||||
messages=message_chunk,
|
||||
response_format={
|
||||
"type": "json_schema",
|
||||
"json_schema": SCHEMA_FOR_CLASSIFICATION,
|
||||
},
|
||||
num_retries=3,
|
||||
)
|
||||
print(f"Chunk {i + 1} API call completed.")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error during litellm.batch_completion for chunk {i + 1}: {e}")
|
||||
responses = [None] * len(message_chunk)
|
||||
|
||||
# --- Process responses for the current chunk ---
|
||||
# chunk_updates stores {task_label: classification_category}
|
||||
chunk_task_classifications = {}
|
||||
successful_api_calls_in_chunk = 0
|
||||
failed_api_calls_in_chunk = 0
|
||||
|
||||
if responses and len(responses) == len(message_chunk):
|
||||
for j, response in enumerate(responses):
|
||||
current_task_label = chunk_task_labels[
|
||||
j
|
||||
] # The unique task label for this response
|
||||
content_str = None
|
||||
|
||||
if response is None:
|
||||
print(
|
||||
f"API call failed for task label '{current_task_label}' (response is None)."
|
||||
)
|
||||
failed_api_calls_in_chunk += 1
|
||||
continue
|
||||
|
||||
try:
|
||||
if (
|
||||
response.choices
|
||||
and response.choices[0].message
|
||||
and response.choices[0].message.content
|
||||
):
|
||||
content_str = response.choices[0].message.content
|
||||
classification_data = json.loads(content_str)
|
||||
category_raw = classification_data.get("task_category")
|
||||
|
||||
if category_raw in CLASSIFICATION_CATEGORIES:
|
||||
successful_api_calls_in_chunk += 1
|
||||
chunk_task_classifications[current_task_label] = category_raw
|
||||
else:
|
||||
print(
|
||||
f"Warning: Invalid or missing task_category for task label '{current_task_label}': '{category_raw}'. Content: '{content_str}'"
|
||||
)
|
||||
failed_api_calls_in_chunk += 1
|
||||
else:
|
||||
finish_reason = (
|
||||
response.choices[0].finish_reason
|
||||
if (response.choices and response.choices[0].finish_reason)
|
||||
else "unknown"
|
||||
)
|
||||
error_message = (
|
||||
response.choices[0].message.content
|
||||
if (response.choices and response.choices[0].message)
|
||||
else "No content in message."
|
||||
)
|
||||
print(
|
||||
f"Warning: Received non-standard or empty response content for task label '{current_task_label}'. "
|
||||
f"Finish Reason: '{finish_reason}'. Message: '{error_message}'. Raw Choices: {response.choices}"
|
||||
)
|
||||
failed_api_calls_in_chunk += 1
|
||||
|
||||
except json.JSONDecodeError:
|
||||
print(
|
||||
f"Warning: Could not decode JSON for task label '{current_task_label}'. Content received: '{content_str}'"
|
||||
)
|
||||
failed_api_calls_in_chunk += 1
|
||||
except AttributeError as ae:
|
||||
print(
|
||||
f"Warning: Missing attribute processing response for task label '{current_task_label}': {ae}. Response: {response}"
|
||||
)
|
||||
failed_api_calls_in_chunk += 1
|
||||
except Exception as e:
|
||||
print(
|
||||
f"Warning: Unexpected error processing response for task label '{current_task_label}': {type(e).__name__} - {e}. Response: {response}"
|
||||
)
|
||||
failed_api_calls_in_chunk += 1
|
||||
else:
|
||||
print(
|
||||
f"Warning: Mismatch between #responses ({len(responses) if responses else 0}) "
|
||||
f"and #messages sent ({len(message_chunk)}) for chunk {i + 1}, or no responses. Marking all API calls in chunk as failed."
|
||||
)
|
||||
failed_api_calls_in_chunk = len(message_chunk)
|
||||
|
||||
# --- Update Main DataFrame and Save Periodically ---
|
||||
rows_updated_this_chunk = 0
|
||||
if chunk_task_classifications:
|
||||
print(
|
||||
f"Updating main DataFrame with classifications for {len(chunk_task_classifications)} unique tasks from chunk {i + 1}..."
|
||||
)
|
||||
for task_label, category in chunk_task_classifications.items():
|
||||
# Update all rows in the main df that match this task_label AND are still NA in the output column
|
||||
update_condition = (df["task"] == task_label) & (
|
||||
df[OUTPUT_COLUMN_NAME].isna()
|
||||
)
|
||||
num_rows_for_this_task_label = df[update_condition].shape[0]
|
||||
|
||||
if num_rows_for_this_task_label > 0:
|
||||
df.loc[update_condition, OUTPUT_COLUMN_NAME] = category
|
||||
rows_updated_this_chunk += num_rows_for_this_task_label
|
||||
|
||||
print(
|
||||
f"Updated {rows_updated_this_chunk} rows in the DataFrame based on this chunk's API responses."
|
||||
)
|
||||
print(f"Saving progress to {CLASSIFICATION_FILENAME}...")
|
||||
save_dataframe(df, CLASSIFICATION_FILENAME)
|
||||
else:
|
||||
print(
|
||||
f"No successful API classifications obtained in chunk {i + 1} to update DataFrame or save."
|
||||
)
|
||||
|
||||
print(
|
||||
f"Chunk {i + 1} API summary: Successful Calls={successful_api_calls_in_chunk}, Failed/Skipped Calls={failed_api_calls_in_chunk}. "
|
||||
f"Rows updated in DataFrame this chunk: {rows_updated_this_chunk}"
|
||||
)
|
||||
processed_rows_count_total += rows_updated_this_chunk
|
||||
|
||||
# --- Rate Limiting Pause ---
|
||||
chunk_end_time = time.time()
|
||||
chunk_duration = chunk_end_time - chunk_start_time
|
||||
print(f"Chunk {i + 1} (API calls and DF update) took {chunk_duration:.2f} seconds.")
|
||||
|
||||
if i < num_chunks - 1:
|
||||
time_per_request = SECONDS_PER_MINUTE / RATE_LIMIT if RATE_LIMIT > 0 else 0
|
||||
min_chunk_duration_for_rate = (
|
||||
len(message_chunk) * time_per_request
|
||||
) # Based on API calls made
|
||||
pause_needed = max(0, min_chunk_duration_for_rate - chunk_duration)
|
||||
|
||||
if pause_needed > 0:
|
||||
print(
|
||||
f"Pausing for {pause_needed:.2f} seconds to respect rate limit ({RATE_LIMIT}/min)..."
|
||||
)
|
||||
time.sleep(pause_needed)
|
||||
|
||||
overall_end_time = time.time()
|
||||
total_duration_minutes = (overall_end_time - overall_start_time) / 60
|
||||
print(
|
||||
f"\nBatch classification finished."
|
||||
f" Updated {processed_rows_count_total} rows in '{CLASSIFICATION_FILENAME}' with new classifications in this run."
|
||||
f" Total duration: {total_duration_minutes:.2f} minutes."
|
||||
)
|
||||
|
||||
print(f"Performing final save to {CLASSIFICATION_FILENAME}...")
|
||||
save_dataframe(df, CLASSIFICATION_FILENAME)
|
||||
|
||||
print("\nScript finished.")
|
|
@ -1,85 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
# Set database name and directories
|
||||
ONET_DB_NAME="onet.database"
|
||||
ONET_ZIP_URL="https://www.onetcenter.org/dl_files/database/db_29_1_mysql.zip"
|
||||
ONET_ZIP_FILE="db_29_1_mysql.zip"
|
||||
ONET_EXTRACT_DIR="db_29_1_mysql"
|
||||
|
||||
# Download O*NET database only if not already downloaded
|
||||
if [ ! -f "$ONET_ZIP_FILE" ]; then
|
||||
echo "Downloading O*NET database from $ONET_ZIP_URL"
|
||||
curl -L -o "$ONET_ZIP_FILE" "$ONET_ZIP_URL" || wget -O "$ONET_ZIP_FILE" "$ONET_ZIP_URL"
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Failed to download O*NET database"
|
||||
exit 1
|
||||
fi
|
||||
else
|
||||
echo "Using existing O*NET database zip file"
|
||||
fi
|
||||
|
||||
# Extract downloaded zip file only if extraction directory doesn't exist
|
||||
if [ ! -d "$ONET_EXTRACT_DIR" ]; then
|
||||
echo "Extracting O*NET database files"
|
||||
unzip -o "$ONET_ZIP_FILE"
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Failed to extract O*NET database files"
|
||||
exit 1
|
||||
fi
|
||||
else
|
||||
echo "Using existing extracted O*NET database files"
|
||||
fi
|
||||
|
||||
# Remove existing database if it exists
|
||||
if [ -f "$ONET_DB_NAME" ]; then
|
||||
echo "Removing existing database"
|
||||
rm "$ONET_DB_NAME"
|
||||
fi
|
||||
|
||||
# Create a new SQLite database with optimized settings for fast import
|
||||
echo "Creating new SQLite database: $ONET_DB_NAME with performance settings"
|
||||
sqlite3 "$ONET_DB_NAME" << EOF
|
||||
PRAGMA journal_mode = OFF;
|
||||
PRAGMA synchronous = 0;
|
||||
PRAGMA cache_size = 1000000;
|
||||
PRAGMA locking_mode = EXCLUSIVE;
|
||||
PRAGMA temp_store = MEMORY;
|
||||
PRAGMA foreign_keys = ON;
|
||||
EOF
|
||||
|
||||
# Combine and execute all SQL files in one transaction
|
||||
echo "Executing SQL files in alphabetical order (single transaction mode)"
|
||||
sqlite3 "$ONET_DB_NAME" << EOF
|
||||
BEGIN TRANSACTION;
|
||||
$(find "$ONET_EXTRACT_DIR" -name "*.sql" | sort | xargs cat)
|
||||
COMMIT;
|
||||
EOF
|
||||
|
||||
# Check if the execution was successful
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Error executing SQL files in batch transaction"
|
||||
exit 1
|
||||
else
|
||||
echo "Database populated successfully. Restoring reliability settings..."
|
||||
|
||||
# Restore reliability-focused settings after import
|
||||
sqlite3 "$ONET_DB_NAME" << EOF
|
||||
PRAGMA journal_mode = WAL;
|
||||
PRAGMA synchronous = NORMAL;
|
||||
PRAGMA locking_mode = NORMAL;
|
||||
PRAGMA temp_store = DEFAULT;
|
||||
PRAGMA foreign_keys = ON;
|
||||
PRAGMA optimize;
|
||||
VACUUM;
|
||||
EOF
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Warning: Failed to restore reliability settings, but database is populated"
|
||||
else
|
||||
echo "Reliability settings restored successfully"
|
||||
fi
|
||||
|
||||
echo "O*NET database created and optimized successfully!"
|
||||
fi
|
|
@ -1,392 +0,0 @@
|
|||
import sqlite3
|
||||
import pandas as pd
|
||||
import json
|
||||
import os
|
||||
from collections import defaultdict
|
||||
import numpy as np
|
||||
|
||||
# --- Configuration ---
|
||||
DB_FILE = "onet.database"
|
||||
OUTPUT_FILE = "task_ratings_enriched.json" # Changed output filename
|
||||
|
||||
# --- Database Interaction ---
|
||||
|
||||
|
||||
def fetch_data_from_db(db_path):
|
||||
"""
|
||||
Fetches required data from the O*NET SQLite database using JOINs,
|
||||
including DWAs.
|
||||
|
||||
Args:
|
||||
db_path (str): Path to the SQLite database file.
|
||||
|
||||
Returns:
|
||||
tuple(pandas.DataFrame, pandas.DataFrame): A tuple containing:
|
||||
- DataFrame with task ratings info.
|
||||
- DataFrame with task-to-DWA mapping.
|
||||
Returns (None, None) if the database file doesn't exist or an error occurs.
|
||||
"""
|
||||
if not os.path.exists(db_path):
|
||||
print(f"Error: Database file not found at {db_path}")
|
||||
return None, None
|
||||
|
||||
try:
|
||||
conn = sqlite3.connect(db_path)
|
||||
# Construct the SQL query to join the tables and select necessary columns
|
||||
# Added LEFT JOINs for tasks_to_dwas and dwa_reference
|
||||
# Use LEFT JOIN in case a task has no DWAs
|
||||
query = """
|
||||
SELECT
|
||||
tr.onetsoc_code,
|
||||
tr.task_id,
|
||||
ts.task,
|
||||
od.title AS occupation_title,
|
||||
od.description AS occupation_description,
|
||||
tr.scale_id,
|
||||
tr.category,
|
||||
tr.data_value,
|
||||
dr.dwa_title -- Added DWA title
|
||||
FROM
|
||||
task_ratings tr
|
||||
JOIN
|
||||
task_statements ts ON tr.task_id = ts.task_id
|
||||
JOIN
|
||||
occupation_data od ON tr.onetsoc_code = od.onetsoc_code
|
||||
LEFT JOIN
|
||||
tasks_to_dwas td ON tr.onetsoc_code = td.onetsoc_code AND tr.task_id = td.task_id --
|
||||
LEFT JOIN
|
||||
dwa_reference dr ON td.dwa_id = dr.dwa_id; --
|
||||
"""
|
||||
df = pd.read_sql_query(query, conn)
|
||||
conn.close()
|
||||
print(
|
||||
f"Successfully fetched {len(df)} records (including DWA info) from the database."
|
||||
)
|
||||
|
||||
if df.empty:
|
||||
print("Warning: Fetched DataFrame is empty.")
|
||||
# Return empty DataFrames with expected columns if the main fetch is empty
|
||||
ratings_cols = [
|
||||
"onetsoc_code",
|
||||
"task_id",
|
||||
"task",
|
||||
"occupation_title",
|
||||
"occupation_description",
|
||||
"scale_id",
|
||||
"category",
|
||||
"data_value",
|
||||
]
|
||||
dwa_cols = ["onetsoc_code", "task_id", "dwa_title"]
|
||||
return pd.DataFrame(columns=ratings_cols), pd.DataFrame(columns=dwa_cols)
|
||||
|
||||
# Remove duplicates caused by joining ratings with potentially multiple DWAs per task
|
||||
# Keep only unique combinations of the core task/rating info before processing
|
||||
core_cols = [
|
||||
"onetsoc_code",
|
||||
"task_id",
|
||||
"task",
|
||||
"occupation_title",
|
||||
"occupation_description",
|
||||
"scale_id",
|
||||
"category",
|
||||
"data_value",
|
||||
]
|
||||
# Check if all core columns exist before attempting to drop duplicates
|
||||
missing_core_cols = [col for col in core_cols if col not in df.columns]
|
||||
if missing_core_cols:
|
||||
print(f"Error: Missing core columns in fetched data: {missing_core_cols}")
|
||||
return None, None
|
||||
ratings_df = df[core_cols].drop_duplicates().reset_index(drop=True)
|
||||
|
||||
# Get unique DWA info separately
|
||||
dwa_cols = ["onetsoc_code", "task_id", "dwa_title"]
|
||||
# Check if all DWA columns exist before processing
|
||||
if all(col in df.columns for col in dwa_cols):
|
||||
dwas_df = (
|
||||
df[dwa_cols]
|
||||
.dropna(subset=["dwa_title"])
|
||||
.drop_duplicates()
|
||||
.reset_index(drop=True)
|
||||
)
|
||||
else:
|
||||
print("Warning: DWA related columns missing, creating empty DWA DataFrame.")
|
||||
dwas_df = pd.DataFrame(
|
||||
columns=dwa_cols
|
||||
) # Create empty df if columns missing
|
||||
|
||||
return ratings_df, dwas_df # Return two dataframes now
|
||||
|
||||
except sqlite3.Error as e:
|
||||
print(f"SQLite error: {e}")
|
||||
if "conn" in locals() and conn:
|
||||
conn.close()
|
||||
return None, None # Return None for both if error
|
||||
except Exception as e:
|
||||
print(f"An error occurred during data fetching: {e}")
|
||||
if "conn" in locals() and conn:
|
||||
conn.close()
|
||||
return None, None # Return None for both if error
|
||||
|
||||
|
||||
# --- Data Processing ---
|
||||
|
||||
|
||||
def process_task_ratings_with_dwas(ratings_df, dwas_df):
|
||||
"""
|
||||
Processes the fetched data to group, pivot frequency, calculate averages,
|
||||
structure the output, and add associated DWAs.
|
||||
|
||||
Args:
|
||||
ratings_df (pandas.DataFrame): The input DataFrame with task ratings info.
|
||||
dwas_df (pandas.DataFrame): The input DataFrame with task-to-DWA mapping. Can be None or empty.
|
||||
|
||||
Returns:
|
||||
list: A list of dictionaries, each representing an enriched task rating with DWAs.
|
||||
Returns None if the input ratings DataFrame is invalid.
|
||||
"""
|
||||
if ratings_df is None or not isinstance(
|
||||
ratings_df, pd.DataFrame
|
||||
): # Check if it's a DataFrame
|
||||
print("Error: Input ratings DataFrame is invalid.")
|
||||
return None
|
||||
if ratings_df.empty:
|
||||
print(
|
||||
"Warning: Input ratings DataFrame is empty. Processing will yield empty result."
|
||||
)
|
||||
# Decide how to handle empty input, maybe return empty list directly
|
||||
# return []
|
||||
|
||||
# Ensure dwas_df is a DataFrame, even if empty
|
||||
if dwas_df is None or not isinstance(dwas_df, pd.DataFrame):
|
||||
print("Warning: Invalid or missing DWA DataFrame. Proceeding without DWA data.")
|
||||
dwas_df = pd.DataFrame(
|
||||
columns=["onetsoc_code", "task_id", "dwa_title"]
|
||||
) # Ensure it's an empty DF
|
||||
|
||||
print("Starting data processing...")
|
||||
|
||||
# --- 1. Handle Frequency (FT) ---
|
||||
freq_df = ratings_df[ratings_df["scale_id"] == "FT"].copy()
|
||||
if not freq_df.empty:
|
||||
freq_pivot = freq_df.pivot_table(
|
||||
index=["onetsoc_code", "task_id"],
|
||||
columns="category",
|
||||
values="data_value",
|
||||
fill_value=0,
|
||||
)
|
||||
freq_pivot.columns = [
|
||||
f"frequency_category_{int(col)}" for col in freq_pivot.columns
|
||||
]
|
||||
print(f"Processed Frequency data. Shape: {freq_pivot.shape}")
|
||||
else:
|
||||
print("No Frequency (FT) data found.")
|
||||
# Create an empty DataFrame with the multi-index to allow merging later
|
||||
idx = pd.MultiIndex(
|
||||
levels=[[], []], codes=[[], []], names=["onetsoc_code", "task_id"]
|
||||
)
|
||||
freq_pivot = pd.DataFrame(index=idx)
|
||||
|
||||
# --- 2. Handle Importance (IM, IJ) ---
|
||||
imp_df = ratings_df[ratings_df["scale_id"].isin(["IM", "IJ"])].copy()
|
||||
if not imp_df.empty:
|
||||
imp_avg = (
|
||||
imp_df.groupby(["onetsoc_code", "task_id"])["data_value"]
|
||||
.mean()
|
||||
.reset_index()
|
||||
)
|
||||
imp_avg.rename(columns={"data_value": "importance_average"}, inplace=True)
|
||||
print(f"Processed Importance data. Shape: {imp_avg.shape}")
|
||||
else:
|
||||
print("No Importance (IM, IJ) data found.")
|
||||
imp_avg = pd.DataFrame(
|
||||
columns=["onetsoc_code", "task_id", "importance_average"]
|
||||
)
|
||||
|
||||
# --- 3. Handle Relevance (RT) ---
|
||||
rel_df = ratings_df[ratings_df["scale_id"] == "RT"].copy()
|
||||
if not rel_df.empty:
|
||||
rel_avg = (
|
||||
rel_df.groupby(["onetsoc_code", "task_id"])["data_value"]
|
||||
.mean()
|
||||
.reset_index()
|
||||
)
|
||||
rel_avg.rename(columns={"data_value": "relevance_average"}, inplace=True)
|
||||
print(f"Processed Relevance data. Shape: {rel_avg.shape}")
|
||||
else:
|
||||
print("No Relevance (RT) data found.")
|
||||
rel_avg = pd.DataFrame(columns=["onetsoc_code", "task_id", "relevance_average"])
|
||||
|
||||
# --- 4. Process DWAs ---
|
||||
if dwas_df is not None and not dwas_df.empty and "dwa_title" in dwas_df.columns:
|
||||
print("Processing DWA data...")
|
||||
# Group DWAs by task_id and aggregate titles into a list
|
||||
dwas_grouped = (
|
||||
dwas_df.groupby(["onetsoc_code", "task_id"])["dwa_title"]
|
||||
.apply(list)
|
||||
.reset_index()
|
||||
) #
|
||||
dwas_grouped.rename(
|
||||
columns={"dwa_title": "dwas"}, inplace=True
|
||||
) # Rename column to 'dwas'
|
||||
print(f"Processed DWA data. Shape: {dwas_grouped.shape}")
|
||||
else:
|
||||
print("No valid DWA data found or provided for processing.")
|
||||
dwas_grouped = None # Set to None if no DWAs
|
||||
|
||||
# --- 5. Get Base Task/Occupation Info ---
|
||||
base_cols = [
|
||||
"onetsoc_code",
|
||||
"task_id",
|
||||
"task",
|
||||
"occupation_title",
|
||||
"occupation_description",
|
||||
]
|
||||
# Check if base columns exist in ratings_df
|
||||
missing_base_cols = [col for col in base_cols if col not in ratings_df.columns]
|
||||
if missing_base_cols:
|
||||
print(
|
||||
f"Error: Missing base info columns in ratings_df: {missing_base_cols}. Cannot proceed."
|
||||
)
|
||||
return None
|
||||
if not ratings_df.empty:
|
||||
base_info = (
|
||||
ratings_df[base_cols]
|
||||
.drop_duplicates()
|
||||
.set_index(["onetsoc_code", "task_id"])
|
||||
)
|
||||
print(f"Extracted base info. Shape: {base_info.shape}")
|
||||
else:
|
||||
print("Cannot extract base info from empty ratings DataFrame.")
|
||||
# Create an empty df with index to avoid errors later if possible
|
||||
idx = pd.MultiIndex(
|
||||
levels=[[], []], codes=[[], []], names=["onetsoc_code", "task_id"]
|
||||
)
|
||||
base_info = pd.DataFrame(
|
||||
index=idx,
|
||||
columns=[
|
||||
col for col in base_cols if col not in ["onetsoc_code", "task_id"]
|
||||
],
|
||||
)
|
||||
|
||||
# --- 6. Merge Processed Data ---
|
||||
print("Merging processed data...")
|
||||
# Start with base_info, which should have the index ['onetsoc_code', 'task_id']
|
||||
final_df = base_info.merge(
|
||||
freq_pivot, left_index=True, right_index=True, how="left"
|
||||
)
|
||||
# Reset index before merging non-indexed dfs
|
||||
final_df = final_df.reset_index()
|
||||
|
||||
# Merge averages - check if they are not empty before merging
|
||||
if not imp_avg.empty:
|
||||
final_df = final_df.merge(imp_avg, on=["onetsoc_code", "task_id"], how="left")
|
||||
else:
|
||||
final_df["importance_average"] = np.nan # Add column if imp_avg was empty
|
||||
|
||||
if not rel_avg.empty:
|
||||
final_df = final_df.merge(rel_avg, on=["onetsoc_code", "task_id"], how="left")
|
||||
else:
|
||||
final_df["relevance_average"] = np.nan # Add column if rel_avg was empty
|
||||
|
||||
# Merge DWAs if available
|
||||
if dwas_grouped is not None and not dwas_grouped.empty:
|
||||
final_df = final_df.merge(
|
||||
dwas_grouped, on=["onetsoc_code", "task_id"], how="left"
|
||||
) # Merge the dwas list
|
||||
# Fill NaN in 'dwas' column (for tasks with no DWAs) with empty lists
|
||||
# Check if 'dwas' column exists before applying function
|
||||
if "dwas" in final_df.columns:
|
||||
final_df["dwas"] = final_df["dwas"].apply(
|
||||
lambda x: x if isinstance(x, list) else []
|
||||
) # Ensure tasks without DWAs get []
|
||||
else:
|
||||
print("Warning: 'dwas' column not created during merge.")
|
||||
final_df["dwas"] = [
|
||||
[] for _ in range(len(final_df))
|
||||
] # Add empty list column
|
||||
|
||||
else:
|
||||
# Add an empty 'dwas' column if no DWA data was processed or merged
|
||||
final_df["dwas"] = [[] for _ in range(len(final_df))]
|
||||
|
||||
print(f"Final merged data shape: {final_df.shape}")
|
||||
|
||||
# Convert DataFrame to list of dictionaries for JSON output
|
||||
# Handle potential NaN values during JSON conversion
|
||||
# Replace numpy NaN with Python None for JSON compatibility
|
||||
final_df = final_df.replace({np.nan: None})
|
||||
result_list = final_df.to_dict(orient="records")
|
||||
|
||||
return result_list
|
||||
|
||||
|
||||
# --- Output ---
|
||||
|
||||
|
||||
def write_to_json(data, output_path):
|
||||
"""
|
||||
Writes the processed data to a JSON file.
|
||||
|
||||
Args:
|
||||
data (list): The list of dictionaries to write.
|
||||
output_path (str): Path to the output JSON file.
|
||||
"""
|
||||
if data is None:
|
||||
print("No data to write to JSON.")
|
||||
return
|
||||
if not isinstance(data, list):
|
||||
print(
|
||||
f"Error: Data to write is not a list (type: {type(data)}). Cannot write to JSON."
|
||||
)
|
||||
return
|
||||
|
||||
# Create directory if it doesn't exist
|
||||
output_dir = os.path.dirname(output_path)
|
||||
if output_dir and not os.path.exists(output_dir):
|
||||
try:
|
||||
os.makedirs(output_dir)
|
||||
print(f"Created output directory: {output_dir}")
|
||||
except OSError as e:
|
||||
print(f"Error creating output directory {output_dir}: {e}")
|
||||
return # Exit if cannot create directory
|
||||
|
||||
try:
|
||||
with open(output_path, "w", encoding="utf-8") as f:
|
||||
json.dump(data, f, indent=4, ensure_ascii=False)
|
||||
print(f"Successfully wrote enriched data to {output_path}")
|
||||
except IOError as e:
|
||||
print(f"Error writing JSON file to {output_path}: {e}")
|
||||
except TypeError as e:
|
||||
print(f"Error during JSON serialization: {e}. Check data types.")
|
||||
except Exception as e:
|
||||
print(f"An unexpected error occurred during JSON writing: {e}")
|
||||
|
||||
|
||||
# --- Main Execution ---
|
||||
|
||||
if __name__ == "__main__":
|
||||
print("Starting O*NET Task Ratings & DWAs Enrichment Script...")
|
||||
# 1. Fetch data
|
||||
ratings_data_df, dwas_data_df = fetch_data_from_db(DB_FILE) # Fetch both datasets
|
||||
|
||||
# 2. Process data
|
||||
# Proceed only if ratings_data_df is a valid DataFrame (even if empty)
|
||||
# dwas_data_df can be None or empty, handled inside process function
|
||||
if isinstance(ratings_data_df, pd.DataFrame):
|
||||
enriched_data = process_task_ratings_with_dwas(
|
||||
ratings_data_df, dwas_data_df
|
||||
) # Pass both dataframes
|
||||
|
||||
# 3. Write output
|
||||
if (
|
||||
enriched_data is not None
|
||||
): # Check if processing returned data (even an empty list is valid)
|
||||
write_to_json(enriched_data, OUTPUT_FILE)
|
||||
else:
|
||||
print("Data processing failed or returned None. No output file generated.")
|
||||
else:
|
||||
print(
|
||||
"Data fetching failed or returned invalid type for ratings data. Script terminated."
|
||||
)
|
||||
|
||||
print("Script finished.")
|
81
pipeline/aggregate.py
Normal file
|
@ -0,0 +1,81 @@
|
|||
from .utils import OCCUPATION_MAJOR_CODES
|
||||
import pandas as pd
|
||||
|
||||
def create_task_summary_by_occupation_df(df_tasks: pd.DataFrame, oesm_df: pd.DataFrame) -> pd.DataFrame:
|
||||
# --- OESM Wage Bill Calculation ---
|
||||
df_oesm_with_bill = oesm_df.copy()
|
||||
df_oesm_with_bill.rename(columns={'OCC_CODE': 'onetsoc_code'}, inplace=True)
|
||||
|
||||
# Convert key columns to numeric, handling potential errors
|
||||
df_oesm_with_bill['TOT_EMP'] = pd.to_numeric(df_oesm_with_bill['TOT_EMP'], errors='coerce')
|
||||
df_oesm_with_bill['A_MEAN'] = pd.to_numeric(df_oesm_with_bill['A_MEAN'], errors='coerce')
|
||||
df_oesm_with_bill.dropna(subset=['TOT_EMP', 'A_MEAN', 'onetsoc_code'], inplace=True)
|
||||
|
||||
# Calculate the wage bill for each occupation
|
||||
df_oesm_with_bill['wage_bill'] = df_oesm_with_bill['TOT_EMP'] * df_oesm_with_bill['A_MEAN']
|
||||
oesm_lookup = df_oesm_with_bill.set_index('onetsoc_code')
|
||||
|
||||
summary_data = []
|
||||
|
||||
# Assuming df_tasks has an 'onetsoc_code' column with the full SOC code
|
||||
unique_soc_codes = df_tasks['onetsoc_code'].unique()
|
||||
|
||||
for code in unique_soc_codes:
|
||||
occ_df = df_tasks[df_tasks['onetsoc_code'] == code]
|
||||
total_tasks_in_occ = len(occ_df)
|
||||
|
||||
not_remote_count = len(occ_df[occ_df['remote_status'] != 'remote'])
|
||||
remote_df = occ_df[occ_df['remote_status'] == 'remote']
|
||||
remote_estimable_count = len(remote_df[remote_df['estimable']])
|
||||
remote_not_estimable_count = len(remote_df[~remote_df['estimable']])
|
||||
|
||||
try:
|
||||
# O*NET codes (e.g., 11-1011.03) are more specific than OESM SOC codes (e.g., 11-1011).
|
||||
# We strip the suffix from the O*NET code to find the corresponding wage data.
|
||||
soc_code_for_lookup = code.split('.')[0]
|
||||
wage_bill = oesm_lookup.loc[soc_code_for_lookup, 'wage_bill']
|
||||
label = oesm_lookup.loc[soc_code_for_lookup, 'OCC_TITLE']
|
||||
except KeyError:
|
||||
wage_bill = 0
|
||||
label = "Unknown"
|
||||
|
||||
summary_data.append({
|
||||
'onetsoc_code': code,
|
||||
'occupation_label': label,
|
||||
'wage_bill': wage_bill,
|
||||
'count_not_remote': not_remote_count,
|
||||
'count_remote_estimable': remote_estimable_count,
|
||||
'count_remote_not_estimable': remote_not_estimable_count,
|
||||
'total_tasks': total_tasks_in_occ
|
||||
})
|
||||
|
||||
return pd.DataFrame(summary_data)
|
||||
|
||||
|
||||
def aggregate_task_summary_by_major_code(summary_df: pd.DataFrame) -> pd.DataFrame:
|
||||
df_agg = summary_df.copy()
|
||||
df_agg['onetsoc_major_code'] = df_agg['onetsoc_code'].str[:2]
|
||||
|
||||
aggregation = {
|
||||
'wage_bill': 'sum',
|
||||
'count_not_remote': 'sum',
|
||||
'count_remote_estimable': 'sum',
|
||||
'count_remote_not_estimable': 'sum',
|
||||
'total_tasks': 'sum'
|
||||
}
|
||||
major_summary = df_agg.groupby('onetsoc_major_code').agg(aggregation).reset_index()
|
||||
|
||||
major_summary['occupation_label'] = major_summary['onetsoc_major_code'].map(OCCUPATION_MAJOR_CODES)
|
||||
|
||||
# Reorder columns to match original output format
|
||||
major_summary = major_summary[[
|
||||
'onetsoc_major_code',
|
||||
'occupation_label',
|
||||
'wage_bill',
|
||||
'count_not_remote',
|
||||
'count_remote_estimable',
|
||||
'count_remote_not_estimable',
|
||||
'total_tasks'
|
||||
]]
|
||||
|
||||
return major_summary
|
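A minimal usage sketch of how these two aggregation helpers chain together (a hypothetical driver, not part of pipeline/aggregate.py; the synthetic df_tasks below only carries the columns the summary function actually reads, and the cache directory location is an assumption):

import pandas as pd
from pathlib import Path

from pipeline.aggregate import (
    aggregate_task_summary_by_major_code,
    create_task_summary_by_occupation_df,
)
from pipeline.fetchers import fetch_oesm_data

cache_dir = Path("cache")  # assumed location for the fetcher cache
oesm_df = fetch_oesm_data(cache_dir)

# Synthetic stand-in for the real df_tasks produced by the fetch/enrichment steps.
df_tasks = pd.DataFrame({
    "onetsoc_code": ["15-1252.00", "15-1252.00", "11-1011.00"],
    "remote_status": ["remote", "remote", "non-remote"],
    "estimable": [True, False, False],
})

per_occupation = create_task_summary_by_occupation_df(df_tasks, oesm_df)
by_major = aggregate_task_summary_by_major_code(per_occupation)  # '15-1252.00' rolls up under major code '15'
print(by_major.sort_values("wage_bill", ascending=False))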
225
pipeline/classification.py
Normal file
|
@ -0,0 +1,225 @@
|
|||
from pathlib import Path
|
||||
import pandas as pd
|
||||
from .logger import logger
|
||||
from .utils import enrich
|
||||
import json
|
||||
|
||||
ALLOWED_UNITS = [
|
||||
"minute",
|
||||
"hour",
|
||||
"day",
|
||||
"week",
|
||||
"month",
|
||||
"trimester",
|
||||
"semester",
|
||||
"year",
|
||||
]
|
||||
|
||||
ESTIMABLE_CLASSIFICATION_VERSION = "old_version"
|
||||
TIME_ESTIMATES_GENERATION_VERSION = "old_version"
|
||||
|
||||
def classify_tasks_as_estimable(cache_dir: Path, df_to_process: pd.DataFrame, bust: bool = False) -> pd.DataFrame:
|
||||
CACHE_PATH = cache_dir / f"task_estimability.{ESTIMABLE_CLASSIFICATION_VERSION}.parquet"
|
||||
if CACHE_PATH.exists() and not bust:
|
||||
logger.info(f"Loading cached task estimability from {CACHE_PATH}")
|
||||
return pd.read_parquet(CACHE_PATH)
|
||||
|
||||
logger.info("Enriching tasks with estimability classification.")
|
||||
|
||||
df_unique_tasks = df_to_process.drop_duplicates(subset=['task']).copy()
|
||||
|
||||
logger.info(f"Found {len(df_unique_tasks)} unique remote tasks to classify.")
|
||||
|
||||
if df_unique_tasks.empty:
|
||||
raise ValueError("No unique tasks to classify.")
|
||||
|
||||
results = enrich(
|
||||
model="gpt-4.1-mini",
|
||||
rpm=5000,
|
||||
messages_to_process=[
|
||||
[
|
||||
{"role": "system", "content": """
|
||||
Classify the provided O*NET task into one of these categories:
|
||||
- ATOMIC (schedulable): A single, clearly-bounded activity, typically lasting minutes, hours, or a few days.
|
||||
- ONGOING-CONSTRAINT (background role/ethical rule): A continuous responsibility or behavioural norm with no schedulable duration (e.g., “follow confidentiality rules,” “serve as department head”).
|
||||
""".strip()},
|
||||
{"role": "user", "content": f"Task: {row.task}"},
|
||||
]
|
||||
for row in df_unique_tasks.itertuples()
|
||||
],
|
||||
schema={
|
||||
"name": "estimability_classification",
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {"task_category": {"type": "string", "enum": ["ATOMIC", "ONGOING-CONSTRAINT"]}},
|
||||
"required": ["task_category"],
|
||||
"additionalProperties": False
|
||||
}
|
||||
},
|
||||
chunk_size=300,
|
||||
)
|
||||
|
||||
if not results or len(results) != len(df_unique_tasks):
|
||||
raise ValueError(f"Task estimability classification failed or returned mismatched number of results. Expected {len(df_unique_tasks)}, got {len(results) if results else 0}.")
|
||||
|
||||
classifications = []
|
||||
for index, response in enumerate(results):
|
||||
task_label = df_unique_tasks.iloc[index]['task']
|
||||
task_category_flag = None
|
||||
|
||||
if response is None:
|
||||
logger.warning(f"API call failed for task (enrich returned None): '{task_label}'")
|
||||
else:
|
||||
try:
|
||||
content_str = response.choices[0].message.content
|
||||
if not content_str:
|
||||
raise ValueError("No content found in the response message")
|
||||
|
||||
data = json.loads(content_str)
|
||||
|
||||
if 'task_category' in data and isinstance(data['task_category'], str):
|
||||
task_category_flag = data['task_category']
|
||||
else:
|
||||
logger.warning(f"Invalid or missing 'task_category' payload for task '{task_label}'. Data: '{data}'")
|
||||
except (json.JSONDecodeError, AttributeError, KeyError, IndexError, ValueError) as e:
|
||||
logger.warning(f"Could not parse response for task '{task_label}'. Error: {e}. Response: {response}")
|
||||
|
||||
classifications.append({
|
||||
'task': task_label,
|
||||
'estimable': task_category_flag == 'ATOMIC'
|
||||
})
|
||||
|
||||
classification_df = pd.DataFrame(classifications)
|
||||
|
||||
logger.info(f"Finished classification. Got {classification_df['estimable'].notna().sum()} successful classifications out of {len(df_unique_tasks)} unique tasks.")
|
||||
|
||||
logger.info(f"Saving task estimability classifications to {CACHE_PATH}")
|
||||
classification_df.to_parquet(CACHE_PATH)
|
||||
|
||||
return classification_df
|
||||
|
||||
|
||||
def generate_time_estimates_for_tasks(cache_dir: Path, df_to_process: pd.DataFrame, bust: bool = False) -> pd.DataFrame:
|
||||
CACHE_PATH = cache_dir / f"task_estimates.{TIME_ESTIMATES_GENERATION_VERSION}.parquet"
|
||||
if CACHE_PATH.exists() and not bust:
|
||||
logger.info(f"Loading cached task estimates from {CACHE_PATH}")
|
||||
return pd.read_parquet(CACHE_PATH)
|
||||
|
||||
logger.info("Enriching tasks with time estimates.")
|
||||
|
||||
if df_to_process.empty:
|
||||
raise ValueError("No tasks to process for estimates.")
|
||||
|
||||
results = enrich(
|
||||
model="gpt-4.1-mini",
|
||||
rpm=5000,
|
||||
messages_to_process=[
|
||||
[
|
||||
{
|
||||
"role": "system",
|
||||
"content": """
|
||||
You are an expert assistant evaluating the time required for job tasks. Your goal is to estimate the 'effective time' range needed for a skilled human to complete the following job task **remotely**, without supervision.
|
||||
|
||||
'Effective time' is the active, focused work duration required to complete the task. Crucially, **exclude all waiting periods, delays, or time spent on other unrelated activities**. Think of it as the continuous, productive time investment needed if the worker could pause and resume instantly without cost.
|
||||
|
||||
Provide a lower and upper bound estimate for the 'effective time'. These bounds should capture the time within which approximately 80% of instances of performing this specific task are typically completed by a qualified individual.
|
||||
|
||||
Base your estimate on the provided task and the associated occupation and occupation description. Your estimate must be in one of the allowed units: minute, hour, day, week, month, trimester, semester, year.""".strip()
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": f"{row.task} done by {row.occupation_title} ({row.occupation_description})"
|
||||
}
|
||||
]
|
||||
for row in df_to_process.itertuples()
|
||||
],
|
||||
schema= {
|
||||
"name": "estimate_time",
|
||||
"strict": True,
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"lower_bound_estimate": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"quantity": {
|
||||
"type": "number",
|
||||
"description": "The numerical value for the lower bound of the estimate.",
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"enum": ALLOWED_UNITS,
|
||||
"description": "The unit of time for the lower bound.",
|
||||
},
|
||||
},
|
||||
"required": ["quantity", "unit"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
"upper_bound_estimate": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"quantity": {
|
||||
"type": "number",
|
||||
"description": "The numerical value for the upper bound of the estimate.",
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"enum": ALLOWED_UNITS,
|
||||
"description": "The unit of time for the upper bound.",
|
||||
},
|
||||
},
|
||||
"required": ["quantity", "unit"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
},
|
||||
"required": ["lower_bound_estimate", "upper_bound_estimate"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
},
|
||||
chunk_size=200,
|
||||
)
|
||||
|
||||
if not results or len(results) != len(df_to_process):
|
||||
raise ValueError(f"API call for task estimates failed or returned mismatched number of results. "
|
||||
f"Expected {len(df_to_process)}, got {len(results) if results else 0}.")
|
||||
|
||||
estimates = []
|
||||
for index, response in enumerate(results):
|
||||
row = df_to_process.iloc[index]
|
||||
task_info = f"O*NET: {row.onetsoc_code}, Task ID: {row.task_id}"
|
||||
lb_qty, lb_unit, ub_qty, ub_unit = None, None, None, None
|
||||
|
||||
if response is None:
|
||||
logger.warning(f"API call failed for task (enrich returned None): {task_info}")
|
||||
else:
|
||||
try:
|
||||
content_str = response.choices[0].message.content
|
||||
if not content_str:
|
||||
raise ValueError("No content found in the response message")
|
||||
|
||||
data = json.loads(content_str)
|
||||
|
||||
lb_qty = data['lower_bound_estimate']['quantity']
|
||||
lb_unit = data['lower_bound_estimate']['unit']
|
||||
ub_qty = data['upper_bound_estimate']['quantity']
|
||||
ub_unit = data['upper_bound_estimate']['unit']
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not parse valid estimate for task {task_info}. Error: {e}. Response: {response}")
|
||||
lb_qty, lb_unit, ub_qty, ub_unit = None, None, None, None # Reset on failure
|
||||
|
||||
estimates.append({
|
||||
'onetsoc_code': row.onetsoc_code,
|
||||
'task_id': row.task_id,
|
||||
'lb_estimate_qty': lb_qty,
|
||||
'lb_estimate_unit': lb_unit,
|
||||
'ub_estimate_qty': ub_qty,
|
||||
'ub_estimate_unit': ub_unit
|
||||
})
|
||||
|
||||
estimates_df = pd.DataFrame(estimates)
|
||||
logger.info(f"Finished estimates. Got {estimates_df['lb_estimate_qty'].notna().sum()} successful estimates out of {len(df_to_process)} tasks.")
|
||||
|
||||
logger.info(f"Saving task estimates to {CACHE_PATH}")
|
||||
estimates_df.to_parquet(CACHE_PATH)
|
||||
|
||||
return estimates_df
|
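The plot generators further down rely on derived columns such as lb_estimate_in_minutes, ub_estimate_in_minutes, estimate_midpoint and estimate_range, which are not computed in this file. A rough sketch of that post-processing step, assuming a simple unit-to-minutes mapping (the conversion factors below are illustrative assumptions, not values taken from the pipeline):

import pandas as pd

# Assumed factors: an 8-hour working day, 5-day week, 21 working days per month.
MINUTES_PER_UNIT = {
    "minute": 1,
    "hour": 60,
    "day": 8 * 60,
    "week": 5 * 8 * 60,
    "month": 21 * 8 * 60,
    "trimester": 3 * 21 * 8 * 60,
    "semester": 6 * 21 * 8 * 60,
    "year": 12 * 21 * 8 * 60,
}

def add_minute_columns(estimates_df: pd.DataFrame) -> pd.DataFrame:
    """Convert (quantity, unit) estimate pairs into minute-based columns."""
    out = estimates_df.copy()
    out["lb_estimate_in_minutes"] = out["lb_estimate_qty"] * out["lb_estimate_unit"].map(MINUTES_PER_UNIT)
    out["ub_estimate_in_minutes"] = out["ub_estimate_qty"] * out["ub_estimate_unit"].map(MINUTES_PER_UNIT)
    out["estimate_midpoint"] = (out["lb_estimate_in_minutes"] + out["ub_estimate_in_minutes"]) / 2
    out["estimate_range"] = out["ub_estimate_in_minutes"] - out["lb_estimate_in_minutes"]
    return out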
|
@ -1,35 +0,0 @@
|
|||
OCCUPATION_MAJOR_CODES = {
|
||||
'11': 'Management',
|
||||
'13': 'Business & Financial',
|
||||
'15': 'Computer & Mathematical',
|
||||
'17': 'Architecture & Engineering',
|
||||
'19': 'Life, Physical, & Social Science',
|
||||
'21': 'Community & Social Service',
|
||||
'23': 'Legal',
|
||||
'25': 'Education, Training, & Library',
|
||||
'27': 'Arts, Design, & Media',
|
||||
'29': 'Healthcare Practitioners',
|
||||
'31': 'Healthcare Support',
|
||||
'33': 'Protective Service',
|
||||
'35': 'Food Preparation & Serving',
|
||||
'37': 'Building & Grounds Maintenance',
|
||||
'39': 'Personal Care & Service',
|
||||
'41': 'Sales & Related',
|
||||
'43': 'Office & Admin Support',
|
||||
'45': 'Farming, Fishing, & Forestry',
|
||||
'47': 'Construction & Extraction',
|
||||
'49': 'Installation, Maintenance, & Repair',
|
||||
'51': 'Production',
|
||||
'53': 'Transportation & Material Moving',
|
||||
'55': 'Military Specific',
|
||||
}
|
||||
|
||||
GRAY = {'50':'#f8fafc','100':'#f1f5f9','200':'#e2e8f0',
|
||||
'300':'#cbd5e1','400':'#94a3b8','500':'#64748b',
|
||||
'600':'#475569','700':'#334155','800':'#1e293b',
|
||||
'900':'#0f172a','950':'#020617'}
|
||||
|
||||
LIME = {'50': '#f7fee7','100': '#ecfcca','200': '#d8f999',
|
||||
'300': '#bbf451','400': '#9ae600','500': '#83cd00',
|
||||
'600': '#64a400','700': '#497d00','800': '#3c6300',
|
||||
'900': '#35530e','950': '#192e03'}
|
|
@ -1,97 +0,0 @@
|
|||
"""
|
||||
This module contains enrichment steps. They take time to run and are usually expensive (API calls),
|
||||
so they should manage their own state and only re-run when the input data's version differs from
|
||||
the version recorded in their saved output.
|
||||
"""
|
||||
from .run import Run
|
||||
import pandas as pd
|
||||
from typing import Any, List, Dict
|
||||
import litellm
|
||||
|
||||
def enrich(
|
||||
model: str,
|
||||
rpm: int,
|
||||
messages_to_process: List[List[Dict[str, str]]],
|
||||
schema: Dict[str, Any],
|
||||
chunk_size: int = 100,
|
||||
):
|
||||
# Use litellm.batch_completion
|
||||
pass
|
||||
|
||||
def enrich_with_task_estimateability(run: Run) -> pd.DataFrame:
|
||||
output_path = run.cache_dir / "computed_task_estimateability.parquet"
|
||||
if output_path.exists():
|
||||
print(f"Loading cached task estimateability from {output_path}")
|
||||
return pd.read_parquet(output_path)
|
||||
|
||||
df_remote_tasks = run.df_tasks[run.df_tasks['remote_status'] == 'remote'].copy()
|
||||
|
||||
# In the old script, we only passed unique tasks to the API
|
||||
df_unique_tasks = df_remote_tasks.drop_duplicates(subset=['task'])
|
||||
|
||||
|
||||
results = enrich(
|
||||
model="gpt-4.1-mini",
|
||||
rpm=5000,
|
||||
messages_to_process=[
|
||||
[
|
||||
{"role": "system", "content": """
|
||||
Judge whether the provided O*NET task is suitable for a time estimate. If it is a single, clearly-bounded activity, typically lasting minutes, hours, or a few days, then clearly yes. If it is a continuous responsibility or behavioural norm with no schedulable duration (e.g., “follow confidentiality rules,” “serve as department head”), then clearly no.
|
||||
"""},
|
||||
{"role": "user", "content": f"Task: {row.task}"},
|
||||
]
|
||||
for row in df_unique_tasks.itertuples()
|
||||
],
|
||||
schema={
|
||||
"type": "object",
|
||||
"properties": {"estimateable": {"type": "bool"}},
|
||||
"required": ["estimateable"]
|
||||
},
|
||||
chunk_size=300,
|
||||
)
|
||||
|
||||
# Create a new dataframe with just enough information to identify the task uniquely + estimateability classification, save it, return it. Careful: the "task" column in itself is not unique.
|
||||
return pd.DataFrame()
|
||||
|
||||
def enrich_with_task_estimates(run: Run) -> pd.DataFrame:
|
||||
output_path = run.cache_dir / "computed_task_estimates.parquet"
|
||||
if output_path.exists():
|
||||
print(f"Loading cached task estimates from {output_path}")
|
||||
return pd.read_parquet(output_path)
|
||||
|
||||
df = ... # todo
|
||||
|
||||
results = enrich(
|
||||
model="gpt-4.1-mini",
|
||||
rpm=5000,
|
||||
messages_to_process=[
|
||||
[
|
||||
{"role": "system", "content": "Estimate the time required to complete the following O*NET task. Your estimate should be a plausible range for how long it might take a typical, qualified worker to perform this task once. Provide your answer as a time range (lower and upper bounds). Do not provide explanations or apologies. If the task is not suitable for a time estimate (e.g., it is an ongoing responsibility), interpret it as a single, schedulable action."},
|
||||
{"role": "user", "content": f"""
|
||||
Task: {row.task}
|
||||
For Occupation: {row.occupation_title}
|
||||
Occupation Description: {row.occupation_description}"""}
|
||||
]
|
||||
for row in df.itertuples()
|
||||
],
|
||||
schema={
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"lower_bound_estimate": {
|
||||
"type": "object",
|
||||
"properties": {"quantity": {"type": "number"}, "unit": {"type": "string", "enum": ["minutes", "hours", "days"]}},
|
||||
"required": ["quantity", "unit"],
|
||||
},
|
||||
"upper_bound_estimate": {
|
||||
"type": "object",
|
||||
"properties": {"quantity": {"type": "number"}, "unit": {"type": "string", "enum": ["minutes", "hours", "days"]}},
|
||||
"required": ["quantity", "unit"],
|
||||
},
|
||||
},
|
||||
"required": ["lower_bound_estimate", "upper_bound_estimate"],
|
||||
},
|
||||
chunk_size=200,
|
||||
)
|
||||
|
||||
# Create a new dataframe with just enough information to identify the task uniquely + the estimates classification, save it, return it. Careful: the "task" column in itself is not unique.
|
||||
raise NotImplementedError
|
|
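The deleted module above only stubbed enrich() with a note to use litellm.batch_completion, while the new pipeline/classification.py imports a working enrich from .utils that is not shown in this diff. A minimal sketch of what such a helper could look like, assuming it chunks the message lists, throttles to the requested requests-per-minute, and forwards the JSON schema through litellm's structured-output response_format (the real implementation may differ):

import time
from typing import Any, Dict, List, Optional

import litellm

def enrich(
    model: str,
    rpm: int,
    messages_to_process: List[List[Dict[str, str]]],
    schema: Dict[str, Any],
    chunk_size: int = 100,
) -> List[Optional[Any]]:
    """Run batched structured-output completions, preserving input order."""
    results: List[Optional[Any]] = []
    for start in range(0, len(messages_to_process), chunk_size):
        chunk = messages_to_process[start:start + chunk_size]
        t0 = time.time()
        try:
            responses = litellm.batch_completion(
                model=model,
                messages=chunk,
                response_format={"type": "json_schema", "json_schema": schema},
            )
        except Exception:
            responses = [None] * len(chunk)  # mark the whole chunk as failed
        results.extend(responses)
        # Naive rate limiting: a chunk of N requests should take at least N * 60 / rpm seconds.
        min_duration = len(chunk) * 60 / rpm if rpm > 0 else 0
        elapsed = time.time() - t0
        if start + chunk_size < len(messages_to_process) and elapsed < min_duration:
            time.sleep(min_duration - elapsed)
    return results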
@ -1,50 +1,30 @@
|
|||
"""
|
||||
Fetchers retrieve remote data and return it in a format suitable for further processing; they also return its version, which should be considered opaque, though it is usually a checksum.
|
||||
"""
|
||||
|
||||
import sqlite3
|
||||
from typing import Tuple
|
||||
import pandas as pd
|
||||
import requests
|
||||
import io
|
||||
import zipfile
|
||||
from pipeline.run import Run
|
||||
from pipeline.logger import logger
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
from .logger import logger
|
||||
from typing import Tuple, Dict
|
||||
|
||||
def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
|
||||
"""
|
||||
Downloads the O*NET database, creates a local SQLite file from it, and returns a connection.
|
||||
"""
|
||||
version = "29_1"
|
||||
url = f"https://www.onetcenter.org/dl_files/database/db_{version}_mysql.zip"
|
||||
db_path = run.cache_dir / f"onet_{version}.db"
|
||||
run.meta.fetchers['onet'] = {
|
||||
'url': url,
|
||||
'version': version,
|
||||
'db_path': str(db_path),
|
||||
}
|
||||
ONET_VERSION = "29_1"
|
||||
ONET_URL = f"https://www.onetcenter.org/dl_files/database/db_{ONET_VERSION}_mysql.zip"
|
||||
|
||||
if db_path.exists():
|
||||
logger.info(f"Using cached O*NET database: {db_path}")
|
||||
conn = sqlite3.connect(db_path)
|
||||
return conn, version
|
||||
def fetch_onet_database(cache_dir: Path) -> sqlite3.Connection:
|
||||
DB_PATH = cache_dir / f"onet_{ONET_VERSION}.db"
|
||||
|
||||
logger.info(f"Downloading O*NET database from {url}")
|
||||
response = requests.get(url, stream=True, headers={
|
||||
if DB_PATH.exists():
|
||||
logger.info(f"Using cached O*NET database: {DB_PATH}")
|
||||
return sqlite3.connect(DB_PATH)
|
||||
|
||||
logger.info(f"Downloading O*NET database from {ONET_URL}")
|
||||
response = requests.get(ONET_URL, stream=True, headers={
|
||||
"User-Agent": "econ-agent/1.0"
|
||||
})
|
||||
response.raise_for_status()
|
||||
|
||||
# Read content into memory
|
||||
zip_content = response.content
|
||||
|
||||
db_path = run.cache_dir / f"onet_{version}.db"
|
||||
|
||||
logger.info(f"Creating new O*NET database: {db_path}")
|
||||
conn = sqlite3.connect(db_path)
|
||||
|
||||
# Set performance PRAGMAs for fast import
|
||||
logger.info("Creating new SQLite database with performance settings")
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
conn.executescript("""
|
||||
PRAGMA journal_mode = OFF;
|
||||
PRAGMA synchronous = 0;
|
||||
|
@ -54,6 +34,7 @@ def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
|
|||
PRAGMA foreign_keys = ON;
|
||||
""")
|
||||
|
||||
zip_content = response.content
|
||||
with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
|
||||
sql_scripts = []
|
||||
for filename in sorted(z.namelist()):
|
||||
|
@ -63,14 +44,10 @@ def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
|
|||
if not sql_scripts:
|
||||
raise RuntimeError("No SQL files found in the O*NET zip archive.")
|
||||
|
||||
# Combine and execute all SQL files in one transaction
|
||||
full_script = "BEGIN TRANSACTION;\n" + "\n".join(sql_scripts) + "\nCOMMIT;"
|
||||
|
||||
logger.info("Executing SQL files in alphabetical order (single transaction mode)")
|
||||
full_script = "BEGIN TRANSACTION;\n" + "\n".join(sql_scripts) + "\nCOMMIT;"
|
||||
conn.executescript(full_script)
|
||||
logger.info("Database populated successfully. Restoring reliability settings...")
|
||||
|
||||
# Restore reliability-focused settings after import
|
||||
conn.executescript("""
|
||||
PRAGMA journal_mode = WAL;
|
||||
PRAGMA synchronous = NORMAL;
|
||||
|
@ -81,87 +58,75 @@ def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
|
|||
""")
|
||||
conn.execute("VACUUM;")
|
||||
conn.commit()
|
||||
logger.info("Reliability settings restored and database optimized successfully!")
|
||||
|
||||
return conn, version
|
||||
return conn
|
||||
|
||||
def fetch_oesm_data(run: Run) -> Tuple[pd.DataFrame, str]:
|
||||
"""
|
||||
Downloads the OESM national data from the BLS website.
|
||||
"""
|
||||
version = "23"
|
||||
url = f"https://www.bls.gov/oes/special-requests/oesm{version}nat.zip"
|
||||
parquet_path = run.cache_dir / "oesm.parquet"
|
||||
run.meta.fetchers['oesm'] = {
|
||||
'url': url,
|
||||
'version': version,
|
||||
'parquet_path': str(parquet_path),
|
||||
}
|
||||
def fetch_oesm_data(cache_dir: Path) -> pd.DataFrame:
|
||||
VERSION = "23"
|
||||
URL = f"https://www.bls.gov/oes/special-requests/oesm{VERSION}nat.zip"
|
||||
DATA_PATH = cache_dir / "oesm.parquet"
|
||||
|
||||
if parquet_path.exists():
|
||||
logger.info(f"Using cached OESM data: {parquet_path}")
|
||||
return pd.read_parquet(parquet_path), version
|
||||
if DATA_PATH.exists():
|
||||
logger.info(f"Using cached OESM data: {DATA_PATH}")
|
||||
return pd.read_parquet(DATA_PATH)
|
||||
|
||||
logger.info(f"Downloading OESM data from {url}")
|
||||
logger.info(f"Downloading OESM data from {URL}")
|
||||
headers = {'User-Agent': 'econ-agent/1.0'}
|
||||
response = requests.get(url, headers=headers)
|
||||
response = requests.get(URL, headers=headers)
|
||||
response.raise_for_status()
|
||||
|
||||
zip_content = response.content
|
||||
logger.info(f"OESM data version: {version}")
|
||||
|
||||
logger.info(f"Creating new OESM data cache: {parquet_path}")
|
||||
logger.info(f"Creating new OESM data cache: {DATA_PATH}")
|
||||
with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
|
||||
# Find the excel file in the zip
|
||||
excel_filename = None
|
||||
for filename in z.namelist():
|
||||
logger.debug(f"Found file in OESM zip: {filename}")
|
||||
if filename.lower().endswith(".xlsx"):
|
||||
excel_filename = filename
|
||||
break
|
||||
|
||||
if excel_filename is None:
|
||||
raise FileNotFoundError("Could not find the Excel file in the OESM zip archive.")
|
||||
|
||||
logger.info(f"Reading {excel_filename} from zip archive.")
|
||||
with z.open(excel_filename) as f:
|
||||
with z.open(f"oesm{VERSION}national.xlsx") as f:
|
||||
df = pd.read_excel(f, engine='openpyxl', na_values=['*', '#'])
|
||||
|
||||
df.to_parquet(parquet_path)
|
||||
logger.info(f"Saved OESM data to cache: {parquet_path}")
|
||||
return df, version
|
||||
df.to_parquet(DATA_PATH)
|
||||
logger.info(f"Saved OESM data to cache: {DATA_PATH}")
|
||||
return df
|
||||
|
||||
def fetch_epoch_remote_data(run: Run) -> Tuple[pd.DataFrame, str]:
|
||||
"""
|
||||
Downloads the EPOCH AI remote work task data.
|
||||
"""
|
||||
# This is the direct download link constructed from the Google Drive share link
|
||||
version = "latest"
|
||||
url = "https://drive.google.com/uc?export=download&id=1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r"
|
||||
parquet_path = run.cache_dir / f"epoch_remote_{version}.parquet"
|
||||
run.meta.fetchers['epoch_remote'] = {
|
||||
'url': url,
|
||||
'version': version,
|
||||
'parquet_path': str(parquet_path),
|
||||
}
|
||||
def fetch_epoch_remote_data(cache_dir: Path) -> pd.DataFrame:
|
||||
URL = "https://drive.google.com/uc?export=download&id=1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r"
|
||||
DATA_PATH = cache_dir / f"epoch_remote_latest.parquet"
|
||||
|
||||
if parquet_path.exists():
|
||||
logger.info(f"Using cached EPOCH remote data: {parquet_path}")
|
||||
return pd.read_parquet(parquet_path), version
|
||||
if DATA_PATH.exists():
|
||||
logger.info(f"Using cached EPOCH remote data: {DATA_PATH}")
|
||||
return pd.read_parquet(DATA_PATH)
|
||||
|
||||
logger.info(f"Downloading EPOCH remote data from Google Drive: {url}")
|
||||
logger.info(f"Downloading EPOCH remote data from Google Drive: {URL}")
|
||||
|
||||
# Need to handle potential cookies/redirects from Google Drive
|
||||
session = requests.Session()
|
||||
session.headers.update({"User-Agent": "econ-agent/1.0"})
|
||||
response = session.get(url, stream=True)
|
||||
response = session.get(URL, stream=True)
|
||||
response.raise_for_status()
|
||||
|
||||
csv_content = response.content
|
||||
|
||||
logger.info(f"Creating new EPOCH remote data cache: {parquet_path}")
|
||||
logger.info(f"Creating new EPOCH remote data cache: {DATA_PATH}")
|
||||
df = pd.read_csv(io.BytesIO(csv_content))
|
||||
df.to_parquet(parquet_path)
|
||||
logger.info(f"Saved EPOCH remote data to cache: {parquet_path}")
|
||||
df.to_parquet(DATA_PATH)
|
||||
|
||||
return df, version
|
||||
return df
|
||||
|
||||
def fetch_metr_data(cache_dir: Path) -> Dict:
|
||||
URL = "https://metr.org/assets/benchmark_results.yaml"
|
||||
DATA_PATH = cache_dir / "metr_benchmark_results.yaml"
|
||||
|
||||
if DATA_PATH.exists():
|
||||
logger.info(f"Using cached METR data: {DATA_PATH}")
|
||||
with open(DATA_PATH, "r") as f:
|
||||
return yaml.safe_load(f)
|
||||
|
||||
logger.info(f"Downloading METR data from {URL}")
|
||||
headers = {"User-Agent": "econ-agent/1.0"}
|
||||
response = requests.get(URL, headers=headers)
|
||||
response.raise_for_status()
|
||||
|
||||
yaml_content = response.content
|
||||
|
||||
logger.info(f"Creating new METR data cache: {DATA_PATH}")
|
||||
with open(DATA_PATH, "wb") as f:
|
||||
f.write(yaml_content)
|
||||
|
||||
return yaml.safe_load(yaml_content)
|
||||
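For reference, the projection generator further down walks this dictionary as results -> <model> -> release_date plus agents -> <agent> -> <percentile key> -> estimate, passing 'p50_horizon_length' as the percentile key. A sketch of the assumed shape, inferred from that consuming code rather than from METR's published schema (values are made up):

metr_results = {
    "results": {
        "some-model": {
            "release_date": "2024-05-01",
            "agents": {
                "some-agent": {
                    "p50_horizon_length": {"estimate": 35.0},  # horizon in minutes
                },
            },
        },
    },
}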
|
|
|
@ -1,5 +1,15 @@
|
|||
from .estimate_histplot import generate_estimate_histplot
|
||||
from .estimates_spread_per_occupation import generate_estimate_spread_per_occupation
|
||||
from .estimates_lower_vs_upper_scatter import generate_estimates_lower_vs_upper_scatter
|
||||
from .sequential_coherence_cdf import plot_sequential_coherence_cdf
|
||||
from .projected_automatable_wage_bill import generate_projected_automatable_wage_bill
|
||||
from .projected_task_automation import generate_projected_task_automation_plot
|
||||
|
||||
GENERATORS = [
|
||||
generate_estimate_histplot
|
||||
generate_estimate_histplot,
|
||||
generate_estimate_spread_per_occupation,
|
||||
generate_estimates_lower_vs_upper_scatter,
|
||||
#plot_sequential_coherence_cdf,
|
||||
generate_projected_automatable_wage_bill,
|
||||
generate_projected_task_automation_plot,
|
||||
]
|
||||
|
|
|
@ -1,6 +1,32 @@
|
|||
from ..run import Run
|
||||
from pathlib import Path
|
||||
from typing import Generator
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
import pandas as pd
|
||||
from ..utils import style_plot
|
||||
|
||||
def generate_estimate_histplot(run: Run) -> Generator[Path]:
|
||||
raise NotImplementedError
|
||||
def generate_estimate_histplot(output_dir: Path, df: pd.DataFrame, **kwargs) -> Generator[Path]:
|
||||
"""
|
||||
Generates a styled histogram of the distribution of midpoint time estimates.
|
||||
"""
|
||||
style_plot()
|
||||
OUTPUT_PATH = output_dir / "estimate_distribution_histplot.png"
|
||||
|
||||
fig, ax = plt.subplots()
|
||||
|
||||
sns.histplot(
|
||||
data=df,
|
||||
x='estimate_midpoint',
|
||||
log_scale=True,
|
||||
ax=ax
|
||||
)
|
||||
|
||||
ax.set_xlabel("Task Time (minutes, log scale)")
|
||||
ax.set_ylabel("Number of Tasks")
|
||||
ax.set_title("Distribution of Time Estimates for Atomic Tasks")
|
||||
|
||||
plt.tight_layout()
|
||||
plt.savefig(OUTPUT_PATH)
|
||||
plt.close(fig)
|
||||
|
||||
yield OUTPUT_PATH
|
||||
|
|
56
pipeline/generators/estimates_lower_vs_upper_scatter.py
Normal file
|
@ -0,0 +1,56 @@
|
|||
from pathlib import Path
|
||||
from typing import Generator
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
import pandas as pd
|
||||
from ..utils import OCCUPATION_MAJOR_CODES, style_plot
|
||||
|
||||
|
||||
def generate_estimates_lower_vs_upper_scatter(output_dir: Path, df: pd.DataFrame, **kwargs) -> Generator[Path]:
|
||||
"""
|
||||
Generates a styled scatter plot of lower-bound vs upper-bound time estimates for tasks.
|
||||
"""
|
||||
style_plot()
|
||||
OUTPUT_PATH = output_dir / "estimates_lower_vs_upper_scatter.png"
|
||||
|
||||
plot_df = df.copy()
|
||||
# Replace onetsoc_major codes with their corresponding labels for the plot legend
|
||||
plot_df['onetsoc_major'] = plot_df['onetsoc_major'].map(OCCUPATION_MAJOR_CODES)
|
||||
|
||||
fig, ax = plt.subplots(figsize=(12, 10))
|
||||
sns.scatterplot(
|
||||
data=plot_df,
|
||||
x='lb_estimate_in_minutes',
|
||||
y='ub_estimate_in_minutes',
|
||||
alpha=0.3,
|
||||
edgecolor=None,
|
||||
hue="onetsoc_major",
|
||||
ax=ax
|
||||
)
|
||||
|
||||
# 45° reference line (y=x)
|
||||
lims = (
|
||||
min(df['lb_estimate_in_minutes'].min(), df['ub_estimate_in_minutes'].min()),
|
||||
max(df['lb_estimate_in_minutes'].max(), df['ub_estimate_in_minutes'].max())
|
||||
)
|
||||
lims = (lims[0] * 0.9, lims[1] * 1.1)
|
||||
ax.plot(lims, lims, color='black', linestyle='--', linewidth=1, zorder=0)
|
||||
|
||||
# Optional helper lines for ratios
|
||||
for k in [2, 10, 100]:
|
||||
ax.plot(lims, [k*l for l in lims],
|
||||
linestyle=':', color='grey', linewidth=1, zorder=0)
|
||||
|
||||
ax.set_xscale('log')
|
||||
ax.set_yscale('log')
|
||||
ax.set_xlabel('Lower-bound (min, log scale)')
|
||||
ax.set_ylabel('Upper-bound (min, log scale)')
|
||||
ax.set_title('Lower vs Upper Estimates for All Tasks')
|
||||
|
||||
ax.legend(title="Occupation Major Group", bbox_to_anchor=(1.02, 1), loc='upper left')
|
||||
|
||||
plt.tight_layout()
|
||||
plt.savefig(OUTPUT_PATH, bbox_inches='tight')
|
||||
plt.close(fig)
|
||||
|
||||
yield OUTPUT_PATH
|
39
pipeline/generators/estimates_spread_per_occupation.py
Normal file
|
@ -0,0 +1,39 @@
|
|||
from pathlib import Path
|
||||
from typing import Generator
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
import pandas as pd
|
||||
from ..utils import OCCUPATION_MAJOR_CODES, style_plot
|
||||
|
||||
|
||||
def generate_estimate_spread_per_occupation(output_dir: Path, df: pd.DataFrame, **kwargs) -> Generator[Path]:
|
||||
"""
|
||||
Generates a styled boxplot of the estimate range spread per major occupation group.
|
||||
"""
|
||||
style_plot()
|
||||
OUTPUT_PATH = output_dir / "estimates_spread_per_occupation.png"
|
||||
|
||||
fig, ax = plt.subplots(figsize=(10, 12))
|
||||
|
||||
sns.boxplot(
|
||||
data=df,
|
||||
x='onetsoc_major',
|
||||
y='estimate_range',
|
||||
showfliers=False,
|
||||
ax=ax
|
||||
)
|
||||
|
||||
ax.set_yscale('log')
|
||||
ax.set_xlabel('Occupation')
|
||||
ax.set_ylabel('Range (upper-lower, minutes)')
|
||||
ax.set_title('Spread of time-range estimates per occupation')
|
||||
|
||||
# Get occupation labels from codes for x-axis ticks
|
||||
labels = [OCCUPATION_MAJOR_CODES.get(code.get_text(), code.get_text()) for code in ax.get_xticklabels()]
|
||||
ax.set_xticklabels(labels, rotation=60, ha='right')
|
||||
|
||||
plt.tight_layout()
|
||||
plt.savefig(OUTPUT_PATH)
|
||||
plt.close(fig)
|
||||
|
||||
yield OUTPUT_PATH
|
|
@ -1,6 +0,0 @@
|
|||
import pandas as pd
|
||||
from typing import List
|
||||
|
||||
def must_have_columns(df: pd.DataFrame, columns: List[str]):
|
||||
if not all(col in df.columns for col in columns):
|
||||
raise ValueError(f"DataFrame is missing required columns: {columns}")
|
229
pipeline/generators/projected_automatable_wage_bill.py
Normal file
|
@ -0,0 +1,229 @@
|
|||
from pathlib import Path
|
||||
from typing import Generator, Dict, Tuple, Optional
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib.ticker as mticker
|
||||
from scipy.stats import linregress
|
||||
from datetime import datetime
|
||||
from ..utils import style_plot, LIME
|
||||
|
||||
def _generate_wage_projection_data(
|
||||
metr_results: Dict,
|
||||
df_with_wages: pd.DataFrame,
|
||||
percentile_key: str,
|
||||
doubling_time_modifier: float,
|
||||
) -> Optional[Tuple[pd.DataFrame, pd.DataFrame, float]]:
|
||||
"""
|
||||
Generates wage projection data for different AI progress scenarios.
|
||||
|
||||
Args:
|
||||
metr_results: The METR benchmark data.
|
||||
df_with_wages: DataFrame containing tasks with their estimated wage value.
|
||||
percentile_key: The percentile to use from METR data (e.g., 'p50_horizon_length').
|
||||
doubling_time_modifier: Multiplier for the doubling time (e.g., 1.0 for baseline,
|
||||
0.5 for optimistic, 2.0 for pessimistic).
|
||||
|
||||
Returns:
|
||||
A tuple of (metr_df, projection_df, doubling_time_days), or None if data is insufficient.
|
||||
"""
|
||||
all_model_data = []
|
||||
for model_name, data in metr_results.get("results", {}).items():
|
||||
for agent_name, agent_data in data.get("agents", {}).items():
|
||||
release_date_str = data.get("release_date")
|
||||
horizon = agent_data.get(percentile_key, {}).get("estimate")
|
||||
if release_date_str and horizon is not None:
|
||||
all_model_data.append({
|
||||
"release_date": release_date_str,
|
||||
"horizon_minutes": horizon,
|
||||
})
|
||||
|
||||
if not all_model_data:
|
||||
return None
|
||||
|
||||
metr_df = pd.DataFrame(all_model_data).sort_values("release_date").reset_index(drop=True)
|
||||
metr_df['release_date'] = pd.to_datetime(metr_df['release_date'])
|
||||
metr_df = metr_df[metr_df['horizon_minutes'] > 0].copy()
|
||||
|
||||
if len(metr_df) < 2:
|
||||
return None
|
||||
|
||||
metr_df['days_since_start'] = (metr_df['release_date'] - metr_df['release_date'].min()).dt.days
|
||||
log_y = np.log(metr_df['horizon_minutes'])
|
||||
slope, intercept, r_value, _, _ = linregress(metr_df['days_since_start'], log_y)
|
||||
|
||||
# Apply the scenario modifier to the doubling time
|
||||
base_doubling_time_days = np.log(2) / slope
|
||||
modified_doubling_time_days = base_doubling_time_days * doubling_time_modifier
|
||||
modified_slope = np.log(2) / modified_doubling_time_days
|
||||
|
||||
start_date = metr_df['release_date'].min()
|
||||
future_dates = pd.to_datetime(pd.date_range(start=start_date, end="2035-01-01", freq="ME"))
|
||||
future_days = (future_dates - start_date).days.to_numpy()
|
||||
|
||||
projected_log_horizon = intercept + modified_slope * future_days
|
||||
projected_horizon_minutes = np.exp(projected_log_horizon)
|
||||
|
||||
projection_df = pd.DataFrame({
|
||||
"date": future_dates,
|
||||
"projected_coherence_minutes": projected_horizon_minutes,
|
||||
})
|
||||
|
||||
# Calculate the total wage bill of tasks automated over time
|
||||
for bound in ["lb", "mid", "ub"]:
|
||||
col_name = 'estimate_midpoint' if bound == 'mid' else f'{bound}_estimate_in_minutes'
|
||||
projection_df[f"automatable_wage_bill_{bound}"] = projection_df["projected_coherence_minutes"].apply(
|
||||
lambda h: df_with_wages.loc[df_with_wages[col_name] <= h, 'wage_per_task'].sum()
|
||||
)
|
||||
|
||||
# Also calculate for the actual METR data points for plotting
|
||||
metr_df["automatable_wage_bill_mid"] = metr_df["horizon_minutes"].apply(
|
||||
lambda h: df_with_wages.loc[df_with_wages['estimate_midpoint'] <= h, 'wage_per_task'].sum()
|
||||
)
|
||||
|
||||
return metr_df, projection_df, modified_doubling_time_days
|
||||
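The fit above works in log space: with slope b from regressing ln(horizon) on days since the first release, the projection is exp(a + b*t), so the horizon doubles every ln(2)/b days; the scenario modifier simply rescales that doubling time (0.5x for optimistic, 2x for pessimistic). A quick self-contained check of that algebra, using assumed numbers:

import numpy as np

intercept, slope = 1.0, 0.005               # assumed fit: ln(horizon) = a + b * days
doubling_time_days = np.log(2) / slope      # ~138.6 days

h0 = np.exp(intercept)                      # horizon at t = 0
h1 = np.exp(intercept + slope * doubling_time_days)
assert np.isclose(h1 / h0, 2.0)             # the horizon doubles after ln(2)/b days

modifier = 2.0                              # pessimistic scenario: twice the doubling time
modified_slope = np.log(2) / (modifier * doubling_time_days)
assert np.isclose(np.log(2) / modified_slope, modifier * doubling_time_days)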
|
||||
|
||||
def _plot_scenario(ax, projection_df, metr_df, label, color, line_style='-'):
|
||||
"""Helper function to draw a single projection scenario on a given axis."""
|
||||
# Plot the projected wage bill
|
||||
ax.plot(
|
||||
projection_df["date"],
|
||||
projection_df["automatable_wage_bill_mid"],
|
||||
label=label,
|
||||
color=color,
|
||||
linewidth=2.5,
|
||||
linestyle=line_style,
|
||||
zorder=3
|
||||
)
|
||||
# Plot the shaded range for lower/upper bounds
|
||||
ax.fill_between(
|
||||
projection_df["date"],
|
||||
projection_df["automatable_wage_bill_lb"],
|
||||
projection_df["automatable_wage_bill_ub"],
|
||||
color=color,
|
||||
alpha=0.15,
|
||||
zorder=2
|
||||
)
|
||||
# Plot the actual METR data points against the wage bill
|
||||
ax.scatter(
|
||||
metr_df['release_date'],
|
||||
metr_df['automatable_wage_bill_mid'],
|
||||
color=color,
|
||||
edgecolor='black',
|
||||
s=60,
|
||||
zorder=4,
|
||||
label=f"Model Capabilities (P50)"
|
||||
)
|
||||
|
||||
|
||||
def generate_projected_automatable_wage_bill(
|
||||
output_dir: Path,
|
||||
df: pd.DataFrame,
|
||||
task_summary_by_occupation_df: pd.DataFrame,
|
||||
metr_results: Dict,
|
||||
**kwargs,
|
||||
) -> Generator[Path, None, None]:
|
||||
"""
|
||||
Generates a plot projecting the automatable wage bill under different
|
||||
AI progress scenarios (optimistic, baseline, pessimistic).
|
||||
"""
|
||||
style_plot()
|
||||
OUTPUT_PATH = output_dir / "projected_automatable_wage_bill_sensitivity.png"
|
||||
|
||||
# 1. Calculate wage_per_task for each occupation
|
||||
wage_bill_info = task_summary_by_occupation_df[['onetsoc_code', 'wage_bill', 'total_tasks']].copy()
|
||||
wage_bill_info['wage_per_task'] = wage_bill_info['wage_bill'] / wage_bill_info['total_tasks']
|
||||
wage_bill_info.replace([np.inf, -np.inf], 0, inplace=True) # Avoid division by zero issues
|
||||
wage_bill_info.drop(columns=['wage_bill', 'total_tasks'], inplace=True)
|
||||
|
||||
# 2. Merge wage_per_task into the main task dataframe
|
||||
df_with_wages = pd.merge(df, wage_bill_info, on='onetsoc_code', how='left')
|
||||
df_with_wages['wage_per_task'].fillna(0, inplace=True)
|
||||
|
||||
# 3. Generate data for all three scenarios
|
||||
scenarios = {
|
||||
"Optimistic": {"modifier": 0.5, "color": "tab:green", "style": "--"},
|
||||
"Baseline": {"modifier": 1.0, "color": LIME['600'], "style": "-"},
|
||||
"Pessimistic": {"modifier": 2.0, "color": "tab:red", "style": ":"},
|
||||
}
|
||||
|
||||
projection_results = {}
|
||||
for name, config in scenarios.items():
|
||||
result = _generate_wage_projection_data(metr_results, df_with_wages, 'p50_horizon_length', config['modifier'])
|
||||
if result:
|
||||
projection_results[name] = result
|
||||
|
||||
if not projection_results:
|
||||
print("Warning: Could not generate any projection data. Skipping wage bill plot.")
|
||||
return
|
||||
|
||||
# 4. Create the plot
|
||||
fig, ax = plt.subplots(figsize=(14, 9))
|
||||
|
||||
# We only need to plot the scatter points once, let's use the baseline ones.
|
||||
if "Baseline" in projection_results:
|
||||
metr_df, _, _ = projection_results["Baseline"]
|
||||
ax.scatter(
|
||||
metr_df['release_date'],
|
||||
metr_df['automatable_wage_bill_mid'],
|
||||
color='black',
|
||||
s=80,
|
||||
zorder=5,
|
||||
label=f"Model Capabilities (P50)"
|
||||
)
|
||||
|
||||
|
||||
legend_lines = []
|
||||
for name, (metr_df, proj_df, doubling_time) in projection_results.items():
|
||||
config = scenarios[name]
|
||||
ax.plot(
|
||||
proj_df["date"],
|
||||
proj_df["automatable_wage_bill_mid"],
|
||||
color=config['color'],
|
||||
linestyle=config['style'],
|
||||
linewidth=2.5,
|
||||
zorder=3
|
||||
)
|
||||
ax.fill_between(
|
||||
proj_df["date"],
|
||||
proj_df["automatable_wage_bill_lb"],
|
||||
proj_df["automatable_wage_bill_ub"],
|
||||
color=config['color'],
|
||||
alpha=0.15,
|
||||
zorder=2
|
||||
)
|
||||
# Create a custom line for the legend
|
||||
line = plt.Line2D([0], [0], color=config['color'], linestyle=config['style'], lw=2.5,
|
||||
label=f'{name} (Doubling Time: {doubling_time:.0f} days)')
|
||||
legend_lines.append(line)
|
||||
|
||||
|
||||
# 5. Styling and annotations
|
||||
ax.set_title("Projected Automatable Wage Bill (P50 Coherence)", fontsize=18, pad=20)
|
||||
ax.set_xlabel("Year", fontsize=12)
|
||||
ax.set_ylabel("Automatable Annual Wage Bill (Trillions of USD)", fontsize=12)
|
||||
|
||||
# Format Y-axis to show trillions
|
||||
def trillions_formatter(x, pos):
|
||||
return f'${x / 1e12:.1f}T'
|
||||
ax.yaxis.set_major_formatter(mticker.FuncFormatter(trillions_formatter))
|
||||
|
||||
total_wage_bill = df_with_wages['wage_per_task'].sum()
|
||||
ax.set_ylim(0, total_wage_bill * 1.05)
|
||||
|
||||
if "Baseline" in projection_results:
|
||||
_, proj_df, _ = projection_results["Baseline"]
|
||||
ax.set_xlim(datetime(2022, 1, 1), proj_df["date"].max())
|
||||
|
||||
# Create the legend from the custom lines and the scatter plot
|
||||
scatter_legend = ax.get_legend_handles_labels()[0]
|
||||
ax.legend(handles=legend_lines + scatter_legend, loc="upper left", fontsize=11)
|
||||
|
||||
ax.grid(True, which="both", linestyle="--", linewidth=0.5)
|
||||
plt.tight_layout()
|
||||
plt.savefig(OUTPUT_PATH)
|
||||
plt.close(fig)
|
||||
|
||||
print(f"Generated sensitivity analysis plot: {OUTPUT_PATH}")
|
||||
yield OUTPUT_PATH
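A minimal sketch (mine, not part of the diff) of the thresholding that turns a projected coherence horizon into an "automatable wage bill", using made-up numbers:

```python
import pandas as pd

# Hypothetical tasks: midpoint time estimate (minutes) and wage attributed to each task.
toy = pd.DataFrame({
    "estimate_midpoint": [15, 120, 2400],  # 15 min, 2 h, ~1.7 days
    "wage_per_task": [1_000.0, 5_000.0, 20_000.0],
})

# At a projected coherence horizon of 180 minutes, only tasks whose midpoint fits
# inside the horizon count toward the automatable wage bill.
horizon_minutes = 180
automatable = toy.loc[toy["estimate_midpoint"] <= horizon_minutes, "wage_per_task"].sum()
print(automatable)  # 6000.0: the 15-minute and 2-hour tasks, but not the multi-day one
```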
pipeline/generators/projected_task_automation.py (new file, 168 lines)
@@ -0,0 +1,168 @@
from pathlib import Path
from typing import Generator, Dict, Tuple
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import linregress
from datetime import datetime
from ..utils import style_plot, LIME


def _generate_projection_data(
    metr_results: Dict,
    df: pd.DataFrame,
    percentile_key: str,
) -> Tuple[pd.DataFrame, pd.DataFrame] | None:
    """
    Generates projection data for a given percentile key (e.g., 'p50_horizon_length').
    Returns a tuple of (metr_df_with_pct, projection_df), or None if data is insufficient.
    """
    # 1. Process METR data to get all model performance over time for the given percentile
    all_model_data = []
    for model_name, data in metr_results.get("results", {}).items():
        for agent_name, agent_data in data.get("agents", {}).items():
            release_date_str = data.get("release_date")
            horizon = agent_data.get(percentile_key, {}).get("estimate")

            if release_date_str and horizon is not None:
                unique_model_name = f"{model_name}-{agent_name}"
                all_model_data.append({
                    "model": unique_model_name,
                    "release_date": release_date_str,
                    "horizon_minutes": horizon,
                })

    if not all_model_data:
        print(f"Warning: No models with {percentile_key} found in METR data. Skipping.")
        return None

    metr_df = pd.DataFrame(all_model_data).sort_values("release_date").reset_index(drop=True)
    metr_df['release_date'] = pd.to_datetime(metr_df['release_date'])

    # 2. Perform log-linear regression on coherence over time
    metr_df = metr_df[metr_df['horizon_minutes'] > 0].copy()
    if len(metr_df) < 2:
        print(f"Warning: Not enough data points for regression for {percentile_key}. Skipping.")
        return None

    metr_df['days_since_start'] = (metr_df['release_date'] - metr_df['release_date'].min()).dt.days
    log_y = np.log(metr_df['horizon_minutes'])
    x = metr_df['days_since_start']

    slope, intercept, r_value, _, _ = linregress(x, log_y)
    doubling_time_days = np.log(2) / slope
    print(f"METR all models {percentile_key} trend: R^2 = {r_value**2:.2f}, Doubling time = {doubling_time_days:.1f} days")

    # 3. Project coherence into the future
    start_date = metr_df['release_date'].min()
    future_dates = pd.to_datetime(pd.date_range(start=start_date, end="2035-01-01", freq="ME"))
    future_days = (future_dates - start_date).days.to_numpy()

    projected_log_horizon = intercept + slope * future_days
    projected_horizon_minutes = np.exp(projected_log_horizon)

    projection_df = pd.DataFrame({
        "date": future_dates,
        "projected_coherence_minutes": projected_horizon_minutes,
    })

    # 4. Calculate the percentage of tasks automated over time based on our estimates
    total_tasks = len(df)
    if total_tasks == 0:
        return None

    for bound in ["lb", "mid", "ub"]:
        col_name = 'estimate_midpoint' if bound == 'mid' else f'{bound}_estimate_in_minutes'
        projection_df[f"pct_automatable_{bound}"] = projection_df["projected_coherence_minutes"].apply(
            lambda h: (df[col_name] <= h).sum() / total_tasks * 100
        )

    metr_df["pct_automatable_mid"] = metr_df["horizon_minutes"].apply(
        lambda h: (df['estimate_midpoint'] <= h).sum() / total_tasks * 100
    )

    return metr_df, projection_df


def _plot_projection(ax, projection_df, metr_df, label, color, line_style='-'):
    """Helper function to draw a single projection on a given axis."""
    # Plot the projected automation percentage
    ax.plot(
        projection_df["date"],
        projection_df["pct_automatable_mid"],
        label="Mid-point",
        color=color,
        linewidth=2.5,
        linestyle=line_style,
        zorder=3
    )
    ax.fill_between(
        projection_df["date"],
        projection_df["pct_automatable_lb"],
        projection_df["pct_automatable_ub"],
        color=color,
        alpha=0.15,
        label="Lower/upper bound range",
        zorder=2
    )
    # Plot the actual METR data points
    ax.scatter(
        metr_df['release_date'],
        metr_df['pct_automatable_mid'],
        color=color,
        edgecolor='black',
        s=60,
        zorder=4,
        label=f"Model with {label[1:]}% success rate"
    )


def generate_projected_task_automation_plot(
    output_dir: Path,
    metr_results: Dict,
    df: pd.DataFrame,
    **kwargs,
) -> Generator[Path, None, None]:
    """
    Generates plots projecting task automation based on METR's p50 and p80
    coherence data.
    """
    style_plot()

    p50_data = _generate_projection_data(metr_results, df, 'p50_horizon_length')
    p80_data = _generate_projection_data(metr_results, df, 'p80_horizon_length')

    # Plot P50 alone
    if p50_data:
        p50_metr_df, p50_proj_df = p50_data
        fig, ax = plt.subplots(figsize=(12, 8))
        _plot_projection(ax, p50_proj_df, p50_metr_df, "P50", LIME['600'])
        ax.set_title("How long before sequential coherence stops being a bottleneck?", fontsize=16, pad=20)
        ax.set_xlabel("Year")
        ax.set_ylabel("% of tasks automatable (50% success rate)")
        ax.set_ylim(0, 100.5)
        ax.set_xlim(datetime(2022, 1, 1), p50_proj_df["date"].max())
        ax.grid(True, which="both", linestyle="--", linewidth=0.5)
        ax.legend(loc="upper left")
        plt.tight_layout()
        output_path = output_dir / "projected_task_automation_p50.png"
        plt.savefig(output_path)
        plt.close(fig)
        yield output_path

    # Plot P80 alone
    if p80_data:
        p80_metr_df, p80_proj_df = p80_data
        fig, ax = plt.subplots(figsize=(12, 8))
        _plot_projection(ax, p80_proj_df, p80_metr_df, "P80", 'tab:cyan')
        ax.set_title("Projected Task Automation (P80 AI Coherence)", fontsize=16, pad=20)
        ax.set_xlabel("Year")
        ax.set_ylabel("% of Estimable Economic Tasks Automatable")
        ax.set_ylim(0, 100.5)
        ax.set_xlim(datetime(2022, 1, 1), p80_proj_df["date"].max())
        ax.grid(True, which="both", linestyle="--", linewidth=0.5)
        ax.legend(loc="upper left")
        plt.tight_layout()
        output_path = output_dir / "projected_task_automation_p80.png"
        plt.savefig(output_path)
        plt.close(fig)
        yield output_path
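Quick check of the log-linear fit used above, on synthetic data with a known doubling time (a sketch, not part of the diff):

```python
import numpy as np
from scipy.stats import linregress

days = np.arange(0, 1000, 50)
true_doubling_days = 200
horizon_minutes = 10 * 2 ** (days / true_doubling_days)  # horizon doubles every 200 days

slope, intercept, r_value, _, _ = linregress(days, np.log(horizon_minutes))
print(np.log(2) / slope)  # ~200.0, the doubling time is recovered
print(r_value ** 2)       # ~1.0 on noiseless data
```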
pipeline/generators/sequential_coherence_cdf.py (new file, 54 lines)
@@ -0,0 +1,54 @@
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from ..utils import LIME, style_plot


def plot_sequential_coherence_cdf(output_dir: Path, df: pd.DataFrame, **kwargs):
    style_plot()
    output_path = output_dir / "sequential_coherence_cdf.png"

    def cdf(series):
        """Helper function to calculate CDF data."""
        s = series.sort_values().reset_index(drop=True)
        # Calculate cumulative percentage
        return s.values, ((s.index + 1) / len(s)) * 100

    # Calculate CDF for lower, upper, and midpoint estimates
    x_lb, y_lb = cdf(df['lb_estimate_in_minutes'])
    x_ub, y_ub = cdf(df['ub_estimate_in_minutes'])
    x_mid, y_mid = cdf(df['estimate_midpoint'])

    # Create the plot
    fig, ax = plt.subplots(figsize=(12, 7))

    # Plot the CDFs as step plots
    ax.step(x_lb, y_lb, where='post', color=LIME['300'], linewidth=1.8, linestyle='--', zorder=2, label='Lower bound estimate')
    ax.step(x_ub, y_ub, where='post', color=LIME['900'], linewidth=1.8, linestyle=':', zorder=3, label='Upper bound estimate')
    ax.step(x_mid, y_mid, where='post', color=LIME['600'], linewidth=2.2, zorder=4, label='Mid-point')

    # --- Styling and Annotations ---
    ax.set_xscale('log')
    ax.set_ylim(0, 100)
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(decimals=0))

    # Set titles and labels using the standard axes methods
    ax.set_title("% of Tasks With Sequential Coherence ≤ X")
    ax.set_xlabel("Sequential Coherence (X)")
    ax.set_ylabel("Cumulative Percentage of Tasks")

    # Define custom x-axis ticks and labels for better readability
    ticks = [1, 5, 10, 30, 60, 120, 240, 480, 1440, 2880, 10080, 43200, 129600, 259200, 525600]
    ticklabels = ['1 min', '5 min', '10 min', '30 min', '1 hr', '2 hr', '4 hr', '8 hr', '1 day', '2 days',
                  '1 wk', '30 days', '90 days', '180 days', '1 yr']
    ax.set_xticks(ticks)
    ax.set_xticklabels(ticklabels, rotation=45, ha='right')

    ax.legend(loc='lower right')

    # --- Save and close ---
    plt.tight_layout()
    plt.savefig(output_path, bbox_inches='tight')
    plt.close(fig)

    yield output_path
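A toy illustration of what the `cdf` helper returns (sketch, illustrative values):

```python
import pandas as pd

s = pd.Series([1440, 10, 60])  # minutes, deliberately unsorted
s = s.sort_values().reset_index(drop=True)
values, cum_pct = s.values, ((s.index + 1) / len(s)) * 100
print(values)   # [  10   60 1440]
print(cum_pct)  # 33.3%, 66.7%, 100.0%: e.g. two thirds of these tasks take at most an hour
```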
@@ -1,41 +0,0 @@
"""
|
||||
This module defines the Metadata model for the pipeline.
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import Dict, Any
|
||||
|
||||
class Metadata(BaseModel):
|
||||
"""
|
||||
A Pydantic model for storing pipeline metadata.
|
||||
|
||||
This class is intended to be instantiated once and passed through the
|
||||
pipeline. Each step in the pipeline can then add its own metadata.
|
||||
This provides a centralized and structured way to track data provenance,
|
||||
versions, and other important information.
|
||||
"""
|
||||
fetchers: Dict[str, Dict[str, Any]] = Field(default_factory=dict)
|
||||
enrichments: Dict[str, Dict[str, Any]] = Field(default_factory=dict)
|
||||
|
||||
ts: str = Field(default_factory=lambda: datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
|
||||
commit: str = Field(default_factory=lambda: _get_current_commit())
|
||||
|
||||
|
||||
def _get_current_commit() -> str:
|
||||
"""
|
||||
Returns the current git commit hash, "unknown", or "errored" depending on why the commit could not be retrieved.
|
||||
"""
|
||||
import subprocess
|
||||
try:
|
||||
# Get the current commit hash
|
||||
commit_hash = subprocess.check_output(
|
||||
["git", "rev-parse", "HEAD"], stderr=subprocess.PIPE, text=True
|
||||
).strip()
|
||||
return commit_hash
|
||||
except subprocess.CalledProcessError:
|
||||
# If git command fails (e.g., not a git repository)
|
||||
return "errored"
|
||||
except FileNotFoundError:
|
||||
# If git is not installed
|
||||
return "unknown"
@@ -1,140 +0,0 @@
from .run import Run
from .logger import logger
import pandas as pd
import numpy as np


def check_for_insanity(run: Run) -> Run:
    raise NotImplementedError


def create_df_tasks(run: Run) -> Run:
    """
    Creates a dataframe of tasks from the O*NET database, and merges it with remote status data.
    This replicates the logic from old/enrich_task_ratings.py and parts of old/analysis.py

    The resulting dataframe, `run.df_tasks`, will be used by the enrichment steps.
    """
    logger.info("Creating tasks dataframe")
    cache_path = run.cache_dir / f"onet_{run.onet_version}_tasks_with_remote_status.parquet"
    if cache_path.exists():
        logger.info(f"Loading cached tasks dataframe from {cache_path}")
        run.df_tasks = pd.read_parquet(cache_path)
        return run

    query = """
    SELECT
        tr.onetsoc_code,
        tr.task_id,
        ts.task,
        od.title AS occupation_title,
        od.description AS occupation_description,
        tr.scale_id,
        tr.category,
        tr.data_value,
        dr.dwa_title
    FROM
        task_ratings tr
    JOIN
        task_statements ts ON tr.task_id = ts.task_id
    JOIN
        occupation_data od ON tr.onetsoc_code = od.onetsoc_code
    LEFT JOIN
        tasks_to_dwas td ON tr.onetsoc_code = td.onetsoc_code AND tr.task_id = td.task_id
    LEFT JOIN
        dwa_reference dr ON td.dwa_id = dr.dwa_id;
    """
    df = pd.read_sql_query(query, run.onet_conn)
    logger.info(f"Fetched {len(df)} records (including DWA info) from the database.")

    # Separate ratings from DWAs
    core_cols = [
        "onetsoc_code", "task_id", "task", "occupation_title",
        "occupation_description", "scale_id", "category", "data_value"
    ]
    ratings_df = df[core_cols].drop_duplicates().reset_index(drop=True)

    dwa_cols = ["onetsoc_code", "task_id", "dwa_title"]
    dwas_df = df[dwa_cols].dropna(subset=["dwa_title"]).drop_duplicates().reset_index(drop=True)

    # 1. Handle Frequency (FT)
    logger.info("Processing Frequency data")
    freq_df = ratings_df[ratings_df["scale_id"] == "FT"].copy()
    if not freq_df.empty:
        freq_pivot = freq_df.pivot_table(
            index=["onetsoc_code", "task_id"],
            columns="category",
            values="data_value",
            fill_value=0,
        )
        freq_pivot.columns = [f"frequency_category_{int(col)}" for col in freq_pivot.columns]
    else:
        idx = pd.MultiIndex(levels=[[], []], codes=[[], []], names=["onetsoc_code", "task_id"])
        freq_pivot = pd.DataFrame(index=idx)

    # 2. Handle Importance (IM, IJ)
    logger.info("Processing Importance data")
    imp_df = ratings_df[ratings_df["scale_id"].isin(["IM", "IJ"])].copy()
    if not imp_df.empty:
        imp_avg = imp_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index()
        imp_avg.rename(columns={"data_value": "importance_average"}, inplace=True)
    else:
        imp_avg = pd.DataFrame(columns=["onetsoc_code", "task_id", "importance_average"])

    # 3. Handle Relevance (RT)
    logger.info("Processing Relevance data")
    rel_df = ratings_df[ratings_df["scale_id"] == "RT"].copy()
    if not rel_df.empty:
        rel_avg = rel_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index()
        rel_avg.rename(columns={"data_value": "relevance_average"}, inplace=True)
    else:
        rel_avg = pd.DataFrame(columns=["onetsoc_code", "task_id", "relevance_average"])

    # 4. Process DWAs
    logger.info("Processing DWA data")
    if not dwas_df.empty:
        dwas_grouped = dwas_df.groupby(["onetsoc_code", "task_id"])["dwa_title"].apply(list).reset_index()
        dwas_grouped.rename(columns={"dwa_title": "dwas"}, inplace=True)
    else:
        dwas_grouped = None

    # 5. Get Base Task/Occupation Info
    logger.info("Extracting base task/occupation info")
    base_cols = ["onetsoc_code", "task_id", "task", "occupation_title", "occupation_description"]
    base_info = ratings_df[base_cols].drop_duplicates().set_index(["onetsoc_code", "task_id"])

    # 6. Merge Processed ONET Data
    logger.info("Merging processed ONET data")
    final_df = base_info.merge(freq_pivot, left_index=True, right_index=True, how="left")
    final_df = final_df.reset_index()

    if not imp_avg.empty:
        final_df = final_df.merge(imp_avg, on=["onetsoc_code", "task_id"], how="left")
    else:
        final_df["importance_average"] = np.nan

    if not rel_avg.empty:
        final_df = final_df.merge(rel_avg, on=["onetsoc_code", "task_id"], how="left")
    else:
        final_df["relevance_average"] = np.nan

    if dwas_grouped is not None and not dwas_grouped.empty:
        final_df = final_df.merge(dwas_grouped, on=["onetsoc_code", "task_id"], how="left")
        if "dwas" in final_df.columns:
            final_df["dwas"] = final_df["dwas"].apply(lambda x: x if isinstance(x, list) else [])
    else:
        final_df["dwas"] = [[] for _ in range(len(final_df))]

    final_df = final_df.replace({np.nan: None})

    # 7. Merge with EPOCH remote data
    logger.info("Merging with EPOCH remote data")
    final_df = pd.merge(final_df, run.epoch_df[['Task', 'Remote']], left_on='task', right_on='Task', how='left')
    final_df = final_df.drop('Task', axis=1).rename(columns={'Remote': 'remote_status'})

    logger.info(f"Created tasks dataframe with shape {final_df.shape}")
    final_df.to_parquet(cache_path)

    run.df_tasks = final_df
    return run
@@ -1,27 +0,0 @@
from pydantic import BaseModel, Field
import sqlite3
import pandas as pd
from pathlib import Path
from typing import Optional
from .metadata import Metadata


class Run(BaseModel):
    model_config = {"arbitrary_types_allowed": True}

    # === FETCHERS ===
    onet_conn: Optional[sqlite3.Connection] = None
    onet_version: Optional[str] = None

    oesm_df: Optional[pd.DataFrame] = None
    oesm_version: Optional[str] = None

    epoch_df: Optional[pd.DataFrame] = None
    epoch_version: Optional[str] = None

    # === ENRICHMENTS ===
    task_estimateability_df: Optional[pd.DataFrame] = None
    task_estimates_df: Optional[pd.DataFrame] = None

    meta: Metadata = Field(default_factory=Metadata)

    cache_dir: Path
    output_dir: Path
@@ -1,74 +1,215 @@
import sqlite3
import os
from .logger import logger
import pandas as pd
from dotenv import load_dotenv
from .fetchers import fetch_oesm_data, fetch_epoch_remote_data, fetch_onet_database
from .enrichments import enrich_with_task_estimateability, enrich_with_task_estimates
from .postprocessors import check_for_insanity, create_df_tasks
from .fetchers import fetch_onet_database, fetch_oesm_data, fetch_epoch_remote_data, ONET_VERSION, fetch_metr_data
from .classification import classify_tasks_as_estimable, generate_time_estimates_for_tasks
from .generators import GENERATORS
from .run import Run
from .constants import GRAY
from .aggregate import create_task_summary_by_occupation_df, aggregate_task_summary_by_major_code
from .utils import convert_to_minutes
import argparse
import platformdirs
import seaborn as sns
import matplotlib as mpl
import numpy as np
from pathlib import Path
from typing import Optional

CACHE_DIR = platformdirs.user_cache_dir("econtai")

def run(output_dir: Path | Optional[str] = None):
    load_dotenv()
    _setup_graph_rendering()

    if output_dir is None:
        output_dir = Path("dist/")
    elif isinstance(output_dir, str):
        output_dir = Path(output_dir).resolve()

    output_dir.mkdir(parents=True, exist_ok=True)

    current_run = Run(output_dir=output_dir, cache_dir=Path(CACHE_DIR).resolve())
    current_run.cache_dir.mkdir(parents=True, exist_ok=True)

    # Fetchers (fetchers.py)
    current_run.onet_conn, current_run.onet_version = fetch_onet_database(current_run)
    current_run.oesm_df, current_run.oesm_version = fetch_oesm_data(current_run)
    current_run.epoch_df, current_run.epoch_version = fetch_epoch_remote_data(current_run)

    current_run = create_df_tasks(current_run)

    # Enrichments (enrichments.py)
    current_run.task_estimateability_df = enrich_with_task_estimateability(current_run)
    current_run.task_estimates_df = enrich_with_task_estimates(current_run)

    # Postprocessors (postprocessors.py)
    check_for_insanity(current_run)

    # Generators (generators/)
    for gen in GENERATORS:
        gen(current_run)


def _setup_graph_rendering():
    mpl.rcParams.update({
        'figure.facecolor': GRAY['50'],
        'axes.facecolor': GRAY['50'],
        'axes.edgecolor': GRAY['100'],
        'axes.labelcolor': GRAY['700'],
        'xtick.color': GRAY['700'],
        'ytick.color': GRAY['700'],
        'font.family': 'Inter',
        'font.size': 11,
    })
class Runner:
    onet_conn: sqlite3.Connection
    oesm_df: pd.DataFrame
    epoch_df: pd.DataFrame
    metr_results: dict

    def __init__(self, output_dir: Path | str, debug: bool, bust_estimability: bool, bust_estimates: bool):
        if isinstance(output_dir, str):
            output_dir = Path(output_dir).resolve()

        sns.set_style("white")
        output_dir.mkdir(parents=True, exist_ok=True)

        self.output_dir = output_dir
        self.intermediate_dir = self.output_dir / "intermediate"
        self.intermediate_dir.mkdir(parents=True, exist_ok=True)
        self.cache_dir = platformdirs.user_cache_path("econtai")
        self.debug = debug
        self.bust_estimability = bust_estimability
        self.bust_estimates = bust_estimates

def main():
    parser = argparse.ArgumentParser(description="Run the econtai pipeline.")
    parser.add_argument("--output-dir", type=str, help="The directory to write output files to.")
    args = parser.parse_args()
    run(output_dir=args.output_dir)
        if debug:
            os.environ["LITELLM_LOG"] = os.environ.get("LITELLM_LOG", "INFO")

    def run(self):
        load_dotenv()

        self.onet_conn = fetch_onet_database(self.cache_dir)
        self.oesm_df = fetch_oesm_data(self.cache_dir)
        self.epoch_df = fetch_epoch_remote_data(self.cache_dir)
        self.metr_results = fetch_metr_data(self.cache_dir)

        self.df_tasks = self._create_df_tasks()
        self.df_tasks['onetsoc_major'] = self.df_tasks['onetsoc_code'].str[:2]

        df_to_process = self.df_tasks[
            (self.df_tasks['importance_average'] > 3) &
            (self.df_tasks['remote_status'] == 'remote')
        ].copy()

        if self.debug:
            df_to_process = df_to_process.head(10)

        task_estimability_df = classify_tasks_as_estimable(self.cache_dir, df_to_process, bust=self.bust_estimability)
        self.df_tasks = pd.merge(self.df_tasks, task_estimability_df, on='task', how='left')
        self.df_tasks['estimable'] = self.df_tasks['estimable'].fillna(False)
        self.df_tasks.to_parquet(self.intermediate_dir / "df_tasks.parquet")
        df_to_process = pd.merge(df_to_process, task_estimability_df, on='task', how='left')
        df_to_process['estimable'] = df_to_process['estimable'].fillna(False)

        df_to_process = df_to_process[df_to_process['estimable']].copy()

        task_estimates_df = generate_time_estimates_for_tasks(self.cache_dir, df_to_process, bust=self.bust_estimates)
        df = pd.merge(df_to_process, task_estimates_df, on=['onetsoc_code', 'task_id'], how='left')
        df['lb_estimate_in_minutes'] = df.apply(lambda row: convert_to_minutes(row['lb_estimate_qty'], row['lb_estimate_unit']), axis=1)
        df['ub_estimate_in_minutes'] = df.apply(lambda row: convert_to_minutes(row['ub_estimate_qty'], row['ub_estimate_unit']), axis=1)
        df['estimate_range'] = df.ub_estimate_in_minutes - df.lb_estimate_in_minutes
        df['estimate_ratio'] = np.divide(df.ub_estimate_in_minutes, df.lb_estimate_in_minutes).replace([np.inf, -np.inf], None)
        df['estimate_midpoint'] = (df.lb_estimate_in_minutes + df.ub_estimate_in_minutes) / 2

        df.to_parquet(self.intermediate_dir / "estimable_tasks_with_estimates.parquet")

        self.task_summary_by_occupation_df = create_task_summary_by_occupation_df(self.df_tasks, self.oesm_df)
        self.task_summary_by_occupation_df.to_parquet(self.intermediate_dir / "task_summary_by_occupation.parquet")
        self.task_summary_by_major_occupation_df = aggregate_task_summary_by_major_code(self.task_summary_by_occupation_df)
        self.task_summary_by_major_occupation_df.to_parquet(self.intermediate_dir / "task_summary_by_major_occupation.parquet")

        self._check_for_insanity(df)

        for gen in GENERATORS:
            for asset in gen(**{
                "output_dir": self.output_dir,
                "runner": self,
                "df": df,
                "task_summary_by_occupation_df": self.task_summary_by_occupation_df,
                "task_summary_by_major_occupation_df": self.task_summary_by_major_occupation_df,
                "df_tasks": self.df_tasks,
                "oesm_df": self.oesm_df,
                "metr_results": self.metr_results,
            }):
                logger.info(f"New asset: {asset}")

    def _create_df_tasks(self) -> pd.DataFrame:
        DATA_PATH = self.cache_dir / f"onet_{ONET_VERSION}_tasks_with_remote_status.parquet"
        if DATA_PATH.exists():
            logger.info(f"Loading cached tasks dataframe from {DATA_PATH}")
            return pd.read_parquet(DATA_PATH)

        logger.info("Creating tasks dataframe")
        query = """
        SELECT
            tr.onetsoc_code,
            tr.task_id,
            ts.task,
            od.title AS occupation_title,
            od.description AS occupation_description,
            tr.scale_id,
            tr.category,
            tr.data_value
        FROM
            task_ratings tr
        JOIN
            task_statements ts ON tr.task_id = ts.task_id
        JOIN
            occupation_data od ON tr.onetsoc_code = od.onetsoc_code;
        """
        ratings_df = pd.read_sql_query(query, self.onet_conn)
        logger.info(f"Fetched {len(ratings_df)} task rating records from the database.")

        # 1. Handle Frequency (FT)
        logger.info("Processing Frequency data")
        freq_df = ratings_df[ratings_df["scale_id"] == "FT"].copy()
        if not freq_df.empty:
            freq_pivot = freq_df.pivot_table(
                index=["onetsoc_code", "task_id"],
                columns="category",
                values="data_value",
                fill_value=0,
            )
            freq_pivot.columns = [f"frequency_category_{int(col)}" for col in freq_pivot.columns]
        else:
            raise ValueError("No frequency data.")

        # 2. Handle Importance (IM, IJ)
        logger.info("Processing Importance data")
        imp_df = ratings_df[ratings_df["scale_id"].isin(["IM", "IJ"])].copy()
        if not imp_df.empty:
            imp_avg = imp_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index()
            imp_avg.rename(columns={"data_value": "importance_average"}, inplace=True)
        else:
            raise ValueError("No importance data.")

        # 3. Handle Relevance (RT)
        logger.info("Processing Relevance data")
        rel_df = ratings_df[ratings_df["scale_id"] == "RT"].copy()
        if not rel_df.empty:
            rel_avg = rel_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index()
            rel_avg.rename(columns={"data_value": "relevance_average"}, inplace=True)
        else:
            raise ValueError("No relevance data.")

        # 5. Get Base Task/Occupation Info
        logger.info("Extracting base task/occupation info")
        base_cols = ["onetsoc_code", "task_id", "task", "occupation_title", "occupation_description"]
        base_info = ratings_df[base_cols].drop_duplicates().set_index(["onetsoc_code", "task_id"])

        # 6. Merge Processed ONET Data
        logger.info("Merging processed ONET data")
        final_df = base_info.merge(freq_pivot, left_index=True, right_index=True, how="left")
        final_df = final_df.reset_index()

        if not imp_avg.empty:
            final_df = final_df.merge(imp_avg, on=["onetsoc_code", "task_id"], how="left")
        else:
            final_df["importance_average"] = np.nan

        if not rel_avg.empty:
            final_df = final_df.merge(rel_avg, on=["onetsoc_code", "task_id"], how="left")
        else:
            final_df["relevance_average"] = np.nan

        final_df = final_df.replace({np.nan: None})

        # 7. Merge with EPOCH remote data
        logger.info("Merging with EPOCH remote data")
        final_df = pd.merge(final_df, self.epoch_df[['Task', 'Remote']], left_on='task', right_on='Task', how='left')
        final_df = final_df.drop('Task', axis=1).rename(columns={'Remote': 'remote_status'})

        logger.info(f"Created tasks dataframe with shape {final_df.shape}")
        final_df.to_parquet(DATA_PATH)

        return final_df

    def _check_for_insanity(self, df: pd.DataFrame):
        if df['lb_estimate_in_minutes'].isnull().any():
            missing_count = df['lb_estimate_in_minutes'].isnull().sum()
            raise ValueError(f"Found {missing_count} atomic tasks with missing 'lb_estimate_in_minutes'.")

        if df['ub_estimate_in_minutes'].isnull().any():
            missing_count = df['ub_estimate_in_minutes'].isnull().sum()
            raise ValueError(f"Found {missing_count} atomic tasks with missing 'ub_estimate_in_minutes'.")

        valid_estimates = df.dropna(subset=['lb_estimate_in_minutes', 'ub_estimate_in_minutes'])
        impossible_bounds = valid_estimates[
            (valid_estimates['lb_estimate_in_minutes'] <= 0) |
            (valid_estimates['ub_estimate_in_minutes'] <= 0) |
            (valid_estimates['lb_estimate_in_minutes'] > valid_estimates['ub_estimate_in_minutes'])
        ]
        if not impossible_bounds.empty:
            raise ValueError(f"Found {len(impossible_bounds)} rows with impossible bounds (e.g., lb > ub or value <= 0).")

if __name__ == "__main__":
    main()
    parser = argparse.ArgumentParser(description="Run the econtai pipeline.")
    parser.add_argument("--output-dir", type=str, default="dist/", help="The directory to write output files to.")
    parser.add_argument("--bust-estimability", action="store_true", help="Bust the saved task estimability classification (EXPENSIVE)")
    parser.add_argument("--bust-estimates", action="store_true", help="Bust the tasks estimates (EXPENSIVE)")
    parser.add_argument("--debug", action="store_true", help="Enable debug mode (e.g., process fewer tasks).")

    args = parser.parse_args()
    Runner(output_dir=args.output_dir, debug=args.debug, bust_estimability=args.bust_estimability, bust_estimates=args.bust_estimates).run()
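To convince myself the derived estimate columns behave as intended, a small sketch mirroring the range/ratio/midpoint computation (illustrative numbers only, not pipeline data):

```python
import numpy as np
import pandas as pd

toy = pd.DataFrame({
    "lb_estimate_in_minutes": [30.0, 0.0],
    "ub_estimate_in_minutes": [120.0, 60.0],
})
toy["estimate_range"] = toy.ub_estimate_in_minutes - toy.lb_estimate_in_minutes
toy["estimate_ratio"] = np.divide(
    toy.ub_estimate_in_minutes, toy.lb_estimate_in_minutes
).replace([np.inf, -np.inf], None)  # a zero lower bound would otherwise give inf
toy["estimate_midpoint"] = (toy.lb_estimate_in_minutes + toy.ub_estimate_in_minutes) / 2
print(toy)
# row 0: range 90.0, ratio 4.0, midpoint 75.0; row 1: ratio is None because lb is 0
```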
pipeline/utils.py (new file, 222 lines)
@@ -0,0 +1,222 @@
import subprocess
import matplotlib.colors as mcolors
import matplotlib as mpl
import seaborn as sns
import tempfile
import litellm
import time
import math
from tqdm import tqdm
from typing import Any, List, Dict
from .logger import logger

OCCUPATION_MAJOR_CODES = {
    '11': 'Management',
    '13': 'Business & Financial',
    '15': 'Computer & Mathematical',
    '17': 'Architecture & Engineering',
    '19': 'Life, Physical, & Social Science',
    '21': 'Community & Social Service',
    '23': 'Legal',
    '25': 'Education, Training, & Library',
    '27': 'Arts, Design, & Media',
    '29': 'Healthcare Practitioners',
    '31': 'Healthcare Support',
    '33': 'Protective Service',
    '35': 'Food Preparation & Serving',
    '37': 'Building & Grounds Maintenance',
    '39': 'Personal Care & Service',
    '41': 'Sales & Related',
    '43': 'Office & Admin Support',
    '45': 'Farming, Fishing, & Forestry',
    '47': 'Construction & Extraction',
    '49': 'Installation, Maintenance, & Repair',
    '51': 'Production',
    '53': 'Transportation & Material Moving',
    '55': 'Military Specific',
}

GRAY = {'50':'#f8fafc','100':'#f1f5f9','200':'#e2e8f0',
        '300':'#cbd5e1','400':'#94a3b8','500':'#64748b',
        '600':'#475569','700':'#334155','800':'#1e293b',
        '900':'#0f172a','950':'#020617'}

LIME = {'50': '#f7fee7','100': '#ecfcca','200': '#d8f999',
        '300': '#bbf451','400': '#9ae600','500': '#83cd00',
        '600': '#64a400','700': '#497d00','800': '#3c6300',
        '900': '#35530e','950': '#192e03'}


def convert_to_minutes(qty, unit):
    """Converts a quantity in a given unit to minutes."""
    return qty * {
        "minute": 1,
        "hour": 60,
        "day": 60 * 24,
        "week": 60 * 24 * 7,
        "month": 60 * 24 * 30,
        "trimester": 60 * 24 * 90,
        "semester": 60 * 24 * 180,
        "year": 60 * 24 * 365,
    }[unit]


def pretty_display(df):
    print(df)
    return
    html_output = df.to_html(index=False)

    # Create a temporary HTML file
    with tempfile.NamedTemporaryFile(mode='w', suffix=".html", encoding="utf-8") as temp_file:
        temp_file.write(html_output)
        temp_file_path = temp_file.name
        subprocess.run(["/home/felix/.nix-profile/bin/firefox-devedition", "-p", "Work (YouthAI)", temp_file_path], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        input("Press Enter to continue after reviewing the HTML output...")


def enrich(
    model: str,
    rpm: int,  # Requests per minute
    messages_to_process: List[List[Dict[str, str]]],
    schema: Dict[str, Any],
    chunk_size: int = 100,
):
    all_results = []
    num_messages = len(messages_to_process)
    if num_messages == 0:
        return all_results

    num_chunks = math.ceil(num_messages / chunk_size)
    logger.info(f"Starting enrichment for {num_messages} messages, in {num_chunks} chunks of up to {chunk_size} each.")

    # Calculate the time that should be allocated per request to respect the RPM limit.
    time_per_request = 60.0 / rpm if rpm > 0 else 0

    for i in tqdm(range(num_chunks), desc="Enriching data in chunks"):
        chunk_start_time = time.time()

        start_index = i * chunk_size
        end_index = start_index + chunk_size
        message_chunk = messages_to_process[start_index:end_index]

        if not message_chunk:
            continue

        try:
            # Send requests for the entire chunk in a batch for better performance.
            responses = litellm.batch_completion(
                model=model,
                messages=message_chunk,
                response_format={
                    "type": "json_schema",
                    "json_schema": schema,
                },
            )

            # batch_completion returns the response or an exception object for each message.
            # We'll replace exceptions with None as expected by the calling functions.
            for response in responses:
                if isinstance(response, Exception):
                    logger.error(f"API call within batch failed: {response}")
                    all_results.append(None)
                else:
                    all_results.append(response)

        except Exception as e:
            # This catches catastrophic failures in batch_completion itself (e.g., auth)
            logger.error(f"litellm.batch_completion call failed for chunk {i+1}/{num_chunks}: {e}")
            all_results.extend([None] * len(message_chunk))

        chunk_end_time = time.time()
        elapsed_time = chunk_end_time - chunk_start_time

        # To enforce the rate limit, we calculate how long the chunk *should* have taken
        # and sleep for the remainder of that time.
        if time_per_request > 0:
            expected_duration_for_chunk = len(message_chunk) * time_per_request
            if elapsed_time < expected_duration_for_chunk:
                sleep_duration = expected_duration_for_chunk - elapsed_time
                logger.debug(f"Chunk processed in {elapsed_time:.2f}s. Sleeping for {sleep_duration:.2f}s to respect RPM.")
                time.sleep(sleep_duration)

    return all_results

def get_contrasting_text_color(bg_color_hex_or_rgba):
    if isinstance(bg_color_hex_or_rgba, str):
        rgba = mcolors.to_rgba(bg_color_hex_or_rgba)
    else:
        rgba = bg_color_hex_or_rgba
    r, g, b, _ = rgba
    luminance = 0.2126 * r + 0.7152 * g + 0.0722 * b
    return 'black' if luminance > 0.55 else 'white'


def style_plot():
    """
    Applies a consistent and professional style to all plots.
    This function sets matplotlib's rcParams for a global effect.
    """
    mpl.rcParams.update({
        'figure.facecolor': GRAY['50'],
        'figure.edgecolor': 'none',
        'figure.figsize': (12, 8),
        'figure.dpi': 150,

        'axes.facecolor': GRAY['50'],
        'axes.edgecolor': GRAY['300'],
        'axes.grid': True,
        'axes.labelcolor': GRAY['800'],
        'axes.titlecolor': GRAY['900'],
        'axes.titlesize': 18,
        'axes.titleweight': 'bold',
        'axes.titlepad': 20,
        'axes.labelsize': 14,
        'axes.labelweight': 'semibold',
        'axes.labelpad': 10,
        'axes.spines.top': False,
        'axes.spines.right': False,
        'axes.spines.left': True,
        'axes.spines.bottom': True,

        'text.color': GRAY['700'],

        'xtick.color': GRAY['600'],
        'ytick.color': GRAY['600'],
        'xtick.labelsize': 12,
        'ytick.labelsize': 12,
        'xtick.major.size': 0,
        'ytick.major.size': 0,
        'xtick.minor.size': 0,
        'ytick.minor.size': 0,
        'xtick.major.pad': 8,
        'ytick.major.pad': 8,

        'grid.color': GRAY['200'],
        'grid.linestyle': '--',
        'grid.linewidth': 1,

        'legend.frameon': False,
        'legend.fontsize': 12,
        'legend.title_fontsize': 14,
        'legend.facecolor': 'inherit',

        'font.family': 'sans-serif',
        'font.sans-serif': ['Inter'],
        'font.weight': 'normal',

        'lines.linewidth': 2,
        'lines.markersize': 6,
    })

    # Seaborn specific styles
    # Use shades of LIME as the primary color palette.
    # Sorting by integer value of keys, and reversed to have darker shades first.
    # Excluding very light colors that won't be visible on a light background.
    lime_palette = [LIME[k] for k in sorted(LIME.keys(), key=int, reverse=True) if k not in ['50', '100', '700', '800', '900', '950',]]

    sns.set_palette(lime_palette)
    sns.set_style("whitegrid", {
        'axes.edgecolor': GRAY['300'],
        'grid.color': GRAY['200'],
        'grid.linestyle': '--',
    })
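Quick sanity check of the unit conversion and the contrast helper (a sketch; assumes the module is importable as `pipeline.utils`):

```python
from pipeline.utils import convert_to_minutes, get_contrasting_text_color  # assumed import path

print(convert_to_minutes(2, "hour"))          # 120
print(convert_to_minutes(1, "week"))          # 10080, matching the '1 wk' tick used in the CDF plot
print(get_contrasting_text_color("#f7fee7"))  # 'black' (light lime background)
print(get_contrasting_text_color("#192e03"))  # 'white' (dark lime background)
```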
@@ -16,6 +16,7 @@ dependencies = [
    "python-dotenv>=1.1.1",
    "requests>=2.32.4",
    "rich>=14.0.0",
    "scipy>=1.16.0",
    "seaborn>=0.13.2",
]
uv.lock (generated, 31 lines changed)
@@ -1120,6 +1120,35 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/75/04/5302cea1aa26d886d34cadbf2dc77d90d7737e576c0065f357b96dc7a1a6/rpds_py-0.26.0-cp314-cp314t-win_amd64.whl", hash = "sha256:f14440b9573a6f76b4ee4770c13f0b5921f71dde3b6fcb8dabbefd13b7fe05d7", size = 232821, upload_time = "2025-07-01T15:55:55.167Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "scipy"
|
||||
version = "1.16.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "numpy" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/81/18/b06a83f0c5ee8cddbde5e3f3d0bb9b702abfa5136ef6d4620ff67df7eee5/scipy-1.16.0.tar.gz", hash = "sha256:b5ef54021e832869c8cfb03bc3bf20366cbcd426e02a58e8a58d7584dfbb8f62", size = 30581216, upload_time = "2025-06-22T16:27:55.782Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/46/95/0746417bc24be0c2a7b7563946d61f670a3b491b76adede420e9d173841f/scipy-1.16.0-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:e9f414cbe9ca289a73e0cc92e33a6a791469b6619c240aa32ee18abdce8ab451", size = 36418162, upload_time = "2025-06-22T16:19:56.3Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/19/5a/914355a74481b8e4bbccf67259bbde171348a3f160b67b4945fbc5f5c1e5/scipy-1.16.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:bbba55fb97ba3cdef9b1ee973f06b09d518c0c7c66a009c729c7d1592be1935e", size = 28465985, upload_time = "2025-06-22T16:20:01.238Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/58/46/63477fc1246063855969cbefdcee8c648ba4b17f67370bd542ba56368d0b/scipy-1.16.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:58e0d4354eacb6004e7aa1cd350e5514bd0270acaa8d5b36c0627bb3bb486974", size = 20737961, upload_time = "2025-06-22T16:20:05.913Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/93/86/0fbb5588b73555e40f9d3d6dde24ee6fac7d8e301a27f6f0cab9d8f66ff2/scipy-1.16.0-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:75b2094ec975c80efc273567436e16bb794660509c12c6a31eb5c195cbf4b6dc", size = 23377941, upload_time = "2025-06-22T16:20:10.668Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ca/80/a561f2bf4c2da89fa631b3cbf31d120e21ea95db71fd9ec00cb0247c7a93/scipy-1.16.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6b65d232157a380fdd11a560e7e21cde34fdb69d65c09cb87f6cc024ee376351", size = 33196703, upload_time = "2025-06-22T16:20:16.097Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/11/6b/3443abcd0707d52e48eb315e33cc669a95e29fc102229919646f5a501171/scipy-1.16.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1d8747f7736accd39289943f7fe53a8333be7f15a82eea08e4afe47d79568c32", size = 35083410, upload_time = "2025-06-22T16:20:21.734Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/20/ab/eb0fc00e1e48961f1bd69b7ad7e7266896fe5bad4ead91b5fc6b3561bba4/scipy-1.16.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:eb9f147a1b8529bb7fec2a85cf4cf42bdfadf9e83535c309a11fdae598c88e8b", size = 35387829, upload_time = "2025-06-22T16:20:27.548Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/57/9e/d6fc64e41fad5d481c029ee5a49eefc17f0b8071d636a02ceee44d4a0de2/scipy-1.16.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d2b83c37edbfa837a8923d19c749c1935ad3d41cf196006a24ed44dba2ec4358", size = 37841356, upload_time = "2025-06-22T16:20:35.112Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/7c/a7/4c94bbe91f12126b8bf6709b2471900577b7373a4fd1f431f28ba6f81115/scipy-1.16.0-cp313-cp313-win_amd64.whl", hash = "sha256:79a3c13d43c95aa80b87328a46031cf52508cf5f4df2767602c984ed1d3c6bbe", size = 38403710, upload_time = "2025-06-22T16:21:54.473Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/47/20/965da8497f6226e8fa90ad3447b82ed0e28d942532e92dd8b91b43f100d4/scipy-1.16.0-cp313-cp313t-macosx_10_14_x86_64.whl", hash = "sha256:f91b87e1689f0370690e8470916fe1b2308e5b2061317ff76977c8f836452a47", size = 36813833, upload_time = "2025-06-22T16:20:43.925Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/28/f4/197580c3dac2d234e948806e164601c2df6f0078ed9f5ad4a62685b7c331/scipy-1.16.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:88a6ca658fb94640079e7a50b2ad3b67e33ef0f40e70bdb7dc22017dae73ac08", size = 28974431, upload_time = "2025-06-22T16:20:51.302Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/8a/fc/e18b8550048d9224426e76906694c60028dbdb65d28b1372b5503914b89d/scipy-1.16.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:ae902626972f1bd7e4e86f58fd72322d7f4ec7b0cfc17b15d4b7006efc385176", size = 21246454, upload_time = "2025-06-22T16:20:57.276Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/8c/48/07b97d167e0d6a324bfd7484cd0c209cc27338b67e5deadae578cf48e809/scipy-1.16.0-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:8cb824c1fc75ef29893bc32b3ddd7b11cf9ab13c1127fe26413a05953b8c32ed", size = 23772979, upload_time = "2025-06-22T16:21:03.363Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/4c/4f/9efbd3f70baf9582edf271db3002b7882c875ddd37dc97f0f675ad68679f/scipy-1.16.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:de2db7250ff6514366a9709c2cba35cb6d08498e961cba20d7cff98a7ee88938", size = 33341972, upload_time = "2025-06-22T16:21:11.14Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/3f/dc/9e496a3c5dbe24e76ee24525155ab7f659c20180bab058ef2c5fa7d9119c/scipy-1.16.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e85800274edf4db8dd2e4e93034f92d1b05c9421220e7ded9988b16976f849c1", size = 35185476, upload_time = "2025-06-22T16:21:19.156Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ce/b3/21001cff985a122ba434c33f2c9d7d1dc3b669827e94f4fc4e1fe8b9dfd8/scipy-1.16.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4f720300a3024c237ace1cb11f9a84c38beb19616ba7c4cdcd771047a10a1706", size = 35570990, upload_time = "2025-06-22T16:21:27.797Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/e5/d3/7ba42647d6709251cdf97043d0c107e0317e152fa2f76873b656b509ff55/scipy-1.16.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:aad603e9339ddb676409b104c48a027e9916ce0d2838830691f39552b38a352e", size = 37950262, upload_time = "2025-06-22T16:21:36.976Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/eb/c4/231cac7a8385394ebbbb4f1ca662203e9d8c332825ab4f36ffc3ead09a42/scipy-1.16.0-cp313-cp313t-win_amd64.whl", hash = "sha256:f56296fefca67ba605fd74d12f7bd23636267731a72cb3947963e76b8c0a25db", size = 38515076, upload_time = "2025-06-22T16:21:45.694Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "seaborn"
|
||||
version = "0.13.2"
|
||||
|
@ -1168,6 +1197,7 @@ dependencies = [
|
|||
{ name = "python-dotenv" },
|
||||
{ name = "requests" },
|
||||
{ name = "rich" },
|
||||
{ name = "scipy" },
|
||||
{ name = "seaborn" },
|
||||
]
|
||||
|
||||
|
@ -1184,6 +1214,7 @@ requires-dist = [
|
|||
{ name = "python-dotenv", specifier = ">=1.1.1" },
|
||||
{ name = "requests", specifier = ">=2.32.4" },
|
||||
{ name = "rich", specifier = ">=14.0.0" },
|
||||
{ name = "scipy", specifier = ">=1.16.0" },
|
||||
{ name = "seaborn", specifier = ">=0.13.2" },
|
||||
]
|
||||
|
||||
|
|