This commit is contained in:
Félix Dorn 2025-07-03 17:32:41 +02:00
parent 2da206d368
commit b7c94590f9
14 changed files with 2200 additions and 13 deletions

507
old/add_task_estimates.py Normal file
View file

@@ -0,0 +1,507 @@
import pandas as pd
import litellm
import dotenv
import os
import time
import json
import math
import numpy as np
# Load environment variables
dotenv.load_dotenv(override=True)
# --- Configuration ---
MODEL = "gpt-4.1-mini" # Make sure this model supports json_schema or structured output
RATE_LIMIT = 5000 # Requests per minute
CHUNK_SIZE = 300
SECONDS_PER_MINUTE = 60
FILENAME = (
"tasks_with_estimates.csv" # This CSV should contain the tasks to be processed
)
# --- Prompts and Schema ---
SYSTEM_PROMPT = """
You are an expert assistant evaluating the time to completion required for job tasks. Your goal is to estimate the time range needed for a skilled human to complete the following job task remotely, without supervision.
Provide a lower and upper bound estimate for the time to completion. These bounds should capture the time within which approximately 80% of instances of performing this specific task are typically completed by a qualified individual.
Base your estimate on the provided task description, its associated activities, and the occupational context. Your estimate must be in one of the allowed units: minute, hour, day, week, month, trimester, semester, year.
""".strip()
USER_MESSAGE_TEMPLATE = """
Please estimate the time range for the following remote task:
**Task Description:** {task}
**Relevant activities for the task:**
{dwas}
**Occupation Category:** {occupation_title}
**Occupation Description:** {occupation_description}
Consider the complexity and the typical steps involved.
""".strip()
ALLOWED_UNITS = [
"minute",
"hour",
"day",
"week",
"month",
"trimester",
"semester",
"year",
]
SCHEMA_FOR_VALIDATION = {
"name": "estimate_time",
"strict": True, # Enforce schema adherence
"schema": {
"type": "object",
"properties": {
"lower_bound_estimate": {
"type": "object",
"properties": {
"quantity": {
"type": "number",
"description": "The numerical value for the lower bound of the estimate.",
},
"unit": {
"type": "string",
"enum": ALLOWED_UNITS,
"description": "The unit of time for the lower bound.",
},
},
"required": ["quantity", "unit"],
"additionalProperties": False,
},
"upper_bound_estimate": {
"type": "object",
"properties": {
"quantity": {
"type": "number",
"description": "The numerical value for the upper bound of the estimate.",
},
"unit": {
"type": "string",
"enum": ALLOWED_UNITS,
"description": "The unit of time for the upper bound.",
},
},
"required": ["quantity", "unit"],
"additionalProperties": False,
},
},
"required": ["lower_bound_estimate", "upper_bound_estimate"],
"additionalProperties": False,
},
}
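# Example of a response payload conforming to SCHEMA_FOR_VALIDATION (illustrative values only):
# {"lower_bound_estimate": {"quantity": 30, "unit": "minute"},
#  "upper_bound_estimate": {"quantity": 2, "unit": "hour"}}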
def save_dataframe(df_to_save, filename):
"""Saves the DataFrame to the specified CSV file using atomic write."""
try:
temp_filename = filename + ".tmp"
df_to_save.to_csv(temp_filename, encoding="utf-8-sig", index=False)
os.replace(temp_filename, filename)
except Exception as e:
print(f"--- Error saving DataFrame to {filename}: {e} ---")
if os.path.exists(temp_filename):
try:
os.remove(temp_filename)
except Exception as remove_err:
print(
f"--- Error removing temporary save file {temp_filename}: {remove_err} ---"
)
def create_task_estimates():
try:
# Read the CSV
if os.path.exists(FILENAME):
df = pd.read_csv(FILENAME, encoding="utf-8-sig")
print(f"Successfully read {len(df)} rows from {FILENAME}.")
estimate_columns_spec = {
"lb_estimate_qty": float,
"lb_estimate_unit": object,
"ub_estimate_qty": float,
"ub_estimate_unit": object,
}
save_needed = False
for col_name, target_dtype in estimate_columns_spec.items():
if col_name not in df.columns:
# Initialize with a type-compatible missing value
if target_dtype == float:
df[col_name] = np.nan
else: # object
df[col_name] = pd.NA
df[col_name] = df[col_name].astype(target_dtype) # Enforce dtype
print(f"Added '{col_name}' column as {df[col_name].dtype}.")
save_needed = True
else:
# Column exists, ensure correct dtype
current_pd_dtype = df[col_name].dtype
expected_pd_dtype = pd.Series(dtype=target_dtype).dtype
if current_pd_dtype != expected_pd_dtype:
try:
if target_dtype == float:
df[col_name] = pd.to_numeric(df[col_name], errors="coerce")
else: # object
df[col_name] = df[col_name].astype(object)
print(
f"Corrected dtype of '{col_name}' to {df[col_name].dtype}."
)
save_needed = True
except Exception as e:
print(
f"Warning: Could not convert column '{col_name}' to {target_dtype}: {e}. Current dtype: {current_pd_dtype}"
)
# Standardize missing values (e.g., empty strings to NA/NaN)
# Replace common missing placeholders with pd.NA first
df[col_name] = df[col_name].replace(["", None], pd.NA)
if target_dtype == float:
# For float columns, ensure they are numeric and use np.nan after replacement
df[col_name] = pd.to_numeric(df[col_name], errors="coerce")
if save_needed:
print(f"Saving {FILENAME} after adding/adjusting estimate columns.")
save_dataframe(df, FILENAME)
else:
print(
f"Error: {FILENAME} not found. Please ensure the file exists and contains task data."
)
exit()
except FileNotFoundError:
print(
f"Error: {FILENAME} not found. Please ensure the file exists and contains task data."
)
exit()
except Exception as e:
print(f"Error reading or initializing {FILENAME}: {e}")
exit()
# --- Identify Rows to Process ---
# We'll check for NaN in one of the primary quantity columns.
unprocessed_mask = df["lb_estimate_qty"].isna()
if unprocessed_mask.any():
start_index = unprocessed_mask.idxmax() # Finds the index of the first True value
print(f"Resuming processing. First unprocessed row found at index {start_index}.")
df_to_process = df.loc[unprocessed_mask].copy()
original_indices = df_to_process.index # Keep track of original indices
else:
print(
"All rows seem to have estimates already (based on 'lb_estimate_qty'). Exiting."
)
exit()
# --- Prepare messages for batch completion (only for rows needing processing) ---
messages_list = []
skipped_rows_indices = []
valid_original_indices = []
if not df_to_process.empty:
required_cols = ["task", "occupation_title", "occupation_description", "dwas"]
print(
f"Preparing messages for up to {len(df_to_process)} rows starting from original index {original_indices[0] if len(original_indices) > 0 else 'N/A'}..."
)
print(f"Checking for required columns: {required_cols}")
for index, row in df_to_process.iterrows():
missing_or_empty = []
for col in required_cols:
if col not in row or pd.isna(row[col]) or str(row[col]).strip() == "":
missing_or_empty.append(col)
if missing_or_empty:
print(
f"Warning: Skipping row original index {index} due to missing/empty required data in columns: {', '.join(missing_or_empty)}."
)
skipped_rows_indices.append(index)
continue
try:
user_message = USER_MESSAGE_TEMPLATE.format(
task=row["task"],
occupation_title=row["occupation_title"],
occupation_description=row["occupation_description"],
dwas=row["dwas"],
)
except KeyError as e:
print(
f"Error: Skipping row original index {index} due to formatting error - missing key: {e}. Check USER_MESSAGE_TEMPLATE and CSV columns."
)
skipped_rows_indices.append(index)
continue
messages_for_row = [
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": user_message},
]
messages_list.append(messages_for_row)
valid_original_indices.append(index) # This is the original DataFrame index
print(
f"Prepared {len(messages_list)} valid message sets for batch completion (skipped {len(skipped_rows_indices)} rows)."
)
if not messages_list:
print("No valid rows found to process after checking required data. Exiting.")
exit()
else:
print(
"No rows found needing processing (df_to_process is empty)."
) # Should have been caught by earlier check
exit()
# --- Call batch_completion in chunks with rate limiting and periodic saving ---
total_messages_to_send = len(messages_list)
num_chunks = math.ceil(total_messages_to_send / CHUNK_SIZE)
print(
f"\nStarting batch completion for {total_messages_to_send} items in {num_chunks} chunks..."
)
overall_start_time = time.time()
processed_count_total = 0
for i in range(num_chunks):
chunk_start_message_index = i * CHUNK_SIZE
chunk_end_message_index = min((i + 1) * CHUNK_SIZE, total_messages_to_send)
message_chunk = messages_list[chunk_start_message_index:chunk_end_message_index]
# Get corresponding original DataFrame indices for this chunk
chunk_original_indices = valid_original_indices[
chunk_start_message_index:chunk_end_message_index
]
if not message_chunk:
continue
min_idx_disp = min(chunk_original_indices) if chunk_original_indices else "N/A"
max_idx_disp = max(chunk_original_indices) if chunk_original_indices else "N/A"
print(
f"\nProcessing chunk {i + 1}/{num_chunks} (Messages {chunk_start_message_index + 1}-{chunk_end_message_index} of this run)..."
f" Corresponding to original indices: {min_idx_disp} - {max_idx_disp}"
)
chunk_start_time = time.time()
responses = []
try:
print(f"Sending {len(message_chunk)} requests for chunk {i + 1}...")
responses = litellm.batch_completion(
model=MODEL,
messages=message_chunk,
response_format={
"type": "json_schema",
"json_schema": SCHEMA_FOR_VALIDATION,
},
num_retries=3,
# request_timeout=60 # Optional: uncomment if needed
)
print(f"Chunk {i + 1} API call completed.")
except Exception as e:
print(f"Error during litellm.batch_completion for chunk {i + 1}: {e}")
responses = [None] * len(
message_chunk
) # Ensure responses list matches message_chunk length for processing loop
# --- Process responses for the current chunk ---
chunk_updates = {} # To store {original_df_index: {qty/unit data}}
successful_in_chunk = 0
failed_in_chunk = 0
if responses and len(responses) == len(message_chunk):
for j, response in enumerate(responses):
original_df_index = chunk_original_indices[j]
# Initialize values for this item
lb_qty_val, lb_unit_val, ub_qty_val, ub_unit_val = None, None, None, None
content_str = None
if response is None:
print(
f"Skipping processing for original index {original_df_index} due to API call failure for this item (response is None)."
)
failed_in_chunk += 1
continue
try:
if (
response.choices
and response.choices[0].message
and response.choices[0].message.content
):
content_str = response.choices[0].message.content
estimate_data = json.loads(content_str) # Can raise JSONDecodeError
lower_bound_dict = estimate_data.get("lower_bound_estimate")
upper_bound_dict = estimate_data.get("upper_bound_estimate")
valid_response_structure = isinstance(
lower_bound_dict, dict
) and isinstance(upper_bound_dict, dict)
if valid_response_structure:
lb_qty_raw = lower_bound_dict.get("quantity")
lb_unit_raw = lower_bound_dict.get("unit")
ub_qty_raw = upper_bound_dict.get("quantity")
ub_unit_raw = upper_bound_dict.get("unit")
is_valid_item = True
# Validate LB Qty
if (
not isinstance(lb_qty_raw, (int, float))
or math.isnan(float(lb_qty_raw))
or float(lb_qty_raw) < 0
):
print(
f"Warning: Invalid lb_quantity for original index {original_df_index}: {lb_qty_raw}"
)
is_valid_item = False
else:
lb_qty_val = float(lb_qty_raw)
# Validate UB Qty
if (
not isinstance(ub_qty_raw, (int, float))
or math.isnan(float(ub_qty_raw))
or float(ub_qty_raw) < 0
):
print(
f"Warning: Invalid ub_quantity for original index {original_df_index}: {ub_qty_raw}"
)
is_valid_item = False
else:
ub_qty_val = float(ub_qty_raw)
# Validate Units
if lb_unit_raw not in ALLOWED_UNITS:
print(
f"Warning: Invalid lb_unit for original index {original_df_index}: '{lb_unit_raw}'"
)
is_valid_item = False
else:
lb_unit_val = lb_unit_raw
if ub_unit_raw not in ALLOWED_UNITS:
print(
f"Warning: Invalid ub_unit for original index {original_df_index}: '{ub_unit_raw}'"
)
is_valid_item = False
else:
ub_unit_val = ub_unit_raw
if is_valid_item:
successful_in_chunk += 1
chunk_updates[original_df_index] = {
"lb_estimate_qty": lb_qty_val,
"lb_estimate_unit": lb_unit_val,
"ub_estimate_qty": ub_qty_val,
"ub_estimate_unit": ub_unit_val,
}
else:
failed_in_chunk += (
1 # Values remain None if not fully valid
)
else:
print(
f"Warning: Missing or malformed estimate dicts in JSON for original index {original_df_index}. Content: '{content_str}'"
)
failed_in_chunk += 1
else:
finish_reason = (
response.choices[0].finish_reason
if (response.choices and response.choices[0].finish_reason)
else "unknown"
)
error_message = (
response.choices[0].message.content
if (
response.choices
and response.choices[0].message
and response.choices[0].message.content
)
else "No content in message."
)
print(
f"Warning: Received non-standard or empty response content for original index {original_df_index}. "
f"Finish Reason: '{finish_reason}'. Message: '{error_message}'. Raw Choices: {response.choices}"
)
failed_in_chunk += 1
except json.JSONDecodeError:
print(
f"Warning: Could not decode JSON for original index {original_df_index}. Content received: '{content_str}'"
)
failed_in_chunk += 1
except AttributeError as ae:
print(
f"Warning: Missing expected attribute processing response for original index {original_df_index}: {ae}. Response: {response}"
)
failed_in_chunk += 1
except Exception as e:
print(
f"Warning: An unexpected error occurred processing response for original index {original_df_index}: {type(e).__name__} - {e}. Response: {response}"
)
failed_in_chunk += 1
else:
print(
f"Warning: Mismatch between number of responses ({len(responses) if responses else 0}) "
f"and messages sent ({len(message_chunk)}) for chunk {i + 1}, or no responses. Marking all as failed."
)
failed_in_chunk = len(
message_chunk
) # All items in this chunk are considered failed if response array is problematic
print(
f"Chunk {i + 1} processing summary: Success={successful_in_chunk}, Failed/Skipped={failed_in_chunk}"
)
processed_count_total += successful_in_chunk
# --- Update Main DataFrame and Save Periodically ---
if chunk_updates:
print(
f"Updating main DataFrame with {len(chunk_updates)} new estimates for chunk {i + 1}..."
)
for idx, estimates in chunk_updates.items():
if idx in df.index:
df.loc[idx, "lb_estimate_qty"] = estimates["lb_estimate_qty"]
df.loc[idx, "lb_estimate_unit"] = estimates["lb_estimate_unit"]
df.loc[idx, "ub_estimate_qty"] = estimates["ub_estimate_qty"]
df.loc[idx, "ub_estimate_unit"] = estimates["ub_estimate_unit"]
print(f"Saving progress to {FILENAME}...")
save_dataframe(df, FILENAME)
else:
print(f"No successful estimates obtained in chunk {i + 1} to save.")
# --- Rate Limiting Pause ---
chunk_end_time = time.time()
chunk_duration = chunk_end_time - chunk_start_time
print(f"Chunk {i + 1} took {chunk_duration:.2f} seconds.")
if i < num_chunks - 1: # No pause after the last chunk
# Calculate ideal time per request based on rate limit
time_per_request = SECONDS_PER_MINUTE / RATE_LIMIT if RATE_LIMIT > 0 else 0
# Calculate minimum duration this chunk should have taken to respect rate limit
min_chunk_duration_for_rate = len(message_chunk) * time_per_request
# Calculate pause needed
pause_needed = max(0, min_chunk_duration_for_rate - chunk_duration)
if pause_needed > 0:
print(
f"Pausing for {pause_needed:.2f} seconds to respect rate limit ({RATE_LIMIT}/min)..."
)
time.sleep(pause_needed)
overall_end_time = time.time()
total_duration_minutes = (overall_end_time - overall_start_time) / 60
print(
f"\nBatch completion finished."
f" Processed {processed_count_total} new estimates in this run in {total_duration_minutes:.2f} minutes."
)
print(f"Performing final save to {FILENAME}...")
save_dataframe(df, FILENAME)
print("\nScript finished.")

521
old/analysis.py Normal file
View file

@@ -0,0 +1,521 @@
import os
import litellm
import sqlite3
import numpy as np
import pandas as pd
from google.colab import userdata, files
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')
os.environ['GEMINI_API_KEY'] = userdata.get('GEMINI_API_KEY')
occupation_major_codes = {
'11': 'Management',
'13': 'Business and Financial Operations',
'15': 'Computer and Mathematical Occupations',
'17': 'Architecture and Engineering',
'19': 'Life, Physical, and Social Science',
'21': 'Community and Social Services',
'23': 'Legal',
'25': 'Education, Training, and Library',
'27': 'Arts, Design, Entertainment, Sports, and Media',
'29': 'Healthcare Practitioners and Technical',
'31': 'Healthcare Support',
'33': 'Protective Service',
'35': 'Food Preparation and Serving Related',
'37': 'Building and Grounds Cleaning and Maintenance',
'39': 'Personal Care and Service',
'41': 'Sales and Related',
'43': 'Office and Administrative Support',
'45': 'Farming, Fishing, and Forestry',
'47': 'Construction and Extraction',
'49': 'Installation, Maintenance, and Repair',
'51': 'Production',
'53': 'Transportation and Material Moving',
'55': 'Military Specific'
}
gray = {'50':'#f8fafc','100':'#f1f5f9','200':'#e2e8f0',
'300':'#cbd5e1','400':'#94a3b8','500':'#64748b',
'600':'#475569','700':'#334155','800':'#1e293b',
'900':'#0f172a','950':'#020617'}
lime = {'50': '#f7fee7','100': '#ecfcca','200': '#d8f999',
'300': '#bbf451','400': '#9ae600','500': '#83cd00',
'600': '#64a400','700': '#497d00','800': '#3c6300',
'900': '#35530e','950': '#192e03'}
mpl.rcParams.update({
'figure.facecolor' : gray['50'],
'axes.facecolor' : gray['50'],
'axes.edgecolor' : gray['100'],
'axes.labelcolor' : gray['700'],
'xtick.color' : gray['700'],
'ytick.color' : gray['700'],
'font.family' : 'Inter', # falls back to DejaVu if Inter not present
'font.size' : 11,
})
sns.set_style("white") # keep minimal axes, we will remove default grid
sns.set_context("notebook")
def prepare_tasks():
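"""Merge remote-status and estimateability labels into the enriched O*NET task ratings, filter by importance, and derive the remote-task subset."""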
# Run uv run ./enrich_task_ratings.py
df_tasks = pd.read_json("task_ratings_enriched.json")
# Run uv run classify_estimateability_of_tasks.py
df_task_estimateable = pd.read_csv("tasks_estimateable.csv").rename(columns={"task_estimateable": "estimateable"}).drop_duplicates(subset=['task'], keep='first')
# df_tasks now has a remote_status column which contains either "remote" or "not remote"
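# NOTE: df_remote_status is assumed to be loaded elsewhere (a table with 'Task' and 'Remote' columns); it is not defined in this file.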
df_tasks = pd.merge(df_tasks, df_remote_status[['Task', 'Remote']], left_on='task', right_on='Task', how='left')
df_tasks = df_tasks.drop('Task', axis=1).rename(columns={'Remote': 'remote_status'})
# df_tasks now has a estimateable column which contains either "ATOMIC" or "ONGOING-CONSTRAINT"
df_tasks = pd.merge(df_tasks, df_task_estimateable[['task', 'estimateable']], on='task', how='left')
df_tasks = df_tasks[df_tasks['importance_average'] < 3].copy()
df_tasks['onetsoc_major'] = df_tasks['onetsoc_code'].str[:2]
df_remote_tasks = df_tasks[df_tasks['remote_status'] == 'remote'].copy()
# Call create_task_estimates() from add_task_estimates? which creates tasks_with_estimates.csv
def preprocessing_time_estimates():
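"""Load the LLM time estimates, convert the bounds to minutes, and derive range, ratio, and midpoint columns. Assumes df_task_estimateable (built in prepare_tasks) is available in scope."""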
df = pd.read_csv("tasks_with_estimates.csv")
df = df[df['importance_average'] > 3].copy()
# The embeddings come from running `uv run ./embed_task_description.py`
# Columns: ['embedding_id', 'task', 'embedding_vector']
# These contain embedding for UNIQUE tasks
df_task_embeddings = pd.read_parquet("tasks_with_embeddings.parquet").drop_duplicates(subset=['task'])[['task', 'task_embedding']].rename(columns={"task_embedding": "embedding_vector"}).copy()
df = pd.merge(df, df_task_embeddings[['task', 'embedding_vector']], on='task', how='left')
df = pd.merge(df, df_task_estimateable[['task', 'estimateable']], on='task', how='left')
df['onetsoc_major'] = df['onetsoc_code'].str[:2]
def convert_to_minutes(qty, unit):
"""Converts a quantity in a given unit to minutes."""
return qty * {
"minute": 1,
"hour": 60,
"day": 60 * 24,
"week": 60 * 24 * 7,
"month": 60 * 24 * 30,
"trimester": 60 * 24 * 90,
"semester": 60 * 24 * 180,
"year": 60 * 24 * 365,
}[unit]
df['lb_estimate_in_minutes'] = df.apply(
lambda row: convert_to_minutes(row['lb_estimate_qty'], row['lb_estimate_unit']), axis=1
)
df['ub_estimate_in_minutes'] = df.apply(
lambda row: convert_to_minutes(row['ub_estimate_qty'], row['ub_estimate_unit']), axis=1
)
df['estimate_range'] = df.ub_estimate_in_minutes - df.lb_estimate_in_minutes
df['estimate_ratio'] = df.ub_estimate_in_minutes / df.lb_estimate_in_minutes
df['estimate_midpoint'] = (df.lb_estimate_in_minutes + df.ub_estimate_in_minutes)/2
atomic_tasks = df[df['estimateable'] == 'ATOMIC']
ongoing_tasks = df[df['estimateable'] == 'ONGOING-CONSTRAINT']
with pd.option_context('display.max_columns', None):
display(df)
# Check for empty estimates
if atomic_tasks['lb_estimate_in_minutes'].isnull().sum() > 0:
print("Missing values in 'lb_estimate_in_minutes':", atomic_tasks['lb_estimate_in_minutes'].isnull().sum())
if atomic_tasks['ub_estimate_in_minutes'].isnull().sum() > 0:
print("Missing values in 'ub_estimate_in_minutes':", atomic_tasks['ub_estimate_in_minutes'].isnull().sum())
# Check for impossible bounds
impossible_bounds = atomic_tasks[
(atomic_tasks['lb_estimate_in_minutes'] <= 0) |
(atomic_tasks['ub_estimate_in_minutes'] <= 0) |
(atomic_tasks['lb_estimate_in_minutes'] > atomic_tasks['ub_estimate_in_minutes'])
]
if not impossible_bounds.empty:
print(f"Error: Found rows with impossible bounds.")
with pd.option_context('display.max_colwidth', None):
display(impossible_bounds[['task', 'lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'dwas']])
#with pd.option_context('display.max_colwidth', None):
#display(atomic_tasks.nlargest(20, 'ub_estimate_in_minutes')[['task', 'lb_estimate_qty', 'lb_estimate_unit', 'lb_estimate_in_minutes', 'ub_estimate_qty', 'ub_estimate_unit', 'ub_estimate_in_minutes', 'estimate_ratio']])
def cell1():
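# Distribution of task midpoint estimates on a log scale.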
sns.histplot(atomic_tasks.estimate_midpoint, log_scale=True)
def cell2():
plt.figure(figsize=(14,10))
sns.boxplot(
data=atomic_tasks,
x='onetsoc_major', # 11 = Management, 15 = Computer/Math, …
y='estimate_range',
showfliers=False
)
plt.yscale('log') # long tail => log scale
plt.xlabel('Occupation')
plt.ylabel('Range (upper-lower, minutes)')
plt.title('Spread of time-range estimates per occupation')
ax = plt.gca()
ax.set_xticklabels([occupation_major_codes[code.get_text()] for code in ax.get_xticklabels()], rotation=60, ha='right')
def cell3():
plt.figure(figsize=(10, 10))
ax = sns.scatterplot(
data=atomic_tasks.replace({'onetsoc_major': occupation_major_codes}), # Replace codes with labels
x='lb_estimate_in_minutes', y='ub_estimate_in_minutes',
alpha=0.2, edgecolor=None, hue="onetsoc_major" # Use the labeled column for hue
)
# 45° reference
lims = (1, atomic_tasks[['lb_estimate_in_minutes','ub_estimate_in_minutes']].max().max())
ax.plot(lims, lims, color='black', linestyle='--', linewidth=1)
# optional helper lines: 2× and 10×, 100× ratios
for k in [2,10, 100]:
ax.plot(lims, [k*l for l in lims],
linestyle=':', color='grey', linewidth=1)
ax.set(xscale='log', yscale='log')
ax.set_xlabel('Lower-bound (min, log scale)')
ax.set_ylabel('Upper-bound (min, log scale)')
ax.set_title('Lower vs upper estimates for all tasks')
# Place the legend outside the plot
ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
def cell4():
plt.figure(figsize=(8,4))
sns.histplot(np.log10(atomic_tasks['estimate_ratio'].replace([np.inf, -np.inf], np.nan).dropna()),
bins=60, kde=True)
plt.axvline(np.log10(10), color='red', ls='--', lw=1, label='10×')
plt.axvline(np.log10(1.05), color='orange', ls='--', lw=1, label='1.05×')
plt.axvline(0, color='black', ls='-', lw=1) # ub = lb
plt.xlabel('log₁₀(upper / lower)')
plt.ylabel('Count')
plt.title('Distribution of upper:lower ratio')
plt.legend()
plt.tight_layout()
def cell5():
# 1. Bin lower bounds into quartiles (Q1–Q4)
atomic_tasks['lb_q'] = pd.qcut(atomic_tasks.lb_estimate_in_minutes,
q=4, labels=['Q1 shortest','Q2','Q3','Q4 longest'])
# 3. Aggregate: median (or mean) ratio per cell
pivot = atomic_tasks.pivot_table(index='onetsoc_major', columns='lb_q',
values='estimate_ratio', aggfunc='median')
# Map the index (onetsoc_major codes) to their corresponding labels
pivot.index = pivot.index.map(occupation_major_codes)
# 4. Visualise
plt.figure(figsize=(10,8))
sns.heatmap(pivot, cmap='RdYlGn_r', center=2, annot=True, fmt='.1f',
cbar_kws={'label':'Median upper/lower ratio'})
plt.xlabel('Lower-bound quartile')
plt.ylabel('Occupation (major group)')
plt.title('Typical range width by occupation and task length')
plt.tight_layout()
def cell6():
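# Per-occupation spread statistics (median, IQR, CV) and simple z-score outlier flagging (|z| > 3).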
"""
from scipy.stats import median_abs_deviation
def mad_z(series):
med = series.median()
mad = median_abs_deviation(series, scale='normal') # ⇒ comparable to σ
return (series - med) / mad
df['robust_z'] = df.groupby('onetsoc_code')['estimate_midpoint'].transform(mad_z)
"""
agg = (atomic_tasks
.groupby('onetsoc_code')['estimate_midpoint']
.agg(median='median',
q1=lambda x: x.quantile(.25),
q3=lambda x: x.quantile(.75),
mean='mean',
std='std')
.reset_index())
agg['IQR'] = agg.q3 - agg.q1
agg['CV'] = agg['std'] / agg['mean'] # coefficient of variation
# merge back the group mean and std so each row can be scored
atomic_tasks = atomic_tasks.merge(agg[['onetsoc_code','mean','std']], on='onetsoc_code')
atomic_tasks['z'] = (atomic_tasks.estimate_midpoint - atomic_tasks['mean']) / atomic_tasks['std']
outliers = atomic_tasks.loc[atomic_tasks.z.abs() > 3]
outliers
def cell7():
from scipy.stats import median_abs_deviation
def mad_z(series):
med = series.median()
mad = median_abs_deviation(series, scale='normal') # ⇒ comparable to σ
return (series - med) / mad
atomic_tasks['robust_z'] = atomic_tasks.groupby('onetsoc_code')['estimate_midpoint'].transform(mad_z)
def cell10():
import matplotlib.ticker as mtick # For percentage formatting
import matplotlib.colors as mcolors # For color conversion
summary_data = []
for code, label in occupation_major_codes.items():
occ_df = df_tasks[df_tasks['onetsoc_major'] == code]
total_tasks_in_occ = len(occ_df)
if total_tasks_in_occ == 0:
continue # Skip if no tasks for this occupation
# Stack 1: % that isn't equal to "remote"
not_remote_count = len(occ_df[occ_df['remote_status'] != 'remote'])
# For the remaining remote tasks:
remote_df = occ_df[occ_df['remote_status'] == 'remote']
# Stack 2: % of remote + ATOMIC
remote_atomic_count = len(remote_df[remote_df['estimateable'] == 'ATOMIC'])
# Stack 3: % of remote + ONGOING-CONSTRAINT
remote_ongoing_count = len(remote_df[remote_df['estimateable'] == 'ONGOING-CONSTRAINT'])
summary_data.append({
'onetsoc_major_code': code,
'occupation_label': label,
'count_not_remote': not_remote_count,
'count_remote_atomic': remote_atomic_count,
'count_remote_ongoing': remote_ongoing_count,
'total_tasks': total_tasks_in_occ
})
summary_df = pd.DataFrame(summary_data)
# --- 3. Calculate Percentages ---
# Ensure total_tasks is not zero to avoid division by zero errors if an occupation had no tasks
summary_df = summary_df[summary_df['total_tasks'] > 0].copy() # Use .copy() to avoid SettingWithCopyWarning
summary_df['pct_not_remote'] = (summary_df['count_not_remote'] / summary_df['total_tasks']) * 100
summary_df['pct_remote_atomic'] = (summary_df['count_remote_atomic'] / summary_df['total_tasks']) * 100
summary_df['pct_remote_ongoing'] = (summary_df['count_remote_ongoing'] / summary_df['total_tasks']) * 100
# Select columns for plotting and set index to occupation label
plot_df = summary_df.set_index('occupation_label')[
['pct_not_remote', 'pct_remote_atomic', 'pct_remote_ongoing']
]
# Rename columns for a clearer legend
plot_df.columns = ['Not Remote', 'Remote + Estimable', 'Remote + Not estimable']
plot_df = plot_df.sort_values(by='Not Remote', ascending=False)
# --- 4. Plotting (Modified) ---
# Define the custom colors based on your requirements
# The order must match the column order in plot_df:
# 1. 'Not Remote'
# 2. 'Remote & ATOMIC'
# 3. 'Remote & ONGOING-CONSTRAINT'
bar_colors = [gray["300"], lime["500"], lime["200"]]
fig, ax = plt.subplots(figsize=(14, 10)) # Adjusted figsize for better readability
plot_df.plot(kind='barh', stacked=True, ax=ax, color=bar_colors)
ax.set_xlabel("Percentage of Tasks (%)", fontsize=12)
ax.set_ylabel("Occupation Major Group", fontsize=12)
ax.set_title("Task Breakdown by Occupation, Remote Status, and Estimateability", fontsize=14, pad=20)
# Format x-axis as percentages
ax.xaxis.set_major_formatter(mtick.PercentFormatter())
plt.xlim(0, 100) # Ensure x-axis goes from 0 to 100%
# Remove right and top spines
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
# Function to get contrasting text color
def get_contrasting_text_color(bg_color_hex_or_rgba):
"""
Determines if black or white text provides better contrast against a given background color.
bg_color_hex_or_rgba: A hex string (e.g., '#RRGGBB') or an RGBA tuple (values in [0, 1]).
Returns: 'black' or 'white'.
"""
# Convert to RGBA if it's a hex string or name
if isinstance(bg_color_hex_or_rgba, str):
rgba = mcolors.to_rgba(bg_color_hex_or_rgba)
else:
rgba = bg_color_hex_or_rgba
r, g, b, _ = rgba # Ignore alpha for luminance calculation
# Calculate luminance (standard formula for sRGB)
# Values r, g, b should be in [0, 1] for this formula
luminance = 0.2126 * r + 0.7152 * g + 0.0722 * b
# Threshold for deciding text color
return 'black' if luminance > 0.55 else 'white' # Threshold adjusted slightly for better visual contrast
# Add percentages inside each bar segment
# Iterate through each "category" of bars (Not Remote, Remote & ATOMIC, etc.)
for i, container in enumerate(ax.containers):
# Get the color for this container/category
segment_color = bar_colors[i]
text_color = get_contrasting_text_color(segment_color)
for patch in container.patches: # Iterate through each bar segment in the category
width = patch.get_width()
if width > 3: # Only add text if segment is wide enough (e.g., >3%)
x = patch.get_x() + width / 2
y = patch.get_y() + patch.get_height() / 2
ax.text(x, y,
f"{width:.1f}%",
ha='center',
va='center',
fontsize=8, # Adjust font size as needed
color=text_color,
fontweight='medium') # Bolder text can help
plt.legend(title="Task Category", bbox_to_anchor=(1.02, 1), loc='upper left', frameon=False)
def cell11():
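# NOTE: df_oesm (BLS OEWS wage data with OCC_CODE, TOT_EMP, A_MEAN columns) is assumed to be loaded elsewhere; it is not read in this file.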
df_oesm['onetsoc_major'] = df_oesm['OCC_CODE'].str[:2]
# Calculate wage bill per occupation
# Wage bill = Total Employment * Annual Mean Wage
# Ensure columns are numeric, converting non-numeric values to NaN first
df_oesm['TOT_EMP'] = pd.to_numeric(df_oesm['TOT_EMP'], errors='coerce')
df_oesm['A_MEAN'] = pd.to_numeric(df_oesm['A_MEAN'], errors='coerce')
# Drop rows with NaN in necessary columns after coercion
df_oesm.dropna(subset=['TOT_EMP', 'A_MEAN', 'onetsoc_major'], inplace=True)
df_oesm['wage_bill'] = df_oesm['TOT_EMP'] * df_oesm['A_MEAN']
# Aggregate wage bill by onetsoc_major
df_wage_bill_major = df_oesm.groupby('onetsoc_major')['wage_bill'].sum().reset_index()
# Map major codes to titles for better plotting
df_wage_bill_major['OCC_TITLE_MAJOR'] = df_wage_bill_major['onetsoc_major'].map(occupation_major_codes)
# Sort by wage bill for better visualization
df_wage_bill_major = df_wage_bill_major.sort_values('wage_bill', ascending=False)
# Plotting
plt.figure(figsize=(12, 8))
sns.barplot(x='wage_bill', y='OCC_TITLE_MAJOR', data=df_wage_bill_major, palette="viridis")
plt.title('Total Wage Bill per Major Occupation Group')
plt.xlabel('Total Wage Bill (in billions)')
plt.ylabel('Major Occupation Group')
plt.grid(axis='x', linestyle='--', alpha=0.7)
def cell12():
# ───────────────────────────────────────────────────────────────
# 1. CUMULATIVE-DISTRIBUTION-FUNCTION (CDF) PREP
# ───────────────────────────────────────────────────────────────
def cdf(series):
s = series.sort_values().reset_index(drop=True)
return s.values, ((s.index + 1) / len(s)) * 100
x_lb , y_lb = cdf(atomic_tasks['lb_estimate_in_minutes'])
x_ub , y_ub = cdf(atomic_tasks['ub_estimate_in_minutes'])
x_mid, y_mid = cdf((atomic_tasks['ub_estimate_in_minutes'] + atomic_tasks['lb_estimate_in_minutes']) / 2)
# ───────────────────────────────────────────────────────────────
# 2. PLOTTING
# ───────────────────────────────────────────────────────────────
fig, ax = plt.subplots(figsize=(10, 6))
# horizontal reference lines every 10 %
for y_val in range(0, 101, 10):
ax.axhline(y_val, color=gray['100'], linewidth=.8, zorder=1)
# Plot Lower Bound CDF
ax.step(x_lb, y_lb,
where='post',
color=lime['300'], # Example: light blue for lower bound
linewidth=1.8,
linestyle='--',
zorder=2,
label='Lower bound estimate (CDF)')
# Plot Upper Bound CDF
ax.step(x_ub, y_ub,
where='post',
color=lime['900'], # Example: light orange/red for upper bound
linewidth=1.8,
linestyle=':',
zorder=3,
label='Upper bound estimate (CDF)')
# Plot Midpoint CDF (plotted last to be on top, or adjust zorder)
ax.step(x_mid, y_mid,
where='post',
color=lime['600'],
linewidth=2.2,
zorder=4, # Ensure it's on top of other lines if they overlap significantly
label='Mid-point estimate (CDF)')
# axes limits / scales
ax.set_ylim(0, 100)
ax.set_xscale('log')
# y-axis ➝ percent labels
ax.yaxis.set_major_formatter(mpl.ticker.PercentFormatter(decimals=0))
# move y-label to top-left (just inside plotting area)
ax.text(-0.06, 1.03,
"% of tasks with temporal coherence ≤ X",
ha='left', va='bottom',
transform=ax.transAxes,
fontsize=12, fontweight='semibold')
# custom x-ticks at human-friendly durations
ticks = [1, 5, 10, 30, 60, 120, 240, 480,
1440, 2880, 10080, 43200, 129600,
259200, 525600]
ticklabels = ['1 min', '5 min', '10 min', '30 min', '1 hour', '2 hours', '4 hours', '8 hours',
'1 day', '2 days', '1 week', '30 days',
'90 days', '180 days', '1 year']
# Vertical reference lines for x-ticks
for tick in ticks:
ax.axvline(tick, color=gray['300'], linewidth=.8, linestyle='--', zorder=1)
ax.set_xticks(ticks)
ax.set_xticklabels(ticklabels, rotation=45, ha='right')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_edgecolor(gray['300'])
ax.spines['bottom'].set_edgecolor(gray['300'])
# legend
ax.legend(frameon=False, loc='lower right') # Keep 'lower right' or adjust as needed
ax.text(0.5, -0.3,
'Temporal coherence (X)',
ha='center', va='center',
transform=ax.transAxes,
fontsize=12, fontweight='semibold')

411
old/classify_estimateability_of_tasks.py Normal file
View file

@@ -0,0 +1,411 @@
import pandas as pd
import litellm
import dotenv
import os
import time
import json
import math
# Load environment variables
dotenv.load_dotenv(override=True)
# litellm._turn_on_debug() # Optional debugging
# --- Configuration ---
MODEL = "gpt-4.1-mini" # Make sure this model supports json_schema or structured output
RATE_LIMIT = 5000 # Requests per minute
CHUNK_SIZE = 300 # Number of unique tasks per API call
SECONDS_PER_MINUTE = 60
# File configuration
CLASSIFICATION_FILENAME = "tasks_estimateable.csv" # Output file with classifications
TASK_SOURCE_FOR_INIT_FILENAME = "tasks_with_estimates.csv"
OUTPUT_COLUMN_NAME = "task_estimateable"
SOURCE_FILTER_COLUMN = "remote_status"
SOURCE_FILTER_VALUE = "remote"
# --- Prompts and Schema ---
SYSTEM_PROMPT_CLASSIFY = """
Classify the provided O*NET task into one of these categories:
- ATOMIC (schedulable): A single, clearly-bounded activity, typically lasting minutes, hours, or a few days.
- ONGOING-CONSTRAINT (background role/ethical rule): A continuous responsibility or behavioural norm with no schedulable duration (e.g., follow confidentiality rules, serve as department head).
""".strip()
USER_MESSAGE_TEMPLATE_CLASSIFY = "Task: {task}"
CLASSIFICATION_CATEGORIES = ["ATOMIC", "ONGOING-CONSTRAINT"]
SCHEMA_FOR_CLASSIFICATION = {
"name": "classify_task_type",
"strict": True,
"schema": {
"type": "object",
"properties": {
"task_category": {
"type": "string",
"enum": CLASSIFICATION_CATEGORIES,
"description": "The classification of the task (ATOMIC or ONGOING-CONSTRAINT).",
}
},
"required": ["task_category"],
"additionalProperties": False,
},
}
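# Example of a response payload conforming to SCHEMA_FOR_CLASSIFICATION (illustrative): {"task_category": "ATOMIC"}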
def save_dataframe(df_to_save, filename):
"""Saves the DataFrame to the specified CSV file using atomic write."""
try:
temp_filename = filename + ".tmp"
df_to_save.to_csv(temp_filename, encoding="utf-8-sig", index=False)
os.replace(temp_filename, filename)
except Exception as e:
print(f"--- Error saving DataFrame to {filename}: {e} ---")
if os.path.exists(temp_filename):
try:
os.remove(temp_filename)
except Exception as remove_err:
print(
f"--- Error removing temporary save file {temp_filename}: {remove_err} ---"
)
# --- Load or Initialize DataFrame ---
try:
if os.path.exists(CLASSIFICATION_FILENAME):
df = pd.read_csv(CLASSIFICATION_FILENAME, encoding="utf-8-sig")
print(f"Successfully read {len(df)} rows from {CLASSIFICATION_FILENAME}.")
save_needed_after_load = False
if OUTPUT_COLUMN_NAME not in df.columns:
df[OUTPUT_COLUMN_NAME] = pd.NA
print(f"Added '{OUTPUT_COLUMN_NAME}' column.")
save_needed_after_load = True
df[OUTPUT_COLUMN_NAME] = df[OUTPUT_COLUMN_NAME].replace(["", None], pd.NA)
if df[OUTPUT_COLUMN_NAME].dtype != object and not isinstance(
df[OUTPUT_COLUMN_NAME].dtype, pd.StringDtype
):
try:
df[OUTPUT_COLUMN_NAME] = df[OUTPUT_COLUMN_NAME].astype(object)
print(
f"Corrected dtype of '{OUTPUT_COLUMN_NAME}' to {df[OUTPUT_COLUMN_NAME].dtype}."
)
save_needed_after_load = True
except Exception as e:
print(
f"Warning: Could not convert column '{OUTPUT_COLUMN_NAME}' to object: {e}."
)
if "task" not in df.columns:
print(
f"Error: {CLASSIFICATION_FILENAME} must contain a 'task' column for processing."
)
exit()
if save_needed_after_load:
print(f"Saving {CLASSIFICATION_FILENAME} after adding/adjusting column.")
save_dataframe(df, CLASSIFICATION_FILENAME)
else:
print(
f"{CLASSIFICATION_FILENAME} not found. Attempting to create it from {TASK_SOURCE_FOR_INIT_FILENAME}."
)
if not os.path.exists(TASK_SOURCE_FOR_INIT_FILENAME):
print(
f"Error: Source file {TASK_SOURCE_FOR_INIT_FILENAME} not found. Cannot create {CLASSIFICATION_FILENAME}."
)
exit()
df_source = pd.read_csv(TASK_SOURCE_FOR_INIT_FILENAME, encoding="utf-8-sig")
required_source_cols_for_init = ["task", SOURCE_FILTER_COLUMN]
missing_source_cols = [
col for col in required_source_cols_for_init if col not in df_source.columns
]
if missing_source_cols:
print(
f"Error: Source file {TASK_SOURCE_FOR_INIT_FILENAME} is missing required columns for initialization: {', '.join(missing_source_cols)}."
)
exit()
df_source_filtered = df_source[
df_source[SOURCE_FILTER_COLUMN] == SOURCE_FILTER_VALUE
].copy()
if df_source_filtered.empty:
print(
f"Warning: No tasks with '{SOURCE_FILTER_COLUMN}' == '{SOURCE_FILTER_VALUE}' found in {TASK_SOURCE_FOR_INIT_FILENAME}. "
f"{CLASSIFICATION_FILENAME} will be created with schema but no tasks to classify initially."
)
df = df_source_filtered[["task"]].copy()
df[OUTPUT_COLUMN_NAME] = pd.NA
df[OUTPUT_COLUMN_NAME] = df[OUTPUT_COLUMN_NAME].astype(object)
print(
f"Created {CLASSIFICATION_FILENAME} using tasks from {TASK_SOURCE_FOR_INIT_FILENAME} "
f"(where {SOURCE_FILTER_COLUMN}='{SOURCE_FILTER_VALUE}'). New file has {len(df)} tasks."
)
save_dataframe(df, CLASSIFICATION_FILENAME)
except FileNotFoundError:
print(f"Error: A required file was not found. Please check paths.")
exit()
except Exception as e:
print(f"Error during DataFrame loading or initialization: {e}")
exit()
# --- Identify Unique Tasks to Process ---
if df.empty:
print(f"{CLASSIFICATION_FILENAME} is empty. Nothing to process. Exiting.")
exit()
initial_unprocessed_mask = df[OUTPUT_COLUMN_NAME].isna()
if not initial_unprocessed_mask.any():
print(
f"All tasks in {CLASSIFICATION_FILENAME} seem to have been classified already. Exiting."
)
exit()
# Filter for rows that are unprocessed AND have a valid 'task' string
valid_tasks_to_consider_df = df[
initial_unprocessed_mask & df["task"].notna() & (df["task"].str.strip() != "")
]
if valid_tasks_to_consider_df.empty:
print(
f"No valid, unclassified tasks found to process (after filtering out empty/NaN task descriptions). Exiting."
)
exit()
unique_task_labels_for_api = (
valid_tasks_to_consider_df["task"].drop_duplicates().tolist()
)
total_rows_to_update_potentially = len(
df[initial_unprocessed_mask]
) # Count all rows that are NA
print(
f"Found {total_rows_to_update_potentially} total rows in {CLASSIFICATION_FILENAME} needing classification."
)
print(
f"Identified {len(unique_task_labels_for_api)} unique, valid task labels to send to the API."
)
# --- Prepare messages for batch completion (only for unique task labels) ---
messages_list = []
print(f"Preparing messages for {len(unique_task_labels_for_api)} unique task labels...")
for task_label in unique_task_labels_for_api:
# task_label is already guaranteed to be non-empty and not NaN from the filtering above
user_message = USER_MESSAGE_TEMPLATE_CLASSIFY.format(task=task_label)
messages_for_task = [
{"role": "system", "content": SYSTEM_PROMPT_CLASSIFY},
{"role": "user", "content": user_message},
]
messages_list.append(messages_for_task)
print(f"Prepared {len(messages_list)} message sets for batch completion.")
if (
not messages_list
): # Should only happen if unique_task_labels_for_api was empty, caught above
print(
"No messages prepared, though unique tasks were identified. This is unexpected. Exiting."
)
exit()
# --- Call batch_completion in chunks with rate limiting and periodic saving ---
total_unique_tasks_to_send = len(
messages_list
) # Same as len(unique_task_labels_for_api)
num_chunks = math.ceil(total_unique_tasks_to_send / CHUNK_SIZE)
print(
f"\nStarting batch classification for {total_unique_tasks_to_send} unique task labels in {num_chunks} chunks..."
)
overall_start_time = time.time()
processed_rows_count_total = 0 # Counts actual rows updated in the DataFrame
for i in range(num_chunks):
chunk_start_message_index = i * CHUNK_SIZE
chunk_end_message_index = min((i + 1) * CHUNK_SIZE, total_unique_tasks_to_send)
message_chunk = messages_list[chunk_start_message_index:chunk_end_message_index]
# Get corresponding unique task labels for this chunk
chunk_task_labels = unique_task_labels_for_api[
chunk_start_message_index:chunk_end_message_index
]
if not message_chunk: # Should not happen if loop range is correct
continue
print(
f"\nProcessing chunk {i + 1}/{num_chunks} (Unique Task Labels {chunk_start_message_index + 1}-{chunk_end_message_index} of this run)..."
)
chunk_start_time = time.time()
responses = []
try:
print(
f"Sending {len(message_chunk)} requests (for unique tasks) for chunk {i + 1}..."
)
responses = litellm.batch_completion(
model=MODEL,
messages=message_chunk,
response_format={
"type": "json_schema",
"json_schema": SCHEMA_FOR_CLASSIFICATION,
},
num_retries=3,
)
print(f"Chunk {i + 1} API call completed.")
except Exception as e:
print(f"Error during litellm.batch_completion for chunk {i + 1}: {e}")
responses = [None] * len(message_chunk)
# --- Process responses for the current chunk ---
# chunk_updates stores {task_label: classification_category}
chunk_task_classifications = {}
successful_api_calls_in_chunk = 0
failed_api_calls_in_chunk = 0
if responses and len(responses) == len(message_chunk):
for j, response in enumerate(responses):
current_task_label = chunk_task_labels[
j
] # The unique task label for this response
content_str = None
if response is None:
print(
f"API call failed for task label '{current_task_label}' (response is None)."
)
failed_api_calls_in_chunk += 1
continue
try:
if (
response.choices
and response.choices[0].message
and response.choices[0].message.content
):
content_str = response.choices[0].message.content
classification_data = json.loads(content_str)
category_raw = classification_data.get("task_category")
if category_raw in CLASSIFICATION_CATEGORIES:
successful_api_calls_in_chunk += 1
chunk_task_classifications[current_task_label] = category_raw
else:
print(
f"Warning: Invalid or missing task_category for task label '{current_task_label}': '{category_raw}'. Content: '{content_str}'"
)
failed_api_calls_in_chunk += 1
else:
finish_reason = (
response.choices[0].finish_reason
if (response.choices and response.choices[0].finish_reason)
else "unknown"
)
error_message = (
response.choices[0].message.content
if (response.choices and response.choices[0].message)
else "No content in message."
)
print(
f"Warning: Received non-standard or empty response content for task label '{current_task_label}'. "
f"Finish Reason: '{finish_reason}'. Message: '{error_message}'. Raw Choices: {response.choices}"
)
failed_api_calls_in_chunk += 1
except json.JSONDecodeError:
print(
f"Warning: Could not decode JSON for task label '{current_task_label}'. Content received: '{content_str}'"
)
failed_api_calls_in_chunk += 1
except AttributeError as ae:
print(
f"Warning: Missing attribute processing response for task label '{current_task_label}': {ae}. Response: {response}"
)
failed_api_calls_in_chunk += 1
except Exception as e:
print(
f"Warning: Unexpected error processing response for task label '{current_task_label}': {type(e).__name__} - {e}. Response: {response}"
)
failed_api_calls_in_chunk += 1
else:
print(
f"Warning: Mismatch between #responses ({len(responses) if responses else 0}) "
f"and #messages sent ({len(message_chunk)}) for chunk {i + 1}, or no responses. Marking all API calls in chunk as failed."
)
failed_api_calls_in_chunk = len(message_chunk)
# --- Update Main DataFrame and Save Periodically ---
rows_updated_this_chunk = 0
if chunk_task_classifications:
print(
f"Updating main DataFrame with classifications for {len(chunk_task_classifications)} unique tasks from chunk {i + 1}..."
)
for task_label, category in chunk_task_classifications.items():
# Update all rows in the main df that match this task_label AND are still NA in the output column
update_condition = (df["task"] == task_label) & (
df[OUTPUT_COLUMN_NAME].isna()
)
num_rows_for_this_task_label = df[update_condition].shape[0]
if num_rows_for_this_task_label > 0:
df.loc[update_condition, OUTPUT_COLUMN_NAME] = category
rows_updated_this_chunk += num_rows_for_this_task_label
print(
f"Updated {rows_updated_this_chunk} rows in the DataFrame based on this chunk's API responses."
)
print(f"Saving progress to {CLASSIFICATION_FILENAME}...")
save_dataframe(df, CLASSIFICATION_FILENAME)
else:
print(
f"No successful API classifications obtained in chunk {i + 1} to update DataFrame or save."
)
print(
f"Chunk {i + 1} API summary: Successful Calls={successful_api_calls_in_chunk}, Failed/Skipped Calls={failed_api_calls_in_chunk}. "
f"Rows updated in DataFrame this chunk: {rows_updated_this_chunk}"
)
processed_rows_count_total += rows_updated_this_chunk
# --- Rate Limiting Pause ---
chunk_end_time = time.time()
chunk_duration = chunk_end_time - chunk_start_time
print(f"Chunk {i + 1} (API calls and DF update) took {chunk_duration:.2f} seconds.")
if i < num_chunks - 1:
time_per_request = SECONDS_PER_MINUTE / RATE_LIMIT if RATE_LIMIT > 0 else 0
min_chunk_duration_for_rate = (
len(message_chunk) * time_per_request
) # Based on API calls made
pause_needed = max(0, min_chunk_duration_for_rate - chunk_duration)
if pause_needed > 0:
print(
f"Pausing for {pause_needed:.2f} seconds to respect rate limit ({RATE_LIMIT}/min)..."
)
time.sleep(pause_needed)
overall_end_time = time.time()
total_duration_minutes = (overall_end_time - overall_start_time) / 60
print(
f"\nBatch classification finished."
f" Updated {processed_rows_count_total} rows in '{CLASSIFICATION_FILENAME}' with new classifications in this run."
f" Total duration: {total_duration_minutes:.2f} minutes."
)
print(f"Performing final save to {CLASSIFICATION_FILENAME}...")
save_dataframe(df, CLASSIFICATION_FILENAME)
print("\nScript finished.")

85
old/create_onet_database.sh Executable file
View file

@@ -0,0 +1,85 @@
#!/usr/bin/env bash
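# Build a local SQLite copy of the O*NET 29.1 database from the official MySQL dump,
# using fast-import PRAGMAs during load and restoring safer settings afterwards.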
# Set database name and directories
ONET_DB_NAME="onet.database"
ONET_ZIP_URL="https://www.onetcenter.org/dl_files/database/db_29_1_mysql.zip"
ONET_ZIP_FILE="db_29_1_mysql.zip"
ONET_EXTRACT_DIR="db_29_1_mysql"
# Download O*NET database only if not already downloaded
if [ ! -f "$ONET_ZIP_FILE" ]; then
echo "Downloading O*NET database from $ONET_ZIP_URL"
curl -L -o "$ONET_ZIP_FILE" "$ONET_ZIP_URL" || wget -O "$ONET_ZIP_FILE" "$ONET_ZIP_URL"
if [ $? -ne 0 ]; then
echo "Failed to download O*NET database"
exit 1
fi
else
echo "Using existing O*NET database zip file"
fi
# Extract downloaded zip file only if extraction directory doesn't exist
if [ ! -d "$ONET_EXTRACT_DIR" ]; then
echo "Extracting O*NET database files"
unzip -o "$ONET_ZIP_FILE"
if [ $? -ne 0 ]; then
echo "Failed to extract O*NET database files"
exit 1
fi
else
echo "Using existing extracted O*NET database files"
fi
# Remove existing database if it exists
if [ -f "$ONET_DB_NAME" ]; then
echo "Removing existing database"
rm "$ONET_DB_NAME"
fi
# Create a new SQLite database with optimized settings for fast import
echo "Creating new SQLite database: $ONET_DB_NAME with performance settings"
sqlite3 "$ONET_DB_NAME" << EOF
PRAGMA journal_mode = OFF;
PRAGMA synchronous = 0;
PRAGMA cache_size = 1000000;
PRAGMA locking_mode = EXCLUSIVE;
PRAGMA temp_store = MEMORY;
PRAGMA foreign_keys = ON;
EOF
# Combine and execute all SQL files in one transaction
echo "Executing SQL files in alphabetical order (single transaction mode)"
sqlite3 "$ONET_DB_NAME" << EOF
BEGIN TRANSACTION;
$(find "$ONET_EXTRACT_DIR" -name "*.sql" | sort | xargs cat)
COMMIT;
EOF
# Check if the execution was successful
if [ $? -ne 0 ]; then
echo "Error executing SQL files in batch transaction"
exit 1
else
echo "Database populated successfully. Restoring reliability settings..."
# Restore reliability-focused settings after import
sqlite3 "$ONET_DB_NAME" << EOF
PRAGMA journal_mode = WAL;
PRAGMA synchronous = NORMAL;
PRAGMA locking_mode = NORMAL;
PRAGMA temp_store = DEFAULT;
PRAGMA foreign_keys = ON;
PRAGMA optimize;
VACUUM;
EOF
if [ $? -ne 0 ]; then
echo "Warning: Failed to restore reliability settings, but database is populated"
else
echo "Reliability settings restored successfully"
fi
echo "O*NET database created and optimized successfully!"
fi

392
old/enrich_task_ratings.py Normal file
View file

@@ -0,0 +1,392 @@
import sqlite3
import pandas as pd
import json
import os
from collections import defaultdict
import numpy as np
# --- Configuration ---
DB_FILE = "onet.database"
OUTPUT_FILE = "task_ratings_enriched.json" # Changed output filename
# --- Database Interaction ---
def fetch_data_from_db(db_path):
"""
Fetches required data from the O*NET SQLite database using JOINs,
including DWAs.
Args:
db_path (str): Path to the SQLite database file.
Returns:
tuple(pandas.DataFrame, pandas.DataFrame): A tuple containing:
- DataFrame with task ratings info.
- DataFrame with task-to-DWA mapping.
Returns (None, None) if the database file doesn't exist or an error occurs.
"""
if not os.path.exists(db_path):
print(f"Error: Database file not found at {db_path}")
return None, None
try:
conn = sqlite3.connect(db_path)
# Construct the SQL query to join the tables and select necessary columns
# Added LEFT JOINs for tasks_to_dwas and dwa_reference
# Use LEFT JOIN in case a task has no DWAs
query = """
SELECT
tr.onetsoc_code,
tr.task_id,
ts.task,
od.title AS occupation_title,
od.description AS occupation_description,
tr.scale_id,
tr.category,
tr.data_value,
dr.dwa_title -- Added DWA title
FROM
task_ratings tr
JOIN
task_statements ts ON tr.task_id = ts.task_id
JOIN
occupation_data od ON tr.onetsoc_code = od.onetsoc_code
LEFT JOIN
tasks_to_dwas td ON tr.onetsoc_code = td.onetsoc_code AND tr.task_id = td.task_id
LEFT JOIN
dwa_reference dr ON td.dwa_id = dr.dwa_id;
"""
df = pd.read_sql_query(query, conn)
conn.close()
print(
f"Successfully fetched {len(df)} records (including DWA info) from the database."
)
if df.empty:
print("Warning: Fetched DataFrame is empty.")
# Return empty DataFrames with expected columns if the main fetch is empty
ratings_cols = [
"onetsoc_code",
"task_id",
"task",
"occupation_title",
"occupation_description",
"scale_id",
"category",
"data_value",
]
dwa_cols = ["onetsoc_code", "task_id", "dwa_title"]
return pd.DataFrame(columns=ratings_cols), pd.DataFrame(columns=dwa_cols)
# Remove duplicates caused by joining ratings with potentially multiple DWAs per task
# Keep only unique combinations of the core task/rating info before processing
core_cols = [
"onetsoc_code",
"task_id",
"task",
"occupation_title",
"occupation_description",
"scale_id",
"category",
"data_value",
]
# Check if all core columns exist before attempting to drop duplicates
missing_core_cols = [col for col in core_cols if col not in df.columns]
if missing_core_cols:
print(f"Error: Missing core columns in fetched data: {missing_core_cols}")
return None, None
ratings_df = df[core_cols].drop_duplicates().reset_index(drop=True)
# Get unique DWA info separately
dwa_cols = ["onetsoc_code", "task_id", "dwa_title"]
# Check if all DWA columns exist before processing
if all(col in df.columns for col in dwa_cols):
dwas_df = (
df[dwa_cols]
.dropna(subset=["dwa_title"])
.drop_duplicates()
.reset_index(drop=True)
)
else:
print("Warning: DWA related columns missing, creating empty DWA DataFrame.")
dwas_df = pd.DataFrame(
columns=dwa_cols
) # Create empty df if columns missing
return ratings_df, dwas_df # Return two dataframes now
except sqlite3.Error as e:
print(f"SQLite error: {e}")
if "conn" in locals() and conn:
conn.close()
return None, None # Return None for both if error
except Exception as e:
print(f"An error occurred during data fetching: {e}")
if "conn" in locals() and conn:
conn.close()
return None, None # Return None for both if error
# --- Data Processing ---
def process_task_ratings_with_dwas(ratings_df, dwas_df):
"""
Processes the fetched data to group, pivot frequency, calculate averages,
structure the output, and add associated DWAs.
Args:
ratings_df (pandas.DataFrame): The input DataFrame with task ratings info.
dwas_df (pandas.DataFrame): The input DataFrame with task-to-DWA mapping. Can be None or empty.
Returns:
list: A list of dictionaries, each representing an enriched task rating with DWAs.
Returns None if the input ratings DataFrame is invalid.
"""
if ratings_df is None or not isinstance(
ratings_df, pd.DataFrame
): # Check if it's a DataFrame
print("Error: Input ratings DataFrame is invalid.")
return None
if ratings_df.empty:
print(
"Warning: Input ratings DataFrame is empty. Processing will yield empty result."
)
# Decide how to handle empty input, maybe return empty list directly
# return []
# Ensure dwas_df is a DataFrame, even if empty
if dwas_df is None or not isinstance(dwas_df, pd.DataFrame):
print("Warning: Invalid or missing DWA DataFrame. Proceeding without DWA data.")
dwas_df = pd.DataFrame(
columns=["onetsoc_code", "task_id", "dwa_title"]
) # Ensure it's an empty DF
print("Starting data processing...")
# --- 1. Handle Frequency (FT) ---
freq_df = ratings_df[ratings_df["scale_id"] == "FT"].copy()
if not freq_df.empty:
freq_pivot = freq_df.pivot_table(
index=["onetsoc_code", "task_id"],
columns="category",
values="data_value",
fill_value=0,
)
freq_pivot.columns = [
f"frequency_category_{int(col)}" for col in freq_pivot.columns
]
print(f"Processed Frequency data. Shape: {freq_pivot.shape}")
else:
print("No Frequency (FT) data found.")
# Create an empty DataFrame with the multi-index to allow merging later
idx = pd.MultiIndex(
levels=[[], []], codes=[[], []], names=["onetsoc_code", "task_id"]
)
freq_pivot = pd.DataFrame(index=idx)
# --- 2. Handle Importance (IM, IJ) ---
imp_df = ratings_df[ratings_df["scale_id"].isin(["IM", "IJ"])].copy()
if not imp_df.empty:
imp_avg = (
imp_df.groupby(["onetsoc_code", "task_id"])["data_value"]
.mean()
.reset_index()
)
imp_avg.rename(columns={"data_value": "importance_average"}, inplace=True)
print(f"Processed Importance data. Shape: {imp_avg.shape}")
else:
print("No Importance (IM, IJ) data found.")
imp_avg = pd.DataFrame(
columns=["onetsoc_code", "task_id", "importance_average"]
)
# --- 3. Handle Relevance (RT) ---
rel_df = ratings_df[ratings_df["scale_id"] == "RT"].copy()
if not rel_df.empty:
rel_avg = (
rel_df.groupby(["onetsoc_code", "task_id"])["data_value"]
.mean()
.reset_index()
)
rel_avg.rename(columns={"data_value": "relevance_average"}, inplace=True)
print(f"Processed Relevance data. Shape: {rel_avg.shape}")
else:
print("No Relevance (RT) data found.")
rel_avg = pd.DataFrame(columns=["onetsoc_code", "task_id", "relevance_average"])
# --- 4. Process DWAs ---
if dwas_df is not None and not dwas_df.empty and "dwa_title" in dwas_df.columns:
print("Processing DWA data...")
# Group DWAs by task_id and aggregate titles into a list
dwas_grouped = (
dwas_df.groupby(["onetsoc_code", "task_id"])["dwa_title"]
.apply(list)
.reset_index()
) #
dwas_grouped.rename(
columns={"dwa_title": "dwas"}, inplace=True
) # Rename column to 'dwas'
print(f"Processed DWA data. Shape: {dwas_grouped.shape}")
else:
print("No valid DWA data found or provided for processing.")
dwas_grouped = None # Set to None if no DWAs
# --- 5. Get Base Task/Occupation Info ---
base_cols = [
"onetsoc_code",
"task_id",
"task",
"occupation_title",
"occupation_description",
]
# Check if base columns exist in ratings_df
missing_base_cols = [col for col in base_cols if col not in ratings_df.columns]
if missing_base_cols:
print(
f"Error: Missing base info columns in ratings_df: {missing_base_cols}. Cannot proceed."
)
return None
if not ratings_df.empty:
base_info = (
ratings_df[base_cols]
.drop_duplicates()
.set_index(["onetsoc_code", "task_id"])
)
print(f"Extracted base info. Shape: {base_info.shape}")
else:
print("Cannot extract base info from empty ratings DataFrame.")
# Create an empty df with index to avoid errors later if possible
idx = pd.MultiIndex(
levels=[[], []], codes=[[], []], names=["onetsoc_code", "task_id"]
)
base_info = pd.DataFrame(
index=idx,
columns=[
col for col in base_cols if col not in ["onetsoc_code", "task_id"]
],
)
# --- 6. Merge Processed Data ---
print("Merging processed data...")
# Start with base_info, which should have the index ['onetsoc_code', 'task_id']
final_df = base_info.merge(
freq_pivot, left_index=True, right_index=True, how="left"
)
# Reset index before merging non-indexed dfs
final_df = final_df.reset_index()
# Merge averages - check if they are not empty before merging
if not imp_avg.empty:
final_df = final_df.merge(imp_avg, on=["onetsoc_code", "task_id"], how="left")
else:
final_df["importance_average"] = np.nan # Add column if imp_avg was empty
if not rel_avg.empty:
final_df = final_df.merge(rel_avg, on=["onetsoc_code", "task_id"], how="left")
else:
final_df["relevance_average"] = np.nan # Add column if rel_avg was empty
# Merge DWAs if available
if dwas_grouped is not None and not dwas_grouped.empty:
final_df = final_df.merge(
dwas_grouped, on=["onetsoc_code", "task_id"], how="left"
) # Merge the dwas list
# Fill NaN in 'dwas' column (for tasks with no DWAs) with empty lists
# Check if 'dwas' column exists before applying function
if "dwas" in final_df.columns:
final_df["dwas"] = final_df["dwas"].apply(
lambda x: x if isinstance(x, list) else []
) # Ensure tasks without DWAs get []
else:
print("Warning: 'dwas' column not created during merge.")
final_df["dwas"] = [
[] for _ in range(len(final_df))
] # Add empty list column
else:
# Add an empty 'dwas' column if no DWA data was processed or merged
final_df["dwas"] = [[] for _ in range(len(final_df))]
print(f"Final merged data shape: {final_df.shape}")
# Convert DataFrame to list of dictionaries for JSON output
# Handle potential NaN values during JSON conversion
# Replace numpy NaN with Python None for JSON compatibility
final_df = final_df.replace({np.nan: None})
result_list = final_df.to_dict(orient="records")
return result_list
# --- Output ---
def write_to_json(data, output_path):
"""
Writes the processed data to a JSON file.
Args:
data (list): The list of dictionaries to write.
output_path (str): Path to the output JSON file.
"""
if data is None:
print("No data to write to JSON.")
return
if not isinstance(data, list):
print(
f"Error: Data to write is not a list (type: {type(data)}). Cannot write to JSON."
)
return
# Create directory if it doesn't exist
output_dir = os.path.dirname(output_path)
if output_dir and not os.path.exists(output_dir):
try:
os.makedirs(output_dir)
print(f"Created output directory: {output_dir}")
except OSError as e:
print(f"Error creating output directory {output_dir}: {e}")
return # Exit if cannot create directory
try:
with open(output_path, "w", encoding="utf-8") as f:
json.dump(data, f, indent=4, ensure_ascii=False)
print(f"Successfully wrote enriched data to {output_path}")
except IOError as e:
print(f"Error writing JSON file to {output_path}: {e}")
except TypeError as e:
print(f"Error during JSON serialization: {e}. Check data types.")
except Exception as e:
print(f"An unexpected error occurred during JSON writing: {e}")
# --- Main Execution ---
if __name__ == "__main__":
print("Starting O*NET Task Ratings & DWAs Enrichment Script...")
# 1. Fetch data
ratings_data_df, dwas_data_df = fetch_data_from_db(DB_FILE) # Fetch both datasets
# 2. Process data
# Proceed only if ratings_data_df is a valid DataFrame (even if empty)
# dwas_data_df can be None or empty, handled inside process function
if isinstance(ratings_data_df, pd.DataFrame):
enriched_data = process_task_ratings_with_dwas(
ratings_data_df, dwas_data_df
) # Pass both dataframes
# 3. Write output
if (
enriched_data is not None
): # Check if processing returned data (even an empty list is valid)
write_to_json(enriched_data, OUTPUT_FILE)
else:
print("Data processing failed or returned None. No output file generated.")
else:
print(
"Data fetching failed or returned invalid type for ratings data. Script terminated."
)
print("Script finished.")


@ -7,6 +7,7 @@ from .run import Run
import pandas as pd
def enrich_with_task_estimateability(run: Run) -> pd.DataFrame:
run.metadata.
raise NotImplementedError
def enrich_with_task_estimates(run: Run) -> pd.DataFrame:


@ -5,13 +5,148 @@ Fetchers retrieve remote data and return it in a format suitable for further pro
import sqlite3
from typing import Tuple
import pandas as pd
from .metadata import Metadata
import requests
import hashlib
import io
import zipfile
from .run import Run
from .logger import logger
def fetch_onet_database(meta: Metadata) -> Tuple[sqlite3.Connection, str]:
raise NotImplementedError
def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
"""
Downloads the O*NET database, creates a local SQLite file from it, and returns a connection.
The version is the sha256 of the downloaded zip file.
"""
url = "https://www.onetcenter.org/dl_files/database/db_29_1_mysql.zip"
logger.info(f"Downloading O*NET database from {url}")
response = requests.get(url, stream=True)
response.raise_for_status()
def fetch_oesm_data(meta: Metadata) -> Tuple[pd.DataFrame, str]:
raise NotImplementedError
# Read content into memory
zip_content = response.content
version = hashlib.sha256(zip_content).hexdigest()
logger.info(f"O*NET database version (sha256): {version}")
def fetch_epoch_remote_data(meta: Metadata) -> Tuple[pd.DataFrame, str]:
raise NotImplementedError
db_path = run.cache_dir / f"onet_{version}.db"
if db_path.exists():
logger.info(f"Using cached O*NET database: {db_path}")
conn = sqlite3.connect(db_path)
# Set PRAGMA for foreign keys on every connection
conn.execute("PRAGMA foreign_keys = ON;")
return conn, version
logger.info(f"Creating new O*NET database: {db_path}")
conn = sqlite3.connect(db_path)
# Set performance PRAGMAs for fast import
logger.info("Creating new SQLite database with performance settings")
conn.executescript("""
PRAGMA journal_mode = OFF;
PRAGMA synchronous = 0;
PRAGMA cache_size = 1000000;
PRAGMA locking_mode = EXCLUSIVE;
PRAGMA temp_store = MEMORY;
PRAGMA foreign_keys = ON;
""")
with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
sql_scripts = []
for filename in sorted(z.namelist()):
if filename.endswith(".sql"):
sql_scripts.append(z.read(filename).decode('utf-8'))
if not sql_scripts:
raise RuntimeError("No SQL files found in the O*NET zip archive.")
# Combine and execute all SQL files in one transaction
full_script = "BEGIN TRANSACTION;\n" + "\n".join(sql_scripts) + "\nCOMMIT;"
logger.info("Executing SQL files in alphabetical order (single transaction mode)")
conn.executescript(full_script)
logger.info("Database populated successfully. Restoring reliability settings...")
# Restore reliability-focused settings after import
conn.executescript("""
PRAGMA journal_mode = WAL;
PRAGMA synchronous = NORMAL;
PRAGMA locking_mode = NORMAL;
PRAGMA temp_store = DEFAULT;
PRAGMA foreign_keys = ON;
PRAGMA optimize;
""")
conn.execute("VACUUM;")
conn.commit()
logger.info("Reliability settings restored and database optimized successfully!")
return conn, version
def fetch_oesm_data(run: Run) -> Tuple[pd.DataFrame, str]:
"""
Downloads the OESM national data from the BLS website.
The version is the sha256 of the downloaded zip file.
"""
url = "https://www.bls.gov/oes/special-requests/oesm23nat.zip"
logger.info(f"Downloading OESM data from {url}")
response = requests.get(url)
response.raise_for_status()
zip_content = response.content
version = hashlib.sha256(zip_content).hexdigest()
logger.info(f"OESM data version (sha256): {version}")
parquet_path = run.cache_dir / f"oesm_{version}.parquet"
if parquet_path.exists():
logger.info(f"Using cached OESM data: {parquet_path}")
return pd.read_parquet(parquet_path), version
logger.info(f"Creating new OESM data cache: {parquet_path}")
with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
# Find the excel file in the zip
excel_filename = None
for filename in z.namelist():
logger.debug(f"Found file in OESM zip: {filename}")
if filename.lower().endswith(".xlsx"):
excel_filename = filename
break
if excel_filename is None:
raise FileNotFoundError("Could not find the Excel file in the OESM zip archive.")
logger.info(f"Reading {excel_filename} from zip archive.")
with z.open(excel_filename) as f:
df = pd.read_excel(f, engine='openpyxl')
df.to_parquet(parquet_path)
logger.info(f"Saved OESM data to cache: {parquet_path}")
return df, version
def fetch_epoch_remote_data(run: Run) -> Tuple[pd.DataFrame, str]:
"""
Downloads the EPOCH AI remote work task data.
The version is the sha256 of the downloaded CSV file.
"""
# This is the direct download link constructed from the Google Drive share link
url = "https://drive.google.com/uc?export=download&id=1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r"
logger.info(f"Downloading EPOCH remote data from Google Drive: {url}")
# Need to handle potential cookies/redirects from Google Drive
session = requests.Session()
response = session.get(url, stream=True)
response.raise_for_status()
csv_content = response.content
version = hashlib.sha256(csv_content).hexdigest()
logger.info(f"EPOCH remote data version (sha256): {version}")
parquet_path = run.cache_dir / f"epoch_remote_{version}.parquet"
if parquet_path.exists():
logger.info(f"Using cached EPOCH remote data: {parquet_path}")
return pd.read_parquet(parquet_path), version
logger.info(f"Creating new EPOCH remote data cache: {parquet_path}")
df = pd.read_csv(io.BytesIO(csv_content))
df.to_parquet(parquet_path)
logger.info(f"Saved EPOCH remote data to cache: {parquet_path}")
return df, version
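All three fetchers above follow the same content-addressed caching pattern: hash the downloaded bytes, derive a cache filename from the hash, and reuse the cached artifact when it already exists. A minimal sketch of that pattern, with helper names of our own (cached_fetch, build, load are illustrative, not part of the pipeline):

import hashlib
from pathlib import Path
from typing import Callable, Tuple, TypeVar

T = TypeVar("T")

def cached_fetch(
    raw: bytes,
    cache_dir: Path,
    prefix: str,
    build: Callable[[bytes, Path], T],  # materialise the artifact at the cache path on a miss
    load: Callable[[Path], T],          # read the artifact back on a hit
) -> Tuple[T, str]:
    # The version string is simply the sha256 of the raw download.
    version = hashlib.sha256(raw).hexdigest()
    path = cache_dir / f"{prefix}_{version}"
    if path.exists():
        return load(path), version
    return build(raw, path), version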


@ -2,5 +2,5 @@ from ..run import Run
from pathlib import Path
from typing import Generator
def generate_estimate_histplot(run: Run, output_dir: Path) -> Generator[Path]:
def generate_estimate_histplot(run: Run) -> Generator[Path]:
raise NotImplementedError

24
pipeline/logger.py Normal file

@ -0,0 +1,24 @@
import logging
from logging.handlers import RotatingFileHandler
from rich.logging import RichHandler
LOGGER_NAME = "pipeline"
def setup_logging() -> logging.Logger:
# Set up Rich console handler
rich_handler = RichHandler(
level=logging.DEBUG,
show_time=True,
enable_link_path=True,
rich_tracebacks=True,
# omit_repeated_times=False,
)
logger = logging.getLogger(LOGGER_NAME)
logger.setLevel(logging.DEBUG)
logger.addHandler(rich_handler)
return logger
logger = setup_logging()
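Other modules in the package then import the already-configured instance rather than calling setup_logging() again; for example (illustrative):

from .logger import logger

logger.info("Fetch complete")
logger.debug("Parsed %d rows", 42)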


@ -16,6 +16,7 @@ class Metadata(BaseModel):
versions, and other important information.
"""
fetchers: Dict[str, Dict[str, Any]] = Field(default_factory=dict)
enrichments: Dict[str, Dict[str, Any]] = Field(default_factory=dict)
ts: str = Field(default_factory=lambda: datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
commit: str = Field(default_factory=lambda: _get_current_commit())


@ -1,6 +1,7 @@
from pydantic import BaseModel, Field
import sqlite3
import pandas as pd
from pathlib import Path
from typing import Optional
from .metadata import Metadata
@ -20,3 +21,6 @@ class Run(BaseModel):
task_estimates_df: Optional[pd.DataFrame] = None
meta: Metadata = Field(default_factory=Metadata)
cache_dir: Path
output_dir: Path


@ -5,11 +5,14 @@ from .postprocessors import check_for_insanity, create_df_tasks
from .generators import GENERATORS
from .run import Run
from .constants import GRAY
import platformdirs
import seaborn as sns
import matplotlib as mpl
from pathlib import Path
from typing import Optional
CACHE_DIR = platformdirs.user_cache_dir("econtai")
def run(output_dir: Optional[str] = None):
if output_dir is None:
output_dir = Path(".")
@ -17,12 +20,12 @@ def run(output_dir: Optional[str] = None):
load_dotenv()
_setup_graph_rendering()
current_run = Run()
current_run = Run(output_dir=output_dir, cache_dir=CACHE_DIR)
# Fetchers (fetchers.py)
current_run.onet_conn, current_run.onet_version = fetch_onet_database(current_run.meta)
current_run.oesm_df, current_run.oesm_version = fetch_oesm_data(current_run.meta)
current_run.epoch_df, current_run.epoch_version = fetch_epoch_remote_data(current_run.meta)
current_run.onet_conn, current_run.onet_version = fetch_onet_database(current_run)
current_run.oesm_df, current_run.oesm_version = fetch_oesm_data(current_run)
current_run.epoch_df, current_run.epoch_version = fetch_epoch_remote_data(current_run)
# Enrichments (enrichments.py)
current_run.task_estimateability_df = enrich_with_task_estimateability(current_run)
@ -34,7 +37,7 @@ def run(output_dir: Optional[str] = None):
# Generators (generators/)
for gen in GENERATORS:
gen(current_run, output_dir)
gen(current_run)
def _setup_graph_rendering():

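Assuming the package is importable as pipeline (its modules live under pipeline/), the reworked entry point is driven by a single call; a usage sketch:

from pipeline import run

# Figures and data land in ./out; downloads are cached under the platformdirs user cache dir.
run(output_dir="out")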

@ -6,9 +6,12 @@ readme = "README.md"
requires-python = ">=3.13"
dependencies = [
"matplotlib>=3.10.3",
"openpyxl>=3.1.5",
"pandas>=2.2.3",
"platformdirs>=4.3.8",
"pydantic>=2.11.7",
"python-dotenv>=1.1.1",
"requests>=2.32.4",
"seaborn>=0.13.2",
]

100
uv.lock generated

@ -11,6 +11,37 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload_time = "2024-05-20T21:33:24.1Z" },
]
[[package]]
name = "certifi"
version = "2025.6.15"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/73/f7/f14b46d4bcd21092d7d3ccef689615220d8a08fb25e564b65d20738e672e/certifi-2025.6.15.tar.gz", hash = "sha256:d747aa5a8b9bbbb1bb8c22bb13e22bd1f18e9796defa16bab421f7f7a317323b", size = 158753, upload_time = "2025-06-15T02:45:51.329Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/84/ae/320161bd181fc06471eed047ecce67b693fd7515b16d495d8932db763426/certifi-2025.6.15-py3-none-any.whl", hash = "sha256:2e0c7ce7cb5d8f8634ca55d2ba7e6ec2689a2fd6537d8dec1296a477a4910057", size = 157650, upload_time = "2025-06-15T02:45:49.977Z" },
]
[[package]]
name = "charset-normalizer"
version = "3.4.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/e4/33/89c2ced2b67d1c2a61c19c6751aa8902d46ce3dacb23600a283619f5a12d/charset_normalizer-3.4.2.tar.gz", hash = "sha256:5baececa9ecba31eff645232d59845c07aa030f0c81ee70184a90d35099a0e63", size = 126367, upload_time = "2025-05-02T08:34:42.01Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/ea/12/a93df3366ed32db1d907d7593a94f1fe6293903e3e92967bebd6950ed12c/charset_normalizer-3.4.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:926ca93accd5d36ccdabd803392ddc3e03e6d4cd1cf17deff3b989ab8e9dbcf0", size = 199622, upload_time = "2025-05-02T08:32:56.363Z" },
{ url = "https://files.pythonhosted.org/packages/04/93/bf204e6f344c39d9937d3c13c8cd5bbfc266472e51fc8c07cb7f64fcd2de/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eba9904b0f38a143592d9fc0e19e2df0fa2e41c3c3745554761c5f6447eedabf", size = 143435, upload_time = "2025-05-02T08:32:58.551Z" },
{ url = "https://files.pythonhosted.org/packages/22/2a/ea8a2095b0bafa6c5b5a55ffdc2f924455233ee7b91c69b7edfcc9e02284/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3fddb7e2c84ac87ac3a947cb4e66d143ca5863ef48e4a5ecb83bd48619e4634e", size = 153653, upload_time = "2025-05-02T08:33:00.342Z" },
{ url = "https://files.pythonhosted.org/packages/b6/57/1b090ff183d13cef485dfbe272e2fe57622a76694061353c59da52c9a659/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98f862da73774290f251b9df8d11161b6cf25b599a66baf087c1ffe340e9bfd1", size = 146231, upload_time = "2025-05-02T08:33:02.081Z" },
{ url = "https://files.pythonhosted.org/packages/e2/28/ffc026b26f441fc67bd21ab7f03b313ab3fe46714a14b516f931abe1a2d8/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c9379d65defcab82d07b2a9dfbfc2e95bc8fe0ebb1b176a3190230a3ef0e07c", size = 148243, upload_time = "2025-05-02T08:33:04.063Z" },
{ url = "https://files.pythonhosted.org/packages/c0/0f/9abe9bd191629c33e69e47c6ef45ef99773320e9ad8e9cb08b8ab4a8d4cb/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e635b87f01ebc977342e2697d05b56632f5f879a4f15955dfe8cef2448b51691", size = 150442, upload_time = "2025-05-02T08:33:06.418Z" },
{ url = "https://files.pythonhosted.org/packages/67/7c/a123bbcedca91d5916c056407f89a7f5e8fdfce12ba825d7d6b9954a1a3c/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1c95a1e2902a8b722868587c0e1184ad5c55631de5afc0eb96bc4b0d738092c0", size = 145147, upload_time = "2025-05-02T08:33:08.183Z" },
{ url = "https://files.pythonhosted.org/packages/ec/fe/1ac556fa4899d967b83e9893788e86b6af4d83e4726511eaaad035e36595/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ef8de666d6179b009dce7bcb2ad4c4a779f113f12caf8dc77f0162c29d20490b", size = 153057, upload_time = "2025-05-02T08:33:09.986Z" },
{ url = "https://files.pythonhosted.org/packages/2b/ff/acfc0b0a70b19e3e54febdd5301a98b72fa07635e56f24f60502e954c461/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:32fc0341d72e0f73f80acb0a2c94216bd704f4f0bce10aedea38f30502b271ff", size = 156454, upload_time = "2025-05-02T08:33:11.814Z" },
{ url = "https://files.pythonhosted.org/packages/92/08/95b458ce9c740d0645feb0e96cea1f5ec946ea9c580a94adfe0b617f3573/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:289200a18fa698949d2b39c671c2cc7a24d44096784e76614899a7ccf2574b7b", size = 154174, upload_time = "2025-05-02T08:33:13.707Z" },
{ url = "https://files.pythonhosted.org/packages/78/be/8392efc43487ac051eee6c36d5fbd63032d78f7728cb37aebcc98191f1ff/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4a476b06fbcf359ad25d34a057b7219281286ae2477cc5ff5e3f70a246971148", size = 149166, upload_time = "2025-05-02T08:33:15.458Z" },
{ url = "https://files.pythonhosted.org/packages/44/96/392abd49b094d30b91d9fbda6a69519e95802250b777841cf3bda8fe136c/charset_normalizer-3.4.2-cp313-cp313-win32.whl", hash = "sha256:aaeeb6a479c7667fbe1099af9617c83aaca22182d6cf8c53966491a0f1b7ffb7", size = 98064, upload_time = "2025-05-02T08:33:17.06Z" },
{ url = "https://files.pythonhosted.org/packages/e9/b0/0200da600134e001d91851ddc797809e2fe0ea72de90e09bec5a2fbdaccb/charset_normalizer-3.4.2-cp313-cp313-win_amd64.whl", hash = "sha256:aa6af9e7d59f9c12b33ae4e9450619cf2488e2bbe9b44030905877f0b2324980", size = 105641, upload_time = "2025-05-02T08:33:18.753Z" },
{ url = "https://files.pythonhosted.org/packages/20/94/c5790835a017658cbfabd07f3bfb549140c3ac458cfc196323996b10095a/charset_normalizer-3.4.2-py3-none-any.whl", hash = "sha256:7f56930ab0abd1c45cd15be65cc741c28b1c9a34876ce8c17a2fa107810c0af0", size = 52626, upload_time = "2025-05-02T08:34:40.053Z" },
]
[[package]]
name = "contourpy"
version = "1.3.2"
@ -51,6 +82,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30", size = 8321, upload_time = "2023-10-07T05:32:16.783Z" },
]
[[package]]
name = "et-xmlfile"
version = "2.0.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload_time = "2024-10-25T17:25:40.039Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload_time = "2024-10-25T17:25:39.051Z" },
]
[[package]]
name = "fonttools"
version = "4.58.5"
@ -68,6 +108,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d7/d4/1d85a1996b6188cd2713230e002d79a6f3a289bb17cef600cba385848b72/fonttools-4.58.5-py3-none-any.whl", hash = "sha256:e48a487ed24d9b611c5c4b25db1e50e69e9854ca2670e39a3486ffcd98863ec4", size = 1115318, upload_time = "2025-07-03T14:04:45.378Z" },
]
[[package]]
name = "idna"
version = "3.10"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490, upload_time = "2024-09-15T18:07:39.745Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload_time = "2024-09-15T18:07:37.964Z" },
]
[[package]]
name = "kiwisolver"
version = "1.4.8"
@ -163,6 +212,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/63/be/b85e4aa4bf42c6502851b971f1c326d583fcc68227385f92089cf50a7b45/numpy-2.2.5-cp313-cp313t-win_amd64.whl", hash = "sha256:d403c84991b5ad291d3809bace5e85f4bbf44a04bdc9a88ed2bb1807b3360bb8", size = 12750096, upload_time = "2025-04-19T22:47:00.147Z" },
]
[[package]]
name = "openpyxl"
version = "3.1.5"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "et-xmlfile" },
]
sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload_time = "2024-06-28T14:03:44.161Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload_time = "2024-06-28T14:03:41.161Z" },
]
[[package]]
name = "packaging"
version = "25.0"
@ -254,6 +315,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/89/c7/5572fa4a3f45740eaab6ae86fcdf7195b55beac1371ac8c619d880cfe948/pillow-11.3.0-cp314-cp314t-win_arm64.whl", hash = "sha256:79ea0d14d3ebad43ec77ad5272e6ff9bba5b679ef73375ea760261207fa8e0aa", size = 2512835, upload_time = "2025-07-01T09:15:50.399Z" },
]
[[package]]
name = "platformdirs"
version = "4.3.8"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/fe/8b/3c73abc9c759ecd3f1f7ceff6685840859e8070c4d947c93fae71f6a0bf2/platformdirs-4.3.8.tar.gz", hash = "sha256:3d512d96e16bcb959a814c9f348431070822a6496326a4be0911c40b5a74c2bc", size = 21362, upload_time = "2025-05-07T22:47:42.121Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/fe/39/979e8e21520d4e47a0bbe349e2713c0aac6f3d853d0e5b34d76206c439aa/platformdirs-4.3.8-py3-none-any.whl", hash = "sha256:ff7059bb7eb1179e2685604f4aaf157cfd9535242bd23742eadc3c13542139b4", size = 18567, upload_time = "2025-05-07T22:47:40.376Z" },
]
[[package]]
name = "pydantic"
version = "2.11.7"
@ -336,6 +406,21 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload_time = "2025-03-25T02:24:58.468Z" },
]
[[package]]
name = "requests"
version = "2.32.4"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "certifi" },
{ name = "charset-normalizer" },
{ name = "idna" },
{ name = "urllib3" },
]
sdist = { url = "https://files.pythonhosted.org/packages/e1/0a/929373653770d8a0d7ea76c37de6e41f11eb07559b103b1c02cafb3f7cf8/requests-2.32.4.tar.gz", hash = "sha256:27d0316682c8a29834d3264820024b62a36942083d52caf2f14c0591336d3422", size = 135258, upload_time = "2025-06-09T16:43:07.34Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/7c/e4/56027c4a6b4ae70ca9de302488c5ca95ad4a39e190093d6c1a8ace08341b/requests-2.32.4-py3-none-any.whl", hash = "sha256:27babd3cda2a6d50b30443204ee89830707d396671944c998b5975b031ac2b2c", size = 64847, upload_time = "2025-06-09T16:43:05.728Z" },
]
[[package]]
name = "seaborn"
version = "0.13.2"
@ -365,18 +450,24 @@ version = "0.1.0"
source = { virtual = "." }
dependencies = [
{ name = "matplotlib" },
{ name = "openpyxl" },
{ name = "pandas" },
{ name = "platformdirs" },
{ name = "pydantic" },
{ name = "python-dotenv" },
{ name = "requests" },
{ name = "seaborn" },
]
[package.metadata]
requires-dist = [
{ name = "matplotlib", specifier = ">=3.10.3" },
{ name = "openpyxl", specifier = ">=3.1.5" },
{ name = "pandas", specifier = ">=2.2.3" },
{ name = "platformdirs", specifier = ">=4.3.8" },
{ name = "pydantic", specifier = ">=2.11.7" },
{ name = "python-dotenv", specifier = ">=1.1.1" },
{ name = "requests", specifier = ">=2.32.4" },
{ name = "seaborn", specifier = ">=0.13.2" },
]
@ -412,3 +503,12 @@ sdist = { url = "https://files.pythonhosted.org/packages/95/32/1a225d6164441be76
wheels = [
{ url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload_time = "2025-03-23T13:54:41.845Z" },
]
[[package]]
name = "urllib3"
version = "2.5.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/15/22/9ee70a2574a4f4599c47dd506532914ce044817c7752a79b6a51286319bc/urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760", size = 393185, upload_time = "2025-06-18T14:07:41.644Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 129795, upload_time = "2025-06-18T14:07:40.39Z" },
]