progress

This commit is contained in:
parent 2da206d368
commit b7c94590f9

14 changed files with 2200 additions and 13 deletions
507  old/add_task_estimates.py  Normal file
@@ -0,0 +1,507 @@
import pandas as pd
import litellm
import dotenv
import os
import time
import json
import math
import numpy as np

# Load environment variables (API keys) from .env (mirrors classify_estimateability_of_tasks.py)
dotenv.load_dotenv(override=True)

# --- Configuration ---
MODEL = "gpt-4.1-mini"  # Make sure this model supports json_schema or structured output
RATE_LIMIT = 5000  # Requests per minute
CHUNK_SIZE = 300
SECONDS_PER_MINUTE = 60
FILENAME = (
    "tasks_with_estimates.csv"  # This CSV should contain the tasks to be processed
)

# --- Prompts and Schema ---
SYSTEM_PROMPT = """
You are an expert assistant evaluating the time to completion required for job tasks. Your goal is to estimate the time range needed for a skilled human to complete the following job task remotely, without supervision.

Provide a lower and an upper bound estimate for the time to completion. These bounds should capture the time within which approximately 80% of instances of performing this specific task are typically completed by a qualified individual.

Base your estimate on the provided task description, its associated activities, and the occupational context. Your estimate must be in one of the allowed units: minute, hour, day, week, month, trimester, semester, year.
""".strip()

USER_MESSAGE_TEMPLATE = """
Please estimate the time range for the following remote task:

**Task Description:** {task}
**Relevant activities for the task:**
{dwas}

**Occupation Category:** {occupation_title}
**Occupation Description:** {occupation_description}

Consider the complexity and the typical steps involved.
""".strip()
|
||||
|
||||
ALLOWED_UNITS = [
    "minute",
    "hour",
    "day",
    "week",
    "month",
    "trimester",
    "semester",
    "year",
]
|
||||
|
||||
SCHEMA_FOR_VALIDATION = {
|
||||
"name": "estimate_time",
|
||||
"strict": True, # Enforce schema adherence
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"lower_bound_estimate": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"quantity": {
|
||||
"type": "number",
|
||||
"description": "The numerical value for the lower bound of the estimate.",
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"enum": ALLOWED_UNITS,
|
||||
"description": "The unit of time for the lower bound.",
|
||||
},
|
||||
},
|
||||
"required": ["quantity", "unit"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
"upper_bound_estimate": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"quantity": {
|
||||
"type": "number",
|
||||
"description": "The numerical value for the upper bound of the estimate.",
|
||||
},
|
||||
"unit": {
|
||||
"type": "string",
|
||||
"enum": ALLOWED_UNITS,
|
||||
"description": "The unit of time for the upper bound.",
|
||||
},
|
||||
},
|
||||
"required": ["quantity", "unit"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
},
|
||||
"required": ["lower_bound_estimate", "upper_bound_estimate"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
}
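
# Illustrative example (not used by the script): a response that satisfies
# SCHEMA_FOR_VALIDATION parses, via json.loads, into a dict shaped like
#
#     {
#         "lower_bound_estimate": {"quantity": 30, "unit": "minute"},
#         "upper_bound_estimate": {"quantity": 2, "unit": "hour"},
#     }
#
# which is exactly the structure the response-processing loop below reads its
# quantities and units from.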
|
||||
|
||||
|
||||
def save_dataframe(df_to_save, filename):
|
||||
|
||||
"""Saves the DataFrame to the specified CSV file using atomic write."""
|
||||
try:
|
||||
temp_filename = filename + ".tmp"
|
||||
df_to_save.to_csv(temp_filename, encoding="utf-8-sig", index=False)
|
||||
os.replace(temp_filename, filename)
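# os.replace is atomic when the temp file and the target are on the same filesystem,
# so readers of the CSV never observe a partially written file.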
|
||||
except Exception as e:
|
||||
print(f"--- Error saving DataFrame to {filename}: {e} ---")
|
||||
if os.path.exists(temp_filename):
|
||||
try:
|
||||
os.remove(temp_filename)
|
||||
except Exception as remove_err:
|
||||
print(
|
||||
f"--- Error removing temporary save file {temp_filename}: {remove_err} ---"
|
||||
)
|
||||
|
||||
def create_task_estimates():
|
||||
try:
|
||||
# Read the CSV
|
||||
if os.path.exists(FILENAME):
|
||||
df = pd.read_csv(FILENAME, encoding="utf-8-sig")
|
||||
print(f"Successfully read {len(df)} rows from {FILENAME}.")
|
||||
|
||||
estimate_columns_spec = {
|
||||
"lb_estimate_qty": float,
|
||||
"lb_estimate_unit": object,
|
||||
"ub_estimate_qty": float,
|
||||
"ub_estimate_unit": object,
|
||||
}
|
||||
save_needed = False
|
||||
|
||||
for col_name, target_dtype in estimate_columns_spec.items():
|
||||
if col_name not in df.columns:
|
||||
# Initialize with a type-compatible missing value
|
||||
if target_dtype == float:
|
||||
df[col_name] = np.nan
|
||||
else: # object
|
||||
df[col_name] = pd.NA
|
||||
df[col_name] = df[col_name].astype(target_dtype) # Enforce dtype
|
||||
print(f"Added '{col_name}' column as {df[col_name].dtype}.")
|
||||
save_needed = True
|
||||
else:
|
||||
# Column exists, ensure correct dtype
|
||||
current_pd_dtype = df[col_name].dtype
|
||||
expected_pd_dtype = pd.Series(dtype=target_dtype).dtype
|
||||
|
||||
if current_pd_dtype != expected_pd_dtype:
|
||||
try:
|
||||
if target_dtype == float:
|
||||
df[col_name] = pd.to_numeric(df[col_name], errors="coerce")
|
||||
else: # object
|
||||
df[col_name] = df[col_name].astype(object)
|
||||
print(
|
||||
f"Corrected dtype of '{col_name}' to {df[col_name].dtype}."
|
||||
)
|
||||
save_needed = True
|
||||
except Exception as e:
|
||||
print(
|
||||
f"Warning: Could not convert column '{col_name}' to {target_dtype}: {e}. Current dtype: {current_pd_dtype}"
|
||||
)
|
||||
|
||||
# Standardize missing values (e.g., empty strings to NA/NaN)
# Replace common missing placeholders with pd.NA first (plain assignment avoids the
# chained-assignment pitfall of calling replace(..., inplace=True) on a column view)
df[col_name] = df[col_name].replace(["", None], pd.NA)
if target_dtype == float:
# For float columns, ensure they are numeric and use np.nan after replacement
df[col_name] = pd.to_numeric(df[col_name], errors="coerce")
|
||||
|
||||
if save_needed:
|
||||
print(f"Saving {FILENAME} after adding/adjusting estimate columns.")
|
||||
save_dataframe(df, FILENAME)
|
||||
else:
|
||||
print(
|
||||
f"Error: {FILENAME} not found. Please ensure the file exists and contains task data."
|
||||
)
|
||||
exit()
|
||||
except FileNotFoundError:
|
||||
print(
|
||||
f"Error: {FILENAME} not found. Please ensure the file exists and contains task data."
|
||||
)
|
||||
exit()
|
||||
except Exception as e:
|
||||
print(f"Error reading or initializing {FILENAME}: {e}")
|
||||
exit()
|
||||
|
||||
# --- Identify Rows to Process ---
|
||||
# We'll check for NaN in one of the primary quantity columns.
|
||||
unprocessed_mask = df["lb_estimate_qty"].isna()
|
||||
if unprocessed_mask.any():
|
||||
start_index = unprocessed_mask.idxmax() # Finds the index of the first True value
|
||||
print(f"Resuming processing. First unprocessed row found at index {start_index}.")
|
||||
df_to_process = df.loc[unprocessed_mask].copy()
|
||||
original_indices = df_to_process.index # Keep track of original indices
|
||||
else:
|
||||
print(
|
||||
"All rows seem to have estimates already (based on 'lb_estimate_qty'). Exiting."
|
||||
)
|
||||
exit()
|
||||
|
||||
|
||||
# --- Prepare messages for batch completion (only for rows needing processing) ---
|
||||
messages_list = []
|
||||
skipped_rows_indices = []
|
||||
valid_original_indices = []
|
||||
|
||||
if not df_to_process.empty:
|
||||
required_cols = ["task", "occupation_title", "occupation_description", "dwas"]
|
||||
print(
|
||||
f"Preparing messages for up to {len(df_to_process)} rows starting from original index {original_indices[0] if len(original_indices) > 0 else 'N/A'}..."
|
||||
)
|
||||
print(f"Checking for required columns: {required_cols}")
|
||||
|
||||
for index, row in df_to_process.iterrows():
|
||||
missing_or_empty = []
|
||||
for col in required_cols:
|
||||
if col not in row or pd.isna(row[col]) or str(row[col]).strip() == "":
|
||||
missing_or_empty.append(col)
|
||||
|
||||
if missing_or_empty:
|
||||
print(
|
||||
f"Warning: Skipping row original index {index} due to missing/empty required data in columns: {', '.join(missing_or_empty)}."
|
||||
)
|
||||
skipped_rows_indices.append(index)
|
||||
continue
|
||||
|
||||
try:
|
||||
user_message = USER_MESSAGE_TEMPLATE.format(
|
||||
task=row["task"],
|
||||
occupation_title=row["occupation_title"],
|
||||
occupation_description=row["occupation_description"],
|
||||
dwas=row["dwas"],
|
||||
)
|
||||
except KeyError as e:
|
||||
print(
|
||||
f"Error: Skipping row original index {index} due to formatting error - missing key: {e}. Check USER_MESSAGE_TEMPLATE and CSV columns."
|
||||
)
|
||||
skipped_rows_indices.append(index)
|
||||
continue
|
||||
|
||||
messages_for_row = [
|
||||
{"role": "system", "content": SYSTEM_PROMPT},
|
||||
{"role": "user", "content": user_message},
|
||||
]
|
||||
messages_list.append(messages_for_row)
|
||||
valid_original_indices.append(index) # This is the original DataFrame index
|
||||
|
||||
print(
|
||||
f"Prepared {len(messages_list)} valid message sets for batch completion (skipped {len(skipped_rows_indices)} rows)."
|
||||
)
|
||||
if not messages_list:
|
||||
print("No valid rows found to process after checking required data. Exiting.")
|
||||
exit()
|
||||
else:
|
||||
print(
|
||||
"No rows found needing processing (df_to_process is empty)."
|
||||
) # Should have been caught by earlier check
|
||||
exit()
|
||||
|
||||
|
||||
# --- Call batch_completion in chunks with rate limiting and periodic saving ---
|
||||
total_messages_to_send = len(messages_list)
|
||||
num_chunks = math.ceil(total_messages_to_send / CHUNK_SIZE)
|
||||
|
||||
print(
|
||||
f"\nStarting batch completion for {total_messages_to_send} items in {num_chunks} chunks..."
|
||||
)
|
||||
|
||||
overall_start_time = time.time()
|
||||
processed_count_total = 0
|
||||
|
||||
for i in range(num_chunks):
|
||||
chunk_start_message_index = i * CHUNK_SIZE
|
||||
chunk_end_message_index = min((i + 1) * CHUNK_SIZE, total_messages_to_send)
|
||||
message_chunk = messages_list[chunk_start_message_index:chunk_end_message_index]
|
||||
# Get corresponding original DataFrame indices for this chunk
|
||||
chunk_original_indices = valid_original_indices[
|
||||
chunk_start_message_index:chunk_end_message_index
|
||||
]
|
||||
|
||||
if not message_chunk:
|
||||
continue
|
||||
|
||||
min_idx_disp = min(chunk_original_indices) if chunk_original_indices else "N/A"
|
||||
max_idx_disp = max(chunk_original_indices) if chunk_original_indices else "N/A"
|
||||
print(
|
||||
f"\nProcessing chunk {i + 1}/{num_chunks} (Messages {chunk_start_message_index + 1}-{chunk_end_message_index} of this run)..."
|
||||
f" Corresponding to original indices: {min_idx_disp} - {max_idx_disp}"
|
||||
)
|
||||
chunk_start_time = time.time()
|
||||
responses = []
|
||||
try:
|
||||
print(f"Sending {len(message_chunk)} requests for chunk {i + 1}...")
|
||||
responses = litellm.batch_completion(
|
||||
model=MODEL,
|
||||
messages=message_chunk,
|
||||
response_format={
|
||||
"type": "json_schema",
|
||||
"json_schema": SCHEMA_FOR_VALIDATION,
|
||||
},
|
||||
num_retries=3,
|
||||
# request_timeout=60 # Optional: uncomment if needed
|
||||
)
|
||||
print(f"Chunk {i + 1} API call completed.")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error during litellm.batch_completion for chunk {i + 1}: {e}")
|
||||
responses = [None] * len(
|
||||
message_chunk
|
||||
) # Ensure responses list matches message_chunk length for processing loop
|
||||
|
||||
# --- Process responses for the current chunk ---
|
||||
chunk_updates = {} # To store {original_df_index: {qty/unit data}}
|
||||
successful_in_chunk = 0
|
||||
failed_in_chunk = 0
|
||||
|
||||
if responses and len(responses) == len(message_chunk):
|
||||
for j, response in enumerate(responses):
|
||||
original_df_index = chunk_original_indices[j]
|
||||
|
||||
# Initialize values for this item
|
||||
lb_qty_val, lb_unit_val, ub_qty_val, ub_unit_val = None, None, None, None
|
||||
content_str = None
|
||||
|
||||
if response is None:
|
||||
print(
|
||||
f"Skipping processing for original index {original_df_index} due to API call failure for this item (response is None)."
|
||||
)
|
||||
failed_in_chunk += 1
|
||||
continue
|
||||
|
||||
try:
|
||||
if (
|
||||
response.choices
|
||||
and response.choices[0].message
|
||||
and response.choices[0].message.content
|
||||
):
|
||||
content_str = response.choices[0].message.content
|
||||
estimate_data = json.loads(content_str) # Can raise JSONDecodeError
|
||||
|
||||
lower_bound_dict = estimate_data.get("lower_bound_estimate")
|
||||
upper_bound_dict = estimate_data.get("upper_bound_estimate")
|
||||
|
||||
valid_response_structure = isinstance(
|
||||
lower_bound_dict, dict
|
||||
) and isinstance(upper_bound_dict, dict)
|
||||
|
||||
if valid_response_structure:
|
||||
lb_qty_raw = lower_bound_dict.get("quantity")
|
||||
lb_unit_raw = lower_bound_dict.get("unit")
|
||||
ub_qty_raw = upper_bound_dict.get("quantity")
|
||||
ub_unit_raw = upper_bound_dict.get("unit")
|
||||
|
||||
is_valid_item = True
|
||||
# Validate LB Qty
|
||||
if (
|
||||
not isinstance(lb_qty_raw, (int, float))
|
||||
or math.isnan(float(lb_qty_raw))
|
||||
or float(lb_qty_raw) < 0
|
||||
):
|
||||
print(
|
||||
f"Warning: Invalid lb_quantity for original index {original_df_index}: {lb_qty_raw}"
|
||||
)
|
||||
is_valid_item = False
|
||||
else:
|
||||
lb_qty_val = float(lb_qty_raw)
|
||||
|
||||
# Validate UB Qty
|
||||
if (
|
||||
not isinstance(ub_qty_raw, (int, float))
|
||||
or math.isnan(float(ub_qty_raw))
|
||||
or float(ub_qty_raw) < 0
|
||||
):
|
||||
print(
|
||||
f"Warning: Invalid ub_quantity for original index {original_df_index}: {ub_qty_raw}"
|
||||
)
|
||||
is_valid_item = False
|
||||
else:
|
||||
ub_qty_val = float(ub_qty_raw)
|
||||
|
||||
# Validate Units
|
||||
if lb_unit_raw not in ALLOWED_UNITS:
|
||||
print(
|
||||
f"Warning: Invalid lb_unit for original index {original_df_index}: '{lb_unit_raw}'"
|
||||
)
|
||||
is_valid_item = False
|
||||
else:
|
||||
lb_unit_val = lb_unit_raw
|
||||
|
||||
if ub_unit_raw not in ALLOWED_UNITS:
|
||||
print(
|
||||
f"Warning: Invalid ub_unit for original index {original_df_index}: '{ub_unit_raw}'"
|
||||
)
|
||||
is_valid_item = False
|
||||
else:
|
||||
ub_unit_val = ub_unit_raw
|
||||
|
||||
if is_valid_item:
|
||||
successful_in_chunk += 1
|
||||
chunk_updates[original_df_index] = {
|
||||
"lb_estimate_qty": lb_qty_val,
|
||||
"lb_estimate_unit": lb_unit_val,
|
||||
"ub_estimate_qty": ub_qty_val,
|
||||
"ub_estimate_unit": ub_unit_val,
|
||||
}
|
||||
else:
|
||||
failed_in_chunk += (
|
||||
1 # Values remain None if not fully valid
|
||||
)
|
||||
else:
|
||||
print(
|
||||
f"Warning: Missing or malformed estimate dicts in JSON for original index {original_df_index}. Content: '{content_str}'"
|
||||
)
|
||||
failed_in_chunk += 1
|
||||
else:
|
||||
finish_reason = (
|
||||
response.choices[0].finish_reason
|
||||
if (response.choices and response.choices[0].finish_reason)
|
||||
else "unknown"
|
||||
)
|
||||
error_message = (
|
||||
response.choices[0].message.content
|
||||
if (
|
||||
response.choices
|
||||
and response.choices[0].message
|
||||
and response.choices[0].message.content
|
||||
)
|
||||
else "No content in message."
|
||||
)
|
||||
print(
|
||||
f"Warning: Received non-standard or empty response content for original index {original_df_index}. "
|
||||
f"Finish Reason: '{finish_reason}'. Message: '{error_message}'. Raw Choices: {response.choices}"
|
||||
)
|
||||
failed_in_chunk += 1
|
||||
|
||||
except json.JSONDecodeError:
|
||||
print(
|
||||
f"Warning: Could not decode JSON for original index {original_df_index}. Content received: '{content_str}'"
|
||||
)
|
||||
failed_in_chunk += 1
|
||||
except AttributeError as ae:
|
||||
print(
|
||||
f"Warning: Missing expected attribute processing response for original index {original_df_index}: {ae}. Response: {response}"
|
||||
)
|
||||
failed_in_chunk += 1
|
||||
except Exception as e:
|
||||
print(
|
||||
f"Warning: An unexpected error occurred processing response for original index {original_df_index}: {type(e).__name__} - {e}. Response: {response}"
|
||||
)
|
||||
failed_in_chunk += 1
|
||||
else:
|
||||
print(
|
||||
f"Warning: Mismatch between number of responses ({len(responses) if responses else 0}) "
|
||||
f"and messages sent ({len(message_chunk)}) for chunk {i + 1}, or no responses. Marking all as failed."
|
||||
)
|
||||
failed_in_chunk = len(
|
||||
message_chunk
|
||||
) # All items in this chunk are considered failed if response array is problematic
|
||||
|
||||
print(
|
||||
f"Chunk {i + 1} processing summary: Success={successful_in_chunk}, Failed/Skipped={failed_in_chunk}"
|
||||
)
|
||||
processed_count_total += successful_in_chunk
|
||||
|
||||
# --- Update Main DataFrame and Save Periodically ---
|
||||
if chunk_updates:
|
||||
print(
|
||||
f"Updating main DataFrame with {len(chunk_updates)} new estimates for chunk {i + 1}..."
|
||||
)
|
||||
for idx, estimates in chunk_updates.items():
|
||||
if idx in df.index:
|
||||
df.loc[idx, "lb_estimate_qty"] = estimates["lb_estimate_qty"]
|
||||
df.loc[idx, "lb_estimate_unit"] = estimates["lb_estimate_unit"]
|
||||
df.loc[idx, "ub_estimate_qty"] = estimates["ub_estimate_qty"]
|
||||
df.loc[idx, "ub_estimate_unit"] = estimates["ub_estimate_unit"]
|
||||
|
||||
print(f"Saving progress to {FILENAME}...")
|
||||
save_dataframe(df, FILENAME)
|
||||
else:
|
||||
print(f"No successful estimates obtained in chunk {i + 1} to save.")
|
||||
|
||||
# --- Rate Limiting Pause ---
|
||||
chunk_end_time = time.time()
|
||||
chunk_duration = chunk_end_time - chunk_start_time
|
||||
print(f"Chunk {i + 1} took {chunk_duration:.2f} seconds.")
|
||||
|
||||
if i < num_chunks - 1: # No pause after the last chunk
|
||||
# Calculate ideal time per request based on rate limit
|
||||
time_per_request = SECONDS_PER_MINUTE / RATE_LIMIT if RATE_LIMIT > 0 else 0
|
||||
# Calculate minimum duration this chunk should have taken to respect rate limit
|
||||
min_chunk_duration_for_rate = len(message_chunk) * time_per_request
|
||||
# Calculate pause needed
|
||||
pause_needed = max(0, min_chunk_duration_for_rate - chunk_duration)
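# Worked example: with RATE_LIMIT = 5000, time_per_request = 60 / 5000 = 0.012 s,
# so a full 300-message chunk must span at least 300 * 0.012 = 3.6 s; if the API
# calls finished faster than that, the difference is slept off here.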
|
||||
|
||||
if pause_needed > 0:
|
||||
print(
|
||||
f"Pausing for {pause_needed:.2f} seconds to respect rate limit ({RATE_LIMIT}/min)..."
|
||||
)
|
||||
time.sleep(pause_needed)
|
||||
|
||||
overall_end_time = time.time()
|
||||
total_duration_minutes = (overall_end_time - overall_start_time) / 60
|
||||
print(
|
||||
f"\nBatch completion finished."
|
||||
f" Processed {processed_count_total} new estimates in this run in {total_duration_minutes:.2f} minutes."
|
||||
)
|
||||
|
||||
print(f"Performing final save to {FILENAME}...")
|
||||
save_dataframe(df, FILENAME)
|
||||
|
||||
print("\nScript finished.")
521  old/analysis.py  Normal file
@@ -0,0 +1,521 @@
import os
|
||||
import litellm
|
||||
import sqlite3
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from google.colab import userdata, files
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib as mpl
|
||||
|
||||
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')
|
||||
os.environ['GEMINI_API_KEY'] = userdata.get('GEMINI_API_KEY')
|
||||
|
||||
occupation_major_codes = {
|
||||
'11': 'Management',
|
||||
'13': 'Business and Financial Operations',
|
||||
'15': 'Computer and Mathematical Occupations',
|
||||
'17': 'Architecture and Engineering',
|
||||
'19': 'Life, Physical, and Social Science',
|
||||
'21': 'Community and Social Services',
|
||||
'23': 'Legal',
|
||||
'25': 'Education, Training, and Library',
|
||||
'27': 'Arts, Design, Entertainment, Sports, and Media',
|
||||
'29': 'Healthcare Practitioners and Technical',
|
||||
'31': 'Healthcare Support',
|
||||
'33': 'Protective Service',
|
||||
'35': 'Food Preparation and Serving Related',
|
||||
'37': 'Building and Grounds Cleaning and Maintenance',
|
||||
'39': 'Personal Care and Service',
|
||||
'41': 'Sales and Related',
|
||||
'43': 'Office and Administrative Support',
|
||||
'45': 'Farming, Fishing, and Forestry',
|
||||
'47': 'Construction and Extraction',
|
||||
'49': 'Installation, Maintenance, and Repair',
|
||||
'51': 'Production',
|
||||
'53': 'Transportation and Material Moving',
|
||||
'55': 'Military Specific'
|
||||
}
|
||||
|
||||
gray = {'50':'#f8fafc','100':'#f1f5f9','200':'#e2e8f0',
|
||||
'300':'#cbd5e1','400':'#94a3b8','500':'#64748b',
|
||||
'600':'#475569','700':'#334155','800':'#1e293b',
|
||||
'900':'#0f172a','950':'#020617'}
|
||||
lime = {'50': '#f7fee7','100': '#ecfcca','200': '#d8f999',
|
||||
'300': '#bbf451','400': '#9ae600','500': '#83cd00',
|
||||
'600': '#64a400','700': '#497d00','800': '#3c6300',
|
||||
'900': '#35530e','950': '#192e03'}
|
||||
|
||||
mpl.rcParams.update({
|
||||
'figure.facecolor' : gray['50'],
|
||||
'axes.facecolor' : gray['50'],
|
||||
'axes.edgecolor' : gray['100'],
|
||||
'axes.labelcolor' : gray['700'],
|
||||
'xtick.color' : gray['700'],
|
||||
'ytick.color' : gray['700'],
|
||||
'font.family' : 'Inter', # falls back to DejaVu if Inter not present
|
||||
'font.size' : 11,
|
||||
})
|
||||
|
||||
sns.set_style("white") # keep minimal axes, we will remove default grid
|
||||
sns.set_context("notebook")
|
||||
|
||||
def prepare_tasks():
|
||||
|
||||
# Run uv run ./enrich_task_ratings.py
|
||||
df_tasks = pd.read_json("task_ratings_enriched.json")
|
||||
|
||||
# Run uv run classify_estimateability_of_tasks.py
|
||||
df_task_estimateable = pd.read_csv("tasks_estimateable.csv").rename(columns={"task_estimateable": "estimateable"}).drop_duplicates(subset=['task'], keep='first')
|
||||
|
||||
# Merge in remote-status labels so df_tasks gains a remote_status column ("remote" / "not remote").
# NOTE: df_remote_status is not defined in this file; it is assumed to be loaded elsewhere
# (e.g. from a CSV of per-task remote classifications with 'Task' and 'Remote' columns).
df_tasks = pd.merge(df_tasks, df_remote_status[['Task', 'Remote']], left_on='task', right_on='Task', how='left')
df_tasks = df_tasks.drop('Task', axis=1).rename(columns={'Remote': 'remote_status'})
|
||||
|
||||
# df_tasks now has a estimateable column which contains either "ATOMIC" or "ONGOING-CONSTRAINT"
|
||||
df_tasks = pd.merge(df_tasks, df_task_estimateable[['task', 'estimateable']], on='task', how='left')
|
||||
|
||||
df_tasks = df_tasks[df_tasks['importance_average'] < 3].copy()
|
||||
|
||||
df_tasks['onetsoc_major'] = df_tasks['onetsoc_code'].str[:2]
|
||||
|
||||
df_remote_tasks = df_tasks[df_tasks['remote_status'] == 'remote'].copy()
|
||||
|
||||
# Call create_task_estimates() from add_task_estimates? which creates tasks_with_estimates.csv
|
||||
|
||||
def preprocessing_time_estimates():
|
||||
df = pd.read_csv("tasks_with_estimates.csv")
|
||||
|
||||
df = df[df['importance_average'] > 3].copy()
|
||||
|
||||
# The embeddings comes from running `uv run ./embed_task_description.py`
|
||||
# Columns: ['embedding_id', 'task', 'embedding_vector']
|
||||
# These contain embedding for UNIQUE tasks
|
||||
df_task_embeddings = pd.read_parquet("tasks_with_embeddings.parquet").drop_duplicates(subset=['task'])[['task', 'task_embedding']].rename(columns={"task_embedding": "embedding_vector"}).copy()
|
||||
|
||||
df = pd.merge(df, df_task_embeddings[['task', 'embedding_vector']], on='task', how='left')
# NOTE: df_task_estimateable is the classification table loaded in prepare_tasks() above
# (from tasks_estimateable.csv); it must be in scope for this merge to work.
df = pd.merge(df, df_task_estimateable[['task', 'estimateable']], on='task', how='left')
|
||||
|
||||
df['onetsoc_major'] = df['onetsoc_code'].str[:2]
|
||||
|
||||
def convert_to_minutes(qty, unit):
|
||||
"""Converts a quantity in a given unit to minutes."""
|
||||
return qty * {
|
||||
"minute": 1,
|
||||
"hour": 60,
|
||||
"day": 60 * 24,
|
||||
"week": 60 * 24 * 7,
|
||||
"month": 60 * 24 * 30,
|
||||
"trimester": 60 * 24 * 90,
|
||||
"semester": 60 * 24 * 180,
|
||||
"year": 60 * 24 * 365,
|
||||
}[unit]
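# e.g. convert_to_minutes(2, "hour") -> 120 and convert_to_minutes(1, "week") -> 10080,
# using the 30-day month / 90-day trimester / 180-day semester approximations above.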
|
||||
|
||||
df['lb_estimate_in_minutes'] = df.apply(
|
||||
lambda row: convert_to_minutes(row['lb_estimate_qty'], row['lb_estimate_unit']), axis=1
|
||||
)
|
||||
df['ub_estimate_in_minutes'] = df.apply(
|
||||
lambda row: convert_to_minutes(row['ub_estimate_qty'], row['ub_estimate_unit']), axis=1
|
||||
)
|
||||
|
||||
df['estimate_range'] = df.ub_estimate_in_minutes - df.lb_estimate_in_minutes
|
||||
df['estimate_ratio'] = df.ub_estimate_in_minutes / df.lb_estimate_in_minutes
|
||||
df['estimate_midpoint'] = (df.lb_estimate_in_minutes + df.ub_estimate_in_minutes)/2
|
||||
|
||||
# .copy() so later column assignments (e.g. the lb_q quartiles in cell5) do not
# trigger SettingWithCopyWarning on a view of df
atomic_tasks = df[df['estimateable'] == 'ATOMIC'].copy()
ongoing_tasks = df[df['estimateable'] == 'ONGOING-CONSTRAINT'].copy()
|
||||
|
||||
with pd.option_context('display.max_columns', None):
|
||||
display(df)
|
||||
|
||||
# Check for empty estimates
|
||||
if atomic_tasks['lb_estimate_in_minutes'].isnull().sum() > 0:
|
||||
print("Missing values in 'lb_estimate_in_minutes':", atomic_tasks['lb_estimate_in_minutes'].isnull().sum())
|
||||
|
||||
if atomic_tasks['ub_estimate_in_minutes'].isnull().sum() > 0:
|
||||
print("Missing values in 'ub_estimate_in_minutes':", atomic_tasks['ub_estimate_in_minutes'].isnull().sum())
|
||||
|
||||
# Check for impossible bounds
|
||||
impossible_bounds = atomic_tasks[
|
||||
(atomic_tasks['lb_estimate_in_minutes'] <= 0) |
|
||||
(atomic_tasks['ub_estimate_in_minutes'] <= 0) |
|
||||
(atomic_tasks['lb_estimate_in_minutes'] > atomic_tasks['ub_estimate_in_minutes'])
|
||||
]
|
||||
if not impossible_bounds.empty:
|
||||
print(f"Error: Found rows with impossible bounds.")
|
||||
with pd.option_context('display.max_colwidth', None):
|
||||
display(impossible_bounds[['task', 'lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'dwas']])
|
||||
|
||||
#with pd.option_context('display.max_colwidth', None):
|
||||
#display(atomic_tasks.nlargest(20, 'ub_estimate_in_minutes')[['task', 'lb_estimate_qty', 'lb_estimate_unit', 'lb_estimate_in_minutes', 'ub_estimate_qty', 'ub_estimate_unit', 'ub_estimate_in_minutes', 'estimate_ratio']])
|
||||
|
||||
def cell1():
|
||||
sns.histplot(atomic_tasks.estimate_midpoint, log_scale=True)
|
||||
|
||||
def cell2():
|
||||
plt.figure(figsize=(14,10))
|
||||
sns.boxplot(
|
||||
data=atomic_tasks,
|
||||
x='onetsoc_major', # 11 = Management, 15 = Computer/Math, …
|
||||
y='estimate_range',
|
||||
showfliers=False
|
||||
)
|
||||
plt.yscale('log') # long tail => log scale
|
||||
plt.xlabel('Occupation')
|
||||
plt.ylabel('Range (upper-lower, minutes)')
|
||||
plt.title('Spread of time-range estimates per occupation')
|
||||
|
||||
ax = plt.gca()
|
||||
ax.set_xticklabels([occupation_major_codes[code.get_text()] for code in ax.get_xticklabels()], rotation=60, ha='right')
|
||||
|
||||
def cell3():
|
||||
plt.figure(figsize=(10, 10))
|
||||
ax = sns.scatterplot(
|
||||
data=atomic_tasks.replace({'onetsoc_major': occupation_major_codes}), # Replace codes with labels
|
||||
x='lb_estimate_in_minutes', y='ub_estimate_in_minutes',
|
||||
alpha=0.2, edgecolor=None, hue="onetsoc_major" # Use the labeled column for hue
|
||||
)
|
||||
|
||||
# 45° reference
|
||||
lims = (1, atomic_tasks[['lb_estimate_in_minutes','ub_estimate_in_minutes']].max().max())
|
||||
ax.plot(lims, lims, color='black', linestyle='--', linewidth=1)
|
||||
|
||||
# optional helper lines: 2× and 10×, 100× ratios
|
||||
for k in [2,10, 100]:
|
||||
ax.plot(lims, [k*l for l in lims],
|
||||
linestyle=':', color='grey', linewidth=1)
|
||||
|
||||
ax.set(xscale='log', yscale='log')
|
||||
ax.set_xlabel('Lower-bound (min, log scale)')
|
||||
ax.set_ylabel('Upper-bound (min, log scale)')
|
||||
ax.set_title('Lower vs upper estimates for all tasks')
|
||||
|
||||
# Place the legend outside the plot
|
||||
ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
|
||||
|
||||
def cell4():
|
||||
plt.figure(figsize=(8,4))
|
||||
sns.histplot(np.log10(atomic_tasks['estimate_ratio'].replace([np.inf, -np.inf], np.nan).dropna()),
|
||||
bins=60, kde=True)
|
||||
plt.axvline(np.log10(10), color='red', ls='--', lw=1, label='10×')
|
||||
plt.axvline(np.log10(1.05), color='orange', ls='--', lw=1, label='1.05×')
|
||||
plt.axvline(0, color='black', ls='-', lw=1) # ub = lb
|
||||
plt.xlabel('log₁₀(upper / lower)')
|
||||
plt.ylabel('Count')
|
||||
plt.title('Distribution of upper:lower ratio')
|
||||
plt.legend()
|
||||
plt.tight_layout()
|
||||
|
||||
|
||||
def cell5():
|
||||
# 1. Bin lower bounds into quartiles (Q1–Q4)
|
||||
atomic_tasks['lb_q'] = pd.qcut(atomic_tasks.lb_estimate_in_minutes,
|
||||
q=4, labels=['Q1 shortest','Q2','Q3','Q4 longest'])
|
||||
|
||||
|
||||
# 3. Aggregate: median (or mean) ratio per cell
|
||||
pivot = atomic_tasks.pivot_table(index='onetsoc_major', columns='lb_q',
|
||||
values='estimate_ratio', aggfunc='median')
|
||||
|
||||
# Map the index (onetsoc_major codes) to their corresponding labels
|
||||
pivot.index = pivot.index.map(occupation_major_codes)
|
||||
|
||||
|
||||
# 4. Visualise
|
||||
plt.figure(figsize=(10,8))
|
||||
sns.heatmap(pivot, cmap='RdYlGn_r', center=2, annot=True, fmt='.1f',
|
||||
cbar_kws={'label':'Median upper/lower ratio'})
|
||||
plt.xlabel('Lower-bound quartile')
|
||||
plt.ylabel('Occupation (major group)')
|
||||
plt.title('Typical range width by occupation and task length')
|
||||
plt.tight_layout()
|
||||
|
||||
|
||||
|
||||
def cell6():
|
||||
"""
|
||||
from scipy.stats import median_abs_deviation
|
||||
|
||||
def mad_z(series):
|
||||
med = series.median()
|
||||
mad = median_abs_deviation(series, scale='normal') # ⇒ comparable to σ
|
||||
return (series - med) / mad
|
||||
|
||||
df['robust_z'] = df.groupby('onetsoc_code')['estimate_midpoint'].transform(mad_z)
|
||||
"""
|
||||
|
||||
agg = (atomic_tasks
|
||||
.groupby('onetsoc_code')['estimate_midpoint']
|
||||
.agg(median='median',
|
||||
q1=lambda x: x.quantile(.25),
|
||||
q3=lambda x: x.quantile(.75),
|
||||
mean='mean',
|
||||
std='std')
|
||||
.reset_index())
|
||||
agg['IQR'] = agg.q3 - agg.q1
|
||||
agg['CV'] = agg['std'] / agg['mean'] # coefficient of variation
|
||||
|
||||
# merge back the group mean and std so each row can be scored
|
||||
atomic_tasks = atomic_tasks.merge(agg[['onetsoc_code','mean','std']], on='onetsoc_code')
|
||||
|
||||
|
||||
atomic_tasks['z'] = (atomic_tasks.estimate_midpoint - atomic_tasks['mean']) / atomic_tasks['std']
|
||||
outliers = atomic_tasks.loc[atomic_tasks.z.abs() > 3]
|
||||
outliers
|
||||
|
||||
def cell7():
|
||||
from scipy.stats import median_abs_deviation
|
||||
|
||||
def mad_z(series):
|
||||
med = series.median()
|
||||
mad = median_abs_deviation(series, scale='normal') # ⇒ comparable to σ
|
||||
return (series - med) / mad
|
||||
|
||||
atomic_tasks['robust_z'] = atomic_tasks.groupby('onetsoc_code')['estimate_midpoint'].transform(mad_z)
|
||||
|
||||
def cell10():
|
||||
import matplotlib.ticker as mtick # For percentage formatting
|
||||
import matplotlib.colors as mcolors # For color conversion
|
||||
|
||||
summary_data = []
|
||||
|
||||
for code, label in occupation_major_codes.items():
|
||||
occ_df = df_tasks[df_tasks['onetsoc_major'] == code]
|
||||
total_tasks_in_occ = len(occ_df)
|
||||
|
||||
if total_tasks_in_occ == 0:
|
||||
continue # Skip if no tasks for this occupation
|
||||
|
||||
# Stack 1: % that isn't equal to "remote"
|
||||
not_remote_count = len(occ_df[occ_df['remote_status'] != 'remote'])
|
||||
|
||||
# For the remaining remote tasks:
|
||||
remote_df = occ_df[occ_df['remote_status'] == 'remote']
|
||||
|
||||
# Stack 2: % of remote + ATOMIC
|
||||
remote_atomic_count = len(remote_df[remote_df['estimateable'] == 'ATOMIC'])
|
||||
|
||||
# Stack 3: % of remote + ONGOING-CONSTRAINT
|
||||
remote_ongoing_count = len(remote_df[remote_df['estimateable'] == 'ONGOING-CONSTRAINT'])
|
||||
|
||||
summary_data.append({
|
||||
'onetsoc_major_code': code,
|
||||
'occupation_label': label,
|
||||
'count_not_remote': not_remote_count,
|
||||
'count_remote_atomic': remote_atomic_count,
|
||||
'count_remote_ongoing': remote_ongoing_count,
|
||||
'total_tasks': total_tasks_in_occ
|
||||
})
|
||||
|
||||
summary_df = pd.DataFrame(summary_data)
|
||||
|
||||
# --- 3. Calculate Percentages ---
|
||||
# Ensure total_tasks is not zero to avoid division by zero errors if an occupation had no tasks
|
||||
summary_df = summary_df[summary_df['total_tasks'] > 0].copy() # Use .copy() to avoid SettingWithCopyWarning
|
||||
|
||||
summary_df['pct_not_remote'] = (summary_df['count_not_remote'] / summary_df['total_tasks']) * 100
|
||||
summary_df['pct_remote_atomic'] = (summary_df['count_remote_atomic'] / summary_df['total_tasks']) * 100
|
||||
summary_df['pct_remote_ongoing'] = (summary_df['count_remote_ongoing'] / summary_df['total_tasks']) * 100
|
||||
|
||||
# Select columns for plotting and set index to occupation label
|
||||
plot_df = summary_df.set_index('occupation_label')[
|
||||
['pct_not_remote', 'pct_remote_atomic', 'pct_remote_ongoing']
|
||||
]
|
||||
|
||||
# Rename columns for a clearer legend
|
||||
plot_df.columns = ['Not Remote', 'Remote + Estimable', 'Remote + Not estimable']
|
||||
|
||||
plot_df = plot_df.sort_values(by='Not Remote', ascending=False)
|
||||
|
||||
|
||||
# --- 4. Plotting (Modified) ---
|
||||
|
||||
# Define the custom colors based on your requirements
|
||||
# The order must match the column order in plot_df:
|
||||
# 1. 'Not Remote'
|
||||
# 2. 'Remote & ATOMIC'
|
||||
# 3. 'Remote & ONGOING-CONSTRAINT'
|
||||
bar_colors = [gray["300"], lime["500"], lime["200"]]
|
||||
|
||||
fig, ax = plt.subplots(figsize=(14, 10)) # Adjusted figsize for better readability
|
||||
|
||||
plot_df.plot(kind='barh', stacked=True, ax=ax, color=bar_colors)
|
||||
|
||||
ax.set_xlabel("Percentage of Tasks (%)", fontsize=12)
|
||||
ax.set_ylabel("Occupation Major Group", fontsize=12)
|
||||
ax.set_title("Task Breakdown by Occupation, Remote Status, and Estimateability", fontsize=14, pad=20)
|
||||
|
||||
# Format x-axis as percentages
|
||||
ax.xaxis.set_major_formatter(mtick.PercentFormatter())
|
||||
plt.xlim(0, 100) # Ensure x-axis goes from 0 to 100%
|
||||
|
||||
# Remove right and top spines
|
||||
ax.spines['right'].set_visible(False)
|
||||
ax.spines['top'].set_visible(False)
|
||||
|
||||
# Function to get contrasting text color
|
||||
def get_contrasting_text_color(bg_color_hex_or_rgba):
|
||||
"""
|
||||
Determines if black or white text provides better contrast against a given background color.
|
||||
bg_color_hex_or_rgba: A hex string (e.g., '#RRGGBB') or an RGBA tuple (values in [0, 1]).
|
||||
Returns: 'black' or 'white'.
|
||||
"""
|
||||
# Convert to RGBA if it's a hex string or name
|
||||
if isinstance(bg_color_hex_or_rgba, str):
|
||||
rgba = mcolors.to_rgba(bg_color_hex_or_rgba)
|
||||
else:
|
||||
rgba = bg_color_hex_or_rgba
|
||||
|
||||
r, g, b, _ = rgba # Ignore alpha for luminance calculation
|
||||
# Calculate luminance (standard formula for sRGB)
|
||||
# Values r, g, b should be in [0, 1] for this formula
|
||||
luminance = 0.2126 * r + 0.7152 * g + 0.0722 * b
|
||||
# Threshold for deciding text color
|
||||
return 'black' if luminance > 0.55 else 'white' # Adjusted threshold slightly for better visual
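# e.g. gray['300'] (#cbd5e1) has luminance ~0.83 and gets black text, while
# lime['600'] (#64a400) comes out ~0.54 and gets white text.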
|
||||
|
||||
# Add percentages inside each bar segment
|
||||
# Iterate through each "category" of bars (Not Remote, Remote & ATOMIC, etc.)
|
||||
for i, container in enumerate(ax.containers):
|
||||
# Get the color for this container/category
|
||||
segment_color = bar_colors[i]
|
||||
text_color = get_contrasting_text_color(segment_color)
|
||||
|
||||
for patch in container.patches: # Iterate through each bar segment in the category
|
||||
width = patch.get_width()
|
||||
if width > 3: # Only add text if segment is wide enough (e.g., >3%)
|
||||
x = patch.get_x() + width / 2
|
||||
y = patch.get_y() + patch.get_height() / 2
|
||||
ax.text(x, y,
|
||||
f"{width:.1f}%",
|
||||
ha='center',
|
||||
va='center',
|
||||
fontsize=8, # Adjust font size as needed
|
||||
color=text_color,
|
||||
fontweight='medium') # Bolder text can help
|
||||
|
||||
|
||||
plt.legend(title="Task Category", bbox_to_anchor=(1.02, 1), loc='upper left', frameon=False)
|
||||
|
||||
def cell11():
|
||||
df_oesm['onetsoc_major'] = df_oesm['OCC_CODE'].str[:2]
|
||||
|
||||
# Calculate wage bill per occupation
|
||||
# Wage bill = Total Employment * Annual Mean Wage
|
||||
# Ensure columns are numeric, converting non-numeric values to NaN first
|
||||
df_oesm['TOT_EMP'] = pd.to_numeric(df_oesm['TOT_EMP'], errors='coerce')
|
||||
df_oesm['A_MEAN'] = pd.to_numeric(df_oesm['A_MEAN'], errors='coerce')
|
||||
|
||||
# Drop rows with NaN in necessary columns after coercion
|
||||
df_oesm.dropna(subset=['TOT_EMP', 'A_MEAN', 'onetsoc_major'], inplace=True)
|
||||
|
||||
df_oesm['wage_bill'] = df_oesm['TOT_EMP'] * df_oesm['A_MEAN']
|
||||
|
||||
# Aggregate wage bill by onetsoc_major
|
||||
df_wage_bill_major = df_oesm.groupby('onetsoc_major')['wage_bill'].sum().reset_index()
|
||||
|
||||
# Map major codes to titles for better plotting
|
||||
df_wage_bill_major['OCC_TITLE_MAJOR'] = df_wage_bill_major['onetsoc_major'].map(occupation_major_codes)
|
||||
|
||||
# Sort by wage bill for better visualization
|
||||
df_wage_bill_major = df_wage_bill_major.sort_values('wage_bill', ascending=False)
|
||||
|
||||
# Plotting
|
||||
plt.figure(figsize=(12, 8))
|
||||
sns.barplot(x='wage_bill', y='OCC_TITLE_MAJOR', data=df_wage_bill_major, palette="viridis")
|
||||
plt.title('Total Wage Bill per Major Occupation Group')
|
||||
plt.xlabel('Total Wage Bill (in billions)')
|
||||
plt.ylabel('Major Occupation Group')
|
||||
plt.grid(axis='x', linestyle='--', alpha=0.7)
|
||||
|
||||
def cell12():  # renamed: a second cell11 here would shadow the wage-bill plot above
|
||||
# ───────────────────────────────────────────────────────────────
|
||||
# 1. CUMULATIVE-DISTRIBUTION-FUNCTION (CDF) PREP
|
||||
# ───────────────────────────────────────────────────────────────
|
||||
def cdf(series):
|
||||
s = series.sort_values().reset_index(drop=True)
|
||||
return s.values, ((s.index + 1) / len(s)) * 100
|
||||
|
||||
x_lb , y_lb = cdf(atomic_tasks['lb_estimate_in_minutes'])
|
||||
x_ub , y_ub = cdf(atomic_tasks['ub_estimate_in_minutes'])
|
||||
x_mid, y_mid = cdf((atomic_tasks['ub_estimate_in_minutes'] + atomic_tasks['lb_estimate_in_minutes']) / 2)
|
||||
|
||||
# ───────────────────────────────────────────────────────────────
|
||||
# 2. PLOTTING
|
||||
# ───────────────────────────────────────────────────────────────
|
||||
fig, ax = plt.subplots(figsize=(10, 6))
|
||||
|
||||
# horizontal reference lines every 10 %
|
||||
for y_val in range(0, 101, 10):
|
||||
ax.axhline(y_val, color=gray['100'], linewidth=.8, zorder=1)
|
||||
|
||||
# Plot Lower Bound CDF
|
||||
ax.step(x_lb, y_lb,
|
||||
where='post',
|
||||
color=lime['300'], # Example: light blue for lower bound
|
||||
linewidth=1.8,
|
||||
linestyle='--',
|
||||
zorder=2,
|
||||
label='Lower bound estimate (CDF)')
|
||||
|
||||
# Plot Upper Bound CDF
|
||||
ax.step(x_ub, y_ub,
|
||||
where='post',
|
||||
color=lime['900'], # Example: light orange/red for upper bound
|
||||
linewidth=1.8,
|
||||
linestyle=':',
|
||||
zorder=3,
|
||||
label='Upper bound estimate (CDF)')
|
||||
|
||||
# Plot Midpoint CDF (plotted last to be on top, or adjust zorder)
|
||||
ax.step(x_mid, y_mid,
|
||||
where='post',
|
||||
color=lime['600'],
|
||||
linewidth=2.2,
|
||||
zorder=4, # Ensure it's on top of other lines if they overlap significantly
|
||||
label='Mid-point estimate (CDF)')
|
||||
|
||||
|
||||
# axes limits / scales
|
||||
ax.set_ylim(0, 100)
|
||||
ax.set_xscale('log')
|
||||
|
||||
# y-axis ➝ percent labels
|
||||
ax.yaxis.set_major_formatter(mpl.ticker.PercentFormatter(decimals=0))
|
||||
|
||||
|
||||
# move y-label to top-left (just inside plotting area)
|
||||
ax.text(-0.06, 1.03,
|
||||
"% of tasks with temporal coherence ≤ X",
|
||||
ha='left', va='bottom',
|
||||
transform=ax.transAxes,
|
||||
fontsize=12, fontweight='semibold')
|
||||
|
||||
# custom x-ticks at human-friendly durations
|
||||
ticks = [1, 5, 10, 30, 60, 120, 240, 480,
|
||||
1440, 2880, 10080, 43200, 129600,
|
||||
259200, 525600]
|
||||
ticklabels = ['1 min', '5 min', '10 min', '30 min', '1 hour', '2 hours', '4 hours', '8 hours',
|
||||
'1 day', '2 days', '1 week', '30 days',
|
||||
'90 days', '180 days', '1 year']
|
||||
|
||||
# Vertical reference lines for x-ticks
|
||||
for tick in ticks:
|
||||
ax.axvline(tick, color=gray['300'], linewidth=.8, linestyle='--', zorder=1)
|
||||
|
||||
ax.set_xticks(ticks)
|
||||
ax.set_xticklabels(ticklabels, rotation=45, ha='right')
|
||||
|
||||
ax.spines['top'].set_visible(False)
|
||||
ax.spines['right'].set_visible(False)
|
||||
ax.spines['left'].set_edgecolor(gray['300'])
|
||||
ax.spines['bottom'].set_edgecolor(gray['300'])
|
||||
|
||||
|
||||
# legend
|
||||
ax.legend(frameon=False, loc='lower right') # Keep 'lower right' or adjust as needed
|
||||
|
||||
ax.text(0.5, -0.3,
|
||||
'Temporal coherence (X)',
|
||||
ha='center', va='center',
|
||||
transform=ax.transAxes,
|
||||
fontsize=12, fontweight='semibold')
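
# Illustrative note (not part of the original code): the cdf() helper above returns the
# sorted values plus their cumulative percentages, e.g. for pd.Series([10, 20, 30, 40])
# it yields percentages 25, 50, 75, 100, which is what ax.step(..., where='post') draws.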
411  old/classify_estimateability_of_tasks.py  Normal file
@@ -0,0 +1,411 @@
import pandas as pd
|
||||
import litellm
|
||||
import dotenv
|
||||
import os
|
||||
import time
|
||||
import json
|
||||
import math
|
||||
|
||||
# Load environment variables
|
||||
dotenv.load_dotenv(override=True)
|
||||
|
||||
# litellm._turn_on_debug() # Optional debugging
|
||||
|
||||
# --- Configuration ---
|
||||
MODEL = "gpt-4.1-mini" # Make sure this model supports json_schema or structured output
|
||||
RATE_LIMIT = 5000 # Requests per minute
|
||||
CHUNK_SIZE = 300 # Number of unique tasks per API call
|
||||
SECONDS_PER_MINUTE = 60
|
||||
|
||||
# File configuration
|
||||
CLASSIFICATION_FILENAME = "tasks_estimateable.csv" # Output file with classifications
|
||||
TASK_SOURCE_FOR_INIT_FILENAME = "tasks_with_estimates.csv"
|
||||
OUTPUT_COLUMN_NAME = "task_estimateable"
|
||||
SOURCE_FILTER_COLUMN = "remote_status"
|
||||
SOURCE_FILTER_VALUE = "remote"
|
||||
|
||||
# --- Prompts and Schema ---
|
||||
SYSTEM_PROMPT_CLASSIFY = """
|
||||
Classify the provided O*NET task into one of these categories:
|
||||
- ATOMIC (schedulable): A single, clearly-bounded activity, typically lasting minutes, hours, or a few days.
|
||||
- ONGOING-CONSTRAINT (background role/ethical rule): A continuous responsibility or behavioural norm with no schedulable duration (e.g., “follow confidentiality rules,” “serve as department head”).
|
||||
""".strip()
|
||||
|
||||
USER_MESSAGE_TEMPLATE_CLASSIFY = "Task: {task}"
|
||||
|
||||
CLASSIFICATION_CATEGORIES = ["ATOMIC", "ONGOING-CONSTRAINT"]
|
||||
|
||||
SCHEMA_FOR_CLASSIFICATION = {
|
||||
"name": "classify_task_type",
|
||||
"strict": True,
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"task_category": {
|
||||
"type": "string",
|
||||
"enum": CLASSIFICATION_CATEGORIES,
|
||||
"description": "The classification of the task (ATOMIC or ONGOING-CONSTRAINT).",
|
||||
}
|
||||
},
|
||||
"required": ["task_category"],
|
||||
"additionalProperties": False,
|
||||
},
|
||||
}
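
# Illustrative example (not used by the script): a conforming classification
# response parses to {"task_category": "ATOMIC"} or
# {"task_category": "ONGOING-CONSTRAINT"}, which is what the response-processing
# loop below checks against CLASSIFICATION_CATEGORIES.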
|
||||
|
||||
|
||||
def save_dataframe(df_to_save, filename):
|
||||
"""Saves the DataFrame to the specified CSV file using atomic write."""
|
||||
try:
|
||||
temp_filename = filename + ".tmp"
|
||||
df_to_save.to_csv(temp_filename, encoding="utf-8-sig", index=False)
|
||||
os.replace(temp_filename, filename)
|
||||
except Exception as e:
|
||||
print(f"--- Error saving DataFrame to {filename}: {e} ---")
|
||||
if os.path.exists(temp_filename):
|
||||
try:
|
||||
os.remove(temp_filename)
|
||||
except Exception as remove_err:
|
||||
print(
|
||||
f"--- Error removing temporary save file {temp_filename}: {remove_err} ---"
|
||||
)
|
||||
|
||||
|
||||
# --- Load or Initialize DataFrame ---
|
||||
try:
|
||||
if os.path.exists(CLASSIFICATION_FILENAME):
|
||||
df = pd.read_csv(CLASSIFICATION_FILENAME, encoding="utf-8-sig")
|
||||
print(f"Successfully read {len(df)} rows from {CLASSIFICATION_FILENAME}.")
|
||||
|
||||
save_needed_after_load = False
|
||||
if OUTPUT_COLUMN_NAME not in df.columns:
|
||||
df[OUTPUT_COLUMN_NAME] = pd.NA
|
||||
print(f"Added '{OUTPUT_COLUMN_NAME}' column.")
|
||||
save_needed_after_load = True
|
||||
|
||||
df[OUTPUT_COLUMN_NAME] = df[OUTPUT_COLUMN_NAME].replace(["", None], pd.NA)
|
||||
|
||||
if df[OUTPUT_COLUMN_NAME].dtype != object and not isinstance(
|
||||
df[OUTPUT_COLUMN_NAME].dtype, pd.StringDtype
|
||||
):
|
||||
try:
|
||||
df[OUTPUT_COLUMN_NAME] = df[OUTPUT_COLUMN_NAME].astype(object)
|
||||
print(
|
||||
f"Corrected dtype of '{OUTPUT_COLUMN_NAME}' to {df[OUTPUT_COLUMN_NAME].dtype}."
|
||||
)
|
||||
save_needed_after_load = True
|
||||
except Exception as e:
|
||||
print(
|
||||
f"Warning: Could not convert column '{OUTPUT_COLUMN_NAME}' to object: {e}."
|
||||
)
|
||||
|
||||
if "task" not in df.columns:
|
||||
print(
|
||||
f"Error: {CLASSIFICATION_FILENAME} must contain a 'task' column for processing."
|
||||
)
|
||||
exit()
|
||||
|
||||
if save_needed_after_load:
|
||||
print(f"Saving {CLASSIFICATION_FILENAME} after adding/adjusting column.")
|
||||
save_dataframe(df, CLASSIFICATION_FILENAME)
|
||||
else:
|
||||
print(
|
||||
f"{CLASSIFICATION_FILENAME} not found. Attempting to create it from {TASK_SOURCE_FOR_INIT_FILENAME}."
|
||||
)
|
||||
if not os.path.exists(TASK_SOURCE_FOR_INIT_FILENAME):
|
||||
print(
|
||||
f"Error: Source file {TASK_SOURCE_FOR_INIT_FILENAME} not found. Cannot create {CLASSIFICATION_FILENAME}."
|
||||
)
|
||||
exit()
|
||||
|
||||
df_source = pd.read_csv(TASK_SOURCE_FOR_INIT_FILENAME, encoding="utf-8-sig")
|
||||
|
||||
required_source_cols_for_init = ["task", SOURCE_FILTER_COLUMN]
|
||||
missing_source_cols = [
|
||||
col for col in required_source_cols_for_init if col not in df_source.columns
|
||||
]
|
||||
if missing_source_cols:
|
||||
print(
|
||||
f"Error: Source file {TASK_SOURCE_FOR_INIT_FILENAME} is missing required columns for initialization: {', '.join(missing_source_cols)}."
|
||||
)
|
||||
exit()
|
||||
|
||||
df_source_filtered = df_source[
|
||||
df_source[SOURCE_FILTER_COLUMN] == SOURCE_FILTER_VALUE
|
||||
].copy()
|
||||
|
||||
if df_source_filtered.empty:
|
||||
print(
|
||||
f"Warning: No tasks with '{SOURCE_FILTER_COLUMN}' == '{SOURCE_FILTER_VALUE}' found in {TASK_SOURCE_FOR_INIT_FILENAME}. "
|
||||
f"{CLASSIFICATION_FILENAME} will be created with schema but no tasks to classify initially."
|
||||
)
|
||||
|
||||
df = df_source_filtered[["task"]].copy()
|
||||
df[OUTPUT_COLUMN_NAME] = pd.NA
|
||||
df[OUTPUT_COLUMN_NAME] = df[OUTPUT_COLUMN_NAME].astype(object)
|
||||
|
||||
print(
|
||||
f"Created {CLASSIFICATION_FILENAME} using tasks from {TASK_SOURCE_FOR_INIT_FILENAME} "
|
||||
f"(where {SOURCE_FILTER_COLUMN}='{SOURCE_FILTER_VALUE}'). New file has {len(df)} tasks."
|
||||
)
|
||||
save_dataframe(df, CLASSIFICATION_FILENAME)
|
||||
|
||||
except FileNotFoundError:
|
||||
print(f"Error: A required file was not found. Please check paths.")
|
||||
exit()
|
||||
except Exception as e:
|
||||
print(f"Error during DataFrame loading or initialization: {e}")
|
||||
exit()
|
||||
|
||||
|
||||
# --- Identify Unique Tasks to Process ---
|
||||
if df.empty:
|
||||
print(f"{CLASSIFICATION_FILENAME} is empty. Nothing to process. Exiting.")
|
||||
exit()
|
||||
|
||||
initial_unprocessed_mask = df[OUTPUT_COLUMN_NAME].isna()
|
||||
|
||||
if not initial_unprocessed_mask.any():
|
||||
print(
|
||||
f"All tasks in {CLASSIFICATION_FILENAME} seem to have been classified already. Exiting."
|
||||
)
|
||||
exit()
|
||||
|
||||
# Filter for rows that are unprocessed AND have a valid 'task' string
|
||||
valid_tasks_to_consider_df = df[
|
||||
initial_unprocessed_mask & df["task"].notna() & (df["task"].str.strip() != "")
|
||||
]
|
||||
|
||||
if valid_tasks_to_consider_df.empty:
|
||||
print(
|
||||
f"No valid, unclassified tasks found to process (after filtering out empty/NaN task descriptions). Exiting."
|
||||
)
|
||||
exit()
|
||||
|
||||
unique_task_labels_for_api = (
|
||||
valid_tasks_to_consider_df["task"].drop_duplicates().tolist()
|
||||
)
|
||||
total_rows_to_update_potentially = len(
|
||||
df[initial_unprocessed_mask]
|
||||
) # Count all rows that are NA
|
||||
|
||||
print(
|
||||
f"Found {total_rows_to_update_potentially} total rows in {CLASSIFICATION_FILENAME} needing classification."
|
||||
)
|
||||
print(
|
||||
f"Identified {len(unique_task_labels_for_api)} unique, valid task labels to send to the API."
|
||||
)
|
||||
|
||||
|
||||
# --- Prepare messages for batch completion (only for unique task labels) ---
|
||||
messages_list = []
|
||||
print(f"Preparing messages for {len(unique_task_labels_for_api)} unique task labels...")
|
||||
|
||||
for task_label in unique_task_labels_for_api:
|
||||
# task_label is already guaranteed to be non-empty and not NaN from the filtering above
|
||||
user_message = USER_MESSAGE_TEMPLATE_CLASSIFY.format(task=task_label)
|
||||
messages_for_task = [
|
||||
{"role": "system", "content": SYSTEM_PROMPT_CLASSIFY},
|
||||
{"role": "user", "content": user_message},
|
||||
]
|
||||
messages_list.append(messages_for_task)
|
||||
|
||||
print(f"Prepared {len(messages_list)} message sets for batch completion.")
|
||||
if (
|
||||
not messages_list
|
||||
): # Should only happen if unique_task_labels_for_api was empty, caught above
|
||||
print(
|
||||
"No messages prepared, though unique tasks were identified. This is unexpected. Exiting."
|
||||
)
|
||||
exit()
|
||||
|
||||
|
||||
# --- Call batch_completion in chunks with rate limiting and periodic saving ---
|
||||
total_unique_tasks_to_send = len(
|
||||
messages_list
|
||||
) # Same as len(unique_task_labels_for_api)
|
||||
num_chunks = math.ceil(total_unique_tasks_to_send / CHUNK_SIZE)
|
||||
|
||||
print(
|
||||
f"\nStarting batch classification for {total_unique_tasks_to_send} unique task labels in {num_chunks} chunks..."
|
||||
)
|
||||
|
||||
overall_start_time = time.time()
|
||||
processed_rows_count_total = 0 # Counts actual rows updated in the DataFrame
|
||||
|
||||
for i in range(num_chunks):
|
||||
chunk_start_message_index = i * CHUNK_SIZE
|
||||
chunk_end_message_index = min((i + 1) * CHUNK_SIZE, total_unique_tasks_to_send)
|
||||
|
||||
message_chunk = messages_list[chunk_start_message_index:chunk_end_message_index]
|
||||
# Get corresponding unique task labels for this chunk
|
||||
chunk_task_labels = unique_task_labels_for_api[
|
||||
chunk_start_message_index:chunk_end_message_index
|
||||
]
|
||||
|
||||
if not message_chunk: # Should not happen if loop range is correct
|
||||
continue
|
||||
|
||||
print(
|
||||
f"\nProcessing chunk {i + 1}/{num_chunks} (Unique Task Labels {chunk_start_message_index + 1}-{chunk_end_message_index} of this run)..."
|
||||
)
|
||||
chunk_start_time = time.time()
|
||||
responses = []
|
||||
try:
|
||||
print(
|
||||
f"Sending {len(message_chunk)} requests (for unique tasks) for chunk {i + 1}..."
|
||||
)
|
||||
responses = litellm.batch_completion(
|
||||
model=MODEL,
|
||||
messages=message_chunk,
|
||||
response_format={
|
||||
"type": "json_schema",
|
||||
"json_schema": SCHEMA_FOR_CLASSIFICATION,
|
||||
},
|
||||
num_retries=3,
|
||||
)
|
||||
print(f"Chunk {i + 1} API call completed.")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error during litellm.batch_completion for chunk {i + 1}: {e}")
|
||||
responses = [None] * len(message_chunk)
|
||||
|
||||
# --- Process responses for the current chunk ---
|
||||
# chunk_updates stores {task_label: classification_category}
|
||||
chunk_task_classifications = {}
|
||||
successful_api_calls_in_chunk = 0
|
||||
failed_api_calls_in_chunk = 0
|
||||
|
||||
if responses and len(responses) == len(message_chunk):
|
||||
for j, response in enumerate(responses):
|
||||
current_task_label = chunk_task_labels[
|
||||
j
|
||||
] # The unique task label for this response
|
||||
content_str = None
|
||||
|
||||
if response is None:
|
||||
print(
|
||||
f"API call failed for task label '{current_task_label}' (response is None)."
|
||||
)
|
||||
failed_api_calls_in_chunk += 1
|
||||
continue
|
||||
|
||||
try:
|
||||
if (
|
||||
response.choices
|
||||
and response.choices[0].message
|
||||
and response.choices[0].message.content
|
||||
):
|
||||
content_str = response.choices[0].message.content
|
||||
classification_data = json.loads(content_str)
|
||||
category_raw = classification_data.get("task_category")
|
||||
|
||||
if category_raw in CLASSIFICATION_CATEGORIES:
|
||||
successful_api_calls_in_chunk += 1
|
||||
chunk_task_classifications[current_task_label] = category_raw
|
||||
else:
|
||||
print(
|
||||
f"Warning: Invalid or missing task_category for task label '{current_task_label}': '{category_raw}'. Content: '{content_str}'"
|
||||
)
|
||||
failed_api_calls_in_chunk += 1
|
||||
else:
|
||||
finish_reason = (
|
||||
response.choices[0].finish_reason
|
||||
if (response.choices and response.choices[0].finish_reason)
|
||||
else "unknown"
|
||||
)
|
||||
error_message = (
|
||||
response.choices[0].message.content
|
||||
if (response.choices and response.choices[0].message)
|
||||
else "No content in message."
|
||||
)
|
||||
print(
|
||||
f"Warning: Received non-standard or empty response content for task label '{current_task_label}'. "
|
||||
f"Finish Reason: '{finish_reason}'. Message: '{error_message}'. Raw Choices: {response.choices}"
|
||||
)
|
||||
failed_api_calls_in_chunk += 1
|
||||
|
||||
except json.JSONDecodeError:
|
||||
print(
|
||||
f"Warning: Could not decode JSON for task label '{current_task_label}'. Content received: '{content_str}'"
|
||||
)
|
||||
failed_api_calls_in_chunk += 1
|
||||
except AttributeError as ae:
|
||||
print(
|
||||
f"Warning: Missing attribute processing response for task label '{current_task_label}': {ae}. Response: {response}"
|
||||
)
|
||||
failed_api_calls_in_chunk += 1
|
||||
except Exception as e:
|
||||
print(
|
||||
f"Warning: Unexpected error processing response for task label '{current_task_label}': {type(e).__name__} - {e}. Response: {response}"
|
||||
)
|
||||
failed_api_calls_in_chunk += 1
|
||||
else:
|
||||
print(
|
||||
f"Warning: Mismatch between #responses ({len(responses) if responses else 0}) "
|
||||
f"and #messages sent ({len(message_chunk)}) for chunk {i + 1}, or no responses. Marking all API calls in chunk as failed."
|
||||
)
|
||||
failed_api_calls_in_chunk = len(message_chunk)
|
||||
|
||||
# --- Update Main DataFrame and Save Periodically ---
|
||||
rows_updated_this_chunk = 0
|
||||
if chunk_task_classifications:
|
||||
print(
|
||||
f"Updating main DataFrame with classifications for {len(chunk_task_classifications)} unique tasks from chunk {i + 1}..."
|
||||
)
|
||||
for task_label, category in chunk_task_classifications.items():
|
||||
# Update all rows in the main df that match this task_label AND are still NA in the output column
|
||||
update_condition = (df["task"] == task_label) & (
|
||||
df[OUTPUT_COLUMN_NAME].isna()
|
||||
)
|
||||
num_rows_for_this_task_label = df[update_condition].shape[0]
|
||||
|
||||
if num_rows_for_this_task_label > 0:
|
||||
df.loc[update_condition, OUTPUT_COLUMN_NAME] = category
|
||||
rows_updated_this_chunk += num_rows_for_this_task_label
|
||||
|
||||
print(
|
||||
f"Updated {rows_updated_this_chunk} rows in the DataFrame based on this chunk's API responses."
|
||||
)
|
||||
print(f"Saving progress to {CLASSIFICATION_FILENAME}...")
|
||||
save_dataframe(df, CLASSIFICATION_FILENAME)
|
||||
else:
|
||||
print(
|
||||
f"No successful API classifications obtained in chunk {i + 1} to update DataFrame or save."
|
||||
)
|
||||
|
||||
print(
|
||||
f"Chunk {i + 1} API summary: Successful Calls={successful_api_calls_in_chunk}, Failed/Skipped Calls={failed_api_calls_in_chunk}. "
|
||||
f"Rows updated in DataFrame this chunk: {rows_updated_this_chunk}"
|
||||
)
|
||||
processed_rows_count_total += rows_updated_this_chunk
|
||||
|
||||
# --- Rate Limiting Pause ---
|
||||
chunk_end_time = time.time()
|
||||
chunk_duration = chunk_end_time - chunk_start_time
|
||||
print(f"Chunk {i + 1} (API calls and DF update) took {chunk_duration:.2f} seconds.")
|
||||
|
||||
if i < num_chunks - 1:
|
||||
time_per_request = SECONDS_PER_MINUTE / RATE_LIMIT if RATE_LIMIT > 0 else 0
|
||||
min_chunk_duration_for_rate = (
|
||||
len(message_chunk) * time_per_request
|
||||
) # Based on API calls made
|
||||
pause_needed = max(0, min_chunk_duration_for_rate - chunk_duration)
|
||||
|
||||
if pause_needed > 0:
|
||||
print(
|
||||
f"Pausing for {pause_needed:.2f} seconds to respect rate limit ({RATE_LIMIT}/min)..."
|
||||
)
|
||||
time.sleep(pause_needed)
|
||||
|
||||
overall_end_time = time.time()
|
||||
total_duration_minutes = (overall_end_time - overall_start_time) / 60
|
||||
print(
|
||||
f"\nBatch classification finished."
|
||||
f" Updated {processed_rows_count_total} rows in '{CLASSIFICATION_FILENAME}' with new classifications in this run."
|
||||
f" Total duration: {total_duration_minutes:.2f} minutes."
|
||||
)
|
||||
|
||||
print(f"Performing final save to {CLASSIFICATION_FILENAME}...")
|
||||
save_dataframe(df, CLASSIFICATION_FILENAME)
|
||||
|
||||
print("\nScript finished.")
|
85
old/create_onet_database.sh
Executable file
|
@@ -0,0 +1,85 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
# Set database name and directories
|
||||
ONET_DB_NAME="onet.database"
|
||||
ONET_ZIP_URL="https://www.onetcenter.org/dl_files/database/db_29_1_mysql.zip"
|
||||
ONET_ZIP_FILE="db_29_1_mysql.zip"
|
||||
ONET_EXTRACT_DIR="db_29_1_mysql"
|
||||
|
||||
# Download O*NET database only if not already downloaded
|
||||
if [ ! -f "$ONET_ZIP_FILE" ]; then
|
||||
echo "Downloading O*NET database from $ONET_ZIP_URL"
|
||||
curl -L -o "$ONET_ZIP_FILE" "$ONET_ZIP_URL" || wget -O "$ONET_ZIP_FILE" "$ONET_ZIP_URL"
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Failed to download O*NET database"
|
||||
exit 1
|
||||
fi
|
||||
else
|
||||
echo "Using existing O*NET database zip file"
|
||||
fi
|
||||
|
||||
# Extract downloaded zip file only if extraction directory doesn't exist
|
||||
if [ ! -d "$ONET_EXTRACT_DIR" ]; then
|
||||
echo "Extracting O*NET database files"
|
||||
unzip -o "$ONET_ZIP_FILE"
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Failed to extract O*NET database files"
|
||||
exit 1
|
||||
fi
|
||||
else
|
||||
echo "Using existing extracted O*NET database files"
|
||||
fi
|
||||
|
||||
# Remove existing database if it exists
|
||||
if [ -f "$ONET_DB_NAME" ]; then
|
||||
echo "Removing existing database"
|
||||
rm "$ONET_DB_NAME"
|
||||
fi
|
||||
|
||||
# Create a new SQLite database with optimized settings for fast import
|
||||
echo "Creating new SQLite database: $ONET_DB_NAME with performance settings"
|
||||
sqlite3 "$ONET_DB_NAME" << EOF
|
||||
PRAGMA journal_mode = OFF;
|
||||
PRAGMA synchronous = 0;
|
||||
PRAGMA cache_size = 1000000;
|
||||
PRAGMA locking_mode = EXCLUSIVE;
|
||||
PRAGMA temp_store = MEMORY;
|
||||
PRAGMA foreign_keys = ON;
|
||||
EOF
|
||||
|
||||
# Combine and execute all SQL files in one transaction
|
||||
echo "Executing SQL files in alphabetical order (single transaction mode)"
|
||||
sqlite3 "$ONET_DB_NAME" << EOF
|
||||
BEGIN TRANSACTION;
|
||||
$(find "$ONET_EXTRACT_DIR" -name "*.sql" | sort | xargs cat)
|
||||
COMMIT;
|
||||
EOF
|
||||
|
||||
# Check if the execution was successful
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Error executing SQL files in batch transaction"
|
||||
exit 1
|
||||
else
|
||||
echo "Database populated successfully. Restoring reliability settings..."
|
||||
|
||||
# Restore reliability-focused settings after import
|
||||
sqlite3 "$ONET_DB_NAME" << EOF
|
||||
PRAGMA journal_mode = WAL;
|
||||
PRAGMA synchronous = NORMAL;
|
||||
PRAGMA locking_mode = NORMAL;
|
||||
PRAGMA temp_store = DEFAULT;
|
||||
PRAGMA foreign_keys = ON;
|
||||
PRAGMA optimize;
|
||||
VACUUM;
|
||||
EOF
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Warning: Failed to restore reliability settings, but database is populated"
|
||||
else
|
||||
echo "Reliability settings restored successfully"
|
||||
fi
|
||||
|
||||
echo "O*NET database created and optimized successfully!"
|
||||
fi
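A quick sanity check after the import, assuming the O*NET tables referenced elsewhere in this commit (task_statements, task_ratings, occupation_data) were created by the SQL files; this snippet is illustrative and not part of the script:

import sqlite3

conn = sqlite3.connect("onet.database")
for table in ("task_statements", "task_ratings", "occupation_data"):
    rows = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
    print(f"{table}: {rows} rows")
conn.close()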
|
392
old/enrich_task_ratings.py
Normal file
|
@@ -0,0 +1,392 @@
|
|||
import sqlite3
|
||||
import pandas as pd
|
||||
import json
|
||||
import os
|
||||
from collections import defaultdict
|
||||
import numpy as np
|
||||
|
||||
# --- Configuration ---
|
||||
DB_FILE = "onet.database"
|
||||
OUTPUT_FILE = "task_ratings_enriched.json" # Changed output filename
|
||||
|
||||
# --- Database Interaction ---
|
||||
|
||||
|
||||
def fetch_data_from_db(db_path):
|
||||
"""
|
||||
Fetches required data from the O*NET SQLite database using JOINs,
|
||||
including DWAs.
|
||||
|
||||
Args:
|
||||
db_path (str): Path to the SQLite database file.
|
||||
|
||||
Returns:
|
||||
tuple(pandas.DataFrame, pandas.DataFrame): A tuple containing:
|
||||
- DataFrame with task ratings info.
|
||||
- DataFrame with task-to-DWA mapping.
|
||||
Returns (None, None) if the database file doesn't exist or an error occurs.
|
||||
"""
|
||||
if not os.path.exists(db_path):
|
||||
print(f"Error: Database file not found at {db_path}")
|
||||
return None, None
|
||||
|
||||
try:
|
||||
conn = sqlite3.connect(db_path)
|
||||
# Construct the SQL query to join the tables and select necessary columns
|
||||
# Added LEFT JOINs for tasks_to_dwas and dwa_reference
|
||||
# Use LEFT JOIN in case a task has no DWAs
|
||||
query = """
|
||||
SELECT
|
||||
tr.onetsoc_code,
|
||||
tr.task_id,
|
||||
ts.task,
|
||||
od.title AS occupation_title,
|
||||
od.description AS occupation_description,
|
||||
tr.scale_id,
|
||||
tr.category,
|
||||
tr.data_value,
|
||||
dr.dwa_title -- Added DWA title
|
||||
FROM
|
||||
task_ratings tr
|
||||
JOIN
|
||||
task_statements ts ON tr.task_id = ts.task_id
|
||||
JOIN
|
||||
occupation_data od ON tr.onetsoc_code = od.onetsoc_code
|
||||
LEFT JOIN
|
||||
tasks_to_dwas td ON tr.onetsoc_code = td.onetsoc_code AND tr.task_id = td.task_id --
|
||||
LEFT JOIN
|
||||
dwa_reference dr ON td.dwa_id = dr.dwa_id; --
|
||||
"""
|
||||
df = pd.read_sql_query(query, conn)
|
||||
conn.close()
|
||||
print(
|
||||
f"Successfully fetched {len(df)} records (including DWA info) from the database."
|
||||
)
|
||||
|
||||
if df.empty:
|
||||
print("Warning: Fetched DataFrame is empty.")
|
||||
# Return empty DataFrames with expected columns if the main fetch is empty
|
||||
ratings_cols = [
|
||||
"onetsoc_code",
|
||||
"task_id",
|
||||
"task",
|
||||
"occupation_title",
|
||||
"occupation_description",
|
||||
"scale_id",
|
||||
"category",
|
||||
"data_value",
|
||||
]
|
||||
dwa_cols = ["onetsoc_code", "task_id", "dwa_title"]
|
||||
return pd.DataFrame(columns=ratings_cols), pd.DataFrame(columns=dwa_cols)
|
||||
|
||||
# Remove duplicates caused by joining ratings with potentially multiple DWAs per task
|
||||
# Keep only unique combinations of the core task/rating info before processing
|
||||
core_cols = [
|
||||
"onetsoc_code",
|
||||
"task_id",
|
||||
"task",
|
||||
"occupation_title",
|
||||
"occupation_description",
|
||||
"scale_id",
|
||||
"category",
|
||||
"data_value",
|
||||
]
|
||||
# Check if all core columns exist before attempting to drop duplicates
|
||||
missing_core_cols = [col for col in core_cols if col not in df.columns]
|
||||
if missing_core_cols:
|
||||
print(f"Error: Missing core columns in fetched data: {missing_core_cols}")
|
||||
return None, None
|
||||
ratings_df = df[core_cols].drop_duplicates().reset_index(drop=True)
|
||||
|
||||
# Get unique DWA info separately
|
||||
dwa_cols = ["onetsoc_code", "task_id", "dwa_title"]
|
||||
# Check if all DWA columns exist before processing
|
||||
if all(col in df.columns for col in dwa_cols):
|
||||
dwas_df = (
|
||||
df[dwa_cols]
|
||||
.dropna(subset=["dwa_title"])
|
||||
.drop_duplicates()
|
||||
.reset_index(drop=True)
|
||||
)
|
||||
else:
|
||||
print("Warning: DWA related columns missing, creating empty DWA DataFrame.")
|
||||
dwas_df = pd.DataFrame(
|
||||
columns=dwa_cols
|
||||
) # Create empty df if columns missing
|
||||
|
||||
return ratings_df, dwas_df # Return two dataframes now
|
||||
|
||||
except sqlite3.Error as e:
|
||||
print(f"SQLite error: {e}")
|
||||
if "conn" in locals() and conn:
|
||||
conn.close()
|
||||
return None, None # Return None for both if error
|
||||
except Exception as e:
|
||||
print(f"An error occurred during data fetching: {e}")
|
||||
if "conn" in locals() and conn:
|
||||
conn.close()
|
||||
return None, None # Return None for both if error
|
||||
|
||||
|
||||
# --- Data Processing ---
|
||||
|
||||
|
||||
def process_task_ratings_with_dwas(ratings_df, dwas_df):
|
||||
"""
|
||||
Processes the fetched data to group, pivot frequency, calculate averages,
|
||||
structure the output, and add associated DWAs.
|
||||
|
||||
Args:
|
||||
ratings_df (pandas.DataFrame): The input DataFrame with task ratings info.
|
||||
dwas_df (pandas.DataFrame): The input DataFrame with task-to-DWA mapping. Can be None or empty.
|
||||
|
||||
Returns:
|
||||
list: A list of dictionaries, each representing an enriched task rating with DWAs.
|
||||
Returns None if the input ratings DataFrame is invalid.
|
||||
"""
|
||||
if ratings_df is None or not isinstance(
|
||||
ratings_df, pd.DataFrame
|
||||
): # Check if it's a DataFrame
|
||||
print("Error: Input ratings DataFrame is invalid.")
|
||||
return None
|
||||
if ratings_df.empty:
|
||||
print(
|
||||
"Warning: Input ratings DataFrame is empty. Processing will yield empty result."
|
||||
)
|
||||
# Decide how to handle empty input, maybe return empty list directly
|
||||
# return []
|
||||
|
||||
# Ensure dwas_df is a DataFrame, even if empty
|
||||
if dwas_df is None or not isinstance(dwas_df, pd.DataFrame):
|
||||
print("Warning: Invalid or missing DWA DataFrame. Proceeding without DWA data.")
|
||||
dwas_df = pd.DataFrame(
|
||||
columns=["onetsoc_code", "task_id", "dwa_title"]
|
||||
) # Ensure it's an empty DF
|
||||
|
||||
print("Starting data processing...")
|
||||
|
||||
# --- 1. Handle Frequency (FT) ---
|
||||
freq_df = ratings_df[ratings_df["scale_id"] == "FT"].copy()
|
||||
if not freq_df.empty:
|
||||
freq_pivot = freq_df.pivot_table(
|
||||
index=["onetsoc_code", "task_id"],
|
||||
columns="category",
|
||||
values="data_value",
|
||||
fill_value=0,
|
||||
)
|
||||
freq_pivot.columns = [
|
||||
f"frequency_category_{int(col)}" for col in freq_pivot.columns
|
||||
]
|
||||
print(f"Processed Frequency data. Shape: {freq_pivot.shape}")
|
||||
else:
|
||||
print("No Frequency (FT) data found.")
|
||||
# Create an empty DataFrame with the multi-index to allow merging later
|
||||
idx = pd.MultiIndex(
|
||||
levels=[[], []], codes=[[], []], names=["onetsoc_code", "task_id"]
|
||||
)
|
||||
freq_pivot = pd.DataFrame(index=idx)
|
||||
|
||||
# --- 2. Handle Importance (IM, IJ) ---
|
||||
imp_df = ratings_df[ratings_df["scale_id"].isin(["IM", "IJ"])].copy()
|
||||
if not imp_df.empty:
|
||||
imp_avg = (
|
||||
imp_df.groupby(["onetsoc_code", "task_id"])["data_value"]
|
||||
.mean()
|
||||
.reset_index()
|
||||
)
|
||||
imp_avg.rename(columns={"data_value": "importance_average"}, inplace=True)
|
||||
print(f"Processed Importance data. Shape: {imp_avg.shape}")
|
||||
else:
|
||||
print("No Importance (IM, IJ) data found.")
|
||||
imp_avg = pd.DataFrame(
|
||||
columns=["onetsoc_code", "task_id", "importance_average"]
|
||||
)
|
||||
|
||||
# --- 3. Handle Relevance (RT) ---
|
||||
rel_df = ratings_df[ratings_df["scale_id"] == "RT"].copy()
|
||||
if not rel_df.empty:
|
||||
rel_avg = (
|
||||
rel_df.groupby(["onetsoc_code", "task_id"])["data_value"]
|
||||
.mean()
|
||||
.reset_index()
|
||||
)
|
||||
rel_avg.rename(columns={"data_value": "relevance_average"}, inplace=True)
|
||||
print(f"Processed Relevance data. Shape: {rel_avg.shape}")
|
||||
else:
|
||||
print("No Relevance (RT) data found.")
|
||||
rel_avg = pd.DataFrame(columns=["onetsoc_code", "task_id", "relevance_average"])
|
||||
|
||||
# --- 4. Process DWAs ---
|
||||
if dwas_df is not None and not dwas_df.empty and "dwa_title" in dwas_df.columns:
|
||||
print("Processing DWA data...")
|
||||
# Group DWAs by task_id and aggregate titles into a list
|
||||
dwas_grouped = (
|
||||
dwas_df.groupby(["onetsoc_code", "task_id"])["dwa_title"]
|
||||
.apply(list)
|
||||
.reset_index()
|
||||
) #
|
||||
dwas_grouped.rename(
|
||||
columns={"dwa_title": "dwas"}, inplace=True
|
||||
) # Rename column to 'dwas'
|
||||
print(f"Processed DWA data. Shape: {dwas_grouped.shape}")
|
||||
else:
|
||||
print("No valid DWA data found or provided for processing.")
|
||||
dwas_grouped = None # Set to None if no DWAs
|
||||
|
||||
# --- 5. Get Base Task/Occupation Info ---
|
||||
base_cols = [
|
||||
"onetsoc_code",
|
||||
"task_id",
|
||||
"task",
|
||||
"occupation_title",
|
||||
"occupation_description",
|
||||
]
|
||||
# Check if base columns exist in ratings_df
|
||||
missing_base_cols = [col for col in base_cols if col not in ratings_df.columns]
|
||||
if missing_base_cols:
|
||||
print(
|
||||
f"Error: Missing base info columns in ratings_df: {missing_base_cols}. Cannot proceed."
|
||||
)
|
||||
return None
|
||||
if not ratings_df.empty:
|
||||
base_info = (
|
||||
ratings_df[base_cols]
|
||||
.drop_duplicates()
|
||||
.set_index(["onetsoc_code", "task_id"])
|
||||
)
|
||||
print(f"Extracted base info. Shape: {base_info.shape}")
|
||||
else:
|
||||
print("Cannot extract base info from empty ratings DataFrame.")
|
||||
# Create an empty df with index to avoid errors later if possible
|
||||
idx = pd.MultiIndex(
|
||||
levels=[[], []], codes=[[], []], names=["onetsoc_code", "task_id"]
|
||||
)
|
||||
base_info = pd.DataFrame(
|
||||
index=idx,
|
||||
columns=[
|
||||
col for col in base_cols if col not in ["onetsoc_code", "task_id"]
|
||||
],
|
||||
)
|
||||
|
||||
# --- 6. Merge Processed Data ---
|
||||
print("Merging processed data...")
|
||||
# Start with base_info, which should have the index ['onetsoc_code', 'task_id']
|
||||
final_df = base_info.merge(
|
||||
freq_pivot, left_index=True, right_index=True, how="left"
|
||||
)
|
||||
# Reset index before merging non-indexed dfs
|
||||
final_df = final_df.reset_index()
|
||||
|
||||
# Merge averages - check if they are not empty before merging
|
||||
if not imp_avg.empty:
|
||||
final_df = final_df.merge(imp_avg, on=["onetsoc_code", "task_id"], how="left")
|
||||
else:
|
||||
final_df["importance_average"] = np.nan # Add column if imp_avg was empty
|
||||
|
||||
if not rel_avg.empty:
|
||||
final_df = final_df.merge(rel_avg, on=["onetsoc_code", "task_id"], how="left")
|
||||
else:
|
||||
final_df["relevance_average"] = np.nan # Add column if rel_avg was empty
|
||||
|
||||
# Merge DWAs if available
|
||||
if dwas_grouped is not None and not dwas_grouped.empty:
|
||||
final_df = final_df.merge(
|
||||
dwas_grouped, on=["onetsoc_code", "task_id"], how="left"
|
||||
) # Merge the dwas list
|
||||
# Fill NaN in 'dwas' column (for tasks with no DWAs) with empty lists
|
||||
# Check if 'dwas' column exists before applying function
|
||||
if "dwas" in final_df.columns:
|
||||
final_df["dwas"] = final_df["dwas"].apply(
|
||||
lambda x: x if isinstance(x, list) else []
|
||||
) # Ensure tasks without DWAs get []
|
||||
else:
|
||||
print("Warning: 'dwas' column not created during merge.")
|
||||
final_df["dwas"] = [
|
||||
[] for _ in range(len(final_df))
|
||||
] # Add empty list column
|
||||
|
||||
else:
|
||||
# Add an empty 'dwas' column if no DWA data was processed or merged
|
||||
final_df["dwas"] = [[] for _ in range(len(final_df))]
|
||||
|
||||
print(f"Final merged data shape: {final_df.shape}")
|
||||
|
||||
# Convert DataFrame to list of dictionaries for JSON output
|
||||
# Handle potential NaN values during JSON conversion
|
||||
# Replace numpy NaN with Python None for JSON compatibility
|
||||
final_df = final_df.replace({np.nan: None})
|
||||
result_list = final_df.to_dict(orient="records")
|
||||
|
||||
return result_list
|
||||
|
||||
|
||||
# --- Output ---
|
||||
|
||||
|
||||
def write_to_json(data, output_path):
|
||||
"""
|
||||
Writes the processed data to a JSON file.
|
||||
|
||||
Args:
|
||||
data (list): The list of dictionaries to write.
|
||||
output_path (str): Path to the output JSON file.
|
||||
"""
|
||||
if data is None:
|
||||
print("No data to write to JSON.")
|
||||
return
|
||||
if not isinstance(data, list):
|
||||
print(
|
||||
f"Error: Data to write is not a list (type: {type(data)}). Cannot write to JSON."
|
||||
)
|
||||
return
|
||||
|
||||
# Create directory if it doesn't exist
|
||||
output_dir = os.path.dirname(output_path)
|
||||
if output_dir and not os.path.exists(output_dir):
|
||||
try:
|
||||
os.makedirs(output_dir)
|
||||
print(f"Created output directory: {output_dir}")
|
||||
except OSError as e:
|
||||
print(f"Error creating output directory {output_dir}: {e}")
|
||||
return # Exit if cannot create directory
|
||||
|
||||
try:
|
||||
with open(output_path, "w", encoding="utf-8") as f:
|
||||
json.dump(data, f, indent=4, ensure_ascii=False)
|
||||
print(f"Successfully wrote enriched data to {output_path}")
|
||||
except IOError as e:
|
||||
print(f"Error writing JSON file to {output_path}: {e}")
|
||||
except TypeError as e:
|
||||
print(f"Error during JSON serialization: {e}. Check data types.")
|
||||
except Exception as e:
|
||||
print(f"An unexpected error occurred during JSON writing: {e}")
|
||||
|
||||
|
||||
# --- Main Execution ---
|
||||
|
||||
if __name__ == "__main__":
|
||||
print("Starting O*NET Task Ratings & DWAs Enrichment Script...")
|
||||
# 1. Fetch data
|
||||
ratings_data_df, dwas_data_df = fetch_data_from_db(DB_FILE) # Fetch both datasets
|
||||
|
||||
# 2. Process data
|
||||
# Proceed only if ratings_data_df is a valid DataFrame (even if empty)
|
||||
# dwas_data_df can be None or empty, handled inside process function
|
||||
if isinstance(ratings_data_df, pd.DataFrame):
|
||||
enriched_data = process_task_ratings_with_dwas(
|
||||
ratings_data_df, dwas_data_df
|
||||
) # Pass both dataframes
|
||||
|
||||
# 3. Write output
|
||||
if (
|
||||
enriched_data is not None
|
||||
): # Check if processing returned data (even an empty list is valid)
|
||||
write_to_json(enriched_data, OUTPUT_FILE)
|
||||
else:
|
||||
print("Data processing failed or returned None. No output file generated.")
|
||||
else:
|
||||
print(
|
||||
"Data fetching failed or returned invalid type for ratings data. Script terminated."
|
||||
)
|
||||
|
||||
print("Script finished.")
|
|
@@ -7,6 +7,7 @@ from .run import Run
|
|||
import pandas as pd
|
||||
|
||||
def enrich_with_task_estimateability(run: Run) -> pd.DataFrame:
|
||||
run.metadata.
|
||||
raise NotImplementedError
|
||||
|
||||
def enrich_with_task_estimates(run: Run) -> pd.DataFrame:
|
||||
|
|
|
@@ -5,13 +5,148 @@ Fetchers retrieve remote data and return it in a format suitable for further pro
|
|||
import sqlite3
|
||||
from typing import Tuple
|
||||
import pandas as pd
|
||||
from .metadata import Metadata
|
||||
import requests
|
||||
import hashlib
|
||||
import io
|
||||
import zipfile
|
||||
from .run import Run
|
||||
from .logger import logger
|
||||
|
||||
def fetch_onet_database(meta: Metadata) -> Tuple[sqlite3.Connection, str]:
|
||||
raise NotImplementedError
|
||||
def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
|
||||
"""
|
||||
Downloads the O*NET database, creates a local SQLite file from it, and returns a connection.
|
||||
The version is the sha256 of the downloaded zip file.
|
||||
"""
|
||||
url = "https://www.onetcenter.org/dl_files/database/db_29_1_mysql.zip"
|
||||
logger.info(f"Downloading O*NET database from {url}")
|
||||
response = requests.get(url, stream=True)
|
||||
response.raise_for_status()
|
||||
|
||||
def fetch_oesm_data(meta: Metadata) -> Tuple[pd.DataFrame, str]:
|
||||
raise NotImplementedError
|
||||
# Read content into memory
|
||||
zip_content = response.content
|
||||
version = hashlib.sha256(zip_content).hexdigest()
|
||||
logger.info(f"O*NET database version (sha256): {version}")
|
||||
|
||||
def fetch_epoch_remote_data(meta: Metadata) -> Tuple[pd.DataFrame, str]:
|
||||
raise NotImplementedError
|
||||
db_path = run.cache_dir / f"onet_{version}.db"
|
||||
|
||||
if db_path.exists():
|
||||
logger.info(f"Using cached O*NET database: {db_path}")
|
||||
conn = sqlite3.connect(db_path)
|
||||
# Set PRAGMA for foreign keys on every connection
|
||||
conn.execute("PRAGMA foreign_keys = ON;")
|
||||
return conn, version
|
||||
|
||||
logger.info(f"Creating new O*NET database: {db_path}")
|
||||
conn = sqlite3.connect(db_path)
|
||||
|
||||
# Set performance PRAGMAs for fast import
|
||||
logger.info("Creating new SQLite database with performance settings")
|
||||
conn.executescript("""
|
||||
PRAGMA journal_mode = OFF;
|
||||
PRAGMA synchronous = 0;
|
||||
PRAGMA cache_size = 1000000;
|
||||
PRAGMA locking_mode = EXCLUSIVE;
|
||||
PRAGMA temp_store = MEMORY;
|
||||
PRAGMA foreign_keys = ON;
|
||||
""")
|
||||
|
||||
with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
|
||||
sql_scripts = []
|
||||
for filename in sorted(z.namelist()):
|
||||
if filename.endswith(".sql"):
|
||||
sql_scripts.append(z.read(filename).decode('utf-8'))
|
||||
|
||||
if not sql_scripts:
|
||||
raise RuntimeError("No SQL files found in the O*NET zip archive.")
|
||||
|
||||
# Combine and execute all SQL files in one transaction
|
||||
full_script = "BEGIN TRANSACTION;\n" + "\n".join(sql_scripts) + "\nCOMMIT;"
|
||||
|
||||
logger.info("Executing SQL files in alphabetical order (single transaction mode)")
|
||||
conn.executescript(full_script)
|
||||
logger.info("Database populated successfully. Restoring reliability settings...")
|
||||
|
||||
# Restore reliability-focused settings after import
|
||||
conn.executescript("""
|
||||
PRAGMA journal_mode = WAL;
|
||||
PRAGMA synchronous = NORMAL;
|
||||
PRAGMA locking_mode = NORMAL;
|
||||
PRAGMA temp_store = DEFAULT;
|
||||
PRAGMA foreign_keys = ON;
|
||||
PRAGMA optimize;
|
||||
""")
|
||||
conn.execute("VACUUM;")
|
||||
conn.commit()
|
||||
logger.info("Reliability settings restored and database optimized successfully!")
|
||||
|
||||
return conn, version
|
||||
|
||||
def fetch_oesm_data(run: Run) -> Tuple[pd.DataFrame, str]:
|
||||
"""
|
||||
Downloads the OESM national data from the BLS website.
|
||||
The version is the sha256 of the downloaded zip file.
|
||||
"""
|
||||
url = "https://www.bls.gov/oes/special-requests/oesm23nat.zip"
|
||||
logger.info(f"Downloading OESM data from {url}")
|
||||
response = requests.get(url)
|
||||
response.raise_for_status()
|
||||
|
||||
zip_content = response.content
|
||||
version = hashlib.sha256(zip_content).hexdigest()
|
||||
logger.info(f"OESM data version (sha256): {version}")
|
||||
|
||||
parquet_path = run.cache_dir / f"oesm_{version}.parquet"
|
||||
if parquet_path.exists():
|
||||
logger.info(f"Using cached OESM data: {parquet_path}")
|
||||
return pd.read_parquet(parquet_path), version
|
||||
|
||||
logger.info(f"Creating new OESM data cache: {parquet_path}")
|
||||
with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
|
||||
# Find the excel file in the zip
|
||||
excel_filename = None
|
||||
for filename in z.namelist():
|
||||
logger.debug(f"Found file in OESM zip: {filename}")
|
||||
if filename.lower().endswith(".xlsx"):
|
||||
excel_filename = filename
|
||||
break
|
||||
|
||||
if excel_filename is None:
|
||||
raise FileNotFoundError("Could not find the Excel file in the OESM zip archive.")
|
||||
|
||||
logger.info(f"Reading {excel_filename} from zip archive.")
|
||||
with z.open(excel_filename) as f:
|
||||
df = pd.read_excel(f, engine='openpyxl')
|
||||
|
||||
df.to_parquet(parquet_path)
|
||||
logger.info(f"Saved OESM data to cache: {parquet_path}")
|
||||
return df, version
|
||||
|
||||
def fetch_epoch_remote_data(run: Run) -> Tuple[pd.DataFrame, str]:
|
||||
"""
|
||||
Downloads the EPOCH AI remote work task data.
|
||||
The version is the sha256 of the downloaded CSV file.
|
||||
"""
|
||||
# This is the direct download link constructed from the Google Drive share link
|
||||
url = "https://drive.google.com/uc?export=download&id=1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r"
|
||||
logger.info(f"Downloading EPOCH remote data from Google Drive: {url}")
|
||||
|
||||
# Need to handle potential cookies/redirects from Google Drive
|
||||
session = requests.Session()
|
||||
response = session.get(url, stream=True)
|
||||
response.raise_for_status()
|
||||
|
||||
csv_content = response.content
|
||||
version = hashlib.sha256(csv_content).hexdigest()
|
||||
logger.info(f"EPOCH remote data version (sha256): {version}")
|
||||
|
||||
parquet_path = run.cache_dir / f"epoch_remote_{version}.parquet"
|
||||
if parquet_path.exists():
|
||||
logger.info(f"Using cached EPOCH remote data: {parquet_path}")
|
||||
return pd.read_parquet(parquet_path), version
|
||||
|
||||
logger.info(f"Creating new EPOCH remote data cache: {parquet_path}")
|
||||
df = pd.read_csv(io.BytesIO(csv_content))
|
||||
df.to_parquet(parquet_path)
|
||||
logger.info(f"Saved EPOCH remote data to cache: {parquet_path}")
|
||||
|
||||
return df, version
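A hedged usage sketch of these fetchers, mirroring how run.py wires them together further down in this commit; the module path pipeline.fetchers and the cache directory are assumptions, not confirmed by the diff:

from pathlib import Path
from pipeline.run import Run
from pipeline.fetchers import fetch_onet_database, fetch_oesm_data, fetch_epoch_remote_data

run = Run(output_dir=Path("."), cache_dir=Path("/tmp/econtai-cache"))  # placeholder paths
conn, onet_version = fetch_onet_database(run)
oesm_df, oesm_version = fetch_oesm_data(run)
epoch_df, epoch_version = fetch_epoch_remote_data(run)
print(onet_version, len(oesm_df), len(epoch_df))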
|
||||
|
|
|
@@ -2,5 +2,5 @@ from ..run import Run
|
|||
from pathlib import Path
|
||||
from typing import Generator
|
||||
|
||||
def generate_estimate_histplot(run: Run, output_dir: Path) -> Generator[Path]:
|
||||
def generate_estimate_histplot(run: Run) -> Generator[Path]:
|
||||
raise NotImplementedError
|
||||
|
|
24
pipeline/logger.py
Normal file
|
@@ -0,0 +1,24 @@
|
|||
import logging
|
||||
from logging.handlers import RotatingFileHandler
|
||||
from rich.logging import RichHandler
|
||||
|
||||
LOGGER_NAME = "pipeline"
|
||||
|
||||
def setup_logging() -> logging.Logger:
|
||||
# Set up Rich console handler
|
||||
rich_handler = RichHandler(
|
||||
level=logging.DEBUG,
|
||||
show_time=True,
|
||||
enable_link_path=True,
|
||||
rich_tracebacks=True,
|
||||
# omit_repeated_times=False,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(LOGGER_NAME)
|
||||
logger.setLevel(logging.DEBUG)
|
||||
logger.addHandler(rich_handler)
|
||||
|
||||
return logger
|
||||
|
||||
|
||||
logger = setup_logging()
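Other pipeline modules (fetchers.py above, for instance) import this module-level logger directly; a minimal usage sketch, assuming the pipeline.logger module path shown in the file header:

from pipeline.logger import logger

logger.info("Starting fetch step")
logger.debug("Cache directory resolved")
try:
    raise ValueError("example failure")
except ValueError:
    logger.exception("Step failed")  # rich_tracebacks=True renders the traceback via Rich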
|
|
@@ -16,6 +16,7 @@ class Metadata(BaseModel):
|
|||
versions, and other important information.
|
||||
"""
|
||||
fetchers: Dict[str, Dict[str, Any]] = Field(default_factory=dict)
|
||||
enrichments: Dict[str, Dict[str, Any]] = Field(default_factory=dict)
|
||||
|
||||
ts: str = Field(default_factory=lambda: datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
|
||||
commit: str = Field(default_factory=lambda: _get_current_commit())
|
||||
|
|
|
@@ -1,6 +1,7 @@
|
|||
from pydantic import BaseModel, Field
|
||||
import sqlite3
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from .metadata import Metadata
|
||||
|
||||
|
@@ -20,3 +21,6 @@ class Run(BaseModel):
|
|||
task_estimates_df: Optional[pd.DataFrame] = None
|
||||
|
||||
meta: Metadata = Field(default_factory=Metadata)
|
||||
|
||||
cache_dir: Path
|
||||
output_dir: Path
|
||||
|
|
|
@@ -5,11 +5,14 @@ from .postprocessors import check_for_insanity, create_df_tasks
|
|||
from .generators import GENERATORS
|
||||
from .run import Run
|
||||
from .constants import GRAY
|
||||
import platformdirs
|
||||
import seaborn as sns
|
||||
import matplotlib as mpl
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
CACHE_DIR = platformdirs.user_cache_dir("econtai")
|
||||
|
||||
def run(output_dir: Optional[str] = None):
|
||||
if output_dir is None:
|
||||
output_dir = Path(".")
|
||||
|
@@ -17,12 +20,12 @@ def run(output_dir: Optional[str] = None):
|
|||
load_dotenv()
|
||||
_setup_graph_rendering()
|
||||
|
||||
current_run = Run()
|
||||
current_run = Run(output_dir=output_dir, cache_dir=CACHE_DIR)
|
||||
|
||||
# Fetchers (fetchers.py)
|
||||
current_run.onet_conn, current_run.onet_version = fetch_onet_database(current_run.meta)
|
||||
current_run.oesm_df, current_run.oesm_version = fetch_oesm_data(current_run.meta)
|
||||
current_run.epoch_df, current_run.epoch_version = fetch_epoch_remote_data(current_run.meta)
|
||||
current_run.onet_conn, current_run.onet_version = fetch_onet_database(current_run)
|
||||
current_run.oesm_df, current_run.oesm_version = fetch_oesm_data(current_run)
|
||||
current_run.epoch_df, current_run.epoch_version = fetch_epoch_remote_data(current_run)
|
||||
|
||||
# Enrichments (enrichments.py)
|
||||
current_run.task_estimateability_df = enrich_with_task_estimateability(current_run)
|
||||
|
@@ -34,7 +37,7 @@ def run(output_dir: Optional[str] = None):
|
|||
|
||||
# Generators (generators/)
|
||||
for gen in GENERATORS:
|
||||
gen(current_run, output_dir)
|
||||
gen(current_run)
|
||||
|
||||
|
||||
def _setup_graph_rendering():
|
||||
|
|
|
@ -6,9 +6,12 @@ readme = "README.md"
|
|||
requires-python = ">=3.13"
|
||||
dependencies = [
|
||||
"matplotlib>=3.10.3",
|
||||
"openpyxl>=3.1.5",
|
||||
"pandas>=2.2.3",
|
||||
"platformdirs>=4.3.8",
|
||||
"pydantic>=2.11.7",
|
||||
"python-dotenv>=1.1.1",
|
||||
"requests>=2.32.4",
|
||||
"seaborn>=0.13.2",
|
||||
]
|
||||
|
||||
|
|
100
uv.lock
generated
|
@@ -11,6 +11,37 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload_time = "2024-05-20T21:33:24.1Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "certifi"
|
||||
version = "2025.6.15"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/73/f7/f14b46d4bcd21092d7d3ccef689615220d8a08fb25e564b65d20738e672e/certifi-2025.6.15.tar.gz", hash = "sha256:d747aa5a8b9bbbb1bb8c22bb13e22bd1f18e9796defa16bab421f7f7a317323b", size = 158753, upload_time = "2025-06-15T02:45:51.329Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/84/ae/320161bd181fc06471eed047ecce67b693fd7515b16d495d8932db763426/certifi-2025.6.15-py3-none-any.whl", hash = "sha256:2e0c7ce7cb5d8f8634ca55d2ba7e6ec2689a2fd6537d8dec1296a477a4910057", size = 157650, upload_time = "2025-06-15T02:45:49.977Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "charset-normalizer"
|
||||
version = "3.4.2"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/e4/33/89c2ced2b67d1c2a61c19c6751aa8902d46ce3dacb23600a283619f5a12d/charset_normalizer-3.4.2.tar.gz", hash = "sha256:5baececa9ecba31eff645232d59845c07aa030f0c81ee70184a90d35099a0e63", size = 126367, upload_time = "2025-05-02T08:34:42.01Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/ea/12/a93df3366ed32db1d907d7593a94f1fe6293903e3e92967bebd6950ed12c/charset_normalizer-3.4.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:926ca93accd5d36ccdabd803392ddc3e03e6d4cd1cf17deff3b989ab8e9dbcf0", size = 199622, upload_time = "2025-05-02T08:32:56.363Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/04/93/bf204e6f344c39d9937d3c13c8cd5bbfc266472e51fc8c07cb7f64fcd2de/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eba9904b0f38a143592d9fc0e19e2df0fa2e41c3c3745554761c5f6447eedabf", size = 143435, upload_time = "2025-05-02T08:32:58.551Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/22/2a/ea8a2095b0bafa6c5b5a55ffdc2f924455233ee7b91c69b7edfcc9e02284/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3fddb7e2c84ac87ac3a947cb4e66d143ca5863ef48e4a5ecb83bd48619e4634e", size = 153653, upload_time = "2025-05-02T08:33:00.342Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b6/57/1b090ff183d13cef485dfbe272e2fe57622a76694061353c59da52c9a659/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98f862da73774290f251b9df8d11161b6cf25b599a66baf087c1ffe340e9bfd1", size = 146231, upload_time = "2025-05-02T08:33:02.081Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/e2/28/ffc026b26f441fc67bd21ab7f03b313ab3fe46714a14b516f931abe1a2d8/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c9379d65defcab82d07b2a9dfbfc2e95bc8fe0ebb1b176a3190230a3ef0e07c", size = 148243, upload_time = "2025-05-02T08:33:04.063Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/c0/0f/9abe9bd191629c33e69e47c6ef45ef99773320e9ad8e9cb08b8ab4a8d4cb/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e635b87f01ebc977342e2697d05b56632f5f879a4f15955dfe8cef2448b51691", size = 150442, upload_time = "2025-05-02T08:33:06.418Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/67/7c/a123bbcedca91d5916c056407f89a7f5e8fdfce12ba825d7d6b9954a1a3c/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1c95a1e2902a8b722868587c0e1184ad5c55631de5afc0eb96bc4b0d738092c0", size = 145147, upload_time = "2025-05-02T08:33:08.183Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ec/fe/1ac556fa4899d967b83e9893788e86b6af4d83e4726511eaaad035e36595/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ef8de666d6179b009dce7bcb2ad4c4a779f113f12caf8dc77f0162c29d20490b", size = 153057, upload_time = "2025-05-02T08:33:09.986Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/2b/ff/acfc0b0a70b19e3e54febdd5301a98b72fa07635e56f24f60502e954c461/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:32fc0341d72e0f73f80acb0a2c94216bd704f4f0bce10aedea38f30502b271ff", size = 156454, upload_time = "2025-05-02T08:33:11.814Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/92/08/95b458ce9c740d0645feb0e96cea1f5ec946ea9c580a94adfe0b617f3573/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:289200a18fa698949d2b39c671c2cc7a24d44096784e76614899a7ccf2574b7b", size = 154174, upload_time = "2025-05-02T08:33:13.707Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/78/be/8392efc43487ac051eee6c36d5fbd63032d78f7728cb37aebcc98191f1ff/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4a476b06fbcf359ad25d34a057b7219281286ae2477cc5ff5e3f70a246971148", size = 149166, upload_time = "2025-05-02T08:33:15.458Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/44/96/392abd49b094d30b91d9fbda6a69519e95802250b777841cf3bda8fe136c/charset_normalizer-3.4.2-cp313-cp313-win32.whl", hash = "sha256:aaeeb6a479c7667fbe1099af9617c83aaca22182d6cf8c53966491a0f1b7ffb7", size = 98064, upload_time = "2025-05-02T08:33:17.06Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/e9/b0/0200da600134e001d91851ddc797809e2fe0ea72de90e09bec5a2fbdaccb/charset_normalizer-3.4.2-cp313-cp313-win_amd64.whl", hash = "sha256:aa6af9e7d59f9c12b33ae4e9450619cf2488e2bbe9b44030905877f0b2324980", size = 105641, upload_time = "2025-05-02T08:33:18.753Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/20/94/c5790835a017658cbfabd07f3bfb549140c3ac458cfc196323996b10095a/charset_normalizer-3.4.2-py3-none-any.whl", hash = "sha256:7f56930ab0abd1c45cd15be65cc741c28b1c9a34876ce8c17a2fa107810c0af0", size = 52626, upload_time = "2025-05-02T08:34:40.053Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "contourpy"
|
||||
version = "1.3.2"
|
||||
|
@@ -51,6 +82,15 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30", size = 8321, upload_time = "2023-10-07T05:32:16.783Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "et-xmlfile"
|
||||
version = "2.0.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload_time = "2024-10-25T17:25:40.039Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload_time = "2024-10-25T17:25:39.051Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fonttools"
|
||||
version = "4.58.5"
|
||||
|
@@ -68,6 +108,15 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/d7/d4/1d85a1996b6188cd2713230e002d79a6f3a289bb17cef600cba385848b72/fonttools-4.58.5-py3-none-any.whl", hash = "sha256:e48a487ed24d9b611c5c4b25db1e50e69e9854ca2670e39a3486ffcd98863ec4", size = 1115318, upload_time = "2025-07-03T14:04:45.378Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "idna"
|
||||
version = "3.10"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490, upload_time = "2024-09-15T18:07:39.745Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload_time = "2024-09-15T18:07:37.964Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kiwisolver"
|
||||
version = "1.4.8"
|
||||
|
@@ -163,6 +212,18 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/63/be/b85e4aa4bf42c6502851b971f1c326d583fcc68227385f92089cf50a7b45/numpy-2.2.5-cp313-cp313t-win_amd64.whl", hash = "sha256:d403c84991b5ad291d3809bace5e85f4bbf44a04bdc9a88ed2bb1807b3360bb8", size = 12750096, upload_time = "2025-04-19T22:47:00.147Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "openpyxl"
|
||||
version = "3.1.5"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "et-xmlfile" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload_time = "2024-06-28T14:03:44.161Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload_time = "2024-06-28T14:03:41.161Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "packaging"
|
||||
version = "25.0"
|
||||
|
@@ -254,6 +315,15 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/89/c7/5572fa4a3f45740eaab6ae86fcdf7195b55beac1371ac8c619d880cfe948/pillow-11.3.0-cp314-cp314t-win_arm64.whl", hash = "sha256:79ea0d14d3ebad43ec77ad5272e6ff9bba5b679ef73375ea760261207fa8e0aa", size = 2512835, upload_time = "2025-07-01T09:15:50.399Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "platformdirs"
|
||||
version = "4.3.8"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/fe/8b/3c73abc9c759ecd3f1f7ceff6685840859e8070c4d947c93fae71f6a0bf2/platformdirs-4.3.8.tar.gz", hash = "sha256:3d512d96e16bcb959a814c9f348431070822a6496326a4be0911c40b5a74c2bc", size = 21362, upload_time = "2025-05-07T22:47:42.121Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/fe/39/979e8e21520d4e47a0bbe349e2713c0aac6f3d853d0e5b34d76206c439aa/platformdirs-4.3.8-py3-none-any.whl", hash = "sha256:ff7059bb7eb1179e2685604f4aaf157cfd9535242bd23742eadc3c13542139b4", size = 18567, upload_time = "2025-05-07T22:47:40.376Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pydantic"
|
||||
version = "2.11.7"
|
||||
|
@@ -336,6 +406,21 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload_time = "2025-03-25T02:24:58.468Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "requests"
|
||||
version = "2.32.4"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "certifi" },
|
||||
{ name = "charset-normalizer" },
|
||||
{ name = "idna" },
|
||||
{ name = "urllib3" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/e1/0a/929373653770d8a0d7ea76c37de6e41f11eb07559b103b1c02cafb3f7cf8/requests-2.32.4.tar.gz", hash = "sha256:27d0316682c8a29834d3264820024b62a36942083d52caf2f14c0591336d3422", size = 135258, upload_time = "2025-06-09T16:43:07.34Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/7c/e4/56027c4a6b4ae70ca9de302488c5ca95ad4a39e190093d6c1a8ace08341b/requests-2.32.4-py3-none-any.whl", hash = "sha256:27babd3cda2a6d50b30443204ee89830707d396671944c998b5975b031ac2b2c", size = 64847, upload_time = "2025-06-09T16:43:05.728Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "seaborn"
|
||||
version = "0.13.2"
|
||||
|
@ -365,18 +450,24 @@ version = "0.1.0"
|
|||
source = { virtual = "." }
|
||||
dependencies = [
|
||||
{ name = "matplotlib" },
|
||||
{ name = "openpyxl" },
|
||||
{ name = "pandas" },
|
||||
{ name = "platformdirs" },
|
||||
{ name = "pydantic" },
|
||||
{ name = "python-dotenv" },
|
||||
{ name = "requests" },
|
||||
{ name = "seaborn" },
|
||||
]
|
||||
|
||||
[package.metadata]
|
||||
requires-dist = [
|
||||
{ name = "matplotlib", specifier = ">=3.10.3" },
|
||||
{ name = "openpyxl", specifier = ">=3.1.5" },
|
||||
{ name = "pandas", specifier = ">=2.2.3" },
|
||||
{ name = "platformdirs", specifier = ">=4.3.8" },
|
||||
{ name = "pydantic", specifier = ">=2.11.7" },
|
||||
{ name = "python-dotenv", specifier = ">=1.1.1" },
|
||||
{ name = "requests", specifier = ">=2.32.4" },
|
||||
{ name = "seaborn", specifier = ">=0.13.2" },
|
||||
]
|
||||
|
||||
|
@@ -412,3 +503,12 @@ sdist = { url = "https://files.pythonhosted.org/packages/95/32/1a225d6164441be76
|
|||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload_time = "2025-03-23T13:54:41.845Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "urllib3"
|
||||
version = "2.5.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/15/22/9ee70a2574a4f4599c47dd506532914ce044817c7752a79b6a51286319bc/urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760", size = 393185, upload_time = "2025-06-18T14:07:41.644Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 129795, upload_time = "2025-06-18T14:07:40.39Z" },
|
||||
]
|
||||
|
|