diff --git a/agents.md b/agents.md index c6553b6..1ad96c4 100644 --- a/agents.md +++ b/agents.md @@ -1,2 +1,3 @@ - I use Nix. To run a command, prefix it with `nix develop .#impure -c` - I use uv. To add a package, use: uv add. To run a script, use: uv run path/to/script +- To run the pipeline: `uv run -m pipeline.runner` diff --git a/dist/estimate_distribution_histplot.png b/dist/estimate_distribution_histplot.png new file mode 100644 index 0000000..7ff3cf9 Binary files /dev/null and b/dist/estimate_distribution_histplot.png differ diff --git a/dist/estimates_lower_vs_upper_scatter.png b/dist/estimates_lower_vs_upper_scatter.png new file mode 100644 index 0000000..2908733 Binary files /dev/null and b/dist/estimates_lower_vs_upper_scatter.png differ diff --git a/dist/estimates_spread_per_occupation.png b/dist/estimates_spread_per_occupation.png new file mode 100644 index 0000000..7d57749 Binary files /dev/null and b/dist/estimates_spread_per_occupation.png differ diff --git a/dist/intermediate/df_tasks.parquet b/dist/intermediate/df_tasks.parquet new file mode 100644 index 0000000..3f9dc1e Binary files /dev/null and b/dist/intermediate/df_tasks.parquet differ diff --git a/dist/intermediate/estimable_tasks_with_estimates.parquet b/dist/intermediate/estimable_tasks_with_estimates.parquet new file mode 100644 index 0000000..3d5b740 Binary files /dev/null and b/dist/intermediate/estimable_tasks_with_estimates.parquet differ diff --git a/dist/intermediate/task_summary_by_major_occupation.parquet b/dist/intermediate/task_summary_by_major_occupation.parquet new file mode 100644 index 0000000..eb5d72e Binary files /dev/null and b/dist/intermediate/task_summary_by_major_occupation.parquet differ diff --git a/dist/intermediate/task_summary_by_occupation.parquet b/dist/intermediate/task_summary_by_occupation.parquet new file mode 100644 index 0000000..8931eb3 Binary files /dev/null and b/dist/intermediate/task_summary_by_occupation.parquet differ diff --git a/dist/projected_automatable_wage_bill_sensitivity.png b/dist/projected_automatable_wage_bill_sensitivity.png new file mode 100644 index 0000000..1d12a91 Binary files /dev/null and b/dist/projected_automatable_wage_bill_sensitivity.png differ diff --git a/dist/projected_task_automation_p50.png b/dist/projected_task_automation_p50.png new file mode 100644 index 0000000..4eb4816 Binary files /dev/null and b/dist/projected_task_automation_p50.png differ diff --git a/dist/projected_task_automation_p80.png b/dist/projected_task_automation_p80.png new file mode 100644 index 0000000..18bb9a8 Binary files /dev/null and b/dist/projected_task_automation_p80.png differ diff --git a/dist/sequential_coherence_cdf.png b/dist/sequential_coherence_cdf.png new file mode 100644 index 0000000..befdf0e Binary files /dev/null and b/dist/sequential_coherence_cdf.png differ diff --git a/old/add_task_estimates.py b/old/add_task_estimates.py deleted file mode 100644 index e72a532..0000000 --- a/old/add_task_estimates.py +++ /dev/null @@ -1,507 +0,0 @@ -import pandas as pd -import litellm -import dotenv -import os -import time -import json -import math -import numpy as np - -# --- Configuration --- -MODEL = "gpt-4.1-mini" # Make sure this model supports json_schema or structured output -RATE_LIMIT = 5000 # Requests per minute -CHUNK_SIZE = 300 -SECONDS_PER_MINUTE = 60 -FILENAME = ( - "tasks_with_estimates.csv" # This CSV should contain the tasks to be processed -) - -# --- Prompts and Schema --- -SYSTEM_PROMPT = """ -You are an expert assistant evaluating the time to 
completion required for job tasks. Your goal is to estimate the time range needed for a skilled human to complete the following job task remotely, without supervision. - -Provide a lower and upper bound estimate for the time to completion. These bounds should capture the time within which approximately 80% of instances of performing this specific task are typically completed by a qualified individual. - -Base your estimate on the provided task description, its associated activities, and the occupational context. Your estimate must be in one of the allowed units: minute, hour, day, week, month, trimester, semester, year. -""".strip() - -USER_MESSAGE_TEMPLATE = """ -Please estimate the time range for the following remote task: - -**Task Description:** {task} -**Relevant activities for the task:** -{dwas} - -**Occupation Category:** {occupation_title} -**Occupation Description:** {occupation_description} - -Consider the complexity and the typical steps involved. -""".strip() - -ALLOWED_UNITS = [ - "minute", - "hour", - "day", - "week", - "month", - "trimester", - "semester", - "year", -] - -SCHEMA_FOR_VALIDATION = { - "name": "estimate_time", - "strict": True, # Enforce schema adherence - "schema": { - "type": "object", - "properties": { - "lower_bound_estimate": { - "type": "object", - "properties": { - "quantity": { - "type": "number", - "description": "The numerical value for the lower bound of the estimate.", - }, - "unit": { - "type": "string", - "enum": ALLOWED_UNITS, - "description": "The unit of time for the lower bound.", - }, - }, - "required": ["quantity", "unit"], - "additionalProperties": False, - }, - "upper_bound_estimate": { - "type": "object", - "properties": { - "quantity": { - "type": "number", - "description": "The numerical value for the upper bound of the estimate.", - }, - "unit": { - "type": "string", - "enum": ALLOWED_UNITS, - "description": "The unit of time for the upper bound.", - }, - }, - "required": ["quantity", "unit"], - "additionalProperties": False, - }, - }, - "required": ["lower_bound_estimate", "upper_bound_estimate"], - "additionalProperties": False, - }, -} - - -def save_dataframe(df_to_save, filename): - - """Saves the DataFrame to the specified CSV file using atomic write.""" - try: - temp_filename = filename + ".tmp" - df_to_save.to_csv(temp_filename, encoding="utf-8-sig", index=False) - os.replace(temp_filename, filename) - except Exception as e: - print(f"--- Error saving DataFrame to {filename}: {e} ---") - if os.path.exists(temp_filename): - try: - os.remove(temp_filename) - except Exception as remove_err: - print( - f"--- Error removing temporary save file {temp_filename}: {remove_err} ---" - ) - -def create_task_estimates(): - try: - # Read the CSV - if os.path.exists(FILENAME): - df = pd.read_csv(FILENAME, encoding="utf-8-sig") - print(f"Successfully read {len(df)} rows from {FILENAME}.") - - estimate_columns_spec = { - "lb_estimate_qty": float, - "lb_estimate_unit": object, - "ub_estimate_qty": float, - "ub_estimate_unit": object, - } - save_needed = False - - for col_name, target_dtype in estimate_columns_spec.items(): - if col_name not in df.columns: - # Initialize with a type-compatible missing value - if target_dtype == float: - df[col_name] = np.nan - else: # object - df[col_name] = pd.NA - df[col_name] = df[col_name].astype(target_dtype) # Enforce dtype - print(f"Added '{col_name}' column as {df[col_name].dtype}.") - save_needed = True - else: - # Column exists, ensure correct dtype - current_pd_dtype = df[col_name].dtype - 
expected_pd_dtype = pd.Series(dtype=target_dtype).dtype - - if current_pd_dtype != expected_pd_dtype: - try: - if target_dtype == float: - df[col_name] = pd.to_numeric(df[col_name], errors="coerce") - else: # object - df[col_name] = df[col_name].astype(object) - print( - f"Corrected dtype of '{col_name}' to {df[col_name].dtype}." - ) - save_needed = True - except Exception as e: - print( - f"Warning: Could not convert column '{col_name}' to {target_dtype}: {e}. Current dtype: {current_pd_dtype}" - ) - - # Standardize missing values (e.g., empty strings to NA/NaN) - # Replace common missing placeholders with pd.NA first - df[col_name].replace(["", None, ""], pd.NA, inplace=True) - if target_dtype == float: - # For float columns, ensure they are numeric and use np.nan after replacement - df[col_name] = pd.to_numeric(df[col_name], errors="coerce") - - if save_needed: - print(f"Saving {FILENAME} after adding/adjusting estimate columns.") - save_dataframe(df, FILENAME) - else: - print( - f"Error: {FILENAME} not found. Please ensure the file exists and contains task data." - ) - exit() - except FileNotFoundError: - print( - f"Error: {FILENAME} not found. Please ensure the file exists and contains task data." - ) - exit() - except Exception as e: - print(f"Error reading or initializing {FILENAME}: {e}") - exit() - - # --- Identify Rows to Process --- - # We'll check for NaN in one of the primary quantity columns. - unprocessed_mask = df["lb_estimate_qty"].isna() - if unprocessed_mask.any(): - start_index = unprocessed_mask.idxmax() # Finds the index of the first True value - print(f"Resuming processing. First unprocessed row found at index {start_index}.") - df_to_process = df.loc[unprocessed_mask].copy() - original_indices = df_to_process.index # Keep track of original indices - else: - print( - "All rows seem to have estimates already (based on 'lb_estimate_qty'). Exiting." - ) - exit() - - - # --- Prepare messages for batch completion (only for rows needing processing) --- - messages_list = [] - skipped_rows_indices = [] - valid_original_indices = [] - - if not df_to_process.empty: - required_cols = ["task", "occupation_title", "occupation_description", "dwas"] - print( - f"Preparing messages for up to {len(df_to_process)} rows starting from original index {original_indices[0] if len(original_indices) > 0 else 'N/A'}..." - ) - print(f"Checking for required columns: {required_cols}") - - for index, row in df_to_process.iterrows(): - missing_or_empty = [] - for col in required_cols: - if col not in row or pd.isna(row[col]) or str(row[col]).strip() == "": - missing_or_empty.append(col) - - if missing_or_empty: - print( - f"Warning: Skipping row original index {index} due to missing/empty required data in columns: {', '.join(missing_or_empty)}." - ) - skipped_rows_indices.append(index) - continue - - try: - user_message = USER_MESSAGE_TEMPLATE.format( - task=row["task"], - occupation_title=row["occupation_title"], - occupation_description=row["occupation_description"], - dwas=row["dwas"], - ) - except KeyError as e: - print( - f"Error: Skipping row original index {index} due to formatting error - missing key: {e}. Check USER_MESSAGE_TEMPLATE and CSV columns." 
- ) - skipped_rows_indices.append(index) - continue - - messages_for_row = [ - {"role": "system", "content": SYSTEM_PROMPT}, - {"role": "user", "content": user_message}, - ] - messages_list.append(messages_for_row) - valid_original_indices.append(index) # This is the original DataFrame index - - print( - f"Prepared {len(messages_list)} valid message sets for batch completion (skipped {len(skipped_rows_indices)} rows)." - ) - if not messages_list: - print("No valid rows found to process after checking required data. Exiting.") - exit() - else: - print( - "No rows found needing processing (df_to_process is empty)." - ) # Should have been caught by earlier check - exit() - - - # --- Call batch_completion in chunks with rate limiting and periodic saving --- - total_messages_to_send = len(messages_list) - num_chunks = math.ceil(total_messages_to_send / CHUNK_SIZE) - - print( - f"\nStarting batch completion for {total_messages_to_send} items in {num_chunks} chunks..." - ) - - overall_start_time = time.time() - processed_count_total = 0 - - for i in range(num_chunks): - chunk_start_message_index = i * CHUNK_SIZE - chunk_end_message_index = min((i + 1) * CHUNK_SIZE, total_messages_to_send) - message_chunk = messages_list[chunk_start_message_index:chunk_end_message_index] - # Get corresponding original DataFrame indices for this chunk - chunk_original_indices = valid_original_indices[ - chunk_start_message_index:chunk_end_message_index - ] - - if not message_chunk: - continue - - min_idx_disp = min(chunk_original_indices) if chunk_original_indices else "N/A" - max_idx_disp = max(chunk_original_indices) if chunk_original_indices else "N/A" - print( - f"\nProcessing chunk {i + 1}/{num_chunks} (Messages {chunk_start_message_index + 1}-{chunk_end_message_index} of this run)..." - f" Corresponding to original indices: {min_idx_disp} - {max_idx_disp}" - ) - chunk_start_time = time.time() - responses = [] - try: - print(f"Sending {len(message_chunk)} requests for chunk {i + 1}...") - responses = litellm.batch_completion( - model=MODEL, - messages=message_chunk, - response_format={ - "type": "json_schema", - "json_schema": SCHEMA_FOR_VALIDATION, - }, - num_retries=3, - # request_timeout=60 # Optional: uncomment if needed - ) - print(f"Chunk {i + 1} API call completed.") - - except Exception as e: - print(f"Error during litellm.batch_completion for chunk {i + 1}: {e}") - responses = [None] * len( - message_chunk - ) # Ensure responses list matches message_chunk length for processing loop - - # --- Process responses for the current chunk --- - chunk_updates = {} # To store {original_df_index: {qty/unit data}} - successful_in_chunk = 0 - failed_in_chunk = 0 - - if responses and len(responses) == len(message_chunk): - for j, response in enumerate(responses): - original_df_index = chunk_original_indices[j] - - # Initialize values for this item - lb_qty_val, lb_unit_val, ub_qty_val, ub_unit_val = None, None, None, None - content_str = None - - if response is None: - print( - f"Skipping processing for original index {original_df_index} due to API call failure for this item (response is None)." 
- ) - failed_in_chunk += 1 - continue - - try: - if ( - response.choices - and response.choices[0].message - and response.choices[0].message.content - ): - content_str = response.choices[0].message.content - estimate_data = json.loads(content_str) # Can raise JSONDecodeError - - lower_bound_dict = estimate_data.get("lower_bound_estimate") - upper_bound_dict = estimate_data.get("upper_bound_estimate") - - valid_response_structure = isinstance( - lower_bound_dict, dict - ) and isinstance(upper_bound_dict, dict) - - if valid_response_structure: - lb_qty_raw = lower_bound_dict.get("quantity") - lb_unit_raw = lower_bound_dict.get("unit") - ub_qty_raw = upper_bound_dict.get("quantity") - ub_unit_raw = upper_bound_dict.get("unit") - - is_valid_item = True - # Validate LB Qty - if ( - not isinstance(lb_qty_raw, (int, float)) - or math.isnan(float(lb_qty_raw)) - or float(lb_qty_raw) < 0 - ): - print( - f"Warning: Invalid lb_quantity for original index {original_df_index}: {lb_qty_raw}" - ) - is_valid_item = False - else: - lb_qty_val = float(lb_qty_raw) - - # Validate UB Qty - if ( - not isinstance(ub_qty_raw, (int, float)) - or math.isnan(float(ub_qty_raw)) - or float(ub_qty_raw) < 0 - ): - print( - f"Warning: Invalid ub_quantity for original index {original_df_index}: {ub_qty_raw}" - ) - is_valid_item = False - else: - ub_qty_val = float(ub_qty_raw) - - # Validate Units - if lb_unit_raw not in ALLOWED_UNITS: - print( - f"Warning: Invalid lb_unit for original index {original_df_index}: '{lb_unit_raw}'" - ) - is_valid_item = False - else: - lb_unit_val = lb_unit_raw - - if ub_unit_raw not in ALLOWED_UNITS: - print( - f"Warning: Invalid ub_unit for original index {original_df_index}: '{ub_unit_raw}'" - ) - is_valid_item = False - else: - ub_unit_val = ub_unit_raw - - if is_valid_item: - successful_in_chunk += 1 - chunk_updates[original_df_index] = { - "lb_estimate_qty": lb_qty_val, - "lb_estimate_unit": lb_unit_val, - "ub_estimate_qty": ub_qty_val, - "ub_estimate_unit": ub_unit_val, - } - else: - failed_in_chunk += ( - 1 # Values remain None if not fully valid - ) - else: - print( - f"Warning: Missing or malformed estimate dicts in JSON for original index {original_df_index}. Content: '{content_str}'" - ) - failed_in_chunk += 1 - else: - finish_reason = ( - response.choices[0].finish_reason - if (response.choices and response.choices[0].finish_reason) - else "unknown" - ) - error_message = ( - response.choices[0].message.content - if ( - response.choices - and response.choices[0].message - and response.choices[0].message.content - ) - else "No content in message." - ) - print( - f"Warning: Received non-standard or empty response content for original index {original_df_index}. " - f"Finish Reason: '{finish_reason}'. Message: '{error_message}'. Raw Choices: {response.choices}" - ) - failed_in_chunk += 1 - - except json.JSONDecodeError: - print( - f"Warning: Could not decode JSON for original index {original_df_index}. Content received: '{content_str}'" - ) - failed_in_chunk += 1 - except AttributeError as ae: - print( - f"Warning: Missing expected attribute processing response for original index {original_df_index}: {ae}. Response: {response}" - ) - failed_in_chunk += 1 - except Exception as e: - print( - f"Warning: An unexpected error occurred processing response for original index {original_df_index}: {type(e).__name__} - {e}. 
Response: {response}" - ) - failed_in_chunk += 1 - else: - print( - f"Warning: Mismatch between number of responses ({len(responses) if responses else 0}) " - f"and messages sent ({len(message_chunk)}) for chunk {i + 1}, or no responses. Marking all as failed." - ) - failed_in_chunk = len( - message_chunk - ) # All items in this chunk are considered failed if response array is problematic - - print( - f"Chunk {i + 1} processing summary: Success={successful_in_chunk}, Failed/Skipped={failed_in_chunk}" - ) - processed_count_total += successful_in_chunk - - # --- Update Main DataFrame and Save Periodically --- - if chunk_updates: - print( - f"Updating main DataFrame with {len(chunk_updates)} new estimates for chunk {i + 1}..." - ) - for idx, estimates in chunk_updates.items(): - if idx in df.index: - df.loc[idx, "lb_estimate_qty"] = estimates["lb_estimate_qty"] - df.loc[idx, "lb_estimate_unit"] = estimates["lb_estimate_unit"] - df.loc[idx, "ub_estimate_qty"] = estimates["ub_estimate_qty"] - df.loc[idx, "ub_estimate_unit"] = estimates["ub_estimate_unit"] - - print(f"Saving progress to {FILENAME}...") - save_dataframe(df, FILENAME) - else: - print(f"No successful estimates obtained in chunk {i + 1} to save.") - - # --- Rate Limiting Pause --- - chunk_end_time = time.time() - chunk_duration = chunk_end_time - chunk_start_time - print(f"Chunk {i + 1} took {chunk_duration:.2f} seconds.") - - if i < num_chunks - 1: # No pause after the last chunk - # Calculate ideal time per request based on rate limit - time_per_request = SECONDS_PER_MINUTE / RATE_LIMIT if RATE_LIMIT > 0 else 0 - # Calculate minimum duration this chunk should have taken to respect rate limit - min_chunk_duration_for_rate = len(message_chunk) * time_per_request - # Calculate pause needed - pause_needed = max(0, min_chunk_duration_for_rate - chunk_duration) - - if pause_needed > 0: - print( - f"Pausing for {pause_needed:.2f} seconds to respect rate limit ({RATE_LIMIT}/min)..." - ) - time.sleep(pause_needed) - - overall_end_time = time.time() - total_duration_minutes = (overall_end_time - overall_start_time) / 60 - print( - f"\nBatch completion finished." - f" Processed {processed_count_total} new estimates in this run in {total_duration_minutes:.2f} minutes." 
- ) - -    print(f"Performing final save to {FILENAME}...") -    save_dataframe(df, FILENAME) - -    print("\nScript finished.") diff --git a/old/analysis.py b/old/analysis.py deleted file mode 100644 index 18cefb3..0000000 --- a/old/analysis.py +++ /dev/null @@ -1,528 +0,0 @@ -import os -import litellm -import sqlite3 -import numpy as np -import pandas as pd -from google.colab import userdata, files -import seaborn as sns -import matplotlib.pyplot as plt -import matplotlib as mpl - -os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY') -os.environ['GEMINI_API_KEY'] = userdata.get('GEMINI_API_KEY') - -occupation_major_codes = { - '11': 'Management', - '13': 'Business and Financial Operations', - '15': 'Computer and Mathematical Occupations', - '17': 'Architecture and Engineering', - '19': 'Life, Physical, and Social Science', - '21': 'Community and Social Services', - '23': 'Legal', - '25': 'Education, Training, and Library', - '27': 'Arts, Design, Entertainment, Sports, and Media', - '29': 'Healthcare Practitioners and Technical', - '31': 'Healthcare Support', - '33': 'Protective Service', - '35': 'Food Preparation and Serving Related', - '37': 'Building and Grounds Cleaning and Maintenance', - '39': 'Personal Care and Service', - '41': 'Sales and Related', - '43': 'Office and Administrative Support', - '45': 'Farming, Fishing, and Forestry', - '47': 'Construction and Extraction', - '49': 'Installation, Maintenance, and Repair', - '51': 'Production', - '53': 'Transportation and Material Moving', - '55': 'Military Specific' -} - -gray = {'50':'#f8fafc','100':'#f1f5f9','200':'#e2e8f0', - '300':'#cbd5e1','400':'#94a3b8','500':'#64748b', - '600':'#475569','700':'#334155','800':'#1e293b', - '900':'#0f172a','950':'#020617'} -lime = {'50': '#f7fee7','100': '#ecfcca','200': '#d8f999', - '300': '#bbf451','400': '#9ae600','500': '#83cd00', - '600': '#64a400','700': '#497d00','800': '#3c6300', - '900': '#35530e','950': '#192e03'} - -mpl.rcParams.update({ - 'figure.facecolor' : gray['50'], - 'axes.facecolor' : gray['50'], - 'axes.edgecolor' : gray['100'], - 'axes.labelcolor' : gray['700'], - 'xtick.color' : gray['700'], - 'ytick.color' : gray['700'], - 'font.family' : 'Inter', # falls back to DejaVu if Inter not present - 'font.size' : 11, -}) - -sns.set_style("white") # keep minimal axes, we will remove default grid -sns.set_context("notebook") - -def prepare_tasks(): - # This dataset comes from https://epoch.ai/gradient-updates/consequences-of-automating-remote-work - # It contains labels for whether an O*NET task can be done remotely or not (labeled by GPT-4o) - # You can download it here: https://drive.google.com/file/d/1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r/view?usp=sharing - df_remote_status = pd.read_csv("epoch_task_data.csv") - - # BLS OEWS: https://www.bls.gov/oes/special-requests/oesm23nat.zip - df_oesm = pd.read_excel("oesm23national.xlsx") - - # Run uv run ./enrich_task_ratings.py - df_tasks = pd.read_json("task_ratings_enriched.json") - - # Run uv run classify_estimateability_of_tasks.py - df_task_estimateable = pd.read_csv("tasks_estimateable.csv").rename(columns={"task_estimateable": "estimateable"}).drop_duplicates(subset=['task'], keep='first') - - # df_tasks now has a remote_status column which contains either "remote" or "not remote" - df_tasks = pd.merge(df_tasks, df_remote_status[['Task', 'Remote']], left_on='task', right_on='Task', how='left') - df_tasks = df_tasks.drop('Task', axis=1).rename(columns={'Remote': 'remote_status'}) - - # df_tasks now has an estimateable column which contains 
either "ATOMIC" or "ONGOING-CONSTRAINT" - df_tasks = pd.merge(df_tasks, df_task_estimateable[['task', 'estimateable']], on='task', how='left') - - df_tasks = df_tasks[df_tasks['importance_average'] < 3].copy() - - df_tasks['onetsoc_major'] = df_tasks['onetsoc_code'].str[:2] - - df_remote_tasks = df_tasks[df_tasks['remote_status'] == 'remote'].copy() - - # Call create_task_estimates() from add_task_estimates? which creates tasks_with_estimates.csv - -def preprocessing_time_estimates(): - df = pd.read_csv("tasks_with_estimates.csv") - - df = df[df['importance_average'] > 3].copy() - - # The embeddings comes from running `uv run ./embed_task_description.py` - # Columns: ['embedding_id', 'task', 'embedding_vector'] - # These contain embedding for UNIQUE tasks - df_task_embeddings = pd.read_parquet("tasks_with_embeddings.parquet").drop_duplicates(subset=['task'])[['task', 'task_embedding']].rename(columns={"task_embedding": "embedding_vector"}).copy() - - df = pd.merge(df, df_task_embeddings[['task', 'embedding_vector']], on='task', how='left') - df = pd.merge(df, df_task_estimateable[['task', 'estimateable']], on='task', how='left') - - df['onetsoc_major'] = df['onetsoc_code'].str[:2] - - def convert_to_minutes(qty, unit): - """Converts a quantity in a given unit to minutes.""" - return qty * { - "minute": 1, - "hour": 60, - "day": 60 * 24, - "week": 60 * 24 * 7, - "month": 60 * 24 * 30, - "trimester": 60 * 24 * 90, - "semester": 60 * 24 * 180, - "year": 60 * 24 * 365, - }[unit] - - df['lb_estimate_in_minutes'] = df.apply( - lambda row: convert_to_minutes(row['lb_estimate_qty'], row['lb_estimate_unit']), axis=1 - ) - df['ub_estimate_in_minutes'] = df.apply( - lambda row: convert_to_minutes(row['ub_estimate_qty'], row['ub_estimate_unit']), axis=1 - ) - - df['estimate_range'] = df.ub_estimate_in_minutes - df.lb_estimate_in_minutes - df['estimate_ratio'] = df.ub_estimate_in_minutes / df.lb_estimate_in_minutes - df['estimate_midpoint'] = (df.lb_estimate_in_minutes + df.ub_estimate_in_minutes)/2 - - atomic_tasks = df[df['estimateable'] == 'ATOMIC'] - ongoing_tasks = df[df['estimateable'] == 'ONGOING-CONSTRAINT'] - - with pd.option_context('display.max_columns', None): - display(df) - - # Check for empty estimates - if atomic_tasks['lb_estimate_in_minutes'].isnull().sum() > 0: - print("Missing values in 'lb_estimate_in_minutes':", atomic_tasks['lb_estimate_in_minutes'].isnull().sum()) - - if atomic_tasks['ub_estimate_in_minutes'].isnull().sum() > 0: - print("Missing values in 'ub_estimate_in_minutes':", atomic_tasks['ub_estimate_in_minutes'].isnull().sum()) - - # Check for impossible bounds - impossible_bounds = atomic_tasks[ - (atomic_tasks['lb_estimate_in_minutes'] <= 0) | - (atomic_tasks['ub_estimate_in_minutes'] <= 0) | - (atomic_tasks['lb_estimate_in_minutes'] > atomic_tasks['ub_estimate_in_minutes']) - ] - if not impossible_bounds.empty: - print(f"Error: Found rows with impossible bounds.") - with pd.option_context('display.max_colwidth', None): - display(impossible_bounds[['task', 'lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'dwas']]) - - #with pd.option_context('display.max_colwidth', None): - #display(atomic_tasks.nlargest(20, 'ub_estimate_in_minutes')[['task', 'lb_estimate_qty', 'lb_estimate_unit', 'lb_estimate_in_minutes', 'ub_estimate_qty', 'ub_estimate_unit', 'ub_estimate_in_minutes', 'estimate_ratio']]) - -def cell1(): - sns.histplot(atomic_tasks.estimate_midpoint, log_scale=True) - -def cell2(): - plt.figure(figsize=(14,10)) - sns.boxplot( - data=atomic_tasks, - 
x='onetsoc_major', # 11 = Management, 15 = Computer/Math, … - y='estimate_range', - showfliers=False - ) - plt.yscale('log') # long tail => log scale - plt.xlabel('Occupation') - plt.ylabel('Range (upper-lower, minutes)') - plt.title('Spread of time-range estimates per occupation') - - ax = plt.gca() - ax.set_xticklabels([occupation_major_codes[code.get_text()] for code in ax.get_xticklabels()], rotation=60, ha='right') - -def cell3(): - plt.figure(figsize=(10, 10)) - ax = sns.scatterplot( - data=atomic_tasks.replace({'onetsoc_major': occupation_major_codes}), # Replace codes with labels - x='lb_estimate_in_minutes', y='ub_estimate_in_minutes', - alpha=0.2, edgecolor=None, hue="onetsoc_major" # Use the labeled column for hue - ) - - # 45° reference - lims = (1, atomic_tasks[['lb_estimate_in_minutes','ub_estimate_in_minutes']].max().max()) - ax.plot(lims, lims, color='black', linestyle='--', linewidth=1) - - # optional helper lines: 2× and 10×, 100× ratios - for k in [2,10, 100]: - ax.plot(lims, [k*l for l in lims], - linestyle=':', color='grey', linewidth=1) - - ax.set(xscale='log', yscale='log') - ax.set_xlabel('Lower-bound (min, log scale)') - ax.set_ylabel('Upper-bound (min, log scale)') - ax.set_title('Lower vs upper estimates for all tasks') - - # Place the legend outside the plot - ax.legend(bbox_to_anchor=(1, 1), loc='upper left') - -def cell4(): - plt.figure(figsize=(8,4)) - sns.histplot(np.log10(atomic_tasks['estimate_ratio'].replace([np.inf, -np.inf], np.nan).dropna()), - bins=60, kde=True) - plt.axvline(np.log10(10), color='red', ls='--', lw=1, label='10×') - plt.axvline(np.log10(1.05), color='orange', ls='--', lw=1, label='1.05×') - plt.axvline(0, color='black', ls='-', lw=1) # ub = lb - plt.xlabel('log₁₀(upper / lower)') - plt.ylabel('Count') - plt.title('Distribution of upper:lower ratio') - plt.legend() - plt.tight_layout() - - -def cell5(): - # 1. Bin lower bounds into quartiles (Q1–Q4) - atomic_tasks['lb_q'] = pd.qcut(atomic_tasks.lb_estimate_in_minutes, - q=4, labels=['Q1 shortest','Q2','Q3','Q4 longest']) - - - # 3. Aggregate: median (or mean) ratio per cell - pivot = atomic_tasks.pivot_table(index='onetsoc_major', columns='lb_q', - values='estimate_ratio', aggfunc='median') - - # Map the index (onetsoc_major codes) to their corresponding labels - pivot.index = pivot.index.map(occupation_major_codes) - - - # 4. 
Visualise - plt.figure(figsize=(10,8)) - sns.heatmap(pivot, cmap='RdYlGn_r', center=2, annot=True, fmt='.1f', - cbar_kws={'label':'Median upper/lower ratio'}) - plt.xlabel('Lower-bound quartile') - plt.ylabel('Occupation (major group)') - plt.title('Typical range width by occupation and task length') - plt.tight_layout() - - - -def cell6(): - """ - from scipy.stats import median_abs_deviation - - def mad_z(series): - med = series.median() - mad = median_abs_deviation(series, scale='normal') # ⇒ comparable to σ - return (series - med) / mad - - df['robust_z'] = df.groupby('onetsoc_code')['estimate_midpoint'].transform(mad_z) - """ - - agg = (atomic_tasks - .groupby('onetsoc_code')['estimate_midpoint'] - .agg(median='median', - q1=lambda x: x.quantile(.25), - q3=lambda x: x.quantile(.75), - mean='mean', - std='std') - .reset_index()) - agg['IQR'] = agg.q3 - agg.q1 - agg['CV'] = agg['std'] / agg['mean'] # coefficient of variation - - # merge back the group mean and std so each row can be scored - atomic_tasks = atomic_tasks.merge(agg[['onetsoc_code','mean','std']], on='onetsoc_code') - - - atomic_tasks['z'] = (atomic_tasks.estimate_midpoint - atomic_tasks['mean']) / atomic_tasks['std'] - outliers = atomic_tasks.loc[atomic_tasks.z.abs() > 3] - outliers - -def cell7(): - from scipy.stats import median_abs_deviation - - def mad_z(series): - med = series.median() - mad = median_abs_deviation(series, scale='normal') # ⇒ comparable to σ - return (series - med) / mad - - atomic_tasks['robust_z'] = atomic_tasks.groupby('onetsoc_code')['estimate_midpoint'].transform(mad_z) - -def cell10(): - import matplotlib.ticker as mtick # For percentage formatting - import matplotlib.colors as mcolors # For color conversion - - summary_data = [] - - for code, label in occupation_major_codes.items(): - occ_df = df_tasks[df_tasks['onetsoc_major'] == code] - total_tasks_in_occ = len(occ_df) - - if total_tasks_in_occ == 0: - continue # Skip if no tasks for this occupation - - # Stack 1: % that isn't equal to "remote" - not_remote_count = len(occ_df[occ_df['remote_status'] != 'remote']) - - # For the remaining remote tasks: - remote_df = occ_df[occ_df['remote_status'] == 'remote'] - - # Stack 2: % of remote + ATOMIC - remote_atomic_count = len(remote_df[remote_df['estimateable'] == 'ATOMIC']) - - # Stack 3: % of remote + ONGOING-CONSTRAINT - remote_ongoing_count = len(remote_df[remote_df['estimateable'] == 'ONGOING-CONSTRAINT']) - - summary_data.append({ - 'onetsoc_major_code': code, - 'occupation_label': label, - 'count_not_remote': not_remote_count, - 'count_remote_atomic': remote_atomic_count, - 'count_remote_ongoing': remote_ongoing_count, - 'total_tasks': total_tasks_in_occ - }) - - summary_df = pd.DataFrame(summary_data) - - # --- 3. 
Calculate Percentages --- - # Ensure total_tasks is not zero to avoid division by zero errors if an occupation had no tasks - summary_df = summary_df[summary_df['total_tasks'] > 0].copy() # Use .copy() to avoid SettingWithCopyWarning - - summary_df['pct_not_remote'] = (summary_df['count_not_remote'] / summary_df['total_tasks']) * 100 - summary_df['pct_remote_atomic'] = (summary_df['count_remote_atomic'] / summary_df['total_tasks']) * 100 - summary_df['pct_remote_ongoing'] = (summary_df['count_remote_ongoing'] / summary_df['total_tasks']) * 100 - - # Select columns for plotting and set index to occupation label - plot_df = summary_df.set_index('occupation_label')[ - ['pct_not_remote', 'pct_remote_atomic', 'pct_remote_ongoing'] - ] - - # Rename columns for a clearer legend - plot_df.columns = ['Not Remote', 'Remote + Estimable', 'Remote + Not estimable'] - - plot_df = plot_df.sort_values(by='Not Remote', ascending=False) - - - # --- 4. Plotting (Modified) --- - - # Define the custom colors based on your requirements - # The order must match the column order in plot_df: - # 1. 'Not Remote' - # 2. 'Remote & ATOMIC' - # 3. 'Remote & ONGOING-CONSTRAINT' - bar_colors = [gray["300"], lime["500"], lime["200"]] - - fig, ax = plt.subplots(figsize=(14, 10)) # Adjusted figsize for better readability - - plot_df.plot(kind='barh', stacked=True, ax=ax, color=bar_colors) - - ax.set_xlabel("Percentage of Tasks (%)", fontsize=12) - ax.set_ylabel("Occupation Major Group", fontsize=12) - ax.set_title("Task Breakdown by Occupation, Remote Status, and Estimateability", fontsize=14, pad=20) - - # Format x-axis as percentages - ax.xaxis.set_major_formatter(mtick.PercentFormatter()) - plt.xlim(0, 100) # Ensure x-axis goes from 0 to 100% - - # Remove right and top spines - ax.spines['right'].set_visible(False) - ax.spines['top'].set_visible(False) - - # Function to get contrasting text color - def get_contrasting_text_color(bg_color_hex_or_rgba): - """ - Determines if black or white text provides better contrast against a given background color. - bg_color_hex_or_rgba: A hex string (e.g., '#RRGGBB') or an RGBA tuple (values in [0, 1]). - Returns: 'black' or 'white'. - """ - # Convert to RGBA if it's a hex string or name - if isinstance(bg_color_hex_or_rgba, str): - rgba = mcolors.to_rgba(bg_color_hex_or_rgba) - else: - rgba = bg_color_hex_or_rgba - - r, g, b, _ = rgba # Ignore alpha for luminance calculation - # Calculate luminance (standard formula for sRGB) - # Values r, g, b should be in [0, 1] for this formula - luminance = 0.2126 * r + 0.7152 * g + 0.0722 * b - # Threshold for deciding text color - return 'black' if luminance > 0.55 else 'white' # Adjusted threshold slightly for better visual - - # Add percentages inside each bar segment - # Iterate through each "category" of bars (Not Remote, Remote & ATOMIC, etc.) 
- for i, container in enumerate(ax.containers): - # Get the color for this container/category - segment_color = bar_colors[i] - text_color = get_contrasting_text_color(segment_color) - - for patch in container.patches: # Iterate through each bar segment in the category - width = patch.get_width() - if width > 3: # Only add text if segment is wide enough (e.g., >3%) - x = patch.get_x() + width / 2 - y = patch.get_y() + patch.get_height() / 2 - ax.text(x, y, - f"{width:.1f}%", - ha='center', - va='center', - fontsize=8, # Adjust font size as needed - color=text_color, - fontweight='medium') # Bolder text can help - - - plt.legend(title="Task Category", bbox_to_anchor=(1.02, 1), loc='upper left', frameon=False) - -def cell11(): - df_oesm['onetsoc_major'] = df_oesm['OCC_CODE'].str[:2] - - # Calculate wage bill per occupation - # Wage bill = Total Employment * Annual Mean Wage - # Ensure columns are numeric, converting non-numeric values to NaN first - df_oesm['TOT_EMP'] = pd.to_numeric(df_oesm['TOT_EMP'], errors='coerce') - df_oesm['A_MEAN'] = pd.to_numeric(df_oesm['A_MEAN'], errors='coerce') - - # Drop rows with NaN in necessary columns after coercion - df_oesm.dropna(subset=['TOT_EMP', 'A_MEAN', 'onetsoc_major'], inplace=True) - - df_oesm['wage_bill'] = df_oesm['TOT_EMP'] * df_oesm['A_MEAN'] - - # Aggregate wage bill by onetsoc_major - df_wage_bill_major = df_oesm.groupby('onetsoc_major')['wage_bill'].sum().reset_index() - - # Map major codes to titles for better plotting - df_wage_bill_major['OCC_TITLE_MAJOR'] = df_wage_bill_major['onetsoc_major'].map(occupation_major_codes) - - # Sort by wage bill for better visualization - df_wage_bill_major = df_wage_bill_major.sort_values('wage_bill', ascending=False) - - # Plotting - plt.figure(figsize=(12, 8)) - sns.barplot(x='wage_bill', y='OCC_TITLE_MAJOR', data=df_wage_bill_major, palette="viridis") - plt.title('Total Wage Bill per Major Occupation Group') - plt.xlabel('Total Wage Bill (in billions)') - plt.ylabel('Major Occupation Group') - plt.grid(axis='x', linestyle='--', alpha=0.7) - -def cell12(): - # ─────────────────────────────────────────────────────────────── - # 1. CUMULATIVE-DISTRIBUTION-FUNCTION (CDF) PREP - # ─────────────────────────────────────────────────────────────── - def cdf(series): - s = series.sort_values().reset_index(drop=True) - return s.values, ((s.index + 1) / len(s)) * 100 - - x_lb , y_lb = cdf(atomic_tasks['lb_estimate_in_minutes']) - x_ub , y_ub = cdf(atomic_tasks['ub_estimate_in_minutes']) - x_mid, y_mid = cdf((atomic_tasks['ub_estimate_in_minutes'] + atomic_tasks['lb_estimate_in_minutes']) / 2) - - # ─────────────────────────────────────────────────────────────── - # 2. 
PLOTTING - # ─────────────────────────────────────────────────────────────── - fig, ax = plt.subplots(figsize=(10, 6)) - - # horizontal reference lines every 10 % - for y_val in range(0, 101, 10): - ax.axhline(y_val, color=gray['100'], linewidth=.8, zorder=1) - - # Plot Lower Bound CDF - ax.step(x_lb, y_lb, - where='post', - color=lime['300'], # Example: light blue for lower bound - linewidth=1.8, - linestyle='--', - zorder=2, - label='Lower bound estimate (CDF)') - - # Plot Upper Bound CDF - ax.step(x_ub, y_ub, - where='post', - color=lime['900'], # Example: light orange/red for upper bound - linewidth=1.8, - linestyle=':', - zorder=3, - label='Upper bound estimate (CDF)') - - # Plot Midpoint CDF (plotted last to be on top, or adjust zorder) - ax.step(x_mid, y_mid, - where='post', - color=lime['600'], - linewidth=2.2, - zorder=4, # Ensure it's on top of other lines if they overlap significantly - label='Mid-point estimate (CDF)') - - - # axes limits / scales - ax.set_ylim(0, 100) - ax.set_xscale('log') - - # y-axis ➝ percent labels - ax.yaxis.set_major_formatter(mpl.ticker.PercentFormatter(decimals=0)) - - - # move y-label to top-left (just inside plotting area) - ax.text(-0.06, 1.03, - "% of tasks with temporal coherence ≤ X", - ha='left', va='bottom', - transform=ax.transAxes, - fontsize=12, fontweight='semibold') - - # custom x-ticks at human-friendly durations - ticks = [1, 5, 10, 30, 60, 120, 240, 480, - 1440, 2880, 10080, 43200, 129600, - 259200, 525600] - ticklabels = ['1 min', '5 min', '10 min', '30 min', '1 hour', '2 hours', '4 hours', '8 hours', - '1 day', '2 days', '1 week', '30 days', - '90 days', '180 days', '1 year'] - - # Vertical reference lines for x-ticks - for tick in ticks: - ax.axvline(tick, color=gray['300'], linewidth=.8, linestyle='--', zorder=1) - - ax.set_xticks(ticks) - ax.set_xticklabels(ticklabels, rotation=45, ha='right') - - ax.spines['top'].set_visible(False) - ax.spines['right'].set_visible(False) - ax.spines['left'].set_edgecolor(gray['300']) - ax.spines['bottom'].set_edgecolor(gray['300']) - - - # legend - ax.legend(frameon=False, loc='lower right') # Keep 'lower right' or adjust as needed - - ax.text(0.5, -0.3, - 'Temporal coherence (X)', - ha='center', va='center', - transform=ax.transAxes, - fontsize=12, fontweight='semibold') diff --git a/old/classify_estimateability_of_tasks.py b/old/classify_estimateability_of_tasks.py deleted file mode 100644 index ccf110b..0000000 --- a/old/classify_estimateability_of_tasks.py +++ /dev/null @@ -1,411 +0,0 @@ -import pandas as pd -import litellm -import dotenv -import os -import time -import json -import math - -# Load environment variables -dotenv.load_dotenv(override=True) - -# litellm._turn_on_debug() # Optional debugging - -# --- Configuration --- -MODEL = "gpt-4.1-mini" # Make sure this model supports json_schema or structured output -RATE_LIMIT = 5000 # Requests per minute -CHUNK_SIZE = 300 # Number of unique tasks per API call -SECONDS_PER_MINUTE = 60 - -# File configuration -CLASSIFICATION_FILENAME = "tasks_estimateable.csv" # Output file with classifications -TASK_SOURCE_FOR_INIT_FILENAME = "tasks_with_estimates.csv" -OUTPUT_COLUMN_NAME = "task_estimateable" -SOURCE_FILTER_COLUMN = "remote_status" -SOURCE_FILTER_VALUE = "remote" - -# --- Prompts and Schema --- -SYSTEM_PROMPT_CLASSIFY = """ -Classify the provided O*NET task into one of these categories: - - ATOMIC (schedulable): A single, clearly-bounded activity, typically lasting minutes, hours, or a few days. 
- - ONGOING-CONSTRAINT (background role/ethical rule): A continuous responsibility or behavioural norm with no schedulable duration (e.g., “follow confidentiality rules,” “serve as department head”). -""".strip() - -USER_MESSAGE_TEMPLATE_CLASSIFY = "Task: {task}" - -CLASSIFICATION_CATEGORIES = ["ATOMIC", "ONGOING-CONSTRAINT"] - -SCHEMA_FOR_CLASSIFICATION = { - "name": "classify_task_type", - "strict": True, - "schema": { - "type": "object", - "properties": { - "task_category": { - "type": "string", - "enum": CLASSIFICATION_CATEGORIES, - "description": "The classification of the task (ATOMIC or ONGOING-CONSTRAINT).", - } - }, - "required": ["task_category"], - "additionalProperties": False, - }, -} - - -def save_dataframe(df_to_save, filename): - """Saves the DataFrame to the specified CSV file using atomic write.""" - try: - temp_filename = filename + ".tmp" - df_to_save.to_csv(temp_filename, encoding="utf-8-sig", index=False) - os.replace(temp_filename, filename) - except Exception as e: - print(f"--- Error saving DataFrame to {filename}: {e} ---") - if os.path.exists(temp_filename): - try: - os.remove(temp_filename) - except Exception as remove_err: - print( - f"--- Error removing temporary save file {temp_filename}: {remove_err} ---" - ) - - -# --- Load or Initialize DataFrame --- -try: - if os.path.exists(CLASSIFICATION_FILENAME): - df = pd.read_csv(CLASSIFICATION_FILENAME, encoding="utf-8-sig") - print(f"Successfully read {len(df)} rows from {CLASSIFICATION_FILENAME}.") - - save_needed_after_load = False - if OUTPUT_COLUMN_NAME not in df.columns: - df[OUTPUT_COLUMN_NAME] = pd.NA - print(f"Added '{OUTPUT_COLUMN_NAME}' column.") - save_needed_after_load = True - - df[OUTPUT_COLUMN_NAME].replace(["", None, ""], pd.NA, inplace=True) - - if df[OUTPUT_COLUMN_NAME].dtype != object and not isinstance( - df[OUTPUT_COLUMN_NAME].dtype, pd.StringDtype - ): - try: - df[OUTPUT_COLUMN_NAME] = df[OUTPUT_COLUMN_NAME].astype(object) - print( - f"Corrected dtype of '{OUTPUT_COLUMN_NAME}' to {df[OUTPUT_COLUMN_NAME].dtype}." - ) - save_needed_after_load = True - except Exception as e: - print( - f"Warning: Could not convert column '{OUTPUT_COLUMN_NAME}' to object: {e}." - ) - - if "task" not in df.columns: - print( - f"Error: {CLASSIFICATION_FILENAME} must contain a 'task' column for processing." - ) - exit() - - if save_needed_after_load: - print(f"Saving {CLASSIFICATION_FILENAME} after adding/adjusting column.") - save_dataframe(df, CLASSIFICATION_FILENAME) - else: - print( - f"{CLASSIFICATION_FILENAME} not found. Attempting to create it from {TASK_SOURCE_FOR_INIT_FILENAME}." - ) - if not os.path.exists(TASK_SOURCE_FOR_INIT_FILENAME): - print( - f"Error: Source file {TASK_SOURCE_FOR_INIT_FILENAME} not found. Cannot create {CLASSIFICATION_FILENAME}." - ) - exit() - - df_source = pd.read_csv(TASK_SOURCE_FOR_INIT_FILENAME, encoding="utf-8-sig") - - required_source_cols_for_init = ["task", SOURCE_FILTER_COLUMN] - missing_source_cols = [ - col for col in required_source_cols_for_init if col not in df_source.columns - ] - if missing_source_cols: - print( - f"Error: Source file {TASK_SOURCE_FOR_INIT_FILENAME} is missing required columns for initialization: {', '.join(missing_source_cols)}." - ) - exit() - - df_source_filtered = df_source[ - df_source[SOURCE_FILTER_COLUMN] == SOURCE_FILTER_VALUE - ].copy() - - if df_source_filtered.empty: - print( - f"Warning: No tasks with '{SOURCE_FILTER_COLUMN}' == '{SOURCE_FILTER_VALUE}' found in {TASK_SOURCE_FOR_INIT_FILENAME}. 
" - f"{CLASSIFICATION_FILENAME} will be created with schema but no tasks to classify initially." - ) - - df = df_source_filtered[["task"]].copy() - df[OUTPUT_COLUMN_NAME] = pd.NA - df[OUTPUT_COLUMN_NAME] = df[OUTPUT_COLUMN_NAME].astype(object) - - print( - f"Created {CLASSIFICATION_FILENAME} using tasks from {TASK_SOURCE_FOR_INIT_FILENAME} " - f"(where {SOURCE_FILTER_COLUMN}='{SOURCE_FILTER_VALUE}'). New file has {len(df)} tasks." - ) - save_dataframe(df, CLASSIFICATION_FILENAME) - -except FileNotFoundError: - print(f"Error: A required file was not found. Please check paths.") - exit() -except Exception as e: - print(f"Error during DataFrame loading or initialization: {e}") - exit() - - -# --- Identify Unique Tasks to Process --- -if df.empty: - print(f"{CLASSIFICATION_FILENAME} is empty. Nothing to process. Exiting.") - exit() - -initial_unprocessed_mask = df[OUTPUT_COLUMN_NAME].isna() - -if not initial_unprocessed_mask.any(): - print( - f"All tasks in {CLASSIFICATION_FILENAME} seem to have been classified already. Exiting." - ) - exit() - -# Filter for rows that are unprocessed AND have a valid 'task' string -valid_tasks_to_consider_df = df[ - initial_unprocessed_mask & df["task"].notna() & (df["task"].str.strip() != "") -] - -if valid_tasks_to_consider_df.empty: - print( - f"No valid, unclassified tasks found to process (after filtering out empty/NaN task descriptions). Exiting." - ) - exit() - -unique_task_labels_for_api = ( - valid_tasks_to_consider_df["task"].drop_duplicates().tolist() -) -total_rows_to_update_potentially = len( - df[initial_unprocessed_mask] -) # Count all rows that are NA - -print( - f"Found {total_rows_to_update_potentially} total rows in {CLASSIFICATION_FILENAME} needing classification." -) -print( - f"Identified {len(unique_task_labels_for_api)} unique, valid task labels to send to the API." -) - - -# --- Prepare messages for batch completion (only for unique task labels) --- -messages_list = [] -print(f"Preparing messages for {len(unique_task_labels_for_api)} unique task labels...") - -for task_label in unique_task_labels_for_api: - # task_label is already guaranteed to be non-empty and not NaN from the filtering above - user_message = USER_MESSAGE_TEMPLATE_CLASSIFY.format(task=task_label) - messages_for_task = [ - {"role": "system", "content": SYSTEM_PROMPT_CLASSIFY}, - {"role": "user", "content": user_message}, - ] - messages_list.append(messages_for_task) - -print(f"Prepared {len(messages_list)} message sets for batch completion.") -if ( - not messages_list -): # Should only happen if unique_task_labels_for_api was empty, caught above - print( - "No messages prepared, though unique tasks were identified. This is unexpected. Exiting." - ) - exit() - - -# --- Call batch_completion in chunks with rate limiting and periodic saving --- -total_unique_tasks_to_send = len( - messages_list -) # Same as len(unique_task_labels_for_api) -num_chunks = math.ceil(total_unique_tasks_to_send / CHUNK_SIZE) - -print( - f"\nStarting batch classification for {total_unique_tasks_to_send} unique task labels in {num_chunks} chunks..." 
-) - -overall_start_time = time.time() -processed_rows_count_total = 0 # Counts actual rows updated in the DataFrame - -for i in range(num_chunks): - chunk_start_message_index = i * CHUNK_SIZE - chunk_end_message_index = min((i + 1) * CHUNK_SIZE, total_unique_tasks_to_send) - - message_chunk = messages_list[chunk_start_message_index:chunk_end_message_index] - # Get corresponding unique task labels for this chunk - chunk_task_labels = unique_task_labels_for_api[ - chunk_start_message_index:chunk_end_message_index - ] - - if not message_chunk: # Should not happen if loop range is correct - continue - - print( - f"\nProcessing chunk {i + 1}/{num_chunks} (Unique Task Labels {chunk_start_message_index + 1}-{chunk_end_message_index} of this run)..." - ) - chunk_start_time = time.time() - responses = [] - try: - print( - f"Sending {len(message_chunk)} requests (for unique tasks) for chunk {i + 1}..." - ) - responses = litellm.batch_completion( - model=MODEL, - messages=message_chunk, - response_format={ - "type": "json_schema", - "json_schema": SCHEMA_FOR_CLASSIFICATION, - }, - num_retries=3, - ) - print(f"Chunk {i + 1} API call completed.") - - except Exception as e: - print(f"Error during litellm.batch_completion for chunk {i + 1}: {e}") - responses = [None] * len(message_chunk) - - # --- Process responses for the current chunk --- - # chunk_updates stores {task_label: classification_category} - chunk_task_classifications = {} - successful_api_calls_in_chunk = 0 - failed_api_calls_in_chunk = 0 - - if responses and len(responses) == len(message_chunk): - for j, response in enumerate(responses): - current_task_label = chunk_task_labels[ - j - ] # The unique task label for this response - content_str = None - - if response is None: - print( - f"API call failed for task label '{current_task_label}' (response is None)." - ) - failed_api_calls_in_chunk += 1 - continue - - try: - if ( - response.choices - and response.choices[0].message - and response.choices[0].message.content - ): - content_str = response.choices[0].message.content - classification_data = json.loads(content_str) - category_raw = classification_data.get("task_category") - - if category_raw in CLASSIFICATION_CATEGORIES: - successful_api_calls_in_chunk += 1 - chunk_task_classifications[current_task_label] = category_raw - else: - print( - f"Warning: Invalid or missing task_category for task label '{current_task_label}': '{category_raw}'. Content: '{content_str}'" - ) - failed_api_calls_in_chunk += 1 - else: - finish_reason = ( - response.choices[0].finish_reason - if (response.choices and response.choices[0].finish_reason) - else "unknown" - ) - error_message = ( - response.choices[0].message.content - if (response.choices and response.choices[0].message) - else "No content in message." - ) - print( - f"Warning: Received non-standard or empty response content for task label '{current_task_label}'. " - f"Finish Reason: '{finish_reason}'. Message: '{error_message}'. Raw Choices: {response.choices}" - ) - failed_api_calls_in_chunk += 1 - - except json.JSONDecodeError: - print( - f"Warning: Could not decode JSON for task label '{current_task_label}'. Content received: '{content_str}'" - ) - failed_api_calls_in_chunk += 1 - except AttributeError as ae: - print( - f"Warning: Missing attribute processing response for task label '{current_task_label}': {ae}. 
Response: {response}" - ) - failed_api_calls_in_chunk += 1 - except Exception as e: - print( - f"Warning: Unexpected error processing response for task label '{current_task_label}': {type(e).__name__} - {e}. Response: {response}" - ) - failed_api_calls_in_chunk += 1 - else: - print( - f"Warning: Mismatch between #responses ({len(responses) if responses else 0}) " - f"and #messages sent ({len(message_chunk)}) for chunk {i + 1}, or no responses. Marking all API calls in chunk as failed." - ) - failed_api_calls_in_chunk = len(message_chunk) - - # --- Update Main DataFrame and Save Periodically --- - rows_updated_this_chunk = 0 - if chunk_task_classifications: - print( - f"Updating main DataFrame with classifications for {len(chunk_task_classifications)} unique tasks from chunk {i + 1}..." - ) - for task_label, category in chunk_task_classifications.items(): - # Update all rows in the main df that match this task_label AND are still NA in the output column - update_condition = (df["task"] == task_label) & ( - df[OUTPUT_COLUMN_NAME].isna() - ) - num_rows_for_this_task_label = df[update_condition].shape[0] - - if num_rows_for_this_task_label > 0: - df.loc[update_condition, OUTPUT_COLUMN_NAME] = category - rows_updated_this_chunk += num_rows_for_this_task_label - - print( - f"Updated {rows_updated_this_chunk} rows in the DataFrame based on this chunk's API responses." - ) - print(f"Saving progress to {CLASSIFICATION_FILENAME}...") - save_dataframe(df, CLASSIFICATION_FILENAME) - else: - print( - f"No successful API classifications obtained in chunk {i + 1} to update DataFrame or save." - ) - - print( - f"Chunk {i + 1} API summary: Successful Calls={successful_api_calls_in_chunk}, Failed/Skipped Calls={failed_api_calls_in_chunk}. " - f"Rows updated in DataFrame this chunk: {rows_updated_this_chunk}" - ) - processed_rows_count_total += rows_updated_this_chunk - - # --- Rate Limiting Pause --- - chunk_end_time = time.time() - chunk_duration = chunk_end_time - chunk_start_time - print(f"Chunk {i + 1} (API calls and DF update) took {chunk_duration:.2f} seconds.") - - if i < num_chunks - 1: - time_per_request = SECONDS_PER_MINUTE / RATE_LIMIT if RATE_LIMIT > 0 else 0 - min_chunk_duration_for_rate = ( - len(message_chunk) * time_per_request - ) # Based on API calls made - pause_needed = max(0, min_chunk_duration_for_rate - chunk_duration) - - if pause_needed > 0: - print( - f"Pausing for {pause_needed:.2f} seconds to respect rate limit ({RATE_LIMIT}/min)..." - ) - time.sleep(pause_needed) - -overall_end_time = time.time() -total_duration_minutes = (overall_end_time - overall_start_time) / 60 -print( - f"\nBatch classification finished." - f" Updated {processed_rows_count_total} rows in '{CLASSIFICATION_FILENAME}' with new classifications in this run." - f" Total duration: {total_duration_minutes:.2f} minutes." -) - -print(f"Performing final save to {CLASSIFICATION_FILENAME}...") -save_dataframe(df, CLASSIFICATION_FILENAME) - -print("\nScript finished.") diff --git a/old/create_onet_database.sh b/old/create_onet_database.sh deleted file mode 100755 index ca5ac09..0000000 --- a/old/create_onet_database.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/env bash - -# Set database name and directories -ONET_DB_NAME="onet.database" -ONET_ZIP_URL="https://www.onetcenter.org/dl_files/database/db_29_1_mysql.zip" -ONET_ZIP_FILE="db_29_1_mysql.zip" -ONET_EXTRACT_DIR="db_29_1_mysql" - -# Download O*NET database only if not already downloaded -if [ ! 
-f "$ONET_ZIP_FILE" ]; then - echo "Downloading O*NET database from $ONET_ZIP_URL" - curl -L -o "$ONET_ZIP_FILE" "$ONET_ZIP_URL" || wget -O "$ONET_ZIP_FILE" "$ONET_ZIP_URL" - - if [ $? -ne 0 ]; then - echo "Failed to download O*NET database" - exit 1 - fi -else - echo "Using existing O*NET database zip file" -fi - -# Extract downloaded zip file only if extraction directory doesn't exist -if [ ! -d "$ONET_EXTRACT_DIR" ]; then - echo "Extracting O*NET database files" - unzip -o "$ONET_ZIP_FILE" - - if [ $? -ne 0 ]; then - echo "Failed to extract O*NET database files" - exit 1 - fi -else - echo "Using existing extracted O*NET database files" -fi - -# Remove existing database if it exists -if [ -f "$ONET_DB_NAME" ]; then - echo "Removing existing database" - rm "$ONET_DB_NAME" -fi - -# Create a new SQLite database with optimized settings for fast import -echo "Creating new SQLite database: $ONET_DB_NAME with performance settings" -sqlite3 "$ONET_DB_NAME" << EOF -PRAGMA journal_mode = OFF; -PRAGMA synchronous = 0; -PRAGMA cache_size = 1000000; -PRAGMA locking_mode = EXCLUSIVE; -PRAGMA temp_store = MEMORY; -PRAGMA foreign_keys = ON; -EOF - -# Combine and execute all SQL files in one transaction -echo "Executing SQL files in alphabetical order (single transaction mode)" -sqlite3 "$ONET_DB_NAME" << EOF -BEGIN TRANSACTION; -$(find "$ONET_EXTRACT_DIR" -name "*.sql" | sort | xargs cat) -COMMIT; -EOF - -# Check if the execution was successful -if [ $? -ne 0 ]; then - echo "Error executing SQL files in batch transaction" - exit 1 -else - echo "Database populated successfully. Restoring reliability settings..." - - # Restore reliability-focused settings after import - sqlite3 "$ONET_DB_NAME" << EOF -PRAGMA journal_mode = WAL; -PRAGMA synchronous = NORMAL; -PRAGMA locking_mode = NORMAL; -PRAGMA temp_store = DEFAULT; -PRAGMA foreign_keys = ON; -PRAGMA optimize; -VACUUM; -EOF - - if [ $? -ne 0 ]; then - echo "Warning: Failed to restore reliability settings, but database is populated" - else - echo "Reliability settings restored successfully" - fi - - echo "O*NET database created and optimized successfully!" -fi diff --git a/old/enrich_task_ratings.py b/old/enrich_task_ratings.py deleted file mode 100644 index 70ae0bf..0000000 --- a/old/enrich_task_ratings.py +++ /dev/null @@ -1,392 +0,0 @@ -import sqlite3 -import pandas as pd -import json -import os -from collections import defaultdict -import numpy as np - -# --- Configuration --- -DB_FILE = "onet.database" -OUTPUT_FILE = "task_ratings_enriched.json" # Changed output filename - -# --- Database Interaction --- - - -def fetch_data_from_db(db_path): - """ - Fetches required data from the O*NET SQLite database using JOINs, - including DWAs. - - Args: - db_path (str): Path to the SQLite database file. - - Returns: - tuple(pandas.DataFrame, pandas.DataFrame): A tuple containing: - - DataFrame with task ratings info. - - DataFrame with task-to-DWA mapping. - Returns (None, None) if the database file doesn't exist or an error occurs. 
- """ - if not os.path.exists(db_path): - print(f"Error: Database file not found at {db_path}") - return None, None - - try: - conn = sqlite3.connect(db_path) - # Construct the SQL query to join the tables and select necessary columns - # Added LEFT JOINs for tasks_to_dwas and dwa_reference - # Use LEFT JOIN in case a task has no DWAs - query = """ - SELECT - tr.onetsoc_code, - tr.task_id, - ts.task, - od.title AS occupation_title, - od.description AS occupation_description, - tr.scale_id, - tr.category, - tr.data_value, - dr.dwa_title -- Added DWA title - FROM - task_ratings tr - JOIN - task_statements ts ON tr.task_id = ts.task_id - JOIN - occupation_data od ON tr.onetsoc_code = od.onetsoc_code - LEFT JOIN - tasks_to_dwas td ON tr.onetsoc_code = td.onetsoc_code AND tr.task_id = td.task_id -- - LEFT JOIN - dwa_reference dr ON td.dwa_id = dr.dwa_id; -- - """ - df = pd.read_sql_query(query, conn) - conn.close() - print( - f"Successfully fetched {len(df)} records (including DWA info) from the database." - ) - - if df.empty: - print("Warning: Fetched DataFrame is empty.") - # Return empty DataFrames with expected columns if the main fetch is empty - ratings_cols = [ - "onetsoc_code", - "task_id", - "task", - "occupation_title", - "occupation_description", - "scale_id", - "category", - "data_value", - ] - dwa_cols = ["onetsoc_code", "task_id", "dwa_title"] - return pd.DataFrame(columns=ratings_cols), pd.DataFrame(columns=dwa_cols) - - # Remove duplicates caused by joining ratings with potentially multiple DWAs per task - # Keep only unique combinations of the core task/rating info before processing - core_cols = [ - "onetsoc_code", - "task_id", - "task", - "occupation_title", - "occupation_description", - "scale_id", - "category", - "data_value", - ] - # Check if all core columns exist before attempting to drop duplicates - missing_core_cols = [col for col in core_cols if col not in df.columns] - if missing_core_cols: - print(f"Error: Missing core columns in fetched data: {missing_core_cols}") - return None, None - ratings_df = df[core_cols].drop_duplicates().reset_index(drop=True) - - # Get unique DWA info separately - dwa_cols = ["onetsoc_code", "task_id", "dwa_title"] - # Check if all DWA columns exist before processing - if all(col in df.columns for col in dwa_cols): - dwas_df = ( - df[dwa_cols] - .dropna(subset=["dwa_title"]) - .drop_duplicates() - .reset_index(drop=True) - ) - else: - print("Warning: DWA related columns missing, creating empty DWA DataFrame.") - dwas_df = pd.DataFrame( - columns=dwa_cols - ) # Create empty df if columns missing - - return ratings_df, dwas_df # Return two dataframes now - - except sqlite3.Error as e: - print(f"SQLite error: {e}") - if "conn" in locals() and conn: - conn.close() - return None, None # Return None for both if error - except Exception as e: - print(f"An error occurred during data fetching: {e}") - if "conn" in locals() and conn: - conn.close() - return None, None # Return None for both if error - - -# --- Data Processing --- - - -def process_task_ratings_with_dwas(ratings_df, dwas_df): - """ - Processes the fetched data to group, pivot frequency, calculate averages, - structure the output, and add associated DWAs. - - Args: - ratings_df (pandas.DataFrame): The input DataFrame with task ratings info. - dwas_df (pandas.DataFrame): The input DataFrame with task-to-DWA mapping. Can be None or empty. - - Returns: - list: A list of dictionaries, each representing an enriched task rating with DWAs. 
- Returns None if the input ratings DataFrame is invalid. - """ - if ratings_df is None or not isinstance( - ratings_df, pd.DataFrame - ): # Check if it's a DataFrame - print("Error: Input ratings DataFrame is invalid.") - return None - if ratings_df.empty: - print( - "Warning: Input ratings DataFrame is empty. Processing will yield empty result." - ) - # Decide how to handle empty input, maybe return empty list directly - # return [] - - # Ensure dwas_df is a DataFrame, even if empty - if dwas_df is None or not isinstance(dwas_df, pd.DataFrame): - print("Warning: Invalid or missing DWA DataFrame. Proceeding without DWA data.") - dwas_df = pd.DataFrame( - columns=["onetsoc_code", "task_id", "dwa_title"] - ) # Ensure it's an empty DF - - print("Starting data processing...") - - # --- 1. Handle Frequency (FT) --- - freq_df = ratings_df[ratings_df["scale_id"] == "FT"].copy() - if not freq_df.empty: - freq_pivot = freq_df.pivot_table( - index=["onetsoc_code", "task_id"], - columns="category", - values="data_value", - fill_value=0, - ) - freq_pivot.columns = [ - f"frequency_category_{int(col)}" for col in freq_pivot.columns - ] - print(f"Processed Frequency data. Shape: {freq_pivot.shape}") - else: - print("No Frequency (FT) data found.") - # Create an empty DataFrame with the multi-index to allow merging later - idx = pd.MultiIndex( - levels=[[], []], codes=[[], []], names=["onetsoc_code", "task_id"] - ) - freq_pivot = pd.DataFrame(index=idx) - - # --- 2. Handle Importance (IM, IJ) --- - imp_df = ratings_df[ratings_df["scale_id"].isin(["IM", "IJ"])].copy() - if not imp_df.empty: - imp_avg = ( - imp_df.groupby(["onetsoc_code", "task_id"])["data_value"] - .mean() - .reset_index() - ) - imp_avg.rename(columns={"data_value": "importance_average"}, inplace=True) - print(f"Processed Importance data. Shape: {imp_avg.shape}") - else: - print("No Importance (IM, IJ) data found.") - imp_avg = pd.DataFrame( - columns=["onetsoc_code", "task_id", "importance_average"] - ) - - # --- 3. Handle Relevance (RT) --- - rel_df = ratings_df[ratings_df["scale_id"] == "RT"].copy() - if not rel_df.empty: - rel_avg = ( - rel_df.groupby(["onetsoc_code", "task_id"])["data_value"] - .mean() - .reset_index() - ) - rel_avg.rename(columns={"data_value": "relevance_average"}, inplace=True) - print(f"Processed Relevance data. Shape: {rel_avg.shape}") - else: - print("No Relevance (RT) data found.") - rel_avg = pd.DataFrame(columns=["onetsoc_code", "task_id", "relevance_average"]) - - # --- 4. Process DWAs --- - if dwas_df is not None and not dwas_df.empty and "dwa_title" in dwas_df.columns: - print("Processing DWA data...") - # Group DWAs by task_id and aggregate titles into a list - dwas_grouped = ( - dwas_df.groupby(["onetsoc_code", "task_id"])["dwa_title"] - .apply(list) - .reset_index() - ) # - dwas_grouped.rename( - columns={"dwa_title": "dwas"}, inplace=True - ) # Rename column to 'dwas' - print(f"Processed DWA data. Shape: {dwas_grouped.shape}") - else: - print("No valid DWA data found or provided for processing.") - dwas_grouped = None # Set to None if no DWAs - - # --- 5. Get Base Task/Occupation Info --- - base_cols = [ - "onetsoc_code", - "task_id", - "task", - "occupation_title", - "occupation_description", - ] - # Check if base columns exist in ratings_df - missing_base_cols = [col for col in base_cols if col not in ratings_df.columns] - if missing_base_cols: - print( - f"Error: Missing base info columns in ratings_df: {missing_base_cols}. Cannot proceed." 
- ) - return None - if not ratings_df.empty: - base_info = ( - ratings_df[base_cols] - .drop_duplicates() - .set_index(["onetsoc_code", "task_id"]) - ) - print(f"Extracted base info. Shape: {base_info.shape}") - else: - print("Cannot extract base info from empty ratings DataFrame.") - # Create an empty df with index to avoid errors later if possible - idx = pd.MultiIndex( - levels=[[], []], codes=[[], []], names=["onetsoc_code", "task_id"] - ) - base_info = pd.DataFrame( - index=idx, - columns=[ - col for col in base_cols if col not in ["onetsoc_code", "task_id"] - ], - ) - - # --- 6. Merge Processed Data --- - print("Merging processed data...") - # Start with base_info, which should have the index ['onetsoc_code', 'task_id'] - final_df = base_info.merge( - freq_pivot, left_index=True, right_index=True, how="left" - ) - # Reset index before merging non-indexed dfs - final_df = final_df.reset_index() - - # Merge averages - check if they are not empty before merging - if not imp_avg.empty: - final_df = final_df.merge(imp_avg, on=["onetsoc_code", "task_id"], how="left") - else: - final_df["importance_average"] = np.nan # Add column if imp_avg was empty - - if not rel_avg.empty: - final_df = final_df.merge(rel_avg, on=["onetsoc_code", "task_id"], how="left") - else: - final_df["relevance_average"] = np.nan # Add column if rel_avg was empty - - # Merge DWAs if available - if dwas_grouped is not None and not dwas_grouped.empty: - final_df = final_df.merge( - dwas_grouped, on=["onetsoc_code", "task_id"], how="left" - ) # Merge the dwas list - # Fill NaN in 'dwas' column (for tasks with no DWAs) with empty lists - # Check if 'dwas' column exists before applying function - if "dwas" in final_df.columns: - final_df["dwas"] = final_df["dwas"].apply( - lambda x: x if isinstance(x, list) else [] - ) # Ensure tasks without DWAs get [] - else: - print("Warning: 'dwas' column not created during merge.") - final_df["dwas"] = [ - [] for _ in range(len(final_df)) - ] # Add empty list column - - else: - # Add an empty 'dwas' column if no DWA data was processed or merged - final_df["dwas"] = [[] for _ in range(len(final_df))] - - print(f"Final merged data shape: {final_df.shape}") - - # Convert DataFrame to list of dictionaries for JSON output - # Handle potential NaN values during JSON conversion - # Replace numpy NaN with Python None for JSON compatibility - final_df = final_df.replace({np.nan: None}) - result_list = final_df.to_dict(orient="records") - - return result_list - - -# --- Output --- - - -def write_to_json(data, output_path): - """ - Writes the processed data to a JSON file. - - Args: - data (list): The list of dictionaries to write. - output_path (str): Path to the output JSON file. - """ - if data is None: - print("No data to write to JSON.") - return - if not isinstance(data, list): - print( - f"Error: Data to write is not a list (type: {type(data)}). Cannot write to JSON." 
- ) - return - - # Create directory if it doesn't exist - output_dir = os.path.dirname(output_path) - if output_dir and not os.path.exists(output_dir): - try: - os.makedirs(output_dir) - print(f"Created output directory: {output_dir}") - except OSError as e: - print(f"Error creating output directory {output_dir}: {e}") - return # Exit if cannot create directory - - try: - with open(output_path, "w", encoding="utf-8") as f: - json.dump(data, f, indent=4, ensure_ascii=False) - print(f"Successfully wrote enriched data to {output_path}") - except IOError as e: - print(f"Error writing JSON file to {output_path}: {e}") - except TypeError as e: - print(f"Error during JSON serialization: {e}. Check data types.") - except Exception as e: - print(f"An unexpected error occurred during JSON writing: {e}") - - -# --- Main Execution --- - -if __name__ == "__main__": - print("Starting O*NET Task Ratings & DWAs Enrichment Script...") - # 1. Fetch data - ratings_data_df, dwas_data_df = fetch_data_from_db(DB_FILE) # Fetch both datasets - - # 2. Process data - # Proceed only if ratings_data_df is a valid DataFrame (even if empty) - # dwas_data_df can be None or empty, handled inside process function - if isinstance(ratings_data_df, pd.DataFrame): - enriched_data = process_task_ratings_with_dwas( - ratings_data_df, dwas_data_df - ) # Pass both dataframes - - # 3. Write output - if ( - enriched_data is not None - ): # Check if processing returned data (even an empty list is valid) - write_to_json(enriched_data, OUTPUT_FILE) - else: - print("Data processing failed or returned None. No output file generated.") - else: - print( - "Data fetching failed or returned invalid type for ratings data. Script terminated." - ) - - print("Script finished.") diff --git a/pipeline/aggregate.py b/pipeline/aggregate.py new file mode 100644 index 0000000..a7653e4 --- /dev/null +++ b/pipeline/aggregate.py @@ -0,0 +1,81 @@ +from .utils import OCCUPATION_MAJOR_CODES +import pandas as pd + +def create_task_summary_by_occupation_df(df_tasks: pd.DataFrame, oesm_df: pd.DataFrame) -> pd.DataFrame: + # --- OESM Wage Bill Calculation --- + df_oesm_with_bill = oesm_df.copy() + df_oesm_with_bill.rename(columns={'OCC_CODE': 'onetsoc_code'}, inplace=True) + + # Convert key columns to numeric, handling potential errors + df_oesm_with_bill['TOT_EMP'] = pd.to_numeric(df_oesm_with_bill['TOT_EMP'], errors='coerce') + df_oesm_with_bill['A_MEAN'] = pd.to_numeric(df_oesm_with_bill['A_MEAN'], errors='coerce') + df_oesm_with_bill.dropna(subset=['TOT_EMP', 'A_MEAN', 'onetsoc_code'], inplace=True) + + # Calculate the wage bill for each occupation + df_oesm_with_bill['wage_bill'] = df_oesm_with_bill['TOT_EMP'] * df_oesm_with_bill['A_MEAN'] + oesm_lookup = df_oesm_with_bill.set_index('onetsoc_code') + + summary_data = [] + + # Assuming df_tasks has an 'onetsoc_code' column with the full SOC code + unique_soc_codes = df_tasks['onetsoc_code'].unique() + + for code in unique_soc_codes: + occ_df = df_tasks[df_tasks['onetsoc_code'] == code] + total_tasks_in_occ = len(occ_df) + + not_remote_count = len(occ_df[occ_df['remote_status'] != 'remote']) + remote_df = occ_df[occ_df['remote_status'] == 'remote'] + remote_estimable_count = len(remote_df[remote_df['estimable']]) + remote_not_estimable_count = len(remote_df[~remote_df['estimable']]) + + try: + # O*NET codes (e.g., 11-1011.03) are more specific than OESM SOC codes (e.g., 11-1011). + # We strip the suffix from the O*NET code to find the corresponding wage data. 
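+            # For example (illustrative): "11-1011.03".split('.')[0] == "11-1011",
+            # which matches the OESM OCC_CODE key; codes without a "." pass through unchanged.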
+ soc_code_for_lookup = code.split('.')[0] + wage_bill = oesm_lookup.loc[soc_code_for_lookup, 'wage_bill'] + label = oesm_lookup.loc[soc_code_for_lookup, 'OCC_TITLE'] + except KeyError: + wage_bill = 0 + label = "Unknown" + + summary_data.append({ + 'onetsoc_code': code, + 'occupation_label': label, + 'wage_bill': wage_bill, + 'count_not_remote': not_remote_count, + 'count_remote_estimable': remote_estimable_count, + 'count_remote_not_estimable': remote_not_estimable_count, + 'total_tasks': total_tasks_in_occ + }) + + return pd.DataFrame(summary_data) + + +def aggregate_task_summary_by_major_code(summary_df: pd.DataFrame) -> pd.DataFrame: + df_agg = summary_df.copy() + df_agg['onetsoc_major_code'] = df_agg['onetsoc_code'].str[:2] + + aggregation = { + 'wage_bill': 'sum', + 'count_not_remote': 'sum', + 'count_remote_estimable': 'sum', + 'count_remote_not_estimable': 'sum', + 'total_tasks': 'sum' + } + major_summary = df_agg.groupby('onetsoc_major_code').agg(aggregation).reset_index() + + major_summary['occupation_label'] = major_summary['onetsoc_major_code'].map(OCCUPATION_MAJOR_CODES) + + # Reorder columns to match original output format + major_summary = major_summary[[ + 'onetsoc_major_code', + 'occupation_label', + 'wage_bill', + 'count_not_remote', + 'count_remote_estimable', + 'count_remote_not_estimable', + 'total_tasks' + ]] + + return major_summary diff --git a/pipeline/classification.py b/pipeline/classification.py new file mode 100644 index 0000000..09319e4 --- /dev/null +++ b/pipeline/classification.py @@ -0,0 +1,225 @@ +from pathlib import Path +import pandas as pd +from .logger import logger +from .utils import enrich +import json + +ALLOWED_UNITS = [ + "minute", + "hour", + "day", + "week", + "month", + "trimester", + "semester", + "year", +] + +ESTIMABLE_CLASSIFICATION_VERSION = "old_version" +TIME_ESTIMATES_GENERATION_VERSION = "old_version" + +def classify_tasks_as_estimable(cache_dir: Path, df_to_process: pd.DataFrame, bust: bool = False) -> pd.DataFrame: + CACHE_PATH = cache_dir / f"task_estimability.{ESTIMABLE_CLASSIFICATION_VERSION}.parquet" + if CACHE_PATH.exists() and not bust: + logger.info(f"Loading cached task estimability from {CACHE_PATH}") + return pd.read_parquet(CACHE_PATH) + + logger.info("Enriching tasks with estimability classification.") + + df_unique_tasks = df_to_process.drop_duplicates(subset=['task']).copy() + + logger.info(f"Found {len(df_unique_tasks)} unique remote tasks to classify.") + + if df_unique_tasks.empty: + raise ValueError("No unique tasks to classify.") + + results = enrich( + model="gpt-4.1-mini", + rpm=5000, + messages_to_process=[ + [ + {"role": "system", "content": """ + Classify the provided O*NET task into one of these categories: + - ATOMIC (schedulable): A single, clearly-bounded activity, typically lasting minutes, hours, or a few days. + - ONGOING-CONSTRAINT (background role/ethical rule): A continuous responsibility or behavioural norm with no schedulable duration (e.g., “follow confidentiality rules,” “serve as department head”). 
+                """.strip()},
+                {"role": "user", "content": f"Task: {row.task}"},
+            ]
+            for row in df_unique_tasks.itertuples()
+        ],
+        schema={
+            "name": "estimability_classification",
+            "schema": {
+                "type": "object",
+                "properties": {"task_category": {"type": "string", "enum": ["ATOMIC", "ONGOING-CONSTRAINT"]}},
+                "required": ["task_category"],
+                "additionalProperties": False
+            }
+        },
+        chunk_size=300,
+    )
+
+    if not results or len(results) != len(df_unique_tasks):
+        raise ValueError(f"Task estimability classification failed or returned mismatched number of results. Expected {len(df_unique_tasks)}, got {len(results) if results else 0}.")
+
+    classifications = []
+    for index, response in enumerate(results):
+        task_label = df_unique_tasks.iloc[index]['task']
+        task_category_flag = None
+
+        if response is None:
+            logger.warning(f"API call failed for task (enrich returned None): '{task_label}'")
+        else:
+            try:
+                content_str = response.choices[0].message.content
+                if not content_str:
+                    raise ValueError("No content found in the response message")
+
+                data = json.loads(content_str)
+
+                if 'task_category' in data and isinstance(data['task_category'], str):
+                    task_category_flag = data['task_category']
+                else:
+                    logger.warning(f"Invalid or missing 'task_category' payload for task '{task_label}'. Data: '{data}'")
+            except (json.JSONDecodeError, AttributeError, KeyError, IndexError, ValueError) as e:
+                logger.warning(f"Could not parse response for task '{task_label}'. Error: {e}. Response: {response}")
+
+        # Failed or unparseable responses default to estimable=False.
+        classifications.append({
+            'task': task_label,
+            'estimable': task_category_flag == 'ATOMIC'
+        })
+
+    classification_df = pd.DataFrame(classifications)
+
+    logger.info(f"Finished classification of {len(df_unique_tasks)} unique tasks; failed responses default to estimable=False.")
+
+    logger.info(f"Saving task estimability classifications to {CACHE_PATH}")
+    classification_df.to_parquet(CACHE_PATH)
+
+    return classification_df
+
+
+def generate_time_estimates_for_tasks(cache_dir: Path, df_to_process: pd.DataFrame, bust: bool = False) -> pd.DataFrame:
+    CACHE_PATH = cache_dir / f"task_estimates.{TIME_ESTIMATES_GENERATION_VERSION}.parquet"
+    if CACHE_PATH.exists() and not bust:
+        logger.info(f"Loading cached task estimates from {CACHE_PATH}")
+        return pd.read_parquet(CACHE_PATH)
+
+    logger.info("Enriching tasks with time estimates.")
+
+    if df_to_process.empty:
+        raise ValueError("No tasks to process for estimates.")
+
+    results = enrich(
+        model="gpt-4.1-mini",
+        rpm=5000,
+        messages_to_process=[
+            [
+                {
+                    "role": "system",
+                    "content": """
+                    You are an expert assistant evaluating the time required for job tasks. Your goal is to estimate the 'effective time' range needed for a skilled human to complete the following job task **remotely**, without supervision.
+
+                    'Effective time' is the active, focused work duration required to complete the task. Crucially, **exclude all waiting periods, delays, or time spent on other unrelated activities**. Think of it as the continuous, productive time investment needed if the worker could pause and resume instantly without cost.
+
+                    Provide a lower and upper bound estimate for the 'effective time'. These bounds should capture the time within which approximately 80% of instances of performing this specific task are typically completed by a qualified individual.
+
+                    Base your estimate on the provided task and the associated occupation and occupation description. Your estimate must be in one of the allowed units: minute, hour, day, week, month, trimester, semester, year.""".strip()
+                },
+                {
+                    "role": "user",
+                    "content": f"{row.task} done by {row.occupation_title} ({row.occupation_description})"
+                }
+            ]
+            for row in df_to_process.itertuples()
+        ],
+        schema={
+            "name": "estimate_time",
+            "strict": True,
+            "schema": {
+                "type": "object",
+                "properties": {
+                    "lower_bound_estimate": {
+                        "type": "object",
+                        "properties": {
+                            "quantity": {
+                                "type": "number",
+                                "description": "The numerical value for the lower bound of the estimate.",
+                            },
+                            "unit": {
+                                "type": "string",
+                                "enum": ALLOWED_UNITS,
+                                "description": "The unit of time for the lower bound.",
+                            },
+                        },
+                        "required": ["quantity", "unit"],
+                        "additionalProperties": False,
+                    },
+                    "upper_bound_estimate": {
+                        "type": "object",
+                        "properties": {
+                            "quantity": {
+                                "type": "number",
+                                "description": "The numerical value for the upper bound of the estimate.",
+                            },
+                            "unit": {
+                                "type": "string",
+                                "enum": ALLOWED_UNITS,
+                                "description": "The unit of time for the upper bound.",
+                            },
+                        },
+                        "required": ["quantity", "unit"],
+                        "additionalProperties": False,
+                    },
+                },
+                "required": ["lower_bound_estimate", "upper_bound_estimate"],
+                "additionalProperties": False,
+            },
+        },
+        chunk_size=200,
+    )
+
+    if not results or len(results) != len(df_to_process):
+        raise ValueError(f"API call for task estimates failed or returned mismatched number of results. "
+                         f"Expected {len(df_to_process)}, got {len(results) if results else 0}.")
+
+    estimates = []
+    for index, response in enumerate(results):
+        row = df_to_process.iloc[index]
+        task_info = f"O*NET: {row.onetsoc_code}, Task ID: {row.task_id}"
+        lb_qty, lb_unit, ub_qty, ub_unit = None, None, None, None
+
+        if response is None:
+            logger.warning(f"API call failed for task (enrich returned None): {task_info}")
+        else:
+            try:
+                content_str = response.choices[0].message.content
+                if not content_str:
+                    raise ValueError("No content found in the response message")
+
+                data = json.loads(content_str)
+
+                lb_qty = data['lower_bound_estimate']['quantity']
+                lb_unit = data['lower_bound_estimate']['unit']
+                ub_qty = data['upper_bound_estimate']['quantity']
+                ub_unit = data['upper_bound_estimate']['unit']
+            except Exception as e:
+                logger.warning(f"Could not parse valid estimate for task {task_info}. Error: {e}. Response: {response}")
+                lb_qty, lb_unit, ub_qty, ub_unit = None, None, None, None # Reset on failure
+
+        estimates.append({
+            'onetsoc_code': row.onetsoc_code,
+            'task_id': row.task_id,
+            'lb_estimate_qty': lb_qty,
+            'lb_estimate_unit': lb_unit,
+            'ub_estimate_qty': ub_qty,
+            'ub_estimate_unit': ub_unit
+        })
+
+    estimates_df = pd.DataFrame(estimates)
+    logger.info(f"Finished estimates. 
Got {estimates_df['lb_estimate_qty'].notna().sum()} successful estimates out of {len(df_to_process)} tasks.") + + logger.info(f"Saving task estimates to {CACHE_PATH}") + estimates_df.to_parquet(CACHE_PATH) + + return estimates_df diff --git a/pipeline/constants.py b/pipeline/constants.py deleted file mode 100644 index d4733ad..0000000 --- a/pipeline/constants.py +++ /dev/null @@ -1,35 +0,0 @@ -OCCUPATION_MAJOR_CODES = { - '11': 'Management', - '13': 'Business & Financial', - '15': 'Computer & Mathematical', - '17': 'Architecture & Engineering', - '19': 'Life, Physical, & Social Science', - '21': 'Community & Social Service', - '23': 'Legal', - '25': 'Education, Training, & Library', - '27': 'Arts, Design, & Media', - '29': 'Healthcare Practitioners', - '31': 'Healthcare Support', - '33': 'Protective Service', - '35': 'Food Preparation & Serving', - '37': 'Building & Grounds Maintenance', - '39': 'Personal Care & Service', - '41': 'Sales & Related', - '43': 'Office & Admin Support', - '45': 'Farming, Fishing, & Forestry', - '47': 'Construction & Extraction', - '49': 'Installation, Maintenance, & Repair', - '51': 'Production', - '53': 'Transportation & Material Moving', - '55': 'Military Specific', -} - -GRAY = {'50':'#f8fafc','100':'#f1f5f9','200':'#e2e8f0', - '300':'#cbd5e1','400':'#94a3b8','500':'#64748b', - '600':'#475569','700':'#334155','800':'#1e293b', - '900':'#0f172a','950':'#020617'} - -LIME = {'50': '#f7fee7','100': '#ecfcca','200': '#d8f999', - '300': '#bbf451','400': '#9ae600','500': '#83cd00', - '600': '#64a400','700': '#497d00','800': '#3c6300', - '900': '#35530e','950': '#192e03'} diff --git a/pipeline/enrichments.py b/pipeline/enrichments.py deleted file mode 100644 index ab12c1e..0000000 --- a/pipeline/enrichments.py +++ /dev/null @@ -1,97 +0,0 @@ -""" -This module enriches data, they take time to run, and are usually expensive (API calls...), -they should manage their own state, and only be run if the data's version is different than -their save. -""" -from .run import Run -import pandas as pd -from typing import Any, List, Dict -import litellm - -def enrich( - model: str, - rpm: int, - messages_to_process: List[List[Dict[str, str]]], - schema: Dict[str, Any], - chunk_size: int = 100, -): - # Use litellm.batch_completion - pass - -def enrich_with_task_estimateability(run: Run) -> pd.DataFrame: - output_path = run.cache_dir / "computed_task_estimateability.parquet" - if output_path.exists(): - print(f"Loading cached task estimateability from {output_path}") - return pd.read_parquet(output_path) - - df_remote_tasks = run.df_tasks[run.df_tasks['remote_status'] == 'remote'].copy() - - # In the old script, we only passed unique tasks to the API - df_unique_tasks = df_remote_tasks.drop_duplicates(subset=['task']) - - - results = enrich( - model="gpt-4.1-mini", - rpm=5000, - messages_to_process=[ - [ - {"role": "system", "content": """ - Judge whether the provided O*NET task is suitable for a time estimate. If it is a single, clearly-bounded activity, typically lasting minutes, hours, or a few days, then clearly yes. If it is a continuous responsibility or behavioural norm with no schedulable duration (e.g., “follow confidentiality rules,” “serve as department head”), then clearly no. 
- """}, - {"role": "user", "content": f"Task: {row.task}"}, - ] - for row in df_unique_tasks.itertuples() - ], - schema={ - "type": "object", - "properties": {"estimateable": {"type": "bool"}}, - "required": ["estimateable"] - }, - chunk_size=300, - ) - - # Create a new dataframe with just enough information to identify the task uniquely + estimateability classification, save it, return it. Careful: the "task" column in itself is not unique. - return pd.DataFrame() - -def enrich_with_task_estimates(run: Run) -> pd.DataFrame: - output_path = run.cache_dir / "computed_task_estimates.parquet" - if output_path.exists(): - print(f"Loading cached task estimates from {output_path}") - return pd.read_parquet(output_path) - - df = ... # todo - - results = enrich( - model="gpt-4.1-mini", - rpm=5000, - messages_to_process=[ - [ - {"role": "system", "content": "Estimate the time required to complete the following O*NET task. Your estimate should be a plausible range for how long it might take a typical, qualified worker to perform this task once. Provide your answer as a time range (lower and upper bounds). Do not provide explanations or apologies. If the task is not suitable for a time estimate (e.g., it is an ongoing responsibility), interpret it as a single, schedulable action."}, - {"role": "user", "content": f""" - Task: {row.task} - For Occupation: {row.occupation_title} - Occupation Description: {row.occupation_description}"""} - ] - for row in df.itertuples() - ], - schema={ - "type": "object", - "properties": { - "lower_bound_estimate": { - "type": "object", - "properties": {"quantity": {"type": "number"}, "unit": {"type": "string", "enum": ["minutes", "hours", "days"]}}, - "required": ["quantity", "unit"], - }, - "upper_bound_estimate": { - "type": "object", - "properties": {"quantity": {"type": "number"}, "unit": {"type": "string", "enum": ["minutes", "hours", "days"]}}, - "required": ["quantity", "unit"], - }, - }, - "required": ["lower_bound_estimate", "upper_bound_estimate"], - }, - chunk_size=200, - ) - - # Create a new dataframe with just enough information to identify the task uniquely + the estimates classification, save it, return it. Careful: the "task" column in itself is not unique. - raise NotImplementedError diff --git a/pipeline/fetchers.py b/pipeline/fetchers.py index 440cfde..527bd89 100644 --- a/pipeline/fetchers.py +++ b/pipeline/fetchers.py @@ -1,50 +1,30 @@ -""" -Fetchers retrieve remote data and return it in a format suitable for further processing, they also return its version, which should be considered opaque, though it is usually a checksum. -""" - import sqlite3 -from typing import Tuple import pandas as pd import requests import io import zipfile -from pipeline.run import Run -from pipeline.logger import logger +import yaml +from pathlib import Path +from .logger import logger +from typing import Tuple, Dict -def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]: - """ - Downloads the O*NET database, creates a local SQLite file from it, and returns a connection. 
- """ - version = "29_1" - url = f"https://www.onetcenter.org/dl_files/database/db_{version}_mysql.zip" - db_path = run.cache_dir / f"onet_{version}.db" - run.meta.fetchers['onet'] = { - 'url': url, - 'version': version, - 'db_path': str(db_path), - } +ONET_VERSION = "29_1" +ONET_URL = f"https://www.onetcenter.org/dl_files/database/db_{ONET_VERSION}_mysql.zip" - if db_path.exists(): - logger.info(f"Using cached O*NET database: {db_path}") - conn = sqlite3.connect(db_path) - return conn, version +def fetch_onet_database(cache_dir: Path) -> sqlite3.Connection: + DB_PATH = cache_dir / f"onet_{ONET_VERSION}.db" - logger.info(f"Downloading O*NET database from {url}") - response = requests.get(url, stream=True, headers={ + if DB_PATH.exists(): + logger.info(f"Using cached O*NET database: {DB_PATH}") + return sqlite3.connect(DB_PATH) + + logger.info(f"Downloading O*NET database from {ONET_URL}") + response = requests.get(ONET_URL, stream=True, headers={ "User-Agent": "econ-agent/1.0" }) response.raise_for_status() - # Read content into memory - zip_content = response.content - - db_path = run.cache_dir / f"onet_{version}.db" - - logger.info(f"Creating new O*NET database: {db_path}") - conn = sqlite3.connect(db_path) - - # Set performance PRAGMAs for fast import - logger.info("Creating new SQLite database with performance settings") + conn = sqlite3.connect(DB_PATH) conn.executescript(""" PRAGMA journal_mode = OFF; PRAGMA synchronous = 0; @@ -54,6 +34,7 @@ def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]: PRAGMA foreign_keys = ON; """) + zip_content = response.content with zipfile.ZipFile(io.BytesIO(zip_content)) as z: sql_scripts = [] for filename in sorted(z.namelist()): @@ -63,14 +44,10 @@ def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]: if not sql_scripts: raise RuntimeError("No SQL files found in the O*NET zip archive.") - # Combine and execute all SQL files in one transaction - full_script = "BEGIN TRANSACTION;\n" + "\n".join(sql_scripts) + "\nCOMMIT;" - logger.info("Executing SQL files in alphabetical order (single transaction mode)") + full_script = "BEGIN TRANSACTION;\n" + "\n".join(sql_scripts) + "\nCOMMIT;" conn.executescript(full_script) - logger.info("Database populated successfully. Restoring reliability settings...") - # Restore reliability-focused settings after import conn.executescript(""" PRAGMA journal_mode = WAL; PRAGMA synchronous = NORMAL; @@ -81,87 +58,75 @@ def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]: """) conn.execute("VACUUM;") conn.commit() - logger.info("Reliability settings restored and database optimized successfully!") - return conn, version + return conn -def fetch_oesm_data(run: Run) -> Tuple[pd.DataFrame, str]: - """ - Downloads the OESM national data from the BLS website. 
- """ - version = "23" - url = f"https://www.bls.gov/oes/special-requests/oesm{version}nat.zip" - parquet_path = run.cache_dir / "oesm.parquet" - run.meta.fetchers['oesm'] = { - 'url': url, - 'version': version, - 'parquet_path': str(parquet_path), - } +def fetch_oesm_data(cache_dir: Path) -> pd.DataFrame: + VERSION = "23" + URL = f"https://www.bls.gov/oes/special-requests/oesm{VERSION}nat.zip" + DATA_PATH = cache_dir / "oesm.parquet" - if parquet_path.exists(): - logger.info(f"Using cached OESM data: {parquet_path}") - return pd.read_parquet(parquet_path), version + if DATA_PATH.exists(): + logger.info(f"Using cached OESM data: {DATA_PATH}") + return pd.read_parquet(DATA_PATH) - logger.info(f"Downloading OESM data from {url}") + logger.info(f"Downloading OESM data from {URL}") headers = {'User-Agent': 'econ-agent/1.0'} - response = requests.get(url, headers=headers) + response = requests.get(URL, headers=headers) response.raise_for_status() zip_content = response.content - logger.info(f"OESM data version: {version}") - logger.info(f"Creating new OESM data cache: {parquet_path}") + logger.info(f"Creating new OESM data cache: {DATA_PATH}") with zipfile.ZipFile(io.BytesIO(zip_content)) as z: - # Find the excel file in the zip - excel_filename = None - for filename in z.namelist(): - logger.debug(f"Found file in OESM zip: {filename}") - if filename.lower().endswith(".xlsx"): - excel_filename = filename - break - - if excel_filename is None: - raise FileNotFoundError("Could not find the Excel file in the OESM zip archive.") - - logger.info(f"Reading {excel_filename} from zip archive.") - with z.open(excel_filename) as f: + with z.open(f"oesm{VERSION}national.xlsx") as f: df = pd.read_excel(f, engine='openpyxl', na_values=['*', '#']) - df.to_parquet(parquet_path) - logger.info(f"Saved OESM data to cache: {parquet_path}") - return df, version + df.to_parquet(DATA_PATH) + logger.info(f"Saved OESM data to cache: {DATA_PATH}") + return df -def fetch_epoch_remote_data(run: Run) -> Tuple[pd.DataFrame, str]: - """ - Downloads the EPOCH AI remote work task data. 
- """ - # This is the direct download link constructed from the Google Drive share link - version = "latest" - url = "https://drive.google.com/uc?export=download&id=1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r" - parquet_path = run.cache_dir / f"epoch_remote_{version}.parquet" - run.meta.fetchers['epoch_remote'] = { - 'url': url, - 'version': version, - 'parquet_path': str(parquet_path), - } +def fetch_epoch_remote_data(cache_dir: Path) -> pd.DataFrame: + URL = "https://drive.google.com/uc?export=download&id=1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r" + DATA_PATH = cache_dir / f"epoch_remote_latest.parquet" - if parquet_path.exists(): - logger.info(f"Using cached EPOCH remote data: {parquet_path}") - return pd.read_parquet(parquet_path), version + if DATA_PATH.exists(): + logger.info(f"Using cached EPOCH remote data: {DATA_PATH}") + return pd.read_parquet(DATA_PATH) - logger.info(f"Downloading EPOCH remote data from Google Drive: {url}") + logger.info(f"Downloading EPOCH remote data from Google Drive: {URL}") - # Need to handle potential cookies/redirects from Google Drive session = requests.Session() session.headers.update({"User-Agent": "econ-agent/1.0"}) - response = session.get(url, stream=True) + response = session.get(URL, stream=True) response.raise_for_status() csv_content = response.content - logger.info(f"Creating new EPOCH remote data cache: {parquet_path}") + logger.info(f"Creating new EPOCH remote data cache: {DATA_PATH}") df = pd.read_csv(io.BytesIO(csv_content)) - df.to_parquet(parquet_path) - logger.info(f"Saved EPOCH remote data to cache: {parquet_path}") + df.to_parquet(DATA_PATH) - return df, version + return df + +def fetch_metr_data(cache_dir: Path) -> Dict: + URL = "https://metr.org/assets/benchmark_results.yaml" + DATA_PATH = cache_dir / "metr_benchmark_results.yaml" + + if DATA_PATH.exists(): + logger.info(f"Using cached METR data: {DATA_PATH}") + with open(DATA_PATH, "r") as f: + return yaml.safe_load(f) + + logger.info(f"Downloading METR data from {URL}") + headers = {"User-Agent": "econ-agent/1.0"} + response = requests.get(URL, headers=headers) + response.raise_for_status() + + yaml_content = response.content + + logger.info(f"Creating new METR data cache: {DATA_PATH}") + with open(DATA_PATH, "wb") as f: + f.write(yaml_content) + + return yaml.safe_load(yaml_content) diff --git a/pipeline/generators/__init__.py b/pipeline/generators/__init__.py index c9a1d1c..0022dcd 100644 --- a/pipeline/generators/__init__.py +++ b/pipeline/generators/__init__.py @@ -1,5 +1,15 @@ from .estimate_histplot import generate_estimate_histplot +from .estimates_spread_per_occupation import generate_estimate_spread_per_occupation +from .estimates_lower_vs_upper_scatter import generate_estimates_lower_vs_upper_scatter +from .sequential_coherence_cdf import plot_sequential_coherence_cdf +from .projected_automatable_wage_bill import generate_projected_automatable_wage_bill +from .projected_task_automation import generate_projected_task_automation_plot GENERATORS = [ - generate_estimate_histplot + generate_estimate_histplot, + generate_estimate_spread_per_occupation, + generate_estimates_lower_vs_upper_scatter, + #plot_sequential_coherence_cdf, + generate_projected_automatable_wage_bill, + generate_projected_task_automation_plot, ] diff --git a/pipeline/generators/estimate_histplot.py b/pipeline/generators/estimate_histplot.py index 4725573..1875de9 100644 --- a/pipeline/generators/estimate_histplot.py +++ b/pipeline/generators/estimate_histplot.py @@ -1,6 +1,32 @@ -from ..run import Run from pathlib import 
Path from typing import Generator +import matplotlib.pyplot as plt +import seaborn as sns +import pandas as pd +from ..utils import style_plot -def generate_estimate_histplot(run: Run) -> Generator[Path]: - raise NotImplementedError +def generate_estimate_histplot(output_dir: Path, df: pd.DataFrame, **kwargs) -> Generator[Path]: + """ + Generates a styled histogram of the distribution of midpoint time estimates. + """ + style_plot() + OUTPUT_PATH = output_dir / "estimate_distribution_histplot.png" + + fig, ax = plt.subplots() + + sns.histplot( + data=df, + x='estimate_midpoint', + log_scale=True, + ax=ax + ) + + ax.set_xlabel("Task Time (minutes, log scale)") + ax.set_ylabel("Number of Tasks") + ax.set_title("Distribution of Time Estimates for Atomic Tasks") + + plt.tight_layout() + plt.savefig(OUTPUT_PATH) + plt.close(fig) + + yield OUTPUT_PATH diff --git a/pipeline/generators/estimates_lower_vs_upper_scatter.py b/pipeline/generators/estimates_lower_vs_upper_scatter.py new file mode 100644 index 0000000..5b7ebab --- /dev/null +++ b/pipeline/generators/estimates_lower_vs_upper_scatter.py @@ -0,0 +1,56 @@ +from pathlib import Path +from typing import Generator +import matplotlib.pyplot as plt +import seaborn as sns +import pandas as pd +from ..utils import OCCUPATION_MAJOR_CODES, style_plot + + +def generate_estimates_lower_vs_upper_scatter(output_dir: Path, df: pd.DataFrame, **kwargs) -> Generator[Path]: + """ + Generates a styled scatter plot of lower-bound vs upper-bound time estimates for tasks. + """ + style_plot() + OUTPUT_PATH = output_dir / "estimates_lower_vs_upper_scatter.png" + + plot_df = df.copy() + # Replace onetsoc_major codes with their corresponding labels for the plot legend + plot_df['onetsoc_major'] = plot_df['onetsoc_major'].map(OCCUPATION_MAJOR_CODES) + + fig, ax = plt.subplots(figsize=(12, 10)) + sns.scatterplot( + data=plot_df, + x='lb_estimate_in_minutes', + y='ub_estimate_in_minutes', + alpha=0.3, + edgecolor=None, + hue="onetsoc_major", + ax=ax + ) + + # 45° reference line (y=x) + lims = ( + min(df['lb_estimate_in_minutes'].min(), df['ub_estimate_in_minutes'].min()), + max(df['lb_estimate_in_minutes'].max(), df['ub_estimate_in_minutes'].max()) + ) + lims = (lims[0] * 0.9, lims[1] * 1.1) + ax.plot(lims, lims, color='black', linestyle='--', linewidth=1, zorder=0) + + # Optional helper lines for ratios + for k in [2, 10, 100]: + ax.plot(lims, [k*l for l in lims], + linestyle=':', color='grey', linewidth=1, zorder=0) + + ax.set_xscale('log') + ax.set_yscale('log') + ax.set_xlabel('Lower-bound (min, log scale)') + ax.set_ylabel('Upper-bound (min, log scale)') + ax.set_title('Lower vs Upper Estimates for All Tasks') + + ax.legend(title="Occupation Major Group", bbox_to_anchor=(1.02, 1), loc='upper left') + + plt.tight_layout() + plt.savefig(OUTPUT_PATH, bbox_inches='tight') + plt.close(fig) + + yield OUTPUT_PATH diff --git a/pipeline/generators/estimates_spread_per_occupation.py b/pipeline/generators/estimates_spread_per_occupation.py new file mode 100644 index 0000000..d943dd5 --- /dev/null +++ b/pipeline/generators/estimates_spread_per_occupation.py @@ -0,0 +1,39 @@ +from pathlib import Path +from typing import Generator +import matplotlib.pyplot as plt +import seaborn as sns +import pandas as pd +from ..utils import OCCUPATION_MAJOR_CODES, style_plot + + +def generate_estimate_spread_per_occupation(output_dir: Path, df: pd.DataFrame, **kwargs) -> Generator[Path]: + """ + Generates a styled boxplot of the estimate range spread per major occupation group. 
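+
+    Assumes `df` carries an 'onetsoc_major' column and an 'estimate_range' column,
+    where 'estimate_range' is the upper bound minus the lower bound, in minutes.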
+ """ + style_plot() + OUTPUT_PATH = output_dir / "estimates_spread_per_occupation.png" + + fig, ax = plt.subplots(figsize=(10, 12)) + + sns.boxplot( + data=df, + x='onetsoc_major', + y='estimate_range', + showfliers=False, + ax=ax + ) + + ax.set_yscale('log') + ax.set_xlabel('Occupation') + ax.set_ylabel('Range (upper-lower, minutes)') + ax.set_title('Spread of time-range estimates per occupation') + + # Get occupation labels from codes for x-axis ticks + labels = [OCCUPATION_MAJOR_CODES.get(code.get_text(), code.get_text()) for code in ax.get_xticklabels()] + ax.set_xticklabels(labels, rotation=60, ha='right') + + plt.tight_layout() + plt.savefig(OUTPUT_PATH) + plt.close(fig) + + yield OUTPUT_PATH diff --git a/pipeline/generators/helpers.py b/pipeline/generators/helpers.py deleted file mode 100644 index 1cd5cba..0000000 --- a/pipeline/generators/helpers.py +++ /dev/null @@ -1,6 +0,0 @@ -import pandas as pd -from typings import List - -def must_have_columns(df: pd.DataFrame, columns: List[str]): - if not all(col in df.columns for col in columns): - raise ValueError(f"DataFrame is missing required columns: {columns}") diff --git a/pipeline/generators/projected_automatable_wage_bill.py b/pipeline/generators/projected_automatable_wage_bill.py new file mode 100644 index 0000000..8c14916 --- /dev/null +++ b/pipeline/generators/projected_automatable_wage_bill.py @@ -0,0 +1,229 @@ +from pathlib import Path +from typing import Generator, Dict, Tuple, Optional +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import matplotlib.ticker as mticker +from scipy.stats import linregress +from datetime import datetime +from ..utils import style_plot, LIME + +def _generate_wage_projection_data( + metr_results: Dict, + df_with_wages: pd.DataFrame, + percentile_key: str, + doubling_time_modifier: float, +) -> Optional[Tuple[pd.DataFrame, pd.DataFrame, float]]: + """ + Generates wage projection data for different AI progress scenarios. + + Args: + metr_results: The METR benchmark data. + df_with_wages: DataFrame containing tasks with their estimated wage value. + percentile_key: The percentile to use from METR data (e.g., 'p50_horizon_length'). + doubling_time_modifier: Multiplier for the doubling time (e.g., 1.0 for baseline, + 0.5 for optimistic, 2.0 for pessimistic). + + Returns: + A tuple of (metr_df, projection_df, doubling_time_days), or None if data is insufficient. 
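+
+    Note: assumes log-linear growth in horizon length; the base doubling time is
+    ln(2) / slope of a linear regression of log(horizon_minutes) on days since the
+    first release, which is then scaled by `doubling_time_modifier`.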
+ """ + all_model_data = [] + for model_name, data in metr_results.get("results", {}).items(): + for agent_name, agent_data in data.get("agents", {}).items(): + release_date_str = data.get("release_date") + horizon = agent_data.get(percentile_key, {}).get("estimate") + if release_date_str and horizon is not None: + all_model_data.append({ + "release_date": release_date_str, + "horizon_minutes": horizon, + }) + + if not all_model_data: + return None + + metr_df = pd.DataFrame(all_model_data).sort_values("release_date").reset_index(drop=True) + metr_df['release_date'] = pd.to_datetime(metr_df['release_date']) + metr_df = metr_df[metr_df['horizon_minutes'] > 0].copy() + + if len(metr_df) < 2: + return None + + metr_df['days_since_start'] = (metr_df['release_date'] - metr_df['release_date'].min()).dt.days + log_y = np.log(metr_df['horizon_minutes']) + slope, intercept, r_value, _, _ = linregress(metr_df['days_since_start'], log_y) + + # Apply the scenario modifier to the doubling time + base_doubling_time_days = np.log(2) / slope + modified_doubling_time_days = base_doubling_time_days * doubling_time_modifier + modified_slope = np.log(2) / modified_doubling_time_days + + start_date = metr_df['release_date'].min() + future_dates = pd.to_datetime(pd.date_range(start=start_date, end="2035-01-01", freq="ME")) + future_days = (future_dates - start_date).days.to_numpy() + + projected_log_horizon = intercept + modified_slope * future_days + projected_horizon_minutes = np.exp(projected_log_horizon) + + projection_df = pd.DataFrame({ + "date": future_dates, + "projected_coherence_minutes": projected_horizon_minutes, + }) + + # Calculate the total wage bill of tasks automated over time + for bound in ["lb", "mid", "ub"]: + col_name = 'estimate_midpoint' if bound == 'mid' else f'{bound}_estimate_in_minutes' + projection_df[f"automatable_wage_bill_{bound}"] = projection_df["projected_coherence_minutes"].apply( + lambda h: df_with_wages.loc[df_with_wages[col_name] <= h, 'wage_per_task'].sum() + ) + + # Also calculate for the actual METR data points for plotting + metr_df["automatable_wage_bill_mid"] = metr_df["horizon_minutes"].apply( + lambda h: df_with_wages.loc[df_with_wages['estimate_midpoint'] <= h, 'wage_per_task'].sum() + ) + + return metr_df, projection_df, modified_doubling_time_days + + +def _plot_scenario(ax, projection_df, metr_df, label, color, line_style='-'): + """Helper function to draw a single projection scenario on a given axis.""" + # Plot the projected wage bill + ax.plot( + projection_df["date"], + projection_df["automatable_wage_bill_mid"], + label=label, + color=color, + linewidth=2.5, + linestyle=line_style, + zorder=3 + ) + # Plot the shaded range for lower/upper bounds + ax.fill_between( + projection_df["date"], + projection_df["automatable_wage_bill_lb"], + projection_df["automatable_wage_bill_ub"], + color=color, + alpha=0.15, + zorder=2 + ) + # Plot the actual METR data points against the wage bill + ax.scatter( + metr_df['release_date'], + metr_df['automatable_wage_bill_mid'], + color=color, + edgecolor='black', + s=60, + zorder=4, + label=f"Model Capabilities (P50)" + ) + + +def generate_projected_automatable_wage_bill( + output_dir: Path, + df: pd.DataFrame, + task_summary_by_occupation_df: pd.DataFrame, + metr_results: Dict, + **kwargs, +) -> Generator[Path, None, None]: + """ + Generates a plot projecting the automatable wage bill under different + AI progress scenarios (optimistic, baseline, pessimistic). 
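+
+    A task counts as automatable on a given date when its time estimate is at or
+    below the projected coherence horizon; each occupation's wage bill is spread
+    uniformly across its tasks (wage_bill / total_tasks).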
+ """ + style_plot() + OUTPUT_PATH = output_dir / "projected_automatable_wage_bill_sensitivity.png" + + # 1. Calculate wage_per_task for each occupation + wage_bill_info = task_summary_by_occupation_df[['onetsoc_code', 'wage_bill', 'total_tasks']].copy() + wage_bill_info['wage_per_task'] = wage_bill_info['wage_bill'] / wage_bill_info['total_tasks'] + wage_bill_info.replace([np.inf, -np.inf], 0, inplace=True) # Avoid division by zero issues + wage_bill_info.drop(columns=['wage_bill', 'total_tasks'], inplace=True) + + # 2. Merge wage_per_task into the main task dataframe + df_with_wages = pd.merge(df, wage_bill_info, on='onetsoc_code', how='left') + df_with_wages['wage_per_task'].fillna(0, inplace=True) + + # 3. Generate data for all three scenarios + scenarios = { + "Optimistic": {"modifier": 0.5, "color": "tab:green", "style": "--"}, + "Baseline": {"modifier": 1.0, "color": LIME['600'], "style": "-"}, + "Pessimistic": {"modifier": 2.0, "color": "tab:red", "style": ":"}, + } + + projection_results = {} + for name, config in scenarios.items(): + result = _generate_wage_projection_data(metr_results, df_with_wages, 'p50_horizon_length', config['modifier']) + if result: + projection_results[name] = result + + if not projection_results: + print("Warning: Could not generate any projection data. Skipping wage bill plot.") + return + + # 4. Create the plot + fig, ax = plt.subplots(figsize=(14, 9)) + + # We only need to plot the scatter points once, let's use the baseline ones. + if "Baseline" in projection_results: + metr_df, _, _ = projection_results["Baseline"] + ax.scatter( + metr_df['release_date'], + metr_df['automatable_wage_bill_mid'], + color='black', + s=80, + zorder=5, + label=f"Model Capabilities (P50)" + ) + + + legend_lines = [] + for name, (metr_df, proj_df, doubling_time) in projection_results.items(): + config = scenarios[name] + ax.plot( + proj_df["date"], + proj_df["automatable_wage_bill_mid"], + color=config['color'], + linestyle=config['style'], + linewidth=2.5, + zorder=3 + ) + ax.fill_between( + proj_df["date"], + proj_df["automatable_wage_bill_lb"], + proj_df["automatable_wage_bill_ub"], + color=config['color'], + alpha=0.15, + zorder=2 + ) + # Create a custom line for the legend + line = plt.Line2D([0], [0], color=config['color'], linestyle=config['style'], lw=2.5, + label=f'{name} (Doubling Time: {doubling_time:.0f} days)') + legend_lines.append(line) + + + # 5. 
Styling and annotations + ax.set_title("Projected Automatable Wage Bill (P50 Coherence)", fontsize=18, pad=20) + ax.set_xlabel("Year", fontsize=12) + ax.set_ylabel("Automatable Annual Wage Bill (Trillions of USD)", fontsize=12) + + # Format Y-axis to show trillions + def trillions_formatter(x, pos): + return f'${x / 1e12:.1f}T' + ax.yaxis.set_major_formatter(mticker.FuncFormatter(trillions_formatter)) + + total_wage_bill = df_with_wages['wage_per_task'].sum() + ax.set_ylim(0, total_wage_bill * 1.05) + + if "Baseline" in projection_results: + _, proj_df, _ = projection_results["Baseline"] + ax.set_xlim(datetime(2022, 1, 1), proj_df["date"].max()) + + # Create the legend from the custom lines and the scatter plot + scatter_legend = ax.get_legend_handles_labels()[0] + ax.legend(handles=legend_lines + scatter_legend, loc="upper left", fontsize=11) + + ax.grid(True, which="both", linestyle="--", linewidth=0.5) + plt.tight_layout() + plt.savefig(OUTPUT_PATH) + plt.close(fig) + + print(f"Generated sensitivity analysis plot: {OUTPUT_PATH}") + yield OUTPUT_PATH diff --git a/pipeline/generators/projected_task_automation.py b/pipeline/generators/projected_task_automation.py new file mode 100644 index 0000000..afe217d --- /dev/null +++ b/pipeline/generators/projected_task_automation.py @@ -0,0 +1,168 @@ +from pathlib import Path +from typing import Generator, Dict, Tuple +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +from scipy.stats import linregress +from datetime import datetime +from ..utils import style_plot, LIME + +def _generate_projection_data( + metr_results: Dict, + df: pd.DataFrame, + percentile_key: str, +) -> Tuple[pd.DataFrame, pd.DataFrame] | None: + """ + Generates projection data for a given percentile key (e.g., 'p50_horizon_length'). + Returns a tuple of (metr_df_with_pct, projection_df), or None if data is insufficient. + """ + # 1. Process METR data to get all model performance over time for the given percentile + all_model_data = [] + for model_name, data in metr_results.get("results", {}).items(): + for agent_name, agent_data in data.get("agents", {}).items(): + release_date_str = data.get("release_date") + horizon = agent_data.get(percentile_key, {}).get("estimate") + + if release_date_str and horizon is not None: + unique_model_name = f"{model_name}-{agent_name}" + all_model_data.append({ + "model": unique_model_name, + "release_date": release_date_str, + "horizon_minutes": horizon, + }) + + if not all_model_data: + print(f"Warning: No models with {percentile_key} found in METR data. Skipping.") + return None + + metr_df = pd.DataFrame(all_model_data).sort_values("release_date").reset_index(drop=True) + metr_df['release_date'] = pd.to_datetime(metr_df['release_date']) + + # 2. Perform log-linear regression on coherence over time + metr_df = metr_df[metr_df['horizon_minutes'] > 0].copy() + if len(metr_df) < 2: + print(f"Warning: Not enough data points for regression for {percentile_key}. Skipping.") + return None + + metr_df['days_since_start'] = (metr_df['release_date'] - metr_df['release_date'].min()).dt.days + log_y = np.log(metr_df['horizon_minutes']) + x = metr_df['days_since_start'] + + slope, intercept, r_value, _, _ = linregress(x, log_y) + doubling_time_days = np.log(2) / slope + print(f"METR all models {percentile_key} trend: R^2 = {r_value**2:.2f}, Doubling time = {doubling_time_days:.1f} days") + + # 3. 
Project coherence into the future + start_date = metr_df['release_date'].min() + future_dates = pd.to_datetime(pd.date_range(start=start_date, end="2035-01-01", freq="ME")) + future_days = (future_dates - start_date).days.to_numpy() + + projected_log_horizon = intercept + slope * future_days + projected_horizon_minutes = np.exp(projected_log_horizon) + + projection_df = pd.DataFrame({ + "date": future_dates, + "projected_coherence_minutes": projected_horizon_minutes, + }) + + # 4. Calculate the percentage of tasks automated over time based on our estimates + total_tasks = len(df) + if total_tasks == 0: + return None + + for bound in ["lb", "mid", "ub"]: + col_name = 'estimate_midpoint' if bound == 'mid' else f'{bound}_estimate_in_minutes' + projection_df[f"pct_automatable_{bound}"] = projection_df["projected_coherence_minutes"].apply( + lambda h: (df[col_name] <= h).sum() / total_tasks * 100 + ) + + metr_df["pct_automatable_mid"] = metr_df["horizon_minutes"].apply( + lambda h: (df['estimate_midpoint'] <= h).sum() / total_tasks * 100 + ) + + return metr_df, projection_df + + +def _plot_projection(ax, projection_df, metr_df, label, color, line_style='-'): + """Helper function to draw a single projection on a given axis.""" + # Plot the projected automation percentage + ax.plot( + projection_df["date"], + projection_df["pct_automatable_mid"], + label=f"Mid-point", + color=color, + linewidth=2.5, + linestyle=line_style, + zorder=3 + ) + ax.fill_between( + projection_df["date"], + projection_df["pct_automatable_lb"], + projection_df["pct_automatable_ub"], + color=color, + alpha=0.15, + label=f"Lower/upper bound range", + zorder=2 + ) + # Plot the actual METR data points + ax.scatter( + metr_df['release_date'], + metr_df['pct_automatable_mid'], + color=color, + edgecolor='black', + s=60, + zorder=4, + label=f"Model with {label[1:]}% success rate" + ) + + +def generate_projected_task_automation_plot( + output_dir: Path, + metr_results: Dict, + df: pd.DataFrame, + **kwargs, +) -> Generator[Path, None, None]: + """ + Generates plots projecting task automation based on METR's p50 and p80 + coherence data. 
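+
+    Yields one output path per percentile for which enough METR data exists to
+    fit the trend (at least two models with positive horizons).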
+    """
+    style_plot()
+
+    p50_data = _generate_projection_data(metr_results, df, 'p50_horizon_length')
+    p80_data = _generate_projection_data(metr_results, df, 'p80_horizon_length')
+
+    # Plot P50 alone
+    if p50_data:
+        p50_metr_df, p50_proj_df = p50_data
+        fig, ax = plt.subplots(figsize=(12, 8))
+        _plot_projection(ax, p50_proj_df, p50_metr_df, "P50", LIME['600'])
+        ax.set_title("How long before sequential coherence stops being a bottleneck?", fontsize=16, pad=20)
+        ax.set_xlabel("Year")
+        ax.set_ylabel("% of tasks automatable (50% success rate)")
+        ax.set_ylim(0, 100.5)
+        ax.set_xlim(datetime(2022, 1, 1), p50_proj_df["date"].max())
+        ax.grid(True, which="both", linestyle="--", linewidth=0.5)
+        ax.legend(loc="upper left")
+        plt.tight_layout()
+        output_path = output_dir / "projected_task_automation_p50.png"
+        plt.savefig(output_path)
+        plt.close(fig)
+        yield output_path
+
+    # Plot P80 alone
+    if p80_data:
+        p80_metr_df, p80_proj_df = p80_data
+        fig, ax = plt.subplots(figsize=(12, 8))
+        _plot_projection(ax, p80_proj_df, p80_metr_df, "P80", 'tab:cyan')
+        ax.set_title("Projected Task Automation (P80 AI Coherence)", fontsize=16, pad=20)
+        ax.set_xlabel("Year")
+        ax.set_ylabel("% of Estimable Economic Tasks Automatable")
+        ax.set_ylim(0, 100.5)
+        ax.set_xlim(datetime(2022, 1, 1), p80_proj_df["date"].max())
+        ax.grid(True, which="both", linestyle="--", linewidth=0.5)
+        ax.legend(loc="upper left")
+        plt.tight_layout()
+        output_path = output_dir / "projected_task_automation_p80.png"
+        plt.savefig(output_path)
+        plt.close(fig)
+        yield output_path
diff --git a/pipeline/generators/sequential_coherence_cdf.py b/pipeline/generators/sequential_coherence_cdf.py
new file mode 100644
index 0000000..12b045d
--- /dev/null
+++ b/pipeline/generators/sequential_coherence_cdf.py
@@ -0,0 +1,54 @@
+from pathlib import Path
+import pandas as pd
+import matplotlib.pyplot as plt
+import matplotlib.ticker as mtick
+from ..utils import LIME, style_plot
+
+def plot_sequential_coherence_cdf(output_dir: Path, df: pd.DataFrame, **kwargs):
+    style_plot()
+    output_path = output_dir / "sequential_coherence_cdf.png"
+
+    def cdf(series):
+        """Helper function to calculate CDF data."""
+        s = series.sort_values().reset_index(drop=True)
+        # Calculate cumulative percentage
+        return s.values, ((s.index + 1) / len(s)) * 100
+
+    # Calculate CDF for lower, upper, and midpoint estimates
+    x_lb, y_lb = cdf(df['lb_estimate_in_minutes'])
+    x_ub, y_ub = cdf(df['ub_estimate_in_minutes'])
+    x_mid, y_mid = cdf(df['estimate_midpoint'])
+
+    # Create the plot
+    fig, ax = plt.subplots(figsize=(12, 7))
+
+    # Plot the CDFs as step plots
+    ax.step(x_lb, y_lb, where='post', color=LIME['300'], linewidth=1.8, linestyle='--', zorder=2, label='Lower bound estimate')
+    ax.step(x_ub, y_ub, where='post', color=LIME['900'], linewidth=1.8, linestyle=':', zorder=3, label='Upper bound estimate')
+    ax.step(x_mid, y_mid, where='post', color=LIME['600'], linewidth=2.2, zorder=4, label='Mid-point')
+
+    # --- Styling and Annotations ---
+    ax.set_xscale('log')
+    ax.set_ylim(0, 100)
+    ax.yaxis.set_major_formatter(mtick.PercentFormatter(decimals=0))
+
+    # Set titles and labels using the standard axes methods
+    ax.set_title("% of Tasks With Sequential Coherence ≤ X")
+    ax.set_xlabel("Sequential Coherence (X)")
+    ax.set_ylabel("Cumulative Percentage of Tasks")
+
+    # Define custom x-axis ticks and labels for better readability
+    ticks = [1, 5, 10, 30, 60, 120, 240, 480, 1440, 2880, 10080, 43200, 129600, 259200, 525600]
+    ticklabels = ['1 min', '5 min', '10 min', '30 min', '1 
hr', '2 hr', '4 hr', '8 hr', '1 day', '2 days', + '1 wk', '30 days', '90 days', '180 days', '1 yr'] + ax.set_xticks(ticks) + ax.set_xticklabels(ticklabels, rotation=45, ha='right') + + ax.legend(loc='lower right') + + # --- Save and close --- + plt.tight_layout() + plt.savefig(output_path, bbox_inches='tight') + plt.close(fig) + + yield output_path diff --git a/pipeline/metadata.py b/pipeline/metadata.py deleted file mode 100644 index 6157ec2..0000000 --- a/pipeline/metadata.py +++ /dev/null @@ -1,41 +0,0 @@ -""" -This module defines the Metadata model for the pipeline. -""" - -from datetime import datetime -from pydantic import BaseModel, Field -from typing import Dict, Any - -class Metadata(BaseModel): - """ - A Pydantic model for storing pipeline metadata. - - This class is intended to be instantiated once and passed through the - pipeline. Each step in the pipeline can then add its own metadata. - This provides a centralized and structured way to track data provenance, - versions, and other important information. - """ - fetchers: Dict[str, Dict[str, Any]] = Field(default_factory=dict) - enrichments: Dict[str, Dict[str, Any]] = Field(default_factory=dict) - - ts: str = Field(default_factory=lambda: datetime.now().strftime("%Y-%m-%d %H:%M:%S")) - commit: str = Field(default_factory=lambda: _get_current_commit()) - - -def _get_current_commit() -> str: - """ - Returns the current git commit hash, "unknown", or "errored" depending on why the commit could not be retrieved. - """ - import subprocess - try: - # Get the current commit hash - commit_hash = subprocess.check_output( - ["git", "rev-parse", "HEAD"], stderr=subprocess.PIPE, text=True - ).strip() - return commit_hash - except subprocess.CalledProcessError: - # If git command fails (e.g., not a git repository) - return "errored" - except FileNotFoundError: - # If git is not installed - return "unknown" diff --git a/pipeline/postprocessors.py b/pipeline/postprocessors.py deleted file mode 100644 index a390ded..0000000 --- a/pipeline/postprocessors.py +++ /dev/null @@ -1,140 +0,0 @@ -from .run import Run -from .logger import logger -import pandas as pd -import numpy as np - - -def check_for_insanity(run: Run) -> Run: - raise NotImplementedError - - -def create_df_tasks(run: Run) -> Run: - """ - Creates a dataframe of tasks from the O*NET database, and merges it with remote status data. - This replicates the logic from old/enrich_task_ratings.py and parts of old/analysis.py - - The resulting dataframe, `run.df_tasks` will be used by the enrichment steps. 
- """ - logger.info("Creating tasks dataframe") - cache_path = run.cache_dir / f"onet_{run.onet_version}_tasks_with_remote_status.parquet" - if cache_path.exists(): - logger.info(f"Loading cached tasks dataframe from {cache_path}") - run.df_tasks = pd.read_parquet(cache_path) - return run - - query = """ - SELECT - tr.onetsoc_code, - tr.task_id, - ts.task, - od.title AS occupation_title, - od.description AS occupation_description, - tr.scale_id, - tr.category, - tr.data_value, - dr.dwa_title - FROM - task_ratings tr - JOIN - task_statements ts ON tr.task_id = ts.task_id - JOIN - occupation_data od ON tr.onetsoc_code = od.onetsoc_code - LEFT JOIN - tasks_to_dwas td ON tr.onetsoc_code = td.onetsoc_code AND tr.task_id = td.task_id - LEFT JOIN - dwa_reference dr ON td.dwa_id = dr.dwa_id; - """ - df = pd.read_sql_query(query, run.onet_conn) - logger.info(f"Fetched {len(df)} records (including DWA info) from the database.") - - # Separate ratings from DWAs - core_cols = [ - "onetsoc_code", "task_id", "task", "occupation_title", - "occupation_description", "scale_id", "category", "data_value" - ] - ratings_df = df[core_cols].drop_duplicates().reset_index(drop=True) - - dwa_cols = ["onetsoc_code", "task_id", "dwa_title"] - dwas_df = df[dwa_cols].dropna(subset=["dwa_title"]).drop_duplicates().reset_index(drop=True) - - # 1. Handle Frequency (FT) - logger.info("Processing Frequency data") - freq_df = ratings_df[ratings_df["scale_id"] == "FT"].copy() - if not freq_df.empty: - freq_pivot = freq_df.pivot_table( - index=["onetsoc_code", "task_id"], - columns="category", - values="data_value", - fill_value=0, - ) - freq_pivot.columns = [f"frequency_category_{int(col)}" for col in freq_pivot.columns] - else: - idx = pd.MultiIndex(levels=[[], []], codes=[[], []], names=["onetsoc_code", "task_id"]) - freq_pivot = pd.DataFrame(index=idx) - - # 2. Handle Importance (IM, IJ) - logger.info("Processing Importance data") - imp_df = ratings_df[ratings_df["scale_id"].isin(["IM", "IJ"])].copy() - if not imp_df.empty: - imp_avg = imp_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index() - imp_avg.rename(columns={"data_value": "importance_average"}, inplace=True) - else: - imp_avg = pd.DataFrame(columns=["onetsoc_code", "task_id", "importance_average"]) - - # 3. Handle Relevance (RT) - logger.info("Processing Relevance data") - rel_df = ratings_df[ratings_df["scale_id"] == "RT"].copy() - if not rel_df.empty: - rel_avg = rel_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index() - rel_avg.rename(columns={"data_value": "relevance_average"}, inplace=True) - else: - rel_avg = pd.DataFrame(columns=["onetsoc_code", "task_id", "relevance_average"]) - - # 4. Process DWAs - logger.info("Processing DWA data") - if not dwas_df.empty: - dwas_grouped = dwas_df.groupby(["onetsoc_code", "task_id"])["dwa_title"].apply(list).reset_index() - dwas_grouped.rename(columns={"dwa_title": "dwas"}, inplace=True) - else: - dwas_grouped = None - - # 5. Get Base Task/Occupation Info - logger.info("Extracting base task/occupation info") - base_cols = ["onetsoc_code", "task_id", "task", "occupation_title", "occupation_description"] - base_info = ratings_df[base_cols].drop_duplicates().set_index(["onetsoc_code", "task_id"]) - - # 6. 
Merge Processed ONET Data - logger.info("Merging processed ONET data") - final_df = base_info.merge(freq_pivot, left_index=True, right_index=True, how="left") - final_df = final_df.reset_index() - - if not imp_avg.empty: - final_df = final_df.merge(imp_avg, on=["onetsoc_code", "task_id"], how="left") - else: - final_df["importance_average"] = np.nan - - if not rel_avg.empty: - final_df = final_df.merge(rel_avg, on=["onetsoc_code", "task_id"], how="left") - else: - final_df["relevance_average"] = np.nan - - if dwas_grouped is not None and not dwas_grouped.empty: - final_df = final_df.merge(dwas_grouped, on=["onetsoc_code", "task_id"], how="left") - if "dwas" in final_df.columns: - final_df["dwas"] = final_df["dwas"].apply(lambda x: x if isinstance(x, list) else []) - else: - final_df["dwas"] = [[] for _ in range(len(final_df))] - - final_df = final_df.replace({np.nan: None}) - - # 7. Merge with EPOCH remote data - logger.info("Merging with EPOCH remote data") - final_df = pd.merge(final_df, run.epoch_df[['Task', 'Remote']], left_on='task', right_on='Task', how='left') - final_df = final_df.drop('Task', axis=1).rename(columns={'Remote': 'remote_status'}) - - - logger.info(f"Created tasks dataframe with shape {final_df.shape}") - final_df.to_parquet(cache_path) - - run.df_tasks = final_df - return run diff --git a/pipeline/run.py b/pipeline/run.py deleted file mode 100644 index 90d71ad..0000000 --- a/pipeline/run.py +++ /dev/null @@ -1,27 +0,0 @@ -from pydantic import BaseModel, Field -import sqlite3 -import pandas as pd -from pathlib import Path -from typing import Optional -from .metadata import Metadata - -class Run(BaseModel): - model_config = {"arbitrary_types_allowed": True} - # === FETCHERS === - onet_conn: Optional[sqlite3.Connection] = None - onet_version: Optional[str] = None - - oesm_df: Optional[pd.DataFrame] = None - oesm_version: Optional[str] = None - - epoch_df: Optional[pd.DataFrame] = None - epoch_version: Optional[str] = None - - # === ENRICHMENTS === - task_estimateability_df: Optional[pd.DataFrame] = None - task_estimates_df: Optional[pd.DataFrame] = None - - meta: Metadata = Field(default_factory=Metadata) - - cache_dir: Path - output_dir: Path diff --git a/pipeline/runner.py b/pipeline/runner.py index e1b4464..670e98b 100644 --- a/pipeline/runner.py +++ b/pipeline/runner.py @@ -1,74 +1,215 @@ +import sqlite3 +import os +from .logger import logger +import pandas as pd from dotenv import load_dotenv -from .fetchers import fetch_oesm_data, fetch_epoch_remote_data, fetch_onet_database -from .enrichments import enrich_with_task_estimateability, enrich_with_task_estimates -from .postprocessors import check_for_insanity, create_df_tasks +from .fetchers import fetch_onet_database, fetch_oesm_data, fetch_epoch_remote_data, ONET_VERSION, fetch_metr_data +from .classification import classify_tasks_as_estimable, generate_time_estimates_for_tasks from .generators import GENERATORS -from .run import Run -from .constants import GRAY +from .aggregate import create_task_summary_by_occupation_df, aggregate_task_summary_by_major_code +from .utils import convert_to_minutes import argparse import platformdirs -import seaborn as sns -import matplotlib as mpl +import numpy as np from pathlib import Path -from typing import Optional - -CACHE_DIR = platformdirs.user_cache_dir("econtai") - -def run(output_dir: Path | Optional[str] = None): - load_dotenv() - _setup_graph_rendering() - - if output_dir is None: - output_dir = Path("dist/") - elif isinstance(output_dir, str): - output_dir = 
Path(output_dir).resolve() - - output_dir.mkdir(parents=True, exist_ok=True) - - current_run = Run(output_dir=output_dir, cache_dir=Path(CACHE_DIR).resolve()) - current_run.cache_dir.mkdir(parents=True, exist_ok=True) - - # Fetchers (fetchers.py) - current_run.onet_conn, current_run.onet_version = fetch_onet_database(current_run) - current_run.oesm_df, current_run.oesm_version = fetch_oesm_data(current_run) - current_run.epoch_df, current_run.epoch_version = fetch_epoch_remote_data(current_run) - - current_run = create_df_tasks(current_run) - - # Enrichments (enrichments.py) - current_run.task_estimateability_df = enrich_with_task_estimateability(current_run) - current_run.task_estimates_df = enrich_with_task_estimates(current_run) - - # Postprocessors (postprocessors.py) - check_for_insanity(current_run) - - # Generators (generators/) - for gen in GENERATORS: - gen(current_run) -def _setup_graph_rendering(): - mpl.rcParams.update({ - 'figure.facecolor' : GRAY['50'], - 'axes.facecolor' : GRAY['50'], - 'axes.edgecolor' : GRAY['100'], - 'axes.labelcolor' : GRAY['700'], - 'xtick.color' : GRAY['700'], - 'ytick.color' : GRAY['700'], - 'font.family' : 'Inter', - 'font.size' : 11, - }) +class Runner: + onet_conn: sqlite3.Connection + oesm_df: pd.DataFrame + epoch_df: pd.DataFrame + metr_results: dict + def __init__(self, output_dir: Path | str, debug: bool, bust_estimability: bool, bust_estimates: bool): + if isinstance(output_dir, str): + output_dir = Path(output_dir).resolve() - - sns.set_style("white") + output_dir.mkdir(parents=True, exist_ok=True) + self.output_dir = output_dir + self.intermediate_dir = self.output_dir / "intermediate" + self.intermediate_dir.mkdir(parents=True, exist_ok=True) + self.cache_dir = platformdirs.user_cache_path("econtai") + self.debug = debug + self.bust_estimability = bust_estimability + self.bust_estimates = bust_estimates -def main(): - parser = argparse.ArgumentParser(description="Run the econtai pipeline.") - parser.add_argument("--output-dir", type=str, help="The directory to write output files to.") - args = parser.parse_args() - run(output_dir=args.output_dir) + if debug: + os.environ["LITELLM_LOG"] = os.environ.get("LITELLM_LOG", "INFO") + def run(self): + load_dotenv() + + self.onet_conn = fetch_onet_database(self.cache_dir) + self.oesm_df = fetch_oesm_data(self.cache_dir) + self.epoch_df = fetch_epoch_remote_data(self.cache_dir) + self.metr_results = fetch_metr_data(self.cache_dir) + + self.df_tasks = self._create_df_tasks() + self.df_tasks['onetsoc_major'] = self.df_tasks['onetsoc_code'].str[:2] + + df_to_process = self.df_tasks[ + (self.df_tasks['importance_average'] > 3) & + (self.df_tasks['remote_status'] == 'remote') + ].copy() + + if self.debug: + df_to_process = df_to_process.head(10) + + task_estimability_df = classify_tasks_as_estimable(self.cache_dir, df_to_process, bust=self.bust_estimability) + self.df_tasks = pd.merge(self.df_tasks, task_estimability_df, on='task', how='left') + self.df_tasks['estimable'] = self.df_tasks['estimable'].fillna(False) + self.df_tasks.to_parquet(self.intermediate_dir / "df_tasks.parquet") + df_to_process = pd.merge(df_to_process, task_estimability_df, on='task', how='left') + df_to_process['estimable'] = df_to_process['estimable'].fillna(False) + + df_to_process = df_to_process[df_to_process['estimable']].copy() + + task_estimates_df = generate_time_estimates_for_tasks(self.cache_dir, df_to_process, bust=self.bust_estimates) + df = pd.merge(df_to_process, task_estimates_df, on=['onetsoc_code', 'task_id'], how='left') + df['lb_estimate_in_minutes'] = df.apply(lambda row: convert_to_minutes(row['lb_estimate_qty'], row['lb_estimate_unit']), axis=1) + df['ub_estimate_in_minutes'] = df.apply(lambda row: convert_to_minutes(row['ub_estimate_qty'], row['ub_estimate_unit']), axis=1) + df['estimate_range'] = df.ub_estimate_in_minutes - df.lb_estimate_in_minutes + df['estimate_ratio'] = np.divide(df.ub_estimate_in_minutes, df.lb_estimate_in_minutes).replace([np.inf, -np.inf], None) + df['estimate_midpoint'] = (df.lb_estimate_in_minutes + df.ub_estimate_in_minutes) / 2
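+ # Worked example of the conversion above (hypothetical values): lb_estimate_qty=30 with unit 'minute' gives 30 min, ub_estimate_qty=2 with unit 'hour' gives 120 min, so estimate_range = 90, estimate_ratio = 4.0, estimate_midpoint = 75.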
+ + df.to_parquet(self.intermediate_dir / "estimable_tasks_with_estimates.parquet") + + self.task_summary_by_occupation_df = create_task_summary_by_occupation_df(self.df_tasks, self.oesm_df) + self.task_summary_by_occupation_df.to_parquet(self.intermediate_dir / "task_summary_by_occupation.parquet") + self.task_summary_by_major_occupation_df = aggregate_task_summary_by_major_code(self.task_summary_by_occupation_df) + self.task_summary_by_major_occupation_df.to_parquet(self.intermediate_dir / "task_summary_by_major_occupation.parquet") + + self._check_for_insanity(df) + + for gen in GENERATORS: + for asset in gen(**{ + "output_dir": self.output_dir, + "runner": self, + "df": df, + "task_summary_by_occupation_df": self.task_summary_by_occupation_df, + "task_summary_by_major_occupation_df": self.task_summary_by_major_occupation_df, + "df_tasks": self.df_tasks, + "oesm_df": self.oesm_df, + "metr_results": self.metr_results, + }): + logger.info(f"New asset: {asset}") + + def _create_df_tasks(self) -> pd.DataFrame: + DATA_PATH = self.cache_dir / f"onet_{ONET_VERSION}_tasks_with_remote_status.parquet" + if DATA_PATH.exists(): + logger.info(f"Loading cached tasks dataframe from {DATA_PATH}") + return pd.read_parquet(DATA_PATH) + + logger.info("Creating tasks dataframe") + query = """ + SELECT + tr.onetsoc_code, + tr.task_id, + ts.task, + od.title AS occupation_title, + od.description AS occupation_description, + tr.scale_id, + tr.category, + tr.data_value + FROM + task_ratings tr + JOIN + task_statements ts ON tr.task_id = ts.task_id + JOIN + occupation_data od ON tr.onetsoc_code = od.onetsoc_code; + """ + ratings_df = pd.read_sql_query(query, self.onet_conn) + logger.info(f"Fetched {len(ratings_df)} task rating records from the database.") + + # 1. Handle Frequency (FT) + logger.info("Processing Frequency data") + freq_df = ratings_df[ratings_df["scale_id"] == "FT"].copy() + if not freq_df.empty: + freq_pivot = freq_df.pivot_table( + index=["onetsoc_code", "task_id"], + columns="category", + values="data_value", + fill_value=0, + ) + freq_pivot.columns = [f"frequency_category_{int(col)}" for col in freq_pivot.columns] + else: + raise ValueError("No frequency data.") + + # 2. Handle Importance (IM, IJ) + logger.info("Processing Importance data") + imp_df = ratings_df[ratings_df["scale_id"].isin(["IM", "IJ"])].copy() + if not imp_df.empty: + imp_avg = imp_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index() + imp_avg.rename(columns={"data_value": "importance_average"}, inplace=True) + else: + raise ValueError("No importance data.") + + # 3. Handle Relevance (RT) + logger.info("Processing Relevance data") + rel_df = ratings_df[ratings_df["scale_id"] == "RT"].copy() + if not rel_df.empty: + rel_avg = rel_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index() + rel_avg.rename(columns={"data_value": "relevance_average"}, inplace=True) + else: + raise ValueError("No relevance data.") + + # 4. Get Base Task/Occupation Info + logger.info("Extracting base task/occupation info") + base_cols = ["onetsoc_code", "task_id", "task", "occupation_title", "occupation_description"] + base_info = ratings_df[base_cols].drop_duplicates().set_index(["onetsoc_code", "task_id"]) + + # 5. Merge Processed ONET Data + logger.info("Merging processed ONET data") + final_df = base_info.merge(freq_pivot, left_index=True, right_index=True, how="left") + final_df = final_df.reset_index() + + if not imp_avg.empty: + final_df = final_df.merge(imp_avg, on=["onetsoc_code", "task_id"], how="left") + else: + final_df["importance_average"] = np.nan + + if not rel_avg.empty: + final_df = final_df.merge(rel_avg, on=["onetsoc_code", "task_id"], how="left") + else: + final_df["relevance_average"] = np.nan + + final_df = final_df.replace({np.nan: None}) + + # 6. Merge with EPOCH remote data + logger.info("Merging with EPOCH remote data") + final_df = pd.merge(final_df, self.epoch_df[['Task', 'Remote']], left_on='task', right_on='Task', how='left') + final_df = final_df.drop('Task', axis=1).rename(columns={'Remote': 'remote_status'}) + + logger.info(f"Created tasks dataframe with shape {final_df.shape}") + final_df.to_parquet(DATA_PATH) + + return final_df + + def _check_for_insanity(self, df: pd.DataFrame): + if df['lb_estimate_in_minutes'].isnull().any(): + missing_count = df['lb_estimate_in_minutes'].isnull().sum() + raise ValueError(f"Found {missing_count} estimable tasks with missing 'lb_estimate_in_minutes'.") + + if df['ub_estimate_in_minutes'].isnull().any(): + missing_count = df['ub_estimate_in_minutes'].isnull().sum() + raise ValueError(f"Found {missing_count} estimable tasks with missing 'ub_estimate_in_minutes'.") + + valid_estimates = df.dropna(subset=['lb_estimate_in_minutes', 'ub_estimate_in_minutes']) + impossible_bounds = valid_estimates[ + (valid_estimates['lb_estimate_in_minutes'] <= 0) | + (valid_estimates['ub_estimate_in_minutes'] <= 0) | + (valid_estimates['lb_estimate_in_minutes'] > valid_estimates['ub_estimate_in_minutes']) + ] + if not impossible_bounds.empty: + raise ValueError(f"Found {len(impossible_bounds)} rows with impossible bounds (e.g., lb > ub or value <= 0).") if __name__ == "__main__": - main() + parser = argparse.ArgumentParser(description="Run the econtai pipeline.") + parser.add_argument("--output-dir", type=str, default="dist/", help="The directory to write output files to.") + parser.add_argument("--bust-estimability", action="store_true", help="Bust the saved task estimability classification (EXPENSIVE)") + parser.add_argument("--bust-estimates", action="store_true", help="Bust the task estimates (EXPENSIVE)") + parser.add_argument("--debug", action="store_true", help="Enable debug mode (e.g., process fewer tasks).") + + args = parser.parse_args() + Runner(output_dir=args.output_dir, debug=args.debug, bust_estimability=args.bust_estimability, bust_estimates=args.bust_estimates).run() diff --git a/pipeline/utils.py b/pipeline/utils.py new file mode 100644 index 0000000..7476a4c --- /dev/null +++ b/pipeline/utils.py @@ -0,0 +1,222 @@ +import subprocess +import matplotlib.colors as mcolors +import matplotlib as mpl +import seaborn as sns +import tempfile +import litellm +import time +import math +from tqdm import tqdm +from typing import Any, List, Dict +from .logger import logger + +OCCUPATION_MAJOR_CODES = { + '11': 'Management', + '13': 'Business & Financial', + '15': 'Computer & Mathematical', + '17': 'Architecture & Engineering', + '19': 'Life, Physical, & Social Science', + 
'21': 'Community & Social Service', + '23': 'Legal', + '25': 'Education, Training, & Library', + '27': 'Arts, Design, & Media', + '29': 'Healthcare Practitioners', + '31': 'Healthcare Support', + '33': 'Protective Service', + '35': 'Food Preparation & Serving', + '37': 'Building & Grounds Maintenance', + '39': 'Personal Care & Service', + '41': 'Sales & Related', + '43': 'Office & Admin Support', + '45': 'Farming, Fishing, & Forestry', + '47': 'Construction & Extraction', + '49': 'Installation, Maintenance, & Repair', + '51': 'Production', + '53': 'Transportation & Material Moving', + '55': 'Military Specific', +} + +GRAY = {'50':'#f8fafc','100':'#f1f5f9','200':'#e2e8f0', + '300':'#cbd5e1','400':'#94a3b8','500':'#64748b', + '600':'#475569','700':'#334155','800':'#1e293b', + '900':'#0f172a','950':'#020617'} + +LIME = {'50': '#f7fee7','100': '#ecfcca','200': '#d8f999', + '300': '#bbf451','400': '#9ae600','500': '#83cd00', + '600': '#64a400','700': '#497d00','800': '#3c6300', + '900': '#35530e','950': '#192e03'} + + +def convert_to_minutes(qty, unit): + """Converts a quantity in a given unit to minutes.""" + return qty * { + "minute": 1, + "hour": 60, + "day": 60 * 24, + "week": 60 * 24 * 7, + "month": 60 * 24 * 30, + "trimester": 60 * 24 * 90, + "semester": 60 * 24 * 180, + "year": 60 * 24 * 365, + }[unit] + + +def pretty_display(df): + print(df) + # The HTML/Firefox preview below is currently unreachable; drop this early return to re-enable it. + return + html_output = df.to_html(index=False) + + # Create a temporary HTML file + with tempfile.NamedTemporaryFile(mode='w', suffix=".html", encoding="utf-8") as temp_file: + temp_file.write(html_output) + temp_file_path = temp_file.name + subprocess.run(["/home/felix/.nix-profile/bin/firefox-devedition", "-p", "Work (YouthAI)", temp_file_path], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + input("Press Enter to continue after reviewing the HTML output...") + + +def enrich( + model: str, + rpm: int, # Requests per minute + messages_to_process: List[List[Dict[str, str]]], + schema: Dict[str, Any], + chunk_size: int = 100, +): + all_results = [] + num_messages = len(messages_to_process) + if num_messages == 0: + return all_results + + num_chunks = math.ceil(num_messages / chunk_size) + logger.info(f"Starting enrichment for {num_messages} messages, in {num_chunks} chunks of up to {chunk_size} each.") + + # Calculate the time that should be allocated per request to respect the RPM limit. + time_per_request = 60.0 / rpm if rpm > 0 else 0 + + for i in tqdm(range(num_chunks), desc="Enriching data in chunks"): + chunk_start_time = time.time() + + start_index = i * chunk_size + end_index = start_index + chunk_size + message_chunk = messages_to_process[start_index:end_index] + + if not message_chunk: + continue + + try: + # Send requests for the entire chunk in a batch for better performance. + responses = litellm.batch_completion( + model=model, + messages=message_chunk, + response_format={ + "type": "json_schema", + "json_schema": schema, + }, + ) + + # batch_completion returns the response or an exception object for each message. + # We'll replace exceptions with None as expected by the calling functions. 
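+ # The returned list therefore stays index-aligned with messages_to_process: + # all_results[i] holds the model response for messages_to_process[i], or None if that request failed.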
+ for response in responses: + if isinstance(response, Exception): + logger.error(f"API call within batch failed: {response}") + all_results.append(None) + else: + all_results.append(response) + + except Exception as e: + # This catches catastrophic failures in batch_completion itself (e.g., auth) + logger.error(f"litellm.batch_completion call failed for chunk {i+1}/{num_chunks}: {e}") + all_results.extend([None] * len(message_chunk)) + + chunk_end_time = time.time() + elapsed_time = chunk_end_time - chunk_start_time + + # To enforce the rate limit, we calculate how long the chunk *should* have taken + # and sleep for the remainder of that time. + if time_per_request > 0: + expected_duration_for_chunk = len(message_chunk) * time_per_request + if elapsed_time < expected_duration_for_chunk: + sleep_duration = expected_duration_for_chunk - elapsed_time + logger.debug(f"Chunk processed in {elapsed_time:.2f}s. Sleeping for {sleep_duration:.2f}s to respect RPM.") + time.sleep(sleep_duration) + + return all_results + +def get_contrasting_text_color(bg_color_hex_or_rgba): + if isinstance(bg_color_hex_or_rgba, str): + rgba = mcolors.to_rgba(bg_color_hex_or_rgba) + else: + rgba = bg_color_hex_or_rgba + r, g, b, _ = rgba + luminance = 0.2126 * r + 0.7152 * g + 0.0722 * b + return 'black' if luminance > 0.55 else 'white' + + +def style_plot(): + """ + Applies a consistent and professional style to all plots. + This function sets matplotlib's rcParams for a global effect. + """ + mpl.rcParams.update({ + 'figure.facecolor': GRAY['50'], + 'figure.edgecolor': 'none', + 'figure.figsize': (12, 8), + 'figure.dpi': 150, + + 'axes.facecolor': GRAY['50'], + 'axes.edgecolor': GRAY['300'], + 'axes.grid': True, + 'axes.labelcolor': GRAY['800'], + 'axes.titlecolor': GRAY['900'], + 'axes.titlesize': 18, + 'axes.titleweight': 'bold', + 'axes.titlepad': 20, + 'axes.labelsize': 14, + 'axes.labelweight': 'semibold', + 'axes.labelpad': 10, + 'axes.spines.top': False, + 'axes.spines.right': False, + 'axes.spines.left': True, + 'axes.spines.bottom': True, + + 'text.color': GRAY['700'], + + 'xtick.color': GRAY['600'], + 'ytick.color': GRAY['600'], + 'xtick.labelsize': 12, + 'ytick.labelsize': 12, + 'xtick.major.size': 0, + 'ytick.major.size': 0, + 'xtick.minor.size': 0, + 'ytick.minor.size': 0, + 'xtick.major.pad': 8, + 'ytick.major.pad': 8, + + 'grid.color': GRAY['200'], + 'grid.linestyle': '--', + 'grid.linewidth': 1, + + 'legend.frameon': False, + 'legend.fontsize': 12, + 'legend.title_fontsize': 14, + 'legend.facecolor': 'inherit', + + 'font.family': 'sans-serif', + 'font.sans-serif': ['Inter'], + 'font.weight': 'normal', + + 'lines.linewidth': 2, + 'lines.markersize': 6, + }) + + # Seaborn specific styles + # Use shades of LIME as the primary color palette. + # Sorting by integer value of keys, and reversed to have darker shades first. + # Excluding very light colors that won't be visible on a light background. 
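+ # With the exclusions below, the palette resolves to LIME['600'], ['500'], ['400'], ['300'], ['200'], darkest usable shade first.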
+ lime_palette = [LIME[k] for k in sorted(LIME.keys(), key=int, reverse=True) if k not in ['50', '100', '700', '800', '900', '950',]] + + sns.set_palette(lime_palette) + sns.set_style("whitegrid", { + 'axes.edgecolor': GRAY['300'], + 'grid.color': GRAY['200'], + 'grid.linestyle': '--', + }) diff --git a/pyproject.toml b/pyproject.toml index f3c3df2..4e7ee61 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ dependencies = [ "python-dotenv>=1.1.1", "requests>=2.32.4", "rich>=14.0.0", + "scipy>=1.16.0", "seaborn>=0.13.2", ] diff --git a/uv.lock b/uv.lock index 3ffded7..8138021 100644 --- a/uv.lock +++ b/uv.lock @@ -1120,6 +1120,35 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/75/04/5302cea1aa26d886d34cadbf2dc77d90d7737e576c0065f357b96dc7a1a6/rpds_py-0.26.0-cp314-cp314t-win_amd64.whl", hash = "sha256:f14440b9573a6f76b4ee4770c13f0b5921f71dde3b6fcb8dabbefd13b7fe05d7", size = 232821, upload_time = "2025-07-01T15:55:55.167Z" }, ] +[[package]] +name = "scipy" +version = "1.16.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/81/18/b06a83f0c5ee8cddbde5e3f3d0bb9b702abfa5136ef6d4620ff67df7eee5/scipy-1.16.0.tar.gz", hash = "sha256:b5ef54021e832869c8cfb03bc3bf20366cbcd426e02a58e8a58d7584dfbb8f62", size = 30581216, upload_time = "2025-06-22T16:27:55.782Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/46/95/0746417bc24be0c2a7b7563946d61f670a3b491b76adede420e9d173841f/scipy-1.16.0-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:e9f414cbe9ca289a73e0cc92e33a6a791469b6619c240aa32ee18abdce8ab451", size = 36418162, upload_time = "2025-06-22T16:19:56.3Z" }, + { url = "https://files.pythonhosted.org/packages/19/5a/914355a74481b8e4bbccf67259bbde171348a3f160b67b4945fbc5f5c1e5/scipy-1.16.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:bbba55fb97ba3cdef9b1ee973f06b09d518c0c7c66a009c729c7d1592be1935e", size = 28465985, upload_time = "2025-06-22T16:20:01.238Z" }, + { url = "https://files.pythonhosted.org/packages/58/46/63477fc1246063855969cbefdcee8c648ba4b17f67370bd542ba56368d0b/scipy-1.16.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:58e0d4354eacb6004e7aa1cd350e5514bd0270acaa8d5b36c0627bb3bb486974", size = 20737961, upload_time = "2025-06-22T16:20:05.913Z" }, + { url = "https://files.pythonhosted.org/packages/93/86/0fbb5588b73555e40f9d3d6dde24ee6fac7d8e301a27f6f0cab9d8f66ff2/scipy-1.16.0-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:75b2094ec975c80efc273567436e16bb794660509c12c6a31eb5c195cbf4b6dc", size = 23377941, upload_time = "2025-06-22T16:20:10.668Z" }, + { url = "https://files.pythonhosted.org/packages/ca/80/a561f2bf4c2da89fa631b3cbf31d120e21ea95db71fd9ec00cb0247c7a93/scipy-1.16.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6b65d232157a380fdd11a560e7e21cde34fdb69d65c09cb87f6cc024ee376351", size = 33196703, upload_time = "2025-06-22T16:20:16.097Z" }, + { url = "https://files.pythonhosted.org/packages/11/6b/3443abcd0707d52e48eb315e33cc669a95e29fc102229919646f5a501171/scipy-1.16.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1d8747f7736accd39289943f7fe53a8333be7f15a82eea08e4afe47d79568c32", size = 35083410, upload_time = "2025-06-22T16:20:21.734Z" }, + { url = "https://files.pythonhosted.org/packages/20/ab/eb0fc00e1e48961f1bd69b7ad7e7266896fe5bad4ead91b5fc6b3561bba4/scipy-1.16.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = 
"sha256:eb9f147a1b8529bb7fec2a85cf4cf42bdfadf9e83535c309a11fdae598c88e8b", size = 35387829, upload_time = "2025-06-22T16:20:27.548Z" }, + { url = "https://files.pythonhosted.org/packages/57/9e/d6fc64e41fad5d481c029ee5a49eefc17f0b8071d636a02ceee44d4a0de2/scipy-1.16.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d2b83c37edbfa837a8923d19c749c1935ad3d41cf196006a24ed44dba2ec4358", size = 37841356, upload_time = "2025-06-22T16:20:35.112Z" }, + { url = "https://files.pythonhosted.org/packages/7c/a7/4c94bbe91f12126b8bf6709b2471900577b7373a4fd1f431f28ba6f81115/scipy-1.16.0-cp313-cp313-win_amd64.whl", hash = "sha256:79a3c13d43c95aa80b87328a46031cf52508cf5f4df2767602c984ed1d3c6bbe", size = 38403710, upload_time = "2025-06-22T16:21:54.473Z" }, + { url = "https://files.pythonhosted.org/packages/47/20/965da8497f6226e8fa90ad3447b82ed0e28d942532e92dd8b91b43f100d4/scipy-1.16.0-cp313-cp313t-macosx_10_14_x86_64.whl", hash = "sha256:f91b87e1689f0370690e8470916fe1b2308e5b2061317ff76977c8f836452a47", size = 36813833, upload_time = "2025-06-22T16:20:43.925Z" }, + { url = "https://files.pythonhosted.org/packages/28/f4/197580c3dac2d234e948806e164601c2df6f0078ed9f5ad4a62685b7c331/scipy-1.16.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:88a6ca658fb94640079e7a50b2ad3b67e33ef0f40e70bdb7dc22017dae73ac08", size = 28974431, upload_time = "2025-06-22T16:20:51.302Z" }, + { url = "https://files.pythonhosted.org/packages/8a/fc/e18b8550048d9224426e76906694c60028dbdb65d28b1372b5503914b89d/scipy-1.16.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:ae902626972f1bd7e4e86f58fd72322d7f4ec7b0cfc17b15d4b7006efc385176", size = 21246454, upload_time = "2025-06-22T16:20:57.276Z" }, + { url = "https://files.pythonhosted.org/packages/8c/48/07b97d167e0d6a324bfd7484cd0c209cc27338b67e5deadae578cf48e809/scipy-1.16.0-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:8cb824c1fc75ef29893bc32b3ddd7b11cf9ab13c1127fe26413a05953b8c32ed", size = 23772979, upload_time = "2025-06-22T16:21:03.363Z" }, + { url = "https://files.pythonhosted.org/packages/4c/4f/9efbd3f70baf9582edf271db3002b7882c875ddd37dc97f0f675ad68679f/scipy-1.16.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:de2db7250ff6514366a9709c2cba35cb6d08498e961cba20d7cff98a7ee88938", size = 33341972, upload_time = "2025-06-22T16:21:11.14Z" }, + { url = "https://files.pythonhosted.org/packages/3f/dc/9e496a3c5dbe24e76ee24525155ab7f659c20180bab058ef2c5fa7d9119c/scipy-1.16.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e85800274edf4db8dd2e4e93034f92d1b05c9421220e7ded9988b16976f849c1", size = 35185476, upload_time = "2025-06-22T16:21:19.156Z" }, + { url = "https://files.pythonhosted.org/packages/ce/b3/21001cff985a122ba434c33f2c9d7d1dc3b669827e94f4fc4e1fe8b9dfd8/scipy-1.16.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4f720300a3024c237ace1cb11f9a84c38beb19616ba7c4cdcd771047a10a1706", size = 35570990, upload_time = "2025-06-22T16:21:27.797Z" }, + { url = "https://files.pythonhosted.org/packages/e5/d3/7ba42647d6709251cdf97043d0c107e0317e152fa2f76873b656b509ff55/scipy-1.16.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:aad603e9339ddb676409b104c48a027e9916ce0d2838830691f39552b38a352e", size = 37950262, upload_time = "2025-06-22T16:21:36.976Z" }, + { url = "https://files.pythonhosted.org/packages/eb/c4/231cac7a8385394ebbbb4f1ca662203e9d8c332825ab4f36ffc3ead09a42/scipy-1.16.0-cp313-cp313t-win_amd64.whl", hash = "sha256:f56296fefca67ba605fd74d12f7bd23636267731a72cb3947963e76b8c0a25db", size = 
38515076, upload_time = "2025-06-22T16:21:45.694Z" }, +] + [[package]] name = "seaborn" version = "0.13.2" @@ -1168,6 +1197,7 @@ dependencies = [ { name = "python-dotenv" }, { name = "requests" }, { name = "rich" }, + { name = "scipy" }, { name = "seaborn" }, ] @@ -1184,6 +1214,7 @@ requires-dist = [ { name = "python-dotenv", specifier = ">=1.1.1" }, { name = "requests", specifier = ">=2.32.4" }, { name = "rich", specifier = ">=14.0.0" }, + { name = "scipy", specifier = ">=1.16.0" }, { name = "seaborn", specifier = ">=0.13.2" }, ]
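
# A minimal sketch for spot-checking the new dist/intermediate artifacts
# (assumes only the paths and columns created in Runner.run above):
import pandas as pd

tasks = pd.read_parquet("dist/intermediate/estimable_tasks_with_estimates.parquet")
# Mirror the invariants enforced by Runner._check_for_insanity: strictly positive bounds, lb <= ub.
assert (tasks["lb_estimate_in_minutes"] > 0).all()
assert (tasks["lb_estimate_in_minutes"] <= tasks["ub_estimate_in_minutes"]).all()
print(tasks["estimate_midpoint"].describe())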