wip

2025-07-15 00:34:54 +02:00 · 2025-07-15 00:34:54 +02:00 · 65dc648797
commit 65dc648797
parent 62296e1b69
37 changed files with 1413 additions and 2433 deletions
--- a/agents.md
+++ b/agents.md
@ -1,2 +1,3 @@
 - I use Nix. To run a command, prefix them with `nix develop .#impure -c`
 - I use uv. To add a package, use: uv add. To run a script use: uv run path/to/script
+- To run the pipeline: `uv run -m pipeline.runner`
--- a/dist/estimate_distribution_histplot.png
+++ b/dist/estimate_distribution_histplot.png
--- a/dist/estimates_lower_vs_upper_scatter.png
+++ b/dist/estimates_lower_vs_upper_scatter.png
--- a/dist/estimates_spread_per_occupation.png
+++ b/dist/estimates_spread_per_occupation.png
--- a/dist/intermediate/df_tasks.parquet
+++ b/dist/intermediate/df_tasks.parquet
--- a/dist/intermediate/estimable_tasks_with_estimates.parquet
+++ b/dist/intermediate/estimable_tasks_with_estimates.parquet
--- a/dist/intermediate/task_summary_by_major_occupation.parquet
+++ b/dist/intermediate/task_summary_by_major_occupation.parquet
--- a/dist/intermediate/task_summary_by_occupation.parquet
+++ b/dist/intermediate/task_summary_by_occupation.parquet
--- a/dist/projected_automatable_wage_bill_sensitivity.png
+++ b/dist/projected_automatable_wage_bill_sensitivity.png
--- a/dist/projected_task_automation_p50.png
+++ b/dist/projected_task_automation_p50.png
--- a/dist/projected_task_automation_p80.png
+++ b/dist/projected_task_automation_p80.png
--- a/dist/sequential_coherence_cdf.png
+++ b/dist/sequential_coherence_cdf.png
--- a/old/add_task_estimates.py
+++ b/old/add_task_estimates.py
@ -1,507 +0,0 @@
-import pandas as pd
-import litellm
-import dotenv
-import os
-import time
-import json
-import math
-import numpy as np
-
-# --- Configuration ---
-MODEL = "gpt-4.1-mini"  # Make sure this model supports json_schema or structured output
-RATE_LIMIT = 5000  # Requests per minute
-CHUNK_SIZE = 300
-SECONDS_PER_MINUTE = 60
-FILENAME = (
-    "tasks_with_estimates.csv"  # This CSV should contain the tasks to be processed
-)
-
-# --- Prompts and Schema ---
-SYSTEM_PROMPT = """
-You are an expert assistant evaluating the time to completion required for job tasks. Your goal is to estimate the time range needed for a skilled human to complete the following job task remotely, without supervision.
-
-Provide a lower and upper bound estimate for the time to completion time. These bounds should capture the time within which approximately 80% of instances of performing this specific task are typically completed by a qualified individual.
-
-Base your estimate on the provided task description, its associated activities, and the occupational context. Your estimate must be in one the allowed units: minute, hour, day, week, month, trimester, semester, year.
-""".strip()
-
-USER_MESSAGE_TEMPLATE = """
-Please estimate the time range for the following remote task:
-
-**Task Description:** {task}
-**Relevant activies for the task:**
-{dwas}
-
-**Occupation Category:** {occupation_title}
-**Occupation Description:** {occupation_description}
-
-Consider the complexity and the typical steps involved.
-""".strip()
-
-ALLOWED_UNITS = [
-    "minute",
-    "hour",
-    "day",
-    "week",
-    "month",
-    "trimester",
-    "semester",
-    "year",
-]
-
-SCHEMA_FOR_VALIDATION = {
-    "name": "estimate_time",
-    "strict": True,  # Enforce schema adherence
-    "schema": {
-        "type": "object",
-        "properties": {
-            "lower_bound_estimate": {
-                "type": "object",
-                "properties": {
-                    "quantity": {
-                        "type": "number",
-                        "description": "The numerical value for the lower bound of the estimate.",
-                    },
-                    "unit": {
-                        "type": "string",
-                        "enum": ALLOWED_UNITS,
-                        "description": "The unit of time for the lower bound.",
-                    },
-                },
-                "required": ["quantity", "unit"],
-                "additionalProperties": False,
-            },
-            "upper_bound_estimate": {
-                "type": "object",
-                "properties": {
-                    "quantity": {
-                        "type": "number",
-                        "description": "The numerical value for the upper bound of the estimate.",
-                    },
-                    "unit": {
-                        "type": "string",
-                        "enum": ALLOWED_UNITS,
-                        "description": "The unit of time for the upper bound.",
-                    },
-                },
-                "required": ["quantity", "unit"],
-                "additionalProperties": False,
-            },
-        },
-        "required": ["lower_bound_estimate", "upper_bound_estimate"],
-        "additionalProperties": False,
-    },
-}
-
-
-def save_dataframe(df_to_save, filename):
-
-    """Saves the DataFrame to the specified CSV file using atomic write."""
-    try:
-        temp_filename = filename + ".tmp"
-        df_to_save.to_csv(temp_filename, encoding="utf-8-sig", index=False)
-        os.replace(temp_filename, filename)
-    except Exception as e:
-        print(f"--- Error saving DataFrame to {filename}: {e} ---")
-        if os.path.exists(temp_filename):
-            try:
-                os.remove(temp_filename)
-            except Exception as remove_err:
-                print(
-                    f"--- Error removing temporary save file {temp_filename}: {remove_err} ---"
-                )
-
-def create_task_estimates():
-    try:
-        # Read the CSV
-        if os.path.exists(FILENAME):
-            df = pd.read_csv(FILENAME, encoding="utf-8-sig")
-            print(f"Successfully read {len(df)} rows from {FILENAME}.")
-
-            estimate_columns_spec = {
-                "lb_estimate_qty": float,
-                "lb_estimate_unit": object,
-                "ub_estimate_qty": float,
-                "ub_estimate_unit": object,
-            }
-            save_needed = False
-
-            for col_name, target_dtype in estimate_columns_spec.items():
-                if col_name not in df.columns:
-                    # Initialize with a type-compatible missing value
-                    if target_dtype == float:
-                        df[col_name] = np.nan
-                    else:  # object
-                        df[col_name] = pd.NA
-                    df[col_name] = df[col_name].astype(target_dtype)  # Enforce dtype
-                    print(f"Added '{col_name}' column as {df[col_name].dtype}.")
-                    save_needed = True
-                else:
-                    # Column exists, ensure correct dtype
-                    current_pd_dtype = df[col_name].dtype
-                    expected_pd_dtype = pd.Series(dtype=target_dtype).dtype
-
-                    if current_pd_dtype != expected_pd_dtype:
-                        try:
-                            if target_dtype == float:
-                                df[col_name] = pd.to_numeric(df[col_name], errors="coerce")
-                            else:  # object
-                                df[col_name] = df[col_name].astype(object)
-                            print(
-                                f"Corrected dtype of '{col_name}' to {df[col_name].dtype}."
-                            )
-                            save_needed = True
-                        except Exception as e:
-                            print(
-                                f"Warning: Could not convert column '{col_name}' to {target_dtype}: {e}. Current dtype: {current_pd_dtype}"
-                            )
-
-                # Standardize missing values (e.g., empty strings to NA/NaN)
-                # Replace common missing placeholders with pd.NA first
-                df[col_name].replace(["", None, ""], pd.NA, inplace=True)
-                if target_dtype == float:
-                    # For float columns, ensure they are numeric and use np.nan after replacement
-                    df[col_name] = pd.to_numeric(df[col_name], errors="coerce")
-
-            if save_needed:
-                print(f"Saving {FILENAME} after adding/adjusting estimate columns.")
-                save_dataframe(df, FILENAME)
-        else:
-            print(
-                f"Error: {FILENAME} not found. Please ensure the file exists and contains task data."
-            )
-            exit()
-    except FileNotFoundError:
-        print(
-            f"Error: {FILENAME} not found. Please ensure the file exists and contains task data."
-        )
-        exit()
-    except Exception as e:
-        print(f"Error reading or initializing {FILENAME}: {e}")
-        exit()
-
-    # --- Identify Rows to Process ---
-    # We'll check for NaN in one of the primary quantity columns.
-    unprocessed_mask = df["lb_estimate_qty"].isna()
-    if unprocessed_mask.any():
-        start_index = unprocessed_mask.idxmax()  # Finds the index of the first True value
-        print(f"Resuming processing. First unprocessed row found at index {start_index}.")
-        df_to_process = df.loc[unprocessed_mask].copy()
-        original_indices = df_to_process.index  # Keep track of original indices
-    else:
-        print(
-            "All rows seem to have estimates already (based on 'lb_estimate_qty'). Exiting."
-        )
-        exit()
-
-
-    # --- Prepare messages for batch completion (only for rows needing processing) ---
-    messages_list = []
-    skipped_rows_indices = []
-    valid_original_indices = []
-
-    if not df_to_process.empty:
-        required_cols = ["task", "occupation_title", "occupation_description", "dwas"]
-        print(
-            f"Preparing messages for up to {len(df_to_process)} rows starting from original index {original_indices[0] if len(original_indices) > 0 else 'N/A'}..."
-        )
-        print(f"Checking for required columns: {required_cols}")
-
-        for index, row in df_to_process.iterrows():
-            missing_or_empty = []
-            for col in required_cols:
-                if col not in row or pd.isna(row[col]) or str(row[col]).strip() == "":
-                    missing_or_empty.append(col)
-
-            if missing_or_empty:
-                print(
-                    f"Warning: Skipping row original index {index} due to missing/empty required data in columns: {', '.join(missing_or_empty)}."
-                )
-                skipped_rows_indices.append(index)
-                continue
-
-            try:
-                user_message = USER_MESSAGE_TEMPLATE.format(
-                    task=row["task"],
-                    occupation_title=row["occupation_title"],
-                    occupation_description=row["occupation_description"],
-                    dwas=row["dwas"],
-                )
-            except KeyError as e:
-                print(
-                    f"Error: Skipping row original index {index} due to formatting error - missing key: {e}. Check USER_MESSAGE_TEMPLATE and CSV columns."
-                )
-                skipped_rows_indices.append(index)
-                continue
-
-            messages_for_row = [
-                {"role": "system", "content": SYSTEM_PROMPT},
-                {"role": "user", "content": user_message},
-            ]
-            messages_list.append(messages_for_row)
-            valid_original_indices.append(index)  # This is the original DataFrame index
-
-        print(
-            f"Prepared {len(messages_list)} valid message sets for batch completion (skipped {len(skipped_rows_indices)} rows)."
-        )
-        if not messages_list:
-            print("No valid rows found to process after checking required data. Exiting.")
-            exit()
-    else:
-        print(
-            "No rows found needing processing (df_to_process is empty)."
-        )  # Should have been caught by earlier check
-        exit()
-
-
-    # --- Call batch_completion in chunks with rate limiting and periodic saving ---
-    total_messages_to_send = len(messages_list)
-    num_chunks = math.ceil(total_messages_to_send / CHUNK_SIZE)
-
-    print(
-        f"\nStarting batch completion for {total_messages_to_send} items in {num_chunks} chunks..."
-    )
-
-    overall_start_time = time.time()
-    processed_count_total = 0
-
-    for i in range(num_chunks):
-        chunk_start_message_index = i * CHUNK_SIZE
-        chunk_end_message_index = min((i + 1) * CHUNK_SIZE, total_messages_to_send)
-        message_chunk = messages_list[chunk_start_message_index:chunk_end_message_index]
-        # Get corresponding original DataFrame indices for this chunk
-        chunk_original_indices = valid_original_indices[
-            chunk_start_message_index:chunk_end_message_index
-        ]
-
-        if not message_chunk:
-            continue
-
-        min_idx_disp = min(chunk_original_indices) if chunk_original_indices else "N/A"
-        max_idx_disp = max(chunk_original_indices) if chunk_original_indices else "N/A"
-        print(
-            f"\nProcessing chunk {i + 1}/{num_chunks} (Messages {chunk_start_message_index + 1}-{chunk_end_message_index} of this run)..."
-            f" Corresponding to original indices: {min_idx_disp} - {max_idx_disp}"
-        )
-        chunk_start_time = time.time()
-        responses = []
-        try:
-            print(f"Sending {len(message_chunk)} requests for chunk {i + 1}...")
-            responses = litellm.batch_completion(
-                model=MODEL,
-                messages=message_chunk,
-                response_format={
-                    "type": "json_schema",
-                    "json_schema": SCHEMA_FOR_VALIDATION,
-                },
-                num_retries=3,
-                # request_timeout=60 # Optional: uncomment if needed
-            )
-            print(f"Chunk {i + 1} API call completed.")
-
-        except Exception as e:
-            print(f"Error during litellm.batch_completion for chunk {i + 1}: {e}")
-            responses = [None] * len(
-                message_chunk
-            )  # Ensure responses list matches message_chunk length for processing loop
-
-        # --- Process responses for the current chunk ---
-        chunk_updates = {}  # To store {original_df_index: {qty/unit data}}
-        successful_in_chunk = 0
-        failed_in_chunk = 0
-
-        if responses and len(responses) == len(message_chunk):
-            for j, response in enumerate(responses):
-                original_df_index = chunk_original_indices[j]
-
-                # Initialize values for this item
-                lb_qty_val, lb_unit_val, ub_qty_val, ub_unit_val = None, None, None, None
-                content_str = None
-
-                if response is None:
-                    print(
-                        f"Skipping processing for original index {original_df_index} due to API call failure for this item (response is None)."
-                    )
-                    failed_in_chunk += 1
-                    continue
-
-                try:
-                    if (
-                        response.choices
-                        and response.choices[0].message
-                        and response.choices[0].message.content
-                    ):
-                        content_str = response.choices[0].message.content
-                        estimate_data = json.loads(content_str)  # Can raise JSONDecodeError
-
-                        lower_bound_dict = estimate_data.get("lower_bound_estimate")
-                        upper_bound_dict = estimate_data.get("upper_bound_estimate")
-
-                        valid_response_structure = isinstance(
-                            lower_bound_dict, dict
-                        ) and isinstance(upper_bound_dict, dict)
-
-                        if valid_response_structure:
-                            lb_qty_raw = lower_bound_dict.get("quantity")
-                            lb_unit_raw = lower_bound_dict.get("unit")
-                            ub_qty_raw = upper_bound_dict.get("quantity")
-                            ub_unit_raw = upper_bound_dict.get("unit")
-
-                            is_valid_item = True
-                            # Validate LB Qty
-                            if (
-                                not isinstance(lb_qty_raw, (int, float))
-                                or math.isnan(float(lb_qty_raw))
-                                or float(lb_qty_raw) < 0
-                            ):
-                                print(
-                                    f"Warning: Invalid lb_quantity for original index {original_df_index}: {lb_qty_raw}"
-                                )
-                                is_valid_item = False
-                            else:
-                                lb_qty_val = float(lb_qty_raw)
-
-                            # Validate UB Qty
-                            if (
-                                not isinstance(ub_qty_raw, (int, float))
-                                or math.isnan(float(ub_qty_raw))
-                                or float(ub_qty_raw) < 0
-                            ):
-                                print(
-                                    f"Warning: Invalid ub_quantity for original index {original_df_index}: {ub_qty_raw}"
-                                )
-                                is_valid_item = False
-                            else:
-                                ub_qty_val = float(ub_qty_raw)
-
-                            # Validate Units
-                            if lb_unit_raw not in ALLOWED_UNITS:
-                                print(
-                                    f"Warning: Invalid lb_unit for original index {original_df_index}: '{lb_unit_raw}'"
-                                )
-                                is_valid_item = False
-                            else:
-                                lb_unit_val = lb_unit_raw
-
-                            if ub_unit_raw not in ALLOWED_UNITS:
-                                print(
-                                    f"Warning: Invalid ub_unit for original index {original_df_index}: '{ub_unit_raw}'"
-                                )
-                                is_valid_item = False
-                            else:
-                                ub_unit_val = ub_unit_raw
-
-                            if is_valid_item:
-                                successful_in_chunk += 1
-                                chunk_updates[original_df_index] = {
-                                    "lb_estimate_qty": lb_qty_val,
-                                    "lb_estimate_unit": lb_unit_val,
-                                    "ub_estimate_qty": ub_qty_val,
-                                    "ub_estimate_unit": ub_unit_val,
-                                }
-                            else:
-                                failed_in_chunk += (
-                                    1  # Values remain None if not fully valid
-                                )
-                        else:
-                            print(
-                                f"Warning: Missing or malformed estimate dicts in JSON for original index {original_df_index}. Content: '{content_str}'"
-                            )
-                            failed_in_chunk += 1
-                    else:
-                        finish_reason = (
-                            response.choices[0].finish_reason
-                            if (response.choices and response.choices[0].finish_reason)
-                            else "unknown"
-                        )
-                        error_message = (
-                            response.choices[0].message.content
-                            if (
-                                response.choices
-                                and response.choices[0].message
-                                and response.choices[0].message.content
-                            )
-                            else "No content in message."
-                        )
-                        print(
-                            f"Warning: Received non-standard or empty response content for original index {original_df_index}. "
-                            f"Finish Reason: '{finish_reason}'. Message: '{error_message}'. Raw Choices: {response.choices}"
-                        )
-                        failed_in_chunk += 1
-
-                except json.JSONDecodeError:
-                    print(
-                        f"Warning: Could not decode JSON for original index {original_df_index}. Content received: '{content_str}'"
-                    )
-                    failed_in_chunk += 1
-                except AttributeError as ae:
-                    print(
-                        f"Warning: Missing expected attribute processing response for original index {original_df_index}: {ae}. Response: {response}"
-                    )
-                    failed_in_chunk += 1
-                except Exception as e:
-                    print(
-                        f"Warning: An unexpected error occurred processing response for original index {original_df_index}: {type(e).__name__} - {e}. Response: {response}"
-                    )
-                    failed_in_chunk += 1
-        else:
-            print(
-                f"Warning: Mismatch between number of responses ({len(responses) if responses else 0}) "
-                f"and messages sent ({len(message_chunk)}) for chunk {i + 1}, or no responses. Marking all as failed."
-            )
-            failed_in_chunk = len(
-                message_chunk
-            )  # All items in this chunk are considered failed if response array is problematic
-
-        print(
-            f"Chunk {i + 1} processing summary: Success={successful_in_chunk}, Failed/Skipped={failed_in_chunk}"
-        )
-        processed_count_total += successful_in_chunk
-
-        # --- Update Main DataFrame and Save Periodically ---
-        if chunk_updates:
-            print(
-                f"Updating main DataFrame with {len(chunk_updates)} new estimates for chunk {i + 1}..."
-            )
-            for idx, estimates in chunk_updates.items():
-                if idx in df.index:
-                    df.loc[idx, "lb_estimate_qty"] = estimates["lb_estimate_qty"]
-                    df.loc[idx, "lb_estimate_unit"] = estimates["lb_estimate_unit"]
-                    df.loc[idx, "ub_estimate_qty"] = estimates["ub_estimate_qty"]
-                    df.loc[idx, "ub_estimate_unit"] = estimates["ub_estimate_unit"]
-
-            print(f"Saving progress to {FILENAME}...")
-            save_dataframe(df, FILENAME)
-        else:
-            print(f"No successful estimates obtained in chunk {i + 1} to save.")
-
-        # --- Rate Limiting Pause ---
-        chunk_end_time = time.time()
-        chunk_duration = chunk_end_time - chunk_start_time
-        print(f"Chunk {i + 1} took {chunk_duration:.2f} seconds.")
-
-        if i < num_chunks - 1:  # No pause after the last chunk
-            # Calculate ideal time per request based on rate limit
-            time_per_request = SECONDS_PER_MINUTE / RATE_LIMIT if RATE_LIMIT > 0 else 0
-            # Calculate minimum duration this chunk should have taken to respect rate limit
-            min_chunk_duration_for_rate = len(message_chunk) * time_per_request
-            # Calculate pause needed
-            pause_needed = max(0, min_chunk_duration_for_rate - chunk_duration)
-
-            if pause_needed > 0:
-                print(
-                    f"Pausing for {pause_needed:.2f} seconds to respect rate limit ({RATE_LIMIT}/min)..."
-                )
-                time.sleep(pause_needed)
-
-    overall_end_time = time.time()
-    total_duration_minutes = (overall_end_time - overall_start_time) / 60
-    print(
-        f"\nBatch completion finished."
-        f" Processed {processed_count_total} new estimates in this run in {total_duration_minutes:.2f} minutes."
-    )
-
-    print(f"Performing final save to {FILENAME}...")
-    save_dataframe(df, FILENAME)
-
-    print("\nScript finished.")
--- a/old/analysis.py
+++ b/old/analysis.py
@ -1,528 +0,0 @@
-import os
-import litellm
-import sqlite3
-import numpy as np
-import pandas as pd
-from google.colab import userdata, files
-import seaborn as sns
-import matplotlib.pyplot as plt
-import matplotlib as mpl
-
-os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')
-os.environ['GEMINI_API_KEY'] = userdata.get('GEMINI_API_KEY')
-
-occupation_major_codes = {
-    '11': 'Management',
-    '13': 'Business and Financial Operations',
-    '15': 'Computer and Mathematical Occupations',
-    '17': 'Architecture and Engineering',
-    '19': 'Life, Physical, and Social Science',
-    '21': 'Community and Social Services',
-    '23': 'Legal',
-    '25': 'Education, Training, and Library',
-    '27': 'Arts, Design, Entertainment, Sports, and Media',
-    '29': 'Healthcare Practitioners and Technical',
-    '31': 'Healthcare Support',
-    '33': 'Protective Service',
-    '35': 'Food Preparation and Serving Related',
-    '37': 'Building and Grounds Cleaning and Maintenance',
-    '39': 'Personal Care and Service',
-    '41': 'Sales and Related',
-    '43': 'Office and Administrative Support',
-    '45': 'Farming, Fishing, and Forestry',
-    '47': 'Construction and Extraction',
-    '49': 'Installation, Maintenance, and Repair',
-    '51': 'Production',
-    '53': 'Transportation and Material Moving',
-    '55': 'Military Specific'
-}
-
-gray   = {'50':'#f8fafc','100':'#f1f5f9','200':'#e2e8f0',
-                   '300':'#cbd5e1','400':'#94a3b8','500':'#64748b',
-                   '600':'#475569','700':'#334155','800':'#1e293b',
-                   '900':'#0f172a','950':'#020617'}
-lime            = {'50': '#f7fee7','100': '#ecfcca','200': '#d8f999',
-                   '300': '#bbf451','400': '#9ae600','500': '#83cd00',
-                   '600': '#64a400','700': '#497d00','800': '#3c6300',
-                   '900': '#35530e','950': '#192e03'}
-
-mpl.rcParams.update({
-    'figure.facecolor' : gray['50'],
-    'axes.facecolor'   : gray['50'],
-    'axes.edgecolor'   : gray['100'],
-    'axes.labelcolor'  : gray['700'],
-    'xtick.color'      : gray['700'],
-    'ytick.color'      : gray['700'],
-    'font.family'      : 'Inter',  # falls back to DejaVu if Inter not present
-    'font.size'        : 11,
-})
-
-sns.set_style("white")         # keep minimal axes, we will remove default grid
-sns.set_context("notebook")
-
-def prepare_tasks():
-    # This dataset comes from https://epoch.ai/gradient-updates/consequences-of-automating-remote-work
-    # It contains labels for a O*NET task can be done remotely or not (labeled by GPT-4o)
-    # You can download it here: https://drive.google.com/file/d/1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r/view?usp=sharing
-    df_remote_status = pd.read_csv("epoch_task_data.csv")
-
-    # BLS OEWS: Https://www.bls.gov/oes/special-requests/oesm23nat.zip
-    df_oesm = pd.read_excel("oesm23national.xlsx")
-
-    # Run uv run ./enrich_task_ratings.py
-    df_tasks = pd.read_json("task_ratings_enriched.json")
-
-    # Run uv run classify_estimateability_of_tasks.py
-    df_task_estimateable = pd.read_csv("tasks_estimateable.csv").rename(columns={"task_estimateable": "estimateable"}).drop_duplicates(subset=['task'], keep='first')
-
-    # df_tasks now has a remote_status column which contains either "remote" or "not remote"
-    df_tasks = pd.merge(df_tasks, df_remote_status[['Task', 'Remote']], left_on='task', right_on='Task', how='left')
-    df_tasks = df_tasks.drop('Task', axis=1).rename(columns={'Remote': 'remote_status'})
-
-    # df_tasks now has a estimateable column which contains either "ATOMIC" or "ONGOING-CONSTRAINT"
-    df_tasks = pd.merge(df_tasks, df_task_estimateable[['task', 'estimateable']], on='task', how='left')
-
-    df_tasks = df_tasks[df_tasks['importance_average'] < 3].copy()
-
-    df_tasks['onetsoc_major'] = df_tasks['onetsoc_code'].str[:2]
-
-    df_remote_tasks = df_tasks[df_tasks['remote_status'] == 'remote'].copy()
-
-    # Call create_task_estimates() from add_task_estimates? which creates tasks_with_estimates.csv
-
-def preprocessing_time_estimates():
-    df = pd.read_csv("tasks_with_estimates.csv")
-
-    df = df[df['importance_average'] > 3].copy()
-
-    # The embeddings comes from running `uv run ./embed_task_description.py`
-    # Columns: ['embedding_id', 'task', 'embedding_vector']
-    # These contain embedding for UNIQUE tasks
-    df_task_embeddings = pd.read_parquet("tasks_with_embeddings.parquet").drop_duplicates(subset=['task'])[['task', 'task_embedding']].rename(columns={"task_embedding": "embedding_vector"}).copy()
-
-    df = pd.merge(df, df_task_embeddings[['task', 'embedding_vector']], on='task', how='left')
-    df = pd.merge(df, df_task_estimateable[['task', 'estimateable']], on='task', how='left')
-
-    df['onetsoc_major'] = df['onetsoc_code'].str[:2]
-
-    def convert_to_minutes(qty, unit):
-        """Converts a quantity in a given unit to minutes."""
-        return qty * {
-            "minute": 1,
-            "hour": 60,
-            "day": 60 * 24,
-            "week": 60 * 24 * 7,
-            "month": 60 * 24 * 30,
-            "trimester": 60 * 24 * 90,
-            "semester": 60 * 24 * 180,
-            "year": 60 * 24 * 365,
-        }[unit]
-
-    df['lb_estimate_in_minutes'] = df.apply(
-        lambda row: convert_to_minutes(row['lb_estimate_qty'], row['lb_estimate_unit']), axis=1
-    )
-    df['ub_estimate_in_minutes'] = df.apply(
-        lambda row: convert_to_minutes(row['ub_estimate_qty'], row['ub_estimate_unit']), axis=1
-    )
-
-    df['estimate_range'] = df.ub_estimate_in_minutes - df.lb_estimate_in_minutes
-    df['estimate_ratio'] = df.ub_estimate_in_minutes / df.lb_estimate_in_minutes
-    df['estimate_midpoint'] = (df.lb_estimate_in_minutes + df.ub_estimate_in_minutes)/2
-
-    atomic_tasks = df[df['estimateable'] == 'ATOMIC']
-    ongoing_tasks = df[df['estimateable'] == 'ONGOING-CONSTRAINT']
-
-    with pd.option_context('display.max_columns', None):
-      display(df)
-
-    # Check for empty estimates
-    if atomic_tasks['lb_estimate_in_minutes'].isnull().sum() > 0:
-        print("Missing values in 'lb_estimate_in_minutes':", atomic_tasks['lb_estimate_in_minutes'].isnull().sum())
-
-    if atomic_tasks['ub_estimate_in_minutes'].isnull().sum() > 0:
-        print("Missing values in 'ub_estimate_in_minutes':", atomic_tasks['ub_estimate_in_minutes'].isnull().sum())
-
-    # Check for impossible bounds
-    impossible_bounds = atomic_tasks[
-        (atomic_tasks['lb_estimate_in_minutes'] <= 0) |
-        (atomic_tasks['ub_estimate_in_minutes'] <= 0) |
-        (atomic_tasks['lb_estimate_in_minutes'] > atomic_tasks['ub_estimate_in_minutes'])
-    ]
-    if not impossible_bounds.empty:
-        print(f"Error: Found rows with impossible bounds.")
-        with pd.option_context('display.max_colwidth', None):
-        display(impossible_bounds[['task', 'lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'dwas']])
-
-    #with pd.option_context('display.max_colwidth', None):
-        #display(atomic_tasks.nlargest(20, 'ub_estimate_in_minutes')[['task', 'lb_estimate_qty', 'lb_estimate_unit', 'lb_estimate_in_minutes', 'ub_estimate_qty', 'ub_estimate_unit', 'ub_estimate_in_minutes', 'estimate_ratio']])
-
-def cell1():
-    sns.histplot(atomic_tasks.estimate_midpoint, log_scale=True)
-
-def cell2():
-    plt.figure(figsize=(14,10))
-    sns.boxplot(
-        data=atomic_tasks,
-        x='onetsoc_major',           # 11 = Management, 15 = Computer/Math, …
-        y='estimate_range',
-        showfliers=False
-    )
-    plt.yscale('log')                # long tail => log scale
-    plt.xlabel('Occupation')
-    plt.ylabel('Range (upper-lower, minutes)')
-    plt.title('Spread of time-range estimates per occupation')
-
-    ax = plt.gca()
-    ax.set_xticklabels([occupation_major_codes[code.get_text()] for code in ax.get_xticklabels()], rotation=60, ha='right')
-
-def cell3():
-    plt.figure(figsize=(10, 10))
-    ax = sns.scatterplot(
-            data=atomic_tasks.replace({'onetsoc_major': occupation_major_codes}),  # Replace codes with labels
-            x='lb_estimate_in_minutes', y='ub_estimate_in_minutes',
-            alpha=0.2, edgecolor=None, hue="onetsoc_major"  # Use the labeled column for hue
-        )
-
-    # 45° reference
-    lims = (1, atomic_tasks[['lb_estimate_in_minutes','ub_estimate_in_minutes']].max().max())
-    ax.plot(lims, lims, color='black', linestyle='--', linewidth=1)
-
-    # optional helper lines: 2× and 10×, 100× ratios
-    for k in [2,10, 100]:
-        ax.plot(lims, [k*l for l in lims],
-                linestyle=':', color='grey', linewidth=1)
-
-    ax.set(xscale='log', yscale='log')
-    ax.set_xlabel('Lower-bound (min, log scale)')
-    ax.set_ylabel('Upper-bound (min, log scale)')
-    ax.set_title('Lower vs upper estimates for all tasks')
-
-    # Place the legend outside the plot
-    ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
-
-def cell4():
-    plt.figure(figsize=(8,4))
-    sns.histplot(np.log10(atomic_tasks['estimate_ratio'].replace([np.inf, -np.inf], np.nan).dropna()),
-                bins=60, kde=True)
-    plt.axvline(np.log10(10), color='red', ls='--', lw=1, label='10×')
-    plt.axvline(np.log10(1.05), color='orange', ls='--', lw=1, label='1.05×')
-    plt.axvline(0, color='black', ls='-', lw=1)          # ub = lb
-    plt.xlabel('log₁₀(upper / lower)')
-    plt.ylabel('Count')
-    plt.title('Distribution of upper:lower ratio')
-    plt.legend()
-    plt.tight_layout()
-
-
-def cell5():
-    # 1. Bin lower bounds into quartiles (Q1–Q4)
-    atomic_tasks['lb_q'] = pd.qcut(atomic_tasks.lb_estimate_in_minutes,
-                        q=4, labels=['Q1 shortest','Q2','Q3','Q4 longest'])
-
-
-    # 3. Aggregate: median (or mean) ratio per cell
-    pivot = atomic_tasks.pivot_table(index='onetsoc_major', columns='lb_q',
-                        values='estimate_ratio', aggfunc='median')
-
-    # Map the index (onetsoc_major codes) to their corresponding labels
-    pivot.index = pivot.index.map(occupation_major_codes)
-
-
-    # 4. Visualise
-    plt.figure(figsize=(10,8))
-    sns.heatmap(pivot, cmap='RdYlGn_r', center=2, annot=True, fmt='.1f',
-                cbar_kws={'label':'Median upper/lower ratio'})
-    plt.xlabel('Lower-bound quartile')
-    plt.ylabel('Occupation (major group)')
-    plt.title('Typical range width by occupation and task length')
-    plt.tight_layout()
-
-
-
-def cell6():
-    """
-    from scipy.stats import median_abs_deviation
-
-    def mad_z(series):
-        med = series.median()
-        mad = median_abs_deviation(series, scale='normal')  # ⇒ comparable to σ
-        return (series - med) / mad
-
-    df['robust_z'] = df.groupby('onetsoc_code')['estimate_midpoint'].transform(mad_z)
-    """
-
-    agg = (atomic_tasks
-           .groupby('onetsoc_code')['estimate_midpoint']
-           .agg(median='median',
-                q1=lambda x: x.quantile(.25),
-                q3=lambda x: x.quantile(.75),
-                mean='mean',
-                std='std')
-           .reset_index())
-    agg['IQR'] = agg.q3 - agg.q1
-    agg['CV']  = agg['std'] / agg['mean']            # coefficient of variation
-
-    # merge back the group mean and std so each row can be scored
-    atomic_tasks = atomic_tasks.merge(agg[['onetsoc_code','mean','std']], on='onetsoc_code')
-
-
-    atomic_tasks['z'] = (atomic_tasks.estimate_midpoint - atomic_tasks['mean']) / atomic_tasks['std']
-    outliers = atomic_tasks.loc[atomic_tasks.z.abs() > 3]
-    outliers
-
-def cell7():
-    from scipy.stats import median_abs_deviation
-
-    def mad_z(series):
-        med = series.median()
-        mad = median_abs_deviation(series, scale='normal')  # ⇒ comparable to σ
-        return (series - med) / mad
-
-    atomic_tasks['robust_z'] = atomic_tasks.groupby('onetsoc_code')['estimate_midpoint'].transform(mad_z)
-
-def cell10():
-    import matplotlib.ticker as mtick # For percentage formatting
-    import matplotlib.colors as mcolors # For color conversion
-
-    summary_data = []
-
-    for code, label in occupation_major_codes.items():
-        occ_df = df_tasks[df_tasks['onetsoc_major'] == code]
-        total_tasks_in_occ = len(occ_df)
-
-        if total_tasks_in_occ == 0:
-            continue # Skip if no tasks for this occupation
-
-        # Stack 1: % that isn't equal to "remote"
-        not_remote_count = len(occ_df[occ_df['remote_status'] != 'remote'])
-
-        # For the remaining remote tasks:
-        remote_df = occ_df[occ_df['remote_status'] == 'remote']
-
-        # Stack 2: % of remote + ATOMIC
-        remote_atomic_count = len(remote_df[remote_df['estimateable'] == 'ATOMIC'])
-
-        # Stack 3: % of remote + ONGOING-CONSTRAINT
-        remote_ongoing_count = len(remote_df[remote_df['estimateable'] == 'ONGOING-CONSTRAINT'])
-
-        summary_data.append({
-            'onetsoc_major_code': code,
-            'occupation_label': label,
-            'count_not_remote': not_remote_count,
-            'count_remote_atomic': remote_atomic_count,
-            'count_remote_ongoing': remote_ongoing_count,
-            'total_tasks': total_tasks_in_occ
-        })
-
-    summary_df = pd.DataFrame(summary_data)
-
-    # --- 3. Calculate Percentages ---
-    # Ensure total_tasks is not zero to avoid division by zero errors if an occupation had no tasks
-    summary_df = summary_df[summary_df['total_tasks'] > 0].copy() # Use .copy() to avoid SettingWithCopyWarning
-
-    summary_df['pct_not_remote'] = (summary_df['count_not_remote'] / summary_df['total_tasks']) * 100
-    summary_df['pct_remote_atomic'] = (summary_df['count_remote_atomic'] / summary_df['total_tasks']) * 100
-    summary_df['pct_remote_ongoing'] = (summary_df['count_remote_ongoing'] / summary_df['total_tasks']) * 100
-
-    # Select columns for plotting and set index to occupation label
-    plot_df = summary_df.set_index('occupation_label')[
-        ['pct_not_remote', 'pct_remote_atomic', 'pct_remote_ongoing']
-    ]
-
-    # Rename columns for a clearer legend
-    plot_df.columns = ['Not Remote', 'Remote + Estimable', 'Remote + Not estimable']
-
-    plot_df = plot_df.sort_values(by='Not Remote', ascending=False)
-
-
-    # --- 4. Plotting (Modified) ---
-
-    # Define the custom colors based on your requirements
-    # The order must match the column order in plot_df:
-    # 1. 'Not Remote'
-    # 2. 'Remote & ATOMIC'
-    # 3. 'Remote & ONGOING-CONSTRAINT'
-    bar_colors = [gray["300"], lime["500"], lime["200"]]
-
-    fig, ax = plt.subplots(figsize=(14, 10)) # Adjusted figsize for better readability
-
-    plot_df.plot(kind='barh', stacked=True, ax=ax, color=bar_colors)
-
-    ax.set_xlabel("Percentage of Tasks (%)", fontsize=12)
-    ax.set_ylabel("Occupation Major Group", fontsize=12)
-    ax.set_title("Task Breakdown by Occupation, Remote Status, and Estimateability", fontsize=14, pad=20)
-
-    # Format x-axis as percentages
-    ax.xaxis.set_major_formatter(mtick.PercentFormatter())
-    plt.xlim(0, 100) # Ensure x-axis goes from 0 to 100%
-
-    # Remove right and top spines
-    ax.spines['right'].set_visible(False)
-    ax.spines['top'].set_visible(False)
-
-    # Function to get contrasting text color
-    def get_contrasting_text_color(bg_color_hex_or_rgba):
-        """
-        Determines if black or white text provides better contrast against a given background color.
-        bg_color_hex_or_rgba: A hex string (e.g., '#RRGGBB') or an RGBA tuple (values in [0, 1]).
-        Returns: 'black' or 'white'.
-        """
-        # Convert to RGBA if it's a hex string or name
-        if isinstance(bg_color_hex_or_rgba, str):
-            rgba = mcolors.to_rgba(bg_color_hex_or_rgba)
-        else:
-            rgba = bg_color_hex_or_rgba
-
-        r, g, b, _ = rgba # Ignore alpha for luminance calculation
-        # Calculate luminance (standard formula for sRGB)
-        # Values r, g, b should be in [0, 1] for this formula
-        luminance = 0.2126 * r + 0.7152 * g + 0.0722 * b
-        # Threshold for deciding text color
-        return 'black' if luminance > 0.55 else 'white' # Adjusted threshold slightly for better visual
-
-    # Add percentages inside each bar segment
-    # Iterate through each "category" of bars (Not Remote, Remote & ATOMIC, etc.)
-    for i, container in enumerate(ax.containers):
-        # Get the color for this container/category
-        segment_color = bar_colors[i]
-        text_color = get_contrasting_text_color(segment_color)
-
-        for patch in container.patches: # Iterate through each bar segment in the category
-            width = patch.get_width()
-            if width > 3:  # Only add text if segment is wide enough (e.g., >3%)
-                x = patch.get_x() + width / 2
-                y = patch.get_y() + patch.get_height() / 2
-                ax.text(x, y,
-                        f"{width:.1f}%",
-                        ha='center',
-                        va='center',
-                        fontsize=8, # Adjust font size as needed
-                        color=text_color,
-                        fontweight='medium') # Bolder text can help
-
-
-    plt.legend(title="Task Category", bbox_to_anchor=(1.02, 1), loc='upper left', frameon=False)
-
-def cell11():
-    df_oesm['onetsoc_major'] = df_oesm['OCC_CODE'].str[:2]
-
-    # Calculate wage bill per occupation
-    # Wage bill = Total Employment * Annual Mean Wage
-    # Ensure columns are numeric, converting non-numeric values to NaN first
-    df_oesm['TOT_EMP'] = pd.to_numeric(df_oesm['TOT_EMP'], errors='coerce')
-    df_oesm['A_MEAN'] = pd.to_numeric(df_oesm['A_MEAN'], errors='coerce')
-
-    # Drop rows with NaN in necessary columns after coercion
-    df_oesm.dropna(subset=['TOT_EMP', 'A_MEAN', 'onetsoc_major'], inplace=True)
-
-    df_oesm['wage_bill'] = df_oesm['TOT_EMP'] * df_oesm['A_MEAN']
-
-    # Aggregate wage bill by onetsoc_major
-    df_wage_bill_major = df_oesm.groupby('onetsoc_major')['wage_bill'].sum().reset_index()
-
-    # Map major codes to titles for better plotting
-    df_wage_bill_major['OCC_TITLE_MAJOR'] = df_wage_bill_major['onetsoc_major'].map(occupation_major_codes)
-
-    # Sort by wage bill for better visualization
-    df_wage_bill_major = df_wage_bill_major.sort_values('wage_bill', ascending=False)
-
-    # Plotting
-    plt.figure(figsize=(12, 8))
-    sns.barplot(x='wage_bill', y='OCC_TITLE_MAJOR', data=df_wage_bill_major, palette="viridis")
-    plt.title('Total Wage Bill per Major Occupation Group')
-    plt.xlabel('Total Wage Bill (in billions)')
-    plt.ylabel('Major Occupation Group')
-    plt.grid(axis='x', linestyle='--', alpha=0.7)
-
-def cell11():
-    # ───────────────────────────────────────────────────────────────
-    # 1.  CUMULATIVE-DISTRIBUTION-FUNCTION (CDF) PREP
-    # ───────────────────────────────────────────────────────────────
-    def cdf(series):
-        s = series.sort_values().reset_index(drop=True)
-        return s.values, ((s.index + 1) / len(s)) * 100
-
-    x_lb , y_lb  = cdf(atomic_tasks['lb_estimate_in_minutes'])
-    x_ub , y_ub  = cdf(atomic_tasks['ub_estimate_in_minutes'])
-    x_mid, y_mid = cdf((atomic_tasks['ub_estimate_in_minutes'] + atomic_tasks['lb_estimate_in_minutes']) / 2)
-
-    # ───────────────────────────────────────────────────────────────
-    # 2.  PLOTTING
-    # ───────────────────────────────────────────────────────────────
-    fig, ax = plt.subplots(figsize=(10, 6))
-
-    # horizontal reference lines every 10 %
-    for y_val in range(0, 101, 10):
-        ax.axhline(y_val, color=gray['100'], linewidth=.8, zorder=1)
-
-    # Plot Lower Bound CDF
-    ax.step(x_lb, y_lb,
-            where='post',
-            color=lime['300'], # Example: light blue for lower bound
-            linewidth=1.8,
-            linestyle='--',
-            zorder=2,
-            label='Lower bound estimate (CDF)')
-
-    # Plot Upper Bound CDF
-    ax.step(x_ub, y_ub,
-            where='post',
-            color=lime['900'], # Example: light orange/red for upper bound
-            linewidth=1.8,
-            linestyle=':',
-            zorder=3,
-            label='Upper bound estimate (CDF)')
-
-    # Plot Midpoint CDF (plotted last to be on top, or adjust zorder)
-    ax.step(x_mid, y_mid,
-            where='post',
-            color=lime['600'],
-            linewidth=2.2,
-            zorder=4, # Ensure it's on top of other lines if they overlap significantly
-            label='Mid-point estimate (CDF)')
-
-
-    # axes limits / scales
-    ax.set_ylim(0, 100)
-    ax.set_xscale('log')
-
-    # y-axis ➝ percent labels
-    ax.yaxis.set_major_formatter(mpl.ticker.PercentFormatter(decimals=0))
-
-
-    # move y-label to top-left (just inside plotting area)
-    ax.text(-0.06, 1.03,
-            "% of tasks with temporal coherence ≤ X",
-            ha='left', va='bottom',
-            transform=ax.transAxes,
-            fontsize=12, fontweight='semibold')
-
-    # custom x-ticks at human-friendly durations
-    ticks      = [1, 5, 10, 30, 60, 120, 240, 480,
-                1440, 2880, 10080, 43200, 129600,
-                259200, 525600]
-    ticklabels = ['1 min', '5 min', '10 min', '30 min', '1 hour', '2 hours', '4 hours', '8 hours',
-                '1 day', '2 days', '1 week', '30 days',
-                '90 days', '180 days', '1 year']
-
-    # Vertical reference lines for x-ticks
-    for tick in ticks:
-        ax.axvline(tick, color=gray['300'], linewidth=.8, linestyle='--', zorder=1)
-
-    ax.set_xticks(ticks)
-    ax.set_xticklabels(ticklabels, rotation=45, ha='right')
-
-    ax.spines['top'].set_visible(False)
-    ax.spines['right'].set_visible(False)
-    ax.spines['left'].set_edgecolor(gray['300'])
-    ax.spines['bottom'].set_edgecolor(gray['300'])
-
-
-    # legend
-    ax.legend(frameon=False, loc='lower right') # Keep 'lower right' or adjust as needed
-
-    ax.text(0.5, -0.3,
-            'Temporal coherence (X)',
-            ha='center', va='center',
-            transform=ax.transAxes,
-            fontsize=12, fontweight='semibold')
--- a/old/classify_estimateability_of_tasks.py
+++ b/old/classify_estimateability_of_tasks.py
@ -1,411 +0,0 @@
-import pandas as pd
-import litellm
-import dotenv
-import os
-import time
-import json
-import math
-
-# Load environment variables
-dotenv.load_dotenv(override=True)
-
-# litellm._turn_on_debug() # Optional debugging
-
-# --- Configuration ---
-MODEL = "gpt-4.1-mini"  # Make sure this model supports json_schema or structured output
-RATE_LIMIT = 5000  # Requests per minute
-CHUNK_SIZE = 300  # Number of unique tasks per API call
-SECONDS_PER_MINUTE = 60
-
-# File configuration
-CLASSIFICATION_FILENAME = "tasks_estimateable.csv"  # Output file with classifications
-TASK_SOURCE_FOR_INIT_FILENAME = "tasks_with_estimates.csv"
-OUTPUT_COLUMN_NAME = "task_estimateable"
-SOURCE_FILTER_COLUMN = "remote_status"
-SOURCE_FILTER_VALUE = "remote"
-
-# --- Prompts and Schema ---
-SYSTEM_PROMPT_CLASSIFY = """
-Classify the provided O*NET task into one of these categories:
- -  ATOMIC (schedulable): A single, clearly-bounded activity, typically lasting minutes, hours, or a few days.
- -  ONGOING-CONSTRAINT (background role/ethical rule): A continuous responsibility or behavioural norm with no schedulable duration (e.g., “follow confidentiality rules,” “serve as department head”).
-""".strip()
-
-USER_MESSAGE_TEMPLATE_CLASSIFY = "Task: {task}"
-
-CLASSIFICATION_CATEGORIES = ["ATOMIC", "ONGOING-CONSTRAINT"]
-
-SCHEMA_FOR_CLASSIFICATION = {
-    "name": "classify_task_type",
-    "strict": True,
-    "schema": {
-        "type": "object",
-        "properties": {
-            "task_category": {
-                "type": "string",
-                "enum": CLASSIFICATION_CATEGORIES,
-                "description": "The classification of the task (ATOMIC or ONGOING-CONSTRAINT).",
-            }
-        },
-        "required": ["task_category"],
-        "additionalProperties": False,
-    },
-}
-
-
-def save_dataframe(df_to_save, filename):
-    """Saves the DataFrame to the specified CSV file using atomic write."""
-    try:
-        temp_filename = filename + ".tmp"
-        df_to_save.to_csv(temp_filename, encoding="utf-8-sig", index=False)
-        os.replace(temp_filename, filename)
-    except Exception as e:
-        print(f"--- Error saving DataFrame to {filename}: {e} ---")
-        if os.path.exists(temp_filename):
-            try:
-                os.remove(temp_filename)
-            except Exception as remove_err:
-                print(
-                    f"--- Error removing temporary save file {temp_filename}: {remove_err} ---"
-                )
-
-
-# --- Load or Initialize DataFrame ---
-try:
-    if os.path.exists(CLASSIFICATION_FILENAME):
-        df = pd.read_csv(CLASSIFICATION_FILENAME, encoding="utf-8-sig")
-        print(f"Successfully read {len(df)} rows from {CLASSIFICATION_FILENAME}.")
-
-        save_needed_after_load = False
-        if OUTPUT_COLUMN_NAME not in df.columns:
-            df[OUTPUT_COLUMN_NAME] = pd.NA
-            print(f"Added '{OUTPUT_COLUMN_NAME}' column.")
-            save_needed_after_load = True
-
-        df[OUTPUT_COLUMN_NAME].replace(["", None, ""], pd.NA, inplace=True)
-
-        if df[OUTPUT_COLUMN_NAME].dtype != object and not isinstance(
-            df[OUTPUT_COLUMN_NAME].dtype, pd.StringDtype
-        ):
-            try:
-                df[OUTPUT_COLUMN_NAME] = df[OUTPUT_COLUMN_NAME].astype(object)
-                print(
-                    f"Corrected dtype of '{OUTPUT_COLUMN_NAME}' to {df[OUTPUT_COLUMN_NAME].dtype}."
-                )
-                save_needed_after_load = True
-            except Exception as e:
-                print(
-                    f"Warning: Could not convert column '{OUTPUT_COLUMN_NAME}' to object: {e}."
-                )
-
-        if "task" not in df.columns:
-            print(
-                f"Error: {CLASSIFICATION_FILENAME} must contain a 'task' column for processing."
-            )
-            exit()
-
-        if save_needed_after_load:
-            print(f"Saving {CLASSIFICATION_FILENAME} after adding/adjusting column.")
-            save_dataframe(df, CLASSIFICATION_FILENAME)
-    else:
-        print(
-            f"{CLASSIFICATION_FILENAME} not found. Attempting to create it from {TASK_SOURCE_FOR_INIT_FILENAME}."
-        )
-        if not os.path.exists(TASK_SOURCE_FOR_INIT_FILENAME):
-            print(
-                f"Error: Source file {TASK_SOURCE_FOR_INIT_FILENAME} not found. Cannot create {CLASSIFICATION_FILENAME}."
-            )
-            exit()
-
-        df_source = pd.read_csv(TASK_SOURCE_FOR_INIT_FILENAME, encoding="utf-8-sig")
-
-        required_source_cols_for_init = ["task", SOURCE_FILTER_COLUMN]
-        missing_source_cols = [
-            col for col in required_source_cols_for_init if col not in df_source.columns
-        ]
-        if missing_source_cols:
-            print(
-                f"Error: Source file {TASK_SOURCE_FOR_INIT_FILENAME} is missing required columns for initialization: {', '.join(missing_source_cols)}."
-            )
-            exit()
-
-        df_source_filtered = df_source[
-            df_source[SOURCE_FILTER_COLUMN] == SOURCE_FILTER_VALUE
-        ].copy()
-
-        if df_source_filtered.empty:
-            print(
-                f"Warning: No tasks with '{SOURCE_FILTER_COLUMN}' == '{SOURCE_FILTER_VALUE}' found in {TASK_SOURCE_FOR_INIT_FILENAME}. "
-                f"{CLASSIFICATION_FILENAME} will be created with schema but no tasks to classify initially."
-            )
-
-        df = df_source_filtered[["task"]].copy()
-        df[OUTPUT_COLUMN_NAME] = pd.NA
-        df[OUTPUT_COLUMN_NAME] = df[OUTPUT_COLUMN_NAME].astype(object)
-
-        print(
-            f"Created {CLASSIFICATION_FILENAME} using tasks from {TASK_SOURCE_FOR_INIT_FILENAME} "
-            f"(where {SOURCE_FILTER_COLUMN}='{SOURCE_FILTER_VALUE}'). New file has {len(df)} tasks."
-        )
-        save_dataframe(df, CLASSIFICATION_FILENAME)
-
-except FileNotFoundError:
-    print(f"Error: A required file was not found. Please check paths.")
-    exit()
-except Exception as e:
-    print(f"Error during DataFrame loading or initialization: {e}")
-    exit()
-
-
-# --- Identify Unique Tasks to Process ---
-if df.empty:
-    print(f"{CLASSIFICATION_FILENAME} is empty. Nothing to process. Exiting.")
-    exit()
-
-initial_unprocessed_mask = df[OUTPUT_COLUMN_NAME].isna()
-
-if not initial_unprocessed_mask.any():
-    print(
-        f"All tasks in {CLASSIFICATION_FILENAME} seem to have been classified already. Exiting."
-    )
-    exit()
-
-# Filter for rows that are unprocessed AND have a valid 'task' string
-valid_tasks_to_consider_df = df[
-    initial_unprocessed_mask & df["task"].notna() & (df["task"].str.strip() != "")
-]
-
-if valid_tasks_to_consider_df.empty:
-    print(
-        f"No valid, unclassified tasks found to process (after filtering out empty/NaN task descriptions). Exiting."
-    )
-    exit()
-
-unique_task_labels_for_api = (
-    valid_tasks_to_consider_df["task"].drop_duplicates().tolist()
-)
-total_rows_to_update_potentially = len(
-    df[initial_unprocessed_mask]
-)  # Count all rows that are NA
-
-print(
-    f"Found {total_rows_to_update_potentially} total rows in {CLASSIFICATION_FILENAME} needing classification."
-)
-print(
-    f"Identified {len(unique_task_labels_for_api)} unique, valid task labels to send to the API."
-)
-
-
-# --- Prepare messages for batch completion (only for unique task labels) ---
-messages_list = []
-print(f"Preparing messages for {len(unique_task_labels_for_api)} unique task labels...")
-
-for task_label in unique_task_labels_for_api:
-    # task_label is already guaranteed to be non-empty and not NaN from the filtering above
-    user_message = USER_MESSAGE_TEMPLATE_CLASSIFY.format(task=task_label)
-    messages_for_task = [
-        {"role": "system", "content": SYSTEM_PROMPT_CLASSIFY},
-        {"role": "user", "content": user_message},
-    ]
-    messages_list.append(messages_for_task)
-
-print(f"Prepared {len(messages_list)} message sets for batch completion.")
-if (
-    not messages_list
-):  # Should only happen if unique_task_labels_for_api was empty, caught above
-    print(
-        "No messages prepared, though unique tasks were identified. This is unexpected. Exiting."
-    )
-    exit()
-
-
-# --- Call batch_completion in chunks with rate limiting and periodic saving ---
-total_unique_tasks_to_send = len(
-    messages_list
-)  # Same as len(unique_task_labels_for_api)
-num_chunks = math.ceil(total_unique_tasks_to_send / CHUNK_SIZE)
-
-print(
-    f"\nStarting batch classification for {total_unique_tasks_to_send} unique task labels in {num_chunks} chunks..."
-)
-
-overall_start_time = time.time()
-processed_rows_count_total = 0  # Counts actual rows updated in the DataFrame
-
-for i in range(num_chunks):
-    chunk_start_message_index = i * CHUNK_SIZE
-    chunk_end_message_index = min((i + 1) * CHUNK_SIZE, total_unique_tasks_to_send)
-
-    message_chunk = messages_list[chunk_start_message_index:chunk_end_message_index]
-    # Get corresponding unique task labels for this chunk
-    chunk_task_labels = unique_task_labels_for_api[
-        chunk_start_message_index:chunk_end_message_index
-    ]
-
-    if not message_chunk:  # Should not happen if loop range is correct
-        continue
-
-    print(
-        f"\nProcessing chunk {i + 1}/{num_chunks} (Unique Task Labels {chunk_start_message_index + 1}-{chunk_end_message_index} of this run)..."
-    )
-    chunk_start_time = time.time()
-    responses = []
-    try:
-        print(
-            f"Sending {len(message_chunk)} requests (for unique tasks) for chunk {i + 1}..."
-        )
-        responses = litellm.batch_completion(
-            model=MODEL,
-            messages=message_chunk,
-            response_format={
-                "type": "json_schema",
-                "json_schema": SCHEMA_FOR_CLASSIFICATION,
-            },
-            num_retries=3,
-        )
-        print(f"Chunk {i + 1} API call completed.")
-
-    except Exception as e:
-        print(f"Error during litellm.batch_completion for chunk {i + 1}: {e}")
-        responses = [None] * len(message_chunk)
-
-    # --- Process responses for the current chunk ---
-    # chunk_updates stores {task_label: classification_category}
-    chunk_task_classifications = {}
-    successful_api_calls_in_chunk = 0
-    failed_api_calls_in_chunk = 0
-
-    if responses and len(responses) == len(message_chunk):
-        for j, response in enumerate(responses):
-            current_task_label = chunk_task_labels[
-                j
-            ]  # The unique task label for this response
-            content_str = None
-
-            if response is None:
-                print(
-                    f"API call failed for task label '{current_task_label}' (response is None)."
-                )
-                failed_api_calls_in_chunk += 1
-                continue
-
-            try:
-                if (
-                    response.choices
-                    and response.choices[0].message
-                    and response.choices[0].message.content
-                ):
-                    content_str = response.choices[0].message.content
-                    classification_data = json.loads(content_str)
-                    category_raw = classification_data.get("task_category")
-
-                    if category_raw in CLASSIFICATION_CATEGORIES:
-                        successful_api_calls_in_chunk += 1
-                        chunk_task_classifications[current_task_label] = category_raw
-                    else:
-                        print(
-                            f"Warning: Invalid or missing task_category for task label '{current_task_label}': '{category_raw}'. Content: '{content_str}'"
-                        )
-                        failed_api_calls_in_chunk += 1
-                else:
-                    finish_reason = (
-                        response.choices[0].finish_reason
-                        if (response.choices and response.choices[0].finish_reason)
-                        else "unknown"
-                    )
-                    error_message = (
-                        response.choices[0].message.content
-                        if (response.choices and response.choices[0].message)
-                        else "No content in message."
-                    )
-                    print(
-                        f"Warning: Received non-standard or empty response content for task label '{current_task_label}'. "
-                        f"Finish Reason: '{finish_reason}'. Message: '{error_message}'. Raw Choices: {response.choices}"
-                    )
-                    failed_api_calls_in_chunk += 1
-
-            except json.JSONDecodeError:
-                print(
-                    f"Warning: Could not decode JSON for task label '{current_task_label}'. Content received: '{content_str}'"
-                )
-                failed_api_calls_in_chunk += 1
-            except AttributeError as ae:
-                print(
-                    f"Warning: Missing attribute processing response for task label '{current_task_label}': {ae}. Response: {response}"
-                )
-                failed_api_calls_in_chunk += 1
-            except Exception as e:
-                print(
-                    f"Warning: Unexpected error processing response for task label '{current_task_label}': {type(e).__name__} - {e}. Response: {response}"
-                )
-                failed_api_calls_in_chunk += 1
-    else:
-        print(
-            f"Warning: Mismatch between #responses ({len(responses) if responses else 0}) "
-            f"and #messages sent ({len(message_chunk)}) for chunk {i + 1}, or no responses. Marking all API calls in chunk as failed."
-        )
-        failed_api_calls_in_chunk = len(message_chunk)
-
-    # --- Update Main DataFrame and Save Periodically ---
-    rows_updated_this_chunk = 0
-    if chunk_task_classifications:
-        print(
-            f"Updating main DataFrame with classifications for {len(chunk_task_classifications)} unique tasks from chunk {i + 1}..."
-        )
-        for task_label, category in chunk_task_classifications.items():
-            # Update all rows in the main df that match this task_label AND are still NA in the output column
-            update_condition = (df["task"] == task_label) & (
-                df[OUTPUT_COLUMN_NAME].isna()
-            )
-            num_rows_for_this_task_label = df[update_condition].shape[0]
-
-            if num_rows_for_this_task_label > 0:
-                df.loc[update_condition, OUTPUT_COLUMN_NAME] = category
-                rows_updated_this_chunk += num_rows_for_this_task_label
-
-        print(
-            f"Updated {rows_updated_this_chunk} rows in the DataFrame based on this chunk's API responses."
-        )
-        print(f"Saving progress to {CLASSIFICATION_FILENAME}...")
-        save_dataframe(df, CLASSIFICATION_FILENAME)
-    else:
-        print(
-            f"No successful API classifications obtained in chunk {i + 1} to update DataFrame or save."
-        )
-
-    print(
-        f"Chunk {i + 1} API summary: Successful Calls={successful_api_calls_in_chunk}, Failed/Skipped Calls={failed_api_calls_in_chunk}. "
-        f"Rows updated in DataFrame this chunk: {rows_updated_this_chunk}"
-    )
-    processed_rows_count_total += rows_updated_this_chunk
-
-    # --- Rate Limiting Pause ---
-    chunk_end_time = time.time()
-    chunk_duration = chunk_end_time - chunk_start_time
-    print(f"Chunk {i + 1} (API calls and DF update) took {chunk_duration:.2f} seconds.")
-
-    if i < num_chunks - 1:
-        time_per_request = SECONDS_PER_MINUTE / RATE_LIMIT if RATE_LIMIT > 0 else 0
-        min_chunk_duration_for_rate = (
-            len(message_chunk) * time_per_request
-        )  # Based on API calls made
-        pause_needed = max(0, min_chunk_duration_for_rate - chunk_duration)
-
-        if pause_needed > 0:
-            print(
-                f"Pausing for {pause_needed:.2f} seconds to respect rate limit ({RATE_LIMIT}/min)..."
-            )
-            time.sleep(pause_needed)
-
-overall_end_time = time.time()
-total_duration_minutes = (overall_end_time - overall_start_time) / 60
-print(
-    f"\nBatch classification finished."
-    f" Updated {processed_rows_count_total} rows in '{CLASSIFICATION_FILENAME}' with new classifications in this run."
-    f" Total duration: {total_duration_minutes:.2f} minutes."
-)
-
-print(f"Performing final save to {CLASSIFICATION_FILENAME}...")
-save_dataframe(df, CLASSIFICATION_FILENAME)
-
-print("\nScript finished.")
--- a/old/create_onet_database.sh
+++ b/old/create_onet_database.sh
@ -1,85 +0,0 @@
-#!/usr/bin/env bash
-
-# Set database name and directories
-ONET_DB_NAME="onet.database"
-ONET_ZIP_URL="https://www.onetcenter.org/dl_files/database/db_29_1_mysql.zip"
-ONET_ZIP_FILE="db_29_1_mysql.zip"
-ONET_EXTRACT_DIR="db_29_1_mysql"
-
-# Download O*NET database only if not already downloaded
-if [ ! -f "$ONET_ZIP_FILE" ]; then
-    echo "Downloading O*NET database from $ONET_ZIP_URL"
-    curl -L -o "$ONET_ZIP_FILE" "$ONET_ZIP_URL" || wget -O "$ONET_ZIP_FILE" "$ONET_ZIP_URL"
-
-    if [ $? -ne 0 ]; then
-        echo "Failed to download O*NET database"
-        exit 1
-    fi
-else
-    echo "Using existing O*NET database zip file"
-fi
-
-# Extract downloaded zip file only if extraction directory doesn't exist
-if [ ! -d "$ONET_EXTRACT_DIR" ]; then
-    echo "Extracting O*NET database files"
-    unzip -o "$ONET_ZIP_FILE"
-
-    if [ $? -ne 0 ]; then
-        echo "Failed to extract O*NET database files"
-        exit 1
-    fi
-else
-    echo "Using existing extracted O*NET database files"
-fi
-
-# Remove existing database if it exists
-if [ -f "$ONET_DB_NAME" ]; then
-    echo "Removing existing database"
-    rm "$ONET_DB_NAME"
-fi
-
-# Create a new SQLite database with optimized settings for fast import
-echo "Creating new SQLite database: $ONET_DB_NAME with performance settings"
-sqlite3 "$ONET_DB_NAME" << EOF
-PRAGMA journal_mode = OFF;
-PRAGMA synchronous = 0;
-PRAGMA cache_size = 1000000;
-PRAGMA locking_mode = EXCLUSIVE;
-PRAGMA temp_store = MEMORY;
-PRAGMA foreign_keys = ON;
-EOF
-
-# Combine and execute all SQL files in one transaction
-echo "Executing SQL files in alphabetical order (single transaction mode)"
-sqlite3 "$ONET_DB_NAME" << EOF
-BEGIN TRANSACTION;
-$(find "$ONET_EXTRACT_DIR" -name "*.sql" | sort | xargs cat)
-COMMIT;
-EOF
-
-# Check if the execution was successful
-if [ $? -ne 0 ]; then
-    echo "Error executing SQL files in batch transaction"
-    exit 1
-else
-    echo "Database populated successfully. Restoring reliability settings..."
-
-    # Restore reliability-focused settings after import
-    sqlite3 "$ONET_DB_NAME" << EOF
-PRAGMA journal_mode = WAL;
-PRAGMA synchronous = NORMAL;
-PRAGMA locking_mode = NORMAL;
-PRAGMA temp_store = DEFAULT;
-PRAGMA foreign_keys = ON;
-PRAGMA optimize;
-VACUUM;
-EOF
-
-    if [ $? -ne 0 ]; then
-        echo "Warning: Failed to restore reliability settings, but database is populated"
-    else
-        echo "Reliability settings restored successfully"
-    fi
-
-    echo "O*NET database created and optimized successfully!"
-fi
--- a/old/enrich_task_ratings.py
+++ b/old/enrich_task_ratings.py
@ -1,392 +0,0 @@
-import sqlite3
-import pandas as pd
-import json
-import os
-from collections import defaultdict
-import numpy as np
-
-# --- Configuration ---
-DB_FILE = "onet.database"
-OUTPUT_FILE = "task_ratings_enriched.json"  # Changed output filename
-
-# --- Database Interaction ---
-
-
-def fetch_data_from_db(db_path):
-    """
-    Fetches required data from the O*NET SQLite database using JOINs,
-    including DWAs.
-
-    Args:
-        db_path (str): Path to the SQLite database file.
-
-    Returns:
-        tuple(pandas.DataFrame, pandas.DataFrame): A tuple containing:
-            - DataFrame with task ratings info.
-            - DataFrame with task-to-DWA mapping.
-        Returns (None, None) if the database file doesn't exist or an error occurs.
-    """
-    if not os.path.exists(db_path):
-        print(f"Error: Database file not found at {db_path}")
-        return None, None
-
-    try:
-        conn = sqlite3.connect(db_path)
-        # Construct the SQL query to join the tables and select necessary columns
-        # Added LEFT JOINs for tasks_to_dwas and dwa_reference
-        # Use LEFT JOIN in case a task has no DWAs
-        query = """
-        SELECT
-            tr.onetsoc_code,
-            tr.task_id,
-            ts.task,
-            od.title AS occupation_title,
-            od.description AS occupation_description,
-            tr.scale_id,
-            tr.category,
-            tr.data_value,
-            dr.dwa_title  -- Added DWA title
-        FROM
-            task_ratings tr
-        JOIN
-            task_statements ts ON tr.task_id = ts.task_id
-        JOIN
-            occupation_data od ON tr.onetsoc_code = od.onetsoc_code
-        LEFT JOIN
-            tasks_to_dwas td ON tr.onetsoc_code = td.onetsoc_code AND tr.task_id = td.task_id --
-        LEFT JOIN
-            dwa_reference dr ON td.dwa_id = dr.dwa_id; --
-        """
-        df = pd.read_sql_query(query, conn)
-        conn.close()
-        print(
-                f"Successfully fetched {len(df)} records (including DWA info) from the database."
-                )
-
-        if df.empty:
-            print("Warning: Fetched DataFrame is empty.")
-            # Return empty DataFrames with expected columns if the main fetch is empty
-            ratings_cols = [
-                "onetsoc_code",
-                "task_id",
-                "task",
-                "occupation_title",
-                "occupation_description",
-                "scale_id",
-                "category",
-                "data_value",
-            ]
-            dwa_cols = ["onetsoc_code", "task_id", "dwa_title"]
-            return pd.DataFrame(columns=ratings_cols), pd.DataFrame(columns=dwa_cols)
-
-        # Remove duplicates caused by joining ratings with potentially multiple DWAs per task
-        # Keep only unique combinations of the core task/rating info before processing
-        core_cols = [
-            "onetsoc_code",
-            "task_id",
-            "task",
-            "occupation_title",
-            "occupation_description",
-            "scale_id",
-            "category",
-            "data_value",
-        ]
-        # Check if all core columns exist before attempting to drop duplicates
-        missing_core_cols = [col for col in core_cols if col not in df.columns]
-        if missing_core_cols:
-            print(f"Error: Missing core columns in fetched data: {missing_core_cols}")
-            return None, None
-        ratings_df = df[core_cols].drop_duplicates().reset_index(drop=True)
-
-        # Get unique DWA info separately
-        dwa_cols = ["onetsoc_code", "task_id", "dwa_title"]
-        # Check if all DWA columns exist before processing
-        if all(col in df.columns for col in dwa_cols):
-            dwas_df = (
-                df[dwa_cols]
-                .dropna(subset=["dwa_title"])
-                .drop_duplicates()
-                .reset_index(drop=True)
-            )
-        else:
-            print("Warning: DWA related columns missing, creating empty DWA DataFrame.")
-            dwas_df = pd.DataFrame(
-                columns=dwa_cols
-            )  # Create empty df if columns missing
-
-        return ratings_df, dwas_df  # Return two dataframes now
-
-    except sqlite3.Error as e:
-        print(f"SQLite error: {e}")
-        if "conn" in locals() and conn:
-            conn.close()
-        return None, None  # Return None for both if error
-    except Exception as e:
-        print(f"An error occurred during data fetching: {e}")
-        if "conn" in locals() and conn:
-            conn.close()
-        return None, None  # Return None for both if error
-
-
-# --- Data Processing ---
-
-
-def process_task_ratings_with_dwas(ratings_df, dwas_df):
-    """
-    Processes the fetched data to group, pivot frequency, calculate averages,
-    structure the output, and add associated DWAs.
-
-    Args:
-        ratings_df (pandas.DataFrame): The input DataFrame with task ratings info.
-        dwas_df (pandas.DataFrame): The input DataFrame with task-to-DWA mapping. Can be None or empty.
-
-    Returns:
-        list: A list of dictionaries, each representing an enriched task rating with DWAs.
-              Returns None if the input ratings DataFrame is invalid.
-    """
-    if ratings_df is None or not isinstance(
-        ratings_df, pd.DataFrame
-    ):  # Check if it's a DataFrame
-        print("Error: Input ratings DataFrame is invalid.")
-        return None
-    if ratings_df.empty:
-        print(
-            "Warning: Input ratings DataFrame is empty. Processing will yield empty result."
-        )
-        # Decide how to handle empty input, maybe return empty list directly
-        # return []
-
-    # Ensure dwas_df is a DataFrame, even if empty
-    if dwas_df is None or not isinstance(dwas_df, pd.DataFrame):
-        print("Warning: Invalid or missing DWA DataFrame. Proceeding without DWA data.")
-        dwas_df = pd.DataFrame(
-            columns=["onetsoc_code", "task_id", "dwa_title"]
-        )  # Ensure it's an empty DF
-
-    print("Starting data processing...")
-
-    # --- 1. Handle Frequency (FT) ---
-    freq_df = ratings_df[ratings_df["scale_id"] == "FT"].copy()
-    if not freq_df.empty:
-        freq_pivot = freq_df.pivot_table(
-            index=["onetsoc_code", "task_id"],
-            columns="category",
-            values="data_value",
-            fill_value=0,
-        )
-        freq_pivot.columns = [
-            f"frequency_category_{int(col)}" for col in freq_pivot.columns
-        ]
-        print(f"Processed Frequency data. Shape: {freq_pivot.shape}")
-    else:
-        print("No Frequency (FT) data found.")
-        # Create an empty DataFrame with the multi-index to allow merging later
-        idx = pd.MultiIndex(
-            levels=[[], []], codes=[[], []], names=["onetsoc_code", "task_id"]
-        )
-        freq_pivot = pd.DataFrame(index=idx)
-
-    # --- 2. Handle Importance (IM, IJ) ---
-    imp_df = ratings_df[ratings_df["scale_id"].isin(["IM", "IJ"])].copy()
-    if not imp_df.empty:
-        imp_avg = (
-            imp_df.groupby(["onetsoc_code", "task_id"])["data_value"]
-            .mean()
-            .reset_index()
-        )
-        imp_avg.rename(columns={"data_value": "importance_average"}, inplace=True)
-        print(f"Processed Importance data. Shape: {imp_avg.shape}")
-    else:
-        print("No Importance (IM, IJ) data found.")
-        imp_avg = pd.DataFrame(
-            columns=["onetsoc_code", "task_id", "importance_average"]
-        )
-
-    # --- 3. Handle Relevance (RT) ---
-    rel_df = ratings_df[ratings_df["scale_id"] == "RT"].copy()
-    if not rel_df.empty:
-        rel_avg = (
-            rel_df.groupby(["onetsoc_code", "task_id"])["data_value"]
-            .mean()
-            .reset_index()
-        )
-        rel_avg.rename(columns={"data_value": "relevance_average"}, inplace=True)
-        print(f"Processed Relevance data. Shape: {rel_avg.shape}")
-    else:
-        print("No Relevance (RT) data found.")
-        rel_avg = pd.DataFrame(columns=["onetsoc_code", "task_id", "relevance_average"])
-
-    # --- 4. Process DWAs ---
-    if dwas_df is not None and not dwas_df.empty and "dwa_title" in dwas_df.columns:
-        print("Processing DWA data...")
-        # Group DWAs by task_id and aggregate titles into a list
-        dwas_grouped = (
-            dwas_df.groupby(["onetsoc_code", "task_id"])["dwa_title"]
-            .apply(list)
-            .reset_index()
-        )  #
-        dwas_grouped.rename(
-            columns={"dwa_title": "dwas"}, inplace=True
-        )  # Rename column to 'dwas'
-        print(f"Processed DWA data. Shape: {dwas_grouped.shape}")
-    else:
-        print("No valid DWA data found or provided for processing.")
-        dwas_grouped = None  # Set to None if no DWAs
-
-    # --- 5. Get Base Task/Occupation Info ---
-    base_cols = [
-        "onetsoc_code",
-        "task_id",
-        "task",
-        "occupation_title",
-        "occupation_description",
-    ]
-    # Check if base columns exist in ratings_df
-    missing_base_cols = [col for col in base_cols if col not in ratings_df.columns]
-    if missing_base_cols:
-        print(
-            f"Error: Missing base info columns in ratings_df: {missing_base_cols}. Cannot proceed."
-        )
-        return None
-    if not ratings_df.empty:
-        base_info = (
-            ratings_df[base_cols]
-            .drop_duplicates()
-            .set_index(["onetsoc_code", "task_id"])
-        )
-        print(f"Extracted base info. Shape: {base_info.shape}")
-    else:
-        print("Cannot extract base info from empty ratings DataFrame.")
-        # Create an empty df with index to avoid errors later if possible
-        idx = pd.MultiIndex(
-            levels=[[], []], codes=[[], []], names=["onetsoc_code", "task_id"]
-        )
-        base_info = pd.DataFrame(
-            index=idx,
-            columns=[
-                col for col in base_cols if col not in ["onetsoc_code", "task_id"]
-            ],
-        )
-
-    # --- 6. Merge Processed Data ---
-    print("Merging processed data...")
-    # Start with base_info, which should have the index ['onetsoc_code', 'task_id']
-    final_df = base_info.merge(
-        freq_pivot, left_index=True, right_index=True, how="left"
-    )
-    # Reset index before merging non-indexed dfs
-    final_df = final_df.reset_index()
-
-    # Merge averages - check if they are not empty before merging
-    if not imp_avg.empty:
-        final_df = final_df.merge(imp_avg, on=["onetsoc_code", "task_id"], how="left")
-    else:
-        final_df["importance_average"] = np.nan  # Add column if imp_avg was empty
-
-    if not rel_avg.empty:
-        final_df = final_df.merge(rel_avg, on=["onetsoc_code", "task_id"], how="left")
-    else:
-        final_df["relevance_average"] = np.nan  # Add column if rel_avg was empty
-
-    # Merge DWAs if available
-    if dwas_grouped is not None and not dwas_grouped.empty:
-        final_df = final_df.merge(
-            dwas_grouped, on=["onetsoc_code", "task_id"], how="left"
-        )  # Merge the dwas list
-        # Fill NaN in 'dwas' column (for tasks with no DWAs) with empty lists
-        # Check if 'dwas' column exists before applying function
-        if "dwas" in final_df.columns:
-            final_df["dwas"] = final_df["dwas"].apply(
-                lambda x: x if isinstance(x, list) else []
-            )  # Ensure tasks without DWAs get []
-        else:
-            print("Warning: 'dwas' column not created during merge.")
-            final_df["dwas"] = [
-                [] for _ in range(len(final_df))
-            ]  # Add empty list column
-
-    else:
-        # Add an empty 'dwas' column if no DWA data was processed or merged
-        final_df["dwas"] = [[] for _ in range(len(final_df))]
-
-    print(f"Final merged data shape: {final_df.shape}")
-
-    # Convert DataFrame to list of dictionaries for JSON output
-    # Handle potential NaN values during JSON conversion
-    # Replace numpy NaN with Python None for JSON compatibility
-    final_df = final_df.replace({np.nan: None})
-    result_list = final_df.to_dict(orient="records")
-
-    return result_list
-
-
-# --- Output ---
-
-
-def write_to_json(data, output_path):
-    """
-    Writes the processed data to a JSON file.
-
-    Args:
-        data (list): The list of dictionaries to write.
-        output_path (str): Path to the output JSON file.
-    """
-    if data is None:
-        print("No data to write to JSON.")
-        return
-    if not isinstance(data, list):
-        print(
-            f"Error: Data to write is not a list (type: {type(data)}). Cannot write to JSON."
-        )
-        return
-
-    # Create directory if it doesn't exist
-    output_dir = os.path.dirname(output_path)
-    if output_dir and not os.path.exists(output_dir):
-        try:
-            os.makedirs(output_dir)
-            print(f"Created output directory: {output_dir}")
-        except OSError as e:
-            print(f"Error creating output directory {output_dir}: {e}")
-            return  # Exit if cannot create directory
-
-    try:
-        with open(output_path, "w", encoding="utf-8") as f:
-            json.dump(data, f, indent=4, ensure_ascii=False)
-        print(f"Successfully wrote enriched data to {output_path}")
-    except IOError as e:
-        print(f"Error writing JSON file to {output_path}: {e}")
-    except TypeError as e:
-        print(f"Error during JSON serialization: {e}. Check data types.")
-    except Exception as e:
-        print(f"An unexpected error occurred during JSON writing: {e}")
-
-
-# --- Main Execution ---
-
-if __name__ == "__main__":
-    print("Starting O*NET Task Ratings & DWAs Enrichment Script...")
-    # 1. Fetch data
-    ratings_data_df, dwas_data_df = fetch_data_from_db(DB_FILE)  # Fetch both datasets
-
-    # 2. Process data
-    # Proceed only if ratings_data_df is a valid DataFrame (even if empty)
-    # dwas_data_df can be None or empty, handled inside process function
-    if isinstance(ratings_data_df, pd.DataFrame):
-        enriched_data = process_task_ratings_with_dwas(
-            ratings_data_df, dwas_data_df
-        )  # Pass both dataframes
-
-        # 3. Write output
-        if (
-            enriched_data is not None
-        ):  # Check if processing returned data (even an empty list is valid)
-            write_to_json(enriched_data, OUTPUT_FILE)
-        else:
-            print("Data processing failed or returned None. No output file generated.")
-    else:
-        print(
-            "Data fetching failed or returned invalid type for ratings data. Script terminated."
-        )
-
-    print("Script finished.")
--- a/pipeline/aggregate.py
+++ b/pipeline/aggregate.py
@ -0,0 +1,81 @@
+from .utils import OCCUPATION_MAJOR_CODES
+import pandas as pd
+
+def create_task_summary_by_occupation_df(df_tasks: pd.DataFrame, oesm_df: pd.DataFrame) -> pd.DataFrame:
+    """Summarise task counts and the OESM wage bill for every O*NET occupation.
+
+    Args:
+        df_tasks: One row per task. Reads the columns ``onetsoc_code``,
+            ``remote_status`` and ``estimable``.
+        oesm_df: OESM wage table. Reads ``OCC_CODE``, ``TOT_EMP``, ``A_MEAN``
+            and, when present, ``OCC_TITLE``. The frame is not modified.
+
+    Returns:
+        One row per distinct ``onetsoc_code`` (in order of first appearance)
+        with the columns ``onetsoc_code``, ``occupation_label``, ``wage_bill``,
+        ``count_not_remote``, ``count_remote_estimable``,
+        ``count_remote_not_estimable`` and ``total_tasks``. The same columns
+        are returned when ``df_tasks`` is empty.
+
+    Notes:
+        The wage bill is looked up by the O*NET code with its ``.NN`` suffix
+        removed, so several O*NET codes that share one SOC code each carry the
+        full wage bill of that SOC code. Occupations without usable wage data
+        get ``wage_bill == 0`` and the label ``"Unknown"``.
+    """
+    # --- OESM wage bill lookup ---
+    # `rename` returns a new frame, so the caller's table is left untouched.
+    oesm = oesm_df.rename(columns={'OCC_CODE': 'onetsoc_code'})
+    oesm = oesm.assign(
+        # Non-numeric placeholders become NaN and are dropped right below.
+        TOT_EMP=pd.to_numeric(oesm['TOT_EMP'], errors='coerce'),
+        A_MEAN=pd.to_numeric(oesm['A_MEAN'], errors='coerce'),
+    ).dropna(subset=['TOT_EMP', 'A_MEAN', 'onetsoc_code'])
+    oesm = oesm.assign(wage_bill=oesm['TOT_EMP'] * oesm['A_MEAN'])
+    # A repeated code would make every lookup ambiguous (a Series instead of a
+    # scalar), so only the first row per code is kept.
+    # NOTE(review): confirm "first row wins" is the intended policy if the OESM
+    # extract can contain several rows per OCC_CODE.
+    oesm_lookup = oesm.drop_duplicates(subset='onetsoc_code', keep='first').set_index('onetsoc_code')
+
+    # --- Task counts per occupation, computed in a single pass ---
+    is_remote = df_tasks['remote_status'].eq('remote')
+    # `.eq(True)` always yields a real boolean mask, even when the column is
+    # object dtype or holds missing values (those count as not estimable).
+    is_estimable = df_tasks['estimable'].eq(True)
+    flags = df_tasks[['onetsoc_code']].assign(
+        count_not_remote=~is_remote,
+        count_remote_estimable=is_remote & is_estimable,
+        count_remote_not_estimable=is_remote & ~is_estimable,
+        total_tasks=True,
+    )
+    # Summing boolean flags gives the per-occupation counts.
+    counts = flags.groupby('onetsoc_code', sort=False).sum()
+
+    # O*NET codes (e.g., 11-1011.03) are more specific than OESM SOC codes (e.g., 11-1011).
+    # We strip the suffix from the O*NET code to find the corresponding wage data.
+    soc_codes = [str(code).split('.')[0] for code in counts.index]
+    wage_bill = oesm_lookup['wage_bill'].reindex(soc_codes).fillna(0)
+    if 'OCC_TITLE' in oesm_lookup.columns:
+        labels = oesm_lookup['OCC_TITLE'].reindex(soc_codes).fillna('Unknown')
+    else:
+        # A missing title column must not wipe out the wage data as well.
+        labels = pd.Series('Unknown', index=soc_codes, dtype=object)
+
+    summary = counts.reset_index()
+    summary.insert(1, 'occupation_label', labels.to_numpy())
+    summary.insert(2, 'wage_bill', wage_bill.to_numpy())
+    return summary
+
+
+def aggregate_task_summary_by_major_code(summary_df: pd.DataFrame) -> pd.DataFrame:
+    """Roll the per-occupation summary up to two-character major occupation groups.
+
+    The major group is the first two characters of ``onetsoc_code``. The wage
+    bill and every task counter are summed per group, and the group label is
+    looked up in ``OCCUPATION_MAJOR_CODES`` (groups absent from that mapping
+    get a missing label).
+    """
+    summed_columns = [
+        'wage_bill',
+        'count_not_remote',
+        'count_remote_estimable',
+        'count_remote_not_estimable',
+        'total_tasks',
+    ]
+
+    # Group by a derived key instead of adding a helper column to a copy.
+    major_code = summary_df['onetsoc_code'].str[:2].rename('onetsoc_major_code')
+    totals = summary_df[summed_columns].groupby(major_code).sum().reset_index()
+
+    # The label goes right after the code, matching the original output layout.
+    group_labels = totals['onetsoc_major_code'].map(OCCUPATION_MAJOR_CODES)
+    totals.insert(1, 'occupation_label', group_labels)
+
+    return totals
--- a/pipeline/classification.py
+++ b/pipeline/classification.py
@ -0,0 +1,225 @@
+from pathlib import Path
+import pandas as pd
+from .logger import logger
+from .utils import enrich
+import json
+
+ALLOWED_UNITS = [
+    "minute",
+    "hour",
+    "day",
+    "week",
+    "month",
+    "trimester",
+    "semester",
+    "year",
+]
+
+ESTIMABLE_CLASSIFICATION_VERSION = "old_version"
+TIME_ESTIMATES_GENERATION_VERSION = "old_version"
+
+def classify_tasks_as_estimable(cache_dir: Path, df_to_process: pd.DataFrame, bust: bool = False) -> pd.DataFrame:
+    """Classify every unique task text as estimable (ATOMIC) or not.
+
+    Args:
+        cache_dir: Directory holding the versioned parquet cache. It is created
+            if it does not exist yet.
+        df_to_process: Tasks to classify. Only the ``task`` column is read, and
+            duplicate task texts are classified once.
+        bust: When True, ignore an existing cache file and recompute.
+
+    Returns:
+        A frame with the columns ``task`` and ``estimable``. ``estimable`` is
+        True only when the model answered ``ATOMIC``. Tasks whose API call or
+        response parsing failed are recorded as ``False``.
+
+    Raises:
+        ValueError: If there is nothing to classify, or if ``enrich`` returns
+            no results or a different number of results than tasks sent.
+    """
+    cache_path = cache_dir / f"task_estimability.{ESTIMABLE_CLASSIFICATION_VERSION}.parquet"
+    if cache_path.exists() and not bust:
+        logger.info(f"Loading cached task estimability from {cache_path}")
+        return pd.read_parquet(cache_path)
+
+    logger.info("Enriching tasks with estimability classification.")
+
+    # One API call per distinct task text; the list order is the order of the results.
+    task_labels = df_to_process['task'].drop_duplicates().tolist()
+
+    logger.info(f"Found {len(task_labels)} unique remote tasks to classify.")
+
+    if not task_labels:
+        raise ValueError("No unique tasks to classify.")
+
+    system_prompt = """
+                    Classify the provided O*NET task into one of these categories:
+                    -  ATOMIC (schedulable): A single, clearly-bounded activity, typically lasting minutes, hours, or a few days.
+                    -  ONGOING-CONSTRAINT (background role/ethical rule): A continuous responsibility or behavioural norm with no schedulable duration (e.g., “follow confidentiality rules,” “serve as department head”).
+                    """.strip()
+
+    results = enrich(
+        model="gpt-4.1-mini",
+        rpm=5000,
+        messages_to_process=[
+            [
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": f"Task: {task_label}"},
+            ]
+            for task_label in task_labels
+        ],
+        schema={
+            "name": "estimability_classification",
+            "schema": {
+                "type": "object",
+                "properties": {"task_category": {"type": "string", "enum": ["ATOMIC", "ONGOING-CONSTRAINT"]}},
+                "required": ["task_category"],
+                "additionalProperties": False
+            }
+        },
+        chunk_size=300,
+    )
+
+    if not results or len(results) != len(task_labels):
+        raise ValueError(f"Task estimability classification failed or returned mismatched number of results. Expected {len(task_labels)}, got {len(results) if results else 0}.")
+
+    classifications = []
+    successful_count = 0
+    for task_label, response in zip(task_labels, results):
+        task_category_flag = None
+
+        if response is None:
+            logger.warning(f"API call failed for task (enrich returned None): '{task_label}'")
+        else:
+            try:
+                content_str = response.choices[0].message.content
+                if not content_str:
+                    raise ValueError("No content found in the response message")
+
+                data = json.loads(content_str)
+
+                if 'task_category' in data and isinstance(data['task_category'], str):
+                    task_category_flag = data['task_category']
+                else:
+                    logger.warning(f"Invalid or missing 'task_category' payload for task '{task_label}'. Data: '{data}'")
+            except (json.JSONDecodeError, AttributeError, KeyError, IndexError, ValueError) as e:
+                logger.warning(f"Could not parse response for task '{task_label}'. Error: {e}. Response: {response}")
+
+        # `estimable` is always a bool, so successes have to be counted here:
+        # the column itself cannot tell a failure from a non-ATOMIC answer.
+        if task_category_flag is not None:
+            successful_count += 1
+
+        classifications.append({
+            'task': task_label,
+            'estimable': task_category_flag == 'ATOMIC'
+        })
+
+    classification_df = pd.DataFrame(classifications)
+
+    logger.info(f"Finished classification. Got {successful_count} successful classifications out of {len(task_labels)} unique tasks.")
+    failed_count = len(task_labels) - successful_count
+    if failed_count:
+        logger.warning(f"{failed_count} tasks could not be classified and are recorded as not estimable.")
+
+    logger.info(f"Saving task estimability classifications to {cache_path}")
+    # Make sure the paid-for results can actually be written.
+    cache_path.parent.mkdir(parents=True, exist_ok=True)
+    classification_df.to_parquet(cache_path)
+
+    return classification_df
+
+
+def _time_bound_schema(bound: str) -> dict:
+    """Build the JSON-schema fragment for one bound ("lower" or "upper") of an estimate."""
+    return {
+        "type": "object",
+        "properties": {
+            "quantity": {
+                "type": "number",
+                "description": f"The numerical value for the {bound} bound of the estimate.",
+            },
+            "unit": {
+                "type": "string",
+                "enum": ALLOWED_UNITS,
+                "description": f"The unit of time for the {bound} bound.",
+            },
+        },
+        "required": ["quantity", "unit"],
+        "additionalProperties": False,
+    }
+
+
+def generate_time_estimates_for_tasks(cache_dir: Path, df_to_process: pd.DataFrame, bust: bool = False) -> pd.DataFrame:
+    """Ask the model for a lower/upper effective-time estimate for every task row.
+
+    Args:
+        cache_dir: Directory holding the versioned parquet cache. It is created
+            if it does not exist yet.
+        df_to_process: One row per task. Reads the columns ``task``,
+            ``occupation_title``, ``occupation_description``, ``onetsoc_code``
+            and ``task_id``.
+        bust: When True, ignore an existing cache file and recompute.
+
+    Returns:
+        A frame with one row per input row and the columns ``onetsoc_code``,
+        ``task_id``, ``lb_estimate_qty``, ``lb_estimate_unit``,
+        ``ub_estimate_qty`` and ``ub_estimate_unit``. The four estimate columns
+        are all empty for a row whose API call or response parsing failed.
+
+    Raises:
+        ValueError: If there are no rows to process, or if ``enrich`` returns
+            no results or a different number of results than rows sent.
+    """
+    cache_path = cache_dir / f"task_estimates.{TIME_ESTIMATES_GENERATION_VERSION}.parquet"
+    if cache_path.exists() and not bust:
+        logger.info(f"Loading cached task estimates from {cache_path}")
+        return pd.read_parquet(cache_path)
+
+    logger.info("Enriching tasks with time estimates.")
+
+    if df_to_process.empty:
+        raise ValueError("No tasks to process for estimates.")
+
+    # Materialise the rows once: the same sequence builds the prompts and is
+    # later paired with the results, with no per-row `.iloc` lookups.
+    rows = list(df_to_process.itertuples(index=False))
+
+    system_prompt = """
+                        You are an expert assistant evaluating the time required for job tasks. Your goal is to estimate the 'effective time' range needed for a skilled human to complete the following job task **remotely**, without supervision
+
+                        'Effective time' is the active, focused work duration required to complete the task. Crucially, **exclude all waiting periods, delays, or time spent on other unrelated activities**. Think of it as the continuous, productive time investment needed if the worker could pause and resume instantly without cost.
+
+                        Provide a lower and upper bound estimate for the 'effective time'. These bounds should capture the time within which approximately 80% of instances of performing this specific task are typically completed by a qualified individual.
+
+                        Base your estimate on the provided task and the associated occupation and occupation description. Your estimate must be in one the allowed units: minute, hour, day, week, month, trimester, semester, year.""".strip()
+
+    results = enrich(
+        model="gpt-4.1-mini",
+        rpm=5000,
+        messages_to_process=[
+            [
+                {"role": "system", "content": system_prompt},
+                {
+                    "role": "user",
+                    "content":  f"{row.task} done by {row.occupation_title} ({row.occupation_description})"
+                }
+            ]
+            for row in rows
+        ],
+        schema={
+            "name": "estimate_time",
+            "strict": True,
+            "schema": {
+                "type": "object",
+                "properties": {
+                    "lower_bound_estimate": _time_bound_schema("lower"),
+                    "upper_bound_estimate": _time_bound_schema("upper"),
+                },
+                "required": ["lower_bound_estimate", "upper_bound_estimate"],
+                "additionalProperties": False,
+            },
+        },
+        chunk_size=200,
+    )
+
+    if not results or len(results) != len(rows):
+        raise ValueError(f"API call for task estimates failed or returned mismatched number of results. "
+            f"Expected {len(rows)}, got {len(results) if results else 0}.")
+
+    estimates = []
+    for row, response in zip(rows, results):
+        task_info = f"O*NET: {row.onetsoc_code}, Task ID: {row.task_id}"
+        # All four fields stay None unless the whole payload parses.
+        lb_qty, lb_unit, ub_qty, ub_unit = None, None, None, None
+
+        if response is None:
+            logger.warning(f"API call failed for task (enrich returned None): {task_info}")
+        else:
+            try:
+                content_str = response.choices[0].message.content
+                if not content_str:
+                    raise ValueError("No content found in the response message")
+
+                data = json.loads(content_str)
+
+                lower = data['lower_bound_estimate']
+                upper = data['upper_bound_estimate']
+                parsed = (lower['quantity'], lower['unit'], upper['quantity'], upper['unit'])
+            except Exception as e:
+                # Deliberately broad: one malformed response must not discard
+                # the rest of an already paid-for batch.
+                logger.warning(f"Could not parse valid estimate for task {task_info}. Error: {e}. Response: {response}")
+            else:
+                # Assigned in one step so a partial parse never leaks through.
+                lb_qty, lb_unit, ub_qty, ub_unit = parsed
+
+        estimates.append({
+            'onetsoc_code': row.onetsoc_code,
+            'task_id': row.task_id,
+            'lb_estimate_qty': lb_qty,
+            'lb_estimate_unit': lb_unit,
+            'ub_estimate_qty': ub_qty,
+            'ub_estimate_unit': ub_unit
+        })
+
+    estimates_df = pd.DataFrame(estimates)
+    logger.info(f"Finished estimates. Got {estimates_df['lb_estimate_qty'].notna().sum()} successful estimates out of {len(rows)} tasks.")
+
+    logger.info(f"Saving task estimates to {cache_path}")
+    # Make sure the paid-for results can actually be written.
+    cache_path.parent.mkdir(parents=True, exist_ok=True)
+    estimates_df.to_parquet(cache_path)
+
+    return estimates_df
--- a/pipeline/constants.py
+++ b/pipeline/constants.py
@ -1,35 +0,0 @@
-OCCUPATION_MAJOR_CODES = {
-    '11': 'Management',
-    '13': 'Business & Financial',
-    '15': 'Computer & Mathematical',
-    '17': 'Architecture & Engineering',
-    '19': 'Life, Physical, & Social Science',
-    '21': 'Community & Social Service',
-    '23': 'Legal',
-    '25': 'Education, Training, & Library',
-    '27': 'Arts, Design, & Media',
-    '29': 'Healthcare Practitioners',
-    '31': 'Healthcare Support',
-    '33': 'Protective Service',
-    '35': 'Food Preparation & Serving',
-    '37': 'Building & Grounds Maintenance',
-    '39': 'Personal Care & Service',
-    '41': 'Sales & Related',
-    '43': 'Office & Admin Support',
-    '45': 'Farming, Fishing, & Forestry',
-    '47': 'Construction & Extraction',
-    '49': 'Installation, Maintenance, & Repair',
-    '51': 'Production',
-    '53': 'Transportation & Material Moving',
-    '55': 'Military Specific',
-}
-
-GRAY   = {'50':'#f8fafc','100':'#f1f5f9','200':'#e2e8f0',
-    '300':'#cbd5e1','400':'#94a3b8','500':'#64748b',
-    '600':'#475569','700':'#334155','800':'#1e293b',
-    '900':'#0f172a','950':'#020617'}
-
-LIME            = {'50': '#f7fee7','100': '#ecfcca','200': '#d8f999',
-    '300': '#bbf451','400': '#9ae600','500': '#83cd00',
-    '600': '#64a400','700': '#497d00','800': '#3c6300',
-    '900': '#35530e','950': '#192e03'}
--- a/pipeline/enrichments.py
+++ b/pipeline/enrichments.py
@ -1,97 +0,0 @@
-"""
-This module enriches data, they take time to run, and are usually expensive (API calls...),
-they should manage their own state, and only be run if the data's version is different than
-their save.
-"""
-from .run import Run
-import pandas as pd
-from typing import Any, List, Dict
-import litellm
-
-def enrich(
-    model: str,
-    rpm: int,
-    messages_to_process: List[List[Dict[str, str]]],
-    schema: Dict[str, Any],
-    chunk_size: int = 100,
-):
-    # Use litellm.batch_completion
-    pass
-
-def enrich_with_task_estimateability(run: Run) -> pd.DataFrame:
-    output_path = run.cache_dir / "computed_task_estimateability.parquet"
-    if output_path.exists():
-        print(f"Loading cached task estimateability from {output_path}")
-        return pd.read_parquet(output_path)
-
-    df_remote_tasks = run.df_tasks[run.df_tasks['remote_status'] == 'remote'].copy()
-
-    # In the old script, we only passed unique tasks to the API
-    df_unique_tasks = df_remote_tasks.drop_duplicates(subset=['task'])
-
-
-    results = enrich(
-        model="gpt-4.1-mini",
-        rpm=5000,
-        messages_to_process=[
-            [
-                {"role": "system", "content":  """
-                    Judge whether the provided O*NET task is suitable for a time estimate. If it is a single, clearly-bounded activity, typically lasting minutes, hours, or a few days, then clearly yes. If it is a continuous responsibility or behavioural norm with no schedulable duration (e.g., “follow confidentiality rules,” “serve as department head”), then clearly no.
-                    """},
-                {"role": "user", "content": f"Task: {row.task}"},
-            ]
-            for row in df_unique_tasks.itertuples()
-        ],
-        schema={
-            "type": "object",
-            "properties": {"estimateable": {"type": "bool"}},
-            "required": ["estimateable"]
-        },
-        chunk_size=300,
-    )
-
-    # Create a new dataframe with just enough information to identify the task uniquely + estimateability classification, save it, return it. Careful: the "task" column in itself is not unique.
-    return pd.DataFrame()
-
-def enrich_with_task_estimates(run: Run) -> pd.DataFrame:
-    output_path = run.cache_dir / "computed_task_estimates.parquet"
-    if output_path.exists():
-        print(f"Loading cached task estimates from {output_path}")
-        return pd.read_parquet(output_path)
-
-    df = ... # todo
-
-    results = enrich(
-        model="gpt-4.1-mini",
-        rpm=5000,
-        messages_to_process=[
-            [
-                {"role": "system", "content":  "Estimate the time required to complete the following O*NET task. Your estimate should be a plausible range for how long it might take a typical, qualified worker to perform this task once. Provide your answer as a time range (lower and upper bounds). Do not provide explanations or apologies. If the task is not suitable for a time estimate (e.g., it is an ongoing responsibility), interpret it as a single, schedulable action."},
-                {"role": "user", "content":  f"""
-                    Task: {row.task}
-                    For Occupation: {row.occupation_title}
-                    Occupation Description: {row.occupation_description}"""}
-            ]
-            for row in df.itertuples()
-        ],
-        schema={
-            "type": "object",
-            "properties": {
-                "lower_bound_estimate": {
-                    "type": "object",
-                    "properties": {"quantity": {"type": "number"}, "unit": {"type": "string", "enum": ["minutes", "hours", "days"]}},
-                    "required": ["quantity", "unit"],
-                },
-                "upper_bound_estimate": {
-                    "type": "object",
-                    "properties": {"quantity": {"type": "number"}, "unit": {"type": "string", "enum": ["minutes", "hours", "days"]}},
-                    "required": ["quantity", "unit"],
-                },
-            },
-            "required": ["lower_bound_estimate", "upper_bound_estimate"],
-        },
-        chunk_size=200,
-    )
-
-    # Create a new dataframe with just enough information to identify the task uniquely + the estimates classification, save it, return it. Careful: the "task" column in itself is not unique.
-    raise NotImplementedError
--- a/pipeline/fetchers.py
+++ b/pipeline/fetchers.py
@ -1,50 +1,30 @@
-"""
-Fetchers retrieve remote data and return it in a format suitable for further processing, they also return its version, which should be considered opaque, though it is usually a checksum.
-"""
-
 import sqlite3
-from typing import Tuple
 import pandas as pd
 import requests
 import io
 import zipfile
-from pipeline.run import Run
-from pipeline.logger import logger
+import yaml
+from pathlib import Path
+from .logger import logger
+from typing import Tuple, Dict

-def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
-    """
-    Downloads the O*NET database, creates a local SQLite file from it, and returns a connection.
-    """
-    version  = "29_1"
-    url = f"https://www.onetcenter.org/dl_files/database/db_{version}_mysql.zip"
-    db_path = run.cache_dir / f"onet_{version}.db"
-    run.meta.fetchers['onet'] = {
-        'url': url,
-        'version': version,
-        'db_path': str(db_path),
-    }
+ONET_VERSION  = "29_1"
+ONET_URL = f"https://www.onetcenter.org/dl_files/database/db_{ONET_VERSION}_mysql.zip"

-    if db_path.exists():
-        logger.info(f"Using cached O*NET database: {db_path}")
-        conn = sqlite3.connect(db_path)
-        return conn, version
+def fetch_onet_database(cache_dir: Path) -> sqlite3.Connection:
+    DB_PATH = cache_dir / f"onet_{ONET_VERSION}.db"

-    logger.info(f"Downloading O*NET database from {url}")
-    response = requests.get(url, stream=True, headers={
+    if DB_PATH.exists():
+        logger.info(f"Using cached O*NET database: {DB_PATH}")
+        return sqlite3.connect(DB_PATH)
+
+    logger.info(f"Downloading O*NET database from {ONET_URL}")
+    response = requests.get(ONET_URL, stream=True, headers={
        "User-Agent": "econ-agent/1.0"
    })
    response.raise_for_status()

-    # Read content into memory
-    zip_content = response.content
-
-    db_path = run.cache_dir / f"onet_{version}.db"
-
-    logger.info(f"Creating new O*NET database: {db_path}")
-    conn = sqlite3.connect(db_path)
-
-    # Set performance PRAGMAs for fast import
-    logger.info("Creating new SQLite database with performance settings")
+    conn = sqlite3.connect(DB_PATH)
    conn.executescript("""
        PRAGMA journal_mode = OFF;
        PRAGMA synchronous = 0;
@ -54,6 +34,7 @@ def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
        PRAGMA foreign_keys = ON;
    """)

+    zip_content = response.content
    with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
        sql_scripts = []
        for filename in sorted(z.namelist()):
@ -63,14 +44,10 @@ def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
        if not sql_scripts:
            raise RuntimeError("No SQL files found in the O*NET zip archive.")

-        # Combine and execute all SQL files in one transaction
-        full_script = "BEGIN TRANSACTION;\n" + "\n".join(sql_scripts) + "\nCOMMIT;"
-
        logger.info("Executing SQL files in alphabetical order (single transaction mode)")
+        full_script = "BEGIN TRANSACTION;\n" + "\n".join(sql_scripts) + "\nCOMMIT;"
        conn.executescript(full_script)
-        logger.info("Database populated successfully. Restoring reliability settings...")

-    # Restore reliability-focused settings after import
    conn.executescript("""
        PRAGMA journal_mode = WAL;
        PRAGMA synchronous = NORMAL;
@ -81,87 +58,75 @@ def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
    """)
    conn.execute("VACUUM;")
    conn.commit()
-    logger.info("Reliability settings restored and database optimized successfully!")

-    return conn, version
+    return conn

-def fetch_oesm_data(run: Run) -> Tuple[pd.DataFrame, str]:
-    """
-    Downloads the OESM national data from the BLS website.
-    """
-    version = "23"
-    url = f"https://www.bls.gov/oes/special-requests/oesm{version}nat.zip"
-    parquet_path = run.cache_dir / "oesm.parquet"
-    run.meta.fetchers['oesm'] = {
-        'url': url,
-        'version': version,
-        'parquet_path': str(parquet_path),
-    }
+def fetch_oesm_data(cache_dir: Path) -> pd.DataFrame:
+    VERSION = "23"
+    URL = f"https://www.bls.gov/oes/special-requests/oesm{VERSION}nat.zip"
+    DATA_PATH = cache_dir / "oesm.parquet"

-    if parquet_path.exists():
-        logger.info(f"Using cached OESM data: {parquet_path}")
-        return pd.read_parquet(parquet_path), version
+    if DATA_PATH.exists():
+        logger.info(f"Using cached OESM data: {DATA_PATH}")
+        return pd.read_parquet(DATA_PATH)

-    logger.info(f"Downloading OESM data from {url}")
+    logger.info(f"Downloading OESM data from {URL}")
    headers = {'User-Agent': 'econ-agent/1.0'}
-    response = requests.get(url, headers=headers)
+    response = requests.get(URL, headers=headers)
    response.raise_for_status()

    zip_content = response.content
-    logger.info(f"OESM data version: {version}")

-    logger.info(f"Creating new OESM data cache: {parquet_path}")
+    logger.info(f"Creating new OESM data cache: {DATA_PATH}")
    with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
-        # Find the excel file in the zip
-        excel_filename = None
-        for filename in z.namelist():
-            logger.debug(f"Found file in OESM zip: {filename}")
-            if filename.lower().endswith(".xlsx"):
-                excel_filename = filename
-                break
-
-        if excel_filename is None:
-            raise FileNotFoundError("Could not find the Excel file in the OESM zip archive.")
-
-        logger.info(f"Reading {excel_filename} from zip archive.")
-        with z.open(excel_filename) as f:
+        with z.open(f"oesm{VERSION}national.xlsx") as f:
            df = pd.read_excel(f, engine='openpyxl', na_values=['*', '#'])

-    df.to_parquet(parquet_path)
-    logger.info(f"Saved OESM data to cache: {parquet_path}")
-    return df, version
+    df.to_parquet(DATA_PATH)
+    logger.info(f"Saved OESM data to cache: {DATA_PATH}")
+    return df

-def fetch_epoch_remote_data(run: Run) -> Tuple[pd.DataFrame, str]:
-    """
-    Downloads the EPOCH AI remote work task data.
-    """
-    # This is the direct download link constructed from the Google Drive share link
-    version = "latest"
-    url = "https://drive.google.com/uc?export=download&id=1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r"
-    parquet_path = run.cache_dir / f"epoch_remote_{version}.parquet"
-    run.meta.fetchers['epoch_remote'] = {
-        'url': url,
-        'version': version,
-        'parquet_path': str(parquet_path),
-    }
+def fetch_epoch_remote_data(cache_dir: Path) -> pd.DataFrame:
+    URL = "https://drive.google.com/uc?export=download&id=1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r"
+    DATA_PATH = cache_dir / f"epoch_remote_latest.parquet"

-    if parquet_path.exists():
-        logger.info(f"Using cached EPOCH remote data: {parquet_path}")
-        return pd.read_parquet(parquet_path), version
+    if DATA_PATH.exists():
+        logger.info(f"Using cached EPOCH remote data: {DATA_PATH}")
+        return pd.read_parquet(DATA_PATH)

-    logger.info(f"Downloading EPOCH remote data from Google Drive: {url}")
+    logger.info(f"Downloading EPOCH remote data from Google Drive: {URL}")

-    # Need to handle potential cookies/redirects from Google Drive
    session = requests.Session()
    session.headers.update({"User-Agent": "econ-agent/1.0"})
-    response = session.get(url, stream=True)
+    response = session.get(URL, stream=True)
    response.raise_for_status()

    csv_content = response.content

-    logger.info(f"Creating new EPOCH remote data cache: {parquet_path}")
+    logger.info(f"Creating new EPOCH remote data cache: {DATA_PATH}")
    df = pd.read_csv(io.BytesIO(csv_content))
-    df.to_parquet(parquet_path)
-    logger.info(f"Saved EPOCH remote data to cache: {parquet_path}")
+    df.to_parquet(DATA_PATH)

-    return df, version
+    return df
+
+def fetch_metr_data(cache_dir: Path) -> Dict:
+    """
+    Returns the parsed METR benchmark results, downloading them on first use.
+
+    The raw YAML is cached as "metr_benchmark_results.yaml" inside `cache_dir`;
+    when that file already exists it is parsed and returned without any
+    network access.
+
+    Args:
+        cache_dir: Directory holding the cached download.
+
+    Returns:
+        The result of `yaml.safe_load` on the benchmark file.
+
+    Raises:
+        requests.HTTPError: If the server answers with an error status.
+        yaml.YAMLError: If the cached or downloaded payload is not valid YAML.
+    """
+    URL = "https://metr.org/assets/benchmark_results.yaml"
+    DATA_PATH = cache_dir / "metr_benchmark_results.yaml"
+
+    if DATA_PATH.exists():
+        logger.info(f"Using cached METR data: {DATA_PATH}")
+        # Read bytes so PyYAML detects the encoding itself, exactly as it does
+        # for the freshly downloaded payload (text mode would use the locale).
+        with open(DATA_PATH, "rb") as f:
+            return yaml.safe_load(f)
+
+    logger.info(f"Downloading METR data from {URL}")
+    headers = {"User-Agent": "econ-agent/1.0"}
+    # Without a timeout a stalled connection would block the pipeline forever.
+    response = requests.get(URL, headers=headers, timeout=60)
+    response.raise_for_status()
+
+    yaml_content = response.content
+    # Parse before caching: an invalid payload must not be persisted, otherwise
+    # every later run would keep failing on the poisoned cache file.
+    metr_results = yaml.safe_load(yaml_content)
+
+    logger.info(f"Creating new METR data cache: {DATA_PATH}")
+    with open(DATA_PATH, "wb") as f:
+        f.write(yaml_content)
+
+    return metr_results
--- a/pipeline/generators/init.py
+++ b/pipeline/generators/init.py
@ -1,5 +1,15 @@
 from .estimate_histplot import generate_estimate_histplot
+from .estimates_spread_per_occupation import generate_estimate_spread_per_occupation
+from .estimates_lower_vs_upper_scatter import generate_estimates_lower_vs_upper_scatter
+from .sequential_coherence_cdf import plot_sequential_coherence_cdf
+from .projected_automatable_wage_bill import generate_projected_automatable_wage_bill
+from .projected_task_automation import generate_projected_task_automation_plot

 GENERATORS = [
-    generate_estimate_histplot
+    generate_estimate_histplot,
+    generate_estimate_spread_per_occupation,
+    generate_estimates_lower_vs_upper_scatter,
+    #plot_sequential_coherence_cdf,
+    generate_projected_automatable_wage_bill,
+    generate_projected_task_automation_plot,
 ]
--- a/pipeline/generators/estimate_histplot.py
+++ b/pipeline/generators/estimate_histplot.py
@ -1,6 +1,32 @@
-from ..run import Run
 from pathlib import Path
 from typing import Generator
+import matplotlib.pyplot as plt
+import seaborn as sns
+import pandas as pd
+from ..utils import style_plot

-def generate_estimate_histplot(run: Run) -> Generator[Path]:
-    raise NotImplementedError
+def generate_estimate_histplot(output_dir: Path, df: pd.DataFrame, **kwargs) -> Generator[Path, None, None]:
+    """
+    Generates a styled histogram of the distribution of midpoint time estimates.
+
+    Args:
+        output_dir: Directory the PNG is written into.
+        df: Task DataFrame; its 'estimate_midpoint' column is plotted on a
+            log-scaled x-axis.
+        **kwargs: Accepted and ignored (presumably so every generator can be
+            called with the same keyword arguments -- confirm against the runner).
+
+    Yields:
+        The path of the saved "estimate_distribution_histplot.png".
+    """
+    style_plot()
+    OUTPUT_PATH = output_dir / "estimate_distribution_histplot.png"
+
+    fig, ax = plt.subplots()
+
+    sns.histplot(
+        data=df,
+        x='estimate_midpoint',
+        log_scale=True,
+        ax=ax
+    )
+
+    ax.set_xlabel("Task Time (minutes, log scale)")
+    ax.set_ylabel("Number of Tasks")
+    ax.set_title("Distribution of Time Estimates for Atomic Tasks")
+
+    # Act on the figure created above rather than on pyplot's implicit current figure
+    fig.tight_layout()
+    fig.savefig(OUTPUT_PATH)
+    plt.close(fig)
+
+    yield OUTPUT_PATH
--- a/pipeline/generators/estimates_lower_vs_upper_scatter.py
+++ b/pipeline/generators/estimates_lower_vs_upper_scatter.py
@ -0,0 +1,56 @@
+from pathlib import Path
+from typing import Generator
+import matplotlib.pyplot as plt
+import seaborn as sns
+import pandas as pd
+from ..utils import OCCUPATION_MAJOR_CODES, style_plot
+
+
+def generate_estimates_lower_vs_upper_scatter(output_dir: Path, df: pd.DataFrame, **kwargs) -> Generator[Path, None, None]:
+    """
+    Generates a styled scatter plot of lower-bound vs upper-bound time estimates for tasks.
+
+    Args:
+        output_dir: Directory the PNG is written into.
+        df: Task DataFrame; reads 'lb_estimate_in_minutes',
+            'ub_estimate_in_minutes' and 'onetsoc_major'. It is not modified
+            (a copy is relabelled for the legend).
+        **kwargs: Accepted and ignored.
+
+    Yields:
+        The path of the saved "estimates_lower_vs_upper_scatter.png".
+    """
+    style_plot()
+    OUTPUT_PATH = output_dir / "estimates_lower_vs_upper_scatter.png"
+
+    plot_df = df.copy()
+    # Replace onetsoc_major codes with their corresponding labels for the plot legend
+    plot_df['onetsoc_major'] = plot_df['onetsoc_major'].map(OCCUPATION_MAJOR_CODES)
+
+    fig, ax = plt.subplots(figsize=(12, 10))
+    sns.scatterplot(
+        data=plot_df,
+        x='lb_estimate_in_minutes',
+        y='ub_estimate_in_minutes',
+        alpha=0.3,
+        edgecolor=None,
+        hue="onetsoc_major",
+        ax=ax
+    )
+
+    # 45° reference line (y=x), extended 10% beyond the smallest and largest estimate
+    lowest = min(df['lb_estimate_in_minutes'].min(), df['ub_estimate_in_minutes'].min())
+    highest = max(df['lb_estimate_in_minutes'].max(), df['ub_estimate_in_minutes'].max())
+    lims = (lowest * 0.9, highest * 1.1)
+    ax.plot(lims, lims, color='black', linestyle='--', linewidth=1, zorder=0)
+
+    # Helper lines where the upper bound is 2x, 10x and 100x the lower bound
+    for ratio in (2, 10, 100):
+        ax.plot(lims, [ratio * lim for lim in lims],
+                linestyle=':', color='grey', linewidth=1, zorder=0)
+
+    ax.set_xscale('log')
+    ax.set_yscale('log')
+    ax.set_xlabel('Lower-bound (min, log scale)')
+    ax.set_ylabel('Upper-bound (min, log scale)')
+    ax.set_title('Lower vs Upper Estimates for All Tasks')
+
+    # Anchor the legend outside the axes so it does not cover the points
+    ax.legend(title="Occupation Major Group", bbox_to_anchor=(1.02, 1), loc='upper left')
+
+    fig.tight_layout()
+    # bbox_inches='tight' keeps the outside legend inside the saved image
+    fig.savefig(OUTPUT_PATH, bbox_inches='tight')
+    plt.close(fig)
+
+    yield OUTPUT_PATH
--- a/pipeline/generators/estimates_spread_per_occupation.py
+++ b/pipeline/generators/estimates_spread_per_occupation.py
@ -0,0 +1,39 @@
+from pathlib import Path
+from typing import Generator
+import matplotlib.pyplot as plt
+import seaborn as sns
+import pandas as pd
+from ..utils import OCCUPATION_MAJOR_CODES, style_plot
+
+
+def generate_estimate_spread_per_occupation(output_dir: Path, df: pd.DataFrame, **kwargs) -> Generator[Path, None, None]:
+    """
+    Generates a styled boxplot of the estimate range spread per major occupation group.
+
+    Args:
+        output_dir: Directory the PNG is written into.
+        df: Task DataFrame; reads 'onetsoc_major' (x) and 'estimate_range' (y).
+        **kwargs: Accepted and ignored.
+
+    Yields:
+        The path of the saved "estimates_spread_per_occupation.png".
+    """
+    style_plot()
+    OUTPUT_PATH = output_dir / "estimates_spread_per_occupation.png"
+
+    fig, ax = plt.subplots(figsize=(10, 12))
+
+    sns.boxplot(
+        data=df,
+        x='onetsoc_major',
+        y='estimate_range',
+        showfliers=False,
+        ax=ax
+    )
+
+    ax.set_yscale('log')
+    ax.set_xlabel('Occupation')
+    ax.set_ylabel('Range (upper-lower, minutes)')
+    ax.set_title('Spread of time-range estimates per occupation')
+
+    # Get occupation labels from codes for x-axis ticks; unknown codes are shown as-is
+    labels = [OCCUPATION_MAJOR_CODES.get(code.get_text(), code.get_text()) for code in ax.get_xticklabels()]
+    ax.set_xticklabels(labels, rotation=60, ha='right')
+
+    # Act on the figure created above rather than on pyplot's implicit current figure
+    fig.tight_layout()
+    fig.savefig(OUTPUT_PATH)
+    plt.close(fig)
+
+    yield OUTPUT_PATH
--- a/pipeline/generators/helpers.py
+++ b/pipeline/generators/helpers.py
@ -1,6 +0,0 @@
-import pandas as pd
-from typings import List
-
-def must_have_columns(df: pd.DataFrame, columns: List[str]):
-    if not all(col in df.columns for col in columns):
-        raise ValueError(f"DataFrame is missing required columns: {columns}")
--- a/pipeline/generators/projected_automatable_wage_bill.py
+++ b/pipeline/generators/projected_automatable_wage_bill.py
@ -0,0 +1,229 @@
+from pathlib import Path
+from typing import Generator, Dict, Tuple, Optional
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.ticker as mticker
+from scipy.stats import linregress
+from datetime import datetime
+from ..utils import style_plot, LIME
+
+def _generate_wage_projection_data(
+    metr_results: Dict,
+    df_with_wages: pd.DataFrame,
+    percentile_key: str,
+    doubling_time_modifier: float,
+) -> Optional[Tuple[pd.DataFrame, pd.DataFrame, float]]:
+    """
+    Builds the automatable-wage-bill projection for one AI progress scenario.
+
+    A log-linear trend is fitted to the METR horizon lengths over release
+    dates, its doubling time is scaled by `doubling_time_modifier`, and the
+    resulting horizon is projected month by month until 2035-01-01. For every
+    projected horizon the wages of all tasks whose time estimate fits within
+    that horizon are summed.
+
+    Args:
+        metr_results: The METR benchmark data.
+        df_with_wages: Tasks with 'lb_estimate_in_minutes', 'estimate_midpoint',
+            'ub_estimate_in_minutes' and 'wage_per_task' columns.
+        percentile_key: The percentile to use from METR data (e.g., 'p50_horizon_length').
+        doubling_time_modifier: Multiplier for the doubling time (e.g., 1.0 for baseline,
+                                  0.5 for optimistic, 2.0 for pessimistic).
+
+    Returns:
+        A tuple of (metr_df, projection_df, doubling_time_days), or None when
+        fewer than two usable METR data points are available.
+    """
+    def wage_bill_within(horizon, estimate_column):
+        """Sums the wages of the tasks whose estimate does not exceed `horizon`."""
+        fits = df_with_wages[estimate_column] <= horizon
+        return df_with_wages.loc[fits, 'wage_per_task'].sum()
+
+    # One record per (model, agent) pair that has both a release date and a horizon
+    records = []
+    for model in metr_results.get("results", {}).values():
+        for agent in model.get("agents", {}).values():
+            released = model.get("release_date")
+            horizon = agent.get(percentile_key, {}).get("estimate")
+            if not released or horizon is None:
+                continue
+            records.append({"release_date": released, "horizon_minutes": horizon})
+
+    if not records:
+        return None
+
+    metr_df = pd.DataFrame(records).sort_values("release_date").reset_index(drop=True)
+    metr_df['release_date'] = pd.to_datetime(metr_df['release_date'])
+    # The regression runs on log(horizon), so non-positive horizons are dropped
+    metr_df = metr_df[metr_df['horizon_minutes'] > 0].copy()
+
+    if len(metr_df) < 2:
+        return None
+
+    metr_df['days_since_start'] = (metr_df['release_date'] - metr_df['release_date'].min()).dt.days
+    slope, intercept, *_ = linregress(metr_df['days_since_start'], np.log(metr_df['horizon_minutes']))
+
+    # Apply the scenario modifier to the doubling time, then convert back to a slope
+    base_doubling_time_days = np.log(2) / slope
+    modified_doubling_time_days = base_doubling_time_days * doubling_time_modifier
+    modified_slope = np.log(2) / modified_doubling_time_days
+
+    # Month-end dates from the first release up to 2035
+    start_date = metr_df['release_date'].min()
+    future_dates = pd.to_datetime(pd.date_range(start=start_date, end="2035-01-01", freq="ME"))
+    future_days = (future_dates - start_date).days.to_numpy()
+
+    projection_df = pd.DataFrame({
+        "date": future_dates,
+        "projected_coherence_minutes": np.exp(intercept + modified_slope * future_days),
+    })
+
+    # Total wage bill of the tasks automated over time, per estimate bound
+    estimate_columns = {
+        "lb": 'lb_estimate_in_minutes',
+        "mid": 'estimate_midpoint',
+        "ub": 'ub_estimate_in_minutes',
+    }
+    for bound, estimate_column in estimate_columns.items():
+        projection_df[f"automatable_wage_bill_{bound}"] = projection_df["projected_coherence_minutes"].apply(
+            lambda horizon: wage_bill_within(horizon, estimate_column)
+        )
+
+    # Same measure for the actual METR data points, used for plotting
+    metr_df["automatable_wage_bill_mid"] = metr_df["horizon_minutes"].apply(
+        lambda horizon: wage_bill_within(horizon, 'estimate_midpoint')
+    )
+
+    return metr_df, projection_df, modified_doubling_time_days
+
+
+def _plot_scenario(ax, projection_df, metr_df, label, color, line_style='-'):
+    """
+    Draws one projection scenario on `ax`: the mid-point wage bill curve, the
+    shaded lower/upper bound band and the METR data points.
+    """
+    dates = projection_df["date"]
+    mid_curve = projection_df["automatable_wage_bill_mid"]
+    band_low = projection_df["automatable_wage_bill_lb"]
+    band_high = projection_df["automatable_wage_bill_ub"]
+
+    # Projected wage bill
+    ax.plot(dates, mid_curve, label=label, color=color, linewidth=2.5, linestyle=line_style, zorder=3)
+
+    # Shaded range between the lower- and upper-bound projections
+    ax.fill_between(dates, band_low, band_high, color=color, alpha=0.15, zorder=2)
+
+    # Actual METR data points against the wage bill
+    ax.scatter(
+        metr_df['release_date'],
+        metr_df['automatable_wage_bill_mid'],
+        color=color,
+        edgecolor='black',
+        s=60,
+        zorder=4,
+        label="Model Capabilities (P50)",
+    )
+
+
+def generate_projected_automatable_wage_bill(
+    output_dir: Path,
+    df: pd.DataFrame,
+    task_summary_by_occupation_df: pd.DataFrame,
+    metr_results: Dict,
+    **kwargs,
+) -> Generator[Path, None, None]:
+    """
+    Generates a plot projecting the automatable wage bill under different
+    AI progress scenarios (optimistic, baseline, pessimistic).
+
+    Args:
+        output_dir: Directory the PNG is written into.
+        df: Task DataFrame with 'onetsoc_code' and the estimate columns read
+            by `_generate_wage_projection_data`.
+        task_summary_by_occupation_df: One row per occupation with
+            'onetsoc_code', 'wage_bill' and 'total_tasks'.
+        metr_results: The METR benchmark data.
+        **kwargs: Accepted and ignored.
+
+    Yields:
+        The path of "projected_automatable_wage_bill_sensitivity.png". Nothing
+        is yielded when no scenario could be projected.
+    """
+    style_plot()
+    OUTPUT_PATH = output_dir / "projected_automatable_wage_bill_sensitivity.png"
+
+    # 1. Calculate wage_per_task for each occupation
+    wage_bill_info = task_summary_by_occupation_df[['onetsoc_code', 'wage_bill', 'total_tasks']].copy()
+    wage_bill_info['wage_per_task'] = wage_bill_info['wage_bill'] / wage_bill_info['total_tasks']
+    # Occupations with zero tasks divide to +/-inf; treat them as contributing nothing
+    wage_bill_info = wage_bill_info.replace([np.inf, -np.inf], 0)
+    wage_bill_info = wage_bill_info.drop(columns=['wage_bill', 'total_tasks'])
+
+    # 2. Merge wage_per_task into the main task dataframe
+    df_with_wages = pd.merge(df, wage_bill_info, on='onetsoc_code', how='left')
+    # Assign the result back: a chained `df[col].fillna(..., inplace=True)` acts on a
+    # temporary Series and leaves the frame unchanged under pandas Copy-on-Write.
+    df_with_wages['wage_per_task'] = df_with_wages['wage_per_task'].fillna(0)
+
+    # 3. Generate data for all three scenarios
+    scenarios = {
+        "Optimistic": {"modifier": 0.5, "color": "tab:green", "style": "--"},
+        "Baseline": {"modifier": 1.0, "color": LIME['600'], "style": "-"},
+        "Pessimistic": {"modifier": 2.0, "color": "tab:red", "style": ":"},
+    }
+
+    projection_results = {}
+    for name, config in scenarios.items():
+        result = _generate_wage_projection_data(metr_results, df_with_wages, 'p50_horizon_length', config['modifier'])
+        if result:
+            projection_results[name] = result
+
+    if not projection_results:
+        print("Warning: Could not generate any projection data. Skipping wage bill plot.")
+        return
+
+    # 4. Create the plot
+    fig, ax = plt.subplots(figsize=(14, 9))
+
+    # We only need to plot the scatter points once, let's use the baseline ones.
+    if "Baseline" in projection_results:
+        metr_df, _, _ = projection_results["Baseline"]
+        ax.scatter(
+            metr_df['release_date'],
+            metr_df['automatable_wage_bill_mid'],
+            color='black',
+            s=80,
+            zorder=5,
+            label="Model Capabilities (P50)"
+        )
+
+    legend_lines = []
+    for name, (_, proj_df, doubling_time) in projection_results.items():
+        config = scenarios[name]
+        ax.plot(
+            proj_df["date"],
+            proj_df["automatable_wage_bill_mid"],
+            color=config['color'],
+            linestyle=config['style'],
+            linewidth=2.5,
+            zorder=3
+        )
+        ax.fill_between(
+            proj_df["date"],
+            proj_df["automatable_wage_bill_lb"],
+            proj_df["automatable_wage_bill_ub"],
+            color=config['color'],
+            alpha=0.15,
+            zorder=2
+        )
+        # Create a custom line for the legend (the plotted curves carry no label)
+        line = plt.Line2D([0], [0], color=config['color'], linestyle=config['style'], lw=2.5,
+                          label=f'{name} (Doubling Time: {doubling_time:.0f} days)')
+        legend_lines.append(line)
+
+    # 5. Styling and annotations
+    ax.set_title("Projected Automatable Wage Bill (P50 Coherence)", fontsize=18, pad=20)
+    ax.set_xlabel("Year", fontsize=12)
+    ax.set_ylabel("Automatable Annual Wage Bill (Trillions of USD)", fontsize=12)
+
+    # Format Y-axis to show trillions
+    def trillions_formatter(x, pos):
+        return f'${x / 1e12:.1f}T'
+    ax.yaxis.set_major_formatter(mticker.FuncFormatter(trillions_formatter))
+
+    # Cap the axis slightly above the wage bill of all tasks (the projection's ceiling)
+    total_wage_bill = df_with_wages['wage_per_task'].sum()
+    ax.set_ylim(0, total_wage_bill * 1.05)
+
+    if "Baseline" in projection_results:
+        _, proj_df, _ = projection_results["Baseline"]
+        ax.set_xlim(datetime(2022, 1, 1), proj_df["date"].max())
+
+    # Create the legend from the custom lines and the scatter plot
+    scatter_legend = ax.get_legend_handles_labels()[0]
+    ax.legend(handles=legend_lines + scatter_legend, loc="upper left", fontsize=11)
+
+    ax.grid(True, which="both", linestyle="--", linewidth=0.5)
+    fig.tight_layout()
+    fig.savefig(OUTPUT_PATH)
+    plt.close(fig)
+
+    print(f"Generated sensitivity analysis plot: {OUTPUT_PATH}")
+    yield OUTPUT_PATH
--- a/pipeline/generators/projected_task_automation.py
+++ b/pipeline/generators/projected_task_automation.py
@ -0,0 +1,168 @@
+from pathlib import Path
+from typing import Generator, Dict, Tuple
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from scipy.stats import linregress
+from datetime import datetime
+from ..utils import style_plot, LIME
+
+def _generate_projection_data(
+    metr_results: Dict,
+    df: pd.DataFrame,
+    percentile_key: str,
+) -> Tuple[pd.DataFrame, pd.DataFrame] | None:
+    """
+    Projects the share of tasks within reach of the METR horizon trend for one
+    percentile key (e.g., 'p50_horizon_length').
+
+    Args:
+        metr_results: The METR benchmark data.
+        df: Tasks with 'lb_estimate_in_minutes', 'estimate_midpoint' and
+            'ub_estimate_in_minutes' columns.
+        percentile_key: Which horizon entry of every agent to read.
+
+    Returns:
+        A tuple of (metr_df_with_pct, projection_df), or None when there are
+        fewer than two usable METR data points or `df` has no rows.
+    """
+    def pct_within(horizon, estimate_column):
+        """Percentage of tasks whose estimate does not exceed `horizon`."""
+        return (df[estimate_column] <= horizon).sum() / total_tasks * 100
+
+    # 1. Collect one record per (model, agent) pair carrying the requested percentile
+    records = []
+    for model_name, model in metr_results.get("results", {}).items():
+        for agent_name, agent in model.get("agents", {}).items():
+            released = model.get("release_date")
+            horizon = agent.get(percentile_key, {}).get("estimate")
+            if not released or horizon is None:
+                continue
+            records.append({
+                "model": f"{model_name}-{agent_name}",
+                "release_date": released,
+                "horizon_minutes": horizon,
+            })
+
+    if not records:
+        print(f"Warning: No models with {percentile_key} found in METR data. Skipping.")
+        return None
+
+    metr_df = pd.DataFrame(records).sort_values("release_date").reset_index(drop=True)
+    metr_df['release_date'] = pd.to_datetime(metr_df['release_date'])
+
+    # 2. Log-linear regression of the horizon over time (needs positive horizons)
+    metr_df = metr_df[metr_df['horizon_minutes'] > 0].copy()
+    if len(metr_df) < 2:
+        print(f"Warning: Not enough data points for regression for {percentile_key}. Skipping.")
+        return None
+
+    metr_df['days_since_start'] = (metr_df['release_date'] - metr_df['release_date'].min()).dt.days
+    slope, intercept, r_value, *_ = linregress(
+        metr_df['days_since_start'],
+        np.log(metr_df['horizon_minutes']),
+    )
+    doubling_time_days = np.log(2) / slope
+    print(f"METR all models {percentile_key} trend: R^2 = {r_value**2:.2f}, Doubling time = {doubling_time_days:.1f} days")
+
+    # 3. Extend the fitted trend over month-end dates up to 2035
+    start_date = metr_df['release_date'].min()
+    future_dates = pd.to_datetime(pd.date_range(start=start_date, end="2035-01-01", freq="ME"))
+    future_days = (future_dates - start_date).days.to_numpy()
+
+    projection_df = pd.DataFrame({
+        "date": future_dates,
+        "projected_coherence_minutes": np.exp(intercept + slope * future_days),
+    })
+
+    # 4. Percentage of tasks automated over time, per estimate bound
+    total_tasks = len(df)
+    if total_tasks == 0:
+        return None
+
+    estimate_columns = {
+        "lb": 'lb_estimate_in_minutes',
+        "mid": 'estimate_midpoint',
+        "ub": 'ub_estimate_in_minutes',
+    }
+    for bound, estimate_column in estimate_columns.items():
+        projection_df[f"pct_automatable_{bound}"] = projection_df["projected_coherence_minutes"].apply(
+            lambda horizon: pct_within(horizon, estimate_column)
+        )
+
+    metr_df["pct_automatable_mid"] = metr_df["horizon_minutes"].apply(
+        lambda horizon: pct_within(horizon, 'estimate_midpoint')
+    )
+
+    return metr_df, projection_df
+
+
+def _plot_projection(ax, projection_df, metr_df, label, color, line_style='-'):
+    """
+    Draws one projection on `ax`: the mid-point automation curve, the shaded
+    lower/upper bound band and the METR data points.
+
+    `label` is expected to look like "P50"; everything after its first
+    character is used as the success rate in the scatter legend entry.
+    """
+    dates = projection_df["date"]
+    mid_curve = projection_df["pct_automatable_mid"]
+    band_low = projection_df["pct_automatable_lb"]
+    band_high = projection_df["pct_automatable_ub"]
+
+    # Projected automation percentage
+    ax.plot(dates, mid_curve, label="Mid-point", color=color, linewidth=2.5, linestyle=line_style, zorder=3)
+
+    # Shaded range between the lower- and upper-bound projections
+    ax.fill_between(dates, band_low, band_high, color=color, alpha=0.15, label="Lower/upper bound range", zorder=2)
+
+    # Actual METR data points
+    success_rate = label[1:]
+    ax.scatter(
+        metr_df['release_date'],
+        metr_df['pct_automatable_mid'],
+        color=color,
+        edgecolor='black',
+        s=60,
+        zorder=4,
+        label=f"Model with {success_rate}% success rate",
+    )
+
+
+def generate_projected_task_automation_plot(
+    output_dir: Path,
+    metr_results: Dict,
+    df: pd.DataFrame,
+    **kwargs,
+) -> Generator[Path, None, None]:
+    """
+    Generates one task-automation projection plot per METR percentile
+    (p50, then p80) and yields the path of every PNG written.
+
+    A percentile whose projection data could not be built is skipped, so the
+    generator may yield zero, one or two paths.
+    """
+    def render(projection, label, color, title, y_label, file_name):
+        """Draws a single projection figure, saves it and returns its path."""
+        metr_points, projected = projection
+        fig, ax = plt.subplots(figsize=(12, 8))
+        _plot_projection(ax, projected, metr_points, label, color)
+        ax.set_title(title, fontsize=16, pad=20)
+        ax.set_xlabel("Year")
+        ax.set_ylabel(y_label)
+        ax.set_ylim(0, 100.5)
+        ax.set_xlim(datetime(2022, 1, 1), projected["date"].max())
+        ax.grid(True, which="both", linestyle="--", linewidth=0.5)
+        ax.legend(loc="upper left")
+        plt.tight_layout()
+        target = output_dir / file_name
+        plt.savefig(target)
+        plt.close(fig)
+        return target
+
+    style_plot()
+
+    # Both projections are computed up front, before any figure is drawn
+    p50_data = _generate_projection_data(metr_results, df, 'p50_horizon_length')
+    p80_data = _generate_projection_data(metr_results, df, 'p80_horizon_length')
+
+    # Plot P50 alone
+    if p50_data:
+        yield render(
+            p50_data,
+            "P50",
+            LIME['600'],
+            "How long before sequential coherence stops being a bottleneck?",
+            "% of task automatable (50% success rate)",
+            "projected_task_automation_p50.png",
+        )
+
+    # Plot P80 alone
+    if p80_data:
+        yield render(
+            p80_data,
+            "P80",
+            'tab:cyan',
+            "Projected Task Automation (P80 AI Coherence)",
+            "% of Estimable Economic Tasks Automatable",
+            "projected_task_automation_p80.png",
+        )
--- a/pipeline/generators/sequential_coherence_cdf.py
+++ b/pipeline/generators/sequential_coherence_cdf.py
@ -0,0 +1,54 @@
+from pathlib import Path
+import pandas as pd
+import matplotlib.pyplot as plt
+import matplotlib.ticker as mtick
+from ..utils import LIME, style_plot
+
+def plot_sequential_coherence_cdf(output_dir: Path, df: pd.DataFrame, **kwargs):
+    """
+    Plots the empirical CDF of the lower-bound, upper-bound and mid-point task
+    estimates on a log-scaled x-axis and saves the figure as a PNG.
+
+    Args:
+        output_dir: Directory that receives "sequential_coherence_cdf.png".
+        df: Frame providing the 'lb_estimate_in_minutes',
+            'ub_estimate_in_minutes' and 'estimate_midpoint' columns.
+        **kwargs: Accepted and ignored.
+
+    Yields:
+        The path of the written image.
+    """
+    style_plot()
+    destination = output_dir / "sequential_coherence_cdf.png"
+
+    def empirical_cdf(values):
+        """Returns the sorted values and the cumulative share (in %) reached at each one."""
+        ordered = values.sort_values().reset_index(drop=True)
+        cumulative_pct = ((ordered.index + 1) / len(ordered)) * 100
+        return ordered.values, cumulative_pct
+
+    # (column, step-plot styling) for each curve, in drawing order.
+    curve_specs = [
+        ('lb_estimate_in_minutes', dict(color=LIME['300'], linewidth=1.8, linestyle='--', zorder=2, label='Lower bound estimate')),
+        ('ub_estimate_in_minutes', dict(color=LIME['900'], linewidth=1.8, linestyle=':', zorder=3, label='Upper bound estimate')),
+        ('estimate_midpoint', dict(color=LIME['600'], linewidth=2.2, zorder=4, label='Mid-point')),
+    ]
+    # All three CDFs are computed before the figure is opened.
+    curves = [(empirical_cdf(df[column]), styling) for column, styling in curve_specs]
+
+    fig, ax = plt.subplots(figsize=(12, 7))
+    for (xs, ys), styling in curves:
+        ax.step(xs, ys, where='post', **styling)
+
+    # --- Axes ---
+    ax.set_xscale('log')
+    ax.set_ylim(0, 100)
+    ax.yaxis.set_major_formatter(mtick.PercentFormatter(decimals=0))
+
+    ax.set_title("% of Tasks With Sequential Coherence ≤ X")
+    ax.set_xlabel("Sequential Coherence (X)")
+    ax.set_ylabel("Cumulative Percentage of Tasks")
+
+    # Human-readable tick marks: (position in minutes, label).
+    tick_marks = [
+        (1, '1 min'), (5, '5 min'), (10, '10 min'), (30, '30 min'),
+        (60, '1 hr'), (120, '2 hr'), (240, '4 hr'), (480, '8 hr'),
+        (1440, '1 day'), (2880, '2 days'), (10080, '1 wk'),
+        (43200, '30 days'), (129600, '90 days'), (259200, '180 days'),
+        (525600, '1 yr'),
+    ]
+    ax.set_xticks([minutes for minutes, _ in tick_marks])
+    ax.set_xticklabels([text for _, text in tick_marks], rotation=45, ha='right')
+
+    ax.legend(loc='lower right')
+
+    # --- Save and close ---
+    plt.tight_layout()
+    plt.savefig(destination, bbox_inches='tight')
+    plt.close(fig)
+
+    yield destination
--- a/pipeline/metadata.py
+++ b/pipeline/metadata.py
@ -1,41 +0,0 @@
-"""
-This module defines the Metadata model for the pipeline.
-"""
-
-from datetime import datetime
-from pydantic import BaseModel, Field
-from typing import Dict, Any
-
-class Metadata(BaseModel):
-    """
-    A Pydantic model for storing pipeline metadata.
-
-    This class is intended to be instantiated once and passed through the
-    pipeline. Each step in the pipeline can then add its own metadata.
-    This provides a centralized and structured way to track data provenance,
-    versions, and other important information.
-    """
-    fetchers: Dict[str, Dict[str, Any]] = Field(default_factory=dict)
-    enrichments: Dict[str, Dict[str, Any]] = Field(default_factory=dict)
-
-    ts: str = Field(default_factory=lambda: datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
-    commit: str = Field(default_factory=lambda: _get_current_commit())
-
-
-def _get_current_commit() -> str:
-    """
-    Returns the current git commit hash, "unknown", or "errored" depending on why the commit could not be retrieved.
-    """
-    import subprocess
-    try:
-        # Get the current commit hash
-        commit_hash = subprocess.check_output(
-            ["git", "rev-parse", "HEAD"], stderr=subprocess.PIPE, text=True
-        ).strip()
-        return commit_hash
-    except subprocess.CalledProcessError:
-        # If git command fails (e.g., not a git repository)
-        return "errored"
-    except FileNotFoundError:
-        # If git is not installed
-        return "unknown"
--- a/pipeline/postprocessors.py
+++ b/pipeline/postprocessors.py
@ -1,140 +0,0 @@
-from .run import Run
-from .logger import logger
-import pandas as pd
-import numpy as np
-
-
-def check_for_insanity(run: Run) -> Run:
-    raise NotImplementedError
-
-
-def create_df_tasks(run: Run) -> Run:
-    """
-    Creates a dataframe of tasks from the O*NET database, and merges it with remote status data.
-    This replicates the logic from old/enrich_task_ratings.py and parts of old/analysis.py
-
-    The resulting dataframe, `run.df_tasks` will be used by the enrichment steps.
-    """
-    logger.info("Creating tasks dataframe")
-    cache_path = run.cache_dir / f"onet_{run.onet_version}_tasks_with_remote_status.parquet"
-    if cache_path.exists():
-        logger.info(f"Loading cached tasks dataframe from {cache_path}")
-        run.df_tasks = pd.read_parquet(cache_path)
-        return run
-
-    query = """
-    SELECT
-        tr.onetsoc_code,
-        tr.task_id,
-        ts.task,
-        od.title AS occupation_title,
-        od.description AS occupation_description,
-        tr.scale_id,
-        tr.category,
-        tr.data_value,
-        dr.dwa_title
-    FROM
-        task_ratings tr
-    JOIN
-        task_statements ts ON tr.task_id = ts.task_id
-    JOIN
-        occupation_data od ON tr.onetsoc_code = od.onetsoc_code
-    LEFT JOIN
-        tasks_to_dwas td ON tr.onetsoc_code = td.onetsoc_code AND tr.task_id = td.task_id
-    LEFT JOIN
-        dwa_reference dr ON td.dwa_id = dr.dwa_id;
-    """
-    df = pd.read_sql_query(query, run.onet_conn)
-    logger.info(f"Fetched {len(df)} records (including DWA info) from the database.")
-
-    # Separate ratings from DWAs
-    core_cols = [
-        "onetsoc_code", "task_id", "task", "occupation_title",
-        "occupation_description", "scale_id", "category", "data_value"
-    ]
-    ratings_df = df[core_cols].drop_duplicates().reset_index(drop=True)
-
-    dwa_cols = ["onetsoc_code", "task_id", "dwa_title"]
-    dwas_df = df[dwa_cols].dropna(subset=["dwa_title"]).drop_duplicates().reset_index(drop=True)
-
-    # 1. Handle Frequency (FT)
-    logger.info("Processing Frequency data")
-    freq_df = ratings_df[ratings_df["scale_id"] == "FT"].copy()
-    if not freq_df.empty:
-        freq_pivot = freq_df.pivot_table(
-            index=["onetsoc_code", "task_id"],
-            columns="category",
-            values="data_value",
-            fill_value=0,
-        )
-        freq_pivot.columns = [f"frequency_category_{int(col)}" for col in freq_pivot.columns]
-    else:
-        idx = pd.MultiIndex(levels=[[], []], codes=[[], []], names=["onetsoc_code", "task_id"])
-        freq_pivot = pd.DataFrame(index=idx)
-
-    # 2. Handle Importance (IM, IJ)
-    logger.info("Processing Importance data")
-    imp_df = ratings_df[ratings_df["scale_id"].isin(["IM", "IJ"])].copy()
-    if not imp_df.empty:
-        imp_avg = imp_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index()
-        imp_avg.rename(columns={"data_value": "importance_average"}, inplace=True)
-    else:
-        imp_avg = pd.DataFrame(columns=["onetsoc_code", "task_id", "importance_average"])
-
-    # 3. Handle Relevance (RT)
-    logger.info("Processing Relevance data")
-    rel_df = ratings_df[ratings_df["scale_id"] == "RT"].copy()
-    if not rel_df.empty:
-        rel_avg = rel_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index()
-        rel_avg.rename(columns={"data_value": "relevance_average"}, inplace=True)
-    else:
-        rel_avg = pd.DataFrame(columns=["onetsoc_code", "task_id", "relevance_average"])
-
-    # 4. Process DWAs
-    logger.info("Processing DWA data")
-    if not dwas_df.empty:
-        dwas_grouped = dwas_df.groupby(["onetsoc_code", "task_id"])["dwa_title"].apply(list).reset_index()
-        dwas_grouped.rename(columns={"dwa_title": "dwas"}, inplace=True)
-    else:
-        dwas_grouped = None
-
-    # 5. Get Base Task/Occupation Info
-    logger.info("Extracting base task/occupation info")
-    base_cols = ["onetsoc_code", "task_id", "task", "occupation_title", "occupation_description"]
-    base_info = ratings_df[base_cols].drop_duplicates().set_index(["onetsoc_code", "task_id"])
-
-    # 6. Merge Processed ONET Data
-    logger.info("Merging processed ONET data")
-    final_df = base_info.merge(freq_pivot, left_index=True, right_index=True, how="left")
-    final_df = final_df.reset_index()
-
-    if not imp_avg.empty:
-        final_df = final_df.merge(imp_avg, on=["onetsoc_code", "task_id"], how="left")
-    else:
-        final_df["importance_average"] = np.nan
-
-    if not rel_avg.empty:
-        final_df = final_df.merge(rel_avg, on=["onetsoc_code", "task_id"], how="left")
-    else:
-        final_df["relevance_average"] = np.nan
-
-    if dwas_grouped is not None and not dwas_grouped.empty:
-        final_df = final_df.merge(dwas_grouped, on=["onetsoc_code", "task_id"], how="left")
-        if "dwas" in final_df.columns:
-            final_df["dwas"] = final_df["dwas"].apply(lambda x: x if isinstance(x, list) else [])
-    else:
-        final_df["dwas"] = [[] for _ in range(len(final_df))]
-
-    final_df = final_df.replace({np.nan: None})
-
-    # 7. Merge with EPOCH remote data
-    logger.info("Merging with EPOCH remote data")
-    final_df = pd.merge(final_df, run.epoch_df[['Task', 'Remote']], left_on='task', right_on='Task', how='left')
-    final_df = final_df.drop('Task', axis=1).rename(columns={'Remote': 'remote_status'})
-
-
-    logger.info(f"Created tasks dataframe with shape {final_df.shape}")
-    final_df.to_parquet(cache_path)
-
-    run.df_tasks = final_df
-    return run
--- a/pipeline/run.py
+++ b/pipeline/run.py
@ -1,27 +0,0 @@
-from pydantic import BaseModel, Field
-import sqlite3
-import pandas as pd
-from pathlib import Path
-from typing import Optional
-from .metadata import Metadata
-
-class Run(BaseModel):
-    model_config = {"arbitrary_types_allowed": True}
-    # === FETCHERS ===
-    onet_conn: Optional[sqlite3.Connection] = None
-    onet_version: Optional[str] = None
-
-    oesm_df: Optional[pd.DataFrame] = None
-    oesm_version: Optional[str] = None
-
-    epoch_df: Optional[pd.DataFrame] = None
-    epoch_version: Optional[str] = None
-
-    # === ENRICHMENTS ===
-    task_estimateability_df: Optional[pd.DataFrame] = None
-    task_estimates_df: Optional[pd.DataFrame] = None
-
-    meta: Metadata = Field(default_factory=Metadata)
-
-    cache_dir: Path
-    output_dir: Path
--- a/pipeline/runner.py
+++ b/pipeline/runner.py
@ -1,74 +1,215 @@
+import sqlite3
+import os
+from .logger import logger
+import pandas as pd
 from dotenv import load_dotenv
-from .fetchers import fetch_oesm_data, fetch_epoch_remote_data, fetch_onet_database
-from .enrichments import enrich_with_task_estimateability, enrich_with_task_estimates
-from .postprocessors import check_for_insanity, create_df_tasks
+from .fetchers import fetch_onet_database, fetch_oesm_data, fetch_epoch_remote_data, ONET_VERSION, fetch_metr_data
+from .classification import classify_tasks_as_estimable, generate_time_estimates_for_tasks
 from .generators import GENERATORS
-from .run import Run
-from .constants import GRAY
+from .aggregate import create_task_summary_by_occupation_df, aggregate_task_summary_by_major_code
+from .utils import convert_to_minutes
 import argparse
 import platformdirs
-import seaborn as sns
-import matplotlib as mpl
+import numpy as np
 from pathlib import Path
-from typing import Optional
-
-CACHE_DIR = platformdirs.user_cache_dir("econtai")
-
-def run(output_dir: Path | Optional[str] = None):
-    load_dotenv()
-    _setup_graph_rendering()
-
-    if output_dir is None:
-        output_dir = Path("dist/")
-    elif isinstance(output_dir, str):
-        output_dir = Path(output_dir).resolve()
-
-    output_dir.mkdir(parents=True, exist_ok=True)
-
-    current_run = Run(output_dir=output_dir, cache_dir=Path(CACHE_DIR).resolve())
-    current_run.cache_dir.mkdir(parents=True, exist_ok=True)
-
-    # Fetchers (fetchers.py)
-    current_run.onet_conn, current_run.onet_version = fetch_onet_database(current_run)
-    current_run.oesm_df, current_run.oesm_version = fetch_oesm_data(current_run)
-    current_run.epoch_df, current_run.epoch_version = fetch_epoch_remote_data(current_run)
-
-    current_run = create_df_tasks(current_run)
-
-    # Enrichments (enrichments.py)
-    current_run.task_estimateability_df = enrich_with_task_estimateability(current_run)
-    current_run.task_estimates_df = enrich_with_task_estimates(current_run)
-
-    # Postprocessors (postprocessors.py)
-    check_for_insanity(current_run)
-
-    # Generators (generators/)
-    for gen in GENERATORS:
-        gen(current_run)


-def _setup_graph_rendering():
-    mpl.rcParams.update({
-        'figure.facecolor' : GRAY['50'],
-        'axes.facecolor'   : GRAY['50'],
-        'axes.edgecolor'   : GRAY['100'],
-        'axes.labelcolor'  : GRAY['700'],
-        'xtick.color'      : GRAY['700'],
-        'ytick.color'      : GRAY['700'],
-        'font.family'      : 'Inter',
-        'font.size'        : 11,
-    })
+class Runner:
+    """Holds the configuration and the fetched inputs of one pipeline run; `run()` executes it."""
+    # The four attributes below are assigned by `run()` from the fetchers.
+    onet_conn: sqlite3.Connection
+    oesm_df: pd.DataFrame
+    epoch_df: pd.DataFrame
+    metr_results: dict
 
+    def __init__(self,  output_dir: Path | str, debug: bool, bust_estimability: bool, bust_estimates: bool):
+        """
+        Prepares the output directories and stores the run options.
+
+        Args:
+            output_dir: Where generated assets are written; created if missing.
+                A `str` is converted to a resolved `Path`.
+            debug: When true, LITELLM_LOG defaults to "INFO" unless it is already
+                set, and `run()` only processes the first 10 candidate tasks.
+            bust_estimability: Forwarded as `bust=` to `classify_tasks_as_estimable`.
+            bust_estimates: Forwarded as `bust=` to `generate_time_estimates_for_tasks`.
+        """
+        if isinstance(output_dir, str):
+            output_dir = Path(output_dir).resolve()
 
-    sns.set_style("white")
+        output_dir.mkdir(parents=True, exist_ok=True)
 
+        self.output_dir = output_dir
+        # Parquet snapshots of the intermediate dataframes are written here by `run()`.
+        self.intermediate_dir = self.output_dir / "intermediate"
+        self.intermediate_dir.mkdir(parents=True, exist_ok=True)
+        # Not created here. NOTE(review): presumably the fetchers create it; confirm.
+        self.cache_dir = platformdirs.user_cache_path("econtai")
+        self.debug = debug
+        self.bust_estimability = bust_estimability
+        self.bust_estimates = bust_estimates
 
-def main():
-    parser = argparse.ArgumentParser(description="Run the econtai pipeline.")
-    parser.add_argument("--output-dir", type=str, help="The directory to write output files to.")
-    args = parser.parse_args()
-    run(output_dir=args.output_dir)
+        if debug:
+            # Keep a LITELLM_LOG value the user already exported.
+            os.environ["LITELLM_LOG"] = os.environ.get("LITELLM_LOG", "INFO")

+    def run(self):
+        """
+        Runs the whole pipeline: fetch the inputs, classify and estimate tasks,
+        write the intermediate parquet files, sanity-check the estimates and
+        render the assets of every generator.
+
+        Raises:
+            ValueError: Raised by `_create_df_tasks` or `_check_for_insanity`.
+        """
+        load_dotenv()
+
+        self.onet_conn = fetch_onet_database(self.cache_dir)
+        self.oesm_df = fetch_oesm_data(self.cache_dir)
+        self.epoch_df = fetch_epoch_remote_data(self.cache_dir)
+        self.metr_results = fetch_metr_data(self.cache_dir)
+
+        self.df_tasks = self._create_df_tasks()
+        self.df_tasks['onetsoc_major'] = self.df_tasks['onetsoc_code'].str[:2]
+
+        # Only important, remote tasks are sent to the (expensive) classifiers.
+        df_to_process = self.df_tasks[
+            (self.df_tasks['importance_average'] > 3) &
+            (self.df_tasks['remote_status'] == 'remote')
+        ].copy()
+
+        if self.debug:
+            df_to_process = df_to_process.head(10)
+
+        task_estimability_df = classify_tasks_as_estimable(self.cache_dir, df_to_process, bust=self.bust_estimability)
+        self.df_tasks = pd.merge(self.df_tasks, task_estimability_df, on='task', how='left')
+        self.df_tasks['estimable'] = self.df_tasks['estimable'].fillna(False)
+        self.df_tasks.to_parquet(self.intermediate_dir / "df_tasks.parquet")
+        df_to_process = pd.merge(df_to_process, task_estimability_df, on='task', how='left')
+        # Fill this frame's own column. Assigning self.df_tasks['estimable'] here
+        # would align on the index labels of a different (larger) frame and copy
+        # flags from unrelated rows.
+        df_to_process['estimable'] = df_to_process['estimable'].fillna(False)
+
+        df_to_process = df_to_process[df_to_process['estimable']].copy()
+
+        task_estimates_df = generate_time_estimates_for_tasks(self.cache_dir, df_to_process, bust=self.bust_estimates)
+        df = pd.merge(df_to_process, task_estimates_df, on=['onetsoc_code', 'task_id'], how='left')
+        df['lb_estimate_in_minutes'] = df.apply(lambda row: convert_to_minutes(row['lb_estimate_qty'], row['lb_estimate_unit']), axis=1)
+        df['ub_estimate_in_minutes'] = df.apply(lambda row: convert_to_minutes(row['ub_estimate_qty'], row['ub_estimate_unit']), axis=1)
+        df['estimate_range'] = df.ub_estimate_in_minutes - df.lb_estimate_in_minutes
+        df['estimate_ratio'] = np.divide(df.ub_estimate_in_minutes, df.lb_estimate_in_minutes).replace([np.inf, -np.inf], None)
+        df['estimate_midpoint'] = (df.lb_estimate_in_minutes + df.ub_estimate_in_minutes) / 2
+
+        df.to_parquet(self.intermediate_dir / "estimable_tasks_with_estimates.parquet")
+
+        self.task_summary_by_occupation_df = create_task_summary_by_occupation_df(self.df_tasks, self.oesm_df)
+        self.task_summary_by_occupation_df.to_parquet(self.intermediate_dir / "task_summary_by_occupation.parquet")
+        self.task_summary_by_major_occupation_df = aggregate_task_summary_by_major_code(self.task_summary_by_occupation_df)
+        self.task_summary_by_major_occupation_df.to_parquet(self.intermediate_dir / "task_summary_by_major_occupation.parquet")
+
+        self._check_for_insanity(df)
+
+        # Every generator receives the same keyword arguments and yields the
+        # paths of the assets it wrote.
+        for gen in GENERATORS:
+            for asset in gen(**{
+                "output_dir": self.output_dir,
+                "runner": self,
+                "df": df,
+                "task_summary_by_occupation_df": self.task_summary_by_occupation_df,
+                "task_summary_by_major_occupation_df": self.task_summary_by_major_occupation_df,
+                "df_tasks": self.df_tasks,
+                "oesm_df": self.oesm_df,
+                "metr_results": self.metr_results,
+            }):
+                logger.info(f"New asset: {asset}")
+
+    def _create_df_tasks(self) -> pd.DataFrame:
+        """
+        Builds, or loads from the cache, one row per (occupation, task) carrying
+        the task's frequency, importance, relevance and remote-status columns.
+
+        The result is cached as a parquet file in `self.cache_dir`, keyed by
+        ONET_VERSION; when that file exists it is returned without querying.
+
+        Raises:
+            ValueError: If the database holds no frequency, importance or
+                relevance ratings.
+        """
+        cache_path = self.cache_dir / f"onet_{ONET_VERSION}_tasks_with_remote_status.parquet"
+        if cache_path.exists():
+            logger.info(f"Loading cached tasks dataframe from {cache_path}")
+            return pd.read_parquet(cache_path)
+
+        logger.info("Creating tasks dataframe")
+        query = """
+        SELECT
+        tr.onetsoc_code,
+        tr.task_id,
+        ts.task,
+        od.title AS occupation_title,
+        od.description AS occupation_description,
+        tr.scale_id,
+        tr.category,
+        tr.data_value
+        FROM
+        task_ratings tr
+        JOIN
+        task_statements ts ON tr.task_id = ts.task_id
+        JOIN
+        occupation_data od ON tr.onetsoc_code = od.onetsoc_code;
+        """
+        ratings_df = pd.read_sql_query(query, self.onet_conn)
+        logger.info(f"Fetched {len(ratings_df)} task rating records from the database.")
+
+        task_key = ["onetsoc_code", "task_id"]
+
+        # 1. Frequency (FT): one column per frequency category.
+        logger.info("Processing Frequency data")
+        freq_df = ratings_df[ratings_df["scale_id"] == "FT"].copy()
+        if freq_df.empty:
+            raise ValueError("No frequency data.")
+        freq_pivot = freq_df.pivot_table(
+            index=task_key,
+            columns="category",
+            values="data_value",
+            fill_value=0,
+        )
+        freq_pivot.columns = [f"frequency_category_{int(col)}" for col in freq_pivot.columns]
+
+        # 2. Importance (IM, IJ): averaged per task.
+        logger.info("Processing Importance data")
+        imp_df = ratings_df[ratings_df["scale_id"].isin(["IM", "IJ"])].copy()
+        if imp_df.empty:
+            raise ValueError("No importance data.")
+        imp_avg = imp_df.groupby(task_key)["data_value"].mean().reset_index()
+        imp_avg.rename(columns={"data_value": "importance_average"}, inplace=True)
+
+        # 3. Relevance (RT): averaged per task.
+        logger.info("Processing Relevance data")
+        rel_df = ratings_df[ratings_df["scale_id"] == "RT"].copy()
+        if rel_df.empty:
+            raise ValueError("No relevance data.")
+        rel_avg = rel_df.groupby(task_key)["data_value"].mean().reset_index()
+        rel_avg.rename(columns={"data_value": "relevance_average"}, inplace=True)
+
+        # 4. Base task/occupation info: one row per (occupation, task).
+        logger.info("Extracting base task/occupation info")
+        base_cols = ["onetsoc_code", "task_id", "task", "occupation_title", "occupation_description"]
+        base_info = ratings_df[base_cols].drop_duplicates().set_index(task_key)
+
+        # 5. Merge the processed ONET data. Empty rating sets raised above, so
+        # the averages are merged unconditionally; tasks without a rating get
+        # a missing value from the left join.
+        logger.info("Merging processed ONET data")
+        final_df = base_info.merge(freq_pivot, left_index=True, right_index=True, how="left")
+        final_df = final_df.reset_index()
+        final_df = final_df.merge(imp_avg, on=task_key, how="left")
+        final_df = final_df.merge(rel_avg, on=task_key, how="left")
+
+        # NOTE(review): this may leave numeric columns as object dtype on a cold
+        # cache, whereas the cached parquet is re-read with inferred dtypes;
+        # confirm callers cope with both.
+        final_df = final_df.replace({np.nan: None})
+
+        # 6. Merge with EPOCH remote data (matched on the task text).
+        logger.info("Merging with EPOCH remote data")
+        final_df = pd.merge(final_df, self.epoch_df[['Task', 'Remote']], left_on='task', right_on='Task', how='left')
+        final_df = final_df.drop('Task', axis=1).rename(columns={'Remote': 'remote_status'})
+
+        logger.info(f"Created tasks dataframe with shape {final_df.shape}")
+        final_df.to_parquet(cache_path)
+
+        return final_df
+
+    def _check_for_insanity(self, df: pd.DataFrame):
+        """
+        Validates the minute estimates in `df`.
+
+        Raises:
+            ValueError: If a lower or upper bound is missing, if a bound is
+                not strictly positive, or if a lower bound exceeds its upper
+                bound.
+        """
+        bound_columns = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes']
+
+        # The lower bound is checked first, then the upper bound.
+        for column in bound_columns:
+            missing_count = df[column].isnull().sum()
+            if missing_count:
+                raise ValueError(f"Found {missing_count} atomic tasks with missing '{column}'.")
+
+        complete = df.dropna(subset=bound_columns)
+        lower = complete['lb_estimate_in_minutes']
+        upper = complete['ub_estimate_in_minutes']
+        offending = complete[(lower <= 0) | (upper <= 0) | (lower > upper)]
+        if len(offending) > 0:
+            raise ValueError(f"Found {len(offending)} rows with impossible bounds (e.g., lb > ub or value <= 0).")

 if __name__ == "__main__":
-    main()
+    # Command-line entry point: parse the flags below, then run the pipeline once.
+    parser = argparse.ArgumentParser(description="Run the econtai pipeline.")
+    parser.add_argument("--output-dir", type=str, default="dist/", help="The directory to write output files to.")
+    parser.add_argument("--bust-estimability", action="store_true", help="Bust the saved task estimability classification (EXPENSIVE)")
+    parser.add_argument("--bust-estimates", action="store_true", help="Bust the tasks estimates (EXPENSIVE)")
+    parser.add_argument("--debug", action="store_true", help="Enable debug mode (e.g., process fewer tasks).")
+
+    args = parser.parse_args()
+    Runner(output_dir=args.output_dir, debug=args.debug, bust_estimability=args.bust_estimability, bust_estimates=args.bust_estimates).run()
--- a/pipeline/utils.py
+++ b/pipeline/utils.py
@ -0,0 +1,222 @@
+import subprocess
+import matplotlib.colors as mcolors
+import matplotlib as mpl
+import seaborn as sns
+import tempfile
+import litellm
+import time
+import math
+from tqdm import tqdm
+from typing import Any, List, Dict
+from .logger import logger
+
+# Display names of the occupation major groups, keyed by their two-digit code.
+# NOTE(review): presumably the first two characters of an O*NET-SOC code; confirm.
+OCCUPATION_MAJOR_CODES = {
+    '11': 'Management',
+    '13': 'Business & Financial',
+    '15': 'Computer & Mathematical',
+    '17': 'Architecture & Engineering',
+    '19': 'Life, Physical, & Social Science',
+    '21': 'Community & Social Service',
+    '23': 'Legal',
+    '25': 'Education, Training, & Library',
+    '27': 'Arts, Design, & Media',
+    '29': 'Healthcare Practitioners',
+    '31': 'Healthcare Support',
+    '33': 'Protective Service',
+    '35': 'Food Preparation & Serving',
+    '37': 'Building & Grounds Maintenance',
+    '39': 'Personal Care & Service',
+    '41': 'Sales & Related',
+    '43': 'Office & Admin Support',
+    '45': 'Farming, Fishing, & Forestry',
+    '47': 'Construction & Extraction',
+    '49': 'Installation, Maintenance, & Repair',
+    '51': 'Production',
+    '53': 'Transportation & Material Moving',
+    '55': 'Military Specific',
+}
+
+# Gray palette: hex colours keyed by shade, from '50' (lightest) to '950' (darkest).
+GRAY   = {'50':'#f8fafc','100':'#f1f5f9','200':'#e2e8f0',
+    '300':'#cbd5e1','400':'#94a3b8','500':'#64748b',
+    '600':'#475569','700':'#334155','800':'#1e293b',
+    '900':'#0f172a','950':'#020617'}
+
+# Lime palette: hex colours keyed by shade, from '50' (lightest) to '950' (darkest).
+LIME            = {'50': '#f7fee7','100': '#ecfcca','200': '#d8f999',
+    '300': '#bbf451','400': '#9ae600','500': '#83cd00',
+    '600': '#64a400','700': '#497d00','800': '#3c6300',
+    '900': '#35530e','950': '#192e03'}
+
+
+def convert_to_minutes(qty, unit):
+    """
+    Returns `qty` expressed in minutes.
+
+    `unit` must be one of "minute", "hour", "day", "week", "month" (30 days),
+    "trimester" (90 days), "semester" (180 days) or "year" (365 days); any
+    other value raises KeyError.
+    """
+    minutes_per_unit = {
+        "minute": 1,
+        "hour": 60,
+        "day": 60 * 24,
+        "week": 60 * 24 * 7,
+        "month": 60 * 24 * 30,
+        "trimester": 60 * 24 * 90,
+        "semester": 60 * 24 * 180,
+        "year": 60 * 24 * 365,
+    }
+    factor = minutes_per_unit[unit]
+    return qty * factor
+
+
+def pretty_display(df):
+    """
+    Prints `df` to standard output.
+
+    The statements that used to follow the unconditional `return` (rendering
+    the frame to a temporary HTML file and opening it with a hard-coded,
+    machine-specific browser path) could never run and have been removed.
+    """
+    print(df)
+
+
+def enrich(
+    model: str,
+    rpm: int, # Requests per minute
+    messages_to_process: List[List[Dict[str, str]]],
+    schema: Dict[str, Any],
+    chunk_size: int = 100,
+):
+    """
+    Sends every conversation to `model` through `litellm.batch_completion`,
+    one chunk at a time, pacing the chunks to stay under `rpm`.
+
+    Args:
+        model: Model name handed to litellm.
+        rpm: Requests-per-minute budget; a value <= 0 disables the pacing.
+        messages_to_process: One message list per request.
+        schema: JSON schema passed as the `json_schema` response format.
+        chunk_size: Maximum number of requests sent per batch call.
+
+    Returns:
+        A list with one entry per processed request: the response, or None
+        when that request (or the whole batch call) failed.
+    """
+    results = []
+    total = len(messages_to_process)
+    if total == 0:
+        return results
+
+    chunk_count = math.ceil(total / chunk_size)
+    logger.info(f"Starting enrichment for {total} messages, in {chunk_count} chunks of up to {chunk_size} each.")
+
+    # Time budget of a single request under the RPM limit (0 means "no limit").
+    seconds_per_request = 60.0 / rpm if rpm > 0 else 0
+
+    for chunk_index in tqdm(range(chunk_count), desc="Enriching data in chunks"):
+        started_at = time.time()
+
+        offset = chunk_index * chunk_size
+        batch = messages_to_process[offset:offset + chunk_size]
+        if not batch:
+            continue
+
+        try:
+            responses = litellm.batch_completion(
+                model=model,
+                messages=batch,
+                response_format={
+                    "type": "json_schema",
+                    "json_schema": schema,
+                },
+            )
+
+            # Each item is either a response or the exception raised for that
+            # message; callers expect None in place of a failure.
+            for outcome in responses:
+                failed = isinstance(outcome, Exception)
+                if failed:
+                    logger.error(f"API call within batch failed: {outcome}")
+                results.append(None if failed else outcome)
+
+        except Exception as e:
+            # Failure of the batch call itself (e.g., auth): mark the whole chunk as failed.
+            logger.error(f"litellm.batch_completion call failed for chunk {chunk_index + 1}/{chunk_count}: {e}")
+            results.extend([None] * len(batch))
+
+        elapsed = time.time() - started_at
+
+        # Sleep for whatever is left of the time this chunk is allowed to take.
+        if seconds_per_request > 0:
+            budget = len(batch) * seconds_per_request
+            if elapsed < budget:
+                pause = budget - elapsed
+                logger.debug(f"Chunk processed in {elapsed:.2f}s. Sleeping for {pause:.2f}s to respect RPM.")
+                time.sleep(pause)
+
+    return results
+
+def get_contrasting_text_color(bg_color_hex_or_rgba):
+    """
+    Returns 'black' or 'white', whichever suits text drawn on the given background.
+
+    The background is either a colour string (converted with
+    `mcolors.to_rgba`) or an (r, g, b, a) sequence; 'black' is chosen when the
+    weighted luminance of the RGB channels exceeds 0.55.
+    """
+    if isinstance(bg_color_hex_or_rgba, str):
+        red, green, blue, _alpha = mcolors.to_rgba(bg_color_hex_or_rgba)
+    else:
+        red, green, blue, _alpha = bg_color_hex_or_rgba
+    luminance = 0.2126 * red + 0.7152 * green + 0.0722 * blue
+    if luminance > 0.55:
+        return 'black'
+    return 'white'
+
+
+def style_plot():
+    """
+    Applies a consistent and professional style to all plots.
+    This function sets matplotlib's rcParams for a global effect, then sets
+    the seaborn palette and style. It takes no arguments and returns nothing.
+    """
+    mpl.rcParams.update({
+        'figure.facecolor': GRAY['50'],
+        'figure.edgecolor': 'none',
+        'figure.figsize': (12, 8),
+        'figure.dpi': 150,
+
+        'axes.facecolor': GRAY['50'],
+        'axes.edgecolor': GRAY['300'],
+        'axes.grid': True,
+        'axes.labelcolor': GRAY['800'],
+        'axes.titlecolor': GRAY['900'],
+        'axes.titlesize': 18,
+        'axes.titleweight': 'bold',
+        'axes.titlepad': 20,
+        'axes.labelsize': 14,
+        'axes.labelweight': 'semibold',
+        'axes.labelpad': 10,
+        'axes.spines.top': False,
+        'axes.spines.right': False,
+        'axes.spines.left': True,
+        'axes.spines.bottom': True,
+
+        'text.color': GRAY['700'],
+
+        'xtick.color': GRAY['600'],
+        'ytick.color': GRAY['600'],
+        'xtick.labelsize': 12,
+        'ytick.labelsize': 12,
+        'xtick.major.size': 0,
+        'ytick.major.size': 0,
+        'xtick.minor.size': 0,
+        'ytick.minor.size': 0,
+        'xtick.major.pad': 8,
+        'ytick.major.pad': 8,
+
+        'grid.color': GRAY['200'],
+        'grid.linestyle': '--',
+        'grid.linewidth': 1,
+
+        'legend.frameon': False,
+        'legend.fontsize': 12,
+        'legend.title_fontsize': 14,
+        'legend.facecolor': 'inherit',
+
+        'font.family': 'sans-serif',
+        'font.sans-serif': ['Inter'],
+        'font.weight': 'normal',
+
+        'lines.linewidth': 2,
+        'lines.markersize': 6,
+    })
+
+    # Seaborn specific styles
+    # Use shades of LIME as the primary color palette.
+    # Sorting by integer value of keys, and reversed to have darker shades first.
+    # Both the lightest ('50', '100') and the darkest ('700' to '950') shades are
+    # excluded, which leaves '600', '500', '400', '300', '200' in that order.
+    lime_palette = [LIME[k] for k in sorted(LIME.keys(), key=int, reverse=True) if k not in ['50', '100', '700', '800', '900', '950',]]
+
+    sns.set_palette(lime_palette)
+    sns.set_style("whitegrid", {
+        'axes.edgecolor': GRAY['300'],
+        'grid.color': GRAY['200'],
+        'grid.linestyle': '--',
+    })
--- a/pyproject.toml
+++ b/pyproject.toml
@ -16,6 +16,7 @@ dependencies = [
    "python-dotenv>=1.1.1",
    "requests>=2.32.4",
    "rich>=14.0.0",
+    "scipy>=1.16.0",
    "seaborn>=0.13.2",
 ]

--- a/uv.lock
+++ b/uv.lock
@ -1120,6 +1120,35 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/75/04/5302cea1aa26d886d34cadbf2dc77d90d7737e576c0065f357b96dc7a1a6/rpds_py-0.26.0-cp314-cp314t-win_amd64.whl", hash = "sha256:f14440b9573a6f76b4ee4770c13f0b5921f71dde3b6fcb8dabbefd13b7fe05d7", size = 232821, upload_time = "2025-07-01T15:55:55.167Z" },
 ]

+[[package]]
+name = "scipy"
+version = "1.16.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "numpy" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/81/18/b06a83f0c5ee8cddbde5e3f3d0bb9b702abfa5136ef6d4620ff67df7eee5/scipy-1.16.0.tar.gz", hash = "sha256:b5ef54021e832869c8cfb03bc3bf20366cbcd426e02a58e8a58d7584dfbb8f62", size = 30581216, upload_time = "2025-06-22T16:27:55.782Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/46/95/0746417bc24be0c2a7b7563946d61f670a3b491b76adede420e9d173841f/scipy-1.16.0-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:e9f414cbe9ca289a73e0cc92e33a6a791469b6619c240aa32ee18abdce8ab451", size = 36418162, upload_time = "2025-06-22T16:19:56.3Z" },
+    { url = "https://files.pythonhosted.org/packages/19/5a/914355a74481b8e4bbccf67259bbde171348a3f160b67b4945fbc5f5c1e5/scipy-1.16.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:bbba55fb97ba3cdef9b1ee973f06b09d518c0c7c66a009c729c7d1592be1935e", size = 28465985, upload_time = "2025-06-22T16:20:01.238Z" },
+    { url = "https://files.pythonhosted.org/packages/58/46/63477fc1246063855969cbefdcee8c648ba4b17f67370bd542ba56368d0b/scipy-1.16.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:58e0d4354eacb6004e7aa1cd350e5514bd0270acaa8d5b36c0627bb3bb486974", size = 20737961, upload_time = "2025-06-22T16:20:05.913Z" },
+    { url = "https://files.pythonhosted.org/packages/93/86/0fbb5588b73555e40f9d3d6dde24ee6fac7d8e301a27f6f0cab9d8f66ff2/scipy-1.16.0-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:75b2094ec975c80efc273567436e16bb794660509c12c6a31eb5c195cbf4b6dc", size = 23377941, upload_time = "2025-06-22T16:20:10.668Z" },
+    { url = "https://files.pythonhosted.org/packages/ca/80/a561f2bf4c2da89fa631b3cbf31d120e21ea95db71fd9ec00cb0247c7a93/scipy-1.16.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6b65d232157a380fdd11a560e7e21cde34fdb69d65c09cb87f6cc024ee376351", size = 33196703, upload_time = "2025-06-22T16:20:16.097Z" },
+    { url = "https://files.pythonhosted.org/packages/11/6b/3443abcd0707d52e48eb315e33cc669a95e29fc102229919646f5a501171/scipy-1.16.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1d8747f7736accd39289943f7fe53a8333be7f15a82eea08e4afe47d79568c32", size = 35083410, upload_time = "2025-06-22T16:20:21.734Z" },
+    { url = "https://files.pythonhosted.org/packages/20/ab/eb0fc00e1e48961f1bd69b7ad7e7266896fe5bad4ead91b5fc6b3561bba4/scipy-1.16.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:eb9f147a1b8529bb7fec2a85cf4cf42bdfadf9e83535c309a11fdae598c88e8b", size = 35387829, upload_time = "2025-06-22T16:20:27.548Z" },
+    { url = "https://files.pythonhosted.org/packages/57/9e/d6fc64e41fad5d481c029ee5a49eefc17f0b8071d636a02ceee44d4a0de2/scipy-1.16.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d2b83c37edbfa837a8923d19c749c1935ad3d41cf196006a24ed44dba2ec4358", size = 37841356, upload_time = "2025-06-22T16:20:35.112Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/a7/4c94bbe91f12126b8bf6709b2471900577b7373a4fd1f431f28ba6f81115/scipy-1.16.0-cp313-cp313-win_amd64.whl", hash = "sha256:79a3c13d43c95aa80b87328a46031cf52508cf5f4df2767602c984ed1d3c6bbe", size = 38403710, upload_time = "2025-06-22T16:21:54.473Z" },
+    { url = "https://files.pythonhosted.org/packages/47/20/965da8497f6226e8fa90ad3447b82ed0e28d942532e92dd8b91b43f100d4/scipy-1.16.0-cp313-cp313t-macosx_10_14_x86_64.whl", hash = "sha256:f91b87e1689f0370690e8470916fe1b2308e5b2061317ff76977c8f836452a47", size = 36813833, upload_time = "2025-06-22T16:20:43.925Z" },
+    { url = "https://files.pythonhosted.org/packages/28/f4/197580c3dac2d234e948806e164601c2df6f0078ed9f5ad4a62685b7c331/scipy-1.16.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:88a6ca658fb94640079e7a50b2ad3b67e33ef0f40e70bdb7dc22017dae73ac08", size = 28974431, upload_time = "2025-06-22T16:20:51.302Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/fc/e18b8550048d9224426e76906694c60028dbdb65d28b1372b5503914b89d/scipy-1.16.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:ae902626972f1bd7e4e86f58fd72322d7f4ec7b0cfc17b15d4b7006efc385176", size = 21246454, upload_time = "2025-06-22T16:20:57.276Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/48/07b97d167e0d6a324bfd7484cd0c209cc27338b67e5deadae578cf48e809/scipy-1.16.0-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:8cb824c1fc75ef29893bc32b3ddd7b11cf9ab13c1127fe26413a05953b8c32ed", size = 23772979, upload_time = "2025-06-22T16:21:03.363Z" },
+    { url = "https://files.pythonhosted.org/packages/4c/4f/9efbd3f70baf9582edf271db3002b7882c875ddd37dc97f0f675ad68679f/scipy-1.16.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:de2db7250ff6514366a9709c2cba35cb6d08498e961cba20d7cff98a7ee88938", size = 33341972, upload_time = "2025-06-22T16:21:11.14Z" },
+    { url = "https://files.pythonhosted.org/packages/3f/dc/9e496a3c5dbe24e76ee24525155ab7f659c20180bab058ef2c5fa7d9119c/scipy-1.16.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e85800274edf4db8dd2e4e93034f92d1b05c9421220e7ded9988b16976f849c1", size = 35185476, upload_time = "2025-06-22T16:21:19.156Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/b3/21001cff985a122ba434c33f2c9d7d1dc3b669827e94f4fc4e1fe8b9dfd8/scipy-1.16.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4f720300a3024c237ace1cb11f9a84c38beb19616ba7c4cdcd771047a10a1706", size = 35570990, upload_time = "2025-06-22T16:21:27.797Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/d3/7ba42647d6709251cdf97043d0c107e0317e152fa2f76873b656b509ff55/scipy-1.16.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:aad603e9339ddb676409b104c48a027e9916ce0d2838830691f39552b38a352e", size = 37950262, upload_time = "2025-06-22T16:21:36.976Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/c4/231cac7a8385394ebbbb4f1ca662203e9d8c332825ab4f36ffc3ead09a42/scipy-1.16.0-cp313-cp313t-win_amd64.whl", hash = "sha256:f56296fefca67ba605fd74d12f7bd23636267731a72cb3947963e76b8c0a25db", size = 38515076, upload_time = "2025-06-22T16:21:45.694Z" },
+]
+
 [[package]]
 name = "seaborn"
 version = "0.13.2"
@ -1168,6 +1197,7 @@ dependencies = [
    { name = "python-dotenv" },
    { name = "requests" },
    { name = "rich" },
+    { name = "scipy" },
    { name = "seaborn" },
 ]

@ -1184,6 +1214,7 @@ requires-dist = [
    { name = "python-dotenv", specifier = ">=1.1.1" },
    { name = "requests", specifier = ">=2.32.4" },
    { name = "rich", specifier = ">=14.0.0" },
+    { name = "scipy", specifier = ">=1.16.0" },
    { name = "seaborn", specifier = ">=0.13.2" },
 ]