This commit is contained in:
Félix Dorn 2025-07-03 17:32:41 +02:00
parent 2da206d368
commit b7c94590f9
14 changed files with 2200 additions and 13 deletions

507
old/add_task_estimates.py Normal file
View file

@@ -0,0 +1,507 @@
import pandas as pd
import litellm
import dotenv
import os
import time
import json
import math
import numpy as np
# Load environment variables
dotenv.load_dotenv(override=True)
# --- Configuration ---
MODEL = "gpt-4.1-mini" # Make sure this model supports json_schema or structured output
RATE_LIMIT = 5000 # Requests per minute
CHUNK_SIZE = 300
SECONDS_PER_MINUTE = 60
FILENAME = (
"tasks_with_estimates.csv" # This CSV should contain the tasks to be processed
)
# --- Prompts and Schema ---
SYSTEM_PROMPT = """
You are an expert assistant evaluating the time to completion required for job tasks. Your goal is to estimate the time range needed for a skilled human to complete the following job task remotely, without supervision.
Provide a lower and upper bound estimate for the time to completion. These bounds should capture the time within which approximately 80% of instances of performing this specific task are typically completed by a qualified individual.
Base your estimate on the provided task description, its associated activities, and the occupational context. Your estimate must be in one of the allowed units: minute, hour, day, week, month, trimester, semester, year.
""".strip()
USER_MESSAGE_TEMPLATE = """
Please estimate the time range for the following remote task:
**Task Description:** {task}
**Relevant activities for the task:**
{dwas}
**Occupation Category:** {occupation_title}
**Occupation Description:** {occupation_description}
Consider the complexity and the typical steps involved.
""".strip()
ALLOWED_UNITS = [
"minute",
"hour",
"day",
"week",
"month",
"trimester",
"semester",
"year",
]
SCHEMA_FOR_VALIDATION = {
"name": "estimate_time",
"strict": True, # Enforce schema adherence
"schema": {
"type": "object",
"properties": {
"lower_bound_estimate": {
"type": "object",
"properties": {
"quantity": {
"type": "number",
"description": "The numerical value for the lower bound of the estimate.",
},
"unit": {
"type": "string",
"enum": ALLOWED_UNITS,
"description": "The unit of time for the lower bound.",
},
},
"required": ["quantity", "unit"],
"additionalProperties": False,
},
"upper_bound_estimate": {
"type": "object",
"properties": {
"quantity": {
"type": "number",
"description": "The numerical value for the upper bound of the estimate.",
},
"unit": {
"type": "string",
"enum": ALLOWED_UNITS,
"description": "The unit of time for the upper bound.",
},
},
"required": ["quantity", "unit"],
"additionalProperties": False,
},
},
"required": ["lower_bound_estimate", "upper_bound_estimate"],
"additionalProperties": False,
},
}
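# Example of a response payload conforming to SCHEMA_FOR_VALIDATION (illustrative values only):
# {"lower_bound_estimate": {"quantity": 30, "unit": "minute"},
#  "upper_bound_estimate": {"quantity": 2, "unit": "hour"}}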
def save_dataframe(df_to_save, filename):
"""Saves the DataFrame to the specified CSV file using atomic write."""
try:
temp_filename = filename + ".tmp"
df_to_save.to_csv(temp_filename, encoding="utf-8-sig", index=False)
os.replace(temp_filename, filename)
except Exception as e:
print(f"--- Error saving DataFrame to {filename}: {e} ---")
if os.path.exists(temp_filename):
try:
os.remove(temp_filename)
except Exception as remove_err:
print(
f"--- Error removing temporary save file {temp_filename}: {remove_err} ---"
)
def create_task_estimates():
try:
# Read the CSV
if os.path.exists(FILENAME):
df = pd.read_csv(FILENAME, encoding="utf-8-sig")
print(f"Successfully read {len(df)} rows from {FILENAME}.")
estimate_columns_spec = {
"lb_estimate_qty": float,
"lb_estimate_unit": object,
"ub_estimate_qty": float,
"ub_estimate_unit": object,
}
save_needed = False
for col_name, target_dtype in estimate_columns_spec.items():
if col_name not in df.columns:
# Initialize with a type-compatible missing value
if target_dtype == float:
df[col_name] = np.nan
else: # object
df[col_name] = pd.NA
df[col_name] = df[col_name].astype(target_dtype) # Enforce dtype
print(f"Added '{col_name}' column as {df[col_name].dtype}.")
save_needed = True
else:
# Column exists, ensure correct dtype
current_pd_dtype = df[col_name].dtype
expected_pd_dtype = pd.Series(dtype=target_dtype).dtype
if current_pd_dtype != expected_pd_dtype:
try:
if target_dtype == float:
df[col_name] = pd.to_numeric(df[col_name], errors="coerce")
else: # object
df[col_name] = df[col_name].astype(object)
print(
f"Corrected dtype of '{col_name}' to {df[col_name].dtype}."
)
save_needed = True
except Exception as e:
print(
f"Warning: Could not convert column '{col_name}' to {target_dtype}: {e}. Current dtype: {current_pd_dtype}"
)
# Standardize missing values (e.g., empty strings to NA/NaN)
# Replace common missing placeholders with pd.NA first
df[col_name] = df[col_name].replace(["", None], pd.NA)
if target_dtype == float:
# For float columns, ensure they are numeric and use np.nan after replacement
df[col_name] = pd.to_numeric(df[col_name], errors="coerce")
if save_needed:
print(f"Saving {FILENAME} after adding/adjusting estimate columns.")
save_dataframe(df, FILENAME)
else:
print(
f"Error: {FILENAME} not found. Please ensure the file exists and contains task data."
)
exit()
except FileNotFoundError:
print(
f"Error: {FILENAME} not found. Please ensure the file exists and contains task data."
)
exit()
except Exception as e:
print(f"Error reading or initializing {FILENAME}: {e}")
exit()
# --- Identify Rows to Process ---
# We'll check for NaN in one of the primary quantity columns.
unprocessed_mask = df["lb_estimate_qty"].isna()
if unprocessed_mask.any():
start_index = unprocessed_mask.idxmax() # Finds the index of the first True value
print(f"Resuming processing. First unprocessed row found at index {start_index}.")
df_to_process = df.loc[unprocessed_mask].copy()
original_indices = df_to_process.index # Keep track of original indices
else:
print(
"All rows seem to have estimates already (based on 'lb_estimate_qty'). Exiting."
)
exit()
# --- Prepare messages for batch completion (only for rows needing processing) ---
messages_list = []
skipped_rows_indices = []
valid_original_indices = []
if not df_to_process.empty:
required_cols = ["task", "occupation_title", "occupation_description", "dwas"]
print(
f"Preparing messages for up to {len(df_to_process)} rows starting from original index {original_indices[0] if len(original_indices) > 0 else 'N/A'}..."
)
print(f"Checking for required columns: {required_cols}")
for index, row in df_to_process.iterrows():
missing_or_empty = []
for col in required_cols:
if col not in row or pd.isna(row[col]) or str(row[col]).strip() == "":
missing_or_empty.append(col)
if missing_or_empty:
print(
f"Warning: Skipping row original index {index} due to missing/empty required data in columns: {', '.join(missing_or_empty)}."
)
skipped_rows_indices.append(index)
continue
try:
user_message = USER_MESSAGE_TEMPLATE.format(
task=row["task"],
occupation_title=row["occupation_title"],
occupation_description=row["occupation_description"],
dwas=row["dwas"],
)
except KeyError as e:
print(
f"Error: Skipping row original index {index} due to formatting error - missing key: {e}. Check USER_MESSAGE_TEMPLATE and CSV columns."
)
skipped_rows_indices.append(index)
continue
messages_for_row = [
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": user_message},
]
messages_list.append(messages_for_row)
valid_original_indices.append(index) # This is the original DataFrame index
print(
f"Prepared {len(messages_list)} valid message sets for batch completion (skipped {len(skipped_rows_indices)} rows)."
)
if not messages_list:
print("No valid rows found to process after checking required data. Exiting.")
exit()
else:
print(
"No rows found needing processing (df_to_process is empty)."
) # Should have been caught by earlier check
exit()
# --- Call batch_completion in chunks with rate limiting and periodic saving ---
total_messages_to_send = len(messages_list)
num_chunks = math.ceil(total_messages_to_send / CHUNK_SIZE)
print(
f"\nStarting batch completion for {total_messages_to_send} items in {num_chunks} chunks..."
)
overall_start_time = time.time()
processed_count_total = 0
for i in range(num_chunks):
chunk_start_message_index = i * CHUNK_SIZE
chunk_end_message_index = min((i + 1) * CHUNK_SIZE, total_messages_to_send)
message_chunk = messages_list[chunk_start_message_index:chunk_end_message_index]
# Get corresponding original DataFrame indices for this chunk
chunk_original_indices = valid_original_indices[
chunk_start_message_index:chunk_end_message_index
]
if not message_chunk:
continue
min_idx_disp = min(chunk_original_indices) if chunk_original_indices else "N/A"
max_idx_disp = max(chunk_original_indices) if chunk_original_indices else "N/A"
print(
f"\nProcessing chunk {i + 1}/{num_chunks} (Messages {chunk_start_message_index + 1}-{chunk_end_message_index} of this run)..."
f" Corresponding to original indices: {min_idx_disp} - {max_idx_disp}"
)
chunk_start_time = time.time()
responses = []
try:
print(f"Sending {len(message_chunk)} requests for chunk {i + 1}...")
responses = litellm.batch_completion(
model=MODEL,
messages=message_chunk,
response_format={
"type": "json_schema",
"json_schema": SCHEMA_FOR_VALIDATION,
},
num_retries=3,
# request_timeout=60 # Optional: uncomment if needed
)
print(f"Chunk {i + 1} API call completed.")
except Exception as e:
print(f"Error during litellm.batch_completion for chunk {i + 1}: {e}")
responses = [None] * len(
message_chunk
) # Ensure responses list matches message_chunk length for processing loop
# --- Process responses for the current chunk ---
chunk_updates = {} # To store {original_df_index: {qty/unit data}}
successful_in_chunk = 0
failed_in_chunk = 0
if responses and len(responses) == len(message_chunk):
for j, response in enumerate(responses):
original_df_index = chunk_original_indices[j]
# Initialize values for this item
lb_qty_val, lb_unit_val, ub_qty_val, ub_unit_val = None, None, None, None
content_str = None
if response is None:
print(
f"Skipping processing for original index {original_df_index} due to API call failure for this item (response is None)."
)
failed_in_chunk += 1
continue
try:
if (
response.choices
and response.choices[0].message
and response.choices[0].message.content
):
content_str = response.choices[0].message.content
estimate_data = json.loads(content_str) # Can raise JSONDecodeError
lower_bound_dict = estimate_data.get("lower_bound_estimate")
upper_bound_dict = estimate_data.get("upper_bound_estimate")
valid_response_structure = isinstance(
lower_bound_dict, dict
) and isinstance(upper_bound_dict, dict)
if valid_response_structure:
lb_qty_raw = lower_bound_dict.get("quantity")
lb_unit_raw = lower_bound_dict.get("unit")
ub_qty_raw = upper_bound_dict.get("quantity")
ub_unit_raw = upper_bound_dict.get("unit")
is_valid_item = True
# Validate LB Qty
if (
not isinstance(lb_qty_raw, (int, float))
or math.isnan(float(lb_qty_raw))
or float(lb_qty_raw) < 0
):
print(
f"Warning: Invalid lb_quantity for original index {original_df_index}: {lb_qty_raw}"
)
is_valid_item = False
else:
lb_qty_val = float(lb_qty_raw)
# Validate UB Qty
if (
not isinstance(ub_qty_raw, (int, float))
or math.isnan(float(ub_qty_raw))
or float(ub_qty_raw) < 0
):
print(
f"Warning: Invalid ub_quantity for original index {original_df_index}: {ub_qty_raw}"
)
is_valid_item = False
else:
ub_qty_val = float(ub_qty_raw)
# Validate Units
if lb_unit_raw not in ALLOWED_UNITS:
print(
f"Warning: Invalid lb_unit for original index {original_df_index}: '{lb_unit_raw}'"
)
is_valid_item = False
else:
lb_unit_val = lb_unit_raw
if ub_unit_raw not in ALLOWED_UNITS:
print(
f"Warning: Invalid ub_unit for original index {original_df_index}: '{ub_unit_raw}'"
)
is_valid_item = False
else:
ub_unit_val = ub_unit_raw
if is_valid_item:
successful_in_chunk += 1
chunk_updates[original_df_index] = {
"lb_estimate_qty": lb_qty_val,
"lb_estimate_unit": lb_unit_val,
"ub_estimate_qty": ub_qty_val,
"ub_estimate_unit": ub_unit_val,
}
else:
failed_in_chunk += (
1 # Values remain None if not fully valid
)
else:
print(
f"Warning: Missing or malformed estimate dicts in JSON for original index {original_df_index}. Content: '{content_str}'"
)
failed_in_chunk += 1
else:
finish_reason = (
response.choices[0].finish_reason
if (response.choices and response.choices[0].finish_reason)
else "unknown"
)
error_message = (
response.choices[0].message.content
if (
response.choices
and response.choices[0].message
and response.choices[0].message.content
)
else "No content in message."
)
print(
f"Warning: Received non-standard or empty response content for original index {original_df_index}. "
f"Finish Reason: '{finish_reason}'. Message: '{error_message}'. Raw Choices: {response.choices}"
)
failed_in_chunk += 1
except json.JSONDecodeError:
print(
f"Warning: Could not decode JSON for original index {original_df_index}. Content received: '{content_str}'"
)
failed_in_chunk += 1
except AttributeError as ae:
print(
f"Warning: Missing expected attribute processing response for original index {original_df_index}: {ae}. Response: {response}"
)
failed_in_chunk += 1
except Exception as e:
print(
f"Warning: An unexpected error occurred processing response for original index {original_df_index}: {type(e).__name__} - {e}. Response: {response}"
)
failed_in_chunk += 1
else:
print(
f"Warning: Mismatch between number of responses ({len(responses) if responses else 0}) "
f"and messages sent ({len(message_chunk)}) for chunk {i + 1}, or no responses. Marking all as failed."
)
failed_in_chunk = len(
message_chunk
) # All items in this chunk are considered failed if response array is problematic
print(
f"Chunk {i + 1} processing summary: Success={successful_in_chunk}, Failed/Skipped={failed_in_chunk}"
)
processed_count_total += successful_in_chunk
# --- Update Main DataFrame and Save Periodically ---
if chunk_updates:
print(
f"Updating main DataFrame with {len(chunk_updates)} new estimates for chunk {i + 1}..."
)
for idx, estimates in chunk_updates.items():
if idx in df.index:
df.loc[idx, "lb_estimate_qty"] = estimates["lb_estimate_qty"]
df.loc[idx, "lb_estimate_unit"] = estimates["lb_estimate_unit"]
df.loc[idx, "ub_estimate_qty"] = estimates["ub_estimate_qty"]
df.loc[idx, "ub_estimate_unit"] = estimates["ub_estimate_unit"]
print(f"Saving progress to {FILENAME}...")
save_dataframe(df, FILENAME)
else:
print(f"No successful estimates obtained in chunk {i + 1} to save.")
# --- Rate Limiting Pause ---
chunk_end_time = time.time()
chunk_duration = chunk_end_time - chunk_start_time
print(f"Chunk {i + 1} took {chunk_duration:.2f} seconds.")
if i < num_chunks - 1: # No pause after the last chunk
# Calculate ideal time per request based on rate limit
time_per_request = SECONDS_PER_MINUTE / RATE_LIMIT if RATE_LIMIT > 0 else 0
# Calculate minimum duration this chunk should have taken to respect rate limit
min_chunk_duration_for_rate = len(message_chunk) * time_per_request
# Calculate pause needed
pause_needed = max(0, min_chunk_duration_for_rate - chunk_duration)
if pause_needed > 0:
print(
f"Pausing for {pause_needed:.2f} seconds to respect rate limit ({RATE_LIMIT}/min)..."
)
time.sleep(pause_needed)
overall_end_time = time.time()
total_duration_minutes = (overall_end_time - overall_start_time) / 60
print(
f"\nBatch completion finished."
f" Processed {processed_count_total} new estimates in this run in {total_duration_minutes:.2f} minutes."
)
print(f"Performing final save to {FILENAME}...")
save_dataframe(df, FILENAME)
print("\nScript finished.")

521
old/analysis.py Normal file
View file

@@ -0,0 +1,521 @@
import os
import litellm
import sqlite3
import numpy as np
import pandas as pd
from google.colab import userdata, files
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')
os.environ['GEMINI_API_KEY'] = userdata.get('GEMINI_API_KEY')
occupation_major_codes = {
'11': 'Management',
'13': 'Business and Financial Operations',
'15': 'Computer and Mathematical Occupations',
'17': 'Architecture and Engineering',
'19': 'Life, Physical, and Social Science',
'21': 'Community and Social Services',
'23': 'Legal',
'25': 'Education, Training, and Library',
'27': 'Arts, Design, Entertainment, Sports, and Media',
'29': 'Healthcare Practitioners and Technical',
'31': 'Healthcare Support',
'33': 'Protective Service',
'35': 'Food Preparation and Serving Related',
'37': 'Building and Grounds Cleaning and Maintenance',
'39': 'Personal Care and Service',
'41': 'Sales and Related',
'43': 'Office and Administrative Support',
'45': 'Farming, Fishing, and Forestry',
'47': 'Construction and Extraction',
'49': 'Installation, Maintenance, and Repair',
'51': 'Production',
'53': 'Transportation and Material Moving',
'55': 'Military Specific'
}
gray = {'50':'#f8fafc','100':'#f1f5f9','200':'#e2e8f0',
'300':'#cbd5e1','400':'#94a3b8','500':'#64748b',
'600':'#475569','700':'#334155','800':'#1e293b',
'900':'#0f172a','950':'#020617'}
lime = {'50': '#f7fee7','100': '#ecfcca','200': '#d8f999',
'300': '#bbf451','400': '#9ae600','500': '#83cd00',
'600': '#64a400','700': '#497d00','800': '#3c6300',
'900': '#35530e','950': '#192e03'}
mpl.rcParams.update({
'figure.facecolor' : gray['50'],
'axes.facecolor' : gray['50'],
'axes.edgecolor' : gray['100'],
'axes.labelcolor' : gray['700'],
'xtick.color' : gray['700'],
'ytick.color' : gray['700'],
'font.family' : 'Inter', # falls back to DejaVu if Inter not present
'font.size' : 11,
})
sns.set_style("white") # keep minimal axes, we will remove default grid
sns.set_context("notebook")
def prepare_tasks():
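"""Merge remote-status and estimateability labels into the enriched O*NET task ratings, filter by importance, and derive the remote-task subset."""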
# Run uv run ./enrich_task_ratings.py
df_tasks = pd.read_json("task_ratings_enriched.json")
# Run uv run classify_estimateability_of_tasks.py
df_task_estimateable = pd.read_csv("tasks_estimateable.csv").rename(columns={"task_estimateable": "estimateable"}).drop_duplicates(subset=['task'], keep='first')
# df_tasks now has a remote_status column which contains either "remote" or "not remote"
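# NOTE: df_remote_status is assumed to be loaded elsewhere (a table with 'Task' and 'Remote' columns); it is not defined in this file.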
df_tasks = pd.merge(df_tasks, df_remote_status[['Task', 'Remote']], left_on='task', right_on='Task', how='left')
df_tasks = df_tasks.drop('Task', axis=1).rename(columns={'Remote': 'remote_status'})
# df_tasks now has a estimateable column which contains either "ATOMIC" or "ONGOING-CONSTRAINT"
df_tasks = pd.merge(df_tasks, df_task_estimateable[['task', 'estimateable']], on='task', how='left')
df_tasks = df_tasks[df_tasks['importance_average'] < 3].copy()
df_tasks['onetsoc_major'] = df_tasks['onetsoc_code'].str[:2]
df_remote_tasks = df_tasks[df_tasks['remote_status'] == 'remote'].copy()
# Call create_task_estimates() from add_task_estimates? which creates tasks_with_estimates.csv
def preprocessing_time_estimates():
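"""Load the LLM time estimates, convert the bounds to minutes, and derive range, ratio, and midpoint columns. Assumes df_task_estimateable (built in prepare_tasks) is available in scope."""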
df = pd.read_csv("tasks_with_estimates.csv")
df = df[df['importance_average'] > 3].copy()
# The embeddings come from running `uv run ./embed_task_description.py`
# Columns: ['embedding_id', 'task', 'embedding_vector']
# These contain embedding for UNIQUE tasks
df_task_embeddings = pd.read_parquet("tasks_with_embeddings.parquet").drop_duplicates(subset=['task'])[['task', 'task_embedding']].rename(columns={"task_embedding": "embedding_vector"}).copy()
df = pd.merge(df, df_task_embeddings[['task', 'embedding_vector']], on='task', how='left')
df = pd.merge(df, df_task_estimateable[['task', 'estimateable']], on='task', how='left')
df['onetsoc_major'] = df['onetsoc_code'].str[:2]
def convert_to_minutes(qty, unit):
"""Converts a quantity in a given unit to minutes."""
return qty * {
"minute": 1,
"hour": 60,
"day": 60 * 24,
"week": 60 * 24 * 7,
"month": 60 * 24 * 30,
"trimester": 60 * 24 * 90,
"semester": 60 * 24 * 180,
"year": 60 * 24 * 365,
}[unit]
df['lb_estimate_in_minutes'] = df.apply(
lambda row: convert_to_minutes(row['lb_estimate_qty'], row['lb_estimate_unit']), axis=1
)
df['ub_estimate_in_minutes'] = df.apply(
lambda row: convert_to_minutes(row['ub_estimate_qty'], row['ub_estimate_unit']), axis=1
)
df['estimate_range'] = df.ub_estimate_in_minutes - df.lb_estimate_in_minutes
df['estimate_ratio'] = df.ub_estimate_in_minutes / df.lb_estimate_in_minutes
df['estimate_midpoint'] = (df.lb_estimate_in_minutes + df.ub_estimate_in_minutes)/2
atomic_tasks = df[df['estimateable'] == 'ATOMIC']
ongoing_tasks = df[df['estimateable'] == 'ONGOING-CONSTRAINT']
with pd.option_context('display.max_columns', None):
display(df)
# Check for empty estimates
if atomic_tasks['lb_estimate_in_minutes'].isnull().sum() > 0:
print("Missing values in 'lb_estimate_in_minutes':", atomic_tasks['lb_estimate_in_minutes'].isnull().sum())
if atomic_tasks['ub_estimate_in_minutes'].isnull().sum() > 0:
print("Missing values in 'ub_estimate_in_minutes':", atomic_tasks['ub_estimate_in_minutes'].isnull().sum())
# Check for impossible bounds
impossible_bounds = atomic_tasks[
(atomic_tasks['lb_estimate_in_minutes'] <= 0) |
(atomic_tasks['ub_estimate_in_minutes'] <= 0) |
(atomic_tasks['lb_estimate_in_minutes'] > atomic_tasks['ub_estimate_in_minutes'])
]
if not impossible_bounds.empty:
print(f"Error: Found rows with impossible bounds.")
with pd.option_context('display.max_colwidth', None):
display(impossible_bounds[['task', 'lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'dwas']])
#with pd.option_context('display.max_colwidth', None):
#display(atomic_tasks.nlargest(20, 'ub_estimate_in_minutes')[['task', 'lb_estimate_qty', 'lb_estimate_unit', 'lb_estimate_in_minutes', 'ub_estimate_qty', 'ub_estimate_unit', 'ub_estimate_in_minutes', 'estimate_ratio']])
def cell1():
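# Distribution of task midpoint estimates on a log scale.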
sns.histplot(atomic_tasks.estimate_midpoint, log_scale=True)
def cell2():
plt.figure(figsize=(14,10))
sns.boxplot(
data=atomic_tasks,
x='onetsoc_major', # 11 = Management, 15 = Computer/Math, …
y='estimate_range',
showfliers=False
)
plt.yscale('log') # long tail => log scale
plt.xlabel('Occupation')
plt.ylabel('Range (upper-lower, minutes)')
plt.title('Spread of time-range estimates per occupation')
ax = plt.gca()
ax.set_xticklabels([occupation_major_codes[code.get_text()] for code in ax.get_xticklabels()], rotation=60, ha='right')
def cell3():
plt.figure(figsize=(10, 10))
ax = sns.scatterplot(
data=atomic_tasks.replace({'onetsoc_major': occupation_major_codes}), # Replace codes with labels
x='lb_estimate_in_minutes', y='ub_estimate_in_minutes',
alpha=0.2, edgecolor=None, hue="onetsoc_major" # Use the labeled column for hue
)
# 45° reference
lims = (1, atomic_tasks[['lb_estimate_in_minutes','ub_estimate_in_minutes']].max().max())
ax.plot(lims, lims, color='black', linestyle='--', linewidth=1)
# optional helper lines: 2× and 10×, 100× ratios
for k in [2,10, 100]:
ax.plot(lims, [k*l for l in lims],
linestyle=':', color='grey', linewidth=1)
ax.set(xscale='log', yscale='log')
ax.set_xlabel('Lower-bound (min, log scale)')
ax.set_ylabel('Upper-bound (min, log scale)')
ax.set_title('Lower vs upper estimates for all tasks')
# Place the legend outside the plot
ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
def cell4():
plt.figure(figsize=(8,4))
sns.histplot(np.log10(atomic_tasks['estimate_ratio'].replace([np.inf, -np.inf], np.nan).dropna()),
bins=60, kde=True)
plt.axvline(np.log10(10), color='red', ls='--', lw=1, label='10×')
plt.axvline(np.log10(1.05), color='orange', ls='--', lw=1, label='1.05×')
plt.axvline(0, color='black', ls='-', lw=1) # ub = lb
plt.xlabel('log₁₀(upper / lower)')
plt.ylabel('Count')
plt.title('Distribution of upper:lower ratio')
plt.legend()
plt.tight_layout()
def cell5():
# 1. Bin lower bounds into quartiles (Q1–Q4)
atomic_tasks['lb_q'] = pd.qcut(atomic_tasks.lb_estimate_in_minutes,
q=4, labels=['Q1 shortest','Q2','Q3','Q4 longest'])
# 3. Aggregate: median (or mean) ratio per cell
pivot = atomic_tasks.pivot_table(index='onetsoc_major', columns='lb_q',
values='estimate_ratio', aggfunc='median')
# Map the index (onetsoc_major codes) to their corresponding labels
pivot.index = pivot.index.map(occupation_major_codes)
# 4. Visualise
plt.figure(figsize=(10,8))
sns.heatmap(pivot, cmap='RdYlGn_r', center=2, annot=True, fmt='.1f',
cbar_kws={'label':'Median upper/lower ratio'})
plt.xlabel('Lower-bound quartile')
plt.ylabel('Occupation (major group)')
plt.title('Typical range width by occupation and task length')
plt.tight_layout()
def cell6():
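# Per-occupation spread statistics (median, IQR, CV) and simple z-score outlier flagging (|z| > 3).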
"""
from scipy.stats import median_abs_deviation
def mad_z(series):
med = series.median()
mad = median_abs_deviation(series, scale='normal') # ⇒ comparable to σ
return (series - med) / mad
df['robust_z'] = df.groupby('onetsoc_code')['estimate_midpoint'].transform(mad_z)
"""
agg = (atomic_tasks
.groupby('onetsoc_code')['estimate_midpoint']
.agg(median='median',
q1=lambda x: x.quantile(.25),
q3=lambda x: x.quantile(.75),
mean='mean',
std='std')
.reset_index())
agg['IQR'] = agg.q3 - agg.q1
agg['CV'] = agg['std'] / agg['mean'] # coefficient of variation
# merge back the group mean and std so each row can be scored
atomic_tasks = atomic_tasks.merge(agg[['onetsoc_code','mean','std']], on='onetsoc_code')
atomic_tasks['z'] = (atomic_tasks.estimate_midpoint - atomic_tasks['mean']) / atomic_tasks['std']
outliers = atomic_tasks.loc[atomic_tasks.z.abs() > 3]
outliers
def cell7():
from scipy.stats import median_abs_deviation
def mad_z(series):
med = series.median()
mad = median_abs_deviation(series, scale='normal') # ⇒ comparable to σ
return (series - med) / mad
atomic_tasks['robust_z'] = atomic_tasks.groupby('onetsoc_code')['estimate_midpoint'].transform(mad_z)
def cell10():
import matplotlib.ticker as mtick # For percentage formatting
import matplotlib.colors as mcolors # For color conversion
summary_data = []
for code, label in occupation_major_codes.items():
occ_df = df_tasks[df_tasks['onetsoc_major'] == code]
total_tasks_in_occ = len(occ_df)
if total_tasks_in_occ == 0:
continue # Skip if no tasks for this occupation
# Stack 1: % that isn't equal to "remote"
not_remote_count = len(occ_df[occ_df['remote_status'] != 'remote'])
# For the remaining remote tasks:
remote_df = occ_df[occ_df['remote_status'] == 'remote']
# Stack 2: % of remote + ATOMIC
remote_atomic_count = len(remote_df[remote_df['estimateable'] == 'ATOMIC'])
# Stack 3: % of remote + ONGOING-CONSTRAINT
remote_ongoing_count = len(remote_df[remote_df['estimateable'] == 'ONGOING-CONSTRAINT'])
summary_data.append({
'onetsoc_major_code': code,
'occupation_label': label,
'count_not_remote': not_remote_count,
'count_remote_atomic': remote_atomic_count,
'count_remote_ongoing': remote_ongoing_count,
'total_tasks': total_tasks_in_occ
})
summary_df = pd.DataFrame(summary_data)
# --- 3. Calculate Percentages ---
# Ensure total_tasks is not zero to avoid division by zero errors if an occupation had no tasks
summary_df = summary_df[summary_df['total_tasks'] > 0].copy() # Use .copy() to avoid SettingWithCopyWarning
summary_df['pct_not_remote'] = (summary_df['count_not_remote'] / summary_df['total_tasks']) * 100
summary_df['pct_remote_atomic'] = (summary_df['count_remote_atomic'] / summary_df['total_tasks']) * 100
summary_df['pct_remote_ongoing'] = (summary_df['count_remote_ongoing'] / summary_df['total_tasks']) * 100
# Select columns for plotting and set index to occupation label
plot_df = summary_df.set_index('occupation_label')[
['pct_not_remote', 'pct_remote_atomic', 'pct_remote_ongoing']
]
# Rename columns for a clearer legend
plot_df.columns = ['Not Remote', 'Remote + Estimable', 'Remote + Not estimable']
plot_df = plot_df.sort_values(by='Not Remote', ascending=False)
# --- 4. Plotting (Modified) ---
# Define the custom colors based on your requirements
# The order must match the column order in plot_df:
# 1. 'Not Remote'
# 2. 'Remote & ATOMIC'
# 3. 'Remote & ONGOING-CONSTRAINT'
bar_colors = [gray["300"], lime["500"], lime["200"]]
fig, ax = plt.subplots(figsize=(14, 10)) # Adjusted figsize for better readability
plot_df.plot(kind='barh', stacked=True, ax=ax, color=bar_colors)
ax.set_xlabel("Percentage of Tasks (%)", fontsize=12)
ax.set_ylabel("Occupation Major Group", fontsize=12)
ax.set_title("Task Breakdown by Occupation, Remote Status, and Estimateability", fontsize=14, pad=20)
# Format x-axis as percentages
ax.xaxis.set_major_formatter(mtick.PercentFormatter())
plt.xlim(0, 100) # Ensure x-axis goes from 0 to 100%
# Remove right and top spines
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
# Function to get contrasting text color
def get_contrasting_text_color(bg_color_hex_or_rgba):
"""
Determines if black or white text provides better contrast against a given background color.
bg_color_hex_or_rgba: A hex string (e.g., '#RRGGBB') or an RGBA tuple (values in [0, 1]).
Returns: 'black' or 'white'.
"""
# Convert to RGBA if it's a hex string or name
if isinstance(bg_color_hex_or_rgba, str):
rgba = mcolors.to_rgba(bg_color_hex_or_rgba)
else:
rgba = bg_color_hex_or_rgba
r, g, b, _ = rgba # Ignore alpha for luminance calculation
# Calculate luminance (standard formula for sRGB)
# Values r, g, b should be in [0, 1] for this formula
luminance = 0.2126 * r + 0.7152 * g + 0.0722 * b
# Threshold for deciding text color
return 'black' if luminance > 0.55 else 'white' # Threshold adjusted slightly for better visual contrast
# Add percentages inside each bar segment
# Iterate through each "category" of bars (Not Remote, Remote & ATOMIC, etc.)
for i, container in enumerate(ax.containers):
# Get the color for this container/category
segment_color = bar_colors[i]
text_color = get_contrasting_text_color(segment_color)
for patch in container.patches: # Iterate through each bar segment in the category
width = patch.get_width()
if width > 3: # Only add text if segment is wide enough (e.g., >3%)
x = patch.get_x() + width / 2
y = patch.get_y() + patch.get_height() / 2
ax.text(x, y,
f"{width:.1f}%",
ha='center',
va='center',
fontsize=8, # Adjust font size as needed
color=text_color,
fontweight='medium') # Bolder text can help
plt.legend(title="Task Category", bbox_to_anchor=(1.02, 1), loc='upper left', frameon=False)
def cell11():
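# NOTE: df_oesm (BLS OEWS wage data with OCC_CODE, TOT_EMP, A_MEAN columns) is assumed to be loaded elsewhere; it is not read in this file.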
df_oesm['onetsoc_major'] = df_oesm['OCC_CODE'].str[:2]
# Calculate wage bill per occupation
# Wage bill = Total Employment * Annual Mean Wage
# Ensure columns are numeric, converting non-numeric values to NaN first
df_oesm['TOT_EMP'] = pd.to_numeric(df_oesm['TOT_EMP'], errors='coerce')
df_oesm['A_MEAN'] = pd.to_numeric(df_oesm['A_MEAN'], errors='coerce')
# Drop rows with NaN in necessary columns after coercion
df_oesm.dropna(subset=['TOT_EMP', 'A_MEAN', 'onetsoc_major'], inplace=True)
df_oesm['wage_bill'] = df_oesm['TOT_EMP'] * df_oesm['A_MEAN']
# Aggregate wage bill by onetsoc_major
df_wage_bill_major = df_oesm.groupby('onetsoc_major')['wage_bill'].sum().reset_index()
# Map major codes to titles for better plotting
df_wage_bill_major['OCC_TITLE_MAJOR'] = df_wage_bill_major['onetsoc_major'].map(occupation_major_codes)
# Sort by wage bill for better visualization
df_wage_bill_major = df_wage_bill_major.sort_values('wage_bill', ascending=False)
# Plotting
plt.figure(figsize=(12, 8))
sns.barplot(x='wage_bill', y='OCC_TITLE_MAJOR', data=df_wage_bill_major, palette="viridis")
plt.title('Total Wage Bill per Major Occupation Group')
plt.xlabel('Total Wage Bill (in billions)')
plt.ylabel('Major Occupation Group')
plt.grid(axis='x', linestyle='--', alpha=0.7)
def cell12():
# ───────────────────────────────────────────────────────────────
# 1. CUMULATIVE-DISTRIBUTION-FUNCTION (CDF) PREP
# ───────────────────────────────────────────────────────────────
def cdf(series):
s = series.sort_values().reset_index(drop=True)
return s.values, ((s.index + 1) / len(s)) * 100
x_lb , y_lb = cdf(atomic_tasks['lb_estimate_in_minutes'])
x_ub , y_ub = cdf(atomic_tasks['ub_estimate_in_minutes'])
x_mid, y_mid = cdf((atomic_tasks['ub_estimate_in_minutes'] + atomic_tasks['lb_estimate_in_minutes']) / 2)
# ───────────────────────────────────────────────────────────────
# 2. PLOTTING
# ───────────────────────────────────────────────────────────────
fig, ax = plt.subplots(figsize=(10, 6))
# horizontal reference lines every 10 %
for y_val in range(0, 101, 10):
ax.axhline(y_val, color=gray['100'], linewidth=.8, zorder=1)
# Plot Lower Bound CDF
ax.step(x_lb, y_lb,
where='post',
color=lime['300'], # Example: light blue for lower bound
linewidth=1.8,
linestyle='--',
zorder=2,
label='Lower bound estimate (CDF)')
# Plot Upper Bound CDF
ax.step(x_ub, y_ub,
where='post',
color=lime['900'], # Example: light orange/red for upper bound
linewidth=1.8,
linestyle=':',
zorder=3,
label='Upper bound estimate (CDF)')
# Plot Midpoint CDF (plotted last to be on top, or adjust zorder)
ax.step(x_mid, y_mid,
where='post',
color=lime['600'],
linewidth=2.2,
zorder=4, # Ensure it's on top of other lines if they overlap significantly
label='Mid-point estimate (CDF)')
# axes limits / scales
ax.set_ylim(0, 100)
ax.set_xscale('log')
# y-axis ➝ percent labels
ax.yaxis.set_major_formatter(mpl.ticker.PercentFormatter(decimals=0))
# move y-label to top-left (just inside plotting area)
ax.text(-0.06, 1.03,
"% of tasks with temporal coherence ≤ X",
ha='left', va='bottom',
transform=ax.transAxes,
fontsize=12, fontweight='semibold')
# custom x-ticks at human-friendly durations
ticks = [1, 5, 10, 30, 60, 120, 240, 480,
1440, 2880, 10080, 43200, 129600,
259200, 525600]
ticklabels = ['1 min', '5 min', '10 min', '30 min', '1 hour', '2 hours', '4 hours', '8 hours',
'1 day', '2 days', '1 week', '30 days',
'90 days', '180 days', '1 year']
# Vertical reference lines for x-ticks
for tick in ticks:
ax.axvline(tick, color=gray['300'], linewidth=.8, linestyle='--', zorder=1)
ax.set_xticks(ticks)
ax.set_xticklabels(ticklabels, rotation=45, ha='right')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_edgecolor(gray['300'])
ax.spines['bottom'].set_edgecolor(gray['300'])
# legend
ax.legend(frameon=False, loc='lower right') # Keep 'lower right' or adjust as needed
ax.text(0.5, -0.3,
'Temporal coherence (X)',
ha='center', va='center',
transform=ax.transAxes,
fontsize=12, fontweight='semibold')

411
old/classify_estimateability_of_tasks.py Normal file
View file

@@ -0,0 +1,411 @@
import pandas as pd
import litellm
import dotenv
import os
import time
import json
import math
# Load environment variables
dotenv.load_dotenv(override=True)
# litellm._turn_on_debug() # Optional debugging
# --- Configuration ---
MODEL = "gpt-4.1-mini" # Make sure this model supports json_schema or structured output
RATE_LIMIT = 5000 # Requests per minute
CHUNK_SIZE = 300 # Number of unique tasks per API call
SECONDS_PER_MINUTE = 60
# File configuration
CLASSIFICATION_FILENAME = "tasks_estimateable.csv" # Output file with classifications
TASK_SOURCE_FOR_INIT_FILENAME = "tasks_with_estimates.csv"
OUTPUT_COLUMN_NAME = "task_estimateable"
SOURCE_FILTER_COLUMN = "remote_status"
SOURCE_FILTER_VALUE = "remote"
# --- Prompts and Schema ---
SYSTEM_PROMPT_CLASSIFY = """
Classify the provided O*NET task into one of these categories:
- ATOMIC (schedulable): A single, clearly-bounded activity, typically lasting minutes, hours, or a few days.
- ONGOING-CONSTRAINT (background role/ethical rule): A continuous responsibility or behavioural norm with no schedulable duration (e.g., follow confidentiality rules, serve as department head).
""".strip()
USER_MESSAGE_TEMPLATE_CLASSIFY = "Task: {task}"
CLASSIFICATION_CATEGORIES = ["ATOMIC", "ONGOING-CONSTRAINT"]
SCHEMA_FOR_CLASSIFICATION = {
"name": "classify_task_type",
"strict": True,
"schema": {
"type": "object",
"properties": {
"task_category": {
"type": "string",
"enum": CLASSIFICATION_CATEGORIES,
"description": "The classification of the task (ATOMIC or ONGOING-CONSTRAINT).",
}
},
"required": ["task_category"],
"additionalProperties": False,
},
}
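# Example of a response payload conforming to SCHEMA_FOR_CLASSIFICATION (illustrative): {"task_category": "ATOMIC"}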
def save_dataframe(df_to_save, filename):
"""Saves the DataFrame to the specified CSV file using atomic write."""
try:
temp_filename = filename + ".tmp"
df_to_save.to_csv(temp_filename, encoding="utf-8-sig", index=False)
os.replace(temp_filename, filename)
except Exception as e:
print(f"--- Error saving DataFrame to {filename}: {e} ---")
if os.path.exists(temp_filename):
try:
os.remove(temp_filename)
except Exception as remove_err:
print(
f"--- Error removing temporary save file {temp_filename}: {remove_err} ---"
)
# --- Load or Initialize DataFrame ---
try:
if os.path.exists(CLASSIFICATION_FILENAME):
df = pd.read_csv(CLASSIFICATION_FILENAME, encoding="utf-8-sig")
print(f"Successfully read {len(df)} rows from {CLASSIFICATION_FILENAME}.")
save_needed_after_load = False
if OUTPUT_COLUMN_NAME not in df.columns:
df[OUTPUT_COLUMN_NAME] = pd.NA
print(f"Added '{OUTPUT_COLUMN_NAME}' column.")
save_needed_after_load = True
df[OUTPUT_COLUMN_NAME] = df[OUTPUT_COLUMN_NAME].replace(["", None], pd.NA)
if df[OUTPUT_COLUMN_NAME].dtype != object and not isinstance(
df[OUTPUT_COLUMN_NAME].dtype, pd.StringDtype
):
try:
df[OUTPUT_COLUMN_NAME] = df[OUTPUT_COLUMN_NAME].astype(object)
print(
f"Corrected dtype of '{OUTPUT_COLUMN_NAME}' to {df[OUTPUT_COLUMN_NAME].dtype}."
)
save_needed_after_load = True
except Exception as e:
print(
f"Warning: Could not convert column '{OUTPUT_COLUMN_NAME}' to object: {e}."
)
if "task" not in df.columns:
print(
f"Error: {CLASSIFICATION_FILENAME} must contain a 'task' column for processing."
)
exit()
if save_needed_after_load:
print(f"Saving {CLASSIFICATION_FILENAME} after adding/adjusting column.")
save_dataframe(df, CLASSIFICATION_FILENAME)
else:
print(
f"{CLASSIFICATION_FILENAME} not found. Attempting to create it from {TASK_SOURCE_FOR_INIT_FILENAME}."
)
if not os.path.exists(TASK_SOURCE_FOR_INIT_FILENAME):
print(
f"Error: Source file {TASK_SOURCE_FOR_INIT_FILENAME} not found. Cannot create {CLASSIFICATION_FILENAME}."
)
exit()
df_source = pd.read_csv(TASK_SOURCE_FOR_INIT_FILENAME, encoding="utf-8-sig")
required_source_cols_for_init = ["task", SOURCE_FILTER_COLUMN]
missing_source_cols = [
col for col in required_source_cols_for_init if col not in df_source.columns
]
if missing_source_cols:
print(
f"Error: Source file {TASK_SOURCE_FOR_INIT_FILENAME} is missing required columns for initialization: {', '.join(missing_source_cols)}."
)
exit()
df_source_filtered = df_source[
df_source[SOURCE_FILTER_COLUMN] == SOURCE_FILTER_VALUE
].copy()
if df_source_filtered.empty:
print(
f"Warning: No tasks with '{SOURCE_FILTER_COLUMN}' == '{SOURCE_FILTER_VALUE}' found in {TASK_SOURCE_FOR_INIT_FILENAME}. "
f"{CLASSIFICATION_FILENAME} will be created with schema but no tasks to classify initially."
)
df = df_source_filtered[["task"]].copy()
df[OUTPUT_COLUMN_NAME] = pd.NA
df[OUTPUT_COLUMN_NAME] = df[OUTPUT_COLUMN_NAME].astype(object)
print(
f"Created {CLASSIFICATION_FILENAME} using tasks from {TASK_SOURCE_FOR_INIT_FILENAME} "
f"(where {SOURCE_FILTER_COLUMN}='{SOURCE_FILTER_VALUE}'). New file has {len(df)} tasks."
)
save_dataframe(df, CLASSIFICATION_FILENAME)
except FileNotFoundError:
print(f"Error: A required file was not found. Please check paths.")
exit()
except Exception as e:
print(f"Error during DataFrame loading or initialization: {e}")
exit()
# --- Identify Unique Tasks to Process ---
if df.empty:
print(f"{CLASSIFICATION_FILENAME} is empty. Nothing to process. Exiting.")
exit()
initial_unprocessed_mask = df[OUTPUT_COLUMN_NAME].isna()
if not initial_unprocessed_mask.any():
print(
f"All tasks in {CLASSIFICATION_FILENAME} seem to have been classified already. Exiting."
)
exit()
# Filter for rows that are unprocessed AND have a valid 'task' string
valid_tasks_to_consider_df = df[
initial_unprocessed_mask & df["task"].notna() & (df["task"].str.strip() != "")
]
if valid_tasks_to_consider_df.empty:
print(
f"No valid, unclassified tasks found to process (after filtering out empty/NaN task descriptions). Exiting."
)
exit()
unique_task_labels_for_api = (
valid_tasks_to_consider_df["task"].drop_duplicates().tolist()
)
total_rows_to_update_potentially = len(
df[initial_unprocessed_mask]
) # Count all rows that are NA
print(
f"Found {total_rows_to_update_potentially} total rows in {CLASSIFICATION_FILENAME} needing classification."
)
print(
f"Identified {len(unique_task_labels_for_api)} unique, valid task labels to send to the API."
)
# --- Prepare messages for batch completion (only for unique task labels) ---
messages_list = []
print(f"Preparing messages for {len(unique_task_labels_for_api)} unique task labels...")
for task_label in unique_task_labels_for_api:
# task_label is already guaranteed to be non-empty and not NaN from the filtering above
user_message = USER_MESSAGE_TEMPLATE_CLASSIFY.format(task=task_label)
messages_for_task = [
{"role": "system", "content": SYSTEM_PROMPT_CLASSIFY},
{"role": "user", "content": user_message},
]
messages_list.append(messages_for_task)
print(f"Prepared {len(messages_list)} message sets for batch completion.")
if (
not messages_list
): # Should only happen if unique_task_labels_for_api was empty, caught above
print(
"No messages prepared, though unique tasks were identified. This is unexpected. Exiting."
)
exit()
# --- Call batch_completion in chunks with rate limiting and periodic saving ---
total_unique_tasks_to_send = len(
messages_list
) # Same as len(unique_task_labels_for_api)
num_chunks = math.ceil(total_unique_tasks_to_send / CHUNK_SIZE)
print(
f"\nStarting batch classification for {total_unique_tasks_to_send} unique task labels in {num_chunks} chunks..."
)
overall_start_time = time.time()
processed_rows_count_total = 0 # Counts actual rows updated in the DataFrame
for i in range(num_chunks):
chunk_start_message_index = i * CHUNK_SIZE
chunk_end_message_index = min((i + 1) * CHUNK_SIZE, total_unique_tasks_to_send)
message_chunk = messages_list[chunk_start_message_index:chunk_end_message_index]
# Get corresponding unique task labels for this chunk
chunk_task_labels = unique_task_labels_for_api[
chunk_start_message_index:chunk_end_message_index
]
if not message_chunk: # Should not happen if loop range is correct
continue
print(
f"\nProcessing chunk {i + 1}/{num_chunks} (Unique Task Labels {chunk_start_message_index + 1}-{chunk_end_message_index} of this run)..."
)
chunk_start_time = time.time()
responses = []
try:
print(
f"Sending {len(message_chunk)} requests (for unique tasks) for chunk {i + 1}..."
)
responses = litellm.batch_completion(
model=MODEL,
messages=message_chunk,
response_format={
"type": "json_schema",
"json_schema": SCHEMA_FOR_CLASSIFICATION,
},
num_retries=3,
)
print(f"Chunk {i + 1} API call completed.")
except Exception as e:
print(f"Error during litellm.batch_completion for chunk {i + 1}: {e}")
responses = [None] * len(message_chunk)
# --- Process responses for the current chunk ---
# chunk_updates stores {task_label: classification_category}
chunk_task_classifications = {}
successful_api_calls_in_chunk = 0
failed_api_calls_in_chunk = 0
if responses and len(responses) == len(message_chunk):
for j, response in enumerate(responses):
current_task_label = chunk_task_labels[
j
] # The unique task label for this response
content_str = None
if response is None:
print(
f"API call failed for task label '{current_task_label}' (response is None)."
)
failed_api_calls_in_chunk += 1
continue
try:
if (
response.choices
and response.choices[0].message
and response.choices[0].message.content
):
content_str = response.choices[0].message.content
classification_data = json.loads(content_str)
category_raw = classification_data.get("task_category")
if category_raw in CLASSIFICATION_CATEGORIES:
successful_api_calls_in_chunk += 1
chunk_task_classifications[current_task_label] = category_raw
else:
print(
f"Warning: Invalid or missing task_category for task label '{current_task_label}': '{category_raw}'. Content: '{content_str}'"
)
failed_api_calls_in_chunk += 1
else:
finish_reason = (
response.choices[0].finish_reason
if (response.choices and response.choices[0].finish_reason)
else "unknown"
)
error_message = (
response.choices[0].message.content
if (response.choices and response.choices[0].message)
else "No content in message."
)
print(
f"Warning: Received non-standard or empty response content for task label '{current_task_label}'. "
f"Finish Reason: '{finish_reason}'. Message: '{error_message}'. Raw Choices: {response.choices}"
)
failed_api_calls_in_chunk += 1
except json.JSONDecodeError:
print(
f"Warning: Could not decode JSON for task label '{current_task_label}'. Content received: '{content_str}'"
)
failed_api_calls_in_chunk += 1
except AttributeError as ae:
print(
f"Warning: Missing attribute processing response for task label '{current_task_label}': {ae}. Response: {response}"
)
failed_api_calls_in_chunk += 1
except Exception as e:
print(
f"Warning: Unexpected error processing response for task label '{current_task_label}': {type(e).__name__} - {e}. Response: {response}"
)
failed_api_calls_in_chunk += 1
else:
print(
f"Warning: Mismatch between #responses ({len(responses) if responses else 0}) "
f"and #messages sent ({len(message_chunk)}) for chunk {i + 1}, or no responses. Marking all API calls in chunk as failed."
)
failed_api_calls_in_chunk = len(message_chunk)
# --- Update Main DataFrame and Save Periodically ---
rows_updated_this_chunk = 0
if chunk_task_classifications:
print(
f"Updating main DataFrame with classifications for {len(chunk_task_classifications)} unique tasks from chunk {i + 1}..."
)
for task_label, category in chunk_task_classifications.items():
# Update all rows in the main df that match this task_label AND are still NA in the output column
update_condition = (df["task"] == task_label) & (
df[OUTPUT_COLUMN_NAME].isna()
)
num_rows_for_this_task_label = df[update_condition].shape[0]
if num_rows_for_this_task_label > 0:
df.loc[update_condition, OUTPUT_COLUMN_NAME] = category
rows_updated_this_chunk += num_rows_for_this_task_label
print(
f"Updated {rows_updated_this_chunk} rows in the DataFrame based on this chunk's API responses."
)
print(f"Saving progress to {CLASSIFICATION_FILENAME}...")
save_dataframe(df, CLASSIFICATION_FILENAME)
else:
print(
f"No successful API classifications obtained in chunk {i + 1} to update DataFrame or save."
)
print(
f"Chunk {i + 1} API summary: Successful Calls={successful_api_calls_in_chunk}, Failed/Skipped Calls={failed_api_calls_in_chunk}. "
f"Rows updated in DataFrame this chunk: {rows_updated_this_chunk}"
)
processed_rows_count_total += rows_updated_this_chunk
# --- Rate Limiting Pause ---
chunk_end_time = time.time()
chunk_duration = chunk_end_time - chunk_start_time
print(f"Chunk {i + 1} (API calls and DF update) took {chunk_duration:.2f} seconds.")
if i < num_chunks - 1:
time_per_request = SECONDS_PER_MINUTE / RATE_LIMIT if RATE_LIMIT > 0 else 0
min_chunk_duration_for_rate = (
len(message_chunk) * time_per_request
) # Based on API calls made
pause_needed = max(0, min_chunk_duration_for_rate - chunk_duration)
if pause_needed > 0:
print(
f"Pausing for {pause_needed:.2f} seconds to respect rate limit ({RATE_LIMIT}/min)..."
)
time.sleep(pause_needed)
overall_end_time = time.time()
total_duration_minutes = (overall_end_time - overall_start_time) / 60
print(
f"\nBatch classification finished."
f" Updated {processed_rows_count_total} rows in '{CLASSIFICATION_FILENAME}' with new classifications in this run."
f" Total duration: {total_duration_minutes:.2f} minutes."
)
print(f"Performing final save to {CLASSIFICATION_FILENAME}...")
save_dataframe(df, CLASSIFICATION_FILENAME)
print("\nScript finished.")

85
old/create_onet_database.sh Executable file
View file

@@ -0,0 +1,85 @@
#!/usr/bin/env bash
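# Build a local SQLite copy of the O*NET 29.1 database from the official MySQL dump,
# using fast-import PRAGMAs during load and restoring safer settings afterwards.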
# Set database name and directories
ONET_DB_NAME="onet.database"
ONET_ZIP_URL="https://www.onetcenter.org/dl_files/database/db_29_1_mysql.zip"
ONET_ZIP_FILE="db_29_1_mysql.zip"
ONET_EXTRACT_DIR="db_29_1_mysql"
# Download O*NET database only if not already downloaded
if [ ! -f "$ONET_ZIP_FILE" ]; then
echo "Downloading O*NET database from $ONET_ZIP_URL"
curl -L -o "$ONET_ZIP_FILE" "$ONET_ZIP_URL" || wget -O "$ONET_ZIP_FILE" "$ONET_ZIP_URL"
if [ $? -ne 0 ]; then
echo "Failed to download O*NET database"
exit 1
fi
else
echo "Using existing O*NET database zip file"
fi
# Extract downloaded zip file only if extraction directory doesn't exist
if [ ! -d "$ONET_EXTRACT_DIR" ]; then
echo "Extracting O*NET database files"
unzip -o "$ONET_ZIP_FILE"
if [ $? -ne 0 ]; then
echo "Failed to extract O*NET database files"
exit 1
fi
else
echo "Using existing extracted O*NET database files"
fi
# Remove existing database if it exists
if [ -f "$ONET_DB_NAME" ]; then
echo "Removing existing database"
rm "$ONET_DB_NAME"
fi
# Create a new SQLite database with optimized settings for fast import
echo "Creating new SQLite database: $ONET_DB_NAME with performance settings"
sqlite3 "$ONET_DB_NAME" << EOF
PRAGMA journal_mode = OFF;
PRAGMA synchronous = 0;
PRAGMA cache_size = 1000000;
PRAGMA locking_mode = EXCLUSIVE;
PRAGMA temp_store = MEMORY;
PRAGMA foreign_keys = ON;
EOF
# Combine and execute all SQL files in one transaction
echo "Executing SQL files in alphabetical order (single transaction mode)"
sqlite3 "$ONET_DB_NAME" << EOF
BEGIN TRANSACTION;
$(find "$ONET_EXTRACT_DIR" -name "*.sql" | sort | xargs cat)
COMMIT;
EOF
# Check if the execution was successful
if [ $? -ne 0 ]; then
echo "Error executing SQL files in batch transaction"
exit 1
else
echo "Database populated successfully. Restoring reliability settings..."
# Restore reliability-focused settings after import
sqlite3 "$ONET_DB_NAME" << EOF
PRAGMA journal_mode = WAL;
PRAGMA synchronous = NORMAL;
PRAGMA locking_mode = NORMAL;
PRAGMA temp_store = DEFAULT;
PRAGMA foreign_keys = ON;
PRAGMA optimize;
VACUUM;
EOF
if [ $? -ne 0 ]; then
echo "Warning: Failed to restore reliability settings, but database is populated"
else
echo "Reliability settings restored successfully"
fi
echo "O*NET database created and optimized successfully!"
fi

392
old/enrich_task_ratings.py Normal file
View file

@@ -0,0 +1,392 @@
import sqlite3
import pandas as pd
import json
import os
from collections import defaultdict
import numpy as np
# --- Configuration ---
DB_FILE = "onet.database"
OUTPUT_FILE = "task_ratings_enriched.json" # Changed output filename
# --- Database Interaction ---
def fetch_data_from_db(db_path):
"""
Fetches required data from the O*NET SQLite database using JOINs,
including DWAs.
Args:
db_path (str): Path to the SQLite database file.
Returns:
tuple(pandas.DataFrame, pandas.DataFrame): A tuple containing:
- DataFrame with task ratings info.
- DataFrame with task-to-DWA mapping.
Returns (None, None) if the database file doesn't exist or an error occurs.
"""
if not os.path.exists(db_path):
print(f"Error: Database file not found at {db_path}")
return None, None
try:
conn = sqlite3.connect(db_path)
# Construct the SQL query to join the tables and select necessary columns
# Added LEFT JOINs for tasks_to_dwas and dwa_reference
# Use LEFT JOIN in case a task has no DWAs
query = """
SELECT
tr.onetsoc_code,
tr.task_id,
ts.task,
od.title AS occupation_title,
od.description AS occupation_description,
tr.scale_id,
tr.category,
tr.data_value,
dr.dwa_title -- Added DWA title
FROM
task_ratings tr
JOIN
task_statements ts ON tr.task_id = ts.task_id
JOIN
occupation_data od ON tr.onetsoc_code = od.onetsoc_code
LEFT JOIN
tasks_to_dwas td ON tr.onetsoc_code = td.onetsoc_code AND tr.task_id = td.task_id
LEFT JOIN
dwa_reference dr ON td.dwa_id = dr.dwa_id;
"""
df = pd.read_sql_query(query, conn)
conn.close()
print(
f"Successfully fetched {len(df)} records (including DWA info) from the database."
)
if df.empty:
print("Warning: Fetched DataFrame is empty.")
# Return empty DataFrames with expected columns if the main fetch is empty
ratings_cols = [
"onetsoc_code",
"task_id",
"task",
"occupation_title",
"occupation_description",
"scale_id",
"category",
"data_value",
]
dwa_cols = ["onetsoc_code", "task_id", "dwa_title"]
return pd.DataFrame(columns=ratings_cols), pd.DataFrame(columns=dwa_cols)
# Remove duplicates caused by joining ratings with potentially multiple DWAs per task
# Keep only unique combinations of the core task/rating info before processing
core_cols = [
"onetsoc_code",
"task_id",
"task",
"occupation_title",
"occupation_description",
"scale_id",
"category",
"data_value",
]
# Check if all core columns exist before attempting to drop duplicates
missing_core_cols = [col for col in core_cols if col not in df.columns]
if missing_core_cols:
print(f"Error: Missing core columns in fetched data: {missing_core_cols}")
return None, None
ratings_df = df[core_cols].drop_duplicates().reset_index(drop=True)
# Get unique DWA info separately
dwa_cols = ["onetsoc_code", "task_id", "dwa_title"]
# Check if all DWA columns exist before processing
if all(col in df.columns for col in dwa_cols):
dwas_df = (
df[dwa_cols]
.dropna(subset=["dwa_title"])
.drop_duplicates()
.reset_index(drop=True)
)
else:
print("Warning: DWA related columns missing, creating empty DWA DataFrame.")
dwas_df = pd.DataFrame(
columns=dwa_cols
) # Create empty df if columns missing
return ratings_df, dwas_df # Return two dataframes now
except sqlite3.Error as e:
print(f"SQLite error: {e}")
if "conn" in locals() and conn:
conn.close()
return None, None # Return None for both if error
except Exception as e:
print(f"An error occurred during data fetching: {e}")
if "conn" in locals() and conn:
conn.close()
return None, None # Return None for both if error
# --- Data Processing ---
def process_task_ratings_with_dwas(ratings_df, dwas_df):
"""
Processes the fetched data to group, pivot frequency, calculate averages,
structure the output, and add associated DWAs.
Args:
ratings_df (pandas.DataFrame): The input DataFrame with task ratings info.
dwas_df (pandas.DataFrame): The input DataFrame with task-to-DWA mapping. Can be None or empty.
Returns:
list: A list of dictionaries, each representing an enriched task rating with DWAs.
Returns None if the input ratings DataFrame is invalid.
"""
if ratings_df is None or not isinstance(
ratings_df, pd.DataFrame
): # Check if it's a DataFrame
print("Error: Input ratings DataFrame is invalid.")
return None
if ratings_df.empty:
print(
"Warning: Input ratings DataFrame is empty. Processing will yield empty result."
)
# Decide how to handle empty input, maybe return empty list directly
# return []
# Ensure dwas_df is a DataFrame, even if empty
if dwas_df is None or not isinstance(dwas_df, pd.DataFrame):
print("Warning: Invalid or missing DWA DataFrame. Proceeding without DWA data.")
dwas_df = pd.DataFrame(
columns=["onetsoc_code", "task_id", "dwa_title"]
) # Ensure it's an empty DF
print("Starting data processing...")
# --- 1. Handle Frequency (FT) ---
freq_df = ratings_df[ratings_df["scale_id"] == "FT"].copy()
if not freq_df.empty:
freq_pivot = freq_df.pivot_table(
index=["onetsoc_code", "task_id"],
columns="category",
values="data_value",
fill_value=0,
)
freq_pivot.columns = [
f"frequency_category_{int(col)}" for col in freq_pivot.columns
]
print(f"Processed Frequency data. Shape: {freq_pivot.shape}")
else:
print("No Frequency (FT) data found.")
# Create an empty DataFrame with the multi-index to allow merging later
idx = pd.MultiIndex(
levels=[[], []], codes=[[], []], names=["onetsoc_code", "task_id"]
)
freq_pivot = pd.DataFrame(index=idx)
# --- 2. Handle Importance (IM, IJ) ---
imp_df = ratings_df[ratings_df["scale_id"].isin(["IM", "IJ"])].copy()
if not imp_df.empty:
imp_avg = (
imp_df.groupby(["onetsoc_code", "task_id"])["data_value"]
.mean()
.reset_index()
)
imp_avg.rename(columns={"data_value": "importance_average"}, inplace=True)
print(f"Processed Importance data. Shape: {imp_avg.shape}")
else:
print("No Importance (IM, IJ) data found.")
imp_avg = pd.DataFrame(
columns=["onetsoc_code", "task_id", "importance_average"]
)
# --- 3. Handle Relevance (RT) ---
rel_df = ratings_df[ratings_df["scale_id"] == "RT"].copy()
if not rel_df.empty:
rel_avg = (
rel_df.groupby(["onetsoc_code", "task_id"])["data_value"]
.mean()
.reset_index()
)
rel_avg.rename(columns={"data_value": "relevance_average"}, inplace=True)
print(f"Processed Relevance data. Shape: {rel_avg.shape}")
else:
print("No Relevance (RT) data found.")
rel_avg = pd.DataFrame(columns=["onetsoc_code", "task_id", "relevance_average"])
# --- 4. Process DWAs ---
if dwas_df is not None and not dwas_df.empty and "dwa_title" in dwas_df.columns:
print("Processing DWA data...")
# Group DWAs by task_id and aggregate titles into a list
dwas_grouped = (
dwas_df.groupby(["onetsoc_code", "task_id"])["dwa_title"]
.apply(list)
.reset_index()
) #
dwas_grouped.rename(
columns={"dwa_title": "dwas"}, inplace=True
) # Rename column to 'dwas'
print(f"Processed DWA data. Shape: {dwas_grouped.shape}")
else:
print("No valid DWA data found or provided for processing.")
dwas_grouped = None # Set to None if no DWAs
# --- 5. Get Base Task/Occupation Info ---
base_cols = [
"onetsoc_code",
"task_id",
"task",
"occupation_title",
"occupation_description",
]
# Check if base columns exist in ratings_df
missing_base_cols = [col for col in base_cols if col not in ratings_df.columns]
if missing_base_cols:
print(
f"Error: Missing base info columns in ratings_df: {missing_base_cols}. Cannot proceed."
)
return None
if not ratings_df.empty:
base_info = (
ratings_df[base_cols]
.drop_duplicates()
.set_index(["onetsoc_code", "task_id"])
)
print(f"Extracted base info. Shape: {base_info.shape}")
else:
print("Cannot extract base info from empty ratings DataFrame.")
# Create an empty df with index to avoid errors later if possible
idx = pd.MultiIndex(
levels=[[], []], codes=[[], []], names=["onetsoc_code", "task_id"]
)
base_info = pd.DataFrame(
index=idx,
columns=[
col for col in base_cols if col not in ["onetsoc_code", "task_id"]
],
)
# --- 6. Merge Processed Data ---
print("Merging processed data...")
# Start with base_info, which should have the index ['onetsoc_code', 'task_id']
final_df = base_info.merge(
freq_pivot, left_index=True, right_index=True, how="left"
)
# Reset index before merging non-indexed dfs
final_df = final_df.reset_index()
# Merge averages - check if they are not empty before merging
if not imp_avg.empty:
final_df = final_df.merge(imp_avg, on=["onetsoc_code", "task_id"], how="left")
else:
final_df["importance_average"] = np.nan # Add column if imp_avg was empty
if not rel_avg.empty:
final_df = final_df.merge(rel_avg, on=["onetsoc_code", "task_id"], how="left")
else:
final_df["relevance_average"] = np.nan # Add column if rel_avg was empty
# Merge DWAs if available
if dwas_grouped is not None and not dwas_grouped.empty:
final_df = final_df.merge(
dwas_grouped, on=["onetsoc_code", "task_id"], how="left"
) # Merge the dwas list
# Fill NaN in 'dwas' column (for tasks with no DWAs) with empty lists
# Check if 'dwas' column exists before applying function
if "dwas" in final_df.columns:
final_df["dwas"] = final_df["dwas"].apply(
lambda x: x if isinstance(x, list) else []
) # Ensure tasks without DWAs get []
else:
print("Warning: 'dwas' column not created during merge.")
final_df["dwas"] = [
[] for _ in range(len(final_df))
] # Add empty list column
else:
# Add an empty 'dwas' column if no DWA data was processed or merged
final_df["dwas"] = [[] for _ in range(len(final_df))]
print(f"Final merged data shape: {final_df.shape}")
# Convert DataFrame to list of dictionaries for JSON output
# Handle potential NaN values during JSON conversion
# Replace numpy NaN with Python None for JSON compatibility
final_df = final_df.replace({np.nan: None})
result_list = final_df.to_dict(orient="records")
return result_list
# --- Output ---
def write_to_json(data, output_path):
"""
Writes the processed data to a JSON file.
Args:
data (list): The list of dictionaries to write.
output_path (str): Path to the output JSON file.
"""
if data is None:
print("No data to write to JSON.")
return
if not isinstance(data, list):
print(
f"Error: Data to write is not a list (type: {type(data)}). Cannot write to JSON."
)
return
# Create directory if it doesn't exist
output_dir = os.path.dirname(output_path)
if output_dir and not os.path.exists(output_dir):
try:
os.makedirs(output_dir)
print(f"Created output directory: {output_dir}")
except OSError as e:
print(f"Error creating output directory {output_dir}: {e}")
return # Exit if cannot create directory
try:
with open(output_path, "w", encoding="utf-8") as f:
json.dump(data, f, indent=4, ensure_ascii=False)
print(f"Successfully wrote enriched data to {output_path}")
except IOError as e:
print(f"Error writing JSON file to {output_path}: {e}")
except TypeError as e:
print(f"Error during JSON serialization: {e}. Check data types.")
except Exception as e:
print(f"An unexpected error occurred during JSON writing: {e}")
# --- Main Execution ---
if __name__ == "__main__":
print("Starting O*NET Task Ratings & DWAs Enrichment Script...")
# 1. Fetch data
ratings_data_df, dwas_data_df = fetch_data_from_db(DB_FILE) # Fetch both datasets
# 2. Process data
# Proceed only if ratings_data_df is a valid DataFrame (even if empty)
# dwas_data_df can be None or empty, handled inside process function
if isinstance(ratings_data_df, pd.DataFrame):
enriched_data = process_task_ratings_with_dwas(
ratings_data_df, dwas_data_df
) # Pass both dataframes
# 3. Write output
if (
enriched_data is not None
): # Check if processing returned data (even an empty list is valid)
write_to_json(enriched_data, OUTPUT_FILE)
else:
print("Data processing failed or returned None. No output file generated.")
else:
print(
"Data fetching failed or returned invalid type for ratings data. Script terminated."
)
print("Script finished.")


@ -7,6 +7,7 @@ from .run import Run
import pandas as pd
def enrich_with_task_estimateability(run: Run) -> pd.DataFrame:
run.metadata.
raise NotImplementedError
def enrich_with_task_estimates(run: Run) -> pd.DataFrame:


@ -5,13 +5,148 @@ Fetchers retrieve remote data and return it in a format suitable for further pro
import sqlite3
from typing import Tuple
import pandas as pd
from .metadata import Metadata
import requests
import hashlib
import io
import zipfile
from .run import Run
from .logger import logger
def fetch_onet_database(meta: Metadata) -> Tuple[sqlite3.Connection, str]:
raise NotImplementedError
def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
"""
Downloads the O*NET database, creates a local SQLite file from it, and returns a connection.
The version is the sha256 of the downloaded zip file.
"""
url = "https://www.onetcenter.org/dl_files/database/db_29_1_mysql.zip"
logger.info(f"Downloading O*NET database from {url}")
response = requests.get(url, stream=True)
response.raise_for_status()
def fetch_oesm_data(meta: Metadata) -> Tuple[pd.DataFrame, str]:
raise NotImplementedError
# Read content into memory
zip_content = response.content
version = hashlib.sha256(zip_content).hexdigest()
logger.info(f"O*NET database version (sha256): {version}")
def fetch_epoch_remote_data(meta: Metadata) -> Tuple[pd.DataFrame, str]:
raise NotImplementedError
db_path = run.cache_dir / f"onet_{version}.db"
if db_path.exists():
logger.info(f"Using cached O*NET database: {db_path}")
conn = sqlite3.connect(db_path)
# Set PRAGMA for foreign keys on every connection
conn.execute("PRAGMA foreign_keys = ON;")
return conn, version
logger.info(f"Creating new O*NET database: {db_path}")
conn = sqlite3.connect(db_path)
# Set performance PRAGMAs for fast import
logger.info("Creating new SQLite database with performance settings")
conn.executescript("""
PRAGMA journal_mode = OFF;
PRAGMA synchronous = 0;
PRAGMA cache_size = 1000000;
PRAGMA locking_mode = EXCLUSIVE;
PRAGMA temp_store = MEMORY;
PRAGMA foreign_keys = ON;
""")
with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
sql_scripts = []
for filename in sorted(z.namelist()):
if filename.endswith(".sql"):
sql_scripts.append(z.read(filename).decode('utf-8'))
if not sql_scripts:
raise RuntimeError("No SQL files found in the O*NET zip archive.")
# Combine and execute all SQL files in one transaction
full_script = "BEGIN TRANSACTION;\n" + "\n".join(sql_scripts) + "\nCOMMIT;"
logger.info("Executing SQL files in alphabetical order (single transaction mode)")
conn.executescript(full_script)
logger.info("Database populated successfully. Restoring reliability settings...")
# Restore reliability-focused settings after import
conn.executescript("""
PRAGMA journal_mode = WAL;
PRAGMA synchronous = NORMAL;
PRAGMA locking_mode = NORMAL;
PRAGMA temp_store = DEFAULT;
PRAGMA foreign_keys = ON;
PRAGMA optimize;
""")
conn.execute("VACUUM;")
conn.commit()
logger.info("Reliability settings restored and database optimized successfully!")
return conn, version
def fetch_oesm_data(run: Run) -> Tuple[pd.DataFrame, str]:
"""
Downloads the OESM national data from the BLS website.
The version is the sha256 of the downloaded zip file.
"""
url = "https://www.bls.gov/oes/special-requests/oesm23nat.zip"
logger.info(f"Downloading OESM data from {url}")
response = requests.get(url)
response.raise_for_status()
zip_content = response.content
version = hashlib.sha256(zip_content).hexdigest()
logger.info(f"OESM data version (sha256): {version}")
parquet_path = run.cache_dir / f"oesm_{version}.parquet"
if parquet_path.exists():
logger.info(f"Using cached OESM data: {parquet_path}")
return pd.read_parquet(parquet_path), version
logger.info(f"Creating new OESM data cache: {parquet_path}")
with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
# Find the excel file in the zip
excel_filename = None
for filename in z.namelist():
logger.debug(f"Found file in OESM zip: {filename}")
if filename.lower().endswith(".xlsx"):
excel_filename = filename
break
if excel_filename is None:
raise FileNotFoundError("Could not find the Excel file in the OESM zip archive.")
logger.info(f"Reading {excel_filename} from zip archive.")
with z.open(excel_filename) as f:
df = pd.read_excel(f, engine='openpyxl')
df.to_parquet(parquet_path)
logger.info(f"Saved OESM data to cache: {parquet_path}")
return df, version
def fetch_epoch_remote_data(run: Run) -> Tuple[pd.DataFrame, str]:
"""
Downloads the EPOCH AI remote work task data.
The version is the sha256 of the downloaded CSV file.
"""
# This is the direct download link constructed from the Google Drive share link
url = "https://drive.google.com/uc?export=download&id=1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r"
logger.info(f"Downloading EPOCH remote data from Google Drive: {url}")
# Need to handle potential cookies/redirects from Google Drive
session = requests.Session()
response = session.get(url, stream=True)
response.raise_for_status()
csv_content = response.content
version = hashlib.sha256(csv_content).hexdigest()
logger.info(f"EPOCH remote data version (sha256): {version}")
parquet_path = run.cache_dir / f"epoch_remote_{version}.parquet"
if parquet_path.exists():
logger.info(f"Using cached EPOCH remote data: {parquet_path}")
return pd.read_parquet(parquet_path), version
logger.info(f"Creating new EPOCH remote data cache: {parquet_path}")
df = pd.read_csv(io.BytesIO(csv_content))
df.to_parquet(parquet_path)
logger.info(f"Saved EPOCH remote data to cache: {parquet_path}")
return df, version
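All three fetchers above follow the same content-addressed caching pattern: hash the downloaded bytes, derive a cache filename from the hash, and reuse the cached artifact when it already exists. A minimal sketch of that pattern, with helper names of our own (cached_fetch, build, load are illustrative, not part of the pipeline):

import hashlib
from pathlib import Path
from typing import Callable, Tuple, TypeVar

T = TypeVar("T")

def cached_fetch(
    raw: bytes,
    cache_dir: Path,
    prefix: str,
    build: Callable[[bytes, Path], T],  # materialise the artifact at the cache path on a miss
    load: Callable[[Path], T],          # read the artifact back on a hit
) -> Tuple[T, str]:
    # The version string is simply the sha256 of the raw download.
    version = hashlib.sha256(raw).hexdigest()
    path = cache_dir / f"{prefix}_{version}"
    if path.exists():
        return load(path), version
    return build(raw, path), version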


@ -2,5 +2,5 @@ from ..run import Run
from pathlib import Path
from typing import Generator
def generate_estimate_histplot(run: Run, output_dir: Path) -> Generator[Path]:
def generate_estimate_histplot(run: Run) -> Generator[Path]:
raise NotImplementedError

24
pipeline/logger.py Normal file

@ -0,0 +1,24 @@
import logging
from logging.handlers import RotatingFileHandler
from rich.logging import RichHandler
LOGGER_NAME = "pipeline"
def setup_logging() -> logging.Logger:
# Set up Rich console handler
rich_handler = RichHandler(
level=logging.DEBUG,
show_time=True,
enable_link_path=True,
rich_tracebacks=True,
# omit_repeated_times=False,
)
logger = logging.getLogger(LOGGER_NAME)
logger.setLevel(logging.DEBUG)
logger.addHandler(rich_handler)
return logger
logger = setup_logging()
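Other modules in the package then import the already-configured instance rather than calling setup_logging() again; for example (illustrative):

from .logger import logger

logger.info("Fetch complete")
logger.debug("Parsed %d rows", 42)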


@ -16,6 +16,7 @@ class Metadata(BaseModel):
versions, and other important information.
"""
fetchers: Dict[str, Dict[str, Any]] = Field(default_factory=dict)
enrichments: Dict[str, Dict[str, Any]] = Field(default_factory=dict)
ts: str = Field(default_factory=lambda: datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
commit: str = Field(default_factory=lambda: _get_current_commit())


@ -1,6 +1,7 @@
from pydantic import BaseModel, Field
import sqlite3
import pandas as pd
from pathlib import Path
from typing import Optional
from .metadata import Metadata
@ -20,3 +21,6 @@ class Run(BaseModel):
task_estimates_df: Optional[pd.DataFrame] = None
meta: Metadata = Field(default_factory=Metadata)
cache_dir: Path
output_dir: Path


@ -5,11 +5,14 @@ from .postprocessors import check_for_insanity, create_df_tasks
from .generators import GENERATORS
from .run import Run
from .constants import GRAY
import platformdirs
import seaborn as sns
import matplotlib as mpl
from pathlib import Path
from typing import Optional
CACHE_DIR = platformdirs.user_cache_dir("econtai")
def run(output_dir: Optional[str] = None):
if output_dir is None:
output_dir = Path(".")
@ -17,12 +20,12 @@ def run(output_dir: Optional[str] = None):
load_dotenv()
_setup_graph_rendering()
current_run = Run()
current_run = Run(output_dir=output_dir, cache_dir=CACHE_DIR)
# Fetchers (fetchers.py)
current_run.onet_conn, current_run.onet_version = fetch_onet_database(current_run.meta)
current_run.oesm_df, current_run.oesm_version = fetch_oesm_data(current_run.meta)
current_run.epoch_df, current_run.epoch_version = fetch_epoch_remote_data(current_run.meta)
current_run.onet_conn, current_run.onet_version = fetch_onet_database(current_run)
current_run.oesm_df, current_run.oesm_version = fetch_oesm_data(current_run)
current_run.epoch_df, current_run.epoch_version = fetch_epoch_remote_data(current_run)
# Enrichments (enrichments.py)
current_run.task_estimateability_df = enrich_with_task_estimateability(current_run)
@ -34,7 +37,7 @@ def run(output_dir: Optional[str] = None):
# Generators (generators/)
for gen in GENERATORS:
gen(current_run, output_dir)
gen(current_run)
def _setup_graph_rendering():

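Assuming the package is importable as pipeline (its modules live under pipeline/), the reworked entry point is driven by a single call; a usage sketch:

from pipeline import run

# Figures and data land in ./out; downloads are cached under the platformdirs user cache dir.
run(output_dir="out")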

@ -6,9 +6,12 @@ readme = "README.md"
requires-python = ">=3.13"
dependencies = [
"matplotlib>=3.10.3",
"openpyxl>=3.1.5",
"pandas>=2.2.3",
"platformdirs>=4.3.8",
"pydantic>=2.11.7",
"python-dotenv>=1.1.1",
"requests>=2.32.4",
"seaborn>=0.13.2",
]

100
uv.lock generated

@ -11,6 +11,37 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload_time = "2024-05-20T21:33:24.1Z" },
]
[[package]]
name = "certifi"
version = "2025.6.15"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/73/f7/f14b46d4bcd21092d7d3ccef689615220d8a08fb25e564b65d20738e672e/certifi-2025.6.15.tar.gz", hash = "sha256:d747aa5a8b9bbbb1bb8c22bb13e22bd1f18e9796defa16bab421f7f7a317323b", size = 158753, upload_time = "2025-06-15T02:45:51.329Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/84/ae/320161bd181fc06471eed047ecce67b693fd7515b16d495d8932db763426/certifi-2025.6.15-py3-none-any.whl", hash = "sha256:2e0c7ce7cb5d8f8634ca55d2ba7e6ec2689a2fd6537d8dec1296a477a4910057", size = 157650, upload_time = "2025-06-15T02:45:49.977Z" },
]
[[package]]
name = "charset-normalizer"
version = "3.4.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/e4/33/89c2ced2b67d1c2a61c19c6751aa8902d46ce3dacb23600a283619f5a12d/charset_normalizer-3.4.2.tar.gz", hash = "sha256:5baececa9ecba31eff645232d59845c07aa030f0c81ee70184a90d35099a0e63", size = 126367, upload_time = "2025-05-02T08:34:42.01Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/ea/12/a93df3366ed32db1d907d7593a94f1fe6293903e3e92967bebd6950ed12c/charset_normalizer-3.4.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:926ca93accd5d36ccdabd803392ddc3e03e6d4cd1cf17deff3b989ab8e9dbcf0", size = 199622, upload_time = "2025-05-02T08:32:56.363Z" },
{ url = "https://files.pythonhosted.org/packages/04/93/bf204e6f344c39d9937d3c13c8cd5bbfc266472e51fc8c07cb7f64fcd2de/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eba9904b0f38a143592d9fc0e19e2df0fa2e41c3c3745554761c5f6447eedabf", size = 143435, upload_time = "2025-05-02T08:32:58.551Z" },
{ url = "https://files.pythonhosted.org/packages/22/2a/ea8a2095b0bafa6c5b5a55ffdc2f924455233ee7b91c69b7edfcc9e02284/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3fddb7e2c84ac87ac3a947cb4e66d143ca5863ef48e4a5ecb83bd48619e4634e", size = 153653, upload_time = "2025-05-02T08:33:00.342Z" },
{ url = "https://files.pythonhosted.org/packages/b6/57/1b090ff183d13cef485dfbe272e2fe57622a76694061353c59da52c9a659/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98f862da73774290f251b9df8d11161b6cf25b599a66baf087c1ffe340e9bfd1", size = 146231, upload_time = "2025-05-02T08:33:02.081Z" },
{ url = "https://files.pythonhosted.org/packages/e2/28/ffc026b26f441fc67bd21ab7f03b313ab3fe46714a14b516f931abe1a2d8/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c9379d65defcab82d07b2a9dfbfc2e95bc8fe0ebb1b176a3190230a3ef0e07c", size = 148243, upload_time = "2025-05-02T08:33:04.063Z" },
{ url = "https://files.pythonhosted.org/packages/c0/0f/9abe9bd191629c33e69e47c6ef45ef99773320e9ad8e9cb08b8ab4a8d4cb/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e635b87f01ebc977342e2697d05b56632f5f879a4f15955dfe8cef2448b51691", size = 150442, upload_time = "2025-05-02T08:33:06.418Z" },
{ url = "https://files.pythonhosted.org/packages/67/7c/a123bbcedca91d5916c056407f89a7f5e8fdfce12ba825d7d6b9954a1a3c/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1c95a1e2902a8b722868587c0e1184ad5c55631de5afc0eb96bc4b0d738092c0", size = 145147, upload_time = "2025-05-02T08:33:08.183Z" },
{ url = "https://files.pythonhosted.org/packages/ec/fe/1ac556fa4899d967b83e9893788e86b6af4d83e4726511eaaad035e36595/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ef8de666d6179b009dce7bcb2ad4c4a779f113f12caf8dc77f0162c29d20490b", size = 153057, upload_time = "2025-05-02T08:33:09.986Z" },
{ url = "https://files.pythonhosted.org/packages/2b/ff/acfc0b0a70b19e3e54febdd5301a98b72fa07635e56f24f60502e954c461/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:32fc0341d72e0f73f80acb0a2c94216bd704f4f0bce10aedea38f30502b271ff", size = 156454, upload_time = "2025-05-02T08:33:11.814Z" },
{ url = "https://files.pythonhosted.org/packages/92/08/95b458ce9c740d0645feb0e96cea1f5ec946ea9c580a94adfe0b617f3573/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:289200a18fa698949d2b39c671c2cc7a24d44096784e76614899a7ccf2574b7b", size = 154174, upload_time = "2025-05-02T08:33:13.707Z" },
{ url = "https://files.pythonhosted.org/packages/78/be/8392efc43487ac051eee6c36d5fbd63032d78f7728cb37aebcc98191f1ff/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4a476b06fbcf359ad25d34a057b7219281286ae2477cc5ff5e3f70a246971148", size = 149166, upload_time = "2025-05-02T08:33:15.458Z" },
{ url = "https://files.pythonhosted.org/packages/44/96/392abd49b094d30b91d9fbda6a69519e95802250b777841cf3bda8fe136c/charset_normalizer-3.4.2-cp313-cp313-win32.whl", hash = "sha256:aaeeb6a479c7667fbe1099af9617c83aaca22182d6cf8c53966491a0f1b7ffb7", size = 98064, upload_time = "2025-05-02T08:33:17.06Z" },
{ url = "https://files.pythonhosted.org/packages/e9/b0/0200da600134e001d91851ddc797809e2fe0ea72de90e09bec5a2fbdaccb/charset_normalizer-3.4.2-cp313-cp313-win_amd64.whl", hash = "sha256:aa6af9e7d59f9c12b33ae4e9450619cf2488e2bbe9b44030905877f0b2324980", size = 105641, upload_time = "2025-05-02T08:33:18.753Z" },
{ url = "https://files.pythonhosted.org/packages/20/94/c5790835a017658cbfabd07f3bfb549140c3ac458cfc196323996b10095a/charset_normalizer-3.4.2-py3-none-any.whl", hash = "sha256:7f56930ab0abd1c45cd15be65cc741c28b1c9a34876ce8c17a2fa107810c0af0", size = 52626, upload_time = "2025-05-02T08:34:40.053Z" },
]
[[package]]
name = "contourpy"
version = "1.3.2"
@ -51,6 +82,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30", size = 8321, upload_time = "2023-10-07T05:32:16.783Z" },
]
[[package]]
name = "et-xmlfile"
version = "2.0.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload_time = "2024-10-25T17:25:40.039Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload_time = "2024-10-25T17:25:39.051Z" },
]
[[package]]
name = "fonttools"
version = "4.58.5"
@ -68,6 +108,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d7/d4/1d85a1996b6188cd2713230e002d79a6f3a289bb17cef600cba385848b72/fonttools-4.58.5-py3-none-any.whl", hash = "sha256:e48a487ed24d9b611c5c4b25db1e50e69e9854ca2670e39a3486ffcd98863ec4", size = 1115318, upload_time = "2025-07-03T14:04:45.378Z" },
]
[[package]]
name = "idna"
version = "3.10"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490, upload_time = "2024-09-15T18:07:39.745Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload_time = "2024-09-15T18:07:37.964Z" },
]
[[package]]
name = "kiwisolver"
version = "1.4.8"
@ -163,6 +212,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/63/be/b85e4aa4bf42c6502851b971f1c326d583fcc68227385f92089cf50a7b45/numpy-2.2.5-cp313-cp313t-win_amd64.whl", hash = "sha256:d403c84991b5ad291d3809bace5e85f4bbf44a04bdc9a88ed2bb1807b3360bb8", size = 12750096, upload_time = "2025-04-19T22:47:00.147Z" },
]
[[package]]
name = "openpyxl"
version = "3.1.5"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "et-xmlfile" },
]
sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload_time = "2024-06-28T14:03:44.161Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload_time = "2024-06-28T14:03:41.161Z" },
]
[[package]]
name = "packaging"
version = "25.0"
@ -254,6 +315,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/89/c7/5572fa4a3f45740eaab6ae86fcdf7195b55beac1371ac8c619d880cfe948/pillow-11.3.0-cp314-cp314t-win_arm64.whl", hash = "sha256:79ea0d14d3ebad43ec77ad5272e6ff9bba5b679ef73375ea760261207fa8e0aa", size = 2512835, upload_time = "2025-07-01T09:15:50.399Z" },
]
[[package]]
name = "platformdirs"
version = "4.3.8"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/fe/8b/3c73abc9c759ecd3f1f7ceff6685840859e8070c4d947c93fae71f6a0bf2/platformdirs-4.3.8.tar.gz", hash = "sha256:3d512d96e16bcb959a814c9f348431070822a6496326a4be0911c40b5a74c2bc", size = 21362, upload_time = "2025-05-07T22:47:42.121Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/fe/39/979e8e21520d4e47a0bbe349e2713c0aac6f3d853d0e5b34d76206c439aa/platformdirs-4.3.8-py3-none-any.whl", hash = "sha256:ff7059bb7eb1179e2685604f4aaf157cfd9535242bd23742eadc3c13542139b4", size = 18567, upload_time = "2025-05-07T22:47:40.376Z" },
]
[[package]]
name = "pydantic"
version = "2.11.7"
@ -336,6 +406,21 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload_time = "2025-03-25T02:24:58.468Z" },
]
[[package]]
name = "requests"
version = "2.32.4"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "certifi" },
{ name = "charset-normalizer" },
{ name = "idna" },
{ name = "urllib3" },
]
sdist = { url = "https://files.pythonhosted.org/packages/e1/0a/929373653770d8a0d7ea76c37de6e41f11eb07559b103b1c02cafb3f7cf8/requests-2.32.4.tar.gz", hash = "sha256:27d0316682c8a29834d3264820024b62a36942083d52caf2f14c0591336d3422", size = 135258, upload_time = "2025-06-09T16:43:07.34Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/7c/e4/56027c4a6b4ae70ca9de302488c5ca95ad4a39e190093d6c1a8ace08341b/requests-2.32.4-py3-none-any.whl", hash = "sha256:27babd3cda2a6d50b30443204ee89830707d396671944c998b5975b031ac2b2c", size = 64847, upload_time = "2025-06-09T16:43:05.728Z" },
]
[[package]]
name = "seaborn"
version = "0.13.2"
@ -365,18 +450,24 @@ version = "0.1.0"
source = { virtual = "." }
dependencies = [
{ name = "matplotlib" },
{ name = "openpyxl" },
{ name = "pandas" },
{ name = "platformdirs" },
{ name = "pydantic" },
{ name = "python-dotenv" },
{ name = "requests" },
{ name = "seaborn" },
]
[package.metadata]
requires-dist = [
{ name = "matplotlib", specifier = ">=3.10.3" },
{ name = "openpyxl", specifier = ">=3.1.5" },
{ name = "pandas", specifier = ">=2.2.3" },
{ name = "platformdirs", specifier = ">=4.3.8" },
{ name = "pydantic", specifier = ">=2.11.7" },
{ name = "python-dotenv", specifier = ">=1.1.1" },
{ name = "requests", specifier = ">=2.32.4" },
{ name = "seaborn", specifier = ">=0.13.2" },
]
@ -412,3 +503,12 @@ sdist = { url = "https://files.pythonhosted.org/packages/95/32/1a225d6164441be76
wheels = [
{ url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload_time = "2025-03-23T13:54:41.845Z" },
]
[[package]]
name = "urllib3"
version = "2.5.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/15/22/9ee70a2574a4f4599c47dd506532914ce044817c7752a79b6a51286319bc/urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760", size = 393185, upload_time = "2025-06-18T14:07:41.644Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 129795, upload_time = "2025-06-18T14:07:40.39Z" },
]