Félix Dorn 2025-07-15 00:34:54 +02:00
parent 62296e1b69
commit 65dc648797
37 changed files with 1413 additions and 2433 deletions

pipeline/aggregate.py (new file)
@@ -0,0 +1,81 @@
from .utils import OCCUPATION_MAJOR_CODES
import pandas as pd
def create_task_summary_by_occupation_df(df_tasks: pd.DataFrame, oesm_df: pd.DataFrame) -> pd.DataFrame:
# --- OESM Wage Bill Calculation ---
df_oesm_with_bill = oesm_df.copy()
df_oesm_with_bill.rename(columns={'OCC_CODE': 'onetsoc_code'}, inplace=True)
# Convert key columns to numeric, handling potential errors
df_oesm_with_bill['TOT_EMP'] = pd.to_numeric(df_oesm_with_bill['TOT_EMP'], errors='coerce')
df_oesm_with_bill['A_MEAN'] = pd.to_numeric(df_oesm_with_bill['A_MEAN'], errors='coerce')
df_oesm_with_bill.dropna(subset=['TOT_EMP', 'A_MEAN', 'onetsoc_code'], inplace=True)
# Calculate the wage bill for each occupation
df_oesm_with_bill['wage_bill'] = df_oesm_with_bill['TOT_EMP'] * df_oesm_with_bill['A_MEAN']
oesm_lookup = df_oesm_with_bill.set_index('onetsoc_code')
summary_data = []
# Assuming df_tasks has an 'onetsoc_code' column with the full SOC code
unique_soc_codes = df_tasks['onetsoc_code'].unique()
for code in unique_soc_codes:
occ_df = df_tasks[df_tasks['onetsoc_code'] == code]
total_tasks_in_occ = len(occ_df)
not_remote_count = len(occ_df[occ_df['remote_status'] != 'remote'])
remote_df = occ_df[occ_df['remote_status'] == 'remote']
remote_estimable_count = len(remote_df[remote_df['estimable']])
remote_not_estimable_count = len(remote_df[~remote_df['estimable']])
try:
# O*NET codes (e.g., 11-1011.03) are more specific than OESM SOC codes (e.g., 11-1011).
# We strip the suffix from the O*NET code to find the corresponding wage data.
soc_code_for_lookup = code.split('.')[0]
wage_bill = oesm_lookup.loc[soc_code_for_lookup, 'wage_bill']
label = oesm_lookup.loc[soc_code_for_lookup, 'OCC_TITLE']
except KeyError:
wage_bill = 0
label = "Unknown"
summary_data.append({
'onetsoc_code': code,
'occupation_label': label,
'wage_bill': wage_bill,
'count_not_remote': not_remote_count,
'count_remote_estimable': remote_estimable_count,
'count_remote_not_estimable': remote_not_estimable_count,
'total_tasks': total_tasks_in_occ
})
return pd.DataFrame(summary_data)
def aggregate_task_summary_by_major_code(summary_df: pd.DataFrame) -> pd.DataFrame:
df_agg = summary_df.copy()
df_agg['onetsoc_major_code'] = df_agg['onetsoc_code'].str[:2]
aggregation = {
'wage_bill': 'sum',
'count_not_remote': 'sum',
'count_remote_estimable': 'sum',
'count_remote_not_estimable': 'sum',
'total_tasks': 'sum'
}
major_summary = df_agg.groupby('onetsoc_major_code').agg(aggregation).reset_index()
major_summary['occupation_label'] = major_summary['onetsoc_major_code'].map(OCCUPATION_MAJOR_CODES)
# Reorder columns to match original output format
major_summary = major_summary[[
'onetsoc_major_code',
'occupation_label',
'wage_bill',
'count_not_remote',
'count_remote_estimable',
'count_remote_not_estimable',
'total_tasks'
]]
return major_summary
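A minimal usage sketch of the two helpers above; the toy frames and all values in them are hypothetical, with column names mirroring what the functions expect:

import pandas as pd
from pipeline.aggregate import create_task_summary_by_occupation_df, aggregate_task_summary_by_major_code

# Hypothetical task rows: full O*NET-SOC codes plus remote/estimable flags.
df_tasks = pd.DataFrame({
    'onetsoc_code': ['11-1011.03', '11-1011.03', '15-1252.00'],
    'remote_status': ['remote', 'not remote', 'remote'],
    'estimable': [True, False, True],
})
# Hypothetical OESM wage rows, keyed by bare SOC codes.
oesm_df = pd.DataFrame({
    'OCC_CODE': ['11-1011', '15-1252'],
    'OCC_TITLE': ['Chief Executives', 'Software Developers'],
    'TOT_EMP': [200_000, 1_500_000],
    'A_MEAN': [250_000, 130_000],
})
summary = create_task_summary_by_occupation_df(df_tasks, oesm_df)  # one row per O*NET code
by_major = aggregate_task_summary_by_major_code(summary)           # one row per 2-digit major group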

pipeline/classification.py (new file)
@@ -0,0 +1,225 @@
from pathlib import Path
import pandas as pd
from .logger import logger
from .utils import enrich
import json
ALLOWED_UNITS = [
"minute",
"hour",
"day",
"week",
"month",
"trimester",
"semester",
"year",
]
ESTIMABLE_CLASSIFICATION_VERSION = "old_version"
TIME_ESTIMATES_GENERATION_VERSION = "old_version"
def classify_tasks_as_estimable(cache_dir: Path, df_to_process: pd.DataFrame, bust: bool = False) -> pd.DataFrame:
CACHE_PATH = cache_dir / f"task_estimability.{ESTIMABLE_CLASSIFICATION_VERSION}.parquet"
if CACHE_PATH.exists() and not bust:
logger.info(f"Loading cached task estimability from {CACHE_PATH}")
return pd.read_parquet(CACHE_PATH)
logger.info("Enriching tasks with estimability classification.")
df_unique_tasks = df_to_process.drop_duplicates(subset=['task']).copy()
logger.info(f"Found {len(df_unique_tasks)} unique remote tasks to classify.")
if df_unique_tasks.empty:
raise ValueError("No unique tasks to classify.")
results = enrich(
model="gpt-4.1-mini",
rpm=5000,
messages_to_process=[
[
{"role": "system", "content": """
Classify the provided O*NET task into one of these categories:
- ATOMIC (schedulable): A single, clearly-bounded activity, typically lasting minutes, hours, or a few days.
- ONGOING-CONSTRAINT (background role/ethical rule): A continuous responsibility or behavioural norm with no schedulable duration (e.g., follow confidentiality rules, serve as department head).
""".strip()},
{"role": "user", "content": f"Task: {row.task}"},
]
for row in df_unique_tasks.itertuples()
],
schema={
"name": "estimability_classification",
"schema": {
"type": "object",
"properties": {"task_category": {"type": "string", "enum": ["ATOMIC", "ONGOING-CONSTRAINT"]}},
"required": ["task_category"],
"additionalProperties": False
}
},
chunk_size=300,
)
if not results or len(results) != len(df_unique_tasks):
raise ValueError(f"Task estimability classification failed or returned mismatched number of results. Expected {len(df_unique_tasks)}, got {len(results) if results else 0}.")
classifications = []
for index, response in enumerate(results):
task_label = df_unique_tasks.iloc[index]['task']
task_category_flag = None
if response is None:
logger.warning(f"API call failed for task (enrich returned None): '{task_label}'")
else:
try:
content_str = response.choices[0].message.content
if not content_str:
raise ValueError("No content found in the response message")
data = json.loads(content_str)
if 'task_category' in data and isinstance(data['task_category'], str):
task_category_flag = data['task_category']
else:
logger.warning(f"Invalid or missing 'task_category' payload for task '{task_label}'. Data: '{data}'")
except (json.JSONDecodeError, AttributeError, KeyError, IndexError, ValueError) as e:
logger.warning(f"Could not parse response for task '{task_label}'. Error: {e}. Response: {response}")
classifications.append({
'task': task_label,
'estimable': task_category_flag == 'ATOMIC'
})
classification_df = pd.DataFrame(classifications)
logger.info(f"Finished classification. Got {classification_df['estimable'].notna().sum()} successful classifications out of {len(df_unique_tasks)} unique tasks.")
logger.info(f"Saving task estimability classifications to {CACHE_PATH}")
classification_df.to_parquet(CACHE_PATH)
return classification_df
def generate_time_estimates_for_tasks(cache_dir: Path, df_to_process: pd.DataFrame, bust: bool = False) -> pd.DataFrame:
CACHE_PATH = cache_dir / f"task_estimates.{TIME_ESTIMATES_GENERATION_VERSION}.parquet"
if CACHE_PATH.exists() and not bust:
logger.info(f"Loading cached task estimates from {CACHE_PATH}")
return pd.read_parquet(CACHE_PATH)
logger.info("Enriching tasks with time estimates.")
if df_to_process.empty:
raise ValueError("No tasks to process for estimates.")
results = enrich(
model="gpt-4.1-mini",
rpm=5000,
messages_to_process=[
[
{
"role": "system",
"content": """
You are an expert assistant evaluating the time required for job tasks. Your goal is to estimate the 'effective time' range needed for a skilled human to complete the following job task **remotely**, without supervision.
'Effective time' is the active, focused work duration required to complete the task. Crucially, **exclude all waiting periods, delays, or time spent on other unrelated activities**. Think of it as the continuous, productive time investment needed if the worker could pause and resume instantly without cost.
Provide a lower and upper bound estimate for the 'effective time'. These bounds should capture the time within which approximately 80% of instances of performing this specific task are typically completed by a qualified individual.
Base your estimate on the provided task and the associated occupation and occupation description. Your estimate must be in one of the allowed units: minute, hour, day, week, month, trimester, semester, year.""".strip()
},
{
"role": "user",
"content": f"{row.task} done by {row.occupation_title} ({row.occupation_description})"
}
]
for row in df_to_process.itertuples()
],
schema= {
"name": "estimate_time",
"strict": True,
"schema": {
"type": "object",
"properties": {
"lower_bound_estimate": {
"type": "object",
"properties": {
"quantity": {
"type": "number",
"description": "The numerical value for the lower bound of the estimate.",
},
"unit": {
"type": "string",
"enum": ALLOWED_UNITS,
"description": "The unit of time for the lower bound.",
},
},
"required": ["quantity", "unit"],
"additionalProperties": False,
},
"upper_bound_estimate": {
"type": "object",
"properties": {
"quantity": {
"type": "number",
"description": "The numerical value for the upper bound of the estimate.",
},
"unit": {
"type": "string",
"enum": ALLOWED_UNITS,
"description": "The unit of time for the upper bound.",
},
},
"required": ["quantity", "unit"],
"additionalProperties": False,
},
},
"required": ["lower_bound_estimate", "upper_bound_estimate"],
"additionalProperties": False,
},
},
chunk_size=200,
)
if not results or len(results) != len(df_to_process):
raise ValueError(f"API call for task estimates failed or returned mismatched number of results. "
f"Expected {len(df_to_process)}, got {len(results) if results else 0}.")
estimates = []
for index, response in enumerate(results):
row = df_to_process.iloc[index]
task_info = f"O*NET: {row.onetsoc_code}, Task ID: {row.task_id}"
lb_qty, lb_unit, ub_qty, ub_unit = None, None, None, None
if response is None:
logger.warning(f"API call failed for task (enrich returned None): {task_info}")
else:
try:
content_str = response.choices[0].message.content
if not content_str:
raise ValueError("No content found in the response message")
data = json.loads(content_str)
lb_qty = data['lower_bound_estimate']['quantity']
lb_unit = data['lower_bound_estimate']['unit']
ub_qty = data['upper_bound_estimate']['quantity']
ub_unit = data['upper_bound_estimate']['unit']
except Exception as e:
logger.warning(f"Could not parse valid estimate for task {task_info}. Error: {e}. Response: {response}")
lb_qty, lb_unit, ub_qty, ub_unit = None, None, None, None # Reset on failure
estimates.append({
'onetsoc_code': row.onetsoc_code,
'task_id': row.task_id,
'lb_estimate_qty': lb_qty,
'lb_estimate_unit': lb_unit,
'ub_estimate_qty': ub_qty,
'ub_estimate_unit': ub_unit
})
estimates_df = pd.DataFrame(estimates)
logger.info(f"Finished estimates. Got {estimates_df['lb_estimate_qty'].notna().sum()} successful estimates out of {len(df_to_process)} tasks.")
logger.info(f"Saving task estimates to {CACHE_PATH}")
estimates_df.to_parquet(CACHE_PATH)
return estimates_df
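A hedged sketch of how the two stages above chain together; the cache location and the one-row input frame are assumptions, but the column names match what the functions read:

from pathlib import Path
import pandas as pd
from pipeline.classification import classify_tasks_as_estimable, generate_time_estimates_for_tasks

cache_dir = Path("~/.cache/econtai").expanduser()  # hypothetical cache location
df_remote_tasks = pd.DataFrame({                   # hypothetical single-task input
    'task': ['Prepare monthly budget variance reports.'],
    'onetsoc_code': ['11-3031.00'],
    'task_id': [1],
    'occupation_title': ['Financial Managers'],
    'occupation_description': ['Plan and direct financial activities.'],
})
estimability_df = classify_tasks_as_estimable(cache_dir, df_remote_tasks)
merged = df_remote_tasks.merge(estimability_df, on='task', how='left')
estimable_only = merged[merged['estimable'].fillna(False)].copy()
estimates_df = generate_time_estimates_for_tasks(cache_dir, estimable_only)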

@@ -1,35 +0,0 @@
OCCUPATION_MAJOR_CODES = {
'11': 'Management',
'13': 'Business & Financial',
'15': 'Computer & Mathematical',
'17': 'Architecture & Engineering',
'19': 'Life, Physical, & Social Science',
'21': 'Community & Social Service',
'23': 'Legal',
'25': 'Education, Training, & Library',
'27': 'Arts, Design, & Media',
'29': 'Healthcare Practitioners',
'31': 'Healthcare Support',
'33': 'Protective Service',
'35': 'Food Preparation & Serving',
'37': 'Building & Grounds Maintenance',
'39': 'Personal Care & Service',
'41': 'Sales & Related',
'43': 'Office & Admin Support',
'45': 'Farming, Fishing, & Forestry',
'47': 'Construction & Extraction',
'49': 'Installation, Maintenance, & Repair',
'51': 'Production',
'53': 'Transportation & Material Moving',
'55': 'Military Specific',
}
GRAY = {'50':'#f8fafc','100':'#f1f5f9','200':'#e2e8f0',
'300':'#cbd5e1','400':'#94a3b8','500':'#64748b',
'600':'#475569','700':'#334155','800':'#1e293b',
'900':'#0f172a','950':'#020617'}
LIME = {'50': '#f7fee7','100': '#ecfcca','200': '#d8f999',
'300': '#bbf451','400': '#9ae600','500': '#83cd00',
'600': '#64a400','700': '#497d00','800': '#3c6300',
'900': '#35530e','950': '#192e03'}
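These major-group labels key on the first two digits of an O*NET-SOC code, which is how the aggregation and plotting code uses them; for example:

code = '15-1252.00'                 # Software Developers
OCCUPATION_MAJOR_CODES[code[:2]]    # 'Computer & Mathematical'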

@@ -1,97 +0,0 @@
"""
This module contains enrichment steps. They take time to run and are usually expensive
(API calls...); each step should manage its own state and only run when the data's
version differs from the one it last saved.
"""
from .run import Run
import pandas as pd
from typing import Any, List, Dict
import litellm
def enrich(
model: str,
rpm: int,
messages_to_process: List[List[Dict[str, str]]],
schema: Dict[str, Any],
chunk_size: int = 100,
):
# Use litellm.batch_completion
pass
def enrich_with_task_estimateability(run: Run) -> pd.DataFrame:
output_path = run.cache_dir / "computed_task_estimateability.parquet"
if output_path.exists():
print(f"Loading cached task estimateability from {output_path}")
return pd.read_parquet(output_path)
df_remote_tasks = run.df_tasks[run.df_tasks['remote_status'] == 'remote'].copy()
# In the old script, we only passed unique tasks to the API
df_unique_tasks = df_remote_tasks.drop_duplicates(subset=['task'])
results = enrich(
model="gpt-4.1-mini",
rpm=5000,
messages_to_process=[
[
{"role": "system", "content": """
Judge whether the provided O*NET task is suitable for a time estimate. If it is a single, clearly-bounded activity, typically lasting minutes, hours, or a few days, then clearly yes. If it is a continuous responsibility or behavioural norm with no schedulable duration (e.g., follow confidentiality rules, serve as department head), then clearly no.
"""},
{"role": "user", "content": f"Task: {row.task}"},
]
for row in df_unique_tasks.itertuples()
],
schema={
"type": "object",
"properties": {"estimateable": {"type": "bool"}},
"required": ["estimateable"]
},
chunk_size=300,
)
# Create a new dataframe with just enough information to identify the task uniquely + estimateability classification, save it, return it. Careful: the "task" column in itself is not unique.
return pd.DataFrame()
def enrich_with_task_estimates(run: Run) -> pd.DataFrame:
output_path = run.cache_dir / "computed_task_estimates.parquet"
if output_path.exists():
print(f"Loading cached task estimates from {output_path}")
return pd.read_parquet(output_path)
df = ... # todo
results = enrich(
model="gpt-4.1-mini",
rpm=5000,
messages_to_process=[
[
{"role": "system", "content": "Estimate the time required to complete the following O*NET task. Your estimate should be a plausible range for how long it might take a typical, qualified worker to perform this task once. Provide your answer as a time range (lower and upper bounds). Do not provide explanations or apologies. If the task is not suitable for a time estimate (e.g., it is an ongoing responsibility), interpret it as a single, schedulable action."},
{"role": "user", "content": f"""
Task: {row.task}
For Occupation: {row.occupation_title}
Occupation Description: {row.occupation_description}"""}
]
for row in df.itertuples()
],
schema={
"type": "object",
"properties": {
"lower_bound_estimate": {
"type": "object",
"properties": {"quantity": {"type": "number"}, "unit": {"type": "string", "enum": ["minutes", "hours", "days"]}},
"required": ["quantity", "unit"],
},
"upper_bound_estimate": {
"type": "object",
"properties": {"quantity": {"type": "number"}, "unit": {"type": "string", "enum": ["minutes", "hours", "days"]}},
"required": ["quantity", "unit"],
},
},
"required": ["lower_bound_estimate", "upper_bound_estimate"],
},
chunk_size=200,
)
# Create a new dataframe with just enough information to identify the task uniquely + the estimates classification, save it, return it. Careful: the "task" column in itself is not unique.
raise NotImplementedError

@@ -1,50 +1,30 @@
"""
Fetchers retrieve remote data and return it in a format suitable for further processing. They also return its version, which should be considered opaque, though it is usually a checksum.
"""
import sqlite3
from typing import Tuple
import pandas as pd
import requests
import io
import zipfile
from pipeline.run import Run
from pipeline.logger import logger
import yaml
from pathlib import Path
from .logger import logger
from typing import Tuple, Dict
def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
"""
Downloads the O*NET database, creates a local SQLite file from it, and returns a connection.
"""
version = "29_1"
url = f"https://www.onetcenter.org/dl_files/database/db_{version}_mysql.zip"
db_path = run.cache_dir / f"onet_{version}.db"
run.meta.fetchers['onet'] = {
'url': url,
'version': version,
'db_path': str(db_path),
}
ONET_VERSION = "29_1"
ONET_URL = f"https://www.onetcenter.org/dl_files/database/db_{ONET_VERSION}_mysql.zip"
if db_path.exists():
logger.info(f"Using cached O*NET database: {db_path}")
conn = sqlite3.connect(db_path)
return conn, version
def fetch_onet_database(cache_dir: Path) -> sqlite3.Connection:
DB_PATH = cache_dir / f"onet_{ONET_VERSION}.db"
logger.info(f"Downloading O*NET database from {url}")
response = requests.get(url, stream=True, headers={
if DB_PATH.exists():
logger.info(f"Using cached O*NET database: {DB_PATH}")
return sqlite3.connect(DB_PATH)
logger.info(f"Downloading O*NET database from {ONET_URL}")
response = requests.get(ONET_URL, stream=True, headers={
"User-Agent": "econ-agent/1.0"
})
response.raise_for_status()
# Read content into memory
zip_content = response.content
db_path = run.cache_dir / f"onet_{version}.db"
logger.info(f"Creating new O*NET database: {db_path}")
conn = sqlite3.connect(db_path)
# Set performance PRAGMAs for fast import
logger.info("Creating new SQLite database with performance settings")
conn = sqlite3.connect(DB_PATH)
conn.executescript("""
PRAGMA journal_mode = OFF;
PRAGMA synchronous = 0;
@@ -54,6 +34,7 @@ def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
PRAGMA foreign_keys = ON;
""")
zip_content = response.content
with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
sql_scripts = []
for filename in sorted(z.namelist()):
@@ -63,14 +44,10 @@ def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
if not sql_scripts:
raise RuntimeError("No SQL files found in the O*NET zip archive.")
# Combine and execute all SQL files in one transaction
full_script = "BEGIN TRANSACTION;\n" + "\n".join(sql_scripts) + "\nCOMMIT;"
logger.info("Executing SQL files in alphabetical order (single transaction mode)")
full_script = "BEGIN TRANSACTION;\n" + "\n".join(sql_scripts) + "\nCOMMIT;"
conn.executescript(full_script)
logger.info("Database populated successfully. Restoring reliability settings...")
# Restore reliability-focused settings after import
conn.executescript("""
PRAGMA journal_mode = WAL;
PRAGMA synchronous = NORMAL;
@@ -81,87 +58,75 @@ def fetch_onet_database(run: Run) -> Tuple[sqlite3.Connection, str]:
""")
conn.execute("VACUUM;")
conn.commit()
logger.info("Reliability settings restored and database optimized successfully!")
return conn, version
return conn
def fetch_oesm_data(run: Run) -> Tuple[pd.DataFrame, str]:
"""
Downloads the OESM national data from the BLS website.
"""
version = "23"
url = f"https://www.bls.gov/oes/special-requests/oesm{version}nat.zip"
parquet_path = run.cache_dir / "oesm.parquet"
run.meta.fetchers['oesm'] = {
'url': url,
'version': version,
'parquet_path': str(parquet_path),
}
def fetch_oesm_data(cache_dir: Path) -> pd.DataFrame:
VERSION = "23"
URL = f"https://www.bls.gov/oes/special-requests/oesm{VERSION}nat.zip"
DATA_PATH = cache_dir / "oesm.parquet"
if parquet_path.exists():
logger.info(f"Using cached OESM data: {parquet_path}")
return pd.read_parquet(parquet_path), version
if DATA_PATH.exists():
logger.info(f"Using cached OESM data: {DATA_PATH}")
return pd.read_parquet(DATA_PATH)
logger.info(f"Downloading OESM data from {url}")
logger.info(f"Downloading OESM data from {URL}")
headers = {'User-Agent': 'econ-agent/1.0'}
response = requests.get(url, headers=headers)
response = requests.get(URL, headers=headers)
response.raise_for_status()
zip_content = response.content
logger.info(f"OESM data version: {version}")
logger.info(f"Creating new OESM data cache: {parquet_path}")
logger.info(f"Creating new OESM data cache: {DATA_PATH}")
with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
# Find the excel file in the zip
excel_filename = None
for filename in z.namelist():
logger.debug(f"Found file in OESM zip: {filename}")
if filename.lower().endswith(".xlsx"):
excel_filename = filename
break
if excel_filename is None:
raise FileNotFoundError("Could not find the Excel file in the OESM zip archive.")
logger.info(f"Reading {excel_filename} from zip archive.")
with z.open(excel_filename) as f:
with z.open(f"oesm{VERSION}national.xlsx") as f:
df = pd.read_excel(f, engine='openpyxl', na_values=['*', '#'])
df.to_parquet(parquet_path)
logger.info(f"Saved OESM data to cache: {parquet_path}")
return df, version
df.to_parquet(DATA_PATH)
logger.info(f"Saved OESM data to cache: {DATA_PATH}")
return df
def fetch_epoch_remote_data(run: Run) -> Tuple[pd.DataFrame, str]:
"""
Downloads the EPOCH AI remote work task data.
"""
# This is the direct download link constructed from the Google Drive share link
version = "latest"
url = "https://drive.google.com/uc?export=download&id=1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r"
parquet_path = run.cache_dir / f"epoch_remote_{version}.parquet"
run.meta.fetchers['epoch_remote'] = {
'url': url,
'version': version,
'parquet_path': str(parquet_path),
}
def fetch_epoch_remote_data(cache_dir: Path) -> pd.DataFrame:
URL = "https://drive.google.com/uc?export=download&id=1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r"
DATA_PATH = cache_dir / f"epoch_remote_latest.parquet"
if parquet_path.exists():
logger.info(f"Using cached EPOCH remote data: {parquet_path}")
return pd.read_parquet(parquet_path), version
if DATA_PATH.exists():
logger.info(f"Using cached EPOCH remote data: {DATA_PATH}")
return pd.read_parquet(DATA_PATH)
logger.info(f"Downloading EPOCH remote data from Google Drive: {url}")
logger.info(f"Downloading EPOCH remote data from Google Drive: {URL}")
# Need to handle potential cookies/redirects from Google Drive
session = requests.Session()
session.headers.update({"User-Agent": "econ-agent/1.0"})
response = session.get(url, stream=True)
response = session.get(URL, stream=True)
response.raise_for_status()
csv_content = response.content
logger.info(f"Creating new EPOCH remote data cache: {parquet_path}")
logger.info(f"Creating new EPOCH remote data cache: {DATA_PATH}")
df = pd.read_csv(io.BytesIO(csv_content))
df.to_parquet(parquet_path)
logger.info(f"Saved EPOCH remote data to cache: {parquet_path}")
df.to_parquet(DATA_PATH)
return df, version
return df
def fetch_metr_data(cache_dir: Path) -> Dict:
URL = "https://metr.org/assets/benchmark_results.yaml"
DATA_PATH = cache_dir / "metr_benchmark_results.yaml"
if DATA_PATH.exists():
logger.info(f"Using cached METR data: {DATA_PATH}")
with open(DATA_PATH, "r") as f:
return yaml.safe_load(f)
logger.info(f"Downloading METR data from {URL}")
headers = {"User-Agent": "econ-agent/1.0"}
response = requests.get(URL, headers=headers)
response.raise_for_status()
yaml_content = response.content
logger.info(f"Creating new METR data cache: {DATA_PATH}")
with open(DATA_PATH, "wb") as f:
f.write(yaml_content)
return yaml.safe_load(yaml_content)
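The generators further down index metr_results as results → model → agents → percentile → estimate. A minimal illustration of the shape this code assumes (field names are taken from the accessors in this pipeline, not from METR's published schema; the model and agent names are hypothetical):

metr_results = {
    "results": {
        "some-model": {
            "release_date": "2024-03-01",
            "agents": {
                "default-agent": {
                    "p50_horizon_length": {"estimate": 25.0},  # minutes
                    "p80_horizon_length": {"estimate": 8.0},
                },
            },
        },
    },
}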

@@ -1,5 +1,15 @@
from .estimate_histplot import generate_estimate_histplot
from .estimates_spread_per_occupation import generate_estimate_spread_per_occupation
from .estimates_lower_vs_upper_scatter import generate_estimates_lower_vs_upper_scatter
from .sequential_coherence_cdf import plot_sequential_coherence_cdf
from .projected_automatable_wage_bill import generate_projected_automatable_wage_bill
from .projected_task_automation import generate_projected_task_automation_plot
GENERATORS = [
generate_estimate_histplot
generate_estimate_histplot,
generate_estimate_spread_per_occupation,
generate_estimates_lower_vs_upper_scatter,
#plot_sequential_coherence_cdf,
generate_projected_automatable_wage_bill,
generate_projected_task_automation_plot,
]

@@ -1,6 +1,32 @@
from ..run import Run
from pathlib import Path
from typing import Generator
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from ..utils import style_plot
def generate_estimate_histplot(run: Run) -> Generator[Path]:
raise NotImplementedError
def generate_estimate_histplot(output_dir: Path, df: pd.DataFrame, **kwargs) -> Generator[Path, None, None]:
"""
Generates a styled histogram of the distribution of midpoint time estimates.
"""
style_plot()
OUTPUT_PATH = output_dir / "estimate_distribution_histplot.png"
fig, ax = plt.subplots()
sns.histplot(
data=df,
x='estimate_midpoint',
log_scale=True,
ax=ax
)
ax.set_xlabel("Task Time (minutes, log scale)")
ax.set_ylabel("Number of Tasks")
ax.set_title("Distribution of Time Estimates for Atomic Tasks")
plt.tight_layout()
plt.savefig(OUTPUT_PATH)
plt.close(fig)
yield OUTPUT_PATH

@@ -0,0 +1,56 @@
from pathlib import Path
from typing import Generator
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from ..utils import OCCUPATION_MAJOR_CODES, style_plot
def generate_estimates_lower_vs_upper_scatter(output_dir: Path, df: pd.DataFrame, **kwargs) -> Generator[Path, None, None]:
"""
Generates a styled scatter plot of lower-bound vs upper-bound time estimates for tasks.
"""
style_plot()
OUTPUT_PATH = output_dir / "estimates_lower_vs_upper_scatter.png"
plot_df = df.copy()
# Replace onetsoc_major codes with their corresponding labels for the plot legend
plot_df['onetsoc_major'] = plot_df['onetsoc_major'].map(OCCUPATION_MAJOR_CODES)
fig, ax = plt.subplots(figsize=(12, 10))
sns.scatterplot(
data=plot_df,
x='lb_estimate_in_minutes',
y='ub_estimate_in_minutes',
alpha=0.3,
edgecolor=None,
hue="onetsoc_major",
ax=ax
)
# 45° reference line (y=x)
lims = (
min(df['lb_estimate_in_minutes'].min(), df['ub_estimate_in_minutes'].min()),
max(df['lb_estimate_in_minutes'].max(), df['ub_estimate_in_minutes'].max())
)
lims = (lims[0] * 0.9, lims[1] * 1.1)
ax.plot(lims, lims, color='black', linestyle='--', linewidth=1, zorder=0)
# Optional helper lines for ratios
for k in [2, 10, 100]:
ax.plot(lims, [k*l for l in lims],
linestyle=':', color='grey', linewidth=1, zorder=0)
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('Lower-bound (min, log scale)')
ax.set_ylabel('Upper-bound (min, log scale)')
ax.set_title('Lower vs Upper Estimates for All Tasks')
ax.legend(title="Occupation Major Group", bbox_to_anchor=(1.02, 1), loc='upper left')
plt.tight_layout()
plt.savefig(OUTPUT_PATH, bbox_inches='tight')
plt.close(fig)
yield OUTPUT_PATH

@@ -0,0 +1,39 @@
from pathlib import Path
from typing import Generator
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from ..utils import OCCUPATION_MAJOR_CODES, style_plot
def generate_estimate_spread_per_occupation(output_dir: Path, df: pd.DataFrame, **kwargs) -> Generator[Path, None, None]:
"""
Generates a styled boxplot of the estimate range spread per major occupation group.
"""
style_plot()
OUTPUT_PATH = output_dir / "estimates_spread_per_occupation.png"
fig, ax = plt.subplots(figsize=(10, 12))
sns.boxplot(
data=df,
x='onetsoc_major',
y='estimate_range',
showfliers=False,
ax=ax
)
ax.set_yscale('log')
ax.set_xlabel('Occupation')
ax.set_ylabel('Range (upper-lower, minutes)')
ax.set_title('Spread of time-range estimates per occupation')
# Get occupation labels from codes for x-axis ticks
labels = [OCCUPATION_MAJOR_CODES.get(code.get_text(), code.get_text()) for code in ax.get_xticklabels()]
ax.set_xticklabels(labels, rotation=60, ha='right')
plt.tight_layout()
plt.savefig(OUTPUT_PATH)
plt.close(fig)
yield OUTPUT_PATH

@@ -1,6 +0,0 @@
import pandas as pd
from typing import List
def must_have_columns(df: pd.DataFrame, columns: List[str]):
if not all(col in df.columns for col in columns):
raise ValueError(f"DataFrame is missing required columns: {columns}")

@@ -0,0 +1,229 @@
from pathlib import Path
from typing import Generator, Dict, Tuple, Optional
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
from scipy.stats import linregress
from datetime import datetime
from ..utils import style_plot, LIME
def _generate_wage_projection_data(
metr_results: Dict,
df_with_wages: pd.DataFrame,
percentile_key: str,
doubling_time_modifier: float,
) -> Optional[Tuple[pd.DataFrame, pd.DataFrame, float]]:
"""
Generates wage projection data for different AI progress scenarios.
Args:
metr_results: The METR benchmark data.
df_with_wages: DataFrame containing tasks with their estimated wage value.
percentile_key: The percentile to use from METR data (e.g., 'p50_horizon_length').
doubling_time_modifier: Multiplier for the doubling time (e.g., 1.0 for baseline,
0.5 for optimistic, 2.0 for pessimistic).
Returns:
A tuple of (metr_df, projection_df, doubling_time_days), or None if data is insufficient.
"""
all_model_data = []
for model_name, data in metr_results.get("results", {}).items():
for agent_name, agent_data in data.get("agents", {}).items():
release_date_str = data.get("release_date")
horizon = agent_data.get(percentile_key, {}).get("estimate")
if release_date_str and horizon is not None:
all_model_data.append({
"release_date": release_date_str,
"horizon_minutes": horizon,
})
if not all_model_data:
return None
metr_df = pd.DataFrame(all_model_data).sort_values("release_date").reset_index(drop=True)
metr_df['release_date'] = pd.to_datetime(metr_df['release_date'])
metr_df = metr_df[metr_df['horizon_minutes'] > 0].copy()
if len(metr_df) < 2:
return None
metr_df['days_since_start'] = (metr_df['release_date'] - metr_df['release_date'].min()).dt.days
log_y = np.log(metr_df['horizon_minutes'])
slope, intercept, r_value, _, _ = linregress(metr_df['days_since_start'], log_y)
# Apply the scenario modifier to the doubling time
base_doubling_time_days = np.log(2) / slope
modified_doubling_time_days = base_doubling_time_days * doubling_time_modifier
modified_slope = np.log(2) / modified_doubling_time_days
start_date = metr_df['release_date'].min()
future_dates = pd.to_datetime(pd.date_range(start=start_date, end="2035-01-01", freq="ME"))
future_days = (future_dates - start_date).days.to_numpy()
projected_log_horizon = intercept + modified_slope * future_days
projected_horizon_minutes = np.exp(projected_log_horizon)
projection_df = pd.DataFrame({
"date": future_dates,
"projected_coherence_minutes": projected_horizon_minutes,
})
# Calculate the total wage bill of tasks automated over time
for bound in ["lb", "mid", "ub"]:
col_name = 'estimate_midpoint' if bound == 'mid' else f'{bound}_estimate_in_minutes'
projection_df[f"automatable_wage_bill_{bound}"] = projection_df["projected_coherence_minutes"].apply(
lambda h: df_with_wages.loc[df_with_wages[col_name] <= h, 'wage_per_task'].sum()
)
# Also calculate for the actual METR data points for plotting
metr_df["automatable_wage_bill_mid"] = metr_df["horizon_minutes"].apply(
lambda h: df_with_wages.loc[df_with_wages['estimate_midpoint'] <= h, 'wage_per_task'].sum()
)
return metr_df, projection_df, modified_doubling_time_days
def _plot_scenario(ax, projection_df, metr_df, label, color, line_style='-'):
"""Helper function to draw a single projection scenario on a given axis."""
# Plot the projected wage bill
ax.plot(
projection_df["date"],
projection_df["automatable_wage_bill_mid"],
label=label,
color=color,
linewidth=2.5,
linestyle=line_style,
zorder=3
)
# Plot the shaded range for lower/upper bounds
ax.fill_between(
projection_df["date"],
projection_df["automatable_wage_bill_lb"],
projection_df["automatable_wage_bill_ub"],
color=color,
alpha=0.15,
zorder=2
)
# Plot the actual METR data points against the wage bill
ax.scatter(
metr_df['release_date'],
metr_df['automatable_wage_bill_mid'],
color=color,
edgecolor='black',
s=60,
zorder=4,
label=f"Model Capabilities (P50)"
)
def generate_projected_automatable_wage_bill(
output_dir: Path,
df: pd.DataFrame,
task_summary_by_occupation_df: pd.DataFrame,
metr_results: Dict,
**kwargs,
) -> Generator[Path, None, None]:
"""
Generates a plot projecting the automatable wage bill under different
AI progress scenarios (optimistic, baseline, pessimistic).
"""
style_plot()
OUTPUT_PATH = output_dir / "projected_automatable_wage_bill_sensitivity.png"
# 1. Calculate wage_per_task for each occupation
wage_bill_info = task_summary_by_occupation_df[['onetsoc_code', 'wage_bill', 'total_tasks']].copy()
wage_bill_info['wage_per_task'] = wage_bill_info['wage_bill'] / wage_bill_info['total_tasks']
wage_bill_info.replace([np.inf, -np.inf], 0, inplace=True) # Avoid division by zero issues
wage_bill_info.drop(columns=['wage_bill', 'total_tasks'], inplace=True)
# 2. Merge wage_per_task into the main task dataframe
df_with_wages = pd.merge(df, wage_bill_info, on='onetsoc_code', how='left')
df_with_wages['wage_per_task'] = df_with_wages['wage_per_task'].fillna(0)
# 3. Generate data for all three scenarios
scenarios = {
"Optimistic": {"modifier": 0.5, "color": "tab:green", "style": "--"},
"Baseline": {"modifier": 1.0, "color": LIME['600'], "style": "-"},
"Pessimistic": {"modifier": 2.0, "color": "tab:red", "style": ":"},
}
projection_results = {}
for name, config in scenarios.items():
result = _generate_wage_projection_data(metr_results, df_with_wages, 'p50_horizon_length', config['modifier'])
if result:
projection_results[name] = result
if not projection_results:
print("Warning: Could not generate any projection data. Skipping wage bill plot.")
return
# 4. Create the plot
fig, ax = plt.subplots(figsize=(14, 9))
# We only need to plot the scatter points once, let's use the baseline ones.
if "Baseline" in projection_results:
metr_df, _, _ = projection_results["Baseline"]
ax.scatter(
metr_df['release_date'],
metr_df['automatable_wage_bill_mid'],
color='black',
s=80,
zorder=5,
label=f"Model Capabilities (P50)"
)
legend_lines = []
for name, (metr_df, proj_df, doubling_time) in projection_results.items():
config = scenarios[name]
ax.plot(
proj_df["date"],
proj_df["automatable_wage_bill_mid"],
color=config['color'],
linestyle=config['style'],
linewidth=2.5,
zorder=3
)
ax.fill_between(
proj_df["date"],
proj_df["automatable_wage_bill_lb"],
proj_df["automatable_wage_bill_ub"],
color=config['color'],
alpha=0.15,
zorder=2
)
# Create a custom line for the legend
line = plt.Line2D([0], [0], color=config['color'], linestyle=config['style'], lw=2.5,
label=f'{name} (Doubling Time: {doubling_time:.0f} days)')
legend_lines.append(line)
# 5. Styling and annotations
ax.set_title("Projected Automatable Wage Bill (P50 Coherence)", fontsize=18, pad=20)
ax.set_xlabel("Year", fontsize=12)
ax.set_ylabel("Automatable Annual Wage Bill (Trillions of USD)", fontsize=12)
# Format Y-axis to show trillions
def trillions_formatter(x, pos):
return f'${x / 1e12:.1f}T'
ax.yaxis.set_major_formatter(mticker.FuncFormatter(trillions_formatter))
total_wage_bill = df_with_wages['wage_per_task'].sum()
ax.set_ylim(0, total_wage_bill * 1.05)
if "Baseline" in projection_results:
_, proj_df, _ = projection_results["Baseline"]
ax.set_xlim(datetime(2022, 1, 1), proj_df["date"].max())
# Create the legend from the custom lines and the scatter plot
scatter_legend = ax.get_legend_handles_labels()[0]
ax.legend(handles=legend_lines + scatter_legend, loc="upper left", fontsize=11)
ax.grid(True, which="both", linestyle="--", linewidth=0.5)
plt.tight_layout()
plt.savefig(OUTPUT_PATH)
plt.close(fig)
print(f"Generated sensitivity analysis plot: {OUTPUT_PATH}")
yield OUTPUT_PATH
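The log-linear fit in _generate_wage_projection_data reduces to a simple doubling-time calculation; a standalone sketch with illustrative numbers:

import numpy as np
from scipy.stats import linregress

# Synthetic horizons that double every 100 days.
days = np.array([0, 100, 200, 300])
horizon_minutes = np.array([1.0, 2.0, 4.0, 8.0])
slope, intercept, r_value, _, _ = linregress(days, np.log(horizon_minutes))
doubling_time_days = np.log(2) / slope   # ~100 days
# The scenario modifier rescales the doubling time, then converts back to a slope:
optimistic_slope = np.log(2) / (doubling_time_days * 0.5)  # twice as steep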

@@ -0,0 +1,168 @@
from pathlib import Path
from typing import Generator, Dict, Tuple
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import linregress
from datetime import datetime
from ..utils import style_plot, LIME
def _generate_projection_data(
metr_results: Dict,
df: pd.DataFrame,
percentile_key: str,
) -> Tuple[pd.DataFrame, pd.DataFrame] | None:
"""
Generates projection data for a given percentile key (e.g., 'p50_horizon_length').
Returns a tuple of (metr_df_with_pct, projection_df), or None if data is insufficient.
"""
# 1. Process METR data to get all model performance over time for the given percentile
all_model_data = []
for model_name, data in metr_results.get("results", {}).items():
for agent_name, agent_data in data.get("agents", {}).items():
release_date_str = data.get("release_date")
horizon = agent_data.get(percentile_key, {}).get("estimate")
if release_date_str and horizon is not None:
unique_model_name = f"{model_name}-{agent_name}"
all_model_data.append({
"model": unique_model_name,
"release_date": release_date_str,
"horizon_minutes": horizon,
})
if not all_model_data:
print(f"Warning: No models with {percentile_key} found in METR data. Skipping.")
return None
metr_df = pd.DataFrame(all_model_data).sort_values("release_date").reset_index(drop=True)
metr_df['release_date'] = pd.to_datetime(metr_df['release_date'])
# 2. Perform log-linear regression on coherence over time
metr_df = metr_df[metr_df['horizon_minutes'] > 0].copy()
if len(metr_df) < 2:
print(f"Warning: Not enough data points for regression for {percentile_key}. Skipping.")
return None
metr_df['days_since_start'] = (metr_df['release_date'] - metr_df['release_date'].min()).dt.days
log_y = np.log(metr_df['horizon_minutes'])
x = metr_df['days_since_start']
slope, intercept, r_value, _, _ = linregress(x, log_y)
doubling_time_days = np.log(2) / slope
print(f"METR all models {percentile_key} trend: R^2 = {r_value**2:.2f}, Doubling time = {doubling_time_days:.1f} days")
# 3. Project coherence into the future
start_date = metr_df['release_date'].min()
future_dates = pd.to_datetime(pd.date_range(start=start_date, end="2035-01-01", freq="ME"))
future_days = (future_dates - start_date).days.to_numpy()
projected_log_horizon = intercept + slope * future_days
projected_horizon_minutes = np.exp(projected_log_horizon)
projection_df = pd.DataFrame({
"date": future_dates,
"projected_coherence_minutes": projected_horizon_minutes,
})
# 4. Calculate the percentage of tasks automated over time based on our estimates
total_tasks = len(df)
if total_tasks == 0:
return None
for bound in ["lb", "mid", "ub"]:
col_name = 'estimate_midpoint' if bound == 'mid' else f'{bound}_estimate_in_minutes'
projection_df[f"pct_automatable_{bound}"] = projection_df["projected_coherence_minutes"].apply(
lambda h: (df[col_name] <= h).sum() / total_tasks * 100
)
metr_df["pct_automatable_mid"] = metr_df["horizon_minutes"].apply(
lambda h: (df['estimate_midpoint'] <= h).sum() / total_tasks * 100
)
return metr_df, projection_df
def _plot_projection(ax, projection_df, metr_df, label, color, line_style='-'):
"""Helper function to draw a single projection on a given axis."""
# Plot the projected automation percentage
ax.plot(
projection_df["date"],
projection_df["pct_automatable_mid"],
label=f"Mid-point",
color=color,
linewidth=2.5,
linestyle=line_style,
zorder=3
)
ax.fill_between(
projection_df["date"],
projection_df["pct_automatable_lb"],
projection_df["pct_automatable_ub"],
color=color,
alpha=0.15,
label=f"Lower/upper bound range",
zorder=2
)
# Plot the actual METR data points
ax.scatter(
metr_df['release_date'],
metr_df['pct_automatable_mid'],
color=color,
edgecolor='black',
s=60,
zorder=4,
label=f"Model with {label[1:]}% success rate"
)
def generate_projected_task_automation_plot(
output_dir: Path,
metr_results: Dict,
df: pd.DataFrame,
**kwargs,
) -> Generator[Path, None, None]:
"""
Generates plots projecting task automation based on METR's p50 and p80
coherence data.
"""
style_plot()
p50_data = _generate_projection_data(metr_results, df, 'p50_horizon_length')
p80_data = _generate_projection_data(metr_results, df, 'p80_horizon_length')
# Plot P50 alone
if p50_data:
p50_metr_df, p50_proj_df = p50_data
fig, ax = plt.subplots(figsize=(12, 8))
_plot_projection(ax, p50_proj_df, p50_metr_df, "P50", LIME['600'])
ax.set_title("How long before sequential coherence stops being a bottleneck?", fontsize=16, pad=20)
ax.set_xlabel("Year")
ax.set_ylabel("% of task automatable (50% success rate)")
ax.set_ylim(0, 100.5)
ax.set_xlim(datetime(2022, 1, 1), p50_proj_df["date"].max())
ax.grid(True, which="both", linestyle="--", linewidth=0.5)
ax.legend(loc="upper left")
plt.tight_layout()
output_path = output_dir / "projected_task_automation_p50.png"
plt.savefig(output_path)
plt.close(fig)
yield output_path
# Plot P80 alone
if p80_data:
p80_metr_df, p80_proj_df = p80_data
fig, ax = plt.subplots(figsize=(12, 8))
_plot_projection(ax, p80_proj_df, p80_metr_df, "P80", 'tab:cyan')
ax.set_title("Projected Task Automation (P80 AI Coherence)", fontsize=16, pad=20)
ax.set_xlabel("Year")
ax.set_ylabel("% of Estimable Economic Tasks Automatable")
ax.set_ylim(0, 100.5)
ax.set_xlim(datetime(2022, 1, 1), p80_proj_df["date"].max())
ax.grid(True, which="both", linestyle="--", linewidth=0.5)
ax.legend(loc="upper left")
plt.tight_layout()
output_path = output_dir / "projected_task_automation_p80.png"
plt.savefig(output_path)
plt.close(fig)
yield output_path

@@ -0,0 +1,54 @@
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from ..utils import LIME, style_plot
def plot_sequential_coherence_cdf(output_dir: Path, df: pd.DataFrame, **kwargs):
style_plot()
output_path = output_dir / "sequential_coherence_cdf.png"
def cdf(series):
"""Helper function to calculate CDF data."""
s = series.sort_values().reset_index(drop=True)
# Calculate cumulative percentage
return s.values, ((s.index + 1) / len(s)) * 100
# Calculate CDF for lower, upper, and midpoint estimates
x_lb, y_lb = cdf(df['lb_estimate_in_minutes'])
x_ub, y_ub = cdf(df['ub_estimate_in_minutes'])
x_mid, y_mid = cdf(df['estimate_midpoint'])
# Create the plot
fig, ax = plt.subplots(figsize=(12, 7))
# Plot the CDFs as step plots
ax.step(x_lb, y_lb, where='post', color=LIME['300'], linewidth=1.8, linestyle='--', zorder=2, label='Lower bound estimate')
ax.step(x_ub, y_ub, where='post', color=LIME['900'], linewidth=1.8, linestyle=':', zorder=3, label='Upper bound estimate')
ax.step(x_mid, y_mid, where='post', color=LIME['600'], linewidth=2.2, zorder=4, label='Mid-point')
# --- Styling and Annotations ---
ax.set_xscale('log')
ax.set_ylim(0, 100)
ax.yaxis.set_major_formatter(mtick.PercentFormatter(decimals=0))
# Set titles and labels using the standard axes methods
ax.set_title("% of Tasks With Sequential Coherence ≤ X")
ax.set_xlabel("Sequential Coherence (X)")
ax.set_ylabel("Cumulative Percentage of Tasks")
# Define custom x-axis ticks and labels for better readability
ticks = [1, 5, 10, 30, 60, 120, 240, 480, 1440, 2880, 10080, 43200, 129600, 259200, 525600]
ticklabels = ['1 min', '5 min', '10 min', '30 min', '1 hr', '2 hr', '4 hr', '8 hr', '1 day', '2 days',
'1 wk', '30 days', '90 days', '180 days', '1 yr']
ax.set_xticks(ticks)
ax.set_xticklabels(ticklabels, rotation=45, ha='right')
ax.legend(loc='lower right')
# --- Save and close ---
plt.tight_layout()
plt.savefig(output_path, bbox_inches='tight')
plt.close(fig)
yield output_path
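For intuition, the cdf helper's output on a toy series (illustrative values only):

import pandas as pd

s = pd.Series([30, 5, 60, 5]).sort_values().reset_index(drop=True)
values = s.values                          # [ 5,  5, 30, 60]
cum_pct = ((s.index + 1) / len(s)) * 100   # [25., 50., 75., 100.]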

@@ -1,41 +0,0 @@
"""
This module defines the Metadata model for the pipeline.
"""
from datetime import datetime
from pydantic import BaseModel, Field
from typing import Dict, Any
class Metadata(BaseModel):
"""
A Pydantic model for storing pipeline metadata.
This class is intended to be instantiated once and passed through the
pipeline. Each step in the pipeline can then add its own metadata.
This provides a centralized and structured way to track data provenance,
versions, and other important information.
"""
fetchers: Dict[str, Dict[str, Any]] = Field(default_factory=dict)
enrichments: Dict[str, Dict[str, Any]] = Field(default_factory=dict)
ts: str = Field(default_factory=lambda: datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
commit: str = Field(default_factory=lambda: _get_current_commit())
def _get_current_commit() -> str:
"""
Returns the current git commit hash, "unknown", or "errored" depending on why the commit could not be retrieved.
"""
import subprocess
try:
# Get the current commit hash
commit_hash = subprocess.check_output(
["git", "rev-parse", "HEAD"], stderr=subprocess.PIPE, text=True
).strip()
return commit_hash
except subprocess.CalledProcessError:
# If git command fails (e.g., not a git repository)
return "errored"
except FileNotFoundError:
# If git is not installed
return "unknown"

@@ -1,140 +0,0 @@
from .run import Run
from .logger import logger
import pandas as pd
import numpy as np
def check_for_insanity(run: Run) -> Run:
raise NotImplementedError
def create_df_tasks(run: Run) -> Run:
"""
Creates a dataframe of tasks from the O*NET database, and merges it with remote status data.
This replicates the logic from old/enrich_task_ratings.py and parts of old/analysis.py
The resulting dataframe, `run.df_tasks` will be used by the enrichment steps.
"""
logger.info("Creating tasks dataframe")
cache_path = run.cache_dir / f"onet_{run.onet_version}_tasks_with_remote_status.parquet"
if cache_path.exists():
logger.info(f"Loading cached tasks dataframe from {cache_path}")
run.df_tasks = pd.read_parquet(cache_path)
return run
query = """
SELECT
tr.onetsoc_code,
tr.task_id,
ts.task,
od.title AS occupation_title,
od.description AS occupation_description,
tr.scale_id,
tr.category,
tr.data_value,
dr.dwa_title
FROM
task_ratings tr
JOIN
task_statements ts ON tr.task_id = ts.task_id
JOIN
occupation_data od ON tr.onetsoc_code = od.onetsoc_code
LEFT JOIN
tasks_to_dwas td ON tr.onetsoc_code = td.onetsoc_code AND tr.task_id = td.task_id
LEFT JOIN
dwa_reference dr ON td.dwa_id = dr.dwa_id;
"""
df = pd.read_sql_query(query, run.onet_conn)
logger.info(f"Fetched {len(df)} records (including DWA info) from the database.")
# Separate ratings from DWAs
core_cols = [
"onetsoc_code", "task_id", "task", "occupation_title",
"occupation_description", "scale_id", "category", "data_value"
]
ratings_df = df[core_cols].drop_duplicates().reset_index(drop=True)
dwa_cols = ["onetsoc_code", "task_id", "dwa_title"]
dwas_df = df[dwa_cols].dropna(subset=["dwa_title"]).drop_duplicates().reset_index(drop=True)
# 1. Handle Frequency (FT)
logger.info("Processing Frequency data")
freq_df = ratings_df[ratings_df["scale_id"] == "FT"].copy()
if not freq_df.empty:
freq_pivot = freq_df.pivot_table(
index=["onetsoc_code", "task_id"],
columns="category",
values="data_value",
fill_value=0,
)
freq_pivot.columns = [f"frequency_category_{int(col)}" for col in freq_pivot.columns]
else:
idx = pd.MultiIndex(levels=[[], []], codes=[[], []], names=["onetsoc_code", "task_id"])
freq_pivot = pd.DataFrame(index=idx)
# 2. Handle Importance (IM, IJ)
logger.info("Processing Importance data")
imp_df = ratings_df[ratings_df["scale_id"].isin(["IM", "IJ"])].copy()
if not imp_df.empty:
imp_avg = imp_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index()
imp_avg.rename(columns={"data_value": "importance_average"}, inplace=True)
else:
imp_avg = pd.DataFrame(columns=["onetsoc_code", "task_id", "importance_average"])
# 3. Handle Relevance (RT)
logger.info("Processing Relevance data")
rel_df = ratings_df[ratings_df["scale_id"] == "RT"].copy()
if not rel_df.empty:
rel_avg = rel_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index()
rel_avg.rename(columns={"data_value": "relevance_average"}, inplace=True)
else:
rel_avg = pd.DataFrame(columns=["onetsoc_code", "task_id", "relevance_average"])
# 4. Process DWAs
logger.info("Processing DWA data")
if not dwas_df.empty:
dwas_grouped = dwas_df.groupby(["onetsoc_code", "task_id"])["dwa_title"].apply(list).reset_index()
dwas_grouped.rename(columns={"dwa_title": "dwas"}, inplace=True)
else:
dwas_grouped = None
# 5. Get Base Task/Occupation Info
logger.info("Extracting base task/occupation info")
base_cols = ["onetsoc_code", "task_id", "task", "occupation_title", "occupation_description"]
base_info = ratings_df[base_cols].drop_duplicates().set_index(["onetsoc_code", "task_id"])
# 6. Merge Processed ONET Data
logger.info("Merging processed ONET data")
final_df = base_info.merge(freq_pivot, left_index=True, right_index=True, how="left")
final_df = final_df.reset_index()
if not imp_avg.empty:
final_df = final_df.merge(imp_avg, on=["onetsoc_code", "task_id"], how="left")
else:
final_df["importance_average"] = np.nan
if not rel_avg.empty:
final_df = final_df.merge(rel_avg, on=["onetsoc_code", "task_id"], how="left")
else:
final_df["relevance_average"] = np.nan
if dwas_grouped is not None and not dwas_grouped.empty:
final_df = final_df.merge(dwas_grouped, on=["onetsoc_code", "task_id"], how="left")
if "dwas" in final_df.columns:
final_df["dwas"] = final_df["dwas"].apply(lambda x: x if isinstance(x, list) else [])
else:
final_df["dwas"] = [[] for _ in range(len(final_df))]
final_df = final_df.replace({np.nan: None})
# 7. Merge with EPOCH remote data
logger.info("Merging with EPOCH remote data")
final_df = pd.merge(final_df, run.epoch_df[['Task', 'Remote']], left_on='task', right_on='Task', how='left')
final_df = final_df.drop('Task', axis=1).rename(columns={'Remote': 'remote_status'})
logger.info(f"Created tasks dataframe with shape {final_df.shape}")
final_df.to_parquet(cache_path)
run.df_tasks = final_df
return run

@@ -1,27 +0,0 @@
from pydantic import BaseModel, Field
import sqlite3
import pandas as pd
from pathlib import Path
from typing import Optional
from .metadata import Metadata
class Run(BaseModel):
model_config = {"arbitrary_types_allowed": True}
# === FETCHERS ===
onet_conn: Optional[sqlite3.Connection] = None
onet_version: Optional[str] = None
oesm_df: Optional[pd.DataFrame] = None
oesm_version: Optional[str] = None
epoch_df: Optional[pd.DataFrame] = None
epoch_version: Optional[str] = None
# === ENRICHMENTS ===
task_estimateability_df: Optional[pd.DataFrame] = None
task_estimates_df: Optional[pd.DataFrame] = None
meta: Metadata = Field(default_factory=Metadata)
cache_dir: Path
output_dir: Path

@@ -1,74 +1,215 @@
import sqlite3
import os
from .logger import logger
import pandas as pd
from dotenv import load_dotenv
from .fetchers import fetch_oesm_data, fetch_epoch_remote_data, fetch_onet_database
from .enrichments import enrich_with_task_estimateability, enrich_with_task_estimates
from .postprocessors import check_for_insanity, create_df_tasks
from .fetchers import fetch_onet_database, fetch_oesm_data, fetch_epoch_remote_data, ONET_VERSION, fetch_metr_data
from .classification import classify_tasks_as_estimable, generate_time_estimates_for_tasks
from .generators import GENERATORS
from .run import Run
from .constants import GRAY
from .aggregate import create_task_summary_by_occupation_df, aggregate_task_summary_by_major_code
from .utils import convert_to_minutes
import argparse
import platformdirs
import seaborn as sns
import matplotlib as mpl
import numpy as np
from pathlib import Path
from typing import Optional
CACHE_DIR = platformdirs.user_cache_dir("econtai")
def run(output_dir: Path | Optional[str] = None):
load_dotenv()
_setup_graph_rendering()
if output_dir is None:
output_dir = Path("dist/")
elif isinstance(output_dir, str):
output_dir = Path(output_dir).resolve()
output_dir.mkdir(parents=True, exist_ok=True)
current_run = Run(output_dir=output_dir, cache_dir=Path(CACHE_DIR).resolve())
current_run.cache_dir.mkdir(parents=True, exist_ok=True)
# Fetchers (fetchers.py)
current_run.onet_conn, current_run.onet_version = fetch_onet_database(current_run)
current_run.oesm_df, current_run.oesm_version = fetch_oesm_data(current_run)
current_run.epoch_df, current_run.epoch_version = fetch_epoch_remote_data(current_run)
current_run = create_df_tasks(current_run)
# Enrichments (enrichments.py)
current_run.task_estimateability_df = enrich_with_task_estimateability(current_run)
current_run.task_estimates_df = enrich_with_task_estimates(current_run)
# Postprocessors (postprocessors.py)
check_for_insanity(current_run)
# Generators (generators/)
for gen in GENERATORS:
gen(current_run)
def _setup_graph_rendering():
mpl.rcParams.update({
'figure.facecolor' : GRAY['50'],
'axes.facecolor' : GRAY['50'],
'axes.edgecolor' : GRAY['100'],
'axes.labelcolor' : GRAY['700'],
'xtick.color' : GRAY['700'],
'ytick.color' : GRAY['700'],
'font.family' : 'Inter',
'font.size' : 11,
})
class Runner:
onet_conn: sqlite3.Connection
oesm_df: pd.DataFrame
epoch_df: pd.DataFrame
metr_results: dict
def __init__(self, output_dir: Path | str, debug: bool, bust_estimability: bool, bust_estimates: bool):
if isinstance(output_dir, str):
output_dir = Path(output_dir).resolve()
sns.set_style("white")
output_dir.mkdir(parents=True, exist_ok=True)
self.output_dir = output_dir
self.intermediate_dir = self.output_dir / "intermediate"
self.intermediate_dir.mkdir(parents=True, exist_ok=True)
self.cache_dir = platformdirs.user_cache_path("econtai")
self.debug = debug
self.bust_estimability = bust_estimability
self.bust_estimates = bust_estimates
def main():
parser = argparse.ArgumentParser(description="Run the econtai pipeline.")
parser.add_argument("--output-dir", type=str, help="The directory to write output files to.")
args = parser.parse_args()
run(output_dir=args.output_dir)
if debug:
os.environ["LITELLM_LOG"] = os.environ.get("LITELLM_LOG", "INFO")
def run(self):
load_dotenv()
self.onet_conn = fetch_onet_database(self.cache_dir)
self.oesm_df = fetch_oesm_data(self.cache_dir)
self.epoch_df = fetch_epoch_remote_data(self.cache_dir)
self.metr_results = fetch_metr_data(self.cache_dir)
self.df_tasks = self._create_df_tasks()
self.df_tasks['onetsoc_major'] = self.df_tasks['onetsoc_code'].str[:2]
df_to_process = self.df_tasks[
(self.df_tasks['importance_average'] > 3) &
(self.df_tasks['remote_status'] == 'remote')
].copy()
if self.debug:
df_to_process = df_to_process.head(10)
task_estimability_df = classify_tasks_as_estimable(self.cache_dir, df_to_process, bust=self.bust_estimability)
self.df_tasks = pd.merge(self.df_tasks, task_estimability_df, on='task', how='left')
self.df_tasks['estimable'] = self.df_tasks['estimable'].fillna(False)
self.df_tasks.to_parquet(self.intermediate_dir / "df_tasks.parquet")
df_to_process = pd.merge(df_to_process, task_estimability_df, on='task', how='left')
df_to_process['estimable'] = df_to_process['estimable'].fillna(False)
df_to_process = df_to_process[df_to_process['estimable']].copy()
task_estimates_df = generate_time_estimates_for_tasks(self.cache_dir, df_to_process, bust=self.bust_estimates)
df = pd.merge(df_to_process, task_estimates_df, on=['onetsoc_code', 'task_id'], how='left')
df['lb_estimate_in_minutes'] = df.apply(lambda row: convert_to_minutes(row['lb_estimate_qty'], row['lb_estimate_unit']), axis=1)
df['ub_estimate_in_minutes'] = df.apply(lambda row: convert_to_minutes(row['ub_estimate_qty'], row['ub_estimate_unit']), axis=1)
df['estimate_range'] = df.ub_estimate_in_minutes - df.lb_estimate_in_minutes
df['estimate_ratio'] = np.divide(df.ub_estimate_in_minutes, df.lb_estimate_in_minutes).replace([np.inf, -np.inf], None)
df['estimate_midpoint'] = (df.lb_estimate_in_minutes + df.ub_estimate_in_minutes) / 2
df.to_parquet(self.intermediate_dir / "estimable_tasks_with_estimates.parquet")
self.task_summary_by_occupation_df = create_task_summary_by_occupation_df(self.df_tasks, self.oesm_df)
self.task_summary_by_occupation_df.to_parquet(self.intermediate_dir / "task_summary_by_occupation.parquet")
self.task_summary_by_major_occupation_df = aggregate_task_summary_by_major_code(self.task_summary_by_occupation_df)
self.task_summary_by_major_occupation_df.to_parquet(self.intermediate_dir / "task_summary_by_major_occupation.parquet")
self._check_for_insanity(df)
for gen in GENERATORS:
for asset in gen(**{
"output_dir": self.output_dir,
"runner": self,
"df": df,
"task_summary_by_occupation_df": self.task_summary_by_occupation_df,
"task_summary_by_major_occupation_df": self.task_summary_by_major_occupation_df,
"df_tasks": self.df_tasks,
"oesm_df": self.oesm_df,
"metr_results": self.metr_results,
}):
logger.info(f"New asset: {asset}")
def _create_df_tasks(self) -> pd.DataFrame:
DATA_PATH = self.cache_dir / f"onet_{ONET_VERSION}_tasks_with_remote_status.parquet"
if DATA_PATH.exists():
logger.info(f"Loading cached tasks dataframe from {DATA_PATH}")
return pd.read_parquet(DATA_PATH)
logger.info("Creating tasks dataframe")
query = """
SELECT
tr.onetsoc_code,
tr.task_id,
ts.task,
od.title AS occupation_title,
od.description AS occupation_description,
tr.scale_id,
tr.category,
tr.data_value
FROM
task_ratings tr
JOIN
task_statements ts ON tr.task_id = ts.task_id
JOIN
occupation_data od ON tr.onetsoc_code = od.onetsoc_code;
"""
ratings_df = pd.read_sql_query(query, self.onet_conn)
logger.info(f"Fetched {len(ratings_df)} task rating records from the database.")
# 1. Handle Frequency (FT)
logger.info("Processing Frequency data")
freq_df = ratings_df[ratings_df["scale_id"] == "FT"].copy()
if not freq_df.empty:
freq_pivot = freq_df.pivot_table(
index=["onetsoc_code", "task_id"],
columns="category",
values="data_value",
fill_value=0,
)
freq_pivot.columns = [f"frequency_category_{int(col)}" for col in freq_pivot.columns]
else:
raise ValueError("No frequency data.")
# 2. Handle Importance (IM, IJ)
logger.info("Processing Importance data")
imp_df = ratings_df[ratings_df["scale_id"].isin(["IM", "IJ"])].copy()
if not imp_df.empty:
imp_avg = imp_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index()
imp_avg.rename(columns={"data_value": "importance_average"}, inplace=True)
else:
raise ValueError("No importance data.")
# 3. Handle Relevance (RT)
logger.info("Processing Relevance data")
rel_df = ratings_df[ratings_df["scale_id"] == "RT"].copy()
if not rel_df.empty:
rel_avg = rel_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index()
rel_avg.rename(columns={"data_value": "relevance_average"}, inplace=True)
else:
raise ValueError("No relevance data.")
# 4. Get Base Task/Occupation Info
logger.info("Extracting base task/occupation info")
base_cols = ["onetsoc_code", "task_id", "task", "occupation_title", "occupation_description"]
base_info = ratings_df[base_cols].drop_duplicates().set_index(["onetsoc_code", "task_id"])
# 5. Merge Processed ONET Data
logger.info("Merging processed ONET data")
final_df = base_info.merge(freq_pivot, left_index=True, right_index=True, how="left")
final_df = final_df.reset_index()
if not imp_avg.empty:
final_df = final_df.merge(imp_avg, on=["onetsoc_code", "task_id"], how="left")
else:
final_df["importance_average"] = np.nan
if not rel_avg.empty:
final_df = final_df.merge(rel_avg, on=["onetsoc_code", "task_id"], how="left")
else:
final_df["relevance_average"] = np.nan
final_df = final_df.replace({np.nan: None})
# 6. Merge with EPOCH remote data
logger.info("Merging with EPOCH remote data")
final_df = pd.merge(final_df, self.epoch_df[['Task', 'Remote']], left_on='task', right_on='Task', how='left')
final_df = final_df.drop('Task', axis=1).rename(columns={'Remote': 'remote_status'})
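# Note: this left join matches on exact task text, so tasks absent from the
# EPOCH data (or with differing wording) get remote_status = NaN rather than erroring.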
logger.info(f"Created tasks dataframe with shape {final_df.shape}")
final_df.to_parquet(DATA_PATH)
return final_df
def _check_for_insanity(self, df: pd.DataFrame):
if df['lb_estimate_in_minutes'].isnull().any():
missing_count = df['lb_estimate_in_minutes'].isnull().sum()
raise ValueError(f"Found {missing_count} atomic tasks with missing 'lb_estimate_in_minutes'.")
if df['ub_estimate_in_minutes'].isnull().any():
missing_count = df['ub_estimate_in_minutes'].isnull().sum()
raise ValueError(f"Found {missing_count} atomic tasks with missing 'ub_estimate_in_minutes'.")
valid_estimates = df.dropna(subset=['lb_estimate_in_minutes', 'ub_estimate_in_minutes'])
impossible_bounds = valid_estimates[
(valid_estimates['lb_estimate_in_minutes'] <= 0) |
(valid_estimates['ub_estimate_in_minutes'] <= 0) |
(valid_estimates['lb_estimate_in_minutes'] > valid_estimates['ub_estimate_in_minutes'])
]
if not impossible_bounds.empty:
raise ValueError(f"Found {len(impossible_bounds)} rows with impossible bounds (e.g., lb > ub or value <= 0).")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run the econtai pipeline.")
parser.add_argument("--output-dir", type=str, default="dist/", help="The directory to write output files to.")
parser.add_argument("--bust-estimability", action="store_true", help="Bust the saved task estimability classification (EXPENSIVE)")
parser.add_argument("--bust-estimates", action="store_true", help="Bust the tasks estimates (EXPENSIVE)")
parser.add_argument("--debug", action="store_true", help="Enable debug mode (e.g., process fewer tasks).")
args = parser.parse_args()
Runner(output_dir=args.output_dir, debug=args.debug, bust_estimability=args.bust_estimability, bust_estimates=args.bust_estimates).run()
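# Example invocation (module path is an assumption; adjust to this file's location):
#   python -m pipeline --output-dir dist/ --debug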

222
pipeline/utils.py Normal file
View file

@ -0,0 +1,222 @@
import subprocess
import matplotlib.colors as mcolors
import matplotlib as mpl
import seaborn as sns
import tempfile
import litellm
import time
import math
from tqdm import tqdm
from typing import Any, List, Dict
from .logger import logger
OCCUPATION_MAJOR_CODES = {
'11': 'Management',
'13': 'Business & Financial',
'15': 'Computer & Mathematical',
'17': 'Architecture & Engineering',
'19': 'Life, Physical, & Social Science',
'21': 'Community & Social Service',
'23': 'Legal',
'25': 'Education, Training, & Library',
'27': 'Arts, Design, & Media',
'29': 'Healthcare Practitioners',
'31': 'Healthcare Support',
'33': 'Protective Service',
'35': 'Food Preparation & Serving',
'37': 'Building & Grounds Maintenance',
'39': 'Personal Care & Service',
'41': 'Sales & Related',
'43': 'Office & Admin Support',
'45': 'Farming, Fishing, & Forestry',
'47': 'Construction & Extraction',
'49': 'Installation, Maintenance, & Repair',
'51': 'Production',
'53': 'Transportation & Material Moving',
'55': 'Military Specific',
}
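# Example: OCCUPATION_MAJOR_CODES['15'] -> 'Computer & Mathematical'; keys are the
# first two digits of an O*NET-SOC code such as '15-1252.00'.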
GRAY = {'50':'#f8fafc','100':'#f1f5f9','200':'#e2e8f0',
'300':'#cbd5e1','400':'#94a3b8','500':'#64748b',
'600':'#475569','700':'#334155','800':'#1e293b',
'900':'#0f172a','950':'#020617'}
LIME = {'50': '#f7fee7','100': '#ecfcca','200': '#d8f999',
'300': '#bbf451','400': '#9ae600','500': '#83cd00',
'600': '#64a400','700': '#497d00','800': '#3c6300',
'900': '#35530e','950': '#192e03'}
def convert_to_minutes(qty, unit):
"""Converts a quantity in a given unit to minutes."""
return qty * {
"minute": 1,
"hour": 60,
"day": 60 * 24,
"week": 60 * 24 * 7,
"month": 60 * 24 * 30,
"trimester": 60 * 24 * 90,
"semester": 60 * 24 * 180,
"year": 60 * 24 * 365,
}[unit]
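# Example: convert_to_minutes(2, "hour") -> 120. Longer units use calendar
# approximations (month = 30 days, trimester = 90, semester = 180, year = 365).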
def pretty_display(df):
print(df)
# Early return: the browser-based HTML preview below is currently disabled.
return
html_output = df.to_html(index=False)
# Create a temporary HTML file; delete=False keeps it on disk so the browser
# can still read it after the with-block closes the handle.
with tempfile.NamedTemporaryFile(mode='w', suffix=".html", encoding="utf-8", delete=False) as temp_file:
temp_file.write(html_output)
temp_file_path = temp_file.name
subprocess.run(["/home/felix/.nix-profile/bin/firefox-devedition", "-p", "Work (YouthAI)", temp_file_path], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
input("Press Enter to continue after reviewing the HTML output...")
def enrich(
model: str,
rpm: int, # Requests per minute
messages_to_process: List[List[Dict[str, str]]],
schema: Dict[str, Any],
chunk_size: int = 100,
):
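"""Enrich a list of chat message threads with litellm in rate-limited batches.
Returns a list the same length as messages_to_process, holding the raw litellm
response per message, or None where the call failed."""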
all_results = []
num_messages = len(messages_to_process)
if num_messages == 0:
return all_results
num_chunks = math.ceil(num_messages / chunk_size)
logger.info(f"Starting enrichment for {num_messages} messages, in {num_chunks} chunks of up to {chunk_size} each.")
# Calculate the time that should be allocated per request to respect the RPM limit.
time_per_request = 60.0 / rpm if rpm > 0 else 0
for i in tqdm(range(num_chunks), desc="Enriching data in chunks"):
chunk_start_time = time.time()
start_index = i * chunk_size
end_index = start_index + chunk_size
message_chunk = messages_to_process[start_index:end_index]
if not message_chunk:
continue
try:
# Send requests for the entire chunk in a batch for better performance.
responses = litellm.batch_completion(
model=model,
messages=message_chunk,
response_format={
"type": "json_schema",
"json_schema": schema,
},
)
# batch_completion returns the response or an exception object for each message.
# We'll replace exceptions with None as expected by the calling functions.
for response in responses:
if isinstance(response, Exception):
logger.error(f"API call within batch failed: {response}")
all_results.append(None)
else:
all_results.append(response)
except Exception as e:
# This catches catastrophic failures in batch_completion itself (e.g., auth)
logger.error(f"litellm.batch_completion call failed for chunk {i+1}/{num_chunks}: {e}")
all_results.extend([None] * len(message_chunk))
chunk_end_time = time.time()
elapsed_time = chunk_end_time - chunk_start_time
# To enforce the rate limit, we calculate how long the chunk *should* have taken
# and sleep for the remainder of that time.
if time_per_request > 0:
expected_duration_for_chunk = len(message_chunk) * time_per_request
if elapsed_time < expected_duration_for_chunk:
sleep_duration = expected_duration_for_chunk - elapsed_time
logger.debug(f"Chunk processed in {elapsed_time:.2f}s. Sleeping for {sleep_duration:.2f}s to respect RPM.")
time.sleep(sleep_duration)
return all_results
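# Illustrative usage (the model name and schema shape are assumptions, not
# pinned by this module):
#   schema = {"name": "estimability", "schema": {"type": "object", "properties": {...}}}
#   results = enrich(model="gpt-4o-mini", rpm=60,
#                    messages_to_process=[[{"role": "user", "content": "..."}]],
#                    schema=schema)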
def get_contrasting_text_color(bg_color_hex_or_rgba):
if isinstance(bg_color_hex_or_rgba, str):
rgba = mcolors.to_rgba(bg_color_hex_or_rgba)
else:
rgba = bg_color_hex_or_rgba
r, g, b, _ = rgba
luminance = 0.2126 * r + 0.7152 * g + 0.0722 * b
return 'black' if luminance > 0.55 else 'white'
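# Example: get_contrasting_text_color(LIME['400']) -> 'black' (luminance ~0.77);
# get_contrasting_text_color(GRAY['800']) -> 'white' (luminance ~0.16).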
def style_plot():
"""
Applies a consistent and professional style to all plots.
This function sets matplotlib's rcParams for a global effect.
"""
mpl.rcParams.update({
'figure.facecolor': GRAY['50'],
'figure.edgecolor': 'none',
'figure.figsize': (12, 8),
'figure.dpi': 150,
'axes.facecolor': GRAY['50'],
'axes.edgecolor': GRAY['300'],
'axes.grid': True,
'axes.labelcolor': GRAY['800'],
'axes.titlecolor': GRAY['900'],
'axes.titlesize': 18,
'axes.titleweight': 'bold',
'axes.titlepad': 20,
'axes.labelsize': 14,
'axes.labelweight': 'semibold',
'axes.labelpad': 10,
'axes.spines.top': False,
'axes.spines.right': False,
'axes.spines.left': True,
'axes.spines.bottom': True,
'text.color': GRAY['700'],
'xtick.color': GRAY['600'],
'ytick.color': GRAY['600'],
'xtick.labelsize': 12,
'ytick.labelsize': 12,
'xtick.major.size': 0,
'ytick.major.size': 0,
'xtick.minor.size': 0,
'ytick.minor.size': 0,
'xtick.major.pad': 8,
'ytick.major.pad': 8,
'grid.color': GRAY['200'],
'grid.linestyle': '--',
'grid.linewidth': 1,
'legend.frameon': False,
'legend.fontsize': 12,
'legend.title_fontsize': 14,
'legend.facecolor': 'inherit',
'font.family': 'sans-serif',
'font.sans-serif': ['Inter'],
'font.weight': 'normal',
'lines.linewidth': 2,
'lines.markersize': 6,
})
# Seaborn specific styles
# Use shades of LIME as the primary color palette.
# Sorting by integer value of keys, and reversed to have darker shades first.
# Keep the mid-range shades ('200'-'600'): the lightest are invisible on the
# light background, and the darkest are dropped to keep the palette even.
lime_palette = [LIME[k] for k in sorted(LIME.keys(), key=int, reverse=True) if k not in ['50', '100', '700', '800', '900', '950',]]
sns.set_palette(lime_palette)
sns.set_style("whitegrid", {
'axes.edgecolor': GRAY['300'],
'grid.color': GRAY['200'],
'grid.linestyle': '--',
})
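# Typical usage (sketch; plt import shown for illustration): call style_plot()
# once at startup, before creating any figures, so matplotlib and seaborn both
# pick up the rcParams and palette:
#   import matplotlib.pyplot as plt
#   style_plot()
#   fig, ax = plt.subplots()
#   sns.barplot(data=df, x="occupation_label", y="wage_bill", ax=ax)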