Félix Dorn 2025-07-15 00:41:05 +02:00
parent 720f21a85b
commit 43076bcbb1
42 changed files with 237415 additions and 7831 deletions

0
analysis/__init__.py Normal file

207
analysis/data.py Normal file

@@ -0,0 +1,207 @@
import logging
import re
import requests
import shutil
import sqlite3
import zipfile
from pathlib import Path
# Configure logging to provide feedback during the data setup process
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# --- Constants ---
# Using a data directory at the root of the project
DATA_DIR = Path("data")
# O*NET database details. We download the MySQL version and convert it to SQLite.
ONET_MYSQL_URL = "https://www.onetcenter.org/dl_files/database/db_29_3_mysql.zip"
DB_ZIP_PATH = DATA_DIR / "onet_mysql.zip"
DB_FILE_PATH = DATA_DIR / "onet.db"
EXTRACT_DIR = DATA_DIR / "onet_mysql_extracted"
# Other required data files are distributed in a separate O*NET text data archive.
ONET_TEXT_URL = "https://www.onetcenter.org/dl_files/database/db_29_3_text.zip"
TEXT_ZIP_PATH = DATA_DIR / "onet_text.zip"
TASK_RATINGS_PATH = DATA_DIR / "Task Ratings.txt"
DWA_REFERENCE_PATH = DATA_DIR / "DWA Reference.txt"
def setup_data_and_database():
"""
Main function to orchestrate the data setup.
It ensures the data directory exists, then downloads and sets up the O*NET database
and any other required data files.
"""
logging.info("Starting data and database setup...")
DATA_DIR.mkdir(exist_ok=True)
_setup_onet_database()
_download_additional_data()
logging.info("Data and database setup complete.")
def _setup_onet_database():
"""
Downloads the O*NET MySQL database, extracts it, and imports it into a
new SQLite database, mirroring the bulk-import performance settings of the
earlier shell-script-based setup (relaxed PRAGMAs, single transaction).
This method performs a minimal text-based conversion of the MySQL dump to
make it compatible with SQLite before importing.
"""
if DB_FILE_PATH.exists():
logging.info("O*NET database already exists at %s. Skipping setup.", DB_FILE_PATH)
return
logging.info("O*NET database not found. Starting fresh setup.")
# Ensure the extraction directory is clean before use
if EXTRACT_DIR.exists():
shutil.rmtree(EXTRACT_DIR)
EXTRACT_DIR.mkdir()
try:
# 1. Download if necessary
if not DB_ZIP_PATH.exists():
logging.info("Downloading O*NET database from %s", ONET_MYSQL_URL)
_download_file(ONET_MYSQL_URL, DB_ZIP_PATH)
else:
logging.info("Using existing O*NET zip file at %s", DB_ZIP_PATH)
# 2. Extract
logging.info("Extracting O*NET database files to %s", EXTRACT_DIR)
with zipfile.ZipFile(DB_ZIP_PATH, 'r') as zip_ref:
zip_ref.extractall(EXTRACT_DIR)
# 3. Create new DB with performance PRAGMAs
logging.info("Creating new SQLite database with performance settings: %s", DB_FILE_PATH)
conn = sqlite3.connect(DB_FILE_PATH)
conn.executescript("""
PRAGMA journal_mode = OFF;
PRAGMA synchronous = 0;
PRAGMA cache_size = 1000000;
PRAGMA locking_mode = EXCLUSIVE;
PRAGMA temp_store = MEMORY;
""")
conn.close()
# 4. Combine all SQL files, convert, and import in a single transaction
logging.info("Combining and converting SQL files for single transaction import...")
sql_files = sorted(EXTRACT_DIR.rglob('*.sql'))
if not sql_files:
raise FileNotFoundError(f"No SQL files found in {EXTRACT_DIR}")
# Concatenate all files into one string
mysql_dump = "\n".join([sql_file.read_text(encoding='utf-8') for sql_file in sql_files])
# Minimal conversion for SQLite: remove backticks and ENGINE clauses
sqlite_dump = mysql_dump.replace('`', '')
sqlite_dump = re.sub(r'\) ENGINE=InnoDB.*?;', ');', sqlite_dump, flags=re.DOTALL)
full_script = f"BEGIN TRANSACTION;\n{sqlite_dump}\nCOMMIT;"
logging.info(f"Importing {len(sql_files)} SQL files into database...")
conn = sqlite3.connect(DB_FILE_PATH)
conn.executescript(full_script)
conn.close()
logging.info("Database populated successfully.")
# 5. Restore reliability settings and optimize
logging.info("Restoring reliability settings and optimizing database...")
conn = sqlite3.connect(DB_FILE_PATH)
conn.executescript("""
PRAGMA journal_mode = WAL;
PRAGMA synchronous = NORMAL;
PRAGMA locking_mode = NORMAL;
PRAGMA temp_store = DEFAULT;
PRAGMA foreign_keys = ON;
PRAGMA optimize;
""")
conn.execute("VACUUM;")
conn.close()
logging.info("Database setup and optimization complete.")
except Exception as e:
logging.error("Failed during database setup: %s", e, exc_info=True)
if DB_FILE_PATH.exists():
DB_FILE_PATH.unlink()
raise
finally:
# 6. Cleanup
logging.info("Cleaning up temporary files...")
if DB_ZIP_PATH.exists():
DB_ZIP_PATH.unlink()
if EXTRACT_DIR.exists():
shutil.rmtree(EXTRACT_DIR)
def _download_additional_data():
"""
Downloads and extracts supplementary data files from the O*NET text archive.
If the required text files already exist, this function does nothing.
"""
required_files = [TASK_RATINGS_PATH, DWA_REFERENCE_PATH]
if all(p.exists() for p in required_files):
logging.info("All required text data files already exist. Skipping download.")
return
logging.info("One or more text data files are missing. Downloading and extracting from archive...")
try:
_download_file(ONET_TEXT_URL, TEXT_ZIP_PATH)
logging.info("Unzipping text data archive...")
with zipfile.ZipFile(TEXT_ZIP_PATH, 'r') as zip_ref:
# Extract only the files we need, without creating subdirectories
for target_path in required_files:
if not target_path.exists():
# Find the corresponding file within the zip archive's directory structure
member_name = next((m for m in zip_ref.namelist() if m.endswith(target_path.name)), None)
if member_name:
with zip_ref.open(member_name) as source, open(target_path, 'wb') as target:
target.write(source.read())
logging.info("Extracted %s", target_path.name)
else:
logging.warning("Could not find %s in the text data archive.", target_path.name)
except requests.exceptions.RequestException as e:
logging.error("Failed to download O*NET text data archive: %s", e)
raise
except zipfile.BadZipFile as e:
logging.error("Failed to process the text data archive: %s", e)
raise
finally:
# Clean up the downloaded zip file
if TEXT_ZIP_PATH.exists():
TEXT_ZIP_PATH.unlink()
logging.info("Cleaned up downloaded text archive zip file.")
def _download_file(url, destination):
"""
Helper function to download a file from a URL, with streaming for large files.
"""
logging.info("Downloading from %s to %s", url, destination)
with requests.get(url, stream=True) as r:
r.raise_for_status()
with open(destination, 'wb') as f:
for chunk in r.iter_content(chunk_size=8192):
f.write(chunk)
logging.info("Download of %s complete.", destination.name)
def get_db_connection():
"""
Establishes and returns a connection to the SQLite database.
Returns None if the database file does not exist.
"""
if not DB_FILE_PATH.exists():
logging.error("Database file not found at %s. Run the setup process first.", DB_FILE_PATH)
return None
try:
conn = sqlite3.connect(DB_FILE_PATH)
return conn
except sqlite3.Error as e:
logging.error("Failed to connect to the database: %s", e)
return None
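# Illustrative usage note (not part of the original module): downstream code
# such as preprocess.py opens a connection and reads whole tables into pandas:
#   conn = get_db_connection()
#   df = pd.read_sql_query("SELECT * FROM task_ratings", conn)
#   conn.close()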
if __name__ == '__main__':
# This allows the data setup to be run directly from the command line,
# which is useful for initialization or debugging.
setup_data_and_database()
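
As a reference for the conversion step above, here is a minimal, self-contained sketch (the table and column names are made up, not taken from the O*NET dump) of what the backtick removal and ENGINE-clause stripping do to a MySQL CREATE TABLE statement:

import re

mysql_ddl = """CREATE TABLE `example_table` (
  `task_id` decimal(8,0) NOT NULL,
  `scale_id` varchar(3) NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;"""

# Same two substitutions as in _setup_onet_database above.
sqlite_ddl = mysql_ddl.replace('`', '')
sqlite_ddl = re.sub(r'\) ENGINE=InnoDB.*?;', ');', sqlite_ddl, flags=re.DOTALL)
print(sqlite_ddl)
# CREATE TABLE example_table (
#   task_id decimal(8,0) NOT NULL,
#   scale_id varchar(3) NOT NULL
# );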

76
analysis/generate.py Normal file

@@ -0,0 +1,76 @@
import importlib
import logging
import pkgutil
import shutil
from pathlib import Path
# The final destination for all generated outputs
DIST_DIR = Path("dist")
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def create_all_outputs(processed_df):
"""
Dynamically discovers, imports, and runs all output generators.
This function iterates through all modules in the 'analysis.generators'
package. For each module, it assumes there is a 'generate(data)' function,
which it calls with the provided preprocessed DataFrame.
The generator function is expected to save its output to a temporary file
and return the path to that file. This function then moves the output
to the 'dist/' directory.
Args:
processed_df (pd.DataFrame): The fully preprocessed data to be used
by the generator functions.
"""
logging.info("Starting output generation...")
DIST_DIR.mkdir(exist_ok=True)
logging.info(f"Output directory is '{DIST_DIR.resolve()}'")
# Path to the generators package
from . import generators as generators_package
generators_path = generators_package.__path__
generators_prefix = generators_package.__name__ + "."
generated_files_count = 0
# Discover and run all modules in the generators package
for _, module_name, _ in pkgutil.iter_modules(generators_path, prefix=generators_prefix):
try:
logging.info(f"--- Running generator: {module_name} ---")
# Import the generator module
generator_module = importlib.import_module(module_name)
# Check if the module has the required 'generate' function
if not hasattr(generator_module, 'generate'):
logging.warning(f"Generator module {module_name} does not have a 'generate' function. Skipping.")
continue
# Call the generator function, passing in the preprocessed data
generator_func = getattr(generator_module, 'generate')
temp_output_path = generator_func(processed_df)
# If the generator returned a path, move the file to the dist directory
if temp_output_path and isinstance(temp_output_path, Path) and temp_output_path.exists():
# Sanitize the module name to create a valid filename
base_filename = module_name.split('.')[-1]
# Keep the original extension from the temp file
final_filename = base_filename + temp_output_path.suffix
final_output_path = DIST_DIR / final_filename
shutil.move(temp_output_path, final_output_path)
logging.info(f"Successfully generated '{final_output_path.name}'")
generated_files_count += 1
else:
logging.warning(f"Generator {module_name} did not return a valid output file path. Nothing was saved.")
except Exception as e:
logging.error(f"Failed to run generator {module_name}. Error: {e}", exc_info=True)
# Continue to the next generator
logging.info(f"--- Output generation complete. Total files generated: {generated_files_count} ---")


@@ -0,0 +1,119 @@
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import tempfile
import logging
import pandas as pd
import numpy as np
# Copied from other generators for modularity. This dictionary maps
# O*NET major occupation group codes to human-readable labels.
OCCUPATION_MAJOR_CODES = {
'11': 'Management',
'13': 'Business & Financial',
'15': 'Computer & Mathematical',
'17': 'Architecture & Engineering',
'19': 'Life, Physical, & Social Science',
'21': 'Community & Social Service',
'23': 'Legal',
'25': 'Education, Training, & Library',
'27': 'Arts, Design, & Media',
'29': 'Healthcare Practitioners',
'31': 'Healthcare Support',
'33': 'Protective Service',
'35': 'Food Preparation & Serving',
'37': 'Building & Grounds Maintenance',
'39': 'Personal Care & Service',
'41': 'Sales & Related',
'43': 'Office & Admin Support',
'45': 'Farming, Fishing, & Forestry',
'47': 'Construction & Extraction',
'49': 'Installation, Maintenance, & Repair',
'51': 'Production',
'53': 'Transportation & Material Moving',
'55': 'Military Specific',
}
def generate(processed_df: pd.DataFrame):
"""
Generates a scatter plot comparing lower vs. upper time estimates for tasks.
This corresponds to 'cell3' from the original analysis notebook. It helps
visualize the relationship and spread between the lower and upper bounds
of time estimates across different occupation groups.
Args:
processed_df (pd.DataFrame): The preprocessed data. Expected columns:
'lb_estimate_in_minutes',
'ub_estimate_in_minutes', 'onetsoc_major'.
Returns:
Path: The path to the generated temporary image file, or None on failure.
"""
logging.info("Generating plot of lower vs. upper time estimates...")
# --- Data Validation and Preparation ---
required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'onetsoc_major']
if not all(col in processed_df.columns for col in required_cols):
logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
return None
df = processed_df.copy()
# For log scaling, both lower and upper bounds must be positive.
df = df[(df['lb_estimate_in_minutes'] > 0) & (df['ub_estimate_in_minutes'] > 0)]
if df.empty:
logging.warning("No data with positive lower and upper estimates available to plot.")
return None
# Replace the major code with its readable label for the hue legend.
df['occupation_label'] = df['onetsoc_major'].map(OCCUPATION_MAJOR_CODES)
# --- Plotting ---
try:
plt.figure(figsize=(12, 10))
ax = sns.scatterplot(
data=df,
x='lb_estimate_in_minutes',
y='ub_estimate_in_minutes',
alpha=0.2,
edgecolor=None,
hue="occupation_label" # Use the labeled column for the legend
)
# Determine limits for the 45° reference line
# Use the maximum of both columns to create a square plot
max_val = df[['lb_estimate_in_minutes', 'ub_estimate_in_minutes']].max().max()
lims = (df[['lb_estimate_in_minutes', 'ub_estimate_in_minutes']].min().min(), max_val)
ax.plot(lims, lims, color='black', linestyle='--', linewidth=1, label='Upper = Lower')
# Add helper lines for constant ratios (2x, 10x, 100x)
for k in [2, 10, 100]:
ax.plot(lims, [k * l for l in lims],
linestyle=':', color='grey', linewidth=0.8, label=f'Upper = {k}x Lower')
ax.set(xscale='log', yscale='log', xlim=lims, ylim=lims)
ax.set_xlabel('Lower-bound Estimate (minutes, log scale)', fontsize=12)
ax.set_ylabel('Upper-bound Estimate (minutes, log scale)', fontsize=12)
ax.set_title('Lower vs. Upper Time Estimates for All Tasks', fontsize=16)
# Place the legend outside the plot to avoid obscuring data
ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', title='Occupation / Ratio')
# --- File Saving ---
temp_dir = tempfile.gettempdir()
temp_path = Path(temp_dir) / "estimate_lower_vs_upper_bounds.png"
# Use bbox_inches='tight' to ensure the external legend is included in the saved image.
plt.savefig(temp_path, dpi=300, bbox_inches='tight')
logging.info(f"Successfully saved plot to temporary file: {temp_path}")
return temp_path
except Exception as e:
logging.error(f"An error occurred while generating the plot: {e}", exc_info=True)
return None
finally:
plt.close()


@@ -0,0 +1,86 @@
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
import tempfile
import logging
def generate(processed_df: pd.DataFrame):
"""
Generates a histogram of the log-ratio of upper to lower time estimates.
This corresponds to 'cell4' from the original analysis notebook. It shows
the distribution of how many times larger the upper estimate is compared
to the lower estimate.
Args:
processed_df (pd.DataFrame): The preprocessed data. Expected columns:
'lb_estimate_in_minutes',
'ub_estimate_in_minutes'.
Returns:
Path: The path to the generated temporary image file, or None on failure.
"""
logging.info("Generating distribution plot of estimate ratios...")
# --- Data Validation and Preparation ---
required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes']
if not all(col in processed_df.columns for col in required_cols):
logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
return None
df = processed_df.copy()
# Calculate the ratio. We need to handle cases where the lower bound is zero.
# Replace lower bound of 0 with a small number to avoid division by zero, or filter them out.
# Here, we filter, as a ratio with a zero denominator is undefined.
df = df[df['lb_estimate_in_minutes'] > 0]
df['estimate_ratio'] = df['ub_estimate_in_minutes'] / df['lb_estimate_in_minutes']
# Replace infinite values (which can occur if ub is huge and lb is tiny) with NaN
# and drop rows with NaN or infinite ratios.
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(subset=['estimate_ratio'], inplace=True)
if df.empty:
logging.warning("No valid data available to plot the estimate ratio distribution.")
return None
# --- Plotting ---
try:
plt.figure(figsize=(10, 6))
# We plot the log10 of the ratio to better visualize the wide distribution
log_ratio = np.log10(df['estimate_ratio'])
sns.histplot(log_ratio, bins=60, kde=True)
# Add vertical lines for reference points
# log10(1) = 0, which is where upper bound equals lower bound
plt.axvline(x=0, color='black', linestyle='-', linewidth=1.5, label='1x (Upper = Lower)')
# A small ratio, e.g., 5% difference
plt.axvline(x=np.log10(1.05), color='orange', linestyle='--', linewidth=1, label='1.05x ratio')
# A 10x ratio
plt.axvline(x=np.log10(10), color='red', linestyle='--', linewidth=1, label='10x ratio')
plt.xlabel('log₁₀(Upper Estimate / Lower Estimate)', fontsize=12)
plt.ylabel('Number of Tasks', fontsize=12)
plt.title('Distribution of Time Estimate Ratios', fontsize=16)
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
# --- File Saving ---
temp_dir = tempfile.gettempdir()
temp_path = Path(temp_dir) / "estimate_ratio_distribution.png"
plt.savefig(temp_path, dpi=300)
logging.info(f"Successfully saved plot to temporary file: {temp_path}")
return temp_path
except Exception as e:
logging.error(f"An error occurred while generating the plot: {e}", exc_info=True)
return None
finally:
plt.close()


@@ -0,0 +1,135 @@
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pathlib import Path
import tempfile
import logging
# This mapping helps translate the O*NET 2-digit major group codes
# into human-readable labels for the plot's y-axis.
OCCUPATION_MAJOR_CODES = {
'11': 'Management',
'13': 'Business & Financial',
'15': 'Computer & Mathematical',
'17': 'Architecture & Engineering',
'19': 'Life, Physical, & Social Science',
'21': 'Community & Social Service',
'23': 'Legal',
'25': 'Education, Training, & Library',
'27': 'Arts, Design, & Media',
'29': 'Healthcare Practitioners',
'31': 'Healthcare Support',
'33': 'Protective Service',
'35': 'Food Preparation & Serving',
'37': 'Building & Grounds Maintenance',
'39': 'Personal Care & Service',
'41': 'Sales & Related',
'43': 'Office & Admin Support',
'45': 'Farming, Fishing, & Forestry',
'47': 'Construction & Extraction',
'49': 'Installation, Maintenance, & Repair',
'51': 'Production',
'53': 'Transportation & Material Moving',
'55': 'Military Specific',
}
def generate(processed_df: pd.DataFrame):
"""
Generates a heatmap of the median estimate ratio by occupation and task length quartile.
This corresponds to 'cell5' from the original analysis notebook. It shows
how the ratio between upper and lower time estimates varies across
different occupations and for tasks of different typical lengths (binned
into quartiles).
Args:
processed_df (pd.DataFrame): The preprocessed data. Expected columns:
'lb_estimate_in_minutes',
'ub_estimate_in_minutes', 'onetsoc_major'.
Returns:
Path: The path to the generated temporary image file, or None on failure.
"""
logging.info("Generating heatmap of estimate ratios by occupation and task length...")
# --- Data Validation and Preparation ---
required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'onetsoc_major']
if not all(col in processed_df.columns for col in required_cols):
logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
return None
df = processed_df.copy()
# Calculate the estimate ratio, handling division by zero and infinity
df = df[df['lb_estimate_in_minutes'] > 0]
df['estimate_ratio'] = df['ub_estimate_in_minutes'] / df['lb_estimate_in_minutes']
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(subset=['estimate_ratio'], inplace=True)
if df.empty:
logging.warning("No valid data available for the ratio heatmap.")
return None
# 1. Bin lower bounds into quartiles (Q1-Q4).
# duplicates='drop' avoids errors from repeated quantile edges; if edges are
# actually dropped, the four labels no longer match and the ValueError below is raised.
try:
df['lb_q'] = pd.qcut(
df.lb_estimate_in_minutes,
q=4,
labels=['Q1 (Shortest)', 'Q2', 'Q3', 'Q4 (Longest)'],
duplicates='drop'
)
except ValueError as e:
logging.error(f"Could not bin data into quartiles: {e}. There might not be enough unique values.")
return None
# 2. Aggregate: median ratio per cell (occupation x task length quartile)
pivot = df.pivot_table(
index='onetsoc_major',
columns='lb_q',
values='estimate_ratio',
aggfunc='median'
)
# Map the index (onetsoc_major codes) to their corresponding readable labels
pivot.index = pivot.index.map(OCCUPATION_MAJOR_CODES)
pivot.dropna(inplace=True) # Drop occupations with no data in some quartiles for a cleaner plot
if pivot.empty:
logging.warning("Pivot table is empty after processing. Cannot generate heatmap.")
return None
# --- Plotting ---
try:
plt.figure(figsize=(12, 10))
sns.heatmap(
pivot,
cmap='RdYlGn_r', # Red-Yellow-Green (reversed): narrow ratios show green, wide ratios red
center=2, # Center the colormap around a ratio of 2
annot=True, # Show the median values in the cells
fmt='.1f', # Format annotations to one decimal place
linewidths=.5,
cbar_kws={'label': 'Median Upper/Lower Estimate Ratio'}
)
plt.xlabel('Task Length (based on lower-bound quartile)', fontsize=12)
plt.ylabel('Occupation Major Group', fontsize=12)
plt.title('Typical Estimate Range Width by Occupation and Task Length', fontsize=16)
plt.tight_layout()
# --- File Saving ---
temp_dir = tempfile.gettempdir()
temp_path = Path(temp_dir) / "ratio_heatmap_by_occupation_and_task_length.png"
plt.savefig(temp_path, dpi=300)
logging.info(f"Successfully saved plot to temporary file: {temp_path}")
return temp_path
except Exception as e:
logging.error(f"An error occurred while generating the heatmap: {e}", exc_info=True)
return None
finally:
plt.close()


@@ -0,0 +1,161 @@
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.colors as mcolors
from pathlib import Path
import tempfile
import logging
# This mapping helps translate the O*NET 2-digit major group codes
# into human-readable labels for the plot's y-axis.
OCCUPATION_MAJOR_CODES = {
'11': 'Management',
'13': 'Business & Financial',
'15': 'Computer & Mathematical',
'17': 'Architecture & Engineering',
'19': 'Life, Physical, & Social Science',
'21': 'Community & Social Service',
'23': 'Legal',
'25': 'Education, Training, & Library',
'27': 'Arts, Design, & Media',
'29': 'Healthcare Practitioners',
'31': 'Healthcare Support',
'33': 'Protective Service',
'35': 'Food Preparation & Serving',
'37': 'Building & Grounds Maintenance',
'39': 'Personal Care & Service',
'41': 'Sales & Related',
'43': 'Office & Admin Support',
'45': 'Farming, Fishing, & Forestry',
'47': 'Construction & Extraction',
'49': 'Installation, Maintenance, & Repair',
'51': 'Production',
'53': 'Transportation & Material Moving',
'55': 'Military Specific',
}
# Define colors to match the original notebook's palette.
# These are standard hex codes for gray and lime shades.
BAR_COLORS = [
'#D1D5DB', # gray-300
'#84CC16', # lime-500
'#D9F99D', # lime-200
]
def _get_contrasting_text_color(bg_color_hex):
"""
Determines if black or white text provides better contrast against a given background color.
"""
try:
rgba = mcolors.to_rgba(bg_color_hex)
# Calculate luminance (Y) using the sRGB formula
luminance = 0.2126 * rgba[0] + 0.7152 * rgba[1] + 0.0722 * rgba[2]
return 'black' if luminance > 0.55 else 'white'
except ValueError:
return 'black' # Default to black if color is invalid
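# Worked example (illustrative, not in the original file): for the gray
# background '#D1D5DB', mcolors.to_rgba returns roughly (0.82, 0.84, 0.86, 1.0),
# so luminance ≈ 0.2126*0.82 + 0.7152*0.84 + 0.0722*0.86 ≈ 0.83 > 0.55 and the
# function returns 'black'; all three BAR_COLORS above are light enough that
# their segment labels end up black.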
def generate(processed_df: pd.DataFrame):
"""
Generates a stacked bar chart breaking down tasks by remote status and estimability.
This corresponds to 'cell10' from the original analysis notebook. It shows,
for each occupation, the percentage of tasks that are not remote, remote and
estimable, or remote and not estimable.
Args:
processed_df (pd.DataFrame): The preprocessed data. Expected columns:
'onetsoc_major', 'remote_status', 'estimateable'.
Returns:
Path: The path to the generated temporary image file, or None on failure.
"""
logging.info("Generating task breakdown by occupation plot...")
# --- Data Validation ---
required_cols = ['onetsoc_major', 'remote_status', 'estimateable']
if not all(col in processed_df.columns for col in required_cols):
logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
return None
df = processed_df.copy()
# --- Data Summarization ---
summary_data = []
for code, label in OCCUPATION_MAJOR_CODES.items():
occ_df = df[df['onetsoc_major'] == code]
total_tasks = len(occ_df)
if total_tasks == 0:
continue
not_remote_count = len(occ_df[occ_df['remote_status'] != 'remote'])
remote_df = occ_df[occ_df['remote_status'] == 'remote']
remote_atomic_count = len(remote_df[remote_df['estimateable'] == 'ATOMIC'])
remote_ongoing_count = len(remote_df[remote_df['estimateable'] == 'ONGOING-CONSTRAINT'])
summary_data.append({
'occupation_label': label,
'count_not_remote': not_remote_count,
'count_remote_atomic': remote_atomic_count,
'count_remote_ongoing': remote_ongoing_count,
'total_tasks': total_tasks
})
if not summary_data:
logging.warning("No data available to generate the task breakdown plot.")
return None
summary_df = pd.DataFrame(summary_data)
# --- Percentage Calculation ---
summary_df['pct_not_remote'] = (summary_df['count_not_remote'] / summary_df['total_tasks']) * 100
summary_df['pct_remote_atomic'] = (summary_df['count_remote_atomic'] / summary_df['total_tasks']) * 100
summary_df['pct_remote_ongoing'] = (summary_df['count_remote_ongoing'] / summary_df['total_tasks']) * 100
plot_df = summary_df.set_index('occupation_label')[
['pct_not_remote', 'pct_remote_atomic', 'pct_remote_ongoing']
]
plot_df.columns = ['Not Remote', 'Remote & Estimable', 'Remote & Not Estimable']
plot_df = plot_df.sort_values(by='Not Remote', ascending=False)
# --- Plotting ---
try:
fig, ax = plt.subplots(figsize=(14, 10))
plot_df.plot(kind='barh', stacked=True, ax=ax, color=BAR_COLORS, width=0.8)
ax.set_xlabel("Percentage of Tasks", fontsize=12)
ax.set_ylabel("Occupation Major Group", fontsize=12)
ax.set_title("Task Breakdown by Occupation, Remote Status, and Estimability", fontsize=16, pad=20)
ax.xaxis.set_major_formatter(mtick.PercentFormatter())
ax.set_xlim(0, 100)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
# Add percentage labels inside each bar segment
for i, container in enumerate(ax.containers):
text_color = _get_contrasting_text_color(BAR_COLORS[i])
for patch in container.patches:
width = patch.get_width()
if width > 3: # Only label segments wider than 3%
x = patch.get_x() + width / 2
y = patch.get_y() + patch.get_height() / 2
ax.text(x, y, f"{width:.1f}%", ha='center', va='center',
fontsize=8, color=text_color, fontweight='medium')
ax.legend(title="Task Category", bbox_to_anchor=(1.02, 1), loc='upper left', frameon=False)
# --- File Saving ---
temp_dir = tempfile.gettempdir()
temp_path = Path(temp_dir) / "task_breakdown_by_occupation.png"
plt.savefig(temp_path, dpi=300, bbox_inches='tight')
logging.info(f"Successfully saved plot to temporary file: {temp_path}")
return temp_path
except Exception as e:
logging.error(f"An error occurred while generating the plot: {e}", exc_info=True)
return None
finally:
plt.close()


@@ -0,0 +1,74 @@
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import tempfile
import logging
import pandas as pd
def generate(processed_df: pd.DataFrame):
"""
Generates a histogram of the task time estimate midpoints.
This generator corresponds to 'cell1' from the original analysis notebook.
It visualizes the distribution of the calculated midpoint of time estimates
for all tasks on a logarithmic scale to handle the wide range of values.
Args:
processed_df (pd.DataFrame): The preprocessed data, expected to contain
'lb_estimate_in_minutes' and
'ub_estimate_in_minutes' columns.
Returns:
Path: The path to the generated temporary image file, or None if
generation fails.
"""
logging.info("Generating task estimate distribution plot...")
# --- Data Validation and Preparation ---
required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes']
if not all(col in processed_df.columns for col in required_cols):
logging.error(
f"Required columns {required_cols} not found in the DataFrame. "
"Cannot generate plot."
)
return None
# Create a copy to avoid modifying the original DataFrame
df = processed_df.copy()
# Calculate the midpoint from lower and upper bounds, as was done in the notebook
df['estimate_midpoint'] = (df['lb_estimate_in_minutes'] + df['ub_estimate_in_minutes']) / 2
# For log scaling, we must use positive values. Filter out any non-positive midpoints.
df = df[df['estimate_midpoint'] > 0]
if df.empty:
logging.warning("No data with positive estimate midpoints available to plot.")
return None
# --- Plotting ---
try:
plt.figure(figsize=(10, 6))
ax = sns.histplot(data=df, x='estimate_midpoint', log_scale=True)
ax.set_title('Distribution of Task Time Estimate Midpoints', fontsize=16)
ax.set_xlabel('Estimate Midpoint (minutes, log scale)', fontsize=12)
ax.set_ylabel('Number of Tasks', fontsize=12)
plt.tight_layout()
# --- File Saving ---
# Create a temporary file to save the plot. The orchestrator (`generate.py`)
# will move this to the final 'dist/' directory.
temp_dir = tempfile.gettempdir()
temp_path = Path(temp_dir) / "task_estimate_distribution.png"
plt.savefig(temp_path, dpi=300)
logging.info(f"Successfully saved plot to temporary file: {temp_path}")
return temp_path
except Exception as e:
logging.error(f"An error occurred while generating the plot: {e}", exc_info=True)
return None
finally:
# Close the figure to free up memory, which is crucial when running many generators.
plt.close()


@@ -0,0 +1,134 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from pathlib import Path
import tempfile
import logging
# Replicating the color palette from the original notebook for consistency.
# These appear to be inspired by Tailwind CSS colors.
GRAY_PALETTE = {
'100': '#F3F4F6',
'300': '#D1D5DB',
}
LIME_PALETTE = {
'300': '#D9F99D',
'600': '#A3E635', # A mid-tone lime
'900': '#4D7C0F', # A dark lime/green
}
def _calculate_cdf(series: pd.Series):
"""
Calculates the empirical Cumulative Distribution Function (CDF) for a series.
Returns the sorted values and their corresponding cumulative percentages.
"""
# Drop NA values and ensure the series is sorted
s = series.dropna().sort_values().reset_index(drop=True)
# Calculate cumulative percentage: (index + 1) / total_count
cdf_y = ((s.index + 1) / len(s)) * 100
return s.values, cdf_y
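# Worked example (illustrative): for the series [5, 10, 10, 60] this returns
# values [5, 10, 10, 60] with cumulative percentages [25, 50, 75, 100], i.e.
# 50% of the tasks have an estimate of 10 minutes or less.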
def generate(processed_df: pd.DataFrame):
"""
Generates a Cumulative Distribution Function (CDF) plot for task time estimates.
This corresponds to the second 'cell11' from the original notebook. It plots
the CDF for the lower-bound, upper-bound, and mid-point of time estimates,
showing the percentage of tasks that can be completed within a certain time.
Args:
processed_df (pd.DataFrame): The preprocessed data. Expected columns:
'lb_estimate_in_minutes',
'ub_estimate_in_minutes'.
Returns:
Path: The path to the generated temporary image file, or None on failure.
"""
logging.info("Generating temporal coherence CDF plot...")
# --- Data Validation and Preparation ---
required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes']
if not all(col in processed_df.columns for col in required_cols):
logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
return None
df = processed_df.copy()
# Log scale requires positive values.
df = df[(df['lb_estimate_in_minutes'] > 0) & (df['ub_estimate_in_minutes'] > 0)]
if df.empty:
logging.warning("No data with positive estimates available to generate CDF plot.")
return None
# Calculate mid-point estimate
df['midpoint_estimate'] = (df['lb_estimate_in_minutes'] + df['ub_estimate_in_minutes']) / 2
# Prepare data for CDF plots
x_lb, y_lb = _calculate_cdf(df['lb_estimate_in_minutes'])
x_ub, y_ub = _calculate_cdf(df['ub_estimate_in_minutes'])
x_mid, y_mid = _calculate_cdf(df['midpoint_estimate'])
# --- Plotting ---
try:
fig, ax = plt.subplots(figsize=(12, 8))
# --- Grid and Reference Lines ---
# Horizontal reference lines for percentages
for y_val in range(0, 101, 10):
ax.axhline(y_val, color=GRAY_PALETTE['100'], linewidth=0.8, zorder=1)
# Vertical reference lines for human-friendly durations
ticks = [1, 5, 10, 30, 60, 120, 240, 480, 1440, 2880, 10080, 43200]
for tick in ticks:
ax.axvline(tick, color=GRAY_PALETTE['300'], linewidth=0.8, linestyle='--', zorder=1)
# --- CDF Plots ---
ax.step(x_lb, y_lb, where='post', color=LIME_PALETTE['300'], linewidth=1.8, linestyle='--', zorder=2, label='Lower-bound Estimate (CDF)')
ax.step(x_ub, y_ub, where='post', color=LIME_PALETTE['900'], linewidth=1.8, linestyle=':', zorder=3, label='Upper-bound Estimate (CDF)')
ax.step(x_mid, y_mid, where='post', color=LIME_PALETTE['600'], linewidth=2.2, zorder=4, label='Mid-point Estimate (CDF)')
# --- Axes Configuration ---
ax.set_ylim(0, 100)
ax.set_xscale('log')
# Custom x-ticks for durations
ticklabels = ['1 min', '5 min', '10 min', '30 min', '1 hr', '2 hrs', '4 hrs', '8 hrs', '1 day', '2 days', '1 week', '30 days']
ax.set_xticks(ticks)
ax.set_xticklabels(ticklabels, rotation=45, ha='right')
ax.minorticks_off() # Turn off minor ticks for clarity with custom grid
# Format y-axis as percentages
ax.yaxis.set_major_formatter(mpl.ticker.PercentFormatter(decimals=0))
# --- Spines and Labels ---
for spine in ['top', 'right']:
ax.spines[spine].set_visible(False)
for spine in ['left', 'bottom']:
ax.spines[spine].set_edgecolor(GRAY_PALETTE['300'])
# Use ax.text for more control over label placement than ax.set_ylabel/xlabel
ax.text(-0.07, 1.02, "% of tasks with duration ≤ X", transform=ax.transAxes,
fontsize=12, fontweight='semibold', va='bottom')
ax.text(0.5, -0.25, 'Task Duration (X)', transform=ax.transAxes,
fontsize=12, fontweight='semibold', ha='center')
ax.legend(frameon=False, loc='lower right')
fig.suptitle('Cumulative Distribution of Task Time Estimates', fontsize=16, y=0.96)
plt.tight_layout(rect=[0, 0, 1, 0.95]) # Adjust layout to make space for suptitle
# --- File Saving ---
temp_dir = tempfile.gettempdir()
temp_path = Path(temp_dir) / "temporal_coherence_cdf.png"
plt.savefig(temp_path, dpi=300, bbox_inches='tight')
logging.info(f"Successfully saved plot to temporary file: {temp_path}")
return temp_path
except Exception as e:
logging.error(f"An error occurred while generating the CDF plot: {e}", exc_info=True)
return None
finally:
plt.close()


@@ -0,0 +1,112 @@
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import tempfile
import logging
import pandas as pd
# Based on O*NET SOC 2018 structure, this mapping helps translate
# the 2-digit major group codes into human-readable labels.
OCCUPATION_MAJOR_CODES = {
'11': 'Management',
'13': 'Business & Financial',
'15': 'Computer & Mathematical',
'17': 'Architecture & Engineering',
'19': 'Life, Physical, & Social Science',
'21': 'Community & Social Service',
'23': 'Legal',
'25': 'Education, Training, & Library',
'27': 'Arts, Design, & Media',
'29': 'Healthcare Practitioners',
'31': 'Healthcare Support',
'33': 'Protective Service',
'35': 'Food Preparation & Serving',
'37': 'Building & Grounds Maintenance',
'39': 'Personal Care & Service',
'41': 'Sales & Related',
'43': 'Office & Admin Support',
'45': 'Farming, Fishing, & Forestry',
'47': 'Construction & Extraction',
'49': 'Installation, Maintenance, & Repair',
'51': 'Production',
'53': 'Transportation & Material Moving',
'55': 'Military Specific',
}
def generate(processed_df: pd.DataFrame):
"""
Generates a box plot showing the spread of time-range estimates per occupation.
This corresponds to 'cell2' from the original analysis notebook. It visualizes
the distribution of the difference between upper and lower time estimates for
each major occupational group.
Args:
processed_df (pd.DataFrame): The preprocessed data. Expected columns:
'lb_estimate_in_minutes',
'ub_estimate_in_minutes', 'onetsoc_major'.
Returns:
Path: The path to the generated temporary image file, or None on failure.
"""
logging.info("Generating plot of time estimate spread by occupation...")
# --- Data Validation and Preparation ---
required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'onetsoc_major']
if not all(col in processed_df.columns for col in required_cols):
logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
return None
df = processed_df.copy()
# Calculate the estimate range.
df['estimate_range'] = df['ub_estimate_in_minutes'] - df['lb_estimate_in_minutes']
# For log scaling, we need positive values. Filter out any non-positive ranges.
df = df[df['estimate_range'] > 0]
if df.empty:
logging.warning("No data with a positive estimate range available to plot.")
return None
# Sort by the major code to ensure a consistent plot order
df = df.sort_values('onetsoc_major')
# --- Plotting ---
try:
plt.figure(figsize=(14, 10))
ax = sns.boxplot(
data=df,
x='onetsoc_major',
y='estimate_range',
showfliers=False # Outliers are excluded for a clearer view of the main distribution
)
plt.yscale('log') # The long tail of the data makes a log scale more readable
plt.xlabel('Occupation Major Group', fontsize=12)
plt.ylabel('Time Estimate Range (upper - lower, in minutes, log scale)', fontsize=12)
plt.title('Spread of Time-Range Estimates by Occupation', fontsize=16)
# Replace numeric x-tick labels (e.g., '11', '15') with meaningful text labels
ax.set_xticklabels(
[OCCUPATION_MAJOR_CODES.get(code.get_text(), code.get_text()) for code in ax.get_xticklabels()],
rotation=60,
ha='right' # Align rotated labels correctly
)
plt.tight_layout()
# --- File Saving ---
temp_dir = tempfile.gettempdir()
temp_path = Path(temp_dir) / "time_estimate_spread_by_occupation.png"
plt.savefig(temp_path, dpi=300, bbox_inches='tight')
logging.info(f"Successfully saved plot to temporary file: {temp_path}")
return temp_path
except Exception as e:
logging.error(f"An error occurred while generating the plot: {e}", exc_info=True)
return None
finally:
plt.close()


@@ -0,0 +1,150 @@
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import pandas as pd
from pathlib import Path
import tempfile
import logging
# Assuming data.py is in the same package and provides this function
from ..data import get_db_connection
# This mapping helps translate the O*NET 2-digit major group codes
# into human-readable labels for the plot's y-axis.
OCCUPATION_MAJOR_CODES = {
'11': 'Management',
'13': 'Business & Financial',
'15': 'Computer & Mathematical',
'17': 'Architecture & Engineering',
'19': 'Life, Physical, & Social Science',
'21': 'Community & Social Service',
'23': 'Legal',
'25': 'Education, Training, & Library',
'27': 'Arts, Design, & Media',
'29': 'Healthcare Practitioners',
'31': 'Healthcare Support',
'33': 'Protective Service',
'35': 'Food Preparation & Serving',
'37': 'Building & Grounds Maintenance',
'39': 'Personal Care & Service',
'41': 'Sales & Related',
'43': 'Office & Admin Support',
'45': 'Farming, Fishing, & Forestry',
'47': 'Construction & Extraction',
'49': 'Installation, Maintenance, & Repair',
'51': 'Production',
'53': 'Transportation & Material Moving',
'55': 'Military Specific',
}
def generate(processed_df: pd.DataFrame):
"""
Generates a bar plot of the total wage bill per major occupation group.
This corresponds to the first 'cell11' from the original analysis notebook.
It calculates the total wage bill (Total Employment * Annual Mean Wage) for
each occupation and aggregates it by major occupation group. This generator
loads its data directly from the O*NET database.
Args:
processed_df (pd.DataFrame): The preprocessed data (not used in this generator,
but required by the function signature).
Returns:
Path: The path to the generated temporary image file, or None on failure.
"""
logging.info("Generating plot of total wage bill by occupation...")
conn = None
try:
# --- Data Loading ---
# This generator needs specific data that is not in the main preprocessed_df.
# It loads occupational employment and wage data directly from the database.
conn = get_db_connection()
if conn is None:
raise ConnectionError("Could not get database connection.")
# This data is stored in a long format in the `occupation_level_metadata` table.
# We need to query this table and pivot it to get employment and wage columns.
query = "SELECT onetsoc_code, item, response FROM occupation_level_metadata WHERE item IN ('Employment', 'Annual Mean Wage')"
try:
df_meta = pd.read_sql_query(query, conn)
# Pivot the table to create 'Employment' and 'Annual Mean Wage' columns
df_oesm = df_meta.pivot(index='onetsoc_code', columns='item', values='response').reset_index()
logging.info("Pivoted occupation metadata. Columns are: %s", df_oesm.columns.tolist())
# Rename for consistency with the original notebook's code
df_oesm.rename(columns={
'onetsoc_code': 'OCC_CODE',
'Employment': 'TOT_EMP',
'Annual Mean Wage': 'A_MEAN'
}, inplace=True)
except (pd.io.sql.DatabaseError, KeyError) as e:
logging.error(f"Failed to query or pivot occupation metadata: {e}", exc_info=True)
return None
# --- Data Preparation ---
# Create a 'major group' code from the first two digits of the SOC code
df_oesm['onetsoc_major'] = df_oesm['OCC_CODE'].str[:2]
# Ensure wage and employment columns are numeric, coercing errors to NaN
df_oesm['TOT_EMP'] = pd.to_numeric(df_oesm['TOT_EMP'], errors='coerce')
df_oesm['A_MEAN'] = pd.to_numeric(df_oesm['A_MEAN'], errors='coerce')
# Drop rows with missing data in critical columns
df_oesm.dropna(subset=['TOT_EMP', 'A_MEAN', 'onetsoc_major'], inplace=True)
# Calculate the wage bill for each occupation
df_oesm['wage_bill'] = df_oesm['TOT_EMP'] * df_oesm['A_MEAN']
# Aggregate the wage bill by major occupation group
df_wage_bill_major = df_oesm.groupby('onetsoc_major')['wage_bill'].sum().reset_index()
# Map the major codes to readable titles for plotting
df_wage_bill_major['OCC_TITLE_MAJOR'] = df_wage_bill_major['onetsoc_major'].map(OCCUPATION_MAJOR_CODES)
df_wage_bill_major.dropna(subset=['OCC_TITLE_MAJOR'], inplace=True) # Drop military/unmapped codes
# Sort by wage bill for a more informative plot
df_wage_bill_major = df_wage_bill_major.sort_values('wage_bill', ascending=False)
if df_wage_bill_major.empty:
logging.warning("No data available to generate the wage bill plot.")
return None
# --- Plotting ---
plt.figure(figsize=(12, 10))
ax = sns.barplot(x='wage_bill', y='OCC_TITLE_MAJOR', data=df_wage_bill_major, palette="viridis", orient='h')
ax.set_title('Total Wage Bill per Major Occupation Group', fontsize=16, pad=15)
ax.set_xlabel('Total Wage Bill (in USD)', fontsize=12)
ax.set_ylabel('Major Occupation Group', fontsize=12)
ax.grid(axis='x', linestyle='--', alpha=0.7)
# Format the x-axis to be more readable (e.g., "$2.0T" for trillions)
def format_billions(x, pos):
if x >= 1e12:
return f'${x*1e-12:.1f}T'
if x >= 1e9:
return f'${x*1e-9:.0f}B'
return f'${x*1e-6:.0f}M'
ax.xaxis.set_major_formatter(mticker.FuncFormatter(format_billions))
plt.tight_layout()
# --- File Saving ---
temp_dir = tempfile.gettempdir()
temp_path = Path(temp_dir) / "wage_bill_by_occupation.png"
plt.savefig(temp_path, dpi=300)
logging.info(f"Successfully saved plot to temporary file: {temp_path}")
return temp_path
except Exception as e:
logging.error(f"An error occurred while generating the wage bill plot: {e}", exc_info=True)
return None
finally:
plt.close()
if conn:
conn.close()

64
analysis/main.py Normal file

@@ -0,0 +1,64 @@
import logging
import sys
# Since this file is inside the 'analysis' package, we use relative imports
# to access the other modules within the same package.
from . import data
from . import preprocess
from . import generate
# Configure logging for the entire application.
# This setup will apply to loggers in data, preprocess, and generate modules as well.
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
stream=sys.stdout
)
def main():
"""
The main entry point for the entire analysis pipeline.
This function orchestrates the three main stages of the analysis:
1. Data Setup: Downloads and prepares the necessary raw data and database.
2. Preprocessing: Cleans, enriches, and transforms the raw data into an
analysis-ready DataFrame.
3. Output Generation: Runs all registered generators to produce figures,
tables, and other outputs, saving them to the 'dist/' directory.
"""
logger = logging.getLogger(__name__)
logger.info("=================================================")
logger.info(" STARTING ECONTAI ANALYSIS PIPELINE ")
logger.info("=================================================")
try:
# Stage 1: Set up the data and database
logger.info("--- STAGE 1: DATA SETUP ---")
data.setup_data_and_database()
logger.info("--- DATA SETUP COMPLETE ---")
# Stage 2: Run the preprocessing pipeline
logger.info("--- STAGE 2: PREPROCESSING ---")
processed_dataframe = preprocess.run_preprocessing()
logger.info("--- PREPROCESSING COMPLETE ---")
# Stage 3: Generate all outputs
logger.info("--- STAGE 3: OUTPUT GENERATION ---")
generate.create_all_outputs(processed_dataframe)
logger.info("--- OUTPUT GENERATION COMPLETE ---")
logger.info("=================================================")
logger.info(" ANALYSIS PIPELINE COMPLETED SUCCESSFULLY ")
logger.info("=================================================")
except Exception as e:
logger.critical("An unrecoverable error occurred during the pipeline execution.", exc_info=True)
# Exit with a non-zero status code to indicate failure, which is useful for automation.
sys.exit(1)
# This allows the script to be run from the command line using `python -m analysis.main`.
# The `-m` flag is important because it adds the parent directory to the Python path,
# allowing the relative imports (e.g., `from . import data`) to work correctly.
if __name__ == '__main__':
main()

160
analysis/preprocess.py Normal file

@@ -0,0 +1,160 @@
import logging
import pandas as pd
import numpy as np
from scipy.stats import median_abs_deviation
from .data import get_db_connection
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def _convert_to_minutes(level: float) -> float:
"""
Converts O*NET 'Frequency' scale values (levels) to estimated minutes per day.
This logic is derived from the `preprocessing_time_estimates` function
in the original analysis notebook.
"""
if pd.isna(level):
return 0
# This mapping is an interpretation of the O*NET frequency scale.
return {
1: 0, # Yearly or less
2: 2, # Several times a year
3: 10, # Several times a month
4: 30, # Several times a week
5: 120, # Daily
6: 240, # Several times a day
7: 480, # Hourly or more
}.get(int(level), 0)
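# Worked example (illustrative): a frequency rating with lower_ci_bound=3 and
# upper_ci_bound=5 maps to 10 and 120 minutes per day respectively, giving an
# estimate_midpoint of (10 + 120) / 2 = 65 minutes (see step 3 below).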
def _mad_z_score(series: pd.Series) -> pd.Series:
"""
Calculates the robust Z-score using Median Absolute Deviation (MAD).
This function is derived from 'cell7' of the original analysis.
"""
if series.isnull().all():
return pd.Series([np.nan] * len(series), index=series.index)
median = series.median()
# scale='normal' makes MAD comparable to the standard deviation for a normal distribution.
mad = median_abs_deviation(series.dropna(), scale='normal')
if mad == 0:
return pd.Series([np.nan] * len(series), index=series.index)
return (series - median) / mad
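# Worked example (illustrative): for midpoints [10, 12, 14, 16, 200] the median
# is 14 and the normal-scaled MAD is about 2 * 1.4826 ≈ 2.97, so the outlier 200
# gets a robust z-score of roughly (200 - 14) / 2.97 ≈ 63, whereas its ordinary
# z-score is only about 1.8 because the outlier inflates the standard deviation.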
def run_preprocessing() -> pd.DataFrame:
"""
Main orchestrator for the preprocessing pipeline.
This function faithfully reproduces the data transformation pipeline from the
original `analysis.py` script, including the `preprocessing_time_estimates`
and cell-specific data manipulations.
Returns:
pd.DataFrame: A fully preprocessed DataFrame ready for the generators.
"""
logging.info("Starting data preprocessing...")
conn = None
try:
conn = get_db_connection()
if conn is None:
raise ConnectionError("Could not establish database connection.")
# --- 1. Load Data from Database ---
# Fetch all necessary tables to build the initial DataFrame.
logging.info("Loading data from O*NET database...")
task_ratings_df = pd.read_sql_query("SELECT * FROM task_ratings", conn)
task_statements_df = pd.read_sql_query("SELECT * FROM task_statements", conn)
occupations_df = pd.read_sql_query("SELECT * FROM occupation_data", conn)
# --- 2. Initial Merge ---
# Merge the tables to create a comprehensive base DataFrame.
# Merging on both 'onetsoc_code' and 'task_id' is crucial to avoid
# creating duplicate columns from the overlapping 'onetsoc_code'.
logging.info("Merging base tables...")
tasks_df = pd.merge(task_ratings_df, task_statements_df, on=['onetsoc_code', 'task_id'])
tasks_df = pd.merge(tasks_df, occupations_df, on='onetsoc_code')
# --- 3. Create "Atomic Tasks" and Time Estimates (from `preprocessing_time_estimates`) ---
# This is the core of the analysis, focusing on tasks with frequency ratings.
logging.info("Filtering for 'atomic tasks' (scale_id='FR') and calculating time estimates...")
# Strip whitespace from scale_id to ensure the filter works correctly.
tasks_df['scale_id'] = tasks_df['scale_id'].str.strip()
atomic_tasks = tasks_df[tasks_df['scale_id'] == 'FR'].copy()
# Convert frequency confidence intervals into minutes/day
atomic_tasks['lb_estimate_in_minutes'] = atomic_tasks['lower_ci_bound'].apply(_convert_to_minutes)
atomic_tasks['ub_estimate_in_minutes'] = atomic_tasks['upper_ci_bound'].apply(_convert_to_minutes)
atomic_tasks['estimate_midpoint'] = (atomic_tasks['lb_estimate_in_minutes'] + atomic_tasks['ub_estimate_in_minutes']) / 2
# --- 4. Add Derived Columns for Analysis (from `cell` logic) ---
logging.info("Adding derived columns for analysis...")
# Add `onetsoc_major` for grouping by occupation category
atomic_tasks['onetsoc_major'] = atomic_tasks['onetsoc_code'].str[:2]
# Calculate estimate_range and estimate_ratio used in several plots
atomic_tasks['estimate_range'] = atomic_tasks['ub_estimate_in_minutes'] - atomic_tasks['lb_estimate_in_minutes']
# To calculate ratio, ensure lower bound is positive to avoid division by zero
lb_positive = atomic_tasks['lb_estimate_in_minutes'] > 0
atomic_tasks['estimate_ratio'] = np.nan
atomic_tasks.loc[lb_positive, 'estimate_ratio'] = atomic_tasks['ub_estimate_in_minutes'] / atomic_tasks['lb_estimate_in_minutes']
# --- 5. Calculate Outlier Scores (from `cell6` and `cell7`) ---
logging.info("Calculating standard and robust Z-scores for outlier detection...")
# Standard Z-score
grouped_stats = atomic_tasks.groupby('onetsoc_code')['estimate_midpoint'].agg(['mean', 'std'])
atomic_tasks = atomic_tasks.merge(grouped_stats, on='onetsoc_code', how='left')
# Calculate Z-score, avoiding division by zero if std is 0
non_zero_std = atomic_tasks['std'].notna() & (atomic_tasks['std'] != 0)
atomic_tasks['z_score'] = np.nan
atomic_tasks.loc[non_zero_std, 'z_score'] = \
(atomic_tasks.loc[non_zero_std, 'estimate_midpoint'] - atomic_tasks.loc[non_zero_std, 'mean']) / atomic_tasks.loc[non_zero_std, 'std']
# Robust Z-score (using MAD)
atomic_tasks['robust_z_score'] = atomic_tasks.groupby('onetsoc_code')['estimate_midpoint'].transform(_mad_z_score)
# --- 6. Prepare for other generators ---
# NOTE: The data for the 'task_breakdown_by_occupation' generator, specifically
# the 'remote_status' and 'estimateable' columns, is not available in the O*NET
# database. This data was likely loaded from a separate file (e.g., 'tasks_clean.parquet')
# in the original notebook. For now, we will add placeholder columns.
atomic_tasks['remote_status'] = 'unknown'
atomic_tasks['estimateable'] = 'unknown'
logging.info("Data preprocessing complete.")
return atomic_tasks
except Exception as e:
logging.error("An error occurred during preprocessing: %s", e, exc_info=True)
# Return an empty DataFrame on failure to prevent downstream errors
return pd.DataFrame()
finally:
if conn:
conn.close()
logging.info("Database connection closed.")
if __name__ == '__main__':
# This allows the preprocessing to be run directly for testing or debugging,
# e.g. `python -m analysis.preprocess` (the relative import requires the -m form).
# Note: Requires data to be set up first by running data.py.
try:
processed_data = run_preprocessing()
if not processed_data.empty:
print("Preprocessing successful. DataFrame shape:", processed_data.shape)
print("Columns:", processed_data.columns.tolist())
print(processed_data.head())
# Save to a temporary file to inspect the output
output_path = "temp_preprocessed_data.csv"
processed_data.to_csv(output_path, index=False)
print(f"Sample output saved to {output_path}")
else:
print("Preprocessing failed or resulted in an empty DataFrame.")
except (FileNotFoundError, ConnectionError) as e:
logging.error("Failed to run preprocessing: %s", e)