Félix Dorn 2025-07-15 00:41:05 +02:00
parent 720f21a85b
commit 43076bcbb1
42 changed files with 237415 additions and 7831 deletions

View file

@@ -0,0 +1,119 @@
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import tempfile
import logging
import pandas as pd
import numpy as np
# Duplicated in each generator module so it stays self-contained. This dictionary
# maps O*NET 2-digit major occupation group codes to human-readable labels.
OCCUPATION_MAJOR_CODES = {
'11': 'Management',
'13': 'Business & Financial',
'15': 'Computer & Mathematical',
'17': 'Architecture & Engineering',
'19': 'Life, Physical, & Social Science',
'21': 'Community & Social Service',
'23': 'Legal',
'25': 'Education, Training, & Library',
'27': 'Arts, Design, & Media',
'29': 'Healthcare Practitioners',
'31': 'Healthcare Support',
'33': 'Protective Service',
'35': 'Food Preparation & Serving',
'37': 'Building & Grounds Maintenance',
'39': 'Personal Care & Service',
'41': 'Sales & Related',
'43': 'Office & Admin Support',
'45': 'Farming, Fishing, & Forestry',
'47': 'Construction & Extraction',
'49': 'Installation, Maintenance, & Repair',
'51': 'Production',
'53': 'Transportation & Material Moving',
'55': 'Military Specific',
}
def generate(processed_df: pd.DataFrame):
"""
Generates a scatter plot comparing lower vs. upper time estimates for tasks.
This corresponds to 'cell3' from the original analysis notebook. It helps
visualize the relationship and spread between the lower and upper bounds
of time estimates across different occupation groups.
Args:
processed_df (pd.DataFrame): The preprocessed data. Expected columns:
'lb_estimate_in_minutes',
'ub_estimate_in_minutes', 'onetsoc_major'.
Returns:
Path: The path to the generated temporary image file, or None on failure.
"""
logging.info("Generating plot of lower vs. upper time estimates...")
# --- Data Validation and Preparation ---
required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'onetsoc_major']
if not all(col in processed_df.columns for col in required_cols):
logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
return None
df = processed_df.copy()
# For log scaling, both lower and upper bounds must be positive.
df = df[(df['lb_estimate_in_minutes'] > 0) & (df['ub_estimate_in_minutes'] > 0)]
if df.empty:
logging.warning("No data with positive lower and upper estimates available to plot.")
return None
# Replace the major code with its readable label for the hue legend.
df['occupation_label'] = df['onetsoc_major'].map(OCCUPATION_MAJOR_CODES)
# --- Plotting ---
try:
plt.figure(figsize=(12, 10))
ax = sns.scatterplot(
data=df,
x='lb_estimate_in_minutes',
y='ub_estimate_in_minutes',
alpha=0.2,
edgecolor=None,
hue="occupation_label" # Use the labeled column for the legend
)
# Determine limits for the 45° reference line
# Use the maximum of both columns to create a square plot
max_val = df[['lb_estimate_in_minutes', 'ub_estimate_in_minutes']].max().max()
lims = (df[['lb_estimate_in_minutes', 'ub_estimate_in_minutes']].min().min(), max_val)
ax.plot(lims, lims, color='black', linestyle='--', linewidth=1, label='Upper = Lower')
# Add helper lines for constant ratios (2x, 10x, 100x)
for k in [2, 10, 100]:
ax.plot(lims, [k * l for l in lims],
linestyle=':', color='grey', linewidth=0.8, label=f'Upper = {k}x Lower')
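# On log-log axes each constant-ratio line y = k*x appears as a straight line parallel
# to the identity line, shifted upward by log10(k), which makes it easy to read how far
# points sit above the 'Upper = Lower' diagonal.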
ax.set(xscale='log', yscale='log', xlim=lims, ylim=lims)
ax.set_xlabel('Lower-bound Estimate (minutes, log scale)', fontsize=12)
ax.set_ylabel('Upper-bound Estimate (minutes, log scale)', fontsize=12)
ax.set_title('Lower vs. Upper Time Estimates for All Tasks', fontsize=16)
# Place the legend outside the plot to avoid obscuring data
ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', title='Occupation / Ratio')
# --- File Saving ---
temp_dir = tempfile.gettempdir()
temp_path = Path(temp_dir) / "estimate_lower_vs_upper_bounds.png"
# Use bbox_inches='tight' to ensure the external legend is included in the saved image.
plt.savefig(temp_path, dpi=300, bbox_inches='tight')
logging.info(f"Successfully saved plot to temporary file: {temp_path}")
return temp_path
except Exception as e:
logging.error(f"An error occurred while generating the plot: {e}", exc_info=True)
return None
finally:
plt.close()
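# Minimal usage sketch, assuming the orchestrator (`generate.py`) moves the returned
# PNG into a 'dist/' directory; the tiny synthetic frame below is illustrative only
# and simply exercises the generator end-to-end.
if __name__ == "__main__":
    import shutil

    logging.basicConfig(level=logging.INFO)
    demo_df = pd.DataFrame({
        "lb_estimate_in_minutes": [5, 30, 60],
        "ub_estimate_in_minutes": [10, 90, 480],
        "onetsoc_major": ["15", "29", "41"],
    })
    result = generate(demo_df)
    if result is not None:
        out_dir = Path("dist")  # assumed output directory, mirroring the orchestrator
        out_dir.mkdir(exist_ok=True)
        shutil.move(str(result), str(out_dir / result.name))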

View file

@@ -0,0 +1,86 @@
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
import tempfile
import logging
def generate(processed_df: pd.DataFrame):
"""
Generates a histogram of the log-ratio of upper to lower time estimates.
This corresponds to 'cell4' from the original analysis notebook. It shows
the distribution of how many times larger the upper estimate is compared
to the lower estimate.
Args:
processed_df (pd.DataFrame): The preprocessed data. Expected columns:
'lb_estimate_in_minutes',
'ub_estimate_in_minutes'.
Returns:
Path: The path to the generated temporary image file, or None on failure.
"""
logging.info("Generating distribution plot of estimate ratios...")
# --- Data Validation and Preparation ---
required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes']
if not all(col in processed_df.columns for col in required_cols):
logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
return None
df = processed_df.copy()
# Calculate the estimate ratio. A lower bound of zero would make the ratio undefined,
# so rather than substituting a small epsilon we simply filter those rows out.
df = df[df['lb_estimate_in_minutes'] > 0]
df['estimate_ratio'] = df['ub_estimate_in_minutes'] / df['lb_estimate_in_minutes']
# Replace infinite values (which can occur if ub is huge and lb is tiny) with NaN
# and drop rows with NaN or infinite ratios.
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(subset=['estimate_ratio'], inplace=True)
if df.empty:
logging.warning("No valid data available to plot the estimate ratio distribution.")
return None
# --- Plotting ---
try:
plt.figure(figsize=(10, 6))
# We plot the log10 of the ratio to better visualize the wide distribution
log_ratio = np.log10(df['estimate_ratio'])
sns.histplot(log_ratio, bins=60, kde=True)
# Add vertical lines for reference points
# log10(1) = 0, which is where upper bound equals lower bound
plt.axvline(x=0, color='black', linestyle='-', linewidth=1.5, label='1x (Upper = Lower)')
# A small ratio, e.g., 5% difference
plt.axvline(x=np.log10(1.05), color='orange', linestyle='--', linewidth=1, label='1.05x ratio')
# A 10x ratio
plt.axvline(x=np.log10(10), color='red', linestyle='--', linewidth=1, label='10x ratio')
plt.xlabel('log₁₀(Upper Estimate / Lower Estimate)', fontsize=12)
plt.ylabel('Number of Tasks', fontsize=12)
plt.title('Distribution of Time Estimate Ratios', fontsize=16)
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
# --- File Saving ---
temp_dir = tempfile.gettempdir()
temp_path = Path(temp_dir) / "estimate_ratio_distribution.png"
plt.savefig(temp_path, dpi=300)
logging.info(f"Successfully saved plot to temporary file: {temp_path}")
return temp_path
except Exception as e:
logging.error(f"An error occurred while generating the plot: {e}", exc_info=True)
return None
finally:
plt.close()
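# Reading the x-axis: an illustrative task with lb = 30 min and ub = 120 min has a
# ratio of 4.0, i.e. log10(4.0) ≈ 0.60, placing it between the 1.05x and 10x lines.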

View file

@@ -0,0 +1,135 @@
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pathlib import Path
import tempfile
import logging
# This mapping helps translate the O*NET 2-digit major group codes
# into human-readable labels for the plot's y-axis.
OCCUPATION_MAJOR_CODES = {
'11': 'Management',
'13': 'Business & Financial',
'15': 'Computer & Mathematical',
'17': 'Architecture & Engineering',
'19': 'Life, Physical, & Social Science',
'21': 'Community & Social Service',
'23': 'Legal',
'25': 'Education, Training, & Library',
'27': 'Arts, Design, & Media',
'29': 'Healthcare Practitioners',
'31': 'Healthcare Support',
'33': 'Protective Service',
'35': 'Food Preparation & Serving',
'37': 'Building & Grounds Maintenance',
'39': 'Personal Care & Service',
'41': 'Sales & Related',
'43': 'Office & Admin Support',
'45': 'Farming, Fishing, & Forestry',
'47': 'Construction & Extraction',
'49': 'Installation, Maintenance, & Repair',
'51': 'Production',
'53': 'Transportation & Material Moving',
'55': 'Military Specific',
}
def generate(processed_df: pd.DataFrame):
"""
Generates a heatmap of the median estimate ratio by occupation and task length quartile.
This corresponds to 'cell5' from the original analysis notebook. It shows
how the ratio between upper and lower time estimates varies across
different occupations and for tasks of different typical lengths (binned
into quartiles).
Args:
processed_df (pd.DataFrame): The preprocessed data. Expected columns:
'lb_estimate_in_minutes',
'ub_estimate_in_minutes', 'onetsoc_major'.
Returns:
Path: The path to the generated temporary image file, or None on failure.
"""
logging.info("Generating heatmap of estimate ratios by occupation and task length...")
# --- Data Validation and Preparation ---
required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'onetsoc_major']
if not all(col in processed_df.columns for col in required_cols):
logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
return None
df = processed_df.copy()
# Calculate the estimate ratio, handling division by zero and infinity
df = df[df['lb_estimate_in_minutes'] > 0]
df['estimate_ratio'] = df['ub_estimate_in_minutes'] / df['lb_estimate_in_minutes']
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(subset=['estimate_ratio'], inplace=True)
if df.empty:
logging.warning("No valid data available for the ratio heatmap.")
return None
# 1. Bin lower bounds into quartiles (Q1–Q4)
# Using duplicates='drop' can help if there are many identical values
# which can make binning into quantiles fail.
try:
df['lb_q'] = pd.qcut(
df.lb_estimate_in_minutes,
q=4,
labels=['Q1 (Shortest)', 'Q2', 'Q3', 'Q4 (Longest)'],
duplicates='drop'
)
except ValueError as e:
logging.error(f"Could not bin data into quartiles: {e}. There might not be enough unique values.")
return None
# 2. Aggregate: median ratio per cell (occupation x task length quartile)
pivot = df.pivot_table(
index='onetsoc_major',
columns='lb_q',
values='estimate_ratio',
aggfunc='median'
)
# Map the index (onetsoc_major codes) to their corresponding readable labels
pivot.index = pivot.index.map(OCCUPATION_MAJOR_CODES)
pivot.dropna(inplace=True) # Drop occupations with no data in some quartiles for a cleaner plot
if pivot.empty:
logging.warning("Pivot table is empty after processing. Cannot generate heatmap.")
return None
# --- Plotting ---
try:
plt.figure(figsize=(12, 10))
sns.heatmap(
pivot,
cmap='RdYlGn_r', # Red-Yellow-Green (reversed): green for tight ratios, red for wide ones
center=2, # Center the colormap around a ratio of 2
annot=True, # Show the median values in the cells
fmt='.1f', # Format annotations to one decimal place
linewidths=.5,
cbar_kws={'label': 'Median Upper/Lower Estimate Ratio'}
)
plt.xlabel('Task Length (based on lower-bound quartile)', fontsize=12)
plt.ylabel('Occupation Major Group', fontsize=12)
plt.title('Typical Estimate Range Width by Occupation and Task Length', fontsize=16)
plt.tight_layout()
# --- File Saving ---
temp_dir = tempfile.gettempdir()
temp_path = Path(temp_dir) / "ratio_heatmap_by_occupation_and_task_length.png"
plt.savefig(temp_path, dpi=300)
logging.info(f"Successfully saved plot to temporary file: {temp_path}")
return temp_path
except Exception as e:
logging.error(f"An error occurred while generating the heatmap: {e}", exc_info=True)
return None
finally:
plt.close()

View file

@@ -0,0 +1,161 @@
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.colors as mcolors
from pathlib import Path
import tempfile
import logging
# This mapping helps translate the O*NET 2-digit major group codes
# into human-readable labels for the plot's y-axis.
OCCUPATION_MAJOR_CODES = {
'11': 'Management',
'13': 'Business & Financial',
'15': 'Computer & Mathematical',
'17': 'Architecture & Engineering',
'19': 'Life, Physical, & Social Science',
'21': 'Community & Social Service',
'23': 'Legal',
'25': 'Education, Training, & Library',
'27': 'Arts, Design, & Media',
'29': 'Healthcare Practitioners',
'31': 'Healthcare Support',
'33': 'Protective Service',
'35': 'Food Preparation & Serving',
'37': 'Building & Grounds Maintenance',
'39': 'Personal Care & Service',
'41': 'Sales & Related',
'43': 'Office & Admin Support',
'45': 'Farming, Fishing, & Forestry',
'47': 'Construction & Extraction',
'49': 'Installation, Maintenance, & Repair',
'51': 'Production',
'53': 'Transportation & Material Moving',
'55': 'Military Specific',
}
# Define colors to match the original notebook's palette.
# These are standard hex codes for gray and lime shades.
BAR_COLORS = [
'#D1D5DB', # gray-300
'#84CC16', # lime-500
'#D9F99D', # lime-200
]
def _get_contrasting_text_color(bg_color_hex):
"""
Determines if black or white text provides better contrast against a given background color.
"""
try:
rgba = mcolors.to_rgba(bg_color_hex)
# Calculate luminance (Y) using the sRGB formula
luminance = 0.2126 * rgba[0] + 0.7152 * rgba[1] + 0.0722 * rgba[2]
return 'black' if luminance > 0.55 else 'white'
except ValueError:
return 'black' # Default to black if color is invalid
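# Worked example (illustrative): '#84CC16' converts to roughly (0.518, 0.800, 0.086),
# giving luminance ≈ 0.2126*0.518 + 0.7152*0.800 + 0.0722*0.086 ≈ 0.69 > 0.55,
# so black text is chosen for that background.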
def generate(processed_df: pd.DataFrame):
"""
Generates a stacked bar chart breaking down tasks by remote status and estimability.
This corresponds to 'cell10' from the original analysis notebook. It shows,
for each occupation, the percentage of tasks that are not remote, remote and
estimable, or remote and not estimable.
Args:
processed_df (pd.DataFrame): The preprocessed data. Expected columns:
'onetsoc_major', 'remote_status', 'estimateable'.
Returns:
Path: The path to the generated temporary image file, or None on failure.
"""
logging.info("Generating task breakdown by occupation plot...")
# --- Data Validation ---
required_cols = ['onetsoc_major', 'remote_status', 'estimateable']
if not all(col in processed_df.columns for col in required_cols):
logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
return None
df = processed_df.copy()
# --- Data Summarization ---
summary_data = []
for code, label in OCCUPATION_MAJOR_CODES.items():
occ_df = df[df['onetsoc_major'] == code]
total_tasks = len(occ_df)
if total_tasks == 0:
continue
not_remote_count = len(occ_df[occ_df['remote_status'] != 'remote'])
remote_df = occ_df[occ_df['remote_status'] == 'remote']
remote_atomic_count = len(remote_df[remote_df['estimateable'] == 'ATOMIC'])
remote_ongoing_count = len(remote_df[remote_df['estimateable'] == 'ONGOING-CONSTRAINT'])
summary_data.append({
'occupation_label': label,
'count_not_remote': not_remote_count,
'count_remote_atomic': remote_atomic_count,
'count_remote_ongoing': remote_ongoing_count,
'total_tasks': total_tasks
})
if not summary_data:
logging.warning("No data available to generate the task breakdown plot.")
return None
summary_df = pd.DataFrame(summary_data)
# --- Percentage Calculation ---
summary_df['pct_not_remote'] = (summary_df['count_not_remote'] / summary_df['total_tasks']) * 100
summary_df['pct_remote_atomic'] = (summary_df['count_remote_atomic'] / summary_df['total_tasks']) * 100
summary_df['pct_remote_ongoing'] = (summary_df['count_remote_ongoing'] / summary_df['total_tasks']) * 100
plot_df = summary_df.set_index('occupation_label')[
['pct_not_remote', 'pct_remote_atomic', 'pct_remote_ongoing']
]
plot_df.columns = ['Not Remote', 'Remote & Estimable', 'Remote & Not Estimable']
plot_df = plot_df.sort_values(by='Not Remote', ascending=False)
# --- Plotting ---
try:
fig, ax = plt.subplots(figsize=(14, 10))
plot_df.plot(kind='barh', stacked=True, ax=ax, color=BAR_COLORS, width=0.8)
ax.set_xlabel("Percentage of Tasks", fontsize=12)
ax.set_ylabel("Occupation Major Group", fontsize=12)
ax.set_title("Task Breakdown by Occupation, Remote Status, and Estimability", fontsize=16, pad=20)
ax.xaxis.set_major_formatter(mtick.PercentFormatter())
ax.set_xlim(0, 100)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
# Add percentage labels inside each bar segment
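# pandas' stacked barh creates one BarContainer per column, in the same order as
# plot_df.columns, so BAR_COLORS[i] matches the colour of the i-th segment below.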
for i, container in enumerate(ax.containers):
text_color = _get_contrasting_text_color(BAR_COLORS[i])
for patch in container.patches:
width = patch.get_width()
if width > 3: # Only label segments wider than 3%
x = patch.get_x() + width / 2
y = patch.get_y() + patch.get_height() / 2
ax.text(x, y, f"{width:.1f}%", ha='center', va='center',
fontsize=8, color=text_color, fontweight='medium')
ax.legend(title="Task Category", bbox_to_anchor=(1.02, 1), loc='upper left', frameon=False)
# --- File Saving ---
temp_dir = tempfile.gettempdir()
temp_path = Path(temp_dir) / "task_breakdown_by_occupation.png"
plt.savefig(temp_path, dpi=300, bbox_inches='tight')
logging.info(f"Successfully saved plot to temporary file: {temp_path}")
return temp_path
except Exception as e:
logging.error(f"An error occurred while generating the plot: {e}", exc_info=True)
return None
finally:
plt.close()

View file

@@ -0,0 +1,74 @@
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import tempfile
import logging
import pandas as pd
def generate(processed_df: pd.DataFrame):
"""
Generates a histogram of the task time estimate midpoints.
This generator corresponds to 'cell1' from the original analysis notebook.
It visualizes the distribution of the calculated midpoint of time estimates
for all tasks on a logarithmic scale to handle the wide range of values.
Args:
processed_df (pd.DataFrame): The preprocessed data, expected to contain
'lb_estimate_in_minutes' and
'ub_estimate_in_minutes' columns.
Returns:
Path: The path to the generated temporary image file, or None if
generation fails.
"""
logging.info("Generating task estimate distribution plot...")
# --- Data Validation and Preparation ---
required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes']
if not all(col in processed_df.columns for col in required_cols):
logging.error(
f"Required columns {required_cols} not found in the DataFrame. "
"Cannot generate plot."
)
return None
# Create a copy to avoid modifying the original DataFrame
df = processed_df.copy()
# Calculate the midpoint from lower and upper bounds, as was done in the notebook
df['estimate_midpoint'] = (df['lb_estimate_in_minutes'] + df['ub_estimate_in_minutes']) / 2
# For log scaling, we must use positive values. Filter out any non-positive midpoints.
df = df[df['estimate_midpoint'] > 0]
if df.empty:
logging.warning("No data with positive estimate midpoints available to plot.")
return None
# --- Plotting ---
try:
plt.figure(figsize=(10, 6))
ax = sns.histplot(data=df, x='estimate_midpoint', log_scale=True)
ax.set_title('Distribution of Task Time Estimate Midpoints', fontsize=16)
ax.set_xlabel('Estimate Midpoint (minutes, log scale)', fontsize=12)
ax.set_ylabel('Number of Tasks', fontsize=12)
plt.tight_layout()
# --- File Saving ---
# Create a temporary file to save the plot. The orchestrator (`generate.py`)
# will move this to the final 'dist/' directory.
temp_dir = tempfile.gettempdir()
temp_path = Path(temp_dir) / "task_estimate_distribution.png"
plt.savefig(temp_path, dpi=300)
logging.info(f"Successfully saved plot to temporary file: {temp_path}")
return temp_path
except Exception as e:
logging.error(f"An error occurred while generating the plot: {e}", exc_info=True)
return None
finally:
# Close the figure to free up memory, which is crucial when running many generators.
plt.close()

View file

@@ -0,0 +1,134 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from pathlib import Path
import tempfile
import logging
# Replicating the color palette from the original notebook for consistency.
# These appear to be inspired by Tailwind CSS colors.
GRAY_PALETTE = {
'100': '#F3F4F6',
'300': '#D1D5DB',
}
LIME_PALETTE = {
'300': '#D9F99D',
'600': '#A3E635', # A mid-tone lime
'900': '#4D7C0F', # A dark lime/green
}
def _calculate_cdf(series: pd.Series):
"""
Calculates the empirical Cumulative Distribution Function (CDF) for a series.
Returns the sorted values and their corresponding cumulative percentages.
"""
# Drop NA values and ensure the series is sorted
s = series.dropna().sort_values().reset_index(drop=True)
# Calculate cumulative percentage: (index + 1) / total_count
cdf_y = ((s.index + 1) / len(s)) * 100
return s.values, cdf_y
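# Example: a series of four values [1, 2, 4, 8] yields cdf_y = [25, 50, 75, 100],
# i.e. 25% of observations are <= 1, 50% are <= 2, and so on.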
def generate(processed_df: pd.DataFrame):
"""
Generates a Cumulative Distribution Function (CDF) plot for task time estimates.
This corresponds to the second 'cell11' from the original notebook. It plots
the CDF for the lower-bound, upper-bound, and mid-point of time estimates,
showing the percentage of tasks that can be completed within a certain time.
Args:
processed_df (pd.DataFrame): The preprocessed data. Expected columns:
'lb_estimate_in_minutes',
'ub_estimate_in_minutes'.
Returns:
Path: The path to the generated temporary image file, or None on failure.
"""
logging.info("Generating temporal coherence CDF plot...")
# --- Data Validation and Preparation ---
required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes']
if not all(col in processed_df.columns for col in required_cols):
logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
return None
df = processed_df.copy()
# Log scale requires positive values.
df = df[(df['lb_estimate_in_minutes'] > 0) & (df['ub_estimate_in_minutes'] > 0)]
if df.empty:
logging.warning("No data with positive estimates available to generate CDF plot.")
return None
# Calculate mid-point estimate
df['midpoint_estimate'] = (df['lb_estimate_in_minutes'] + df['ub_estimate_in_minutes']) / 2
# Prepare data for CDF plots
x_lb, y_lb = _calculate_cdf(df['lb_estimate_in_minutes'])
x_ub, y_ub = _calculate_cdf(df['ub_estimate_in_minutes'])
x_mid, y_mid = _calculate_cdf(df['midpoint_estimate'])
# --- Plotting ---
try:
fig, ax = plt.subplots(figsize=(12, 8))
# --- Grid and Reference Lines ---
# Horizontal reference lines for percentages
for y_val in range(0, 101, 10):
ax.axhline(y_val, color=GRAY_PALETTE['100'], linewidth=0.8, zorder=1)
# Vertical reference lines for human-friendly durations
ticks = [1, 5, 10, 30, 60, 120, 240, 480, 1440, 2880, 10080, 43200]
for tick in ticks:
ax.axvline(tick, color=GRAY_PALETTE['300'], linewidth=0.8, linestyle='--', zorder=1)
# --- CDF Plots ---
ax.step(x_lb, y_lb, where='post', color=LIME_PALETTE['300'], linewidth=1.8, linestyle='--', zorder=2, label='Lower-bound Estimate (CDF)')
ax.step(x_ub, y_ub, where='post', color=LIME_PALETTE['900'], linewidth=1.8, linestyle=':', zorder=3, label='Upper-bound Estimate (CDF)')
ax.step(x_mid, y_mid, where='post', color=LIME_PALETTE['600'], linewidth=2.2, zorder=4, label='Mid-point Estimate (CDF)')
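# where='post' holds each step flat until the next observed value, matching the
# right-continuous shape of an empirical CDF.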
# --- Axes Configuration ---
ax.set_ylim(0, 100)
ax.set_xscale('log')
# Custom x-ticks for durations
ticklabels = ['1 min', '5 min', '10 min', '30 min', '1 hr', '2 hrs', '4 hrs', '8 hrs', '1 day', '2 days', '1 week', '30 days']
ax.set_xticks(ticks)
ax.set_xticklabels(ticklabels, rotation=45, ha='right')
ax.minorticks_off() # Turn off minor ticks for clarity with custom grid
# Format y-axis as percentages
ax.yaxis.set_major_formatter(mpl.ticker.PercentFormatter(decimals=0))
# --- Spines and Labels ---
for spine in ['top', 'right']:
ax.spines[spine].set_visible(False)
for spine in ['left', 'bottom']:
ax.spines[spine].set_edgecolor(GRAY_PALETTE['300'])
# Use ax.text for more control over label placement than ax.set_ylabel/xlabel
ax.text(-0.07, 1.02, "% of tasks with duration ≤ X", transform=ax.transAxes,
fontsize=12, fontweight='semibold', va='bottom')
ax.text(0.5, -0.25, 'Task Duration (X)', transform=ax.transAxes,
fontsize=12, fontweight='semibold', ha='center')
ax.legend(frameon=False, loc='lower right')
fig.suptitle('Cumulative Distribution of Task Time Estimates', fontsize=16, y=0.96)
plt.tight_layout(rect=[0, 0, 1, 0.95]) # Adjust layout to make space for suptitle
# --- File Saving ---
temp_dir = tempfile.gettempdir()
temp_path = Path(temp_dir) / "temporal_coherence_cdf.png"
plt.savefig(temp_path, dpi=300, bbox_inches='tight')
logging.info(f"Successfully saved plot to temporary file: {temp_path}")
return temp_path
except Exception as e:
logging.error(f"An error occurred while generating the CDF plot: {e}", exc_info=True)
return None
finally:
plt.close()

View file

@@ -0,0 +1,112 @@
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import tempfile
import logging
import pandas as pd
# Based on O*NET SOC 2018 structure, this mapping helps translate
# the 2-digit major group codes into human-readable labels.
OCCUPATION_MAJOR_CODES = {
'11': 'Management',
'13': 'Business & Financial',
'15': 'Computer & Mathematical',
'17': 'Architecture & Engineering',
'19': 'Life, Physical, & Social Science',
'21': 'Community & Social Service',
'23': 'Legal',
'25': 'Education, Training, & Library',
'27': 'Arts, Design, & Media',
'29': 'Healthcare Practitioners',
'31': 'Healthcare Support',
'33': 'Protective Service',
'35': 'Food Preparation & Serving',
'37': 'Building & Grounds Maintenance',
'39': 'Personal Care & Service',
'41': 'Sales & Related',
'43': 'Office & Admin Support',
'45': 'Farming, Fishing, & Forestry',
'47': 'Construction & Extraction',
'49': 'Installation, Maintenance, & Repair',
'51': 'Production',
'53': 'Transportation & Material Moving',
'55': 'Military Specific',
}
def generate(processed_df: pd.DataFrame):
"""
Generates a box plot showing the spread of time-range estimates per occupation.
This corresponds to 'cell2' from the original analysis notebook. It visualizes
the distribution of the difference between upper and lower time estimates for
each major occupational group.
Args:
processed_df (pd.DataFrame): The preprocessed data. Expected columns:
'lb_estimate_in_minutes',
'ub_estimate_in_minutes', 'onetsoc_major'.
Returns:
Path: The path to the generated temporary image file, or None on failure.
"""
logging.info("Generating plot of time estimate spread by occupation...")
# --- Data Validation and Preparation ---
required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'onetsoc_major']
if not all(col in processed_df.columns for col in required_cols):
logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
return None
df = processed_df.copy()
# Calculate the estimate range.
df['estimate_range'] = df['ub_estimate_in_minutes'] - df['lb_estimate_in_minutes']
# Log scaling requires positive values, so drop non-positive ranges (this also removes tasks whose upper and lower bounds are equal).
df = df[df['estimate_range'] > 0]
if df.empty:
logging.warning("No data with a positive estimate range available to plot.")
return None
# Sort by the major code to ensure a consistent plot order
df = df.sort_values('onetsoc_major')
# --- Plotting ---
try:
plt.figure(figsize=(14, 10))
ax = sns.boxplot(
data=df,
x='onetsoc_major',
y='estimate_range',
showfliers=False # Outliers are excluded for a clearer view of the main distribution
)
plt.yscale('log') # The long tail of the data makes a log scale more readable
plt.xlabel('Occupation Major Group', fontsize=12)
plt.ylabel('Time Estimate Range (upper - lower, in minutes, log scale)', fontsize=12)
plt.title('Spread of Time-Range Estimates by Occupation', fontsize=16)
# Replace numeric x-tick labels (e.g., '11', '15') with meaningful text labels
ax.set_xticklabels(
[OCCUPATION_MAJOR_CODES.get(code.get_text(), code.get_text()) for code in ax.get_xticklabels()],
rotation=60,
ha='right' # Align rotated labels correctly
)
plt.tight_layout()
# --- File Saving ---
temp_dir = tempfile.gettempdir()
temp_path = Path(temp_dir) / "time_estimate_spread_by_occupation.png"
plt.savefig(temp_path, dpi=300, bbox_inches='tight')
logging.info(f"Successfully saved plot to temporary file: {temp_path}")
return temp_path
except Exception as e:
logging.error(f"An error occurred while generating the plot: {e}", exc_info=True)
return None
finally:
plt.close()

View file

@@ -0,0 +1,150 @@
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import pandas as pd
from pathlib import Path
import tempfile
import logging
# Assuming data.py is in the same package and provides this function
from ..data import get_db_connection
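# `get_db_connection` is assumed here to return a DB-API style connection (e.g. sqlite3)
# that pandas.read_sql_query accepts; it is closed in the `finally` block of generate().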
# This mapping helps translate the O*NET 2-digit major group codes
# into human-readable labels for the plot's y-axis.
OCCUPATION_MAJOR_CODES = {
'11': 'Management',
'13': 'Business & Financial',
'15': 'Computer & Mathematical',
'17': 'Architecture & Engineering',
'19': 'Life, Physical, & Social Science',
'21': 'Community & Social Service',
'23': 'Legal',
'25': 'Education, Training, & Library',
'27': 'Arts, Design, & Media',
'29': 'Healthcare Practitioners',
'31': 'Healthcare Support',
'33': 'Protective Service',
'35': 'Food Preparation & Serving',
'37': 'Building & Grounds Maintenance',
'39': 'Personal Care & Service',
'41': 'Sales & Related',
'43': 'Office & Admin Support',
'45': 'Farming, Fishing, & Forestry',
'47': 'Construction & Extraction',
'49': 'Installation, Maintenance, & Repair',
'51': 'Production',
'53': 'Transportation & Material Moving',
'55': 'Military Specific',
}
def generate(processed_df: pd.DataFrame):
"""
Generates a bar plot of the total wage bill per major occupation group.
This corresponds to the first 'cell11' from the original analysis notebook.
It calculates the total wage bill (Total Employment * Annual Mean Wage) for
each occupation and aggregates it by major occupation group. This generator
loads its data directly from the O*NET database.
Args:
processed_df (pd.DataFrame): The preprocessed data (not used in this generator,
but required by the function signature).
Returns:
Path: The path to the generated temporary image file, or None on failure.
"""
logging.info("Generating plot of total wage bill by occupation...")
conn = None
try:
# --- Data Loading ---
# This generator needs specific data that is not in the main preprocessed_df.
# It loads occupational employment and wage data directly from the database.
conn = get_db_connection()
if conn is None:
raise ConnectionError("Could not get database connection.")
# This data is stored in a long format in the `occupation_level_metadata` table.
# We need to query this table and pivot it to get employment and wage columns.
query = "SELECT onetsoc_code, item, response FROM occupation_level_metadata WHERE item IN ('Employment', 'Annual Mean Wage')"
try:
df_meta = pd.read_sql_query(query, conn)
# Pivot the table to create 'Employment' and 'Annual Mean Wage' columns
df_oesm = df_meta.pivot(index='onetsoc_code', columns='item', values='response').reset_index()
logging.info("Pivoted occupation metadata. Columns are: %s", df_oesm.columns.tolist())
# Rename for consistency with the original notebook's code
df_oesm.rename(columns={
'onetsoc_code': 'OCC_CODE',
'Employment': 'TOT_EMP',
'Annual Mean Wage': 'A_MEAN'
}, inplace=True)
except (pd.io.sql.DatabaseError, KeyError) as e:
logging.error(f"Failed to query or pivot occupation metadata: {e}", exc_info=True)
return None
# --- Data Preparation ---
# Create a 'major group' code from the first two digits of the SOC code
df_oesm['onetsoc_major'] = df_oesm['OCC_CODE'].str[:2]
# Ensure wage and employment columns are numeric, coercing errors to NaN
df_oesm['TOT_EMP'] = pd.to_numeric(df_oesm['TOT_EMP'], errors='coerce')
df_oesm['A_MEAN'] = pd.to_numeric(df_oesm['A_MEAN'], errors='coerce')
# Drop rows with missing data in critical columns
df_oesm.dropna(subset=['TOT_EMP', 'A_MEAN', 'onetsoc_major'], inplace=True)
# Calculate the wage bill for each occupation
df_oesm['wage_bill'] = df_oesm['TOT_EMP'] * df_oesm['A_MEAN']
# Aggregate the wage bill by major occupation group
df_wage_bill_major = df_oesm.groupby('onetsoc_major')['wage_bill'].sum().reset_index()
# Map the major codes to readable titles for plotting
df_wage_bill_major['OCC_TITLE_MAJOR'] = df_wage_bill_major['onetsoc_major'].map(OCCUPATION_MAJOR_CODES)
df_wage_bill_major.dropna(subset=['OCC_TITLE_MAJOR'], inplace=True) # Drop military/unmapped codes
# Sort by wage bill for a more informative plot
df_wage_bill_major = df_wage_bill_major.sort_values('wage_bill', ascending=False)
if df_wage_bill_major.empty:
logging.warning("No data available to generate the wage bill plot.")
return None
# --- Plotting ---
plt.figure(figsize=(12, 10))
ax = sns.barplot(x='wage_bill', y='OCC_TITLE_MAJOR', data=df_wage_bill_major, palette="viridis", orient='h')
ax.set_title('Total Wage Bill per Major Occupation Group', fontsize=16, pad=15)
ax.set_xlabel('Total Wage Bill (in USD)', fontsize=12)
ax.set_ylabel('Major Occupation Group', fontsize=12)
ax.grid(axis='x', linestyle='--', alpha=0.7)
# Format the x-axis to be more readable (e.g., "$2.0T" for trillions)
def format_billions(x, pos):
if x >= 1e12:
return f'${x*1e-12:.1f}T'
if x >= 1e9:
return f'${x*1e-9:.0f}B'
return f'${x*1e-6:.0f}M'
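# E.g. format_billions(2.3e12, None) -> '$2.3T', format_billions(4.5e11, None) -> '$450B',
# and format_billions(7.2e8, None) -> '$720M'.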
ax.xaxis.set_major_formatter(mticker.FuncFormatter(format_billions))
plt.tight_layout()
# --- File Saving ---
temp_dir = tempfile.gettempdir()
temp_path = Path(temp_dir) / "wage_bill_by_occupation.png"
plt.savefig(temp_path, dpi=300)
logging.info(f"Successfully saved plot to temporary file: {temp_path}")
return temp_path
except Exception as e:
logging.error(f"An error occurred while generating the wage bill plot: {e}", exc_info=True)
return None
finally:
plt.close()
if conn:
conn.close()