# sprint-econtai/analysis/generators/estimate_lower_vs_upper_bounds.py

import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import tempfile
import logging
import pandas as pd
import numpy as np

# Lookup table: two-character major occupation group code -> display label.
# Duplicated here (rather than imported from another generator) so this module
# stays self-contained. `generate` maps the `onetsoc_major` column through this
# table to build the legend labels.
OCCUPATION_MAJOR_CODES = {
    "11": "Management",
    "13": "Business & Financial",
    "15": "Computer & Mathematical",
    "17": "Architecture & Engineering",
    "19": "Life, Physical, & Social Science",
    "21": "Community & Social Service",
    "23": "Legal",
    "25": "Education, Training, & Library",
    "27": "Arts, Design, & Media",
    "29": "Healthcare Practitioners",
    "31": "Healthcare Support",
    "33": "Protective Service",
    "35": "Food Preparation & Serving",
    "37": "Building & Grounds Maintenance",
    "39": "Personal Care & Service",
    "41": "Sales & Related",
    "43": "Office & Admin Support",
    "45": "Farming, Fishing, & Forestry",
    "47": "Construction & Extraction",
    "49": "Installation, Maintenance, & Repair",
    "51": "Production",
    "53": "Transportation & Material Moving",
    "55": "Military Specific",
}


def generate(processed_df: pd.DataFrame) -> "Path | None":
    """
    Generates a scatter plot comparing lower vs. upper time estimates for tasks.

    This corresponds to 'cell3' from the original analysis notebook. It helps
    visualize the relationship and spread between the lower and upper bounds
    of time estimates across different occupation groups. Both axes are
    log-scaled, and dashed/dotted reference lines mark upper = lower and
    upper = 2x / 10x / 100x lower.

    Rows whose lower or upper estimate is non-numeric, missing, infinite, or
    not strictly positive are excluded, because they cannot be placed on a
    log axis. The input frame is never modified.

    Args:
        processed_df (pd.DataFrame): The preprocessed data. Expected columns:
                                     'lb_estimate_in_minutes',
                                     'ub_estimate_in_minutes', 'onetsoc_major'.

    Returns:
        Path: The path to the generated temporary image file, or None on failure
              (missing columns, no plottable rows, or a plotting/saving error).
    """
    logging.info("Generating plot of lower vs. upper time estimates...")

    # --- Data Validation and Preparation ---
    lb_col = 'lb_estimate_in_minutes'
    ub_col = 'ub_estimate_in_minutes'
    code_col = 'onetsoc_major'
    required_cols = [lb_col, ub_col, code_col]
    if not all(col in processed_df.columns for col in required_cols):
        logging.error(
            "Missing one or more required columns: %s. Cannot generate plot.",
            required_cols,
        )
        return None

    # Work on a private copy of only the columns we need, so the caller's
    # frame is untouched and unrelated columns are not duplicated.
    df = processed_df[required_cols].copy()

    # Coerce to numbers so that stray text values become NaN and are filtered
    # out below instead of raising on the comparison.
    for col in (lb_col, ub_col):
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # For log scaling, both bounds must be positive; they must also be finite,
    # otherwise the axis limits computed from them would be invalid.
    bounds = df[[lb_col, ub_col]]
    plottable = ((bounds > 0) & (bounds < np.inf)).all(axis=1)
    # Copy after filtering so the column assignment below targets an
    # independent frame rather than a slice.
    df = df.loc[plottable].copy()
    if df.empty:
        logging.warning("No data with positive lower and upper estimates available to plot.")
        return None

    # Replace the major code with its readable label for the hue legend.
    # Codes missing from the lookup table would otherwise get a NaN hue and
    # be silently left out of the plot, so they fall back to a label that
    # carries the raw code.
    labels = df[code_col].map(OCCUPATION_MAJOR_CODES).astype(object)
    fallback = 'Other (' + df[code_col].astype(str) + ')'
    df['occupation_label'] = labels.where(labels.notna(), fallback)

    # --- Plotting ---
    fig = None
    try:
        # Hold an explicit figure handle so that saving and cleanup act on
        # this figure only, never on whatever pyplot considers "current".
        fig, ax = plt.subplots(figsize=(12, 10))
        sns.scatterplot(
            data=df,
            x=lb_col,
            y=ub_col,
            alpha=0.2,
            edgecolor=None,
            hue='occupation_label',  # Use the labeled column for the legend
            ax=ax,
        )

        # Determine limits for the 45° reference line.
        # Use the overall min/max of both columns to create a square plot.
        low = df[[lb_col, ub_col]].min().min()
        high = df[[lb_col, ub_col]].max().max()
        if low == high:
            # A single distinct value would give identical (singular) axis
            # limits; widen symmetrically in log space.
            low, high = low / 2, high * 2
        lims = (low, high)
        ax.plot(lims, lims, color='black', linestyle='--', linewidth=1, label='Upper = Lower')

        # Add helper lines for constant ratios (2x, 10x, 100x)
        for k in (2, 10, 100):
            ax.plot(lims, [k * bound for bound in lims],
                    linestyle=':', color='grey', linewidth=0.8, label=f'Upper = {k}x Lower')

        ax.set(xscale='log', yscale='log', xlim=lims, ylim=lims)
        ax.set_xlabel('Lower-bound Estimate (minutes, log scale)', fontsize=12)
        ax.set_ylabel('Upper-bound Estimate (minutes, log scale)', fontsize=12)
        ax.set_title('Lower vs. Upper Time Estimates for All Tasks', fontsize=16)

        # Place the legend outside the plot to avoid obscuring data
        ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', title='Occupation / Ratio')

        # --- File Saving ---
        # NOTE: the file name is fixed, so each call overwrites the image
        # written by the previous call in the same temp directory.
        temp_path = Path(tempfile.gettempdir()) / "estimate_lower_vs_upper_bounds.png"

        # Use bbox_inches='tight' to ensure the external legend is included in the saved image.
        fig.savefig(temp_path, dpi=300, bbox_inches='tight')
        logging.info("Successfully saved plot to temporary file: %s", temp_path)

        return temp_path

    except Exception as e:
        # Top-level boundary for this generator: log with traceback and
        # signal failure through the documented None return.
        logging.error("An error occurred while generating the plot: %s", e, exc_info=True)
        return None
    finally:
        if fig is not None:
            plt.close(fig)