# sprint-econtai/analysis/generators/time_estimate_spread_by_occupation.py

import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import tempfile
import logging
import pandas as pd

# Display labels for the 2-digit major-group codes, following the
# O*NET SOC 2018 structure. Keys are the codes as strings; values are the
# human-readable names shown on plot axes. Entries are listed in ascending
# code order.
OCCUPATION_MAJOR_CODES = {
    "11": "Management",
    "13": "Business & Financial",
    "15": "Computer & Mathematical",
    "17": "Architecture & Engineering",
    "19": "Life, Physical, & Social Science",
    "21": "Community & Social Service",
    "23": "Legal",
    "25": "Education, Training, & Library",
    "27": "Arts, Design, & Media",
    "29": "Healthcare Practitioners",
    "31": "Healthcare Support",
    "33": "Protective Service",
    "35": "Food Preparation & Serving",
    "37": "Building & Grounds Maintenance",
    "39": "Personal Care & Service",
    "41": "Sales & Related",
    "43": "Office & Admin Support",
    "45": "Farming, Fishing, & Forestry",
    "47": "Construction & Extraction",
    "49": "Installation, Maintenance, & Repair",
    "51": "Production",
    "53": "Transportation & Material Moving",
    "55": "Military Specific",
}


def generate(processed_df: pd.DataFrame):
    """
    Generate a box plot of the time-estimate range per major occupation group.

    This corresponds to 'cell2' from the original analysis notebook. For every
    row the range ``ub_estimate_in_minutes - lb_estimate_in_minutes`` is
    computed; rows whose range is missing, non-numeric, or not strictly
    positive are dropped (the y-axis is logarithmic). The remaining ranges are
    drawn as one box per ``onetsoc_major`` value, with outliers hidden.

    The input frame is never modified. The image is written to a fixed file
    name inside the system temporary directory, so each call overwrites the
    output of the previous one.

    Args:
        processed_df (pd.DataFrame): The preprocessed data. Expected columns:
                                     'lb_estimate_in_minutes',
                                     'ub_estimate_in_minutes', 'onetsoc_major'.

    Returns:
        Path: The path to the generated temporary image file, or None when a
        required column is missing, no row has a positive range, or plotting
        or saving fails (the reason is logged).
    """
    logging.info("Generating plot of time estimate spread by occupation...")

    # --- Data Validation and Preparation ---
    required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'onetsoc_major']
    missing_cols = [col for col in required_cols if col not in processed_df.columns]
    if missing_cols:
        # Name the columns that are actually absent, not the whole requirement list.
        logging.error("Missing required column(s): %s. Cannot generate plot.", missing_cols)
        return None

    # Copy only the columns the plot needs; the caller's frame stays untouched.
    df = processed_df[required_cols].copy()
    total_rows = len(df)

    # Coerce to numeric so stray non-numeric entries become NaN (and are
    # dropped below) instead of raising out of a function that is documented
    # to return None on failure.
    lower = pd.to_numeric(df['lb_estimate_in_minutes'], errors='coerce')
    upper = pd.to_numeric(df['ub_estimate_in_minutes'], errors='coerce')
    df['estimate_range'] = upper - lower

    # For log scaling, we need positive values. NaN compares False and is
    # filtered out together with the non-positive ranges.
    df = df[df['estimate_range'] > 0]
    if df.empty:
        logging.warning("No data with a positive estimate range available to plot.")
        return None

    dropped_rows = total_rows - len(df)
    if dropped_rows:
        logging.debug(
            "Dropped %d of %d rows without a positive numeric estimate range.",
            dropped_rows,
            total_rows,
        )

    # --- Plotting ---
    fig = None  # Lets `finally` close exactly the figure created here, if any.
    try:
        # Sort by the major code to ensure a consistent plot order. Kept inside
        # the try so unsortable (mixed-type) codes are reported, not raised.
        df = df.sort_values('onetsoc_major')

        # Work on an explicit figure/axes rather than pyplot's implicit
        # "current" figure, so unrelated figures are never touched.
        fig, ax = plt.subplots(figsize=(14, 10))

        sns.boxplot(
            data=df,
            x='onetsoc_major',
            y='estimate_range',
            showfliers=False,  # Outliers are excluded for a clearer view of the main distribution
            ax=ax,
        )

        ax.set_yscale('log')  # The long tail of the data makes a log scale more readable
        ax.set_xlabel('Occupation Major Group', fontsize=12)
        ax.set_ylabel('Time Estimate Range (upper - lower, in minutes, log scale)', fontsize=12)
        ax.set_title('Spread of Time-Range Estimates by Occupation', fontsize=16)

        # Replace numeric x-tick labels (e.g., '11', '15') with meaningful text
        # labels; codes without a mapping keep their original text.
        tick_codes = [tick.get_text() for tick in ax.get_xticklabels()]
        # Pin the tick positions first: matplotlib warns when custom labels are
        # attached to an axis whose locator is not fixed.
        ax.set_xticks(ax.get_xticks())
        ax.set_xticklabels(
            [OCCUPATION_MAJOR_CODES.get(code, code) for code in tick_codes],
            rotation=60,
            ha='right',  # Align rotated labels correctly
        )

        fig.tight_layout()

        # --- File Saving ---
        temp_path = Path(tempfile.gettempdir()) / "time_estimate_spread_by_occupation.png"
        fig.savefig(temp_path, dpi=300, bbox_inches='tight')
        logging.info(f"Successfully saved plot to temporary file: {temp_path}")

        return temp_path

    except Exception as e:
        # Top-level boundary for this generator: log with traceback and signal
        # failure via None, as the docstring promises.
        logging.error(f"An error occurred while generating the plot: {e}", exc_info=True)
        return None
    finally:
        if fig is not None:
            plt.close(fig)