# sprint-econtai/analysis/generators/task_breakdown_by_occupation.py

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.colors as mcolors
from pathlib import Path
import tempfile
import logging

# Display names for the 2-digit occupation major group codes.
# `generate` compares each key against the 'onetsoc_major' column and uses
# the matching value as that group's label on the plot's y-axis.
OCCUPATION_MAJOR_CODES = {
    "11": "Management",
    "13": "Business & Financial",
    "15": "Computer & Mathematical",
    "17": "Architecture & Engineering",
    "19": "Life, Physical, & Social Science",
    "21": "Community & Social Service",
    "23": "Legal",
    "25": "Education, Training, & Library",
    "27": "Arts, Design, & Media",
    "29": "Healthcare Practitioners",
    "31": "Healthcare Support",
    "33": "Protective Service",
    "35": "Food Preparation & Serving",
    "37": "Building & Grounds Maintenance",
    "39": "Personal Care & Service",
    "41": "Sales & Related",
    "43": "Office & Admin Support",
    "45": "Farming, Fishing, & Forestry",
    "47": "Construction & Extraction",
    "49": "Installation, Maintenance, & Repair",
    "51": "Production",
    "53": "Transportation & Material Moving",
    "55": "Military Specific",
}

# Segment colors, chosen to match the original notebook's palette (hex codes
# for gray and lime shades). `generate` passes this list to the stacked bar
# plot, so the order follows the plotted columns:
# 'Not Remote', 'Remote & Estimable', 'Remote & Not Estimable'.
BAR_COLORS = [
    "#D1D5DB",  # gray-300
    "#84CC16",  # lime-500
    "#D9F99D",  # lime-200
]


def _get_contrasting_text_color(bg_color_hex):
    """
    Choose 'black' or 'white' as the text color to draw on top of `bg_color_hex`.

    The background color is parsed with matplotlib and a weighted sum of its
    red, green and blue channels is compared against a 0.55 cut-off: lighter
    backgrounds get black text, darker ones get white text. If matplotlib
    rejects the color with a ValueError, 'black' is returned.
    """
    try:
        red, green, blue = mcolors.to_rgba(bg_color_hex)[:3]
    except ValueError:
        # Invalid color specification: fall back to black text.
        return 'black'

    # Luminance-style weighting of the channels; the alpha channel is ignored.
    brightness = 0.2126 * red + 0.7152 * green + 0.0722 * blue
    if brightness > 0.55:
        return 'black'
    return 'white'


def generate(processed_df: pd.DataFrame):
    """
    Generates a stacked bar chart breaking down tasks by remote status and estimability.

    This corresponds to 'cell10' from the original analysis notebook. For each
    occupation major group in OCCUPATION_MAJOR_CODES that has at least one row,
    it shows the percentage of that group's tasks that are:

      * Not Remote:             'remote_status' is not 'remote'
      * Remote & Estimable:     'remote_status' is 'remote' and
                                'estimateable' is 'ATOMIC'
      * Remote & Not Estimable: 'remote_status' is 'remote' and
                                'estimateable' is 'ONGOING-CONSTRAINT'

    Remote tasks with any other 'estimateable' value still count towards the
    group's total, so a bar can add up to less than 100%.

    The chart is written to "task_breakdown_by_occupation.png" in the system
    temporary directory, overwriting any existing file of that name.

    Args:
        processed_df (pd.DataFrame): The preprocessed data. Expected columns:
                                     'onetsoc_major', 'remote_status', 'estimateable'.

    Returns:
        Path: The path to the generated temporary image file, or None on failure
              (no input, missing columns, no matching rows, or a plotting error).
    """
    logging.info("Generating task breakdown by occupation plot...")

    # --- Data Validation ---
    if processed_df is None:
        # Keep the "None on failure" contract instead of raising AttributeError.
        logging.error("No input data provided (got None). Cannot generate plot.")
        return None

    required_cols = ['onetsoc_major', 'remote_status', 'estimateable']
    if not all(col in processed_df.columns for col in required_cols):
        logging.error(
            "Missing one or more required columns: %s. Cannot generate plot.",
            required_cols,
        )
        return None

    # The input is only filtered (never mutated), so no defensive copy is needed.
    df = processed_df

    # --- Data Summarization ---
    summary_data = []
    for code, label in OCCUPATION_MAJOR_CODES.items():
        occ_df = df[df['onetsoc_major'] == code]
        total_tasks = len(occ_df)
        if total_tasks == 0:
            # Skip groups without data so they do not appear as empty bars
            # (and so the percentage division below never divides by zero).
            continue

        not_remote_count = len(occ_df[occ_df['remote_status'] != 'remote'])
        remote_df = occ_df[occ_df['remote_status'] == 'remote']
        remote_atomic_count = len(remote_df[remote_df['estimateable'] == 'ATOMIC'])
        remote_ongoing_count = len(remote_df[remote_df['estimateable'] == 'ONGOING-CONSTRAINT'])

        summary_data.append({
            'occupation_label': label,
            'count_not_remote': not_remote_count,
            'count_remote_atomic': remote_atomic_count,
            'count_remote_ongoing': remote_ongoing_count,
            'total_tasks': total_tasks,
        })

    if not summary_data:
        logging.warning("No data available to generate the task breakdown plot.")
        return None

    summary_df = pd.DataFrame(summary_data)

    # --- Percentage Calculation ---
    summary_df['pct_not_remote'] = (summary_df['count_not_remote'] / summary_df['total_tasks']) * 100
    summary_df['pct_remote_atomic'] = (summary_df['count_remote_atomic'] / summary_df['total_tasks']) * 100
    summary_df['pct_remote_ongoing'] = (summary_df['count_remote_ongoing'] / summary_df['total_tasks']) * 100

    plot_df = summary_df.set_index('occupation_label')[
        ['pct_not_remote', 'pct_remote_atomic', 'pct_remote_ongoing']
    ]
    # Column names double as legend entries; their order must match BAR_COLORS.
    plot_df.columns = ['Not Remote', 'Remote & Estimable', 'Remote & Not Estimable']
    plot_df = plot_df.sort_values(by='Not Remote', ascending=False)

    # --- Plotting ---
    # Track our own figure so saving and cleanup never touch whatever figure
    # happens to be pyplot's "current" one.
    fig = None
    try:
        fig, ax = plt.subplots(figsize=(14, 10))
        plot_df.plot(kind='barh', stacked=True, ax=ax, color=BAR_COLORS, width=0.8)

        ax.set_xlabel("Percentage of Tasks", fontsize=12)
        ax.set_ylabel("Occupation Major Group", fontsize=12)
        ax.set_title("Task Breakdown by Occupation, Remote Status, and Estimability", fontsize=16, pad=20)
        ax.xaxis.set_major_formatter(mtick.PercentFormatter())
        ax.set_xlim(0, 100)
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)

        # Add percentage labels inside each bar segment. Each container holds
        # the bars of one plotted column, in the same order as BAR_COLORS.
        for i, container in enumerate(ax.containers):
            text_color = _get_contrasting_text_color(BAR_COLORS[i])
            for patch in container.patches:
                width = patch.get_width()
                if width > 3:  # Only label segments wider than 3%
                    x = patch.get_x() + width / 2
                    y = patch.get_y() + patch.get_height() / 2
                    ax.text(x, y, f"{width:.1f}%", ha='center', va='center',
                            fontsize=8, color=text_color, fontweight='medium')

        # Place the legend outside the axes so it never covers the bars.
        ax.legend(title="Task Category", bbox_to_anchor=(1.02, 1), loc='upper left', frameon=False)

        # --- File Saving ---
        temp_dir = tempfile.gettempdir()
        temp_path = Path(temp_dir) / "task_breakdown_by_occupation.png"
        fig.savefig(temp_path, dpi=300, bbox_inches='tight')
        logging.info("Successfully saved plot to temporary file: %s", temp_path)

        return temp_path

    except Exception as e:
        # Boundary handler: log with traceback and report failure via None.
        logging.error("An error occurred while generating the plot: %s", e, exc_info=True)
        return None
    finally:
        # Close only the figure created here (if creation succeeded).
        if fig is not None:
            plt.close(fig)