# sprint-econtai/analysis/generators/task_breakdown_by_occupation.py
# Félix Dorn 43076bcbb1 old
# 2025-07-15 00:41:05 +02:00
#
# 161 lines
# 6.2 KiB
# Python

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.colors as mcolors
from pathlib import Path
import tempfile
import logging
# Lookup table from O*NET 2-digit major group code (as a string) to the
# human-readable label shown on the plot's y-axis. Insertion order is the
# order in which occupation groups are summarised.
OCCUPATION_MAJOR_CODES = {
    "11": "Management",
    "13": "Business & Financial",
    "15": "Computer & Mathematical",
    "17": "Architecture & Engineering",
    "19": "Life, Physical, & Social Science",
    "21": "Community & Social Service",
    "23": "Legal",
    "25": "Education, Training, & Library",
    "27": "Arts, Design, & Media",
    "29": "Healthcare Practitioners",
    "31": "Healthcare Support",
    "33": "Protective Service",
    "35": "Food Preparation & Serving",
    "37": "Building & Grounds Maintenance",
    "39": "Personal Care & Service",
    "41": "Sales & Related",
    "43": "Office & Admin Support",
    "45": "Farming, Fishing, & Forestry",
    "47": "Construction & Extraction",
    "49": "Installation, Maintenance, & Repair",
    "51": "Production",
    "53": "Transportation & Material Moving",
    "55": "Military Specific",
}
# Fill colours for the stacked bar segments, chosen to match the palette of
# the original notebook. The list order is the order in which the plotted
# columns receive their colour.
BAR_COLORS = [
    "#D1D5DB",  # gray-300
    "#84CC16",  # lime-500
    "#D9F99D",  # lime-200
]
def _get_contrasting_text_color(bg_color_hex):
    """
    Pick 'black' or 'white' as the label colour for a given background colour.

    The background is converted with ``mcolors.to_rgba`` and a weighted sum of
    its red, green and blue channels (weights 0.2126 / 0.7152 / 0.0722) is
    compared against a 0.55 threshold. The channel values are used as
    returned; no gamma linearisation is applied.

    Args:
        bg_color_hex: A colour specification accepted by ``mcolors.to_rgba``
            (callers in this module pass hex strings).

    Returns:
        str: 'black' when the weighted sum exceeds 0.55, otherwise 'white'.
        'black' is also returned when the colour cannot be parsed.
    """
    try:
        red, green, blue, _alpha = mcolors.to_rgba(bg_color_hex)
    except ValueError:
        # Unparseable colour: fall back to black text.
        return 'black'

    weighted_brightness = 0.2126 * red + 0.7152 * green + 0.0722 * blue
    if weighted_brightness > 0.55:
        return 'black'
    return 'white'
def generate(processed_df: pd.DataFrame):
    """
    Generate a stacked horizontal bar chart of task categories per occupation.

    This corresponds to 'cell10' from the original analysis notebook. For each
    occupation major group present in the data, it shows the percentage of
    tasks that are:

    * "Not Remote"             - ``remote_status`` is anything other than
      ``'remote'``;
    * "Remote & Estimable"     - ``remote_status == 'remote'`` and
      ``estimateable == 'ATOMIC'``;
    * "Remote & Not Estimable" - ``remote_status == 'remote'`` and
      ``estimateable == 'ONGOING-CONSTRAINT'``.

    Remote tasks whose ``estimateable`` value is neither of the two above are
    counted in the group total but in none of the segments, so a bar may sum
    to less than 100%.

    The image is always written to the same file name inside the system
    temporary directory, so a later call overwrites an earlier result.

    Args:
        processed_df (pd.DataFrame): The preprocessed data. Expected columns:
            'onetsoc_major', 'remote_status', 'estimateable'. The frame is
            only read, never modified.

    Returns:
        Path: The path to the generated temporary image file, or None when a
        required column is missing, no occupation group has any rows, or
        plotting/saving fails (the error is logged).
    """
    logging.info("Generating task breakdown by occupation plot...")

    # --- Data Validation ---
    required_cols = ['onetsoc_major', 'remote_status', 'estimateable']
    if not all(col in processed_df.columns for col in required_cols):
        logging.error(
            "Missing one or more required columns: %s. Cannot generate plot.",
            required_cols,
        )
        return None

    # --- Data Summarization ---
    plot_df = _summarize_task_breakdown(processed_df)
    if plot_df is None:
        logging.warning("No data available to generate the task breakdown plot.")
        return None

    # --- Plotting ---
    # Track the figure explicitly so that saving and cleanup act on *this*
    # figure rather than on whatever pyplot considers the current one.
    fig = None
    try:
        fig, ax = plt.subplots(figsize=(14, 10))
        plot_df.plot(kind='barh', stacked=True, ax=ax, color=BAR_COLORS, width=0.8)

        ax.set_xlabel("Percentage of Tasks", fontsize=12)
        ax.set_ylabel("Occupation Major Group", fontsize=12)
        ax.set_title(
            "Task Breakdown by Occupation, Remote Status, and Estimability",
            fontsize=16,
            pad=20,
        )
        ax.xaxis.set_major_formatter(mtick.PercentFormatter())
        ax.set_xlim(0, 100)
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)

        # Add percentage labels inside each bar segment. Containers are
        # paired with BAR_COLORS in order, the same list passed as `color`
        # to the plot call above.
        for container, segment_color in zip(ax.containers, BAR_COLORS):
            text_color = _get_contrasting_text_color(segment_color)
            for patch in container.patches:
                width = patch.get_width()
                if width > 3:  # Only label segments wider than 3%
                    x = patch.get_x() + width / 2
                    y = patch.get_y() + patch.get_height() / 2
                    ax.text(x, y, f"{width:.1f}%", ha='center', va='center',
                            fontsize=8, color=text_color, fontweight='medium')

        ax.legend(title="Task Category", bbox_to_anchor=(1.02, 1),
                  loc='upper left', frameon=False)

        # --- File Saving ---
        temp_path = Path(tempfile.gettempdir()) / "task_breakdown_by_occupation.png"
        fig.savefig(temp_path, dpi=300, bbox_inches='tight')
        logging.info("Successfully saved plot to temporary file: %s", temp_path)
        return temp_path
    except Exception as e:
        # Top-level boundary for the plotting step: log with traceback and
        # report failure to the caller through the None return value.
        logging.error("An error occurred while generating the plot: %s", e, exc_info=True)
        return None
    finally:
        # Close only the figure created here; if subplots() itself failed
        # there is nothing of ours to close.
        if fig is not None:
            plt.close(fig)


def _summarize_task_breakdown(df):
    """
    Build the per-occupation percentage table used by ``generate``.

    Args:
        df (pd.DataFrame): Frame containing 'onetsoc_major', 'remote_status'
            and 'estimateable' columns.

    Returns:
        pd.DataFrame: Indexed by occupation label, with columns 'Not Remote',
        'Remote & Estimable' and 'Remote & Not Estimable' holding percentages
        of each group's total task count, sorted by 'Not Remote' descending.
        None when no code in OCCUPATION_MAJOR_CODES matches any row.
    """
    # These row masks do not depend on the occupation code, so compute them
    # once instead of once per code.
    major = df['onetsoc_major']
    is_remote = df['remote_status'] == 'remote'
    is_remote_atomic = is_remote & (df['estimateable'] == 'ATOMIC')
    is_remote_ongoing = is_remote & (df['estimateable'] == 'ONGOING-CONSTRAINT')

    rows = []
    for code, label in OCCUPATION_MAJOR_CODES.items():
        in_group = major == code
        total_tasks = int(in_group.sum())
        if total_tasks == 0:
            continue
        rows.append({
            'occupation_label': label,
            'count_not_remote': int((in_group & ~is_remote).sum()),
            'count_remote_atomic': int((in_group & is_remote_atomic).sum()),
            'count_remote_ongoing': int((in_group & is_remote_ongoing).sum()),
            'total_tasks': total_tasks,
        })

    if not rows:
        return None

    summary_df = pd.DataFrame(rows).set_index('occupation_label')

    # --- Percentage Calculation ---
    column_labels = {
        'count_not_remote': 'Not Remote',
        'count_remote_atomic': 'Remote & Estimable',
        'count_remote_ongoing': 'Remote & Not Estimable',
    }
    pct_df = summary_df[list(column_labels)].div(summary_df['total_tasks'], axis=0) * 100
    pct_df = pct_df.rename(columns=column_labels)
    return pct_df.sort_values(by='Not Remote', ascending=False)