161 lines
6.2 KiB
Python
161 lines
6.2 KiB
Python
import pandas as pd
|
|
import matplotlib.pyplot as plt
|
|
import matplotlib.ticker as mtick
|
|
import matplotlib.colors as mcolors
|
|
from pathlib import Path
|
|
import tempfile
|
|
import logging
|
|
|
|
# Lookup table from the 2-digit O*NET major group code (stored as a string)
# to the display label shown on the plot's y-axis. `generate` iterates the
# codes in this insertion order, so keep the entries sorted by code.
OCCUPATION_MAJOR_CODES = {
    "11": "Management",
    "13": "Business & Financial",
    "15": "Computer & Mathematical",
    "17": "Architecture & Engineering",
    "19": "Life, Physical, & Social Science",
    "21": "Community & Social Service",
    "23": "Legal",
    "25": "Education, Training, & Library",
    "27": "Arts, Design, & Media",
    "29": "Healthcare Practitioners",
    "31": "Healthcare Support",
    "33": "Protective Service",
    "35": "Food Preparation & Serving",
    "37": "Building & Grounds Maintenance",
    "39": "Personal Care & Service",
    "41": "Sales & Related",
    "43": "Office & Admin Support",
    "45": "Farming, Fishing, & Forestry",
    "47": "Construction & Extraction",
    "49": "Installation, Maintenance, & Repair",
    "51": "Production",
    "53": "Transportation & Material Moving",
    "55": "Military Specific",
}
|
|
|
|
# Fill colors for the stacked bar segments, chosen to match the palette of
# the original notebook. The list is passed to the plot positionally, so its
# order must follow the plotted columns:
# 'Not Remote', 'Remote & Estimable', 'Remote & Not Estimable'.
BAR_COLORS = [
    "#D1D5DB",  # gray-300
    "#84CC16",  # lime-500
    "#D9F99D",  # lime-200
]
|
|
|
|
|
|
def _get_contrasting_text_color(bg_color_hex):
    """
    Choose 'black' or 'white' label text for the given background color.

    The color is converted to RGBA via matplotlib, and a weighted sum of its
    red, green and blue channels (weights 0.2126 / 0.7152 / 0.0722) is
    compared with a 0.55 cut-off: brighter backgrounds get black text,
    darker ones get white text.

    Args:
        bg_color_hex: A color specification understood by
            `matplotlib.colors.to_rgba` (the callers in this file pass hex
            strings).

    Returns:
        str: 'black' or 'white'. A color rejected with ValueError falls back
        to 'black'.
    """
    try:
        red, green, blue, _alpha = mcolors.to_rgba(bg_color_hex)
    except ValueError:
        # Unparseable color: black is the fallback.
        return 'black'

    brightness = 0.2126 * red + 0.7152 * green + 0.0722 * blue
    if brightness > 0.55:
        return 'black'
    return 'white'
|
|
|
|
|
|
def _summarize_task_breakdown(processed_df):
    """
    Compute, per known occupation group, the percentage of tasks in each category.

    Only rows whose 'onetsoc_major' value equals one of the string keys of
    OCCUPATION_MAJOR_CODES are used. Each task is counted as not remote
    ('remote_status' != 'remote'), remote and estimable ('estimateable' ==
    'ATOMIC'), or remote and not estimable ('estimateable' ==
    'ONGOING-CONSTRAINT'). Remote tasks with any other 'estimateable' value
    count towards the occupation total only, so a row may sum to less than 100.

    Returns:
        pd.DataFrame | None: Percentages indexed by occupation label with the
        columns 'Not Remote', 'Remote & Estimable', 'Remote & Not Estimable',
        sorted by 'Not Remote' descending; None if no known occupation has tasks.
    """
    is_remote = processed_df['remote_status'] == 'remote'
    estimability = processed_df['estimateable']

    # One flag per task and category; summing the flags per occupation gives
    # all category counts in a single pass. `assign` builds a new frame, so
    # the caller's DataFrame is never modified.
    flags = processed_df[['onetsoc_major']].assign(
        total_tasks=1,
        count_not_remote=~is_remote,
        count_remote_atomic=is_remote & (estimability == 'ATOMIC'),
        count_remote_ongoing=is_remote & (estimability == 'ONGOING-CONSTRAINT'),
    )
    # sort=False: row order is imposed below from OCCUPATION_MAJOR_CODES.
    # observed=True: skip unused categories if the column is categorical.
    counts = flags.groupby('onetsoc_major', sort=False, observed=True).sum()

    # Keep known codes only, in the mapping's order.
    present_codes = [code for code in OCCUPATION_MAJOR_CODES if code in counts.index]
    if not present_codes:
        return None
    counts = counts.loc[present_codes]

    count_columns = ['count_not_remote', 'count_remote_atomic', 'count_remote_ongoing']
    plot_df = counts[count_columns].div(counts['total_tasks'], axis=0) * 100
    plot_df.columns = ['Not Remote', 'Remote & Estimable', 'Remote & Not Estimable']
    plot_df.index = pd.Index(
        [OCCUPATION_MAJOR_CODES[code] for code in present_codes],
        name='occupation_label',
    )
    return plot_df.sort_values(by='Not Remote', ascending=False)


def _draw_task_breakdown(ax, plot_df):
    """Draw the stacked horizontal bars, axis styling, in-bar labels and legend on `ax`."""
    plot_df.plot(kind='barh', stacked=True, ax=ax, color=BAR_COLORS, width=0.8)

    ax.set_xlabel("Percentage of Tasks", fontsize=12)
    ax.set_ylabel("Occupation Major Group", fontsize=12)
    ax.set_title("Task Breakdown by Occupation, Remote Status, and Estimability", fontsize=16, pad=20)
    ax.xaxis.set_major_formatter(mtick.PercentFormatter())
    ax.set_xlim(0, 100)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)

    # One container per plotted column, in the same order as BAR_COLORS.
    for container, bar_color in zip(ax.containers, BAR_COLORS):
        text_color = _get_contrasting_text_color(bar_color)
        for patch in container.patches:
            width = patch.get_width()
            if width <= 3:
                # Segments of 3% or less are too narrow to hold a label.
                continue
            ax.text(
                patch.get_x() + width / 2,
                patch.get_y() + patch.get_height() / 2,
                f"{width:.1f}%",
                ha='center', va='center',
                fontsize=8, color=text_color, fontweight='medium',
            )

    ax.legend(title="Task Category", bbox_to_anchor=(1.02, 1), loc='upper left', frameon=False)


def generate(processed_df: pd.DataFrame):
    """
    Generates a stacked bar chart breaking down tasks by remote status and estimability.

    This corresponds to 'cell10' from the original analysis notebook. It shows,
    for each occupation, the percentage of tasks that are not remote, remote and
    estimable, or remote and not estimable.

    Args:
        processed_df (pd.DataFrame): The preprocessed data. Expected columns:
            'onetsoc_major', 'remote_status', 'estimateable'. The
            'onetsoc_major' values are matched against the string keys of
            OCCUPATION_MAJOR_CODES. The frame is not modified.

    Returns:
        Path: The path to the generated image file, or None when a required
        column is missing, no known occupation has tasks, or plotting/saving
        fails. The file is always "task_breakdown_by_occupation.png" in the
        system temp directory, so each call overwrites the previous image.
    """
    logging.info("Generating task breakdown by occupation plot...")

    # --- Data Validation ---
    required_cols = ['onetsoc_major', 'remote_status', 'estimateable']
    if not all(col in processed_df.columns for col in required_cols):
        logging.error("Missing one or more required columns: %s. Cannot generate plot.", required_cols)
        return None

    # --- Data Summarization ---
    plot_df = _summarize_task_breakdown(processed_df)
    if plot_df is None:
        logging.warning("No data available to generate the task breakdown plot.")
        return None

    # --- Plotting & File Saving ---
    # Track the figure explicitly so that saving and cleanup act on the figure
    # created here, never on whatever pyplot considers the "current" figure.
    fig = None
    try:
        fig, ax = plt.subplots(figsize=(14, 10))
        _draw_task_breakdown(ax, plot_df)

        temp_path = Path(tempfile.gettempdir()) / "task_breakdown_by_occupation.png"
        fig.savefig(temp_path, dpi=300, bbox_inches='tight')
        logging.info("Successfully saved plot to temporary file: %s", temp_path)
        return temp_path
    except Exception as e:
        # Boundary handler: callers rely on None instead of an exception.
        logging.error("An error occurred while generating the plot: %s", e, exc_info=True)
        return None
    finally:
        if fig is not None:
            plt.close(fig)
|