# File listing metadata (from the source viewer): 112 lines, 4.1 KiB, Python.
import logging
import tempfile
from pathlib import Path
from typing import Optional

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Display labels for the 2-digit major group codes of the O*NET SOC 2018
# structure. Keys are the codes as strings (e.g. '11'); values are the
# human-readable labels shown on plot axes. Codes that are not listed here
# are displayed as-is by the plotting code.
OCCUPATION_MAJOR_CODES = {
    '11': 'Management',
    '13': 'Business & Financial',
    '15': 'Computer & Mathematical',
    '17': 'Architecture & Engineering',
    '19': 'Life, Physical, & Social Science',
    '21': 'Community & Social Service',
    '23': 'Legal',
    '25': 'Education, Training, & Library',
    '27': 'Arts, Design, & Media',
    '29': 'Healthcare Practitioners',
    '31': 'Healthcare Support',
    '33': 'Protective Service',
    '35': 'Food Preparation & Serving',
    '37': 'Building & Grounds Maintenance',
    '39': 'Personal Care & Service',
    '41': 'Sales & Related',
    '43': 'Office & Admin Support',
    '45': 'Farming, Fishing, & Forestry',
    '47': 'Construction & Extraction',
    '49': 'Installation, Maintenance, & Repair',
    '51': 'Production',
    '53': 'Transportation & Material Moving',
    '55': 'Military Specific',
}


def generate(processed_df: pd.DataFrame) -> Optional[Path]:
    """
    Generate a box plot showing the spread of time-range estimates per occupation.

    This corresponds to 'cell2' from the original analysis notebook. It visualizes
    the distribution of the difference between upper and lower time estimates for
    each major occupational group, on a logarithmic y-axis with outliers hidden.

    Rows whose range (upper - lower) is not strictly positive, or whose estimates
    cannot be interpreted as numbers, are dropped before plotting.

    Args:
        processed_df (pd.DataFrame): The preprocessed data. Expected columns:
            'lb_estimate_in_minutes',
            'ub_estimate_in_minutes', 'onetsoc_major'.

    Returns:
        Path: The path to the generated image file inside the system temporary
        directory, or None on failure (missing input, missing columns, no
        plottable rows, or an error while plotting/saving). The file name is
        fixed, so each successful call overwrites the previous image.
    """
    logging.info("Generating plot of time estimate spread by occupation...")

    # --- Data Validation and Preparation ---
    if processed_df is None:
        logging.error("No DataFrame provided. Cannot generate plot.")
        return None

    required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'onetsoc_major']
    if not set(required_cols).issubset(processed_df.columns):
        logging.error(
            "Missing one or more required columns: %s. Cannot generate plot.",
            required_cols,
        )
        return None

    # Work on a copy of just the needed columns so the caller's frame is untouched.
    df = processed_df[required_cols].copy()

    # Coerce to numeric so stray non-numeric entries become NaN (and are filtered
    # out below) instead of raising during the subtraction.
    lower = pd.to_numeric(df['lb_estimate_in_minutes'], errors='coerce')
    upper = pd.to_numeric(df['ub_estimate_in_minutes'], errors='coerce')
    df['estimate_range'] = upper - lower

    # For log scaling, we need positive values. NaN compares False here, so
    # rows with unusable estimates are dropped together with non-positive ranges.
    df = df[df['estimate_range'] > 0]
    if df.empty:
        logging.warning("No data with a positive estimate range available to plot.")
        return None

    # --- Plotting ---
    fig = None  # Tracked so that `finally` closes only the figure created here.
    try:
        # Sort by the major code to ensure a consistent plot order.
        df = df.sort_values('onetsoc_major')

        fig, ax = plt.subplots(figsize=(14, 10))
        sns.boxplot(
            data=df,
            x='onetsoc_major',
            y='estimate_range',
            showfliers=False,  # Outliers are excluded for a clearer view of the main distribution
            ax=ax,
        )

        ax.set_yscale('log')  # The long tail of the data makes a log scale more readable
        ax.set_xlabel('Occupation Major Group', fontsize=12)
        ax.set_ylabel('Time Estimate Range (upper - lower, in minutes, log scale)', fontsize=12)
        ax.set_title('Spread of Time-Range Estimates by Occupation', fontsize=16)

        # Replace numeric x-tick labels (e.g., '11', '15') with meaningful text
        # labels; unknown codes keep their original text. Tick positions are
        # pinned first, as matplotlib requires fixed ticks before relabelling.
        tick_texts = [tick.get_text() for tick in ax.get_xticklabels()]
        ax.set_xticks(ax.get_xticks())
        ax.set_xticklabels(
            [OCCUPATION_MAJOR_CODES.get(text, text) for text in tick_texts],
            rotation=60,
            ha='right',  # Align rotated labels correctly
        )

        fig.tight_layout()

        # --- File Saving ---
        temp_path = Path(tempfile.gettempdir()) / "time_estimate_spread_by_occupation.png"
        fig.savefig(temp_path, dpi=300, bbox_inches='tight')
        logging.info("Successfully saved plot to temporary file: %s", temp_path)

        return temp_path

    except Exception as e:
        # Top-level boundary: log with traceback and honour the None-on-failure contract.
        logging.error("An error occurred while generating the plot: %s", e, exc_info=True)
        return None
    finally:
        if fig is not None:
            plt.close(fig)