119 lines
4.6 KiB
Python
119 lines
4.6 KiB
Python
import seaborn as sns
|
|
import matplotlib.pyplot as plt
|
|
from pathlib import Path
|
|
import tempfile
|
|
import logging
|
|
import pandas as pd
|
|
import numpy as np
|
|
|
|
# Lookup table from O*NET major occupation group code (the two-digit string
# key) to the human-readable label shown in plot legends.
# Copied from other generators for modularity, so this module does not have
# to import them.
OCCUPATION_MAJOR_CODES = {
    "11": "Management",
    "13": "Business & Financial",
    "15": "Computer & Mathematical",
    "17": "Architecture & Engineering",
    "19": "Life, Physical, & Social Science",
    "21": "Community & Social Service",
    "23": "Legal",
    "25": "Education, Training, & Library",
    "27": "Arts, Design, & Media",
    "29": "Healthcare Practitioners",
    "31": "Healthcare Support",
    "33": "Protective Service",
    "35": "Food Preparation & Serving",
    "37": "Building & Grounds Maintenance",
    "39": "Personal Care & Service",
    "41": "Sales & Related",
    "43": "Office & Admin Support",
    "45": "Farming, Fishing, & Forestry",
    "47": "Construction & Extraction",
    "49": "Installation, Maintenance, & Repair",
    "51": "Production",
    "53": "Transportation & Material Moving",
    "55": "Military Specific",
}
|
|
|
|
|
|
def _occupation_label(code):
    """Return the legend label for a single major occupation group code.

    Codes are looked up in ``OCCUPATION_MAJOR_CODES`` by their string form, so
    integer codes (``11``) and integral floats (``11.0``) resolve the same way
    as the string ``'11'``. Codes that are not in the table are shown as-is,
    and missing codes become ``'Unknown'``, so that no row ends up with a
    missing hue value.
    """
    if pd.isna(code):
        return 'Unknown'
    if isinstance(code, float) and code.is_integer():
        code = int(code)
    text = str(code).strip()
    return OCCUPATION_MAJOR_CODES.get(text, text)


def generate(processed_df: pd.DataFrame) -> "Path | None":
    """
    Generates a scatter plot comparing lower vs. upper time estimates for tasks.

    This corresponds to 'cell3' from the original analysis notebook. It helps
    visualize the relationship and spread between the lower and upper bounds
    of time estimates across different occupation groups. Both axes use a log
    scale, a dashed line marks ``upper == lower`` and dotted lines mark the
    2x, 10x and 100x ratios.

    Args:
        processed_df (pd.DataFrame): The preprocessed data. Expected columns:
                                     'lb_estimate_in_minutes',
                                     'ub_estimate_in_minutes', 'onetsoc_major'.
                                     The frame is not modified. Rows whose
                                     bounds are non-numeric, missing,
                                     non-positive or infinite are left out of
                                     the plot.

    Returns:
        Path: The path to the generated temporary image file, or None on failure
        (invalid input, missing columns, no plottable rows, or a plotting error).
    """
    logging.info("Generating plot of lower vs. upper time estimates...")

    # --- Data Validation and Preparation ---
    if not isinstance(processed_df, pd.DataFrame):
        logging.error(
            "Expected a pandas DataFrame but received %s. Cannot generate plot.",
            type(processed_df).__name__,
        )
        return None

    required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'onetsoc_major']
    if not all(col in processed_df.columns for col in required_cols):
        logging.error("Missing one or more required columns: %s. Cannot generate plot.", required_cols)
        return None

    # Coerce to float so that stray strings become NaN instead of raising when
    # compared against 0 below.
    lower = pd.to_numeric(processed_df['lb_estimate_in_minutes'], errors='coerce').astype('float64')
    upper = pd.to_numeric(processed_df['ub_estimate_in_minutes'], errors='coerce').astype('float64')

    # For log scaling, both bounds must be positive. They must also be finite,
    # because an infinite value would end up in the axis limits.
    plottable = (lower > 0) & (upper > 0) & np.isfinite(lower) & np.isfinite(upper)

    # Build a fresh frame holding only what the plot needs; the caller's frame
    # is never touched and no full copy of it is made.
    df = pd.DataFrame({
        'lb_estimate_in_minutes': lower,
        'ub_estimate_in_minutes': upper,
        'onetsoc_major': processed_df['onetsoc_major'],
    })[plottable]
    if df.empty:
        logging.warning("No data with positive lower and upper estimates available to plot.")
        return None

    # Replace the major code with its readable label for the hue legend.
    # `assign` returns a new frame, which avoids writing into a filtered slice.
    df = df.assign(occupation_label=df['onetsoc_major'].map(_occupation_label))

    # --- Plotting ---
    fig = None
    try:
        # Work on an explicit figure/axes pair rather than pyplot's implicit
        # "current figure", so saving and closing always target this plot.
        fig, ax = plt.subplots(figsize=(12, 10))
        sns.scatterplot(
            data=df,
            x='lb_estimate_in_minutes',
            y='ub_estimate_in_minutes',
            alpha=0.2,
            edgecolor=None,
            hue='occupation_label',  # Use the labeled column for the legend
            ax=ax,
        )

        # Shared limits for both axes give a square plot on which the 45°
        # reference line runs corner to corner.
        bounds = df[['lb_estimate_in_minutes', 'ub_estimate_in_minutes']]
        low = bounds.min().min()
        high = bounds.max().max()
        if low == high:
            # Identical limits are degenerate on a log axis; widen them.
            low, high = low / 2, high * 2
        lims = (low, high)
        ax.plot(lims, lims, color='black', linestyle='--', linewidth=1, label='Upper = Lower')

        # Add helper lines for constant ratios (2x, 10x, 100x)
        for k in (2, 10, 100):
            ax.plot(lims, [k * bound for bound in lims],
                    linestyle=':', color='grey', linewidth=0.8, label=f'Upper = {k}x Lower')

        ax.set(xscale='log', yscale='log', xlim=lims, ylim=lims)
        ax.set_xlabel('Lower-bound Estimate (minutes, log scale)', fontsize=12)
        ax.set_ylabel('Upper-bound Estimate (minutes, log scale)', fontsize=12)
        ax.set_title('Lower vs. Upper Time Estimates for All Tasks', fontsize=16)

        # Place the legend outside the plot to avoid obscuring data
        ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', title='Occupation / Ratio')

        # --- File Saving ---
        # NOTE: the file name is fixed, so every call writes to the same path
        # inside the system temp directory and replaces the previous image.
        temp_path = Path(tempfile.gettempdir()) / "estimate_lower_vs_upper_bounds.png"

        # Use bbox_inches='tight' to ensure the external legend is included in the saved image.
        fig.savefig(temp_path, dpi=300, bbox_inches='tight')
        logging.info("Successfully saved plot to temporary file: %s", temp_path)

        return temp_path

    except Exception as e:
        logging.error("An error occurred while generating the plot: %s", e, exc_info=True)
        return None
    finally:
        # Close only the figure created here, and only if it was created.
        if fig is not None:
            plt.close(fig)
|