old
This commit is contained in:
parent
720f21a85b
commit
43076bcbb1
42 changed files with 237415 additions and 7831 deletions
119
analysis/generators/estimate_lower_vs_upper_bounds.py
Normal file
119
analysis/generators/estimate_lower_vs_upper_bounds.py
Normal file
|
@ -0,0 +1,119 @@
|
|||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
from pathlib import Path
|
||||
import tempfile
|
||||
import logging
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
# Human-readable labels for the O*NET major occupation groups, keyed by the
# two-digit major group code (stored as a string). The table is duplicated
# from the other generators so that this module stays self-contained.
OCCUPATION_MAJOR_CODES = {
    "11": "Management",
    "13": "Business & Financial",
    "15": "Computer & Mathematical",
    "17": "Architecture & Engineering",
    "19": "Life, Physical, & Social Science",
    "21": "Community & Social Service",
    "23": "Legal",
    "25": "Education, Training, & Library",
    "27": "Arts, Design, & Media",
    "29": "Healthcare Practitioners",
    "31": "Healthcare Support",
    "33": "Protective Service",
    "35": "Food Preparation & Serving",
    "37": "Building & Grounds Maintenance",
    "39": "Personal Care & Service",
    "41": "Sales & Related",
    "43": "Office & Admin Support",
    "45": "Farming, Fishing, & Forestry",
    "47": "Construction & Extraction",
    "49": "Installation, Maintenance, & Repair",
    "51": "Production",
    "53": "Transportation & Material Moving",
    "55": "Military Specific",
}
|
||||
|
||||
|
||||
def generate(processed_df: pd.DataFrame):
    """
    Generate a log-log scatter plot of lower vs. upper time estimates.

    This corresponds to 'cell3' from the original analysis notebook. It helps
    visualize the relationship and spread between the lower and upper bounds
    of time estimates across different occupation groups. Dashed/dotted
    reference lines mark where the upper bound equals 1x, 2x, 10x and 100x
    the lower bound.

    Args:
        processed_df (pd.DataFrame): The preprocessed data. Expected columns:
            'lb_estimate_in_minutes', 'ub_estimate_in_minutes',
            'onetsoc_major'. The input frame is not modified.

    Returns:
        Path: The path to the generated temporary image file, or None when
        required columns are missing, no row has strictly positive lower and
        upper estimates, or plotting/saving fails.
    """
    logging.info("Generating plot of lower vs. upper time estimates...")

    # --- Data Validation and Preparation ---
    lb_col = 'lb_estimate_in_minutes'
    ub_col = 'ub_estimate_in_minutes'
    required_cols = [lb_col, ub_col, 'onetsoc_major']

    missing_cols = [col for col in required_cols if col not in processed_df.columns]
    if missing_cols:
        logging.error(
            "Missing one or more required columns: %s (missing: %s). Cannot generate plot.",
            required_cols,
            missing_cols,
        )
        return None

    # For log scaling, both lower and upper bounds must be positive.
    # Filter first and copy only the needed columns afterwards: this avoids
    # duplicating the whole input frame and makes the later column assignment
    # operate on an independent frame (no SettingWithCopyWarning).
    positive = (processed_df[lb_col] > 0) & (processed_df[ub_col] > 0)
    df = processed_df.loc[positive, required_cols].copy()
    if df.empty:
        logging.warning("No data with positive lower and upper estimates available to plot.")
        return None

    # Replace the major code with its readable label for the hue legend.
    # Codes are normalised to strings so integer-typed codes still match the
    # string keys; an unmapped code would otherwise yield a missing label and
    # its rows would not appear under any legend entry.
    codes = df['onetsoc_major'].astype(str)
    df['occupation_label'] = codes.map(OCCUPATION_MAJOR_CODES).fillna('Unknown (' + codes + ')')

    # --- Plotting ---
    fig = None
    try:
        # Keep an explicit figure handle so saving and closing always target
        # this figure rather than whatever pyplot considers "current".
        fig, ax = plt.subplots(figsize=(12, 10))
        sns.scatterplot(
            data=df,
            x=lb_col,
            y=ub_col,
            alpha=0.2,
            edgecolor=None,
            hue="occupation_label",  # Use the labeled column for the legend
            ax=ax,
        )

        # Shared limits for both axes (square plot) spanning the full range
        # of both columns; also used as the extent of the reference lines.
        bounds = df[[lb_col, ub_col]].to_numpy()
        low, high = bounds.min(), bounds.max()
        if low == high:
            # Every estimate is identical: widen the range so the axes and
            # the reference lines are not degenerate.
            low, high = low / 2, high * 2
        lims = (low, high)

        # 45-degree reference line
        ax.plot(lims, lims, color='black', linestyle='--', linewidth=1, label='Upper = Lower')

        # Add helper lines for constant ratios (2x, 10x, 100x)
        for ratio in (2, 10, 100):
            ax.plot(lims, [ratio * value for value in lims],
                    linestyle=':', color='grey', linewidth=0.8, label=f'Upper = {ratio}x Lower')

        ax.set(xscale='log', yscale='log', xlim=lims, ylim=lims)
        ax.set_xlabel('Lower-bound Estimate (minutes, log scale)', fontsize=12)
        ax.set_ylabel('Upper-bound Estimate (minutes, log scale)', fontsize=12)
        ax.set_title('Lower vs. Upper Time Estimates for All Tasks', fontsize=16)

        # Place the legend outside the plot to avoid obscuring data
        ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', title='Occupation / Ratio')

        # --- File Saving ---
        # NOTE: the file name is fixed, so each call overwrites the previous
        # image in the system temp directory.
        temp_path = Path(tempfile.gettempdir()) / "estimate_lower_vs_upper_bounds.png"

        # Use bbox_inches='tight' to ensure the external legend is included in the saved image.
        fig.savefig(temp_path, dpi=300, bbox_inches='tight')
        logging.info("Successfully saved plot to temporary file: %s", temp_path)

        return temp_path

    except Exception as e:
        # Top-level boundary for this generator: log with traceback and
        # signal failure to the caller via None.
        logging.error("An error occurred while generating the plot: %s", e, exc_info=True)
        return None
    finally:
        if fig is not None:
            plt.close(fig)
|
Loading…
Add table
Add a link
Reference in a new issue