import logging
import tempfile
from pathlib import Path
from typing import Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Copied from other generators for modularity. This dictionary maps
# O*NET major occupation group codes to human-readable labels.
OCCUPATION_MAJOR_CODES = {
    '11': 'Management',
    '13': 'Business & Financial',
    '15': 'Computer & Mathematical',
    '17': 'Architecture & Engineering',
    '19': 'Life, Physical, & Social Science',
    '21': 'Community & Social Service',
    '23': 'Legal',
    '25': 'Education, Training, & Library',
    '27': 'Arts, Design, & Media',
    '29': 'Healthcare Practitioners',
    '31': 'Healthcare Support',
    '33': 'Protective Service',
    '35': 'Food Preparation & Serving',
    '37': 'Building & Grounds Maintenance',
    '39': 'Personal Care & Service',
    '41': 'Sales & Related',
    '43': 'Office & Admin Support',
    '45': 'Farming, Fishing, & Forestry',
    '47': 'Construction & Extraction',
    '49': 'Installation, Maintenance, & Repair',
    '51': 'Production',
    '53': 'Transportation & Material Moving',
    '55': 'Military Specific',
}


def _occupation_label(code) -> str:
    """Return the legend label for a single major occupation group code.

    The lookup table is keyed by two-character strings, so numeric codes
    (e.g. ``11`` or ``11.0``) are normalised to that form first. Codes that
    are missing or not in the table get an explicit fallback label instead of
    NaN, because rows with a missing hue value are dropped by seaborn and
    would silently disappear from the plot.
    """
    if pd.isna(code):
        return 'Unknown'
    if isinstance(code, (float, np.floating)) and float(code).is_integer():
        code = int(code)
    key = str(code).strip()
    return OCCUPATION_MAJOR_CODES.get(key, f'Other ({key})')


def generate(processed_df: pd.DataFrame) -> Optional[Path]:
    """
    Generates a scatter plot comparing lower vs. upper time estimates for tasks.

    This corresponds to 'cell3' from the original analysis notebook. It helps
    visualize the relationship and spread between the lower and upper bounds of
    time estimates across different occupation groups. Both axes are
    log-scaled, and dashed/dotted reference lines mark where the upper bound
    equals 1x, 2x, 10x and 100x the lower bound.

    Args:
        processed_df (pd.DataFrame): The preprocessed data. Expected columns:
                                     'lb_estimate_in_minutes', 'ub_estimate_in_minutes',
                                     'onetsoc_major'. The frame is not modified.

    Returns:
        Path: The path to the generated temporary image file
        (``estimate_lower_vs_upper_bounds.png`` in the system temp directory),
        or None when required columns are missing, no row has strictly
        positive bounds, or plotting/saving fails.
    """
    logging.info("Generating plot of lower vs. upper time estimates...")

    # --- Data Validation and Preparation ---
    lb_col = 'lb_estimate_in_minutes'
    ub_col = 'ub_estimate_in_minutes'
    required_cols = [lb_col, ub_col, 'onetsoc_major']
    if not all(col in processed_df.columns for col in required_cols):
        logging.error(
            "Missing one or more required columns: %s. Cannot generate plot.",
            required_cols,
        )
        return None

    fig = None
    try:
        # For log scaling, both lower and upper bounds must be positive.
        # Copy only the filtered rows/columns we need: this leaves the caller's
        # frame untouched and makes the column assignment below operate on an
        # independent frame rather than on a slice.
        positive = (processed_df[lb_col] > 0) & (processed_df[ub_col] > 0)
        df = processed_df.loc[positive, required_cols].copy()

        if df.empty:
            logging.warning("No data with positive lower and upper estimates available to plot.")
            return None

        # Replace the major code with its readable label for the hue legend.
        df['occupation_label'] = df['onetsoc_major'].map(_occupation_label)

        # --- Plotting ---
        # Hold explicit figure/axes handles so that only this figure is drawn
        # on, saved, and closed, regardless of pyplot's "current figure" state.
        fig, ax = plt.subplots(figsize=(12, 10))
        sns.scatterplot(
            data=df,
            x=lb_col,
            y=ub_col,
            alpha=0.2,
            edgecolor=None,
            hue='occupation_label',  # Use the labeled column for the legend
            ax=ax,
        )

        # Shared limits across both columns give a square plot and the span of
        # the 45-degree reference line.
        bounds = df[[lb_col, ub_col]]
        lims = (bounds.min().min(), bounds.max().max())
        ax.plot(lims, lims, color='black', linestyle='--', linewidth=1, label='Upper = Lower')

        # Add helper lines for constant ratios (2x, 10x, 100x)
        for k in (2, 10, 100):
            ax.plot(
                lims,
                [k * lim for lim in lims],
                linestyle=':',
                color='grey',
                linewidth=0.8,
                label=f'Upper = {k}x Lower',
            )

        ax.set(xscale='log', yscale='log', xlim=lims, ylim=lims)
        ax.set_xlabel('Lower-bound Estimate (minutes, log scale)', fontsize=12)
        ax.set_ylabel('Upper-bound Estimate (minutes, log scale)', fontsize=12)
        ax.set_title('Lower vs. Upper Time Estimates for All Tasks', fontsize=16)

        # Place the legend outside the plot to avoid obscuring data
        ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', title='Occupation / Ratio')

        # --- File Saving ---
        temp_path = Path(tempfile.gettempdir()) / "estimate_lower_vs_upper_bounds.png"

        # Use bbox_inches='tight' to ensure the external legend is included in the saved image.
        fig.savefig(temp_path, dpi=300, bbox_inches='tight')
        logging.info("Successfully saved plot to temporary file: %s", temp_path)
        return temp_path

    except Exception as e:
        # Top-level boundary for this generator: report the failure with a
        # traceback and signal it to the caller via None.
        logging.error("An error occurred while generating the plot: %s", e, exc_info=True)
        return None
    finally:
        # Release the figure's memory; skipped when creation never happened.
        if fig is not None:
            plt.close(fig)