import logging
import tempfile
from pathlib import Path
from typing import Optional

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Based on O*NET SOC 2018 structure, this mapping helps translate
# the 2-digit major group codes into human-readable labels.
OCCUPATION_MAJOR_CODES = {
    '11': 'Management',
    '13': 'Business & Financial',
    '15': 'Computer & Mathematical',
    '17': 'Architecture & Engineering',
    '19': 'Life, Physical, & Social Science',
    '21': 'Community & Social Service',
    '23': 'Legal',
    '25': 'Education, Training, & Library',
    '27': 'Arts, Design, & Media',
    '29': 'Healthcare Practitioners',
    '31': 'Healthcare Support',
    '33': 'Protective Service',
    '35': 'Food Preparation & Serving',
    '37': 'Building & Grounds Maintenance',
    '39': 'Personal Care & Service',
    '41': 'Sales & Related',
    '43': 'Office & Admin Support',
    '45': 'Farming, Fishing, & Forestry',
    '47': 'Construction & Extraction',
    '49': 'Installation, Maintenance, & Repair',
    '51': 'Production',
    '53': 'Transportation & Material Moving',
    '55': 'Military Specific',
}


def generate(processed_df: pd.DataFrame) -> Optional[Path]:
    """
    Generates a box plot showing the spread of time-range estimates per occupation.

    This corresponds to 'cell2' from the original analysis notebook. It visualizes
    the distribution of the difference between upper and lower time estimates
    for each major occupational group, on a logarithmic y-axis with outliers
    hidden.

    Args:
        processed_df (pd.DataFrame): The preprocessed data. Expected columns:
            'lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'onetsoc_major'.
            The frame is not modified; the function works on a copy.

    Returns:
        Path: The path to the generated image file inside the system temporary
        directory, or None when a required column is missing, when no row has
        a positive estimate range, or when plotting/saving fails (the error is
        logged, not raised).
    """
    logging.info("Generating plot of time estimate spread by occupation...")

    # --- Data Validation and Preparation ---
    required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'onetsoc_major']
    if not all(col in processed_df.columns for col in required_cols):
        logging.error(
            "Missing one or more required columns: %s. Cannot generate plot.",
            required_cols,
        )
        return None

    # Only the required columns are needed, so copy just those rather than
    # duplicating the whole (possibly wide) input frame.
    df = processed_df[required_cols].copy()

    # Calculate the estimate range.
    df['estimate_range'] = df['ub_estimate_in_minutes'] - df['lb_estimate_in_minutes']

    # For log scaling, we need positive values. Filter out any non-positive
    # ranges (rows where the range is NaN fail the comparison and drop too).
    df = df[df['estimate_range'] > 0]

    if df.empty:
        logging.warning("No data with a positive estimate range available to plot.")
        return None

    # Sort by the major code to ensure a consistent plot order.
    df = df.sort_values('onetsoc_major')

    # --- Plotting ---
    # Keep an explicit handle on the figure so cleanup closes exactly this
    # figure and never some other figure that happens to be "current".
    fig = None
    try:
        fig, ax = plt.subplots(figsize=(14, 10))
        sns.boxplot(
            data=df,
            x='onetsoc_major',
            y='estimate_range',
            showfliers=False,  # Outliers are excluded for a clearer view of the main distribution
            ax=ax,
        )

        # The long tail of the data makes a log scale more readable.
        ax.set_yscale('log')

        ax.set_xlabel('Occupation Major Group', fontsize=12)
        ax.set_ylabel('Time Estimate Range (upper - lower, in minutes, log scale)', fontsize=12)
        ax.set_title('Spread of Time-Range Estimates by Occupation', fontsize=16)

        # Replace numeric x-tick labels (e.g., '11', '15') with meaningful text
        # labels; codes absent from the mapping keep their original text.
        tick_positions = ax.get_xticks()
        tick_labels = [
            OCCUPATION_MAJOR_CODES.get(tick.get_text(), tick.get_text())
            for tick in ax.get_xticklabels()
        ]
        # Pin the tick positions before assigning fixed labels: set_xticklabels
        # is only reliable once the tick locations are fixed.
        ax.set_xticks(tick_positions)
        ax.set_xticklabels(
            tick_labels,
            rotation=60,
            ha='right',  # Align rotated labels correctly
        )

        fig.tight_layout()

        # --- File Saving ---
        temp_dir = tempfile.gettempdir()
        temp_path = Path(temp_dir) / "time_estimate_spread_by_occupation.png"
        fig.savefig(temp_path, dpi=300, bbox_inches='tight')

        logging.info("Successfully saved plot to temporary file: %s", temp_path)
        return temp_path

    except Exception as e:
        # Deliberate best-effort boundary: report the failure and signal it
        # to the caller with None instead of propagating.
        logging.error("An error occurred while generating the plot: %s", e, exc_info=True)
        return None
    finally:
        # fig stays None if figure creation itself failed.
        if fig is not None:
            plt.close(fig)