old

2025-07-15 00:41:05 +02:00 · 2025-07-15 00:41:05 +02:00 · 43076bcbb1
commit 43076bcbb1
parent 720f21a85b
42 changed files with 237415 additions and 7831 deletions
--- a/analysis/generators/estimate_lower_vs_upper_bounds.py
+++ b/analysis/generators/estimate_lower_vs_upper_bounds.py
@ -0,0 +1,119 @@
+import seaborn as sns
+import matplotlib.pyplot as plt
+from pathlib import Path
+import tempfile
+import logging
+import pandas as pd
+import numpy as np
+
+# Lookup table from two-character major occupation group code to a human-readable
+# label. Duplicated from the other generators so this module stays self-contained.
+OCCUPATION_MAJOR_CODES = {
+    "11": "Management",
+    "13": "Business & Financial",
+    "15": "Computer & Mathematical",
+    "17": "Architecture & Engineering",
+    "19": "Life, Physical, & Social Science",
+    "21": "Community & Social Service",
+    "23": "Legal",
+    "25": "Education, Training, & Library",
+    "27": "Arts, Design, & Media",
+    "29": "Healthcare Practitioners",
+    "31": "Healthcare Support",
+    "33": "Protective Service",
+    "35": "Food Preparation & Serving",
+    "37": "Building & Grounds Maintenance",
+    "39": "Personal Care & Service",
+    "41": "Sales & Related",
+    "43": "Office & Admin Support",
+    "45": "Farming, Fishing, & Forestry",
+    "47": "Construction & Extraction",
+    "49": "Installation, Maintenance, & Repair",
+    "51": "Production",
+    "53": "Transportation & Material Moving",
+    "55": "Military Specific",
+}
+
+
+def generate(processed_df: pd.DataFrame):
+    """
+    Generates a log-log scatter plot comparing lower vs. upper time estimates for tasks.
+
+    This corresponds to 'cell3' from the original analysis notebook. It helps
+    visualize the relationship and spread between the lower and upper bounds
+    of time estimates across different occupation groups. Reference lines mark
+    where the upper bound equals 1x, 2x, 10x and 100x the lower bound.
+
+    Args:
+        processed_df (pd.DataFrame): The preprocessed data. Expected columns:
+                                     'lb_estimate_in_minutes',
+                                     'ub_estimate_in_minutes', 'onetsoc_major'.
+
+    Returns:
+        Path: The path to the generated temporary image file, or None when a
+        required column is missing, no row has positive bounds, or plotting fails.
+    """
+    logging.info("Generating plot of lower vs. upper time estimates...")
+
+    # --- Data Validation and Preparation ---
+    required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'onetsoc_major']
+    if not all(col in processed_df.columns for col in required_cols):
+        logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
+        return None
+
+    # For log scaling, both bounds must be positive. Copy AFTER filtering so the label
+    # column added below goes to an independent frame, not to a filtered slice.
+    positive = (processed_df['lb_estimate_in_minutes'] > 0) & (processed_df['ub_estimate_in_minutes'] > 0)
+    df = processed_df.loc[positive, required_cols].copy()
+    if df.empty:
+        logging.warning("No data with positive lower and upper estimates available to plot.")
+        return None
+
+    # Replace the major code with its readable label for the hue legend. Codes are
+    # looked up as strings (the table's keys are strings); a code without a label
+    # keeps its own text so the row still has a hue value and stays in the plot.
+    codes = df['onetsoc_major'].astype(str)
+    df['occupation_label'] = codes.map(lambda code: OCCUPATION_MAJOR_CODES.get(code, code))
+
+    # --- Plotting ---
+    fig = None
+    try:
+        # Hold the figure explicitly so saving and closing act on this figure only.
+        fig, ax = plt.subplots(figsize=(12, 10))
+        sns.scatterplot(
+            data=df,
+            x='lb_estimate_in_minutes', y='ub_estimate_in_minutes',
+            alpha=0.2, edgecolor=None,
+            hue="occupation_label", ax=ax,  # hue uses the labeled column for the legend
+        )
+
+        # Shared limits for both axes (square plot) and for the reference lines.
+        bounds = df[['lb_estimate_in_minutes', 'ub_estimate_in_minutes']]
+        lims = (bounds.min().min(), bounds.max().max())
+        ax.plot(lims, lims, color='black', linestyle='--', linewidth=1, label='Upper = Lower')
+
+        # Add helper lines for constant ratios (2x, 10x, 100x)
+        for k in [2, 10, 100]:
+            ax.plot(lims, [k * lim for lim in lims],
+                    linestyle=':', color='grey', linewidth=0.8, label=f'Upper = {k}x Lower')
+
+        ax.set(xscale='log', yscale='log', xlim=lims, ylim=lims)
+        ax.set_xlabel('Lower-bound Estimate (minutes, log scale)', fontsize=12)
+        ax.set_ylabel('Upper-bound Estimate (minutes, log scale)', fontsize=12)
+        ax.set_title('Lower vs. Upper Time Estimates for All Tasks', fontsize=16)
+
+        # Place the legend outside the plot to avoid obscuring data
+        ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', title='Occupation / Ratio')
+
+        # --- File Saving ---
+        temp_path = Path(tempfile.gettempdir()) / "estimate_lower_vs_upper_bounds.png"
+        # Use bbox_inches='tight' to ensure the external legend is included in the saved image.
+        fig.savefig(temp_path, dpi=300, bbox_inches='tight')
+        logging.info(f"Successfully saved plot to temporary file: {temp_path}")
+        return temp_path
+    except Exception as e:
+        logging.error(f"An error occurred while generating the plot: {e}", exc_info=True)
+        return None
+    finally:
+        if fig is not None:
+            plt.close(fig)