old
This commit is contained in:
parent
720f21a85b
commit
43076bcbb1
42 changed files with 237415 additions and 7831 deletions
112
analysis/generators/time_estimate_spread_by_occupation.py
Normal file
112
analysis/generators/time_estimate_spread_by_occupation.py
Normal file
|
@ -0,0 +1,112 @@
|
|||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
from pathlib import Path
|
||||
import tempfile
|
||||
import logging
|
||||
import pandas as pd
|
||||
|
||||
# Lookup table from 2-digit O*NET SOC major-group codes to short,
# human-readable labels (based on the O*NET SOC 2018 structure).
# Keys are strings, not ints, because they are matched against the text of
# the plot's x-axis tick labels.
OCCUPATION_MAJOR_CODES = {
    '11': 'Management',
    '13': 'Business & Financial',
    '15': 'Computer & Mathematical',
    '17': 'Architecture & Engineering',
    '19': 'Life, Physical, & Social Science',
    '21': 'Community & Social Service',
    '23': 'Legal',
    '25': 'Education, Training, & Library',
    '27': 'Arts, Design, & Media',
    '29': 'Healthcare Practitioners',
    '31': 'Healthcare Support',
    '33': 'Protective Service',
    '35': 'Food Preparation & Serving',
    '37': 'Building & Grounds Maintenance',
    '39': 'Personal Care & Service',
    '41': 'Sales & Related',
    '43': 'Office & Admin Support',
    '45': 'Farming, Fishing, & Forestry',
    '47': 'Construction & Extraction',
    '49': 'Installation, Maintenance, & Repair',
    '51': 'Production',
    '53': 'Transportation & Material Moving',
    '55': 'Military Specific',
}
|
||||
|
||||
|
||||
def generate(processed_df: pd.DataFrame) -> "Path | None":
    """
    Generate a box plot showing the spread of time-range estimates per occupation.

    This corresponds to 'cell2' from the original analysis notebook. It visualizes
    the distribution of the difference between upper and lower time estimates for
    each major occupational group, on a log-scaled y axis with outliers hidden.

    The image is always written under the same file name in the system temporary
    directory, so each successful call overwrites the previous output.

    Args:
        processed_df (pd.DataFrame): The preprocessed data. Expected columns:
            'lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'onetsoc_major'.
            The frame is not modified.

    Returns:
        Path: The path to the generated temporary image file, or None when a
        required column is missing, when no row has a positive estimate range,
        or when plotting/saving raises.
    """
    logging.info("Generating plot of time estimate spread by occupation...")

    # --- Data Validation and Preparation ---
    required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'onetsoc_major']
    missing_cols = [col for col in required_cols if col not in processed_df.columns]
    if missing_cols:
        # Name the absent columns so the log points at the actual problem.
        logging.error(
            "Missing required columns: %s (required: %s). Cannot generate plot.",
            missing_cols,
            required_cols,
        )
        return None

    # Copy only the columns the plot needs; this keeps the caller's frame
    # untouched without duplicating unrelated data.
    df = processed_df[required_cols].copy()

    # Calculate the estimate range.
    df['estimate_range'] = df['ub_estimate_in_minutes'] - df['lb_estimate_in_minutes']

    # For log scaling, we need positive values. Filter out any non-positive ranges
    # (rows whose range is NaN fail the comparison and are dropped as well).
    df = df[df['estimate_range'] > 0]
    if df.empty:
        logging.warning("No data with a positive estimate range available to plot.")
        return None

    # Sort by the major code to ensure a consistent plot order
    df = df.sort_values('onetsoc_major')

    # --- Plotting ---
    # Hold explicit figure/axes handles instead of relying on pyplot's implicit
    # "current figure", so cleanup can never close a figure this function
    # did not create.
    fig = None
    try:
        fig, ax = plt.subplots(figsize=(14, 10))

        sns.boxplot(
            data=df,
            x='onetsoc_major',
            y='estimate_range',
            showfliers=False,  # Outliers are excluded for a clearer view of the main distribution
            ax=ax,
        )

        ax.set_yscale('log')  # The long tail of the data makes a log scale more readable
        ax.set_xlabel('Occupation Major Group', fontsize=12)
        ax.set_ylabel('Time Estimate Range (upper - lower, in minutes, log scale)', fontsize=12)
        ax.set_title('Spread of Time-Range Estimates by Occupation', fontsize=16)

        # Replace numeric x-tick labels (e.g., '11', '15') with meaningful text labels;
        # codes without a mapping keep their original text.
        tick_texts = [tick.get_text() for tick in ax.get_xticklabels()]
        ax.set_xticklabels(
            [OCCUPATION_MAJOR_CODES.get(text, text) for text in tick_texts],
            rotation=60,
            ha='right',  # Align rotated labels correctly
        )

        fig.tight_layout()

        # --- File Saving ---
        temp_path = Path(tempfile.gettempdir()) / "time_estimate_spread_by_occupation.png"
        fig.savefig(temp_path, dpi=300, bbox_inches='tight')
        logging.info(f"Successfully saved plot to temporary file: {temp_path}")

        return temp_path

    except Exception as e:
        # Boundary handler: callers expect None on failure rather than an exception.
        # logging.exception records the traceback along with the message.
        logging.exception("An error occurred while generating the plot: %s", e)
        return None
    finally:
        # Release the figure's memory, but only if it was actually created.
        if fig is not None:
            plt.close(fig)
|
Loading…
Add table
Add a link
Reference in a new issue