sprint-econtai/pipeline/generators/estimates_spread_per_occupation.py
Félix Dorn 65dc648797 wip
2025-07-15 00:34:54 +02:00

39 lines
1.1 KiB
Python

from pathlib import Path
from typing import Generator
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from ..utils import OCCUPATION_MAJOR_CODES, style_plot
def generate_estimate_spread_per_occupation(
    output_dir: Path, df: pd.DataFrame, **kwargs
) -> Generator[Path, None, None]:
    """Render a boxplot of the estimate-range spread per major occupation group.

    The figure is written to ``estimates_spread_per_occupation.png`` inside
    *output_dir*. Because this is a generator function, nothing is plotted or
    written until the returned generator is first advanced.

    Args:
        output_dir: Directory the PNG is written into.
        df: Data to plot. The ``onetsoc_major`` column is used for the x-axis
            categories and the ``estimate_range`` column for the y-axis values.
        **kwargs: Accepted and ignored by this function.

    Yields:
        The path of the PNG file that was written.
    """
    style_plot()
    output_path = output_dir / "estimates_spread_per_occupation.png"

    fig, ax = plt.subplots(figsize=(10, 12))
    try:
        sns.boxplot(
            data=df,
            x='onetsoc_major',
            y='estimate_range',
            showfliers=False,  # outliers are not drawn
            ax=ax,
        )

        ax.set_yscale('log')
        ax.set_xlabel('Occupation')
        ax.set_ylabel('Range (upper-lower, minutes)')
        ax.set_title('Spread of time-range estimates per occupation')

        # Swap each tick's code for its entry in OCCUPATION_MAJOR_CODES,
        # keeping the raw tick text when the code has no entry.
        labels = [
            OCCUPATION_MAJOR_CODES.get(tick.get_text(), tick.get_text())
            for tick in ax.get_xticklabels()
        ]
        ax.set_xticklabels(labels, rotation=60, ha='right')

        # Operate on this figure explicitly rather than on pyplot's implicit
        # "current figure".
        fig.tight_layout()
        fig.savefig(output_path)
    finally:
        # Release the figure even if plotting or saving raised.
        plt.close(fig)

    yield output_path