sprint-econtai/pipeline/generators/estimates_lower_vs_upper_scatter.py
Félix Dorn 65dc648797 wip
2025-07-15 00:34:54 +02:00

56 lines
1.9 KiB
Python

from pathlib import Path
from typing import Generator
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from ..utils import OCCUPATION_MAJOR_CODES, style_plot
def generate_estimates_lower_vs_upper_scatter(
    output_dir: Path, df: pd.DataFrame, **kwargs
) -> Generator[Path, None, None]:
    """
    Generate a styled scatter plot of lower-bound vs upper-bound time estimates.

    Every row of ``df`` is drawn as one point on log-log axes and coloured by
    its ``onetsoc_major`` value after that value has been mapped through
    ``OCCUPATION_MAJOR_CODES``. A dashed ``y = x`` line and dotted
    ``y = k * x`` lines (k = 2, 10, 100) are added as visual guides.

    Args:
        output_dir: Directory the PNG file is written into.
        df: Frame providing the columns ``lb_estimate_in_minutes``,
            ``ub_estimate_in_minutes`` and ``onetsoc_major``. It is copied
            before the label mapping, so the caller's frame is not modified.
        **kwargs: Accepted but not used by this generator.

    Yields:
        The path of the saved image,
        ``output_dir / "estimates_lower_vs_upper_scatter.png"``.
    """
    style_plot()
    output_path = output_dir / "estimates_lower_vs_upper_scatter.png"

    lb_col = 'lb_estimate_in_minutes'
    ub_col = 'ub_estimate_in_minutes'

    plot_df = df.copy()
    # Replace onetsoc_major codes with their corresponding labels for the plot legend
    plot_df['onetsoc_major'] = plot_df['onetsoc_major'].map(OCCUPATION_MAJOR_CODES)

    fig, ax = plt.subplots(figsize=(12, 10))
    try:
        sns.scatterplot(
            data=plot_df,
            x=lb_col,
            y=ub_col,
            alpha=0.3,
            edgecolor=None,
            hue="onetsoc_major",
            ax=ax,
        )

        # Both axes are logarithmic, so the guide lines may only span strictly
        # positive values; zeros, negatives and NaNs are excluded here.
        estimates = pd.concat([df[lb_col], df[ub_col]], ignore_index=True)
        positive = estimates[estimates > 0]
        if not positive.empty:
            # Pad the span by 10% on each side so the lines run past the data.
            lims = (positive.min() * 0.9, positive.max() * 1.1)

            # 45° reference line (y=x)
            ax.plot(lims, lims, color='black', linestyle='--', linewidth=1, zorder=0)

            # Helper lines marking upper/lower ratios of 2x, 10x and 100x
            for k in (2, 10, 100):
                ax.plot(
                    lims,
                    [k * value for value in lims],
                    linestyle=':',
                    color='grey',
                    linewidth=1,
                    zorder=0,
                )

        ax.set_xscale('log')
        ax.set_yscale('log')
        ax.set_xlabel('Lower-bound (min, log scale)')
        ax.set_ylabel('Upper-bound (min, log scale)')
        ax.set_title('Lower vs Upper Estimates for All Tasks')
        ax.legend(title="Occupation Major Group", bbox_to_anchor=(1.02, 1), loc='upper left')

        # Operate on this figure explicitly instead of pyplot's current figure.
        fig.tight_layout()
        fig.savefig(output_path, bbox_inches='tight')
    finally:
        # Release the figure even if plotting or saving raised.
        plt.close(fig)

    yield output_path