Félix Dorn 2025-07-15 00:41:05 +02:00
parent 720f21a85b
commit 43076bcbb1
42 changed files with 237415 additions and 7831 deletions

View file

@@ -0,0 +1,119 @@
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import tempfile
import logging
import pandas as pd
import numpy as np
# Duplicated in each generator module so it stays self-contained. This dictionary
# maps O*NET 2-digit major occupation group codes to human-readable labels.
OCCUPATION_MAJOR_CODES = {
'11': 'Management',
'13': 'Business & Financial',
'15': 'Computer & Mathematical',
'17': 'Architecture & Engineering',
'19': 'Life, Physical, & Social Science',
'21': 'Community & Social Service',
'23': 'Legal',
'25': 'Education, Training, & Library',
'27': 'Arts, Design, & Media',
'29': 'Healthcare Practitioners',
'31': 'Healthcare Support',
'33': 'Protective Service',
'35': 'Food Preparation & Serving',
'37': 'Building & Grounds Maintenance',
'39': 'Personal Care & Service',
'41': 'Sales & Related',
'43': 'Office & Admin Support',
'45': 'Farming, Fishing, & Forestry',
'47': 'Construction & Extraction',
'49': 'Installation, Maintenance, & Repair',
'51': 'Production',
'53': 'Transportation & Material Moving',
'55': 'Military Specific',
}
def generate(processed_df: pd.DataFrame):
"""
Generates a scatter plot comparing lower vs. upper time estimates for tasks.
This corresponds to 'cell3' from the original analysis notebook. It helps
visualize the relationship and spread between the lower and upper bounds
of time estimates across different occupation groups.
Args:
processed_df (pd.DataFrame): The preprocessed data. Expected columns:
'lb_estimate_in_minutes',
'ub_estimate_in_minutes', 'onetsoc_major'.
Returns:
Path: The path to the generated temporary image file, or None on failure.
"""
logging.info("Generating plot of lower vs. upper time estimates...")
# --- Data Validation and Preparation ---
required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'onetsoc_major']
if not all(col in processed_df.columns for col in required_cols):
logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
return None
df = processed_df.copy()
# For log scaling, both lower and upper bounds must be positive.
df = df[(df['lb_estimate_in_minutes'] > 0) & (df['ub_estimate_in_minutes'] > 0)]
if df.empty:
logging.warning("No data with positive lower and upper estimates available to plot.")
return None
# Replace the major code with its readable label for the hue legend.
df['occupation_label'] = df['onetsoc_major'].map(OCCUPATION_MAJOR_CODES)
# --- Plotting ---
try:
plt.figure(figsize=(12, 10))
ax = sns.scatterplot(
data=df,
x='lb_estimate_in_minutes',
y='ub_estimate_in_minutes',
alpha=0.2,
edgecolor=None,
hue="occupation_label" # Use the labeled column for the legend
)
# Determine limits for the 45° reference line
# Use the maximum of both columns to create a square plot
max_val = df[['lb_estimate_in_minutes', 'ub_estimate_in_minutes']].max().max()
lims = (df[['lb_estimate_in_minutes', 'ub_estimate_in_minutes']].min().min(), max_val)
ax.plot(lims, lims, color='black', linestyle='--', linewidth=1, label='Upper = Lower')
# Add helper lines for constant ratios (2x, 10x, 100x)
for k in [2, 10, 100]:
ax.plot(lims, [k * l for l in lims],
linestyle=':', color='grey', linewidth=0.8, label=f'Upper = {k}x Lower')
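# On log-log axes each constant-ratio line y = k*x appears as a straight line parallel
# to the identity line, shifted upward by log10(k), which makes it easy to read how far
# points sit above the 'Upper = Lower' diagonal.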
ax.set(xscale='log', yscale='log', xlim=lims, ylim=lims)
ax.set_xlabel('Lower-bound Estimate (minutes, log scale)', fontsize=12)
ax.set_ylabel('Upper-bound Estimate (minutes, log scale)', fontsize=12)
ax.set_title('Lower vs. Upper Time Estimates for All Tasks', fontsize=16)
# Place the legend outside the plot to avoid obscuring data
ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', title='Occupation / Ratio')
# --- File Saving ---
temp_dir = tempfile.gettempdir()
temp_path = Path(temp_dir) / "estimate_lower_vs_upper_bounds.png"
# Use bbox_inches='tight' to ensure the external legend is included in the saved image.
plt.savefig(temp_path, dpi=300, bbox_inches='tight')
logging.info(f"Successfully saved plot to temporary file: {temp_path}")
return temp_path
except Exception as e:
logging.error(f"An error occurred while generating the plot: {e}", exc_info=True)
return None
finally:
plt.close()
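# Minimal usage sketch, assuming the orchestrator (`generate.py`) moves the returned
# PNG into a 'dist/' directory; the tiny synthetic frame below is illustrative only
# and simply exercises the generator end-to-end.
if __name__ == "__main__":
    import shutil

    logging.basicConfig(level=logging.INFO)
    demo_df = pd.DataFrame({
        "lb_estimate_in_minutes": [5, 30, 60],
        "ub_estimate_in_minutes": [10, 90, 480],
        "onetsoc_major": ["15", "29", "41"],
    })
    result = generate(demo_df)
    if result is not None:
        out_dir = Path("dist")  # assumed output directory, mirroring the orchestrator
        out_dir.mkdir(exist_ok=True)
        shutil.move(str(result), str(out_dir / result.name))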

View file

@@ -0,0 +1,86 @@
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
import tempfile
import logging
def generate(processed_df: pd.DataFrame):
"""
Generates a histogram of the log-ratio of upper to lower time estimates.
This corresponds to 'cell4' from the original analysis notebook. It shows
the distribution of how many times larger the upper estimate is compared
to the lower estimate.
Args:
processed_df (pd.DataFrame): The preprocessed data. Expected columns:
'lb_estimate_in_minutes',
'ub_estimate_in_minutes'.
Returns:
Path: The path to the generated temporary image file, or None on failure.
"""
logging.info("Generating distribution plot of estimate ratios...")
# --- Data Validation and Preparation ---
required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes']
if not all(col in processed_df.columns for col in required_cols):
logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
return None
df = processed_df.copy()
# Calculate the estimate ratio. A lower bound of zero would make the ratio undefined,
# so rather than substituting a small epsilon we simply filter those rows out.
df = df[df['lb_estimate_in_minutes'] > 0]
df['estimate_ratio'] = df['ub_estimate_in_minutes'] / df['lb_estimate_in_minutes']
# Replace infinite values (which can occur if ub is huge and lb is tiny) with NaN
# and drop rows with NaN or infinite ratios.
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(subset=['estimate_ratio'], inplace=True)
if df.empty:
logging.warning("No valid data available to plot the estimate ratio distribution.")
return None
# --- Plotting ---
try:
plt.figure(figsize=(10, 6))
# We plot the log10 of the ratio to better visualize the wide distribution
log_ratio = np.log10(df['estimate_ratio'])
sns.histplot(log_ratio, bins=60, kde=True)
# Add vertical lines for reference points
# log10(1) = 0, which is where upper bound equals lower bound
plt.axvline(x=0, color='black', linestyle='-', linewidth=1.5, label='1x (Upper = Lower)')
# A small ratio, e.g., 5% difference
plt.axvline(x=np.log10(1.05), color='orange', linestyle='--', linewidth=1, label='1.05x ratio')
# A 10x ratio
plt.axvline(x=np.log10(10), color='red', linestyle='--', linewidth=1, label='10x ratio')
plt.xlabel('log₁₀(Upper Estimate / Lower Estimate)', fontsize=12)
plt.ylabel('Number of Tasks', fontsize=12)
plt.title('Distribution of Time Estimate Ratios', fontsize=16)
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
# --- File Saving ---
temp_dir = tempfile.gettempdir()
temp_path = Path(temp_dir) / "estimate_ratio_distribution.png"
plt.savefig(temp_path, dpi=300)
logging.info(f"Successfully saved plot to temporary file: {temp_path}")
return temp_path
except Exception as e:
logging.error(f"An error occurred while generating the plot: {e}", exc_info=True)
return None
finally:
plt.close()
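# Reading the x-axis: an illustrative task with lb = 30 min and ub = 120 min has a
# ratio of 4.0, i.e. log10(4.0) ≈ 0.60, placing it between the 1.05x and 10x lines.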

View file

@@ -0,0 +1,135 @@
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pathlib import Path
import tempfile
import logging
# This mapping helps translate the O*NET 2-digit major group codes
# into human-readable labels for the plot's y-axis.
OCCUPATION_MAJOR_CODES = {
'11': 'Management',
'13': 'Business & Financial',
'15': 'Computer & Mathematical',
'17': 'Architecture & Engineering',
'19': 'Life, Physical, & Social Science',
'21': 'Community & Social Service',
'23': 'Legal',
'25': 'Education, Training, & Library',
'27': 'Arts, Design, & Media',
'29': 'Healthcare Practitioners',
'31': 'Healthcare Support',
'33': 'Protective Service',
'35': 'Food Preparation & Serving',
'37': 'Building & Grounds Maintenance',
'39': 'Personal Care & Service',
'41': 'Sales & Related',
'43': 'Office & Admin Support',
'45': 'Farming, Fishing, & Forestry',
'47': 'Construction & Extraction',
'49': 'Installation, Maintenance, & Repair',
'51': 'Production',
'53': 'Transportation & Material Moving',
'55': 'Military Specific',
}
def generate(processed_df: pd.DataFrame):
"""
Generates a heatmap of the median estimate ratio by occupation and task length quartile.
This corresponds to 'cell5' from the original analysis notebook. It shows
how the ratio between upper and lower time estimates varies across
different occupations and for tasks of different typical lengths (binned
into quartiles).
Args:
processed_df (pd.DataFrame): The preprocessed data. Expected columns:
'lb_estimate_in_minutes',
'ub_estimate_in_minutes', 'onetsoc_major'.
Returns:
Path: The path to the generated temporary image file, or None on failure.
"""
logging.info("Generating heatmap of estimate ratios by occupation and task length...")
# --- Data Validation and Preparation ---
required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'onetsoc_major']
if not all(col in processed_df.columns for col in required_cols):
logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
return None
df = processed_df.copy()
# Calculate the estimate ratio, handling division by zero and infinity
df = df[df['lb_estimate_in_minutes'] > 0]
df['estimate_ratio'] = df['ub_estimate_in_minutes'] / df['lb_estimate_in_minutes']
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(subset=['estimate_ratio'], inplace=True)
if df.empty:
logging.warning("No valid data available for the ratio heatmap.")
return None
# 1. Bin lower bounds into quartiles (Q1–Q4)
# Using duplicates='drop' can help if there are many identical values
# which can make binning into quantiles fail.
try:
df['lb_q'] = pd.qcut(
df.lb_estimate_in_minutes,
q=4,
labels=['Q1 (Shortest)', 'Q2', 'Q3', 'Q4 (Longest)'],
duplicates='drop'
)
except ValueError as e:
logging.error(f"Could not bin data into quartiles: {e}. There might not be enough unique values.")
return None
# 2. Aggregate: median ratio per cell (occupation x task length quartile)
pivot = df.pivot_table(
index='onetsoc_major',
columns='lb_q',
values='estimate_ratio',
aggfunc='median'
)
# Map the index (onetsoc_major codes) to their corresponding readable labels
pivot.index = pivot.index.map(OCCUPATION_MAJOR_CODES)
pivot.dropna(inplace=True) # Drop occupations with no data in some quartiles for a cleaner plot
if pivot.empty:
logging.warning("Pivot table is empty after processing. Cannot generate heatmap.")
return None
# --- Plotting ---
try:
plt.figure(figsize=(12, 10))
sns.heatmap(
pivot,
cmap='RdYlGn_r', # Red-Yellow-Green (reversed): green for tight ratios, red for wide ones
center=2, # Center the colormap around a ratio of 2
annot=True, # Show the median values in the cells
fmt='.1f', # Format annotations to one decimal place
linewidths=.5,
cbar_kws={'label': 'Median Upper/Lower Estimate Ratio'}
)
plt.xlabel('Task Length (based on lower-bound quartile)', fontsize=12)
plt.ylabel('Occupation Major Group', fontsize=12)
plt.title('Typical Estimate Range Width by Occupation and Task Length', fontsize=16)
plt.tight_layout()
# --- File Saving ---
temp_dir = tempfile.gettempdir()
temp_path = Path(temp_dir) / "ratio_heatmap_by_occupation_and_task_length.png"
plt.savefig(temp_path, dpi=300)
logging.info(f"Successfully saved plot to temporary file: {temp_path}")
return temp_path
except Exception as e:
logging.error(f"An error occurred while generating the heatmap: {e}", exc_info=True)
return None
finally:
plt.close()

View file

@@ -0,0 +1,161 @@
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.colors as mcolors
from pathlib import Path
import tempfile
import logging
# This mapping helps translate the O*NET 2-digit major group codes
# into human-readable labels for the plot's y-axis.
OCCUPATION_MAJOR_CODES = {
'11': 'Management',
'13': 'Business & Financial',
'15': 'Computer & Mathematical',
'17': 'Architecture & Engineering',
'19': 'Life, Physical, & Social Science',
'21': 'Community & Social Service',
'23': 'Legal',
'25': 'Education, Training, & Library',
'27': 'Arts, Design, & Media',
'29': 'Healthcare Practitioners',
'31': 'Healthcare Support',
'33': 'Protective Service',
'35': 'Food Preparation & Serving',
'37': 'Building & Grounds Maintenance',
'39': 'Personal Care & Service',
'41': 'Sales & Related',
'43': 'Office & Admin Support',
'45': 'Farming, Fishing, & Forestry',
'47': 'Construction & Extraction',
'49': 'Installation, Maintenance, & Repair',
'51': 'Production',
'53': 'Transportation & Material Moving',
'55': 'Military Specific',
}
# Define colors to match the original notebook's palette.
# These are standard hex codes for gray and lime shades.
BAR_COLORS = [
'#D1D5DB', # gray-300
'#84CC16', # lime-500
'#D9F99D', # lime-200
]
def _get_contrasting_text_color(bg_color_hex):
"""
Determines if black or white text provides better contrast against a given background color.
"""
try:
rgba = mcolors.to_rgba(bg_color_hex)
# Calculate luminance (Y) using the sRGB formula
luminance = 0.2126 * rgba[0] + 0.7152 * rgba[1] + 0.0722 * rgba[2]
return 'black' if luminance > 0.55 else 'white'
except ValueError:
return 'black' # Default to black if color is invalid
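# Worked example (illustrative): '#84CC16' converts to roughly (0.518, 0.800, 0.086),
# giving luminance ≈ 0.2126*0.518 + 0.7152*0.800 + 0.0722*0.086 ≈ 0.69 > 0.55,
# so black text is chosen for that background.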
def generate(processed_df: pd.DataFrame):
"""
Generates a stacked bar chart breaking down tasks by remote status and estimability.
This corresponds to 'cell10' from the original analysis notebook. It shows,
for each occupation, the percentage of tasks that are not remote, remote and
estimable, or remote and not estimable.
Args:
processed_df (pd.DataFrame): The preprocessed data. Expected columns:
'onetsoc_major', 'remote_status', 'estimateable'.
Returns:
Path: The path to the generated temporary image file, or None on failure.
"""
logging.info("Generating task breakdown by occupation plot...")
# --- Data Validation ---
required_cols = ['onetsoc_major', 'remote_status', 'estimateable']
if not all(col in processed_df.columns for col in required_cols):
logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
return None
df = processed_df.copy()
# --- Data Summarization ---
summary_data = []
for code, label in OCCUPATION_MAJOR_CODES.items():
occ_df = df[df['onetsoc_major'] == code]
total_tasks = len(occ_df)
if total_tasks == 0:
continue
not_remote_count = len(occ_df[occ_df['remote_status'] != 'remote'])
remote_df = occ_df[occ_df['remote_status'] == 'remote']
remote_atomic_count = len(remote_df[remote_df['estimateable'] == 'ATOMIC'])
remote_ongoing_count = len(remote_df[remote_df['estimateable'] == 'ONGOING-CONSTRAINT'])
summary_data.append({
'occupation_label': label,
'count_not_remote': not_remote_count,
'count_remote_atomic': remote_atomic_count,
'count_remote_ongoing': remote_ongoing_count,
'total_tasks': total_tasks
})
if not summary_data:
logging.warning("No data available to generate the task breakdown plot.")
return None
summary_df = pd.DataFrame(summary_data)
# --- Percentage Calculation ---
summary_df['pct_not_remote'] = (summary_df['count_not_remote'] / summary_df['total_tasks']) * 100
summary_df['pct_remote_atomic'] = (summary_df['count_remote_atomic'] / summary_df['total_tasks']) * 100
summary_df['pct_remote_ongoing'] = (summary_df['count_remote_ongoing'] / summary_df['total_tasks']) * 100
plot_df = summary_df.set_index('occupation_label')[
['pct_not_remote', 'pct_remote_atomic', 'pct_remote_ongoing']
]
plot_df.columns = ['Not Remote', 'Remote & Estimable', 'Remote & Not Estimable']
plot_df = plot_df.sort_values(by='Not Remote', ascending=False)
# --- Plotting ---
try:
fig, ax = plt.subplots(figsize=(14, 10))
plot_df.plot(kind='barh', stacked=True, ax=ax, color=BAR_COLORS, width=0.8)
ax.set_xlabel("Percentage of Tasks", fontsize=12)
ax.set_ylabel("Occupation Major Group", fontsize=12)
ax.set_title("Task Breakdown by Occupation, Remote Status, and Estimability", fontsize=16, pad=20)
ax.xaxis.set_major_formatter(mtick.PercentFormatter())
ax.set_xlim(0, 100)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
# Add percentage labels inside each bar segment
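# pandas' stacked barh creates one BarContainer per column, in the same order as
# plot_df.columns, so BAR_COLORS[i] matches the colour of the i-th segment below.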
for i, container in enumerate(ax.containers):
text_color = _get_contrasting_text_color(BAR_COLORS[i])
for patch in container.patches:
width = patch.get_width()
if width > 3: # Only label segments wider than 3%
x = patch.get_x() + width / 2
y = patch.get_y() + patch.get_height() / 2
ax.text(x, y, f"{width:.1f}%", ha='center', va='center',
fontsize=8, color=text_color, fontweight='medium')
ax.legend(title="Task Category", bbox_to_anchor=(1.02, 1), loc='upper left', frameon=False)
# --- File Saving ---
temp_dir = tempfile.gettempdir()
temp_path = Path(temp_dir) / "task_breakdown_by_occupation.png"
plt.savefig(temp_path, dpi=300, bbox_inches='tight')
logging.info(f"Successfully saved plot to temporary file: {temp_path}")
return temp_path
except Exception as e:
logging.error(f"An error occurred while generating the plot: {e}", exc_info=True)
return None
finally:
plt.close()

View file

@@ -0,0 +1,74 @@
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import tempfile
import logging
import pandas as pd
def generate(processed_df: pd.DataFrame):
"""
Generates a histogram of the task time estimate midpoints.
This generator corresponds to 'cell1' from the original analysis notebook.
It visualizes the distribution of the calculated midpoint of time estimates
for all tasks on a logarithmic scale to handle the wide range of values.
Args:
processed_df (pd.DataFrame): The preprocessed data, expected to contain
'lb_estimate_in_minutes' and
'ub_estimate_in_minutes' columns.
Returns:
Path: The path to the generated temporary image file, or None if
generation fails.
"""
logging.info("Generating task estimate distribution plot...")
# --- Data Validation and Preparation ---
required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes']
if not all(col in processed_df.columns for col in required_cols):
logging.error(
f"Required columns {required_cols} not found in the DataFrame. "
"Cannot generate plot."
)
return None
# Create a copy to avoid modifying the original DataFrame
df = processed_df.copy()
# Calculate the midpoint from lower and upper bounds, as was done in the notebook
df['estimate_midpoint'] = (df['lb_estimate_in_minutes'] + df['ub_estimate_in_minutes']) / 2
# For log scaling, we must use positive values. Filter out any non-positive midpoints.
df = df[df['estimate_midpoint'] > 0]
if df.empty:
logging.warning("No data with positive estimate midpoints available to plot.")
return None
# --- Plotting ---
try:
plt.figure(figsize=(10, 6))
ax = sns.histplot(data=df, x='estimate_midpoint', log_scale=True)
ax.set_title('Distribution of Task Time Estimate Midpoints', fontsize=16)
ax.set_xlabel('Estimate Midpoint (minutes, log scale)', fontsize=12)
ax.set_ylabel('Number of Tasks', fontsize=12)
plt.tight_layout()
# --- File Saving ---
# Create a temporary file to save the plot. The orchestrator (`generate.py`)
# will move this to the final 'dist/' directory.
temp_dir = tempfile.gettempdir()
temp_path = Path(temp_dir) / "task_estimate_distribution.png"
plt.savefig(temp_path, dpi=300)
logging.info(f"Successfully saved plot to temporary file: {temp_path}")
return temp_path
except Exception as e:
logging.error(f"An error occurred while generating the plot: {e}", exc_info=True)
return None
finally:
# Close the figure to free up memory, which is crucial when running many generators.
plt.close()

View file

@@ -0,0 +1,134 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from pathlib import Path
import tempfile
import logging
# Replicating the color palette from the original notebook for consistency.
# These appear to be inspired by Tailwind CSS colors.
GRAY_PALETTE = {
'100': '#F3F4F6',
'300': '#D1D5DB',
}
LIME_PALETTE = {
'300': '#D9F99D',
'600': '#A3E635', # A mid-tone lime
'900': '#4D7C0F', # A dark lime/green
}
def _calculate_cdf(series: pd.Series):
"""
Calculates the empirical Cumulative Distribution Function (CDF) for a series.
Returns the sorted values and their corresponding cumulative percentages.
"""
# Drop NA values and ensure the series is sorted
s = series.dropna().sort_values().reset_index(drop=True)
# Calculate cumulative percentage: (index + 1) / total_count
cdf_y = ((s.index + 1) / len(s)) * 100
return s.values, cdf_y
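# Example: a series of four values [1, 2, 4, 8] yields cdf_y = [25, 50, 75, 100],
# i.e. 25% of observations are <= 1, 50% are <= 2, and so on.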
def generate(processed_df: pd.DataFrame):
"""
Generates a Cumulative Distribution Function (CDF) plot for task time estimates.
This corresponds to the second 'cell11' from the original notebook. It plots
the CDF for the lower-bound, upper-bound, and mid-point of time estimates,
showing the percentage of tasks that can be completed within a certain time.
Args:
processed_df (pd.DataFrame): The preprocessed data. Expected columns:
'lb_estimate_in_minutes',
'ub_estimate_in_minutes'.
Returns:
Path: The path to the generated temporary image file, or None on failure.
"""
logging.info("Generating temporal coherence CDF plot...")
# --- Data Validation and Preparation ---
required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes']
if not all(col in processed_df.columns for col in required_cols):
logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
return None
df = processed_df.copy()
# Log scale requires positive values.
df = df[(df['lb_estimate_in_minutes'] > 0) & (df['ub_estimate_in_minutes'] > 0)]
if df.empty:
logging.warning("No data with positive estimates available to generate CDF plot.")
return None
# Calculate mid-point estimate
df['midpoint_estimate'] = (df['lb_estimate_in_minutes'] + df['ub_estimate_in_minutes']) / 2
# Prepare data for CDF plots
x_lb, y_lb = _calculate_cdf(df['lb_estimate_in_minutes'])
x_ub, y_ub = _calculate_cdf(df['ub_estimate_in_minutes'])
x_mid, y_mid = _calculate_cdf(df['midpoint_estimate'])
# --- Plotting ---
try:
fig, ax = plt.subplots(figsize=(12, 8))
# --- Grid and Reference Lines ---
# Horizontal reference lines for percentages
for y_val in range(0, 101, 10):
ax.axhline(y_val, color=GRAY_PALETTE['100'], linewidth=0.8, zorder=1)
# Vertical reference lines for human-friendly durations
ticks = [1, 5, 10, 30, 60, 120, 240, 480, 1440, 2880, 10080, 43200]
for tick in ticks:
ax.axvline(tick, color=GRAY_PALETTE['300'], linewidth=0.8, linestyle='--', zorder=1)
# --- CDF Plots ---
ax.step(x_lb, y_lb, where='post', color=LIME_PALETTE['300'], linewidth=1.8, linestyle='--', zorder=2, label='Lower-bound Estimate (CDF)')
ax.step(x_ub, y_ub, where='post', color=LIME_PALETTE['900'], linewidth=1.8, linestyle=':', zorder=3, label='Upper-bound Estimate (CDF)')
ax.step(x_mid, y_mid, where='post', color=LIME_PALETTE['600'], linewidth=2.2, zorder=4, label='Mid-point Estimate (CDF)')
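# where='post' holds each step flat until the next observed value, matching the
# right-continuous shape of an empirical CDF.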
# --- Axes Configuration ---
ax.set_ylim(0, 100)
ax.set_xscale('log')
# Custom x-ticks for durations
ticklabels = ['1 min', '5 min', '10 min', '30 min', '1 hr', '2 hrs', '4 hrs', '8 hrs', '1 day', '2 days', '1 week', '30 days']
ax.set_xticks(ticks)
ax.set_xticklabels(ticklabels, rotation=45, ha='right')
ax.minorticks_off() # Turn off minor ticks for clarity with custom grid
# Format y-axis as percentages
ax.yaxis.set_major_formatter(mpl.ticker.PercentFormatter(decimals=0))
# --- Spines and Labels ---
for spine in ['top', 'right']:
ax.spines[spine].set_visible(False)
for spine in ['left', 'bottom']:
ax.spines[spine].set_edgecolor(GRAY_PALETTE['300'])
# Use ax.text for more control over label placement than ax.set_ylabel/xlabel
ax.text(-0.07, 1.02, "% of tasks with duration ≤ X", transform=ax.transAxes,
fontsize=12, fontweight='semibold', va='bottom')
ax.text(0.5, -0.25, 'Task Duration (X)', transform=ax.transAxes,
fontsize=12, fontweight='semibold', ha='center')
ax.legend(frameon=False, loc='lower right')
fig.suptitle('Cumulative Distribution of Task Time Estimates', fontsize=16, y=0.96)
plt.tight_layout(rect=[0, 0, 1, 0.95]) # Adjust layout to make space for suptitle
# --- File Saving ---
temp_dir = tempfile.gettempdir()
temp_path = Path(temp_dir) / "temporal_coherence_cdf.png"
plt.savefig(temp_path, dpi=300, bbox_inches='tight')
logging.info(f"Successfully saved plot to temporary file: {temp_path}")
return temp_path
except Exception as e:
logging.error(f"An error occurred while generating the CDF plot: {e}", exc_info=True)
return None
finally:
plt.close()

View file

@@ -0,0 +1,112 @@
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import tempfile
import logging
import pandas as pd
# Based on O*NET SOC 2018 structure, this mapping helps translate
# the 2-digit major group codes into human-readable labels.
OCCUPATION_MAJOR_CODES = {
'11': 'Management',
'13': 'Business & Financial',
'15': 'Computer & Mathematical',
'17': 'Architecture & Engineering',
'19': 'Life, Physical, & Social Science',
'21': 'Community & Social Service',
'23': 'Legal',
'25': 'Education, Training, & Library',
'27': 'Arts, Design, & Media',
'29': 'Healthcare Practitioners',
'31': 'Healthcare Support',
'33': 'Protective Service',
'35': 'Food Preparation & Serving',
'37': 'Building & Grounds Maintenance',
'39': 'Personal Care & Service',
'41': 'Sales & Related',
'43': 'Office & Admin Support',
'45': 'Farming, Fishing, & Forestry',
'47': 'Construction & Extraction',
'49': 'Installation, Maintenance, & Repair',
'51': 'Production',
'53': 'Transportation & Material Moving',
'55': 'Military Specific',
}
def generate(processed_df: pd.DataFrame):
"""
Generates a box plot showing the spread of time-range estimates per occupation.
This corresponds to 'cell2' from the original analysis notebook. It visualizes
the distribution of the difference between upper and lower time estimates for
each major occupational group.
Args:
processed_df (pd.DataFrame): The preprocessed data. Expected columns:
'lb_estimate_in_minutes',
'ub_estimate_in_minutes', 'onetsoc_major'.
Returns:
Path: The path to the generated temporary image file, or None on failure.
"""
logging.info("Generating plot of time estimate spread by occupation...")
# --- Data Validation and Preparation ---
required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'onetsoc_major']
if not all(col in processed_df.columns for col in required_cols):
logging.error(f"Missing one or more required columns: {required_cols}. Cannot generate plot.")
return None
df = processed_df.copy()
# Calculate the estimate range.
df['estimate_range'] = df['ub_estimate_in_minutes'] - df['lb_estimate_in_minutes']
# Log scaling requires positive values, so drop non-positive ranges (this also removes tasks whose upper and lower bounds are equal).
df = df[df['estimate_range'] > 0]
if df.empty:
logging.warning("No data with a positive estimate range available to plot.")
return None
# Sort by the major code to ensure a consistent plot order
df = df.sort_values('onetsoc_major')
# --- Plotting ---
try:
plt.figure(figsize=(14, 10))
ax = sns.boxplot(
data=df,
x='onetsoc_major',
y='estimate_range',
showfliers=False # Outliers are excluded for a clearer view of the main distribution
)
plt.yscale('log') # The long tail of the data makes a log scale more readable
plt.xlabel('Occupation Major Group', fontsize=12)
plt.ylabel('Time Estimate Range (upper - lower, in minutes, log scale)', fontsize=12)
plt.title('Spread of Time-Range Estimates by Occupation', fontsize=16)
# Replace numeric x-tick labels (e.g., '11', '15') with meaningful text labels
ax.set_xticklabels(
[OCCUPATION_MAJOR_CODES.get(code.get_text(), code.get_text()) for code in ax.get_xticklabels()],
rotation=60,
ha='right' # Align rotated labels correctly
)
plt.tight_layout()
# --- File Saving ---
temp_dir = tempfile.gettempdir()
temp_path = Path(temp_dir) / "time_estimate_spread_by_occupation.png"
plt.savefig(temp_path, dpi=300, bbox_inches='tight')
logging.info(f"Successfully saved plot to temporary file: {temp_path}")
return temp_path
except Exception as e:
logging.error(f"An error occurred while generating the plot: {e}", exc_info=True)
return None
finally:
plt.close()

View file

@@ -0,0 +1,150 @@
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import pandas as pd
from pathlib import Path
import tempfile
import logging
# Assuming data.py is in the same package and provides this function
from ..data import get_db_connection
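# `get_db_connection` is assumed here to return a DB-API style connection (e.g. sqlite3)
# that pandas.read_sql_query accepts; it is closed in the `finally` block of generate().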
# This mapping helps translate the O*NET 2-digit major group codes
# into human-readable labels for the plot's y-axis.
OCCUPATION_MAJOR_CODES = {
'11': 'Management',
'13': 'Business & Financial',
'15': 'Computer & Mathematical',
'17': 'Architecture & Engineering',
'19': 'Life, Physical, & Social Science',
'21': 'Community & Social Service',
'23': 'Legal',
'25': 'Education, Training, & Library',
'27': 'Arts, Design, & Media',
'29': 'Healthcare Practitioners',
'31': 'Healthcare Support',
'33': 'Protective Service',
'35': 'Food Preparation & Serving',
'37': 'Building & Grounds Maintenance',
'39': 'Personal Care & Service',
'41': 'Sales & Related',
'43': 'Office & Admin Support',
'45': 'Farming, Fishing, & Forestry',
'47': 'Construction & Extraction',
'49': 'Installation, Maintenance, & Repair',
'51': 'Production',
'53': 'Transportation & Material Moving',
'55': 'Military Specific',
}
def generate(processed_df: pd.DataFrame):
"""
Generates a bar plot of the total wage bill per major occupation group.
This corresponds to the first 'cell11' from the original analysis notebook.
It calculates the total wage bill (Total Employment * Annual Mean Wage) for
each occupation and aggregates it by major occupation group. This generator
loads its data directly from the O*NET database.
Args:
processed_df (pd.DataFrame): The preprocessed data (not used in this generator,
but required by the function signature).
Returns:
Path: The path to the generated temporary image file, or None on failure.
"""
logging.info("Generating plot of total wage bill by occupation...")
conn = None
try:
# --- Data Loading ---
# This generator needs specific data that is not in the main preprocessed_df.
# It loads occupational employment and wage data directly from the database.
conn = get_db_connection()
if conn is None:
raise ConnectionError("Could not get database connection.")
# This data is stored in a long format in the `occupation_level_metadata` table.
# We need to query this table and pivot it to get employment and wage columns.
query = "SELECT onetsoc_code, item, response FROM occupation_level_metadata WHERE item IN ('Employment', 'Annual Mean Wage')"
try:
df_meta = pd.read_sql_query(query, conn)
# Pivot the table to create 'Employment' and 'Annual Mean Wage' columns
df_oesm = df_meta.pivot(index='onetsoc_code', columns='item', values='response').reset_index()
logging.info("Pivoted occupation metadata. Columns are: %s", df_oesm.columns.tolist())
# Rename for consistency with the original notebook's code
df_oesm.rename(columns={
'onetsoc_code': 'OCC_CODE',
'Employment': 'TOT_EMP',
'Annual Mean Wage': 'A_MEAN'
}, inplace=True)
except (pd.io.sql.DatabaseError, KeyError) as e:
logging.error(f"Failed to query or pivot occupation metadata: {e}", exc_info=True)
return None
# --- Data Preparation ---
# Create a 'major group' code from the first two digits of the SOC code
df_oesm['onetsoc_major'] = df_oesm['OCC_CODE'].str[:2]
# Ensure wage and employment columns are numeric, coercing errors to NaN
df_oesm['TOT_EMP'] = pd.to_numeric(df_oesm['TOT_EMP'], errors='coerce')
df_oesm['A_MEAN'] = pd.to_numeric(df_oesm['A_MEAN'], errors='coerce')
# Drop rows with missing data in critical columns
df_oesm.dropna(subset=['TOT_EMP', 'A_MEAN', 'onetsoc_major'], inplace=True)
# Calculate the wage bill for each occupation
df_oesm['wage_bill'] = df_oesm['TOT_EMP'] * df_oesm['A_MEAN']
# Aggregate the wage bill by major occupation group
df_wage_bill_major = df_oesm.groupby('onetsoc_major')['wage_bill'].sum().reset_index()
# Map the major codes to readable titles for plotting
df_wage_bill_major['OCC_TITLE_MAJOR'] = df_wage_bill_major['onetsoc_major'].map(OCCUPATION_MAJOR_CODES)
df_wage_bill_major.dropna(subset=['OCC_TITLE_MAJOR'], inplace=True) # Drop military/unmapped codes
# Sort by wage bill for a more informative plot
df_wage_bill_major = df_wage_bill_major.sort_values('wage_bill', ascending=False)
if df_wage_bill_major.empty:
logging.warning("No data available to generate the wage bill plot.")
return None
# --- Plotting ---
plt.figure(figsize=(12, 10))
ax = sns.barplot(x='wage_bill', y='OCC_TITLE_MAJOR', data=df_wage_bill_major, palette="viridis", orient='h')
ax.set_title('Total Wage Bill per Major Occupation Group', fontsize=16, pad=15)
ax.set_xlabel('Total Wage Bill (in USD)', fontsize=12)
ax.set_ylabel('Major Occupation Group', fontsize=12)
ax.grid(axis='x', linestyle='--', alpha=0.7)
# Format the x-axis to be more readable (e.g., "$2.0T" for trillions)
def format_billions(x, pos):
if x >= 1e12:
return f'${x*1e-12:.1f}T'
if x >= 1e9:
return f'${x*1e-9:.0f}B'
return f'${x*1e-6:.0f}M'
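# E.g. format_billions(2.3e12, None) -> '$2.3T', format_billions(4.5e11, None) -> '$450B',
# and format_billions(7.2e8, None) -> '$720M'.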
ax.xaxis.set_major_formatter(mticker.FuncFormatter(format_billions))
plt.tight_layout()
# --- File Saving ---
temp_dir = tempfile.gettempdir()
temp_path = Path(temp_dir) / "wage_bill_by_occupation.png"
plt.savefig(temp_path, dpi=300)
logging.info(f"Successfully saved plot to temporary file: {temp_path}")
return temp_path
except Exception as e:
logging.error(f"An error occurred while generating the wage bill plot: {e}", exc_info=True)
return None
finally:
plt.close()
if conn:
conn.close()