import os
import litellm
import sqlite3
import numpy as np
import pandas as pd
from google.colab import userdata, files
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')
os.environ['GEMINI_API_KEY'] = userdata.get('GEMINI_API_KEY')

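# The two keys above are read from Colab's "Secrets" panel (the key icon in the
# left sidebar). Presumably litellm picks them up from the environment when it
# makes API calls; nothing below uses them directly.
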
occupation_major_codes = {
    '11': 'Management',
    '13': 'Business and Financial Operations',
    '15': 'Computer and Mathematical Occupations',
    '17': 'Architecture and Engineering',
    '19': 'Life, Physical, and Social Science',
    '21': 'Community and Social Services',
    '23': 'Legal',
    '25': 'Education, Training, and Library',
    '27': 'Arts, Design, Entertainment, Sports, and Media',
    '29': 'Healthcare Practitioners and Technical',
    '31': 'Healthcare Support',
    '33': 'Protective Service',
    '35': 'Food Preparation and Serving Related',
    '37': 'Building and Grounds Cleaning and Maintenance',
    '39': 'Personal Care and Service',
    '41': 'Sales and Related',
    '43': 'Office and Administrative Support',
    '45': 'Farming, Fishing, and Forestry',
    '47': 'Construction and Extraction',
    '49': 'Installation, Maintenance, and Repair',
    '51': 'Production',
    '53': 'Transportation and Material Moving',
    '55': 'Military Specific'
}

gray = {'50': '#f8fafc', '100': '#f1f5f9', '200': '#e2e8f0',
        '300': '#cbd5e1', '400': '#94a3b8', '500': '#64748b',
        '600': '#475569', '700': '#334155', '800': '#1e293b',
        '900': '#0f172a', '950': '#020617'}
lime = {'50': '#f7fee7', '100': '#ecfcca', '200': '#d8f999',
        '300': '#bbf451', '400': '#9ae600', '500': '#83cd00',
        '600': '#64a400', '700': '#497d00', '800': '#3c6300',
        '900': '#35530e', '950': '#192e03'}

mpl.rcParams.update({
    'figure.facecolor': gray['50'],
    'axes.facecolor':   gray['50'],
    'axes.edgecolor':   gray['100'],
    'axes.labelcolor':  gray['700'],
    'xtick.color':      gray['700'],
    'ytick.color':      gray['700'],
    'font.family':      'Inter',  # falls back to DejaVu Sans if Inter is not present
    'font.size':        11,
})

sns.set_style("white")       # keep minimal axes, we will remove default grid
sns.set_context("notebook")

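# NOTE: the functions below appear to have been split out of notebook cells.
# Later cells read names like df_tasks, df_oesm, df_task_estimateable,
# atomic_tasks, and pairs as globals, so the bodies of prepare_tasks() and
# preprocessing_time_estimates() are assumed to have run (in that order) at
# module level before any cellN() is called.
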
def prepare_tasks():
    # This dataset comes from https://epoch.ai/gradient-updates/consequences-of-automating-remote-work
    # It labels whether each O*NET task can be done remotely or not (labeled by GPT-4o)
    # You can download it here: https://drive.google.com/file/d/1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r/view?usp=sharing
    df_remote_status = pd.read_csv("epoch_task_data.csv")

    # BLS OEWS: https://www.bls.gov/oes/special-requests/oesm23nat.zip
    df_oesm = pd.read_excel("oesm23national.xlsx")

    # Produced by: uv run ./enrich_task_ratings.py
    df_tasks = pd.read_json("task_ratings_enriched.json")

    # Produced by: uv run classify_estimateability_of_tasks.py
    df_task_estimateable = (pd.read_csv("tasks_estimateable.csv")
                            .rename(columns={"task_estimateable": "estimateable"})
                            .drop_duplicates(subset=['task'], keep='first'))

    # df_tasks now has a remote_status column which contains either "remote" or "not remote"
    df_tasks = pd.merge(df_tasks, df_remote_status[['Task', 'Remote']], left_on='task', right_on='Task', how='left')
    df_tasks = df_tasks.drop('Task', axis=1).rename(columns={'Remote': 'remote_status'})

    # df_tasks now has an estimateable column which contains either "ATOMIC" or "ONGOING-CONSTRAINT"
    df_tasks = pd.merge(df_tasks, df_task_estimateable[['task', 'estimateable']], on='task', how='left')

    # NOTE: this keeps only tasks with importance_average < 3, the opposite of the
    # > 3 filter applied in preprocessing_time_estimates(); double-check which is intended.
    df_tasks = df_tasks[df_tasks['importance_average'] < 3].copy()

    df_tasks['onetsoc_major'] = df_tasks['onetsoc_code'].str[:2]

    df_remote_tasks = df_tasks[df_tasks['remote_status'] == 'remote'].copy()

# Next step: call create_task_estimates() (from add_task_estimates?) to produce tasks_with_estimates.csv

def preprocessing_time_estimates():
    df = pd.read_csv("tasks_with_estimates.csv")

    df = df[df['importance_average'] > 3].copy()

    # The embeddings come from running `uv run ./embed_task_description.py`
    # Columns: ['embedding_id', 'task', 'task_embedding']
    # They contain one embedding per UNIQUE task
    df_task_embeddings = (pd.read_parquet("tasks_with_embeddings.parquet")
                          .drop_duplicates(subset=['task'])[['task', 'task_embedding']]
                          .rename(columns={"task_embedding": "embedding_vector"})
                          .copy())

    df = pd.merge(df, df_task_embeddings[['task', 'embedding_vector']], on='task', how='left')
    # NOTE: df_task_estimateable is created in prepare_tasks() and is assumed
    # to be available at module level here.
    df = pd.merge(df, df_task_estimateable[['task', 'estimateable']], on='task', how='left')

    df['onetsoc_major'] = df['onetsoc_code'].str[:2]

    def convert_to_minutes(qty, unit):
        """Converts a quantity in a given unit to minutes."""
        return qty * {
            "minute": 1,
            "hour": 60,
            "day": 60 * 24,
            "week": 60 * 24 * 7,
            "month": 60 * 24 * 30,
            "trimester": 60 * 24 * 90,
            "semester": 60 * 24 * 180,
            "year": 60 * 24 * 365,
        }[unit]

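    # e.g. convert_to_minutes(2, "hour") -> 120; convert_to_minutes(1, "week") -> 10080.
    # A unit outside the table above raises KeyError.
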
    df['lb_estimate_in_minutes'] = df.apply(
        lambda row: convert_to_minutes(row['lb_estimate_qty'], row['lb_estimate_unit']), axis=1
    )
    df['ub_estimate_in_minutes'] = df.apply(
        lambda row: convert_to_minutes(row['ub_estimate_qty'], row['ub_estimate_unit']), axis=1
    )

    df['estimate_range'] = df.ub_estimate_in_minutes - df.lb_estimate_in_minutes
    df['estimate_ratio'] = df.ub_estimate_in_minutes / df.lb_estimate_in_minutes
    df['estimate_midpoint'] = (df.lb_estimate_in_minutes + df.ub_estimate_in_minutes) / 2

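    # e.g. lb = 30 min, ub = 120 min  ->  range = 90, ratio = 4.0, midpoint = 75.
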
    atomic_tasks = df[df['estimateable'] == 'ATOMIC']
    ongoing_tasks = df[df['estimateable'] == 'ONGOING-CONSTRAINT']

    with pd.option_context('display.max_columns', None):
        display(df)

    # Check for empty estimates
    if atomic_tasks['lb_estimate_in_minutes'].isnull().sum() > 0:
        print("Missing values in 'lb_estimate_in_minutes':", atomic_tasks['lb_estimate_in_minutes'].isnull().sum())

    if atomic_tasks['ub_estimate_in_minutes'].isnull().sum() > 0:
        print("Missing values in 'ub_estimate_in_minutes':", atomic_tasks['ub_estimate_in_minutes'].isnull().sum())

    # Check for impossible bounds
    impossible_bounds = atomic_tasks[
        (atomic_tasks['lb_estimate_in_minutes'] <= 0) |
        (atomic_tasks['ub_estimate_in_minutes'] <= 0) |
        (atomic_tasks['lb_estimate_in_minutes'] > atomic_tasks['ub_estimate_in_minutes'])
    ]
    if not impossible_bounds.empty:
        print("Error: found rows with impossible bounds.")
        with pd.option_context('display.max_colwidth', None):
            display(impossible_bounds[['task', 'lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'dwas']])

    # with pd.option_context('display.max_colwidth', None):
    #     display(atomic_tasks.nlargest(20, 'ub_estimate_in_minutes')[['task', 'lb_estimate_qty', 'lb_estimate_unit', 'lb_estimate_in_minutes', 'ub_estimate_qty', 'ub_estimate_unit', 'ub_estimate_in_minutes', 'estimate_ratio']])

def cell1():
    sns.histplot(atomic_tasks.estimate_midpoint, log_scale=True)

def cell2():
    plt.figure(figsize=(14, 10))
    sns.boxplot(
        data=atomic_tasks,
        x='onetsoc_major',   # 11 = Management, 15 = Computer/Math, ...
        y='estimate_range',
        showfliers=False
    )
    plt.yscale('log')        # long tail => log scale
    plt.xlabel('Occupation')
    plt.ylabel('Range (upper − lower, minutes)')
    plt.title('Spread of time-range estimates per occupation')

    ax = plt.gca()
    ax.set_xticklabels([occupation_major_codes[tick.get_text()] for tick in ax.get_xticklabels()],
                       rotation=60, ha='right')

def cell3():
    plt.figure(figsize=(10, 10))
    ax = sns.scatterplot(
        data=atomic_tasks.replace({'onetsoc_major': occupation_major_codes}),  # replace codes with labels
        x='lb_estimate_in_minutes', y='ub_estimate_in_minutes',
        alpha=0.2, edgecolor=None, hue="onetsoc_major"  # use the labeled column for hue
    )

    # 45° reference line (ub == lb)
    lims = (1, atomic_tasks[['lb_estimate_in_minutes', 'ub_estimate_in_minutes']].max().max())
    ax.plot(lims, lims, color='black', linestyle='--', linewidth=1)

    # optional helper lines at 2×, 10×, and 100× ratios
    for k in [2, 10, 100]:
        ax.plot(lims, [k * l for l in lims],
                linestyle=':', color='grey', linewidth=1)

    ax.set(xscale='log', yscale='log')
    ax.set_xlabel('Lower-bound (min, log scale)')
    ax.set_ylabel('Upper-bound (min, log scale)')
    ax.set_title('Lower vs upper estimates for all tasks')

    # Place the legend outside the plot
    ax.legend(bbox_to_anchor=(1, 1), loc='upper left')

def cell4():
    plt.figure(figsize=(8, 4))
    sns.histplot(np.log10(atomic_tasks['estimate_ratio'].replace([np.inf, -np.inf], np.nan).dropna()),
                 bins=60, kde=True)
    plt.axvline(np.log10(10), color='red', ls='--', lw=1, label='10×')
    plt.axvline(np.log10(1.05), color='orange', ls='--', lw=1, label='1.05×')
    plt.axvline(0, color='black', ls='-', lw=1)  # ub == lb
    plt.xlabel('log₁₀(upper / lower)')
    plt.ylabel('Count')
    plt.title('Distribution of upper:lower ratio')
    plt.legend()
    plt.tight_layout()

def cell5():
    # 1. Bin lower bounds into quartiles (Q1–Q4)
    atomic_tasks['lb_q'] = pd.qcut(atomic_tasks.lb_estimate_in_minutes,
                                   q=4, labels=['Q1 shortest', 'Q2', 'Q3', 'Q4 longest'])

    # 2. Aggregate: median ratio per occupation × quartile cell
    pivot = atomic_tasks.pivot_table(index='onetsoc_major', columns='lb_q',
                                     values='estimate_ratio', aggfunc='median')

    # Map the index (onetsoc_major codes) to their corresponding labels
    pivot.index = pivot.index.map(occupation_major_codes)

    # 3. Visualise
    plt.figure(figsize=(10, 8))
    sns.heatmap(pivot, cmap='RdYlGn_r', center=2, annot=True, fmt='.1f',
                cbar_kws={'label': 'Median upper/lower ratio'})
    plt.xlabel('Lower-bound quartile')
    plt.ylabel('Occupation (major group)')
    plt.title('Typical range width by occupation and task length')
    plt.tight_layout()

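# Reading the heatmap: a cell value of 2 means the typical task in that
# occupation × quartile bin has an upper bound twice its lower bound
# (center=2 pins the colormap's neutral color at that ratio).
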
def cell6():
    """
    Alternative, robust scoring (implemented in cell7):

    from scipy.stats import median_abs_deviation

    def mad_z(series):
        med = series.median()
        mad = median_abs_deviation(series, scale='normal')  # ⇒ comparable to σ
        return (series - med) / mad

    df['robust_z'] = df.groupby('onetsoc_code')['estimate_midpoint'].transform(mad_z)
    """
    agg = (atomic_tasks
           .groupby('onetsoc_code')['estimate_midpoint']
           .agg(median='median',
                q1=lambda x: x.quantile(.25),
                q3=lambda x: x.quantile(.75),
                mean='mean',
                std='std')
           .reset_index())
    agg['IQR'] = agg.q3 - agg.q1
    agg['CV'] = agg['std'] / agg['mean']  # coefficient of variation

    # merge back the group mean and std so each row can be scored
    # (bind to a new name: reassigning atomic_tasks inside the function would
    # make it local and raise UnboundLocalError on the read above)
    scored = atomic_tasks.merge(agg[['onetsoc_code', 'mean', 'std']], on='onetsoc_code')

    scored['z'] = (scored.estimate_midpoint - scored['mean']) / scored['std']
    outliers = scored.loc[scored.z.abs() > 3]
    return outliers

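# The |z| > 3 rule flags tasks whose midpoint lies more than three standard
# deviations from their occupation's mean; a few extreme tasks inflate the
# group std and can mask other outliers, which is what the MAD-based
# robust_z in cell7 guards against.
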
def cell7():
    from scipy.stats import median_abs_deviation

    def mad_z(series):
        med = series.median()
        mad = median_abs_deviation(series, scale='normal')  # ⇒ comparable to σ
        return (series - med) / mad

    atomic_tasks['robust_z'] = atomic_tasks.groupby('onetsoc_code')['estimate_midpoint'].transform(mad_z)

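# Note on scale='normal': scipy divides the raw MAD by ≈0.6745 so that, for
# normally distributed data, it estimates the standard deviation. mad_z is
# therefore an outlier-robust analogue of the classic z-score: a single
# extreme task inflates neither the centre (median) nor the scale (MAD).
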
def cell8():
    from sklearn.neighbors import NearestNeighbors

    # unit-normalise embeddings
    X = np.vstack(atomic_tasks.embedding_vector.values)
    X = X / np.linalg.norm(X, axis=1, keepdims=True)

    k = 5
    nn = NearestNeighbors(n_neighbors=k + 1, metric='cosine').fit(X)
    dist, idx = nn.kneighbors(X)            # idx[:, 0] is the task itself
    pairs = pd.DataFrame({
        'i': np.repeat(np.arange(len(X)), k),
        'j': idx[:, 1:].ravel(),            # drop self-match
        'sim': 1 - dist[:, 1:].ravel()      # cosine similarity
    })

    # join time estimates
    # NOTE: the positional indices in 'i'/'j' only line up with atomic_tasks
    # if it has a default RangeIndex; reset_index() it first if needed.
    pairs = (pairs
             .merge(atomic_tasks[['estimate_midpoint']], left_on='i', right_index=True)
             .merge(atomic_tasks[['estimate_midpoint']], left_on='j', right_index=True,
                    suffixes=('_i', '_j')))

    pairs['ratio'] = (pairs[['estimate_midpoint_i', 'estimate_midpoint_j']].max(axis=1) /
                      pairs[['estimate_midpoint_i', 'estimate_midpoint_j']].min(axis=1))

    return pairs

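# Usage sketch (assumes the notebook-style globals above):
#   pairs = cell8()
#   pairs[(pairs.sim > 0.95) & (pairs.ratio > 10)]  # near-duplicate tasks with wildly different estimates
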
def cell9():
    # sample first so x and y come from the same rows
    sample = pairs.sample(20_000)
    sns.scatterplot(x=sample['sim'], y=np.log10(sample['ratio']), alpha=.1)
    plt.axhline(np.log10(2), ls=':', color='red')  # e.g. >2× difference
    plt.xlabel('Cosine similarity')
    plt.ylabel('log₁₀ time-ratio')
    plt.title('Are similar tasks given similar time estimates?')

def cell10():
    import matplotlib.ticker as mtick    # for percentage formatting
    import matplotlib.colors as mcolors  # for color conversion

    # --- 1. Count tasks per occupation and category ---
    summary_data = []

    for code, label in occupation_major_codes.items():
        occ_df = df_tasks[df_tasks['onetsoc_major'] == code]
        total_tasks_in_occ = len(occ_df)

        if total_tasks_in_occ == 0:
            continue  # skip if no tasks for this occupation

        # Stack 1: tasks not labeled "remote"
        not_remote_count = len(occ_df[occ_df['remote_status'] != 'remote'])

        # For the remaining remote tasks:
        remote_df = occ_df[occ_df['remote_status'] == 'remote']

        # Stack 2: remote + ATOMIC
        remote_atomic_count = len(remote_df[remote_df['estimateable'] == 'ATOMIC'])

        # Stack 3: remote + ONGOING-CONSTRAINT
        remote_ongoing_count = len(remote_df[remote_df['estimateable'] == 'ONGOING-CONSTRAINT'])

        summary_data.append({
            'onetsoc_major_code': code,
            'occupation_label': label,
            'count_not_remote': not_remote_count,
            'count_remote_atomic': remote_atomic_count,
            'count_remote_ongoing': remote_ongoing_count,
            'total_tasks': total_tasks_in_occ
        })

    summary_df = pd.DataFrame(summary_data)

    # --- 2. Calculate percentages ---
    # Guard against division by zero for occupations without tasks
    summary_df = summary_df[summary_df['total_tasks'] > 0].copy()  # .copy() avoids SettingWithCopyWarning

    summary_df['pct_not_remote'] = (summary_df['count_not_remote'] / summary_df['total_tasks']) * 100
    summary_df['pct_remote_atomic'] = (summary_df['count_remote_atomic'] / summary_df['total_tasks']) * 100
    summary_df['pct_remote_ongoing'] = (summary_df['count_remote_ongoing'] / summary_df['total_tasks']) * 100

    # Select columns for plotting and set index to occupation label
    plot_df = summary_df.set_index('occupation_label')[
        ['pct_not_remote', 'pct_remote_atomic', 'pct_remote_ongoing']
    ]

    # Rename columns for a clearer legend
    plot_df.columns = ['Not Remote', 'Remote + Estimable', 'Remote + Not estimable']

    plot_df = plot_df.sort_values(by='Not Remote', ascending=False)

    # --- 3. Plotting ---
    # Custom colors; the order must match the column order in plot_df:
    # 1. 'Not Remote'  2. 'Remote + Estimable'  3. 'Remote + Not estimable'
    bar_colors = [gray["300"], lime["500"], lime["200"]]

    fig, ax = plt.subplots(figsize=(14, 10))

    plot_df.plot(kind='barh', stacked=True, ax=ax, color=bar_colors)

    ax.set_xlabel("Percentage of Tasks (%)", fontsize=12)
    ax.set_ylabel("Occupation Major Group", fontsize=12)
    ax.set_title("Task Breakdown by Occupation, Remote Status, and Estimateability", fontsize=14, pad=20)

    # Format x-axis as percentages
    ax.xaxis.set_major_formatter(mtick.PercentFormatter())
    plt.xlim(0, 100)  # ensure the x-axis goes from 0 to 100%

    # Remove right and top spines
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)

    # Helper: pick black or white text for a given background color
    def get_contrasting_text_color(bg_color_hex_or_rgba):
        """
        Determines whether black or white text gives better contrast against
        a given background color.
        bg_color_hex_or_rgba: a hex string (e.g. '#RRGGBB') or an RGBA tuple (values in [0, 1]).
        Returns: 'black' or 'white'.
        """
        # Convert to RGBA if it's a hex string or a named color
        if isinstance(bg_color_hex_or_rgba, str):
            rgba = mcolors.to_rgba(bg_color_hex_or_rgba)
        else:
            rgba = bg_color_hex_or_rgba

        r, g, b, _ = rgba  # ignore alpha for the luminance calculation
        # Relative luminance (standard sRGB coefficients); r, g, b must be in [0, 1]
        luminance = 0.2126 * r + 0.7152 * g + 0.0722 * b
        return 'black' if luminance > 0.55 else 'white'  # 0.55 threshold chosen empirically

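    # Worked example (values rounded): for lime['500'] = '#83cd00',
    # r, g, b ≈ 0.51, 0.80, 0.00, so luminance ≈ 0.2126·0.51 + 0.7152·0.80 ≈ 0.68,
    # which is above 0.55, so black text is used on that segment.
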
    # Add percentage labels inside each bar segment.
    # Iterate over each category of bars (Not Remote, Remote + Estimable, ...)
    for i, container in enumerate(ax.containers):
        # Pick a text color that contrasts with this category's segment color
        segment_color = bar_colors[i]
        text_color = get_contrasting_text_color(segment_color)

        for patch in container.patches:  # each bar segment in the category
            width = patch.get_width()
            if width > 3:  # only label segments wider than 3%
                x = patch.get_x() + width / 2
                y = patch.get_y() + patch.get_height() / 2
                ax.text(x, y,
                        f"{width:.1f}%",
                        ha='center',
                        va='center',
                        fontsize=8,
                        color=text_color,
                        fontweight='medium')

    plt.legend(title="Task Category", bbox_to_anchor=(1.02, 1), loc='upper left', frameon=False)

def cell11():
    df_oesm['onetsoc_major'] = df_oesm['OCC_CODE'].str[:2]

    # Wage bill per occupation = total employment × annual mean wage.
    # Ensure the columns are numeric, coercing non-numeric values to NaN first
    df_oesm['TOT_EMP'] = pd.to_numeric(df_oesm['TOT_EMP'], errors='coerce')
    df_oesm['A_MEAN'] = pd.to_numeric(df_oesm['A_MEAN'], errors='coerce')

    # Drop rows with NaN in the necessary columns after coercion
    df_oesm.dropna(subset=['TOT_EMP', 'A_MEAN', 'onetsoc_major'], inplace=True)

    df_oesm['wage_bill'] = df_oesm['TOT_EMP'] * df_oesm['A_MEAN']

    # Aggregate wage bill by onetsoc_major
    df_wage_bill_major = df_oesm.groupby('onetsoc_major')['wage_bill'].sum().reset_index()

    # Map major codes to titles for better plotting
    df_wage_bill_major['OCC_TITLE_MAJOR'] = df_wage_bill_major['onetsoc_major'].map(occupation_major_codes)

    # Sort by wage bill for better visualization
    df_wage_bill_major = df_wage_bill_major.sort_values('wage_bill', ascending=False)

    # Plotting (scale to billions so the axis label is accurate)
    df_wage_bill_major['wage_bill_billions'] = df_wage_bill_major['wage_bill'] / 1e9
    plt.figure(figsize=(12, 8))
    sns.barplot(x='wage_bill_billions', y='OCC_TITLE_MAJOR', data=df_wage_bill_major, palette="viridis")
    plt.title('Total Wage Bill per Major Occupation Group')
    plt.xlabel('Total Wage Bill (billions of USD)')
    plt.ylabel('Major Occupation Group')
    plt.grid(axis='x', linestyle='--', alpha=0.7)

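# e.g. an occupation with 1,000,000 workers at a $50,000 mean annual wage
# contributes a wage bill of 1e6 × 5e4 = $50B.
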
def cell12():
    # ───────────────────────────────────────────────────────────────
    # 1. CUMULATIVE-DISTRIBUTION-FUNCTION (CDF) PREP
    # ───────────────────────────────────────────────────────────────
    def cdf(series):
        s = series.sort_values().reset_index(drop=True)
        return s.values, ((s.index + 1) / len(s)) * 100

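    # e.g. cdf(pd.Series([10, 1, 100])) returns (array([1, 10, 100]),
    # array([33.3, 66.7, 100.0])): each value is paired with the percentage
    # of tasks at or below it.
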
    x_lb, y_lb = cdf(atomic_tasks['lb_estimate_in_minutes'])
    x_ub, y_ub = cdf(atomic_tasks['ub_estimate_in_minutes'])
    x_mid, y_mid = cdf((atomic_tasks['ub_estimate_in_minutes'] + atomic_tasks['lb_estimate_in_minutes']) / 2)

    # ───────────────────────────────────────────────────────────────
    # 2. PLOTTING
    # ───────────────────────────────────────────────────────────────
    fig, ax = plt.subplots(figsize=(10, 6))

    # horizontal reference lines every 10%
    for y_val in range(0, 101, 10):
        ax.axhline(y_val, color=gray['100'], linewidth=.8, zorder=1)

    # Lower-bound CDF (light lime, dashed)
    ax.step(x_lb, y_lb,
            where='post',
            color=lime['300'],
            linewidth=1.8,
            linestyle='--',
            zorder=2,
            label='Lower bound estimate (CDF)')

    # Upper-bound CDF (dark lime, dotted)
    ax.step(x_ub, y_ub,
            where='post',
            color=lime['900'],
            linewidth=1.8,
            linestyle=':',
            zorder=3,
            label='Upper bound estimate (CDF)')

    # Midpoint CDF (highest zorder so it sits on top where lines overlap)
    ax.step(x_mid, y_mid,
            where='post',
            color=lime['600'],
            linewidth=2.2,
            zorder=4,
            label='Mid-point estimate (CDF)')

    # axes limits / scales
    ax.set_ylim(0, 100)
    ax.set_xscale('log')

    # y-axis ➝ percent labels
    ax.yaxis.set_major_formatter(mpl.ticker.PercentFormatter(decimals=0))

    # move the y-label to the top-left (just inside the plotting area)
    ax.text(-0.06, 1.03,
            "% of tasks with temporal coherence ≤ X",
            ha='left', va='bottom',
            transform=ax.transAxes,
            fontsize=12, fontweight='semibold')

    # custom x-ticks at human-friendly durations
    ticks = [1, 5, 10, 30, 60, 120, 240, 480,
             1440, 2880, 10080, 43200, 129600,
             259200, 525600]
    ticklabels = ['1 min', '5 min', '10 min', '30 min', '1 hour', '2 hours', '4 hours', '8 hours',
                  '1 day', '2 days', '1 week', '30 days',
                  '90 days', '180 days', '1 year']

    # vertical reference lines at each x-tick
    for tick in ticks:
        ax.axvline(tick, color=gray['300'], linewidth=.8, linestyle='--', zorder=1)

    ax.set_xticks(ticks)
    ax.set_xticklabels(ticklabels, rotation=45, ha='right')

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_edgecolor(gray['300'])
    ax.spines['bottom'].set_edgecolor(gray['300'])

    # legend
    ax.legend(frameon=False, loc='lower right')

    ax.text(0.5, -0.3,
            'Temporal coherence (X)',
            ha='center', va='center',
            transform=ax.transAxes,
            fontsize=12, fontweight='semibold')