import os
import sqlite3

import litellm
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from google.colab import userdata, files

os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')
os.environ['GEMINI_API_KEY'] = userdata.get('GEMINI_API_KEY')

occupation_major_codes = {
    '11': 'Management',
    '13': 'Business and Financial Operations',
    '15': 'Computer and Mathematical Occupations',
    '17': 'Architecture and Engineering',
    '19': 'Life, Physical, and Social Science',
    '21': 'Community and Social Services',
    '23': 'Legal',
    '25': 'Education, Training, and Library',
    '27': 'Arts, Design, Entertainment, Sports, and Media',
    '29': 'Healthcare Practitioners and Technical',
    '31': 'Healthcare Support',
    '33': 'Protective Service',
    '35': 'Food Preparation and Serving Related',
    '37': 'Building and Grounds Cleaning and Maintenance',
    '39': 'Personal Care and Service',
    '41': 'Sales and Related',
    '43': 'Office and Administrative Support',
    '45': 'Farming, Fishing, and Forestry',
    '47': 'Construction and Extraction',
    '49': 'Installation, Maintenance, and Repair',
    '51': 'Production',
    '53': 'Transportation and Material Moving',
    '55': 'Military Specific',
}

gray = {'50': '#f8fafc', '100': '#f1f5f9', '200': '#e2e8f0',
        '300': '#cbd5e1', '400': '#94a3b8', '500': '#64748b',
        '600': '#475569', '700': '#334155', '800': '#1e293b',
        '900': '#0f172a', '950': '#020617'}

lime = {'50': '#f7fee7', '100': '#ecfcca', '200': '#d8f999',
        '300': '#bbf451', '400': '#9ae600', '500': '#83cd00',
        '600': '#64a400', '700': '#497d00', '800': '#3c6300',
        '900': '#35530e', '950': '#192e03'}

mpl.rcParams.update({
    'figure.facecolor': gray['50'],
    'axes.facecolor':   gray['50'],
    'axes.edgecolor':   gray['100'],
    'axes.labelcolor':  gray['700'],
    'xtick.color':      gray['700'],
    'ytick.color':      gray['700'],
    'font.family':      'Inter',  # falls back to DejaVu Sans if Inter is not installed
    'font.size':        11,
})

sns.set_style("white")      # keep minimal axes; we draw our own reference lines
sns.set_context("notebook")


def prepare_tasks():
    # This dataset comes from https://epoch.ai/gradient-updates/consequences-of-automating-remote-work
    # It labels whether each O*NET task can be done remotely or not (labeled by GPT-4o).
    # Download: https://drive.google.com/file/d/1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r/view?usp=sharing
    df_remote_status = pd.read_csv("epoch_task_data.csv")

    # BLS OEWS: https://www.bls.gov/oes/special-requests/oesm23nat.zip
    df_oesm = pd.read_excel("oesm23national.xlsx")

    # Produced by `uv run ./enrich_task_ratings.py`
    df_tasks = pd.read_json("task_ratings_enriched.json")

    # Produced by `uv run classify_estimateability_of_tasks.py`
    df_task_estimateable = (pd.read_csv("tasks_estimateable.csv")
                            .rename(columns={"task_estimateable": "estimateable"})
                            .drop_duplicates(subset=['task'], keep='first'))

    # df_tasks now gets a remote_status column containing either "remote" or "not remote".
    df_tasks = pd.merge(df_tasks, df_remote_status[['Task', 'Remote']],
                        left_on='task', right_on='Task', how='left')
    df_tasks = df_tasks.drop('Task', axis=1).rename(columns={'Remote': 'remote_status'})

    # df_tasks now gets an estimateable column containing either "ATOMIC" or "ONGOING-CONSTRAINT".
    df_tasks = pd.merge(df_tasks, df_task_estimateable[['task', 'estimateable']],
                        on='task', how='left')

    df_tasks = df_tasks[df_tasks['importance_average'] < 3].copy()
    df_tasks['onetsoc_major'] = df_tasks['onetsoc_code'].str[:2]
    df_remote_tasks = df_tasks[df_tasks['remote_status'] == 'remote'].copy()
    # Call create_task_estimates() from add_task_estimates (?), which creates
    # tasks_with_estimates.csv.
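
# Illustrative sanity check, a sketch that is not part of the original pipeline:
# after prepare_tasks() runs, every task row should map to a known SOC major group
# and carry a remote/not-remote label. Assumes a df_tasks frame with the columns
# built above is passed in.
def check_prepared_tasks(df_tasks):
    unknown_majors = set(df_tasks['onetsoc_major']) - set(occupation_major_codes)
    if unknown_majors:
        print(f"Unmapped SOC major groups: {sorted(unknown_majors)}")
    n_unlabeled = df_tasks['remote_status'].isna().sum()
    if n_unlabeled:
        print(f"{n_unlabeled} tasks have no remote/not-remote label after the merge")
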
def preprocessing_time_estimates():
    df = pd.read_csv("tasks_with_estimates.csv")
    df = df[df['importance_average'] > 3].copy()

    # The embeddings come from running `uv run ./embed_task_description.py`.
    # The parquet holds one embedding per UNIQUE task.
    df_task_embeddings = (pd.read_parquet("tasks_with_embeddings.parquet")
                          .drop_duplicates(subset=['task'])
                          [['task', 'task_embedding']]
                          .rename(columns={"task_embedding": "embedding_vector"})
                          .copy())
    df = pd.merge(df, df_task_embeddings[['task', 'embedding_vector']], on='task', how='left')

    # Same estimateability labels as loaded in prepare_tasks().
    df_task_estimateable = (pd.read_csv("tasks_estimateable.csv")
                            .rename(columns={"task_estimateable": "estimateable"})
                            .drop_duplicates(subset=['task'], keep='first'))
    df = pd.merge(df, df_task_estimateable[['task', 'estimateable']], on='task', how='left')

    df['onetsoc_major'] = df['onetsoc_code'].str[:2]

    def convert_to_minutes(qty, unit):
        """Convert a quantity in a given unit to minutes."""
        return qty * {
            "minute":    1,
            "hour":      60,
            "day":       60 * 24,
            "week":      60 * 24 * 7,
            "month":     60 * 24 * 30,
            "trimester": 60 * 24 * 90,
            "semester":  60 * 24 * 180,
            "year":      60 * 24 * 365,
        }[unit]

    df['lb_estimate_in_minutes'] = df.apply(
        lambda row: convert_to_minutes(row['lb_estimate_qty'], row['lb_estimate_unit']), axis=1)
    df['ub_estimate_in_minutes'] = df.apply(
        lambda row: convert_to_minutes(row['ub_estimate_qty'], row['ub_estimate_unit']), axis=1)

    df['estimate_range'] = df.ub_estimate_in_minutes - df.lb_estimate_in_minutes
    df['estimate_ratio'] = df.ub_estimate_in_minutes / df.lb_estimate_in_minutes
    df['estimate_midpoint'] = (df.lb_estimate_in_minutes + df.ub_estimate_in_minutes) / 2

    atomic_tasks = df[df['estimateable'] == 'ATOMIC']
    ongoing_tasks = df[df['estimateable'] == 'ONGOING-CONSTRAINT']

    with pd.option_context('display.max_columns', None):
        display(df)

    # Check for missing estimates.
    if atomic_tasks['lb_estimate_in_minutes'].isnull().sum() > 0:
        print("Missing values in 'lb_estimate_in_minutes':",
              atomic_tasks['lb_estimate_in_minutes'].isnull().sum())
    if atomic_tasks['ub_estimate_in_minutes'].isnull().sum() > 0:
        print("Missing values in 'ub_estimate_in_minutes':",
              atomic_tasks['ub_estimate_in_minutes'].isnull().sum())

    # Check for impossible bounds.
    impossible_bounds = atomic_tasks[
        (atomic_tasks['lb_estimate_in_minutes'] <= 0)
        | (atomic_tasks['ub_estimate_in_minutes'] <= 0)
        | (atomic_tasks['lb_estimate_in_minutes'] > atomic_tasks['ub_estimate_in_minutes'])
    ]
    if not impossible_bounds.empty:
        print("Error: found rows with impossible bounds.")
        with pd.option_context('display.max_colwidth', None):
            display(impossible_bounds[['task', 'lb_estimate_in_minutes',
                                       'ub_estimate_in_minutes', 'dwas']])

    # with pd.option_context('display.max_colwidth', None):
    #     display(atomic_tasks.nlargest(20, 'ub_estimate_in_minutes')[
    #         ['task', 'lb_estimate_qty', 'lb_estimate_unit', 'lb_estimate_in_minutes',
    #          'ub_estimate_qty', 'ub_estimate_unit', 'ub_estimate_in_minutes',
    #          'estimate_ratio']])


def cell1():
    sns.histplot(atomic_tasks.estimate_midpoint, log_scale=True)


def cell2():
    plt.figure(figsize=(14, 10))
    sns.boxplot(
        data=atomic_tasks,
        x='onetsoc_major',   # 11 = Management, 15 = Computer/Math, …
        y='estimate_range',
        showfliers=False,
    )
    plt.yscale('log')        # long tail => log scale
    plt.xlabel('Occupation')
    plt.ylabel('Range (upper minus lower, minutes)')
    plt.title('Spread of time-range estimates per occupation')
    ax = plt.gca()
    ax.set_xticklabels([occupation_major_codes[label.get_text()]
                        for label in ax.get_xticklabels()],
                       rotation=60, ha='right')
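
# Numeric companion to cell2's boxplot, a sketch assuming atomic_tasks is in scope:
# quartiles of the estimate range per SOC major group, with readable labels.
def range_quartiles(atomic_tasks):
    return (atomic_tasks
            .groupby('onetsoc_major')['estimate_range']
            .quantile([0.25, 0.5, 0.75])
            .unstack()
            .rename(index=occupation_major_codes))
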
def cell3():
    plt.figure(figsize=(10, 10))
    ax = sns.scatterplot(
        data=atomic_tasks.replace({'onetsoc_major': occupation_major_codes}),  # codes -> labels
        x='lb_estimate_in_minutes',
        y='ub_estimate_in_minutes',
        alpha=0.2,
        edgecolor=None,
        hue='onetsoc_major',  # use the labeled column for hue
    )

    # 45-degree reference line.
    lims = (1, atomic_tasks[['lb_estimate_in_minutes', 'ub_estimate_in_minutes']].max().max())
    ax.plot(lims, lims, color='black', linestyle='--', linewidth=1)

    # Optional helper lines at 2x, 10x, and 100x ratios.
    for k in [2, 10, 100]:
        ax.plot(lims, [k * l for l in lims], linestyle=':', color='grey', linewidth=1)

    ax.set(xscale='log', yscale='log')
    ax.set_xlabel('Lower bound (min, log scale)')
    ax.set_ylabel('Upper bound (min, log scale)')
    ax.set_title('Lower vs upper estimates for all tasks')
    # Place the legend outside the plot.
    ax.legend(bbox_to_anchor=(1, 1), loc='upper left')


def cell4():
    plt.figure(figsize=(8, 4))
    sns.histplot(np.log10(atomic_tasks['estimate_ratio']
                          .replace([np.inf, -np.inf], np.nan)
                          .dropna()),
                 bins=60, kde=True)
    plt.axvline(np.log10(10), color='red', ls='--', lw=1, label='10×')
    plt.axvline(np.log10(1.05), color='orange', ls='--', lw=1, label='1.05×')
    plt.axvline(0, color='black', ls='-', lw=1)  # ub == lb
    plt.xlabel('log₁₀(upper / lower)')
    plt.ylabel('Count')
    plt.title('Distribution of upper:lower ratio')
    plt.legend()
    plt.tight_layout()


def cell5():
    # 1. Bin lower bounds into quartiles (Q1–Q4).
    atomic_tasks['lb_q'] = pd.qcut(atomic_tasks.lb_estimate_in_minutes, q=4,
                                   labels=['Q1 shortest', 'Q2', 'Q3', 'Q4 longest'])

    # 2. Aggregate: median ratio per occupation-by-quartile cell.
    pivot = atomic_tasks.pivot_table(index='onetsoc_major', columns='lb_q',
                                     values='estimate_ratio', aggfunc='median')

    # Map the onetsoc_major codes to their labels.
    pivot.index = pivot.index.map(occupation_major_codes)

    # 3. Visualize.
    plt.figure(figsize=(10, 8))
    sns.heatmap(pivot, cmap='RdYlGn_r', center=2, annot=True, fmt='.1f',
                cbar_kws={'label': 'Median upper/lower ratio'})
    plt.xlabel('Lower-bound quartile')
    plt.ylabel('Occupation (major group)')
    plt.title('Typical range width by occupation and task length')
    plt.tight_layout()


def cell6():
    agg = (atomic_tasks
           .groupby('onetsoc_code')['estimate_midpoint']
           .agg(median='median',
                q1=lambda x: x.quantile(.25),
                q3=lambda x: x.quantile(.75),
                mean='mean',
                std='std')
           .reset_index())
    agg['IQR'] = agg.q3 - agg.q1
    agg['CV'] = agg['std'] / agg['mean']  # coefficient of variation

    # Merge the group mean and std back so each row can be scored.
    # (Assigning back to atomic_tasks here would make it a local name and raise
    # UnboundLocalError above, so score into a separate frame.)
    scored = atomic_tasks.merge(agg[['onetsoc_code', 'mean', 'std']], on='onetsoc_code')
    scored['z'] = (scored.estimate_midpoint - scored['mean']) / scored['std']
    outliers = scored.loc[scored.z.abs() > 3]
    return outliers


def cell7():
    from scipy.stats import median_abs_deviation

    def mad_z(series):
        med = series.median()
        mad = median_abs_deviation(series, scale='normal')  # scaled to be comparable to σ
        return (series - med) / mad

    atomic_tasks['robust_z'] = (atomic_tasks
                                .groupby('onetsoc_code')['estimate_midpoint']
                                .transform(mad_z))
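
# Hypothetical follow-up (not in the original notebook): compare how many tasks the
# plain per-occupation z-score (cell6) and the MAD-based robust z-score (cell7) flag
# at |z| > 3. The robust score is less sensitive to a handful of extreme midpoints.
def compare_outlier_counts():
    plain_outliers = cell6()   # rows with |z| > 3 under mean/std scoring
    cell7()                    # adds robust_z to atomic_tasks in place
    robust_n = (atomic_tasks['robust_z'].abs() > 3).sum()
    print(f"plain z-score flags {len(plain_outliers)} tasks, robust z flags {robust_n}")
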
def cell8():
    from sklearn.neighbors import NearestNeighbors

    # Unit-normalize the embeddings.
    X = np.vstack(atomic_tasks.embedding_vector.values)
    X = X / np.linalg.norm(X, axis=1, keepdims=True)

    k = 5
    nn = NearestNeighbors(n_neighbors=k + 1, metric='cosine').fit(X)
    dist, idx = nn.kneighbors(X)  # idx[:, 0] is the task itself

    pairs = pd.DataFrame({
        'i': np.repeat(np.arange(len(X)), k),
        'j': idx[:, 1:].ravel(),          # drop the self-match
        'sim': 1 - dist[:, 1:].ravel(),   # cosine similarity
    })

    # Join the time estimates. Note: 'i' and 'j' are positional indices into X,
    # so atomic_tasks needs a default 0..n-1 index here (reset_index if needed).
    pairs = (pairs
             .merge(atomic_tasks[['estimate_midpoint']], left_on='i', right_index=True)
             .merge(atomic_tasks[['estimate_midpoint']], left_on='j', right_index=True,
                    suffixes=('_i', '_j')))

    pairs['ratio'] = (pairs[['estimate_midpoint_i', 'estimate_midpoint_j']].max(axis=1)
                      / pairs[['estimate_midpoint_i', 'estimate_midpoint_j']].min(axis=1))
    return pairs


def cell9():
    # Sample first so x and y come from the same rows (sampling only the data=
    # argument while passing a full-length y would misalign them).
    sample = pairs.sample(20_000)
    sns.scatterplot(data=sample, x='sim', y=np.log10(sample['ratio']), alpha=.1)
    plt.axhline(np.log10(2), ls=':', color='red')  # e.g. >2× difference
    plt.xlabel('Cosine similarity')
    plt.ylabel('log₁₀ time-ratio')
    plt.title('Are similar tasks given similar time estimates?')
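
# A sketch (not in the original notebook) that surfaces the most suspicious pairs
# from cell8: near-duplicate task descriptions whose midpoint estimates still differ
# by a large factor. Assumes pairs and a 0..n-1-indexed atomic_tasks are in scope;
# the cutoffs are illustrative.
def inconsistent_pairs(pairs, atomic_tasks, sim_cutoff=0.9, ratio_cutoff=5):
    suspect = pairs[(pairs['sim'] > sim_cutoff) & (pairs['ratio'] > ratio_cutoff)]
    suspect = suspect.merge(atomic_tasks[['task']], left_on='i', right_index=True)
    suspect = suspect.merge(atomic_tasks[['task']], left_on='j', right_index=True,
                            suffixes=('_i', '_j'))
    return suspect.sort_values('ratio', ascending=False)
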
def cell10():
    import matplotlib.ticker as mtick    # percentage formatting
    import matplotlib.colors as mcolors  # color conversion

    summary_data = []
    for code, label in occupation_major_codes.items():
        occ_df = df_tasks[df_tasks['onetsoc_major'] == code]
        total_tasks_in_occ = len(occ_df)
        if total_tasks_in_occ == 0:
            continue  # skip occupations with no tasks

        # Stack 1: tasks that are not remote.
        not_remote_count = len(occ_df[occ_df['remote_status'] != 'remote'])

        # Among the remaining remote tasks:
        remote_df = occ_df[occ_df['remote_status'] == 'remote']
        # Stack 2: remote + ATOMIC.
        remote_atomic_count = len(remote_df[remote_df['estimateable'] == 'ATOMIC'])
        # Stack 3: remote + ONGOING-CONSTRAINT.
        remote_ongoing_count = len(remote_df[remote_df['estimateable'] == 'ONGOING-CONSTRAINT'])

        summary_data.append({
            'onetsoc_major_code': code,
            'occupation_label': label,
            'count_not_remote': not_remote_count,
            'count_remote_atomic': remote_atomic_count,
            'count_remote_ongoing': remote_ongoing_count,
            'total_tasks': total_tasks_in_occ,
        })

    summary_df = pd.DataFrame(summary_data)

    # Percentages. Guard against division by zero; .copy() avoids a SettingWithCopyWarning.
    summary_df = summary_df[summary_df['total_tasks'] > 0].copy()
    summary_df['pct_not_remote'] = summary_df['count_not_remote'] / summary_df['total_tasks'] * 100
    summary_df['pct_remote_atomic'] = summary_df['count_remote_atomic'] / summary_df['total_tasks'] * 100
    summary_df['pct_remote_ongoing'] = summary_df['count_remote_ongoing'] / summary_df['total_tasks'] * 100

    # Columns for plotting, indexed by occupation label.
    plot_df = summary_df.set_index('occupation_label')[
        ['pct_not_remote', 'pct_remote_atomic', 'pct_remote_ongoing']
    ]
    plot_df.columns = ['Not remote', 'Remote + estimable', 'Remote + not estimable']
    plot_df = plot_df.sort_values(by='Not remote', ascending=False)

    # Colors must match the column order above.
    bar_colors = [gray["300"], lime["500"], lime["200"]]

    fig, ax = plt.subplots(figsize=(14, 10))
    plot_df.plot(kind='barh', stacked=True, ax=ax, color=bar_colors)

    ax.set_xlabel("Percentage of tasks (%)", fontsize=12)
    ax.set_ylabel("Occupation major group", fontsize=12)
    ax.set_title("Task breakdown by occupation, remote status, and estimateability",
                 fontsize=14, pad=20)

    # Format the x-axis as percentages running 0-100%.
    ax.xaxis.set_major_formatter(mtick.PercentFormatter())
    plt.xlim(0, 100)

    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)

    def get_contrasting_text_color(bg_color):
        """Return 'black' or 'white', whichever contrasts better with bg_color.

        bg_color: a hex string (e.g. '#RRGGBB') or an RGBA tuple with values in [0, 1].
        """
        rgba = mcolors.to_rgba(bg_color) if isinstance(bg_color, str) else bg_color
        r, g, b, _ = rgba  # ignore alpha for the luminance calculation
        # Relative luminance (sRGB approximation).
        luminance = 0.2126 * r + 0.7152 * g + 0.0722 * b
        return 'black' if luminance > 0.55 else 'white'  # threshold tuned by eye

    # Label each bar segment with its percentage.
    for i, container in enumerate(ax.containers):
        text_color = get_contrasting_text_color(bar_colors[i])
        for patch in container.patches:
            width = patch.get_width()
            if width > 3:  # only label segments wider than 3%
                x = patch.get_x() + width / 2
                y = patch.get_y() + patch.get_height() / 2
                ax.text(x, y, f"{width:.1f}%", ha='center', va='center',
                        fontsize=8, color=text_color, fontweight='medium')

    plt.legend(title="Task category", bbox_to_anchor=(1.02, 1), loc='upper left',
               frameon=False)
    return summary_df
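
# Sanity sketch (hypothetical, not in the original notebook): the three shares from
# cell10 should sum to ~100% per occupation; a shortfall means some remote tasks lack
# an ATOMIC/ONGOING-CONSTRAINT label and fell out of the partition. Uses the
# summary_df returned by cell10 above.
def check_shares(summary_df):
    total = (summary_df['pct_not_remote']
             + summary_df['pct_remote_atomic']
             + summary_df['pct_remote_ongoing'])
    off = summary_df.loc[~np.isclose(total, 100), 'occupation_label']
    if not off.empty:
        print("Shares do not sum to 100% for:", ", ".join(off))
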
def cell11():
    df_oesm['onetsoc_major'] = df_oesm['OCC_CODE'].str[:2]

    # Wage bill per occupation = total employment x annual mean wage.
    # Coerce to numeric first: the OEWS file marks suppressed values with symbols.
    df_oesm['TOT_EMP'] = pd.to_numeric(df_oesm['TOT_EMP'], errors='coerce')
    df_oesm['A_MEAN'] = pd.to_numeric(df_oesm['A_MEAN'], errors='coerce')
    df_oesm.dropna(subset=['TOT_EMP', 'A_MEAN', 'onetsoc_major'], inplace=True)
    df_oesm['wage_bill'] = df_oesm['TOT_EMP'] * df_oesm['A_MEAN']

    # Aggregate by major group, label, and sort for plotting.
    df_wage_bill_major = df_oesm.groupby('onetsoc_major')['wage_bill'].sum().reset_index()
    df_wage_bill_major['OCC_TITLE_MAJOR'] = df_wage_bill_major['onetsoc_major'].map(occupation_major_codes)
    df_wage_bill_major = df_wage_bill_major.sort_values('wage_bill', ascending=False)

    plt.figure(figsize=(12, 8))
    sns.barplot(x='wage_bill', y='OCC_TITLE_MAJOR', data=df_wage_bill_major, palette="viridis")
    plt.title('Total wage bill per major occupation group')
    plt.xlabel('Total wage bill (USD)')
    plt.ylabel('Major occupation group')
    plt.grid(axis='x', linestyle='--', alpha=0.7)


def cell12():  # renamed from a second cell11 to avoid shadowing the wage-bill cell
    # ───────────────────────────────────────────────
    # 1. CUMULATIVE-DISTRIBUTION-FUNCTION (CDF) PREP
    # ───────────────────────────────────────────────
    def cdf(series):
        s = series.sort_values().reset_index(drop=True)
        return s.values, ((s.index + 1) / len(s)) * 100

    x_lb, y_lb = cdf(atomic_tasks['lb_estimate_in_minutes'])
    x_ub, y_ub = cdf(atomic_tasks['ub_estimate_in_minutes'])
    x_mid, y_mid = cdf((atomic_tasks['ub_estimate_in_minutes']
                        + atomic_tasks['lb_estimate_in_minutes']) / 2)

    # ───────────────────────────────────────────────
    # 2. PLOTTING
    # ───────────────────────────────────────────────
    fig, ax = plt.subplots(figsize=(10, 6))

    # Horizontal reference lines every 10%.
    for y_val in range(0, 101, 10):
        ax.axhline(y_val, color=gray['100'], linewidth=.8, zorder=1)

    ax.step(x_lb, y_lb, where='post', color=lime['300'],
            linewidth=1.8, linestyle='--', zorder=2, label='Lower bound estimate (CDF)')
    ax.step(x_ub, y_ub, where='post', color=lime['900'],
            linewidth=1.8, linestyle=':', zorder=3, label='Upper bound estimate (CDF)')
    # Midpoint CDF plotted last so it sits on top where the lines overlap.
    ax.step(x_mid, y_mid, where='post', color=lime['600'],
            linewidth=2.2, zorder=4, label='Mid-point estimate (CDF)')

    # Axes limits / scales.
    ax.set_ylim(0, 100)
    ax.set_xscale('log')

    # y-axis as percent labels.
    ax.yaxis.set_major_formatter(mpl.ticker.PercentFormatter(decimals=0))

    # Move the y-label to the top-left, just inside the plotting area.
    ax.text(-0.06, 1.03, "% of tasks with temporal coherence ≤ X",
            ha='left', va='bottom', transform=ax.transAxes,
            fontsize=12, fontweight='semibold')

    # Custom x-ticks at human-friendly durations.
    ticks = [1, 5, 10, 30, 60, 120, 240, 480, 1440, 2880, 10080,
             43200, 129600, 259200, 525600]
    ticklabels = ['1 min', '5 min', '10 min', '30 min', '1 hour', '2 hours',
                  '4 hours', '8 hours', '1 day', '2 days', '1 week',
                  '30 days', '90 days', '180 days', '1 year']

    # Vertical reference lines at the x-ticks.
    for tick in ticks:
        ax.axvline(tick, color=gray['300'], linewidth=.8, linestyle='--', zorder=1)
    ax.set_xticks(ticks)
    ax.set_xticklabels(ticklabels, rotation=45, ha='right')

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_edgecolor(gray['300'])
    ax.spines['bottom'].set_edgecolor(gray['300'])

    ax.legend(frameon=False, loc='lower right')
    ax.text(0.5, -0.3, 'Temporal coherence (X)',
            ha='center', va='center', transform=ax.transAxes,
            fontsize=12, fontweight='semibold')
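
# Minimal export sketch (assumption: run inside Colab, where google.colab.files is
# available; this is why `files` is imported at the top). Saves whatever figure is
# current and triggers a browser download.
def export_current_figure(name="figure.png"):
    plt.savefig(name, dpi=200, bbox_inches='tight', facecolor=gray['50'])
    files.download(name)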