sprint-econtai/analysis.py

import os
import litellm
import sqlite3
import numpy as np
import pandas as pd
from google.colab import userdata, files
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')
os.environ['GEMINI_API_KEY'] = userdata.get('GEMINI_API_KEY')

occupation_major_codes = {
    '11': 'Management',
    '13': 'Business and Financial Operations',
    '15': 'Computer and Mathematical Occupations',
    '17': 'Architecture and Engineering',
    '19': 'Life, Physical, and Social Science',
    '21': 'Community and Social Services',
    '23': 'Legal',
    '25': 'Education, Training, and Library',
    '27': 'Arts, Design, Entertainment, Sports, and Media',
    '29': 'Healthcare Practitioners and Technical',
    '31': 'Healthcare Support',
    '33': 'Protective Service',
    '35': 'Food Preparation and Serving Related',
    '37': 'Building and Grounds Cleaning and Maintenance',
    '39': 'Personal Care and Service',
    '41': 'Sales and Related',
    '43': 'Office and Administrative Support',
    '45': 'Farming, Fishing, and Forestry',
    '47': 'Construction and Extraction',
    '49': 'Installation, Maintenance, and Repair',
    '51': 'Production',
    '53': 'Transportation and Material Moving',
    '55': 'Military Specific'
}

gray   = {'50':'#f8fafc','100':'#f1f5f9','200':'#e2e8f0',
                   '300':'#cbd5e1','400':'#94a3b8','500':'#64748b',
                   '600':'#475569','700':'#334155','800':'#1e293b',
                   '900':'#0f172a','950':'#020617'}
lime            = {'50': '#f7fee7','100': '#ecfcca','200': '#d8f999',
                   '300': '#bbf451','400': '#9ae600','500': '#83cd00',
                   '600': '#64a400','700': '#497d00','800': '#3c6300',
                   '900': '#35530e','950': '#192e03'}

mpl.rcParams.update({
    'figure.facecolor' : gray['50'],
    'axes.facecolor'   : gray['50'],
    'axes.edgecolor'   : gray['100'],
    'axes.labelcolor'  : gray['700'],
    'xtick.color'      : gray['700'],
    'ytick.color'      : gray['700'],
    'font.family'      : 'Inter',  # falls back to DejaVu if Inter not present
    'font.size'        : 11,
})

sns.set_style("white")         # keep minimal axes, we will remove default grid
sns.set_context("notebook")

def prepare_tasks():
    # This dataset comes from https://epoch.ai/gradient-updates/consequences-of-automating-remote-work
    # It contains labels for a O*NET task can be done remotely or not (labeled by GPT-4o)
    # You can download it here: https://drive.google.com/file/d/1GrHhuYIgaCCgo99dZ_40BWraz-fzo76r/view?usp=sharing
    df_remote_status = pd.read_csv("epoch_task_data.csv")

    # BLS OEWS: https://www.bls.gov/oes/special-requests/oesm23nat.zip
    df_oesm = pd.read_excel("oesm23national.xlsx")

    # Run uv run ./enrich_task_ratings.py
    df_tasks = pd.read_json("task_ratings_enriched.json")

    # Run uv run classify_estimateability_of_tasks.py
    df_task_estimateable = pd.read_csv("tasks_estimateable.csv").rename(columns={"task_estimateable": "estimateable"}).drop_duplicates(subset=['task'], keep='first')

    # df_tasks now has a remote_status column which contains either "remote" or "not remote"
    df_tasks = pd.merge(df_tasks, df_remote_status[['Task', 'Remote']], left_on='task', right_on='Task', how='left')
    df_tasks = df_tasks.drop('Task', axis=1).rename(columns={'Remote': 'remote_status'})

    # df_tasks now has a estimateable column which contains either "ATOMIC" or "ONGOING-CONSTRAINT"
    df_tasks = pd.merge(df_tasks, df_task_estimateable[['task', 'estimateable']], on='task', how='left')

    df_tasks = df_tasks[df_tasks['importance_average'] < 3].copy()

    df_tasks['onetsoc_major'] = df_tasks['onetsoc_code'].str[:2]

    df_remote_tasks = df_tasks[df_tasks['remote_status'] == 'remote'].copy()

    # Call create_task_estimates() from add_task_estimates? which creates tasks_with_estimates.csv

def preprocessing_time_estimates():
    df = pd.read_csv("tasks_with_estimates.csv")

    df = df[df['importance_average'] > 3].copy()

    # The embeddings comes from running `uv run ./embed_task_description.py`
    # Columns: ['embedding_id', 'task', 'embedding_vector']
    # These contain embedding for UNIQUE tasks
    df_task_embeddings = pd.read_parquet("tasks_with_embeddings.parquet").drop_duplicates(subset=['task'])[['task', 'task_embedding']].rename(columns={"task_embedding": "embedding_vector"}).copy()

    df = pd.merge(df, df_task_embeddings[['task', 'embedding_vector']], on='task', how='left')
    df = pd.merge(df, df_task_estimateable[['task', 'estimateable']], on='task', how='left')

    df['onetsoc_major'] = df['onetsoc_code'].str[:2]

    def convert_to_minutes(qty, unit):
        """Converts a quantity in a given unit to minutes."""
        return qty * {
            "minute": 1,
            "hour": 60,
            "day": 60 * 24,
            "week": 60 * 24 * 7,
            "month": 60 * 24 * 30,
            "trimester": 60 * 24 * 90,
            "semester": 60 * 24 * 180,
            "year": 60 * 24 * 365,
        }[unit]

    df['lb_estimate_in_minutes'] = df.apply(
        lambda row: convert_to_minutes(row['lb_estimate_qty'], row['lb_estimate_unit']), axis=1
    )
    df['ub_estimate_in_minutes'] = df.apply(
        lambda row: convert_to_minutes(row['ub_estimate_qty'], row['ub_estimate_unit']), axis=1
    )

    df['estimate_range'] = df.ub_estimate_in_minutes - df.lb_estimate_in_minutes
    df['estimate_ratio'] = df.ub_estimate_in_minutes / df.lb_estimate_in_minutes
    df['estimate_midpoint'] = (df.lb_estimate_in_minutes + df.ub_estimate_in_minutes)/2

    atomic_tasks = df[df['estimateable'] == 'ATOMIC']
    ongoing_tasks = df[df['estimateable'] == 'ONGOING-CONSTRAINT']

    with pd.option_context('display.max_columns', None):
      display(df)

    # Check for empty estimates
    if atomic_tasks['lb_estimate_in_minutes'].isnull().sum() > 0:
        print("Missing values in 'lb_estimate_in_minutes':", atomic_tasks['lb_estimate_in_minutes'].isnull().sum())

    if atomic_tasks['ub_estimate_in_minutes'].isnull().sum() > 0:
        print("Missing values in 'ub_estimate_in_minutes':", atomic_tasks['ub_estimate_in_minutes'].isnull().sum())

    # Check for impossible bounds
    impossible_bounds = atomic_tasks[
        (atomic_tasks['lb_estimate_in_minutes'] <= 0) |
        (atomic_tasks['ub_estimate_in_minutes'] <= 0) |
        (atomic_tasks['lb_estimate_in_minutes'] > atomic_tasks['ub_estimate_in_minutes'])
    ]
    if not impossible_bounds.empty:
        print(f"Error: Found rows with impossible bounds.")
        with pd.option_context('display.max_colwidth', None):
        display(impossible_bounds[['task', 'lb_estimate_in_minutes', 'ub_estimate_in_minutes', 'dwas']])

    #with pd.option_context('display.max_colwidth', None):
        #display(atomic_tasks.nlargest(20, 'ub_estimate_in_minutes')[['task', 'lb_estimate_qty', 'lb_estimate_unit', 'lb_estimate_in_minutes', 'ub_estimate_qty', 'ub_estimate_unit', 'ub_estimate_in_minutes', 'estimate_ratio']])

def cell1():
    sns.histplot(atomic_tasks.estimate_midpoint, log_scale=True)

def cell2():
    plt.figure(figsize=(14,10))
    sns.boxplot(
        data=atomic_tasks,
        x='onetsoc_major',           # 11 = Management, 15 = Computer/Math, …
        y='estimate_range',
        showfliers=False
    )
    plt.yscale('log')                # long tail => log scale
    plt.xlabel('Occupation')
    plt.ylabel('Range (upper-lower, minutes)')
    plt.title('Spread of time-range estimates per occupation')

    ax = plt.gca()
    ax.set_xticklabels([occupation_major_codes[code.get_text()] for code in ax.get_xticklabels()], rotation=60, ha='right')

def cell3():
    plt.figure(figsize=(10, 10))
    ax = sns.scatterplot(
            data=atomic_tasks.replace({'onetsoc_major': occupation_major_codes}),  # Replace codes with labels
            x='lb_estimate_in_minutes', y='ub_estimate_in_minutes',
            alpha=0.2, edgecolor=None, hue="onetsoc_major"  # Use the labeled column for hue
        )

    # 45° reference
    lims = (1, atomic_tasks[['lb_estimate_in_minutes','ub_estimate_in_minutes']].max().max())
    ax.plot(lims, lims, color='black', linestyle='--', linewidth=1)

    # optional helper lines: 2× and 10×, 100× ratios
    for k in [2,10, 100]:
        ax.plot(lims, [k*l for l in lims],
                linestyle=':', color='grey', linewidth=1)

    ax.set(xscale='log', yscale='log')
    ax.set_xlabel('Lower-bound (min, log scale)')
    ax.set_ylabel('Upper-bound (min, log scale)')
    ax.set_title('Lower vs upper estimates for all tasks')

    # Place the legend outside the plot
    ax.legend(bbox_to_anchor=(1, 1), loc='upper left')

def cell4():
    plt.figure(figsize=(8,4))
    sns.histplot(np.log10(atomic_tasks['estimate_ratio'].replace([np.inf, -np.inf], np.nan).dropna()),
                bins=60, kde=True)
    plt.axvline(np.log10(10), color='red', ls='--', lw=1, label='10×')
    plt.axvline(np.log10(1.05), color='orange', ls='--', lw=1, label='1.05×')
    plt.axvline(0, color='black', ls='-', lw=1)          # ub = lb
    plt.xlabel('log₁₀(upper / lower)')
    plt.ylabel('Count')
    plt.title('Distribution of upper:lower ratio')
    plt.legend()
    plt.tight_layout()


def cell5():
    # 1. Bin lower bounds into quartiles (Q1–Q4)
    atomic_tasks['lb_q'] = pd.qcut(atomic_tasks.lb_estimate_in_minutes,
                        q=4, labels=['Q1 shortest','Q2','Q3','Q4 longest'])


    # 3. Aggregate: median (or mean) ratio per cell
    pivot = atomic_tasks.pivot_table(index='onetsoc_major', columns='lb_q',
                        values='estimate_ratio', aggfunc='median')

    # Map the index (onetsoc_major codes) to their corresponding labels
    pivot.index = pivot.index.map(occupation_major_codes)


    # 4. Visualise
    plt.figure(figsize=(10,8))
    sns.heatmap(pivot, cmap='RdYlGn_r', center=2, annot=True, fmt='.1f',
                cbar_kws={'label':'Median upper/lower ratio'})
    plt.xlabel('Lower-bound quartile')
    plt.ylabel('Occupation (major group)')
    plt.title('Typical range width by occupation and task length')
    plt.tight_layout()


def cell6():
    """
    from scipy.stats import median_abs_deviation

    def mad_z(series):
        med = series.median()
        mad = median_abs_deviation(series, scale='normal')  # ⇒ comparable to σ
        return (series - med) / mad

    df['robust_z'] = df.groupby('onetsoc_code')['estimate_midpoint'].transform(mad_z)
    """

    agg = (atomic_tasks
           .groupby('onetsoc_code')['estimate_midpoint']
           .agg(median='median',
                q1=lambda x: x.quantile(.25),
                q3=lambda x: x.quantile(.75),
                mean='mean',
                std='std')
           .reset_index())
    agg['IQR'] = agg.q3 - agg.q1
    agg['CV']  = agg['std'] / agg['mean']            # coefficient of variation

    # merge back the group mean and std so each row can be scored
    atomic_tasks = atomic_tasks.merge(agg[['onetsoc_code','mean','std']], on='onetsoc_code')


    atomic_tasks['z'] = (atomic_tasks.estimate_midpoint - atomic_tasks['mean']) / atomic_tasks['std']
    outliers = atomic_tasks.loc[atomic_tasks.z.abs() > 3]
    outliers

def cell7():
    from scipy.stats import median_abs_deviation

    def mad_z(series):
        med = series.median()
        mad = median_abs_deviation(series, scale='normal')  # ⇒ comparable to σ
        return (series - med) / mad

    atomic_tasks['robust_z'] = atomic_tasks.groupby('onetsoc_code')['estimate_midpoint'].transform(mad_z)

def cell8():
    from sklearn.metrics.pairwise import cosine_similarity
    from sklearn.neighbors import NearestNeighbors

    # unit-normalise embeddings
    X = np.vstack(atomic_tasks.embedding_vector.values)
    X = X / np.linalg.norm(X, axis=1, keepdims=True)

    k = 5
    nn = NearestNeighbors(n_neighbors=k+1, metric='cosine').fit(X)
    dist, idx = nn.kneighbors(X)          # idx[:,0] is the task itself
    pairs = pd.DataFrame({
        'i': np.repeat(np.arange(len(X)), k),
        'j': idx[:,1:].ravel(),           # drop self-match
        'sim': 1 - dist[:,1:].ravel()     # cosine similarity
    })

    # join time estimates
    pairs = (pairs
             .merge(atomic_tasks[['estimate_midpoint']], left_on='i', right_index=True)
             .merge(atomic_tasks[['estimate_midpoint']], left_on='j', right_index=True,
                    suffixes=('_i','_j')))

    pairs['ratio'] = pairs[['estimate_midpoint_i','estimate_midpoint_j']].max(1) / \
                     pairs[['estimate_midpoint_i','estimate_midpoint_j']].min(1)

    pairs

def cell9():
    sns.scatterplot(data=pairs.sample(20_000), x='sim', y=np.log10(pairs['ratio']),
                    alpha=.1)
    plt.axhline(np.log10(2), ls=':', color='red');   # e.g. >2× difference
    plt.xlabel('Cosine similarity'); plt.ylabel('log₁₀ time-ratio');
    plt.title('Are similar tasks given similar time estimates?');

def cell10():
    import matplotlib.ticker as mtick # For percentage formatting
    import matplotlib.colors as mcolors # For color conversion

    summary_data = []

    for code, label in occupation_major_codes.items():
        occ_df = df_tasks[df_tasks['onetsoc_major'] == code]
        total_tasks_in_occ = len(occ_df)

        if total_tasks_in_occ == 0:
            continue # Skip if no tasks for this occupation

        # Stack 1: % that isn't equal to "remote"
        not_remote_count = len(occ_df[occ_df['remote_status'] != 'remote'])

        # For the remaining remote tasks:
        remote_df = occ_df[occ_df['remote_status'] == 'remote']

        # Stack 2: % of remote + ATOMIC
        remote_atomic_count = len(remote_df[remote_df['estimateable'] == 'ATOMIC'])

        # Stack 3: % of remote + ONGOING-CONSTRAINT
        remote_ongoing_count = len(remote_df[remote_df['estimateable'] == 'ONGOING-CONSTRAINT'])

        summary_data.append({
            'onetsoc_major_code': code,
            'occupation_label': label,
            'count_not_remote': not_remote_count,
            'count_remote_atomic': remote_atomic_count,
            'count_remote_ongoing': remote_ongoing_count,
            'total_tasks': total_tasks_in_occ
        })

    summary_df = pd.DataFrame(summary_data)

    # --- 3. Calculate Percentages ---
    # Ensure total_tasks is not zero to avoid division by zero errors if an occupation had no tasks
    summary_df = summary_df[summary_df['total_tasks'] > 0].copy() # Use .copy() to avoid SettingWithCopyWarning

    summary_df['pct_not_remote'] = (summary_df['count_not_remote'] / summary_df['total_tasks']) * 100
    summary_df['pct_remote_atomic'] = (summary_df['count_remote_atomic'] / summary_df['total_tasks']) * 100
    summary_df['pct_remote_ongoing'] = (summary_df['count_remote_ongoing'] / summary_df['total_tasks']) * 100

    # Select columns for plotting and set index to occupation label
    plot_df = summary_df.set_index('occupation_label')[
        ['pct_not_remote', 'pct_remote_atomic', 'pct_remote_ongoing']
    ]

    # Rename columns for a clearer legend
    plot_df.columns = ['Not Remote', 'Remote + Estimable', 'Remote + Not estimable']

    plot_df = plot_df.sort_values(by='Not Remote', ascending=False)


    # --- 4. Plotting (Modified) ---

    # Define the custom colors based on your requirements
    # The order must match the column order in plot_df:
    # 1. 'Not Remote'
    # 2. 'Remote & ATOMIC'
    # 3. 'Remote & ONGOING-CONSTRAINT'
    bar_colors = [gray["300"], lime["500"], lime["200"]]

    fig, ax = plt.subplots(figsize=(14, 10)) # Adjusted figsize for better readability

    plot_df.plot(kind='barh', stacked=True, ax=ax, color=bar_colors)

    ax.set_xlabel("Percentage of Tasks (%)", fontsize=12)
    ax.set_ylabel("Occupation Major Group", fontsize=12)
    ax.set_title("Task Breakdown by Occupation, Remote Status, and Estimateability", fontsize=14, pad=20)

    # Format x-axis as percentages
    ax.xaxis.set_major_formatter(mtick.PercentFormatter())
    plt.xlim(0, 100) # Ensure x-axis goes from 0 to 100%

    # Remove right and top spines
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)

    # Function to get contrasting text color
    def get_contrasting_text_color(bg_color_hex_or_rgba):
        """
        Determines if black or white text provides better contrast against a given background color.
        bg_color_hex_or_rgba: A hex string (e.g., '#RRGGBB') or an RGBA tuple (values in [0, 1]).
        Returns: 'black' or 'white'.
        """
        # Convert to RGBA if it's a hex string or name
        if isinstance(bg_color_hex_or_rgba, str):
            rgba = mcolors.to_rgba(bg_color_hex_or_rgba)
        else:
            rgba = bg_color_hex_or_rgba

        r, g, b, _ = rgba # Ignore alpha for luminance calculation
        # Calculate luminance (standard formula for sRGB)
        # Values r, g, b should be in [0, 1] for this formula
        luminance = 0.2126 * r + 0.7152 * g + 0.0722 * b
        # Threshold for deciding text color
        return 'black' if luminance > 0.55 else 'white' # Adjusted threshold slightly for better visual

    # Add percentages inside each bar segment
    # Iterate through each "category" of bars (Not Remote, Remote & ATOMIC, etc.)
    for i, container in enumerate(ax.containers):
        # Get the color for this container/category
        segment_color = bar_colors[i]
        text_color = get_contrasting_text_color(segment_color)

        for patch in container.patches: # Iterate through each bar segment in the category
            width = patch.get_width()
            if width > 3:  # Only add text if segment is wide enough (e.g., >3%)
                x = patch.get_x() + width / 2
                y = patch.get_y() + patch.get_height() / 2
                ax.text(x, y,
                        f"{width:.1f}%",
                        ha='center',
                        va='center',
                        fontsize=8, # Adjust font size as needed
                        color=text_color,
                        fontweight='medium') # Bolder text can help


    plt.legend(title="Task Category", bbox_to_anchor=(1.02, 1), loc='upper left', frameon=False)

def cell11():
    df_oesm['onetsoc_major'] = df_oesm['OCC_CODE'].str[:2]

    # Calculate wage bill per occupation
    # Wage bill = Total Employment * Annual Mean Wage
    # Ensure columns are numeric, converting non-numeric values to NaN first
    df_oesm['TOT_EMP'] = pd.to_numeric(df_oesm['TOT_EMP'], errors='coerce')
    df_oesm['A_MEAN'] = pd.to_numeric(df_oesm['A_MEAN'], errors='coerce')

    # Drop rows with NaN in necessary columns after coercion
    df_oesm.dropna(subset=['TOT_EMP', 'A_MEAN', 'onetsoc_major'], inplace=True)

    df_oesm['wage_bill'] = df_oesm['TOT_EMP'] * df_oesm['A_MEAN']

    # Aggregate wage bill by onetsoc_major
    df_wage_bill_major = df_oesm.groupby('onetsoc_major')['wage_bill'].sum().reset_index()

    # Map major codes to titles for better plotting
    df_wage_bill_major['OCC_TITLE_MAJOR'] = df_wage_bill_major['onetsoc_major'].map(occupation_major_codes)

    # Sort by wage bill for better visualization
    df_wage_bill_major = df_wage_bill_major.sort_values('wage_bill', ascending=False)

    # Plotting
    plt.figure(figsize=(12, 8))
    sns.barplot(x='wage_bill', y='OCC_TITLE_MAJOR', data=df_wage_bill_major, palette="viridis")
    plt.title('Total Wage Bill per Major Occupation Group')
    plt.xlabel('Total Wage Bill (in billions)')
    plt.ylabel('Major Occupation Group')
    plt.grid(axis='x', linestyle='--', alpha=0.7)

def cell11():
    # ───────────────────────────────────────────────────────────────
    # 1.  CUMULATIVE-DISTRIBUTION-FUNCTION (CDF) PREP
    # ───────────────────────────────────────────────────────────────
    def cdf(series):
        s = series.sort_values().reset_index(drop=True)
        return s.values, ((s.index + 1) / len(s)) * 100

    x_lb , y_lb  = cdf(atomic_tasks['lb_estimate_in_minutes'])
    x_ub , y_ub  = cdf(atomic_tasks['ub_estimate_in_minutes'])
    x_mid, y_mid = cdf((atomic_tasks['ub_estimate_in_minutes'] + atomic_tasks['lb_estimate_in_minutes']) / 2)

    # ───────────────────────────────────────────────────────────────
    # 2.  PLOTTING
    # ───────────────────────────────────────────────────────────────
    fig, ax = plt.subplots(figsize=(10, 6))

    # horizontal reference lines every 10 %
    for y_val in range(0, 101, 10):
        ax.axhline(y_val, color=gray['100'], linewidth=.8, zorder=1)

    # Plot Lower Bound CDF
    ax.step(x_lb, y_lb,
            where='post',
            color=lime['300'], # Example: light blue for lower bound
            linewidth=1.8,
            linestyle='--',
            zorder=2,
            label='Lower bound estimate (CDF)')

    # Plot Upper Bound CDF
    ax.step(x_ub, y_ub,
            where='post',
            color=lime['900'], # Example: light orange/red for upper bound
            linewidth=1.8,
            linestyle=':',
            zorder=3,
            label='Upper bound estimate (CDF)')

    # Plot Midpoint CDF (plotted last to be on top, or adjust zorder)
    ax.step(x_mid, y_mid,
            where='post',
            color=lime['600'],
            linewidth=2.2,
            zorder=4, # Ensure it's on top of other lines if they overlap significantly
            label='Mid-point estimate (CDF)')


    # axes limits / scales
    ax.set_ylim(0, 100)
    ax.set_xscale('log')

    # y-axis ➝ percent labels
    ax.yaxis.set_major_formatter(mpl.ticker.PercentFormatter(decimals=0))


    # move y-label to top-left (just inside plotting area)
    ax.text(-0.06, 1.03,
            "% of tasks with temporal coherence ≤ X",
            ha='left', va='bottom',
            transform=ax.transAxes,
            fontsize=12, fontweight='semibold')

    # custom x-ticks at human-friendly durations
    ticks      = [1, 5, 10, 30, 60, 120, 240, 480,
                1440, 2880, 10080, 43200, 129600,
                259200, 525600]
    ticklabels = ['1 min', '5 min', '10 min', '30 min', '1 hour', '2 hours', '4 hours', '8 hours',
                '1 day', '2 days', '1 week', '30 days',
                '90 days', '180 days', '1 year']

    # Vertical reference lines for x-ticks
    for tick in ticks:
        ax.axvline(tick, color=gray['300'], linewidth=.8, linestyle='--', zorder=1)

    ax.set_xticks(ticks)
    ax.set_xticklabels(ticklabels, rotation=45, ha='right')

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_edgecolor(gray['300'])
    ax.spines['bottom'].set_edgecolor(gray['300'])


    # legend
    ax.legend(frameon=False, loc='lower right') # Keep 'lower right' or adjust as needed

    ax.text(0.5, -0.3,
            'Temporal coherence (X)',
            ha='center', va='center',
            transform=ax.transAxes,
            fontsize=12, fontweight='semibold')