from .utils import OCCUPATION_MAJOR_CODES import pandas as pd def create_task_summary_by_occupation_df(df_tasks: pd.DataFrame, oesm_df: pd.DataFrame) -> pd.DataFrame: # --- OESM Wage Bill Calculation --- df_oesm_with_bill = oesm_df.copy() df_oesm_with_bill.rename(columns={'OCC_CODE': 'onetsoc_code'}, inplace=True) # Convert key columns to numeric, handling potential errors df_oesm_with_bill['TOT_EMP'] = pd.to_numeric(df_oesm_with_bill['TOT_EMP'], errors='coerce') df_oesm_with_bill['A_MEAN'] = pd.to_numeric(df_oesm_with_bill['A_MEAN'], errors='coerce') df_oesm_with_bill.dropna(subset=['TOT_EMP', 'A_MEAN', 'onetsoc_code'], inplace=True) # Calculate the wage bill for each occupation df_oesm_with_bill['wage_bill'] = df_oesm_with_bill['TOT_EMP'] * df_oesm_with_bill['A_MEAN'] oesm_lookup = df_oesm_with_bill.set_index('onetsoc_code') summary_data = [] # Assuming df_tasks has an 'onetsoc_code' column with the full SOC code unique_soc_codes = df_tasks['onetsoc_code'].unique() for code in unique_soc_codes: occ_df = df_tasks[df_tasks['onetsoc_code'] == code] total_tasks_in_occ = len(occ_df) not_remote_count = len(occ_df[occ_df['remote_status'] != 'remote']) remote_df = occ_df[occ_df['remote_status'] == 'remote'] remote_estimable_count = len(remote_df[remote_df['estimable']]) remote_not_estimable_count = len(remote_df[~remote_df['estimable']]) try: # O*NET codes (e.g., 11-1011.03) are more specific than OESM SOC codes (e.g., 11-1011). # We strip the suffix from the O*NET code to find the corresponding wage data. soc_code_for_lookup = code.split('.')[0] wage_bill = oesm_lookup.loc[soc_code_for_lookup, 'wage_bill'] label = oesm_lookup.loc[soc_code_for_lookup, 'OCC_TITLE'] except KeyError: wage_bill = 0 label = "Unknown" summary_data.append({ 'onetsoc_code': code, 'occupation_label': label, 'wage_bill': wage_bill, 'count_not_remote': not_remote_count, 'count_remote_estimable': remote_estimable_count, 'count_remote_not_estimable': remote_not_estimable_count, 'total_tasks': total_tasks_in_occ }) return pd.DataFrame(summary_data) def aggregate_task_summary_by_major_code(summary_df: pd.DataFrame) -> pd.DataFrame: df_agg = summary_df.copy() df_agg['onetsoc_major_code'] = df_agg['onetsoc_code'].str[:2] aggregation = { 'wage_bill': 'sum', 'count_not_remote': 'sum', 'count_remote_estimable': 'sum', 'count_remote_not_estimable': 'sum', 'total_tasks': 'sum' } major_summary = df_agg.groupby('onetsoc_major_code').agg(aggregation).reset_index() major_summary['occupation_label'] = major_summary['onetsoc_major_code'].map(OCCUPATION_MAJOR_CODES) # Reorder columns to match original output format major_summary = major_summary[[ 'onetsoc_major_code', 'occupation_label', 'wage_bill', 'count_not_remote', 'count_remote_estimable', 'count_remote_not_estimable', 'total_tasks' ]] return major_summary