wip
This commit is contained in:
parent
62296e1b69
commit
65dc648797
37 changed files with 1413 additions and 2433 deletions
81
pipeline/aggregate.py
Normal file
81
pipeline/aggregate.py
Normal file
|
@ -0,0 +1,81 @@
|
|||
from .utils import OCCUPATION_MAJOR_CODES
|
||||
import pandas as pd
|
||||
|
||||
def create_task_summary_by_occupation_df(df_tasks: pd.DataFrame, oesm_df: pd.DataFrame) -> pd.DataFrame:
|
||||
# --- OESM Wage Bill Calculation ---
|
||||
df_oesm_with_bill = oesm_df.copy()
|
||||
df_oesm_with_bill.rename(columns={'OCC_CODE': 'onetsoc_code'}, inplace=True)
|
||||
|
||||
# Convert key columns to numeric, handling potential errors
|
||||
df_oesm_with_bill['TOT_EMP'] = pd.to_numeric(df_oesm_with_bill['TOT_EMP'], errors='coerce')
|
||||
df_oesm_with_bill['A_MEAN'] = pd.to_numeric(df_oesm_with_bill['A_MEAN'], errors='coerce')
|
||||
df_oesm_with_bill.dropna(subset=['TOT_EMP', 'A_MEAN', 'onetsoc_code'], inplace=True)
|
||||
|
||||
# Calculate the wage bill for each occupation
|
||||
df_oesm_with_bill['wage_bill'] = df_oesm_with_bill['TOT_EMP'] * df_oesm_with_bill['A_MEAN']
|
||||
oesm_lookup = df_oesm_with_bill.set_index('onetsoc_code')
|
||||
|
||||
summary_data = []
|
||||
|
||||
# Assuming df_tasks has an 'onetsoc_code' column with the full SOC code
|
||||
unique_soc_codes = df_tasks['onetsoc_code'].unique()
|
||||
|
||||
for code in unique_soc_codes:
|
||||
occ_df = df_tasks[df_tasks['onetsoc_code'] == code]
|
||||
total_tasks_in_occ = len(occ_df)
|
||||
|
||||
not_remote_count = len(occ_df[occ_df['remote_status'] != 'remote'])
|
||||
remote_df = occ_df[occ_df['remote_status'] == 'remote']
|
||||
remote_estimable_count = len(remote_df[remote_df['estimable']])
|
||||
remote_not_estimable_count = len(remote_df[~remote_df['estimable']])
|
||||
|
||||
try:
|
||||
# O*NET codes (e.g., 11-1011.03) are more specific than OESM SOC codes (e.g., 11-1011).
|
||||
# We strip the suffix from the O*NET code to find the corresponding wage data.
|
||||
soc_code_for_lookup = code.split('.')[0]
|
||||
wage_bill = oesm_lookup.loc[soc_code_for_lookup, 'wage_bill']
|
||||
label = oesm_lookup.loc[soc_code_for_lookup, 'OCC_TITLE']
|
||||
except KeyError:
|
||||
wage_bill = 0
|
||||
label = "Unknown"
|
||||
|
||||
summary_data.append({
|
||||
'onetsoc_code': code,
|
||||
'occupation_label': label,
|
||||
'wage_bill': wage_bill,
|
||||
'count_not_remote': not_remote_count,
|
||||
'count_remote_estimable': remote_estimable_count,
|
||||
'count_remote_not_estimable': remote_not_estimable_count,
|
||||
'total_tasks': total_tasks_in_occ
|
||||
})
|
||||
|
||||
return pd.DataFrame(summary_data)
|
||||
|
||||
|
||||
def aggregate_task_summary_by_major_code(summary_df: pd.DataFrame) -> pd.DataFrame:
|
||||
df_agg = summary_df.copy()
|
||||
df_agg['onetsoc_major_code'] = df_agg['onetsoc_code'].str[:2]
|
||||
|
||||
aggregation = {
|
||||
'wage_bill': 'sum',
|
||||
'count_not_remote': 'sum',
|
||||
'count_remote_estimable': 'sum',
|
||||
'count_remote_not_estimable': 'sum',
|
||||
'total_tasks': 'sum'
|
||||
}
|
||||
major_summary = df_agg.groupby('onetsoc_major_code').agg(aggregation).reset_index()
|
||||
|
||||
major_summary['occupation_label'] = major_summary['onetsoc_major_code'].map(OCCUPATION_MAJOR_CODES)
|
||||
|
||||
# Reorder columns to match original output format
|
||||
major_summary = major_summary[[
|
||||
'onetsoc_major_code',
|
||||
'occupation_label',
|
||||
'wage_bill',
|
||||
'count_not_remote',
|
||||
'count_remote_estimable',
|
||||
'count_remote_not_estimable',
|
||||
'total_tasks'
|
||||
]]
|
||||
|
||||
return major_summary
|
Loading…
Add table
Add a link
Reference in a new issue