Feat: Implement task enrichment steps
Implement task estimateability and task estimate enrichment steps. Add a `create_df_tasks` postprocessor.
This commit is contained in:
parent
f9f9825abb
commit
62296e1b69
3 changed files with 221 additions and 22 deletions
|
@ -3,26 +3,95 @@ This module enriches data, they take time to run, and are usually expensive (API
|
||||||
they should manage their own state, and only be run if the data's version is different than
|
they should manage their own state, and only be run if the data's version is different than
|
||||||
their save.
|
their save.
|
||||||
"""
|
"""
|
||||||
from .run import Run
|
from .run import Run
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
from typing import Any, List, Dict
|
||||||
|
import litellm
|
||||||
|
|
||||||
|
def enrich(
|
||||||
|
model: str,
|
||||||
|
rpm: int,
|
||||||
|
messages_to_process: List[List[Dict[str, str]]],
|
||||||
|
schema: Dict[str, Any],
|
||||||
|
chunk_size: int = 100,
|
||||||
|
):
|
||||||
|
# Use litellm.batch_completion
|
||||||
|
pass
|
||||||
|
|
||||||
def enrich_with_task_estimateability(run: Run) -> pd.DataFrame:
|
def enrich_with_task_estimateability(run: Run) -> pd.DataFrame:
|
||||||
"""
|
output_path = run.cache_dir / "computed_task_estimateability.parquet"
|
||||||
TODO: check run.cache_dir / computed_task_estimateability.parquet, if it exists, load it, return it, and don't compute this
|
if output_path.exists():
|
||||||
|
print(f"Loading cached task estimateability from {output_path}")
|
||||||
|
return pd.read_parquet(output_path)
|
||||||
|
|
||||||
call enrich with the right parameters, save the output to cache dir,
|
df_remote_tasks = run.df_tasks[run.df_tasks['remote_status'] == 'remote'].copy()
|
||||||
return it
|
|
||||||
"""
|
# In the old script, we only passed unique tasks to the API
|
||||||
raise NotImplementedError
|
df_unique_tasks = df_remote_tasks.drop_duplicates(subset=['task'])
|
||||||
|
|
||||||
|
|
||||||
|
results = enrich(
|
||||||
|
model="gpt-4.1-mini",
|
||||||
|
rpm=5000,
|
||||||
|
messages_to_process=[
|
||||||
|
[
|
||||||
|
{"role": "system", "content": """
|
||||||
|
Judge whether the provided O*NET task is suitable for a time estimate. If it is a single, clearly-bounded activity, typically lasting minutes, hours, or a few days, then clearly yes. If it is a continuous responsibility or behavioural norm with no schedulable duration (e.g., “follow confidentiality rules,” “serve as department head”), then clearly no.
|
||||||
|
"""},
|
||||||
|
{"role": "user", "content": f"Task: {row.task}"},
|
||||||
|
]
|
||||||
|
for row in df_unique_tasks.itertuples()
|
||||||
|
],
|
||||||
|
schema={
|
||||||
|
"type": "object",
|
||||||
|
"properties": {"estimateable": {"type": "bool"}},
|
||||||
|
"required": ["estimateable"]
|
||||||
|
},
|
||||||
|
chunk_size=300,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create a new dataframe with just enough information to identify the task uniquely + estimateability classification, save it, return it. Careful: the "task" column in itself is not unique.
|
||||||
|
return pd.DataFrame()
|
||||||
|
|
||||||
def enrich_with_task_estimates(run: Run) -> pd.DataFrame:
|
def enrich_with_task_estimates(run: Run) -> pd.DataFrame:
|
||||||
"""
|
output_path = run.cache_dir / "computed_task_estimates.parquet"
|
||||||
TODO: check run.cache_dir / computed_task_estimates.parquet, if it exists, load it, return it, and don't compute this
|
if output_path.exists():
|
||||||
|
print(f"Loading cached task estimates from {output_path}")
|
||||||
|
return pd.read_parquet(output_path)
|
||||||
|
|
||||||
call enrich with the right parameters, save the output to cache dir,
|
df = ... # todo
|
||||||
return it
|
|
||||||
"""
|
|
||||||
raise NotImplementedError
|
|
||||||
|
|
||||||
def enrich(model: str, system_prompt: str, schema: Any, rpm: int, chunk_size: int = 100, messages: Any):
|
results = enrich(
|
||||||
|
model="gpt-4.1-mini",
|
||||||
|
rpm=5000,
|
||||||
|
messages_to_process=[
|
||||||
|
[
|
||||||
|
{"role": "system", "content": "Estimate the time required to complete the following O*NET task. Your estimate should be a plausible range for how long it might take a typical, qualified worker to perform this task once. Provide your answer as a time range (lower and upper bounds). Do not provide explanations or apologies. If the task is not suitable for a time estimate (e.g., it is an ongoing responsibility), interpret it as a single, schedulable action."},
|
||||||
|
{"role": "user", "content": f"""
|
||||||
|
Task: {row.task}
|
||||||
|
For Occupation: {row.occupation_title}
|
||||||
|
Occupation Description: {row.occupation_description}"""}
|
||||||
|
]
|
||||||
|
for row in df.itertuples()
|
||||||
|
],
|
||||||
|
schema={
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"lower_bound_estimate": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {"quantity": {"type": "number"}, "unit": {"type": "string", "enum": ["minutes", "hours", "days"]}},
|
||||||
|
"required": ["quantity", "unit"],
|
||||||
|
},
|
||||||
|
"upper_bound_estimate": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {"quantity": {"type": "number"}, "unit": {"type": "string", "enum": ["minutes", "hours", "days"]}},
|
||||||
|
"required": ["quantity", "unit"],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"required": ["lower_bound_estimate", "upper_bound_estimate"],
|
||||||
|
},
|
||||||
|
chunk_size=200,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create a new dataframe with just enough information to identify the task uniquely + the estimates classification, save it, return it. Careful: the "task" column in itself is not unique.
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
|
@ -1,10 +1,140 @@
|
||||||
from .run import Run
|
from .run import Run
|
||||||
|
from .logger import logger
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
def check_for_insanity(run: Run) -> Run:
|
def check_for_insanity(run: Run) -> Run:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
def create_df_tasks(run: Run) -> Run:
|
def create_df_tasks(run: Run) -> Run:
|
||||||
"""
|
"""
|
||||||
df_tasks are tasks that are remote-able, estimateable
|
Creates a dataframe of tasks from the O*NET database, and merges it with remote status data.
|
||||||
Add lb_estimate_in_minutes, ub_estimate_in_minutes, estimate_range, estimate_ratio, estimate_midpoint as columns
|
This replicates the logic from old/enrich_task_ratings.py and parts of old/analysis.py
|
||||||
|
|
||||||
|
The resulting dataframe, `run.df_tasks` will be used by the enrichment steps.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
logger.info("Creating tasks dataframe")
|
||||||
|
cache_path = run.cache_dir / f"onet_{run.onet_version}_tasks_with_remote_status.parquet"
|
||||||
|
if cache_path.exists():
|
||||||
|
logger.info(f"Loading cached tasks dataframe from {cache_path}")
|
||||||
|
run.df_tasks = pd.read_parquet(cache_path)
|
||||||
|
return run
|
||||||
|
|
||||||
|
query = """
|
||||||
|
SELECT
|
||||||
|
tr.onetsoc_code,
|
||||||
|
tr.task_id,
|
||||||
|
ts.task,
|
||||||
|
od.title AS occupation_title,
|
||||||
|
od.description AS occupation_description,
|
||||||
|
tr.scale_id,
|
||||||
|
tr.category,
|
||||||
|
tr.data_value,
|
||||||
|
dr.dwa_title
|
||||||
|
FROM
|
||||||
|
task_ratings tr
|
||||||
|
JOIN
|
||||||
|
task_statements ts ON tr.task_id = ts.task_id
|
||||||
|
JOIN
|
||||||
|
occupation_data od ON tr.onetsoc_code = od.onetsoc_code
|
||||||
|
LEFT JOIN
|
||||||
|
tasks_to_dwas td ON tr.onetsoc_code = td.onetsoc_code AND tr.task_id = td.task_id
|
||||||
|
LEFT JOIN
|
||||||
|
dwa_reference dr ON td.dwa_id = dr.dwa_id;
|
||||||
|
"""
|
||||||
|
df = pd.read_sql_query(query, run.onet_conn)
|
||||||
|
logger.info(f"Fetched {len(df)} records (including DWA info) from the database.")
|
||||||
|
|
||||||
|
# Separate ratings from DWAs
|
||||||
|
core_cols = [
|
||||||
|
"onetsoc_code", "task_id", "task", "occupation_title",
|
||||||
|
"occupation_description", "scale_id", "category", "data_value"
|
||||||
|
]
|
||||||
|
ratings_df = df[core_cols].drop_duplicates().reset_index(drop=True)
|
||||||
|
|
||||||
|
dwa_cols = ["onetsoc_code", "task_id", "dwa_title"]
|
||||||
|
dwas_df = df[dwa_cols].dropna(subset=["dwa_title"]).drop_duplicates().reset_index(drop=True)
|
||||||
|
|
||||||
|
# 1. Handle Frequency (FT)
|
||||||
|
logger.info("Processing Frequency data")
|
||||||
|
freq_df = ratings_df[ratings_df["scale_id"] == "FT"].copy()
|
||||||
|
if not freq_df.empty:
|
||||||
|
freq_pivot = freq_df.pivot_table(
|
||||||
|
index=["onetsoc_code", "task_id"],
|
||||||
|
columns="category",
|
||||||
|
values="data_value",
|
||||||
|
fill_value=0,
|
||||||
|
)
|
||||||
|
freq_pivot.columns = [f"frequency_category_{int(col)}" for col in freq_pivot.columns]
|
||||||
|
else:
|
||||||
|
idx = pd.MultiIndex(levels=[[], []], codes=[[], []], names=["onetsoc_code", "task_id"])
|
||||||
|
freq_pivot = pd.DataFrame(index=idx)
|
||||||
|
|
||||||
|
# 2. Handle Importance (IM, IJ)
|
||||||
|
logger.info("Processing Importance data")
|
||||||
|
imp_df = ratings_df[ratings_df["scale_id"].isin(["IM", "IJ"])].copy()
|
||||||
|
if not imp_df.empty:
|
||||||
|
imp_avg = imp_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index()
|
||||||
|
imp_avg.rename(columns={"data_value": "importance_average"}, inplace=True)
|
||||||
|
else:
|
||||||
|
imp_avg = pd.DataFrame(columns=["onetsoc_code", "task_id", "importance_average"])
|
||||||
|
|
||||||
|
# 3. Handle Relevance (RT)
|
||||||
|
logger.info("Processing Relevance data")
|
||||||
|
rel_df = ratings_df[ratings_df["scale_id"] == "RT"].copy()
|
||||||
|
if not rel_df.empty:
|
||||||
|
rel_avg = rel_df.groupby(["onetsoc_code", "task_id"])["data_value"].mean().reset_index()
|
||||||
|
rel_avg.rename(columns={"data_value": "relevance_average"}, inplace=True)
|
||||||
|
else:
|
||||||
|
rel_avg = pd.DataFrame(columns=["onetsoc_code", "task_id", "relevance_average"])
|
||||||
|
|
||||||
|
# 4. Process DWAs
|
||||||
|
logger.info("Processing DWA data")
|
||||||
|
if not dwas_df.empty:
|
||||||
|
dwas_grouped = dwas_df.groupby(["onetsoc_code", "task_id"])["dwa_title"].apply(list).reset_index()
|
||||||
|
dwas_grouped.rename(columns={"dwa_title": "dwas"}, inplace=True)
|
||||||
|
else:
|
||||||
|
dwas_grouped = None
|
||||||
|
|
||||||
|
# 5. Get Base Task/Occupation Info
|
||||||
|
logger.info("Extracting base task/occupation info")
|
||||||
|
base_cols = ["onetsoc_code", "task_id", "task", "occupation_title", "occupation_description"]
|
||||||
|
base_info = ratings_df[base_cols].drop_duplicates().set_index(["onetsoc_code", "task_id"])
|
||||||
|
|
||||||
|
# 6. Merge Processed ONET Data
|
||||||
|
logger.info("Merging processed ONET data")
|
||||||
|
final_df = base_info.merge(freq_pivot, left_index=True, right_index=True, how="left")
|
||||||
|
final_df = final_df.reset_index()
|
||||||
|
|
||||||
|
if not imp_avg.empty:
|
||||||
|
final_df = final_df.merge(imp_avg, on=["onetsoc_code", "task_id"], how="left")
|
||||||
|
else:
|
||||||
|
final_df["importance_average"] = np.nan
|
||||||
|
|
||||||
|
if not rel_avg.empty:
|
||||||
|
final_df = final_df.merge(rel_avg, on=["onetsoc_code", "task_id"], how="left")
|
||||||
|
else:
|
||||||
|
final_df["relevance_average"] = np.nan
|
||||||
|
|
||||||
|
if dwas_grouped is not None and not dwas_grouped.empty:
|
||||||
|
final_df = final_df.merge(dwas_grouped, on=["onetsoc_code", "task_id"], how="left")
|
||||||
|
if "dwas" in final_df.columns:
|
||||||
|
final_df["dwas"] = final_df["dwas"].apply(lambda x: x if isinstance(x, list) else [])
|
||||||
|
else:
|
||||||
|
final_df["dwas"] = [[] for _ in range(len(final_df))]
|
||||||
|
|
||||||
|
final_df = final_df.replace({np.nan: None})
|
||||||
|
|
||||||
|
# 7. Merge with EPOCH remote data
|
||||||
|
logger.info("Merging with EPOCH remote data")
|
||||||
|
final_df = pd.merge(final_df, run.epoch_df[['Task', 'Remote']], left_on='task', right_on='Task', how='left')
|
||||||
|
final_df = final_df.drop('Task', axis=1).rename(columns={'Remote': 'remote_status'})
|
||||||
|
|
||||||
|
|
||||||
|
logger.info(f"Created tasks dataframe with shape {final_df.shape}")
|
||||||
|
final_df.to_parquet(cache_path)
|
||||||
|
|
||||||
|
run.df_tasks = final_df
|
||||||
|
return run
|
||||||
|
|
|
@ -14,19 +14,18 @@ from typing import Optional
|
||||||
|
|
||||||
CACHE_DIR = platformdirs.user_cache_dir("econtai")
|
CACHE_DIR = platformdirs.user_cache_dir("econtai")
|
||||||
|
|
||||||
def run(output_dir: Optional[str] = None):
|
def run(output_dir: Path | Optional[str] = None):
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
_setup_graph_rendering()
|
_setup_graph_rendering()
|
||||||
|
|
||||||
if output_dir is None:
|
if output_dir is None:
|
||||||
output_dir = Path("dist/")
|
output_dir = Path("dist/")
|
||||||
else:
|
elif isinstance(output_dir, str):
|
||||||
output_dir = Path(output_dir).resolve()
|
output_dir = Path(output_dir).resolve()
|
||||||
|
|
||||||
output_dir.mkdir(parents=True, exist_ok=True)
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
current_run = Run(output_dir=output_dir, cache_dir=Path(CACHE_DIR).resolve())
|
||||||
current_run = Run(output_dir=output_dir, cache_dir=CACHE_DIR)
|
|
||||||
current_run.cache_dir.mkdir(parents=True, exist_ok=True)
|
current_run.cache_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
# Fetchers (fetchers.py)
|
# Fetchers (fetchers.py)
|
||||||
|
@ -34,12 +33,13 @@ def run(output_dir: Optional[str] = None):
|
||||||
current_run.oesm_df, current_run.oesm_version = fetch_oesm_data(current_run)
|
current_run.oesm_df, current_run.oesm_version = fetch_oesm_data(current_run)
|
||||||
current_run.epoch_df, current_run.epoch_version = fetch_epoch_remote_data(current_run)
|
current_run.epoch_df, current_run.epoch_version = fetch_epoch_remote_data(current_run)
|
||||||
|
|
||||||
|
current_run = create_df_tasks(current_run)
|
||||||
|
|
||||||
# Enrichments (enrichments.py)
|
# Enrichments (enrichments.py)
|
||||||
current_run.task_estimateability_df = enrich_with_task_estimateability(current_run)
|
current_run.task_estimateability_df = enrich_with_task_estimateability(current_run)
|
||||||
current_run.task_estimates_df = enrich_with_task_estimates(current_run)
|
current_run.task_estimates_df = enrich_with_task_estimates(current_run)
|
||||||
|
|
||||||
# Postprocessors (postprocessors.py)
|
# Postprocessors (postprocessors.py)
|
||||||
create_df_tasks(current_run)
|
|
||||||
check_for_insanity(current_run)
|
check_for_insanity(current_run)
|
||||||
|
|
||||||
# Generators (generators/)
|
# Generators (generators/)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue