sprint-econtai/pipeline/enrichments.py


"""
This module enriches data, they take time to run, and are usually expensive (API calls...),
they should manage their own state, and only be run if the data's version is different than
their save.
"""
import json
import time
from typing import Any, Dict, List

import litellm
import pandas as pd

from .run import Run


def enrich(
    model: str,
    rpm: int,
    messages_to_process: List[List[Dict[str, str]]],
    schema: Dict[str, Any],
    chunk_size: int = 100,
) -> List[Dict[str, Any]]:
    """Run structured completions in chunks via litellm.batch_completion and parse the JSON answers."""
    # Assumes an OpenAI-style structured-output response_format; failed calls are not retried here.
    response_format = {"type": "json_schema", "json_schema": {"name": "enrichment", "schema": schema}}
    results: List[Dict[str, Any]] = []
    for start in range(0, len(messages_to_process), chunk_size):
        chunk = messages_to_process[start:start + chunk_size]
        responses = litellm.batch_completion(model=model, messages=chunk, response_format=response_format)
        results.extend(json.loads(r.choices[0].message.content) for r in responses)
        time.sleep(60 * len(chunk) / rpm)  # crude pacing to stay under the requests-per-minute budget
    return results
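

# Illustrative usage (mirrors the callers below; not executed at import time):
#   enrich(
#       model="gpt-4.1-mini",
#       rpm=5000,
#       messages_to_process=[[{"role": "user", "content": "Task: ..."}]],
#       schema={"type": "object",
#               "properties": {"estimateable": {"type": "boolean"}},
#               "required": ["estimateable"]},
#   )

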
def enrich_with_task_estimateability(run: Run) -> pd.DataFrame:
output_path = run.cache_dir / "computed_task_estimateability.parquet"
if output_path.exists():
print(f"Loading cached task estimateability from {output_path}")
return pd.read_parquet(output_path)
df_remote_tasks = run.df_tasks[run.df_tasks['remote_status'] == 'remote'].copy()
# In the old script, we only passed unique tasks to the API
df_unique_tasks = df_remote_tasks.drop_duplicates(subset=['task'])
results = enrich(
model="gpt-4.1-mini",
rpm=5000,
messages_to_process=[
[
{"role": "system", "content": """
Judge whether the provided O*NET task is suitable for a time estimate. If it is a single, clearly-bounded activity, typically lasting minutes, hours, or a few days, then clearly yes. If it is a continuous responsibility or behavioural norm with no schedulable duration (e.g., “follow confidentiality rules,” “serve as department head”), then clearly no.
"""},
{"role": "user", "content": f"Task: {row.task}"},
]
for row in df_unique_tasks.itertuples()
],
        schema={
            "type": "object",
            "properties": {"estimateable": {"type": "boolean"}},
            "required": ["estimateable"],
        },
chunk_size=300,
)
    # The "task" text alone does not uniquely identify a row, so keep all identifying
    # columns from the remote-task frame and attach the classification by task text
    # (each unique task text was classified exactly once above).
    df_unique_tasks["estimateable"] = [r["estimateable"] for r in results]
    df_out = df_remote_tasks.merge(df_unique_tasks[["task", "estimateable"]], on="task", how="left")
    df_out.to_parquet(output_path)
    return df_out


def enrich_with_task_estimates(run: Run) -> pd.DataFrame:
output_path = run.cache_dir / "computed_task_estimates.parquet"
if output_path.exists():
print(f"Loading cached task estimates from {output_path}")
return pd.read_parquet(output_path)
    # Estimate only the tasks previously classified as estimateable. Assumes the
    # estimateability frame keeps the occupation columns the prompt below relies on
    # (occupation_title, occupation_description), which come from run.df_tasks.
    df_estimateability = enrich_with_task_estimateability(run)
    df = df_estimateability[df_estimateability["estimateable"]].copy()
results = enrich(
model="gpt-4.1-mini",
rpm=5000,
messages_to_process=[
[
{"role": "system", "content": "Estimate the time required to complete the following O*NET task. Your estimate should be a plausible range for how long it might take a typical, qualified worker to perform this task once. Provide your answer as a time range (lower and upper bounds). Do not provide explanations or apologies. If the task is not suitable for a time estimate (e.g., it is an ongoing responsibility), interpret it as a single, schedulable action."},
{"role": "user", "content": f"""
Task: {row.task}
For Occupation: {row.occupation_title}
Occupation Description: {row.occupation_description}"""}
]
for row in df.itertuples()
],
schema={
"type": "object",
"properties": {
"lower_bound_estimate": {
"type": "object",
"properties": {"quantity": {"type": "number"}, "unit": {"type": "string", "enum": ["minutes", "hours", "days"]}},
"required": ["quantity", "unit"],
},
"upper_bound_estimate": {
"type": "object",
"properties": {"quantity": {"type": "number"}, "unit": {"type": "string", "enum": ["minutes", "hours", "days"]}},
"required": ["quantity", "unit"],
},
},
"required": ["lower_bound_estimate", "upper_bound_estimate"],
},
chunk_size=200,
)
    # Attach the parsed bounds to the rows they were generated for. The "task" column
    # alone is not unique, so keep the identifying columns already present in df, and
    # flatten the nested quantity/unit objects into scalar columns so the frame stays
    # parquet-friendly.
    df_out = df.copy()
    df_out["lower_bound_quantity"] = [r["lower_bound_estimate"]["quantity"] for r in results]
    df_out["lower_bound_unit"] = [r["lower_bound_estimate"]["unit"] for r in results]
    df_out["upper_bound_quantity"] = [r["upper_bound_estimate"]["quantity"] for r in results]
    df_out["upper_bound_unit"] = [r["upper_bound_estimate"]["unit"] for r in results]
    df_out.to_parquet(output_path)
    return df_out
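

if __name__ == "__main__":
    # Minimal sketch of how the steps chain together; illustrative only. It assumes a
    # Run can be constructed from a single path argument, which may not match the real
    # constructor defined in .run -- the actual entry point lives elsewhere in the pipeline.
    import sys

    run = Run(sys.argv[1])
    print(enrich_with_task_estimateability(run).head())
    print(enrich_with_task_estimates(run).head())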