Feat: Implement task enrichment steps
Implement task estimateability and task estimate enrichment steps. Add a `create_df_tasks` postprocessor.
This commit is contained in:
parent
f9f9825abb
commit
62296e1b69
3 changed files with 221 additions and 22 deletions
|
@ -3,26 +3,95 @@ This module enriches data, they take time to run, and are usually expensive (API
|
|||
they should manage their own state, and only be run if the data's version differs from
|
||||
their save.
|
||||
"""
|
||||
from .run import Run
|
||||
from .run import Run
|
||||
import pandas as pd
|
||||
from typing import Any, List, Dict
|
||||
import litellm
|
||||
|
||||
def enrich(
    model: str,
    rpm: int,
    messages_to_process: List[List[Dict[str, str]]],
    schema: Dict[str, Any],
    chunk_size: int = 100,
) -> List[Dict[str, Any]]:
    """Run structured-output chat completions over a batch of message lists.

    Parameters
    ----------
    model:
        litellm model identifier (e.g. ``"gpt-4.1-mini"``).
    rpm:
        Requests-per-minute budget. NOTE(review): not enforced here yet —
        ``litellm.batch_completion`` fires each chunk as fast as it can;
        confirm whether explicit throttling is needed before large runs.
    messages_to_process:
        One chat-message list (``[{"role": ..., "content": ...}, ...]``)
        per item to enrich.
    schema:
        JSON Schema that each model response must conform to.
    chunk_size:
        Number of completions requested per ``batch_completion`` call.

    Returns
    -------
    One parsed-JSON dict per input message list, in input order.
    """
    import json

    # Wrap the caller's schema in the litellm/OpenAI "json_schema" response
    # format so the model is constrained to emit parseable JSON.
    response_format = {
        "type": "json_schema",
        "json_schema": {"name": "enrichment", "schema": schema, "strict": True},
    }

    results: List[Dict[str, Any]] = []
    for start in range(0, len(messages_to_process), chunk_size):
        chunk = messages_to_process[start : start + chunk_size]
        responses = litellm.batch_completion(
            model=model,
            messages=chunk,
            response_format=response_format,
        )
        for response in responses:
            # batch_completion preserves input order, so appending keeps
            # results aligned with messages_to_process.
            results.append(json.loads(response.choices[0].message.content))
    return results
|
||||
|
||||
def enrich_with_task_estimateability(run: Run) -> pd.DataFrame:
    """Classify every remote task as estimateable (schedulable) or not.

    A task is "estimateable" when it is a single, clearly-bounded activity;
    continuous responsibilities or behavioural norms are not.

    The result is cached at
    ``run.cache_dir / "computed_task_estimateability.parquet"``; when that
    file exists it is loaded and returned without recomputing (these API
    calls are expensive).

    Returns a DataFrame with every identifying column of the remote-task
    rows plus a boolean ``estimateable`` column.
    """
    output_path = run.cache_dir / "computed_task_estimateability.parquet"
    if output_path.exists():
        print(f"Loading cached task estimateability from {output_path}")
        return pd.read_parquet(output_path)

    df_remote_tasks = run.df_tasks[run.df_tasks['remote_status'] == 'remote'].copy()

    # Identical task text only needs to be classified once; verdicts are
    # fanned back out to every (occupation, task) row below.
    df_unique_tasks = df_remote_tasks.drop_duplicates(subset=['task'])

    results = enrich(
        model="gpt-4.1-mini",
        rpm=5000,
        messages_to_process=[
            [
                {"role": "system", "content": """
Judge whether the provided O*NET task is suitable for a time estimate. If it is a single, clearly-bounded activity, typically lasting minutes, hours, or a few days, then clearly yes. If it is a continuous responsibility or behavioural norm with no schedulable duration (e.g., “follow confidentiality rules,” “serve as department head”), then clearly no.
"""},
                {"role": "user", "content": f"Task: {row.task}"},
            ]
            for row in df_unique_tasks.itertuples()
        ],
        schema={
            "type": "object",
            # JSON Schema has no "bool" type; the correct primitive is "boolean".
            "properties": {"estimateable": {"type": "boolean"}},
            "required": ["estimateable"],
        },
        chunk_size=300,
    )

    # One verdict per unique task text, in the order the prompts were built
    # (enrich preserves input order).
    df_verdicts = pd.DataFrame(
        {
            "task": df_unique_tasks["task"].to_numpy(),
            "estimateable": [r["estimateable"] for r in results],
        }
    )

    # The "task" column in itself is not unique across occupations, so merge
    # the per-text verdicts back onto the full remote-task frame: every
    # identifying column of df_remote_tasks is preserved and each row gains
    # its classification.
    df_out = df_remote_tasks.merge(df_verdicts, on="task", how="left")
    df_out.to_parquet(output_path)
    return df_out
|
||||
|
||||
def enrich_with_task_estimates(run: Run) -> pd.DataFrame:
    """Attach lower/upper time-estimate bounds to estimateable remote tasks.

    The result is cached at
    ``run.cache_dir / "computed_task_estimates.parquet"``; when that file
    exists it is loaded and returned without recomputing.

    Returns a DataFrame with every identifying column of the input rows
    plus flattened estimate columns
    ``{lower,upper}_bound_estimate_{quantity,unit}``.
    """
    output_path = run.cache_dir / "computed_task_estimates.parquet"
    if output_path.exists():
        print(f"Loading cached task estimates from {output_path}")
        return pd.read_parquet(output_path)

    # Only tasks previously judged estimateable get a time estimate.
    # NOTE(review): assumes the estimateability step yields a boolean
    # "estimateable" column plus the task/occupation columns the prompt
    # below reads (task, occupation_title, occupation_description) —
    # confirm against enrich_with_task_estimateability's output.
    df_estimateability = enrich_with_task_estimateability(run)
    df = df_estimateability[df_estimateability["estimateable"]].copy()

    results = enrich(
        model="gpt-4.1-mini",
        rpm=5000,
        messages_to_process=[
            [
                {"role": "system", "content": "Estimate the time required to complete the following O*NET task. Your estimate should be a plausible range for how long it might take a typical, qualified worker to perform this task once. Provide your answer as a time range (lower and upper bounds). Do not provide explanations or apologies. If the task is not suitable for a time estimate (e.g., it is an ongoing responsibility), interpret it as a single, schedulable action."},
                {"role": "user", "content": f"""
Task: {row.task}
For Occupation: {row.occupation_title}
Occupation Description: {row.occupation_description}"""},
            ]
            for row in df.itertuples()
        ],
        schema={
            "type": "object",
            "properties": {
                "lower_bound_estimate": {
                    "type": "object",
                    "properties": {"quantity": {"type": "number"}, "unit": {"type": "string", "enum": ["minutes", "hours", "days"]}},
                    "required": ["quantity", "unit"],
                },
                "upper_bound_estimate": {
                    "type": "object",
                    "properties": {"quantity": {"type": "number"}, "unit": {"type": "string", "enum": ["minutes", "hours", "days"]}},
                    "required": ["quantity", "unit"],
                },
            },
            "required": ["lower_bound_estimate", "upper_bound_estimate"],
        },
        chunk_size=200,
    )

    # One estimate per row of df, in input order. The "task" column in
    # itself is not unique, so keep every identifying column of df and
    # append the bounds, flattened to scalar columns (parquet-friendly).
    df_out = df.copy()
    df_out["lower_bound_estimate_quantity"] = [r["lower_bound_estimate"]["quantity"] for r in results]
    df_out["lower_bound_estimate_unit"] = [r["lower_bound_estimate"]["unit"] for r in results]
    df_out["upper_bound_estimate_quantity"] = [r["upper_bound_estimate"]["quantity"] for r in results]
    df_out["upper_bound_estimate_unit"] = [r["upper_bound_estimate"]["unit"] for r in results]
    df_out.to_parquet(output_path)
    return df_out
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue