
Implement task estimateability and task estimate enrichment steps. Add a `create_df_tasks` postprocessor.
97 lines
4.2 KiB
Python
97 lines
4.2 KiB
Python
"""
|
|
This module enriches data, they take time to run, and are usually expensive (API calls...),
|
|
they should manage their own state, and only be run if the data's version is different than
|
|
their save.
|
|
"""
import json
import time
from typing import Any, Dict, List

import litellm
import pandas as pd

from .run import Run
def enrich(
    model: str,
    rpm: int,
    messages_to_process: List[List[Dict[str, str]]],
    schema: Dict[str, Any],
    chunk_size: int = 100,
) -> List[Dict[str, Any]]:
    """Run structured-output completions over many conversations, rate-limited.

    Processes ``messages_to_process`` in chunks of ``chunk_size`` via
    ``litellm.batch_completion``, pacing chunks so the request rate stays at or
    below ``rpm`` requests per minute.

    Args:
        model: Model identifier passed through to litellm.
        rpm: Maximum requests per minute allowed against the API.
        messages_to_process: One chat-message list per item to enrich.
        schema: JSON Schema the model's response must conform to.
        chunk_size: Number of conversations per batch call.

    Returns:
        One parsed JSON object (matching ``schema``) per input conversation,
        in the same order as ``messages_to_process``.
    """
    # NOTE(review): response_format shape follows litellm's structured-output
    # convention (OpenAI-style json_schema) — confirm against the pinned
    # litellm version before relying on `strict`.
    response_format = {
        "type": "json_schema",
        "json_schema": {"name": "enrichment", "strict": True, "schema": schema},
    }

    results: List[Dict[str, Any]] = []
    total = len(messages_to_process)
    for start in range(0, total, chunk_size):
        chunk = messages_to_process[start : start + chunk_size]
        chunk_started = time.monotonic()

        responses = litellm.batch_completion(
            model=model,
            messages=chunk,
            response_format=response_format,
        )
        results.extend(
            json.loads(response.choices[0].message.content)
            for response in responses
        )

        # A chunk of k requests consumes k/rpm minutes of budget; if the batch
        # finished faster than that, sleep off the remainder (skip after the
        # final chunk — nothing left to throttle).
        min_chunk_seconds = 60.0 * len(chunk) / rpm
        elapsed = time.monotonic() - chunk_started
        if start + chunk_size < total and elapsed < min_chunk_seconds:
            time.sleep(min_chunk_seconds - elapsed)

    return results
|
|
|
|
def enrich_with_task_estimateability(run: Run) -> pd.DataFrame:
    """Classify each remote O*NET task as estimateable (schedulable) or not.

    Results are cached as a parquet file under ``run.cache_dir``; if the cache
    exists it is returned directly without calling the API.

    Args:
        run: Pipeline run holding ``df_tasks`` and ``cache_dir``.

    Returns:
        DataFrame keying each task uniquely to its estimateability flag.

    Raises:
        NotImplementedError: result assembly is not implemented yet.
    """
    output_path = run.cache_dir / "computed_task_estimateability.parquet"
    if output_path.exists():
        print(f"Loading cached task estimateability from {output_path}")
        return pd.read_parquet(output_path)

    df_remote_tasks = run.df_tasks[run.df_tasks['remote_status'] == 'remote'].copy()

    # In the old script, we only passed unique tasks to the API
    df_unique_tasks = df_remote_tasks.drop_duplicates(subset=['task'])

    results = enrich(
        model="gpt-4.1-mini",
        rpm=5000,
        messages_to_process=[
            [
                {"role": "system", "content": """
Judge whether the provided O*NET task is suitable for a time estimate. If it is a single, clearly-bounded activity, typically lasting minutes, hours, or a few days, then clearly yes. If it is a continuous responsibility or behavioural norm with no schedulable duration (e.g., “follow confidentiality rules,” “serve as department head”), then clearly no.
"""},
                {"role": "user", "content": f"Task: {row.task}"},
            ]
            for row in df_unique_tasks.itertuples()
        ],
        schema={
            "type": "object",
            # Fixed: JSON Schema's type keyword is "boolean", not "bool".
            "properties": {"estimateable": {"type": "boolean"}},
            "required": ["estimateable"]
        },
        chunk_size=300,
    )

    # TODO: Create a new dataframe with just enough information to identify the
    # task uniquely + estimateability classification, save it to output_path,
    # and return it. Careful: the "task" column in itself is not unique.
    # Raising (instead of silently returning an empty DataFrame) keeps this
    # consistent with enrich_with_task_estimates and prevents callers from
    # mistaking placeholder output for real results.
    raise NotImplementedError
|
|
|
|
def enrich_with_task_estimates(run: Run) -> pd.DataFrame:
    """Estimate a lower/upper time-bound for each (presumably estimateable) task.

    Results are cached as a parquet file under ``run.cache_dir``; if the cache
    exists it is returned directly without calling the API.

    Args:
        run: Pipeline run holding the task data and ``cache_dir``.

    Returns:
        DataFrame keying each task uniquely to its time-estimate bounds
        (not implemented yet — currently raises).

    Raises:
        NotImplementedError: result assembly is not implemented yet.
    """
    output_path = run.cache_dir / "computed_task_estimates.parquet"
    if output_path.exists():
        print(f"Loading cached task estimates from {output_path}")
        return pd.read_parquet(output_path)

    # WIP: df is an Ellipsis placeholder — itertuples() below will fail until
    # this is filled in. Presumably it should be the tasks classified as
    # estimateable by enrich_with_task_estimateability, joined with occupation
    # title/description (the prompt reads row.task, row.occupation_title,
    # row.occupation_description) — TODO confirm.
    df = ... # todo

    results = enrich(
        model="gpt-4.1-mini",
        rpm=5000,
        messages_to_process=[
            [
                {"role": "system", "content": "Estimate the time required to complete the following O*NET task. Your estimate should be a plausible range for how long it might take a typical, qualified worker to perform this task once. Provide your answer as a time range (lower and upper bounds). Do not provide explanations or apologies. If the task is not suitable for a time estimate (e.g., it is an ongoing responsibility), interpret it as a single, schedulable action."},
                {"role": "user", "content": f"""
Task: {row.task}
For Occupation: {row.occupation_title}
Occupation Description: {row.occupation_description}"""}
            ]
            for row in df.itertuples()
        ],
        # Structured output: each result is a {lower, upper} pair of
        # {quantity, unit} objects, unit restricted to minutes/hours/days.
        schema={
            "type": "object",
            "properties": {
                "lower_bound_estimate": {
                    "type": "object",
                    "properties": {"quantity": {"type": "number"}, "unit": {"type": "string", "enum": ["minutes", "hours", "days"]}},
                    "required": ["quantity", "unit"],
                },
                "upper_bound_estimate": {
                    "type": "object",
                    "properties": {"quantity": {"type": "number"}, "unit": {"type": "string", "enum": ["minutes", "hours", "days"]}},
                    "required": ["quantity", "unit"],
                },
            },
            "required": ["lower_bound_estimate", "upper_bound_estimate"],
        },
        chunk_size=200,
    )

    # Create a new dataframe with just enough information to identify the task uniquely + the estimates classification, save it, return it. Careful: the "task" column in itself is not unique.
    raise NotImplementedError
|